def Execute(context):
    timeStamp = TimeStamp()
    connectionSQL = ConnectionSQL(context)

    criterion = 'gini'
    max_features = None
    max_depth = 14
    min_samples_split=6
    min_samples_leaf=6
    n_jobs=-1

    # model parameter ranges - used for searching for optimal parameter settings
    criterionList = ['gini', 'entropy']
    max_featuresList = [7, 8, 9, 10, 11, 12, 13, 14, None]
    max_depthList = [6, 8, 10, 11, 12, 13, 14, None]
    min_samples_splitList = [3, 5, 6, 7, 8, 9]
    min_samples_leafList = [3, 4, 5, 6, 7]
    parameterRangeDict = {'criterion':criterionList,
                          'max_features':max_featuresList,
                          'max_depth':max_depthList,
                          'min_samples_split':min_samples_splitList,
                          'min_samples_leaf':min_samples_leafList}

    timeStamp1 = TimeStamp('Loading X, Y')
    # 0=DataID, 1=P_A, 2=P_B, 3=P_C, 4=P_D, 5=P_E, 6=P_F, 7=P_G, 8=State, ... 27=Cost
    # Load Training Data and Cross Validation Data
    train_X, train_Y, train_DataID = connectionSQL.GetFeaturesAndResultsFromCache(featuresColumns=__featuresColumns, dataType="Train", modelName=__modelName, preProcessDataFrame=__PreProcessDataFrame, viewName=__viewName)
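    # DataIDs are not needed for fitting - drop the reference so the memory can be reclaimed.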
    train_DataID = None

    # Load Cross Validation Data
    cross_X, cross_Y, cross_DataID = connectionSQL.GetFeaturesAndResultsFromCache(featuresColumns=__featuresColumns, dataType="Cross", modelName=__modelName, preProcessDataFrame=__PreProcessDataFrame, viewName=__viewName)
    cross_DataID = None

    print '  Elapsed=' + timeStamp1.Elaspse
    print ''

    if (__optimiseParameters):
        timeStamp1 = TimeStamp('Optimise parameters')
        # package parameters
        dataDict = {'train_X':train_X, 'train_Y':train_Y, 'cross_X':cross_X, 'cross_Y':cross_Y}

        # Optimal parameter selection
        #parameterDictMax = __OptimalParamaterSelection(dataDict=dataDict, accuracyFunct=__Accuracy, parameterRangeDict=parameterRangeDict)
        optimiseModelParameters = OptimiseModelParameters(modelName=__modelName)
        optimiseModelParameters.Percent = 0.2 # 20%
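        # ExecuteMoneCarlo is assumed to randomly sample ~20% of the parameter grid and return the best-scoring combination it finds.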
        parameterDictMax, accuracyMax = optimiseModelParameters.ExecuteMoneCarlo(dataDict=dataDict, accuracyFunct=__Accuracy, parameterRangeDict=parameterRangeDict)

        criterion = parameterDictMax['criterion']
        max_features = parameterDictMax['max_features']
        max_depth = parameterDictMax['max_depth']
        min_samples_split = parameterDictMax['min_samples_split']
        min_samples_leaf = parameterDictMax['min_samples_leaf']

        print 'Elapsed=' + timeStamp1.Elaspse
        print ''

    timeStamp1 = TimeStamp('Fit model')
    print __modelName + ' criterion=' + criterion + ' max_features=' + str(max_features) + ' max_depth=' + str(max_depth) + ' min_samples_split=' + str(min_samples_split) + ' min_samples_leaf=' + str(min_samples_leaf)
    clf = DecisionTreeClassifierMultiClass(splitter='best', random_state=1, 
                                           criterion=criterion, max_features=max_features, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf)
    model = clf.fit(train_X, train_Y) # All features must be float.
    print 'Elapsed=' + timeStamp1.Elaspse
    print ''

    # Training error reports
    SharedLibrary.TrainingError(dataType='Training', modelName=__modelName, clf=clf, model=model, dfX=train_X, dfY=train_Y)
    SharedLibrary.TrainingError(dataType='CrossVal', modelName=__modelName, clf=clf, model=model, dfX=cross_X, dfY=cross_Y)

    # Cross validation report
    SharedLibrary.CrossValidation(modelName=__modelName, clf=clf, dfX=train_X, dfY=train_Y, n_fold=5)

    # UPDATE DRV_Predict with all data predictions.
    timeStamp1 = TimeStamp('Update_DRV_Predict')
    all_X, all_Y, all_DataID = connectionSQL.GetFeaturesAndResultsFromCache(featuresColumns=__featuresColumns, dataType="ALL", modelName=__modelName, preProcessDataFrame=__PreProcessDataFrame, viewName=__viewName)
    all_Y = None
    connectionSQL.Update_DRV_Predict(modelName=__modelName, model=model, all_X=all_X, all_DataID=all_DataID)
    print 'Elapsed=' + timeStamp1.Elaspse
    print ''

    # Feature analysis summary
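    # Keep (importance, column name) pairs only for features contributing more than 1% of total importance.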
    featureImportances = enumerate(clf.feature_importances_)
    featureImportanceHistogram = np.array([(importance,train_X.columns[i]) for (i,importance) in featureImportances if importance > 0.01])

    print __modelName + ' Elapsed=' + timeStamp.Elaspse
    return __modelName, featureImportanceHistogram
def Execute(context):
    connectionSQL = ConnectionSQL(context)

    penalty = None
    alpha = 0.0001
    fit_intercept = True
    n_iter = 20
    shuffle = False
    eta0 = 1

    # model parameter ranges - used for searching for optimal parameter settings
    penaltyList = ['l1', 'l2', 'elasticnet', None]
    alphaList = [0.1, 0.001, 0.0001, 0.00001]
    fit_interceptList = [True, False]
    n_iterList = [5, 10, 20, 30, 40, 100]
    shuffleList = [False]
    eta0List = [1]

    parameterRangeDict = {'penalty':penaltyList,
                          'alpha':alphaList,
                          'fit_intercept':fit_interceptList,
                          'n_iter':n_iterList,
                          'shuffle':shuffleList,
                          'eta0':eta0List}

    # 0=DataID, 1=Actual, 2=Base, ...
    # Load Training Data and Cross Validation Data
    train_X, train_Y, train_DataID = __GetFeaturesAndResults(context=context, connectionSQL=connectionSQL, dataType="Train", preProcessDataFrame=__PreProcessDataFrame)
    train_DataID = None

    # Load Cross Validation Data
    cross_X, cross_Y, cross_DataID = __GetFeaturesAndResults(context=context, connectionSQL=connectionSQL, dataType="Cross", preProcessDataFrame=__PreProcessDataFrame)
    cross_DataID = None

    if (__optimiseParameters):
        parameterDictMax = {}
        accuracyMax = 0.0

        # package parameters
        dataDict = {'train_X':train_X, 'train_Y':train_Y, 'cross_X':cross_X, 'cross_Y':cross_Y}
        
        optimiseModelParameters = OptimiseModelParameters(modelName=__modelName)
        optimiseModelParameters.Percent = 0.20 # 20%
        parameterDictMax, accuracyMax = optimiseModelParameters.ExecuteMoneCarlo(dataDict=dataDict, accuracyFunct=__Accuracy, parameterRangeDict=parameterRangeDict)

        penalty = parameterDictMax['penalty']
        alpha = parameterDictMax['alpha']
        fit_intercept = parameterDictMax['fit_intercept']
        n_iter = parameterDictMax['n_iter']
        shuffle = parameterDictMax['shuffle']
        eta0 = parameterDictMax['eta0']

    print __modelName + 'Classifier'
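    # NOTE: this targets the older scikit-learn API; from release 0.21 onwards the Perceptron n_iter argument was replaced by max_iter (and tol).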
    clf = Perceptron(penalty=penalty, alpha=alpha, fit_intercept=fit_intercept, n_iter=n_iter, shuffle=shuffle, random_state=1, eta0=eta0, warm_start=False)
    model = clf.fit(train_X, train_Y) # All features must be float.
    print ''

    # Training error reports
    SharedLibrary.TrainingError(dataType='Training', modelName=__modelName, clf=clf, model=model, dfX=train_X, dfY=train_Y)
    SharedLibrary.TrainingError(dataType='CrossVal', modelName=__modelName, clf=clf, model=model, dfX=cross_X, dfY=cross_Y)

    # Cross validation report
    SharedLibrary.CrossValidation(modelName=__modelName, clf=clf, dfX=train_X, dfY=train_Y, n_fold=5)

    # Normalised perceptron weights - the relative contribution of each input feature/model.
    weights = clf.coef_[0]  # ndarray
    weights = weights / sum(weights)
    weightsDict = dict(zip(train_X.columns, weights))

    weightsString = ' '.join(key + '=' + str(value) for key, value in weightsDict.iteritems())

    print 'Weights ' + weightsString
    print ''

    # UPDATE DRV_Predict with all data predictions.
    all_X, all_Y, all_DataID = __GetFeaturesAndResults(context=context, connectionSQL=connectionSQL, dataType="ALL", preProcessDataFrame=__PreProcessDataFrame)
    all_Y = None
    connectionSQL.Update_DRV_Predict(modelName=__modelName, model=model, all_X=all_X, all_DataID=all_DataID)

    return __modelName
def Execute(context):
    connectionSQL = ConnectionSQL(context)

    kernel = 'linear'
    degree = 3

    # model parameter ranges - used for searching for optimal parameter settings
    kernelList = ['linear','poly','rbf','sigmoid']

    parameterRangeDict = {'kernel':kernelList }

    # 0=DataID, 1=Actual, 2=Pclass, ...
    # Load Training Data and Cross Validation Data
    train_X, train_Y, train_DataID = connectionSQL.GetFeaturesAndResults(featuresColumns=__featuresColumns, dataType="Train", preProcessDataFrame=__PreProcessDataFrame)
    train_DataID = None

    # Load Cross Validation Data
    cross_X, cross_Y, cross_DataID = connectionSQL.GetFeaturesAndResults(featuresColumns=__featuresColumns, dataType="Cross", preProcessDataFrame=__PreProcessDataFrame)
    cross_DataID = None

    if (__optimiseParameters):
        parameterDictMax = {}
        accuracyMax = 0.0

        # package parameters
        dataDict = {'train_X':train_X, 'train_Y':train_Y, 'cross_X':cross_X, 'cross_Y':cross_Y}

        # Optimal parameter selection - exhaustive search, one kernel at a time;
        # only the 'poly' kernel searches over more than one degree.
        kernelSearchList = [('linear', [3]), ('poly', [2, 3, 4]), ('rbf', [3]), ('sigmoid', [3])]
        for kernelName, degreeList in kernelSearchList:
            parameterRangeDict = {'kernel':[kernelName], 'degree':degreeList }
            optimiseModelParameters = OptimiseModelParameters(modelName=__modelName)
            optimiseModelParameters.Percent = 0.2 # 20%
            parameterDict, accuracy = optimiseModelParameters.ExecuteExhaustive(dataDict=dataDict, accuracyFunct=__Accuracy, parameterRangeDict=parameterRangeDict)
            if (accuracy > accuracyMax):
                parameterDictMax = parameterDict
                accuracyMax = accuracy

        kernel = parameterDictMax['kernel']
        degree = parameterDictMax['degree']

    print __modelName + 'Classifier'
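    # In scikit-learn's SVC the degree parameter is only used by the 'poly' kernel; it is ignored for linear/rbf/sigmoid.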
    clf = svm.SVC(probability=True, verbose=False, random_state=1, kernel=kernel, degree=degree)
    model = clf.fit(train_X, train_Y) # All features must be float.
    print ''

    # Training error reports
    SharedLibrary.TrainingError(dataType='Training', modelName=__modelName, clf=clf, model=model, dfX=train_X, dfY=train_Y)
    SharedLibrary.TrainingError(dataType='CrossVal', modelName=__modelName, clf=clf, model=model, dfX=cross_X, dfY=cross_Y)

    # Cross validation report
    SharedLibrary.CrossValidation(modelName=__modelName, clf=clf, dfX=train_X, dfY=train_Y, n_fold=5)

    # UPDATE DRV_Predict with all data predictions.
    all_X, all_Y, all_DataID = connectionSQL.GetFeaturesAndResults(featuresColumns=__featuresColumns, dataType="ALL", preProcessDataFrame=__PreProcessDataFrame)
    all_Y = None
    connectionSQL.Update_DRV_Predict(modelName=__modelName, model=model, all_X=all_X, all_DataID=all_DataID)

    return __modelName
def Execute(context):
    connectionSQL = ConnectionSQL(context)

    # Default model parameter values
    n_estimators = 50
    max_features = 8 # use None to consider all features
    max_depth = 12
    min_samples_split=5
    min_samples_leaf=4
    bootstrap=False
    n_jobs=-1 # -1 => use all available CPUs
    random_state=1 # Use None to randomise results between runs (any integer, including 0, is a fixed seed)

    # model parameter ranges - used for searching for optimal parameter settings
    n_estimatorsList = [20, 30, 40, 50, 70, 80, 100, 110]
    max_featuresList = [7, 8, 9, 10, 11, 12, 13, 14, None]
    #max_featuresList = ['auto']
    max_depthList = [6, 8, 10, 11, 12, 13, 14]
    min_samples_splitList = [3, 5, 6, 7, 8, 9]
    min_samples_leafList = [3, 4, 5, 6, 7]
    bootstrapList = [False]
    parameterRangeDict = {'n_estimators':n_estimatorsList,
                          'max_features':max_featuresList,
                          'max_depth':max_depthList,
                          'min_samples_split':min_samples_splitList,
                          'min_samples_leaf':min_samples_leafList,
                          'bootstrap':bootstrapList}

    # 0=DataID, 1=Actual, 2=Pclass, ...
    # Load Training Data and Cross Validation Data
    train_X, train_Y, train_DataID = connectionSQL.GetFeaturesAndResults(featuresColumns=__featuresColumns, dataType="Train", preProcessDataFrame=__PreProcessDataFrame)
    #train_DataID = None

    # Load Cross Validation Data
    cross_X, cross_Y, cross_DataID = connectionSQL.GetFeaturesAndResults(featuresColumns=__featuresColumns, dataType="Cross", preProcessDataFrame=__PreProcessDataFrame)
    #cross_DataID = None

    if (__optimiseParameters):
        # package parameters
        dataDict = {'train_X':train_X, 'train_Y':train_Y, 'cross_X':cross_X, 'cross_Y':cross_Y}

        # Optimal parameter selection
        #parameterDictMax = __OptimalParamaterSelection(dataDict=dataDict, accuracyFunct=__Accuracy, parameterRangeDict=parameterRangeDict)
        optimiseModelParameters = OptimiseModelParameters(modelName=__modelName)
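        # Sample a smaller fraction here - this grid has 8*9*7*6*5 = 15120 combinations, so 5% still tries ~756 settings.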
        optimiseModelParameters.Percent = 0.05 # 5%
        parameterDictMax, accuracyMax = optimiseModelParameters.ExecuteMoneCarlo(dataDict=dataDict, accuracyFunct=__Accuracy, parameterRangeDict=parameterRangeDict)

        bootstrap = parameterDictMax['bootstrap']
        min_samples_leaf = parameterDictMax['min_samples_leaf']
        n_estimators = parameterDictMax['n_estimators']
        max_features = parameterDictMax['max_features']
        min_samples_split = parameterDictMax['min_samples_split']
        max_depth = parameterDictMax['max_depth']

    print __modelName + ' bootstrap=' + str(bootstrap) + ' min_samples_leaf=' + str(min_samples_leaf) + ' n_estimators=' + str(n_estimators) + " max_features=" + str(max_features) + " min_samples_split=" + str(min_samples_split) + " max_depth=" + str(max_depth)
    oob_score = bootstrap  # oob_score is only valid when bootstrap=True
    clf = RandomForestClassifier(n_jobs=-1, oob_score=oob_score, random_state=1,
                                 bootstrap=bootstrap, min_samples_leaf=min_samples_leaf, n_estimators=n_estimators, max_features=max_features, min_samples_split=min_samples_split, max_depth=max_depth)
    model = clf.fit(train_X, train_Y) # All features must be float.
    print ''

    # Persist trained classifier to disk
    #clfUNC = __modelName + '.clf'
    #joblib.dump(clf, clfUNC)
    #clf = joblib.load(clfUNC)

    # Training error reports
    SharedLibrary.TrainingError(dataType='Training', modelName=__modelName, clf=clf, model=model, dfX=train_X, dfY=train_Y)
    SharedLibrary.TrainingError(dataType='CrossVal', modelName=__modelName, clf=clf, model=model, dfX=cross_X, dfY=cross_Y)

    # Cross validation report
    SharedLibrary.CrossValidation(modelName=__modelName, clf=clf, dfX=train_X, dfY=train_Y, n_fold=5)

    # UPDATE DRV_Predict with all data predictions.
    all_X, all_Y, all_DataID = connectionSQL.GetFeaturesAndResults(featuresColumns=__featuresColumns, dataType="ALL", preProcessDataFrame=__PreProcessDataFrame)
    all_Y = None
    connectionSQL.Update_DRV_Predict(modelName=__modelName, model=model, all_X=all_X, all_DataID=all_DataID)

    # Feature analysis summary
    featureImportances = enumerate(clf.feature_importances_)
    featureImportanceHistogram = np.array([(importance,train_X.columns[i]) for (i,importance) in featureImportances if importance > 0.005])

    return __modelName, featureImportanceHistogram