예제 #1
0
def calculateLearningCurve():
    """Build the training set and pass it to plot_learning_curve.

    A gradient boosting classifier is constructed, the full training data is
    loaded (getTrainData is called without arguments), the target column is
    converted to numeric form, and the resulting feature/label pair is handed
    to plot_learning_curve together with five training-set sizes.

    Returns:
        None.
    """
    model = classifierSelector.constructGradientBoostingClassifier()
    rawData = dataReader.getTrainData()

    # Feature engineering. The literal True is forwarded as-is to
    # getRegularFeatures; its meaning is defined there.
    numericData = featureExtractor.convertTargetFeatureToNumeric(rawData)
    features, labels = featureExtractor.getRegularFeatures(numericData, True)

    # Five evenly spaced sizes: 100000, 200000, 300000, 400000, 500000.
    sampleSizes = np.linspace(100000, 500000, 5, dtype=int)

    plot_learning_curve(model, features, labels, sampleSizes)
예제 #2
0
def trainClassifierOnTrainingDataReturnAll(numberOfTrainingExamples = -1):
    """Train a classifier and return it along with the data it was fitted on.

    Args:
        numberOfTrainingExamples: Forwarded unchanged to
            dataReader.getTrainData. Defaults to -1.
            NOTE(review): -1 presumably means "no limit" -- confirm in
            dataReader.

    Returns:
        A tuple (classifier, xTrain, yTrain): the object returned by
        classifierSelector.trainClassifier and the two values produced by
        featureExtractor.getRegularFeatures that were used to train it.
    """
    rawData = dataReader.getTrainData(numberOfTrainingExamples)

    # Feature engineering: numeric target first, then the feature/label pair.
    numericData = featureExtractor.convertTargetFeatureToNumeric(rawData)
    features, labels = featureExtractor.getRegularFeatures(numericData, True)

    # Classifier training.
    model = classifierSelector.trainClassifier(features, labels)

    return model, features, labels
예제 #3
0
def calculateValidationCurve():
    """Build a 50000-example training set and pass it to plot_validation_curve.

    The gradient boosting classifier's "learning_rate" parameter is the one
    handed to plot_validation_curve, with the candidate values
    0.1, 0.13 and 0.16.

    Returns:
        None.
    """
    model = classifierSelector.constructGradientBoostingClassifier()

    sampleCount = 50000
    rawData = dataReader.getTrainData(sampleCount)

    # Feature engineering. The literal True is forwarded as-is to
    # getRegularFeatures; its meaning is defined there.
    numericData = featureExtractor.convertTargetFeatureToNumeric(rawData)
    features, labels = featureExtractor.getRegularFeatures(numericData, True)

    # Candidate values for the parameter named below.
    learningRates = [0.1, 0.13, 0.16]

    plot_validation_curve(model, features, labels, "learning_rate", learningRates)
예제 #4
0
def trainClassifierOnTrainingData(trainData=None, numberOfTrainingExamples = -1, margins=None):
    """Train a classifier on the given data, loading the data first if needed.

    Args:
        trainData: Training data to use. When None, the data is loaded via
            dataReader.getTrainData(numberOfTrainingExamples, margins).
        numberOfTrainingExamples: Only used when trainData is None; forwarded
            unchanged to dataReader.getTrainData. Defaults to -1.
            NOTE(review): -1 presumably means "no limit" -- confirm in
            dataReader.
        margins: Only used when trainData is None; forwarded unchanged to
            dataReader.getTrainData.

    Returns:
        The object returned by classifierSelector.trainClassifier.
    """
    if trainData is None:
        # Nothing supplied by the caller: read the data ourselves.
        trainData = dataReader.getTrainData(numberOfTrainingExamples, margins)

    # Feature engineering: numeric target first, then the feature/label pair.
    numericData = regularFeatExtr.convertTargetFeatureToNumeric(trainData)
    features, labels = regularFeatExtr.getRegularFeatures(numericData, True)

    # Classifier training.
    return classifierSelector.trainClassifier(features, labels)
예제 #5
0
def testGeneralPerformanceUsingCrossValidationScore():
    """Score the gradient boosting classifier on one stratified shuffle split.

    Loads trainSize + testSize examples through constructTrainingData, builds
    a single StratifiedShuffleSplit over the labels (50000 examples for
    fitting, 100000 for scoring), runs cross_val_score with the "log_loss"
    scorer on all available cores (n_jobs=-1), and prints the mean of the
    returned scores.

    NOTE(review): this uses the legacy scikit-learn call style
    (StratifiedShuffleSplit taking the labels and n_iter, and the "log_loss"
    scorer name) -- confirm it matches the installed scikit-learn version.

    Returns:
        None. The mean score is written to standard output.
    """
    # Sizes of the two sides of the split. The number of examples to load is
    # derived from them so the three values cannot drift apart.
    trainSize = 50000
    testSize = 100000
    trainDataSize = trainSize + testSize

    # Alternatives previously tried here:
    #   classifierSelector.constructRandomForestClassifier()
    #   SVC(verbose=1)
    classifier = classifierSelector.constructGradientBoostingClassifier()

    xTrain, yTrain = constructTrainingData(trainDataSize)

    cv = StratifiedShuffleSplit(yTrain, n_iter=1, train_size=trainSize, test_size=testSize)

    cv_scores = cross_val_score(classifier, xTrain, yTrain, cv=cv, n_jobs=-1,
                                scoring="log_loss", verbose=1)

    scoreMean = cv_scores.mean()

    # Parenthesized single-argument form: prints the same text under the
    # Python 2 print statement and the Python 3 print function.
    print("Mean score is {}".format(scoreMean))