def calculateLearningCurve():
    """Plot a learning curve for the gradient boosting classifier.

    Builds an (unfitted) classifier via ``classifierSelector``, loads the
    training data with ``dataReader.getTrainData()`` using its default
    arguments, runs the feature-engineering helpers on it, and hands the
    result to ``plot_learning_curve`` together with five evenly spaced
    training-set sizes (100000, 200000, ..., 500000).

    Returns nothing; the output is whatever ``plot_learning_curve`` produces.
    """
    model = classifierSelector.constructGradientBoostingClassifier()

    # Feature engineering: make the target numeric, then split the data into
    # a feature part and a target part.
    # NOTE(review): the meaning of the ``True`` flag is defined by
    # getRegularFeatures -- confirm against that helper.
    numericData = featureExtractor.convertTargetFeatureToNumeric(
        dataReader.getTrainData()
    )
    features, targets = featureExtractor.getRegularFeatures(numericData, True)

    # Integer sample counts at which the curve is evaluated.
    sampleCounts = np.linspace(100000, 500000, num=5, dtype=int)
    plot_learning_curve(model, features, targets, sampleCounts)
def trainClassifierOnTrainingDataReturnAll(numberOfTrainingExamples=-1):
    """Train a classifier and return it together with its training inputs.

    Args:
        numberOfTrainingExamples: Forwarded unchanged to
            ``dataReader.getTrainData``. Defaults to -1.
            NOTE(review): -1 presumably means "no limit" -- confirm in
            dataReader.

    Returns:
        A 3-tuple ``(classifier, xTrain, yTrain)``: the object returned by
        ``classifierSelector.trainClassifier`` and the feature/target pair it
        was trained on.
    """
    rawData = dataReader.getTrainData(numberOfTrainingExamples)

    # Feature engineering: numeric target first, then the feature/target split.
    numericData = featureExtractor.convertTargetFeatureToNumeric(rawData)
    features, targets = featureExtractor.getRegularFeatures(numericData, True)

    # Classifier training.
    return classifierSelector.trainClassifier(features, targets), features, targets
def calculateValidationCurve():
    """Plot a validation curve over ``learning_rate`` for gradient boosting.

    Builds an (unfitted) classifier via ``classifierSelector``, requests
    training data from ``dataReader.getTrainData(50000)``, runs the
    feature-engineering helpers on it, and calls ``plot_validation_curve``
    for the candidate values 0.1, 0.13 and 0.16.

    Returns nothing; the output is whatever ``plot_validation_curve`` produces.
    """
    model = classifierSelector.constructGradientBoostingClassifier()

    sampleCount = 50000
    rawData = dataReader.getTrainData(sampleCount)

    # Feature engineering: numeric target first, then the feature/target split.
    numericData = featureExtractor.convertTargetFeatureToNumeric(rawData)
    features, targets = featureExtractor.getRegularFeatures(numericData, True)

    # Hyper-parameter name and the values to sweep.
    swept = "learning_rate"
    candidates = [0.1, 0.13, 0.16]
    plot_validation_curve(model, features, targets, swept, candidates)
def trainClassifierOnTrainingData(trainData=None, numberOfTrainingExamples=-1, margins=None):
    """Train a classifier on supplied or freshly loaded training data.

    Args:
        trainData: Training data to use. When ``None``, the data is loaded
            with ``dataReader.getTrainData(numberOfTrainingExamples, margins)``.
        numberOfTrainingExamples: Only used when ``trainData`` is ``None``;
            forwarded to ``dataReader.getTrainData``. Defaults to -1.
        margins: Only used when ``trainData`` is ``None``; forwarded to
            ``dataReader.getTrainData``.

    Returns:
        The object returned by ``classifierSelector.trainClassifier``.
    """
    if trainData is not None:
        sourceData = trainData
    else:
        sourceData = dataReader.getTrainData(numberOfTrainingExamples, margins)

    # Feature engineering.
    # NOTE(review): this function goes through ``regularFeatExtr`` while the
    # sibling training helpers use ``featureExtractor`` -- confirm both names
    # are imported and that the difference is intentional.
    numericData = regularFeatExtr.convertTargetFeatureToNumeric(sourceData)
    features, targets = regularFeatExtr.getRegularFeatures(numericData, True)

    # Classifier training.
    return classifierSelector.trainClassifier(features, targets)
def testGeneralPerformanceUsingCrossValidationScore():
    """Score the gradient boosting classifier on one stratified split.

    Builds 150000 training examples with ``constructTrainingData``, creates a
    single stratified shuffle split (50000 train / 100000 test), evaluates
    the classifier with ``cross_val_score`` using the ``"log_loss"`` scorer,
    and prints the mean of the returned scores.

    Returns nothing; the result is only printed.
    """
    # An earlier configuration used 35000 examples (28k train / 7k test).
    trainDataSize = 150000

    # Alternatives previously tried in place of gradient boosting:
    # classifierSelector.constructRandomForestClassifier() and SVC(verbose=1).
    classifier = classifierSelector.constructGradientBoostingClassifier()

    xTrain, yTrain = constructTrainingData(trainDataSize)

    # NOTE(review): this call shape (labels as first argument, ``n_iter``)
    # looks like the legacy sklearn.cross_validation API -- confirm against
    # the file's imports before upgrading scikit-learn.
    cv = StratifiedShuffleSplit(
        yTrain, n_iter=1, train_size=50000, test_size=100000
    )
    cv_scores = cross_val_score(
        classifier,
        xTrain,
        yTrain,
        cv=cv,
        n_jobs=-1,
        scoring="log_loss",
        verbose=1,
    )

    scoreMean = cv_scores.mean()
    # Single-argument parenthesised form prints identically under the
    # Python 2 print statement and the Python 3 print function.
    print("Mean score is {}".format(scoreMean))