def runOnFullData(trainFile, testFile=None, NO_OF_EXECUTION=1, flag=None):
    """
    Desc : Run all classifiers on the full data based on the model selected.
    Args:
        trainFile: train file path
        testFile:  test file path
        NO_OF_EXECUTION: Number of executions (default=1)
        flag: flag == 1 -- Model_1 (business features),
              otherwise  -- Model_2 (sentiment features)

    Returns: None

    """
    if flag == 1:
        ### Business Features
        TrainDataset = loadData(trainFile)
        # BUG FIX: the test set was previously loaded from trainFile, so every
        # model was evaluated on its own training data.
        TestDataset = loadData(testFile)
    else:
        ### Sentiment Features
        TrainDataset = loadDataSenti(trainFile)
        TestDataset = loadDataSenti(testFile)
    # print(x) with a single argument behaves identically in Python 2 and 3;
    # the original print('...').format(...) only worked via Python 2 parsing.
    print('\n Train dataset after loading : {0}'.format(TrainDataset.shape))
    print('\n Test dataset after loading : {0}'.format(TestDataset.shape))
    # Last column is the target; everything before it is a feature.
    No_Of_Cols = TrainDataset.shape[1] - 1
    Xtrain = TrainDataset[:, 0:No_Of_Cols]
    Ytrain = TrainDataset[:, No_Of_Cols]
    Xtest = TestDataset[:, 0:No_Of_Cols]
    Ytest = TestDataset[:, No_Of_Cols]
    classalgs = {#'Random': algs.Classifier(),
                 'Naive Bayes': algs.NaiveBayes(),
                 'Logistic Regression': algs.LogisticRegression(),
                 'GradientBoostingRegressor': algs.GradientBoost(),
                 'SVM_rbf': algs.SVM(),
                 # BUG FIX: the linear SVM was constructed with the default
                 # kernel (same as SVM_rbf); pass 'linear' explicitly, in the
                 # same style as 'sigmoid' below.
                 'SVM_linear': algs.SVM('linear'),
                 'SVM_Sigmoid': algs.SVM('sigmoid'),
                 'DecisionTreeRegressor': algs.DecisionTreeReg(),
                 'DecisionTreeClassifier': algs.DecisionTreeClassifier(),
                 'RandomForestRegressor': algs.RandForest()
                 }
    # .items() (not the Python-2-only .iteritems()) keeps this loop portable.
    for learnername, learner in classalgs.items():
        print('Running learner = {0}'.format(learnername))
        # Train model
        learner.learn(Xtrain, Ytrain)
        # Test model
        predictions = learner.predict(Xtest)
        # Report accuracy, RMSE and R2 for each learner.
        accuracy = getAccuracy(Ytest, predictions)
        print('Accuracy for ' + learnername + ': ' + str(accuracy))
        accuracy = RMSE_Evaluation(Ytest, predictions)
        print('RMSE Error for ' + learnername + ': ' + str(accuracy))
        accuracy = r_square_Evaluation(Ytest, predictions)
        print('R2 Score for ' + learnername + ': ' + str(accuracy))
    def runClassifiers(self,dataset,testDataset):
        """
        Run several classifiers on a pre-split dataset and write predictions
        to per-learner CSV files.

        NOTE(review): this def is nested inside runOnFullData and never called
        there; it also takes `self` although no class is in scope, and the
        body references several undefined names (see notes below). It appears
        to be dead/truncated code pasted from a class method -- confirm before
        relying on it.

        Args:
            dataset: training array; last column is the target.
            testDataset: feature array to predict on.
        """
        #trainset, testset = self.splitdataset(dataset)
        # Last column of the training array is the target.
        numinputs = dataset.shape[1]-1
        Xtrain = dataset[:,0:numinputs]
        ytrain = dataset[:,numinputs]
        # NOTE(review): print('...').format(...) only works under Python 2's
        # print-statement parsing; under Python 3 it raises AttributeError.
        print('Split into train={0} and test={1} ').format(Xtrain.shape, testDataset.shape)
        # Hyper-parameter sweep: logistic regression over C, Gaussian SVM over C.
        classalgs = {   'Logistic Regression_10' : algs.LogisticRegression(C=10),
                        'Logistic Regression_1' : algs.LogisticRegression(C=1),
                        'Logistic Regression_.1' : algs.LogisticRegression(C=0.1),
                        'Logistic Regression_.01' : algs.LogisticRegression(C=0.01),
                        'Logistic Regression_.001' : algs.LogisticRegression(C=0.001),
                        #'GradientBoostingClassifier_10' : algs.GradientBoost(n_estimators=10),
                         #'GradientBoostingClassifier_5' : algs.GradientBoost(n_estimators=5),
                         #'GradientBoostingClassifier_4' : algs.GradientBoost(n_estimators=4),
                         #'GradientBoostingClassifier_3' : algs.GradientBoost(n_estimators=3),
                         #'GradientBoostingClassifier_2' : algs.GradientBoost(n_estimators=2),
                         'Gauassian SVM_300' :algs.SVM(C=300),
                         'Gauassian SVM_200' :algs.SVM(C=200),
                         'Gauassian SVM_100' :algs.SVM(C=100),
                         'Gauassian SVM_50' :algs.SVM(C=50),
                         'Gauassian SVM_20' :algs.SVM(C=20),
                         #'Neural Net_4':algs.backPropogation((Xtrain.shape[1],4,1)),
                         #'Neural Net_8':algs.backPropogation((Xtrain.shape[1],8,1)),
                         #'Neural Net_16':algs.backPropogation((Xtrain.shape[1],16,1)),
                         #'Neural Net_32':algs.backPropogation((Xtrain.shape[1],32,1)),
                         #'Neural Net_64':algs.backPropogation((Xtrain.shape[1],64,1))
                         }

        # Runs all the algorithms on the data and print out results
        for learnername, learner in classalgs.iteritems():
            print 'Running learner = ' + learnername
            # Train model
            #learner.featureSelection(trainset[0])
            learner.learn(Xtrain, ytrain)
            # Test model
            predictions = learner.predict(testDataset)
            self.prediction=predictions
            print predictions
            #accuracy = util.getaccuracy(testset[1], predictions)
            #print 'Accuracy for ' + learnername + ': ' + str(accuracy)
            # Persist this learner's predictions to its own CSV file.
            fileName='output_'+learnername+'.csv'
            self.writeFile(fileName)
            # NOTE(review): 'test' is undefined in this scope -- presumably
            # testDataset was meant; this line would raise NameError as-is.
            Ytest = test[:,numinputs]
            # NOTE(review): 'Ytrain' and 'Xtest' are also undefined here (the
            # locals above are 'ytrain' and 'testDataset') -- verify intent.
            print('\n XTrain : {0} \n YTrain : {1} \n Xtest : {2} \n YTest : {3}').format(Xtrain.shape,Ytrain.shape,Xtest.shape,Ytest.shape)
            ### Run the classifiers
            # NOTE(review): classalgs is rebound inside the loop that iterates
            # over it -- this looks like a second pasted variant, not intent.
            classalgs = {#'Logistic Regression_10' : algs.LogisticRegression(C=10),
                        #'Logistic Regression_1' : algs.LogisticRegression(C=1),
                        'Logistic Regression_.1' : algs.LogisticRegression(C=0.1),
                        #'Logistic Regression_.01' : algs.LogisticRegression(C=0.01),
                        #'Logistic Regression_.001' : algs.LogisticRegression(C=0.001),
                         #'GradientBoostingClassifier_10' : algs.GradientBoost(n_estimators=10),
                         #'GradientBoostingClassifier_5' : algs.GradientBoost(n_estimators=5),
                         #'GradientBoostingClassifier_4' : algs.GradientBoost(n_estimators=4),
                         #'GradientBoostingClassifier_3' : algs.GradientBoost(n_estimators=3),
                         #'GradientBoostingClassifier_2' : algs.GradientBoost(n_estimators=2),
                         #'Gauassian SVM_300' :algs.SVM(C=300),
                         #'Gauassian SVM_200' :algs.SVM(C=200),
                         'Gauassian SVM_100' :algs.SVM(C=100),
                         #'Gauassian SVM_50' :algs.SVM(C=50),
                         #'Gauassian SVM_20' :algs.SVM(C=20),
                         #'Neural Net_4':algs.backPropogation((Xtrain.shape[1],4,1)),
                         #'Neural Net_8':algs.backPropogation((Xtrain.shape[1],8,1)),
                         #'Neural Net_16':algs.backPropogation((Xtrain.shape[1],16,1)),
                         #'Neural Net_32':algs.backPropogation((Xtrain.shape[1],32,1)),
                         #'Neural Net_64':algs.backPropogation((Xtrain.shape[1],64,1))
                          }

            accuracyD={}
            for learnername, learner in classalgs.iteritems():
                print 'Running learner = ' + learnername
                learner.learn(Xtrain, Ytrain)
                predictions = learner.predict(Xtest)
                # NOTE(review): 'util' is not visibly imported, 'recall' is
                # never used, and the body appears truncated here.
                recall = util.getRecall(Ytest, predictions)
# --- Example #4 (scrape artifact: the text "Example #4" / score "0" separated
# two unrelated code snippets; kept as a comment so the file stays parseable) ---
def execute_with_algorithm(alg, X, y, fname, headers, out_dir, record_id, target_id, feature_selection):
	'''Execute a learning task using the specified algorithm.

	Args:
		alg: one of 'DT', 'RF', 'RFsmall', 'SVM', 'LR' selecting the model.
		X, y: feature matrix and target vector.
		fname: base name used for all output files.
		headers: column names aligned with the columns of X.
		out_dir: directory where result/feature/correlation files are written.
		record_id, target_id: accepted for interface compatibility; unused here.
		feature_selection: if truthy, keep the (up to) k=30 features with the
			highest absolute Pearson correlation to y, skipping features whose
			|correlation| with an already-selected feature is >= 0.7.

	Returns:
		(model, best_features, [fname, fpr, tpr, auc-ish results[0:3]]) on
		success, or None when the algorithm produced no results (or alg is
		unrecognized).
	'''
	# feature selection
	k = 30
	#k = 100000
	if feature_selection:
		print('  ...performing feature selection')
		if X.shape[1] < k:
			k = X.shape[1]

		# Absolute correlations drive the ranking; the signed values are kept
		# only for the report written below.
		pearsons = []
		pearsons_print = []
		for i in range(X.shape[1]):
			if sum(np.asarray(X[:, i])) != 0:
				p = pearsonr(np.squeeze(np.asarray(X[:, i])), y)
				pearsons.append(abs(p[0]))
				pearsons_print.append(p[0])
			else:
				# All-zero columns carry no signal; rank them last.
				pearsons.append(0)
				pearsons_print.append(0)

		# Feature indices ordered by descending |r|.
		sorted_features = np.array(pearsons).argsort()[:][::-1]

		# Greedily pick features by descending |r|, skipping any candidate that
		# is strongly correlated (|r| >= 0.7) with one already chosen.
		best_features = []
		remove_list = []
		i = 0
		# BUG FIX: also stop when the candidate list is exhausted; previously
		# this loop could index past the end of sorted_features and raise
		# IndexError when too many features were eliminated as redundant.
		while len(best_features) < k and i < len(sorted_features):
			if not i in remove_list:
				best_features.append(sorted_features[i])
				for j in range(i, X.shape[1]):
					p = pearsonr(np.asarray(X[:, sorted_features[i]]).tolist(), np.asarray(X[:, sorted_features[j]]).tolist())
					if abs(p[0]) >= 0.7:
						remove_list.append(j)
			i += 1

		# Write a small report of the selected features and their signed
		# correlations, rounded to two decimals.
		old_headers = list(headers)
		headers = [headers[i] for i in best_features]
		f = open(out_dir + "correlations_" + fname + '.csv', 'w')
		for header in headers:
			f.write(str(header) + ' & ' + str(float("{0:.2f}".format(pearsons_print[old_headers.index(header)]))) + '\n')
		f.close()
		new_X = X[:, best_features]

	else:
		new_X = X
		best_features = 'all'

	print(alg)

	# execute algorithm
	# BUG FIX: initialize these so an unrecognized alg falls through to the
	# 'no results' return below instead of raising NameError. `features` also
	# replaces the fragile `'features' in locals()` check.
	results = None
	model = None
	features = None
	if alg == 'DT':
		results, model = ML.CART(new_X, y, best_features, out_dir + "{}.dot".format(fname), headers)
	elif alg == 'RF':
		results, features, model = ML.RF(new_X, y, best_features, n_estimators=100)
	elif alg == 'RFsmall':
		results, features, model = ML.RF(new_X, y, best_features, n_estimators=10)
	elif alg == 'SVM':
		results, model = ML.SVM(new_X, y, best_features)
	elif alg == 'LR':
		results, features, model = ML.LR(new_X, y, best_features)

	if not results:
		return

	# export results
	in_out.save_results(out_dir + fname + '.csv', ["fpr", "tpr", "auc", "cm"], results, [sum(y), len(y)])
	# Only RF/RFsmall/LR produce per-feature importances.
	if features is not None:
		features = features.flatten()
		in_out.save_features(out_dir + "features_" + fname + '.csv', zip(headers[1:-1], features))

	return model, best_features, [fname] + results[0:3]
def runClassifier(trainFile, testFile=None, NO_OF_EXECUTION=1, flag=None):
    """
    Desc : K-fold cross validation followed by classifier executions.
           Displays 'Accuracy', 'RMSE score' and 'R2-Score' for each fold.
    Args:
        trainFile: train file path
        testFile:  test file path (unused here; kept for interface parity)
        NO_OF_EXECUTION: Number of executions (default=1)
        flag: flag == 1 -- Model_1 (business features),
              otherwise  -- Model_2 (sentiment features)

    Returns: None

    """
    if flag == 1:
        ### Business Features
        dataset = loadData(trainFile)
    else:
        ### Sentiment Features
        dataset = loadDataSenti(trainFile)
    # print(x) with a single argument behaves identically in Python 2 and 3;
    # the original print('...').format(...) only worked via Python 2 parsing.
    print('\n Size of dataset after loading : {0}'.format(dataset.shape))
    kf = KfoldCrossValidation(dataset)
    i = 1
    for train_index, test_index in kf:
        ### For Each Fold
        print('\n For Fold : {0}'.format(i))
        i = i + 1
        X_train, X_test = dataset[train_index], dataset[test_index]

        print('\n Train : {0} \n Test : {1}'.format(X_train.shape, X_test.shape))

        # Split each fold into (features, target) pairs.
        trainset, testset = getSplitNew(X_train, X_test)

        print('Running  on train={0} and test={1} samples'.format(trainset[0].shape, testset[0].shape))

        classalgs = {#'Random': algs.Classifier(),
                     'Naive Bayes': algs.NaiveBayes(),
                     'Logistic Regression': algs.LogisticRegression(),
                     'GradientBoostingRegressor': algs.GradientBoost(),
                     'SVM_rbf': algs.SVM(),
                     # BUG FIX: the linear SVM was constructed with the default
                     # kernel (same as SVM_rbf); pass 'linear' explicitly, in
                     # the same style as 'sigmoid' below.
                     'SVM_linear': algs.SVM('linear'),
                     'SVM_Sigmoid': algs.SVM('sigmoid'),
                     'DecisionTreeRegressor': algs.DecisionTreeReg(),
                     'DecisionTreeClassifier': algs.DecisionTreeClassifier(),
                     'RandomForestRegressor': algs.RandForest()
                     }
        # .items() (not the Python-2-only .iteritems()) keeps this loop portable.
        for learnername, learner in classalgs.items():
            print('Running learner = {0}'.format(learnername))
            # Train model
            learner.learn(trainset[0], trainset[1])
            # Test model
            predictions = learner.predict(testset[0])
            # Report accuracy, RMSE and R2 for each learner.
            accuracy = getAccuracy(testset[1], predictions)
            print('Accuracy for ' + learnername + ': ' + str(accuracy))
            accuracy = RMSE_Evaluation(testset[1], predictions)
            print('RMSE Error for ' + learnername + ': ' + str(accuracy))
            accuracy = r_square_Evaluation(testset[1], predictions)
            print('R2 Score for ' + learnername + ': ' + str(accuracy))