Beispiel #1
0
def main():
    """Train a random forest on a CSV data set and score a second CSV file.

    Command-line options:
        --train_dataset / -i: path of the training CSV file.
        --target / -y: name of the label column in the training CSV
            (defaults to "target").

    The training file is split 70/30; the forest is fitted on the larger part
    and the hold-out part is passed to ``cal_score_accuracy``. Class
    probabilities are then computed for the hard-coded evaluation file.
    """
    split = 0.3
    # Feature columns shared by the training file and the evaluation file.
    feature_columns = [
        "id", "cpu", "creator", "dbs", "dtype", "era", "nblk", "nevt",
        "nfiles", "nlumis", "nrel", "nsites", "nusers", "parent", "primds",
        "proc_evts", "procds", "rnaccess", "rnusers", "rtotcpu", "size",
        "tier", "totcpu", "wct",
    ]

    p = optparse.OptionParser()
    # Training data set.
    p.add_option('--train_dataset', '-i', default='/afs/cern.ch/user/s/sganju/private/2014_target.csv')
    # Label column.
    p.add_option('--target', '-y', default="target")
    options, arguments = p.parse_args()

    train = pd.read_csv(options.train_dataset)
    data = train[feature_columns]
    # Honour the --target option; it used to be parsed and then ignored in
    # favour of a hard-coded 'target' column.
    target = train[options.target]

    # Hold out a fixed share of the rows to estimate accuracy.
    features_train, features_test, target_train, target_test = train_test_split(
        data, target, test_size=split, random_state=0)

    rf = RandomForestClassifier(n_estimators=100)
    rf = rf.fit(features_train, target_train)
    cal_score_accuracy("RANDOM FOREST CLASSIFIER", rf, features_test, target_test)

    # Predict class probabilities for the separate evaluation file.
    test = pd.read_csv('dataframe-20130101-20130107-TARGET.csv')
    test = test[feature_columns]
    # NOTE(review): the probabilities are computed but not used or returned.
    predictions = rf.predict_proba(test)
def init_turns_module(values, trees, data, labels):
    # Fit regression model
    global turns_regr
    turns_regr = RandomForestClassifier(n_estimators=trees)
    turns_regr.fit(data[:, [0,1]], labels)
    print "init_turns, importances: ", turns_regr.feature_importances_
    return
Beispiel #3
0
def prediction_confusion_matrix():
    df=get_data()
    X=df.ix[:, (df.columns !='class') & (df.columns !='code')].as_matrix() #this gives a numpy
    print X
    y1=df.ix[:,df.columns=='class'].as_matrix()
    y=y1.reshape(683,1)
    Y=binary(y)
    #split into test and training sets

    features_train, features_test, outcome_train, outcome_test = cv.train_test_split(X, Y, test_size=0.4,random_state=1)

    #Random Forest Classifier for the classification.
    forest=RandomForestClassifier(n_estimators=10,min_samples_leaf= 10, criterion=
    'gini', max_features= 'auto', max_depth= None)
    forest=forest.fit(features_train,outcome_train)
    predicted=forest.predict(features_test)

    #Confusion_matrix
    confusion_matrix=metrics.confusion_matrix(outcome_test,predicted)

    output ={
          'Random Forest Classifier':

          {
            'fp':confusion_matrix[0][1],
            'tp':confusion_matrix[1][1],
            'fn':confusion_matrix[1][0],
            'tn':confusion_matrix[0][0]
          }
    }
    return jsonify(output)
Beispiel #4
0
def rand_forest(train_bow,train_labels,test_bow,test_labels,bow_indexes):
    """Fit a default random forest on the training set and pass it to ``test``.

    The fitted model is handed to ``test`` under the tag "rf" together with
    the test features, test labels and ``bow_indexes``.
    """
    print("Training rndForest")
    forest = RandomForestClassifier()
    forest.fit(train_bow, train_labels)

    print("Testing rndForest")
    test(forest, "rf", test_bow, test_labels, bow_indexes)
 def __init__(self, data, classes, tree_features, n_trees=100):
     """Fit ``n_trees`` single-tree forests, each on a random subset of columns.

     data: 2-D float array that may contain NaN values.
     classes: labels, one per row of ``data``.
     tree_features: number of columns sampled for each tree; capped at the
         number of columns left after cleaning.
     n_trees: number of trees to fit.

     Columns that are NaN in every row and rows that are NaN in every
     remaining column are removed first. For each tree, rows with a NaN in
     any of the sampled columns are left out of that tree's training set.
     The sampled column indices are stored in ``self.col_list`` and the
     fitted models in ``self.bags``.
     """
     n_rows = np.shape(data)[0]
     # Drop columns that are NaN in every row.
     data = data[:, np.sum(np.isnan(data), 0) < n_rows]
     self.n_features = np.shape(data)[1]

     # Drop rows that are NaN in every remaining column. The labels are
     # filtered with the same mask so they stay aligned with the rows; the
     # per-tree mask below is computed from the filtered data.
     keep_rows = np.sum(np.isnan(data), 1) < self.n_features
     data = data[keep_rows, :]
     classes = np.asarray(classes)[keep_rows]
     self.n_rows = np.shape(data)[0]

     tree_features = min(tree_features, self.n_features)

     self.col_list = np.zeros((n_trees, tree_features), dtype='int')
     self.n_trees = n_trees
     self.bags = []
     for i in range(n_trees):
         cols = sorted(sample(range(self.n_features), tree_features))
         self.col_list[i, :] = cols
         data_temp = data[:, cols]
         # Keep only the rows that are complete for this column subset.
         complete = np.sum(np.isnan(data_temp), 1) == 0
         data_temp = data_temp[complete, :]
         classes_temp = classes[complete]
         bag = RandomForestClassifier(n_estimators=1, max_features=tree_features)
         bag.fit(data_temp, classes_temp)
         self.bags.append(bag)
         print(np.shape(data_temp))
def train_and_predict():
    """Fit one random forest and one XGBoost classifier per label column.

    Reads ``config.X``, ``config.Y`` and ``config.X_test``, rebinding each to
    a NumPy array. For every column of ``config.Y`` both models are fitted on
    ``config.X`` and their predictions for ``config.X_test`` are appended to
    ``config.Y_pred_rf`` and ``config.Y_pred_xgb``. Progress is printed.
    """
    print('Converting data...')

    config.X = np.array(config.X)
    config.Y = np.array(config.Y)
    config.X_test = np.array(config.X_test)
    print('Training...')
    print('Time Elapsed: ' + str((time.time() - config.start_time)/60))

    # Take the number of label columns from the array shape. Indexing row 1
    # to count them raises IndexError when there is only one training row.
    num_classes = config.Y.shape[1]

    for i in range(num_classes):
        print('Creating Classifier: ', i)
        rf = RandomForestClassifier(n_estimators=500, max_depth=5, n_jobs=-1, oob_score=True, verbose=2, criterion="entropy")
        gbm = xgb.XGBClassifier(n_estimators=500, objective='binary:logistic')

        print('Fitting Random Forest Classifier: ', i)
        rf.fit(config.X, config.Y[:, i])

        print('Fitting With XGBoost Classifier: ', i)
        gbm.fit(config.X, config.Y[:, i])

        print('Getting Random Forest Predictions for attribute: ', i)
        y_pred_rf = rf.predict(config.X_test)
        config.Y_pred_rf.append(y_pred_rf)
        print(y_pred_rf)

        print('Getting XGBoost Predictions for attribute: ', i)
        y_pred_xgb = gbm.predict(config.X_test)
        config.Y_pred_xgb.append(y_pred_xgb)
        print(y_pred_xgb)
Beispiel #7
0
def fit_rf(path, index_filter=None, class_filter=None, feature_filter=None, folds=10,
           inverse=False, lc_filter=None):
    """
    Cross-validate a random forest on the data set stored at ``path``.

    path: Location of the data set used for training
    index_filter: Pandas index selecting the rows of the data set to use
    class_filter: List of classes to use
    feature_filter: List of features to use
    folds: Number of stratified folds
    inverse: When true, each fold's train and test indices are swapped
    lc_filter: Forwarded to ``utils.filter_data``

    Returns the concatenation of the ``metrics.predict_table`` results of all
    folds.
    """
    data = pd.read_csv(path, index_col=0)
    data, y = utils.filter_data(data, index_filter, class_filter, feature_filter, lc_filter)

    fold_tables = []
    for train_index, test_index in cross_validation.StratifiedKFold(y, n_folds=folds):
        if inverse:
            train_index, test_index = test_index, train_index

        train_X = data.iloc[train_index]
        test_X = data.iloc[test_index]
        train_y = y.iloc[train_index]
        test_y = y.iloc[test_index]

        clf = RandomForestClassifier(n_estimators=100, criterion='entropy', max_depth=14,
                                     min_samples_split=5)
        clf.fit(train_X, train_y)
        fold_tables.append(metrics.predict_table(clf, test_X, test_y))

    return pd.concat(fold_tables)
Beispiel #8
0
def crossval_roc(X, y):
    """Plot the mean ROC curve of a default random forest over 10 stratified folds.

    X: feature array indexable by the fold index arrays.
    y: label array; column 1 of ``predict_proba`` is used as the score.

    Each fold's ROC curve is interpolated onto a common 100-point
    false-positive-rate grid; the curves are averaged and the mean curve is
    drawn with ``plt.plot``.

    Returns:
        The value returned by ``plt.plot``.
    """
    cv = StratifiedKFold(y, n_folds=10)
    clf = RandomForestClassifier()

    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)

    for train, test in cv:
        probas_ = clf.fit(X[train], y[train]).predict_proba(X[test])
        # Per-fold ROC curve, resampled onto the shared FPR grid. The per-fold
        # AUC used to be computed here with (y_pred, y_true) swapped and then
        # discarded; that call was dropped.
        fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0

    mean_tpr /= len(cv)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    return plt.plot(mean_fpr, mean_tpr,
                    label='Mean ROC (area = %0.2f)' % mean_auc, lw=1)
def get_randomforest_classifier(X_train, y_train, params=None):
    param_grid = {"max_depth": [4, 5, 6, 7],
                  "max_features": [3, 5],
                  "criterion": ["gini", "entropy"]}
                  
    if params is None:
                  
        log = RandomForestClassifier()
        t = start("training random forest ")
        cv = cross_validation.ShuffleSplit(X_train.shape[0], n_iter=10,test_size=0.2, random_state=123)
        clf = grid_search.GridSearchCV(log, param_grid, cv=cv, n_jobs=4, scoring='roc_auc')
        clf = clf.fit(X_train,y_train)
        report(t, nitems=10*len(param_grid))
        
        print("Best score:{} with scorer {}".format(clf.best_score_, clf.scorer_))
        print "With parameters:"
    
        best_parameters = clf.best_estimator_.get_params()
        for param_name in sorted(param_grid.keys()):
            print '\t%s: %r' % (param_name, best_parameters[param_name]) 
    else:
        clf = RandomForestClassifier(**params)
        clf = clf.fit(X_train,y_train)
        
    return clf
def test_string_labels_refit_false():
    """Check hard and soft voting on string labels with pre-fitted classifiers.

    The module-level ``X`` and ``y`` are used; ``y`` is converted to strings
    and overwritten in three blocks of 50 rows with 'a', 'b' and 'c'.
    """
    np.random.seed(123)
    fitted = [LogisticRegression(), RandomForestClassifier(), GaussianNB()]

    y_str = y.copy().astype(str)
    for first_row, label in ((0, 'a'), (50, 'b'), (100, 'c')):
        y_str[first_row:first_row + 50] = label

    for clf in fitted:
        clf.fit(X, y_str)

    # refit=False: the ensemble is built from the already fitted classifiers.
    for voting in ('hard', 'soft'):
        eclf = EnsembleVoteClassifier(clfs=list(fitted),
                                      voting=voting,
                                      refit=False)
        eclf.fit(X, y_str)
        assert round(eclf.score(X, y_str), 2) == 0.97
def random_forest_classify(train_data,train_label,test_data):
    """Fit a 100-tree forest, predict the test data, save and return the labels.

    The predictions are passed to ``save_result`` with the file name
    'sklearn_random_forest_classify_Result.csv'.
    """
    forest = RandomForestClassifier(n_estimators=100)
    forest.fit(train_data, ravel(train_label))

    predicted = forest.predict(test_data)
    save_result(predicted, 'sklearn_random_forest_classify_Result.csv')
    return predicted
def predict_rf(train_features, test_features, train_labels, test_labels):
  model = RandomForestClassifier(n_estimators=1000)
  model.fit(train_features, train_labels)
  predictions = model.predict(train_features)
  print get_accuracy(predictions, train_labels)
  predictions = model.predict(test_features)
  print get_accuracy(predictions, test_labels)
def train_model_on_gestures(wav_list):

    gestures = {'vattene':0, 'vieniqui':1, 'perfetto':2, 'furbo':3, 'cheduepalle':4,
                    'chevuoi':5, 'daccordo':6, 'seipazzo':7, 'combinato':8, 'freganiente':9, 
                    'ok':10, 'cosatifarei':11, 'basta':12, 'prendere':13, 'noncenepiu':14,
                    'fame':15, 'tantotempo':16, 'buonissimo':17, 'messidaccordo':18, 'sonostufo':19}

    dataX = []
    i = 0
    for wav in wav_list:
        path = re.sub('\_audio.wav$', '', wav)
        print '\n', '##############'
        print path[-25:]
        sample = VideoMat(path, True)
        sk = Skelet(sample)
        rate, data = get_data(wav)
        data_frame = np.asarray(create_features(data, sample.labels, sample.numFrames, sk))
        #print 'data_frame !', data_frame.shape
        #data_frame2 = np.asarray(Head_inter(path, sample.labels).data_frame)
        #data_frame = np.hstack((data_frame, data_frame2))
        dataX += copy.copy(data_frame)
        
        
    # 1 target / 19 * 6 joints infos / 8 Head/Hand distances / 5 Head box = 128 features
    #Train model: Don't use the Head box features, don't really improve the model  
    data_frame = np.asarray(dataX)
    Y = data_frame[:, 0]
    Y = np.asarray([gestures[i] for i in Y])
    X = data_frame[:, 1:]
    X = X.astype(np.float32, copy=False)
    X = X[:, :122] 
    clf = RandomForestClassifier(n_estimators=300, criterion='entropy', min_samples_split=10, 
            min_samples_leaf=1, verbose=2, random_state=1) #n_jobs=2
    clf = clf.fit(X, Y)
    pickle.dump(clf, open('gradient_boosting_model_gestures.pkl','wb'))
def run():
    """Cross-validate a random forest and log mean accuracy and log loss.

    The folds, feature matrix and labels come from ``gen_cv``. For every fold
    a forest is fitted on the training part, and accuracy and log loss on the
    test part are logged; the means over ``skf.n_folds`` are logged at the
    end.
    """
    mean_acc = 0.0
    mean_logloss = 0.0
    skf, X_all, labels = gen_cv()
    # scikit-learn fold iterators yield (train_indices, test_indices); the
    # pair used to be unpacked in the opposite order, so the model was
    # trained on the hold-out fold.
    # NOTE(review): assumes gen_cv returns a scikit-learn style iterator - confirm.
    for fold, (train_index, test_index) in enumerate(skf, start=1):
        logger.info('at fold: {0}'.format(fold))
        logger.info('train samples: {0}, test samples: {1}'.format(len(train_index), len(test_index)))
        X_train, X_test = X_all[train_index], X_all[test_index]
        y_train, y_test = labels[train_index], labels[test_index]
        rfc = RandomForestClassifier(n_jobs=10, random_state=919)
        rfc.fit(X_train, y_train)
        y_test_predicted = rfc.predict(X_test)
        y_test_proba = rfc.predict_proba(X_test)
        acc = accuracy_score(y_test, y_test_predicted)
        logger.info('test data predicted accuracy: {0}'.format(acc))
        # log loss -log P(yt|yp) = -(yt log(yp) + (1 - yt) log(1 - yp))
        logloss = log_loss(y_test, y_test_proba)
        logger.info('log loss at test data: {0}'.format(logloss))
        mean_acc += acc
        mean_logloss += logloss

    n_folds = skf.n_folds
    logger.info('mean acc: {0}'.format(mean_acc / n_folds))
    logger.info('mean log loss: {0}'.format(mean_logloss / n_folds))
 def cls_create(xs, ys):
     """Fit the classifier selected by ``algo`` and store its F1 threshold.

     xs: training features.
     ys: training labels.

     ``algo`` and ``self`` are read from the enclosing scope. The classifier
     is fitted on (xs, ys); its training-set probabilities are passed to
     ``best_threshold_for_f1`` and the three returned values are stored on
     ``self`` as ``threshold``, ``positive`` and ``negative``.

     Returns:
         The fitted classifier.

     Raises:
         ValueError: if ``algo`` is neither "SVM" nor "RF".
     """
     if algo == "SVM":
         classifier = svm.SVC(C = self.parm, probability=True)
     elif algo == "RF":
         classifier = RandomForestClassifier(n_estimators = int(self.parm), criterion='entropy',  n_jobs = 1)
     else:
         # Fail with a clear message instead of an unbound-variable error
         # further down.
         raise ValueError("Unsupported algo: %r" % (algo,))

     # Feature selection is currently disabled; the raw features are used.
     classifier.fit(xs, ys)
     probs = classifier.predict_proba(xs)

     self.threshold, self.positive, self.negative = best_threshold_for_f1(probs, 20, ys)
     return classifier
def brute_force_acc_rd(features_train, labels_train, features_test, labels_test, ids):

    #0.818181818182
    clf = RandomForestClassifier(bootstrap=True,
            criterion='entropy', max_depth=None, max_features=2,
            max_leaf_nodes=16, min_samples_split=10, n_estimators=1000,
            n_jobs=-1, oob_score=False)

    clf = clf.fit(features_train, labels_train)
    # print(clf.best_estimator_)
    pred = clf.predict(features_test)
    acc = accuracy_score(labels_test, pred)
    #print pred
    # if(acc > 0.80):
    #     print acc
    t0 = time.time()
    print acc
    feature_importance = clf.feature_importances_
    # feature_importance = 100.0 * (feature_importance / feature_importance.max())
    # print feature_importance
    if(acc > 0.815):
        data_train.to_csv("data_train{}.tst".format(round(acc,5)), "\t")
        feature_importance = 100.0 * (feature_importance / feature_importance.max())
        print feature_importance

    if(acc > 0.819):
        predictions_file = open("data/canivel_random_forest_819.csv", "wb")
        predictions_file_object = csv.writer(predictions_file)
        predictions_file_object.writerow(["PassengerId", "Survived"])
        predictions_file_object.writerows(zip(ids, pred))
        predictions_file.close()
        print ("NEW FILE !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! YEA!!!!")

    return acc
Beispiel #17
0
    def model_and_predict(self, X_train, y_train, X_test):
        district_idx = self.columns.index('PdDistrict')
        districts = set(X_train[:,district_idx])
        district_ys = {}
        # Grow forest and predict separately for each district's records
        for d in districts:
            district_X_train = X_train[X_train[:, district_idx] == d]
            district_X_train = np.delete(district_X_train, district_idx, 1)
            district_y_train = y_train[X_train[:, district_idx] == d]
            district_X_test = X_test[X_test[:, district_idx] == d]
            district_X_test = np.delete(district_X_test, district_idx, 1)
            print "Growing forest for", d

            # Not saving output in Git so make this deterministic 
            # with random_state
            rf = RandomForestClassifier(n_estimators=self.n_trees, n_jobs=-1,
                                        random_state=782629)
            rf.fit(district_X_train, district_y_train)

            district_ys[d] = list(rf.predict(district_X_test))
            print "Finished", d

        print "All predictions made"

        y_hat = []
        for row in X_test:
            d_ys = district_ys[row[district_idx]]
            y_hat.append(d_ys.pop(0))

        return y_hat
def main():
    """Plot decision surfaces of forests with 1, 2, 4, ... 128 trees on a moons data set."""
    X, y = datasets.make_moons(n_samples=200, shuffle=True, noise=0.1, random_state=None)
    plt.scatter(X[:, 0], X[:, 1], c=y)

    for exponent in range(8):
        n_trees = 2**exponent
        forest = RandomForestClassifier(n_estimators = n_trees)
        forest.fit(X,y)
        plot_surface(forest, X, y)
def buildModel(df):
    """Fit a 50-tree random forest predicting 'arr_del15' from the first ``train_len`` rows.

    df: frame containing 'arr_del15' and the columns listed in the
        module-level ``cols``.

    The three categorical columns are replaced by integer codes, the frame is
    transformed with the module-level encoder ``enc`` and the forest is
    fitted on the dense form of the result.

    Returns:
        The fitted classifier.
    """
    train_y = df['arr_del15'][:train_len]
    # Work on an explicit copy: assigning columns on a slice of df is a
    # chained assignment (SettingWithCopyWarning) and is not guaranteed to
    # leave df untouched.
    train_x = df[cols][:train_len].copy()

    # Transform categorical features into integer codes.
    for column in ('unique_carrier', 'dep_conditions', 'arr_conditions'):
        train_x[column] = pd.factorize(train_x[column])[0]

    pd.set_option('display.max_rows', 500)
    print(train_x)

    train_x = enc.fit_transform(train_x)
    print(train_x.shape)

    # Create Random Forest classifier with 50 trees
    clf_rf = RandomForestClassifier(n_estimators=50, n_jobs=-1)
    clf_rf.fit(train_x.toarray(), train_y)

    print("Model built")
    return clf_rf
 def test_save_prediction(self):
     """Fit a default forest on iris, save its probabilities, then delete the file."""
     model = RandomForestClassifier()
     model.id = get_model_id(model)
     model.fit(self.iris.data, self.iris.target)

     # One int32 index per sample: 0, 1, ..., n_samples - 1.
     n_samples = self.iris.data.shape[0]
     indexes = np.arange(n_samples, dtype=np.int32)

     saving_predict_proba(model, self.iris.data, indexes)
     os.remove('RandomForestClassifier_r0_N__m5_0p0__m4_2__m1_auto__m0_N__m3_1__m2_N__n0_10__b0_1__c1_gini__c0_N_0_149.csv')
Beispiel #21
0
    def __init__(self):
        """Create the classifier selected by ``CLASSIFIER`` and fit it on ``self.train_set``.

        Genuine samples are labelled 1 and forgeries 0. An unsupported
        ``CLASSIFIER`` value raises ``Exception``.
        """
        super(ClassifyDriver, self).__init__()

        if CLASSIFIER not in ("SVM", "GBC", "RFC"):
            raise Exception("Classifier %s not supported" % CLASSIFIER)
        if CLASSIFIER == "SVM":
            self.driver = svm.SVC()
        elif CLASSIFIER == "GBC":
            self.driver = GradientBoostingClassifier(n_estimators=300, max_depth=5, learning_rate=0.05)
        else:
            self.driver = RandomForestClassifier(n_estimators=N_ESTIMATORS, n_jobs=N_JOBS)

        # Collect the per-person training samples.
        genuine_samples = []
        forgery_samples = []
        for sigs in self.train_set:
            genuine, forgery = PersonTraining(sigs).calc_train_set()
            genuine_samples.extend(genuine)
            forgery_samples.extend(forgery)

        # Genuine samples first, forgeries second; labels follow the same order.
        train_x = genuine_samples + forgery_samples
        train_y = [1] * len(genuine_samples) + [0] * len(forgery_samples)

        self.driver.fit(train_x, train_y)
Beispiel #22
0
def rforests(trainx, trainy, test, n_estimators=100, k=5):
    """Fit a random forest and return its top-``k`` predictions for train and test.

    For every row the ``k`` highest-probability column positions of
    ``predict_proba`` are taken, mapped through ``dicts.country`` and ordered
    from most to least probable.

    Returns:
        (forest, pred_train, pred_test), where both prediction objects are
        the result of ``np.fliplr`` on the mapped frame.
    """
    forest = RandomForestClassifier(n_estimators)
    forest.fit(trainx, np.ravel(trainy))

    col_names = ["country_destination_" + str(rank) for rank in range(1, k + 1)]

    def top_k_countries(samples):
        # argsort is ascending, so the last k positions are the k most
        # probable classes, least probable of them first.
        top_positions = np.argsort(forest.predict_proba(samples))[:,-k:]
        # A frame is needed to map the positions back to countries.
        frame = pd.DataFrame(top_positions, columns=col_names)
        for name in col_names:
            frame[name] = frame[name].map(dicts.country)
        # Reverse the columns so the most probable country comes first.
        return np.fliplr(frame)

    pred_train = top_k_countries(trainx)
    pred_test = top_k_countries(test)
    return forest, pred_train, pred_test
def cross_validate():
    """Run a 10-fold cross-validation of a random forest and print the mean log loss.

    The data frame is loaded with ``cu.get_dataframe(train_file)``; for every
    fold, features are extracted, the forest is fitted, probabilities are
    predicted and the result of ``llfun`` is collected.

    NOTE(review): this function looks like an unfinished stub (see the
    placeholder comment and the notes in the loop) - confirm before relying
    on it.
    """
    print("Reading the data")
    data = cu.get_dataframe(train_file)

    print("Cross-Validating")
    # NOTE(review): compute_importances is presumably only accepted by old
    # scikit-learn releases - confirm the targeted version.
    rf = RandomForestClassifier(n_estimators=10,
                                verbose=1,
                                compute_importances=True,
                                n_jobs=2)
    # indices=False: presumably the folds are boolean masks rather than index
    # arrays - verify against the scikit-learn version in use.
    cv = cross_validation.KFold(len(data),
                                k=10,
                                indices=False)
    results = []
    for traincv, testcv in cv:
        print "\t-- cv [%d]"%len(results)
        print "\t","extracting features"
        #...
        # NOTE(review): the fold selector itself is passed as the data to
        # extract features from - confirm this is what extract_features expects.
        feacv = features.extract_features(feature_names,
                                          traincv)
        print "\t","learning"
        # NOTE(review): the labels are the full "OpenStatus" column, not only
        # the rows of the training fold - confirm.
        rf.fit(feacv, data["OpenStatus"])
        print "\t","predicting"
        # NOTE(review): predict_proba receives the fold selector, not features.
        probs = rf.predict_proba(testcv)
        print "\t","evaluating"
        # NOTE(review): `probas` is not assigned anywhere in this function (the
        # predictions above are stored in `probs`), and `target` is not defined
        # here either.
        results.append( llfun(target[testcv],
                              [x["OpenStatus"] for x in probas]) )
    print "LogLoss: " + str( np.array(results).mean() )
Beispiel #24
0
def get_preds(features, trees=3000, depth=19):
    """Train a classifier and a regressor forest and score both on 2016-04-11.

    features: number of latent features the NMF is run with.
    trees: number of estimators of both forests.
    depth: maximum depth of both forests.

    Returns:
        [features, classifier accuracy, regressor mean squared error], which
        shows how well the models do for the given number of features.
    """
    # Training frames.
    df = get_nmf(k=features)
    df_full = add_yahoo_to_df(df)
    # NOTE(review): df_train is never used below; both models are fitted on
    # df_full while the test frame goes through add_dummies - confirm intent.
    df_train = add_dummies(df_full)

    # Test frames; 'data_wednesday' is the folder holding the json data.
    df_test = get_nmf('data_wednesday', k=features)
    df_test_full = add_yahoo_to_df(df_test)
    df_test_full = add_dummies(df_test_full)

    # Fit the classifier.
    X_model_class, y_model_class = get_classifier_data(df_full)
    rf_class = RandomForestClassifier(n_estimators=trees, max_depth=depth)
    rf_class.fit(X_model_class, y_model_class)

    # Fit the regressor.
    X_model_regress, y_model_regress = get_regressor_data(df_full)
    rf_regress = RandomForestRegressor(n_estimators=trees, max_depth=depth)
    rf_regress.fit(X_model_regress, y_model_regress)

    # Evaluation data for the selected day.
    X_classify, y_classify  = get_classifier_data(pd.DataFrame(df_test_full.ix['2016-04-11']))
    X_regress, y_regress = get_regressor_data(pd.DataFrame(df_test_full.ix['2016-04-11']))

    # Score both models.
    classifier_accuracy = accuracy_score(rf_class.predict(X_classify), y_classify)
    regressor_mse = mean_squared_error(rf_regress.predict(X_regress), y_regress)

    return [features, classifier_accuracy, regressor_mse]
Beispiel #25
0
def myforest(train, test, trees=250):
    #Training data prep-------------------------------------------------------------------------------------------
    csv_file_object = csv.reader(open(train, 'rb')) #Load in the training csv file
    header = csv_file_object.next() #Skip the fist line as it is a header
    output_header = header[0:2]
    train_data=[]
    for row in csv_file_object: #Skip through each row in the csv file
        train_data.append(row[1:]) #adding each row to the data variable
    train_data = np.array(train_data) #Then convert from a list to an array

    #Test data prep-----------------------------------------------------------------------------------------------
    test_file_object = csv.reader(open(test, 'rb')) #Load in the test csv file
    header = test_file_object.next() #Skip the fist line as it is a header
    test_data=[] #Create a variable called 'test_data'
    ids = []
    for row in test_file_object: #Skip through each row in the csv file
        ids.append(row[0])
        test_data.append(row[1:]) #adding each row to the data variable
    test_data = np.array(test_data) #Then convert from a list to an array

    #Train the forest
    print 'Training'
    forest = RandomForestClassifier(n_estimators=trees)
    forest = forest.fit(train_data[0::,1::], train_data[0::,0])

    print 'Predicting'
    output = forest.predict(test_data)

    open_file_object = csv.writer(open("result.csv", "wb"))
    open_file_object.writerow([output_header[0],output_header[1]])
    open_file_object.writerows(zip(ids, output))
Beispiel #26
0
def main():

    S, col_names_S = load_data(config.paths.training_data,
                               config.paths.cache_folder)
    Xs, Ys, col_names_S = extract_xy(S, col_names_S)

    a = RandomForestClassifier(n_estimators=1)
    a.fit(Xs.toarray(), Ys.toarray().ravel())
    best_features = a.feature_importances_
    max_ind, max_val = max(enumerate(best_features), key=operator.itemgetter(1))
    print best_features
    print max_ind, max_val

    print Xs.shape
    print Ys.shape
    param_range = [1, 3, 5, 7, 10, 15, 20, 30, 60, 80]
    train_scores, test_scores = validation_curve(RandomForestClassifier(criterion='entropy'), Xs, Ys.toarray().ravel(),
                                                 'n_estimators', param_range)

    print train_scores
    print test_scores
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)

    plt.title("Validation Curve for Random Forest")
    plt.xlabel("Number of Trees")
    plt.ylabel("Score")
    plt.plot(param_range, train_mean, label="Training Score", color='r')
    plt.fill_between(param_range, train_mean - train_std, train_mean + train_std, alpha=0.2, color='r')
    plt.plot(param_range, test_mean, label="Test Score", color='b')
    plt.fill_between(param_range, test_mean - test_std, test_mean + test_std, alpha=0.2, color='b')
    plt.legend(loc="best")
    plt.show()
def Random_Forest_classifier(train_input_data,train_output_data,test_input_data,test_output_data):
    """Plot accuracy against the number of trees (10 to 190 in steps of 10).

    A prediction counts as correct when the true label appears in the
    flattened ``similar_univs`` rows selected by the predicted 'univName'
    (the first flattened value is skipped). The plot is saved to "rf1.png"
    and shown.

    Returns:
        The predictions of the last forest, as a list.
    """
    tree_list = []
    accuracy_percent = []
    for trees in range(10,200,10):
        clf = RandomForestClassifier(trees)
        clf.fit(train_input_data,train_output_data)
        predicted_output = clf.predict(test_input_data)
        if not isinstance(predicted_output, list):
            predicted_output = predicted_output.tolist()
        if not isinstance(test_output_data, list):
            test_output_data = test_output_data.tolist()

        error_list = []
        for predicted, actual in zip(predicted_output, test_output_data):
            matching_rows = similar_univs[similar_univs['univName'] == predicted].values.tolist()
            flattened = [item for sublist in matching_rows for item in sublist]
            # 0 marks a hit, 1 a miss.
            error_list.append(0 if actual in flattened[1:] else 1)

        tree_list.append(trees)
        accuracy_percent.append(100 -((sum(error_list)/float(len(error_list))) * 100))

    tree_list = np.array(tree_list)
    accuracy_percent = np.array(accuracy_percent)
    plt.plot(tree_list,accuracy_percent)
    plt.xlabel('Number of trees')
    plt.ylabel('Percent of accuracy')
    plt.title('Varation of accuracy with trees')
    plt.grid(True)
    plt.savefig("rf1.png")
    plt.show()
    return predicted_output
Beispiel #28
0
def crossValIteration(dat,classes,cutoff,prop=0.9,reshuffle=False):
	if reshuffle:
		dat.samples = sampleReshuffle(dat)
	saved_samples = [i for i in dat.samples]
	dat.samples = ["{0}_$$_{1}".format(i,v) for i,v in enumerate(dat.samples)]
	train,test=dat.splitTraining(prop, classes)
	print test.samples
	selectedSampleIndicies = [int(i.split("_$$_")[0]) for i in test.samples]
	dat.samples = saved_samples
	print test.samples
	test.samples = [i.split("_$$_")[1] for i in test.samples]
	train.samples = [i.split("_$$_")[1] for i in train.samples]
	print "Training set has {0} samples from classes: {1}".format(len(train.samples),",".join(set(train.samples)))
	print "Test set has {0} samples from classes: {1}".format(len(test.samples),",".join(set(test.samples)))
	print "Selecting data..."
	# select features for each disease
	print "Number of selections made for each class:"

	print "Setting up SVM..."
	Xtrain = train.values.transpose()
	Ytrain = train.samples

	clf=RandomForestClassifier(n_estimators=1000)
	clf.fit(Xtrain,Ytrain)

	Xtest = test.values.transpose()
	Ytest = test.samples
	print "Predicting R-forest..."
	#classification results versus actual
	acc = zip(Ytest,clf.predict(Xtest)) # (actual,predicted)... for each sample
	print acc # this is the elemental form of the "result" lists processed below
	print sum([i[0] == i[1] for i in acc])*1.0/len(acc)
	return acc
Beispiel #29
0
 def randomForest_eval_func(self, chromosome):
     """Return the AUC of a random forest for the hyper-parameters in ``chromosome``.

     A result already present in the log is returned directly. Otherwise the
     forest is evaluated fold by fold; when a fold's train and test labels
     together exceed 1000 samples, the loop stops after that fold and the
     accumulated values are not divided by the number of folds. The result
     is written to the log before it is returned.
     """
     n_estimators, max_features, window_size = self.decode_chromosome(chromosome)
     if self.check_log(n_estimators, max_features, window_size):
         return self.get_means_from_log(n_estimators, max_features, window_size)[0]

     folded_dataset = self.create_folded_dataset(window_size)
     indim = 21 * (2 * window_size + 1)
     mean_AUC = 0
     mean_decision_value = 0
     mean_mcc = 0
     large_sample = False
     for test_fold in xrange(self.fold):
         test_labels, test_dataset, train_labels, train_dataset = folded_dataset.get_test_and_training_dataset(test_fold)
         if len(test_labels) + len(train_labels) > 1000:
             large_sample = True
         forest = RandomForestClassifier(n_estimators=n_estimators, max_features=max_features)
         forest.fit(train_dataset, train_labels)
         # Column 1: probability of being binding residue
         decision_values = [row[1] for row in forest.predict_proba(test_dataset)]
         AUC, decision_value_and_max_mcc = validate_performance.calculate_AUC(decision_values, test_labels)
         mean_AUC = mean_AUC + AUC
         mean_decision_value = mean_decision_value + decision_value_and_max_mcc[0]
         mean_mcc = mean_mcc + decision_value_and_max_mcc[1]
         if large_sample:
             break

     if not large_sample:
         mean_AUC = mean_AUC / self.fold
         mean_decision_value = mean_decision_value / self.fold
         mean_mcc = mean_mcc / self.fold

     self.write_log(n_estimators, max_features, window_size, mean_AUC, mean_decision_value, mean_mcc)
     self.add_log(n_estimators, max_features, window_size, mean_AUC, mean_decision_value, mean_mcc)
     return mean_AUC
Beispiel #30
0
def algo(a):
    """Train the classifier selected by ``a`` and score it on the ``week`` data.

    Works on the module-level ``data`` (training frame) and ``week``
    (evaluation frame). Both globals are rebound to their feature-column
    subsets as a side effect, exactly as before. The ``target`` column of
    ``week`` is recomputed row by row with ``convert`` and cast to int.

    Parameters
    ----------
    a : str
        ``'rf'`` for a random forest, ``'sgd'`` for an SGD classifier,
        ``'nb'`` for Gaussian naive Bayes. Any other value trains nothing.

    Notes
    -----
    Indentation is now spaces only. The original mixed tabs and spaces,
    which Python 3 rejects with ``TabError`` and which only nested
    correctly under Python 2's tab-equals-8-columns rule.
    """
    global data
    global week
    # Single definition of the feature columns used for both frames.
    feature_columns = [
        "id", "cpu", "creator", "dbs", "dtype", "era", "nblk", "nevt",
        "nfiles", "nlumis", "nrel", "nsites", "nusers", "parent", "primds",
        "proc_evts", "procds", "rnaccess", "rnusers", "rtotcpu", "size",
        "tier", "totcpu", "wct", "naccess",
    ]
    target = data['target']
    data = data[feature_columns]
    week['target'] = 0
    week['target'] = week.apply(convert, axis=1)
    week['target'] = week['target'].astype(int)
    # Keep a handle on the frame that still carries the true 'target' column.
    test1 = week
    week = week[feature_columns]
    if a == 'rf':
        # RANDOM FOREST CLASSIFIER
        rf = RandomForestClassifier(n_estimators=100)
        rf = rf.fit(data, target)
        predictions = rf.predict(week)
        cal_score("RANDOM FOREST", rf, predictions, test1['target'])
    if a == "sgd":
        # SGD CLASSIFIER
        clf = SGDClassifier(alpha=0.0001, class_weight=None, epsilon=0.1, eta0=0.0,
            fit_intercept=True, l1_ratio=0.15, learning_rate='optimal',
            loss='hinge', n_iter=5, n_jobs=1, penalty='l2', power_t=0.5,
            random_state=None, shuffle=True, verbose=0,
            warm_start=False)
        clf.fit(data, target)
        predictions = clf.predict(week)
        cal_score("SGD Regression", clf, predictions, test1['target'])
    if a == "nb":
        # GAUSSIAN NAIVE BAYES
        clf = GaussianNB()
        clf.fit(data, target)
        predictions = clf.predict(week)
        cal_score("NAIVE BAYES", clf, predictions, test1['target'])
Beispiel #31
0
    # get numeric columns
    if use_numeric:
        x_train, x_train_num = numeric_filter(x_train)
        x_test, x_test_num = numeric_filter(x_test)

    fitted_dfs,fitted_encoders,encoder_names = categ_encoder(
        x_train, y_train[target_col],cols=x_train.columns, encoders=('target','count')
    )    
    if use_numeric:
        fitted_dfs.append(x_train_num)

    train_feat = pd.concat(fitted_dfs,axis=1) 
    train_label = y_train[target_col]
    #modeling:
    logger.writelines('{}: start modeling...\n'.format(time.ctime()))
    rf = RandomForestClassifier(200,max_depth=5)
    rf.fit(train_feat,train_label)
    #validation:
    logger.writelines('{}: start evaluation...\n'.format(time.ctime()))
    test_feats = [en.transform(x_test) for en in  fitted_encoders]
    if use_numeric:
        test_feats.append(x_test_num)
    test_feat = pd.concat(test_feats,axis=1)
    test_label = y_test[target_col]
    train_res = eval_formater(evalution(rf,train_feat,train_label))
    test_res = eval_formater(evalution(rf,test_feat,test_label))
    logger.writelines('{}: evaluation done.\n'.format(time.ctime()))
    write_model_info(logger,rf,train_feat)
    logger.writelines('train:\n----\n')
    logger.writelines(train_res)
    logger.writelines('validation:\n----\n')
Beispiel #32
0
        pred_comb[(self.pred[1] == 1) & (self.pred[2] != 1) & (self.pred[3] != 1) & (self.pred[4] != 1)] = 1
        pred_comb[(self.pred[1] != 1) & (self.pred[2] == 1) & (self.pred[3] != 1) & (self.pred[4] != 1)] = 2
        pred_comb[(self.pred[1] != 1) & (self.pred[2] != 1) & (self.pred[3] == 1) & (self.pred[4] != 1)] = 3
        pred_comb[(self.pred[1] != 1) & (self.pred[2] != 1) & (self.pred[3] != 1) & (self.pred[4] == 1)] = 4

        #pred_comb[(pred[1] == 1) & (pred[2] == 1) & (pred[3] != 1) & (pred[4] != 1)] = 1
        pred_comb[(self.pred[1] == 1) & (self.pred[2] != 1) & (self.pred[3] == 1) & (self.pred[4] != 1)] = 1
        #pred_comb[(pred[1] == 1) & (pred[2] != 1) & (pred[3] != 1) & (pred[4] == 1)] = 1
        #pred_comb[(pred[1] != 1) & (pred[2] == 1) & (pred[3] == 1) & (pred[4] != 1)] = 1
        pred_comb[(self.pred[1] != 1) & (self.pred[2] == 1) & (self.pred[3] != 1) & (self.pred[4] == 1)] = 2
        #pred_comb[(pred[1] != 1) & (pred[2] != 1) & (pred[3] == 1) & (pred[4] == 1)] = 1

        return pred_comb

# Final classifier: four expert forests combined by `voting`.
# Each pair is (n_estimators, max_features) for one expert.
_expert_settings = [(27, 11), (24, 10), (25, 9), (22, 10)]
clf_list = [RandomForestClassifier(n_estimators=_n_trees, max_features=_n_feats)
            for _n_trees, _n_feats in _expert_settings]
clf = voting(clf_list, 0.5)

# Hold out 20% of the samples, fit the ensemble, then predict the held-out labels.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.2)
clf.fit(X_train, y_train)
pred_comb = clf.predict(X_test)

# Cross-validated score of the full classifier ...
score = cross_validation.cross_val_score(clf, X, y, cv=cv)
# ... and of each expert against its own label set y_all[i].
ind_scores = [cross_validation.cross_val_score(_expert, X, y_all[i], cv=cv)
              for i, _expert in enumerate(clf_list)]
		return res

# XGBoost params
xgb_params = {
    'objective': 'binary:logistic',
    'learning_rate': 0.04,
    'n_estimators': 490,
    'max_depth': 4,
    'subsample': 0.9,
    'colsample_bytree': 0.9,
    'min_child_weight': 10,
}

# RandomForest params
rf_params = {
    'n_estimators': 200,
    'max_depth': 6,
    'min_samples_split': 70,
    'min_samples_leaf': 30,
}

# Base learners and the logistic-regression stacker placed on top of them.
xgb_model = XGBClassifier(**xgb_params)
rf_model = RandomForestClassifier(**rf_params)
log_model = LogisticRegression()

stack = Ensemble(
    n_splits=3,
    stacker=log_model,
    base_models=(xgb_model, rf_model),
)

y_pred = stack.fit_predict(train, target_train, test)
# Fit the scaler on the training features only, then apply it to both splits.
scaler.fit(X_train)

x_train_scalednew = scaler.transform(X_train)
# NOTE(review): the labels are pushed through the feature scaler here (and
# for y_test below) -- confirm this is intended; the scaler was fit on X_train.
y_train_scalednew = scaler.transform(y_train)

# Report the shape of the transformed array (previously the whole array was
# printed under the "shape" label).
print("transsformed shape: %s" % (x_train_scalednew.shape, ))
print("per feature min before scaling: %s" % X_train.min(axis=0))
print("per feature max before scaling: %s" % X_train.max(axis=0))
print("per feature min after scaling: %s" % x_train_scalednew.min(axis=0))
print("per feature max after scaling: %s" % x_train_scalednew.max(axis=0))

x_test_scalednew = scaler.transform(X_test)
y_test_scalednew = scaler.transform(y_test)

print("per-feature min after scaling: %s" % x_test_scalednew.min(axis=0))
print("per-feature max after scaling: %s" % x_test_scalednew.max(axis=0))

from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=50)

# Train on the training split only. Fitting on the full (X, y) would put the
# test rows into the training data and inflate acc_test.
model.fit(X_train, y_train)

acc_train = model.score(X_train, y_train)
acc_test = model.score(X_test, y_test)

y_pred = model.predict(X_test)
from sklearn.metrics import classification_report, confusion_matrix

# Per-class precision/recall report and confusion matrix on the test split.
cl = classification_report(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
Beispiel #35
0
# Newsgroup categories of interest.
categories = [
    'rec.sport.hockey',
    'sci.med',
    'soc.religion.christian',
    'talk.religion.misc',
]

# Load the training and test corpora from their local directories.
newsgroups_train = load_files(
    'C:\\Users\\gaura\\Desktop\\Course Material\\Artificial Intelligence - 537\\Assignments\\HW3\\Selected 20NewsGroup\\Training',
    encoding='latin-1')
newsgroups_test = load_files(
    'C:\\Users\\gaura\\Desktop\\Course Material\\Artificial Intelligence - 537\\Assignments\\HW3\\Selected 20NewsGroup\\Test',
    encoding='latin-1')

# Candidate classifiers.
clf_nb = MultinomialNB(alpha=.01)
clf_lr = LogisticRegression()
clf_svc = LinearSVC()
clf_rf = RandomForestClassifier()

# Every classifier is evaluated on the same train/test texts and labels.
_split_args = (
    newsgroups_train.data,
    newsgroups_test.data,
    newsgroups_train.target,
    newsgroups_test.target,
)

i, NB_results = split_test_classifier(clf_nb, *_split_args)

i, LR_results = split_test_classifier(clf_lr, *_split_args)

i, SVM_results = split_test_classifier(clf_svc, *_split_args)
Beispiel #36
0
def randomForest(trainVec, trainScore):
    """Fit a ``RandomForestClassifier`` on ``trainVec``/``trainScore`` and return it."""
    # No maximum tree depth is set. The original comment said this is meant
    # to prevent overfitting -- NOTE(review): confirm that intent.
    forest = RandomForestClassifier(max_depth=None)
    forest.fit(trainVec, trainScore)
    return forest
Beispiel #37
0
def validate_fold(X_train, X_test, Y_train, Y_test, Q_vec, weights,
                  evaluator, retrieval_method, **kwargs):
    """Evaluate one retrieval method on a single train/test split.

    The selected method is fit on the training data, scores for every query
    in ``Q_vec`` against the test items are computed, and the result is
    handed to ``evaluator.eval``.

    Parameters
    ----------
    X_train : pd.DataFrame, shape = [n_train_samples, codebook_size]
        Training data.

    X_test : pd.DataFrame, shape = [n_test_samples, codebook_size]
        Test data.

    Y_train : pd.DataFrame, shape = [n_train_samples, n_classes]
        Training tags.

    Y_test : pd.DataFrame, shape = [n_test_samples, n_classes]
        Test tags.

    Q_vec : array-like, shape = [n_queries, n_classes]
        The queries to evaluate.

    weights : array-like, shape = [n_queries]
        Query weights. Multi-word queries can be weighted to reflect
        importance to users.

    evaluator : object
        An instance of :class:`cbar.evaluation.Evaluator`.

    retrieval_method : str, 'loreta', 'pamir', or 'random-forest'
        The retrieval method to be evaluated.

    kwargs : key-value pairs
        Additional keyword arguments are passed to the retrieval method.

    Returns
    -------
    params : dict
        The ``retrieval_method``'s parameters used for the evaluation.

    Raises
    ------
    ValueError
        If ``retrieval_method`` is neither in ``LETOR`` nor 'random-forest'.
    """
    if retrieval_method in LETOR:
        # Learning-to-rank methods: look up the class by name and fit it.
        letor_classes = {'pamir': PAMIR, 'loreta': LoretaWARP}
        letor_class = letor_classes.get(retrieval_method)
        ranker = letor_class(**kwargs)
        ranker.fit(X_train, Y_train, Q_vec, X_test, Y_test)
        Y_score = ranker.predict(Q_vec, X_test)
        params = ranker.get_params()
    elif retrieval_method == 'random-forest':
        # One balanced forest per tag; query scores are the dot product of
        # the queries with the standardized per-tag probabilities.
        forest = RandomForestClassifier(class_weight='balanced', **kwargs)
        one_vs_rest = OneVsRestClassifier(forest, n_jobs=-1)
        one_vs_rest.fit(X_train, Y_train)
        tag_probabilities = one_vs_rest.predict_proba(X_test)
        model_score = standardize(tag_probabilities)
        Y_score = Q_vec.dot(model_score.T)
        params = one_vs_rest.estimator.get_params()
    else:
        raise ValueError('Unknown retrieval method.')

    # Row sums of the relevance matrix built from the queries and training tags.
    relevance = make_relevance_matrix(Q_vec, Y_train)
    n_relevant = relevance.sum(axis=1)
    evaluator.eval(Q_vec, weights, Y_score, Y_test, n_relevant)

    return params
# Target: column index 4 of the dataset.
y = dataset.iloc[:, 4].values

# Splitting the dataset into the Training set and Test set
# `sklearn.cross_validation` was removed from scikit-learn; prefer
# `model_selection` and fall back only for old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

# Feature Scaling
# Not needed; though, included due to the scaled plotting later
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# Fit the scaler on the training split only, then reuse it for the test split.
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Fitting Random Forest classifier to the Training set
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Type in 'cm' in console to print confusion matrix
# ([63, 5]  63+29 = 92 (Correct Predictions)
#  [3, 29]) 3+5 = 8 (Incorrect Predictions)

# Visualising the Training set results
from matplotlib.colors import ListedColormap
    'alpha':[0.0001,0.001,0.01]
}

best_acc_params_mlp=HyperParamsResultsPlot(BestParams_GridSearchCV(mlp,X_train,y_train,hyper_params_mlp,folds),"Multilayer Perceptron")
'''
"""#### **Best Models :**

##### **Best Random Forest Model :**
"""

print('Running Best Candidate Models after Hyperparameter Tunning')

# best rf model
_best_rf_params = {
    'max_depth': 12,
    'max_features': 15,
    'min_samples_leaf': 1,
    'min_samples_split': 5,
    'n_estimators': 200,
}
best_rf = RandomForestClassifier(random_state=random_state, **_best_rf_params)
#rf_test_pred=main_results(best_rf,X_train,X_val,X_test,y_train,y_val)
#submission(rf_test_pred)
# ##### **Best XGBoost Model :**

# best xgboost model
# NOTE(review): class_weight, max_features and min_samples_split look like
# RandomForest-style names -- confirm XGBClassifier actually uses them.
_best_xg_params = {
    'class_weight': 'balanced',
    'max_depth': 8,
    'max_features': 10,
    'min_child_weight': 1,
    'min_samples_split': 5,
    'n_estimators': 300,
}
best_xg = XGBClassifier(random_state=random_state, **_best_xg_params)
#xg_test_pred=main_results(best_xg,X_train,X_val,X_test,y_train,y_val)
Beispiel #40
0
class RandomForest:
    """Random-forest wrapper with a hyperparameter search space.

    Hyperparameters are stored exactly as passed to the constructor. They
    are converted to their final Python types on the first ``fit`` call,
    when the underlying scikit-learn ``RandomForestClassifier`` is created.
    """

    def __init__(self, criterion, max_features,
                 max_depth, min_samples_split, min_samples_leaf,
                 min_weight_fraction_leaf, bootstrap, max_leaf_nodes,
                 min_impurity_decrease, random_state=None, n_jobs=1,
                 class_weight=None, **kwargs):
        """Store the hyperparameters; extra ``**kwargs`` are accepted and ignored."""
        # The number of trees is not configurable: it comes from get_max_iter().
        self.n_estimators = self.get_max_iter()
        self.criterion = criterion
        self.max_features = max_features
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_weight_fraction_leaf = min_weight_fraction_leaf
        self.bootstrap = bootstrap
        self.max_leaf_nodes = max_leaf_nodes
        self.min_impurity_decrease = min_impurity_decrease
        self.random_state = random_state
        self.n_jobs = n_jobs
        self.class_weight = class_weight
        # Created lazily by fit().
        self.estimator = None

    @staticmethod
    def get_max_iter():
        """Return the number of trees used for the forest (100)."""
        return 100

    def get_current_iter(self):
        """Return ``n_estimators`` of the underlying estimator."""
        return self.estimator.n_estimators

    def fit(self, X, y, sample_weight=None):
        """Fit the forest on ``X``/``y`` and return ``self``.

        On the first call the stored hyperparameters are cast to their
        final types and the scikit-learn estimator is built with
        ``warm_start=True``; later calls fit the existing estimator again.
        """
        from sklearn.ensemble import RandomForestClassifier

        if self.estimator is None:
            self.n_estimators = int(self.n_estimators)
            self.max_depth = (
                None if check_none(self.max_depth) else int(self.max_depth)
            )

            self.min_samples_split = int(self.min_samples_split)
            self.min_samples_leaf = int(self.min_samples_leaf)
            self.min_weight_fraction_leaf = float(self.min_weight_fraction_leaf)

            if self.max_features in ("sqrt", "log2", "auto"):
                # Named strategies are forwarded unchanged.
                max_features = self.max_features
            else:
                # Numeric values act as an exponent on the feature count.
                max_features = int(X.shape[1] ** float(self.max_features))

            self.bootstrap = check_for_bool(self.bootstrap)

            self.max_leaf_nodes = (
                None if check_none(self.max_leaf_nodes)
                else int(self.max_leaf_nodes)
            )

            self.min_impurity_decrease = float(self.min_impurity_decrease)

            # initial fit of only increment trees
            forest_kwargs = dict(
                n_estimators=self.get_max_iter(),
                criterion=self.criterion,
                max_features=max_features,
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                min_weight_fraction_leaf=self.min_weight_fraction_leaf,
                bootstrap=self.bootstrap,
                max_leaf_nodes=self.max_leaf_nodes,
                min_impurity_decrease=self.min_impurity_decrease,
                random_state=self.random_state,
                n_jobs=self.n_jobs,
                class_weight=self.class_weight,
                warm_start=True,
            )
            self.estimator = RandomForestClassifier(**forest_kwargs)

        self.estimator.fit(X, y, sample_weight=sample_weight)
        return self

    def predict(self, X):
        """Predict with the fitted estimator.

        Raises ``NotImplementedError`` when ``fit`` has not created the
        estimator yet.
        """
        if self.estimator is not None:
            return self.estimator.predict(X)
        raise NotImplementedError()

    @staticmethod
    def get_cs():
        """Build and return the ``ConfigurationSpace`` for this model."""
        # The maximum number of features used in the forest is calculated as m^max_features, where
        # m is the total number of features, and max_features is the hyperparameter specified below.
        # The default is 0.5, which yields sqrt(m) features as max_features in the estimator. This
        # corresponds with Geurts' heuristic.
        hyperparameters = [
            CategoricalHyperparameter(
                "criterion", ["gini", "entropy"], default_value="gini"),
            UniformFloatHyperparameter(
                "max_features", 0., 1., default_value=0.5),
            UnParametrizedHyperparameter("max_depth", "None"),
            UniformIntegerHyperparameter(
                "min_samples_split", 2, 20, default_value=2),
            UniformIntegerHyperparameter(
                "min_samples_leaf", 1, 20, default_value=1),
            UnParametrizedHyperparameter("min_weight_fraction_leaf", 0.),
            UnParametrizedHyperparameter("max_leaf_nodes", "None"),
            CategoricalHyperparameter(
                "bootstrap", ["True", "False"], default_value="True"),
            UnParametrizedHyperparameter('min_impurity_decrease', 0.0),
        ]
        cs = ConfigurationSpace()
        cs.add_hyperparameters(hyperparameters)
        return cs
Beispiel #41
0
              'min_samples_leaf': [10, 15, 20, 30],
              'criterion': ['gini', 'entropy'],
              'max_depth': [10, 15, 20],
              'min_samples_split': [10, 20],
              'n_jobs': [-1],
              'verbose': [1]}

start_time = time.time()

# Scoring metrics to tune for, one search per metric.
scores = ['accuracy', 'f1_macro', 'roc_auc_ovo']

for score in scores:
    print("Tuning for {}".format(score))
    print("----------------------------------")

    # Successive-halving random search over param_grid for this metric.
    rf_new = ms.HalvingRandomSearchCV(RandomForestClassifier(), param_grid, scoring=score)
    rf_new.fit(X_train, Y_train)

    print("Best parameters set found is:")
    print(rf_new.best_params_)

    print("Grid scores on training set:")

    cv_results = rf_new.cv_results_
    means = cv_results['mean_test_score']
    stds = rf_new.cv_results_['std_test_score']

    print("Average scores are ", means)
    print("SD for the scores are ", stds)

    print("Detailed classification report:")
    y_true = Y_test
    y_pred = rf_new.predict(X_test)
Beispiel #42
0
 def __init__(self):
     """Create the classifier pipeline: imputation followed by a random forest."""
     steps = [
         # Fill missing values with the most frequent value.
         ('imputer', SimpleImputer(strategy='most_frequent')),
         ('rf', RandomForestClassifier(max_depth=5, n_estimators=10)),
     ]
     self.clf = Pipeline(steps)
Beispiel #43
0
    with open(malicious_ips_path17, 'r') as fp:
        malicious_ips_ls17 = json.load(fp)['malicious_ips']
    with open(malicious_ips_path18, 'r') as fp:
        malicious_ips_ls18 = json.load(fp)['malicious_ips']

    featurename_csv_path = './data/ls17_rfe_features_sorted_by_importance.txt'
    host_split_path = './data/host_train_test_IP_splits_2905.pickle'

    with open(host_split_path, 'rb') as fp:
        host_splits = pickle.load(fp)

    ##################
    ### ESTIMATORS ###
    ##################

    rf = RandomForestClassifier(n_jobs=6, random_state=0)
    rf_tuned = RandomForestClassifier(n_jobs=6,
                                      random_state=0,
                                      n_estimators=128,
                                      max_depth=10)
    svm = svm.LinearSVC(loss='hinge', random_state=0)
    knn = neighbors.KNeighborsClassifier(n_neighbors=5,
                                         weights='uniform',
                                         n_jobs=4),

    all_classifiers = {
        'RandomForestClassifier(n_jobs=6, random_state=0)':
        RandomForestClassifier(n_jobs=6, random_state=0),
        'GaussianNB()':
        GaussianNB(),
        'LogisticRegression(max_iter=1000, penalty=\'l2\', random_state=0, solver=\'sag\')':
        predicted: List of predicted result from model
    """
    cm=confusion_matrix(target,predicted)
    print ("Confusion Matrix : \n",cm)
    accuracy=accuracy_score(target,predicted)
    print ('Accuracy: {:.2f}'.format(accuracy))
    sensitivity=cm[0,0]/float(cm[0,0]+cm[0,1])
    print ('Sensitivity: {:.2f}'.format(sensitivity))
    specificity=cm[1,1]/float(cm[1,0]+cm[1,1])
    print ('Specificity: {:.2f}'.format(specificity))
    fpr,tpr,thresholds=roc_curve(target,predicted)
    Auc_value=auc(fpr,tpr)
    print ('Area Under Curve: {:.2f}'.format(Auc_value))
    
# [Model1: Only RandomForest]
rfr = RandomForestClassifier(n_estimators=500, random_state=1)
rfr.fit(X_train, y_train)  # fit the model with training data
# Cut-off derived from the training data.
threshold_rf = FindOptimalCutOff(y_train, X_train, rfr)
print ("Threshold: ", threshold_rf)
y_predict = rfr.predict(X_test)  # predict the testing data

# Record the test probabilities and the thresholded classification.
rf_o = pd.DataFrame()
_test_probabilities = rfr.predict_proba(X_test)[:, 1]
rf_o['y_predict_prob'] = _test_probabilities
rf_o['y_predict'] = rf_o['y_predict_prob'].map(
    lambda probability: 1 if probability > threshold_rf else 0)
getConfusionMatrix(y_test, rf_o['y_predict'])  # report the confusion-matrix metrics

# [Model2: PCA+RandomForest]
# Principal Components [PCA]
pca = PCA()
pca.fit(X_train)  # find the principal components
# Share of variance explained by each component.
ex_var_ratio = pca.explained_variance_ratio_
# Cumulative explained variance, in percent.
_rounded_percent = np.round(ex_var_ratio, decimals=4) * 100
ex_var_ratio_cum = np.cumsum(_rounded_percent)
Beispiel #45
0
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.externals import joblib

# All candidate models as (short name, estimator) pairs.
models = [
    ('LR', LogisticRegression(random_state=9)),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier(random_state=9)),
    ('RF', RandomForestClassifier(n_estimators=num_trees, random_state=9)),
    ('NB', GaussianNB()),
    ('SVM', SVC(random_state=9)),
]

# Containers for the results and names, plus the scoring metric.
results, names = [], []
scoring = "accuracy"

# Open the stored feature vectors and trained labels.
h5f_data = h5py.File('output/data.h5', 'r')
h5f_label = h5py.File('output/labels.h5', 'r')

_dataset_key = 'dataset_1'
global_features_string = h5f_data[_dataset_key]
global_labels_string = h5f_label[_dataset_key]
Beispiel #46
0
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=123456)

print("------ Stacking...")

# Base learners for the stack, each built separately for readability.
_lgbm = lgb.LGBMClassifier(
    objective='regression_l1',
    n_jobs=-1,
    n_estimators=1000,
    num_leaves=80,
    scale_pos_weight=0.05,
    verbose=2,
)
_forest = RandomForestClassifier(
    random_state=123456,
    n_jobs=-1,
    max_depth=30,
    n_estimators=400,
    verbose=2,
)
_xgboost = xgb.XGBClassifier(
    predictor='cpu_predictor',
    n_gpus=0,
    n_jobs=-1,
    n_estimators=700,
    eta=0.1,
    max_depth=10,
    verbose=2,
)

estimators = [('lgbm', _lgbm), ('rf', _forest), ('xgboost', _xgboost)]

# Logistic regression combines the base learners' outputs.
stacking = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),
    cv=5,
    verbose=2,
)
Beispiel #47
0
heart = pandas.read_csv("pc.csv")
print(heart.describe())

# Collapse the values 2, 3 and 4 of "heartpred" into 1.
for _level in (2, 3, 4):
    heart.loc[heart["heartpred"] == _level, "heartpred"] = 1

# Fill missing values of these columns with the column median.
for _column in ("slope", "thal", "ca"):
    heart[_column] = heart[_column].fillna(heart[_column].median())
print(heart.describe())

predictors = [
    "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg", "thalach",
    "exang", "oldpeak", "slope", "ca", "thal",
]
alg = RandomForestClassifier(
    n_estimators=75,
    min_samples_split=20,
    min_samples_leaf=1,
)
kf = KFold(heart.shape[0], n_folds=10, random_state=1)
predictions = []
for train, test in kf:
    # Features and target restricted to the rows of the training folds.
    train_predictors = heart[predictors].iloc[train, :]
    train_target = heart["heartpred"].iloc[train]
    # Train on the training folds ...
    alg.fit(train_predictors, train_target)
    # ... and predict the held-out fold.
    test_predictors = heart[predictors].iloc[test, :]
    test_predictions = alg.predict(test_predictors)
    predictions.append(test_predictions)
from sklearn.model_selection import cross_val_score

# Flatten the label containers with ravel() before they are used for
# fitting and scoring (presumably numpy arrays -- TODO confirm).
train_y=train_y.ravel()
train_Y=train_Y.ravel()
test_y=test_y.ravel()


def evaluate_model(model):
    """Fit ``model`` and print its train, test and cross-validation scores.

    Uses module-level data: ``train_x``/``train_y`` for fitting and the
    train score, ``test_x``/``test_y`` for the test score, and
    ``train_X``/``train_Y`` for 5-fold cross-validation. Prints the two
    scores, the per-fold scores, then their mean and standard deviation.
    """
    model.fit(train_x, train_y)
    for features, labels in ((train_x, train_y), (test_x, test_y)):
        print(model.score(features, labels))
    fold_scores = cross_val_score(model, train_X, train_Y, cv=5)
    print(fold_scores)
    print(np.mean(fold_scores), np.std(fold_scores))

# Evaluate a 50-tree random forest; the scores it printed are kept in the
# string literal that follows.
rfc = RandomForestClassifier(n_estimators=50)
evaluate_model(rfc)

'''
1.0
0.8134328358208955
[0.77653631 0.81564246 0.84269663 0.79775281 0.84180791]
0.817115441698256      
'''


# Evaluate an L2-penalised logistic regression (C=2, tol=1e-8) the same way.
lr = LogisticRegression(C=2, penalty='l2', tol=1e-8)
evaluate_model(lr)

'''
0.8491171749598716
Beispiel #49
0
print("For Decision Trees With One Variable")
classification_model(model, traindf, predictor_var, outcome_var)
print("")
print("")

# The accuracy of the prediction is much much better now.
#
# Using a single predictor gives a 97% prediction accuracy for this model but
# the cross-validation score is not that great.

# Random Forest

predictor_var = features_mean
_forest_settings = dict(
    n_estimators=100,
    min_samples_split=25,
    max_depth=7,
    max_features=2,
)
model = RandomForestClassifier(**_forest_settings)
print("For Random Forest")
classification_model(model, traindf, predictor_var, outcome_var)
print("")
print("")

# Using all the features improves the prediction accuracy and the
# cross-validation score is great.
#
# An advantage with Random Forest is that it returns a feature importance
# matrix which can be used to select features.

# Selecting Top features 
# Hold out 20% of the samples for testing (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=0)


# Fitting Naive Bayes to the Training set (disabled alternative)
# from sklearn.naive_bayes import GaussianNB
# classifier = GaussianNB()
# classifier.fit(X_train, y_train)

# Fitting Decision Tree to the Training set (disabled alternative)
# from sklearn.tree import DecisionTreeClassifier
# classifier = DecisionTreeClassifier(criterion = "entropy", random_state = 0)
# classifier.fit(X_train, y_train)

# Fitting Random Forest classifier to the Training set
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(
    n_estimators=10,
    criterion="entropy",
    random_state=0,
)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)


#### Accuracy = (TP + TN) / (TP + TN + FP + FN)
#### Precision = TP / (TP + FP)
#### Recall = TP / (TP + FN)
#### F1 Score = 2 * Precision * Recall / (Precision + Recall)
Beispiel #51
0
# One tab-separated row of six metrics, each with three decimals.
_ROW_FORMAT = '\t'.join(['{:.3f}'] * 6)

# Training and validation metrics for the classifier fitted before this point.
_train_scores = precision_recall_fscore(train_truth, train_predicted)
precision_train, recall_train, f1_score_train = _train_scores

val_predicted = clf.predict(val_data)
_val_scores = precision_recall_fscore(val_truth, val_predicted)
precision_val, recall_val, f1_score_val = _val_scores

# print('Training set - precision:{:.3f}, recall:{:.3f}, f1_score:{:.3f}'.format(precision_train, recall_train, f1_score_train))
# print('Validation set - precision:{:.3f}, recall:{:.3f}, f1_score:{:.3f}'.format(precision_val, recall_val, f1_score_val))

print(_ROW_FORMAT.format(
    precision_train, recall_train, f1_score_train,
    precision_val, recall_val, f1_score_val))

print('############## RANDOM FOREST ##############')
clf = RandomForestClassifier(n_estimators=20, min_samples_leaf=10)
clf.fit(train_data, train_truth)

train_predicted = clf.predict(train_data)
_train_scores = precision_recall_fscore(train_truth, train_predicted)
precision_train, recall_train, f1_score_train = _train_scores

val_predicted = clf.predict(val_data)
_val_scores = precision_recall_fscore(val_truth, val_predicted)
precision_val, recall_val, f1_score_val = _val_scores

# print('Training set - precision:{:.3f}, recall:{:.3f}, f1_score:{:.3f}'.format(precision_train, recall_train, f1_score_train))
# print('Validation set - precision:{:.3f}, recall:{:.3f}, f1_score:{:.3f}'.format(precision_val, recall_val, f1_score_val))

print('{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}'.format(
    precision_train, recall_train, f1_score_train, precision_val, recall_val,
Beispiel #52
0

data = np.asarray(data_df)
label = np.asarray(label_df).flatten('F')  # change to 1D vector (column-major order)

# Load the stored scaler, refit it on this data, and scale the features.
scaler = joblib.load('scaler.joblib')
scaler.fit(data)
data = scaler.transform(data)


x_train, x_test, y_train, y_test = train_test_split(data, label, test_size=0.2, random_state=4)


# Candidate models: one-vs-rest linear SVM, MLP and random forest.
mlp = MLPClassifier(random_state=4)
rfc = RandomForestClassifier(random_state=4)
svc = LinearSVC()
ovr = OneVsRestClassifier(svc)
models = [ovr, mlp, rfc]
# NOTE(review): recent scikit-learn versions may reject random_state when
# shuffle is left at False -- confirm against the installed version.
kf = StratifiedKFold(n_splits=5, random_state = 4)


y_pred = []
for model in models:
    # Train on the training split only. Fitting on the full data would put
    # the x_test rows into the training data and inflate the accuracy below.
    model.fit(x_train, y_train)
    y = model.predict(x_test)
    y_pred.append(y)
    print(accuracy_score(y_test, y))
from sklearn.model_selection import train_test_split
# Hold out 10% of the samples for testing.
xtra, xtes, ytra, ytes = train_test_split(
    wajah['data'], wajah['target'], test_size=.1)
# print(len(xtra))
# print(len(xtes))
# print(xtra[0])
# print(ytra[0])

# ===============================
# random forest

from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=40)

# train
model.fit(xtra, ytra)

# accuracy on the held-out samples
_test_accuracy = model.score(xtes, ytes)
print(_test_accuracy)

# predict the first held-out sample and show it next to its true label
_sample, _sample_label = xtes[0], ytes[0]
print(_sample)
print(model.predict([_sample]))
print(_sample_label)

# ===============================
# plot
Beispiel #54
0
def train_classifier(p, loop_num):
    """
    Train a random-forest model on the filter responses of the training images.

    Every training file contributes one row per flattened image element
    (pixel) and one column per filter returned by `generate_all_filters`.
    Only elements whose relevance value equals 255 are kept for training.

    parameters:
        p: parameter object. The attributes read here are `training_files`,
           `training_files_timestamp`, `labels_folder`,
           `relevance_masks_folder`, `is_relevance_mask`, `ims_folder`,
           `bigger_dim_output_size`, `z_size`, `channel_num`,
           `is_regression` and `min_f1_score`.
        loop_num: index into `p.bigger_dim_output_size` selecting the output
           size used for this iteration.

    returns:
        (clf, stop_loop_bool): the model fitted on all kept elements, and
        whether the mean cross-validated F1 score exceeds `p.min_f1_score`.
        In regression mode no F1 score is computed, so the flag is False.

        NOTE(review): the early exit for an oversized output request returns
        the 3-tuple `0, 0, 0`, which does not match the 2-tuple above. It is
        left unchanged because the callers are not visible here.

    raises:
        Exception: if the requested output size is bigger than the image on
            the first iteration, or if label/relevance sizes do not match
            the image size.
        ValueError: if `p.training_files` is empty.
    """
    logging.info('Preparing  metrics to classifier')

    output_size = p.bigger_dim_output_size[loop_num]

    # Load label images (and relevance masks, when the user supplied them):
    classes_ims = []
    relevance_ims = []
    for t in p.training_files:
        classes_ims.append(load_and_resize_image(
            os.path.join(p.labels_folder, t), output_size, False, True)[0])
        if p.is_relevance_mask:
            relevance_ims.append(load_and_resize_image(
                os.path.join(p.relevance_masks_folder, t), output_size, False, True)[0])

    # Without masks every element is relevant (255 is the "relevant" value).
    if not p.is_relevance_mask:
        relevance_ims = [np.ones(c.shape)*255 for c in classes_ims]

    # Per-file pieces are collected in lists and joined once after the loop,
    # instead of re-copying the growing arrays on every iteration.
    feature_blocks = []    # filter responses, one column per filter
    class_blocks = []      # flattened label images
    relevance_blocks = []  # flattened relevance masks

    # Stack filters file and label images:
    for i, t in enumerate(p.training_files):
        temp_i = (generate_all_filters(p.ims_folder, output_size, p.z_size, t,
                                       p.training_files_timestamp[i], p.channel_num))[0]
        # If requested output size bigger than image:
        if not temp_i:
            if loop_num != 0:
                logging.warning(
                    'Requested output size is bigger than original image. '
                    'saving/using classifier from previous iteration although min_f1_score is not met.')
                return 0, 0, 0
            else:
                raise Exception('Requested output size is bigger than original image. Please decrese bigger_dim_output_size in user_params.')

        feature_blocks.append(np.vstack([f.flatten() for f in temp_i]).T)
        class_blocks.append(classes_ims[i].flatten())
        relevance_blocks.append(relevance_ims[i].flatten())

    if not feature_blocks:
        raise ValueError('no training files were given')

    X = np.vstack(feature_blocks)
    c = np.concatenate(class_blocks)
    r = np.concatenate(relevance_blocks)

    if c.shape[0] != X.shape[0] or r.shape[0] != X.shape[0]:
        raise Exception('label images size must be equal to original images size')

    # Keep only the elements marked as relevant.
    relevant = r == 255
    X = X[relevant]
    y = c[relevant]

    skf = StratifiedKFold(n_splits=3)

    classification_reports = list()
    f1_scores = list()

    # Classification: estimate per-class F1 with 3-fold cross-validation
    # before fitting the final model. Regression skips this step.
    if not p.is_regression:
        for train_ix, test_ix in skf.split(X, y):  # for each of K folds
            # define training and test sets
            X_train, X_test = X[train_ix, :], X[test_ix, :]
            y_train, y_test = y[train_ix], y[test_ix]

            # Train classifier
            fold_clf = RandomForestClassifier()  # (n_jobs=2) not sure if works on cluster..
            fold_clf.fit(X_train, y_train)

            # Predict test set labels
            y_hat = fold_clf.predict(X_test)
            classification_reports.append(classification_report(y_test, y_hat))
            f1_scores.append(f1_score(y_test, y_hat, average=None))

        # One report per fold, separated by a newline.
        print(*classification_reports, sep='\n', flush=True)

        # Final classifier, trained below on all kept elements.
        clf = RandomForestClassifier()  # (n_jobs=2) not sure if works on cluster..

    else:
        clf = RandomForestRegressor()

    clf.fit(X, y)

    # Mean over every per-class score of every fold. Concatenating first also
    # copes with folds that report a different number of classes. With no
    # scores at all (regression mode) the loop is never asked to stop.
    if f1_scores:
        stop_loop_bool = bool(np.mean(np.concatenate(f1_scores)) > p.min_f1_score)
    else:
        stop_loop_bool = False

    return clf, stop_loop_bool
Beispiel #55
0
    # NOTE(review): this is the interior of a larger function; `t0`,
    # `f_extractor`, `training_path` and `test_path` are set above this excerpt.

    # Collect training features and labels
    training_features, training_labels = f_extractor.extract_training(training_path)

    # Report the time elapsed since the `t0` taken before this excerpt.
    t1 = time.time()
    print('Done in %.3fs\n' % (t1 - t0))

    # Train a random forest classifier
    # ********************************
    #

    print('Training Random Forest')
    t0 = time.time()  # restart the timer for the training phase

    # Create and train a random forest with scikit-learn
    # (default constructor arguments)
    clf = RandomForestClassifier()
    clf.fit(training_features, training_labels)

    # Report how long training took.
    t1 = time.time()
    print('Done in %.3fs\n' % (t1 - t0))

    # Test
    # ****
    #

    print('Compute testing features')
    t0 = time.time()  # restart the timer for test feature extraction

    # Collect test features
    test_features = f_extractor.extract_test(test_path)
Beispiel #56
0
            clf=lambda: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                            decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
                            max_iter=-1, probability=False, random_state=None, shrinking=True,
                            tol=0.001, verbose=False),
            normalized=True
            ),
    ClfConf(id="knn",
            clf=lambda: KNeighborsClassifier(n_neighbors=3),
            normalized=False
            ),
    ClfConf(id="nm_g",
            clf=lambda: GaussianNB(),
            normalized=False
            ),
    ClfConf(id="rf",
            clf=lambda: RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0),
            normalized=False
            ),
    ClfConf(id="xgb",
            clf=lambda: xgb.XGBClassifier(),
            normalized=False
            ),
    ClfConf(id="gb",
            clf=lambda: GradientBoostingClassifier(
                loss='deviance', learning_rate=0.1, n_estimators=100, subsample=1.0, criterion='friedman_mse',
                min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3,
                min_impurity_decrease=0.0, min_impurity_split=None, init=None, random_state=None, max_features=None,
                verbose=0, max_leaf_nodes=None),
            normalized=False
            ),
]
Beispiel #57
0
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from helpers import load_data, nn_layers, nn_reg, nn_iter, cluster_acc, myGMM, clusters, dims, dims_big, run_clustering, pairwiseDistCorr, reconstructionError, ImportanceSelect
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

# Prefix of every CSV written below.
out = './results/random_forest/'

perm_x, perm_y, housing_x, housing_y = load_data()  # perm, housing
# Deliberate guard: nothing below runs until this line is deleted.
raise Exception('Remove this line to run code')

#2
# Feature importances from a class-balanced random forest, one fit per data set.

rfc = RandomForestClassifier(n_estimators=100,
                             class_weight='balanced',
                             random_state=5,
                             n_jobs=7)
# The same estimator object is fitted twice; each importance vector is read
# immediately after its own fit, before the next fit replaces the model.
fs_perm = rfc.fit(perm_x, perm_y).feature_importances_
fs_housing = rfc.fit(housing_x, housing_y).feature_importances_

# Write the importances sorted from largest to smallest.
tmp = pd.Series(np.sort(fs_perm)[::-1])
tmp.to_csv(out + 'perm scree.csv')

tmp = pd.Series(np.sort(fs_housing)[::-1])
tmp.to_csv(out + 'housing scree.csv')

#4
# `rfc` was last fitted on the housing data when it is handed over here.
# NOTE(review): `ImportanceSelect` comes from `helpers`; whether it refits the
# estimator is not visible in this file.
filtr = ImportanceSelect(rfc)
grid = {
    'filter__n': dims,
    'NN__alpha': nn_reg,
def get_top_n_features(titanic_train_data_X, titanic_train_data_Y,
                       top_n_features):
    """Rank features with five tree-based models and pool their top picks.

    A random forest, AdaBoost, extra-trees, gradient-boosting and decision-tree
    classifier are each tuned with a 10-fold grid search (25 parallel jobs).
    For every model the best estimator's feature importances are sorted in
    descending order and the first `top_n_features` names are kept. Progress
    and scores are printed along the way.

    Returns:
        (features_top_n, features_importance): the de-duplicated union of the
        five top-N name lists, and the five full importance tables
        concatenated with a fresh index.
    """

    def _tune_and_rank(tag, estimator, param_grid, sample_header):
        # Grid-search one estimator, print its scores, and return both its
        # top-N feature names and its full sorted importance table.
        search = model_selection.GridSearchCV(estimator,
                                              param_grid,
                                              n_jobs=25,
                                              cv=10,
                                              verbose=1)
        search.fit(titanic_train_data_X, titanic_train_data_Y)

        print('Top N Features Best ' + tag + ' Params:' +
              str(search.best_params_))
        print('Top N Features Best ' + tag + ' Score:' +
              str(search.best_score_))
        train_score = search.score(titanic_train_data_X, titanic_train_data_Y)
        print('Top N Features ' + tag + ' Train Score:' + str(train_score))

        ranking = pd.DataFrame({
            'feature': list(titanic_train_data_X),
            'importance': search.best_estimator_.feature_importances_,
        }).sort_values('importance', ascending=False)
        leaders = ranking.head(top_n_features)['feature']

        print(sample_header)
        print(str(leaders[:10]))
        return leaders, ranking

    per_model = []

    # random forest
    per_model.append(_tune_and_rank(
        'RF',
        RandomForestClassifier(random_state=0),
        {
            'n_estimators': [500],
            'min_samples_split': [2, 3],
            'max_depth': [20],
        },
        'Sample 10 Features from RF Classifier'))

    # AdaBoost
    per_model.append(_tune_and_rank(
        'Ada',
        AdaBoostClassifier(random_state=0),
        {'n_estimators': [500], 'learning_rate': [0.01, 0.1]},
        'Sample 10 Feature from Ada Classifier:'))

    # ExtraTree
    per_model.append(_tune_and_rank(
        'ET',
        ExtraTreesClassifier(random_state=0),
        {
            'n_estimators': [500],
            'min_samples_split': [3, 4],
            'max_depth': [20],
        },
        'Sample 10 Features from ET Classifier:'))

    # GradientBoosting
    per_model.append(_tune_and_rank(
        'GB',
        GradientBoostingClassifier(random_state=0),
        {
            'n_estimators': [500],
            'learning_rate': [0.01, 0.1],
            'max_depth': [20],
        },
        'Sample 10 Feature from GB Classifier:'))

    # DecisionTree
    per_model.append(_tune_and_rank(
        'DT',
        DecisionTreeClassifier(random_state=0),
        {'min_samples_split': [2, 4], 'max_depth': [20]},
        'Sample 10 Features from DT Classifier:'))

    # merge the results of all five models
    top_lists = [leaders for leaders, _ in per_model]
    rankings = [ranking for _, ranking in per_model]

    features_top_n = pd.concat(top_lists, ignore_index=True).drop_duplicates()
    features_importance = pd.concat(rankings, ignore_index=True)

    return features_top_n, features_importance



# AdaBoost with default settings. The two bare accuracy expressions only
# display a value in an interactive session; their results are not stored.
classifier_ADA = AdaBoostClassifier()
classifier_ADA.fit(X_train1, Y_train1)
pred_ADA_train = classifier_ADA.predict(X_train1)
(pred_ADA_train == Y_train1).mean()
pred_ADA_test = classifier_ADA.predict(X_test1)
(pred_ADA_test == Y_test1).mean()
# Recorded accuracy: train 91.18, test 90.27
# (the original note labelled the test figure "LR")


# Random forest with default settings, evaluated the same way.
classifier_RF = RandomForestClassifier()
classifier_RF.fit(X_train1, Y_train1)
pred_RF_train = classifier_RF.predict(X_train1)
(pred_RF_train == Y_train1).mean()
pred_RF_test = classifier_RF.predict(X_test1)
(pred_RF_test == Y_test1).mean()
# Recorded accuracy: train 100, test 86.60


# Split the reviews by sentiment label and show the size of each part.
sentiment = Reviews["Sentiment"]
positive = Reviews[sentiment == "positive"]
negative = Reviews[sentiment == "negative"]

print(positive.shape, negative.shape)
	def randomForest(self, predictors=predictors, target=target):
		"""Score a 20-tree random forest on the cleaned data.

		The cleaned data from `self.dataClean()` is passed, together with
		`predictors`, `target` and the forest, to `cV.kFold().analyze`,
		whose result is returned unchanged.
		"""
		forest = RandomForestClassifier(
			n_estimators=20,
			min_samples_split=2,
			min_samples_leaf=1,
			random_state=1,
		)
		cleaned = self.dataClean()
		folds = cV.kFold()
		return folds.analyze(cleaned, predictors, target, forest)