Exemple #1
0
def rf_fit():
    """Fit a random forest on the prepared data, print its log-loss and dump it.

    The four arrays come from ``prepare_input()``. The fitted model is written
    with ``joblib.dump`` to ``rf_filename`` before being returned.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    train_inp, valid_inp, train_target, valid_target = prepare_input()

    forest = RandomForestClassifier(
        n_estimators=100,
        min_samples_split=5,
        random_state=31,
        n_jobs=-1,
        verbose=1,
    )

    # Only the call to fit() is timed.
    started_at = time.time()
    forest.fit(train_inp, train_target)
    elapsed = time.time() - started_at
    print("fitting took {:0.4} seconds".format(elapsed))

    # Class probabilities for both splits, then the log-loss of each.
    train_proba = forest.predict_proba(train_inp)
    valid_proba = forest.predict_proba(valid_inp)
    training_error = log_loss(train_target, train_proba)
    validation_error = log_loss(valid_target, valid_proba)

    print("Train error: {:02.4f}".format(training_error))
    print("Validation error: {:02.4f}".format(validation_error))

    joblib.dump(forest, rf_filename)

    return forest
def init_turns_module(values, trees, data, labels):
    """Fit the module-level ``turns_regr`` forest on columns 0 and 1 of ``data``.

    Args:
        values: Not used by this function.
        trees: Number of trees, passed as ``n_estimators``.
        data: 2-D array; only its first two columns are used as features.
        labels: Targets passed to ``fit``.
    """
    global turns_regr
    # Despite the "regr" in the name, the model is a classifier.
    turns_regr = RandomForestClassifier(n_estimators=trees)
    first_two_columns = data[:, [0, 1]]
    turns_regr.fit(first_two_columns, labels)
    # Same text as the two-item print statement: items separated by one space.
    print(" ".join(["init_turns, importances: ", str(turns_regr.feature_importances_)]))
Exemple #3
0
def fit_rf(path, index_filter=None, class_filter=None, feature_filter=None, folds=10,
           inverse=False, lc_filter=None):
    """
    Cross-validate a random forest on the dataset stored at ``path``.

    path: Location of the dataset used for training
    index_filter: Pandas index used to select which rows of the dataset to use
    class_filter: List of classes to use
    feature_filter: List of features to use
    folds: Number of stratified folds
    inverse: When true, each split is swapped (train on the fold that would
        have been the test set and test on the rest)
    lc_filter: Forwarded to ``utils.filter_data``

    Returns the per-fold results of ``metrics.predict_table`` joined with
    ``pd.concat``.
    """
    frame = pd.read_csv(path, index_col=0)
    frame, y = utils.filter_data(frame, index_filter, class_filter, feature_filter, lc_filter)

    fold_tables = []
    for train_index, test_index in cross_validation.StratifiedKFold(y, n_folds=folds):
        if inverse:
            train_index, test_index = test_index, train_index

        train_X = frame.iloc[train_index]
        test_X = frame.iloc[test_index]
        train_y = y.iloc[train_index]
        test_y = y.iloc[test_index]

        # A fresh forest is built for every fold.
        forest = RandomForestClassifier(n_estimators=100, criterion='entropy', max_depth=14,
                                        min_samples_split=5)
        forest.fit(train_X, train_y)
        fold_tables.append(metrics.predict_table(forest, test_X, test_y))

    return pd.concat(fold_tables)
def randomforest(df1, df2):
    """Train a 200-tree random forest on ``df1`` and score it on ``df2``.

    Both frames must contain a label column named ``'L'``; every other column
    is used as a feature. The input frames are not modified.

    The "Negatives" scores are computed with ``pos_label=0`` and the ROC AUC
    uses column 1 of ``predict_proba``, so the labels are presumably binary
    0/1 -- confirm against the callers.

    Returns:
        Tuple ``(roc_auc, precision, recall, precision_negatives,
        recall_negatives)``; the same five values are also printed.
    """
    label_column = 'L'

    # drop() returns a new frame, so the caller's DataFrames keep their label
    # column and the function can be called again on the same objects.
    y_train = df1[label_column].values
    X_train = df1.drop(label_column, axis=1).values
    y_test = df2[label_column].values
    X_test = df2.drop(label_column, axis=1).values

    clf = RandomForestClassifier(n_estimators=200)
    clf.fit(X_train, y_train)
    positive_scores = clf.predict_proba(X_test)[:, 1]
    y_pred = clf.predict(X_test)

    # Each metric is computed once and reused for printing and returning.
    roc = roc_auc_score(y_test, positive_scores)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    precision_neg = precision_score(y_test, y_pred, pos_label=0)
    recall_neg = recall_score(y_test, y_pred, pos_label=0)

    report = (
        ('roc: ', roc),
        ('precision: ', precision),
        ('recall:', recall),
        ('precision Negatives: ', precision_neg),
        ('recall Negatives: ', recall_neg),
    )
    for caption, value in report:
        # Label and value separated by a single space, as before.
        print(caption + ' ' + str(value))

    return roc, precision, recall, precision_neg, recall_neg
 def __init__(self, data, classes, tree_features, n_trees=100):
     """Fit ``n_trees`` one-tree forests, each on a random subset of columns.

     Args:
         data: 2-D float array that may contain NaNs.
         classes: One label per row of ``data``.
         tree_features: Number of columns sampled for each tree; capped at
             the number of usable columns.
         n_trees: Number of single-tree models to build.

     Columns that are NaN in every row and rows that are NaN in every
     remaining column are discarded first. For each tree, rows containing
     a NaN in the sampled columns are left out of that tree's training set.
     The sampled column indices are stored in ``self.col_list`` and the
     fitted models in ``self.bags``.
     """
     n_rows = np.shape(data)[0]
     # Drop columns that are NaN in every row.
     nan_per_column = np.sum(np.isnan(data), 0)
     data = data[:, nan_per_column < n_rows]
     self.n_features = np.shape(data)[1]

     # Drop rows that are NaN in every remaining column. The same mask is
     # applied to the labels; otherwise the per-tree masks below (computed
     # from the filtered data) would no longer line up with ``classes``.
     nan_per_row = np.sum(np.isnan(data), 1)
     keep_rows = nan_per_row < self.n_features
     data = data[keep_rows, :]
     classes = np.asarray(classes)[keep_rows]
     self.n_rows = np.shape(data)[0]

     if tree_features > self.n_features:
         tree_features = self.n_features

     self.col_list = np.zeros((n_trees, tree_features), dtype='int')
     self.n_trees = n_trees
     self.bags = []
     for i in range(n_trees):
         # Random column subset for this tree, kept in ascending order.
         cols = sorted(sample(range(self.n_features), tree_features))
         self.col_list[i, :] = cols
         data_temp = data[:, cols]
         # Train only on rows that are complete for the chosen columns.
         complete = np.sum(np.isnan(data_temp), 1) == 0
         data_temp = data_temp[complete, :]
         classes_temp = classes[complete]
         bag = RandomForestClassifier(n_estimators=1, max_features=tree_features)
         bag.fit(data_temp, classes_temp)
         self.bags.append(bag)
         print(np.shape(data_temp))
def train_and_predict():
    """Fit a random forest and an XGBoost model per column of ``config.Y``.

    ``config.X``, ``config.Y`` and ``config.X_test`` are replaced by NumPy
    arrays. For every column of ``config.Y`` both models are fitted on
    ``config.X`` and their predictions for ``config.X_test`` are printed and
    appended to ``config.Y_pred_rf`` / ``config.Y_pred_xgb``.
    """
    print('Converting data...')

    config.X = np.array(config.X)
    config.Y = np.array(config.Y)
    config.X_test = np.array(config.X_test)

    print('Training...')
    elapsed_minutes = (time.time() - config.start_time) / 60
    print('Time Elapsed: ' + str(elapsed_minutes))

    # Number of target columns, read from the second row of Y.
    num_classes = len(config.Y[1, :])

    for column in range(num_classes):
        print('Creating Classifier: ', column)
        rf = RandomForestClassifier(n_estimators=500, max_depth=5, n_jobs=-1, oob_score=True, verbose=2, criterion="entropy")
        gbm = xgb.XGBClassifier(n_estimators=500, objective='binary:logistic')

        print('Fitting Random Forest Classifier: ', column)
        rf.fit(config.X, config.Y[:, column])

        print('Fitting With XGBoost Classifier: ', column)
        gbm.fit(config.X, config.Y[:, column])

        print('Getting Random Forest Predictions for attribute: ', column)
        y_pred_rf = rf.predict(config.X_test)
        config.Y_pred_rf.append(y_pred_rf)
        print(y_pred_rf)

        print('Getting XGBoost Predictions for attribute: ', column)
        y_pred_xgb = gbm.predict(config.X_test)
        config.Y_pred_xgb.append(y_pred_xgb)
        print(y_pred_xgb)
Exemple #7
0
def TrainRandomForestVariance(p_subject, p_save):
    """Train two random forests on the variance features of one subject.

    The pickled frame at ``input_data_paths[p_subject]`` is reduced to the
    rows whose index label contains "variance" or "classification". Two
    binary targets are derived from the "classification" row:

    * "seizure": 1 where classification > 0 (ictal vs interictal)
    * "early":   1 where classification == 2 (IctalA vs IctalB)

    Args:
        p_subject: Key into ``input_data_paths``; also used in the file names.
        p_save: When true, both models are dumped with joblib and the
            resulting files are moved into the models directory.

    Returns:
        ``{"seizure": forest_seizure, "early": forest_early}``.
    """
    import shutil  # local import: only needed for moving the dumped files

    models_dir = "/Users/dryu/Documents/DataScience/Seizures/data/models"

    print("Welcome to TrainRandomForestVariance(" + p_subject + ", " + str(p_save) + ")")
    training_data_raw = pd.read_pickle(input_data_paths[p_subject])
    training_data = training_data_raw[["variance" in x or "classification" in x for x in training_data_raw.index]]

    # Every row except the last one, transposed so that columns are features.
    # presumably the last row is "classification" -- verify against the data.
    features = training_data[:-1].T
    classification = training_data.T["classification"]

    def _fit_forest(targets):
        # NOTE(review): min_samples_split=1 is kept from the original code;
        # confirm the installed scikit-learn version still accepts it.
        forest = RandomForestClassifier(n_estimators=500, n_jobs=1, max_features="sqrt", max_depth=None, min_samples_split=1)
        forest.fit(features, targets)
        return forest

    # Ictal vs interictal
    forest_seizure = _fit_forest([1 * (x > 0) for x in classification])

    # IctalA vs IctalB
    forest_early = _fit_forest([1 * (x == 2) for x in classification])

    # Save models
    if p_save:
        for forest, suffix in ((forest_seizure, "_seizure.pkl"), (forest_early, "_early.pkl")):
            for saved_file in joblib.dump(forest, "RFV_" + p_subject + suffix):
                # Move without a shell so names containing spaces or shell
                # metacharacters are handled and failures raise instead of
                # being ignored. The full destination path is given so an
                # existing file is replaced, as `mv` did.
                shutil.move(saved_file, os.path.join(models_dir, os.path.basename(saved_file)))

    return {"seizure": forest_seizure, "early": forest_early}
def test_string_labels_refit_false():
    """Pre-fitted classifiers with string labels score 0.97 under hard and soft voting with ``refit=False``."""
    np.random.seed(123)
    clf1 = LogisticRegression()
    clf2 = RandomForestClassifier()
    clf3 = GaussianNB()

    # Replace the three blocks of 50 labels with 'a', 'b' and 'c'.
    y_str = y.copy().astype(str)
    for label, start in (('a', 0), ('b', 50), ('c', 100)):
        y_str[start:start + 50] = label

    for fitted in (clf1, clf2, clf3):
        fitted.fit(X, y_str)

    # Hard voting first, then soft voting.
    for voting in ('hard', 'soft'):
        eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3],
                                      voting=voting,
                                      refit=False)
        eclf.fit(X, y_str)
        assert round(eclf.score(X, y_str), 2) == 0.97
def random_forest_classify(train_data,train_label,test_data):
    """Fit a 100-tree random forest, predict ``test_data`` and pass the result to ``save_result``.

    The predictions are handed to ``save_result`` together with the file name
    'sklearn_random_forest_classify_Result.csv' and are also returned.
    """
    forest = RandomForestClassifier(n_estimators=100)
    # Labels are flattened before fitting.
    flat_labels = ravel(train_label)
    forest.fit(train_data, flat_labels)

    test_label = forest.predict(test_data)
    save_result(test_label, 'sklearn_random_forest_classify_Result.csv')
    return test_label
def predict_rf(train_features, test_features, train_labels, test_labels):
  """Fit a 1000-tree random forest and print ``get_accuracy`` for the train set, then the test set."""
  model = RandomForestClassifier(n_estimators=1000)
  model.fit(train_features, train_labels)
  # Training split is reported first, the held-out split second.
  splits = ((train_features, train_labels), (test_features, test_labels))
  for features, labels in splits:
    predictions = model.predict(features)
    print(get_accuracy(predictions, labels))
Exemple #11
0
def buildTreeClassifier(predictorColumns, structurestable = 'structures.csv',  targetcolumn = 'pointGroup', md = None):
    """
    Build a random-forest classifier that predicts a structure feature from compositional data.

    Rows with missing values are dropped, and when a 'fracNobleGas' column is
    present only rows with ``fracNobleGas <= 0`` are kept. Predictors are
    standardised with a ``StandardScaler`` and the target is label-encoded.

    Returns a tuple of:
      * the model refitted on all rows,
      * a confusion-matrix DataFrame computed from one ``train_test_split``,
      * the mean ``cross_val_score`` rounded to two decimals,
      * the fitted ``LabelEncoder``.
    The fitted scaler is not returned.
    """
    table = pd.read_csv(structurestable).dropna()
    if 'fracNobleGas' in table.columns:
        table = table[table['fracNobleGas'] <= 0]

    scaler = StandardScaler()
    encoder = LabelEncoder()

    X = scaler.fit_transform(table[predictorColumns].astype('float64'))
    y = encoder.fit_transform(table[targetcolumn].values)

    rfc = RandomForestClassifier(max_depth = md)
    # Average cross-validation score of the (still unfitted) estimator.
    acc = mean(cross_val_score(rfc, X, y))

    # Confusion matrix from a single hold-out split, labelled by class name.
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    rfc.fit(X_train, y_train)
    raw_cm = confusion_matrix(y_test, rfc.predict(X_test))
    cm = pd.DataFrame(raw_cm, columns=encoder.classes_, index=encoder.classes_)

    # The returned model is trained on every row.
    rfc.fit(X, y)

    return rfc, cm, round(acc, 2), encoder
def training_and_test(token, train_data, test_data, num_classes, result):
    """Fit a 20-tree entropy forest, record its confusion matrix and return predictions.

    Args:
        token (:obj:`str`): token representing this run
        train_data (:obj:`tuple` of :obj:`numpy.array`): Tuple of training feature and label
        test_data (:obj:`tuple` of :obj:`numpy.array`): Tuple of testing feature and label
        num_classes (:obj:`int`): Number of classes
        result (:obj:`pyActLearn.performance.record.LearningResult`): LearningResult object to hold learning result

    Returns:
        Tuple ``(predicted_y, predicted_proba)``. ``predicted_proba`` always has
        ``num_classes`` columns; columns of classes the model never saw are zero.
    """
    test_features = test_data[0]

    model = RandomForestClassifier(n_estimators=20, criterion="entropy")
    model.fit(train_data[0], train_data[1].flatten())

    # Test
    predicted_y = model.predict(test_features)
    predicted_proba = model.predict_proba(test_features)

    # Evaluate the test and store the result
    run_confusion = get_confusion_matrix(num_classes=num_classes,
                                         label=test_data[1].flatten(), predicted=predicted_y)
    result.add_record(model.get_params(), key=token, confusion_matrix=run_confusion)

    if predicted_proba.shape[1] == num_classes:
        return predicted_y, predicted_proba

    # Some label is missing from the model: spread the known columns into a
    # zero matrix, using each class label as its column index.
    full_proba = np.zeros((predicted_proba.shape[0], num_classes), np.float32)
    for column, class_label in enumerate(model.classes_):
        full_proba[:, class_label] = predicted_proba[:, column]
    return predicted_y, full_proba
 def cls_create(xs, ys):
     """Fit the classifier selected by ``algo`` on ``(xs, ys)`` and tune the decision threshold.

     ``algo`` and ``self`` come from the enclosing scope. "SVM" builds an
     ``svm.SVC`` with ``C=self.parm`` and probability estimates enabled;
     "RF" builds an entropy random forest with ``int(self.parm)`` trees.

     Side effects: ``self.threshold``, ``self.positive`` and
     ``self.negative`` are set from ``best_threshold_for_f1`` applied to
     the training-set probabilities.

     Returns:
         The fitted classifier.

     Raises:
         ValueError: if ``algo`` is neither "SVM" nor "RF".
     """
     if algo == "SVM":
         classifier = svm.SVC(C = self.parm, probability=True)
     elif algo == "RF":
         classifier = RandomForestClassifier(n_estimators = int(self.parm), criterion='entropy',  n_jobs = 1)
     else:
         # Previously this fell through and failed later with an unbound
         # local; fail early with a clear message instead.
         raise ValueError("unsupported algo: {!r}".format(algo))

     # Feature selection (LDA / L1 LinearSVC when at least 20 positives) is
     # currently disabled, so the inputs are used unchanged.
     new_xs = xs

     classifier.fit(new_xs, ys)
     probs = classifier.predict_proba(new_xs)

     self.threshold, self.positive, self.negative = best_threshold_for_f1(probs, 20, ys)
     return classifier
Exemple #14
0
    def model_and_predict(self, X_train, y_train, X_test):
        """Grow one random forest per police district and predict the test rows.

        The 'PdDistrict' column (located through ``self.columns``) splits both
        ``X_train`` and ``X_test``; it is removed from the features before
        fitting. Each forest uses ``self.n_trees`` trees and a fixed
        ``random_state`` so repeated runs give the same output.

        Args:
            X_train: 2-D array of training rows.
            y_train: Labels aligned with ``X_train``.
            X_test: 2-D array of rows to predict.

        Returns:
            List of predictions in the same order as the rows of ``X_test``.

        Raises:
            KeyError: if a test row belongs to a district that has no
                training rows.
        """
        district_idx = self.columns.index('PdDistrict')
        train_districts = X_train[:, district_idx]
        test_districts = X_test[:, district_idx]

        district_ys = {}
        # Grow forest and predict separately for each district's records
        for d in set(train_districts):
            # Each mask is computed once and reused for features and labels.
            train_mask = train_districts == d
            district_X_test = np.delete(X_test[test_districts == d], district_idx, 1)
            if len(district_X_test) == 0:
                # Nothing to predict for this district, so no forest is needed.
                continue
            district_X_train = np.delete(X_train[train_mask], district_idx, 1)
            district_y_train = y_train[train_mask]
            print("Growing forest for " + str(d))

            # Not saving output in Git so make this deterministic
            # with random_state
            rf = RandomForestClassifier(n_estimators=self.n_trees, n_jobs=-1,
                                        random_state=782629)
            rf.fit(district_X_train, district_y_train)

            # An iterator hands out predictions in row order at O(1) each,
            # replacing the O(n) list.pop(0) per test row.
            district_ys[d] = iter(rf.predict(district_X_test))
            print("Finished " + str(d))

        print("All predictions made")

        # Per-district predictions are in test-row order, so consuming them
        # row by row restores the original ordering of X_test.
        return [next(district_ys[row[district_idx]]) for row in X_test]
def run():
    """Cross-validate a default random forest and log per-fold and mean accuracy and log loss.

    The fold iterator, feature matrix and labels come from ``gen_cv()``.
    """
    skf, X_all, labels = gen_cv()
    fold_accs = []
    fold_losses = []
    # NOTE(review): the first element of every split is used as the *test*
    # indices and the second as the *train* indices. scikit-learn fold
    # iterators normally yield (train, test) -- confirm this swap is intended.
    for fold, (test_index, train_index) in enumerate(skf, start=1):
        logger.info('at fold: {0}'.format(fold))
        logger.info('train samples: {0}, test samples: {1}'.format(len(train_index), len(test_index)))
        X_train = X_all[train_index]
        X_test = X_all[test_index]
        y_train = labels[train_index]
        y_test = labels[test_index]

        rfc = RandomForestClassifier(n_jobs=10, random_state=919)
        rfc.fit(X_train, y_train)
        y_test_predicted = rfc.predict(X_test)
        y_test_proba = rfc.predict_proba(X_test)

        acc = accuracy_score(y_test, y_test_predicted)
        logger.info('test data predicted accuracy: {0}'.format(acc))
        # log loss -log P(yt|yp) = -(yt log(yp) + (1 - yt) log(1 - yp))
        logloss = log_loss(y_test, y_test_proba)
        logger.info('log loss at test data: {0}'.format(logloss))
        fold_accs.append(acc)
        fold_losses.append(logloss)

    # Averages are taken over the iterator's configured number of folds.
    n_folds = skf.n_folds
    logger.info('mean acc: {0}'.format(sum(fold_accs, 0.0) / n_folds))
    logger.info('mean log loss: {0}'.format(sum(fold_losses, 0.0) / n_folds))
def main():
    """Scatter-plot a two-moons sample and pass forests of 1, 2, 4, ..., 128 trees to ``plot_surface``."""
    X, y = datasets.make_moons(n_samples=200, shuffle=True, noise=0.1, random_state=None)
    plt.scatter(X[:, 0], X[:, 1], c=y)
    # The number of trees doubles on every round.
    for n_trees in [2 ** exponent for exponent in range(8)]:
        forest = RandomForestClassifier(n_estimators=n_trees)
        forest.fit(X, y)
        plot_surface(forest, X, y)
def buildModel(df):
    """Train a 50-tree random forest that predicts 'arr_del15' from the first ``train_len`` rows.

    The feature columns are the module-level ``cols``. Three categorical
    columns are replaced by integer codes, the frame is transformed with the
    module-level ``enc`` (presumably a one-hot encoder returning a sparse
    matrix -- ``toarray()`` is called on its output), and the dense result is
    used to fit the classifier.

    Side effect: sets pandas' 'display.max_rows' option to 500 and prints the
    feature frame and the encoded shape.

    Args:
        df: DataFrame holding 'arr_del15' and the columns named in ``cols``.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    train_y = df['arr_del15'][:train_len]
    # Work on an explicit copy: assigning columns on a slice of df[cols] is a
    # chained assignment (SettingWithCopyWarning) whose target is ambiguous.
    train_x = df[cols][:train_len].copy()

    # transform categorical features into integer codes
    for column in ('unique_carrier', 'dep_conditions', 'arr_conditions'):
        train_x[column] = pd.factorize(train_x[column])[0]

    pd.set_option('display.max_rows', 500)
    print(train_x)

    train_x = enc.fit_transform(train_x)
    print(train_x.shape)

    # Create Random Forest classifier with 50 trees
    clf_rf = RandomForestClassifier(n_estimators=50, n_jobs=-1)
    clf_rf.fit(train_x.toarray(), train_y)

    print("Model built")
    return clf_rf
def ranforest(n_estimators, min_samples_split):
    """Fit a random forest on the module-level training data and print timing and accuracy.

    Uses ``features_train`` / ``labels_train`` for fitting and
    ``features_test`` / ``labels_test`` for evaluation. The model is fitted
    exactly once, inside the timed section.

    Args:
        n_estimators: Number of trees.
        min_samples_split: Passed through to ``RandomForestClassifier``.
    """
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import accuracy_score

    clf = RandomForestClassifier(n_estimators = n_estimators,
                                 min_samples_split = min_samples_split,
                                 bootstrap = True)

    # The earlier untimed fit was redundant: it doubled the training cost and
    # its result was overwritten by this one.
    t_fit = time()
    clf.fit(features_train, labels_train)
    print(" ".join(["training time:", str(round(time()-t_fit, 3)), "s"]))

    t_pred = time()
    pred = clf.predict(features_test)
    print(" ".join(["predict time:", str(round(time()-t_pred, 3)), "s"]))

    print(accuracy_score(labels_test, pred))

    # Plotting is optional: skipped when prettyPicture is not available.
    try:
        prettyPicture(clf, features_test, labels_test)
    except NameError:
        pass
 def test_save_prediction(self):
     """Fit a default forest on iris, call ``saving_predict_proba`` and delete the expected CSV file."""
     features = self.iris.data
     model = RandomForestClassifier()
     model.id = get_model_id(model)
     model.fit(features, self.iris.target)
     # One int32 index per sample: 0 .. n_samples - 1.
     indexes = np.arange(features.shape[0], dtype=np.int32)
     saving_predict_proba(model, features, indexes)
     # presumably this is the file saving_predict_proba wrote -- the name encodes the model id
     os.remove('RandomForestClassifier_r0_N__m5_0p0__m4_2__m1_auto__m0_N__m3_1__m2_N__n0_10__b0_1__c1_gini__c0_N_0_149.csv')
def crossValIteration(dat,classes,cutoff,prop=0.9,reshuffle=False):
    """Run one train/test split, fit a 1000-tree random forest and return (actual, predicted) pairs.

    While the split is made, every sample label of ``dat`` is temporarily
    rewritten as "<position>_$$_<label>"; the original labels are restored on
    ``dat`` afterwards and the prefix is stripped from the train/test labels.

    Args:
        dat: Dataset object exposing ``samples`` and ``splitTraining``.
        classes: Forwarded to ``dat.splitTraining``.
        cutoff: Not used by this function.
        prop: Forwarded to ``dat.splitTraining``.
        reshuffle: When true, ``dat.samples`` is first replaced by
            ``sampleReshuffle(dat)``.

    Returns:
        List of ``(actual, predicted)`` tuples, one per test sample.
    """
    if reshuffle:
        dat.samples = sampleReshuffle(dat)

    original_labels = list(dat.samples)
    dat.samples = ["{0}_$$_{1}".format(pos, label) for pos, label in enumerate(dat.samples)]
    train, test = dat.splitTraining(prop, classes)
    print(test.samples)
    # Positions of the selected test samples; computed but not used below.
    selectedSampleIndicies = [int(tagged.split("_$$_")[0]) for tagged in test.samples]
    dat.samples = original_labels
    print(test.samples)

    # Strip the position prefix again.
    test.samples = [tagged.split("_$$_")[1] for tagged in test.samples]
    train.samples = [tagged.split("_$$_")[1] for tagged in train.samples]
    print("Training set has {0} samples from classes: {1}".format(len(train.samples),",".join(set(train.samples))))
    print("Test set has {0} samples from classes: {1}".format(len(test.samples),",".join(set(test.samples))))
    print("Selecting data...")
    # select features for each disease
    print("Number of selections made for each class:")

    print("Setting up SVM...")
    Xtrain = train.values.transpose()
    Ytrain = train.samples

    forest = RandomForestClassifier(n_estimators=1000)
    forest.fit(Xtrain, Ytrain)

    Xtest = test.values.transpose()
    Ytest = test.samples
    print("Predicting R-forest...")
    # classification results versus actual: (actual, predicted) for each sample
    acc = list(zip(Ytest, forest.predict(Xtest)))
    print(acc)  # this is the elemental form of the "result" lists processed by callers
    n_correct = sum([actual == predicted for actual, predicted in acc])
    print(n_correct*1.0/len(acc))
    return acc
Exemple #21
0
def rforests(trainx, trainy, test, n_estimators=100, k=5):
    """Fit a random forest and return the ``k`` most probable destinations per row.

    Args:
        trainx: Training features.
        trainy: Training labels (flattened with ``np.ravel``).
        test: Test features.
        n_estimators: Number of trees.
        k: How many top classes to keep per row.

    Returns:
        ``(forest, pred_train, pred_test)``; the prediction arrays hold, for
        each row, the mapped top-``k`` classes ordered from most to least
        probable.
    """
    trainy = np.ravel(trainy)

    forest = RandomForestClassifier(n_estimators)
    forest.fit(trainx, trainy)

    prob_train = forest.predict_proba(trainx)
    prob_test = forest.predict_proba(test)

    col_names = ["country_destination_" + str(rank) for rank in range(1, k + 1)]

    def _top_k_as_countries(probabilities):
        # argsort is ascending, so the last k columns are the k largest
        # probabilities (least probable of them first).
        # NOTE(review): these are column positions of predict_proba; mapping
        # them through dicts.country presumes positions equal the encoded
        # labels -- confirm.
        top_positions = np.argsort(probabilities)[:, -k:]
        frame = pd.DataFrame(top_positions, columns=col_names)
        for name in col_names:
            frame[name] = frame[name].map(dicts.country)
        # Flip left-right so the most probable class comes first.
        return np.fliplr(frame)

    pred_train = _top_k_as_countries(prob_train)
    pred_test = _top_k_as_countries(prob_test)

    return forest, pred_train, pred_test
Exemple #22
0
def get_preds(features, trees=3000, depth=19):  # features is the number of latent features the NMF runs on
    """Train a forest classifier and a forest regressor and score both on the rows for '2016-04-11'.

    Args:
        features: Number of latent NMF features (``k``).
        trees: Number of trees for both forests.
        depth: ``max_depth`` for both forests.

    Returns:
        ``[features, classifier_accuracy, regressor_mse]`` so results can be
        compared across different numbers of features.
    """
    # Training frames
    df_full = add_yahoo_to_df(get_nmf(k=features))
    # NOTE(review): the dummy-encoded training frame is built but never used;
    # both models below are fitted on df_full, while the test frame IS
    # dummy-encoded -- confirm which one is intended.
    df_train = add_dummies(df_full)

    # Test frames ('data_wednesday' is the folder holding the json data)
    df_test_full = add_yahoo_to_df(get_nmf('data_wednesday', k=features))
    df_test_full = add_dummies(df_test_full)

    # Fit the classifier
    X_model_class, y_model_class = get_classifier_data(df_full)
    rf_class = RandomForestClassifier(n_estimators=trees, max_depth=depth)
    rf_class.fit(X_model_class, y_model_class)

    # Fit the regressor
    X_model_regress, y_model_regress = get_regressor_data(df_full)
    rf_regress = RandomForestRegressor(n_estimators=trees, max_depth=depth)
    rf_regress.fit(X_model_regress, y_model_regress)

    # Evaluation data: the rows selected by '2016-04-11'
    X_classify, y_classify = get_classifier_data(pd.DataFrame(df_test_full.ix['2016-04-11']))
    X_regress, y_regress = get_regressor_data(pd.DataFrame(df_test_full.ix['2016-04-11']))

    # Score both models
    classifier_accuracy = accuracy_score(rf_class.predict(X_classify), y_classify)
    regressor_mse = mean_squared_error(rf_regress.predict(X_regress), y_regress)

    return [features, classifier_accuracy, regressor_mse]
Exemple #23
0
def main():
    """Print single-tree feature importances, then plot a validation curve over ``n_estimators``."""
    S, col_names_S = load_data(config.paths.training_data,
                               config.paths.cache_folder)
    Xs, Ys, col_names_S = extract_xy(S, col_names_S)
    flat_labels = Ys.toarray().ravel()

    # Feature importances of a forest with a single tree.
    single_tree = RandomForestClassifier(n_estimators=1)
    single_tree.fit(Xs.toarray(), flat_labels)
    best_features = single_tree.feature_importances_
    max_ind, max_val = max(enumerate(best_features), key=operator.itemgetter(1))
    print(best_features)
    # Two items separated by one space, as the print statement emitted them.
    print(" ".join([str(max_ind), str(max_val)]))

    print(Xs.shape)
    print(Ys.shape)
    param_range = [1, 3, 5, 7, 10, 15, 20, 30, 60, 80]
    train_scores, test_scores = validation_curve(RandomForestClassifier(criterion='entropy'), Xs, Ys.toarray().ravel(),
                                                 'n_estimators', param_range)

    print(train_scores)
    print(test_scores)

    plt.title("Validation Curve for Random Forest")
    plt.xlabel("Number of Trees")
    plt.ylabel("Score")
    # Mean line with a +/- one standard deviation band per score set.
    curves = ((train_scores, "Training Score", 'r'), (test_scores, "Test Score", 'b'))
    for scores, label, color in curves:
        center = np.mean(scores, axis=1)
        spread = np.std(scores, axis=1)
        plt.plot(param_range, center, label=label, color=color)
        plt.fill_between(param_range, center - spread, center + spread, alpha=0.2, color=color)
    plt.legend(loc="best")
    plt.show()
Exemple #24
0
def rand_forest(train_bow,train_labels,test_bow,test_labels,bow_indexes):
    """Fit a default random forest on the training bag-of-words and pass it to ``test`` under the tag "rf"."""
    print("Training rndForest")
    forest = RandomForestClassifier()
    forest.fit(train_bow, train_labels)

    print("Testing rndForest")
    test(forest, "rf", test_bow, test_labels, bow_indexes)
 def randomForest_eval_func(self, chromosome):
     """Return the AUC of a random forest configured by ``chromosome``.

     The chromosome is decoded into ``n_estimators``, ``max_features`` and
     ``window_size``. A configuration already present in the log is
     answered from the log. Otherwise the forest is evaluated fold by
     fold; when a fold's test and training labels together exceed 1000
     samples, only that fold is evaluated and its values are used without
     averaging. The results are written to and added to the log.
     """
     n_estimators, max_features, window_size = self.decode_chromosome(chromosome)
     if self.check_log(n_estimators, max_features, window_size):
         return self.get_means_from_log(n_estimators, max_features, window_size)[0]

     folded_dataset = self.create_folded_dataset(window_size)
     indim = 21 * (2 * window_size + 1)  # computed but not used below
     mean_AUC = 0
     mean_decision_value = 0
     mean_mcc = 0
     stop_after_first_large_fold = False
     for test_fold in range(self.fold):
         test_labels, test_dataset, train_labels, train_dataset = folded_dataset.get_test_and_training_dataset(test_fold)
         if len(test_labels) + len(train_labels) > 1000:
             stop_after_first_large_fold = True

         forest = RandomForestClassifier(n_estimators=n_estimators, max_features=max_features)
         forest.fit(train_dataset, train_labels)
         # Second probability column: probability of being a binding residue.
         decision_values = [row[1] for row in forest.predict_proba(test_dataset)]
         AUC, decision_value_and_max_mcc = validate_performance.calculate_AUC(decision_values, test_labels)

         mean_AUC += AUC
         mean_decision_value += decision_value_and_max_mcc[0]
         mean_mcc += decision_value_and_max_mcc[1]
         if stop_after_first_large_fold:
             break

     # Sums are only turned into means when every fold was evaluated.
     if not stop_after_first_large_fold:
         mean_AUC /= self.fold
         mean_decision_value /= self.fold
         mean_mcc /= self.fold

     self.write_log(n_estimators, max_features, window_size, mean_AUC, mean_decision_value, mean_mcc)
     self.add_log(n_estimators, max_features, window_size, mean_AUC, mean_decision_value, mean_mcc)
     return mean_AUC
Exemple #26
0
    def fit(self, x, y):
        """Fit one logistic model per channel and per band, then a forest on their predictions.

        ``x`` is sliced column-wise: channel ``i`` owns the contiguous block of
        ``n_features`` columns starting at ``i * n_features``; band ``i`` takes
        every ``n_features``-th column starting at column ``i``. The training
        predictions of all base models form the input of the top-level random
        forest. The base models are stored in ``self.models`` (channels first,
        then bands) and the forest in ``self.c``.
        """
        n_channels = self.n_channels
        n_features = self.n_features
        base_models = []
        stacked = np.zeros((len(x), n_channels + n_features))

        # create channel based models
        for channel in range(n_channels):
            print('training channel model {}'.format(channel))
            columns = x[:, (channel * n_features):((channel + 1) * n_features)]
            channel_model = LogisticRegression()
            channel_model.fit(columns, y)
            base_models.append(channel_model)
            stacked[:, channel] = channel_model.predict(columns)

        # create band based models
        for band in range(n_features):
            print('training band model {}'.format(band))
            columns = x[:, band:(n_channels * n_features):n_features]
            band_model = LogisticRegression()
            band_model.fit(columns, y)
            base_models.append(band_model)
            stacked[:, n_channels + band] = band_model.predict(columns)

        # create integrating forest
        top_classifier = RandomForestClassifier()
        top_classifier.fit(stacked, y)

        self.models = base_models
        self.c = top_classifier
Exemple #27
0
class Model:
    """Versioned wrapper around a random forest for the gibberish model.

    The constructor fits an initial forest (version 0); ``fit`` replaces it
    with a freshly trained one and bumps the version; ``predict`` delegates
    to the current forest.
    """
    def __init__(self, X, y, ntrees=500):
        """Fit the initial forest with ``ntrees`` trees on ``X``, ``y``."""
        self.ntrees = ntrees
        forest = RandomForestClassifier(n_estimators=ntrees)
        self.clf = forest
        self.clf = forest.fit(X, y)
        self.version = 0

    def fit(self, X, y):
        """Replace the forest with a new one trained on ``X``, ``y`` and return ``self``."""
        replacement = RandomForestClassifier(n_estimators=self.ntrees)
        self.clf = replacement
        self.clf = replacement.fit(X, y)
        current = self.version
        print("updating model from " + str(current) + " to " + str(current + 1) + ".")
        self.version = current + 1
        return self

    def predict(self, X):
        """Return the current forest's predictions for ``X``."""
        labels = self.clf.predict(X)
        print("using version " + str(self.version))
        return labels

    def __repr__(self):
        return "<Model(version='%s')>" % (self.version)
def Random_Forest_classifier(train_input_data,train_output_data,test_input_data,test_output_data):
    """Plot accuracy against forest size for 10, 20, ..., 190 trees.

    A prediction counts as correct when the actual value appears among the
    entries (after the first one) of the rows of the module-level
    ``similar_univs`` table whose 'univName' equals the predicted value.
    The curve is saved to "rf1.png" and shown.

    Returns:
        The predictions (as a list) of the last forest that was trained.
    """
    tree_counts = []
    accuracies = []
    for trees in range(10,200,10):
        forest = RandomForestClassifier(trees)
        forest.fit(train_input_data, train_output_data)
        predicted_output = forest.predict(test_input_data)

        # Work with plain lists from here on.
        if not isinstance(predicted_output, list):
            predicted_output = predicted_output.tolist()
        if not isinstance(test_output_data, list):
            test_output_data = test_output_data.tolist()

        errors = []
        for position, actual in enumerate(test_output_data):
            matching_rows = similar_univs[similar_univs['univName'] == predicted_output[position]]
            # Flatten the matching rows into one list of values.
            flattened = [value for row in matching_rows.values.tolist() for value in row]
            errors.append(0 if actual in flattened[1:] else 1)

        tree_counts.append(trees)
        accuracies.append(100 -((sum(errors)/float(len(errors))) * 100))

    plt.plot(np.array(tree_counts), np.array(accuracies))
    plt.xlabel('Number of trees')
    plt.ylabel('Percent of accuracy')
    plt.title('Varation of accuracy with trees')
    plt.grid(True)
    plt.savefig("rf1.png")
    plt.show()
    return predicted_output
Exemple #29
0
def onescore(X, Y, Xtest):
    """Fit a 1000-tree forest, print its out-of-bag score and parameters, and pass the test predictions to ``output``."""
    settings = dict(oob_score=True, n_jobs=-1, n_estimators=1000, max_features=300, random_state=0)
    forest = RandomForestClassifier(**settings)
    forest.fit(X, Y)
    # Label and value separated by one space, as the print statement emitted them.
    print(" ".join(["oob_score = ", str(forest.oob_score_)]))
    print(forest.get_params())
    output(forest.predict(Xtest), "try_004.csv")
def cross_validate():
    """Skeleton of a 10-fold cross-validation that prints the mean log-loss.

    Reads the training frame with ``cu.get_dataframe(train_file)``, fits a
    10-tree random forest per fold and averages ``llfun`` over the folds.

    NOTE(review): as written this function cannot run to completion -- see
    the notes inside the loop (undefined names, and data that is not
    restricted to the current fold).
    """
    print("Reading the data")
    data = cu.get_dataframe(train_file)

    print("Cross-Validating")
    rf = RandomForestClassifier(n_estimators=10,
                                verbose=1,
                                compute_importances=True,
                                n_jobs=2)
    # The fold iterator is built over the number of rows in `data`.
    cv = cross_validation.KFold(len(data),
                                k=10,
                                indices=False)
    results = []
    for traincv, testcv in cv:
        print "\t-- cv [%d]"%len(results)
        print "\t","extracting features"
        #...
        feacv = features.extract_features(feature_names,
                                          traincv)
        print "\t","learning"
        # NOTE(review): the target is the full "OpenStatus" column, not only
        # the rows of the current training fold -- confirm.
        rf.fit(feacv, data["OpenStatus"])
        print "\t","predicting"
        # NOTE(review): testcv comes straight from the fold iterator; no
        # features are extracted for it before predicting -- confirm.
        probs = rf.predict_proba(testcv)
        print "\t","evaluating"
        # NOTE(review): `probas` and `target` are not assigned anywhere in
        # this function (the predictions above are stored in `probs`).
        results.append( llfun(target[testcv],
                              [x["OpenStatus"] for x in probas]) )
    print "LogLoss: " + str( np.array(results).mean() )
def train_rf(feature,label,params_dummy):
        """Return a 70-tree random forest (``random_state=10``) fitted on ``feature`` / ``label``.

        ``params_dummy`` is accepted but not used.
        """
        forest = RandomForestClassifier(n_estimators=70, random_state=10)
        forest.fit(feature, label)
        return forest
# Accuracy (in percent) of the previously fitted `log` model on both splits.
print("Training Accuracy is: ")
print(log.score(X_train, Y_train) * 100)
print(" Testing accuracy is: ")
print(log.score(X_test, Y_test) * 100)
#print("Precision is: ")
#print(precision)
#print("Recall is: ")
#print(recall)

print("-----------Random Forest Classifier----------------")

# 75/25 split with a fixed seed so the numbers are reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=0)
print(X_train.shape)
print(Y_test.shape)
rand_forest = RandomForestClassifier(random_state=0, n_estimators=250, min_samples_split=8, min_samples_leaf=4)
rand_forest.fit(X_train, Y_train)
Y_predict = rand_forest.predict(X_test)
confusion_mat_random = confusion_matrix(Y_test, Y_predict)
print("Random forest accuracy: ")
print(accuracy_score(Y_test, Y_predict) * 100)
print('Confusion Matrix for Random forest: ')
# Rows are the true labels (Y_test) and columns the predictions (Y_predict);
# the axis captions were previously swapped.
print(pd.crosstab(Y_test, Y_predict, rownames=['True Values'], colnames=['Predicted Values']))


print("-------------SVM---------------------------------")


# Support vector classifier with default settings on the same split.
clf = svm.SVC()
y_pred = clf.fit(X_train, Y_train).predict(X_test)
accuracy = accuracy_score(Y_test, y_pred)
from sklearn.datasets import load_files
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
# Fetch the NLTK resources used below (stop-word list and WordNet data).
nltk.download('stopwords')
nltk.download('wordnet')

# Load the labelled review files; ``data`` holds the raw documents and
# ``target`` the class index of each one.
temporal_data = load_files(r"txt_sentoken")
X, y = temporal_data.data, temporal_data.target

documents = []
stemmer = WordNetLemmatizer()

# Pre-processing tasks
for raw_document in X:
    # Remove all the special characters
    document = re.sub(r'\W', ' ', str(raw_document))
    # Remove all single characters
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
    # Remove single characters from the start.  The original pattern escaped
    # the caret (r'\^...'), which looks for a literal '^' that can no longer
    # exist after the \W substitution above, so the step never matched.
    document = re.sub(r'^[a-zA-Z]\s+', ' ', document)
    # Substitute multiple spaces with a single space
    document = re.sub(r'\s+', ' ', document, flags=re.I)
    # Remove a prefixed 'b' (left over from str() of a bytes object)
    document = re.sub(r'^b\s+', '', document)
    # Convert to lowercase
    document = document.lower()
    # Lemmatization
    document = document.split()
    document = [stemmer.lemmatize(word) for word in document]
    document = ' '.join(document)
    documents.append(document)

# Bag of Words model to convert text documents into numerical features
vectorizer = CountVectorizer(max_features=1500, min_df=5, max_df=0.7,
                             stop_words=stopwords.words('english'))
X = vectorizer.fit_transform(documents).toarray()

# Training and testing splits
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Evaluating the model
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
Exemple #34
0
#
# Now it's time to train the model!
#
# **Create an instance of the RandomForestClassifier class and fit it to the training data from the previous step.**

# In[29]:

from sklearn.ensemble import RandomForestClassifier

# In[30]:

# Forest of 600 trees; every other setting is left at the library default.
rfc = RandomForestClassifier(n_estimators=600)

# In[31]:

rfc.fit(X_train, y_train)

# ## Predictions and Evaluation
#
# Let's predict on the X_test features and evaluate our model.
#
# **Predict the class of not.fully.paid for the X_test data.**

# In[32]:

predictions = rfc.predict(X_test)

# **Create a classification report from the results.**

# In[33]:
#print(df.head())
#print(df.describe())

# Column 11 is the target; the first eleven columns are the features.
Y = df.iloc[:, 11]
X = df.iloc[:, :11]

# train_test_split already returns the four pieces in unpacking order.
# Wrapping the result in np.asarray tried to pack differently shaped pandas
# objects into a single array, which recent NumPy rejects, so it is dropped.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1)

print("X_train contain = ", X_train.shape, "    and    Y_train contain = ", Y_train.shape)
print("X_test  contain = ", X_test.shape, "     and    Y_test   contain = ", Y_test.shape)


# Fit each classifier with default settings and report its held-out score.
model1 = LogisticRegression()
model1.fit(X_train, Y_train)
print('Logistic Regression Test Score = ', model1.score(X_test, Y_test))

model2 = DecisionTreeClassifier()
model2.fit(X_train, Y_train)
print('Decision Tree Classifier Test Score = ', model2.score(X_test, Y_test))

model3 = AdaBoostClassifier()
model3.fit(X_train, Y_train)
print('Ada Boost Classifier Test Score = ', model3.score(X_test, Y_test))

model4 = RandomForestClassifier()
model4.fit(X_train, Y_train)
print('Random Forest Classifier Test Score = ', model4.score(X_test, Y_test))

model5 = MLPClassifier()
model5.fit(X_train, Y_train)
# The original printed this score under the Random Forest label.
print('MLP Classifier Test Score = ', model5.score(X_test, Y_test))
Exemple #36
0
    data.drop(columns_one_hot, axis=1, inplace=True)    # drop()函数:将指定的列按指定的方向删除,并返回

    print u'处理后数据4:\n', data.head(10)

    columns = list(data.columns)
    columns.remove('churn')
    x = data[columns]    # 得到数据
    y = data['churn']    # 得到类标记label
    print u'分组与one-hot编码后:\n', x.head(10)

    # 数组或矩阵分割成随机训练和测试子集
    x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.75, random_state=0)

    clf = RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=12, min_samples_split=5,
                                 oob_score=True, class_weight={0: 1, 1: 1/y_train.mean()})
    clf.fit(x_train, y_train)   # Build a forest of trees from the training set (X, y).

    # 特征选择   clf.feature_importances_ : 返回重要的特征权值或者说重要性
    important_features = pd.DataFrame(data={'features': x.columns, 'importance': clf.feature_importances_})
    print 'important_features:\n', important_features
    # 按照importance进行排序
    important_features.sort_values(by='importance', axis=0, ascending=False, inplace=True)
    important_features['cum_importance'] = important_features['importance'].cumsum()    # 返回累加的和
    print u'特征重要度:\n', important_features
    # 返回important_features['cum_importance']小于0.95的数据,取出‘features’这一列
    selected_features = important_features.loc[important_features['cum_importance'] < 0.95, 'features']

    # 重新组织数据
    x_train = x_train[selected_features]
    x_test = x_test[selected_features]
                            max_features='auto',
                            bootstrap=True)
# Gradient boosting: 200 stages with learning rate 0.1.
GB = GradientBoostingClassifier(loss='deviance',
                                learning_rate=0.1,
                                n_estimators=200,
                                max_features='auto')
# Extra-trees: 10 trees, Gini criterion, no bootstrap sampling.
ET = ExtraTreesClassifier(n_estimators=10,
                          criterion='gini',
                          max_features='auto',
                          bootstrap=False)

# Array views of the data; none of these three names is used in the lines
# below, which pass train_data_X / train_data_Y / test_data_X directly.
y_train = train_data_Y.loc[:].ravel()
x_train = train_data_X.values
x_test = test_data_X.values

# For each model: fit, keep the feature importances, score on the test data.
# The bare ``*_feature`` lines are expression statements with no effect in a
# script (presumably notebook cell output - confirm).
# ``RF`` is created before this excerpt.
RF.fit(train_data_X, train_data_Y)
RF_feature = RF.feature_importances_
RF_feature
rf_score = RF.score(test_data_X, test_data_Y)
print("RandomForestClassifier score is:", rf_score)

GB.fit(train_data_X, train_data_Y)
GB_feature = GB.feature_importances_
GB_feature
gb_score = GB.score(test_data_X, test_data_Y)
print("GradientBoostingClassifier score is:", gb_score)

ET.fit(train_data_X, train_data_Y)
ET_feature = ET.feature_importances_
ET_feature
et_score = ET.score(test_data_X, test_data_Y)
Exemple #38
0
from sklearn.ensemble import RandomForestClassifier

# Earlier configuration, kept for reference:
#model = RandomForestClassifier(criterion='gini', n_estimators=700,
#                             min_samples_split=10,min_samples_leaf=1,
#                             max_features='auto',oob_score=True,
#                             random_state=1,n_jobs=-1)

# Active configuration: 800 trees with every other keyword spelled out.
# NOTE(review): min_impurity_split and max_features='auto' may not be accepted
# by recent scikit-learn releases - confirm against the installed version.
model = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=800,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

# Fit on all_features / Targeted_feature; the hold-out and cross-validation
# evaluation below is commented out.
model.fit(all_features, Targeted_feature)
#prediction_rm=model.predict(X_test)
#print('--------------The Accuracy of the model----------------------------')
#print('The accuracy of the Random Forest Classifier is', round(accuracy_score(prediction_rm,y_test)*100,2))
#kfold = KFold(n_splits=10, random_state=22) # k=10, split the data into 10 equal parts
#result_rm=cross_val_score(model,all_features,Targeted_feature,cv=10,scoring='accuracy')
#print('The cross validated score for Random Forest Classifier is:',round(result_rm.mean()*100,2))
#y_pred = cross_val_predict(model,all_features,Targeted_feature,cv=10)
#sns.heatmap(confusion_matrix(Targeted_feature,y_pred),annot=True,fmt='3.0f',cmap="summer")
#plt.title('Confusion_matrix', y=1.05, size=15)




# Import data
test_df=pd.read_csv("data/test.csv")
Exemple #39
0
# 4.1 Types of models
# Each assignment below overwrites the previous one, so only the last active
# line (GaussianNB) decides which model is trained; comment out the others to
# try a different estimator.
# Random Forest
model = RandomForestClassifier(n_estimators=100)
# SVM (Support Vector Machine)
model = SVC()
# Gradient Boosting Classifier
model = GradientBoostingClassifier()
# k-nearest neighbors
model = KNeighborsClassifier(n_neighbors=3)
# Gaussian Naive Bayes
model = GaussianNB()
# Logistic Regression
#model = LogisticRegression()

# Train the model
model.fit(train_X, train_y)

# 5. Evaluation
# Score the model on the training and validation sets
print(model.score(train_X, train_y), model.score(valid_X, valid_y))

# It does not work with the regression model
#plot_model_var_imp(model, train_X, train_y)

# Recursive feature elimination with 2-fold stratified cross-validation,
# removing one feature per step and scoring by accuracy.
# NOTE(review): StratifiedKFold(train_y, 2) is the older call form that takes
# the labels as first argument - confirm against the installed scikit-learn.
rfecv = RFECV(estimator=model,
              step=1,
              cv=StratifiedKFold(train_y, 2),
              scoring='accuracy')
rfecv.fit(train_X, train_y)

print(rfecv.score(train_X, train_y), rfecv.score(valid_X, valid_y))
import pickle

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# Column 1 is the target; columns 2 onward are the features.
X = data.iloc[:, 2:].values
y = data.iloc[:, 1].values

# Encoding categorical data (the target labels become integer codes)
labelencoder_X_1 = LabelEncoder()
y = labelencoder_X_1.fit_transform(y)

# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)

# Feature scaling: fit on the training set only, then apply to both sets
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
X_train = pd.DataFrame(X_train)

X_test = pd.DataFrame(X_test)

rf = RandomForestClassifier()
rf.fit(X_train, y_train)

# Use a context manager so the file is flushed and closed even if pickling
# fails; the original left the handle from open() dangling.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(rf, model_file)
Exemple #41
0
# In[619]:

##-----Multiclass - Random Forest classification - TFIDF, ----------##
##__________________________________________________________________##


# In[620]:

#Random Forest
from sklearn.ensemble import RandomForestClassifier


# In[621]:

# Each split considers up to 2000 features; the model is fitted on
# X_train_fit, not on the X_train whose shape is inspected below.
clf = RandomForestClassifier(max_features=2000)
clf.fit(X_train_fit, y_train)


# In[622]:

# The remaining cells are bare expressions with no effect in a script
# (presumably notebook cells displaying the shapes - confirm).
X_train.shape


# In[623]:

X_test.shape


# In[624]:

y_train.shape
Exemple #42
0
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
df = pd.read_csv(
    "/Users/prateekb/Downloads/PythonPractice/Machine Learning/DecisionTree/loan_borowwer_data.csv"
)
# Replace the categorical 'purpose' column with its integer category codes.
df.purpose = pd.Categorical(df.purpose)
df['purpose'] = df.purpose.cat.codes
x = df.iloc[:, 0:13]
y = df['not.fully.paid']
X_train, X_test, Y_train, Y_test = train_test_split(x,
                                                    y,
                                                    test_size=.20,
                                                    random_state=43)

# Single decision tree
dt = tree.DecisionTreeClassifier()
dt.fit(X_train, Y_train)
Y_pred = dt.predict(X_test)
Y_pred_train = dt.predict(X_train)

# accuracy_score takes (y_true, y_pred); accuracy is symmetric, so the value
# is unchanged, but the documented order is used for clarity.
dt_test_accuracy = accuracy_score(Y_test, Y_pred)
print("The train accuracy score is: ", accuracy_score(Y_train, Y_pred_train))
print("The test accuracy score is: ", dt_test_accuracy)

# Random forest
rm = RandomForestClassifier()
rm.fit(X_train, Y_train)
Y_pred = rm.predict(X_test)
Y_pred_train = rm.predict(X_train)

rf_test_accuracy = accuracy_score(Y_test, Y_pred)
print("The train accuracy score is: ", accuracy_score(Y_train, Y_pred_train))
print("The test accuracy score is: ", rf_test_accuracy)

# Report the conclusion the scores support; the original printed the first
# message unconditionally, although neither model is seeded.
if rf_test_accuracy > dt_test_accuracy:
    print("Random Forest gives better test accuracy")
elif rf_test_accuracy < dt_test_accuracy:
    print("Decision Tree gives better test accuracy")
else:
    print("Both models give the same test accuracy")
# Label encoding: fit on the labels of both sets so that classes appearing in
# only one of them still receive a code.
le = preprocessing.LabelEncoder()
le.fit(np.concatenate([train['y'].unique(), val["y"].unique()]))

train['y'] = le.transform(train['y'])
val["y"] = le.transform(val["y"])

# Previously selected features
cols = [u'SD1', u'SD2', u'SD3', u'SD4', u'SD5', u'P4', u'P5', u'P10', u'P11',
       u'P13', u'P15', u'P18', u'P21', u'P30', u'P37', u'P45', u'P48', u'P49',
       u'P54', u'P57', u'P60', u'P64', u'P69', u'P71', u'P72', u'P73', u'P74',
       u'P76', u'P77', u'P78', u'P79', u'P80', u'P82', u'P83', u'P85', u'P90',
       u'P91', u'P92', u'P93']

# Training
rf = RandomForestClassifier(n_estimators=35, criterion="entropy", bootstrap=True, max_depth=20 , n_jobs=-1, random_state=17)
rf.fit(train[cols], train["y"])

# Prediction
ypred = rf.predict(val[cols])

# The model is not needed after predicting; drop the reference to it.
del rf

# Metrics: accuracy and Cohen's kappa on the validation set (the Spanish
# output label 'Precisión' is printed next to the accuracy value).
acc = accuracy_score(val.y.values, ypred)
ck = cohen_kappa_score(val.y.values, ypred)
print('Resultados para RandomForest:')
print('Precisión: {}'.format(acc))
print('Coeficiente kappa: {}'.format(ck))
# Append 200 * n_features columns of random noise to the original features.
n_samples, n_features = X.shape
X = np.c_[X, random_state.randn(n_samples, 200 * n_features)]

# Split into training and test
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=.5,
                                                    random_state=random_state)

# Run classifier
classifier = OneVsRestClassifier(
    svm.SVC(kernel='linear', probability=True, random_state=random_state))
y_score = classifier.fit(X_train, y_train).decision_function(X_test)

# Random forest predictions; y_pred is not used in the lines that follow.
clf = RandomForestClassifier(n_jobs=4)
y_pred = clf.fit(X_train, y_train).predict(X_test)

# Compute Precision-Recall and plot curve
# (one curve per class, indexing column i of y_test and y_score)
precision = dict()
recall = dict()
average_precision = dict()
for i in range(n_classes):
    precision[i], recall[i], _ = precision_recall_curve(
        y_test[:, i], y_score[:, i])
    average_precision[i] = average_precision_score(y_test[:, i], y_score[:, i])

# Compute micro-average precision-recall curve over the flattened arrays
precision["micro"], recall["micro"], _ = precision_recall_curve(
    y_test.ravel(), y_score.ravel())
average_precision["micro"] = average_precision_score(y_test,
                                                     y_score,
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load the breast-cancer data as a (features, target) pair and show both.
features, target = load_breast_cancer(return_X_y=True)
print(features)
print(target)

# Seeded random forest, fitted on the complete data set.
seed = 888
rf_model = RandomForestClassifier(random_state=seed)
rf_model.fit(features, target)

# Predict the same rows the model was trained on, then report accuracy.
preds = rf_model.predict(features)
print(preds)

acc = accuracy_score(target, preds)
print(acc)
Exemple #46
0
隨機森林參數:
n_estimators: 樹的數量(default=10)。
min_samples_leaf: 最終葉節點最少樣本數(default=1);
                  當樣本不大時,可不設定使用預設,若樣本數量非常大時,則推薦增加此參數值。
min_samples_split:節點再劃分時所需的最小樣本數(default=2);
                  當樣本不大時,可不設定使用預設,若樣本數量非常大時,則推薦增加此參數值。
oob_score: 是否採用袋外樣本(out-of-bag samples)來評估模型的準確度(default=False)。
'''
from sklearn.ensemble import RandomForestClassifier
# 1000 trees, at least 20 samples to split a node, out-of-bag scoring on,
# fixed seed, n_jobs=-1.
rfc = RandomForestClassifier(n_estimators=1000,
                             min_samples_split=20,
                             min_samples_leaf=1,
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rfc.fit(x_train, y_train)
# The string below is a free-standing note (in Chinese) about
# cross-validation: K-fold splits the data into k sub-samples for testing.
'''
交叉驗證Cross-Validation
 K-fold是拆解成 k 個子樣本來做交叉測試
'''
from sklearn.model_selection import cross_val_score

# Logistic regression: LR acc is 0.8239700374531835 (different variables: 7 columns)
acc_LR = np.mean(rfecv.grid_scores_)

# Random forest: RFC acc is 0.8239700374531835
acc_RFC = rfc.oob_score_

# Support vector machine: SVM acc is 0.6329783950617284
from sklearn import svm
svc = svm.SVC()
Exemple #47
0
class RF:
    def __init__(self):

        self.X = None
        self.Y = None

        self.nobj = 0
        self.nvarx = 0

        self.quantitative = False
        self.autoscale = False
        self.estimators = 0
        self.features = ''
        self.random = False
        self.class_weight = False
        self.learning_curve = True
        self.cv = None
        self.n = 2
        self.p = 1
        self.scoring_function = None

        self.mux = None
        self.wgx = None

        self.TP = 0
        self.TN = 0
        self.FP = 0
        self.FN = 0

        self.TPpred = 0
        self.TNpred = 0
        self.FPpred = 0
        self.FNpred = 0

        self.SDEC = 0.00  # SD error of the calculations
        self.R2 = 0.00  # determination coefficient
        self.scoringR = 0.00
        self.SDEP = 0.00
        self.Q2 = 0.00
        self.scoringP = 0.00
        self.OOBe = 0.00

        self.clf = None

        self.vpath = None

    def saveModel(self, filename):
        """Save the model to ``filename`` plus a pickled classifier beside it.

        The configuration and statistics attributes are written one after
        another with ``np.save`` into ``filename``; the estimator in
        ``self.clf`` is dumped with joblib to ``clasifier.pkl`` in the same
        directory.

        Args:
            filename: path of the numpy file to write.
        """
        # Order matters: loadModel reads the values back in this sequence.
        fields = ('nobj', 'nvarx',
                  'quantitative', 'autoscale', 'estimators', 'features',
                  'random', 'class_weight', 'learning_curve', 'cv', 'n', 'p',
                  'mux', 'wgx',
                  'TP', 'TN', 'FP', 'FN',
                  'TPpred', 'TNpred', 'FPpred', 'FNpred',
                  'SDEC', 'R2', 'scoringR', 'Q2', 'SDEP', 'scoringP',
                  'OOBe',
                  'vpath')

        # open() instead of the Python-2-only file() builtin; the context
        # manager closes the handle even if a save fails part-way.
        with open(filename, 'wb') as f:
            for name in fields:
                np.save(f, getattr(self, name))

        # the classifier cannot be saved with numpy
        # os.path.join keeps a bare filename's pickle in the current directory;
        # dirname + '/clasifier.pkl' pointed at the filesystem root.
        joblib.dump(self.clf,
                    os.path.join(os.path.dirname(filename), 'clasifier.pkl'))

    def loadModel(self, filename):
        """Load a model written by saveModel from its two files.

        Reads the attributes back from ``filename`` in the order saveModel
        wrote them, then loads the estimator from ``clasifier.pkl`` in the
        same directory.

        NOTE(review): both the numpy stream (object entries) and the joblib
        file are unpickled, so only load files from a trusted source.

        Args:
            filename: path of the numpy file written by saveModel.
        """
        fields = ('nobj', 'nvarx',
                  'quantitative', 'autoscale', 'estimators', 'features',
                  'random', 'class_weight', 'learning_curve', 'cv', 'n', 'p',
                  'mux', 'wgx',
                  'TP', 'TN', 'FP', 'FN',
                  'TPpred', 'TNpred', 'FPpred', 'FNpred',
                  'SDEC', 'R2', 'scoringR', 'Q2', 'SDEP', 'scoringP',
                  'OOBe',
                  'vpath')

        with open(filename, 'rb') as f:
            for name in fields:
                # Attributes that were None (or arbitrary objects such as the
                # cross-validator) are stored pickled, which np.load refuses
                # unless allow_pickle is set.
                value = np.load(f, allow_pickle=True)
                # np.save wraps scalars, strings and None in 0-d arrays;
                # unwrap them so the attribute gets its original value back.
                if value.ndim == 0:
                    value = value.item()
                setattr(self, name, value)

        # the classifier cannot be loaded with numpy
        self.clf = joblib.load(
            os.path.join(os.path.dirname(filename), 'clasifier.pkl'))

    def build(self,
              X,
              Y,
              quantitative=False,
              autoscale=False,
              nestimators=0,
              features='',
              random=False,
              tune=False,
              class_weight="balanced",
              cv='loo',
              n=2,
              p=1,
              lc=True,
              vpath=''):
        """Configure and fit a new RF model on the X and Y numpy matrices.

        Stores the settings on the instance, optionally centers/scales X,
        optionally tunes the number of trees and max features through
        optimize(), fits a regressor (quantitative) or a classifier
        (qualitative), and optionally saves learning-curve plots.  The stored
        X and Y are reset to untouched copies of the inputs before returning.
        """

        # Dimensions and settings
        self.nobj, self.nvarx = np.shape(X)

        self.quantitative = quantitative
        self.autoscale = autoscale
        self.estimators = nestimators
        self.features = features
        self.random = random
        self.class_weight = class_weight
        self.learning_curve = lc
        self.n = n
        self.p = p
        self.cv = cv

        self.X = X.copy()
        self.Y = Y.copy()

        self.vpath = vpath

        # Work on a centered and scaled copy when autoscaling is requested
        if autoscale:
            self.X, self.mux = center(self.X)
            self.X, self.wgx = scale(self.X, autoscale)

        # A fixed seed makes the model reproducible unless `random` is set
        seed = None if random else 1226  # no reason to pick this number

        if self.cv:
            self.cv = getCrossVal(self.cv, seed, self.n, self.p)

        if tune:
            self.estimators, self.features = self.optimize(self.X, self.Y)

            # optimize() reports "all features" as the string 'none'
            if self.features == 'none':
                self.features = None

        # Pick the estimator class; only the classifier takes class weights
        if self.quantitative:
            print("Building Quantitative RF model")
            forest_class = RandomForestRegressor
            extra_options = {}
        else:
            print("Building Qualitative RF_model")
            forest_class = RandomForestClassifier
            extra_options = {'class_weight': self.class_weight}

        self.clf = forest_class(n_estimators=int(self.estimators),
                                warm_start=False,
                                max_features=self.features,
                                oob_score=True,
                                random_state=seed,
                                **extra_options)

        self.clf.fit(self.X, self.Y)

        print('Building Learning Curves')
        if self.learning_curve:
            # Ten shuffled 80/20 splits with a fixed seed
            splitter = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
            curves = plot_learning_curve(self.clf,
                                         "Learning Curves (RF)",
                                         self.X,
                                         self.Y, (0.0, 1.01),
                                         cv=splitter)
            # One copy in the model directory, one in the working directory
            curves.savefig(self.vpath + "/RF-learning_curves.png", format='png')
            curves.savefig("./RF-learning_curves.png", format='png')

        # Regenerate the X and Y, since they might have been centered/scaled
        self.X = X.copy()
        self.Y = Y.copy()

    def validate(self):
        """ Validates the models and completes suitable scoring values

        """

        ##        valRF = open("valRF.txt", "w")
        ##        valRF.write("Experimental\tRecalculated\tPredicted\n")
        if self.X == None or self.clf == None:
            return

        X = self.X.copy()
        Y = self.Y.copy()
        if self.autoscale:
            X = X - self.mux
            X = X * self.wgx

        Yp = self.clf.predict(X)
        Ym = np.mean(Y)

        ######################################################################3
        ### quantitative

        if self.quantitative:
            # OOB_errors = []
            # Recalculated predictions
            SSY0 = np.sum(np.square(Ym - Y))
            SSY = np.sum(np.square(Yp - Y))

            NMSErec = np.mean(mean_squared_error(Y, Yp))  # Mean Squared Error
            self.scoringR = NMSErec
            self.SDEC = np.sqrt(SSY / self.nobj)
            self.R2 = 1.00 - (SSY / SSY0)
            self.OOBe = 1.00 - self.clf.oob_score_

            print("Recalculated results")
            print('rec R2:%5.3f SDEC:%5.3f OOB_error:%5.3f neg_mean_squared_error:%5.3f' % \
                  (self.R2,self.SDEC,self.OOBe, self.scoringR))

            scoring = 'neg_mean_squared_error'

            y_pred = cross_val_predict(self.clf, X, Y, cv=self.cv)
            NMSEcv = np.mean(
                cross_val_score(self.clf, X, Y, cv=self.cv,
                                scoring=scoring))  # Mean Squared Error

            SSY0_out = np.sum(np.square(Ym - Y))
            SSY_out = np.sum(np.square(Y - y_pred))
            self.scoringP = NMSEcv
            self.SDEP = np.sqrt(SSY_out / (self.nobj))
            self.Q2 = 1.00 - (SSY_out / SSY0_out)
            # OOBe_loo  = 1.00 - np.mean(OOB_errors)

            print(str(self.cv) + " cross-validation results")
            print('pred R2:%5.3f Q2:%5.3f SDEP:%5.3f neg_mean_squared_error:%5.3f' % \
                  (self.R2,self.Q2,self.SDEP, self.scoringP))

            # Automated cross-validation loo scikitlearn

            clf = RandomForestRegressor(n_estimators=int(self.estimators),
                                        warm_start=False,
                                        max_features=self.features,
                                        oob_score=True,
                                        random_state=1226)

            # GRAPHS

            pngfiles = glob.glob(self.vpath + '/*.png')
            for i in pngfiles:
                ##                print i
                os.remove(i)
            try:
                fig1 = plt.figure()
                plt.xlabel('experimental y')
                plt.ylabel('recalculated\n', fontsize=14)
                plt.title('R2: %4.2f  /  SDEC: %4.2f \n' %
                          (self.R2, self.SDEC),
                          fontsize=14)
                plt.plot(Y, Yp, "ro")
                fig1.savefig(self.vpath + "/RF-recalculated.png", format='png')
                fig1.savefig("./RF-recalculated.png", format='png')
            except:
                print(
                    "Error creating Recalculated vs Experimental RF model graph"
                )

            try:
                fig1 = plt.figure()
                plt.xlabel('experimental y')
                plt.ylabel('predicted\n', fontsize=14)
                plt.title('Q2: %4.2f  /  SDEP: %4.2f \n' %
                          (self.Q2, self.SDEP),
                          fontsize=14)
                plt.plot(Y, y_pred, "ro")
                fig1.savefig(self.vpath + "/RF-predicted.png", format='png')
                fig1.savefig("./RF-predicted.png", format='png')
            except:
                print(
                    "Error creating Predicted vs Experimental RF model graph")

        # File with experimental, recalculated and cv predictions values.
##            for i in range(len(Y)):
##               valRF.write(str(Y[i]) + "\t" + str(Yp[i]) + "\t" + str(y_pred[i]) + "\n")

######################################################################3
### qualitative
        else:

            # I think this is not needed.... by the characteristics of RF it allways shows perfect performance
            if len(Yp) != len(Y):
                return

            TP = TN = FP = FN = 0

            for i in range(len(Y)):

                if Y[i] == 1.0:
                    if Yp[i] == 1.0:
                        TP += 1
                    else:
                        FN += 1
                else:
                    if Yp[i] == 1.0:
                        FP += 1
                    else:
                        TN += 1

            if TP + TN + FP + FN == 0:
                #print 'no objects'
                return

            self.TP = TP
            self.TN = TN
            self.FP = FP
            self.FN = FN

            sens = sensitivity(TP, FN)
            spec = specificity(TN, FP)
            mcc = MCC(TP, TN, FP, FN)
            f1 = f1_score(Y, Yp, pos_label=1, average='binary')

            self.OOBe = 1.00 - self.clf.oob_score_

            print("Recalculated results")
            print("rec  TP:%d TN:%d FP:%d FN:%d spec:%5.3f sens:%5.3f MCC:%5.3f OOB_error:%5.3f f1_score:%5.3f" % \
                  (TP, TN, FP, FN, spec, sens, mcc, self.OOBe, f1 ))

            # Leave-one-out Cross validation
            print('Cross validating RF....')
            scoring = 'f1'

            y_pred = cross_val_predict(self.clf, X, Y, cv=self.cv)

            #Y_score = np.mean(cross_val_score(self.clf, X, Y, cv=self.cv, scoring=scoring))

            TPo = TNo = FPo = FNo = 0

            for i in range(len(Y)):

                if Y[i] == 1.0:
                    if y_pred[i] == 1.0:
                        TPo += 1
                    else:
                        FNo += 1
                else:
                    if y_pred[i] == 1.0:
                        FPo += 1
                    else:
                        TNo += 1

            if TPo + TNo + FPo + FNo == 0:
                return

            self.TPpred = TPo
            self.TNpred = TNo
            self.FPpred = FPo
            self.FNpred = FNo

            sens_cv = sensitivity(TPo, FNo)
            spec_cv = specificity(TNo, FPo)
            mcc_cv = MCC(TPo, TNo, FPo, FNo)
            f1_cv = f1_score(Y, y_pred, pos_label=1, average='binary')

            print(str(self.cv) + " cross-validation results")
            print("pred  TP:%d TN:%d FP:%d FN:%d spec:%5.3f sens:%5.3f MCC:%5.3f f1_score:%5.3f" % \
                  (TPo, TNo, FPo, FNo, spec_cv, sens_cv, mcc_cv, f1_cv ))

            # Create Graphs

            pngfiles = glob.glob(self.vpath + '/*.png')
            for i in pngfiles:
                os.remove(i)

            # Predicted confusion matrix graph
            try:
                FourfoldDisplay(TPo, TNo, FPo, FNo, 'RFC Predicted',
                                'RF_predicted_confusion_matrix.png',
                                self.vpath)
            except:
                print("Failed to generate RF predicted validation graph")

            # Recalculated confusion matrix graph
            try:
                FourfoldDisplay(TP, TN, FP, FN, 'RFC Recalculated',
                                'RF_recalculated_confusion_matrix.png',
                                self.vpath)
            except:
                print("Failed to generate RF recalculated validation graph")

        return (Yp)

    def project(self, Xb):
        """ Uses the X matrix provided as argument to predict Y
        """

        if self.clf == None:
            print('failed to load clasifier')
            return

        if self.autoscale:
            Xb = Xb - self.mux
            Xb = Xb * self.wgx

        Xb = Xb.reshape(1,
                        -1)  # required by sklean, to avoid deprecation warning
        Yp = self.clf.predict(Xb)

        return (Yp)

    def optimize(self, X, Y):
        """ Optimizes the number of trees (estimators) and max features used (features)
            and returns the best values, according to the OOB criteria

            The results are shown in a diagnostic plot, which is also saved to
            the model directory and to the working directory

            To avoid including many trees to produce tiny improvements, increments of OOB error
            below 0.01 are considered irrelevant

            Returns a tuple (optEstimators, optFeatures); optFeatures is one
            of 'sqrt', 'log2' or 'none'
        """

        RANDOM_STATE = 1226
        errors = {}
        features = ['sqrt', 'log2', 'none']

        # Same forest settings for every candidate; only the classifier
        # accepts class weights.
        if self.quantitative:
            forest_class = RandomForestRegressor
            extra_options = {}
        else:
            forest_class = RandomForestClassifier
            extra_options = {'class_weight': self.class_weight}

        tclf = {}
        for fi in features:
            tclf[fi] = forest_class(
                warm_start=False,
                oob_score=True,
                # the label 'none' stands for "use all features"
                max_features=None if fi == 'none' else fi,
                random_state=RANDOM_STATE,
                **extra_options)

        # Range of `n_estimators` values to explore.
        min_estimators = 15
        max_estimators = 700
        stp_estimators = 100

        num_steps = int((max_estimators - min_estimators) / stp_estimators)

        print('optimizing RF....')
        updateProgress(0.0)

        optValue = 1.0e10
        for j, fi in enumerate(features):
            errors[fi] = []
            clf = tclf[fi]
            tree_counts = range(min_estimators, max_estimators + 1,
                                stp_estimators)
            for count, i in enumerate(tree_counts):
                clf.set_params(n_estimators=i)
                clf.fit(X, Y)
                oob_error = 1 - clf.oob_score_
                errors[fi].append((i, oob_error))
                # Accept a new optimum only when it improves by more than 0.01
                if oob_error < optValue:
                    if np.abs(oob_error - optValue) > 0.01:
                        optValue = oob_error
                        optEstimators = i
                        optFeatures = fi

                updateProgress(
                    float(count + (j * num_steps)) /
                    float(len(features) * num_steps))

        # One OOB-error curve per max_features setting
        for fi in features:
            xs, ys = list(zip(*errors[fi]))
            plt.plot(xs, ys, label=fi)

        plt.xlim(min_estimators, max_estimators)
        plt.xlabel("n_estimators (Trees)")
        plt.ylabel("OOB error rate")
        plt.legend(loc="upper right")

        # Save before showing: once the window opened by show() is closed the
        # figure may be gone, and saving afterwards can write an empty image.
        plt.savefig(self.vpath + "/rf-OOB-parameter-tuning.png")
        plt.savefig("./rf-OOB-parameter-tuning.png")
        plt.show()

        print('optimum features:', optFeatures, 'optimum estimators:',
              optEstimators, 'best OOB:', optValue)

        return (optEstimators, optFeatures)
Exemple #48
0
# Naive Bayes is a prediction model based on applying Bayes’ theorem with the “naive” assumption of conditional independence between every pair of features given the value of the class variable.
#
# Build a Naive Bayes model to predict whether a movie review is positive or negative. Test the model accuracy on the test data.

from sklearn.naive_bayes import MultinomialNB as MNB

# Fit a multinomial Naive Bayes classifier on the training features, then
# label the transformed test documents with it.
bobbyBayes = MNB()
bobbyBayes.fit(X, y)
b_predict = bobbyBayes.predict(test_tran)

# Print the accuracy of those predictions against the test labels.
nb_accuracy = accuracy_score(y_test, b_predict)
print(nb_accuracy)

# Task 4
# ---
# A random forest is an ensemble (collective) model that fits a number of decision tree classifiers on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting.
#
# Build a random forest model to predict whether a movie review is positive or negative. Test the model accuracy on the test data. Try different values (20, 100, 500) for the hyper parameter 'n_estimators', i.e., the number of decision trees in the ensemble, and print out the model accuracy for each of the parameter value.

from sklearn.ensemble import RandomForestClassifier

# Train one forest per ensemble size and print its accuracy on the test set.
for num in (20, 100, 500, 1000):
    rfc = RandomForestClassifier(n_estimators=num).fit(X, y)
    rf_pred = rfc.predict(test_tran)
    print(accuracy_score(y_test, rf_pred))

# ### From the above tasks, you can observe that different models and different choices of hyper-parameter values can lead to quite different prediction performance. Which model (and hyper-parameter) among the above gives the best prediction? Which is the worst?
# It would appear that the standard SVC using the linear kernel with any C>10 yields the most accurate model for the feature extraction method we used (TfidfVectorizer). Interestingly, Naive Bayes got very close without any testing or tuning and was near-instantaneous to compute on a dataset this small. I tried RandomForestClassifier with n_estimators=100000 and there was no change at all from 1000 trees.
).rolling(5).std()
wipro_data = wipro_data.dropna()

# Label every row by the direction of the next row's close: 1 when the
# following 'Close Price' is strictly higher than the current one, -1
# otherwise. The row count is taken from the data instead of being hard-coded.
arr = wipro_data['Close Price'].tolist()
val = [1 if nxt > cur else -1 for cur, nxt in zip(arr, arr[1:])]

# The final row has no following close, so it is padded with NaN and removed
# by the dropna() below. Assigning a plain list is positional, which keeps
# each label on the row it was computed from whatever the frame's index is.
wipro_data['Action'] = val + [float('nan')] * (len(arr) - len(val))
wipro_data = wipro_data.dropna()

# Columns used as classifier inputs; the target is the 'Action' column.
feature_columns = ['%chg op_cl', '%chg lw_hg', '%chg 5dymean', '%chg 5dystd']
train_X = wipro_data[feature_columns]
train_Y = wipro_data[['Action']]

# Fit a depth-2 random forest and score it on the same rows it was fitted on.
RF = RandomForestClassifier(max_depth=2, n_estimators=100, random_state=0)
RF.fit(train_X, train_Y)
RF.predict(train_X)
print("Random Forests")
round(RF.score(train_X, train_Y), 4)

# Running sum of (open - close) / open, expressed in percent, then plotted.
open_price = wipro_data['Open Price']
close_price = wipro_data['Close Price']
wipro_data['Net Cummulative Returns'] = (
    (open_price - close_price) / open_price * 100).cumsum()
plt.figure(figsize=(20, 10))
plt.plot(wipro_data['Net Cummulative Returns'])
# Logistic regression with class_weight='balanced', fitted on the *_b split.
clf_b = LogisticRegression(class_weight='balanced')
clf_b.fit(X_train_b, y_train_b)
# Per-class probabilities; column 1 feeds the AUC computation below.
y_pred_b = clf_b.predict_proba(X_test_b)
# Hard class predictions, computed once and shared by the four metrics.
y_label_b = clf_b.predict(X_test_b)

print('Accuracy on test set: {0:.4f}'.format(accuracy_score(y_test_b, y_label_b)))
print('Percision score on test set: {0:.4f}'.format(precision_score(y_test_b, y_label_b)))
print('Recall score on test set: {0:.4f}'.format(recall_score(y_test_b, y_label_b)))
print('F1 score on test set: {0:.4f}'.format(f1_score(y_test_b, y_label_b)))
print("AUC score: ", roc_auc_score(y_test_b, y_pred_b[:,1]))

# Random forest with 100 trees and at least 100 samples per leaf.
rf_clf = RandomForestClassifier(n_estimators=100, n_jobs=100, random_state=0, min_samples_leaf=100)

# Fit on the *_im training split.
rf_clf.fit(X_train_im, y_train_im)

# Mean accuracy on the training data.
score_rf = rf_clf.score(X_train_im, y_train_im)
print("Training score: %.2f " % score_rf)

# Mean accuracy on the test data.
score_rf = rf_clf.score(X_test_im, y_test_im)
print("Testing score: %.2f " % score_rf)

# Predict once and reuse the result for every metric below instead of
# re-running the forest for each print.
y_pred_rf = rf_clf.predict(X_test_im)
print('Accuracy on test set: {0:.4f}'.format(accuracy_score(y_test_im, y_pred_rf)))
print('Percision score on test set: {0:.4f}'.format(precision_score(y_test_im, y_pred_rf)))
print('Recall score on test set: {0:.4f}'.format(recall_score(y_test_im, y_pred_rf)))
print('F1 score on test set: {0:.4f}'.format(f1_score(y_test_im, y_pred_rf)))
Exemple #51
0
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 17 13:32:13 2020

@author: Damara
"""

from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
# Generate a seeded, unshuffled synthetic problem: 1000 samples, 4 features,
# of which 2 are informative and none redundant.
X, y = make_classification(
    n_samples=1000,
    n_features=4,
    n_informative=2,
    n_redundant=0,
    random_state=0,
    shuffle=False,
)

# Fit a depth-2 forest, then show its feature importances and its prediction
# for the all-zero sample.
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X, y)
print(clf.feature_importances_)
zero_sample = [[0, 0, 0, 0]]
print(clf.predict(zero_sample))
Exemple #52
0
def training():
    """Train random-forest classifiers on the training CSV and persist one.

    Reads ``datasets/Training.csv``, treating the last column as the label
    and every other column as a feature. A baseline forest is fitted on all
    features and its hold-out accuracy is printed; a second forest is then
    fitted on a fixed list of 50 feature columns and its hold-out accuracy
    is printed as well.

    Side effects:
        * writes ``datasets/dummyRowDisease.csv``: one all-zero row whose
          header is the reduced feature list;
        * pickles the reduced-feature model to
          ``datasets/pickle_model_disease.pkl``.

    Neither split nor forest is seeded, so the printed accuracies vary
    between runs. Returns nothing.
    """
    from sklearn import metrics
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import train_test_split

    data = pd.read_csv("datasets/Training.csv")
    X, y = data.iloc[:, :-1], data.iloc[:, -1]

    # Baseline: all features, 70% training / 30% test.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    clf = RandomForestClassifier(n_estimators=100)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

    # Fixed feature subset for the persisted model.
    # NOTE(review): presumably the 50 most important features from an earlier
    # run of the baseline forest -- confirm before editing this list.
    reduced_features = [
        'receiving_blood_transfusion',
        'red_sore_around_nose',
        'abnormal_menstruation',
        'continuous_sneezing',
        'breathlessness',
        'blackheads',
        'shivering',
        'dizziness',
        'back_pain',
        'unsteadiness',
        'yellow_crust_ooze',
        'muscle_weakness',
        'loss_of_balance',
        'chills',
        'ulcers_on_tongue',
        'stomach_bleeding',
        'lack_of_concentration',
        'coma',
        'neck_pain',
        'weakness_of_one_body_side',
        'diarrhoea',
        'receiving_unsterile_injections',
        'headache',
        'family_history',
        'fast_heart_rate',
        'pain_behind_the_eyes',
        'sweating',
        'mucoid_sputum',
        'spotting_urination',
        'sunken_eyes',
        'dischromic_patches',
        'nausea',
        'dehydration',
        'loss_of_appetite',
        'abdominal_pain',
        'stomach_pain',
        'yellowish_skin',
        'altered_sensorium',
        'chest_pain',
        'muscle_wasting',
        'vomiting',
        'mild_fever',
        'high_fever',
        'red_spots_over_body',
        'dark_urine',
        'itching',
        'yellowing_of_eyes',
        'fatigue',
        'joint_pain',
        'muscle_pain',
    ]
    X_reduced = data[reduced_features]

    # Reduced model: same 70/30 split proportions, fresh random split.
    X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.3)
    clf2 = RandomForestClassifier(n_estimators=100)
    clf2.fit(X_train, y_train)
    y_pred = clf2.predict(X_test)
    print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

    # Single all-zero row carrying the reduced feature columns as its header.
    dummyRow = pd.DataFrame(np.zeros((1, len(X_reduced.columns))), columns=X_reduced.columns)
    dummyRow.to_csv('datasets/dummyRowDisease.csv', index=False)

    # Persist the reduced-feature model.
    pkl_filename = "datasets/pickle_model_disease.pkl"
    with open(pkl_filename, 'wb') as model_file:
        pickle.dump(clf2, model_file)
# Assemble the undersampled training set: rows indexed by `ess_sequences`
# first, followed by rows indexed by `x`.
x_train_undersampled = []
y_train_undersampled = []
for selection in (ess_sequences, x):
    for i in selection:
        x_train_undersampled.append(X_train[i])
        y_train_undersampled.append(y_train[i])
print(len(x_train_undersampled))
print(len(y_train_undersampled))

modxtr = np.array(x_train_undersampled)
modytr = np.array(y_train_undersampled)

# Fit the RFC estimator on the undersampled data and predict on X1.
# NOTE(review): the "svm" in the names below does not match the estimator,
# which is whatever `RFC` refers to (presumably a random forest) -- confirm.
clf = RFC(n_estimators=100)
svm_best_clf = clf.fit(modxtr, modytr)
test_predictions_svm = svm_best_clf.predict(X1)
test_predictions_svm_proba = svm_best_clf.predict_proba(X1)
accuracy = accuracy_score(y1, test_predictions_svm)

# Flatten the confusion matrix into its four cells and print them.
cm = confusion_matrix(y1, test_predictions_svm)
true_n, false_p, false_n, true_p = cm.ravel()
print(true_n, false_p, false_n, true_p)

prec = precision_score(y1, test_predictions_svm)
f1 = f1_score(y1, test_predictions_svm)
sensitivity, specificity = compute_measures(true_p, false_p, false_n, true_n)
print(sensitivity, specificity)

# Mean of sensitivity and specificity, then ROC AUC from column 1 of the
# predicted probabilities.
average = (sensitivity + specificity) / 2
positive_scores = test_predictions_svm_proba[:, 1]
fpr, tpr, thresholds = roc_curve(y1, positive_scores)
roc_auc1 = auc(fpr, tpr)
Exemple #54
0
# In[294]:

# Plot the score for each K from 1 to 14 and annotate every point with its
# (K, score) pair.
neighbor_counts = list(range(1, 15))
plt.plot(neighbor_counts, knn_scores, color='green')
for k in neighbor_counts:
    score_at_k = knn_scores[k - 1]
    plt.text(k, score_at_k, (k, score_at_k))
plt.xticks(neighbor_counts)
plt.xlabel('Number of Neighbors (K)')
plt.ylabel('Scores')

# In[299]:

# 10-fold cross-validation of a 12-neighbour KNN classifier.
knn_classifier = KNeighborsClassifier(n_neighbors=12)
score = cross_val_score(knn_classifier, x, y, cv=10)

# In[300]:

score.mean()

# In[239]:

from sklearn.ensemble import RandomForestClassifier

# Random forest with default settings, fitted on the full data.
clf = RandomForestClassifier()
model = clf.fit(X, y)

# In[284]:

# 10-fold cross-validation of a 50-tree random forest.
randomforest_classifier = RandomForestClassifier(n_estimators=50)
score = cross_val_score(randomforest_classifier, X, y, cv=10)
score.mean()
Exemple #55
0
# Evaluate the existing `tree` model on the test split.
tree_predicted = tree.predict(x_test)
tree_acc = accuracy_score(y_test, tree_predicted)
print('The test accuracy is: ', tree_acc, '.', sep='')
print(confusion_matrix(y_test, tree_predicted))

#################
# RANDOM FOREST
#################
print('\nRANDOM FOREST')
from sklearn.ensemble import RandomForestClassifier

# Fit a 100-tree random forest on the training split.
rf = RandomForestClassifier(n_estimators=100)
rf.fit(x_train, y_train)

# Evaluate it on the test split.
rf_predicted = rf.predict(x_test)
rf_acc = accuracy_score(y_test, rf_predicted)
print('The test accuracy is: ', rf_acc, '.', sep='')
print(confusion_matrix(y_test, rf_predicted))

#################
#Building KNN
#################
print('\nK NEAREST NEIGHBORS')
from sklearn.neighbors import KNeighborsClassifier

#Train Classifier
knn = KNeighborsClassifier()
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")
            
report(grid_search.cv_results_)

# look at best classifier 

# Fixed hyperparameters for the forest.
# NOTE(review): presumably the best combination reported by the grid search
# -- confirm against its output.
rf_params = dict(
    n_estimators=50,
    max_depth=None,
    max_features=10,
    min_samples_leaf=3,
    min_samples_split=2,
    bootstrap=True,
    criterion='entropy',
    random_state=0,
)
clf = RandomForestClassifier(**rf_params)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Accuracy on the test split.
test_accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

# Classification report for the same predictions.
print(metrics.classification_report(y_test, y_pred))

# Heatmap of the transposed confusion matrix, so true labels run along the
# x axis and predicted labels along the y axis.
mat = confusion_matrix(y_test, y_pred)
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False)
plt.xlabel('true label')
plt.ylabel('predicted label');

# feature importance horizontal bar plot
Exemple #57
0
})
tree.export_graphviz(model, out_file='decision_tree.dot')
print(f'Training Time: {t} seconds.')
print(f'Accuracy on test set: {accuracy}')
print(f'R squared on test set: {r2_accuracy}')
print(f'Confusion Matrix: {matrix[0]}, {matrix[1]}')
print('-' * 30)


# ---------- TRAIN RANDOM FOREST----------
print('Random Forest')

model = RandomForestClassifier(n_jobs=-1, n_estimators=500, max_features=0.2, min_samples_leaf=1)

# Wall-clock fitting time in seconds, rounded to three decimals.
t = time()
model.fit(x_train, y_train)
t = round(time() - t, 3)

# Predict once and reuse the result for both metrics below.
rf_predictions = model.predict(x_test)
accuracy = model.score(x_test, y_test)
# r2_score takes (y_true, y_pred) and is not symmetric in its arguments, so
# the ground truth must come first.
r2_accuracy = r2_score(y_test, rf_predictions)
matrix = confusion_matrix(y_test, rf_predictions)

# Persist the fitted model; the context manager closes the file handle.
filename = 'random_forest.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

data['models'].append({
    'name': 'random_forest',
    'train_time': t,
    'accuracy': accuracy,
    'r2_accuracy': r2_accuracy,
    'confusion_matrix': matrix.tolist(),
})
print(f'Training Time: {t} seconds.')
print(f'Accuracy on test set: {accuracy}')
Exemple #58
0
    y_test, y_pred_decision_tree)
print("Confusion Matrix (Decision Tree):\n", cm_decision_tree)

# Printing the Accuracy, Precision and Recall
print("Accuracy of Decision Tree:", decision_tree_accuracy)
print("Precision of Decision Tree:", decision_tree_precision)
print("Recall of Decision Tree:", decision_tree_recall)
print("")

########################RANDOM FOREST CLASSIFIER###############################
# Train a 10-tree, entropy-criterion random forest on the training set.
from sklearn.ensemble import RandomForestClassifier
random_forest_classifier = RandomForestClassifier(
    n_estimators=10, criterion='entropy', random_state=0)
random_forest_classifier.fit(x_train, y_train)

# Predict the validation set.
print("Validation Set Results:")
y_pred_random_forest = random_forest_classifier.predict(x_validation)

# `confusion_matrix` is unpacked into four values here, so it is not
# sklearn's function of that name (presumably a project helper -- confirm).
(cm_random_forest,
 random_forest_accuracy,
 random_forest_precision,
 random_forest_recall) = confusion_matrix(y_validation, y_pred_random_forest)
print("Confusion Matrix (Random Forest Classifier):\n", cm_random_forest)

# Report accuracy, precision and recall.
print("Accuracy of Random Forest Classifier:", random_forest_accuracy)
print("Precision of Random Forest Classifier:", random_forest_precision)
print("Recall of Random Forest Classifier:", random_forest_recall)
print("")
Exemple #59
0
                                      criterion='gini',
                                      max_depth=10,
                                      max_features='auto',
                                      max_leaf_nodes=None,
                                      min_impurity_decrease=0.0,
                                      min_impurity_split=None,
                                      min_samples_leaf=4,
                                      min_samples_split=5,
                                      min_weight_fraction_leaf=0.0,
                                      n_estimators=120,
                                      n_jobs=None,
                                      oob_score=False,
                                      random_state=0,
                                      verbose=0,
                                      warm_start=False)
# Fit the forest, predict the test split and print accuracy rounded to three
# decimals. accuracy_score takes (y_true, y_pred), so the labels go first.
randomforest.fit(x_train, y_train)
y_pred = randomforest.predict(x_test)
random_accy = round(accuracy_score(y_test, y_pred), 3)
print(random_accy)

# In[39]:
# Random forest: grid search
# n_estimators = [50,75,100,120]
# random_state = [0,15]
# min_samples_split = [5,6,10,15,20,25]
# max_depth = [5,10,15,20,25,30]
# min_samples_leaf=[2,3,4,5,6]
# parameters = {'n_estimators':n_estimators,
# 'random_state':random_state,
# 'min_samples_split':min_samples_split,
# 'max_depth':max_depth,
Exemple #60
0
X.info()
# Fill missing ages with the mean age. The result is assigned back to the
# column because fillna(inplace=True) on a column selection may act on a
# temporary copy and leave X unchanged.
X['age'] = X['age'].fillna(X['age'].mean())
# %%
# 75/25 train/test split with a fixed seed.
X_tr, X_tst, y_tr, y_tst = train_test_split(X,
                                            y,
                                            test_size=0.25,
                                            random_state=33)
# Vectorize each row as a dict. pandas accepts only the full orient name
# 'records'; the abbreviation 'record' is rejected by pandas >= 2.0.
vec = DictVectorizer(sparse=False)
X_tr = vec.fit_transform(X_tr.to_dict(orient='records'))
print(vec.feature_names_)
X_tst = vec.transform(X_tst.to_dict(orient='records'))
# %%
# Three tree-based classifiers, all with default hyperparameters.
dtc = DecisionTreeClassifier()
rfc = RandomForestClassifier()
gbc = GradientBoostingClassifier()

# Fit each model on the training split, in the order dtc, rfc, gbc.
for model in (dtc, rfc, gbc):
    model.fit(X_tr, y_tr)

# Predict the test split with each fitted model, in the same order.
dtc_prd, rfc_prd, gbc_prd = (
    fitted.predict(X_tst) for fitted in (dtc, rfc, gbc))

# Print the test accuracy and a classification report for every model.
class_names = ['died', 'survived']
reports = (
    ("Accuracy of dtc is: ", dtc, dtc_prd),
    ("Accuracy of rfc is: ", rfc, rfc_prd),
    ("Accuracy of gbc is: ", gbc, gbc_prd),
)
for heading, fitted_model, predicted in reports:
    print(heading, fitted_model.score(X_tst, y_tst))
    print(classification_report(y_tst, predicted, target_names=class_names))