def eval_subset(train, test):
    """Score a feature subset with supervised and unsupervised probes.

    Args:
        train: indexable whose element 0 is used as the feature matrix,
            element 1 as the regression target and element 2 as the class
            labels.
        test: same layout as ``train``, used for evaluation.

    Returns:
        Tuple ``(MSELR, MSE, acc, DTacc, cnmi, cacc)``: linear-regression
        MSE, MSE of ``decoder``'s output, 1-NN accuracy, extra-trees
        accuracy, and the NMI / accuracy reported by
        ``unsupervised_evaluation.evaluation`` averaged over 10 runs.
    """
    n_clusters = len(np.unique(train[2]))

    clf = ExtraTreesClassifier(n_estimators=50, n_jobs=-1)
    clf.fit(train[0], train[2])
    DTacc = float(clf.score(test[0], test[2]))

    clf = KNeighborsClassifier(n_neighbors=1, algorithm='brute', n_jobs=1)
    clf.fit(train[0], train[2])
    acc = float(clf.score(test[0], test[2]))

    LR = LinearRegression(n_jobs=-1)
    LR.fit(train[0], train[1])
    MSELR = float(((LR.predict(test[0]) - test[1])**2).mean())

    MSE = float((((decoder((train[0], train[1]),
                           (test[0], test[1])) - test[1])**2).mean()))

    max_iters = 10
    cnmi, cacc = 0.0, 0.0
    for _ in range(max_iters):
        # Use dedicated names here: rebinding ``acc`` would overwrite the
        # 1-NN accuracy that is printed and returned below.
        run_nmi, run_acc = unsupervised_evaluation.evaluation(
            train[0], n_clusters=n_clusters, y=train[2])
        cnmi += run_nmi / max_iters
        cacc += run_acc / max_iters
    print('nmi = {:.3f}, acc = {:.3f}'.format(cnmi, cacc))
    print('acc = {:.3f}, DTacc = {:.3f}, MSELR = {:.3f}, MSE = {:.3f}'.format(
        acc, DTacc, MSELR, MSE))
    return MSELR, MSE, acc, DTacc, float(cnmi), float(cacc)
Beispiel #2
0
def predict_et():
    """Fit an extra-trees model on the CSV data and export class probabilities.

    Reads ``data/X_train.csv``, ``data/y_train.csv``, ``data/X_test.csv`` and
    ``data/y_test.csv``, fits on the training split, prints the train and
    test accuracy and the multiclass log loss, and writes the per-class
    probabilities to ``et_output.csv`` indexed by the test ``id`` column.
    """
    X = pd.read_csv('data/X_train.csv', header=0)
    y = pd.read_csv('data/y_train.csv', header=0)['fault_severity']

    testX = pd.read_csv('data/X_test.csv', header=0)
    testY = pd.read_csv('data/y_test.csv', header=0)['fault_severity']
    # NOTE(review): no column is dropped, so ``id`` is fed to the model as a
    # feature -- confirm this is intended.

    et = ExtraTreesClassifier(n_estimators=440, random_state=1)
    et.fit(X, y)
    print(et.score(X, y))
    print(et.score(testX, testY))

    # Predict once and reuse the result for both the frame and the loss.
    probabilities = et.predict_proba(testX)

    # One column per predicted class instead of a hard-coded count of 3.
    pred_cols = [
        'predict_{}'.format(i) for i in range(probabilities.shape[1])
    ]
    submission = pd.DataFrame(probabilities,
                              index=testX.id,
                              columns=pred_cols)
    print(multiclass_log_loss(testY.values, submission.values))

    submission.to_csv('et_output.csv', index_label='id')
def plot_confusion_matrix(model, relevant_features_new, y_new,
                          threshold_classification):
    """Plot confusion matrices for an extra-trees model on one CV fold.

    The first fold of a 3-fold stratified split is used as the train/test
    partition. Two confusion matrices and accuracies are produced: one
    restricted to the test rows selected by
    ``get_indexes_with_valid_predictions`` for ``threshold_classification``,
    and one for the whole test fold.

    Args:
        model: forwarded to ``Base_Classification``.
        relevant_features_new: feature frame, indexed with ``.iloc[rows, :]``.
        y_new: label frame, indexed with ``.iloc[rows, :]``.
        threshold_classification: forwarded to
            ``get_indexes_with_valid_predictions``.
    """
    extra_trees = ExtraTreesClassifier(n_estimators=1000, random_state=0)
    base_classification = Base_Classification(model, extra_trees)

    # ``random_state`` has no effect without shuffling and recent
    # scikit-learn rejects the combination, so it is not passed.
    splitter = StratifiedKFold(n_splits=3, shuffle=False)
    train_index, test_index = next(
        iter(splitter.split(relevant_features_new, y_new)))
    x_train = relevant_features_new.iloc[train_index, :]
    x_test = relevant_features_new.iloc[test_index, :]
    y_train = y_new.iloc[train_index, :]
    y_test = y_new.iloc[test_index, :]

    extra_trees.fit(x_train, y_train)
    pred = pd.DataFrame(extra_trees.predict_proba(x_test),
                        columns=extra_trees.classes_)
    valid_indexes = base_classification.get_accuracy.get_indexes_with_valid_predictions(
        pred, threshold_classification)

    x_test_valid = x_test.iloc[valid_indexes, :]
    y_test_valid = y_test.iloc[valid_indexes, :]

    # Report first on the filtered rows, then on the whole fold.
    base_classification.get_accuracy.plot_confusion_matrix(
        x_test_valid, y_test_valid, extra_trees)
    print("Accuracy => {}".format(extra_trees.score(x_test_valid,
                                                    y_test_valid)))
    base_classification.get_accuracy.plot_confusion_matrix(
        x_test, y_test, extra_trees)
    print("Accuracy => {}".format(extra_trees.score(x_test, y_test)))
Beispiel #4
0
def model_training(X, y):
    """Pick the best ``n_estimators`` for an extra-trees model and save it.

    The data is split 75/25 with a fixed seed; one model is trained for each
    ``n_estimators`` in ``60, 80, ..., 180`` and the one with the highest
    test accuracy (first one on ties) is dumped to ``model_extratrees.pkl``.

    Args:
        X: feature matrix.
        y: labels.
    """
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.25,
                                                        random_state=5)

    best_model, best_acc = None, None
    for n_trees in np.arange(60, 200, 20):
        print('training extra tree classifier, n estimators = {}'.format(
            n_trees))
        etf = ExtraTreesClassifier(n_estimators=n_trees,
                                   max_depth=None,
                                   min_samples_split=2,
                                   random_state=5).fit(X_train, y_train)
        # Score once and reuse the value for both the log and the ranking.
        acc = etf.score(X_test, y_test)
        print('accuracy {}'.format(round(acc, 3)))
        if best_acc is None or acc > best_acc:
            best_model, best_acc = etf, acc

    print('top accuracy {}'.format(round(best_acc, 3)))

    # The winner is kept from the loop; refitting it with the same seed
    # would only reproduce the same model.
    joblib.dump(best_model, 'model_extratrees.pkl')
    print('model saved')
Beispiel #5
0
def dimensionReduction(data, target, fea_alg='et'):
    """Evaluate a feature-selection strategy on the last of 5 stratified folds.

    Args:
        data: 2-D array of samples x features (indexed by row and by column).
        target: label array aligned with the rows of ``data``.
        fea_alg: ``'et'`` (extra-trees importances), ``'lsvc'`` (L1 linear
            SVC coefficients) or ``'nb'`` (Gaussian naive Bayes variances).

    Returns:
        For ``'nb'``, ``data`` restricted to the selected feature columns.
        For ``'et'`` and ``'lsvc'`` only diagnostics are printed and ``data``
        is returned unchanged, as it is for any other ``fea_alg``.
    """
    nFold = 5
    skf = StratifiedKFold(n_splits=nFold)
    # Only the last fold is used as the train/test partition.
    train_index, test_index = list(skf.split(data, target))[-1]
    trainingSet = data[train_index]
    trainingLabels = target[train_index]
    testSet = data[test_index]
    testLabels = target[test_index]

    if fea_alg in ('et', 'lsvc'):
        if fea_alg == 'et':
            clf = ExtraTreesClassifier(n_estimators=300,
                                       random_state=0,
                                       max_features="sqrt")
        else:
            # Fit on the training fold only; a preliminary fit on the full
            # data would be discarded by the fit below.
            clf = LinearSVC(C=0.01, penalty="l1", dual=False)
        clf.fit(trainingSet, trainingLabels)
        select = clf.feature_importances_ if fea_alg == 'et' else clf.coef_
        score0 = clf.score(testSet, testLabels)
        model = SelectFromModel(clf, prefit=True)
        train_new = model.transform(trainingSet)
        test_new = model.transform(testSet)
        score1 = clf.fit(train_new, trainingLabels).score(test_new, testLabels)
        # Single formatted argument prints the same on Python 2 and 3.
        print('%s %s %s %s' % (train_new.shape[1], score0, score1,
                               select[:5]))

    elif fea_alg == 'nb':
        clf = GaussianNB()
        clf.fit(trainingSet, trainingLabels)
        # Per-class feature variances; ``sigma_`` was renamed to ``var_``.
        feature_rank = clf.var_ if hasattr(clf, 'var_') else clf.sigma_
        ind = np.argsort(np.sum(feature_rank, axis=0))
        max_score = 0
        max_i = 0
        # Grow the feature prefix in steps of 10 and keep the best one.
        for i in range(0, len(ind) + 1, 10):
            cols = ind[:i + 1]
            score = clf.fit(trainingSet[:, cols],
                            trainingLabels).score(testSet[:, cols],
                                                  testLabels)
            if score > max_score:
                max_score = score
                max_i = i
        feature_ind = ind[:max_i + 1]
        # Features are columns; plain ``data[feature_ind]`` selects rows.
        data = data[:, feature_ind]

    return data
Beispiel #6
0
def train_l2_et(x_train, x_test, y_train, y_test):
    """Fit a 256-tree extra-trees model and return its training predictions.

    The accuracy is printed on ``(x_test, y_test)`` when ``y_test`` is given,
    otherwise on the training data.

    Returns:
        Column vector (shape ``(-1, 1)``) of predictions for ``x_train``.
    """
    model = ExtraTreesClassifier(n_estimators=256)
    model.fit(x_train, y_train)

    if y_test is None:
        accuracy = model.score(x_train, y_train)
    else:
        accuracy = model.score(x_test, y_test)
    print('ExtraTreesClassifier:', accuracy)

    train_pred = model.predict(x_train)
    return np.reshape(train_pred, (-1, 1))
Beispiel #7
0
def get_ERT(Xtrain, Xtest, Ytrain, Ytest, gtree):
    """Train a 1000-tree extra-trees model using a tuned tree's settings.

    ``max_features``, ``max_depth`` and ``min_samples_split`` are copied from
    ``gtree.best_estimator_``. Train and test accuracy are printed as
    percentages.

    Returns:
        The fitted ``ExtraTreesClassifier``.
    """
    tuned = gtree.best_estimator_
    ert = ExtraTreesClassifier(
        n_estimators=1000,
        max_features=tuned.max_features,
        max_depth=tuned.max_depth,
        min_samples_split=tuned.min_samples_split,
        n_jobs=-1,
    )
    ert.fit(Xtrain, Ytrain)

    train_score = ert.score(Xtrain, Ytrain)
    test_score = ert.score(Xtest, Ytest)
    print('Extremely Randomized Trees, train: {0:.02f}% '.format(train_score * 100))
    print('Extremely Randomized Trees, test: {0:.02f}% '.format(test_score * 100))

    return ert
Beispiel #8
0
def train_l1_et(x_train, x_test, y_train, y_test):
    """Fit a 256-tree extra-trees model and predict both splits.

    The accuracy is printed on ``(x_test, y_test)`` when ``y_test`` is given,
    otherwise on the training data.

    Returns:
        ``[train_predictions, test_predictions]``, each reshaped to a column
        vector.
    """
    model = ExtraTreesClassifier(n_estimators=256, n_jobs=-1)
    model.fit(x_train, y_train)

    if y_test is None:
        accuracy = model.score(x_train, y_train)
    else:
        accuracy = model.score(x_test, y_test)
    print('ExtraTreesClassifier:', accuracy)

    # Same order as before: predictions for x_train first, then x_test.
    train_pred = np.reshape(model.predict(x_train), (-1, 1))
    test_pred = np.reshape(model.predict(x_test), (-1, 1))
    return [train_pred, test_pred]
def EnsembleMethods(X, y):
    """Compare tree-based classifiers on PCA-projected data.

    The data is split with ``TRAIN_TEST_SPLIT_RATIO``, projected onto 120
    whitened randomized-PCA components fitted on the training split, and
    then a decision tree, a 10-tree random forest and a 10-tree extra-trees
    model are fitted in turn. Train and test scores are printed for each.

    Args:
        X: feature matrix.
        y: labels.
    """
    # divide our data set into a training set and a test set
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        X, y, test_size=TRAIN_TEST_SPLIT_RATIO)

    # get randomized PCA model
    num_components = 120
    print("Extracting the top %d eigenfaces from %d faces"
          % (num_components, X_train.shape[0]))
    pca = RandomizedPCA(n_components=num_components, whiten=True).fit(X_train)

    # use the PCA model on our training set and test set.
    print("Projecting the input data on the eigenfaces orthonormal basis")
    X_train_pca = pca.transform(X_train)
    X_test_pca = pca.transform(X_test)
    print("done ")

    # min_samples_split=2 is the smallest value scikit-learn accepts; a
    # one-sample node cannot be split, so trees grow exactly as far.
    classifiers = [
        ("====== Decision Tree Classifier ========",
         DecisionTreeClassifier(max_depth=None,
                                min_samples_split=2,
                                random_state=0)),
        ("====== Random Forest Classifier ========",
         RandomForestClassifier(n_estimators=10,
                                max_depth=None,
                                min_samples_split=2,
                                random_state=0)),
        ("====== Extra Trees Classifier ========",
         ExtraTreesClassifier(n_estimators=10,
                              max_depth=None,
                              min_samples_split=2,
                              random_state=0)),
    ]

    # fit each classifier and print its performance
    for banner, classifier in classifiers:
        classifier.fit(X_train_pca, y_train)
        print(banner)
        print('TRAIN SCORE', classifier.score(X_train_pca, y_train))
        print('TEST SCORE', classifier.score(X_test_pca, y_test))
Beispiel #10
0
def test3():
    """Benchmark classifiers on the full-harmonization chorale data.

    Loads the train/dev/test splits from ``data/chorales_rnn.hdf5``, keeps
    the first 10 feature columns, merges train and dev, drops padded rows,
    writes the cleaned arrays to ``data/chorales_sm.hdf5`` and prints the
    train/test accuracy of a random forest (raw features) and of extra trees
    and multinomial logistic regression (features returned by ``encode``).
    """
    print("3. Testing softmax for full harmonization...")
    trainXc, trainyc = load_dataset("train", "data/chorales_rnn.hdf5")
    devXc, devyc = load_dataset("dev", "data/chorales_rnn.hdf5")
    testXc, testyc = load_dataset("test", "data/chorales_rnn.hdf5")
    # Remove Oracle features
    trainXc = [X[:, range(0, 10)] for X in trainXc]
    devXc = [X[:, range(0, 10)] for X in devXc]
    testXc = [X[:, range(0, 10)] for X in testXc]
    # Aggregate data: one stack call per array instead of a pairwise
    # reduce, which copied the growing result at every step.
    Xtrain = numpy.vstack(list(trainXc) + list(devXc))
    ytrain = numpy.hstack(list(trainyc) + list(devyc))
    Xtest, ytest = numpy.vstack(testXc), numpy.hstack(testyc)

    # Remove padding: the largest test label is treated as the padding label.
    ypadding = ytest.max()
    keep_train = ytrain != ypadding
    keep_test = ytest != ypadding
    Xtrain, ytrain = Xtrain[keep_train], ytrain[keep_train]
    Xtest, ytest = Xtest[keep_test], ytest[keep_test]

    _, Xtrainsparse, Xtestsparse = encode(Xtrain, Xtest)
    # Keyword arguments: positional hyper-parameters are rejected by
    # current scikit-learn.
    RF = RandomForestClassifier(n_estimators=10,
                                criterion="entropy",
                                max_depth=None)
    RF.fit(Xtrain, ytrain)
    # Write full harmonization data
    with h5py.File('data/chorales_sm.hdf5', "w", libver="latest") as f:
        f.create_dataset("Xtrain", Xtrain.shape, dtype="i", data=Xtrain)
        f.create_dataset("ytrain", ytrain.shape, dtype="i", data=ytrain)
        f.create_dataset("Xtest", Xtest.shape, dtype="i", data=Xtest)
        f.create_dataset("ytest", ytest.shape, dtype="i", data=ytest)
    print("Full harmonization data written")
    score_RF_train = RF.score(Xtrain, ytrain)
    score_RF_test = RF.score(Xtest, ytest)
    print("R-FOREST: %.2f%% training, %.2f%% test" %
          (score_RF_train * 100, score_RF_test * 100))
    # min_samples_split must be at least 2; 2 grows the same trees as the
    # former value of 1 did on versions that accepted it.
    ERF = ExtraTreesClassifier(n_estimators=40,
                               max_depth=None,
                               min_samples_split=2,
                               random_state=0)
    ERF.fit(Xtrainsparse, ytrain)
    score_ERF_train = ERF.score(Xtrainsparse, ytrain)
    score_ERF_test = ERF.score(Xtestsparse, ytest)
    print("EXTRA TREES: %.2f%% training, %.2f%% test" %
          (score_ERF_train * 100, score_ERF_test * 100))
    logit = linear_model.LogisticRegression(multi_class='multinomial',
                                            solver='lbfgs',
                                            C=1)
    logit.fit(Xtrainsparse, ytrain)
    score_logit_train = logit.score(Xtrainsparse, ytrain)
    score_logit_test = logit.score(Xtestsparse, ytest)
    print("LOGIT: %.2f%% training, %.2f%% test" %
          (score_logit_train * 100, score_logit_test * 100))
Beispiel #11
0
def extratrees_clf():
    """Fit a 100-tree extra-trees model on the module-level split and print scores.

    Reads the globals ``train_dataset_X``, ``X_train``, ``y_train``,
    ``X_test`` and ``y_test``. ``max_features`` is the integer square root of
    the column count of ``train_dataset_X``.
    """
    n_columns = train_dataset_X.shape[1]
    forest = ExtraTreesClassifier(
        n_estimators=100,
        max_features=int(np.sqrt(n_columns)),
        n_jobs=-1,
        random_state=42,
    )
    forest.fit(X_train, y_train)

    # Scores noted by the original author: 0.9925 (train), 0.782 (test).
    # The author also left the remark "FAILED TO CONVERGE" here.
    train_score = forest.score(X_train, y_train)
    print("Train set score: {0}".format(train_score))
    test_score = forest.score(X_test, y_test)
    print("Test set score: {0}".format(test_score))
Beispiel #12
0
def learn(f):
    """Evaluate and fit an extra-trees classifier on the labelled EXECUTE rows.

    Rows of the global ``raw_data`` whose ``label`` is ``'unknown'`` or whose
    ``file type`` is not ``'EXECUTE'`` are discarded. A 10-fold
    cross-validated accuracy is printed, then the model is fitted on a
    seeded train/test split, its hold-out accuracy is printed, and the 20
    most important features are listed.

    Args:
        f: iterable of feature column names in ``raw_data``.

    Returns:
        The classifier fitted on the training part of the split.
    """
    global raw_data
    print('testing classifier')
    data = raw_data[raw_data['label'] != 'unknown']
    data = data[data['file type'] == 'EXECUTE']
    # ``DataFrame.as_matrix`` no longer exists in pandas; column selection
    # plus ``.values`` yields the same array.
    X = data[list(f)].values
    y = np.array(data['label'].tolist())
    clf = ExtraTreesClassifier(n_estimators=100)
    scores = sklearn.cross_validation.cross_val_score(clf, X, y, cv=10)
    print("predicted accuracy: %0.2f (+/- %0.2f)" %
          (scores.mean(), scores.std() * 2))
    seed = 3301
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        random_state=seed)
    clf.fit(X_train, y_train)
    scores = clf.score(X_test, y_test)
    print("actual accuracy: %0.2f" % scores)
    # ``sorted`` works whether ``zip`` returns a list or an iterator.
    importances = sorted(zip(f, clf.feature_importances_),
                         key=lambda k: k[1],
                         reverse=True)
    for name, weight in importances[0:20]:
        # Single formatted argument prints the same on Python 2 and 3.
        print('%s %s' % (name.ljust(30), weight))
    return clf
Beispiel #13
0
def train_model(train):
    """Train an extra-trees loan-status classifier on an oversampled dataset.

    ``Loan Status`` is label-encoded in place on the caller's frame, ``Term``
    and ``Home Ownership`` are one-hot encoded, several columns are dropped,
    the classes are balanced with random over-sampling, and a default
    ``ExtraTreesClassifier`` is fitted on an 80/20 split.

    Args:
        train: DataFrame containing ``Loan Status`` and the feature columns.

    Returns:
        Tuple ``(model, accuracy)`` where ``accuracy`` is the score on the
        held-out 20 %.
    """
    le = LabelEncoder()
    cols = ['Term', 'Home Ownership']
    # NOTE: this writes the encoded labels back into the caller's frame.
    train['Loan Status'] = le.fit_transform(train['Loan Status'])

    train = pd.get_dummies(data=train, columns=cols, drop_first=True)

    X = train.drop(columns=[
        'Purpose', 'Monthly Debt', 'Years of Credit History',
        'Number of Open Accounts', 'Number of Credit Problems',
        'Current Credit Balance', 'Maximum Open Credit', 'Bankruptcies',
        'Tax Liens'
    ])
    y = train['Loan Status']

    from imblearn.over_sampling import RandomOverSampler
    ros = RandomOverSampler()
    # ``fit_resample`` is the supported name; ``fit_sample`` was an alias
    # that imbalanced-learn has since removed.
    X_ros, _ = ros.fit_resample(X, y)

    # The label column is still part of X at this point, so it is resampled
    # together with the features and split off afterwards.
    y = X_ros['Loan Status'].values
    X = X_ros.drop(columns=['Loan Status']).values

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=42)

    model = ExtraTreesClassifier()
    model.fit(X_train, y_train)

    return model, model.score(X_test, y_test)
def ERFC_Classifier(X_train, X_cv, X_test, Y_train,Y_cv,Y_test, Actual_DS):
    """Fit a 100-tree extra-trees model and return class probabilities.

    Prints the accuracy on the CV split, a cross-tabulation of actual versus
    predicted labels (decoded with the global ``label_enc``), the log loss on
    the CV split and the elapsed time.

    Args:
        X_train, Y_train: training data.
        X_cv, Y_cv: validation data used for all reported metrics.
        X_test: test features.
        Y_test: unused.
        Actual_DS: additional feature set to predict.

    Returns:
        Three DataFrames of predicted probabilities, for ``X_cv``,
        ``X_test`` and ``Actual_DS`` respectively.
    """
    print("***************Starting Extreme Random Forest Classifier***************")
    t0 = time()
    clf = ExtraTreesClassifier(n_estimators=100,n_jobs=-1)
    clf.fit(X_train, Y_train)
    preds = clf.predict(X_cv)
    score = clf.score(X_cv,Y_cv)

    print("Extreme Random Forest Classifier - {0:.2f}%".format(100 * score))
    Summary = pd.crosstab(label_enc.inverse_transform(Y_cv), label_enc.inverse_transform(preds),
                      rownames=['actual'], colnames=['preds'])
    # NOTE(review): dividing with axis=1 aligns the row sums with the column
    # labels -- confirm this is the intended normalisation.
    Summary['pct'] = (Summary.divide(Summary.sum(axis=1), axis=1)).max(axis=1)*100
    print(Summary)

    # Check with log loss function. The explicit ``eps`` argument is not
    # passed: it equalled the historical default and has been removed from
    # recent scikit-learn releases.
    preds2 = clf.predict_proba(X_cv)
    ll_output2= log_loss(Y_cv, preds2, normalize=True)
    print(ll_output2)
    print("done in %0.3fs" % (time() - t0))

    preds3 = clf.predict_proba(X_test)
    preds4 = clf.predict_proba(Actual_DS)

    print("***************Ending Extreme Random Forest Classifier***************")
    return pd.DataFrame(preds2) , pd.DataFrame(preds3),pd.DataFrame(preds4)
Beispiel #15
0
def test_vocabulary(vocabulary_sizes, desc_sel, desc_tr, desc_te):
    """Score an extra-trees classifier for several bag-of-words vocabulary sizes.

    For every size the k-means model from ``load_or_compute_pickle`` is used
    to build 150 x ``num_clusters`` histograms for the training and test
    descriptors. The scores are also pickled to ``vocabulary_scores.pickle``.

    Args:
        vocabulary_sizes: iterable of cluster counts to evaluate.
        desc_sel: unused.
        desc_tr: training descriptors passed to ``bag_of_words_histogram``.
        desc_te: test descriptors passed to ``bag_of_words_histogram``.

    Returns:
        List of test accuracies, one per vocabulary size.
    """
    # Labels do not depend on the vocabulary size: 150 samples in
    # consecutive groups of 15, identical for training and test.
    train_labels = [i // 15 for i in range(150)]
    test_labels = train_labels

    score_list = []
    for num_clusters in vocabulary_sizes:
        kmeans = load_or_compute_pickle(num_clusters)

        # Calculate the bag of words for training and test data
        data_train = bag_of_words_histogram(desc_tr, kmeans,
                                            num_clusters).reshape(
                                                150, num_clusters)
        data_test = bag_of_words_histogram(desc_te, kmeans,
                                           num_clusters).reshape(
                                               150, num_clusters)

        print('Computing RF for a vocabulary of', num_clusters)
        RFC = ExtraTreesClassifier(n_estimators=100,
                                   max_depth=10,
                                   bootstrap=False,
                                   random_state=0).fit(data_train,
                                                       train_labels)
        score = RFC.score(data_test, test_labels)
        score_list.append(score)
        print('score:', score)

    # The context manager closes the file even if pickling fails.
    with open('vocabulary_scores.pickle', 'wb') as pickle_out:
        pickle.dump(score_list, pickle_out)
    return score_list
Beispiel #16
0
def get_ERT(Xtrain, Ytrain, baseTree, Xtest = None , Ytest = None, verbose = 0):
    """Train a 1000-tree extra-trees model using a tuned tree's settings.

    Args:
        Xtrain, Ytrain: training data.
        baseTree: fitted search object; ``max_features``, ``max_depth`` and
            ``min_samples_split`` are read from its ``best_estimator_``.
        Xtest, Ytest: optional test data, scored only when ``verbose == 1``.
        verbose: when 1, print the train (and, if given, test) accuracy.

    Returns:
        The fitted ``ExtraTreesClassifier``.
    """
    tuned = baseTree.best_estimator_
    ert = ExtraTreesClassifier(n_estimators=1000,
                               max_features=tuned.max_features,
                               max_depth=tuned.max_depth,
                               min_samples_split=tuned.min_samples_split,
                               n_jobs=-1)
    ert.fit(Xtrain, Ytrain)

    if verbose == 1:
        train_score = ert.score(Xtrain, Ytrain)
        print('Extremely Randomized Trees, train: {0:.02f}% '.format(train_score*100))
        # Identity check instead of comparing types against type(None).
        if Xtest is not None:
            test_score = ert.score(Xtest, Ytest)
            print('Extremely Randomized Trees, test: {0:.02f}% '.format(test_score*100))

    return ert
Beispiel #17
0
def learn(f):
    """Evaluate and fit an extra-trees classifier on the labelled EXECUTE rows.

    Rows of the global ``raw_data`` whose ``label`` is ``'unknown'`` or whose
    ``file type`` is not ``'EXECUTE'`` are discarded. A 10-fold
    cross-validated accuracy is printed, then the model is fitted on a
    seeded train/test split, its hold-out accuracy is printed, and the 20
    most important features are listed.

    Args:
        f: iterable of feature column names in ``raw_data``.

    Returns:
        The classifier fitted on the training part of the split.
    """
    global raw_data
    print('testing classifier')
    data = raw_data[raw_data['label'] != 'unknown']
    data = data[data['file type'] == 'EXECUTE']
    # ``DataFrame.as_matrix`` no longer exists in pandas; column selection
    # plus ``.values`` yields the same array.
    X = data[list(f)].values
    y = np.array(data['label'].tolist())
    clf = ExtraTreesClassifier(n_estimators=100)
    scores = sklearn.cross_validation.cross_val_score(clf, X, y, cv=10)
    print("predicted accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    seed = 3301
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=seed)
    clf.fit(X_train, y_train)
    scores = clf.score(X_test, y_test)
    print("actual accuracy: %0.2f" % scores)
    # ``sorted`` works whether ``zip`` returns a list or an iterator.
    importances = sorted(zip(f, clf.feature_importances_),
                         key=lambda k: k[1], reverse=True)
    for name, weight in importances[0:20]:
        # Single formatted argument prints the same on Python 2 and 3.
        print('%s %s' % (name.ljust(30), weight))
    return clf
Beispiel #18
0
def do_extra_trees(md = None):
    """Return the test accuracy of a 100-tree extra-trees model on the glass data.

    Args:
        md: ``max_depth`` for the trees (``None`` leaves the depth unlimited).
    """
    from sklearn.ensemble import ExtraTreesClassifier

    # analysis_glass() supplies the split as train X/Y followed by test X/Y.
    train_X, train_Y, test_X, test_Y = analysis_glass()
    forest = ExtraTreesClassifier(n_estimators=100, max_depth=md)
    forest.fit(train_X, train_Y)
    accuracy = forest.score(test_X, test_Y)
    return accuracy
Beispiel #19
0
def get_ERT(Xtrain, Ytrain,tree, Xtest = None , Ytest = None, verbose = 0):
    """Train a 1000-tree extra-trees model using a tuned tree's settings.

    Args:
        Xtrain, Ytrain: training data.
        tree: fitted search object; ``max_features``, ``max_depth`` and
            ``min_samples_split`` are read from its ``best_estimator_``.
        Xtest, Ytest: optional test data, scored only when ``verbose == 1``.
        verbose: when 1, print the train (and, if given, test) accuracy.

    Returns:
        The fitted ``ExtraTreesClassifier``.
    """
    tuned = tree.best_estimator_
    ert = ExtraTreesClassifier(n_estimators=1000,
                               max_features=tuned.max_features,
                               max_depth=tuned.max_depth,
                               min_samples_split=tuned.min_samples_split,
                               n_jobs=-1)
    ert.fit(Xtrain, Ytrain)

    if verbose == 1:
        train_score = ert.score(Xtrain, Ytrain)
        print('Extremely Randomized Trees, train: {0:.02f}% '.format(train_score*100))
        # Identity check instead of comparing types against type(None).
        if Xtest is not None:
            test_score = ert.score(Xtest, Ytest)
            print('Extremely Randomized Trees, test: {0:.02f}% '.format(test_score*100))

    return ert
Beispiel #20
0
def random_forest_cross_validate(targets, features, nprocesses=-1):
    """Cross-validate an extra-trees model and collect ``get_metric`` per fold.

    For each of the 5 folds a 100-tree model is fitted, its score and the
    value of ``get_metric`` are printed, and the features with non-zero
    importance are printed and pickled to ``important_features.p`` (the file
    is overwritten on every fold).

    Args:
        targets: labels, indexable by the fold masks.
        features: feature frame, indexable by the fold masks and exposing
            ``.columns``.
        nprocesses: ``n_jobs`` used while fitting.

    Returns:
        List with one ``get_metric`` value per fold.
    """
    # NOTE(review): KFold(n, k=..., indices=False) and compute_importances
    # belong to a very old scikit-learn API; kept as-is.
    cv = cross_validation.KFold(len(features), k=5, indices=False)
    results = []
    for i, (traincv, testcv) in enumerate(cv):
        cfr = ExtraTreesClassifier(
            n_estimators=100,
            max_features=None,
            verbose=2,
            compute_importances=True,
            n_jobs=nprocesses,
            random_state=0,
        )
        print("Fitting cross validation #{0}".format(i))
        cfr.fit(features[traincv], targets[traincv])
        print("Scoring cross validation #{0}".format(i))
        cfr.set_params(n_jobs=1)  # score in a single process
        score = cfr.score(features[testcv], targets[testcv])
        print("Score for cross validation #{0}, score: {1}".format(i, score))
        mean_diff = get_metric(cfr, features[testcv], targets[testcv])
        print("Mean difference: {0}".format(mean_diff))
        # Record each fold exactly once.
        results.append(mean_diff)
        print("Features importance")
        features_list = sorted(
            ((features.columns[j], importance)
             for j, importance in enumerate(cfr.feature_importances_)
             if importance > 0.0),
            key=lambda x: x[1],
            reverse=True)
        for j, tup in enumerate(features_list):
            # Single formatted argument prints the same on Python 2 and 3.
            print('%s %s' % (j, tup))
        # The context manager closes the handle after every fold.
        with open("important_features.p", 'wb') as out:
            pickle.dump(features_list, out)
    return results
def ExtrExtraTrees_classification(train,
                                  test,
                                  train_labels,
                                  test_labels,
                                  res=None):
    """Fit a default extra-trees classifier and report on the test data.

    :param train: training data, iterable/list
    :param test: testing data, iterable/list
    :param train_labels: training labels, iterable/list
    :param test_labels: testing labels, iterable/list
    :param res: optional dict that receives an ``"ExtraTrees"`` entry with
        the model, its accuracy and its name; a fresh dict is used when
        omitted
    :return: tuple ``(score, model)`` with the test accuracy and the fitted
        classifier
    """
    # A ``{}`` default would be shared by every call that omits ``res``.
    if res is None:
        res = {}

    print("Classifying with ExtraTrees...")

    extra = ExtraTreesClassifier()
    extra.fit(train, train_labels)
    prediction = extra.predict(test)

    utils.report_and_confmat(test_labels, prediction, "ExtraTrees")
    score = extra.score(test, test_labels)

    res["ExtraTrees"] = {
        "model": extra,
        "accuracy": score,
        "name": "ExtraTreesClassifier"
    }
    print("ExtraTrees ended...")
    return score, extra
def many_classify_dtree(X,Y):
    """Train one extra-trees model per data chunk and print the mean accuracy.

    The rows are cut into four label-based ``.loc`` ranges; each chunk is
    split 90/10, a 10-tree model is fitted on the larger part and scored on
    the smaller one.

    Args:
        X: feature frame supporting ``.loc`` slicing.
        Y: label frame supporting ``.loc`` slicing, aligned with ``X``.
    """
    print("Building the model for decision trees...")
    # The same bounds are applied to X and Y so that every chunk has matching
    # rows (the last Y slice used to end at 60000 while X ended at 59999).
    # ``.loc`` slices include both ends.
    bounds = [(0, 15000), (15000, 30000), (30000, 45000), (45000, 59999)]

    scores = []
    for lo, hi in bounds:
        X_train, X_test, y_train, y_test = cross_validation.train_test_split(
            X.loc[lo:hi], Y.loc[lo:hi], test_size=0.1)
        clf = ExtraTreesClassifier(n_estimators=10)
        clf = clf.fit(X_train, np.ravel(y_train))
        scores.append(clf.score(X_test, np.ravel(y_test)))

    print("Classification Score using Decision Tree with Drift Detection:" + str(sum(scores) / len(scores)))
Beispiel #23
0
def do_extra_trees(md=None):
    """Score a 100-tree extra-trees classifier on the glass dataset split.

    Args:
        md: value passed as ``max_depth``.

    Returns:
        Accuracy on the test part returned by ``analysis_glass``.
    """
    from sklearn.ensemble import ExtraTreesClassifier

    train_X, train_Y, test_X, test_Y = analysis_glass()
    model = ExtraTreesClassifier(max_depth=md, n_estimators=100)
    model.fit(train_X, train_Y)
    result = model.score(test_X, test_Y)
    return result
Beispiel #24
0
def Extreme_rf_dis(n_trees, X, Y, train_indices, test_indices, seed):
    """Build extra-trees proximity features between samples.

    A forest of ``n_trees`` trees is fitted on the training rows. The
    proximity of two samples is the fraction of trees in which both fall
    into the same leaf.

    Args:
        n_trees: number of trees in the forest.
        X: feature array, indexed by row.
        Y: label array aligned with ``X``.
        train_indices: row selector for the training samples.
        test_indices: row selector for the test samples.
        seed: ``random_state`` of the forest.

    Returns:
        Tuple ``(train_features, test_features, weight, pred)``: proximities
        of the training and test rows to the training rows, the accuracy on
        the test rows, and the predicted test labels.
    """
    # ``oob_score`` is not requested: it needs bootstrap sampling, which
    # extra trees do not use by default, and its value was never read. The
    # forest size follows ``n_trees`` so the normalisation below is exact.
    clf = ExtraTreesClassifier(n_estimators=n_trees,
                               random_state=seed,
                               n_jobs=-1)
    clf = clf.fit(X[train_indices], Y[train_indices])
    pred = clf.predict(X[test_indices])
    weight = clf.score(X[test_indices], Y[test_indices])

    # Leaf index of every sample in every tree: (n_samples, n_trees).
    leaves = clf.apply(X)
    n_samples = X.shape[0]
    dis = np.empty((n_samples, n_samples))
    for i in range(n_samples):
        # One vectorised row at a time keeps memory at O(n_samples * n_trees).
        dis[i] = (leaves == leaves[i]).sum(axis=1) / float(n_trees)

    # ``dis`` is symmetric, so selecting the training columns gives the
    # proximity of every sample to each training sample.
    to_train = dis[:, train_indices]
    return to_train[train_indices], to_train[test_indices], weight, pred
Beispiel #25
0
def learnly():
    """Fit a 30-tree extra-trees model on the module-level data and score it.

    Reads the globals ``features_train``, ``labels_train``, ``features_test``
    and ``labels_test`` and prints the test accuracy.

    Returns:
        Tuple ``(clf, score)``: the fitted classifier and its test accuracy.
    """
    clf = ExtraTreesClassifier(n_estimators=30)
    clf.fit(features_train, labels_train)
    # The discarded training-set prediction and an unused local were removed.
    score = clf.score(features_test, labels_test)
    print(score)
    return clf, score
def extraTreesClassifier(X, Y, X_test, Y_test):
    """Fit a seeded 10-tree extra-trees model and print train/test accuracy.

    Args:
        X, Y: training features and labels.
        X_test, Y_test: test features and labels.
    """
    model = ExtraTreesClassifier(random_state=0, n_estimators=10)
    model.fit(X, Y)

    train_accuracy = model.score(X, Y)
    print('Training set score: ' + str(train_accuracy))

    test_accuracy = model.score(X_test, Y_test)
    print('Test set score: ' + str(test_accuracy))
def EnsembleMethod(X, y):
    """Compare a decision tree, a random forest and extra trees on one split.

    The data is split with ``TRAIN_TEST_SPLIT_RATIO``; each classifier is
    fitted on the training part and its train and test scores are printed.

    Args:
        X: feature matrix.
        y: labels.
    """
    # divide our data set into a training set and a test set
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        X, y, test_size=TRAIN_TEST_SPLIT_RATIO)

    # min_samples_split=2 is the smallest value scikit-learn accepts; a
    # one-sample node cannot be split, so trees grow exactly as far.
    classifiers = [
        ("====== Decision Tree Classifier ========",
         DecisionTreeClassifier(max_depth=None,
                                min_samples_split=2,
                                random_state=0)),
        ("====== Random Forest Classifier ========",
         RandomForestClassifier(n_estimators=10,
                                max_depth=None,
                                min_samples_split=2,
                                random_state=0)),
        ("======= Extra Trees Classifier ========",
         ExtraTreesClassifier(n_estimators=10,
                              max_depth=None,
                              min_samples_split=2,
                              random_state=0)),
    ]

    for banner, classifier in classifiers:
        # use the classifier to fit the data.
        classifier.fit(X_train, y_train)

        # print the performance of the classifier
        print(banner)
        print('TRAIN SCORE', classifier.score(X_train, y_train))
        print('TEST SCORE', classifier.score(X_test, y_test))
def classify(X,Y):
    """Fit a 10-tree extra-trees model on a 90/10 split and print its accuracy.

    Args:
        X: feature matrix.
        Y: labels; flattened with ``np.ravel`` before fitting and scoring.
    """
    print("Building the model for random forests...")
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        X, Y, test_size=0.1)

    # The estimator expects 1-D label vectors.
    flat_train = np.ravel(y_train)
    flat_test = np.ravel(y_test)

    model = ExtraTreesClassifier(n_estimators=10).fit(X_train, flat_train)
    accuracy = model.score(X_test, flat_test)
    print("Classification Score using Random Forests:" + str(accuracy))
def do_predict_by_cv_and_norm(x_train, y_train, x_test, y_test, times_num, flag='ExtraTrees'):
    """
    Measure a model's accuracy on the test set after normalizing the features.

    The features are scaled with a ``Normalizer`` fitted on the training
    data. The score, the labels and the predictions are printed and appended
    to a result file, and every misclassified ``"actual-predicted"`` pair is
    counted in the global ``compare`` dict.

    :param x_train: feature vectors of the training samples
    :param y_train: labels of the training samples
    :param x_test: feature vectors of the test samples
    :param y_test: labels of the test samples
    :param times_num: step identifier written to the result-file header
        (the estimators are always built with their fixed settings)
    :param flag: model selector: 'RandomForest', 'DecisionTree' or 'SVM';
        any other value selects extra trees
    :return: accuracy on the test set
    """
    file_path = '/Users/ming.zhou/NLP/DiscourseStructures/result/' + flag + 'PredictResult20171117.text'

    # Build only the classifier that is actually requested.
    if flag == 'RandomForest':
        clf = RandomForestClassifier(n_estimators=10)
    elif flag == 'DecisionTree':
        clf = DecisionTreeClassifier()
    elif flag == 'SVM':
        clf = svm.SVC()
    else:
        clf = ExtraTreesClassifier(n_estimators=10)

    normalizer = preprocessing.Normalizer().fit(x_train)
    x_train_norm = normalizer.transform(x_train)
    x_test_norm = normalizer.transform(x_test)
    clf.fit(x_train_norm, y_train)
    y_pred = clf.predict(x_test_norm)
    score = clf.score(x_test_norm, y_test)

    print(str(score) + '\n')
    # Convert explicitly: adding a str to a list or an ndarray raises.
    print(str(y_test) + '\n')
    print(str(y_pred) + '\n')
    print('test_Y len is %s,y_pred len is %s \n' % (len(y_test), len(y_pred)))

    # The context manager closes the file even if an error occurs below.
    with open(file_path, 'a') as result:
        result_content = [
            '**********************step=' + str(times_num) + '**********************' + '\n',
            str(score) + '\n',
            'test_Y:' + '\n',
            str(y_test) + '\n',
            'y_pred:' + '\n',
            str(y_pred) + '\n',
            'test_Y len is ' + str(len(y_test)) + ',and y_pred len is ' + str(len(y_pred)) + '\n',
        ]

        # Count each kind of misclassification in the global tally.
        for actual, predicted in zip(y_test, y_pred):
            if actual != predicted:
                y_and_ypred = str(actual) + '-' + str(predicted)
                compare[y_and_ypred] = compare.get(y_and_ypred, 0) + 1

        result.writelines(result_content)
    return score
Beispiel #30
0
def et_classifier(x_trn: pd.DataFrame, y_trn: np.ndarray, x_val: pd.DataFrame,
                  y_val: np.ndarray) -> tuple:
    """Fit a class-balanced extra-trees model and evaluate it on a validation set.

    The inputs are copied before use, so the caller's objects are never
    modified by this function.

    :param x_trn: training features
    :param y_trn: training labels
    :param x_val: validation features
    :param y_val: validation labels
    :return: tuple ``(model, training_score, validation_score, clf_report,
        ck_score)`` - the fitted classifier, its ``score`` on the training and
        validation data, the text classification report and Cohen's kappa,
        the last two computed on the validation predictions.
    """
    x_trn, x_val = x_trn.copy(), x_val.copy()
    y_trn, y_val = y_trn.copy(), y_val.copy()
    model = ExtraTreesClassifier(n_estimators=400,
                                 min_samples_leaf=16,
                                 class_weight='balanced',
                                 n_jobs=-1,
                                 random_state=7)
    model.fit(x_trn, y_trn)

    training_score = model.score(x_trn, y_trn)
    validation_score = model.score(x_val, y_val)

    # Predict once and share the result between both metrics; the fitted
    # model is deterministic, so a second predict() call is wasted work.
    val_pred = model.predict(x_val)
    clf_report = classification_report(y_val, val_pred)
    ck_score = cohen_kappa_score(y_val, val_pred)

    return model, training_score, validation_score, clf_report, ck_score
Beispiel #31
0
def evaluate_et(trainX, trainy, testX, testy, params):
    """Standardise the features, fit an extra-trees model and score it.

    The scaler is fitted on the training features only and then applied to
    the test features.

    :param trainX: training features
    :param trainy: training labels
    :param testX: test features
    :param testy: test labels
    :param params: keyword arguments forwarded to ``ExtraTreesClassifier``
    :return: tuple ``(model, test_acc, pred)`` - the fitted model, its
        ``score`` on the scaled test data and its ``predict_proba`` output
        for the scaled test data.
    """
    scaler = StandardScaler()
    scaled_train = scaler.fit_transform(trainX)
    scaled_test = scaler.transform(testX)

    model = ExtraTreesClassifier(**params)
    model.fit(scaled_train, trainy)

    return (model,
            model.score(scaled_test, testy),
            model.predict_proba(scaled_test))
def classify(X,Y,test_data,test_labels):
	"""Fit a 10-tree extra-trees classifier and predict the test samples.

	Both label inputs are flattened with ``np.ravel`` before use. The score
	on the test data is printed; the printed text says "random forests"
	although the estimator is an ``ExtraTreesClassifier``.

	:return: the predictions for ``test_data``
	"""
	print("Building the model for random forests...")
	flat_train_labels = np.ravel(Y)
	flat_test_labels = np.ravel(test_labels)
	forest = ExtraTreesClassifier(n_estimators=10).fit(X, flat_train_labels)
	accuracy = forest.score(test_data, flat_test_labels)
	print("Classification Score using Random Forests:" + str(accuracy))
	return forest.predict(test_data)
def get_ERT(Xtrain, Xtest, Ytrain, Ytest, gtree):
    """Fit 1000 extremely randomized trees using tuned tree settings.

    ``max_features``, ``max_depth`` and ``min_samples_split`` are copied from
    ``gtree.best_estimator_``. The train and test scores are printed as
    percentages.

    :return: the fitted ``ExtraTreesClassifier``
    """
    tuned = gtree.best_estimator_
    ert = ExtraTreesClassifier(n_estimators=1000,
                               max_features=tuned.max_features,
                               max_depth=tuned.max_depth,
                               min_samples_split=tuned.min_samples_split,
                               n_jobs=-1)
    ert.fit(Xtrain, Ytrain)

    # Element 0 is the training score, element 1 the test score.
    scores = np.array([ert.score(Xtrain, Ytrain), ert.score(Xtest, Ytest)])
    train_pct = scores[0] * 100
    test_pct = scores[1] * 100
    print('Extremely Randomized Trees, train: {0:.02f}% '.format(train_pct))
    print('Extremely Randomized Trees, test: {0:.02f}% '.format(test_pct))

    return ert
def classify(X, Y):
    """Hold out 10% of the data, fit a 10-tree extra-trees model, print its score.

    The label halves of the split are flattened with ``np.ravel`` before
    fitting and scoring. Nothing is returned; the printed text says
    "random forests" although the estimator is an ``ExtraTreesClassifier``.
    """
    print("Building the model for random forests...")
    split = cross_validation.train_test_split(X, Y, test_size=0.1)
    X_train, X_test = split[0], split[1]
    y_train, y_test = np.ravel(split[2]), np.ravel(split[3])

    forest = ExtraTreesClassifier(n_estimators=10).fit(X_train, y_train)
    print("Classification Score using Random Forests:" +
          str(forest.score(X_test, y_test)))
Beispiel #35
0
def classify(X, Y, test_data, test_labels):
    """Train a 10-tree extra-trees classifier and return test predictions.

    Labels are flattened with ``np.ravel`` first. The score on the test data
    is printed (the message text mentions "Random Forests").

    :return: the predictions for ``test_data``
    """
    print("Building the model for random forests...")
    train_targets, test_targets = np.ravel(Y), np.ravel(test_labels)

    model = ExtraTreesClassifier(n_estimators=10)
    model = model.fit(X, train_targets)

    score_text = str(model.score(test_data, test_targets))
    print("Classification Score using Random Forests:" + score_text)
    return model.predict(test_data)
def extract_tree(train_vecs, y_train, test_vecs, y_test):
    """Fit a small extra-trees model, persist it and return its test score.

    The fitted model is dumped with joblib to
    ``storedpaths + 'model_extracttree.pkl'`` before it is scored.

    :return: the model's ``score`` on ``(test_vecs, y_test)``
    """
    settings = dict(n_estimators=10,
                    max_depth=10,
                    min_samples_split=2,
                    n_jobs=1,
                    random_state=0)
    model = ExtraTreesClassifier(**settings)
    model.fit(train_vecs, y_train)

    model_file = storedpaths + 'model_extracttree.pkl'
    joblib.dump(model, model_file)

    return model.score(test_vecs, y_test)
Beispiel #37
0
def et_classify(self):
    print "Extra Trees"
    clf = ExtraTreesClassifier()
    clf.fit(self.descr, self.target)
    mean = clf.score(self.test_descr, self.test_target)
    pred = clf.predict(self.test_descr)

    print "Pred ", pred
    print "Mean : %3f" % mean
    print "Feature Importances ", clf.feature_importances_
Beispiel #38
0
def ExRF(n_trees,  seed, train_x, train_y, test_x, test_y):
    """Fit an extra-trees classifier and return its score on the test set.

    The previous version requested ``oob_score=True`` without
    ``bootstrap=True``. scikit-learn rejects that combination (out-of-bag
    estimation needs bootstrap sampling, which ``ExtraTreesClassifier``
    disables by default), so ``fit`` raised ``ValueError``. The out-of-bag
    value and a ``predict_proba`` result were computed but never used, so
    both have been removed.

    :param n_trees: number of trees (``n_estimators``)
    :param seed: ``random_state`` of the forest
    :param train_x: training features
    :param train_y: training labels
    :param test_x: test features
    :param test_y: test labels
    :return: ``clf.score(test_x, test_y)``. NOTE: this is the mean accuracy,
        not an error rate, even though the original local variable was
        named ``test_error``.
    """
    clf = ExtraTreesClassifier(n_estimators=n_trees, random_state=seed)
    clf.fit(train_x, train_y)
    return clf.score(test_x, test_y)
Beispiel #39
0
def et_classify(self):
	print "Extra Trees"
	clf = ExtraTreesClassifier()
	clf.fit(self.descr, self.target)
	mean = clf.score(self.test_descr, self.test_target)
	pred = clf.predict(self.test_descr)

	print "Pred ", pred
	print "Mean : %3f" % mean
	print "Feature Importances ", clf.feature_importances_
Beispiel #40
0
def train_model(stats, X_train, Y_train, X_test=None, Y_test=None):
        
    print "Training ExtraTrees classifier"
    clf = Classifier(n_estimators=n_estimators,n_jobs=30,
                     min_samples_leaf=nodesize,
                     #class_weight='balanced_subsample',
                     )
    clf.fit(X_train,Y_train)
    stats["train_acc"] = clf.score(X_train, Y_train)

    print "Training complete"
    print 'Training Accuracy: %.3f'%stats["train_acc"]
    
    # Breakout early if no test set is given
    if X_test is None:
        return clf, stats

    stats["test_acc"] = clf.score(X_test, Y_test)
    print 'Testing Accuracy: %.3f'%stats["test_acc"]

    X_test_TP = X_test[Y_test==1]
    Y_test_TP = Y_test[Y_test==1]
    stats["test_acc_TP"] = clf.score(X_test_TP, Y_test_TP)
    print 'Testing Accuracy TP: %.3f'%stats["test_acc_TP"]

    X_test_FP = X_test[Y_test==0]
    Y_test_FP = Y_test[Y_test==0]
    stats["test_acc_FP"] = clf.score(X_test_FP, Y_test_FP)
    print 'Testing Accuracy FP: %.3f'%stats["test_acc_FP"]
        
    pred_probas = clf.predict_proba(X_test)[:,1]
    Y_predict = clf.predict(X_test)
    
    total_contacts = Y_test.sum()
    predicted_contacts = Y_predict[Y_test==1].sum()
    print 'Total contacts predicted %i/%i'%(predicted_contacts,total_contacts)

    fpr,tpr,_ = roc_curve(Y_test, pred_probas)
    stats["ROC_AUC"] = auc(fpr,tpr)
    print "ROC area under the curve", stats["ROC_AUC"]

    return clf, stats
def train_data_and_score_tree(features,labels, cv, depth):
    """Fit a depth-limited extra-trees model on a split and score the hold-out.

    :param features: feature matrix
    :param labels: target labels
    :param cv: passed to ``train_test_split`` as ``test_size``
    :param depth: ``max_depth`` of the trees
    :return: tuple ``(score, clf)`` - hold-out score and the fitted model
    """
    split = cross_validation.train_test_split(
        features, labels, test_size=cv, random_state=0)
    f_train, f_test, l_train, l_test = split

    model = ExtraTreesClassifier(max_depth=depth).fit(f_train, l_train)
    return model.score(f_test, l_test), model
def EnsembleMethod(X, y):
	"""Compare a decision tree, a random forest and extra trees on one split.

	The data is split once using ``TRAIN_TEST_SPLIT_RATIO`` as the test
	size. Each model is then fitted in turn and its train and test scores
	are printed under a banner line. Nothing is returned.
	"""
	# divide our data set into a training set and a test set
	X_train, X_test, y_train, y_test = cross_validation.train_test_split(
		X, y, test_size=TRAIN_TEST_SPLIT_RATIO)

	# (banner, unfitted model) pairs, in the order they are reported
	candidates = [
		("====== Decision Tree Classifier ========",
			DecisionTreeClassifier(max_depth=None,
				min_samples_split=1, random_state=0)),
		("====== Random Forest Classifier ========",
			RandomForestClassifier(n_estimators=10,
				max_depth=None, min_samples_split=1, random_state=0)),
		("======= Extra Trees Classifier ========",
			ExtraTreesClassifier(n_estimators=10,
				max_depth=None, min_samples_split=1, random_state=0)),
	]

	for banner, model in candidates:
		# fit first, then report the performance of the classifier
		model.fit(X_train, y_train)
		print(banner)
		print('TRAIN SCORE', model.score(X_train, y_train))
		print('TEST SCORE', model.score(X_test, y_test))
Beispiel #43
0
def main():
    """Train, persist and score one extra-trees model per currency.

    For every entry of ``currencies`` the daily CSV
    ``../../data/<currency>1440.csv`` is loaded, features and rewards are
    derived with ``extractFeatures`` / ``calculateRewards``, the data is
    split 60/40 and an ``ExtraTreesClassifier`` is fitted, dumped to
    ``models/<currency>.pkl`` and scored. The test score and the
    out-of-bag score of every currency are logged at the end.
    """
    results = {}
    for currency in currencies:
        logging.info('Currency: {0}'.format(currency))

        # get data
        data = pd.read_csv(
            r'../../data/' + currency + '1440.csv',
            names=['date', 'time', 'open', 'high', 'low', 'close', 'volume'],
            parse_dates=[[0, 1]],
            index_col=0,
        ).astype(float)
        logging.info('Loaded {0} rows'.format(len(data)))

        # extract features
        features = extractFeatures(data)

        # set rewards; keep only the trailing rows so they line up with
        # the (possibly shorter) feature table
        rewards = calculateRewards(data)
        rewards = rewards[-len(features):]

        # train split
        X_train, X_test, y_train, y_test = cross_validation.train_test_split(
            features,
            rewards,
            test_size=0.40,
        )
        logging.info('Data splitted')

        # create classifier; bootstrap=True is required for oob_score=True
        logging.info('Classifier: training...')
        rfc = ExtraTreesClassifier(n_estimators=20, oob_score=True, bootstrap=True)
        rfc.fit(X_train, y_train)

        # saving
        logging.info('Classifier: saving...')
        externals.joblib.dump(rfc, 'models/' + currency + '.pkl', compress=9)

        # score
        logging.info('Classifier: scoring...')
        results[currency] = {
            'score': rfc.score(X=X_test, y=y_test),
            'oob': rfc.oob_score_,
        }

    # dict.items() works on Python 2 and Python 3; iteritems() exists only
    # on Python 2 and raised AttributeError on Python 3.
    for currency, scores in results.items():
        logging.info('{0} score:{1:.2f} oob:{2:.2f}'.format(currency, scores['score'], scores['oob']))
    def train_classifier(self):
        """Train an extra-trees text classifier and print its evaluation.

        The training documents ``docs_train`` are turned into token counts
        and then TF-IDF weights; an ``ExtraTreesClassifier`` configured from
        ``_n_estimators``, ``_criterion``, ``_max_depth`` and
        ``_min_samples_split`` is fitted on them. Those names, as well as
        ``stopwords``, ``_ngram_range``, ``_use_idf``, ``y_train``,
        ``docs_test`` and ``y_test``, are read from the enclosing scope, not
        from ``self``. Prints 10-fold weighted-F1 cross-validation scores,
        the test score, a classification report and the confusion matrix.

        :return: tuple ``(clf, count_vect)`` - the fitted classifier and the
            fitted ``CountVectorizer``
        """
        # Token counts: the vocabulary is learned on the training documents.
        count_vect = CountVectorizer(stop_words=stopwords, min_df=3, max_df=0.90, ngram_range=_ngram_range)
        train_counts = count_vect.fit_transform(docs_train)

        # number of documents x number of unique terms
        print ("Shape of train data is "+str(train_counts.shape))

        # TF-IDF weighting of the raw counts
        tfidf = TfidfTransformer(use_idf=_use_idf)
        train_tfidf = tfidf.fit_transform(train_counts)

        print ("Fitting data ...")
        clf = ExtraTreesClassifier(
            n_estimators=_n_estimators,
            criterion=_criterion,
            max_depth=_max_depth,
            min_samples_split=_min_samples_split,
        )
        clf.fit(train_tfidf, y_train)

        # 10-fold cross-validation on the training data (weighted F1)
        cv_scores = cross_val_score(clf, train_tfidf, y_train, cv=10, scoring='f1_weighted')
        print ("Cross validation score: "+str(cv_scores))
        print("Cross validation accuracy: %0.2f (+/- %0.2f)" % (cv_scores.mean(), cv_scores.std() * 2))

        # Push the test documents through the already fitted transformers.
        test_counts = count_vect.transform(docs_test)
        print ("Shape of test data is "+str(test_counts.shape))
        test_tfidf = tfidf.transform(test_counts)

        y_predicted = clf.predict(test_tfidf)

        # score on the given test data and labels
        print ("Classifier score on test data is: %0.2f " % clf.score(test_tfidf,y_test))

        print(metrics.classification_report(y_test, y_predicted))
        print(metrics.confusion_matrix(y_test, y_predicted))

        return clf,count_vect
Beispiel #45
0
def do_classify(X,y,Xtest,ytest):
    """Average score and feature importances over 100 extra-trees fits.

    A fresh 10-tree ``ExtraTreesClassifier`` is fitted on ``(X, y)`` in each
    of the 100 repetitions and scored on ``(Xtest, ytest)``.

    :return: tuple ``(mean_importance, mean_scores)`` - the per-feature mean
        of ``feature_importances_`` and the mean test score
    """
    runs = []
    for _ in range(100):
        forest = ExtraTreesClassifier(n_estimators=10).fit(X, y)
        runs.append((forest.score(Xtest, ytest), forest.feature_importances_))

    run_scores = [score for score, _ in runs]
    run_importances = [weights for _, weights in runs]
    return np.mean(run_importances, axis=0), np.mean(run_scores)
def extreme_tree(X_train, X_test, Y_train, Y_test):
    estimators = [10, 100, 500]
    criterion = ["gini", "entropy"]
    max_features = ["auto", "sqrt", "log2"]
    for est in estimators:
        for cr in criterion:
            for mf in max_features:
                extre_model = ExtraTreesClassifier(n_jobs=8,
                                                   random_state=np.random.RandomState(),
                                                   n_estimators=est,
                                                   criterion=cr,
                                                   max_features=mf)
                extre_model.fit(X_train, Y_train)
                score = extre_model.score(X_test, Y_test)
                print "ExtraTreesClassifier(n_jobs=8, random_state=np.random.RandomState(), n_estimators=%d, criterion=%s, max_features=%s) -> %.4f" % (est, cr, mf, score)
def trainClassifiersAndSave(computeScore=False):
    """Make sure every database in ``dbs`` has a stored classifier.

    When ``clfs/<db>`` does not exist, a new ``ExtraTreesClassifier`` is
    created and handed to ``saveTrainedClassifier`` (defined elsewhere;
    presumably it fits and persists the model - confirm). When the file
    exists, the model is loaded only if it is needed for scoring.

    :param computeScore: when true, also load ``<db>.csv`` and print the
        classifier's score, using the last column as the label
    """
    for db in dbs:
        model_path = "clfs/" + db
        if not os.path.exists(model_path):
            clf = ExtraTreesClassifier(n_estimators=100, random_state=0, n_jobs=-1, verbose=100)
            saveTrainedClassifier(db, clf)
        elif computeScore:
            clf = joblib.load(model_path)

        if not computeScore:
            continue

        print("Loading test data...")
        loaded = loadDB(db + ".csv")
        # every column but the last is a feature; the last is the label
        X_test, y_test = loaded[:, 0:-1], loaded[:, -1]

        print("Normalized score is {}".format(clf.score(X_test, y_test)))
        # drop the references to the slices before the next database
        X_test = y_test = 0
Beispiel #48
0
def extreamly_random_forest(train_data, predictors):
    """Pick the extra-trees size with the best 10-fold score, then submit.

    Tree counts from 1 to 99 are each scored as the average over a shuffled
    10-fold split; the best count and its score are printed. A new
    classifier with that count is then passed to ``create_submission``
    together with ``test_data``, which is read from the enclosing scope.

    :param train_data: table with the predictor columns and a "Survived"
        column
    :param predictors: names of the feature columns
    """
    features = train_data[predictors]
    target = train_data["Survived"]

    best_n, max_score = 0, 0
    for n in range(1, 100):
        model = ExtraTreesClassifier(n_estimators=n)
        fold_mean = 0.
        for train, test in KFold(len(train_data), n_folds=10, shuffle=True):
            # rows are selected through the transposed frame, as before
            model.fit(features.T[train].T, target.T[train].T)
            fold_mean += model.score(features.T[test].T, target.T[test].T)/10
        if fold_mean > max_score:
            max_score, best_n = fold_mean, n

    print(best_n, max_score)
    rfc = ExtraTreesClassifier(best_n)

    # Creating submission
    create_submission(rfc, train_data, test_data, predictors, "rfcsurvivors.csv")
Beispiel #49
0
def runLogitAndNB(Xtrainsparse, Xtestsparse):
	for i in range(len(ytrainraw[0])):
		print "Output type %i" % i
		logit1 = linear_model.LogisticRegression(multi_class='multinomial', solver='lbfgs', C=1)
		logit2 = linear_model.LogisticRegression(multi_class='multinomial', solver='lbfgs', C=100)
		logit3 = linear_model.LogisticRegression(multi_class='multinomial', solver='lbfgs', C=10000)
		nb1 = naive_bayes.MultinomialNB(alpha=0.01, fit_prior=True, class_prior=None)
		nb2 = naive_bayes.MultinomialNB(alpha=0.1, fit_prior=True, class_prior=None)
		nb3 = naive_bayes.MultinomialNB(alpha=1, fit_prior=True, class_prior=None)
		RF1 = RandomForestClassifier(1, "entropy", None)
		RF2 = RandomForestClassifier(10, "entropy", None)
		RF3 = RandomForestClassifier(20, "entropy", None)
		ytrain = numpy.hstack((ytrainraw[:, i], ydevraw[:, i]))
		ytest = ytestraw[:, i]
		RF1.fit(Xtrainsparse, ytrain)
		RF2.fit(Xtrainsparse, ytrain)
		RF3.fit(Xtrainsparse, ytrain)
		scores = [RF1.score(Xtestsparse, ytest), RF2.score(Xtestsparse, ytest), RF3.score(Xtestsparse, ytest)]
		print "R-FOREST: Best score %.2f%%, min of %.2f%%" % (max(scores) * 100, min(scores) * 100)
		ERF = ExtraTreesClassifier(n_estimators=40, max_depth=None, min_samples_split=1, random_state=0)
		ERF.fit(Xtrainsparse, ytrain)
		print "EXTRA TREES: Best score %.2f%%" % (ERF.score(Xtestsparse, ytest) * 100)
		nb1.fit(Xtrainsparse, ytrain)
		nb2.fit(Xtrainsparse, ytrain)
		nb3.fit(Xtrainsparse, ytrain)
		scores = [nb1.score(Xtestsparse, ytest), nb2.score(Xtestsparse, ytest), nb3.score(Xtestsparse, ytest)]
		print "MULTI-NB: Best score %.2f%%" % (max(scores) * 100)
		logit1.fit(Xtrainsparse, ytrain)
		logit2.fit(Xtrainsparse, ytrain)
		logit3.fit(Xtrainsparse, ytrain)
		scores = [logit1.score(Xtestsparse, ytest), logit2.score(Xtestsparse, ytest), logit3.score(Xtestsparse, ytest)]
		print "LOGIT: Best score %.2f%%" % (max(scores) * 100)
		most_common = lambda lst : max(set(list(lst)), key=list(lst).count)
		print "Most common class frequency: %.1f%% (train) %.1f%% (test)" % \
					(Counter(ytrain)[most_common(ytrain)] / float(len(ytrain)) * 100., \
					Counter(ytest)[most_common(ytest)] / float(len(ytest)) * 100.)
		print

	# ------------- Random Forest (Extra Trees) ---------------
	# Note, 2 procs is faster than 8
	truthLabels = np.array([int(x['label']) for x in p])
	from sklearn.ensemble import RandomForestClassifier	
	from sklearn.ensemble import ExtraTreesClassifier
	fCount = 5#featuresNorm.shape[1]
	# forest = ExtraTreesClassifier(n_estimators=10, compute_importances=False, n_jobs=4, bootstrap=False, random_state=0, max_features=1)#26)
	forest = ExtraTreesClassifier(n_estimators=100, compute_importances=True, n_jobs=7, bootstrap=True, random_state=0, max_features=fCount)
	# forest = RandomForestClassifier(n_estimators=30, compute_importances=True, n_jobs=4, bootstrap=True, random_state=0, max_features=10)#26)
	t0 = time.time()
	forest.fit(X, truthLabels)
	print "Time:", time.time()-t0
	importances = forest.feature_importances_
	forestScore = forest.score(X, truthLabels) # 100%
	predF = forest.predict(X)
	print forestScore
	if 1:
		figure(3)
		bar(range(fCount), importances, color='k')
		xticks(arange(.5, featuresNorm.shape[1]+.5), featureNames, fontsize=14)
		yticks(fontsize=12)
		title('Importance Weighting of Random Forest Features', fontsize=28)
		xlabel("Features", fontsize=22)
		ylabel("Weighting", fontsize=22)
		axis([-.25, fCount, 0, .2])

	# forest = RandomForestClassifier(n_estimators=200, compute_importances=False, n_jobs=1, bootstrap=True, random_state=0, max_features=fCount) # BEST
	# forest = RandomForestClassifier(n_estimators=100, compute_importances=False, n_jobs=1, bootstrap=True, random_state=0, max_features=fCount)	
	# forest = ExtraTreesClassifier(n_estimators=20, compute_importances=False, n_jobs=1, bootstrap=True, random_state=0, max_features=fCount)	
from math import *
import pandas as pd
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt
import re,os
data=pd.read_csv('red.csv')
x=data[['fixed acidity','volatile acidity','citric acid','residual sugar','chlorides','free sulfur dioxide','total sulfur dioxide','density','pH','sulphates','alcohol']]
y=data['quality']
clf=ExtraTreesClassifier(n_estimators=200, max_depth=None,min_samples_split=1, random_state=0)
clf.fit(x,y)
test=pd.read_csv('red_test.csv')
x=test[['fixed acidity','volatile acidity','citric acid','residual sugar','chlorides','free sulfur dioxide','total sulfur dioxide','density','pH','sulphates','alcohol']]
y=test['quality']
p=clf.predict(x)
print clf.score(x,y)
print clf.feature_importances_
t=np.arange(0.0,100.0)
plt.plot(t,test['quality'],'--',t,p,'-')
plt.show()
Beispiel #52
0
    n_features = X_train.shape[1]   
    print "XX:", n_features

    g =   1.0/float((3*n_features))
    print g

    print "Training."

 
    clf = RandomForestClassifier(n_estimators=850, max_depth=None, max_features=int(math.sqrt(n_features)), min_samples_split=100, random_state=144, n_jobs=4);
    clf.fit(X_train, y_train)
    print "Validation set score: RF " , clf.score(X_val, y_val)
 
    clf_etree = ExtraTreesClassifier(n_estimators=1000, max_depth=None, max_features=int(math.sqrt(n_features)), min_samples_split=100, random_state=144, n_jobs=4);
    clf_etree.fit(X_train, y_train)
    print "Validation set score: ERF " , clf_etree.score(X_val, y_val)

    clf_boost = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),algorithm="SAMME", n_estimators=500, random_state=74494, learning_rate=0.8) 
    clf_boost.fit(X_train, y_train)
    print "Validation set score: ABOOST " , clf_boost.score(X_val, y_val)


    #clf_gboost = GradientBoostingClassifier(n_estimators=int(reg), random_state=74494, learning_rate=0.2) 
    #clf_gboost.fit(X_train, y_train)
    #print "Validation set score:LR " , clf_gboost.score(X_val, y_val)


    print "Classifier:"
    print clf, clf.get_params()
    print clf_etree, clf_etree.get_params()
    print clf_boost, clf_boost.get_params()
Beispiel #53
0
        "kernel_window":kernel_window,
        "n_estimators":n_estimators,
        "clf_name":clf_name,
        "test_pdb":fold["test"].tolist(),
        "train_pdb":fold["train"].tolist(),
        "diag_window":diag_window,
        "ratio_TP_to_FP":ratio_TP_to_FP,
    }
    print fold["test"]

    print "Training ExtraTrees classifier"
    clf = Classifier(n_estimators=n_estimators,n_jobs=28,)
                     #class_weight='subsample')
                     #class_weight="auto") # ExtraTrees
    clf.fit(X_train,Y_train)
    stats["train_acc"] = clf.score(X_train, Y_train)

    print "Training complete"
    print 'Training Accuracy: %.3f'%stats["train_acc"]
    
    del X_train, Y_train
    gc.collect()

    # For testing, now load the entire dataset!
    X_test,Y_test = load_fold_dataset(fold["test"],load_all=True)

    stats["test_acc"] = clf.score(X_test, Y_test)
    print 'Testing Accuracy: %.3f'%stats["test_acc"]

    X_test_TP = X_test[Y_test==1]
    Y_test_TP = Y_test[Y_test==1]
    def use_pipeline_temporal(self):
        """Grid-search a TF-IDF + extra-trees pipeline and test on temporal data.

        All of ``X``/``y`` is used for training (``test_size=0.0``); the test
        documents and labels are read from the CSV at
        ``path_to_labelled_test_data_file_temporal``. After the grid search
        the vectoriser, the TF-IDF transformer and the classifier are
        rebuilt with the best parameters so that the fitted classifier and
        count vectoriser can be returned. ``X``, ``y``, ``stopwords`` and the
        path are read from the enclosing scope, not from ``self``.

        The final classifier is now built from the tuned ``clf__*`` values.
        It used to be a default ``ExtraTreesClassifier()``, which ignored
        the grid-search result although the printed message announces a fit
        "with best parameters".

        :return: tuple ``(clf, count_vect)`` - the fitted classifier and the
            fitted ``CountVectorizer``
        """

        docs_train, docs_test, y_train, y_test = train_test_split(X, y, test_size=0.0, random_state=42)  # docs_test and y_test will be overwritten

        dataset_test = pd.read_csv(path_to_labelled_test_data_file_temporal, header=0, names=['posts', 'class'])

        docs_test = dataset_test['posts']
        y_test = dataset_test['class']

        #####################
        # Build a vectorizer / classifier pipeline that filters out tokens that are too rare or too frequent
        #####################

        pipeline = Pipeline([
            ('vect', TfidfVectorizer(stop_words=stopwords, min_df=3, max_df=0.90)),
            ('clf', ExtraTreesClassifier()),
        ])

        # Build a grid search to find the best parameter
        # Fit the pipeline on the training set using grid search for the parameters
        parameters = {
            'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
            'vect__use_idf': (True, False),
            'clf__n_estimators': (50, 100),
            # 'clf__criterion': ("gini", "entropy"),
            # 'clf__max_depth': (None, 2, 4),
            # 'clf__min_samples_split': (2, 4, 6),
        }

        #################
        # Exhaustive search over the specified parameter values; cv generates the train/validation splits.
        # Fitting evaluates every parameter combination and retains the best one.
        #################

        cv = StratifiedShuffleSplit(y_train, n_iter=5, test_size=0.2, random_state=42)
        grid_search = GridSearchCV(pipeline, param_grid=parameters, cv=cv, n_jobs=-1)
        clf_gs = grid_search.fit(docs_train, y_train)

        ###############
        # print the best parameter set found by the grid search and its cross-validated score
        ###############

        best_parameters, score, _ = max(clf_gs.grid_scores_, key=lambda x: x[1])
        for param_name in sorted(parameters.keys()):
            print("%s: %r" % (param_name, best_parameters[param_name]))

        print("Score for gridsearch is %0.2f" % score)

        ###############
        # run the classifier again with the best parameters
        # in order to get 'clf' for get_important_feature function!
        ###############

        ngram_range = best_parameters['vect__ngram_range']
        use_idf = best_parameters['vect__use_idf']
        # Strip the pipeline prefix so the tuned values can be passed
        # straight to the ExtraTreesClassifier constructor.
        clf_params = dict(
            (name[len('clf__'):], value)
            for name, value in best_parameters.items()
            if name.startswith('clf__')
        )

        # vectorisation

        count_vect = CountVectorizer(stop_words=stopwords, min_df=3, max_df=0.90, ngram_range=ngram_range)
        X_CV = count_vect.fit_transform(docs_train)

        # print number of unique words (n_features)
        print("Shape of train data is " + str(X_CV.shape))

        # tfidf transformation

        tfidf_transformer = TfidfTransformer(use_idf=use_idf)
        X_tfidf = tfidf_transformer.fit_transform(X_CV)

        # train the classifier

        print("Fitting data with best parameters ...")
        clf = ExtraTreesClassifier(**clf_params).fit(X_tfidf, y_train)

        ##################
        # get cross validation score
        ##################

        scores = cross_val_score(clf, X_tfidf, y_train, cv=10, scoring='f1_weighted')
        print("Cross validation score: " + str(scores))

        # Get average performance of classifier on training data using 10-fold CV, along with standard deviation

        print("Cross validation accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

        ##################
        # run classifier on test data
        ##################

        X_test_CV = count_vect.transform(docs_test)

        X_test_tfidf = tfidf_transformer.transform(X_test_CV)

        y_predicted = clf.predict(X_test_tfidf)

        # print the mean accuracy on the given test data and labels

        print("Classifier score on test data is: %0.2f " % clf.score(X_test_tfidf, y_test))

        # Print the classification report and the confusion matrix

        print(metrics.classification_report(y_test, y_predicted))
        cm = metrics.confusion_matrix(y_test, y_predicted)
        print(cm)

        return clf, count_vect
    def use_pipeline_with_fs(self):
        """Grid-search a TF-IDF + feature-selection + extra-trees pipeline.

        After the grid search the vectoriser, TF-IDF transformer, percentile
        selector and classifier are rebuilt with the best parameters. The
        selector's boolean support mask is written, one value per line, to
        ``path_to_store_feature_selection_boolean_file``, and the classifier
        is evaluated on the test documents. ``docs_train``, ``y_train``,
        ``docs_test``, ``y_test``, ``stopwords`` and the output path are read
        from the enclosing scope, not from ``self``.

        Two fixes compared with the previous version: the final classifier
        is built from the tuned ``clf__*`` values instead of a default
        ``ExtraTreesClassifier()``, and the mask file is written inside a
        ``with`` block so the handle is closed even if a write fails.

        :return: tuple ``(clf, count_vect)`` - the fitted classifier and the
            fitted ``CountVectorizer``
        """

        #####################
        #Build a vectorizer / classifier pipeline that filters out tokens that are too rare or too frequent
        #####################

        pipeline = Pipeline([
                ('vect', TfidfVectorizer(stop_words=stopwords, min_df=3, max_df=0.90)),
                ("selector", SelectPercentile()),
                ('clf', ExtraTreesClassifier()),
        ])


        # Build a grid search to find the best parameter
        # Fit the pipeline on the training set using grid search for the parameters
        parameters = {
            'vect__ngram_range': [(1,1), (1,2), (1,3)],
            'vect__use_idf': (True, False),
            'clf__n_estimators': (50,100),
            'clf__criterion': ("gini", "entropy"),
            'clf__max_depth': (None,2,4),
            'clf__min_samples_split': (2,4,6),
            'selector__score_func': (chi2, f_classif),
            'selector__percentile': (85, 95, 100),
        }

        #################
        # Exhaustive search over the specified parameter values; cv generates the train/validation splits.
        # Fitting evaluates every parameter combination and retains the best one.
        #################

        cv = StratifiedShuffleSplit(y_train, n_iter=5, test_size=0.2, random_state=42)
        grid_search = GridSearchCV(pipeline, param_grid=parameters, cv=cv, n_jobs=-1)
        clf_gs = grid_search.fit(docs_train, y_train)

        ###############
        # print the best parameter set found by the grid search and its cross-validated score
        ###############

        best_parameters, score, _ = max(clf_gs.grid_scores_, key=lambda x: x[1])
        for param_name in sorted(parameters.keys()):
            print("%s: %r" % (param_name, best_parameters[param_name]))

        print("Score for gridsearch is %0.2f" % score)

        ###############
        # run the classifier again with the best parameters
        # in order to get 'clf' for get_important_feature function!
        ###############

        ngram_range = best_parameters['vect__ngram_range']
        use_idf = best_parameters['vect__use_idf']
        score_func = best_parameters['selector__score_func']
        percentile = best_parameters['selector__percentile']
        # Strip the pipeline prefix so the tuned values can be passed
        # straight to the ExtraTreesClassifier constructor.
        clf_params = dict(
            (name[len('clf__'):], value)
            for name, value in best_parameters.items()
            if name.startswith('clf__')
        )

        # vectorisation

        count_vect = CountVectorizer(stop_words=stopwords, min_df=3, max_df=0.90, ngram_range=ngram_range)
        X_CV = count_vect.fit_transform(docs_train)

        # print number of unique words (n_features)
        print ("Shape of train data is "+str(X_CV.shape))

        # tfidf transformation

        tfidf_transformer = TfidfTransformer(use_idf=use_idf)
        X_tfidf = tfidf_transformer.fit_transform(X_CV)


        #################
        # feature selection
        #################

        selector = SelectPercentile(score_func=score_func, percentile=percentile)

        # The three steps are chained so train and test data go through the
        # same transformations; fit_transform below refits every step.
        combined_features = Pipeline([
            ("vect", count_vect),
            ("tfidf", tfidf_transformer),
            ("feat_select", selector)
        ])

        X_features = combined_features.fit_transform(docs_train,y_train)
        X_test_features = combined_features.transform(docs_test)

        print ("Shape of train data after feature selection is "+str(X_features.shape))
        print ("Shape of test data after feature selection is "+str(X_test_features.shape))


        # run classifier on selected features, using the tuned settings

        clf = ExtraTreesClassifier(**clf_params).fit(X_features, y_train)

        # get the features which are selected and write to file

        feature_boolean = selector.get_support(indices=False)

        with open(path_to_store_feature_selection_boolean_file,'w') as f:
            for fb in feature_boolean:
                f.write(str(fb)+'\n')


        ##################
        # get cross validation score
        ##################

        scores = cross_val_score(clf, X_features, y_train, cv=10, scoring='f1_weighted')
        print ("Cross validation score: "+str(scores))

        # Get average performance of classifier on training data using 10-fold CV, along with standard deviation

        print("Cross validation accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))


        #################
        # run classifier on test data
        #################

        y_predicted = clf.predict(X_test_features)

        # print the mean accuracy on the given test data and labels

        print ("Classifier score on test data is: %0.2f " % clf.score(X_test_features,y_test))


        # Print the classification report and the confusion matrix

        print(metrics.classification_report(y_test, y_predicted))
        cm = metrics.confusion_matrix(y_test, y_predicted)
        print(cm)

        return clf,count_vect
    def train_classifier_use_feature_selection(self):
        """Train extra trees on percentile-selected TF-IDF features.

        The training documents are vectorised, TF-IDF weighted and reduced
        with ``SelectPercentile`` before an ``ExtraTreesClassifier`` is
        fitted. The selector's boolean support mask is written, one value
        per line, to ``path_to_store_feature_selection_boolean_file``.
        Cross-validation scores and test-set results are printed. The
        configuration (``_ngram_range``, ``_use_idf``, ``_score_func``,
        ``_percentile``, ``_n_estimators``, ``_criterion``, ``_max_depth``,
        ``_min_samples_split``) and the data (``docs_train``, ``y_train``,
        ``docs_test``, ``y_test``) are read from the enclosing scope, not
        from ``self``.

        The mask file is now written inside a ``with`` block so the handle is
        closed even if a write fails.

        :return: tuple ``(clf, count_vect)`` - the fitted classifier and the
            fitted ``CountVectorizer``
        """

        # Get list of features
        count_vect = CountVectorizer(stop_words=stopwords, min_df=3, max_df=0.90, ngram_range=_ngram_range)
        X_CV = count_vect.fit_transform(docs_train)


        # print number of unique words (n_features)
        print ("Shape of train data is "+str(X_CV.shape))

        # tfidf transformation

        tfidf_transformer = TfidfTransformer(use_idf=_use_idf)
        X_tfidf = tfidf_transformer.fit_transform(X_CV)

        #################
        # feature selection
        #################

        selector = SelectPercentile(score_func=_score_func, percentile=_percentile)

        print ("Fitting data with feature selection ...")
        selector.fit(X_tfidf, y_train)


        # get how many features are left after feature selection
        X_features = selector.transform(X_tfidf)

        print ("Shape of array after feature selection is "+str(X_features.shape))

        clf = ExtraTreesClassifier(n_estimators=_n_estimators, criterion=_criterion, max_depth=_max_depth, min_samples_split=_min_samples_split).fit(X_features, y_train)

        # get the features which are selected and write to file

        feature_boolean = selector.get_support(indices=False)

        with open(path_to_store_feature_selection_boolean_file,'w') as f:
            for fb in feature_boolean:
                f.write(str(fb)+'\n')


        ##################
        # get cross validation score
        ##################

        scores = cross_val_score(clf, X_features, y_train, cv=10, scoring='f1_weighted')
        print ("Cross validation score: "+str(scores))

        # Get average performance of classifier on training data using 10-fold CV, along with standard deviation

        print("Cross validation accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))


        ####################
        #test clf on test data
        ####################

        X_test_CV = count_vect.transform(docs_test)

        print ("Shape of test data is "+str(X_test_CV.shape))

        X_test_tfidf = tfidf_transformer.transform(X_test_CV)

        # apply feature selection on test data too
        X_test_selector = selector.transform(X_test_tfidf)
        print ("Shape of array for test data after feature selection is "+str(X_test_selector.shape))

        y_predicted = clf.predict(X_test_selector)

        # print the mean accuracy on the given test data and labels

        print ("Classifier score on test data is: %0.2f " % clf.score(X_test_selector,y_test))


        print(metrics.classification_report(y_test, y_predicted))
        cm = metrics.confusion_matrix(y_test, y_predicted)
        print(cm)

        return clf, count_vect
Beispiel #57
0
print(scores.mean())
submission(TestK, y_pred, name="ForestUpdatedGini.csv")


# ## Extremely Randomized Trees
#     this one led to .8134 on Kaggle with the training set split 70:30

# In[431]:

from sklearn.ensemble import ExtraTreesClassifier

# Fit an extremely-randomized-trees ensemble on the training split and predict
# labels for TestK (the data handed to `submission` at the end of this cell).
#
# min_samples_split is 2 rather than 1: recent scikit-learn releases reject an
# integer value below 2 with a ValueError, and a node holding a single sample
# cannot be split anyway, so 2 still grows the trees fully.
ert = ExtraTreesClassifier(
    n_estimators=100,
    criterion="entropy",
    max_depth=None,
    min_samples_split=2,
    random_state=0,
)
ert.fit(X_train, y_train)
y_pred = ert.predict(TestK)

# Accuracy on the data the model was fitted on versus the X_test/y_test split.
print('Training accuracy:', ert.score(X_train, y_train))
print('Test accuracy:', ert.score(X_test, y_test))

# Cross-validated score on the training split (library-default folds/scoring).
scores = cross_val_score(ert, X_train, y_train)
print(scores.mean())

submission(TestK, y_pred, name="ERDupdate.csv")


# ## Logistic regression

# In[207]:

from sklearn.linear_model import LogisticRegression

# L1-penalised logistic regression with C=0.1.
# The solver is named explicitly because the L1 penalty is only supported by
# some solvers ('liblinear', 'saga'); 'liblinear' was the library default when
# this was written, but the current default solver does not accept
# penalty='l1' and would fail at fit time.
# (A second, unassigned LogisticRegression(penalty='l1') call that used to
# precede this line was a no-op and has been dropped.)
lr = LogisticRegression(penalty='l1', C=0.1, solver='liblinear')
Beispiel #58
0
def train_model():
    """Build the labelled corpus, train an ExtraTrees classifier and save it.

    Steps, in the order they run:

    1. Query ``feat`` for the TIL, false-positive and wiki corpus sizes and
       derive the two skip intervals (``skip_FP`` and ``skip_wiki_n``).
    2. If ``FLAG_BUILD_DECOY_LIST`` is set, call ``build_skip_query``.
    3. Load the Word2Vec features and assemble the sample list from the
       TIL corpus, the subsampled false positives and the decoys.
    4. Label the first ``TIL_n`` samples 1.0 and all others 0.0, then split
       80/20 into train and test.
    5. Turn each ``(text, weight)`` sample into word vectors, fit a
       ``StandardScaler`` on the training vectors only, dump it to
       ``f_norm_scale`` and scale both splits with it.
    6. Fit the classifier, print overall / positive-only / negative-only
       test accuracy, dump the classifier to ``f_clf``.
    7. Plot the ROC curve of the test split and call ``plt.show()``.

    Takes no arguments and returns ``None``; all results are printed,
    written with ``joblib.dump`` or shown as a plot.

    Every ``print`` is written as a call with a single argument so the
    function produces the same output on Python 2 and Python 3.
    """
    TIL_n = feat.count_TIL_corpus()
    decoy_n = TIL_n * _DECOY_PROPORTION
    FP_n = feat.count_TIL_false_pos()

    wiki_n = feat.count_WIKI_corpus()
    skip_wiki_n = wiki_n // decoy_n

    # Keep the number of false positives in about the same order of magnitude
    # as the number of TIL samples.
    skip_FP = FP_n // TIL_n
    print("Skipping every {} value in FP".format(skip_FP))

    if FLAG_BUILD_DECOY_LIST:
        build_skip_query(skip_wiki_n)

    print("Loading features")
    features = Word2Vec.load(feat.f_features)
    dimension = 100  # default dimension

    ITR_decoy = query_skip_decoys()

    print("Building training set")
    ITR_train = list(feat.TIL_full_corpus_iter())

    print("Building the false positive set")
    ITR_FP = list(feat.TIL_false_pos_iter(skip_FP))

    print("Building corpus iter")
    ITR = list(feat.chainer(ITR_train, ITR_FP, ITR_decoy))

    # Labels are positional: the first TIL_n samples are the positives.
    # NOTE(review): assumes feat.chainer yields ITR_train first and that
    # len(ITR_train) == TIL_n -- confirm against feat.
    Y = np.zeros(len(ITR))
    Y[:TIL_n] = 1.0

    x_train, x_test, y_train, y_test = train_test_split(ITR, Y,
                                                        test_size=0.2)

    print("Proportion of answers {}/{}".format(y_train.sum(), y_test.sum()))

    def vectorize(samples):
        # Each sample unpacks to (text, weight); the per-sample arrays from
        # getWordVecs are concatenated into one array.
        return np.concatenate([getWordVecs(text, weight, features, dimension)
                               for text, weight in samples])

    print("Calculating the wordVecs for train")
    vec_train = vectorize(x_train)

    # The scaler is fitted on the training vectors only and reused unchanged
    # for the test vectors below.
    print("Building the scalar")
    scaler = preprocessing.StandardScaler().fit(vec_train)

    print("Saving the scaler")
    joblib.dump(scaler, f_norm_scale)

    print("Scaling train vectors")
    vec_train = scaler.transform(vec_train)

    print("Calculating the wordVecs for test")
    vec_test = vectorize(x_test)

    print("Scaling test vectors")
    vec_test = scaler.transform(vec_test)

    print("Train size/TP in sample {} {}".format(vec_train.shape,
                                                 (y_train == 1).sum()))
    print("Test  size/TP in sample {} {}".format(vec_test.shape,
                                                 (y_test == 1).sum()))
    print("Training classifer")

    # The classifier is imported under a neutral alias so alternatives can be
    # swapped in by changing this one line. Previously tried here:
    #   sklearn.linear_model: SGDClassifier (loss='log', penalty='l1',
    #       verbose=2), LogisticRegression (C=2500, verbose=2), BayesianRidge
    #   sklearn.naive_bayes: GaussianNB, BernoulliNB (default arguments;
    #       "seems to be the best... but high FP rate")
    #   sklearn.ensemble: RandomForestClassifier
    from sklearn.ensemble import ExtraTreesClassifier as Classifier

    clf = Classifier(n_estimators=200, n_jobs=8)  # ExtraTrees
    clf.fit(vec_train, y_train)

    print('Test Accuracy: %.3f' % clf.score(vec_test, y_test))

    # Accuracy restricted to the positive test samples ...
    idx_TP = np.array(y_test) > 0
    vec_TP = np.array(vec_test)[idx_TP]
    y_TP = np.array(y_test)[idx_TP]
    print('Test Accuracy on TP: %.3f' % clf.score(vec_TP, y_TP))

    # ... and to the remaining (label 0) test samples.
    vec_FP = np.array(vec_test)[~idx_TP]
    y_FP = np.array(y_test)[~idx_TP]
    print('Test Accuracy on FP: %.3f' % clf.score(vec_FP, y_FP))

    print("Saving the classifer")
    joblib.dump(clf, f_clf)

    # Create ROC curve from the predicted probability of class index 1.
    from sklearn.metrics import roc_curve, auc
    import matplotlib.pyplot as plt

    pred_probas = clf.predict_proba(vec_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, pred_probas)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label='area = %.2f' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.legend(loc='lower right')
    plt.show()
Beispiel #59
0
# Split X/Y into train and test parts (17% held out), scale both with the
# already-fitted `scalar`, train an ExtraTrees classifier and report its
# accuracy overall and per status class.
# Every print is a single-argument call so the script produces the same
# output on Python 2 and Python 3.
TTS = train_test_split
x_train, x_test, y_train, y_test = TTS(X, Y, test_size=0.17)

print("Scaling train vectors")
x_train = scalar.transform(x_train)

print("Scaling text vectors")
x_test = scalar.transform(x_test)

print("Training classifer")
from sklearn.ensemble import ExtraTreesClassifier as Classifier

clf = Classifier(n_estimators=200, n_jobs=8)  # ExtraTrees
clf.fit(x_train, y_train)

print('Test Accuracy: %.3f' % clf.score(x_test, y_test))

# Per-class accuracy: score only the test rows whose label equals each key of
# _INV_STATUS_MAP, and print it under the name that key maps to.
y_test = np.array(y_test)
for n in _INV_STATUS_MAP:
    idx = y_test == n
    try:
        score = clf.score(x_test[idx], y_test[idx])
    except Exception:
        # Best effort: report -1 when a class cannot be scored (presumably
        # when it has no rows in the test split). `Exception` rather than a
        # bare except so Ctrl-C / SystemExit still stop the script.
        score = -1
    print('Test Accuracy on {}: {:0.3f}'.format(_INV_STATUS_MAP[n], score))

# Blank separator line before the next section of output.
print("")
print("Suggesting some new entries")