Beispiel #1
1
def predict_TestData(Food_df,People_df):
    """Fit two classifiers on a random part of the rows and save test predictions.

    Rows of ``People_df`` are labelled 1 and rows of ``Food_df`` are labelled 0.
    Each frame is split by its own random boolean mask; an ExtraTreesClassifier
    and a LinearSVC are fitted on the training part, and their predictions for
    the held-out part are written to ``prediction_for_TestData.csv`` together
    with the true label and the ``urls`` column.

    Args:
        Food_df: frame of negative examples; features are taken from the third
            column onwards.
        People_df: frame of positive examples, sliced the same way.

    Returns:
        None. The only output is the CSV file.
    """
    # Masks are drawn in the original order: food first, then people.
    cTrainF = rand(len(Food_df)) > .5
    cTrainP = rand(len(People_df)) > .5

    # Apply each mask once and reuse the subsets for both X and Y.
    people_train, people_test = People_df[cTrainP], People_df[~cTrainP]
    food_train, food_test = Food_df[cTrainF], Food_df[~cTrainF]

    TrainX_df = pd_concat([people_train, food_train],axis=0)
    TestX_df = pd_concat([people_test, food_test],axis=0)

    # Positional slice. The .ix indexer no longer exists in current pandas;
    # .iloc picks the same columns as long as column labels are not integers.
    TrainX = TrainX_df.iloc[:,2:].values
    TestX = TestX_df.iloc[:,2:].values
    TrainY = concatenate([ones(len(people_train)), zeros(len(food_train))])
    TestY = concatenate([ones(len(people_test)), zeros(len(food_test))])

    # 2 is the smallest min_samples_split scikit-learn accepts; a one-sample
    # node can never be split, so the trees are the same as with 1.
    ET_classifier = ExtraTreesClassifier(n_estimators=50, max_depth=None, min_samples_split=2, random_state=0)
    ET_classifier.fit(TrainX,TrainY)
    ET_prediction = ET_classifier.predict(TestX)

    LinSVC_classifier = svm.LinearSVC()
    LinSVC_classifier.fit(TrainX,TrainY)
    LinSVC_predict = LinSVC_classifier.predict(TestX)

    a = DataFrame()
    a["url"] = TestX_df.urls.values
    a["answer"] = TestY
    a["ET_predict"] = ET_prediction
    a["LinSVC_predict"] = LinSVC_predict
    a.to_csv("prediction_for_TestData.csv")
def crossVal(positions, X, y, missedYFile):
    """Run 4 stratified shuffle splits and log '0' samples predicted as '1'.

    For every split an ExtraTreesClassifier is fitted on the training part,
    a classification report for the test part is printed, and each test
    sample whose true label is the string '0' but whose prediction is the
    string '1' is written to ``missedYFile`` as ``<position>\\t<round>``.

    Args:
        positions: one identifier per sample; concatenated with a tab, so the
            entries are expected to be strings.
        X: feature array, indexable by an integer index array.
        y: label array, indexable the same way.
        missedYFile: path of the output file (overwritten).
    """
    # The context manager closes the file even when fitting or scoring raises.
    with open(missedYFile, 'w') as outF:
        posArray = np.array(positions)
        # Split into training and test.
        # NOTE(review): passing y and the iteration count to the constructor
        # and iterating the splitter directly is the older scikit-learn
        # cross_validation interface - confirm the installed version.
        sss = StratifiedShuffleSplit(y, 4, test_size=0.1, random_state=442)
        for cvRound, (train_index, test_index) in enumerate(sss):
            clf = ExtraTreesClassifier(n_estimators=300,
                                       random_state=13,
                                       bootstrap=True,
                                       max_features=20,
                                       # smallest accepted value; the leaf
                                       # size below is the binding limit
                                       min_samples_split=2,
                                       max_depth=8,
                                       min_samples_leaf=13,
                                       n_jobs=4
                                       )
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            pos_test = posArray[test_index]

            clf.fit(X_train, y_train)
            # Predict once and reuse the result for the report and the log.
            preds = clf.predict(X_test)
            print(metrics.classification_report(y_test, preds))
            for loc, t, p in zip(pos_test, y_test, preds):
                if t == '0' and p == '1':
                    outF.write(loc + '\t' + str(cvRound) + '\n')
def train_UsingExtraTreesClassifier(df,header,x_train, y_train,x_test,y_test) :
    """Fit an extra-trees model, report its fit and plot feature importances.

    Args:
        df: frame whose column labels (minus the last entry of ``header``)
            are used as tick labels for the importance plot.
        header: sequence of column names; only its last element is read.
        x_train, y_train: training features and labels.
        x_test, y_test: held-out features and labels used for scoring.

    Returns:
        The class predictions for ``x_test``.
    """
    # 'gini' was kept over 'entropy': it gave a marginally better fit both in
    # sample and out of sample.
    # NOTE(review): compute_importances only exists in old scikit-learn
    # releases - confirm the installed version still accepts it.
    clf = ExtraTreesClassifier(n_estimators=200,random_state=0,criterion='gini',bootstrap=True,oob_score=1,compute_importances=True)
    clf.fit(x_train, y_train)

    # Goodness of fit on the held-out data and the out-of-bag estimate.
    print("Estimation of goodness of fit using the ExtraTreesClassifier is : %f  \n" % clf.score(x_test,y_test))
    print("Estimation of out of bag score  using the ExtraTreesClassifier is : %f \n \n  " % clf.oob_score_)

    y_test_predicted = clf.predict(x_test)

    # Every column label except the last header entry. Index.difference is
    # the explicit form of the set difference the `-` operator used to do.
    # NOTE(review): the labels come back sorted; this assumes x_train's
    # columns are in that same order - TODO confirm.
    feature_names = df.columns.difference([header[-1]])

    # On a scale of 10: importances relative to the largest one.
    feature_importance = clf.feature_importances_
    feature_importance = 10.0 * (feature_importance / feature_importance.max())
    sorted_idx = np.argsort(feature_importance)
    # Centre each bar on a half-integer position.
    pos = np.arange(sorted_idx.shape[0]) + .5
    plt.figure(figsize=(12, 6))
    plt.subplot(1, 1, 1)
    plt.barh(pos, feature_importance[sorted_idx], align='center')
    plt.yticks(pos, feature_names[sorted_idx])
    plt.xlabel('Relative Importance')
    plt.title('Variable Importance')
    plt.show()
    return y_test_predicted
class Identifier:
	"""Recognise an item in an image from ORB descriptors.

	Each descriptor is classified separately and the most frequently
	predicted class that is also in the caller's candidate set wins.
	"""
	def __init__(self,grabable = None,clf = None):
		"""Set up the ORB detector and use or train the classifier.

		grabable: optional set stored on the instance (currently unused).
		clf: a fitted classifier; when falsy a new one is trained from
			'labels.pkl' and 'features.pkl'.
		"""
		# A fresh set per instance: a set literal as the default would be
		# shared by every Identifier created without the argument.
		self.grabable = set() if grabable is None else grabable #TODO if we care to, not used at the mo
		self.orb = cv2.ORB(nfeatures = 1000)#,nlevels = 20, scaleFactor = 1.05)
		self.items = [ "champion_copper_plus_spark_plug", "cheezit_big_original","crayola_64_ct", "dove_beauty_bar", "elmers_washable_no_run_school_glue","expo_dry_erase_board_eraser", "feline_greenies_dental_treats","first_years_take_and_toss_straw_cups", "genuine_joe_plastic_stir_sticks","highland_6539_self_stick_notes", "kong_air_dog_squeakair_tennis_ball","kong_duck_dog_toy", "kong_sitting_frog_dog_toy", "kygen_squeakin_eggs_plush_puppies","mark_twain_huckleberry_finn", "mead_index_cards","mommys_helper_outlet_plugs","munchkin_white_hot_duck_bath_toy", "one_with_nature_soap_dead_sea_mud","oreo_mega_stuf", "paper_mate_12_count_mirado_black_warrior","rollodex_mesh_collection_jumbo_pencil_cup", "safety_works_safety_glasses", "sharpie_accent_tank_style_highlighters", "stanley_66_052" ]
		if not clf:
			print("Training new classifier")
			# min_samples_split=2 is the smallest value scikit-learn accepts
			# and grows the same trees as 1.
			self.clf = ExtraTreesClassifier(min_samples_split = 2,n_jobs = -1,n_estimators = 150, class_weight = 'subsample')
			# NOTE(review): both files are unpickled - load trusted files only.
			labels = np.ascontiguousarray(joblib.load('labels.pkl'))
			features = np.ascontiguousarray(joblib.load('features.pkl'), dtype = np.float64)
			features = preprocessing.scale(features)
			self.clf.fit(features,labels)
		else:
			self.clf = clf

	def identify(self,im,possibilites):
		"""Return the best-matching class among `possibilites`.

		im: image handed to ORB, or None.
		possibilites: set of admissible class values.

		Returns the winning class, -1 when nothing admissible was predicted
		or the image yields no descriptors, and None when `im` is None.
		"""
		import sys

		if im is None:
			print("Image to recognize is None")
			return None

		kpTest, desTest = self.orb.detectAndCompute(im,None)
		if desTest is None:
			# No keypoints were found, so there is nothing to classify.
			return -1

		pred = self.clf.predict(preprocessing.scale(np.array(desTest,dtype = np.float64)))
		c = Counter(pred)
		# Admissible classes, most frequently predicted first.
		ranked = sorted(set(c.keys())&possibilites, key = lambda k: c[k],reverse = True)
		if not ranked:
			return -1

		item = ranked[0]
		# Name followed by a space and no newline, so successive results
		# share one output line.
		sys.stdout.write(self.items[item] + " ")
		return item
def automatic_bernulli():
    """Print repeated 6-fold accuracy and a permutation test for k = 3..15.

    Reads the signs CSV, uses the 'fight' column as the label and every
    column except 'match', 'city', 'date' and 'fight' as features. For each
    number of selected features an ExtraTreesClassifier is cross-validated
    1000 times, the mean accuracy is printed, and a permutation test score
    is printed.
    """
    data = pd.read_csv('/home/vasiliy/Study/StadiumProject/Classifier/signs.csv', sep=';')
    # np.array copies, so shuffling the labels leaves the frame untouched.
    Y = np.array(data['fight'].values)
    # NOTE(review): the labels are shuffled before any training, which breaks
    # their link to the feature rows - confirm this baseline is intended.
    np.random.shuffle(Y)
    data.drop(['match', 'city', 'date', 'fight'], axis=1, inplace=True)

    X = data.values

    for features_number in range(3, 16):
        # NOTE(review): features are selected on the full data set before
        # cross-validation - confirm that is acceptable here.
        X_new = SelectKBest(f_classif, k=features_number).fit_transform(X, Y)
        classifier = ExtraTreesClassifier()
        super_means = []
        for _ in range(1000):
            kf = KFold(len(X_new), n_folds=6, shuffle=True)
            means = []
            for training, testing in kf:
                classifier.fit(X_new[training], Y[training])
                prediction = classifier.predict(X_new[testing])
                means.append(np.mean(prediction == Y[testing]))
            super_means.append(np.mean(means))
        print('features_number= {} Mean accuracy: {:.1%} '.format(
                features_number, np.mean(super_means)))
        # The permutation test reuses the folds of the last repetition.
        score, permutation_scores, pvalue = permutation_test_score(classifier, X_new, Y, scoring="accuracy", cv=kf,
                                                                n_permutations=len(Y), n_jobs=1)
        print ("Classification score %s (pvalue : %s)" % (score, pvalue))
def ERFC_Classifier(X_train, X_cv, X_test, Y_train,Y_cv,Y_test, Actual_DS):
    """Fit an extra-trees model and return class probabilities for three sets.

    Prints the accuracy on the validation set, a crosstab of decoded actual
    versus predicted labels with a 'pct' column, the validation log loss and
    the elapsed time.

    Args:
        X_train, Y_train: training data.
        X_cv, Y_cv: validation data used for the printed diagnostics.
        X_test: test features (``Y_test`` is accepted but not read).
        Actual_DS: further feature set to score.

    Returns:
        Three DataFrames with ``predict_proba`` output for ``X_cv``,
        ``X_test`` and ``Actual_DS``, in that order.
    """
    print("***************Starting Extreme Random Forest Classifier***************")
    started = time()
    model = ExtraTreesClassifier(n_estimators=100,n_jobs=-1)
    model.fit(X_train, Y_train)
    cv_labels = model.predict(X_cv)
    cv_accuracy = model.score(X_cv,Y_cv)

    print("Extreme Random Forest Classifier - {0:.2f}%".format(100 * cv_accuracy))
    actual_names = label_enc.inverse_transform(Y_cv)
    predicted_names = label_enc.inverse_transform(cv_labels)
    Summary = pd.crosstab(actual_names, predicted_names,
                          rownames=['actual'], colnames=['preds'])
    row_totals = Summary.sum(axis=1)
    # NOTE(review): the row totals are aligned on the columns (axis=1);
    # confirm this is the intended normalisation.
    Summary['pct'] = Summary.divide(row_totals, axis=1).max(axis=1)*100
    print(Summary)

    # Log loss on the validation probabilities.
    epsilon = 1e-15
    cv_proba = model.predict_proba(X_cv)
    print(log_loss(Y_cv, cv_proba, eps=epsilon, normalize=True))
    print("done in %0.3fs" % (time() - started))

    test_proba = model.predict_proba(X_test)
    actual_proba = model.predict_proba(Actual_DS)

    print("***************Ending Extreme Random Forest Classifier***************")
    return pd.DataFrame(cv_proba) , pd.DataFrame(test_proba),pd.DataFrame(actual_proba)
def extratreeclassifier(input_file,Output,test_size):
    """Train/test an extra-trees model on a CSV file and write a report.

    The first CSV column is the label, the following columns (all but the
    last one in the file) are features. Metrics are printed and written to
    ``<Output>_Extremely_Random_Forest_metrics_test.txt`` followed by one
    ``true,predicted,index`` line per test sample; a confusion-matrix plot
    is produced through ``plot_confusion_matrix``.

    Args:
        input_file: path of the comma-separated input file.
        Output: prefix for the report and the plot file names.
        test_size: forwarded to ``train_test_split``.
    """
    lvltrace.lvltrace("LVLEntree dans extratreeclassifier split_test")
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    print("%s %s" % (X_train.shape, X_test.shape))
    clf = ExtraTreesClassifier(n_estimators=10)
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)

    # Each metric is computed once and reused for the console and the file.
    print("Extremely Randomized Trees")
    accuracy = metrics.accuracy_score(y_test, y_pred)
    print("classification accuracy: %s" % accuracy)
    precision = metrics.precision_score(y_test, y_pred)
    print("precision: %s" % precision)
    recall = metrics.recall_score(y_test, y_pred)
    print("recall: %s" % recall)
    f1 = metrics.f1_score(y_test, y_pred)
    print("f1 score: %s" % f1)
    print("\n")

    results = Output+"_Extremely_Random_Forest_metrics_test.txt"
    # `with` closes the report even if a write fails.
    with open(results, "w") as report:
        report.write("Extremely Random Forest Classifier estimator accuracy\n")
        report.write("Classification Accuracy Score: %f\n"%accuracy)
        report.write("Precision Score: %f\n"%precision)
        report.write("Recall Score: %f\n"%recall)
        report.write("F1 Score: %f\n"%f1)
        report.write("\n")
        report.write("True Value, Predicted Value, Iteration\n")
        for n, (actual, predicted) in enumerate(zip(y_test, y_pred), 1):
            report.write("%f,%f,%i\n"%(actual,predicted,n))

    title = "Extremely Randomized Trees %f"%test_size
    save = Output + "Extremely_Randomized_Trees_confusion_matrix"+"_%s.png"%test_size
    plot_confusion_matrix(y_test, y_pred,title,save)
    lvltrace.lvltrace("LVLSortie dans extratreeclassifier split_test")
def extremeRand(trainData,testData,trainOuts,testOuts):
	"""Fit a small extra-trees model and print its test accuracy.

	Prints the fitted estimator, the predictions for `testData` and one
	minus the error reported by `sup.crunchTestResults` (called with a
	0.5 threshold argument).
	"""
	# min_samples_split=2 is the smallest value scikit-learn accepts; a
	# one-sample node cannot be split, so the trees match the former 1.
	clf = ExtraTreesClassifier(n_estimators=5, max_depth=10,
		min_samples_split=2, random_state=2)
	print(clf.fit(trainData,trainOuts))
	predictions = clf.predict(testData)
	print(predictions)
	# Only the error is reported; the first returned value is not needed.
	_misses,error = sup.crunchTestResults(predictions,testOuts,.5)
	print(1-error)
def classify(X,Y,test_data,test_labels):
	"""Fit a 10-tree extra-trees model and return its test predictions.

	`Y` and `test_labels` are flattened with np.ravel before use. The
	accuracy on the test data is printed before predicting.
	"""
	print("Building the model for random forests...")
	flat_targets = np.ravel(Y)
	flat_test_labels = np.ravel(test_labels)
	model = ExtraTreesClassifier(n_estimators=10)
	model = model.fit(X,flat_targets)
	accuracy = model.score(test_data,flat_test_labels)
	print("Classification Score using Random Forests:" + str(accuracy))
	return model.predict(test_data)
Beispiel #10
0
def EXRT(X_train,t_train,x,t,predict):
	"""Fit 500-tree extra-trees models for max_features = 2..16.

	After each fit the predictions for `x` are passed either to
	write_predictions (when `predict` is truthy) or to get_accuracy.
	"""
	for n_feats in range(2, 17):
		forest = ExtraTreesClassifier(n_estimators=500, max_depth=None, max_features=n_feats)
		forest.fit(X_train, t_train)
		guessed = forest.predict(x)
		if not predict:
			get_accuracy(guessed,t)
			continue
		write_predictions(t,guessed)
Beispiel #11
0
def et_classify(self):
	"""Fit a default extra-trees model on self.descr/self.target and report.

	Prints the predictions for self.test_descr, the score on
	self.test_descr/self.test_target and the feature importances.
	"""
	print("Extra Trees")
	clf = ExtraTreesClassifier()
	clf.fit(self.descr, self.target)
	mean = clf.score(self.test_descr, self.test_target)
	pred = clf.predict(self.test_descr)

	# One-element tuples keep % from unpacking the arrays as format arguments.
	print("Pred  %s" % (pred,))
	print("Mean : %3f" % mean)
	print("Feature Importances  %s" % (clf.feature_importances_,))
def extratree_cla(train_data, train_id, test_data, seed = None):
    """Fit a 1000-tree extra-trees model and predict the test data.

    Args:
        train_data: training features.
        train_id: training labels.
        test_data: features to predict.
        seed: forwarded as ``random_state``.

    Returns:
        Tuple ``(pred_class, pred_prob)`` from ``predict`` and
        ``predict_proba`` on ``test_data``.
    """
    # The former param_grid dict was never handed to a search, so it is gone.
    clf = ExtraTreesClassifier(n_estimators=1000, n_jobs=4, random_state= seed)#, max_features="log2")
    clf.fit(train_data, train_id)
    pred_class = clf.predict(test_data)
    pred_prob = clf.predict_proba(test_data)
    return pred_class, pred_prob
Beispiel #13
0
def test_extra_trees_3():
    """TPOT's extra-trees operator called with min_weight 0.6 must match sklearn fitted with min_weight_fraction_leaf=0.5"""
    tpot_obj = TPOT()

    tpot_output = tpot_obj._extra_trees(training_testing_data, 0, 1., 0.6)
    testing_rows = tpot_output[tpot_output['group'] == 'testing']

    reference = ExtraTreesClassifier(n_estimators=500, random_state=42, max_features=1., min_weight_fraction_leaf=0.5, criterion='gini')
    reference.fit(training_features, training_classes)

    tpot_guesses = testing_rows['guess'].values
    expected = reference.predict(testing_features)
    assert np.array_equal(tpot_guesses, expected)
    def train_classifier(self):
        """Vectorise the training documents, fit extra trees and report scores.

        Reads the surrounding names ``docs_train``, ``y_train``,
        ``docs_test``, ``y_test`` and the underscore-prefixed settings.
        Prints the 10-fold cross-validation scores on the training data, then
        the score, classification report and confusion matrix on the test
        data.

        Returns:
            Tuple ``(clf, count_vect)``: the fitted classifier and the fitted
            CountVectorizer.
        """
        # Bag-of-words counts for the training documents.
        count_vect = CountVectorizer(stop_words=stopwords, min_df=3, max_df=0.90, ngram_range=_ngram_range)
        train_counts = count_vect.fit_transform(docs_train)

        # The shape's second entry is the number of unique terms kept.
        print ("Shape of train data is "+str(train_counts.shape))

        # tf-idf weighting of the counts.
        tfidf_transformer = TfidfTransformer(use_idf=_use_idf)
        train_tfidf = tfidf_transformer.fit_transform(train_counts)

        print ("Fitting data ...")
        clf = ExtraTreesClassifier(n_estimators=_n_estimators, criterion=_criterion, max_depth=_max_depth, min_samples_split=_min_samples_split)
        clf = clf.fit(train_tfidf, y_train)

        # 10-fold cross-validation on the training data, weighted F1.
        scores = cross_val_score(clf, train_tfidf, y_train, cv=10, scoring='f1_weighted')
        print ("Cross validation score: "+str(scores))

        # Mean and two standard deviations of the fold scores.
        spread = scores.std() * 2
        print("Cross validation accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), spread))

        # The test documents go through the already-fitted vectoriser and
        # tf-idf transformer.
        test_counts = count_vect.transform(docs_test)
        print ("Shape of test data is "+str(test_counts.shape))
        test_tfidf = tfidf_transformer.transform(test_counts)

        y_predicted = clf.predict(test_tfidf)

        # Score on the held-out documents.
        test_score = clf.score(test_tfidf,y_test)
        print ("Classifier score on test data is: %0.2f " % test_score)

        print(metrics.classification_report(y_test, y_predicted))
        print(metrics.confusion_matrix(y_test, y_predicted))

        return clf,count_vect
def PCA_reduction(posture, trainblock, componenet):
    """Compute per-user, per-block AUC after PCA and save it as a CSV.

    For each of 31 users a model is trained on the user's ``trainblock``
    file and evaluated on each of the 6 block files. Features are min-max
    scaled and PCA-reduced (both fitted on the training block only) before an
    ExtraTreesClassifier is fitted. A row is labelled 1 when its first three
    columns equal ``[1, i_user, posture]`` and 0 otherwise.

    Args:
        posture: posture id used in the file paths and in the label rule.
        trainblock: block number of the training file.
        componenet: number of PCA components.

    The 31x6 AUC matrix is printed (as a flat list) and saved under
    ``Output Files/E5-Dimensionality Reduction`` two levels above the
    current working directory.
    """
    n_users, n_blocks = 31, 6
    feature_cols = [3,4,5,6,7,9,11,12,13,14,15,16,17]
    input_root = "../../Output Files/E3-Genuine Impostor data per user per posture/posture-"+str(posture)

    # Build the output directory with os.path.join so the directory created
    # here is the one the result file is written into on every platform.
    parentdirectory = os.path.abspath(os.getcwd() + "/../..")  # two levels up
    out_dir = os.path.join(parentdirectory, "Output Files", "E5-Dimensionality Reduction",
                           "posture-"+str(posture), "TrainBlock-"+str(trainblock))
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    AUC = []
    for i_user in range(1, n_users + 1):
        user_prefix = input_root+"/User-"+str(i_user)+"/1-"+str(i_user)+"-"+str(posture)+"-"
        genuine_key = [1, i_user, posture]

        # The training file does not depend on the test block: load it once.
        train_data = np.genfromtxt(user_prefix+str(trainblock)+"-GI.csv", dtype=float, delimiter=",")
        target_train = np.ones(len(train_data))
        target_train[np.any(train_data[:, 0:3] != genuine_key, axis=1)] = 0
        sample_train = train_data[:, feature_cols]

        for block in range(1, n_blocks + 1):
            test_data = np.genfromtxt(user_prefix+str(block)+"-GI.csv", dtype=float, delimiter=",")
            target_test = np.ones(len(test_data))
            target_test[np.any(test_data[:, 0:3] != genuine_key, axis=1)] = 0
            sample_test = test_data[:, feature_cols]

            scaler = preprocessing.MinMaxScaler().fit(sample_train)
            sample_train_scaled = scaler.transform(sample_train)
            sample_test_scaled = scaler.transform(sample_test)

            pca = PCA(n_components=componenet)
            sample_train_pca = pca.fit(sample_train_scaled).transform(sample_train_scaled)
            sample_test_pca = pca.transform(sample_test_scaled)

            # A fresh, unseeded forest per block, as before.
            clf = ExtraTreesClassifier(n_estimators=100)
            clf.fit(sample_train_pca, target_train)

            prediction = clf.predict(sample_test_pca)
            AUC.append(metrics.roc_auc_score(target_test, prediction))

    print(AUC)
    AUC = np.array(AUC).reshape(n_users, n_blocks)
    np.savetxt(os.path.join(out_dir, "PCA-"+str(componenet)+"-Component.csv"), AUC, delimiter=",")
def extraTree(X, y, train, valid):
	"""Fit extra trees on X[train] and score the X[valid] rows.

	Prints accuracy, a classification report and the ROC AUC of the
	second predict_proba column, saves those probabilities to
	"y_extratree.csv" and returns them.
	"""
	forest = ExtraTreesClassifier(n_jobs = -1, n_estimators = 300, verbose = 2,
		random_state = 1, max_depth = 10, bootstrap = True)
	forest.fit(X[train], y[train])

	predicted = forest.predict(X[valid])
	positive_prob = forest.predict_proba(X[valid])[:,1]

	accuracy = accuracy_score(y[valid], predicted)
	print("extra tree randomForest" + str(accuracy))
	print(classification_report(y[valid], predicted))

	auc_value = roc_auc_score(y[valid], positive_prob)
	print("extra tree randomForest roc_accuracy" + str(auc_value))
	np.savetxt("y_extratree.csv", positive_prob)
	return positive_prob
def main():
	"""Train extra trees on precomputed features and write Poutput.csv.

	Labels come from the first column of train.csv (header row skipped);
	features are read from Ptrfeatures.csv and Ptestfeatures.csv. The
	output has an 'ImageId','Label' header and one row per prediction,
	numbered from 1.
	"""
	print("Reading training data")
	trdata = csvtolist2D('train.csv')
	print("Length of training data :  %d" % len(trdata))
	print("Reading test data")
	testdata = csvtolist2D('test.csv')
	print("Length of test data :  %d" % len(testdata))

	# The first row holds the headings; the label is the first field.
	labels = [int(row[0]) for row in trdata[1:]]

	# The feature files are expected to exist already. They were produced
	# by a step that is disabled here:
	#   featureextractor(trdata)   -> list2DtoCSV(..., "Ptrfeatures.csv")
	#   featureextractor(testdata) -> list2DtoCSV(..., "Ptestfeatures.csv")
	print("reading features....")
	trfeatures = csvtolist2D("Ptrfeatures.csv")
	testfeatures = csvtolist2D("Ptestfeatures.csv")

	# Scaling with preprocessing.StandardScaler is also disabled.
	print("Starting training...")
	clf = ExtraTreesClassifier(n_estimators=150)
	clf = clf.fit(trfeatures, labels)
	print("Predicting result...")
	RFCresult = clf.predict(testfeatures)

	output = [['ImageId','Label']]
	output.extend([i, ele] for i, ele in enumerate(RFCresult, 1))
	list2DtoCSV(output,"Poutput.csv")
Beispiel #18
0
def extratreeclassifier(input_data,output_labels,filename, m_d=3, n_est=10, rs=0):
    """Evaluate an ExtraTreesClassifier for comparison with other models.

    Runs the cross-validation helper, fits on a 75/25 split, then produces
    a confusion matrix, a ROC plot and a coefficient-of-determination
    report through the corresponding helpers, and prints the feature
    importances.

    Args:
        input_data: feature matrix.
        output_labels: target labels.
        filename: stem for the plot names ('<filename>_cm', 'roc_<filename>').
        m_d: ``max_depth`` of the trees.
        n_est: number of trees.
        rs: ``random_state`` of the classifier.
    """
    from sklearn.ensemble import ExtraTreesClassifier
    etC = ExtraTreesClassifier(max_depth= m_d, n_estimators=n_est, random_state=rs)
    crossValidation(input_data, output_labels, etC)
    X_train, X_test, Y_train, Y_test = train_test_split(input_data, output_labels, test_size=0.25, random_state=42)

    etC.fit(X_train,Y_train)
    predictionsTrees = etC.predict(X_test)
    calc_conf_matrix(Y_test, predictionsTrees, 'Extra Tree Classifier confusion matrix', filename+'_cm')
    roc_plot(input_data,output_labels, etC,'roc_'+filename )
    coeff_of_deterimination(etC, input_data, output_labels, 3)
    # Call form of print: identical output, valid on Python 2 and 3.
    print(etC.feature_importances_)
Beispiel #19
0
def test_extra_trees_3():
    """TPOT's extra-trees operator given max_features one above the feature count must match sklearn fitted with max_features equal to that count"""
    tpot_obj = TPOT()

    is_training = training_testing_data['group'] == 'training'
    training_features = training_testing_data.loc[is_training].drop(tpot_obj.non_feature_columns, axis=1)
    num_features = len(training_features.columns)

    tpot_output = tpot_obj._extra_trees(training_testing_data, 0, num_features + 1)
    testing_rows = tpot_output[tpot_output['group'] == 'testing']

    reference = ExtraTreesClassifier(n_estimators=500, random_state=42, max_features=num_features, criterion='gini')
    reference.fit(training_features, training_classes)

    tpot_guesses = testing_rows['guess'].values
    expected = reference.predict(testing_features)
    assert np.array_equal(tpot_guesses, expected)
def execute(fdata):
    """Compare extra trees on all features with a linear SVM on selected ones.

    Args:
        fdata: iterable of comma-separated lines; the first field is the
            integer label, the remaining fields are float features.

    Returns:
        Tuple ``(acc_all, acc_selected)``: test accuracy of the
        ExtraTreesClassifier on every feature, and of a linear SVC trained on
        the features kept by ``SelectFromModel``.
    """
    target = []
    rows = []
    for line in fdata:
        fields = line.split(",")
        target.append(int(fields[0]))
        rows.append([float(x) for x in fields[1:]])

    data = np.array(rows)
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(data, target, test_size=0.25, random_state=0)
    clf = ExtraTreesClassifier()
    clf = clf.fit(X_train, y_train)
    model = SelectFromModel(clf, prefit=True)
    X_new = model.transform(X_train)

    clfNew = svm.SVC(kernel='linear', C=1).fit(X_new, y_train)

    # Let the selector pick the same columns from the test rows. Recovering
    # the column positions by matching float values goes wrong as soon as two
    # features share a value.
    X_test_new = model.transform(X_test)

    return accuracy_score(y_test, clf.predict(X_test)), accuracy_score(y_test, clfNew.predict(X_test_new))
Beispiel #21
0
def feature_selection(train, y):
	"""Rank features with extra trees and score the top-k sets with boosting.

	A single stratified 70/30 split is used. Features are ranked by
	extra-trees importance; for k = 1..min(n_features, 16) a
	GradientBoostingClassifier is fitted on the features whose importance
	exceeds that of the (k+1)-th ranked one. The mask of the best-F1 set is
	saved to './features/clf_sel.npy'.

	train: feature frame (positional row access through .iloc).
	y: label series aligned with `train`.
	"""
	sss = StratifiedShuffleSplit(y, n_iter=1, test_size=.3, random_state=42)
	train_idx, test_idx = next(iter(sss))

	xtrain = train.iloc[train_idx].values
	ytrain = y.iloc[train_idx].values

	xtest = train.iloc[test_idx].values
	ytest = y.iloc[test_idx].values

	clf_et = ExtraTreesClassifier().fit(xtrain, ytrain)

	et_preds = clf_et.predict(xtest)

	print('initial f1 score based on extra trees classifier:  %s' % f1_score(ytest, et_preds))

	feat_imp = clf_et.feature_importances_
	sorted_fi = feat_imp[np.argsort(feat_imp)[::-1]] #descending sort

	# One-element tuples keep % from unpacking the arrays.
	print('feature importance:  %s' % (feat_imp,))
	print('sorted feature importances:  %s' % (sorted_fi,))

	clf_gb = GradientBoostingClassifier()
	feats_tot = xtrain.shape[1]

	f1_best = 0
	print("output format:")
	print("no of features, f1-score, roc-score of class-predictions, roc-score of probabilities")

	for feats in range(1,feats_tot+1):
		if feats < len(sorted_fi):
			# keep what ranks strictly above the (feats+1)-th importance
			select = (feat_imp>sorted_fi[feats])
		else:
			# No lower-ranked importance is left to act as cut-off (the
			# index used to run past the end here): keep every feature.
			select = np.ones(len(feat_imp), dtype=bool)
		clf_gb.fit(xtrain[:,select],ytrain)
		tmp_preds = clf_gb.predict(xtest[:,select])
		tmp_probs = clf_gb.predict_proba(xtest[:,select])[:,1]
		f1 = f1_score(ytest,tmp_preds)
		roc_pred = roc_auc_score(ytest,tmp_preds)
		roc_prob = roc_auc_score(ytest,tmp_probs)
		if f1 > f1_best:
			f1_best = f1
			np.save('./features/clf_sel.npy',select)
		print("%s %s %s %s" % (feats,f1,roc_pred,roc_prob))
		if feats >= 16:
			break

	print("f1_best: %s" % f1_best)
Beispiel #22
0
def cross_val(clf_name, X, y, n_folds=5, proba=False, score=accuracy_score, *params, **kwargs):
    """Cross-validate one of the predefined classifiers and return its predictions.

    Args:
        clf_name: key of the classifier to build (see ``factories`` below).
        X: feature matrix, indexed as ``X[rows, :]``.
        y: label array.
        n_folds: number of stratified folds.
        proba: store ``predict_proba`` output instead of ``predict`` output.
        score: callable ``score(y_true, y_pred)`` printed per fold and overall.
        *params, **kwargs: forwarded to the classifier constructor
            (not used by the "sgd" entry).

    Returns:
        Array shaped like ``y`` holding the out-of-fold predictions.

    Raises:
        ValueError: if ``clf_name`` is not a known key.
    """
    # One lazily evaluated constructor per name; only the chosen one runs.
    factories = {
        "extra": lambda: ExtraTreesClassifier(12, max_depth=23, max_features=10, n_jobs=-1, *params, **kwargs),
        "grad": lambda: GradientBoostingClassifier(n_estimators=40, learning_rate=0.1, *params, **kwargs),
        "cgrad": lambda: CalibratedClassifierCV(base_estimator=GradientBoostingClassifier(n_estimators = 20,learning_rate= 0.1, *params, **kwargs), method='isotonic', cv=10),
        "cmulti": lambda: CalibratedClassifierCV(base_estimator=MultinomialNB(alpha = alpha_multi, *params, **kwargs), method='isotonic', cv=10),
        "multi": lambda: MultinomialNB(*params, **kwargs),
        "bag": lambda: BaggingClassifier(base_estimator=MultinomialNB(alpha = 0.5, *params, **kwargs),n_estimators = 100,n_jobs = -1),
        "bern": lambda: BernoulliNB(alpha=0.00000000001, *params, **kwargs),
        "gauss": lambda: GaussianNB(*params, **kwargs),
        "random": lambda: RandomForestClassifier(1200,max_depth= 23,max_features = 10,n_jobs = -1, *params, **kwargs),
        "lda": lambda: LinearDiscriminantAnalysis(*params, **kwargs),
        "logistic": lambda: LogisticRegression(C=1, *params, **kwargs),
        "svm": lambda: LinearSVC(C=100, *params, **kwargs),
        "knn": lambda: KNeighborsClassifier(n_neighbors=20, *params, **kwargs),
        "near": lambda: NearestCentroid(*params, **kwargs),
        "ridge": lambda: OneVsOneClassifier(RidgeClassifier(alpha=0.1, *params, **kwargs)),
        "sgd": lambda: SGDClassifier(loss="hinge", penalty="l2", n_iter=50, alpha=0.000001, fit_intercept=True, average=True),
    }
    if clf_name not in factories:
        # An unknown name used to surface later as an unbound-variable error.
        raise ValueError("unknown clf_name %r; expected one of %s"
                         % (clf_name, ", ".join(sorted(factories))))

    cv = StratifiedKFold(y, n_folds=n_folds, shuffle=True, random_state=41)
    c = factories[clf_name]()

    y_pred = np.zeros(y.shape)
    score_list = []
    for i, (train, test) in enumerate(cv):
        c.fit(X[train,:], y[train])
        if proba:
            # NOTE(review): assumes predict_proba's output fits the slice of
            # y_pred it is assigned to - TODO confirm for multi-column output.
            y_pred[test] = c.predict_proba(X[test,:])
        else:
            y_pred[test] = c.predict(X[test,:])
        score_list.append(score(y[test], y_pred[test]))
        print(score_list[i])
    print("Final score",score(y,y_pred))
    return y_pred
def binary_cbf(oversampling=(0, 0)):
    """
    Train and score a 5-tree extra-trees model on review samples over
    5 shuffled 80/20 splits, recording each split with a two-class RecScorer.

    :param oversampling: Tuple(Int), passed with the labels to
        ``over_sampling``; the result selects the rows of X and y that are used
    :return: None
    """
    feature_columns = [
        'brcnt',
        'bstar',
        'checkins',
        'compliments',
        'fans',
        'rdate',
        'urcnt',
        'ustar',
        'uvotes',
        'ysince',
    ]
    t = time()
    with sqlite3.connect(DB_PATH) as conn:
        # Labels first, then the feature matrix, both from 'r_samples'.
        y = FeatureReformer(conn, 'r_samples', ['rstar']).transform('y2').transpose()[0]
        X = FeatureReformer(conn, 'r_samples', feature_columns).transform()

        # oversampling
        ovsp = over_sampling(y, oversampling)
        y, X = y[ovsp], X[ovsp]

        n_samples, _ = X.shape
        print(X.shape)
        print('Done with collecting & reforming data from database, using ', time()-t, 's')
        t = time()
        rec_scorer = RecScorer(n_class=2)
        div = ShuffleSplit(n_samples, n_iter=5, test_size=0.2, random_state=0)
        model = ExtraTreesClassifier(n_estimators=5)
        for train, test in div:
            train_rows, test_rows = np.array(train), np.array(test)
            model.fit(X[train_rows], y[train_rows])
            y_pred = model.predict(X[test_rows])
            # Metrics below
            rec_scorer.record(y_true=y[test_rows], y_pred=y_pred)
            print(time()-t, 's used >>\n')

        print('Done with 5-fold training & cross validating, using ', time()-t, 's')
        rec_scorer.finalScores()
class ERFTrainer(object):
    """Extra-trees classifier over word labels, with label encoding built in."""

    def __init__(self, X, label_words):
        """Encode ``label_words`` and fit the forest on ``X``."""
        self.le = preprocessing.LabelEncoder()
        self.clf = ExtraTreesClassifier(n_estimators=100,
                max_depth=16, random_state=0)

        targets = self.encode_labels(label_words)
        features = np.asarray(X)
        self.clf.fit(features, targets)

    def encode_labels(self, label_words):
        """Fit the encoder on ``label_words`` and return their codes as float32."""
        self.le.fit(label_words)
        codes = self.le.transform(label_words)
        return np.array(codes, dtype=np.float32)

    def classify(self, X):
        """Predict ``X`` and map the numeric predictions back to label words."""
        predicted = self.clf.predict(np.asarray(X))
        int_codes = [int(code) for code in predicted]
        return self.le.inverse_transform(int_codes)
Beispiel #25
0
def etclassifier(training_samples, eval_samples, do_grid_search=True):
    """Fit extra trees (optionally grid-searched) and predict the eval set.

    Args:
        training_samples: pair ``(X_train, Y_train)``.
        eval_samples: pair ``(X_eval, Y_eval)``; only ``X_eval`` is read.
        do_grid_search: wrap the model in a 5-fold ``GridSearchCV`` scored by
            log loss and print the search results; otherwise print 5-fold
            cross-validation scores.

    Returns:
        Tuple ``(predictions, probabilities)`` for ``X_eval``.
    """
    X_train, Y_train = training_samples
    # The evaluation labels are not used; keep the two-item unpacking only.
    X_eval, _ = eval_samples

    clf = ExtraTreesClassifier(max_depth=None, n_estimators=1000,
                                 min_weight_fraction_leaf=0.0, max_features=None, min_samples_split=16, criterion='gini',
                                 min_samples_leaf=2, max_leaf_nodes=None, oob_score=False, bootstrap=True,
                                 n_jobs=10, random_state=None, verbose=0, warm_start=False, class_weight=None)
    to_be_tuned_parameters = {
                              #'n_estimators':[500, 2000, 4000],
                              'max_features':['log2', 'auto', None],
                              'min_samples_split':[2, 8, 16],
                              'min_samples_leaf': [1, 2],
                            }
    if do_grid_search:
        clf = GridSearchCV(clf, to_be_tuned_parameters, cv=5, n_jobs=5, scoring='log_loss')
    # Best parameters found by an earlier search on the development set:
    # {'max_features': None, 'min_samples_split': 10, 'n_estimators': 1000, 'min_samples_leaf': 2}

    print(clf)
    clf.fit(X_train, Y_train)
    if do_grid_search:
        print("Best parameters set found on development set:")
        print()

        print(clf.best_params_)
        print()
        print("Grid scores on development set:")
        print()
        for params, mean_score, scores in clf.grid_scores_:
            print("%0.3f (+/-%0.03f) for %r"
                  % (mean_score, scores.std() * 2, params))
    else:
        scores = cross_validation.cross_val_score(clf, X_train, Y_train, cv=5, n_jobs=5, scoring='log_loss')
        # Single formatted string: same text, valid on Python 2 and 3.
        print("%s %s %s" % (scores, np.mean(scores), np.median(scores)))

    Y_pred = clf.predict(X_eval)
    Y_prob = clf.predict_proba(X_eval)
    return Y_pred, Y_prob
	def trainExtraRandomForest(self, trainingData, evaluationData, validCols):
		""" Train the extra random forest model, and return the predictions and
		    the instantiated trained model

		    trainingData   - frame with a 'Result' column (the labels) and the
		                     columns named in validCols
		    evaluationData - frame with the columns named in validCols
		    validCols      - feature columns used for fitting and predicting

		    Returns (predictions for evaluationData, fitted model).
		"""
		# Get the results list
		trainingResults = trainingData['Result'].tolist()
		# The forest size is drawn at random on every call.
		nEstimators = randint(50,1000)
		logging.info('Extra Random Forest - Model Iterations %i', nEstimators)
		# min_samples_split=2 is the smallest value scikit-learn accepts and
		# grows the same trees as 1 (a one-sample node cannot be split).
		extraModel = ExtraTreesClassifier(n_estimators= nEstimators, max_depth=None, min_samples_split=2, random_state=0)

		# time.clock() is gone from current Python 3; use it only when
		# perf_counter is not available.
		timer = getattr(time, 'perf_counter', None) or time.clock

		# Train the model
		start = timer()
		extraModel = extraModel.fit(trainingData[validCols],trainingResults)
		elapsed = (timer() - start)
		logging.info('Extra Random Forest - Training Time %f secs', elapsed)

		# Return the model predictions for evaluation
		return extraModel.predict(evaluationData[validCols]), extraModel
Beispiel #27
0
def allfeatures_001():
    """Fit extra trees on the unioned feature set and score a held-out split.

    Builds a pipeline (last-shopping-point filter followed by a union of
    feature transformers), fits a 100-tree ExtraTreesClassifier on the
    binarised A-G plan columns and scores the predictions for the test part
    with ``classes.score_df`` and ``classes.col_score_df``. Nothing is
    returned; the two scores stay local.
    """
    train = classes.get_train_data()

    # Transformers are created in the same order as they are listed below.
    passthrough = ColumnExtractor(['group_size', 'homeowner', 'car_age', 'age_oldest', 'age_youngest', 'married_couple'])
    day_tf = DayTransformer()
    state_tf = StateTransformer()
    car_value_tf = FillEncoderBinarizer('car_value', 'z')
    risk_tf = FillEncoderBinarizer('risk_factor', 0)
    prev_c_tf = FillEncoderBinarizer('C_previous', 0)
    prev_duration_tf = FillEncoderBinarizer('duration_previous', -1)
    # Built but not part of the union.
    last_plan = LastObservedPlan()

    union_steps = [
        ('copy', passthrough),
        ('day', day_tf),
        ('state', state_tf),
        ('car_val', car_value_tf),
        ('risk_factor', risk_tf),
        ('c_prev', prev_c_tf),
        ('c_dur', prev_duration_tf),
    ]
    features = FeatureUnion(union_steps)

    pipeline = Pipeline([
        ('filter', LastShoppingPointSelector()),
        ('features', features),
    ])

    train, test = classes.train_test_split(train)
    train_x = pipeline.fit_transform(train)
    train_plans = classes.split_plan(classes.get_actual_plan(train))
    y_encoder = classes.MultiColLabelBinarizer()
    y = y_encoder.fit_transform(train_plans[list('ABCDEFG')])

    est = ExtraTreesClassifier(n_estimators=100, verbose=3)
    est.fit(train_x, y)

    actuals = classes.split_plan(classes.get_actual_plan(test))
    # The test rows are truncated before they go through the fitted pipeline.
    test_x = pipeline.transform(classes.truncate(test))
    decoded = y_encoder.inverse_transform(est.predict(test_x))
    pred = classes.concatenate_plan(decoded)

    score = classes.score_df(pred, actuals)
    scores = classes.col_score_df(pred, actuals)
Beispiel #28
0
def train_model(stats, X_train, Y_train, X_test=None, Y_test=None):
    """Train the module-level ``Classifier`` and record metrics in ``stats``.

    The classifier is built from the module-level ``Classifier``,
    ``n_estimators`` and ``nodesize`` names. Progress and metrics are printed
    with single-argument ``print(...)`` calls so the function runs unchanged
    on both Python 2 and Python 3.

    Args:
        stats: mutable mapping that receives the metrics (``train_acc`` and,
            when a test set is given, ``test_acc``, ``test_acc_TP``,
            ``test_acc_FP`` and ``ROC_AUC``).
        X_train, Y_train: training samples and labels.
        X_test, Y_test: optional test samples and labels; indexed with
            boolean masks, so they must support that (e.g. numpy arrays).

    Returns:
        Tuple ``(clf, stats)`` with the fitted classifier and the same
        ``stats`` object that was passed in.
    """
    print("Training ExtraTrees classifier")
    clf = Classifier(n_estimators=n_estimators, n_jobs=30,
                     min_samples_leaf=nodesize,
                     # class_weight='balanced_subsample',
                     )
    clf.fit(X_train, Y_train)
    stats["train_acc"] = clf.score(X_train, Y_train)

    print("Training complete")
    print('Training Accuracy: %.3f' % stats["train_acc"])

    # Breakout early if no test set is given
    if X_test is None:
        return clf, stats

    stats["test_acc"] = clf.score(X_test, Y_test)
    print('Testing Accuracy: %.3f' % stats["test_acc"])

    # Accuracy restricted to the test samples whose label is 1.
    X_test_TP = X_test[Y_test == 1]
    Y_test_TP = Y_test[Y_test == 1]
    stats["test_acc_TP"] = clf.score(X_test_TP, Y_test_TP)
    print('Testing Accuracy TP: %.3f' % stats["test_acc_TP"])

    # Accuracy restricted to the test samples whose label is 0.
    X_test_FP = X_test[Y_test == 0]
    Y_test_FP = Y_test[Y_test == 0]
    stats["test_acc_FP"] = clf.score(X_test_FP, Y_test_FP)
    print('Testing Accuracy FP: %.3f' % stats["test_acc_FP"])

    # Second probability column is used as the score for the ROC curve.
    pred_probas = clf.predict_proba(X_test)[:, 1]
    Y_predict = clf.predict(X_test)

    total_contacts = Y_test.sum()
    predicted_contacts = Y_predict[Y_test == 1].sum()
    print('Total contacts predicted %i/%i'
          % (predicted_contacts, total_contacts))

    fpr, tpr, _ = roc_curve(Y_test, pred_probas)
    stats["ROC_AUC"] = auc(fpr, tpr)
    print("ROC area under the curve %s" % (stats["ROC_AUC"],))

    return clf, stats
def makeAllEvals(dataset, dbtype='CATH', level=1, k_iters=10):
    """Cross-validate an ExtraTrees classifier on a parsed dataset.

    The dataset is parsed with ``dbParser``; its ``'vectors'`` entries are the
    samples and its ``'target_names'`` entries the labels. For every fold the
    accuracy, classification report and confusion matrix are printed, followed
    by the mean and standard deviation of the fold accuracies. Nothing is
    returned.

    All output uses single-argument ``print(...)`` calls so the function runs
    on both Python 2 and Python 3 with the same text as before.

    NOTE(review): ``StratifiedKFold(labels, k_iters)`` and iterating the
    splitter directly match the legacy scikit-learn cross-validation API -
    confirm which version this file targets.

    Args:
        dataset: passed through to ``dbParser``.
        dbtype: database type passed to ``dbParser``.
        level: classification level passed to ``dbParser``.
        k_iters: number of folds.
    """
    dataDict = dbParser(dataset, level=level, dbtype=dbtype)
    print(dataDict)

    labels = dataDict['target_names']
    skf = StratifiedKFold(labels, k_iters)

    # Level 1
    # clf = ExtraTreesClassifier(n_estimators=100,min_samples_split=2,max_depth=None)
    # Level 2
    clf = ExtraTreesClassifier(n_estimators=300, min_samples_split=2,
                               max_depth=None)

    accsList = []

    for train, test in skf:
        print('\n--------------------------------------------------\n')
        _train = [dataDict['vectors'][i] for i in train]
        _test = [dataDict['vectors'][i] for i in test]
        _targets = [dataDict['target_names'][i] for i in train]

        # The same estimator object is refitted from scratch on every fold.
        clf.fit(_train, _targets)

        y_true = [labels[i] for i in test]
        y_pred = clf.predict(_test)

        localAccuracy = accuracy_score(y_true, y_pred)
        accsList.append(localAccuracy)
        print('*** localAccuracy: %s' % (localAccuracy,))

        print(classification_report(y_true, y_pred))

        cm = confusion_matrix(y_true, y_pred, labels=clf.classes_)
        print(cm)

    _ACC = np.mean(accsList)
    _ACCstd = np.std(accsList)

    # Two spaces after the colon reproduce the original print-statement output.
    print('\n[ FINAL SUMMARY ]')
    print(' *** ACCURACY:  %s' % (_ACC,))
    print(' *** ACCURACY deviation:  %s' % (_ACCstd,))
Beispiel #30
0
class extraTrees:
    """Thin wrapper that trains and applies an ``ExtraTreesClassifier``.

    Missing values (NaN) in the predictor columns are replaced by ``-1.0``
    before both fitting and predicting.
    """

    def __init__(self, predictorsLabel, targetLabel='target', n_estimators=5):
        """Store the column names and create the underlying estimator.

        Args:
            predictorsLabel: list of column names used as predictors.
            targetLabel: name of the target column in the training data.
            n_estimators: number of trees in the forest.
        """
        self.predictorsLabel = predictorsLabel
        self.targetLabel = targetLabel
        self.ETC = ExtraTreesClassifier(n_estimators)

    def training(self, trainingData, n_estimators=5):
        """Fit the estimator on ``trainingData``.

        Args:
            trainingData: table holding the predictor and target columns.
            n_estimators: unused; kept so existing callers keep working. The
                forest size is the one given to the constructor.
        """
        target = trainingData[self.targetLabel]
        train = pd.DataFrame(trainingData, columns=self.predictorsLabel)
        train = train.replace(np.nan, -1.0)
        self.ETC.fit(train, target)

    def prediction(self, predictionData):
        """Predict the target for ``predictionData``.

        Args:
            predictionData: table holding the predictor columns and an
                ``'ID'`` column.

        Returns:
            DataFrame with the columns ``"ID"`` and ``"target"``.
        """
        test = pd.DataFrame(predictionData, columns=self.predictorsLabel)
        test = test.replace(np.nan, -1.0)
        predTarget = self.ETC.predict(test)
        # The ID column only exists in ``test`` when it is also a predictor;
        # otherwise read it from the input instead of raising KeyError.
        if 'ID' in test.columns:
            ids = test['ID']
        else:
            ids = predictionData['ID']
        pred = pd.DataFrame(np.array([ids, predTarget]).T,
                            columns=["ID", "target"])
        return pred
Beispiel #31
0
# Fit the Extra Trees model and report accuracy on the training data and on
# the X_new_bal / y_new_bal evaluation set.
etc.fit(X_train, y_train)

print("\n----------ET----------")
print('Accuracy of ET classifier on training set: {:.3f}'.format(etc.score(X_train, y_train)))
# test data set acc
print('Accuracy of ET classifier on test set: {:.3f}'.format(etc.score(X_new_bal, y_new_bal)))

# ' test data matrix'
# y_pred = etc.fit(X_train, y_train).predict(X_test)
# # Plot non-normalized confusion matrix
# plot_confusion_matrix(y_test, y_pred, classes=class_names,
#                       title='Confusion matrix, without normalization')
# plt.show()

# Confusion Matrix
y_pred_et = etc.predict(X_new_bal)
# Plot non-normalized confusion matrix
plot_confusion_matrix(y_new_bal, y_pred_et, classes=class_names,
                      title='Confusion matrix, without normalization')
# The timer belongs to the ET section (start_time_ET), so label it as ET.
print("Time consuming of ET is: ", time.time() - start_time_ET)
plt.show()

# ' roc for test'
# roc_curve_plot(X_test, y_test, etc)
# ' pre_re for test'
# precision_recall_curve(X_Balance_test, y_Balance_test, etc)

# 2.7 Logistic Regression
start_time_LR = time.time()
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression(solver='lbfgs')
Beispiel #32
0
# We can see that many features contribute nothing - mostly the Ticket
# features. These features only add noise.

# In[ ]:

# Horizontal bar plot of the importances of the selected features: the first
# X_test_best.shape[1] entries of `col` against the importances ordered by
# `indices`.
g = sns.barplot(y=col[:X_test_best.shape[1]],
                x=importances[indices][:X_test_best.shape[1]],
                orient='h')
g.set_xlabel("Relative importance", fontsize=12)
g.set_ylabel("Features", fontsize=12)
g.tick_params(labelsize=9)
g.set_title("Feature importance")
plt.show()

# So the new features "Groups", "Ticket_tog" and "CabinYN" are among the
# important features, although they are not as important as the top ones
# such as Embarked and Tickets.

# The best algorithm is the Extra Trees classifier.
# Create the results file.

# In[ ]:

# Refit on the selected training features, predict the test set and write the
# IDs next to the integer predictions to result.csv.
ExtC.fit(X_train_best, y_train)
test_Survived = pd.Series(ExtC.predict(X_test_best), name="Survived")
r = pd.DataFrame(test_Survived, dtype="int64")
results = pd.concat([IDtest, r], axis=1)
results.to_csv("result.csv", index=False)

#
Beispiel #33
0
    #Get the CV score of the tree classifier.
    result = cross_val_score(cls,
                             x_train,
                             x_target,
                             cv=skf,
                             scoring=make_scorer(acc),
                             n_jobs=-1)

    #Print the accuracy of the classifier.
    print("ACC: %0.2f (+/- %0.2f)" % (result.mean(), result.std()))

    #Fit the data
    cls.fit(x_train, x_target)

    #Confusion matrix
    prediction = cls.predict(x_train)
    cnf_matrix = confusion_matrix(x_target, prediction)
    np.set_printoptions(precision=2)

    plt.figure()
    plot_confusion_matrix(cnf_matrix,
                          classes=range(len(set(x_target))),
                          normalize=False,
                          title='Confusion matrix')

    plt.savefig("bankTreeConfusion.png", bbox_inches='tight')
    plt.savefig("bankTreeConfusion.pdf", bbox_inches='tight')

    #Get important features

    importances = cls.feature_importances_
Beispiel #34
0
                            max_depth=15,
                            max_features='auto',
                            max_leaf_nodes=None,
                            min_impurity_split=1e-07,
                            min_samples_leaf=1,
                            min_samples_split=2,
                            min_weight_fraction_leaf=0.0,
                            n_estimators=100,
                            n_jobs=1,
                            oob_score=False,
                            random_state=None,
                            verbose=0,
                            warm_start=False)
# Fit each model on the training split and print its accuracy on the test
# split. Single-argument print(...) calls behave the same on Python 2 and 3.
xtra.fit(train[features], train['Mood'])
print("Extra Trees Classifier")
print(accuracy_score(test['Mood'], xtra.predict(test[features])))

ada = AdaBoostClassifier(algorithm='SAMME.R',
                         base_estimator=None,
                         learning_rate=0.1,
                         n_estimators=300,
                         random_state=None)
ada.fit(train[features], train['Mood'])
print("Ada Boost Classifier")
print(accuracy_score(test['Mood'], ada.predict(test[features])))

knn = KNeighborsClassifier(algorithm='auto',
                           leaf_size=30,
                           metric='euclidean',
                           metric_params=None,
                           n_jobs=1,
Beispiel #35
0
# Hyper-parameters for the Extra Trees model
# (presumably the outcome of an earlier search - TODO confirm).
best_params = {
    'class_weight': 'balanced',
    'criterion': 'entropy',
    'max_depth': 10,
    'max_features': 'sqrt',
    'min_samples_leaf': 5,
    'min_samples_split': 10,
    'n_estimators': 500
}
et_clf = ExtraTreesClassifier(**best_params)

et_clf.fit(X_train, y_train)

# Report on the held-out test split.
print('test accuracy')
y_hat = et_clf.predict(X_test)
print(classification_report(y_test, y_hat))

# Report on X_data, which includes whatever rows were used for fitting if
# X_train is a subset of it (NOTE(review): confirm).
print('all towns')
y_all_hat = et_clf.predict(X_data)
print(classification_report(y.targets, y_all_hat))

selected_data['predicted'] = y_all_hat
# %%
# List every row predicted as class 1 and collect the distinct city names.
print('predicted positive')
pp = []
for pop_id, r in selected_data[selected_data.predicted == 1].iterrows():
    print(pop_id, r.predicted, r.city_type, r.city_name)
    if r.city_name is not None:
        pp.append(r.city_name)
print(', '.join(sorted(set(pp))))
Beispiel #36
0
# Fit the recursive feature elimination object and show which features were
# kept and how they rank.
rfe = rfe.fit(x, y)
print(rfe.support_)
print(rfe.ranking_)

# define the training and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1)
print(x_test.shape)

# Decision tree algorithm
# train the model on the training set
model = DecisionTreeClassifier()
model.fit(x_train, y_train)
print(model)
# make predictions for the test set
expected = y_test
predicted = model.predict(x_test)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))

# Apply the fitted tree to the feature matrix `z` and store the result.
sales_test['return'] = model.predict(z)
print(sales_test['return'].value_counts())

# 10-fold cross-validated accuracy on the full data set.
Kscore = cross_val_score(model, x, y, cv=10, scoring='accuracy')
print(Kscore)
print(Kscore.mean())

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
#clf = RandomForestClassifier(n_estimators=10)
clf = KNeighborsClassifier(n_neighbors=1)
Beispiel #37
0
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split

# NOTE: Make sure that the class is labeled 'target' in the data file
# The path and separator below are placeholders that must be filled in.
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
# Every column except 'target' is used as a feature.
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was:0.7593650793650794
exported_pipeline = ExtraTreesClassifier(bootstrap=False,
                                         criterion="entropy",
                                         max_features=0.6500000000000001,
                                         min_samples_leaf=5,
                                         min_samples_split=9,
                                         n_estimators=100)

# Fit on the training split and predict the testing split.
exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Beispiel #38
0
# Base Extra Trees settings; the optional 'balance' command-line argument
# additionally turns on balanced class weights.
params = {'n_estimators': 100, 'max_depth': 4, 'random_state': 0}
if len(sys.argv) > 1:
    if sys.argv[1] != 'balance':
        raise TypeError("Invalid input argument; should be 'balance'")
    params['class_weight'] = 'balanced'

classifier = ExtraTreesClassifier(**params)
classifier.fit(x_train, y_train)
visualize_classifier(classifier, x_train, y_train, 'Training dataset')

y_test_pred = classifier.predict(x_test)
visualize_classifier(classifier, x_test, y_test, 'Test dataset')

# Evaluate classifier performance
class_names = ['Class-0', 'Class-1']

# Training-set report, framed by rules of 40 '#' characters.
print("\n" + 40 * "#")
print("\nClassifier performance on training dataset\n")
print(
    classification_report(
        y_train,
        classifier.predict(x_train),
        target_names=class_names,
    )
)
print(40 * "#" + "\n")

# Test-set report, reusing the predictions computed above.
print(40 * "#")
print("\nClassifier performance on test dataset\n")
print(
    classification_report(
        y_test,
        y_test_pred,
        target_names=class_names,
    )
)
print(40 * "#" + "\n")
        tmp = np.zeros(self.shape)
        height, width = self.shape
        for row in xrange(height):
            for col in xrange(width):
                tmp[row][col] = self.getElement(row, col)
        return tmp


if __name__ == "__main__":
    # Smoke test: wrap random data in FeatureBiaser, then fit and predict
    # with an Extra Trees classifier.
    from sklearn.ensemble import ExtraTreesClassifier
    classif = ExtraTreesClassifier()

    nbObj = 10
    nbFeat = 7
    nbClass = 4
    Xl = np.random.rand(nbObj, nbFeat)
    # randint's upper bound is exclusive: pass nbClass so that labels span
    # 0 .. nbClass - 1 (nbClass - 1 would never draw the last class).
    yl = np.random.randint(0, nbClass, nbObj)

    Xlv = FeatureBiaser(Xl, [(0, 3), (1, 3)])
    Xlv = Xlv.asContiguousArray()
    # Single-argument print keeps the output identical on Python 2 and 3.
    print("%s %s" % (Xl.shape, Xlv.shape))

    classif.fit(Xlv, yl)

    Xt = np.random.rand(nbObj, nbFeat)
    Xtv = FeatureBiaser(Xt, [(0, 3), (1, 3)])
    Xtv = Xtv.asContiguousArray()
    print("%s %s" % (Xt.shape, Xtv.shape))

    classif.predict(Xtv)
# In[ ]:

# Evaluate the predictions stored in y_pred2 against y.
evaluate(y, y_pred2)

# In[ ]:

# Third model: Extra Trees with default settings.
model3 = ExtraTreesClassifier()

# In[ ]:

# Fit on the first 700 rows only.
model3.fit(X[:700], y[:700])

# In[ ]:

# Predict the remaining rows (index 700 onwards).
y_pred3 = model3.predict(X[700:])

# In[ ]:

# Compare the predictions with the labels of those same remaining rows.
evaluate(y[700:], y_pred3)

# ## TODO: Only use these as features.

# ## 1. Generate word bags using similar codes in naive_bayes_classifier

# ## 2. The reval characteristic is captured by the n-day rate movement; we first set n == 1

# ## 3. Fit a naive Bayes model to learn which word or words have the most impact on the rate.

# ## Future work
# 1. Use title information and treat title words differently from those in the body of the article
Beispiel #41
0
class TreeRegression(RegressionModel):
    """Model backed by scikit-learn's ``ExtraTreesClassifier``.

    Plugs an extra-trees forest into the ``RegressionModel`` workflow:
    ``train`` fits it on ``train_x`` / ``train_y`` and ``test`` stores
    integer, non-negative predictions for both the training and the testing
    features.

    NOTE(review): a *classifier* is used inside a regression model and its
    output is cast to int - presumably the targets are integer-valued;
    confirm against the data.
    """

    def __init__(self,
                 data,
                 normalize=False,
                 n_estimators=1000,
                 min_samples_leaf=1,
                 max_depth=None,
                 **kwargs):
        """Initialise the instance.

        Args:
            data: passed through to ``RegressionModel.__init__``.
            normalize: passed through to ``RegressionModel.__init__``.
            n_estimators: number of trees in the forest.
            min_samples_leaf: minimum number of samples per leaf.
            max_depth: maximum tree depth (``None`` for unlimited).
            **kwargs: forwarded to ``RegressionModel.__init__``.
        """
        # call parent function.
        RegressionModel.__init__(self, data, normalize=normalize, **kwargs)

        # Reference to the library used: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html
        # The model is a forest of extremely randomised trees.
        self.model = ExtraTreesClassifier(n_estimators=n_estimators,
                                          max_depth=max_depth,
                                          min_samples_leaf=min_samples_leaf)

    def train(self):
        """Fit the model with the training data."""
        # call parent function.
        RegressionModel.train(self)

        # build the forest of trees from the training set.
        self.model.fit(self.train_x, self.train_y)

        # update the is_trained variable.
        self.is_trained = True

    def describe(self):
        """Display model information (currently only the parent's output)."""
        # call parent function.
        RegressionModel.describe(self)

        # Tree-based classifiers expose feature_importances_, which ranks
        # features by gini importance / mean decrease in impurity. The
        # plotting code below is disabled.
        #print(self.model.feature_importances_)

        # plot a bar graph of feature importances - selecting all the features
        #feat_importances = pd.Series(self.model.feature_importances_, index=self.train_x.columns)
        #feat_importances.nlargest(len(self.train_x.columns)).plot(kind='barh')
        #plt.show()

    def test(self):
        """Generate train and test predictions from the fitted model."""
        # call parent function.
        RegressionModel.test(self)

        # predict TRAINING and TESTING data; both are stored as pandas series.
        self.train_predictions = self._predict_as_series(self.train_x)
        self.test_predictions = self._predict_as_series(self.test_x)

        # assess the performance of the predictions.
        self.assess_performance()

    def _predict_as_series(self, features):
        """Return int32 predictions for ``features``, clipped at zero."""
        numpy_predictions = self.model.predict(features).flatten().astype(int)
        return pd.Series(numpy_predictions, dtype="int32").clip(lower=0)
Beispiel #42
0
                annot=True,
                cmap="RdYlGn")

# ### Applying logistic regression and getting the coefficients

# In[970]:

model = LogisticRegression(C=10**2)
# Stratified 90/10 split. `ytest` only exists after this call, so the former
# `ytest = ytest.to_numpy()` line that preceded it was removed: it failed on a
# fresh run and its result was overwritten by the split anyway.
xtrain, xtest, ytrain, ytest = train_test_split(X,
                                                y,
                                                test_size=0.10,
                                                random_state=0,
                                                stratify=y)
model.fit(xtrain, ytrain)
predicted_classes = model.predict(xtest)
accuracy = accuracy_score(ytest.to_numpy().flatten(), predicted_classes)
parameters = model.coef_
print("Accuracy: ", accuracy)
print("Parameters: ", parameters)  # printing the coefficients
cm = confusion_matrix(ytest, predicted_classes)
print(cm)

# ### Getting important features using Random Forest

# In[954]:

# Select features by the importances of a 100-tree random forest.
sel = SelectFromModel(RandomForestClassifier(n_estimators=100))
sel.fit(xtrain, ytrain)

# In[955]:
                        bootstrap=True,
                        bootstrap_features=False,
                        n_jobs=1,
                        random_state=1)
# Bagging model: fit, predict and record the scores.
# NOTE(review): roc_auc_score receives the hard class labels returned by
# predict(), not probability scores - confirm that this is intended.
bag.fit(X_train, y_train)
y_pred = bag.predict(X_test)
score = roc_auc_score(y_test, y_pred)
print("Bag: Area under ROC {0}".format(score))
model_scores.append(score)
s = precision_recall_fscore_support(y_test, y_pred)
scores_f1_pre_re.append(s)

# Extra Trees model: same evaluation steps as above.
from sklearn.ensemble import ExtraTreesClassifier
etc = ExtraTreesClassifier(n_estimators=150, random_state=20, n_jobs=-1)
etc.fit(X_train, y_train)
y_pred = etc.predict(X_test)
score = roc_auc_score(y_test, y_pred)
print("ET: Area under ROC {0}".format(score))
model_scores.append(score)
s = precision_recall_fscore_support(y_test, y_pred)
scores_f1_pre_re.append(s)

# XGBoost model.
import xgboost as xgb
gbm = xgb.XGBClassifier(max_depth=500,
                        n_estimators=150,
                        learning_rate=0.15,
                        colsample_bytree=0.35)
gbm.fit(X_train, y_train)

y_pred = gbm.predict(X_test)
score = roc_auc_score(y_test, y_pred)
Beispiel #44
0
# Subset of feature columns used for both fitting and scoring.
best_columns = [
    'f_138',
    'f_11',
    'f_96',
    'f_200',
    'f_76',
    'f_41',
    'f_83',
    'f_156',
    'f_131',
    'f_84',
    'f_182',
]

# -0.8605
# (presumably the score recorded for this configuration - TODO confirm)
exported_pipeline = ExtraTreesClassifier(max_features=0.367266672504996,
                                         criterion='entropy',
                                         min_samples_leaf=1,
                                         min_samples_split=2,
                                         n_estimators=4464)

# Fit on the selected columns of X against the 'target' column of Y.
exported_pipeline.fit(X[best_columns], Y['target'])

# --- answer module ---
# Read the semicolon-separated scoring file (column names come from `names`),
# predict on the same column subset and write the predictions out.
score_dataset = pd.read_csv('../original_data/x_test.csv',
                            delimiter=';',
                            names=names)
y_pred = exported_pipeline.predict(score_dataset[best_columns])
pd.Series(y_pred).to_csv('../data/answer.csv', index=False)
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Test-set metrics for the previous model (predictions in y_pred2).
print(accuracy_score(y_test, y_pred2))
print(confusion_matrix(y_test, y_pred2))
print(classification_report(y_test, y_pred2))

# print() returns None, so store the error count first and print it after.
n_errors_Ran = (y_pred2 != y_test).sum()
print(n_errors_Ran)
# The kappa score is computed but neither stored nor printed.
cohen_kappa_score(y_test, y_pred2)
# Training accuracy, for comparison with the test accuracy above.
print(accuracy_score(y_train, model2.predict(X_train)))

###################################Extratreeclassifier####################################################

from sklearn.ensemble import ExtraTreesClassifier

model3 = ExtraTreesClassifier()
model3.fit(X_train, y_train)
y_pred3 = model3.predict(X_test)

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
print(accuracy_score(y_test, y_pred3))
print(confusion_matrix(y_test, y_pred3))
print(classification_report(y_test, y_pred3))
# Same fix as above: keep the count instead of print()'s None.
n_errors_ext = (y_pred3 != y_test).sum()
print(n_errors_ext)

# The kappa score is computed but neither stored nor printed.
cohen_kappa_score(y_test, y_pred3)
print(accuracy_score(y_train, model3.predict(X_train)))

####################################Support Vector Machine ####################################################################################

from sklearn.svm import SVC

model4 = SVC()
# In[17]:

from sklearn.model_selection import train_test_split

# In[18]:

# Hold out 30% of the data for testing (treino = train, teste = test).
x_treino, x_teste, y_treino, y_teste = train_test_split(x, y, test_size=0.3)

# In[20]:

from sklearn.ensemble import ExtraTreesClassifier

# Fit a default Extra Trees model and print its accuracy on the test split.
modelo = ExtraTreesClassifier()
modelo.fit(x_treino, y_treino)
resultado = modelo.score(x_teste, y_teste)
print("Acurácia:", resultado)

# In[21]:

# Predict three test rows (positions 400-402) for a spot check.
previsoes = modelo.predict(x_teste[400:403])

# In[22]:

# Bare expression: shows the predictions when run as a notebook cell.
previsoes

# In[23]:

# Bare expression: shows the true labels of the same three rows.
y_teste[400:403]

# In[ ]:
Beispiel #47
0
        feature_set_test.append(feature_extraction(Xtest[i][j]))

# Turn the collected per-sample feature lists into arrays.
feature_sets_train = np.array(feature_set_train)
feature_sets_test = np.array(feature_set_test)

print("Loading Feature Set Matrix...")
print("FeatureSet Train: ", feature_sets_train.shape)
print("FeatureSet Test: ", feature_sets_test.shape)

# In[9]:

# Flatten the label arrays to 1-D.
ytrain = ytrain.reshape(-1, )
ytest = ytest.reshape(-1, )
# print ("ytrain Reshaped!")

# In[10]:

Emodel = ExtraTreesClassifier(n_estimators=150)
Emodel.fit(feature_sets_train, ytrain)

# In[11]:

# Time a single prediction. Both timestamps bracket only the predict() call
# so that console output is not counted in the reported duration.
t1 = time()
pred = Emodel.predict(feature_sets_test[0].reshape(1, -1))
t2 = time()
print("Running the Classifier, Sony Mixed mode... ")
print("Predicted Label: ", pred[0])
print("Time taken per prediction (in sec): ", t2 - t1)

# In[ ]:
Beispiel #48
0
                              n_estimators=30,
                              bootstrap=True,
                              max_features=None,
                              max_depth=7,
                              max_leaf_nodes=7)
et_clf

# In[39]:

# Fit the model and print the accuracy on the training and test splits.
et_clf.fit(x_train, y_train)
print(et_clf.score(x_train, y_train))
print(et_clf.score(x_test, y_test))

# In[40]:

# scikit-learn metrics take (y_true, y_pred) in that order; passing them the
# other way round transposes the confusion matrix.
print(confusion_matrix(y_test, et_clf.predict(x_test)))
print(f1_score(y_test, et_clf.predict(x_test), average='macro'))

# In[41]:

# Predict the submission data and map the class codes to their labels:
# 0 -> 'low', 1 -> 'medium', anything else -> 'high'.
test_pred = et_clf.predict(test_data[train_columns])
submission['target'] = np.where(test_pred == 0, 'low',
                                np.where(test_pred == 1, 'medium', 'high'))
print(np.unique(submission['target'], return_counts=True))
submission.to_csv(cwd + "/submission_v2.csv", index=False)

# ### Analysing with Logistic Regression Model

# In[42]:

from sklearn.linear_model import LogisticRegression
                              n_estimators=50, bootstrap=True,
                              n_jobs=-1, oob_score=True,
                              bootstrap_features=True, max_features=0.5)
# Bagging model: train, report the out-of-bag score and the test accuracy.
print('Training model..')
bag_clf_2.fit(X_train, y_train)
print('Done')
print('oob score:', bag_clf_2.oob_score_)
print('Making predictions..')
y_pred = bag_clf_2.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred))
# RandomForest Classifier
# (this heading used to be bare text, which is a syntax error)
rf_clf = RandomForestClassifier(n_estimators=50, bootstrap=True,
                                max_leaf_nodes=16, n_jobs=-1, oob_score=True)
print('Training model..')
rf_clf.fit(X_train, y_train)
print('Done.')
print('oob score:', rf_clf.oob_score_)
print('Making predictions..')
y_pred = rf_clf.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred))
# ExtraTree Classifier
ext_clf = ExtraTreesClassifier(n_estimators=50, bootstrap=True,
                               max_leaf_nodes=16, n_jobs=-1, oob_score=True)
print('Training model..')
ext_clf.fit(X_train, y_train)
print('Done.')
print('oob score:', ext_clf.oob_score_)
print('Making predictions..')
y_pred = ext_clf.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred))
Beispiel #50
0
    stacking_model = StackingModel(topLayer_model, base_model_list)
    stacking_model.fit(X_train, y_train, X_test)
    print('stacking_model:', getAuc(y_test, stacking_model.predict()))


    print("other_model>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
    rf_model = RandomForestClassifier()
    adb_model = AdaBoostClassifier()
    gdbc_model = GradientBoostingClassifier()
    et_model = ExtraTreesClassifier()

    rf_model.fit(X_train, y_train)
    adb_model.fit(X_train, y_train)
    gdbc_model.fit(X_train, y_train)
    et_model.fit(X_train, y_train)

    print('rf_model:', getAuc(y_test, rf_model.predict(X_test)))
    print('adb_model:', getAuc(y_test, adb_model.predict(X_test)))
    print('gdbc_model:', getAuc(y_test, gdbc_model.predict(X_test)))
    print('et_model:', getAuc(y_test, et_model.predict(X_test)))





'''
终于搞定stacking,太开心了!!!
之前一直以为,这个东西贼他妈神秘;搞懂了,发现贼他妈简单!!!!
'''

Beispiel #51
0
######################################################################################################
# Train
# Stratified 75/25 split of the data and labels.
X_train, X_test, y_train, y_test = train_test_split(data2,
                                                    training_labels,
                                                    stratify=training_labels,
                                                    test_size=0.25)

# NOTE(review): `percent` is not filled in the visible part of this script.
percent = list()
for bagging_run in range(0, 30):

    # train 3 classifiers
    # Each one is fitted on the same split; no random_state is set, so the
    # fitted forests can differ between repetitions.
    final_preds = []
    for x in range(0, 3):
        clf = ExtraTreesClassifier(n_estimators=100)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        final_preds.append(y_pred)
        # print(classification_report(y_test, y_pred, labels=[1, 2, 3, 4, 5, 6, 7]))

    # One row per test sample, one column per model plus a final column.
    summary = np.zeros(shape=(len(final_preds[0]), len(final_preds) + 1))

    # Create an array with all predictions in a row
    # with the correct prediction at the end
    # summary = pred pred pred ... correct_y
    # loop through all predictions
    for j in range(len(final_preds[0])):
        # loop through number of predictor models
        for i in range(len(final_preds)):
            summary[j][i] = final_preds[i][j]
        # Append correct pred
        # summary[0].shape[0] - 1 is the index of the last column.
        summary[j][summary[0].shape[0] - 1] = y_test.iloc[j]
Beispiel #52
0
# Earth model (py-earth) implementation
from matplotlib import pyplot
# create model
from pyearth import Earth

model = Earth()

# fit the earth model
model.fit(X, y)
print(" Model:")
print(model)

# make predictions
# The model is evaluated on the same data it was fitted on.
expected = y
predicted = model.predict(X)

# since the quality can only be a number, round all the outputs off
for i in range(len(predicted)):
    predicted[i] = int(round(predicted[i]))

# check how far the predictions are from actual values
# `difference` keeps the signed errors; `diversion` is the mean absolute error.
difference = list()
total_diff = 0.
for i in range(len(y)):
    diff = predicted[i] - expected[i]
    difference.append(diff)
    total_diff += abs(diff)
diversion = total_diff / len(y)

# check how many of predictions match actual values
Beispiel #53
0
class ExtraTreesClassifier:
    """Incrementally fitted wrapper around scikit-learn's extra-trees forest.

    The underlying estimator is created with ``warm_start=True`` and grown in
    batches until it holds ``get_max_iter()`` trees.
    """

    def __init__(self, criterion, min_samples_leaf, min_samples_split,
                 max_features, bootstrap, max_leaf_nodes, max_depth,
                 min_weight_fraction_leaf, min_impurity_decrease,
                 oob_score=False, n_jobs=1, random_state=None, verbose=0,
                 class_weight=None):
        """Validate and normalise the hyper-parameters.

        Raises:
            ValueError: if ``criterion`` is neither ``"gini"`` nor
                ``"entropy"``.
        """
        # Target forest size; the estimator grows towards it batch by batch.
        self.n_estimators = self.get_max_iter()
        self.estimator_increment = 10

        allowed_criteria = ("gini", "entropy")
        if criterion not in allowed_criteria:
            raise ValueError("'criterion' is not in ('gini', 'entropy'): "
                             "%s" % criterion)
        self.criterion = criterion

        # check_none decides whether the value stands for "no limit".
        self.max_depth = None if check_none(max_depth) else int(max_depth)
        self.max_leaf_nodes = (None if check_none(max_leaf_nodes)
                               else int(max_leaf_nodes))

        # Remaining values are coerced to the types the estimator expects.
        self.min_samples_leaf = int(min_samples_leaf)
        self.min_samples_split = int(min_samples_split)
        self.max_features = float(max_features)
        self.bootstrap = check_for_bool(bootstrap)
        self.min_weight_fraction_leaf = float(min_weight_fraction_leaf)
        self.min_impurity_decrease = float(min_impurity_decrease)
        self.oob_score = oob_score
        self.n_jobs = int(n_jobs)
        self.random_state = random_state
        self.verbose = int(verbose)
        self.class_weight = class_weight
        self.estimator = None

    def fit(self, X, y, sample_weight=None):
        """Fit from scratch, adding trees until the forest is complete.

        Starts with a fresh two-tree forest, then keeps calling
        ``iterative_fit`` with doubling batch sizes (2, 4, 8, ...).

        Returns:
            ``self``.
        """
        self.iterative_fit(X, y, n_iter=2, refit=True,
                           sample_weight=sample_weight)
        exponent = 2
        while not self.configuration_fully_fitted():
            batch = int(2**exponent / 2)
            self.iterative_fit(X, y, n_iter=batch,
                               sample_weight=sample_weight)
            exponent += 1
        return self

    def iterative_fit(self, X, y, sample_weight=None, n_iter=1, refit=False):
        """Create the estimator, or add ``n_iter`` trees to it, and fit it.

        Args:
            X, y: training data and labels.
            sample_weight: optional per-sample weights.
            n_iter: number of trees for a new forest, or to add to an
                existing one (capped at ``self.n_estimators``).
            refit: when true, discard any existing estimator first.

        Returns:
            ``self``.
        """
        from sklearn.ensemble import ExtraTreesClassifier as ETC

        if refit:
            self.estimator = None

        if self.estimator is not None:
            # Grow the warm-started forest, never beyond the target size.
            grown = self.estimator.n_estimators + n_iter
            self.estimator.n_estimators = min(grown, self.n_estimators)
        else:
            # max_features acts as an exponent on the number of columns.
            n_columns = X.shape[1]
            max_features = int(n_columns**float(self.max_features))
            self.estimator = ETC(
                n_estimators=n_iter,
                criterion=self.criterion,
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                bootstrap=self.bootstrap,
                max_features=max_features,
                max_leaf_nodes=self.max_leaf_nodes,
                min_weight_fraction_leaf=self.min_weight_fraction_leaf,
                min_impurity_decrease=self.min_impurity_decrease,
                oob_score=self.oob_score,
                n_jobs=self.n_jobs,
                verbose=self.verbose,
                random_state=self.random_state,
                class_weight=self.class_weight,
                warm_start=True)

        self.estimator.fit(X, y, sample_weight=sample_weight)
        return self

    def configuration_fully_fitted(self):
        """Return True once the forest holds ``self.n_estimators`` trees."""
        if self.estimator is None:
            return False
        return len(self.estimator.estimators_) >= self.n_estimators

    def predict(self, X):
        """Predict labels for ``X``.

        Raises:
            NotImplementedError: if no estimator has been fitted yet.
        """
        if self.estimator is None:
            raise NotImplementedError
        return self.estimator.predict(X)

    def predict_proba(self, X):
        """Predict class probabilities for ``X``.

        The estimator's output is passed through
        ``convert_multioutput_multiclass_to_multilabel`` before returning.

        Raises:
            NotImplementedError: if no estimator has been fitted yet.
        """
        if self.estimator is None:
            raise NotImplementedError()
        return convert_multioutput_multiclass_to_multilabel(
            self.estimator.predict_proba(X))

    @staticmethod
    def get_max_iter():
        """Return the target number of trees."""
        return 512
Beispiel #54
0
# Metrics for the AdaBoost predictions computed just before this point,
# followed by its mean 10-fold cross-validated accuracy on the training data.
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
scores = cross_val_score(ada_boost_classifier, X_train, y_train, cv=10, scoring='accuracy')
print(scores.mean())
# Each following section repeats the same steps for another classifier:
# fit, predict the test split, print confusion matrix and accuracy, then the
# mean 10-fold cross-validated accuracy.
print("BaggingClassifier")
bagging_classifier = BaggingClassifier()
bagging_classifier.fit(X_train, y_train)
y_pred = bagging_classifier.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
scores = cross_val_score(bagging_classifier, X_train, y_train, cv=10, scoring='accuracy')
print(scores.mean())
print("ExtraTreesClassifier")
extra_trees_classifier = ExtraTreesClassifier(n_estimators=100)
extra_trees_classifier.fit(X_train, y_train)
y_pred = extra_trees_classifier.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
scores = cross_val_score(extra_trees_classifier, X_train, y_train, cv=10, scoring='accuracy')
print(scores.mean())
print("GradientBoostingClassifier")
gradient_boosting_classifier = GradientBoostingClassifier()
gradient_boosting_classifier.fit(X_train, y_train)
y_pred = gradient_boosting_classifier.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
scores = cross_val_score(gradient_boosting_classifier, X_train, y_train, cv=10, scoring='accuracy')
print(scores.mean())
print("RandomForestClassifier")
random_forest_classifier = RandomForestClassifier(n_estimators=100)
random_forest_classifier.fit(X_train, y_train)
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

# In[ ]:

# Fit a 400-tree extra-trees classifier on (x, y) and predict labels for
# test_x. The random-forest alternative is kept below, disabled.
# random_forest = RandomForestClassifier(n_estimators=100)
# random_forest.fit(x_tr, y_tr)
# random_forest.score(x_tr, y_tr)
etc = ExtraTreesClassifier(n_estimators=400)
etc.fit(x, y)
ypred = etc.predict(test_x)

# In[ ]:

# ypred=random_forest.predict(test_x)

# In[ ]:

# Identifier column taken from `test2`.
# NOTE(review): assumes the rows of `test2` line up with the rows of `test_x`
# -- confirm where both are built.
passenger = test2['PassengerId']

# In[ ]:

# Two-column frame pairing each PassengerId with its predicted "Survived" value.
submission = pd.DataFrame({"PassengerId": passenger, "Survived": ypred})

# In[ ]:
# Print f1, f2 (both assigned before this excerpt) and their harmonic mean,
# 2 * f1 * f2 / (f1 + f2).
print(f1)
print(f2)
print(f1 * f2 / (f1 + f2) * 2)
# Configured here but not fitted within this excerpt.
rf3 = RandomForestClassifier(n_estimators=100,
                             max_depth=6,
                             min_samples_split=2,
                             class_weight="balanced")

######extratrees
from sklearn.ensemble import ExtraTreesClassifier
etc = ExtraTreesClassifier(n_estimators=100,
                           max_depth=10,
                           min_samples_split=2,
                           random_state=0)
etc.fit(x1, y1)
# Predictions on four inputs: xt1, ax1, nx1 and the training matrix x1 itself.
r4 = etc.predict(xt1)
r5 = etc.predict(ax1)
r6 = etc.predict(nx1)
r7 = etc.predict(x1)
print(pd.Series(r4).value_counts())
# value_counts(1) passes a truthy `normalize`, so these two lines print
# relative frequencies: the share of label 0 in r5 and of label 1 in r6.
# Indexing with [0] / [1] raises KeyError if that label was never predicted.
print(pd.Series(r5).value_counts(1)[0])
print(pd.Series(r6).value_counts(1)[1])
print(pd.Series(r7).value_counts())
# f1: fraction of r6 predicted as 1.
# f2: count of 1s in r6 divided by the count of 1s in r6 plus r5.
# NOTE(review): presumably recall and precision, if nx1 holds only positives
# and ax1 only negatives -- confirm. Despite its name, `f1` is not the F1
# score; the last print below is.
f1 = pd.Series(r6).value_counts(1)[1]
f2 = pd.Series(r6).value_counts()[1] / (pd.Series(r6).value_counts()[1] +
                                        pd.Series(r5).value_counts()[1])
print(f1)
print(f2)
print(f1 * f2 / (f1 + f2) * 2)

#####adaboost
Beispiel #57
0
def cross_val(clf_name,
              X,
              y,
              n_folds=5,
              proba=False,
              score=accuracy_score,
              *params,
              **kwargs):
    """Cross-validate one of the preset classifiers on stratified folds.

    Args:
        clf_name: Key of the preset to build; one of "extra", "grad",
            "cgrad", "cmulti", "multi", "bag", "bern", "gauss", "random",
            "lda", "logistic", "svm", "knn", "near", "ridge" or "sgd".
        X: Feature matrix, indexed as ``X[rows, :]``.
        y: Label array, indexed as ``y[rows]``.
        n_folds: Number of stratified folds.
        proba: If true, collect ``predict_proba`` output instead of
            ``predict`` output.
        score: Callable ``score(y_true, y_pred)``; evaluated on every fold
            and once more on all out-of-fold predictions.
        *params: Extra positional arguments forwarded to the estimator
            constructor (the "sgd" preset ignores them).
        **kwargs: Extra keyword arguments forwarded the same way.

    Returns:
        Array of out-of-fold predictions. It has the shape of ``y`` for
        ``proba=False``; for ``proba=True`` it has one row per sample and the
        trailing shape of the ``predict_proba`` output.

    Raises:
        ValueError: if ``clf_name`` is not a known preset.
    """
    cv = StratifiedKFold(y, n_folds=n_folds, shuffle=True, random_state=41)

    # Factories are lazy so only the requested estimator is constructed and
    # the module-level ``alpha_multi`` is only looked up for "cmulti".
    presets = {
        "extra": lambda: ExtraTreesClassifier(
            12, max_depth=23, max_features=10, n_jobs=-1, *params, **kwargs),
        "grad": lambda: GradientBoostingClassifier(
            n_estimators=40, learning_rate=0.1, *params, **kwargs),
        "cgrad": lambda: CalibratedClassifierCV(
            base_estimator=GradientBoostingClassifier(
                n_estimators=20, learning_rate=0.1, *params, **kwargs),
            method='isotonic',
            cv=10),
        "cmulti": lambda: CalibratedClassifierCV(
            base_estimator=MultinomialNB(
                alpha=alpha_multi, *params, **kwargs),
            method='isotonic',
            cv=10),
        "multi": lambda: MultinomialNB(*params, **kwargs),
        "bag": lambda: BaggingClassifier(
            base_estimator=MultinomialNB(alpha=0.5, *params, **kwargs),
            n_estimators=100,
            n_jobs=-1),
        "bern": lambda: BernoulliNB(alpha=0.00000000001, *params, **kwargs),
        "gauss": lambda: GaussianNB(*params, **kwargs),
        "random": lambda: RandomForestClassifier(
            1200, max_depth=23, max_features=10, n_jobs=-1, *params,
            **kwargs),
        "lda": lambda: LinearDiscriminantAnalysis(*params, **kwargs),
        "logistic": lambda: LogisticRegression(C=1, *params, **kwargs),
        "svm": lambda: LinearSVC(C=100, *params, **kwargs),
        "knn": lambda: KNeighborsClassifier(
            n_neighbors=20, *params, **kwargs),
        "near": lambda: NearestCentroid(*params, **kwargs),
        "ridge": lambda: OneVsOneClassifier(
            RidgeClassifier(alpha=0.1, *params, **kwargs)),
        "sgd": lambda: SGDClassifier(loss="hinge",
                                     penalty="l2",
                                     n_iter=50,
                                     alpha=0.000001,
                                     fit_intercept=True,
                                     average=True),
    }
    # Fail early and clearly instead of hitting an unbound ``c`` in the loop.
    if clf_name not in presets:
        raise ValueError('unknown clf_name "{}"; expected one of: {}'.format(
            clf_name, ", ".join(sorted(presets))))
    c = presets[clf_name]()

    y_pred = np.zeros(y.shape)
    for fold, (train, test) in enumerate(cv):
        c.fit(X[train, :], y[train])
        if proba:
            fold_pred = np.asarray(c.predict_proba(X[test, :]))
            # predict_proba yields one column per class, which cannot be
            # stored in a buffer shaped like 1-D ``y``; widen the buffer once,
            # before anything has been written to it.
            if fold == 0 and fold_pred.shape[1:] != y_pred.shape[1:]:
                y_pred = np.zeros((y_pred.shape[0], ) + fold_pred.shape[1:])
            y_pred[test] = fold_pred
        else:
            y_pred[test] = c.predict(X[test, :])
        print(score(y[test], y_pred[test]))
    print("Final score", score(y, y_pred))
    return y_pred
# Rescale both feature matrices in place.
# NOTE(review): `scale` is called on the training and the testing matrix
# independently, so (if this is sklearn.preprocessing.scale) each set is
# standardized with its own statistics rather than the training set's --
# confirm this is intended.
training_features = scale(training_features)
testing_features = scale(testing_features)

# Each model below (all constructed before this excerpt) is fitted on the
# training features and scored on the testing features with f1_score.
SVM_lin.fit(training_features, labels_array_train)
predictions_SVM_lin = SVM_lin.predict(testing_features)
print("f1 Score SVM with linear kernel : ",
      f1_score(y_true=labels_array_test, y_pred=predictions_SVM_lin))

SVM_rbf.fit(training_features, labels_array_train)
predictions_SVM_rbf = SVM_rbf.predict(testing_features)
print("f1 Score SVM with Gaussian kernel : ",
      f1_score(y_true=labels_array_test, y_pred=predictions_SVM_rbf))

xtr.fit(training_features, labels_array_train)
predictions_XTR = xtr.predict(testing_features)
print("f1 Score XTR : ",
      f1_score(y_true=labels_array_test, y_pred=predictions_XTR))

# The estimator is named `gbm` while the output is labelled "XGB".
gbm.fit(training_features, labels_array_train)
predictions_XGB = gbm.predict(testing_features)
print("f1 Score XGB : ",
      f1_score(y_true=labels_array_test, y_pred=predictions_XGB))

#model.fit(training_features, labels_array_train, nb_epoch = 4, batch_size = 258)
#predictions_DL = model.predict_classes(testing_features)
#print("f1 Score DL : ", f1_score(y_true=labels_array_test, y_pred = np.round(predictions_DL)))

#==============================================================================

# Issue prediction
Beispiel #59
0
                       np.array(gbdt.predict_proba(x_test)[:, 1]) > 0.4))

    #--Weighted averaging--#
    # Blend the second probability column ([:, 1]) of four fitted models with
    # fixed weights 0.2 (gbdt), 0.5 (rf), 0.2 (et) and 0.1 (lr1), which sum
    # to 1. The blend is scored with the Brier loss, then thresholded at 0.5
    # for accuracy.
    wa_prob = 0.2 * gbdt.predict_proba(x_test)[:, 1] + 0.5 * rf.predict_proba(
        x_test)[:, 1] + 0.2 * et.predict_proba(
            x_test)[:, 1] + 0.1 * lr1.predict_proba(x_test)[:, 1]
    wa_brier.append(brier_score_loss(y_test, wa_prob))
    print('accuracy rate of weighted averaging:',
          accuracy_score(y_test,
                         np.array(wa_prob) >= 0.5))
    wa_score1.append(accuracy_score(y_test, np.array(wa_prob) >= 0.5))
    del wa_prob

    #--Majority voting--#
    # Weighted sum of the hard predictions with weights 0.24 (gbdt), 0.27 (rf),
    # 0.24 (lr1) and 0.25 (et); a sample is counted positive when the sum
    # reaches 0.5. The Brier loss here is computed on that vote sum, not on a
    # probability estimate.
    # NOTE(review): assumes the predicted labels are 0/1 -- confirm.
    mv_prob = 0.24 * gbdt.predict(x_test) + 0.27 * rf.predict(
        x_test) + 0.24 * lr1.predict(x_test) + 0.25 * et.predict(x_test)
    mv_brier.append(brier_score_loss(y_test, mv_prob))
    print('accuracy rate of majority voting:',
          accuracy_score(y_test,
                         np.array(mv_prob) >= 0.5))
    mv_score1.append(accuracy_score(y_test, np.array(mv_prob) >= 0.5))
    del mv_prob

#----------Output accuracy rate----------#
# Print the mean of each model's collected accuracy values, one line per
# model, in a fixed order. The support vector machine row stays disabled.
for _caption, _accuracies in (
        # ('accuracy rate of support vector machine:', svc_accuracy),
        ('accuracy rate of k nearest neighbors:', knn_accuracy),
        ('accuracy rate of logistic regression with lasso:', lr1_accuracy),
        ('accuracy rate of logistic regression with ridge:', lr2_accuracy),
        ('accuracy rate of decision tree:', dt_accuracy),
        ('accuracy rate of extremely randomized trees:', et_accuracy),
        ('accuracy rate of random forest:', rf_accuracy),
):
    print(_caption, mean(_accuracies))
Beispiel #60
0
class Extractor(BaseEstimator, ClassifierMixin):
    """
    An sklearn-style classifier that extracts the main content (and/or comments)
    from an HTML document.

    Args:
        blockifier (``Blockifier``)
        features (str or List[str], ``Features`` or List[``Features``], or List[Tuple[str, ``Features``]]):
            One or more features to be used to transform blocks into a matrix of
            numeric values. If more than one, a :class:`FeatureUnion` is
            automatically constructed. See :func:`get_and_union_features`.
        model (:class:`ClassifierMixin`): A scikit-learn classifier that takes
            a numeric matrix of features and outputs a binary prediction of
            1 for content or 0 for not-content. If None, a :class:`ExtraTreesClassifier`
            with default parameters is used.
        to_extract (str or Sequence[str]): Type of information to extract from
            an HTML document: 'content', 'comments', or both via ['content', 'comments'].
        prob_threshold (float): Minimum prediction probability of a block being
            classified as "content" for it actually be taken as such.
        max_block_weight (int): Maximum weight that a single block may be given
            when training the extractor model, where weights are set equal to
            the number of tokens in each block. If None, weights are not capped.

    Note:
        If ``prob_threshold`` is not None, then ``model`` must implement the
            ``predict_proba()`` method.
    """
    def __init__(self,
                 blockifier=TagCountNoCSSReadabilityBlockifier,
                 features=('kohlschuetter', 'weninger', 'readability'),
                 model=None,
                 to_extract='content',
                 prob_threshold=0.5,
                 max_block_weight=200):
        self.blockifier = blockifier
        # goes through the ``features`` property setter below
        self.features = features
        # initialize model
        if model is None:
            self.model = ExtraTreesClassifier()
        elif isinstance(model, ClassifierMixin):
            self.model = model
        else:
            raise TypeError('invalid `model` type: "{}"'.format(type(model)))
        # normalize ``to_extract`` to a tuple of strings
        if isinstance(to_extract, string_):
            self.to_extract = (to_extract, )
        else:
            self.to_extract = tuple(to_extract)
        self.prob_threshold = prob_threshold
        self.max_block_weight = max_block_weight
        # column of ``predict_proba`` output for class 1; resolved lazily
        self._positive_idx = None

    @property
    def features(self):
        """The feature transformer built from the ``features`` argument."""
        return self._features

    @features.setter
    def features(self, feats):
        self._features = get_and_union_features(feats)

    def fit(self, documents, labels, weights=None):
        """
        Fit :class`Extractor` features and model to a training dataset.

        Documents that yield fewer than 3 blocks are dropped, together with
        their labels and weights.

        Args:
            documents (Sequence[str]): HTML documents, one per training example
            labels (Sequence[``np.ndarray``]): block labels, one array per document
            weights (Sequence[``np.ndarray``]): optional block weights, one
                array per document

        Returns:
            :class`Extractor`
        """
        block_groups = [self.blockifier.blockify(doc) for doc in documents]
        keep = [self._has_enough_blocks(blocks) for blocks in block_groups]
        # Filter with plain lists: per-document block lists and label arrays
        # differ in length, and building a ragged ``np.array`` from them just
        # to boolean-mask it is rejected by newer NumPy versions.
        block_groups = [
            blocks for blocks, ok in zip(block_groups, keep) if ok
        ]
        labels = np.concatenate(
            [doc_labels for doc_labels, ok in zip(labels, keep) if ok])

        # TODO: This only 'fit's one doc at a time. No feature fitting actually
        # happens for now, but this might be important if the features change
        features_mat = np.concatenate(
            [self.features.fit_transform(blocks) for blocks in block_groups])
        if weights is None:
            self.model.fit(features_mat, labels)
        else:
            weights = np.concatenate(
                [doc_weights for doc_weights, ok in zip(weights, keep) if ok])
            self.model.fit(features_mat, labels, sample_weight=weights)
        return self

    def get_html_labels_weights(self, data):
        """
        Gather the html, labels, and weights of many files' data.
        Primarily useful for training/testing an :class`Extractor`.

        Args:
            data: Output of :func:`dragnet.data_processing.prepare_all_data`;
                iterated as ``(html, content, comments)`` triples.

        Returns:
            Tuple[np.array, np.array, np.array]: All html documents, all
                labels, and all weights, respectively, with one entry per
                document.
        """
        all_html = []
        all_labels = []
        all_weights = []
        for html, content, comments in data:
            all_html.append(html)
            labels, weights = self._get_labels_and_weights(content, comments)
            all_labels.append(labels)
            all_weights.append(weights)
        return np.array(all_html), np.array(all_labels), np.array(all_weights)

    def _has_enough_blocks(self, blocks):
        """Return True if ``blocks`` has at least 3 items; else warn and return False."""
        if len(blocks) < 3:
            logging.warning('extraction failed: too few blocks (%s)',
                            len(blocks))
            return False
        return True

    def _get_labels_and_weights(self, content, comments):
        """
        Select the labels and weights matching ``self.to_extract`` and cap
        the weights at ``self.max_block_weight``.

        Args:
            content (Tuple[np.array[int], np.array[int], List[str]])
            comments (Tuple[np.array[int], np.array[int], List[str]])

        Returns:
            Tuple[np.array[int], np.array[int]]: labels and weights
        """
        # extract content and comments
        if 'content' in self.to_extract and 'comments' in self.to_extract:
            labels = np.logical_or(content[0], comments[0]).astype(int)
            # no trailing comma: weights must stay an array, not a 1-tuple
            weights = content[1]
        # extract content only
        elif 'content' in self.to_extract:
            labels = content[0]
            weights = content[1]
        # extract comments only
        else:
            labels = comments[0]
            weights = comments[1]
        # cap per-block weights only when a maximum is configured
        if self.max_block_weight is not None:
            weights = np.minimum(weights, self.max_block_weight)

        return labels, weights

    def extract(self, html, encoding=None, as_blocks=False):
        """
        Extract the main content and/or comments from an HTML document and
        return it as a string or as a sequence of block objects.

        Args:
            html (str): HTML document as a string.
            encoding (str): Encoding of ``html``. If None (encoding unknown), the
                original encoding will be guessed from the HTML itself.
            as_blocks (bool): If False, return the main content as a combined
                string; if True, return the content-holding blocks as a list of
                block objects.

        Returns:
            str or List[Block]
        """
        preds, blocks = self.predict(html,
                                     encoding=encoding,
                                     return_blocks=True)
        if as_blocks is False:
            return str_cast(b'\n'.join(blocks[ind].text
                                       for ind in np.flatnonzero(preds)))
        else:
            return [blocks[ind] for ind in np.flatnonzero(preds)]

    def predict(self, documents, **kwargs):
        """
        Predict class (content=1 or not-content=0) of the blocks in one or many
        HTML document(s).

        Args:
            documents (str or List[str]): HTML document(s)
            **kwargs: forwarded to :meth:`_predict_one` (``encoding``,
                ``return_blocks``)

        Returns:
            ``np.ndarray``: array of binary predictions for content (1) or
                not-content (0); for several documents, the per-document
                arrays concatenated into one.
        """
        if isinstance(documents,
                      (str, bytes, unicode_, np.unicode_, etree._Element)):
            return self._predict_one(documents, **kwargs)
        else:
            return np.concatenate(
                [self._predict_one(doc, **kwargs) for doc in documents])

    def _predict_one(self, document, encoding=None, return_blocks=False):
        """
        Predict class (content=1 or not-content=0) of each block in an HTML
        document.

        Args:
            document (str): HTML document
            encoding (str): passed on to the blockifier
            return_blocks (bool): if True, also return the blocks

        Returns:
            ``np.ndarray``: array of binary predictions for content (1) or
            not-content (0); or the tuple ``(predictions, blocks)`` when
            ``return_blocks`` is True.
        """
        # blockify
        blocks = self.blockifier.blockify(document, encoding=encoding)
        # get features
        try:
            features = self.features.transform(blocks)
        except ValueError:  # Can't make features, predict no content
            preds = np.zeros((len(blocks)))
        # make predictions
        else:
            if self.prob_threshold is None:
                preds = self.model.predict(features)
            else:
                # Resolve the class-1 column once. Test against None
                # explicitly: a cached index of 0 is falsy and would
                # otherwise be looked up again on every call.
                if self._positive_idx is None:
                    self._positive_idx = list(self.model.classes_).index(1)
                preds = self.model.predict_proba(
                    features) > self.prob_threshold
                preds = preds[:, self._positive_idx].astype(int)

        if return_blocks:
            return preds, blocks
        else:
            return preds