Example #1
def predict_TestData(Food_df,People_df):
    cTrainF = rand(len(Food_df)) > .5
    cTestF = ~cTrainF
    cTrainP = rand(len(People_df)) > .5
    cTestP = ~cTrainP

    TrainX_df = pd_concat([People_df[cTrainP], Food_df[cTrainF]],axis=0)
    TestX_df = pd_concat([People_df[cTestP], Food_df[cTestF]],axis=0)

    TrainX= TrainX_df.ix[:,2:].values
    TestX= TestX_df.ix[:,2:].values
    TrainY = concatenate([ones(len(People_df[cTrainP])), zeros(len(Food_df[cTrainF]))])
    TestY = concatenate([ones(len(People_df[cTestP])), zeros(len(Food_df[cTestF]))])

    ET_classifier = ExtraTreesClassifier(n_estimators=50, max_depth=None, min_samples_split=1, random_state=0)
    ET_classifier.fit(TrainX,TrainY)
    ET_prediction = ET_classifier.predict(TestX) 

    LinSVC_classifier = svm.LinearSVC()
    LinSVC_classifier.fit(TrainX,TrainY)
    LinSVC_predict = LinSVC_classifier.predict(TestX)

    a=DataFrame()
    a["url"]=TestX_df.urls.values
    a["answer"]=TestY
    a["ET_predict"]=ET_prediction
    a["LinSVC_predict"]=LinSVC_predict
    a.to_csv("prediction_for_TestData.csv")
class Identifier:
	def __init__(self,grabable = set([]),clf = None):
		self.grabable = grabable #TODO if we care to, not used at the mo
		self.orb = orb = cv2.ORB(nfeatures = 1000)#,nlevels = 20, scaleFactor = 1.05)
		self.items = [ "champion_copper_plus_spark_plug", "cheezit_big_original","crayola_64_ct", "dove_beauty_bar", "elmers_washable_no_run_school_glue","expo_dry_erase_board_eraser", "feline_greenies_dental_treats","first_years_take_and_toss_straw_cups", "genuine_joe_plastic_stir_sticks","highland_6539_self_stick_notes", "kong_air_dog_squeakair_tennis_ball","kong_duck_dog_toy", "kong_sitting_frog_dog_toy", "kygen_squeakin_eggs_plush_puppies","mark_twain_huckleberry_finn", "mead_index_cards","mommys_helper_outlet_plugs","munchkin_white_hot_duck_bath_toy", "one_with_nature_soap_dead_sea_mud","oreo_mega_stuf", "paper_mate_12_count_mirado_black_warrior","rollodex_mesh_collection_jumbo_pencil_cup", "safety_works_safety_glasses", "sharpie_accent_tank_style_highlighters", "stanley_66_052" ]
		if not clf:
			print "Training new classifier"
			self.clf =ExtraTreesClassifier(min_samples_split = 1,n_jobs = -1,n_estimators = 150, class_weight = 'subsample')
			X = np.ascontiguousarray(joblib.load('labels.pkl'))
			Y = np.ascontiguousarray(joblib.load('features.pkl'), dtype = np.float64)
			Y = preprocessing.scale(Y)
			self.clf.fit(Y,X)
		else:
			self.clf = clf
	def identify(self,im,possibilites):
		if im is not None:
			kpTest, desTest = self.orb.detectAndCompute(im,None)
			pred = self.clf.predict(preprocessing.scale(np.array(desTest,dtype = np.float64)))
			c = Counter(pred)
			r = [(k,c[k]) for k in sorted(set(c.keys())&possibilites, key  = lambda k: c[k],reverse = True)]
			if r:
				item = r[0][0]
				print self.items[item],
				return item
			else:
				return -1

		else:
			print "Image to recognize is None"
Example #3
def stack(X, y, X_test, y_test):
    X, X1, y, y1 = train_test_split(X, y, test_size=0.5)
    #clf1 = GradientBoostingClassifier(n_estimators=10)
    #clf1 = RandomForestClassifier(n_estimators=20)
    clf1 = ExtraTreesClassifier(n_estimators=10, max_depth=None, min_samples_split=1, random_state=0)
    clf2 = linear_model.SGDClassifier(loss='log')
    enc = OneHotEncoder()
    #clf2 = RandomForestClassifier(n_estimators=10)
    #clf2 = GradientBoostingClassifier(n_estimators=20)
    clf1.fit(X, y)
    enc.fit(clf1.apply(X))
    clf2.fit(enc.transform(clf1.apply(X1)), y1)

    #prob = clf2.predict_proba(enc.transform(clf1.apply(X_test)[:, :, 0]))[:, 1]

    prob = clf2.predict_proba(enc.transform(clf1.apply(X_test)).toarray())[:, 1]
    res = clf2.predict(enc.transform(clf1.apply(X_test)))        
    check = zip(y_test, res)
    tp, tn, fp, fn = 0, 0, 0, 0
    for value, prediction in check:
        if (prediction and value):
            tp += 1
        if (prediction and not value):
            fp += 1
        if (not prediction and value):
            fn += 1
        if (not prediction and not value):
            tn += 1
    print ('TP: {0}, TN: {1}, FP: {2}, FN: {3}'.format(tp, tn, fp, fn))
    print ("Precision Score : %f" % metrics.precision_score(y_test, res))
    print ("Recall Score : %f" % metrics.recall_score(y_test, res))
    return roc_curve(y_test, prob)
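A minimal driver for the stacking routine above, shown only as a sketch: it assumes the imports stack() relies on (train_test_split, ExtraTreesClassifier, OneHotEncoder, SGDClassifier, metrics, roc_curve) are in scope and an older scikit-learn that still accepts min_samples_split=1; the synthetic data and names here are purely illustrative.

# Hypothetical usage sketch for stack(): build synthetic data, run the
# tree-embedding + linear-model stack, and plot the resulting ROC curve.
from sklearn.datasets import make_classification
import matplotlib.pyplot as plt

X_all, y_all = make_classification(n_samples=2000, n_features=20, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X_all, y_all, test_size=0.3)

fpr, tpr, _ = stack(X_tr, y_tr, X_te, y_te)  # stack() returns roc_curve(y_test, prob)
plt.plot(fpr, tpr)
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.show()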
Example #4
def learn(f):
    global raw_data
    print 'testing classifier'
    data = raw_data[raw_data['label'] != 'unknown']
    data = data[data['file type'] == 'EXECUTE']
    X = data.as_matrix(f)
    y = np.array(data['label'].tolist())
    #clf = RandomForestClassifier(n_estimators=100)
    clf = ExtraTreesClassifier(n_estimators=100)
    #clf = AdaBoostClassifier()
    scores = sklearn.cross_validation.cross_val_score(clf, X, y, cv=10)
    print("predicted accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    seed = 3301
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=seed)
    clf.fit(X_train, y_train)
    scores = clf.score(X_test, y_test)
    print("actual accuracy: %0.2f" % scores)
    importances = zip(f, clf.feature_importances_)
    importances.sort(key=lambda k:k[1], reverse=True)
    for im in importances[0:20]:
        print im[0].ljust(30), im[1]
    #y_pred = clf.predict(X_test)
    #labels = ['good', 'bad']
    #cm = confusion_matrix(y_test, y_pred, labels)
    #plot_cm(cm, labels)
    #joblib.dump(clf, 'model.pkl')
    return clf
Example #5
def plotFeatureImportances(x, y, fieldNames, numTrees = 100):
    print fieldNames
    # fit
    forest = ExtraTreesClassifier(n_estimators=numTrees, compute_importances=True, random_state=0)
    forest.fit(x, y)

    # get importances
    importances = forest.feature_importances_
    print sum(importances)
    std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)
    indices = np.argsort(importances)[::-1]

    # present
    numFeatures = len(importances)
    print 'feature ranking:'
    for i in xrange(numFeatures):
        print '%d. feature %d (%s) has importance %f' % (i+1, indices[i], fieldNames[indices[i]], importances[indices[i]])

    xtickLabels = [fieldNames[i] for i in indices]
    pylab.figure()
    pylab.title('Feature Importances From A Random Forest with %s trees' % numTrees)
    pylab.bar(xrange(numFeatures), importances[indices], color='r', yerr=std[indices], align='center')
    pylab.xticks(xrange(numFeatures), xtickLabels)
    pylab.xlim([-1, numFeatures])
    pylab.show()
Example #6
def main():

    # Define the known data points or "training" data
    explanatory_fields = "d100 dd0 dd5 fday ffp gsdd5 gsp map mat_tenths mmax_tenths mmindd0 mmin_tenths mtcm_tenths mtwm_tenths sday".split()
    explanatory_rasters = [os.path.join(TRAINING_DIR, "current_" + r + ".img") for r in explanatory_fields]
    response_shapes = os.path.join(TRAINING_DIR, "DF.shp")

    # Load the training rasters using the sampled subset
    try:
        cached = json.load(open("_cached_training.json"))
        train_xs = np.array(cached['train_xs'])
        train_y = np.array(cached['train_y'])
    except IOError:
        train_xs, train_y = load_training_vector(response_shapes, 
            explanatory_rasters, response_field='GRIDCODE')
        cache = {'train_xs': train_xs.tolist(), 'train_y': train_y.tolist()}
        with open("_cached_training.json", 'w') as fh:
            fh.write(json.dumps(cache))

    print(train_xs.shape, train_y.shape)

    # Train the classifier
    clf = ExtraTreesClassifier(n_estimators=120, n_jobs=3)
    clf.fit(train_xs, train_y)
    print(clf)

    evaluate_clf(clf, train_xs, train_y, feature_names=explanatory_fields)
def ET_classif(features_df=None, labels_df=None):
    '''Scoring function to be used in SelectKBest feature selection class 
        object.
        
    This scoring function assigns variable importances to the features
        passed in to it using the ExtraTreesClassifier. It then returns
        the features as two identical arrays mimicking the scores and 
        p-values arrays required by SelectKBest to pick the top K 
        features.
        
    Args:
        features_df: Pandas dataframe of features to be used to predict 
            using the ExtraTreesClassifier.
        labels_df: Pandas dataframe of the labels being predicted.
    Returns:
        Two identical arrays containing the feature importance scores
            returned for each feature by the ExtraTreesClassifier.
    '''
    reducer = ExtraTreesClassifier(n_estimators=500, bootstrap=False,
                                   oob_score=False, max_features=.10,
                                   min_samples_split=10, min_samples_leaf=2,
                                   criterion='gini', random_state=42)

    reducer.fit(features_df, labels_df)
    return reducer.feature_importances_, reducer.feature_importances_
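A short sketch of how a scorer like ET_classif plugs into SelectKBest, as the docstring describes; the toy dataframe and column names below are illustrative assumptions, not part of the original snippet.

# Hypothetical usage sketch: let SelectKBest rank features via ET_classif.
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest

X_arr, y_arr = make_classification(n_samples=500, n_features=20, random_state=0)
features_df = pd.DataFrame(X_arr, columns=['f%d' % i for i in range(20)])
labels_df = pd.Series(y_arr)

selector = SelectKBest(score_func=ET_classif, k=5)
X_top = selector.fit_transform(features_df, labels_df)
print(features_df.columns[selector.get_support()])  # names of the 5 kept features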
Example #8
def train_tree():
	word_vector_hash = knn.word_vectors(training, vector_length, False)

	sku_vectors, class_labels, _, sku_hash = knn.data(adapt1, vector_length, 'all', word_vector_hash)
	xtrees = ExtraTreesClassifier(n_estimators=1, max_depth=None, min_samples_split=1, random_state=0)
	model2 = xtrees.fit(sku_vectors, class_labels)

	sku_vectors, class_labels, _, sku_hash = knn.data(adapt2, vector_length, 'all', word_vector_hash)
	xtrees = ExtraTreesClassifier(n_estimators=1, max_depth=None, min_samples_split=1, random_state=0)
	model3 = xtrees.fit(sku_vectors, class_labels)

	sku_vectors, class_labels, _, sku_hash = knn.data(adapt3, vector_length, 'all', word_vector_hash)
	xtrees = ExtraTreesClassifier(n_estimators=1, max_depth=None, min_samples_split=1, random_state=0)
	model4 = xtrees.fit(sku_vectors, class_labels)

	# Non-adaptive data
	sku_vectors, class_labels, _, sku_hash = knn.data(training, vector_length, False, word_vector_hash)
	model2 = ConfidenceDecorator(model2, sku_vectors, class_labels)
	model3 = ConfidenceDecorator(model3, sku_vectors, class_labels)
	model4 = ConfidenceDecorator(model4, sku_vectors, class_labels)

	xtrees = ExtraTreesClassifier(n_estimators=1, max_depth=None, min_samples_split=1, random_state=0)
	model1 = xtrees.fit(sku_vectors, class_labels)
	model1 = ConfidenceDecorator(model1, sku_vectors, class_labels)

	forest = RandomForestClassifier(n_estimators=3, max_depth=None, min_samples_split=1, random_state=0)
	model5 = forest.fit(sku_vectors, class_labels)
	model5 = ConfidenceDecorator(model5, sku_vectors, class_labels)

	#neigh = neighbors.KNeighborsClassifier(n_neighbors=10, warn_on_equidistant=False, weights="distance")
	#model6 = neigh.fit(sku_vectors, class_labels)
	#model6 = ConfidenceDecorator(model6, sku_vectors, class_labels)

	models = [model1, model2, model3, model4, model5]# model6]
	return models, word_vector_hash
Example #9
def feature_important(filename):
    from sklearn.datasets import make_classification
    from sklearn.ensemble import ExtraTreesClassifier

    content = read_csv(filename)
    X = [c.decisions for c in content]
    y = [c.objective for c in content]

    # Build a forest and compute the feature importances
    forest = ExtraTreesClassifier(n_estimators=250,
                                  random_state=0)

    forest.fit(X, y)
    importances = forest.feature_importances_
    std = np.std([tree.feature_importances_ for tree in forest.estimators_],
                 axis=0)
    indices = np.argsort(importances)[::-1]

    # Print the feature ranking
    print("Feature ranking:")
    #
    for f in range(len(X[0])):
        print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

    # Plot the feature importances of the forest
    plt.figure()
    plt.title("Feature importances")
    plt.bar(range(len(X[0])), importances[indices],
           color="r", yerr=std[indices], align="center")
    plt.xticks(range(len(X[0])), indices)
    plt.xlim([-1, len(X[0])])
    plt.show()
def train_random_forest(X_train,y_train,**kwargs):
    from sklearn.ensemble import ExtraTreesClassifier

    n_estimators = kwargs.pop('n_estimators',300)
    max_features = kwargs.pop('max_features','auto')
    n_jobs       = kwargs.pop('n_jobs',-1)
    verbose      = kwargs.pop('verbose',0)
    tuned_params = kwargs.pop('tuned_params',None)

    # initialize baseline classifier
    clf = ExtraTreesClassifier(n_estimators=n_estimators,random_state=42,
                               n_jobs=n_jobs,verbose=verbose,criterion='gini',
                               max_features=max_features,oob_score=True,
                               bootstrap=True)
    
    if tuned_params is not None: # optimize if desired
        from sklearn.grid_search import GridSearchCV
        cv = GridSearchCV(clf,tuned_params,cv=5,scoring='roc_auc',
                          n_jobs=n_jobs,verbose=verbose,refit=True)
        cv.fit(X_train, y_train)
        clf = cv.best_estimator_
    else: # otherwise train with the specified parameters (no tuning)
        clf.fit(X_train,y_train)

    return clf
Example #11
    def tree_based_feature_selection(self, x: np.ndarray, y: np.ndarray) -> np.ndarray:
        n = len(self.features)
        forest = ExtraTreesClassifier(n_estimators=250, random_state=0)
        forest.fit(x, y)
        importances = forest.feature_importances_
        print(importances)
        std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)
        indices = np.argsort(importances)[::-1]
        print("Feature ranking:")

        for f in range(n):
            print("%d. feature %d: %s (%f)" % (f + 1, indices[f], self.features[indices[f]],importances[indices[f]]))

        # Plot the feature importances of the forest
        # plt.figure()
        # plt.title("Feature importances")
        # plt.bar(range(n), importances[indices],
        #         color="r", yerr=std[indices], align="center")
        # plt.xticks(range(n), indices)
        # plt.xlim([-1, n])
        # plt.show()
        n = 12
        print(indices[0:n+1])
        print(self.features[indices[0:n+1]])
        new_x = x[:, indices[0:n+1]]
        return new_x
def top_importances(features_df=None, labels_df=None, top_N=10):
    ''' Finds the top N importances using the ExtraTreesClassifier.
        
    Finds the top N importances of a dataframe of features and a dataframe
        of labels using the ExtraTreesClassifier.
    
    Args:
        features_df: Pandas dataframe of features used to predict.
        labels_df: Pandas dataframe of labels to be predicted.
        top_N: integer value of the top N most important features to return.
    Returns:
        Pandas dataframe containing the top N importances and their 
        importance scores.
    
    '''
    reducer = ExtraTreesClassifier(n_estimators=2000, bootstrap=False,
                                   oob_score=False, max_features=.10,
                                   min_samples_split=10, min_samples_leaf=2,
                                   criterion='gini')

    reducer.fit(features_df, labels_df)
    scores = pd.DataFrame(reducer.feature_importances_,
                          index=features_df.columns)
    scores.columns = ['Importances']
    scores = scores.sort(['Importances'], ascending=False)
    return scores[0:top_N]
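A brief usage sketch for top_importances; note that the body calls DataFrame.sort, so this assumes the older pandas these snippets were written against (current pandas would need sort_values). The toy data is illustrative.

# Hypothetical usage sketch: rank the features of a toy dataframe.
import pandas as pd
from sklearn.datasets import make_classification

X_arr, y_arr = make_classification(n_samples=300, n_features=15, random_state=1)
features_df = pd.DataFrame(X_arr, columns=['feat_%d' % i for i in range(15)])
labels_df = pd.Series(y_arr)

print(top_importances(features_df, labels_df, top_N=5))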
Example #13
def plotImportance(X,y):
	forest = ExtraTreesClassifier(n_estimators=250,
	                              random_state=0)

	forest.fit(X, y)
	importances = forest.feature_importances_
	std = np.std([tree.feature_importances_ for tree in forest.estimators_],
	             axis=0)
	indices = np.argsort(importances)[::-1]
	n=X.shape[1]

	#Print the feature ranking
	#print("Feature ranking:")

	#for f in range(n):
	#    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

	# Plot the feature importances of the forest
	plt.figure(figsize=(20,15))
	plt.title("Feature importances")
	plt.bar(range(n), importances[indices],
	       color="r", yerr=std[indices], align="center")
	plt.xticks(range(n), X.columns[indices],rotation=90)
	plt.xlim([-1, n])
	plt.savefig('featuresel.pdf')
class FeaturesSelectionRandomForests(object):
    
    
    def __init__(self, n_estimators = 100, feature_importance_th = 0.005):
        
        self.n_estimators = n_estimators
        self.feature_importance_th = feature_importance_th
        
            
    def fit(self, X, y, n_estimators = None, feature_importance_th = None):
        
        if n_estimators is not None:
            assert isinstance(n_estimators,(int,long,float))
            self.n_estimators = n_estimators
        if feature_importance_th is not None:
            assert isinstance(feature_importance_th,(int,long,float))
            self.feature_importance_th = feature_importance_th
        
        #filter features by forest model
        self.trees = ExtraTreesClassifier(n_estimators=100, compute_importances=True)
        self.trees.fit(X, y)
        self.features_mask = np.where(self.trees.feature_importances_ > 0.005)[0]

    
    def plot_features_importance(self):
        
        pd.DataFrame(self.trees.feature_importances_).plot(kind='bar')
        plt.show()
        
    
    def transform(self, X):

        assert hasattr(self,"features_mask")

        return X[:, self.features_mask]
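A usage sketch for the class above; it assumes the older scikit-learn these snippets target, since the compute_importances keyword was removed from later releases. The synthetic data is illustrative.

# Hypothetical usage sketch: fit the selector, then keep only the important columns.
from sklearn.datasets import make_classification

X_arr, y_arr = make_classification(n_samples=400, n_features=30, random_state=0)
fs = FeaturesSelectionRandomForests(n_estimators=100, feature_importance_th=0.005)
fs.fit(X_arr, y_arr)
X_reduced = fs.transform(X_arr)
print("reduced from %d to %d features" % (X_arr.shape[1], X_reduced.shape[1]))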
def get_important_features(Xtrain, Ytrain, n=250, threshold=0.01, verbose=False):
    """ Use entirety of provided X, Y to train random forest

    Arguments
    Xtrain -- Training data
    Ytrain -- Training labels

    Optional Arguments
    n -- number of ensemble members
    threshold -- threshold of importance above which a feature is relevant
    verbose -- if true, prints results of ranking

    Returns
    ranking -- a ranked list of [feature index, importance] pairs for the relevant features
    """
    # Train and fit tree classifier ensemble
    classifier = ExtraTreesClassifier(n_estimators=n, random_state=0)
    classifier.fit(Xtrain, Ytrain)

    # Compute important features
    importances = classifier.feature_importances_
    std = np.std([tree.feature_importances_ for tree in classifier.estimators_], axis=0)
    indices = np.argsort(importances)[::-1]

    ranking = [[indices[f], importances[indices[f]]] for f in range(Xtrain.shape[1])]
    ranking = filter(lambda r: r[1] >= threshold, ranking)

    if verbose:
        for r in range(len(ranking)):
            print str(r+1) + ". ", ranking[r][0], ranking[r][1]

    return ranking
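A small driver for get_important_features, assuming Python 2 semantics as in the snippet (print statement, list-returning filter) and that its own imports (numpy as np, ExtraTreesClassifier) are in scope; the synthetic data is illustrative.

# Hypothetical usage sketch: rank features of synthetic data and count the relevant ones.
from sklearn.datasets import make_classification

Xtrain, Ytrain = make_classification(n_samples=500, n_features=25,
                                     n_informative=5, random_state=0)
ranking = get_important_features(Xtrain, Ytrain, n=100, threshold=0.01, verbose=False)
print("%d features passed the importance threshold" % len(ranking))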
Example #16
def select_with_forest(X, y, n_trees=10, treshold=0.01):
    from sklearn.preprocessing import LabelEncoder
    from sklearn.ensemble import ExtraTreesClassifier
    import pandas as pd
    import numpy as np
    # encode labels (str -> int):
    le = LabelEncoder()
    X = X.copy()
    for col in X.columns:
        le.fit(X[col].unique())
        X[col] = le.transform(X[col])
    # train the classifier:
    forest = ExtraTreesClassifier(criterion="entropy", n_estimators=n_trees)
    forest.fit(X, y)
    print('number of selected features: ', np.sum(forest.feature_importances_ >= treshold))
    # select important features:
    importances = pd.DataFrame()
    importances['predictor name'] = X.columns.tolist()
    importances['importance'] = forest.feature_importances_
    importances = importances.sort_values(by='importance', ascending=False)
    #X2 = forest.transform(X, treshold)
    #labels2 = X.columns[list(forest.feature_importances_>=treshold)]
    #X2 = pd.DataFrame(X2)
    #X2.columns = labels2
    return importances #X2
Example #17
def kfold_cv(X_train, y_train,idx,k):

    kf = StratifiedKFold(y_train,n_folds=k)
    xx=[]
    count=0
    for train_index, test_index in kf:
        count+=1
        X_train_cv, X_test_cv = X_train[train_index,:],X_train[test_index,:]
        gc.collect()
        y_train_cv, y_test_cv = y_train[train_index],y_train[test_index]
        y_pred=np.zeros(X_test_cv.shape[0])
        m=0
         
        for j in range(m):
            clf=xgb_classifier(eta=0.1,min_child_weight=20,col=0.5,subsample=0.7,depth=5,num_round=200,seed=j*77,gamma=0.1)
            y_pred+=clf.train_predict(X_train_cv,(y_train_cv),X_test_cv,y_test=(y_test_cv))
        #y_pred/=m;
        clf=ExtraTreesClassifier(n_estimators=700,max_features= 50,criterion= 'entropy',min_samples_split= 3,
                            max_depth= 60, min_samples_leaf= 4,verbose=1,n_jobs=-1)
        #clf=RandomForestClassifier(n_jobs=-1,n_estimators=100,max_depth=100)
        clf.fit(X_train_cv,(y_train_cv))
        y_pred=clf.predict_proba(X_test_cv).T[1]
        print y_pred.shape
        xx.append(llfun(y_test_cv,(y_pred)))
        ypred=y_pred
        yreal=y_test_cv
        idx=idx[test_index]
        print xx[-1]#,y_pred.shape
        break

    print xx,'average:',np.mean(xx),'std',np.std(xx)
    return ypred,yreal,idx#np.mean(xx)
def get_most_important_features(train):
  train = train.drop('ID', 1)
  train_y = train['TARGET']
  train_X = train.drop('TARGET', 1)

  random_forest = RandomForestClassifier(n_estimators=100)
  random_forest.fit(train_X, train_y)

  feater_importance = pd.Series(random_forest.feature_importances_, index=train_X.columns)
  feater_importance.sort_values(inplace=True)
  feater_importance.tail(20).plot(kind='barh', figsize=(15  ,7), title='Feature importance by random forest')

  # plt.savefig("feature_importance.png")

  grad_boosting = GradientBoostingClassifier()
  grad_boosting.fit(train_X, train_y)

  feater_importance = pd.Series(grad_boosting.feature_importances_, index=train_X.columns)
  feater_importance.sort_values(inplace=True)
  feater_importance.tail(20).plot(kind='barh', figsize=(10,7), title='Feature importance by gradient boosting')

  # plt.savefig("feature_importance2.png")

  extra_trees = ExtraTreesClassifier()
  extra_trees.fit(train_X, train_y)

  feater_importance = pd.Series(extra_trees.feature_importances_, index=train_X.columns)
  feater_importance.sort_values(inplace=True)
  feater_importance.tail(20).plot(kind='barh', figsize=(20,7), title='Feature importance by extra trees classifier')
def train_UsingExtraTreesClassifier(df,header,x_train, y_train,x_test,y_test) :

    # training
    clf = ExtraTreesClassifier(n_estimators=200,random_state=0,criterion='gini',bootstrap=True,oob_score=1,compute_importances=True)
    # Also tried entropy for the information gain, but 'gini' seemed to give a marginally better fit, both in-sample and out-of-sample
    clf.fit(x_train, y_train)
    #estimation of goodness of fit
    print "Estimation of goodness of fit using the ExtraTreesClassifier is : %f  \n" % clf.score(x_test,y_test)
    print "Estimation of out of bag score  using the ExtraTreesClassifier is : %f \n \n  " % clf.oob_score_
    # getting parameters back, if needed
    clf.get_params()
    # get the vector of predicted labels back
    y_test_predicted= clf.predict(x_test)
    X = df[df.columns - [header[-1]]]

    feature_importance = clf.feature_importances_
    # On a scale of 10 - make importances relative to max importance and plot them
    feature_importance = 10.0 * (feature_importance / feature_importance.max())
    sorted_idx = np.argsort(feature_importance) #Returns the indices that would sort an array.
    pos = np.arange(sorted_idx.shape[0]) + .5
    plt.figure(figsize=(12, 6))
    plt.subplot(1, 1, 1)
    plt.barh(pos, feature_importance[sorted_idx], align='center')
    plt.yticks(pos, X.columns[sorted_idx])
    plt.xlabel('Relative Importance')
    plt.title('Variable Importance')
    plt.show()
    return y_test_predicted
def FeaturesImportance(trainData, trainLabels):
    forest = ExtraTreesClassifier(n_estimators=250, random_state=0)
    forest.fit(trainData, trainLabels)
    importances = forest.feature_importances_

    indices = np.argsort(importances)[::-1]

    # Print the feature ranking
    print("Feature ranking:")

    for f in range(16):
        print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

    # Plot the feature importances of the forest
    plt.figure()
    plt.title("Feature importances")
    plt.bar(range(16), importances[range(16)], color="r", align="center")
    plt.xticks(range(16), [r'$x_1$', r'$x_2$', r'$x_3$', r'$x_4$', r'$x_5$',
                          r'$x_6$', r'$x_7$', r'$x_8$', r'$x_9$', r'$x_{10}$', 
                          r'$x_{11}$', r'$x_{12}$', r'$x_{13}$', r'$x_{14}$', r'$x_{15}$', 
                          r'$x_{16}$'])
    plt.yticks([0.0, 0.05, 0.10, 0.15, 0.20, 0.25], [r'$0.00$', r'$0.05$', r'$0.10$', r'$0.15$', r'$0.20$', r'$0.25$'])  
    plt.xlabel('Features')
    plt.ylabel('Importance')
    plt.xlim([-1, 16])
    plt.show()
    
    return importances
def remove_feature_tree_based(train_X,train_Y):
    '''
    Removes features based on trees - see sklearn:
    http://scikit-learn.org/dev/auto_examples/ensemble/plot_forest_importances.html#example-ensemble-plot-forest-importances-py

    Actually removes based on "importance"
    '''
    forest = ExtraTreesClassifier(n_estimators=1000,
                                  compute_importances = True,
                                  random_state = 0)

    forest.fit(train_X, train_Y)
    importances = forest.feature_importances_
    std = np.std([tree.feature_importances_ for tree in forest.estimators_],
                  axis=0)
    indices = np.argsort(importances)[::-1]

    x_labels = ['rc1', 'rc2', 'dca1', 'dca2','dcm1', 'dcm2','ace1','ace2','acsc1', 'acsc2', 'acsv1', 'acsv2', 'acss1','acss2', 'acsk1', 'acsk2', 'taca1', 'taca2', 'tdc1', 'tdc2', 'gmin', 'gmean', 'trd','ep111','ep112','ep211', 'ep212', 'ep311','ep312', 'ep411','ep412','ep511','ep512','ep611','ep612','ep121','ep122','ep221', 'ep222', 'ep321','ep322', 'ep421','ep422','ep521','ep522','ep621','ep622']

    # Print the feature ranking
    print "Feature ranking:"

    for f in xrange(46):
        print "%d. feature %s (%f)" % (f + 1, x_labels[indices[f]], importances[indices[f]])

    # Transform the data to have only the features that are important
    x_new = forest.transform(train_X)

    return (forest, x_new)
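The compute_importances keyword and forest.transform used above were both removed in later scikit-learn releases; a rough modern equivalent of the reduction step, sketched here with SelectFromModel and illustrative synthetic data:

# Sketch of the same importance-based reduction on current scikit-learn,
# using SelectFromModel in place of compute_importances / forest.transform.
from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

train_X, train_Y = make_classification(n_samples=500, n_features=46, random_state=0)
forest = ExtraTreesClassifier(n_estimators=1000, random_state=0)
forest.fit(train_X, train_Y)
selector = SelectFromModel(forest, prefit=True, threshold='mean')
x_new = selector.transform(train_X)
print("kept %d of %d features" % (x_new.shape[1], train_X.shape[1]))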
Example #22
def algo_fit_cross_validated(training_matrix, target):
    # Build a forest and compute the feature importances
    forest = ExtraTreesClassifier(n_estimators=250,
                                  random_state=0)

    forest.fit(training_matrix, target)
    importances = forest.feature_importances_
    std = np.std([tree.feature_importances_ for tree in forest.estimators_],
                 axis=0)
    indices = np.argsort(importances)[::-1]

    l = list(training_matrix.columns.values)
    for f in range(training_matrix.shape[1]):
        print("%d. feature %d(%s) (%f)" % (f + 1, indices[f], l[indices[f]], importances[indices[f]]))

    ##### Works well ######
    # SVM
    # svm = SVC(kernel="linear", C=0.06)
    # svm.fit(training_matrix, target)
    #
    # scores_svm = cross_validation.cross_val_score(svm, training_matrix, target, cv=5)
    # print("(svm) Accuracy: %0.5f (+/- %0.2f)" % (scores_svm.mean(), scores_svm.std() * 2))
    #
    # return svm
    ##### Works well ######

    # Random Forest
    rf = RandomForestClassifier(n_estimators=1500, max_depth=2, max_features=4)
    scores_rf = cross_validation.cross_val_score(rf, training_matrix, target, cv=5)
    print("(Random Forest) Accuracy: %0.5f (+/- %0.2f)" % (scores_rf.mean(), scores_rf.std() * 2))
    rf.fit(training_matrix, target)
    return rf
def extratreeclassifier(input_file,Output,test_size):
    lvltrace.lvltrace("LVLEntree dans extratreeclassifier split_test")
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    n_samples, n_features = X.shape
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    print X_train.shape, X_test.shape
    clf = ExtraTreesClassifier(n_estimators=10)
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    print "Extremely Randomized Trees"
    print "classification accuracy:", metrics.accuracy_score(y_test, y_pred)
    print "precision:", metrics.precision_score(y_test, y_pred)
    print "recall:", metrics.recall_score(y_test, y_pred)
    print "f1 score:", metrics.f1_score(y_test, y_pred)
    print "\n"
    results = Output+"_Extremely_Random_Forest_metrics_test.txt"
    file = open(results, "w")
    file.write("Extremely Random Forest Classifier estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y_test, y_pred))
    file.write("Precision Score: %f\n"%metrics.precision_score(y_test, y_pred))
    file.write("Recall Score: %f\n"%metrics.recall_score(y_test, y_pred))
    file.write("F1 Score: %f\n"%metrics.f1_score(y_test, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y_test)):
        file.write("%f,%f,%i\n"%(y_test[n],y_pred[n],(n+1)))
    file.close()
    title = "Extremely Randomized Trees %f"%test_size
    save = Output + "Extremely_Randomized_Trees_confusion_matrix"+"_%s.png"%test_size
    plot_confusion_matrix(y_test, y_pred,title,save)
    lvltrace.lvltrace("LVLSortie dans extratreeclassifier split_test")
    def fit(self, X, Y, sample_weight=None):
        from sklearn.ensemble import ExtraTreesClassifier
        from sklearn.feature_selection import SelectFromModel

        num_features = X.shape[1]
        max_features = int(float(self.max_features) * (np.log(num_features) + 1))
        # Use at most half of the features
        max_features = max(1, min(int(X.shape[1] / 2), max_features))
        estimator = ExtraTreesClassifier(
            n_estimators=self.n_estimators,
            criterion=self.criterion,
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            bootstrap=self.bootstrap,
            max_features=max_features,
            max_leaf_nodes=self.max_leaf_nodes,
            oob_score=self.oob_score,
            n_jobs=self.n_jobs,
            verbose=self.verbose,
            random_state=self.random_state,
            class_weight=self.class_weight,
        )
        estimator.fit(X, Y, sample_weight=sample_weight)
        self.preprocessor = SelectFromModel(estimator=estimator, threshold="mean", prefit=True)
        return self
def train_classifiers(X_data, y_data):
    ############ Linear SVM: 0.908 #############
    clf_LSVM = svm.SVC(kernel = 'linear')
    clf_LSVM.fit(X_data, y_data)
    
    ############ MultinomialNB: 0.875 #############
    clf_MNB = MultinomialNB()
    clf_MNB.fit(X_data, y_data)
    
    ############ Random Forest: 0.910 #############
    clf_RF = RandomForestClassifier(n_estimators=200, criterion='entropy')
    clf_RF.fit(X_data, y_data)
    
    ############ Extra Tree: 0.915 ##################
    clf_ETC = ExtraTreesClassifier(n_estimators=500, max_depth=None, min_samples_split=1, random_state=0)
    clf_ETC.fit(X_data, y_data)
    
    ############ AdaBoost: 0.88 ##################
    clf_Ada = AdaBoostClassifier()
    clf_Ada.fit(X_data, y_data)
    
    ############ rbf SVM: 0.895 #############
    clf_rbf = svm.SVC(C=200, gamma=0.06, kernel='rbf')
    clf_rbf.fit(X_data, y_data)
    
    ############ GradientBoosting: 0.88 #############
    clf_GBC = GradientBoostingClassifier()
    clf_GBC.fit(X_data, y_data)
    
    return clf_LSVM, clf_MNB, clf_RF, clf_ETC, clf_Ada, clf_rbf, clf_GBC    
Example #26
    def _cascade_layer(self, X, y=None, layer=0):
        n_tree = getattr(self, 'n_cascadeRFtree')
        n_cascadeRF = getattr(self, 'n_cascadeRF')
        min_samples = getattr(self, 'min_samples_cascade')

        prf = RandomForestClassifier(
            n_estimators=100, max_features=8,
            bootstrap=True, criterion="entropy", min_samples_split=20,
            max_depth=None, class_weight='balanced', oob_score=True)
        crf = ExtraTreesClassifier(
            n_estimators=100, max_depth=None,
            bootstrap=True, oob_score=True)

        prf_pred = []
        if y is not None:
            # print('Adding/Training Layer, n_layer={}'.format(self.n_layer))
            for irf in range(n_cascadeRF):
                prf.fit(X, y)
                crf.fit(X, y)
                setattr(self, '_casprf{}_{}'.format(self.n_layer, irf), prf)
                setattr(self, '_cascrf{}_{}'.format(self.n_layer, irf), crf)
                probas = prf.oob_decision_function_
                probas += crf.oob_decision_function_
                prf_pred.append(probas)
        elif y is None:
            for irf in range(n_cascadeRF):
                prf = getattr(self, '_casprf{}_{}'.format(layer, irf))
                crf = getattr(self, '_cascrf{}_{}'.format(layer, irf))
                probas = prf.predict_proba(X)
                probas += crf.predict_proba(X)
                prf_pred.append(probas)

        return prf_pred
Example #27
class MyExtraTree(MyClassifier):
    def __init__(self, params=dict()):
        self._params = params
        self._extree = ExtraTreesClassifier(**(self._params))

    def update_params(self, updates):
        self._params.update(updates)
        self._extree = ExtraTreesClassifier(**(self._params))

    def fit(self, Xtrain, ytrain):
        self._extree.fit(Xtrain, ytrain)

    # def predict(self, Xtest, option = None):
    #   return self._extree.predict(Xtest)

    def predict_proba(self, Xtest, option = None):
        return self._extree.predict_proba(Xtest)[:, 1]

    def predict_proba_multi(self, Xtest, option = None):
        return self._extree.predict_proba(Xtest)

    def plt_feature_importance(self, fname_list, f_range = list()):
        importances = self._extree.feature_importances_

        std = np.std([tree.feature_importances_ for tree in self._extree.estimators_], axis=0)
        indices = np.argsort(importances)[::-1]

        fname_array = np.array(fname_list)

        if not f_range:
            f_range = range(indices.shape[0])

        n_f = len(f_range)

        plt.figure()
        plt.title("Extra Tree Feature importances")
        plt.barh(range(n_f), importances[indices[f_range]],
               color="b", xerr=std[indices[f_range]], ecolor='k',align="center")
        plt.yticks(range(n_f), fname_array[indices[f_range]])
        plt.ylim([-1, n_f])
        plt.show()


    def list_feature_importance(self, fname_list, f_range = list(), return_list = False):
        importances = self._extree.feature_importances_
        indices = np.argsort(importances)[::-1]

        print 'Extra tree feature ranking:'

        if not f_range :
            f_range = range(indices.shape[0])

        n_f = len(f_range)

        for i in range(n_f):
            f = f_range[i]
            print '{0:d}. feature[{1:d}]  {2:s}  ({3:f})'.format(f + 1, indices[f], fname_list[indices[f]], importances[indices[f]])

        if return_list:
            return [indices[f_range[i]] for i in range(n_f)]
Example #28
def tree(train_data, train_labels, all_bigrams, task):
	forest = ExtraTreesClassifier(n_estimators=100, random_state=0)
	forest.fit(train_data, train_labels)
	importances = forest.feature_importances_
	indices = np.argsort(importances)[::-1]

	# Print the feature ranking
	print "-"*45
	print task

	for f in range(20):
	  print("%d. feature, name: %s, importance: %f" % (f + 1, all_bigrams[indices[f]], importances[indices[f]]))

	# Plot the feature importances of the forest
	pl.figure()
	n = train_data.shape[1]
	n = 2000
	pl.title("Sorted feature importance for %s" %(task))
	pl.bar(range(n), importances[indices][:n], color="black", align="center")
	pl.xlim([0, (n)])
	pl.xticks([num for num  in range(0, n+1, 250)])
	pl.savefig(task+'.pdf', bbox_inches='tight')
	print "plot saved"

	return indices
def calc_prob(df_features_driver, df_features_other):

    df_train = df_features_driver.append(df_features_other)
    df_train.reset_index(inplace = True)
    df_train.Driver = df_train.Driver.astype(int)

    # So far, the best result was achieved by using a RandomForestClassifier with Bagging
    # model = BaggingClassifier(base_estimator = ExtraTreesClassifier())
    # model = BaggingClassifier(base_estimator = svm.SVC(gamma=2, C=1))
    # model = BaggingClassifier(base_estimator = linear_model.LogisticRegression())
    # model = BaggingClassifier(base_estimator = linear_model.LogisticRegression())
    # model = BaggingClassifier(base_estimator = AdaBoostClassifier())
    #model = RandomForestClassifier(200)
    # model = BaggingClassifier(base_estimator = [RandomForestClassifier(), linear_model.LogisticRegression()])
    # model = EnsembleClassifier([BaggingClassifier(base_estimator = RandomForestClassifier()),
    #                             GradientBoostingClassifier])
    #model = GradientBoostingClassifier(n_estimators = 10000)
    model = ExtraTreesClassifier(n_estimators=100,max_features='auto',random_state=0, n_jobs=2, criterion='entropy', bootstrap=True)
    # model = ExtraTreesClassifier(500, criterion='entropy')

    feature_columns = df_train.iloc[:, 4:]

    # Train the classifier
    model.fit(feature_columns, df_train.Driver)
    df_submission = pd.DataFrame()

    df_submission['driver_trip'] = create_first_column(df_features_driver)

    probs_array = model.predict_proba(feature_columns[:200]) # Return array with the probability for every driver
    probs_df = pd.DataFrame(probs_array)

    df_submission['prob'] = np.array(probs_df.iloc[:, 1])

    return df_submission
Example #30
def reduceRF(label):
  global x_data_rf_reduced, importantFeatureLocs
  model = ExtraTreesClassifier()
  model.fit(x_data, y_data[:, label])

  # the relative importance of each attribute
  importance = model.feature_importances_
  weight = float(0)
  del importantFeatureLocs[:] # reset
  #print(importance)  

  for ele in np.sort(importance)[::-1]:
    weight += float(ele)
    featureIndex = np.where(importance==ele)
    for loc in featureIndex[0]:
      importantFeatureLocs.append(loc)
  
    if weight > RFThreshold :
      break
  
  # remove duplications
  importantFeatureLocs = list(set(importantFeatureLocs))

  # extracting relevant columns from input data. Note that importantFeatureLocs
  # may be unsorted (since python 'set' is unsorted), so features are extracted
  # in unordered fashion. This info is stored in the softmax model class
  x_data_rf_reduced = x_data[:, importantFeatureLocs]
Example #31
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier

# Build a classification task using 3 informative features
x, y = make_classification(n_samples=1000,
                           n_features=5,
                           n_informative=3,
                           n_redundant=0,
                           n_repeated=0,
                           n_classes=2,
                           random_state=0,
                           shuffle=False)
forest = ExtraTreesClassifier(n_estimators=2000, random_state=0)

forest.fit(x, y)
importances = forest.feature_importances_
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for f in range(x.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

# Plot the feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(x.shape[1]), importances[indices],
        color="r", yerr=std[indices], align="center")
plt.xticks(range(x.shape[1]), indices)
plt.xlim([-1, x.shape[1]])
plt.show()
Example #32
              a_train = X.values[train_index]
              a_test = X.values[test_index]
              b_train = y.values[train_index]
              b_test = y.values[test_index]

              clf = ExtraTreesClassifier(n_estimators=n_estimators,
                        min_samples_split=min_samples_split,
                        max_features=max_features,
                        max_depth=max_depth,
                        min_samples_leaf=min_samples_leaf,
                        n_jobs=2,
                        random_state=random_state,
                        criterion='entropy')

              clf.fit(a_train, b_train)

              preds = clf.predict_proba(a_test)[:, 1]

              # print clf.predict( xgb.DMatrix(check_agreement[features].values) )[:10]
              agreement_probs = clf.predict_proba(check_agreement[features])[:, 1]

              ks = compute_ks(
                      agreement_probs[check_agreement['signal'].values == 0],
                      agreement_probs[check_agreement['signal'].values == 1],
                      check_agreement[check_agreement['signal'] == 0]['weight'].values,
                      check_agreement[check_agreement['signal'] == 1]['weight'].values)
              print ('KS metric', ks, ks < 0.09)
              if ks >= 0.09:
                sys.exit()
Example #33
#
## Get an array of the features ranked
#rank = fit.ranking_
#
## Create a dataframe of the column names by ranking.
#col_names = list(df.columns.values)
#col_names.pop()
#cols_ranked = pd.DataFrame({'features': col_names, 'rank': list(rank)})

# ------------------------------------------------------
# Extremely Randomized Trees
# ------------------------------------------------------
from sklearn.ensemble import ExtraTreesClassifier

model = ExtraTreesClassifier()
model.fit(X_samp, y_samp)

# Get an array of the features ranked
rank = model.feature_importances_

# Create a dataframe of the column names by ranking.
col_names = list(df_samp.columns.values)
col_names.pop()
cols_ranked = pd.DataFrame({'features': col_names, 'rank': list(rank)})
cols_ranked['rank'] -= cols_ranked['rank'].min()
cols_ranked['rank'] /= cols_ranked['rank'].max()
important_cols = cols_ranked.loc[
    cols_ranked['rank'] >= extra_trees_keep_thresh]

important_cols = list(important_cols['features'])
important_cols.append(dep_var)
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: 0.8164430115022656
exported_pipeline = ExtraTreesClassifier(bootstrap=False,
                                         criterion="entropy",
                                         max_features=0.7500000000000001,
                                         min_samples_leaf=7,
                                         min_samples_split=20,
                                         n_estimators=100)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #35
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.cross_validation import cross_val_score
import pandas as pd
import numpy as np

df = pd.read_csv('/dataset/lab/data.csv', sep=' ', header=None)
X = df[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]].as_matrix()
y = df[[0]][0].tolist()

# Build a forest and compute the feature importances
forest = ExtraTreesClassifier(n_estimators=1000, n_jobs=2, random_state=0)

forest.fit(X, y)

importances = forest.feature_importances_
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")
for f in range(10):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

print cross_val_score(forest, X, y, cv=4, n_jobs=4)
Example #36
def ModelAnalyzer(X, y, regressor=True):
    # INPUTS:
    #   - X: (DataFrame) Explanatory variables to be used as features for ML models
    #   - y: (Vector) Response variables to be used as target for ML models
    #   - regressor: (bool) Determines whether a regressor or classifier will be used
    # OUTPUTS:
    #   - out: (str) Multiline report of the accuracy and fit time of each model

    import time
    from sklearn.metrics import mean_absolute_error, accuracy_score
    from sklearn.model_selection import train_test_split
    import warnings
    warnings.filterwarnings('ignore')

    # Split dataset into train and test dataset (train_size is the proportion of train to test lengths)
    train_X, test_X, train_Y, test_Y = train_test_split(X,
                                                        y,
                                                        train_size=0.5,
                                                        shuffle=False,
                                                        random_state=1)

    if regressor:
        # Run several regression models and evaluate prediction error with mean absolute error.

        # Model Selection
        # Decision Tree
        from sklearn.tree import DecisionTreeRegressor
        start_dt = time.time()
        dt = DecisionTreeRegressor(random_state=1)
        dt.fit(train_X, train_Y)
        dt_test_predictions = dt.predict(test_X)
        dt_mae = mean_absolute_error(dt_test_predictions, test_Y)
        finish_dt = str(round(time.time() - start_dt, 5))
        out_dt = "Decision Tree MAE: " + str(dt_mae) + ', Time: ' + str(
            finish_dt) + ' seconds.'

        # Random Forest
        from sklearn.ensemble import RandomForestRegressor
        start_rf = time.time()
        rf = RandomForestRegressor(random_state=1,
                                   max_features='auto',
                                   min_samples_split=2,
                                   min_samples_leaf=1,
                                   n_estimators=650)
        rf.fit(train_X, train_Y)
        rf_test_predictions = rf.predict(test_X)
        rf_mae = mean_absolute_error(rf_test_predictions, test_Y)
        finish_rf = str(round(time.time() - start_rf, 5))
        out_rf = "Random Forest MAE: " + str(rf_mae) + ', Time: ' + str(
            finish_rf) + ' seconds.'

        # Support Vector Regressor
        from sklearn.svm import SVR
        start_svr = time.time()
        svr = SVR(gamma='scale', C=1.0)
        svr.fit(train_X, train_Y)
        svr_test_predictions = svr.predict(test_X)
        svr_mae = mean_absolute_error(svr_test_predictions, test_Y)
        finish_svr = str(round(time.time() - start_svr, 5))
        out_svr = "Support Vector MAE: " + str(svr_mae) + ', Time: ' + str(
            finish_svr) + ' seconds.'

        # EXTRA TREES MODEL
        from sklearn.ensemble import ExtraTreesRegressor
        start_etr = time.time()
        etr = ExtraTreesRegressor(max_features='auto',
                                  n_estimators=125,
                                  min_samples_split=3,
                                  random_state=1)
        etr.fit(train_X, train_Y)
        etr_test_predictions = etr.predict(test_X)
        etr_mae = mean_absolute_error(etr_test_predictions, test_Y)
        finish_etr = str(round(time.time() - start_etr, 5))
        out_etr = "Extra Trees MAE: " + str(etr_mae) + ', Time: ' + str(
            finish_etr) + ' seconds.'

        from sklearn.linear_model import LassoCV
        start_lasso = time.time()
        lasso = LassoCV()
        lasso.fit(train_X, train_Y)
        lasso_test_predictions = lasso.predict(test_X)
        lasso_mae = mean_absolute_error(lasso_test_predictions, test_Y)
        finish_lasso = str(round(time.time() - start_lasso, 5))
        out_lasso = "Lasso MAE: " + str(lasso_mae) + ', Time: ' + str(
            finish_lasso) + ' seconds.'

        from sklearn.linear_model import RidgeCV
        start_ridge = time.time()
        ridge = RidgeCV()
        ridge.fit(train_X, train_Y)
        ridge_test_predictions = ridge.predict(test_X)
        ridge_mae = mean_absolute_error(ridge_test_predictions, test_Y)
        finish_ridge = str(round(time.time() - start_ridge, 5))
        out_ridge = "Ridge MAE: " + str(ridge_mae) + ', Time: ' + str(
            finish_ridge) + ' seconds.'

        from sklearn.linear_model import ElasticNetCV
        start_en = time.time()
        en = ElasticNetCV()
        en.fit(train_X, train_Y)
        en_test_predictions = en.predict(test_X)
        en_mae = mean_absolute_error(en_test_predictions, test_Y)
        finish_en = str(round(time.time() - start_en, 5))
        out_en = "Elastic Net MAE: " + str(en_mae) + ', Time: ' + str(
            finish_en) + ' seconds.'

        out = out_dt + '\n' + out_rf + '\n' + out_svr + '\n' + out_etr + '\n' + out_lasso + '\n' + out_ridge + '\n' + out_en

    else:
        # Run several models and determine prediction accuracy using accuracy score.

        # Logistic Regression
        from sklearn.linear_model import LogisticRegression
        start = time.time()
        lr = LogisticRegression(solver='lbfgs',
                                multi_class='auto',
                                max_iter=2000)
        lr.fit(train_X, train_Y)
        lr_predictions = lr.predict(test_X)
        finish_lr = str(round(time.time() - start, 5))
        lr_accuracy = accuracy_score(test_Y, lr_predictions)
        out_lr = "Logistic Regression Accuracy: " + str(
            lr_accuracy) + ', Time: ' + str(finish_lr) + ' seconds.'

        # Naïve Bayes
        from sklearn.naive_bayes import GaussianNB
        start = time.time()
        nb = GaussianNB()
        nb.fit(train_X, train_Y)
        nb_predictions = nb.predict(test_X)
        finish_nb = str(round(time.time() - start, 5))
        nb_accuracy = accuracy_score(test_Y, nb_predictions)
        out_nb = "Naive Bayes Accuracy: " + str(
            nb_accuracy) + ', Time: ' + str(finish_nb) + ' seconds.'

        # Stochastic Gradient Descent
        from sklearn.linear_model import SGDClassifier
        start = time.time()
        sgd = SGDClassifier(loss='modified_huber',
                            shuffle=True,
                            random_state=101,
                            tol=1e-3,
                            max_iter=1000)
        sgd.fit(train_X, train_Y)
        sgd_predictions = sgd.predict(test_X)
        finish_sgd = str(round(time.time() - start, 5))
        sgd_accuracy = accuracy_score(test_Y, sgd_predictions)
        out_sgd = "SGD Accuracy: " + str(sgd_accuracy) + ', Time: ' + str(
            finish_sgd) + ' seconds.'

        # K-Nearest Neighbors
        from sklearn.neighbors import KNeighborsClassifier
        start = time.time()
        knn = KNeighborsClassifier(n_neighbors=10)
        knn.fit(train_X, train_Y)
        knn_predictions = knn.predict(test_X)
        finish_knn = str(round(time.time() - start, 5))
        knn_accuracy = accuracy_score(test_Y, knn_predictions)
        out_knn = "KNN Accuracy: " + str(knn_accuracy) + ', Time: ' + str(
            finish_knn) + ' seconds.'

        # Decision Tree
        from sklearn.tree import DecisionTreeClassifier
        start = time.time()
        dt = DecisionTreeClassifier(max_depth=10,
                                    random_state=101,
                                    max_features=None,
                                    min_samples_leaf=5)
        dt.fit(train_X, train_Y)
        dt_predictions = dt.predict(test_X)
        finish_dt = str(round(time.time() - start, 5))
        dt_accuracy = accuracy_score(test_Y, dt_predictions)
        out_dt = "Decision Tree Accuracy: " + str(
            dt_accuracy) + ', Time: ' + str(finish_dt) + ' seconds.'

        # Random Forest
        from sklearn.ensemble import RandomForestClassifier
        start = time.time()
        rfm = RandomForestClassifier(n_estimators=125,
                                     oob_score=True,
                                     n_jobs=1,
                                     random_state=101,
                                     max_features=None,
                                     min_samples_leaf=3)
        rfm.fit(train_X, train_Y)
        rfm_predictions = rfm.predict(test_X)
        finish_rfm = str(round(time.time() - start, 5))
        rfm_accuracy = accuracy_score(test_Y, rfm_predictions)
        out_rfm = "Random Forest Accuracy: " + str(
            rfm_accuracy) + ', Time: ' + str(finish_rfm) + ' seconds.'

        # Support Vector Classifier
        from sklearn.svm import SVC
        start = time.time()
        svm = SVC(gamma='scale', C=1.0, random_state=101)
        svm.fit(train_X, train_Y)
        svm_predictions = svm.predict(test_X)
        finish_svm = str(round(time.time() - start, 5))
        svm_accuracy = accuracy_score(test_Y, svm_predictions)
        out_svm = "SVC Accuracy: " + str(svm_accuracy) + ', Time: ' + str(
            finish_svm) + ' seconds.'

        # Extra Trees
        from sklearn.ensemble import ExtraTreesClassifier
        start = time.time()
        etc = ExtraTreesClassifier(n_estimators=125)
        etc.fit(train_X, train_Y)
        etc_predictions = etc.predict(test_X)
        finish_etc = str(round(time.time() - start, 5))
        etc_accuracy = accuracy_score(test_Y, etc_predictions)
        out_etc = "Extra Trees Accuracy: " + str(
            etc_accuracy) + ', Time: ' + str(finish_etc) + ' seconds.'

        out = out_lr + '\n' + out_nb + '\n' + out_sgd + '\n' + out_knn + '\n' + out_dt + '\n' + out_rfm + '\n' + out_svm + '\n' + out_etc

    return print(out)
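An illustrative call to ModelAnalyzer on a stock dataset; the iris data below simply stands in for whatever X and y the caller would supply.

# Hypothetical usage sketch: compare the classifier suite on the iris dataset.
from sklearn.datasets import load_iris

iris = load_iris()
ModelAnalyzer(iris.data, iris.target, regressor=False)  # prints the accuracy/time report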
Example #37
__author__ = 'shi'
# Feature Importance
import numpy as np
from sklearn import datasets
from sklearn import metrics
from sklearn.ensemble import ExtraTreesClassifier
data = np.loadtxt("output_res(1).txt")
#f1 = open("phenotype.txt")
#f1.readline()
result = np.loadtxt("phenotype.txt")
print data.shape
print result.shape
# fit an Extra Trees model to the data
from sklearn.cross_validation import train_test_split
x_train, x_test, y_train, y_test = train_test_split(data,
                                                    result,
                                                    test_size=0.3)
model = ExtraTreesClassifier(n_estimators=200)
model.fit(x_train, y_train)
answer = model.predict(x_test)
print "predict_result:", np.mean(answer == y_test)
# display the relative importance of each attribute
for m in range(len(model.feature_importances_)):
    if model.feature_importances_[m] > 0.0005:
        print "feature_importance", m, model.feature_importances_[m]
Example #38
#Paths for file saving
module_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..')
models_path = os.path.join(module_path, 'dummy_models')
baselline_path = os.path.join(module_path, 'baseline_images')

# import some data to play with
iris = datasets.load_iris()
X = iris.data
y = iris.target

# Split the data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Build a forest and compute the feature importances
forest = ExtraTreesClassifier(n_estimators=10, random_state=0)
forest.fit(X_train, y_train)

y_true = y_test
y_pred = forest.predict(X_test)
y_score = forest.predict_proba(X_test)

#Pickle model
joblib.dump(forest, os.path.join(models_path, 'classifier_with_feature_importances_model.pkl'))
#Pickle y_true
joblib.dump(y_true, os.path.join(models_path, 'classifier_with_feature_importances_y_true.pkl'))
#Pickle y_pred
joblib.dump(y_pred, os.path.join(models_path, 'classifier_with_feature_importances_y_pred.pkl'))
#Pickle y_score
joblib.dump(y_score, os.path.join(models_path, 'classifier_with_feature_importances_y_score.pkl'))
#Pickle X
joblib.dump(X, os.path.join(models_path, 'classifier_with_feature_importances_x.pkl'))
Example #39
            data['phonecharge_day_num'][i], data['phonelock_sum'][i],
            data['phonelock_var'][i], data['phonelock_mean'][i],
            data['phonelock_day_num'][i], data['in_time_second'][i],
            data['near_time_second'][i], data['in_all_percentage'][i],
            data['pre_score'][i]
        ])
    return feature


if __name__ == '__main__':
    filename = '..\\preprocess\\data\\features_and_flourishing.csv'
    all_data = get_file(filename)
    data_label = get_label(all_data)
    data_feature = get_feature(all_data)
    model = ExtraTreesClassifier()
    model.fit(data_feature, data_label)
    print(model.feature_importances_)
    j = 2
    label = []
    importance = []
    for i in model.feature_importances_:
        importance.append(i)
    for j in all_data.columns.values:
        label.append(j)
    print(label[2:len(model.feature_importances_)])
    print(importance)
    plt.bar(label[2:len(model.feature_importances_) + 2], importance)
    plt.xticks(rotation=270)
    plt.gca().margins(x=0)
    plt.gcf().canvas.draw()
    tl = plt.gca().get_xticklabels()
Beispiel #40
0
from pandas import read_csv
from sklearn.ensemble import ExtraTreesClassifier
import numpy

fileName = "pima-indians-diabetes.data.csv"
rawData = open(fileName, "rt")
colNames = [
    "preg", "plas", "pres", "skin", "test", "mass", "pedi", "age", "class"
]
data = read_csv(rawData, names=colNames)
array = data.values
X = array[:, 0:8]
Y = array[:, 8]
model = ExtraTreesClassifier()
model.fit(X, Y)
# numpy.set_printoptions(precision=3)
print("Feature Importance Values : %s" % model.feature_importances_)
Beispiel #41
0
feature_name = "selected_%s" % suffix
fname = os.path.join(config.FEAT_DIR + "/Combine",
                     feature_name + config.FEAT_FILE_SUFFIX)
data_dict = pkl_utils._load(fname)
X_train = data_dict["X_train_basic"]
X_test = data_dict["X_test"]
y_train = data_dict["y_train"]
splitter = data_dict["splitter"]
n_iter = data_dict["n_iter"]
i = n_iter - 1  # use the last split from the splitter for cross-validation
X_train_cv = data_dict["X_train_basic"][splitter[i][0], :]
X_valid_cv = data_dict["X_train_basic"][splitter[i][1], :]
y_train_cv = data_dict["y_train"][splitter[i][0]]
y_valid_cv = data_dict["y_train"][splitter[i][1]]

learner = ExtraTreesClassifier(n_estimators=500,
                               criterion='gini',
                               max_depth=5,
                               min_weight_fraction_leaf=0.0,
                               max_features='auto',
                               n_jobs=-1,
                               random_state=config.RANDOM_SEED,
                               verbose=10)
learner.fit(X_train_cv, y_train_cv)
p_test = learner.predict_proba(X_valid_cv)
print("The log loss of valid set is {}".format(log_loss(y_valid_cv, p_test)))
index = learner.feature_importances_.argsort()
for i in range(-1, -len(index) - 1, -1):  # all features, most to least important
    print("{:30}  {:30}".format(data_dict['feature_names'][index[i]],
                                learner.feature_importances_[index[i]]))
Beispiel #42
0
def getAmount(weekday, date, hour, month, degree, rain, sun):

    data = pd.read_csv("dataset_city_people_hour.csv")
    data.sort_values('Date', ascending=True, inplace=True)

    data.drop_duplicates(keep=False, inplace=True)

    # Drop unnecessary fields
    data = data.drop(['Facility'], axis=1)
    data = data.drop(['Activity'], axis=1)

    # Replace strings with integers
    data.replace(to_replace=[
        "Jan", "Feb", "Mar", "Apr", "Maj", "Jun", "Jul", "Aug", "Sep", "Oct",
        "Nov", "Dec"
    ],
                 value=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
                 inplace=True)
    data.replace(to_replace=[
        "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday",
        "Sunday"
    ],
                 value=[1, 2, 3, 4, 5, 6, 7],
                 inplace=True)

    # Fill NaN
    data.fillna(method='ffill', inplace=True)

    # Separate weather features into different classes
    data.loc[(data["Sun"] > 0) & (data["Sun"] < 1200), "Sun"] = 1  # Slight sun
    data.loc[(data["Sun"] >= 1200) & (data["Sun"] < 2400),
             "Sun"] = 2  # Moderate sun
    data.loc[(data["Sun"] >= 2400) & (data["Sun"] < 3600),
             "Sun"] = 3  # Heavy sun
    data.loc[(data["Sun"] == 3600), "Sun"] = 4  # Very heavy sun

    data.loc[(data["Rain"] > 0.0) & (data["Rain"] < 0.5),
             "Rain"] = 1  # Slight rain
    data.loc[(data["Rain"] >= 0.5) & (data["Rain"] < 4.0),
             "Rain"] = 2  # Moderate rain
    data.loc[(data["Rain"] >= 4.0) & (data["Rain"] < 8.0),
             "Rain"] = 3  # Heavy rain
    data.loc[(data["Rain"] > 8), "Rain"] = 4  # Very heavy rain

    data.loc[(data["Temp"] < -10.0), "Temp"] = 0
    data.loc[(data["Temp"] >= -10.0) & (data["Temp"] < -5.0), "Temp"] = 1
    data.loc[(data["Temp"] >= -5.0) & (data["Temp"] < 0.0), "Temp"] = 2
    data.loc[(data["Temp"] >= 0.0) & (data["Temp"] < 5.0), "Temp"] = 3
    data.loc[(data["Temp"] >= 5.0) & (data["Temp"] < 10.0), "Temp"] = 4
    data.loc[(data["Temp"] >= 10.0) & (data["Temp"] < 15.0), "Temp"] = 5
    data.loc[(data["Temp"] >= 15.0) & (data["Temp"] < 20.0), "Temp"] = 6
    data.loc[(data["Temp"] >= 20.0) & (data["Temp"] < 25.0), "Temp"] = 7
    data.loc[(data["Temp"] >= 25.0) & (data["Temp"] < 30.0), "Temp"] = 8
    data.loc[(data["Temp"] >= 30.0), "Temp"] = 9

    numOfPeople = 10
    counter = 1

    # Separate the number of people each hour into different classes
    for i in range(1, 80, numOfPeople):
        data.loc[(data["People"] >= i) & (data["People"] < i + numOfPeople),
                 "People"] = counter - 1
        counter += 1
    data.loc[(data["People"] > 80), "People"] = counter - 1

    # Convert to int
    data["Temp"] = data["Temp"].astype(int)
    data["Rain"] = data["Rain"].astype(int)

    # Get Data and target
    Y = data.iloc[:, 7]
    X = data.drop(["People"], axis=1)

    # Drop features to compare result
    #X = X.drop(["Date"], axis=1) # Drop Date
    #X = X.drop(["Rain"], axis=1) # Drop Rain
    #X = X.drop(["Sun"], axis=1) # Drop Sun
    #X = X.drop(["Month"], axis=1) # Drop Month
    #X = X.drop(["Day"], axis=1) # Drop Day
    #X = X.drop(["Temp"], axis=1) # Drop Temp

    cv2 = KFold(shuffle=True, n_splits=5)
    # Split
    X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                        Y,
                                                        test_size=0.33,
                                                        random_state=42)

    #KNN
    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(X_train, Y_train)
    #print(knn.predict([[3, 28, hour, 9, 8, 0, 0]]))
    print("Knn: " + str(knn.score(X_test, Y_test)))

    # SVM
    svm_model_linear = SVC(kernel='rbf', C=2,
                           gamma='auto').fit(X_train, Y_train)
    svm_predictions = svm_model_linear.predict(X_test)
    print("SVM: " + str(svm_model_linear.score(X_test, Y_test)))

    # Random forest
    rndF = RandomForestClassifier(100, random_state=0)
    rndF.fit(X_train, Y_train)
    rndfPred = rndF.predict(X_test)
    cm = confusion_matrix(Y_test, rndfPred)
    print("Random forest: " + str(rndF.score(X_test, Y_test)))

    # Decision tree
    DTree = DecisionTreeClassifier(random_state=0)
    DTree.fit(X_train, Y_train)
    print("Decision trees: " + str(DTree.score(X_test, Y_test)))

    # Extra trees
    exT = ExtraTreesClassifier(n_estimators=100, random_state=0)
    exT.fit(X_train, Y_train)
    print("Extra trees: " + str(exT.score(X_test, Y_test)))

    # Naive bayes
    NB = MultinomialNB()
    NB.fit(X_train, Y_train)
    print("Naive-bayes: " + str(NB.score(X_test, Y_test)))

    return (
        exT.predict([[weekday, date, hour, month, degree, rain, sun]])
    )  # 0 = weekday, 1 = date, 2 = hour, 3 = month, 4 = temp, 5 = rain, 6 = sun
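

# Hypothetical usage sketch, not part of the original: predict the crowd-size
# class for a Wednesday the 28th at 14:00 in September, temperature class 5,
# no rain and slight sun (argument order follows the comment above).
if __name__ == '__main__':
    print(getAmount(weekday=3, date=28, hour=14, month=9, degree=5, rain=0, sun=2))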
Beispiel #43
0
iris_data = load_iris()

X = iris_data.data
y = iris_data.target

print(X.shape, y.shape)

# %% [markdown]
# ### Train classifier

# %%
from sklearn.ensemble import ExtraTreesClassifier

clf = ExtraTreesClassifier(n_estimators=15, random_state=0)
clf.fit(X, y)

# %% [markdown]
# ### Transpile classifier

# %%
from sklearn_porter import Porter

porter = Porter(clf, language='java')
output = porter.export(export_data=True)

print(output)

# %% [markdown]
# ### Run classification in Java
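# %%
# Sketch, not part of the original notebook: with export_data=True, sklearn_porter
# typically writes the estimator data alongside a Java class named after the
# estimator. A rough shell workflow (class name and CLI arguments are assumptions
# that depend on the sklearn_porter version) would be:
#
#   javac ExtraTreesClassifier.java
#   java ExtraTreesClassifier data.json 5.1 3.5 1.4 0.2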
Beispiel #44
0
def train():

    #
    # load tweet features
    #

    tweet_features = np.loadtxt('output/devset_tweet_features.dat',
                                delimiter=',')
    tweet_labels = np.array(tweet_features[:, -1], dtype=int)
    tweet_features = tweet_features[:, :-1]

    # make the training set balanced
    training_posts = read_list('dataset_for_training/real_tweet_id.data')
    training_posts.extend(read_list('dataset_for_training/fake_tweet_id.data'))
    all_posts = read_list('output/devset_eff_posts.dat')
    used_ind = np.ones((len(all_posts), ), dtype=bool)

    for ind, p in enumerate(all_posts):
        if not p in training_posts:
            used_ind[ind] = False

    tweet_features = tweet_features[used_ind, :]
    tweet_labels = tweet_labels[used_ind]

    #
    # training classifier 1
    #

    detector = None
    if classifier1 == 'logis':
        detector = logis(C=1e5, solver='liblinear', multi_class='ovr')
    elif classifier1 == 'svm':
        detector = svm.SVC()
    elif classifier1 == 'randomforest':
        detector = ExtraTreesClassifier(n_estimators=200,
                                        max_depth=None,
                                        min_samples_split=1,
                                        random_state=0)

    scaler_1 = preprocessing.StandardScaler().fit(tweet_features)
    tweet_features = scaler_1.transform(tweet_features)
    detector.fit(tweet_features, tweet_labels)
    with open('output/RUN_2_classifier_1.pickle', 'wb') as handle:
        pickle.dump(detector, handle)
    with open('output/RUN_2_scaler_1.pickle', 'wb') as handle:
        pickle.dump(scaler_1, handle)

    #
    # load textual and forensic features
    #

    forensic_features = np.loadtxt('output/devset_forensic_features.dat',
                                   delimiter=',',
                                   dtype=float)
    eff_forensic_topics = read_list('output/devset_eff_forensic_topics.dat')
    textual_features = np.loadtxt('output/devset_textual_features.dat',
                                  delimiter=',',
                                  dtype=float)
    eff_textual_topics = read_list('output/devset_eff_textual_topics.dat')

    real_mul_list = read_list('dataset_for_training/real_image_id.data')
    fake_mul_list = read_list('dataset_for_training/fake_image_id.data')
    mul_list = list(real_mul_list)
    mul_list.extend(fake_mul_list)

    topic_features = np.zeros(
        (len(mul_list),
         forensic_features.shape[1] + textual_features.shape[1]),
        dtype=float)
    topic_labels = np.zeros((len(mul_list), ), dtype=int)
    used_ind = np.ones((len(mul_list), ), dtype=bool)
    for ind, m in enumerate(mul_list):
        if m in eff_forensic_topics:
            ind1 = eff_forensic_topics.index(m)
            topic_features[
                ind, :forensic_features.shape[1]] = forensic_features[ind1]
        if m in eff_textual_topics:
            ind2 = eff_textual_topics.index(m)
            topic_features[
                ind, forensic_features.shape[1]:] = textual_features[ind2]
        if not (m in eff_forensic_topics or m in eff_textual_topics):
            used_ind[ind] = False

        label = 1
        if m in fake_mul_list:
            label = -1

        topic_labels[ind] = label

    # remove unused topic features
    topic_features = topic_features[used_ind, :]
    topic_labels = topic_labels[used_ind]

    detector_2 = None
    if classifier2 == 'logis':
        detector_2 = logis(C=1e5, solver='liblinear', multi_class='ovr')
    elif classifier2 == 'svm':
        detector_2 = svm.SVC()
    elif classifier2 == 'randomforest':
        detector_2 = ExtraTreesClassifier(n_estimators=200,
                                          max_depth=None,
                                          min_samples_split=1,
                                          random_state=0)

    scaler_2 = preprocessing.StandardScaler().fit(topic_features)
    topic_features = scaler_2.transform(topic_features)
    detector_2.fit(topic_features, topic_labels)
    with open('output/RUN_2_classifier_2.pickle', 'wb') as handle:
        pickle.dump(detector_2, handle)
    with open('output/RUN_2_scaler_2.pickle', 'wb') as handle:
        pickle.dump(scaler_2, handle)

    print('Training statistics\n')
    print('Number of real tweets: ', sum(tweet_labels == 1))
    print('Number of fake tweets: ', sum(tweet_labels == -1))
    print('Number of real topics: ', sum(topic_labels == 1))
    print('Number of fake topics: ', sum(topic_labels == -1))
Beispiel #45
0
    def model_builder(self):

        self.df = self.df.drop(
            ["duration", "job", "contact", "month", "poutcome"], axis=1)
        self.df.head()
        self.df.columns
        self.df["marital"] = [
            0 if each == "single" else 1 for each in self.df.marital
        ]
        self.df["default"] = [
            0 if each == "no" else 1 for each in self.df.default
        ]
        self.df["housing"] = [
            0 if each == "no" else 1 for each in self.df.housing
        ]
        self.df["loan"] = [0 if each == "no" else 1 for each in self.df.loan]
        self.df["y"] = [0 if each == "no" else 1 for each in self.df.y]
        # Encode education ordinally: unknown/primary -> 0, secondary -> 1, else 2
        self.df["education"] = [
            0 if each in ("unknown", "primary") else 1 if each == "secondary" else 2
            for each in self.df.education
        ]

        # Splitting the dataset into the Training set and Test set
        X = self.df.iloc[:, 0:11].values
        y = self.df.iloc[:, 11].values

        from sklearn.model_selection import train_test_split
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.25,
                                                            random_state=0)

        # Feature Scaling
        sc = StandardScaler()
        X = sc.fit_transform(X)
        X_train = sc.fit_transform(X_train)
        X_test = sc.transform(X_test)

        newdata = []

        #using 9 ML models to create a secondary dataset

        knn = KNeighborsClassifier(n_neighbors=10)  # n_neighbors means k
        knn.fit(X_train, y_train)
        y_pred_knn = knn.predict(X_test)
        file_knn1 = 'file_knn1.sav'
        if self.test != True:
            pickle.dump(knn, open(file_knn1, 'wb'))

        RF = RandomForestClassifier(n_estimators=100,
                                    criterion='entropy',
                                    random_state=0)
        RF.fit(X_train, y_train)
        y_pred_RF = RF.predict(X_test)
        file_rf1 = 'file_rf1.sav'
        if self.test != True:
            pickle.dump(RF, open(file_rf1, 'wb'))

        dtclassifier = DecisionTreeClassifier(criterion='entropy')
        dtclassifier.fit(X_train, y_train)
        y_pred_DT = dtclassifier.predict(X_test)
        file_dt1 = 'file_dt1.sav'
        if self.test != True:
            pickle.dump(dtclassifier, open(file_dt1, 'wb'))

        from sklearn.naive_bayes import GaussianNB
        nbclassifier = GaussianNB()
        nbclassifier.fit(X_train, y_train)
        nb_y_pred = nbclassifier.predict(X_test)
        file_nb1 = 'file_nb1.sav'
        if self.test != True:
            pickle.dump(nbclassifier, open(file_nb1, 'wb'))

        svmkclassifier = SVC(kernel='rbf', random_state=0, gamma='auto')
        svmkclassifier.fit(X_train, y_train)
        y_pred_SVMK = svmkclassifier.predict(X_test)
        file_svm1 = 'file_svm1.sav'
        if self.test != True:
            pickle.dump(svmkclassifier, open(file_svm1, 'wb'))

        bg = BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                               n_estimators=100,
                               random_state=15)
        bg.fit(X_train, y_train)
        y_pred_bg = bg.predict(X_test)
        file_bg1 = 'file_bg1.sav'
        if self.test != True:
            pickle.dump(bg, open(file_bg1, 'wb'))

        et = ExtraTreesClassifier(n_estimators=100, max_features=4)
        et.fit(X_train, y_train)
        y_pred_et = et.predict(X_test)
        file_et1 = 'file_et1.sav'
        if self.test != True:
            pickle.dump(et, open(file_et1, 'wb'))

        adb = AdaBoostClassifier(n_estimators=50, random_state=4)
        adb.fit(X_train, y_train)
        y_pred_adb = adb.predict(X_test)
        file_adb1 = 'file_adb1.sav'
        if self.test != True:
            pickle.dump(adb, open(file_adb1, 'wb'))

        gb = GradientBoostingClassifier(n_estimators=1000, random_state=4)
        gb.fit(X_train, y_train)
        y_pred_gb = gb.predict(X_test)
        file_gb1 = 'file_gb1.sav'
        if self.test != True:
            pickle.dump(gb, open(file_gb1, 'wb'))

        #creation of secondary dataset using the primary dataset
        newdata = pd.DataFrame({
            "knn": y_pred_knn,
            "rf": y_pred_RF,
            "DT": y_pred_DT,
            "nb": nb_y_pred,
            "SVM": y_pred_SVMK,
            "BG": y_pred_bg,
            "ET": y_pred_et,
            "ADB": y_pred_adb,
            "GB": y_pred_gb
        })
        if self.test != True:
            newdata.to_csv("secondary_dataset.csv")

        # In[ ]:

        X_train, X_test, y_train, y_test = train_test_split(newdata,
                                                            y_test,
                                                            test_size=0.1,
                                                            random_state=0)

        from sklearn.naive_bayes import GaussianNB
        nbclassifier2 = GaussianNB()
        nbclassifier2.fit(X_train, y_train)
        nb_y_pred = nbclassifier2.predict(X_test)
        self.accuracy = accuracy_score(y_test, nb_y_pred) * 100

        file_final = 'file_final.sav'
        if self.test != True:
            pickle.dump(nbclassifier2, open(file_final, 'wb'))

        return self.accuracy
Beispiel #46
0
import pandas as pd
import numpy as np
#read files
arquivo = pd.read_csv('C:/Users/jvict/OneDrive/Documents/wine_dataset.csv')

#red=0 and white=1
arquivo['style'] = arquivo['style'].replace('red', 0)
arquivo['style'] = arquivo['style'].replace('white', 1)

#set the array
y = arquivo['style']
X = arquivo.drop('style', axis=1)

#split the arrays between train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

#set up the extra-trees model and fit it
from sklearn.ensemble import ExtraTreesClassifier
clt = ExtraTreesClassifier()
clt.fit(X_train, y_train)

#measure the accuracy of the model
resultado = clt.score(X_test, y_test)
print(resultado)
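
# Sketch, not part of the original: inspect which chemical attributes drive the
# red/white separation via the fitted model's feature importances.
importances = pd.Series(clt.feature_importances_, index=X.columns).sort_values(ascending=False)
print(importances.head(5))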
Beispiel #47
0
    num_round = 100
    lgb_model = lgb.train(param,
                          train_data,
                          num_round,
                          valid_sets=[lgb.Dataset(X_test, y_test)],
                          early_stopping_rounds=1)
    print("Test")
    eval_metric(confusion_matrix(y_test, lgb_model.predict(X_test).round()))
    print("Training")
    eval_metric(confusion_matrix(y_train, lgb_model.predict(X_train).round()))

    print("Extra decision tree classifier")
    model = ExtraTreesClassifier(n_estimators=200,
                                 max_depth=None,
                                 min_samples_split=2)
    model.fit(X_train, y_train)
    print("Test")
    eval_metric(confusion_matrix(y_test, model.predict(X_test).round()))
    print("Training")
    eval_metric(confusion_matrix(y_train, model.predict(X_train).round()))

    print("Decision tree classifier")
    model = DecisionTreeClassifier(max_depth=None, min_samples_split=2)
    model.fit(X_train, y_train)
    print("Test")
    eval_metric(confusion_matrix(y_test, model.predict(X_test).round()))
    print("Training")
    eval_metric(confusion_matrix(y_train, model.predict(X_train).round()))

    print("Decision tree classifier with scaler and PCA")
    model = make_pipeline(
Beispiel #48
0
bestfeatures = SelectKBest(score_func=chi2, k=7)
fit = bestfeatures.fit(X,y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)
#concat two dataframes for better visualization
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Features','Score']  #naming the dataframe columns
print(featureScores.nlargest(7,'Score'))  #print the 7 best features

# Feature Importance
feature_importance = []

for i in range(250):
	model = ExtraTreesClassifier()
	model.fit(X, y)
	model_feature_importance = model.feature_importances_
	print(model_feature_importance) #use inbuilt class feature_importances of tree based classifiers
	feature_importance.append(model_feature_importance)

feature_importance = np.array(feature_importance)

#plot graph of feature importances for better visualization
avg = np.mean(feature_importance, axis=0)
feat_importances = pd.Series(avg, index=X.columns)
feat_importances.nlargest(10).plot(kind='barh')
plt.xlabel('Importance Score')
plt.ylabel('Feature')
plt.title('Feature Importance')

plt.show()
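
# Sketch, not part of the original: alongside the averaged importances, the
# run-to-run spread over the 250 fits can be inspected from the same array.
importance_std = np.std(feature_importance, axis=0)
print(pd.Series(importance_std, index=X.columns).nlargest(10))
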
def models(dataset):
    print("Models")

    x_train_res, x_val_res, y_train_res, y_val_res = train_test(dataset)

    rf = RandomForestClassifier(n_estimators=40, max_depth=10)
    rf.fit(x_train_res, y_train_res)
    filename = 'rf_model.pckl'
    pickle.dump(rf, open(filename, 'wb'))
    # some time later...
    # load the model from disk
    RandomForest_model = pickle.load(open(filename, 'rb'))
    print("RandomForestClassifier")

    knn = KNeighborsClassifier(n_neighbors=4)
    # fitting the model
    knn.fit(x_train_res, y_train_res)
    filename = 'knn_model.pckl'
    pickle.dump(knn, open(filename, 'wb'))
    # some time later...
    # load the model from disk
    K_nearest_model = pickle.load(open(filename, 'rb'))
    print("KNeighborsClassifier")

    lr = LogisticRegression()
    # fitting the model
    lr.fit(x_train_res, y_train_res)
    filename = 'lr_model.pckl'
    pickle.dump(lr, open(filename, 'wb'))
    # some time later...
    # load the model from disk
    Log_Reg_model = pickle.load(open(filename, 'rb'))
    print("LogisticRegression")


    bnb = GaussianNB()
    # fitting the model
    bnb.fit(x_train_res, y_train_res)
    filename = 'bnb_model.pckl'
    pickle.dump(bnb, open(filename, 'wb'))
     # some time later...
     # load the model from disk
    Bernoulli_Nb_model = pickle.load(open(filename, 'rb'))
    print("BernoulliNB")

    extr = ExtraTreesClassifier(n_estimators = 50, random_state = 123)
    # fitting the model
    extr.fit(x_train_res, y_train_res)
    filename = 'extra_tree_model.pckl'
    pickle.dump(extr, open(filename, 'wb'))
     # some time later...
     # load the model from disk
    Extra_Tree_model = pickle.load(open(filename, 'rb'))
    print("ExtraTreesClassifier")
    
    #randomForest_model = random_forest(dataset)
    #K_nearest_model = k_n(dataset)
    #Log_Reg_model = logReg(dataset)
    #Bernoulli_Nb_model = BernouNb(dataset)
    #Extra_Tree_model = ex_tr(dataset)
    #ExtraTreez_model = xtraTree(dataset)
    model = [RandomForest_model,
             K_nearest_model,
             Log_Reg_model,
             Bernoulli_Nb_model,
             Extra_Tree_model
             ]
    return(model)
from sklearn import datasets
mnist = datasets.fetch_mldata('MNIST original')
x, y = mnist.data, mnist.target

# Tree-based models
from sklearn.datasets import load_iris

iris = load_iris()
ix, iy = iris.data, iris.target
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.feature_selection import SelectFromModel

model1 = ExtraTreesClassifier()
model2 = GradientBoostingClassifier()
model1.fit(ix, iy)
model2.fit(ix, iy)
model1.feature_importances_
model2.feature_importances_
clf1 = SelectFromModel(model1, prefit=True)
clf2 = SelectFromModel(model2, prefit=True)
clf1.get_support()
clf2.get_support()
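# Sketch, not part of the original: SelectFromModel can also shrink the feature
# matrix directly; transform keeps only the columns whose importance is above
# the default (mean importance) threshold.
ix_selected = clf1.transform(ix)
print(ix.shape, ix_selected.shape)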

#---
# sklearn cross-validation
from sklearn.cross_validation import cross_val_score
#cross_val_score(model, X, y, cv=10)
from sklearn.cross_validation import cross_val_predict
#cross_val_predict(model, X, y, cv=10)
from sklearn.cross_validation import LeaveOneOut
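# Sketch, not part of the original: leave-one-out CV with the extra-trees model
# fitted above; LeaveOneOut in the old cross_validation API takes the number of
# samples, and the model is refit once per sample, so keep this to small data.
#cross_val_score(model1, ix, iy, cv=LeaveOneOut(len(iy)))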
Beispiel #51
0
class perform_ml():
    def __init__(self, df):

        self.df = df

        self.conn = sqlite3.connect('earnings.db', timeout=120)
        self.features = list(self.df.columns)
        #print(self.features)

        for remove_me in [
                '5 Day Change', '10 Day Change', '5 Day Change Abnormal',
                '10 Day Change Abnormal', 'Date Reported', 'Time Reported',
                'Symbol', 'Market Cap Text'
        ]:

            self.features.remove(remove_me)

        self.first_run = True
        self.max_means = -90
        self.iterations = 5
        self.start_feature_imp = [0]
        while True:
            self.buy_cutoff = .03
            self.cutoff_found = False
            self.test_df = df
            self.prepare_data()

            self.means = []
            self.num_trades = []
            self.accuracys = []

            self.current_means = []
            self.current_num_trades = []
            self.current_accuracys = []
            # TODO: if we keep a feature, start over again
            print('======================')
            print('using features', self.features)

            for i in range(self.iterations):
                num_trades = 500

                mean = self.find_cutoff()
                if mean < 0 and self.first_run == False:
                    break

                self.prepare_data()
                self.train_model()
                self.predict()

                mean_return, num_trades, accuracy = self.get_results(self.test)

                self.means.append(mean_return)
                self.num_trades.append(num_trades)
                self.accuracys.append(accuracy)

                mean_return, num_trades, accuracy = self.get_results(
                    self.test_2019)

                self.current_means.append(mean_return)
                self.current_num_trades.append(num_trades)
                self.current_accuracys.append(accuracy)

            if self.first_run:
                #print('starting result', this_runs_avg, this_runs_num_trades,self.buy_cutoff, self.means)
                self.store_results()
                self.start_feature_imp = list(self.feature_imp.keys())
                self.start_feature_imp.insert(0, 'Before and After')
                self.initial_feature_imp = self.start_feature_imp.copy()
                self.features = []
                self.first_run = False

                self.add_feature()
                continue

            self.store_results()

            #self.add_feature()
            try:
                if self.this_runs_avg > self.max_means:
                    self.max_means = self.this_runs_avg
                    self.start_feature_imp = self.initial_feature_imp.copy()

                    self.add_feature()
                else:
                    self.remove_added_feature()
                    self.add_feature()
            except:
                break

    def find_cutoff(self):
        while True:

            if self.cutoff_found == True:
                mean_return = 1
                break
            self.prepare_data()
            self.train_model()
            self.predict()
            mean_return, num_trades, accuracy = self.get_results(self.test)
            scaler = int(num_trades / 250) + 1

            #print('finding cutoff', mean, num_trades, self.buy_cutoff, scaler)

            if num_trades < 300:
                print('found cutoff')
                self.cutoff_found = True
                break

            self.buy_cutoff = round(self.buy_cutoff + (.005 * scaler), 4)
        return mean_return

    def store_results(self):
        try:
            self.this_runs_avg = sum(self.means) / self.iterations
            this_runs_num_trades = sum(self.num_trades) / self.iterations
            accuracy = sum(self.accuracys) / self.iterations
            stddev = np.std(self.means)

            current_avg = sum(self.current_means) / self.iterations
            current_num_trades = sum(self.current_num_trades) / self.iterations
            current_accuracy = sum(self.current_accuracys) / self.iterations
            current_stddev = np.std(self.current_means)
            #print(self.this_runs_avg, this_runs_num_trades, self.buy_cutoff, self.means, stddev, self.this_years_avg, accuracy)

            out_df = pd.DataFrame([[
                self.this_runs_avg, stddev, this_runs_num_trades, accuracy,
                current_avg, current_stddev, current_num_trades,
                current_accuracy, self.buy_cutoff,
                str(self.means),
                str(self.num_trades),
                str(self.features)
            ]])

            out_df.columns = [
                'Avg Return', 'Std Dev', 'Avg Num Trades', 'Accuracy',
                'Current Avg Return', 'Current Std Dev',
                'Current Avg Num Trades', 'Current Accuracy', 'Buy Cutoff',
                'Returns', 'Num Trades', 'Features'
            ]
            print(out_df)
            print(self.max_means)
            #self.test.to_csv('test.csv')
            #input()

            out_df.to_sql('current_predictions', self.conn, if_exists='append')
        except Exception as e:
            print(e)
            pass

    def remove_added_feature(self):

        print('removing added feature ', self.feature_added)
        self.features.remove(self.feature_added)

    # TODO: add two features at a time
    def add_feature(self):

        self.feature_added = self.start_feature_imp.pop(0)
        while self.feature_added in self.features:
            print('not adding feature', self.feature_added,
                  'as it already exists')
            self.feature_added = self.start_feature_imp.pop(0)
        print('adding feature', self.feature_added)
        self.features.append(self.feature_added)

    def prepare_data(self):
        self.test_df['is_train'] = 'Train'
        self.test_df['is_train'].values[
            (self.test_df['Date Reported'] >= datetime.strptime(
                '2018-01-01', '%Y-%m-%d'))
            & (self.test_df['Date Reported'] <= datetime.strptime(
                '2018-12-31', '%Y-%m-%d'))] = 'Test 2018'
        self.test_df['is_train'].values[
            self.test_df['Date Reported'] >= datetime.strptime(
                '2019-01-01', '%Y-%m-%d')] = 'Test 2019'

        self.test_df['Action'] = 'None'
        self.test_df['Action'].values[self.test_df['10 Day Change Abnormal'].
                                      values > self.buy_cutoff] = "Buy"
        self.test_df['Action'] = self.test_df['Action'].astype('category')
        self.test_df["Action Code"] = self.test_df["Action"].cat.codes

        self.test_df = self.test_df[self.features + [
            'Action', 'Action Code', 'is_train', '10 Day Change Abnormal',
            '10 Day Change', 'Date Reported', 'Symbol'
        ]]

        self.test_df = self.test_df.replace('-', np.nan)
        self.test_df = self.test_df.replace([np.inf, -np.inf], np.nan)
        self.test_df = self.test_df.dropna()

        self.train, self.test = self.test_df[
            self.test_df['is_train'] == 'Train'], self.test_df[
                self.test_df['is_train'] == 'Test 2018']

        self.train_2019 = pd.concat([
            self.test_df[self.test_df['is_train'] == 'Train'],
            self.test_df[self.test_df['is_train'] == 'Test 2018']
        ])
        self.test_2019 = self.test_df[self.test_df['is_train'] == 'Test 2019']

    def train_model(self, fast=False):

        self.clf = ExtraTreesClassifier(n_jobs=-1, n_estimators=500)

        #self.clf = RandomForestClassifier(n_jobs=-1)
        y = self.train['Action Code']

        train = self.train[self.features]
        self.clf.fit(train, y)

        self.clf_2019 = ExtraTreesClassifier(n_jobs=-1, n_estimators=500)

        #self.clf = RandomForestClassifier(n_jobs=-1)
        y = self.train_2019['Action Code']

        train = self.train_2019[self.features]
        self.clf_2019.fit(train, y)

    def predict(self):
        preds = self.clf.predict(self.test[self.features])
        preds = pd.DataFrame(preds).astype(str)
        preds.columns = ['Predicted']
        preds = preds.replace('0', 'Buy').replace('1', 'None')
        self.test['Predicted'] = list(preds['Predicted'])

        preds = self.clf_2019.predict(self.test_2019[self.features])
        preds = pd.DataFrame(preds).astype(str)
        preds.columns = ['Predicted']
        preds = preds.replace('0', 'Buy').replace('1', 'None')
        self.test_2019['Predicted'] = list(preds['Predicted'])

    def get_results(self, test_data):
        self.feature_imp = pd.Series(
            self.clf.feature_importances_,
            index=self.features).sort_values(ascending=False)

        if self.first_run:
            print(self.feature_imp)

        chosen = test_data[test_data['Predicted'] == 'Buy']
        mean_return = round(chosen['10 Day Change'].mean() * 100, 4)

        accuracy = len(chosen[chosen['10 Day Change'] > 0]) / float(
            len(chosen))

        return mean_return, len(chosen), accuracy
Beispiel #52
0
    plt.matshow(corrmat, fignum=figure.number)
    plt.colorbar().ax.tick_params(labelsize=20, length=10)
    # plt.title(f"Correlations at {window_ms}ms windows and {stride_ms}ms overlap", fontsize=30)
    plt.xticks(range(data.shape[1]), list(range(22)), fontsize=20)
    plt.tick_params(length=10, bottom=False)
    plt.clim(-1, 1)

    # Add feature names as y-axis labels
    plt.yticks([-0.5] + list(range(data.shape[1])) + [data.shape[1] - 0.5],
               [""] + column_labels + [""],
               fontsize=25)
    plt.savefig('heatmap.png', bbox_inches='tight')
    plt.show()

    # Feature importance
    indices = [
        datapoint_attribute_descriptions[label] for label in datapoint_features
    ]
    classifier = ExtraTreesClassifier(n_estimators=250)
    classifier.fit(X, y)
    importance = pd.Series(classifier.feature_importances_, index=indices)
    importance.nlargest(15).plot(kind='barh')
    plt.show()

    # Feature durations
    durations_path = f"data\\feature\\{conf.imp_type}\\{conf.dos_type}\\mixed_validation_time_100ms_100ms.csv"
    feature_times = datareader_csv.load_feature_durations(durations_path)
    del feature_times['time_ms']
    del feature_times['class_label']
    feature_plotting.plot_feature_barcharts(feature_times)
Beispiel #53
0
    pred_gbt = pred_gbt + list(model_gbt.fit(X[indxs_to_fit[:]], y[indxs_to_fit[:]]).predict_proba(X[indxs,:])[:,1])
    new_Y = new_Y + list(y[indxs[:]])
	
                                                                   
new_X = np.hstack((np.array(pred_ridge).reshape(len(pred_ridge), 1), np.array(pred_randomforest).reshape(len(pred_randomforest), 1), np.array(pred_lasso).reshape(len(pred_lasso), 1), np.array(pred_gbt).reshape(len(pred_gbt), 1)))
print new_X
new_Y = np.array(new_Y).reshape(len(new_Y), 1)

# <codecell>

#model_stacker = lm.LogisticRegression()
model_stacker = ExtraTreesClassifier(n_estimators=250,
                              random_state=0)
print np.mean(cross_val_score(model_stacker, new_X, new_Y.reshape(new_Y.shape[0]), cv=5))

model_stacker.fit(new_X, new_Y.reshape(new_Y.shape[0]))
#save model to disk
filename = 'blendedmodel.sav'
pickle.dump(model_stacker, open(filename, 'wb'))
print "all done Teerth"

importances = model_stacker.feature_importances_
std = np.std([tree.feature_importances_ for tree in model_stacker.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for f in range(X.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
Beispiel #54
0
SVM = LinearSVC(random_state=42, loss="hinge")
SVC = SVC(random_state=42, kernel = "poly", degree = 3, C=67)

named_estimators = [
    ("random_forest_clf", random_forest_clf),
    ("extra_trees_clf", extra_trees_clf),
    ("SVM", SVM)
]


poly= Pipeline([
    ("polyfeat", PolynomialFeatures(degree=3)),
    ("svm_clf", LinearSVC(C=67, loss="hinge"))
])

extra_trees_clf.fit(X_train, y_train)
y_pred = extra_trees_clf.predict(X_test)
accuracy_score(y_test, y_pred)


# In[ ]:


estimators = [random_forest_clf, extra_trees_clf, SVM]
for estimator in estimators:
    print("Training the", estimator)
    estimator.fit(X_train, y_train)


# In[ ]:
Beispiel #55
0
tar_test.shape

tar_train.describe()  # 1 (positive) more often -> always predict positive
tar_test.describe()  # 0.56

from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier(n_estimators=25)
classifier = classifier.fit(pred_train, tar_train)
predictions = classifier.predict(pred_test)

sklearn.metrics.confusion_matrix(tar_test, predictions)
sklearn.metrics.accuracy_score(tar_test, predictions)  #0.57
model = ExtraTreesClassifier()
model.fit(pred_train, tar_train)

var_name = (pred_train.columns.tolist())
var_sig = (list(model.feature_importances_))
var_imp = DataFrame(columns=var_name)
var_imp.loc['Imp'] = [list(model.feature_importances_)[n] for n in range(7)]
var_imp[var_imp.ix[var_imp.last_valid_index()].argsort()[::-1]]

trees = range(25)
accuracy = np.zeros(25)
for idx in range(len(trees)):

    classifier = RandomForestClassifier(n_estimators=idx + 1)

    classifier = classifier.fit(pred_train, tar_train)
Beispiel #56
0
class ExtraTreesClassifier(IterativeComponentWithSampleWeight,
                           BaseClassificationModel):
    def __init__(self,
                 criterion,
                 min_samples_leaf,
                 min_samples_split,
                 max_features,
                 bootstrap,
                 max_leaf_nodes,
                 max_depth,
                 min_weight_fraction_leaf,
                 min_impurity_decrease,
                 oob_score=False,
                 n_jobs=1,
                 random_state=None,
                 verbose=0,
                 class_weight=None):

        self.n_estimators = self.get_max_iter()
        if criterion not in ("gini", "entropy"):
            raise ValueError("'criterion' is not in ('gini', 'entropy'): "
                             "%s" % criterion)
        self.criterion = criterion

        if check_none(max_depth):
            self.max_depth = None
        else:
            self.max_depth = int(max_depth)
        if check_none(max_leaf_nodes):
            self.max_leaf_nodes = None
        else:
            self.max_leaf_nodes = int(max_leaf_nodes)

        self.min_samples_leaf = int(min_samples_leaf)
        self.min_samples_split = int(min_samples_split)
        self.max_features = float(max_features)
        self.bootstrap = check_for_bool(bootstrap)
        self.min_weight_fraction_leaf = float(min_weight_fraction_leaf)
        self.min_impurity_decrease = float(min_impurity_decrease)
        self.oob_score = oob_score
        self.n_jobs = int(n_jobs)
        self.random_state = random_state
        self.verbose = int(verbose)
        self.class_weight = class_weight
        self.estimator = None

    @staticmethod
    def get_max_iter():
        return 512

    def get_current_iter(self):
        return self.estimator.n_estimators

    def iterative_fit(self, X, y, sample_weight=None, n_iter=1, refit=False):
        from sklearn.ensemble import ExtraTreesClassifier as ETC

        if refit:
            self.estimator = None

        if self.estimator is None:
            max_features = int(X.shape[1]**float(self.max_features))
            self.estimator = ETC(
                n_estimators=n_iter,
                criterion=self.criterion,
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                bootstrap=self.bootstrap,
                max_features=max_features,
                max_leaf_nodes=self.max_leaf_nodes,
                min_weight_fraction_leaf=self.min_weight_fraction_leaf,
                min_impurity_decrease=self.min_impurity_decrease,
                oob_score=self.oob_score,
                n_jobs=self.n_jobs,
                verbose=self.verbose,
                random_state=self.random_state,
                class_weight=self.class_weight,
                warm_start=True)

        else:
            self.estimator.n_estimators += n_iter
            self.estimator.n_estimators = min(self.estimator.n_estimators,
                                              self.n_estimators)

        self.estimator.fit(X, y, sample_weight=sample_weight)
        return self

    def configuration_fully_fitted(self):
        if self.estimator is None:
            return False
        return not len(self.estimator.estimators_) < self.n_estimators

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError
        return self.estimator.predict(X)

    def predict_proba(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        probas = self.estimator.predict_proba(X)
        probas = convert_multioutput_multiclass_to_multilabel(probas)
        return probas

    @staticmethod
    def get_properties(dataset_properties=None):
        return {
            'shortname': 'ET',
            'name': 'Extra Trees Classifier',
            'handles_regression': False,
            'handles_classification': True,
            'handles_multiclass': True,
            'handles_multilabel': True,
            'handles_multioutput': False,
            'is_deterministic': True,
            'input': (DENSE, SPARSE, UNSIGNED_DATA),
            'output': (PREDICTIONS, )
        }

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None,
                                        optimizer='smac'):
        cs = ConfigurationSpace()

        criterion = CategoricalHyperparameter("criterion", ["gini", "entropy"],
                                              default_value="gini")

        # The maximum number of features used in the forest is calculated as m^max_features, where
        # m is the total number of features, and max_features is the hyperparameter specified below.
        # The default is 0.5, which yields sqrt(m) features as max_features in the estimator. This
        # corresponds with Geurts' heuristic.
        max_features = UniformFloatHyperparameter("max_features",
                                                  0.,
                                                  1.,
                                                  default_value=0.5)
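        # Illustrative example (not part of the original): with m = 100 input
        # features and max_features = 0.5, iterative_fit above uses
        # int(100 ** 0.5) = 10 features per split.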

        max_depth = UnParametrizedHyperparameter(name="max_depth",
                                                 value="None")

        min_samples_split = UniformIntegerHyperparameter("min_samples_split",
                                                         2,
                                                         20,
                                                         default_value=2)
        min_samples_leaf = UniformIntegerHyperparameter("min_samples_leaf",
                                                        1,
                                                        20,
                                                        default_value=1)
        min_weight_fraction_leaf = UnParametrizedHyperparameter(
            'min_weight_fraction_leaf', 0.)
        max_leaf_nodes = UnParametrizedHyperparameter("max_leaf_nodes", "None")
        min_impurity_decrease = UnParametrizedHyperparameter(
            'min_impurity_decrease', 0.0)

        bootstrap = CategoricalHyperparameter("bootstrap", ["True", "False"],
                                              default_value="False")
        cs.add_hyperparameters([
            criterion, max_features, max_depth, min_samples_split,
            min_samples_leaf, min_weight_fraction_leaf, max_leaf_nodes,
            min_impurity_decrease, bootstrap
        ])

        return cs
import numpy as np
import pandas as pd

pif = np.loadtxt('processed_imputed_features.txt')

y = pd.read_csv('train.csv')['Complaint-Status']
train_length = len(y)

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

from sklearn.ensemble import ExtraTreesClassifier
etc = ExtraTreesClassifier()
etc.fit(pif[:train_length, :], y)
print(etc.feature_importances_)
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pcad = pca.fit_transform(pif[:, :3])

import matplotlib
import matplotlib.pyplot as plt

colors = ['red', 'green', 'blue', 'purple', 'yellow']

plt.scatter(pcad[:train_length, 0],
            pcad[:train_length, 1],
            c=y,
            cmap=matplotlib.colors.ListedColormap(colors))
plt.show()
Beispiel #58
0
print(XGBClassifier_accy)  # 0.816

# AdaBoost Classifier
from sklearn.ensemble import AdaBoostClassifier

adaboost = AdaBoostClassifier()
adaboost.fit(x_train, y_train)
y_pred = adaboost.predict(x_test)
adaboost_accy = round(accuracy_score(y_pred, y_test), 3)
print(adaboost_accy)  # 0.786

# Extra Trees Classifier
from sklearn.ensemble import ExtraTreesClassifier

ExtraTreesClassifier = ExtraTreesClassifier()
ExtraTreesClassifier.fit(x_train, y_train)
y_pred = ExtraTreesClassifier.predict(x_test)
extraTree_accy = round(accuracy_score(y_pred, y_test), 3)
print(extraTree_accy)  # 0.786

# Gaussian Process Classifier
from sklearn.gaussian_process import GaussianProcessClassifier

GaussianProcessClassifier = GaussianProcessClassifier()
GaussianProcessClassifier.fit(x_train, y_train)
y_pred = GaussianProcessClassifier.predict(x_test)
gau_pro_accy = round(accuracy_score(y_pred, y_test), 3)
print(gau_pro_accy)  # 0.786

# Voting classifier
from sklearn.ensemble import VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
filename = '../../datasets/pima-indians_classification_train.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
inputx = array[:,0:8]
outputy = array[:,8]
num_folds = 10
kfold = KFold(n_splits=10, random_state=None)
model = ExtraTreesClassifier(n_estimators=100)
results = cross_val_score(model, inputx, outputy, cv=kfold)
print(results.mean())
model.fit(inputx,outputy)
filename = '../../datasets/pima-indians_classification_test.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age']
newdataframe = read_csv(filename, names=names)
array = newdataframe.values
inputx = array[:,0:8]
print(inputx)
results = model.predict(inputx)
print(model.predict(inputx))
for val in results:
    if val == 0:
        print("diabetes not probable",end="   ")
    else:
        print("probability of getting diabetes",end="   ")
print()
#####KNN

#X=np.vstack((ca3,cb3))
X=c5.drop('Class',axis=1)

#x_n4=X[0:int(len(X)/4)]
#X=x_n4
#label=np.zeros(len(ca3)+len(cb3))
#label[0:200]=1
#label[200:len(label)]=2
label=c5['Class']
y=label
# fit an Extra Trees model to the data
model = ExtraTreesClassifier()
model.fit(X_res, y_res)
# display the relative importance of each attribute
print(model.feature_importances_)
a=model.feature_importances_


yy=pd.DataFrame(label)
yn=(yy == 0).astype(int).sum()
yp=(yy == 1).astype(int).sum()



## SMOTE
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE