Example 1
class LexicaseForestClassifier(BaseEstimator, ClassifierMixin):

    def __init__(self, initial_forest_factor=5, n_estimators=10, **kwargs):
        self._initial_forest_size = n_estimators * initial_forest_factor
        self._final_forest_size = n_estimators

        rf_fit_args = copy(kwargs)
        rf_fit_args.update({'n_estimators': self._initial_forest_size})
        self._rf = RandomForestClassifier(**rf_fit_args)

    def fit(self, X, y):
        self._rf.fit(X, y)

        for t in self._rf.estimators_:
            tree_y_pred = t.predict(X)
            t._error_vector = squared_error_vector(y, tree_y_pred)

        final_estimators = []
        for i in range(self._final_forest_size):
            final_estimators.append(epsilon_lexicase_selection(self._rf.estimators_))

        self._rf.estimators_ = final_estimators
        self._rf.n_estimators = self._final_forest_size
        # TODO: Set other self._rf parameters to match correct size so that predict works.

    def predict(self, X, y=None):
        return self._rf.predict(X)
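The class above leans on two helpers that are not shown. A minimal sketch of what they could look like, assuming a per-case squared error and a median-absolute-deviation epsilon (both function bodies and the epsilon choice are assumptions, not the original definitions):

import numpy as np

def squared_error_vector(y_true, y_pred):
    # One squared error per training case.
    return (np.asarray(y_true, dtype=float) - np.asarray(y_pred, dtype=float)) ** 2

def epsilon_lexicase_selection(estimators, rng=np.random):
    # Filter the pool on training cases in random order, keeping only the
    # estimators within epsilon (here: the MAD) of the best error on each case.
    errors = np.array([est._error_vector for est in estimators])
    survivors = np.arange(len(estimators))
    for case in rng.permutation(errors.shape[1]):
        col = errors[survivors, case]
        eps = np.median(np.abs(col - np.median(col)))
        survivors = survivors[col <= col.min() + eps]
        if len(survivors) == 1:
            break
    return estimators[int(rng.choice(survivors))]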
Example 2
    def data_feature_importance(self,
                                features_list,
                                title="Feature Importance"):

        from sklearn.preprocessing import LabelEncoder
        from sklearn.impute import SimpleImputer
        from sklearn.model_selection import train_test_split

        # extract the feature columns
        clf_data = self.dataframe.loc[:, features_list]

        # Preprocess the data and fit the encoders
        cat_feats_to_use = list(clf_data.select_dtypes(include=object).columns)
        for feat in cat_feats_to_use:
            encoder = LabelEncoder()
            clf_data[feat] = encoder.fit_transform(clf_data[feat])

        # Fill in the empty values
        num_feats_to_use = list(clf_data.select_dtypes(exclude=object).columns)
        for feat in num_feats_to_use:
            imputer = SimpleImputer(strategy='median')
            clf_data[feat] = imputer.fit_transform(
                clf_data[feat].values.reshape(-1, 1))

        # Separate the target from the features
        X = clf_data.iloc[:, 1:]
        y = clf_data.iloc[:, 0]  # the target is the first column in features_list

        # Train on the received data (only the training split is used here)
        x_train, _, y_train, _ = train_test_split(X,
                                                  y,
                                                  test_size=.3,
                                                  random_state=35)

        from sklearn.preprocessing import StandardScaler

        scaler = StandardScaler()
        x_train = scaler.fit_transform(x_train)

        from sklearn.ensemble import RandomForestClassifier

        # initialise the classifier
        clf = RandomForestClassifier(n_estimators=8, random_state=34)
        clf.fit(x_train, y_train)

        # Move the data into a DataFrame so it can be plotted
        feats_imp = pd.DataFrame(clf.feature_importances_,
                                 index=X.columns,
                                 columns=['FeatureImportance'])
        feats_imp = feats_imp.sort_values('FeatureImportance', ascending=False)

        feats_imp.plot(kind='barh', figsize=(12, 6), legend=False)
        plt.title(title)
        sns.despine(left=True, bottom=True)
        plt.gca().invert_yaxis()

        plt.savefig(self.DefeaultPath + " feature importance.png", dpi=200)
        plt.cla()
        plt.clf()

        return
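A hypothetical call for the method above; the instance and column names are placeholders. The first name in features_list is taken as the target, per the comment in the code:

report.data_feature_importance(['Survived', 'Age', 'Fare', 'Sex', 'Pclass'],
                               title="Titanic Feature Importance")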
Example 3
    def test_imdb_padded_valid(self):
        num_samples = 32
        num_words = 1024
        (x_train, y_train), (x_test, y_test) = TestUtil.get_imdb(word_dictionary_size=num_words,
                                                                 num_subsamples=num_samples)

        explained_model = RandomForestClassifier(n_estimators=64, max_depth=5, random_state=1)

        counter = CountVectoriser(num_words)
        tfidf_transformer = TfidfTransformer()

        explained_model = Pipeline([('counts', counter),
                                    ('tfidf', tfidf_transformer),
                                    ('model', explained_model)])
        explained_model.fit(x_train, y_train)

        model_builder = RNNModelBuilder(embedding_size=num_words, with_embedding=True,
                                        num_layers=2, num_units=32, activation="relu", p_dropout=0.2, verbose=0,
                                        batch_size=32, learning_rate=0.001, num_epochs=2, early_stopping_patience=128)
        masking_operation = WordDropMasking()
        loss = binary_crossentropy
        explainer = CXPlain(explained_model, model_builder, masking_operation, loss)

        x_train = pad_sequences(x_train, padding="post", truncating="post", dtype=int)
        x_test = pad_sequences(x_test, padding="post", truncating="post", dtype=int, maxlen=x_train.shape[1])

        explainer.fit(x_train, y_train)
        eval_score = explainer.score(x_test, y_test)
        train_score = explainer.get_last_fit_score()
        median = explainer.predict(x_test)
        self.assertTrue(median.shape == x_test.shape)
Example 4
def plot_tree(profile, group, avg_acc, n_tree, picture):
    '''
    Pick the best random seed and build the model
    :param profile: abundance table
    :param group: grouping table
    :param avg_acc: output file with the accuracy of each random seed
    :param n_tree: number of trees in the model
    :param picture: name of the output image
    :return: None
    '''
    acc = pd.read_csv(avg_acc, sep='\t', header=0, index_col=0)
    best_state = int(acc.sort_values('avgAcc').index[-1])

    # Train and save the prediction model
    rf = RandomForestClassifier(n_estimators=n_tree, max_leaf_nodes=3,
                                random_state=best_state)
    rf.fit(profile, group['label'])
    joblib.dump(rf, 'rf.pkl')

    # Plot the resulting classification tree
    dot = picture.split('.')[0] + '.dot'
    tree_in_forest = rf.estimators_[rf.n_estimators - 1]
    export_graphviz(tree_in_forest,
                    out_file=dot,
                    feature_names=profile.columns,
                    filled=True,
                    rounded=False,
                    precision=100)

    os.system('dot -Tpng {0} -o {1}'.format(dot, picture))
Example 5
def importance():
    
    df = pd.read_csv(path.NOTEBOOKS_DATA + 'features3.csv')

    features = list(df.columns.values)
    
    target = 'm_column_result'
    features.remove('m_match_id')
    features.remove('m_column_result')
    features.remove('m_match_date')
    features.remove('m_goals_home')
    features.remove('m_goals_away')
    features.remove('a_next_match_id')
    features.remove('h_next_match_id')
    features.remove('m_favorite')
    features.remove('m_medium')
    features.remove('m_underdog')
    features.remove('h_last_match_local')
    features.remove('a_last_match_local')
    features.remove('rf1000')
    features.remove('rf1000_fs1')
    
    
    X = df[features]
    y = df[target]
    # fit an Extra Trees model to the data
    clf = RandomForestClassifier(n_estimators=1000)
    clf.fit(X, y)
    # display the relative importance of each attribute
    for feat, imp in zip(features, clf.feature_importances_):
        print(feat, imp)
Example 6
def Random_Forest(x_train, Y_train, n_estimators=10, criterion='gini', max_depth=None, min_samples_split=2,
                  min_samples_leaf=1, max_features='auto', bootstrap=True, oob_score=False,
                  random_state=0):
    # min_density was dropped from the signature: it is no longer a RandomForestClassifier
    # parameter. The remaining arguments are now actually passed through to the classifier.
    clf = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth,
                                 min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf,
                                 max_features=max_features, bootstrap=bootstrap, oob_score=oob_score,
                                 random_state=random_state)
    clf.fit(x_train, Y_train)

    return clf
Example 7
def enemy_detection_clf():

    chars = np.array(['warrior', 'warlock', 'mage', 'druid', 'rogue', 'shaman', 'paladin', 'priest', 'hunter'])
    data = []
    target = []
    for c in chars:
        p = path('images/character/new/black')
        for f in os.listdir(p+'/'+c):
            img = Image.open(p+'/'+c+'/'+f)
            w, h = img.size
            pixel = img.load()
            tmp = []
            for y in range(h):
                for x in range(w):
                    tmp.append(pixel[x, y] / 255.0)
            target.append(str(c))
            data.append(np.array(tmp))
    data = np.array(data)
    #image = data.view()
    #image.shape = (-1, 22, 30)
    #clf = svm.SVC(gamma = 0.001)
    clf = RandomForestClassifier()
    clf.fit(data, target)
    
    return clf
Example 8
def training(processed_train_csv_file):
    processed_train_samples = pd.read_csv(processed_train_csv_file)
    processed_train_samples = processed_train_samples.replace([np.inf, -np.inf], np.nan)
    processed_train_samples = processed_train_samples.fillna(value=0)
    processed_train_samples_index_lst = processed_train_samples.index.tolist()
    random.shuffle(processed_train_samples_index_lst)
    shuffled_train_samples = processed_train_samples.loc[processed_train_samples_index_lst]
    col_names = shuffled_train_samples.columns.tolist()
    col_names.remove("booking_bool")
    features = shuffled_train_samples[col_names].values
    labels = shuffled_train_samples["booking_bool"].values

    print "Training Random Forest Classifier"
    rf_classifier = RandomForestClassifier(n_estimators=150, verbose=2, learning_rate=0.1, min_samples_split=10)
    rf_classifier.fit(features, labels)
    print "Saving the Random Forest Classifier"
    data_io.save_model(rf_classifier, model_name="rf_classifier.pkl")

    print "Training Gradient Boosting Classifier"
    gb_classifier = GradientBoostingClassifier(n_estimators=150, verbose=2, learning_rate=0.1, min_samples_split=10)
    gb_classifier.fit(features, labels)
    print "Saving the Gradient Boosting Classifier"
    data_io.save_model(gb_classifier, model_name="gb_classifier.pkl")

    print "Training SGD Classifier"
    sgd_classifier = SGDClassifier(loss="modifier_huber", verbose=2, n_jobs=-1)
    sgd_classifier.fit(features, labels)
    print "Saving the SGD Classifier"
    data_io.save_model(sgd_classifier, model_name="sgd_classifier.pkl")
Example 9
def Random_Forest(x_train, Y_train, n_estimators=10, criterion='gini', max_depth=None, min_samples_split=2,
                  min_samples_leaf=1, max_features='auto', bootstrap=True, oob_score=False, n_jobs=1,
                  random_state=None, verbose=0, *args):
    # min_density and compute_importances were dropped: they are no longer
    # RandomForestClassifier parameters. The remaining arguments are passed through.
    clf = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth,
                                 min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf,
                                 max_features=max_features, bootstrap=bootstrap, oob_score=oob_score,
                                 n_jobs=n_jobs, random_state=random_state, verbose=verbose)
    clf.fit(x_train, Y_train)

    return clf
Example 10
    def test_nlp_erroneous_rnn_args_invalid(self):
        num_words = 1024
        (x_train,
         y_train), (x_test,
                    y_test) = TestUtil.get_random_variable_length_dataset(
                        max_value=num_words)

        explained_model = RandomForestClassifier(n_estimators=64,
                                                 max_depth=5,
                                                 random_state=1)

        counter = CountVectoriser(num_words)
        tfidf_transformer = TfidfTransformer()

        explained_model = Pipeline([('counts', counter),
                                    ('tfidf', tfidf_transformer),
                                    ('model', explained_model)])
        explained_model.fit(x_train, y_train)

        with self.assertRaises(ValueError):
            _ = RNNModelBuilder(
                with_embedding=True,
                verbose=0)  # Must also specify the embedding_size argument.

        model_builder = RNNModelBuilder(embedding_size=num_words,
                                        with_embedding=True,
                                        verbose=0)

        input_layer = Input(shape=(10, 2))
        with self.assertRaises(ValueError):
            model_builder.build(input_layer)

        input_layer = Input(shape=(10, 3))
        with self.assertRaises(ValueError):
            model_builder.build(input_layer)
Example 11
def stkFoldCrossValidation():

    X = pickle.load(open('X.p', 'rb'))

    X = np.array(X)

    Y = pickle.load(open('Y.p', 'rb'))

    Y = np.array(Y)

    skf = StratifiedKFold(n_splits=10)
    skf.get_n_splits(X, Y)

    k = 1
    for train_index, test_index in skf.split(X, Y):

        X_train, X_test = X[train_index], X[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]

        print(k)
        k += 1

        rf = RandomForestClassifier()

        rf.fit(X_train, Y_train)

        yp = rf.predict(X_test)
        print(classification_report(Y_test, yp, digits=6))
Example 12
class RandomForestClassifierImpl():

    def __init__(self, n_estimators=10, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight='balanced'):
        self._hyperparams = {
            'n_estimators': n_estimators,
            'criterion': criterion,
            'max_depth': max_depth,
            'min_samples_split': min_samples_split,
            'min_samples_leaf': min_samples_leaf,
            'min_weight_fraction_leaf': min_weight_fraction_leaf,
            'max_features': max_features,
            'max_leaf_nodes': max_leaf_nodes,
            'min_impurity_decrease': min_impurity_decrease,
            'min_impurity_split': min_impurity_split,
            'bootstrap': bootstrap,
            'oob_score': oob_score,
            'n_jobs': n_jobs,
            'random_state': random_state,
            'verbose': verbose,
            'warm_start': warm_start,
            'class_weight': class_weight}

    def fit(self, X, y=None):
        self._sklearn_model = SKLModel(**self._hyperparams)
        if (y is not None):
            self._sklearn_model.fit(X, y)
        else:
            self._sklearn_model.fit(X)
        return self

    def predict(self, X):
        return self._sklearn_model.predict(X)

    def predict_proba(self, X):
        return self._sklearn_model.predict_proba(X)
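A quick usage sketch for the wrapper above, with toy data made up for illustration (SKLModel is the aliased scikit-learn RandomForestClassifier the class constructs in fit):

impl = RandomForestClassifierImpl(n_estimators=100, random_state=0)
impl.fit([[0, 0], [1, 1], [0, 1], [1, 0]], [0, 1, 1, 0])
print(impl.predict_proba([[0.5, 0.5]]))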
Example 13
def RF_Features_Importance(X, Y, outputfile="RF.csv"):
    forest = RandomForestClassifier(n_estimators=300)
    forest.fit(X, Y)
    importances = forest.feature_importances_.tolist()
    # `header` (the list of feature names) is assumed to be defined at module level
    df = pd.DataFrame(list(zip(header, importances)),
                      columns=["Features", "Importance"])

    df.to_csv(outputfile, index=False)
Example 14
def forest(X, y, model_path):
    model = RandomForestClassifier()
    model.fit(X, y)
    expected = y
    predicted = model.predict(X)
    print(metrics.classification_report(expected, predicted))
    print(metrics.confusion_matrix(expected, predicted))
    joblib.dump(model, model_path)
Example 15
def rforest_classify(X, Y):
    # clf = RandomForestClassifier(criterion='gini', max_features=7, n_estimators=100, n_jobs=3, min_samples_leaf=5)

    # min_density was dropped: it is no longer a RandomForestClassifier parameter
    clf = RandomForestClassifier(n_estimators=500, criterion='entropy', max_depth=None,
                                 min_samples_split=2, min_samples_leaf=1, max_features='auto',
                                 bootstrap=False, oob_score=False, n_jobs=-1)
    clf.fit(X, Y)
    return clf
Example 16
def calc_score(test, train):
    test_f, test_l = split_data_label(test)
    train_f, train_l = split_data_label(train)
    # Train the model and compute the accuracy

    clf = RandomForestClassifier()
    clf.fit(train_f, train_l)
    pre = clf.predict(test_f)
    return metrics.accuracy_score(test_l, pre)
Example 17
def train_rf(train_vec, train_label):
    from sklearn.ensemble import RandomForestClassifier as RFC
    # rfrclf = RFR(n_estimators=1001)
    # rfrclf.fit(train_vec, train_label)
    # print rfrclf.feature_importances_
    trfclf = RFC(n_estimators=1001)
    trfclf.fit(train_vec, train_label)
    # print rfclf.feature_importances_
    return trfclf
Example 18
def main(args):

    if args.analyse is not None:
        train_data_x, test_data_x,train_data_y, test_data_y  = process_data(args.analyse)

        RT = RandomForestClassifier(n_estimators=100)
        RT.fit(train_data_x, train_data_y)
        print(RT.score(test_data_x, test_data_y))

    return
Example 19
def random_forest_classifier(features, target):
    """
    To train the random forest classifier with features and target data
    :param features:
    :param target:
    :return: trained random forest classifier
    """
    clf = RandomForestClassifier(n_estimators=600, max_depth=50)
    clf.fit(features, target)
    return clf
Example 20
def my_digits():
    digits = _data()
    
    n_samples = len(digits.images)
    datas = digits.images.reshape((n_samples, -1))

    classifier = RandomForestClassifier()
    classifier.fit(datas, digits.target)
    
    return classifier
Example 21
    def RandomForestClassifer(self):

        '''
        Function to fit the RandomForest classifier.
        '''
        train_Array = self.titanic_train_frame.values
        self.test_Array = self.titanic_test_frame.values
        randomForest = RandomForestClassifier(n_estimators=100, n_jobs=-1)
        randomForest.fit(train_Array[:, 1:], train_Array[:, 0])
        self.predicted_probability = randomForest.predict(self.test_Array)
        self.predicted_probability_list = self.predicted_probability.tolist()
Example 22
# `abstract_classifier` is assumed to be the base class from the surrounding project
class RFClassifier(abstract_classifier):

    def __init__(self, train_features, train_labels, num_of_trees):
        self.train_features = train_features
        self.train_labels = train_labels
        self.rf_member = RandomForestClassifier(num_of_trees)

    def train(self):
        self.rf_member.fit(self.train_features, self.train_labels)

    def classify(self, newVector):
        return self.rf_member.predict(newVector)
Example 23
def do_training(processed_train_csv_file):
    ## Processed train samples reading
    # read saved processed train samples from the given csv file
    processed_train_samples = pd.read_csv(processed_train_csv_file)

    # inf to nan
    processed_train_samples = processed_train_samples.replace([np.inf, -np.inf], np.nan)
    # nan to 0
    processed_train_samples = processed_train_samples.fillna(value=0)

    processed_train_samples_index_lst = processed_train_samples.index.tolist()
    # the data was sorted earlier; shuffling here gives better results
    random.shuffle(processed_train_samples_index_lst)

    # organize new train samples and targets
    shuffled_train_samples = processed_train_samples.loc[processed_train_samples_index_lst]
    col_names = shuffled_train_samples.columns.tolist()
    col_names.remove("booking_bool")
    features = shuffled_train_samples[col_names].values
    labels = shuffled_train_samples['booking_bool'].values

    # Model training
    # 1 Random Forest Classifier

    print("Training Random Forest Classifier")
    rf_classifier = RandomForestClassifier(n_estimators=150,
                                           verbose=2,
                                           n_jobs=-1,
                                           min_samples_split=10)
    rf_classifier.fit(features, labels)

    print("Saving the Random Forest Classifier")
    data_io.save_model(rf_classifier, model_name='rf_classifier.pkl')

    # 2 Gradient Boosting Classifier
    print("Gradient Boosting  Classifier")
    gb_classifier = GradientBoostingClassifier(n_estimators=150,
                                               verbose=2,
                                               learning_rate=0.1,
                                               min_samples_split=10)
    gb_classifier.fit(features, labels)
    print("Saving the Gradient Boosting  Classifier")
    data_io.save_model(gb_classifier, model_name='gb_classifier.pkl')

    # 3 SGD Classifier
    print("SGD Classifier")
    sgd_classifier = SGDClassifier(loss="modified_huber", verbose=2,
                                   n_jobs=-1)
    sgd_classifier.fit(features, labels)

    print("saved the SGD Classifier")
    data_io.save_model(sgd_classifier, model_name='sgd_classifier.pkl')
Example 24
def evalOne(enabledColumns):
    features = [all_features[i] for i in range(0, len(all_features)) if enabledColumns[i]]
    Y = []
    P = []
    for group in range(0,5):
    #     print("Test group " + str(group + 1))
        trainStationList = []
        testStationList = []
        for i in range(0,5):
            if i == group:
                testStationList.extend(groups[i])
            else:
                trainStationList.extend(groups[i])
        trainStations = set(float(station) for station in trainStationList)
        # reorder train stations
    #     print("\ttrainStationList:" + str(trainStationList))
        trainStationList = [s for s in all_stations if float(s) in trainStations]
    #     print("\ttrainStationList:" + str(trainStationList))
        testStations = set(float(station) for station in testStationList)
    #     print("\ttestStationList:" + str(testStationList))
        trainX, testX, trainY, testY, trainLocation, testLocation = splitDataForXValidationWithLocation(trainStations, testStations, "location", data, features, "target")
     
        train_lower = [float(trainStationList[i]) for i in range(0, len(trainStationList)) if i < (len(trainStationList) / 2.0)]
#         train_upper = [float(trainStationList[i]) for i in range(0, len(trainStationList)) if i >= (len(trainStationList) / 2.0)]
         
        test_lower = [float(testStationList[i]) for i in range(0, len(testStationList)) if i < (len(testStationList) / 2.0)]
#         test_upper = [float(testStationList[i]) for i in range(0, len(testStationList)) if i >= (len(testStationList) / 2.0)]
         
        trainY = []
        for l in trainLocation:
            if l in train_lower:
                trainY.append(0)
            else:
                trainY.append(1)
         
        testY = []
        for l in testLocation:
            if l in test_lower:
                testY.append(0)
            else:
                testY.append(1)
         
        model = RandomForestClassifier(random_state=42, n_estimators=20, max_depth=9, n_jobs=-1)
        model.fit(trainX, trainY)
        predY = model.predict(testX)
         
        Y.extend(testY)
        P.extend(predY)
     
    f1 = f1_score(Y, P)
    accuracy = accuracy_score(Y, P)
    return f1, accuracy
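A hypothetical driver for evalOne; it relies on module-level globals (all_features, groups, all_stations, data) that the snippet assumes are defined elsewhere:

f1, accuracy = evalOne([True] * len(all_features))
print("F1: {0:.3f}, accuracy: {1:.3f}".format(f1, accuracy))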
Example 25
def RF(pth):
    train_desc = np.load(pth + '/training_features.npy')
    nbr_occurences = np.sum((train_desc > 0) * 1, axis=0)
    # image_paths, train_labels, img_classes and test() are assumed to be defined at module level
    idf = np.array(np.log((1.0 * len(image_paths) + 1) / (1.0 * nbr_occurences + 1)), 'float32')

    # Scaling the words
    stdSlr = StandardScaler().fit(train_desc)
    train_desc = stdSlr.transform(train_desc)
    modelRF = RandomForestClassifier(n_estimators=10,
                                     max_depth=5, max_features=1, random_state=0)
    modelRF.fit(train_desc, np.array(train_labels))
    joblib.dump((modelRF, img_classes, stdSlr), pth + "/rf-bof.pkl", compress=3)
    test(pth, "rf-")
Example 26
def train_classifier(vocal_frames, non_vocal_frames):

    frames = np.append(vocal_frames, non_vocal_frames, axis=0)

    labels_vocal = np.ones(vocal_frames.shape[0])
    labels_non_vocal = np.zeros(non_vocal_frames.shape[0])

    labels = np.append(labels_vocal, labels_non_vocal, axis=0)

    rfc = RandomForestClassifier(n_estimators=100, max_depth=None)
    rfc.fit(frames, labels)

    return rfc
Example 27
class Model(BaseModel):
    """Antares implementation of scikit learn random forest classifier

    """
    def __init__(self,
                 categorical_features=None,
                 n_estimators=50,
                 n_jobs=-1,
                 max_depth=10):
        '''
        Example:
            >>> from madmex.modeling.supervised.rf import Model
            >>> rf = Model()
            >>> # Write model to db
            >>> rf.to_db(name='test_model', recipe='mexmad', training_set='no')
            >>> # Read model from db
            >>> rf2 = Model.from_db('test_model')
        '''
        super().__init__(categorical_features=categorical_features)
        self.model = RandomForestClassifier(n_estimators=n_estimators,
                                            n_jobs=n_jobs,
                                            max_depth=max_depth)
        self.model_name = 'rf'

    def fit(self, X, y):
        X = self.hot_encode_training(X)
        self.model.fit(X, y)

    def predict(self, X):
        '''
        Simply passes down the prediction from the underlying model.
        '''
        X = self.hot_encode_predict(X)
        return self.model.predict(X)

    def predict_confidence(self, X):
        """Get confidence of every prediction
        """
        X = self.hot_encode_predict(X)
        return self.model.predict_proba(X).max(axis=1)

    def score(self, X, y):
        '''
        Test the model given a dataset and a target vector.

        This method applies the model that this object represents to the given dataset using
        the response variable y. It is a measure of the accuracy of the trained model. Usually
        the original dataset should be split into training and testing subsets to cross validate
        the model.
        '''
        return self.model.score(X, y)
Example 28
def train_model(X_train,y_train):
    print("training the model ...")
    
    # create sets for probability calibration
    X_train_train, X_prob_cal, y_train_train, y_prob_cal = train_test_split(X_train,
                                                            y_train,
                                                            test_size=0.2)

    
    rf = RandomForestClassifier(
               max_features="auto",
               n_estimators=2000,
               max_depth=8,
               n_jobs=-1,
               class_weight = 'balanced',
               verbose=1)
    rf.fit(X_train_train,y_train_train)
    
    # feature importances
   
#    feature_importance = False
#    if(feature_importance):
#        
#        importances = rf.feature_importances_
#        std = np.std([tree.feature_importances_ for tree in rf.estimators_],
#                 axis=0)
#        indices = np.argsort(importances)[::-1]
#        col_names = df.drop('bin',axis=1).columns.values
#        print("Feature ranking:")
#        
#        for f in range(X_train_train.shape[1]):
#            print("%d. %s (%f)" % (f + 1, col_names[indices[f]], importances[indices[f]]))
#        
#        # Plot the feature importances of the forest
#        plt.figure()
#        plt.title("Feature importances")
#        plt.bar(range(X_train_train.shape[1]), importances[indices],
#               color="r", yerr=std[indices], align="center")
#        plt.xticks(range(X_train_train.shape[1]), col_names[indices],rotation = 50)
#        plt.xlim([-1, X_train_train.shape[1]])
#        plt.show()
        
    
    # Probability calibration
    sig_clf = CalibratedClassifierCV(rf, method="sigmoid", cv="prefit")
    sig_clf.fit(X_prob_cal, y_prob_cal)
    y_pred_train = sig_clf.predict_proba(X_train)
    
    
    print(".. training log_loss  : {:0.2f} %".format(log_loss(y_train,y_pred_train)*100))
    return sig_clf
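A usage sketch for the calibrated classifier returned above (X_train, y_train and X_test are placeholders):

sig_clf = train_model(X_train, y_train)
calibrated_proba = sig_clf.predict_proba(X_test)  # probabilities after sigmoid calibration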
Example 29
def classic_model(image_dir, image_lists, method):

    X, y = get_X_y(image_dir, image_lists, ['training', 'validation'], method)
    classifier = RandomForestClassifier(n_estimators=1000, n_jobs=4)
    classifier.fit(X, y)

    X_test, y_test = get_X_y(image_dir, image_lists, ['testing'], method)
    predictions = classifier.predict(X_test)
    confusion = pandas.crosstab(y_test,
                                predictions,
                                rownames=['Actual Class'],
                                colnames=['Predicted Class'])
    print(confusion)
    return accuracy_score(y_test, predictions)
Example 30
def try_model(train):
    print(train.shape)
    features = ["phone_brand", "device_model",  "event_count", "action_radius_max", "medianTime", "minTime", "maxTime", "weekday", "appcounts1"]
    encoder = LabelEncoder()
    train["group"] = encoder.fit_transform(train["group"].values)
    
    rf = RandomForestClassifier(n_estimators=50, max_depth=15, max_features=6, bootstrap=True, n_jobs=4, random_state=2016, class_weight=None)
    
    rf.fit(train[features].values, train["group"].values)
    feature_importance(rf, features)
    
    # StratifiedKFold's old (y, n_folds) constructor and the "log_loss" scorer
    # were replaced by n_splits and "neg_log_loss" in later scikit-learn releases
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2016)
    scores = cross_val_score(rf, train[features].values, train["group"].values, scoring="neg_log_loss", cv=skf, n_jobs=1)
    print(scores)
    print("RF Score: %0.5f" % (-scores.mean()))  # RF Score: 2.39884
Example 31
def just_pred(x, y):
    xlen = len(x)
    idx = np.random.permutation(xlen)  # the original shuffled an index list but never used it
    trainpct = 0.7
    trainlen = int(trainpct * xlen)
    xtrain = x.iloc[idx[:trainlen], :]
    ytrain = y.iloc[idx[:trainlen]]
    xtest = x.iloc[idx[trainlen:], :]
    ytest = y.iloc[idx[trainlen:]]
    rf = RandomForestClassifier()
    rf.fit(xtrain, ytrain)
    ypred = rf.predict(xtest)
    return ytest, ypred
Example 32
def just_pred(x, y):
    xlen = len(x)
    idx = np.random.permutation(xlen)  # the original shuffled an index list but never used it
    trainpct = 0.7
    trainlen = int(trainpct * xlen)
    xtrain = x.iloc[idx[:trainlen], :]
    ytrain = y.iloc[idx[:trainlen]]
    xtest = x.iloc[idx[trainlen:], :]
    ytest = y.iloc[idx[trainlen:]]
    rf = RandomForestClassifier()
    rf.fit(xtrain, ytrain)
    ypred = rf.predict(xtest)
    return ytest, ypred
Example 33
def train_model(X_train, y_train):
    print("training the model ...")
    rf = RandomForestClassifier(n_estimators=1000,
                                max_depth=8,
                                n_jobs=-1,
                                verbose=1)
    #    rf = svm.SVC(kernel='rbf', gamma=0.7, C=1.0,probability=True)

    rf.fit(X_train, y_train)
    y_pred_train = rf.predict_proba(X_train)

    # column 1 of predict_proba is P(class 1), which matches pos_label=1
    fpr, tpr, thresholds = roc_curve(y_train, y_pred_train[:, 1], pos_label=1)
    print("AUC on train : {:.02f} %".format(auc(fpr, tpr) * 100))

    return rf
Example 34
def random_forest(profile, group, n_tree, search_number, avg_acc):
    '''
    Build a model from the abundance table
    :param profile: abundance table
    :param group: grouping table
    :param n_tree: number of trees in the model
    :param search_number: number of random seeds to try
    :param avg_acc: output file with the accuracy of each random seed
    :return: group with the label column added
    '''
    real_label = set(group.iloc[:, 0])
    label_dict = {}
    for i, j in enumerate(real_label):
        label_dict[j] = i
    label = []
    for sample in group.index:
        label.append(label_dict[group.loc[sample].values[0]])

    group['label'] = label

    n = 0
    with open(avg_acc, 'w') as f:
        f.write('random_state\tavgAcc\n')
        while n < search_number:
            print('Current iteration: {0}'.format(n + 1))
            # random random_state
            random_state = round(random() * 10000)

            rf = RandomForestClassifier(n_estimators=n_tree, max_leaf_nodes=3,
                                        random_state=random_state)

            acc = []
            for i in range(10):
                sample_train = list(profile.sample(n=30).index)
                sample_val = list(set(profile.index).difference(sample_train))
                train = profile.loc[sample_train]
                val = profile.loc[sample_val]
                label_train = group['label'].loc[sample_train]

                rf.fit(train, label_train)
                pre = rf.predict(val)

                acc.append(metrics.accuracy_score(y_true=group['label'][sample_val], y_pred=pre))

            # print('{0}\t{1}\n'.format(random_state, sum(acc) / 10))
            f.write('{0}\t{1}\n'.format(random_state, sum(acc) / 10))
            n += 1
    return group
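A hypothetical end-to-end run chaining random_forest with plot_tree from Example 4 (the file names are placeholders):

profile = pd.read_csv('profile.tsv', sep='\t', header=0, index_col=0)
group = pd.read_csv('group.tsv', sep='\t', header=0, index_col=0)
group = random_forest(profile, group, n_tree=500, search_number=100, avg_acc='avg_acc.tsv')
plot_tree(profile, group, 'avg_acc.tsv', n_tree=500, picture='best_tree.png')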
Example 35
def RandomForestSelector(A, y, n_estimators, n_features):
    columns = A.columns.values
    rf = RandomForestClassifier(n_estimators=n_estimators, verbose=0, n_jobs=-1, max_depth=9, random_state=2019)
    rf.fit(A, y)
    importance = rf.feature_importances_
    importance_index = np.argsort(importance)[::-1][:n_features]
    importance_columns = columns[importance_index]
    importance_values = importance[importance_index]
    # print(importance_columns, "\nafter selection:", len(importance_columns))
    # importance_dataFrame = pd.DataFrame({
    #         "feature": importance_columns,
    #         "value": importance_values
    # })
    # print(importance_dataFrame)
    A = A[importance_columns]
    return A, importance_columns
Example 36
class MyRfClassifier(BaseClassifier):
    def __init__(self, n_estimators, max_depth, min_samples_leaf):
        self.classifier = RandomForestClassifier(
            **{
                'verbose': 1,
                'n_estimators': n_estimators,
                'max_depth': max_depth,
                'min_samples_leaf': min_samples_leaf,
                'n_jobs': 40
            })
        self.name = "rf_n{n}_md{md}_ms{ms}".format(**{
            "n": n_estimators,
            "md": max_depth,
            "ms": min_samples_leaf
        })

    def get_name(self):
        return self.name

    def fit(self, X, y, X_t, y_t):
        return self.classifier.fit(X, y)

    def predict_proba(self, X):
        return self.classifier.predict_proba(X)

    def get_feature_importances(self, feat_names):
        ipts = dict(zip(feat_names, self.classifier.feature_importances_))
        return ipts
Example 37
def crossval(x, y, k=5):
    for fold in range(k):
        # reshuffle the rows each fold; the original shuffled an index list but never used it
        idx = np.random.permutation(len(x))
        trainpct = 0.7
        trainlen = int(trainpct * len(x))
        xtrain = x.iloc[idx[:trainlen], :]
        ytrain = y.iloc[idx[:trainlen]]
        xtest = x.iloc[idx[trainlen:], :]
        ytest = y.iloc[idx[trainlen:]]
        rf = RandomForestClassifier()
        rf.fit(xtrain, ytrain)
        ypred = rf.predict(xtest)
        print(ypred)
Example 38
def crossval(x, y, k=5):
    for fold in range(k):
        # reshuffle the rows each fold; the original shuffled an index list but never used it
        idx = np.random.permutation(len(x))
        trainpct = 0.7
        trainlen = int(trainpct * len(x))
        xtrain = x.iloc[idx[:trainlen], :]
        ytrain = y.iloc[idx[:trainlen]]
        xtest = x.iloc[idx[trainlen:], :]
        ytest = y.iloc[idx[trainlen:]]
        rf = RandomForestClassifier()
        rf.fit(xtrain, ytrain)
        ypred = rf.predict(xtest)
        print(ypred)
Example 39
    def test_RandomForest(self):
        X = [[0, 1], [1, 1]]
        Y = [0, 1]

        clf = RandomForestClassifier(n_estimators=10)
        clf = clf.fit(X, Y)
        clf.predict_proba(X)
Example 40
def decision_first():

    data = datasets.load_iris()
    x = data["data"]
    y = data["target"]

    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25)
    des = DecisionTreeClassifier(max_leaf_nodes=3)
    des.fit(X_train, y_train)
    print(des.predict(X_test))
    print(des.score(X_test, y_test))

    rom = RandomForestClassifier()
    rom.fit(X_train, y_train)
    print(rom.predict(X_test))
    print(rom.score(X_test, y_test))
Example 41
def mymap(data, N):
    data = cPickle.loads(str(data))
    x = data[:, :-1]
    y = data[:, -1]
    model = RandomForestClassifier(n_estimators=N, max_depth=6)
    model = model.fit(x, y)
    return cPickle.dumps(model)
Example 42
def forestPredict(columnName, features, trees):
    
    pd.options.mode.chained_assignment = None
    
    df = pd.read_csv(path.NOTEBOOKS_DATA + 'features2.csv')
    df['pred'] = ""
    #df = df.set_index([df.m_championship_id,df.m_match_group_num])
    df = df.set_index([df.m_match_id])
    
    for champId in range(1,91):
        
        champ = df[(df.m_championship_id == champId)]       
        print(champId)
        
        if (champId < 11 or champId > 20):
            
            if (len(champ) == 380):
                rd = 38
            elif (len(champ) == 306):
                rd = 34
            else:
                rd = 30
                 
            for mid in range(2,rd+1):
                
                train = champ[champ.m_match_group_num < mid]
                test = champ[champ.m_match_group_num == mid]
                  
                target = 'm_column_result'
                  
                X = train[features]
                y = train[target]                  
                Z = test[features]
                  
                clf = RandomForestClassifier(n_estimators=trees, max_features=None)
                clf.fit(X, y)

                pred = clf.predict(Z)

                for i, p in zip(Z.index, pred):
                    # set_value was removed from pandas; .at does scalar assignment
                    df.at[i, 'pred'] = p
             
    nameFile = 'pred_' + columnName + ".csv"

    df.to_csv(path.NOTEBOOKS_DATA + nameFile, index=False)
Example 43
def model_pred(trainX,trainY,testX,model_type):
    if model_type == "rf":
        clf = RandomForestClassifier(n_estimators = 500,n_jobs = 20)
        clf.fit(trainX,trainY)
        pred = clf.predict(testX)
    if model_type == "gbdt":
        clf = GradientBoostingClassifier(n_estimators=6,learning_rate=0.9,random_state=0)
        clf.fit(trainX,trainY)
        pred = clf.predict(testX)
    if model_type == "fusion":
        prob = np.zeros(len(testX))
        params = [100,200,300,400,500]
        for param in params:
            clf = RandomForestClassifier(n_estimators = param,n_jobs = 20,bootstrap=True)
            clf.fit(trainX,trainY)
            prob += clf.predict(testX)
        '''
        params = [1,2,3,4,5,6,7,8,9,10]
        for param in params:
            clf = GradientBoostingClassifier(n_estimators=param,learning_rate=0.9,random_state=0)
            clf.fit(trainX,trainY)
            prob += clf.predict(testX)
        '''
        pred = list(prob >= 3)
    print "the pos rate is:",float(sum(pred))/len(pred)
    return pred
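A hypothetical call using the fusion branch above: five forests of increasing size vote, and a sample is predicted positive when at least 3 of the 5 agree.

pred = model_pred(trainX, trainY, testX, model_type="fusion")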
Example 44
    def initDecTrees(self, path):
        for filename in os.listdir(path):
            if filename == 'train.csv':
                with open(os.path.join(path, filename)) as infile:
                    f = csv.reader(infile)
                    aux = next(f)  # skip the header
                    x = []
                    y = []
                    for line in f:
                        if len(line) > 1:
                            if self.option == 1:
                                data = [converter(line[2]), converter(line[3]), converter(line[4]), converter(line[7]), converter(line[9])]
                                y.append(converter(line[6]))
                                x.append(data)
                            elif self.option == 2:
                                auxDeputy = fetchDeputyParty(line[2])
                                data = [converter(line[2]), converter(line[3]), converter(line[4]), converter(line[7]), converter(line[9]), encodeParty(auxDeputy['party']), encodeState(auxDeputy['state'])]
                                y.append(converter(line[6]))
                                x.append(data)
                clf = RandomForestClassifier(n_estimators=5)
                clf.fit(x, y)
                return clf
Example 45
class MyRfClassifier(BaseClassifier):
    def __init__(self, n_estimators, max_depth, min_samples_leaf):
        self.classifier = RandomForestClassifier(**{'verbose':1, 'n_estimators': n_estimators,
                                                    'max_depth':max_depth,'min_samples_leaf':min_samples_leaf,
                                                    'n_jobs':40})
        self.name = "rf_n{n}_md{md}_ms{ms}".format(
            **{"n": n_estimators, "md": max_depth, "ms": min_samples_leaf}
        )
    def get_name(self):
        return self.name

    def fit(self, X, y, X_t, y_t):
        return self.classifier.fit(X, y)

    def predict_proba(self, X):
        return self.classifier.predict_proba(X)

    def get_feature_importances(self, feat_names):
        ipts = dict(zip(feat_names, self.classifier.feature_importances_))
        return ipts
Example 46
class MyRandomForestClassifier(BaseClassifier):
    def __init__(self, verbose=1, n_estimators = 2000, max_depth=8, min_samples_leaf=10000,
                 n_jobs=25):
        self.classifier = RandomForestClassifier( **{'verbose': verbose,
                                                     'n_estimators': n_estimators,
                                                     'max_depth': max_depth, 'min_samples_leaf': min_samples_leaf,
                                                      'n_jobs': n_jobs})
        self.name = "rf_n{n}_md{md}_ms{ms}".format(
            **{"n": n_estimators, "md": max_depth, "ms": min_samples_leaf}
        )

    def get_name(self):
        return self.name

    def fit(self, X, y):
        return self.classifier.fit(X, y)

    def predict_proba(self, X):
        return self.classifier.predict_proba(X)

    def get_feature_importances(self, feat_names):
        return dict(zip(feat_names, self.classifier.feature_importances_))
Example 47
def drawfeature(train_data_path='./train', train_filename='train_cleaned',test_data_path='./test', test_filename='test_cleaned'):
    train_file = os.path.join(train_data_path, train_filename)
    train_data = pd.read_csv(train_file)
    n_train_data = train_data['text'].size

    test_file = os.path.join(test_data_path,test_filename)
    test_data = pd.read_csv(test_file)
    n_test_data = test_data['text'].size

    vectorizer = CountVectorizer(analyzer="word",tokenizer=None, preprocessor=None, stop_words=None, max_features=2000)
    transformer = TfidfTransformer()

    train_data_words = []
    for i in range(n_train_data):
        train_data_words.append(words_to_features(train_data['text'][i]))
    train_data_features = vectorizer.fit_transform(train_data_words)
    train_data_features = train_data_features.toarray()
    train_data_features = transformer.fit_transform(train_data_features)
    train_data_features = train_data_features.toarray()
    train_data_pd = pd.DataFrame(train_data_features)
    train_data_pd.to_csv("trainfeature.csv", index=None, header=True)


    test_data_words = []
    for i in range(n_test_data):
        test_data_words.append(words_to_features(test_data['text'][i]))
    # transform (not fit_transform) keeps the vocabulary and idf learned on the training set
    test_data_features = vectorizer.transform(test_data_words)
    test_data_features = test_data_features.toarray()
    test_data_features = transformer.transform(test_data_features)
    test_data_features = test_data_features.toarray()
    test_data_pd = pd.DataFrame(test_data_features)
    test_data_pd.to_csv("testfeature.csv", index=None, header=True)

    forest = RandomForestClassifier(n_estimators=60)
    forest = forest.fit(train_data_features, train_data['lable'])
    pred = forest.predict(test_data_features)
    pred = pd.Series(pred, name='Target')
    pred.to_csv("bow_tfidf_RF.csv", index=None, header=True)
    # clf is assumed to be an already-constructed classifier (XGBoost, per the comment below)
    scores = []
    total_pred = np.array([])
    total_test = np.array([])
    
    for year in [2007, 2009, 2011, 2013]:

        X_train, X_test, y_train, y_test = year_train_test_split(
            train_for_loo,
            'WnvPresent_DateTrapSpecies',
            year)      

        X_train.to_csv("data_per_year/" + str(year) + "X_train.csv", index=False)
        X_test.to_csv("data_per_year/" + str(year) + "X_test.csv", index=False)
        y_train.to_csv("data_per_year/" + str(year) + "y_train.csv", index=False)
        y_test.to_csv("data_per_year/" + str(year) + "y_test.csv", index=False)

        
        clf.fit(X_train, y_train)

        # y_pred = clf.predict_proba(X_test) [:, 1] # Random Forest
        y_pred = clf.predict_proba(X_test) # For XGB
        
        score = metrics.roc_auc_score(y_test, y_pred)
        scores.append(score)
        
        #import operator
        #feat_importances = dict(zip(X_train.columns, clf.feature_importances_))
        #sorted_feat_importances = sorted(feat_importances.items(), key=operator.itemgetter(1))
        #print(sorted_feat_importances)
        
        total_pred = np.concatenate((total_pred, y_pred))
        total_test = np.concatenate((total_test, y_test))
        
#Check if there is linear correlation between pixel<x> columns and label
#If yes, we should dive into the columns with correlation. Linear / logistic regression may work well with the data.
#In this case, makes sense that there is no correlation - higher pixel values does not mean that label value will be higher
#print "Correlation:", train.corr()["label"]

#Check that the algorithm used gives good accuracy by using part of the training set to validate
train_train, train_test=train_test_split(train, test_size=0.3)

#Train model
model = RandomForestClassifier(n_estimators=100, oob_score=True, random_state=10, max_features="auto", min_samples_leaf=20)
#model=KNeighborsClassifier(n_neighbors=6)


#if getting this error, it is because a matrix with 1 column
#is being passed in when a 1d array is expected. ravel() will work.
#DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). if name == 'main':
#To resolve this error, convert label values to int or str as float is not a valid label-type
#raise ValueError("Unknown label type: %r" % y) ValueError: Unknown label type: array
#model.fit(train_train.ix[:,'pixel0':'pixel783'], np.asarray(train_train.ix[:,'label'].astype(int)))
#print "model.score:", model.score(train_test.ix[:,'pixel0':'pixel783'], np.asarray(train_test.ix[:,'label'].astype(int)))
#print "cross validation score:", cross_validation.cross_val_score(model, train_train.ix[:,'pixel0':'pixel783'], train_train.ix[:,'label'], cv=3)
model.fit(train_train.loc[:, 'pixel0':'pixel783'], train_train['label'].values.ravel())
print("model.score", model.score(train_test.loc[:, 'pixel0':'pixel783'], train_test['label'].values.ravel()))


#Predict output
#predicted=model.predict(train_test.ix[:,'pixel0':'pixel783'])
#print predicted
#print "Accuracy: ", accuracy_score(train_test.ix[:,'label'].astype(int), predicted)
### the training data (features_train, labels_train) have both "fast" and "slow"
### points mixed together--separate them so we can give them different colors
### in the scatterplot and identify them visually
grade_fast = [features_train[ii][0] for ii in range(0, len(features_train)) if labels_train[ii]==0]
bumpy_fast = [features_train[ii][1] for ii in range(0, len(features_train)) if labels_train[ii]==0]
grade_slow = [features_train[ii][0] for ii in range(0, len(features_train)) if labels_train[ii]==1]
bumpy_slow = [features_train[ii][1] for ii in range(0, len(features_train)) if labels_train[ii]==1]


#### initial visualization
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.0)
plt.scatter(bumpy_fast, grade_fast, color="b", label="fast")
plt.scatter(bumpy_slow, grade_slow, color="r", label="slow")
plt.legend()
plt.xlabel("bumpiness")
plt.ylabel("grade")

### your code here!  name your classifier object clf if you want the
### visualization below to work
clf = RandomForestClassifier()
clf.fit(features_train, labels_train)
### draw the decision boundary with the test points overlaid
prettyPicture(clf, features_test, labels_test)

plt.show()
################################################################################




metrics.confusion_matrix(y_train, model2.predict(x_train))
print(metrics.classification_report(y_train, model2.predict(x_train)))
metrics.confusion_matrix(y_test, model2.predict(x_test))
print(metrics.classification_report(y_test, model2.predict(x_test)))
clf.set_params(min_samples_leaf=5)
clf.set_params(max_depth=5)
model3 = clf.fit(x_train, y_train)
metrics.confusion_matrix(y_train, model3.predict(x_train))
print(metrics.classification_report(y_train, model3.predict(x_train)))
metrics.confusion_matrix(y_test, model3.predict(x_test))
print(metrics.classification_report(y_test, model3.predict(x_test)))


#### Models
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
model = RandomForestClassifier()

# Train
clf = model.fit(x_train, y_train)

# Get accuracy scores (`data` and `target` are assumed to hold the full dataset)
scores = clf.score(data, target)
metrics.confusion_matrix(y_train, clf.predict(x_train))
print(metrics.classification_report(y_train, clf.predict(x_train)))

metrics.confusion_matrix(y_test, clf.predict(x_test))
print(metrics.classification_report(y_test, clf.predict(x_test)))

    def fit(self, X, y, sample_weight=None):
        sample_weight = normalize_weight(y, sample_weight, sig_weight=self.sig_weight, pow_sig=self.pow_sig,
                                         pow_bg=self.pow_bg)
        return RandomForestClassifier.fit(self, X, y, sample_weight=sample_weight)
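This fit override only makes sense inside a RandomForestClassifier subclass. A minimal sketch of the surrounding pieces, with normalize_weight as a stand-in that balances the signal and background weight (the original implementation is not shown; pow_sig and pow_bg are accepted but ignored here):

import numpy as np
from sklearn.ensemble import RandomForestClassifier

def normalize_weight(y, sample_weight, sig_weight=1.0, pow_sig=1.0, pow_bg=1.0):
    # Stand-in: rescale so signal (y == 1) and background carry equal total weight.
    y = np.asarray(y)
    w = np.ones(len(y)) if sample_weight is None else np.asarray(sample_weight, dtype=float).copy()
    w[y == 1] *= sig_weight * len(y) / (2.0 * w[y == 1].sum())
    w[y == 0] *= len(y) / (2.0 * w[y == 0].sum())
    return w

class ReweightedForest(RandomForestClassifier):
    sig_weight = 1.0
    pow_sig = 1.0
    pow_bg = 1.0

    def fit(self, X, y, sample_weight=None):
        sample_weight = normalize_weight(y, sample_weight, sig_weight=self.sig_weight,
                                         pow_sig=self.pow_sig, pow_bg=self.pow_bg)
        return RandomForestClassifier.fit(self, X, y, sample_weight=sample_weight)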
Example 53
clf = AdaBoostClassifier(DecisionTreeClassifier(criterion='gini',
                                                max_depth=6, 
                                                min_samples_leaf=3),
                         n_estimators = 200,
                         learning_rate = 0.1)
####################
clf = neighbors.KNeighborsClassifier(100, weights='uniform')
clf = neighbors.KNeighborsClassifier(100, weights='distance')
####################
clf = GaussianNB()
##############################
t0 = time()
param_grid = {'C': [150, 500, 750, 1000],
              'gamma': [ 0.0005, 0.001, 0.05, .01], }
# class_weight='auto' was renamed to 'balanced' in scikit-learn
clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)
clf = clf.fit(X_train, y_train)
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
print(clf.best_estimator_)

clf = SVC(C=750, kernel='rbf', class_weight='balanced', gamma=0.001, probability=True)

########################
scores = cross_val_score(clf, data_PCA, dat_clean.genre, cv=10)
print(scores)
print(scores.mean())

predicted = cross_val_predict(clf, data_PCA, dat_clean.genre, cv=10)
print "Accuracy Test: {0:.3f}".format(metrics.accuracy_score(dat_clean.genre, predicted))
print
print "Classification report:"
Example 54
        'score': []
    }

    if len(sys.argv) > 1 and sys.argv[1] == '--skip-train':
        results = pd.read_csv("./data/results.csv")
    else:
        for classification_dataset in classification_dataset_names:
            print("Starting", classification_dataset)

            X, y = fetch_data(classification_dataset, return_X_y=True, local_cache_dir='./data/')
            train_X, test_X, train_y, test_y = train_test_split(X, y)

            rf = RandomForestClassifier()
            lexRF = LexicaseForestClassifier()

            rf.fit(train_X, train_y)
            lexRF.fit(train_X, train_y)

            rf_score = rf.score(test_X, test_y)
            lexRF_score = lexRF.score(test_X, test_y)

            results['problem'] = results['problem'] + ([classification_dataset] * 2)
            results['method'] = results['method'] + ['RF', 'LexRF']
            results['score'].append(rf_score)
            results['score'].append(lexRF_score)

        results = pd.DataFrame(results)
        results.to_csv("./data/results.csv", index=False)

    problems = (
        results
Example 55
print(audit_X.dtype, audit_y.dtype)

def predict_audit(classifier):
    adjusted = DataFrame(classifier.predict(audit_X), columns = ["Adjusted"])
    adjusted_proba = DataFrame(classifier.predict_proba(audit_X), columns = ["probability_0", "probability_1"])
    return pandas.concat((adjusted, adjusted_proba), axis = 1)

audit_tree = DecisionTreeClassifier(random_state = 13, min_samples_leaf = 5)
audit_tree.fit(audit_X, audit_y)

store_pkl(audit_tree, "DecisionTreeAudit.pkl")
store_csv(predict_audit(audit_tree), "DecisionTreeAudit.csv")

audit_forest = RandomForestClassifier(random_state = 13, min_samples_leaf = 5)
audit_forest.fit(audit_X, audit_y)

store_pkl(audit_forest, "RandomForestAudit.pkl")
store_csv(predict_audit(audit_forest), "RandomForestAudit.csv")

audit_regression = LogisticRegression()
audit_regression.fit(audit_X, audit_y)

store_pkl(audit_regression, "RegressionAudit.pkl")
store_csv(predict_audit(audit_regression), "RegressionAudit.csv")

#
# Multi-class classification
#

iris_df = load_csv("Iris.csv")
Example 56
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier

import autopath
from datasets import training_set, test_set
from util import convert_gray_scale, flatten


Xr,Yr = training_set
Xe,Ye = test_set

Xr = flatten(convert_gray_scale(Xr))
Xe = flatten(convert_gray_scale(Xe))

# compute_importances was removed from scikit-learn; feature_importances_ is always computed
rf = RandomForestClassifier(n_estimators=100, verbose=3, oob_score=True)
rf.fit(Xr, Yr)

Yp = rf.predict(Xe)
print(np.mean(Yp == Ye))

Ypp = rf.predict_proba(Xe).max(axis=1)

plt.figure(1)
plt.clf()
plt.hist(Ypp[Yp == Ye], 50, color='b', density=True, alpha=0.4,
         label='classified')
plt.hist(Ypp[Yp != Ye], 50, color='r', density=True, alpha=0.4,
         label='misclassified')
plt.legend(loc='upper left')
plt.draw()
plt.show()
Example 57
def runns(resp_var, size_of_test_data,dataset,positive_class,n_estimators,important_features,dealing_with_nulls):
	dataset = pd.read_csv('raw_data.csv', low_memory=False) # For testing purposes
	#----DATA PREPROCESSING
	#-------dealing with NULL values in the data
	#----------remove the rows in which the response is null
	dataset=dataset.dropna(subset=[resp_var])
	#----------dealing with nulls
	dataset=deal_with_nulls(dealing_with_nulls,dataset)
	#----FEATURE SELECTION
	#-------get predictors important in predicting the response
	#-----------transform categorical predictors to dummy variables
	predictors=dataset.drop(resp_var,axis=1,inplace=False)
	predictors=pd.get_dummies(predictors)
	#-----------balance the classes in the response var
	ros = RandomOverSampler(random_state=0)
	resp=dataset[resp_var]
	prds, resp = ros.fit_resample(predictors, resp)
	#-----------fit the random forest classifier to give us the important predictors
	rf_clf = RandomForestClassifier(n_estimators=n_estimators)
	rf_clf.fit(prds,resp)
	#-------get the important predictors
	feature_imp = pd.Series(rf_clf.feature_importances_,
                    index=list(predictors.iloc[:,0:])).sort_values(ascending=False)
	#-------names of the important predictors
	important_predictor_names = feature_imp.index[0:important_features]
	#-------subset the data to get only the important predictors and the response
	resp=pd.DataFrame(data=resp,columns=[resp_var])
	predictors=pd.DataFrame(prds,columns=list(predictors))
	dataset=pd.concat([resp,predictors],axis=1)
	#---------------------------------------------------------
	#----MODEL TRAINING
	#--------Remove the response variables from the features variables - axis 1 refers to the columns
	m_data= dataset.drop(resp_var, axis = 1,inplace=False) 
	# Response variables are the values we want to predict
	resp_var = np.array(dataset[resp_var])

	dataset = pd.get_dummies(m_data)
    
	# Saving feature names for later use
	feature_list = list(m_data.columns)
	# Convert to numpy array
	dataset = np.array(dataset)

	# Split the data into training and testing sets
	train_features, test_features, train_labels, test_labels = train_test_split(dataset, resp_var, test_size = size_of_test_data, random_state = 402)

	# Instantiate the model (an RBF-kernel SVM here, despite the random-forest setup above)
	clf = SVC(kernel='rbf',probability=True)

	# Train the model on training data
	clf.fit(train_features, train_labels)
    # evaluation
	predicted = clf.predict(test_features)
	pred_prob = clf.predict_proba(test_features)
    
	accuracy = accuracy_score(test_labels, predicted)
	#confusion matrix
	cnf = (confusion_matrix(test_labels,predicted))
	#precision score
	precision = precision_score(test_labels,predicted,pos_label=positive_class)
	#avg pres
	avg_precision = average_precision_score(test_labels,pred_prob[:,[1]])
	#recall score
	rec = recall_score(test_labels,predicted,pos_label=positive_class)
	#f1 score
	fscore = f1_score(test_labels,predicted,pos_label=positive_class)
	#fbeta score
	fbeta = fbeta_score(test_labels,predicted,beta=0.5)
	#hamming_loss
	hamming = hamming_loss(test_labels,predicted)
	#jaccard similarity score
	jaccard = jaccard_similarity_score(test_labels,predicted)
	#logloss
	logloss = log_loss(test_labels,predicted)
	#zero-oneloss
	zero_one = zero_one_loss(test_labels,predicted)
	#auc roc 
	area_under_roc = roc_auc_score(test_labels,pred_prob[:,[1]])
	#cohen_score
	cohen = cohen_kappa_score(test_labels,predicted)
	#mathews corr
	mathews = matthews_corrcoef(test_labels,predicted)
	# Variable importances from the important features selection stage
	variable_importance_list = list(zip(prds, feature_imp))
	output={"accuracy":accuracy,"precision":precision,"average precision":avg_precision,"recall":rec,"fscore":fscore,"fbeta":fbeta,"hamming":hamming,"jaccard":jaccard,"logloss":logloss,"zero_one":zero_one,"area_under_roc":area_under_roc,"cohen":cohen,"mathews":mathews}
	output=json.dumps(output)
	return jsonify({"Predictions": output})
Example 58
                    corpus_test.append(text)
                    if int(vals[0]) == 0:
                        y_test.append('0')
                    else:
                        y_test.append('1')
    
    X_train = vectorizer.fit_transform(corpus_train)

    X_test = vectorizer.transform(corpus_test)
    
    clf = RandomForestClassifier(n_estimators=10)
    #clf = KNeighborsClassifier(n_neighbors=10)
    #clf = LinearSVC()
    
    clf.fit(X_train, y_train)
    
    print(len(y_train))
    print(len(y_test))
    
    pred = clf.predict(X_test)
    
    #pred = ['0']* len(y_test)
    score = metrics.accuracy_score(y_test, pred)
    print("accuracy:   %0.3f" % score)
    total.append(score)
    
    n = 20
    
#     feature_names = vectorizer.get_feature_names()
#     coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))