Code Example #1
from copy import copy

from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.ensemble import RandomForestClassifier


class LexicaseForestClassifier(BaseEstimator, ClassifierMixin):

    def __init__(self, initial_forrest_factor=5, n_estimators=10, **kwargs):
        self._initial_forrest_size = n_estimators * initial_forrest_factor
        self._final_forrest_size = n_estimators

        rf_fit_args = copy(kwargs)
        rf_fit_args.update({'n_estimators': self._initial_forrest_size})
        self._rf = RandomForestClassifier(**rf_fit_args)

    def fit(self, X, y):
        self._rf.fit(X, y)

        for t in self._rf.estimators_:
            tree_y_pred = t.predict(X)
            t._error_vector = squared_error_vector(y, tree_y_pred)

        final_estimators = []
        for i in range(self._final_forrest_size):
            final_estimators.append(epsilon_lexicase_selection(self._rf.estimators_))

        self._rf.estimators_ = final_estimators
        self._rf.n_estimators = self._final_forrest_size
        # TODO: Set other self._rf parameters to match correct size so that predict works.
        return self

    def predict(self, X, y=None):
        return self._rf.predict(X)
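The class above leans on two helpers that are not shown: squared_error_vector and epsilon_lexicase_selection. A minimal sketch of what they could look like, with names and signatures inferred from the call sites above rather than taken from the original project:

import numpy as np

def squared_error_vector(y_true, y_pred):
    # one squared error per training case: the "error vector" attached to each tree
    return (np.asarray(y_true) - np.asarray(y_pred)) ** 2

def epsilon_lexicase_selection(estimators, rng=np.random):
    # filter the pool on one randomly ordered training case at a time, keeping
    # estimators within epsilon (median absolute deviation) of the best error
    # on that case, then pick randomly among the survivors
    pool = list(estimators)
    n_cases = len(pool[0]._error_vector)
    for case in rng.permutation(n_cases):
        errors = np.array([est._error_vector[case] for est in pool])
        epsilon = np.median(np.abs(errors - np.median(errors)))
        pool = [est for est, err in zip(pool, errors) if err <= errors.min() + epsilon]
        if len(pool) == 1:
            break
    return pool[rng.randint(len(pool))]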
Code Example #2
    def data_feature_importance(self,
                                features_list,
                                title="Feature Importance"):

        from sklearn.preprocessing import LabelEncoder
        from sklearn.impute import SimpleImputer
        from sklearn.model_selection import train_test_split

        # extract the feature columns
        clf_data = self.dataframe.loc[:, features_list]

        # preprocess the data: encode the categorical features
        cat_feats_to_use = list(clf_data.select_dtypes(include=object).columns)
        for feat in cat_feats_to_use:
            encoder = LabelEncoder()
            clf_data[feat] = encoder.fit_transform(clf_data[feat])

        # fill in the missing values
        num_feats_to_use = list(clf_data.select_dtypes(exclude=object).columns)
        for feat in num_feats_to_use:
            imputer = SimpleImputer(strategy='median')
            clf_data[feat] = imputer.fit_transform(
                clf_data[feat].values.reshape(-1, 1)).ravel()

        # separate the target from the features
        X = clf_data.iloc[:, 1:]
        y = clf_data.iloc[:, 0]  # the target was the first column included

        # split the data into train and test sets
        x_train, _, y_train, _ = train_test_split(X,
                                                  y,
                                                  test_size=.3,
                                                  random_state=35)

        from sklearn.preprocessing import StandardScaler

        scaler = StandardScaler()
        x_train = scaler.fit_transform(x_train)

        from sklearn.ensemble import RandomForestClassifier

        # initialize the classifier
        clf = RandomForestClassifier(n_estimators=8, random_state=34)
        clf.fit(x_train, y_train)

        # put the importances into a DataFrame so we can plot them
        feats_imp = pd.DataFrame(clf.feature_importances_,
                                 index=X.columns,
                                 columns=['FeatureImportance'])
        feats_imp = feats_imp.sort_values('FeatureImportance', ascending=False)

        feats_imp.plot(kind='barh', figsize=(12, 6), legend=False)
        plt.title(title)
        sns.despine(left=True, bottom=True)
        plt.gca().invert_yaxis()

        plt.savefig(self.DefeaultPath + " feature importance.png", dpi=200)
        plt.cla()
        plt.clf()

        return
Code Example #3
    def test_imdb_padded_valid(self):
        num_samples = 32
        num_words = 1024
        (x_train, y_train), (x_test, y_test) = TestUtil.get_imdb(word_dictionary_size=num_words,
                                                                 num_subsamples=num_samples)

        explained_model = RandomForestClassifier(n_estimators=64, max_depth=5, random_state=1)

        counter = CountVectoriser(num_words)
        tfidf_transformer = TfidfTransformer()

        explained_model = Pipeline([('counts', counter),
                                    ('tfidf', tfidf_transformer),
                                    ('model', explained_model)])
        explained_model.fit(x_train, y_train)

        model_builder = RNNModelBuilder(embedding_size=num_words, with_embedding=True,
                                        num_layers=2, num_units=32, activation="relu", p_dropout=0.2, verbose=0,
                                        batch_size=32, learning_rate=0.001, num_epochs=2, early_stopping_patience=128)
        masking_operation = WordDropMasking()
        loss = binary_crossentropy
        explainer = CXPlain(explained_model, model_builder, masking_operation, loss)

        x_train = pad_sequences(x_train, padding="post", truncating="post", dtype=int)
        x_test = pad_sequences(x_test, padding="post", truncating="post", dtype=int, maxlen=x_train.shape[1])

        explainer.fit(x_train, y_train)
        eval_score = explainer.score(x_test, y_test)
        train_score = explainer.get_last_fit_score()
        median = explainer.predict(x_test)
        self.assertTrue(median.shape == x_test.shape)
Code Example #4
def plot_tree(profile, group, avg_acc, n_tree, picture):
    '''
    Pick the best random seed and build the model.
    :param profile: abundance table
    :param group: group table
    :param avg_acc: output file with the accuracy for each random seed
    :param n_tree: number of trees in the model
    :param picture: name of the output image
    :return: None
    '''
    acc = pd.read_csv(avg_acc, sep='\t', header=0, index_col=0)
    best_state = int(acc.sort_values('avgAcc').index[-1])

    # train and save the prediction model
    rf = RandomForestClassifier(n_estimators=n_tree, max_leaf_nodes=3,
                                random_state=best_state)
    rf.fit(profile, group['label'])
    joblib.dump(rf, 'rf.pkl')

    # plot the resulting classification tree
    dot = picture.split('.')[0] + '.dot'
    tree_in_forest = rf.estimators_[rf.n_estimators-1]
    export_graphviz(tree_in_forest,
                    out_file=dot,
                    feature_names=profile.columns,
                    filled=True,
                    rounded=False,
                    precision=100)

    os.system('dot -Tpng {0} -o {1}'.format(dot, picture))
Code Example #5
File: dataframes.py Project: dmbrdev/doutorado
def importance():
    
    df = pd.read_csv(path.NOTEBOOKS_DATA + 'features3.csv')

    features = list(df.columns.values)

    target = 'm_column_result'
    non_feature_columns = [
        'm_match_id', 'm_column_result', 'm_match_date', 'm_goals_home',
        'm_goals_away', 'a_next_match_id', 'h_next_match_id', 'm_favorite',
        'm_medium', 'm_underdog', 'h_last_match_local', 'a_last_match_local',
        'rf1000', 'rf1000_fs1',
    ]
    for column in non_feature_columns:
        features.remove(column)

    
    X = df[features]
    y = df[target]
    # fit a random forest model to the data
    clf = RandomForestClassifier(n_estimators=1000)
    clf.fit(X, y)
    # display the relative importance of each attribute
    for feature, importance in zip(features, clf.feature_importances_):
        print(feature, importance)
Code Example #6
def Random_Forest(x_train, Y_train, n_estimators=10, criterion='gini', max_depth=None, min_samples_split=2,
                  min_samples_leaf=1, max_features='auto', bootstrap=True, oob_score=False,
                  random_state=0):
    # pass the arguments through to the classifier; previously they were accepted
    # but silently ignored (min_density was dropped: it no longer exists in sklearn)
    clf = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth,
                                 min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf,
                                 max_features=max_features, bootstrap=bootstrap, oob_score=oob_score,
                                 random_state=random_state)
    clf.fit(x_train, Y_train)

    return clf
Code Example #7
File: Ba.py Project: Emsibil/Bachelor
def enemy_detection_clf():

    chars = np.array(['warrior', 'warlock', 'mage', 'druid', 'rogue', 'shaman', 'paladin', 'priest', 'hunter'])
    data = []
    target = []
    for c in chars:
        p = path('images/character/new/black')
        for f in os.listdir(p+'/'+c):
            img = Image.open(p+'/'+c+'/'+f)
            w, h = img.size
            pixel = img.load()
            tmp = []
            for y in range(h):
                for x in range(w):
                    # np.float and np.str were removed from numpy; plain float/str work
                    tmp.append(float(pixel[x, y]) / 255)
            target.append(str(c))
            data.append(np.array(tmp))
    data = np.array(data)
    #image = data.view()
    #image.shape = (-1, 22, 30)
    #clf = svm.SVC(gamma = 0.001)
    clf = RandomForestClassifier()
    clf.fit(data, target)
    
    return clf
Code Example #8
def training(processed_train_csv_file):
    processed_train_samples = pd.read_csv(processed_train_csv_file)
    processed_train_samples = processed_train_samples.replace([np.inf, -np.inf], np.nan)
    processed_train_samples = processed_train_samples.fillna(value=0)
    processed_train_samples_index_lst = processed_train_samples.index.tolist()
    random.shuffle(processed_train_samples_index_lst)
    shuffled_train_samples = processed_train_samples.loc[processed_train_samples_index_lst]
    col_names = shuffled_train_samples.columns.tolist()
    col_names.remove("booking_bool")
    features = shuffled_train_samples[col_names].values
    labels = shuffled_train_samples["booking_bool"].values

    print "Training Random Forest Classifier"
    rf_classifier = RandomForestClassifier(n_estimators=150, verbose=2, learning_rate=0.1, min_samples_split=10)
    rf_classifier.fit(features, labels)
    print "Saving the Random Forest Classifier"
    data_io.save_model(rf_classifier, model_name="rf_classifier.pkl")

    print "Training Gradient Boosting Classifier"
    gb_classifier = GradientBoostingClassifier(n_estimators=150, verbose=2, learning_rate=0.1, min_samples_split=10)
    gb_classifier.fit(features, labels)
    print "Saving the Gradient Boosting Classifier"
    data_io.save_model(gb_classifier, model_name="gb_classifier.pkl")

    print "Training SGD Classifier"
    sgd_classifier = SGDClassifier(loss="modifier_huber", verbose=2, n_jobs=-1)
    sgd_classifier.fit(features, labels)
    print "Saving the SGD Classifier"
    data_io.save_model(sgd_classifier, model_name="sgd_classifier.pkl")
Code Example #9
File: randforest.py Project: setman85/GA_homework
def Random_Forest(x_train, Y_train, n_estimators=10, criterion='gini', max_depth=None, min_samples_split=2,
                  min_samples_leaf=1, max_features='auto', bootstrap=True, oob_score=False, n_jobs=1,
                  random_state=None, verbose=0, *args):
    # forward the arguments to the classifier; previously they were accepted but
    # silently ignored (min_density and compute_importances were dropped: both
    # were removed from scikit-learn)
    clf = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth,
                                 min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf,
                                 max_features=max_features, bootstrap=bootstrap, oob_score=oob_score,
                                 n_jobs=n_jobs, random_state=random_state, verbose=verbose)
    clf.fit(x_train, Y_train)

    return clf
Code Example #10
    def test_nlp_erroneous_rnn_args_invalid(self):
        num_words = 1024
        (x_train,
         y_train), (x_test,
                    y_test) = TestUtil.get_random_variable_length_dataset(
                        max_value=num_words)

        explained_model = RandomForestClassifier(n_estimators=64,
                                                 max_depth=5,
                                                 random_state=1)

        counter = CountVectoriser(num_words)
        tfidf_transformer = TfidfTransformer()

        explained_model = Pipeline([('counts', counter),
                                    ('tfidf', tfidf_transformer),
                                    ('model', explained_model)])
        explained_model.fit(x_train, y_train)

        with self.assertRaises(ValueError):
            _ = RNNModelBuilder(
                with_embedding=True,
                verbose=0)  # Must also specify the embedding_size argument.

        model_builder = RNNModelBuilder(embedding_size=num_words,
                                        with_embedding=True,
                                        verbose=0)

        input_layer = Input(shape=(10, 2))
        with self.assertRaises(ValueError):
            model_builder.build(input_layer)

        input_layer = Input(shape=(10, 3))
        with self.assertRaises(ValueError):
            model_builder.build(input_layer)
Code Example #11
def stkFoldCrossValidation():

    X = pickle.load(open('X.p', 'rb'))

    X = np.array(X)

    Y = pickle.load(open('Y.p', 'rb'))

    Y = np.array(Y)

    skf = StratifiedKFold(n_splits=10)
    skf.get_n_splits(X, Y)

    k = 1
    for train_index, test_index in skf.split(X, Y):

        X_train, X_test = X[train_index], X[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]

        print(k)
        k += 1

        rf = RandomForestClassifier()

        rf.fit(X_train, Y_train)

        yp = rf.predict(X_test)
        print(classification_report(Y_test, yp, digits=6))
Code Example #12
class RandomForestClassifierImpl():

    def __init__(self, n_estimators=10, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight='balanced'):
        self._hyperparams = {
            'n_estimators': n_estimators,
            'criterion': criterion,
            'max_depth': max_depth,
            'min_samples_split': min_samples_split,
            'min_samples_leaf': min_samples_leaf,
            'min_weight_fraction_leaf': min_weight_fraction_leaf,
            'max_features': max_features,
            'max_leaf_nodes': max_leaf_nodes,
            'min_impurity_decrease': min_impurity_decrease,
            'min_impurity_split': min_impurity_split,
            'bootstrap': bootstrap,
            'oob_score': oob_score,
            'n_jobs': n_jobs,
            'random_state': random_state,
            'verbose': verbose,
            'warm_start': warm_start,
            'class_weight': class_weight}

    def fit(self, X, y=None):
        self._sklearn_model = SKLModel(**self._hyperparams)
        if (y is not None):
            self._sklearn_model.fit(X, y)
        else:
            self._sklearn_model.fit(X)
        return self

    def predict(self, X):
        return self._sklearn_model.predict(X)

    def predict_proba(self, X):
        return self._sklearn_model.predict_proba(X)
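A quick usage sketch for this wrapper, assuming SKLModel is sklearn.ensemble.RandomForestClassifier imported under that alias (the hyperparameter names suggest as much); the dataset is only illustrative:

from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)
impl = RandomForestClassifierImpl(n_estimators=100, random_state=0).fit(X, y)
print(impl.predict(X[:5]))
print(impl.predict_proba(X[:5]).shape)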
Code Example #13
def RF_Features_Importance(X, Y, outputfile="RF.csv"):
    forest = RandomForestClassifier(n_estimators=300)
    forest.fit(X, Y)
    importances = forest.feature_importances_.tolist()
    # assumes X is a DataFrame; the original referenced an undefined global `header`
    df = pd.DataFrame(list(zip(X.columns, importances)),
                      columns=["Features", "Importance"])

    df.to_csv(outputfile, index=False)
Code Example #14
def forest(X, y, model_path):
    model = RandomForestClassifier()
    model.fit(X, y)
    expected = y
    predicted = model.predict(X)
    print(metrics.classification_report(expected, predicted))
    print(metrics.confusion_matrix(expected, predicted))
    joblib.dump(model, model_path)
Code Example #15
File: models.py Project: kalpanki/pp
def rforest_classify(X, Y):
    #clf = RandomForestClassifier(criterion='gini',max_features=7,n_estimators=100,n_jobs=3,min_samples_leaf=5)

    # min_density was removed from scikit-learn, so it is no longer passed here
    clf = RandomForestClassifier(n_estimators=500, criterion='entropy', max_depth=None,
                                 min_samples_split=2, min_samples_leaf=1, max_features='auto',
                                 bootstrap=False, oob_score=False, n_jobs=-1)
    clf.fit(X, Y)
    return clf
Code Example #16
def calc_score(test, train):
    test_f, test_l = split_data_label(test)
    train_f, train_l = split_data_label(train)
    # train the model and compute its accuracy

    clf = RandomForestClassifier()
    clf.fit(train_f, train_l)
    pre = clf.predict(test_f)
    return metrics.accuracy_score(test_l, pre)
Code Example #17
def train_rf(train_vec, train_label):
    from sklearn.ensemble import RandomForestClassifier as RFC
    # rfrclf = RFR(n_estimators=1001)
    # rfrclf.fit(train_vec, train_label)
    # print rfrclf.feature_importances_
    trfclf = RFC(n_estimators=1001)
    trfclf.fit(train_vec, train_label)
    # print rfclf.feature_importances_
    return trfclf
Code Example #18
File: main.py Project: rjgsousa/sentiment_analysis
def main(args):

    if args.analyse is not None:
        train_data_x, test_data_x, train_data_y, test_data_y = process_data(args.analyse)

        RT = RandomForestClassifier(n_estimators=100)
        RT.fit(train_data_x, train_data_y)
        print(RT.score(test_data_x, test_data_y))

    return
Code Example #19
File: RF.py Project: eenx15/Volvo-DataX
def random_forest_classifier(features, target):
    """
    To train the random forest classifier with features and target data
    :param features:
    :param target:
    :return: trained random forest classifier
    """
    clf = RandomForestClassifier(n_estimators=600, max_depth=50)
    clf.fit(features, target)
    return clf
Code Example #20
File: Ba.py Project: Emsibil/Bachelor
def my_digits():
    digits = _data()
    
    n_samples = len(digits.images)
    datas = digits.images.reshape((n_samples, -1))

    classifier = RandomForestClassifier()
    classifier.fit(datas, digits.target)
    
    return classifier
Code Example #21
    def RandomForestClassifer(self):

        '''
        Function to do RandomForest Classifier.
        '''
        train_Array = self.titanic_train_frame.values
        self.test_Array = self.titanic_test_frame.values
        randomForest = RandomForestClassifier(n_estimators=100, n_jobs=-1)
        randomForest.fit(train_Array[:, 1:], train_Array[:, 0])
        # note: predict() returns class labels, not probabilities, despite the name below
        self.predicted_probability = randomForest.predict(self.test_Array)
        self.predicted_probability_list = self.predicted_probability.tolist()
Code Example #22
File: randomForest.py Project: matanaor1/AIProject1
# note: the original base was written as `super.abstract_classifier`, which is not
# valid Python; assuming abstract_classifier is imported from the project
class RFClassifier(abstract_classifier):

    def __init__(self, train_features, train_labels, num_of_trees):
        self.train_features = train_features
        self.train_labels = train_labels
        self.rf_member = RandomForestClassifier(num_of_trees)

    def train(self):
        self.rf_member.fit(self.train_features, self.train_labels)

    def classify(self, newVector):
        return self.rf_member.predict(newVector)
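The abstract_classifier base class is not shown; a minimal sketch of the interface this subclass appears to implement (method names inferred from the code above, not from the original project):

from abc import ABC, abstractmethod

class abstract_classifier(ABC):
    @abstractmethod
    def train(self):
        ...

    @abstractmethod
    def classify(self, newVector):
        ...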
Code Example #23
File: training_prediction.py Project: gssgch/gssgML
def do_training(processed_train_csv_file):
    ## Processed train samples reading
    # read saved processed train samples from the given csv file
    processed_train_samples = pd.read_csv(processed_train_csv_file)

    # inf to nan
    processed_train_samples = processed_train_samples.replace([np.inf, -np.inf], np.nan)
    # nan to 0
    processed_train_samples = processed_train_samples.fillna(value=0)

    processed_train_samples_index_lst = processed_train_samples.index.tolist()
    # the samples were sorted earlier, so shuffling them here improves the results
    random.shuffle(processed_train_samples_index_lst)

    # organize new train samples and targets
    shuffled_train_samples = processed_train_samples.loc[processed_train_samples_index_lst]
    col_names = shuffled_train_samples.columns.tolist()
    col_names.remove("booking_bool")
    features = shuffled_train_samples[col_names].values
    labels = shuffled_train_samples['booking_bool'].values

    # Model training
    # 1 Random Forest Classifier

    print("Training Random Forest Classifier")
    rf_classifier = RandomForestClassifier(n_estimators=150,
                                           verbose=2,
                                           n_jobs=-1,
                                           min_samples_split=10)
    rf_classifier.fit(features, labels)

    print("Saving the Random Forest Classifier")
    data_io.save_model(rf_classifier, model_name='rf_classifier.pkl')

    # 2 Gradient Boosting Classifier
    print("Training Gradient Boosting Classifier")
    gb_classifier = GradientBoostingClassifier(n_estimators=150,
                                               verbose=2,
                                               learning_rate=0.1,
                                               min_samples_split=10)
    gb_classifier.fit(features, labels)
    print("Saving the Gradient Boosting Classifier")
    data_io.save_model(gb_classifier, model_name='gb_classifier.pkl')

    # 3 SGD Classifier
    print("Training SGD Classifier")
    sgd_classifier = SGDClassifier(loss="modified_huber", verbose=2,
                                   n_jobs=-1)
    sgd_classifier.fit(features, labels)

    print("Saving the SGD Classifier")
    data_io.save_model(sgd_classifier, model_name='sgd_classifier.pkl')
Code Example #24
def evalOne(enabledColumns):
    features = [all_features[i] for i in range(0, len(all_features)) if enabledColumns[i]]
    Y = []
    P = []
    for group in range(0, 5):
        # print("Test group " + str(group + 1))
        trainStationList = []
        testStationList = []
        for i in range(0,5):
            if i == group:
                testStationList.extend(groups[i])
            else:
                trainStationList.extend(groups[i])
        trainStations = set(float(station) for station in trainStationList)
        # reorder train stations
    #     print("\ttrainStationList:" + str(trainStationList))
        trainStationList = [s for s in all_stations if float(s) in trainStations]
    #     print("\ttrainStationList:" + str(trainStationList))
        testStations = set(float(station) for station in testStationList)
    #     print("\ttestStationList:" + str(testStationList))
        trainX, testX, trainY, testY, trainLocation, testLocation = splitDataForXValidationWithLocation(trainStations, testStations, "location", data, features, "target")
     
        train_lower = [float(trainStationList[i]) for i in range(0, len(trainStationList)) if i < (len(trainStationList) / 2.0)]
#         train_upper = [float(trainStationList[i]) for i in range(0, len(trainStationList)) if i >= (len(trainStationList) / 2.0)]
         
        test_lower = [float(testStationList[i]) for i in range(0, len(testStationList)) if i < (len(testStationList) / 2.0)]
#         test_upper = [float(testStationList[i]) for i in range(0, len(testStationList)) if i >= (len(testStationList) / 2.0)]
         
        trainY = []
        for l in trainLocation:
            if l in train_lower:
                trainY.append(0)
            else:
                trainY.append(1)
         
        testY = []
        for l in testLocation:
            if l in test_lower:
                testY.append(0)
            else:
                testY.append(1)
         
        model = RandomForestClassifier(random_state=42, n_estimators=20, max_depth=9, n_jobs=-1)
        model.fit(trainX, trainY)
        predY = model.predict(testX)
         
        Y.extend(testY)
        P.extend(predY)
     
    f1 = f1_score(Y, P)
    accuracy = accuracy_score(Y, P)
    return f1, accuracy
Code Example #25
File: Algro.py Project: fengxinhe/DeviceManager
def RF(pth):
    train_desc = np.load(pth + '/training_features.npy')
    nbr_occurences = np.sum((train_desc > 0) * 1, axis=0)
    idf = np.array(np.log((1.0 * len(image_paths) + 1) / (1.0 * nbr_occurences + 1)), 'float32')

    # scaling the words
    stdSlr = StandardScaler().fit(train_desc)
    train_desc = stdSlr.transform(train_desc)
    modelRF = RandomForestClassifier(n_estimators=10,
                                     max_depth=5, max_features=1, random_state=0)
    modelRF.fit(train_desc, np.array(train_labels))
    joblib.dump((modelRF, img_classes, stdSlr), pth + "/rf-bof.pkl", compress=3)
    test(pth, "rf-")
Code Example #26
File: train_main.py Project: georgid/vocal-detection
def train_classifier(vocal_frames, non_vocal_frames):

    frames = np.append(vocal_frames, non_vocal_frames, axis=0)

    labels_vocal = np.ones(vocal_frames.shape[0])
    labels_non_vocal = np.zeros(non_vocal_frames.shape[0])

    labels = np.append(labels_vocal, labels_non_vocal, axis=0)

    rfc = RandomForestClassifier(n_estimators=100, max_depth=None)
    rfc.fit(frames, labels)

    return rfc
Code Example #27
class Model(BaseModel):
    """Antares implementation of scikit learn random forest classifier

    """
    def __init__(self,
                 categorical_features=None,
                 n_estimators=50,
                 n_jobs=-1,
                 max_depth=10):
        '''
        Example:
            >>> from madmex.modeling.supervised.rf import Model
            >>> rf = Model()
            >>> # Write model to db
            >>> rf.to_db(name='test_model', recipe='mexmad', training_set='no')
            >>> # Read model from db
            >>> rf2 = Model.from_db('test_model')
        '''
        super().__init__(categorical_features=categorical_features)
        self.model = RandomForestClassifier(n_estimators=n_estimators,
                                            n_jobs=n_jobs,
                                            max_depth=max_depth)
        self.model_name = 'rf'

    def fit(self, X, y):
        X = self.hot_encode_training(X)
        self.model.fit(X, y)

    def predict(self, X):
        '''
        Simply passes down the prediction from the underlying model.
        '''
        X = self.hot_encode_predict(X)
        return self.model.predict(X)

    def predict_confidence(self, X):
        """Get confidence of every prediction
        """
        X = self.hot_encode_predict(X)
        return self.model.predict_proba(X).max(axis=1)

    def score(self, X, y):
        '''
        Test the model given a dataset and a target vector.

        This method applies the model that this object represents to the given dataset using
        the response variable y. It is a measure of the accuracy of the trained model. Usually
        the original dataset should be split into training and testing subsets to cross validate
        the model.
        '''
        return self.model.score(X, y)
Code Example #28
File: s210_predict_good.py Project: enima2684/zillow
def train_model(X_train,y_train):
    print("training the model ...")
    
    # create sets for probability calibration
    X_train_train, X_prob_cal, y_train_train, y_prob_cal = train_test_split(X_train,
                                                            y_train,
                                                            test_size=0.2)

    
    rf = RandomForestClassifier(
               max_features="auto",
               n_estimators=2000,
               max_depth=8,
               n_jobs=-1,
               class_weight = 'balanced',
               verbose=1)
    rf.fit(X_train_train,y_train_train)
    
    # feature importances
   
#    feature_importance = False
#    if(feature_importance):
#        
#        importances = rf.feature_importances_
#        std = np.std([tree.feature_importances_ for tree in rf.estimators_],
#                 axis=0)
#        indices = np.argsort(importances)[::-1]
#        col_names = df.drop('bin',axis=1).columns.values
#        print("Feature ranking:")
#        
#        for f in range(X_train_train.shape[1]):
#            print("%d. %s (%f)" % (f + 1, col_names[indices[f]], importances[indices[f]]))
#        
#        # Plot the feature importances of the forest
#        plt.figure()
#        plt.title("Feature importances")
#        plt.bar(range(X_train_train.shape[1]), importances[indices],
#               color="r", yerr=std[indices], align="center")
#        plt.xticks(range(X_train_train.shape[1]), col_names[indices],rotation = 50)
#        plt.xlim([-1, X_train_train.shape[1]])
#        plt.show()
        
    
    # Probability calibration
    sig_clf = CalibratedClassifierCV(rf, method="sigmoid", cv="prefit")
    sig_clf.fit(X_prob_cal, y_prob_cal)
    y_pred_train = sig_clf.predict_proba(X_train)
    
    
    print(".. training log_loss  : {:0.2f} %".format(log_loss(y_train,y_pred_train)*100))
    return sig_clf
Code Example #29
def classic_model(image_dir, image_lists, method):

    X, y = get_X_y(image_dir, image_lists, ['training', 'validation'], method)
    classifier = RandomForestClassifier(n_estimators=1000, n_jobs=4)
    classifier.fit(X, y)

    X_test, y_test = get_X_y(image_dir, image_lists, ['testing'], method)
    predictions = classifier.predict(X_test)
    confusion = pandas.crosstab(y_test,
                                predictions,
                                rownames=['Actual Class'],
                                colnames=['Predicted Class'])
    print(confusion)
    return accuracy_score(y_test, predictions)
Code Example #30
def try_model(train):
    print(train.shape)
    features = ["phone_brand", "device_model",  "event_count", "action_radius_max", "medianTime", "minTime", "maxTime", "weekday", "appcounts1"]
    encoder = LabelEncoder()
    train["group"] = encoder.fit_transform(train["group"].values)
    
    rf = RandomForestClassifier(n_estimators=50, max_depth=15, max_features=6, bootstrap=True, n_jobs=4, random_state=2016, class_weight=None)
    
    rf.fit(train[features].values, train["group"].values)
    feature_importance(rf, features)
    
    skf = StratifiedKFold(train["group"].values, n_folds=5, shuffle=True, random_state=2016)
    scores = cross_val_score(rf, train[features].values, train["group"].values, scoring="log_loss", cv=skf, n_jobs=1)
    print(scores)
    print("RF Score: %0.5f" %(-scores.mean())) # RF Score: 2.39884
Code Example #32
File: pscvread.py Project: coreyabshire/color-names
def just_pred(x, y):
    xlen = len(x)
    # shuffle the rows before splitting (the original shuffled an index list it never used)
    idx = np.random.permutation(xlen)
    x, y = x.iloc[idx, :], y.iloc[idx]
    trainpct = 0.7
    trainlen = int(trainpct * xlen)
    xtrain = x.iloc[:trainlen, :]
    ytrain = y.iloc[:trainlen]
    xtest = x.iloc[trainlen:, :]
    ytest = y.iloc[trainlen:]
    rf = RandomForestClassifier()
    rf.fit(xtrain, ytrain)
    ypred = rf.predict(xtest)
    return ytest, ypred
Code Example #33
def train_model(X_train, y_train):
    print("training the model ...")
    rf = RandomForestClassifier(n_estimators=1000,
                                max_depth=8,
                                n_jobs=-1,
                                verbose=1)
    #    rf = svm.SVC(kernel='rbf', gamma=0.7, C=1.0,probability=True)

    rf.fit(X_train, y_train)
    y_pred_train = rf.predict_proba(X_train)

    # use the probability of the positive class (column 1), matching pos_label=1
    fpr, tpr, thresholds = roc_curve(y_train, y_pred_train[:, 1], pos_label=1)
    print("AUC on train : {:.02f} %".format(auc(fpr, tpr) * 100))

    return rf
Code Example #34
def random_forest(profile, group, n_tree, search_number, avg_acc):
    '''
    Build a model from the abundance table.
    :param profile: abundance table
    :param group: group table
    :param n_tree: number of trees in the model
    :param search_number: number of random seeds to try
    :param avg_acc: output file with the accuracy for each random seed
    :return: group with a label column added
    '''
    real_label = set(group.iloc[:, 0])
    label_dict = {}
    for i, j in enumerate(real_label):
        label_dict[j] = i
    label = []
    for sample in group.index:
        label.append(label_dict[group.loc[sample].values[0]])

    group['label'] = label

    n = 0
    with open(avg_acc, 'w') as f:
        f.write('random_state\tavgAcc\n')
        while n < search_number:
            print('iteration {0}'.format(n + 1))
            # random random_state
            random_state = round(random() * 10000)

            rf = RandomForestClassifier(n_estimators=n_tree, max_leaf_nodes=3,
                                        random_state=random_state)

            acc = []
            for i in range(10):
                sample_train = list(profile.sample(n=30).index)
                sample_val = list(set(profile.index).difference(sample_train))
                train = profile.loc[sample_train]
                val = profile.loc[sample_val]
                label_train = group['label'].loc[sample_train]

                rf.fit(train, label_train)
                pre = rf.predict(val)

                acc.append(metrics.accuracy_score(y_true=group['label'][sample_val], y_pred=pre))

            # print('{0}\t{1}\n'.format(random_state, sum(acc) / 10))
            f.write('{0}\t{1}\n'.format(random_state, sum(acc) / 10))
            n += 1
    return group
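Examples #4 and #34 look like two halves of one pipeline: random_forest writes the per-seed accuracies that plot_tree later reads to pick the best seed. A hypothetical driver under that assumption, with illustrative file names:

profile = pd.read_csv('profile.tsv', sep='\t', header=0, index_col=0)
group = pd.read_csv('group.tsv', sep='\t', header=0, index_col=0)

group = random_forest(profile, group, n_tree=100, search_number=50, avg_acc='avg_acc.tsv')
plot_tree(profile, group, avg_acc='avg_acc.tsv', n_tree=100, picture='best_tree.png')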
Code Example #35
def RandomForestSelector(A, y, n_estimators, n_features):
    columns = A.columns.values
    rf = RandomForestClassifier(n_estimators=n_estimators, verbose=0, n_jobs=-1, max_depth=9, random_state=2019)
    rf.fit(A, y)
    importance = rf.feature_importances_
    importance_index = np.argsort(importance)[::-1][:n_features]
    importance_columns = columns[importance_index]
    importance_values = importance[importance_index]
    # print(importance_columns, "\nafter selection:", len(importance_columns))
    # importance_dataFrame = pd.DataFrame({
    #         "feature": importance_columns,
    #         "value": importance_values
    # })
    # print(importance_dataFrame)
    A = A[importance_columns]
    return A, importance_columns
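A short usage sketch for the selector (the frame and labels are synthetic; A must be a DataFrame because of the .columns access):

import numpy as np
import pandas as pd

A = pd.DataFrame(np.random.rand(200, 10), columns=['f%d' % i for i in range(10)])
y = np.random.randint(0, 2, size=200)
A_selected, top_columns = RandomForestSelector(A, y, n_estimators=100, n_features=5)
print(list(top_columns))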
Code Example #37
File: pscvread.py Project: coreyabshire/color-names
def crossval(x, y, k=5):
    for _ in range(k):
        # reshuffle the rows each round (the original shuffled an unused index
        # list and referenced an undefined X)
        idx = np.random.permutation(len(x))
        x, y = x.iloc[idx, :], y.iloc[idx]
        trainpct = 0.7
        trainlen = int(trainpct * len(x))
        xtrain = x.iloc[:trainlen, :]
        ytrain = y.iloc[:trainlen]
        xtest = x.iloc[trainlen:, :]
        ytest = y.iloc[trainlen:]
        rf = RandomForestClassifier()
        rf.fit(xtrain, ytrain)
        ypred = rf.predict(xtest)
        print(ypred)
Code Example #39
    def test_RandomForest(self):
        X = [[0, 1], [1, 1]]
        Y = [0, 1]

        classifier = RandomForestClassifier(n_estimators=10)
        classifier = classifier.fit(X, Y)
        classifier.predict_proba(X)
Code Example #40
def decision_first():

    data = datasets.load_iris()
    x = data["data"]
    y = data["target"]

    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25)
    des = DecisionTreeClassifier(max_leaf_nodes=3)
    des.fit(X_train, y_train)
    print(des.predict(X_test))
    print(des.score(X_test, y_test))

    rom = RandomForestClassifier()
    rom.fit(X_train, y_train)
    print(rom.predict(X_test))
    print(rom.score(X_test, y_test))
Code Example #41
    def mymap(data, N):
        # unpickle the partition, split features/label, fit a forest, return the pickled model
        data = cPickle.loads(str(data))
        x = data[:, :-1]
        y = data[:, -1]
        model = RandomForestClassifier(n_estimators=N, max_depth=6)
        model = model.fit(x, y)
        return cPickle.dumps(model)
Code Example #42
File: dataframes.py Project: dmbrdev/doutorado
def forestPredict(columName, features, trees):
    
    pd.options.mode.chained_assignment = None
    
    df = pd.read_csv(path.NOTEBOOKS_DATA + 'features2.csv')
    df['pred'] = ""
    #df = df.set_index([df.m_championship_id,df.m_match_group_num])
    df = df.set_index([df.m_match_id])
    
    for champId in range(1,91):
        
        champ = df[(df.m_championship_id == champId)]       
        print(champId)
        
        if (champId < 11 or champId > 20):
            
            if (len(champ) == 380):
                rd = 38
            elif (len(champ) == 306):
                rd = 34
            else:
                rd = 30
                 
            for mid in range(2,rd+1):
                
                train = champ[champ.m_match_group_num < mid]
                test = champ[champ.m_match_group_num == mid]
                  
                target = 'm_column_result'
                  
                X = train[features]
                y = train[target]                  
                Z = test[features]
                  
                clf = RandomForestClassifier(n_estimators=trees, max_features=None)
                clf.fit(X, y)

                pred = clf.predict(Z)

                for i, p in zip(Z.index, pred):
                    # DataFrame.set_value was removed from pandas; .at is the replacement
                    df.at[i, 'pred'] = p

    nameFile = 'pred_' + columName + ".csv"

    df.to_csv(path.NOTEBOOKS_DATA + nameFile, index=False)
Code Example #43
File: model2.py Project: tearf001/ucloud
def model_pred(trainX,trainY,testX,model_type):
    if model_type == "rf":
        clf = RandomForestClassifier(n_estimators = 500,n_jobs = 20)
        clf.fit(trainX,trainY)
        pred = clf.predict(testX)
    if model_type == "gbdt":
        clf = GradientBoostingClassifier(n_estimators=6,learning_rate=0.9,random_state=0)
        clf.fit(trainX,trainY)
        pred = clf.predict(testX)
    if model_type == "fusion":
        prob = np.zeros(len(testX))
        params = [100,200,300,400,500]
        for param in params:
            clf = RandomForestClassifier(n_estimators = param,n_jobs = 20,bootstrap=True)
            clf.fit(trainX,trainY)
            prob += clf.predict(testX)
        '''
        params = [1,2,3,4,5,6,7,8,9,10]
        for param in params:
            clf = GradientBoostingClassifier(n_estimators=param,learning_rate=0.9,random_state=0)
            clf.fit(trainX,trainY)
            prob += clf.predict(testX)
        '''
        pred = list(prob >= 3)
    print "the pos rate is:",float(sum(pred))/len(pred)
    return pred
Code Example #44
    def initDecTrees(self, path):
        for filename in os.listdir(path):
            if filename == 'train.csv':
                with open(os.path.join(path, filename)) as infile:
                    f = csv.reader(infile)
                    next(f)  # skip the header
                    x = []
                    y = []
                    for line in f:
                        if len(line) > 1:
                            if self.option == 1:
                                data = [converter(line[2]), converter(line[3]), converter(line[4]), converter(line[7]), converter(line[9])]
                                y.append(converter(line[6]))
                                x.append(data)
                            elif self.option == 2:
                                auxDeputy = fetchDeputyParty(line[2])
                                data = [converter(line[2]), converter(line[3]), converter(line[4]), converter(line[7]), converter(line[9]), encodeParty(auxDeputy['party']), encodeState(auxDeputy['state'])]
                                y.append(converter(line[6]))
                                x.append(data)
                clf = RandomForestClassifier(n_estimators=5)
                clf.fit(x, y)
                return clf
Code Example #45
File: tree.py Project: hongbin0908/pytrade
class MyRfClassifier(BaseClassifier):
    def __init__(self, n_estimators, max_depth, min_samples_leaf):
        self.classifier = RandomForestClassifier(**{'verbose':1, 'n_estimators': n_estimators,
                                                    'max_depth':max_depth,'min_samples_leaf':min_samples_leaf,
                                                    'n_jobs':40})
        self.name = "rf_n{n}_md{md}_ms{ms}".format(
            **{"n": n_estimators, "md": max_depth, "ms": min_samples_leaf}
        )

    def get_name(self):
        return self.name

    def fit(self, X, y, X_t, y_t):
        return self.classifier.fit(X, y)

    def predict_proba(self, X):
        return self.classifier.predict_proba(X)

    def get_feature_importances(self, feat_names):
        ipts = dict(zip(feat_names, self.classifier.feature_importances_))
        return ipts
Code Example #46
class MyRandomForestClassifier(BaseClassifier):
    def __init__(self, verbose=1, n_estimators = 2000, max_depth=8, min_samples_leaf=10000,
                 n_jobs=25):
        self.classifier = RandomForestClassifier( **{'verbose': verbose,
                                                     'n_estimators': n_estimators,
                                                     'max_depth': max_depth, 'min_samples_leaf': min_samples_leaf,
                                                      'n_jobs': n_jobs})
        self.name = "rf_n{n}_md{md}_ms{ms}".format(
            **{"n": n_estimators, "md": max_depth, "ms": min_samples_leaf}
        )

    def get_name(self):
        return self.name

    def fit(self, X, y):
        return self.classifier.fit(X, y)

    def predict_proba(self, X):
        return self.classifier.predict_proba(X)

    def get_feature_importances(self, feat_names):
        return self.classifier.feature_importances_
Code Example #47
def drawfeature(train_data_path='./train', train_filename='train_cleaned',test_data_path='./test', test_filename='test_cleaned'):
    train_file = os.path.join(train_data_path, train_filename)
    train_data = pd.read_csv(train_file)
    n_train_data = train_data['text'].size

    test_file = os.path.join(test_data_path,test_filename)
    test_data = pd.read_csv(test_file)
    n_test_data = test_data['text'].size

    vectorizer = CountVectorizer(analyzer="word",tokenizer=None, preprocessor=None, stop_words=None, max_features=2000)
    transformer = TfidfTransformer()

    train_data_words = []
    for i in range(n_train_data):
        train_data_words.append(words_to_features(train_data['text'][i]))
    train_data_features = vectorizer.fit_transform(train_data_words)
    train_data_features = train_data_features.toarray()
    train_data_features = transformer.fit_transform(train_data_features)
    train_data_features = train_data_features.toarray()
    # a 2D feature matrix needs a DataFrame, not a Series
    train_data_pd = pd.DataFrame(train_data_features)
    train_data_pd.to_csv("trainfeature.csv", index=None, header=True)


    test_data_words = []
    for i in range(n_test_data):
        test_data_words.append(words_to_features(test_data['text'][i]))
    # reuse the vocabulary and idf weights fitted on the training data
    # (the original re-fitted both on the test set)
    test_data_features = vectorizer.transform(test_data_words)
    test_data_features = test_data_features.toarray()
    test_data_features = transformer.transform(test_data_features)
    test_data_features = test_data_features.toarray()
    test_data_pd = pd.DataFrame(test_data_features)
    test_data_pd.to_csv("testfeature.csv", index=None, header=True)

    forest = RandomForestClassifier(n_estimators=60)
    forest = forest.fit(train_data_features, train_data['lable'])
    pred = forest.predict(test_data_features)
    pred = pd.Series(pred, name='Target')
    pred.to_csv("bow_tfidf_RF.csv", index=None, header=True)
Code Example #48
    total_test = np.array([])
    
    for year in [2007, 2009, 2011, 2013]:

        X_train, X_test, y_train, y_test = year_train_test_split(
            train_for_loo,
            'WnvPresent_DateTrapSpecies',
            year)      

        X_train.to_csv("data_per_year/" + str(year) + "X_train.csv", index=False)
        X_test.to_csv("data_per_year/" + str(year) + "X_test.csv", index=False)
        y_train.to_csv("data_per_year/" + str(year) + "y_train.csv", index=False)
        y_test.to_csv("data_per_year/" + str(year) + "y_test.csv", index=False)

        
        clf.fit(X_train, y_train)

        # y_pred = clf.predict_proba(X_test) [:, 1] # Random Forest
        y_pred = clf.predict_proba(X_test) # For XGB
        
        score = metrics.roc_auc_score(y_test, y_pred)
        scores.append(score)
        
        #import operator
        #feat_importances = dict(zip(X_train.columns, clf.feature_importances_))
        #sorted_feat_importances = sorted(feat_importances.items(), key=operator.itemgetter(1))
        #print(sorted_feat_importances)
        
        total_pred = np.concatenate((total_pred, y_pred))
        total_test = np.concatenate((total_test, y_test))
        
Code Example #49
#Check if there is linear correlation between pixel<x> columns and label
#If yes, we should dive into the columns with correlation. Linear / logistic regression may work well with the data.
#In this case, makes sense that there is no correlation - higher pixel values does not mean that label value will be higher
#print "Correlation:", train.corr()["label"]

#Check that the algorithm used gives good accuracy by using part of the training set to validate
train_train, train_test=train_test_split(train, test_size=0.3)

#Train model
model=RandomForestClassifier(n_estimators = 100, oob_score = True, random_state =10, max_features = "auto", min_samples_leaf = 20)
#model=KNeighborsClassifier(n_neighbors=6)


#if getting this error, it is because a matrix with 1 column
#is being passed in when a 1d array is expected. ravel() will work.
#DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). if name == 'main':
#To resolve this error, convert label values to int or str as float is not a valid label-type
#raise ValueError("Unknown label type: %r" % y) ValueError: Unknown label type: array
#model.fit(train_train.ix[:,'pixel0':'pixel783'], np.asarray(train_train.ix[:,'label'].astype(int)))
#print "model.score:", model.score(train_test.ix[:,'pixel0':'pixel783'], np.asarray(train_test.ix[:,'label'].astype(int)))
#print "cross validation score:", cross_validation.cross_val_score(model, train_train.ix[:,'pixel0':'pixel783'], train_train.ix[:,'label'], cv=3)
model.fit(train_train.loc[:, 'pixel0':'pixel783'], train_train.loc[:, 'label'].values.ravel())
print("model.score", model.score(train_test.loc[:, 'pixel0':'pixel783'], train_test.loc[:, 'label'].values.ravel()))


#Predict output
#predicted=model.predict(train_test.ix[:,'pixel0':'pixel783'])
#print predicted
#print "Accuracy: ", accuracy_score(train_test.ix[:,'label'].astype(int), predicted)
Code Example #50
### the training data (features_train, labels_train) have both "fast" and "slow"
### points mixed together--separate them so we can give them different colors
### in the scatterplot and identify them visually
grade_fast = [features_train[ii][0] for ii in range(0, len(features_train)) if labels_train[ii]==0]
bumpy_fast = [features_train[ii][1] for ii in range(0, len(features_train)) if labels_train[ii]==0]
grade_slow = [features_train[ii][0] for ii in range(0, len(features_train)) if labels_train[ii]==1]
bumpy_slow = [features_train[ii][1] for ii in range(0, len(features_train)) if labels_train[ii]==1]


#### initial visualization
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.0)
plt.scatter(bumpy_fast, grade_fast, color="b", label="fast")
plt.scatter(bumpy_slow, grade_slow, color="r", label="slow")
plt.legend()
plt.xlabel("bumpiness")
plt.ylabel("grade")

### your code here!  name your classifier object clf if you want the visualization code below to work
clf = RandomForestClassifier()
clf.fit(features_train, labels_train)
### draw the decision boundary with the test points overlaid
prettyPicture(clf, features_test, labels_test)

plt.show()
################################################################################
Code Example #51
metrics.confusion_matrix(y_train, model2.predict(x_train))
print(metrics.classification_report(y_train, model2.predict(x_train)))
metrics.confusion_matrix(y_test, model2.predict(x_test))
print(metrics.classification_report(y_test, model2.predict(x_test)))
clf.set_params(min_samples_leaf=5)
clf.set_params(max_depth=5)
model3 = clf.fit(x_train, y_train)
metrics.confusion_matrix(y_train, model3.predict(x_train))
print(metrics.classification_report(y_train, model3.predict(x_train)))
metrics.confusion_matrix(y_test, model3.predict(x_test))
print(metrics.classification_report(y_test, model3.predict(x_test)))


#### Models
from sklearn.ensemble import (RandomForestClassifier,
                              ExtraTreesClassifier)
model = RandomForestClassifier()

# Train
clf = model.fit(x_train, y_train)

# Get accuracy scores
scores = clf.score(data, target)
metrics.confusion_matrix(y_train, clf.predict(x_train))
print(metrics.classification_report(y_train, clf.predict(x_train)))

metrics.confusion_matrix(y_test, clf.predict(x_test))
print(metrics.classification_report(y_test, clf.predict(x_test)))

Code Example #52
    def fit(self, X, y, sample_weight=None):
        # reweight the samples, then delegate to the parent RandomForestClassifier.fit
        sample_weight = normalize_weight(y, sample_weight, sig_weight=self.sig_weight, pow_sig=self.pow_sig,
                                         pow_bg=self.pow_bg)
        return RandomForestClassifier.fit(self, X, y, sample_weight=sample_weight)
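This override assumes an enclosing class that subclasses RandomForestClassifier plus a normalize_weight helper, neither of which is shown. A sketch of what they might look like; the signature is inferred from the call above, and the weighting scheme is illustrative rather than the original project's:

import numpy as np
from sklearn.ensemble import RandomForestClassifier

def normalize_weight(y, sample_weight, sig_weight=1.0, pow_sig=1.0, pow_bg=1.0):
    # give signal (y == 1) and background (y == 0) events separately normalized weights
    y = np.asarray(y)
    w = np.ones(len(y)) if sample_weight is None else np.asarray(sample_weight, dtype=float)
    sig, bg = y == 1, y == 0
    w[sig] = sig_weight * (w[sig] ** pow_sig) / (w[sig] ** pow_sig).sum()
    w[bg] = (w[bg] ** pow_bg) / (w[bg] ** pow_bg).sum()
    return w

class WeightedRandomForestClassifier(RandomForestClassifier):
    def __init__(self, sig_weight=1.0, pow_sig=1.0, pow_bg=1.0, **kwargs):
        super().__init__(**kwargs)
        self.sig_weight = sig_weight
        self.pow_sig = pow_sig
        self.pow_bg = pow_bg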
Code Example #53
clf = AdaBoostClassifier(DecisionTreeClassifier(criterion='gini',
                                                max_depth=6, 
                                                min_samples_leaf=3),
                         n_estimators = 200,
                         learning_rate = 0.1)
####################
clf = neighbors.KNeighborsClassifier(100, weights='uniform')
clf = neighbors.KNeighborsClassifier(100, weights='distance')
####################
clf = GaussianNB()
##############################
t0 = time()
param_grid = {'C': [150, 500, 750, 1000],
              'gamma': [ 0.0005, 0.001, 0.05, .01], }
clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)
clf = clf.fit(X_train, y_train)
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
print(clf.best_estimator_)

clf = SVC(C=750, kernel='rbf', class_weight='balanced', gamma=0.001, probability=True)

########################
scores = cross_val_score(clf, data_PCA, dat_clean.genre, cv=10)
print(scores)
print(scores.mean())

predicted = cross_val_predict(clf, data_PCA, dat_clean.genre, cv=10)
print("Accuracy Test: {0:.3f}".format(metrics.accuracy_score(dat_clean.genre, predicted)))
print()
print("Classification report:")
Code Example #54
        'score': []
    }

    if len(sys.argv) > 1 and sys.argv[1] == '--skip-train':
        results = pd.read_csv("./data/results.csv")
    else:
        for classification_dataset in classification_dataset_names:
            print("Starting", classification_dataset)

            X, y = fetch_data(classification_dataset, return_X_y=True, local_cache_dir='./data/')
            train_X, test_X, train_y, test_y = train_test_split(X, y)

            rf = RandomForestClassifier()
            lexRF = LexicaseForestClassifier()

            rf.fit(train_X, train_y)
            lexRF.fit(train_X, train_y)

            rf_score = rf.score(test_X, test_y)
            lexRF_score = lexRF.score(test_X, test_y)

            results['problem'] = results['problem'] + ([classification_dataset] * 2)
            results['method'] = results['method'] + ['RF', 'LexRF']
            results['score'].append(rf_score)
            results['score'].append(lexRF_score)

        results = pd.DataFrame(results)
        results.to_csv("./data/results.csv", index=False)

    problems = (
        results
Code Example #55
File: main.py Project: dtpryce/jpmml-sklearn
print(audit_X.dtype, audit_y.dtype)

def predict_audit(classifier):
    adjusted = DataFrame(classifier.predict(audit_X), columns = ["Adjusted"])
    adjusted_proba = DataFrame(classifier.predict_proba(audit_X), columns = ["probability_0", "probability_1"])
    return pandas.concat((adjusted, adjusted_proba), axis = 1)

audit_tree = DecisionTreeClassifier(random_state = 13, min_samples_leaf = 5)
audit_tree.fit(audit_X, audit_y)

store_pkl(audit_tree, "DecisionTreeAudit.pkl")
store_csv(predict_audit(audit_tree), "DecisionTreeAudit.csv")

audit_forest = RandomForestClassifier(random_state = 13, min_samples_leaf = 5)
audit_forest.fit(audit_X, audit_y)

store_pkl(audit_forest, "RandomForestAudit.pkl")
store_csv(predict_audit(audit_forest), "RandomForestAudit.csv")

audit_regression = LogisticRegression()
audit_regression.fit(audit_X, audit_y)

store_pkl(audit_regression, "RegressionAudit.pkl")
store_csv(predict_audit(audit_regression), "RegressionAudit.csv")

#
# Multi-class classification
#

iris_df = load_csv("Iris.csv")
Code Example #56
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier

import autopath
from datasets import training_set, test_set
from util import convert_gray_scale, flatten


Xr,Yr = training_set
Xe,Ye = test_set

Xr = flatten(convert_gray_scale(Xr))
Xe = flatten(convert_gray_scale(Xe))

# compute_importances was removed from scikit-learn; feature_importances_ is always computed
rf = RandomForestClassifier(n_estimators=100, verbose=3, oob_score=True)
rf.fit(Xr, Yr)

Yp = rf.predict(Xe)
print(np.mean(Yp == Ye))

Ypp = rf.predict_proba(Xe).max(axis=1)

plt.figure(1)
plt.clf()
plt.hist(Ypp[Yp == Ye], 50, color='b', density=True, alpha=0.4,
         label='classified')
plt.hist(Ypp[Yp != Ye], 50, color='r', density=True, alpha=0.4,
         label='misclassified')
plt.legend(loc='upper left')
plt.draw()
plt.show()
Code Example #57
File: svm.py Project: ghollah/ServingMLAPIs
def runns(resp_var, size_of_test_data,dataset,positive_class,n_estimators,important_features,dealing_with_nulls):
	dataset = pd.read_csv('raw_data.csv', low_memory=False) # For testing purposes
	#----DATA PREPROCESSING
	#-------dealing with NULL values in the data
	#----------remove the rows in which the response is null
	dataset=dataset.dropna(subset=[resp_var])
	#----------dealing with nulls
	dataset=deal_with_nulls(dealing_with_nulls,dataset)
	#----FEATURE SELECTION
	#-------get predictors important in predicting the response
	#-----------transform categorical predictors to dummy variables
	predictors=dataset.drop(resp_var,axis=1,inplace=False)
	predictors=pd.get_dummies(predictors)
	#-----------balance the classes in the response var
	ros = RandomOverSampler(random_state=0)
	resp=dataset[resp_var]
	prds, resp = ros.fit_resample(predictors, resp)  # imblearn renamed fit_sample to fit_resample
	#-----------fit the random forest classifier to give us the important predictors
	rf_clf = RandomForestClassifier(n_estimators=n_estimators)
	rf_clf.fit(prds,resp)
	#-------get the important predictors
	feature_imp = pd.Series(rf_clf.feature_importances_,
                    index=list(predictors.iloc[:,0:])).sort_values(ascending=False)
	#-------names of the important predictors
	important_predictor_names = feature_imp.index[0:important_features]
	#-------subset the data to get only the important predictors and the response
	resp=pd.DataFrame(data=resp,columns=[resp_var])
	predictors=pd.DataFrame(prds,columns=list(predictors))
	dataset=pd.concat([resp,predictors],axis=1)
	#---------------------------------------------------------
	#----MODEL TRAINING
	#--------Remove the response variables from the features variables - axis 1 refers to the columns
	m_data= dataset.drop(resp_var, axis = 1,inplace=False) 
	# Response variables are the values we want to predict
	resp_var = np.array(dataset[resp_var])

	dataset = pd.get_dummies(m_data)
    
	# Saving feature names for later use
	feature_list = list(m_data.columns)
	# Convert to numpy array
	dataset = np.array(dataset)

	# Split the data into training and testing sets
	train_features, test_features, train_labels, test_labels = train_test_split(dataset, resp_var, test_size = size_of_test_data, random_state = 402)

	# Instantiate model with n_estimators decision trees
	clf = SVC(kernel='rbf',probability=True)

	# Train the model on training data
	clf.fit(train_features, train_labels)
    # evaluation
	predicted = clf.predict(test_features)
	pred_prob = clf.predict_proba(test_features)
    
	accuracy = accuracy_score(test_labels, predicted)
	#confusion matrix
	cnf = (confusion_matrix(test_labels,predicted))
	#precision score
	precision = precision_score(test_labels,predicted,pos_label=positive_class)
	#avg pres
	avg_precision = average_precision_score(test_labels,pred_prob[:,[1]])
	#recall score
	rec = recall_score(test_labels,predicted,pos_label=positive_class)
	#f1 score
	fscore = f1_score(test_labels,predicted,pos_label=positive_class)
	#fbeta score
	fbeta = fbeta_score(test_labels,predicted,beta=0.5)
	#hamming_loss
	hamming = hamming_loss(test_labels,predicted)
	#jaccard score (jaccard_similarity_score was renamed in scikit-learn)
	jaccard = jaccard_score(test_labels,predicted,pos_label=positive_class)
	#logloss
	logloss = log_loss(test_labels,predicted)
	#zero-oneloss
	zero_one = zero_one_loss(test_labels,predicted)
	#auc roc 
	area_under_roc = roc_auc_score(test_labels,pred_prob[:,[1]])
	#cohen_score
	cohen = cohen_kappa_score(test_labels,predicted)
	#mathews corr
	mathews = matthews_corrcoef(test_labels,predicted)
	# Variable importances from the important features selection stage
	variable_importance_list = list(zip(prds, feature_imp))
	output={"accuracy":accuracy,"precision":precision,"average precision":avg_precision,"recall":rec,"fscore":fscore,"fbeta":fbeta,"hamming":hamming,"jaccard":jaccard,"logloss":logloss,"zero_one":zero_one,"area_under_roc":area_under_roc,"cohen":cohen,"mathews":mathews}
	output=json.dumps(output)
	return jsonify({"Predictions": output})
Code Example #58
                    corpus_test.append(text)
                    if int(vals[0]) == 0:
                        y_test.append('0')
                    else:
                        y_test.append('1')
    
    X_train = vectorizer.fit_transform(corpus_train)

    X_test = vectorizer.transform(corpus_test)
    
    clf = RandomForestClassifier(n_estimators=10)
    #clf = KNeighborsClassifier(n_neighbors=10)
    #clf = LinearSVC()
    
    clf.fit(X_train, y_train)
    
    print(len(y_train))
    print(len(y_test))
    
    pred = clf.predict(X_test)
    
    #pred = ['0']* len(y_test)
    score = metrics.accuracy_score(y_test, pred)
    print("accuracy:   %0.3f" % score)
    total.append(score)
    
    n = 20
    
#     feature_names = vectorizer.get_feature_names()
#     coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))