def Classify(features, labels, namesClasses):

    print("Training")
    # n_estimators is the number of decision trees;
    # max_features (a.k.a. m_try) is left at its default, the square
    # root of the number of features
    clf = RF(n_estimators=100, n_jobs=3)
    scores = cross_validation.cross_val_score(clf,
                                              features,
                                              labels,
                                              cv=5,
                                              n_jobs=1)
    print("Accuracy of all classes")
    print(np.mean(scores))

    # the legacy KFold takes the number of samples, not the label array
    kf = KFold(len(labels), n_folds=5)
    y_pred = np.zeros((len(labels), len(set(labels))))
    for train, test in kf:
        features_train, features_test = features[train, :], features[test, :]
        labels_train, labels_test = labels[train], labels[test]
        clf = RF(n_estimators=100, n_jobs=3)
        clf.fit(features_train, labels_train)
        y_pred[test] = clf.predict_proba(features_test)

    # classification_report needs hard predictions, not probabilities;
    # argmax assumes labels are integer-coded 0..n_classes-1
    print(classification_report(labels, y_pred.argmax(axis=1),
                                target_names=namesClasses))

    return y_pred
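Note: these snippets are excerpted without their import headers. A plausible header for the example above, assuming the legacy (pre-0.18) scikit-learn cross-validation API it uses:

# assumed imports for the example above (legacy scikit-learn < 0.18)
import numpy as np
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn import cross_validation
from sklearn.cross_validation import KFold
from sklearn.metrics import classification_report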
Example 2
def check_nlp_improvement(fast=False):
    if fast:
        clf = RF(n_estimators=100, n_jobs=-1, criterion="entropy",
                 max_features='auto', min_samples_split=5)
        folds = 5
    else:
        clf = RF(n_estimators=1000, n_jobs=-1, criterion="entropy",
                 max_features=100, min_samples_split=5)
        folds = 10

    # hash the sorted parameter values so the cached baseline is
    # invalidated whenever the model configuration changes
    paramlist = [str(i) for i in clf.get_params().values()]
    parlist = str(np.sort(paramlist)) + str(folds)
    h = hashlib.sha1()
    h.update(parlist.encode('utf-8'))
    sig = h.hexdigest()
    try:
        baseline = np.load("nlp_baseline_" + str(sig) + ".npy")
    except Exception:
        print("Establishing baseline, this will run once")
        X_train, y_train, X_test, test_ids = read_json(do_descriptions=False)
        baseline = cv(X_train, y_train, None, MinMaxScaler(), clf,
                      folds=folds, metric=metrics.log_loss, verbose=True)
        np.save("nlp_baseline_" + str(sig), baseline)

    print("Baseline:", baseline)

    X_train, y_train, X_test, test_ids = read_json(do_descriptions=True)
    print("Checking performance, this may take several minutes")
    res = cv(X_train, y_train, None, MinMaxScaler(), clf,
             folds=folds, metric=metrics.log_loss, verbose=True)
    print("Result:", res)

    if res < baseline:
        print("Improvement over baseline:", baseline - res)
    else:
        print("Performance worse than baseline by:", res - baseline)
Example 3
def rrf(series, n_folds, clfparams, featureparams, aggregateparams,
        refineparams, include, exclude, save_test_predictions,
        save_oob_predictions, skip_cross_validation, _run):
    data = TelstraData(include=include, exclude=exclude, **featureparams)
    time = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    pred_cols = ['predict_{}'.format(i) for i in range(3)]
    best_pruning = refineparams['n_prunings']
    if skip_cross_validation:
        loss = 999.  # sentinel loss when CV is skipped
    else:
        y = data.get_y()
        kf = StratifiedKFold(y.values, n_folds=n_folds, shuffle=True)
        pred = pd.DataFrame(0., index=y.index, columns=pred_cols)
        i = 1
        _run.info['loss'] = []
        _run.info['trainloss'] = []
        for itrain, itest in kf:
            Xtr, ytr, Xte, yte = data.get_train_test_features(
                itrain, itest, **aggregateparams)
            clf = RF(**clfparams)
            clf.fit(Xtr, ytr)
            rrf = RRF(clf, **refineparams)
            rrf.fit(Xtr, ytr)
            loss2tr = multiclass_log_loss(ytr.values, rrf.predict_proba(Xtr))
            loss2te = multiclass_log_loss(yte.values, rrf.predict_proba(Xte))
            _run.info['loss'].append(loss2te)
            _run.info['trainloss'].append(loss2tr)
            print("Fold {} mlogloss train: {:.4f}, test: {:.4f}".format(
                i, loss2tr, loss2te))
            pred.iloc[itest, :] = rrf.predict_proba(Xte)
            i += 1
        loss = multiclass_log_loss(y.values, pred.values)
        _run.info['features'] = list(Xtr.columns)
        # Optionally save oob predictions
        if save_oob_predictions:
            filename = '{}_{}.csv'.format(series, time)
            pred.to_csv(filename, index_label='id')
    # Optionally generate test predictions
    if save_test_predictions:
        filename = '{}_test_{}.csv'.format(series, time)
        Xtr, ytr, Xte, yte = data.get_train_test_features(**aggregateparams)
        #
        # weights = np.concatenate((np.ones(ytr.shape[0]),0.3*np.ones(semilabels.shape[0])))
        # Xtr = pd.concat((Xtr, Xtest), axis=0)
        # ytr = pd.concat((ytr, semilabels))
        clf = RF(**clfparams)
        clf.fit(Xtr, ytr)  #,weights)
        rrf = RRF(clf, **refineparams)
        rrf.fit(Xtr, ytr)
        predtest = pd.DataFrame(rrf.predict_proba(Xte),
                                index=yte.index,
                                columns=pred_cols)
        predtest.to_csv(filename, index_label='id')
    return loss


def rf(series, n_folds, clfparams, featureparams, aggregateparams, include,
       exclude, save_test_predictions, save_oob_predictions,
       skip_cross_validation, _run):
    data = TelstraData(include=include, exclude=exclude, **featureparams)
    time = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    pred_cols = ['predict_{}'.format(i) for i in range(3)]

    if skip_cross_validation:
        loss = 999.  # sentinel loss when CV is skipped
    else:
        y = data.get_y()
        kf = StratifiedKFold(y.values, n_folds=n_folds, shuffle=True)
        pred = pd.DataFrame(0., index=y.index, columns=pred_cols)
        i = 1
        _run.info['loss'] = []
        _run.info['trainloss'] = []
        feature_importances_ = 0
        for itrain, itest in kf:
            Xtr, ytr, Xte, yte = data.get_train_test_features(itrain, itest, **aggregateparams)
            clf = RF(**clfparams)
            clf.fit(Xtr, ytr)  # , weights)
            pred.iloc[itest, :] = clf.predict_proba(Xte)
            trainloss = multiclass_log_loss(ytr, clf.predict_proba(Xtr))
            _run.info['trainloss'].append(trainloss)
            loss = multiclass_log_loss(yte, pred.iloc[itest].values)
            _run.info['loss'].append(loss)
            if i == 1:
                feature_importances_ = clf.feature_importances_/n_folds
            else:
                feature_importances_ += clf.feature_importances_/n_folds
            i += 1
        loss = multiclass_log_loss(y, pred.values)

        _run.info['features'] = list(Xtr.columns)
        _run.info['feature_importances'] = list(feature_importances_)
        # Optionally save oob predictions
        if save_oob_predictions:
            filename = '{}_{}.csv'.format(series, time)
            pred.to_csv(filename, index_label='id')
    # Optionally generate test predictions
    if save_test_predictions:
        filename = '{}_test_{}.csv'.format(series, time)
        Xtr, ytr, Xte, yte = data.get_train_test_features(**aggregateparams)
        clf = RF(**clfparams)
        clf.fit(Xtr, ytr)  # , weights)
        predtest = pd.DataFrame(clf.predict_proba(Xte),
                                index=yte.index, columns=pred_cols)
        predtest.to_csv(filename, index_label='id')
    return loss
	def _calculate_cv_error(base_clf, best_rate, X, y, is_y_noise, clean_type, 
							max_nb_feats, major_oob_label):

		errors = []

		skf = StratifiedKFold(n_splits=NoiseDetectionEnsemble.k_folds, 
								shuffle=True)

		for train_idxs, val_idxs in skf.split(X=range(len(y)), y=y):

			train_X = DataHelper.select_rows(X, train_idxs, copy=False)
			train_y = DataHelper.select_rows(y, train_idxs, copy=False)
			train_is_y_noise = DataHelper.select_rows(is_y_noise, train_idxs,
												copy=False)
	
			clean_train = NoiseDetectionEnsemble._clean_noisy_data(train_X,
													train_y, train_is_y_noise,
													clean_type, major_oob_label)

			train_X, train_y, adapted_rate = DataHelper.adapt_rate(clean_train[0], 
																clean_train[1], 
																best_rate)

			ensemble = RF(501, n_jobs=-1, max_features="sqrt")
			ensemble.fit(train_X, train_y)

			val_X = DataHelper.select_rows(X, val_idxs, copy=False)
			val_y = DataHelper.select_rows(y, val_idxs, copy=False)

			predictions = ensemble.predict(val_X)
			error = MetricsHelper.calculate_error_score(val_y, predictions)
			errors.append(error)

		return mean(errors)
Example 6
def main(x, y, task):

    #ys = [yr, ym, y25]
    #y_names = ['readm', 'mort_h', 'pheno25']
    #xs = [x48, onehot, w2v, w48, sentences]
    #x_names = ['48h', 'sparse_dx', 'w2v', 'w2v_48h', 'sentences']

    lr = LR(C=1e-4, penalty='l2', verbose=1)  # use the 'sag' solver if multiclass/multilabel
    svm = SVM(C=1e5, verbose=True)
    rf = RF(n_estimators=60, verbose=1)
    gbc = GBC(n_estimators=200, learning_rate=1e-3, verbose=1)

    models = [lr, svm, rf, gbc]
    names = ['LR', 'SVM', 'RF', 'GBC']
    data = {}
    for name, model in zip(names, models):
        if task != 'binary':
            data[name] = {}
            for ix in range(25):
                data[name][ix] = run_experiment(x, y[:, ix], model, task)
        else:
            data[name] = run_experiment(x, y, model, task)

    return data
Example 7
    def __init__(self, train_X, test_X, train_Y, test_Y, agent, classifier,
                 save_conf_mat):
        self.train_X = train_X
        self.test_X = test_X
        self.train_Y = train_Y
        self.test_Y = test_Y
        self.classifier = classifier

        if self.classifier.lower() == 'knn':
            self.clf = KNN()
        elif self.classifier.lower() == 'rf':
            self.clf = RF()
        elif self.classifier.lower() == 'svm':
            self.clf = SVM()
        else:
            self.clf = None
            print("\n[Error!] We don't currently support the {} "
                  "classifier...\n".format(classifier))
            exit(1)

        # keep a user-supplied agent mask; previously a non-None agent
        # argument was silently dropped
        if agent is None:
            self.agent = np.ones(train_X.shape[1])
        else:
            self.agent = agent
        self.predictions = self.classify()
        self.accuracy = self.compute_accuracy()
        self.precision = self.compute_precision()
        self.recall = self.compute_recall()
        self.f1_score = self.compute_f1()
        self.confusion_matrix = self.compute_confusion_matrix()
        self.plot_confusion_matrix(save_conf_mat)
Example 8
def try_params(n_iterations, params):

    n_estimators = int(round(n_iterations * trees_per_iteration))
    print "n_estimators:", n_estimators
    pprint(params)

    clf = RF(n_estimators=n_estimators, verbose=0, n_jobs=-1, **params)
    clf.fit(x_train, y_train)

    p = clf.predict_proba(x_train)[:, 1]

    ll = log_loss(y_train, p)
    auc = AUC(y_train, p)
    acc = accuracy(y_train, np.round(p))

    print "\n# training | log loss: {:.2%}, AUC: {:.2%}, accuracy: {:.2%}".format(
        ll, auc, acc)

    #

    p = clf.predict_proba(x_test)[:, 1]

    ll = log_loss(y_test, p)
    auc = AUC(y_test, p)
    acc = accuracy(y_test, np.round(p))

    print "# testing  | log loss: {:.2%}, AUC: {:.2%}, accuracy: {:.2%}".format(
        ll, auc, acc)

    return {'loss': ll, 'log_loss': ll, 'auc': auc}
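This snippet leans on module-level globals (x_train, y_train, x_test, y_test, trees_per_iteration) defined elsewhere in the file; a minimal, hypothetical setup with synthetic data:

# hypothetical module-level setup assumed by try_params
import numpy as np
from pprint import pprint
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.datasets import make_classification
from sklearn.metrics import log_loss, roc_auc_score as AUC, accuracy_score as accuracy
from sklearn.model_selection import train_test_split

trees_per_iteration = 5  # illustrative value; hyperband scales trees with iterations
x, y = make_classification(n_samples=2000, n_features=20, random_state=0)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)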
Example 9
def analyzing_models(images):
    """
    Program: Main program to analyze models and feature sets
    Input: Images
    Output: Accuracy dataframe for each model and feature set
    """

    #  model list
    models = [RF(n_estimators=100, n_jobs=3),
              MLPClassifier(hidden_layer_sizes=(20, )),
              BernoulliNB(),
              ExtraTreesClassifier(n_estimators=100, n_jobs=3)
              ]

    #model_names =['RandomForest', 'Neural network', 'ExtraTrees']
    model_names = ['RandomForest', 'Neural network', 'Bernoulli Naive Bayes',
                   'ExtraTrees']
    
    #  feature list
    available_features = ['haralick', 
                          'zernike', 
                          'binary_pattern',
                          #'binary_pattern_small',
                          'ratio', 
                          'image_size',
                          #'normalized_sift',
                          'sift'
                          ]
    combi_features = combi_lists(available_features)

    # accuracy dataframe comparing all model/feature-set combinations
    accuracy_df = compare_accuracy(images, combi_features, models, model_names)
    return accuracy_df
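combi_lists is project code not shown here; a plausible sketch, assuming it enumerates every non-empty subset of the feature names:

from itertools import combinations

def combi_lists(features):
    # all non-empty subsets of the feature list, smallest first
    return [list(c)
            for r in range(1, len(features) + 1)
            for c in combinations(features, r)]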
def main():
    # Get the clean datasets
    x, y, xt, feats, sample = readData()

    # Try out different models
    xg_class_params = {"objective": "binary:logistic", "eval_metric": "auc",
                       "booster": "gbtree", "eta": 0.01, "max_depth": 14,
                       "min_child_weight": 10, "subsample": 0.66,
                       #"colsample_bytree": 0.7,
                       "colsample_bylevel": 0.3,
                       "thread": 1, "silent": 1, "seed": 221}
    xg_class_params2 = {"objective": "binary:logistic", "eval_metric": "auc",
                        "booster": "gbtree", "eta": 0.02, "max_depth": 5,
                        "min_child_weight": 10, "subsample": 0.66,
                        #"colsample_bytree": 0.7,
                        "colsample_bylevel": 0.3,
                        "thread": 1, "silent": 1, "seed": 221}
    rf1 = RF(n_estimators=1000, max_features=50, criterion='entropy',
             min_samples_split=40, max_depth=30, min_samples_leaf=2,
             n_jobs=10, verbose=0, random_state=42)
    etc1 = ETC(n_estimators=500, max_features=90, criterion='entropy',
               min_samples_split=20, max_depth=25, min_samples_leaf=10,
               n_jobs=10, verbose=0, random_state=42)
    xgb1 = XGC(xg_class_params, num_rounds=550)
    xgb2 = XGC(xg_class_params2, num_rounds=600)
    xgb_bag = bagger(xgb2, num_bags=3, bag_fraction=0.75)

    # EVALUATE a model
    score = crossValidate(etc1, x, y, folds=5, runs=1)
Example 11
def learn_total():

    clf = RF(n_estimators=200,
             max_features="auto",
             max_depth=8,
             min_samples_split=10,
             min_samples_leaf=2,
             n_jobs=3,
             oob_score=True,
             random_state=728)  # max_depth=8 worked best
    #clf = GBDT(n_estimators=100,max_features="auto",max_depth=8,min_samples_split=10,min_samples_leaf=2,verbose=3)

    rd = 500 * 1000

    train = load_data_total("train", rd)

    train_label = load_label("train")
    train_label = train_label[:len(train)]

    train_label = np.array(train_label)

    print("train_label", len(train_label), "train", len(train))
    print("number of train features:", len(train[0]))

    print("learn")

    clf.fit(train, train_label)

    return clf
Example 12
def main():
    data_dir = '../data/taobao/'
    #data_dir = '../data/amazon/'
    item_path = data_dir + 'dim_items.txt'
    sub_item_path = data_dir + 'match_item.txt'
    train_pair_path = data_dir + 'train_set_1to1.txt'
    pg = PairGenerator(item_path, sub_item_path, train_pair_path)

    print('Preparing data...', time.ctime())
    label = '1'
    if data_dir == '../data/amazon/':
        label = '0'
    (train_data, test_data) = pg.fetch_all_topic(label, 0.2)
    train_x = np.array(train_data['pair_in'])
    train_y = np.array(train_data['pair_out'])
    test_num = len(test_data['pair_in'])
    test_upper = int(test_num * 0.5)
    test_x = np.array(test_data['pair_in'][test_upper:])
    test_y = np.array(test_data['pair_out'][test_upper:])
    print('Start training...', time.ctime())
    #C = NB()
    C = RF(verbose=1, n_jobs=2)
    C.fit(train_x, train_y)
    test_y_pred = C.predict_proba(test_x)
    test_y_pred = [result[1] for result in test_y_pred]
    # pickle requires a binary-mode file handle
    with open(data_dir.split('/')[-2] + '_lda_pred.dat', 'wb') as f:
        pickle.dump((test_y, test_y_pred), f)
    line = 'AUC: %s' % (metrics.roc_auc_score(test_y, test_y_pred) * 100)
    print(line)
    line = 'Ground Pos: %s, Predict Pos: %s' % (int(
        np.sum(test_y)), int(np.sum(test_y_pred)))
    print(line)
Example 13
def features_imp(df, target):
    from sklearn.ensemble import RandomForestRegressor as RF
    # inject random "probe" columns; real features should outrank them
    df['RAND_bin'] = np.random.randint(2, size=len(df[target]))
    df['RAND_uniform'] = np.random.uniform(0, 1, len(df[target]))
    df['RAND_int'] = np.random.randint(100, size=len(df[target]))
    columns = df.drop(target, axis=1).columns.tolist()
    estimator = RF(n_estimators=50)
    estimator.fit(df[columns], df[target])
    y_pred = estimator.predict(df[columns])
    baseline = MAE(df[target], y_pred)
    imp = []
    for col in columns:
        # permutation importance: shuffle one column, measure the MAE change
        col_imp = []
        for n in range(3):
            save = df[col].copy()
            df[col] = np.random.permutation(df[col])
            y_pred = estimator.predict(df[columns])
            m = MAE(df[target], y_pred)
            df[col] = save
            col_imp.append(baseline - m)
        imp.append(np.mean(col_imp))
    FI = DataFrame([])
    FI['feature'] = columns
    # baseline - m is negative when shuffling hurts, so negate it
    FI['value'] = -np.array(imp)
    FI = FI.sort_values('value', ascending=False).reset_index(drop=True)
    # a feature counts as important if it beats the best random probe
    # by more than one standard deviation of the probes' scores
    rand_mask = FI['feature'].isin(['RAND_bin', 'RAND_int', 'RAND_uniform'])
    M = FI[rand_mask]['value'].max()
    S = FI[rand_mask]['value'].std()
    threshold = M + S
    FI['important'] = FI['value'] > threshold
    return FI
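A quick smoke test on synthetic data (assuming MAE and DataFrame are the usual scikit-learn/pandas aliases used above); the injected RAND_* probes should rank below the real feature:

# illustrative usage on synthetic data
import numpy as np
import pandas as pd
from pandas import DataFrame
from sklearn.metrics import mean_absolute_error as MAE

rng = np.random.RandomState(0)
df = pd.DataFrame({'signal': rng.uniform(size=500)})
df['y'] = 3 * df['signal'] + rng.normal(scale=0.1, size=500)
fi = features_imp(df, 'y')
print(fi)  # 'signal' should outrank the RAND_* probe columns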
Example 14
    def RF_First(self, data, n_estimators=400, max_features='auto'):
        # Train on the training data, then return predictions for the
        # validation and prediction sets
        model = RF(n_estimators=n_estimators, max_features=max_features)
        model.fit(data['train'][:, :-1], data['train'][:, -1])
        # note: validation-set and prediction-set results are stored separately
        # predictions on the training set
        xul = model.predict(data['train'][:, :-1])
        # predictions on the validation set
        yanre = model.predict(data['test'][:, :-1])
        # predictions on the prediction set
        prer = model.predict(data['predict'][:, :-1])
        # store them
        self.yanzhneg_pr.append(yanre)
        self.predi.append(prer)
        # after every fold, recompute the training, validation and
        # prediction errors
        xx = self.RMSE(xul, data['train'][:, -1])
        yy = self.RMSE(yanre, data['test'][:, -1])
        pp = self.RMSE(prer, data['predict'][:, -1])
        # store the errors ('随机森林' = 'random forest')
        self.error_dict['随机森林'] = [xx, yy, pp]

        # true outputs of the validation set
        self.yanzhneg_real = data['test'][:, -1]

        # true outputs of the prediction set
        self.preal = data['predict'][:, -1]

        print('Random forest in layer 1 finished')
Example 15
def opt_model_RF(X, y):
    parameters = opt_RF(X, y)
    # clamp parameters to integers >= 2 (a bare map() would not be
    # subscriptable under Python 3)
    parameters = [int(i) if i > 2 else 2 for i in parameters]
    rf = RF(max_depth=parameters[0], min_samples_split=parameters[1],
            min_samples_leaf=parameters[2], n_estimators=100,
            class_weight='balanced', n_jobs=3, max_features="auto",
            oob_score=True)
    rf.fit(X, y)
    return rf
Example 16
    def RF_First(self, data, n_estimators=800, max_features='sqrt'):
        # Train on the training data, then return predictions for the
        # validation and prediction sets
        model = RF(n_estimators=n_estimators, max_features=max_features)
        model.fit(data['train'][:, :-1], data['train'][:, -1])
        # store validation-set and prediction-set results separately
        # predictions on the training set
        xul = model.predict(data['train'][:, :-1])
        # predictions on the validation set
        yanre = model.predict(data['test'][:, :-1])
        # predictions on the prediction set
        prer = model.predict(data['predict'][:, :-1])

        # after every fold, recompute the training, validation and
        # prediction errors
        xx = self.F1(xul, data['train'][:, -1])

        yy = self.F1(yanre, data['test'][:, -1])

        pp = self.F1(prer, data['predict'][:, -1])

        # collect the results
        self.yanzhneg_pr.append(yanre)
        self.yanzhneg_real = data['test'][:, -1]
        self.predi.append(prer)
        self.preal = data['predict'][:, -1]

        # store the errors ('随机森林' = 'random forest')
        self.error_dict['随机森林'] = [xx, yy, pp]
        print('Random forest in layer 1 finished')
Example 17
def main():
    train = pd.read_csv('criminal_train.csv')
    test = pd.read_csv('criminal_test.csv')
    print(train.dtypes)
    train, test = ObjectVariableRectification(train, test)
    y = np.array(train['Criminal'], dtype=float)
    X = np.array(train.drop(['Criminal', 'PERID'], axis=1), dtype=float)
    assert X.shape[0] == y.shape[0]
    print('-----------------Training------------------\n')
    clf = RF(n_estimators=80, max_depth=80)
    clf.fit(X, y)
    print(clf.score(X, y))
    print('\n')
    X_train = np.array(test.drop(['PERID'], axis=1), dtype=float)
    # note: a bracketed assert builds a (truthy) list and never fires
    assert X.shape[1] == X_train.shape[1]
    print('----------------Predicting-----------------\n')
    predictions = np.array(clf.predict(X_train), dtype=int)
    print('---------------WRITING THE FILE------------\n')
    with open('MySubmissions.csv', 'a+') as filePtr:
        filePtr.write('PERID,Criminal\n')
        for i in range(X_train.shape[0]):
            filePtr.write('{},{}\n'.format(test['PERID'][i], predictions[i]))
    print('----------FILE SUCCESSFULLY WRITTEN---------\n')
Example 18
def sup_predict(train, test, delay=delay, known_nodes=known_nodes):

    days, series = train.shape
    valdays = test.shape[0]

    series_sup_accs = []
    results = np.zeros((series - known_nodes, valdays - delay))

    for s in range(known_nodes, series):

        sys.stdout.write("\r Supervised prediction for series %s of %s" %
                         (str(s), str(series)))
        timeseries_train = train[:, s]
        timeseries_test = test[:, s]
        delay_trainset = delay_maker(timeseries_train, delay)
        delay_testset = delay_maker(timeseries_test, delay)

        model = RF(n_estimators=100, random_state=rs)
        model = model.fit(delay_trainset[:, :-1], delay_trainset[:, -1])
        y_pred = model.predict(delay_testset[:, :-1])
        results[s - known_nodes, :] = y_pred
        series_sup_accs.append(accuracy_score(delay_testset[:, -1], y_pred))

    print()

    return np.array(series_sup_accs), results.T
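delay_maker is not shown; a plausible sketch, assuming it builds a lagged design matrix whose last column is the value to predict:

import numpy as np

def delay_maker(timeseries, delay):
    # rows are [x_t, ..., x_{t+delay-1}, x_{t+delay}]; the last column is the target
    rows = [timeseries[i:i + delay + 1] for i in range(len(timeseries) - delay)]
    return np.array(rows)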
Example 19
def profit_curve_main(filepath, cost_benefit):
    """Main function to test profit curve code.

    Parameters
    ----------
    filepath     : str - path to find churn.csv
    cost_benefit  : ndarray - 2D, with profit values corresponding to:
                                          -----------
                                          | TP | FP |
                                          -----------
                                          | FN | TN |
                                          -----------
    """
    X_train, X_test, y_train, y_test = get_train_test(filepath)
    models = [RF(), LR(), GBC(), SVC(probability=True)]
    model_profits = []
    for model in models:
        profits, thresholds = get_model_profits(model, cost_benefit, X_train,
                                                X_test, y_train, y_test)
        model_profits.append((model, profits, thresholds))
    plot_model_profits(model_profits)
    max_model, max_thresh, max_profit = find_best_threshold(model_profits)
    # threshold on the positive-class probability column
    max_labeled_positives = max_model.predict_proba(X_test)[:, 1] >= max_thresh
    proportion_positives = max_labeled_positives.mean()
    reporting_string = ('Best model:\t\t{}\n'
                        'Best threshold:\t\t{:.2f}\n'
                        'Resulting profit:\t{}\n'
                        'Proportion positives:\t{:.2f}')
    print(reporting_string.format(max_model.__class__.__name__, max_thresh,
                                  max_profit, proportion_positives))
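An illustrative call, assuming a churn.csv on disk; the cost-benefit matrix follows the TP/FP/FN/TN layout described in the docstring:

import numpy as np

cost_benefit = np.array([[6, -3],
                         [0, 0]])  # e.g. a TP is worth 6, an FP costs 3
profit_curve_main('churn.csv', cost_benefit)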
Example 20
def train_predict(train_file, test_file, predict_valid_file, predict_test_file,
                  n_est, depth, n_fold=5):

    feature_name = os.path.basename(train_file)[:-4]
    logging.basicConfig(format='%(asctime)s   %(levelname)s   %(message)s',
                        level=logging.DEBUG,
                        filename='rf_{}_{}_{}.log'.format(n_est,
                                                          depth,
                                                          feature_name))


    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    clf = RF(n_estimators=n_est, max_depth=depth, random_state=2015)

    cv = StratifiedKFold(y, n_folds=n_fold, shuffle=True, random_state=2015)

    logging.info('Cross validation...')
    p_val = np.zeros_like(y)
    for i_trn, i_val in cv:
        clf.fit(X[i_trn], y[i_trn])
        p_val[i_val] = clf.predict_proba(X[i_val])[:, 1]

    logging.info('AUC = {:.4f}'.format(AUC(y, p_val)))

    logging.info('Retraining with 100% data...')
    clf.fit(X, y)
    p_tst = clf.predict_proba(X_tst)[:, 1]

    logging.info('Saving predictions...')
    np.savetxt(predict_valid_file, p_val, fmt='%.6f')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f')
Example 21
def train(train_x, train_y, test_x, test_y, algo, hyperparams, cv=3):
    if algo == 'SVM':
        model = GridSearchCV(SVC(), hyperparams, cv=cv)
        #model = SVC(C=C, kernel=kernel, degree=degree, tol=tol)

    elif algo == 'RF':
        model = GridSearchCV(RF(), hyperparams, cv=cv)
        #model = RF(n_estimators=n_estimators, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf)

    else:
        raise ValueError("algo must be 'SVM' or 'RF'")

    print('Fitting Model and Tuning Hyperparameters with GridSearch using {}-fold cross-validation...'.format(cv))
    model.fit(train_x, train_y)

    best_params = model.best_params_
    print('Best Parameters Found: {}'.format(best_params))

    best_score = model.best_score_
    print('Mean cross-validated score of the best_estimator: {}'.format(best_score))
    
    print('Getting Predictions...')
    train_predictions = model.predict(train_x)
    test_predictions = model.predict(test_x)

    train_accuracy = accuracy_score(train_y, train_predictions)
    print('Train Set Accuracy: {}'.format(train_accuracy))

    test_accuracy = accuracy_score(test_y, test_predictions)
    print('Test Set Accuracy: {}'.format(test_accuracy))
    
    print('Test Set Classification Report')
    test_report = classification_report(test_y, test_predictions)
    print(test_report)
    return test_accuracy, test_predictions
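The hyperparams argument is a plain parameter grid; illustrative values (not from the source) might look like:

# illustrative hyperparameter grids for the two supported algorithms
rf_grid = {'n_estimators': [100, 300],
           'min_samples_split': [2, 10],
           'min_samples_leaf': [1, 4]}
svm_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}

test_acc, test_preds = train(train_x, train_y, test_x, test_y, 'RF', rf_grid, cv=3)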
Example 22
File: bovw.py Project: apacha/mscr
 def __init__(self, feat=SURF(), cls=RF(n_estimators=40), verbose=True):
     self._ft = Features(feat)
     self._da = Data(self._ft)
     self._fm = FitBoVW(cls)
     self._verbose = verbose
     self._cl = cls
     self._vq = None
 def classify_using_random_sampling(self, X_train, X_test, y_train, y_test, portion_of_sampled_dataset_vector, classifiers_for_experiments):
     psa = PSA()
     # ---- settings:
     number_of_runs_for_random_sampling = 20
     # ---- Experimenting:
     recognition_rate_LIST = np.zeros((len(classifiers_for_experiments), len(portion_of_sampled_dataset_vector)))
     classifier_index = 0
     for classifier in classifiers_for_experiments:
         print('############### Classifier: ' + classifier)
         portion_index = 0
         for portion_of_sampled_dataset in portion_of_sampled_dataset_vector:
             print('###### Portion of sampled dataset: ' + str(portion_of_sampled_dataset * 100) + '%')
             # ---- data reduction with random sampling:
             recognition_rate_with_random_sampling = [None] * number_of_runs_for_random_sampling
             for run_index in range(number_of_runs_for_random_sampling):
                 shuffled_samples = self.shuffle_samples_randomly(X=X_train, y=y_train)  # shuffle samples of classes randomly
                 # ---- data reduction:
                 number_of_classes = len(shuffled_samples)
                 n_samples = []
                 for class_index in range(number_of_classes):
                     number_of_samples_of_class = shuffled_samples[class_index].shape[0]
                     n_samples.append(int(number_of_samples_of_class * portion_of_sampled_dataset))
                 X, y = psa.reduce_data(sorted_samples=shuffled_samples, n_samples=n_samples)
                 # ---- report number of sampled data after PSA:
                 if run_index == 0:  # only report once in the multiple runs
                     print('number of sampled data in classes, after random sampling: ' + str(n_samples))
                 # ---- classify with random sampling:
                 if classifier == 'SVM':
                     # --------- train:
                     clf = SVC(kernel='linear')
                     clf.fit(X=X, y=y)
                 elif classifier == 'LDA':
                     # --------- train:
                     clf = LDA()
                     clf.fit(X=X, y=y)
                 elif classifier == 'QDA':
                     # --------- train:
                     clf = QDA()
                     clf.fit(X=X, y=y)
                 elif classifier == 'Random Forest':
                     # --------- train:
                     clf = RF(max_depth=2, random_state=0)
                     clf.fit(X=X, y=y)
                 elif classifier == 'Logistic Regression':
                     # --------- train:
                     clf = LR()
                     clf.fit(X=X, y=y)
                 elif classifier == 'Gaussian Naive Bayes':
                     # --------- train:
                     clf = GaussianNB()
                     clf.fit(X=X, y=y)
                 # --------- test:
                 labels_predicted = clf.predict(X_test)
                 recognition_rate_with_random_sampling[run_index] = (sum(labels_predicted == y_test) / len(labels_predicted)) * 100
             recognition_rate_with_random_sampling_average = np.mean(recognition_rate_with_random_sampling)
             print('The recognition rate using ' + classifier + ' with data number reduction (random sampling): ' + str(recognition_rate_with_random_sampling_average))
             recognition_rate_LIST[classifier_index, portion_index] = recognition_rate_with_random_sampling_average
             portion_index += 1
         classifier_index += 1
     return recognition_rate_LIST
Example 24
    def get_classifier(self, params):

        if self.learner_name == 'L1':
            self.ind_params = {
                'class_weight': 'balanced',
                'solver': 'liblinear',
                'penalty': 'l1'
            }
            joint_params = self.ind_params.copy()
            joint_params.update(params)
            print(joint_params)
            clf = LogisticRegression(**joint_params)
            self.space4classifier = {'C': hp.loguniform('C', -10, 10)}

        if self.learner_name == 'L2':
            self.ind_params = {
                'class_weight': 'balanced',
                'solver': 'liblinear',
                'penalty': 'l2'
            }
            joint_params = self.ind_params.copy()
            joint_params.update(params)
            print(joint_params)
            clf = LogisticRegression(**joint_params)
            self.space4classifier = {'C': hp.loguniform('C', -5, 5)}

        if self.learner_name == 'SVM':

            n_obs = len(self.y)
            n_pos, n_neg = np.sum(self.y), n_obs - np.sum(self.y)
            pos_weight = n_obs / 2.0 / n_pos
            neg_weight = n_obs / 2.0 / n_neg

            self.ind_params = {'class_weight': {0: neg_weight, 1: pos_weight}}
            joint_params = self.ind_params.copy()
            joint_params.update(params)
            joint_params.update({'probability': True})
            clf = svm.SVC(**joint_params)

        if self.learner_name == 'RF':
            self.ind_params = {'class_weight': 'balanced', 'n_jobs': 20}
            joint_params = self.ind_params.copy()
            joint_params.update(params)
            clf = RF(**joint_params)

        if self.learner_name == 'XGB':

            n_obs = len(self.y)
            n_pos, n_neg = np.sum(self.y), n_obs - np.sum(
                self.y)  # calculate weights for the pos/neg classes
            self.ind_params = {
                'objective': 'reg:logistic',
                'scale_pos_weight': n_pos / n_neg * 1.0
            }
            joint_params = self.ind_params.copy()
            joint_params.update(params)
            clf = xgb.XGBClassifier(**joint_params)

        return clf
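get_classifier pairs each learner with a hyperopt search space; a hypothetical driver for the RF branch, assuming an instance `tuner` of the surrounding class and training data X, y:

# hypothetical hyperopt driver for the RF branch
from hyperopt import fmin, tpe, hp
from sklearn.model_selection import cross_val_score

space = {'n_estimators': hp.choice('n_estimators', [100, 300, 500])}

def loss(params):
    clf = tuner.get_classifier(params)  # assumes tuner.learner_name == 'RF'
    return -cross_val_score(clf, X, y, cv=3, scoring='roc_auc').mean()

best = fmin(fn=loss, space=space, algo=tpe.suggest, max_evals=25)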
Example 25
def new_rf():
    args = {"max_depth": 200,
            "random_state": 0,
            "n_estimators": 49,
            "class_weight": "balanced_subsample",
            # "max_features": None,
            }
    return RF(**args)
Example 26
 def objective_rf(self, trial):
     # trial.suggest_int already returns an int, so extra casts are unnecessary
     model = RF(n_estimators=trial.suggest_int('rf_n_estimators', 1, 100 + 1),
                max_depth=trial.suggest_int('rf_max_depth', 2, 32 + 1),
                max_leaf_nodes=trial.suggest_int('rf_max_leaf', 2, 40 + 1),
                min_samples_split=trial.suggest_int('rf_min_samples_split',
                                                    2, 10 + 1))
     return model
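objective_rf only builds a model from a trial; a hypothetical outer objective that scores it and drives an Optuna study, again assuming an instance `tuner` and data X, y:

# hypothetical wiring into an Optuna study
import optuna
from sklearn.model_selection import cross_val_score

def objective(trial):
    model = tuner.objective_rf(trial)  # build an RF from the sampled params
    return cross_val_score(model, X, y, cv=3).mean()

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)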
Example 27
 def calculateCrossVad(self, labels, subtrain):
     X_train, X_test, y_train, y_test = cross_validation.train_test_split(
         subtrain, labels, test_size=0.1)
     # print X_test, y_test
     srf = RF(n_estimators=500, n_jobs=-1)
     srf.fit(X_train, y_train)
     score = srf.score(X_test, y_test)
     return score
Example 28
 def _model(self):
     '''
     First iteration will be a random forest.
     '''
     # Init model
     self.model = RF(n_estimators=10)
     X, y = self._splitter()
     self.model.fit(X, y)
Example 29
File: rf.py Project: j6e/hyperband
def try_params(n_iterations, params, data):
    n_estimators = int(round(n_iterations * trees_per_iteration))
    print("n_estimators:", n_estimators)
    pprint(params)

    clf = RF(n_estimators=n_estimators, verbose=0, n_jobs=-1, **params)

    return train_and_eval_sklearn_regressor(clf, data)
Example 30
def RF_Classifier(data_train, labels_train, num_estimators, max_features,
                  oob_score, n_jobs):
    random_forest = RF(n_estimators=num_estimators,
                       max_features=max_features,
                       oob_score=oob_score,
                       n_jobs=n_jobs)
    random_forest.fit(data_train, labels_train)
    return random_forest
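A minimal call with illustrative arguments (data_train and labels_train assumed):

model = RF_Classifier(data_train, labels_train, num_estimators=100,
                      max_features='sqrt', oob_score=True, n_jobs=-1)
print(model.oob_score_)  # available because oob_score=True was passed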
Example 31
    try:
        for ne in range(nb_exp):
            print('exp num:', ne)
            X, y = sh(X, y)

            X_train = X[:n_samples_train, :]
            X_test = X[n_samples_train:(n_samples_train + n_samples_test), :]
            y_train = y[:n_samples_train]
            y_test = y[n_samples_train:(n_samples_train + n_samples_test)]

            # training only on normal data:
            X_train = X_train[y_train == 0]
            y_train = y_train[y_train == 0]

            print('RF processing...')
            model = RF()
            tstart = time()

            # the lower, the more normal (fit_predict here is a custom
            # benchmark wrapper, not the stock sklearn RF API):
            scoring = model.fit_predict(X_train, y_train, X_test, y_test)

            fit_predict_time += time() - tstart
            fpr_, tpr_, thresholds_ = roc_curve(y_test, scoring)

            f = interp1d(fpr_, tpr_)
            tpr += f(x_axis)
            tpr[0] = 0.

            precision_, recall_ = precision_recall_curve(y_test, scoring)[:2]

            # cluster: old version of scipy -> interp1d needs sorted x_input