def classify_fold(classifier_args, X_train, y_train, X_test, fold_index, args, runs_dir):
    classifier_name = args[0]
    input_dim = X_train.shape[1]
    num_class = len(np.unique(y_train))
    # runs_dir is a dir to put the training log of the classifier
    clf = get_classifier(classifier_args, args, runs_dir)
    # use_class_weight_to_balance is assumed to be a module-level flag
    if not use_class_weight_to_balance:
        X_train, y_train = balance_data(X_train, y_train)
    tick = time.time()
    clf.fit(X_train, y_train)
    tock = time.time()
    print("training time = {0:.0f}".format(tock - tick))
    tick = time.time()
    if classifier_name in ['cnn', 'softmax']:
        pred = clf.predict(X_test, eval_mode=True)
    else:
        pred = clf.predict(X_test)
    tock = time.time()
    duration = tock - tick
    print("Predicted data of size ", X_test.shape, " in {0:.2f} sec.".format(duration))
    print('Done with fold-{}'.format(fold_index))
    return pred, duration
def process_fold(classifier_args, X_train, y_train, fold_index, args, runs_dir):
    classifier_name = args[0]
    input_dim = X_train.shape[1]
    num_class = len(np.unique(y_train))
    # runs_dir is a dir to put the training log of the classifier
    clf = get_classifier(classifier_args, args, runs_dir)
    # unique, counts = np.unique(y_train, return_counts=True)
    if not use_class_weight_to_balance:
        X_train, y_train = balance_data(X_train, y_train)
    # unique, counts = np.unique(y_train, return_counts=True)
    tick = time.time()
    clf.fit(X_train, y_train)
    with open(runs_dir, 'wb') as f:
        pickle.dump(clf, f)
    tock = time.time()
    duration = tock - tick
    print("Trained data of size ", X_train.shape, " in {0:.2f} min.".format(duration / 60))
    print('Done with fold-{}'.format(fold_index))
    # feat_names is assumed to be defined at module level
    importances = [
        importance
        for feature_name, importance in zip(feat_names, clf.feat_importances)
    ]
    return importances
def test_equal_binary(self):
    """Tests if binary data have balanced class size."""
    x_seq = [np.array([1, 2, 3]), np.array([1]), np.array([2]),
             np.array([1, 3]), np.array([3]), np.array([1, 3]),
             np.array([1, 2, 3]), np.array([2])]
    y_label = np.array([1, 0, 1, 1, 0, 1, 0, 1])
    x_seq, y_label = utils.balance_data(x_seq, y_label)
    self.assertEqual(sum(y_label) * 2, len(y_label))
def test_equal_with_size(self):
    """Tests if binary data have balanced class size with specified size."""
    x_seq = [np.array([1, 2, 3]), np.array([1]), np.array([2]),
             np.array([1, 3]), np.array([3]), np.array([1, 3]),
             np.array([1, 2, 3]), np.array([2])]
    y_label = np.array([1, 0, 1, 1, 0, 1, 0, 1])
    x_seq, y_label = utils.balance_data(x_seq, y_label, class_size=2)
    self.assertEqual(np.sum(y_label == 0), 2)
    self.assertEqual(np.sum(y_label == 1), 2)
def test_equal_multi(self):
    """Tests if multi-class data have balanced class size."""
    x_seq = [np.array([1, 2, 3]), np.array([1]), np.array([2]),
             np.array([1, 3]), np.array([3]), np.array([1, 3]),
             np.array([1, 2, 3]), np.array([2])]
    y_label = np.array([1, 0, 2, 1, 0, 2, 0, 1])
    x_seq, y_label = utils.balance_data(x_seq, y_label)
    self.assertEqual(np.sum(y_label == 0), 2)
    self.assertEqual(np.sum(y_label == 1), 2)
    self.assertEqual(np.sum(y_label == 2), 2)
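# A minimal sketch of what the tests above pin down about utils.balance_data: every class is
# undersampled to the smallest class count, or to an explicit class_size when one is given.
# The project's actual implementation may differ; balance_data_sketch is an illustrative name.
import numpy as np

def balance_data_sketch(x_seq, y_label, class_size=None):
    """Undersample each class to class_size (default: the smallest class count)."""
    y_label = np.asarray(y_label)
    classes, counts = np.unique(y_label, return_counts=True)
    size = counts.min() if class_size is None else class_size
    keep = []
    for cls in classes:
        # Randomly keep `size` examples of this class.
        idx = np.flatnonzero(y_label == cls)
        keep.extend(np.random.choice(idx, size=size, replace=False))
    keep = np.sort(np.array(keep))
    return [x_seq[i] for i in keep], y_label[keep]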
def train_fold(X_train, y_train, fold_index, args, runs_dir):
    classifier_name = args['classifier_name']
    balance = args['balance']
    # runs_dir is a dir to put the training log of the classifier
    clf = get_classifier(args, runs_dir)
    if balance == 'explicit':
        print("explicit balancing data")
        X_train, y_train = balance_data(X_train, y_train)
    tick = time.time()
    clf.fit(X_train, y_train)
    if classifier_name in ['tree', 'forest']:
        with open(runs_dir, 'wb') as f:
            pickle.dump(clf, f)
    tock = time.time()
    duration = tock - tick
    print("Trained data of size {} for Fold-{} in {:.0f} min, {:.0f} sec".format(
        X_train.shape, fold_index, duration // 60, duration % 60))
    return clf, duration
def perform_single_knn(input_data):
    """Perform a single trial of knn with selected features."""
    # Extract inputs from the input tuple.
    features = input_data[0]
    labels = input_data[1]
    knn_num_neighbors = input_data[2]
    knn_weights = input_data[3]
    knn_algorithm = input_data[4]
    knn_metric = input_data[5]
    knn_imbalanced_data = input_data[6]
    test_size = input_data[7]
    # VERY IMPORTANT: provide a random state, since otherwise multiple workers appear to
    # split the data in the same way.
    random.seed()
    train_features, test_features, train_labels, test_labels = (
        model_selection.train_test_split(features,
                                         labels,
                                         test_size=test_size,
                                         random_state=random.randint(1, 99999999)))
    if not knn_imbalanced_data:
        # Manually balance the data. Don't do this on the whole data set; do it only on the
        # train set, so that it is balanced and the per-class precisions are comparable.
        # Do not balance the test set, because the test set should reflect the true
        # distribution of new points to predict.
        train_features, train_labels = utils.balance_data(train_features, train_labels)
    model = KNeighborsClassifier(n_neighbors=knn_num_neighbors,
                                 weights=knn_weights,
                                 algorithm=knn_algorithm,
                                 metric=knn_metric)
    model = model.fit(train_features, train_labels)
    predicted_labels = model.predict(test_features)
    label_values = [0, 1]
    trial_metrics = compute_evaluation_metrics(predicted_labels, test_labels, label_values)
    return trial_metrics
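# Hedged usage sketch: the seeding note in perform_single_knn suggests it is meant to be
# mapped across worker processes, one trial per input tuple. The driver below, the toy data,
# and the parameter values are illustrative assumptions, not part of the original code.
import multiprocessing

import numpy as np

def run_knn_trials(features, labels, num_trials=8):
    # One tuple per trial, matching the positional layout perform_single_knn unpacks:
    # (features, labels, n_neighbors, weights, algorithm, metric, imbalanced_data, test_size)
    trial_inputs = [(features, labels, 5, 'distance', 'auto', 'euclidean', False, 0.2)
                    for _ in range(num_trials)]
    with multiprocessing.Pool() as pool:
        return pool.map(perform_single_knn, trial_inputs)

# Example with toy data:
# features = np.random.rand(200, 10)
# labels = np.random.randint(0, 2, size=200)
# metrics_per_trial = run_knn_trials(features, labels)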
def train_and_save_classifier(X_train, y_train, args):
    classifier_name = args['classifier_name']
    balance = args['balance']
    # args['runs_dir'] is a dir to put the training log of the classifier
    clf = get_classifier(args)
    if balance == 'explicit':
        tick = time.time()
        X_train, y_train = balance_data(X_train, y_train)
        tock = time.time()
    tick = time.time()
    print("Shuffling data")
    X_train, y_train = shuffle(X_train, y_train)
    print('fitting model')
    clf.fit(X_train, y_train)
    if classifier_name in ['tree', 'forest']:
        with open(join(args['runs_dir'], 'model.pkl'), 'wb') as f:
            pickle.dump(clf, f)
    tock = time.time()
    duration = tock - tick
    print("Trained data of size {} in {:.0f} min, {:.0f} sec".format(
        X_train.shape, duration // 60, duration % 60))
    return
        if dp['train_test'] == 'train':
            train.append(dh)
        elif dp['train_test'] == 'test':
            test.append(dh)

##### Bundling data #####
print('bundling data')
trainfile = expdir + 'traindata.csv'
if len(train) > 0:
    dh_train = utils.bundle_data(train, trainfile)
else:
    dh_train = datahandler.Datahandler()
    dh_train.set(trainfile)
if 'balance' in cp.sections():
    print('balancing data')
    dh_train = utils.balance_data(dh_train, cp['balance']['outfile'])
train_dataset = dh_train.dataset

testfile = expdir + 'testdata.csv'
if len(test) > 0:
    dh_test = utils.bundle_data(test, testfile)
    test_dataset = dh_test.dataset
else:
    try:
        dh_test = datahandler.Datahandler()
        dh_test.set(testfile)
        test_dataset = dh_test.dataset
    except:
        test_dataset = False

##### Sampling data #####
        return self

    def predict(self, test_x):
        pre_y = self.model.predict(test_x, batch_size=256)
        return pre_y


if __name__ == '__main__':
    # Three inputs: the loan, know and attribute matrices; they are combined and then used for training.
    Y = np.load('embedding_y.npy')
    X_attr = np.load('embedding_x_attr.npy')
    X_loan = load_file('embeding_matrix', tail='loan')
    X_chaxun = load_file('embeding_matrix', tail='chaxun')
    Y = np.where(Y == 'good', 1, 0)
    X_train1, X_test1, X_train2, X_test2, X_train3, X_test3, y_train_or, y_test = train_test_split(
        X_attr, X_loan, X_chaxun, Y, test_size=0.2, random_state=4, shuffle=False)
    X_train1, y_train = balance_data(X_train1, y_train_or)
    X_train2, _ = balance_data(X_train2, y_train_or)
    X_train3, _ = balance_data(X_train3, y_train_or)
    X_train1, X_test1 = normal(X_train1, X_test1)
    X_train2, X_test2 = normal(X_train2, X_test2)
    X_train3, X_test3 = normal(X_train3, X_test3)
    # print(X_train1.shape)
    # X_train1, y_train = up_sample(X_train1, y_train_or, 2)
    # X_train2, _ = up_sample(X_train2, y_train_or, 2)
    # X_train3, _ = up_sample(X_train3, y_train_or, 2)
    # print(X_train1.shape, X_train2.shape, X_train3.shape)
    y_train = to_categorical(y_train, 2)
    y_test = to_categorical(y_test, 2)
    # Feed the three inputs into the model.
    model = Model_Graph([X_train1, X_train2, X_train3], y_train)
    model.mutilin_model()
    if paths[i][-1] == '2':
        continue
    utils.fixPath(paths[i] + '/IMG/', paths[i] + '/', append=app)
    app = True

data = pd.read_csv('../../dataset/data/driving_log_clean.csv')

# Shuffling the data
data = data.sample(frac=1).reset_index(drop=True)

# Reading the data from the pandas dataframe
X = data[['center', 'left', 'right']].values
y = data['steering'].values

# Balancing the dataset, and dropping some of the common examples
X, y = utils.balance_data(X, y)

# Some information about the data after balancing
print("Full Data Size: ", data.shape)
print("Data After Balancing: ", len(X))

# Splitting the data: (see SPLIT under SOME CONSTANTS)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=SPLIT, random_state=0)

# Some information about the data after splitting
print("Splitting with split rate: ", SPLIT)
print("Training Data Size: ", X_train.shape, y_train.shape)
print("Validation Data Size: ", X_valid.shape, y_valid.shape)

# Freeing the memory held by the raw dataframe
data = None
def train():
    clf = request.form['train']
    if allowed_classifier(clf):
        string = 'train'
        hist_n = string + "hist.jpeg"
        cnmt_n = string + "cnmt.jpeg"
        pkl_hnd = store(app.config['static_path'], app.root_path)

        # Feature extraction
        data = utils.file_parser(os.path.join(app.config['upload_path'], "data.txt"))
        features = utils.feature_extractor(data['text'], 5000).todense()
        sh = data.shape

        # Preprocessing features and labels
        data_x = utils.preprocess_features(features, 2500)
        data_y, enc = utils.label_encoder(data['label'], False, None)
        pkl_hnd.dump(enc, 'enc')  # storing the encoder

        # Splitting data into training set and validation set
        train_x, train_y, valid_x, valid_y = utils.train_valid(data_x, data_y, 0.2)

        # Balancing data with SMOTE
        text, label = utils.balance_data(train_x, train_y)

        # Selecting model and tuning hyperparameters
        tr = model(clf, text[:sh[0], :], label[:sh[0]], valid_x, valid_y)
        comb_mod = tr.model_selection()

        # Fitting model and predicting
        mod = tr.build_model(comb_mod)
        pkl_hnd.dump(mod, 'model')  # storing the model
        pr = predict_model(valid_x)
        pred = pr.predict_model(mod)

        # Training statistics
        st = stats(pred, valid_y)
        acc, f1 = st.train_stats()

        # Plotting histogram and confusion matrix
        pkl_hnd.plot_hist(data['label'], hist_n)
        n_labels = np.unique(np.asarray(data['label']))
        pkl_hnd.dump(n_labels, 'n_labels')  # storing labels
        cnf_matrix = st.cnf_mtx()
        pkl_hnd.plot_confusion_matrix(
            cnf_matrix,
            n_labels,
            cnmt_n,
            normalize=True,
            title='Confusion matrix',
            cmap=plt.cm.Blues,
        )
        return render_template("train_result.html",
                               accuracy=acc,
                               img_hist=url_for(app.config['static_path'], filename=hist_n),
                               img_cfmt=url_for(app.config['static_path'], filename=cnmt_n),
                               f1=f1)
    else:
        flash('Please enter a valid classifier')
        return redirect(url_for('index'))
def main():
    # Repeat and get more samples than needed until we can create fully balanced test and
    # train sets. We might not be able to otherwise, since labels falling on the quantiles
    # get assigned to the left class, making the sample unbalanced.
    oversample_rate = 1.1
    while True:
        add_log_vars = True
        (train_features, train_labels, test_features, test_labels, class_values,
         feature_names, label_name) = utils.prepare_data(
             args.input_filename, args.label_column,
             int(args.train_size * oversample_rate),
             int(args.test_size * oversample_rate), add_log_vars)
        all_labels = np.concatenate([train_labels, test_labels])
        # Change labels to equally sized classes, using all labels in case the minimum
        # falls in test.
        quantiles = scipy.stats.mstats.mquantiles(
            all_labels, np.arange(0, 1, 1.0 / args.num_classes))
        train_labels = np.digitize(train_labels, quantiles)
        test_labels = np.digitize(test_labels, quantiles)
        class_values = range(1, args.num_classes + 1)
        train_features, train_labels = utils.balance_data(
            train_features, train_labels, class_values, args.train_size)
        test_features, test_labels = utils.balance_data(
            test_features, test_labels, class_values, args.test_size)
        if train_features is None or test_features is None:
            oversample_rate += 0.1
            continue
        break

    all_labels = np.concatenate([train_labels, test_labels])
    print("Label is {}".format(label_name))
    print("Max of all labels: %d" % np.max(all_labels))
    print("Min of all labels: %d" % np.min(all_labels))

    if not args.skip_feature_selection:
        (train_features, test_features,
         feature_names) = feature_selection.L1SVMFeatureSelection(
             train_features, train_labels, test_features, feature_names,
             args.feature_selection_cost, args.threshold, args.num_jobs)

    model = models.train_logistic(train_features, train_labels, args.skip_cross_validation,
                                  args.multi_class, args.penalty, args.evaluation,
                                  args.num_jobs, args.cost)

    # Predict test and report full stats
    y_true = test_labels
    y_pred_prob = model.predict_proba(test_features)
    df = pd.DataFrame(data=y_pred_prob, columns=model.classes_)
    df['max_prob'] = df.max(axis=1)
    df['max_prob_class'] = df.idxmax(axis=1)
    df['true'] = y_true
    y_pred = df['max_prob_class']
    # TODO: if requested, choose the predicted values such that the class frequencies match
    # the expected class frequencies.

    print("\n*****************************\n")
    print('MAE on test: {}'.format(
        mean_absolute_error(y_true, y_pred, multioutput='uniform_average')))
    print('Test Accuracy: {}'.format(accuracy_score(y_true, y_pred) * 100.))
    print('Classification report:')
    print(classification_report(y_true, y_pred, labels=class_values))
    print('Weighted Precision Recall:')
    print(precision_recall_fscore_support(y_true, y_pred, labels=class_values,
                                          pos_label=None, average='weighted'))
    print('Unweighted Precision Recall:')
    print(precision_recall_fscore_support(y_true, y_pred, labels=class_values,
                                          pos_label=None, average='macro'))

    # Print and plot confusion matrix
    print('Confusion Matrix Without Normalization')
    np.set_printoptions(precision=2)
    cm = confusion_matrix(y_true, y_pred, labels=class_values)
    print(cm)
    print('Confusion Matrix With Normalization')
    cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    print(cm_normalized)

    plt.figure()
    plt.subplot(2, 1, 1)
    utils.plot_confusion_matrix(cm, class_values, 'Unnormalized confusion matrix')
    # Normalize the confusion matrix by row (i.e. by the number of samples in each class)
    plt.subplot(2, 1, 2)
    utils.plot_confusion_matrix(cm_normalized, class_values, 'Normalized confusion matrix')
    pdf = PdfPages(args.output_filename + '.pdf')
    plt.savefig(pdf, format='pdf')
    pdf.close()

    # Now print stats on subsets based on confidence of max_prob_class. Sort predictions
    # by confidence in descending order and take subsets from the top of the sorted df.
    df = df.sort_values(by='max_prob', ascending=False)
    print(','.join([
        'Probability Threshold', 'Percentage Predicted', 'Accuracy', 'AverageRecall',
        'AveragePrecision', 'AverageFscore'
    ]))
    for percent_to_predict in range(1, 100):
        lowest_idx = int(percent_to_predict * len(df.index) / 100.0)
        df_subset = df.iloc[0:lowest_idx]
        prob_threshold = df_subset['max_prob'].min()
        accuracy = accuracy_score(df_subset['true'], df_subset['max_prob_class'])
        # Compute macro precision/recall/fscore on the subset, not on the full test set.
        (precision, recall, fscore, support) = precision_recall_fscore_support(
            df_subset['true'], df_subset['max_prob_class'], labels=class_values,
            pos_label=None, average='macro')
        print(','.join(
            map(str, [
                prob_threshold, percent_to_predict, accuracy, recall, precision, fscore
            ])))
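# Hedged sketch of the plotting helper assumed above: utils.plot_confusion_matrix is not
# shown in this file. A minimal version consistent with how it is called (matrix, class
# labels, title), based on the common matplotlib recipe, could look like this; the real
# utility may differ.
import itertools

import matplotlib.pyplot as plt
import numpy as np

def plot_confusion_matrix_sketch(cm, classes, title):
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    # Annotate each cell with its value, switching text color on dark cells.
    thresh = cm.max() / 2.0
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], '.2g'),
                 horizontalalignment='center',
                 color='white' if cm[i, j] > thresh else 'black')
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()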