Code example #1
#     index = len(l) -1
#     # print(index)
#     feature_list_of_all_instances.append(l[0:index])
#     class_list_of_all_instances.append(l[index])

# total_matrix = []

gc.collect()

c = 0  # count positive instances
for i in class_list_of_all_instances:
    if i == 1:
        c += 1
print("Positive data:", c)

kf = StratifiedKFold(n_splits=5, shuffle=True)
under_sample = RandomUnderSampler()

print("Starting K fold data to Classifier   ...   ")
avg_roc = 0
for train_set_indexes, test_set_indexes in kf.split(
        feature_list_of_all_instances, class_list_of_all_instances):

    # temp_train_feature_list = feature_list_of_all_instances[train_set_indexes]
    # temp_train_class_list = class_list_of_all_instances[train_set_indexes]
    temp_train_feature_list = []
    temp_train_class_list = []
    for index in train_set_indexes:
        temp_train_feature_list.append(feature_list_of_all_instances[index])
        temp_train_class_list.append(class_list_of_all_instances[index])

    temp_test_feature_list = []
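    # (Sketch) The excerpt stops here. A plausible continuation, under the assumption
    # that a classifier is imported elsewhere: build the test-fold lists the same way,
    # undersample only the training fold, then fit and score.
    temp_test_class_list = []
    for index in test_set_indexes:
        temp_test_feature_list.append(feature_list_of_all_instances[index])
        temp_test_class_list.append(class_list_of_all_instances[index])

    sampled_features, sampled_classes = under_sample.fit_resample(
        temp_train_feature_list, temp_train_class_list)
    # a classifier would then be fitted on (sampled_features, sampled_classes),
    # evaluated on the test fold, and its ROC AUC accumulated into avg_roc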
Code example #2
        'LinearSVC': LinearSVC(random_state=0, loss='squared_hinge', penalty='l1', C=1e-2, max_iter=1000, fit_intercept=True, dual=False, tol=1e-4),
        'Random Forest': RandomForestClassifier(random_state=0, n_estimators=300, min_samples_leaf=10, criterion='gini', max_features=None, max_depth=None),
        'Neural Network': MLPClassifier(random_state=0, activation='tanh', solver='adam', alpha=1e-2, beta_1=0.7, beta_2=0.9, max_iter=1000, hidden_layer_sizes=100),
        'Logistic Regression': LogisticRegression(random_state=0, penalty='l1', C=1e-3, max_iter=100, solver='saga', fit_intercept=True, dual=False, tol=1e-4),
    }

    print("Execution  Model                 AUPRC    AUROC    ACC    F1     Precision  Recall")

    # For each base learner...
    for learner_name, learner in base_learners.items():

        # undersample majority class, which is "False"
        counter = Counter(exec_training_target)
        true_count = counter[True]
        false_count = int(counter[False] * UNDERSAMPLING_RATES)
        undersampler = RandomUnderSampler(
            sampling_strategy={True: true_count, False: false_count})

        # build pipeline (with a sampler step this must be imblearn's Pipeline, not scikit-learn's)
        steps = [('under', undersampler), ('scale', StandardScaler()), ('learner', learner)]
        pipeline = Pipeline(steps=steps)
        pipeline.fit(exec_training_features, exec_training_target)

        # prediction
        predicted = pipeline.predict(exec_test_features)

        # evaluation
        acc = accuracy_score(exec_test_target, predicted)
        precision, recall, f1, _ = precision_recall_fscore_support(
            exec_test_target, predicted, average='binary')

        if hasattr(pipeline, "predict_proba"):
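            # (Sketch) The excerpt is truncated here. A plausible continuation, assuming
            # roc_auc_score and average_precision_score are imported from sklearn.metrics:
            scores = pipeline.predict_proba(exec_test_features)[:, 1]
        else:
            scores = pipeline.decision_function(exec_test_features)
        auroc = roc_auc_score(exec_test_target, scores)
        auprc = average_precision_score(exec_test_target, scores)
        print(learner_name, auprc, auroc, acc, f1, precision, recall)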
Code example #3
# Deal with imbalanced class sizes below
# Make the data 1D for compatibility with the resampling methods
X_trainShape = X_train.shape[1] * X_train.shape[2] * X_train.shape[3]
X_testShape = X_test.shape[1] * X_test.shape[2] * X_test.shape[3]
X_trainFlat = X_train.reshape(X_train.shape[0], X_trainShape)
X_testFlat = X_test.reshape(X_test.shape[0], X_testShape)
#print("X_train Shape: ",X_train.shape)
#print("X_test Shape: ",X_test.shape)
#print("X_trainFlat Shape: ",X_trainFlat.shape)
#print("X_testFlat Shape: ",X_testFlat.shape)

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
#ros = RandomOverSampler(sampling_strategy='auto')
ros = RandomUnderSampler(sampling_strategy='auto')  # note: despite the name "ros", this undersamples
X_trainRos, Y_trainRos = ros.fit_resample(X_trainFlat, Y_train)
X_testRos, Y_testRos = ros.fit_resample(X_testFlat, Y_test)

# Encode labels to hot vectors (ex : 2 -> [0,0,1,0,0,0,0,0,0,0])
Y_trainRosHot = np_utils.to_categorical(Y_trainRos, num_classes=2)
Y_testRosHot = np_utils.to_categorical(Y_testRos, num_classes=2)
#print("X_train: ", X_train.shape)
#print("X_trainFlat: ", X_trainFlat.shape)
#print("X_trainRos Shape: ",X_trainRos.shape)
#print("X_testRos Shape: ",X_testRos.shape)
#print("Y_trainRosHot Shape: ",Y_trainRosHot.shape)
#print("Y_testRosHot Shape: ",Y_testRosHot.shape)

for i in range(len(X_trainRos)):
    height, width, channels = 50, 50, 3
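    # (Sketch) The excerpt is cut off here; presumably each flattened row is reshaped
    # back into a 50x50x3 image, e.g.:
    img = X_trainRos[i].reshape(height, width, channels)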
Code example #4
    def clustering(self, target: str):
        y: np.ndarray = self.data.pop(target).values
        X: np.ndarray = self.data.values

        # RandomUnderSampler: reduce class 1 to 1000 samples
        sampler = RandomUnderSampler(sampling_strategy={1: 1000})
        X_norm1, y = sampler.fit_resample(X, y)

        # MinMaxScaler
        scaler = MinMaxScaler()
        X_norm = scaler.fit_transform(X_norm1)

        # SelectKBest
        X_new = SelectKBest(f_classif, k=2).fit_transform(X_norm1, y)
        y_trueK = cluster.KMeans(n_clusters=4,
                                 random_state=1).fit_predict(X_new)

        # PCA
        X_PCA = PCA(n_components=2).fit_transform(X_norm)
        y_trueP = cluster.KMeans(n_clusters=4,
                                 random_state=1).fit_predict(X_PCA)

        silh = []
        rand = []
        homo = []
        index = ['Stad.Scaler', 'PCA']

        print("Silhouette[KBest] =",
              metrics.silhouette_score(X_new, y_trueK, metric='euclidean'))
        print("RI[KBest] =", metrics.adjusted_rand_score(y, y_trueK))
        print('Homogeneity[KBest] =', metrics.homogeneity_score(y, y_trueK))
        silh.append(
            metrics.silhouette_score(X_new, y_trueK, metric='euclidean'))
        rand.append(metrics.adjusted_rand_score(y, y_trueK))
        homo.append(metrics.homogeneity_score(y, y_trueK))

        print("Silhouette[PCA] =",
              metrics.silhouette_score(X_PCA, y_trueP, metric='euclidean'))
        print("RI[PCA] =", metrics.adjusted_rand_score(y, y_trueP))
        print('Homogeneity[PCA] =', metrics.homogeneity_score(y, y_trueP))
        silh.append(
            metrics.silhouette_score(X_PCA, y_trueP, metric='euclidean'))
        rand.append(metrics.adjusted_rand_score(y, y_trueP))
        homo.append(metrics.homogeneity_score(y, y_trueP))

        # plotting
        plt.figure(figsize=(12, 6))
        plt.suptitle('Clustering for CT Dataset - KMeans ', fontsize="x-large")

        grid = plt.GridSpec(1, 2, wspace=0.4, hspace=0.5)

        plt.subplot(grid[0, 0])
        plt.scatter(X_new[:, 0], X_new[:, 1], c=y_trueK)
        plt.title("selectKBeast")

        plt.subplot(grid[0, 1])
        plt.scatter(X_PCA[:, 0], X_PCA[:, 1], c=y_trueP)
        plt.title("PCA")

        plt.show()
Code example #5

###############################################################################
# With the controlled under-sampling methods, the number of samples to be
# selected can be specified. ``RandomUnderSampler`` is the most naive way of
# performing such selection, by randomly drawing a given number of samples from
# the targeted class.

fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 6))
X, y = create_dataset(n_samples=5000, weights=(0.01, 0.05, 0.94),
                      class_sep=0.8)

clf = LinearSVC().fit(X, y)
plot_decision_function(X, y, clf, ax1)
ax1.set_title('Linear SVC with y={}'.format(Counter(y)))
sampler = RandomUnderSampler(random_state=0)
clf = make_pipeline(sampler, LinearSVC())
clf.fit(X, y)
plot_decision_function(X, y, clf, ax2)
ax2.set_title('Decision function for {}'.format(sampler.__class__.__name__))
plot_resampling(X, y, sampler, ax3)
ax3.set_title('Resampling using {}'.format(sampler.__class__.__name__))
fig.tight_layout()

###############################################################################
# ``NearMiss`` algorithms implement some heuristic rules in order to select
# samples. NearMiss-1 selects samples from the majority class for which the
# average distance to the :math:`k` nearest samples of the minority class is
# the smallest. NearMiss-2 selects the samples from the majority class for
# which the average distance to the farthest samples of the negative class is
# the smallest. NearMiss-3 is a 2-step algorithm: first, for each minority
# sample, its :math:`m` nearest neighbours from the majority class are kept;
# then, the majority samples selected are those for which the average distance
# to the :math:`k` nearest minority samples is the largest.
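The excerpt ends before the corresponding plots. As a rough illustration only (not part of the original example), NearMiss could be exercised with the same helpers used above for ``RandomUnderSampler``; ``create_dataset``, ``plot_decision_function`` and ``plot_resampling`` are assumed to be the helpers defined earlier in this example:

from imblearn.under_sampling import NearMiss

fig, axs = plt.subplots(3, 2, figsize=(15, 15))
X, y = create_dataset(n_samples=5000, weights=(0.01, 0.05, 0.94), class_sep=0.8)
for (ax_left, ax_right), version in zip(axs, (1, 2, 3)):
    sampler = NearMiss(version=version)
    clf = make_pipeline(sampler, LinearSVC()).fit(X, y)
    plot_decision_function(X, y, clf, ax_left)
    ax_left.set_title('Decision function for NearMiss-{}'.format(version))
    plot_resampling(X, y, sampler, ax_right)
    ax_right.set_title('Resampling using NearMiss-{}'.format(version))
fig.tight_layout()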
Code example #6
    linear_model.LogisticRegression(), 
    {'C':np.logspace(0,3,4),
     'penalty':['l1','l2'],
    }),   
        
    ("Decision Tree", 'cart',
     tree.DecisionTreeClassifier(min_samples_leaf=min_samples_leaf,
                                 criterion=criterion),
    {'max_depth':max_depth,
     #'max_features':[3,5,10,None],
     #'splitter':['best','random'],
     'criterion':['entropy','gini'],
    }),

    ("RandomUnderSampling", 'rus',
     Pipeline([('res', RandomUnderSampler()),
               ('tree', tree.DecisionTreeClassifier(
                       min_samples_leaf=min_samples_leaf, criterion=criterion))
               ]),
    {'tree__max_depth':max_depth,
    }),

    ("SMOTE", 'smt',
     Pipeline([('res', SMOTE()),
               ('tree', tree.DecisionTreeClassifier(
                       min_samples_leaf=min_samples_leaf, criterion=criterion))
               ]),
    {'tree__max_depth':max_depth,
    }),

    ("UnderBagging", 'ub',
Code example #7
X = np.array(df.drop(['label'], axis=1))
y = np.array(df['label'])

normalization_object = Normalizer()
X = normalization_object.fit_transform(X)

number_of_folds = 7

best_clustered_trees_average_auc = 0
best_one_tree_average_auc = 0
best_cluster = 0

skf = StratifiedKFold(n_splits=number_of_folds, shuffle=True)

sampler = RandomUnderSampler()

fold_counter = 0
number_of_clusters = 23  # this is a hyper parameter
trees = {}
all_auc_with_clustered_trees = []
all_auc_with_one_tree = []
X_train_major = np.zeros((0, 1294))
y_train_major = []

avg_roc = 0
avg_aupr = 0

for train_index, test_index in skf.split(X, y):
    X_train = X[train_index]
    X_test = X[test_index]
Code example #8
File: search.py Project: vladutmd/datascience
    results['precision_score'] = precision_score(test_y, pred_y)
    results['roc_auc_score'] = roc_auc_score(test_y, pred_y)
    return results

X_train = np.loadtxt('X_train')
X_test = np.loadtxt('X_test')
y_train = np.loadtxt('y_train')
y_test = np.loadtxt('y_test')

from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=0, replacement=False)
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)

log_reg_param_grid = {'C':[0.01, 0.1, 1, 10, 100],
                     'max_iter':[100, 200, 300, 500],
                     'penalty':['l2'],
                     'class_weight':['balanced',None],
                     'tol': [1e-6, 1e-5, 1e-4, 1e-3, 1e-2]
                     }
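The excerpt stops at the parameter grid. A minimal sketch (not shown in the original file) of how it could be consumed, assuming scikit-learn's GridSearchCV and LogisticRegression:

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# search over the grid using the undersampled training data
search = GridSearchCV(LogisticRegression(), log_reg_param_grid,
                      scoring='roc_auc', cv=5, n_jobs=-1)
search.fit(X_resampled, y_resampled)
print(search.best_params_, search.best_score_)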
Code example #9
def main():
    input_file = sys.argv[1]
    embedding_file = sys.argv[2]
    is_TFIDF = sys.argv[3]
    test_file = sys.argv[4]

    if not os.path.exists('../models/svm_model.sav'):

        input_data = []

        with codecs.open(input_file, 'r', 'utf-8') as in_obj:
            for line in in_obj:
                line = line.strip().strip('\n\r')
                input_data.append(line.split('\t'))

        filtered_input_data = filter_tweets(input_data)

        tweet_text_vector = []
        labels = []
        for i in range(len(filtered_input_data)):
            labels.append(filtered_input_data[i][1])

        if is_TFIDF.strip().lower(
        ) == 'true' and not os.path.exists(input_file + '_tfidf.csv'):
            tweet_text_list = []
            if os.path.exists(input_file + '_class_inp.tsv'):
                with codecs.open(input_file + '_class_inp.tsv', 'r',
                                 'utf-8') as class_file_obj:
                    for line in class_file_obj:
                        line = line.split('\t')
                        tweet_text_vector.append(' '.join(
                            preprocess_tweet_text(line[0])))
                vectorizer = TfidfVectorizer(ngram_range=(1, 3))
                X_transform = vectorizer.fit_transform(
                    tweet_text_vector).toarray()
                print(X_transform[:10])
                np.savetxt(input_file + '_tfidf.csv',
                           X_transform,
                           delimiter='\t')
            else:
                raise NameError(
                    'please provide label annotated texts file named %s_class_inp.tsv'
                    % input_file)

        if not os.path.exists(input_file + '_embed.csv'
                              ) and is_TFIDF.strip().lower() != 'true':
            tweet_text_list = []
            for i in range(len(filtered_input_data)):
                tweet_text = get_tweet_text(filtered_input_data[i][0])
                tweet_text_list.append(tweet_text)
                tweet_text_vector.append(preprocess_tweet_text(tweet_text))

            X_transform = get_embedding_vector(tweet_text_vector,
                                               embedding_file)

            out_data = np.append(np.transpose([np.array(tweet_text_list)]),
                                 np.transpose([labels]),
                                 axis=1)
            np.savetxt(input_file + '_class_inp.tsv',
                       out_data,
                       fmt='%s\t%s',
                       delimiter='\t')

            np.savetxt(input_file + '_embed.csv', X_transform, delimiter='\t')

        if is_TFIDF.strip().lower() != 'true':
            X_transform = np.loadtxt(input_file + '_embed.csv', delimiter='\t')
        else:
            X_transform = np.loadtxt(input_file + '_tfidf.csv', delimiter='\t')

        scaler = MinMaxScaler()

        X_transform_scaled = scaler.fit_transform(X_transform)
        print(len(X_transform_scaled))

        le = LabelEncoder()
        y = le.fit_transform(labels)
        print(len(y))

        rus = RandomUnderSampler(random_state=0)
        X_resample, y_resample = rus.fit_resample(X_transform_scaled, y)

        print(len(X_resample), len(y_resample))

        svm_clf = SVC(C=10, kernel='rbf', gamma='scale',
                      probability=True).fit(X_resample, y_resample)

        filename = '../models/svm_model.sav'
        pickle.dump(svm_clf, open(filename, 'wb'))

    # test data preprocessing

    loaded_model = pickle.load(open('../models/svm_model.sav', 'rb'))

    test_data = []
    test_id = []
    cnt_test = 0

    with codecs.open(test_file, 'r', 'utf-8') as in_obj:
        for line in in_obj:
            cnt_test += 1
            line = line.strip().strip('\n\r')
            test_data.append(line.split('\t')[-1])
            test_id.append(line.split('\t')[0])
    print("input file count =%s" % (cnt_test))
    print("test_data_count= %s" % (len(test_data)))

    tweet_test_text_vector = []

    for i in range(len(test_data)):
        tweet_test_text_vector.append(preprocess_tweet_text(test_data[i]))
    print("test text vector count = %s" % (len(tweet_test_text_vector)))
    X_test_transform = get_embedding_vector(tweet_test_text_vector,
                                            embedding_file)

    scaler = MinMaxScaler()
    X_test_transform_scaled = scaler.fit_transform(X_test_transform)
    print(len(X_test_transform_scaled))

    out_data = loaded_model.predict_proba(X_test_transform_scaled)

    np.savetxt(test_file + '_results.tsv',
               out_data,
               fmt='%s\t%s',
               delimiter='\t')

    with codecs.open(test_file + '_results_f8.tsv', 'w', 'utf-8') as f8_obj:

        with codecs.open(test_file + '_results.tsv', 'r',
                         'utf-8') as prediction_obj:
            for i, line in enumerate(prediction_obj):
                line = line.split('\t')
                if float(line[1]) >= 0.8:
                    f8_obj.write(test_id[i] + '\t' + test_data[i] + '\n')
Code example #10
File: paper_tasks.py Project: wuuusicong/vizml
def load_features(task):

    log_file = log_dir + 'loading_task_' + str(task['pref_id']) + '.txt'
    load_logger = logger(log_file, task)

    dataset_prediction_task_to_outcomes = {
        'all_one_trace_type': {
            'two': ['line', 'bar'],
            'three': ['line', 'scatter', 'bar'],
            'six': ['line', 'scatter', 'bar', 'box', 'histogram', 'pie'],
        },
        'has_single_src': {
            'two': [True, False]
        },
        'num_x_axes': {
            'numeric': [i for i in range(5)]
        },
        'num_y_axes': {
            'numeric': [i for i in range(5)]
        }
    }

    field_prediction_task_to_outcomes = {
        'trace_type': {
            'two': ['line', 'bar'],
            'three': ['line', 'scatter', 'bar'],
            'six': ['line', 'scatter', 'bar', 'box', 'histogram', 'heatmap'],
        },
        'is_xsrc': {
            'two': [True, False]
        },
        'is_ysrc': {
            'two': [True, False]
        },
        'is_x_or_y': {
            'two': ['x', 'y']
        },
        'is_single_src': {
            'two': [True, False]
        }
    }

    if task['dataset'] == 'dataset':
        task['features_df_file_name'] = 'features_aggregate_single_pairwise.csv'
        task['outcomes_df_file_name'] = 'chart_outcomes.csv'
        task['id_field'] = 'fid'
        prediction_task_to_outcomes = dataset_prediction_task_to_outcomes
    else:
        assert task['dataset'] == 'field'
        task['features_df_file_name'] = 'field_level_features.csv'
        task['outcomes_df_file_name'] = 'field_level_outcomes.csv'
        task['id_field'] = 'field_id'
        prediction_task_to_outcomes = field_prediction_task_to_outcomes


    features_df = pd.read_csv(
        join(features_directory, task['features_df_file_name']),
        nrows=num_datapoints)
    outcomes_df = pd.read_csv(
        join(features_directory, task['outcomes_df_file_name']),
        nrows=num_datapoints)
    feature_names_by_type = pickle.load(
        open(
            join(features_directory, feature_set_lookup_file_name),
            'rb'))

    # print(features_df)
    # print('Initial Features:', features_df.shape)
    # print('Initial Outcomes:', outcomes_df.shape)
    # load_logger.log_dict(feature_names_by_type)
    # load_logger.log('\n')
    # load_logger.log(features_df)
    load_logger.log('Initial Features: ' + str(features_df.shape))
    load_logger.log('Initial Outcomes: ' + str(outcomes_df.shape))

    if task['dataset'] == 'field':
        def is_x_or_y(is_xsrc, is_ysrc):
            if is_xsrc and pd.isnull(is_ysrc): return 'x'
            if is_ysrc and pd.isnull(is_xsrc): return 'y'
            else:                              return None
        outcomes_df['is_x_or_y'] = np.vectorize(is_x_or_y)(outcomes_df['is_xsrc'], outcomes_df['is_ysrc'])
        outcomes_df['is_single_src'] = outcomes_df['is_single_xsrc'] | outcomes_df['is_single_ysrc']

    outcomes_df_subset = format_outcomes_df(load_logger, outcomes_df, 
                                            task['outcome_variable_name'],
                                            prediction_task_to_outcomes[ task['outcome_variable_name'] ] [task['prediction_task'] ],
                                            id_field=task['id_field'])
    
    final_df = join_features_and_outcomes(features_df, outcomes_df_subset, on=task['id_field'])
    last_index = final_df.columns.get_loc(task['outcome_variable_name'])

    X = final_df.iloc[:, :last_index]
    y = final_df.iloc[:, last_index]

    # print('Intermediate Outcomes:', y.shape)
    # value_counts = y.value_counts()
    # print('Value counts:')
    # print(value_counts)
    load_logger.log('Final DF Shape: ' + str(final_df.shape))
    load_logger.log('Last Index: ' + str(last_index))

    load_logger.log('Intermediate Outcomes: ' + str(y.shape))
    load_logger.log('Value counts: \n' + str(y.value_counts()))

    # delete variables to save memory!
    del final_df, outcomes_df

    task_types = ['dimensions', 'types', 'values', 'names']
    for task_name in task_types:
        names = get_feature_set_names_by_type(
            feature_names_by_type,
            task_type=task['dataset'],
            feature_set=task_name)
        indices = [X.columns.get_loc(c) for c in names if c in X.columns]
        # print('task is ' + task_name + ' and indices are:')
        #print('names are {}'.format(names) )
        # print(indices)
        # load_logger.log('task is ' + task_name + ' and indices are: ')
        # load_logger.log(indices)


    y = pd.get_dummies(y).values.argmax(1)

    if task['sampling_mode'] == 'over':
        res = RandomOverSampler(random_state=RANDOM_STATE)
        X, y = res.fit_resample(X, y)
    elif task['sampling_mode'] == 'under':
        res = RandomUnderSampler(random_state=RANDOM_STATE)
        X, y = res.fit_resample(X, y)
    elif isinstance(task['sampling_mode'], int):
        X_resampled_arrays, y_resampled_arrays = [], []
        for outcome in np.unique(y):
            outcome_mask = (y == outcome)
            X_resampled_outcome, y_resampled_outcome = resample(
                X[outcome_mask],
                y[outcome_mask],
                n_samples=task['sampling_mode'],
                random_state=RANDOM_STATE
            )
            X_resampled_arrays.append(X_resampled_outcome)
            y_resampled_arrays.append(y_resampled_outcome)

        X, y = np.concatenate(X_resampled_arrays).astype(
            np.float64), np.concatenate(y_resampled_arrays)
    else:
        X, y = X.values.astype(np.float64), y

    # print('Final Features:', X.shape)
    # print('Final Outcomes:', y.shape)
    load_logger.log('Final Features:' + str(X.shape))
    load_logger.log('Final Outcomes:' + str(y.shape))
    unique, counts = np.unique(y, return_counts=True)
    load_logger.log('Value counts after sampling:')
    load_logger.log_dict(dict(zip(unique, counts)))
    load_logger.log('\n')

    del load_logger
    return util.unison_shuffle(X, y)
        1: weights[1],
        2: weights[2],
        3: weights[3],
        4: weights[4]
    }
    over = SMOTE(sampling_strategy=ratio_over, random_state=314)
    X_train, y_train = over.fit_resample(X_train, y_train)
    # undersample samples > average
    ratio_under = {
        0: average_samples,
        1: average_samples,
        2: average_samples,
        3: average_samples,
        4: average_samples
    }
    under = RandomUnderSampler(sampling_strategy=ratio_under, random_state=314)
    X_train, y_train = under.fit_resample(X_train, y_train)

    # Configure the cross-validation procedure
    cv_inner = LeaveOneOut()

    #Hyper parameter code
    batch_size = [8, 16, 32]
    neurons = [30, 40, 50]
    hidden_layers = [1, 2, 3]
    epochs = [10, 50, 100]
    activation = ['relu', 'tanh', 'sigmoid', 'linear']
    param_grid = dict(batch_size=batch_size,
                      neurons=neurons,
                      hidden_layers=hidden_layers,
                      epochs=epochs,
Code example #12
# Splitting the data with stratifying due imbalanced data
y = df.pop("TARGET")
X = df
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, stratify = y, random_state = RSEED)

# Saving the split test data
X_test.to_csv("Home_Loan/X_test.csv", index=False)
y_test.to_csv("Home_Loan/y_test.csv", index=False)
print("Test data successfully saved to Home_Loan.")

# Clean the data
X_train_cleaned, y_train_cleaned = clean_data(X_train, y_train, test=False)
print("Data successfully cleaned.")

# Sampling because of high imbalance using undersample strategy
undersample = RandomUnderSampler(sampling_strategy='majority',random_state=RSEED)
X_train_balanced, y_train_balanced = undersample.fit_resample(X_train_cleaned, y_train_cleaned)
print("Data successfully balanced.")

# Selecting features
features = get_features()

# Training the model
model = RandomForestClassifier(n_estimators=196, min_samples_split = 2, 
                               max_leaf_nodes = 49, max_depth = 17, 
                               bootstrap = True, max_features = 'sqrt',  # 'auto' was removed in newer scikit-learn; for classifiers it meant 'sqrt'
                               min_weight_fraction_leaf = 0.1,  
                               n_jobs=-1, random_state=RSEED, verbose = 1)
model.fit(X_train_balanced[features], y_train_balanced)
print("Model successfully fitted.")
Code example #13
def do_sampling(upsampling, train_features, train_truth, smote=False):
    X_train = train_features
    y_train = train_truth.label

    if smote is False:
        X = pd.concat([X_train, y_train], axis=1)

        completed = X[X.label == 0]
        drop_out = X[X.label == 1]

        if upsampling is True:
            sampled = resample(
                completed,
                replace=True,  # sample with replacement
                n_samples=len(drop_out),  # match number in majority class
                random_state=27)  # reproducible results
            sampled = pd.concat([drop_out, sampled])
        elif upsampling is False:
            sampled = resample(
                drop_out,
                replace=False,  # sample without replacement
                n_samples=len(completed),  # match number in majority class
                random_state=27)  # reproducible results
            sampled = pd.concat([sampled, completed])

        if upsampling is not None:
            sampled = sampled.sample(frac=1).reset_index(drop=True)
            print(sampled.label.value_counts())
            y_train = sampled.label
            X_train = sampled.drop('label', axis=1)

    elif smote is True:

        feature_columns = X_train.columns

        if upsampling is True:

            upsample = SMOTE()
            X_train, y_train = upsample.fit_resample(X_train, y_train)

        elif upsampling is False:

            over = SMOTE(sampling_strategy=0.4)
            under = RandomUnderSampler(sampling_strategy=0.5)

            steps = [('o', over), ('u', under)]
            pipeline = Pipeline(steps=steps)

            X_train, y_train = pipeline.fit_resample(X_train, y_train)

        features = pd.DataFrame(data=X_train, columns=feature_columns)
        label = pd.DataFrame(data=y_train, columns=['label'])

        X = pd.concat([features, label], axis=1)

        X = X.sample(frac=1).reset_index(drop=True)

        print(X.label.value_counts())

        y_train = X.label
        X_train = X.drop('label', axis=1)

    return X_train, y_train
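A hypothetical call to the helper above, assuming train_features is a feature DataFrame and train_truth is a DataFrame with a label column (names are illustrative, not from the original file):

# downsample the majority class without SMOTE
X_bal, y_bal = do_sampling(upsampling=False, train_features=train_features,
                           train_truth=train_truth, smote=False)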
df = pd.read_csv('../Data/train.csv', header=0)
X = df.iloc[:,0:-1].copy()
Y = df.iloc[:, -1].copy()

#scaler = StandardScaler()
#X = scaler.fit_transform(X)

df = pd.read_csv('../Data/validation.csv', header=0)
X_valid = df.iloc[:,0:-1].copy()
Y_valid = df.iloc[:, -1].copy()
#scaler = StandardScaler()
#X_valid = scaler.fit_transform(X_valid)

# Handle the dataset with undersampling strategy
rus = RandomUnderSampler(sampling_strategy=0.8)
X_res, Y_res = rus.fit_resample(X, Y)

# Handle the dataset with oversampling strategy
ros = RandomOverSampler(random_state=0)
X_resampled, Y_resampled = ros.fit_resample(X, Y)

# Handle the dataset with SMOTE
SM = SMOTE(random_state=0)
X_smote, Y_smote = SM.fit_resample(X, Y)

score_infor = [[],[],[],[]]
roc_auc_score_infor = [[],[],[],[]]
f1_score_infor = [[],[],[],[]]

#print(pd.value_counts(Y_smote))
"""### Oversampling and Undersampling"""

from imblearn.over_sampling import SMOTE
from imblearn.ensemble import EasyEnsemble 
from imblearn.under_sampling import RandomUnderSampler

bg_sm = SMOTE(random_state=10000)
bg_train_X_res, bg_train_y_res = bg_sm.fit_resample(bg_train_X, bg_train_y)



bg_train_y_res = pd.DataFrame(bg_train_y_res, columns = ["gender"])
bg_train_X_res = pd.DataFrame(bg_train_X_res, columns = ['star_sign','phone_os','height','weight','sleepiness','iq','fb_friends'])

bg_us = RandomUnderSampler(random_state=10000)
bg_train_X_red, bg_train_y_red = bg_us.fit_resample(bg_train_X, bg_train_y)


bg_train_y_red = pd.DataFrame(bg_train_y_red, columns = ["gender"])
bg_train_X_red = pd.DataFrame(bg_train_X_red, columns = ['star_sign','phone_os','height','weight','sleepiness','iq','fb_friends'])



print(str(pd.value_counts(bg_train_y_res.values.flatten())))

"""## Classification

### decision tree
"""
Code example #16
Train_X = data_train.iloc[0:575212,2:217]
cv_X = data_train.iloc[575213:585212,2:217]
Train_Y = data_train.iloc[0:575212,1]
cv_Y = data_train.iloc[575213:585212,1:2]
Test_X = data_train.iloc[585213:595212,2:217]
Test_Y = data_train.iloc[585213:595212,1:2]

id_test = data_test.iloc[:,0:1]
data_test = data_test.iloc[:,1:216]

# Random undersampling of the majority class
#from collections import Counter 
#from sklearn.datasets import fetch_mldata 
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(sampling_strategy='majority')
Train_X_res, Train_Y_res = rus.fit_resample(Train_X, Train_Y)

print("Y=1(%): ",str(np.sum(Train_Y_res)/Train_X_res.shape[0]))
## Without Undersampling
# Train_X_res = Train_X
# Train_Y_res = Train_Y

# Normalizing the data
maxi = np.max(Train_X_res,axis = 0)
avgi = np.average(Train_X_res,axis = 0)
Train_X_res = (Train_X_res - avgi)/(maxi+0.0000000000000000001)
cv_X = (cv_X - avgi)/(maxi+0.0000000000000000001)
Test_X = (Test_X - avgi)/(maxi+0.0000000000000000001)
data_test = (data_test - avgi)/(maxi+0.0000000000000000001)
Code example #17
#print("Accuracy After ROC-1: %.2f%%" % (roc_auc1 * 100.0))

pre_scor1 = precision_score(y_test, y_pred_over1)
re_scor1 = recall_score(y_test, y_pred_over1)
f1_scor1 = f1_score(y_test, y_pred_over1)
##print("\n ROC AUC Score:  %.2f%%" % (roc_auc * 100.0))
#print("\n Precision Score-1:  %.2f%%" % (pre_scor1 * 100.0))
#print("\n Recall Score-1:  %.2f%%" % (re_scor1 * 100.0))
#print('\n F1-Measure-1: %.2f%%' % (f1_scor1 * 100.0))
#precision, recall, thresholds = precision_recall_curve(y_test, y_pred_over1)

data2 = np.array(two_split[1]).astype(float)  # np.float was removed from NumPy
X2, y2 = prep_data1(data2)

# Random undersampling
under2 = RandomUnderSampler()
# resample the training data
X_over2, y_over2 = under2.fit_resample(X2, y2)

#After resampling again accuracy count
model = KNeighborsClassifier()
model.fit(X_over2, y_over2)
y_pred_over2 = model.predict(X_test)

accuracy2 = accuracy_score(y_test, y_pred_over2)
print("Accuracy After RandomOverSlice-2: %.2f%%" %
      (((accuracy1 + accuracy2) / 2) * 100.0))

roc_auc2 = roc_auc_score(y_test, y_pred_over2)
print("Accuracy After ROC-2: %.2f%%" % ((roc_auc1 + roc_auc2) / 2 * 100.0))
Code example #18
    def use_parameters(self, X_train, selected_features):
        '''
        Build the grid-search parameter space: scalers, resampling strategies
        and XGBoost hyperparameters for the selected features.

        Returns
        -------
        list of dict
            Parameter grid suitable for scikit-learn's GridSearchCV.
        '''
        test_scaler = [
            StandardScaler(),
            RobustScaler(),
            QuantileTransformer(),
            Normalizer()
        ]
        test_sampling = [
            modelutil.Nosampler(),
            ClusterCentroids(),
            RandomUnderSampler(),
            # NearMiss(version=1),
            # EditedNearestNeighbours(),
            # AllKNN(),
            # CondensedNearestNeighbour(random_state=0),
            # InstanceHardnessThreshold(random_state=0,
            #                          estimator=LogisticRegression(solver='lbfgs', multi_class='auto')),
            RandomOverSampler(random_state=0),
            SMOTE(),
            BorderlineSMOTE(),
            SMOTEENN(),
            SMOTETomek(),
            ADASYN()
        ]

        ### XGBOOST
        parameters = [{
            'scaler': test_scaler,
            'sampling': test_sampling,
            'feat__cols': selected_features,
            'model__objective': ['binary:logistic'],  # 'logloss' is an eval metric, not a valid objective
            'model__learning_rate': [0.005, 0.01, 0.05, 0.1,
                                     0.5],  # so called `eta` value
            'model__max_depth': [3, 4, 5],
            'model__min_child_weight': [1, 5, 11, 12, 15],
            'model__silent': [0],
            'model__subsample': [0.6, 0.8, 1.0],
            'model__colsample_bytree': [0.6, 0.8, 1.0],
            'model__n_estimators':
            [5, 50,
             100],  # number of trees, change it to 1000 for better results
            'model__missing': [-999],
            'model__gamma': [0.5, 1, 1.5, 2, 5],
            'model__seed': [1337]
        }]

        # If there are missing values, test several imputer strategies
        if X_train.isna().sum().sum() > 0:
            parameters[0]['imputer__strategy'] = [
                'mean', 'median', 'most_frequent'
            ]
            print("Missing values present. Testing different imputer strategies")
        else:
            print("No missing values. No imputer necessary")

        print("Selected parameters: ", parameters)

        ### XGBOOST
        return parameters
Code example #19
labels,tweet = tweets_labels(tweets,annotated_list)

print("Number of tweets: {}".format(len(tweet))) #50339 - paris
print("Number of Solidarity englishtweets: {}".format(labels.count(1))) #20465
print("Number of Non-Solidarity englishtweets: {}".format(labels.count(-1))) #29874


# In[41]:


df_english = pd.DataFrame(index=range(len(tweet)))

df_english['tweets'] = pd.DataFrame(tweet)
df_english['labels'] = pd.DataFrame(labels)
df_english.to_csv('latest_paris_unique.csv',index=False)
rus = RandomUnderSampler()


# In[78]:


tweet_2D = []
for tw in tweet:
    tweet_2D.append([tw])

tweet_sampled, labels_sampled = rus.fit_resample(tweet_2D, labels)
plt.hist(labels_sampled)
plt.show()

tweet_sampled_1D = []
for tweet in tweet_sampled:
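    # (Sketch) The excerpt is cut off here; presumably each one-element list built
    # for the sampler is unwrapped back into a flat list of tweets:
    tweet_sampled_1D.append(tweet[0])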
Code example #20
num_try = 1
try_dict = dict()

highest_acc = 0.0

internal_feature_data = np.concatenate((feature_data_2013, feature_data_2018))
internal_feature_label = np.concatenate(
    (feature_label_2013_class1, feature_label_2018_class1))
internal_clinical_data = np.concatenate(
    (clinical_data_2013, clinical_data_2018))

external_feature_data = feature_data_2012
external_feature_label = feature_label_2012_class1
external_clinical_data = clinical_data_2012

rus = RandomUnderSampler(random_state=42)

class_3_index = ['normal label', 'penia label', 'porosis label']
class_3_cols = ['normal prediction', 'penia prediction', 'porosis prediction']

class_2_index = ['normal label', 'porosis & penia label']
class_2_cols = ['normal prediction', 'porosis & penia prediction']

class_1_index = ['normal&penia label', 'porosis label']
class_1_cols = ['normal&penia prediction', 'porosis prediction']

if len(Counter(internal_feature_label).keys()) == 2:
    average_method = 'binary'
    conf_index = class_1_index
    conf_cols = class_1_cols
else:
Code example #21
    def compute_naive_bayes(self,
                            target: str,
                            drop: bool = True,
                            norm: bool = False,
                            threshold: float = 1,
                            cnf_mtx: bool = False):
        """
        Implement the Naive Bayes algorithm for the given threshold.
        Computes the Gaussian, Bernoulli and Multinomial (only for normalized data) methods.
        The models can optionally be fitted on resampled data (SMOTE or random under-sampling).
        :return: --- <class 'tuple'> --- <class 'list'>
        For confusion matrix, set 'cnf_mtx' as True.
        :return: --- <class 'numpy.ndarray'>
        """
        if drop:
            full_set = self.compute_data_drop(threshold)
        else:
            full_set = self.compute_data_average(threshold)

        y: np.ndarray = full_set.pop(target).values
        x: np.ndarray = full_set.values
        labels: np.ndarray = pd.unique(y)

        trn_x, tst_x, trn_y, tst_y = train_test_split(x,
                                                      y,
                                                      train_size=0.7,
                                                      stratify=y)

        # normalization has to occur after train_test_split only to training data and after that the same normalization
        # is applied to test data
        if norm:
            # min_max_scaler = preprocessing.StandardScaler()
            min_max_scaler = preprocessing.MinMaxScaler()
            trn_x = min_max_scaler.fit_transform(trn_x)
            tst_x = min_max_scaler.transform(tst_x)

        clf = GaussianNB()
        print(trn_x.shape, trn_y.shape, tst_x.shape, tst_y.shape, labels.shape)
        clf.fit(trn_x, trn_y)
        prd_y = clf.predict(tst_x)

        if cnf_mtx:
            return metrics.confusion_matrix(tst_y, prd_y, labels=labels)

        if norm:
            estimators = {
                'GaussianNB': GaussianNB(),
                'MultinomialNB': MultinomialNB(),
                'BernoulliNB': BernoulliNB()
            }
        else:
            estimators = {
                'GaussianNB': GaussianNB(),
                'BernoulliNB': BernoulliNB()
            }

        xvalues = []
        yvalues = []

        smote = SMOTE(random_state=42, sampling_strategy='minority')
        smote_x, smote_y = smote.fit_resample(x, y)

        # RandomUnderSampler: reduce class 1 to 1000 samples
        sampler = RandomUnderSampler(sampling_strategy={1: 1000})
        X_rs, y_rs = sampler.fit_resample(trn_x, trn_y)

        for clf in estimators:
            xvalues.append(clf)
            # estimators[clf].fit(trn_x, trn_y) # unbalanced data
            estimators[clf].fit(X_rs, y_rs)  # random under-sampling
            # estimators[clf].fit(smote_x, smote_y)  # smote
            prd_y = estimators[clf].predict(tst_x)
            yvalues.append(metrics.accuracy_score(tst_y, prd_y))
        return xvalues, yvalues
Code example #22
File: rascunho2.py Project: eduardolp/tcc_toolwear
encoder.fit(target)
encoded_Y = encoder.transform(target)
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y = np_utils.to_categorical(encoded_Y)

# separate data into training and (validation + testing) datasets in a 70/30 (20/10) proportion
X_train, X_partial, y_train, y_partial = train_test_split(
    features, dummy_y, test_size=0.3, random_state=rand_state)
X_val, X_test, y_val, y_test = train_test_split(X_partial,
                                                y_partial,
                                                test_size=0.33,
                                                random_state=rand_state)

# Resample the training data (oversampling is prepared but undersampling is used below)
ros = RandomOverSampler(sampling_strategy='minority', random_state=12)
rus = RandomUnderSampler(random_state=12, replacement=True)
# X_train_res, y_train_res = ros.fit_resample(X_train, y_train)
X_train_res, y_train_res = rus.fit_resample(X_train, y_train)

# Reobtain the correct training, validation and testing datasets
X_train_reduced = X_train_res.loc[:, features_list]
y_train_reduced = y_train_res  # X_train_res.loc[:, targets_list]  # Yes, X_train_res is correct

X_val_reduced = X_val.loc[:, features_list]
y_val_reduced = y_val  # X_val.loc[:, targets_list]

X_test_reduced = X_test.loc[:, features_list]
y_test_reduced = y_test  #X_test.loc[:, targets_list]

# Samples no_zeros giving it the same number of values for all vb_slice ranges
# no_zeros is shuffled and the number of values in each range is given by the
Code example #23
# In[228]:

merge_AB = pd.get_dummies(merge_AB, prefix=['plan_type'], drop_first=True)
merge_AC = pd.get_dummies(merge_AC, prefix=['plan_type'], drop_first=True)

# ## AB Test

# ### 14-day and no trial

# In[229]:

import imblearn
print(imblearn.__version__)
from imblearn.under_sampling import RandomUnderSampler
undersample = RandomUnderSampler(sampling_strategy='majority')
X_under, y_under = undersample.fit_resample(
    merge_AB[['plan_type_low_uae_no_trial']], merge_AB[['current_sub_TF']])

# In[230]:

merge_AB_under = pd.concat([X_under, y_under], axis=1)

# In[231]:

merge_AB_under.head()

# In[232]:

import HW1 as ABTesting
Code example #24
def random_under_sampler(X, y):
    rus = RandomUnderSampler(random_state=42)
    X_res, y_res = rus.fit_resample(X, y)
    return X_res, y_res
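A short usage sketch for the helper above (variable names are illustrative):

from collections import Counter

X_balanced, y_balanced = random_under_sampler(X_train, y_train)
print(Counter(y_balanced))  # every class now has the minority-class count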
Code example #25
File: testselect.py Project: rock420/bugbug
    def __init__(self, lemmatization=False, granularity="label", failures_skip=None):
        Model.__init__(self, lemmatization)

        self.granularity = granularity
        self.failures_skip = failures_skip

        self.training_dbs = [repository.COMMITS_DB]
        self.eval_dbs[repository.COMMITS_DB] = (
            repository.COMMITS_DB,
            repository.COMMIT_EXPERIENCES_DB,
        )
        if granularity == "label":
            self.training_dbs.append(test_scheduling.TEST_LABEL_SCHEDULING_DB)
            self.eval_dbs[test_scheduling.TEST_LABEL_SCHEDULING_DB] = (
                test_scheduling.PAST_FAILURES_LABEL_DB,
                test_scheduling.FAILING_TOGETHER_LABEL_DB,
            )
        elif granularity == "group":
            self.training_dbs.append(test_scheduling.TEST_GROUP_SCHEDULING_DB)
            self.eval_dbs[test_scheduling.TEST_GROUP_SCHEDULING_DB] = (
                test_scheduling.PAST_FAILURES_GROUP_DB,
                test_scheduling.TOUCHED_TOGETHER_DB,
            )
            self.eval_dbs[test_scheduling.TEST_CONFIG_GROUP_SCHEDULING_DB] = (
                test_scheduling.FAILING_TOGETHER_CONFIG_GROUP_DB,
            )
        elif granularity == "config_group":
            self.training_dbs.append(test_scheduling.TEST_CONFIG_GROUP_SCHEDULING_DB)
            self.eval_dbs[test_scheduling.TEST_CONFIG_GROUP_SCHEDULING_DB] = (
                test_scheduling.PAST_FAILURES_CONFIG_GROUP_DB,
                test_scheduling.TOUCHED_TOGETHER_DB,
            )

        self.cross_validation_enabled = False

        self.entire_dataset_training = True

        self.sampler = RandomUnderSampler(random_state=0)

        feature_extractors = [
            test_scheduling_features.prev_failures(),
        ]

        if granularity == "label":
            feature_extractors += [
                test_scheduling_features.platform(),
                # test_scheduling_features.chunk(),
                test_scheduling_features.suite(),
            ]
        elif granularity in ("group", "config_group"):
            feature_extractors += [
                test_scheduling_features.path_distance(),
                test_scheduling_features.common_path_components(),
                test_scheduling_features.touched_together(),
            ]

        self.extraction_pipeline = Pipeline(
            [
                (
                    "commit_extractor",
                    commit_features.CommitExtractor(feature_extractors, []),
                ),
                ("union", ColumnTransformer([("data", DictVectorizer(), "data")])),
            ]
        )

        self.clf = xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count())
        self.clf.set_params(predictor="cpu_predictor")
Code example #26
# ....................................
#
# ``sampling_strategy`` can be given a ``float``. For **under-sampling
# methods**, it corresponds to the ratio :math:`\\alpha_{us}` defined by
# :math:`N_{rM} = \\alpha_{us} \\times N_{m}` where :math:`N_{rM}` and
# :math:`N_{m}` are the number of samples in the majority class after
# resampling and the number of samples in the minority class, respectively.

# select only 2 classes since the ratio makes sense in this case
binary_mask = np.bitwise_or(y == 0, y == 2)
binary_y = y[binary_mask]
binary_X = X[binary_mask]

sampling_strategy = 0.8

rus = RandomUnderSampler(sampling_strategy=sampling_strategy)
X_res, y_res = rus.fit_resample(binary_X, binary_y)
print('Information of the iris data set after making it '
      'balanced using a float and an under-sampling method: \n '
      'sampling_strategy={} \n y: {}'.format(sampling_strategy,
                                             Counter(y_res)))
plot_pie(y_res)

###############################################################################
# For **over-sampling methods**, it corresponds to the ratio
# :math:`\\alpha_{os}` defined by :math:`N_{rm} = \\alpha_{os} \\times N_{M}`
# where :math:`N_{rm}` and :math:`N_{M}` are the number of samples in the
# minority class after resampling and the number of samples in the majority
# class, respectively.

ros = RandomOverSampler(sampling_strategy=sampling_strategy)
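The excerpt is cut off here; mirroring the under-sampling cell above, the over-sampler would presumably be applied and reported in the same way:

X_res, y_res = ros.fit_resample(binary_X, binary_y)
print('Information of the iris data set after making it '
      'balanced using a float and an over-sampling method: \n '
      'sampling_strategy={} \n y: {}'.format(sampling_strategy, Counter(y_res)))
plot_pie(y_res)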
Code example #27
    testSets = ['p2p_lendingclub_30.csv']
    classe = 'loan_status'
    # define the list of classifiers
    clfs = [
        GaussianNB(),
        tree.DecisionTreeClassifier(),
        linear_model.LogisticRegression()
    ]
    names = ["Naive Bayes", "Decision Tree", "Logistic Regression"]
    final_names = list()
    for set in testSets:
        for name in names:
            final_names.append(str(name + '_' + set[:-4]))

    # define the list of sampling techniques
    sTechniques = [RandomUnderSampler(random_state=1), SMOTE(random_state=1)]
    technique_names = ["RU", "SM"]

    def getParamsReSampling(reSamplingTechnique):
        if type(reSamplingTechnique) is SMOTETomek:
            return dict(smt__ratio=[0.8, 0.9, 1.0],
                        smt__k=[1, 3, 5, 7],
                        smt__m=[1, 3, 5, 7])

        elif type(reSamplingTechnique) is SMOTE:
            return dict(smt__kind=["regular", "borderline1", "borderline2"],
                        smt__ratio=[0.8, 0.9, 1.0],
                        smt__k_neighbors=[1, 3, 5, 7])

        elif type(reSamplingTechnique) is RandomUnderSampler:
            return dict(smt__ratio=[0.8, 0.9, 1.0])
Code example #28
          CountVectorizer(max_df=0.95,
                          min_df=10,
                          ngram_range=(2, 2),
                          stop_words=None,
                          strip_accents='unicode',
                          tokenizer=LemmaTokenizer())),
     ],  # 2-Gram Vectorizer
     transformer_weights={
         'vect1': 1.0,
         'vect2': 1.0,
     },
 ),
 TfidfTransformer(use_idf=True),
 RandomUnderSampler(sampling_strategy={
     1: 19000,
     2: 27200,
     3: 20000
 }, random_state=22),
 SelectFromModel(
     estimator=LinearSVC(),
     threshold='1.2*mean'),  # Dimensionality Reduction               
 #MLPClassifier(verbose=True, hidden_layer_sizes=(200,), max_iter=200, solver='sgd', learning_rate='adaptive', learning_rate_init=0.60, momentum=0.50, alpha=1e-01),)
 MLPClassifier(verbose=True,
               random_state=22,
               hidden_layer_sizes=(100, ),
               max_iter=200,
               solver='sgd',
               learning_rate='constant',
               learning_rate_init=0.07,
               momentum=0.90,
               alpha=1e-01),
Code example #29
def create_model(dataset):
    print("dataset : ", dataset)
    df = pd.read_csv('/home/farshid/Desktop/' + dataset, header=None)

    df['label'] = df[df.shape[1] - 1]

    df.drop([df.shape[1] - 2], axis=1, inplace=True)

    labelencoder = LabelEncoder()
    df['label'] = labelencoder.fit_transform(df['label'])

    X = np.array(df.drop(['label'], axis=1))
    y = np.array(df['label'])

    normalization_object = Normalizer()
    X = normalization_object.fit_transform(X)

    # This part is for stratified cross validation
    skf = StratifiedKFold(n_splits=5, shuffle=True)

    # This part is for Random Undersampling
    sampler = RandomUnderSampler()

    top_roc = 0
    for depth in range(2, 20, 1):
        for split in range(2, 9, 1):

            all_auc = []
            all_aupr = []

            classifier = AdaBoostClassifier(DecisionTreeClassifier(
                max_depth=depth, min_samples_split=split),
                                            n_estimators=100,
                                            learning_rate=1,
                                            algorithm='SAMME')

            for train_index, test_index in skf.split(X, y):
                X_train = X[train_index]
                X_test = X[test_index]

                y_train = y[train_index]
                y_test = y[test_index]

                X_train, y_train = sampler.fit_resample(X_train, y_train)

                classifier.fit(X_train, y_train)

                predictions = classifier.predict_proba(X_test)

                all_auc.append(roc_auc_score(y_test, predictions[:, 1]))
                all_aupr.append(
                    average_precision_score(y_test, predictions[:, 1]))

            average_auc = sum(all_auc) / len(all_auc)
            average_aupr = sum(all_aupr) / len(all_aupr)

            # print("for depth", depth, " and split ", split, "roc = ", average_auc)
            if average_auc > top_roc:
                print(dataset,
                      " for depth",
                      depth,
                      " split ",
                      split,
                      "roc = ",
                      average_auc,
                      " aupr ",
                      average_aupr,
                      end=' ')
                joblib.dump(classifier,
                            '/home/farshid/Desktop/models/' + dataset + '.pkl')
                top_roc = average_auc
                print("stored !!!!")
Code example #30
import time
tic = time.perf_counter()  # time.clock() was removed in Python 3.8

#Read in preprocessed data
data_train = pd.read_pickle('train.pkl')
data_test = pd.read_pickle('test.pkl')

y_values = data_train['score'].values
data_train.drop(['score'], axis = 1, inplace = True)

columns = data_train.columns
x_values = data_train.values

# Apply the random under-sampling (currently disabled; the raw data is used below)
rus = RandomUnderSampler()
# x_train, y_train = rus.fit_resample(x_values, y_values)
x_train, y_train = x_values, y_values

gbc = GradientBoostingClassifier()
gbc.fit(x_train, y_train)

joblib.dump(gbc, 'GradientBoosting.pkl')

query_id = np.asarray(data_test.index.values)
y_test = data_test['score'].values
data_test.drop(['score'], axis = 1, inplace = True)
x_test = data_test.values
predictions = gbc.predict(x_test)
data = np.concatenate((query_id.reshape((-1, 1)),y_test.reshape((-1, 1))),axis=1 )
data = np.concatenate((data,predictions.reshape((-1, 1))),axis=1 )