# index = len(l) - 1
# # print(index)
# feature_list_of_all_instances.append(l[0:index])
# class_list_of_all_instances.append(l[index])
# total_matrix = []
gc.collect()

# Count the positive instances (class label == 1).
c = 0  # assumed starting point; the counter may be initialised earlier in the full script
for i in class_list_of_all_instances:
    if i == 1:
        c += 1
print("Positive data ", c)

kf = StratifiedKFold(n_splits=5, shuffle=True)
under_sample = RandomUnderSampler()

print("Starting K fold data to Classifier ... ")
avg_roc = 0
for train_set_indexes, test_set_indexes in kf.split(
        feature_list_of_all_instances, class_list_of_all_instances):
    # temp_train_feature_list = feature_list_of_all_instances[train_set_indexes]
    # temp_train_class_list = class_list_of_all_instances[train_set_indexes]
    temp_train_feature_list = []
    temp_train_class_list = []
    for index in train_set_indexes:
        temp_train_feature_list.append(feature_list_of_all_instances[index])
        temp_train_class_list.append(class_list_of_all_instances[index])

    temp_test_feature_list = []
    'LinearSVC': LinearSVC(random_state=0, loss='squared_hinge',
                           penalty='l1', C=1e-2, max_iter=1000,
                           fit_intercept=True, dual=False, tol=1e-4),
    'Random Forest': RandomForestClassifier(random_state=0, n_estimators=300,
                                            min_samples_leaf=10,
                                            criterion='gini',
                                            max_features=None,
                                            max_depth=None),
    'Neural Network': MLPClassifier(random_state=0, activation='tanh',
                                    solver='adam', alpha=1e-2, beta_1=0.7,
                                    beta_2=0.9, max_iter=1000,
                                    hidden_layer_sizes=100),
    'Logistic Regression': LogisticRegression(random_state=0, penalty='l1',
                                              C=1e-3, max_iter=100,
                                              solver='saga',
                                              fit_intercept=True, dual=False,
                                              tol=1e-4),
}

print("Execution Model AUPRC AUROC ACC F1 Precision Recall")

# For each base learner...
for learner_name, learner in base_learners.items():
    # Undersample the majority class, which is "False".
    counter = Counter(exec_training_target)
    true_count = counter[True]
    false_count = int(counter[False] * UNDERSAMPLING_RATES)
    undersampler = RandomUnderSampler(
        sampling_strategy={True: true_count, False: false_count})

    # Build the pipeline.
    steps = [('under', undersampler), ('scale', StandardScaler()),
             ('learner', learner)]
    pipeline = Pipeline(steps=steps)
    pipeline.fit(exec_training_features, exec_training_target)

    # Prediction.
    predicted = pipeline.predict(exec_test_features)

    # Evaluation.
    acc = accuracy_score(exec_test_target, predicted)
    precision, recall, f1, _ = precision_recall_fscore_support(
        exec_test_target, predicted, average='binary')

    if hasattr(pipeline, "predict_proba"):
# Deal with imbalanced class sizes below
# Make the data 1D for compatibility with the resampling methods
X_trainShape = X_train.shape[1] * X_train.shape[2] * X_train.shape[3]
X_testShape = X_test.shape[1] * X_test.shape[2] * X_test.shape[3]
X_trainFlat = X_train.reshape(X_train.shape[0], X_trainShape)
X_testFlat = X_test.reshape(X_test.shape[0], X_testShape)
#print("X_train Shape: ", X_train.shape)
#print("X_test Shape: ", X_test.shape)
#print("X_trainFlat Shape: ", X_trainFlat.shape)
#print("X_testFlat Shape: ", X_testFlat.shape)

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

#ros = RandomOverSampler(sampling_strategy='auto')
# `ratio` was renamed to `sampling_strategy` and `fit_sample` to
# `fit_resample` in imbalanced-learn 0.4.
ros = RandomUnderSampler(sampling_strategy='auto')
X_trainRos, Y_trainRos = ros.fit_resample(X_trainFlat, Y_train)
X_testRos, Y_testRos = ros.fit_resample(X_testFlat, Y_test)

# Encode labels as one-hot vectors (e.g. 2 -> [0,0,1,0,0,0,0,0,0,0])
Y_trainRosHot = np_utils.to_categorical(Y_trainRos, num_classes=2)
Y_testRosHot = np_utils.to_categorical(Y_testRos, num_classes=2)
#print("X_train: ", X_train.shape)
#print("X_trainFlat: ", X_trainFlat.shape)
#print("X_trainRos Shape: ", X_trainRos.shape)
#print("X_testRos Shape: ", X_testRos.shape)
#print("Y_trainRosHot Shape: ", Y_trainRosHot.shape)
#print("Y_testRosHot Shape: ", Y_testRosHot.shape)

for i in range(len(X_trainRos)):
    height, width, channels = 50, 50, 3
def clustering(self, target: str):
    y: np.ndarray = self.data.pop(target).values
    X: np.ndarray = self.data.values

    # RandomUnderSampler: cap the majority class (label 1) at 1000 samples.
    # (The original passed both `sampling_strategy` and the deprecated
    # `ratio`; the dict form expresses the evident intent.)
    sampler = RandomUnderSampler(sampling_strategy={1: 1000})
    X_norm1, y = sampler.fit_resample(X, y)

    # MinMaxScaler
    scaler = MinMaxScaler()
    X_norm = scaler.fit_transform(X_norm1)

    # SelectKBest
    X_new = SelectKBest(f_classif, k=2).fit_transform(X_norm1, y)
    y_trueK = cluster.KMeans(n_clusters=4, random_state=1).fit_predict(X_new)

    # PCA
    X_PCA = PCA(n_components=2).fit_transform(X_norm)
    y_trueP = cluster.KMeans(n_clusters=4, random_state=1).fit_predict(X_PCA)

    silh = []
    rand = []
    homogeneity = []
    index = ['Std.Scaler', 'PCA']

    print("Silhouette[KBest] =",
          metrics.silhouette_score(X_new, y_trueK, metric='euclidean'))
    print("RI[KBest] =", metrics.adjusted_rand_score(y, y_trueK))
    print('Homogeneity[KBest] =', metrics.homogeneity_score(y, y_trueK))
    silh.append(metrics.silhouette_score(X_new, y_trueK, metric='euclidean'))
    rand.append(metrics.adjusted_rand_score(y, y_trueK))
    homogeneity.append(metrics.homogeneity_score(y, y_trueK))

    print("Silhouette[PCA] =",
          metrics.silhouette_score(X_PCA, y_trueP, metric='euclidean'))
    print("RI[PCA] =", metrics.adjusted_rand_score(y, y_trueP))
    print('Homogeneity[PCA] =', metrics.homogeneity_score(y, y_trueP))
    silh.append(metrics.silhouette_score(X_PCA, y_trueP, metric='euclidean'))
    rand.append(metrics.adjusted_rand_score(y, y_trueP))
    homogeneity.append(metrics.homogeneity_score(y, y_trueP))

    # Plotting
    plt.figure(figsize=(12, 6))
    plt.suptitle('Clustering for CT Dataset - KMeans', fontsize="x-large")
    grid = plt.GridSpec(1, 2, wspace=0.4, hspace=0.5)

    plt.subplot(grid[0, 0])
    plt.scatter(X_new[:, 0], X_new[:, 1], c=y_trueK)
    plt.title("SelectKBest")

    plt.subplot(grid[0, 1])
    plt.scatter(X_PCA[:, 0], X_PCA[:, 1], c=y_trueP)
    plt.title("PCA")
    plt.show()
###############################################################################
# With the controlled under-sampling methods, the number of samples to be
# selected can be specified. ``RandomUnderSampler`` is the most naive way of
# performing such a selection: it randomly picks the requested number of
# samples from the targeted class.

fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 6))
X, y = create_dataset(n_samples=5000, weights=(0.01, 0.05, 0.94),
                      class_sep=0.8)

clf = LinearSVC().fit(X, y)
plot_decision_function(X, y, clf, ax1)
ax1.set_title('Linear SVC with y={}'.format(Counter(y)))

sampler = RandomUnderSampler(random_state=0)
clf = make_pipeline(sampler, LinearSVC())
clf.fit(X, y)
plot_decision_function(X, y, clf, ax2)
ax2.set_title('Decision function for {}'.format(sampler.__class__.__name__))
plot_resampling(X, y, sampler, ax3)
ax3.set_title('Resampling using {}'.format(sampler.__class__.__name__))
fig.tight_layout()

###############################################################################
# ``NearMiss`` algorithms implement heuristic rules to select samples.
# NearMiss-1 selects samples from the majority class for which the average
# distance to the :math:`k` nearest samples of the minority class is the
# smallest. NearMiss-2 selects the samples from the majority class for which
# the average distance to the :math:`k` farthest samples of the minority
# class is the smallest. NearMiss-3 is a 2-step algorithm: first, for each minority
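###############################################################################
# A minimal sketch (an addition to the original example): the three NearMiss
# variants described above differ only in their ``version`` parameter, so
# their resampled class counts can be compared directly on the same data.

from imblearn.under_sampling import NearMiss

for version in (1, 2, 3):
    nm = NearMiss(version=version)
    X_nm, y_nm = nm.fit_resample(X, y)
    print('NearMiss-{} resampled class counts: {}'.format(version,
                                                          Counter(y_nm)))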
     linear_model.LogisticRegression(),
     {'C': np.logspace(0, 3, 4),
      'penalty': ['l1', 'l2'],
      }),
    ("Decision Tree", 'cart',
     tree.DecisionTreeClassifier(min_samples_leaf=min_samples_leaf,
                                 criterion=criterion),
     {'max_depth': max_depth,
      #'max_features': [3, 5, 10, None],
      #'splitter': ['best', 'random'],
      'criterion': ['entropy', 'gini'],
      }),
    ("RandomUnderSampling", 'rus',
     Pipeline([('res', RandomUnderSampler()),
               ('tree', tree.DecisionTreeClassifier(
                   min_samples_leaf=min_samples_leaf,
                   criterion=criterion))]),
     {'tree__max_depth': max_depth,
      }),
    ("SMOTE", 'smt',
     Pipeline([('res', SMOTE()),
               ('tree', tree.DecisionTreeClassifier(
                   min_samples_leaf=min_samples_leaf,
                   criterion=criterion))]),
     {'tree__max_depth': max_depth,
      }),
    ("UnderBagging", 'ub',
X = np.array(df.drop(['label'], axis=1))
y = np.array(df['label'])

normalization_object = Normalizer()
X = normalization_object.fit_transform(X)

number_of_folds = 7
best_clustered_trees_average_auc = 0
best_one_tree_average_auc = 0
best_cluster = 0

skf = StratifiedKFold(n_splits=number_of_folds, shuffle=True)
sampler = RandomUnderSampler()

fold_counTer = 0
number_of_clusters = 23  # this is a hyperparameter
trees = {}

all_auc_with_clustered_trees = []
all_auc_with_one_tree = []

X_train_major = np.zeros((0, 1294))
y_train_major = []

avg_roc = 0
avg_aupr = 0

for train_index, test_index in skf.split(X, y):
    X_train = X[train_index]
    X_test = X[test_index]
    results['precision_score'] = precision_score(test_y, pred_y)
    results['roc_auc_score'] = roc_auc_score(test_y, pred_y)
    return results


X_train = np.loadtxt('X_train')
X_test = np.loadtxt('X_test')
y_train = np.loadtxt('y_train')
y_test = np.loadtxt('y_test')

from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(random_state=0, replacement=False)
# `fit_sample` was deprecated in favour of `fit_resample` in imblearn 0.4.
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)

log_reg_param_grid = {'C': [0.01, 0.1, 1, 10, 100],
                      'max_iter': [100, 200, 300, 500],
                      'penalty': ['l2'],
                      'class_weight': ['balanced', None],
                      'tol': [1e-6, 1e-5, 1e-4, 1e-3, 1e-2]}
def main():
    input_file = sys.argv[1]
    embedding_file = sys.argv[2]
    is_TFIDF = sys.argv[3]
    test_file = sys.argv[4]

    if not os.path.exists('../models/svm_model.sav'):
        input_data = []
        with codecs.open(input_file, 'r', 'utf-8') as in_obj:
            for line in in_obj:
                line = line.strip().strip('\n\r')
                input_data.append(line.split('\t'))
        filtered_input_data = filter_tweets(input_data)

        tweet_text_vector = []
        labels = []
        for i in range(len(filtered_input_data)):
            labels.append(filtered_input_data[i][1])

        if is_TFIDF.strip().lower() == 'true' and not os.path.exists(
                input_file + '_tfidf.csv'):
            tweet_text_list = []
            if os.path.exists(input_file + '_class_inp.tsv'):
                with codecs.open(input_file + '_class_inp.tsv', 'r',
                                 'utf-8') as class_file_obj:
                    for line in class_file_obj:
                        line = line.split('\t')
                        tweet_text_vector.append(' '.join(
                            preprocess_tweet_text(line[0])))
                vectorizer = TfidfVectorizer(ngram_range=(1, 3))
                X_transform = vectorizer.fit_transform(
                    tweet_text_vector).toarray()
                print(X_transform[:10])
                np.savetxt(input_file + '_tfidf.csv', X_transform,
                           delimiter='\t')
            else:
                raise NameError(
                    'please provide label annotated texts file named '
                    '%s_class_inp.tsv' % input_file)

        if not os.path.exists(input_file + '_embed.csv') and \
                is_TFIDF.strip().lower() != 'true':
            tweet_text_list = []
            for i in range(len(filtered_input_data)):
                tweet_text = get_tweet_text(filtered_input_data[i][0])
                tweet_text_list.append(tweet_text)
                tweet_text_vector.append(preprocess_tweet_text(tweet_text))
            X_transform = get_embedding_vector(tweet_text_vector,
                                               embedding_file)
            out_data = np.append(np.transpose([np.array(tweet_text_list)]),
                                 np.transpose([labels]), axis=1)
            np.savetxt(input_file + '_class_inp.tsv', out_data, fmt='%s %s',
                       delimiter='\t')
            np.savetxt(input_file + '_embed.csv', X_transform, delimiter='\t')

        if is_TFIDF.strip().lower() != 'true':
            X_transform = np.loadtxt(input_file + '_embed.csv', delimiter='\t')
        else:
            X_transform = np.loadtxt(input_file + '_tfidf.csv', delimiter='\t')

        scaler = MinMaxScaler()
        X_transform_scaled = scaler.fit_transform(X_transform)
        print(len(X_transform_scaled))

        le = LabelEncoder()
        y = le.fit_transform(labels)
        print(len(y))

        rus = RandomUnderSampler(random_state=0)
        X_resample, y_resample = rus.fit_resample(X_transform_scaled, y)
        print(len(X_resample), len(y_resample))

        svm_clf = SVC(C=10, kernel='rbf', gamma='scale',
                      probability=True).fit(X_resample, y_resample)
        filename = '../models/svm_model.sav'
        pickle.dump(svm_clf, open(filename, 'wb'))

    # test data preprocessing
    loaded_model = pickle.load(open('../models/svm_model.sav', 'rb'))
    test_data = []
    test_id = []
    cnt_test = 0
    with codecs.open(test_file, 'r', 'utf-8') as in_obj:
        for line in in_obj:
            cnt_test += 1
            line = line.strip().strip('\n\r')
            test_data.append(line.split('\t')[-1])
            test_id.append(line.split('\t')[0])
    print("input file count = %s" % cnt_test)
    print("test_data_count = %s" % len(test_data))

    tweet_test_text_vector = []
    for i in range(len(test_data)):
        tweet_test_text_vector.append(preprocess_tweet_text(test_data[i]))
    print("test text vector count = %s" % len(tweet_test_text_vector))

    X_test_transform = get_embedding_vector(tweet_test_text_vector,
                                            embedding_file)
    scaler = MinMaxScaler()
    X_test_transform_scaled = scaler.fit_transform(X_test_transform)
    print(len(X_test_transform_scaled))

    out_data = loaded_model.predict_proba(X_test_transform_scaled)
    np.savetxt(test_file + '_results.tsv', out_data, fmt='%s\t%s',
               delimiter='\t')

    with codecs.open(test_file + '_results_f8.tsv', 'w', 'utf-8') as f8_obj:
        with codecs.open(test_file + '_results.tsv', 'r',
                         'utf-8') as prediction_obj:
            for i, line in enumerate(prediction_obj):
                line = line.split('\t')
                # line[1] is a string; convert before the numeric comparison
                if float(line[1]) >= 0.8:
                    f8_obj.write(test_id[i] + '\t' + test_data[i] + '\n')
def load_features(task):
    log_file = log_dir + 'loading_task_' + str(task['pref_id']) + '.txt'
    load_logger = logger(log_file, task)

    dataset_prediction_task_to_outcomes = {
        'all_one_trace_type': {
            'two': ['line', 'bar'],
            'three': ['line', 'scatter', 'bar'],
            'six': ['line', 'scatter', 'bar', 'box', 'histogram', 'pie'],
        },
        'has_single_src': {
            'two': [True, False]
        },
        'num_x_axes': {
            'numeric': [i for i in range(5)]
        },
        'num_y_axes': {
            'numeric': [i for i in range(5)]
        }
    }

    field_prediction_task_to_outcomes = {
        'trace_type': {
            'two': ['line', 'bar'],
            'three': ['line', 'scatter', 'bar'],
            'six': ['line', 'scatter', 'bar', 'box', 'histogram', 'heatmap'],
        },
        'is_xsrc': {
            'two': [True, False]
        },
        'is_ysrc': {
            'two': [True, False]
        },
        'is_x_or_y': {
            'two': ['x', 'y']
        },
        'is_single_src': {
            'two': [True, False]
        }
    }

    if task['dataset'] == 'dataset':
        task['features_df_file_name'] = 'features_aggregate_single_pairwise.csv'
        task['outcomes_df_file_name'] = 'chart_outcomes.csv'
        task['id_field'] = 'fid'
        prediction_task_to_outcomes = dataset_prediction_task_to_outcomes
    else:
        assert task['dataset'] == 'field'
        task['features_df_file_name'] = 'field_level_features.csv'
        task['outcomes_df_file_name'] = 'field_level_outcomes.csv'
        task['id_field'] = 'field_id'
        prediction_task_to_outcomes = field_prediction_task_to_outcomes

    features_df = pd.read_csv(
        join(features_directory, task['features_df_file_name']),
        nrows=num_datapoints)
    outcomes_df = pd.read_csv(
        join(features_directory, task['outcomes_df_file_name']),
        nrows=num_datapoints)
    feature_names_by_type = pickle.load(
        open(join(features_directory, feature_set_lookup_file_name), 'rb'))

    # print(features_df)
    # print('Initial Features:', features_df.shape)
    # print('Initial Outcomes:', outcomes_df.shape)
    # load_logger.log_dict(feature_names_by_type)
    # load_logger.log('\n')
    # load_logger.log(features_df)
    load_logger.log('Initial Features: ' + str(features_df.shape))
    load_logger.log('Initial Outcomes: ' + str(outcomes_df.shape))

    if task['dataset'] == 'field':
        def is_x_or_y(is_xsrc, is_ysrc):
            if is_xsrc and pd.isnull(is_ysrc):
                return 'x'
            if is_ysrc and pd.isnull(is_xsrc):
                return 'y'
            return None

        outcomes_df['is_x_or_y'] = np.vectorize(is_x_or_y)(
            outcomes_df['is_xsrc'], outcomes_df['is_ysrc'])
        outcomes_df['is_single_src'] = (outcomes_df['is_single_xsrc']
                                        | outcomes_df['is_single_ysrc'])

    outcomes_df_subset = format_outcomes_df(
        load_logger, outcomes_df, task['outcome_variable_name'],
        prediction_task_to_outcomes[task['outcome_variable_name']][
            task['prediction_task']],
        id_field=task['id_field'])
    final_df = join_features_and_outcomes(features_df, outcomes_df_subset,
                                          on=task['id_field'])
    last_index = final_df.columns.get_loc(task['outcome_variable_name'])
    X = final_df.iloc[:, :last_index]
    y = final_df.iloc[:, last_index]

    # print('Intermediate Outcomes:', y.shape)
    # value_counts = y.value_counts()
    # print('Value counts:')
    # print(value_counts)
    load_logger.log('Final DF Shape: ' + str(final_df.shape))
    load_logger.log('Last Index: ' + str(last_index))
    load_logger.log('Intermediate Outcomes: ' + str(y.shape))
    load_logger.log('Value counts: \n' + str(y.value_counts()))

    # delete variables to save memory!
    del final_df, outcomes_df

    task_types = ['dimensions', 'types', 'values', 'names']
    for task_name in task_types:
        names = get_feature_set_names_by_type(
            feature_names_by_type,
            task_type=task['dataset'],
            feature_set=task_name)
        indices = [X.columns.get_loc(c) for c in names if c in X.columns]
        # print('task is ' + task_name + ' and indices are:')
        # print('names are {}'.format(names))
        # print(indices)
        # load_logger.log('task is ' + task_name + ' and indices are: ')
        # load_logger.log(indices)

    y = pd.get_dummies(y).values.argmax(1)

    if task['sampling_mode'] == 'over':
        res = RandomOverSampler(random_state=RANDOM_STATE)
        X, y = res.fit_resample(X, y)
    elif task['sampling_mode'] == 'under':
        res = RandomUnderSampler(random_state=RANDOM_STATE)
        X, y = res.fit_resample(X, y)
    elif isinstance(task['sampling_mode'], int):
        X_resampled_arrays, y_resampled_arrays = [], []
        for outcome in np.unique(y):
            outcome_mask = (y == outcome)
            X_resampled_outcome, y_resampled_outcome = resample(
                X[outcome_mask],
                y[outcome_mask],
                n_samples=task['sampling_mode'],
                random_state=RANDOM_STATE)
            X_resampled_arrays.append(X_resampled_outcome)
            y_resampled_arrays.append(y_resampled_outcome)
        X = np.concatenate(X_resampled_arrays).astype(np.float64)
        y = np.concatenate(y_resampled_arrays)
    else:
        X, y = X.values.astype(np.float64), y

    # print('Final Features:', X.shape)
    # print('Final Outcomes:', y.shape)
    load_logger.log('Final Features: ' + str(X.shape))
    load_logger.log('Final Outcomes: ' + str(y.shape))

    unique, counts = np.unique(y, return_counts=True)
    load_logger.log('Value counts after sampling:')
    load_logger.log_dict(dict(zip(unique, counts)))
    load_logger.log('\n')
    del load_logger

    return util.unison_shuffle(X, y)
    1: weights[1],
    2: weights[2],
    3: weights[3],
    4: weights[4]
}
over = SMOTE(sampling_strategy=ratio_over, random_state=314)
X_train, y_train = over.fit_resample(X_train, y_train)

# Undersample classes with more than the average number of samples
ratio_under = {
    0: average_samples,
    1: average_samples,
    2: average_samples,
    3: average_samples,
    4: average_samples
}
under = RandomUnderSampler(sampling_strategy=ratio_under, random_state=314)
X_train, y_train = under.fit_resample(X_train, y_train)

# Configure the cross-validation procedure
cv_inner = LeaveOneOut()

# Hyperparameter grid
batch_size = [8, 16, 32]
neurons = [30, 40, 50]
hidden_layers = [1, 2, 3]
epochs = [10, 50, 100]
activation = ['relu', 'tanh', 'sigmoid', 'linear']
param_grid = dict(batch_size=batch_size,
                  neurons=neurons,
                  hidden_layers=hidden_layers,
                  epochs=epochs,
# Split the data with stratification because of the class imbalance
y = df.pop("TARGET")
X = df
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, stratify=y, random_state=RSEED)

# Save the split test data
X_test.to_csv("Home_Loan/X_test.csv", index=False)
y_test.to_csv("Home_Loan/y_test.csv", index=False)
print("Test data successfully saved to Home_Loan.")

# Clean the data
X_train_cleaned, y_train_cleaned = clean_data(X_train, y_train, test=False)
print("Data successfully cleaned.")

# Balance the high class imbalance with an undersampling strategy
undersample = RandomUnderSampler(sampling_strategy='majority',
                                 random_state=RSEED)
X_train_balanced, y_train_balanced = undersample.fit_resample(
    X_train_cleaned, y_train_cleaned)
print("Data successfully balanced.")

# Select features
features = get_features()

# Train the model
model = RandomForestClassifier(n_estimators=196,
                               min_samples_split=2,
                               max_leaf_nodes=49,
                               max_depth=17,
                               bootstrap=True,
                               max_features='auto',
                               min_weight_fraction_leaf=0.1,
                               n_jobs=-1,
                               random_state=RSEED,
                               verbose=1)
model.fit(X_train_balanced[features], y_train_balanced)
print("Model successfully fitted.")
def do_sampling(upsampling, train_features, train_truth, smote=False):
    X_train = train_features
    y_train = train_truth.label

    if smote is False:
        X = pd.concat([X_train, y_train], axis=1)
        completed = X[X.label == 0]
        drop_out = X[X.label == 1]

        if upsampling is True:
            sampled = resample(
                completed,
                replace=True,  # sample with replacement
                n_samples=len(drop_out),  # match number in majority class
                random_state=27)  # reproducible results
            sampled = pd.concat([drop_out, sampled])
        elif upsampling is False:
            sampled = resample(
                drop_out,
                replace=False,  # sample without replacement
                n_samples=len(completed),  # match number in majority class
                random_state=27)  # reproducible results
            sampled = pd.concat([sampled, completed])

        if upsampling is not None:
            sampled = sampled.sample(frac=1).reset_index(drop=True)
            print(sampled.label.value_counts())
            y_train = sampled.label
            X_train = sampled.drop('label', axis=1)
    elif smote is True:
        feature_columns = X_train.columns
        if upsampling is True:
            upsample = SMOTE()
            X_train, y_train = upsample.fit_resample(X_train, y_train)
        elif upsampling is False:
            # Combine moderate SMOTE over-sampling with random under-sampling
            over = SMOTE(sampling_strategy=0.4)
            under = RandomUnderSampler(sampling_strategy=0.5)
            steps = [('o', over), ('u', under)]
            pipeline = Pipeline(steps=steps)
            X_train, y_train = pipeline.fit_resample(X_train, y_train)

        # `feature_columns` (captured above) fixes a NameError: the original
        # referenced an undefined `columns` here.
        features = pd.DataFrame(data=X_train, columns=feature_columns)
        label = pd.DataFrame(data=y_train, columns=['label'])
        X = pd.concat([features, label], axis=1)
        X = X.sample(frac=1).reset_index(drop=True)
        print(X.label.value_counts())
        y_train = X.label
        X_train = X.drop('label', axis=1)

    return X_train, y_train
df = pd.read_csv('../Data/train.csv', header=0)
X = df.iloc[:, 0:-1].copy()
Y = df.iloc[:, -1].copy()
#scaler = StandardScaler()
#X = scaler.fit_transform(X)

df = pd.read_csv('../Data/validation.csv', header=0)
X_valid = df.iloc[:, 0:-1].copy()
Y_valid = df.iloc[:, -1].copy()
#scaler = StandardScaler()
#X_valid = scaler.fit_transform(X_valid)

# Handle the imbalance with an undersampling strategy
rus = RandomUnderSampler(sampling_strategy=0.8)
X_res, Y_res = rus.fit_resample(X, Y)

# Handle the imbalance with an oversampling strategy
ros = RandomOverSampler(random_state=0)
X_resampled, Y_resampled = ros.fit_resample(X, Y)

# Handle the imbalance with SMOTE (`fit_sample` is deprecated)
SM = SMOTE(random_state=0)
X_smote, Y_smote = SM.fit_resample(X, Y)

score_infor = [[], [], [], []]
roc_auc_score_infor = [[], [], [], []]
f1_score_infor = [[], [], [], []]
#print(pd.value_counts(Y_smote))
"""### Oversampling and Undersampling""" from imblearn.over_sampling import SMOTE from imblearn.ensemble import EasyEnsemble from imblearn.under_sampling import RandomUnderSampler bg_sm = SMOTE(random_state=10000) bg_train_X_res, bg_train_y_res = bg_sm.fit_sample(bg_train_X, bg_train_y) bg_train_y_res = pd.DataFrame(bg_train_y_res, columns = ["gender"]) bg_train_X_res = pd.DataFrame(bg_train_X_res, columns = ['star_sign','phone_os','height','weight','sleepiness','iq','fb_friends']) bg_us = RandomUnderSampler(random_state=10000) bg_train_X_red, bg_train_y_red = bg_us.fit_sample(bg_train_X, bg_train_y) bg_train_y_red = pd.DataFrame(bg_train_y_red, columns = ["gender"]) bg_train_X_red = pd.DataFrame(bg_train_X_red, columns = ['star_sign','phone_os','height','weight','sleepiness','iq','fb_friends']) print(str(pd.value_counts(bg_train_y_res.values.flatten()))) """## Classification ### decision tree """
Train_X = data_train.iloc[0:575212, 2:217]
cv_X = data_train.iloc[575213:585212, 2:217]
Train_Y = data_train.iloc[0:575212, 1]
cv_Y = data_train.iloc[575213:585212, 1:2]
Test_X = data_train.iloc[585213:595212, 2:217]
Test_Y = data_train.iloc[585213:595212, 1:2]
id_test = data_test.iloc[:, 0:1]
data_test = data_test.iloc[:, 1:216]

# Random undersampling of the majority class
#from collections import Counter
#from sklearn.datasets import fetch_mldata
from imblearn.under_sampling import RandomUnderSampler

# `ratio` was renamed to `sampling_strategy` in imblearn 0.4
rus = RandomUnderSampler(sampling_strategy='majority')
Train_X_res, Train_Y_res = rus.fit_resample(Train_X, Train_Y)
print("Y=1(%): ", str(np.sum(Train_Y_res) / Train_X_res.shape[0]))

## Without undersampling
# Train_X_res = Train_X
# Train_Y_res = Train_Y

# Normalizing the data (the small epsilon avoids division by zero)
maxi = np.max(Train_X_res, axis=0)
avgi = np.average(Train_X_res, axis=0)
eps = 1e-19
Train_X_res = (Train_X_res - avgi) / (maxi + eps)
cv_X = (cv_X - avgi) / (maxi + eps)
Test_X = (Test_X - avgi) / (maxi + eps)
data_test = (data_test - avgi) / (maxi + eps)
#print("Accuracy After ROC-1: %.2f%%" % (roc_auc1 * 100.0)) pre_scor1 = precision_score(y_test, y_pred_over1) re_scor1 = recall_score(y_test, y_pred_over1) f1_scor1 = f1_score(y_test, y_pred_over1) ##print("\n ROC AUC Score: %.2f%%" % (roc_auc * 100.0)) #print("\n Precision Score-1: %.2f%%" % (pre_scor1 * 100.0)) #print("\n Recall Score-1: %.2f%%" % (re_scor1 * 100.0)) #print('\n F1-Measure-1: %.2f%%' % (f1_scor1 * 100.0)) #precision, recall, thresholds = precision_recall_curve(y_test, y_pred_over1) data2 = np.array(two_split[1]).astype(np.float) X2, y2 = prep_data1(data2) # Random UnderSampling over2 = RandomUnderSampler() # resample the training data X_over2, y_over2 = over2.fit_sample(X2, y2) #After resampling again accuracy count model = KNeighborsClassifier() model.fit(X_over2, y_over2) y_pred_over2 = model.predict(X_test) accuracy2 = accuracy_score(y_test, y_pred_over2) print("Accuracy After RandomOverSlice-2: %.2f%%" % (((accuracy1 + accuracy2) / 2) * 100.0)) roc_auc2 = roc_auc_score(y_test, y_pred_over2) print("Accuracy After ROC-2: %.2f%%" % ((roc_auc1 + roc_auc2) / 2 * 100.0))
def use_parameters(self, X_train, selected_features):
    '''
    Returns
    -------
    '''
    test_scaler = [
        StandardScaler(),
        RobustScaler(),
        QuantileTransformer(),
        Normalizer()
    ]
    test_sampling = [
        modelutil.Nosampler(),
        ClusterCentroids(),
        RandomUnderSampler(),
        # NearMiss(version=1),
        # EditedNearestNeighbours(),
        # AllKNN(),
        # CondensedNearestNeighbour(random_state=0),
        # InstanceHardnessThreshold(random_state=0,
        #                           estimator=LogisticRegression(solver='lbfgs', multi_class='auto')),
        RandomOverSampler(random_state=0),
        SMOTE(),
        BorderlineSMOTE(),
        SMOTEENN(),
        SMOTETomek(),
        ADASYN()
    ]

    ### XGBOOST
    parameters = [{
        'scaler': test_scaler,
        'sampling': test_sampling,
        'feat__cols': selected_features,
        # 'logloss' is an eval metric, not an objective, in XGBoost
        'model__objective': ['binary:logistic'],
        'model__learning_rate': [0.005, 0.01, 0.05, 0.1, 0.5],  # the so-called `eta` value
        'model__max_depth': [3, 4, 5],
        'model__min_child_weight': [1, 5, 11, 12, 15],
        'model__silent': [0],
        'model__subsample': [0.6, 0.8, 1.0],
        'model__colsample_bytree': [0.6, 0.8, 1.0],
        'model__n_estimators': [5, 50, 100],  # number of trees; raise to 1000 for better results
        'model__missing': [-999],
        'model__gamma': [0.5, 1, 1.5, 2, 5],
        'model__seed': [1337]
    }]

    # If there are no missing values, only one imputer strategy shall be used
    if X_train.isna().sum().sum() > 0:
        # `parameters` is a list of grids, so index the dict first
        parameters[0]['imputer__strategy'] = [
            'mean', 'median', 'most_frequent'
        ]
        print("Missing values used. Test different imputer strategies")
    else:
        print("No missing values. No imputer necessary")

    print("Selected Parameters: ", parameters)
    # else: print("Parameters defined in the input: ", parameters)

    ### XGBOOST
    return parameters
labels, tweet = tweets_labels(tweets, annotated_list)
print("Number of tweets: {}".format(len(tweet)))  # 50339 - paris
print("Number of Solidarity english tweets: {}".format(labels.count(1)))  # 20465
print("Number of Non-Solidarity english tweets: {}".format(labels.count(-1)))  # 29874

# In[41]:

df_english = pd.DataFrame(index=range(len(tweet)))
df_english['tweets'] = pd.DataFrame(tweet)
df_english['labels'] = pd.DataFrame(labels)
df_english.to_csv('latest_paris_unique.csv', index=False)

rus = RandomUnderSampler()

# In[78]:

# RandomUnderSampler expects a 2-D feature array, so wrap each tweet in a list
tweet_2D = []
for tw in tweet:
    tweet_2D.append([tw])

tweet_sampled, labels_sampled = rus.fit_resample(tweet_2D, labels)

plt.hist(labels_sampled)
plt.show()

tweet_sampled_1D = []
for tweet in tweet_sampled:
num_try = 1
try_dict = dict()
highest_acc = 0.0

internal_feature_data = np.concatenate((feature_data_2013, feature_data_2018))
internal_feature_label = np.concatenate(
    (feature_label_2013_class1, feature_label_2018_class1))
internal_clinical_data = np.concatenate(
    (clinical_data_2013, clinical_data_2018))

external_feature_data = feature_data_2012
external_feature_label = feature_label_2012_class1
external_clinical_data = clinical_data_2012

rus = RandomUnderSampler(random_state=42)

class_3_index = ['normal label', 'penia label', 'porosis label']
class_3_cols = ['normal prediction', 'penia prediction', 'porosis prediction']
class_2_index = ['normal label', 'porosis & penia label']
class_2_cols = ['normal prediction', 'porosis & penia prediction']
class_1_index = ['normal&penia label', 'porosis label']
class_1_cols = ['normal&penia prediction', 'porosis prediction']

if len(Counter(internal_feature_label).keys()) == 2:
    average_method = 'binary'
    conf_index = class_1_index
    conf_cols = class_1_cols
else:
def compute_naive_bayes(self, target: str, drop: bool = True,
                        norm: bool = False, threshold: float = 1,
                        cnf_mtx: bool = False):
    """
    Implements the Naive Bayes algorithm for the given threshold.
    Computes the Gaussian, Bernoulli and Multinomial (the latter only for
    normalized data) variants. To fit the models there is the option of
    using the SMOTE technique.
    :return: --- <class 'tuple'> --- <class 'list'>
    For a confusion matrix, set 'cnf_mtx' to True.
    :return: --- <class 'numpy.ndarray'>
    """
    if drop:
        full_set = self.compute_data_drop(threshold)
    else:
        full_set = self.compute_data_average(threshold)

    y: np.ndarray = full_set.pop(target).values
    x: np.ndarray = full_set.values
    labels: np.ndarray = pd.unique(y)

    trn_x, tst_x, trn_y, tst_y = train_test_split(x, y, train_size=0.7,
                                                  stratify=y)

    # Normalization has to occur after train_test_split, be fitted only on
    # the training data, and then the same transform is applied to the test
    # data.
    if norm:
        # min_max_scaler = preprocessing.StandardScaler()
        min_max_scaler = preprocessing.MinMaxScaler()
        trn_x = min_max_scaler.fit_transform(trn_x)
        tst_x = min_max_scaler.transform(tst_x)

    clf = GaussianNB()
    print(trn_x.shape, trn_y.shape, tst_x.shape, tst_y.shape, labels.shape)
    clf.fit(trn_x, trn_y)
    prd_y = clf.predict(tst_x)

    if cnf_mtx:
        return metrics.confusion_matrix(tst_y, prd_y, labels=labels)

    if norm:
        estimators = {
            'GaussianNB': GaussianNB(),
            'MultinomialNB': MultinomialNB(),
            'BernoulliNB': BernoulliNB()
        }
    else:
        estimators = {
            'GaussianNB': GaussianNB(),
            'BernoulliNB': BernoulliNB()
        }

    xvalues = []
    yvalues = []

    smote = SMOTE(random_state=42, sampling_strategy='minority')
    smote_x, smote_y = smote.fit_resample(x, y)

    # RandomUnderSampler: cap class 1 at 1000 samples
    sampler = RandomUnderSampler(sampling_strategy={1: 1000})
    X_rs, y_rs = sampler.fit_resample(trn_x, trn_y)

    for clf in estimators:
        xvalues.append(clf)
        # estimators[clf].fit(trn_x, trn_y)      # unbalanced data
        estimators[clf].fit(X_rs, y_rs)          # random under-sampling
        # estimators[clf].fit(smote_x, smote_y)  # SMOTE
        prd_y = estimators[clf].predict(tst_x)
        yvalues.append(metrics.accuracy_score(tst_y, prd_y))

    return xvalues, yvalues
encoder.fit(target)
encoded_Y = encoder.transform(target)
# Convert integers to dummy variables (i.e. one-hot encoded)
dummy_y = np_utils.to_categorical(encoded_Y)

# Separate the data into training and (validation + testing) datasets in a
# 70/30 (20/10) proportion
X_train, X_partial, y_train, y_partial = train_test_split(
    features, dummy_y, test_size=0.3, random_state=rand_state)
X_val, X_test, y_val, y_test = train_test_split(X_partial, y_partial,
                                                test_size=0.33,
                                                random_state=rand_state)

# Resample the training data (over-sampling is kept as a commented-out
# alternative; under-sampling is what actually runs)
ros = RandomOverSampler(sampling_strategy='minority', random_state=12)
rus = RandomUnderSampler(random_state=12, replacement=True)
# X_train_res, y_train_res = ros.fit_resample(X_train, y_train)
X_train_res, y_train_res = rus.fit_resample(X_train, y_train)

# Re-obtain the correct training, validation and testing datasets
X_train_reduced = X_train_res.loc[:, features_list]
y_train_reduced = y_train_res  # X_train_res.loc[:, targets_list]  # Yes, X_train_res is correct
X_val_reduced = X_val.loc[:, features_list]
y_val_reduced = y_val  # X_val.loc[:, targets_list]
X_test_reduced = X_test.loc[:, features_list]
y_test_reduced = y_test  # X_test.loc[:, targets_list]

# Samples no_zeros giving it the same number of values for all vb_slice ranges;
# no_zeros is shuffled and the number of values in each range is given by the
# In[228]:

merge_AB = pd.get_dummies(merge_AB, prefix=['plan_type'], drop_first=True)
merge_AC = pd.get_dummies(merge_AC, prefix=['plan_type'], drop_first=True)

# ## AB Test
# ### 14-day and no trial

# In[229]:

import imblearn
print(imblearn.__version__)

from imblearn.under_sampling import RandomUnderSampler

undersample = RandomUnderSampler(sampling_strategy='majority')
X_under, y_under = undersample.fit_resample(
    merge_AB[['plan_type_low_uae_no_trial']], merge_AB[['current_sub_TF']])

# In[230]:

merge_AB_under = pd.concat([X_under, y_under], axis=1)

# In[231]:

merge_AB_under.head()

# In[232]:

import HW1 as ABTesting
def random_under_sampler(X, y):
    """Balance the classes by randomly under-sampling the majority class."""
    rus = RandomUnderSampler(random_state=42)
    X_res, y_res = rus.fit_resample(X, y)
    return X_res, y_res
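# A minimal usage sketch (an addition; the toy data below is illustrative):
from collections import Counter
from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(n_samples=1000, weights=[0.9, 0.1],
                                     random_state=42)
X_bal, y_bal = random_under_sampler(X_demo, y_demo)
print(Counter(y_demo), '->', Counter(y_bal))  # both classes end at the minority count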
def __init__(self, lemmatization=False, granularity="label", failures_skip=None):
    Model.__init__(self, lemmatization)

    self.granularity = granularity
    self.failures_skip = failures_skip

    self.training_dbs = [repository.COMMITS_DB]
    self.eval_dbs[repository.COMMITS_DB] = (
        repository.COMMITS_DB,
        repository.COMMIT_EXPERIENCES_DB,
    )
    if granularity == "label":
        self.training_dbs.append(test_scheduling.TEST_LABEL_SCHEDULING_DB)
        self.eval_dbs[test_scheduling.TEST_LABEL_SCHEDULING_DB] = (
            test_scheduling.PAST_FAILURES_LABEL_DB,
            test_scheduling.FAILING_TOGETHER_LABEL_DB,
        )
    elif granularity == "group":
        self.training_dbs.append(test_scheduling.TEST_GROUP_SCHEDULING_DB)
        self.eval_dbs[test_scheduling.TEST_GROUP_SCHEDULING_DB] = (
            test_scheduling.PAST_FAILURES_GROUP_DB,
            test_scheduling.TOUCHED_TOGETHER_DB,
        )
        self.eval_dbs[test_scheduling.TEST_CONFIG_GROUP_SCHEDULING_DB] = (
            test_scheduling.FAILING_TOGETHER_CONFIG_GROUP_DB,
        )
    elif granularity == "config_group":
        self.training_dbs.append(test_scheduling.TEST_CONFIG_GROUP_SCHEDULING_DB)
        self.eval_dbs[test_scheduling.TEST_CONFIG_GROUP_SCHEDULING_DB] = (
            test_scheduling.PAST_FAILURES_CONFIG_GROUP_DB,
            test_scheduling.TOUCHED_TOGETHER_DB,
        )

    self.cross_validation_enabled = False
    self.entire_dataset_training = True

    self.sampler = RandomUnderSampler(random_state=0)

    feature_extractors = [
        test_scheduling_features.prev_failures(),
    ]

    if granularity == "label":
        feature_extractors += [
            test_scheduling_features.platform(),
            # test_scheduling_features.chunk(),
            test_scheduling_features.suite(),
        ]
    elif granularity in ("group", "config_group"):
        feature_extractors += [
            test_scheduling_features.path_distance(),
            test_scheduling_features.common_path_components(),
            test_scheduling_features.touched_together(),
        ]

    self.extraction_pipeline = Pipeline(
        [
            (
                "commit_extractor",
                commit_features.CommitExtractor(feature_extractors, []),
            ),
            ("union", ColumnTransformer([("data", DictVectorizer(), "data")])),
        ]
    )

    self.clf = xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count())
    self.clf.set_params(predictor="cpu_predictor")
# ....................................
#
# ``sampling_strategy`` can be given a ``float``. For **under-sampling
# methods**, it corresponds to the desired ratio of minority to majority
# samples after resampling, :math:`\alpha_{us} = N_{m} / N_{rM}`, where
# :math:`N_{rM}` and :math:`N_{m}` are the number of samples in the majority
# class after resampling and the number of samples in the minority class,
# respectively.

# select only 2 classes, since a float ratio is only defined for binary
# problems
binary_mask = np.bitwise_or(y == 0, y == 2)
binary_y = y[binary_mask]
binary_X = X[binary_mask]

sampling_strategy = 0.8

rus = RandomUnderSampler(sampling_strategy=sampling_strategy)
X_res, y_res = rus.fit_resample(binary_X, binary_y)
print('Information of the iris data set after making it '
      'balanced using a float and an under-sampling method: \n'
      'sampling_strategy={} \n y: {}'.format(sampling_strategy,
                                             Counter(y_res)))
plot_pie(y_res)

###############################################################################
# For **over-sampling methods**, it corresponds to the ratio
# :math:`\alpha_{os}` defined by :math:`N_{rm} = \alpha_{os} \times N_{M}`
# where :math:`N_{rm}` and :math:`N_{M}` are the number of samples in the
# minority class after resampling and the number of samples in the majority
# class, respectively.

ros = RandomOverSampler(sampling_strategy=sampling_strategy)
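###############################################################################
# A small worked check of the two ratios above (an addition to the original
# example): with ``sampling_strategy=0.8`` and class counts {0: 50, 2: 100},
# under-sampling keeps int(50 / 0.8) = 62 majority samples, while
# over-sampling grows the minority class to int(0.8 * 100) = 80 samples.

import numpy as np
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

X_demo = np.arange(150).reshape(-1, 1)
y_demo = np.array([0] * 50 + [2] * 100)

_, y_under = RandomUnderSampler(sampling_strategy=0.8).fit_resample(X_demo, y_demo)
_, y_over = RandomOverSampler(sampling_strategy=0.8).fit_resample(X_demo, y_demo)
print(Counter(y_under))  # {2: 62, 0: 50}
print(Counter(y_over))   # {2: 100, 0: 80}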
testSets = ['p2p_lendingclub_30.csv']
classe = 'loan_status'

# define the list of classifiers
clfs = [
    GaussianNB(),
    tree.DecisionTreeClassifier(),
    linear_model.LogisticRegression()
]
names = ["Naive Bayes", "Decision Tree", "Logistic Regression"]

final_names = list()
for test_set in testSets:
    for name in names:
        final_names.append(str(name + '_' + test_set[:-4]))

# define the list of sampling techniques
sTechniques = [RandomUnderSampler(random_state=1), SMOTE(random_state=1)]
technique_names = ["RU", "SM"]


# Note: these grids use the pre-0.4 imblearn API (`ratio`, `kind`, `k`, `m`).
def getParamsReSampling(reSamplingTechnique):
    if type(reSamplingTechnique) is SMOTETomek:
        return dict(smt__ratio=[0.8, 0.9, 1.0],
                    smt__k=[1, 3, 5, 7],
                    smt__m=[1, 3, 5, 7])
    elif type(reSamplingTechnique) is SMOTE:
        return dict(smt__kind=["regular", "borderline1", "borderline2"],
                    smt__ratio=[0.8, 0.9, 1.0],
                    smt__k_neighbors=[1, 3, 5, 7])
    elif type(reSamplingTechnique) is RandomUnderSampler:
        return dict(smt__ratio=[0.8, 0.9, 1.0])
     CountVectorizer(max_df=0.95,
                     min_df=10,
                     ngram_range=(2, 2),
                     stop_words=None,
                     strip_accents='unicode',
                     tokenizer=LemmaTokenizer())),
    ],  # 2-gram vectorizer
    transformer_weights={
        'vect1': 1.0,
        'vect2': 1.0,
    },
),
TfidfTransformer(use_idf=True),
RandomUnderSampler(sampling_strategy={1: 19000, 2: 27200, 3: 20000},
                   random_state=22),
SelectFromModel(estimator=LinearSVC(),
                threshold='1.2*mean'),  # dimensionality reduction
#MLPClassifier(verbose=True, hidden_layer_sizes=(200,), max_iter=200, solver='sgd', learning_rate='adaptive', learning_rate_init=0.60, momentum=0.50, alpha=1e-01),
MLPClassifier(verbose=True,
              random_state=22,
              hidden_layer_sizes=(100, ),
              max_iter=200,
              solver='sgd',
              learning_rate='constant',
              learning_rate_init=0.07,
              momentum=0.90,
              alpha=1e-01),
def create_model(dataset):
    print("dataset : ", dataset)
    df = pd.read_csv('/home/farshid/Desktop/' + dataset, header=None)
    df['label'] = df[df.shape[1] - 1]
    df.drop([df.shape[1] - 2], axis=1, inplace=True)

    labelencoder = LabelEncoder()
    df['label'] = labelencoder.fit_transform(df['label'])

    X = np.array(df.drop(['label'], axis=1))
    y = np.array(df['label'])

    normalization_object = Normalizer()
    X = normalization_object.fit_transform(X)

    # Stratified cross-validation
    skf = StratifiedKFold(n_splits=5, shuffle=True)

    # Random undersampling
    sampler = RandomUnderSampler()

    top_roc = 0
    for depth in range(2, 20, 1):
        for split in range(2, 9, 1):
            all_auc = []
            all_aupr = []
            classifier = AdaBoostClassifier(
                DecisionTreeClassifier(max_depth=depth,
                                       min_samples_split=split),
                n_estimators=100,
                learning_rate=1,
                algorithm='SAMME')

            for train_index, test_index in skf.split(X, y):
                X_train = X[train_index]
                X_test = X[test_index]
                y_train = y[train_index]
                y_test = y[test_index]

                # Undersample only the training fold, so no test-fold
                # information leaks into the resampling step.
                X_train, y_train = sampler.fit_resample(X_train, y_train)

                classifier.fit(X_train, y_train)
                predictions = classifier.predict_proba(X_test)

                all_auc.append(roc_auc_score(y_test, predictions[:, 1]))
                all_aupr.append(
                    average_precision_score(y_test, predictions[:, 1]))

            average_auc = sum(all_auc) / len(all_auc)
            average_aupr = sum(all_aupr) / len(all_aupr)
            # print("for depth", depth, " and split ", split, "roc = ", average_auc)

            if average_auc > top_roc:
                print(dataset, " for depth", depth, " split ", split,
                      "roc = ", average_auc, " aupr ", average_aupr, end=' ')
                joblib.dump(classifier,
                            '/home/farshid/Desktop/models/' + dataset + '.pkl')
                top_roc = average_auc
                print("stored !!!!")
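# An equivalent, more compact sketch (an addition, not part of the original
# script): an imblearn Pipeline confines the under-sampling to each training
# fold automatically, so cross_val_score can drive the whole inner loop. It
# assumes X, y, and skf built as inside create_model above, plus the same
# sklearn/imblearn imports.
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.model_selection import cross_val_score


def cross_validated_auc(X, y, skf, depth=2, split=2):
    pipe = ImbPipeline([
        ('under', RandomUnderSampler()),
        ('ada', AdaBoostClassifier(
            DecisionTreeClassifier(max_depth=depth, min_samples_split=split),
            n_estimators=100, learning_rate=1, algorithm='SAMME')),
    ])
    scores = cross_val_score(pipe, X, y, cv=skf, scoring='roc_auc')
    return scores.mean()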
import time
tic = time.perf_counter()  # time.clock() was removed in Python 3.8

# Read in preprocessed data
data_train = pd.read_pickle('train.pkl')
data_test = pd.read_pickle('test.pkl')

y_values = data_train['score'].values
data_train.drop(['score'], axis=1, inplace=True)
columns = data_train.columns
x_values = data_train.values

# Apply the random under-sampling (the deprecated `return_indices` flag was
# removed; the sampled indices are available via `rus.sample_indices_`)
rus = RandomUnderSampler()
# x_train, y_train = rus.fit_resample(x_values, y_values)
x_train, y_train = x_values, y_values

gbc = GradientBoostingClassifier()
gbc.fit(x_train, y_train)
joblib.dump(gbc, 'GradientBoosting.pkl')

query_id = np.asarray(data_test.index.values)
y_test = data_test['score'].values
data_test.drop(['score'], axis=1, inplace=True)
x_test = data_test.values

predictions = gbc.predict(x_test)

data = np.concatenate((query_id.reshape((-1, 1)), y_test.reshape((-1, 1))),
                      axis=1)
data = np.concatenate((data, predictions.reshape((-1, 1))), axis=1)