def test_pipeline_sample():
    """Check that a pipeline ending in a sampler matches the bare sampler.

    Two scenarios are compared against calling the sampler directly:
    a sampler-only pipeline, and a PCA step followed by the sampler.
    """
    features, labels = make_classification(
        n_classes=2,
        class_sep=2,
        weights=[0.1, 0.9],
        n_informative=3,
        n_redundant=1,
        flip_y=0,
        n_features=20,
        n_clusters_per_class=1,
        n_samples=5000,
        random_state=0,
    )
    sampler = RandomUnderSampler(random_state=0)

    # Scenario 1: the sampler is the only step of the pipeline.
    sampler_only = Pipeline([('rus', sampler)])
    piped_X, piped_y = sampler_only.fit_resample(features, labels)
    direct_X, direct_y = sampler.fit_resample(features, labels)
    assert_allclose(piped_X, direct_X, rtol=R_TOL)
    assert_allclose(piped_y, direct_y, rtol=R_TOL)

    # Scenario 2: a PCA transform precedes the sampler.
    standalone_pca = PCA()
    pca_then_sampler = Pipeline([('pca', PCA()), ('rus', sampler)])
    piped_X, piped_y = pca_then_sampler.fit_resample(features, labels)
    projected = standalone_pca.fit_transform(features)
    direct_X, direct_y = sampler.fit_resample(projected, labels)

    # Values within R_TOL of zero are snapped to zero before comparing,
    # because the PCA output is noisy around zero.
    piped_X[(piped_X < R_TOL) & (piped_X > -R_TOL)] = 0
    direct_X[(direct_X < R_TOL) & (direct_X > -R_TOL)] = 0
    assert_allclose(piped_X, direct_X, rtol=R_TOL)
    assert_allclose(piped_y, direct_y, rtol=R_TOL)
def svmsampler(X, y, over_pct=0.1, under_pct=1):
    """Resample (X, y) with SVMSMOTE followed by random under-sampling.

    Parameters
    ----------
    X, y : features and labels passed straight to ``fit_resample``.
    over_pct : ``sampling_strategy`` given to ``SVMSMOTE``.
    under_pct : ``sampling_strategy`` given to ``RandomUnderSampler``.

    Returns
    -------
    The resampled ``(X, y)`` pair. Both samplers use ``random_state=42``.
    """
    resampler = Pipeline(steps=[
        ('o', SVMSMOTE(random_state=42, sampling_strategy=over_pct)),
        ('u', RandomUnderSampler(random_state=42,
                                 sampling_strategy=under_pct)),
    ])
    X_res, y_res = resampler.fit_resample(X, y)
    return X_res, y_res
def smote_under(x_train, y_train, smote_ss=0.25, under_ss=0.75, rs_val=42):
    """
    Creates artificial training dataset data points for the minority label
    with SMOTE, then randomly undersamples the majority label.

    Input:
    x_train: Training dataset features (pandas DataFrame).
    y_train: Training dataset labels (pandas DataFrame).
    smote_ss: ``sampling_strategy`` passed to SMOTE. For a float this is the
        minority/majority ratio targeted by the oversampling step.
    under_ss: ``sampling_strategy`` passed to RandomUnderSampler. For a float
        this is the minority/majority ratio targeted by the undersampling
        step.
    rs_val: Random state value used by both samplers.

    Output:
    x_train: Features for artificial training dataset (DataFrame with the
        original feature column names).
    y_train: Labels for artificial training dataset (DataFrame with the
        original label column names).
    """
    # Remember the column names so the resampled output can be relabelled.
    x_cols = list(x_train.columns)
    y_cols = list(y_train.columns)

    # Oversample the minority label with SMOTE, then undersample the
    # majority label. The caller-supplied ratio and seed are honoured here;
    # previously under_ss and rs_val were silently ignored in favour of the
    # hard-coded values 0.75 and 42 (which remain the defaults).
    over = SMOTE(sampling_strategy=smote_ss, random_state=rs_val)
    under = RandomUnderSampler(sampling_strategy=under_ss,
                               random_state=rs_val)
    pipeline = Pipeline(steps=[('o', over), ('u', under)])
    x_train, y_train = pipeline.fit_resample(x_train, y_train)

    # Change new dataset into Pandas dataframes.
    x_train = pd.DataFrame(x_train, columns=x_cols)
    y_train = pd.DataFrame(y_train, columns=y_cols)
    return x_train, y_train
def run_smote_oversampling_and_undersampling():
    """Demo: resample a synthetic imbalanced dataset and plot the result.

    Builds a two-feature dataset with ``make_classification``, prints the
    class counts before and after SMOTE + random under-sampling, and shows
    a scatter plot with one series per class label.
    """
    features, labels = make_classification(
        n_samples=10000,
        n_features=2,
        n_redundant=0,
        n_clusters_per_class=1,
        weights=[0.99],
        flip_y=0,
        random_state=1,
    )
    # Class distribution before resampling.
    print(Counter(labels))

    resampler = Pipeline(steps=[
        ('o', SMOTE(sampling_strategy=0.1)),
        ('u', RandomUnderSampler(sampling_strategy=0.5)),
    ])
    features, labels = resampler.fit_resample(features, labels)

    # Class distribution after resampling.
    class_counts = Counter(labels)
    print(class_counts)

    # One scatter series per class label.
    for label in class_counts:
        members = where(labels == label)[0]
        pyplot.scatter(features[members, 0], features[members, 1],
                       label=str(label))
    pyplot.legend()
    pyplot.show()
def smote(X, y):
    """Resample (X, y) with SMOTE then random under-sampling.

    Both samplers are configured with ``sampling_strategy=0.9``.
    Returns the resampled ``(X, y)`` pair.
    """
    resampler = Pipeline(steps=[
        ('o', SMOTE(sampling_strategy=0.9)),
        ('u', RandomUnderSampler(sampling_strategy=0.9)),
    ])
    X_res, y_res = resampler.fit_resample(X, y)
    return X_res, y_res
def sample_data(self, Xtrain, ytrain, over=0.4, under=0.7):
    """Resample the training data with SMOTE then random under-sampling.

    ``over`` and ``under`` are the ``sampling_strategy`` values for the
    SMOTE and RandomUnderSampler steps respectively. Prints a progress
    message and returns the resampled ``(Xtrain, ytrain)`` pair.
    """
    print("Sampling data...")
    oversampler = SMOTE(sampling_strategy=over)
    undersampler = RandomUnderSampler(sampling_strategy=under)
    resampler = Pipeline(steps=[("o", oversampler), ("u", undersampler)])
    X_res, y_res = resampler.fit_resample(Xtrain, ytrain)
    return X_res, y_res
def sample_data(x, y, choice):
    """Resample (x, y) using the strategy selected by ``choice``.

    Supported values of ``choice``:
        'both'  -- RandomOverSampler (ratio 0.5) followed by TomekLinks
                   cleaning of the majority class.
        'u'     -- RandomUnderSampler with the 'auto' strategy.
        'o'     -- RandomOverSampler with its default strategy.
        'smote' -- SMOTE with the 'auto' strategy and 8 neighbours.

    Any other value leaves the data untouched.

    Returns:
        (x, y, choice) where ``choice`` has been replaced by a descriptive
        label for the recognised strategies and is returned unchanged
        otherwise.
    """
    seed = 42
    k = 8
    if choice == 'both':
        choice = 'Over_and_Under_Sampling'
        over = RandomOverSampler(sampling_strategy=0.5, random_state=seed)
        # TomekLinks is a cleaning sampler: it removes Tomek links and does
        # not accept a float ratio (imbalanced-learn raises ValueError for
        # float sampling_strategy on clean-sampling methods). Name the class
        # to clean instead of passing 0.5.
        under = TomekLinks(sampling_strategy='majority')
        steps = [('o', over), ('u', under)]
        pipeline = Pipeline(steps=steps)
        x, y = pipeline.fit_resample(x, y)
    elif choice == 'u':
        choice = 'Under_Sampling'
        print('Performing Random Under Sample')
        strategy = 'auto'
        under = RandomUnderSampler(sampling_strategy=strategy)
        steps = [('u', under)]
        pipeline = Pipeline(steps=steps)
        x, y = pipeline.fit_resample(x, y)
    elif choice == 'o':
        choice = 'Over_Sampling'
        print('Performing Random Over Sample')
        over = RandomOverSampler(random_state=seed)
        steps = [('o', over)]
        pipeline = Pipeline(steps=steps)
        x, y = pipeline.fit_resample(x, y)
    elif choice == 'smote':
        choice = 'SMOTE'
        strategy = 'auto'
        smote_over_sample = SMOTE(sampling_strategy=strategy, k_neighbors=k,
                                  random_state=seed)
        x, y = smote_over_sample.fit_resample(x, y)
    return x, y, choice
def get_SMOTE_UnderSampler(X, Y,do_debug=False):
    """Resample (X, Y) with SMOTE (0.1) then RandomUnderSampler (0.5).

    When ``do_debug`` is true, the original and resampled data are also
    plotted through ``P.plot_2D_features_v3`` into 'original.png' and
    'SMOTE_UnderSampler.png'. The debug path indexes columns 0 and 1 of X
    and labels the frame columns 'Y', 'x0', 'x1'.

    Returns the resampled ``(X, Y)`` pair.
    """
    resampler = Pipeline(steps=[
        ('o', SMOTE(sampling_strategy=0.1)),
        ('u', RandomUnderSampler(sampling_strategy=0.5)),
    ])
    X_Sampled, Y_Sampled = resampler.fit_resample(X, Y)

    if not do_debug:
        return X_Sampled, Y_Sampled

    # Axis limits come from the ORIGINAL data so both plots share a scale.
    x_range = numpy.array([X[:, 0].min(), X[:, 0].max()])
    y_range = numpy.array([X[:, 1].min(), X[:, 1].max()])

    frame_columns = ['Y', 'x0', 'x1']
    sampled_frame = pd.DataFrame(
        numpy.concatenate((Y_Sampled.reshape(-1, 1), X_Sampled), axis=1),
        columns=frame_columns)
    original_frame = pd.DataFrame(
        numpy.concatenate((Y.reshape(-1, 1), X), axis=1),
        columns=frame_columns)

    palette = ['#808080', '#C00000']
    plot_jobs = (
        (original_frame, 'original.png'),
        (sampled_frame, 'SMOTE_UnderSampler.png'),
    )
    for frame, target_file in plot_jobs:
        P.plot_2D_features_v3(frame, x_range=x_range, y_range=y_range,
                              palette=palette, transparency=0.5,
                              figsize=(6, 4), filename_out=target_file)
    return X_Sampled, Y_Sampled
def balanced_classes(X, y, n, digit1, digit2):
    """Bring classes ``digit1`` and ``digit2`` to ``int(n/2)`` samples each.

    ``digit2`` is randomly over-sampled first, then ``digit1`` is randomly
    under-sampled. The class counts are printed before and after.

    Returns the resampled ``(X, y)`` pair.
    """
    def _class_counts(labels):
        values, occurrences = np.unique(labels, return_counts=True)
        return dict(zip(values, occurrences))

    print('\nClassi non bilanciate', _class_counts(y))

    per_class = int(n / 2)
    undersampler = RandomUnderSampler(sampling_strategy={digit1: per_class})
    oversampler = RandomOverSampler(sampling_strategy={digit2: per_class})
    resampler = Pipeline(steps=[('o', oversampler), ('u', undersampler)])
    X, y = resampler.fit_resample(X, y)

    print('Classi bilanciate', _class_counts(y))
    return X, y
def simple_model(X_train, y_train):
    """Resample the training set with SMOTE (k=7) then random under-sampling.

    Returns the resampled ``(X_train, y_train)`` pair.
    """
    resampler = Pipeline(steps=[
        ('o', SMOTE(k_neighbors=7)),
        ('u', RandomUnderSampler()),
    ])
    resampled_X, resampled_y = resampler.fit_resample(X_train, y_train)
    return resampled_X, resampled_y
def ensemble_model(X_train, y_train):
    """Resample with BorderlineSMOTE (borderline-1, k=7) then EasyEnsemble.

    Only element ``[0]`` of each pipeline output is returned.
    NOTE(review): presumably EasyEnsemble yields several subsets and the
    first one is kept -- confirm against the installed imbalanced-learn
    version.
    """
    resampler = Pipeline(steps=[
        ('o', BorderlineSMOTE(k_neighbors=7, kind="borderline-1")),
        ('u', EasyEnsemble(random_state=1)),
    ])
    resampled_X, resampled_y = resampler.fit_resample(X_train, y_train)
    return resampled_X[0], resampled_y[0]
def apply_over_random_under_sample_smote(X, y):
    """Oversample with SMOTE, then randomly undersample, an imbalanced set.

    Both steps use ``sampling_strategy=0.5``. The imbalanced-learn names
    are imported locally, so the function does not rely on module-level
    imports. Returns the resampled ``(X, y)`` pair.
    """
    from imblearn.over_sampling import SMOTE
    from imblearn.under_sampling import RandomUnderSampler
    from imblearn.pipeline import Pipeline

    resampler = Pipeline(steps=[
        ('o', SMOTE(sampling_strategy=0.5)),
        ('u', RandomUnderSampler(sampling_strategy=0.5)),
    ])
    X_smt, y_smt = resampler.fit_resample(X, y)
    return X_smt, y_smt
def test_pipeline_sample():
    """Check that a pipeline ending in a sampler matches the bare sampler.

    Two scenarios are compared against calling the sampler directly:
    a sampler-only pipeline, and a PCA step followed by the sampler.
    """
    features, labels = make_classification(
        n_classes=2,
        class_sep=2,
        weights=[0.1, 0.9],
        n_informative=3,
        n_redundant=1,
        flip_y=0,
        n_features=20,
        n_clusters_per_class=1,
        n_samples=5000,
        random_state=0,
    )
    sampler = RandomUnderSampler(random_state=0)

    # Scenario 1: the sampler is the only step of the pipeline.
    sampler_only = Pipeline([('rus', sampler)])
    piped_X, piped_y = sampler_only.fit_resample(features, labels)
    direct_X, direct_y = sampler.fit_resample(features, labels)
    assert_allclose(piped_X, direct_X, rtol=R_TOL)
    assert_allclose(piped_y, direct_y, rtol=R_TOL)

    # Scenario 2: a PCA transform precedes the sampler.
    standalone_pca = PCA()
    pca_then_sampler = Pipeline([('pca', PCA()), ('rus', sampler)])
    piped_X, piped_y = pca_then_sampler.fit_resample(features, labels)
    projected = standalone_pca.fit_transform(features)
    direct_X, direct_y = sampler.fit_resample(projected, labels)

    # Values within R_TOL of zero are snapped to zero before comparing,
    # because the PCA output is noisy around zero.
    piped_X[(piped_X < R_TOL) & (piped_X > -R_TOL)] = 0
    direct_X[(direct_X < R_TOL) & (direct_X > -R_TOL)] = 0
    assert_allclose(piped_X, direct_X, rtol=R_TOL)
    assert_allclose(piped_y, direct_y, rtol=R_TOL)
def __init__(self, data, target, features=None, steps=None):
    """Initialise the resampler and, if steps are given, resample the data.

    Parameters
    ----------
    data, target, features : forwarded unchanged to the parent initialiser.
    steps : list of ``(name, sampler)`` pipeline steps, or None for no
        resampling. None is treated as an empty list.

    Side effects
    ------------
    Sets ``self.OverSampling`` and ``self.UnderSampling``. When
    ``self.steps`` is non-empty, runs the steps as a pipeline on
    ``self.features_numpy`` / ``self.target_numpy`` and stores the result
    in ``self.data`` and ``self.resample``.
    """
    # Build a fresh list per call: a literal ``[]`` default is created once
    # and shared by every instance, so a mutation would leak between them.
    steps = [] if steps is None else steps
    super().__init__(data, target, features, steps)
    self.OverSampling = object_over(self.data, self.target, steps=steps)
    self.UnderSampling = object_under(self.data, self.target, steps=steps)
    if not self.steps == []:
        pipeline = Pipeline(steps=self.steps)
        features_resample, target_resample = pipeline.fit_resample(
            self.features_numpy, self.target_numpy)
        self.data = self.resample_dataframe(
            features_resample=features_resample,
            target_resample=target_resample,
            features=self.features,
            target=self.target)
        # NOTE(review): the source's indentation was lost; this assignment
        # is assumed to belong to the resampling branch -- confirm.
        self.resample = self.data
def smote_sampling(x_train, y_train):
    """Resample with SMOTE (0.2) then RandomUnderSampler (0.5).

    Prints the class distribution before and after resampling and returns
    the resampled ``(X, y)`` pair.
    """
    # Class distribution before resampling.
    print(Counter(y_train))

    resampler = Pipeline(steps=[
        ('o', SMOTE(sampling_strategy=0.2)),
        ('u', RandomUnderSampler(sampling_strategy=0.5)),
    ])
    X_result, y_result = resampler.fit_resample(x_train, y_train)

    # Class distribution after resampling.
    print(Counter(y_result))
    return X_result, y_result
def over_under_sample_func(train_x, train_y, target):
    """Resample with SMOTE (0.1) then RandomUnderSampler (0.5), with logging.

    The value counts of ``train_y[target]`` are logged before and after
    resampling. Any exception is logged as an error and swallowed, in which
    case the function returns None instead of the resampled pair.
    """
    try:
        logger.info(
            f"counter before over_under_sample is: {train_y[target].value_counts()}"
        )
        resampler = Pipeline(steps=[
            ('o', SMOTE(sampling_strategy=0.1)),
            ('u', RandomUnderSampler(sampling_strategy=0.5)),
        ])
        resampled_x, resampled_y = resampler.fit_resample(train_x, train_y)
        # Log the new class distribution.
        logger.info(
            f"counter after over_under_sample is: {resampled_y[target].value_counts()}"
        )
        return resampled_x, resampled_y
    except Exception as ex:
        logger.error(f"failed to run over_under_sample_func due to: {ex}")
def balanceSampling(X_tr, y_train, up_ratio=1,dn_ratio=1):
    """
    Over- and under-sample the training data.

    Parameters
    ----------
    X_tr: training features
    y_train: training labels
    up_ratio: ``sampling_strategy`` for the SMOTE (upsampling) step
    dn_ratio: ``sampling_strategy`` for the RandomUnderSampler
        (downsampling) step

    Prints the shapes of the resampled arrays and returns them as
    ``(X_train_sm, y_train_sm)``.
    """
    resampler = Pipeline(steps=[
        ('over', SMOTE(sampling_strategy=up_ratio)),
        ('under', RandomUnderSampler(sampling_strategy=dn_ratio)),
    ])
    X_train_sm, y_train_sm = resampler.fit_resample(X_tr, y_train)
    print(X_train_sm.shape, y_train_sm.shape)
    return X_train_sm, y_train_sm
def oversample_smote_undersampling(self, X_train, y_train):
    """Apply default SMOTE, then undersample to fixed per-class counts.

    The per-class target counts for the undersampling step are hard-coded
    below. Prints the feature shape before and after and returns the
    resampled ``(X, y)`` pair.
    """
    print("SMOTE WITH UNDERSAMPLING")
    print(f"Shape before smote: {X_train.shape}")

    # Number of samples kept per class by the undersampler.
    target_counts = {
        'RESIDENTIAL': 1000,
        'INDUSTRIAL': 2000,
        'PUBLIC': 1000,
        'OFFICE': 1000,
        'OTHER': 1500,
        'RETAIL': 10000,
        'AGRICULTURE': 1500,
    }
    resampler = Pipeline(steps=[
        ('o', SMOTE()),
        ('u', RandomUnderSampler(sampling_strategy=target_counts)),
    ])
    X, y = resampler.fit_resample(X_train, y_train)

    print(f"Shape after SMOTE and undersampling: {X.shape}")
    return X, y
def under_sample_with_SMOTE(X, y):
    '''
    Resample the data with SMOTE (0.1) followed by random under-sampling
    (0.5), printing the class distribution before and after.
    :param X: features
    :param y: labels
    :return: the resampled (X, y) pair
    '''
    # Class distribution before resampling.
    print(collections.Counter(y))

    resampler = Pipeline(steps=[
        ('o', SMOTE(sampling_strategy=0.1)),
        ('u', RandomUnderSampler(sampling_strategy=0.5)),
    ])
    X, y = resampler.fit_resample(X, y)

    # Class distribution after resampling.
    print(collections.Counter(y))
    return X, y
def SMOTE_Analysis(k, o, u):
    """Score one (k_neighbors, over ratio, under ratio) resampling setup.

    Reads ``X`` and ``y`` from the enclosing scope (they are not
    parameters), resamples them with SMOTE + RandomUnderSampler, and
    evaluates a decision tree with repeated stratified 10-fold CV on
    ROC AUC. Note that the resampling happens before the CV split.

    Returns ``[k, o, u]`` after printing the mean score, or an empty
    string if anything raised (the error itself is discarded).
    """
    try:
        classifier = DecisionTreeClassifier()
        resampler = Pipeline(steps=[
            ('over', SMOTE(sampling_strategy=o, k_neighbors=k,
                           random_state=2)),
            ('under', RandomUnderSampler(sampling_strategy=u)),
        ])
        X_res, y_res = resampler.fit_resample(X, y.ravel())
        folds = RepeatedStratifiedKFold(n_splits=10, n_repeats=3,
                                        random_state=1)
        fold_scores = cross_val_score(classifier, X_res, y_res,
                                      scoring='roc_auc', cv=folds,
                                      n_jobs=-1)
        mean_score = np.mean(fold_scores)
        print("k={}, over={}, under={}, Mean ROC AUC: {:.3f}".format(
            k, o, u, mean_score))
        return [k, o, u]
    except Exception:
        return ""
def resampling(self, oversample_ratio=0.3, minority_num=368,
               majority_num=10000, minority_label='1.0',
               majority_label='0.0'):
    """Undersample to fixed class counts, then oversample with SMOTE.

    The under-sampler is given explicit target counts for the two labels;
    SMOTE then runs with ``sampling_strategy=oversample_ratio``. Operates
    on ``self.X`` / ``self.y``, prints the resulting class counts, and
    returns the resampled ``(X, y)`` pair.
    """
    target_counts = {
        majority_label: majority_num,
        minority_label: minority_num,
    }
    undersampler = RandomUnderSampler(sampling_strategy=target_counts)
    oversampler = SMOTE(sampling_strategy=oversample_ratio)

    # Undersampling runs first here, unlike the usual over-then-under order.
    resampler = Pipeline(steps=[('u', undersampler), ('o', oversampler)])
    X_sm, y_sm = resampler.fit_resample(self.X, self.y)
    print('Proportion in data after resample: ', Counter(y_sm))
    return X_sm, y_sm
def syntetic_sampling(X, y, over_sampling, under_sampling):
    """
    Apply Synthetic Minority Oversampling Technique (SMOTE) followed by
    random under-sampling to an unbalanced dataset.

    :param X: Training features
    :param y: Training labels
    :param over_sampling: ``sampling_strategy`` for the SMOTE step
    :param under_sampling: ``sampling_strategy`` for the
        RandomUnderSampler step
    :return: whatever ``Pipeline.fit_resample`` returns (the resampled data)
    """
    resampler = Pipeline(steps=[
        ('o', SMOTE(sampling_strategy=over_sampling)),
        ('u', RandomUnderSampler(sampling_strategy=under_sampling)),
    ])
    return resampler.fit_resample(X, y)
def split_smote(drug_df, drug_name):
    """Split ``drug_df`` into train/test and resample only the train part.

    The column ``drug_name`` is the label; every other column is a feature.
    A stratified 70/30 split (random_state=42) is made, then the training
    part is resampled with SMOTE (0.1) + RandomUnderSampler (0.5). The
    test part is returned untouched.

    Returns ``(XSM_train, ysm_train, X_test, y_test)`` where ``XSM_train``
    is a DataFrame carrying the original feature column names.
    """
    features = drug_df.drop([drug_name], axis=1)
    labels = drug_df[drug_name]
    print('Originally, the distribution of classes is: {}'.format(
        Counter(labels)))

    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.30, random_state=42, stratify=labels)

    resampler = Pipeline(steps=[
        ('o', SMOTE(sampling_strategy=0.1)),
        ('u', RandomUnderSampler(sampling_strategy=0.5)),
    ])
    Xsm_train, ysm_train = resampler.fit_resample(X_train, y_train)
    print(
        'After SMOTE sampling, the distribution of classes in Training set is: {}'
        .format(Counter(ysm_train)))

    XSM_train = pd.DataFrame(Xsm_train, columns=X_train.columns)
    return XSM_train, ysm_train, X_test, y_test
def under_over_sample(X, y, under_samp_rate=0.15, over_samp_rate=0.75,
                      random_state=42):
    """Randomly undersample, then randomly oversample, a 1-D sample list.

    ``X`` is converted to a single-column array before resampling. The
    result is returned as a DataFrame with columns "TEXT" and
    "OUTPUT_LABEL", with missing values replaced by empty strings.
    """
    undersampler = RandomUnderSampler(
        sampling_strategy=under_samp_rate,
        random_state=random_state,
    )
    oversampler = RandomOverSampler(sampling_strategy=over_samp_rate,
                                    random_state=random_state)
    resampler = Pipeline(steps=[('under', undersampler),
                                ('over', oversampler)])

    # The samplers need a 2-D input, hence the single-column reshape.
    as_column = np.array(X).reshape(-1, 1)
    X_res, y_res = resampler.fit_resample(as_column, y)

    frame = pd.DataFrame(data={
        "TEXT": X_res.squeeze(),
        "OUTPUT_LABEL": y_res
    })
    return frame.fillna("")
def build_loaders(titles, labels, batch_size, under_sample=False,
                  over_sample=False):
    """Split the data and wrap each split in a shuffled DataLoader.

    10% of the data is held out first; that hold-out is split again with
    ``test_size=0.01``, the larger part becoming validation and the smaller
    part test. Only the training split is optionally resampled:
    EditedNearestNeighbours when ``under_sample`` is set, then SMOTE when
    ``over_sample`` is set. ``calc_ratio`` is called on each split's labels
    after a heading is printed.

    Returns ``(train_loader, test_loader, val_loader)`` -- note the order.
    """
    train_titles, held_titles, train_labels, held_labels = \
        train_test_split(titles, labels, test_size=0.1)
    val_titles, test_titles, val_labels, test_labels = \
        train_test_split(held_titles, held_labels, test_size=0.01)

    resampling_steps = []
    if under_sample:
        resampling_steps.append(
            ("Under", EditedNearestNeighbours(n_neighbors=2)))
    if over_sample:
        resampling_steps.append(("Over", SMOTE(sampling_strategy=1)))
    if resampling_steps:
        resampler = Pipeline(steps=resampling_steps)
        train_titles, train_labels = resampler.fit_resample(train_titles,
                                                            train_labels)

    for heading, split_labels in (("Train:", train_labels),
                                  ("Validation:", val_labels),
                                  ("Test:", test_labels)):
        print(heading)
        calc_ratio(split_labels)

    def _as_dataset(features, targets):
        return TensorDataset(torch.from_numpy(features),
                             torch.from_numpy(targets))

    def _as_loader(dataset):
        return DataLoader(dataset, shuffle=True, batch_size=batch_size,
                          drop_last=True)

    train_set = _as_dataset(train_titles, train_labels)
    val_set = _as_dataset(val_titles, val_labels)
    test_set = _as_dataset(test_titles, test_labels)

    train_loader = _as_loader(train_set)
    test_loader = _as_loader(test_set)
    val_loader = _as_loader(val_set)
    return train_loader, test_loader, val_loader
# Cross-validated training fragment: for each fold, the training split is
# resampled (SMOTE then random under-sampling) and a fresh model is trained.
# NOTE(review): indentation was lost in this source; every statement after the
# `for` header is assumed to belong to the loop body -- confirm.
# Use the first CUDA device when available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print ("Running on:",device)
scores=np.array([])
# 5 folds repeated twice => 10 train/test splits.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=2)
for train_index, test_index in cv.split(X, y):
    #Put data in dataloaders
    print ("Augmenting Data...")
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    over = SMOTE(random_state=2)
    under = RandomUnderSampler(random_state=2)
    steps = [('o', over), ('u', under)]
    pipeline = Pipeline(steps=steps)
    # Flatten each sample to one row before resampling; the original
    # trailing dimensions of X are restored right after.
    X_train=X_train.reshape(X_train.shape[0],-1)
    X_train, y_train = pipeline.fit_resample(X_train, y_train)
    X_train = X_train.reshape(-1,X.shape[1], X.shape[2], X.shape[3])
    X_test = X_test.reshape(-1,X.shape[1], X.shape[2], X.shape[3])
    train_loader = make_into_dataloader(X_train, y_train,batch_size)
    test_loader = make_into_dataloader(X_test,y_test,batch_size)
    #Create model
    model = fmriNet(insize)
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate, betas = betas, weight_decay = l2)
    #Training
    losses = train(model, train_loader, num_epochs, criterion, optimizer, losses = [])
    #plt.plot(losses)
    #plt.show()
    #Testing
def load():
    """Load, encode, preprocess and class-balance the train-delay dataset.

    Reads ``data/dscm_w.csv``, makes sure the ``models/select`` directory
    exists, encodes the features with ``RailEncoder``, converts the
    categorical columns to integer codes, applies the cyclical datetime
    encoding, and finally randomly under-samples the majority class to a
    1:1 ratio. Progress and timings are printed along the way.

    Returns:
        (X, Y): the preprocessed, resampled feature matrix and the
        ``delayed`` labels.
    """
    uint8_columns = [
        'characteristic_B', 'characteristic_C', 'characteristic_D',
        'characteristic_E', 'characteristic_G', 'characteristic_M',
        'characteristic_P', 'characteristic_Q', 'characteristic_R',
        'characteristic_S', 'characteristic_Y', 'characteristic_Z',
        'catering_C', 'catering_F', 'catering_H', 'catering_M', 'catering_R',
        'catering_T', 'mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun',
        'freight', 'bank_holiday_running', 'length', 'speed', 'delayed'
    ]
    dtype = {key: "uint8" for key in uint8_columns}
    categories = {
        "status": "category",
        "category": "category",
        "power_type": "category",
        "timing_load": "category",
        "seating": "category",
        "reservations": "category",
        "ATOC_code": "category",
        "destination_stanox_area": "category",
        "origin_stanox_area": "category"
    }
    dtype.update(categories)

    start = time.time()
    print("Loading data...", end="")
    df = pd.read_csv("data/dscm_w.csv",
                     index_col=["uid"],
                     parse_dates=["std", "sta", "atd", "ata"],
                     dtype=dtype)

    # makedirs also creates the missing parent "models" directory, which a
    # plain os.mkdir would fail on, and exist_ok avoids the check-then-create
    # race of the previous os.path.exists guard.
    path = os.path.join("models", "select")
    os.makedirs(path, exist_ok=True)

    Y = df["delayed"]
    X = df.drop(["delay", "delayed", "atd", "ata", "origin", "destination"],
                axis=1)
    print(" DONE ({:.2f}s)".format(time.time() - start), end="\n\n")

    # DataFrame.info() prints its report itself and returns None, so wrapping
    # it in print() only added a stray "None" line.
    X.info()

    X = RailEncoder().transform(X)

    categorical_features = [
        "status", "category", "power_type", "timing_load", "seating",
        "reservations", "characteristics", "catering", "ATOC_code",
        "origin_stanox_area", "destination_stanox_area"
    ]
    # Replace each categorical column by its integer category codes.
    for c in categorical_features:
        X[c] = X[c].cat.codes

    datetime_features = X.select_dtypes(include="datetime").columns.values
    datetime_transformer = Pipeline([("cyclical",
                                      DatetimeEncoder(cyclical=True))])
    preprocessor = ColumnTransformer([
        ("datetime", datetime_transformer, datetime_features),
    ], remainder="passthrough")

    resampler = IPipeline([
        # ('over', SMOTE(sampling_strategy=0.2, random_state=1)),  # Increase minority to 20% of majority
        # sampling_strategy=1.0 reduces the majority class until the
        # minority/majority ratio is 1:1.
        ('under', RandomUnderSampler(sampling_strategy=1.0, random_state=1)),
    ])

    start = time.time()
    print("\nPreprocessing data...", end="")
    X = preprocessor.fit_transform(X, Y)
    print(" DONE ({:.2f}s)".format(time.time() - start), end="\n\n")
    print(X.shape)

    # Restart the timer so the figure below covers resampling only instead
    # of silently including the preprocessing time.
    start = time.time()
    print("\nResampling data...", end="")
    X, Y = resampler.fit_resample(X, Y)
    print(" DONE ({:.2f}s)".format(time.time() - start), end="\n\n")
    print("{}, delayed: {}, not delayed: {}\n".format(X.shape, Y.sum(),
                                                      len(Y) - Y.sum()))
    return X, Y
def lr_cv(disease,
          year_survival=5,
          period_of_analysis_days=None,
          kfold=5,
          random_state=17,
          authortype_list=None,
          added_features_list=None):
    """This is the main function to obtain the NLP experiment results.

    The function lr_cv (logistic regression - cross validation) performs
    n-year survival prediction (year_survival) using text notes and
    stage/grade, independently. Term-frequency inverse document-frequency
    (tf-idf) is applied to the text. Also, the function uses the
    Scikit-learn SelectFromModel meta-transformer for selecting features
    based on importance weights for each time point.

    Parameters
    ----------
    disease: One of the values from ('breast','prostate','lung','glioma').
    year_survival: Threshold to define survival.
    period_of_analysis_days: List of number of days after diagnosis
        considered to select the notes. Defaults to [30, 365] when None.
    kfold: Number of folds in the cross validation.
    random_state: Seed used by the random number generator.
    authortype_list: List of authors considered as valid.
    added_features_list: Features from the input dataset conserved in the
        output. None means no extra features.

    Return
    ------
    Dictionary with all results. Dictionary keys:
    val_f1: List of tuples (mean, std) of test sets F1 metric in the grid
        search, best index for each time point.
    val_area_under_curve: List of tuples (mean, std) of test sets AUC
        metric in the grid search, best index for each time point.
    tfidf_param_text: List of hyperparameter max_features for
        tfidfvectorizer in the grid search for each time point.
    C_param_text: List of hyperparameter C for logistic regression in the
        grid search for each time point.
    f1_train: List of tuples (F1 score, 0) of training set for each time
        point.
    area_under_curve_train: List of tuples (AUC score, 0) of training set
        for each time point.
    n: List of train set size for each time point.
    f1: List of tuples (F1 score, 0) of test set for each time point.
    area_under_curve: List of tuples (AUC score, 0) of test set for each
        time point.
    n_test: List of test set size for each time point.
    feature_names: List of important features for each time point.
    predictions: List of predictions for the test set for each time point.
    val_f1_s: List of tuples (mean, std) of test sets F1 metric in the grid
        search best index for each time point. (stage/grade approach)
    val_area_under_curve_s: List of tuples (mean, std) of test sets AUC
        metric in the grid search best index for each time point.
        (stage/grade approach)
    C_param_s: List of hyperparameter C for logistic regression in the grid
        search for each time point. (stage/grade approach)
    f1_train_s: List of F1 score of training set for each time point.
        (stage/grade approach)
    area_under_curve_train_s: List of AUC score of training set for each
        time point. (stage/grade approach)
    f1_s: List of tuples (F1 score, 0) of test set for each time point.
        (stage/grade approach)
    area_under_curve_s: List of tuples (AUC score, 0) of test set for each
        time point. (stage/grade approach)
    predictions_s: List of predictions for the test set for each time
        point. (stage/grade approach)
    train: List of the complete training sets with added columns with the
        predictions for the two approaches for each time point.
    test: List of the complete test sets with added columns with the
        predictions for the two approaches for each time point.
    random_state: Seed used by the random number generator.
    """
    # Initializations:
    val_f1 = []
    val_area_under_curve = []
    tfidf_param_text = []
    C_param_text = []
    f1_train = []
    area_under_curve_train = []
    n = []
    f1 = []
    area_under_curve = []
    n_test = []
    feature_names = []
    predictions = []
    train_list = []
    val_f1_s = []
    val_area_under_curve_s = []
    C_param_s = []
    f1_train_s = []
    area_under_curve_train_s = []
    f1_s = []
    area_under_curve_s = []
    predictions_s = []
    test_list = []
    id_list_flag = True
    idlist = []
    test_ids = []
    train = None
    test = None
    train_frac = 0.8
    unic_label = False  # Nested stratification is done when value is False.
    ngram = 1
    max_features = 200  # For feature importance.

    # The default added_features_list=None used to crash the concatenation
    # below with a TypeError; treat None as "no extra columns". The original
    # argument is still forwarded untouched to combined_notes.
    extra_col_names = (list(added_features_list)
                       if added_features_list is not None else [])
    examples_col_names = [
        'id', 'overallsurvival', 'vitalstatusbinary', 'stage_grade',
        'id_count'
    ] + extra_col_names + ['text_length', 'text']
    labels_col_names = ['stage_grade', 'label']
    scoring = {
        'f1': make_scorer(f1_score, average='macro'),
        'auc': make_scorer(roc_auc_score)
    }
    parameter_grid = {
        'logisticregression__C': [0.1, 1, 10, 100, 1000],
        'tfidfvectorizer__max_features': [500, 1000, None]
    }
    parameter_grid_stage = {'logisticregression__C': [0.1, 1, 10, 100, 1000]}
    if period_of_analysis_days is None:
        period_of_analysis_days = [30, 365]

    # Main loop: Solving the problem at each time point.
    for period in period_of_analysis_days:
        print(f"Period in days: {period}")

        # 1. Define train and test set.
        # The split is decided on the first period only; later periods reuse
        # the same patient ids so the test set stays the same.
        if id_list_flag:
            id_list_flag = False
            df = combined_notes(disease,
                                year_survival=year_survival,
                                period=period,
                                authortype_list=authortype_list,
                                added_features_list=added_features_list)
            idlist = df['id'].copy().tolist()
            if unic_label:
                examples = df[examples_col_names].copy()
                labels = df['label'].copy()
                X_tr, X_te, y_tr, y_te = train_test_split(
                    examples,
                    labels,
                    train_size=train_frac,
                    stratify=labels,
                    random_state=random_state)
                train = pd.concat([y_tr, X_tr], axis=1).reset_index(drop=True)
                test = pd.concat([y_te, X_te], axis=1).reset_index(drop=True)
            else:
                train, test = nested_train_test_split(
                    df,
                    examples_col_names,
                    labels_col_names,
                    train_frac=train_frac,
                    random_state=random_state)
            test_ids = test['id'].copy().tolist()
            print('text is in df:', df.columns)
        else:
            df = combined_notes(disease,
                                period=period,
                                year_survival=year_survival,
                                idlist=idlist,
                                authortype_list=authortype_list,
                                added_features_list=added_features_list)
            train = df[~df['id'].isin(test_ids)].copy().reset_index(drop=True)
            test = df[df['id'].isin(test_ids)].copy().reset_index(drop=True)
        train['is_test'] = False
        test['is_test'] = True

        # 2. NLP.
        # Create a temporary folder to store the transformers of the
        # pipeline.
        cachedir = mkdtemp()
        try:
            memory = Memory(location=cachedir, verbose=10)
            pipeline = Pipeline(
                [('tfidfvectorizer',
                  TfidfVectorizer(ngram_range=(1, ngram),
                                  tokenizer=tokenize,
                                  min_df=3,
                                  max_df=0.9,
                                  strip_accents='unicode',
                                  use_idf=1,
                                  smooth_idf=1,
                                  sublinear_tf=1)),
                 ('randomOversampler',
                  RandomOverSampler(random_state=random_state)),
                 ('logisticregression',
                  LogisticRegression(random_state=random_state))],
                memory=memory)
            if unic_label:
                x_train = train['text'].copy()
                y_train = train['label'].values.copy()
                cross_validation = StratifiedKFold(n_splits=kfold,
                                                   shuffle=True,
                                                   random_state=random_state)
            else:
                # Nested stratification.
                x_train_2col = train[['stage_grade', 'text']].copy()
                x_train = train['text'].copy()
                y_train = train['label'].values.copy()
                cross_validation = CatStratifiedKFold(
                    n_splits=kfold, shuffle=True,
                    random_state=random_state).split(x_train_2col, y_train)
            x_test = test['text'].copy()
            y_test = test['label'].values.copy()
            grid_search = GridSearchCV(pipeline,
                                       param_grid=parameter_grid,
                                       scoring=scoring,
                                       refit='f1',
                                       cv=cross_validation)
            # Fit.
            grid_search.fit(x_train, y_train)
        finally:
            # Always clear the cache directory, even when the search fails,
            # so temporary folders do not accumulate.
            rmtree(cachedir)

        # Record cross validation metrics.
        val_f1.append(
            (grid_search.cv_results_['mean_test_f1'][grid_search.best_index_],
             grid_search.cv_results_['std_test_f1'][grid_search.best_index_]))
        val_area_under_curve.append(
            (grid_search.cv_results_['mean_test_auc'][grid_search.best_index_],
             grid_search.cv_results_['std_test_auc'][grid_search.best_index_]))

        # Record cross validation hyperparameters.
        best_max_features = grid_search.best_params_[
            'tfidfvectorizer__max_features']
        best_c = grid_search.best_params_['logisticregression__C']
        tfidf_param_text.append(best_max_features)
        C_param_text.append(best_c)

        # Final model.
        pipeline_final = Pipeline([
            ('tfidfvectorizer',
             TfidfVectorizer(ngram_range=(1, ngram),
                             tokenizer=tokenize,
                             min_df=3,
                             max_df=0.9,
                             strip_accents='unicode',
                             use_idf=1,
                             smooth_idf=1,
                             sublinear_tf=1,
                             max_features=best_max_features)),
            ('randomOversampler',
             RandomOverSampler(random_state=random_state)),
            ('logisticregression',
             LogisticRegression(random_state=random_state, C=best_c))
        ])
        final_model = pipeline_final.fit(x_train, y_train)
        preds_train = final_model.predict(x_train)
        # Add predictions in train DF.
        train[str(period) + '_tf_pred'] = preds_train
        train_f1 = f1_score(y_train, preds_train, average='macro')
        f1_train.append((train_f1, 0))  # Record train f1
        train_auc = roc_auc_score(y_train, preds_train)
        area_under_curve_train.append((train_auc, 0))  # Record train auc.
        n.append(len(train))  # Add number of examples in train.
        preds_test = final_model.predict(x_test)
        # Add predictions in test DF.
        test[str(period) + '_tf_pred'] = preds_test
        predictions.append(preds_test)  # Add predictions in list.
        test_f1 = f1_score(y_test, preds_test, average='macro')
        f1.append((test_f1, 0))  # Record test f1.
        test_auc = roc_auc_score(y_test, preds_test)
        area_under_curve.append((test_auc, 0))  # Record test auc.
        n_test.append(len(test))  # Add number of examples in test.

        # Selecting features.
        pip_tfidf_ros = Pipeline([
            ('tfidfvectorizer',
             TfidfVectorizer(ngram_range=(1, ngram),
                             tokenizer=tokenize,
                             min_df=3,
                             max_df=0.9,
                             strip_accents='unicode',
                             use_idf=1,
                             smooth_idf=1,
                             sublinear_tf=1,
                             max_features=best_max_features)),
            ('randomOversampler',
             RandomOverSampler(random_state=random_state))
        ])
        X_res, y_res = pip_tfidf_ros.fit_resample(x_train, y_train)
        clf = LogisticRegression(random_state=random_state, C=best_c)
        # threshold=-inf makes the selection depend on max_features only.
        sfm = SelectFromModel(clf,
                              threshold=-np.inf,
                              max_features=max_features)
        sfm.fit(X_res, y_res)
        embeded_lr_support = sfm.get_support()
        X_res_pandas = pd.DataFrame(X_res.todense())
        embeded_lr_feature = X_res_pandas.loc[:, embeded_lr_support] \
            .columns.tolist()
        # NOTE(review): get_feature_names() is deprecated in newer
        # scikit-learn releases in favour of get_feature_names_out() --
        # confirm against the pinned version.
        feature_names_list = np.array(
            pip_tfidf_ros['tfidfvectorizer'].get_feature_names()
        )[embeded_lr_feature].tolist()
        feature_names.append(feature_names_list)  # Add important features.

        # 3. Stage.
        x_train_s = train[['stage_grade']].copy()
        y_train_s = train['label'].values.copy()
        x_test_s = test[['stage_grade']].copy()
        y_test_s = test['label'].values.copy()

        # Create a temporary folder to store the transformers of the
        # pipeline.
        cachedir = mkdtemp()
        try:
            memory = Memory(location=cachedir, verbose=10)
            pipeline = Pipeline(
                [('randomOversampler',
                  RandomOverSampler(random_state=random_state)),
                 ('onehotencoder', OneHotEncoder(handle_unknown='ignore')),
                 ('logisticregression',
                  LogisticRegression(random_state=random_state))],
                memory=memory)
            cross_validation = StratifiedKFold(n_splits=kfold,
                                               shuffle=True,
                                               random_state=random_state)
            grid_search = GridSearchCV(pipeline,
                                       param_grid=parameter_grid_stage,
                                       scoring=scoring,
                                       refit='f1',
                                       cv=cross_validation)
            grid_search.fit(x_train_s, y_train_s)
        finally:
            # Always clear the cache directory, even when the search fails.
            rmtree(cachedir)

        val_f1_s.append(
            (grid_search.cv_results_['mean_test_f1'][grid_search.best_index_],
             grid_search.cv_results_['std_test_f1'][grid_search.best_index_]))
        val_area_under_curve_s.append(
            (grid_search.cv_results_['mean_test_auc'][grid_search.best_index_],
             grid_search.cv_results_['std_test_auc'][grid_search.best_index_]))

        # Final model.
        pipeline_final = Pipeline([
            ('randomOversampler',
             RandomOverSampler(random_state=random_state)),
            ('onehotencoder', OneHotEncoder(handle_unknown='ignore')),
            ('logisticregression',
             LogisticRegression(
                 random_state=random_state,
                 C=grid_search.best_params_['logisticregression__C']))
        ])
        C_param_s.append(grid_search.best_params_['logisticregression__C'])
        final_model = pipeline_final.fit(x_train_s, y_train_s)
        preds_train_s = final_model.predict(x_train_s)

        # Fill the output dictionary values
        train[str(period) + '_s_pred'] = preds_train_s
        train_f1_s = f1_score(y_train_s, preds_train_s, average='macro')
        f1_train_s.append(train_f1_s)
        train_auc_s = roc_auc_score(y_train_s, preds_train_s)
        area_under_curve_train_s.append(train_auc_s)
        preds_test_s = final_model.predict(x_test_s)
        test[str(period) + '_s_pred'] = preds_test_s
        predictions_s.append(preds_test_s)
        test_f1_s = f1_score(y_test_s, preds_test_s, average='macro')
        f1_s.append((test_f1_s, 0))
        test_auc_s = roc_auc_score(y_test_s, preds_test_s)
        area_under_curve_s.append((test_auc_s, 0))
        train_list.append(train)
        test_list.append(test)

    return dict(val_f1=val_f1,
                val_area_under_curve=val_area_under_curve,
                tfidf_param_text=tfidf_param_text,
                C_param_text=C_param_text,
                f1_train=f1_train,
                area_under_curve_train=area_under_curve_train,
                n=n,
                f1=f1,
                area_under_curve=area_under_curve,
                n_test=n_test,
                feature_names=feature_names,
                predictions=predictions,
                val_f1_s=val_f1_s,
                val_area_under_curve_s=val_area_under_curve_s,
                C_param_s=C_param_s,
                f1_train_s=f1_train_s,
                area_under_curve_train_s=area_under_curve_train_s,
                f1_s=f1_s,
                area_under_curve_s=area_under_curve_s,
                predictions_s=predictions_s,
                train=train_list,
                test=test_list,
                random_state=random_state)


# ---------------------------------------------------------------------------
# ***************************************************************************
def _resampling_targets(y_train, nclass):
    """Return the per-class sample counts used to rebalance one training fold.

    Args:
        y_train: Labels of the training fold.
        nclass: Number of distinct classes in the full label vector.

    Returns:
        A ``(over_targets, under_targets)`` pair of dicts mapping class label
        to sample count: the first is handed to SMOTE, the second to
        RandomUnderSampler.

    Raises:
        ValueError: If ``nclass`` is not 2, 3 or 4; no counts are defined for
            any other number of classes.
    """
    observed = Counter(y_train)
    if nclass == 3:
        over_targets = {
            'SN Ia': observed['SN Ia'],
            'Core Collapse': observed['Core Collapse'],
            'SLSN': 1000
        }
        under_targets = {'SN Ia': 1000, 'SLSN': 1000, 'Core Collapse': 1000}
    elif nclass == 4:
        over_targets = {
            'SN Ia': observed['SN Ia'],
            'Core Collapse': observed['Core Collapse'],
            'SN Ia Pec': 500,
            'SLSN': 500
        }
        under_targets = {
            'SN Ia': 1000,
            'Core Collapse': 1000,
            'SN Ia Pec': 500,
            'SLSN': 500
        }
    elif nclass == 2:
        over_targets = {'SN Ia': observed['SN Ia'], 'Core Collapse': 3500}
        under_targets = {'SN Ia': 3500, 'Core Collapse': 3500}
    else:
        raise ValueError(
            'balance=True only supports 2, 3 or 4 classes, got %d' % nclass)
    return over_targets, under_targets


def plot_ROC_wCV(ax, X, y, names, save=True, balance=True):
    """Plot one-vs-rest mean ROC curves from 5-fold stratified cross-validation.

    For every class the full cross-validation is run again: each fold is
    optionally rebalanced (SMOTE followed by random undersampling), a random
    forest is fitted on it, and the ROC curve of that class is interpolated
    onto a common false-positive-rate grid. The mean curve and a +/- 1 std
    band per class are drawn on ``ax``.

    NOTE(review): neither the samplers nor the forest receive a
    ``random_state``, so repeated calls will not give identical numbers.

    Args:
        ax: Matplotlib axes to draw on.
        X: Feature matrix; indexed with the integer index arrays produced by
            ``cv.split`` (assumes a NumPy array -- TODO confirm).
        y: Class labels, indexed the same way as ``X``.
        names: Per-sample identifiers, indexed the same way as ``X``; used to
            report which test samples were misclassified.
        save: Unused; kept so existing callers keep working.
        balance: If True, rebalance each training fold before fitting.

    Returns:
        Tuple ``(accuracy, rf, all_confMatrices, allAcc, wrong)``:
        mean accuracy in percent over all class/fold fits, the classifier as
        left by the last fit, the row-normalised confusion matrix of every
        fit, the accuracy in percent of every fit, and for every fit the
        names of the misclassified test samples.

    Raises:
        ValueError: If ``balance`` is True and ``y`` does not contain 2, 3 or
            4 classes.
    """
    sns.set_context("paper")
    nsplit = 5
    cv = StratifiedKFold(n_splits=nsplit)
    classes = np.unique(y)
    nclass = len(classes)
    colors = plt.cm.Dark2(np.linspace(0, 1, nclass))
    rf = RandomForestClassifier(n_estimators=1400,
                                min_samples_split=2,
                                min_samples_leaf=1,
                                max_features='sqrt',
                                max_depth=90,
                                bootstrap=False)
    allAcc = []
    all_confMatrices = []
    wrong = []
    mean_fpr = np.linspace(0, 1, 100)
    accuracy_tot = 0

    for j in range(nclass):
        # Collected per class: sharing these lists across classes would mix
        # the folds of earlier classes into this class's mean curve and AUC.
        tprs = []
        aucs = []
        for train, test in cv.split(X, y):
            names_test = names[test]
            if balance:
                sampling1, sampling2 = _resampling_targets(y[train], nclass)
                over = SMOTE(sampling_strategy=sampling1)
                under = RandomUnderSampler(sampling_strategy=sampling2)
                pipeline = Pipeline(steps=[('o', over), ('u', under)])
                Xtrain_resampled, ytrain_resampled = pipeline.fit_resample(
                    X[train], y[train])
            else:
                Xtrain_resampled = X[train]
                ytrain_resampled = y[train]
            print('Distribution after imbalancing: {}'.format(
                Counter(ytrain_resampled)))
            print('Distribution of test set: {}'.format(Counter(y[test])))

            probas_ = rf.fit(Xtrain_resampled,
                             ytrain_resampled).predict_proba(X[test])
            predictions = rf.predict(X[test])

            # ROC curve of class j against the rest, resampled onto mean_fpr
            # so that the folds can be averaged point by point.
            fpr, tpr, _ = roc_curve(y[test], probas_[:, j],
                                    pos_label=classes[j])
            fold_tpr = np.interp(mean_fpr, fpr, tpr)
            fold_tpr[0] = 0.0
            tprs.append(fold_tpr)
            aucs.append(auc(fpr, tpr))

            tempAccuracy = np.sum(predictions == y[test]) / len(y[test]) * 100
            wrong.append(names_test[y[test] != predictions])
            print(tempAccuracy)
            allAcc.append(tempAccuracy)
            matr = sklearn.metrics.confusion_matrix(y[test],
                                                    predictions,
                                                    normalize='true')
            all_confMatrices.append(matr)
            print(matr)
            accuracy_tot += tempAccuracy

        mean_tpr = np.mean(tprs, axis=0)
        mean_tpr[-1] = 1.0
        mean_auc = auc(mean_fpr, mean_tpr)
        std_auc = np.std(aucs)

        if classes[j] == 'Core Collapse':
            label = r'CC (%0.2f $\pm$ %0.2f)' % (mean_auc, std_auc)
        elif classes[j] == 'SLSN':
            # Handled separately: strip("SN ") below removes *characters*,
            # not a prefix, and would mangle this label.
            label = r'%s (%0.2f $\pm$ %0.2f)' % (classes[j], mean_auc,
                                                 std_auc)
        else:
            label = r'%s (%0.2f $\pm$ %0.2f)' % (classes[j].strip("SN "),
                                                 mean_auc, std_auc)
        ax.plot(mean_fpr, mean_tpr, color=colors[j], label=label, lw=2,
                alpha=.8)

        std_tpr = np.std(tprs, axis=0)
        tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
        tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
        ax.fill_between(mean_fpr,
                        tprs_lower,
                        tprs_upper,
                        color=colors[j],
                        alpha=.05)

    # Every class repeats the full cross-validation, hence the divisor.
    accuracy = accuracy_tot / (nsplit * nclass)

    # Chance-level diagonal.
    ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='k', alpha=.8)
    ax.set_xlabel("False Positive Rate", fontsize=16)
    ax.set_ylabel("True Positive Rate", fontsize=16)
    ax.legend(loc=4, fontsize=12)
    # Annotate the axes that were passed in rather than pyplot's current axes.
    # NOTE(review): the sample counts in these labels are hard-coded.
    ax.text(0.1, 0.9, r'$N_{train} = 7000$')
    ax.text(0.1, 0.82, r'$N_{test} = 2226$')
    ax.set_xlim([-0.05, 1.05])
    ax.set_ylim([-0.05, 1.05])
    return accuracy, rf, all_confMatrices, allAcc, wrong
random_state=42) x8_scaled_train.shape, x8_test.shape, y8_train.shape, y8_test.shape Counter(y5_train.failure), Counter(y5_test.failure) #Dataset # 4 # Run this for checking results with NO UPSAMPLE or DOWNSAMPLE of data. classify_hdd_failure(x8_scaled_train, x8_test, y8_train.values.ravel(), y8_test.values.ravel()) # Method 1: Upsample minority class and Downsample majority class from imblearn.pipeline import Pipeline oversample = SMOTE(sampling_strategy=0.2, random_state=42) undersample = RandomUnderSampler(sampling_strategy=0.3, random_state=42) steps = [('o', oversample), ('u', undersample)] pipeline = Pipeline(steps=steps) x8_scaled_train_s, y8_train_s = pipeline.fit_resample(x8_scaled_train, y8_train) # the Dataset has now reduced to 354K rows. This move was mainly to reduce the size of the dataset. Computational resources. x8_scaled_train_s.shape, x8_test.shape # After the SMOTE, the failure percentage in the data has now increased to 33%. Could look at how the results in the analysis change with this % Counter(y8_train_s.failure) print( 'The percentage of failure in the dataset is now: ', Counter(y8_train_s.failure)[1] / (Counter(y8_train_s.failure)[0] + Counter(y8_train_s.failure)[1])) ''' # Method 2: Upsample minority class. This method upsamples the minority class to 50% of the data. sm = SMOTE(random_state=42) x_scaled_sm , y_sm = sm.fit_sample(x_scaled , y) print('The percentage of failures now in data = ',Counter(y_sm.failure)[1]/(Counter(y_sm.failure)[1]+Counter(y_sm.failure)[0]))
# These ratio values correspond to the percentages of oversampling that were
# tested: 4%, 10%, 25%, 35% and 50%. Each ratio is p / (1 - p) for a target
# minority share p (e.g. 0.25 / 0.75 ~= 0.333).
ratio_list = [0.042, 0.111, 0.333, 0.538, 1]
percentage_list = [4, 10, 25, 35, 50]

# Create a train-test split where the ratio of target class is maintained.
# The split is seeded and does not depend on the ratio under test, so it is
# computed once instead of being rebuilt on every loop iteration.
x_train, x_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=47,
                                                    stratify=y)

# This loop executes the oversampling strategy (in this case ADASYN) for all
# the ratios that were tested.
for ratio, percentage in zip(ratio_list, percentage_list):
    # Initialize an ADASYN sampler with the ratio that will be tested.
    # NOTE(review): no random_state is passed, so the resampled data can
    # differ between runs.
    over = ADASYN(sampling_strategy=ratio)

    # Initialize a pipeline (one can add extra steps here if required).
    steps = [('o', over)]
    pipeline = Pipeline(steps)

    # Resample data.
    x_res, y_res = pipeline.fit_resample(x_train, y_train)
    print('resample finished')

    # Train an xg_boost model with the resampled data.
    xgb = xg_boost(x_res, y_res, x_test, y_test, f"ADASYN_{percentage}")

# The code below was used to calculate the running times.
# Since some running times were very long, we let the code time-out after 10 hours.
# It is less relevant for WWF, hence it is commented out.

# List of sub-sample sizes that were evaluated to calculate running times.
# subset_list = [30000, 50000, 75000, 100000, 200000, 300000, 400000, 500000, 600000, 700000, 800000, 900000, 1000000, 1500000, 2000000]
# times_subsetsize_list = []
# def calculate_running_times():
#     for i in subset_list: