def main():
    """End-to-end pipeline: fit a label binarizer, train a model on the
    image training set, predict on the test set, and write submission CSVs.

    NOTE(review): relies on module-level globals not visible here —
    `img_width`, `img_height`, and `modelname`; confirm they are defined
    before this runs.
    """
    # Get label encoder
    lb = LabelBinarizer()
    lbenc = lb.fit(utils.get_classes())
    # Get train data (images resized to img_width x img_height)
    X_train, y_train, train_filenames = utils.get_train(
        '../input/train', list(lbenc.classes_), img_width, img_height)
    # Create and train model
    model = train(X_train, y_train, epochs=100, batch_size=32)
    print("+++++++++++++++++++++++++++++++++++++++++++")
    # Load model ...
    #model = load_model('../models/'+ 'model2_f0.86/'+ 'model2-64-0.341.h5')
    # Get test data
    X_test, X_test_id = utils.get_test('../input/test', img_width, img_height)
    # Predict on test data
    preds = model.predict(X_test, verbose=1)
    # Create submission: class labels recovered via inverse_transform
    utils.create_submission(lbenc.inverse_transform(preds), X_test_id,
                            output_path="../submissions/",
                            filename=modelname, isSubmission=True)
    # Also dump raw probabilities for later ensembling
    utils.to_csv_ens(lbenc.inverse_transform(preds), preds, X_test_id,
                     utils.get_classes(),
                     output_path="../submissions/", filename=modelname)
    print('Finished.')
def main():
    """CLI entry point: parse arguments, build a DataParallel ResNet, set up
    train/validation DataLoaders over the protein dataset, and train.
    """
    parser = build_parser()
    options = parser.parse_args()
    batch_size=options.batch_size
    #train_names=utils.train_names
    # train_set = utils.get_train()
    # val_set = utils.val_n
    model = Resnet()
    model.cuda()
    # wrap for multi-GPU data parallelism
    model = torch.nn.DataParallel(model)
    ###########################
    #train_set,val_set = train_test_split(train_names, test_size=0.2, random_state=2050)
    # fixed split provided by utils (replaces the random split above)
    train_set,val_set=utils.get_split()
    train_set = utils.get_train(train_set)
    # NOTE: "train_datasest" typo kept as-is (local name only)
    train_datasest=ProteinDataset(dirpath=utils.TRAIN,fnames=train_set)
    train_loader = DataLoader(train_datasest, batch_size=batch_size,
                              shuffle=True,num_workers=4)
    val_dataset = ProteinDataset(dirpath=utils.TRAIN,fnames=val_set)
    val_loader = DataLoader(val_dataset, batch_size=batch_size,
                            shuffle=False, num_workers=4)
    # train for the requested number of epochs, checkpointing to this file
    train(model, options.epochs, train_loader,val_loader,'sgdr_rgb.pkl')
def pca(data):
    """Reduce the feature columns of `data` (minus the target "cnt") with PCA
    fit on the training portion only, then reattach "cnt".

    :param data: dataframe containing a "cnt" target column
    :return: dataframe of PCA-transformed features plus the original "cnt"
    """
    # fit PCA only on the training rows to avoid leaking holdout data
    train = get_train(data).drop("cnt", axis=1)
    test = get_holdout(data).drop("cnt", axis=1)
    # keep enough components to explain 95% of variance
    pca = daskpca(n_components=0.95, svd_solver="full").fit(train)
    print("\tPerforming PCA dimensionality reduction...")
    pca_train = dd.DataFrame(data=pca.transform(train))
    pca_test = dd.DataFrame(data=pca.transform(test))
    # stack transformed train on top of transformed holdout
    new_df = pca_train.append(pca_test)
    # NOTE(review): assumes new_df's index aligns with data's index after
    # append — verify, otherwise "cnt" values may be misaligned
    new_df["cnt"] = data["cnt"]
    return new_df
def weather_cluster(data):
    """
    Assigns each row a cluster id from a KMeans model fit only on the
    weather-related columns, and stores it in a new "weather_cluster"
    categorical column.

    :param data: a pandas dataframe where each row is an hour
    :return: a pandas dataframe containing the new column
    """
    print("\tAdding clustering variable based on weather-related features...")
    weather_cols = ["weathersit", "temp", "atemp", "hum", "windspeed"]
    encoded = dd.get_dummies(data.copy()[weather_cols])
    fit_rows = get_train(encoded)
    holdout_rows = get_holdout(encoded)
    # magic numbers, blech
    model = KMeans(n_clusters=5, random_state=SEED).fit(fit_rows)
    # training rows keep their fitted labels; holdout rows are predicted
    cluster_ids = da.append(model.labels_, model.predict(holdout_rows))
    data["weather_cluster"] = cluster_ids
    data["weather_cluster"] = data["weather_cluster"].astype("category")
    return data
def cluster_variable(data):
    """
    Assigns each row a cluster id from a KMeans model fit on all
    dummy-encoded features (excluding the "cnt" target), stored in a new
    categorical "cluster" column.

    :param data: a pandas dataframe where each row is an hour
    :return: a pandas dataframe containing the new column
    """
    print("\tAdding cluster variable...")
    # work on a copy; the caller's frame is left untouched
    data = data.copy()
    encoded = dd.get_dummies(data)
    fit_rows = get_train(encoded)
    holdout_rows = get_holdout(encoded)
    # magic numbers, blech
    model = KMeans(n_clusters=5,
                   random_state=SEED).fit(fit_rows.drop("cnt", axis=1))
    holdout_ids = model.predict(holdout_rows.drop("cnt", axis=1))
    # training rows keep their fitted labels; holdout rows are predicted
    data["cluster"] = da.append(model.labels_, holdout_ids)
    data["cluster"] = data["cluster"].astype("category")
    return data
def subcount_forecast(data, feature):
    """
    Creates a new column that is the predicted value of the input feature.

    Essentially an abstraction for 'prediction_forecasts'.

    :param data: a pandas dataframe where each row is an hour
    :param feature: a String containing the feature that should be forecasted
        (one of: casual, registered)
    :return: a pandas dataframe containing the new column
    """
    var_name = feature + "_forecast"
    print("\tAdding {} variable...".format(var_name))
    # dummy-encode all features except the target "cnt"; the raw subcount
    # column itself is re-read from the source csv and appended as the label
    df = dd.get_dummies(data.copy().drop("cnt", axis=1))
    to_predict = dd.read_csv(PATH)[feature]
    df[feature] = to_predict
    # fit only on the training portion to avoid leakage into the holdout
    train = get_train(df)
    model = RandomForestRegressor(random_state=SEED)
    model_params = {"n_estimators": list(range(10, 110, 10))}
    #tscv = TimeSeriesSplit(n_splits=5)
    grid_search = GridSearchCV(estimator=model,
                               param_grid=model_params,
                               scoring="r2",
                               cv=None,
                               refit=True)
    grid_search.fit(train.drop(feature, axis=1), train[feature])
    # report best mean score +/- the std of the best parameter setting
    # (argmax over mean_test_score locates the winning row of cv_results_)
    print("\t\tPredictions for GridSearchCV on {}: {:.5f} +/- {:.5f}".format(
        feature, grid_search.best_score_,
        grid_search.cv_results_["std_test_score"][da.argmax(
            grid_search.cv_results_["mean_test_score"])]))
    # predict the subcount for EVERY row (train + holdout) and attach it
    data[var_name] = grid_search.best_estimator_.predict(
        dd.get_dummies(data.drop("cnt", axis=1)))
    return data
def summary(model,
            sampling_method,
            k_folds,
            use_international,
            cat_code,
            data_dir,
            results_dir,
            verbose=True):
    """Print an aggregate report of per-fold pickled results, then for each
    fold print the top-20 most important features and the feature values of
    the saved correct/incorrect example predictions.

    :param model: model name used in the result filenames (e.g. 'lgbm')
    :param sampling_method: oversampling method used in the filenames
    :param k_folds: number of CV folds whose pickles exist in results_dir
    :param use_international: whether the 'international' class was used
    :param cat_code: if True, data was loaded with category codes (not one-hot)
    :param data_dir: directory containing the raw data
    :param results_dir: directory containing the pickled per-fold results
    """
    print("model: {} - sampling method: {}".format(model, sampling_method))
    # average the numeric metrics across folds
    aggregate = {}
    for k in range(k_folds):
        result = pickle.load(
            open(
                os.path.join(
                    results_dir,
                    "{}_{}_fold_{}.p".format(model, sampling_method, k + 1)),
                "rb"))
        print("Fold {}".format(k + 1))
        for key, val in result.items():
            print(key)
            print(val)
            # only numeric (float-like) metrics are accumulated
            if key not in aggregate:
                if type(val) is np.float32 or type(val) is np.float64 or type(
                        val) is float:
                    aggregate[key] = val
            else:
                if type(val) is np.float32 or type(val) is np.float64 or type(
                        val) is float:
                    aggregate[key] += val
    aggregate = {key: val / k_folds for key, val in aggregate.items()}
    print("Aggregate")
    print(aggregate)
    if model == 'logistic':
        # logistic regression runs don't pickle feature importances
        print("feature importance not implemented for logistic regression")
        return
    # reload data only to recover feature/country label names
    features, labels, feature_labels = get_train(
        data_dir, one_hot=not cat_code, use_international=use_international)
    country_names = get_country_names(data_dir)
    if use_international:
        # everything past the first two countries collapses to 'international'
        country_names = country_names[:2].tolist() + ['international']
    for k in range(k_folds):
        print("Fold {}".format(k + 1))
        correct_examples = pickle.load(
            open(
                os.path.join(
                    results_dir, "{}_{}_fold_{}_correct_examples.p".format(
                        model, sampling_method, k + 1)), "rb"))
        incorrect_examples = pickle.load(
            open(
                os.path.join(
                    results_dir, "{}_{}_fold_{}_incorrect_examples.p".format(
                        model, sampling_method, k + 1)), "rb"))
        feature_imp = pickle.load(
            open(
                os.path.join(
                    results_dir, "{}_{}_feature_imp_fold_{}.p".format(
                        model, sampling_method, k + 1)), "rb"))
        # top 20 features by importance, descending
        top_20 = [
            (label, feature_imp[label])
            for label in sorted(feature_imp, key=feature_imp.get, reverse=True)
        ][:20]
        print(top_20)
        print("correct examples\n")
        for example in correct_examples:
            print("{} features\n".format(country_names[example['label']]))
            feature_dict = {
                label: feature
                for label, feature in zip(feature_labels, example['features'])
            }
            for label, weight in top_20:
                print("{},{}".format(label, feature_dict[label]))
            print("")
        print("\nincorrect examples\n")
        for example in incorrect_examples:
            print("{} features".format(country_names[example['label']]))
            print("prediction was {}\n".format(
                country_names[example['prediction']]))
            feature_dict = {
                label: feature
                for label, feature in zip(feature_labels, example['features'])
            }
            for label, weight in top_20:
                print("{},{}".format(label, feature_dict[label]))
            print("")
def train(sampling_method,
          k_folds,
          data_dir,
          results_dir,
          device='cpu',
          use_international=False,
          verbose=True):
    """K-fold cross-validated LightGBM training with optional oversampling.

    For each fold: optionally resample the training split, train a LightGBM
    booster, evaluate on the held-out split, and pickle the metrics and
    feature importances into results_dir.

    :param sampling_method: one of 'adasyn'|'smote'|'random'|'smoteenn';
        anything else means no resampling
    :param k_folds: number of CV folds
    :param data_dir: directory the training data is read from
    :param results_dir: directory pickled results are written to
    :param device: 'cpu' or a GPU device string passed to LightGBM
    :param use_international: collapse non-top countries into one binary class
    """
    model = 'lgbm'
    start_time = time.time()
    if verbose:
        print("Using device: {}".format(device))
        print("Reading train data in...")
        if use_international:
            print("Using international class.")
    # category codes (one_hot=False) so LightGBM can use native categoricals
    X_train, Y_train, feature_labels = get_train(
        data_dir, one_hot=False, use_international=use_international)
    categorical_feature = ['age_bucket', 'gender', 'signup_method',
                           'signup_flow', 'language', 'affiliate_channel',
                           'affiliate_provider', 'first_affiliate_tracked',
                           'signup_app', 'first_device_type', 'first_browser']
    if verbose:
        print("Successfully loaded data")
        print("Starting Cross-Validation with {} folds...".format(k_folds))
    kf = KFold(n_splits=k_folds)
    kf.get_n_splits(X_train)
    params = {
        'task': 'train',
        'objective': 'multiclass',
        'num_class': 12,
        'num_leaves': 31,
        'lambda_l2': 0.1,
        'learning_rate': 0.3,
        'feature_fraction': 0.9,
        'min_child_weight': 1.0,
        'device': device,
        'gpu_device_id': 0,
        'gpu_platform_id': 0,
        'max_bin': 63,
        'verbose': 0
    }
    if use_international:
        # binary objective must not carry num_class
        params['objective'] = 'binary'
        del params["num_class"]
    for k, (train_index, test_index) in enumerate(kf.split(X_train)):
        print("Processing Fold {} out of {}".format(k+1, k_folds))
        X_trainCV, X_testCV = X_train[train_index], X_train[test_index]
        Y_trainCV, Y_testCV = Y_train[train_index], Y_train[test_index]
        if verbose:
            print("{} sampling process started...".format(sampling_method))
        curr_time = time.time()
        # oversample only the training split; the test split stays untouched
        if sampling_method == "adasyn":
            X_train_resampled, Y_train_resampled = ADASYN().fit_sample(
                X_trainCV, Y_trainCV)
        elif sampling_method == "smote":
            X_train_resampled, Y_train_resampled = SMOTE().fit_sample(
                X_trainCV, Y_trainCV)
        elif sampling_method == "random":
            X_train_resampled, Y_train_resampled = RandomOverSampler(
            ).fit_sample(X_trainCV, Y_trainCV)
        elif sampling_method == "smoteenn":
            X_train_resampled, Y_train_resampled = SMOTEENN().fit_sample(
                X_trainCV, Y_trainCV)
        else:
            X_train_resampled, Y_train_resampled = X_trainCV, Y_trainCV
        if verbose:
            print("Oversampling completed")
            print("Time Taken: {:.2f}".format(time.time()-curr_time))
            print("Size of Oversampled data: {}".format(
                X_train_resampled.shape))
        print("{} model(s) selected for classification".format(model))
        curr_time = time.time()
        lgb_train = lgb.Dataset(data=X_train_resampled,
                                label=Y_train_resampled,
                                feature_name=feature_labels,
                                categorical_feature=categorical_feature)
        clf = lgb.train(params, lgb_train, num_boost_round=30)
        print("Time taken: {:.2f}".format(time.time()-curr_time))
        Y_probs = clf.predict(X_testCV)
        result = evaluate(Y_testCV, Y_probs)
        print(result)
        feature_imp = clf.feature_importance(importance_type='gain')
        feature_imp = {label: imp
                       for label, imp in zip(feature_labels, feature_imp)}
        # persist importances and metrics per fold
        pickle.dump(
            feature_imp,
            open(
                os.path.join(
                    results_dir, "{}_{}_feature_imp_fold_{}.p".format(
                        model, sampling_method, k+1)), "wb"))
        pickle.dump(
            result,
            open(
                os.path.join(
                    results_dir, "{}_{}_fold_{}.p".format(
                        model, sampling_method, k+1)), "wb"))
        save_examples(X_testCV, Y_testCV, Y_probs, model, sampling_method,
                      k+1, save_dir=results_dir)
    print("Training took {:.2f}s.".format(time.time()-start_time))
    print("Finished.")
def submission(model,
               sampling_method,
               data_dir,
               results_dir,
               device='cpu',
               verbose=True):
    """Train on the full training set (optionally oversampled) and write a
    top-5 country submission CSV for the test set.

    :param model: 'lgbm' to train LightGBM, anything else trains XGBoost
    :param sampling_method: 'adasyn'|'smote'|'random'|'smoteenn'; anything
        else means no resampling
    :param data_dir: directory the data is read from
    :param results_dir: directory the submission csv is written to
    :param device: 'cpu' or a GPU device string
    """
    if verbose:
        print("Using device: {}".format(device))
        print("Reading train data in...")
    if model == 'lgbm':
        # category codes for LightGBM's native categorical handling
        X_train, Y_train, feature_labels = get_train(data_dir, one_hot=False)
    else:
        X_train, Y_train, feature_labels = get_train(data_dir)
    X_test = get_test(data_dir)
    train_ids, test_ids = get_ids(data_dir)
    country_names = get_country_names(data_dir)
    if verbose:
        print("Successfully loaded data")
    lgbm_params = {
        'task': 'train',
        'objective': 'multiclass',
        'num_class': 12,
        'num_leaves': 31,
        'learning_rate': 0.3,
        'lambda_l2': 1.0,
        'feature_fraction': 0.9,
        'min_child_weight': 1.0,
        'device': device,
        'gpu_device_id': 0,
        'gpu_platform_id': 0,
        'max_bin': 63,
        'verbose': 0
    }
    if device == 'cpu':
        xgb_params = {
            "objective": "multi:softprob",
            "num_class": 12,
            "tree_method": "hist",
            "colsample_bytree": 0.9,
            "n_jobs": 2,
            "silent": 1
        }
    else:
        xgb_params = {
            "objective": "multi:softprob",
            "num_class": 12,
            "tree_method": "gpu_hist",
            "colsample_bytree": 0.9,
            "gpu_id": 0,
            "max_bin": 16,
            "silent": 1
        }
    if verbose:
        print("{} sampling process started...".format(sampling_method))
    curr_time = time.time()
    if sampling_method == "adasyn":
        X_train_resampled, Y_train_resampled = ADASYN().fit_sample(
            X_train, Y_train)
    elif sampling_method == "smote":
        X_train_resampled, Y_train_resampled = SMOTE().fit_sample(
            X_train, Y_train)
    elif sampling_method == "random":
        X_train_resampled, Y_train_resampled = RandomOverSampler().fit_sample(
            X_train, Y_train)
    elif sampling_method == "smoteenn":
        X_train_resampled, Y_train_resampled = SMOTEENN().fit_sample(
            X_train, Y_train)
    else:
        X_train_resampled, Y_train_resampled = X_train, Y_train
    if verbose:
        print("Oversampling completed")
        print("Time Taken: {:.2f}".format(time.time() - curr_time))
        print("Size of Oversampled data: {}".format(X_train_resampled.shape))
    print("{} selected for classification".format(model))
    curr_time = time.time()
    if model == 'lgbm':
        categorical_feature = [
            'age_bucket', 'gender', 'signup_method', 'signup_flow',
            'language', 'affiliate_channel', 'affiliate_provider',
            'first_affiliate_tracked', 'signup_app', 'first_device_type',
            'first_browser'
        ]
        lgb_train = lgb.Dataset(data=X_train_resampled,
                                label=Y_train_resampled,
                                feature_name=feature_labels,
                                categorical_feature=categorical_feature)
        clf = lgb.train(lgbm_params, lgb_train, num_boost_round=30)
        print("Time taken: {:.2f}".format(time.time() - curr_time))
        Y_probs = clf.predict(X_test)
        # BUGFIX: sort ALL class probabilities descending, then take the
        # top-5 class indices. The previous code argsorted only the first
        # five columns (Y_probs[:, :5]), which could never rank classes 5-11.
        order = np.argsort(-Y_probs, axis=1)[:, :5]
    else:
        X_train_xgb = xgb.DMatrix(X_train_resampled,
                                  Y_train_resampled,
                                  feature_names=feature_labels)
        X_test_xgb = xgb.DMatrix(X_test, feature_names=feature_labels)
        clf = xgb.train(xgb_params, X_train_xgb, 30)
        print("Time taken: {:.2f}".format(time.time() - curr_time))
        Y_probs = clf.predict(X_test_xgb)
        # BUGFIX: same top-5 correction as the lgbm branch above
        order = np.argsort(-Y_probs, axis=1)[:, :5]
    print("Generating submission csv...")
    # five ranked country rows per test id
    with open(os.path.join(results_dir, 'submission_{}.csv'.format(model)),
              'w') as f:
        writer = csv.writer(f, delimiter=',', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(['id', 'country'])
        for i in range(len(test_ids)):
            for k in range(5):
                writer.writerow([test_ids[i], country_names[order[i, k]]])
    print("Finished.")
""" This file get preprocessed data """ from keras.preprocessing.text import Tokenizer from keras.preprocessing.sequence import pad_sequences from keras.utils import to_categorical from sklearn.model_selection import train_test_split from tqdm import tqdm from utils import load_glove, clean_str, get_train import pandas as pd import numpy as np import re # get input train_df = get_train() texts = train_df['text'].to_list() tags = train_df['tag'].to_list() # clean the text train_df['text'] = train_df['text'].apply(clean_str) # text2sequence emb_size = 300 max_features = 6000 maxlen = 50 tokenizer = Tokenizer(num_words=max_features) tokenizer.fit_on_texts(texts) word_index = tokenizer.word_index sequences = tokenizer.texts_to_sequences(texts)
def train(model,
          sampling_method,
          k_folds,
          data_dir,
          results_dir,
          device='cpu',
          use_international=False,
          verbose=True):
    """K-fold cross-validated training of one (or all) classifier types with
    optional oversampling.

    For each fold: optionally resample the training split, fit the requested
    model ('tree'|'logistic'|'xgb'|'lgbm'|'ada'|'forest'; anything else trains
    all of them), evaluate on the held-out split, and pickle the metrics,
    feature importances (where available), and example predictions into
    results_dir.

    :param model: classifier key, or any other value to train every model
    :param sampling_method: 'adasyn'|'smote'|'random'|'smoteenn'; anything
        else means no resampling
    :param k_folds: number of CV folds
    :param data_dir: directory the training data is read from
    :param results_dir: directory pickled results are written to
    :param device: 'cpu' or GPU device string (affects lgbm/xgb params)
    :param use_international: collapse non-top countries into a binary class
    """
    start_time = time.time()
    if verbose:
        print("Using device: {}".format(device))
        print("Reading train data in...")
        if use_international:
            print("Using international class.")
    X_train, Y_train, feature_labels = get_train(
        data_dir, use_international=use_international)
    if verbose:
        print("Successfully loaded data")
        print("Starting Cross-Validation with {} folds...".format(k_folds))
    kf = KFold(n_splits=k_folds)
    kf.get_n_splits(X_train)
    lgbm_params = {
        'task': 'train',
        'objective': 'multiclass',
        'num_class': 12,
        'num_leaves': 31,
        'learning_rate': 0.1,
        'lambda_l2': 1.0,
        'feature_fraction': 0.9,
        'min_child_weight': 1.0,
        'device': device,
        'gpu_device_id': 0,
        'gpu_platform_id': 0,
        'max_bin': 63,
        'verbose': 0
    }
    if use_international:
        # binary objective must not carry num_class
        lgbm_params['objective'] = 'binary'
        del lgbm_params["num_class"]
    if device == 'cpu':
        xgb_params = {
            "objective": "multi:softprob",
            "num_class": 12,
            "tree_method": "hist",
            "colsample_bytree": 0.9,
            "n_jobs": 2,
            "silent": 1
        }
    else:
        xgb_params = {
            "objective": "multi:softprob",
            "num_class": 12,
            "tree_method": "gpu_hist",
            "colsample_bytree": 0.9,
            "gpu_id": 0,
            "max_bin": 16,
            "silent": 1
        }
    if use_international:
        xgb_params["objective"] = "binary:logistic"
        del xgb_params["num_class"]
    for k, (train_index, test_index) in enumerate(kf.split(X_train)):
        print("Processing Fold {} out of {}".format(k + 1, k_folds))
        X_trainCV, X_testCV = X_train[train_index], X_train[test_index]
        Y_trainCV, Y_testCV = Y_train[train_index], Y_train[test_index]
        if verbose:
            print("{} sampling process started...".format(sampling_method))
        curr_time = time.time()
        # oversample only the training split; the test split stays untouched
        if sampling_method == "adasyn":
            X_train_resampled, Y_train_resampled = ADASYN().fit_sample(
                X_trainCV, Y_trainCV)
        elif sampling_method == "smote":
            X_train_resampled, Y_train_resampled = SMOTE().fit_sample(
                X_trainCV, Y_trainCV)
        elif sampling_method == "random":
            X_train_resampled, Y_train_resampled = RandomOverSampler(
            ).fit_sample(X_trainCV, Y_trainCV)
        elif sampling_method == "smoteenn":
            X_train_resampled, Y_train_resampled = SMOTEENN().fit_sample(
                X_trainCV, Y_trainCV)
        else:
            X_train_resampled, Y_train_resampled = X_trainCV, Y_trainCV
        if verbose:
            print("Oversampling completed")
            print("Time Taken: {:.2f}".format(time.time() - curr_time))
            print("Size of Oversampled data: {}".format(
                X_train_resampled.shape))
        print("{} model(s) selected for classification".format(model))
        curr_time = time.time()
        # --- per-model train / evaluate / persist branches ---
        if model == "tree":
            clf = DecisionTreeClassifier().fit(X_train_resampled,
                                               Y_train_resampled)
            print("Time taken: {:.2f}".format(time.time() - curr_time))
            Y_probs = clf.predict_proba(X_testCV)
            result = evaluate(Y_testCV, Y_probs)
            print(result)
            feature_imp = clf.feature_importances_
            feature_imp = {
                label: imp
                for label, imp in zip(feature_labels, feature_imp)
            }
            pickle.dump(
                feature_imp,
                open(
                    os.path.join(
                        results_dir, "{}_{}_feature_imp_fold_{}.p".format(
                            model, sampling_method, k + 1)), "wb"))
            pickle.dump(
                result,
                open(
                    os.path.join(
                        results_dir,
                        "{}_{}_fold_{}.p".format(model, sampling_method,
                                                 k + 1)), "wb"))
            save_examples(X_testCV,
                          Y_testCV,
                          Y_probs,
                          model,
                          sampling_method,
                          k + 1,
                          save_dir=results_dir)
        elif model == "logistic":
            clf = LogisticRegression(penalty="l2",
                                     C=1).fit(X_train_resampled,
                                              Y_train_resampled)
            print("Time taken: {:.2f}".format(time.time() - curr_time))
            Y_probs = clf.predict_proba(X_testCV)
            # sanity check: argmax over probabilities matches predict()
            assert (np.all(
                np.argmax(Y_probs, axis=1) == clf.predict(X_testCV)))
            result = evaluate(Y_testCV, Y_probs)
            print(result)
            # no feature-importance pickle for logistic regression
            pickle.dump(
                result,
                open(
                    os.path.join(
                        results_dir,
                        "{}_{}_fold_{}.p".format(model, sampling_method,
                                                 k + 1)), "wb"))
            save_examples(X_testCV,
                          Y_testCV,
                          Y_probs,
                          model,
                          sampling_method,
                          k + 1,
                          save_dir=results_dir)
        elif model == "xgb":
            X_train_xgb = xgb.DMatrix(X_train_resampled,
                                      Y_train_resampled,
                                      feature_names=feature_labels)
            X_test_xgb = xgb.DMatrix(X_testCV, feature_names=feature_labels)
            clf = xgb.train(xgb_params, X_train_xgb, 30)
            print("Time taken: {:.2f}".format(time.time() - curr_time))
            Y_probs = clf.predict(X_test_xgb)
            result = evaluate(Y_testCV, Y_probs)
            print(result)
            feature_imp = clf.get_score(importance_type='gain')
            pickle.dump(
                feature_imp,
                open(
                    os.path.join(
                        results_dir, "{}_{}_feature_imp_fold_{}.p".format(
                            model, sampling_method, k + 1)), "wb"))
            pickle.dump(
                result,
                open(
                    os.path.join(
                        results_dir,
                        "{}_{}_fold_{}.p".format(model, sampling_method,
                                                 k + 1)), "wb"))
            save_examples(X_testCV,
                          Y_testCV,
                          Y_probs,
                          model,
                          sampling_method,
                          k + 1,
                          save_dir=results_dir)
        elif model == "lgbm":
            lgb_train = lgb.Dataset(data=X_train_resampled,
                                    label=Y_train_resampled,
                                    feature_name=feature_labels)
            clf = lgb.train(lgbm_params, lgb_train, num_boost_round=30)
            print("Time taken: {:.2f}".format(time.time() - curr_time))
            Y_probs = clf.predict(X_testCV)
            result = evaluate(Y_testCV, Y_probs)
            print(result)
            feature_imp = clf.feature_importance(importance_type='gain')
            feature_imp = {
                label: imp
                for label, imp in zip(feature_labels, feature_imp)
            }
            pickle.dump(
                feature_imp,
                open(
                    os.path.join(
                        results_dir, "{}_{}_feature_imp_fold_{}.p".format(
                            model, sampling_method, k + 1)), "wb"))
            pickle.dump(
                result,
                open(
                    os.path.join(
                        results_dir,
                        "{}_{}_fold_{}.p".format(model, sampling_method,
                                                 k + 1)), "wb"))
            save_examples(X_testCV,
                          Y_testCV,
                          Y_probs,
                          model,
                          sampling_method,
                          k + 1,
                          save_dir=results_dir)
        elif model == "ada":
            clf = AdaBoostClassifier(n_estimators=30).fit(
                X_train_resampled, Y_train_resampled)
            print("Time taken for {}: {:.2f}".format(model,
                                                     time.time() - curr_time))
            Y_probs = clf.predict_proba(X_testCV)
            result = evaluate(Y_testCV, Y_probs)
            print(result)
            feature_imp = clf.feature_importances_
            feature_imp = {
                label: imp
                for label, imp in zip(feature_labels, feature_imp)
            }
            pickle.dump(
                feature_imp,
                open(
                    os.path.join(
                        results_dir, "{}_{}_feature_imp_fold_{}.p".format(
                            model, sampling_method, k + 1)), "wb"))
            pickle.dump(
                result,
                open(
                    os.path.join(
                        results_dir,
                        "{}_{}_fold_{}.p".format(model, sampling_method,
                                                 k + 1)), "wb"))
            save_examples(X_testCV,
                          Y_testCV,
                          Y_probs,
                          model,
                          sampling_method,
                          k + 1,
                          save_dir=results_dir)
        elif model == "forest":
            clf = RandomForestClassifier(n_estimators=30,
                                         n_jobs=2).fit(X_train_resampled,
                                                       Y_train_resampled)
            print("Time taken: {:.2f}".format(time.time() - curr_time))
            Y_probs = clf.predict_proba(X_testCV)
            result = evaluate(Y_testCV, Y_probs)
            print(result)
            feature_imp = clf.feature_importances_
            feature_imp = {
                label: imp
                for label, imp in zip(feature_labels, feature_imp)
            }
            pickle.dump(
                feature_imp,
                open(
                    os.path.join(
                        results_dir, "{}_{}_feature_imp_fold_{}.p".format(
                            model, sampling_method, k + 1)), "wb"))
            pickle.dump(
                result,
                open(
                    os.path.join(
                        results_dir,
                        "{}_{}_fold_{}.p".format(model, sampling_method,
                                                 k + 1)), "wb"))
            save_examples(X_testCV,
                          Y_testCV,
                          Y_probs,
                          model,
                          sampling_method,
                          k + 1,
                          save_dir=results_dir)
        else:
            # unknown key: train every model type on this fold
            models = [
                "lgbm", "xgb", "ada", "forest", "tree", "logistic"
            ]  # for category codes instead of one hot, use lgbm_train.py
            for m in models:
                print("Training {}...".format(m))
                curr_time = time.time()
                if m == "xgb":
                    X_train_xgb = xgb.DMatrix(X_train_resampled,
                                              Y_train_resampled,
                                              feature_names=feature_labels)
                    X_test_xgb = xgb.DMatrix(X_testCV,
                                             feature_names=feature_labels)
                    clf = xgb.train(xgb_params, X_train_xgb, 30)
                    print("Time taken for {}: {:.2f}".format(
                        m, time.time() - curr_time))
                    Y_probs = clf.predict(X_test_xgb)
                    result = evaluate(Y_testCV, Y_probs)
                    print(result)
                    feature_imp = clf.get_score(importance_type='gain')
                    pickle.dump(
                        feature_imp,
                        open(
                            os.path.join(
                                results_dir,
                                "{}_{}_feature_imp_fold_{}.p".format(
                                    m, sampling_method, k + 1)), "wb"))
                    pickle.dump(
                        result,
                        open(
                            os.path.join(
                                results_dir,
                                "{}_{}_fold_{}.p".format(
                                    m, sampling_method, k + 1)), "wb"))
                    save_examples(X_testCV,
                                  Y_testCV,
                                  Y_probs,
                                  m,
                                  sampling_method,
                                  k + 1,
                                  save_dir=results_dir)
                elif m == "lgbm":
                    lgb_train = lgb.Dataset(data=X_train_resampled,
                                            label=Y_train_resampled,
                                            feature_name=feature_labels)
                    clf = lgb.train(lgbm_params, lgb_train,
                                    num_boost_round=30)
                    print("Time taken for {}: {:.2f}".format(
                        m, time.time() - curr_time))
                    Y_probs = clf.predict(X_testCV)
                    result = evaluate(Y_testCV, Y_probs)
                    print(result)
                    feature_imp = clf.feature_importance(
                        importance_type='gain')
                    feature_imp = {
                        label: imp
                        for label, imp in zip(feature_labels, feature_imp)
                    }
                    pickle.dump(
                        feature_imp,
                        open(
                            os.path.join(
                                results_dir,
                                "{}_{}_feature_imp_fold_{}.p".format(
                                    m, sampling_method, k + 1)), "wb"))
                    pickle.dump(
                        result,
                        open(
                            os.path.join(
                                results_dir,
                                "{}_{}_fold_{}.p".format(
                                    m, sampling_method, k + 1)), "wb"))
                    save_examples(X_testCV,
                                  Y_testCV,
                                  Y_probs,
                                  m,
                                  sampling_method,
                                  k + 1,
                                  save_dir=results_dir)
                elif m == "ada":
                    clf = AdaBoostClassifier(n_estimators=30).fit(
                        X_train_resampled, Y_train_resampled)
                    print("Time taken for {}: {:.2f}".format(
                        m, time.time() - curr_time))
                    Y_probs = clf.predict_proba(X_testCV)
                    result = evaluate(Y_testCV, Y_probs)
                    print(result)
                    feature_imp = clf.feature_importances_
                    feature_imp = {
                        label: imp
                        for label, imp in zip(feature_labels, feature_imp)
                    }
                    pickle.dump(
                        feature_imp,
                        open(
                            os.path.join(
                                results_dir,
                                "{}_{}_feature_imp_fold_{}.p".format(
                                    m, sampling_method, k + 1)), "wb"))
                    pickle.dump(
                        result,
                        open(
                            os.path.join(
                                results_dir,
                                "{}_{}_fold_{}.p".format(
                                    m, sampling_method, k + 1)), "wb"))
                    save_examples(X_testCV,
                                  Y_testCV,
                                  Y_probs,
                                  m,
                                  sampling_method,
                                  k + 1,
                                  save_dir=results_dir)
                elif m == "forest":
                    clf = RandomForestClassifier(n_estimators=30,
                                                 n_jobs=2).fit(
                                                     X_train_resampled,
                                                     Y_train_resampled)
                    print("Time taken for {}: {:.2f}".format(
                        m, time.time() - curr_time))
                    Y_probs = clf.predict_proba(X_testCV)
                    result = evaluate(Y_testCV, Y_probs)
                    print(result)
                    feature_imp = clf.feature_importances_
                    feature_imp = {
                        label: imp
                        for label, imp in zip(feature_labels, feature_imp)
                    }
                    pickle.dump(
                        feature_imp,
                        open(
                            os.path.join(
                                results_dir,
                                "{}_{}_feature_imp_fold_{}.p".format(
                                    m, sampling_method, k + 1)), "wb"))
                    pickle.dump(
                        result,
                        open(
                            os.path.join(
                                results_dir,
                                "{}_{}_fold_{}.p".format(
                                    m, sampling_method, k + 1)), "wb"))
                    save_examples(X_testCV,
                                  Y_testCV,
                                  Y_probs,
                                  m,
                                  sampling_method,
                                  k + 1,
                                  save_dir=results_dir)
                elif m == "tree":
                    clf = DecisionTreeClassifier(min_samples_split=2,
                                                 min_samples_leaf=5).fit(
                                                     X_train_resampled,
                                                     Y_train_resampled)
                    print("Time taken for {}: {:.2f}".format(
                        m, time.time() - curr_time))
                    Y_probs = clf.predict_proba(X_testCV)
                    result = evaluate(Y_testCV, Y_probs)
                    print(result)
                    feature_imp = clf.feature_importances_
                    feature_imp = {
                        label: imp
                        for label, imp in zip(feature_labels, feature_imp)
                    }
                    pickle.dump(
                        feature_imp,
                        open(
                            os.path.join(
                                results_dir,
                                "{}_{}_feature_imp_fold_{}.p".format(
                                    m, sampling_method, k + 1)), "wb"))
                    pickle.dump(
                        result,
                        open(
                            os.path.join(
                                results_dir,
                                "{}_{}_fold_{}.p".format(
                                    m, sampling_method, k + 1)), "wb"))
                    save_examples(X_testCV,
                                  Y_testCV,
                                  Y_probs,
                                  m,
                                  sampling_method,
                                  k + 1,
                                  save_dir=results_dir)
                else:
                    # fallback: logistic regression (no importance pickle)
                    clf = LogisticRegression(penalty="l2").fit(
                        X_train_resampled, Y_train_resampled)
                    print("Time taken for {}: {:.2f}".format(
                        m, time.time() - curr_time))
                    Y_probs = clf.predict_proba(X_testCV)
                    result = evaluate(Y_testCV, Y_probs)
                    print(result)
                    pickle.dump(
                        result,
                        open(
                            os.path.join(
                                results_dir,
                                "{}_{}_fold_{}.p".format(
                                    m, sampling_method, k + 1)), "wb"))
                    save_examples(X_testCV,
                                  Y_testCV,
                                  Y_probs,
                                  m,
                                  sampling_method,
                                  k + 1,
                                  save_dir=results_dir)
    print("Training took {:.2f}s.".format(time.time() - start_time))
    print("Finished.")
# GPU setup: use CUDA and expose four devices to this process
device = torch.device('cuda')
os.environ['CUDA_VISIBLE_DEVICES'] = "0,1,2,3"

# hyperparameters
FILE_PATH = "training file.txt"
MAX_SEQUENCE_LENGTH = 75
TRAIN_SIZE = 6500
SEED = 666
EPOCHS = 5
LR = 2e-5
BATCH_SIZE = 32
ACCUMULATION_STEPS = 2  # how many steps to accumulate gradients before an optimizer step
OUTPUT_FILE_NAME = "bert_pytorch.bin"

# convert the original data into a formatted pandas dataframe
train_df = get_train(FILE_PATH)
train_df['text'] = train_df['text'].apply(clean_str)

# convert each tag to an integer id via a Keras Tokenizer
# (each tag maps to a single token, hence taking element [0])
tags = train_df['tag'].to_list()
tokenizer_tag = Tokenizer()
tokenizer_tag.fit_on_texts(tags)
tags = tokenizer_tag.texts_to_sequences(tags)
tags = np.array(list((map(lambda x: x[0], tags))))

# convert text to BERT-format token id sequences
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
sequences = convert_lines(train_df["text"].fillna("DUMMY_VALUE"),
                          MAX_SEQUENCE_LENGTH, tokenizer)
# shuffle the data
# -*- coding: utf-8 -*- """ Created on Sun Feb 14 16:59:19 2021 @author: clara """ import utils label = 0 # parameters of the classifier SVMparams = [0.006, 0.005] methods = ['KS_7'] Xtr, ytr = utils.get_train(label) utils.grid_search(label, Xtr, ytr, SVMparams, methods, train_size=0.75, graph=False)