def extract_features(train_dir, test_dir=""): print_message("Extracting Features") train_data, train_labels, test_data, test_labels = get_data( test_dir, train_dir) glbs.LABELS = train_labels + test_labels glbs.TRAIN_DATA = train_data feature_lst = [] # add all the N-Grams feature to the list for feature in glbs.FEATURES: if is_ngrams(feature): vectorizer = get_vectorizer(feature) feature_lst = add_feature(feature_lst, feature, vectorizer) # add all the stylistic features to the list for feature in glbs.STYLISTIC_FEATURES: vectorizers = get_stylistic_features_vectorizer(feature) for i in range(len(vectorizers)): feature_lst = add_feature(feature_lst, feature + str(i), vectorizers[i]) # convert the list to one vectoriazer using FeatureUnion all_features = FeatureUnion(feature_lst) train_features = all_features.fit_transform(train_data) test_features = all_features.transform(test_data) return train_features, train_labels, test_features, test_labels, all_features
def classify(train, tr_labels, test, ts_labels, all_features, num_iteration=1, model_number=0):
    results = {}
    result = []
    le = LabelEncoder()
    le.fit(tr_labels)
    ts_labels = le.transform(ts_labels)
    tr_labels = le.transform(tr_labels)
    print_message("Classifying")
    # if os.path.exists(temp_file_path):
    #     results = load_backup_file(temp_file_path)
    for classifier in glbs.METHODS:
        print_message("running " + str(classifier), num_tabs=1)
        if classifier in results.keys():
            continue
        if classifier == "rnn":
            clf = get_rnn_model(train)
            clf.fit(train, tr_labels, epochs=3, batch_size=64)
            # scores = clf.evaluate(test, ts_labels, verbose=0)
            # acc_score += scores[1]
        else:
            clf = methods[classifier]
            clf.fit(train, tr_labels)
        prediction = clf.predict(test)
        decision = []
        try:
            # prefer the decision function when the classifier exposes one
            decision = clf.decision_function(test)
        except AttributeError:
            # otherwise fall back to the predicted probability of the positive class
            decision = clf.predict_proba(test)
            decision = decision[:, 1]
        result = get_results(ts_labels, prediction, decision)
        model_path = os.path.join(glbs.RESULTS_PATH, "Model " + classifier + str(model_number))
        save_model(clf, model_path)
        del clf
        results[classifier] = result
        # save_backup_file(results, temp_file_path)
    # print(results)
    return results
def extract_features(dataset_dir):
    print_message("Extracting Features")
    X, y = get_data(dataset_dir)
    glbs.LABELS = y
    glbs.DATASET_DATA = X
    ########################################
    # X, y = zip(*list(zip(X, y))[:160])
    # from help_functions import get_fetuer_by_DF
    # get_fetuer_by_DF(X)
    ########################################
    feature_lst = []
    # add all the n-gram features to the list
    for feature in glbs.FEATURES:
        if is_ngrams(feature):
            vectorizer = get_vectorizer(feature)
            feature_lst = add_feature(feature_lst, feature, vectorizer)
    # add all the stylistic features to the list
    for feature in glbs.STYLISTIC_FEATURES:
        vectorizers = get_stylistic_features_vectorizer(feature)
        for i in range(len(vectorizers)):
            feature_lst = add_feature(feature_lst, feature + str(i), vectorizers[i])
    # combine the list into a single vectorizer using FeatureUnion
    if glbs.MULTIPROCESSING:
        n_jobs = -1
    else:
        n_jobs = None
    all_features = FeatureUnion(feature_lst, n_jobs=n_jobs)
    glbs.FEATURE_MODEL.append(all_features)
    all_features.fit(X, y)
    glbs.NUM_OF_FEATURE = len(all_features.get_feature_names())
    if glbs.SELECTION:
        from feature_selction import get_selected_features
        get_selected_features(X, y, all_features)
    return X, y
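# Hedged usage sketch (not part of the project code): plain scikit-learn
# TfidfVectorizer objects stand in for whatever get_vectorizer and
# get_stylistic_features_vectorizer return, to show how the named
# (name, vectorizer) pairs are combined by FeatureUnion in extract_features.
# Setting n_jobs=-1 would fit the blocks in parallel, as the MULTIPROCESSING
# flag does above.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion

docs = ["a short example document", "another short example"]
feature_lst = [
    ("word_ngrams", TfidfVectorizer(analyzer="word", ngram_range=(1, 2))),
    ("char_ngrams", TfidfVectorizer(analyzer="char", ngram_range=(2, 3))),
]
all_features = FeatureUnion(feature_lst, n_jobs=None)
matrix = all_features.fit_transform(docs)
print(matrix.shape)  # (2, total number of combined features)
# feature names are prefixed with the block name, e.g. "word_ngrams__example"
# (get_feature_names() on older scikit-learn, get_feature_names_out() on >= 1.0)
print(all_features.get_feature_names_out()[:3])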
def write_results(results):
    glbs = GlobalParameters()
    print_message("Writing results...")
    pickle_path = glbs.RESULTS_PATH + "\\Pickle files"
    if path.exists(pickle_path):
        shutil.rmtree(pickle_path, ignore_errors=True)
    os.makedirs(pickle_path)
    xlsx_path = glbs.RESULTS_PATH + "\\Xlsx files"
    if path.exists(xlsx_path):
        shutil.rmtree(xlsx_path, ignore_errors=True)
        time.sleep(0.5)
    os.makedirs(xlsx_path)
    for key in results.keys():
        with open(pickle_path + "\\" + key + ".pickle", "wb+") as file:
            pickle.dump(results[key], file)
        new_write_file_content(pickle_path + "\\" + key + ".pickle", key, xlsx_path)
def write_results(results):
    glbs = GlobalParameters()
    print_message("Writing results...")
    # add_to_csv(results, glbs.RESULTS_PATH)
    pickle_path = os.path.join(glbs.RESULTS_PATH, "Pickle files")
    if path.exists(pickle_path):
        shutil.rmtree(pickle_path, ignore_errors=True)
    os.makedirs(pickle_path)
    xlsx_path = os.path.join(glbs.RESULTS_PATH, "Xlsx files")
    if path.exists(xlsx_path):
        shutil.rmtree(xlsx_path, ignore_errors=True)
        time.sleep(0.5)
    os.makedirs(xlsx_path)
    for key in results.keys():
        with open(os.path.join(pickle_path, key) + ".pickle", "wb+") as file:
            pickle.dump(results[key], file)
        new_write_file_content(os.path.join(pickle_path, key) + ".pickle", key, xlsx_path)
def main(cfg): try: # nltk.download("vader_lexicon") # nltk.download('wordnet') glbs = GlobalParameters() configs = get_cfg_files(cfg) total_files = len(configs) results = {} for i, config in enumerate(configs): print_message("Running config {}/{}".format(i + 1, total_files)) set_global_parameters(config) print_run_details() dataset_dir = normalize() X, y = extract_features(dataset_dir) config_result = classify(X, y, glbs.K_FOLDS, glbs.ITERATIONS) glbs.RESULTS[glbs.FILE_NAME] = config_result glbs.RESULTS = add_results(glbs.RESULTS, glbs) if glbs.EXPORT_AS_BASELINE: export_as_baseline(config_result, config[1]) if glbs.WORDCLOUD: print_message("Generating word clouds (long processes)") generate_word_clouds() add_results_glbs(results, glbs) write_results(divide_results(glbs.RESULTS)) send_work_done(glbs.DATASET_DIR) print_message("Done!") except Exception as e: traceback.print_exc() send_work_done(glbs.DATASET_DIR, "", error=str(e), traceback=str(traceback.format_exc()))
def normalize(test=False):
    init_tools()
    n = glbs.NORMALIZATION
    if not test:
        i = glbs.TRAIN_DIR
        print_message("Normalizing")
    else:
        i = glbs.TEST_DIR
        print_message("Normalizing test dataset")
    if not is_nargs_empty(n):
        # the normalized folder
        parent_dir = i + "@" + n
        # create the dir if it does not exist
        if not os.path.exists(parent_dir):
            os.mkdir(parent_dir)
            for category in os.listdir(i):
                with open(os.path.join(i, category), 'r', encoding='utf8', errors='ignore') as read:
                    n_lines = []
                    for line in read:
                        line = line.rstrip('\n')
                        n_lines.append(normal(line, n))
                    n_file = '\n'.join(n_lines)
                    del n_lines
                # write the normalized file
                with open(os.path.join(parent_dir, category), 'w+', encoding='utf8') as write:
                    write.write(n_file)
                del n_file
        else:
            # the normalized folder already exists
            print_message("found normalized dataset")
            # check that both dirs have the same number of files
            if not len(os.listdir(parent_dir)) == len(os.listdir(i)):
                # delete the normalized folder and create a new one
                print_message("Corrupted normalization found, deleting and starting over...")
                shutil.rmtree(parent_dir, ignore_errors=True)
                normalize(test)  # re-run the normalization for the same split
        glbsNORM_PATH = parent_dir
    else:
        glbsNORM_PATH = i
    return glbsNORM_PATH
def selectionHalfMethod(X, y, all_features):
    glbs = GlobalParameters()
    filename = glbs.FILE_NAME
    results = {}
    nxt = (glbs.SELECTION[0][0], int(glbs.SELECTION[0][1]))
    max_last_result = 0
    bottom = (0, 0)
    top = nxt
    while top != bottom:
        max_nxt_result = 0
        print_message(nxt[0])
        print_message(nxt[1])
        glbs.FILE_NAME = glbs.FILE_NAME + str(nxt[1])
        select = select_k_best(nxt[0], int(nxt[1]))
        glbs.FEATURE_MODEL[1] = select
        results[glbs.FILE_NAME] = classify(X, y, glbs.K_FOLDS, glbs.ITERATIONS)
        for method in results[glbs.FILE_NAME].items():
            if mean(method[1]["accuracy"]) > max_nxt_result:
                max_nxt_result = mean(method[1]["accuracy"])
        results = add_results(results, glbs, nxt)
        if max_nxt_result >= max_last_result:
            top = nxt
            if bottom[1] == 0:
                nxt = (nxt[0], int(int(nxt[1]) / 2))
            if bottom[1] != 0:
                nxt = (nxt[0], int((int(nxt[1]) + bottom[1]) / 2))
            max_last_result = max_nxt_result
        elif max_nxt_result < max_last_result:
            bottom = nxt
            nxt = (nxt[0], int((top[1] + bottom[1]) / 2))
        glbs.SELECTION[0] = nxt
        if bottom[1] - top[1] == -1 and bottom == nxt:
            break
    glbs.FILE_NAME = filename
    add_results_glbs(results, glbs)
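# Hedged sketch (an assumption, not the project's code) of what the
# select_k_best helper used above might wrap: a scikit-learn SelectKBest filter
# built from a score-function name and a k value, matching the (nxt[0], nxt[1])
# pairs that selectionHalfMethod binary-searches over. The names score_funcs
# and build_select_k_best are illustrative only.
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif

score_funcs = {"chi2": chi2, "f_classif": f_classif, "mutual_info": mutual_info_classif}

def build_select_k_best(score_func_name, k):
    # return a transformer that keeps the k highest-scoring features
    return SelectKBest(score_funcs[score_func_name], k=k)

select = build_select_k_best("chi2", 500)  # e.g. glbs.FEATURE_MODEL[1] = select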
def main(cfg):
    try:
        glbs = GlobalParameters()
        configs = get_cfg_files(cfg)
        results = {}
        n_test_dir = ""
        total_files = len(configs)
        for i, config in enumerate(configs):
            print_message("Running config {}/{}".format(i + 1, total_files))
            set_global_parameters(config)
            print_run_details()
            n_train_dir = normalize()
            if glbs.TEST_DIR != "":
                n_test_dir = normalize(test=True)
            train, tr_labels, test, ts_labels, all_features = extract_features(n_train_dir, n_test_dir)
            for selection in glbs.SELECTION:
                try:
                    train, test = get_selected_features(
                        selection, train, tr_labels, test, ts_labels, all_features)
                except Exception:
                    # keep the unselected features if this selection setting fails
                    pass
            results[glbs.FILE_NAME] = classify(train, tr_labels, test, ts_labels,
                                               all_features, model_number=i)
            results = add_results(results)
            if glbs.WORDCLOUD:
                print_message("Generating word clouds (long processes)")
                generate_word_clouds()
        write_results(divide_results(results))
        send_work_done(glbs.TRAIN_DIR)
        print_message("Done!")
        # clean_backup_files()
    except Exception as e:
        traceback.print_exc()
        send_work_done(glbs.TRAIN_DIR, "", error=str(e), traceback=str(traceback.format_exc()))
def classify(X, y, k_fold, num_iteration=1):
    results = {}
    le = LabelEncoder()
    le.fit(y)
    y = le.transform(y)
    print_message("Classifying")
    """def cross_validation(X, classifier, clf, i, k_fold, num_iteration, y):
        print_message("iteration " + str(i + 1) + "/" + str(num_iteration), 2)
        scores = cross_validate(clf, X, y, cv=k_fold, scoring=glbs.MEASURE)
        for measure in glbs.MEASURE:
            if measure in results[classifier].keys():
                results[classifier][measure] += list(scores['test_' + measure])
            else:
                results[classifier][measure] = list(scores['test_' + measure])"""
    for classifier in glbs.METHODS:
        print_message("running " + str(classifier), num_tabs=1)
        if classifier not in results.keys():
            results[classifier] = {}
        if classifier == "rnn":
            # clf = get_rnn_model(X)
            continue
        else:
            clf = methods[classifier]
            lst = []
            if glbs.SELECTION:
                lst.append(("feture", glbs.FEATURE_MODEL[0]))
                lst.append(("select", glbs.FEATURE_MODEL[1]))
                lst.append(("classifier", clf))
                clf = Pipeline(lst)
            else:
                clf = make_pipeline(glbs.FEATURE_MODEL[0], clf)
        if glbs.MULTIPROCESSING:
            n_jobs = -1
        else:
            n_jobs = None
        ####################################################################
        # Used for parameter tuning
        from sklearn.model_selection import RandomizedSearchCV
        # Number of trees in the random forest
        n_estimators = [int(x) for x in np.linspace(start=1000, stop=2000, num=10)]
        # Number of features to consider at every split
        max_features = ['auto', 'sqrt']
        # Maximum number of levels in a tree
        max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
        max_depth.append(None)
        # Minimum number of samples required to split a node
        min_samples_split = [2, 5, 10, 15]
        # Minimum number of samples required at each leaf node
        min_samples_leaf = [1, 2, 4, 6]
        # Method of selecting samples for training each tree
        bootstrap = [True, False]
        # Create the random grid
        random_grid = {'classifier__n_estimators': n_estimators,
                       'classifier__max_features': max_features,
                       'classifier__max_depth': max_depth,
                       'classifier__min_samples_split': min_samples_split,
                       'classifier__min_samples_leaf': min_samples_leaf,
                       'classifier__bootstrap': bootstrap}
        print(random_grid)
        # Use the random grid to search for the best hyperparameters:
        # random search over 100 combinations with k-fold cross validation,
        # using all available cores
        rf_random = RandomizedSearchCV(estimator=clf, param_distributions=random_grid,
                                       n_iter=100, cv=k_fold, verbose=2,
                                       random_state=42, n_jobs=-1)
        print(clf.get_params().keys())
        # Fit the random search model
        rf_random.fit(X, y)
        print(rf_random.best_params_)
        clf = rf_random.best_estimator_
        with open("best_estimator.json", "w") as file:
            json.dump(rf_random.best_params_, file, indent=6)
        """best_random = rf_random.best_estimator_
        scores = cross_validate(best_random, X, y, cv=k_fold, scoring=glbs.MEASURE, n_jobs=n_jobs)
        random_accuracy = numpy.mean(list(scores["test_accuracy"]))
        base_model = clf
        scores = cross_validate(base_model, X, y, cv=k_fold, scoring=glbs.MEASURE, n_jobs=n_jobs)
        base_accuracy = numpy.mean(list(scores["test_accuracy"]))
        print('Base accuracy: {:0.2f}%.'.format(100 * base_accuracy))
        print('Random accuracy: {:0.2f}%.'.format(100 * random_accuracy))
        print('Improvement of {:0.2f}%.'.format(100 * (random_accuracy - base_accuracy) / base_accuracy))
        stop = "Put breakpoint here"
        """
        ####################################################################
        for i in range(num_iteration):
            # Shuffle the inner order of the posts within each fold
            Xy = list(zip(X, y))
            splited = []
            len_l = len(Xy)
            for j in range(k_fold):
                start = int(j * len_l / k_fold)
                end = int((j + 1) * len_l / k_fold)
                splited.append(Xy[start:end])
            Xy = []
            for fold in splited:
                random.Random(num_iteration).shuffle(fold)
                Xy += fold
            X, y = zip(*Xy)
            print_message("iteration " + str(i + 1) + "/" + str(num_iteration), 2)
            scores = cross_validate(clf, X, y, cv=k_fold, scoring=glbs.MEASURE, n_jobs=n_jobs)
            for measure in glbs.MEASURE:
                if measure in results[classifier].keys():
                    results[classifier][measure] += list(scores["test_" + measure])
                else:
                    results[classifier][measure] = list(scores["test_" + measure])
        del clf
    return results
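# Hedged sketch (illustrative, not the project's code) of the naming rule the
# random_grid above relies on: Pipeline hyper-parameters are addressed as
# "<step name>__<parameter>", so the "classifier" step name is what makes keys
# like "classifier__n_estimators" valid for RandomizedSearchCV. Plain sklearn
# objects stand in here for glbs.FEATURE_MODEL and the project's classifier.
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline

pipe = Pipeline([("feture", TfidfVectorizer()),  # step name as spelled in the source
                 ("classifier", RandomForestClassifier())])
search = RandomizedSearchCV(pipe,
                            param_distributions={"classifier__n_estimators": [100, 200, 500]},
                            n_iter=3, cv=3, random_state=42)
print("classifier__n_estimators" in pipe.get_params())  # True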
def clean_backup_files():
    glbs = GlobalParameters()
    print_message("removing temp files...")
    folder_path = os.sep.join(glbs.RESULTS_PATH.split(os.sep)[:-1]) + os.sep + "temp_backups"
    shutil.rmtree(folder_path, ignore_errors=True)
def classify(X, y, k_fold, num_iteration=1):
    results = {}
    le = LabelEncoder()
    le.fit(y)
    y = le.transform(y)
    print_message("Classifying")
    """def cross_validation(X, classifier, clf, i, k_fold, num_iteration, y):
        print_message("iteration " + str(i + 1) + "/" + str(num_iteration), 2)
        scores = cross_validate(clf, X, y, cv=k_fold, scoring=glbs.MEASURE)
        for measure in glbs.MEASURE:
            if measure in results[classifier].keys():
                results[classifier][measure] += list(scores['test_' + measure])
            else:
                results[classifier][measure] = list(scores['test_' + measure])"""
    for classifier in glbs.METHODS:
        print_message("running " + str(classifier), num_tabs=1)
        if classifier not in results.keys():
            results[classifier] = {}
        if classifier == "rnn":
            # clf = get_rnn_model(X)
            continue
        else:
            clf = methods[classifier]
            lst = []
            if glbs.SELECTION:
                lst.append(("feture", glbs.FEATURE_MODEL[0]))
                lst.append(("select", glbs.FEATURE_MODEL[1]))
                lst.append(("classifier", clf))
                clf = Pipeline(lst)
                A = clf.fit(X, y)
                print(A)
            else:
                clf = make_pipeline(glbs.FEATURE_MODEL[0], clf)
        if glbs.MULTIPROCESSING:
            n_jobs = -1
        else:
            n_jobs = None
        for i in range(num_iteration):
            # Shuffle the inner order of the posts within each fold
            Xy = list(zip(X, y))
            splited = []
            len_l = len(Xy)
            for j in range(k_fold):
                start = int(j * len_l / k_fold)
                end = int((j + 1) * len_l / k_fold)
                splited.append(Xy[start:end])
            Xy = []
            for fold in splited:
                random.Random(num_iteration).shuffle(fold)
                Xy += fold
            X, y = zip(*Xy)
            print_message("iteration " + str(i + 1) + "/" + str(num_iteration), 2)
            scores = cross_validate(clf, X, y, cv=k_fold, scoring=glbs.MEASURE, n_jobs=n_jobs)
            for measure in glbs.MEASURE:
                if measure in results[classifier].keys():
                    results[classifier][measure] += list(scores["test_" + measure])
                else:
                    results[classifier][measure] = list(scores["test_" + measure])
        del clf
    return results
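# Hedged sketch (standalone, not the project's code) of how the cross_validate
# scores above are keyed: passing a list of metric names as `scoring` yields a
# dict with one "test_<metric>" array per metric, which is why the loops read
# scores["test_" + measure]. A toy dataset and LogisticRegression stand in for
# the project's pipeline and data.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate

X_demo, y_demo = make_classification(n_samples=60, random_state=0)
scores = cross_validate(LogisticRegression(max_iter=1000), X_demo, y_demo,
                        cv=3, scoring=["accuracy", "f1_macro"])
print(sorted(scores.keys()))  # ['fit_time', 'score_time', 'test_accuracy', 'test_f1_macro']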