def main(cfg):
    try:
        # nltk.download("vader_lexicon")
        # nltk.download('wordnet')
        glbs = GlobalParameters()
        configs = get_cfg_files(cfg)
        total_files = len(configs)
        results = {}
        for i, config in enumerate(configs):
            print_message("Running config {}/{}".format(i + 1, total_files))
            set_global_parameters(config)
            print_run_details()
            dataset_dir = normalize()
            X, y = extract_features(dataset_dir)
            config_result = classify(X, y, glbs.K_FOLDS, glbs.ITERATIONS)
            glbs.RESULTS[glbs.FILE_NAME] = config_result
            glbs.RESULTS = add_results(glbs.RESULTS, glbs)
            if glbs.EXPORT_AS_BASELINE:
                export_as_baseline(config_result, config[1])
            if glbs.WORDCLOUD:
                print_message("Generating word clouds (long processes)")
                generate_word_clouds()
        add_results_glbs(results, glbs)
        write_results(divide_results(glbs.RESULTS))
        send_work_done(glbs.DATASET_DIR)
        print_message("Done!")
    except Exception as e:
        traceback.print_exc()
        send_work_done(glbs.DATASET_DIR, "", error=str(e),
                       traceback=str(traceback.format_exc()))
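# Hedged usage sketch (an assumption, not shown in the source): `cfg` appears
# to be whatever get_cfg_files consumes, so a minimal command-line entry
# point might look like this.
if __name__ == "__main__":
    import sys

    main(sys.argv[1])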
def write_info_gain(features, name):
    glbs = GlobalParameters()
    file_path = os.path.join(glbs.RESULTS_PATH,
                             name + " for hebrew dataset" + ".xlsx")
    # Create a new Excel file and add a worksheet.
    workbook = xlsxwriter.Workbook(file_path)
    worksheet = workbook.add_worksheet()
    row = 0
    for i, data in enumerate(features):
        worksheet.write("A" + str(i + 3), data[0].split("_")[0])
        worksheet.write("B" + str(i + 3), data[0].split("_")[-1])
        worksheet.write("C" + str(i + 3), "{:.2f}".format(data[1]))
        try:
            worksheet.write("D" + str(i + 3), "{:.2f}".format(data[2]))
            if len(glbs.IDF) > 0:
                worksheet.write("E" + str(i + 3), glbs.IDF[i])
        except IndexError:
            # No p-value for this feature; the tf-idf value moves to column D.
            if len(glbs.IDF) > 0:
                worksheet.write("D" + str(i + 3), glbs.IDF[i])
        row = i
    # Five columns are defined below, so the table must span A..E, not A..D.
    worksheet.add_table(
        "A2:E" + str(row + 3),
        {
            "columns": [
                {"header": "selection type"},
                {"header": "feature name"},
                {"header": name},
                {"header": "p-value"},
                {"header": "tfidf"},
            ],
            "style": "Table Style Light 8",
        },
    )
    workbook.close()
def adc_energy_graphs():
    gp = GlobalParameters()
    rram = RRAM(gp)
    rram.adc.energy_calc(plot=True)
    rram.adc.energy_calc()
    print("Conversion Energy: ", rram.adc.energy)
    print("ADC resolution: ", rram.adc.N)
def corpus_name():
    """
    Return the name of the corpus
    (for example: "Corpus of 1000 female & 600 male in English").
    :return: string, the name of the corpus
    """
    glbs = GlobalParameters()
    string = "Corpus of "
    labels = glbs.LABELS
    dic = {}
    for label in labels:
        # Usually the labels contain the file format at the end (file.txt)
        label = label.split('.')[0]
        if label in dic:
            dic[label] += 1
        else:
            dic[label] = 1
    for label, number in dic.items():
        string += str(number) + " " + label + ", "
    language = text_language(glbs.TRAIN_DATA[0])
    string = string[:-2] + " in " + language[0].upper() + language[1:]
    # Replace the last ',' with '&'
    for i in range(1, len(string)):
        if string[-i] == ',':
            string = string[:-i] + ' &' + string[-i + 1:]
            break
    return string
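# Worked example (hypothetical labels, for illustration only): with
# glbs.LABELS == ["female.txt"] * 1000 + ["male.txt"] * 600 and English text,
# the loop builds "Corpus of 1000 female, 600 male, ", the slice drops the
# trailing ", ", the language suffix appends " in English", and the final
# scan swaps the last ',' for ' &', yielding
# "Corpus of 1000 female & 600 male in English".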
def energy_vs_adc_res():
    # Default global params
    gp = GlobalParameters()
    # Define constant global params
    gp.adc.comp_var = 0.000
    gp.rram.size_x = 64
    gp.rram.size_y = 64
    gp.rram.r_var = np.log10(1) / 6
    # Define parametric global params
    set1 = [1, 2, 4, 8]
    set2 = [1, 3, 7, 15]
    # set2 = [1, 3]
    settings = [[s1, s2] for s1 in set1 for s2 in set2]
    gp_list = []
    for s in settings:
        gp.rram.n_bit = s[0]
        gp.mvm.active_rows = s[1]
        gp_list.append(copy.deepcopy(gp))
    [distance, energy, bools] = sweep_gp_params(gp_list)
    print(np.array(energy))
    # Convert J to pJ and normalize per op: a 128x128 MVM performs
    # 128*128 multiplies and 127*128 additions.
    energy = np.array(energy) * 1e12 / (128 * 128 + 127 * 128)
    f = open("outputs/energy_vs_adc_res", 'w')
    for e in energy:
        f.write(str(e) + '\n')
    f.close()
def write_info_gain(features, name):
    file_path = os.path.join(GlobalParameters().RESULTS_PATH,
                             name + " for hebrew dataset" + '.xlsx')
    # Create a new Excel file and add a worksheet.
    workbook = xlsxwriter.Workbook(file_path)
    worksheet = workbook.add_worksheet()
    row = 0
    for i, data in enumerate(features):
        print(data)
        worksheet.write('A' + str(i + 3), data[0].split('_')[0])
        worksheet.write('B' + str(i + 3), data[0].split('_')[-1])
        worksheet.write('C' + str(i + 3), str(data[1]))
        try:
            worksheet.write('D' + str(i + 3), str(data[2]))
        except IndexError:
            # Some features come without a p-value; leave column D empty.
            pass
        row = i
    worksheet.add_table(
        "A2:D" + str(row + 3),
        {
            'columns': [
                {'header': 'selection type'},
                {'header': 'feature name'},
                {'header': name},
                {'header': 'p-value'},
            ],
            'style': 'Table Style Light 8',
        })
    workbook.close()
def print_run_details():
    glbs = GlobalParameters()
    print("""
    ---------------------------------------
    Training path: {}
    Testing Path: {}
    Features: {}
    Stylistic Features: {}
    Normalization: {}
    Methods: {}
    Measure: {}
    Output Path: {}
    Results Path: {}
    ---------------------------------------
    """.format(
        glbs.TRAIN_DIR,
        glbs.TEST_DIR,
        glbs.FEATURES,
        glbs.STYLISTIC_FEATURES,
        glbs.NORMALIZATION,
        glbs.METHODS,
        glbs.MEASURE,
        glbs.OUTPUT_DIR,
        glbs.RESULTS_PATH,
    ))
def add_results(old_results):
    glbs = GlobalParameters()
    temp = {}
    temp["results"] = old_results[glbs.FILE_NAME]
    temp["features"] = glbs.FEATURES
    temp["normalization"] = glbs.NORMALIZATION
    temp["stylistic_features"] = glbs.STYLISTIC_FEATURES
    old_results[glbs.FILE_NAME] = temp
    return old_results
def generate_word_clouds(max_words=200):
    glbs = GlobalParameters()
    training_path = glbs.TRAIN_DIR
    testing_path = glbs.TEST_DIR
    result_path = os.path.join(glbs.RESULTS_PATH, "Words Clouds")
    if path.exists(result_path):
        shutil.rmtree(result_path, ignore_errors=True)
    os.makedirs(result_path)
    training = {}
    for file in os.listdir(training_path):
        if file.endswith('.txt'):
            training[file.replace('.txt', '')] = open(
                os.path.join(training_path, file), "r",
                encoding="utf8", errors='replace').readlines()
    testing = {}
    for file in os.listdir(testing_path):
        if file.endswith('.txt'):
            testing[file.replace('.txt', '')] = open(
                os.path.join(testing_path, file), "r",
                encoding="utf8", errors='replace').readlines()
    # Hebrew is right-to-left, so reverse each post before rendering.
    if text_language(testing[list(testing.keys())[0]][0]) == 'hebrew':
        for key, value in training.items():
            for post in range(len(value)):
                training[key][post] = training[key][post][::-1]
        for key, value in testing.items():
            for post in range(len(value)):
                testing[key][post] = testing[key][post][::-1]
        stop_words = hebrew_stopwords
    else:
        stop_words = stopwords
    for name, text in training.items():
        title = "training " + name + " unigrams"
        freq = dict(get_top_n_words(text, 1, 1))
        generate_and_save(freq, max_words, result_path, stop_words, title)
        title = "training " + name + " bigrams"
        freq = dict(get_top_n_words(text, 2, 2))
        generate_and_save(freq, max_words, result_path, stop_words, title)
    for name, text in testing.items():
        title = "testing " + name + " unigrams"
        freq = dict(get_top_n_words(text, 1, 1))
        generate_and_save(freq, max_words, result_path, stop_words, title)
        title = "testing " + name + " bigrams"
        freq = dict(get_top_n_words(text, 2, 2))
        generate_and_save(freq, max_words, result_path, stop_words, title)
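# get_top_n_words is defined elsewhere in this project. For reference, a
# hypothetical minimal equivalent (an assumption, not the project's actual
# implementation) built on scikit-learn's CountVectorizer:
def get_top_n_words_sketch(corpus, min_n, max_n):
    from sklearn.feature_extraction.text import CountVectorizer

    vec = CountVectorizer(ngram_range=(min_n, max_n)).fit(corpus)
    bag = vec.transform(corpus)
    counts = bag.sum(axis=0).A1  # total count of each n-gram over the corpus
    freqs = [(word, counts[idx]) for word, idx in vec.vocabulary_.items()]
    return sorted(freqs, key=lambda x: x[1], reverse=True)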
def rvar_arows_shmoo():
    # Default global params
    gp = GlobalParameters()
    gp.adc.comp_var = 0.000
    gp.rram.size_x = 256
    gp.rram.size_y = 256
    r_var = np.log10([1, 1.01, 1.05, 1.1, 1.2, 1.5, 2, 3]) / 6
    active_rows = [1, 3, 7, 15, 31, 63, 127, 255]
    # r_var = np.log10([1, 1.01, 2]) / 6
    # active_rows = [1, 15, 63]
    settings = [[r, a] for r in r_var for a in active_rows]
    print('R-Var | AR')
    print('---------------')
    shmoo_grid = []
    for s in settings:
        gp.rram.rvar = s[0]  # note: `rvar` here vs. `r_var` elsewhere in this file
        gp.mvm.active_rows = s[1]
        print('{} | {} :'.format(10**(6 * s[0]), s[1]), end='')
        M = 128
        N = 32
        res = 8
        mvm = MVM(gp)
        vec = np.random.random([1, M]) * 2 - 1
        mat = np.random.random([M, N]) * 2 - 1
        start = time.time()
        result = mvm.dot(vec, mat, res)
        print('{:.2f}:'.format(mvm.e_read * 1e12), end=' ')
        result_t = mvm.dot_truth(vec, mat, res)
        if False in (result == result_t):
            shmoo_grid.append(0)
            print("Fail")
        else:
            shmoo_grid.append(1)
            print("Pass")
    shmoo_grid = np.array(shmoo_grid).reshape(len(r_var), len(active_rows))
    print(shmoo_grid)
    f = open("outputs/schmoo", 'w')
    f.write(str(shmoo_grid))
    f.close()
def write_sfm(score):
    file_path = os.path.join(GlobalParameters().RESULTS_PATH,
                             "sfm" + " for hebrew dataset" + ".xlsx")
    # Create a new Excel file and add a worksheet.
    workbook = xlsxwriter.Workbook(file_path)
    worksheet = workbook.add_worksheet()
    row = 1
    for key, value in score.items():
        worksheet.write("A" + str(row), key)
        worksheet.write("B" + str(row), value)
        row += 1
    workbook.close()
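# Hypothetical usage (illustrative values only): write_sfm expects a
# {feature_name: score} mapping and writes one row per feature, e.g.
#     write_sfm({"word_count": 0.42, "avg_sentence_len": 0.13})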
def distance_vs_active_rows():
    # Default global params
    gp = GlobalParameters()
    # Define constant global params
    gp.adc.comp_var = 0.005
    gp.rram.size_x = 128
    gp.rram.size_y = 128
    # gp.rram.size_x = 8
    # gp.rram.size_y = 8
    # Define parametric global params
    set1 = np.log10([1, 1.05, 1.1, 1.5, 2]) / 6
    set2 = [1, 3, 7, 15, 31, 63, 127]
    # set1 = np.log10([1, 2]) / 6
    # set2 = [1, 3, 7]
    settings = [[s1, s2] for s1 in set1 for s2 in set2]
    gp_list = []
    for s in settings:
        gp.rram.rvar = s[0]
        gp.mvm.active_rows = s[1]
        gp_list.append(copy.deepcopy(gp))
    [distance, energy, bools] = sweep_gp_params(gp_list)
    distance = np.array(distance).reshape(len(set1), len(set2))
    # print(distance)
    f = open("outputs/distance_vs_arows", 'w')
    for d in distance:
        f.write(str(d) + '\n')
    f.close()
    fig, ax = plt.subplots()
    # ax.set_xlabel('Active Rows (K)')
    # K active rows need an ADC resolving K + 1 levels, i.e. log2(K + 1) bits.
    ax.set_xlabel('ADC Resolution (N)')
    ax.set_ylabel('Accuracy (Distance from truth)')
    ax.set_title('MAC accuracy')
    for i in range(distance.shape[0]):
        plt.plot(np.log2(np.array(set2) + 1), distance[i])
    r_var = (10**(np.array(set1) * 6) - 1) * 100
    r_var_str = ['{:.0f}%'.format(r) for r in r_var]
    ax.legend(r_var_str, title='Rcell Variation')
    plt.show()
def main(cfg):
    try:
        glbs = GlobalParameters()
        configs = get_cfg_files(cfg)
        results = {}
        n_test_dir = ""
        total_files = len(configs)
        for i, config in enumerate(configs):
            print_message("Running config {}/{}".format(i + 1, total_files))
            set_global_parameters(config)
            print_run_details()
            n_train_dir = normalize()
            if glbs.TEST_DIR != "":
                n_test_dir = normalize(test=True)
            train, tr_labels, test, ts_labels, all_features = extract_features(
                n_train_dir, n_test_dir)
            for selection in glbs.SELECTION:
                try:
                    train, test = get_selected_features(
                        selection, train, tr_labels, test, ts_labels,
                        all_features)
                except Exception:
                    # Feature selection is optional; fall through on failure.
                    pass
            results[glbs.FILE_NAME] = classify(train, tr_labels, test,
                                               ts_labels, all_features,
                                               model_number=i)
            results = add_results(results)
            if glbs.WORDCLOUD:
                print_message("Generating word clouds (long processes)")
                generate_word_clouds()
        write_results(divide_results(results))
        send_work_done(glbs.TRAIN_DIR)
        print_message("Done!")
        # clean_backup_files()
    except Exception as e:
        traceback.print_exc()
        send_work_done(glbs.TRAIN_DIR, "", error=str(e),
                       traceback=str(traceback.format_exc()))
def write_results(results):
    glbs = GlobalParameters()
    print_message("Writing results...")
    pickle_path = glbs.RESULTS_PATH + "\\Pickle files"
    if path.exists(pickle_path):
        shutil.rmtree(pickle_path, ignore_errors=True)
    os.makedirs(pickle_path)
    xlsx_path = glbs.RESULTS_PATH + "\\Xlsx files"
    if path.exists(xlsx_path):
        shutil.rmtree(xlsx_path, ignore_errors=True)
        time.sleep(0.5)
    os.makedirs(xlsx_path)
    for key in results.keys():
        with open(pickle_path + "\\" + key + ".pickle", "wb+") as file:
            pickle.dump(results[key], file)
        new_write_file_content(pickle_path + "\\" + key + ".pickle", key,
                               xlsx_path)
def write_results(results):
    glbs = GlobalParameters()
    print_message("Writing results...")
    # add_to_csv(results, glbs.RESULTS_PATH)
    pickle_path = os.path.join(glbs.RESULTS_PATH, "Pickle files")
    if path.exists(pickle_path):
        shutil.rmtree(pickle_path, ignore_errors=True)
    os.makedirs(pickle_path)
    xlsx_path = os.path.join(glbs.RESULTS_PATH, "Xlsx files")
    if path.exists(xlsx_path):
        shutil.rmtree(xlsx_path, ignore_errors=True)
        time.sleep(0.5)
    os.makedirs(xlsx_path)
    for key in results.keys():
        with open(os.path.join(pickle_path, key) + ".pickle", "wb+") as file:
            pickle.dump(results[key], file)
        new_write_file_content(
            os.path.join(pickle_path, key) + ".pickle", key, xlsx_path)
def test_mvm():
    # Default global params
    gp = GlobalParameters()
    dim = [64]
    n_bit = [1]
    # active_rows = [1, 2, 3, 8, 16, 20, 24, 28, 32]
    active_rows = [1, 2, 8, 16, 32, 64]
    # active_rows = [1, 15, 63, 128]
    settings = [[d, n, a] for d in dim for n in n_bit for a in active_rows]
    print('R-Dim | Nb | AR')
    print('---------------')
    for s in settings:
        gp.rram.size_x = s[0]
        gp.rram.size_y = s[0]
        gp.rram.n_bit = s[1]
        gp.mvm.active_rows = s[2]
        print('{}x{} | {} | {} :'.format(s[0], s[0], s[1], s[2]), end='')
        M = 128
        N = 128
        res = 8
        mvm = MVM(gp)
        vec = np.random.random([1, M]) * 2 - 1
        mat = np.random.random([M, N]) * 2 - 1
        start = time.time()
        result = mvm.dot(vec, mat, res)
        print('{:.2f}'.format(mvm.e_read * 1e12), end=' ')
        result_t = mvm.dot_truth(vec, mat, res)
        # Elementwise compare against the exact dot product.
        if False in (result == result_t):
            print("Fail")
        else:
            print("Pass")
def print_run_details():
    glbs = GlobalParameters()
    print("""
    ---------------------------------------
    Dataset path: {}
    Features: {}
    Stylistic Features: {}
    Normalization: {}
    Methods: {}
    Measure: {}
    Export as baseline: {}
    Results Path: {}
    ---------------------------------------
    """.format(
        glbs.DATASET_DIR,
        glbs.FEATURES,
        glbs.STYLISTIC_FEATURES,
        glbs.NORMALIZATION,
        glbs.METHODS,
        glbs.MEASURE,
        glbs.EXPORT_AS_BASELINE,
        glbs.RESULTS_PATH,
    ))
def energy_vs_active_rows():
    # Default global params
    gp = GlobalParameters()
    # Define constant global params
    gp.adc.comp_var = 0.000
    gp.rram.size_x = 256
    gp.rram.size_y = 256
    gp.rram.r_var = np.log10(1) / 6
    # Define parametric global params
    set1 = [0]
    set2 = [1, 3, 7, 15, 31, 63, 127, 255]
    # set2 = [1, 3]
    settings = [[s1, s2] for s1 in set1 for s2 in set2]
    gp_list = []
    for s in settings:
        gp.rram.rvar = s[0]
        gp.mvm.active_rows = s[1]
        gp_list.append(copy.deepcopy(gp))
    [distance, energy, bools] = sweep_gp_params(gp_list)
    print(np.array(energy))
    # Convert J to pJ and normalize per op: a 128x128 MVM performs
    # 128*128 multiplies and 127*128 additions.
    energy = np.array(energy) * 1e12 / (128 * 128 + 127 * 128)
    f = open("outputs/energy_vs_arows", 'w')
    for e in energy:
        f.write(str(e) + '\n')
    f.close()
    fig, ax = plt.subplots()
    ax.set_xlabel('ADC Resolution (N)')
    ax.set_ylabel('Energy/OP (pJ)')
    ax.set_title('Energy Efficiency of MAC')
    plt.plot(np.log2(np.array(set2) + 1), energy)
    plt.show()
def boundary_test():
    M = 4
    N = 4
    res = 8
    gp = GlobalParameters()
    mvm = MVM(gp)
    vec = np.random.random([1, M]) * 2 - 1
    print("==Vec==")
    print(vec)
    mat = np.random.random([M, N]) * 2 - 1
    print("==Mat==")
    print(mat)
    result = mvm.dot(vec, mat, res)
    print("==Res==")
    print(result)
    result_t = mvm.dot_truth(vec, mat, res)
    if False in (result == result_t):
        print("Fail")
    else:
        print("Pass")
def selectionHalfMethod(X, y, all_features):
    glbs = GlobalParameters()
    filename = glbs.FILE_NAME
    results = {}
    # Binary search over k, the number of features kept by SelectKBest:
    # halve k while accuracy keeps improving, then bisect between the last
    # good k (top) and the first bad k (bottom).
    nxt = (glbs.SELECTION[0][0], int(glbs.SELECTION[0][1]))
    max_last_result = 0
    bottom = (0, 0)
    top = nxt
    while top != bottom:
        max_nxt_result = 0
        print_message(nxt[0])
        print_message(nxt[1])
        glbs.FILE_NAME = glbs.FILE_NAME + str(nxt[1])
        select = select_k_best(nxt[0], int(nxt[1]))
        glbs.FEATURE_MODEL[1] = select
        results[glbs.FILE_NAME] = classify(X, y, glbs.K_FOLDS, glbs.ITERATIONS)
        for method in results[glbs.FILE_NAME].items():
            if mean(method[1]["accuracy"]) > max_nxt_result:
                max_nxt_result = mean(method[1]["accuracy"])
        results = add_results(results, glbs, nxt)
        if max_nxt_result >= max_last_result:
            top = nxt
            if bottom[1] == 0:
                nxt = (nxt[0], int(int(nxt[1]) / 2))
            if bottom[1] != 0:
                nxt = (nxt[0], int((int(nxt[1]) + bottom[1]) / 2))
            max_last_result = max_nxt_result
        elif max_nxt_result < max_last_result:
            bottom = nxt
            nxt = (nxt[0], int((top[1] + bottom[1]) / 2))
        glbs.SELECTION[0] = nxt
        if bottom[1] - top[1] == -1 and bottom == nxt:
            break
    glbs.FILE_NAME = filename
    add_results_glbs(results, glbs)
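# The search above is, in essence, a binary search over k, assuming accuracy
# is roughly unimodal in the number of selected features. A distilled,
# self-contained sketch of the same idea (`score_for_k` is a hypothetical
# callback standing in for the select_k_best + classify pipeline):
def halving_k_search_sketch(score_for_k, k_max):
    lo, hi = 1, k_max
    best_k = k_max
    best_score = score_for_k(k_max)
    while lo < hi:
        mid = (lo + hi) // 2
        score = score_for_k(mid)
        if score >= best_score:
            # Fewer features score at least as well: keep searching lower.
            best_k, best_score = mid, score
            hi = mid
        else:
            # Accuracy dropped: the optimum lies in the upper half.
            lo = mid + 1
    return best_k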
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC

from confusion_matrix import accuracy_confusion_matrix
from global_parameters import print_message, GlobalParameters
from model_persistence import save_model
from precision_recall_curve import precision_recall
from roc_curve import roc_curve_data

glbs = GlobalParameters()

methods = {
    "svc": LinearSVC(),
    "rf": RandomForestClassifier(),
    "mlp": MLPClassifier(),
    "lr": LogisticRegression(),
    "mnb": MultinomialNB(),
}


def get_results(ts_labels, prediction, decision):
    # ("multilabel_confusion_matrix", multilabel_confusion_matrix(ts_labels, prediction)),
    measures = {
        "accuracy_score": accuracy_score(ts_labels, prediction),
def set_global_parameters(configs): glbls = GlobalParameters() config = configs[1] glbls.FILE_NAME = configs[0] glbls.FEATURES = config["features"] glbls.NORMALIZATION = "".join(sorted(config["nargs"].upper())) glbls.OUTPUT_DIR = config["output_csv"] glbls.METHODS = config["methods"] glbls.TRAIN_DIR = config["train"] glbls.TEST_DIR = config["test"] glbls.RESULTS_PATH = config["results"] glbls.MEASURE = config["measure"] glbls.STYLISTIC_FEATURES = config["stylistic_features"] glbls.SELECTION = config["selection"].items( ) if "selection" in config else [] try: if 'language' in config: glbls.LANGUAGE = config['language'] else: path = os.path.join(config["train"], os.listdir(config["train"])[0]) glbls.LANGUAGE = text_language( open(path, "r", encoding="utf8", errors='replace').read()) except: glbls.LANGUAGE = "english"
def clean_backup_files():
    glbs = GlobalParameters()
    print_message("removing temp files...")
    folder_path = os.sep.join(
        glbs.RESULTS_PATH.split(os.sep)[:-1]) + os.sep + "temp_backups"
    shutil.rmtree(folder_path, ignore_errors=True)
def plot_confusion_matrix(cm,
                          result_path,
                          normalize=True,
                          title=None,
                          accuracy=None,
                          cmap=plt.cm.Blues,
                          color='black'):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    glbs = GlobalParameters()
    labels = [label.split('.')[0] for label in list(set(glbs.LABELS))]
    fig, ax = plt.subplots(figsize=(3, 2))
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(
        xticks=np.arange(cm.shape[1]),
        yticks=np.arange(cm.shape[0]),
        # ... and label them with the respective list entries
        xticklabels=labels,
        yticklabels=labels,
        # title=title,
        ylabel='True',
        xlabel='Predicted')
    # Work around matplotlib 3.1 cropping the top and bottom heatmap rows.
    bottom, top = ax.get_ylim()
    ax.set_ylim(bottom + 0.5, top - 0.5)
    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), ha="right", rotation_mode="anchor")
    # Loop over data dimensions and create text annotations.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    if accuracy:
        accuracy = float('{0:.4g}'.format(accuracy * 100))
        plt.title('Accuracy Score: ' + str(accuracy) + '\nConfusion Matrix:')
    plt.rcParams.update({"text.color": color})
    plt.savefig(os.path.join(result_path, title) + '.jpg', bbox_inches='tight')
    plt.close('all')
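# Hedged usage sketch: plot_confusion_matrix expects raw (unnormalized)
# counts, which sklearn's confusion_matrix provides. This assumes
# glbs.LABELS already matches the classes in ts_labels.
def plot_confusion_matrix_example(ts_labels, prediction, result_path):
    from sklearn.metrics import accuracy_score, confusion_matrix

    cm = confusion_matrix(ts_labels, prediction)
    plot_confusion_matrix(cm, result_path,
                          accuracy=accuracy_score(ts_labels, prediction))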
def set_global_parameters(configs): glbls = GlobalParameters() config = configs[1] glbls.FILE_NAME = configs[0] glbls.FEATURES = config["features"] glbls.NORMALIZATION = [n.lower() for n in config["nargs"]] glbls.METHODS = config["methods"] glbls.DATASET_DIR = config["dataset"] glbls.RESULTS_PATH = config["results"] glbls.MEASURE = config["measure"] glbls.STYLISTIC_FEATURES = config["stylistic_features"] glbls.SELECTION = list(config["selection"].items()) glbls.K_FOLDS = config["k_folds_cv"] glbls.ITERATIONS = config["iterations"] glbls.BASELINE_PATH = config["baseline_path"] glbls.EXPORT_AS_BASELINE = config["export_as_baseline"] glbls.FEATURE_MODEL = [] try: if "language" in config: glbls.LANGUAGE = config["language"] else: path = config["dataset"] + "\\" + os.listdir(config["dataset"])[0] glbls.LANGUAGE = text_language( open(path, "r", encoding="utf8", errors="replace").read()) except: glbls.LANGUAGE = "english" glbls.STOP_WORDS = None if "s" in config["nargs"]: glbls.STOP_WORDS = "english"