def main():
    print(
        "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~Matrix Factorization~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    )
    config = get_ArgumentParser().parse_args()
    config.LOG_FNAME = algo_fn[config.TYPE] + ".log"
    logger = init(config, config.LOG_DIR, config.LOG_FNAME)
    dataset = load_dataset(config,
                           path.join(config.DATA_PATH, config.DATA_DIR.lower()))
    config.MULTI_LABEL = dataset.multilabel
    print("Config: %s" % config)
    print(
        "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    )

    S = du.get_proximity_matrix(dataset.relations, float(config.ETA))
    B = du.get_modularity_matrix(dataset.relations)

    perc_data = dataset.expt_sets
    for a in perc_data:
        temp1, temp2, temp3 = {}, {}, {}
        print("% of randomly sampled training data ---- ", a)
        avg_lr_acc = {'micro_f1': 0.0, 'macro_f1': 0.0, 'accuracy': 0.0}
        avg_svm_acc = {'micro_f1': 0.0, 'macro_f1': 0.0, 'accuracy': 0.0}
        avg_n_acc = {'micro_f1': 0.0, 'macro_f1': 0.0, 'accuracy': 0.0}
        itr = 0

        # for b in range(1, dataset.n_folds + 1):  # You can choose to run for all five folds
        for b in range(1, 1 + 1):  # Running for only one fold
            data_dir = path.join(config.DATA_PATH, config.DATA_DIR.lower(),
                                 'index', str(a), str(b))
            train_ids = np.load(path.join(data_dir, 'train_ids.npy')).astype(dtype=bool)
            val_ids = np.load(path.join(data_dir, 'val_ids.npy')).astype(dtype=bool)
            train_ids = np.logical_or(train_ids, val_ids)
            test_ids = np.load(path.join(data_dir, 'test_ids.npy')).astype(dtype=bool)

            labelled_ids = train_ids
            unlabelled_ids = np.logical_not(labelled_ids)
            n_unlabelled = np.count_nonzero(unlabelled_ids)

            # Zero out the rows of unlabelled nodes to build the training label matrix.
            labels = np.copy(dataset.truth)
            labels[unlabelled_ids, :] = np.zeros((n_unlabelled, dataset.n_labels))
            Y = dataset.truth
            Y_train = labels

            emb_file = path.join(config.LOG_DIR,
                                 config.FOLDER_SUFFIX + "_U" + str(b) + ".npy")
            if not os.path.isfile(emb_file):
                # Dispatch to the factorization variant registered for config.TYPE.
                module = __import__(algo_fn[config.TYPE])
                best_result_lr = getattr(module, 'factorize')(
                    config, S, B, Y.T, Y_train.T, train_ids, val_ids, test_ids, logger)
            else:
                # Reuse previously saved embeddings.
                U = np.load(emb_file)
                Q = np.load(path.join(config.LOG_DIR,
                                      config.FOLDER_SUFFIX + "_Q" + str(b) + ".npy"))
                best_result_lr = {'Q': Q, 'U': U, 'H': None, 'i': 0}

            best_lr_accu = get_perf_metrics(config, best_result_lr['U'], best_result_lr['Q'],
                                            Y, train_ids, test_ids, 'lr')
            best_svm_accu = get_perf_metrics(config, best_result_lr['U'], best_result_lr['Q'],
                                             Y, train_ids, test_ids, 'svm')
            if config.TYPE in ["1", "12", "18", "21", "22", "23", "25"]:
                best_n_accu = best_lr_accu
            else:
                best_n_accu = get_perf_metrics(config, best_result_lr['U'], best_result_lr['Q'],
                                               Y, train_ids, test_ids, 'n')

            for k in avg_lr_acc:
                avg_lr_acc[k] += best_lr_accu[k]
                avg_svm_acc[k] += best_svm_accu[k]
                avg_n_acc[k] += best_n_accu[k]

            logger.debug("Iter# {} LR_Micro_F1: {} SVM_Micro_F1: {} N_Micro_F1: {}".format(
                best_result_lr['i'], best_lr_accu['micro_f1'],
                best_svm_accu['micro_f1'], best_n_accu['micro_f1']))
            itr += 1

            if config.SAVE_EMB:
                logger.info("Save embedding to %s", config.LOG_DIR)
                np.save(path.join(config.LOG_DIR,
                                  config.FOLDER_SUFFIX + "_U" + str(a) + "_" + str(b) + ".npy"),
                        best_result_lr['U'], allow_pickle=False)
                np.save(path.join(config.LOG_DIR,
                                  config.FOLDER_SUFFIX + "_Q" + str(a) + "_" + str(b) + ".npy"),
                        best_result_lr['Q'], allow_pickle=False)

        # Average over the folds that were actually run.
        avg_lr_acc = {k: v / itr for k, v in avg_lr_acc.items()}
        avg_svm_acc = {k: v / itr for k, v in avg_svm_acc.items()}
        avg_n_acc = {k: v / itr for k, v in avg_n_acc.items()}

        for k, v in {"50_MI": 'micro_f1', "50_MA": 'macro_f1', "50_AC": 'accuracy'}.items():
            temp1[k] = avg_lr_acc[v]
            temp2[k] = avg_svm_acc[v]
            temp3[k] = avg_n_acc[v]

        with open("tmp_output_files/" + str(config.DATA_DIR) + "_" + config.TYPE + "_"
                  + str(a) + "_" + "best_params_nc.txt", 'wb') as fp:
            pkl.dump({"LR": temp1, "SVM": temp2, "N": temp3}, fp)
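# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original script): the loop above selects the
# factorization variant at runtime via __import__(algo_fn[config.TYPE]) followed by
# getattr(module, 'factorize'). The hypothetical helper below shows the same plug-in
# dispatch with importlib.import_module, which also resolves dotted module names;
# `load_factorizer` and the example module name "mf_basic" are assumptions, not
# names taken from this code base.
import importlib


def load_factorizer(algo_fn, algo_type):
    """Return the `factorize` callable of the module registered for `algo_type`."""
    module = importlib.import_module(algo_fn[algo_type])
    return getattr(module, 'factorize')

# Example usage (hypothetical registry entry):
#   factorize = load_factorizer({"1": "mf_basic"}, "1")
#   best_result_lr = factorize(config, S, B, Y.T, Y_train.T,
#                              train_ids, val_ids, test_ids, logger)
# ---------------------------------------------------------------------------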
def main():
    print(
        "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~Matrix Factorization~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    )
    config = get_ArgumentParser().parse_args()
    if config.DATA_DIR in [
            'washington', 'wisconsin', 'texas', 'cornell', 'armherst',
            'rochester', 'mich', 'hamilton', 'citeseer', 'cora', 'wiki'
    ]:
        config.MULTI_LABEL = False
    elif config.DATA_DIR in ['ppi', 'blogcatalog', 'wiki_n2v']:
        config.MULTI_LABEL = True
    print("Config: %s" % config)
    print(
        "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    )

    # Initialization and dataset loading
    logger = init(config.LOG_DIR, config.LOG_FNAME)
    dataset = load_dataset(path.join("../Datasets/", config.DATA_DIR.lower()))

    # Template of the metrics accumulated per classifier; fresh zeroed copies are made
    # for every training percentage below, so the shared reference here is harmless.
    tmp = {
        "accuracy": 0, "micro_precision": 0, "micro_recall": 0, "micro_f1": 0,
        "macro_precision": 0, "macro_recall": 0, "macro_f1": 0,
        "average_precision": 0, "coverage": 0, "ranking_loss": 0,
        "hamming_loss": 0, "cross_entropy": 0, "bae": 0, "pak": 0
    }
    overall_performances_a = [tmp]   # logistic regression
    overall_performances_b = [tmp]   # SVM
    overall_performances_c = [tmp]   # no external classifier

    all_results_a, all_avg_results_a = {}, {}
    all_results_b, all_avg_results_b = {}, {}
    all_results_c, all_avg_results_c = {}, {}
    l_res_a, l_res_b, l_res_c = [], [], []

    # Alternative (file-based / sparse) inputs kept for reference:
    # graph_file = path.join(config.MODEL, "Net", config.DATA_DIR.title() + "_net.txt")
    # S = np.loadtxt(graph_file)
    # S = du.get_proximity_similarity_matrix(dataset.relations[0], float(config.ETA))
    # S = csr_matrix(du.get_proximity_matrix(dataset.relations[0], float(config.ETA)))
    # B = csr_matrix(du.get_modularity_matrix(dataset.relations[0]))
    S = du.get_proximity_matrix(dataset.relations[0], float(config.ETA))
    B = du.get_modularity_matrix(dataset.relations[0])

    perc_data = dataset.expt_sets
    for a in perc_data:
        all_results_a[a] = {}
        all_results_b[a] = {}
        all_results_c[a] = {}
        all_avg_results_a[config.FOLDER_SUFFIX] = []
        all_avg_results_b[config.FOLDER_SUFFIX] = []
        all_avg_results_c[config.FOLDER_SUFFIX] = []

        # Reset the accumulators to zero for this training percentage.
        overall_performances_a = [dict.fromkeys(g, 0) for g in overall_performances_a]
        overall_performances_b = [dict.fromkeys(g, 0) for g in overall_performances_b]
        overall_performances_c = [dict.fromkeys(g, 0) for g in overall_performances_c]
        itr = 0
        print("% of randomly sampled training data ---- ", a)

        # for b in range(1, dataset.n_folds + 1):  # run all folds
        for b in range(1, 2):  # running only one fold
            data_dir = path.join("../Datasets/", config.DATA_DIR.lower(),
                                 'index', str(a), str(b))
            train_ids = np.load(path.join(data_dir, 'train_ids.npy')).astype(dtype=bool)
            val_ids = np.load(path.join(data_dir, 'val_ids.npy')).astype(dtype=bool)
            train_ids = np.logical_or(train_ids, val_ids)
            test_ids = np.load(path.join(data_dir, 'test_ids.npy')).astype(dtype=bool)
            # test_ids = np.logical_or(test_ids, val_ids)

            labelled_ids = train_ids
            unlabelled_ids = np.logical_not(labelled_ids)
            n_unlabelled = np.count_nonzero(unlabelled_ids)
            n_labelled = np.count_nonzero(labelled_ids)

            labels = np.copy(dataset.truth)
            labels[unlabelled_ids, :] = np.zeros((n_unlabelled, dataset.n_labels))

            # Sparse variants kept for reference:
            # Y = csr_matrix(dataset.truth)
            # Y_train = csr_matrix(labels)
            # D = [csr_matrix(i.T) for i in dataset.attributes]  # m x n
            # X = [csr_matrix(i) for i in dataset.relations]     # n x n
            Y = dataset.truth
            Y_train = labels
            D = [i.T for i in dataset.attributes]  # m x n
            X = [i for i in dataset.relations]     # n x n

            performances_a = []
            performances_b = []
            performances_c = []

            best_result_lr, best_result_svm, best_result = mnf.factorize(
                config, dataset, S, B, D[0], X[0], Y.T, Y_train.T,
                train_ids, val_ids, test_ids, logger)

            # Optionally dump the learned factors as text logs:
            # outputEntities = path.join(config.LOG_DIR, "U_" + str(a) + "_" + str(b) + "_" + "_n.log")
            # np.savetxt(outputEntities, best_result_n['U'], fmt="%f")
            # outputEntities = path.join(config.LOG_DIR, "H_" + str(a) + "_" + str(b) + "_" + "_n.log")
            # np.savetxt(outputEntities, best_result_n['H'], fmt="%f")
            # outputEntities = path.join(config.LOG_DIR, "Q_" + str(a) + "_" + str(b) + "_" + "_n.log")
            # np.savetxt(outputEntities, best_result_n['Q'], fmt="%f")

            # (The three metric calls below previously operated on Y.toarray()
            # when Y was kept as a csr_matrix.)
            performance_lr = get_perf_metrics_using_lr(config, best_result_lr['U'], Y,
                                                       train_ids, val_ids, test_ids)
            print("Performance_using_LR : Test accuracy: {%0.5f } , Test Loss: {%0.5f } Iter: {%d}"
                  % (performance_lr['accuracy'], performance_lr['cross_entropy'],
                     best_result_lr['i']))
            performances_a.append(performance_lr)

            performance_svm = get_perf_metrics_using_svm(config, best_result_svm['U'], Y,
                                                         train_ids, val_ids, test_ids)
            print("Performance_using_SVM : Test accuracy: {%0.5f } , Test Loss: {%0.5f } Iter: {%d}"
                  % (performance_svm['accuracy'], performance_svm['cross_entropy'],
                     best_result_svm['i']))
            performances_b.append(performance_svm)

            performance = get_perf_metrics(config, best_result['U'], best_result['Q'], Y,
                                           train_ids, val_ids, test_ids)
            print("Performance_without_classifier : Test accuracy: {%0.5f } , Test Loss: {%0.5f } Iter: {%d}"
                  % (performance['accuracy'], performance['cross_entropy'], best_result['i']))
            performances_c.append(performance)

            all_results_a[a][b] = performance_lr
            all_results_b[a][b] = performance_svm
            all_results_c[a][b] = performance

            # Accumulate every metric of this fold (same keys as `tmp`).
            for i in range(len(overall_performances_a)):
                if len(overall_performances_a) == len(performances_a):
                    for k in overall_performances_a[i]:
                        overall_performances_a[i][k] += performances_a[i][k]
            for i in range(len(overall_performances_b)):
                if len(overall_performances_b) == len(performances_b):
                    for k in overall_performances_b[i]:
                        overall_performances_b[i][k] += performances_b[i][k]
            for i in range(len(overall_performances_c)):
                if len(overall_performances_c) == len(performances_c):
                    for k in overall_performances_c[i]:
                        overall_performances_c[i][k] += performances_c[i][k]

            print("**********************************************************")
            itr += 1

        # Average the accumulated metrics over the folds.
        overall_performances_a = [{k: v / dataset.n_folds for k, v in d.items()}
                                  for d in overall_performances_a]
        overall_performances_b = [{k: v / dataset.n_folds for k, v in d.items()}
                                  for d in overall_performances_b]
        overall_performances_c = [{k: v / dataset.n_folds for k, v in d.items()}
                                  for d in overall_performances_c]
        print('LR ---> ', overall_performances_a)
        print('SVM ---> ', overall_performances_b)
        print('N ---> ', overall_performances_c)

        l_res_a.append({a: overall_performances_a[0]})
        l_res_b.append({a: overall_performances_b[0]})
        l_res_c.append({a: overall_performances_c[0]})

    # Persist per-fold results together with the configuration used.
    all_results_a[str(0)] = config
    all_results_b[str(0)] = config
    all_results_c[str(0)] = config
    np.save(path.join(config.LOG_DIR, 'results_lr.npy'), all_results_a)
    np.save(path.join(config.LOG_DIR, 'results_svm.npy'), all_results_b)
    np.save(path.join(config.LOG_DIR, 'results_n.npy'), all_results_c)

    l_res_a.append({str(0): config})
    l_res_b.append({str(0): config})
    l_res_c.append({str(0): config})
    all_avg_results_a[config.FOLDER_SUFFIX] = l_res_a
    all_avg_results_b[config.FOLDER_SUFFIX] = l_res_b
    all_avg_results_c[config.FOLDER_SUFFIX] = l_res_c

    fn = path.join(config.LOG_DIR, "Avg")
    if not path.exists(fn):
        makedirs(fn, exist_ok=True)
    np.save(path.join(fn, 'results_avg_lr.npy'), all_avg_results_a)
    np.save(path.join(fn, 'results_avg_svm.npy'), all_avg_results_b)
    np.save(path.join(fn, 'results_avg_n.npy'), all_avg_results_c)
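# ---------------------------------------------------------------------------
# Reference sketch (an assumption, not the project's implementation): both mains call
# du.get_modularity_matrix; a common choice for such a helper is Newman's modularity
# matrix B = A - d d^T / (2m), where A is the adjacency matrix, d the degree vector,
# and m the number of edges. `modularity_matrix_sketch` below is illustrative only.
import numpy as np


def modularity_matrix_sketch(A):
    """Newman modularity matrix of a symmetric adjacency matrix A (dense ndarray)."""
    d = A.sum(axis=1)                  # node degrees
    two_m = d.sum()                    # 2m = sum of all degrees
    return A - np.outer(d, d) / two_m

# Example usage on a toy 3-node path graph:
#   A = np.array([[0., 1., 0.], [1., 0., 1.], [0., 1., 0.]])
#   B_toy = modularity_matrix_sketch(A)
# ---------------------------------------------------------------------------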