Example #1
def main():
    print(
        "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~Matrix Factorization~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    )
    config = get_ArgumentParser().parse_args()
    config.LOG_FNAME = algo_fn[config.TYPE] + ".log"
    logger = init(config, config.LOG_DIR, config.LOG_FNAME)
    dataset = load_dataset(
        config, path.join(config.DATA_PATH, config.DATA_DIR.lower()))
    config.MULTI_LABEL = dataset.multilabel
    print("Config: %s" % (config))
    print(
        "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    )
    # Build the eta-weighted proximity matrix S and the modularity matrix B from the graph.
    S = du.get_proximity_matrix(dataset.relations, float(config.ETA))
    B = du.get_modularity_matrix(dataset.relations)

    perc_data = dataset.expt_sets
    for a in perc_data:
        temp1 = {}
        temp2 = {}
        temp3 = {}
        print("% of randomly sampled training data ---- ", a)
        avg_lr_acc = {'micro_f1': 0.0, 'macro_f1': 0.0, 'accuracy': 0.0}
        avg_svm_acc = {'micro_f1': 0.0, 'macro_f1': 0.0, 'accuracy': 0.0}
        avg_n_acc = {'micro_f1': 0.0, 'macro_f1': 0.0, 'accuracy': 0.0}
        itr = 0
        # for b in range(1, dataset.n_folds + 1): # You can choose to run for all five folds
        for b in range(1, 1 + 1):  # Running for only one fold
            data_dir = path.join(config.DATA_PATH, config.DATA_DIR.lower(),
                                 'index', str(a), str(b))
            train_ids = np.load(path.join(data_dir,
                                          'train_ids.npy')).astype(dtype=bool)
            val_ids = np.load(path.join(data_dir,
                                        'val_ids.npy')).astype(dtype=bool)
            train_ids = np.logical_or(train_ids, val_ids)
            test_ids = np.load(path.join(data_dir,
                                         'test_ids.npy')).astype(dtype=bool)
            labelled_ids = train_ids
            unlabelled_ids = np.logical_not(labelled_ids)
            n_unlabelled = np.count_nonzero(unlabelled_ids)
            # Zero out the labels of unlabelled nodes so only the training split supervises the factorization.
            labels = np.copy(dataset.truth)
            labels[unlabelled_ids, :] = np.zeros(
                (n_unlabelled, dataset.n_labels))
            Y = dataset.truth  # full ground truth, used only for evaluation
            Y_train = labels   # supervision signal with unlabelled rows zeroed

            # Reuse cached embeddings if U/Q for this fold were already saved; otherwise run the factorization.
            if not os.path.isfile(
                    path.join(config.LOG_DIR,
                              config.FOLDER_SUFFIX + "_U" + str(b) + ".npy")):
                module = __import__(algo_fn[config.TYPE])
                best_result_lr = getattr(module,
                                         'factorize')(config, S, B, Y.T,
                                                      Y_train.T, train_ids,
                                                      val_ids, test_ids,
                                                      logger)
            else:
                U = np.load(
                    path.join(config.LOG_DIR,
                              config.FOLDER_SUFFIX + "_U" + str(b) + ".npy"))
                Q = np.load(
                    path.join(config.LOG_DIR,
                              config.FOLDER_SUFFIX + "_Q" + str(b) + ".npy"))
                best_result_lr = {'Q': Q, 'U': U, 'H': None, 'i': 0}

            best_lr_accu = get_perf_metrics(config, best_result_lr['U'],
                                            best_result_lr['Q'], Y, train_ids,
                                            test_ids, 'lr')
            best_svm_accu = get_perf_metrics(config, best_result_lr['U'],
                                             best_result_lr['Q'], Y, train_ids,
                                             test_ids, 'svm')
            if config.TYPE in ["1", "12", "18", "21", "22", "23", "25"]:
                # These variants report the LR metrics in place of a separate 'n' (no classifier) evaluation.
                best_n_accu = best_lr_accu
            else:
                best_n_accu = get_perf_metrics(config, best_result_lr['U'],
                                               best_result_lr['Q'], Y,
                                               train_ids, test_ids, 'n')
            # Accumulate this fold's metrics into the running averages.
            for k in avg_lr_acc:
                avg_lr_acc[k] += best_lr_accu[k]
                avg_svm_acc[k] += best_svm_accu[k]
                avg_n_acc[k] += best_n_accu[k]
            logger.debug(
                "Iter# {} LR_Micro_F1: {} SVM_Micro_F1: {} N_Micro_F1: {}".
                format(best_result_lr['i'], best_lr_accu['micro_f1'],
                       best_svm_accu['micro_f1'], best_n_accu["micro_f1"]))
            itr += 1
            if config.SAVE_EMB:
                logger.info("Save embedding to %s", config.LOG_DIR)
                np.save(path.join(
                    config.LOG_DIR, config.FOLDER_SUFFIX + "_U" + str(a) +
                    "_" + str(b) + ".npy"),
                        best_result_lr['U'],
                        allow_pickle=False)
                np.save(path.join(
                    config.LOG_DIR, config.FOLDER_SUFFIX + "_Q" + str(a) +
                    "_" + str(b) + ".npy"),
                        best_result_lr['Q'],
                        allow_pickle=False)
        # Average the accumulated metrics over the folds actually run.
        avg_lr_acc = {k: v / itr for k, v in avg_lr_acc.items()}
        avg_svm_acc = {k: v / itr for k, v in avg_svm_acc.items()}
        avg_n_acc = {k: v / itr for k, v in avg_n_acc.items()}
        for k, v in {
                "50_MI": 'micro_f1',
                "50_MA": 'macro_f1',
                "50_AC": 'accuracy'
        }.items():
            temp1[k] = avg_lr_acc[v]
            temp2[k] = avg_svm_acc[v]
            temp3[k] = avg_n_acc[v]
        # Persist the averaged metrics for this training percentage.
        with open(
                "tmp_output_files/" + str(config.DATA_DIR) + "_" +
                config.TYPE + "_" + str(a) + "_" + "best_params_nc.txt",
                'wb') as fp:
            pkl.dump({"LR": temp1, "SVM": temp2, "N": temp3}, fp)
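Neither get_ArgumentParser nor algo_fn is shown in this example. The sketch below is only an assumption inferred from the config fields that main() reads (TYPE, ETA, DATA_PATH, DATA_DIR, LOG_DIR, LOG_FNAME, FOLDER_SUFFIX, SAVE_EMB); the option defaults and the module names mapped in algo_fn are placeholders, not the repository's actual values.

import argparse

# Hypothetical mapping from config.TYPE to the module that implements 'factorize';
# the real repository defines its own module names.
algo_fn = {"1": "mf_variant_1", "12": "mf_variant_12"}


def get_ArgumentParser():
    """Minimal sketch of the parser implied by the fields used in main()."""
    parser = argparse.ArgumentParser(description="Matrix factorization experiments")
    parser.add_argument("--TYPE", default="1", help="which factorization variant to run")
    parser.add_argument("--ETA", type=float, default=0.5, help="weight of higher-order proximity in S")
    parser.add_argument("--DATA_PATH", default="../Datasets/")
    parser.add_argument("--DATA_DIR", default="cora")
    parser.add_argument("--LOG_DIR", default="logs/")
    parser.add_argument("--LOG_FNAME", default="mf.log")
    parser.add_argument("--FOLDER_SUFFIX", default="run")
    parser.add_argument("--SAVE_EMB", action="store_true", help="persist the learned U and Q matrices")
    return parser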
Example #2
def main():
    print(
        "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~Matrix Factorization~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    )
    config = get_ArgumentParser().parse_args()
    if config.DATA_DIR in [
            'washington', 'wisconsin', 'texas', 'cornell', 'armherst',
            'rochester', 'mich', 'hamilton', 'citeseer', 'cora', 'wiki'
    ]:
        config.MULTI_LABEL = False
    elif config.DATA_DIR in ['ppi', 'blogcatalog', 'wiki_n2v']:
        config.MULTI_LABEL = True
    print("Config: %s" % (config))
    print(
        "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    )

    # Initialization and loading dataset
    logger = init(config.LOG_DIR, config.LOG_FNAME)
    dataset = load_dataset(path.join("../Datasets/", config.DATA_DIR.lower()))
    # Zero-initialized metric template; copies are accumulated per evaluation mode across folds.
    tmp = {
        "accuracy": 0,
        "micro_precision": 0,
        "micro_recall": 0,
        "micro_f1": 0,
        "macro_precision": 0,
        "macro_recall": 0,
        "macro_f1": 0,
        "average_precision": 0,
        "coverage": 0,
        "ranking_loss": 0,
        "hamming_loss": 0,
        "cross_entropy": 0,
        "bae": 0,
        "pak": 0
    }
    overall_performances_a = [tmp]
    overall_performances_b = [tmp]
    overall_performances_c = [tmp]
    all_results_a = {}
    all_avg_results_a = {}
    all_results_b = {}
    all_avg_results_b = {}
    all_results_c = {}
    all_avg_results_c = {}
    l_res_a = list()
    l_res_b = list()
    l_res_c = list()

    # graph_file = path.join(config.MODEL, "Net", config.DATA_DIR.title() + "_net.txt")
    # S = np.loadtxt(graph_file)
    # S = du.get_proximity_similarity_matrix(dataset.relations[0], float(config.ETA))
    # S = csr_matrix(du.get_proximity_matrix(dataset.relations[0], float(config.ETA)))
    # B = csr_matrix(du.get_modularity_matrix(dataset.relations[0]))
    # Build the eta-weighted proximity matrix S and the modularity matrix B from the first relation graph.
    S = du.get_proximity_matrix(dataset.relations[0], float(config.ETA))
    B = du.get_modularity_matrix(dataset.relations[0])

    perc_data = dataset.expt_sets
    for a in perc_data:
        all_results_a[a] = {}
        all_results_b[a] = {}
        all_results_c[a] = {}
        all_avg_results_a[config.FOLDER_SUFFIX] = list()
        all_avg_results_b[config.FOLDER_SUFFIX] = list()
        all_avg_results_c[config.FOLDER_SUFFIX] = list()
        # Reset the accumulated metrics to zero for this training percentage.
        overall_performances_a = [
            dict.fromkeys(g, 0) for g in overall_performances_a
        ]
        overall_performances_b = [
            dict.fromkeys(g, 0) for g in overall_performances_b
        ]
        overall_performances_c = [
            dict.fromkeys(g, 0) for g in overall_performances_c
        ]
        itr = 0
        print("% of randomly sampled training data ---- ", a)
        # for b in range(1, dataset.n_folds + 1):  # you can choose to run all folds
        for b in range(1, 2):  # running only the first fold
            data_dir = path.join("../Datasets/", config.DATA_DIR.lower(),
                                 'index', str(a), str(b))
            train_ids = np.load(path.join(data_dir,
                                          'train_ids.npy')).astype(dtype=bool)
            val_ids = np.load(path.join(data_dir,
                                        'val_ids.npy')).astype(dtype=bool)
            train_ids = np.logical_or(train_ids, val_ids)
            test_ids = np.load(path.join(data_dir,
                                         'test_ids.npy')).astype(dtype=bool)
            # test_ids = np.logical_or(test_ids, val_ids)

            labelled_ids = train_ids
            unlabelled_ids = np.logical_not(labelled_ids)
            n_unlabelled = np.count_nonzero(unlabelled_ids)
            n_labelled = np.count_nonzero(labelled_ids)
            # Mask the labels of unlabelled nodes so that only the training split provides supervision.
            labels = np.copy(dataset.truth)
            labels[unlabelled_ids, :] = np.zeros(
                (n_unlabelled, dataset.n_labels))
            # Y = csr_matrix(dataset.truth)
            # Y_train = csr_matrix(labels)
            # D = [csr_matrix(i.T) for i in dataset.attributes]  # mxn
            # X = [csr_matrix(i) for i in dataset.relations]  # nxn
            Y = dataset.truth
            Y_train = labels
            D = [i.T for i in dataset.attributes]  # mxn
            X = [i for i in dataset.relations]  # nxn

            performances_a = []
            performances_b = []
            performances_c = []
            best_result_lr, best_result_svm, best_result = mnf.factorize(
                config, dataset, S, B, D[0], X[0], Y.T, Y_train.T, train_ids,
                val_ids, test_ids, logger)

            # outputEntities = path.join(config.LOG_DIR, "U_" + str(a) + "_" + str(b) + "_" + "_n.log")  # U
            # np.savetxt(outputEntities, best_result_n['U'], fmt="%f")
            # outputEntities = path.join(config.LOG_DIR, "H_" + str(a) + "_" + str(b) + "_" + "_n.log")  # U
            # np.savetxt(outputEntities, best_result_n['H'], fmt="%f")
            # outputEntities = path.join(config.LOG_DIR, "Q_" + str(a) + "_" + str(b) + "_" + "_n.log")  # U
            # np.savetxt(outputEntities, best_result_n['Q'], fmt="%f")

            performance_lr = get_perf_metrics_using_lr(config,
                                                       best_result_lr['U'], Y,
                                                       train_ids, val_ids,
                                                       test_ids)
            print(
                "Performance_using_LR : Test accuracy: {%0.5f } , Test Loss: {%0.5f } Iter: {%d}"
                % (performance_lr['accuracy'], performance_lr['cross_entropy'],
                   best_result_lr['i']))
            performances_a.append(performance_lr)
            performance_svm = get_perf_metrics_using_svm(
                config, best_result_svm['U'], Y, train_ids, val_ids, test_ids)
            print(
                "Performance_using_SVM : Test accuracy: {%0.5f } , Test Loss: {%0.5f } Iter: {%d}"
                % (performance_svm['accuracy'],
                   performance_svm['cross_entropy'], best_result_svm['i']))
            performances_b.append(performance_svm)
            performance = get_perf_metrics(config, best_result['U'],
                                           best_result['Q'], Y, train_ids,
                                           val_ids, test_ids)
            print(
                "Performance_without_classifier : Test accuracy: {%0.5f } , Test Loss: {%0.5f } Iter: {%d}"
                % (performance['accuracy'], performance['cross_entropy'],
                   best_result['i']))
            performances_c.append(performance)
            all_results_a[a][b] = performance_lr
            all_results_b[a][b] = performance_svm
            all_results_c[a][b] = performance

            # Accumulate this fold's metrics into the running totals for each evaluation mode
            # (a: logistic regression, b: SVM, c: no external classifier).
            for overall, fold in ((overall_performances_a, performances_a),
                                  (overall_performances_b, performances_b),
                                  (overall_performances_c, performances_c)):
                if len(overall) == len(fold):
                    for i in range(len(overall)):
                        for key in overall[i]:
                            overall[i][key] += fold[i][key]
            print("**********************************************************")
            itr += 1

        # Average over the folds actually run; itr equals dataset.n_folds only when all folds are enabled above.
        overall_performances_a = [{
            k: v / itr
            for k, v in d.items()
        } for d in overall_performances_a]
        overall_performances_b = [{
            k: v / itr
            for k, v in d.items()
        } for d in overall_performances_b]
        overall_performances_c = [{
            k: v / itr
            for k, v in d.items()
        } for d in overall_performances_c]
        print('LR ---> ', overall_performances_a)
        print('SVM ---> ', overall_performances_b)
        print('N ---> ', overall_performances_c)
        l_res_a.append({a: overall_performances_a[0]})
        l_res_b.append({a: overall_performances_b[0]})
        l_res_c.append({a: overall_performances_c[0]})

    all_results_a[str(0)] = config
    all_results_b[str(0)] = config
    all_results_c[str(0)] = config
    np.save(path.join(config.LOG_DIR, 'results_lr.npy'), all_results_a)
    np.save(path.join(config.LOG_DIR, 'results_svm.npy'), all_results_b)
    np.save(path.join(config.LOG_DIR, 'results_n.npy'), all_results_c)
    l_res_a.append({str(0): config})
    l_res_b.append({str(0): config})
    l_res_c.append({str(0): config})
    all_avg_results_a[config.FOLDER_SUFFIX] = l_res_a
    all_avg_results_b[config.FOLDER_SUFFIX] = l_res_b
    all_avg_results_c[config.FOLDER_SUFFIX] = l_res_c
    fn = path.join(config.LOG_DIR, "Avg")
    if not path.exists(fn):
        makedirs(fn, exist_ok=True)
    np.save(path.join(config.LOG_DIR, "Avg", 'results_avg_lr.npy'),
            all_avg_results_a)
    np.save(path.join(config.LOG_DIR, "Avg", 'results_avg_svm.npy'),
            all_avg_results_b)
    np.save(path.join(config.LOG_DIR, "Avg", 'results_avg_n.npy'),
            all_avg_results_c)
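Both examples delegate the graph matrices to a data-utilities module du that is not reproduced on this page. The sketch below is an assumption, not the repository's code: the modularity matrix follows the standard definition B = A - k k^T / (2m), and the eta-weighted proximity matrix is illustrated with a common first-order-plus-second-order choice; the actual du.get_proximity_matrix may compute something different.

import numpy as np


def get_modularity_matrix(A):
    """Standard modularity matrix B = A - k k^T / (2m) for a dense adjacency matrix A."""
    A = np.asarray(A, dtype=float)
    k = A.sum(axis=1)        # node degrees
    two_m = k.sum()          # total degree = 2 * number of edges
    return A - np.outer(k, k) / two_m


def get_proximity_matrix(A, eta):
    """Assumed form: first-order adjacency plus eta-weighted second-order paths, S = A + eta * (A @ A)."""
    A = np.asarray(A, dtype=float)
    return A + eta * (A @ A)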