def evaluate(self, x, y):

        xf = x[y > 0]
        yf = y[y > 0]

        print "Starting Kmeans clustering.."
        clustering = KMeans(n_clusters=100)
        predictions_minibatch = clustering.fit_predict(xf)
        print "done Kmeans clustering.."
        print "homogeneity score = %s" % metrics.homogeneity_score(
            yf, predictions_minibatch)
        e = ClusterEvaluation(yf, predictions_minibatch)
        m = e.printEvaluation()
    def cluster(self, x, clustering=None, n_clusters=100, labels=None):

        if labels is None:
            labels = self.labels

        # print "Starting Kmeans clustering.."
        if clustering is None:
            clustering = KMeans(n_clusters=n_clusters)

        pred = clustering.fit_predict(x)
        # print "done Kmeans clustering.."
        self.clusters = pred

        e = ClusterEvaluation(labels, pred)
        m = e.printEvaluation()

        return m
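
# A minimal, self-contained sketch of the same pattern used in evaluate()/
# cluster() above: fit KMeans on feature vectors and score the predicted
# clusters against gold labels. The synthetic data, sizes and variable names
# below are illustrative assumptions, not part of the original code.
import numpy as np
from sklearn import metrics
from sklearn.cluster import KMeans

rng = np.random.RandomState(0)
features = rng.randn(500, 64)           # e.g. learned relation embeddings
gold = rng.randint(0, 10, size=500)     # gold relation ids

pred = KMeans(n_clusters=10, n_init=10, random_state=0).fit_predict(features)
print("homogeneity score = %s" % metrics.homogeneity_score(gold, pred))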
def train_SN(train_data_file,
             val_data_file,
             test_data_file,
             wordvec_file,
             load_model_name=None,
             save_model_name='SN',
             trainset_loss_type='triplet',
             testset_loss_type='none',
             testset_loss_mask_epoch=3,
             p_cond=0.03,
             p_denoise=1.0,
             rel2id_file=None,
             similarity_file=None,
             dynamic_margin=True,
             margin=1.0,
             louvain_weighted=False,
             level_train=False,
             shallow_to_deep=False,
             same_level_pair_file=None,
             max_len=120,
             pos_emb_dim=5,
             same_ratio=0.06,
             batch_size=64,
             batch_num=10000,
             epoch_num=1,
             val_size=10000,
             select_cluster=None,
             omit_relid=None,
             labeled_sample_num=None,
             squared=True,
             same_level_part=None,
             mask_same_level_epoch=1,
             same_v_adv=False,
             random_init=False,
             seed=42,
             K_num=4,
             evaluate_hierarchy=False,
             train_for_cluster_file=None,
             train_structure_file=None,
             all_structure_file=None,
             to_cluster_data_num=100,
             incre_threshold=0,
             iso_threshold=5,
             avg_link_increment=True,
             modularity_increment=False):
    # preparing saving files.
    if select_cluster is None:
        select_cluster = ['Louvain']
    if load_model_name is not None:

        load_path = os.path.join('model_file',
                                 load_model_name).replace('\\', '/')
    else:
        load_path = None

    save_path = os.path.join('model_file', save_model_name).replace('\\', '/')
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    msger = messager(save_path=save_path,
                     types=[
                         'train_data_file', 'val_data_file', 'test_data_file',
                         'load_model_name', 'save_model_name',
                         'trainset_loss_type', 'testset_loss_type',
                         'testset_loss_mask_epoch', 'p_cond', 'p_denoise',
                         'same_ratio', 'labeled_sample_num'
                     ],
                     json_name='train_msg.json')
    msger.record_message([
        train_data_file, val_data_file, test_data_file, load_model_name,
        save_model_name, trainset_loss_type, testset_loss_type,
        testset_loss_mask_epoch, p_cond, p_denoise, same_ratio,
        labeled_sample_num
    ])
    msger.save_json()

    print('-----Data Loading-----')
    # for train
    dataloader_train = dataloader(train_data_file,
                                  wordvec_file,
                                  rel2id_file,
                                  similarity_file,
                                  same_level_pair_file,
                                  max_len=max_len,
                                  random_init=random_init,
                                  seed=seed)
    # for cluster never seen instances
    dataloader_train_for_cluster = dataloader(train_for_cluster_file,
                                              wordvec_file,
                                              rel2id_file,
                                              similarity_file,
                                              same_level_pair_file,
                                              max_len=max_len,
                                              random_init=random_init,
                                              seed=seed)
    # for validation, to select best model
    dataloader_val = dataloader(val_data_file,
                                wordvec_file,
                                rel2id_file,
                                similarity_file,
                                max_len=max_len)
    # for cluster
    dataloader_test = dataloader(test_data_file,
                                 wordvec_file,
                                 rel2id_file,
                                 similarity_file,
                                 max_len=max_len)
    word_emb_dim = dataloader_train._word_emb_dim_()
    word_vec_mat = dataloader_train._word_vec_mat_()
    print('word_emb_dim is {}'.format(word_emb_dim))

    # compile model
    print('-----Model Initializing-----')

    rsn = RSN(word_vec_mat=word_vec_mat,
              max_len=max_len,
              pos_emb_dim=pos_emb_dim,
              dropout=0.2)

    if load_path:
        rsn.load_model(load_path)
    rsn = cudafy(rsn)
    rsn.set_train_op(batch_size=batch_size,
                     train_loss_type=trainset_loss_type,
                     testset_loss_type=testset_loss_type,
                     p_cond=p_cond,
                     p_denoise=p_denoise,
                     p_mult=0.02,
                     squared=squared,
                     margin=margin)

    print('-----Validation Data Preparing-----')

    val_data, val_data_label = dataloader_val._part_data_(100)

    print('-----Clustering Data Preparing-----')
    train_hierarchy_structure_info = json.load(open(train_structure_file))
    all_hierarchy_structure_info = json.load(open(all_structure_file))
    train_hierarchy_cluster_list, gt_hierarchy_cluster_list, train_data_num, test_data_num, train_data, train_label, test_data, test_label = prepare_cluster_list(
        dataloader_train_for_cluster, dataloader_test,
        train_hierarchy_structure_info, all_hierarchy_structure_info,
        to_cluster_data_num)
    batch_num_list = [batch_num] * epoch_num
    # start_cluster_accuracy = 0.5
    best_validation_f1 = 0
    least_epoch = 1
    best_step = 0
    for epoch in range(epoch_num):
        msger = messager(save_path=save_path,
                         types=[
                             'batch_num', 'train_tp', 'train_fp', 'train_fn',
                             'train_tn', 'train_l', 'test_tp', 'test_fp',
                             'test_fn', 'test_tn', 'test_l'
                         ],
                         json_name='SNmsg' + str(epoch) + '.json')
        # for cluster
        # test_data, test_data_label = dataloader_test._data_()
        print('------epoch {}------'.format(epoch))
        print('max batch num to train is {}'.format(batch_num_list[epoch]))
        for i in range(1, batch_num_list[epoch] + 1):
            to_cluster_flag = False
            if trainset_loss_type.startswith("triplet"):
                if level_train and epoch < mask_same_level_epoch:
                    if i <= 1 / same_level_part * batch_num_list[epoch]:
                        rsn.train_triplet_same_level(
                            dataloader_train,
                            batch_size=batch_size,
                            K_num=4,
                            dynamic_margin=dynamic_margin,
                            level=1,
                            same_v_adv=same_v_adv)
                    elif i <= 2 / same_level_part * batch_num_list[epoch]:
                        rsn.train_triplet_same_level(
                            dataloader_train,
                            batch_size=batch_size,
                            K_num=4,
                            dynamic_margin=dynamic_margin,
                            level=2,
                            same_v_adv=same_v_adv)
                    else:
                        rsn.train_triplet_loss(dataloader_train,
                                               batch_size=batch_size,
                                               dynamic_margin=dynamic_margin)
                else:
                    rsn.train_triplet_loss(dataloader_train,
                                           batch_size=batch_size,
                                           dynamic_margin=dynamic_margin)
            else:
                rsn.train_RSN(dataloader_train,
                              dataloader_test,
                              batch_size=batch_size)

            if i % 100 == 0:
                print('temp_batch_num: ', i, ' total_batch_num: ',
                      batch_num_list[epoch])
            if i % 1000 == 0 and epoch >= least_epoch:
                print(save_model_name, 'epoch:', epoch)

                print('Validation:')
                cluster_result, cluster_msg = Louvain_no_isolation(
                    dataset=val_data,
                    edge_measure=rsn.pred_X,
                    weighted=louvain_weighted)
                cluster_eval_b3 = ClusterEvaluation(
                    val_data_label,
                    cluster_result).printEvaluation(print_flag=False)

                cluster_eval_new = ClusterEvaluationNew(
                    val_data_label,
                    cluster_result).printEvaluation(print_flag=False)
                two_f1 = cluster_eval_new['F1']
                if two_f1 > best_validation_f1:  # acc
                    to_cluster_flag = True
                    best_step = i
                    best_validation_f1 = two_f1

            if to_cluster_flag:
                # if True:
                if 'Louvain' in select_cluster:
                    print('-----Top Down Hierarchy Louvain Clustering-----')
                    if avg_link_increment:
                        # link_th_list = [0.5, 1, 2, 5, 10, 15, 20, 50, 100]
                        # link_th_list = [0.05, 0.08, 0.1, 0.12, 0.15, 0.18, 0.2, 0.3, 0.4]
                        # link_th_list = [i * 0.02 for i in range(1, 100)]
                        link_th_list = [0.05]
                        cluster_result, cluster_msg = Louvain_no_isolation(
                            dataset=test_data,
                            edge_measure=rsn.pred_X,
                            weighted=louvain_weighted)

                        predicted_cluster_dict_list = Top_Down_Louvain_with_test_cluster_done_avg_link_list(
                            # predicted_cluster_dict_list = Louvain_with_test_cluster_done_avg_link_list(
                            cluster_result,
                            train_data_num,
                            test_data_num,
                            train_data,
                            test_data,
                            train_hierarchy_cluster_list,
                            rsn.pred_X,
                            link_th_list)
                        best_hyper_score = 0
                        best_eval_info = None
                        for predicted_cluster_dict in predicted_cluster_dict_list:
                            predicted_cluster_list = predicted_cluster_dict[
                                'list']
                            evaluation = HierarchyClusterEvaluation(
                                gt_hierarchy_cluster_list,
                                predicted_cluster_list, test_data_num)
                            eval_info = evaluation.printEvaluation()
                            if eval_info['total_F1'] > best_hyper_score:
                                best_eval_info = eval_info
                                best_hyper_score = eval_info['total_F1']

                    rsn.save_model(save_path=save_path,
                                   global_step=i + epoch * batch_num)
                    print('model and clustering messages saved.')
        print('End: The model is:', save_model_name, trainset_loss_type,
              testset_loss_type, 'p_cond is:', p_cond)
    print("seed:", seed)
    print("best step:", best_step)
    print("new metric Info:")
    print("F1(%)")
    print(best_eval_info['match_f1'] * 100)

    print("taxonomy Info:")
    print("Precision(%); Recall(%); F1(%)")
    print(round(best_eval_info['taxonomy_precision'] * 100, 3), "; ",
          round(best_eval_info['taxonomy_recall'] * 100, 3), "; ",
          round(best_eval_info['taxonomy_F1'] * 100, 3))

    print("Total Info:")
    print("Precision(%); Recall(%); F1(%)")
    print(round(best_eval_info['total_precision'] * 100, 3), "; ",
          round(best_eval_info['total_recall'] * 100, 3), "; ",
          round(best_eval_info['total_F1'] * 100, 3))
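
# Hypothetical invocation sketch for the train_SN variant above. Every file
# path and hyper-parameter value here is a placeholder chosen for illustration,
# not taken from the original repository; epoch_num is set to 2 so that the
# validation/clustering branch (epoch >= least_epoch) actually runs.
if __name__ == '__main__':
    train_SN(train_data_file='data/train.json',
             val_data_file='data/val.json',
             test_data_file='data/test.json',
             wordvec_file='data/word_vec.json',
             rel2id_file='data/rel2id.json',
             similarity_file='data/similarity.json',
             same_level_pair_file='data/same_level_pair.json',
             train_for_cluster_file='data/train_for_cluster.json',
             train_structure_file='data/train_structure.json',
             all_structure_file='data/all_structure.json',
             save_model_name='SN_hier',
             trainset_loss_type='triplet',
             batch_num=10000,
             epoch_num=2)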
# Example 4
def train_CNN(train_data_file,
              test_data_file,
              wordvec_file,
              load_model_name,
              save_model_name,
              loss_type,
              max_len=120,
              pos_emb_dim=5,
              batch_size=100,
              batch_num=1000,
              epoch_num=1,
              val_size=1000):

    # preparing saving files
    save_path = os.path.join('model_file',save_model_name).replace('\\','/')
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    # train data loading
    print('-----Data Loading-----')
    dataloader_train = dataloader(train_data_file, wordvec_file, max_len=max_len)
    dataloader_test = dataloader(test_data_file, wordvec_file, max_len=max_len)
    word_emb_dim = dataloader_train._word_emb_dim_()
    word_vec_mat = dataloader_train._word_vec_mat_()
    print('word_emb_dim is {}'.format(word_emb_dim))

    # compile model
    print('-----Model Initializing-----')
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 0.5
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    cnn = CNN(session=sess,word_vec_mat=word_vec_mat,max_len=max_len, pos_emb_dim=pos_emb_dim,dropout=0.2)
    cnn.set_ph(batch_size=batch_size)
    cnn.set_train_op(loss_type=loss_type,p_mult=0.02)
    cnn.init_model()

    print('-----Testing Data Preparing-----')

    # preparing testing samples
    val_testset_input, val_testset_label = dataloader_test.next_batch_cnn(val_size)
    val_trainset_input, val_trainset_label = dataloader_train.next_batch_cnn(val_size)

    # initializing parameters
    batch_num_list = [batch_num]

    for epoch in range(epoch_num):
        # preparing message lists
        msger = messager(save_path=save_path,types=['batch_num','train_acc','train_l','test_acc','test_l'], json_name='CNNmsg'+str(epoch)+'.json')

        print('------epoch {}------'.format(epoch))
        print('max batch num to train is {}'.format(batch_num_list[epoch]))
        for i in range(batch_num_list[epoch]):
            # training
            cnn.train(dataloader_train)

            # testing and saving
            if i % 10 == 0:
                print('temp_batch_num: ', i,' total_batch_num: ', batch_num_list[epoch])
            if i % 100 == 0:
                print('model_name',save_model_name)
                print('trainset:')
                val_trainset_info = cnn.validation(val_trainset_input, val_trainset_label)
                print('testset:')
                val_testset_info = cnn.validation(val_testset_input, val_testset_label)
                msger.record_message((i,)+val_trainset_info+val_testset_info)
                msger.save_json()
                cnn.save_model(save_path=save_path,global_step=i)
                print('model and messages saved.')

        # Clustering
        print('Data to cluster loading...')
        msger = messager(save_path=save_path,types=['method','F1','precision','recall','msg'], json_name='cluster_msg'+str(epoch)+'.json')
        data_to_cluster, gt = dataloader_test._data_()
        for i,item in enumerate(gt):
            gt[i]=dataloader_test.relid_dict[item]

        print('-----CNN Clustering-----')
        cluster_result = cnn.pred_X(data_to_cluster)
        cluster_result = np.squeeze(cluster_result).tolist()
        cluster_msg = create_msg(cluster_result)

        print('Evaluating...')
        cluster_eval = ClusterEvaluation(gt,cluster_result).printEvaluation()
        msger.record_message(['CNN',cluster_eval['F1'],cluster_eval['precision'],
            cluster_eval['recall'],cluster_msg])
        msger.save_json()
        print(cluster_eval)
        print('clustering messages saved.')

        print("-----End-----")
        print("The model name is:",save_model_name)
        print("loss type is:",loss_type)
# Example 5
def train_SN(train_data_file,
             val_data_file,
             test_data_file,
             wordvec_file,
             load_model_name=None,
             save_model_name='SN',
             trainset_loss_type='triplet',
             testset_loss_type='none',
             testset_loss_mask_epoch=3,
             p_cond=0.03,
             p_denoise=1.0,
             rel2id_file=None,
             similarity_file=None,
             dynamic_margin=True,
             margin=1.0,
             louvain_weighted=False,
             level_train=False,
             shallow_to_deep=False,
             same_level_pair_file=None,
             max_len=120,
             pos_emb_dim=5,
             same_ratio=0.06,
             batch_size=64,
             batch_num=10000,
             epoch_num=1,
             val_size=10000,
             select_cluster=None,
             omit_relid=None,
             labeled_sample_num=None,
             squared=True,
             same_level_part=None,
             mask_same_level_epoch=1,
             same_v_adv=False,
             random_init=False,
             seed=42,
             K_num=4,
             evaluate_hierarchy=False,
             gt_hierarchy_file=None):
    # preparing saving files.
    if select_cluster is None:
        select_cluster = ['Louvain']
    if load_model_name is not None:
        load_path = os.path.join('model_file',
                                 load_model_name).replace('\\', '/')
    else:
        load_path = None

    save_path = os.path.join('model_file', save_model_name).replace('\\', '/')
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    msger = messager(save_path=save_path,
                     types=[
                         'train_data_file', 'val_data_file', 'test_data_file',
                         'load_model_name', 'save_model_name',
                         'trainset_loss_type', 'testset_loss_type',
                         'testset_loss_mask_epoch', 'p_cond', 'p_denoise',
                         'same_ratio', 'labeled_sample_num'
                     ],
                     json_name='train_msg.json')
    msger.record_message([
        train_data_file, val_data_file, test_data_file, load_model_name,
        save_model_name, trainset_loss_type, testset_loss_type,
        testset_loss_mask_epoch, p_cond, p_denoise, same_ratio,
        labeled_sample_num
    ])
    msger.save_json()
    # if not trainset_loss_type.startswith("triplet"):
    #     batch_size = 100
    # train data loading
    print('-----Data Loading-----')
    dataloader_train = dataloader(train_data_file,
                                  wordvec_file,
                                  rel2id_file,
                                  similarity_file,
                                  same_level_pair_file,
                                  max_len=max_len,
                                  random_init=random_init,
                                  seed=seed)
    dataloader_val = dataloader(val_data_file,
                                wordvec_file,
                                rel2id_file,
                                similarity_file,
                                max_len=max_len)
    dataloader_test = dataloader(test_data_file,
                                 wordvec_file,
                                 rel2id_file,
                                 similarity_file,
                                 max_len=max_len)
    word_emb_dim = dataloader_train._word_emb_dim_()
    word_vec_mat = dataloader_train._word_vec_mat_()
    print('word_emb_dim is {}'.format(word_emb_dim))

    # compile model
    print('-----Model Initializing-----')

    rsn = RSN(word_vec_mat=word_vec_mat,
              max_len=max_len,
              pos_emb_dim=pos_emb_dim,
              dropout=0.2)
    # rsn
    if load_path:
        rsn.load_model(load_path)
    rsn = cudafy(rsn)
    rsn.set_train_op(batch_size=batch_size,
                     train_loss_type=trainset_loss_type,
                     testset_loss_type=testset_loss_type,
                     p_cond=p_cond,
                     p_denoise=p_denoise,
                     p_mult=0.02,
                     squared=squared,
                     margin=margin)

    print('-----Validation Data Preparing-----')

    val_data, val_data_label = dataloader_val._part_data_(100)

    # initializing parameters
    batch_num_list = [batch_num] * epoch_num
    # clustering_test_time = np.arange(19999, batch_num, 20000).tolist()
    msger_cluster = messager(
        save_path=save_path,
        types=['method', 'temp_batch_num', 'F1', 'precision', 'recall', 'msg'],
        json_name='cluster_msg.json')
    # best_validation_accuracy = 0.9
    least_epoch = 1
    best_step = 0
    print_flag = True
    best_validation_f1 = 0
    for epoch in range(epoch_num):
        test_data, test_data_label = dataloader_test._data_()
        print('------epoch {}------'.format(epoch))
        print('max batch num to train is {}'.format(batch_num_list[epoch]))
        for i in range(1, batch_num_list[epoch] + 1):
            to_cluster_flag = False
            if trainset_loss_type.startswith("triplet"):
                if level_train and epoch < mask_same_level_epoch:
                    if i <= 1 / same_level_part * batch_num_list[epoch]:
                        rsn.train_triplet_same_level(
                            dataloader_train,
                            batch_size=batch_size,
                            K_num=4,
                            dynamic_margin=dynamic_margin,
                            level=1,
                            same_v_adv=same_v_adv)
                    elif i <= 2 / same_level_part * batch_num_list[epoch]:
                        rsn.train_triplet_same_level(
                            dataloader_train,
                            batch_size=batch_size,
                            K_num=4,
                            dynamic_margin=dynamic_margin,
                            level=2,
                            same_v_adv=same_v_adv)
                    else:
                        rsn.train_triplet_loss(dataloader_train,
                                               batch_size=batch_size,
                                               dynamic_margin=dynamic_margin)
                else:
                    rsn.train_triplet_loss(dataloader_train,
                                           batch_size=batch_size,
                                           dynamic_margin=dynamic_margin)
            else:
                rsn.train_RSN(dataloader_train,
                              dataloader_test,
                              batch_size=batch_size)

            if i % 500 == 0:
                print('temp_batch_num: ', i, ' total_batch_num: ',
                      batch_num_list[epoch])
            if i % 1000 == 0 and epoch >= least_epoch:
                print(save_model_name, 'epoch:', epoch)

                print('Validation:')
                cluster_result, cluster_msg = Louvain_no_isolation(
                    dataset=val_data,
                    edge_measure=rsn.pred_X,
                    weighted=louvain_weighted)
                cluster_eval_new = ClusterEvaluationNew(
                    val_data_label,
                    cluster_result).printEvaluation(print_flag=False)

                cluster_eval_b3 = ClusterEvaluation(
                    val_data_label,
                    cluster_result).printEvaluation(print_flag=False)
                # two_f1 = cluster_eval_new['F1'] + cluster_eval_b3['F1']
                two_f1 = cluster_eval_b3['F1']
                if two_f1 > best_validation_f1:  # acc
                    to_cluster_flag = True
                    best_step = i
                    best_validation_f1 = two_f1

            if to_cluster_flag:
                if 'Louvain' in select_cluster:
                    print('-----Louvain Clustering-----')
                    if not evaluate_hierarchy:
                        cluster_result, cluster_msg = Louvain_no_isolation(
                            dataset=test_data,
                            edge_measure=rsn.pred_X,
                            weighted=louvain_weighted)
                        cluster_eval_new = ClusterEvaluationNew(
                            test_data_label, cluster_result).printEvaluation(
                                print_flag=print_flag)
                        # msger_cluster.record_message(['Louvain_New', i, cluster_eval_new['F1'], cluster_msg])
                        # print("New Metric", cluster_eval)
                        cluster_eval_b3 = ClusterEvaluation(
                            test_data_label, cluster_result).printEvaluation(
                                print_flag=print_flag, extra_info=True)

                        # msger_cluster.record_message(['Louvain', i, cluster_eval_b3['F1'], cluster_eval_b3['precision'],
                        #                               cluster_eval_b3['recall'], cluster_msg])
                        best_cluster_eval_new = cluster_eval_new
                        best_cluster_eval_b3 = cluster_eval_b3
                    rsn.save_model(save_path=save_path,
                                   global_step=i + epoch * batch_num)
                    print('model and clustering messages saved.')

        print('End: The model is:', save_model_name, trainset_loss_type,
              testset_loss_type, 'p_cond is:', p_cond)
    print("best_cluster_eval_new", best_cluster_eval_new)
    print("best_cluster_eval_b3", best_cluster_eval_b3)
    print("seed:", seed)
    return best_cluster_eval_new, best_cluster_eval_b3
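
# Louvain_no_isolation above builds a graph over the instances, scores every
# pair with the trained model (edge_measure=rsn.pred_X) and runs Louvain
# community detection on it. The helper below is a rough, standalone sketch of
# that idea using networkx; the graph construction, the 0.5 threshold and the
# assumption that higher scores mean "same relation" are illustrative choices,
# not the repository's implementation. Requires networkx >= 2.8.
import itertools
from networkx import Graph
from networkx.algorithms import community as nx_community

def louvain_cluster(instances, pair_score, threshold=0.5):
    """Cluster instances via Louvain communities on a thresholded similarity graph."""
    graph = Graph()
    graph.add_nodes_from(range(len(instances)))
    for i, j in itertools.combinations(range(len(instances)), 2):
        score = pair_score(instances[i], instances[j])
        if score >= threshold:
            graph.add_edge(i, j, weight=score)
    communities = nx_community.louvain_communities(graph, weight='weight', seed=0)
    labels = [0] * len(instances)
    for cluster_id, members in enumerate(communities):
        for member in members:
            labels[member] = cluster_id
    return labels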
# Example 6
def train_SN(train_data_file,
             val_data_file,
             test_data_file,
             wordvec_file,
             load_model_name=None,
             save_model_name='SN',
             trainset_loss_type='cross',
             testset_loss_type='none',
             testset_loss_mask_epoch=3,
             p_cond=0.03,
             p_denoise=1.0,
             max_len=120,
             pos_emb_dim=5,
             same_ratio=0.06,
             batch_size=100,
             batch_num=100000,
             epoch_num=1,
             val_size=10000,
             select_cluster='Louvain',
             omit_relid=None,
             labeled_sample_num=None):

    # preparing saving files
    if load_model_name is not None:
        load_path = os.path.join('model_file',
                                 load_model_name).replace('\\', '/')
    else:
        load_path = None
    save_path = os.path.join('model_file', save_model_name).replace('\\', '/')
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    msger = messager(save_path=save_path,
                     types=[
                         'train_data_file', 'val_data_file', 'test_data_file',
                         'load_model_name', 'save_model_name',
                         'trainset_loss_type', 'testset_loss_type',
                         'testset_loss_mask_epoch', 'p_cond', 'p_denoise',
                         'same_ratio', 'labeled_sample_num'
                     ],
                     json_name='train_msg.json')
    msger.record_message([
        train_data_file, val_data_file, test_data_file, load_model_name,
        save_model_name, trainset_loss_type, testset_loss_type,
        testset_loss_mask_epoch, p_cond, p_denoise, same_ratio,
        labeled_sample_num
    ])
    msger.save_json()

    # train data loading
    print('-----Data Loading-----')
    dataloader_train = dataloader(train_data_file,
                                  wordvec_file,
                                  max_len=max_len)
    if omit_relid is not None and omit_relid >= 4:
        dataloader_train.select_relation(
            np.arange(2, omit_relid + 1, 1).tolist())
    if labeled_sample_num is not None:
        dataloader_train.select_sample_num(labeled_sample_num)
    dataloader_testset = dataloader(val_data_file,
                                    wordvec_file,
                                    max_len=max_len)
    dataloader_test = dataloader(test_data_file, wordvec_file, max_len=max_len)
    word_emb_dim = dataloader_train._word_emb_dim_()
    word_vec_mat = dataloader_train._word_vec_mat_()
    print('word_emb_dim is {}'.format(word_emb_dim))

    # compile model
    print('-----Model Initializing-----')
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    SN = VASN(session=sess,
              word_vec_mat=word_vec_mat,
              max_len=max_len,
              pos_emb_dim=pos_emb_dim,
              dropout=0.2)
    SN.set_ph(batch_size=batch_size)
    SN.set_train_op(trainset_loss_type=trainset_loss_type,
                    testset_loss_type=testset_loss_type,
                    p_cond=p_cond,
                    p_denoise=p_denoise,
                    p_mult=0.02)
    SN.init_model(load_path)

    print('-----Testing Data Preparing-----')

    # preparing testing samples
    val_testset_left_input, val_testset_right_input, val_testset_data_label = \
        dataloader_testset.next_batch(val_size, same_ratio=same_ratio)
    val_trainset_left_input, val_trainset_right_input, val_trainset_data_label = \
        dataloader_train.next_batch(val_size, same_ratio=same_ratio)

    # initializing parameters
    batch_num_list = [batch_num] * epoch_num
    clustering_test_time = np.arange(19999, batch_num, 20000).tolist()
    msger_cluster = messager(
        save_path=save_path,
        types=['method', 'temp_batch_num', 'F1', 'precision', 'recall', 'msg'],
        json_name='cluster_msg.json')

    for epoch in range(epoch_num):
        if epoch < testset_loss_mask_epoch:
            SN.set_train_op(trainset_loss_type=trainset_loss_type,
                            testset_loss_type='none',
                            p_cond=p_cond,
                            p_denoise=p_denoise,
                            p_mult=0.02)
        else:
            SN.set_train_op(trainset_loss_type=trainset_loss_type,
                            testset_loss_type=testset_loss_type,
                            p_cond=p_cond,
                            p_denoise=p_denoise,
                            p_mult=0.02)

        # preparing message lists
        msger = messager(save_path=save_path,
                         types=[
                             'batch_num', 'train_tp', 'train_fp', 'train_fn',
                             'train_tn', 'train_l', 'test_tp', 'test_fp',
                             'test_fn', 'test_tn', 'test_l'
                         ],
                         json_name='SNmsg' + str(epoch) + '.json')

        data_to_cluster, gt = dataloader_test._data_()

        print('------epoch {}------'.format(epoch))
        print('max batch num to train is {}'.format(batch_num_list[epoch]))
        for i in range(batch_num_list[epoch]):
            # training
            if omit_relid is not None and omit_relid == 0:
                SN.train_unsup(dataloader_train,
                               dataloader_testset,
                               batch_size=batch_size,
                               same_ratio=same_ratio)
            else:
                SN.train(dataloader_train,
                         dataloader_testset,
                         batch_size=batch_size,
                         same_ratio=same_ratio)

            # testing and saving
            if i % 100 == 0:
                print('temp_batch_num: ', i, ' total_batch_num: ',
                      batch_num_list[epoch])
            if i % 1000 == 0:
                print(save_model_name, 'epoch:', epoch)
                print('trainset:')
                val_trainset_info = SN.validation(val_trainset_left_input,
                                                  val_trainset_right_input,
                                                  val_trainset_data_label)
                print('testset:')
                val_testset_info = SN.validation(val_testset_left_input,
                                                 val_testset_right_input,
                                                 val_testset_data_label)
                msger.record_message((i, ) + val_trainset_info +
                                     val_testset_info)
                msger.save_json()
                SN.save_model(save_path=save_path, global_step=i)
                print('model and messages saved.')
            if i in clustering_test_time or i == batch_num_list[epoch] - 1:
                if 'Louvain' in select_cluster:
                    print('-----Louvain Clustering-----')
                    cluster_result, cluster_msg = Louvain_no_isolation(
                        dataset=data_to_cluster, edge_measure=SN.pred_X)
                    cluster_eval = ClusterEvaluation(
                        gt, cluster_result).printEvaluation()
                    msger_cluster.record_message([
                        'Louvain', i, cluster_eval['F1'],
                        cluster_eval['precision'], cluster_eval['recall'],
                        cluster_msg
                    ])
                    msger_cluster.save_json()
                    print(cluster_eval)
                    print('clustering messages saved.')

                if 'HAC' in select_cluster:
                    print('-----HAC Clustering-----')
                    cluster_result, cluster_msg = complete_HAC(
                        dataset=data_to_cluster,
                        HAC_dist=SN.pred_X,
                        k=len(list(set(gt))))
                    cluster_eval = ClusterEvaluation(
                        gt, cluster_result).printEvaluation()
                    msger_cluster.record_message([
                        'HAC', i, cluster_eval['F1'],
                        cluster_eval['precision'], cluster_eval['recall'],
                        cluster_msg
                    ])
                    msger_cluster.save_json()
                    print(cluster_eval)
                    print('clustering messages saved.')

        print('End: The model is:', save_model_name, trainset_loss_type,
              testset_loss_type, 'p_cond is:', p_cond)
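
# complete_HAC above performs complete-linkage hierarchical agglomerative
# clustering, with the model's pairwise predictor as HAC_dist and k set to the
# number of gold relations. A standalone sketch of the same clustering step
# with scipy is shown below; it assumes a precomputed square distance matrix
# instead of the repository's on-the-fly distance callback.
import numpy as np
from scipy.cluster.hierarchy import fcluster, linkage
from scipy.spatial.distance import squareform

def complete_linkage_hac(dist_matrix, k):
    """Cut a complete-linkage dendrogram into k flat clusters."""
    condensed = squareform(np.asarray(dist_matrix), checks=False)
    dendrogram = linkage(condensed, method='complete')
    return fcluster(dendrogram, t=k, criterion='maxclust').tolist()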
# Example 7
def load_cluster(train_data_file,
                 test_data_file,
                 wordvec_file,
                 load_model_name=None,
                 all_structure_file=None,
                 trainset_loss_type='triplet',
                 testset_loss_type='none',
                 p_cond=0.03,
                 to_cluster_data_num=100,
                 p_denoise=1.0,
                 rel2id_file=None,
                 similarity_file=None,
                 margin=1.0,
                 save_cluster=False,
                 louvain_weighted=False,
                 same_level_pair_file=None,
                 train_for_cluster_file=None,
                 train_structure_file=None,
                 test_infos_file=None,
                 val_hier=False,
                 golden=False,
                 max_len=120,
                 pos_emb_dim=5,
                 batch_size=64,
                 squared=True,
                 random_init=False,
                 seed=42):
    if load_model_name is not None:
        load_path = os.path.join('model_file',
                                 load_model_name).replace('\\', '/')
    else:
        load_path = None

    print('-----Data Loading-----')
    # for train
    dataloader_train = dataloader(train_data_file,
                                  wordvec_file,
                                  rel2id_file,
                                  similarity_file,
                                  same_level_pair_file,
                                  max_len=max_len,
                                  random_init=random_init,
                                  seed=seed)
    # for cluster never seen instances
    dataloader_train_for_cluster = dataloader(train_for_cluster_file,
                                              wordvec_file,
                                              rel2id_file,
                                              similarity_file,
                                              same_level_pair_file,
                                              max_len=max_len)

    dataloader_test = dataloader(test_data_file,
                                 wordvec_file,
                                 rel2id_file,
                                 similarity_file,
                                 max_len=max_len)
    word_emb_dim = dataloader_train._word_emb_dim_()
    word_vec_mat = dataloader_train._word_vec_mat_()
    print('word_emb_dim is {}'.format(word_emb_dim))

    # compile model
    print('-----Model Initializing-----')

    rsn = RSN(word_vec_mat=word_vec_mat,
              max_len=max_len,
              pos_emb_dim=pos_emb_dim,
              dropout=0)
    rsn.set_train_op(batch_size=batch_size,
                     train_loss_type=trainset_loss_type,
                     testset_loss_type=testset_loss_type,
                     p_cond=p_cond,
                     p_denoise=p_denoise,
                     p_mult=0.02,
                     squared=squared,
                     margin=margin)

    if load_path:
        rsn.load_model(load_path + "/RSNbest.pt")
    rsn = cudafy(rsn)
    rsn.eval()
    print('-----Louvain Clustering-----')

    if val_hier:
        print('-----Top Down Hierarchy Expansion-----')
        train_hierarchy_structure_info = json.load(open(train_structure_file))
        all_hierarchy_structure_info = json.load(open(all_structure_file))
        train_hierarchy_cluster_list, gt_hierarchy_cluster_list, train_data_num, test_data_num, train_data, train_label, test_data, test_label = prepare_cluster_list(
            dataloader_train_for_cluster, dataloader_test,
            train_hierarchy_structure_info, all_hierarchy_structure_info,
            to_cluster_data_num)
        link_th_list = [0.2]

        if golden:
            link_th_list = [0.3]
            predicted_cluster_dict_list = Top_Down_Louvain_with_test_cluster_done_avg_link_list_golden(
                gt_hierarchy_cluster_list, train_data_num, test_data_num,
                train_data, test_data, train_hierarchy_cluster_list,
                rsn.pred_X, link_th_list)
        else:
            cluster_result, cluster_msg = Louvain_no_isolation(
                dataset=test_data,
                edge_measure=rsn.pred_X,
                weighted=louvain_weighted)
            predicted_cluster_dict_list = Top_Down_Louvain_with_test_cluster_done_avg_link_list(
                cluster_result, train_data_num, test_data_num, train_data,
                test_data, train_hierarchy_cluster_list, rsn.pred_X,
                link_th_list)
            if save_cluster:
                json.dump(cluster_result, open("cluster_result.json", "w"))
                pickle.dump(predicted_cluster_dict_list,
                            open("predicted_cluster_dict_list.pkl", "wb"))
                pickle.dump(gt_hierarchy_cluster_list,
                            open("gt_hierarchy_cluster_list.pkl", "wb"))
                print("saved results!")
        for predicted_cluster_dict in predicted_cluster_dict_list:
            print("\n\n")
            predicted_cluster_list = predicted_cluster_dict['list']
            print("Isolation threhold", predicted_cluster_dict['iso'])
            print("Average Link threhold", predicted_cluster_dict['link_th'])
            pickle.dump(predicted_cluster_list,
                        open("predicted_cluster_list.pkl", "wb"))
            evaluation = HierarchyClusterEvaluation(gt_hierarchy_cluster_list,
                                                    predicted_cluster_list,
                                                    test_data_num)
            eval_info = evaluation.printEvaluation(print_flag=True)
            HierarchyClusterEvaluationTypes(gt_hierarchy_cluster_list,
                                            predicted_cluster_list,
                                            test_infos_file,
                                            rel2id_file).printEvaluation()
    else:
        test_data, test_data_label = dataloader_test._data_()
        cluster_result, cluster_msg = Louvain_no_isolation(
            dataset=test_data,
            edge_measure=rsn.pred_X,
            weighted=louvain_weighted)

        cluster_eval_b3 = ClusterEvaluation(
            test_data_label, cluster_result).printEvaluation(print_flag=True,
                                                             extra_info=True)

        ClusterEvaluationB3Types(test_data_label, cluster_result,
                                 test_infos_file,
                                 rel2id_file).printEvaluation()
        print("100 times")

        print({k: v * 100 for k, v in cluster_eval_b3.items()})
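
# If save_cluster=True in the hierarchy branch above, the clustering artifacts
# are dumped to cluster_result.json, predicted_cluster_dict_list.pkl and
# gt_hierarchy_cluster_list.pkl. Below is a minimal sketch for reloading those
# dumps and re-running the hierarchy evaluation offline; it assumes
# HierarchyClusterEvaluation from this repository is importable and that
# test_data_num matches the run that produced the dumps.
import json
import pickle
# from evaluation import HierarchyClusterEvaluation  # import path is an assumption

def rescore_saved_clusters(test_data_num):
    # per-instance cluster ids, as passed to ClusterEvaluation above
    flat_cluster_ids = json.load(open('cluster_result.json'))
    predicted_cluster_dict_list = pickle.load(
        open('predicted_cluster_dict_list.pkl', 'rb'))
    gt_hierarchy_cluster_list = pickle.load(
        open('gt_hierarchy_cluster_list.pkl', 'rb'))
    print('flat Louvain clusters:', len(set(flat_cluster_ids)))
    for predicted_cluster_dict in predicted_cluster_dict_list:
        evaluation = HierarchyClusterEvaluation(gt_hierarchy_cluster_list,
                                                predicted_cluster_dict['list'],
                                                test_data_num)
        evaluation.printEvaluation(print_flag=True)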