def get_embeddings(self, sents):
    """Encode *sents* into embedding vectors.

    Tokenizes the sentences (padded and truncated to ``self.max_length``),
    runs the model with gradient tracking disabled, and returns the first
    element of ``get_vectors`` moved to CPU as a NumPy array.
    """
    encoded = self.tokenizer(
        sents,
        max_length=self.max_length,
        padding=True,
        truncation=True,
        return_tensors='pt',
    )
    # Inference only — no autograd bookkeeping needed.
    with torch.no_grad():
        embeddings = get_vectors(self.model, encoded)[0]
    return embeddings.cpu().numpy()
def load_features(options, net):
    """Populate the module-level ``features_dict`` with embedding vectors.

    Gathers the nodes at both ends of ``options.eval_edge_type``, looks up
    their vectors from the keyed-vectors file at ``options.vectors_path``
    (missing ids filled per ``missing_rule="random"``), and maps each
    returned node id to its vector row in the global ``features_dict``.
    """
    global features_dict
    features_dict = {}
    started_at = time.time()
    logger.info('\t loading embedding features...')
    # Nodes of the start type followed by nodes of the target type.
    node_ids = (net.get_nodes(node_type=options.eval_edge_type[0])
                + net.get_nodes(node_type=options.eval_edge_type[1]))
    node_ids, vectors = utils.get_vectors(
        utils.get_KeyedVectors(options.vectors_path),
        node_ids,
        missing_rule="random")
    # Row i of the returned matrix belongs to node_ids[i].
    features_dict.update(zip(node_ids, vectors))
    logger.info(
        '\t loading embedding features completed in {}s'.format(time.time() - started_at))
def get_embeddings(self, sents, whitening=False):
    """Encode *sents* into sentence vectors.

    Tokenizes, runs the model without gradients, and pools via
    ``get_vectors(..., pool=self.pool)``.  When ``whitening`` is true the
    vectors are whitened with ``self.kernel``/``self.bias`` (kernel cut to
    ``self.n_components`` columns) and normalized by
    ``transform_and_normalize``; otherwise they are L2-normalized.
    """
    tokenized_sents = self.tokenizer(sents, max_length=self.max_length,
                                     padding=True, truncation=True,
                                     return_tensors='pt')
    with torch.no_grad():
        vecs = get_vectors(self.model, tokenized_sents, pool=self.pool)[0]
    vecs = vecs.cpu().numpy()
    if whitening:
        kernel, bias = self.kernel, self.bias
        # Keep only the leading n_components directions of the whitening kernel.
        kernel = kernel[:, :self.n_components]
        vecs = transform_and_normalize(vecs, kernel, bias)
        return vecs
    # Non-whitened path: plain L2 normalization.
    vecs = vector_l2_normlize(vecs)
    return vecs
else: num_classes = 10 batch_size = 32 st = 100 import os import time import pickle if os.path.isfile(pickle_file): x_f_train, x_s_train, x_len_train, y_train, \ x_f_dev, x_s_dev, x_len_dev, y_dev, \ x_dict, word_vectors, \ max_sen_len, max_seg_len, max_seq_len = pickle.load(open(pickle_file, 'rb')) else: x_dict = utils.get_flat_dict(train_file, x_index) word_vectors = utils.get_vectors(x_dict, vec_file) x_f_train, y_train, len_f_train = utils.get_flat_data(train_file, x_index, y_index, x_dict, num_classes) x_s_train, _, x_len_train, max_train_sen, len_s_train = utils.get_hier_data(train_file, x_index, y_index, x_dict, num_classes) x_f_dev, y_dev, len_f_dev = utils.get_flat_data(dev_file, x_index, y_index, x_dict, num_classes) x_s_dev, _, x_len_dev, max_dev_sen, len_s_dev = utils.get_hier_data(dev_file, x_index, y_index, x_dict, num_classes, max_train_sen) x_s_train, _, x_len_train, max_train_sen, len_s_train = utils.get_hier_data(train_file, x_index, y_index, x_dict, num_classes, max_dev_sen) max_sen_len = max(max_train_sen, max_dev_sen) max_seg_len = max(len_s_train, len_s_dev) max_seq_len = max(len_f_train, len_f_dev) pickle.dump([x_f_train, x_s_train, x_len_train, y_train, \ x_f_dev, x_s_dev, x_len_dev, y_dev, \ x_dict, word_vectors, \ max_sen_len, max_seg_len, max_seq_len], \ open(pickle_file, 'wb'), protocol=4) vocab_size = len(x_dict)
most_similar_pro.extend( utils.most_sim_cos(vectors_pro, query_text, args.num_responses)) most_similar_con.extend( utils.most_sim_cos(vectors_con, query_text, args.num_responses)) infile_pro.close() infile_con.close() elif args.sim_model == 'word2vec': path = pre_path + "embeddings/GoogleNews-vectors-negative300.bin" model = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True) texts_all.append(query_text) if args.responses_per_stance == 0: vectors = utils.get_vectors(model, texts_all) most_similar = utils.most_sim_cos(vectors, query_text, args.num_responses) else: texts_pro.append(query_text) texts_con.append(query_text) vectors_pro = utils.get_vectors(model, texts_pro) most_similar_pro = utils.most_sim_cos(vectors_pro, query_text, args.num_responses) vectors_con = utils.get_vectors(model, texts_con) most_similar_con = utils.most_sim_cos(vectors_con, query_text, args.num_responses) elif args.sim_model == 'glove': path = pre_path + "embeddings/glove.840B.300dword2vec.txt" # path = '/Users/youmna/Documents/Phd_year2/Coherence_acl/GC_wsj/glove.6B.100d.word2vec.txt' model = utils.read_emb_text(path)
u_test, p_test, u_test_count, p_test_count, \ x_dict, u_dict, p_dict, u_freq, p_freq, \ x_vectors = cPickle.load(open(pickle_file, 'rb')) else: x_dict = utils.get_dict(train_file, x_index) u_dict, p_dict, u_freq, p_freq = utils.get_up_dict(train_file, u_index, p_index) x_train, i_train, x_train_len, y_train, u_train, p_train, \ u_train_count, p_train_count = utils.get_flat_data(train_file, x_index, y_index, x_dict, num_classes, u_index, p_index, u_dict, p_dict, u_freq, p_freq) x_dev, i_dev, x_dev_len, y_dev, u_dev, p_dev, \ u_dev_count, p_dev_count = utils.get_flat_data(dev_file, x_index, y_index, x_dict, num_classes, u_index, p_index, u_dict, p_dict, u_freq, p_freq) x_test, i_test, x_test_len, y_test, u_test, p_test, \ u_test_count, p_test_count = utils.get_flat_data(test_file, x_index, y_index, x_dict, num_classes, u_index, p_index, u_dict, p_dict, u_freq, p_freq) x_vectors = utils.get_vectors(x_dict, vec_file, emb_size) cPickle.dump([x_train, i_train, x_train_len, y_train, \ u_train, p_train, u_train_count, p_train_count, \ x_dev, i_dev, x_dev_len, y_dev, \ u_dev, p_dev, u_dev_count, p_dev_count, \ x_test, i_test, x_test_len, y_test, \ u_test, p_test, u_test_count, p_test_count, \ x_dict, u_dict, p_dict, u_freq, p_freq, \ x_vectors], open(pickle_file, 'wb'), protocol=4) vocab_size = len(x_dict) target_len = num_classes user_vocab_size = len(u_dict) prod_vocab_size = len(p_dict) user_mean_freq = np.mean([u_freq[x] for x in u_freq]) prod_mean_freq = np.mean([p_freq[x] for x in p_freq])
# They are usually found in the same location of a sentence, # have the same parts of speech, and thus when # learning the word vectors, you end up getting similar weights. In the next week # we will go over how you learn them, but for now let's just enjoy using them. # # **Instructions:** Run the cell below. # In[143]: words = [ 'oil', 'gas', 'happy', 'sad', 'city', 'town', 'village', 'country', 'continent', 'petroleum', 'joyful' ] # given a list of words and the embeddings, it returns a matrix with all the embeddings X = get_vectors(word_embeddings, words) print('You have 11 words each of 300 dimensions thus X.shape is:', X.shape) # In[144]: # We have done the plotting for you. Just run this cell. result = compute_pca(X, 2) plt.scatter(result[:, 0], result[:, 1]) for i, word in enumerate(words): plt.annotate(word, xy=(result[i, 0] - 0.05, result[i, 1] + 0.1)) plt.show() # **What do you notice?** #
def eval_once(options):
    """Run one offline link-prediction evaluation (MAP and precision@K).

    Builds the eval and except(train) networks, loads the trained embedding
    matrix for node ids ``0..N-1``, evaluates (optionally with node sampling
    repeated ``options.repeated_times`` times, fanned out over
    ``options.eval_workers`` processes), and writes a report to
    ``options.link_prediction_path``.  Returns early if ``check_rebuild``
    says the result already exists.

    Side effects: sets module globals ``features_matrix``, ``net_eval``,
    ``net_except``, ``SAMPLE_NODES``, ``SAMPLE_RULE``, ``METIRC``, ``PREC_K``
    (read by the worker bodies), and writes the report file.
    """
    # NOTE(review): 'METIRC' is a typo for 'METRIC', but it is a module-level
    # global presumably read elsewhere — renaming needs a coordinated change.
    global features_matrix, net_eval, net_except, SAMPLE_NODES, SAMPLE_RULE, METIRC, PREC_K
    if not utils.check_rebuild(options.link_prediction_path,
                               descrip='link_prediction',
                               always_rebuild=options.always_rebuild):
        return
    logger.info('eval case: link-prediction ...')
    logger.info('\t save_path: {}'.format(options.link_prediction_path))
    logger.info('\t eval_data_path: {}'.format(options.eval_data_path))
    logger.info('\t except_data_path: {}'.format(options.except_data_path))
    logger.info('\t data_format: {}'.format(options.data_format))
    logger.info('\t metrics: MAP and precise@K')
    logger.info('\t max_index for precise@K: {}'.format(
        options.precK_max_index))
    logger.info('\t similarity_metric: {}'.format(options.similarity_metric))
    logger.info('\t eval_online: {}'.format(options.eval_online))
    logger.info('\t eval_interval: {}s'.format(options.eval_interval))
    logger.info('\t sample_nodes: {}'.format(options.sample_nodes))
    logger.info('\t sample_nodes_rule: {}'.format(options.sample_nodes_rule))
    logger.info('\t repeat {} times'.format(options.repeated_times))
    logger.info('\t eval_workers: {}'.format(options.eval_workers))

    # The network containing the edges to predict.
    logger.info("constructing eval network ...")
    net_eval = network.construct_network(data_path=options.eval_data_path,
                                         data_format=options.data_format,
                                         print_net_info=False,
                                         isdirected=options.isdirected)
    eval_net_nodes_size = net_eval.get_nodes_size()
    eval_net_edges_size = net_eval.get_edges_size()
    logger.info("eval_net_nodes_size = {}".format(eval_net_nodes_size))
    logger.info("eval_net_edges_size = {}".format(eval_net_edges_size))

    # The training network whose edges must be excluded from prediction.
    logger.info("constructing except(train) network ...")
    net_except = network.construct_network(data_path=options.except_data_path,
                                           data_format=options.data_format,
                                           print_net_info=False,
                                           isdirected=options.isdirected)
    except_net_nodes_size = net_except.get_nodes_size()
    except_net_edges_size = net_except.get_edges_size()
    logger.info("except_net_nodes_size = {}".format(except_net_nodes_size))
    logger.info("except_net_edges_size = {}".format(except_net_edges_size))

    id_list = list(range(eval_net_nodes_size))  # must be [0,1,2,3,...]
    SAMPLE_NODES = options.sample_nodes
    SAMPLE_RULE = options.sample_nodes_rule
    METIRC = options.similarity_metric
    PREC_K = options.precK_max_index

    # loading features_matrix(already trained)
    logger.info('\t reading embedding vectors from file {}'.format(
        options.vectors_path))
    time_start = time.time()
    features_matrix = utils.get_vectors(
        utils.get_KeyedVectors(options.vectors_path), id_list)
    logger.info(
        '\t reading embedding vectors completed in {}s'.format(time.time() -
                                                               time_start))
    logger.info('total loaded nodes: {}'.format(
        np.size(features_matrix, axis=0)))
    logger.info('the embedding dimension: {}'.format(
        np.size(features_matrix, axis=1)))

    # Mirror the run configuration into the report file header.
    fr = open(options.link_prediction_path, 'w')
    fr.write('eval case: link-prediction ...\n')
    fr.write('\t save_path: {}\n'.format(options.link_prediction_path))
    fr.write('\t eval_data_path: {}\n'.format(options.eval_data_path))
    fr.write('\t except_data_path: {}\n'.format(options.except_data_path))
    fr.write('\t data_format: {}\n'.format(options.data_format))
    fr.write('\t metrics: MAP and precise@K\n')
    fr.write('\t max_index for precise@K: {}\n'.format(
        options.precK_max_index))
    fr.write('\t similarity_metric: {}\n'.format(options.similarity_metric))
    fr.write('\t eval_online: {}\n'.format(options.eval_online))
    fr.write('\t eval_interval: {}s\n'.format(options.eval_interval))
    fr.write('\t sample_nodes: {}\n'.format(options.sample_nodes))
    fr.write('\t sample_nodes_rule: {}\n'.format(options.sample_nodes_rule))
    fr.write('\t repeat {} times\n'.format(options.repeated_times))
    fr.write('\t eval_workers: {}\n'.format(options.eval_workers))
    fr.write("eval_net_nodes_size = {}\n".format(eval_net_nodes_size))
    fr.write("eval_net_edges_size = {}\n".format(eval_net_edges_size))
    fr.write("except_net_nodes_size = {}\n".format(except_net_nodes_size))
    fr.write("except_net_edges_size = {}\n".format(except_net_edges_size))
    fr.write('total loaded nodes: {}\n'.format(np.size(features_matrix,
                                                       axis=0)))
    fr.write('the embedding dimension: {}\n'.format(
        np.size(features_matrix, axis=1)))

    if options.sample_nodes > 0:
        if options.eval_workers > 1 and options.repeated_times > 1:
            # speed up by using multi-process
            # Distribute repeated_times as evenly as possible over the workers.
            logger.info("\t allocating repeat_times to workers ...")
            if options.repeated_times <= options.eval_workers:
                times_per_worker = [1 for _ in range(options.repeated_times)]
            else:
                div, mod = divmod(options.repeated_times,
                                  options.eval_workers)
                times_per_worker = [div for _ in range(options.eval_workers)]
                # The first `mod` workers get one extra repetition.
                for idx in range(mod):
                    times_per_worker[idx] = times_per_worker[idx] + 1
            assert sum(
                times_per_worker
            ) == options.repeated_times, 'workers allocating failed: %d != %d' % (
                sum(times_per_worker), options.repeated_times)
            logger.info("\t using {} processes for evaling:".format(
                len(times_per_worker)))
            for idx, rep_times in enumerate(times_per_worker):
                logger.info("\t process-{}: repeat {} times".format(
                    idx, rep_times))
            ret_list = []  # [[MAP, precisionK_list], ... ]
            with ProcessPoolExecutor(
                    max_workers=options.eval_workers) as executor:
                for ret in executor.map(_sample_thread_body, times_per_worker):
                    ret_list.extend(ret)
            if len(ret_list) != options.repeated_times:
                logger.warning(
                    "warning: eval unmatched repeated_times: {} != {}".format(
                        len(ret_list), options.repeated_times))
        else:
            # Single process handles all repetitions itself.
            ret_list = _sample_thread_body(options.repeated_times)
    else:
        # no sampling, no repeat!
        ret_list = [_eval(net_eval, net_except)]  # [[MAP, precisionK_list]]

    # NOTE(review): these two messages look swapped — the 'expected
    # repeated_times' text is written on the sampling branch and the
    # 'due to the sample nodes = {}' text when sample_nodes == 0; verify intent.
    if options.sample_nodes > 0:
        fr.write(
            'expected repeated_times: {}, actual repeated_times: {}, mean results as follows:\n'
            .format(options.repeated_times, len(ret_list)))
    else:
        fr.write(
            'due to the sample nodes = {}, so actual repeated_times = {}, results as follows:\n'
            .format(options.sample_nodes, len(ret_list)))

    # Mean over all repetitions; precision@K is averaged element-wise.
    mean_MAP = np.mean([ret[0] for ret in ret_list])
    mean_precisionK = np.mean([ret[1] for ret in ret_list], axis=0)
    fr.write('\t\t MAP = {}\n'.format(mean_MAP))
    for k in range(options.precK_max_index):
        if k < len(mean_precisionK):
            fr.write('\t\t precisionK_{} = {}\n'.format(
                k + 1, mean_precisionK[k]))
        else:
            # Fewer precision entries than requested K — pad with None.
            fr.write('\t\t precisionK_{} = None\n'.format(k + 1))

    # Per-repetition breakdown.
    fr.write('details:\n')
    for repeat in range(len(ret_list)):
        fr.write('\t repeated {}/{}:\n'.format(repeat + 1, len(ret_list)))
        MAP = ret_list[repeat][0]
        precisionK_list = ret_list[repeat][1]
        fr.write('\t\t MAP = {}\n'.format(MAP))
        for k in range(options.precK_max_index):
            if k < len(precisionK_list):
                fr.write('\t\t precisionK_{} = {}\n'.format(
                    k + 1, precisionK_list[k]))
            else:
                fr.write('\t\t precisionK_{} = None\n'.format(k + 1))
    fr.write(
        '\neval case: link_prediction completed in {}s.'.format(time.time() -
                                                                time_start))
    fr.close()
    logger.info(
        'eval case: link_prediction completed in {}s.'.format(time.time() -
                                                              time_start))
    return
def eval_once(options):
    """Run one offline clustering evaluation (k-means, NMI metric).

    Loads labeled node ids and their embedding vectors, runs
    ``_cluster_thread_body`` ``options.repeated_times`` times (fanned out
    over ``options.eval_workers`` processes when both > 1), and writes the
    mean and per-repetition NMI to ``options.cluster_path``.  Returns early
    if ``check_rebuild`` says the result already exists.

    Side effects: sets module globals ``features_matrix``, ``labels_matrix``
    and ``LABEL_SIZE`` (read by the worker bodies) and writes the report file.
    """
    global features_matrix, labels_matrix, LABEL_SIZE
    if not utils.check_rebuild(options.cluster_path,
                               descrip='cluster',
                               always_rebuild=options.always_rebuild):
        return
    logger.info('eval case: cluster...')
    logger.info('\t save_path: {}'.format(options.cluster_path))
    logger.info('\t cluster: kmeans')
    logger.info('\t multilabel_rule: {}'.format(options.multilabel_rule))
    logger.info('\t eval_online: {}'.format(options.eval_online))
    logger.info('\t eval_workers: {}'.format(options.eval_workers))
    logger.info('\t repeat {} times'.format(options.repeated_times))

    logger.info('\t reading labeled data from file {}'.format(
        options.label_path))
    time_start = time.time()
    id_list, labels_list = utils.get_labeled_data(
        options.label_path, multilabel_rule=options.multilabel_rule)
    features_matrix, labels_list = utils.get_vectors(
        utils.get_KeyedVectors(options.vectors_path), id_list, labels_list)
    # Only the first label of each node is used for clustering evaluation.
    labels_matrix = np.array([item[0] for item in labels_list])
    LABEL_SIZE = options.label_size
    logger.info('\t reading labeled data completed in {}s'.format(time.time() -
                                                                  time_start))
    logger.info('\t total labeled data size: {}'.format(
        np.size(features_matrix, axis=0)))
    logger.info('\t total labels size: {}'.format(options.label_size))

    # cluster
    # Mirror the run configuration into the report file header.
    fr = open(options.cluster_path, 'w')
    fr.write('eval case: cluster...\n')
    fr.write('\t save_path: {}\n'.format(options.cluster_path))
    fr.write('\t cluster: kmeans\n')
    fr.write('\t multilabel_rule: {}\n'.format(options.multilabel_rule))
    fr.write('\t eval_online: {}\n'.format(options.eval_online))
    fr.write('\t eval_workers: {}\n'.format(options.eval_workers))
    fr.write('\t repeat {} times\n'.format(options.repeated_times))
    fr.write('\t total labeled data size: {}\n'.format(
        np.size(features_matrix, axis=0)))
    fr.write('\t total labels size: {}\n'.format(options.label_size))
    for i in range(options.label_size):
        fr.write('\t\t label {}: {}\n'.format(i, np.sum(labels_matrix == i)))

    if options.eval_workers > 1 and options.repeated_times > 1:
        # speed up by using multi-process
        # Distribute repeated_times as evenly as possible over the workers.
        logger.info("\t allocating repeat_times to workers ...")
        if options.repeated_times <= options.eval_workers:
            times_per_worker = [1 for _ in range(options.repeated_times)]
        else:
            div, mod = divmod(options.repeated_times, options.eval_workers)
            times_per_worker = [div for _ in range(options.eval_workers)]
            # The first `mod` workers get one extra repetition.
            for idx in range(mod):
                times_per_worker[idx] = times_per_worker[idx] + 1
        assert sum(
            times_per_worker
        ) == options.repeated_times, 'workers allocating failed: %d != %d' % (
            sum(times_per_worker), options.repeated_times)
        logger.info("\t using {} processes for evaling:".format(
            len(times_per_worker)))
        for idx, rep_times in enumerate(times_per_worker):
            logger.info("\t process-{}: repeat {} times".format(
                idx, rep_times))
        # NOTE(review): the bare `except` retries the identical pool run once
        # and swallows the original error — if the second attempt fails the
        # real cause is lost. Looks like a workaround for flaky workers;
        # confirm before touching.
        try:
            nmi_list = []  # (train_ratio, macro, micro)
            with ProcessPoolExecutor(
                    max_workers=options.eval_workers) as executor:
                for ret in executor.map(_cluster_thread_body,
                                        times_per_worker):
                    nmi_list.extend(ret)
        except:
            nmi_list = []  # (train_ratio, macro, micro)
            with ProcessPoolExecutor(
                    max_workers=options.eval_workers) as executor:
                for ret in executor.map(_cluster_thread_body,
                                        times_per_worker):
                    nmi_list.extend(ret)
        if len(nmi_list) != options.repeated_times:
            logger.warning(
                "warning: eval unmatched repeated_times: {} != {}".format(
                    len(nmi_list), options.repeated_times))
    else:
        # Single process; same retry-once-on-any-error pattern as above.
        try:
            nmi_list = _cluster_thread_body(options.repeated_times)
        except:
            nmi_list = _cluster_thread_body(options.repeated_times)

    mean_nmi = sum(nmi_list) / float(len(nmi_list))
    fr.write(
        'expected repeated_times: {}, actual repeated_times: {}, mean results as follows:\n'
        .format(options.repeated_times, len(nmi_list)))
    fr.write('\t\t NMI = {}\n'.format(mean_nmi))
    fr.write('details:\n')
    for repeat in range(len(nmi_list)):
        fr.write('\t repeated {}/{}: NMI = {}\n'.format(
            repeat + 1, len(nmi_list), nmi_list[repeat]))
    fr.write('\neval case: cluster completed in {}s.'.format(time.time() -
                                                             time_start))
    fr.close()
    logger.info('eval case: cluster completed in {}s.'.format(time.time() -
                                                              time_start))
    return
def eval_once(options):
    """Run one offline visualization evaluation (t-SNE scatter plot).

    Loads labeled nodes of ``options.eval_node_type`` and their embedding
    vectors, plots them in 2-D via ``plot_embedding_in_2D``, and writes a
    report (including the clustering-center-distance score, CCD) to
    ``options.visualization_path``.  Returns early if ``check_rebuild``
    says the result already exists.
    """
    # visual_dir, visual_file = os.path.split(options.visualization_path)
    if not utils.check_rebuild(options.visualization_path,
                               descrip='visualization',
                               always_rebuild=options.always_rebuild):
        return
    # print logger
    logger.info('eval case: visualization...')
    logger.info('\t data_dir = {}'.format(options.data_dir))
    logger.info('\t data_name = {}'.format(options.data_name))
    logger.info('\t isdirected = {}'.format(options.isdirected))
    logger.info('\t label_path = {}'.format(options.label_path))
    logger.info('\t label_size = {}'.format(options.label_size))
    logger.info('\t eval_node_type: {}'.format(options.eval_node_type))
    logger.info('\t save_path: {}\n'.format(options.visualization_path))
    logger.info('\t method: t-SNE')
    logger.info('\t multilabel_rule: {}'.format(options.multilabel_rule))
    logger.info('\t marker_size: {}'.format(options.marker_size))
    logger.info('\t eval_online: {}'.format(options.eval_online))

    # get embedding vectors and markersize
    logger.info('\t reading labeled data from file {}'.format(options.label_path))
    time_start = time.time()
    id_list, labels_list = utils.get_labeled_data(options.label_path,
                                                  type=options.eval_node_type,
                                                  multilabel_rule=options.multilabel_rule,
                                                  type_filepath=os.path.join(options.data_dir, options.data_name + ".nodes"))
    id_list, features_matrix, labels_list = utils.get_vectors(utils.get_KeyedVectors(options.vectors_path), id_list, labels_list)
    # Only the first label of each node is used for coloring the plot.
    labels_matrix = np.array([item[0] for item in labels_list])
    logger.info('\t reading labeled data completed in {}s'.format(time.time() - time_start))
    logger.info('\t total labeled data size: {}'.format(np.size(features_matrix,axis=0)))
    logger.info('\t the labels data embedding_dimension: {}'.format(np.size(features_matrix,axis=1)))
    logger.info('\t total labels size: {}'.format(options.label_size))
    for i in range(options.label_size):
        logger.info('\t\t label {}: {}'.format(i, np.sum(labels_matrix == i)))

    # Mirror the run configuration into the report file header.
    fr = open(options.visualization_path, 'w')
    fr.write('eval case: visualization...\n')
    fr.write('\t data_dir = {}\n'.format(options.data_dir))
    fr.write('\t data_name = {}\n'.format(options.data_name))
    fr.write('\t isdirected = {}\n'.format(options.isdirected))
    fr.write('\t label_path = {}\n'.format(options.label_path))
    fr.write('\t label_size = {}\n'.format(options.label_size))
    fr.write('\t eval_node_type: {}\n'.format(options.eval_node_type))
    fr.write('\t save_path: {}\n\n'.format(options.visualization_path))
    fr.write('\t method: t-SNE\n')
    fr.write('\t multilabel_rule: {}\n'.format(options.multilabel_rule))
    fr.write('\t marker_size: {}\n'.format(options.marker_size))
    fr.write('\t eval_online: {}\n'.format(options.eval_online))
    fr.write('\t total labeled data size: {}\n'.format(np.size(features_matrix, axis=0)))
    fr.write('\t the labels data embedding_dimension: {}\n'.format(np.size(features_matrix, axis=1)))
    fr.write('\t total labels size: {}\n'.format(options.label_size))
    for i in range(options.label_size):
        fr.write('\t\t label {}: {}\n'.format(i, np.sum(labels_matrix==i)))

    # Figure is saved next to the report; name encodes the embedding dimension.
    figure_name = "visualization_" + str(np.size(features_matrix, axis=1))
    figure_path = os.path.join(os.path.split(options.visualization_path)[0],figure_name)
    CCD = plot_embedding_in_2D(Markersize=options.marker_size,
                               features_matrix=features_matrix,
                               labels_matrix=labels_matrix,
                               label_size=options.label_size,
                               figure_path = figure_path)
    fr.write('\n figure_path: {}\n'.format(figure_path))
    fr.write(' clustering_center_distance_sim: {}\n'.format(CCD))
    fr.write('\neval case: visualization completed in {}s\n ======================'.format(time.time() - time_start))
    fr.close()
    logger.info('eval case: visualization completed in {}s\n ======================'.format(time.time() - time_start))
def eval_online(options):
    """Continuously visualize embeddings as training checkpoints appear.

    Polls the checkpoint directory next to ``options.vectors_path``; each
    time a newer checkpoint is seen, re-reads the embedding vectors, plots
    them in 2-D, logs the CCD score to a per-checkpoint report file and a
    TensorBoard summary, and keeps a copy of the best-scoring checkpoint.
    Runs until a ``RUN_SUCCESS`` marker file appears.

    Synchronization with the trainer uses lock-marker files:
    ``<vectors_path>.writing`` (trainer is writing, do not read) and
    ``<vectors_path>.reading_visualization_<type>`` (this reader is active).
    """
    visual_dir = os.path.split(options.visualization_path)[0]
    if not utils.check_rebuild(visual_dir,
                               descrip='visualization',
                               always_rebuild=options.always_rebuild):
        return
    if not os.path.exists(visual_dir):
        os.makedirs(visual_dir)
    # print logger
    logger.info('eval case: visualization...')
    logger.info('\t data_dir = {}'.format(options.data_dir))
    logger.info('\t data_name = {}'.format(options.data_name))
    logger.info('\t isdirected = {}'.format(options.isdirected))
    logger.info('\t label_path = {}'.format(options.label_path))
    logger.info('\t label_size = {}'.format(options.label_size))
    logger.info('\t eval_node_type: {}'.format(options.eval_node_type))
    logger.info('\t save_dir: {}\n'.format(visual_dir))
    logger.info('\t method: t-SNE')
    logger.info('\t multilabel_rule: {}'.format(options.multilabel_rule))
    logger.info('\t marker_size: {}'.format(options.marker_size))
    logger.info('\t eval_online: {}'.format(options.eval_online))
    logger.info('\t eval_interval: {}s'.format(options.eval_interval))
    logger.info('\t reading labeled data from file {}'.format(options.label_path))

    # get embedding vectors and markersize
    time_start = time.time()
    # NOTE(review): 'totoal' is a typo for 'total' kept as-is (local names).
    id_list_totoal, labels_list_totoal = utils.get_labeled_data(options.label_path,
                                                                type=options.eval_node_type,
                                                                multilabel_rule=options.multilabel_rule,
                                                                type_filepath=os.path.join(options.data_dir, options.data_name + ".nodes"))
    logger.info('\t reading labeled data completed in {}s'.format(time.time() - time_start))
    logger.info('\t total labeled data size: {}'.format(len(id_list_totoal)))
    logger.info('\t total labels size: {}'.format(options.label_size))

    # Summary report accumulating one CCD line per evaluated checkpoint.
    fr_total = open(options.visualization_path, 'w')
    fr_total.write('eval case: visualization...\n')
    fr_total.write('\t data_dir = {}\n'.format(options.data_dir))
    fr_total.write('\t data_name = {}\n'.format(options.data_name))
    fr_total.write('\t isdirected = {}\n'.format(options.isdirected))
    fr_total.write('\t label_path = {}\n'.format(options.label_path))
    fr_total.write('\t label_size = {}\n'.format(options.label_size))
    fr_total.write('\t eval_node_type: {}\n'.format(options.eval_node_type))
    fr_total.write('\t save_dir: {}\n\n'.format(visual_dir))
    fr_total.write('\t method: t-SNE\n')
    fr_total.write('\t multilabel_rule: {}\n'.format(options.multilabel_rule))
    fr_total.write('\t marker_size: {}\n'.format(options.marker_size))
    fr_total.write('\t eval_online: {}\n'.format(options.eval_online))
    fr_total.write('\t eval_interval: {}s\n'.format(options.eval_interval))
    fr_total.write('\t total labeled data size: {}\n'.format(len(id_list_totoal)))
    fr_total.write('\t total labels size: {}\n'.format(options.label_size))
    fr_total.write('\t results(CCD-clustering_center_distance_sim):\n'
                   '=============================================================\n')
    fr_total.write('finish_time\tckpt\tCCD\n')

    # Seed TensorBoard with a zero CCD point at step 0.
    last_step = 0
    summary_writer = tf.summary.FileWriter(visual_dir, tf.Graph())
    summary = tf.Summary()
    summary.value.add(tag='CCD', simple_value=0.)
    summary_writer.add_summary(summary, last_step)
    best_CCD = 0

    # Wait until the trainer has produced a first checkpoint.
    ckpt_dir = os.path.join(os.path.split(options.vectors_path)[0], 'ckpt')
    ckpt = tf.train.get_checkpoint_state(ckpt_dir)
    while (not (ckpt and ckpt.model_checkpoint_path)):
        logger.info("model and vectors not exist, waiting...")
        time.sleep(options.eval_interval)
        ckpt = tf.train.get_checkpoint_state(ckpt_dir)

    reading = options.vectors_path + ".reading_visualization_{}".format(options.eval_node_type)
    writing = options.vectors_path + ".writing"
    while (options.eval_online):
        # Inner loop: wait for a new checkpoint we can safely read.
        while True:
            ckpt = tf.train.get_checkpoint_state(ckpt_dir)
            cur_step = int(ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])
            if cur_step <= last_step or (not os.path.exists(options.vectors_path)) or os.path.exists(writing):
                # Trainer finished successfully — nothing more to evaluate.
                if os.path.exists(os.path.join(os.path.split(options.vectors_path)[0], "RUN_SUCCESS")):
                    return
                time.sleep(options.eval_interval)
                continue
            # ready for reading
            logger.info("\t declare for reading ...")
            open(reading, "w")  # declare
            # Grace period, then re-check that the trainer didn't start
            # writing while we were declaring.
            time.sleep(30)
            ckpt = tf.train.get_checkpoint_state(ckpt_dir)
            cur_step = int(ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])
            if cur_step <= last_step or (not os.path.exists(options.vectors_path)) or os.path.exists(writing):
                os.remove(reading)  # undeclare
                logger.info("\t confliction! undeclare and waiting ...")
                time.sleep(options.eval_interval)
                continue
            break

        logger.info("\t eval ckpt-{}.......".format(cur_step))
        time_start = time.time()
        logger.info('\t reading embedding vectors from file {}'.format(options.vectors_path))
        id_list, features_matrix, labels_list = utils.get_vectors(utils.get_KeyedVectors(options.vectors_path), id_list_totoal, labels_list_totoal)
        os.remove(reading)  # synchrolock for multi-process
        logger.info("\t done for reading ...")
        # Only the first label of each node is used for coloring the plot.
        labels_matrix = np.array([item[0] for item in labels_list])
        logger.info('\t reading labeled data completed in {}s'.format(time.time() - time_start))
        logger.info('\t total labeled data size: {}'.format(np.size(features_matrix, axis=0)))
        logger.info('\t total labels size: {}'.format(options.label_size))
        for i in range(options.label_size):
            logger.info('\t\t label {}: {}'.format(i, np.sum(labels_matrix == i)))

        # visualization
        # Per-checkpoint report file, suffixed with the checkpoint step.
        fr = open(options.visualization_path + '.{}'.format(cur_step), 'w')
        fr.write('eval case: visualization...\n')
        fr.write('\t data_dir = {}\n'.format(options.data_dir))
        fr.write('\t data_name = {}\n'.format(options.data_name))
        fr.write('\t isdirected = {}\n'.format(options.isdirected))
        fr.write('\t label_path = {}\n'.format(options.label_path))
        fr.write('\t label_size = {}\n'.format(options.label_size))
        fr.write('\t eval_node_type: {}\n'.format(options.eval_node_type))
        fr.write('\t method: t-SNE\n')
        fr.write('\t multilabel_rule: {}\n'.format(options.multilabel_rule))
        fr.write('\t marker_size: {}\n'.format(options.marker_size))
        fr.write('\t eval_online: {}\n'.format(options.eval_online))
        fr.write('\t eval_interval: {}s\n'.format(options.eval_interval))
        fr.write('\t total labeled data size: {}\n'.format(np.size(features_matrix, axis=0)))
        fr.write('\t total labels size: {}\n'.format(options.label_size))
        for i in range(options.label_size):
            fr.write('\t\t label {}: {}\n'.format(i, np.sum(labels_matrix == i)))

        fr_total.write('%s ckpt-%-9d: ' % (time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())), cur_step))
        summary = tf.Summary()
        figure_name = "visualization_" + str(np.size(features_matrix, axis=1)) + '.{}'.format(cur_step)
        figure_path = os.path.join(visual_dir, figure_name)
        CCD = plot_embedding_in_2D(Markersize=options.marker_size,
                                   features_matrix=features_matrix,
                                   labels_matrix=labels_matrix,
                                   label_size=options.label_size,
                                   figure_path=figure_path)
        fr.write('\n figure_path: {}\n'.format(figure_path))
        fr.write(' clustering_center_distance_sim:{}\n'.format(CCD))
        fr.write('\neval case: visualization completed in {}s\n ======================'.format(time.time() - time_start))
        fr.close()
        fr_total.write('%.4f\n' % CCD)
        fr_total.flush()
        summary.value.add(tag='CCD', simple_value=CCD)
        summary_writer.add_summary(summary, cur_step)
        summary_writer.flush()
        logger.info('visualization completed in {}s\n================================='.format(time.time() - time_start))

        # copy ckpt-files according to last mean_Micro_F1 (0.9 ratio).
        if CCD > best_CCD:
            best_CCD = CCD
            ckptIsExists = os.path.exists(os.path.join(ckpt_dir, 'model.ckpt-%d.index' % cur_step))
            if ckptIsExists:
                fr_best = open(os.path.join(visual_dir, 'best_ckpt.info'), 'w')
            else:
                # Checkpoint already rotated away — append a note instead of
                # overwriting the previous best's info.
                fr_best = open(os.path.join(visual_dir, 'best_ckpt.info'), 'a')
                fr_best.write("Note:the model.ckpt-best is the remainings of last best_ckpt!\n"
                              "the current best_ckpt model is loss, but the result is:\n")
            fr_best.write("best_CCD: {}\n".format(best_CCD))
            fr_best.write("best_ckpt: ckpt-{}\n".format(cur_step))
            fr_best.close()
            if ckptIsExists:
                # Copy the three checkpoint files to stable 'model.ckpt-best' names.
                sourceFile = os.path.join(ckpt_dir, 'model.ckpt-%d.data-00000-of-00001' % cur_step)
                targetFile = os.path.join(visual_dir, 'model.ckpt-best.data-00000-of-00001')
                if os.path.exists(targetFile):
                    os.remove(targetFile)
                shutil.copy(sourceFile, targetFile)
                sourceFile = os.path.join(ckpt_dir, 'model.ckpt-%d.index' % cur_step)
                targetFile = os.path.join(visual_dir, 'model.ckpt-best.index')
                if os.path.exists(targetFile):
                    os.remove(targetFile)
                shutil.copy(sourceFile, targetFile)
                sourceFile = os.path.join(ckpt_dir, 'model.ckpt-%d.meta' % cur_step)
                targetFile = os.path.join(visual_dir, 'model.ckpt-best.meta')
                if os.path.exists(targetFile):
                    os.remove(targetFile)
                shutil.copy(sourceFile, targetFile)
        last_step = cur_step

    fr_total.close()
    summary_writer.close()
    return
#source_lex = "en-it.test" source_lex = "es-na.test" words_scr_lexicon, words_trg_lexicon = utils.get_lexicon(source_lex) print("size of lexicon:", set(words_scr_lexicon).__len__()) #print(len(words_scr_lexicon), len(words_trg_lexicon)) source_str = "es.n2v" target_str = "na.n2v" #source_str = "es.norm.n2v" #source_str = "en.fst" source_vec = utils.open_file(source_str) words_src, source_vec = utils.read(source_vec, is_zipped=False) # lista de palabras en español del lexicon semilla eval_src = list(set(words_scr_lexicon)) src_vec = utils.get_vectors(eval_src, words_src, source_vec) print("source_vec: " + source_str) #print(src_vec.shape) #target_str = "it.fst" target_vec = utils.open_file(target_str) words_trg, target_vec = utils.read(target_vec, is_zipped=False) print("target_vec: " + target_str) #eval_it = list(set(it)) #trg_vec = get_vectors(eval_it, words_it, it_vec) #print(target_vec.shape) test_vectors = src_vec
front_vector, draw_img, img_name) img_src = "./test_imgs" info_src = "./test_info" input_size = 224 test = Test( MobileNetV2, "./results/MobileNetV2_1.0_classes_66_input_224/snapshot/MobileNetV2_1.0_classes_66_input_224_epoch_50_front_vector.pkl", 66) transform = transforms.Compose([ transforms.ToTensor(), transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) ]) imgs = sorted(os.listdir(img_src)) infos = sorted(os.listdir(info_src)) for img, info in zip(imgs, infos): img_name = img img = cv.imread(os.path.join(img_src, img)) draw_img = img.copy() img = cv.resize(img, (224, 224)) img = transform(img) img = img.unsqueeze(0) info = get_info_from_txt(os.path.join(info_src, info)) front_vector, _ = get_vectors(info) test.test_per_img(img, draw_img, front_vector, img_name)
vectors_con = {d: vectors_all[d] for d in prop_content_texts_con} most_similar_con = utils.most_sim_cos(vectors_con, text, args.num_responses) vectors_neutral = { d: vectors_all[d] for d in prop_content_texts_neutral } most_similar_neutral = utils.most_sim_cos(vectors_neutral, text, args.num_responses) elif args.sim_model == 'word2vec': path = pre_path + "embeddings/GoogleNews-vectors-negative300.bin" model = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True) if args.responses_per_stance == 0: vectors = utils.get_vectors(model, prop_content_texts) most_similar = utils.most_sim_cos(vectors, text, args.num_responses) else: vectors_pro = utils.get_vectors(model, prop_content_texts_pro) most_similar_pro = utils.most_sim_cos(vectors_pro, text, args.num_responses) vectors_con = utils.get_vectors(model, prop_content_texts_con) most_similar_con = utils.most_sim_cos(vectors_con, text, args.num_responses) vectors_neutral = utils.get_vectors(model, prop_content_texts_neutral) most_similar_neutral = utils.most_sim_cos(vectors_neutral, text, args.num_responses) elif args.sim_model == 'glove': path = pre_path + "embeddings/glove.840B.300dword2vec.txt"
# Run the trained MobileNetV2 head-pose model over every image in
# ./test_imgs, pairing each image with its annotation file in ./test_info
# (files are matched by sorted order — assumes matching filenames; verify).
img_src = "./test_imgs"
info_src = "./test_info"
input_size = 224
test = Test(
    MobileNetV2,
    "./results/MobileNetV2_1.5_classes_66_input_224/snapshot/MobileNetV2_1.5_classes_66_input_224_epoch_37.pkl",
    66)
# Standard ImageNet normalization applied after ToTensor.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])
imgs = sorted(os.listdir(img_src))
infos = sorted(os.listdir(info_src))
for img, info in zip(imgs, infos):
    img_name = img
    img = cv.imread(os.path.join(img_src, img))
    # Keep an unmodified copy for drawing/visualization.
    draw_img = img.copy()
    img = cv.resize(img, (224, 224))
    img = transform(img)
    img = img.unsqueeze(0)  # add batch dimension
    info = get_info_from_txt(os.path.join(info_src, info))
    # NOTE(review): ground-truth vectors are computed but only printed —
    # test_per_img is called without them here; confirm that is intended.
    front_vector, right_vector, up_vector = get_vectors(info)
    print(front_vector)
    test.test_per_img(img, draw_img, img_name)
def eval_online(options):
    """Online clustering (k-means / NMI) evaluation loop.

    Polls the training checkpoint directory and, each time a newer
    checkpoint plus its exported embedding vectors appear, clusters the
    labeled nodes and logs the NMI score (repeated runs, optionally
    across worker processes).  Writes a summary file, per-checkpoint
    result files, TensorBoard summaries, and keeps a copy of the
    best-scoring checkpoint.

    NOTE(review): coordination with the trainer uses sentinel files
    (`*.reading_cluster`, `*.writing`, `RUN_SUCCESS`) rather than real
    locks — presumably both processes honour this protocol; verify
    against the writer side.
    """
    global features_matrix, labels_matrix, LABEL_SIZE
    cluster_dir = os.path.split(options.cluster_path)[0]
    # Skip entirely if results exist and a rebuild was not requested.
    if not utils.check_rebuild(cluster_dir, descrip='cluster',
                               always_rebuild=options.always_rebuild):
        return
    if not os.path.exists(cluster_dir):
        os.makedirs(cluster_dir)
    logger.info('eval case: cluster...')
    logger.info('\t save_path: {}'.format(options.cluster_path))
    logger.info('\t cluster: kmeans')
    logger.info('\t multilabel_rule: {}'.format(options.multilabel_rule))
    logger.info('\t eval_online: {}'.format(options.eval_online))
    logger.info('\t eval_interval: {}s'.format(options.eval_interval))
    logger.info('\t eval_workers: {}'.format(options.eval_workers))
    logger.info('\t repeat {} times'.format(options.repeated_times))
    logger.info('\t total labels size: {}'.format(options.label_size))
    if options.eval_workers > 1 and options.repeated_times > 1:
        # speed up by using multi-process: split the repeat count as
        # evenly as possible over the available workers.
        logger.info("\t allocating repeat_times to workers ...")
        if options.repeated_times <= options.eval_workers:
            times_per_worker = [1 for _ in range(options.repeated_times)]
        else:
            div, mod = divmod(options.repeated_times, options.eval_workers)
            times_per_worker = [div for _ in range(options.eval_workers)]
            for idx in range(mod):
                times_per_worker[idx] = times_per_worker[idx] + 1
        assert sum(
            times_per_worker
        ) == options.repeated_times, 'workers allocating failed: %d != %d' % (
            sum(times_per_worker), options.repeated_times)
        logger.info("\t using {} processes for evaling:".format(
            len(times_per_worker)))
        for idx, rep_times in enumerate(times_per_worker):
            logger.info("\t process-{}: repeat {} times".format(
                idx, rep_times))
    # Summary file accumulating one NMI line per evaluated checkpoint.
    fr_total = open(options.cluster_path, 'w')
    fr_total.write('eval case: cluster...\n')
    fr_total.write('\t save_dir: {}\n'.format(cluster_dir))
    fr_total.write('\t cluster: kmeans\n')
    fr_total.write('\t multilabel_rule: {}\n'.format(options.multilabel_rule))
    fr_total.write('\t eval_online: {}\n'.format(options.eval_online))
    fr_total.write('\t eval_interval: {}s\n'.format(options.eval_interval))
    fr_total.write('\t eval_workers: {}\n'.format(options.eval_workers))
    fr_total.write('\t repeat {} times\n'.format(options.repeated_times))
    fr_total.write('\t total labels size: {}\n'.format(options.label_size))
    fr_total.write(
        '\t results(NMI):\n=============================================================\n'
    )
    fr_total.write('finish_time\tckpt\tNMI\n')
    logger.info('\t reading labeled data from file {}'.format(
        options.label_path))
    time_start = time.time()
    id_list, labels_list = utils.get_labeled_data(
        options.label_path, multilabel_rule=options.multilabel_rule)
    logger.info('\t reading labeled data completed in {}s'.format(time.time() -
                                                                  time_start))
    last_step = 0
    # TensorBoard writer; seed the 'nmi' curve with 0 at step 0.
    summary_writer = tf.summary.FileWriter(cluster_dir, tf.Graph())
    summary = tf.Summary()
    summary.value.add(tag='nmi', simple_value=0.)
    summary_writer.add_summary(summary, last_step)
    best_nmi = 0
    ckpt_dir = os.path.join(os.path.split(options.vectors_path)[0], 'ckpt')
    ckpt = tf.train.get_checkpoint_state(ckpt_dir)
    # Block until the trainer has produced a first checkpoint.
    while (not (ckpt and ckpt.model_checkpoint_path)):
        logger.info("\t model and vectors not exist, waiting ...")
        time.sleep(options.eval_interval)
        ckpt = tf.train.get_checkpoint_state(ckpt_dir)
    # Sentinel files used as an ad-hoc reader/writer lock on vectors_path.
    reading = options.vectors_path + ".reading_cluster"
    writing = options.vectors_path + ".writing"
    while (options.eval_online):
        # Wait for a strictly newer checkpoint that is safe to read.
        while True:
            ckpt = tf.train.get_checkpoint_state(ckpt_dir)
            # Step number is the suffix of the checkpoint filename.
            cur_step = int(
                ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])
            if cur_step <= last_step or (not os.path.exists(
                    options.vectors_path)) or os.path.exists(writing):
                # Trainer signals completion via a RUN_SUCCESS marker file.
                if os.path.exists(
                        os.path.join(
                            os.path.split(options.vectors_path)[0],
                            "RUN_SUCCESS")):
                    return
                time.sleep(options.eval_interval)
                continue
            # ready for reading
            logger.info("\t declare for reading ...")
            open(reading, "w")  # declare
            # Grace period, then re-check that no writer sneaked in.
            time.sleep(30)
            ckpt = tf.train.get_checkpoint_state(ckpt_dir)
            cur_step = int(
                ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])
            if cur_step <= last_step or (not os.path.exists(
                    options.vectors_path)) or os.path.exists(writing):
                os.remove(reading)  # undeclare
                logger.info("\t confliction! undeclare and waiting ...")
                time.sleep(options.eval_interval)
                continue
            break
        logger.info("\t eval ckpt-{}.......".format(cur_step))
        time_start = time.time()
        logger.info('\t reading embedding vectors from file {}'.format(
            options.vectors_path))
        features_matrix, labels_list = utils.get_vectors(
            utils.get_KeyedVectors(options.vectors_path), id_list, labels_list)
        os.remove(reading)  # synchrolock for multi-process
        logger.info("\t done for reading ...")
        # Single-label view: keep only the first label of each node.
        labels_matrix = np.array([item[0] for item in labels_list])
        LABEL_SIZE = options.label_size
        logger.info(
            '\t reading labeled data completed in {}s'.format(time.time() -
                                                              time_start))
        logger.info('\t total labeled data size: {}'.format(
            np.size(features_matrix, axis=0)))
        logger.info('\t total labels size: {}'.format(options.label_size))
        # cluster: per-checkpoint detail file.
        fr = open(options.cluster_path + '.{}'.format(cur_step), 'w')
        fr.write('eval case: cluster...\n')
        fr.write('\t cluster: kmeans\n')
        fr.write('\t multilabel_rule: {}\n'.format(options.multilabel_rule))
        fr.write('\t eval_workers: {}\n'.format(options.eval_workers))
        fr.write('\t repeat {} times\n'.format(options.repeated_times))
        fr.write('\t total labeled data size: {}\n'.format(
            np.size(features_matrix, axis=0)))
        fr.write('\t total labels size: {}\n'.format(options.label_size))
        for i in range(options.label_size):
            fr.write('\t\t label {}: {}\n'.format(i,
                                                  np.sum(labels_matrix == i)))
        if options.eval_workers > 1 and options.repeated_times > 1:
            # speed up by using multi-process
            fr.write("\t using {} processes for evaling:\n".format(
                len(times_per_worker)))
            for idx, rep_times in enumerate(times_per_worker):
                fr.write("\t process-{}: repeat {} times\n".format(
                    idx, rep_times))
            # NOTE(review): the bare `except:` retries the whole pool once,
            # then lets any second failure propagate — looks like a
            # workaround for transient worker failures; confirm intent.
            try:
                nmi_list = []
                with ProcessPoolExecutor(
                        max_workers=options.eval_workers) as executor:
                    for ret in executor.map(_cluster_thread_body,
                                            times_per_worker):
                        nmi_list.extend(ret)
            except:
                nmi_list = []
                with ProcessPoolExecutor(
                        max_workers=options.eval_workers) as executor:
                    for ret in executor.map(_cluster_thread_body,
                                            times_per_worker):
                        nmi_list.extend(ret)
            if len(nmi_list) != options.repeated_times:
                logger.warning(
                    "warning: eval unmatched repeated_times: {} != {}".format(
                        len(nmi_list), options.repeated_times))
        else:
            # Single-process path, with the same one-retry pattern.
            try:
                nmi_list = _cluster_thread_body(options.repeated_times)
            except:
                nmi_list = _cluster_thread_body(options.repeated_times)
        fr_total.write('%s ckpt-%-9d: ' % (time.strftime(
            '%Y-%m-%d %H:%M:%S', time.localtime(time.time())), cur_step))
        summary = tf.Summary()
        mean_nmi = sum(nmi_list) / float(len(nmi_list))
        fr.write(
            'expected repeated_times: {}, actual repeated_times: {}, mean results as follows:\n'
            .format(options.repeated_times, len(nmi_list)))
        fr.write('\t\t NMI = {}\n'.format(mean_nmi))
        fr.write('details:\n')
        for repeat in range(len(nmi_list)):
            fr.write('\t repeated {}/{}: NMI = {}\n'.format(
                repeat + 1, len(nmi_list), nmi_list[repeat]))
        fr.write('\neval case: cluster completed in {}s\n'.format(time.time() -
                                                                  time_start))
        fr.close()
        # fr_total.write('%.4f\n' % mean_nmi)
        fr_total.write('{}\n'.format(mean_nmi))
        fr_total.flush()
        summary.value.add(tag='nmi', simple_value=mean_nmi)
        summary_writer.add_summary(summary, cur_step)
        summary_writer.flush()
        logger.info(
            'cluster completed in {}s\n================================='.
            format(time.time() - time_start))
        # copy ckpt-files according to last mean_Micro_F1 (0.9 ratio).
        if mean_nmi > best_nmi:
            best_nmi = mean_nmi
            ckptIsExists = os.path.exists(
                os.path.join(ckpt_dir, 'model.ckpt-%d.index' % cur_step))
            if ckptIsExists:
                fr_best = open(os.path.join(cluster_dir, 'best_ckpt.info'),
                               'w')
            else:
                # Checkpoint already rotated away: append a note instead of
                # overwriting the previous best-checkpoint record.
                fr_best = open(os.path.join(cluster_dir, 'best_ckpt.info'),
                               'a')
                fr_best.write(
                    "Note:the model.ckpt-best is the remainings of last best_ckpt!\n"
                    "the current best_ckpt model is loss, but the result is:\n"
                )
            fr_best.write("best_nmi: {}\n".format(best_nmi))
            fr_best.write("best_ckpt: ckpt-{}\n".format(cur_step))
            fr_best.close()
            if ckptIsExists:
                # Copy the three TF checkpoint files to 'model.ckpt-best.*'.
                sourceFile = os.path.join(
                    ckpt_dir, 'model.ckpt-%d.data-00000-of-00001' % cur_step)
                targetFile = os.path.join(
                    cluster_dir, 'model.ckpt-best.data-00000-of-00001')
                if os.path.exists(targetFile):
                    os.remove(targetFile)
                shutil.copy(sourceFile, targetFile)
                sourceFile = os.path.join(ckpt_dir,
                                          'model.ckpt-%d.index' % cur_step)
                targetFile = os.path.join(cluster_dir, 'model.ckpt-best.index')
                if os.path.exists(targetFile):
                    os.remove(targetFile)
                shutil.copy(sourceFile, targetFile)
                sourceFile = os.path.join(ckpt_dir,
                                          'model.ckpt-%d.meta' % cur_step)
                targetFile = os.path.join(cluster_dir, 'model.ckpt-best.meta')
                if os.path.exists(targetFile):
                    os.remove(targetFile)
                shutil.copy(sourceFile, targetFile)
        last_step = cur_step
    fr_total.close()
    summary_writer.close()
    return
def __getitem__(self, index, crop=False):
    """Return one test sample: BGR image tensor, per-axis soft labels,
    the front-vector regression target, and the source image path."""
    base_name = self.data_list[index][:-4]
    img = Image.open(
        os.path.join(self.data_dir, 'test_imgs/' + base_name + '.jpg'))
    img = img.convert(self.image_mode)

    if crop:
        # Loosely crop around the annotated face bounding box.
        bbox_path = os.path.join(self.data_dir,
                                 'bbox/' + base_name + '.txt')
        pt2d = get_label_from_txt(bbox_path)
        x_min = pt2d[0]
        y_min = pt2d[1]
        x_max = pt2d[2]
        y_max = pt2d[3]
        k = 0.1
        x_min -= k * abs(x_max - x_min)
        y_min -= k * abs(y_max - y_min)
        x_max += k * abs(x_max - x_min)
        y_max += 0.3 * k * abs(y_max - y_min)
        img = img.crop((int(x_min), int(y_min), int(x_max), int(y_max)))

    # Pose info file -> front orientation vector (regression target).
    info = get_info_from_txt(
        os.path.join(self.data_dir, 'test_info/' + base_name + '.txt'))
    front_vector, _ = get_vectors(info)
    vector_label = torch.FloatTensor(front_vector)

    # Bin each vector component, clamp the bin index to [1, num_classes],
    # then build one soft label per axis and stack them.
    bins = np.array(range(-99, 100, self.bin_size)) / 99
    classify_label = torch.LongTensor(np.digitize(front_vector, bins))
    classify_label = np.where(classify_label > self.num_classes,
                              self.num_classes, classify_label)
    classify_label = np.where(classify_label < 1, 1, classify_label)
    soft_label = torch.stack([
        get_soft_label(classify_label[0], self.num_classes),
        get_soft_label(classify_label[1], self.num_classes),
        get_soft_label(classify_label[2], self.num_classes),
    ])

    if self.transform is not None:
        img = self.transform(img)
    # RGB -> BGR channel swap.
    img = img[np.array([2, 1, 0]), :, :]

    return img, soft_label, vector_label, os.path.join(
        self.data_dir, 'test_imgs/' + base_name + '.jpg')
def eval_online(options):
    """Online link-prediction evaluation loop (MAP and precision@K).

    Builds the eval and except(train) networks once, then polls the
    training checkpoint directory; each time a newer checkpoint plus its
    exported embedding vectors appear, it scores link prediction
    (optionally on sampled nodes, repeated across worker processes),
    writes summary/detail files and TensorBoard summaries, and keeps a
    copy of the best-MAP checkpoint.

    NOTE(review): coordination with the trainer uses sentinel files
    (`*.reading_link_prediction`, `*.writing`, `RUN_SUCCESS`) rather
    than real locks — verify against the writer side.
    """
    # METIRC is a (preserved) typo for METRIC used elsewhere in the module.
    global features_matrix, net_eval, net_except, SAMPLE_NODES, SAMPLE_RULE, METIRC, PREC_K
    link_prediction_dir = os.path.split(options.link_prediction_path)[0]
    # Skip entirely if results exist and a rebuild was not requested.
    if not utils.check_rebuild(link_prediction_dir,
                               descrip='link_prediction',
                               always_rebuild=options.always_rebuild):
        return
    if not os.path.exists(link_prediction_dir):
        os.makedirs(link_prediction_dir)
    logger.info('eval case: link-prediction ...')
    logger.info('\t save_path: {}'.format(options.link_prediction_path))
    logger.info('\t eval_data_path: {}'.format(options.eval_data_path))
    logger.info('\t except_data_path: {}'.format(options.except_data_path))
    logger.info('\t data_format: {}'.format(options.data_format))
    logger.info('\t metrics: MAP and precise@K')
    logger.info('\t max_index for precise@K: {}'.format(
        options.precK_max_index))
    logger.info('\t similarity_metric: {}'.format(options.similarity_metric))
    logger.info('\t eval_online: {}'.format(options.eval_online))
    logger.info('\t eval_interval: {}s'.format(options.eval_interval))
    logger.info('\t sample_nodes: {}'.format(options.sample_nodes))
    logger.info('\t sample_nodes_rule: {}'.format(options.sample_nodes_rule))
    logger.info('\t repeat {} times'.format(options.repeated_times))
    logger.info('\t eval_workers: {}'.format(options.eval_workers))
    logger.info("constructing eval network ...")
    net_eval = network.construct_network(data_path=options.eval_data_path,
                                         data_format=options.data_format,
                                         print_net_info=False,
                                         isdirected=options.isdirected)
    eval_net_nodes_size = net_eval.get_nodes_size()
    eval_net_edges_size = net_eval.get_edges_size()
    logger.info("eval_net_nodes_size = {}".format(eval_net_nodes_size))
    logger.info("eval_net_edges_size = {}".format(eval_net_edges_size))
    logger.info("constructing except(train) network ...")
    net_except = network.construct_network(data_path=options.except_data_path,
                                           data_format=options.data_format,
                                           print_net_info=False,
                                           isdirected=options.isdirected)
    except_net_nodes_size = net_except.get_nodes_size()
    except_net_edges_size = net_except.get_edges_size()
    logger.info("except_net_nodes_size = {}".format(except_net_nodes_size))
    logger.info("except_net_edges_size = {}".format(except_net_edges_size))
    id_list = list(range(eval_net_nodes_size))  # must be [0,1,2,3,...]
    # Publish sampling/metric settings as globals for the worker bodies.
    SAMPLE_NODES = options.sample_nodes
    SAMPLE_RULE = options.sample_nodes_rule
    METIRC = options.similarity_metric
    PREC_K = options.precK_max_index
    # Build the 1, 10, 20, 50, 100, 200, 500, ... K-grid capped at
    # precK_max_index for the Pr@K columns.
    metric_prec_k_list = [1]
    decimal_number = 10
    while metric_prec_k_list[-1] < options.precK_max_index:
        if decimal_number <= options.precK_max_index:
            metric_prec_k_list.append(decimal_number)
        else:
            break
        if 2 * decimal_number <= options.precK_max_index:
            metric_prec_k_list.append(2 * decimal_number)
        else:
            break
        if 5 * decimal_number <= options.precK_max_index:
            metric_prec_k_list.append(5 * decimal_number)
        else:
            break
        decimal_number = decimal_number * 10
    if options.sample_nodes > 0:
        if options.eval_workers > 1 and options.repeated_times > 1:
            # speed up by using multi-process: split the repeat count as
            # evenly as possible over the available workers.
            logger.info("\t allocating repeat_times to workers ...")
            if options.repeated_times <= options.eval_workers:
                times_per_worker = [1 for _ in range(options.repeated_times)]
            else:
                div, mod = divmod(options.repeated_times,
                                  options.eval_workers)
                times_per_worker = [div for _ in range(options.eval_workers)]
                for idx in range(mod):
                    times_per_worker[idx] = times_per_worker[idx] + 1
            assert sum(
                times_per_worker
            ) == options.repeated_times, 'workers allocating failed: %d != %d' % (
                sum(times_per_worker), options.repeated_times)
            logger.info("\t using {} processes for evaling:".format(
                len(times_per_worker)))
            for idx, rep_times in enumerate(times_per_worker):
                logger.info("\t process-{}: repeat {} times".format(
                    idx, rep_times))
    # Summary file accumulating one result line per evaluated checkpoint.
    fr_total = open(options.link_prediction_path, 'w')
    fr_total.write('eval case: link-prediction ...\n')
    fr_total.write('\t save_path: {}\n'.format(options.link_prediction_path))
    fr_total.write('\t eval_data_path: {}\n'.format(options.eval_data_path))
    fr_total.write('\t except_data_path: {}\n'.format(
        options.except_data_path))
    fr_total.write('\t data_format: {}\n'.format(options.data_format))
    fr_total.write('\t metrics: MAP and precise@K\n')
    fr_total.write('\t max_index for precise@K: {}\n'.format(
        options.precK_max_index))
    fr_total.write('\t similarity_metric: {}\n'.format(
        options.similarity_metric))
    fr_total.write('\t eval_online: {}\n'.format(options.eval_online))
    fr_total.write('\t eval_interval: {}s\n'.format(options.eval_interval))
    fr_total.write('\t sample_nodes: {}\n'.format(options.sample_nodes))
    fr_total.write('\t sample_nodes_rule: {}\n'.format(
        options.sample_nodes_rule))
    fr_total.write('\t repeat {} times\n'.format(options.repeated_times))
    fr_total.write('\t eval_workers: {}\n'.format(options.eval_workers))
    fr_total.write("eval_net_nodes_size = {}\n".format(eval_net_nodes_size))
    fr_total.write("eval_net_edges_size = {}\n".format(eval_net_edges_size))
    fr_total.write(
        "except_net_nodes_size = {}\n".format(except_net_nodes_size))
    fr_total.write(
        "except_net_edges_size = {}\n".format(except_net_edges_size))
    fr_total.write(
        '\t results:\n=============================================================\n'
    )
    fr_total.write('finish_time\tckpt\tMAP\t')
    for v in metric_prec_k_list:
        fr_total.write('\tPr@{}'.format(v))
    fr_total.write("\n")
    last_step = 0
    # TensorBoard writer; seed the curves with 0 at step 0.
    summary_writer = tf.summary.FileWriter(link_prediction_dir, tf.Graph())
    summary = tf.Summary()
    summary.value.add(tag='MAP', simple_value=0.)
    for v in metric_prec_k_list:
        summary.value.add(tag='Pr_{}'.format(v), simple_value=0.)
    summary_writer.add_summary(summary, last_step)
    best_MAP = 0
    ckpt_dir = os.path.join(os.path.split(options.vectors_path)[0], 'ckpt')
    ckpt = tf.train.get_checkpoint_state(ckpt_dir)
    # Block until the trainer has produced a first checkpoint.
    while (not (ckpt and ckpt.model_checkpoint_path)):
        logger.info("\t model and vectors not exist, waiting ...")
        time.sleep(options.eval_interval)
        ckpt = tf.train.get_checkpoint_state(ckpt_dir)
    # Sentinel files used as an ad-hoc reader/writer lock on vectors_path.
    reading = options.vectors_path + ".reading_link_prediction"
    writing = options.vectors_path + ".writing"
    while (options.eval_online):
        # Wait for a strictly newer checkpoint that is safe to read.
        while True:
            ckpt = tf.train.get_checkpoint_state(ckpt_dir)
            # Step number is the suffix of the checkpoint filename.
            cur_step = int(
                ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])
            if cur_step <= last_step or (not os.path.exists(
                    options.vectors_path)) or os.path.exists(writing):
                # Trainer signals completion via a RUN_SUCCESS marker file.
                if os.path.exists(
                        os.path.join(
                            os.path.split(options.vectors_path)[0],
                            "RUN_SUCCESS")):
                    return
                time.sleep(options.eval_interval)
                continue
            # ready for reading
            logger.info("\t declare for reading ...")
            open(reading, "w")  # declare
            # Grace period, then re-check that no writer sneaked in.
            time.sleep(30)
            ckpt = tf.train.get_checkpoint_state(ckpt_dir)
            cur_step = int(
                ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])
            if cur_step <= last_step or (not os.path.exists(
                    options.vectors_path)) or os.path.exists(writing):
                os.remove(reading)  # undeclare
                logger.info("\t confliction! undeclare and waiting ...")
                time.sleep(options.eval_interval)
                continue
            break
        logger.info("\t eval ckpt-{}.......".format(cur_step))
        # loading features_matrix(already trained)
        logger.info('\t reading embedding vectors from file {}'.format(
            options.vectors_path))
        time_start = time.time()
        features_matrix = utils.get_vectors(
            utils.get_KeyedVectors(options.vectors_path), id_list)
        os.remove(reading)
        logger.info("\t done for reading ...")
        logger.info('\t reading embedding vectors completed in {}s'.format(
            time.time() - time_start))
        logger.info('total loaded nodes: {}'.format(
            np.size(features_matrix, axis=0)))
        logger.info('the embedding dimension: {}'.format(
            np.size(features_matrix, axis=1)))
        # Per-checkpoint detail file.
        fr = open(options.link_prediction_path + '.{}'.format(cur_step), 'w')
        fr.write('eval case: link-prediction ...\n')
        fr.write('\t save_path: {}\n'.format(options.link_prediction_path))
        fr.write('\t eval_data_path: {}\n'.format(options.eval_data_path))
        fr.write('\t except_data_path: {}\n'.format(options.except_data_path))
        fr.write('\t data_format: {}\n'.format(options.data_format))
        fr.write('\t metrics: MAP and precise@K\n')
        fr.write('\t max_index for precise@K: {}\n'.format(
            options.precK_max_index))
        fr.write('\t similarity_metric: {}\n'.format(
            options.similarity_metric))
        fr.write('\t eval_online: {}\n'.format(options.eval_online))
        fr.write('\t eval_interval: {}s\n'.format(options.eval_interval))
        fr.write('\t sample_nodes: {}\n'.format(options.sample_nodes))
        fr.write('\t sample_nodes_rule: {}\n'.format(
            options.sample_nodes_rule))
        fr.write('\t repeat {} times\n'.format(options.repeated_times))
        fr.write('\t eval_workers: {}\n'.format(options.eval_workers))
        fr.write("eval_net_nodes_size = {}\n".format(eval_net_nodes_size))
        fr.write("eval_net_edges_size = {}\n".format(eval_net_edges_size))
        fr.write("except_net_nodes_size = {}\n".format(except_net_nodes_size))
        fr.write("except_net_edges_size = {}\n".format(except_net_edges_size))
        fr.write('total loaded nodes: {}\n'.format(
            np.size(features_matrix, axis=0)))
        fr.write('the embedding dimension: {}\n'.format(
            np.size(features_matrix, axis=1)))
        if options.sample_nodes > 0:
            if options.eval_workers > 1 and options.repeated_times > 1:
                # speed up by using multi-process
                ret_list = []  # [[MAP, precisionK_list], ... ]
                with ProcessPoolExecutor(
                        max_workers=options.eval_workers) as executor:
                    for ret in executor.map(_sample_thread_body,
                                            times_per_worker):
                        ret_list.extend(ret)
                if len(ret_list) != options.repeated_times:
                    logger.warning(
                        "warning: eval unmatched repeated_times: {} != {}".
                        format(len(ret_list), options.repeated_times))
            else:
                ret_list = _sample_thread_body(options.repeated_times)
        else:
            # no sampling, no repeat!
            ret_list = [_eval(net_eval,
                              net_except)]  # [[MAP, precisionK_list]]
        fr_total.write('%s ckpt-%-9d: ' % (time.strftime(
            '%Y-%m-%d %H:%M:%S', time.localtime(time.time())), cur_step))
        summary = tf.Summary()
        if options.sample_nodes > 0:
            fr.write(
                'expected repeated_times: {}, actual repeated_times: {}, mean results as follows:\n'
                .format(options.repeated_times, len(ret_list)))
        else:
            fr.write(
                'due to the sample nodes = {}, so actual repeated_times = {}, results as follows:\n'
                .format(options.sample_nodes, len(ret_list)))
        mean_MAP = np.mean([ret[0] for ret in ret_list])
        mean_precisionK = np.mean([ret[1] for ret in ret_list], axis=0)
        fr.write('\t\t MAP = {}\n'.format(mean_MAP))
        for k in range(options.precK_max_index):
            if k < len(mean_precisionK):
                fr.write('\t\t precisionK_{} = {}\n'.format(
                    k + 1, mean_precisionK[k]))
            else:
                fr.write('\t\t precisionK_{} = None\n'.format(k + 1))
        fr.write('details:\n')
        for repeat in range(len(ret_list)):
            fr.write('\t repeated {}/{}:\n'.format(repeat + 1, len(ret_list)))
            MAP = ret_list[repeat][0]
            precisionK_list = ret_list[repeat][1]
            fr.write('\t\t MAP = {}\n'.format(MAP))
            for k in range(options.precK_max_index):
                if k < len(precisionK_list):
                    fr.write('\t\t precisionK_{} = {}\n'.format(
                        k + 1, precisionK_list[k]))
                else:
                    fr.write('\t\t precisionK_{} = None\n'.format(k + 1))
        fr.write('\neval case: link_prediction completed in {}s.'.format(
            time.time() - time_start))
        fr.close()
        fr_total.write('%.4f' % mean_MAP)
        summary.value.add(tag='MAP', simple_value=mean_MAP)
        for v in metric_prec_k_list:
            fr_total.write('\t%.4f' % mean_precisionK[v - 1])
            summary.value.add(tag='Pr_{}'.format(v),
                              simple_value=mean_precisionK[v - 1])
        fr_total.write("\n")
        fr_total.flush()
        summary_writer.add_summary(summary, cur_step)
        summary_writer.flush()
        logger.info(
            'eval case: ret_list completed in {}s.\n================================='
            .format(time.time() - time_start))
        # copy ckpt-files according to last mean_Micro_F1 (0.9 ratio).
        if mean_MAP > best_MAP:
            best_MAP = mean_MAP
            ckptIsExists = os.path.exists(
                os.path.join(ckpt_dir, 'model.ckpt-%d.index' % cur_step))
            if ckptIsExists:
                fr_best = open(
                    os.path.join(link_prediction_dir, 'best_ckpt.info'), 'w')
            else:
                # Checkpoint already rotated away: append a note instead of
                # overwriting the previous best-checkpoint record.
                fr_best = open(
                    os.path.join(link_prediction_dir, 'best_ckpt.info'), 'a')
                fr_best.write(
                    "Note:the model.ckpt-best is the remainings of last best_ckpt!\n"
                    "the current best_ckpt model is loss, but the result is:\n"
                )
            fr_best.write("best_MAP: {}\n".format(best_MAP))
            fr_best.write("best_ckpt: ckpt-{}\n".format(cur_step))
            fr_best.close()
            if ckptIsExists:
                # Copy the three TF checkpoint files to 'model.ckpt-best.*'.
                sourceFile = os.path.join(
                    ckpt_dir, 'model.ckpt-%d.data-00000-of-00001' % cur_step)
                targetFile = os.path.join(
                    link_prediction_dir, 'model.ckpt-best.data-00000-of-00001')
                if os.path.exists(targetFile):
                    os.remove(targetFile)
                shutil.copy(sourceFile, targetFile)
                sourceFile = os.path.join(ckpt_dir,
                                          'model.ckpt-%d.index' % cur_step)
                targetFile = os.path.join(link_prediction_dir,
                                          'model.ckpt-best.index')
                if os.path.exists(targetFile):
                    os.remove(targetFile)
                shutil.copy(sourceFile, targetFile)
                sourceFile = os.path.join(ckpt_dir,
                                          'model.ckpt-%d.meta' % cur_step)
                targetFile = os.path.join(link_prediction_dir,
                                          'model.ckpt-best.meta')
                if os.path.exists(targetFile):
                    os.remove(targetFile)
                shutil.copy(sourceFile, targetFile)
        last_step = cur_step
    fr_total.close()
    summary_writer.close()
def __getitem__(self, index, crop=False):
    """Return one sample for orientation training.

    Loads the background-composited image, optionally crops loosely
    around the face bbox, applies the dataset transform, swaps RGB->BGR,
    then builds per-axis soft classification labels and float regression
    targets for the front/right/up orientation vectors.

    Returns:
        (img, soft_label_f, soft_label_r, soft_label_u,
         vector_label_f, vector_label_r, vector_label_u, img_path)
    """
    base_name, _ = self.data_list[index].split('.')

    img = Image.open(
        os.path.join(self.data_dir, 'dataset/bg_imgs/' + base_name + '.jpg'))
    img = img.convert(self.image_mode)

    if crop:
        # Loosely crop around the annotated face bounding box.
        bbox_path = os.path.join(self.data_dir,
                                 'bbox/' + base_name + '.txt')
        pt2d = get_label_from_txt(bbox_path)
        x_min = pt2d[0]
        y_min = pt2d[1]
        x_max = pt2d[2]
        y_max = pt2d[3]
        # Expand the box by a fixed margin before cropping.  Note the
        # margins are applied sequentially, so later ones see the
        # already-shifted coordinates (kept to preserve behavior).
        k = 0.1
        x_min -= k * abs(x_max - x_min)
        y_min -= k * abs(y_max - y_min)
        x_max += k * abs(x_max - x_min)
        y_max += 0.3 * k * abs(y_max - y_min)
        img = img.crop((int(x_min), int(y_min), int(x_max), int(y_max)))

    if self.transform is not None:
        img = self.transform(img)
    # RGB2BGR
    img = img[np.array([2, 1, 0]), :, :]

    # Pose info file -> the three orthogonal orientation vectors.
    info = get_info_from_txt(
        os.path.join(self.data_dir, "info_all/" + base_name + '.txt'))
    front_vector, right_vector, up_vector = get_vectors(info)
    vector_label_f = torch.FloatTensor(front_vector)
    vector_label_r = torch.FloatTensor(right_vector)
    vector_label_u = torch.FloatTensor(up_vector)

    # Bin edges spanning [-1, 1] in steps of bin_size/99.
    bins = np.array(range(-99, 100, self.bin_size)) / 99

    def _soft_label(vector):
        # Bin each component into [1, num_classes], then build one soft
        # label per axis and stack them.  (Shared helper replacing the
        # previously triplicated front/right/up blocks.)
        classify_label = torch.LongTensor(np.digitize(vector, bins))
        classify_label = np.where(classify_label > self.num_classes,
                                  self.num_classes, classify_label)
        classify_label = np.where(classify_label < 1, 1, classify_label)
        return torch.stack([
            get_soft_label(classify_label[0], self.num_classes),
            get_soft_label(classify_label[1], self.num_classes),
            get_soft_label(classify_label[2], self.num_classes),
        ])

    soft_label_f = _soft_label(front_vector)
    soft_label_r = _soft_label(right_vector)
    soft_label_u = _soft_label(up_vector)

    return img, soft_label_f, soft_label_r, soft_label_u, vector_label_f, vector_label_r, vector_label_u, os.path.join(
        self.data_dir, "dataset/bg_imgs/" + base_name + ".jpg")
def load(topic,
         model_name,
         sub_model_name="paraphrase-distilroberta-base-v1",
         is_indexed=False,
         **kwargs):
    """Load the texts, embeddings and model for one Kialo topic.

    Looks the topic up in kialo_config.json, reads the pro/con TSV data
    files, then loads the embedding model requested by `model_name`
    ('sbert', 'word2vec' or 'glove') together with precomputed (or
    freshly computed) pro/con vectors.

    Returns a dict with keys: model, pro_vecs, con_vecs, texts_pro,
    texts_con, responses, lowercase_to_uppercase.
    """
    pre_path = ""  # "python/"
    config_path = pre_path + "kialo_config.json"
    data_path_pro, vec_path_pro, data_path_con, vec_path_con, data_path_all, vec_path_all, vec_path_pro_original, vec_path_con_original = "", "", "", "", "", "", "", ""
    lowercase_to_uppercase = {}
    # NOTE(review): texts_all and stances are initialised but never
    # populated or returned here — possibly vestigial.
    texts_all, texts_pro, texts_con = [], [], []
    stances = {}
    responses = {}
    # Resolve the data/vector paths for the requested topic id.
    with open(config_path) as f:
        config = json.load(f)
        for c in config["topics"]:
            if topic == c["id"]:
                data_path_pro = pre_path + c["data_path_pro"]
                vec_path_pro = pre_path + c["sbert_path_pro"]
                data_path_con = pre_path + c["data_path_con"]
                vec_path_con = pre_path + c["sbert_path_con"]
                data_path_all = pre_path + c["data_path_all"]
                vec_path_all = pre_path + c["sbert_path_all"]
                vec_path_pro_original = pre_path + c["sbert_path_pro_original"]
                vec_path_con_original = pre_path + c["sbert_path_con_original"]
    # read data files
    # TSV columns (presumably): lowercased text, ?, original-case text,
    # ?, tab-joined responses — TODO confirm against the data format.
    with open(data_path_pro) as f:
        for line in f:
            parts = line.strip().split("\t", 4)
            text = parts[0]
            lowercase_to_uppercase[text] = parts[2]
            texts_pro.append(text)
            if len(parts) >= 5:
                responses[text] = parts[4].split('\t')
    with open(data_path_con) as f:
        for line in f:
            parts = line.strip().split("\t", 4)
            text = parts[0]
            lowercase_to_uppercase[text] = parts[2]
            texts_con.append(text)
            if len(parts) >= 5:
                responses[text] = parts[4].split('\t')
    # load model
    pro_vecs, con_vecs = [], []
    model = None
    if model_name == 'sbert':
        model = utils.get_sbert_model(sub_model_name)
        # NOTE(review): compares with 1 although the default is False;
        # callers presumably pass 0/1 — confirm.
        if is_indexed == 1:
            # NOTE(review): models_pro_con is assigned but never used or
            # returned — the FAISS indexes appear to be lost here.
            models_pro_con = [
                faiss_index.load_index(vec_path_pro),
                faiss_index.load_index(vec_path_con)
            ]
        else:
            # NOTE(review): these pickle files are never closed.
            infile_pro = open(vec_path_pro_original, 'rb')
            infile_con = open(vec_path_con_original, 'rb')
            pro_vecs = cPickle.load(infile_pro)
            con_vecs = cPickle.load(infile_con)
            model = utils.get_sbert_model(sub_model_name)
    elif model_name == 'word2vec':
        path = pre_path + "embeddings/GoogleNews-vectors-negative300.bin"
        model = gensim.models.KeyedVectors.load_word2vec_format(path,
                                                                binary=True)
        pro_vecs, model = utils.get_vectors(model, texts_pro)
        con_vecs, model = utils.get_vectors(model, texts_con)
    elif model_name == 'glove':
        path = pre_path + "embeddings/glove.840B.300dword2vec.txt"
        model = utils.read_emb_text(path)
        pro_vecs, model = utils.get_vectors(model, texts_pro)
        con_vecs, model = utils.get_vectors(model, texts_con)
    # return values
    # model is an object
    # pro_vecs, con_vecs are arrays of vectors
    # texts_pro, texts_con are arrays of texts
    # responses and lowercase_to_uppercase are dictionaries
    return {
        "model": model,
        "pro_vecs": pro_vecs,
        "con_vecs": con_vecs,
        "texts_pro": texts_pro,
        "texts_con": texts_con,
        "responses": responses,
        "lowercase_to_uppercase": lowercase_to_uppercase
    }
def __getitem__(self, index, crop=False):
    """Return one sample: image plus soft and regression pose labels.

    Args:
        index: position in ``self.data_list``.
        crop: when True, crop the face region using the bbox file.

    Returns:
        (img, soft_label_f, soft_label_r, soft_label_u,
         vector_label_f, vector_label_r, vector_label_u, img_path)
        where f/r/u are the front/right/up orientation vectors.
    """
    # data basename
    base_name, _ = self.data_list[index].split('.')
    img_path = os.path.join(self.data_dir, "dataset/bg_imgs/" + base_name + ".jpg")
    # read image file
    img = Image.open(img_path)
    img = img.convert(self.image_mode)

    if crop:
        # get face bounding box
        pt2d = get_label_from_txt(
            os.path.join(self.data_dir, "bbox/" + base_name + ".txt"))
        x_min, y_min, x_max, y_max = pt2d
        # crop face loosely: random factor k in [0, 0.1); x uses 0.6*k,
        # y_min uses the full k (the original comment claimed k up to 0.2)
        k = np.random.random_sample() * 0.1
        x_min -= 0.6 * k * abs(x_max - x_min)
        y_min -= k * abs(y_max - y_min)
        x_max += 0.6 * k * abs(x_max - x_min)
        y_max += 0.6 * k * abs(y_max - y_min)
        img = img.crop((int(x_min), int(y_min), int(x_max), int(y_max)))

    # Augmentation: blur with probability 0.05
    if np.random.random_sample() < 0.05:
        img = img.filter(ImageFilter.BLUR)
    # Augmentation: grayscale with probability 0.5 (skipped for 'ID' images)
    if np.random.random_sample() < 0.5 and base_name.find('ID') < 0:
        img = img.convert('L').convert("RGB")

    # transform
    if self.transform:
        img = self.transform(img)
    # RGB -> BGR channel order
    img = img[np.array([2, 1, 0]), :, :]

    # get pose: front/right/up orientation vectors from the info file
    info = get_info_from_txt(
        os.path.join(self.data_dir, "info_all/" + base_name + '.txt'))
    front_vector, right_vector, up_vector = get_vectors(info)
    vector_label_f = torch.FloatTensor(front_vector)
    vector_label_r = torch.FloatTensor(right_vector)
    vector_label_u = torch.FloatTensor(up_vector)

    # soft classification labels (the original repeated this block three times)
    soft_label_f = self._soft_label(front_vector)
    soft_label_r = self._soft_label(right_vector)
    soft_label_u = self._soft_label(up_vector)

    return img, soft_label_f, soft_label_r, soft_label_u, \
        vector_label_f, vector_label_r, vector_label_u, img_path

def _soft_label(self, vector):
    """Bin a 3-d vector into classes, clamp to [1, num_classes], and stack
    the per-axis soft labels into one tensor."""
    classify_label = torch.LongTensor(np.digitize(vector, self.bins))  # bin index
    classify_label = np.where(classify_label > self.num_classes,
                              self.num_classes, classify_label)
    classify_label = np.where(classify_label < 1, 1, classify_label)
    soft_x = get_soft_label(classify_label[0], self.num_classes)
    soft_y = get_soft_label(classify_label[1], self.num_classes)
    soft_z = get_soft_label(classify_label[2], self.num_classes)
    return torch.stack([soft_x, soft_y, soft_z])
cache_embedding = 'checkpoints/.cache/'
kernel_path = 'kernel_path/'

if __name__ == '__main__':
    # usage: script.py <dataset> <model_name>
    ds, model_name = sys.argv[1:]
    model, tokenizer = get_model_and_tokenizer(model_name)
    input_a, input_b, label = get_tokenized_ds(datasets_paths[ds]['scripts'],
                                               datasets_paths[ds]['data_path'],
                                               tokenizer, ds)
    # Embeddings are cached on disk so repeated runs skip the forward pass.
    if not os.path.exists(cache_embedding + ds + '.avec.cache.npy'):
        # BUG FIX: np.save fails when the cache directory is missing.
        os.makedirs(cache_embedding, exist_ok=True)
        with torch.no_grad():
            a_vecs, b_vecs = get_vectors(model, input_a, input_b)
        a_vecs = a_vecs.cpu().numpy()
        b_vecs = b_vecs.cpu().numpy()
        np.save(cache_embedding + ds + '.avec.cache', a_vecs)
        np.save(cache_embedding + ds + '.bvec.cache', b_vecs)
    else:
        a_vecs = np.load(cache_embedding + ds + '.avec.cache.npy')
        b_vecs = np.load(cache_embedding + ds + '.bvec.cache.npy')
    if n_components:
        # whitening: kernel/bias estimated from both sides' embeddings
        kernel, bias = compute_kernel_bias([a_vecs, b_vecs])
        save_kernel_and_bias(kernel, bias, model_name)
        kernel = kernel[:, :n_components]
        a_vecs = transform_and_normalize(a_vecs, kernel, bias)
        # NOTE(review): b_vecs is never whitened in this chunk — either the
        # file continues past this point or the symmetric transform of
        # b_vecs is missing; confirm against the full script.
def eval_online(options):
    """Repeatedly evaluate new embedding checkpoints with a node-classification task.

    Polls the checkpoint directory next to ``options.vectors_path``; whenever a
    newer checkpoint appears, loads its vectors, trains LogisticRegression at
    several train ratios (via ``_classify_thread_body``), appends Macro/Micro-F1
    results to ``options.classify_path`` (+ one detail file per step), writes TF
    summaries, and keeps a copy of the best checkpoint by Micro-F1. Coordinates
    with the writer process through ``.reading_classify`` / ``.writing`` marker
    files. Sets module-level ``features_matrix`` / ``labels_matrix`` for the
    worker processes. Returns when a RUN_SUCCESS marker is found.
    """
    global features_matrix, labels_matrix
    classify_dir = os.path.split(options.classify_path)[0]
    if not utils.check_rebuild(classify_dir, descrip='classify',
                               always_rebuild=options.always_rebuild):
        return
    if not os.path.exists(classify_dir):
        os.makedirs(classify_dir)
    logger.info('eval case: classify...')
    logger.info('\t save_dir: {}'.format(classify_dir))
    logger.info('\t classifier: LogisticRegression')
    logger.info('\t eval_online: {}'.format(options.eval_online))
    logger.info('\t eval_interval: {}s'.format(options.eval_interval))
    logger.info('\t eval_workers: {}'.format(options.eval_workers))
    logger.info('\t total labels size: {}'.format(options.label_size))
    # repeated 10times
    repeated_times = options.repeated_times
    # split ratio: a fixed ratio, or 0.9 down to 0.1 when unset
    if options.train_ratio > 0:
        train_ratio_list = [options.train_ratio]
    else:
        train_ratio_list = [v / 10.0 for v in range(9, 0, -1)]
    logger.info('\t repeat {} times for each train_ratio in {}'.format(
        repeated_times, train_ratio_list))
    # each ratio appears `repeated_times` times in the work list
    train_ratio_fulllist = [
        train_ratio for train_ratio in train_ratio_list
        for _ in range(repeated_times)
    ]
    if options.eval_workers > 1 and len(train_ratio_fulllist) > 1:
        # speed up by using multi-process: split the work list across workers,
        # distributing the remainder from the tail backwards
        if len(train_ratio_fulllist) <= options.eval_workers:
            train_ratios_per_worker = [[train_ratio]
                                       for train_ratio in train_ratio_fulllist]
        else:
            div, mod = divmod(len(train_ratio_fulllist), options.eval_workers)
            train_ratios_per_worker = [
                train_ratio_fulllist[div * i:div * (i + 1)]
                for i in range(options.eval_workers)
            ]
            for idx, train_ratio in enumerate(
                    train_ratio_fulllist[div * options.eval_workers:]):
                train_ratios_per_worker[len(train_ratios_per_worker) - 1 -
                                        idx].append(train_ratio)
        logger.info("\t using {} processes for evaling:".format(
            len(train_ratios_per_worker)))
        for idx, train_ratios in enumerate(train_ratios_per_worker):
            logger.info("\t process-{}: {}".format(idx, train_ratios))
    # summary report file: one row per evaluated checkpoint
    fr_total = open(options.classify_path, 'w')
    fr_total.write('eval case: classify...\n')
    fr_total.write('\t save_dir: {}\n'.format(classify_dir))
    fr_total.write('\t classifier: LogisticRegression\n')
    fr_total.write('\t eval_online: {}\n'.format(options.eval_online))
    fr_total.write('\t eval_interval: {}s\n'.format(options.eval_interval))
    fr_total.write('\t eval_workers: {}\n'.format(options.eval_workers))
    fr_total.write('\t repeat {} times for each train_ratio in {}\n'.format(
        repeated_times, train_ratio_list))
    fr_total.write('\t total labels size: {}\n'.format(options.label_size))
    fr_total.write(
        '\t results(Macro_F1,Micro_F1):\n=============================================================\n'
    )
    fr_total.write(
        'finish_time\tckpt\t\t0.1\t0.2\t0.3\t0.4\t0.5\t0.6\t0.7\t0.8\t0.9\n')
    time_start = time.time()
    logger.info('\t reading labeled data from file {}'.format(
        options.label_path))
    # NOTE: "totoal" is a pre-existing typo kept for byte-compatibility
    id_list_totoal, labels_list_total = utils.get_labeled_data(
        options.label_path)
    logger.info('\t reading labeled data completed in {}s'.format(time.time() -
                                                                  time_start))
    last_step = 0
    summary_writer = tf.summary.FileWriter(classify_dir, tf.Graph())
    # seed the summary curves at step 0 so TensorBoard plots start at zero
    summary = tf.Summary()
    for train_ratio in train_ratio_list:
        summary.value.add(tag='macro_train_{}'.format(train_ratio),
                          simple_value=0.)
        summary.value.add(tag='micro_train_{}'.format(train_ratio),
                          simple_value=0.)
    summary_writer.add_summary(summary, last_step)
    best_micro = 0
    ckpt_dir = os.path.join(os.path.split(options.vectors_path)[0], 'ckpt')
    # block until the trainer has produced a first checkpoint
    ckpt = tf.train.get_checkpoint_state(ckpt_dir)
    while (not (ckpt and ckpt.model_checkpoint_path)):
        logger.info("\t model and vectors not exist, waiting ...")
        time.sleep(options.eval_interval)
        ckpt = tf.train.get_checkpoint_state(ckpt_dir)
    # marker files used as a cooperative lock with the vector-writer process
    reading = options.vectors_path + ".reading_classify"
    writing = options.vectors_path + ".writing"
    while (options.eval_online):
        # inner loop: wait for a checkpoint newer than last_step that is safe to read
        while True:
            ckpt = tf.train.get_checkpoint_state(ckpt_dir)
            # checkpoint paths end in "...-<step>"
            cur_step = int(
                ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])
            ## synchrolock for multi-process:
            # while(not(cur_step > last_step and os.path.exists(options.vectors_path) and
            # time.time() - os.stat(options.vectors_path).st_mtime > 200)):
            # time.sleep(options.eval_interval)
            # ckpt = tf.train.get_checkpoint_state(ckpt_dir)
            # cur_step = int(ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])
            # os.utime(options.vectors_path, None)
            if cur_step <= last_step or (not os.path.exists(
                    options.vectors_path)) or os.path.exists(writing):
                # training finished marker ends the online loop for good
                if os.path.exists(
                        os.path.join(
                            os.path.split(options.vectors_path)[0],
                            "RUN_SUCCESS")):
                    return
                time.sleep(options.eval_interval)
                continue
            # ready for reading: declare intent, then re-check for a writer race
            logger.info("\t declare for reading ...")
            open(reading, "w")  # declare
            time.sleep(30)
            ckpt = tf.train.get_checkpoint_state(ckpt_dir)
            cur_step = int(
                ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])
            if cur_step <= last_step or (not os.path.exists(
                    options.vectors_path)) or os.path.exists(writing):
                os.remove(reading)  # undeclare
                logger.info("\t confliction! undeclare and waiting ...")
                time.sleep(options.eval_interval)
                continue
            break
        logger.info("\t eval ckpt-{}.......".format(cur_step))
        time_start = time.time()
        logger.info('\t reading embedding vectors from file {}'.format(
            options.vectors_path))
        features_matrix, labels_list = utils.get_vectors(
            utils.get_KeyedVectors(options.vectors_path), id_list_totoal,
            labels_list_total)
        os.remove(reading)  # synchrolock for multi-process
        logger.info("\t done for reading ...")
        # one-hot / multi-hot encode the label lists
        mlb = MultiLabelBinarizer(range(options.label_size))
        labels_matrix = mlb.fit_transform(labels_list)
        logger.info('\t reading embedding vectors completed in {}s'.format(
            time.time() - time_start))
        logger.info('\t total labeled data size: {}'.format(
            np.size(features_matrix, axis=0)))
        logger.info('\t total labels size: {}'.format(options.label_size))
        # classify: per-checkpoint detail report
        fr = open(options.classify_path + '.{}'.format(cur_step), 'w')
        fr.write('eval case: classify...\n')
        fr.write('\t classifier: LogisticRegression\n')
        fr.write('\t eval_workers: {}\n'.format(options.eval_workers))
        fr.write('\t repeat {} times for each train_ratio in {}\n'.format(
            repeated_times, train_ratio_list))
        fr.write('\t total labeled data size: {}\n'.format(
            np.size(features_matrix, axis=0)))
        fr.write('\t total labels size: {}\n'.format(options.label_size))
        for i in range(options.label_size):
            fr.write('\t\t label {}: {}\n'.format(i,
                                                  np.sum(labels_matrix[:, i])))
        if options.eval_workers > 1 and len(train_ratio_fulllist) > 1:
            fr.write("\t using {} processes for evaling:\n".format(
                len(train_ratios_per_worker)))
            for idx, train_ratios in enumerate(train_ratios_per_worker):
                fr.write("\t process-{}: {}\n".format(idx, train_ratios))
            ret_list = []  # (train_ratio, macro, micro)
            # workers read the module-level features_matrix / labels_matrix
            with ProcessPoolExecutor(
                    max_workers=options.eval_workers) as executor:
                for ret in executor.map(_classify_thread_body,
                                        train_ratios_per_worker):
                    ret_list.extend(ret)
        else:
            ret_list = _classify_thread_body(train_ratio_fulllist)
        fr_total.write('%s ckpt-%-9d: ' % (time.strftime(
            '%Y-%m-%d %H:%M:%S', time.localtime(time.time())), cur_step))
        summary = tf.Summary()
        # group results per train_ratio: {ratio: [[macros], [micros]]}
        ret_dict = {}
        for ret in ret_list:
            if ret[0] in ret_dict:
                ret_dict[ret[0]][0].append(ret[1])
                ret_dict[ret[0]][1].append(ret[2])
            else:
                ret_dict[ret[0]] = [[ret[1]], [ret[2]]]
        for train_ratio, macro_micro in sorted(ret_dict.items(),
                                               key=lambda item: item[0]):
            fr.write('\n' + '-' * 20 + '\n' +
                     'train_ratio = {}\n'.format(train_ratio))
            Macro_F1_list = macro_micro[0]
            Micro_F1_list = macro_micro[1]
            if len(Macro_F1_list) != repeated_times:
                logger.warning(
                    "warning: train_ratio = {} eval unmatched repeated_times: {} != {}"
                    .format(train_ratio, len(Macro_F1_list), repeated_times))
            mean_Macro_F1 = sum(Macro_F1_list) / float(len(Macro_F1_list))
            mean_Micro_F1 = sum(Micro_F1_list) / float(len(Micro_F1_list))
            fr.write(
                'expected repeated_times: {}, actual repeated_times: {}, mean results as follows:\n'
                .format(repeated_times, len(Macro_F1_list)))
            fr.write('\t\t Macro_F1 = {}\n'.format(mean_Macro_F1))
            fr.write('\t\t Micro_F1 = {}\n'.format(mean_Micro_F1))
            fr.write('details:\n')
            for repeat in range(len(Macro_F1_list)):
                fr.write(
                    '\t repeated {}/{}: Macro_F1 = {}, Micro_F1 = {}\n'.format(
                        repeat + 1, len(Macro_F1_list), Macro_F1_list[repeat],
                        Micro_F1_list[repeat]))
            fr_total.write('%.4f, %.4f ' % (mean_Macro_F1, mean_Micro_F1))
            summary.value.add(tag='macro_train_{}'.format(train_ratio),
                              simple_value=mean_Macro_F1)
            summary.value.add(tag='micro_train_{}'.format(train_ratio),
                              simple_value=mean_Micro_F1)
        fr.write(
            '\neval case: classify completed in {}s\n'.format(time.time() -
                                                              time_start))
        fr.close()
        fr_total.write('\n')
        fr_total.flush()
        summary_writer.add_summary(summary, cur_step)
        summary_writer.flush()
        logger.info(
            'classify completed in {}s\n================================='.
            format(time.time() - time_start))
        # copy ckpt-files according to last mean_Micro_F1 (0.9 ratio).
        # NOTE(review): mean_Micro_F1 here is whichever ratio sorted last —
        # presumably 0.9 when the full ratio list is used; confirm.
        if mean_Micro_F1 > best_micro:
            best_micro = mean_Micro_F1
            ckptIsExists = os.path.exists(
                os.path.join(ckpt_dir, 'model.ckpt-%d.index' % cur_step))
            if ckptIsExists:
                fr_best = open(os.path.join(classify_dir, 'best_ckpt.info'),
                               'w')
            else:
                # checkpoint already rotated away: append a note instead
                fr_best = open(os.path.join(classify_dir, 'best_ckpt.info'),
                               'a')
                fr_best.write(
                    "Note:the model.ckpt-best is the remainings of last best_ckpt!\n"
                    "the current best_ckpt model is loss, but the result is:\n"
                )
            fr_best.write("best_micro(for ratio 0.9): {}\n".format(best_micro))
            fr_best.write("best_ckpt: ckpt-{}\n".format(cur_step))
            fr_best.close()
            # keep the three TF checkpoint files as model.ckpt-best.*
            if ckptIsExists:
                sourceFile = os.path.join(
                    ckpt_dir, 'model.ckpt-%d.data-00000-of-00001' % cur_step)
                targetFile = os.path.join(
                    classify_dir, 'model.ckpt-best.data-00000-of-00001')
                if os.path.exists(targetFile):
                    os.remove(targetFile)
                shutil.copy(sourceFile, targetFile)
                sourceFile = os.path.join(ckpt_dir,
                                          'model.ckpt-%d.index' % cur_step)
                targetFile = os.path.join(classify_dir,
                                          'model.ckpt-best.index')
                if os.path.exists(targetFile):
                    os.remove(targetFile)
                shutil.copy(sourceFile, targetFile)
                sourceFile = os.path.join(ckpt_dir,
                                          'model.ckpt-%d.meta' % cur_step)
                targetFile = os.path.join(classify_dir, 'model.ckpt-best.meta')
                if os.path.exists(targetFile):
                    os.remove(targetFile)
                shutil.copy(sourceFile, targetFile)
        last_step = cur_step
    fr_total.close()
    summary_writer.close()
from model import Classifier as Model
import utils


def to_tensor(x, device=device):
    """Convert array-like *x* into a torch tensor on *device*.

    BUG FIX: the original ignored the `device` parameter and always called
    `.cuda()`; the tensor is now moved to the requested device (the default
    is the module-level `device`, so existing callers are unaffected).
    """
    x = np.array(x)
    x = torch.from_numpy(x)
    return x.to(device)


# Vectors and datasets are cached with pickle to skip re-parsing the raw files.
if os.path.isfile(vector_pickle):
    word_vectors, word_dict, category_dicts, label_dict = pickle.load(
        open(vector_pickle, 'rb'))
else:
    word_vectors, word_dict, category_dicts, label_dict = utils.get_vectors(
        train_file, num_categories, vec_file, word_dim)
    pickle.dump([word_vectors, word_dict, category_dicts, label_dict],
                open(vector_pickle, 'wb'), protocol=4)

if os.path.isfile(train_pickle):
    train_data = pickle.load(open(train_pickle, 'rb'))
else:
    train_data = utils.get_data(train_file, word_dict, category_dicts,
                                label_dict)
    pickle.dump(train_data, open(train_pickle, 'wb'), protocol=4)

if os.path.isfile(dev_pickle):
    dev_data = pickle.load(open(dev_pickle, 'rb'))
else:
    dev_data = utils.get_data(dev_file, word_dict, category_dicts, label_dict)
def eval_once(options):
    """Evaluate one set of embedding vectors with a node-classification task.

    Loads labeled nodes and their vectors, multi-hot-encodes the labels, then
    trains/evaluates LogisticRegression at each train ratio (repeated
    `options.repeated_times` times, optionally across worker processes via
    ``_classify_thread_body``) and writes mean/per-run Macro-F1 and Micro-F1
    to ``options.classify_path``. Sets the module-level ``features_matrix``
    and ``labels_matrix`` that the worker processes read.
    """
    global features_matrix, labels_matrix
    if not utils.check_rebuild(options.classify_path,
                               descrip='classify',
                               always_rebuild=options.always_rebuild):
        return
    logger.info('eval case: classify...')
    logger.info('\t save_path: {}'.format(options.classify_path))
    logger.info('\t classifier: LogisticRegression')
    logger.info('\t eval_online: {}'.format(options.eval_online))
    logger.info('\t eval_workers: {}'.format(options.eval_workers))
    logger.info('\t reading labeled data from file {}'.format(
        options.label_path))
    time_start = time.time()
    id_list, labels_list = utils.get_labeled_data(options.label_path)
    features_matrix, labels_list = utils.get_vectors(
        utils.get_KeyedVectors(options.vectors_path), id_list, labels_list)
    # one-hot / multi-hot encode the label lists
    mlb = MultiLabelBinarizer(range(options.label_size))
    labels_matrix = mlb.fit_transform(labels_list)
    logger.info('\t reading labeled data completed in {}s'.format(time.time() -
                                                                  time_start))
    logger.info('\t total labeled data size: {}'.format(
        np.size(features_matrix, axis=0)))
    logger.info('\t total labels size: {}'.format(options.label_size))
    # repeated 10times
    repeated_times = options.repeated_times
    # split ratio: a fixed ratio, or 0.9 down to 0.1 when unset
    if options.train_ratio > 0:
        train_ratio_list = [options.train_ratio]
    else:
        train_ratio_list = [v / 10.0 for v in range(9, 0, -1)]
    logger.info('\t repeat {} times for each train_ratio in {}'.format(
        repeated_times, train_ratio_list))
    # each ratio appears `repeated_times` times in the work list
    train_ratio_fulllist = [
        train_ratio for train_ratio in train_ratio_list
        for _ in range(repeated_times)
    ]
    # classify: write the report header
    fr = open(options.classify_path, 'w')
    fr.write('eval case: classify...\n')
    fr.write('\t save_path: {}\n'.format(options.classify_path))
    fr.write('\t classifier: LogisticRegression\n')
    fr.write('\t eval_online: {}\n'.format(options.eval_online))
    fr.write('\t eval_workers: {}\n'.format(options.eval_workers))
    fr.write('\t repeat {} times for each train_ratio in {}\n'.format(
        repeated_times, train_ratio_list))
    fr.write('\t total labeled data size: {}\n'.format(
        np.size(features_matrix, axis=0)))
    fr.write('\t total labels size: {}\n'.format(options.label_size))
    for i in range(options.label_size):
        fr.write('\t\t label {}: {}\n'.format(i, np.sum(labels_matrix[:, i])))
    if options.eval_workers > 1 and len(train_ratio_fulllist) > 1:
        # speed up by using multi-process: split the work list across workers,
        # distributing the remainder from the tail backwards
        if len(train_ratio_fulllist) <= options.eval_workers:
            train_ratios_per_worker = [[train_ratio]
                                       for train_ratio in train_ratio_fulllist]
        else:
            div, mod = divmod(len(train_ratio_fulllist), options.eval_workers)
            train_ratios_per_worker = [
                train_ratio_fulllist[div * i:div * (i + 1)]
                for i in range(options.eval_workers)
            ]
            for idx, train_ratio in enumerate(
                    train_ratio_fulllist[div * options.eval_workers:]):
                train_ratios_per_worker[len(train_ratios_per_worker) - 1 -
                                        idx].append(train_ratio)
        logger.info("\t using {} processes for evaling:".format(
            len(train_ratios_per_worker)))
        for idx, train_ratios in enumerate(train_ratios_per_worker):
            logger.info("\t process-{}: {}".format(idx, train_ratios))
        ret_list = []  # (train_ratio, macro, micro)
        # workers read the module-level features_matrix / labels_matrix
        with ProcessPoolExecutor(max_workers=options.eval_workers) as executor:
            for ret in executor.map(_classify_thread_body,
                                    train_ratios_per_worker):
                ret_list.extend(ret)
    else:
        ret_list = _classify_thread_body(train_ratio_fulllist)
    # group results per train_ratio: {ratio: [[macros], [micros]]}
    ret_dict = {}
    for ret in ret_list:
        if ret[0] in ret_dict:
            ret_dict[ret[0]][0].append(ret[1])
            ret_dict[ret[0]][1].append(ret[2])
        else:
            ret_dict[ret[0]] = [[ret[1]], [ret[2]]]
    for train_ratio, macro_micro in sorted(ret_dict.items(),
                                           key=lambda item: item[0]):
        fr.write('\n' + '-' * 20 + '\n' +
                 'train_ratio = {}\n'.format(train_ratio))
        Macro_F1_list = macro_micro[0]
        Micro_F1_list = macro_micro[1]
        if len(Macro_F1_list) != repeated_times:
            logger.warning(
                "warning: train_ratio = {} eval unmatched repeated_times: {} != {}"
                .format(train_ratio, len(Macro_F1_list), repeated_times))
        mean_Macro_F1 = sum(Macro_F1_list) / float(len(Macro_F1_list))
        mean_Micro_F1 = sum(Micro_F1_list) / float(len(Micro_F1_list))
        fr.write(
            'expected repeated_times: {}, actual repeated_times: {}, mean results as follows:\n'
            .format(repeated_times, len(Macro_F1_list)))
        fr.write('\t\t Macro_F1 = {}\n'.format(mean_Macro_F1))
        fr.write('\t\t Micro_F1 = {}\n'.format(mean_Micro_F1))
        fr.write('details:\n')
        for repeat in range(len(Macro_F1_list)):
            fr.write(
                '\t repeated {}/{}: Macro_F1 = {}, Micro_F1 = {}\n'.format(
                    repeat + 1, len(Macro_F1_list), Macro_F1_list[repeat],
                    Micro_F1_list[repeat]))
    fr.write('\neval case: classify completed in {}s'.format(time.time() -
                                                             time_start))
    fr.close()
    logger.info('eval case: classify completed in {}s'.format(time.time() -
                                                              time_start))