def get_embeddings(self, sents):
    """Encode *sents* into embedding vectors.

    Tokenizes the sentences (padded and truncated to ``self.max_length``),
    runs the model with gradient tracking disabled, and returns the first
    element of ``get_vectors`` moved to CPU as a NumPy array.
    """
    encoded = self.tokenizer(
        sents,
        max_length=self.max_length,
        padding=True,
        truncation=True,
        return_tensors='pt',
    )
    # Inference only — no autograd bookkeeping needed.
    with torch.no_grad():
        embeddings = get_vectors(self.model, encoded)[0]
    return embeddings.cpu().numpy()
def load_features(options, net):
    """Populate the module-level ``features_dict`` with embedding vectors.

    Gathers the nodes at both ends of ``options.eval_edge_type``, looks up
    their vectors from the keyed-vectors file at ``options.vectors_path``
    (missing ids filled per ``missing_rule="random"``), and maps each
    returned node id to its vector row in the global ``features_dict``.
    """
    global features_dict
    features_dict = {}
    started_at = time.time()
    logger.info('\t loading embedding features...')
    # Nodes of the start type followed by nodes of the target type.
    node_ids = (net.get_nodes(node_type=options.eval_edge_type[0])
                + net.get_nodes(node_type=options.eval_edge_type[1]))
    node_ids, vectors = utils.get_vectors(
        utils.get_KeyedVectors(options.vectors_path),
        node_ids,
        missing_rule="random")
    # Row i of the returned matrix belongs to node_ids[i].
    features_dict.update(zip(node_ids, vectors))
    logger.info(
        '\t loading embedding features completed in {}s'.format(time.time() - started_at))
def get_embeddings(self, sents, whitening=False):
    """Encode *sents* into sentence vectors.

    Tokenizes, runs the model without gradients, and pools via
    ``get_vectors(..., pool=self.pool)``.  When ``whitening`` is true the
    vectors are whitened with ``self.kernel``/``self.bias`` (kernel cut to
    ``self.n_components`` columns) and normalized by
    ``transform_and_normalize``; otherwise they are L2-normalized.
    """
    tokenized_sents = self.tokenizer(sents, max_length=self.max_length,
                                     padding=True, truncation=True,
                                     return_tensors='pt')
    with torch.no_grad():
        vecs = get_vectors(self.model, tokenized_sents, pool=self.pool)[0]
    vecs = vecs.cpu().numpy()
    if whitening:
        kernel, bias = self.kernel, self.bias
        # Keep only the leading n_components directions of the whitening kernel.
        kernel = kernel[:, :self.n_components]
        vecs = transform_and_normalize(vecs, kernel, bias)
        return vecs
    # Non-whitened path: plain L2 normalization.
    vecs = vector_l2_normlize(vecs)
    return vecs
else: num_classes = 10 batch_size = 32 st = 100 import os import time import pickle if os.path.isfile(pickle_file): x_f_train, x_s_train, x_len_train, y_train, \ x_f_dev, x_s_dev, x_len_dev, y_dev, \ x_dict, word_vectors, \ max_sen_len, max_seg_len, max_seq_len = pickle.load(open(pickle_file, 'rb')) else: x_dict = utils.get_flat_dict(train_file, x_index) word_vectors = utils.get_vectors(x_dict, vec_file) x_f_train, y_train, len_f_train = utils.get_flat_data(train_file, x_index, y_index, x_dict, num_classes) x_s_train, _, x_len_train, max_train_sen, len_s_train = utils.get_hier_data(train_file, x_index, y_index, x_dict, num_classes) x_f_dev, y_dev, len_f_dev = utils.get_flat_data(dev_file, x_index, y_index, x_dict, num_classes) x_s_dev, _, x_len_dev, max_dev_sen, len_s_dev = utils.get_hier_data(dev_file, x_index, y_index, x_dict, num_classes, max_train_sen) x_s_train, _, x_len_train, max_train_sen, len_s_train = utils.get_hier_data(train_file, x_index, y_index, x_dict, num_classes, max_dev_sen) max_sen_len = max(max_train_sen, max_dev_sen) max_seg_len = max(len_s_train, len_s_dev) max_seq_len = max(len_f_train, len_f_dev) pickle.dump([x_f_train, x_s_train, x_len_train, y_train, \ x_f_dev, x_s_dev, x_len_dev, y_dev, \ x_dict, word_vectors, \ max_sen_len, max_seg_len, max_seq_len], \ open(pickle_file, 'wb'), protocol=4) vocab_size = len(x_dict)
most_similar_pro.extend( utils.most_sim_cos(vectors_pro, query_text, args.num_responses)) most_similar_con.extend( utils.most_sim_cos(vectors_con, query_text, args.num_responses)) infile_pro.close() infile_con.close() elif args.sim_model == 'word2vec': path = pre_path + "embeddings/GoogleNews-vectors-negative300.bin" model = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True) texts_all.append(query_text) if args.responses_per_stance == 0: vectors = utils.get_vectors(model, texts_all) most_similar = utils.most_sim_cos(vectors, query_text, args.num_responses) else: texts_pro.append(query_text) texts_con.append(query_text) vectors_pro = utils.get_vectors(model, texts_pro) most_similar_pro = utils.most_sim_cos(vectors_pro, query_text, args.num_responses) vectors_con = utils.get_vectors(model, texts_con) most_similar_con = utils.most_sim_cos(vectors_con, query_text, args.num_responses) elif args.sim_model == 'glove': path = pre_path + "embeddings/glove.840B.300dword2vec.txt" # path = '/Users/youmna/Documents/Phd_year2/Coherence_acl/GC_wsj/glove.6B.100d.word2vec.txt' model = utils.read_emb_text(path)
u_test, p_test, u_test_count, p_test_count, \ x_dict, u_dict, p_dict, u_freq, p_freq, \ x_vectors = cPickle.load(open(pickle_file, 'rb')) else: x_dict = utils.get_dict(train_file, x_index) u_dict, p_dict, u_freq, p_freq = utils.get_up_dict(train_file, u_index, p_index) x_train, i_train, x_train_len, y_train, u_train, p_train, \ u_train_count, p_train_count = utils.get_flat_data(train_file, x_index, y_index, x_dict, num_classes, u_index, p_index, u_dict, p_dict, u_freq, p_freq) x_dev, i_dev, x_dev_len, y_dev, u_dev, p_dev, \ u_dev_count, p_dev_count = utils.get_flat_data(dev_file, x_index, y_index, x_dict, num_classes, u_index, p_index, u_dict, p_dict, u_freq, p_freq) x_test, i_test, x_test_len, y_test, u_test, p_test, \ u_test_count, p_test_count = utils.get_flat_data(test_file, x_index, y_index, x_dict, num_classes, u_index, p_index, u_dict, p_dict, u_freq, p_freq) x_vectors = utils.get_vectors(x_dict, vec_file, emb_size) cPickle.dump([x_train, i_train, x_train_len, y_train, \ u_train, p_train, u_train_count, p_train_count, \ x_dev, i_dev, x_dev_len, y_dev, \ u_dev, p_dev, u_dev_count, p_dev_count, \ x_test, i_test, x_test_len, y_test, \ u_test, p_test, u_test_count, p_test_count, \ x_dict, u_dict, p_dict, u_freq, p_freq, \ x_vectors], open(pickle_file, 'wb'), protocol=4) vocab_size = len(x_dict) target_len = num_classes user_vocab_size = len(u_dict) prod_vocab_size = len(p_dict) user_mean_freq = np.mean([u_freq[x] for x in u_freq]) prod_mean_freq = np.mean([p_freq[x] for x in p_freq])
# They are usually found in the same location of a sentence, # have the same parts of speech, and thus when # learning the word vectors, you end up getting similar weights. In the next week # we will go over how you learn them, but for now let's just enjoy using them. # # **Instructions:** Run the cell below. # In[143]: words = [ 'oil', 'gas', 'happy', 'sad', 'city', 'town', 'village', 'country', 'continent', 'petroleum', 'joyful' ] # given a list of words and the embeddings, it returns a matrix with all the embeddings X = get_vectors(word_embeddings, words) print('You have 11 words each of 300 dimensions thus X.shape is:', X.shape) # In[144]: # We have done the plotting for you. Just run this cell. result = compute_pca(X, 2) plt.scatter(result[:, 0], result[:, 1]) for i, word in enumerate(words): plt.annotate(word, xy=(result[i, 0] - 0.05, result[i, 1] + 0.1)) plt.show() # **What do you notice?** #
def eval_once(options):
    """Run one offline link-prediction evaluation (MAP and precision@K).

    Builds the eval and except(train) networks, loads the trained embedding
    matrix for node ids ``0..N-1``, evaluates (optionally with node sampling
    repeated ``options.repeated_times`` times, fanned out over
    ``options.eval_workers`` processes), and writes a report to
    ``options.link_prediction_path``.  Returns early if ``check_rebuild``
    says the result already exists.

    Side effects: sets module globals ``features_matrix``, ``net_eval``,
    ``net_except``, ``SAMPLE_NODES``, ``SAMPLE_RULE``, ``METIRC``, ``PREC_K``
    (read by the worker bodies), and writes the report file.
    """
    # NOTE(review): 'METIRC' is a typo for 'METRIC', but it is a module-level
    # global presumably read elsewhere — renaming needs a coordinated change.
    global features_matrix, net_eval, net_except, SAMPLE_NODES, SAMPLE_RULE, METIRC, PREC_K
    if not utils.check_rebuild(options.link_prediction_path,
                               descrip='link_prediction',
                               always_rebuild=options.always_rebuild):
        return
    logger.info('eval case: link-prediction ...')
    logger.info('\t save_path: {}'.format(options.link_prediction_path))
    logger.info('\t eval_data_path: {}'.format(options.eval_data_path))
    logger.info('\t except_data_path: {}'.format(options.except_data_path))
    logger.info('\t data_format: {}'.format(options.data_format))
    logger.info('\t metrics: MAP and precise@K')
    logger.info('\t max_index for precise@K: {}'.format(
        options.precK_max_index))
    logger.info('\t similarity_metric: {}'.format(options.similarity_metric))
    logger.info('\t eval_online: {}'.format(options.eval_online))
    logger.info('\t eval_interval: {}s'.format(options.eval_interval))
    logger.info('\t sample_nodes: {}'.format(options.sample_nodes))
    logger.info('\t sample_nodes_rule: {}'.format(options.sample_nodes_rule))
    logger.info('\t repeat {} times'.format(options.repeated_times))
    logger.info('\t eval_workers: {}'.format(options.eval_workers))

    # The network containing the edges to predict.
    logger.info("constructing eval network ...")
    net_eval = network.construct_network(data_path=options.eval_data_path,
                                         data_format=options.data_format,
                                         print_net_info=False,
                                         isdirected=options.isdirected)
    eval_net_nodes_size = net_eval.get_nodes_size()
    eval_net_edges_size = net_eval.get_edges_size()
    logger.info("eval_net_nodes_size = {}".format(eval_net_nodes_size))
    logger.info("eval_net_edges_size = {}".format(eval_net_edges_size))

    # The training network whose edges must be excluded from prediction.
    logger.info("constructing except(train) network ...")
    net_except = network.construct_network(data_path=options.except_data_path,
                                           data_format=options.data_format,
                                           print_net_info=False,
                                           isdirected=options.isdirected)
    except_net_nodes_size = net_except.get_nodes_size()
    except_net_edges_size = net_except.get_edges_size()
    logger.info("except_net_nodes_size = {}".format(except_net_nodes_size))
    logger.info("except_net_edges_size = {}".format(except_net_edges_size))

    id_list = list(range(eval_net_nodes_size))  # must be [0,1,2,3,...]
    SAMPLE_NODES = options.sample_nodes
    SAMPLE_RULE = options.sample_nodes_rule
    METIRC = options.similarity_metric
    PREC_K = options.precK_max_index

    # loading features_matrix(already trained)
    logger.info('\t reading embedding vectors from file {}'.format(
        options.vectors_path))
    time_start = time.time()
    features_matrix = utils.get_vectors(
        utils.get_KeyedVectors(options.vectors_path), id_list)
    logger.info(
        '\t reading embedding vectors completed in {}s'.format(time.time() -
                                                               time_start))
    logger.info('total loaded nodes: {}'.format(
        np.size(features_matrix, axis=0)))
    logger.info('the embedding dimension: {}'.format(
        np.size(features_matrix, axis=1)))

    # Mirror the run configuration into the report file header.
    fr = open(options.link_prediction_path, 'w')
    fr.write('eval case: link-prediction ...\n')
    fr.write('\t save_path: {}\n'.format(options.link_prediction_path))
    fr.write('\t eval_data_path: {}\n'.format(options.eval_data_path))
    fr.write('\t except_data_path: {}\n'.format(options.except_data_path))
    fr.write('\t data_format: {}\n'.format(options.data_format))
    fr.write('\t metrics: MAP and precise@K\n')
    fr.write('\t max_index for precise@K: {}\n'.format(
        options.precK_max_index))
    fr.write('\t similarity_metric: {}\n'.format(options.similarity_metric))
    fr.write('\t eval_online: {}\n'.format(options.eval_online))
    fr.write('\t eval_interval: {}s\n'.format(options.eval_interval))
    fr.write('\t sample_nodes: {}\n'.format(options.sample_nodes))
    fr.write('\t sample_nodes_rule: {}\n'.format(options.sample_nodes_rule))
    fr.write('\t repeat {} times\n'.format(options.repeated_times))
    fr.write('\t eval_workers: {}\n'.format(options.eval_workers))
    fr.write("eval_net_nodes_size = {}\n".format(eval_net_nodes_size))
    fr.write("eval_net_edges_size = {}\n".format(eval_net_edges_size))
    fr.write("except_net_nodes_size = {}\n".format(except_net_nodes_size))
    fr.write("except_net_edges_size = {}\n".format(except_net_edges_size))
    fr.write('total loaded nodes: {}\n'.format(np.size(features_matrix,
                                                       axis=0)))
    fr.write('the embedding dimension: {}\n'.format(
        np.size(features_matrix, axis=1)))

    if options.sample_nodes > 0:
        if options.eval_workers > 1 and options.repeated_times > 1:
            # speed up by using multi-process
            # Distribute repeated_times as evenly as possible over the workers.
            logger.info("\t allocating repeat_times to workers ...")
            if options.repeated_times <= options.eval_workers:
                times_per_worker = [1 for _ in range(options.repeated_times)]
            else:
                div, mod = divmod(options.repeated_times,
                                  options.eval_workers)
                times_per_worker = [div for _ in range(options.eval_workers)]
                # The first `mod` workers get one extra repetition.
                for idx in range(mod):
                    times_per_worker[idx] = times_per_worker[idx] + 1
            assert sum(
                times_per_worker
            ) == options.repeated_times, 'workers allocating failed: %d != %d' % (
                sum(times_per_worker), options.repeated_times)
            logger.info("\t using {} processes for evaling:".format(
                len(times_per_worker)))
            for idx, rep_times in enumerate(times_per_worker):
                logger.info("\t process-{}: repeat {} times".format(
                    idx, rep_times))
            ret_list = []  # [[MAP, precisionK_list], ... ]
            with ProcessPoolExecutor(
                    max_workers=options.eval_workers) as executor:
                for ret in executor.map(_sample_thread_body, times_per_worker):
                    ret_list.extend(ret)
            if len(ret_list) != options.repeated_times:
                logger.warning(
                    "warning: eval unmatched repeated_times: {} != {}".format(
                        len(ret_list), options.repeated_times))
        else:
            # Single process handles all repetitions itself.
            ret_list = _sample_thread_body(options.repeated_times)
    else:
        # no sampling, no repeat!
        ret_list = [_eval(net_eval, net_except)]  # [[MAP, precisionK_list]]

    # NOTE(review): these two messages look swapped — the 'expected
    # repeated_times' text is written on the sampling branch and the
    # 'due to the sample nodes = {}' text when sample_nodes == 0; verify intent.
    if options.sample_nodes > 0:
        fr.write(
            'expected repeated_times: {}, actual repeated_times: {}, mean results as follows:\n'
            .format(options.repeated_times, len(ret_list)))
    else:
        fr.write(
            'due to the sample nodes = {}, so actual repeated_times = {}, results as follows:\n'
            .format(options.sample_nodes, len(ret_list)))

    # Mean over all repetitions; precision@K is averaged element-wise.
    mean_MAP = np.mean([ret[0] for ret in ret_list])
    mean_precisionK = np.mean([ret[1] for ret in ret_list], axis=0)
    fr.write('\t\t MAP = {}\n'.format(mean_MAP))
    for k in range(options.precK_max_index):
        if k < len(mean_precisionK):
            fr.write('\t\t precisionK_{} = {}\n'.format(
                k + 1, mean_precisionK[k]))
        else:
            # Fewer precision entries than requested K — pad with None.
            fr.write('\t\t precisionK_{} = None\n'.format(k + 1))

    # Per-repetition breakdown.
    fr.write('details:\n')
    for repeat in range(len(ret_list)):
        fr.write('\t repeated {}/{}:\n'.format(repeat + 1, len(ret_list)))
        MAP = ret_list[repeat][0]
        precisionK_list = ret_list[repeat][1]
        fr.write('\t\t MAP = {}\n'.format(MAP))
        for k in range(options.precK_max_index):
            if k < len(precisionK_list):
                fr.write('\t\t precisionK_{} = {}\n'.format(
                    k + 1, precisionK_list[k]))
            else:
                fr.write('\t\t precisionK_{} = None\n'.format(k + 1))
    fr.write(
        '\neval case: link_prediction completed in {}s.'.format(time.time() -
                                                                time_start))
    fr.close()
    logger.info(
        'eval case: link_prediction completed in {}s.'.format(time.time() -
                                                              time_start))
    return
def eval_once(options):
    """Run one offline clustering evaluation (k-means, NMI metric).

    Loads labeled node ids and their embedding vectors, runs
    ``_cluster_thread_body`` ``options.repeated_times`` times (fanned out
    over ``options.eval_workers`` processes when both > 1), and writes the
    mean and per-repetition NMI to ``options.cluster_path``.  Returns early
    if ``check_rebuild`` says the result already exists.

    Side effects: sets module globals ``features_matrix``, ``labels_matrix``
    and ``LABEL_SIZE`` (read by the worker bodies) and writes the report file.
    """
    global features_matrix, labels_matrix, LABEL_SIZE
    if not utils.check_rebuild(options.cluster_path,
                               descrip='cluster',
                               always_rebuild=options.always_rebuild):
        return
    logger.info('eval case: cluster...')
    logger.info('\t save_path: {}'.format(options.cluster_path))
    logger.info('\t cluster: kmeans')
    logger.info('\t multilabel_rule: {}'.format(options.multilabel_rule))
    logger.info('\t eval_online: {}'.format(options.eval_online))
    logger.info('\t eval_workers: {}'.format(options.eval_workers))
    logger.info('\t repeat {} times'.format(options.repeated_times))

    logger.info('\t reading labeled data from file {}'.format(
        options.label_path))
    time_start = time.time()
    id_list, labels_list = utils.get_labeled_data(
        options.label_path, multilabel_rule=options.multilabel_rule)
    features_matrix, labels_list = utils.get_vectors(
        utils.get_KeyedVectors(options.vectors_path), id_list, labels_list)
    # Only the first label of each node is used for clustering evaluation.
    labels_matrix = np.array([item[0] for item in labels_list])
    LABEL_SIZE = options.label_size
    logger.info('\t reading labeled data completed in {}s'.format(time.time() -
                                                                  time_start))
    logger.info('\t total labeled data size: {}'.format(
        np.size(features_matrix, axis=0)))
    logger.info('\t total labels size: {}'.format(options.label_size))

    # cluster
    # Mirror the run configuration into the report file header.
    fr = open(options.cluster_path, 'w')
    fr.write('eval case: cluster...\n')
    fr.write('\t save_path: {}\n'.format(options.cluster_path))
    fr.write('\t cluster: kmeans\n')
    fr.write('\t multilabel_rule: {}\n'.format(options.multilabel_rule))
    fr.write('\t eval_online: {}\n'.format(options.eval_online))
    fr.write('\t eval_workers: {}\n'.format(options.eval_workers))
    fr.write('\t repeat {} times\n'.format(options.repeated_times))
    fr.write('\t total labeled data size: {}\n'.format(
        np.size(features_matrix, axis=0)))
    fr.write('\t total labels size: {}\n'.format(options.label_size))
    for i in range(options.label_size):
        fr.write('\t\t label {}: {}\n'.format(i, np.sum(labels_matrix == i)))

    if options.eval_workers > 1 and options.repeated_times > 1:
        # speed up by using multi-process
        # Distribute repeated_times as evenly as possible over the workers.
        logger.info("\t allocating repeat_times to workers ...")
        if options.repeated_times <= options.eval_workers:
            times_per_worker = [1 for _ in range(options.repeated_times)]
        else:
            div, mod = divmod(options.repeated_times, options.eval_workers)
            times_per_worker = [div for _ in range(options.eval_workers)]
            # The first `mod` workers get one extra repetition.
            for idx in range(mod):
                times_per_worker[idx] = times_per_worker[idx] + 1
        assert sum(
            times_per_worker
        ) == options.repeated_times, 'workers allocating failed: %d != %d' % (
            sum(times_per_worker), options.repeated_times)
        logger.info("\t using {} processes for evaling:".format(
            len(times_per_worker)))
        for idx, rep_times in enumerate(times_per_worker):
            logger.info("\t process-{}: repeat {} times".format(
                idx, rep_times))
        # NOTE(review): the bare `except` retries the identical pool run once
        # and swallows the original error — if the second attempt fails the
        # real cause is lost. Looks like a workaround for flaky workers;
        # confirm before touching.
        try:
            nmi_list = []  # (train_ratio, macro, micro)
            with ProcessPoolExecutor(
                    max_workers=options.eval_workers) as executor:
                for ret in executor.map(_cluster_thread_body,
                                        times_per_worker):
                    nmi_list.extend(ret)
        except:
            nmi_list = []  # (train_ratio, macro, micro)
            with ProcessPoolExecutor(
                    max_workers=options.eval_workers) as executor:
                for ret in executor.map(_cluster_thread_body,
                                        times_per_worker):
                    nmi_list.extend(ret)
        if len(nmi_list) != options.repeated_times:
            logger.warning(
                "warning: eval unmatched repeated_times: {} != {}".format(
                    len(nmi_list), options.repeated_times))
    else:
        # Single process; same retry-once-on-any-error pattern as above.
        try:
            nmi_list = _cluster_thread_body(options.repeated_times)
        except:
            nmi_list = _cluster_thread_body(options.repeated_times)

    mean_nmi = sum(nmi_list) / float(len(nmi_list))
    fr.write(
        'expected repeated_times: {}, actual repeated_times: {}, mean results as follows:\n'
        .format(options.repeated_times, len(nmi_list)))
    fr.write('\t\t NMI = {}\n'.format(mean_nmi))
    fr.write('details:\n')
    for repeat in range(len(nmi_list)):
        fr.write('\t repeated {}/{}: NMI = {}\n'.format(
            repeat + 1, len(nmi_list), nmi_list[repeat]))
    fr.write('\neval case: cluster completed in {}s.'.format(time.time() -
                                                             time_start))
    fr.close()
    logger.info('eval case: cluster completed in {}s.'.format(time.time() -
                                                              time_start))
    return
def eval_once(options):
    """Run one offline visualization evaluation (t-SNE scatter plot).

    Loads labeled nodes of ``options.eval_node_type`` and their embedding
    vectors, plots them in 2-D via ``plot_embedding_in_2D``, and writes a
    report (including the clustering-center-distance score, CCD) to
    ``options.visualization_path``.  Returns early if ``check_rebuild``
    says the result already exists.
    """
    # visual_dir, visual_file = os.path.split(options.visualization_path)
    if not utils.check_rebuild(options.visualization_path,
                               descrip='visualization',
                               always_rebuild=options.always_rebuild):
        return
    # print logger
    logger.info('eval case: visualization...')
    logger.info('\t data_dir = {}'.format(options.data_dir))
    logger.info('\t data_name = {}'.format(options.data_name))
    logger.info('\t isdirected = {}'.format(options.isdirected))
    logger.info('\t label_path = {}'.format(options.label_path))
    logger.info('\t label_size = {}'.format(options.label_size))
    logger.info('\t eval_node_type: {}'.format(options.eval_node_type))
    logger.info('\t save_path: {}\n'.format(options.visualization_path))
    logger.info('\t method: t-SNE')
    logger.info('\t multilabel_rule: {}'.format(options.multilabel_rule))
    logger.info('\t marker_size: {}'.format(options.marker_size))
    logger.info('\t eval_online: {}'.format(options.eval_online))

    # get embedding vectors and markersize
    logger.info('\t reading labeled data from file {}'.format(options.label_path))
    time_start = time.time()
    id_list, labels_list = utils.get_labeled_data(options.label_path,
                                                  type=options.eval_node_type,
                                                  multilabel_rule=options.multilabel_rule,
                                                  type_filepath=os.path.join(options.data_dir, options.data_name + ".nodes"))
    id_list, features_matrix, labels_list = utils.get_vectors(utils.get_KeyedVectors(options.vectors_path), id_list, labels_list)
    # Only the first label of each node is used for coloring the plot.
    labels_matrix = np.array([item[0] for item in labels_list])
    logger.info('\t reading labeled data completed in {}s'.format(time.time() - time_start))
    logger.info('\t total labeled data size: {}'.format(np.size(features_matrix,axis=0)))
    logger.info('\t the labels data embedding_dimension: {}'.format(np.size(features_matrix,axis=1)))
    logger.info('\t total labels size: {}'.format(options.label_size))
    for i in range(options.label_size):
        logger.info('\t\t label {}: {}'.format(i, np.sum(labels_matrix == i)))

    # Mirror the run configuration into the report file header.
    fr = open(options.visualization_path, 'w')
    fr.write('eval case: visualization...\n')
    fr.write('\t data_dir = {}\n'.format(options.data_dir))
    fr.write('\t data_name = {}\n'.format(options.data_name))
    fr.write('\t isdirected = {}\n'.format(options.isdirected))
    fr.write('\t label_path = {}\n'.format(options.label_path))
    fr.write('\t label_size = {}\n'.format(options.label_size))
    fr.write('\t eval_node_type: {}\n'.format(options.eval_node_type))
    fr.write('\t save_path: {}\n\n'.format(options.visualization_path))
    fr.write('\t method: t-SNE\n')
    fr.write('\t multilabel_rule: {}\n'.format(options.multilabel_rule))
    fr.write('\t marker_size: {}\n'.format(options.marker_size))
    fr.write('\t eval_online: {}\n'.format(options.eval_online))
    fr.write('\t total labeled data size: {}\n'.format(np.size(features_matrix, axis=0)))
    fr.write('\t the labels data embedding_dimension: {}\n'.format(np.size(features_matrix, axis=1)))
    fr.write('\t total labels size: {}\n'.format(options.label_size))
    for i in range(options.label_size):
        fr.write('\t\t label {}: {}\n'.format(i, np.sum(labels_matrix==i)))

    # Figure is saved next to the report; name encodes the embedding dimension.
    figure_name = "visualization_" + str(np.size(features_matrix, axis=1))
    figure_path = os.path.join(os.path.split(options.visualization_path)[0],figure_name)
    CCD = plot_embedding_in_2D(Markersize=options.marker_size,
                               features_matrix=features_matrix,
                               labels_matrix=labels_matrix,
                               label_size=options.label_size,
                               figure_path = figure_path)
    fr.write('\n figure_path: {}\n'.format(figure_path))
    fr.write(' clustering_center_distance_sim: {}\n'.format(CCD))
    fr.write('\neval case: visualization completed in {}s\n ======================'.format(time.time() - time_start))
    fr.close()
    logger.info('eval case: visualization completed in {}s\n ======================'.format(time.time() - time_start))
def eval_online(options):
    """Continuously visualize embeddings as training checkpoints appear.

    Polls the checkpoint directory next to ``options.vectors_path``; each
    time a newer checkpoint is seen, re-reads the embedding vectors, plots
    them in 2-D, logs the CCD score to a per-checkpoint report file and a
    TensorBoard summary, and keeps a copy of the best-scoring checkpoint.
    Runs until a ``RUN_SUCCESS`` marker file appears.

    Synchronization with the trainer uses lock-marker files:
    ``<vectors_path>.writing`` (trainer is writing, do not read) and
    ``<vectors_path>.reading_visualization_<type>`` (this reader is active).
    """
    visual_dir = os.path.split(options.visualization_path)[0]
    if not utils.check_rebuild(visual_dir,
                               descrip='visualization',
                               always_rebuild=options.always_rebuild):
        return
    if not os.path.exists(visual_dir):
        os.makedirs(visual_dir)
    # print logger
    logger.info('eval case: visualization...')
    logger.info('\t data_dir = {}'.format(options.data_dir))
    logger.info('\t data_name = {}'.format(options.data_name))
    logger.info('\t isdirected = {}'.format(options.isdirected))
    logger.info('\t label_path = {}'.format(options.label_path))
    logger.info('\t label_size = {}'.format(options.label_size))
    logger.info('\t eval_node_type: {}'.format(options.eval_node_type))
    logger.info('\t save_dir: {}\n'.format(visual_dir))
    logger.info('\t method: t-SNE')
    logger.info('\t multilabel_rule: {}'.format(options.multilabel_rule))
    logger.info('\t marker_size: {}'.format(options.marker_size))
    logger.info('\t eval_online: {}'.format(options.eval_online))
    logger.info('\t eval_interval: {}s'.format(options.eval_interval))
    logger.info('\t reading labeled data from file {}'.format(options.label_path))

    # get embedding vectors and markersize
    time_start = time.time()
    # NOTE(review): 'totoal' is a typo for 'total' kept as-is (local names).
    id_list_totoal, labels_list_totoal = utils.get_labeled_data(options.label_path,
                                                                type=options.eval_node_type,
                                                                multilabel_rule=options.multilabel_rule,
                                                                type_filepath=os.path.join(options.data_dir, options.data_name + ".nodes"))
    logger.info('\t reading labeled data completed in {}s'.format(time.time() - time_start))
    logger.info('\t total labeled data size: {}'.format(len(id_list_totoal)))
    logger.info('\t total labels size: {}'.format(options.label_size))

    # Summary report accumulating one CCD line per evaluated checkpoint.
    fr_total = open(options.visualization_path, 'w')
    fr_total.write('eval case: visualization...\n')
    fr_total.write('\t data_dir = {}\n'.format(options.data_dir))
    fr_total.write('\t data_name = {}\n'.format(options.data_name))
    fr_total.write('\t isdirected = {}\n'.format(options.isdirected))
    fr_total.write('\t label_path = {}\n'.format(options.label_path))
    fr_total.write('\t label_size = {}\n'.format(options.label_size))
    fr_total.write('\t eval_node_type: {}\n'.format(options.eval_node_type))
    fr_total.write('\t save_dir: {}\n\n'.format(visual_dir))
    fr_total.write('\t method: t-SNE\n')
    fr_total.write('\t multilabel_rule: {}\n'.format(options.multilabel_rule))
    fr_total.write('\t marker_size: {}\n'.format(options.marker_size))
    fr_total.write('\t eval_online: {}\n'.format(options.eval_online))
    fr_total.write('\t eval_interval: {}s\n'.format(options.eval_interval))
    fr_total.write('\t total labeled data size: {}\n'.format(len(id_list_totoal)))
    fr_total.write('\t total labels size: {}\n'.format(options.label_size))
    fr_total.write('\t results(CCD-clustering_center_distance_sim):\n'
                   '=============================================================\n')
    fr_total.write('finish_time\tckpt\tCCD\n')

    # Seed TensorBoard with a zero CCD point at step 0.
    last_step = 0
    summary_writer = tf.summary.FileWriter(visual_dir, tf.Graph())
    summary = tf.Summary()
    summary.value.add(tag='CCD', simple_value=0.)
    summary_writer.add_summary(summary, last_step)
    best_CCD = 0

    # Wait until the trainer has produced a first checkpoint.
    ckpt_dir = os.path.join(os.path.split(options.vectors_path)[0], 'ckpt')
    ckpt = tf.train.get_checkpoint_state(ckpt_dir)
    while (not (ckpt and ckpt.model_checkpoint_path)):
        logger.info("model and vectors not exist, waiting...")
        time.sleep(options.eval_interval)
        ckpt = tf.train.get_checkpoint_state(ckpt_dir)

    reading = options.vectors_path + ".reading_visualization_{}".format(options.eval_node_type)
    writing = options.vectors_path + ".writing"
    while (options.eval_online):
        # Inner loop: wait for a new checkpoint we can safely read.
        while True:
            ckpt = tf.train.get_checkpoint_state(ckpt_dir)
            cur_step = int(ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])
            if cur_step <= last_step or (not os.path.exists(options.vectors_path)) or os.path.exists(writing):
                # Trainer finished successfully — nothing more to evaluate.
                if os.path.exists(os.path.join(os.path.split(options.vectors_path)[0], "RUN_SUCCESS")):
                    return
                time.sleep(options.eval_interval)
                continue
            # ready for reading
            logger.info("\t declare for reading ...")
            open(reading, "w")  # declare
            # Grace period, then re-check that the trainer didn't start
            # writing while we were declaring.
            time.sleep(30)
            ckpt = tf.train.get_checkpoint_state(ckpt_dir)
            cur_step = int(ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])
            if cur_step <= last_step or (not os.path.exists(options.vectors_path)) or os.path.exists(writing):
                os.remove(reading)  # undeclare
                logger.info("\t confliction! undeclare and waiting ...")
                time.sleep(options.eval_interval)
                continue
            break

        logger.info("\t eval ckpt-{}.......".format(cur_step))
        time_start = time.time()
        logger.info('\t reading embedding vectors from file {}'.format(options.vectors_path))
        id_list, features_matrix, labels_list = utils.get_vectors(utils.get_KeyedVectors(options.vectors_path), id_list_totoal, labels_list_totoal)
        os.remove(reading)  # synchrolock for multi-process
        logger.info("\t done for reading ...")
        # Only the first label of each node is used for coloring the plot.
        labels_matrix = np.array([item[0] for item in labels_list])
        logger.info('\t reading labeled data completed in {}s'.format(time.time() - time_start))
        logger.info('\t total labeled data size: {}'.format(np.size(features_matrix, axis=0)))
        logger.info('\t total labels size: {}'.format(options.label_size))
        for i in range(options.label_size):
            logger.info('\t\t label {}: {}'.format(i, np.sum(labels_matrix == i)))

        # visualization
        # Per-checkpoint report file, suffixed with the checkpoint step.
        fr = open(options.visualization_path + '.{}'.format(cur_step), 'w')
        fr.write('eval case: visualization...\n')
        fr.write('\t data_dir = {}\n'.format(options.data_dir))
        fr.write('\t data_name = {}\n'.format(options.data_name))
        fr.write('\t isdirected = {}\n'.format(options.isdirected))
        fr.write('\t label_path = {}\n'.format(options.label_path))
        fr.write('\t label_size = {}\n'.format(options.label_size))
        fr.write('\t eval_node_type: {}\n'.format(options.eval_node_type))
        fr.write('\t method: t-SNE\n')
        fr.write('\t multilabel_rule: {}\n'.format(options.multilabel_rule))
        fr.write('\t marker_size: {}\n'.format(options.marker_size))
        fr.write('\t eval_online: {}\n'.format(options.eval_online))
        fr.write('\t eval_interval: {}s\n'.format(options.eval_interval))
        fr.write('\t total labeled data size: {}\n'.format(np.size(features_matrix, axis=0)))
        fr.write('\t total labels size: {}\n'.format(options.label_size))
        for i in range(options.label_size):
            fr.write('\t\t label {}: {}\n'.format(i, np.sum(labels_matrix == i)))

        fr_total.write('%s ckpt-%-9d: ' % (time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())), cur_step))
        summary = tf.Summary()
        figure_name = "visualization_" + str(np.size(features_matrix, axis=1)) + '.{}'.format(cur_step)
        figure_path = os.path.join(visual_dir, figure_name)
        CCD = plot_embedding_in_2D(Markersize=options.marker_size,
                                   features_matrix=features_matrix,
                                   labels_matrix=labels_matrix,
                                   label_size=options.label_size,
                                   figure_path=figure_path)
        fr.write('\n figure_path: {}\n'.format(figure_path))
        fr.write(' clustering_center_distance_sim:{}\n'.format(CCD))
        fr.write('\neval case: visualization completed in {}s\n ======================'.format(time.time() - time_start))
        fr.close()
        fr_total.write('%.4f\n' % CCD)
        fr_total.flush()
        summary.value.add(tag='CCD', simple_value=CCD)
        summary_writer.add_summary(summary, cur_step)
        summary_writer.flush()
        logger.info('visualization completed in {}s\n================================='.format(time.time() - time_start))

        # copy ckpt-files according to last mean_Micro_F1 (0.9 ratio).
        if CCD > best_CCD:
            best_CCD = CCD
            ckptIsExists = os.path.exists(os.path.join(ckpt_dir, 'model.ckpt-%d.index' % cur_step))
            if ckptIsExists:
                fr_best = open(os.path.join(visual_dir, 'best_ckpt.info'), 'w')
            else:
                # Checkpoint already rotated away — append a note instead of
                # overwriting the previous best's info.
                fr_best = open(os.path.join(visual_dir, 'best_ckpt.info'), 'a')
                fr_best.write("Note:the model.ckpt-best is the remainings of last best_ckpt!\n"
                              "the current best_ckpt model is loss, but the result is:\n")
            fr_best.write("best_CCD: {}\n".format(best_CCD))
            fr_best.write("best_ckpt: ckpt-{}\n".format(cur_step))
            fr_best.close()
            if ckptIsExists:
                # Copy the three checkpoint files to stable 'model.ckpt-best' names.
                sourceFile = os.path.join(ckpt_dir, 'model.ckpt-%d.data-00000-of-00001' % cur_step)
                targetFile = os.path.join(visual_dir, 'model.ckpt-best.data-00000-of-00001')
                if os.path.exists(targetFile):
                    os.remove(targetFile)
                shutil.copy(sourceFile, targetFile)
                sourceFile = os.path.join(ckpt_dir, 'model.ckpt-%d.index' % cur_step)
                targetFile = os.path.join(visual_dir, 'model.ckpt-best.index')
                if os.path.exists(targetFile):
                    os.remove(targetFile)
                shutil.copy(sourceFile, targetFile)
                sourceFile = os.path.join(ckpt_dir, 'model.ckpt-%d.meta' % cur_step)
                targetFile = os.path.join(visual_dir, 'model.ckpt-best.meta')
                if os.path.exists(targetFile):
                    os.remove(targetFile)
                shutil.copy(sourceFile, targetFile)
        last_step = cur_step

    fr_total.close()
    summary_writer.close()
    return
#source_lex = "en-it.test" source_lex = "es-na.test" words_scr_lexicon, words_trg_lexicon = utils.get_lexicon(source_lex) print("size of lexicon:", set(words_scr_lexicon).__len__()) #print(len(words_scr_lexicon), len(words_trg_lexicon)) source_str = "es.n2v" target_str = "na.n2v" #source_str = "es.norm.n2v" #source_str = "en.fst" source_vec = utils.open_file(source_str) words_src, source_vec = utils.read(source_vec, is_zipped=False) # lista de palabras en español del lexicon semilla eval_src = list(set(words_scr_lexicon)) src_vec = utils.get_vectors(eval_src, words_src, source_vec) print("source_vec: " + source_str) #print(src_vec.shape) #target_str = "it.fst" target_vec = utils.open_file(target_str) words_trg, target_vec = utils.read(target_vec, is_zipped=False) print("target_vec: " + target_str) #eval_it = list(set(it)) #trg_vec = get_vectors(eval_it, words_it, it_vec) #print(target_vec.shape) test_vectors = src_vec
front_vector, draw_img, img_name) img_src = "./test_imgs" info_src = "./test_info" input_size = 224 test = Test( MobileNetV2, "./results/MobileNetV2_1.0_classes_66_input_224/snapshot/MobileNetV2_1.0_classes_66_input_224_epoch_50_front_vector.pkl", 66) transform = transforms.Compose([ transforms.ToTensor(), transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) ]) imgs = sorted(os.listdir(img_src)) infos = sorted(os.listdir(info_src)) for img, info in zip(imgs, infos): img_name = img img = cv.imread(os.path.join(img_src, img)) draw_img = img.copy() img = cv.resize(img, (224, 224)) img = transform(img) img = img.unsqueeze(0) info = get_info_from_txt(os.path.join(info_src, info)) front_vector, _ = get_vectors(info) test.test_per_img(img, draw_img, front_vector, img_name)
vectors_con = {d: vectors_all[d] for d in prop_content_texts_con} most_similar_con = utils.most_sim_cos(vectors_con, text, args.num_responses) vectors_neutral = { d: vectors_all[d] for d in prop_content_texts_neutral } most_similar_neutral = utils.most_sim_cos(vectors_neutral, text, args.num_responses) elif args.sim_model == 'word2vec': path = pre_path + "embeddings/GoogleNews-vectors-negative300.bin" model = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True) if args.responses_per_stance == 0: vectors = utils.get_vectors(model, prop_content_texts) most_similar = utils.most_sim_cos(vectors, text, args.num_responses) else: vectors_pro = utils.get_vectors(model, prop_content_texts_pro) most_similar_pro = utils.most_sim_cos(vectors_pro, text, args.num_responses) vectors_con = utils.get_vectors(model, prop_content_texts_con) most_similar_con = utils.most_sim_cos(vectors_con, text, args.num_responses) vectors_neutral = utils.get_vectors(model, prop_content_texts_neutral) most_similar_neutral = utils.most_sim_cos(vectors_neutral, text, args.num_responses) elif args.sim_model == 'glove': path = pre_path + "embeddings/glove.840B.300dword2vec.txt"
# Run the trained MobileNetV2 head-pose model over every image in
# ./test_imgs, pairing each image with its annotation file in ./test_info
# (files are matched by sorted order — assumes matching filenames; verify).
img_src = "./test_imgs"
info_src = "./test_info"
input_size = 224
test = Test(
    MobileNetV2,
    "./results/MobileNetV2_1.5_classes_66_input_224/snapshot/MobileNetV2_1.5_classes_66_input_224_epoch_37.pkl",
    66)
# Standard ImageNet normalization applied after ToTensor.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])
imgs = sorted(os.listdir(img_src))
infos = sorted(os.listdir(info_src))
for img, info in zip(imgs, infos):
    img_name = img
    img = cv.imread(os.path.join(img_src, img))
    # Keep an unmodified copy for drawing/visualization.
    draw_img = img.copy()
    img = cv.resize(img, (224, 224))
    img = transform(img)
    img = img.unsqueeze(0)  # add batch dimension
    info = get_info_from_txt(os.path.join(info_src, info))
    # NOTE(review): ground-truth vectors are computed but only printed —
    # test_per_img is called without them here; confirm that is intended.
    front_vector, right_vector, up_vector = get_vectors(info)
    print(front_vector)
    test.test_per_img(img, draw_img, img_name)
def eval_online(options):
    """Online clustering (k-means / NMI) evaluation loop.

    Polls the training checkpoint directory and, each time a newer
    checkpoint plus its exported embedding vectors appear, clusters the
    labeled nodes and logs the NMI score (repeated runs, optionally
    across worker processes).  Writes a summary file, per-checkpoint
    result files, TensorBoard summaries, and keeps a copy of the
    best-scoring checkpoint.

    NOTE(review): coordination with the trainer uses sentinel files
    (`*.reading_cluster`, `*.writing`, `RUN_SUCCESS`) rather than real
    locks — presumably both processes honour this protocol; verify
    against the writer side.
    """
    global features_matrix, labels_matrix, LABEL_SIZE
    cluster_dir = os.path.split(options.cluster_path)[0]
    # Skip entirely if results exist and a rebuild was not requested.
    if not utils.check_rebuild(cluster_dir, descrip='cluster',
                               always_rebuild=options.always_rebuild):
        return
    if not os.path.exists(cluster_dir):
        os.makedirs(cluster_dir)
    logger.info('eval case: cluster...')
    logger.info('\t save_path: {}'.format(options.cluster_path))
    logger.info('\t cluster: kmeans')
    logger.info('\t multilabel_rule: {}'.format(options.multilabel_rule))
    logger.info('\t eval_online: {}'.format(options.eval_online))
    logger.info('\t eval_interval: {}s'.format(options.eval_interval))
    logger.info('\t eval_workers: {}'.format(options.eval_workers))
    logger.info('\t repeat {} times'.format(options.repeated_times))
    logger.info('\t total labels size: {}'.format(options.label_size))
    if options.eval_workers > 1 and options.repeated_times > 1:
        # speed up by using multi-process: split the repeat count as
        # evenly as possible over the available workers.
        logger.info("\t allocating repeat_times to workers ...")
        if options.repeated_times <= options.eval_workers:
            times_per_worker = [1 for _ in range(options.repeated_times)]
        else:
            div, mod = divmod(options.repeated_times, options.eval_workers)
            times_per_worker = [div for _ in range(options.eval_workers)]
            for idx in range(mod):
                times_per_worker[idx] = times_per_worker[idx] + 1
        assert sum(
            times_per_worker
        ) == options.repeated_times, 'workers allocating failed: %d != %d' % (
            sum(times_per_worker), options.repeated_times)
        logger.info("\t using {} processes for evaling:".format(
            len(times_per_worker)))
        for idx, rep_times in enumerate(times_per_worker):
            logger.info("\t process-{}: repeat {} times".format(
                idx, rep_times))
    # Summary file accumulating one NMI line per evaluated checkpoint.
    fr_total = open(options.cluster_path, 'w')
    fr_total.write('eval case: cluster...\n')
    fr_total.write('\t save_dir: {}\n'.format(cluster_dir))
    fr_total.write('\t cluster: kmeans\n')
    fr_total.write('\t multilabel_rule: {}\n'.format(options.multilabel_rule))
    fr_total.write('\t eval_online: {}\n'.format(options.eval_online))
    fr_total.write('\t eval_interval: {}s\n'.format(options.eval_interval))
    fr_total.write('\t eval_workers: {}\n'.format(options.eval_workers))
    fr_total.write('\t repeat {} times\n'.format(options.repeated_times))
    fr_total.write('\t total labels size: {}\n'.format(options.label_size))
    fr_total.write(
        '\t results(NMI):\n=============================================================\n'
    )
    fr_total.write('finish_time\tckpt\tNMI\n')
    logger.info('\t reading labeled data from file {}'.format(
        options.label_path))
    time_start = time.time()
    id_list, labels_list = utils.get_labeled_data(
        options.label_path, multilabel_rule=options.multilabel_rule)
    logger.info('\t reading labeled data completed in {}s'.format(time.time() -
                                                                  time_start))
    last_step = 0
    # TensorBoard writer; seed the 'nmi' curve with 0 at step 0.
    summary_writer = tf.summary.FileWriter(cluster_dir, tf.Graph())
    summary = tf.Summary()
    summary.value.add(tag='nmi', simple_value=0.)
    summary_writer.add_summary(summary, last_step)
    best_nmi = 0
    ckpt_dir = os.path.join(os.path.split(options.vectors_path)[0], 'ckpt')
    ckpt = tf.train.get_checkpoint_state(ckpt_dir)
    # Block until the trainer has produced a first checkpoint.
    while (not (ckpt and ckpt.model_checkpoint_path)):
        logger.info("\t model and vectors not exist, waiting ...")
        time.sleep(options.eval_interval)
        ckpt = tf.train.get_checkpoint_state(ckpt_dir)
    # Sentinel files used as an ad-hoc reader/writer lock on vectors_path.
    reading = options.vectors_path + ".reading_cluster"
    writing = options.vectors_path + ".writing"
    while (options.eval_online):
        # Wait for a strictly newer checkpoint that is safe to read.
        while True:
            ckpt = tf.train.get_checkpoint_state(ckpt_dir)
            # Step number is the suffix of the checkpoint filename.
            cur_step = int(
                ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])
            if cur_step <= last_step or (not os.path.exists(
                    options.vectors_path)) or os.path.exists(writing):
                # Trainer signals completion via a RUN_SUCCESS marker file.
                if os.path.exists(
                        os.path.join(
                            os.path.split(options.vectors_path)[0],
                            "RUN_SUCCESS")):
                    return
                time.sleep(options.eval_interval)
                continue
            # ready for reading
            logger.info("\t declare for reading ...")
            open(reading, "w")  # declare
            # Grace period, then re-check that no writer sneaked in.
            time.sleep(30)
            ckpt = tf.train.get_checkpoint_state(ckpt_dir)
            cur_step = int(
                ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])
            if cur_step <= last_step or (not os.path.exists(
                    options.vectors_path)) or os.path.exists(writing):
                os.remove(reading)  # undeclare
                logger.info("\t confliction! undeclare and waiting ...")
                time.sleep(options.eval_interval)
                continue
            break
        logger.info("\t eval ckpt-{}.......".format(cur_step))
        time_start = time.time()
        logger.info('\t reading embedding vectors from file {}'.format(
            options.vectors_path))
        features_matrix, labels_list = utils.get_vectors(
            utils.get_KeyedVectors(options.vectors_path), id_list, labels_list)
        os.remove(reading)  # synchrolock for multi-process
        logger.info("\t done for reading ...")
        # Single-label view: keep only the first label of each node.
        labels_matrix = np.array([item[0] for item in labels_list])
        LABEL_SIZE = options.label_size
        logger.info(
            '\t reading labeled data completed in {}s'.format(time.time() -
                                                              time_start))
        logger.info('\t total labeled data size: {}'.format(
            np.size(features_matrix, axis=0)))
        logger.info('\t total labels size: {}'.format(options.label_size))
        # cluster: per-checkpoint detail file.
        fr = open(options.cluster_path + '.{}'.format(cur_step), 'w')
        fr.write('eval case: cluster...\n')
        fr.write('\t cluster: kmeans\n')
        fr.write('\t multilabel_rule: {}\n'.format(options.multilabel_rule))
        fr.write('\t eval_workers: {}\n'.format(options.eval_workers))
        fr.write('\t repeat {} times\n'.format(options.repeated_times))
        fr.write('\t total labeled data size: {}\n'.format(
            np.size(features_matrix, axis=0)))
        fr.write('\t total labels size: {}\n'.format(options.label_size))
        for i in range(options.label_size):
            fr.write('\t\t label {}: {}\n'.format(i,
                                                  np.sum(labels_matrix == i)))
        if options.eval_workers > 1 and options.repeated_times > 1:
            # speed up by using multi-process
            fr.write("\t using {} processes for evaling:\n".format(
                len(times_per_worker)))
            for idx, rep_times in enumerate(times_per_worker):
                fr.write("\t process-{}: repeat {} times\n".format(
                    idx, rep_times))
            # NOTE(review): the bare `except:` retries the whole pool once,
            # then lets any second failure propagate — looks like a
            # workaround for transient worker failures; confirm intent.
            try:
                nmi_list = []
                with ProcessPoolExecutor(
                        max_workers=options.eval_workers) as executor:
                    for ret in executor.map(_cluster_thread_body,
                                            times_per_worker):
                        nmi_list.extend(ret)
            except:
                nmi_list = []
                with ProcessPoolExecutor(
                        max_workers=options.eval_workers) as executor:
                    for ret in executor.map(_cluster_thread_body,
                                            times_per_worker):
                        nmi_list.extend(ret)
            if len(nmi_list) != options.repeated_times:
                logger.warning(
                    "warning: eval unmatched repeated_times: {} != {}".format(
                        len(nmi_list), options.repeated_times))
        else:
            # Single-process path, with the same one-retry pattern.
            try:
                nmi_list = _cluster_thread_body(options.repeated_times)
            except:
                nmi_list = _cluster_thread_body(options.repeated_times)
        fr_total.write('%s ckpt-%-9d: ' % (time.strftime(
            '%Y-%m-%d %H:%M:%S', time.localtime(time.time())), cur_step))
        summary = tf.Summary()
        mean_nmi = sum(nmi_list) / float(len(nmi_list))
        fr.write(
            'expected repeated_times: {}, actual repeated_times: {}, mean results as follows:\n'
            .format(options.repeated_times, len(nmi_list)))
        fr.write('\t\t NMI = {}\n'.format(mean_nmi))
        fr.write('details:\n')
        for repeat in range(len(nmi_list)):
            fr.write('\t repeated {}/{}: NMI = {}\n'.format(
                repeat + 1, len(nmi_list), nmi_list[repeat]))
        fr.write('\neval case: cluster completed in {}s\n'.format(time.time() -
                                                                  time_start))
        fr.close()
        # fr_total.write('%.4f\n' % mean_nmi)
        fr_total.write('{}\n'.format(mean_nmi))
        fr_total.flush()
        summary.value.add(tag='nmi', simple_value=mean_nmi)
        summary_writer.add_summary(summary, cur_step)
        summary_writer.flush()
        logger.info(
            'cluster completed in {}s\n================================='.
            format(time.time() - time_start))
        # copy ckpt-files according to last mean_Micro_F1 (0.9 ratio).
        if mean_nmi > best_nmi:
            best_nmi = mean_nmi
            ckptIsExists = os.path.exists(
                os.path.join(ckpt_dir, 'model.ckpt-%d.index' % cur_step))
            if ckptIsExists:
                fr_best = open(os.path.join(cluster_dir, 'best_ckpt.info'),
                               'w')
            else:
                # Checkpoint already rotated away: append a note instead of
                # overwriting the previous best-checkpoint record.
                fr_best = open(os.path.join(cluster_dir, 'best_ckpt.info'),
                               'a')
                fr_best.write(
                    "Note:the model.ckpt-best is the remainings of last best_ckpt!\n"
                    "the current best_ckpt model is loss, but the result is:\n"
                )
            fr_best.write("best_nmi: {}\n".format(best_nmi))
            fr_best.write("best_ckpt: ckpt-{}\n".format(cur_step))
            fr_best.close()
            if ckptIsExists:
                # Copy the three TF checkpoint files to 'model.ckpt-best.*'.
                sourceFile = os.path.join(
                    ckpt_dir, 'model.ckpt-%d.data-00000-of-00001' % cur_step)
                targetFile = os.path.join(
                    cluster_dir, 'model.ckpt-best.data-00000-of-00001')
                if os.path.exists(targetFile):
                    os.remove(targetFile)
                shutil.copy(sourceFile, targetFile)
                sourceFile = os.path.join(ckpt_dir,
                                          'model.ckpt-%d.index' % cur_step)
                targetFile = os.path.join(cluster_dir, 'model.ckpt-best.index')
                if os.path.exists(targetFile):
                    os.remove(targetFile)
                shutil.copy(sourceFile, targetFile)
                sourceFile = os.path.join(ckpt_dir,
                                          'model.ckpt-%d.meta' % cur_step)
                targetFile = os.path.join(cluster_dir, 'model.ckpt-best.meta')
                if os.path.exists(targetFile):
                    os.remove(targetFile)
                shutil.copy(sourceFile, targetFile)
        last_step = cur_step
    fr_total.close()
    summary_writer.close()
    return
def __getitem__(self, index, crop=False):
    """Return one test sample: BGR image tensor, per-axis soft labels,
    the front-vector regression target, and the source image path."""
    base_name = self.data_list[index][:-4]
    img = Image.open(
        os.path.join(self.data_dir, 'test_imgs/' + base_name + '.jpg'))
    img = img.convert(self.image_mode)

    if crop:
        # Loosely crop around the annotated face bounding box.
        bbox_path = os.path.join(self.data_dir,
                                 'bbox/' + base_name + '.txt')
        pt2d = get_label_from_txt(bbox_path)
        x_min = pt2d[0]
        y_min = pt2d[1]
        x_max = pt2d[2]
        y_max = pt2d[3]
        k = 0.1
        x_min -= k * abs(x_max - x_min)
        y_min -= k * abs(y_max - y_min)
        x_max += k * abs(x_max - x_min)
        y_max += 0.3 * k * abs(y_max - y_min)
        img = img.crop((int(x_min), int(y_min), int(x_max), int(y_max)))

    # Pose info file -> front orientation vector (regression target).
    info = get_info_from_txt(
        os.path.join(self.data_dir, 'test_info/' + base_name + '.txt'))
    front_vector, _ = get_vectors(info)
    vector_label = torch.FloatTensor(front_vector)

    # Bin each vector component, clamp the bin index to [1, num_classes],
    # then build one soft label per axis and stack them.
    bins = np.array(range(-99, 100, self.bin_size)) / 99
    classify_label = torch.LongTensor(np.digitize(front_vector, bins))
    classify_label = np.where(classify_label > self.num_classes,
                              self.num_classes, classify_label)
    classify_label = np.where(classify_label < 1, 1, classify_label)
    soft_label = torch.stack([
        get_soft_label(classify_label[0], self.num_classes),
        get_soft_label(classify_label[1], self.num_classes),
        get_soft_label(classify_label[2], self.num_classes),
    ])

    if self.transform is not None:
        img = self.transform(img)
    # RGB -> BGR channel swap.
    img = img[np.array([2, 1, 0]), :, :]

    return img, soft_label, vector_label, os.path.join(
        self.data_dir, 'test_imgs/' + base_name + '.jpg')
def eval_online(options):
    """Online link-prediction evaluation loop (MAP and precision@K).

    Builds the eval and except(train) networks once, then polls the
    training checkpoint directory; each time a newer checkpoint plus its
    exported embedding vectors appear, it scores link prediction
    (optionally on sampled nodes, repeated across worker processes),
    writes summary/detail files and TensorBoard summaries, and keeps a
    copy of the best-MAP checkpoint.

    NOTE(review): coordination with the trainer uses sentinel files
    (`*.reading_link_prediction`, `*.writing`, `RUN_SUCCESS`) rather
    than real locks — verify against the writer side.
    """
    # METIRC is a (preserved) typo for METRIC used elsewhere in the module.
    global features_matrix, net_eval, net_except, SAMPLE_NODES, SAMPLE_RULE, METIRC, PREC_K
    link_prediction_dir = os.path.split(options.link_prediction_path)[0]
    # Skip entirely if results exist and a rebuild was not requested.
    if not utils.check_rebuild(link_prediction_dir,
                               descrip='link_prediction',
                               always_rebuild=options.always_rebuild):
        return
    if not os.path.exists(link_prediction_dir):
        os.makedirs(link_prediction_dir)
    logger.info('eval case: link-prediction ...')
    logger.info('\t save_path: {}'.format(options.link_prediction_path))
    logger.info('\t eval_data_path: {}'.format(options.eval_data_path))
    logger.info('\t except_data_path: {}'.format(options.except_data_path))
    logger.info('\t data_format: {}'.format(options.data_format))
    logger.info('\t metrics: MAP and precise@K')
    logger.info('\t max_index for precise@K: {}'.format(
        options.precK_max_index))
    logger.info('\t similarity_metric: {}'.format(options.similarity_metric))
    logger.info('\t eval_online: {}'.format(options.eval_online))
    logger.info('\t eval_interval: {}s'.format(options.eval_interval))
    logger.info('\t sample_nodes: {}'.format(options.sample_nodes))
    logger.info('\t sample_nodes_rule: {}'.format(options.sample_nodes_rule))
    logger.info('\t repeat {} times'.format(options.repeated_times))
    logger.info('\t eval_workers: {}'.format(options.eval_workers))
    logger.info("constructing eval network ...")
    net_eval = network.construct_network(data_path=options.eval_data_path,
                                         data_format=options.data_format,
                                         print_net_info=False,
                                         isdirected=options.isdirected)
    eval_net_nodes_size = net_eval.get_nodes_size()
    eval_net_edges_size = net_eval.get_edges_size()
    logger.info("eval_net_nodes_size = {}".format(eval_net_nodes_size))
    logger.info("eval_net_edges_size = {}".format(eval_net_edges_size))
    logger.info("constructing except(train) network ...")
    net_except = network.construct_network(data_path=options.except_data_path,
                                           data_format=options.data_format,
                                           print_net_info=False,
                                           isdirected=options.isdirected)
    except_net_nodes_size = net_except.get_nodes_size()
    except_net_edges_size = net_except.get_edges_size()
    logger.info("except_net_nodes_size = {}".format(except_net_nodes_size))
    logger.info("except_net_edges_size = {}".format(except_net_edges_size))
    id_list = list(range(eval_net_nodes_size))  # must be [0,1,2,3,...]
    # Publish sampling/metric settings as globals for the worker bodies.
    SAMPLE_NODES = options.sample_nodes
    SAMPLE_RULE = options.sample_nodes_rule
    METIRC = options.similarity_metric
    PREC_K = options.precK_max_index
    # Build the 1, 10, 20, 50, 100, 200, 500, ... K-grid capped at
    # precK_max_index for the Pr@K columns.
    metric_prec_k_list = [1]
    decimal_number = 10
    while metric_prec_k_list[-1] < options.precK_max_index:
        if decimal_number <= options.precK_max_index:
            metric_prec_k_list.append(decimal_number)
        else:
            break
        if 2 * decimal_number <= options.precK_max_index:
            metric_prec_k_list.append(2 * decimal_number)
        else:
            break
        if 5 * decimal_number <= options.precK_max_index:
            metric_prec_k_list.append(5 * decimal_number)
        else:
            break
        decimal_number = decimal_number * 10
    if options.sample_nodes > 0:
        if options.eval_workers > 1 and options.repeated_times > 1:
            # speed up by using multi-process: split the repeat count as
            # evenly as possible over the available workers.
            logger.info("\t allocating repeat_times to workers ...")
            if options.repeated_times <= options.eval_workers:
                times_per_worker = [1 for _ in range(options.repeated_times)]
            else:
                div, mod = divmod(options.repeated_times,
                                  options.eval_workers)
                times_per_worker = [div for _ in range(options.eval_workers)]
                for idx in range(mod):
                    times_per_worker[idx] = times_per_worker[idx] + 1
            assert sum(
                times_per_worker
            ) == options.repeated_times, 'workers allocating failed: %d != %d' % (
                sum(times_per_worker), options.repeated_times)
            logger.info("\t using {} processes for evaling:".format(
                len(times_per_worker)))
            for idx, rep_times in enumerate(times_per_worker):
                logger.info("\t process-{}: repeat {} times".format(
                    idx, rep_times))
    # Summary file accumulating one result line per evaluated checkpoint.
    fr_total = open(options.link_prediction_path, 'w')
    fr_total.write('eval case: link-prediction ...\n')
    fr_total.write('\t save_path: {}\n'.format(options.link_prediction_path))
    fr_total.write('\t eval_data_path: {}\n'.format(options.eval_data_path))
    fr_total.write('\t except_data_path: {}\n'.format(
        options.except_data_path))
    fr_total.write('\t data_format: {}\n'.format(options.data_format))
    fr_total.write('\t metrics: MAP and precise@K\n')
    fr_total.write('\t max_index for precise@K: {}\n'.format(
        options.precK_max_index))
    fr_total.write('\t similarity_metric: {}\n'.format(
        options.similarity_metric))
    fr_total.write('\t eval_online: {}\n'.format(options.eval_online))
    fr_total.write('\t eval_interval: {}s\n'.format(options.eval_interval))
    fr_total.write('\t sample_nodes: {}\n'.format(options.sample_nodes))
    fr_total.write('\t sample_nodes_rule: {}\n'.format(
        options.sample_nodes_rule))
    fr_total.write('\t repeat {} times\n'.format(options.repeated_times))
    fr_total.write('\t eval_workers: {}\n'.format(options.eval_workers))
    fr_total.write("eval_net_nodes_size = {}\n".format(eval_net_nodes_size))
    fr_total.write("eval_net_edges_size = {}\n".format(eval_net_edges_size))
    fr_total.write(
        "except_net_nodes_size = {}\n".format(except_net_nodes_size))
    fr_total.write(
        "except_net_edges_size = {}\n".format(except_net_edges_size))
    fr_total.write(
        '\t results:\n=============================================================\n'
    )
    fr_total.write('finish_time\tckpt\tMAP\t')
    for v in metric_prec_k_list:
        fr_total.write('\tPr@{}'.format(v))
    fr_total.write("\n")
    last_step = 0
    # TensorBoard writer; seed the curves with 0 at step 0.
    summary_writer = tf.summary.FileWriter(link_prediction_dir, tf.Graph())
    summary = tf.Summary()
    summary.value.add(tag='MAP', simple_value=0.)
    for v in metric_prec_k_list:
        summary.value.add(tag='Pr_{}'.format(v), simple_value=0.)
    summary_writer.add_summary(summary, last_step)
    best_MAP = 0
    ckpt_dir = os.path.join(os.path.split(options.vectors_path)[0], 'ckpt')
    ckpt = tf.train.get_checkpoint_state(ckpt_dir)
    # Block until the trainer has produced a first checkpoint.
    while (not (ckpt and ckpt.model_checkpoint_path)):
        logger.info("\t model and vectors not exist, waiting ...")
        time.sleep(options.eval_interval)
        ckpt = tf.train.get_checkpoint_state(ckpt_dir)
    # Sentinel files used as an ad-hoc reader/writer lock on vectors_path.
    reading = options.vectors_path + ".reading_link_prediction"
    writing = options.vectors_path + ".writing"
    while (options.eval_online):
        # Wait for a strictly newer checkpoint that is safe to read.
        while True:
            ckpt = tf.train.get_checkpoint_state(ckpt_dir)
            # Step number is the suffix of the checkpoint filename.
            cur_step = int(
                ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])
            if cur_step <= last_step or (not os.path.exists(
                    options.vectors_path)) or os.path.exists(writing):
                # Trainer signals completion via a RUN_SUCCESS marker file.
                if os.path.exists(
                        os.path.join(
                            os.path.split(options.vectors_path)[0],
                            "RUN_SUCCESS")):
                    return
                time.sleep(options.eval_interval)
                continue
            # ready for reading
            logger.info("\t declare for reading ...")
            open(reading, "w")  # declare
            # Grace period, then re-check that no writer sneaked in.
            time.sleep(30)
            ckpt = tf.train.get_checkpoint_state(ckpt_dir)
            cur_step = int(
                ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])
            if cur_step <= last_step or (not os.path.exists(
                    options.vectors_path)) or os.path.exists(writing):
                os.remove(reading)  # undeclare
                logger.info("\t confliction! undeclare and waiting ...")
                time.sleep(options.eval_interval)
                continue
            break
        logger.info("\t eval ckpt-{}.......".format(cur_step))
        # loading features_matrix(already trained)
        logger.info('\t reading embedding vectors from file {}'.format(
            options.vectors_path))
        time_start = time.time()
        features_matrix = utils.get_vectors(
            utils.get_KeyedVectors(options.vectors_path), id_list)
        os.remove(reading)
        logger.info("\t done for reading ...")
        logger.info('\t reading embedding vectors completed in {}s'.format(
            time.time() - time_start))
        logger.info('total loaded nodes: {}'.format(
            np.size(features_matrix, axis=0)))
        logger.info('the embedding dimension: {}'.format(
            np.size(features_matrix, axis=1)))
        # Per-checkpoint detail file.
        fr = open(options.link_prediction_path + '.{}'.format(cur_step), 'w')
        fr.write('eval case: link-prediction ...\n')
        fr.write('\t save_path: {}\n'.format(options.link_prediction_path))
        fr.write('\t eval_data_path: {}\n'.format(options.eval_data_path))
        fr.write('\t except_data_path: {}\n'.format(options.except_data_path))
        fr.write('\t data_format: {}\n'.format(options.data_format))
        fr.write('\t metrics: MAP and precise@K\n')
        fr.write('\t max_index for precise@K: {}\n'.format(
            options.precK_max_index))
        fr.write('\t similarity_metric: {}\n'.format(
            options.similarity_metric))
        fr.write('\t eval_online: {}\n'.format(options.eval_online))
        fr.write('\t eval_interval: {}s\n'.format(options.eval_interval))
        fr.write('\t sample_nodes: {}\n'.format(options.sample_nodes))
        fr.write('\t sample_nodes_rule: {}\n'.format(
            options.sample_nodes_rule))
        fr.write('\t repeat {} times\n'.format(options.repeated_times))
        fr.write('\t eval_workers: {}\n'.format(options.eval_workers))
        fr.write("eval_net_nodes_size = {}\n".format(eval_net_nodes_size))
        fr.write("eval_net_edges_size = {}\n".format(eval_net_edges_size))
        fr.write("except_net_nodes_size = {}\n".format(except_net_nodes_size))
        fr.write("except_net_edges_size = {}\n".format(except_net_edges_size))
        fr.write('total loaded nodes: {}\n'.format(
            np.size(features_matrix, axis=0)))
        fr.write('the embedding dimension: {}\n'.format(
            np.size(features_matrix, axis=1)))
        if options.sample_nodes > 0:
            if options.eval_workers > 1 and options.repeated_times > 1:
                # speed up by using multi-process
                ret_list = []  # [[MAP, precisionK_list], ... ]
                with ProcessPoolExecutor(
                        max_workers=options.eval_workers) as executor:
                    for ret in executor.map(_sample_thread_body,
                                            times_per_worker):
                        ret_list.extend(ret)
                if len(ret_list) != options.repeated_times:
                    logger.warning(
                        "warning: eval unmatched repeated_times: {} != {}".
                        format(len(ret_list), options.repeated_times))
            else:
                ret_list = _sample_thread_body(options.repeated_times)
        else:
            # no sampling, no repeat!
            ret_list = [_eval(net_eval,
                              net_except)]  # [[MAP, precisionK_list]]
        fr_total.write('%s ckpt-%-9d: ' % (time.strftime(
            '%Y-%m-%d %H:%M:%S', time.localtime(time.time())), cur_step))
        summary = tf.Summary()
        if options.sample_nodes > 0:
            fr.write(
                'expected repeated_times: {}, actual repeated_times: {}, mean results as follows:\n'
                .format(options.repeated_times, len(ret_list)))
        else:
            fr.write(
                'due to the sample nodes = {}, so actual repeated_times = {}, results as follows:\n'
                .format(options.sample_nodes, len(ret_list)))
        mean_MAP = np.mean([ret[0] for ret in ret_list])
        mean_precisionK = np.mean([ret[1] for ret in ret_list], axis=0)
        fr.write('\t\t MAP = {}\n'.format(mean_MAP))
        for k in range(options.precK_max_index):
            if k < len(mean_precisionK):
                fr.write('\t\t precisionK_{} = {}\n'.format(
                    k + 1, mean_precisionK[k]))
            else:
                fr.write('\t\t precisionK_{} = None\n'.format(k + 1))
        fr.write('details:\n')
        for repeat in range(len(ret_list)):
            fr.write('\t repeated {}/{}:\n'.format(repeat + 1, len(ret_list)))
            MAP = ret_list[repeat][0]
            precisionK_list = ret_list[repeat][1]
            fr.write('\t\t MAP = {}\n'.format(MAP))
            for k in range(options.precK_max_index):
                if k < len(precisionK_list):
                    fr.write('\t\t precisionK_{} = {}\n'.format(
                        k + 1, precisionK_list[k]))
                else:
                    fr.write('\t\t precisionK_{} = None\n'.format(k + 1))
        fr.write('\neval case: link_prediction completed in {}s.'.format(
            time.time() - time_start))
        fr.close()
        fr_total.write('%.4f' % mean_MAP)
        summary.value.add(tag='MAP', simple_value=mean_MAP)
        for v in metric_prec_k_list:
            fr_total.write('\t%.4f' % mean_precisionK[v - 1])
            summary.value.add(tag='Pr_{}'.format(v),
                              simple_value=mean_precisionK[v - 1])
        fr_total.write("\n")
        fr_total.flush()
        summary_writer.add_summary(summary, cur_step)
        summary_writer.flush()
        logger.info(
            'eval case: ret_list completed in {}s.\n================================='
            .format(time.time() - time_start))
        # copy ckpt-files according to last mean_Micro_F1 (0.9 ratio).
        if mean_MAP > best_MAP:
            best_MAP = mean_MAP
            ckptIsExists = os.path.exists(
                os.path.join(ckpt_dir, 'model.ckpt-%d.index' % cur_step))
            if ckptIsExists:
                fr_best = open(
                    os.path.join(link_prediction_dir, 'best_ckpt.info'), 'w')
            else:
                # Checkpoint already rotated away: append a note instead of
                # overwriting the previous best-checkpoint record.
                fr_best = open(
                    os.path.join(link_prediction_dir, 'best_ckpt.info'), 'a')
                fr_best.write(
                    "Note:the model.ckpt-best is the remainings of last best_ckpt!\n"
                    "the current best_ckpt model is loss, but the result is:\n"
                )
            fr_best.write("best_MAP: {}\n".format(best_MAP))
            fr_best.write("best_ckpt: ckpt-{}\n".format(cur_step))
            fr_best.close()
            if ckptIsExists:
                # Copy the three TF checkpoint files to 'model.ckpt-best.*'.
                sourceFile = os.path.join(
                    ckpt_dir, 'model.ckpt-%d.data-00000-of-00001' % cur_step)
                targetFile = os.path.join(
                    link_prediction_dir, 'model.ckpt-best.data-00000-of-00001')
                if os.path.exists(targetFile):
                    os.remove(targetFile)
                shutil.copy(sourceFile, targetFile)
                sourceFile = os.path.join(ckpt_dir,
                                          'model.ckpt-%d.index' % cur_step)
                targetFile = os.path.join(link_prediction_dir,
                                          'model.ckpt-best.index')
                if os.path.exists(targetFile):
                    os.remove(targetFile)
                shutil.copy(sourceFile, targetFile)
                sourceFile = os.path.join(ckpt_dir,
                                          'model.ckpt-%d.meta' % cur_step)
                targetFile = os.path.join(link_prediction_dir,
                                          'model.ckpt-best.meta')
                if os.path.exists(targetFile):
                    os.remove(targetFile)
                shutil.copy(sourceFile, targetFile)
        last_step = cur_step
    fr_total.close()
    summary_writer.close()
def __getitem__(self, index, crop=False):
    """Return one sample for orientation training.

    Loads the background-composited image, optionally crops loosely
    around the face bbox, applies the dataset transform, swaps RGB->BGR,
    then builds per-axis soft classification labels and float regression
    targets for the front/right/up orientation vectors.

    Returns:
        (img, soft_label_f, soft_label_r, soft_label_u,
         vector_label_f, vector_label_r, vector_label_u, img_path)
    """
    base_name, _ = self.data_list[index].split('.')

    img = Image.open(
        os.path.join(self.data_dir, 'dataset/bg_imgs/' + base_name + '.jpg'))
    img = img.convert(self.image_mode)

    if crop:
        # Loosely crop around the annotated face bounding box.
        bbox_path = os.path.join(self.data_dir,
                                 'bbox/' + base_name + '.txt')
        pt2d = get_label_from_txt(bbox_path)
        x_min = pt2d[0]
        y_min = pt2d[1]
        x_max = pt2d[2]
        y_max = pt2d[3]
        # Expand the box by a fixed margin before cropping.  Note the
        # margins are applied sequentially, so later ones see the
        # already-shifted coordinates (kept to preserve behavior).
        k = 0.1
        x_min -= k * abs(x_max - x_min)
        y_min -= k * abs(y_max - y_min)
        x_max += k * abs(x_max - x_min)
        y_max += 0.3 * k * abs(y_max - y_min)
        img = img.crop((int(x_min), int(y_min), int(x_max), int(y_max)))

    if self.transform is not None:
        img = self.transform(img)
    # RGB2BGR
    img = img[np.array([2, 1, 0]), :, :]

    # Pose info file -> the three orthogonal orientation vectors.
    info = get_info_from_txt(
        os.path.join(self.data_dir, "info_all/" + base_name + '.txt'))
    front_vector, right_vector, up_vector = get_vectors(info)
    vector_label_f = torch.FloatTensor(front_vector)
    vector_label_r = torch.FloatTensor(right_vector)
    vector_label_u = torch.FloatTensor(up_vector)

    # Bin edges spanning [-1, 1] in steps of bin_size/99.
    bins = np.array(range(-99, 100, self.bin_size)) / 99

    def _soft_label(vector):
        # Bin each component into [1, num_classes], then build one soft
        # label per axis and stack them.  (Shared helper replacing the
        # previously triplicated front/right/up blocks.)
        classify_label = torch.LongTensor(np.digitize(vector, bins))
        classify_label = np.where(classify_label > self.num_classes,
                                  self.num_classes, classify_label)
        classify_label = np.where(classify_label < 1, 1, classify_label)
        return torch.stack([
            get_soft_label(classify_label[0], self.num_classes),
            get_soft_label(classify_label[1], self.num_classes),
            get_soft_label(classify_label[2], self.num_classes),
        ])

    soft_label_f = _soft_label(front_vector)
    soft_label_r = _soft_label(right_vector)
    soft_label_u = _soft_label(up_vector)

    return img, soft_label_f, soft_label_r, soft_label_u, vector_label_f, vector_label_r, vector_label_u, os.path.join(
        self.data_dir, "dataset/bg_imgs/" + base_name + ".jpg")
def load(topic,
         model_name,
         sub_model_name="paraphrase-distilroberta-base-v1",
         is_indexed=False,
         **kwargs):
    """Load the texts, embeddings and model for one Kialo topic.

    Looks the topic up in kialo_config.json, reads the pro/con TSV data
    files, then loads the embedding model requested by `model_name`
    ('sbert', 'word2vec' or 'glove') together with precomputed (or
    freshly computed) pro/con vectors.

    Returns a dict with keys: model, pro_vecs, con_vecs, texts_pro,
    texts_con, responses, lowercase_to_uppercase.
    """
    pre_path = ""  # "python/"
    config_path = pre_path + "kialo_config.json"
    data_path_pro, vec_path_pro, data_path_con, vec_path_con, data_path_all, vec_path_all, vec_path_pro_original, vec_path_con_original = "", "", "", "", "", "", "", ""
    lowercase_to_uppercase = {}
    # NOTE(review): texts_all and stances are initialised but never
    # populated or returned here — possibly vestigial.
    texts_all, texts_pro, texts_con = [], [], []
    stances = {}
    responses = {}
    # Resolve the data/vector paths for the requested topic id.
    with open(config_path) as f:
        config = json.load(f)
        for c in config["topics"]:
            if topic == c["id"]:
                data_path_pro = pre_path + c["data_path_pro"]
                vec_path_pro = pre_path + c["sbert_path_pro"]
                data_path_con = pre_path + c["data_path_con"]
                vec_path_con = pre_path + c["sbert_path_con"]
                data_path_all = pre_path + c["data_path_all"]
                vec_path_all = pre_path + c["sbert_path_all"]
                vec_path_pro_original = pre_path + c["sbert_path_pro_original"]
                vec_path_con_original = pre_path + c["sbert_path_con_original"]
    # read data files
    # TSV columns (presumably): lowercased text, ?, original-case text,
    # ?, tab-joined responses — TODO confirm against the data format.
    with open(data_path_pro) as f:
        for line in f:
            parts = line.strip().split("\t", 4)
            text = parts[0]
            lowercase_to_uppercase[text] = parts[2]
            texts_pro.append(text)
            if len(parts) >= 5:
                responses[text] = parts[4].split('\t')
    with open(data_path_con) as f:
        for line in f:
            parts = line.strip().split("\t", 4)
            text = parts[0]
            lowercase_to_uppercase[text] = parts[2]
            texts_con.append(text)
            if len(parts) >= 5:
                responses[text] = parts[4].split('\t')
    # load model
    pro_vecs, con_vecs = [], []
    model = None
    if model_name == 'sbert':
        model = utils.get_sbert_model(sub_model_name)
        # NOTE(review): compares with 1 although the default is False;
        # callers presumably pass 0/1 — confirm.
        if is_indexed == 1:
            # NOTE(review): models_pro_con is assigned but never used or
            # returned — the FAISS indexes appear to be lost here.
            models_pro_con = [
                faiss_index.load_index(vec_path_pro),
                faiss_index.load_index(vec_path_con)
            ]
        else:
            # NOTE(review): these pickle files are never closed.
            infile_pro = open(vec_path_pro_original, 'rb')
            infile_con = open(vec_path_con_original, 'rb')
            pro_vecs = cPickle.load(infile_pro)
            con_vecs = cPickle.load(infile_con)
            model = utils.get_sbert_model(sub_model_name)
    elif model_name == 'word2vec':
        path = pre_path + "embeddings/GoogleNews-vectors-negative300.bin"
        model = gensim.models.KeyedVectors.load_word2vec_format(path,
                                                                binary=True)
        pro_vecs, model = utils.get_vectors(model, texts_pro)
        con_vecs, model = utils.get_vectors(model, texts_con)
    elif model_name == 'glove':
        path = pre_path + "embeddings/glove.840B.300dword2vec.txt"
        model = utils.read_emb_text(path)
        pro_vecs, model = utils.get_vectors(model, texts_pro)
        con_vecs, model = utils.get_vectors(model, texts_con)
    # return values
    # model is an object
    # pro_vecs, con_vecs are arrays of vectors
    # texts_pro, texts_con are arrays of texts
    # responses and lowercase_to_uppercase are dictionaries
    return {
        "model": model,
        "pro_vecs": pro_vecs,
        "con_vecs": con_vecs,
        "texts_pro": texts_pro,
        "texts_con": texts_con,
        "responses": responses,
        "lowercase_to_uppercase": lowercase_to_uppercase
    }
def __getitem__(self, index, crop=False):
    """Return one sample: image plus soft and regression pose labels.

    Args:
        index: position in ``self.data_list``.
        crop: when True, crop the face region using the bbox file.

    Returns:
        (img, soft_label_f, soft_label_r, soft_label_u,
         vector_label_f, vector_label_r, vector_label_u, img_path)
        where f/r/u are the front/right/up orientation vectors.
    """
    # data basename
    base_name, _ = self.data_list[index].split('.')
    img_path = os.path.join(self.data_dir, "dataset/bg_imgs/" + base_name + ".jpg")
    # read image file
    img = Image.open(img_path)
    img = img.convert(self.image_mode)

    if crop:
        # get face bounding box
        pt2d = get_label_from_txt(
            os.path.join(self.data_dir, "bbox/" + base_name + ".txt"))
        x_min, y_min, x_max, y_max = pt2d
        # crop face loosely: random factor k in [0, 0.1); x uses 0.6*k,
        # y_min uses the full k (the original comment claimed k up to 0.2)
        k = np.random.random_sample() * 0.1
        x_min -= 0.6 * k * abs(x_max - x_min)
        y_min -= k * abs(y_max - y_min)
        x_max += 0.6 * k * abs(x_max - x_min)
        y_max += 0.6 * k * abs(y_max - y_min)
        img = img.crop((int(x_min), int(y_min), int(x_max), int(y_max)))

    # Augmentation: blur with probability 0.05
    if np.random.random_sample() < 0.05:
        img = img.filter(ImageFilter.BLUR)
    # Augmentation: grayscale with probability 0.5 (skipped for 'ID' images)
    if np.random.random_sample() < 0.5 and base_name.find('ID') < 0:
        img = img.convert('L').convert("RGB")

    # transform
    if self.transform:
        img = self.transform(img)
    # RGB -> BGR channel order
    img = img[np.array([2, 1, 0]), :, :]

    # get pose: front/right/up orientation vectors from the info file
    info = get_info_from_txt(
        os.path.join(self.data_dir, "info_all/" + base_name + '.txt'))
    front_vector, right_vector, up_vector = get_vectors(info)
    vector_label_f = torch.FloatTensor(front_vector)
    vector_label_r = torch.FloatTensor(right_vector)
    vector_label_u = torch.FloatTensor(up_vector)

    # soft classification labels (the original repeated this block three times)
    soft_label_f = self._soft_label(front_vector)
    soft_label_r = self._soft_label(right_vector)
    soft_label_u = self._soft_label(up_vector)

    return img, soft_label_f, soft_label_r, soft_label_u, \
        vector_label_f, vector_label_r, vector_label_u, img_path

def _soft_label(self, vector):
    """Bin a 3-d vector into classes, clamp to [1, num_classes], and stack
    the per-axis soft labels into one tensor."""
    classify_label = torch.LongTensor(np.digitize(vector, self.bins))  # bin index
    classify_label = np.where(classify_label > self.num_classes,
                              self.num_classes, classify_label)
    classify_label = np.where(classify_label < 1, 1, classify_label)
    soft_x = get_soft_label(classify_label[0], self.num_classes)
    soft_y = get_soft_label(classify_label[1], self.num_classes)
    soft_z = get_soft_label(classify_label[2], self.num_classes)
    return torch.stack([soft_x, soft_y, soft_z])
cache_embedding = 'checkpoints/.cache/'
kernel_path = 'kernel_path/'

if __name__ == '__main__':
    # usage: script.py <dataset> <model_name>
    ds, model_name = sys.argv[1:]
    model, tokenizer = get_model_and_tokenizer(model_name)
    input_a, input_b, label = get_tokenized_ds(datasets_paths[ds]['scripts'],
                                               datasets_paths[ds]['data_path'],
                                               tokenizer, ds)
    # Embeddings are cached on disk so repeated runs skip the forward pass.
    if not os.path.exists(cache_embedding + ds + '.avec.cache.npy'):
        # BUG FIX: np.save fails when the cache directory is missing.
        os.makedirs(cache_embedding, exist_ok=True)
        with torch.no_grad():
            a_vecs, b_vecs = get_vectors(model, input_a, input_b)
        a_vecs = a_vecs.cpu().numpy()
        b_vecs = b_vecs.cpu().numpy()
        np.save(cache_embedding + ds + '.avec.cache', a_vecs)
        np.save(cache_embedding + ds + '.bvec.cache', b_vecs)
    else:
        a_vecs = np.load(cache_embedding + ds + '.avec.cache.npy')
        b_vecs = np.load(cache_embedding + ds + '.bvec.cache.npy')
    if n_components:
        # whitening: kernel/bias estimated from both sides' embeddings
        kernel, bias = compute_kernel_bias([a_vecs, b_vecs])
        save_kernel_and_bias(kernel, bias, model_name)
        kernel = kernel[:, :n_components]
        a_vecs = transform_and_normalize(a_vecs, kernel, bias)
        # NOTE(review): b_vecs is never whitened in this chunk — either the
        # file continues past this point or the symmetric transform of
        # b_vecs is missing; confirm against the full script.
def eval_online(options):
    """Repeatedly evaluate new embedding checkpoints with a node-classification task.

    Polls the checkpoint directory next to ``options.vectors_path``; whenever a
    newer checkpoint appears, loads its vectors, trains LogisticRegression at
    several train ratios (via ``_classify_thread_body``), appends Macro/Micro-F1
    results to ``options.classify_path`` (+ one detail file per step), writes TF
    summaries, and keeps a copy of the best checkpoint by Micro-F1. Coordinates
    with the writer process through ``.reading_classify`` / ``.writing`` marker
    files. Sets module-level ``features_matrix`` / ``labels_matrix`` for the
    worker processes. Returns when a RUN_SUCCESS marker is found.
    """
    global features_matrix, labels_matrix
    classify_dir = os.path.split(options.classify_path)[0]
    if not utils.check_rebuild(classify_dir, descrip='classify',
                               always_rebuild=options.always_rebuild):
        return
    if not os.path.exists(classify_dir):
        os.makedirs(classify_dir)
    logger.info('eval case: classify...')
    logger.info('\t save_dir: {}'.format(classify_dir))
    logger.info('\t classifier: LogisticRegression')
    logger.info('\t eval_online: {}'.format(options.eval_online))
    logger.info('\t eval_interval: {}s'.format(options.eval_interval))
    logger.info('\t eval_workers: {}'.format(options.eval_workers))
    logger.info('\t total labels size: {}'.format(options.label_size))
    # repeated 10times
    repeated_times = options.repeated_times
    # split ratio: a fixed ratio, or 0.9 down to 0.1 when unset
    if options.train_ratio > 0:
        train_ratio_list = [options.train_ratio]
    else:
        train_ratio_list = [v / 10.0 for v in range(9, 0, -1)]
    logger.info('\t repeat {} times for each train_ratio in {}'.format(
        repeated_times, train_ratio_list))
    # each ratio appears `repeated_times` times in the work list
    train_ratio_fulllist = [
        train_ratio for train_ratio in train_ratio_list
        for _ in range(repeated_times)
    ]
    if options.eval_workers > 1 and len(train_ratio_fulllist) > 1:
        # speed up by using multi-process: split the work list across workers,
        # distributing the remainder from the tail backwards
        if len(train_ratio_fulllist) <= options.eval_workers:
            train_ratios_per_worker = [[train_ratio]
                                       for train_ratio in train_ratio_fulllist]
        else:
            div, mod = divmod(len(train_ratio_fulllist), options.eval_workers)
            train_ratios_per_worker = [
                train_ratio_fulllist[div * i:div * (i + 1)]
                for i in range(options.eval_workers)
            ]
            for idx, train_ratio in enumerate(
                    train_ratio_fulllist[div * options.eval_workers:]):
                train_ratios_per_worker[len(train_ratios_per_worker) - 1 -
                                        idx].append(train_ratio)
        logger.info("\t using {} processes for evaling:".format(
            len(train_ratios_per_worker)))
        for idx, train_ratios in enumerate(train_ratios_per_worker):
            logger.info("\t process-{}: {}".format(idx, train_ratios))
    # summary report file: one row per evaluated checkpoint
    fr_total = open(options.classify_path, 'w')
    fr_total.write('eval case: classify...\n')
    fr_total.write('\t save_dir: {}\n'.format(classify_dir))
    fr_total.write('\t classifier: LogisticRegression\n')
    fr_total.write('\t eval_online: {}\n'.format(options.eval_online))
    fr_total.write('\t eval_interval: {}s\n'.format(options.eval_interval))
    fr_total.write('\t eval_workers: {}\n'.format(options.eval_workers))
    fr_total.write('\t repeat {} times for each train_ratio in {}\n'.format(
        repeated_times, train_ratio_list))
    fr_total.write('\t total labels size: {}\n'.format(options.label_size))
    fr_total.write(
        '\t results(Macro_F1,Micro_F1):\n=============================================================\n'
    )
    fr_total.write(
        'finish_time\tckpt\t\t0.1\t0.2\t0.3\t0.4\t0.5\t0.6\t0.7\t0.8\t0.9\n')
    time_start = time.time()
    logger.info('\t reading labeled data from file {}'.format(
        options.label_path))
    # NOTE: "totoal" is a pre-existing typo kept for byte-compatibility
    id_list_totoal, labels_list_total = utils.get_labeled_data(
        options.label_path)
    logger.info('\t reading labeled data completed in {}s'.format(time.time() -
                                                                  time_start))
    last_step = 0
    summary_writer = tf.summary.FileWriter(classify_dir, tf.Graph())
    # seed the summary curves at step 0 so TensorBoard plots start at zero
    summary = tf.Summary()
    for train_ratio in train_ratio_list:
        summary.value.add(tag='macro_train_{}'.format(train_ratio),
                          simple_value=0.)
        summary.value.add(tag='micro_train_{}'.format(train_ratio),
                          simple_value=0.)
    summary_writer.add_summary(summary, last_step)
    best_micro = 0
    ckpt_dir = os.path.join(os.path.split(options.vectors_path)[0], 'ckpt')
    # block until the trainer has produced a first checkpoint
    ckpt = tf.train.get_checkpoint_state(ckpt_dir)
    while (not (ckpt and ckpt.model_checkpoint_path)):
        logger.info("\t model and vectors not exist, waiting ...")
        time.sleep(options.eval_interval)
        ckpt = tf.train.get_checkpoint_state(ckpt_dir)
    # marker files used as a cooperative lock with the vector-writer process
    reading = options.vectors_path + ".reading_classify"
    writing = options.vectors_path + ".writing"
    while (options.eval_online):
        # inner loop: wait for a checkpoint newer than last_step that is safe to read
        while True:
            ckpt = tf.train.get_checkpoint_state(ckpt_dir)
            # checkpoint paths end in "...-<step>"
            cur_step = int(
                ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])
            ## synchrolock for multi-process:
            # while(not(cur_step > last_step and os.path.exists(options.vectors_path) and
            # time.time() - os.stat(options.vectors_path).st_mtime > 200)):
            # time.sleep(options.eval_interval)
            # ckpt = tf.train.get_checkpoint_state(ckpt_dir)
            # cur_step = int(ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])
            # os.utime(options.vectors_path, None)
            if cur_step <= last_step or (not os.path.exists(
                    options.vectors_path)) or os.path.exists(writing):
                # training finished marker ends the online loop for good
                if os.path.exists(
                        os.path.join(
                            os.path.split(options.vectors_path)[0],
                            "RUN_SUCCESS")):
                    return
                time.sleep(options.eval_interval)
                continue
            # ready for reading: declare intent, then re-check for a writer race
            logger.info("\t declare for reading ...")
            open(reading, "w")  # declare
            time.sleep(30)
            ckpt = tf.train.get_checkpoint_state(ckpt_dir)
            cur_step = int(
                ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])
            if cur_step <= last_step or (not os.path.exists(
                    options.vectors_path)) or os.path.exists(writing):
                os.remove(reading)  # undeclare
                logger.info("\t confliction! undeclare and waiting ...")
                time.sleep(options.eval_interval)
                continue
            break
        logger.info("\t eval ckpt-{}.......".format(cur_step))
        time_start = time.time()
        logger.info('\t reading embedding vectors from file {}'.format(
            options.vectors_path))
        features_matrix, labels_list = utils.get_vectors(
            utils.get_KeyedVectors(options.vectors_path), id_list_totoal,
            labels_list_total)
        os.remove(reading)  # synchrolock for multi-process
        logger.info("\t done for reading ...")
        # one-hot / multi-hot encode the label lists
        mlb = MultiLabelBinarizer(range(options.label_size))
        labels_matrix = mlb.fit_transform(labels_list)
        logger.info('\t reading embedding vectors completed in {}s'.format(
            time.time() - time_start))
        logger.info('\t total labeled data size: {}'.format(
            np.size(features_matrix, axis=0)))
        logger.info('\t total labels size: {}'.format(options.label_size))
        # classify: per-checkpoint detail report
        fr = open(options.classify_path + '.{}'.format(cur_step), 'w')
        fr.write('eval case: classify...\n')
        fr.write('\t classifier: LogisticRegression\n')
        fr.write('\t eval_workers: {}\n'.format(options.eval_workers))
        fr.write('\t repeat {} times for each train_ratio in {}\n'.format(
            repeated_times, train_ratio_list))
        fr.write('\t total labeled data size: {}\n'.format(
            np.size(features_matrix, axis=0)))
        fr.write('\t total labels size: {}\n'.format(options.label_size))
        for i in range(options.label_size):
            fr.write('\t\t label {}: {}\n'.format(i,
                                                  np.sum(labels_matrix[:, i])))
        if options.eval_workers > 1 and len(train_ratio_fulllist) > 1:
            fr.write("\t using {} processes for evaling:\n".format(
                len(train_ratios_per_worker)))
            for idx, train_ratios in enumerate(train_ratios_per_worker):
                fr.write("\t process-{}: {}\n".format(idx, train_ratios))
            ret_list = []  # (train_ratio, macro, micro)
            # workers read the module-level features_matrix / labels_matrix
            with ProcessPoolExecutor(
                    max_workers=options.eval_workers) as executor:
                for ret in executor.map(_classify_thread_body,
                                        train_ratios_per_worker):
                    ret_list.extend(ret)
        else:
            ret_list = _classify_thread_body(train_ratio_fulllist)
        fr_total.write('%s ckpt-%-9d: ' % (time.strftime(
            '%Y-%m-%d %H:%M:%S', time.localtime(time.time())), cur_step))
        summary = tf.Summary()
        # group results per train_ratio: {ratio: [[macros], [micros]]}
        ret_dict = {}
        for ret in ret_list:
            if ret[0] in ret_dict:
                ret_dict[ret[0]][0].append(ret[1])
                ret_dict[ret[0]][1].append(ret[2])
            else:
                ret_dict[ret[0]] = [[ret[1]], [ret[2]]]
        for train_ratio, macro_micro in sorted(ret_dict.items(),
                                               key=lambda item: item[0]):
            fr.write('\n' + '-' * 20 + '\n' +
                     'train_ratio = {}\n'.format(train_ratio))
            Macro_F1_list = macro_micro[0]
            Micro_F1_list = macro_micro[1]
            if len(Macro_F1_list) != repeated_times:
                logger.warning(
                    "warning: train_ratio = {} eval unmatched repeated_times: {} != {}"
                    .format(train_ratio, len(Macro_F1_list), repeated_times))
            mean_Macro_F1 = sum(Macro_F1_list) / float(len(Macro_F1_list))
            mean_Micro_F1 = sum(Micro_F1_list) / float(len(Micro_F1_list))
            fr.write(
                'expected repeated_times: {}, actual repeated_times: {}, mean results as follows:\n'
                .format(repeated_times, len(Macro_F1_list)))
            fr.write('\t\t Macro_F1 = {}\n'.format(mean_Macro_F1))
            fr.write('\t\t Micro_F1 = {}\n'.format(mean_Micro_F1))
            fr.write('details:\n')
            for repeat in range(len(Macro_F1_list)):
                fr.write(
                    '\t repeated {}/{}: Macro_F1 = {}, Micro_F1 = {}\n'.format(
                        repeat + 1, len(Macro_F1_list), Macro_F1_list[repeat],
                        Micro_F1_list[repeat]))
            fr_total.write('%.4f, %.4f ' % (mean_Macro_F1, mean_Micro_F1))
            summary.value.add(tag='macro_train_{}'.format(train_ratio),
                              simple_value=mean_Macro_F1)
            summary.value.add(tag='micro_train_{}'.format(train_ratio),
                              simple_value=mean_Micro_F1)
        fr.write(
            '\neval case: classify completed in {}s\n'.format(time.time() -
                                                              time_start))
        fr.close()
        fr_total.write('\n')
        fr_total.flush()
        summary_writer.add_summary(summary, cur_step)
        summary_writer.flush()
        logger.info(
            'classify completed in {}s\n================================='.
            format(time.time() - time_start))
        # copy ckpt-files according to last mean_Micro_F1 (0.9 ratio).
        # NOTE(review): mean_Micro_F1 here is whichever ratio sorted last —
        # presumably 0.9 when the full ratio list is used; confirm.
        if mean_Micro_F1 > best_micro:
            best_micro = mean_Micro_F1
            ckptIsExists = os.path.exists(
                os.path.join(ckpt_dir, 'model.ckpt-%d.index' % cur_step))
            if ckptIsExists:
                fr_best = open(os.path.join(classify_dir, 'best_ckpt.info'),
                               'w')
            else:
                # checkpoint already rotated away: append a note instead
                fr_best = open(os.path.join(classify_dir, 'best_ckpt.info'),
                               'a')
                fr_best.write(
                    "Note:the model.ckpt-best is the remainings of last best_ckpt!\n"
                    "the current best_ckpt model is loss, but the result is:\n"
                )
            fr_best.write("best_micro(for ratio 0.9): {}\n".format(best_micro))
            fr_best.write("best_ckpt: ckpt-{}\n".format(cur_step))
            fr_best.close()
            # keep the three TF checkpoint files as model.ckpt-best.*
            if ckptIsExists:
                sourceFile = os.path.join(
                    ckpt_dir, 'model.ckpt-%d.data-00000-of-00001' % cur_step)
                targetFile = os.path.join(
                    classify_dir, 'model.ckpt-best.data-00000-of-00001')
                if os.path.exists(targetFile):
                    os.remove(targetFile)
                shutil.copy(sourceFile, targetFile)
                sourceFile = os.path.join(ckpt_dir,
                                          'model.ckpt-%d.index' % cur_step)
                targetFile = os.path.join(classify_dir,
                                          'model.ckpt-best.index')
                if os.path.exists(targetFile):
                    os.remove(targetFile)
                shutil.copy(sourceFile, targetFile)
                sourceFile = os.path.join(ckpt_dir,
                                          'model.ckpt-%d.meta' % cur_step)
                targetFile = os.path.join(classify_dir, 'model.ckpt-best.meta')
                if os.path.exists(targetFile):
                    os.remove(targetFile)
                shutil.copy(sourceFile, targetFile)
        last_step = cur_step
    fr_total.close()
    summary_writer.close()
from model import Classifier as Model
import utils


def to_tensor(x, device=device):
    """Convert array-like *x* into a torch tensor on *device*.

    BUG FIX: the original ignored the `device` parameter and always called
    `.cuda()`; the tensor is now moved to the requested device (the default
    is the module-level `device`, so existing callers are unaffected).
    """
    x = np.array(x)
    x = torch.from_numpy(x)
    return x.to(device)


# Vectors and datasets are cached with pickle to skip re-parsing the raw files.
if os.path.isfile(vector_pickle):
    word_vectors, word_dict, category_dicts, label_dict = pickle.load(
        open(vector_pickle, 'rb'))
else:
    word_vectors, word_dict, category_dicts, label_dict = utils.get_vectors(
        train_file, num_categories, vec_file, word_dim)
    pickle.dump([word_vectors, word_dict, category_dicts, label_dict],
                open(vector_pickle, 'wb'), protocol=4)

if os.path.isfile(train_pickle):
    train_data = pickle.load(open(train_pickle, 'rb'))
else:
    train_data = utils.get_data(train_file, word_dict, category_dicts,
                                label_dict)
    pickle.dump(train_data, open(train_pickle, 'wb'), protocol=4)

if os.path.isfile(dev_pickle):
    dev_data = pickle.load(open(dev_pickle, 'rb'))
else:
    dev_data = utils.get_data(dev_file, word_dict, category_dicts, label_dict)
def eval_once(options):
    """Evaluate one set of embedding vectors with a node-classification task.

    Loads labeled nodes and their vectors, multi-hot-encodes the labels, then
    trains/evaluates LogisticRegression at each train ratio (repeated
    `options.repeated_times` times, optionally across worker processes via
    ``_classify_thread_body``) and writes mean/per-run Macro-F1 and Micro-F1
    to ``options.classify_path``. Sets the module-level ``features_matrix``
    and ``labels_matrix`` that the worker processes read.
    """
    global features_matrix, labels_matrix
    if not utils.check_rebuild(options.classify_path,
                               descrip='classify',
                               always_rebuild=options.always_rebuild):
        return
    logger.info('eval case: classify...')
    logger.info('\t save_path: {}'.format(options.classify_path))
    logger.info('\t classifier: LogisticRegression')
    logger.info('\t eval_online: {}'.format(options.eval_online))
    logger.info('\t eval_workers: {}'.format(options.eval_workers))
    logger.info('\t reading labeled data from file {}'.format(
        options.label_path))
    time_start = time.time()
    id_list, labels_list = utils.get_labeled_data(options.label_path)
    features_matrix, labels_list = utils.get_vectors(
        utils.get_KeyedVectors(options.vectors_path), id_list, labels_list)
    # one-hot / multi-hot encode the label lists
    mlb = MultiLabelBinarizer(range(options.label_size))
    labels_matrix = mlb.fit_transform(labels_list)
    logger.info('\t reading labeled data completed in {}s'.format(time.time() -
                                                                  time_start))
    logger.info('\t total labeled data size: {}'.format(
        np.size(features_matrix, axis=0)))
    logger.info('\t total labels size: {}'.format(options.label_size))
    # repeated 10times
    repeated_times = options.repeated_times
    # split ratio: a fixed ratio, or 0.9 down to 0.1 when unset
    if options.train_ratio > 0:
        train_ratio_list = [options.train_ratio]
    else:
        train_ratio_list = [v / 10.0 for v in range(9, 0, -1)]
    logger.info('\t repeat {} times for each train_ratio in {}'.format(
        repeated_times, train_ratio_list))
    # each ratio appears `repeated_times` times in the work list
    train_ratio_fulllist = [
        train_ratio for train_ratio in train_ratio_list
        for _ in range(repeated_times)
    ]
    # classify: write the report header
    fr = open(options.classify_path, 'w')
    fr.write('eval case: classify...\n')
    fr.write('\t save_path: {}\n'.format(options.classify_path))
    fr.write('\t classifier: LogisticRegression\n')
    fr.write('\t eval_online: {}\n'.format(options.eval_online))
    fr.write('\t eval_workers: {}\n'.format(options.eval_workers))
    fr.write('\t repeat {} times for each train_ratio in {}\n'.format(
        repeated_times, train_ratio_list))
    fr.write('\t total labeled data size: {}\n'.format(
        np.size(features_matrix, axis=0)))
    fr.write('\t total labels size: {}\n'.format(options.label_size))
    for i in range(options.label_size):
        fr.write('\t\t label {}: {}\n'.format(i, np.sum(labels_matrix[:, i])))
    if options.eval_workers > 1 and len(train_ratio_fulllist) > 1:
        # speed up by using multi-process: split the work list across workers,
        # distributing the remainder from the tail backwards
        if len(train_ratio_fulllist) <= options.eval_workers:
            train_ratios_per_worker = [[train_ratio]
                                       for train_ratio in train_ratio_fulllist]
        else:
            div, mod = divmod(len(train_ratio_fulllist), options.eval_workers)
            train_ratios_per_worker = [
                train_ratio_fulllist[div * i:div * (i + 1)]
                for i in range(options.eval_workers)
            ]
            for idx, train_ratio in enumerate(
                    train_ratio_fulllist[div * options.eval_workers:]):
                train_ratios_per_worker[len(train_ratios_per_worker) - 1 -
                                        idx].append(train_ratio)
        logger.info("\t using {} processes for evaling:".format(
            len(train_ratios_per_worker)))
        for idx, train_ratios in enumerate(train_ratios_per_worker):
            logger.info("\t process-{}: {}".format(idx, train_ratios))
        ret_list = []  # (train_ratio, macro, micro)
        # workers read the module-level features_matrix / labels_matrix
        with ProcessPoolExecutor(max_workers=options.eval_workers) as executor:
            for ret in executor.map(_classify_thread_body,
                                    train_ratios_per_worker):
                ret_list.extend(ret)
    else:
        ret_list = _classify_thread_body(train_ratio_fulllist)
    # group results per train_ratio: {ratio: [[macros], [micros]]}
    ret_dict = {}
    for ret in ret_list:
        if ret[0] in ret_dict:
            ret_dict[ret[0]][0].append(ret[1])
            ret_dict[ret[0]][1].append(ret[2])
        else:
            ret_dict[ret[0]] = [[ret[1]], [ret[2]]]
    for train_ratio, macro_micro in sorted(ret_dict.items(),
                                           key=lambda item: item[0]):
        fr.write('\n' + '-' * 20 + '\n' +
                 'train_ratio = {}\n'.format(train_ratio))
        Macro_F1_list = macro_micro[0]
        Micro_F1_list = macro_micro[1]
        if len(Macro_F1_list) != repeated_times:
            logger.warning(
                "warning: train_ratio = {} eval unmatched repeated_times: {} != {}"
                .format(train_ratio, len(Macro_F1_list), repeated_times))
        mean_Macro_F1 = sum(Macro_F1_list) / float(len(Macro_F1_list))
        mean_Micro_F1 = sum(Micro_F1_list) / float(len(Micro_F1_list))
        fr.write(
            'expected repeated_times: {}, actual repeated_times: {}, mean results as follows:\n'
            .format(repeated_times, len(Macro_F1_list)))
        fr.write('\t\t Macro_F1 = {}\n'.format(mean_Macro_F1))
        fr.write('\t\t Micro_F1 = {}\n'.format(mean_Micro_F1))
        fr.write('details:\n')
        for repeat in range(len(Macro_F1_list)):
            fr.write(
                '\t repeated {}/{}: Macro_F1 = {}, Micro_F1 = {}\n'.format(
                    repeat + 1, len(Macro_F1_list), Macro_F1_list[repeat],
                    Micro_F1_list[repeat]))
    fr.write('\neval case: classify completed in {}s'.format(time.time() -
                                                             time_start))
    fr.close()
    logger.info('eval case: classify completed in {}s'.format(time.time() -
                                                              time_start))