def eval_online(options):
    """Evaluate clustering (NMI) on each new checkpoint while training runs.

    The function polls the ``ckpt`` directory next to ``options.vectors_path``.
    Whenever a newer checkpoint step shows up and the vectors file is not
    flagged as being written, the embeddings are loaded and
    ``_cluster_thread_body`` is run for ``options.repeated_times`` repetitions
    (optionally spread over ``options.eval_workers`` processes). The mean NMI
    is appended to ``options.cluster_path``, details go to
    ``options.cluster_path + '.<step>'`` and a TensorFlow summary is emitted.
    When the mean NMI improves, the checkpoint files of that step are copied
    into the directory of ``options.cluster_path`` as ``model.ckpt-best.*``.

    Returns (``None``) when ``utils.check_rebuild`` declines the rebuild, when
    ``options.eval_online`` is falsy, or when a ``RUN_SUCCESS`` marker exists
    next to the vectors file and no unevaluated checkpoint is pending.

    Args:
        options: run options; attributes read here are ``cluster_path``,
            ``vectors_path``, ``label_path``, ``label_size``,
            ``multilabel_rule``, ``eval_online``, ``eval_interval``,
            ``eval_workers``, ``repeated_times`` and ``always_rebuild``.
    """
    # Published as module globals so that `_cluster_thread_body` can read them.
    # NOTE(review): presumably worker processes inherit them via fork - confirm.
    global features_matrix, labels_matrix, LABEL_SIZE

    cluster_dir = os.path.split(options.cluster_path)[0]
    if not utils.check_rebuild(cluster_dir, descrip='cluster',
                               always_rebuild=options.always_rebuild):
        return
    if not os.path.exists(cluster_dir):
        os.makedirs(cluster_dir)

    logger.info('eval case: cluster...')
    logger.info('\t save_path: {}'.format(options.cluster_path))
    logger.info('\t cluster: kmeans')
    logger.info('\t multilabel_rule: {}'.format(options.multilabel_rule))
    logger.info('\t eval_online: {}'.format(options.eval_online))
    logger.info('\t eval_interval: {}s'.format(options.eval_interval))
    logger.info('\t eval_workers: {}'.format(options.eval_workers))
    logger.info('\t repeat {} times'.format(options.repeated_times))
    logger.info('\t total labels size: {}'.format(options.label_size))

    use_workers = options.eval_workers > 1 and options.repeated_times > 1
    times_per_worker = []
    if use_workers:
        # speed up by using multi-process
        logger.info("\t allocating repeat_times to workers ...")
        if options.repeated_times <= options.eval_workers:
            times_per_worker = [1] * options.repeated_times
        else:
            div, mod = divmod(options.repeated_times, options.eval_workers)
            # The first `mod` workers take one extra repetition each.
            times_per_worker = [div + 1 if idx < mod else div
                                for idx in range(options.eval_workers)]
        assert sum(times_per_worker) == options.repeated_times, \
            'workers allocating failed: %d != %d' % (
                sum(times_per_worker), options.repeated_times)
        logger.info("\t using {} processes for evaling:".format(
            len(times_per_worker)))
        for idx, rep_times in enumerate(times_per_worker):
            logger.info("\t process-{}: repeat {} times".format(
                idx, rep_times))

    vectors_dir = os.path.split(options.vectors_path)[0]
    ckpt_dir = os.path.join(vectors_dir, 'ckpt')
    run_success = os.path.join(vectors_dir, "RUN_SUCCESS")
    # Flag files used to coordinate with the process writing the vectors.
    reading = options.vectors_path + ".reading_cluster"
    writing = options.vectors_path + ".writing"

    def latest_step():
        """Return the step number encoded in the newest checkpoint path."""
        state = tf.train.get_checkpoint_state(ckpt_dir)
        return int(
            os.path.basename(state.model_checkpoint_path).split('-')[-1])

    def not_ready(step, evaluated_step):
        """True when there is nothing new to read or the vectors are busy."""
        return (step <= evaluated_step
                or not os.path.exists(options.vectors_path)
                or os.path.exists(writing))

    def wait_for_new_checkpoint(evaluated_step):
        """Block until a new step is readable; None once training is done."""
        while True:
            step = latest_step()
            if not_ready(step, evaluated_step):
                if os.path.exists(run_success):
                    return None
                time.sleep(options.eval_interval)
                continue
            # ready for reading
            logger.info("\t declare for reading ...")
            with open(reading, "w"):  # declare (creates the flag file)
                pass
            # Re-check after a grace period in case a writer started meanwhile.
            time.sleep(30)
            step = latest_step()
            if not_ready(step, evaluated_step):
                os.remove(reading)  # undeclare
                logger.info("\t confliction! undeclare and waiting ...")
                time.sleep(options.eval_interval)
                continue
            return step

    def run_clustering():
        """Run all repetitions, in-process or across worker processes."""
        if not use_workers:
            return _cluster_thread_body(options.repeated_times)
        results = []
        with ProcessPoolExecutor(
                max_workers=options.eval_workers) as executor:
            for ret in executor.map(_cluster_thread_body, times_per_worker):
                results.extend(ret)
        return results

    fr_total = open(options.cluster_path, 'w')
    summary_writer = None
    try:
        fr_total.write('eval case: cluster...\n')
        fr_total.write('\t save_dir: {}\n'.format(cluster_dir))
        fr_total.write('\t cluster: kmeans\n')
        fr_total.write('\t multilabel_rule: {}\n'.format(
            options.multilabel_rule))
        fr_total.write('\t eval_online: {}\n'.format(options.eval_online))
        fr_total.write('\t eval_interval: {}s\n'.format(
            options.eval_interval))
        fr_total.write('\t eval_workers: {}\n'.format(options.eval_workers))
        fr_total.write('\t repeat {} times\n'.format(options.repeated_times))
        fr_total.write('\t total labels size: {}\n'.format(
            options.label_size))
        fr_total.write(
            '\t results(NMI):\n=============================================================\n'
        )
        fr_total.write('finish_time\tckpt\tNMI\n')

        logger.info('\t reading labeled data from file {}'.format(
            options.label_path))
        time_start = time.time()
        # Kept untouched across iterations; `get_vectors` gets the full lists
        # every time instead of its own previous output.
        id_list, labels_list_total = utils.get_labeled_data(
            options.label_path, multilabel_rule=options.multilabel_rule)
        logger.info('\t reading labeled data completed in {}s'.format(
            time.time() - time_start))

        last_step = 0
        summary_writer = tf.summary.FileWriter(cluster_dir, tf.Graph())
        summary = tf.Summary()
        summary.value.add(tag='nmi', simple_value=0.)
        summary_writer.add_summary(summary, last_step)

        best_nmi = 0

        ckpt = tf.train.get_checkpoint_state(ckpt_dir)
        while not (ckpt and ckpt.model_checkpoint_path):
            logger.info("\t model and vectors not exist, waiting ...")
            time.sleep(options.eval_interval)
            ckpt = tf.train.get_checkpoint_state(ckpt_dir)

        while options.eval_online:
            cur_step = wait_for_new_checkpoint(last_step)
            if cur_step is None:
                return

            logger.info("\t eval ckpt-{}.......".format(cur_step))
            time_start = time.time()
            logger.info('\t reading embedding vectors from file {}'.format(
                options.vectors_path))
            try:
                features_matrix, labels_list = utils.get_vectors(
                    utils.get_KeyedVectors(options.vectors_path), id_list,
                    labels_list_total)
            finally:
                # synchrolock for multi-process: always release the flag,
                # even when loading the vectors fails.
                os.remove(reading)
            logger.info("\t done for reading ...")
            labels_matrix = np.array([item[0] for item in labels_list])
            LABEL_SIZE = options.label_size
            logger.info('\t reading labeled data completed in {}s'.format(
                time.time() - time_start))
            logger.info('\t total labeled data size: {}'.format(
                np.size(features_matrix, axis=0)))
            logger.info('\t total labels size: {}'.format(options.label_size))

            # cluster
            step_path = options.cluster_path + '.{}'.format(cur_step)
            with open(step_path, 'w') as fr:
                fr.write('eval case: cluster...\n')
                fr.write('\t cluster: kmeans\n')
                fr.write('\t multilabel_rule: {}\n'.format(
                    options.multilabel_rule))
                fr.write('\t eval_workers: {}\n'.format(options.eval_workers))
                fr.write('\t repeat {} times\n'.format(
                    options.repeated_times))
                fr.write('\t total labeled data size: {}\n'.format(
                    np.size(features_matrix, axis=0)))
                fr.write('\t total labels size: {}\n'.format(
                    options.label_size))
                for i in range(options.label_size):
                    fr.write('\t\t label {}: {}\n'.format(
                        i, np.sum(labels_matrix == i)))

                if use_workers:
                    fr.write("\t using {} processes for evaling:\n".format(
                        len(times_per_worker)))
                    for idx, rep_times in enumerate(times_per_worker):
                        fr.write("\t process-{}: repeat {} times\n".format(
                            idx, rep_times))

                # One retry on failure, as before, but without swallowing
                # KeyboardInterrupt / SystemExit.
                try:
                    nmi_list = run_clustering()
                except Exception as err:
                    logger.warning(
                        "warning: clustering failed ({}), retrying once ...".
                        format(err))
                    nmi_list = run_clustering()
                if use_workers and len(nmi_list) != options.repeated_times:
                    logger.warning(
                        "warning: eval unmatched repeated_times: {} != {}".
                        format(len(nmi_list), options.repeated_times))

                fr_total.write('%s ckpt-%-9d: ' % (time.strftime(
                    '%Y-%m-%d %H:%M:%S', time.localtime(time.time())),
                                                   cur_step))
                summary = tf.Summary()
                mean_nmi = sum(nmi_list) / float(len(nmi_list))
                fr.write(
                    'expected repeated_times: {}, actual repeated_times: {}, mean results as follows:\n'
                    .format(options.repeated_times, len(nmi_list)))
                fr.write('\t\t NMI = {}\n'.format(mean_nmi))
                fr.write('details:\n')
                for repeat, nmi in enumerate(nmi_list):
                    fr.write('\t repeated {}/{}: NMI = {}\n'.format(
                        repeat + 1, len(nmi_list), nmi))
                fr.write('\neval case: cluster completed in {}s\n'.format(
                    time.time() - time_start))

            fr_total.write('{}\n'.format(mean_nmi))
            fr_total.flush()
            summary.value.add(tag='nmi', simple_value=mean_nmi)
            summary_writer.add_summary(summary, cur_step)
            summary_writer.flush()
            logger.info(
                'cluster completed in {}s\n================================='.
                format(time.time() - time_start))

            # Keep a copy of the checkpoint with the best mean NMI so far.
            if mean_nmi > best_nmi:
                best_nmi = mean_nmi
                ckpt_exists = os.path.exists(
                    os.path.join(ckpt_dir, 'model.ckpt-%d.index' % cur_step))
                info_path = os.path.join(cluster_dir, 'best_ckpt.info')
                # If the checkpoint files are already gone, append a note and
                # keep the previously copied model.ckpt-best.* files.
                with open(info_path, 'w' if ckpt_exists else 'a') as fr_best:
                    if not ckpt_exists:
                        fr_best.write(
                            "Note:the model.ckpt-best is the remainings of last best_ckpt!\n"
                            "the current best_ckpt model is loss, but the result is:\n"
                        )
                    fr_best.write("best_nmi: {}\n".format(best_nmi))
                    fr_best.write("best_ckpt: ckpt-{}\n".format(cur_step))
                if ckpt_exists:
                    for suffix in ('data-00000-of-00001', 'index', 'meta'):
                        source_file = os.path.join(
                            ckpt_dir,
                            'model.ckpt-%d.%s' % (cur_step, suffix))
                        target_file = os.path.join(
                            cluster_dir, 'model.ckpt-best.' + suffix)
                        if os.path.exists(target_file):
                            os.remove(target_file)
                        shutil.copy(source_file, target_file)

            last_step = cur_step
    finally:
        # Runs on normal exit, on the RUN_SUCCESS return and on errors.
        fr_total.close()
        if summary_writer is not None:
            summary_writer.close()
    return
def eval_once(options):
    """Run the clustering evaluation once and write an NMI report.

    Loads the labeled ids and their embedding vectors, publishes the feature
    and label matrices as module globals, runs ``_cluster_thread_body`` for
    ``options.repeated_times`` repetitions (optionally spread over
    ``options.eval_workers`` processes) and writes the mean and per-repetition
    NMI to ``options.cluster_path``.

    Returns ``None``; nothing is done when ``utils.check_rebuild`` declines.

    Args:
        options: run options; attributes read here are ``cluster_path``,
            ``vectors_path``, ``label_path``, ``label_size``,
            ``multilabel_rule``, ``eval_online``, ``eval_workers``,
            ``repeated_times`` and ``always_rebuild``.
    """
    # Published as module globals so that `_cluster_thread_body` can read them.
    global features_matrix, labels_matrix, LABEL_SIZE

    if not utils.check_rebuild(options.cluster_path, descrip='cluster',
                               always_rebuild=options.always_rebuild):
        return

    # The same settings are logged and later written to the report file.
    settings = [
        'eval case: cluster...',
        '\t save_path: {}'.format(options.cluster_path),
        '\t cluster: kmeans',
        '\t multilabel_rule: {}'.format(options.multilabel_rule),
        '\t eval_online: {}'.format(options.eval_online),
        '\t eval_workers: {}'.format(options.eval_workers),
        '\t repeat {} times'.format(options.repeated_times),
    ]
    for line in settings:
        logger.info(line)

    logger.info('\t reading labeled data from file {}'.format(
        options.label_path))
    time_start = time.time()
    id_list, labels_list = utils.get_labeled_data(
        options.label_path, multilabel_rule=options.multilabel_rule)
    features_matrix, labels_list = utils.get_vectors(
        utils.get_KeyedVectors(options.vectors_path), id_list, labels_list)
    # Only the first label of each entry is kept as the cluster ground truth.
    labels_matrix = np.array([item[0] for item in labels_list])
    LABEL_SIZE = options.label_size
    logger.info('\t reading labeled data completed in {}s'.format(
        time.time() - time_start))
    logger.info('\t total labeled data size: {}'.format(
        np.size(features_matrix, axis=0)))
    logger.info('\t total labels size: {}'.format(options.label_size))

    use_workers = options.eval_workers > 1 and options.repeated_times > 1
    times_per_worker = []

    def run_clustering():
        """Run all repetitions and return the list of NMI values."""
        if not use_workers:
            return _cluster_thread_body(options.repeated_times)
        results = []
        with ProcessPoolExecutor(
                max_workers=options.eval_workers) as executor:
            for ret in executor.map(_cluster_thread_body, times_per_worker):
                results.extend(ret)
        return results

    # cluster
    with open(options.cluster_path, 'w') as fr:
        for line in settings:
            fr.write(line + '\n')
        fr.write('\t total labeled data size: {}\n'.format(
            np.size(features_matrix, axis=0)))
        fr.write('\t total labels size: {}\n'.format(options.label_size))
        for i in range(options.label_size):
            fr.write('\t\t label {}: {}\n'.format(
                i, np.sum(labels_matrix == i)))

        if use_workers:
            # speed up by using multi-process
            logger.info("\t allocating repeat_times to workers ...")
            if options.repeated_times <= options.eval_workers:
                times_per_worker = [1] * options.repeated_times
            else:
                div, mod = divmod(options.repeated_times,
                                  options.eval_workers)
                # The first `mod` workers take one extra repetition each.
                times_per_worker = [div + 1 if idx < mod else div
                                    for idx in range(options.eval_workers)]
            assert sum(times_per_worker) == options.repeated_times, \
                'workers allocating failed: %d != %d' % (
                    sum(times_per_worker), options.repeated_times)
            logger.info("\t using {} processes for evaling:".format(
                len(times_per_worker)))
            for idx, rep_times in enumerate(times_per_worker):
                logger.info("\t process-{}: repeat {} times".format(
                    idx, rep_times))

        # One retry on failure, as before, but without swallowing
        # KeyboardInterrupt / SystemExit.
        try:
            nmi_list = run_clustering()
        except Exception as err:
            logger.warning(
                "warning: clustering failed ({}), retrying once ...".format(
                    err))
            nmi_list = run_clustering()
        if use_workers and len(nmi_list) != options.repeated_times:
            logger.warning(
                "warning: eval unmatched repeated_times: {} != {}".format(
                    len(nmi_list), options.repeated_times))

        mean_nmi = sum(nmi_list) / float(len(nmi_list))
        fr.write(
            'expected repeated_times: {}, actual repeated_times: {}, mean results as follows:\n'
            .format(options.repeated_times, len(nmi_list)))
        fr.write('\t\t NMI = {}\n'.format(mean_nmi))
        fr.write('details:\n')
        for repeat, nmi in enumerate(nmi_list):
            fr.write('\t repeated {}/{}: NMI = {}\n'.format(
                repeat + 1, len(nmi_list), nmi))
        fr.write('\neval case: cluster completed in {}s.'.format(
            time.time() - time_start))

    logger.info('eval case: cluster completed in {}s.'.format(
        time.time() - time_start))
    return
def eval_once(options):
    """Run the classification evaluation once and write a Macro/Micro-F1 report.

    Loads the labeled ids and their embedding vectors, binarizes the labels
    into a multi-hot matrix, and calls ``_classify_thread_body`` for every
    train ratio ``options.repeated_times`` times (optionally spread over
    ``options.eval_workers`` processes). Per-ratio means and per-repetition
    scores are written to ``options.classify_path``.

    Returns ``None``; nothing is done when ``utils.check_rebuild`` declines.

    Args:
        options: run options; attributes read here are ``classify_path``,
            ``vectors_path``, ``label_path``, ``label_size``, ``train_ratio``,
            ``eval_online``, ``eval_workers``, ``repeated_times`` and
            ``always_rebuild``.
    """
    # Published as module globals so that `_classify_thread_body` can read them.
    global features_matrix, labels_matrix

    if not utils.check_rebuild(options.classify_path, descrip='classify',
                               always_rebuild=options.always_rebuild):
        return

    logger.info('eval case: classify...')
    logger.info('\t save_path: {}'.format(options.classify_path))
    logger.info('\t classifier: LogisticRegression')
    logger.info('\t eval_online: {}'.format(options.eval_online))
    logger.info('\t eval_workers: {}'.format(options.eval_workers))
    logger.info('\t reading labeled data from file {}'.format(
        options.label_path))

    time_start = time.time()
    id_list, labels_list = utils.get_labeled_data(options.label_path)
    features_matrix, labels_list = utils.get_vectors(
        utils.get_KeyedVectors(options.vectors_path), id_list, labels_list)
    # `classes` is passed by keyword: newer scikit-learn releases no longer
    # accept it positionally.
    mlb = MultiLabelBinarizer(classes=range(options.label_size))
    labels_matrix = mlb.fit_transform(labels_list)
    logger.info('\t reading labeled data completed in {}s'.format(
        time.time() - time_start))
    logger.info('\t total labeled data size: {}'.format(
        np.size(features_matrix, axis=0)))
    logger.info('\t total labels size: {}'.format(options.label_size))

    repeated_times = options.repeated_times
    # A positive `train_ratio` selects that single ratio; otherwise sweep
    # 0.9, 0.8, ..., 0.1.
    if options.train_ratio > 0:
        train_ratio_list = [options.train_ratio]
    else:
        train_ratio_list = [v / 10.0 for v in range(9, 0, -1)]
    logger.info('\t repeat {} times for each train_ratio in {}'.format(
        repeated_times, train_ratio_list))
    # One entry per (ratio, repetition) pair.
    train_ratio_fulllist = [
        train_ratio for train_ratio in train_ratio_list
        for _ in range(repeated_times)
    ]

    # classify
    with open(options.classify_path, 'w') as fr:
        fr.write('eval case: classify...\n')
        fr.write('\t save_path: {}\n'.format(options.classify_path))
        fr.write('\t classifier: LogisticRegression\n')
        fr.write('\t eval_online: {}\n'.format(options.eval_online))
        fr.write('\t eval_workers: {}\n'.format(options.eval_workers))
        fr.write('\t repeat {} times for each train_ratio in {}\n'.format(
            repeated_times, train_ratio_list))
        fr.write('\t total labeled data size: {}\n'.format(
            np.size(features_matrix, axis=0)))
        fr.write('\t total labels size: {}\n'.format(options.label_size))
        for i in range(options.label_size):
            fr.write('\t\t label {}: {}\n'.format(
                i, np.sum(labels_matrix[:, i])))

        if options.eval_workers > 1 and len(train_ratio_fulllist) > 1:
            # speed up by using multi-process
            if len(train_ratio_fulllist) <= options.eval_workers:
                train_ratios_per_worker = [[train_ratio]
                                           for train_ratio in
                                           train_ratio_fulllist]
            else:
                div, mod = divmod(len(train_ratio_fulllist),
                                  options.eval_workers)
                train_ratios_per_worker = [
                    train_ratio_fulllist[div * i:div * (i + 1)]
                    for i in range(options.eval_workers)
                ]
                # Hand the leftover tasks to the workers at the end of the
                # list, one each, starting from the last worker.
                leftovers = train_ratio_fulllist[div * options.eval_workers:]
                for idx, train_ratio in enumerate(leftovers):
                    train_ratios_per_worker[-1 - idx].append(train_ratio)
            logger.info("\t using {} processes for evaling:".format(
                len(train_ratios_per_worker)))
            for idx, train_ratios in enumerate(train_ratios_per_worker):
                logger.info("\t process-{}: {}".format(idx, train_ratios))
            ret_list = []  # (train_ratio, macro, micro)
            with ProcessPoolExecutor(
                    max_workers=options.eval_workers) as executor:
                for ret in executor.map(_classify_thread_body,
                                        train_ratios_per_worker):
                    ret_list.extend(ret)
        else:
            ret_list = _classify_thread_body(train_ratio_fulllist)

        # Group scores by train ratio: ratio -> ([macro...], [micro...]).
        ret_dict = {}
        for train_ratio, macro, micro in ret_list:
            macro_scores, micro_scores = ret_dict.setdefault(
                train_ratio, ([], []))
            macro_scores.append(macro)
            micro_scores.append(micro)

        for train_ratio, (Macro_F1_list, Micro_F1_list) in sorted(
                ret_dict.items(), key=lambda item: item[0]):
            fr.write('\n' + '-' * 20 + '\n' +
                     'train_ratio = {}\n'.format(train_ratio))
            if len(Macro_F1_list) != repeated_times:
                logger.warning(
                    "warning: train_ratio = {} eval unmatched repeated_times: {} != {}"
                    .format(train_ratio, len(Macro_F1_list), repeated_times))
            mean_Macro_F1 = sum(Macro_F1_list) / float(len(Macro_F1_list))
            mean_Micro_F1 = sum(Micro_F1_list) / float(len(Micro_F1_list))
            fr.write(
                'expected repeated_times: {}, actual repeated_times: {}, mean results as follows:\n'
                .format(repeated_times, len(Macro_F1_list)))
            fr.write('\t\t Macro_F1 = {}\n'.format(mean_Macro_F1))
            fr.write('\t\t Micro_F1 = {}\n'.format(mean_Micro_F1))
            fr.write('details:\n')
            for repeat, (macro, micro) in enumerate(
                    zip(Macro_F1_list, Micro_F1_list)):
                fr.write(
                    '\t repeated {}/{}: Macro_F1 = {}, Micro_F1 = {}\n'.format(
                        repeat + 1, len(Macro_F1_list), macro, micro))
        fr.write('\neval case: classify completed in {}s'.format(
            time.time() - time_start))

    logger.info('eval case: classify completed in {}s'.format(
        time.time() - time_start))
def eval_online(options):
    """Evaluate classification (Macro/Micro-F1) on each new checkpoint.

    The function polls the ``ckpt`` directory next to ``options.vectors_path``.
    Whenever a newer checkpoint step shows up and the vectors file is not
    flagged as being written, the embeddings are loaded and
    ``_classify_thread_body`` is run for every train ratio
    ``options.repeated_times`` times (optionally spread over
    ``options.eval_workers`` processes). Mean scores per ratio are appended to
    ``options.classify_path``, details go to
    ``options.classify_path + '.<step>'`` and TensorFlow summaries are
    emitted. When the mean Micro-F1 of the largest evaluated train ratio
    improves, the checkpoint files of that step are copied into the directory
    of ``options.classify_path`` as ``model.ckpt-best.*``.

    Returns (``None``) when ``utils.check_rebuild`` declines the rebuild, when
    ``options.eval_online`` is falsy, or when a ``RUN_SUCCESS`` marker exists
    next to the vectors file and no unevaluated checkpoint is pending.

    Args:
        options: run options; attributes read here are ``classify_path``,
            ``vectors_path``, ``label_path``, ``label_size``, ``train_ratio``,
            ``eval_online``, ``eval_interval``, ``eval_workers``,
            ``repeated_times`` and ``always_rebuild``.
    """
    # Published as module globals so that `_classify_thread_body` can read them.
    global features_matrix, labels_matrix

    classify_dir = os.path.split(options.classify_path)[0]
    if not utils.check_rebuild(classify_dir, descrip='classify',
                               always_rebuild=options.always_rebuild):
        return
    if not os.path.exists(classify_dir):
        os.makedirs(classify_dir)

    logger.info('eval case: classify...')
    logger.info('\t save_dir: {}'.format(classify_dir))
    logger.info('\t classifier: LogisticRegression')
    logger.info('\t eval_online: {}'.format(options.eval_online))
    logger.info('\t eval_interval: {}s'.format(options.eval_interval))
    logger.info('\t eval_workers: {}'.format(options.eval_workers))
    logger.info('\t total labels size: {}'.format(options.label_size))

    repeated_times = options.repeated_times
    # A positive `train_ratio` selects that single ratio; otherwise sweep
    # 0.9, 0.8, ..., 0.1.
    if options.train_ratio > 0:
        train_ratio_list = [options.train_ratio]
    else:
        train_ratio_list = [v / 10.0 for v in range(9, 0, -1)]
    logger.info('\t repeat {} times for each train_ratio in {}'.format(
        repeated_times, train_ratio_list))
    # One entry per (ratio, repetition) pair.
    train_ratio_fulllist = [
        train_ratio for train_ratio in train_ratio_list
        for _ in range(repeated_times)
    ]

    use_workers = options.eval_workers > 1 and len(train_ratio_fulllist) > 1
    train_ratios_per_worker = []
    if use_workers:
        # speed up by using multi-process
        if len(train_ratio_fulllist) <= options.eval_workers:
            train_ratios_per_worker = [[train_ratio]
                                       for train_ratio in train_ratio_fulllist]
        else:
            div, mod = divmod(len(train_ratio_fulllist), options.eval_workers)
            train_ratios_per_worker = [
                train_ratio_fulllist[div * i:div * (i + 1)]
                for i in range(options.eval_workers)
            ]
            # Hand the leftover tasks to the workers at the end of the list,
            # one each, starting from the last worker.
            leftovers = train_ratio_fulllist[div * options.eval_workers:]
            for idx, train_ratio in enumerate(leftovers):
                train_ratios_per_worker[-1 - idx].append(train_ratio)
        logger.info("\t using {} processes for evaling:".format(
            len(train_ratios_per_worker)))
        for idx, train_ratios in enumerate(train_ratios_per_worker):
            logger.info("\t process-{}: {}".format(idx, train_ratios))

    vectors_dir = os.path.split(options.vectors_path)[0]
    ckpt_dir = os.path.join(vectors_dir, 'ckpt')
    run_success = os.path.join(vectors_dir, "RUN_SUCCESS")
    # Flag files used to coordinate with the process writing the vectors.
    reading = options.vectors_path + ".reading_classify"
    writing = options.vectors_path + ".writing"

    def latest_step():
        """Return the step number encoded in the newest checkpoint path."""
        state = tf.train.get_checkpoint_state(ckpt_dir)
        return int(
            os.path.basename(state.model_checkpoint_path).split('-')[-1])

    def not_ready(step, evaluated_step):
        """True when there is nothing new to read or the vectors are busy."""
        return (step <= evaluated_step
                or not os.path.exists(options.vectors_path)
                or os.path.exists(writing))

    def wait_for_new_checkpoint(evaluated_step):
        """Block until a new step is readable; None once training is done."""
        while True:
            step = latest_step()
            if not_ready(step, evaluated_step):
                if os.path.exists(run_success):
                    return None
                time.sleep(options.eval_interval)
                continue
            # ready for reading
            logger.info("\t declare for reading ...")
            with open(reading, "w"):  # declare (creates the flag file)
                pass
            # Re-check after a grace period in case a writer started meanwhile.
            time.sleep(30)
            step = latest_step()
            if not_ready(step, evaluated_step):
                os.remove(reading)  # undeclare
                logger.info("\t confliction! undeclare and waiting ...")
                time.sleep(options.eval_interval)
                continue
            return step

    fr_total = open(options.classify_path, 'w')
    summary_writer = None
    try:
        fr_total.write('eval case: classify...\n')
        fr_total.write('\t save_dir: {}\n'.format(classify_dir))
        fr_total.write('\t classifier: LogisticRegression\n')
        fr_total.write('\t eval_online: {}\n'.format(options.eval_online))
        fr_total.write('\t eval_interval: {}s\n'.format(
            options.eval_interval))
        fr_total.write('\t eval_workers: {}\n'.format(options.eval_workers))
        fr_total.write('\t repeat {} times for each train_ratio in {}\n'.format(
            repeated_times, train_ratio_list))
        fr_total.write('\t total labels size: {}\n'.format(
            options.label_size))
        fr_total.write(
            '\t results(Macro_F1,Micro_F1):\n=============================================================\n'
        )
        # Column header follows the ratios actually evaluated (ascending, the
        # order in which results are written); for the default sweep this is
        # "0.1 ... 0.9" as before.
        fr_total.write('finish_time\tckpt\t\t' + '\t'.join(
            str(train_ratio) for train_ratio in sorted(train_ratio_list)) +
                       '\n')

        time_start = time.time()
        logger.info('\t reading labeled data from file {}'.format(
            options.label_path))
        id_list_totoal, labels_list_total = utils.get_labeled_data(
            options.label_path)
        logger.info('\t reading labeled data completed in {}s'.format(
            time.time() - time_start))

        last_step = 0
        summary_writer = tf.summary.FileWriter(classify_dir, tf.Graph())
        summary = tf.Summary()
        for train_ratio in train_ratio_list:
            summary.value.add(tag='macro_train_{}'.format(train_ratio),
                              simple_value=0.)
            summary.value.add(tag='micro_train_{}'.format(train_ratio),
                              simple_value=0.)
        summary_writer.add_summary(summary, last_step)

        best_micro = 0

        ckpt = tf.train.get_checkpoint_state(ckpt_dir)
        while not (ckpt and ckpt.model_checkpoint_path):
            logger.info("\t model and vectors not exist, waiting ...")
            time.sleep(options.eval_interval)
            ckpt = tf.train.get_checkpoint_state(ckpt_dir)

        while options.eval_online:
            cur_step = wait_for_new_checkpoint(last_step)
            if cur_step is None:
                return

            logger.info("\t eval ckpt-{}.......".format(cur_step))
            time_start = time.time()
            logger.info('\t reading embedding vectors from file {}'.format(
                options.vectors_path))
            try:
                features_matrix, labels_list = utils.get_vectors(
                    utils.get_KeyedVectors(options.vectors_path),
                    id_list_totoal, labels_list_total)
            finally:
                # synchrolock for multi-process: always release the flag,
                # even when loading the vectors fails.
                os.remove(reading)
            logger.info("\t done for reading ...")
            # `classes` is passed by keyword: newer scikit-learn releases no
            # longer accept it positionally.
            mlb = MultiLabelBinarizer(classes=range(options.label_size))
            labels_matrix = mlb.fit_transform(labels_list)
            logger.info('\t reading embedding vectors completed in {}s'.format(
                time.time() - time_start))
            logger.info('\t total labeled data size: {}'.format(
                np.size(features_matrix, axis=0)))
            logger.info('\t total labels size: {}'.format(options.label_size))

            # Micro-F1 of the last (largest) train ratio written this round.
            micro_at_largest_ratio = None

            # classify
            step_path = options.classify_path + '.{}'.format(cur_step)
            with open(step_path, 'w') as fr:
                fr.write('eval case: classify...\n')
                fr.write('\t classifier: LogisticRegression\n')
                fr.write('\t eval_workers: {}\n'.format(options.eval_workers))
                fr.write(
                    '\t repeat {} times for each train_ratio in {}\n'.format(
                        repeated_times, train_ratio_list))
                fr.write('\t total labeled data size: {}\n'.format(
                    np.size(features_matrix, axis=0)))
                fr.write('\t total labels size: {}\n'.format(
                    options.label_size))
                for i in range(options.label_size):
                    fr.write('\t\t label {}: {}\n'.format(
                        i, np.sum(labels_matrix[:, i])))

                if use_workers:
                    fr.write("\t using {} processes for evaling:\n".format(
                        len(train_ratios_per_worker)))
                    for idx, train_ratios in enumerate(
                            train_ratios_per_worker):
                        fr.write("\t process-{}: {}\n".format(
                            idx, train_ratios))
                    ret_list = []  # (train_ratio, macro, micro)
                    with ProcessPoolExecutor(
                            max_workers=options.eval_workers) as executor:
                        for ret in executor.map(_classify_thread_body,
                                                train_ratios_per_worker):
                            ret_list.extend(ret)
                else:
                    ret_list = _classify_thread_body(train_ratio_fulllist)

                fr_total.write('%s ckpt-%-9d: ' % (time.strftime(
                    '%Y-%m-%d %H:%M:%S', time.localtime(time.time())),
                                                   cur_step))
                summary = tf.Summary()

                # Group scores by ratio: ratio -> ([macro...], [micro...]).
                ret_dict = {}
                for train_ratio, macro, micro in ret_list:
                    macro_scores, micro_scores = ret_dict.setdefault(
                        train_ratio, ([], []))
                    macro_scores.append(macro)
                    micro_scores.append(micro)

                for train_ratio, (Macro_F1_list, Micro_F1_list) in sorted(
                        ret_dict.items(), key=lambda item: item[0]):
                    fr.write('\n' + '-' * 20 + '\n' +
                             'train_ratio = {}\n'.format(train_ratio))
                    if len(Macro_F1_list) != repeated_times:
                        logger.warning(
                            "warning: train_ratio = {} eval unmatched repeated_times: {} != {}"
                            .format(train_ratio, len(Macro_F1_list),
                                    repeated_times))
                    mean_Macro_F1 = sum(Macro_F1_list) / float(
                        len(Macro_F1_list))
                    mean_Micro_F1 = sum(Micro_F1_list) / float(
                        len(Micro_F1_list))
                    fr.write(
                        'expected repeated_times: {}, actual repeated_times: {}, mean results as follows:\n'
                        .format(repeated_times, len(Macro_F1_list)))
                    fr.write('\t\t Macro_F1 = {}\n'.format(mean_Macro_F1))
                    fr.write('\t\t Micro_F1 = {}\n'.format(mean_Micro_F1))
                    fr.write('details:\n')
                    for repeat, (macro, micro) in enumerate(
                            zip(Macro_F1_list, Micro_F1_list)):
                        fr.write(
                            '\t repeated {}/{}: Macro_F1 = {}, Micro_F1 = {}\n'
                            .format(repeat + 1, len(Macro_F1_list), macro,
                                    micro))
                    fr_total.write('%.4f, %.4f ' %
                                   (mean_Macro_F1, mean_Micro_F1))
                    summary.value.add(
                        tag='macro_train_{}'.format(train_ratio),
                        simple_value=mean_Macro_F1)
                    summary.value.add(
                        tag='micro_train_{}'.format(train_ratio),
                        simple_value=mean_Micro_F1)
                    micro_at_largest_ratio = mean_Micro_F1
                fr.write('\neval case: classify completed in {}s\n'.format(
                    time.time() - time_start))

            fr_total.write('\n')
            fr_total.flush()
            summary_writer.add_summary(summary, cur_step)
            summary_writer.flush()
            logger.info(
                'classify completed in {}s\n================================='.
                format(time.time() - time_start))

            # Keep a copy of the checkpoint with the best mean Micro-F1 at the
            # largest train ratio (0.9 for the default sweep).
            if (micro_at_largest_ratio is not None
                    and micro_at_largest_ratio > best_micro):
                best_micro = micro_at_largest_ratio
                ckpt_exists = os.path.exists(
                    os.path.join(ckpt_dir, 'model.ckpt-%d.index' % cur_step))
                info_path = os.path.join(classify_dir, 'best_ckpt.info')
                # If the checkpoint files are already gone, append a note and
                # keep the previously copied model.ckpt-best.* files.
                with open(info_path, 'w' if ckpt_exists else 'a') as fr_best:
                    if not ckpt_exists:
                        fr_best.write(
                            "Note:the model.ckpt-best is the remainings of last best_ckpt!\n"
                            "the current best_ckpt model is loss, but the result is:\n"
                        )
                    fr_best.write("best_micro(for ratio 0.9): {}\n".format(
                        best_micro))
                    fr_best.write("best_ckpt: ckpt-{}\n".format(cur_step))
                if ckpt_exists:
                    for suffix in ('data-00000-of-00001', 'index', 'meta'):
                        source_file = os.path.join(
                            ckpt_dir,
                            'model.ckpt-%d.%s' % (cur_step, suffix))
                        target_file = os.path.join(
                            classify_dir, 'model.ckpt-best.' + suffix)
                        if os.path.exists(target_file):
                            os.remove(target_file)
                        shutil.copy(source_file, target_file)

            last_step = cur_step
    finally:
        # Runs on normal exit, on the RUN_SUCCESS return and on errors.
        fr_total.close()
        if summary_writer is not None:
            summary_writer.close()
def __init__(self,
             net,
             using_label=False,
             feature_type="random",
             shuffled=True,
             label_path=None,
             label_size=None,
             feature_path=None,
             feature_size=None):
    """Build the adjacency, target and feature matrices for a network.

    Args:
        net: network object; this constructor uses ``net.get_nodes_size()``,
            iterates ``net.edges`` as ``(x, y)`` pairs and, for the
            ``"degree"`` feature type, calls ``net.get_degrees(idx)``.
        using_label: when true, multi-hot node labels are loaded from
            ``label_path``; otherwise the adjacency matrix itself is kept as
            the target.
        feature_type: one of ``"attribute"``, ``"random"``, ``"degree"`` or
            ``"adjacency"``. Any other value is only logged as an error and
            leaves the node features unset.
        shuffled: when true, the node visiting order is shuffled once here.
        label_path: file handed to ``utils.get_labeled_data``.
        label_size: number of label columns.
        feature_path: file handed to ``utils.get_features``.
        feature_size: number of feature columns.
    """
    node_count = net.get_nodes_size()
    self._nodes_size = node_count
    self._using_label = using_label
    self._feature_type = feature_type
    self._shuffled = shuffled
    self._label_size = label_size
    self._feature_size = feature_size

    # Dense adjacency matrix; each edge is written in one direction only.
    adjacency = np.zeros((node_count, node_count), dtype=np.int32)
    for src, dst in net.edges:
        adjacency[src, dst] = 1

    # Laplacian derived from the adjacency by `preprocess_adj`.
    self._laplacian_matrix = self.preprocess_adj(adjacency)

    if using_label:
        # Multi-hot label encoding: one row per node, one column per label.
        self._nodes_labels = np.zeros((node_count, label_size),
                                      dtype=np.int32)
        labeled_ids, labels_per_node = utils.get_labeled_data(label_path)
        assert len(labeled_ids) == node_count, \
            "error: not all nodes is labeled, %d != %d" % (
                len(labeled_ids), node_count)
        for node_id, node_labels in zip(labeled_ids, labels_per_node):
            self._nodes_labels[node_id, node_labels] = 1
    else:
        # Without labels the adjacency matrix is the target.
        self._adj_matrix = adjacency

    # Node features (attribute features are marked as future work).
    if feature_type == "attribute":
        self._nodes_features = utils.get_features(feature_path)
        rows = self._nodes_features.shape[0]
        assert rows == node_count, "error: %d != %d" % (
            self._nodes_features.shape[0], node_count)
        cols = self._nodes_features.shape[1]
        assert cols == feature_size, "error: %d != %d" % (
            self._nodes_features.shape[1], feature_size)
    elif feature_type == "random":
        self._nodes_features = np.random.uniform(
            size=[node_count, feature_size])
    elif feature_type == "degree":
        assert feature_size == 1, "error: %d != 1" % feature_size
        degree_features = np.zeros((node_count, feature_size),
                                   dtype=np.float32)
        self._nodes_features = degree_features
        for node in range(node_count):
            degree_features[node][0] = net.get_degrees(node)
    elif feature_type == "adjacency":
        assert feature_size == node_count, "error: %d != %d" % (
            feature_size, node_count)
        self._nodes_features = adjacency
    else:
        logger.error("error! invalid feature_type: {}".format(feature_type))

    # Visiting order and epoch bookkeeping.
    self._nodes_order = np.arange(node_count)
    if shuffled:
        np.random.shuffle(self._nodes_order)
    self._epochs_completed = 0
    self._index_in_epoch = 0
def eval_once(options):
    """Run the visualization evaluation once and write a report.

    Loads the labeled ids (restricted by ``options.eval_node_type``) and their
    embedding vectors, calls ``plot_embedding_in_2D`` to produce the figure
    and a clustering-center-distance score (CCD), and writes settings, data
    statistics, the figure path and the score to
    ``options.visualization_path``. The figure is stored next to the report
    as ``visualization_<embedding_dimension>``.

    Returns ``None``; nothing is done when ``utils.check_rebuild`` declines.

    Args:
        options: run options; attributes read here are
            ``visualization_path``, ``vectors_path``, ``data_dir``,
            ``data_name``, ``isdirected``, ``label_path``, ``label_size``,
            ``eval_node_type``, ``multilabel_rule``, ``marker_size``,
            ``eval_online`` and ``always_rebuild``.
    """
    if not utils.check_rebuild(options.visualization_path,
                               descrip='visualization',
                               always_rebuild=options.always_rebuild):
        return

    # The same settings are logged and later written to the report file.
    settings = [
        'eval case: visualization...',
        '\t data_dir = {}'.format(options.data_dir),
        '\t data_name = {}'.format(options.data_name),
        '\t isdirected = {}'.format(options.isdirected),
        '\t label_path = {}'.format(options.label_path),
        '\t label_size = {}'.format(options.label_size),
        '\t eval_node_type: {}'.format(options.eval_node_type),
        '\t save_path: {}\n'.format(options.visualization_path),
        '\t method: t-SNE',
        '\t multilabel_rule: {}'.format(options.multilabel_rule),
        '\t marker_size: {}'.format(options.marker_size),
        '\t eval_online: {}'.format(options.eval_online),
    ]
    for line in settings:
        logger.info(line)

    # get embedding vectors and labels
    logger.info('\t reading labeled data from file {}'.format(
        options.label_path))
    time_start = time.time()
    id_list, labels_list = utils.get_labeled_data(
        options.label_path,
        type=options.eval_node_type,
        multilabel_rule=options.multilabel_rule,
        type_filepath=os.path.join(options.data_dir,
                                   options.data_name + ".nodes"))
    id_list, features_matrix, labels_list = utils.get_vectors(
        utils.get_KeyedVectors(options.vectors_path), id_list, labels_list)
    # Only the first label of each entry is used.
    labels_matrix = np.array([item[0] for item in labels_list])
    logger.info('\t reading labeled data completed in {}s'.format(
        time.time() - time_start))

    embedding_dimension = np.size(features_matrix, axis=1)
    statistics = [
        '\t total labeled data size: {}'.format(
            np.size(features_matrix, axis=0)),
        '\t the labels data embedding_dimension: {}'.format(
            embedding_dimension),
        '\t total labels size: {}'.format(options.label_size),
    ]
    statistics.extend(
        '\t\t label {}: {}'.format(i, np.sum(labels_matrix == i))
        for i in range(options.label_size))
    for line in statistics:
        logger.info(line)

    figure_name = "visualization_" + str(embedding_dimension)
    figure_path = os.path.join(
        os.path.split(options.visualization_path)[0], figure_name)

    # `with` guarantees the report is closed even if plotting raises.
    with open(options.visualization_path, 'w') as fr:
        for line in settings:
            fr.write(line + '\n')
        for line in statistics:
            fr.write(line + '\n')

        CCD = plot_embedding_in_2D(Markersize=options.marker_size,
                                   features_matrix=features_matrix,
                                   labels_matrix=labels_matrix,
                                   label_size=options.label_size,
                                   figure_path=figure_path)

        fr.write('\n figure_path: {}\n'.format(figure_path))
        fr.write(' clustering_center_distance_sim: {}\n'.format(CCD))
        fr.write(
            '\neval case: visualization completed in {}s\n ======================'
            .format(time.time() - time_start))

    logger.info(
        'eval case: visualization completed in {}s\n ======================'.
        format(time.time() - time_start))
def _wait_for_new_vectors(options, ckpt_dir, last_step, reading, writing):
    """Poll until vectors for a checkpoint newer than ``last_step`` can be read.

    On success the ``reading`` marker file exists and must be removed by the
    caller once the vectors have been loaded.

    Returns:
        The step number of the new checkpoint, or None when a ``RUN_SUCCESS``
        file is found next to the vectors file while nothing new is available.
    """
    vectors_dir = os.path.split(options.vectors_path)[0]

    def latest_step():
        ckpt = tf.train.get_checkpoint_state(ckpt_dir)
        return int(ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])

    def not_ready(step):
        return (step <= last_step
                or not os.path.exists(options.vectors_path)
                or os.path.exists(writing))

    while True:
        cur_step = latest_step()
        if not_ready(cur_step):
            if os.path.exists(os.path.join(vectors_dir, "RUN_SUCCESS")):
                return None
            time.sleep(options.eval_interval)
            continue

        # Declare the intent to read by creating the marker file, then wait and
        # re-check: if a ".writing" marker showed up meanwhile, back off.
        logger.info("\t declare for reading ...")
        with open(reading, "w"):
            pass  # the empty file itself is the declaration
        time.sleep(30)
        cur_step = latest_step()
        if not_ready(cur_step):
            os.remove(reading)  # undeclare
            logger.info("\t confliction! undeclare and waiting ...")
            time.sleep(options.eval_interval)
            continue
        return cur_step


def eval_online(options):
    """Evaluate every new checkpoint with a t-SNE visualization while it appears.

    For each new checkpoint step the embedding vectors are loaded, plotted via
    ``plot_embedding_in_2D`` and the value it returns (reported as CCD /
    ``clustering_center_distance_sim``) is appended to
    ``options.visualization_path``, written to a per-step report
    ``<visualization_path>.<step>`` and added to a TensorFlow summary. When the
    CCD exceeds the best seen so far, ``best_ckpt.info`` is updated and the
    checkpoint files (if still present) are copied as ``model.ckpt-best.*``.

    Args:
        options: parsed run options. Attributes read here: visualization_path,
            always_rebuild, data_dir, data_name, isdirected, label_path,
            label_size, eval_node_type, multilabel_rule, marker_size,
            eval_online, eval_interval, vectors_path.

    Returns:
        None. The function ends when ``options.eval_online`` is falsy or a
        ``RUN_SUCCESS`` file is found next to the vectors file.
    """
    visual_dir = os.path.split(options.visualization_path)[0]
    if not utils.check_rebuild(visual_dir, descrip='visualization',
                               always_rebuild=options.always_rebuild):
        return
    if not os.path.exists(visual_dir):
        os.makedirs(visual_dir)

    # Configuration summary shared by the logger, the total report and the
    # per-checkpoint reports (the latter omit the save_dir entry).
    head_lines = [
        'eval case: visualization...',
        '\t data_dir = {}'.format(options.data_dir),
        '\t data_name = {}'.format(options.data_name),
        '\t isdirected = {}'.format(options.isdirected),
        '\t label_path = {}'.format(options.label_path),
        '\t label_size = {}'.format(options.label_size),
        '\t eval_node_type: {}'.format(options.eval_node_type),
    ]
    save_dir_line = '\t save_dir: {}\n'.format(visual_dir)
    tail_lines = [
        '\t method: t-SNE',
        '\t multilabel_rule: {}'.format(options.multilabel_rule),
        '\t marker_size: {}'.format(options.marker_size),
        '\t eval_online: {}'.format(options.eval_online),
        '\t eval_interval: {}s'.format(options.eval_interval),
    ]
    for line in head_lines + [save_dir_line] + tail_lines:
        logger.info(line)

    logger.info('\t reading labeled data from file {}'.format(options.label_path))
    time_start = time.time()
    id_list_total, labels_list_total = utils.get_labeled_data(
        options.label_path,
        type=options.eval_node_type,
        multilabel_rule=options.multilabel_rule,
        type_filepath=os.path.join(options.data_dir, options.data_name + ".nodes"))
    logger.info('\t reading labeled data completed in {}s'.format(time.time() - time_start))
    total_lines = [
        '\t total labeled data size: {}'.format(len(id_list_total)),
        '\t total labels size: {}'.format(options.label_size),
    ]
    for line in total_lines:
        logger.info(line)

    ckpt_dir = os.path.join(os.path.split(options.vectors_path)[0], 'ckpt')
    reading = options.vectors_path + ".reading_visualization_{}".format(options.eval_node_type)
    writing = options.vectors_path + ".writing"

    # Context manager + try/finally: the report and the summary writer are
    # closed on every exit path, including the early RUN_SUCCESS return.
    with open(options.visualization_path, 'w') as fr_total:
        fr_total.writelines(
            line + '\n' for line in head_lines + [save_dir_line] + tail_lines + total_lines)
        fr_total.write('\t results(CCD-clustering_center_distance_sim):\n'
                       '=============================================================\n')
        fr_total.write('finish_time\tckpt\tCCD\n')

        last_step = 0
        summary_writer = tf.summary.FileWriter(visual_dir, tf.Graph())
        try:
            summary = tf.Summary()
            summary.value.add(tag='CCD', simple_value=0.)
            summary_writer.add_summary(summary, last_step)
            best_CCD = 0

            # Wait until the trainer has produced its first checkpoint.
            ckpt = tf.train.get_checkpoint_state(ckpt_dir)
            while not (ckpt and ckpt.model_checkpoint_path):
                logger.info("model and vectors not exist, waiting...")
                time.sleep(options.eval_interval)
                ckpt = tf.train.get_checkpoint_state(ckpt_dir)

            while options.eval_online:
                cur_step = _wait_for_new_vectors(options, ckpt_dir, last_step, reading, writing)
                if cur_step is None:
                    return

                logger.info("\t eval ckpt-{}.......".format(cur_step))
                time_start = time.time()
                logger.info('\t reading embedding vectors from file {}'.format(options.vectors_path))
                try:
                    _, features_matrix, labels_list = utils.get_vectors(
                        utils.get_KeyedVectors(options.vectors_path),
                        id_list_total, labels_list_total)
                finally:
                    # Always drop the marker so a failed read does not leave it behind.
                    os.remove(reading)  # synchrolock for multi-process
                logger.info("\t done for reading ...")
                # Only the first label of every node is used.
                labels_matrix = np.array([item[0] for item in labels_list])
                logger.info('\t reading labeled data completed in {}s'.format(time.time() - time_start))

                data_lines = [
                    '\t total labeled data size: {}'.format(np.size(features_matrix, axis=0)),
                    '\t total labels size: {}'.format(options.label_size),
                ]
                data_lines.extend(
                    '\t\t label {}: {}'.format(i, np.sum(labels_matrix == i))
                    for i in range(options.label_size))
                for line in data_lines:
                    logger.info(line)

                figure_name = ("visualization_" + str(np.size(features_matrix, axis=1))
                               + '.{}'.format(cur_step))
                figure_path = os.path.join(visual_dir, figure_name)

                # Per-checkpoint report.
                with open(options.visualization_path + '.{}'.format(cur_step), 'w') as fr:
                    fr.writelines(line + '\n' for line in head_lines + tail_lines + data_lines)

                    fr_total.write('%s ckpt-%-9d: ' % (
                        time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())), cur_step))

                    CCD = plot_embedding_in_2D(Markersize=options.marker_size,
                                               features_matrix=features_matrix,
                                               labels_matrix=labels_matrix,
                                               label_size=options.label_size,
                                               figure_path=figure_path)

                    fr.write('\n figure_path: {}\n'.format(figure_path))
                    fr.write(' clustering_center_distance_sim:{}\n'.format(CCD))
                    fr.write('\neval case: visualization completed in {}s\n ======================'.format(
                        time.time() - time_start))

                fr_total.write('%.4f\n' % CCD)
                fr_total.flush()
                summary = tf.Summary()
                summary.value.add(tag='CCD', simple_value=CCD)
                summary_writer.add_summary(summary, cur_step)
                summary_writer.flush()
                logger.info('visualization completed in {}s\n================================='.format(
                    time.time() - time_start))

                # Keep a copy of the checkpoint files whenever the CCD improves.
                if CCD > best_CCD:
                    best_CCD = CCD
                    ckpt_exists = os.path.exists(
                        os.path.join(ckpt_dir, 'model.ckpt-%d.index' % cur_step))
                    # If the checkpoint is already gone, append to the info file
                    # instead of overwriting it and note that the saved
                    # model.ckpt-best files belong to an earlier best step.
                    info_mode = 'w' if ckpt_exists else 'a'
                    with open(os.path.join(visual_dir, 'best_ckpt.info'), info_mode) as fr_best:
                        if not ckpt_exists:
                            fr_best.write("Note:the model.ckpt-best is the remainings of last best_ckpt!\n"
                                          "the current best_ckpt model is loss, but the result is:\n")
                        fr_best.write("best_CCD: {}\n".format(best_CCD))
                        fr_best.write("best_ckpt: ckpt-{}\n".format(cur_step))

                    if ckpt_exists:
                        for suffix in ('data-00000-of-00001', 'index', 'meta'):
                            source_file = os.path.join(
                                ckpt_dir, 'model.ckpt-%d.%s' % (cur_step, suffix))
                            target_file = os.path.join(visual_dir, 'model.ckpt-best.' + suffix)
                            if os.path.exists(target_file):
                                os.remove(target_file)
                            shutil.copy(source_file, target_file)

                last_step = cur_step
        finally:
            summary_writer.close()
    return