def eval(options):
    """Run node evaluation, gated on the metapath tree containing the target type.

    When options.metapath_path is not "all", the meta tree is built first and
    evaluation is skipped (returns False) if options.eval_node_type never
    appears among the tree's node "type" attributes.  Otherwise dispatches to
    eval_online or eval_once based on options.eval_online and returns True.

    NOTE(review): shadows the builtin ``eval``; kept for caller compatibility.
    """
    if "all" not in options.metapath_path:
        metatree = network.construct_meta_tree(
            metapaths_filename=options.metapath_path)
        # Guard: the requested node type must exist somewhere in the meta tree.
        type_found = any(
            metatree.nodes[node]["type"] == options.eval_node_type
            for node in metatree.nodes())
        if not type_found:
            return False
    if options.eval_online:
        eval_online(options)
    else:
        eval_once(options)
    return True
def eval(options):
    """Run edge evaluation, gated on both endpoint types existing in the meta tree.

    When options.metapath_path is not "all", the meta tree is built and
    evaluation is skipped (returns False) unless BOTH entries of
    options.eval_edge_type appear among the tree's node "type" attributes.
    Otherwise constructs the heterogeneous network and dispatches to
    eval_online or eval_once, returning True.

    NOTE(review): shadows the builtin ``eval`` and, if in the same module as
    the node-type variant, redefines it; kept for caller compatibility.
    """
    if "all" not in options.metapath_path:
        metatree = network.construct_meta_tree(
            metapaths_filename=options.metapath_path)
        # Collect every node type once, then test both edge endpoints.
        node_types = {metatree.nodes[node]["type"] for node in metatree.nodes()}
        if not (options.eval_edge_type[0] in node_types
                and options.eval_edge_type[1] in node_types):
            return False
    net = network.construct_network(options, isHIN=True, print_net_info=False)
    if options.eval_online:
        eval_online(options, net)
    else:
        eval_once(options, net)
    return True
# Small driver script: build a meta tree from a metapath file.
# NOTE(review): `nx` is imported but unused in the visible lines — it may be
# used elsewhere; confirm before removing.
import networkx as nx
from network import construct_meta_tree

# Alternative metapath inputs kept for quick switching during experiments:
# construct_meta_tree(metapaths_filename='metapath/apcpa')
mt = construct_meta_tree(metapaths_filename='metapath/apc_apa')
# construct_meta_tree(metapaths_filename='metapath/apc_tpa')
def train_vectors(options):
    """Prepare directories/checkpoints, log the full training config, and train embeddings.

    Steps visible in this function:
      1. Inspect an existing TF checkpoint under <vectors_dir>/ckpt and
         interactively ask whether to restart ('0'), resume latest ('-1'),
         or resume a specific checkpoint number.
      2. Build the heterogeneous network, write learning-rate schedule values
         to lr.info, and log/write the complete parameter set to
         embedding.info.
      3. Configure CUDA visibility and TF log level from options.
      4. Build the meta graph/tree (per options.using_metapath), construct a
         Walker, and call train().

    NOTE(review): uses interactive input() — this blocks in non-interactive
    runs; confirm that is intended.
    """
    # check vectors and ckpt
    checkpoint = '0'
    train_vec_dir = os.path.split(options.vectors_path)[0]
    ckpt_dir = os.path.join(train_vec_dir, 'ckpt')
    ckpt = tf.train.get_checkpoint_state(ckpt_dir)
    if ckpt and ckpt.model_checkpoint_path:
        # Checkpoint paths look like .../model-<step>; recover <step> for the prompt.
        cur_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]
        logger.info(
            "model and vectors already exists, checkpoint step = {}".format(
                cur_step))
        checkpoint = input(
            "please input 0 to start a new train, or input a choosed ckpt to restore (-1 for latest ckpt)"
        )
    if checkpoint == '0':
        # Fresh start: wipe any existing checkpoints first.
        if ckpt:
            tf.gfile.DeleteRecursively(ckpt_dir)
        logger.info('start a new embedding train using tensorflow ...')
    elif checkpoint == '-1':
        logger.info(
            'restore a embedding train using tensorflow from latest ckpt ...')
    else:
        logger.info(
            'restore a embedding train using tensorflow from ckpt-%s ...' %
            checkpoint)
    if not os.path.exists(train_vec_dir):
        os.makedirs(train_vec_dir)
    if not os.path.exists(ckpt_dir):
        os.makedirs(ckpt_dir)

    # construct network
    net = network.construct_network(options, isHIN=True)

    # Persist the LR schedule (rate, decay epochs, decay rate, total epochs)
    # so train() can read it back from lr.info.
    lr_file = os.path.join(train_vec_dir, "lr.info")
    np.savetxt(lr_file,
               np.asarray([
                   options.learning_rate, options.decay_epochs,
                   options.decay_rate, options.iter_epoches
               ],
                          dtype=np.float32),
               fmt="%.6f")

    random_walker = "spacey"

    # train info — mirror the full configuration into the log ...
    logger.info('Train info:')
    logger.info('\t data_dir = {}'.format(options.data_dir))
    logger.info('\t data_name = {}'.format(options.data_name))
    logger.info('\t isdirected = {}\n'.format(options.isdirected))
    logger.info('\t train_model = {}'.format(options.model))
    logger.info('\t random_walker = {}'.format(random_walker))
    logger.info('\t walk_workers = {}'.format(options.walk_workers))
    logger.info('\t train_workers = {}\n'.format(options.train_workers))
    logger.info('\t walk_restart = {}'.format(options.walk_restart))
    logger.info('\t walk_times = {}'.format(options.walk_times))
    logger.info('\t walk_length = {}'.format(options.walk_length))
    logger.info('\t batch_size = {}'.format(options.batch_size))
    logger.info('\t history_position = {}\n'.format(options.history_position))
    logger.info('\t using_metapath = {}\n'.format(options.using_metapath))
    logger.info('\t metapath_path = {}\n'.format(options.metapath_path))
    logger.info('\t total embedding nodes = {}'.format(net.get_nodes_size()))
    logger.info('\t total edges = {}'.format(
        np.size(np.array(net.edges, dtype=np.int32), axis=0)))
    logger.info('\t embedding_size = {}'.format(options.embedding_size))
    logger.info('\t negative = {}'.format(options.negative))
    logger.info('\t distortion_power = {}'.format(options.distortion_power))
    logger.info('\t iter_epoches = {}'.format(options.iter_epoches))
    logger.info('\t init_learning_rate = {}'.format(options.learning_rate))
    logger.info('\t decay_epochs = {}'.format(options.decay_epochs))
    logger.info('\t decay_interval = {}'.format(options.decay_interval))
    logger.info('\t decay_rate = {}'.format(options.decay_rate))
    logger.info('\t loss_interval = {}s'.format(options.loss_interval))
    logger.info('\t summary_steps = {}'.format(options.summary_steps))
    logger.info('\t summary_interval = {}s'.format(options.summary_interval))
    logger.info('\t ckpt_epochs = {}'.format(options.ckpt_epochs))
    logger.info('\t ckpt_interval = {}s\n'.format(options.ckpt_interval))
    logger.info('\t using_gpu = {}'.format(options.using_gpu))
    logger.info('\t visible_device_list = {}'.format(
        options.visible_device_list))
    logger.info('\t log_device_placement = {}'.format(
        options.log_device_placement))
    logger.info('\t allow_soft_placement = {}'.format(
        options.allow_soft_placement))
    logger.info('\t gpu_memory_fraction = {}'.format(
        options.gpu_memory_fraction))
    logger.info('\t gpu_memory_allow_growth = {}'.format(options.allow_growth))
    logger.info('\t ckpt_dir = {}'.format(ckpt_dir))
    logger.info('\t vectors_path = {}'.format(options.vectors_path))
    logger.info('\t learning_rate_path = {}'.format(lr_file))

    # ... and into embedding.info alongside the vectors for reproducibility.
    fr_vec = open(os.path.join(train_vec_dir, 'embedding.info'), 'w')
    fr_vec.write('embedding info:\n')
    fr_vec.write('\t data_dir = {}\n'.format(options.data_dir))
    fr_vec.write('\t data_name = {}\n'.format(options.data_name))
    fr_vec.write('\t isdirected = {}\n\n'.format(options.isdirected))
    fr_vec.write('\t train_model = {}\n'.format(options.model))
    fr_vec.write('\t random_walker = {}\n'.format(random_walker))
    fr_vec.write('\t walk_workers = {}\n'.format(options.walk_workers))
    fr_vec.write('\t train_workers = {}\n\n'.format(options.train_workers))
    fr_vec.write('\t walk_restart = {}\n'.format(options.walk_restart))
    fr_vec.write('\t walk_times = {}\n'.format(options.walk_times))
    fr_vec.write('\t walk_length = {}\n'.format(options.walk_length))
    fr_vec.write('\t batch_size = {}\n'.format(options.batch_size))
    fr_vec.write('\t history_position = {}\n'.format(options.history_position))
    fr_vec.write('\t using_metapath = {}\n'.format(options.using_metapath))
    fr_vec.write('\t metapath_path = {}\n'.format(options.metapath_path))
    fr_vec.write('\t total embedding nodes = {}\n'.format(
        net.get_nodes_size()))
    fr_vec.write('\t total edges = {}\n'.format(
        np.size(np.array(net.edges, dtype=np.int32), axis=0)))
    fr_vec.write('\t embedding size = {}\n'.format(options.embedding_size))
    fr_vec.write('\t negative = {}\n'.format(options.negative))
    fr_vec.write('\t distortion_power = {}\n\n'.format(
        options.distortion_power))
    fr_vec.write('\t iter_epoches = {}\n'.format(options.iter_epoches))
    fr_vec.write('\t init_learning_rate = {}\n'.format(options.learning_rate))
    fr_vec.write('\t decay_epochs = {}\n'.format(options.decay_epochs))
    fr_vec.write('\t decay_interval = {}\n'.format(options.decay_interval))
    fr_vec.write('\t decay_rate = {}\n'.format(options.decay_rate))
    fr_vec.write('\t loss_interval = {}s\n'.format(options.loss_interval))
    fr_vec.write('\t summary_steps = {}\n'.format(options.summary_steps))
    fr_vec.write('\t summary_interval = {}s\n'.format(
        options.summary_interval))
    fr_vec.write('\t ckpt_epochs = {}\n'.format(options.ckpt_epochs))
    fr_vec.write('\t ckpt_interval = {}s\n\n'.format(options.ckpt_interval))
    fr_vec.write('\t using_gpu = {}\n'.format(options.using_gpu))
    fr_vec.write('\t visible_device_list = {}\n'.format(
        options.visible_device_list))
    fr_vec.write('\t log_device_placement = {}\n'.format(
        options.log_device_placement))
    fr_vec.write('\t allow_soft_placement = {}\n'.format(
        options.allow_soft_placement))
    fr_vec.write('\t gpu_memory_fraction = {}\n'.format(
        options.gpu_memory_fraction))
    fr_vec.write('\t gpu_memory_allow_growth = {}\n'.format(
        options.allow_growth))
    fr_vec.write('\t ckpt_dir = {}\n'.format(ckpt_dir))
    fr_vec.write('\t vectors_path = {}\n'.format(options.vectors_path))
    fr_vec.write('\t learning_rate_path = {}\n'.format(lr_file))
    fr_vec.close()

    # os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    # Restrict visible GPUs (or disable GPU entirely with "-1").
    if options.using_gpu:
        visible_devices = str(options.visible_device_list[0])
        for dev in options.visible_device_list[1:]:
            visible_devices = visible_devices + ',%s' % dev
        os.environ['CUDA_VISIBLE_DEVICES'] = visible_devices
    else:
        os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

    # set log_level for gpu: map the app log level onto TF's C++ log level.
    console_log_level = options.log.upper()
    if console_log_level == "CRITICAL":
        gpu_log = '3'
    elif console_log_level == "ERROR":
        gpu_log = '2'
    elif console_log_level == "WARNING":
        gpu_log = '1'
    else:
        gpu_log = '0'
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = gpu_log

    # Build the meta structure that constrains the walk, if any.
    if options.using_metapath == "metagraph":
        metagraph = network.construct_meta_graph(
            options.metapath_path, isdirected=options.isdirected)
    elif options.using_metapath == "metatree":
        metagraph = network.construct_meta_tree(options.metapath_path,
                                                isdirected=True)
    else:
        metagraph = None

    walker = Walker(net,
                    random_walker=random_walker,
                    walk_length=options.walk_length,
                    walk_restart=options.walk_restart,
                    distortion_power=options.distortion_power,
                    neg_sampled=options.negative,
                    metagraph=metagraph,
                    using_metapath=options.using_metapath,
                    history_position=options.history_position)

    # train
    logger.info('training...')
    time_start = time.time()
    train(walker=walker,
          lr_file=lr_file,
          ckpt_dir=ckpt_dir,
          checkpoint=checkpoint,
          options=options)
    logger.info('train completed in {}s'.format(time.time() - time_start))
    return
def build_walk_corpus(options):
    """Build a random-walk corpus on disk, one walk run per sampled start node.

    Flow: skip if the corpus already exists (unless always_rebuild), pick the
    walker type and network from options.model, log/record the walk config to
    walks.info, build the meta structure and Walker, then for each sampled
    start node fan the walk_times out across walk_workers processes via
    _construct_walk_corpus_and_write_singprocess.

    NOTE(review): if options.model == "DeepWalk" the Walker is never
    constructed below, so `walker.nodes_size` would rely on a pre-existing
    global `walker` — confirm DeepWalk is not used with this variant.
    """
    global walker
    # check walk info and record
    if not utils.check_rebuild(options.corpus_store_path,
                               descrip='walk corpus',
                               always_rebuild=options.always_rebuild):
        return
    # Select the random-walk strategy and whether the net is heterogeneous.
    if options.model == "DeepWalk":
        random_walker = "uniform"
        net = network.construct_network(options, isHIN=False)
    elif options.model == "SpaceyWalk":
        random_walker = "spacey"
        net = network.construct_network(options, isHIN=True)
    elif options.model == "MetatreeWalk":
        random_walker = "metatreewalk"
        net = network.construct_network(options, isHIN=True)
    else:
        logger.error("Unknown model or it cann't build walk corpus: '%s'." %
                     options.model)
        sys.exit()

    logger.info('Corpus bulid: walk info:')
    logger.info('\t data_dir = {}'.format(options.data_dir))
    logger.info('\t data_name = {}'.format(options.data_name))
    logger.info('\t isdirected = {}\n'.format(options.isdirected))
    logger.info('\t random_walker = {}'.format(random_walker))
    logger.info('\t walk_times = {}'.format(options.walk_times))
    logger.info('\t walk_length = {}'.format(options.walk_length))
    logger.info('\t max_walk_workers = {}'.format(options.walk_workers))
    logger.info('\t walk_to_memory = {}'.format(options.walk_to_memory))
    logger.info('\t seed = {}'.format(options.seed))
    logger.info('\t alpha = {}'.format(options.alpha))
    logger.info('\t window_size = {}'.format(options.window_size))
    logger.info('\t sample_size = {}'.format(options.sample_size))
    if options.walk_to_memory:
        logger.info('\t donot store corpus = {}'.format(
            str(options.not_store_corpus)))
        if not options.not_store_corpus:
            logger.info('\t corpus store path = {}'.format(
                options.corpus_store_path))
    else:
        logger.info('\t corpus store path = {}'.format(
            options.corpus_store_path))

    # Record the same configuration next to the corpus for reproducibility.
    fr_walks = open(
        os.path.join(
            os.path.split(options.corpus_store_path)[0], 'walks.info'), 'w')
    fr_walks.write('Corpus walk info:\n')
    fr_walks.write('\t data_dir = {}\n'.format(options.data_dir))
    fr_walks.write('\t data_name = {}\n'.format(options.data_name))
    fr_walks.write('\t isdirected = {}\n\n'.format(options.isdirected))
    fr_walks.write('\t random_walker = {}\n'.format(random_walker))
    fr_walks.write('\t walk times = {}\n'.format(options.walk_times))
    fr_walks.write('\t walk length = {}\n'.format(options.walk_length))
    fr_walks.write('\t max walk workers = {}\n'.format(options.walk_workers))
    fr_walks.write('\t seed = {}\n'.format(options.seed))
    fr_walks.write('\t alpha = {}\n'.format(options.alpha))
    fr_walks.write('\t window_size = {}\n'.format(options.window_size))
    fr_walks.write('\t sample_size = {}\n'.format(options.sample_size))
    fr_walks.write('\t walk to memory = {}\n'.format(
        str(options.walk_to_memory)))
    if options.walk_to_memory:
        fr_walks.write('\t donot store corpus = {}\n'.format(
            str(options.not_store_corpus)))
        if not options.not_store_corpus:
            fr_walks.write('\t corpus store path = {}\n'.format(
                options.corpus_store_path))
    else:
        fr_walks.write('\t corpus store path = {}\n'.format(
            options.corpus_store_path))
    fr_walks.close()

    # Build the meta structure and the Walker for meta-guided models.
    if options.model == "SpaceyWalk":
        if options.using_metapath == "metagraph":
            metagraph = network.construct_meta_graph(
                options.metapath_path, isdirected=options.isdirected)
        elif options.using_metapath == "metatree":
            metagraph = network.construct_meta_tree(options.metapath_path,
                                                    isdirected=True)
        elif options.using_metapath == "metaschema":
            metagraph = None
        else:
            logger.error("Unknown feature : '%s'." % options.using_metapath)
            sys.exit()
        walker = Walker(net,
                        random_walker=random_walker,
                        walk_length=options.walk_length,
                        metagraph=metagraph,
                        using_metapath=options.using_metapath,
                        history_position=options.history_position,
                        task="walk",
                        alpha=options.alpha)
    elif options.model == "MetatreeWalk":
        metagraph = network.construct_meta_tree(options.metapath_path,
                                                isdirected=True)
        walker = Walker(net,
                        random_walker=random_walker,
                        walk_length=options.walk_length,
                        metagraph=metagraph,
                        task="walk")

    corpus_store_dir = os.path.split(options.corpus_store_path)[0]
    if not os.path.exists(corpus_store_dir):
        os.makedirs(corpus_store_dir)
    logger.info(
        'Corpus bulid: walking and computing (using %d workers for multi-process)...'
        % options.walk_workers)
    time_start = time.time()

    # Split walk_times as evenly as possible across at most walk_workers workers.
    if options.walk_times <= options.walk_workers:
        times_per_worker = [1 for _ in range(options.walk_times)]
    else:
        div, mod = divmod(options.walk_times, options.walk_workers)
        times_per_worker = [div for _ in range(options.walk_workers)]
        for idx in range(mod):
            times_per_worker[idx] = times_per_worker[idx] + 1
    assert sum(
        times_per_worker
    ) == options.walk_times, 'workers allocating failed: %d != %d' % (
        sum(times_per_worker), options.walk_times)

    # Sample start nodes deterministically from options.seed.
    nodes_total = list(range(walker.nodes_size))
    sp_random = random.Random(options.seed)
    sp_random.shuffle(nodes_total)
    nodes_total = nodes_total[0:options.sample_size]
    # NOTE(review): these four node ids are hard-coded (debug/experiment
    # seeds?) and are always prepended regardless of the sample — confirm
    # whether they should remain in production runs.
    nodes_total.insert(0, 8407)
    nodes_total.insert(0, 9891)
    nodes_total.insert(0, 8354)
    nodes_total.insert(0, 8798)

    # One process pool per start node; each worker handles a slice of walk_times.
    # NOTE(review): creating a new ProcessPoolExecutor per node is costly —
    # possibly intentional for isolation; verify.
    for node in nodes_total:
        args_list = []
        begin = 0
        for cnt in times_per_worker:
            args_list.append((corpus_store_dir, node, begin + 1, begin + cnt,
                              options.window_size))
            begin += cnt
        with ProcessPoolExecutor(
                max_workers=options.walk_workers) as executor:
            executor.map(_construct_walk_corpus_and_write_singprocess,
                         args_list)

    logger.info('Corpus bulid: walk completed in {}s'.format(time.time() -
                                                             time_start))
    del walker
    gc.collect()
    return
def build_walk_corpus(options):
    """Build a random-walk corpus, either in memory or streamed to files.

    Flow: skip if the corpus already exists (unless always_rebuild), pick the
    walker type and network from options.model, log/record the walk config to
    walks.info, construct the Walker, then either build the corpus in memory
    (optionally storing it) or write walk files and wrap/load them for
    training.  Returns the corpus (WalksCorpus, loaded walks, or None).

    NOTE(review): if this def shares a module with the sampled per-node
    variant of the same name, the later definition silently shadows the
    earlier one — confirm which is intended.
    """
    global walker
    # check walk info and record
    if not utils.check_rebuild(options.corpus_store_path,
                               descrip='walk corpus',
                               always_rebuild=options.always_rebuild):
        return
    # Select the random-walk strategy and whether the net is heterogeneous.
    if options.model == "DeepWalk":
        random_walker = "uniform"
        net = network.construct_network(options, isHIN=False)
    elif options.model == "SpaceyWalk":
        random_walker = "spacey"
        net = network.construct_network(options, isHIN=True)
    elif options.model == "MetatreeWalk":
        random_walker = "metatreewalk"
        net = network.construct_network(options, isHIN=True)
    else:
        logger.error("Unknown model or it cann't build walk corpus: '%s'." %
                     options.model)
        sys.exit()

    logger.info('Corpus bulid: walk info:')
    logger.info('\t data_dir = {}'.format(options.data_dir))
    logger.info('\t data_name = {}'.format(options.data_name))
    logger.info('\t isdirected = {}\n'.format(options.isdirected))
    logger.info('\t random_walker = {}'.format(random_walker))
    logger.info('\t walk_times = {}'.format(options.walk_times))
    logger.info('\t walk_length = {}'.format(options.walk_length))
    logger.info('\t max_walk_workers = {}'.format(options.walk_workers))
    logger.info('\t walk_to_memory = {}'.format(options.walk_to_memory))
    logger.info('\t alpha = {}'.format(options.alpha))
    if options.walk_to_memory:
        logger.info('\t donot store corpus = {}'.format(
            str(options.not_store_corpus)))
        if not options.not_store_corpus:
            logger.info('\t corpus store path = {}'.format(
                options.corpus_store_path))
    else:
        logger.info('\t corpus store path = {}'.format(
            options.corpus_store_path))

    # Record the same configuration next to the corpus for reproducibility.
    fr_walks = open(
        os.path.join(
            os.path.split(options.corpus_store_path)[0], 'walks.info'), 'w')
    fr_walks.write('Corpus walk info:\n')
    fr_walks.write('\t data_dir = {}\n'.format(options.data_dir))
    fr_walks.write('\t data_name = {}\n'.format(options.data_name))
    fr_walks.write('\t isdirected = {}\n\n'.format(options.isdirected))
    fr_walks.write('\t random_walker = {}\n'.format(random_walker))
    fr_walks.write('\t walk times = {}\n'.format(options.walk_times))
    fr_walks.write('\t walk length = {}\n'.format(options.walk_length))
    fr_walks.write('\t max walk workers = {}\n'.format(options.walk_workers))
    fr_walks.write('\t walk to memory = {}\n'.format(
        str(options.walk_to_memory)))
    if options.walk_to_memory:
        fr_walks.write('\t donot store corpus = {}\n'.format(
            str(options.not_store_corpus)))
        if not options.not_store_corpus:
            fr_walks.write('\t corpus store path = {}\n'.format(
                options.corpus_store_path))
    else:
        fr_walks.write('\t corpus store path = {}\n'.format(
            options.corpus_store_path))
    fr_walks.close()

    # Construct the Walker; meta-guided models need a metagraph first.
    if options.model == "DeepWalk":
        walker = Walker(net,
                        random_walker=random_walker,
                        walk_length=options.walk_length)
    elif options.model == "SpaceyWalk":
        if options.using_metapath == "metagraph":
            metagraph = network.construct_meta_graph(
                options.metapath_path, isdirected=options.isdirected)
        elif options.using_metapath == "metatree":
            metagraph = network.construct_meta_tree(options.metapath_path,
                                                    isdirected=True)
        elif options.using_metapath == "metaschema":
            metagraph = None
        else:
            logger.error("Unknown feature : '%s'." % options.using_metapath)
            sys.exit()
        walker = Walker(net,
                        random_walker=random_walker,
                        walk_length=options.walk_length,
                        metagraph=metagraph,
                        using_metapath=options.using_metapath,
                        history_position=options.history_position,
                        task="walk",
                        alpha=options.alpha)
    elif options.model == "MetatreeWalk":
        metagraph = network.construct_meta_tree(options.metapath_path,
                                                isdirected=True)
        walker = Walker(net,
                        random_walker=random_walker,
                        walk_length=options.walk_length,
                        metagraph=metagraph,
                        task="walk")

    walk_corpus = None
    if options.walk_to_memory:
        # Build entirely in memory, optionally persisting afterwards.
        walk_corpus = build_walk_corpus_to_memory(
            options.walk_times, max_num_workers=options.walk_workers)
        if not options.not_store_corpus:
            store_walk_corpus(options.corpus_store_path,
                              walk_corpus,
                              always_rebuild=options.always_rebuild)
    else:
        # walk to files
        walk_files = build_walk_corpus_to_files(
            options.corpus_store_path,
            options.walk_times,
            headflag_of_index_file=options.headflag_of_index_file,
            max_num_workers=options.walk_workers,
            always_rebuild=options.always_rebuild)
        if "train" in options.task:
            # Either load all walks into memory or stream them lazily.
            if options.load_from_memory:
                walk_corpus = load_walks_corpus(walk_files)
            else:
                walk_corpus = WalksCorpus(walk_files)
    del walker
    gc.collect()
    return walk_corpus