Ejemplo n.º 1
0
def eval(options):
    """Run the evaluation if the requested node type is available.

    Unless ``options.metapath_path`` contains "all", the meta tree is built
    from that path and its nodes are scanned for one whose "type" attribute
    equals ``options.eval_node_type``. When no such node exists nothing is
    evaluated.

    Args:
        options: run options; reads ``metapath_path``, ``eval_node_type`` and
            ``eval_online``.

    Returns:
        False when the node type is missing from the meta tree, True after
        ``eval_online`` or ``eval_once`` has been called.
    """
    if "all" not in options.metapath_path:
        metatree = network.construct_meta_tree(
            metapaths_filename=options.metapath_path)
        type_present = any(
            options.eval_node_type == metatree.nodes[node]["type"]
            for node in metatree.nodes())
        if not type_present:
            return False

    # Either every metapath is in use, or the type was found in the tree.
    evaluate = eval_online if options.eval_online else eval_once
    evaluate(options)
    return True
Ejemplo n.º 2
0
def eval(options):
    """Run the evaluation if both edge end-point types are available.

    Unless ``options.metapath_path`` contains "all", the meta tree is built
    from that path and its nodes are scanned for the two node types listed in
    ``options.eval_edge_type`` (index 0 and index 1). If either type is
    missing nothing is evaluated and no network is constructed.

    Args:
        options: run options; reads ``metapath_path``, ``eval_edge_type`` and
            ``eval_online`` and is forwarded to ``network.construct_network``.

    Returns:
        False when at least one of the two types is missing from the meta
        tree, True after ``eval_online`` or ``eval_once`` has been called.
    """
    if "all" not in options.metapath_path:
        metatree = network.construct_meta_tree(
            metapaths_filename=options.metapath_path)
        # The index lookups stay inside the generators so they are only
        # evaluated while iterating over nodes, as in a plain loop.
        first_found = any(
            options.eval_edge_type[0] == metatree.nodes[node]["type"]
            for node in metatree.nodes())
        second_found = any(
            options.eval_edge_type[1] == metatree.nodes[node]["type"]
            for node in metatree.nodes())
        if not (first_found and second_found):
            return False

    net = network.construct_network(options, isHIN=True, print_net_info=False)
    evaluate = eval_online if options.eval_online else eval_once
    evaluate(options, net)
    return True
Ejemplo n.º 3
0
import networkx as nx
from network import construct_meta_tree

# Build the meta tree from the 'apc_apa' metapath file.
# Other metapath files that appeared in earlier, disabled calls:
#   'metapath/apcpa'
#   'metapath/apc_tpa'
_metapaths_file = 'metapath/apc_apa'
mt = construct_meta_tree(metapaths_filename=_metapaths_file)
Ejemplo n.º 4
0
def train_vectors(options):
    """Prepare the output directories, record the run settings and train.

    Steps, in order:

    1. Look for an existing TensorFlow checkpoint in the ``ckpt`` directory
       next to ``options.vectors_path``. If one is found, ask interactively
       whether to start over ("0"), resume from the latest checkpoint ("-1")
       or resume from a specific checkpoint. Starting over deletes the
       existing checkpoint directory.
    2. Build the network, save the learning-rate settings to ``lr.info`` and
       report the full configuration to the logger and to ``embedding.info``.
    3. Set ``CUDA_VISIBLE_DEVICES`` and ``TF_CPP_MIN_LOG_LEVEL``, build the
       ``Walker`` and call ``train``.

    Args:
        options: run options; every attribute read below (``vectors_path``,
            ``learning_rate``, ``using_gpu``, ``log``, ...) must be present.

    Returns:
        None.
    """
    # check vectors and ckpt
    checkpoint = '0'
    train_vec_dir = os.path.split(options.vectors_path)[0]
    ckpt_dir = os.path.join(train_vec_dir, 'ckpt')
    ckpt = tf.train.get_checkpoint_state(ckpt_dir)
    if ckpt and ckpt.model_checkpoint_path:
        cur_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]
        logger.info(
            "model and vectors already exists, checkpoint step = {}".format(
                cur_step))
        # Strip surrounding whitespace so that e.g. "0 " is still recognised.
        checkpoint = input(
            "please input 0 to start a new train, or input a choosed ckpt to restore (-1 for latest ckpt)"
        ).strip()
    if checkpoint == '0':
        if ckpt:
            tf.gfile.DeleteRecursively(ckpt_dir)
        logger.info('start a new embedding train using tensorflow ...')
    elif checkpoint == '-1':
        logger.info(
            'restore a embedding train using tensorflow from latest ckpt ...')
    else:
        logger.info(
            'restore a embedding train using tensorflow from ckpt-%s ...' %
            checkpoint)
    # ckpt_dir lives inside train_vec_dir, so one call creates both; it also
    # works when vectors_path has no directory component.
    os.makedirs(ckpt_dir, exist_ok=True)

    # construct network
    net = network.construct_network(options, isHIN=True)

    lr_file = os.path.join(train_vec_dir, "lr.info")
    np.savetxt(lr_file,
               np.asarray([
                   options.learning_rate, options.decay_epochs,
                   options.decay_rate, options.iter_epoches
               ],
                          dtype=np.float32),
               fmt="%.6f")

    random_walker = "spacey"

    # Computed once and reused by both the log output and embedding.info.
    nodes_size = net.get_nodes_size()
    edges_size = np.size(np.array(net.edges, dtype=np.int32), axis=0)

    # train info
    logger.info('Train info:')
    logger.info('\t data_dir = {}'.format(options.data_dir))
    logger.info('\t data_name = {}'.format(options.data_name))
    logger.info('\t isdirected = {}\n'.format(options.isdirected))
    logger.info('\t train_model = {}'.format(options.model))
    logger.info('\t random_walker = {}'.format(random_walker))
    logger.info('\t walk_workers = {}'.format(options.walk_workers))
    logger.info('\t train_workers = {}\n'.format(options.train_workers))
    logger.info('\t walk_restart = {}'.format(options.walk_restart))
    logger.info('\t walk_times = {}'.format(options.walk_times))
    logger.info('\t walk_length = {}'.format(options.walk_length))
    logger.info('\t batch_size = {}'.format(options.batch_size))
    logger.info('\t history_position = {}\n'.format(options.history_position))
    logger.info('\t using_metapath = {}\n'.format(options.using_metapath))
    logger.info('\t metapath_path = {}\n'.format(options.metapath_path))
    logger.info('\t total embedding nodes = {}'.format(nodes_size))
    logger.info('\t total edges = {}'.format(edges_size))
    logger.info('\t embedding_size = {}'.format(options.embedding_size))
    logger.info('\t negative = {}'.format(options.negative))
    logger.info('\t distortion_power = {}'.format(options.distortion_power))
    logger.info('\t iter_epoches = {}'.format(options.iter_epoches))
    logger.info('\t init_learning_rate = {}'.format(options.learning_rate))
    logger.info('\t decay_epochs = {}'.format(options.decay_epochs))
    logger.info('\t decay_interval = {}'.format(options.decay_interval))
    logger.info('\t decay_rate = {}'.format(options.decay_rate))
    logger.info('\t loss_interval = {}s'.format(options.loss_interval))
    logger.info('\t summary_steps = {}'.format(options.summary_steps))
    logger.info('\t summary_interval = {}s'.format(options.summary_interval))
    logger.info('\t ckpt_epochs = {}'.format(options.ckpt_epochs))
    logger.info('\t ckpt_interval = {}s\n'.format(options.ckpt_interval))
    logger.info('\t using_gpu = {}'.format(options.using_gpu))
    logger.info('\t visible_device_list = {}'.format(
        options.visible_device_list))
    logger.info('\t log_device_placement = {}'.format(
        options.log_device_placement))
    logger.info('\t allow_soft_placement = {}'.format(
        options.allow_soft_placement))
    logger.info('\t gpu_memory_fraction = {}'.format(
        options.gpu_memory_fraction))
    logger.info('\t gpu_memory_allow_growth = {}'.format(options.allow_growth))

    logger.info('\t ckpt_dir = {}'.format(ckpt_dir))
    logger.info('\t vectors_path = {}'.format(options.vectors_path))
    logger.info('\t learning_rate_path = {}'.format(lr_file))

    # Format everything first, then write in one go inside a context manager
    # so the file handle is always closed, even if something raises.
    embedding_info = [
        'embedding info:\n',
        '\t data_dir = {}\n'.format(options.data_dir),
        '\t data_name = {}\n'.format(options.data_name),
        '\t isdirected = {}\n\n'.format(options.isdirected),
        '\t train_model = {}\n'.format(options.model),
        '\t random_walker = {}\n'.format(random_walker),
        '\t walk_workers = {}\n'.format(options.walk_workers),
        '\t train_workers = {}\n\n'.format(options.train_workers),
        '\t walk_restart = {}\n'.format(options.walk_restart),
        '\t walk_times = {}\n'.format(options.walk_times),
        '\t walk_length = {}\n'.format(options.walk_length),
        '\t batch_size = {}\n'.format(options.batch_size),
        '\t history_position = {}\n'.format(options.history_position),
        '\t using_metapath = {}\n'.format(options.using_metapath),
        '\t metapath_path = {}\n'.format(options.metapath_path),
        '\t total embedding nodes = {}\n'.format(nodes_size),
        '\t total edges = {}\n'.format(edges_size),
        '\t embedding size = {}\n'.format(options.embedding_size),
        '\t negative = {}\n'.format(options.negative),
        '\t distortion_power = {}\n\n'.format(options.distortion_power),
        '\t iter_epoches = {}\n'.format(options.iter_epoches),
        '\t init_learning_rate = {}\n'.format(options.learning_rate),
        '\t decay_epochs = {}\n'.format(options.decay_epochs),
        '\t decay_interval = {}\n'.format(options.decay_interval),
        '\t decay_rate = {}\n'.format(options.decay_rate),
        '\t loss_interval = {}s\n'.format(options.loss_interval),
        '\t summary_steps = {}\n'.format(options.summary_steps),
        '\t summary_interval = {}s\n'.format(options.summary_interval),
        '\t ckpt_epochs = {}\n'.format(options.ckpt_epochs),
        '\t ckpt_interval = {}s\n\n'.format(options.ckpt_interval),
        '\t using_gpu = {}\n'.format(options.using_gpu),
        '\t visible_device_list = {}\n'.format(options.visible_device_list),
        '\t log_device_placement = {}\n'.format(options.log_device_placement),
        '\t allow_soft_placement = {}\n'.format(options.allow_soft_placement),
        '\t gpu_memory_fraction = {}\n'.format(options.gpu_memory_fraction),
        '\t gpu_memory_allow_growth = {}\n'.format(options.allow_growth),
        '\t ckpt_dir = {}\n'.format(ckpt_dir),
        '\t vectors_path = {}\n'.format(options.vectors_path),
        '\t learning_rate_path = {}\n'.format(lr_file),
    ]
    with open(os.path.join(train_vec_dir, 'embedding.info'), 'w') as fr_vec:
        fr_vec.writelines(embedding_info)

    # os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    if options.using_gpu:
        # Comma-separated device ids; an empty list yields an empty string
        # instead of raising IndexError.
        os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(
            str(dev) for dev in options.visible_device_list)
    else:
        os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

    # set log_level for gpu: any level not listed here maps to '0'
    gpu_log_levels = {"CRITICAL": '3', "ERROR": '2', "WARNING": '1'}
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = gpu_log_levels.get(
        options.log.upper(), '0')

    if options.using_metapath == "metagraph":
        metagraph = network.construct_meta_graph(options.metapath_path,
                                                 isdirected=options.isdirected)
    elif options.using_metapath == "metatree":
        metagraph = network.construct_meta_tree(options.metapath_path,
                                                isdirected=True)
    else:
        metagraph = None

    walker = Walker(net,
                    random_walker=random_walker,
                    walk_length=options.walk_length,
                    walk_restart=options.walk_restart,
                    distortion_power=options.distortion_power,
                    neg_sampled=options.negative,
                    metagraph=metagraph,
                    using_metapath=options.using_metapath,
                    history_position=options.history_position)

    # train
    logger.info('training...')
    time_start = time.time()
    train(walker=walker,
          lr_file=lr_file,
          ckpt_dir=ckpt_dir,
          checkpoint=checkpoint,
          options=options)
    logger.info('train completed in {}s'.format(time.time() - time_start))
    return
Ejemplo n.º 5
0
def build_walk_corpus(options):
    """Walk from a sample of start nodes and let workers write the corpus.

    The function returns immediately when ``utils.check_rebuild`` reports
    that nothing has to be rebuilt. Otherwise it builds the network for
    ``options.model``, records the walk settings (logger and ``walks.info``),
    creates the module-level ``walker`` and, for each selected start node,
    distributes ``options.walk_times`` walks over a process pool running
    ``_construct_walk_corpus_and_write_singprocess``.

    Args:
        options: run options; reads ``corpus_store_path``, ``model``,
            ``walk_times``, ``walk_workers``, ``seed``, ``sample_size``,
            ``window_size`` and the other attributes reported below.

    Returns:
        None.

    Side effects:
        Sets and finally deletes the global ``walker``; exits the process via
        ``sys.exit()`` for an unknown model or metapath feature.
    """
    global walker

    # check walk info  and record
    if not utils.check_rebuild(options.corpus_store_path,
                               descrip='walk corpus',
                               always_rebuild=options.always_rebuild):
        return
    if options.model == "DeepWalk":
        random_walker = "uniform"
        net = network.construct_network(options, isHIN=False)
    elif options.model == "SpaceyWalk":
        random_walker = "spacey"
        net = network.construct_network(options, isHIN=True)
    elif options.model == "MetatreeWalk":
        random_walker = "metatreewalk"
        net = network.construct_network(options, isHIN=True)
    else:
        logger.error("Unknown model or it cann't build walk corpus: '%s'." %
                     options.model)
        sys.exit()

    logger.info('Corpus bulid: walk info:')
    logger.info('\t data_dir = {}'.format(options.data_dir))
    logger.info('\t data_name = {}'.format(options.data_name))
    logger.info('\t isdirected = {}\n'.format(options.isdirected))
    logger.info('\t random_walker = {}'.format(random_walker))
    logger.info('\t walk_times = {}'.format(options.walk_times))
    logger.info('\t walk_length = {}'.format(options.walk_length))
    logger.info('\t max_walk_workers = {}'.format(options.walk_workers))
    logger.info('\t walk_to_memory = {}'.format(options.walk_to_memory))
    logger.info('\t seed = {}'.format(options.seed))
    logger.info('\t alpha = {}'.format(options.alpha))
    logger.info('\t window_size = {}'.format(options.window_size))
    logger.info('\t sample_size = {}'.format(options.sample_size))
    if options.walk_to_memory:
        logger.info('\t donot store corpus = {}'.format(
            str(options.not_store_corpus)))
        if not options.not_store_corpus:
            logger.info('\t corpus store path = {}'.format(
                options.corpus_store_path))
    else:
        logger.info('\t corpus store path = {}'.format(
            options.corpus_store_path))

    # The directory must exist before walks.info is written into it, so it is
    # created here rather than after the file has been opened.
    corpus_store_dir = os.path.split(options.corpus_store_path)[0]
    if corpus_store_dir:
        os.makedirs(corpus_store_dir, exist_ok=True)

    walks_info = [
        'Corpus walk info:\n',
        '\t data_dir = {}\n'.format(options.data_dir),
        '\t data_name = {}\n'.format(options.data_name),
        '\t isdirected = {}\n\n'.format(options.isdirected),
        '\t random_walker = {}\n'.format(random_walker),
        '\t walk times = {}\n'.format(options.walk_times),
        '\t walk length = {}\n'.format(options.walk_length),
        '\t max walk workers = {}\n'.format(options.walk_workers),
        '\t seed = {}\n'.format(options.seed),
        '\t alpha = {}\n'.format(options.alpha),
        '\t window_size = {}\n'.format(options.window_size),
        '\t sample_size = {}\n'.format(options.sample_size),
        '\t walk to memory = {}\n'.format(str(options.walk_to_memory)),
    ]
    if options.walk_to_memory:
        walks_info.append('\t donot store corpus = {}\n'.format(
            str(options.not_store_corpus)))
        if not options.not_store_corpus:
            walks_info.append('\t corpus store path = {}\n'.format(
                options.corpus_store_path))
    else:
        walks_info.append('\t corpus store path = {}\n'.format(
            options.corpus_store_path))
    # Context manager guarantees the handle is closed even on a write error.
    with open(os.path.join(corpus_store_dir, 'walks.info'), 'w') as fr_walks:
        fr_walks.writelines(walks_info)

    if options.model == "DeepWalk":
        # Without this branch "DeepWalk" passes the model check above but
        # leaves the global walker undefined for the code below.
        walker = Walker(net,
                        random_walker=random_walker,
                        walk_length=options.walk_length)
    elif options.model == "SpaceyWalk":
        if options.using_metapath == "metagraph":
            metagraph = network.construct_meta_graph(
                options.metapath_path, isdirected=options.isdirected)
        elif options.using_metapath == "metatree":
            metagraph = network.construct_meta_tree(options.metapath_path,
                                                    isdirected=True)
        elif options.using_metapath == "metaschema":
            metagraph = None
        else:
            logger.error("Unknown feature : '%s'." % options.using_metapath)
            sys.exit()
        walker = Walker(net,
                        random_walker=random_walker,
                        walk_length=options.walk_length,
                        metagraph=metagraph,
                        using_metapath=options.using_metapath,
                        history_position=options.history_position,
                        task="walk",
                        alpha=options.alpha)
    elif options.model == "MetatreeWalk":
        metagraph = network.construct_meta_tree(options.metapath_path,
                                                isdirected=True)
        walker = Walker(net,
                        random_walker=random_walker,
                        walk_length=options.walk_length,
                        metagraph=metagraph,
                        task="walk")

    logger.info(
        'Corpus bulid: walking and computing (using %d workers for multi-process)...'
        % options.walk_workers)
    time_start = time.time()

    # Split walk_times as evenly as possible over at most walk_workers tasks.
    if options.walk_times <= options.walk_workers:
        times_per_worker = [1 for _ in range(options.walk_times)]
    else:
        div, mod = divmod(options.walk_times, options.walk_workers)
        times_per_worker = [div for _ in range(options.walk_workers)]
        for idx in range(mod):
            times_per_worker[idx] = times_per_worker[idx] + 1
    assert sum(
        times_per_worker
    ) == options.walk_times, 'workers allocating failed: %d != %d' % (
        sum(times_per_worker), options.walk_times)

    # Seeded shuffle, so the sample is reproducible for a given options.seed.
    nodes_total = list(range(walker.nodes_size))
    sp_random = random.Random(options.seed)
    sp_random.shuffle(nodes_total)
    # NOTE(review): the four hard-coded node ids are always walked first, in
    # this order. They are not checked against walker.nodes_size and may also
    # appear in the sample - confirm that this is intended.
    nodes_total = [8798, 8354, 9891, 8407] + nodes_total[0:options.sample_size]
    for node in nodes_total:
        args_list = []
        begin = 0
        for cnt in times_per_worker:
            args_list.append((corpus_store_dir, node, begin + 1, begin + cnt,
                              options.window_size))
            begin += cnt
        with ProcessPoolExecutor(max_workers=options.walk_workers) as executor:
            futures = [
                executor.submit(_construct_walk_corpus_and_write_singprocess,
                                args) for args in args_list
            ]
        # Leaving the with-block waits for every task of this node. Report
        # worker failures instead of dropping them silently.
        for args, future in zip(args_list, futures):
            error = future.exception()
            if error is not None:
                logger.error('Corpus bulid: walk task %r failed: %r' %
                             (args, error))
    logger.info('Corpus bulid: walk completed in {}s'.format(time.time() -
                                                             time_start))
    del walker
    gc.collect()
    return
Ejemplo n.º 6
0
def build_walk_corpus(options):
    """Build the random-walk corpus in memory or as files.

    The function returns immediately when ``utils.check_rebuild`` reports
    that nothing has to be rebuilt. Otherwise it builds the network for
    ``options.model``, records the walk settings (logger and ``walks.info``),
    creates the module-level ``walker`` and generates the corpus:

    * ``options.walk_to_memory`` set: the corpus is built in memory and, unless
      ``options.not_store_corpus`` is set, also stored at
      ``options.corpus_store_path``.
    * otherwise: the walks are written to files; the corpus is only loaded
      when "train" is part of ``options.task``.

    Args:
        options: run options; reads ``corpus_store_path``, ``model``,
            ``walk_times``, ``walk_workers``, ``walk_to_memory`` and the other
            attributes used below.

    Returns:
        The walk corpus, or None when no rebuild is needed or when the walks
        were written to files without "train" being in ``options.task``.

    Side effects:
        Sets and finally deletes the global ``walker``; exits the process via
        ``sys.exit()`` for an unknown model or metapath feature.
    """
    global walker

    # check walk info  and record
    if not utils.check_rebuild(options.corpus_store_path,
                               descrip='walk corpus',
                               always_rebuild=options.always_rebuild):
        return
    if options.model == "DeepWalk":
        random_walker = "uniform"
        net = network.construct_network(options, isHIN=False)
    elif options.model == "SpaceyWalk":
        random_walker = "spacey"
        net = network.construct_network(options, isHIN=True)
    elif options.model == "MetatreeWalk":
        random_walker = "metatreewalk"
        net = network.construct_network(options, isHIN=True)
    else:
        logger.error("Unknown model or it cann't build walk corpus: '%s'." %
                     options.model)
        sys.exit()

    logger.info('Corpus bulid: walk info:')
    logger.info('\t data_dir = {}'.format(options.data_dir))
    logger.info('\t data_name = {}'.format(options.data_name))
    logger.info('\t isdirected = {}\n'.format(options.isdirected))
    logger.info('\t random_walker = {}'.format(random_walker))
    logger.info('\t walk_times = {}'.format(options.walk_times))
    logger.info('\t walk_length = {}'.format(options.walk_length))
    logger.info('\t max_walk_workers = {}'.format(options.walk_workers))
    logger.info('\t walk_to_memory = {}'.format(options.walk_to_memory))
    logger.info('\t alpha = {}'.format(options.alpha))
    if options.walk_to_memory:
        logger.info('\t donot store corpus = {}'.format(
            str(options.not_store_corpus)))
        if not options.not_store_corpus:
            logger.info('\t corpus store path = {}'.format(
                options.corpus_store_path))
    else:
        logger.info('\t corpus store path = {}'.format(
            options.corpus_store_path))

    # walks.info is written next to the corpus; make sure that directory
    # exists before opening the file.
    walks_info_dir = os.path.split(options.corpus_store_path)[0]
    if walks_info_dir:
        os.makedirs(walks_info_dir, exist_ok=True)

    walks_info = [
        'Corpus walk info:\n',
        '\t data_dir = {}\n'.format(options.data_dir),
        '\t data_name = {}\n'.format(options.data_name),
        '\t isdirected = {}\n\n'.format(options.isdirected),
        '\t random_walker = {}\n'.format(random_walker),
        '\t walk times = {}\n'.format(options.walk_times),
        '\t walk length = {}\n'.format(options.walk_length),
        '\t max walk workers = {}\n'.format(options.walk_workers),
        '\t walk to memory = {}\n'.format(str(options.walk_to_memory)),
    ]
    if options.walk_to_memory:
        walks_info.append('\t donot store corpus = {}\n'.format(
            str(options.not_store_corpus)))
        if not options.not_store_corpus:
            walks_info.append('\t corpus store path = {}\n'.format(
                options.corpus_store_path))
    else:
        walks_info.append('\t corpus store path = {}\n'.format(
            options.corpus_store_path))
    # Context manager guarantees the handle is closed even on a write error.
    with open(os.path.join(walks_info_dir, 'walks.info'), 'w') as fr_walks:
        fr_walks.writelines(walks_info)

    if options.model == "DeepWalk":
        walker = Walker(net,
                        random_walker=random_walker,
                        walk_length=options.walk_length)
    elif options.model == "SpaceyWalk":
        if options.using_metapath == "metagraph":
            metagraph = network.construct_meta_graph(
                options.metapath_path, isdirected=options.isdirected)
        elif options.using_metapath == "metatree":
            metagraph = network.construct_meta_tree(options.metapath_path,
                                                    isdirected=True)
        elif options.using_metapath == "metaschema":
            metagraph = None
        else:
            logger.error("Unknown feature : '%s'." % options.using_metapath)
            sys.exit()
        walker = Walker(net,
                        random_walker=random_walker,
                        walk_length=options.walk_length,
                        metagraph=metagraph,
                        using_metapath=options.using_metapath,
                        history_position=options.history_position,
                        task="walk",
                        alpha=options.alpha)
    elif options.model == "MetatreeWalk":
        metagraph = network.construct_meta_tree(options.metapath_path,
                                                isdirected=True)
        walker = Walker(net,
                        random_walker=random_walker,
                        walk_length=options.walk_length,
                        metagraph=metagraph,
                        task="walk")

    walk_corpus = None
    if options.walk_to_memory:
        walk_corpus = build_walk_corpus_to_memory(
            options.walk_times, max_num_workers=options.walk_workers)
        if not options.not_store_corpus:
            store_walk_corpus(options.corpus_store_path,
                              walk_corpus,
                              always_rebuild=options.always_rebuild)
    else:
        # walk to files
        walk_files = build_walk_corpus_to_files(
            options.corpus_store_path,
            options.walk_times,
            headflag_of_index_file=options.headflag_of_index_file,
            max_num_workers=options.walk_workers,
            always_rebuild=options.always_rebuild)
        # The corpus is only materialised when a training task follows.
        if "train" in options.task:
            if options.load_from_memory:
                walk_corpus = load_walks_corpus(walk_files)
            else:
                walk_corpus = WalksCorpus(walk_files)
    del walker
    gc.collect()
    return walk_corpus