Example #1
def flow_based_algorithm(server_types, demand_profile, use_flow=True):
    '''
    >>> from set_cover_reduction import construct_server_types, construct_demand_profile
    >>> number_of_elements = 3
    >>> sets = [[1, 2], [1, 3], [2, 3]]
    >>> server_types = construct_server_types(number_of_elements, sets)
    >>> demand_profile = construct_demand_profile(number_of_elements)
    >>> schedule, fractional_cost = flow_based_algorithm(server_types, demand_profile)
    >>> schedule.total_energy
    167.0045222223457
    '''
    network = construct_network(server_types, demand_profile)
    tau = len(server_types)
    if use_flow:
        solver = MinimumCostTwoCommodityFlowSolver(network)
        solver.silent()
        fractional_cost = solver.solve()
        d = {
            # scale up the flow of commodity 1
            (i, k): min(int(tau * network.edges[(u(i, k), u(i, k + 1))].flow1),
                        server_types[i].m)
            for k in demand_profile for i in server_types
        }
    else:
        solver = DPMSolver(server_types, demand_profile, variable_type='C')
        solver.silent()
        fractional_cost = solver.solve()
        solution = solver.solution
        d = {(i, k): min(int(tau * solution[x(i, 0, k)]), server_types[i].m)
             for k in demand_profile for i in server_types}
    schedule = construct_schedule(server_types, demand_profile, d)
    return schedule, fractional_cost
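
The rounding step above scales each fractional flow value by tau = len(server_types), truncates it, and caps it at the server type's capacity m. A minimal sketch of just that rule, with hypothetical numbers:

tau = 3      # number of server types
flow1 = 0.7  # fractional number of active servers of one type at slot k
m = 2        # capacity (maximum machine count) of that server type

d_ik = min(int(tau * flow1), m)  # the rounding rule from the dict comprehension
print(d_ik)  # -> 2, since int(3 * 0.7) = 2 and 2 <= m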
Example #2
def split_network(origin_net_dir, train_net_dir, eval_net_dir, data_prefixname,
                  data_format, isdirected, train_ratio):
    net_origin = network.construct_network(
        data_path=os.path.join(origin_net_dir,
                               data_prefixname + "." + data_format),
        data_format=data_format,
        net_info_path=os.path.join(origin_net_dir, "net.info"),
        isdirected=isdirected)
    net_train, net_eval = net_origin.split_by_edges(train_ratio=train_ratio)
    net_train.save_network(train_net_dir, data_prefixname, data_format)
    # net_train.print_net_info(edges_file=os.path.join(train_net_dir, datafilename), file_path=os.path.join(train_net_dir, "net.info"))
    net_eval.save_network(eval_net_dir, data_prefixname, data_format)
    # net_eval.print_net_info(edges_file=os.path.join(eval_net_dir, datafilename), file_path=os.path.join(eval_net_dir, "net.info"))
    del net_origin, net_train, net_eval
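
A minimal usage sketch for split_network; the directory layout, prefix name, and edgelist format below are hypothetical, not taken from the example:

split_network(origin_net_dir="data/blogcatalog",
              train_net_dir="data/blogcatalog/train",
              eval_net_dir="data/blogcatalog/eval",
              data_prefixname="blogcatalog",
              data_format="edgelist",
              isdirected=False,
              train_ratio=0.8)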
Example #3
def train_vectors(options):
    if not utils.check_rebuild(options.vectors_path,
                               descrip='vectors',
                               always_rebuild=options.always_rebuild):
        return
    train_vec_dir = os.path.split(options.vectors_path)[0]
    if not os.path.exists(train_vec_dir):
        os.makedirs(train_vec_dir)

    # construct network
    net = network.construct_network(options)

    Kstep = 2

    # train info
    logger.info('Train info:')
    logger.info('\t train_model = {}'.format(options.model))
    logger.info('\t total embedding nodes = {}'.format(net.get_nodes_size()))
    logger.info('\t total edges = {}'.format(net.get_edges_size()))
    logger.info('\t embedding size = {}'.format(options.embedding_size))
    logger.info('\t Kstep = {}'.format(Kstep))
    logger.info('\t vectors_path = {}'.format(options.vectors_path))

    fr_vec = open(os.path.join(train_vec_dir, 'embedding.info'), 'w')
    fr_vec.write('embedding info:\n')
    fr_vec.write('\t train_model = {}\n'.format(options.model))
    fr_vec.write('\t total embedding nodes = {}\n'.format(
        net.get_nodes_size()))
    fr_vec.write('\t total edges = {}\n'.format(net.get_edges_size()))
    fr_vec.write('\t embedding size = {}\n'.format(options.embedding_size))
    fr_vec.write('\t Kstep = {}\n'.format(Kstep))
    fr_vec.write('\t vectors_path = {}\n'.format(options.vectors_path))
    fr_vec.close()

    # train
    logger.info('training...')
    time_start = time.time()
    grarep = GraRep(net.get_nodes_size(), net.edges, options.embedding_size,
                    Kstep)
    vecs = grarep.train()
    save_word2vec_format(options.vectors_path, vecs, net._idx_nodes)
    logger.info('train completed in {}s'.format(time.time() - time_start))
    return
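
train_vectors only reads attributes off options, so a plain namespace is enough to drive it; a hedged sketch with hypothetical values (network.construct_network(options) may read further attributes not shown here):

from argparse import Namespace

options = Namespace(vectors_path="output/grarep.vectors",
                    always_rebuild=True,
                    model="GraRep",
                    embedding_size=128)
# train_vectors(options)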
Example #4
def process(options):
    logger.info("Data preprocessing: network serialization ...")
    time_start = time.time()
    source_data_dir, data_filename = os.path.split(options.data_path)
    data_prefixname = data_filename.split(".")[0]
    data_format = options.data_format
    isdirected = options.isdirected
    target_data_dir = options.target_data_dir

    logger.info("\t source_data_dir = {}".format(source_data_dir))
    logger.info("\t data_filename = {}".format(data_filename))
    logger.info("\t data_format = {}".format(data_format))
    logger.info("\t isdirected = {}".format(isdirected))
    logger.info("\n\t target_data_dir = {}".format(target_data_dir))

    net = network.construct_network(data_path=options.data_path,
                                    data_format=data_format,
                                    net_info_path=os.path.join(source_data_dir, "net.info"),
                                    isdirected=isdirected,
                                    print_net_info=True)
    net.make_consistent(remove_isolated=not options.keep_isolated)
    target_data_format = ["adjlist", "edgelist"]
    for save_format in target_data_format:
        net.save_network(target_data_dir, data_prefixname, save_format)

    # for label
    source_data_path = os.path.join(source_data_dir, data_prefixname + ".labels")
    target_data_path = os.path.join(target_data_dir, data_prefixname + ".labels")
    if os.path.exists(source_data_path):
        with open(target_data_path, "w") as fr:
            for line in open(source_data_path):
                line = line.strip()
                if line:
                    linelist = line.split('\t')
                    source_id = int(linelist[0].strip())
                    if source_id in net._nodes_id:
                        fr.write("{}".format(net._nodes_id[source_id]))
                        for label in linelist[1:]:
                            fr.write("\t{}".format(label.strip()))
                        fr.write("\n")
    logger.info('Data preprocessing: network serialization completed in {}s.'.format(time.time() - time_start))
Example #5
def eval(options):
    if "all" not in options.metapath_path:
        metatree = network.construct_meta_tree(
            metapaths_filename=options.metapath_path)
        flag0 = False
        flag1 = False
        # check that both node types of the eval edge appear in the meta tree
        for each in metatree.nodes():
            if options.eval_edge_type[0] == metatree.nodes[each]["type"]:
                flag0 = True
            if options.eval_edge_type[1] == metatree.nodes[each]["type"]:
                flag1 = True
        if not (flag0 and flag1):
            return (flag0 and flag1)
    flag0 = True
    flag1 = True

    net = network.construct_network(options, isHIN=True, print_net_info=False)
    if options.eval_online:
        eval_online(options, net)
    else:
        eval_once(options, net)

    return (flag0 and flag1)
Example #6
def train_vectors(options):
    # check vectors and ckpt
    checkpoint = '0'
    train_vec_dir = os.path.split(options.vectors_path)[0]
    ckpt_dir = os.path.join(train_vec_dir, 'ckpt')
    ckpt = tf.train.get_checkpoint_state(ckpt_dir)
    if ckpt and ckpt.model_checkpoint_path:
        cur_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]
        logger.info(
            "model and vectors already exists, checkpoint step = {}".format(
                cur_step))
        checkpoint = input(
            "please input 0 to start a new training, or input a chosen ckpt to restore (-1 for the latest ckpt)"
        )
    if checkpoint == '0':
        if ckpt:
            tf.gfile.DeleteRecursively(ckpt_dir)
        logger.info('starting a new embedding training using tensorflow ...')
    elif checkpoint == '-1':
        logger.info(
            'resuming embedding training using tensorflow from the latest ckpt...')
    else:
        logger.info(
            'resuming embedding training using tensorflow from ckpt-%s...' %
            checkpoint)
    if not os.path.exists(train_vec_dir):
        os.makedirs(train_vec_dir)
    if not os.path.exists(ckpt_dir):
        os.makedirs(ckpt_dir)

    # construct network
    net = network.construct_network(options)

    lr_file = os.path.join(train_vec_dir, "lr.info")
    np.savetxt(lr_file,
               np.asarray([
                   options.learning_rate, options.decay_epochs,
                   options.decay_rate, options.iter_epoches
               ],
                          dtype=np.float32),
               fmt="%.6f")

    # train info
    logger.info('Train info:')
    logger.info('\t train_model = {}'.format(options.model))
    logger.info('\t order = {}'.format(options.order))
    logger.info('\t total embedding nodes = {}'.format(net.get_nodes_size()))
    logger.info('\t total edges = {}'.format(net.get_edges_size()))
    logger.info('\t embedding size = {}'.format(options.embedding_size))
    logger.info('\t negative = {}'.format(options.negative))
    logger.info('\t distortion_power = {}\n'.format(options.distortion_power))
    logger.info('\t batch_size = {}'.format(options.batch_size))
    logger.info('\t iter_epoches = {}'.format(options.iter_epoches))
    logger.info('\t init_learning_rate = {}'.format(options.learning_rate))
    logger.info('\t decay_epochs = {}'.format(options.decay_epochs))
    logger.info('\t decay_interval = {}'.format(options.decay_interval))
    logger.info('\t decay_rate = {}'.format(options.decay_rate))
    logger.info('\t loss_interval = {}s'.format(options.loss_interval))
    logger.info('\t summary_steps = {}'.format(options.summary_steps))
    logger.info('\t summary_interval = {}s'.format(options.summary_interval))
    logger.info('\t ckpt_epochs = {}'.format(options.ckpt_epochs))
    logger.info('\t ckpt_interval = {}s\n'.format(options.ckpt_interval))
    logger.info('\t using_gpu = {}'.format(options.using_gpu))
    logger.info('\t visible_device_list = {}'.format(
        options.visible_device_list))
    logger.info('\t log_device_placement = {}'.format(
        options.log_device_placement))
    logger.info('\t allow_soft_placement = {}'.format(
        options.allow_soft_placement))
    logger.info('\t gpu_memory_fraction = {}'.format(
        options.gpu_memory_fraction))
    logger.info('\t gpu_memory_allow_growth = {}'.format(options.allow_growth))
    logger.info('\t train_workers = {}\n'.format(options.train_workers))

    logger.info('\t ckpt_dir = {}'.format(ckpt_dir))
    logger.info('\t vectors_path = {}'.format(options.vectors_path))
    logger.info('\t learning_rate_path = {}'.format(lr_file))

    fr_vec = open(os.path.join(train_vec_dir, 'embedding.info'), 'w')
    fr_vec.write('embedding info:\n')
    fr_vec.write('\t train_model = {}\n'.format(options.model))
    fr_vec.write('\t order = {}\n'.format(options.order))
    fr_vec.write('\t total embedding nodes = {}\n'.format(
        net.get_nodes_size()))
    fr_vec.write('\t total edges = {}\n'.format(net.get_edges_size()))
    fr_vec.write('\t embedding size = {}\n'.format(options.embedding_size))
    fr_vec.write('\t negative = {}\n'.format(options.negative))
    fr_vec.write('\t distortion_power = {}\n\n'.format(
        options.distortion_power))
    fr_vec.write('\t batch_size = {}\n'.format(options.batch_size))
    fr_vec.write('\t iter_epoches = {}\n'.format(options.iter_epoches))
    fr_vec.write('\t init_learning_rate = {}\n'.format(options.learning_rate))
    fr_vec.write('\t decay_epochs = {}\n'.format(options.decay_epochs))
    fr_vec.write('\t decay_interval = {}\n'.format(options.decay_interval))
    fr_vec.write('\t decay_rate = {}\n'.format(options.decay_rate))
    fr_vec.write('\t loss_interval = {}s\n'.format(options.loss_interval))
    fr_vec.write('\t summary_steps = {}\n'.format(options.summary_steps))
    fr_vec.write('\t summary_interval = {}s\n'.format(
        options.summary_interval))
    fr_vec.write('\t ckpt_epochs = {}\n'.format(options.ckpt_epochs))
    fr_vec.write('\t ckpt_interval = {}s\n\n'.format(options.ckpt_interval))
    fr_vec.write('\t using_gpu = {}\n'.format(options.using_gpu))
    fr_vec.write('\t visible_device_list = {}\n'.format(
        options.visible_device_list))
    fr_vec.write('\t log_device_placement = {}\n'.format(
        options.log_device_placement))
    fr_vec.write('\t allow_soft_placement = {}\n'.format(
        options.allow_soft_placement))
    fr_vec.write('\t gpu_memory_fraction = {}\n'.format(
        options.gpu_memory_fraction))
    fr_vec.write('\t gpu_memory_allow_growth = {}\n'.format(
        options.allow_growth))
    fr_vec.write('\t train_workers = {}\n\n'.format(options.train_workers))

    fr_vec.write('\t ckpt_dir = {}\n'.format(ckpt_dir))
    fr_vec.write('\t vectors_path = {}\n'.format(options.vectors_path))
    fr_vec.write('\t learning_rate_path = {}\n'.format(lr_file))

    fr_vec.close()

    # os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    if options.using_gpu:
        # expose only the requested GPU ids to TensorFlow
        os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(
            str(dev) for dev in options.visible_device_list)
    else:
        os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

    # map the console log level to TF_CPP_MIN_LOG_LEVEL (higher = quieter)
    console_log_level = options.log.upper()
    if console_log_level == "CRITICAL":
        gpu_log = '3'
    elif console_log_level == "ERROR":
        gpu_log = '2'
    elif console_log_level == "WARNING":
        gpu_log = '1'
    else:
        gpu_log = '0'
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = gpu_log

    # train
    logger.info('training...')
    time_start = time.time()
    train(net=net,
          vectors_path=options.vectors_path,
          lr_file=lr_file,
          ckpt_dir=ckpt_dir,
          checkpoint=checkpoint,
          order=options.order,
          embedding_size=options.embedding_size,
          neg_sampled=options.negative,
          batch_size=options.batch_size,
          distortion_power=options.distortion_power,
          initial_learning_rate=options.learning_rate,
          decay_epochs=options.decay_epochs,
          decay_rate=options.decay_rate,
          iter_epochs=options.iter_epoches,
          allow_soft_placement=options.allow_soft_placement,
          log_device_placement=options.log_device_placement,
          gpu_memory_fraction=options.gpu_memory_fraction,
          using_gpu=options.using_gpu,
          allow_growth=options.allow_growth,
          loss_interval=options.loss_interval,
          summary_steps=options.summary_steps,
          ckpt_interval=options.ckpt_interval,
          ckpt_epochs=options.ckpt_epochs,
          summary_interval=options.summary_interval,
          decay_interval=options.decay_interval,
          train_workers=options.train_workers)
    logger.info('train completed in {}s'.format(time.time() - time_start))
    return
Example #7
def eval_once(options):
    global features_matrix, net_eval, net_except, SAMPLE_NODES, SAMPLE_RULE, METIRC, PREC_K
    if not utils.check_rebuild(options.link_prediction_path,
                               descrip='link_prediction',
                               always_rebuild=options.always_rebuild):
        return

    logger.info('eval case: link-prediction ...')
    logger.info('\t save_path: {}'.format(options.link_prediction_path))
    logger.info('\t eval_data_path: {}'.format(options.eval_data_path))
    logger.info('\t except_data_path: {}'.format(options.except_data_path))
    logger.info('\t data_format: {}'.format(options.data_format))
    logger.info('\t metrics: MAP and precision@K')
    logger.info('\t max_index for precision@K: {}'.format(
        options.precK_max_index))
    logger.info('\t similarity_metric: {}'.format(options.similarity_metric))
    logger.info('\t eval_online: {}'.format(options.eval_online))
    logger.info('\t eval_interval: {}s'.format(options.eval_interval))
    logger.info('\t sample_nodes: {}'.format(options.sample_nodes))
    logger.info('\t sample_nodes_rule: {}'.format(options.sample_nodes_rule))
    logger.info('\t repeat {} times'.format(options.repeated_times))
    logger.info('\t eval_workers: {}'.format(options.eval_workers))

    logger.info("constructing eval network ...")
    net_eval = network.construct_network(data_path=options.eval_data_path,
                                         data_format=options.data_format,
                                         print_net_info=False,
                                         isdirected=options.isdirected)
    eval_net_nodes_size = net_eval.get_nodes_size()
    eval_net_edges_size = net_eval.get_edges_size()
    logger.info("eval_net_nodes_size = {}".format(eval_net_nodes_size))
    logger.info("eval_net_edges_size = {}".format(eval_net_edges_size))

    logger.info("constructing except(train) network ...")
    net_except = network.construct_network(data_path=options.except_data_path,
                                           data_format=options.data_format,
                                           print_net_info=False,
                                           isdirected=options.isdirected)
    except_net_nodes_size = net_except.get_nodes_size()
    except_net_edges_size = net_except.get_edges_size()
    logger.info("except_net_nodes_size = {}".format(except_net_nodes_size))
    logger.info("except_net_edges_size = {}".format(except_net_edges_size))

    id_list = list(range(eval_net_nodes_size))  # must be [0,1,2,3,...]
    SAMPLE_NODES = options.sample_nodes
    SAMPLE_RULE = options.sample_nodes_rule
    METIRC = options.similarity_metric
    PREC_K = options.precK_max_index

    # loading features_matrix(already trained)
    logger.info('\t reading embedding vectors from file {}'.format(
        options.vectors_path))
    time_start = time.time()
    features_matrix = utils.get_vectors(
        utils.get_KeyedVectors(options.vectors_path), id_list)
    logger.info(
        '\t reading embedding vectors completed in {}s'.format(time.time() -
                                                               time_start))
    logger.info('total loaded nodes: {}'.format(
        np.size(features_matrix, axis=0)))
    logger.info('the embedding dimension: {}'.format(
        np.size(features_matrix, axis=1)))

    fr = open(options.link_prediction_path, 'w')
    fr.write('eval case: link-prediction ...\n')
    fr.write('\t save_path: {}\n'.format(options.link_prediction_path))
    fr.write('\t eval_data_path: {}\n'.format(options.eval_data_path))
    fr.write('\t except_data_path: {}\n'.format(options.except_data_path))
    fr.write('\t data_format: {}\n'.format(options.data_format))
    fr.write('\t metrics: MAP and precision@K\n')
    fr.write('\t max_index for precision@K: {}\n'.format(
        options.precK_max_index))
    fr.write('\t similarity_metric: {}\n'.format(options.similarity_metric))
    fr.write('\t eval_online: {}\n'.format(options.eval_online))
    fr.write('\t eval_interval: {}s\n'.format(options.eval_interval))
    fr.write('\t sample_nodes: {}\n'.format(options.sample_nodes))
    fr.write('\t sample_nodes_rule: {}\n'.format(options.sample_nodes_rule))
    fr.write('\t repeat {} times\n'.format(options.repeated_times))
    fr.write('\t eval_workers: {}\n'.format(options.eval_workers))
    fr.write("eval_net_nodes_size = {}\n".format(eval_net_nodes_size))
    fr.write("eval_net_edges_size = {}\n".format(eval_net_edges_size))
    fr.write("except_net_nodes_size = {}\n".format(except_net_nodes_size))
    fr.write("except_net_edges_size = {}\n".format(except_net_edges_size))
    fr.write('total loaded nodes: {}\n'.format(np.size(features_matrix,
                                                       axis=0)))
    fr.write('the embedding dimension: {}\n'.format(
        np.size(features_matrix, axis=1)))

    if options.sample_nodes > 0:
        if options.eval_workers > 1 and options.repeated_times > 1:
            # speed up by using multi-process
            logger.info("\t allocating repeat_times to workers ...")
            if options.repeated_times <= options.eval_workers:
                times_per_worker = [1 for _ in range(options.repeated_times)]
            else:
                div, mod = divmod(options.repeated_times, options.eval_workers)
                times_per_worker = [div for _ in range(options.eval_workers)]
                for idx in range(mod):
                    times_per_worker[idx] = times_per_worker[idx] + 1
            assert sum(times_per_worker) == options.repeated_times, \
                'worker allocation failed: %d != %d' % (
                    sum(times_per_worker), options.repeated_times)

            logger.info("\t using {} processes for evaluating:".format(
                len(times_per_worker)))
            for idx, rep_times in enumerate(times_per_worker):
                logger.info("\t process-{}: repeat {} times".format(
                    idx, rep_times))

            ret_list = []  # [[MAP, precisionK_list], ... ]
            with ProcessPoolExecutor(
                    max_workers=options.eval_workers) as executor:
                for ret in executor.map(_sample_thread_body, times_per_worker):
                    ret_list.extend(ret)
            if len(ret_list) != options.repeated_times:
                logger.warning(
                    "warning: eval repeated_times mismatch: {} != {}".format(
                        len(ret_list), options.repeated_times))
        else:
            ret_list = _sample_thread_body(options.repeated_times)
    else:
        # no sampling, no repeat!
        ret_list = [_eval(net_eval, net_except)]  # [[MAP, precisionK_list]]

    if options.sample_nodes > 0:
        fr.write(
            'expected repeated_times: {}, actual repeated_times: {}, mean results as follows:\n'
            .format(options.repeated_times, len(ret_list)))
    else:
        fr.write(
            'because sample_nodes = {}, actual repeated_times = {}; results as follows:\n'
            .format(options.sample_nodes, len(ret_list)))

    mean_MAP = np.mean([ret[0] for ret in ret_list])
    mean_precisionK = np.mean([ret[1] for ret in ret_list], axis=0)

    fr.write('\t\t MAP = {}\n'.format(mean_MAP))
    for k in range(options.precK_max_index):
        if k < len(mean_precisionK):
            fr.write('\t\t precisionK_{} = {}\n'.format(
                k + 1, mean_precisionK[k]))
        else:
            fr.write('\t\t precisionK_{} = None\n'.format(k + 1))
    fr.write('details:\n')
    for repeat in range(len(ret_list)):
        fr.write('\t repeated {}/{}:\n'.format(repeat + 1, len(ret_list)))
        MAP = ret_list[repeat][0]
        precisionK_list = ret_list[repeat][1]
        fr.write('\t\t MAP = {}\n'.format(MAP))
        for k in range(options.precK_max_index):
            if k < len(precisionK_list):
                fr.write('\t\t precisionK_{} = {}\n'.format(
                    k + 1, precisionK_list[k]))
            else:
                fr.write('\t\t precisionK_{} = None\n'.format(k + 1))

    fr.write(
        '\neval case: link_prediction completed in {}s.'.format(time.time() -
                                                                time_start))
    fr.close()
    logger.info(
        'eval case: link_prediction completed in {}s.'.format(time.time() -
                                                              time_start))

    return
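
Both eval_once above and eval_online below spread repeated_times across eval_workers with the same divmod pattern; a standalone sketch of that allocation (allocate is a hypothetical helper, values made up):

def allocate(total_times, workers):
    # each worker gets floor(total/workers) repeats; the remainder is
    # handed out one at a time to the first `mod` workers
    div, mod = divmod(total_times, workers)
    times_per_worker = [div] * workers
    for idx in range(mod):
        times_per_worker[idx] += 1
    return times_per_worker

print(allocate(10, 4))  # -> [3, 3, 2, 2]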
Example #8
def eval_online(options):
    global features_matrix, net_eval, net_except, SAMPLE_NODES, SAMPLE_RULE, METIRC, PREC_K
    link_prediction_dir = os.path.split(options.link_prediction_path)[0]
    if not utils.check_rebuild(link_prediction_dir,
                               descrip='link_prediction',
                               always_rebuild=options.always_rebuild):
        return
    if not os.path.exists(link_prediction_dir):
        os.makedirs(link_prediction_dir)

    logger.info('eval case: link-prediction ...')
    logger.info('\t save_path: {}'.format(options.link_prediction_path))
    logger.info('\t eval_data_path: {}'.format(options.eval_data_path))
    logger.info('\t except_data_path: {}'.format(options.except_data_path))
    logger.info('\t data_format: {}'.format(options.data_format))
    logger.info('\t metrics: MAP and precision@K')
    logger.info('\t max_index for precision@K: {}'.format(
        options.precK_max_index))
    logger.info('\t similarity_metric: {}'.format(options.similarity_metric))
    logger.info('\t eval_online: {}'.format(options.eval_online))
    logger.info('\t eval_interval: {}s'.format(options.eval_interval))
    logger.info('\t sample_nodes: {}'.format(options.sample_nodes))
    logger.info('\t sample_nodes_rule: {}'.format(options.sample_nodes_rule))
    logger.info('\t repeat {} times'.format(options.repeated_times))
    logger.info('\t eval_workers: {}'.format(options.eval_workers))

    logger.info("constructing eval network ...")
    net_eval = network.construct_network(data_path=options.eval_data_path,
                                         data_format=options.data_format,
                                         print_net_info=False,
                                         isdirected=options.isdirected)
    eval_net_nodes_size = net_eval.get_nodes_size()
    eval_net_edges_size = net_eval.get_edges_size()
    logger.info("eval_net_nodes_size = {}".format(eval_net_nodes_size))
    logger.info("eval_net_edges_size = {}".format(eval_net_edges_size))

    logger.info("constructing except(train) network ...")
    net_except = network.construct_network(data_path=options.except_data_path,
                                           data_format=options.data_format,
                                           print_net_info=False,
                                           isdirected=options.isdirected)
    except_net_nodes_size = net_except.get_nodes_size()
    except_net_edges_size = net_except.get_edges_size()
    logger.info("except_net_nodes_size = {}".format(except_net_nodes_size))
    logger.info("except_net_edges_size = {}".format(except_net_edges_size))

    id_list = list(range(eval_net_nodes_size))  # must be [0,1,2,3,...]
    SAMPLE_NODES = options.sample_nodes
    SAMPLE_RULE = options.sample_nodes_rule
    METIRC = options.similarity_metric
    PREC_K = options.precK_max_index

    metric_prec_k_list = [1]
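    # the loop below extends this to a 1/10/20/50/100/200/500/... sequence of
    # K cutoffs, capped at precK_max_index; these become the Pr@K columns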
    decimal_number = 10
    while metric_prec_k_list[-1] < options.precK_max_index:
        if decimal_number <= options.precK_max_index:
            metric_prec_k_list.append(decimal_number)
        else:
            break
        if 2 * decimal_number <= options.precK_max_index:
            metric_prec_k_list.append(2 * decimal_number)
        else:
            break
        if 5 * decimal_number <= options.precK_max_index:
            metric_prec_k_list.append(5 * decimal_number)
        else:
            break
        decimal_number = decimal_number * 10

    if options.sample_nodes > 0:
        if options.eval_workers > 1 and options.repeated_times > 1:
            # speed up by using multi-process
            logger.info("\t allocating repeat_times to workers ...")
            if options.repeated_times <= options.eval_workers:
                times_per_worker = [1 for _ in range(options.repeated_times)]
            else:
                div, mod = divmod(options.repeated_times, options.eval_workers)
                times_per_worker = [div for _ in range(options.eval_workers)]
                for idx in range(mod):
                    times_per_worker[idx] = times_per_worker[idx] + 1
            assert sum(times_per_worker) == options.repeated_times, \
                'worker allocation failed: %d != %d' % (
                    sum(times_per_worker), options.repeated_times)

            logger.info("\t using {} processes for evaluating:".format(
                len(times_per_worker)))
            for idx, rep_times in enumerate(times_per_worker):
                logger.info("\t process-{}: repeat {} times".format(
                    idx, rep_times))

    fr_total = open(options.link_prediction_path, 'w')
    fr_total.write('eval case: link-prediction ...\n')
    fr_total.write('\t save_path: {}\n'.format(options.link_prediction_path))
    fr_total.write('\t eval_data_path: {}\n'.format(options.eval_data_path))
    fr_total.write('\t except_data_path: {}\n'.format(
        options.except_data_path))
    fr_total.write('\t data_format: {}\n'.format(options.data_format))
    fr_total.write('\t metrics: MAP and precision@K\n')
    fr_total.write('\t max_index for precision@K: {}\n'.format(
        options.precK_max_index))
    fr_total.write('\t similarity_metric: {}\n'.format(
        options.similarity_metric))
    fr_total.write('\t eval_online: {}\n'.format(options.eval_online))
    fr_total.write('\t eval_interval: {}s\n'.format(options.eval_interval))
    fr_total.write('\t sample_nodes: {}\n'.format(options.sample_nodes))
    fr_total.write('\t sample_nodes_rule: {}\n'.format(
        options.sample_nodes_rule))
    fr_total.write('\t repeat {} times\n'.format(options.repeated_times))
    fr_total.write('\t eval_workers: {}\n'.format(options.eval_workers))
    fr_total.write("eval_net_nodes_size = {}\n".format(eval_net_nodes_size))
    fr_total.write("eval_net_edges_size = {}\n".format(eval_net_edges_size))
    fr_total.write(
        "except_net_nodes_size = {}\n".format(except_net_nodes_size))
    fr_total.write(
        "except_net_edges_size = {}\n".format(except_net_edges_size))
    fr_total.write(
        '\t results:\n=============================================================\n'
    )
    fr_total.write('finish_time\tckpt\tMAP\t')
    for v in metric_prec_k_list:
        fr_total.write('\tPr@{}'.format(v))
    fr_total.write("\n")

    last_step = 0
    summary_writer = tf.summary.FileWriter(link_prediction_dir, tf.Graph())
    summary = tf.Summary()
    summary.value.add(tag='MAP', simple_value=0.)
    for v in metric_prec_k_list:
        summary.value.add(tag='Pr_{}'.format(v), simple_value=0.)
    summary_writer.add_summary(summary, last_step)

    best_MAP = 0

    ckpt_dir = os.path.join(os.path.split(options.vectors_path)[0], 'ckpt')
    ckpt = tf.train.get_checkpoint_state(ckpt_dir)
    while not (ckpt and ckpt.model_checkpoint_path):
        logger.info("\t model and vectors not exist, waiting ...")
        time.sleep(options.eval_interval)
        ckpt = tf.train.get_checkpoint_state(ckpt_dir)

    reading = options.vectors_path + ".reading_link_prediction"
    writing = options.vectors_path + ".writing"
    while options.eval_online:
        while True:
            ckpt = tf.train.get_checkpoint_state(ckpt_dir)
            cur_step = int(
                ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])
            if cur_step <= last_step or (not os.path.exists(
                    options.vectors_path)) or os.path.exists(writing):
                if os.path.exists(
                        os.path.join(
                            os.path.split(options.vectors_path)[0],
                            "RUN_SUCCESS")):
                    return
                time.sleep(options.eval_interval)
                continue
            # ready for reading
            logger.info("\t declare for reading ...")
            open(reading, "w").close()  # declare that we are reading
            time.sleep(30)
            ckpt = tf.train.get_checkpoint_state(ckpt_dir)
            cur_step = int(
                ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])
            if cur_step <= last_step or (not os.path.exists(
                    options.vectors_path)) or os.path.exists(writing):
                os.remove(reading)  # undeclare
                logger.info("\t confliction! undeclare and waiting ...")
                time.sleep(options.eval_interval)
                continue

            break
        logger.info("\t eval ckpt-{}.......".format(cur_step))
        # loading features_matrix(already trained)
        logger.info('\t reading embedding vectors from file {}'.format(
            options.vectors_path))
        time_start = time.time()
        features_matrix = utils.get_vectors(
            utils.get_KeyedVectors(options.vectors_path), id_list)
        os.remove(reading)
        logger.info("\t done for reading ...")
        logger.info('\t reading embedding vectors completed in {}s'.format(
            time.time() - time_start))
        logger.info('total loaded nodes: {}'.format(
            np.size(features_matrix, axis=0)))
        logger.info('the embedding dimension: {}'.format(
            np.size(features_matrix, axis=1)))

        # write this checkpoint's results to a step-suffixed file
        fr = open(options.link_prediction_path + '.{}'.format(cur_step), 'w')
        fr.write('eval case: link-prediction ...\n')
        fr.write('\t save_path: {}\n'.format(options.link_prediction_path))
        fr.write('\t eval_data_path: {}\n'.format(options.eval_data_path))
        fr.write('\t except_data_path: {}\n'.format(options.except_data_path))
        fr.write('\t data_format: {}\n'.format(options.data_format))
        fr.write('\t metrics: MAP and precision@K\n')
        fr.write('\t max_index for precision@K: {}\n'.format(
            options.precK_max_index))
        fr.write('\t similarity_metric: {}\n'.format(
            options.similarity_metric))
        fr.write('\t eval_online: {}\n'.format(options.eval_online))
        fr.write('\t eval_interval: {}s\n'.format(options.eval_interval))
        fr.write('\t sample_nodes: {}\n'.format(options.sample_nodes))
        fr.write('\t sample_nodes_rule: {}\n'.format(
            options.sample_nodes_rule))
        fr.write('\t repeat {} times\n'.format(options.repeated_times))
        fr.write('\t eval_workers: {}\n'.format(options.eval_workers))
        fr.write("eval_net_nodes_size = {}\n".format(eval_net_nodes_size))
        fr.write("eval_net_edges_size = {}\n".format(eval_net_edges_size))
        fr.write("except_net_nodes_size = {}\n".format(except_net_nodes_size))
        fr.write("except_net_edges_size = {}\n".format(except_net_edges_size))
        fr.write('total loaded nodes: {}\n'.format(
            np.size(features_matrix, axis=0)))
        fr.write('the embedding dimension: {}\n'.format(
            np.size(features_matrix, axis=1)))

        if options.sample_nodes > 0:
            if options.eval_workers > 1 and options.repeated_times > 1:
                # speed up by using multi-process
                ret_list = []  # [[MAP, precisionK_list], ... ]
                with ProcessPoolExecutor(
                        max_workers=options.eval_workers) as executor:
                    for ret in executor.map(_sample_thread_body,
                                            times_per_worker):
                        ret_list.extend(ret)
                if len(ret_list) != options.repeated_times:
                    logger.warning(
                        "warning: eval repeated_times mismatch: {} != {}".
                        format(len(ret_list), options.repeated_times))
            else:
                ret_list = _sample_thread_body(options.repeated_times)
        else:
            # no sampling, no repeat!
            ret_list = [_eval(net_eval,
                              net_except)]  # [[MAP, precisionK_list]]

        fr_total.write('%s ckpt-%-9d: ' % (time.strftime(
            '%Y-%m-%d %H:%M:%S', time.localtime(time.time())), cur_step))
        summary = tf.Summary()

        if options.sample_nodes > 0:
            fr.write(
                'expected repeated_times: {}, actual repeated_times: {}, mean results as follows:\n'
                .format(options.repeated_times, len(ret_list)))
        else:
            fr.write(
                'because sample_nodes = {}, actual repeated_times = {}; results as follows:\n'
                .format(options.sample_nodes, len(ret_list)))

        mean_MAP = np.mean([ret[0] for ret in ret_list])
        mean_precisionK = np.mean([ret[1] for ret in ret_list], axis=0)

        fr.write('\t\t MAP = {}\n'.format(mean_MAP))
        for k in range(options.precK_max_index):
            if k < len(mean_precisionK):
                fr.write('\t\t precisionK_{} = {}\n'.format(
                    k + 1, mean_precisionK[k]))
            else:
                fr.write('\t\t precisionK_{} = None\n'.format(k + 1))
        fr.write('details:\n')
        for repeat in range(len(ret_list)):
            fr.write('\t repeated {}/{}:\n'.format(repeat + 1, len(ret_list)))
            MAP = ret_list[repeat][0]
            precisionK_list = ret_list[repeat][1]
            fr.write('\t\t MAP = {}\n'.format(MAP))
            for k in range(options.precK_max_index):
                if k < len(precisionK_list):
                    fr.write('\t\t precisionK_{} = {}\n'.format(
                        k + 1, precisionK_list[k]))
                else:
                    fr.write('\t\t precisionK_{} = None\n'.format(k + 1))

        fr.write('\neval case: link_prediction completed in {}s.'.format(
            time.time() - time_start))
        fr.close()

        fr_total.write('%.4f' % mean_MAP)
        summary.value.add(tag='MAP', simple_value=mean_MAP)
        for v in metric_prec_k_list:
            fr_total.write('\t%.4f' % mean_precisionK[v - 1])
            summary.value.add(tag='Pr_{}'.format(v),
                              simple_value=mean_precisionK[v - 1])
        fr_total.write("\n")
        fr_total.flush()
        summary_writer.add_summary(summary, cur_step)
        summary_writer.flush()
        logger.info(
            'eval case: link_prediction completed in {}s.\n================================='
            .format(time.time() - time_start))

        # keep a copy of the ckpt files for the best mean MAP seen so far
        if mean_MAP > best_MAP:
            best_MAP = mean_MAP

            ckptIsExists = os.path.exists(
                os.path.join(ckpt_dir, 'model.ckpt-%d.index' % cur_step))
            if ckptIsExists:
                fr_best = open(
                    os.path.join(link_prediction_dir, 'best_ckpt.info'), 'w')
            else:
                fr_best = open(
                    os.path.join(link_prediction_dir, 'best_ckpt.info'), 'a')
                fr_best.write(
                    "Note: model.ckpt-best is left over from the previous best ckpt!\n"
                    "The current best ckpt files are lost, but the result was:\n"
                )
            fr_best.write("best_MAP: {}\n".format(best_MAP))
            fr_best.write("best_ckpt: ckpt-{}\n".format(cur_step))
            fr_best.close()

            if ckptIsExists:
                sourceFile = os.path.join(
                    ckpt_dir, 'model.ckpt-%d.data-00000-of-00001' % cur_step)
                targetFile = os.path.join(
                    link_prediction_dir, 'model.ckpt-best.data-00000-of-00001')
                if os.path.exists(targetFile):
                    os.remove(targetFile)
                shutil.copy(sourceFile, targetFile)
                sourceFile = os.path.join(ckpt_dir,
                                          'model.ckpt-%d.index' % cur_step)
                targetFile = os.path.join(link_prediction_dir,
                                          'model.ckpt-best.index')
                if os.path.exists(targetFile):
                    os.remove(targetFile)
                shutil.copy(sourceFile, targetFile)
                sourceFile = os.path.join(ckpt_dir,
                                          'model.ckpt-%d.meta' % cur_step)
                targetFile = os.path.join(link_prediction_dir,
                                          'model.ckpt-best.meta')
                if os.path.exists(targetFile):
                    os.remove(targetFile)
                shutil.copy(sourceFile, targetFile)

        last_step = cur_step

    fr_total.close()
    summary_writer.close()
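
eval_online coordinates with the separate training process through marker files beside the vectors file; the ".writing" marker is assumed (not shown in this code) to be created by the trainer while it saves vectors. A minimal sketch of the same handshake with a hypothetical path:

import os
import time

vectors_path = "output/embedding.vectors"            # hypothetical
reading = vectors_path + ".reading_link_prediction"  # created by this evaluator
writing = vectors_path + ".writing"                  # assumed: created by the trainer

if os.path.exists(vectors_path) and not os.path.exists(writing):
    open(reading, "w").close()  # declare that we are reading
    time.sleep(30)              # grace period, then re-check for a writer
    if not os.path.exists(writing):
        pass                    # safe to load vectors_path here
    os.remove(reading)          # undeclare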
Example #9
def build_walk_corpus(options, net=None):
    global walker

    # check walk info and record
    if not utils.check_rebuild(options.corpus_store_path,
                               descrip='walk corpus',
                               always_rebuild=options.always_rebuild):
        return
    if options.model == "DeepWalk":
        random_walker = "uniform"
    elif options.model == "Node2Vec":
        random_walker = "bias"
    else:
        logger.error("Unknown model or it cann't build walk corpus: '%s'." %
                     options.model)
        sys.exit()
    if net is None:
        net = network.construct_network(options)

    logger.info('Corpus build: walk info:')
    logger.info('\t random_walker = {}'.format(random_walker))
    logger.info('\t walk times = {}'.format(options.walk_times))
    logger.info('\t walk length = {}'.format(options.walk_length))
    if random_walker == "uniform":
        logger.info('\t walk restart = {}'.format(options.walk_restart))
    elif random_walker == "bias":
        logger.info('\t return_parameter (p) = {}'.format(options.p))
        logger.info('\t in-out_parameter (q) = {}'.format(options.q))
    logger.info('\t max walk workers = {}'.format(options.walk_workers))
    logger.info('\t walk to memory = {}'.format(str(options.walk_to_memory)))
    if options.walk_to_memory:
        logger.info('\t do not store corpus = {}'.format(
            str(options.not_store_corpus)))
        if not options.not_store_corpus:
            logger.info('\t corpus store path = {}'.format(
                options.corpus_store_path))
    else:
        logger.info('\t corpus store path = {}'.format(
            options.corpus_store_path))

    fr_walks = open(
        os.path.join(
            os.path.split(options.corpus_store_path)[0], 'walks.info'), 'w')
    fr_walks.write('Corpus walk info:\n')
    fr_walks.write('\t random_walker = {}\n'.format(random_walker))
    fr_walks.write('\t walk times = {}\n'.format(options.walk_times))
    fr_walks.write('\t walk length = {}\n'.format(options.walk_length))
    if random_walker == "uniform":
        fr_walks.write('\t walk restart = {}\n'.format(options.walk_restart))
    elif random_walker == "bias":
        fr_walks.write('\t return_parameter (p) = {}\n'.format(options.p))
        fr_walks.write('\t in-out_parameter (q) = {}\n'.format(options.q))
    fr_walks.write('\t max walk workers = {}\n'.format(options.walk_workers))
    fr_walks.write('\t walk to memory = {}\n'.format(
        str(options.walk_to_memory)))
    if options.walk_to_memory:
        fr_walks.write('\t do not store corpus = {}\n'.format(
            str(options.not_store_corpus)))
        if not options.not_store_corpus:
            fr_walks.write('\t corpus store path = {}\n'.format(
                options.corpus_store_path))
    else:
        fr_walks.write('\t corpus store path = {}\n'.format(
            options.corpus_store_path))
    fr_walks.close()

    walker = Walker(net,
                    random_walker=random_walker,
                    walk_length=options.walk_length,
                    p=options.p,
                    q=options.q)
    if random_walker == "bias":
        # walker.preprocess_transition_probs(options.walk_workers)
        walker.preprocess_transition_probs(net_info_path=options.net_info_path)

    walk_corpus = None
    if options.walk_to_memory:
        walk_corpus = build_walk_corpus_to_memory(
            options.walk_times, max_num_workers=options.walk_workers)
        if not options.not_store_corpus:
            store_walk_corpus(options.corpus_store_path,
                              walk_corpus,
                              always_rebuild=options.always_rebuild)
    else:
        # walk to files
        walk_files = build_walk_corpus_to_files(
            options.corpus_store_path,
            options.walk_times,
            headflag_of_index_file=options.headflag_of_index_file,
            max_num_workers=options.walk_workers,
            always_rebuild=options.always_rebuild)
        if "train" in options.task:
            if options.load_from_memory:
                walk_corpus = load_walks_corpus(walk_files)
            else:
                walk_corpus = WalksCorpus(walk_files)
    del walker
    gc.collect()
    return walk_corpus
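
A hedged usage sketch for the DeepWalk branch above, with a hypothetical options namespace (the attribute names are the ones the function reads; the values are made up):

from argparse import Namespace

options = Namespace(corpus_store_path="output/walks.corpus",
                    always_rebuild=True,
                    model="DeepWalk",
                    walk_times=10, walk_length=40, walk_restart=0.0,
                    p=1.0, q=1.0, walk_workers=4,
                    walk_to_memory=False, not_store_corpus=False,
                    headflag_of_index_file="", load_from_memory=False,
                    task=["train"], net_info_path="data/net.info")
# corpus = build_walk_corpus(options, net=my_net)  # my_net: a prebuilt network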
Example #10
def build_walk_corpus(options):
    global walker

    # check walk info and record
    if not utils.check_rebuild(options.corpus_store_path,
                               descrip='walk corpus',
                               always_rebuild=options.always_rebuild):
        return
    if options.model == "DeepWalk":
        random_walker = "uniform"
        net = network.construct_network(options, isHIN=False)
    elif options.model == "SpaceyWalk":
        random_walker = "spacey"
        net = network.construct_network(options, isHIN=True)
    elif options.model == "MetatreeWalk":
        random_walker = "metatreewalk"
        net = network.construct_network(options, isHIN=True)
    else:
        logger.error("Unknown model or it cann't build walk corpus: '%s'." %
                     options.model)
        sys.exit()

    logger.info('Corpus build: walk info:')
    logger.info('\t data_dir = {}'.format(options.data_dir))
    logger.info('\t data_name = {}'.format(options.data_name))
    logger.info('\t isdirected = {}\n'.format(options.isdirected))
    logger.info('\t random_walker = {}'.format(random_walker))
    logger.info('\t walk_times = {}'.format(options.walk_times))
    logger.info('\t walk_length = {}'.format(options.walk_length))
    logger.info('\t max_walk_workers = {}'.format(options.walk_workers))
    logger.info('\t walk_to_memory = {}'.format(options.walk_to_memory))
    logger.info('\t seed = {}'.format(options.seed))
    logger.info('\t alpha = {}'.format(options.alpha))
    logger.info('\t window_size = {}'.format(options.window_size))
    logger.info('\t sample_size = {}'.format(options.sample_size))
    if options.walk_to_memory:
        logger.info('\t do not store corpus = {}'.format(
            str(options.not_store_corpus)))
        if not options.not_store_corpus:
            logger.info('\t corpus store path = {}'.format(
                options.corpus_store_path))
    else:
        logger.info('\t corpus store path = {}'.format(
            options.corpus_store_path))

    fr_walks = open(
        os.path.join(
            os.path.split(options.corpus_store_path)[0], 'walks.info'), 'w')
    fr_walks.write('Corpus walk info:\n')
    fr_walks.write('\t data_dir = {}\n'.format(options.data_dir))
    fr_walks.write('\t data_name = {}\n'.format(options.data_name))
    fr_walks.write('\t isdirected = {}\n\n'.format(options.isdirected))
    fr_walks.write('\t random_walker = {}\n'.format(random_walker))
    fr_walks.write('\t walk times = {}\n'.format(options.walk_times))
    fr_walks.write('\t walk length = {}\n'.format(options.walk_length))
    fr_walks.write('\t max walk workers = {}\n'.format(options.walk_workers))
    fr_walks.write('\t seed = {}\n'.format(options.seed))
    fr_walks.write('\t alpha = {}\n'.format(options.alpha))
    fr_walks.write('\t window_size = {}\n'.format(options.window_size))
    fr_walks.write('\t sample_size = {}\n'.format(options.sample_size))
    fr_walks.write('\t walk to memory = {}\n'.format(
        str(options.walk_to_memory)))
    if options.walk_to_memory:
        fr_walks.write('\t do not store corpus = {}\n'.format(
            str(options.not_store_corpus)))
        if not options.not_store_corpus:
            fr_walks.write('\t corpus store path = {}\n'.format(
                options.corpus_store_path))
    else:
        fr_walks.write('\t corpus store path = {}\n'.format(
            options.corpus_store_path))
    fr_walks.close()

    if options.model == "SpaceyWalk":
        if options.using_metapath == "metagraph":
            metagraph = network.construct_meta_graph(
                options.metapath_path, isdirected=options.isdirected)
        elif options.using_metapath == "metatree":
            metagraph = network.construct_meta_tree(options.metapath_path,
                                                    isdirected=True)
        elif options.using_metapath == "metaschema":
            metagraph = None
        else:
            logger.error("Unknown feature : '%s'." % options.using_metapath)
            sys.exit()
        walker = Walker(net,
                        random_walker=random_walker,
                        walk_length=options.walk_length,
                        metagraph=metagraph,
                        using_metapath=options.using_metapath,
                        history_position=options.history_position,
                        task="walk",
                        alpha=options.alpha)
    elif options.model == "MetatreeWalk":
        metagraph = network.construct_meta_tree(options.metapath_path,
                                                isdirected=True)
        walker = Walker(net,
                        random_walker=random_walker,
                        walk_length=options.walk_length,
                        metagraph=metagraph,
                        task="walk")

    corpus_store_dir = os.path.split(options.corpus_store_path)[0]
    if not os.path.exists(corpus_store_dir):
        os.makedirs(corpus_store_dir)

    logger.info(
        'Corpus build: walking and computing (using %d workers for multiprocessing)...'
        % options.walk_workers)
    time_start = time.time()

    if options.walk_times <= options.walk_workers:
        times_per_worker = [1 for _ in range(options.walk_times)]
    else:
        div, mod = divmod(options.walk_times, options.walk_workers)
        times_per_worker = [div for _ in range(options.walk_workers)]
        for idx in range(mod):
            times_per_worker[idx] = times_per_worker[idx] + 1
    assert sum(times_per_worker) == options.walk_times, \
        'worker allocation failed: %d != %d' % (
            sum(times_per_worker), options.walk_times)

    nodes_total = list(range(walker.nodes_size))
    sp_random = random.Random(options.seed)
    sp_random.shuffle(nodes_total)
    nodes_total = nodes_total[0:options.sample_size]
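    # prepend four hard-coded node ids so they are always walked, regardless
    # of the random sample drawn above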
    nodes_total.insert(0, 8407)
    nodes_total.insert(0, 9891)
    nodes_total.insert(0, 8354)
    nodes_total.insert(0, 8798)
    for node in nodes_total:
        args_list = []
        begin = 0
        for cnt in times_per_worker:
            args_list.append((corpus_store_dir, node, begin + 1, begin + cnt,
                              options.window_size))
            begin += cnt
        with ProcessPoolExecutor(max_workers=options.walk_workers) as executor:
            executor.map(_construct_walk_corpus_and_write_singprocess,
                         args_list)
    logger.info('Corpus build: walk completed in {}s'.format(time.time() -
                                                             time_start))
    del walker
    gc.collect()
    return
Example #11
def build_walk_corpus(options):
    global walker

    # check walk info and record
    if not utils.check_rebuild(options.corpus_store_path,
                               descrip='walk corpus',
                               always_rebuild=options.always_rebuild):
        return
    if options.model == "DeepWalk":
        random_walker = "uniform"
        net = network.construct_network(options, isHIN=False)
    elif options.model == "SpaceyWalk":
        random_walker = "spacey"
        net = network.construct_network(options, isHIN=True)
    elif options.model == "MetatreeWalk":
        random_walker = "metatreewalk"
        net = network.construct_network(options, isHIN=True)
    else:
        logger.error("Unknown model or it cann't build walk corpus: '%s'." %
                     options.model)
        sys.exit()

    logger.info('Corpus build: walk info:')
    logger.info('\t data_dir = {}'.format(options.data_dir))
    logger.info('\t data_name = {}'.format(options.data_name))
    logger.info('\t isdirected = {}\n'.format(options.isdirected))
    logger.info('\t random_walker = {}'.format(random_walker))
    logger.info('\t walk_times = {}'.format(options.walk_times))
    logger.info('\t walk_length = {}'.format(options.walk_length))
    logger.info('\t max_walk_workers = {}'.format(options.walk_workers))
    logger.info('\t walk_to_memory = {}'.format(options.walk_to_memory))
    logger.info('\t alpha = {}'.format(options.alpha))
    if options.walk_to_memory:
        logger.info('\t do not store corpus = {}'.format(
            str(options.not_store_corpus)))
        if not options.not_store_corpus:
            logger.info('\t corpus store path = {}'.format(
                options.corpus_store_path))
    else:
        logger.info('\t corpus store path = {}'.format(
            options.corpus_store_path))

    fr_walks = open(
        os.path.join(
            os.path.split(options.corpus_store_path)[0], 'walks.info'), 'w')
    fr_walks.write('Corpus walk info:\n')
    fr_walks.write('\t data_dir = {}\n'.format(options.data_dir))
    fr_walks.write('\t data_name = {}\n'.format(options.data_name))
    fr_walks.write('\t isdirected = {}\n\n'.format(options.isdirected))
    fr_walks.write('\t random_walker = {}\n'.format(random_walker))
    fr_walks.write('\t walk times = {}\n'.format(options.walk_times))
    fr_walks.write('\t walk length = {}\n'.format(options.walk_length))
    fr_walks.write('\t max walk workers = {}\n'.format(options.walk_workers))
    fr_walks.write('\t walk to memory = {}\n'.format(
        str(options.walk_to_memory)))
    if options.walk_to_memory:
        fr_walks.write('\t do not store corpus = {}\n'.format(
            str(options.not_store_corpus)))
        if not options.not_store_corpus:
            fr_walks.write('\t corpus store path = {}\n'.format(
                options.corpus_store_path))
    else:
        fr_walks.write('\t corpus store path = {}\n'.format(
            options.corpus_store_path))
    fr_walks.close()

    if options.model == "DeepWalk":
        walker = Walker(net,
                        random_walker=random_walker,
                        walk_length=options.walk_length)
    elif options.model == "SpaceyWalk":
        if options.using_metapath == "metagraph":
            metagraph = network.construct_meta_graph(
                options.metapath_path, isdirected=options.isdirected)
        elif options.using_metapath == "metatree":
            metagraph = network.construct_meta_tree(options.metapath_path,
                                                    isdirected=True)
        elif options.using_metapath == "metaschema":
            metagraph = None
        else:
            logger.error("Unknown feature : '%s'." % options.using_metapath)
            sys.exit()
        walker = Walker(net,
                        random_walker=random_walker,
                        walk_length=options.walk_length,
                        metagraph=metagraph,
                        using_metapath=options.using_metapath,
                        history_position=options.history_position,
                        task="walk",
                        alpha=options.alpha)
    elif options.model == "MetatreeWalk":
        metagraph = network.construct_meta_tree(options.metapath_path,
                                                isdirected=True)
        walker = Walker(net,
                        random_walker=random_walker,
                        walk_length=options.walk_length,
                        metagraph=metagraph,
                        task="walk")

    walk_corpus = None
    if options.walk_to_memory:
        walk_corpus = build_walk_corpus_to_memory(
            options.walk_times, max_num_workers=options.walk_workers)
        if not options.not_store_corpus:
            store_walk_corpus(options.corpus_store_path,
                              walk_corpus,
                              always_rebuild=options.always_rebuild)
    else:
        # walk to files
        walk_files = build_walk_corpus_to_files(
            options.corpus_store_path,
            options.walk_times,
            headflag_of_index_file=options.headflag_of_index_file,
            max_num_workers=options.walk_workers,
            always_rebuild=options.always_rebuild)
        if "train" in options.task:
            if options.load_from_memory:
                walk_corpus = load_walks_corpus(walk_files)
            else:
                walk_corpus = WalksCorpus(walk_files)
    del walker
    gc.collect()
    return walk_corpus
Example #12
def process(options):
    # parser.add_argument("--train_data_dir", dest="train_data_dir", default="./splited_train_0.8_repeat_1",
    #                     help="the train network data path. like: origin_data_dir/splited_train_${train_ratio}_repeat_${repeat_th}")
    # parser.add_argument("--eval_data_dir", dest="eval_data_dir", default="./splited_eval_0.2_repeat_1",
    #                     help="the eval network data path. like: origin_data_dir/splited_eval_${eval_ratio}_repeat_${repeat_th}")
    logger.info("Data preprocessing: network split ...")
    time_start = time.time()
    origin_data_dir, data_filename = os.path.split(options.data_path)
    data_prefixname = data_filename.split(".")[0]
    data_format = options.data_format
    isdirected = options.isdirected
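    # a single positive train_ratio is used as-is; otherwise the split is
    # swept over the ratios 0.9, 0.8, ..., 0.1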
    if options.train_ratio > 0:
        train_ratio_list = [options.train_ratio]
    else:
        train_ratio_list = [v / 10.0 for v in range(9, 0, -1)]

    logger.info("\t origin_data_dir = {}".format(origin_data_dir))
    logger.info("\t data_filename = {}".format(data_filename))
    logger.info("\t data_format = {}".format(data_format))
    logger.info("\t isdirected = {}".format(isdirected))
    logger.info("\t train_ratio = {}".format(train_ratio_list))
    logger.info("\t repeat_from = {}".format(options.repeat_from))
    logger.info("\t repeat_to = {}".format(options.repeat_to))
    logger.info("\t log_name = {}".format(options.log_name))
    logger.info("\t re_direction_path = {}".format(options.re_direction_path))

    net_origin = network.construct_network(data_path=options.data_path,
                                           data_format=data_format,
                                           print_net_info=False,
                                           isdirected=isdirected)

    for train_ratio in train_ratio_list:
        for repeat_th in range(options.repeat_from, options.repeat_to + 1):
            logger.info(
                "\ntrain_ratio = {}, repeat_th = {}, spliting ...".format(
                    train_ratio, repeat_th))
            # train_data_dir = os.path.join(origin_data_dir, "splited_train_{}_repeat_{}".format(train_ratio, repeat_th))
            # eval_data_dir = os.path.join(origin_data_dir, "splited_eval_{}_repeat_{}".format(round(1-train_ratio, 1), repeat_th))
            train_data_dir = os.path.join(
                origin_data_dir, "splited_train_{}_repeat_{}_train".format(
                    train_ratio, repeat_th))
            eval_data_dir = os.path.join(
                origin_data_dir, "splited_train_{}_repeat_{}_eval".format(
                    train_ratio, repeat_th))
            logger.info("\t train_data_dir = {}".format(train_data_dir))
            logger.info("\t eval_data_dir = {}".format(eval_data_dir))
            if os.path.exists(
                    os.path.join(train_data_dir,
                                 data_prefixname + "." + data_format)):
                logger.info(
                    "train_ratio = {}, repeat_th = {}, already splited, skiped!!!\n"
                    .format(train_ratio, repeat_th))
                continue

            if not os.path.exists(train_data_dir):
                os.mkdir(train_data_dir)
            if not os.path.exists(eval_data_dir):
                os.mkdir(eval_data_dir)

            net_train, net_eval = net_origin.split_by_edges(
                train_ratio=train_ratio)
            net_train.save_network(train_data_dir, data_prefixname,
                                   data_format)
            # net_train.print_net_info(edges_file=os.path.join(train_net_dir, datafilename), file_path=os.path.join(train_net_dir, "net.info"))
            net_eval.save_network(eval_data_dir, data_prefixname, data_format)
            # net_eval.print_net_info(edges_file=os.path.join(eval_net_dir, datafilename), file_path=os.path.join(eval_net_dir, "net.info"))

            # eval_utils.split_network(origin_net_dir = origin_data_dir,
            #                       train_net_dir = train_data_dir,
            #                       eval_net_dir = eval_data_dir,
            #                       data_prefixname = data_prefixname,
            #                       data_format = data_format,
            #                       isdirected = isdirected,
            #                       train_ratio = train_ratio)
            logger.info(
                "train_ratio = {}, repeat_th = {}, split succssed!!!\n".format(
                    train_ratio, repeat_th))

    logger.info('Data preprocessing: network split completed in {}s.'.format(
        time.time() - time_start))
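
# A minimal sketch of the on-disk layout process() produces for a data file
# named "net.edgelist" with train_ratio=0.8 and repeat_th=1 (directory names
# follow the format strings above; the file name is illustrative):
#
#   origin_data_dir/
#       net.edgelist
#       splited_train_0.8_repeat_1_train/net.edgelist
#       splited_train_0.8_repeat_1_eval/net.edgelist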
Beispiel #13
0
def train_vectors(options):
    # check vectors and ckpt
    checkpoint = '0'
    train_vec_dir = os.path.split(options.vectors_path)[0]
    ckpt_dir = os.path.join(train_vec_dir, 'ckpt')
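    # tf.train.get_checkpoint_state reads the `checkpoint` index file under
    # ckpt_dir and returns None when no checkpoint has been saved there yet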
    ckpt = tf.train.get_checkpoint_state(ckpt_dir)
    if ckpt and ckpt.model_checkpoint_path:
        cur_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]
        logger.info(
            "model and vectors already exists, checkpoint step = {}".format(
                cur_step))
        checkpoint = input(
            "please input 0 to start a new training run, or a chosen ckpt step to restore (-1 for the latest ckpt): "
        )
    if checkpoint == '0':
        if ckpt:
            tf.gfile.DeleteRecursively(ckpt_dir)
        logger.info('starting a new embedding training run with TensorFlow ...')
    elif checkpoint == '-1':
        logger.info(
            'resuming embedding training with TensorFlow from the latest ckpt ...')
    else:
        logger.info(
            'resuming embedding training with TensorFlow from ckpt-%s ...' %
            checkpoint)
    if not os.path.exists(train_vec_dir):
        os.makedirs(train_vec_dir)
    if not os.path.exists(ckpt_dir):
        os.makedirs(ckpt_dir)

    # construct network
    net = network.construct_network(options)

    lr_file = os.path.join(train_vec_dir, "lr.info")
    np.savetxt(lr_file,
               np.asarray([
                   options.learning_rate, options.decay_epochs,
                   options.decay_rate, options.iter_epoches
               ],
                          dtype=np.float32),
               fmt="%.6f")

    dataset = DataSet(nodes_size=net.get_nodes_size(),
                      edges_list=net.edges,
                      shuffled=not options.unshuffled)

    # train info
    logger.info('Train info:')
    logger.info('\t train_model = {}'.format(options.model))
    logger.info('\t total embedding nodes = {}'.format(net.get_nodes_size()))
    logger.info('\t total edges = {}'.format(net.get_edges_size()))
    logger.info('\t embedding size = {}'.format(options.embedding_size))
    logger.info('\t struct = {}'.format(options.struct))
    logger.info('\t alpha = {}'.format(options.alpha))
    logger.info('\t beta = {}'.format(options.beta))
    logger.info('\t gamma = {}'.format(options.gamma))
    logger.info('\t reg = {}\n'.format(options.reg))
    logger.info('\t shuffled in training = {}'.format(not options.unshuffled))
    logger.info('\t sparse_dot = {}'.format(options.sparse_dot))
    logger.info('\t batch_size = {}'.format(options.batch_size))
    logger.info('\t iter_epoches = {}'.format(options.iter_epoches))
    logger.info('\t init_learning_rate = {}'.format(options.learning_rate))
    logger.info('\t decay_epochs = {}'.format(options.decay_epochs))
    logger.info('\t decay_interval = {}'.format(options.decay_interval))
    logger.info('\t decay_rate = {}'.format(options.decay_rate))
    logger.info('\t loss_interval = {}s'.format(options.loss_interval))
    logger.info('\t summary_steps = {}'.format(options.summary_steps))
    logger.info('\t summary_interval = {}s'.format(options.summary_interval))
    logger.info('\t ckpt_epochs = {}'.format(options.ckpt_epochs))
    logger.info('\t ckpt_interval = {}s\n'.format(options.ckpt_interval))
    logger.info('\t dbn_initial = {}'.format(options.dbn_initial))
    logger.info('\t dbn_epochs = {}'.format(options.dbn_epochs))
    logger.info('\t dbn_batchsize = {}'.format(options.dbn_batchsize))
    logger.info('\t dbn_learning_rate = {}'.format(options.dbn_learning_rate))
    logger.info('\t active_function = {}\n'.format(options.active_function))
    logger.info('\t using_gpu = {}'.format(options.using_gpu))
    logger.info('\t visible_device_list = {}'.format(
        options.visible_device_list))
    logger.info('\t log_device_placement = {}'.format(
        options.log_device_placement))
    logger.info('\t allow_soft_placement = {}'.format(
        options.allow_soft_placement))
    logger.info('\t gpu_memory_fraction = {}'.format(
        options.gpu_memory_fraction))
    logger.info('\t gpu_memory_allow_growth = {}\n'.format(
        options.allow_growth))

    logger.info('\t ckpt_dir = {}'.format(ckpt_dir))
    logger.info('\t vectors_path = {}'.format(options.vectors_path))
    logger.info('\t learning_rate_path = {}'.format(lr_file))

    fr_vec = open(os.path.join(train_vec_dir, 'embedding.info'), 'w')
    fr_vec.write('embedding info:\n')
    fr_vec.write('\t train_model = {}\n'.format(options.model))
    fr_vec.write('\t total embedding nodes = {}\n'.format(
        net.get_nodes_size()))
    fr_vec.write('\t total edges = {}\n'.format(net.get_edges_size()))
    fr_vec.write('\t embedding size = {}\n'.format(options.embedding_size))
    fr_vec.write('\t struct = {}\n'.format(options.struct))
    fr_vec.write('\t alpha = {}\n'.format(options.alpha))
    fr_vec.write('\t beta = {}\n'.format(options.beta))
    fr_vec.write('\t gamma = {}\n'.format(options.gamma))
    fr_vec.write('\t reg = {}\n\n'.format(options.reg))
    fr_vec.write(
        '\t shuffled in training = {}\n'.format(not options.unshuffled))
    fr_vec.write('\t sparse_dot = {}\n'.format(options.sparse_dot))
    fr_vec.write('\t batch_size = {}\n'.format(options.batch_size))
    fr_vec.write('\t iter_epoches = {}\n'.format(options.iter_epoches))
    fr_vec.write('\t init_learning_rate = {}\n'.format(options.learning_rate))
    fr_vec.write('\t decay_epochs = {}\n'.format(options.decay_epochs))
    fr_vec.write('\t decay_interval = {}\n'.format(options.decay_interval))
    fr_vec.write('\t decay_rate = {}\n'.format(options.decay_rate))
    fr_vec.write('\t loss_interval = {}s\n'.format(options.loss_interval))
    fr_vec.write('\t summary_steps = {}\n'.format(options.summary_steps))
    fr_vec.write('\t summary_interval = {}s\n'.format(
        options.summary_interval))
    fr_vec.write('\t ckpt_epochs = {}\n'.format(options.ckpt_epochs))
    fr_vec.write('\t ckpt_interval = {}s\n\n'.format(options.ckpt_interval))
    fr_vec.write('\t dbn_initial = {}\n'.format(options.dbn_initial))
    fr_vec.write('\t dbn_epochs = {}\n'.format(options.dbn_epochs))
    fr_vec.write('\t dbn_batchsize = {}\n'.format(options.dbn_batchsize))
    fr_vec.write('\t dbn_learning_rate = {}\n'.format(
        options.dbn_learning_rate))
    fr_vec.write('\t active_function = {}\n'.format(options.active_function))
    fr_vec.write('\t using_gpu = {}\n'.format(options.using_gpu))
    fr_vec.write('\t visible_device_list = {}\n'.format(
        options.visible_device_list))
    fr_vec.write('\t log_device_placement = {}\n'.format(
        options.log_device_placement))
    fr_vec.write('\t allow_soft_placement = {}\n'.format(
        options.allow_soft_placement))
    fr_vec.write('\t gpu_memory_fraction = {}\n'.format(
        options.gpu_memory_fraction))
    fr_vec.write('\t gpu_memory_allow_growth = {}\n\n'.format(
        options.allow_growth))

    fr_vec.write('\t ckpt_dir = {}\n'.format(ckpt_dir))
    fr_vec.write('\t vectors_path = {}\n'.format(options.vectors_path))
    fr_vec.write('\t learning_rate_path = {}\n'.format(lr_file))

    fr_vec.close()
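    # embedding.info mirrors the "Train info" log above, leaving a
    # self-describing record of the run's hyperparameters on disk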

    # os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    if options.using_gpu:
        visible_devices = str(options.visible_device_list[0])
        for dev in options.visible_device_list[1:]:
            visible_devices = visible_devices + ',%s' % dev
        os.environ['CUDA_VISIBLE_DEVICES'] = visible_devices
    else:
        os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

    # train
    logger.info('training...')
    time_start = time.time()
    train(dataset=dataset,
          vectors_path=options.vectors_path,
          lr_file=lr_file,
          ckpt_dir=ckpt_dir,
          checkpoint=checkpoint,
          embedding_size=options.embedding_size,
          struct=options.struct,
          alpha=options.alpha,
          beta=options.beta,
          gamma=options.gamma,
          reg=options.reg,
          sparse_dot=options.sparse_dot,
          batch_size=options.batch_size,
          initial_learning_rate=options.learning_rate,
          decay_epochs=options.decay_epochs,
          decay_rate=options.decay_rate,
          iter_epochs=options.iter_epoches,
          allow_soft_placement=options.allow_soft_placement,
          log_device_placement=options.log_device_placement,
          gpu_memory_fraction=options.gpu_memory_fraction,
          using_gpu=options.using_gpu,
          allow_growth=options.allow_growth,
          loss_interval=options.loss_interval,
          summary_steps=options.summary_steps,
          ckpt_interval=options.ckpt_interval,
          ckpt_epochs=options.ckpt_epochs,
          summary_interval=options.summary_interval,
          decay_interval=options.decay_interval,
          dbn_initial=options.dbn_initial,
          dbn_epochs=options.dbn_epochs,
          dbn_batchsize=options.dbn_batchsize,
          dbn_learning_rate=options.dbn_learning_rate,
          active_function=options.active_function)
    logger.info('train completed in {}s'.format(time.time() - time_start))
    return