# Standard-library imports used by these snippets; project-specific names such
# as BASE_FILE_PATH, BASE_SPARK, get_date and the worker/listener/customer
# helpers are assumed to be defined elsewhere in the project.
import json
import logging
import multiprocessing as mp
import os
import sys


def merge_file():
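    """Fetch the merged graph and nodes files from HDFS to local disk and
    verify that the local copies match the HDFS byte counts.
    """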
    sub_dir_path = BASE_FILE_PATH.get_sub_dir_path()
    input_graph_file_path = BASE_FILE_PATH.get_input_graph_path()
    input_nodes_file_path = BASE_FILE_PATH.get_input_nodes_file_path()
    grnt_rel_path = BASE_FILE_PATH.get_output_rel_path()

    # os.system('''
    #     if [ ! -d "{0}" ]; then
    #         mkdir {0}
    #     else
    #         rm {0}*
    #     fi
    # '''.format(sub_dir_path))

    os.system('''
            if [ ! -d "{0}" ]; then
                mkdir {0}
            fi
        '''.format(sub_dir_path))

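    # Merge the HDFS part files for the graph and the nodes into single
    # local files if they are not already present.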
    if not os.path.exists(input_graph_file_path):
        os.system('touch %s' % input_graph_file_path)
        os.system(
            'hdfs dfs -getmerge %s %s' %
            (BASE_SPARK.get_hdfs_graph_file_path(), input_graph_file_path))

    if not os.path.exists(input_nodes_file_path):
        os.system(
            'hdfs dfs -getmerge %s %s' %
            (BASE_SPARK.get_hdfs_nodes_file_path(), input_nodes_file_path))

    if not os.path.exists(grnt_rel_path):
        os.system('touch %s' % grnt_rel_path)

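    # Compare local file sizes against the HDFS sources to confirm the
    # getmerge copies are complete.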
    local_graph_size = int(
        os.popen('ls -la {0} | cut -d " " -f 5'.format(
            input_graph_file_path)).readlines()[0])
    hdfs_graph_size = int(
        os.popen('hdfs dfs -du -s {0} | cut -d " " -f 1'.format(
            BASE_SPARK.get_hdfs_graph_file_path())).readlines()[0])

    local_nodes_size = int(
        os.popen('ls -la {0} | cut -d " " -f 5'.format(
            input_nodes_file_path)).readlines()[0])
    hdfs_nodes_size = int(
        os.popen('hdfs dfs -du -s {0} | cut -d " " -f 1'.format(
            BASE_SPARK.get_hdfs_nodes_file_path())).readlines()[0])

    if local_graph_size != hdfs_graph_size:
        return False

    if local_nodes_size != hdfs_nodes_size:
        return False

    return True


def main(rel_path, eid_mapping_path, cutoff=10):
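    """Build the relation and eid-mapping files from the merged graph.

    Worker jobs push each line's edges onto the small-/large-graph queues,
    the two listeners push results onto queue_write, and a single writer
    process (customer) writes f_rel and f_eid.
    """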
    manager = mp.Manager()
    queue_sg = manager.Queue()
    queue_lg = manager.Queue()
    queue_write = manager.Queue()
    lock = manager.Lock()
    pool = mp.Pool(mp.cpu_count() + 2)
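    # The two extra pool slots leave room for the listener tasks submitted below.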

    f_rel = open(rel_path, 'wb')
    f_eid = open(eid_mapping_path, 'wb')

    pool.apply_async(listener_small_graph, (
        queue_sg,
        queue_write,
    ))
    pool.apply_async(listener_large_graph, (
        queue_lg,
        queue_write,
        cutoff,
    ))
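    # The writer runs in its own process, outside the pool; it is started
    # after the worker jobs have been collected.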
    writer = mp.Process(target=customer,
                        args=(
                            queue_write,
                            lock,
                            f_rel,
                            f_eid,
                        ))

    jobs = []
    input_file = BASE_FILE_PATH.get_input_graph_path()
    with open(input_file, 'r') as f:
        for line in f.readlines():
            line_json = json.loads(line.strip())
            edges = line_json['links']
            job = pool.apply_async(worker, (
                queue_sg,
                queue_lg,
                edges,
            ))
            jobs.append(job)

    for job in jobs:
        job.get()

    writer.start()

    queue_sg.put('end')
    queue_lg.put('end')
    pool.close()
    pool.join()

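    # Wait for the write queue to drain, then stop the writer process.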
    while True:
        if not queue_write.qsize():
            writer.terminate()
            writer.join()
            break

    f_rel.close()
    f_eid.close()

Example #3

def main(output_path, cutoff=10):
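    """Process the merged graph and write the results to output_path.

    Producer jobs push their output onto a shared queue, and a single
    writer process (customer) drains the queue into the output file.
    """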
    manager = mp.Manager()
    queue = manager.Queue()
    lock = manager.Lock()
    pool = mp.Pool(mp.cpu_count() + 2)

    fp = open(output_path, 'wb')
    writer = mp.Process(target=customer, args=(
        queue,
        lock,
        fp,
    ))

    writer.start()

    jobs = []
    input_file = BASE_FILE_PATH.get_input_graph_path()
    idx = 0
    with open(input_file, 'r') as f:
        for line in f.readlines():
            idx = idx + 1
            line_json = json.loads(line.strip())
            edges = line_json['links']
            job = pool.apply_async(producer, (
                edges,
                queue,
                cutoff,
            ))
            jobs.append(job)

    for job in jobs:
        job.get()

    pool.close()
    pool.join()

    while True:
        if not queue.qsize():
            writer.terminate()
            writer.join()
            break

    fp.close()


if __name__ == '__main__':
    logging.info('=====Processing start at %s!!!=====' % get_date())

    stat = merge_file()
    if not stat:
        logging.error('Get file from HDFS error!')
        sys.exit(1)

    from utils import NODE

    level = int(sys.argv[1]) if len(sys.argv) > 1 else 10
    output_rel_path = BASE_FILE_PATH.get_output_rel_path()
    output_eid_mapping_path = BASE_FILE_PATH.get_output_eid_mapping_path()

    main(output_rel_path, output_eid_mapping_path, level)

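    # Refresh the relation and eid-mapping files on HDFS: remove any existing
    # copies, then upload the new outputs.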
    tmp = os.popen('hdfs dfs -stat %s' %
                   BASE_SPARK.get_hdfs_rel_json_path()).readlines()
    if len(tmp):
        os.system('hdfs dfs -rm %s' % BASE_SPARK.get_hdfs_rel_json_path())
    os.system('hdfs dfs -put %s %s' %
              (output_rel_path, BASE_SPARK.get_hdfs_rel_json_path()))

    tmp = os.popen('hdfs dfs -stat %s' %
                   BASE_SPARK.get_hdfs_eid_mapping_json_path()).readlines()
    if len(tmp):
        os.system('hdfs dfs -rm %s' %
                  BASE_SPARK.get_hdfs_eid_mapping_json_path())
    os.system('hdfs dfs -put %s %s' %
              (output_eid_mapping_path,
               BASE_SPARK.get_hdfs_eid_mapping_json_path()))

    logging.info('=====Processing done at %s!!!=====' % get_date())

Example #5

    while True:
        if not queue.qsize():
            writer.terminate()
            writer.join()
            break

    fp.close()


if __name__ == '__main__':
    logging.info('=====Processing start at %s!!!=====' % get_date())

    stat = merge_file()
    if not stat:
        logging.error('Get file from HDFS error!')
        sys.exit(1)

    from utils import NODE

    level = int(sys.argv[1]) if len(sys.argv) > 1 else 10
    output_list_path = BASE_FILE_PATH.get_output_list_path()

    main(output_list_path, level)

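    # Refresh the list JSON on HDFS: remove any existing copy, then upload
    # the new output.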
    tmp = os.popen('hdfs dfs -stat %s' %
                   BASE_SPARK.get_hdfs_list_json_path()).readlines()
    if len(tmp):
        os.system('hdfs dfs -rm %s' % BASE_SPARK.get_hdfs_list_json_path())
    os.system('hdfs dfs -put %s %s' %
              (output_list_path, BASE_SPARK.get_hdfs_list_json_path()))

    logging.info('=====Processing done at %s!!!=====' % get_date())

Example #6

    def __init__(self):
        """Record the HDFS and local nodes-file paths and load the nodes."""
        self.hdfs_node_file_path = BASE_SPARK.get_hdfs_nodes_file_path()
        self.input_nodes_file_path = BASE_FILE_PATH.get_input_nodes_file_path()
        self.nodes = self.prepare_nodes()