def merge_file():
    # Pull the merged graph and node dumps from HDFS into the local working
    # directory, then verify each local copy by comparing byte sizes.
    sub_dir_path = BASE_FILE_PATH.get_sub_dir_path()
    input_graph_file_path = BASE_FILE_PATH.get_input_graph_path()
    input_nodes_file_path = BASE_FILE_PATH.get_input_nodes_file_path()
    grnt_rel_path = BASE_FILE_PATH.get_output_rel_path()

    # os.system('''
    #     if [ ! -d "{0}" ]; then
    #         mkdir {0}
    #     else
    #         rm {0}*
    #     fi
    # '''.format(sub_dir_path))
    os.system('''
        if [ ! -d "{0}" ]; then
            mkdir {0}
        fi
    '''.format(sub_dir_path))

    if not os.path.exists(input_graph_file_path):
        os.system('touch %s' % input_graph_file_path)
        os.system('hdfs dfs -getmerge %s %s' %
                  (BASE_SPARK.get_hdfs_graph_file_path(), input_graph_file_path))
    if not os.path.exists(input_nodes_file_path):
        os.system('hdfs dfs -getmerge %s %s' %
                  (BASE_SPARK.get_hdfs_nodes_file_path(), input_nodes_file_path))
    if not os.path.exists(grnt_rel_path):
        os.system('touch %s' % grnt_rel_path)

    # Compare the local byte count against the size reported by HDFS to make
    # sure each getmerge completed.
    local_graph_size = int(
        os.popen('ls -la {0} | cut -d " " -f 5'.format(
            input_graph_file_path)).readlines()[0])
    hdfs_graph_size = int(
        os.popen('hdfs dfs -du -s {0} | cut -d " " -f 1'.format(
            BASE_SPARK.get_hdfs_graph_file_path())).readlines()[0])
    local_nodes_size = int(
        os.popen('ls -la {0} | cut -d " " -f 5'.format(
            input_nodes_file_path)).readlines()[0])
    hdfs_nodes_size = int(
        os.popen('hdfs dfs -du -s {0} | cut -d " " -f 1'.format(
            BASE_SPARK.get_hdfs_nodes_file_path())).readlines()[0])

    if local_graph_size != hdfs_graph_size:
        return False
    if local_nodes_size != hdfs_nodes_size:
        return False
    return True
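# Illustrative only (not part of the original script): the "did the getmerge
# complete?" check above shells out to `ls | cut` for the local side. A minimal
# equivalent sketch, assuming the same local path and an HDFS byte count
# obtained from `hdfs dfs -du -s`, could use os.path.getsize instead:
def _local_copy_complete(local_path, hdfs_size):
    # Hypothetical helper; returns True when the merged local file matches
    # the byte count reported by HDFS.
    return os.path.exists(local_path) and os.path.getsize(local_path) == hdfs_size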
def main(rel_path, eid_mapping_path, cutoff=10):
    manager = mp.Manager()
    queue_sg = manager.Queue()
    queue_lg = manager.Queue()
    queue_write = manager.Queue()
    lock = manager.Lock()
    # Two extra pool slots for the listener processes that sit between the
    # workers and the writer.
    pool = mp.Pool(mp.cpu_count() + 2)
    f_rel = open(rel_path, 'wb')
    f_eid = open(eid_mapping_path, 'wb')

    pool.apply_async(listener_small_graph, (queue_sg, queue_write, ))
    pool.apply_async(listener_large_graph, (queue_lg, queue_write, cutoff, ))
    writer = mp.Process(target=customer, args=(queue_write, lock, f_rel, f_eid, ))
    # Start the writer before the workers so queue_write is drained while the
    # graph lines are still being processed.
    writer.start()

    jobs = []
    input_file = BASE_FILE_PATH.get_input_graph_path()
    with open(input_file, 'r') as f:
        for line in f.readlines():
            line_json = json.loads(line.strip())
            edges = line_json['links']
            job = pool.apply_async(worker, (queue_sg, queue_lg, edges, ))
            jobs.append(job)
    for job in jobs:
        job.get()

    # Tell the listeners there is no more work, then wait for the pool.
    queue_sg.put('end')
    queue_lg.put('end')
    pool.close()
    pool.join()

    # Once the write queue is empty the writer has flushed everything.
    while True:
        if not queue_write.qsize():
            writer.terminate()
            writer.join()
            break

    f_rel.close()
    f_eid.close()
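# The customer/writer used above is not shown in this excerpt. A plausible
# sketch, assuming each queued item is a (kind, line) tuple where kind is
# either 'rel' or 'eid' and line is an encoded JSON line (the names and the
# message format are assumptions, not the original implementation):
def customer(queue_write, lock, f_rel, f_eid):
    # Drain the write queue forever; the parent terminates this process once
    # the queue is empty and all producers have finished.
    while True:
        kind, line = queue_write.get()
        with lock:
            if kind == 'rel':
                f_rel.write(line)
            else:
                f_eid.write(line)
            f_rel.flush()
            f_eid.flush()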
def main(output_path, cutoff=10):
    manager = mp.Manager()
    queue = manager.Queue()
    lock = manager.Lock()
    pool = mp.Pool(mp.cpu_count() + 2)
    fp = open(output_path, 'wb')

    # Start the writer before the producers so the queue is drained while the
    # graph lines are still being processed.
    writer = mp.Process(target=customer, args=(queue, lock, fp, ))
    writer.start()

    jobs = []
    input_file = BASE_FILE_PATH.get_input_graph_path()
    idx = 0
    with open(input_file, 'r') as f:
        for line in f.readlines():
            idx = idx + 1
            line_json = json.loads(line.strip())
            edges = line_json['links']
            job = pool.apply_async(producer, (edges, queue, cutoff, ))
            jobs.append(job)
    for job in jobs:
        job.get()

    pool.close()
    pool.join()

    # Once the queue is empty the writer has flushed everything.
    while True:
        if not queue.qsize():
            writer.terminate()
            writer.join()
            break

    fp.close()
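# This script's customer/writer is also outside the excerpt. A plausible
# sketch, assuming producer() puts ready-to-write byte strings on the queue
# (that message format is an assumption):
def customer(queue, lock, fp):
    # Drain the queue forever; the parent terminates this process once the
    # queue is empty and the pool has been joined.
    while True:
        line = queue.get()
        with lock:
            fp.write(line)
            fp.flush()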
if __name__ == '__main__':
    logging.info('=====Processing start at %s!!!=====' % get_date())
    stat = merge_file()
    if not stat:
        logging.error('Get file from HDFS error!')
        sys.exit(1)
    from utils import NODE
    # Cutoff level comes from the command line, defaulting to 10.
    level = int(sys.argv[1]) if len(sys.argv) > 1 else 10
    output_rel_path = BASE_FILE_PATH.get_output_rel_path()
    output_eid_mapping_path = BASE_FILE_PATH.get_output_eid_mapping_path()
    main(output_rel_path, output_eid_mapping_path, level)

    # Replace the previous rel dump on HDFS with the freshly generated one.
    tmp = os.popen('hdfs dfs -stat %s' %
                   BASE_SPARK.get_hdfs_rel_json_path()).readlines()
    if len(tmp):
        os.system('hdfs dfs -rm %s' % BASE_SPARK.get_hdfs_rel_json_path())
    os.system('hdfs dfs -put %s %s' %
              (output_rel_path, BASE_SPARK.get_hdfs_rel_json_path()))

    # Do the same for the eid mapping file.
    tmp = os.popen('hdfs dfs -stat %s' %
                   BASE_SPARK.get_hdfs_eid_mapping_json_path()).readlines()
    if len(tmp):
        os.system('hdfs dfs -rm %s' %
                  BASE_SPARK.get_hdfs_eid_mapping_json_path())
    os.system('hdfs dfs -put %s %s' %
              (output_eid_mapping_path, BASE_SPARK.get_hdfs_eid_mapping_json_path()))
    logging.info('=====Processing done at %s!!!=====' % get_date())
if __name__ == '__main__':
    logging.info('=====Processing start at %s!!!=====' % get_date())
    stat = merge_file()
    if not stat:
        logging.error('Get file from HDFS error!')
        sys.exit(1)
    from utils import NODE
    # Cutoff level comes from the command line, defaulting to 10.
    level = int(sys.argv[1]) if len(sys.argv) > 1 else 10
    output_list_path = BASE_FILE_PATH.get_output_list_path()
    main(output_list_path, level)

    # Replace the previous list dump on HDFS with the freshly generated one.
    tmp = os.popen('hdfs dfs -stat %s' %
                   BASE_SPARK.get_hdfs_list_json_path()).readlines()
    if len(tmp):
        os.system('hdfs dfs -rm %s' % BASE_SPARK.get_hdfs_list_json_path())
    os.system('hdfs dfs -put %s %s' %
              (output_list_path, BASE_SPARK.get_hdfs_list_json_path()))
    logging.info('=====Processing done at %s!!!=====' % get_date())
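# Typical invocation (the script name here is hypothetical): the only
# positional argument is the cutoff level passed through to main(), e.g.
#
#     python build_list.py 10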
def __init__(self):
    self.hdfs_node_file_path = BASE_SPARK.get_hdfs_nodes_file_path()
    self.input_nodes_file_path = BASE_FILE_PATH.get_input_nodes_file_path()
    self.nodes = self.prepare_nodes()
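# prepare_nodes() is not included in this excerpt. A plausible sketch,
# assuming the merged nodes file holds one JSON object per line with an
# 'eid' field to key on (the field name and structure are assumptions):
def prepare_nodes(self):
    nodes = {}
    with open(self.input_nodes_file_path, 'r') as f:
        for line in f:
            node = json.loads(line.strip())
            nodes[node['eid']] = node
    return nodes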