def do_process(shared_job_q, shared_result_q, top_src_dir, top_dst_dir, operation):
    global g_process_start_time
    global g_process_start_readable_time

    # Walk the source tree, collect every file, and validate each one for
    # the requested operation before any jobs are queued.
    file_list = []
    name_list = _get_samplename_list()
    print 'begin collecting file information...'
    for root, dirs, files in os.walk(top_src_dir):
        for name in files:
            src_file = os.path.join(root, name)
            file_list.append(src_file)
            if operation == OP_PREPROCESS:
                # Mirror the source directory layout under top_dst_dir now,
                # so workers never race each other on makedirs.
                tail_path = src_file[len(top_src_dir):]
                if top_dst_dir[-1] != '/':
                    fet_file = top_dst_dir + '/' + tail_path
                else:
                    fet_file = top_dst_dir + tail_path
                fet_dir = fet_file[0:fet_file.rfind('/')]
                if not os.path.exists(fet_dir):
                    os.makedirs(fet_dir)
            elif operation == OP_MIST2VECTOR or operation == OP_MERGEFET:
                # Every file must carry a known sample name in its full path.
                name_found = False
                for sample_name in name_list:
                    if src_file.find(sample_name) >= 0:
                        name_found = True
                        break
                if not name_found:
                    print 'expected a sample name in the full path: ', src_file
                    sys.exit()
            elif operation == OP_MERGETABLE:
                pass  # validated below, when files are grouped by table
            else:
                print 'unexpected operation: ', operation
                sys.exit()
    print '%d files to process...' % len(file_list)

    if operation == OP_MERGETABLE:
        # Group sub-table files by table name (taken from the file
        # extension); each table becomes one job so a single worker
        # merges that table whole.
        table_dict = {}
        for sub_table_file in file_list:
            pos = sub_table_file.rfind('.')
            if pos != -1:
                table_name = sub_table_file[pos + 1:]
                if table_name in table_dict:
                    table_dict[table_name].append(sub_table_file)
                else:
                    table_dict[table_name] = [sub_table_file]
        # Guard against an empty tree, which would divide by zero below.
        if not table_dict:
            print 'no sub-table files found under: ', top_src_dir
            sys.exit()
        # chunk_size here is an average size, reported for logging only.
        num_files = len(file_list)
        num_jobs = len(table_dict)
        chunk_size = num_files // num_jobs
        for sub_table_files in table_dict.values():
            shared_job_q.put(sub_table_files)
        print 'chunk_size=%d, num_jobs=%d, job_q size=%d' \
            % (chunk_size, num_jobs, shared_job_q.qsize())
        if shared_job_q.qsize() < 10:
            print 'job queue is very small; a cluster run may be unnecessary. Check the input.'
            #sys.exit()
        # Workers put 1 on the result queue per merged table.
        chunk_processed = 0
        while chunk_processed < num_jobs:
            chunk_processed += shared_result_q.get()
            if g_process_start_time == 0:
                print 'setting process_start_time...'
                g_process_start_time = time.time()
                g_process_start_readable_time = common.get_readable_time()
            print 'chunks processed: ', chunk_processed
        print 'all chunks processed!'
        return

    # All other operations: slice file_list into fixed-size chunks, one
    # chunk per job, plus a final shorter chunk for the remainder.
    chunk_size = common.g_chunk_size
    num_files = len(file_list)
    num_jobs = num_files // chunk_size
    for i in range(0, num_jobs):
        shared_job_q.put(file_list[i * chunk_size:(i + 1) * chunk_size])
    if num_jobs * chunk_size < num_files:
        shared_job_q.put(file_list[num_jobs * chunk_size:num_files])
    print 'chunk_size=%d, num_jobs=%d, job_q size=%d' \
        % (chunk_size, num_jobs, shared_job_q.qsize())
    if shared_job_q.qsize() < 10:
        print 'job queue is very small; a cluster run may be unnecessary. Check the input.'
        sys.exit()
    # Workers put the number of files they finished on the result queue.
    files_processed = 0
    while files_processed < num_files:
        files_processed += shared_result_q.get()
        if g_process_start_time == 0:
            print 'setting process_start_time...'
            g_process_start_time = time.time()
            g_process_start_readable_time = common.get_readable_time()
        print '[%s] files processed: %d' % (common.get_readable_time(), files_processed)
    print 'all files processed!'
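# ---------------------------------------------------------------------------
# Hedged sketch (not part of the original module): the worker loop that the
# counting protocol in do_process assumes. do_process only requires that each
# worker take one chunk (a list of paths) off the job queue and put a count
# on the result queue: 1 per chunk for OP_MERGETABLE (the loop compares
# against num_jobs) and len(chunk) otherwise (the loop compares against
# num_files). `example_worker_loop` and `process_one_file` are hypothetical
# names introduced here for illustration.
# ---------------------------------------------------------------------------
import Queue  # Python 2 stdlib; manager queue proxies raise Queue.Empty


def example_worker_loop(job_q, result_q, operation):
    while True:
        try:
            chunk = job_q.get_nowait()  # one list of file paths from do_process
        except Queue.Empty:
            return  # queue drained; this worker is done
        for src_file in chunk:
            process_one_file(src_file, operation)  # hypothetical per-file step
        if operation == OP_MERGETABLE:
            result_q.put(1)            # one merged table counts as one chunk
        else:
            result_q.put(len(chunk))   # do_process sums these toward num_files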
            sys.exit()

    # The excerpt resumes inside main()'s if/elif chain over `operation`.
    # The guard below is reconstructed (the original line falls outside this
    # fragment); OP_MERGETABLE is the likely branch, since only the merged
    # table output lives under common.g_sample_table.
    elif operation == OP_MERGETABLE:
        # The target directory must sit under the sample-table root.
        if dir_of_target.find(common.g_sample_table) == -1:
            print 'invalid dir_of_target: ', dir_of_target, ' must contain ', common.g_sample_table
            sys.exit()
        # Start from an empty target directory: os.rmdir fails if the
        # directory still contains files, which aborts the run.
        if os.path.exists(dir_of_target):
            try:
                os.rmdir(dir_of_target)
            except OSError:
                print 'failed to remove directory (is it non-empty?): ', dir_of_target
                sys.exit(-1)
        os.mkdir(dir_of_target)
    else:
        print 'unexpected operation: ', operation
        sys.exit()

    # Start the manager that serves the shared job/result queues to the
    # worker machines, run the job, then report timing and shut down.
    manager = make_server_manager(listen_port, common.g_auth_key)
    shared_job_q = manager.get_job_q()
    shared_result_q = manager.get_result_q()
    do_process(shared_job_q, shared_result_q, dir_of_source, dir_of_target, operation)
    process_stop_time = time.time()
    print 'process time = %d' % (process_stop_time - g_process_start_time)
    print 'process started at ', g_process_start_readable_time, ', stopped at ', common.get_readable_time()
    time.sleep(1)
    print 'server shutting down...'
    manager.shutdown()
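# ---------------------------------------------------------------------------
# Hedged sketch (not part of this excerpt): make_server_manager is called
# above but defined elsewhere in the module. A conventional implementation
# follows the stdlib multiprocessing.managers pattern, exposing two plain
# queues over the network under the registered names used above. The version
# below is an assumption, not the original code.
# ---------------------------------------------------------------------------
import Queue
from multiprocessing.managers import SyncManager


def make_server_manager(port, authkey):
    # Plain in-process queues; the manager serves them to remote clients.
    job_q = Queue.Queue()
    result_q = Queue.Queue()

    class JobQueueManager(SyncManager):
        pass

    # Remote callers reach the queues through these registered names,
    # matching manager.get_job_q() / manager.get_result_q() above.
    JobQueueManager.register('get_job_q', callable=lambda: job_q)
    JobQueueManager.register('get_result_q', callable=lambda: result_q)

    manager = JobQueueManager(address=('', port), authkey=authkey)
    manager.start()
    print 'server manager listening on port %d' % port
    return manager

# A worker machine would connect with the mirror-image client manager:
#
#   class ServerQueueManager(SyncManager):
#       pass
#   ServerQueueManager.register('get_job_q')
#   ServerQueueManager.register('get_result_q')
#   m = ServerQueueManager(address=(server_ip, port), authkey=common.g_auth_key)
#   m.connect()
#   job_q, result_q = m.get_job_q(), m.get_result_q()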