def RunSnv(args):
    """Run the SNV analysis for one job and merge the per-cpu results.

    Steps, in order:
      1. Build the configuration with Parse.generate_snv_cfg and echo it.
      2. Load the region table and keep only the rows for cfg['job'].
      3. Open one bgzip writer per entry of cfg['model_order'] and
         cfg['meta_order'].
      4. Call process_regions once per cpu (cpus 1..n-1 through a
         multiprocessing pool, the last cpu in the main process).
      5. Print and delete each per-cpu log file.
      6. For every model/meta tag, append the per-cpu pickled results to its
         bgzip file, delete the pickles, and tabix-index the result.

    Args:
        args: passed unchanged to Parse.generate_snv_cfg.

    Returns:
        0 on success, 1 if any step reported a failure (an error message is
        printed before returning).
    """
    cfg = Parse.generate_snv_cfg(args)
    Parse.print_snv_options(cfg)

    if not cfg['debug']:
        logging.disable(logging.CRITICAL)

    region_file = cfg['region_file']
    regions_df = pd.read_table(region_file, compression='gzip' if region_file.split('.')[-1] == 'gz' else None)
    regions_df = regions_df[regions_df['job'] == int(cfg['job'])].reset_index(drop=True)

    out = cfg['out']
    n_cpus = cfg['cpus']
    return_values = {}
    models_out = {}
    bgzfiles = {}

    print('')
    for m in cfg['model_order']:
        untagged = m == '___no_tag___'
        message = "initializing out file" if untagged else "initializing out file for model " + m
        path = out if untagged else out + '.' + m
        if not _init_out_file(m, path, message, models_out, bgzfiles):
            return 1
    for m in cfg['meta_order']:
        if not _init_out_file(m, out + '.' + m, "initializing out file for meta " + m, models_out, bgzfiles):
            return 1

    if n_cpus > 1:
        # n_cpus - 1 workers; the main process handles the last cpu itself.
        pool = mp.Pool(n_cpus - 1)
        for i in range(1, n_cpus):
            return_values[i] = pool.apply_async(process_regions, args=(regions_df, cfg, i, True,))
            print("submitting job on cpu " + str(i) + " of " + str(n_cpus))
        pool.close()
        print("executing job for cpu " + str(n_cpus) + " of " + str(n_cpus) + " via main process")
        main_return = process_regions(regions_df, cfg, n_cpus, True)
        pool.join()
        if 1 in [return_values[i].get() for i in return_values] or main_return == 1:
            print(Process.Error("error detected, see log files").out)
            return 1
    else:
        main_return = process_regions(regions_df, cfg, 1, True)
        if main_return == 1:
            print(Process.Error("error detected, see log files").out)
            return 1

    cpu_ids = range(1, n_cpus + 1)

    # Replay each per-cpu log on stdout, then remove it.
    for i in cpu_ids:
        log_path = out + '.cpu' + str(i) + '.log'
        try:
            logfile = open(log_path, 'r')
        except (IOError, OSError):
            print(Process.Error("failed to initialize log file " + log_path).out)
            return 1
        with logfile:
            print(logfile.read())
        os.remove(log_path)

    # NOTE(review): when cfg['out'] contains no '/', out_dir is '' and the
    # pickle paths below resolve under '/'. Presumably process_regions builds
    # the same paths -- confirm before changing this expression.
    out_dir = '/'.join(out.split('/')[0:-1])

    for m in cfg['model_order']:
        pkl_paths = [out_dir + '/' + (out + '.cpu' + str(i) + '.' + m).split('/')[-1] + '.pkl' for i in cpu_ids]
        # Model headers are written as-is (no '#' prefix), unlike meta headers.
        tbx_start, tbx_end = _merge_cpu_results(bgzfiles[m], pkl_paths, '')
        print("indexing out file for model " + m if m != '___no_tag___' else "indexing out file")
        if not _tabix_index_out_file(models_out[m], tbx_start, tbx_end):
            return 1

    for m in cfg['meta_order']:
        pkl_paths = [out_dir + '/' + out.split('/')[-1] + '.cpu' + str(i) + '.' + m + '.pkl' for i in cpu_ids]
        tbx_start, tbx_end = _merge_cpu_results(bgzfiles[m], pkl_paths, '#')
        print("indexing out file for meta " + m)
        if not _tabix_index_out_file(models_out[m], tbx_start, tbx_end):
            return 1

    print("process complete")
    return 0


def _init_out_file(tag, path, message, models_out, bgzfiles):
    """Print message, record path under tag and open path + '.gz' for bgzip writing.

    Returns True on success; on failure prints an error and returns False.
    """
    print(message)
    models_out[tag] = path
    try:
        bgzfiles[tag] = bgzf.BgzfWriter(path + '.gz', 'wb')
    except Exception:
        print(Process.Error("failed to initialize bgzip format out file " + path + '.gz').out)
        return False
    return True


def _merge_cpu_results(writer, pkl_paths, header_prefix):
    """Append the results pickled at each of pkl_paths to writer, then close it.

    Each pickle holds (results, metadata, header, tbx_start, tbx_end). The
    metadata and the header line (prefixed with header_prefix) are written
    once, taken from the first pickle. Every pickle is deleted after use.

    Returns the (tbx_start, tbx_end) pair from the last pickle read, or
    (None, None) when pkl_paths is empty.
    """
    header_written = False
    tbx_start = tbx_end = None
    for pkl_path in pkl_paths:
        # pickle.load executes arbitrary code from the file; these paths are
        # derived from cfg['out'] and must only ever point at trusted files.
        with open(pkl_path, "rb") as pkl:
            results, metadata, results_header, tbx_start, tbx_end = pickle.load(pkl)
        if not header_written:
            writer.write(metadata)
            writer.write(header_prefix + "\t".join(results_header) + '\n')
            header_written = True
        if results.shape[0] > 0:
            results.replace({'None': 'NA'}).to_csv(writer, index=False, sep='\t', header=False, na_rep='NA', float_format='%.5g', columns=results_header, append=True)
        os.remove(pkl_path)
    writer.close()
    return tbx_start, tbx_end


def _tabix_index_out_file(path, tbx_start, tbx_end):
    """Tabix-index path + '.gz'; print an error and return False on failure."""
    try:
        pysam.tabix_index(path + '.gz', seq_col=0, start_col=tbx_start, end_col=tbx_end, force=True)
    except Exception:
        print(Process.Error('failed to generate index for file ' + path + '.gz').out)
        return False
    return True