def preprocess_bed_to_cgp_base():
    dsname_list = ['HL60', 'K562', 'APL', 'NA19240']
    # dsname_list = ['HL60', 'K562', 'APL']
    # fnlist = sorted(glob.glob(os.path.join(rawReadDir, '*.strand.bed')), key=os.path.getsize)
    # dsname_list = ['NA19240']
    for dsname in dsname_list:
        logger.info(f'dsname={dsname}')
        convert_region_to_cpg_base(dsname)
def collect_log_data():
    fnlist = glob.glob(os.path.join(basedir, '*.summary*xlsx'))
    logger.info(fnlist)

    dflist = []
    for fn in fnlist:
        df = pd.read_excel(fn)
        df = df.rename(columns={'type': 'tool'})
        dflist.append(df)

    retdf = pd.concat(dflist)
    retdf = retdf[['dsname', 'tool', 'batchid', 'job.results']]

    outfn = os.path.join(pic_base_dir, 'running.logs.on.four.datasets.xlsx')
    retdf.to_excel(outfn)
    return retdf
def analyse_trace():
    df = pd.read_csv(trace_fn, sep='\t')
    logger.info(df)

    ## Extract all meaningful fields into raw format
    # returns: toolname, dsname, reads, duration, realtime, cpu, peak_rss, peak_vmem, rchar, wchar
    df[['tool', 'dsname', 'reads', 'duration', 'realtime', '%cpu', 'peak_rss',
        'peak_vmem', 'rchar', 'wchar']] = df.apply(
        extract_tool_and_dsname_from_name, axis=1, result_type="expand")
    df['reads'] = df['reads'].astype(int)
    df['tool'] = df['tool'].astype(tools_cat_dtype)
    df = df.sort_values(by=['tool', 'reads'])

    outfn = os.path.join(pic_base_dir, 'benchmarking.log.formated.table.step1.all.steps.csv')
    df.to_csv(outfn, sep=',')

    ## Recalculate the time and memory for each tool:
    ## tools that require basecalling inherit the Basecall step cost,
    ## and DeepSignal/Tombo additionally inherit the Resquiggle step cost.
    for index, row in df.iterrows():
        if row['tool'] in ['DeepSignal', 'Tombo', 'DeepMod', 'Nanopolish']:
            # Get the Basecall row for the same dataset
            basecallRow = df.loc[(df.tool == 'Basecall') & (df.dsname == row['dsname']), :].iloc[0, :]
            df.at[index, 'duration'] = df.at[index, 'duration'] + basecallRow['duration']
            df.at[index, 'realtime'] = df.at[index, 'realtime'] + basecallRow['realtime']
            df.at[index, 'peak_rss'] = max(df.at[index, 'peak_rss'], basecallRow['peak_rss'])
            df.at[index, 'peak_vmem'] = max(df.at[index, 'peak_vmem'], basecallRow['peak_vmem'])
            df.at[index, 'rchar'] = max(df.at[index, 'rchar'], basecallRow['rchar'])
            df.at[index, 'wchar'] = max(df.at[index, 'wchar'], basecallRow['wchar'])

        if row['tool'] in ['DeepSignal', 'Tombo']:
            # Get the Resquiggle row for the same dataset
            resquiggleRow = df.loc[(df.tool == 'Resquiggle') & (df.dsname == row['dsname']), :].iloc[0, :]
            df.at[index, 'duration'] = df.at[index, 'duration'] + resquiggleRow['duration']
            df.at[index, 'realtime'] = df.at[index, 'realtime'] + resquiggleRow['realtime']
            df.at[index, 'peak_rss'] = max(df.at[index, 'peak_rss'], resquiggleRow['peak_rss'])  # TODO: why does Resquiggle cost so much memory? may need / 6
            df.at[index, 'peak_vmem'] = max(df.at[index, 'peak_vmem'], resquiggleRow['peak_vmem'])  # TODO: why does Resquiggle cost so much memory? may need / 6
            df.at[index, 'rchar'] = max(df.at[index, 'rchar'], resquiggleRow['rchar'])
            df.at[index, 'wchar'] = max(df.at[index, 'wchar'], resquiggleRow['wchar'])

    ## Filter out non-tool rows
    df = df[df.tool.isin(['DeepSignal', 'Tombo', 'DeepMod', 'Nanopolish', 'Megalodon'])]
    df = df.sort_values(by=['tool', 'reads'])

    outfn = os.path.join(pic_base_dir, 'benchmarking.log.formated.table.step2.all.tools.csv')
    df.to_csv(outfn, sep=',')
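# A minimal, self-contained sketch (illustration only, not called by the pipeline)
# of the aggregation rule used in analyse_trace(): tools that require basecalling
# (and, for DeepSignal/Tombo, resquiggling) have the prerequisite step's runtime
# added to their own, while peak memory is taken as the max across the steps.
# The function name and toy numbers below are hypothetical.
def _example_aggregate_tool_resource():
    import pandas as pd

    toy = pd.DataFrame({
        'tool': ['Basecall', 'Resquiggle', 'Tombo'],
        'realtime': [100.0, 50.0, 30.0],   # seconds (made up)
        'peak_rss': [8.0, 20.0, 4.0],      # GB (made up)
    }).set_index('tool')

    # Runtime is additive across prerequisite steps
    tombo_time = (toy.loc['Tombo', 'realtime']
                  + toy.loc['Basecall', 'realtime']
                  + toy.loc['Resquiggle', 'realtime'])
    # Peak memory is the maximum across the steps, not the sum
    tombo_mem = max(toy.loc['Tombo', 'peak_rss'],
                    toy.loc['Basecall', 'peak_rss'],
                    toy.loc['Resquiggle', 'peak_rss'])
    # Tombo's end-to-end cost here: 180.0 seconds of runtime, 20.0 GB peak memory
    return tombo_time, tombo_mem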
def recalculate(fnlist=['na19240.sumner.task.resource.summary.pkl',
                        'na19240.winter.task.resource.summary.pkl']):
    dflist = []
    for fn in fnlist:
        dflist.append(pd.read_pickle(os.path.join(pkldir, fn)))
    df = pd.concat(dflist)

    df3 = pd.read_pickle(batch_fast5_pkl)
    df = df.merge(df3, on=['dsname', 'batchid'], how='left')
    logger.debug(df)

    dataset = defaultdict(list)
    for index, row in df.iterrows():
        if row['tool'] not in tool_list_on_sumner + tool_list_on_winter:
            continue
        dsname = row['dsname']
        batchid = row['batchid']
        runt = row['running.time.seconds']
        memg = row['mem.usage.gb']

        basecall_row = df[(df['dsname'] == dsname) & (df['batchid'] == batchid) & (df['tool'] == 'basecall')].iloc[0, :]
        resquiggle_row = df[(df['dsname'] == dsname) & (df['batchid'] == batchid) & (df['tool'] == 'resquiggle')].iloc[0, :]

        if row['tool'] in ['DeepSignal', 'Tombo']:
            # DeepSignal and Tombo depend on both the basecall and resquiggle steps
            runt += basecall_row['running.time.seconds'] + resquiggle_row['running.time.seconds']
            memg += basecall_row['mem.usage.gb'] + resquiggle_row['mem.usage.gb']
        elif row['tool'] in ['Nanopolish', 'DeepMod']:
            # Nanopolish and DeepMod depend only on the basecall step
            runt += basecall_row['running.time.seconds']
            memg += basecall_row['mem.usage.gb']

        dataset['dsname'].append(dsname)
        dataset['tool'].append(row['tool'])
        dataset['batchid'].append(row['batchid'])
        dataset['fast5'].append(row['fast5'])
        dataset['running.time.seconds'].append(runt)
        dataset['mem.usage.gb'].append(memg)

    outdf = pd.DataFrame.from_dict(dataset)
    logger.info(outdf)

    outfn = os.path.join(pic_base_dir, 'recalculate.running.summary.na19240.csv')
    outdf.to_csv(outfn)
def unify_data_df():
    df1 = pd.read_pickle(winter_pkl)
    df2 = pd.read_pickle(sunmer_pkl)
    df3 = pd.read_pickle(batch_fast5_pkl)

    df = pd.concat([df1, df2])
    df = df.merge(df3, on=['dsname', 'batchid'], how='left')

    df[['running.time', 'mem.usage', 'running.time.seconds', 'mem.usage.gb']] = df.apply(
        running_resouce_extraction, axis=1)
    logger.info(df)
    logger.info(list(df.columns))

    run_report_columns = ['dsname', 'batchid', 'type', 'fast5',
                          'running.time', 'mem.usage',
                          'running.time.seconds', 'mem.usage.gb', 'job.results']
    outdf = df[run_report_columns]
    outfn = os.path.join(pic_base_dir, 'running.summary.table.xlsx')
    outdf.to_excel(outfn)
    logger.info(f'save to {outfn}')

    outdf = df[run_report_columns[:-1]]
    outfn = os.path.join(pic_base_dir, 'running.summary.table.csv')
    outdf.to_csv(outfn)
    logger.info(f'save to {outfn}')
def gen_benchmarking_data():
    logger.info(f'gen_benchmarking_data for sample size={sample_sizes}')
    fnlist = glob.glob(os.path.join(base_input, '*.fast5'))
    logger.info(len(fnlist))

    os.makedirs(out_dir, exist_ok=True)
    for t in sample_sizes:
        retfnlist = random.sample(fnlist, k=t)
        benchDir = os.path.join(out_dir, f'MB{t:02n}K')
        os.makedirs(benchDir, exist_ok=True)
        for fn in retfnlist:
            # copy fn into benchDir
            copy(fn, benchDir)
        logger.info(f'Copy done for t={t}, benchDir={benchDir}')
recalculate()

if args.megalodon:
    winter_megalodon_task_summary()

if args.na19240_sumner:
    sunmer_task_summary_na19240()

if args.na19240_winter:
    winter_task_summary_na19240()

if args.collect_data:
    infn = os.path.join('/projects/li-lab/yang/results/2021-04-15',
                        'running.logs.on.four.datasets.full.logs.xlsx')
    df = pd.read_excel(infn, index_col=0)

    fast5fn = os.path.join(basedir, 'dsname.batch.fast5.table.xlsx')
    fast5df = pd.read_excel(fast5fn, index_col=0)

    outdf = df.merge(fast5df, on=['dsname', 'batchid'], how='inner')
    logger.info(outdf.iloc[0, :])

    outdf[['cpu.time', 'wall.clock.time', 'mem.usage', 'jobid']] = outdf.apply(
        running_resouce_extraction, axis=1)
    logger.info(df)

    outfn = os.path.join(pic_base_dir, 'running.logs.on.four.datasets.step1.extracted.fields.xlsx')
    outdf.to_excel(outfn)

    outdf = outdf.groupby(by=['dsname', 'tool']).agg(
        {'cpu.time': 'sum', 'wall.clock.time': 'max', 'mem.usage': 'max'})

    # Recalculate the total time and mem usage:
    # add basecall to DeepSignal, Tombo, DeepMod, and Nanopolish
    # add resquiggle to DeepSignal and Tombo
    # remove Guppy methcall
    outfn = os.path.join(pic_base_dir, 'running.logs.on.four.datasets.step2.before.calculation.xlsx')
def convert_region_to_cpg_base(dsname):
    """
    Convert bed files of regions into CpG-base bed files.

    Assume infn is a 0-based file; we print 1-based output,
    because 1-based site-level reports are easy to overlap with region files.
    :param dsname:
    :return:
    """
    fnlist = glob.glob(
        os.path.join(rawReadDir, f'{dsname}*coverage.positivestrand.bed'))
    fnlist += glob.glob(
        os.path.join(rawReadDir, f'{dsname}*coverage.negativestrand.bed'))
    logger.info(f'convert_region_to_cpg_base:{fnlist}')

    outfn = os.path.join(pic_base_dir, f'{dsname}.rawfast5.coverage.base.bed')
    outfile = open(outfn, 'w')

    print_first = True
    for infn in fnlist:
        logger.info(f'processing file={infn}')
        infile = open_file_gz_or_txt(infn)
        for row in tqdm(infile):
            tmp = row.strip().split(" ")
            if print_first:
                print_first = False
                logger.info(f'row={row}, tmp={tmp}')
            chr = tmp[0]
            start = int(tmp[1])
            end = int(tmp[2])
            cov = int(tmp[3])
            # TODO: get strand info; + strand links to CG's C, - strand links to CG's G
            strand = tmp[4]

            if chr not in humanChrSet:  # filter out non-human chrs
                continue

            # we want to get a sequence of at least two bases, to evaluate 'CG' patterns
            # if end == start + 1:
            #     newend = end + 1
            # else:
            #     newend = end
            newend = end + 1
            dnastr = get_dna_seq_from_reference(chr, start, newend, ref_fasta=refFasta)

            for cpg in re.finditer("CG", dnastr):
                cpgstart = start + cpg.start()
                if strand == '+':
                    # point to CG's C: convert that position (0-based) into 1-based
                    outstart = cpgstart + 1
                elif strand == '-':
                    # negative strand, point to CG's G, based on the + strand sequence
                    outstart = cpgstart + 2
                else:
                    raise Exception(f'strand={strand} is not supported')

                out_txt = '\t'.join(
                    [chr, str(outstart), str(outstart), str(cov), '.', strand])
                outfile.write(f'{out_txt}\n')
        infile.close()
    outfile.close()
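# Illustration only (not called by the pipeline): how a CG hit found with
# re.finditer on a 0-based reference slice maps to the 1-based position reported
# by convert_region_to_cpg_base. On the '+' strand the report points at the C
# (0-based index + 1); on the '-' strand it points at the G (0-based index + 2).
# The function name and sequence below are made up.
def _example_cpg_position_mapping():
    import re

    start = 1000                 # 0-based start of the fetched reference slice (made up)
    dnastr = 'TTCGAA'            # made-up reference sequence; the CG begins at offset 2

    cpg = next(re.finditer("CG", dnastr))
    cpgstart = start + cpg.start()           # 0-based position of the C: 1002
    plus_strand_report = cpgstart + 1        # 1-based C position: 1003
    minus_strand_report = cpgstart + 2       # 1-based G position: 1004
    return plus_strand_report, minus_strand_report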
def report_table():
    """
    Generate Figure 2 C and D data
    :return:
    """
    dsname_list = ['HL60', 'K562', 'APL', 'NA19240']
    # dsname_list = ['HL60', 'K562', 'APL']
    # dsname_list = ['NA19240']
    # dsname_list = ['HL60']

    cutoff_list = [1, 3]
    dataset = []
    for dsname in dsname_list:
        fnlist = glob.glob(os.path.join(baseReadCovDir, f'{dsname}*.base.bed'))
        fn = fnlist[0]
        logger.info(fn)

        rawReadBed = BedTool(fn)
        rawReadBed = rawReadBed.sort()
        logger.info(f'len(rawReadBed)={len(rawReadBed):,}')

        dataDict = {'dsname': dsname, 'filename': fn, 'total': len(rawReadBed)}
        retList = []
        with Pool(processes=8) as pool:
            for bedfn in narrowCoordFileList[1:]:
                tagname = location_filename_to_abbvname[os.path.basename(bedfn)]
                ret = pool.apply_async(count_sites_in_coord,
                                       (rawReadBed, bedfn, tagname,),
                                       kwds={'cutoff_list': cutoff_list})
                retList.append(ret)

            concordantFileName = find_bed_filename(
                basedir=datasetBedDir,
                pattern=f'{dsname}*hg38_nonsingletons.concordant.bed')
            ret = pool.apply_async(count_sites_in_coord,
                                   (rawReadBed, concordantFileName, 'Concordant',),
                                   kwds={'cutoff_list': cutoff_list})
            retList.append(ret)

            discordantFileName = find_bed_filename(
                basedir=datasetBedDir,
                pattern=f'{dsname}*hg38_nonsingletons.discordant.bed')
            ret = pool.apply_async(count_sites_in_coord,
                                   (rawReadBed, discordantFileName, 'Discordant',),
                                   kwds={'cutoff_list': cutoff_list})
            retList.append(ret)

            pool.close()
            pool.join()

        retList = [ret.get() for ret in retList]
        for ret in retList:
            dataDict.update(ret)
        dataset.append(dataDict)
        logger.debug(dataDict)

    df = pd.DataFrame(dataset)
    logger.info(df)

    for cutoff in cutoff_list:
        outdf = pd.concat([df.loc[:, ['dsname', 'filename', 'total']],
                           df.filter(regex=f'.cutoff{cutoff}$', axis=1)], axis=1)
        outfn = os.path.join(
            pic_base_dir,
            f'raw.fast5.reads.cpg.coverage.across.regions.cutoff{cutoff}.xlsx')
        outdf.to_excel(outfn)
        logger.info(f'save to {outfn}')
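# A tiny, self-contained sketch (illustration only) of how report_table() assembles
# the per-cutoff output: count_sites_in_coord is assumed to return keys named like
# '<Tag>.cutoff<k>', and DataFrame.filter with a $-anchored regex keeps only the
# columns for one cutoff. The function name, column names, and numbers are made up.
def _example_filter_cutoff_columns():
    import pandas as pd

    df = pd.DataFrame([{'dsname': 'HL60', 'total': 100,
                        'CpG Island.cutoff1': 40, 'CpG Island.cutoff3': 25,
                        'Promoter.cutoff1': 30, 'Promoter.cutoff3': 18}])
    cutoff = 1
    outdf = pd.concat([df.loc[:, ['dsname', 'total']],
                       df.filter(regex=f'.cutoff{cutoff}$', axis=1)], axis=1)
    # outdf keeps only the '*.cutoff1' columns alongside 'dsname' and 'total'
    return outdf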