def preprocess_bed_to_cgp_base():
    dsname_list = ['HL60', 'K562', 'APL', 'NA19240']
    # dsname_list = ['HL60', 'K562', 'APL']
    # fnlist = sorted(glob.glob(os.path.join(rawReadDir, '*.strand.bed')), key=os.path.getsize)
    # dsname_list = ['NA19240']

    for dsname in dsname_list:
        logger.info(f'dsname={dsname}')
        convert_region_to_cpg_base(dsname)
# Example #2
def collect_log_data():
    fnlist = glob.glob(os.path.join(basedir, '*.summary*xlsx'))
    logger.info(fnlist)

    dflist = []
    for fn in fnlist:
        df = pd.read_excel(fn)
        df = df.rename(columns={'type': 'tool'})
        dflist.append(df)
    retdf = pd.concat(dflist)
    retdf = retdf[['dsname', 'tool', 'batchid', 'job.results']]
    outfn = os.path.join(pic_base_dir, 'running.logs.on.four.datasets.xlsx')
    retdf.to_excel(outfn)
    return retdf
# Example #3
def analyse_trace():
    df = pd.read_csv(trace_fn, sep='\t')
    logger.info(df)

    ## Extract all meaningful fields into raw format
    # return toolname, dsname, reads, duration, realtime, cpu, peak_rss, peak_vmem, rchar, wchar
    df[['tool', 'dsname', 'reads', 'duration', 'realtime', '%cpu', 'peak_rss', 'peak_vmem', 'rchar', 'wchar']] = df.apply(extract_tool_and_dsname_from_name, axis=1, result_type="expand")

    df['reads'] = df['reads'].astype(int)

    df['tool'] = df['tool'].astype(tools_cat_dtype)
    df = df.sort_values(by=['tool', 'reads'])

    outfn = os.path.join(pic_base_dir, 'benchmarking.log.formated.table.step1.all.steps.csv')
    df.to_csv(outfn, sep=',')

    ## Recalculate the time and memory for each tool
    for index, row in df.iterrows():
        if row['tool'] in ['DeepSignal', 'Tombo', 'DeepMod', 'Nanopolish']:
            # Get basecalled row
            basecallRow = df.loc[(df.tool == 'Basecall') & (df.dsname == row['dsname']), :].iloc[0, :]
            df.at[index, 'duration'] = df.at[index, 'duration'] + basecallRow['duration']
            df.at[index, 'realtime'] = df.at[index, 'realtime'] + basecallRow['realtime']

            df.at[index, 'peak_rss'] = max(df.at[index, 'peak_rss'], basecallRow['peak_rss'])
            df.at[index, 'peak_vmem'] = max(df.at[index, 'peak_vmem'], basecallRow['peak_vmem'])
            df.at[index, 'rchar'] = max(df.at[index, 'rchar'], basecallRow['rchar'])
            df.at[index, 'wchar'] = max(df.at[index, 'wchar'], basecallRow['wchar'])
        if row['tool'] in ['DeepSignal', 'Tombo']:
            # Get resquiggle row
            resquiggleRow = df.loc[(df.tool == 'Resquiggle') & (df.dsname == row['dsname']), :].iloc[0, :]
            df.at[index, 'duration'] = df.at[index, 'duration'] + resquiggleRow['duration']
            df.at[index, 'realtime'] = df.at[index, 'realtime'] + resquiggleRow['realtime']

            df.at[index, 'peak_rss'] = max(df.at[index, 'peak_rss'], resquiggleRow['peak_rss'])  # TODO: why does Resquiggle use so much memory? May need to divide by 6
            df.at[index, 'peak_vmem'] = max(df.at[index, 'peak_vmem'], resquiggleRow['peak_vmem'])  # TODO: why does Resquiggle use so much memory? May need to divide by 6
            df.at[index, 'rchar'] = max(df.at[index, 'rchar'], resquiggleRow['rchar'])
            df.at[index, 'wchar'] = max(df.at[index, 'wchar'], resquiggleRow['wchar'])

    ## Filter out non-tool rows
    df = df[df.tool.isin(['DeepSignal', 'Tombo', 'DeepMod', 'Nanopolish', 'Megalodon'])]

    df = df.sort_values(by=['tool', 'reads'])
    outfn = os.path.join(pic_base_dir, 'benchmarking.log.formated.table.step2.all.tools.csv')
    df.to_csv(outfn, sep=',')
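# The helper extract_tool_and_dsname_from_name() used above is not shown in this
# listing. A minimal sketch follows, assuming the Nextflow trace 'name' field looks
# like 'DeepSignal (HL60.MB10K)' and that the resource columns are passed through
# unchanged; the name pattern and reads scaling are assumptions, not the original code.
import re


def extract_tool_and_dsname_from_name(row):
    """Split a trace row into (tool, dsname, reads, duration, realtime, %cpu, peak_rss, peak_vmem, rchar, wchar)."""
    # hypothetical name format: 'DeepSignal (HL60.MB10K)'
    match = re.match(r'(?P<tool>\w+)\s*\((?P<dsname>[\w-]+)\.MB(?P<reads>\d+)K\)', row['name'])
    tool = match.group('tool')
    dsname = match.group('dsname')
    reads = int(match.group('reads')) * 1000  # MBxxK encodes thousands of reads
    # pass the raw resource columns through unchanged
    return (tool, dsname, reads,
            row['duration'], row['realtime'], row['%cpu'],
            row['peak_rss'], row['peak_vmem'], row['rchar'], row['wchar'])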
# Example #4
def recalculate(fnlist=['na19240.sumner.task.resource.summary.pkl', 'na19240.winter.task.resource.summary.pkl']):
    dflist = []
    for fn in fnlist:
        dflist.append(pd.read_pickle(os.path.join(pkldir, fn)))
    df = pd.concat(dflist)

    df3 = pd.read_pickle(batch_fast5_pkl)
    df = df.merge(df3, on=['dsname', 'batchid'], how='left')
    logger.debug(df)

    dataset = defaultdict(list)
    for index, row in df.iterrows():
        if row['tool'] not in tool_list_on_sumner + tool_list_on_winter:
            continue
        dsname = row['dsname']
        batchid = row['batchid']
        runt = row['running.time.seconds']
        memg = row['mem.usage.gb']

        basecall_row = df[(df['dsname'] == dsname) & (df['batchid'] == batchid) & (df['tool'] == 'basecall')].iloc[0, :]
        resquiggle_row = df[(df['dsname'] == dsname) & (df['batchid'] == batchid) & (df['tool'] == 'resquiggle')].iloc[0, :]

        if row['tool'] in ['DeepSignal', 'Tombo']:
            runt += basecall_row['running.time.seconds'] + resquiggle_row['running.time.seconds']
            memg += basecall_row['mem.usage.gb'] + resquiggle_row['mem.usage.gb']
        elif row['tool'] in ['Nanopolish', 'DeepMod']:
            runt += basecall_row['running.time.seconds']
            memg += basecall_row['mem.usage.gb']
        dataset['dsname'].append(dsname)
        dataset['tool'].append(row['tool'])
        dataset['batchid'].append(row['batchid'])
        dataset['fast5'].append(row['fast5'])
        dataset['running.time.seconds'].append(runt)
        dataset['mem.usage.gb'].append(memg)
    outdf = pd.DataFrame.from_dict(dataset)
    logger.info(outdf)
    outfn = os.path.join(pic_base_dir, 'recalculate.running.summary.na19240.csv')
    outdf.to_csv(outfn)
# Example #5
def unify_data_df():
    df1 = pd.read_pickle(winter_pkl)
    df2 = pd.read_pickle(sunmer_pkl)
    df3 = pd.read_pickle(batch_fast5_pkl)
    df = pd.concat([df1, df2])

    df = df.merge(df3, on=['dsname', 'batchid'], how='left')

    df[['running.time', 'mem.usage', 'running.time.seconds', 'mem.usage.gb']] = df.apply(running_resouce_extraction, axis=1)

    logger.info(df)

    logger.info(list(df.columns))

    run_report_columns = ['dsname', 'batchid', 'type', 'fast5', 'running.time', 'mem.usage', 'running.time.seconds', 'mem.usage.gb', 'job.results']
    outdf = df[run_report_columns]
    outfn = os.path.join(pic_base_dir, 'running.summary.table.xlsx')
    outdf.to_excel(outfn)
    logger.info(f'save to {outfn}')

    outdf = df[run_report_columns[:-1]]
    outfn = os.path.join(pic_base_dir, 'running.summary.table.csv')
    outdf.to_csv(outfn)
    logger.info(f'save to {outfn}')
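# The helper running_resouce_extraction() used above is not shown in this listing.
# A minimal sketch follows, assuming a seff-style 'job.results' text with lines such
# as 'CPU Utilized: 01:02:03' and 'Memory Utilized: 12.34 GB'; the text format is an
# assumption. Note that the collect_data branch below expects different output columns
# from a function of the same name, so this sketch only matches unify_data_df().
import re

import pandas as pd


def running_resouce_extraction(row):
    """Return ('running.time', 'mem.usage', 'running.time.seconds', 'mem.usage.gb') for one job row."""
    text = str(row['job.results'])
    time_match = re.search(r'CPU Utilized:\s*(\d+):(\d+):(\d+)', text)  # hypothetical line format
    mem_match = re.search(r'Memory Utilized:\s*([\d.]+)\s*GB', text)    # hypothetical line format
    runtime_str = time_match.group(0) if time_match else None
    mem_str = mem_match.group(0) if mem_match else None
    seconds = (int(time_match.group(1)) * 3600 + int(time_match.group(2)) * 60 +
               int(time_match.group(3))) if time_match else None
    mem_gb = float(mem_match.group(1)) if mem_match else None
    # returning a Series lets df.apply(..., axis=1) expand into the four columns
    return pd.Series([runtime_str, mem_str, seconds, mem_gb])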
# Example #6
def gen_benchmarking_data():
    logger.info(f'gen_benchmarking_data for sample size={sample_sizes}')
    fnlist = glob.glob(os.path.join(base_input, '*.fast5'))

    logger.info(len(fnlist))
    os.makedirs(out_dir, exist_ok=True)

    for t in sample_sizes:
        retfnlist = random.sample(fnlist, k=t)

        benchDir = os.path.join(out_dir, f'MB{t:02n}K')
        os.makedirs(benchDir, exist_ok=True)

        for fn in retfnlist:
            # copy fn into benchDir
            copy(fn, benchDir)
        logger.info(f'Copy done for t={t}, benchDir={benchDir}')
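# Module-level names assumed by gen_benchmarking_data(); the values here are
# illustrative placeholders, not the original configuration.
import os
import random
from shutil import copy

base_input = '/path/to/all/fast5'          # hypothetical directory holding all fast5 files
out_dir = '/path/to/benchmarking/subsets'  # hypothetical output directory for the subsets
sample_sizes = [1, 2, 4, 8]                # hypothetical subset sizes (in thousands of reads)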
# Example #7
        recalculate()
    if args.megalodon:
        winter_megalodon_task_summary()
    if args.na19240_sumner:
        sunmer_task_summary_na19240()
    if args.na19240_winter:
        winter_task_summary_na19240()
    if args.collect_data:
        infn = os.path.join('/projects/li-lab/yang/results/2021-04-15', 'running.logs.on.four.datasets.full.logs.xlsx')
        df = pd.read_excel(infn, index_col=0)

        fast5fn = os.path.join(basedir, 'dsname.batch.fast5.table.xlsx')
        fast5df = pd.read_excel(fast5fn, index_col=0)

        outdf = df.merge(fast5df, on=['dsname', 'batchid'], how='inner')
        logger.info(outdf.iloc[0, :])

        outdf[['cpu.time', 'wall.clock.time', 'mem.usage', 'jobid']] = outdf.apply(running_resouce_extraction, axis=1)
        logger.info(outdf)

        outfn = os.path.join(pic_base_dir, 'running.logs.on.four.datasets.step1.extracted.fields.xlsx')
        outdf.to_excel(outfn)

        outdf = outdf.groupby(by=['dsname', 'tool']).agg({'cpu.time': 'sum', 'wall.clock.time': 'max', 'mem.usage': 'max'})

        # Recalculate the total time and mem usage
        # add basecall to DeepSignal, Tombo, DeepMod, and Nanopolish
        # add resquiggle to DeepSignal and Tombo
        # remove Guppy methcall

        outfn = os.path.join(pic_base_dir, 'running.logs.on.four.datasets.step2.before.calculation.xlsx')
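# The snippet above is truncated before the recalculation it describes. Below is a
# hedged sketch (not the original code) of how the grouped outdf could be adjusted:
# fold Basecall into DeepSignal/Tombo/DeepMod/Nanopolish, fold Resquiggle into
# DeepSignal/Tombo, and drop the Guppy methylation-calling rows. The tool labels
# ('basecall', 'resquiggle', 'Guppy') and the column handling are assumptions.
def fold_preprocessing_steps(outdf):
    outdf = outdf.copy()
    for dsname, tool in outdf.index:
        if tool in ['DeepSignal', 'Tombo', 'DeepMod', 'Nanopolish']:
            base = outdf.loc[(dsname, 'basecall')]
            outdf.loc[(dsname, tool), 'cpu.time'] += base['cpu.time']
            outdf.loc[(dsname, tool), 'mem.usage'] = max(
                outdf.loc[(dsname, tool), 'mem.usage'], base['mem.usage'])
        if tool in ['DeepSignal', 'Tombo']:
            resq = outdf.loc[(dsname, 'resquiggle')]
            outdf.loc[(dsname, tool), 'cpu.time'] += resq['cpu.time']
            outdf.loc[(dsname, tool), 'mem.usage'] = max(
                outdf.loc[(dsname, tool), 'mem.usage'], resq['mem.usage'])
    # remove the Guppy methylation-calling rows
    return outdf[outdf.index.get_level_values('tool') != 'Guppy']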
def convert_region_to_cpg_base(dsname):
    """
    Conver bed file of regions into CpG base bed files

    Assume infn is 0-based file, we print 1-based output. Because 1-based site level reports are easy to stats overlap with region files.
    :param infn:
    :return:
    """

    fnlist = glob.glob(
        os.path.join(rawReadDir, f'{dsname}*coverage.positivestrand.bed'))
    fnlist += glob.glob(
        os.path.join(rawReadDir, f'{dsname}*coverage.negativestrand.bed'))

    logger.info(f'convert_region_to_cpg_base:{fnlist}')

    outfn = os.path.join(pic_base_dir, f'{dsname}.rawfast5.coverage.base.bed')
    outfile = open(outfn, 'w')

    print_first = True
    for infn in fnlist:
        logger.info(f'processing file={infn}')
        infile = open_file_gz_or_txt(infn)
        for row in tqdm(infile):
            tmp = row.strip().split(" ")
            if print_first:
                print_first = False
                logger.info(f'row={row}, tmp={tmp}')
            chr = tmp[0]
            start = int(tmp[1])
            end = int(tmp[2])
            cov = int(tmp[3])
            # TODO: use strand info: + strand maps to CG's C, - strand maps to CG's G
            strand = tmp[4]

            if chr not in humanChrSet:  # filter out non-human chrs
                continue

            # we want a sequence of at least two bases so that 'CG' patterns can be evaluated
            # if end == start + 1:
            #     newend = end + 1
            # else:
            #     newend = end
            newend = end + 1

            dnastr = get_dna_seq_from_reference(chr, start, newend, ref_fasta=refFasta)
            for cpg in re.finditer("CG", dnastr):
                cpgstart = start + cpg.start()
                if strand == '+':  # points to CG's C; convert that 0-based position to 1-based
                    outstart = cpgstart + 1
                elif strand == '-':  # negative strand points to CG's G, based on the + strand sequence
                    outstart = cpgstart + 2
                else:
                    raise Exception(f'strand={strand} is not supported')

                out_txt = '\t'.join([chr, str(outstart), str(outstart), str(cov), '.', strand])
                outfile.write(f'{out_txt}\n')
        infile.close()
    outfile.close()
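# Minimal sketches of the two helpers assumed by convert_region_to_cpg_base().
# open_file_gz_or_txt() just handles optional gzip compression; the pysam-based
# reference lookup in get_dna_seq_from_reference() is an assumption, not necessarily
# the original implementation (refFasta would then be a pysam.FastaFile on hg38).
import gzip

import pysam


def open_file_gz_or_txt(infn):
    """Open a plain-text or gzip-compressed text file transparently."""
    if infn.endswith('.gz'):
        return gzip.open(infn, 'rt')
    return open(infn, 'r')


def get_dna_seq_from_reference(chr, start, end, ref_fasta=None):
    """Return the 0-based [start, end) reference sequence of chr, upper-cased for 'CG' matching."""
    return ref_fasta.fetch(chr, start, end).upper()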
def report_table():
    """
    Generate Figure 2 C and D data
    :return:
    """
    dsname_list = ['HL60', 'K562', 'APL', 'NA19240']
    # dsname_list = ['HL60', 'K562', 'APL']
    # dsname_list = ['NA19240']
    # dsname_list = ['HL60']

    cutoff_list = [1, 3]

    dataset = []
    for dsname in dsname_list:
        fnlist = glob.glob(os.path.join(baseReadCovDir, f'{dsname}*.base.bed'))
        fn = fnlist[0]
        logger.info(fn)

        rawReadBed = BedTool(fn)
        rawReadBed = rawReadBed.sort()
        logger.info(f'len(rawReadBed)={len(rawReadBed):,}')
        dataDict = {'dsname': dsname, 'filename': fn, 'total': len(rawReadBed)}
        retList = []
        with Pool(processes=8) as pool:
            for bedfn in narrowCoordFileList[1:]:
                tagname = location_filename_to_abbvname[os.path.basename(bedfn)]
                ret = pool.apply_async(
                    count_sites_in_coord,
                    (rawReadBed, bedfn, tagname),
                    kwds={'cutoff_list': cutoff_list})
                retList.append(ret)

            concordantFileName = find_bed_filename(
                basedir=datasetBedDir,
                pattern=f'{dsname}*hg38_nonsingletons.concordant.bed')
            ret = pool.apply_async(
                count_sites_in_coord,
                (rawReadBed, concordantFileName, 'Concordant'),
                kwds={'cutoff_list': cutoff_list})
            retList.append(ret)

            discordantFileName = find_bed_filename(
                basedir=datasetBedDir,
                pattern=f'{dsname}*hg38_nonsingletons.discordant.bed')
            ret = pool.apply_async(
                count_sites_in_coord,
                (rawReadBed, discordantFileName, 'Discordant'),
                kwds={'cutoff_list': cutoff_list})
            retList.append(ret)

            pool.close()
            pool.join()
        retList = [ret.get() for ret in retList]
        for ret in retList:
            dataDict.update(ret)

        dataset.append(dataDict)
        logger.debug(dataDict)
    df = pd.DataFrame(dataset)
    logger.info(df)

    for cutoff in cutoff_list:
        outdf = pd.concat(
            [df.loc[:, ['dsname', 'filename', 'total']],
             df.filter(regex=rf'\.cutoff{cutoff}$', axis=1)],
            axis=1)
        outfn = os.path.join(
            pic_base_dir,
            f'raw.fast5.reads.cpg.coverage.across.regions.cutoff{cutoff}.xlsx')
        outdf.to_excel(outfn)
        logger.info(f'save to {outfn}')
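# A hedged sketch of count_sites_in_coord() as it is called above: intersect the
# per-CpG coverage BED with a region BED and count sites whose coverage (column 4)
# reaches each cutoff, returning keys that match the '.cutoff{cutoff}' regex filter
# used in report_table(). The column layout and key names are assumptions.
from pybedtools import BedTool


def count_sites_in_coord(rawReadBed, coordfn, tagname, cutoff_list=(1, 3)):
    """Count CpG sites from rawReadBed falling inside coordfn, per coverage cutoff."""
    coordBed = BedTool(coordfn).sort()
    # u=True reports each CpG site at most once if it overlaps any region
    overlapped = rawReadBed.intersect(coordBed, u=True)
    ret = {}
    for cutoff in cutoff_list:
        nsites = sum(1 for site in overlapped if int(site.fields[3]) >= cutoff)
        ret[f'{tagname}.cutoff{cutoff}'] = nsites
    return ret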