def dalignbed2dalignbedguidesseq(cfg):
    """
    Get sequences from BED file
    step#6

    :param cfg: configuration dict
    :returns: cfg (unchanged); writes the merged guide+sequence table to disk
    """
    datatmpd = cfg['datatmpd']  # kept for parity with sibling steps (unused here)
    guides = del_Unnamed(pd.read_csv(cfg['dalignbedguidesp'], sep='\t'))
    seqs = del_Unnamed(pd.read_csv(cfg['dalignedfastap'], sep='\t'))
    outp = cfg['dalignbedguidesseqp']
    logging.info(basename(outp))
    if not exists(outp) or cfg['force']:
        # join guides with their aligned sequences on 'id', drop rows that
        # lack an aligned sequence, and de-duplicate before writing
        merged = (guides.merge(seqs, on='id', suffixes=('', '.2'))
                        .dropna(subset=['aligned sequence'], axis=0)
                        .drop_duplicates())
        merged.to_csv(outp, sep='\t')
    return cfg
# Beispiel #2  (scrape-artifact separator, commented out so the file parses)
# 0
def dalignbed2dalignbedguides(cfg):
    """
    Get guide sequences from the BED file
    step#4

    :param cfg: configuration dict
    :returns: cfg (unchanged); writes the guide-annotated BED table to disk
    """
    datatmpd = cfg['datatmpd']  # kept for parity with sibling steps (unused here)
    dalignbed = del_Unnamed(
        pd.read_csv(cfg['dalignbedp'], sep='\t', keep_default_na=False))
    dguides = set_index(
        del_Unnamed(
            pd.read_csv(cfg['dguidesp'], sep='\t', keep_default_na=False)),
        'guide: id')

    #     if the error in human, use: `cut -f 1 data/alignment.bed.sorted.bed | sort| uniq -c | grep -v CHR | grep -v GL | grep -v KI`
    dalignbedguidesp = cfg['dalignbedguidesp']
    logging.info(basename(dalignbedguidesp))
    if not exists(dalignbedguidesp) or cfg['force']:
        dalignbed = pd.merge(dalignbed,
                             dguides,
                             on='guide: id',
                             suffixes=('', '.1'))
        # BUG FIX: '\t' was previously passed positionally; to_csv's arguments
        # after the path are keyword-only in modern pandas, so pass sep
        # explicitly (this also matches every other to_csv call in the file).
        dalignbed.to_csv(dalignbedguidesp, sep='\t')
    return cfg
def dalignbedannot2daggbyguide(cfg):
    """
    Aggregate annotations per alignment to annotations per guide.
    step#10

    :param cfg: configuration dict
    :returns: cfg (unchanged); writes 10_daggbyguide.tsv and cfg['dofftargetsp']
    """
    datatmpd = cfg['datatmpd']

    dalignbedannot = del_Unnamed(
        pd.read_csv(cfg['dalignbedannotp'], sep='\t', low_memory=False))

    daggbyguidep = '{}/10_daggbyguide.tsv'.format(datatmpd)
    logging.info(basename(daggbyguidep))
    if not exists(daggbyguidep) or cfg['force']:
        # one row per guide, seeded from its perfect-match (NM == 0) alignments
        daggbyguide = dalignbedannot.loc[(dalignbedannot['NM'] == 0), [
            'guide: id', 'guide+PAM sequence', 'gene names', 'gene ids',
            'transcript ids'
        ]].drop_duplicates(subset=['guide: id'])
        if len(daggbyguide) != 0:
            daggbyguide = set_index(daggbyguide, 'guide: id')
            # collapse per-alignment annotation columns into ';'-joined
            # unique values per guide (iterate the ids directly instead of
            # the old range(len(...)) index loop)
            for gid in daggbyguide.index.tolist():
                dalignbedannoti = dalignbedannot.loc[
                    dalignbedannot['guide: id'] == gid, :]
                if len(dalignbedannoti.shape) == 1:
                    # a single match comes back as a Series; promote to a frame
                    dalignbedannoti = pd.DataFrame(dalignbedannoti).T
                for col in [
                        'types', 'gene names', 'gene ids', 'transcript ids',
                        'protein ids', 'exon ids'
                ]:
                    daggbyguide.loc[gid, col] = ";".join(
                        np.unique(dalignbedannoti[col].fillna('nan').tolist()))
            from beditor.lib.get_scores import get_beditorscore_per_guide
            # guide-level beditor score from its per-alignment scores
            for guideid in daggbyguide.index:
                dalignbedannotguide = dalignbedannot.loc[(
                    dalignbedannot['guide: id'] == guideid), :]
                daggbyguide.loc[
                    guideid, 'beditor score'] = get_beditorscore_per_guide(
                        guide_seq=dalignbedannotguide['guide+PAM sequence'].
                        unique()[0],
                        strategy=dalignbedannotguide['strategy'].unique()[0],
                        align_seqs_scores=dalignbedannotguide['beditor score'],
                        BEs=cfg['BEs']
                        #                                        test=cfg['test']
                    )
                daggbyguide.loc[guideid, 'CFD score'] = dalignbedannotguide[
                    'CFD score'].mean()  #FIXME if mean is not appropriate
            daggbyguide['beditor score (log10)'] = daggbyguide[
                'beditor score'].apply(np.log10)
            # count every alignment of a guide as one alternate alignment
            dalignbedannot['alternate alignments count'] = 1
            daggbyguide = daggbyguide.join(
                pd.DataFrame(
                    dalignbedannot.groupby('guide: id')
                    ['alternate alignments count'].agg('sum')))
            daggbyguide.to_csv(daggbyguidep, sep='\t')
            daggbyguide.to_csv(cfg['dofftargetsp'], sep='\t')
    return cfg
# Beispiel #4  (scrape-artifact separator, commented out so the file parses)
# 0
def dannotsagg2dannots2dalignbedannot(cfg):
    """
    Map aggregated annotations to guides
    step#9

    :param cfg: configuration dict
    :returns: cfg (unchanged); writes the annotated alignment table to disk
    """
    datatmpd = cfg['datatmpd']  # kept for parity with sibling steps (unused here)

    dannotsagg = del_Unnamed(
        pd.read_csv(cfg['dannotsaggp'], sep='\t', keep_default_na=False))
    dalignbedstats = del_Unnamed(
        pd.read_csv(cfg['dalignbedstatsp'], sep='\t', keep_default_na=False))
    outp = cfg['dalignbedannotp']
    logging.info(basename(outp))
    if not exists(outp) or cfg['force']:
        # join alignment stats with the per-id aggregated annotations
        annot = dalignbedstats.set_index('id').join(
            set_index(dannotsagg, 'id'), rsuffix=' annotation')
        annot['NM'] = annot['NM'].apply(int)
        from beditor.lib.get_scores import get_beditorscore_per_alignment, get_cfdscore

        def _beditor_row(row):
            # per-alignment beditor score from mismatch count, region and PAM
            return get_beditorscore_per_alignment(
                NM=row['NM'],
                genic=(row['region'] == 'genic'),
                alignment=row['alignment'],
                pam_length=len(row['PAM']),
                pam_position=row['original position'],
                # test=cfg['test'],
            )

        annot['beditor score'] = annot.apply(_beditor_row, axis=1)
        annot['CFD score'] = annot.apply(
            lambda row: get_cfdscore(row['guide+PAM sequence'].upper(),
                                     row['aligned sequence'].upper()),
            axis=1)
        # alignments without a computable CFD score default to 0
        annot['CFD score'] = annot['CFD score'].fillna(0)
        annot.to_csv(outp, sep='\t')
    return cfg
def dalignbedguidesseq2dalignbedstats(cfg):
    """
    Gets scores for guides
    step#7

    :param cfg: configuration dict
    :returns: cfg (unchanged); writes the alignment-stats table to disk
    """
    datatmpd = cfg['datatmpd']  # kept for parity with sibling steps (unused here)
    dseq = del_Unnamed(pd.read_csv(cfg['dalignbedguidesseqp'], sep='\t'))

    outp = cfg['dalignbedstatsp']
    logging.info(basename(outp))
    if not exists(outp) or cfg['force']:
        # align each guide+PAM against its genomic hit; expand the
        # (alignment, score) pairs into two columns
        aligned = dseq.apply(
            lambda row: align(row['guide+PAM sequence'],
                              row['aligned sequence']),
            axis=1).apply(pd.Series)
        aligned.columns = ['alignment', 'alignment: score']
        dstats = dseq.join(aligned)
        del aligned
        dstats.to_csv(outp, sep='\t')
    return cfg
def dannots2dalignbed2dannotsagg(cfg):
    """
    Aggregate annotations per guide
    step#8

    :param cfg: configuration dict
    :returns: cfg with cfg['daannotp'] set; writes 08_dannot.tsv and the
        aggregated annotations table (cfg['dannotsaggp'])
    """
    datatmpd = cfg['datatmpd']

    daannotp = f'{datatmpd}/08_dannot.tsv'
    cfg['daannotp'] = daannotp
    dannotsaggp = cfg['dannotsaggp']
    logging.info(basename(daannotp))
    if ((not exists(daannotp)) and (not exists(dannotsaggp))) or cfg['force']:
        # GFF columns that clash with BED columns get an ' annotation' suffix
        dannots = pd.read_csv(
            cfg['annotationsbedp'],
            sep='\t',
            names=bed_colns + [
                c + ' annotation'
                if c in set(bed_colns).intersection(gff_colns) else c
                for c in gff_colns
            ],
            low_memory=False)
        dannots = del_Unnamed(dannots)

        dannots = dannots.set_index('id')
        dannots['annotations count'] = 1
        # separate ids from attribute columns
        dannots = lambda2cols(dannots,
                              lambdaf=gffatributes2ids,
                              in_coln='attributes',
                              to_colns=[
                                  'gene name', 'gene id', 'transcript id',
                                  'protein id', 'exon id'
                              ])

        # human-readable locus string, e.g. 'chrI:100-200(+)'
        dannots['annotation coordinate'] = dannots.apply(
            lambda x: '{}:{}-{}({})'.format(x['chromosome annotation'],
                                            x['start annotation'],
                                            x['end annotation'],
                                            x['strand annotation']),
            axis=1)
        logging.debug('or this step takes more time?')
        dannots.to_csv(daannotp, sep='\t')
    else:
        dannots = pd.read_csv(daannotp, sep='\t', low_memory=False)
        dannots = del_Unnamed(dannots)
    logging.info(basename(dannotsaggp))
    if not exists(dannotsaggp) or cfg['force']:
        # defensive re-read (both branches above bind dannots, so this is
        # effectively dead); fixed to PEP8 'not in' form
        if 'dannots' not in locals():
            dannots = pd.read_table(daannotp, low_memory=False)
        dannots = del_Unnamed(dannots)
        dannots = dannots.reset_index()

        # NOTE(review): the -1 presumably discounts one baseline row per id
        # — TODO confirm against the upstream intersection output
        dannotsagg = pd.DataFrame(
            dannots.groupby('id')['annotations count'].agg('sum')) - 1
        dannotsagg.loc[dannotsagg['annotations count'] == 0,
                       'region'] = 'intergenic'
        dannotsagg.loc[dannotsagg['annotations count'] != 0,
                       'region'] = 'genic'

        alignids = dannots['id'].unique()  #[:15]
        logging.debug('start of the slowest step')
        # iterate the ids directly instead of the old range(len(...)) loop
        for alignid in alignids:
            dannoti = dannots.loc[dannots['id'] == alignid, :]
            if len(dannoti.shape) == 1:
                # a single match comes back as a Series; promote to a frame
                dannoti = pd.DataFrame(dannoti).T
            dannoti = dannoti.loc[
                dannoti['type'] != 'chromosome', :].drop_duplicates(
                    subset=['start annotation', 'end annotation'])
            for col in [
                    'type', 'gene name', 'gene id', 'transcript id',
                    'protein id', 'exon id'
            ]:
                # collapse each annotation column into ';'-joined unique values
                dannotsagg.loc[alignid, col + 's'] = ";".join(
                    np.unique(dannoti[col].fillna('nan').tolist()))
        logging.debug('end of the slowest step')

        del dannots
        dannotsagg = dannotsagg.reset_index()
        dannotsagg.to_csv(dannotsaggp, sep='\t')
    return cfg
# Beispiel #7  (scrape-artifact separator, commented out so the file parses)
# 0
def plot_vizbysteps(cfg):
    """
    Make stepwise diagnostic plots in <prjd>/05_output.

    :param cfg: main configuration dict; integer keys map step indices to
        step directories
    """
    prjd = cfg['prjd']
    #make one output table and stepwise plots
    datad = f"{prjd}/05_output"

    # step2 # make submap
    stepi = 2
    plotp = f"{datad}/plot_d{cfg[stepi].replace('/','').split('_')[-1]}_substitution_map"
    plotps = glob(plotp + '*')
    if len(plotps) == 0 or cfg['force']:
        plotpf = plotp + "_{mutation_type}.png"
        dstepp = f"{cfg[stepi]}/d{cfg[stepi].replace('/','').split('_')[-1]}.tsv"
        if exists(dstepp):
            dstep = del_Unnamed(pd.read_table(
                dstepp, keep_default_na=False)).drop_duplicates()
            if len(dstep) < 1000:
                logging.info('plot_submap_possibilities')
                plot_submap_possibilities(dmutagenesis=dstep,
                                          plotpf=plotpf,
                                          test=False)
            else:
                # plot becomes unreadable (and slow) for large tables
                logging.warning(f'skipped: plot_submap_possibilities')
        else:
            logging.warning(f'not found: {dstepp}')

    # step3
    # stats by strategies
    stepi = 3
    plotp = f"{datad}/plot_d{cfg[stepi].replace('/','').split('_')[-1]}_stats_by_strategies.png"
    if not exists(plotp) or cfg['force']:
        dstepp = f"{cfg[stepi]}/d{cfg[stepi].replace('/','').split('_')[-1]}.tsv"
        if exists(dstepp):
            dstep = del_Unnamed(pd.read_table(
                dstepp, keep_default_na=False)).drop_duplicates()
            logging.info('plot_bar_dguides')
            plot_bar_dguides(dstep, plotp)
        else:
            logging.warning(f'not found: {dstepp}')

    # make nt_composition plot
    stepi = 3
    plotp = f"{datad}/plot_d{cfg[stepi].replace('/','').split('_')[-1]}_nt_compositions"
    plotps = glob(plotp + '*')
    if len(plotps) == 0 or cfg['force']:
        plotpf = plotp + "_{method}.png"
        makedirs(dirname(plotp), exist_ok=True)
        dstepp = f"{cfg[stepi]}/d{cfg[stepi].replace('/','').split('_')[-1]}.tsv"
        if exists(dstepp):
            dstep = del_Unnamed(pd.read_table(
                dstepp, keep_default_na=False)).drop_duplicates()
            #             dbepams=pd.read_table(f'{dirname(realpath(__file__))}/../data/dbepams.tsv')
            dbepams = pd.read_table(cfg['dbepamsp'], keep_default_na=False)
            dpam = dbepams.loc[:, cols_dpam].drop_duplicates()
            dpam = set_index(dpam, 'PAM')
            logging.info('plot_dist_dguides')
            plot_dist_dguides(dstep, dpam, plotpf)
        else:
            logging.warning(f'not found: {dstepp}')

    # make plot_dna_features_view
    stepi = 3
    plotd = f"{datad}/plot_d{cfg[stepi].replace('/','').split('_')[-1]}_dna_features_view"
    plotps = glob(plotd + '/*')
    if len(plotps) == 0 or cfg['force']:
        dguidesp = f"{cfg[stepi]}/d{cfg[stepi].replace('/','').split('_')[-1]}.tsv"
        dsequencesp = f"{cfg[stepi-2]}/d{cfg[stepi-2].replace('/','').split('_')[-1]}.tsv"
        if exists(dguidesp):
            logging.info('plot_dna_features_view')
            plot_dna_features_view(
                cfg,
                dsequences=del_Unnamed(
                    pd.read_table(dsequencesp,
                                  keep_default_na=False)).drop_duplicates(),
                dguides=del_Unnamed(
                    pd.read_table(dguidesp,
                                  keep_default_na=False)).drop_duplicates(),
                plotd=plotd,
                more=False)
        else:
            # BUG FIX: this previously logged dstepp (a leftover variable from
            # the section above) instead of the path actually checked here.
            logging.warning(f'not found: {dguidesp}')

#     # step2 # make submap #FIXME get all the columns used for plotting in the dguides.
#     stepi=3
#     plotp=f"{datad}/plot_d{cfg[stepi].replace('/','').split('_')[-1]}_submap_used_for_mutagenesis"
#     plotps=glob(plotp+'*')
#     if len(plotps)==0 or cfg['force']:
#         plotpf=plotp+"_{mutation_type}.png"
#         dstepp=f"{cfg[stepi]}/d{cfg[stepi].replace('/','').split('_')[-1]}.tsv"
#         dstep=del_Unnamed(pd.read_table(dstepp)).drop_duplicates()
#         logging.info('plot_submap_possibilities')
#         plot_submap_possibilities(dmutagenesis=dstep,
#                                   plotpf=plotpf,test=False)

# step4 offtargets correlations
    stepi = 4
    plotp = f"{datad}/plot_d{cfg[stepi].replace('/','').split('_')[-1]}_dist_beditor_score.png"
    if not exists(plotp) or cfg['force']:
        dstepp = f"{cfg[stepi]}/d{cfg[stepi].replace('/','').split('_')[-1]}.tsv"
        if exists(dstepp):
            dstep = del_Unnamed(pd.read_table(
                dstepp, keep_default_na=False)).drop_duplicates()
            logging.info('plot_dist_dofftargets')
            plot_dist_dofftargets(dstep, plotp)
        else:
            logging.warning(f'not found: {dstepp}')
# Beispiel #8  (scrape-artifact separator, commented out so the file parses)
# 0
def make_outputs(cfg, plot=True):
    """
    Combines stepwise analysis files into a pretty table.

    :param cfg: main configuration dict
    :param plot: if True creates visualizations
    :returns: combined output dataframe, or None if step outputs are missing
    """
    print(f"{get_datetime()}: generating outputs")
    from beditor.lib.global_vars import stepi2colsoutput
    prjd = cfg['prjd']
    #make one output table and stepwise plots
    datad = f"{prjd}/05_output"
    makedirs(datad, exist_ok=True)
    #table
    doutputp = f"{datad}/doutput.tsv"  #FIXME if steps are added
    if not exists(doutputp) or cfg['force']:
        from beditor.lib.io_dfs import del_Unnamed
        # BUG FIX: use an explicit None sentinel instead of probing locals();
        # previously, if no step file existed, `doutput` was never bound and
        # the code below raised NameError.
        doutput = None
        for stepi in range(5):
            if stepi != 2 and cfg['step2ignore'] != stepi:
                dstepp = f"{cfg[stepi]}/d{cfg[stepi].replace('/','').split('_')[-1]}.tsv"
                if exists(dstepp):
                    logging.info(f'combining {stepi}')
                    colsoutput = stepi2colsoutput[stepi]
                    dstep = del_Unnamed(
                        pd.read_table(dstepp, keep_default_na=False))
                    # reverse-mutation runs skip step 0's table
                    if 'reverse_mutations' in cfg:
                        if cfg['reverse_mutations']:
                            if stepi == 0:
                                continue
                    # keep only the output columns this step actually has
                    colsoutput = [col for col in colsoutput if col in dstep]
                    dstep = dstep.loc[:, colsoutput]
                    if len(dstep) != 0:
                        dstep = dstep.drop_duplicates()
                    if doutput is None:
                        doutput = dstep.copy()
                        del dstep
                    else:
                        # merge successive steps on their shared columns
                        cols_on = list(
                            set(doutput.columns.tolist()).intersection(
                                dstep.columns.tolist()))
                        if len(cols_on) != 0:
                            doutput = pd.merge(doutput,
                                               dstep,
                                               on=cols_on,
                                               how='left')
                        else:
                            logging.error(
                                f'output of step {stepi-1} or {stepi} are missing.'
                            )
                            return None
                        del dstep
        if doutput is None:
            # no step produced a table; nothing to combine
            logging.error('no step outputs found; cannot make doutput.tsv')
            return None
        if cfg['mutation_format'] == 'nucleotide':
            # codon/amino-acid/transcript columns do not apply to
            # nucleotide-format runs
            doutput = doutput.drop([
                c for c in doutput
                if (('codon' in c) or ('amino' in c) or ('transcript' in c))
            ],
                                   axis=1)
        if len(doutput) != 0 and 'guide+PAM sequence' in doutput:
            from beditor.lib.io_seqs import get_polyt_length
            doutput['length of polyT stretch'] = doutput[
                'guide+PAM sequence'].apply(lambda x: get_polyt_length(x))
        makedirs(dirname(doutputp), exist_ok=True)
        doutput.to_csv(doutputp, sep='\t')
    else:
        doutput = pd.read_table(doutputp, keep_default_na=False)
    # plot
    if plot:
        plot_vizbysteps(cfg)
    logging.info(f"Outputs are located at {datad}")
    return doutput