Beispiel #1
0
def dalignbed2dalignbedguides(cfg):
    """
    Get guide sequences from the BED file.
    step#4

    Merges the alignment BED table (`cfg['dalignbedp']`) with the guide
    table (`cfg['dguidesp']`) on 'guide: id' and writes the result to
    `cfg['dalignbedguidesp']`.

    :param cfg: configuration dict
    :returns cfg: configuration dict (unchanged; output written to disk)
    """
    dalignbed = del_Unnamed(
        pd.read_csv(cfg['dalignbedp'], sep='\t', keep_default_na=False))
    dguides = set_index(
        del_Unnamed(
            pd.read_csv(cfg['dguidesp'], sep='\t', keep_default_na=False)),
        'guide: id')

    #     if the error in human, use: `cut -f 1 data/alignment.bed.sorted.bed | sort| uniq -c | grep -v CHR | grep -v GL | grep -v KI`
    dalignbedguidesp = cfg['dalignbedguidesp']
    logging.info(basename(dalignbedguidesp))
    if not exists(dalignbedguidesp) or cfg['force']:
        dalignbed = pd.merge(dalignbed,
                             dguides,
                             on='guide: id',
                             suffixes=('', '.1'))
        # BUG FIX: `sep` must be passed by keyword; since pandas 2.0
        # `DataFrame.to_csv` accepts only `path_or_buf` positionally, so the
        # previous positional '\t' raises TypeError.
        dalignbed.to_csv(dalignbedguidesp, sep='\t')
    return cfg
def dalignbedannot2daggbyguide(cfg):
    """
    Aggregate annotations per alignment to annotations per guide.
    step#10

    Reads the per-alignment annotation table (`cfg['dalignbedannotp']`),
    collapses it to one row per guide, scores each guide, and writes the
    result to `{datatmpd}/10_daggbyguide.tsv` and `cfg['dofftargetsp']`.

    :param cfg: configuration dict
    :returns cfg: configuration dict (unchanged; outputs written to disk)
    """
    datatmpd = cfg['datatmpd']

    dalignbedannot = del_Unnamed(
        pd.read_csv(cfg['dalignbedannotp'], sep='\t', low_memory=False))

    daggbyguidep = '{}/10_daggbyguide.tsv'.format(datatmpd)
    logging.info(basename(daggbyguidep))
    if not exists(daggbyguidep) or cfg['force']:
        # seed the per-guide table from perfect alignments only (NM == 0)
        daggbyguide = dalignbedannot.loc[(dalignbedannot['NM'] == 0), [
            'guide: id', 'guide+PAM sequence', 'gene names', 'gene ids',
            'transcript ids'
        ]].drop_duplicates(subset=['guide: id'])
        if len(daggbyguide) != 0:
            daggbyguide = set_index(daggbyguide, 'guide: id')
            # collapse the annotation columns of all alignments of a guide
            # into ';'-joined unique values (iterate the index directly
            # instead of the previous range-over-len index loop)
            for gid in daggbyguide.index:
                dalignbedannoti = dalignbedannot.loc[
                    dalignbedannot['guide: id'] == gid, :]
                if len(dalignbedannoti.shape) == 1:
                    # defensive: promote a single-row Series to a DataFrame
                    dalignbedannoti = pd.DataFrame(dalignbedannoti).T
                for col in [
                        'types', 'gene names', 'gene ids', 'transcript ids',
                        'protein ids', 'exon ids'
                ]:
                    daggbyguide.loc[gid, col] = ";".join(
                        np.unique(dalignbedannoti[col].fillna('nan').tolist()))
            from beditor.lib.get_scores import get_beditorscore_per_guide
            # per-guide scores aggregated over all alignments of the guide
            for guideid in daggbyguide.index:
                dalignbedannotguide = dalignbedannot.loc[(
                    dalignbedannot['guide: id'] == guideid), :]
                daggbyguide.loc[
                    guideid, 'beditor score'] = get_beditorscore_per_guide(
                        guide_seq=dalignbedannotguide['guide+PAM sequence'].
                        unique()[0],
                        strategy=dalignbedannotguide['strategy'].unique()[0],
                        align_seqs_scores=dalignbedannotguide['beditor score'],
                        BEs=cfg['BEs'])
                daggbyguide.loc[guideid, 'CFD score'] = dalignbedannotguide[
                    'CFD score'].mean()  #FIXME if mean is not appropriate
            daggbyguide['beditor score (log10)'] = daggbyguide[
                'beditor score'].apply(np.log10)
            # number of genomic alignments per guide (each row counts 1)
            dalignbedannot['alternate alignments count'] = 1
            daggbyguide = daggbyguide.join(
                pd.DataFrame(
                    dalignbedannot.groupby('guide: id')
                    ['alternate alignments count'].agg('sum')))
            daggbyguide.to_csv(daggbyguidep, sep='\t')
            daggbyguide.to_csv(cfg['dofftargetsp'], sep='\t')
    return cfg
def dguides2guidessam(cfg, dguides):
    """
    Aligns guides to genome and gets SAM file.
    step#1

    Writes one FASTA per guide length under `datatmpd`, runs `bwa aln`
    followed by `bwa samse`, producing a SAM file per guide length.

    :param cfg: configuration dict (`bwa`, `genomep`, `datatmpd`,
        `mismatches_max`, `force`, ...)
    :param dguides: dataframe of guides with columns 'guide: id',
        'guide+PAM sequence' and 'guide+PAM length'
    :returns cfg: configuration dict (unchanged; SAM files written to disk)
    """
    datatmpd = cfg['datatmpd']
    dguides = set_index(dguides, 'guide: id')
    # guides are aligned in batches of identical length (bwa -l seed length)
    guidels = dguides.loc[:, 'guide+PAM length'].unique()
    for guidel in guidels:
        logging.debug(f"now aligning guides of length {guidel}")
        guidesfap = f'{datatmpd}/01_guides_guidel{guidel:02}.fa'
        logging.info(basename(guidesfap))
        if not exists(guidesfap) or cfg['force']:
            with open(guidesfap, 'w') as f:
                for gi in dguides.index:
                    f.write('>{}\n{}\n'.format(
                        gi.replace(' ', '_'),
                        dguides.loc[gi, 'guide+PAM sequence']))
        ## BWA alignment command is adapted from cripror
        ## https://github.com/rraadd88/crisporWebsite/blob/master/crispor.py
        # BWA: allow up to X mismatches
        # maximum number of occurences in the genome to get flagged as repeats.
        # This is used in bwa samse, when converting the sam file
        # and for warnings in the table output.
        MAXOCC = 60000

        # the BWA queue size is 2M by default. We derive the queue size from MAXOCC
        # BUG FIX: use floor division; under Python 3 `/` returns a float and
        # the `-m {bwaM}` argument below would be rendered as e.g.
        # '2000000.0000000002', which bwa rejects. `//` restores the original
        # (Python 2) integer semantics of the crispor code this is based on.
        MFAC = 2000000 // MAXOCC

        genomep = cfg['genomep']
        genomed = dirname(genomep)  # make var local, see below
        genomegffp = cfg['genomegffp']

        # increase MAXOCC if there is only a single query, but only in CGI mode
        bwaM = MFAC * MAXOCC  # -m is queue size in bwa
        guidessap = f'{datatmpd}/01_guides_guidel{guidel:02}.sa'
        logging.info(basename(guidessap))
        if not exists(guidessap) or cfg['force']:
            cmd = f"{cfg['bwa']} aln -t 1 -o 0 -m {bwaM} -n {cfg['mismatches_max']} -k {cfg['mismatches_max']} -N -l {guidel} {genomep} {guidesfap} > {guidessap} 2> {guidessap}.log"
            runbashcmd(cmd)

        guidessamp = f'{datatmpd}/01_guides_guidel{guidel:02}.sam'
        logging.info(basename(guidessamp))
        if not exists(guidessamp) or cfg['force']:
            cmd = f"{cfg['bwa']} samse -n {MAXOCC} {genomep} {guidessap} {guidesfap} > {guidessamp} 2> {guidessamp}.log"
            runbashcmd(cmd)
    return cfg
Beispiel #4
0
def df2features(df):
    """
    Convert a dataframe of intervals into Biopython ``SeqFeature`` objects.

    Expected column order: ini, end, name, sense.
    """
    from Bio.SeqFeature import SeqFeature, FeatureLocation
    from beditor.lib.io_dfs import set_index
    colini, colend, colname, colsense = df.columns
    df = set_index(df, colname)
    df = df.reset_index()
    features = []
    for rowi in df.index:
        row = df.loc[rowi, :]
        # end is made inclusive (+1) to match FeatureLocation's half-open span
        location = FeatureLocation(start=int(row[colini]),
                                   end=int(row[colend]) + 1,
                                   strand=int(row[colsense]))
        features.append(SeqFeature(location, type=row[colname]))
    return features
Beispiel #5
0
def dannotsagg2dannots2dalignbedannot(cfg):
    """
    Map aggregated annotations to guides.
    step#9

    Joins the aggregated annotation table (`cfg['dannotsaggp']`) onto the
    alignment stats (`cfg['dalignbedstatsp']`) by 'id', computes beditor and
    CFD scores per alignment and writes `cfg['dalignbedannotp']`.

    :param cfg: configuration dict
    :returns cfg: configuration dict (unchanged; output written to disk)
    """
    dannotsagg = del_Unnamed(
        pd.read_csv(cfg['dannotsaggp'], sep='\t', keep_default_na=False))
    dalignbedstats = del_Unnamed(
        pd.read_csv(cfg['dalignbedstatsp'], sep='\t', keep_default_na=False))
    dalignbedannotp = cfg['dalignbedannotp']
    logging.info(basename(dalignbedannotp))
    if not exists(dalignbedannotp) or cfg['force']:
        dalignbedannot = dalignbedstats.set_index('id').join(
            set_index(dannotsagg, 'id'), rsuffix=' annotation')
        # vectorized cast (faster than the previous element-wise `.apply(int)`);
        # NM may be read in as strings because of `keep_default_na=False`
        dalignbedannot['NM'] = dalignbedannot['NM'].astype(int)
        from beditor.lib.get_scores import get_beditorscore_per_alignment, get_cfdscore
        dalignbedannot['beditor score'] = dalignbedannot.apply(
            lambda x: get_beditorscore_per_alignment(
                NM=x['NM'],
                genic=True if x['region'] == 'genic' else False,
                alignment=x['alignment'],
                pam_length=len(x['PAM']),
                pam_position=x['original position'],
            ),
            axis=1)
        dalignbedannot['CFD score'] = dalignbedannot.apply(
            lambda x: get_cfdscore(x['guide+PAM sequence'].upper(), x[
                'aligned sequence'].upper()),
            axis=1)
        # get_cfdscore can yield NaN for some alignments; score those as 0
        dalignbedannot['CFD score'] = dalignbedannot['CFD score'].fillna(0)
        dalignbedannot.to_csv(dalignbedannotp, sep='\t')
    return cfg
Beispiel #6
0
def get_seq_aminoacid(cfg, din):
    """
    Fetches sequences if mutation format is amino acid 

    :param cfg: configuration dict
    :param din: input data
    :returns dsequences: dataframe with sequences

    NOTE(review): despite the ':returns' above, the success path writes
    `cfg['dsequencesp']` to disk and falls off the end (returns None);
    only the empty-result path returns explicitly (also None).
    """
    import pyensembl
    #import ensembl object that would fetch genes
    # ensembl = pyensembl.EnsemblRelease(release=cfg['genomerelease'])
    # register the host species with pyensembl so non-default genomes resolve
    ensembl = pyensembl.EnsemblRelease(
        species=pyensembl.species.Species.register(latin_name=cfg['host'],
                                                   synonyms=[cfg['host']],
                                                   reference_assemblies={
                                                       cfg['genomeassembly']:
                                                       (cfg['genomerelease'],
                                                        cfg['genomerelease']),
                                                   }),
        release=cfg['genomerelease'])

    din.index = range(len(din))
    dbedp = '{}/dbedflank.bed'.format(cfg['datad'])
    dbed = pd.DataFrame(columns=bed_colns)
    # transcripts skipped, bucketed by failure reason (dumped to .err.json)
    terrpositions = []
    terrnotfound = []
    terrnoncoding = []
    bedrowi = 0
    #             for i in trange(len(din)-1,desc='get positions for bedtools'):
    for i in din.index:
        if din.loc[i, 'transcript: id'] in ensembl.transcript_ids():
            t = ensembl.transcript_by_id(din.loc[i, 'transcript: id'])
            # only complete protein-coding transcripts can be mapped reliably
            if t.is_protein_coding and t.contains_start_codon and t.contains_stop_codon:
                coding_sequence_positions = tboundaries2positions(t)
                if len(coding_sequence_positions) == len(t.coding_sequence):
                    #TODO     need to check if the seq made from coding_sequence_positions is same as t.coding_seqeunce
                    # map protein residue index -> genomic codon positions
                    dcoding = t2pmapper(t, coding_sequence_positions)
                    dcodingmutpos = dcoding.loc[(
                        dcoding['protein index'] == din.loc[
                            i, 'aminoacid: position']), :]
                    codon_positions = dcodingmutpos[
                        'coding sequence positions'].tolist()
                    if len(codon_positions) != 0:
                        dbed.loc[bedrowi, 'chromosome'] = t.contig
                        if cfg['test']:
                            print(din.loc[i, 'transcript: id'],
                                  codon_positions)
                        # codon start/end are swapped on the minus strand so
                        # that 'codon start' is always the lower coordinate
                        if t.strand == '+':
                            dbed.loc[bedrowi,
                                     'codon start'] = codon_positions[0]
                            dbed.loc[bedrowi, 'codon end'] = codon_positions[2]
                        elif t.strand == '-':
                            dbed.loc[bedrowi,
                                     'codon start'] = codon_positions[2]
                            dbed.loc[bedrowi, 'codon end'] = codon_positions[0]
                        dbed.loc[bedrowi, 'start'] = dbed.loc[
                            bedrowi,
                            'codon start'] - 22  #FIXME put flank in the yml
                        dbed.loc[bedrowi, 'end'] = dbed.loc[
                            bedrowi,
                            'codon end'] + 21  #FIXME put flank in the yml

                        dbed.loc[bedrowi, 'reference residue'] = dcodingmutpos[
                            'protein sequence'].tolist()[0]
                        dbed.loc[bedrowi, 'reference codon'] = ''.join(
                            dcodingmutpos['coding sequence'].tolist())
                        dbed.loc[bedrowi, 'strand'] = t.strand
                        # composite id carried through to the fasta headers
                        dbed.loc[bedrowi, 'id'] = '{}|{}|{}|{}|{}'.format(
                            din.loc[i, 'transcript: id'],
                            dbed.loc[bedrowi, 'chromosome'],
                            dbed.loc[bedrowi, 'strand'],
                            int(dbed.loc[bedrowi, 'start']),
                            int(dbed.loc[bedrowi, 'end']))
                        dbed.loc[bedrowi, 'gene: id'] = t.gene_id
                        dbed.loc[bedrowi, 'gene: name'] = t.gene.name
                        dbed.loc[bedrowi, 'protein: id'] = t.protein_id
                        dbed.loc[bedrowi, 'aminoacid: position'] = din.loc[
                            i, 'aminoacid: position']
                        #         break
                        bedrowi += 1
                    else:
                        terrpositions.append(t.id)
                else:
                    terrpositions.append(t.id)
            else:
                terrnoncoding.append(t.id)
        else:
            terrnotfound.append(din.loc[i, 'transcript: id'])
            if cfg['test']:
                logging.error('not found: {}'.format(
                    din.loc[i, 'transcript: id']))
    if len(dbed) == 0:
        from beditor.lib.global_vars import saveemptytable
        logging.warning('no valid seqeunces found; saving an empty table.')
        saveemptytable(cfg, f"{cfg['dsequencesp']}")
        return None
    # keep only full-length windows: 22 nt upstream + 3 nt codon + 21 nt
    # downstream - 1 = 45 (see the flank FIXMEs above)
    dbed = dbed.loc[(dbed.apply(lambda x: x['end'] - x['start'] == 45,
                                axis=1)), :]  #FIXME put flank in the yml

    dbed.loc[:, 'start'] = dbed.loc[:, 'start'].astype(int)
    dbed.loc[:, 'end'] = dbed.loc[:, 'end'].astype(int)

    dbed = dbed.drop_duplicates(subset=bed_colns)
    dbed.loc[:, bed_colns].to_csv(dbedp, sep='\t', header=False, index=False)
    err2tids = {
        'terrpositions': terrpositions,
        'terrnotfound': terrnotfound,
        'terrnoncoding': terrnoncoding,
    }
    if cfg['test']:
        print(err2tids)
    with open(dbedp + '.err.json', 'w') as outfile:
        json.dump(err2tids, outfile)

    # extract the flanked, strand-aware sequences with bedtools getfasta
    bedp = f"{cfg['datad']}/dbedflank.bed"
    fastap = f"{cfg['datad']}/dbedflank.fa"
    cmd = f"{cfg['bedtools']} getfasta -s -name -fi {cfg['genomep']} -bed {bedp} -fo {fastap}"
    runbashcmd(cmd)

    dflankfa = fa2df(fastap, ids2cols=True)
    dflankfa.loc[:, 'sequence'] = dflankfa.loc[:, 'sequence'].apply(
        lambda x: x.upper())
    dflankfa.loc[:,
                 'sequence: length'] = [len(s) for s in dflankfa['sequence']]
    # bedtools appends '(strand)' to fasta ids; strip it to rejoin on 'id'
    dflankfa.index = [idx.split('(')[0] for idx in dflankfa.index]
    dflankfa.index.name = 'id'
    dseq = set_index(dbed, 'id').join(set_index(dflankfa, 'id'), rsuffix='.1')
    # rename/select columns into the pipeline's canonical schema
    dseq2compatible = {
        'aminoacid: position': 'aminoacid: position',
        'gene: id': 'gene: id',
        'gene: name': 'gene: name',
        'protein: id': 'protein: id',
        'transcript: id': 'seqid',
        'transcript: sequence': 'sequence',
        'aminoacid: wild-type': 'reference residue',
        'codon: wild-type': 'reference codon',
        'contig': 'contig',
        'strand': 'strand',
        'start': 'start',
        'end': 'end',
        'codon start': 'codon start',
        'codon end': 'codon end',
    }
    if 'amino acid mutation' in dseq:
        dseq2compatible['amino acid mutation'] = 'amino acid mutation'
    dseq.to_csv(cfg['dseqtmpp'], sep='\t')

    dseq = dseq[list(dseq2compatible.values())]
    dseq.columns = list(dseq2compatible.keys())
    #             dseq.to_csv('data/dseq.csv')

    logging.info(dseq.columns.tolist())
    logging.info(din.columns.tolist())
    dseq = pd.merge(dseq.reset_index(),
                    din,
                    on=['transcript: id', 'aminoacid: position'])
    logging.info(dseq.columns.tolist())
    # NOTE(review): set_index is not in-place and the result is discarded
    # here; presumably intended `dseq = set_index(dseq, 'id')` — confirm
    set_index(dseq, 'id')
    if 'reverse_mutations' in cfg:
        if cfg['reverse_mutations']:
            from beditor.lib.io_dfs import dfswapcols
            dseq = dfswapcols(dseq,
                              ['aminoacid: wild-type', 'amino acid mutation'])
            dseq['codon: mutation'] = dseq['codon: wild-type'].copy()

    dseq.to_csv(f"{cfg['dsequencesp']}", sep='\t')
    del ensembl
Beispiel #7
0
def plot_vizbysteps(cfg):
    """
    Render the per-step summary plots into `{prjd}/05_output`.

    For each pipeline step it derives the step's output TSV path from
    `cfg[stepi]` (the cfg dict is keyed by integer step numbers holding
    step directory paths), loads it if present, and writes the
    corresponding plot unless it already exists (or `cfg['force']`).

    :param cfg: configuration dict
    """
    prjd = cfg['prjd']
    #make one output table and stepwise plots
    datad = f"{prjd}/05_output"

    # step2 # make submap
    stepi = 2
    plotp = f"{datad}/plot_d{cfg[stepi].replace('/','').split('_')[-1]}_substitution_map"
    plotps = glob(plotp + '*')
    if len(plotps) == 0 or cfg['force']:
        # plot filename template; filled per mutation type by the plotter
        plotpf = plotp + "_{mutation_type}.png"
        dstepp = f"{cfg[stepi]}/d{cfg[stepi].replace('/','').split('_')[-1]}.tsv"
        if exists(dstepp):
            dstep = del_Unnamed(pd.read_table(
                dstepp, keep_default_na=False)).drop_duplicates()
            # large tables make the submap unreadable/slow, so skip them
            if len(dstep) < 1000:
                logging.info('plot_submap_possibilities')
                plot_submap_possibilities(dmutagenesis=dstep,
                                          plotpf=plotpf,
                                          test=False)
            else:
                logging.warning(f'skipped: plot_submap_possibilities')
        else:
            logging.warning(f'not found: {dstepp}')

    # step3
    # stats by strategies
    stepi = 3
    plotp = f"{datad}/plot_d{cfg[stepi].replace('/','').split('_')[-1]}_stats_by_strategies.png"
    if not exists(plotp) or cfg['force']:
        dstepp = f"{cfg[stepi]}/d{cfg[stepi].replace('/','').split('_')[-1]}.tsv"
        if exists(dstepp):
            dstep = del_Unnamed(pd.read_table(
                dstepp, keep_default_na=False)).drop_duplicates()
            logging.info('plot_bar_dguides')
            plot_bar_dguides(dstep, plotp)
        else:
            logging.warning(f'not found: {dstepp}')

    # make nt_composition plot
    stepi = 3
    plotp = f"{datad}/plot_d{cfg[stepi].replace('/','').split('_')[-1]}_nt_compositions"
    plotps = glob(plotp + '*')
    if len(plotps) == 0 or cfg['force']:
        # plot filename template; filled per method by the plotter
        plotpf = plotp + "_{method}.png"
        makedirs(dirname(plotp), exist_ok=True)
        dstepp = f"{cfg[stepi]}/d{cfg[stepi].replace('/','').split('_')[-1]}.tsv"
        if exists(dstepp):
            dstep = del_Unnamed(pd.read_table(
                dstepp, keep_default_na=False)).drop_duplicates()
            #             dbepams=pd.read_table(f'{dirname(realpath(__file__))}/../data/dbepams.tsv')
            # PAM definitions needed by the composition plot
            dbepams = pd.read_table(cfg['dbepamsp'], keep_default_na=False)
            dpam = dbepams.loc[:, cols_dpam].drop_duplicates()
            dpam = set_index(dpam, 'PAM')
            logging.info('plot_dist_dguides')
            plot_dist_dguides(dstep, dpam, plotpf)
        else:
            logging.warning(f'not found: {dstepp}')

    # make plot_dna_features_view
    stepi = 3
    plotd = f"{datad}/plot_d{cfg[stepi].replace('/','').split('_')[-1]}_dna_features_view"
    plotps = glob(plotd + '/*')
    if len(plotps) == 0 or cfg['force']:
        dguidesp = f"{cfg[stepi]}/d{cfg[stepi].replace('/','').split('_')[-1]}.tsv"
        # sequences come from two steps earlier (step1 output)
        dsequencesp = f"{cfg[stepi-2]}/d{cfg[stepi-2].replace('/','').split('_')[-1]}.tsv"
        if exists(dguidesp):
            logging.info('plot_dna_features_view')
            plot_dna_features_view(
                cfg,
                dsequences=del_Unnamed(
                    pd.read_table(dsequencesp,
                                  keep_default_na=False)).drop_duplicates(),
                dguides=del_Unnamed(
                    pd.read_table(dguidesp,
                                  keep_default_na=False)).drop_duplicates(),
                plotd=plotd,
                more=False)
        else:
            # NOTE(review): warns with dstepp (from the previous section),
            # not the dguidesp that was actually checked — confirm intent
            logging.warning(f'not found: {dstepp}')

#     # step2 # make submap #FIXME get all the columns used for plotting in the dguides.
#     stepi=3
#     plotp=f"{datad}/plot_d{cfg[stepi].replace('/','').split('_')[-1]}_submap_used_for_mutagenesis"
#     plotps=glob(plotp+'*')
#     if len(plotps)==0 or cfg['force']:
#         plotpf=plotp+"_{mutation_type}.png"
#         dstepp=f"{cfg[stepi]}/d{cfg[stepi].replace('/','').split('_')[-1]}.tsv"
#         dstep=del_Unnamed(pd.read_table(dstepp)).drop_duplicates()
#         logging.info('plot_submap_possibilities')
#         plot_submap_possibilities(dmutagenesis=dstep,
#                                   plotpf=plotpf,test=False)

# step4 offtargets correlations
    stepi = 4
    plotp = f"{datad}/plot_d{cfg[stepi].replace('/','').split('_')[-1]}_dist_beditor_score.png"
    if not exists(plotp) or cfg['force']:
        dstepp = f"{cfg[stepi]}/d{cfg[stepi].replace('/','').split('_')[-1]}.tsv"
        if exists(dstepp):
            dstep = del_Unnamed(pd.read_table(
                dstepp, keep_default_na=False)).drop_duplicates()
            logging.info('plot_dist_dofftargets')
            plot_dist_dofftargets(dstep, plotp)
        else:
            logging.warning(f'not found: {dstepp}')