Beispiel #1
0
def dseq2dguides(cfg):
    """
    Wrapper around make guides function.

    Reads `dmutagenesis.tsv` from the directory registered for the previous
    step and `dsequences.tsv` from the directory registered two steps back,
    builds the PAM table, calls `make_guides` and writes the resulting tables
    into the directory of the current step. Nothing is done when
    `dguides.tsv` already exists there and `cfg['force']` is falsy.

    The process exits with status 1 if any of `cfg['pams']` is missing from
    the bundled `dpam.tsv`.

    :param cfg: configuration dict.
    :raises ValueError: if `cfg['mutation_format']` is neither 'nucleotide'
        nor 'aminoacid'.
    """
    cfg['datad'] = cfg[cfg['step']]
    cfg['plotd'] = cfg['datad']
    dguideslinp = f"{cfg['datad']}/dguides.tsv"
    dguides_nofltp = f"{cfg['datad']}/dguides_noflt.tsv"
    dmutagenesisp = f"{cfg['datad']}/dmutagenesis.tsv"
    dpam_strandsp = f"{cfg['datad']}/dpam_strands.csv"

    def _save_empty_guides():
        # Single place for the "nothing designed" outcome.
        from beditor.lib.global_vars import saveemptytable
        logging.warning('no guides designed; saving an empty table.')
        saveemptytable(cfg, dguideslinp)

    if not exists(dguideslinp) or cfg['force']:
        dmutagenesis = pd.read_csv(f"{cfg[cfg['step']-1]}/dmutagenesis.tsv", sep='\t')
        # FIXME if numbering of steps is changed, this is gonna blow
        dsequencesp_in = f"{cfg[cfg['step']-2]}/dsequences.tsv"
        if cfg['mutation_format'] == 'nucleotide':
            dsequences = pd.read_csv(dsequencesp_in, sep='\t')
            dsequences, dmutagenesis = dinnucleotide2dsequencesproper(dsequences, dmutagenesis)
        elif cfg['mutation_format'] == 'aminoacid':
            dsequences = pd.read_csv(dsequencesp_in, sep='\t')
            if cfg.get('reverse_mutations'):
                # Keep only the sequences for which a mutagenesis strategy
                # exists, then restore the original set of columns.
                cols_dsequences = dsequences.columns.tolist()
                dsequences = pd.merge(dsequences,
                                      dmutagenesis,
                                      how='inner',
                                      left_on=['aminoacid: wild-type', 'codon: mutation', 'amino acid mutation'],
                                      right_on=['amino acid', 'codon mutation', 'amino acid mutation'],
                                      suffixes=['', ': dmutagenesis'])
                dsequences['codon: wild-type'] = dsequences['codon']
                dsequences = dsequences.loc[:, cols_dsequences]
        else:
            # Without this guard `dsequences` stays unbound and the failure
            # shows up later as an opaque UnboundLocalError.
            raise ValueError(
                f"unsupported mutation_format: {cfg['mutation_format']!r}")

        dsequences.to_csv(f"{cfg[cfg['step']]}/dsequences.tsv", sep='\t')  # FIXME if numbering of steps is changed, this is gonna blow
        # make pam table
        dpam = pd.read_table(f'{dirname(realpath(__file__))}/../data/dpam.tsv')
        if sum(dpam['PAM'].isin(cfg['pams'])) != len(cfg['pams']):
            logging.error(f"PAM/s {cfg['pams']} are not supported {dpam['PAM'].tolist()}")
            sys.exit(1)
        dpam_strands = dpam2dpam_strands(dpam, pams=cfg['pams'])
        dpam_strands.to_csv(dpam_strandsp, sep='\t')

        if len(dsequences) != 0 and len(dmutagenesis) != 0:
            # 'mutation on strand' holds e.g. '+ strand'; keep only the sign.
            dmutagenesis['strand'] = dmutagenesis.apply(
                lambda x: x['mutation on strand'].replace(' strand', ''), axis=1)
            dmutagenesis.to_csv(dmutagenesisp, sep='\t')

            dguideslin, dguides_noflt, err2idxs = make_guides(
                cfg, dsequences,
                dmutagenesis,
                dpam=dpam_strands,
                test=cfg['test'],
            )
            if dguides_noflt is not None:
                dguides_noflt.to_csv(dguides_nofltp, sep='\t')
            if dguideslin is not None:
                dguideslin.to_csv(dguideslinp, sep='\t')
                if cfg['test']:
                    logging.info(err2idxs)
                with open(dguideslinp + '.err.json', 'w') as f:
                    json.dump(err2idxs, f)
            else:
                # No guide table at all: never call .to_csv on None.
                _save_empty_guides()
                if err2idxs is not None:
                    # Keep the error report even though no guides were made.
                    with open(dguideslinp + '.err.json', 'w') as f:
                        json.dump(err2idxs, f)
        else:
            _save_empty_guides()

        import gc
        gc.collect()
Beispiel #2
0
def get_seq_aminoacid(cfg, din):
    """
    Fetches sequences if mutation format is amino acid.

    For every row of `din` the transcript is looked up in Ensembl and the
    genomic positions of the requested codon are collected together with a
    flanking window. The windows are written to a BED file, their sequences
    are extracted with bedtools, and the result is merged with `din` and
    written to `cfg['dsequencesp']`. Transcripts that could not be used are
    reported in `<bed file>.err.json`.

    :param cfg: configuration dict
    :param din: input data; the columns 'transcript: id' and
        'aminoacid: position' are read. Its index is reset in place.
    :returns: None; the table of sequences is written to disk. If no usable
        position is found, an empty table is saved instead.
    """
    import pyensembl

    # Size of the window around the codon.  FIXME put flank in the yml
    flank_before = 22
    flank_after = 21
    # A usable window covers a codon with end - start == 2 plus both flanks.
    window_span = flank_before + 2 + flank_after

    # import ensembl object that would fetch genes
    ensembl = pyensembl.EnsemblRelease(
        species=pyensembl.species.Species.register(
            latin_name=cfg['host'],
            synonyms=[cfg['host']],
            reference_assemblies={
                cfg['genomeassembly']: (cfg['genomerelease'],
                                        cfg['genomerelease']),
            }),
        release=cfg['genomerelease'])

    din.index = range(len(din))
    dbedp = '{}/dbedflank.bed'.format(cfg['datad'])
    dbed = pd.DataFrame(columns=bed_colns)
    terrpositions = []
    terrnotfound = []
    terrnoncoding = []
    bedrowi = 0
    # Fetch the transcript ids once instead of once per input row; a set also
    # makes each membership test constant-time.
    known_transcript_ids = set(ensembl.transcript_ids()) if len(din) else set()
    for i in din.index:
        transcript_id = din.loc[i, 'transcript: id']
        if transcript_id not in known_transcript_ids:
            terrnotfound.append(transcript_id)
            if cfg['test']:
                logging.error('not found: {}'.format(transcript_id))
            continue
        t = ensembl.transcript_by_id(transcript_id)
        if not (t.is_protein_coding and t.contains_start_codon
                and t.contains_stop_codon):
            terrnoncoding.append(t.id)
            continue
        coding_sequence_positions = tboundaries2positions(t)
        if len(coding_sequence_positions) != len(t.coding_sequence):
            terrpositions.append(t.id)
            continue
        #TODO     need to check if the seq made from coding_sequence_positions is same as t.coding_seqeunce
        dcoding = t2pmapper(t, coding_sequence_positions)
        dcodingmutpos = dcoding.loc[(
            dcoding['protein index'] == din.loc[i, 'aminoacid: position']), :]
        codon_positions = dcodingmutpos['coding sequence positions'].tolist()
        if len(codon_positions) == 0:
            terrpositions.append(t.id)
            continue

        # NOTE: the order of the assignments below fixes the column order of
        # the table, so it is kept as is.
        dbed.loc[bedrowi, 'chromosome'] = t.contig
        if cfg['test']:
            print(transcript_id, codon_positions)
        # 'codon start' takes the first position on '+' and the third on '-'.
        if t.strand == '+':
            dbed.loc[bedrowi, 'codon start'] = codon_positions[0]
            dbed.loc[bedrowi, 'codon end'] = codon_positions[2]
        elif t.strand == '-':
            dbed.loc[bedrowi, 'codon start'] = codon_positions[2]
            dbed.loc[bedrowi, 'codon end'] = codon_positions[0]
        dbed.loc[bedrowi, 'start'] = dbed.loc[bedrowi, 'codon start'] - flank_before
        dbed.loc[bedrowi, 'end'] = dbed.loc[bedrowi, 'codon end'] + flank_after

        dbed.loc[bedrowi, 'reference residue'] = dcodingmutpos[
            'protein sequence'].tolist()[0]
        dbed.loc[bedrowi, 'reference codon'] = ''.join(
            dcodingmutpos['coding sequence'].tolist())
        dbed.loc[bedrowi, 'strand'] = t.strand
        dbed.loc[bedrowi, 'id'] = '{}|{}|{}|{}|{}'.format(
            transcript_id,
            dbed.loc[bedrowi, 'chromosome'],
            dbed.loc[bedrowi, 'strand'],
            int(dbed.loc[bedrowi, 'start']),
            int(dbed.loc[bedrowi, 'end']))
        dbed.loc[bedrowi, 'gene: id'] = t.gene_id
        dbed.loc[bedrowi, 'gene: name'] = t.gene.name
        dbed.loc[bedrowi, 'protein: id'] = t.protein_id
        dbed.loc[bedrowi, 'aminoacid: position'] = din.loc[
            i, 'aminoacid: position']
        bedrowi += 1

    if len(dbed) == 0:
        from beditor.lib.global_vars import saveemptytable
        logging.warning('no valid seqeunces found; saving an empty table.')
        saveemptytable(cfg, f"{cfg['dsequencesp']}")
        return None
    # Keep only windows of the expected size.
    dbed = dbed.loc[(dbed.apply(lambda x: x['end'] - x['start'] == window_span,
                                axis=1)), :]

    dbed.loc[:, 'start'] = dbed.loc[:, 'start'].astype(int)
    dbed.loc[:, 'end'] = dbed.loc[:, 'end'].astype(int)

    dbed = dbed.drop_duplicates(subset=bed_colns)
    dbed.loc[:, bed_colns].to_csv(dbedp, sep='\t', header=False, index=False)
    err2tids = {
        'terrpositions': terrpositions,
        'terrnotfound': terrnotfound,
        'terrnoncoding': terrnoncoding,
    }
    if cfg['test']:
        print(err2tids)
    with open(dbedp + '.err.json', 'w') as outfile:
        json.dump(err2tids, outfile)

    # Extract the strand-aware sequences of the windows written above.
    fastap = f"{cfg['datad']}/dbedflank.fa"
    cmd = f"{cfg['bedtools']} getfasta -s -name -fi {cfg['genomep']} -bed {dbedp} -fo {fastap}"
    runbashcmd(cmd)

    dflankfa = fa2df(fastap, ids2cols=True)
    dflankfa.loc[:, 'sequence'] = dflankfa.loc[:, 'sequence'].apply(
        lambda x: x.upper())
    dflankfa.loc[:,
                 'sequence: length'] = [len(s) for s in dflankfa['sequence']]
    # Drop everything from the first '(' in the fasta record names so they
    # match the ids of the BED table.
    dflankfa.index = [idx.split('(')[0] for idx in dflankfa.index]
    dflankfa.index.name = 'id'
    dseq = set_index(dbed, 'id').join(set_index(dflankfa, 'id'), rsuffix='.1')
    # new column name -> column name in the joined table
    dseq2compatible = {
        'aminoacid: position': 'aminoacid: position',
        'gene: id': 'gene: id',
        'gene: name': 'gene: name',
        'protein: id': 'protein: id',
        'transcript: id': 'seqid',
        'transcript: sequence': 'sequence',
        'aminoacid: wild-type': 'reference residue',
        'codon: wild-type': 'reference codon',
        'contig': 'contig',
        'strand': 'strand',
        'start': 'start',
        'end': 'end',
        'codon start': 'codon start',
        'codon end': 'codon end',
    }
    if 'amino acid mutation' in dseq:
        dseq2compatible['amino acid mutation'] = 'amino acid mutation'
    dseq.to_csv(cfg['dseqtmpp'], sep='\t')

    dseq = dseq[list(dseq2compatible.values())]
    dseq.columns = list(dseq2compatible.keys())

    logging.info(dseq.columns.tolist())
    logging.info(din.columns.tolist())
    dseq = pd.merge(dseq.reset_index(),
                    din,
                    on=['transcript: id', 'aminoacid: position'])
    logging.info(dseq.columns.tolist())
    # NOTE(review): the result of this call is discarded, so unless the
    # helper works in place the saved table keeps its positional index.
    # Left untouched because using the result would change the output file.
    set_index(dseq, 'id')
    if cfg.get('reverse_mutations'):
        from beditor.lib.io_dfs import dfswapcols
        dseq = dfswapcols(dseq,
                          ['aminoacid: wild-type', 'amino acid mutation'])
        dseq['codon: mutation'] = dseq['codon: wild-type'].copy()

    dseq.to_csv(f"{cfg['dsequencesp']}", sep='\t')
    del ensembl
def dguides2offtargets(cfg):
    """
    All the processes in offtarget detection are here.

    Reads `dguides.tsv` from the directory of the previous step and runs the
    chain of alignment/annotation stages, each of which receives the
    configuration and hands back the configuration for the next one. An empty
    table is saved when there are no guides or when a stage returns None.

    :param cfg: Configuration settings provided in .yml file
    :returns: the value of `saveemptytable` when an empty table had to be
        saved, otherwise None.
    """
    from beditor.lib.global_vars import saveemptytable

    cfg['datad'] = cfg[cfg['step']]
    cfg['plotd'] = cfg['datad']
    dofftargetsp = '{}/dofftargets.tsv'.format(cfg['datad'])

    stepn = '04_offtargets'
    logging.info(stepn)
    dguidesp = f"{cfg[cfg['step']-1]}/dguides.tsv"
    if not exists(dguidesp):
        logging.warning(f"not found {dguidesp}")
        return saveemptytable(cfg, dofftargetsp)
    dguides = pd.read_csv(dguidesp, sep='\t')
    if len(dguides) == 0:
        logging.warning(f"dguides is empty.")
        return saveemptytable(cfg, dofftargetsp)

    cfg['datatmpd'] = f"{cfg['datad']}/tmp"
    for dp in [cfg['datatmpd']]:
        if not exists(dp):
            makedirs(dp)

    # output file of every sub-step, relative to the temporary directory
    step2doutp = {
        1: '01_guides_guidel*.fa',
        2: '02_dalignbed.tsv',
        3: '03_annotations.bed',
        4: '04_dalignbedguides.tsv',
        5: '05_dalignedfasta.tsv',
        6: '06_dalignbedguidesseq.tsv',
        7: '07_dalignbedstats.tsv',
        8: '08_dannotsagg.tsv',
        9: '09_dalignbedannot.tsv',
        10: '10_daggbyguide.tsv',
    }
    cfg['dguidesp'] = dguidesp
    cfg['alignmentbedp'] = f"{cfg['datatmpd']}/02_alignment.bed"
    cfg['dalignbedp'] = f"{cfg['datatmpd']}/02_dalignbed.tsv"
    cfg['dalignbedguidesp'] = f"{cfg['datatmpd']}/04_dalignbedguides.tsv"
    cfg['dalignedfastap'] = f"{cfg['datatmpd']}/05_dalignedfasta.tsv"
    cfg['dalignbedguidesseqp'] = f"{cfg['datatmpd']}/06_dalignbedguidesseq.tsv"
    cfg['dalignbedstatsp'] = f"{cfg['datatmpd']}/07_dalignbedstats.tsv"
    cfg['dannotsaggp'] = f"{cfg['datatmpd']}/08_dannotsagg.tsv"
    cfg['dalignbedannotp'] = f"{cfg['datatmpd']}/09_dalignbedannot.tsv"
    cfg['daggbyguidep'] = f"{cfg['datatmpd']}/10_daggbyguide.tsv"

    #check which step to process
    for step in range(2, 10 + 1, 1):
        if not exists(f"{cfg['datatmpd']}/{step2doutp[step]}"):
            if step == 2:
                step = 'all'
            break
    logging.info(f'process from step:{step}')
    cfg['dofftargetsp'] = '{}/dofftargets.tsv'.format(cfg['datad'])
    if not exists(cfg['dofftargetsp']) or cfg['force']:
        # A stage signals "nothing to do" by returning None, so keep a handle
        # on the real configuration for the empty-table fallback below.
        cfg_in = cfg
        if step == 1 or step == 'all':
            cfg = dguides2guidessam(cfg, dguides)
        # Stages 2-10 run in order for as long as the previous stage handed a
        # configuration back; a stage is never called with None.
        stages = (
            guidessam2dalignbed,
            dalignbed2annotationsbed,
            dalignbed2dalignbedguides,
            alignmentbed2dalignedfasta,
            dalignbed2dalignbedguidesseq,
            dalignbedguidesseq2dalignbedstats,
            dannots2dalignbed2dannotsagg,
            dannotsagg2dannots2dalignbedannot,
            dalignbedannot2daggbyguide,
        )
        for stage in stages:
            if cfg is None:
                break
            cfg = stage(cfg)

        if cfg is None:
            logging.warning(f"no alignment found")
            # `cfg` is None here, so use the configuration that was passed in.
            cfg_in['step'] = 4
            return saveemptytable(cfg_in, cfg_in['dofftargetsp'])
        import gc
        gc.collect()
Beispiel #4
0
def get_possible_mutagenesis(
    cfg,
    dcodontable,
    dcodonusage,
    BEs,
    pos_muts,
    host,
):
    """
    Tabulates the single-nucleotide codon changes the given base editors can make.

    Each codon of the codon table is compared position by position with the
    nucleotide every editor acts on; each possible substitution becomes one
    row of the returned table.

    :param cfg: configuration dict; only handed to `saveemptytable` when the table is empty
    :param dcodontable: Codon table
    :param dcodonusage: Codon usage table
    :param BEs: Base editors (dict), see global_vars.py
    :param pos_muts: positions of mutations
    :param host: host organism (not read by this function)
    :returns: possible mutagenesis strategies as a pandas dataframe
    """
    def _revcomp(nts):
        """Reverse complement of a nucleotide string."""
        return str(Seq.Seq(nts, Alphabet.generic_dna).reverse_complement())

    def _store(table, rowi, posi, codon, codonmut, ntwt, ntmut, aa, aamut,
               method):
        """Write one strategy into row `rowi` of `table`."""
        parts = method.split(' on ')
        # The cells are written one by one in this order, which also fixes
        # the order in which new columns appear.
        cells = (
            ('codon', codon),
            ('position of mutation in codon', int(posi)),
            ('codon mutation', codonmut),
            ('nucleotide', ntwt),
            ('nucleotide mutation', ntmut),
            ('amino acid', aa),
            ('amino acid mutation', aamut),
            ('mutation on strand', parts[1]),
            ('strand: mutation', parts[1].replace(' strand', '')),
            ('method', parts[0]),
            ('codon mutation usage Fraction',
             dcodonusage.loc[codonmut, 'Fraction']),
            ('codon mutation usage Frequency',
             dcodonusage.loc[codonmut, 'Frequency']),
        )
        for column, value in cells:
            table.loc[rowi, column] = value
        return table

    def _single_nt(table, rowi, codon, aa, n_written):
        """Add every single-nucleotide strategy available for `codon`."""
        for method in BEs:
            target_nt = BEs[method][0]
            for posi in positions:
                if target_nt != codon[posi]:
                    continue
                for nt_new in BEs[method][1]:
                    bases = [codon[0], codon[1], codon[2]]
                    bases[posi] = nt_new
                    codonmut = '{}{}{}'.format(*bases)
                    aamut = str(
                        Seq.Seq(codonmut,
                                Alphabet.generic_dna).translate(table=1))
                    # The first strategy reuses the codon's own row, later
                    # ones are appended below the table.
                    if n_written != 0:
                        rowi = len(table) + 1
                    n_written += 1
                    ntwt, ntmut = target_nt, nt_new
                    if '-' in method.split(' on ')[1]:
                        ntwt, ntmut = _revcomp(ntwt), _revcomp(ntmut)
                    table = _store(table,
                                   rowi=rowi,
                                   posi=posi + 1,
                                   codon=codon,
                                   codonmut=codonmut,
                                   ntwt=ntwt,
                                   ntmut=ntmut,
                                   aa=aa,
                                   aamut=aamut,
                                   method=method)
        return table, n_written

    # positions within a codon
    positions = {0: '@1st position', 1: '@2nd position', 2: '@3rd position'}

    dmutagenesis = dcodontable.copy()
    for cdni in dmutagenesis.index:
        wt_codon = dmutagenesis.loc[cdni, 'codon']
        wt_aa = dmutagenesis.loc[cdni, 'amino acid']
        dmutagenesis, _ = _single_nt(dmutagenesis, cdni, wt_codon, wt_aa, 0)

    if len(dmutagenesis) == 0:
        from beditor.lib.global_vars import saveemptytable
        logging.warning('no guides designed; saving an empty table.')
        return saveemptytable(cfg)

    dmutagenesis = dmutagenesis.dropna()  #FIXME
    dmutagenesis['nucleotide mutation: count'] = list(
        map(len, dmutagenesis['nucleotide mutation']))
    dmutagenesis = dmutagenesis.sort_values('codon')
    # Adding information of Allowed activity window
    dmutagenesis = dmutagenesis.set_index('method').join(pos_muts).reset_index()

    from beditor.lib.io_seqs import reverse_complement_multintseq
    from beditor.lib.global_vars import nt2complement

    def _strand_corrected(row, column):
        """Value of `column`, passed through the reverse-complement helper unless the strand is '+'."""
        if row['strand: mutation'] == '+':
            return row[column]
        return reverse_complement_multintseq(row[column], nt2complement)

    for source, target in (
        ('nucleotide', 'nucleotide: wild-type'),
        ('nucleotide mutation', 'nucleotide: mutation'),
    ):
        dmutagenesis[target] = dmutagenesis.apply(_strand_corrected,
                                                  axis=1,
                                                  args=(source, ))

    return dmutagenesis
Beispiel #5
0
def get_possible_mutagenesis(
    cfg,
    dcodontable,
    dcodonusage,
    BEs,
    pos_muts,
    host,
):
    """
    Assesses possible mutagenesis strategies, given the set of Base editors and positions of mutations.

    :param dcodontable: Codon table
    :param dcodonusage: Codon usage table
    :param BEs: Base editors (dict), see global_vars.py
    :param pos_muts: positions of mutations
    :param host: host organism
    :returns: possible mutagenesis strategies as a pandas dataframe
    """
    def write_dmutagenesis(cdni, posi, codon, codonmut, ntwt, ntmut, aa, aamut,
                           method):
        """
        Write dmutagenesis table for each iteraction in get_possible_mutagenesis.
        """
        dmutagenesis.loc[cdni, 'codon'] = codon
        dmutagenesis.loc[cdni, 'position of mutation in codon'] = int(posi)
        dmutagenesis.loc[cdni, 'codon mutation'] = codonmut
        dmutagenesis.loc[cdni, 'nucleotide'] = ntwt
        dmutagenesis.loc[cdni, 'nucleotide mutation'] = ntmut
        dmutagenesis.loc[cdni, 'amino acid'] = aa
        dmutagenesis.loc[cdni, 'amino acid mutation'] = aamut
        dmutagenesis.loc[cdni, 'mutation on strand'] = method.split(' on ')[1]
        dmutagenesis.loc[cdni,
                         'strand: mutation'] = method.split(' on ')[1].replace(
                             ' strand', '')
        dmutagenesis.loc[cdni, 'method'] = method.split(' on ')[0]
        dmutagenesis.loc[cdni,
                         'codon mutation usage Fraction'] = dcodonusage.loc[
                             codonmut, 'Fraction']
        dmutagenesis.loc[cdni,
                         'codon mutation usage Frequency'] = dcodonusage.loc[
                             codonmut, 'Frequency']
        return dmutagenesis

    def get_sm(dmutagenesis, BEs, positions, codon, muti, cdni):
        """
        Fetches single nucleotide mutagenesis strategies.
        """
        for method in BEs:
            for posi in positions:
                if BEs[method][0] == codon[posi]:
                    for ntmut in BEs[method][1]:
                        if posi == 0:
                            codonmut = '{}{}{}'.format(ntmut, codon[1],
                                                       codon[2])
                        elif posi == 1:
                            codonmut = '{}{}{}'.format(codon[0], ntmut,
                                                       codon[2])
                        elif posi == 2:
                            codonmut = '{}{}{}'.format(codon[0], codon[1],
                                                       ntmut)
                        aamut = str(
                            Seq.Seq(codonmut,
                                    Alphabet.generic_dna).translate(table=1))
                        # if (aamut!='*') and (aamut!=aa): #  nonsence and synonymous
                        if muti == 0:
                            cdni = cdni
                        else:
                            cdni = len(dmutagenesis) + 1
                        muti += 1
                        ntwt = BEs[method][0]
                        if '-' in method.split(' on ')[1]:
                            ntwt = str(
                                Seq.Seq(
                                    ntwt,
                                    Alphabet.generic_dna).reverse_complement())
                            ntmut = str(
                                Seq.Seq(
                                    ntmut,
                                    Alphabet.generic_dna).reverse_complement())
                        dmutagenesis = write_dmutagenesis(
                            **{
                                'cdni': cdni,
                                'posi': posi + 1,
                                'codon': codon,
                                'codonmut': codonmut,
                                'ntwt': ntwt,
                                'ntmut': ntmut,
                                'aa': aa,
                                'aamut': aamut,
                                'method': method
                            })
        return dmutagenesis, muti

    def get_dm(dmutagenesis, BEs, positions_dm, codon, muti, cdni):
        """
        Fetches double nucleotide mutagenesis strategies.
        """
        for method in BEs:
            for posi1, posi2 in positions_dm:
                if (BEs[method][0] == codon[posi1]) and (BEs[method][0]
                                                         == codon[posi2]):
                    for ntmut1, ntmut2 in itertools.product(''.join(
                            BEs[method][1]),
                                                            repeat=2):
                        if (posi1 == 0) and (posi2 == 1):
                            codonmut = '{}{}{}'.format(ntmut1, ntmut2,
                                                       codon[2])
                        elif (posi1 == 1) and (posi2 == 2):
                            codonmut = '{}{}{}'.format(codon[0], ntmut1,
                                                       ntmut2)
                        elif (posi1 == 0) and (posi2 == 2):
                            codonmut = '{}{}{}'.format(ntmut1, codon[1],
                                                       ntmut2)
                        aamut = str(
                            Seq.Seq(codonmut,
                                    Alphabet.generic_dna).translate(table=1))
                        # if (aamut!='*') and (aamut!=aa): #  nonsence and synonymous
                        if muti == 0:
                            cdni = cdni
                        else:
                            cdni = len(dmutagenesis) + 1
                        muti += 1
                        ntwt = '{}{}'.format(BEs[method][0], BEs[method][0])
                        ntmut = '{}{}'.format(ntmut1, ntmut2)
                        if '-' in method.split(' on ')[1]:
                            ntwt = str(
                                Seq.Seq(
                                    ntwt,
                                    Alphabet.generic_dna).reverse_complement())
                            ntmut = str(
                                Seq.Seq(
                                    ntmut,
                                    Alphabet.generic_dna).reverse_complement())
                        dmutagenesis = write_dmutagenesis(
                            **{
                                'cdni': cdni,
                                'posi': '{}{}'.format(posi1, posi2),
                                'codon': codon,
                                'codonmut': codonmut,
                                'ntwt': ntwt,
                                'ntmut': ntmut,
                                'aa': aa,
                                'aamut': aamut,
                                'method': method
                            })
        return dmutagenesis, muti

    def get_tm(dmutagenesis, BEs, positions_tm, codon, muti, cdni):
        """
        Fetches triple nucleotide mutagenesis strategies.
        """
        for method in BEs:
            for posi1, posi2, posi3 in positions_tm:
                if (BEs[method][0] == codon[posi1]) and (
                        BEs[method][0] == codon[posi2]) and (BEs[method][0]
                                                             == codon[posi3]):
                    for ntmut1, ntmut2, ntmut3 in itertools.product(''.join(
                            BEs[method][1]),
                                                                    repeat=3):
                        codonmut = '{}{}{}'.format(ntmut1, ntmut2, ntmut3)
                        aamut = str(
                            Seq.Seq(codonmut,
                                    Alphabet.generic_dna).translate(table=1))
                        # if (aamut!='*') and (aamut!=aa): #  nonsence and synonymous
                        if muti == 0:
                            cdni = cdni
                        else:
                            cdni = len(dmutagenesis) + 1
                        muti += 1
                        ntwt = '{}{}{}'.format(BEs[method][0], BEs[method][0],
                                               BEs[method][0])
                        ntmut = '{}{}{}'.format(ntmut1, ntmut2, ntmut3)
                        if '-' in method.split(' on ')[1]:
                            ntwt = str(
                                Seq.Seq(
                                    ntwt,
                                    Alphabet.generic_dna).reverse_complement())
                            ntmut = str(
                                Seq.Seq(
                                    ntmut,
                                    Alphabet.generic_dna).reverse_complement())
                        dmutagenesis = write_dmutagenesis(
                            **{
                                'cdni': cdni,
                                'posi': '123',
                                'codon': codon,
                                'codonmut': codonmut,
                                'ntwt': ntwt,
                                'ntmut': ntmut,
                                'aa': aa,
                                'aamut': aamut,
                                'method': method
                            })
        return dmutagenesis, muti

    def get_dm_combo(dmutagenesis, BEs, positions_dm, codon, muti, cdni,
                     method):
        """
        Fetches double nucleotide mutagenesis strategies utilising 2 different base editors simultaneously.

        Every ordered pair of distinct editing methods acting on the same
        strand is tried on every pair of codon positions. Each mutated codon
        obtained this way is recorded through ``write_dmutagenesis``.

        :param dmutagenesis: table of mutagenesis strategies collected so far.
        :param BEs: dict keyed by method name; keys are split on ' on ' to get
            the strand part, so they are expected to look like
            '<editor> on <strand>'. ``BEs[key][0]`` is the wild-type
            nucleotide and ``BEs[key][1]`` the mutated nucleotide(s).
        :param positions_dm: iterable of (posi1, posi2) 0-based codon positions.
        :param codon: wild-type codon.
        :param muti: number of mutations recorded so far for this codon.
        :param cdni: row index to write to while ``muti`` is still 0.
        :param method: label prefix; the recorded method is
            '<method> on <strand part of method1>'.
        :returns: tuple (dmutagenesis, muti).

        NOTE(review): ``aa``, ``write_dmutagenesis``, ``Seq`` and ``Alphabet``
        are taken from the enclosing scope, not from the arguments.
        """

        def get_strand(be_method):
            # Split on ' on ' (with spaces), as the strand check below always
            # did; a bare 'on' would also split editor names containing "on"
            # and leave a leading space in the strand part.
            return be_method.split(' on ')[1]

        method_pairs = [
            m for m in itertools.product(BEs.keys(), repeat=2)
            if (get_strand(m[0]) == get_strand(m[1])) and (m[0] != m[1])
        ]
        for method1, method2 in method_pairs:
            strand = get_strand(method1)
            # Both methods share a strand, so checking method1 is enough.
            on_minus_strand = '-' in strand
            for posi1, posi2 in positions_dm:
                if (BEs[method1][0] != codon[posi1]) or (BEs[method2][0]
                                                         != codon[posi2]):
                    continue
                ntmut_pairs = itertools.product(''.join(BEs[method1][1]),
                                                ''.join(BEs[method2][1]))
                for ntmut1, ntmut2 in ntmut_pairs:
                    # Substitute by position so any position pair yields a
                    # well-defined mutated codon.
                    codon_nts = list(codon)
                    codon_nts[posi1] = ntmut1
                    codon_nts[posi2] = ntmut2
                    codonmut = ''.join(codon_nts)
                    aamut = str(
                        Seq.Seq(codonmut,
                                Alphabet.generic_dna).translate(table=1))
                    # The first mutation of a codon reuses the codon's own
                    # row; later ones are written at len(dmutagenesis) + 1.
                    if muti != 0:
                        cdni = len(dmutagenesis) + 1
                    muti += 1
                    ntwt = '{}{}'.format(BEs[method1][0], BEs[method2][0])
                    ntmut = '{}{}'.format(ntmut1, ntmut2)
                    if on_minus_strand:
                        ntwt = str(
                            Seq.Seq(ntwt, Alphabet.generic_dna)
                            .reverse_complement())
                        ntmut = str(
                            Seq.Seq(ntmut, Alphabet.generic_dna)
                            .reverse_complement())
                    dmutagenesis = write_dmutagenesis(
                        cdni=cdni,
                        posi='{}{}'.format(posi1, posi2),
                        codon=codon,
                        codonmut=codonmut,
                        ntwt=ntwt,
                        ntmut=ntmut,
                        aa=aa,
                        aamut=aamut,
                        method=method + ' on ' + strand)
        return dmutagenesis, muti

    def get_tm_combo(dmutagenesis, BEs, positions_tm, codon, muti, cdni,
                     method):
        """
        Fetches triple nucleotide mutagenesis strategies utilising at least 2 different base editing methods simultaneously.

        Every ordered triple of methods acting on the same strand is tried,
        except triples in which all three methods are identical. Each mutated
        codon obtained this way is recorded through ``write_dmutagenesis``.

        :param dmutagenesis: table of mutagenesis strategies collected so far.
        :param BEs: dict keyed by method name; keys are split on ' on ' to get
            the strand part, so they are expected to look like
            '<editor> on <strand>'. ``BEs[key][0]`` is the wild-type
            nucleotide and ``BEs[key][1]`` the mutated nucleotide(s).
        :param positions_tm: iterable of (posi1, posi2, posi3) 0-based codon
            positions.
        :param codon: wild-type codon.
        :param muti: number of mutations recorded so far for this codon.
        :param cdni: row index to write to while ``muti`` is still 0.
        :param method: label prefix; the recorded method is
            '<method> on <strand part of method1>'.
        :returns: tuple (dmutagenesis, muti).

        NOTE(review): ``aa``, ``write_dmutagenesis``, ``Seq`` and ``Alphabet``
        are taken from the enclosing scope, not from the arguments.
        """

        def get_strand(be_method):
            # Split on ' on ' (with spaces), as the strand check below always
            # did; a bare 'on' would also split editor names containing "on"
            # and leave a leading space in the strand part.
            return be_method.split(' on ')[1]

        method_triples = [
            m for m in itertools.product(BEs.keys(), repeat=3)
            if (get_strand(m[0]) == get_strand(m[1]) == get_strand(m[2]))
            and not (m[0] == m[1] == m[2])
        ]
        for method1, method2, method3 in method_triples:
            strand = get_strand(method1)
            # All three methods share a strand, so checking method1 is enough.
            on_minus_strand = '-' in strand
            for posi1, posi2, posi3 in positions_tm:
                if not ((BEs[method1][0] == codon[posi1]) and
                        (BEs[method2][0] == codon[posi2]) and
                        (BEs[method3][0] == codon[posi3])):
                    continue
                ntmut_triples = itertools.product(''.join(BEs[method1][1]),
                                                  ''.join(BEs[method2][1]),
                                                  ''.join(BEs[method3][1]))
                for ntmut1, ntmut2, ntmut3 in ntmut_triples:
                    codonmut = '{}{}{}'.format(ntmut1, ntmut2, ntmut3)
                    aamut = str(
                        Seq.Seq(codonmut,
                                Alphabet.generic_dna).translate(table=1))
                    # The first mutation of a codon reuses the codon's own
                    # row; later ones are written at len(dmutagenesis) + 1.
                    if muti != 0:
                        cdni = len(dmutagenesis) + 1
                    muti += 1
                    ntwt = '{}{}{}'.format(BEs[method1][0], BEs[method2][0],
                                           BEs[method3][0])
                    ntmut = '{}{}{}'.format(ntmut1, ntmut2, ntmut3)
                    if on_minus_strand:
                        ntwt = str(
                            Seq.Seq(ntwt, Alphabet.generic_dna)
                            .reverse_complement())
                        ntmut = str(
                            Seq.Seq(ntmut, Alphabet.generic_dna)
                            .reverse_complement())
                    dmutagenesis = write_dmutagenesis(
                        cdni=cdni,
                        posi='123',
                        codon=codon,
                        codonmut=codonmut,
                        ntwt=ntwt,
                        ntmut=ntmut,
                        aa=aa,
                        aamut=aamut,
                        method=method + ' on ' + strand)
        return dmutagenesis, muti

    # codon positions: 0-based index -> position label
    positions = {0: '@1st position', 1: '@2nd position', 2: '@3rd position'}
    #double nucleotide mutations
    positions_dm = [(i, j) for i in positions.keys() for j in positions.keys()
                    if i < j]
    # triple nucleotide mutations
    positions_tm = [[0, 1, 2]]

    dmutagenesis = dcodontable.copy()
    # test=True
    test = False
    for cdni in dmutagenesis.index:
        codon = dmutagenesis.loc[cdni, 'codon']
        aa = dmutagenesis.loc[cdni, 'amino acid']
        muti = 0
        if test:
            print(codon)
        # single nucleotide mutations
        dmutagenesis, muti = get_sm(dmutagenesis, BEs, positions, codon, muti,
                                    cdni)
        #double nucleotide mutations
        dmutagenesis, muti = get_dm(dmutagenesis, BEs, positions_dm, codon,
                                    muti, cdni)
        #triple nucleotide mutations
        dmutagenesis, muti = get_tm(dmutagenesis, BEs, positions_tm, codon,
                                    muti, cdni)
        # #double nucleotide mutations combinations
        # dmutagenesis,muti=get_dm_combo(dmutagenesis,BEs,positions_dm,codon,muti,cdni,method='undefined')
        # #triple nucleotide mutations combinations
        # dmutagenesis,muti=get_tm_combo(dmutagenesis,BEs,positions_tm,codon,muti,cdni,method='undefined')
    if len(dmutagenesis) == 0:
        from beditor.lib.global_vars import saveemptytable
        logging.warning('no guides designed; saving an empty table.')
        dmutagenesis = saveemptytable(cfg)
    else:
        dmutagenesis['nucleotide mutation: count'] = [
            len(s) for s in dmutagenesis['nucleotide mutation']
        ]
        dmutagenesis = dmutagenesis.sort_values('codon')
        # Adding information of Allowed activity window
        dmutagenesis = dmutagenesis.set_index('method').join(pos_muts)
        dmutagenesis = dmutagenesis.reset_index()

        from beditor.lib.io_seqs import reverse_complement_multintseq
        from beditor.lib.global_vars import nt2complement
        dmutagenesis['nucleotide: wild-type'] = dmutagenesis.apply(
            lambda x: x['nucleotide']
            if x['strand: mutation'] == '+' else reverse_complement_multintseq(
                x['nucleotide'], nt2complement),
            axis=1)
        dmutagenesis['nucleotide: mutation'] = dmutagenesis.apply(
            lambda x: x['nucleotide mutation']
            if x['strand: mutation'] == '+' else reverse_complement_multintseq(
                x['nucleotide mutation'], nt2complement),
            axis=1)

    return dmutagenesis
Beispiel #6
0
def dseq2dguides(cfg):
    """
    Wrapper around make guides function.

    Unless the guides table of this step already exists and ``cfg['force']``
    is falsy, this reads ``dmutagenesis.tsv`` from the previous step and
    ``dsequences.tsv`` from the step before that, writes this step's copies,
    calls ``make_guides`` and saves whatever it returned. When no guides are
    produced an empty table is saved instead.

    :param cfg: configuration dict. Keys read here: 'step', the step numbers
        themselves (mapped to directories), 'force', 'mutation_format',
        'test', 'make_control_pos', 'make_control_neg' and, optionally,
        'reverse_mutations'. 'datad' and 'plotd' are set by this function.
    :raises ValueError: if ``cfg['mutation_format']`` is neither
        'nucleotide' nor 'aminoacid'.
    """
    cfg['datad'] = cfg[cfg['step']]
    cfg['plotd'] = cfg['datad']
    dguideslinp = f"{cfg['datad']}/dguides.tsv"
    dguides_nofltp = f"{cfg['datad']}/dguides_noflt.tsv"
    dmutagenesisp = f"{cfg['datad']}/dmutagenesis.tsv"
    if not exists(dguideslinp) or cfg['force']:
        mutation_format = cfg['mutation_format']
        if mutation_format not in ('nucleotide', 'aminoacid'):
            # Fail early with a clear message; otherwise `dsequences` would
            # never be bound and the function would die with a NameError.
            raise ValueError(
                f"unsupported mutation_format: {mutation_format!r}; "
                "expected 'nucleotide' or 'aminoacid'")

        dmutagenesis = pd.read_csv(f"{cfg[cfg['step']-1]}/dmutagenesis.tsv",
                                   sep='\t',
                                   keep_default_na=False)
        # FIXME if numbering of steps is changed, this is gonna blow
        dsequences = pd.read_csv(f"{cfg[cfg['step']-2]}/dsequences.tsv",
                                 sep='\t',
                                 keep_default_na=False)
        if mutation_format == 'nucleotide':
            dsequences, dmutagenesis = dinnucleotide2dsequencesproper(
                dsequences, dmutagenesis)
        elif cfg.get('reverse_mutations'):
            # Keep only sequences whose mutation matches a mutagenesis
            # strategy and take the wild-type codon from that strategy.
            cols_dsequences = dsequences.columns.tolist()
            dsequences = pd.merge(dsequences,
                                  dmutagenesis,
                                  how='inner',
                                  left_on=[
                                      'aminoacid: wild-type',
                                      'codon: mutation',
                                      'amino acid mutation'
                                  ],
                                  right_on=[
                                      'amino acid', 'codon mutation',
                                      'amino acid mutation'
                                  ],
                                  suffixes=['', ': dmutagenesis'])
            dsequences['codon: wild-type'] = dsequences['codon']
            dsequences = dsequences.loc[:, cols_dsequences]

        dsequences.to_csv(f"{cfg[cfg['step']]}/dsequences.tsv", sep='\t')

        guides_saved = False
        if len(dsequences) != 0 and len(dmutagenesis) != 0:
            dmutagenesis['strand'] = [
                s.replace(' strand', '')
                for s in dmutagenesis['mutation on strand']
            ]
            dmutagenesis.to_csv(dmutagenesisp, sep='\t')

            (dguideslin, dguides_noflt, err2idxs, dguides_neg_control,
             dguides_pos_control) = make_guides(
                 cfg,
                 dsequences,
                 dmutagenesis,
                 test=cfg['test'],
                 # dbug=True,
             )
            if dguides_noflt is not None:
                dguides_noflt.to_csv(dguides_nofltp, sep='\t')
            # Only a real guides table can be written; the previous check
            # also entered this branch when just err2idxs was set and then
            # failed on `None.to_csv`.
            if dguideslin is not None:
                dguideslin.to_csv(dguideslinp, sep='\t')
                if cfg['test']:
                    logging.info(err2idxs)
                with open(dguideslinp + '.err.json', 'w') as f:
                    json.dump(err2idxs, f)
                if cfg['make_control_pos'] and dguides_pos_control is not None:
                    to_table(dguides_pos_control,
                             f"{dguideslinp}.pos_control.tsv")
                if cfg['make_control_neg'] and dguides_neg_control is not None:
                    to_table(dguides_neg_control,
                             f"{dguideslinp}.neg_control.tsv")
                guides_saved = True

        if not guides_saved:
            from beditor.lib.global_vars import saveemptytable
            logging.warning('no guides designed; saving an empty table.')
            saveemptytable(cfg, dguideslinp)

        import gc
        gc.collect()