def dseq2dguides(cfg):
    """
    Wrapper around make guides function.

    Reads the mutagenesis table written by the previous step and the
    sequences table written two steps back, builds the strand-wise PAM
    table for the requested PAMs and passes everything to ``make_guides``.
    All outputs are written into the directory of the current step
    (``cfg[cfg['step']]``). If no guides can be designed, an empty table is
    saved at the guides path instead.

    Nothing is recomputed when ``dguides.tsv`` already exists, unless
    ``cfg['force']`` is set.

    :param cfg: configuration dict.
    :raises ValueError: if ``cfg['mutation_format']`` is neither
        'nucleotide' nor 'aminoacid'.
    :raises SystemExit: if any of ``cfg['pams']`` is absent from the PAM table.
    """
    cfg['datad'] = cfg[cfg['step']]
    cfg['plotd'] = cfg['datad']
    dguideslinp = f"{cfg['datad']}/dguides.tsv"
    dguides_nofltp = f"{cfg['datad']}/dguides_noflt.tsv"
    dmutagenesisp = f"{cfg['datad']}/dmutagenesis.tsv"
    dpam_strandsp = f"{cfg['datad']}/dpam_strands.csv"
    if not exists(dguideslinp) or cfg['force']:
        # Fail early with a clear message; previously an unknown format left
        # `dsequences` unbound and crashed further down.
        if cfg['mutation_format'] not in ('nucleotide', 'aminoacid'):
            raise ValueError(
                f"unsupported mutation_format: {cfg['mutation_format']!r}; "
                "expected 'nucleotide' or 'aminoacid'")

        dmutagenesis = pd.read_csv(f"{cfg[cfg['step']-1]}/dmutagenesis.tsv",
                                   sep='\t')
        #FIXME if numbering of steps is changed, this is gonna blow
        dsequences = pd.read_csv(f"{cfg[cfg['step']-2]}/dsequences.tsv",
                                 sep='\t')
        if cfg['mutation_format'] == 'nucleotide':
            dsequences, dmutagenesis = dinnucleotide2dsequencesproper(
                dsequences, dmutagenesis)
        elif cfg.get('reverse_mutations'):
            # Reverse mode: take the wild-type codon from the matching
            # mutagenesis row, then restore the original column set.
            cols_dsequences = dsequences.columns.tolist()
            dsequences = pd.merge(
                dsequences,
                dmutagenesis,
                how='inner',
                left_on=['aminoacid: wild-type', 'codon: mutation',
                         'amino acid mutation'],
                right_on=['amino acid', 'codon mutation',
                          'amino acid mutation'],
                suffixes=['', ': dmutagenesis'])
            dsequences['codon: wild-type'] = dsequences['codon']
            dsequences = dsequences.loc[:, cols_dsequences]

        #FIXME if numbering of steps is changed, this is gonna blow
        dsequences.to_csv(f"{cfg[cfg['step']]}/dsequences.tsv", sep='\t')

        # make pam table
        dpam = pd.read_table(f'{dirname(realpath(__file__))}/../data/dpam.tsv')
        # Set difference instead of counting matches, so duplicated entries
        # on either side cannot produce a false verdict.
        if set(cfg['pams']) - set(dpam['PAM']):
            logging.error(f"PAM/s {cfg['pams']} are not supported {dpam['PAM'].tolist()}")
            sys.exit(1)
        dpam_strands = dpam2dpam_strands(dpam, pams=cfg['pams'])
        dpam_strands.to_csv(dpam_strandsp, sep='\t')

        guides_saved = False
        if not (len(dsequences) == 0 or len(dmutagenesis) == 0):
            dmutagenesis['strand'] = dmutagenesis.apply(
                lambda x: x['mutation on strand'].replace(' strand', ''),
                axis=1)
            dmutagenesis.to_csv(dmutagenesisp, sep='\t')
            dguideslin, dguides_noflt, err2idxs = make_guides(
                cfg,
                dsequences,
                dmutagenesis,
                dpam=dpam_strands,
                test=cfg['test'],
                # dbug=True,
            )
            if dguides_noflt is not None:
                dguides_noflt.to_csv(dguides_nofltp, sep='\t')
            if dguideslin is not None:
                dguideslin.to_csv(dguideslinp, sep='\t')
                guides_saved = True
            # Written whenever guides exist (as before), and additionally when
            # only error indices came back, so the diagnostics are not lost.
            if dguideslin is not None or err2idxs is not None:
                if cfg['test']:
                    logging.info(err2idxs)
                with open(dguideslinp + '.err.json', 'w') as f:
                    json.dump(err2idxs, f)

        if not guides_saved:
            from beditor.lib.global_vars import saveemptytable
            logging.warning('no guides designed; saving an empty table.')
            saveemptytable(cfg, dguideslinp)
        import gc
        gc.collect()
def get_seq_aminoacid(cfg, din):
    """
    Fetches sequences if mutation format is amino acid

    For every input row the transcript is looked up in the Ensembl release,
    the genomic positions of the targeted codon are located, and a flanked
    interval around the codon is written to a BED file. The flanking
    sequences are then extracted with ``bedtools getfasta`` and merged back
    with the input table.

    Side effects (all paths are taken from ``cfg``):

    * ``{datad}/dbedflank.bed`` and ``{datad}/dbedflank.bed.err.json``
      (transcripts skipped, grouped by reason),
    * ``{datad}/dbedflank.fa`` (bedtools output),
    * ``cfg['dseqtmpp']`` (intermediate joined table),
    * ``cfg['dsequencesp']`` (final sequences table, or an empty table if no
      valid sequence was found).

    ``din`` is re-indexed in place with a 0..n-1 range.

    :param cfg: configuration dict
    :param din: input data
    :returns: None; the sequences table is written to ``cfg['dsequencesp']``
    """
    import pyensembl
    #import ensembl object that would fetch genes
    # ensembl = pyensembl.EnsemblRelease(release=cfg['genomerelease'])
    ensembl = pyensembl.EnsemblRelease(
        species=pyensembl.species.Species.register(
            latin_name=cfg['host'],
            synonyms=[cfg['host']],
            reference_assemblies={
                cfg['genomeassembly']: (cfg['genomerelease'],
                                        cfg['genomerelease']),
            }),
        release=cfg['genomerelease'])

    din.index = range(len(din))
    dbedp = '{}/dbedflank.bed'.format(cfg['datad'])
    dbed = pd.DataFrame(columns=bed_colns)
    terrpositions = []
    terrnotfound = []
    terrnoncoding = []
    bedrowi = 0
    # Fetch the transcript ids once; calling transcript_ids() for every row
    # repeated the same lookup and a linear membership scan per input row.
    known_transcript_ids = set(ensembl.transcript_ids())
    # for i in trange(len(din)-1,desc='get positions for bedtools'):
    for i in din.index:
        transcript_id = din.loc[i, 'transcript: id']
        if transcript_id in known_transcript_ids:
            t = ensembl.transcript_by_id(transcript_id)
            if t.is_protein_coding and t.contains_start_codon and t.contains_stop_codon:
                coding_sequence_positions = tboundaries2positions(t)
                if len(coding_sequence_positions) == len(t.coding_sequence):
                    #TODO need to check if the seq made from coding_sequence_positions is same as t.coding_seqeunce
                    dcoding = t2pmapper(t, coding_sequence_positions)
                    dcodingmutpos = dcoding.loc[
                        (dcoding['protein index'] == din.loc[i, 'aminoacid: position']), :]
                    codon_positions = dcodingmutpos[
                        'coding sequence positions'].tolist()
                    if len(codon_positions) != 0:
                        dbed.loc[bedrowi, 'chromosome'] = t.contig
                        if cfg['test']:
                            print(transcript_id, codon_positions)
                        # 'codon start' is always the smaller genomic
                        # coordinate, whichever strand the transcript is on.
                        if t.strand == '+':
                            dbed.loc[bedrowi, 'codon start'] = codon_positions[0]
                            dbed.loc[bedrowi, 'codon end'] = codon_positions[2]
                        elif t.strand == '-':
                            dbed.loc[bedrowi, 'codon start'] = codon_positions[2]
                            dbed.loc[bedrowi, 'codon end'] = codon_positions[0]
                        dbed.loc[bedrowi, 'start'] = dbed.loc[
                            bedrowi, 'codon start'] - 22  #FIXME put flank in the yml
                        dbed.loc[bedrowi, 'end'] = dbed.loc[
                            bedrowi, 'codon end'] + 21  #FIXME put flank in the yml
                        dbed.loc[bedrowi, 'reference residue'] = dcodingmutpos[
                            'protein sequence'].tolist()[0]
                        dbed.loc[bedrowi, 'reference codon'] = ''.join(
                            dcodingmutpos['coding sequence'].tolist())
                        dbed.loc[bedrowi, 'strand'] = t.strand
                        dbed.loc[bedrowi, 'id'] = '{}|{}|{}|{}|{}'.format(
                            transcript_id,
                            dbed.loc[bedrowi, 'chromosome'],
                            dbed.loc[bedrowi, 'strand'],
                            int(dbed.loc[bedrowi, 'start']),
                            int(dbed.loc[bedrowi, 'end']))
                        dbed.loc[bedrowi, 'gene: id'] = t.gene_id
                        dbed.loc[bedrowi, 'gene: name'] = t.gene.name
                        dbed.loc[bedrowi, 'protein: id'] = t.protein_id
                        dbed.loc[bedrowi, 'aminoacid: position'] = din.loc[
                            i, 'aminoacid: position']
                        bedrowi += 1
                    else:
                        # requested residue index not present in the mapping
                        terrpositions.append(t.id)
                else:
                    # reconstructed positions disagree with the CDS length
                    terrpositions.append(t.id)
            else:
                terrnoncoding.append(t.id)
        else:
            terrnotfound.append(transcript_id)
            if cfg['test']:
                logging.error('not found: {}'.format(transcript_id))

    if len(dbed) == 0:
        from beditor.lib.global_vars import saveemptytable
        logging.warning('no valid seqeunces found; saving an empty table.')
        saveemptytable(cfg, f"{cfg['dsequencesp']}")
        return None

    # Keep only intervals with the expected width: 22 nt upstream + codon + 21 nt downstream.
    dbed = dbed.loc[(dbed.apply(lambda x: x['end'] - x['start'] == 45,
                                axis=1)), :]  #FIXME put flank in the yml
    dbed.loc[:, 'start'] = dbed.loc[:, 'start'].astype(int)
    dbed.loc[:, 'end'] = dbed.loc[:, 'end'].astype(int)

    dbed = dbed.drop_duplicates(subset=bed_colns)
    dbed.loc[:, bed_colns].to_csv(dbedp, sep='\t', header=False, index=False)
    err2tids = {
        'terrpositions': terrpositions,
        'terrnotfound': terrnotfound,
        'terrnoncoding': terrnoncoding,
    }
    if cfg['test']:
        print(err2tids)
    with open(dbedp + '.err.json', 'w') as outfile:
        json.dump(err2tids, outfile)

    bedp = f"{cfg['datad']}/dbedflank.bed"
    fastap = f"{cfg['datad']}/dbedflank.fa"
    cmd = f"{cfg['bedtools']} getfasta -s -name -fi {cfg['genomep']} -bed {bedp} -fo {fastap}"
    runbashcmd(cmd)

    dflankfa = fa2df(fastap, ids2cols=True)
    dflankfa.loc[:, 'sequence'] = dflankfa.loc[:, 'sequence'].apply(
        lambda x: x.upper())
    dflankfa.loc[:, 'sequence: length'] = [
        len(s) for s in dflankfa['sequence']
    ]
    # Drop everything from the first '(' in the fasta record names so they
    # match the 'id' column of the BED table.
    dflankfa.index = [idx.split('(')[0] for idx in dflankfa.index]
    dflankfa.index.name = 'id'
    dseq = set_index(dbed, 'id').join(set_index(dflankfa, 'id'), rsuffix='.1')
    # output column name -> column name in the joined table
    dseq2compatible = {
        'aminoacid: position': 'aminoacid: position',
        'gene: id': 'gene: id',
        'gene: name': 'gene: name',
        'protein: id': 'protein: id',
        'transcript: id': 'seqid',
        'transcript: sequence': 'sequence',
        'aminoacid: wild-type': 'reference residue',
        'codon: wild-type': 'reference codon',
        'contig': 'contig',
        'strand': 'strand',
        'start': 'start',
        'end': 'end',
        'codon start': 'codon start',
        'codon end': 'codon end',
    }
    if 'amino acid mutation' in dseq:
        dseq2compatible['amino acid mutation'] = 'amino acid mutation'
    dseq.to_csv(cfg['dseqtmpp'], sep='\t')

    dseq = dseq[list(dseq2compatible.values())]
    dseq.columns = list(dseq2compatible.keys())
    # dseq.to_csv('data/dseq.csv')

    logging.info(dseq.columns.tolist())
    logging.info(din.columns.tolist())
    dseq = pd.merge(dseq.reset_index(),
                    din,
                    on=['transcript: id', 'aminoacid: position'])
    logging.info(dseq.columns.tolist())
    # NOTE(review): the return value is discarded here, so this presumably
    # has no effect on `dseq` -- kept as-is so the written table is unchanged;
    # confirm against set_index() before removing.
    set_index(dseq, 'id')
    if 'reverse_mutations' in cfg:
        if cfg['reverse_mutations']:
            from beditor.lib.io_dfs import dfswapcols
            dseq = dfswapcols(dseq,
                              ['aminoacid: wild-type', 'amino acid mutation'])
            dseq['codon: mutation'] = dseq['codon: wild-type'].copy()

    dseq.to_csv(f"{cfg['dsequencesp']}", sep='\t')
    del ensembl
def dguides2offtargets(cfg):
    """
    All the processes in offtarget detection are here.

    Reads the guides designed in the previous step and runs the chain of
    alignment/annotation sub-steps, each of which takes the configuration
    and returns it (or None when there is nothing left to process).
    Intermediate files live in ``{datad}/tmp``.

    The resume point derived from the existing intermediate files only
    decides whether the first sub-step (``dguides2guidessam``) is re-run;
    the remaining sub-steps are always invoked while the configuration is
    valid.

    :param cfg: Configuration settings provided in .yml file
    :returns: the result of ``saveemptytable`` when the guides table is
        missing/empty or no alignment was found, otherwise None
    """
    from beditor.lib.global_vars import saveemptytable
    cfg['datad'] = cfg[cfg['step']]
    cfg['plotd'] = cfg['datad']
    dofftargetsp = '{}/dofftargets.tsv'.format(cfg['datad'])

    stepn = '04_offtargets'
    logging.info(stepn)
    dguidesp = f"{cfg[cfg['step']-1]}/dguides.tsv"
    if not exists(dguidesp):
        logging.warning(f"not found {dguidesp}")
        return saveemptytable(cfg, dofftargetsp)
    dguides = pd.read_csv(dguidesp, sep='\t')
    if len(dguides) == 0:
        logging.warning(f"dguides is empty.")
        return saveemptytable(cfg, dofftargetsp)

    cfg['datatmpd'] = f"{cfg['datad']}/tmp"
    for dp in [cfg['datatmpd']]:
        if not exists(dp):
            makedirs(dp)

    # sub-step number -> file name (in datatmpd) that marks it as done
    step2doutp = {
        1: '01_guides_guidel*.fa',
        2: '02_dalignbed.tsv',
        3: '03_annotations.bed',
        4: '04_dalignbedguides.tsv',
        5: '05_dalignedfasta.tsv',
        6: '06_dalignbedguidesseq.tsv',
        7: '07_dalignbedstats.tsv',
        8: '08_dannotsagg.tsv',
        9: '09_dalignbedannot.tsv',
        10: '10_daggbyguide.tsv',
    }
    cfg['dguidesp'] = dguidesp
    cfg['alignmentbedp'] = f"{cfg['datatmpd']}/02_alignment.bed"
    cfg['dalignbedp'] = f"{cfg['datatmpd']}/02_dalignbed.tsv"
    cfg['dalignbedguidesp'] = f"{cfg['datatmpd']}/04_dalignbedguides.tsv"
    cfg['dalignedfastap'] = f"{cfg['datatmpd']}/05_dalignedfasta.tsv"
    cfg['dalignbedguidesseqp'] = f"{cfg['datatmpd']}/06_dalignbedguidesseq.tsv"
    cfg['dalignbedstatsp'] = f"{cfg['datatmpd']}/07_dalignbedstats.tsv"
    cfg['dannotsaggp'] = f"{cfg['datatmpd']}/08_dannotsagg.tsv"
    cfg['dalignbedannotp'] = f"{cfg['datatmpd']}/09_dalignbedannot.tsv"
    cfg['daggbyguidep'] = f"{cfg['datatmpd']}/10_daggbyguide.tsv"

    #check which step to process
    for step in range(2, 10 + 1, 1):
        if not exists(f"{cfg['datatmpd']}/{step2doutp[step]}"):
            if step == 2:
                # nothing aligned yet: start from the very beginning
                step = 'all'
            break
    logging.info(f'process from step:{step}')
    cfg['dofftargetsp'] = '{}/dofftargets.tsv'.format(cfg['datad'])
    if not exists(cfg['dofftargetsp']) or cfg['force']:
        # Sub-steps signal "nothing to process" by returning None, so keep a
        # handle on the real configuration for the failure path below.
        cfg_in = cfg
        if step == 1 or step == 'all':
            cfg = dguides2guidessam(cfg, dguides)
        substeps = (
            guidessam2dalignbed,
            dalignbed2annotationsbed,
            dalignbed2dalignbedguides,
            alignmentbed2dalignedfasta,
            dalignbed2dalignbedguidesseq,
            dalignbedguidesseq2dalignbedstats,
            dannots2dalignbed2dannotsagg,
            dannotsagg2dannots2dalignbedannot,
            dalignbedannot2daggbyguide,
        )
        for substep in substeps:
            if cfg is None:
                # a previous sub-step found nothing; do not feed None onwards
                break
            cfg = substep(cfg)

        if cfg is None:
            logging.warning(f"no alignment found")
            # Previously this indexed into None and raised TypeError.
            cfg = cfg_in
            cfg['step'] = 4
            return saveemptytable(cfg, cfg['dofftargetsp'])
        import gc
        gc.collect()
def get_possible_mutagenesis(
        cfg,
        dcodontable,
        dcodonusage,
        BEs,
        pos_muts,
        host,
):
    """
    Assesses possible mutagenesis strategies, given the set of Base editors and positions of mutations.

    This variant only enumerates single nucleotide substitutions. For every
    codon of the codon table and every base editor, each codon position
    whose nucleotide equals the editor's target is substituted by each of
    the editor's products; the resulting codon is translated (standard
    table) and stored as one row.

    :param cfg: configuration dict; only handed to ``saveemptytable`` when
        the table is empty
    :param dcodontable: Codon table; needs 'codon' and 'amino acid' columns
    :param dcodonusage: Codon usage table indexed by codon, with 'Fraction'
        and 'Frequency' columns
    :param BEs: Base editors (dict), see global_vars.py. Keys look like
        '<method> on <strand> strand'; value[0] is the targeted nucleotide
        and value[1] the iterable of nucleotides it can be converted to
    :param pos_muts: positions of mutations; joined on the method name
    :param host: host organism (not used in this function)
    :returns: possible mutagenesis strategies as a pandas dataframe
    """

    def write_dmutagenesis(cdni, posi, codon, codonmut, ntwt, ntmut, aa,
                           aamut, method):
        """
        Write dmutagenesis table for each iteraction in get_possible_mutagenesis.

        Fills row ``cdni`` of the enclosing ``dmutagenesis`` table (creating
        it if needed) and returns the table.
        """
        dmutagenesis.loc[cdni, 'codon'] = codon
        dmutagenesis.loc[cdni, 'position of mutation in codon'] = int(posi)
        dmutagenesis.loc[cdni, 'codon mutation'] = codonmut
        dmutagenesis.loc[cdni, 'nucleotide'] = ntwt
        dmutagenesis.loc[cdni, 'nucleotide mutation'] = ntmut
        dmutagenesis.loc[cdni, 'amino acid'] = aa
        dmutagenesis.loc[cdni, 'amino acid mutation'] = aamut
        dmutagenesis.loc[cdni, 'mutation on strand'] = method.split(' on ')[1]
        dmutagenesis.loc[cdni, 'strand: mutation'] = method.split(
            ' on ')[1].replace(' strand', '')
        dmutagenesis.loc[cdni, 'method'] = method.split(' on ')[0]
        dmutagenesis.loc[cdni, 'codon mutation usage Fraction'] = dcodonusage.loc[
            codonmut, 'Fraction']
        dmutagenesis.loc[cdni, 'codon mutation usage Frequency'] = dcodonusage.loc[
            codonmut, 'Frequency']
        return dmutagenesis

    def get_sm(dmutagenesis, BEs, positions, codon, muti, cdni):
        """
        Fetches single nucleotide mutagenesis strategies.

        :returns: the updated table and the number of strategies recorded
            so far for this codon
        """
        for method in BEs:
            for posi in positions:
                if BEs[method][0] == codon[posi]:
                    for ntmut in BEs[method][1]:
                        if posi == 0:
                            codonmut = '{}{}{}'.format(ntmut, codon[1], codon[2])
                        elif posi == 1:
                            codonmut = '{}{}{}'.format(codon[0], ntmut, codon[2])
                        elif posi == 2:
                            codonmut = '{}{}{}'.format(codon[0], codon[1], ntmut)
                        # No alphabet argument: Bio.Alphabet was removed from
                        # Biopython and is not needed for DNA translation.
                        aamut = str(Seq.Seq(codonmut).translate(table=1))
                        # if (aamut!='*') and (aamut!=aa): #  nonsence and synonymous
                        # The first strategy re-uses the codon's own row,
                        # later ones are appended as new rows.
                        if muti != 0:
                            cdni = len(dmutagenesis) + 1
                        muti += 1
                        ntwt = BEs[method][0]
                        if '-' in method.split(' on ')[1]:
                            # editor acts on the opposite strand: report the
                            # nucleotides as seen on that strand
                            ntwt = str(Seq.Seq(ntwt).reverse_complement())
                            ntmut = str(Seq.Seq(ntmut).reverse_complement())
                        dmutagenesis = write_dmutagenesis(
                            cdni=cdni,
                            posi=posi + 1,  # stored 1-based
                            codon=codon,
                            codonmut=codonmut,
                            ntwt=ntwt,
                            ntmut=ntmut,
                            aa=aa,
                            aamut=aamut,
                            method=method,
                        )
        return dmutagenesis, muti

    # 0-based positions within a codon
    positions = {0: '@1st position', 1: '@2nd position', 2: '@3rd position'}

    dmutagenesis = dcodontable.copy()
    for cdni in dmutagenesis.index:
        codon = dmutagenesis.loc[cdni, 'codon']
        aa = dmutagenesis.loc[cdni, 'amino acid']
        muti = 0
        #single nucleuotide mutations
        dmutagenesis, muti = get_sm(dmutagenesis, BEs, positions, codon, muti,
                                    cdni)

    if len(dmutagenesis) == 0:
        from beditor.lib.global_vars import saveemptytable
        logging.warning('no guides designed; saving an empty table.')
        dmutagenesis = saveemptytable(cfg)
    else:
        # Drops every row with a missing value, e.g. codons for which no
        # strategy was recorded.
        dmutagenesis = dmutagenesis.dropna()  #FIXME
        dmutagenesis['nucleotide mutation: count'] = [
            len(s) for s in dmutagenesis['nucleotide mutation']
        ]
        dmutagenesis = dmutagenesis.sort_values('codon')
        # Adding information of Allowed activity window
        dmutagenesis = dmutagenesis.set_index('method').join(pos_muts)
        dmutagenesis = dmutagenesis.reset_index()

        from beditor.lib.io_seqs import reverse_complement_multintseq
        from beditor.lib.global_vars import nt2complement
        dmutagenesis['nucleotide: wild-type'] = dmutagenesis.apply(
            lambda x: x['nucleotide'] if x['strand: mutation'] == '+' else
            reverse_complement_multintseq(x['nucleotide'], nt2complement),
            axis=1)
        dmutagenesis['nucleotide: mutation'] = dmutagenesis.apply(
            lambda x: x['nucleotide mutation']
            if x['strand: mutation'] == '+' else reverse_complement_multintseq(
                x['nucleotide mutation'], nt2complement),
            axis=1)
    return dmutagenesis
def get_possible_mutagenesis(
        cfg,
        dcodontable,
        dcodonusage,
        BEs,
        pos_muts,
        host,
):
    """
    Assesses possible mutagenesis strategies, given the set of Base editors and positions of mutations.

    For every codon of the codon table, single, double and triple
    nucleotide substitutions achievable by one base editor are enumerated;
    each mutated codon is translated (standard table) and stored as one
    row. Helpers that combine several editors are defined but currently not
    called.

    'position of mutation in codon' is stored 1-based: 1/2/3 for single,
    12/13/23 for double and 123 for triple substitutions.

    :param cfg: configuration dict; only handed to ``saveemptytable`` when
        the table is empty
    :param dcodontable: Codon table; needs 'codon' and 'amino acid' columns
    :param dcodonusage: Codon usage table indexed by codon, with 'Fraction'
        and 'Frequency' columns
    :param BEs: Base editors (dict), see global_vars.py. Keys look like
        '<method> on <strand> strand'; value[0] is the targeted nucleotide
        and value[1] the iterable of nucleotides it can be converted to
    :param pos_muts: positions of mutations; joined on the method name
    :param host: host organism (not used in this function)
    :returns: possible mutagenesis strategies as a pandas dataframe
    """

    def write_dmutagenesis(cdni, posi, codon, codonmut, ntwt, ntmut, aa,
                           aamut, method):
        """
        Write dmutagenesis table for each iteraction in get_possible_mutagenesis.

        Fills row ``cdni`` of the enclosing ``dmutagenesis`` table (creating
        it if needed) and returns the table.
        """
        dmutagenesis.loc[cdni, 'codon'] = codon
        dmutagenesis.loc[cdni, 'position of mutation in codon'] = int(posi)
        dmutagenesis.loc[cdni, 'codon mutation'] = codonmut
        dmutagenesis.loc[cdni, 'nucleotide'] = ntwt
        dmutagenesis.loc[cdni, 'nucleotide mutation'] = ntmut
        dmutagenesis.loc[cdni, 'amino acid'] = aa
        dmutagenesis.loc[cdni, 'amino acid mutation'] = aamut
        dmutagenesis.loc[cdni, 'mutation on strand'] = method.split(' on ')[1]
        dmutagenesis.loc[cdni, 'strand: mutation'] = method.split(
            ' on ')[1].replace(' strand', '')
        dmutagenesis.loc[cdni, 'method'] = method.split(' on ')[0]
        dmutagenesis.loc[cdni, 'codon mutation usage Fraction'] = dcodonusage.loc[
            codonmut, 'Fraction']
        dmutagenesis.loc[cdni, 'codon mutation usage Frequency'] = dcodonusage.loc[
            codonmut, 'Frequency']
        return dmutagenesis

    def substitute(codon, replacements):
        """Return the codon with the (0-based position, nucleotide) pairs applied."""
        nts = [codon[0], codon[1], codon[2]]
        for posi, nt in replacements:
            nts[posi] = nt
        return ''.join(nts)

    def record_strategy(dmutagenesis, muti, cdni, posi, codon, codonmut, ntwt,
                        ntmut, method):
        """
        Translate the mutated codon and store one strategy row.

        The first strategy of a codon re-uses the codon's own row ``cdni``;
        later ones are appended as new rows. Returns the table and the
        incremented strategy counter.
        """
        # No alphabet argument: Bio.Alphabet was removed from Biopython and
        # is not needed for DNA translation / reverse complement.
        aamut = str(Seq.Seq(codonmut).translate(table=1))
        # if (aamut!='*') and (aamut!=aa): #  nonsence and synonymous
        rowi = cdni if muti == 0 else len(dmutagenesis) + 1
        if '-' in method.split(' on ')[1]:
            # editor acts on the opposite strand: report the nucleotides as
            # seen on that strand
            ntwt = str(Seq.Seq(ntwt).reverse_complement())
            ntmut = str(Seq.Seq(ntmut).reverse_complement())
        dmutagenesis = write_dmutagenesis(
            cdni=rowi,
            posi=posi,
            codon=codon,
            codonmut=codonmut,
            ntwt=ntwt,
            ntmut=ntmut,
            aa=aa,
            aamut=aamut,
            method=method,
        )
        return dmutagenesis, muti + 1

    def get_sm(dmutagenesis, BEs, positions, codon, muti, cdni):
        """
        Fetches single nucleotide mutagenesis strategies.
        """
        for method in BEs:
            for posi in positions:
                if BEs[method][0] == codon[posi]:
                    for ntmut in BEs[method][1]:
                        dmutagenesis, muti = record_strategy(
                            dmutagenesis, muti, cdni,
                            posi=posi + 1,
                            codon=codon,
                            codonmut=substitute(codon, [(posi, ntmut)]),
                            ntwt=BEs[method][0],
                            ntmut=ntmut,
                            method=method)
        return dmutagenesis, muti

    def get_dm(dmutagenesis, BEs, positions_dm, codon, muti, cdni):
        """
        Fetches double nucleotide mutagenesis strategies.
        """
        for method in BEs:
            for posi1, posi2 in positions_dm:
                if (BEs[method][0] == codon[posi1]) and (BEs[method][0] == codon[posi2]):
                    for ntmut1, ntmut2 in itertools.product(
                            ''.join(BEs[method][1]), repeat=2):
                        dmutagenesis, muti = record_strategy(
                            dmutagenesis, muti, cdni,
                            # 1-based like the single ('1') and triple ('123')
                            # cases; 0-based '01'/'02' collapsed to 1/2 in
                            # int() and collided with single positions.
                            posi='{}{}'.format(posi1 + 1, posi2 + 1),
                            codon=codon,
                            codonmut=substitute(
                                codon, [(posi1, ntmut1), (posi2, ntmut2)]),
                            ntwt='{}{}'.format(BEs[method][0], BEs[method][0]),
                            ntmut='{}{}'.format(ntmut1, ntmut2),
                            method=method)
        return dmutagenesis, muti

    def get_tm(dmutagenesis, BEs, positions_tm, codon, muti, cdni):
        """
        Fetches triple nucleotide mutagenesis strategies.
        """
        for method in BEs:
            for posi1, posi2, posi3 in positions_tm:
                if (BEs[method][0] == codon[posi1]) and (
                        BEs[method][0] == codon[posi2]) and (
                            BEs[method][0] == codon[posi3]):
                    for ntmut1, ntmut2, ntmut3 in itertools.product(
                            ''.join(BEs[method][1]), repeat=3):
                        dmutagenesis, muti = record_strategy(
                            dmutagenesis, muti, cdni,
                            posi='123',
                            codon=codon,
                            codonmut='{}{}{}'.format(ntmut1, ntmut2, ntmut3),
                            ntwt='{}{}{}'.format(BEs[method][0],
                                                 BEs[method][0],
                                                 BEs[method][0]),
                            ntmut='{}{}{}'.format(ntmut1, ntmut2, ntmut3),
                            method=method)
        return dmutagenesis, muti

    def get_dm_combo(dmutagenesis, BEs, positions_dm, codon, muti, cdni,
                     method):
        """
        Fetches double nucleotide mutagenesis strategies utilising 2 different base editors simultaneously.

        Only editors acting on the same strand are combined. ``method`` is
        the label stored for the combination.
        """
        # Split on ' on ' (with spaces), as everywhere else: a bare 'on'
        # left a leading space in the strand label and broke on editor
        # names that contain "on".
        methods = [
            m for m in itertools.product(BEs.keys(), repeat=2)
            if (m[0].split(' on ')[1] == m[1].split(' on ')[1]) and (m[0] != m[1])
        ]
        for method1, method2 in methods:
            for posi1, posi2 in positions_dm:
                if (BEs[method1][0] == codon[posi1]) and (BEs[method2][0] == codon[posi2]):
                    ntmuts = [(n1, n2) for n1 in ''.join(BEs[method1][1])
                              for n2 in ''.join(BEs[method2][1])]
                    for ntmut1, ntmut2 in ntmuts:
                        dmutagenesis, muti = record_strategy(
                            dmutagenesis, muti, cdni,
                            posi='{}{}'.format(posi1 + 1, posi2 + 1),
                            codon=codon,
                            codonmut=substitute(
                                codon, [(posi1, ntmut1), (posi2, ntmut2)]),
                            ntwt='{}{}'.format(BEs[method1][0],
                                               BEs[method2][0]),
                            ntmut='{}{}'.format(ntmut1, ntmut2),
                            method=method + ' on ' + method1.split(' on ')[1])
        return dmutagenesis, muti

    def get_tm_combo(dmutagenesis, BEs, positions_tm, codon, muti, cdni,
                     method):
        """
        Fetches triple nucleotide mutagenesis strategies utilising up to 3 different base editors simultaneously.

        Only editors acting on the same strand are combined, and at least
        two of the three must differ. ``method`` is the label stored for the
        combination.
        """
        methods = [
            m for m in itertools.product(BEs.keys(), repeat=3)
            if (m[0].split(' on ')[1] == m[1].split(' on ')[1] == m[2].split(' on ')[1])
            and not (m[0] == m[1] == m[2])
        ]
        for method1, method2, method3 in methods:
            for posi1, posi2, posi3 in positions_tm:
                if (BEs[method1][0] == codon[posi1]) and (
                        BEs[method2][0] == codon[posi2]) and (
                            BEs[method3][0] == codon[posi3]):
                    ntmuts = [(n1, n2, n3) for n1 in ''.join(BEs[method1][1])
                              for n2 in ''.join(BEs[method2][1])
                              for n3 in ''.join(BEs[method3][1])]
                    for ntmut1, ntmut2, ntmut3 in ntmuts:
                        dmutagenesis, muti = record_strategy(
                            dmutagenesis, muti, cdni,
                            posi='123',
                            codon=codon,
                            codonmut='{}{}{}'.format(ntmut1, ntmut2, ntmut3),
                            ntwt='{}{}{}'.format(BEs[method1][0],
                                                 BEs[method2][0],
                                                 BEs[method3][0]),
                            ntmut='{}{}{}'.format(ntmut1, ntmut2, ntmut3),
                            method=method + ' on ' + method1.split(' on ')[1])
        return dmutagenesis, muti

    # 0-based positions within a codon
    positions = {0: '@1st position', 1: '@2nd position', 2: '@3rd position'}
    # position pairs for double nucleotide mutations
    positions_dm = [(i, j) for i in positions.keys() for j in positions.keys()
                    if i < j]
    # the single position triple for triple nucleotide mutations
    positions_tm = [[0, 1, 2]]

    dmutagenesis = dcodontable.copy()
    for cdni in dmutagenesis.index:
        codon = dmutagenesis.loc[cdni, 'codon']
        aa = dmutagenesis.loc[cdni, 'amino acid']
        muti = 0
        #single nucleuotide mutations
        dmutagenesis, muti = get_sm(dmutagenesis, BEs, positions, codon, muti,
                                    cdni)
        #double nucleotide mutations
        dmutagenesis, muti = get_dm(dmutagenesis, BEs, positions_dm, codon,
                                    muti, cdni)
        #triple nucleotide mutations
        dmutagenesis, muti = get_tm(dmutagenesis, BEs, positions_tm, codon,
                                    muti, cdni)
        # Multi-editor combinations are currently disabled:
        # #double nucleotide mutations combinations
        # dmutagenesis,muti=get_dm_combo(dmutagenesis,BEs,positions_dm,codon,muti,cdni,method='undefined')
        # #triple nucleotide mutations combinations
        # dmutagenesis,muti=get_tm_combo(dmutagenesis,BEs,positions_tm,codon,muti,cdni,method='undefined')

    if len(dmutagenesis) == 0:
        from beditor.lib.global_vars import saveemptytable
        logging.warning('no guides designed; saving an empty table.')
        dmutagenesis = saveemptytable(cfg)
    else:
        # Codons for which no strategy was recorded carry no mutation; drop
        # them so the length computation below does not fail on NaN.
        dmutagenesis = dmutagenesis.dropna(subset=['nucleotide mutation'])
        dmutagenesis['nucleotide mutation: count'] = [
            len(s) for s in dmutagenesis['nucleotide mutation']
        ]
        dmutagenesis = dmutagenesis.sort_values('codon')
        # Adding information of Allowed activity window
        dmutagenesis = dmutagenesis.set_index('method').join(pos_muts)
        dmutagenesis = dmutagenesis.reset_index()

        from beditor.lib.io_seqs import reverse_complement_multintseq
        from beditor.lib.global_vars import nt2complement
        dmutagenesis['nucleotide: wild-type'] = dmutagenesis.apply(
            lambda x: x['nucleotide'] if x['strand: mutation'] == '+' else
            reverse_complement_multintseq(x['nucleotide'], nt2complement),
            axis=1)
        dmutagenesis['nucleotide: mutation'] = dmutagenesis.apply(
            lambda x: x['nucleotide mutation']
            if x['strand: mutation'] == '+' else reverse_complement_multintseq(
                x['nucleotide mutation'], nt2complement),
            axis=1)
    return dmutagenesis
def dseq2dguides(cfg):
    """
    Wrapper around make guides function.

    Reads the mutagenesis table written by the previous step and the
    sequences table written two steps back and passes them to
    ``make_guides``. All outputs (guides, unfiltered guides, error indices
    and, if requested, positive/negative control guides) are written into
    the directory of the current step (``cfg[cfg['step']]``). If no guides
    can be designed, an empty table is saved at the guides path instead.

    Nothing is recomputed when ``dguides.tsv`` already exists, unless
    ``cfg['force']`` is set.

    :param cfg: configuration dict.
    :raises ValueError: if ``cfg['mutation_format']`` is neither
        'nucleotide' nor 'aminoacid'.
    """
    cfg['datad'] = cfg[cfg['step']]
    cfg['plotd'] = cfg['datad']
    dguideslinp = f"{cfg['datad']}/dguides.tsv"
    dguides_nofltp = f"{cfg['datad']}/dguides_noflt.tsv"
    dmutagenesisp = f"{cfg['datad']}/dmutagenesis.tsv"
    if not exists(dguideslinp) or cfg['force']:
        # Fail early with a clear message; previously an unknown format left
        # `dsequences` unbound and crashed further down.
        if cfg['mutation_format'] not in ('nucleotide', 'aminoacid'):
            raise ValueError(
                f"unsupported mutation_format: {cfg['mutation_format']!r}; "
                "expected 'nucleotide' or 'aminoacid'")

        # keep_default_na=False: strings such as 'NA' stay strings instead
        # of being parsed as missing values
        dmutagenesis = pd.read_csv(f"{cfg[cfg['step']-1]}/dmutagenesis.tsv",
                                   sep='\t',
                                   keep_default_na=False)
        #FIXME if numbering of steps is changed, this is gonna blow
        dsequences = pd.read_csv(f"{cfg[cfg['step']-2]}/dsequences.tsv",
                                 sep='\t',
                                 keep_default_na=False)
        if cfg['mutation_format'] == 'nucleotide':
            dsequences, dmutagenesis = dinnucleotide2dsequencesproper(
                dsequences, dmutagenesis)
        elif cfg.get('reverse_mutations'):
            # Reverse mode: take the wild-type codon from the matching
            # mutagenesis row, then restore the original column set.
            cols_dsequences = dsequences.columns.tolist()
            dsequences = pd.merge(
                dsequences,
                dmutagenesis,
                how='inner',
                left_on=['aminoacid: wild-type', 'codon: mutation',
                         'amino acid mutation'],
                right_on=['amino acid', 'codon mutation',
                          'amino acid mutation'],
                suffixes=['', ': dmutagenesis'])
            dsequences['codon: wild-type'] = dsequences['codon']
            dsequences = dsequences.loc[:, cols_dsequences]

        dsequences.to_csv(f"{cfg[cfg['step']]}/dsequences.tsv", sep='\t')

        guides_saved = False
        if not (len(dsequences) == 0 or len(dmutagenesis) == 0):
            dmutagenesis['strand'] = dmutagenesis.apply(
                lambda x: x['mutation on strand'].replace(' strand', ''),
                axis=1)
            dmutagenesis.to_csv(dmutagenesisp, sep='\t')
            (dguideslin, dguides_noflt, err2idxs, dguides_neg_control,
             dguides_pos_control) = make_guides(
                 cfg,
                 dsequences,
                 dmutagenesis,
                 test=cfg['test'],
                 # dbug=True,
             )
            if dguides_noflt is not None:
                dguides_noflt.to_csv(dguides_nofltp, sep='\t')
            if dguideslin is not None:
                dguideslin.to_csv(dguideslinp, sep='\t')
                guides_saved = True
                # Control flags are optional settings; a missing key means
                # "do not write controls".
                if cfg.get('make_control_pos') and dguides_pos_control is not None:
                    to_table(dguides_pos_control,
                             f"{dguideslinp}.pos_control.tsv")
                if cfg.get('make_control_neg') and dguides_neg_control is not None:
                    to_table(dguides_neg_control,
                             f"{dguideslinp}.neg_control.tsv")
            # Written whenever guides exist (as before), and additionally when
            # only error indices came back, so the diagnostics are not lost.
            if dguideslin is not None or err2idxs is not None:
                if cfg['test']:
                    logging.info(err2idxs)
                with open(dguideslinp + '.err.json', 'w') as f:
                    json.dump(err2idxs, f)

        if not guides_saved:
            from beditor.lib.global_vars import saveemptytable
            logging.warning('no guides designed; saving an empty table.')
            saveemptytable(cfg, dguideslinp)
        import gc
        gc.collect()