def alignmentbed2dalignedfasta(cfg):
    """
    Get sequences in FASTA format from BED file
    step#5

    Runs ``bedtools getfasta`` on the alignment BED file (unless the
    intermediate FASTA already exists), drops every aligned sequence that
    contains an ``N`` and writes the remaining sequences as a tab-separated
    table to ``cfg['dalignedfastap']``. Nothing is recomputed when that table
    already exists and ``cfg['force']`` is falsy.

    :param cfg: configuration dict; keys read here are ``datatmpd``,
        ``alignmentbedp``, ``dalignedfastap``, ``force``, ``bedtools`` and
        ``genomep``
    :returns: the same ``cfg`` object that was passed in
    """
    datatmpd = cfg['datatmpd']
    alignmentbedp = cfg['alignmentbedp']
    dalignedfastap = cfg['dalignedfastap']
    logging.info(basename(dalignedfastap))

    # Output already present and no forced re-run requested: nothing to do.
    if exists(dalignedfastap) and not cfg['force']:
        return cfg

    alignedfastap = '{}/05_alignment.fa'.format(datatmpd)
    if not exists(alignedfastap) or cfg['force']:
        cmd = f"{cfg['bedtools']} getfasta -s -name -fi {cfg['genomep']} -bed {alignmentbedp} -fo {alignedfastap}"
        runbashcmd(cmd)

    dalignedfasta = fa2df(alignedfastap)
    dalignedfasta.columns = ['aligned sequence']

    # FIXME bwa aligns to NNNNNs
    # The mask is built with a plain comprehension instead of a row-wise
    # DataFrame.apply so that a table without any rows simply yields an
    # empty mask.
    has_no_n = ['N' not in seq for seq in dalignedfasta['aligned sequence']]
    dalignedfasta = dalignedfasta.loc[has_no_n, :]

    # for bedtools 2.27, the fasta header now has hanging (+) or (-)
    dalignedfasta.index = [i.split('(')[0] for i in dalignedfasta.index]
    dalignedfasta.index.name = 'id'
    dalignedfasta.to_csv(dalignedfastap, sep='\t')
    return cfg
def get_seq_nucleotide(cfg,din):
    """
    Fetches sequences if mutation format is nucleotide

    Builds a BED file of flanked genome coordinates, extracts the sequences
    with ``bedtools getfasta``, merges them with the input table on
    ``'genome coordinate'`` and derives the wild-type nucleotide, wild-type
    codon and mutated codon columns from the flanking sequence. The result is
    written as a tab-separated table to ``cfg['dsequencesp']``. Nothing is
    done when that file already exists and ``cfg['force']`` is falsy.

    :param cfg: configuration dict; keys read here are ``datad``,
        ``dsequencesp``, ``force``, ``bedtools``, ``genomep`` and, optionally,
        ``reverse_mutations``
    :param din: input data; must provide the columns ``'genome coordinate'``
        and ``'nucleotide mutation'``
    :returns: None; the table of sequences is saved to ``cfg['dsequencesp']``
    """
    bedp = f"{cfg['datad']}/dbedntmuts.bed"
    fastap = f"{cfg['datad']}/dbedntmuts.fa"
    dbedntmutsp = f"{cfg['datad']}/dbedntmuts.tsv"

    # Output already present and no forced re-run requested: nothing to do.
    if exists(cfg['dsequencesp']) and not cfg['force']:
        return None

    if not exists(bedp) or cfg['force']:
        dbed = genomeocoords2bed(din, col_genomeocoord='genome coordinate')
        # widen every interval by the module-level flank size `flankntc`
        dbed['start'] = dbed['start'].astype(int) - flankntc - 1
        dbed['end'] = dbed['end'].astype(int) + flankntc
        dbed.to_csv(bedp, sep='\t', header=False, index=False)

    if not exists(fastap) or cfg['force']:
        cmd = f"{cfg['bedtools']} getfasta -s -name -fi {cfg['genomep']} -bed {bedp} -fo {fastap}"
        runbashcmd(cmd)

    if not exists(dbedntmutsp) or cfg['force']:
        dbedntmuts = fa2df(fastap)
        dbedntmuts.columns = ['transcript: sequence']
        dbedntmuts['transcript: sequence'] = [
            seq.upper() for seq in dbedntmuts['transcript: sequence']
        ]
        dbedntmuts = dbedntmuts.reset_index()
        # strip the strand suffix, e.g. "(+)", from the fasta header
        dbedntmuts['genome coordinate'] = [
            header.split('(')[0] for header in dbedntmuts['id']
        ]
        dbedntmuts.to_csv(dbedntmutsp, sep='\t')
    else:
        dbedntmuts = pd.read_table(dbedntmutsp, keep_default_na=False)

    dsequences = pd.merge(din, dbedntmuts,
                          on=['genome coordinate'],
                          suffixes=('', ': dbedntmuts'))
    dsequences = del_Unnamed(dsequences)

    # Columns that came with the input are not overwritten; the values
    # derived from the flanking sequence go to a suffixed column instead.
    col_nt_wt = ('nucleotide wild-type'
                 if 'nucleotide wild-type' not in dsequences
                 else 'nucleotide wild-type: from flanking sequence')
    col_cd_wt = ('codon: wild-type'
                 if 'codon: wild-type' not in dsequences
                 else 'codon: wild-type: from flanking sequence')
    col_cd_mt = ('codon: mutation'
                 if 'codon: mutation' not in dsequences
                 else 'codon: mutation: from flanking sequence')

    # Comprehensions (rather than row-wise DataFrame.apply) also work when
    # the merge produced no rows.
    dsequences[col_nt_wt] = [
        seq[flankntc] for seq in dsequences['transcript: sequence']
    ]
    dsequences[col_cd_wt] = [
        seq[flankntc - 1:flankntc + 2]
        for seq in dsequences['transcript: sequence']
    ]
    # NOTE(review): the mutated codon is always built from the
    # 'codon: wild-type' column, i.e. from the input's own column when the
    # input already had one, not from `col_cd_wt` -- confirm this is intended.
    dsequences[col_cd_mt] = [
        f"{codon_wt[0]}{nt_mt}{codon_wt[2]}"
        for codon_wt, nt_mt in zip(dsequences['codon: wild-type'],
                                   dsequences['nucleotide mutation'])
    ]
    dsequences['transcript: id'] = dsequences['genome coordinate']

    dsequences_bedcols = genomeocoords2bed(dsequences,
                                           col_genomeocoord='genome coordinate')
    for col in dsequences_bedcols:
        dsequences[col] = dsequences_bedcols[col]

    if 'reverse_mutations' in cfg and cfg['reverse_mutations']:
        from beditor.lib.io_dfs import dfswapcols
        # Keep the frame returned by dfswapcols: it used to be bound to an
        # unused name while `dsequences` was the table written below.
        dsequences = dfswapcols(dsequences,
                                ['nucleotide wild-type', 'nucleotide mutation'])
        dsequences = dfswapcols(dsequences,
                                ['codon: wild-type', 'codon: mutation'])

    dsequences.to_csv(f"{cfg['dsequencesp']}", sep='\t')
def get_seq_aminoacid(cfg, din):
    """
    Fetches sequences if mutation format is amino acid

    For every input row the transcript is looked up in Ensembl, the genomic
    positions of the requested codon are located and a flanked BED interval
    is recorded. The flanking sequences are extracted with
    ``bedtools getfasta``, joined with the per-codon annotations, merged with
    the input table and written to ``cfg['dsequencesp']``. Transcripts that
    could not be processed are listed in ``<bed path>.err.json``.

    Note that the index of ``din`` is reset in place to ``0..len(din)-1``.

    :param cfg: configuration dict; keys read here are ``host``,
        ``genomeassembly``, ``genomerelease``, ``datad``, ``test``,
        ``dsequencesp``, ``dseqtmpp``, ``bedtools``, ``genomep`` and,
        optionally, ``reverse_mutations``
    :param din: input data; must provide the columns ``'transcript: id'``
        and ``'aminoacid: position'``
    :returns: None; the table of sequences is saved to ``cfg['dsequencesp']``
        (an empty table is saved when no valid sequence was found)
    """
    import pyensembl
    # Ensembl object used to fetch the transcripts
    # ensembl = pyensembl.EnsemblRelease(release=cfg['genomerelease'])
    ensembl = pyensembl.EnsemblRelease(
        species=pyensembl.species.Species.register(
            latin_name=cfg['host'],
            synonyms=[cfg['host']],
            reference_assemblies={
                cfg['genomeassembly']: (cfg['genomerelease'],
                                        cfg['genomerelease']),
            }),
        release=cfg['genomerelease'])

    # FIXME put flank in the yml
    flank_before_codon = 22  # subtracted from the codon start
    flank_after_codon = 21   # added to the codon end
    flanked_span = 45        # expected value of end - start of an interval

    din.index = range(len(din))
    bedp = '{}/dbedflank.bed'.format(cfg['datad'])
    fastap = f"{cfg['datad']}/dbedflank.fa"
    dbed = pd.DataFrame(columns=bed_colns)
    terrpositions = []
    terrnotfound = []
    terrnoncoding = []

    # Fetch the transcript ids once and keep them in a set instead of
    # re-querying and scanning them for every input row. Ensembl is only
    # queried when there is at least one row to look up.
    known_transcript_ids = (set(ensembl.transcript_ids())
                            if len(din.index) != 0 else set())

    bedrowi = 0
    for i in din.index:
        transcript_id = din.loc[i, 'transcript: id']
        if transcript_id not in known_transcript_ids:
            terrnotfound.append(transcript_id)
            if cfg['test']:
                logging.error('not found: {}'.format(transcript_id))
            continue

        t = ensembl.transcript_by_id(transcript_id)
        if not (t.is_protein_coding and t.contains_start_codon
                and t.contains_stop_codon):
            terrnoncoding.append(t.id)
            continue

        coding_sequence_positions = tboundaries2positions(t)
        if len(coding_sequence_positions) != len(t.coding_sequence):
            terrpositions.append(t.id)
            continue
        #TODO need to check if the seq made from coding_sequence_positions
        # is same as t.coding_seqeunce
        dcoding = t2pmapper(t, coding_sequence_positions)
        dcodingmutpos = dcoding.loc[
            (dcoding['protein index'] == din.loc[i, 'aminoacid: position']), :]
        codon_positions = dcodingmutpos['coding sequence positions'].tolist()
        if len(codon_positions) == 0:
            terrpositions.append(t.id)
            continue

        dbed.loc[bedrowi, 'chromosome'] = t.contig
        if cfg['test']:
            print(transcript_id, codon_positions)
        # the order of the codon positions is flipped for the '-' strand
        if t.strand == '+':
            dbed.loc[bedrowi, 'codon start'] = codon_positions[0]
            dbed.loc[bedrowi, 'codon end'] = codon_positions[2]
        elif t.strand == '-':
            dbed.loc[bedrowi, 'codon start'] = codon_positions[2]
            dbed.loc[bedrowi, 'codon end'] = codon_positions[0]
        dbed.loc[bedrowi, 'start'] = (dbed.loc[bedrowi, 'codon start']
                                      - flank_before_codon)
        dbed.loc[bedrowi, 'end'] = (dbed.loc[bedrowi, 'codon end']
                                    + flank_after_codon)
        dbed.loc[bedrowi, 'reference residue'] = dcodingmutpos[
            'protein sequence'].tolist()[0]
        dbed.loc[bedrowi, 'reference codon'] = ''.join(
            dcodingmutpos['coding sequence'].tolist())
        dbed.loc[bedrowi, 'strand'] = t.strand
        dbed.loc[bedrowi, 'id'] = '{}|{}|{}|{}|{}'.format(
            transcript_id,
            dbed.loc[bedrowi, 'chromosome'],
            dbed.loc[bedrowi, 'strand'],
            int(dbed.loc[bedrowi, 'start']),
            int(dbed.loc[bedrowi, 'end']))
        dbed.loc[bedrowi, 'gene: id'] = t.gene_id
        dbed.loc[bedrowi, 'gene: name'] = t.gene.name
        dbed.loc[bedrowi, 'protein: id'] = t.protein_id
        dbed.loc[bedrowi, 'aminoacid: position'] = din.loc[
            i, 'aminoacid: position']
        bedrowi += 1

    if len(dbed) == 0:
        from beditor.lib.global_vars import saveemptytable
        logging.warning('no valid seqeunces found; saving an empty table.')
        saveemptytable(cfg, f"{cfg['dsequencesp']}")
        return None

    # keep only the intervals that have the expected flanked span
    has_expected_span = [
        end - start == flanked_span
        for start, end in zip(dbed['start'], dbed['end'])
    ]
    dbed = dbed.loc[has_expected_span, :]
    dbed.loc[:, 'start'] = dbed.loc[:, 'start'].astype(int)
    dbed.loc[:, 'end'] = dbed.loc[:, 'end'].astype(int)
    dbed = dbed.drop_duplicates(subset=bed_colns)
    dbed.loc[:, bed_colns].to_csv(bedp, sep='\t', header=False, index=False)

    # record the transcripts that were skipped, grouped by reason
    err2tids = {
        'terrpositions': terrpositions,
        'terrnotfound': terrnotfound,
        'terrnoncoding': terrnoncoding,
    }
    if cfg['test']:
        print(err2tids)
    with open(bedp + '.err.json', 'w') as outfile:
        json.dump(err2tids, outfile)

    cmd = f"{cfg['bedtools']} getfasta -s -name -fi {cfg['genomep']} -bed {bedp} -fo {fastap}"
    runbashcmd(cmd)

    dflankfa = fa2df(fastap, ids2cols=True)
    dflankfa.loc[:, 'sequence'] = dflankfa.loc[:, 'sequence'].apply(
        lambda x: x.upper())
    dflankfa.loc[:, 'sequence: length'] = [
        len(s) for s in dflankfa['sequence']
    ]
    # strip the strand suffix, e.g. "(+)", from the fasta header
    dflankfa.index = [idx.split('(')[0] for idx in dflankfa.index]
    dflankfa.index.name = 'id'
    dseq = set_index(dbed, 'id').join(set_index(dflankfa, 'id'), rsuffix='.1')

    # output column name -> column name in the joined table
    dseq2compatible = {
        'aminoacid: position': 'aminoacid: position',
        'gene: id': 'gene: id',
        'gene: name': 'gene: name',
        'protein: id': 'protein: id',
        'transcript: id': 'seqid',
        'transcript: sequence': 'sequence',
        'aminoacid: wild-type': 'reference residue',
        'codon: wild-type': 'reference codon',
        'contig': 'contig',
        'strand': 'strand',
        'start': 'start',
        'end': 'end',
        'codon start': 'codon start',
        'codon end': 'codon end',
    }
    if 'amino acid mutation' in dseq:
        dseq2compatible['amino acid mutation'] = 'amino acid mutation'
    dseq.to_csv(cfg['dseqtmpp'], sep='\t')

    dseq = dseq[list(dseq2compatible.values())]
    dseq.columns = list(dseq2compatible.keys())

    logging.info(dseq.columns.tolist())
    logging.info(din.columns.tolist())
    dseq = pd.merge(dseq.reset_index(), din,
                    on=['transcript: id', 'aminoacid: position'])
    logging.info(dseq.columns.tolist())
    # NOTE(review): the value returned by set_index is discarded here, so
    # unless the helper works in place this call does not change `dseq`.
    # Left as is because assigning it would change the layout of the saved
    # table -- confirm what was intended.
    set_index(dseq, 'id')

    if 'reverse_mutations' in cfg and cfg['reverse_mutations']:
        from beditor.lib.io_dfs import dfswapcols
        dseq = dfswapcols(dseq,
                          ['aminoacid: wild-type', 'amino acid mutation'])
        dseq['codon: mutation'] = dseq['codon: wild-type'].copy()

    dseq.to_csv(f"{cfg['dsequencesp']}", sep='\t')