Ejemplo n.º 1
0
def alignmentbed2dalignedfasta(cfg):
    """
    Get sequences in FASTA format from BED file
    step#5

    Runs ``bedtools getfasta`` on the alignment BED file, loads the FASTA
    through ``fa2df``, drops every sequence that contains an ``N`` and writes
    the remaining sequences as a tab-separated table to
    ``cfg['dalignedfastap']``. The intermediate FASTA and the final table are
    both reused when they already exist, unless ``cfg['force']`` is truthy.

    :param cfg: configuration dict; reads 'datatmpd', 'alignmentbedp',
        'dalignedfastap' and 'force', plus 'bedtools' and 'genomep' when the
        FASTA has to be (re)generated
    :returns: the same ``cfg`` dict that was passed in
    """
    datatmpd = cfg['datatmpd']
    alignmentbedp = cfg['alignmentbedp']
    dalignedfastap = cfg['dalignedfastap']
    logging.info(basename(dalignedfastap))
    if exists(dalignedfastap) and not cfg['force']:
        # Output table is already present; nothing to recompute.
        return cfg

    alignedfastap = '{}/05_alignment.fa'.format(datatmpd)
    if not exists(alignedfastap) or cfg['force']:
        cmd = f"{cfg['bedtools']} getfasta -s -name -fi {cfg['genomep']} -bed {alignmentbedp} -fo {alignedfastap}"
        runbashcmd(cmd)

    dalignedfasta = fa2df(alignedfastap)
    dalignedfasta.columns = ['aligned sequence']
    # FIXME bwa aligns to NNNNNs
    # The mask is built as a plain list (rather than a row-wise apply) so
    # that it stays a valid row mask even when the table has no rows.
    has_no_n = ['N' not in seq for seq in dalignedfasta['aligned sequence']]
    dalignedfasta = dalignedfasta.loc[has_no_n, :]
    # for bedtools 2.27, the fasta header now has hanging (+) or (-)
    dalignedfasta.index = [i.split('(')[0] for i in dalignedfasta.index]
    dalignedfasta.index.name = 'id'
    dalignedfasta.to_csv(dalignedfastap, sep='\t')
    return cfg
Ejemplo n.º 2
0
def get_seq_nucleotide(cfg,din):
    """
    Fetches sequences if mutation format is nucleotide

    Builds a BED file of flanked genome coordinates, extracts the sequences
    with ``bedtools getfasta``, merges them with the input table on
    'genome coordinate', derives the wild-type nucleotide and the wild-type /
    mutated codons from the flanking sequence and writes the result as a
    tab-separated table to ``cfg['dsequencesp']``. Intermediate files are
    reused when present unless ``cfg['force']`` is truthy. If the output
    table already exists and ``cfg['force']`` is falsy, nothing is done.

    :param cfg: configuration dict; reads 'datad', 'dsequencesp', 'force',
        'bedtools', 'genomep' and, optionally, 'reverse_mutations'
    :param din: input data; must provide the columns 'genome coordinate' and
        'nucleotide mutation'
    :returns: None; the table of sequences is written to cfg['dsequencesp']
    """
    if exists(cfg['dsequencesp']) and not cfg['force']:
        # Output already exists. Returning here also avoids reaching the
        # merge below without the intermediate table having been loaded.
        return None

    bedp = f"{cfg['datad']}/dbedntmuts.bed"
    fastap = f"{cfg['datad']}/dbedntmuts.fa"
    dbedntmutsp = f"{cfg['datad']}/dbedntmuts.tsv"

    if not exists(bedp) or cfg['force']:
        dbed = genomeocoords2bed(din, col_genomeocoord='genome coordinate')
        # widen each interval by the flank on both sides (BED start is 0-based)
        dbed['start'] = dbed['start'].astype(int) - flankntc - 1
        dbed['end'] = dbed['end'].astype(int) + flankntc
        dbed.to_csv(bedp, sep='\t', header=False, index=False)
    if not exists(fastap) or cfg['force']:
        cmd = f"{cfg['bedtools']} getfasta -s -name -fi {cfg['genomep']} -bed {bedp} -fo {fastap}"
        runbashcmd(cmd)
    if not exists(dbedntmutsp) or cfg['force']:
        dbedntmuts = fa2df(fastap)
        dbedntmuts.columns = ['transcript: sequence']
        dbedntmuts['transcript: sequence'] = dbedntmuts['transcript: sequence'].str.upper()
        dbedntmuts = dbedntmuts.reset_index()
        # strip the "(+)"/"(-)" that bedtools appends to the FASTA header
        dbedntmuts['genome coordinate'] = [
            seqid.split('(')[0] for seqid in dbedntmuts['id']
        ]
        dbedntmuts.to_csv(dbedntmutsp, sep='\t')
    else:
        dbedntmuts = pd.read_table(dbedntmutsp, keep_default_na=False)

    dsequences = pd.merge(din, dbedntmuts,
                          on=['genome coordinate'],
                          suffixes=('', ': dbedntmuts'))
    dsequences = del_Unnamed(dsequences)

    def _output_col(name):
        # do not overwrite a column that came with the input table
        if name not in dsequences:
            return name
        return f'{name}: from flanking sequence'

    col_nt_wt = _output_col('nucleotide wild-type')
    col_cd_wt = _output_col('codon: wild-type')
    col_cd_mt = _output_col('codon: mutation')

    flank_seqs = dsequences['transcript: sequence']
    dsequences[col_nt_wt] = [seq[flankntc] for seq in flank_seqs]
    dsequences[col_cd_wt] = [seq[flankntc - 1:flankntc + 2] for seq in flank_seqs]
    # NOTE(review): the mutated codon is assembled from the
    # 'codon: wild-type' column (the input's own column if it supplied one)
    # and the input's 'nucleotide mutation' column -- confirm this is intended
    # when the input already carries 'codon: wild-type'.
    dsequences[col_cd_mt] = [
        f"{codon_wt[0]}{nt_mt}{codon_wt[2]}"
        for codon_wt, nt_mt in zip(dsequences['codon: wild-type'],
                                   dsequences['nucleotide mutation'])
    ]
    dsequences['transcript: id'] = dsequences['genome coordinate']
    dsequences_bedcols = genomeocoords2bed(dsequences, col_genomeocoord='genome coordinate')
    for col in dsequences_bedcols:
        dsequences[col] = dsequences_bedcols[col]
    if cfg.get('reverse_mutations'):
        from beditor.lib.io_dfs import dfswapcols
        # Keep the frame returned by dfswapcols (as get_seq_aminoacid does)
        # so that both swaps end up in the table that is written below.
        dsequences = dfswapcols(dsequences, ['nucleotide wild-type', 'nucleotide mutation'])
        dsequences = dfswapcols(dsequences, ['codon: wild-type', 'codon: mutation'])
    dsequences.to_csv(f"{cfg['dsequencesp']}", sep='\t')
Ejemplo n.º 3
0
def get_seq_aminoacid(cfg, din):
    """
    Fetches sequences if mutation format is amino acid 

    For every row of ``din`` the transcript is looked up in the Ensembl
    annotation (via pyensembl), the genomic positions of the requested codon
    are determined and a flanked BED interval is recorded. The flanking
    sequences are then extracted with ``bedtools getfasta``, joined with the
    per-codon annotation, merged back onto ``din`` and written as a
    tab-separated table to ``cfg['dsequencesp']``.

    Side outputs: the BED file ``<datad>/dbedflank.bed``, a JSON file next to
    it (``.err.json``) listing the transcript ids that were skipped, the FASTA
    ``<datad>/dbedflank.fa`` and the intermediate table ``cfg['dseqtmpp']``.
    If no usable interval is found, an empty table is saved through
    ``saveemptytable`` instead.

    :param cfg: configuration dict; reads 'host', 'genomeassembly',
        'genomerelease', 'datad', 'test', 'bedtools', 'genomep', 'dseqtmpp',
        'dsequencesp' and, optionally, 'reverse_mutations'
    :param din: input data with the columns 'transcript: id' and
        'aminoacid: position'; its index is reset to 0..n-1 in place
    :returns: None; the table of sequences is written to cfg['dsequencesp']
    """
    import pyensembl

    #FIXME put flank in the yml
    flank_upstream = 22  # subtracted from the codon start
    flank_downstream = 21  # added to the codon end
    # expected end - start for a codon whose three bases are contiguous
    flank_span = flank_upstream + flank_downstream + 2

    #import ensembl object that would fetch genes
    species = pyensembl.species.Species.register(
        latin_name=cfg['host'],
        synonyms=[cfg['host']],
        reference_assemblies={
            cfg['genomeassembly']: (cfg['genomerelease'],
                                    cfg['genomerelease']),
        })
    ensembl = pyensembl.EnsemblRelease(species=species,
                                       release=cfg['genomerelease'])

    din.index = range(len(din))
    dbedp = f"{cfg['datad']}/dbedflank.bed"
    fastap = f"{cfg['datad']}/dbedflank.fa"
    dbed = pd.DataFrame(columns=bed_colns)
    terrpositions = []
    terrnotfound = []
    terrnoncoding = []
    bedrowi = 0
    # Fetch the annotated transcript ids once; asking the annotation for the
    # full list on every row is needlessly slow.
    known_transcript_ids = set(ensembl.transcript_ids())
    for i in din.index:
        transcript_id = din.loc[i, 'transcript: id']
        if transcript_id not in known_transcript_ids:
            terrnotfound.append(transcript_id)
            if cfg['test']:
                logging.error('not found: {}'.format(transcript_id))
            continue
        t = ensembl.transcript_by_id(transcript_id)
        if not (t.is_protein_coding and t.contains_start_codon
                and t.contains_stop_codon):
            terrnoncoding.append(t.id)
            continue
        coding_sequence_positions = tboundaries2positions(t)
        if len(coding_sequence_positions) != len(t.coding_sequence):
            terrpositions.append(t.id)
            continue
        #TODO     need to check if the seq made from coding_sequence_positions is same as t.coding_seqeunce
        dcoding = t2pmapper(t, coding_sequence_positions)
        dcodingmutpos = dcoding.loc[(
            dcoding['protein index'] == din.loc[i, 'aminoacid: position']), :]
        codon_positions = dcodingmutpos['coding sequence positions'].tolist()
        if len(codon_positions) == 0:
            terrpositions.append(t.id)
            continue

        dbed.loc[bedrowi, 'chromosome'] = t.contig
        if cfg['test']:
            print(transcript_id, codon_positions)
        # positions are listed in transcript order, so they are reversed
        # for the minus strand
        if t.strand == '+':
            dbed.loc[bedrowi, 'codon start'] = codon_positions[0]
            dbed.loc[bedrowi, 'codon end'] = codon_positions[2]
        elif t.strand == '-':
            dbed.loc[bedrowi, 'codon start'] = codon_positions[2]
            dbed.loc[bedrowi, 'codon end'] = codon_positions[0]
        dbed.loc[bedrowi, 'start'] = dbed.loc[bedrowi, 'codon start'] - flank_upstream
        dbed.loc[bedrowi, 'end'] = dbed.loc[bedrowi, 'codon end'] + flank_downstream

        dbed.loc[bedrowi, 'reference residue'] = dcodingmutpos[
            'protein sequence'].tolist()[0]
        dbed.loc[bedrowi, 'reference codon'] = ''.join(
            dcodingmutpos['coding sequence'].tolist())
        dbed.loc[bedrowi, 'strand'] = t.strand
        dbed.loc[bedrowi, 'id'] = '{}|{}|{}|{}|{}'.format(
            transcript_id,
            dbed.loc[bedrowi, 'chromosome'],
            dbed.loc[bedrowi, 'strand'],
            int(dbed.loc[bedrowi, 'start']),
            int(dbed.loc[bedrowi, 'end']))
        dbed.loc[bedrowi, 'gene: id'] = t.gene_id
        dbed.loc[bedrowi, 'gene: name'] = t.gene.name
        dbed.loc[bedrowi, 'protein: id'] = t.protein_id
        dbed.loc[bedrowi, 'aminoacid: position'] = din.loc[
            i, 'aminoacid: position']
        bedrowi += 1

    if len(dbed) != 0:
        # keep only intervals of the expected length; .copy() so the column
        # assignments below act on an independent frame
        dbed = dbed.loc[(dbed['end'] - dbed['start']) == flank_span, :].copy()
    if len(dbed) == 0:
        # nothing usable was found, or nothing survived the length filter
        from beditor.lib.global_vars import saveemptytable
        logging.warning('no valid seqeunces found; saving an empty table.')
        saveemptytable(cfg, f"{cfg['dsequencesp']}")
        return None

    # Assign the converted columns outright: a `.loc[:, col] = ...` write may
    # keep the existing dtype under newer pandas, which would leave
    # non-integer coordinates in the BED file.
    dbed['start'] = dbed['start'].astype(int)
    dbed['end'] = dbed['end'].astype(int)

    dbed = dbed.drop_duplicates(subset=bed_colns)
    dbed.loc[:, bed_colns].to_csv(dbedp, sep='\t', header=False, index=False)
    err2tids = {
        'terrpositions': terrpositions,
        'terrnotfound': terrnotfound,
        'terrnoncoding': terrnoncoding,
    }
    if cfg['test']:
        print(err2tids)
    with open(dbedp + '.err.json', 'w') as outfile:
        json.dump(err2tids, outfile)

    cmd = f"{cfg['bedtools']} getfasta -s -name -fi {cfg['genomep']} -bed {dbedp} -fo {fastap}"
    runbashcmd(cmd)

    dflankfa = fa2df(fastap, ids2cols=True)
    dflankfa['sequence'] = dflankfa['sequence'].apply(lambda x: x.upper())
    dflankfa['sequence: length'] = [len(s) for s in dflankfa['sequence']]
    # strip the "(+)"/"(-)" that bedtools appends to the FASTA header
    dflankfa.index = [idx.split('(')[0] for idx in dflankfa.index]
    dflankfa.index.name = 'id'
    dseq = set_index(dbed, 'id').join(set_index(dflankfa, 'id'), rsuffix='.1')
    # output column name -> column name in the joined table
    dseq2compatible = {
        'aminoacid: position': 'aminoacid: position',
        'gene: id': 'gene: id',
        'gene: name': 'gene: name',
        'protein: id': 'protein: id',
        'transcript: id': 'seqid',
        'transcript: sequence': 'sequence',
        'aminoacid: wild-type': 'reference residue',
        'codon: wild-type': 'reference codon',
        'contig': 'contig',
        'strand': 'strand',
        'start': 'start',
        'end': 'end',
        'codon start': 'codon start',
        'codon end': 'codon end',
    }
    if 'amino acid mutation' in dseq:
        dseq2compatible['amino acid mutation'] = 'amino acid mutation'
    dseq.to_csv(cfg['dseqtmpp'], sep='\t')

    dseq = dseq[list(dseq2compatible.values())]
    dseq.columns = list(dseq2compatible.keys())

    logging.info(dseq.columns.tolist())
    logging.info(din.columns.tolist())
    dseq = pd.merge(dseq.reset_index(),
                    din,
                    on=['transcript: id', 'aminoacid: position'])
    logging.info(dseq.columns.tolist())
    # set_index is used for its return value at the join above, so its result
    # has to be kept here as well for the call to have any effect.
    dseq = set_index(dseq, 'id')
    if cfg.get('reverse_mutations'):
        from beditor.lib.io_dfs import dfswapcols
        dseq = dfswapcols(dseq,
                          ['aminoacid: wild-type', 'amino acid mutation'])
        dseq['codon: mutation'] = dseq['codon: wild-type'].copy()

    dseq.to_csv(f"{cfg['dsequencesp']}", sep='\t')