Exemple #1
0
def test_species(host='saccharomyces_cerevisiae'):
    """
    Fetch the test_beditor repository, or refresh it when already present.

    :param host: species name; not used by this body
    """
    repo_missing = not exists('test_beditor')
    if repo_missing:
        # no local checkout yet: clone it into the working directory
        clone_cmd = 'git clone https://github.com/rraadd88/test_beditor.git'
        runbashcmd(clone_cmd, test=True)
        return
    # a checkout is already there: bring it up to date
    runbashcmd('cd test_beditor;git pull', test=True)
def alignmentbed2dalignedfasta(cfg):
    """
    Get sequences in FASTA format from BED file
    step#5

    Runs ``bedtools getfasta`` on ``cfg['alignmentbedp']`` (unless the
    intermediate FASTA is already present), drops the sequences that contain
    ``N`` and writes the remaining ones as a tab-separated table to
    ``cfg['dalignedfastap']``. Nothing is done when that table already exists
    and ``cfg['force']`` is not set.

    :param cfg: configuration dict
    :returns: the configuration dict that was passed in
    """
    datatmpd = cfg['datatmpd']
    alignmentbedp = cfg['alignmentbedp']
    dalignedfastap = cfg['dalignedfastap']
    logging.info(basename(dalignedfastap))
    if exists(dalignedfastap) and not cfg['force']:
        # output is already there and no re-run was requested
        return cfg

    alignedfastap = '{}/05_alignment.fa'.format(datatmpd)
    if not exists(alignedfastap) or cfg['force']:
        cmd = f"{cfg['bedtools']} getfasta -s -name -fi {cfg['genomep']} -bed {alignmentbedp} -fo {alignedfastap}"
        runbashcmd(cmd)

    dalignedfasta = fa2df(alignedfastap)
    dalignedfasta.columns = ['aligned sequence']
    # FIXME bwa aligns to NNNNNs
    # Vectorised literal search instead of a row-wise apply: the row-wise
    # form does not produce a boolean Series when the table has no rows.
    has_n = dalignedfasta['aligned sequence'].str.contains('N', regex=False)
    dalignedfasta = dalignedfasta.loc[(~has_n).to_numpy(), :]
    # for bedtools 2.27, the fasta header now has hanging (+) or (-)
    dalignedfasta.index = [i.split('(')[0] for i in dalignedfasta.index]
    dalignedfasta.index.name = 'id'
    dalignedfasta.to_csv(dalignedfastap, sep='\t')
    return cfg
Exemple #3
0
def get_seq_nucleotide(cfg,din):
    """
    Fetches sequences if mutation format is nucleotide

    The flanking sequence of every genome coordinate in `din` is extracted
    with ``bedtools getfasta`` and merged back onto `din`; the wild-type
    nucleotide, wild-type codon and mutated codon are derived from it and the
    resulting table is written to ``cfg['dsequencesp']``.

    Nothing is returned. When ``cfg['dsequencesp']`` already exists and
    ``cfg['force']`` is not set, the function returns without recomputing.

    :param cfg: configuration dict
    :param din: input data
    """
    if exists(cfg['dsequencesp']) and not cfg['force']:
        # Output already present. Previously this path fell through to the
        # merge below with `dbedntmuts` never assigned.
        return

    bedp=f"{cfg['datad']}/dbedntmuts.bed"
    fastap=f"{cfg['datad']}/dbedntmuts.fa"
    dbedntmutsp=f"{cfg['datad']}/dbedntmuts.tsv"

    if not exists(bedp) or cfg['force']:
        dbed=genomeocoords2bed(din,col_genomeocoord='genome coordinate')
        # widen each interval by `flankntc` on both sides; the extra -1 is
        # applied to the start only
        dbed['start']=dbed['start'].astype(int)-flankntc-1
        dbed['end']=dbed['end'].astype(int)+flankntc
        dbed.to_csv(bedp,sep='\t',header=False, index=False)
    if not exists(fastap) or cfg['force']:
        cmd=f"{cfg['bedtools']} getfasta -s -name -fi {cfg['genomep']} -bed {bedp} -fo {fastap}"
        runbashcmd(cmd)
    if not exists(dbedntmutsp) or cfg['force']:
        dbedntmuts=fa2df(fastap)
        dbedntmuts.columns=['transcript: sequence']
        dbedntmuts['transcript: sequence']=dbedntmuts['transcript: sequence'].str.upper()
        dbedntmuts=dbedntmuts.reset_index()
        # drop everything from the first '(' in the fasta id onwards
        dbedntmuts['genome coordinate']=[s.split('(')[0] for s in dbedntmuts['id']]
        dbedntmuts.to_csv(dbedntmutsp,sep='\t')
    else:
        dbedntmuts=pd.read_table(dbedntmutsp,keep_default_na=False)

    dsequences=pd.merge(din,dbedntmuts,
            on=['genome coordinate'],suffixes=('', ': dbedntmuts'))
    dsequences=del_Unnamed(dsequences)

    # do not overwrite columns supplied with the input: derived values go to
    # a ': from flanking sequence' column when the plain name is taken
    col_nt_wt='nucleotide wild-type' if not 'nucleotide wild-type' in dsequences else 'nucleotide wild-type: from flanking sequence'
    col_cd_wt='codon: wild-type' if not 'codon: wild-type' in dsequences else 'codon: wild-type: from flanking sequence'
    col_cd_mt='codon: mutation' if not 'codon: mutation' in dsequences else 'codon: mutation: from flanking sequence'

    # the mutated nucleotide sits at offset `flankntc` of the flanking sequence
    dsequences[col_nt_wt]=dsequences.apply(lambda x: x['transcript: sequence'][flankntc],axis=1)
    dsequences[col_cd_wt]=dsequences.apply(lambda x: x['transcript: sequence'][flankntc-1:flankntc+2],axis=1)
    # the mutated codon keeps positions 0 and 2 of the 'codon: wild-type'
    # column and substitutes the middle nucleotide
    dsequences[col_cd_mt]=dsequences.apply(lambda x: f"{x['codon: wild-type'][0]}{x['nucleotide mutation']}{x['codon: wild-type'][2]}",axis=1)
    dsequences['transcript: id']=dsequences['genome coordinate']
    dsequences_bedcols=genomeocoords2bed(dsequences, col_genomeocoord='genome coordinate')
    for col in dsequences_bedcols:
        dsequences[col]=dsequences_bedcols[col]

    if cfg.get('reverse_mutations'):
        from beditor.lib.io_dfs import dfswapcols
        # Keep the returned frames and chain them, so that both swaps are in
        # the table that gets written (previously the results were bound to
        # an unused name and the second call started from the unswapped frame).
        dsequences=dfswapcols(dsequences,['nucleotide wild-type', 'nucleotide mutation'])
        dsequences=dfswapcols(dsequences,['codon: wild-type', 'codon: mutation'])
    dsequences.to_csv(f"{cfg['dsequencesp']}",sep='\t')
def dguides2guidessam(cfg, dguides):
    """
    Aligns guides to genome and gets SAM file
    step#1

    For every distinct value of the 'guide+PAM length' column three files are
    produced in ``cfg['datatmpd']`` (each one only if missing, or always when
    ``cfg['force']`` is set): a FASTA of the guides, the ``bwa aln`` output
    (.sa) and the ``bwa samse`` output (.sam).

    :param cfg: configuration dict
    :param dguides: dataframe of guides
    :returns: the configuration dict that was passed in
    """
    datatmpd = cfg['datatmpd']
    dguides = set_index(dguides, 'guide: id')
    genomep = cfg['genomep']

    ## BWA alignment command is adapted from cripror
    ## https://github.com/rraadd88/crisporWebsite/blob/master/crispor.py
    # maximum number of occurences in the genome to get flagged as repeats.
    # This is used in bwa samse, when converting the sam file
    # and for warnings in the table output.
    MAXOCC = 60000
    # the BWA queue size is 2M by default. We derive the queue size from MAXOCC
    MFAC = 2000000 / MAXOCC
    # -m is queue size in bwa; it is an integer option, so do not format the
    # float product into the command line
    bwaM = int(round(MFAC * MAXOCC))

    guidels = dguides.loc[:, 'guide+PAM length'].unique()
    for guidel in guidels:
        logging.debug(f"now aligning guides of length {guidel}")
        guidesfap = f'{datatmpd}/01_guides_guidel{guidel:02}.fa'
        logging.info(basename(guidesfap))
        if not exists(guidesfap) or cfg['force']:
            # NOTE(review): every guide is written to each per-length file,
            # not only the guides of length `guidel` -- confirm this is intended.
            with open(guidesfap, 'w') as f:
                # iterate index and sequences in lockstep: one record per
                # row, also when guide ids are repeated
                f.writelines(
                    '>{}\n{}\n'.format(gi.replace(' ', '_'), seq)
                    for gi, seq in zip(dguides.index,
                                       dguides['guide+PAM sequence']))

        # BWA: allow up to X mismatches
        guidessap = f'{datatmpd}/01_guides_guidel{guidel:02}.sa'
        logging.info(basename(guidessap))
        if not exists(guidessap) or cfg['force']:
            cmd = f"{cfg['bwa']} aln -t 1 -o 0 -m {bwaM} -n {cfg['mismatches_max']} -k {cfg['mismatches_max']} -N -l {guidel} {genomep} {guidesfap} > {guidessap} 2> {guidessap}.log"
            runbashcmd(cmd)

        guidessamp = f'{datatmpd}/01_guides_guidel{guidel:02}.sam'
        logging.info(basename(guidessamp))
        if not exists(guidessamp) or cfg['force']:
            cmd = f"{cfg['bwa']} samse -n {MAXOCC} {genomep} {guidessap} {guidesfap} > {guidessamp} 2> {guidessamp}.log"
            runbashcmd(cmd)
    return cfg
def dalignbed2annotationsbed(cfg):
    """
    Get annotations from the aligned BED file
    step#3

    Sorts the alignment BED and the genome GFF with ``bedtools sort`` and
    intersects the two with ``bedtools intersect``. The path of the
    intersection is stored in ``cfg['annotationsbedp']``.

    :param cfg: configuration dict
    :returns: cfg, with 'annotationsbedp' set
    """
    datatmpd = cfg['datatmpd']
    bed_in = cfg['alignmentbedp']

    # sorted copy of the alignment BED
    bed_sorted = bed_in + '.sorted.bed'
    logging.info(basename(bed_sorted))
    if not exists(bed_sorted) or cfg['force']:
        runbashcmd(f"{cfg['bedtools']} sort -i {bed_in} > {bed_sorted}")

    # sorted copy of the genome annotations; not redone when cfg['force'] is set
    gff_sorted = cfg['genomegffp'] + '.sorted.gff3.gz'
    logging.info(basename(gff_sorted))
    if not exists(gff_sorted):
        runbashcmd(
            f"{cfg['bedtools']} sort -i {cfg['genomegffp']} > {gff_sorted}")

    # left outer join of the alignments with the annotations
    out_bed = f'{datatmpd}/03_annotations.bed'
    cfg['annotationsbedp'] = out_bed
    logging.info(basename(out_bed))
    if not exists(out_bed) or cfg['force']:
        runbashcmd(
            f"{cfg['bedtools']} intersect -wa -wb -loj -a {bed_sorted} -b {gff_sorted} > {out_bed}"
        )
    return cfg
Exemple #6
0
def test_species(host='saccharomyces_cerevisiae'):
    """
    Fetch or refresh the test_beditor repository and run its dataset tests.

    :param host: species name; not used by this body
    """
    # clone on first use, pull afterwards
    if exists('test_beditor'):
        sync_cmd = 'cd test_beditor;git pull'
    else:
        sync_cmd = 'git clone https://github.com/rraadd88/test_beditor.git'
    runbashcmd(sync_cmd, test=True)

    # run the test script from inside the checkout, in the beditor environment
    runbashcmd(
        'source activate beditor;cd test_beditor;python test_datasets.py',
        test=True)
Exemple #7
0
def _confirm_download(cfg, prompt):
    """
    Decide whether a missing reference file should be downloaded.

    The question is asked on the terminal, except when
    '/test_beditor/' is in ``cfg['cfgp']`` and ``cfg['gui']`` is set; in that
    case the download is approved without asking.

    :param cfg: configuration dict; 'gui' is set to False when absent
    :param prompt: question shown to the user
    :returns: True if the download should go ahead
    """
    # back compatible
    if 'gui' not in cfg:
        cfg['gui'] = False
    if ('/test_beditor/' not in cfg['cfgp']) or (not cfg['gui']):
        answer = input(prompt)
    else:
        answer = 'Y'
    # the prompt advertises [Y/n]: Enter alone takes the default (yes)
    return answer.strip().lower() in ('', 'y', 'yes')


def _fetch_genomeurls(cfg, kind):
    """
    Look up the download URL entry `kind` for the configured genome,
    retrying the lookup once if the first attempt fails.

    :param cfg: configuration dict
    :param kind: key of the entry to return ('dna' or 'gff3')
    """
    try:
        return get_genomeurls(cfg['host'], cfg['genomerelease'],
                              test=False)[kind]
    except Exception:
        logging.warning('could not get the download urls; retrying once.')
        return get_genomeurls(cfg['host'], cfg['genomerelease'],
                              test=False)[kind]


def get_genomes(cfg):
    """
    Installs genomes

    Runs ``pyensembl install`` and makes sure that the genome FASTA (with its
    bwa index, samtools index and contig sizes) and the GFF3 annotations are
    present next to this module, downloading them after confirmation when
    they are missing. The process exits with status 1 if a download is
    declined.

    :param cfg: configuration dict
    :returns: cfg, with 'genomep' and 'genomegffp' set
    """
    logging.info(
        'pyensembl: checking if genome is installed/ downloading if necessary.'
    )
    runbashcmd(
        f"pyensembl install --reference-name {cfg['genomeassembly']} --release {cfg['genomerelease']} --species {cfg['host']}"
    )

    # all reference files are kept under the directory of this module
    pkgd = dirname(realpath(__file__))

    #download genome for step 5 specificity
    ensembl_fastad = 'pub/release-{}/fasta/{}/dna/'.format(
        cfg['genomerelease'], cfg['host'])
    genome_fastad = '{}/{}'.format(pkgd, ensembl_fastad)
    cfg['genomep'] = '{}/genome.fa'.format(genome_fastad)
    if not exists(cfg['genomep']):
        logging.error(f"not found: {cfg['genomep']}")
        logging.info(f"downloading file: {cfg['genomep']}")
        if _confirm_download(
                cfg, "Download genome at {}?[Y/n]: ".format(genome_fastad)):
            # #FIXME download contigs and cat and get index, sizes
            contigurls = _fetch_genomeurls(cfg, 'dna')
            logging.info(
                f"{len(contigurls)} contigs/chromosomes in the genome")
            logging.info(contigurls)
            for contigurl in contigurls:
                fn = basename(contigurl)
                # look where wget puts the file (under pkgd), not relative to
                # the current working directory
                fp = f'{genome_fastad}/{fn}'
                if not exists(fp):
                    logging.info(f"downloading: {contigurl}")
                    cmd = f"wget -q -x -nH {contigurl} -P {pkgd}"
                    runbashcmd(cmd, test=cfg['test'])
            # make the fa ready
            if not exists(cfg['genomep']):
                cmd = 'gunzip {}*.fa.gz;cat {}/*.fa > {}/genome.fa;'.format(
                    genome_fastad, genome_fastad, genome_fastad)
                runbashcmd(cmd, test=cfg['test'])
        else:
            logging.error('abort')
            sys.exit(1)

    if not exists(cfg['genomep'] + '.bwt'):
        cmd = '{} index {}'.format(cfg['bwa'], cfg['genomep'])
        runbashcmd(cmd, test=cfg['test'])
    else:
        logging.info('bwa index is present')
    if not exists(cfg['genomep'] + '.fai'):
        cmd = '{} faidx {}'.format(cfg['samtools'], cfg['genomep'])
        runbashcmd(cmd, test=cfg['test'])
    else:
        logging.info('samtools index is present')
    if not exists(cfg['genomep'] + '.sizes'):
        cmd = 'cut -f1,2 {}.fai > {}.sizes'.format(cfg['genomep'],
                                                   cfg['genomep'])
        runbashcmd(cmd, test=cfg['test'])
    else:
        logging.info('sizes of contigs are present')

    #download gff3
    ensembl_gff3d = 'pub/release-{}/gff3/{}/'.format(cfg['genomerelease'],
                                                     cfg['host'])
    genome_gff3d = f'{pkgd}/{ensembl_gff3d}'
    cfg['genomegffp'] = '{}/genome.gff3'.format(genome_gff3d)
    if not exists(cfg['genomegffp']):
        logging.error(f"not found: {cfg['genomegffp']}")
        logging.info(f"downloading file: {cfg['genomegffp']}")
        if _confirm_download(
                cfg,
                f"Download genome annotations at {genome_gff3d}?[Y/n]: "):
            # #FIXME download contigs and cat and get index, sizes
            fn = '{}.{}.{}.gff3.gz'.format(cfg['host'].capitalize(),
                                           cfg['genomeassembly'],
                                           cfg['genomerelease'])
            # same location fix as for the contigs above
            fp = '{}/{}'.format(genome_gff3d, fn)
            if not exists(fp):
                ensembl_gff3p = _fetch_genomeurls(cfg, 'gff3')
                cmd = f'wget -x -nH {ensembl_gff3p} -P {pkgd}'
                runbashcmd(cmd, test=cfg['test'])
            # move to genome.gff3; done whenever genome.gff3 is missing, also
            # if the archive was downloaded by an earlier run
            cmd = 'cp {}/{} {}'.format(genome_gff3d, fn, cfg['genomegffp'])
            runbashcmd(cmd, test=cfg['test'])
        else:
            logging.error('abort')
            sys.exit(1)
    logging.info('genomes are installed!')
    return cfg
Exemple #8
0
def get_seq_aminoacid(cfg, din):
    """
    Fetches sequences if mutation format is amino acid

    For every row of `din` the codon of the requested amino acid position is
    located on the transcript, a BED interval around that codon is recorded,
    the flanking sequences are extracted with ``bedtools getfasta`` and the
    combined table is written to ``cfg['dsequencesp']``. Transcripts that
    could not be processed are listed in '<bed path>.err.json'.

    Nothing is returned; `din` gets its index reset to 0..n-1 in place.

    :param cfg: configuration dict
    :param din: input data
    """
    import pyensembl
    #import ensembl object that would fetch genes
    ensembl = pyensembl.EnsemblRelease(
        species=pyensembl.species.Species.register(
            latin_name=cfg['host'],
            synonyms=[cfg['host']],
            reference_assemblies={
                cfg['genomeassembly']:
                (cfg['genomerelease'], cfg['genomerelease']),
            }),
        release=cfg['genomerelease'])
    # Build the id lookup once: the membership test below used to call
    # transcript_ids() again, and scan its result, for every input row.
    known_transcript_ids = set(ensembl.transcript_ids())

    din.index = range(len(din))
    dbedp = '{}/dbedflank.bed'.format(cfg['datad'])
    dbed = pd.DataFrame(columns=bed_colns)
    terrpositions = []
    terrnotfound = []
    terrnoncoding = []
    bedrowi = 0
    for i in din.index:
        transcript_id = din.loc[i, 'transcript: id']
        if transcript_id not in known_transcript_ids:
            terrnotfound.append(transcript_id)
            if cfg['test']:
                logging.error('not found: {}'.format(transcript_id))
            continue
        t = ensembl.transcript_by_id(transcript_id)
        if not (t.is_protein_coding and t.contains_start_codon
                and t.contains_stop_codon):
            terrnoncoding.append(t.id)
            continue
        coding_sequence_positions = tboundaries2positions(t)
        if len(coding_sequence_positions) != len(t.coding_sequence):
            terrpositions.append(t.id)
            continue
        #TODO     need to check if the seq made from coding_sequence_positions is same as t.coding_seqeunce
        dcoding = t2pmapper(t, coding_sequence_positions)
        aa_position = din.loc[i, 'aminoacid: position']
        dcodingmutpos = dcoding.loc[(
            dcoding['protein index'] == aa_position), :]
        codon_positions = dcodingmutpos['coding sequence positions'].tolist()
        if len(codon_positions) == 0:
            terrpositions.append(t.id)
            continue

        dbed.loc[bedrowi, 'chromosome'] = t.contig
        if cfg['test']:
            print(transcript_id, codon_positions)
        # codon start/end are taken in the order matching the strand
        if t.strand == '+':
            dbed.loc[bedrowi, 'codon start'] = codon_positions[0]
            dbed.loc[bedrowi, 'codon end'] = codon_positions[2]
        elif t.strand == '-':
            dbed.loc[bedrowi, 'codon start'] = codon_positions[2]
            dbed.loc[bedrowi, 'codon end'] = codon_positions[0]
        dbed.loc[bedrowi, 'start'] = dbed.loc[
            bedrowi, 'codon start'] - 22  #FIXME put flank in the yml
        dbed.loc[bedrowi, 'end'] = dbed.loc[
            bedrowi, 'codon end'] + 21  #FIXME put flank in the yml

        dbed.loc[bedrowi, 'reference residue'] = dcodingmutpos[
            'protein sequence'].tolist()[0]
        dbed.loc[bedrowi, 'reference codon'] = ''.join(
            dcodingmutpos['coding sequence'].tolist())
        dbed.loc[bedrowi, 'strand'] = t.strand
        dbed.loc[bedrowi, 'id'] = '{}|{}|{}|{}|{}'.format(
            transcript_id,
            dbed.loc[bedrowi, 'chromosome'],
            dbed.loc[bedrowi, 'strand'],
            int(dbed.loc[bedrowi, 'start']),
            int(dbed.loc[bedrowi, 'end']))
        dbed.loc[bedrowi, 'gene: id'] = t.gene_id
        dbed.loc[bedrowi, 'gene: name'] = t.gene.name
        dbed.loc[bedrowi, 'protein: id'] = t.protein_id
        dbed.loc[bedrowi, 'aminoacid: position'] = aa_position
        bedrowi += 1

    if len(dbed) == 0:
        from beditor.lib.global_vars import saveemptytable
        logging.warning('no valid seqeunces found; saving an empty table.')
        saveemptytable(cfg, f"{cfg['dsequencesp']}")
        return None
    # keep only the intervals with the expected span (3 nt codon + flanks)
    dbed = dbed.loc[(dbed.apply(lambda x: x['end'] - x['start'] == 45,
                                axis=1)), :]  #FIXME put flank in the yml

    dbed.loc[:, 'start'] = dbed.loc[:, 'start'].astype(int)
    dbed.loc[:, 'end'] = dbed.loc[:, 'end'].astype(int)

    dbed = dbed.drop_duplicates(subset=bed_colns)
    dbed.loc[:, bed_colns].to_csv(dbedp, sep='\t', header=False, index=False)
    err2tids = {
        'terrpositions': terrpositions,
        'terrnotfound': terrnotfound,
        'terrnoncoding': terrnoncoding,
    }
    if cfg['test']:
        print(err2tids)
    with open(dbedp + '.err.json', 'w') as outfile:
        json.dump(err2tids, outfile)

    # the BED file written above is the input of getfasta
    fastap = f"{cfg['datad']}/dbedflank.fa"
    cmd = f"{cfg['bedtools']} getfasta -s -name -fi {cfg['genomep']} -bed {dbedp} -fo {fastap}"
    runbashcmd(cmd)

    dflankfa = fa2df(fastap, ids2cols=True)
    dflankfa.loc[:, 'sequence'] = dflankfa.loc[:, 'sequence'].str.upper()
    dflankfa.loc[:,
                 'sequence: length'] = [len(s) for s in dflankfa['sequence']]
    # drop everything from the first '(' in the fasta id onwards
    dflankfa.index = [idx.split('(')[0] for idx in dflankfa.index]
    dflankfa.index.name = 'id'
    dseq = set_index(dbed, 'id').join(set_index(dflankfa, 'id'), rsuffix='.1')
    # output column name -> column name in the joined table
    dseq2compatible = {
        'aminoacid: position': 'aminoacid: position',
        'gene: id': 'gene: id',
        'gene: name': 'gene: name',
        'protein: id': 'protein: id',
        'transcript: id': 'seqid',
        'transcript: sequence': 'sequence',
        'aminoacid: wild-type': 'reference residue',
        'codon: wild-type': 'reference codon',
        'contig': 'contig',
        'strand': 'strand',
        'start': 'start',
        'end': 'end',
        'codon start': 'codon start',
        'codon end': 'codon end',
    }
    if 'amino acid mutation' in dseq:
        dseq2compatible['amino acid mutation'] = 'amino acid mutation'
    dseq.to_csv(cfg['dseqtmpp'], sep='\t')

    dseq = dseq[list(dseq2compatible.values())]
    dseq.columns = list(dseq2compatible.keys())

    logging.info(dseq.columns.tolist())
    logging.info(din.columns.tolist())
    dseq = pd.merge(dseq.reset_index(),
                    din,
                    on=['transcript: id', 'aminoacid: position'])
    logging.info(dseq.columns.tolist())
    # NOTE(review): the result of this call is discarded, so the table is
    # written with the index produced by the merge -- confirm whether
    # `dseq = set_index(dseq, 'id')` was intended before changing the output.
    set_index(dseq, 'id')
    if cfg.get('reverse_mutations'):
        from beditor.lib.io_dfs import dfswapcols
        dseq = dfswapcols(dseq,
                          ['aminoacid: wild-type', 'amino acid mutation'])
        dseq['codon: mutation'] = dseq['codon: wild-type'].copy()

    dseq.to_csv(f"{cfg['dsequencesp']}", sep='\t')
    del ensembl
Exemple #9
0
def gui(test=False):
    layout = get_layout(test=test)
    win = sg.Window('beditor').Layout(layout)

    win_addbepam_active = False
    bulconfigure_advanced = False
    init = True
    while True:
        # gottu be in while loop to capture first event
        ev1, vals1 = win.Read()
        if init:
            _ev1, _vals1 = ev1, vals1
            init = False
        if test:
            print(ev1)
            print(vals1)
        if vals1['mutation table'] != '':
            win.FindElement('load din').Update(disabled=False)
        if vals1['cfgp'] != '':
            win.FindElement('save cfgp').Update(disabled=False)

        if ev1 is None:
            break
        if ev1 == 'BE type and PAM':
            # print('event: BE type and PAM')
            dbepams = get_dbepams()
            dbepams_ = dbepams.groupby('BE type and PAM').agg({
                'BE name and editing window':
                unique_dropna,
            }).reset_index()
            l = tuple(dbepams_.loc[
                dbepams_['BE type and PAM'] == vals1['BE type and PAM'],
                'BE name and editing window'].sort_values().tolist()[0])
            win.FindElement('BE name and editing window').Update(
                disabled=False)
            win.FindElement('BE name and editing window').Update(values=l)
            win.FindElement('add_bepam').Update(disabled=True)
            win.FindElement('BE and PAM clear').Update(disabled=False)
            win.FindElement('BE name and editing window').Update(
                disabled=False)
            win = resetwinvals(win, vals1)
        elif ev1 == 'add_bepam' and not win_addbepam_active:
            win_add_bepam_active = True
            win.Hide()
            win_add_bepam = sg.Window('add new BE and PAM').Layout(
                layout_addbepam)
            while True:
                ev2, vals2 = win_add_bepam.Read()
                if test:
                    print(ev2)
                if ev2 is None:
                    win_add_bepam.Close()
                    win_add_bepam_active = False
                    win.UnHide()
                    break
                elif ev2 == 'add':
                    if any([
                            vals2['fromA'] and vals2['toA'], vals2['fromT']
                            and vals2['toT'], vals2['fromG'] and vals2['toG'],
                            vals2['fromC'] and vals2['toC']
                    ]):
                        if test:
                            print(ev2, vals2)
                        win_add_bepam = resetwinvals(win_add_bepam, vals2)
                        win_add_bepam.FindElement('error mutation').Update(
                            "* invalid")
                    elif not vals2['editing window min'] < vals2[
                            'editing window max']:
                        win_add_bepam = resetwinvals(win_add_bepam, vals2)
                        win_add_bepam.FindElement(
                            'editing window error').Update("* invalid")
                    elif not isstrallowed(s=vals2['PAM'], form=f"^[{nts}]*$"):
                        win_add_bepam = resetwinvals(win_add_bepam, vals2)
                        win_add_bepam.FindElement('error').Update(
                            '* invalid nucleotide')
                    else:
                        # get the keys and print on the gui
                        if test:
                            print(vals2)
                        win_add_bepam.Close()
                        win_add_bepam_active = False
                        win.UnHide()
                        win.FindElement('add_bepam print').Update(
                            f"{get_mutation(vals2)} {vals2['BE name']} {vals2['editing window min']}-{vals2['editing window max']}bp"
                        )
                        vals1[
                            'add_bepam print'] = f"{get_mutation(vals2)} {vals2['BE name']} {vals2['editing window min']}-{vals2['editing window max']}bp"
                        win.FindElement('BE type and PAM').Update(values=[
                            f"{get_mutation(vals2)} PAM:{vals2['PAM']}",
                        ])
                        win.FindElement(
                            'BE name and editing window'
                        ).Update(values=[
                            f"method:{vals2['BE name']} editing window:{int(vals2['editing window min'])}-{int(vals2['editing window max'])}bp",
                        ])
                        win.FindElement('BE type and PAM').Update(
                            disabled=True)
                        win.FindElement('BE name and editing window').Update(
                            disabled=True)
                        win.FindElement('BE and PAM clear').Update(
                            disabled=False)
                        break
                elif ev2 == 'Cancel':
                    win_add_bepam.Close()
                    win_add_bepam_active = False
                    win.UnHide()
                    break
            win = resetwinvals(win, vals1)
        elif ev1 == 'BE and PAM clear':
            win.FindElement('BE and PAM clear').Update(disabled=False)
            dbepams = get_dbepams()
            win.FindElement('BE type and PAM').Update(values=list(
                np.sort(dbepams['BE type and PAM'].unique())),
                                                      disabled=False)
            win.FindElement('BE name and editing window').Update(
                disabled=False)
            win.FindElement('add_bepam').Update(disabled=False)
            win.FindElement('add_bepam print').Update("")
            win = resetwinvals(win, vals1)
        elif ev1 == 'load cfg':
            if vals1['cfginp'] != '' and vals1[
                    'cfginp'] != 'path to configuration file (.yaml)':
                cfg = yaml.load(open(vals1['cfginp'], 'r'))
                listed_keys = ['BEs', 'pams']
                del_keys = ['max_subs_per_codon', 'mutations', 'chunksize']
                for k in listed_keys:
                    if not isinstance(cfg[k], list):
                        cfg[k] = [cfg[k]]
                for k in del_keys:
                    del cfg[k]
                win = loadcfginwinvals(win, cfg)
            win = resetwinvals(win, vals1)
        elif ev1 == 'configure_advanced_':
            win = resetwinvals(win, vals1)
            win.FindElement('configure_advanced').Update(
                visible=False if bulconfigure_advanced else True)
            bulconfigure_advanced = False if bulconfigure_advanced else True
            win = resetwinvals(win, vals1)
        elif ev1 == 'optional: load configuration file_':
            win.FindElement('optional: load configuration file').Update(
                visible=True)
            win = resetwinvals(win, vals1)
        elif ev1 == 'options for amino acid mutations_':
            win.FindElement('options for amino acid mutations').Update(
                visible=True)
            win = resetwinvals(win, vals1)
        elif ev1 == 'optional: for infering amino acid mutations from wt_':
            win.FindElement(
                'optional: for infering amino acid mutations from wt').Update(
                    visible=True)
            win = resetwinvals(win, vals1)
        elif ev1 == 'optional: dependencies paths_':
            win.FindElement('optional: dependencies paths').Update(
                visible=True)
            win = resetwinvals(win, vals1)
        elif ev1 == 'optional: or just save configuration file_':
            win.FindElement('optional: or just save configuration file'
                            ).Update(visible=True)
            win = resetwinvals(win, vals1)
        elif ev1 == 'optional: design control gRNAs_':
            win.FindElement('optional: design control gRNAs').Update(
                visible=True)
            win = resetwinvals(win, vals1)
        elif ev1 == 'configuretorun':
            #check vals
            if vals1['mutation table'] == 'path to the tsv file':
                vals1['mutation table'] = ''
            if win.FindElement('add_bepam print').DisplayText == '':
                keys = np.array([
                    'mutation table', 'Species name (Ensembl assembly)',
                    'BE type and PAM', 'BE name and editing window'
                ])
            else:
                keys = np.array(
                    ['mutation table', 'Species name (Ensembl assembly)'])
            buls = np.array([vals1[k] == '' for k in keys])

            if len(keys[buls]) == 0:
                din = del_Unnamed(
                    pd.read_table(vals1['mutation table'],
                                  keep_default_na=False))
                if (('genome coordinate' in din) and
                    (vals1['mutation_format nucleotide'])) or (
                        ('transcript: id' in din) and
                        (vals1['mutation_format aminoacid'])):
                    win.FindElement('configure error').Update(
                        'run beditor', text_color='green')
                    win.FindElement('run').Update(disabled=False)
                else:
                    win.FindElement('configure error').Update(
                        "invalid mutation format", text_color='red')
            else:
                win.FindElement('configure error').Update(
                    f"invalid {' and '.join(list(keys[buls]))}",
                    text_color='red')
            win = resetwinvals(win, vals1)
            ##TODO check columns and rename
        elif ev1 == 'load din':
            #         try:
            din = del_Unnamed(
                pd.read_table(vals1['mutation table'], keep_default_na=False))
            cols_din = [
                'genome coordinate', 'nucleotide mutation', 'transcript: id',
                'aminoacid: position', 'amino acid mutation'
            ]
            normalised2cols = dict(
                zip([normalisestr(c) for c in cols_din], cols_din))
            din = din.rename(
                columns={c: normalised2cols[normalisestr(c)]
                         for c in din})
            if ('genome coordinate' in din) and (not 'transcript: id' in din):
                win.FindElement('mutation_format aminoacid').Update(
                    value=False)
                win.FindElement('mutation_format nucleotide').Update(
                    value=True)
                vals1['mutation_format aminoacid'] = False
                vals1['mutation_format nucleotide'] = True
            elif (not 'genome coordinate' in din) and ('transcript: id'
                                                       in din):
                win.FindElement('mutation_format nucleotide').Update(
                    value=False)
                win.FindElement('mutation_format aminoacid').Update(value=True)
                vals1['mutation_format aminoacid'] = True
                vals1['mutation_format nucleotide'] = False
            else:
                vals1['mutation_format aminoacid'] = False
                vals1['mutation_format nucleotide'] = False
            vals1['reverse_mutations create'] = True
            vals1['reverse_mutations remove'] = False
            win.FindElement('error din').Update("loaded", text_color='green')
            win.FindElement('mutation table').Update(vals1['mutation table'])
            win = resetwinvals(win, vals1)
        elif ev1 == 'mutation_format nucleotide':
            vals1['mutation_format nucleotide'] = True
            vals1['mutation_format aminoacid'] = False
            win = resetwinvals(win, vals1)
        elif ev1 == 'mutation_format aminoacid':
            vals1['mutation_format nucleotide'] = False
            vals1['mutation_format aminoacid'] = True
            win = resetwinvals(win, vals1)
        elif ev1 == 'reverse_mutations create':
            vals1['reverse_mutations create'] = True
            vals1['reverse_mutations remove'] = False
            win = resetwinvals(win, vals1)
        elif ev1 == 'reverse_mutations remove':
            vals1['reverse_mutations create'] = False
            vals1['reverse_mutations remove'] = True
            win = resetwinvals(win, vals1)
        elif ev1 == 'clear all':
            win = resetwinvals(win, _vals1)

        elif ev1 == 'save cfgp':
            if vals1['cfgp'] != '' or vals1[
                    'cfgp'] != 'path to save configuration file (.yml)':
                vals1['cfgp'] = f"{vals1['cfgp']}.yml" if not vals1[
                    'cfgp'].endswith('.yml') else vals1['cfgp']
                if test:
                    yaml.dump(vals1,
                              open(vals1['cfgp'] + '_test.yml', 'w'),
                              default_flow_style=False)
                if not 'vals2' in locals():
                    vals2 = None
                cfg = guival2cfg(vals1, vals2)
                from beditor.pipeline import validcfg
                if validcfg(cfg):
                    yaml.dump(cfg,
                              open(vals1['cfgp'], 'w'),
                              default_flow_style=False)
                    win.FindElement('save cfgp error').Update(
                        f"saved", text_color='green')
                else:
                    win.FindElement('save cfgp error').Update(
                        f"error/s in configuration", text_color='red')
                win.FindElement('run beditor').Update(disabled=False)
            win.FindElement('cfgp').Update(vals1['cfgp'])
            win = resetwinvals(win, vals1)
        elif ev1 == 'run beditor':
            win.FindElement('guiload').UpdateAnimation(
                source=f'{dirname(abspath(__file__))}/data/gui/guiload.gif',
                time_between_frames=0)
            win.FindElement('run beditor error').Update(f"running!",
                                                        text_color='green')
            try:
                runbashcmd(
                    f"source activate beditor; beditor --cfg {vals1['cfgp']}")
                win.FindElement('run beditor error').Update(
                    f"finished processing!", text_color='green')
            except:
                win.FindElement('run beditor error').Update(
                    f"errored! see command line", text_color='red')
            #TODO create cfg and validate
            win = resetwinvals(win, vals1)
Exemple #10
0
def _confirm_download(cfg, prompt):
    """
    Decides whether a missing reference file should be downloaded

    :param cfg: configuration dict; only `cfgp` is read
    :param prompt: question shown to the user
    :returns: True if the download should go ahead
    """
    # configurations living under /test_beditor/ are never prompted
    if '/test_beditor/' in cfg['cfgp']:
        return True
    # the prompt advertises "[Y/n]", i.e. yes is the default answer, so accept
    # an empty reply and any casing of y/yes instead of only a literal 'Y'
    return input(prompt).strip().lower() in ('', 'y', 'yes')


def get_genomes(cfg):
    """
    Installs genomes

    Runs `pyensembl install` for the requested release, then makes sure that
    the genome FASTA (plus its bwa index, samtools index and contig sizes) and
    the GFF3 annotation file exist below the directory of this module,
    offering to download them from ftp.ensembl.org when they are missing.

    Exits the process with status 1 if pyensembl reports no usable contigs or
    if the download is declined.

    :param cfg: configuration dict; reads `host`, `genomeassembly`,
        `genomerelease`, `cfgp`, `test`, `bwa` and `samtools`
    :returns cfg: the same dict with `host` possibly normalised and with
        `genomep` and `genomegffp` set
    """
    runbashcmd(
        f"pyensembl install --reference-name {cfg['genomeassembly']} --release {cfg['genomerelease']} --species {cfg['host']}"
    )

    import pyensembl
    species = pyensembl.species.Species.register(
        latin_name=cfg['host'],
        synonyms=[cfg['host']],
        reference_assemblies={
            cfg['genomeassembly']:
            (cfg['genomerelease'], cfg['genomerelease']),
        })
    ensembl = pyensembl.EnsemblRelease(species=species,
                                       release=cfg['genomerelease'])
    contig_mito = ['MTDNA', 'MITO', 'MT']
    # keep short, dot-free contig names and drop the mitochondrial ones
    contigs = [
        c for c in ensembl.contigs()
        if ((not '.' in c) and (len(c) < 5) and (c not in contig_mito))
    ]
    if len(contigs) == 0:
        logging.error('no contigs identified by pyensembl; aborting')
        # this is a failure, so report a non-zero exit status
        sys.exit(1)
    logging.info(f"{len(contigs)} contigs/chromosomes in the genome")
    logging.info(contigs)

    # raw genome next
    if 'human' in cfg['host'].lower():
        cfg['host'] = 'homo_sapiens'
    if 'yeast' in cfg['host'].lower():
        cfg['host'] = 'saccharomyces_cerevisiae'

    # wget is run with `-x -nH -P <pkgd>`, so downloads land in
    # <pkgd>/<path on the ftp server>
    pkgd = dirname(realpath(__file__))
    ensembl_fastad = 'pub/release-{}/fasta/{}/dna/'.format(
        cfg['genomerelease'], cfg['host'])
    genome_fastad = '{}/{}'.format(pkgd, ensembl_fastad)
    cfg['genomep'] = '{}/genome.fa'.format(genome_fastad)
    if not exists(cfg['genomep']):
        logging.error('not found: {}'.format(cfg['genomep']))
        if not _confirm_download(
                cfg, "Download genome at {}?[Y/n]: ".format(genome_fastad)):
            logging.error('abort')
            sys.exit(1)
        # #FIXME download contigs and cat and get index, sizes
        for contig in contigs:
            if 'GRCh37' in cfg['genomeassembly']:
                #Homo_sapiens.GRCh37.75.dna_sm.chromosome.1.fa.gz
                fn = f"{cfg['host'].capitalize()}.{cfg['genomeassembly']}.{cfg['genomerelease']}.dna_sm.chromosome.{contig}.fa.gz"
            else:
                fn = f"{cfg['host'].capitalize()}.{cfg['genomeassembly']}.dna_sm.chromosome.{contig}.fa.gz"
            fp = '{}/{}'.format(ensembl_fastad, fn)
            # look for the archive where wget stores it, not relative to the
            # current working directory
            if not exists('{}/{}'.format(pkgd, fp)):
                cmd = 'wget -q -x -nH ftp://ftp.ensembl.org/{} -P {}'.format(
                    fp, pkgd)
                runbashcmd(cmd, test=cfg['test'])
        # make the fa ready
        cmd = 'gunzip {}*.fa.gz;cat {}/*.fa > {}/genome.fa;'.format(
            genome_fastad, genome_fastad, genome_fastad)
        runbashcmd(cmd, test=cfg['test'])

    if not exists(cfg['genomep'] + '.bwt'):
        cmd = '{} index {}'.format(cfg['bwa'], cfg['genomep'])
        runbashcmd(cmd, test=cfg['test'])
    else:
        logging.info('bwa index is present')
    if not exists(cfg['genomep'] + '.fai'):
        cmd = '{} faidx {}'.format(cfg['samtools'], cfg['genomep'])
        runbashcmd(cmd, test=cfg['test'])
    else:
        logging.info('samtools index is present')
    if not exists(cfg['genomep'] + '.sizes'):
        # the first two columns of the .fai file are written as the sizes file
        cmd = 'cut -f1,2 {}.fai > {}.sizes'.format(cfg['genomep'],
                                                   cfg['genomep'])
        runbashcmd(cmd, test=cfg['test'])
    else:
        logging.info('sizes of contigs are present')

    ensembl_gff3d = 'pub/release-{}/gff3/{}/'.format(cfg['genomerelease'],
                                                     cfg['host'])
    genome_gff3d = '{}/{}'.format(pkgd, ensembl_gff3d)
    cfg['genomegffp'] = '{}/genome.gff3'.format(genome_gff3d)
    if not exists(cfg['genomegffp']):
        logging.error('not found: {}'.format(cfg['genomegffp']))
        if not _confirm_download(
                cfg, "Download genome annotations at {}?[Y/n]: ".format(
                    genome_gff3d)):
            logging.error('abort')
            sys.exit(1)
        fn = '{}.{}.{}.gff3.gz'.format(cfg['host'].capitalize(),
                                       cfg['genomeassembly'],
                                       cfg['genomerelease'])
        fp = '{}/{}'.format(ensembl_gff3d, fn)
        gff3gzp = '{}/{}'.format(genome_gff3d, fn)
        if not exists(gff3gzp):
            cmd = 'wget -x -nH ftp://ftp.ensembl.org/{} -P {}'.format(
                fp, pkgd)
            runbashcmd(cmd, test=cfg['test'])
        # move to genome.gff3; done even when the archive was already there,
        # otherwise genome.gff3 would never be created in that case
        # NOTE(review): the .gz archive is copied as-is (not decompressed) --
        # confirm that readers of `genomegffp` expect gzipped content
        cmd = 'cp {} {}'.format(gff3gzp, cfg['genomegffp'])
        runbashcmd(cmd, test=cfg['test'])
    logging.info('genomes are installed!')
    return cfg
Exemple #11
0
def get_deps(cfg):
    """
    Installs dependencies of `beditor`

    Downloads, unpacks and builds samtools, bedtools and bwa inside a `deps`
    directory next to this module, skipping every dependency whose executable
    already exists. A table describing the dependencies is written to
    `deps/deps.tsv`; `deps/deps.log` is opened in append mode and handed to
    each shell command.

    :param cfg: configuration dict
    :returns cfg: the same dict with `samtools`, `bedtools` and `bwa` set to
        the paths of the executables
    :raises ValueError: if a download link has an unsupported archive type
    """
    depsd = "%s/deps" % abspath(dirname(__file__))
    # exist_ok avoids failing if the directory appears between check and create
    makedirs(depsd, exist_ok=True)

    # dep -> (download link, executable relative to the dep directory,
    #         build directory relative to the dep directory, build command)
    specs = {
        'samtools':
        ('https://github.com/samtools/samtools/releases/download/1.7/samtools-1.7.tar.bz2',
         'samtools-1.7/samtools', 'samtools-1.7',
         './configure --disable-lzma;make;'),
        # built from the source root directly, so the build does not depend
        # on the bin/ directory already existing
        'bedtools':
        ('https://github.com/arq5x/bedtools2/releases/download/v2.27.1/bedtools-2.27.1.tar.gz',
         'bedtools2/bin/bedtools', 'bedtools2', 'make;'),
        'bwa':
        ('https://github.com/lh3/bwa/releases/download/v0.7.17/bwa-0.7.17.tar.bz2',
         'bwa-0.7.17/bwa', 'bwa-0.7.17', 'make;'),
    }
    rows = []
    for dep, (link, exe_rel, build_rel, build_cmd) in specs.items():
        localp = '{}/{}'.format(depsd, dep)
        rows.append({
            'local path': localp,
            'download link': link,
            'executable': '{}/{}'.format(localp, exe_rel),
            'install': 'cd {}/{};{}'.format(localp, build_rel, build_cmd),
            'ext': '.tar.{}'.format(link.split('.tar.')[-1]),
        })
    ddeps = pd.DataFrame(
        rows,
        index=list(specs),
        columns=['local path', 'download link', 'executable', 'install', 'ext'])
    ddeps.index.name = 'dep'
    ddeps.to_csv("{}/deps.tsv".format(depsd), sep='\t')

    # tar options for each supported archive type
    ext2tarcom = {'.tar.bz2': 'xvjf', '.tar.gz': 'zxvf'}

    logp = "%s/deps.log" % (depsd)
    with open(logp, 'a') as logf:
        for dep in ddeps.index:
            executable = ddeps.loc[dep, 'executable']
            if exists(executable):
                continue
            logging.info("configuring: {} to {}".format(dep, executable))
            link = ddeps.loc[dep, 'download link']
            path = ddeps.loc[dep, 'local path']
            tarp = '{}/{}'.format(path, basename(link))
            if not exists(tarp):
                runbashcmd("wget -q %s --directory-prefix=%s" % (link, path),
                           logf=logf)
            if not exists(dirname(executable)):
                ext = ddeps.loc[dep, 'ext']
                if ext not in ext2tarcom:
                    # fail with a clear message rather than an unbound name
                    raise ValueError(
                        "unsupported archive type '{}' for {}".format(
                            ext, dep))
                runbashcmd("tar {} {} -C {}".format(ext2tarcom[ext], tarp,
                                                    path),
                           logf=logf)
            runbashcmd(ddeps.loc[dep, 'install'], logf=logf)

    logging.info("dependencies are installed!")
    for dep in ddeps.index:
        cfg[dep] = ddeps.loc[dep, 'executable']
    return cfg