def test_species(host='saccharomyces_cerevisiae'):
    """
    Makes sure an up-to-date copy of the `test_beditor` repository is
    available in the current working directory.

    :param host: species name (not used by this function body)
    """
    repo_is_present = exists('test_beditor')
    if repo_is_present:
        # already cloned: only refresh it
        runbashcmd('cd test_beditor;git pull', test=True)
    else:
        # first use: clone test_beditor
        runbashcmd('git clone https://github.com/rraadd88/test_beditor.git',
                   test=True)
def alignmentbed2dalignedfasta(cfg):
    """
    Get sequences in FASTA format from BED file
    step#5

    Runs `bedtools getfasta` on the alignment BED file and stores the
    sequences that do not contain `N` as a tab-separated table at
    `cfg['dalignedfastap']`.

    :param cfg: configuration dict
    :returns: the configuration dict, unchanged
    """
    tmp_dir = cfg['datatmpd']
    bed_path = cfg['alignmentbedp']
    table_path = cfg['dalignedfastap']
    logging.info(basename(table_path))

    # nothing to do when the table is already there and no rerun is forced
    if exists(table_path) and not cfg['force']:
        return cfg

    fasta_path = '{}/05_alignment.fa'.format(tmp_dir)
    if not (exists(fasta_path) and not cfg['force']):
        runbashcmd(
            f"{cfg['bedtools']} getfasta -s -name -fi {cfg['genomep']} -bed {bed_path} -fo {fasta_path}"
        )

    def _is_free_of_n(row):
        # FIXME bwa aligns to NNNNNs
        return not 'N' in row['aligned sequence']

    aligned = fa2df(fasta_path)
    aligned.columns = ['aligned sequence']
    keep_rows = aligned.apply(_is_free_of_n, axis=1)
    aligned = aligned.loc[keep_rows, :]
    # for bedtools 2.27, the fasta header now has hanging (+) or (-)
    aligned.index = [header.split('(')[0] for header in aligned.index]
    aligned.index.name = 'id'
    aligned.to_csv(table_path, sep='\t')
    return cfg
def get_seq_nucleotide(cfg, din):
    """
    Fetches sequences if mutation format is nucleotide

    Builds a BED file from the genome coordinates in `din`, extracts the
    flanking sequences with `bedtools getfasta`, derives the wild-type and
    mutated nucleotide/codon columns and writes the resulting table to
    `cfg['dsequencesp']`. Nothing is done when that file already exists and
    `cfg['force']` is false.

    :param cfg: configuration dict
    :param din: input data
    :returns: None; the sequences table is written to `cfg['dsequencesp']`
    """
    bedp = f"{cfg['datad']}/dbedntmuts.bed"
    fastap = f"{cfg['datad']}/dbedntmuts.fa"
    dbedntmutsp = f"{cfg['datad']}/dbedntmuts.tsv"

    if exists(cfg['dsequencesp']) and not cfg['force']:
        return

    if not exists(bedp) or cfg['force']:
        dbed = genomeocoords2bed(din, col_genomeocoord='genome coordinate')
        # widen every interval by `flankntc` nucleotides on both sides
        # NOTE(review): the extra -1 presumably converts to a 0-based BED
        # start -- confirm against genomeocoords2bed
        dbed['start'] = dbed['start'].astype(int) - flankntc - 1
        dbed['end'] = dbed['end'].astype(int) + flankntc
        dbed.to_csv(bedp, sep='\t', header=False, index=False)

    if not exists(fastap) or cfg['force']:
        cmd = f"{cfg['bedtools']} getfasta -s -name -fi {cfg['genomep']} -bed {bedp} -fo {fastap}"
        runbashcmd(cmd)

    if not exists(dbedntmutsp) or cfg['force']:
        dbedntmuts = fa2df(fastap)
        dbedntmuts.columns = ['transcript: sequence']
        dbedntmuts['transcript: sequence'] = dbedntmuts.apply(
            lambda x: x['transcript: sequence'].upper(), axis=1)
        dbedntmuts = dbedntmuts.reset_index()
        # drop the strand suffix "(+)"/"(-)" from the fasta header
        dbedntmuts['genome coordinate'] = dbedntmuts.apply(
            lambda x: x['id'].split('(')[0], axis=1)
        dbedntmuts.to_csv(dbedntmutsp, sep='\t')
    else:
        dbedntmuts = pd.read_table(dbedntmutsp, keep_default_na=False)

    dsequences = pd.merge(din, dbedntmuts,
                          on=['genome coordinate'],
                          suffixes=('', ': dbedntmuts'))
    dsequences = del_Unnamed(dsequences)

    # do not overwrite columns supplied with the input: values derived from
    # the flanking sequence then go to separately named columns
    col_nt_wt = ('nucleotide wild-type'
                 if 'nucleotide wild-type' not in dsequences
                 else 'nucleotide wild-type: from flanking sequence')
    col_cd_wt = ('codon: wild-type'
                 if 'codon: wild-type' not in dsequences
                 else 'codon: wild-type: from flanking sequence')
    col_cd_mt = ('codon: mutation'
                 if 'codon: mutation' not in dsequences
                 else 'codon: mutation: from flanking sequence')

    # the mutated nucleotide sits at index `flankntc` of the flanked sequence
    dsequences[col_nt_wt] = dsequences.apply(
        lambda x: x['transcript: sequence'][flankntc], axis=1)
    dsequences[col_cd_wt] = dsequences.apply(
        lambda x: x['transcript: sequence'][flankntc - 1:flankntc + 2],
        axis=1)
    # NOTE(review): this reads 'codon: wild-type' (not `col_cd_wt`), i.e. the
    # input's own column when one was supplied -- confirm this is intended
    dsequences[col_cd_mt] = dsequences.apply(
        lambda x: f"{x['codon: wild-type'][0]}{x['nucleotide mutation']}{x['codon: wild-type'][2]}",
        axis=1)
    dsequences['transcript: id'] = dsequences['genome coordinate']

    dsequences_bedcols = genomeocoords2bed(
        dsequences, col_genomeocoord='genome coordinate')
    for col in dsequences_bedcols:
        dsequences[col] = dsequences_bedcols[col]

    if cfg.get('reverse_mutations'):
        from beditor.lib.io_dfs import dfswapcols
        # keep the frame returned by dfswapcols: it used to be bound to an
        # unused name, so the table written below ignored the returned value
        dsequences = dfswapcols(
            dsequences, ['nucleotide wild-type', 'nucleotide mutation'])
        dsequences = dfswapcols(
            dsequences, ['codon: wild-type', 'codon: mutation'])
    dsequences.to_csv(f"{cfg['dsequencesp']}", sep='\t')
def dguides2guidessam(cfg, dguides):
    """
    Aligns guides to genome and gets SAM file
    step#1

    For every distinct value of the 'guide+PAM length' column a FASTA file of
    the guides is written and aligned with `bwa aln` / `bwa samse`. Existing
    intermediate files are reused unless `cfg['force']` is set.

    :param cfg: configuration dict
    :param dguides: dataframe of guides
    :returns: the configuration dict, unchanged
    """
    datatmpd = cfg['datatmpd']
    genomep = cfg['genomep']
    dguides = set_index(dguides, 'guide: id')

    ## BWA alignment command is adapted from cripror
    ## https://github.com/rraadd88/crisporWebsite/blob/master/crispor.py
    # maximum number of occurences in the genome to get flagged as repeats.
    # This is used in bwa samse, when converting the sam file
    MAXOCC = 60000
    # the BWA queue size is 2M by default. We derive the queue size from MAXOCC
    MFAC = 2000000 / MAXOCC
    # -m is queue size in bwa; `/` is float division, so cast back to int
    # to hand bwa a well-formed integer argument
    bwaM = int(MFAC * MAXOCC)

    for guidel in dguides.loc[:, 'guide+PAM length'].unique():
        logging.debug(f"now aligning guides of length {guidel}")

        guidesfap = f'{datatmpd}/01_guides_guidel{guidel:02}.fa'
        logging.info(basename(guidesfap))
        if not exists(guidesfap) or cfg['force']:
            # NOTE(review): every guide is written here, not only those of
            # length `guidel` -- confirm this is intended
            with open(guidesfap, 'w') as f:
                for gi in dguides.index:
                    f.write('>{}\n{}\n'.format(
                        gi.replace(' ', '_'),
                        dguides.loc[gi, 'guide+PAM sequence']))

        guidessap = f'{datatmpd}/01_guides_guidel{guidel:02}.sa'
        logging.info(basename(guidessap))
        if not exists(guidessap) or cfg['force']:
            cmd = f"{cfg['bwa']} aln -t 1 -o 0 -m {bwaM} -n {cfg['mismatches_max']} -k {cfg['mismatches_max']} -N -l {guidel} {genomep} {guidesfap} > {guidessap} 2> {guidessap}.log"
            runbashcmd(cmd)

        guidessamp = f'{datatmpd}/01_guides_guidel{guidel:02}.sam'
        logging.info(basename(guidessamp))
        if not exists(guidessamp) or cfg['force']:
            cmd = f"{cfg['bwa']} samse -n {MAXOCC} {genomep} {guidessap} {guidesfap} > {guidessamp} 2> {guidessamp}.log"
            runbashcmd(cmd)
    return cfg
def dalignbed2annotationsbed(cfg):
    """
    Get annotations from the aligned BED file
    step#3

    Sorts the alignment BED file and the genome annotation with
    `bedtools sort`, intersects the two and records the path of the
    result in `cfg['annotationsbedp']`.

    :param cfg: configuration dict
    :returns: the configuration dict with 'annotationsbedp' set
    """
    tmp_dir = cfg['datatmpd']
    bed_path = cfg['alignmentbedp']
    bedtools = None  # looked up lazily below, exactly where a command is built

    sorted_bed_path = bed_path + '.sorted.bed'
    logging.info(basename(sorted_bed_path))
    if not (exists(sorted_bed_path) and not cfg['force']):
        bedtools = cfg['bedtools']
        runbashcmd('{} sort -i {} > {}'.format(bedtools, bed_path,
                                               sorted_bed_path))

    sorted_gff_path = cfg['genomegffp'] + '.sorted.gff3.gz'
    logging.info(basename(sorted_gff_path))
    # the sorted annotation is reused even when a rerun is forced
    if not exists(sorted_gff_path):
        runbashcmd(
            f"{cfg['bedtools']} sort -i {cfg['genomegffp']} > {sorted_gff_path}"
        )

    annotations_path = '{}/03_annotations.bed'.format(tmp_dir)
    cfg['annotationsbedp'] = annotations_path
    logging.info(basename(annotations_path))
    if not (exists(annotations_path) and not cfg['force']):
        runbashcmd(
            f"{cfg['bedtools']} intersect -wa -wb -loj -a {sorted_bed_path} -b {sorted_gff_path} > {annotations_path}"
        )
    return cfg
def test_species(host='saccharomyces_cerevisiae'):
    """
    Refreshes the `test_beditor` repository in the current working
    directory and runs its `test_datasets.py` inside the `beditor`
    environment.

    :param host: species name (not used by this function body)
    """
    repo_is_present = exists('test_beditor')
    if repo_is_present:
        # already cloned: only refresh it
        runbashcmd('cd test_beditor;git pull', test=True)
    else:
        # first use: clone test_beditor
        runbashcmd('git clone https://github.com/rraadd88/test_beditor.git',
                   test=True)
    run_tests = 'source activate beditor;cd test_beditor;python test_datasets.py'
    runbashcmd(run_tests, test=True)
def get_genomes(cfg):
    """
    Installs genomes

    Installs the pyensembl data, downloads the genome FASTA and the GFF3
    annotation next to this module when they are missing (after asking on
    stdin, except for test runs from the GUI), and builds the bwa index,
    the samtools index and the contig sizes file.

    Sets `cfg['genomep']` and `cfg['genomegffp']` (and `cfg['gui']` when the
    key is absent). Calls `sys.exit(1)` when a download is declined.

    :param cfg: configuration dict
    :returns: the updated configuration dict
    """
    logging.info(
        'pyensembl: checking if genome is installed/ downloading if necessary.'
    )
    runbashcmd(
        f"pyensembl install --reference-name {cfg['genomeassembly']} --release {cfg['genomerelease']} --species {cfg['host']}"
    )

    # back compatible: configurations made before the gui existed have no
    # 'gui' key. Default it up front because both download sections below
    # read it (it used to be set only when the genome fasta was missing).
    if 'gui' not in cfg:
        cfg['gui'] = False

    # download genome for step 5 specificity
    ensembl_fastad = 'pub/release-{}/fasta/{}/dna/'.format(
        cfg['genomerelease'], cfg['host'])
    genome_fastad = '{}/{}'.format(dirname(realpath(__file__)), ensembl_fastad)
    cfg['genomep'] = '{}/genome.fa'.format(genome_fastad)
    if not exists(cfg['genomep']):
        logging.error(f"not found: {cfg['genomep']}")
        logging.info(f"downloading file: {cfg['genomep']}")
        if (not '/test_beditor/' in cfg['cfgp']) or (not cfg['gui']):
            ifdlref = input(
                "Download genome at {}?[Y/n]: ".format(genome_fastad))
        else:
            ifdlref = 'Y'
        if ifdlref == 'Y':
            # #FIXME download contigs and cat and get index, sizes
            try:
                contigurls = get_genomeurls(cfg['host'],
                                            cfg['genomerelease'],
                                            test=False)['dna']
            except Exception:
                # one retry, e.g. after a transient connection failure
                logging.warning('listing genome urls failed; retrying once')
                contigurls = get_genomeurls(cfg['host'],
                                            cfg['genomerelease'],
                                            test=False)['dna']
            logging.info(
                f"{len(contigurls)} contigs/chromosomes in the genome")
            logging.info(contigurls)
            for contigurl in contigurls:
                fn = basename(contigurl)
                # NOTE(review): this path is relative to the working
                # directory while wget saves under the package directory --
                # confirm the existence check is effective
                fp = f'{ensembl_fastad}/{fn}'
                logging.info(f"downloading: {contigurl}")
                if not exists(fp):
                    cmd = f"wget -q -x -nH {contigurl} -P {dirname(realpath(__file__))}"
                    runbashcmd(cmd, test=cfg['test'])
            # make the fa ready
            if not exists(cfg['genomep']):
                cmd = 'gunzip {}*.fa.gz;cat {}/*.fa > {}/genome.fa;'.format(
                    genome_fastad, genome_fastad, genome_fastad)
                runbashcmd(cmd, test=cfg['test'])
        else:
            logging.error('abort')
            sys.exit(1)

    if not exists(cfg['genomep'] + '.bwt'):
        cmd = '{} index {}'.format(cfg['bwa'], cfg['genomep'])
        runbashcmd(cmd, test=cfg['test'])
    else:
        logging.info('bwa index is present')
    if not exists(cfg['genomep'] + '.fai'):
        cmd = '{} faidx {}'.format(cfg['samtools'], cfg['genomep'])
        runbashcmd(cmd, test=cfg['test'])
    else:
        logging.info('samtools index is present')
    if not exists(cfg['genomep'] + '.sizes'):
        cmd = 'cut -f1,2 {}.fai > {}.sizes'.format(cfg['genomep'],
                                                   cfg['genomep'])
        runbashcmd(cmd, test=cfg['test'])
    else:
        logging.info('sizes of contigs are present')

    # download gff3
    ensembl_gff3d = 'pub/release-{}/gff3/{}/'.format(cfg['genomerelease'],
                                                     cfg['host'])
    genome_gff3d = f'{dirname(realpath(__file__))}/{ensembl_gff3d}'
    cfg['genomegffp'] = '{}/genome.gff3'.format(genome_gff3d)
    if not exists(cfg['genomegffp']):
        logging.error(f"not found: {cfg['genomegffp']}")
        logging.info(f"downloading file: {cfg['genomegffp']}")
        if (not '/test_beditor/' in cfg['cfgp']) or (not cfg['gui']):
            ifdlref = input(
                f"Download genome annotations at {genome_gff3d}?[Y/n]: ")
        else:
            ifdlref = 'Y'
        if ifdlref == 'Y':
            # #FIXME download contigs and cat and get index, sizes
            fn = '{}.{}.{}.gff3.gz'.format(cfg['host'].capitalize(),
                                           cfg['genomeassembly'],
                                           cfg['genomerelease'])
            fp = '{}/{}'.format(ensembl_gff3d, fn)
            try:
                ensembl_gff3p = get_genomeurls(cfg['host'],
                                               cfg['genomerelease'],
                                               test=False)['gff3']
            except Exception:
                # one retry, e.g. after a transient connection failure
                logging.warning('listing genome urls failed; retrying once')
                ensembl_gff3p = get_genomeurls(cfg['host'],
                                               cfg['genomerelease'],
                                               test=False)['gff3']
            if not exists(fp):
                cmd = f'wget -x -nH {ensembl_gff3p} -P {dirname(realpath(__file__))}'
                runbashcmd(cmd, test=cfg['test'])
            # move to genome.gff3
            cmd = 'cp {}/{} {}'.format(genome_gff3d, fn, cfg['genomegffp'])
            runbashcmd(cmd, test=cfg['test'])
        else:
            logging.error('abort')
            sys.exit(1)
    logging.info('genomes are installed!')
    return cfg
def get_seq_aminoacid(cfg, din):
    """
    Fetches sequences if mutation format is amino acid

    For every row of `din` the codon of the mutated residue is located on the
    genome through pyensembl, a flanked interval is written to a BED file, the
    sequences are extracted with `bedtools getfasta`, and the merged table is
    written to `cfg['dsequencesp']` (an intermediate copy goes to
    `cfg['dseqtmpp']`). Transcripts that could not be processed are listed in
    `<datad>/dbedflank.bed.err.json`. When no row yields a valid interval an
    empty table is saved instead.

    :param cfg: configuration dict
    :param din: input data; its index is reset to a range in place
    :returns: None; results are written to the paths configured in `cfg`
    """
    import pyensembl
    # import ensembl object that would fetch genes
    ensembl = pyensembl.EnsemblRelease(
        species=pyensembl.species.Species.register(
            latin_name=cfg['host'],
            synonyms=[cfg['host']],
            reference_assemblies={
                cfg['genomeassembly']:
                (cfg['genomerelease'], cfg['genomerelease']),
            }),
        release=cfg['genomerelease'])
    # look the known ids up once instead of re-listing them for every row
    known_transcript_ids = set(ensembl.transcript_ids())

    # FIXME put flank in the yml
    flank_upstream = 22
    flank_downstream = 21
    # 'end' - 'start' of a usable interval: both flanks plus the distance
    # between the first and the last position of a contiguous codon
    interval_span = flank_upstream + flank_downstream + 2

    din.index = range(len(din))
    dbedp = '{}/dbedflank.bed'.format(cfg['datad'])
    dbed = pd.DataFrame(columns=bed_colns)
    terrpositions = []
    terrnotfound = []
    terrnoncoding = []
    bedrowi = 0
    for i in din.index:
        transcript_id = din.loc[i, 'transcript: id']
        if transcript_id not in known_transcript_ids:
            terrnotfound.append(transcript_id)
            if cfg['test']:
                logging.error('not found: {}'.format(transcript_id))
            continue
        t = ensembl.transcript_by_id(transcript_id)
        if not (t.is_protein_coding and t.contains_start_codon
                and t.contains_stop_codon):
            terrnoncoding.append(t.id)
            continue
        coding_sequence_positions = tboundaries2positions(t)
        if len(coding_sequence_positions) != len(t.coding_sequence):
            terrpositions.append(t.id)
            continue
        # TODO need to check if the seq made from coding_sequence_positions
        # is same as t.coding_sequence
        dcoding = t2pmapper(t, coding_sequence_positions)
        dcodingmutpos = dcoding.loc[(
            dcoding['protein index'] == din.loc[i, 'aminoacid: position']), :]
        codon_positions = dcodingmutpos['coding sequence positions'].tolist()
        if len(codon_positions) == 0:
            terrpositions.append(t.id)
            continue

        dbed.loc[bedrowi, 'chromosome'] = t.contig
        if cfg['test']:
            print(transcript_id, codon_positions)
        # on the '-' strand the listed positions run from the high to the
        # low coordinate, so the ends are swapped
        if t.strand == '+':
            dbed.loc[bedrowi, 'codon start'] = codon_positions[0]
            dbed.loc[bedrowi, 'codon end'] = codon_positions[2]
        elif t.strand == '-':
            dbed.loc[bedrowi, 'codon start'] = codon_positions[2]
            dbed.loc[bedrowi, 'codon end'] = codon_positions[0]
        dbed.loc[bedrowi, 'start'] = (dbed.loc[bedrowi, 'codon start'] -
                                      flank_upstream)
        dbed.loc[bedrowi, 'end'] = (dbed.loc[bedrowi, 'codon end'] +
                                    flank_downstream)
        dbed.loc[bedrowi, 'reference residue'] = dcodingmutpos[
            'protein sequence'].tolist()[0]
        dbed.loc[bedrowi, 'reference codon'] = ''.join(
            dcodingmutpos['coding sequence'].tolist())
        dbed.loc[bedrowi, 'strand'] = t.strand
        dbed.loc[bedrowi, 'id'] = '{}|{}|{}|{}|{}'.format(
            transcript_id, dbed.loc[bedrowi, 'chromosome'],
            dbed.loc[bedrowi, 'strand'], int(dbed.loc[bedrowi, 'start']),
            int(dbed.loc[bedrowi, 'end']))
        dbed.loc[bedrowi, 'gene: id'] = t.gene_id
        dbed.loc[bedrowi, 'gene: name'] = t.gene.name
        dbed.loc[bedrowi, 'protein: id'] = t.protein_id
        dbed.loc[bedrowi, 'aminoacid: position'] = din.loc[
            i, 'aminoacid: position']
        bedrowi += 1

    if len(dbed) == 0:
        from beditor.lib.global_vars import saveemptytable
        logging.warning('no valid seqeunces found; saving an empty table.')
        saveemptytable(cfg, f"{cfg['dsequencesp']}")
        return None

    # keep only intervals of the expected length
    dbed = dbed.loc[(dbed.apply(
        lambda x: x['end'] - x['start'] == interval_span, axis=1)), :]
    dbed.loc[:, 'start'] = dbed.loc[:, 'start'].astype(int)
    dbed.loc[:, 'end'] = dbed.loc[:, 'end'].astype(int)
    dbed = dbed.drop_duplicates(subset=bed_colns)
    dbed.loc[:, bed_colns].to_csv(dbedp, sep='\t', header=False, index=False)

    err2tids = {
        'terrpositions': terrpositions,
        'terrnotfound': terrnotfound,
        'terrnoncoding': terrnoncoding,
    }
    if cfg['test']:
        print(err2tids)
    with open(dbedp + '.err.json', 'w') as outfile:
        json.dump(err2tids, outfile)

    bedp = f"{cfg['datad']}/dbedflank.bed"
    fastap = f"{cfg['datad']}/dbedflank.fa"
    cmd = f"{cfg['bedtools']} getfasta -s -name -fi {cfg['genomep']} -bed {bedp} -fo {fastap}"
    runbashcmd(cmd)

    dflankfa = fa2df(fastap, ids2cols=True)
    dflankfa.loc[:, 'sequence'] = dflankfa.loc[:, 'sequence'].apply(
        lambda x: x.upper())
    dflankfa.loc[:, 'sequence: length'] = [
        len(s) for s in dflankfa['sequence']
    ]
    # drop the strand suffix "(+)"/"(-)" from the fasta header
    dflankfa.index = [idx.split('(')[0] for idx in dflankfa.index]
    dflankfa.index.name = 'id'
    dseq = set_index(dbed, 'id').join(set_index(dflankfa, 'id'), rsuffix='.1')

    # output column name -> column name in the joined table
    dseq2compatible = {
        'aminoacid: position': 'aminoacid: position',
        'gene: id': 'gene: id',
        'gene: name': 'gene: name',
        'protein: id': 'protein: id',
        'transcript: id': 'seqid',
        'transcript: sequence': 'sequence',
        'aminoacid: wild-type': 'reference residue',
        'codon: wild-type': 'reference codon',
        'contig': 'contig',
        'strand': 'strand',
        'start': 'start',
        'end': 'end',
        'codon start': 'codon start',
        'codon end': 'codon end',
    }
    if 'amino acid mutation' in dseq:
        dseq2compatible['amino acid mutation'] = 'amino acid mutation'
    dseq.to_csv(cfg['dseqtmpp'], sep='\t')

    dseq = dseq[list(dseq2compatible.values())]
    dseq.columns = list(dseq2compatible.keys())
    logging.info(dseq.columns.tolist())
    logging.info(din.columns.tolist())
    dseq = pd.merge(dseq.reset_index(),
                    din,
                    on=['transcript: id', 'aminoacid: position'])
    logging.info(dseq.columns.tolist())
    # NOTE(review): the return value is discarded; unless set_index works in
    # place this call has no effect -- confirm before assigning it
    set_index(dseq, 'id')
    if cfg.get('reverse_mutations'):
        from beditor.lib.io_dfs import dfswapcols
        dseq = dfswapcols(dseq,
                          ['aminoacid: wild-type', 'amino acid mutation'])
        dseq['codon: mutation'] = dseq['codon: wild-type'].copy()
    dseq.to_csv(f"{cfg['dsequencesp']}", sep='\t')
def gui(test=False):
    """
    Runs the event loop of the beditor graphical interface.

    Builds the main window from `get_layout`, then reacts to events until the
    window is closed: choosing or adding a base editor and PAM, loading a
    configuration or a mutation table, validating the inputs, saving the
    configuration file and launching `beditor` on it.

    :param test: if True, events and values are printed and the raw window
        values are also dumped next to the saved configuration file
    """
    layout = get_layout(test=test)
    win = sg.Window('beditor').Layout(layout)
    win_add_bepam_active = False
    bulconfigure_advanced = False
    init = True
    # values of the "add new BE and PAM" dialog; stays None until it is used
    vals2 = None
    # events named after a hidden panel plus a trailing underscore
    reveal_events = (
        'optional: load configuration file_',
        'options for amino acid mutations_',
        'optional: for infering amino acid mutations from wt_',
        'optional: dependencies paths_',
        'optional: or just save configuration file_',
        'optional: design control gRNAs_',
    )
    while True:
        # gottu be in while loop to capture first event
        ev1, vals1 = win.Read()
        if init:
            # values of the pristine window, used by 'clear all'
            _vals1 = vals1
            init = False
        if test:
            print(ev1)
            print(vals1)
        # window closed: leave before the values (possibly None) are indexed
        if ev1 is None or vals1 is None:
            break
        if vals1['mutation table'] != '':
            win.FindElement('load din').Update(disabled=False)
        if vals1['cfgp'] != '':
            win.FindElement('save cfgp').Update(disabled=False)

        if ev1 == 'BE type and PAM':
            dbepams = get_dbepams()
            dbepams_ = dbepams.groupby('BE type and PAM').agg({
                'BE name and editing window': unique_dropna,
            }).reset_index()
            be_names = tuple(dbepams_.loc[
                dbepams_['BE type and PAM'] == vals1['BE type and PAM'],
                'BE name and editing window'].sort_values().tolist()[0])
            win.FindElement('BE name and editing window').Update(
                disabled=False)
            win.FindElement('BE name and editing window').Update(
                values=be_names)
            win.FindElement('add_bepam').Update(disabled=True)
            win.FindElement('BE and PAM clear').Update(disabled=False)
            win.FindElement('BE name and editing window').Update(
                disabled=False)
            win = resetwinvals(win, vals1)
        elif ev1 == 'add_bepam' and not win_add_bepam_active:
            win_add_bepam_active = True
            win.Hide()
            win_add_bepam = sg.Window('add new BE and PAM').Layout(
                layout_addbepam)
            while True:
                ev2, vals2 = win_add_bepam.Read()
                if test:
                    print(ev2)
                if ev2 is None:
                    win_add_bepam.Close()
                    win_add_bepam_active = False
                    win.UnHide()
                    break
                elif ev2 == 'add':
                    # a substitution of a nucleotide by itself is rejected
                    if any([
                            vals2['fromA'] and vals2['toA'],
                            vals2['fromT'] and vals2['toT'],
                            vals2['fromG'] and vals2['toG'],
                            vals2['fromC'] and vals2['toC']
                    ]):
                        if test:
                            print(ev2, vals2)
                        win_add_bepam = resetwinvals(win_add_bepam, vals2)
                        win_add_bepam.FindElement('error mutation').Update(
                            "* invalid")
                    elif not vals2['editing window min'] < vals2[
                            'editing window max']:
                        win_add_bepam = resetwinvals(win_add_bepam, vals2)
                        win_add_bepam.FindElement(
                            'editing window error').Update("* invalid")
                    elif not isstrallowed(s=vals2['PAM'],
                                          form=f"^[{nts}]*$"):
                        win_add_bepam = resetwinvals(win_add_bepam, vals2)
                        win_add_bepam.FindElement('error').Update(
                            '* invalid nucleotide')
                    else:
                        # get the keys and print on the gui
                        if test:
                            print(vals2)
                        win_add_bepam.Close()
                        win_add_bepam_active = False
                        win.UnHide()
                        bepam_label = f"{get_mutation(vals2)} {vals2['BE name']} {vals2['editing window min']}-{vals2['editing window max']}bp"
                        win.FindElement('add_bepam print').Update(bepam_label)
                        vals1['add_bepam print'] = bepam_label
                        win.FindElement('BE type and PAM').Update(values=[
                            f"{get_mutation(vals2)} PAM:{vals2['PAM']}",
                        ])
                        win.FindElement(
                            'BE name and editing window'
                        ).Update(values=[
                            f"method:{vals2['BE name']} editing window:{int(vals2['editing window min'])}-{int(vals2['editing window max'])}bp",
                        ])
                        win.FindElement('BE type and PAM').Update(
                            disabled=True)
                        win.FindElement('BE name and editing window').Update(
                            disabled=True)
                        win.FindElement('BE and PAM clear').Update(
                            disabled=False)
                        break
                elif ev2 == 'Cancel':
                    win_add_bepam.Close()
                    win_add_bepam_active = False
                    win.UnHide()
                    break
            win = resetwinvals(win, vals1)
        elif ev1 == 'BE and PAM clear':
            win.FindElement('BE and PAM clear').Update(disabled=False)
            dbepams = get_dbepams()
            win.FindElement('BE type and PAM').Update(values=list(
                np.sort(dbepams['BE type and PAM'].unique())),
                                                      disabled=False)
            win.FindElement('BE name and editing window').Update(
                disabled=False)
            win.FindElement('add_bepam').Update(disabled=False)
            win.FindElement('add_bepam print').Update("")
            win = resetwinvals(win, vals1)
        elif ev1 == 'load cfg':
            if vals1['cfginp'] != '' and vals1[
                    'cfginp'] != 'path to configuration file (.yaml)':
                # NOTE(review): yaml.load without an explicit Loader can
                # build arbitrary objects and needs a Loader argument in
                # recent PyYAML -- consider yaml.safe_load
                with open(vals1['cfginp'], 'r') as cfgf:
                    cfg = yaml.load(cfgf)
                listed_keys = ['BEs', 'pams']
                del_keys = ['max_subs_per_codon', 'mutations', 'chunksize']
                for k in listed_keys:
                    if not isinstance(cfg[k], list):
                        cfg[k] = [cfg[k]]
                for k in del_keys:
                    del cfg[k]
                win = loadcfginwinvals(win, cfg)
            win = resetwinvals(win, vals1)
        elif ev1 == 'configure_advanced_':
            win = resetwinvals(win, vals1)
            # toggle the visibility of the advanced panel
            win.FindElement('configure_advanced').Update(
                visible=False if bulconfigure_advanced else True)
            bulconfigure_advanced = False if bulconfigure_advanced else True
            win = resetwinvals(win, vals1)
        elif ev1 in reveal_events:
            # the panel key is the event name without the trailing '_'
            win.FindElement(ev1[:-1]).Update(visible=True)
            win = resetwinvals(win, vals1)
        elif ev1 == 'configuretorun':
            # check vals
            if vals1['mutation table'] == 'path to the tsv file':
                vals1['mutation table'] = ''
            if win.FindElement('add_bepam print').DisplayText == '':
                keys = np.array([
                    'mutation table', 'Species name (Ensembl assembly)',
                    'BE type and PAM', 'BE name and editing window'
                ])
            else:
                # a custom BE and PAM was added, so the drop-downs are unused
                keys = np.array(
                    ['mutation table', 'Species name (Ensembl assembly)'])
            buls = np.array([vals1[k] == '' for k in keys])
            if len(keys[buls]) == 0:
                din = del_Unnamed(
                    pd.read_table(vals1['mutation table'],
                                  keep_default_na=False))
                if (('genome coordinate' in din) and
                    (vals1['mutation_format nucleotide'])) or (
                        ('transcript: id' in din) and
                        (vals1['mutation_format aminoacid'])):
                    win.FindElement('configure error').Update(
                        'run beditor', text_color='green')
                    win.FindElement('run').Update(disabled=False)
                else:
                    win.FindElement('configure error').Update(
                        "invalid mutation format", text_color='red')
            else:
                win.FindElement('configure error').Update(
                    f"invalid {' and '.join(list(keys[buls]))}",
                    text_color='red')
            win = resetwinvals(win, vals1)
        ##TODO check columns and rename
        elif ev1 == 'load din':
            din = del_Unnamed(
                pd.read_table(vals1['mutation table'], keep_default_na=False))
            cols_din = [
                'genome coordinate', 'nucleotide mutation', 'transcript: id',
                'aminoacid: position', 'amino acid mutation'
            ]
            normalised2cols = dict(
                zip([normalisestr(c) for c in cols_din], cols_din))
            # rename only recognised columns; any other column used to
            # raise a KeyError here
            din = din.rename(
                columns={
                    c: normalised2cols[normalisestr(c)]
                    for c in din if normalisestr(c) in normalised2cols
                })
            if ('genome coordinate' in din) and (not 'transcript: id' in din):
                win.FindElement('mutation_format aminoacid').Update(
                    value=False)
                win.FindElement('mutation_format nucleotide').Update(
                    value=True)
                vals1['mutation_format aminoacid'] = False
                vals1['mutation_format nucleotide'] = True
            elif (not 'genome coordinate' in din) and ('transcript: id'
                                                       in din):
                win.FindElement('mutation_format nucleotide').Update(
                    value=False)
                win.FindElement('mutation_format aminoacid').Update(
                    value=True)
                vals1['mutation_format aminoacid'] = True
                vals1['mutation_format nucleotide'] = False
            else:
                vals1['mutation_format aminoacid'] = False
                vals1['mutation_format nucleotide'] = False
            vals1['reverse_mutations create'] = True
            vals1['reverse_mutations remove'] = False
            win.FindElement('error din').Update("loaded", text_color='green')
            win.FindElement('mutation table').Update(vals1['mutation table'])
            win = resetwinvals(win, vals1)
        elif ev1 == 'mutation_format nucleotide':
            vals1['mutation_format nucleotide'] = True
            vals1['mutation_format aminoacid'] = False
            win = resetwinvals(win, vals1)
        elif ev1 == 'mutation_format aminoacid':
            vals1['mutation_format nucleotide'] = False
            vals1['mutation_format aminoacid'] = True
            win = resetwinvals(win, vals1)
        elif ev1 == 'reverse_mutations create':
            vals1['reverse_mutations create'] = True
            vals1['reverse_mutations remove'] = False
            win = resetwinvals(win, vals1)
        elif ev1 == 'reverse_mutations remove':
            vals1['reverse_mutations create'] = False
            vals1['reverse_mutations remove'] = True
            win = resetwinvals(win, vals1)
        elif ev1 == 'clear all':
            win = resetwinvals(win, _vals1)
        elif ev1 == 'save cfgp':
            # `and`, as in 'load cfg': the former `or` was always true, so
            # an empty or placeholder path was used as a file name
            if vals1['cfgp'] != '' and vals1[
                    'cfgp'] != 'path to save configuration file (.yml)':
                if not vals1['cfgp'].endswith('.yml'):
                    vals1['cfgp'] = f"{vals1['cfgp']}.yml"
                if test:
                    with open(vals1['cfgp'] + '_test.yml', 'w') as testf:
                        yaml.dump(vals1, testf, default_flow_style=False)
                cfg = guival2cfg(vals1, vals2)
                from beditor.pipeline import validcfg
                if validcfg(cfg):
                    with open(vals1['cfgp'], 'w') as cfgf:
                        yaml.dump(cfg, cfgf, default_flow_style=False)
                    win.FindElement('save cfgp error').Update(
                        "saved", text_color='green')
                else:
                    win.FindElement('save cfgp error').Update(
                        "error/s in configuration", text_color='red')
                win.FindElement('run beditor').Update(disabled=False)
                win.FindElement('cfgp').Update(vals1['cfgp'])
            win = resetwinvals(win, vals1)
        elif ev1 == 'run beditor':
            win.FindElement('guiload').UpdateAnimation(
                source=f'{dirname(abspath(__file__))}/data/gui/guiload.gif',
                time_between_frames=0)
            win.FindElement('run beditor error').Update("running!",
                                                        text_color='green')
            try:
                runbashcmd(
                    f"source activate beditor; beditor --cfg {vals1['cfgp']}")
                win.FindElement('run beditor error').Update(
                    "finished processing!", text_color='green')
            except (Exception, SystemExit):
                # SystemExit stays caught in case the command runner exits
                # on failure; KeyboardInterrupt is no longer swallowed
                win.FindElement('run beditor error').Update(
                    "errored! see command line", text_color='red')
            #TODO create cfg and validate
            win = resetwinvals(win, vals1)
def get_genomes(cfg):
    """
    Installs genomes

    Installs the pyensembl data, lists the contigs known to pyensembl,
    downloads their soft-masked FASTA files and the GFF3 annotation from
    ftp.ensembl.org next to this module when they are missing (after asking
    on stdin, except for test configurations), and builds the bwa index, the
    samtools index and the contig sizes file.

    Sets `cfg['genomep']` and `cfg['genomegffp']`; `cfg['host']` is
    normalised for 'human' and 'yeast'. Calls `sys.exit(1)` when no contig is
    found or a download is declined.

    :param cfg: configuration dict
    :returns: the updated configuration dict
    """
    runbashcmd(
        f"pyensembl install --reference-name {cfg['genomeassembly']} --release {cfg['genomerelease']} --species {cfg['host']}"
    )
    import pyensembl
    ensembl = pyensembl.EnsemblRelease(
        species=pyensembl.species.Species.register(
            latin_name=cfg['host'],
            synonyms=[cfg['host']],
            reference_assemblies={
                cfg['genomeassembly']:
                (cfg['genomerelease'], cfg['genomerelease']),
            }),
        release=cfg['genomerelease'])
    contig_mito = ['MTDNA', 'MITO', 'MT']
    # keep short contig names without a dot, leaving out the mitochondrial
    # ones listed above
    contigs = [
        c for c in ensembl.contigs()
        if ((not '.' in c) and (len(c) < 5) and (c not in contig_mito))
    ]
    if len(contigs) == 0:
        logging.error('no contigs identified by pyensembl; aborting')
        # an abort after an error must not report success (was exit code 0)
        sys.exit(1)
    logging.info(f"{len(contigs)} contigs/chromosomes in the genome")
    logging.info(contigs)

    # raw genome next
    if 'human' in cfg['host'].lower():
        cfg['host'] = 'homo_sapiens'
    if 'yeast' in cfg['host'].lower():
        cfg['host'] = 'saccharomyces_cerevisiae'
    ensembl_fastad = 'pub/release-{}/fasta/{}/dna/'.format(
        cfg['genomerelease'], cfg['host'])
    genome_fastad = '{}/{}'.format(dirname(realpath(__file__)), ensembl_fastad)
    cfg['genomep'] = '{}/genome.fa'.format(genome_fastad)
    if not exists(cfg['genomep']):
        logging.error('not found: {}'.format(cfg['genomep']))
        if not '/test_beditor/' in cfg['cfgp']:
            ifdlref = input(
                "Download genome at {}?[Y/n]: ".format(genome_fastad))
        else:
            ifdlref = 'Y'
        if ifdlref == 'Y':
            # #FIXME download contigs and cat and get index, sizes
            for contig in contigs:
                if 'GRCh37' in cfg['genomeassembly']:
                    # Homo_sapiens.GRCh37.75.dna_sm.chromosome.1.fa.gz
                    fn = f"{cfg['host'].capitalize()}.{cfg['genomeassembly']}.{cfg['genomerelease']}.dna_sm.chromosome.{contig}.fa.gz"
                else:
                    fn = f"{cfg['host'].capitalize()}.{cfg['genomeassembly']}.dna_sm.chromosome.{contig}.fa.gz"
                # NOTE(review): this path is relative to the working
                # directory while wget saves under the package directory --
                # confirm the existence check is effective
                fp = '{}/{}'.format(ensembl_fastad, fn)
                if not exists(fp):
                    cmd = 'wget -q -x -nH ftp://ftp.ensembl.org/{} -P {}'.format(
                        fp, dirname(realpath(__file__)))
                    runbashcmd(cmd, test=cfg['test'])
            # make the fa ready
            if not exists(cfg['genomep']):
                cmd = 'gunzip {}*.fa.gz;cat {}/*.fa > {}/genome.fa;'.format(
                    genome_fastad, genome_fastad, genome_fastad)
                runbashcmd(cmd, test=cfg['test'])
        else:
            logging.error('abort')
            sys.exit(1)

    if not exists(cfg['genomep'] + '.bwt'):
        cmd = '{} index {}'.format(cfg['bwa'], cfg['genomep'])
        runbashcmd(cmd, test=cfg['test'])
    else:
        logging.info('bwa index is present')
    if not exists(cfg['genomep'] + '.fai'):
        cmd = '{} faidx {}'.format(cfg['samtools'], cfg['genomep'])
        runbashcmd(cmd, test=cfg['test'])
    else:
        logging.info('samtools index is present')
    if not exists(cfg['genomep'] + '.sizes'):
        cmd = 'cut -f1,2 {}.fai > {}.sizes'.format(cfg['genomep'],
                                                   cfg['genomep'])
        runbashcmd(cmd, test=cfg['test'])
    else:
        logging.info('sizes of contigs are present')

    ensembl_gff3d = 'pub/release-{}/gff3/{}/'.format(cfg['genomerelease'],
                                                     cfg['host'])
    genome_gff3d = '{}/{}'.format(dirname(realpath(__file__)), ensembl_gff3d)
    cfg['genomegffp'] = '{}/genome.gff3'.format(genome_gff3d)
    if not exists(cfg['genomegffp']):
        logging.error('not found: {}'.format(cfg['genomegffp']))
        if not '/test_beditor/' in cfg['cfgp']:
            ifdlref = input("Download genome annotations at {}?[Y/n]: ".format(
                genome_gff3d))
        else:
            ifdlref = 'Y'
        if ifdlref == 'Y':
            # #FIXME download contigs and cat and get index, sizes
            fn = '{}.{}.{}.gff3.gz'.format(cfg['host'].capitalize(),
                                           cfg['genomeassembly'],
                                           cfg['genomerelease'])
            fp = '{}/{}'.format(ensembl_gff3d, fn)
            if not exists(fp):
                cmd = 'wget -x -nH ftp://ftp.ensembl.org/{} -P {}'.format(
                    fp, dirname(realpath(__file__)))
                runbashcmd(cmd, test=cfg['test'])
            # move to genome.gff3
            cmd = 'cp {}/{} {}'.format(genome_gff3d, fn, cfg['genomegffp'])
            runbashcmd(cmd, test=cfg['test'])
        else:
            logging.error('abort')
            sys.exit(1)
    logging.info('genomes are installed!')
    return cfg
def get_deps(cfg):
    """
    Installs dependencies of `beditor`

    Downloads, unpacks and builds samtools, bedtools and bwa below the `deps`
    directory next to this module when their executables are missing, writes
    the table of dependencies to `deps/deps.tsv` and stores the path of each
    executable in `cfg` under the name of the dependency.

    :param cfg: configuration dict
    :returns: the configuration dict with the paths of the executables
    """
    depsd = "%s/deps" % abspath(dirname(__file__))
    if not exists(depsd):
        makedirs(depsd)

    # name -> (download link, executable template, install template);
    # the executable template is filled with the local path and the install
    # template with the directory of the executable
    specs = {
        'samtools': (
            'https://github.com/samtools/samtools/releases/download/1.7/samtools-1.7.tar.bz2',
            '{}/samtools-1.7/samtools',
            'cd {};./configure --disable-lzma;make;',
        ),
        'bedtools': (
            'https://github.com/arq5x/bedtools2/releases/download/v2.27.1/bedtools-2.27.1.tar.gz',
            '{}/bedtools2/bin/bedtools',
            'cd {}/../;make;',
        ),
        'bwa': (
            'https://github.com/lh3/bwa/releases/download/v0.7.17/bwa-0.7.17.tar.bz2',
            '{}/bwa-0.7.17/bwa',
            'cd {};make;',
        ),
    }
    deps = ['samtools', 'bedtools', 'bwa']
    ddeps = pd.DataFrame(columns=['local path', 'download link'], index=deps)
    ddeps.index.name = 'dep'
    ddeps.loc[:, 'local path'] = [
        '{}/{}'.format(depsd, name) for name in ddeps.index
    ]
    for name in deps:
        link_, executable_tmpl, install_tmpl = specs[name]
        ddeps.loc[name, 'download link'] = link_
        ddeps.loc[name, 'executable'] = executable_tmpl.format(
            ddeps.loc[name, 'local path'])
        ddeps.loc[name, 'install'] = install_tmpl.format(
            dirname(ddeps.loc[name, 'executable']))
    # archive extension, taken from the download link
    ddeps.loc[:, 'ext'] = [
        '.tar.{}'.format(ddeps.loc[name, 'download link'].split('.tar.')[-1])
        for name in ddeps.index
    ]
    ddeps.to_csv("{}/deps.tsv".format(depsd), sep='\t')

    logp = "%s/deps.log" % (depsd)
    with open(logp, 'a') as logf:
        for dep in ddeps.index:
            executable = ddeps.loc[dep, 'executable']
            if exists(executable):
                continue
            logging.info("configuring: {} to {}".format(
                dep, ddeps.loc[dep, 'executable']))
            link = ddeps.loc[dep, 'download link']
            path = ddeps.loc[dep, 'local path']
            tarp = '{}/{}'.format(path, basename(link))
            if not exists(tarp):
                runbashcmd("wget -q %s --directory-prefix=%s" % (link, path),
                           logf=logf)
            if not exists(dirname(ddeps.loc[dep, 'executable'])):
                # pick the tar flags matching the archive type
                if ddeps.loc[dep, 'ext'] == '.tar.bz2':
                    tarcom = 'xvjf'
                elif ddeps.loc[dep, 'ext'] == '.tar.gz':
                    tarcom = 'zxvf'
                runbashcmd("tar {} {} -C {}".format(tarcom, tarp, path),
                           logf=logf)
            runbashcmd(ddeps.loc[dep, 'install'], logf=logf)
    logging.info("dependencies are installed!")

    for dep in ddeps.index:
        cfg[dep] = ddeps.loc[dep, 'executable']
    return cfg