def __call__(self, seqs, all_seqs):
    """Call the runner like a function.

    Writes `seqs` to a temporary FASTA file, runs Weeder on that file
    (weeder.run_weeder) and then runs MAST (self.meme_suite.mast) with the
    '<fasta file>.meme' file against a sequence file built from `all_seqs`.

    seqs -- mapping; its items() are written to the Weeder input file and
            its keys() are handed to meme.read_mast_output()
    all_seqs -- mapping of feature id to a sequence entry; element [1] of
                each entry is written to the MAST sequence file

    Returns a meme.MemeRunResult. If running MAST or reading its output
    fails, the error is logged and an empty MemeRunResult is returned.
    """
    # delete=False: the file has to outlive the with-block because only its
    # name is passed on to weeder.run_weeder() below
    with tempfile.NamedTemporaryFile(prefix='weeder.fasta',
                                     delete=False) as outfile:
        filename = outfile.name
        logging.info("Run Weeder on FASTA file: '%s'", filename)
        st.write_sequences_to_fasta_file(outfile, seqs.items())

    pssms = weeder.run_weeder(filename)
    meme_outfile = '%s.meme' % filename
    dbfile = self.meme_suite.make_sequence_file(
        [(feature_id, locseq[1])
         for feature_id, locseq in all_seqs.items()])
    logging.info("# PSSMS created: %d", len(pssms))
    logging.info("run MAST on '%s'", meme_outfile)

    # motif numbers are 1-based
    motif_infos = [
        meme.MemeMotifInfo(pssm.values(), motif_num,
                           pssm.sequence_length(),
                           len(pssm.sites()),
                           None, pssm.evalue(),
                           pssm.sites())
        for motif_num, pssm in enumerate(pssms, 1)]

    try:
        mast_out = self.meme_suite.mast(
            meme_outfile, dbfile,
            self.meme_suite.global_background_file())
        pe_values, annotations = meme.read_mast_output(mast_out,
                                                       seqs.keys())
        return meme.MemeRunResult(pe_values, annotations, motif_infos)
    except Exception:
        # best effort: a failed MAST run yields an empty result, but the
        # reason is logged instead of being dropped silently
        logging.exception("running MAST on '%s' failed", meme_outfile)
        return meme.MemeRunResult([], {}, [])
def test_write_sequences_to_fasta_file(self):
    """Tests writing to a FASTA file.

    Reads the sequences from 'testdata/fasta_test.fa', writes them to a
    scratch file, reads the scratch file back and checks that both reads
    are equal.
    """
    import os
    import tempfile

    seqs = st.read_sequences_from_fasta_file('testdata/fasta_test.fa')
    # a unique scratch file avoids clashes with other tests or processes
    # and does not depend on a /tmp directory being present
    handle, tmp_path = tempfile.mkstemp(prefix='fasta_tmp', suffix='.fa')
    os.close(handle)
    try:
        with open(tmp_path, 'w') as outputfile:
            st.write_sequences_to_fasta_file(outputfile, seqs)
        seqs2 = st.read_sequences_from_fasta_file(tmp_path)
        self.assertEqual(seqs, seqs2)
    finally:
        os.remove(tmp_path)
def make_sequence_file(self, seqs):
    """Writes a list of (feature_id, sequence) pairs to a new FASTA file.

    The file is a named temporary file with the prefix 'memeseqs'. It is
    created with delete=False, so it still exists after it was closed;
    this method does not remove it.

    Returns the name of the file that was written.
    """
    fasta_file = tempfile.NamedTemporaryFile(prefix='memeseqs',
                                             delete=False)
    with fasta_file:
        st.write_sequences_to_fasta_file(fasta_file, seqs)
    return fasta_file.name
def __call__(self, params):
    """Call the runner like a function.

    Writes params.seqs to a temporary FASTA file, runs Weeder on it
    (weeder.run_weeder) and, if Weeder produced PSSMs, runs MAST
    (self.meme_suite.mast) against a sequence file built from
    params.used_seqs.

    params -- run parameters; this method reads params.seqs and
              params.used_seqs and passes params on to weeder.run_weeder()

    Returns a meme.MemeRunResult, which is empty when Weeder did not
    generate any PSSMs. Errors are logged and re-raised. The temporary
    files are removed afterwards if self.__remove_tempfiles is set.
    """
    # delete=False: the file has to outlive the with-block because only its
    # name is passed on to weeder.run_weeder() below
    with tempfile.NamedTemporaryFile(prefix='weeder.fasta',
                                     delete=False) as outfile:
        filename = outfile.name
        logging.debug("Run Weeder on FASTA file: '%s'", filename)
        st.write_sequences_to_fasta_file(outfile, params.seqs.items())

    # stays None until the MAST sequence file exists, so that the cleanup
    # below knows whether there is anything to remove
    dbfile = None
    try:
        meme_outfile, pssms = weeder.run_weeder(filename, params,
                                                self.config_params,
                                                self.meme_suite.bgmodel)
        if len(pssms) == 0:
            logging.debug('no PSSMS generated, skipping cluster')
            return meme.MemeRunResult([], {}, [])

        dbfile = self.meme_suite.make_sequence_file(
            [(feature_id, locseq[1])
             for feature_id, locseq in params.used_seqs.items()])
        logging.debug("# PSSMS created: %d %s", len(pssms),
                      str([pssm.consensus_motif() for pssm in pssms]))
        logging.debug("run MAST on '%s', dbfile: '%s'",
                      meme_outfile, dbfile)

        # motif numbers are 1-based
        motif_infos = [
            meme.MemeMotifInfo(pssm.values, motif_num,
                               pssm.sequence_length(),
                               len(pssm.sites),
                               None, pssm.e_value,
                               pssm.sites)
            for motif_num, pssm in enumerate(pssms, 1)]

        mast_out = self.meme_suite.mast(
            meme_outfile, dbfile,
            self.meme_suite.global_background_file())
        if 'keep_mastout' in self.config_params['debug']:
            # debugging aid: keep the raw MAST output next to the meme file
            with open('%s.mast' % meme_outfile, 'w') as mast_file:
                mast_file.write(mast_out)
        pe_values, annotations = self.meme_suite.read_mast_output(
            mast_out, params.seqs.keys())
        return meme.MemeRunResult(pe_values, annotations, motif_infos)
    except Exception:
        # record the full traceback, then let the caller deal with the error
        logging.exception("Weeder/MAST run on '%s' failed", filename)
        raise
    finally:
        if self.__remove_tempfiles:
            # the FASTA input itself ('') plus the files expected next to it
            for extension in ('', '.wee', '.mix', '.html', '.meme',
                              '.1.f1', '.1.f2', '.2.f1', '.2.f2'):
                tmp_name = filename + extension
                if os.path.exists(tmp_name):
                    try:
                        os.remove(tmp_name)
                    except OSError:
                        logging.warning("could not remove tmp file:'%s'",
                                        tmp_name)
            if dbfile:
                try:
                    os.remove(dbfile)
                except OSError:
                    logging.warning("could not remove tmp file:'%s'",
                                    dbfile)
def test_write_sequences_to_fasta_file_empty_seqs(self):
    """Tests that only non-empty sequences are written to FASTA.

    Writes one non-empty and one empty sequence and checks that reading
    the file back yields just the non-empty one.
    """
    import os
    import tempfile

    seqs = [['seq1', 'TATATA'], ['seq2', '']]
    # a unique scratch file avoids clashes with other tests or processes
    # and does not depend on a /tmp directory being present
    handle, tmp_path = tempfile.mkstemp(prefix='fasta_tmp', suffix='.fa')
    os.close(handle)
    try:
        with open(tmp_path, 'w') as outputfile:
            st.write_sequences_to_fasta_file(outputfile, seqs)
        seqs2 = st.read_sequences_from_fasta_file(tmp_path)
        self.assertEqual(1, len(seqs2))
        self.assertEqual(seqs[0][0], seqs2[0][0])
        self.assertEqual(seqs[0][1], seqs2[0][1])
    finally:
        os.remove(tmp_path)
def __call__(self, params):
    """Call the runner like a function.

    Writes params.seqs to a temporary FASTA file, runs Weeder on it
    (weeder.run_weeder) and then runs MAST (self.meme_suite.mast) with the
    '<fasta file>.meme' file against a sequence file built from
    params.used_seqs.

    params -- run parameters; this method reads params.seqs and
              params.used_seqs

    Returns a meme.MemeRunResult. If running MAST or reading its output
    fails, the error is logged and an empty MemeRunResult is returned;
    errors raised before the MAST step propagate to the caller. The
    temporary files are removed afterwards if self.__remove_tempfiles is
    set.
    """
    # delete=False: the file has to outlive the with-block because only its
    # name is passed on to weeder.run_weeder() below
    with tempfile.NamedTemporaryFile(prefix='weeder.fasta',
                                     delete=False) as outfile:
        filename = outfile.name
        logging.info("Run Weeder on FASTA file: '%s'", filename)
        st.write_sequences_to_fasta_file(outfile, params.seqs.items())

    # stays None until the MAST sequence file exists, so that the cleanup
    # below knows whether there is anything to remove
    dbfile = None
    try:
        pssms = weeder.run_weeder(filename)
        meme_outfile = '%s.meme' % filename
        dbfile = self.meme_suite.make_sequence_file(
            [(feature_id, locseq[1])
             for feature_id, locseq in params.used_seqs.items()])
        logging.info("# PSSMS created: %d %s", len(pssms),
                     str([pssm.consensus_motif() for pssm in pssms]))
        logging.info("run MAST on '%s'", meme_outfile)

        # motif numbers are 1-based
        motif_infos = [
            meme.MemeMotifInfo(pssm.values, motif_num,
                               pssm.sequence_length(),
                               len(pssm.sites),
                               None, pssm.e_value,
                               pssm.sites)
            for motif_num, pssm in enumerate(pssms, 1)]

        try:
            mast_out = self.meme_suite.mast(
                meme_outfile, dbfile,
                self.meme_suite.global_background_file())
            pe_values, annotations = meme.read_mast_output(
                mast_out, params.seqs.keys())
            return meme.MemeRunResult(pe_values, annotations, motif_infos)
        except Exception:
            # best effort: a failed MAST run yields an empty result, but
            # the reason is logged instead of being dropped silently
            logging.exception("running MAST on '%s' failed", meme_outfile)
            return meme.MemeRunResult([], {}, [])
    finally:
        # the cleanup also runs when Weeder itself failed, so that no
        # temporary files are left behind in that case
        if self.__remove_tempfiles:
            # the FASTA input itself ('') plus the files expected next to it
            for extension in ('', '.wee', '.mix', '.html', '.meme',
                              '.1.f1', '.1.f2', '.2.f1', '.2.f2'):
                tmp_name = filename + extension
                if os.path.exists(tmp_name):
                    try:
                        os.remove(tmp_name)
                    except OSError:
                        logging.warning("could not remove tmp file:'%s'",
                                        tmp_name)
            if dbfile:
                try:
                    os.remove(dbfile)
                except OSError:
                    logging.warning("could not remove tmp file:'%s'",
                                    dbfile)
def __call__(self, params):
    """Call the runner like a function.

    Writes params.seqs to a temporary FASTA file, runs Weeder on it
    (weeder.run_weeder) with the frequency file name that belongs to the
    sequence type and then runs MAST (self.meme_suite.mast) with the
    '<fasta file>.meme' file against a sequence file built from
    params.used_seqs.

    params -- run parameters; this method reads params.seqs and
              params.used_seqs

    Returns a meme.MemeRunResult. If running MAST or reading its output
    fails, the error is logged and an empty MemeRunResult is returned;
    errors raised before the MAST step propagate to the caller. Raises
    ValueError for a sequence type other than 'upstream' or 'p3utr'. The
    temporary files are removed afterwards if self.__remove_tempfiles is
    set.
    """
    # delete=False: the file has to outlive the with-block because only its
    # name is passed on to weeder.run_weeder() below
    with tempfile.NamedTemporaryFile(prefix='weeder.fasta',
                                     delete=False) as outfile:
        filename = outfile.name
        logging.info("Run Weeder on FASTA file: '%s'", filename)
        st.write_sequences_to_fasta_file(outfile, params.seqs.items())

    # stays None until the MAST sequence file exists, so that the cleanup
    # below knows whether there is anything to remove
    dbfile = None
    try:
        # NOTE(review): `seqtype` is neither a parameter nor a local of this
        # method, it has to come from an enclosing scope - confirm where it
        # is defined
        if seqtype == 'upstream':
            freqfile = 'HS'
        elif seqtype == 'p3utr':
            freqfile = 'HS3P'
        else:
            # used to fall through and fail below on the unbound `freqfile`
            raise ValueError("unsupported sequence type: '%s'" % seqtype)

        pssms = weeder.run_weeder(filename, freqfile)
        meme_outfile = '%s.meme' % filename
        dbfile = self.meme_suite.make_sequence_file(
            [(feature_id, locseq[1])
             for feature_id, locseq in params.used_seqs.items()])
        logging.info("# PSSMS created: %d %s", len(pssms),
                     str([pssm.consensus_motif() for pssm in pssms]))
        logging.info("run MAST on '%s'", meme_outfile)

        # motif numbers are 1-based
        motif_infos = [
            meme.MemeMotifInfo(pssm.values, motif_num,
                               pssm.sequence_length(),
                               len(pssm.sites),
                               None, pssm.e_value,
                               pssm.sites)
            for motif_num, pssm in enumerate(pssms, 1)]

        try:
            mast_out = self.meme_suite.mast(
                meme_outfile, dbfile,
                self.meme_suite.global_background_file())
            pe_values, annotations = meme.read_mast_output(
                mast_out, params.seqs.keys())
            return meme.MemeRunResult(pe_values, annotations, motif_infos)
        except Exception:
            # best effort: a failed MAST run yields an empty result, but
            # the reason is logged instead of being dropped silently
            logging.exception("running MAST on '%s' failed", meme_outfile)
            return meme.MemeRunResult([], {}, [])
    finally:
        # the cleanup also runs when Weeder itself failed, so that no
        # temporary files are left behind in that case
        if self.__remove_tempfiles:
            # the FASTA input itself ('') plus the files expected next to it
            for extension in ('', '.wee', '.mix', '.html', '.meme',
                              '.1.f1', '.1.f2', '.2.f1', '.2.f2'):
                tmp_name = filename + extension
                if os.path.exists(tmp_name):
                    try:
                        os.remove(tmp_name)
                    except OSError:
                        logging.warning("could not remove tmp file:'%s'",
                                        tmp_name)
            if dbfile:
                try:
                    os.remove(dbfile)
                except OSError:
                    logging.warning("could not remove tmp file:'%s'",
                                    dbfile)
def make_sequences(genome_fasta_file, gene_features_file,
                   outfile='sequences.csv',
                   distance={'upstream': 300, 'downstream': 100},
                   from_end=False, fasta=False):
    """Extract one sequence per gene feature and write them all to a file.

    genome_fasta_file -- FASTA file holding the contig sequences
    gene_features_file -- file that st.read_features_from_file() reads the
                          features from
    outfile -- name of the file to write
    distance -- dictionary with the keys 'upstream' and 'downstream'; the
                default dictionary is only read, never modified
    from_end -- if true, the sequences are taken with
                st.extract_downstream(), otherwise with
                st.extract_upstream()
    fasta -- if true, the output is written in FASTA format, otherwise as
             one '<feature id>,<sequence>' line per feature
    """
    if from_end:
        distance = (distance['upstream'], distance['downstream'])
    else:
        # WARNING: as of 2012-03-22, the st.extract functions used flipped
        # distances! e.g. distance[1] is the UPSTREAM distance and
        # distance[0] is the DOWNSTREAM one. CHECK YOUR SEQUENCES after
        # running this! Also, a negative number is expected for DOWNSTREAM.
        # So, (-100, 300) must be passed to st.extract_upstream in order to
        # get a sequence from 300 upstream to 100 downstream.
        distance = (-1 * distance['downstream'], distance['upstream'])

    # read_sequences_from_fasta_file returns a list of (name, sequence)
    # tuples, turn it into a dictionary for the lookup by contig name
    contig_dict = dict(st.read_sequences_from_fasta_file(genome_fasta_file))
    print('loaded %i contigs' % len(contig_dict))
    print(','.join(['%s: %ibp' % (name, len(seq))
                    for name, seq in contig_dict.items()]))

    features = st.read_features_from_file(gene_features_file)
    print('loaded %i features' % len(features))

    extract = st.extract_downstream if from_end else st.extract_upstream
    sequences = []
    for feature in features.values():
        location = feature.location()
        # element [1] of the extraction result is what gets written out
        sequences.append(
            (feature.id(),
             extract(contig_dict[location.contig], location, distance)[1]))

    # the with-block closes the file even if writing fails
    with open(outfile, 'w') as outf:
        if fasta:
            st.write_sequences_to_fasta_file(outf, sequences)
        else:
            for feature_id, seq in sequences:
                outf.write('%s,%s\n' % (feature_id, seq))
def make_sequences(genome_fasta_file, gene_features_file,
                   outfile='sequences.csv',
                   distance={'upstream': 300, 'downstream': 100},
                   from_end=False, fasta=False):
    """Extract one sequence per gene feature and write them all to a file.

    genome_fasta_file -- FASTA file holding the contig sequences
    gene_features_file -- file that st.read_features_from_file() reads the
                          features from
    outfile -- name of the file to write
    distance -- dictionary with the keys 'upstream' and 'downstream'; the
                default dictionary is only read, never modified
    from_end -- if true, the sequences are taken with
                st.extract_downstream(), otherwise with
                st.extract_upstream()
    fasta -- if true, the output is written in FASTA format, otherwise as
             one '<feature id>,<sequence>' line per feature
    """
    if from_end:
        distance = (distance['upstream'], distance['downstream'])
    else:
        # WARNING: as of 2012-03-22, the st.extract functions used flipped
        # distances! e.g. distance[1] is the UPSTREAM distance and
        # distance[0] is the DOWNSTREAM one. CHECK YOUR SEQUENCES after
        # running this! Also, a negative number is expected for DOWNSTREAM.
        # So, (-100, 300) must be passed to st.extract_upstream in order to
        # get a sequence from 300 upstream to 100 downstream.
        distance = (-1 * distance['downstream'], distance['upstream'])

    # read_sequences_from_fasta_file returns a list of (name, sequence)
    # tuples, turn it into a dictionary for the lookup by contig name
    contig_dict = dict(st.read_sequences_from_fasta_file(genome_fasta_file))
    print('loaded %i contigs' % len(contig_dict))
    print(','.join(['%s: %ibp' % (name, len(seq))
                    for name, seq in contig_dict.items()]))

    features = st.read_features_from_file(gene_features_file)
    print('loaded %i features' % len(features))

    extract = st.extract_downstream if from_end else st.extract_upstream
    sequences = []
    for feature in features.values():
        location = feature.location
        # element [1] of the extraction result is what gets written out
        sequences.append(
            (feature.id,
             extract(contig_dict[location.contig], location, distance)[1]))

    # the with-block closes the file even if writing fails
    with open(outfile, 'w') as outf:
        if fasta:
            st.write_sequences_to_fasta_file(outf, sequences)
        else:
            for feature_id, seq in sequences:
                outf.write('%s,%s\n' % (feature_id, seq))