def setUp(self):
    """Load the two FASTA fixtures and create the motif finder under test.

    Records parsed from NeuralNetwork/enolase.fasta are stored in
    self.test_records, records from NeuralNetwork/repeat.fasta in
    self.diff_records, and a new Motif.MotifFinder in self.motif_finder.
    """
    test_file = os.path.join('NeuralNetwork', 'enolase.fasta')
    diff_file = os.path.join('NeuralNetwork', 'repeat.fasta')

    self.test_records = []
    self.diff_records = []

    # load the records
    for file_name, records in ((test_file, self.test_records),
                               (diff_file, self.diff_records)):
        handle = open(file_name, 'r')
        # try/finally so the handle is released even when parsing fails
        try:
            seq_parser = Fasta.SequenceParser(alphabet=IUPAC.unambiguous_dna)
            iterator = Fasta.Iterator(handle, seq_parser)
            while 1:
                seq_record = iterator.next()
                # a None record marks the end of the file
                if seq_record is None:
                    break
                records.append(seq_record)
        finally:
            handle.close()

    self.motif_finder = Motif.MotifFinder()
def setUp(self):
    """Load the two FASTA fixtures and create the schema finder under test.

    Records parsed from NeuralNetwork/enolase.fasta are stored in
    self.test_records and records from NeuralNetwork/repeat.fasta in
    self.diff_records. self.finder is a Schema.SchemaFinder looking for
    self.num_schemas (2) schemas, driven by a genetic algorithm finder whose
    min_generations is set to 1.
    """
    test_file = os.path.join('NeuralNetwork', 'enolase.fasta')
    diff_file = os.path.join('NeuralNetwork', 'repeat.fasta')

    self.test_records = []
    self.diff_records = []

    # load the records
    for file_name, records in ((test_file, self.test_records),
                               (diff_file, self.diff_records)):
        handle = open(file_name, 'r')
        # try/finally so the handle is released even when parsing fails
        try:
            seq_parser = Fasta.SequenceParser(alphabet=IUPAC.unambiguous_dna)
            iterator = Fasta.Iterator(handle, seq_parser)
            while 1:
                seq_record = iterator.next()
                # a None record marks the end of the file
                if seq_record is None:
                    break
                records.append(seq_record)
        finally:
            handle.close()

    self.num_schemas = 2
    schema_ga = Schema.GeneticAlgorithmFinder()
    schema_ga.min_generations = 1
    self.finder = Schema.SchemaFinder(num_schemas=self.num_schemas,
                                      schema_finder=schema_ga)
def parse_file(file_name, type='DNA'):
    """Parse the given file into a FastaAlignment object.

    Arguments:
    o file_name - The location of the file to parse.
    o type - The type of information contained in the file. One of 'DNA',
    'RNA' or 'PROTEIN' (case-insensitive).

    Returns a FastaAlignment over a gapped version of the chosen alphabet,
    holding one sequence per record in the file.

    Raises ValueError if type is not one of the supported values.
    """
    seq_type = type.upper()
    if seq_type == 'DNA':
        alphabet = IUPAC.ambiguous_dna
    elif seq_type == 'RNA':
        alphabet = IUPAC.ambiguous_rna
    elif seq_type == 'PROTEIN':
        alphabet = IUPAC.protein
    else:
        raise ValueError("Invalid type %s passed. Need DNA, RNA or PROTEIN"
                         % type)

    # create a new alignment object
    fasta_align = FastaAlignment(Alphabet.Gapped(alphabet))

    # now parse the file and fill up the alignment object
    align_file = open(file_name, 'r')
    # try/finally so the file is always closed (it used to be left open)
    try:
        parser = Fasta.RecordParser()
        iterator = Fasta.Iterator(align_file, parser)

        cur_align = iterator.next()
        while cur_align:
            fasta_align.add_sequence(cur_align.title, cur_align.sequence)
            cur_align = iterator.next()
    finally:
        align_file.close()

    return fasta_align
def test_record_iterator(self):
    """Check that a RecordParser-backed iterator yields Fasta.Record objects.
    """
    record_iter = Fasta.Iterator(self.test_handle, Fasta.RecordParser())

    for record in iter(record_iter):
        assert isinstance(record, Fasta.Record)
def test_sequence_iterator(self):
    """Check that a SequenceParser-backed iterator yields SeqRecord objects.
    """
    record_iter = Fasta.Iterator(self.test_handle, Fasta.SequenceParser())

    for record in iter(record_iter):
        assert isinstance(record, SeqRecord.SeqRecord)
def test_schema_representation(self): """Convert sequences into schema representations. """ # get a set of schemas we want to code the sequence in schema_bank = self._load_schema_repository() top_schemas = schema_bank.get_top(25) schema_coder = Schema.SchemaCoder(top_schemas, self.schema) # get the sequences one at a time, and encode them fasta_handle = open(self.test_file, 'r') seq_parser = Fasta.SequenceParser(alphabet=IUPAC.unambiguous_dna) iterator = Fasta.Iterator(fasta_handle, seq_parser) while 1: seq_record = iterator.next() if seq_record is None: break schema_values = schema_coder.representation(seq_record.seq) if VERBOSE: print "Schema values:", schema_values fasta_handle.close()
def ReadFile(self):
    """Feed every FASTA record of self.file to self.HandleRecord.

    Before each call, self.header is set to the first whitespace-separated
    token of the record title, cut at its first comma. The parser and the
    iterator are kept on the instance as self.parser and self.iter.
    """
    self.parser = Fasta.RecordParser()
    self.iter = Fasta.Iterator(handle=open(self.file), parser=self.parser)

    record = self.iter.next()
    # a false record (None at end of input) stops the loop
    while record:
        first_token = record.title.split()[0]
        self.header = first_token.split(',')[0]
        self.HandleRecord(record)
        record = self.iter.next()
def test_new_iterator(self):
    """Ensure the Fasta iterator can be driven through iter().

    The test handle is expected to contain exactly three records.
    """
    fasta_iter = Fasta.Iterator(self.test_handle)
    seen = [rec for rec in iter(fasta_iter)]

    assert len(seen) == 3
def test_parsing_comments(self):
    """Parse FASTA files with # style comment lines in them.

    The fixture Fasta/f003 must yield exactly two records.
    """
    handle = open(os.path.join("Fasta", "f003"))
    # try/finally so the fixture is always closed (it used to be left open)
    try:
        iterator = Fasta.Iterator(handle, Fasta.RecordParser())
        num_recs = 0
        for rec in iter(iterator):
            num_recs += 1
    finally:
        handle.close()

    assert num_recs == 2
def read_fasta_file(self, file):
    """Read a FASTA file into a list of (sequence, title) tuples.

    Arguments:
    o file - Path of the FASTA file to read.

    Returns the tuples in file order. Reading stops at the first false
    record (the iterator returns None at end of input).
    """
    genes = []
    handle = open(file)
    # try/finally so the file is always closed (it used to be left open)
    try:
        fasta_iter = Fasta.Iterator(handle=handle,
                                    parser=Fasta.RecordParser())
        while 1:
            rec = fasta_iter.next()
            if not rec:
                break
            genes.append((rec.sequence, rec.title))
    finally:
        handle.close()
    return genes
def runDisEMBLpipeline(): try: smooth_frame = 8 peak_frame = 8 join_frame = 4 fold_coils = 1.2 fold_hotloops = 1.4 fold_rem465 = 1.2 mode = 'scores' try: file = open(sys.argv[1], 'r') except: mode = 'default' except: print '\nDisEMBL.py sequence_file \n' print 'A default run would be: ./DisEMBL.py fasta_file' raise SystemExit #db = sys.stdin parser = Fasta.RecordParser() iterator = Fasta.Iterator(file, parser) while 1: try: cur_record = iterator.next() sequence = upper(cur_record.sequence) # Run NN COILS_raw, HOTLOOPS_raw, REM465_raw = JensenNet(sequence) # Run Savitzky-Golay REM465_smooth = SavitzkyGolay(smooth_frame, 0, REM465_raw) COILS_smooth = SavitzkyGolay(smooth_frame, 0, COILS_raw) HOTLOOPS_smooth = SavitzkyGolay(smooth_frame, 0, HOTLOOPS_raw) sys.stdout.write('> ' + cur_record.title + '\n') sys.stdout.write('# COILS ') reportSlicesTXT( getSlices(COILS_smooth, fold_coils, join_frame, peak_frame, 0.43), sequence) sys.stdout.write('# REM465 ') reportSlicesTXT( getSlices(REM465_smooth, fold_rem465, join_frame, peak_frame, 0.50), sequence) sys.stdout.write('# HOTLOOPS ') reportSlicesTXT( getSlices(HOTLOOPS_smooth, fold_hotloops, join_frame, peak_frame, 0.086), sequence) sys.stdout.write('# RESIDUE COILS REM465 HOTLOOPS\n') for i in range(len(REM465_smooth)): sys.stdout.write(sequence[i] + '\t' + fpformat.fix(COILS_smooth[i], 5) + '\t' + fpformat.fix(REM465_smooth[i], 5) + '\t' + fpformat.fix(HOTLOOPS_smooth[i], 5) + '\n') except AttributeError: break file.close() return
def test_basic_iterator(self):
    """Ensure the parser-less Fasta iterator returns raw record text.

    The first token of each record's first line must match the expected
    identifiers, in order, and an exhausted iterator must keep returning
    None.
    """
    fasta_iter = Fasta.Iterator(self.test_handle)
    expected_ids = [">gi|1348912|gb|G26680|G26680",
                    ">gi|1348917|gb|G26685|G26685",
                    ">gi|1592936|gb|G29385|G29385"]

    for expected in expected_ids:
        text = fasta_iter.next()
        first_line = text.split("\n")[0]
        assert first_line.split()[0] == expected

    # make sure we keep getting None when the iterator is done
    assert fasta_iter.next() is None
    assert fasta_iter.next() is None
def runDisEMBLpipeline(): try: smooth_frame = int(sys.argv[1]) peak_frame = int(sys.argv[2]) join_frame = int(sys.argv[3]) fold_coils = float(sys.argv[4]) fold_hotloops = float(sys.argv[5]) fold_rem465 = float(sys.argv[6]) file = str(sys.argv[7]) except: print '\nDisEMBL.py smooth_frame peak_frame join_frame fold_coils fold_hotloops fold_rem465 sequence_file\n' print 'A default run would be: ./DisEMBL.py 8 8 4 1.2 1.4 1.2 fasta_file' raise SystemExit db = open(file, 'r') parser = Fasta.RecordParser() iterator = Fasta.Iterator(db, parser) while 1: try: cur_record = iterator.next() sequence = upper(cur_record.sequence) # Run NN COILS_raw, HOTLOOPS_raw, REM465_raw = JensenNet(sequence) # Run Savitzky-Golay REM465_smooth = SavitzkyGolay(smooth_frame, 0, REM465_raw) COILS_smooth = SavitzkyGolay(smooth_frame, 0, COILS_raw) HOTLOOPS_smooth = SavitzkyGolay(smooth_frame, 0, HOTLOOPS_raw) sys.stdout.write('> ' + cur_record.title + '_COILS ') reportSlicesTXT( getSlices(COILS_smooth, fold_coils, join_frame, peak_frame, 0.43), sequence) sys.stdout.write('> ' + cur_record.title + '_REM465 ') reportSlicesTXT( getSlices(REM465_smooth, fold_rem465, join_frame, peak_frame, 0.50), sequence) sys.stdout.write('> ' + cur_record.title + '_HOTLOOPS ') reportSlicesTXT( getSlices(HOTLOOPS_smooth, fold_hotloops, join_frame, peak_frame, 0.086), sequence) sys.stdout.write('\n') except AttributeError: break return
def _load_schema_repository(self):
    """Helper function to load a schema repository from a file.

    This also caches a schema bank, to prevent having to do this
    time consuming operation multiple times.

    Returns self.schema_bank when it is already set. Otherwise the
    sequences in self.test_file are read, motifs of size 9 are searched in
    them, the motifs are converted with self.factory.from_motifs, and the
    result is stored in self.schema_bank and returned.
    """
    # if we already have a cached repository, return it
    if self.schema_bank is not None:
        return self.schema_bank

    # otherwise, we'll read in a new schema bank

    # read in the all of the motif records
    motif_handle = open(self.test_file, 'r')
    # try/finally so the handle is released even when parsing fails
    try:
        seq_parser = Fasta.SequenceParser(alphabet=IUPAC.unambiguous_dna)
        iterator = Fasta.Iterator(motif_handle, seq_parser)

        seq_records = []
        while 1:
            seq_record = iterator.next()
            # a None record marks the end of the file
            if seq_record is None:
                break
            seq_records.append(seq_record)
    finally:
        motif_handle.close()

    # find motifs from the file
    motif_finder = Motif.MotifFinder()
    motif_size = 9
    motif_bank = motif_finder.find(seq_records, motif_size)

    schema_bank = self.factory.from_motifs(motif_bank, .1, 2)

    # cache the repository
    self.schema_bank = schema_bank

    return schema_bank
def setUp(self):
    """Load the enolase FASTA fixture and create the signature finder.

    Records parsed from NeuralNetwork/enolase.fasta are stored in
    self.test_records and a new Signature.SignatureFinder in
    self.sig_finder.
    """
    test_file = os.path.join('NeuralNetwork', 'enolase.fasta')

    self.test_records = []

    # load the records
    handle = open(test_file, 'r')
    # try/finally so the handle is released even when parsing fails
    try:
        seq_parser = Fasta.SequenceParser(alphabet=IUPAC.unambiguous_dna)
        iterator = Fasta.Iterator(handle, seq_parser)
        while 1:
            seq_record = iterator.next()
            # a None record marks the end of the file
            if seq_record is None:
                break
            self.test_records.append(seq_record)
    finally:
        handle.close()

    self.sig_finder = Signature.SignatureFinder()
def extract_organisms(file_to_parse):
    """Return the distinct species named in the titles of a FASTA file.

    Arguments:
    o file_to_parse - Path of the FASTA file to read.

    The species is taken to be the second whitespace-separated token of
    each record title. Species are returned in order of first appearance,
    without duplicates. A title with fewer than two tokens raises
    IndexError.
    """
    # set up the parser and iterator
    parser = Fasta.RecordParser()
    handle = open(file_to_parse, 'r')

    all_species = []
    # try/finally so the file is always closed (it used to be left open)
    try:
        iterator = Fasta.Iterator(handle, parser)
        while 1:
            cur_record = iterator.next()
            # a None record marks the end of the file
            if cur_record is None:
                break

            # extract the info from the title
            new_species = cur_record.title.split()[1]

            # append the new species to the list if it isn't there
            if new_species not in all_species:
                all_species.append(new_species)
    finally:
        handle.close()

    return all_species
def get_seqs(blastRootDirectory): if len(sys.argv) >= 2: numSeqs = int(sys.argv[1]) if numSeqs < 0 or numSeqs > 100000: print 'requested number of sequences is outside allowable range (1-100000). Using default (1000)' numSeqs = 10 else: numSeqs = 10 print 'requesting', numSeqs, 'query sequences from the server' seqs = phamServer.request_seqs(server, numSeqs, client) '''Builds the file to be blasted from the sequences given''' f = open(os.path.join(blastRootDirectory, 'filetoblast.txt'), 'w') print seqs '''takes the new set of sequences and checks if they exist in the local database and, if so, writes the sequence id and translation to a separate FASTA formated input file to be passed to the BLASTALL executable''' for GeneID in seqs: parser = Fasta.RecordParser() infile = open(os.path.join(blastRootDirectory, 'blastDB.fasta')) iterator = Fasta.Iterator(infile, parser) while 1: record = iterator.next() if not record: break record_id = record.title if GeneID == record_id: f.write('>' + record.title + '\n' + record.sequence + '\n') f.close() return (len(seqs))
def runGlobPlot():
    """Run the GlobPlot analysis on every sequence of a FASTA file.

    The FASTA file name is read from sys.argv[1]; all frame parameters are
    fixed. For each record the function writes the title, the DOM and DIS
    coordinate strings and a per-residue table to stdout.

    Prints a usage message and raises SystemExit when the argument is
    missing or the file cannot be opened.
    """
    try:
        smoothFrame = 10
        DOM_joinFrame = 15
        DOM_peakFrame = 74
        DIS_joinFrame = 4
        DIS_peakFrame = 5
        file = str(sys.argv[1])
        db = open(file, 'r')
    except:
        # any failure above (missing argument, unreadable file) ends here
        print 'Usage:'
        print ' ./GlobPipe.py FASTAfile'
        raise SystemExit
    parser = Fasta.RecordParser()
    iterator = Fasta.Iterator(db, parser)
    while 1:
        # The loop ends through the AttributeError handler below: accessing
        # .sequence on the None returned at end of input raises it. Any
        # AttributeError raised by the processing code ends the loop too.
        try:
            cur_record = iterator.next()
            #uppercase is searchspace
            seq = upper(cur_record.sequence)
            # sum function
            sum_vector = Sum(seq, RL)
            # Run Savitzky-Golay
            # (backticks pass the repr string of smoothFrame, not the int)
            smooth = SavitzkyGolay( ` smoothFrame `, 0, sum_vector)
            dydx_vector = SavitzkyGolay( ` smoothFrame `, 1, sum_vector)
            #test
            # Recompute the first and last smoothFrame derivative values
            # from plain differences of the raw sum vector.
            sumHEAD = sum_vector[:smoothFrame]
            sumTAIL = sum_vector[len(sum_vector) - smoothFrame:]
            newHEAD = []
            newTAIL = []
            for i in range(len(sumHEAD)):
                # forward difference; at the last index i + 1 is out of
                # range and the bare except falls back to a backward one
                try:
                    dHEAD = (sumHEAD[i + 1] - sumHEAD[i]) / 2
                except:
                    dHEAD = (sumHEAD[i] - sumHEAD[i - 1]) / 2
                try:
                    dTAIL = (sumTAIL[i + 1] - sumTAIL[i]) / 2
                except:
                    dTAIL = (sumTAIL[i] - sumTAIL[i - 1]) / 2
                newHEAD.append(dHEAD)
                newTAIL.append(dTAIL)
            dydx_vector[:smoothFrame] = newHEAD
            dydx_vector[len(dydx_vector) - smoothFrame:] = newTAIL
            globdoms, globdis = getSlices(dydx_vector, DOM_joinFrame, DOM_peakFrame, DIS_joinFrame, DIS_peakFrame)
            # the DIS report runs on the sequence returned by the DOM report
            s_domMask, coordstrDOM = reportSlicesTXT(globdoms, seq, 'DOM')
            s_final, coordstrDIS = reportSlicesTXT(globdis, s_domMask, 'DIS')
            sys.stdout.write('>' + cur_record.title + '\n')
            sys.stdout.write('# ' + coordstrDOM + '\n')
            sys.stdout.write('# ' + coordstrDIS + '\n')
            # UNCOMMENT THIS IF NEED TO PRODUCE PER RESEDUE VALUES
            # NOTE(review): the header names the columns DYDX, RAW, SMOOTHED
            # but the rows below write dydx, smooth, sum_vector - the last
            # two look swapped relative to the header; confirm.
            sys.stdout.write('# RESIDUE' + '\t' + 'DYDX' + '\t' + 'RAW' + '\t' + 'SMOOTHED\n')
            for i in range(len(dydx_vector)):
                # dydx (positive values seems to indicate disorder in rows more than ~6 chars) raw smoothed
                sys.stdout.write(seq[i] + '\t' + fpformat.fix(dydx_vector[i], 4) + '\t' + fpformat.fix(smooth[i], 4) + '\t' + fpformat.fix(sum_vector[i], 4) + '\n')
            # print s_final
            print '\n'
        except AttributeError:
            break
    return
filename = "interpro_" + title + ".xml" result = server.poll(jobId, resultType.type) parseResult(result, wh_domain, wh_evidence, title) else: counter += 1 return jobIds jobIds = [] packageFull = False RH = open(infile, "r") WH = open(options.outfile, "w") for prot_record in Fasta.Iterator(RH, Fasta.RecordParser()): seqData = ">" + prot_record.title + "\n" + prot_record.sequence content = [{'type': 'sequence', 'content': seqData}] packageFull = True while (packageFull): jobIds = checkJobs(jobIds, wh_domain, wh_evidence) if (len(jobIds) < 25): packageFull = False if (packageFull): time.sleep(15) sys.stderr.write("Submitted protein " + prot_record.title + " to InterProScan...\n")
if __name__ == "__main__": import getopt opts, args = getopt.getopt(sys.argv[1:], 'hs:t:') if not opts or len(args) != 1: usage() sys.exit('Error usage') fasta_file = open(args[0]) parser = Fasta.RecordParser() for o, a in opts: if o == '-h': usage() sys.exit(0) elif o == '-s': sieve = get_sieve(get_input_handle(a)) iterator = FastaSelectiveIterator(sieve, fasta_file, parser) for record in iterator: print record elif o == '-t': translator = FastaTranslator(get_input_handle(a), reverse=True) iterator = Fasta.Iterator(fasta_file, parser) for record in iterator: print translator(record) else: usage() sys.exit('Error usage')
def initialize(): """ Parse command line options, and read input Fasta file. Construct a dictionary contains the following fields: sequences a list of dictionary objects having 'title', 'sequence', and 'motif_position' attributes (see also the docstring of gibbs.Gibbs.__init__) width width of motif to find weight weight to use for pseudocounts iterations number of non-improving iterations before stopping shifts maximum phase shifts to detect ps_freq frequency of detecting phase shifts init_occurrences number of base occurrences to use for initial motif positions heuristic init_width width of patterns to use for initial motif positions heuristic Return the constructed dictionary. """ parser = OptionParser(usage = "usage: %prog -i FILE -w WIDTH [-h] " "[options]", version = "PyMotif %s (%s)" % (VERSION, DATE), description = "PyMotif is an implementation of the " "Gibbs sampling algorithm for finding local " "alignments of DNA sequences. " "See the accompanied README file for usage " "instructions and the documentation directory for " "implementation details.") parser.add_option("-i", "--input", dest="input", metavar="FILE", help="read FILE in Fasta format") parser.add_option("-w", "--width", dest="width", metavar="WIDTH", type="int", help="find motif of width WIDTH") parser.add_option("-t", "--iterations", dest="iterations", metavar="ITERATIONS", default=ITERATIONS_DEFAULT, type="int", help="number of non-improving iterations " "(default " + str(ITERATIONS_DEFAULT) + ")") parser.add_option("-p", "--pseudo", dest="pseudo", metavar="WEIGHT", default=PSEUDOCOUNTS_WEIGHT_DEFAULT, type="float", help="use WEIGHT for weight of pseudocounts (default " + str(PSEUDOCOUNTS_WEIGHT_DEFAULT) + ")") parser.add_option("-s", "--phase-shifts", dest="shifts", metavar="SHIFTS", default=PHASE_SHIFTS_DEFAULT, type="int", help="detect phase shifts of width SHIFTS (default " + str(PHASE_SHIFTS_DEFAULT) + ")") parser.add_option("-f", "--ps-frequency", dest="frequency", metavar="FREQ", 
default=PS_FREQUENCY_DEFAULT, type="int", help="if SHIFTS>0, detect phase shifts " "every FREQ iterations (default " + str(PS_FREQUENCY_DEFAULT) + ")") parser.add_option("-n", "--init-num-occurrences", dest="initoccurrences", metavar="OCCURRENCES", default=INIT_NUM_OCCURRENCES_DEFAULT, type="int", help="number of base occurrences to use for initial " "positions heuristic (default " + str(INIT_NUM_OCCURRENCES_DEFAULT) + ")") parser.add_option("-v", "--init-pattern-width", dest="initwidth", metavar="WIDTH", default=INIT_PATTERN_WIDTH_DEFAULT, type="int", help="if OCCURRENCES>0, width of pattern " "to use for initial positions heuristic (defaults to " "value of --width)") parser.add_option("-c", "--cow", action="store_true", dest="cow", default=False, help="display cow (not recommended)") (options, args) = parser.parse_args() if options.cow: s = "" for _ in range(10): s += choice("ATCG") # Created with the cowsay program print """ ____________ < %s > ------------ \ ^__^ \ (oo)\_______ (__)\ )\/\\ ||----w | || ||""" % s sys.exit(0) if not options.input: parser.error("input file required") if not options.width: parser.error("width argument required") if options.width < 2: parser.error("please use a sane motif width") # Read contents of Fasta file try: file = open(options.input) except IOError: parser.error("could not read file %s" % options.input) fasta_parser = Fasta.RecordParser() # Iterator for sample data fasta_iterator = Fasta.Iterator(file, fasta_parser) # A list containing a dictionary object for each sequence sequences = [{'title': record.title, 'sequence': record.sequence, 'motif_position': 0} for record in fasta_iterator] # We could do some more error checking on the input file here, like # checking there's only ATCG and at least a few of them, but for now # this is enough if len(sequences) < 2: parser.error("found %i sequences in input file %s" % (len(sequences), options.input)) return {'sequences': sequences, 'width': options.width, 'weight': options.pseudo, 
'iterations': options.iterations, 'shifts': options.shifts, 'ps_freq': options.frequency, 'init_occurrences': options.initoccurrences, 'init_width': options.initwidth}
#!/usr/bin/env python """Example showing how to deal with internet BLAST from Biopython. This code is described in great detail in the BLAST section of the Biopython documentation. """ # standard library import cStringIO # biopython from Bio.Blast import NCBIWWW from Bio import Fasta # first get the sequence we want to parse from a FASTA file file_for_blast = open('m_cold.fasta', 'r') f_iterator = Fasta.Iterator(file_for_blast) f_record = f_iterator.next() print 'Doing the BLAST and retrieving the results...' result_handle = NCBIWWW.qblast('blastn', 'nr', f_record) # save the results for later, in case we want to look at it save_file = open('m_cold_blast.out', 'w') blast_results = result_handle.read() save_file.write(blast_results) save_file.close() print 'Parsing the results and extracting info...' b_parser = NCBIWWW.BlastParser()