def test_fasta_reader_cleanup():
    """Check that the input handle is closed once fastaReader has been drained."""
    handle = open(_data_path("fasta_reader_1.fasta"))
    with _new_argv([handle]):
        # Keep the reader bound to a name for the whole test so the handle
        # state is checked while the reader object is still alive.
        records = fastaReader(handle)
        for _record in records:
            # The records themselves are irrelevant; only exhaustion matters.
            pass
    assert handle.closed
def main():
    """Combine a FASTA file with quality scores and write the result as FASTQ.

    Arguments are read positionally from ``sys.argv``:

    1. path of the FASTA input
    2. FASTA type; an empty string falls back to ``'fasta'``
    3. path of the quality file, or the literal ``'None'`` to take scores
       from ``fastqFakeFastaScoreReader`` instead of a file
    4. quality type; an empty string falls back to ``'qualsanger'``
    5. path of the FASTQ output
    6. quality encoding to force, or the literal ``'None'`` to force nothing

    Sequences for which ``qual_input.get`` returns a false value are
    skipped and counted.  A summary is printed to stdout at the end.
    """
    # Read command line arguments
    fasta_filename = sys.argv[1]
    fasta_type = sys.argv[2] or 'fasta'  # should always be fasta or csfasta? what if txt?
    qual_filename = sys.argv[3]
    qual_type = sys.argv[4] or 'qualsanger'  # qual454 qualsolid
    output_filename = sys.argv[5]
    force_quality_encoding = sys.argv[6]
    if force_quality_encoding == 'None':
        force_quality_encoding = None

    # Derive the FASTQ variant from the declared input types.
    fastq_format = 'sanger'
    if fasta_type == 'csfasta' or qual_type == 'qualsolid':
        fastq_format = 'cssanger'
    elif qual_type == 'qualsolexa':
        fastq_format = 'solexa'
    elif qual_type == 'qualillumina':
        fastq_format = 'illumina'

    out = fastqWriter(path=output_filename, format=fastq_format,
                      force_quality_encoding=force_quality_encoding)
    qual_handle = None
    i = None
    skip_count = 0
    try:
        try:
            if qual_filename == 'None':
                qual_input = fastqFakeFastaScoreReader(
                    fastq_format, quality_encoding=force_quality_encoding)
            else:
                qual_handle = open(qual_filename, 'rt')
                qual_input = fastaNamedReader(qual_handle)
            fastq_combiner = fastqCombiner(fastq_format)
            with open(fasta_filename, 'rt') as fasta_handle:
                for i, sequence in enumerate(fastaReader(fasta_handle)):
                    quality = qual_input.get(sequence)
                    if quality:
                        out.write(fastq_combiner.combine(sequence, quality))
                    else:
                        skip_count += 1
        finally:
            # Close the output even when reading or combining fails.
            out.close()

        if i is None:
            print("Your file contains no valid FASTA sequences.")
        else:
            total = i + 1
            combined = total - skip_count
            # The quality handle is deliberately still open here:
            # has_data() is called on the reader after the main loop.
            print(qual_input.has_data())
            print('Combined %s of %s sequences with quality scores (%.2f%%).'
                  % (combined, total, float(combined) / float(total) * 100.0))
    finally:
        if qual_handle is not None:
            qual_handle.close()
def load_primers_as_re(primer_fasta, mm, rc=False):
    """Build one combined regular expression from a FASTA file of primers.

    Each record read from ``primer_fasta`` is expanded into patterns by
    ``make_reg_ex_mm(seq, mm)``; the distinct patterns are joined with ``|``
    and compiled into a single regular expression.

    Args:
        primer_fasta: path of the FASTA file holding the primer sequences.
        mm: passed unchanged as the second argument of ``make_reg_ex_mm``
            (presumably the number of mismatches to allow - confirm there).
        rc: when true, every primer sequence is run through
            ``reverse_complement`` before being expanded.

    Returns:
        A ``(count, pattern)`` tuple: the number of records read and the
        compiled regular expression.  If no records are read the pattern is
        compiled from the empty string.
    """
    import sys  # local import so this function does not rely on file-level imports

    # The "U" mode flag was removed in Python 3.11 (open() raises ValueError)
    # and was already a no-op on Python 3, where text mode always applies
    # universal newlines.  Only request it on Python 2.
    mode = "rU" if sys.version_info[0] < 3 else "r"

    # Read primer file and record all specified sequences
    primers = set()
    count = 0
    # The with-block closes the file even if reading or expansion raises.
    with open(primer_fasta, mode) as in_handle:
        for record in fastaReader(in_handle):
            seq = reverse_complement(record.sequence) if rc else record.sequence
            count += 1
            primers.update(make_reg_ex_mm(seq, mm))

    # The set already removed duplicates; sort to have longest first
    # (so more specific primers found before less specific ones)
    ordered = sorted(primers, key=lambda p: -len(p))
    return count, re.compile("|".join(ordered))  # make one monster re!
def main():
    """Combine a FASTA file with quality scores and write the result as FASTQ.

    Arguments are read positionally from ``sys.argv``:

    1. path of the FASTA input
    2. FASTA type; an empty string falls back to ``'fasta'``
    3. path of the quality file, or the literal ``'None'`` to take scores
       from ``fastqFakeFastaScoreReader`` instead of a file
    4. quality type; an empty string falls back to ``'qualsanger'``
    5. path of the FASTQ output
    6. quality encoding to force, or the literal ``'None'`` to force nothing

    Sequences for which ``qual_input.get`` returns a false value are
    skipped and counted.  A summary is printed to stdout at the end.
    """
    #Read command line arguments
    fasta_filename = sys.argv[1]
    fasta_type = sys.argv[2] or 'fasta'  #should always be fasta or csfasta? what if txt?
    qual_filename = sys.argv[3]
    qual_type = sys.argv[4] or 'qualsanger'  #qual454 qualsolid
    output_filename = sys.argv[5]
    force_quality_encoding = sys.argv[6]
    if force_quality_encoding == 'None':
        force_quality_encoding = None

    #Derive the FASTQ variant from the declared input types
    fastq_format = 'sanger'
    if fasta_type == 'csfasta' or qual_type == 'qualsolid':
        fastq_format = 'cssanger'
    elif qual_type == 'qualsolexa':
        fastq_format = 'solexa'
    elif qual_type == 'qualillumina':
        fastq_format = 'illumina'

    out = fastqWriter(open(output_filename, 'wb'), format=fastq_format,
                      force_quality_encoding=force_quality_encoding)
    qual_handle = None
    i = None
    skip_count = 0
    try:
        try:
            if qual_filename == 'None':
                qual_input = fastqFakeFastaScoreReader(
                    fastq_format, quality_encoding=force_quality_encoding)
            else:
                qual_handle = open(qual_filename, 'rb')
                qual_input = fastaNamedReader(qual_handle)
            fastq_combiner = fastqCombiner(fastq_format)
            with open(fasta_filename, 'rb') as fasta_handle:
                for i, sequence in enumerate(fastaReader(fasta_handle)):
                    quality = qual_input.get(sequence)
                    if quality:
                        out.write(fastq_combiner.combine(sequence, quality))
                    else:
                        skip_count += 1
        finally:
            #Close the output even when reading or combining fails
            out.close()

        #Each print takes a single parenthesised argument, so the output is
        #the same as a Python 2 print statement and also valid on Python 3
        if i is None:
            print("Your file contains no valid FASTA sequences.")
        else:
            total = i + 1
            combined = total - skip_count
            #The quality handle is deliberately still open here:
            #has_data() is called on the reader after the main loop
            print(qual_input.has_data())
            print('Combined %s of %s sequences with quality scores (%.2f%%).'
                  % (combined, total, float(combined) / float(total) * 100.0))
    finally:
        if qual_handle is not None:
            qual_handle.close()
if len(record.sequence) >= min_len: record.quality = record.quality[:cut] clipped += 1 writer.write(record) else: short_clipped += 1 elif keep_negatives: if len(record) >= min_len: negs += 1 writer.write(record) else: short_negs += 1 elif seq_format.lower() == "fasta": in_handle = open(in_file, "rU") out_handle = open(out_file, "w") reader = fastaReader(in_handle) writer = fastaWriter(out_handle) #Following code is identical to that for FASTQ but without editing qualities if forward: for record in reader: seq = record.sequence.upper() result = primer.search(seq) if result: #Forward primer, take everything after it cut = result.end() record.sequence = seq[cut:] if len(record.sequence) >= min_len: clipped += 1 writer.write(record) else: short_clipped += 1
try: manifest = ReadRocheXmlManifest(in_handle) except ValueError: manifest = None out_handle = open(out_file, "wb") writer = SffWriter(out_handle, xml=manifest) in_handle.seek(0) #start again after getting manifest count = writer.write_file(rename_seqrecords(SffIterator(in_handle), rename)) out_handle.close() in_handle.close() else: #Use Galaxy for FASTA, QUAL or FASTQ if seq_format.lower() in ["fasta", "csfasta"] \ or seq_format.lower().startswith("qual"): from galaxy_utils.sequence.fasta import fastaReader, fastaWriter reader = fastaReader(open(in_file, "rU")) writer = fastaWriter(open(out_file, "w")) marker = ">" elif seq_format.lower().startswith("fastq"): from galaxy_utils.sequence.fastq import fastqReader, fastqWriter reader = fastqReader(open(in_file, "rU")) writer = fastqWriter(open(out_file, "w")) marker = "@" else: sys.exit("Unsupported file type %r" % seq_format) #Now do the renaming count = 0 renamed = 0 for record in reader: #The [1:] is because the fastaReader leaves the > on the identifier, #likewise the fastqReader leaves the @ on the identifier
if len(record.sequence) >= min_len: record.quality = record.quality[:cut] clipped += 1 writer.write(record) else: short_clipped += 1 elif keep_negatives: if len(record) >= min_len: negs += 1 writer.write(record) else: short_neg += 1 elif seq_format.lower()=="fasta": in_handle = open(in_file, "rU") out_handle = open(out_file, "w") reader = fastaReader(in_handle) writer = fastaWriter(out_handle) #Following code is identical to that for FASTQ but without editing qualities if forward: for record in reader: seq = record.sequence.upper() result = primer.search(seq) if result: #Forward primer, take everything after it cut = result.end() record.sequence = seq[cut:] if len(record.sequence) >= min_len: clipped += 1 writer.write(record) else: short_clipped += 1
except ValueError: manifest = None out_handle = open(out_file, "wb") writer = SffWriter(out_handle, xml=manifest) in_handle.seek(0) # start again after getting manifest count = writer.write_file(rename_seqrecords(SffIterator(in_handle), rename)) out_handle.close() in_handle.close() else: # Use Galaxy for FASTA, QUAL or FASTQ if seq_format.lower() in ["fasta", "csfasta"] or seq_format.lower().startswith( "qual" ): from galaxy_utils.sequence.fasta import fastaReader, fastaWriter reader = fastaReader(open(in_file, "rU")) writer = fastaWriter(open(out_file, "w")) marker = ">" elif seq_format.lower().startswith("fastq"): from galaxy_utils.sequence.fastq import fastqReader, fastqWriter reader = fastqReader(open(in_file, "rU")) writer = fastqWriter(open(out_file, "w")) marker = "@" else: sys.exit("Unsupported file type %r" % seq_format) # Now do the renaming count = 0 renamed = 0 for record in reader: # The [1:] is because the fastaReader leaves the > on the identifier,