def test_yield_record_and_tokenized_headers():
    """Check that passing an empty set as the first argument yields nothing."""
    # We could delete this test, since we have whitelist tests also.
    # Or we could expand this.
    def quiet(x):
        # Logger stand-in that discards its message.
        return None

    stream = io.StringIO(fasta_tests[1])
    fasta_records = FastaReader.yield_fasta_record(stream, log=quiet)
    empty = set()
    produced = list(mod.yield_record_and_tokenized_headers(empty, fasta_records))
    assert produced == []
def test_fasta_simple():
    """Parse the FASTA fixture and verify the fields of its single record."""
    records = list(M.yield_fasta_record(StringIO(FASTA), None))
    sequences = [r.sequence for r in records]
    assert sequences == ['ACGT']

    first = records[0]
    assert first.metadata == 'FOO=BAR'
    assert first.id == 'foo/bar/0_42'
    assert first.name == 'foo/bar/0_42 FOO=BAR'
def test_gzip(tmp_path):
    """Round-trip a gzipped FASTA file through the reader and the writer.

    The FASTA fixture is written to a temporary file and compressed via
    ``syscall('gzip ...')``. Every record read from the ``.gz`` input is
    written to a second ``.gz`` file, and the decompressed contents of both
    files must be identical.
    """
    fp = tmp_path / "foo.fasta"
    with fp.open('w') as sout:
        sout.write(FASTA)
    syscall('gzip {}'.format(fp))
    fr_gz = str(fp) + '.gz'
    fw_gz = str(fp) + 'w.gz'

    def noop(*args):
        # Logger stand-in that ignores everything.
        pass

    with M.open_fasta_reader(fr_gz, log=noop) as reader:
        with M.open_fasta_writer(fw_gz, log=noop) as writer:
            for record in reader:
                assert record.sequence == 'ACGT'
                writer.write(str(record))
                writer.write('\n')

    # Compare only after the writer is closed, and close both gzip handles
    # deterministically instead of leaking them.
    with gzip.open(fr_gz, 'rb') as expected, gzip.open(fw_gz, 'rb') as actual:
        assert expected.read() == actual.read()

    os.remove(fr_gz)
    os.remove(fw_gz)
def run_median_filter(fp_in, fp_out, fn, zmw_filter_func=median_zmw_subread):
    """Write one selected subread per ZMW from a FASTA file, using two passes.

    The first pass collects per-subread metadata grouped by ZMW id, and
    ``zmw_filter_func`` picks one subread for each ZMW. The second pass
    rewinds ``fp_in`` and writes only the picked records, in the order they
    appear in the input.

    Args:
        fp_in: Seekable input handle; it is read twice from its current
            position.
        fp_out: Output handle; receives ``str(record)`` followed by a newline
            for every selected record.
        fn: Path of the input file; it must exist on disk.
        zmw_filter_func: Callable that takes the list of ``ZMWTuple`` for one
            ZMW and returns the chosen one; only its ``subread_id`` is used.

    Raises:
        AssertionError: If ``fn`` does not exist.
    """
    # Expect an actual file, not a stream.
    assert os.path.exists(fn)

    # Needed to jump back for the second pass.
    fp_in_start = fp_in.tell()

    # Stores all subreads for a ZMW.
    zmw_dict = collections.defaultdict(list)

    # First pass, collect all ZMW info.
    for record in FastaReader.yield_fasta_records(fp_in, fn, log=LOG.info):
        movie_name, zmw_id, subread_start, subread_end = tokenize_header(record.name)
        # Store None instead of the actual record to free the memory after yield.
        zrec = ZMWTuple(movie_name=movie_name, zmw_id=zmw_id,
                        subread_start=subread_start, subread_end=subread_end,
                        seq_len=len(record.sequence), subread_record=None,
                        subread_header=record.name,
                        subread_id=len(zmw_dict[zmw_id]))
        zmw_dict[zmw_id].append(zrec)

    # For each ZMW, keep only one particular subread, specified by its order of
    # appearance in the input FASTA file (stored in median_zrec.subread_id).
    whitelist = collections.defaultdict(int)
    # .items() works on both Python 2 and 3; .iteritems() is Python 2 only.
    for zmw_id, zmw_subreads in zmw_dict.items():
        median_zrec = zmw_filter_func(list(zmw_subreads))
        whitelist[zmw_id] = median_zrec.subread_id

    # Second pass, yield selected sequences.
    # Rewind.
    fp_in.seek(fp_in_start, os.SEEK_SET)
    # Fly-through.
    for record in FastaReader.yield_fasta_records(fp_in, fn, log=LOG.info):
        _, zmw_id, _, _ = tokenize_header(record.name)
        # Write-out only one particular subread from the ZMW. The counter
        # starts at the chosen subread's index and is decremented once per
        # subread seen, so it equals zero exactly at the chosen subread.
        if whitelist[zmw_id] == 0:
            fp_out.write(str(record))
            fp_out.write('\n')
        whitelist[zmw_id] -= 1
def write_record(fp, record):
    """Write one record to ``fp``: a '>' header line, then the wrapped sequence.

    The header comes from ``str_name(record.name, record.length)`` and the
    sequence is wrapped with ``FastaReader.wrap`` at
    ``FastaReader.FastaRecord.COLUMNS``.
    """
    header = str_name(record.name, record.length)
    fp.write('>{}\n'.format(header))

    wrapped = FastaReader.wrap(
        record.sequence, FastaReader.FastaRecord.COLUMNS)
    fp.write(wrapped)
    fp.write('\n')
def yield_zmwtuple_func(store_record=True):
    """Rewind the input and return ``yield_zmwtuple`` over its FASTA records.

    Uses ``fp_in``, ``fp_in_start`` and ``whitelist_set`` from the enclosing
    scope; ``store_record`` is forwarded unchanged.
    """
    # Rewind.
    fp_in.seek(fp_in_start, os.SEEK_SET)
    return yield_zmwtuple(
        FastaReader.yield_fasta_record(fp_in, log=LOG.info),
        whitelist_set,
        store_record)
def run_pass_filter(fp_in, fp_out, whitelist_set):
    """Write every record that ``yield_record`` produces for the whitelist.

    Records are read from ``fp_in`` and each one returned by
    ``yield_record(whitelist_set, ...)`` is passed to ``write_record``
    together with ``fp_out``.
    """
    fasta_records = FastaReader.yield_fasta_record(fp_in, log=LOG.info)
    for selected in yield_record(whitelist_set, fasta_records):
        write_record(fp_out, selected)
def yield_zmwtuple_func(store_record=True):
    """Return ``yield_zmwtuple`` over the FASTA records read from ``fp_in``.

    Uses ``fp_in`` and ``whitelist_set`` from the enclosing scope;
    ``store_record`` is forwarded unchanged. The input is not rewound.
    """
    return yield_zmwtuple(
        FastaReader.yield_fasta_record(fp_in, log=LOG.info),
        whitelist_set,
        store_record)
def run_pass_filter(fp_in, fp_out, fn):
    """Copy every FASTA record from ``fp_in`` to ``fp_out`` without filtering.

    Each record is written as ``str(record)`` followed by a newline.
    """
    all_records = FastaReader.yield_fasta_records(fp_in, fn, log=LOG.info)
    for rec in all_records:
        fp_out.write(str(rec))
        fp_out.write('\n')
def run_streamed_median(fp_in, fp_out, fn='-', zmw_filter_func=median_zmw_subread):
    """Write one selected subread per ZMW in a single streaming pass.

    Records are grouped by ``zmw_id`` with ``itertools.groupby``, which only
    groups consecutive items, so the subreads of a ZMW must be adjacent in
    the input to be treated as one group.

    Args:
        fp_in: Input handle to read FASTA records from.
        fp_out: Output handle; receives ``str(subread_record)`` followed by a
            newline for the chosen subread of every group.
        fn: Name passed through to ``FastaReader.yield_fasta_records``.
        zmw_filter_func: Callable that takes the list of ZMW tuples for one
            group and returns the chosen one; its ``subread_record`` is
            written out.
    """
    fasta_records = FastaReader.yield_fasta_records(fp_in, fn, log=LOG.info)
    grouped = itertools.groupby(
        yield_zmwtuples(fasta_records), lambda x: x.zmw_id)
    for _zmw_id, zmw_subreads in grouped:
        # Honor the caller-supplied filter; the hard-coded call to
        # median_zmw_subread silently ignored this parameter.
        median_zrec = zmw_filter_func(list(zmw_subreads))
        fp_out.write(str(median_zrec.subread_record))
        fp_out.write('\n')
def test_fasta_empty():
    """An empty input stream must produce no FASTA records."""
    empty_stream = StringIO('')
    records = list(M.yield_fasta_record(empty_stream, None))
    assert not records
def yield_record(input_files):
    """Yield every FASTA record from each file in ``input_files``, in order.

    Each path is opened in text mode for reading and closed once its records
    have been consumed.
    """
    for path in input_files:
        with open(path, 'r') as handle:
            for rec in FastaReader.yield_fasta_record(handle, log=LOG.info):
                yield rec