Ejemplo n.º 1
0
def test_yield_record_and_tokenized_headers():
    """With an empty whitelist, no record of fixture 1 is yielded."""
    # NOTE: the whitelist tests overlap with this one, so it could either be
    # removed or expanded.
    empty_whitelist = set()
    stream = io.StringIO(fasta_tests[1])
    fasta_records = FastaReader.yield_fasta_record(stream, log=lambda x: None)
    selected = list(
        mod.yield_record_and_tokenized_headers(empty_whitelist, fasta_records))
    assert [] == selected
Ejemplo n.º 2
0
def test_fasta_simple():
    """Parse the FASTA fixture and check the fields of its single record."""
    records = list(M.yield_fasta_record(StringIO(FASTA), None))
    sequences = [parsed.sequence for parsed in records]
    assert sequences == ['ACGT']

    # The header is expected to split into an id and trailing metadata.
    first = records[0]
    assert first.metadata == 'FOO=BAR'
    assert first.id == 'foo/bar/0_42'
    assert first.name == 'foo/bar/0_42 FOO=BAR'
Ejemplo n.º 3
0
def test_gzip(tmp_path):
    """Round-trip a gzipped FASTA file through the reader and the writer.

    The fixture is written to `tmp_path`, compressed with the external
    `gzip` command, read back record by record and re-written to a second
    gzipped file. The decompressed contents of both files must match.
    """
    fp = tmp_path / "foo.fasta"
    with fp.open('w') as sout:
        sout.write(FASTA)
    # gzip replaces foo.fasta with foo.fasta.gz.
    syscall('gzip {}'.format(fp))
    fr_gz = str(fp) + '.gz'
    fw_gz = str(fp) + 'w.gz'

    def noop(*args):
        """Silent stand-in for the `log` callback."""
        pass

    with M.open_fasta_reader(fr_gz, log=noop) as reader:
        with M.open_fasta_writer(fw_gz, log=noop) as writer:
            for record in reader:
                assert record.sequence == 'ACGT'
                writer.write(str(record))
                writer.write('\n')

    # Compare decompressed payloads; the context managers make sure both
    # gzip handles are closed even when the assertion fails.
    with gzip.open(fr_gz, 'rb') as expected:
        with gzip.open(fw_gz, 'rb') as actual:
            assert expected.read() == actual.read()
    os.remove(fr_gz)
    os.remove(fw_gz)
Ejemplo n.º 4
0
def run_median_filter(fp_in, fp_out, fn, zmw_filter_func=median_zmw_subread):
    """Write one selected subread per ZMW from `fp_in` to `fp_out`.

    The input is read twice: the first pass collects per-ZMW subread
    information, the second pass writes out the chosen subreads. `fp_in`
    therefore has to be seekable.

    Args:
        fp_in: Seekable input handle positioned at the start of the FASTA data.
        fp_out: Output handle; each selected record is written as
            `str(record)` followed by a newline.
        fn: Path of the input file; it must exist on disk.
        zmw_filter_func: Callable that receives the list of ZMWTuple entries
            of one ZMW and returns the entry to keep.

    Raises:
        AssertionError: If `fn` does not exist.
    """
    # Expect an actual file, not a stream.
    assert os.path.exists(fn), 'Input file does not exist: {!r}'.format(fn)

    # Needed to jump back for the second pass.
    fp_in_start = fp_in.tell()

    # Stores all subreads for a ZMW.
    zmw_dict = collections.defaultdict(list)

    # First pass, collect all ZMW info.
    for record in FastaReader.yield_fasta_records(fp_in, fn, log=LOG.info):
        movie_name, zmw_id, subread_start, subread_end = tokenize_header(record.name)
        # Store None instead of the actual record so that the sequence is not
        # kept in memory; subread_id is the order of appearance within the ZMW.
        zrec = ZMWTuple(movie_name=movie_name, zmw_id=zmw_id,
                        subread_start=subread_start, subread_end=subread_end,
                        seq_len=len(record.sequence), subread_record=None,
                        subread_header=record.name, subread_id=len(zmw_dict[zmw_id]))
        zmw_dict[zmw_id].append(zrec)

    # For each ZMW, keep only one particular subread, specified by its order of
    # appearance in the input FASTA file (stored in median_zrec.subread_id).
    whitelist = collections.defaultdict(int)
    # .items() works on both Python 2 and 3 (.iteritems() is Python 2 only).
    for zmw_id, zmw_subreads in zmw_dict.items():
        median_zrec = zmw_filter_func(list(zmw_subreads))
        whitelist[zmw_id] = median_zrec.subread_id

    # Second pass, yield selected sequences.
    # Rewind.
    fp_in.seek(fp_in_start, os.SEEK_SET)
    # Fly-through.
    for record in FastaReader.yield_fasta_records(fp_in, fn, log=LOG.info):
        _, zmw_id, _, _ = tokenize_header(record.name)
        # The whitelist entry counts down once per subread of the ZMW; the
        # subread seen when the counter is exactly zero is the selected one.
        if whitelist[zmw_id] == 0:
            fp_out.write(str(record))
            fp_out.write('\n')
        whitelist[zmw_id] -= 1
Ejemplo n.º 5
0
def write_record(fp, record):
    """Write `record` to `fp` as a '>' header line plus the wrapped sequence."""
    header_line = '>{}\n'.format(str_name(record.name, record.length))
    fp.write(header_line)
    # Sequence lines are wrapped to the column width used by FastaRecord.
    wrapped_sequence = FastaReader.wrap(
        record.sequence, FastaReader.FastaRecord.COLUMNS)
    fp.write(wrapped_sequence)
    fp.write('\n')
Ejemplo n.º 6
0
    def yield_zmwtuple_func(store_record=True):
        """Rewind the enclosing input handle and return a fresh ZMW tuple iterator.

        `store_record` is passed through unchanged to `yield_zmwtuple`.
        """
        # Go back to the remembered start offset so every call re-reads the
        # input from the beginning.
        fp_in.seek(fp_in_start, os.SEEK_SET)

        return yield_zmwtuple(
            FastaReader.yield_fasta_record(fp_in, log=LOG.info),
            whitelist_set,
            store_record)
Ejemplo n.º 7
0
def run_pass_filter(fp_in, fp_out, whitelist_set):
    """Write to `fp_out` the records of `fp_in` selected by `yield_record`.

    `whitelist_set` is handed to `yield_record` together with the parsed
    FASTA records; every record it yields is written with `write_record`.
    """
    fasta_records = FastaReader.yield_fasta_record(fp_in, log=LOG.info)
    for selected in yield_record(whitelist_set, fasta_records):
        write_record(fp_out, selected)
Ejemplo n.º 8
0
 def yield_zmwtuple_func(store_record=True):
     """Return a ZMW tuple iterator over the enclosing input handle.

     `store_record` is passed through unchanged to `yield_zmwtuple`.
     """
     return yield_zmwtuple(
         FastaReader.yield_fasta_record(fp_in, log=LOG.info),
         whitelist_set,
         store_record)
Ejemplo n.º 9
0
def run_pass_filter(fp_in, fp_out, fn):
    """Copy every FASTA record from `fp_in` to `fp_out` without filtering.

    Each record is written as `str(record)` followed by a newline.
    """
    all_records = FastaReader.yield_fasta_records(fp_in, fn, log=LOG.info)
    for rec in all_records:
        fp_out.write(str(rec))
        fp_out.write('\n')
Ejemplo n.º 10
0
def run_streamed_median(fp_in, fp_out, fn='-', zmw_filter_func=median_zmw_subread):
    """Stream FASTA records and write one selected subread per ZMW.

    Records are grouped by `zmw_id` with `itertools.groupby`, which only
    merges adjacent items, so subreads of one ZMW must be consecutive in the
    input for them to be treated as a single group.

    Args:
        fp_in: Input handle with the FASTA data.
        fp_out: Output handle; each selected record is written as
            `str(record)` followed by a newline.
        fn: Name of the input, forwarded to the FASTA reader.
        zmw_filter_func: Callable that receives the list of ZMW tuples of one
            ZMW and returns the one whose `subread_record` is written.
    """
    fasta_records = FastaReader.yield_fasta_records(fp_in, fn, log=LOG.info)
    zmw_tuples = yield_zmwtuples(fasta_records)
    for zmw_id, zmw_subreads in itertools.groupby(zmw_tuples, lambda x: x.zmw_id):
        # Honour the caller-supplied selector instead of always using the
        # default median filter.
        selected_zrec = zmw_filter_func(list(zmw_subreads))
        fp_out.write(str(selected_zrec.subread_record))
        fp_out.write('\n')
Ejemplo n.º 11
0
def test_fasta_empty():
    """An empty input stream produces no FASTA records."""
    empty_stream = StringIO('')
    records = list(M.yield_fasta_record(empty_stream, None))
    assert not records
Ejemplo n.º 12
0
def yield_record(input_files):
    """Yield the FASTA records of every file in `input_files`, in order.

    Each file is opened in text mode and stays open only while its own
    records are being consumed.
    """
    for path in input_files:
        with open(path, 'r') as handle:
            for rec in FastaReader.yield_fasta_record(handle, log=LOG.info):
                yield rec