def test_sorted_items(self):
    """Check that sorted_items returns every item in ascending order.

    Duplicated values must be kept. The same expectation is checked with
    the default arguments, with an explicit tempdir, and with
    max_items_in_memory set below the number of input items. An empty
    input must produce an empty result.
    """
    items = [1, 2, 3, 4, 4, 3, 2, 1]
    expected = [1, 1, 2, 2, 3, 3, 4, 4]

    sorted_result = sorted_items(iter(items))
    assert list(sorted_result) == expected

    # tempfile.tempdir is None until gettempdir() has been called, so ask
    # for the real directory to make sure a path is actually passed in.
    sorted_result = sorted_items(iter(items), tempdir=tempfile.gettempdir())
    assert list(sorted_result) == expected

    sorted_result = sorted_items(iter(items), max_items_in_memory=3)
    assert list(sorted_result) == expected

    # The empty iterator has to go through sorted_items; checking a
    # previously consumed result would pass without testing anything.
    sorted_result = sorted_items(iter([]))
    assert not list(sorted_result)
def test_key(self):
    """Check sorted_items and unique when a key function is given.

    The items are sorted by their first and then by their second field.
    For each ordering, unique is expected to keep only the first item
    found for every key value.
    """
    items = [(1, 'a'), (1, 'b'), (2, 'a')]

    def first_field(item):
        return item[0]

    def second_field(item):
        return item[1]

    # Materialize the sorted result once: it is needed both for the
    # ordering assertion and as the input of unique, and a one-shot
    # iterator would be exhausted after the first use.
    by_first = list(sorted_items(iter(items), key=first_field))
    assert by_first == [(1, 'a'), (1, 'b'), (2, 'a')]
    unique_items = unique(by_first, key=first_field)
    assert list(unique_items) == [(1, 'a'), (2, 'a')]

    by_second = list(sorted_items(iter(items), key=second_field))
    assert by_second == [(1, 'a'), (2, 'a'), (1, 'b')]
    unique_items = unique(by_second, key=second_field)
    assert list(unique_items) == [(1, 'a'), (1, 'b')]
def test_key(self):
    """Check sorted_items and unique when a key function is given.

    Sorting by the first field is done with max_items_in_memory=1;
    sorting by the second field uses the default arguments. For each
    ordering, unique is expected to keep only the first item found for
    every key value.
    """
    items = [(1, 'a'), (1, 'b'), (2, 'a')]

    def first_field(item):
        return item[0]

    def second_field(item):
        return item[1]

    # Both sorted results are materialized once, because each one feeds
    # the ordering assertion and then unique; a one-shot iterator would
    # be exhausted after the first use.
    by_first = list(sorted_items(iter(items), key=first_field,
                                 max_items_in_memory=1))
    assert by_first == [(1, 'a'), (1, 'b'), (2, 'a')]
    unique_items = unique(by_first, key=first_field)
    assert list(unique_items) == [(1, 'a'), (2, 'a')]

    by_second = list(sorted_items(iter(items), key=second_field))
    assert by_second == [(1, 'a'), (2, 'a'), (1, 'b')]
    unique_items = unique(by_second, key=second_field)
    assert list(unique_items) == [(1, 'a'), (1, 'b')]
def sort_fastx_files(in_fhands, key, index_fpath=None, directory=None,
                     max_items_in_memory=None, tempdir=None):
    """Sort the sequences found in in_fhands by the requested key.

    With key 'seq' or 'name' the result of read_seqs(in_fhands) is handed
    to sorted_items, using get_str_seq or get_name as the sorting key and
    forwarding tempdir and max_items_in_memory. With key 'coordinate' the
    work is delegated to sort_by_position_in_ref, which receives
    index_fpath, directory and tempdir.

    Raises ValueError for any other key.
    """
    # Sorting by coordinate does not go through sorted_items at all.
    if key == 'coordinate':
        return sort_by_position_in_ref(in_fhands, index_fpath=index_fpath,
                                       directory=directory, tempdir=tempdir)

    if key == 'seq':
        item_key = get_str_seq
    elif key == 'name':
        item_key = get_name
    else:
        raise ValueError('Non-supported sorting key')

    return sorted_items(read_seqs(in_fhands), key=item_key, tempdir=tempdir,
                        max_items_in_memory=max_items_in_memory)
def _get_paired_and_orphan(reads, ordered, max_reads_memory, temp_dir):
    """Return group_pairs_by_name applied to the reads, sorting if needed.

    When ordered is true the reads are used as given. Otherwise they are
    first passed through sorted_items, keyed by get_title, with
    max_reads_memory and temp_dir forwarded as positional arguments.
    """
    if not ordered:
        reads = sorted_items(reads, lambda seq: get_title(seq),
                             max_reads_memory, temp_dir)
    return group_pairs_by_name(reads)
def filter_duplicates(in_fhands, out_fhand, paired_reads, n_seqs_packet=None,
                      tempdir=None):
    """Write to out_fhand the pairs read from in_fhands without duplicates.

    The pairs returned by _read_pairs are sorted with sorted_items using
    _get_pair_key (n_seqs_packet is forwarded as max_items_in_memory and
    tempdir as tempdir), filtered with unique under the same key, and
    each remaining pair is written with write_seqs.

    Raises ValueError when in_fhands is empty.
    """
    if not in_fhands:
        raise ValueError('At least one input fhand is required')

    ordered_pairs = sorted_items(_read_pairs(in_fhands, paired_reads),
                                 key=_get_pair_key, tempdir=tempdir,
                                 max_items_in_memory=n_seqs_packet)
    deduplicated = unique(ordered_pairs, key=_get_pair_key)
    for pair in deduplicated:
        write_seqs(pair, out_fhand)
def filter_duplicates(in_fhands, out_fhand, paired_reads, use_length=None,
                      n_seqs_packet=None, tempdir=None):
    """Write to out_fhand the pairs read from in_fhands without duplicates.

    Pairs come from _read_pairs and are compared with a _PairKeyGetter
    built with use_length. When n_seqs_packet is None the pairs go
    straight to unique_unordered. Otherwise they are sorted with
    sorted_items (n_seqs_packet forwarded as max_items_in_memory, tempdir
    as tempdir) and then filtered with unique. Every surviving pair is
    written with write_seqs.

    Raises ValueError when in_fhands is empty.
    """
    if not in_fhands:
        raise ValueError('At least one input fhand is required')

    pairs = _read_pairs(in_fhands, paired_reads)
    pair_key = _PairKeyGetter(use_length=use_length)

    if n_seqs_packet is not None:
        ordered_pairs = sorted_items(pairs, key=pair_key, tempdir=tempdir,
                                     max_items_in_memory=n_seqs_packet)
        unique_pairs = unique(ordered_pairs, key=pair_key)
    else:
        unique_pairs = unique_unordered(pairs, key=pair_key)

    for pair in unique_pairs:
        write_seqs(pair, out_fhand)