def index_repeatmasker_alignment_by_id(fh, out_fh, vebrose=False):
    """Build an index for a repeat-masker alignment file by repeat-masker ID.

    :param fh:      the repeat-masker alignment file to index; it is handed
                    to IndexedFile as the file being indexed.
    :param out_fh:  where the finished index is written.
    :param vebrose: verbosity flag, forwarded to the index writer. The
                    misspelt parameter name is kept on purpose so existing
                    keyword callers keep working.
    """
    def extract_UID(rm_alignment):
        """Return the repeat-masker ID stored in the alignment's meta data."""
        return rm_alignment.meta[multipleAlignment.RM_ID_KEY]

    index = IndexedFile(fh, repeat_masker_alignment_iterator, extract_UID)
    # Forward the flag the same way index_genome_alignment_by_locus does;
    # previously it was accepted and then silently ignored.
    index.write_index(out_fh, verbose=vebrose)
def index_genome_alignment_by_locus(fh, out_fh, verbose=False,
                                    reference_species="hg19"):
    """Build an index for a genome alig. using coords in ref genome as keys.

    :param fh:                the genome alignment file to index.
    :param out_fh:            where the finished index is written.
    :param verbose:           forwarded to the index writer.
    :param reference_species: name of the species whose coordinates are used
                              as the index keys. Defaults to "hg19", which
                              was previously the only (hard-coded) choice.
    """
    # The iterator is bound with index_friendly=True, as it was before this
    # parameter was introduced.
    bound_iter = functools.partial(genome_alignment_iterator,
                                   reference_species=reference_species,
                                   index_friendly=True)
    hash_func = JustInTimeGenomeAlignmentBlock.build_hash
    idx = IndexedFile(fh, bound_iter, hash_func)
    idx.write_index(out_fh, verbose=verbose)
def test_index_lookup(self, mock_open):
    """Test lookup of specific key using full UI.

    The mocked ``open`` hands back in-memory streams for the three file
    names the UI is asked to use, so no real files are touched. The block
    written to the output stream must match what a direct index lookup of
    the same key returns.
    """
    maf_stream = StringIO.StringIO(self.ga_maf1)
    index_stream = StringIO.StringIO()
    result_stream = StringIO.StringIO()
    fake_files = {
        "one.maf": maf_stream,
        "one.idx": index_stream,
        "out.txt": result_stream,
    }

    # stand-in for open(): known names map to the streams above
    def open_side_effect(*args, **kwargs):
        filename = args[0]
        if not isinstance(filename, basestring):
            raise TypeError()
        if filename in fake_files:
            return fake_files[filename]
        raise IOError("No such file")

    mock_open.side_effect = open_side_effect

    # build the index into index_stream, then rewind it for the UI to read
    block_iter = functools.partial(genome_alignment_iterator,
                                   reference_species="hg19")
    index = IndexedFile(StringIO.StringIO(self.ga_maf1), block_iter,
                        JustInTimeGenomeAlignmentBlock.build_hash)
    index.write_index(index_stream)
    index_stream.seek(0)

    key = "\t".join(["chr22", "1772", "1825"])
    main(["lookup", "-o", "out.txt", "-k", key, "one.maf", "one.idx"])

    expected = str(index[key]).strip()
    observed = str(result_stream.getvalue()).strip()
    self.assertEqual(expected, observed)
def test_index_lookup(self, mock_open):
    """Test lookup of specific key using full UI.

    ``open`` is replaced by a side effect that serves StringIO objects for
    "one.maf", "one.idx" and "out.txt"; the test then runs the "lookup"
    command and compares its output with a direct lookup in the index.
    """
    source_strm = StringIO.StringIO(self.ga_maf1)
    built_idx_strm = StringIO.StringIO()
    captured_strm = StringIO.StringIO()

    def open_side_effect(*args, **kwargs):
        # only string-like file names are accepted by the fake open()
        if not isinstance(args[0], basestring):
            raise TypeError()
        if args[0] == "one.maf":
            return source_strm
        if args[0] == "one.idx":
            return built_idx_strm
        if args[0] == "out.txt":
            return captured_strm
        raise IOError("No such file")

    mock_open.side_effect = open_side_effect

    # index the alignment, writing the index to the stream served as one.idx
    make_blocks = functools.partial(genome_alignment_iterator,
                                    reference_species="hg19")
    block_hash = JustInTimeGenomeAlignmentBlock.build_hash
    ga_index = IndexedFile(StringIO.StringIO(self.ga_maf1), make_blocks,
                           block_hash)
    ga_index.write_index(built_idx_strm)
    built_idx_strm.seek(0)

    key = "chr22\t1772\t1825"
    cmd_line = ["lookup", "-o", "out.txt", "-k", key, "one.maf", "one.idx"]
    main(cmd_line)
    self.assertEqual(str(ga_index[key]).strip(),
                     str(captured_strm.getvalue()).strip())
def lookup_genome_alignment_index(index_fh, indexed_fh, out_fh=sys.stdout,
                                  key=None, verbose=False,
                                  reference_species="hg19"):
    """Load a GA index and its indexed file and extract one or more blocks.

    :param index_fh:   the index file to load. Can be a filename or a
                       stream-like object.
    :param indexed_fh: the file that the index was built for,
    :param out_fh:     stream each retrieved block is written to, one block
                       followed by a newline per key.
    :param key:        A single key, iterable of keys, or None. This key
                       will be used for lookup. If None, user is prompted to
                       enter keys interactively; an empty line or
                       end-of-input on stdin finishes the session.
    :param verbose:    forwarded to the index reader.
    :param reference_species: reference species handed to the genome
                       alignment iterator; defaults to "hg19", the value
                       that used to be hard-coded.
    """
    def _normalise(raw_key):
        # we know keys for genome alignments have tabs as delims, so any
        # run of whitespace in the given key is collapsed to a single tab.
        return '\t'.join(raw_key.split())

    # load the genome alignment as a JIT object
    bound_iter = functools.partial(genome_alignment_iterator,
                                   reference_species=reference_species,
                                   index_friendly=True)
    hash_func = JustInTimeGenomeAlignmentBlock.build_hash
    idx = IndexedFile(record_iterator=bound_iter,
                      record_hash_function=hash_func)
    idx.read_index(index_fh, indexed_fh, verbose=verbose)

    if key is None:
        while True:
            sys.stderr.write("[WAITING FOR KEY ENTRY ON STDIN; " +
                             "END WITH EMPTY LINE]\n")
            try:
                entered = raw_input()
            except EOFError:
                # stdin was closed (e.g. piped input ran out): treat this
                # like the empty line that ends an interactive session.
                break
            entered = _normalise(entered)
            if entered == "":
                break
            out_fh.write(str(idx[entered]) + "\n")
            sys.stderr.write("\n")
        return

    # A lone string is one key; anything else is taken to be an iterable of
    # keys, as the parameter documentation has always promised.
    keys = [key] if isinstance(key, basestring) else key
    for single_key in keys:
        out_fh.write(str(idx[_normalise(single_key)]) + "\n")
def build_genome_alignment_from_file(ga_path, ref_spec, idx_path=None,
                                     verbose=False):
    """
    Construct a GenomeAlignment from one MAF file.

    :param ga_path:  the path to the file to load.
    :param ref_spec: which species in the MAF file is the reference?
    :param idx_path: if provided, use this index to generate a just-in-time
                     genome alignment, instead of loading the file
                     immediately.
    :param verbose:  passed to the iterator / index reader and to the
                     resulting GenomeAlignment; also switches on a progress
                     indicator while just-in-time blocks are created.
    """
    # No index: read every block from the file straight away.
    if idx_path is None:
        loaded = list(genome_alignment_iterator(ga_path, ref_spec,
                                                verbose=verbose))
        return GenomeAlignment(loaded, verbose)

    # Index given: create one just-in-time placeholder block per index key.
    block_iter = functools.partial(genome_alignment_iterator,
                                   reference_species=ref_spec)
    index = IndexedFile(None, block_iter,
                        JustInTimeGenomeAlignmentBlock.build_hash)
    index.read_index(idx_path, ga_path, verbose=verbose)

    jit_blocks = []
    progress = None
    for block_key in index:
        if verbose:
            # the indicator is created lazily, on the first key only
            if progress is None:
                progress = ProgressIndicator(
                    totalToDo=len(index),
                    messagePrefix="completed",
                    messageSuffix="building alignment blocks ")
            progress.done += 1
            progress.showProgress()
        jit_blocks.append(JustInTimeGenomeAlignmentBlock(index, block_key))
    return GenomeAlignment(jit_blocks, verbose)
def _build_index(in_strng, ref_spec):
    """Index the genome alignment text in_strng; return the index stream.

    :param in_strng: alignment file content, as a string; it is wrapped in
                     a StringIO before being indexed.
    :param ref_spec: reference species handed to the alignment iterator.
    :return: a StringIO holding the written index, rewound to position 0.
    """
    alignment_in = StringIO.StringIO(in_strng)
    index_out = StringIO.StringIO()
    block_iter = functools.partial(genome_alignment_iterator,
                                   reference_species=ref_spec)
    indexed = IndexedFile(alignment_in, block_iter,
                          JustInTimeGenomeAlignmentBlock.build_hash)
    indexed.write_index(index_out)
    # rewind so the caller can read the index from the beginning
    index_out.seek(0)
    return index_out
def _build_index(maf_strm, ref_spec):
    """Index a MAF genome alignment stream and return the index as StringIO.

    :param maf_strm: the alignment stream to index; passed to IndexedFile
                     as-is.
    :param ref_spec: reference species handed to the alignment iterator.
    :return: a StringIO containing the index, positioned at its start.
    """
    make_blocks = functools.partial(genome_alignment_iterator,
                                    reference_species=ref_spec)
    key_func = JustInTimeGenomeAlignmentBlock.build_hash
    result = StringIO.StringIO()
    IndexedFile(maf_strm, make_blocks, key_func).write_index(result)
    result.seek(0)  # hand the stream back rewound
    return result
def build_genome_alignment_from_file(ga_path, ref_spec, idx_path=None,
                                     verbose=False):
    """
    Load a single MAF file as a genome alignment.

    :param ga_path:  the path to the file to load.
    :param ref_spec: which species in the MAF file is the reference?
    :param idx_path: if provided, use this index to generate a just-in-time
                     genome alignment, instead of loading the file
                     immediately.
    :param verbose:  passed on to the readers and to the GenomeAlignment
                     that is returned; when set, progress is reported while
                     just-in-time blocks are being created.
    """
    blocks = []
    have_index = idx_path is not None

    if not have_index:
        # eager mode: pull every block out of the file now
        blocks.extend(genome_alignment_iterator(ga_path, ref_spec,
                                                verbose=verbose))
    else:
        # lazy mode: one placeholder block per key found in the index
        reader = functools.partial(genome_alignment_iterator,
                                   reference_species=ref_spec)
        hasher = JustInTimeGenomeAlignmentBlock.build_hash
        factory = IndexedFile(None, reader, hasher)
        factory.read_index(idx_path, ga_path, verbose=verbose)

        indicator = None
        for key in factory:
            if verbose and indicator is None:
                # built on the first key, so nothing is created if the
                # index holds no keys
                indicator = ProgressIndicator(
                    totalToDo=len(factory),
                    messagePrefix="completed",
                    messageSuffix="building alignment blocks ")
            if verbose:
                indicator.done += 1
                indicator.showProgress()
            blocks.append(JustInTimeGenomeAlignmentBlock(factory, key))

    return GenomeAlignment(blocks, verbose)
def test_repeat_masker_on_demand_load(self):
    """
    Tests wrapping the alignment iterator in an index and using this index
    to build RM alignment objects that are loaded on-demand from the indexed
    stream.

    Each case is (record index, number of trailing meta lines, column
    width, name width, repeat-masker ID). The alignment part of a record is
    compared line by line; the trailing meta lines are compared as a set.
    """
    from pyokit.io.indexedFile import IndexedFile

    def extract_UID(rm_alignment):
        return rm_alignment.meta[RM_ID_KEY]

    def split_record(text, trail_meta_size):
        """Split text into (non-blank alignment lines, trailing meta lines).

        The cut point is computed explicitly: slicing with a negated size
        breaks for a size of 0, where [:-0] is empty and [-0:] is the whole
        list, which used to make the alignment comparison vacuous.
        """
        lines = [line.rstrip() for line in text.split("\n")]
        cut = max(len(lines) - trail_meta_size, 0)
        alig_lines = [line for line in lines[:cut] if line.strip() != ""]
        return alig_lines, lines[cut:]

    s_io = StringIO.StringIO(self.rm_rc_1_input)
    index = IndexedFile(s_io, repeat_masker_alignment_iterator, extract_UID)

    cases = [(0, 4, 29, None, 5),
             (1, 4, 30, None, 10),
             (2, 3, 31, 13, 15),
             (3, 0, 22, 13, 231)]
    for i, trail_meta_size, c_width, m_width, rm_id in cases:
        on_d_alig = JustInTimePairwiseAlignment(index, rm_id)
        on_d_str = _to_repeatmasker_string(on_d_alig, column_width=c_width,
                                           m_name_width=m_width)

        # strip out the last few lines; these should all be there, but
        # order isn't important.
        alig_actual, meta_actual = split_record(self.rm_tc1_records[i],
                                                trail_meta_size)
        alig_result, meta_result = split_record(on_d_str, trail_meta_size)

        # assertEqual replaces the deprecated failUnlessEqual alias
        self.assertEqual(alig_actual, alig_result)
        self.assertEqual(set(meta_actual), set(meta_result))
def test_iterator_with_alignment_index(self):
    """Iterate repeat-masker annotations with an alignment index attached.

    Checks that six elements are produced and that each equals the
    retrotransposon parsed from the matching entry of self.indv_an, then
    checks liftover for one element with an alignment and one without.
    """
    def extract_UID(rm_alignment):
        return rm_alignment.meta[repeatmaskerAlignments.RM_ID_KEY]

    alignment_stream = StringIO.StringIO(self.rm_rc_1_input)
    index = IndexedFile(alignment_stream, repeat_masker_alignment_iterator,
                        extract_UID)
    annotation_stream = StringIO.StringIO(self.ann)
    elems = list(repeat_masker_iterator(annotation_stream,
                                        alignment_index=index))

    self.assertEqual(len(elems), 6)
    for position, element in enumerate(elems):
        expected = retrotransposon.from_repeat_masker_string(
            self.indv_an[position])
        self.assertEqual(element, expected)

    # alignments were provided, liftover should be using them; test one
    # to make sure they were matched up properly
    lifted = elems[0].liftover(GenomicInterval("chr1", 10, 100))
    self.assertEqual(lifted, [(132, 142), (120, 131), (88, 118), (85, 87)])

    # also test one of the ones that had no alignment; here we expect
    # failure
    no_alignment_region = GenomicInterval("chr1", 15200, 15400)
    self.assertRaises(IndexError, elems[4].liftover, no_alignment_region)