def processBED(infh, outhandle, scheme, verbose=False):
    """
    Convert reads mapped against two-exon junction sequences into genomic
    intervals and write them to an output stream.

    The chrom field of each read is expected to have the form
    <chrom>_<exon1Start>_<exon1End>_<exon2Start>_<exon2End>; the chromosome
    name itself may contain underscores. Only the first four of these values
    are used here.

    :param infh:      input stream of reads, parsed with BEDIterator.
    :param outhandle: stream with a write() method; receives the output, one
                      interval per line.
    :param scheme:    which exon(s) to output for each read; one of
                      FIRST_EXON, SECOND_EXON, BIGGEST_EXON, FIVE_PRIME_END
                      or BOTH_EXONS.
    :param verbose:   passed through to BEDIterator.
    :raise ValueError: if no output can be selected for a read (unknown
                       scheme, or FIVE_PRIME_END with a strand that is
                       neither "+" nor "-"), or if an output line has an
                       empty chrom.
    """
    for read in BEDIterator(infh, verbose=verbose):
        # split the chrom field to get the genomic indices; splitting from the
        # right keeps any underscores in the chromosome name intact
        parts = read.chrom.rsplit("_", 4)
        chrom = parts[0]
        chrom1SeqStart = int(parts[1])
        chrom1SeqEnd = int(parts[2])
        chrom2SeqStart = int(parts[3])

        firstExon = None
        secondExon = None
        if scheme != SECOND_EXON:
            firstExon = GenomicInterval(chrom,
                                        chrom1SeqStart + read.start - 1,
                                        chrom1SeqEnd, read.name, read.score,
                                        read.strand)
        if scheme != FIRST_EXON:
            end = (chrom2SeqStart +
                   (read.end - (chrom1SeqEnd - chrom1SeqStart)) - 1)
            secondExon = GenomicInterval(chrom, chrom2SeqStart, end,
                                         read.name, read.score, read.strand)

        # we add %1 or %2 to the end of the read names so they can
        # be distinguished later
        if firstExon is not None:
            firstExon.name = firstExon.name + "%1"
        if secondExon is not None:
            secondExon.name = secondExon.name + "%2"

        # arbitrarily decide the first exon contains the largest portion of
        # the read if both are the same
        if (scheme == FIRST_EXON or
                (scheme == BIGGEST_EXON and
                 len(firstExon) >= len(secondExon)) or
                (scheme == FIVE_PRIME_END and read.strand == "+")):
            out = str(firstExon)
        elif (scheme == SECOND_EXON or
                (scheme == BIGGEST_EXON and
                 len(secondExon) > len(firstExon)) or
                (scheme == FIVE_PRIME_END and read.strand == "-")):
            out = str(secondExon)
        elif scheme == BOTH_EXONS:
            out = str(firstExon) + "\n" + str(secondExon)
        else:
            # without this branch the previous read's output would be
            # silently written again (or a NameError raised on the first read)
            raise ValueError("cannot select output for scheme " +
                             str(scheme) + " and read -> " + str(read))

        # sanity check -- make sure we create a valid output string
        for l in out.split("\n"):
            e = parseBEDString(l)
            if e.chrom.strip() == "":
                raise ValueError(" got an emtpy chrom -> " + str(read))

        # write output
        outhandle.write(out + "\n")
def test_basic_iterator(self):
    """
    Iterate over the annotation fixture without an alignment index and check
    that each element matches the one parsed from the corresponding
    individual annotation string.
    """
    annotation_stream = StringIO.StringIO(self.ann)
    elems = list(repeat_masker_iterator(annotation_stream))
    self.assertEqual(len(elems), 6)
    for idx, found in enumerate(elems):
        expected = retrotransposon.from_repeat_masker_string(
            self.indv_an[idx])
        self.assertEqual(found, expected)

    # alignments are not available, so liftover should work only on coords;
    # just check one to make sure it's working
    # elem[0]: 15, 67 -> 85, 141 - (53 to 57; gap_length = 14)
    lifted = elems[0].liftover(GenomicInterval("chr1", 10, 100))
    expected_regions = [
        GenomicInterval("A#B", 128, 142, strand='+'),
        GenomicInterval("A#B", 113, 127, strand='+'),
        GenomicInterval("A#B", 98, 112, strand='+'),
        GenomicInterval("A#B", 86, 97, strand='+'),
    ]
    self.assertEqual(lifted, expected_regions)
def __init__(self, chrom, genomic_start, genomic_end, consensus_match_strand,
             consensus_start, consensus_end, consensus_len, retrotransposon,
             pairwise_alignment=None, uniq_id=None):
    """
    Constructor for RetrotransposonOccurrence objects; see class docstring
    for parameter defs.

    The underlying GenomicInterval is initialised with the name of the
    retrotransposon, a score of 0 and the "+" strand.
    """
    GenomicInterval.__init__(self, chrom, genomic_start, genomic_end,
                             retrotransposon.name, 0, "+")

    # location and orientation of the match within the consensus sequence
    self.consensus_start = consensus_start
    self.consensus_end = consensus_end
    # TODO check that consensus length is at least as long as end index
    self.consensus_len = consensus_len
    self.consensus_match_strand = consensus_match_strand

    # what this is an occurrence of, and an optional identifier for it
    self.retrotransposon = retrotransposon
    self.uniq_id = uniq_id

    # TODO would be good to provide some error-checking here to make sure
    # the alignment matches up properly with this occurrence...
    self.pairwise_alignment = pairwise_alignment
def __str__(self):
    """
    :return: a string representation of this occurrence in the format
             <genomic_interval_details> --> match_start match_end
             consensus_len repeat_name match_strand
             where the fields after the arrow are separated by tabs.
    """
    interval_part = GenomicInterval.__str__(self)
    consensus_fields = [
        str(self.consensus_start),
        str(self.consensus_end),
        str(self.consensus_len),
        self.repeat_name(),
        str(self.consensus_match_strand),
    ]
    return interval_part + " --> " + "\t".join(consensus_fields)
def test_iterator_with_alignment_index(self):
    """
    Iterate over the annotation fixture with an index of pairwise alignments
    supplied, and check both the parsed elements and that liftover makes use
    of the alignments when they exist.
    """
    def extract_UID(rm_alignment):
        # key used to look alignments up in the index
        return rm_alignment.meta[repeatmaskerAlignments.RM_ID_KEY]

    alignment_stream = StringIO.StringIO(self.rm_rc_1_input)
    index = IndexedFile(alignment_stream, repeat_masker_alignment_iterator,
                        extract_UID)

    annotation_stream = StringIO.StringIO(self.ann)
    elems = list(repeat_masker_iterator(annotation_stream,
                                        alignment_index=index))
    self.assertEqual(len(elems), 6)
    for idx, found in enumerate(elems):
        expected = retrotransposon.from_repeat_masker_string(
            self.indv_an[idx])
        self.assertEqual(found, expected)

    # alignments were provided, liftover should be using them; test one
    # to make sure they were matched up properly
    lifted = elems[0].liftover(GenomicInterval("chr1", 10, 100))
    self.assertEqual(lifted, [(132, 142), (120, 131), (88, 118), (85, 87)])

    # also test one of the ones that had no alignment; here we expect
    # failure
    self.assertRaises(IndexError, elems[4].liftover,
                      GenomicInterval("chr1", 15200, 15400))
def __init__(self, whole_chrom_files, partial_chrom_files, factory):
    """
    Constructor; see class docstring for param details.

    Builds one interval tree per chromosome from the keys of
    partial_chrom_files, which are (chrom, start, end) tuples.

    :raise GenomeAlignmentError: if a chromosome appears in both
                                 whole_chrom_files and partial_chrom_files,
                                 or if the region of any partial file does
                                 not hit exactly one interval in the tree
                                 for its chromosome.
    """
    self.current = None
    self.current_key = None
    self.factory = factory
    self.whole_chrom_files = whole_chrom_files
    self.partial_trees = {}

    # group the partial-chromosome regions by the chromosome they are on
    grouped = {}
    for key in partial_chrom_files:
        chrom, start, end = key
        value = partial_chrom_files[key]
        if chrom in whole_chrom_files:
            raise GenomeAlignmentError("Oops")
        region = GenomicInterval(chrom, start, end)
        grouped.setdefault(chrom, []).append(
            JITGenomeAlignmentKeyInterval(region, value))

    for chrom in grouped:
        self.partial_trees[chrom] = IntervalTree(grouped[chrom])

    # each region should find itself in its tree, and nothing else
    for chrom, start, end in partial_chrom_files:
        hits = self.partial_trees[chrom].intersectingInterval(start, end)
        if len(hits) != 1:
            raise GenomeAlignmentError("Oops")
def pairedBEDIterator(inputStreams, mirror=False, mirrorScore=None,
                      ignoreStrand=False, ignoreScore=True, ignoreName=True,
                      sortedby=ITERATOR_SORTED_END, scoreType=float,
                      verbose=False):
    """
    Iterate over multiple BED format files simultaneously and yield lists of
    genomic intervals for each matching set of intervals found. By default,
    regions which are not found in all files will be skipped (mirror = false).
    Optionally (by setting mirror to true) if a file is missing an interval,
    it can be added on-the-fly, and will have the same chrom, start, end,
    name and strand as in the other files. Its score will be that of the
    smallest current element (the first such element in stream order) if
    mirrorScore is not set, otherwise mirrorScore will be used.

    :param inputStreams: a list of input streams in BED format; at least two
                         are required.
    :param mirror:       if true, add missing elements so all streams contain
                         the same elements. Inserted elements will have the
                         same chrom, start, end, name and strand as the
                         element they mirror.
    :param mirrorScore:  score to give to inserted elements; if None, the
                         score of the mirrored element is used.
    :param ignoreStrand: ignore strand when comparing elements for equality?
    :param ignoreScore:  ignore score when comparing elements for equality?
    :param ignoreName:   ignore name when comparing elements for equality?
    :param sortedby:     must be set to one of the sorting orders for BED
                         streams; we require the streams to be sorted in some
                         fashion.
    :param scoreType:    interpret scores as what type? Defaults to float,
                         which is generally the most flexible.
    :param verbose:      passed through to the BEDIterator for each stream.
    """
    # let's build our sorting order...
    sortOrder = ["chrom"]
    if sortedby == ITERATOR_SORTED_START:
        sortOrder.extend(["start", "end"])
    elif sortedby == ITERATOR_SORTED_END:
        sortOrder.extend(["end", "start"])
    if not ignoreStrand:
        sortOrder.append("strand")
    if not ignoreName:
        sortOrder.append("name")
    if not ignoreScore:
        sortOrder.append("score")
    keyFunc = attrgetter(*sortOrder)

    def next_item(iterator):
        """ little internal function to return the next item, or None """
        return next(iterator, None)

    bIterators = [BEDIterator(bfh, verbose=verbose, sortedby=sortedby,
                              scoreType=scoreType)
                  for bfh in inputStreams]
    elements = [next_item(it) for it in bIterators]
    assert (len(elements) >= 2)

    # stop once all streams are exhausted; testing this before each pass
    # (rather than after) means empty input streams simply yield nothing
    while any(e is not None for e in elements):
        if (all(e is not None for e in elements) and
                len(set(keyFunc(x) for x in elements)) == 1):
            # All equal -- yield and move on for all streams
            yield list(elements)
            elements = [next_item(it) for it in bIterators]
            continue

        # something wasn't equal.. find the smallest thing, it's about to
        # drop out of range and will never have the chance to match anything
        # again
        minElement = min((x for x in elements if x is not None), key=keyFunc)
        minKey = keyFunc(minElement)
        minIndices = [i for i, e in enumerate(elements)
                      if e is not None and keyFunc(e) == minKey]

        if mirror:
            # mirror the min item for any streams in which it doesn't match
            score = minElement.score if mirrorScore is None else mirrorScore
            yield [elements[i] if i in minIndices
                   else GenomicInterval(minElement.chrom, minElement.start,
                                        minElement.end, minElement.name,
                                        score, minElement.strand,
                                        scoreType=scoreType)
                   for i in range(0, len(elements))]

        # move the smallest element onwards now, we're done with it
        for index in minIndices:
            elements[index] = next_item(bIterators[index])
def pairedWigIterator(inputStreams, mirror=False, mirrorScore=None,
                      ignoreScore=True, sortedby=ITERATOR_SORTED_END,
                      scoreType=int, verbose=False, debug=False):
    """
    @summary: iterate over multiple wig streams, and yield a list of wig
              elements that match for each location (locations with 0
              matching items are skipped)
    @param inputStreams: list of wig streams, each of which is read with
                         wigIterator; at least two are required
    @param mirror: if true, a location that is missing from some streams is
                   still yielded, with a new element (same chrom, start and
                   end, no name) inserted for each stream that lacks it
    @param mirrorScore: score to give to inserted elements; if None, the
                        score of the element being mirrored is used
    @param ignoreScore: Don't consider score when determining if two
                        elements are equal
    @param sortedby: sorting order of the streams; passed to wigIterator and
                     used to decide the order in which start and end are
                     compared
    @param scoreType: passed to wigIterator for each stream
    @param verbose: passed to wigIterator for each stream
    @param debug: not used by this function
    @note: streams must be sorted -- sortedby parameter determines what
           sorting order will be acceptable
    """
    # let's build our sorting order...
    sortOrder = ["chrom"]
    if sortedby == ITERATOR_SORTED_START:
        sortOrder.extend(["start", "end"])
    elif sortedby == ITERATOR_SORTED_END:
        sortOrder.extend(["end", "start"])
    if not ignoreScore:
        sortOrder.append("score")
    keyFunc = attrgetter(*sortOrder)

    def next_item(iterator):
        """ little internal function to return the next item, or None """
        return next(iterator, None)

    wIterators = [wigIterator(fh, verbose=verbose, sortedby=sortedby,
                              scoreType=scoreType)
                  for fh in inputStreams]
    elements = [next_item(it) for it in wIterators]
    assert (len(elements) >= 2)

    # stop once all streams are exhausted; testing this before each pass
    # (rather than after) means empty input streams simply yield nothing
    while any(e is not None for e in elements):
        if (all(e is not None for e in elements) and
                len(set(keyFunc(x) for x in elements)) == 1):
            # All equal -- yield and move on for all streams
            yield list(elements)
            elements = [next_item(it) for it in wIterators]
            continue

        # something wasn't equal.... find the smallest thing, it's about
        # to drop out of range...
        minElement = min((x for x in elements if x is not None), key=keyFunc)
        minKey = keyFunc(minElement)
        minIndices = [i for i, e in enumerate(elements)
                      if e is not None and keyFunc(e) == minKey]

        if mirror:
            # mirror the min item for any streams in which it doesn't match
            score = minElement.score if mirrorScore is None else mirrorScore
            yield [elements[i] if i in minIndices
                   else GenomicInterval(minElement.chrom, minElement.start,
                                        minElement.end, None, score)
                   for i in range(0, len(elements))]

        # move the smallest element onwards now, we're done with it
        for index in minIndices:
            elements[index] = next_item(wIterators[index])
def fixedWigIterator(fd, verbose=False, sortedby=None, scoreType=int):
    """
    @summary: iterate over a fixedStep wig stream, yielding one
              GenomicInterval for each data value. Each interval runs from
              the current position to the current position plus the step
              size, has no name, and has the data value as its score.
    @param fd: the input; opened with openFD
    @param verbose: if true, show progress while reading; this is disabled
                    with a warning if the stream has no name
    @param sortedby: if set to ITERATOR_SORTED_START, check that elements
                     are ordered by chrom and then by start, and raise an
                     error if they are not; no other ordering is checked
    @param scoreType: passed to GenomicInterval for each element
    @raise WigIteratorError: if the stream violates the required sort order,
                             or contains a data value before any fixedStep
                             declaration
    @note: 'fixedStep' declaration lines are expected to give chrom, start
           and step, in that order
    """
    fh = openFD(fd)
    # the input might be a stream without a name (the progress code below
    # allows for that), so don't rely on fd.name when building error messages
    streamName = getattr(fd, "name", str(fd))

    if verbose:
        try:
            pind = ProgressIndicator(totalToDo=os.path.getsize(fh.name),
                                     messagePrefix="completed",
                                     messageSuffix="of processing " + fh.name)
        except AttributeError:
            sys.stderr.write("WigIterator -- warning: " +
                             "unable to show progress for stream")
            verbose = False

    chromsSeen = set()
    prev = None
    currentChrom, at, step = None, None, None

    for line in fh:
        line = line.strip()
        if line == "":
            continue

        if line[0] == 't' or line[0] == 'f':
            # declaration lines; anything other than 'track' or 'fixedStep'
            # that starts with one of these letters is ignored
            parts = line.split()
            if parts[0] == "fixedStep":
                currentChrom = parts[1].split("=")[1]
                at = int(parts[2].split("=")[1])
                step = int(parts[3].split("=")[1])
            continue

        if currentChrom is None:
            # there is no chrom, position or step to give this value yet
            raise WigIteratorError("Wig file " + streamName +
                                   " has a data value before any fixedStep " +
                                   "declaration - " + line)

        val = float(line)
        e = GenomicInterval(currentChrom, at, at + step, None, val,
                            scoreType=scoreType)

        # on same chrom as the prev item, make sure order is right
        if prev is not None and sortedby is not None and \
                e.chrom == prev.chrom:
            if sortedby == ITERATOR_SORTED_START and prev.start > e.start:
                raise WigIteratorError(
                    "Wig file " + streamName +
                    " not sorted by start index - saw item " +
                    str(prev) + " before " + str(e))

        # starting a new chrom.. make sure we haven't already seen it
        if prev is not None and prev.chrom != e.chrom:
            if (sortedby == ITERATOR_SORTED_START) and \
                    (e.chrom in chromsSeen or prev.chrom > e.chrom):
                raise WigIteratorError("Wig file " + streamName +
                                       " not sorted by chrom")
        # record every chrom, including the very first one; membership is
        # only tested above when the chrom changes
        chromsSeen.add(e.chrom)

        # all good..
        yield e
        prev = e
        at += step

        if verbose:
            pind.done = fh.tell()
            pind.showProgress()