def count_coverage(readers, comments=True):
    """
    Yield the primary intervals annotated with coverage statistics.

    `readers` is indexed three times: readers[0] is the primary reader,
    readers[1] is the secondary reader whose GenomicInterval items are loaded
    into an interval tree, and readers[2] is a second reader over the
    secondary data that supplies the binned bitsets.

    Header items from the primary reader are always yielded; Comment items
    are yielded only when `comments` is true. Every GenomicInterval gets four
    string fields appended before it is yielded: bases covered, fraction
    covered, and the module-level `full` and `partial` counters.
    """
    global full, partial

    primary = readers[0]
    secondary = readers[1]
    secondary_copy = readers[2]

    # Load every secondary interval into the tree, keyed by chromosome.
    feature_tree = quicksect.IntervalTree()
    for feature in secondary:
        if type(feature) is GenomicInterval:
            feature_tree.insert(feature, secondary.linenum, feature.fields)

    coverage_bits = secondary_copy.binned_bitsets()

    for interval in primary:
        kind = type(interval)
        if kind is Header:
            yield interval
        if kind is Comment:
            if comments:
                yield interval
            continue
        if kind is not GenomicInterval:
            continue

        chrom = interval.chrom
        start = int(interval.start)
        end = int(interval.end)
        span = end - start

        # Defaults describe a chromosome with no secondary coverage at all.
        full = 0
        partial = 0
        bases_covered = 0
        percent = 0.0

        if chrom in coverage_bits:
            bases_covered = coverage_bits[chrom].count_range(start, span)
            # A zero-length interval reports the integer 0, not 0.0.
            percent = float(bases_covered) / float(span) if span != 0 else 0
            if bases_covered:
                # counter() is defined elsewhere; presumably it updates the
                # global full/partial tallies for this interval.
                counter(feature_tree.chroms[chrom], start, end)

        for value in (bases_covered, percent, full, partial):
            interval.fields.append(str(value))
        yield interval
def proximal_region_finder(readers, region, comments=True):
    """
    Returns an iterator that yields elements of the form
    [ <original_interval>, <closest_feature> ].
    Intervals are GenomicInterval objects.

    readers[0] supplies the intervals to annotate and readers[1] the features
    to search. `region` selects the search direction: 'Upstream',
    'Downstream', 'Either' (only the nearer of the two hits is reported), or
    any other value (both the upstream and the downstream hit are reported).
    Header items are passed through; Comment items only when `comments` is
    true. Intervals on a chromosome without features yield nothing.
    """
    primary = readers[0]
    features = readers[1]

    # Translate the requested region into search-direction flags.
    up = region != 'Downstream'
    down = region != 'Upstream'
    either = region == 'Either'

    # Read features into memory:
    feature_tree = quicksect.IntervalTree()
    for feature in features:
        if type(feature) is GenomicInterval:
            feature_tree.insert(feature, features.linenum, feature)

    for interval in primary:
        kind = type(interval)
        if kind is Header:
            yield interval
        if kind is Comment:
            if comments:
                yield interval
            continue
        if kind is not GenomicInterval:
            continue

        chrom = interval.chrom
        start = int(interval.start)
        end = int(interval.end)
        strand = interval.strand
        if chrom not in feature_tree.chroms:
            continue
        root = feature_tree.chroms[chrom]  # root node for the chrom tree

        # Upstream on '+' and downstream on '-' both search before `start`;
        # the mirrored cases search from `end - 1` onwards.
        search_before = (strand == '+' and up) or (strand == '-' and down)
        search_after = (strand == '+' and down) or (strand == '-' and up)

        before_hits = []
        after_hits = []
        if search_before:
            get_closest_feature(root, 1, start, None, before_hits.append, None)
        if search_after:
            get_closest_feature(root, 0, None, end - 1, None, after_hits.append)

        nearest_before = None
        if before_hits:
            # Closest candidate on this side is the first one with the
            # largest end coordinate.
            nearest_before = max(before_hits, key=lambda node: node.end)
        nearest_after = None
        if after_hits:
            # The last reported element is treated as the closest one.
            nearest_after = after_hits[-1]

        if not either:
            if before_hits:
                yield [interval, nearest_before.other]
            if after_hits:
                yield [interval, nearest_after.other]
            continue

        # 'Either': report only the nearer hit; ties go to the `before` side.
        if before_hits and after_hits:
            gap_before = abs(start - int(nearest_before.end))
            gap_after = abs(end - int(nearest_after.start))
            if gap_before <= gap_after:
                chosen = nearest_before
            else:
                chosen = nearest_after
        elif before_hits:
            chosen = nearest_before
        elif after_hits:
            chosen = nearest_after
        else:
            continue
        yield [interval, chosen.other]
# NOTE(review): this is a fragment -- the enclosing definition is not visible
# here and the final loop body appears to be cut off after `end` is computed.

# Move sp_file's position back to the start of the file.
sp_file.seek(0)
# Reader over the interval file, using the configured chrom/start/end/strand
# column indices.
win = NiceReaderWrapper( fileinput.FileInput( int_file ), chrom_col=chr_col_i, start_col=start_col_i, end_col=end_col_i, strand_col=strand_col_i, fix_strand=True)
# Reader over sp_file (re-opened by name): chromosome in column 1, start/end
# in sort_col / sort_col+1, no strand column.
indel = NiceReaderWrapper( fileinput.FileInput( sp_file.name ), chrom_col=1, start_col=sort_col, end_col=sort_col+1, strand_col=-1, fix_strand=True)
# Load every GenomicInterval from `indel` into an interval tree, storing the
# reader's current line number and the item's fields alongside it.
indelTree = quicksect.IntervalTree()
for item in indel:
    if type( item ) is GenomicInterval:
        indelTree.insert( item, indel.linenum, item.fields )
result=[]
# These names are module-level state; presumably they are updated by helper
# functions defined elsewhere -- confirm against the rest of the file.
global full, blk_len, blk_list
for interval in win:
    # Header and Comment items are deliberately ignored.
    if type( interval ) is Header:
        pass
    if type( interval ) is Comment:
        pass
    elif type( interval ) == GenomicInterval:
        # Pull out the window's coordinates as integers.
        chrom = interval.chrom
        start = int(interval.start)
        end = int(interval.end)
def _first_data_fields(path, max_lines=31):
    """Return the tab-split fields of the first data line in `path`.

    Only the first `max_lines` lines are inspected; empty lines and lines
    starting with '#' are ignored. Returns an empty list when no data line is
    found, so callers can run their format check instead of failing on an
    unbound name.
    """
    with open(path) as handle:
        for i, line in enumerate(handle):
            if i >= max_lines:
                break
            line = line.rstrip('\r\n')
            if line and not line.startswith('#'):
                return line.split('\t')
    return []


def _process_windows(infile):
    """Run output_writer once per window that overlaps microsatellites.

    Loads the microsatellite rows of `infile` into an interval tree (using
    the module-level `speciesind` column offsets), then walks the
    module-level `fint` iterable of tab-separated window lines. Returns the
    number of window lines that raised an exception and were skipped.
    """
    msats = NiceReaderWrapper(fileinput.FileInput(infile),
                              chrom_col=speciesind,
                              start_col=speciesind + 1,
                              end_col=speciesind + 2,
                              strand_col=-1,
                              fix_strand=True)
    msatTree = quicksect.IntervalTree()
    for item in msats:
        if type(item) is GenomicInterval:
            msatTree.insert(item, msats.linenum, item.fields)

    skipped = 0
    for iline in fint:
        try:
            iline = iline.rstrip('\r\n')
            if not iline:
                continue
            ielems = iline.strip("\r\n").split('\t')
            ichr = ielems[chr_col_i]
            istart = int(ielems[start_col_i])
            iend = int(ielems[end_col_i])
            # Tree keys have the form "<dbkey>.<chrom>".
            isrc = "%s.%s" % (dbkey_i, ichr)
            if isrc not in msatTree.chroms:
                continue
            result = []
            root = msatTree.chroms[isrc]  # root node for the chrom
            counter(root, istart, iend, result.append)
            if not result:
                continue
            # The overlapping rows are round-tripped through a temp file so
            # that output_writer receives newline-terminated lines.
            tmpfile1 = tempfile.NamedTemporaryFile('wb+')
            try:
                for node in result:
                    tmpfile1.write("%s\n" % "\t".join(node.other))
                tmpfile1.seek(0)
                output_writer(iline, tmpfile1.readlines())
            finally:
                tmpfile1.close()
        except Exception:
            # Best effort: a malformed window is counted, not fatal.
            # SystemExit and KeyboardInterrupt are no longer swallowed here.
            skipped += 1
    return skipped


def _process_alignment_blocks(infile):
    """Run output_writer once per alignment block found in `infile`.

    Writes the column header to the module-level `fout`, then groups
    consecutive valid lines by the integer block number in their first column
    and hands each group to output_writer. Returns 1 if processing was
    aborted by an exception (which is reported on stderr), else 0.
    """
    skipped = 0
    fin = open(infile, 'r')
    try:
        if s_group_cols[0] != -1:
            fout.write("#Window\tSpecies_1\tSpecies_2\tGroupby_Feature\tSubGroupby_Feature\tMutability\tCount" + "\n")
        else:
            fout.write("#Window\tSpecies_1\tWindow_Start\tWindow_End\tSpecies_2\tGroupby_Feature\tMutability\tCount" + "\n")
        prev_bnum = -1  # sentinel: no block read yet
        linestr = ""
        try:
            for line in fin:
                line = line.strip("\r\n")
                if not line:
                    continue
                elems = line.split('\t')
                # Explicit validation (not `assert`, which vanishes under -O).
                try:
                    new_bnum = int(elems[0])
                except ValueError:
                    continue
                # A block number of 0 is rejected, matching the previous
                # truthiness-based check.
                if not new_bnum or len(elems) != 15:
                    continue
                if new_bnum != prev_bnum:
                    if prev_bnum != -1:
                        output_writer(prev_bnum, linestr.strip().replace('\r', '\n').split('\n'))
                    linestr = line + "\n"
                else:
                    linestr += line + "\n"
                prev_bnum = new_bnum
            # Flush the last block, but only if at least one block was read.
            if prev_bnum != -1:
                output_writer(prev_bnum, linestr.strip().replace('\r', '\n').split('\n'))
        except Exception as ea:
            sys.stderr.write("%s\n" % (ea,))
            skipped += 1
    finally:
        fin.close()
    return skipped


def main():
    """Entry point: summarise the microsatellite file named in sys.argv[1].

    The first data line must have exactly 15 tab-separated fields, otherwise
    stop_err is called. Processing is then driven by the module-level
    `region` setting: 'win' works per window from the interval input, 'align'
    works per alignment block. In 'win' mode the globals `winspecies` and
    `speciesind` are set according to which species column contains
    `dbkey_i`.
    """
    global winspecies, speciesind

    infile = sys.argv[1]
    elems = _first_data_fields(infile)
    if len(elems) != 15:
        stop_err( "This tool only works on tabular data output by 'Extract Orthologous Microsatellites from pair-wise alignments' tool. The data in your input dataset is either missing or not formatted properly." )

    if region == 'win':
        if dbkey_i in elems[1]:
            winspecies = 1
            speciesind = 1
        elif dbkey_i in elems[8]:
            winspecies = 2
            speciesind = 8
        else:
            stop_err( "The species build corresponding to your interval file is not present in the Microsatellite file." )
        skipped = _process_windows(infile)
        if skipped:
            print("Skipped %d intervals as invalid." % (skipped))
    elif region == 'align':
        skipped = _process_alignment_blocks(infile)
        if skipped:
            print("Skipped %d lines as invalid." % (skipped))