예제 #1
0
def count_coverage(readers, comments=True):
    """
    Annotate every primary interval with its coverage by the secondary set.

    ``readers`` must provide three readers: ``readers[0]`` is the primary
    interval reader, ``readers[1]`` and ``readers[2]`` are two separate
    readers that are both treated as the secondary intervals (the first is
    consumed to build an interval tree, the second to build binned bitsets).

    Yields, in input order:
      * ``Header`` items unchanged;
      * ``Comment`` items unchanged, only when ``comments`` is true;
      * each ``GenomicInterval`` after four string fields were appended to
        ``interval.fields``: bases covered, covered fraction
        (bases covered / interval length, always a float), and the values of
        the module-level ``full`` and ``partial`` counters.

    The module-level ``full`` and ``partial`` globals are reset to 0 for each
    interval; they are presumably updated by ``counter`` (defined elsewhere)
    -- confirm against that helper.
    """
    global full, partial

    primary = readers[0]
    secondary = readers[1]
    secondary_copy = readers[2]

    # Index the secondary intervals so counter() can walk the per-chromosome
    # tree for each primary interval.
    rightTree = quicksect.IntervalTree()
    for item in secondary:
        if type(item) is GenomicInterval:
            rightTree.insert(item, secondary.linenum, item.fields)

    bitsets = secondary_copy.binned_bitsets()

    for interval in primary:
        if type(interval) is Header:
            yield interval
        elif type(interval) is Comment:
            if comments:
                yield interval
        elif type(interval) == GenomicInterval:
            chrom = interval.chrom
            start = int(interval.start)
            end = int(interval.end)
            length = end - start

            # Defaults cover chromosomes absent from the secondary set and
            # zero-length intervals; percent stays a float in every case so
            # the output column is formatted consistently.
            full = 0
            partial = 0
            bases_covered = 0
            percent = 0.0

            if chrom in bitsets:
                bases_covered = bitsets[chrom].count_range(start, length)
                if length != 0:
                    percent = float(bases_covered) / float(length)
                if bases_covered:
                    # Root node of this chromosome's interval tree.
                    root = rightTree.chroms[chrom]
                    counter(root, start, end)

            interval.fields.append(str(bases_covered))
            interval.fields.append(str(percent))
            interval.fields.append(str(full))
            interval.fields.append(str(partial))
            yield interval
예제 #2
0
def proximal_region_finder(readers, region, comments=True):
    """
    Iterate over the primary intervals and pair each with its closest feature.

    ``readers[0]`` supplies the primary intervals and ``readers[1]`` the
    features. ``region`` selects the search direction: 'Upstream',
    'Downstream', 'Either' (only the nearer of the two hits is reported), or
    any other value (both the upstream and the downstream hit are reported).

    Header items are yielded unchanged, Comment items are yielded unchanged
    when ``comments`` is true, and every match is yielded as a list of the
    form [ <original_interval>, <closest_feature> ].
    Intervals are GenomicInterval objects.
    """
    primary = readers[0]
    features = readers[1]

    # Translate the requested region into search-direction flags.
    up = region != 'Upstream' and region != 'Downstream' or region == 'Upstream'
    down = region != 'Upstream'
    either = region == 'Either'

    # Read features into memory:
    rightTree = quicksect.IntervalTree()
    for feature in features:
        if type(feature) is GenomicInterval:
            rightTree.insert(feature, features.linenum, feature)

    for interval in primary:
        if type(interval) is Header or (type(interval) is Comment
                                        and comments):
            yield interval
            continue
        if type(interval) != GenomicInterval:
            continue

        chrom = interval.chrom
        start = int(interval.start)
        end = int(interval.end)
        strand = interval.strand
        if chrom not in rightTree.chroms:
            continue

        root = rightTree.chroms[chrom]  # root node for the chrom tree
        upstream_hits = []
        downstream_hits = []
        on_plus = strand == '+'
        on_minus = strand == '-'

        # Upstream on the + strand / downstream on the - strand.
        if (on_plus and up) or (on_minus and down):
            get_closest_feature(root, 1, start, None, upstream_hits.append,
                                None)

        # Downstream on the + strand / upstream on the - strand.
        if (on_plus and down) or (on_minus and up):
            get_closest_feature(root, 0, None, end - 1, None,
                                downstream_hits.append)

        if upstream_hits:
            # Among the upstream candidates the closest one is the first
            # candidate carrying the largest end coordinate.
            nearest_up = 0
            if len(upstream_hits) > 1:
                ends = [node.end for node in upstream_hits]
                nearest_up = ends.index(max(ends))
            if not either:
                yield [interval, upstream_hits[nearest_up].other]

        # The last element of downstream_hits is the closest one to the
        # given interval.
        if downstream_hits and not either:
            yield [interval, downstream_hits[-1].other]

        if either and (upstream_hits or downstream_hits):
            if upstream_hits and downstream_hits:
                gap_up = abs(start - int(upstream_hits[nearest_up].end))
                gap_down = abs(end - int(downstream_hits[-1].start))
                # Ties go to the upstream feature.
                if gap_up <= gap_down:
                    chosen = upstream_hits[nearest_up]
                else:
                    chosen = downstream_hits[-1]
            elif upstream_hits:
                chosen = upstream_hits[nearest_up]
            else:
                chosen = downstream_hits[-1]
            yield [interval, chosen.other]
예제 #3
0
 # Rewind sp_file to its beginning.
 sp_file.seek(0)
 # Reader over the interval (window) file, using the caller-supplied
 # chromosome/start/end/strand column indices.
 win = NiceReaderWrapper( fileinput.FileInput( int_file ),
                             chrom_col=chr_col_i,
                             start_col=start_col_i,
                             end_col=end_col_i,
                             strand_col=strand_col_i,
                             fix_strand=True)
 
 # Reader over the file named by sp_file.name: chromosome in column 1, start
 # in column sort_col and end in column sort_col+1.
 # NOTE(review): strand_col=-1 presumably means "no strand column" - confirm.
 indel = NiceReaderWrapper( fileinput.FileInput( sp_file.name ),
                             chrom_col=1,
                             start_col=sort_col,
                             end_col=sort_col+1,
                             strand_col=-1,
                             fix_strand=True)
 
 # Load every GenomicInterval from the indel reader into an interval tree,
 # passing the reader's current line number and the row's fields along with it.
 indelTree = quicksect.IntervalTree()
 for item in indel:
     if type( item ) is GenomicInterval:
         indelTree.insert( item, indel.linenum, item.fields )
 result=[]
 
 # These module-level names are declared global here; their use is outside
 # the visible part of this fragment.
 global full, blk_len, blk_list
 for interval in win:
     # Header and Comment rows are ignored; only GenomicInterval rows are
     # processed.
     if type( interval ) is Header:
         pass
     if type( interval ) is Comment:
         pass
     elif type( interval ) == GenomicInterval:
         # Coordinates of the current window, converted to integers.
         chrom = interval.chrom
         start = int(interval.start)
         end = int(interval.end)
예제 #4
0
def main():
    """
    Entry point: group microsatellite records and hand them to output_writer.

    The input path is taken from ``sys.argv[1]``. The first non-empty,
    non-comment line found within the first 31 lines must have exactly 15
    tab-separated columns, otherwise ``stop_err`` is called.

    Behaviour depends on the module-level ``region`` setting:

    * ``'win'``: decides which species column (1 or 8) contains ``dbkey_i``,
      stores that in the ``winspecies`` / ``speciesind`` globals, indexes the
      microsatellite rows in an interval tree, and for every line of the
      module-level ``fint`` passes the rows collected by ``counter`` to
      ``output_writer`` together with the interval line. Interval lines that
      raise an error are counted and reported as skipped.
    * ``'align'``: writes a header line to ``fout``, then groups consecutive
      input lines sharing the same block number (column 0) and passes each
      group to ``output_writer``.

    Relies on names defined elsewhere in the module: ``region``, ``dbkey_i``,
    ``fint``, ``fout``, ``chr_col_i``, ``start_col_i``, ``end_col_i``,
    ``s_group_cols``, ``stop_err``, ``counter``, ``output_writer``.
    """
    global winspecies, speciesind

    infile = sys.argv[1]

    # Locate the first data line. elems starts empty so that a file with no
    # data line is reported through stop_err below instead of failing with a
    # NameError.
    elems = []
    with open(infile) as probe:
        for i, line in enumerate(probe):
            line = line.rstrip('\r\n')
            if len(line) > 0 and not line.startswith('#'):
                elems = line.split('\t')
                break
            if i == 30:
                break  # Hopefully we'll never get here...

    if len(elems) != 15:
        stop_err(
            "This tool only works on tabular data output by 'Extract Orthologous Microsatellites from pair-wise alignments' tool. The data in your input dataset is either missing or not formatted properly."
        )

    skipped = 0

    if region == 'win':
        if dbkey_i in elems[1]:
            winspecies = 1
            speciesind = 1
        elif dbkey_i in elems[8]:
            winspecies = 2
            speciesind = 8
        else:
            stop_err(
                "The species build corresponding to your interval file is not present in the Microsatellite file."
            )

        msats = NiceReaderWrapper(fileinput.FileInput(infile),
                                  chrom_col=speciesind,
                                  start_col=speciesind + 1,
                                  end_col=speciesind + 2,
                                  strand_col=-1,
                                  fix_strand=True)
        msatTree = quicksect.IntervalTree()
        for item in msats:
            if type(item) is GenomicInterval:
                msatTree.insert(item, msats.linenum, item.fields)

        for iline in fint:
            # Best effort: a malformed interval line is counted and skipped.
            # Only Exception is caught so that SystemExit (e.g. raised while
            # reporting a fatal error) and KeyboardInterrupt still propagate.
            try:
                iline = iline.rstrip('\r\n')
                if not iline:
                    continue
                ielems = iline.split('\t')
                ichr = ielems[chr_col_i]
                istart = int(ielems[start_col_i])
                iend = int(ielems[end_col_i])
                isrc = "%s.%s" % (dbkey_i, ichr)
                if isrc not in msatTree.chroms:
                    continue
                result = []
                root = msatTree.chroms[isrc]  # root node for the chrom
                counter(root, istart, iend, result.append)
                if not result:
                    continue
                # One newline-terminated, tab-joined line per collected node
                # (built in memory rather than via a temporary file).
                rows = ["%s\n" % "\t".join(node.other) for node in result]
                output_writer(iline, rows)
            except Exception:
                skipped += 1
        if skipped:
            sys.stdout.write("Skipped %d intervals as invalid.\n" % skipped)
    elif region == 'align':
        if s_group_cols[0] != -1:
            fout.write(
                "#Window\tSpecies_1\tSpecies_2\tGroupby_Feature\tSubGroupby_Feature\tMutability\tCount\n"
            )
        else:
            fout.write(
                "#Window\tSpecies_1\tWindow_Start\tWindow_End\tSpecies_2\tGroupby_Feature\tMutability\tCount\n"
            )
        prev_bnum = -1
        linestr = ""
        with open(infile, 'r') as fin:
            try:
                for line in fin:
                    line = line.strip("\r\n")
                    if not line:
                        continue
                    elems = line.split('\t')
                    # Skip lines whose first column is not an integer block
                    # number or that do not have 15 columns.
                    try:
                        new_bnum = int(elems[0])
                    except ValueError:
                        continue
                    # NOTE(review): a block number of 0 is skipped as well,
                    # matching the previous `assert int(elems[0])` check -
                    # confirm that this is intended.
                    if not new_bnum or len(elems) != 15:
                        continue
                    if new_bnum != prev_bnum:
                        # A new block starts: flush the previous one first.
                        if prev_bnum != -1:
                            output_writer(
                                prev_bnum,
                                linestr.strip().replace('\r',
                                                        '\n').split('\n'))
                        linestr = line + "\n"
                    else:
                        linestr += line + "\n"
                    prev_bnum = new_bnum
                # Flush the last block.
                output_writer(
                    prev_bnum,
                    linestr.strip().replace('\r', '\n').split('\n'))
            except Exception as ea:
                sys.stderr.write("%s\n" % (ea, ))
                skipped += 1
        if skipped:
            sys.stdout.write("Skipped %d lines as invalid.\n" % skipped)