Ejemplo n.º 1
0
def join(leftSet, rightSet, mincols=1, leftfill=True, rightfill=True):
    """Yield the rows of an interval join of ``leftSet`` against ``rightSet``.

    Every ``GenomicInterval`` read from ``rightSet`` is inserted into an
    in-memory ``IntervalTree``.  ``leftSet`` is then streamed, and each of its
    intervals is paired with the right-hand entries the tree reports as
    intersecting it.

    Args:
        leftSet: Iterable of items.  Items that are not ``GenomicInterval``
            instances are yielded unchanged.
        rightSet: Iterable of items; its ``linenum`` attribute is read while
            iterating.  Only ``GenomicInterval`` items are used.
        mincols: Minimum overlap (in ``end - start`` units) a pair must reach
            to be reported.
        leftfill: When true, right-hand entries that never took part in a
            reported pair are emitted at the end, preceded by one ``"."`` per
            left-hand field.
        rightfill: When true, a left-hand interval with no reported partner is
            emitted followed by one ``"."`` per right-hand field.

    Yields:
        Non-interval items of ``leftSet`` as-is, and a list of fields for
        every joined or padded row.
    """
    # Read rightSet into memory.
    right_tree = IntervalTree()
    rightlen = 0
    for item in rightSet:
        if isinstance(item, GenomicInterval):
            right_tree.insert(item, rightSet.linenum, item.fields)
            if rightlen == 0:
                rightlen = item.nfields

    leftlen = 0
    for interval in leftSet:
        if not isinstance(interval, GenomicInterval):
            yield interval
            continue
        if leftlen == 0:
            leftlen = interval.nfields

        matches = []
        right_tree.intersect(interval, matches.append)

        lo, hi = interval.start, interval.end
        joined = False
        for item in matches:
            # Plain comparisons instead of ``x in range(lo, hi + 1)``: same
            # answer for integer coordinates, without building a list of
            # every position on Python 2.
            start_inside = lo <= item.start <= hi
            end_inside = lo <= item.end <= hi
            if start_inside and not end_inside:
                overlap = hi - item.start
            elif end_inside and not start_inside:
                overlap = item.end - lo
            elif start_inside:
                # Both ends inside: the item lies within the interval.
                overlap = item.end - item.start
            else:
                # The intersecting item's start and end are both outside the
                # interval range, i.e. it spans the whole interval.
                overlap = hi - lo
            if overlap < mincols:
                continue

            outfields = list(interval)
            # ``extend`` rather than ``map(outfields.append, ...)``: on
            # Python 3 ``map`` is lazy, so the appends would never run.
            outfields.extend(item.other)
            item.visited = True  # checked by the leftfill pass below
            joined = True
            yield outfields

        if rightfill and not joined:
            yield list(interval) + ["."] * rightlen

    if leftfill:
        unvisited = []

        def collect_unvisited(node):
            if not hasattr(node, "visited"):
                unvisited.append(node)

        right_tree.traverse(collect_unvisited)
        for item in unvisited:
            outfields = ["."] * leftlen
            outfields.extend(item.other)
            yield outfields
Ejemplo n.º 2
0
def join(leftSet, rightSet, mincols=1, leftfill=True, rightfill=True):
    """Join the intervals of ``leftSet`` with those of ``rightSet``.

    The ``GenomicInterval`` items of ``rightSet`` are loaded into an
    ``IntervalTree`` first.  ``leftSet`` is then consumed lazily; for each of
    its intervals the tree is queried and one row is produced per right-hand
    entry whose overlap reaches ``mincols``.

    Args:
        leftSet: Iterable of items.  Anything that is not a
            ``GenomicInterval`` is passed through unchanged.
        rightSet: Iterable of items; ``rightSet.linenum`` is read for every
            interval inserted into the tree.
        mincols: Smallest overlap (difference of coordinates) that counts as
            a match.
        leftfill: If true, finish by emitting every right-hand entry that was
            never matched, padded on the left with ``"."`` for each left-hand
            field.
        rightfill: If true, emit unmatched left-hand intervals padded on the
            right with ``"."`` for each right-hand field.

    Yields:
        Pass-through items, and lists of fields for matched or padded rows.
    """
    # Read rightSet into memory.
    right_tree = IntervalTree()
    rightlen = 0
    for item in rightSet:
        if isinstance(item, GenomicInterval):
            right_tree.insert(item, rightSet.linenum, item.fields)
            if rightlen == 0:
                rightlen = item.nfields

    leftlen = 0
    for interval in leftSet:
        if not isinstance(interval, GenomicInterval):
            yield interval
            continue
        if leftlen == 0:
            leftlen = interval.nfields

        hits = []
        right_tree.intersect(interval, hits.append)

        left_start, left_end = interval.start, interval.end
        any_reported = False
        for item in hits:
            # Bounds checks replace ``x in range(start, end + 1)``; for
            # integer coordinates the result is identical and no per-test
            # list is allocated on Python 2.
            start_in = left_start <= item.start <= left_end
            end_in = left_start <= item.end <= left_end
            if start_in and end_in:
                overlap = item.end - item.start
            elif start_in:
                overlap = left_end - item.start
            elif end_in:
                overlap = item.end - left_start
            else:
                # The intersecting item's start and end are outside the
                # interval range.
                overlap = left_end - left_start
            if overlap < mincols:
                continue

            row = list(interval)
            # ``map(row.append, ...)`` would be a no-op on Python 3 because
            # the map object is never consumed; ``extend`` works everywhere.
            row.extend(item.other)
            item.visited = True  # lets the leftfill pass skip this entry
            any_reported = True
            yield row

        if rightfill and not any_reported:
            yield list(interval) + ["."] * rightlen

    if leftfill:
        leftovers = []

        def keep_if_unvisited(node):
            if not hasattr(node, "visited"):
                leftovers.append(node)

        right_tree.traverse(keep_if_unvisited)
        for item in leftovers:
            row = ["."] * leftlen
            row.extend(item.other)
            yield row
Ejemplo n.º 3
0
def join(leftSet, rightSet, mincols=1, leftfill=True, rightfill=True, asfraction=False, matchStrand=STRAND_NEUTRAL, outColumns=[-1,-1]):
    """Yield the rows of a strand-aware interval join of two interval sets.

    Every ``GenomicInterval`` read from ``rightSet`` is inserted into an
    in-memory ``IntervalTree``.  ``leftSet`` is then streamed; each of its
    intervals is paired with the intersecting right-hand entries that satisfy
    both the overlap threshold and the strand criterion.

    Args:
        leftSet: Iterable of items.  Items that are not ``GenomicInterval``
            instances are yielded unchanged.
        rightSet: Iterable of items; its ``linenum`` attribute is read while
            iterating.  Only ``GenomicInterval`` items are used.
        mincols: Minimum overlap.  With ``asfraction`` it is a multiplier
            applied to the length of the shorter of the two intervals
            (rounded down); otherwise it is used directly.
        leftfill: When true, right-hand entries that never took part in a
            reported pair are emitted at the end.
        rightfill: When true, a left-hand interval with no reported partner
            is emitted on its own.
        asfraction: Selects the fractional interpretation of ``mincols``.
        matchStrand: Strand criterion, compared against the product of the
            two intervals' ``STRAND_INTEGER_VALUES``.  A positive value
            rejects a product of -1, a negative value rejects a product of 1,
            and a magnitude above 1 also rejects a product of 0.
        outColumns: Passed through unchanged to ``getSelectedColumns``.
            NOTE(review): this is a mutable default; it is never mutated in
            this function, but confirm ``getSelectedColumns`` does not either.

    Yields:
        Non-interval items of ``leftSet`` as-is, and whatever
        ``getSelectedColumns`` returns for each joined or padded row.
    """
    # Read rightSet into memory.
    right_tree = IntervalTree()
    rightlen = 0
    right_strand_col = -1
    for item in rightSet:
        if isinstance(item, GenomicInterval):
            right_tree.insert(item, rightSet.linenum, item.fields)
            if rightlen == 0:
                rightlen = item.nfields
            if right_strand_col == -1:
                right_strand_col = item.strand_col

    leftlen = 0
    for interval in leftSet:
        if not isinstance(interval, GenomicInterval):
            yield interval
            continue
        if leftlen == 0:
            leftlen = interval.nfields

        matches = []
        right_tree.intersect(interval, matches.append)

        lo, hi = interval.start, interval.end
        leftbases = hi - lo
        joined = False
        for item in matches:
            # Keep the threshold in a local rather than overwriting the
            # ``mincols`` argument on every iteration.
            # ``== True`` is kept deliberately so truthy non-boolean values
            # are treated exactly as before.
            if asfraction == True:
                shorter = min(item.end - item.start, leftbases)
                required = math.floor(shorter * mincols)
            else:
                required = mincols

            # Plain comparisons instead of ``x in range(lo, hi + 1)``: same
            # answer for integer coordinates, without building a list of
            # every position on Python 2.
            start_inside = lo <= item.start <= hi
            end_inside = lo <= item.end <= hi
            if start_inside and not end_inside:
                overlap = hi - item.start
            elif end_inside and not start_inside:
                overlap = item.end - lo
            elif start_inside:
                overlap = item.end - item.start
            else:
                # The intersecting item's start and end are outside the
                # interval range.
                overlap = hi - lo
            if overlap < required:
                continue

            # Check strand.
            strand_product = STRAND_INTEGER_VALUES[interval.strand] * STRAND_INTEGER_VALUES[item.other[right_strand_col]]
            if strand_product == -1 and matchStrand > 0:
                # Needed a match but found a complement.
                continue
            if strand_product == 1 and matchStrand < 0:
                # Needed a complement but found a match.
                continue
            if strand_product == 0 and (matchStrand < -1 or matchStrand > 1):
                # Strict criteria but only a permissive match was found.
                continue

            # Strand criteria met.
            item.visited = True  # checked by the leftfill pass below
            joined = True
            yield getSelectedColumns(interval.fields, item.other, outColumns)

        if rightfill and not joined:
            yield getSelectedColumns(interval.fields, rightlen, outColumns)

    if leftfill:
        unvisited = []

        def collect_unvisited(node):
            if not hasattr(node, "visited"):
                unvisited.append(node)

        right_tree.traverse(collect_unvisited)
        for item in unvisited:
            yield getSelectedColumns(leftlen, item.other, outColumns)
Ejemplo n.º 4
0
def join(
    leftSet,
    rightSet,
    mincols=1,
    leftfill=True,
    rightfill=True,
    asfraction=False,
    matchStrand=STRAND_NEUTRAL,
    outColumns=[-1, -1],
):
    """Join the intervals of ``leftSet`` with those of ``rightSet``.

    The ``GenomicInterval`` items of ``rightSet`` are loaded into an
    ``IntervalTree`` first.  ``leftSet`` is then consumed lazily; for each of
    its intervals the tree is queried, and one row is produced per right-hand
    entry that passes the overlap threshold and the strand criterion.

    Args:
        leftSet: Iterable of items.  Anything that is not a
            ``GenomicInterval`` is passed through unchanged.
        rightSet: Iterable of items; ``rightSet.linenum`` is read for every
            interval inserted into the tree.
        mincols: Minimum overlap.  Used directly unless ``asfraction`` is
            set, in which case it multiplies the length of the shorter of
            the two intervals and the product is rounded down.
        leftfill: If true, finish by emitting every right-hand entry that
            was never matched.
        rightfill: If true, emit left-hand intervals that had no reported
            match.
        asfraction: Switches ``mincols`` to the fractional interpretation.
        matchStrand: Strand criterion, tested against the product of the two
            intervals' ``STRAND_INTEGER_VALUES``: a positive value rejects a
            product of -1, a negative value rejects a product of 1, and a
            magnitude above 1 also rejects a product of 0.
        outColumns: Forwarded untouched to ``getSelectedColumns``.
            NOTE(review): this is a mutable default; it is never mutated in
            this function, but confirm ``getSelectedColumns`` does not either.

    Yields:
        Pass-through items, and the result of ``getSelectedColumns`` for
        each matched or padded row.
    """
    # Read rightSet into memory.
    right_tree = IntervalTree()
    rightlen = 0
    right_strand_col = -1
    for item in rightSet:
        if isinstance(item, GenomicInterval):
            right_tree.insert(item, rightSet.linenum, item.fields)
            if rightlen == 0:
                rightlen = item.nfields
            if right_strand_col == -1:
                right_strand_col = item.strand_col

    leftlen = 0
    for interval in leftSet:
        if not isinstance(interval, GenomicInterval):
            yield interval
            continue
        if leftlen == 0:
            leftlen = interval.nfields

        hits = []
        right_tree.intersect(interval, hits.append)

        left_start, left_end = interval.start, interval.end
        leftbases = left_end - left_start
        any_reported = False
        for item in hits:
            # The threshold lives in a local so the ``mincols`` argument is
            # no longer reassigned inside the loop.
            # ``== True`` is intentional: it keeps the original treatment of
            # truthy values that are not equal to ``True``.
            if asfraction == True:
                rightbases = item.end - item.start
                threshold = math.floor(min(rightbases, leftbases) * mincols)
            else:
                threshold = mincols

            # Bounds checks replace ``x in range(start, end + 1)``; for
            # integer coordinates the result is identical and no per-test
            # list is allocated on Python 2.
            start_in = left_start <= item.start <= left_end
            end_in = left_start <= item.end <= left_end
            if start_in and end_in:
                overlap = item.end - item.start
            elif start_in:
                overlap = left_end - item.start
            elif end_in:
                overlap = item.end - left_start
            else:
                # The intersecting item's start and end are outside the
                # interval range.
                overlap = leftbases
            if overlap < threshold:
                continue

            # Check strand.
            strand_product = (
                STRAND_INTEGER_VALUES[interval.strand] * STRAND_INTEGER_VALUES[item.other[right_strand_col]]
            )
            if strand_product == -1 and matchStrand > 0:
                # Needed a match but found a complement.
                continue
            if strand_product == 1 and matchStrand < 0:
                # Needed a complement but found a match.
                continue
            if strand_product == 0 and (matchStrand < -1 or matchStrand > 1):
                # Strict criteria but only a permissive match was found.
                continue

            # Strand criteria met.
            item.visited = True  # lets the leftfill pass skip this entry
            any_reported = True
            yield getSelectedColumns(interval.fields, item.other, outColumns)

        if rightfill and not any_reported:
            yield getSelectedColumns(interval.fields, rightlen, outColumns)

    if leftfill:
        leftovers = []

        def keep_if_unvisited(node):
            if not hasattr(node, "visited"):
                leftovers.append(node)

        right_tree.traverse(keep_if_unvisited)
        for item in leftovers:
            yield getSelectedColumns(leftlen, item.other, outColumns)