Ejemplo n.º 1
0
def join(leftSet, rightSet, mincols=1, leftfill=True, rightfill=True):
    """Join two interval streams on overlap, yielding merged field lists.

    Every ``GenomicInterval`` read from ``rightSet`` is first loaded into an
    in-memory ``IntervalTree``; ``leftSet`` is then streamed against it.

    Args:
        leftSet: iterable of left-hand items. Items that are not
            ``GenomicInterval`` instances are yielded unchanged.
        rightSet: iterable of right-hand items. Its ``linenum`` attribute is
            read each time an interval is inserted into the tree, so it must
            be a reader-like object rather than a plain list. Non-interval
            items are ignored.
        mincols: minimum overlap (``end - start`` of the shared region) a
            pair needs in order to be joined.
        leftfill: when true, right intervals that never joined anything are
            emitted at the end, with "." in place of each left field.
        rightfill: when true, a left interval without any qualifying partner
            is emitted with "." in place of each right field.

    Yields:
        Non-interval items of ``leftSet`` as-is, and lists made of the left
        interval's items followed by the right-hand fields (or "." padding).
    """
    # Read rightSet into memory:
    right_width = 0
    left_width = 0
    rightTree = IntervalTree()
    for item in rightSet:
        if isinstance(item, GenomicInterval):
            rightTree.insert(item, rightSet.linenum, item.fields)
            if right_width == 0:
                right_width = item.nfields

    for interval in leftSet:
        if not isinstance(interval, GenomicInterval):
            yield interval
            continue
        if left_width == 0:
            left_width = interval.nfields

        hits = []
        rightTree.intersect(interval, hits.append)
        joined = False
        for item in hits:
            # Plain comparisons instead of ``x in range(...)``: on Python 2
            # the latter materialises a list as long as the interval.
            start_inside = interval.start <= item.start <= interval.end
            end_inside = interval.start <= item.end <= interval.end
            if start_inside and not end_inside:
                overlap = interval.end - item.start
            elif end_inside and not start_inside:
                overlap = item.end - interval.start
            elif start_inside and end_inside:
                overlap = item.end - item.start
            else:
                # The hit's start and end both lie outside the interval.
                overlap = interval.end - interval.start
            if overlap < mincols:
                continue
            # Build the row eagerly; ``map(list.append, ...)`` is lazy on
            # Python 3 and would silently drop the right-hand fields.
            outfields = list(interval) + list(item.other)
            item.visited = True  # consulted by the leftfill pass below
            joined = True
            yield outfields
        if not joined and rightfill:
            yield list(interval) + ["."] * right_width

    if leftfill:
        unvisited = []

        def collect_unvisited(node):
            if not hasattr(node, "visited"):
                unvisited.append(node)

        rightTree.traverse(collect_unvisited)
        for item in unvisited:
            yield ["."] * left_width + list(item.other)
Ejemplo n.º 2
0
def join(leftSet, rightSet, mincols=1, leftfill=True, rightfill=True):
    """Overlap-join ``leftSet`` against ``rightSet`` and yield merged rows.

    All ``GenomicInterval`` items of ``rightSet`` are loaded into an
    ``IntervalTree`` up front; ``leftSet`` is then processed one item at a
    time.

    Args:
        leftSet: iterable of left-hand items. Anything that is not a
            ``GenomicInterval`` is passed through unchanged.
        rightSet: iterable of right-hand items; ``rightSet.linenum`` is read
            for every interval inserted into the tree. Non-interval items
            are skipped.
        mincols: smallest overlap (difference between the end and start of
            the shared region) that counts as a join.
        leftfill: when true, right intervals that were never joined are
            yielded last, padded on the left with one "." per left field.
        rightfill: when true, left intervals with no qualifying partner are
            yielded padded on the right with one "." per right field.

    Yields:
        Pass-through items from ``leftSet`` and lists consisting of the left
        interval's items followed by the right fields (or "." padding).
    """
    # Read rightSet into memory:
    n_right_fields = 0
    n_left_fields = 0
    rightTree = IntervalTree()
    for item in rightSet:
        if isinstance(item, GenomicInterval):
            rightTree.insert(item, rightSet.linenum, item.fields)
            if n_right_fields == 0:
                n_right_fields = item.nfields

    for interval in leftSet:
        if not isinstance(interval, GenomicInterval):
            yield interval
            continue
        if n_left_fields == 0:
            n_left_fields = interval.nfields

        candidates = []
        rightTree.intersect(interval, candidates.append)
        any_joined = False
        for item in candidates:
            # Chained comparisons replace ``x in range(...)``, which builds
            # an interval-sized list for every test on Python 2.
            start_inside = interval.start <= item.start <= interval.end
            end_inside = interval.start <= item.end <= interval.end
            if start_inside and not end_inside:
                overlap = interval.end - item.start
            elif end_inside and not start_inside:
                overlap = item.end - interval.start
            elif start_inside and end_inside:
                overlap = item.end - item.start
            else:
                # The candidate spans past both ends of the interval.
                overlap = interval.end - interval.start
            if overlap < mincols:
                continue
            # Concatenate explicitly: calling ``map`` for its side effect
            # does nothing on Python 3 because the iterator is never consumed.
            outfields = list(interval) + list(item.other)
            item.visited = True  # read back by the leftfill pass
            any_joined = True
            yield outfields
        if rightfill and not any_joined:
            yield list(interval) + ["."] * n_right_fields

    if leftfill:
        leftovers = []

        def remember_unvisited(node):
            if not hasattr(node, "visited"):
                leftovers.append(node)

        rightTree.traverse(remember_unvisited)
        for item in leftovers:
            yield ["."] * n_left_fields + list(item.other)
Ejemplo n.º 3
0
def join(leftSet, rightSet, mincols=1, leftfill=True, rightfill=True, asfraction=False, matchStrand=STRAND_NEUTRAL, outColumns=[-1,-1]):
    """Strand-aware overlap join of two interval streams.

    Every ``GenomicInterval`` from ``rightSet`` is loaded into an
    ``IntervalTree``; each interval of ``leftSet`` is then matched against it
    and the selected columns of every accepted pair are yielded.

    Args:
        leftSet: iterable of left-hand items. Items that are not
            ``GenomicInterval`` instances are yielded unchanged.
        rightSet: iterable of right-hand items; ``rightSet.linenum`` is read
            on every tree insertion. Non-interval items are ignored.
        mincols: minimum overlap length. When ``asfraction`` is set it is
            instead a multiplier applied to the length of the shorter of
            the two intervals (the product is floored).
        leftfill: when true, right intervals that never joined are emitted
            at the end.
        rightfill: when true, left intervals with no accepted partner are
            emitted on their own.
        asfraction: enables the fractional interpretation of ``mincols``.
        matchStrand: strand policy, compared against the product of the two
            strands' ``STRAND_INTEGER_VALUES`` entries. A positive value
            rejects pairs whose product is -1, a negative value rejects
            pairs whose product is 1, and a magnitude above 1 additionally
            rejects pairs whose product is 0.
        outColumns: forwarded untouched to ``getSelectedColumns``.
            NOTE(review): this is a shared mutable default; it is only
            passed along here - confirm the helper never mutates it.

    Yields:
        Pass-through items from ``leftSet`` and whatever
        ``getSelectedColumns`` returns for each accepted or filled row.
    """
    # Read rightSet into memory:
    rightlen = 0
    leftlen = 0
    rightStrandCol = -1
    rightTree = IntervalTree()

    for item in rightSet:
        if isinstance(item, GenomicInterval):
            rightTree.insert(item, rightSet.linenum, item.fields)
            if rightlen == 0:
                rightlen = item.nfields
            if rightStrandCol == -1:
                rightStrandCol = item.strand_col

    # Kept as an equality test so that only True (or 1) enables fraction
    # mode, exactly as before.
    use_fraction = asfraction == True  # noqa: E712

    for interval in leftSet:
        if not isinstance(interval, GenomicInterval):
            yield interval
            continue
        if leftlen == 0:
            leftlen = interval.nfields

        hits = []
        rightTree.intersect(interval, hits.append)
        leftbases = interval.end - interval.start
        joined = False
        for item in hits:
            # Use a local threshold instead of overwriting ``mincols``.
            if use_fraction:
                rightbases = item.end - item.start
                required = math.floor(min(leftbases, rightbases) * mincols)
            else:
                required = mincols

            # Plain comparisons instead of ``x in range(...)``: on Python 2
            # the latter materialises a list as long as the interval.
            start_inside = interval.start <= item.start <= interval.end
            end_inside = interval.start <= item.end <= interval.end
            if start_inside and not end_inside:
                overlap = interval.end - item.start
            elif end_inside and not start_inside:
                overlap = item.end - interval.start
            elif start_inside and end_inside:
                overlap = item.end - item.start
            else:
                # The hit's start and end both lie outside the interval.
                overlap = interval.end - interval.start
            if overlap < required:
                continue

            # Check strand: product of the two strands' integer codes.
            strandMatched = (STRAND_INTEGER_VALUES[interval.strand]
                             * STRAND_INTEGER_VALUES[item.other[rightStrandCol]])
            if ((strandMatched == -1 and matchStrand > 0)      # needed match, found complement
                    or (strandMatched == 1 and matchStrand < 0)    # needed complement, found match
                    or (strandMatched == 0                         # strict criteria, permissive match
                        and (matchStrand < -1 or matchStrand > 1))):
                continue

            # Strand criteria met.
            item.visited = True  # consulted by the leftfill pass below
            joined = True
            yield getSelectedColumns(interval.fields, item.other, outColumns)
        if not joined and rightfill:
            # The right-hand field count stands in for the missing partner;
            # presumably getSelectedColumns pads that side - TODO confirm.
            yield getSelectedColumns(interval.fields, rightlen, outColumns)

    if leftfill:
        unvisited = []

        def collect_unvisited(node):
            if not hasattr(node, "visited"):
                unvisited.append(node)

        rightTree.traverse(collect_unvisited)
        for item in unvisited:
            yield getSelectedColumns(leftlen, item.other, outColumns)
Ejemplo n.º 4
0
def getpairs(leftSet,
             rightSet,
             leftCol,
             mincols=1,
             asfraction=False,
             matchStrand=STRAND_NEUTRAL,
             skipChrNames=True,
             skipStrandNames=True):
    """Yield ``[token, leftTerm]`` pairs for overlapping intervals.

    ``leftSet`` is loaded into an ``IntervalTree``. For every interval of
    ``rightSet`` that overlaps a left interval (and passes the strand
    policy), the right interval's remaining columns are tokenised and each
    non-numeric token is paired with the left interval's ``leftCol`` field.

    Args:
        leftSet: iterable of left-hand items; ``leftSet.linenum`` is read on
            every tree insertion. Only objects whose type is exactly
            ``GenomicInterval`` are indexed.
        rightSet: iterable of right-hand items. Anything whose type is not
            exactly ``GenomicInterval`` is yielded unchanged.
        leftCol: index into the left interval's stored fields giving the
            term that each token is associated with.
        mincols: minimum overlap length. When ``asfraction`` is set it is
            instead a multiplier applied to the length of the shorter of
            the two intervals (the product is floored).
        asfraction: enables the fractional interpretation of ``mincols``.
        matchStrand: strand policy, compared against the product of the two
            strands' ``STRAND_INTEGER_VALUES`` entries. A positive value
            rejects pairs whose product is -1, a negative value rejects
            pairs whose product is 1, and a magnitude above 1 additionally
            rejects pairs whose product is 0.
        skipChrNames: when true, the chromosome column is not tokenised.
        skipStrandNames: when true, the strand column is not tokenised.

    Yields:
        Pass-through items from ``rightSet`` and two-element lists
        ``[token, leftTerm]``.

    Raises:
        ValueError: if one of the columns to drop is not a valid column
            index of the first right-hand interval.
    """
    # Read leftSet into memory:
    leftStrandCol = -1
    leftTree = IntervalTree()
    # NOTE(review): exact type match on purpose preserved; subclasses of
    # GenomicInterval are not treated as intervals here.
    for item in leftSet:
        if type(item) is GenomicInterval:
            leftTree.insert(item, leftSet.linenum, item.fields)
            if leftStrandCol == -1:
                leftStrandCol = item.strand_col

    # Kept as an equality test so that only True (or 1) enables fraction
    # mode, exactly as before.
    use_fraction = asfraction == True  # noqa: E712

    rightlen = 0
    rightCols = []
    for interval in rightSet:
        if type(interval) is not GenomicInterval:
            yield interval
            continue

        if rightlen == 0:
            rightlen = interval.nfields
            # A real list is required: a Python 3 ``range`` has no remove().
            rightCols = list(range(rightlen))
            # Remove the useless columns.
            rightCols.remove(interval.start_col)
            rightCols.remove(interval.end_col)
            if skipChrNames:
                rightCols.remove(interval.chrom_col)
            if skipStrandNames:
                rightCols.remove(interval.strand_col)

        hits = []
        leftTree.intersect(interval, hits.append)
        rightbases = interval.end - interval.start
        for node in hits:
            # Use a local threshold instead of overwriting ``mincols``.
            if use_fraction:
                leftbases = node.end - node.start
                required = math.floor(min(rightbases, leftbases) * mincols)
            else:
                required = mincols

            start_inside = interval.start <= node.start <= interval.end
            end_inside = interval.start <= node.end <= interval.end
            if start_inside and not end_inside:
                overlap = interval.end - node.start
            elif end_inside and node.start < interval.start:
                overlap = node.end - interval.start
            elif start_inside and end_inside:
                overlap = node.end - node.start
            else:
                # The hit's start and end are outside the interval range.
                overlap = interval.end - interval.start
            if overlap < required:
                continue

            # Check strand: product of the two strands' integer codes.
            strandMatched = (STRAND_INTEGER_VALUES[interval.strand]
                             * STRAND_INTEGER_VALUES[node.other[leftStrandCol]])
            if ((strandMatched == -1 and matchStrand > 0)      # needed match, found complement
                    or (strandMatched == 1 and matchStrand < 0)    # needed complement, found match
                    or (strandMatched == 0                         # strict criteria, permissive match
                        and (matchStrand < -1 or matchStrand > 1))):
                continue

            # Strand criteria met.
            node.visited = True
            leftTerm = node.other[leftCol]
            for col in rightCols:
                # Split the field on tabs, newlines, backslashes, commas and
                # semicolons (spaces are NOT separators), then associate
                # every non-numeric token with leftTerm.
                lexer = shlex.shlex(interval.fields[col])
                # Same character set as the former '\t\r\n\,\;' literal,
                # spelled without invalid escape sequences.
                lexer.whitespace = '\t\r\n\\,;'
                lexer.wordchars += ":'"
                lexer.whitespace_split = True
                lexer.quotes = '"'

                # A distinct name keeps the tree node from being shadowed.
                for token in lexer:
                    token = token.strip()
                    if token == ".":
                        continue
                    try:
                        float(token)
                    except ValueError:
                        yield [token, leftTerm]
Ejemplo n.º 5
0
def join(
    leftSet,
    rightSet,
    mincols=1,
    leftfill=True,
    rightfill=True,
    asfraction=False,
    matchStrand=STRAND_NEUTRAL,
    outColumns=[-1, -1],
):
    """Overlap-join two interval streams with an optional strand policy.

    All ``GenomicInterval`` items of ``rightSet`` are indexed in an
    ``IntervalTree``; each interval of ``leftSet`` is then looked up in it
    and the selected columns of every accepted pair are yielded.

    Args:
        leftSet: iterable of left-hand items. Anything that is not a
            ``GenomicInterval`` is passed through unchanged.
        rightSet: iterable of right-hand items; ``rightSet.linenum`` is read
            for every interval inserted into the tree. Non-interval items
            are skipped.
        mincols: minimum overlap length. With ``asfraction`` set it acts as
            a multiplier on the length of the shorter interval of the pair
            (the product is floored).
        leftfill: when true, right intervals that were never joined are
            yielded after the left stream is exhausted.
        rightfill: when true, left intervals without an accepted partner
            are yielded on their own.
        asfraction: switches ``mincols`` to its fractional interpretation.
        matchStrand: strand policy, compared against the product of the two
            strands' ``STRAND_INTEGER_VALUES`` entries. A positive value
            rejects pairs whose product is -1, a negative value rejects
            pairs whose product is 1, and a magnitude above 1 additionally
            rejects pairs whose product is 0.
        outColumns: handed to ``getSelectedColumns`` as-is.
            NOTE(review): shared mutable default; this function only
            forwards it - confirm the helper does not modify it.

    Yields:
        Pass-through items from ``leftSet`` and the value returned by
        ``getSelectedColumns`` for each accepted or filled row.
    """
    # Read rightSet into memory:
    rightlen = 0
    leftlen = 0
    rightStrandCol = -1
    rightTree = IntervalTree()

    for item in rightSet:
        if isinstance(item, GenomicInterval):
            rightTree.insert(item, rightSet.linenum, item.fields)
            if rightlen == 0:
                rightlen = item.nfields
            if rightStrandCol == -1:
                rightStrandCol = item.strand_col

    # Equality (not truthiness) is intentional: only True (or 1) turns
    # fraction mode on, matching the previous behaviour.
    fraction_mode = asfraction == True  # noqa: E712

    for interval in leftSet:
        if not isinstance(interval, GenomicInterval):
            yield interval
            continue
        if leftlen == 0:
            leftlen = interval.nfields

        candidates = []
        rightTree.intersect(interval, candidates.append)
        leftbases = interval.end - interval.start
        any_joined = False
        for item in candidates:
            # Per-pair threshold kept in a local; ``mincols`` is left alone.
            if fraction_mode:
                rightbases = item.end - item.start
                threshold = math.floor(min(leftbases, rightbases) * mincols)
            else:
                threshold = mincols

            # Chained comparisons replace ``x in range(...)``, which builds
            # an interval-sized list for every test on Python 2.
            start_inside = interval.start <= item.start <= interval.end
            end_inside = interval.start <= item.end <= interval.end
            if start_inside and not end_inside:
                overlap = interval.end - item.start
            elif end_inside and not start_inside:
                overlap = item.end - interval.start
            elif start_inside and end_inside:
                overlap = item.end - item.start
            else:
                # The candidate spans past both ends of the interval.
                overlap = interval.end - interval.start
            if overlap < threshold:
                continue

            # Check strand: product of the two strands' integer codes.
            strandMatched = (
                STRAND_INTEGER_VALUES[interval.strand] * STRAND_INTEGER_VALUES[item.other[rightStrandCol]]
            )
            rejected = (
                (strandMatched == -1 and matchStrand > 0)  # needed match but found a complement
                or (strandMatched == 1 and matchStrand < 0)  # needed complement but found a match
                or (strandMatched == 0 and (matchStrand < -1 or matchStrand > 1))  # strict criteria
            )
            if rejected:
                continue

            # Strand criteria met.
            item.visited = True  # read back by the leftfill pass
            any_joined = True
            yield getSelectedColumns(interval.fields, item.other, outColumns)
        if rightfill and not any_joined:
            # The right-hand field count is passed in place of a partner;
            # presumably getSelectedColumns pads that side - TODO confirm.
            yield getSelectedColumns(interval.fields, rightlen, outColumns)

    if leftfill:
        leftovers = []

        def remember_unvisited(node):
            if not hasattr(node, "visited"):
                leftovers.append(node)

        rightTree.traverse(remember_unvisited)
        for item in leftovers:
            yield getSelectedColumns(leftlen, item.other, outColumns)
Ejemplo n.º 6
0
def getpairs(leftSet, rightSet, leftCol, mincols=1, asfraction=False, matchStrand=STRAND_NEUTRAL, skipChrNames=True, skipStrandNames=True):
    """Pair tokens from right-hand intervals with a term from overlapping left ones.

    ``leftSet`` is indexed in an ``IntervalTree``. Each interval of
    ``rightSet`` is matched against it; for every accepted overlap the right
    interval's remaining columns are tokenised and every non-numeric token
    is yielded together with the left interval's ``leftCol`` field.

    Args:
        leftSet: iterable of left-hand items; ``leftSet.linenum`` is read
            for every interval inserted into the tree. Only objects whose
            type is exactly ``GenomicInterval`` are indexed.
        rightSet: iterable of right-hand items. Anything whose type is not
            exactly ``GenomicInterval`` is passed through unchanged.
        leftCol: index into the left interval's stored fields selecting the
            term each token is paired with.
        mincols: minimum overlap length. With ``asfraction`` set it acts as
            a multiplier on the length of the shorter interval of the pair
            (the product is floored).
        asfraction: switches ``mincols`` to its fractional interpretation.
        matchStrand: strand policy, compared against the product of the two
            strands' ``STRAND_INTEGER_VALUES`` entries. A positive value
            rejects pairs whose product is -1, a negative value rejects
            pairs whose product is 1, and a magnitude above 1 additionally
            rejects pairs whose product is 0.
        skipChrNames: when true, the chromosome column is not tokenised.
        skipStrandNames: when true, the strand column is not tokenised.

    Yields:
        Pass-through items from ``rightSet`` and ``[token, leftTerm]`` lists.

    Raises:
        ValueError: if a column scheduled for removal is not a valid column
            index of the first right-hand interval.
    """
    # Read leftSet into memory:
    leftStrandCol = -1
    leftTree = IntervalTree()
    # NOTE(review): the exact-type test is preserved deliberately; objects of
    # GenomicInterval subclasses are not handled as intervals.
    for item in leftSet:
        if type(item) is GenomicInterval:
            leftTree.insert(item, leftSet.linenum, item.fields)
            if leftStrandCol == -1:
                leftStrandCol = item.strand_col

    # Equality (not truthiness) is intentional: only True (or 1) turns
    # fraction mode on, matching the previous behaviour.
    fraction_mode = asfraction == True  # noqa: E712

    rightlen = 0
    rightCols = []
    for interval in rightSet:
        if type(interval) is not GenomicInterval:
            yield interval
            continue

        if rightlen == 0:
            rightlen = interval.nfields
            # Materialise the indices: ``range`` objects cannot be mutated
            # on Python 3.
            rightCols = list(range(rightlen))
            # Remove the useless columns.
            rightCols.remove(interval.start_col)
            rightCols.remove(interval.end_col)
            if skipChrNames:
                rightCols.remove(interval.chrom_col)
            if skipStrandNames:
                rightCols.remove(interval.strand_col)

        candidates = []
        leftTree.intersect(interval, candidates.append)
        rightbases = interval.end - interval.start
        for node in candidates:
            # Per-pair threshold kept in a local; ``mincols`` is left alone.
            if fraction_mode:
                leftbases = node.end - node.start
                threshold = math.floor(min(rightbases, leftbases) * mincols)
            else:
                threshold = mincols

            start_inside = interval.start <= node.start <= interval.end
            end_inside = interval.start <= node.end <= interval.end
            if start_inside and not end_inside:
                overlap = interval.end - node.start
            elif end_inside and node.start < interval.start:
                overlap = node.end - interval.start
            elif start_inside and end_inside:
                overlap = node.end - node.start
            else:
                # The candidate's start and end are outside the interval range.
                overlap = interval.end - interval.start
            if overlap < threshold:
                continue

            # Check strand: product of the two strands' integer codes.
            strandMatched = STRAND_INTEGER_VALUES[interval.strand] * STRAND_INTEGER_VALUES[node.other[leftStrandCol]]
            rejected = (
                (strandMatched == -1 and matchStrand > 0)  # needed match but found a complement
                or (strandMatched == 1 and matchStrand < 0)  # needed complement but found a match
                or (strandMatched == 0 and (matchStrand < -1 or matchStrand > 1))  # strict criteria
            )
            if rejected:
                continue

            # Strand criteria met.
            node.visited = True
            leftTerm = node.other[leftCol]
            for col in rightCols:
                # Tokenise the field on tabs, newlines, backslashes, commas
                # and semicolons (spaces do NOT separate tokens) and emit
                # each non-numeric token alongside leftTerm.
                lexer = shlex.shlex(interval.fields[col])
                # Identical character set to the old '\t\r\n\,\;' literal,
                # written without invalid escape sequences.
                lexer.whitespace = '\t\r\n\\,;'
                lexer.wordchars += ":'"
                lexer.whitespace_split = True
                lexer.quotes = '"'

                # Separate name so the tree node is not shadowed.
                for word in lexer:
                    word = word.strip()
                    if word == ".":
                        continue
                    try:
                        float(word)
                    except ValueError:
                        yield [word, leftTerm]