Esempio n. 1
0
def iterator_filter_overlapping_query(psls, options):
    '''yield only alignments that do not overlap others on the query.

    Alignments are grouped by ``Blat.iterator_query_overlap`` using
    ``options.threshold_merge_distance``. A group holding a single
    alignment is yielded unchanged; if a group holds more than one
    alignment, *all* of its members are discarded.

    Note that no "best" alignment is picked from an overlapping
    group. An earlier variant split each group into connected
    components (``Blat.getComponents(block, by_query=True)``) and kept
    the member with the highest ``mNMatches`` per component, but that
    base-level filtering was very slow and has been disabled.

    Only the groups as returned by ``Blat.iterator_query_overlap`` are
    considered; overlap between individual alignment blocks is not
    checked here.

    Arguments
    ---------
    psls :
        Alignments, passed straight to ``Blat.iterator_query_overlap``
        (presumably an iterator of PSL entries sorted by query --
        confirm against the callers).
    options :
        Object providing the attribute ``threshold_merge_distance``.

    Yields
    ------
    The single alignment of every group of size one.

    Summary counts are reported through ``E.info`` once the input is
    exhausted, i.e. only if the generator is consumed completely.
    '''

    ninput, noutput, ndiscarded = 0, 0, 0

    for block in Blat.iterator_query_overlap(
            psls,
            options.threshold_merge_distance):

        nmembers = len(block)
        ninput += nmembers

        if nmembers > 1:
            # overlapping alignments: drop the whole group, see docstring
            ndiscarded += nmembers
            continue

        yield block[0]
        noutput += 1

    E.info("iterator_filter_overlapping_query: ninput=%i, "
           "noutput=%i, ndiscarded=%i" %
           (ninput, noutput, ndiscarded))
Esempio n. 2
0
def iterator_filter_overlapping_query(psls, options):
    '''yield only alignments that do not overlap others on the query.

    Alignments are grouped by ``Blat.iterator_query_overlap`` using
    ``options.threshold_merge_distance``. A group holding a single
    alignment is yielded unchanged; if a group holds more than one
    alignment, *all* of its members are discarded.

    Note that no "best" alignment is picked from an overlapping
    group. An earlier variant split each group into connected
    components (``Blat.getComponents(block, by_query=True)``) and kept
    the member with the highest ``mNMatches`` per component, but that
    base-level filtering was very slow and has been disabled.

    Only the groups as returned by ``Blat.iterator_query_overlap`` are
    considered; overlap between individual alignment blocks is not
    checked here.

    Arguments
    ---------
    psls :
        Alignments, passed straight to ``Blat.iterator_query_overlap``
        (presumably an iterator of PSL entries sorted by query --
        confirm against the callers).
    options :
        Object providing the attribute ``threshold_merge_distance``.

    Yields
    ------
    The single alignment of every group of size one.

    Summary counts are reported through ``E.info`` once the input is
    exhausted, i.e. only if the generator is consumed completely.
    '''

    ninput, noutput, ndiscarded = 0, 0, 0

    for block in Blat.iterator_query_overlap(psls,
                                             options.threshold_merge_distance):

        nmembers = len(block)
        ninput += nmembers

        if nmembers > 1:
            # overlapping alignments: drop the whole group, see docstring
            ndiscarded += nmembers
            continue

        yield block[0]
        noutput += 1

    E.info("iterator_filter_overlapping_query: ninput=%i, "
           "noutput=%i, ndiscarded=%i" % (ninput, noutput, ndiscarded))