예제 #1
0
def trimMatchOverlapsInBoth(inpfile, outfile, trim_subtype):
    """Trim match overlaps first along X and then along Y.

    The matches in inpfile are sorted in X order, coalesced, trimmed in X,
    re-sorted in Y order and finally trimmed in Y into outfile.  Every
    intermediate result lives in a fresh MyFile.myfile() scratch file.
    """
    # Coalescing is asked to stay on a shared diagonal for subtypes 'x'/'u'.
    shareDiagonal = trim_subtype in ('x', 'u')

    # Each stage reads the previous stage's scratch file and fills a new one.
    stages = (
        lambda src, dst: MatchRecord.sortInXorderAP(src, dst),
        # The following coalescing assumes perfect runs.
        lambda src, dst: coalesceMatches(src, dst, shareDiagonal),
        lambda src, dst: trimMatchOverlapsInX(src, dst, trim_subtype),
        lambda src, dst: MatchRecord.sortInYorderAP(src, dst),
    )

    source = inpfile
    for runStage in stages:
        sink = MyFile.myfile()
        runStage(source, sink)
        source = sink

    # The last stage writes straight into the caller's output file.
    trimMatchOverlapsInY(source, outfile, trim_subtype)
def trimMatchOverlapsInBoth(inpfile,outfile,trim_subtype):
    """Run the X-then-Y overlap trimming pipeline from inpfile to outfile.

    Stages, in order: sort in X order, coalesce, trim overlaps in X, sort
    in Y order, trim overlaps in Y.  Intermediate results are held in
    MyFile.myfile() scratch files that are dropped once consumed.
    """
    diagonalRequired = (trim_subtype == 'x') or (trim_subtype == 'u')

    xSorted = MyFile.myfile()
    MatchRecord.sortInXorderAP(inpfile, xSorted)

    # The following coalescing assumes perfect runs.
    coalesced = MyFile.myfile()
    coalesceMatches(xSorted, coalesced, diagonalRequired)
    del xSorted  # consumed; release the scratch file

    xTrimmed = MyFile.myfile()
    trimMatchOverlapsInX(coalesced, xTrimmed, trim_subtype)
    del coalesced  # consumed; release the scratch file

    ySorted = MyFile.myfile()
    MatchRecord.sortInYorderAP(xTrimmed, ySorted)

    trimMatchOverlapsInY(ySorted, outfile, trim_subtype)
예제 #3
0
def boxRecovery( inpfile, rawfile, outname):
    """Merge raw matches back into the runs described by inpfile.

    For every match ('M') record of inpfile, the raw file is scanned
    forward (continuing from where the previous scan stopped) until a raw
    match that sameAs() the current record is found; the current record is
    then written.  While scanning, raw matches are handled as follows:

      * if the current record is in the same run as the previous one, raw
        matches that are isInsideBox(previous, current) are written too;
      * otherwise they are discarded.

    This is a merge-like operation: the two input files must be sorted in
    the same manner.  `outname` is accepted but not used by this function.

    Returns the MyFile.myfile() holding the output records.
    """
    inpfile.seek(0)
    rawfile.seek(0)
    recovered = MyFile.myfile()

    # A single iterator is shared by all scans so each scan resumes where
    # the previous one broke off.
    rawRecords = iter(rawfile)

    previous = None
    for line in inpfile:
        if line[0] != 'M':
            continue
        current = MatchRecord.MatchRecord(line)
        withinRun = (previous != None and previous.inSameRunAs(current))

        for rawline in rawRecords:
            if rawline[0] != 'M':
                continue
            candidate = MatchRecord.MatchRecord(rawline)
            if candidate.sameAs(current):
                print >>recovered, current
                break
            # Between runs the raw match is simply dropped.
            if withinRun and candidate.isInsideBox(previous, current):
                print >>recovered, candidate
        # NOTE: if the raw file runs out before a raw match equal to the
        # current record is seen, the record is silently skipped.  The
        # original author remarked that this case should be fatal.

        previous = current
    return recovered
예제 #4
0
def mainLoop(inpfile, outfile, xIdx, yIdx, maxgap, erate):
    """Walk consecutive 'u' matches and let analyzeGap() process each gap.

    The first match ('M') record of inpfile is written to outfile as is.
    Every later match record whose subtype is 'u' is paired with the
    previously written record and handed to analyzeGap(), after which the
    (possibly modified) record is written.  Other lines are not copied.
    Progress and totals are reported on stderr.
    """
    margin = 20  # This should be set by an ATAC global.
    recordsWritten = 0
    gapCountTotal = 0

    inpfile.seek(0)
    # One iterator serves both loops, so the second loop continues right
    # after the first match record.
    lines = iter(inpfile)

    sys.stderr.write("begin\n")

    left = None
    for line in lines:
        if line[0] != 'M':
            continue
        left = MatchRecord.MatchRecord(line)
        print >> outfile, left
        recordsWritten += 1
        break

    sys.stderr.write("countLines=%d\n" % recordsWritten)

    for line in lines:
        if line[0] != 'M':
            continue
        right = MatchRecord.MatchRecord(line)
        if right.subtype != 'u':
            continue

        # analyzeGap returns a 1-tuple holding the gap count.
        (gapCount, ) = analyzeGap(xIdx, yIdx, left, right, outfile, maxgap,
                                  erate, margin)
        gapCountTotal += gapCount

        # Output the record which was possibly trimmed.
        print >> outfile, right
        recordsWritten += 1
        left = right

    sys.stderr.write("countLines %d inter_run_gap_count %d \n" %
                     (recordsWritten, gapCountTotal))
예제 #5
0
def main(inpfile, outfile, xIdx, yIdx):
    """Replace every 'g' match by the ungapped segments of its alignment.

    Lines that are not match ('M') records, and match records whose
    subtype is not 'g', are copied to outfile unchanged.  For a 'g' match
    the two sequences are fetched (upper-cased) from xIdx / yIdx, aligned
    with halign, and each segment returned by halign.halignDedash is
    written as a 'u' match with:

      * matchid  = <parent matchid> + 'u' + <1-based segment number>
      * x/y start and length derived from the segment and the parent match
      * extend['mm'] = number of mismatching positions, as a string

    A progress line is printed to stderr every 10000 input lines.
    """
    inpfile.seek(0)
    outfile.seek(0)
    lineCount = 0
    t0 = time.time()
    for line in inpfile:
        lineCount += 1
        if ((lineCount % 10000) == 0):
            print >> sys.stderr, "lineCount=", lineCount, " time=", time.time(
            ) - t0

        if (line[0] != 'M'):
            print >> outfile, line,
            continue
        FM = MatchRecord.MatchRecord(line)
        if (FM.subtype != 'g'):
            print >> outfile, line,
            continue

        # FM is reused for every output segment, so remember the parent's
        # coordinates before they are overwritten.
        parentid = FM.matchid
        parent_x_forward = (FM.x_orientation == 1)
        parent_y_forward = (FM.y_orientation == 1)
        parent_x_start = FM.x_start
        parent_y_start = FM.y_start
        parent_x_length = FM.x_length
        parent_y_length = FM.y_length

        # Why two orientations and not just a flipped flag?
        # Because we want the resulting matches to come out in
        # the same sorted order as the input matches.
        x_substring = xIdx.getStringFromFasta(
            parent_x_forward, FM.x_scaf_uid, FM.x_start, FM.x_length).upper()
        y_substring = yIdx.getStringFromFasta(
            parent_y_forward, FM.y_scaf_uid, FM.y_start, FM.y_length).upper()

        ii = 0
        # Here we call the dedasher.
        halign.halignStart(x_substring, y_substring)
        # halignDedash is called repeatedly until it returns None.
        for segment in iter(halign.halignDedash, None):
            (bgn1, bgn2, len1, len2, nmat) = segment
            # Filter by a minimum length? say four bp.
            ii += 1
            FM.subtype = 'u'
            FM.matchid = parentid + 'u' + str(ii)

            # Segment offsets are relative to the fetched substrings; when
            # an axis was fetched reversed, measure from the other end.
            if parent_x_forward:
                x_offset = bgn1
            else:
                x_offset = parent_x_length - bgn1 - len1
            if parent_y_forward:
                y_offset = bgn2
            else:
                y_offset = parent_y_length - bgn2 - len2
            FM.x_start = parent_x_start + x_offset
            FM.y_start = parent_y_start + y_offset
            FM.x_length = len1
            FM.y_length = len2
            assert (len1 == len2)

            # Count mismatches in the aligned substrings.  (The original
            # indexed undefined names x_seq / y_seq here.)
            mismatches = 0
            for ic in range(len1):
                if (x_substring[bgn1 + ic] != y_substring[bgn2 + ic]):
                    mismatches += 1
            FM.extend['mm'] = str(mismatches)
            FM.identifier = ""  # BEWARE
            print >> outfile, FM
예제 #6
0
def filterByMatchLength( inpfile, outfile, minimum_length):
    """Only keep matches that are long enough.

    A match ('M') record is written to outfile when both its x_length and
    its y_length are at least minimum_length.  All other lines, including
    non-match lines, are dropped.
    """
    inpfile.seek(0)
    for line in inpfile:
        if line[0] != 'M':
            continue
        record = MatchRecord.MatchRecord(line)
        if not (record.x_length >= minimum_length):
            continue
        if not (record.y_length >= minimum_length):
            continue
        print >>outfile, record
예제 #7
0
def createSignedEnumeration(inpfile):
    """Tag every match record with a signed rank in extend['srank'].

    Match ('M') records are numbered 1, 2, 3, ... in input order.  The
    rank and its negation are passed to cvm() together with the test
    "x and y orientations are equal"; cvm's result is stored as 'srank'.
    (cvm is defined elsewhere -- presumably it picks the first value when
    the condition holds and the second otherwise; confirm.)

    Non-match lines are dropped.  Returns a new MyFile.myfile() holding
    the tagged records.
    """
    inpfile.seek(0)
    ranked = MyFile.myfile()
    rank = 0
    for line in inpfile:
        if line[0] != 'M':
            continue
        rank += 1
        record = MatchRecord.MatchRecord(line)
        sameOrientation = (record.x_orientation == record.y_orientation)
        record.extend['srank'] = cvm(sameOrientation, rank, -rank)
        print >> ranked, record
    return ranked
예제 #8
0
def onlyKeepLongRuns ( inpfile, outname, lengthThreshold ):
    """Keep only the matches of runs whose summed x_length reaches a threshold.

    Match ('M') records are read in order and grouped by consecutive
    runid.  The matches of a run are held back until the running sum of
    their x_length is no longer below lengthThreshold; from then on the
    held matches and all further matches of that run are written to the
    output.  Matches still held back when the next run starts go to a
    rejects file, which is closed and discarded on return.

    `outname` is accepted but not used.  Returns the MyFile.myfile()
    holding the kept matches.
    """
    keptFile = MyFile.myfile()
    rejectedFile = MyFile.myfile()

    previous = None
    pending = []   # matches of the current run seen before the threshold
    runLength = 0
    inpfile.seek(0)
    for line in inpfile:
        if line[0] != 'M':
            continue
        match = MatchRecord.MatchRecord(line)

        startsNewRun = (previous != None and previous.runid != match.runid)
        if startsNewRun:
            # The previous run never got long enough: reject what was held.
            for short in pending:
                print >>rejectedFile, short
            pending = []
            runLength = match.x_length
        else:
            runLength += match.x_length

        pending.append(match)
        if not (runLength < lengthThreshold):
            # The run is long enough: release everything held so far.
            for held in pending:
                print >>keptFile, held
            pending = []

        previous = match

    rejectedFile.close()
    return keptFile
예제 #9
0
def applyOneKeepMask(inpfile, outfile, keepMaskFile, processFirstAxis):
    """Trim match records to the keep-mask intervals of one axis.

    inpfile          -- match file; lines starting with 'M' are parsed as
                        MatchRecord, every other line is copied through.
    outfile          -- receives the trimmed matches and the copied lines.
    keepMaskFile     -- one interval per line with five whitespace-separated
                        fields: subtype (must be 'C'), axis id, start,
                        length, coverage.  Only coverage == 1 is used.
    processFirstAxis -- true: compare against the x axis of each match;
                        false: compare against the y axis.

    Both inputs are walked in lock step (a merge); this presumably relies
    on both being sorted by axis id and start -- confirm with the callers.
    Each overlap between a match and an interval produces one output match
    trimmed to the intersection, so one input match yields zero, one or
    more outputs.  When one match yields several outputs, the second and
    later ones get "x<n>" or "y<n>" appended to the matchid.

    The loop ends when either iterator raises StopIteration; after that
    only the remaining non-match lines of inpfile are copied.
    """
    # Note that the following merge-like control structure is
    # influenced by the function property of keep intervals to matches.

    debug = 0
    inpfile.seek(0)
    outfile.seek(0)
    keepMaskFile.seek(0)

    # Put the first valid match record into FM.  Each input ATAC match
    # record produces zero, one or more output ATAC matches.
    FM = None
    ma = None  # axis id of the current match
    ms = None  # start of the current match on the processed axis
    me = None  # end (start + length) of the current match
    qa = None  # axis id of the current mask interval
    qs = None  # start of the current mask interval
    ql = None  # length of the current mask interval (qe = qs + ql is set below)

    # the set of masking intervals, using the q variables and iline
    maskiter = iter(keepMaskFile)

    # the set of masked matches using the m variables and mline
    inpiter = iter(inpfile)

    # A value of None means "consumed; fetch the next one".
    iline = None
    mline = None

    last_matchid = None
    subcount = 0

    try:  # StopIteration exception from either iterator gets us out
        while 1:
            if (iline == None):
                iline = maskiter.next()
                (
                    subtype,
                    qa,
                    qs,
                    ql,
                    cov,
                ) = iline.split()
                assert (subtype == 'C')
                cov = int(cov)
                if (cov != 1):
                    # skip intervals whose coverage is not exactly one
                    iline = None
                    continue
                qs = int(qs)
                ql = int(ql)
                qe = qs + ql

            if (mline == None):
                mline = inpiter.next()
                if (mline[0] != 'M'):
                    # not a match record, so just pass it through
                    print >> outfile, mline,
                    mline = None
                    continue
                FM = MatchRecord.MatchRecord(mline)
                assert (FM.subtype == "u" or FM.subtype == "x")
                if (processFirstAxis):
                    ma = FM.x_scaf_uid
                    ms = FM.x_start  # match start
                    me = ms + FM.x_length  # match end
                else:
                    ma = FM.y_scaf_uid
                    ms = FM.y_start  # match start
                    me = ms + FM.y_length  # match end

            # holding valid iline and mline data now

            if not (ma == qa):
                # not on same axis, need to get a new one
                # (advance whichever side has the smaller axis id)
                if (ma < qa):
                    mline = None
                else:
                    iline = None

            elif not ((ms < qe) and (qs < me)):
                # we are not overlapping, need to get a new one of them
                # (advance whichever side starts first)
                if (ms < qs):
                    mline = None
                else:
                    iline = None

            else:
                # processing for overlaps
                FT = FM.copy()
                mx = max(ms, qs)  # start of the intersection
                mn = min(me, qe)  # end of the intersection
                trimFromStart = mx - ms
                trimFromEnd = me - mn
                trimmedLength = mn - mx
                if (FT.x_orientation == FT.y_orientation):
                    # same orientation: both axes lose the same prefix
                    FT.x_start += trimFromStart
                    FT.y_start += trimFromStart
                else:
                    # opposite orientation: the prefix trimmed on the
                    # processed axis is a suffix on the other axis
                    if (processFirstAxis):
                        FT.x_start += trimFromStart
                        FT.y_start += trimFromEnd
                    else:
                        FT.y_start += trimFromStart
                        FT.x_start += trimFromEnd
                FT.x_length = trimmedLength
                FT.y_length = trimmedLength
                if debug:
                    print >> sys.stdout, "# trimmed "
                    print >> sys.stdout, FT

                # We must ensure that the match identifier is still unique.
                if last_matchid == FM.matchid:
                    subcount += 1
                else:
                    subcount = 0
                # print >>sys.stderr, last_matchid, FM.matchid, subcount
                last_matchid = FM.matchid

                if (subcount > 0):
                    if processFirstAxis:
                        FT.matchid = FT.matchid + "x" + str(subcount)
                    else:
                        FT.matchid = FT.matchid + "y" + str(subcount)

                print >> outfile, FT
                # we need to get a new one
                # (advance whichever side ends first)
                if (qe < me):
                    iline = None
                else:
                    mline = None

    except StopIteration:
        # If there are any left over non-match lines, then output them!
        # Remaining match records are not written.
        for mline in inpiter:
            if (mline[0] != "M"):
                print >> outfile, mline,
예제 #10
0
def _dumpDebugFile(srcfile, debugnum):
    "Copy srcfile, from its beginning, into debugfile.<debugnum>."
    srcfile.seek(0)
    debugfile = open("debugfile.%d" % debugnum, "w")
    try:
        for line in srcfile:
            print >> debugfile, line,
    finally:
        # The dump file is always closed, even if the copy fails.
        debugfile.close()


def applyBothKeepMasks(inpfile, outfile):
    """Apply the keep mask of the first axis, then of the second, to inpfile.

    For each axis in turn:
      1. findCoverageIntervals() writes the keep-mask intervals for that
         axis into a scratch file;
      2. the matches are sorted along that axis;
      3. applyOneKeepMask() trims the sorted matches against the mask.

    The first pass reads inpfile and writes a scratch file; the second
    pass reads that scratch file and writes outfile.

    When the local `debug` flag is set, the mask, the sorted matches and
    the masked matches of each pass are dumped to debugfile.1 ..
    debugfile.6 in the current directory.
    """
    # Maybe we can think of a masking implementation where each ATAC match
    # is treated atomically.  Assume that the keep mask intervals are sorted
    # by start position.  Assume that the ATAC matches are sorted by start
    # position.  Assert that all keep mask intervals are non-overlapping and
    # were cut from only one ATAC match.  Thus the mapping from keep mask
    # intervals is a function.  Note that this requires that we do not
    # coalesce abutting keep mask intervals that originate from multiple
    # matches.  Note this still allows an ATAC match to overlap more than
    # one keep mask interval.  Ignore all keep mask intervals with zero
    # length; their creation has tie breaking problems.  See notes on 2003
    # Jul 29.

    debug = 0
    debugnum = 0
    inpfile.seek(0)
    outfile.seek(0)

    # Output of the first-axis pass, input of the second-axis pass.
    firstAxisMasked = MyFile.myfile()

    # (processFirstAxis, sorter, matches to sort, destination of the pass)
    passes = (
        (1, MatchRecord.sortInXorderAP, inpfile, firstAxisMasked),
        (0, MatchRecord.sortInYorderAP, firstAxisMasked, outfile),
    )

    for (processFirstAxis, sortInAxisOrder, srcfile, dstfile) in passes:
        keepMaskFile = MyFile.myfile()
        sortedfile = MyFile.myfile()

        # Make the keep mask intervals for this axis.
        # NOTE(review): both passes derive the mask from the original
        # inpfile, not from the output of the first pass -- this matches
        # the original code; confirm it is intended.
        findCoverageIntervals(inpfile, keepMaskFile, processFirstAxis)
        if debug:
            debugnum += 1
            _dumpDebugFile(keepMaskFile, debugnum)

        sortInAxisOrder(srcfile, sortedfile)
        if debug:
            debugnum += 1
            _dumpDebugFile(sortedfile, debugnum)

        # applyOneKeepMask is called after any dump, so a dump that leaves
        # its source at end-of-file relies on the callee rewinding it.
        applyOneKeepMask(sortedfile, dstfile, keepMaskFile, processFirstAxis)
        if debug:
            debugnum += 1
            _dumpDebugFile(dstfile, debugnum)
예제 #11
0
def trimMatchOverlapsInY(inpfile, outfile, trim_subtype):
    """Trim the match overlaps with respect to the Y assembly.

    Lines that are not match ('M') records, and match records whose
    subtype differs from trim_subtype, are copied to outfile unchanged.
    The remaining matches are scanned in input order with a "picket": the
    Y end position of the last match written on the current Y scaffold.

      * A match that ends at or before the picket is dropped (contained).
      * A match that starts before the picket has its overlapping part cut
        off: y_start moves up to the picket and both lengths shrink;
        x_start moves too when x and y orientations are equal.
      * Otherwise the match is written unchanged.

    The picket restarts at 0 for the first match and whenever the Y
    scaffold changes.  Counts of positive gaps, abutments, overlaps,
    contained matches and trimmed bases are reported on stderr.
    """
    nPosGaps = 0
    nAbuts = 0
    nOverlaps = 0
    nContained = 0
    bpTrimmed = 0
    left = None

    # For each genomic axis we scan left to right, annihilating any part
    # of the current match that lies to the left of this picket position.
    picket = 0

    inpfile.seek(0)
    for line in iter(inpfile):
        if (line[0] != 'M'):
            print >> outfile, line,
            continue

        right = MatchRecord.MatchRecord(line)
        if (right.subtype != trim_subtype):
            print >> outfile, line,
            continue

        if (left == None or left.y_scaf_uid != right.y_scaf_uid):
            # First match, or a new Y scaffold: nothing to trim against.
            picket = 0
        else:
            assert (left != None)
            assert (right != None)
            if (left.y_scaf_uid > right.y_scaf_uid):
                print >> sys.stderr, "sequence ids out of y sorted order"
                print >> sys.stderr, left
                print >> sys.stderr, right
            assert (left.subtype == right.subtype)
            assert (left.y_scaf_uid == right.y_scaf_uid)
            if (not (left.y_start <= right.y_start)):
                # Reported but deliberately not fatal.
                print >> sys.stderr, "trimMatchOverlapsInY: Woops not sorted anymore!"
                print >> sys.stderr, left
                print >> sys.stderr, right

            matchBgn = right.y_start
            matchEnd = matchBgn + right.y_length
            if (not (picket < matchEnd)):
                # The picketed region contains this match: remove it.
                nContained += 1
                continue

            gaplen = matchBgn - picket
            if (gaplen > 0):
                nPosGaps += 1
            elif (gaplen == 0):
                nAbuts += 1
            elif (gaplen < 0):
                overlap = -gaplen
                nOverlaps += 1
                bpTrimmed += overlap
                # Modify the match: cut `overlap` bases off its Y start.
                right.y_start += overlap
                right.y_length -= overlap
                right.x_length -= overlap
                if (right.x_orientation == right.y_orientation):
                    right.x_start += overlap

        print >> outfile, right
        newpicket = right.y_start + right.y_length
        assert (picket < newpicket)
        picket = newpicket
        left = right

    print >> sys.stderr, "trimMatchOverlapsInY:\n",
    print >>sys.stderr, "#posgaps, #abuts, #overlaps, #contained, bp_trimmed= %d %d %d %d %d\n" \
          % (nPosGaps, nAbuts, nOverlaps, nContained, bpTrimmed, )
    return
예제 #12
0
def coalesceMatches(inpfile, outfile, needs_to_share_diagonal):
    """Coalesce overlapping and abutting matches within the same run.

    Match ('M') records are read in input order and accumulated into a
    bounding box (lowHitPX..hghHitPX by lowHitPY..hghHitPY).  A record is
    merged into the current box when it is on the same x and y scaffolds
    as the previous record and one of its two corner points lies inside
    the box; if needs_to_share_diagonal is true it must additionally have
    the same orientation as, and lie on the same diagonal as, the previous
    record.  Otherwise the current box is written out and a new one is
    started from this record.

    Each box is written by re-using the first record that opened it, with
    its start/length replaced by the box and its subtype set to 'u' when
    needs_to_share_diagonal is true (index 1) or 'g' when false (index 0);
    needs_to_share_diagonal must therefore be usable as a tuple index.

    Non-match lines are not copied to outfile.
    """

    firstF = None  # record that opened the current box; re-used for output
    lastF = None   # previous match record

    # Left corner and orientation of the previous record.  The initial
    # values are placeholders used before any record has been seen.
    lastLX = -3
    lastLY = -4
    lastForward = 0

    # Bounding box accumulated so far.
    lowHitPX = None
    lowHitPY = None
    hghHitPX = None
    hghHitPY = None
    inpfile.seek(0)
    outfile.seek(0)
    for line in inpfile:
        if (line[0] == 'M'):
            curF = MatchRecord.MatchRecord(line)
            px = curF.x_start
            nx = curF.x_length
            py = curF.y_start
            ny = curF.y_length
            assert (px >= 0)
            assert (nx >= 0)
            assert (py >= 0)
            assert (ny >= 0)
            # Diagnostics for a record that cannot lie on a diagonal
            # (different x and y lengths) although one is required.
            if (not (not needs_to_share_diagonal or nx == ny)):
                print >> sys.stderr, 'Bombed on:'
                print >> sys.stderr, str(curF)
                print >> sys.stderr, 'needs_to_share_diagonal=' + str(
                    needs_to_share_diagonal)
                print >> sys.stderr, 'nx=' + str(nx) + '  ny=' + str(ny)
            # end if
            # Not enforced for the very first record (no box yet).
            assert ((hghHitPX == None or (not needs_to_share_diagonal)
                     or nx == ny))
            forward = (curF.x_orientation == curF.y_orientation)
            # (lx, ly) and (rx, ry) are the two corners of the match; cvm
            # is defined elsewhere -- presumably cvm(c, a, b) yields a when
            # c is true and b otherwise (TODO confirm).
            lx = px
            ly = cvm(forward, py, py + ny)
            rx = px + nx
            ry = cvm(forward, py + ny, py)

            # True when either corner falls inside the current box on the
            # same pair of scaffolds as the previous record.
            overlapping = ((lastF != None)
                           and (curF.x_scaf_uid == lastF.x_scaf_uid)
                           and (curF.y_scaf_uid == lastF.y_scaf_uid)
                           and (((lx >= lowHitPX and lx <= hghHitPX) and
                                 (ly >= lowHitPY and ly <= hghHitPY)) or
                                ((rx >= lowHitPX and rx <= hghHitPX) and
                                 (ry >= lowHitPY and ry <= hghHitPY))))
            # True when the left corners of this and the previous record
            # are offset equally in x and (orientation-signed) y.
            on_diagonal = ((forward == lastForward)
                           and ((lx - lastLX)
                                == ((ly - lastLY) * cvm(forward, 1, -1))))
            # print >>sys.stdout, lastF, curF
            # print >>sys.stdout, lx,rx,ly,ry
            # print >>sys.stdout, lowHitPX,hghHitPX,lowHitPY,hghHitPY
            # print >>sys.stdout, "overlapping=",overlapping
            # print >>sys.stdout, "on_diagonal=",on_diagonal

            # Extent of the current record on both axes.
            lowMerPX = px
            lowMerPY = py
            hghMerPX = px + nx
            hghMerPY = py + ny
            if (not (overlapping and
                     (not needs_to_share_diagonal or on_diagonal))):
                # Cannot merge: flush the finished box, then start a new
                # one from the current record.
                if (firstF != None):
                    # if (lastF == None or firstF.runid != lastF.runid):
                    # end if
                    firstF.subtype = ('g', 'u')[needs_to_share_diagonal]
                    firstF.x_start = lowHitPX
                    firstF.y_start = lowHitPY
                    firstF.x_length = hghHitPX - lowHitPX
                    firstF.y_length = hghHitPY - lowHitPY
                    print >> outfile, firstF
                # end if
                firstF = curF
                lowHitPX = lowMerPX
                lowHitPY = lowMerPY
                hghHitPX = hghMerPX
                hghHitPY = hghMerPY
            # end if
            # Grow the box to include the current record (a no-op right
            # after the box was restarted from it).
            lowHitPX = cvm(lowHitPX < lowMerPX, lowHitPX, lowMerPX)
            lowHitPY = cvm(lowHitPY < lowMerPY, lowHitPY, lowMerPY)
            hghHitPX = cvm(hghHitPX > hghMerPX, hghHitPX, hghMerPX)
            hghHitPY = cvm(hghHitPY > hghMerPY, hghHitPY, hghMerPY)

            lastLX = lx
            lastLY = ly
            lastForward = forward
            lastF = curF
        # end if
    # end for

    # Flush the last box, if any record was seen at all.
    if (firstF != None):
        firstF.subtype = ('g', 'u')[needs_to_share_diagonal]
        firstF.x_start = lowHitPX
        firstF.y_start = lowHitPY
        firstF.x_length = hghHitPX - lowHitPX
        firstF.y_length = hghHitPY - lowHitPY
        print >> outfile, firstF

    return
예제 #13
0
def findPerfectRuns(inpfile, maxJump, runIdPrefix):
    """Assign run ids to consecutive matches that form perfect runs.

    Each match ('M') record must carry a signed rank in extend['srank'];
    the entry is read as an int and then removed from the record.  A new
    run starts at a record when, compared with the previous record,

      * the x scaffold or the y scaffold differs, or
      * the larger of the x gap and the y gap exceeds maxJump, or
      * its signed rank is not the previous signed rank plus one.

    Every record gets runid = runIdPrefix + <run number, starting at 1>
    and is written to the returned MyFile.myfile().  Non-match lines are
    dropped.

    For records on the same scaffolds, sorting and containment problems
    are reported on stderr; a pair that is sorted in neither x nor y
    fails an assertion, as do zero-length matches.
    """
    outfile = MyFile.myfile()
    left = None
    lastpr = None  # signed rank of the previous record
    runid = 1
    inpfile.seek(0)
    for line in inpfile:
        if (line[0] != 'M'):
            continue
        right = MatchRecord.MatchRecord(line)
        pr = int(right.extend['srank'])
        del (right.extend['srank'])

        if (left is not None):
            sameScaffolds = (left.x_scaf_uid == right.x_scaf_uid
                             and left.y_scaf_uid == right.y_scaf_uid)
            maxGapInXandY = 0
            if sameScaffolds:
                # Find the maximum of the gap in x and y axis.
                x_rs = right.x_start
                x_re = x_rs + right.x_length
                x_ls = left.x_start
                x_le = x_ls + left.x_length
                # All matches are positive length.
                assert (x_rs < x_re)
                assert (x_ls < x_le)
                x_gapLeftBeforeRight = x_rs - x_le
                x_gapRightBeforeLeft = x_ls - x_re
                assert (not (x_gapLeftBeforeRight > 0
                             and x_gapRightBeforeLeft > 0))
                # x_gap == 0 is abutting, x_gap < 0 is overlapping
                x_gap = max(x_gapLeftBeforeRight, x_gapRightBeforeLeft)

                y_rs = right.y_start
                y_re = y_rs + right.y_length
                y_ls = left.y_start
                y_le = y_ls + left.y_length
                assert (y_rs < y_re)
                assert (y_ls < y_le)
                y_gapLeftBeforeRight = y_rs - y_le
                y_gapRightBeforeLeft = y_ls - y_re
                assert (not (y_gapLeftBeforeRight > 0
                             and y_gapRightBeforeLeft > 0))
                # y_gap == 0 is abutting, y_gap < 0 is overlapping
                y_gap = max(y_gapLeftBeforeRight, y_gapRightBeforeLeft)

                maxGapInXandY = max(x_gap, y_gap)

                # Check the sorting of the matches.
                sorted_by_x = (x_ls <= x_rs)
                sorted_by_y = (y_ls <= y_rs)
                if (not (sorted_by_x or sorted_by_y)):
                    print >> sys.stderr, "bad sorting in findPerfectRuns"
                    print >> sys.stderr, left
                    print >> sys.stderr, right
                assert (sorted_by_x or sorted_by_y)

                # A pair dovetails when both the start and the END of the
                # left match precede those of the right match.  (The y
                # test used to compare y_ls with y_re, which is always
                # true for a sorted pair, so y containment was never
                # reported.)
                dovetail_in_x = (x_ls <= x_rs) and (x_le <= x_re)
                dovetail_in_y = (y_ls <= y_rs) and (y_le <= y_re)
                if (sorted_by_x and not (dovetail_in_x)):
                    print >> sys.stderr, "contained in x in findPerfectRuns"
                    print >> sys.stderr, left
                    print >> sys.stderr, right
                if (sorted_by_y and not (dovetail_in_y)):
                    print >> sys.stderr, "contained in y in findPerfectRuns"
                    print >> sys.stderr, left
                    print >> sys.stderr, right

            if ((left.x_scaf_uid != right.x_scaf_uid)  # check first axis id
                    or (left.y_scaf_uid != right.y_scaf_uid)  # second axis id
                    or (maxGapInXandY > maxJump)
                    # Using the signed rank NOT the run id !!!!
                    or (pr != lastpr + 1)):
                runid += 1

        lastpr = pr
        # Assign the run id in the same slot as the signed rank.
        right.runid = "%s%d" % (runIdPrefix, runid)
        print >> outfile, right
        left = right
    return outfile
예제 #14
0
def _emitRunAsMatch(firstF, lastF, runFill, outfile):
    """Print one run to outfile as a single match record of subtype 'r'.

    firstF and lastF are the first and last match records of the run (they
    are the same object when the run has a single match).  runFill is the
    sum of the x_length values of the matches in the run.

    lastF is rewritten in place and then printed, so every coordinate is
    read from the two records before any field is assigned.
    """
    # cvm is defined elsewhere in this file.  From its use here it presumably
    # returns its second argument when the condition holds and its third
    # otherwise (making startX/startY minima and endX/endY maxima) -- TODO
    # confirm against its definition.
    x1 = firstF.x_start
    x2 = lastF.x_start
    startX = cvm(x1 < x2, x1, x2)
    x1 += firstF.x_length
    x2 += lastF.x_length
    endX = cvm(x1 > x2, x1, x2)

    y1 = firstF.y_start
    y2 = lastF.y_start
    startY = cvm(y1 < y2, y1, y2)
    y1 += firstF.y_length
    y2 += lastF.y_length
    endY = cvm(y1 > y2, y1, y2)

    lastF.subtype = 'r'
    lastF.matchid = lastF.runid  # The run id becomes the id of the new match.
    lastF.runid = "."  # the agreed NULL value
    lastF.x_start = startX
    lastF.y_start = startY
    lastF.x_length = endX - startX
    lastF.y_length = endY - startY
    lastF.runFill = runFill
    print >> outfile, lastF


def runsAsMatches(inpfile):
    """Summarize every run found in inpfile as one match record.

    Only lines starting with 'M' are read.  Consecutive records sharing the
    same runid form a run; each run is written to a fresh MyFile.myfile()
    as a single record built from its first and last member (see
    _emitRunAsMatch).

    Returns the new file object holding one record per run.

    Raises AssertionError when the first and last record of a run that is
    closed inside the loop disagree on x_scaf_uid or y_scaf_uid.  The final
    run of the file is emitted without that check, as before.
    """
    outfile = MyFile.myfile()
    lastF = None
    firstF = None
    runFill = 0
    inpfile.seek(0)
    for line in inpfile:
        if line[0] != 'M':
            continue
        curF = MatchRecord.MatchRecord(line)
        if lastF is None or curF.runid != lastF.runid:
            # curF opens a new run; close the previous one, if any.
            if lastF is not None:
                sameX = (firstF.x_scaf_uid == lastF.x_scaf_uid)
                sameY = (firstF.y_scaf_uid == lastF.y_scaf_uid)
                if not (sameX and sameY):
                    # Show the offending pair before the assertion fires.
                    print >> sys.stderr, firstF
                    print >> sys.stderr, lastF
                assert sameX
                assert sameY
                _emitRunAsMatch(firstF, lastF, runFill, outfile)
            firstF = curF
            runFill = 0
        runFill += curF.x_length
        lastF = curF

    # The last run is still open once the input is exhausted.
    if lastF is not None:
        _emitRunAsMatch(firstF, lastF, runFill, outfile)
    return outfile
예제 #15
0
    def runOld(self):
        """Run the step-by-step chaining pipeline tagged algorithm version 17.

        Each stage reads self.matches, replaces it with the stage output,
        appends a suffix to the running prefix (outprefix, seeded from
        self.runName) and, for most stages, calls self.checkpoint(outprefix).
        The runs-as-matches stage stores its result in self.runs instead.

        Settings read from self.globals:
          required: globalMatchMinSize, globalPerfectRunMinLen,
                    globalPerfectRunMaxGapLen, assemblyId1, assemblyId2,
                    assemblyFile1, assemblyFile2
          optional: boxRecoveryOn, ckpKeep, uniqueFilterOn,
                    fillIntraRunGapsOn, fillIntraRunGapsErate,
                    fillIntraRunGapsMaxGap, and the presence of "AllDone"

        Stage gating: a stage runs when an earlier stage already ran in this
        call (redo is set), or when its step number is greater than ckpKeep
        and self.globals has no "AllDone" key.
        """
        self.globals['atacAlgorithmVersion'] = str(17)
        print >>STDERR, "runName = %s\n" % self.runName

        # The ATAC globals used by this script:
        opt_t = int(self.globals['globalMatchMinSize'])
        opt_l = int(self.globals['globalPerfectRunMinLen'])
        maxdiff = int(self.globals['globalPerfectRunMaxGapLen'])

        assemblyId1 = self.globals['assemblyId1']
        assemblyId2 = self.globals['assemblyId2']

        assemblyFile1 = self.globals['assemblyFile1']
        assemblyFile2 = self.globals['assemblyFile2']

        boxRecoveryOn = 0  # Deprecated for same species comparisons 2003/09/09.
        if(self.globals.has_key("boxRecoveryOn")):
            boxRecoveryOn = int(self.globals['boxRecoveryOn'])
            
        # Reference time for the 'Time elapsed' messages below.
        t0 = time.time()

        assemblyIdx1 = IdxStore.IdxStore(assemblyFile1,assemblyId1)
        assemblyIdx2 = IdxStore.IdxStore(assemblyFile2,assemblyId2)
        rawfile = None
        
        ###################################################################
        # Setup for checkpointing scheme.
        # redo: set to 1 by the first stage that runs, forcing all later ones.
        # keep: stages numbered <= keep are skipped until redo is set.
        # step: running stage counter, incremented before every stage.
        redo = 0
        keep = 0
        step = 0
        if(self.globals.has_key("ckpKeep")):
            keep = int(self.globals['ckpKeep'])
        # While this key is present in the globals no stage starts on its own.
        ckpName = "AllDone"
        ###################################################################

        print >>STDERR, 'Keep step=' + str(keep)
        print >>STDERR, 'At step=' + str(step)
        print >>STDERR, 'Time elapsed=' + str(time.time()-t0)

        outprefix = self.runName

        step += 1
        print >>STDERR, 'At uniqueFilter, step=' + str(step)
        print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
        if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
            redo = 1
            # The filter is skipped only when uniqueFilterOn is the string "0";
            # redo is set either way.
            if(not(self.globals.has_key('uniqueFilterOn') and self.globals['uniqueFilterOn']=="0")):
                print >>STDERR, 'Running UniqueFilter'
                outfile = MyFile.myfile()
                UniqueFilter.main( self.matches, outfile)
                self.matches = outfile
                outprefix += '.uniq'
                self.checkpoint(outprefix)

        step += 1
        print >>STDERR, 'At filterByMatchLength, step=' + str(step)
        print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
        if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
            redo = 1
            print >>STDERR, 'Running filterByMatchLength'
            outfile = MyFile.myfile()
            filterByMatchLength( self.matches, outfile, opt_t)
            self.matches = outfile
            outprefix += '.t' + str(opt_t)
            self.checkpoint(outprefix)

        step += 1
        print >>STDERR, 'At trimMatchOverlaps, step=' + str(step)
        print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
        if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
            redo = 1
            print >>STDERR, "Start trimming for bp one-to-one-ness"
            tempdata = MyFile.myfile()
            TrimMatchOverlaps.trimMatchOverlapsInBoth(self.matches,tempdata,'u')
            self.matches = tempdata
            print >>STDERR, "Finished trimming for bp one-to-one-ness"
            outprefix += '.trim'
            self.checkpoint(outprefix)

        if( boxRecoveryOn == 1 ):
            # For box recovery later ... but what if we start from a checkpoint?
            # The matches as they stand at this point are the "raw" input of
            # the box recovery stage further down.
            rawfile = self.matches

        step += 1
        print >>STDERR, 'At formPerfectRuns, step=' + str(step)
        print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
        if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
            redo = 1
            print >>STDERR, 'from ' + outprefix + ' making ' + outprefix + '.p6'
            tempdata = PerfectRuns.formPerfectRuns(self.matches,
                                                   MatchRecord.sortInXorderAP,
                                                   MatchRecord.sortInYorderAP,
                                                   maxdiff,
                                                   'r')
            self.matches = tempdata
            outprefix += ".p6"
            # NOTE(review): unlike most stages this one does not call
            # self.checkpoint -- confirm that is intentional.
        # end if

        step += 1
        print >>STDERR, 'At onlyKeepLongRuns, step=' + str(step)
        print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
        if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
            redo = 1
            print >>STDERR, 'from ' + outprefix + ' making ' + outprefix + '.l' + str(opt_l)
            tempdata = onlyKeepLongRuns( self.matches, outprefix, opt_l)
            self.matches = tempdata
            outprefix += '.l' + str(opt_l)
            self.checkpoint(outprefix)

        step += 1
        print >>STDERR, 'At formPerfectRuns, step=' + str(step) 
        print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
        if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
            redo = 1
            print >>STDERR, 'Heal the perfect runs'
            # Same call as the earlier formPerfectRuns stage, but with the two
            # sort functions passed in the opposite order (Y first, then X).
            tempdata = PerfectRuns.formPerfectRuns(self.matches,
                                       MatchRecord.sortInYorderAP,
                                       MatchRecord.sortInXorderAP, maxdiff, 'r')
            self.matches = tempdata
            outprefix += '.pr'
            self.checkpoint(outprefix)

        if(boxRecoveryOn == 1): 

            # This is a box recovery step.
            step += 1
            print >>STDERR, 'At boxRecovery, step=' + str(step) 
            print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
            if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
                redo = 1
                print >>STDERR, 'from ' + outprefix + ' making ' + outprefix + '.br'
                print >>STDERR, "Make sorted raw matches"
                # rawfile was captured after the first trimming stage above.
                outfile = MyFile.myfile()
                MatchRecord.sortInXorderAP( rawfile, outfile)
                rawfile = outfile
                print >>STDERR, "perform box recovery"
                tempdata = boxRecovery( self.matches, rawfile, outprefix)
                self.matches = tempdata
                outprefix += '.br'
                self.checkpoint(outprefix)
            # end if

            step += 1
            print >>STDERR, 'At formPerfectRuns, step=' + str(step)
            print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
            if (redo or ( (keep < step) and not self.globals.has_key(ckpName))):
                print >>STDERR, "form perfect runs"
                redo = 1
                # NOTE(review): the message below announces a '.p6' suffix but
                # the suffix actually appended further down is '.pr'.
                print >>STDERR, 'from ' + outprefix + ' to ' + outprefix + '.p6'
                tempdata = PerfectRuns.formPerfectRuns(self.matches,
                                       MatchRecord.sortInXorderAP,
                                       MatchRecord.sortInYorderAP, maxdiff, 'r')
                self.matches = tempdata
                outprefix += '.pr'
                self.checkpoint(outprefix)

        step += 1
        print >>STDERR, 'At squeezeIntraRunGaps, step=' + str(step)
        print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
        if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
            redo = 1
            print >>STDERR, 'from ' + outprefix + ' to ' + outprefix + '.sq'
            tempdata = MyFile.myfile()
            squeezeIntraRunGaps.mainLoop(
                self.matches,
                tempdata,
                assemblyIdx1, assemblyIdx2)
            tempy = MyFile.myfile()
            # Beware the current match subtypes are 'x', 'L', and 'R'!
            coalesceMatches( tempdata, tempy, 1)
            self.matches = tempy
            outprefix += '.sq'
            self.checkpoint(outprefix)

        step += 1
        print >>STDERR, 'At TrimMatchOverlaps, step=' + str(step)
        print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
        if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
            redo = 1
            print >>STDERR, "Start trimming for bp one-to-one-ness"
            tempdata = MyFile.myfile()
            TrimMatchOverlaps.trimMatchOverlapsInBoth(self.matches,tempdata,'u')
            self.matches = tempdata
            outprefix += '.trim'
            # NOTE(review): no self.checkpoint call after this trimming stage,
            # unlike the first trimming stage -- confirm that is intentional.
            print >>STDERR, "Finished trimming for bp one-to-one-ness"

        step += 1
        print >>STDERR, 'At RunsAsMatches, step=' + str(step)
        print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
        if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
            redo = 1
            # The run summary goes to self.runs; self.matches is left as is.
            self.runs = PerfectRuns.runsAsMatches( self.matches)
            outprefix += '.runs'
            self.checkpoint(outprefix)
        # end if

        # The remaining two stages run only when fillIntraRunGapsOn is the
        # string "1".
        if(self.globals.has_key('fillIntraRunGapsOn') and self.globals['fillIntraRunGapsOn']=="1" ):
        
            # Next comes the DNA sequence dependent stuff.
            step += 1
            print >>STDERR, 'At fillIntraRunGaps, step=' + str(step)
            print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
            if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
                redo = 1
                print >>STDERR, "fill the intrarun gaps"
                # Missing settings are written back into self.globals with
                # their default values before being read.
                if(not self.globals.has_key('fillIntraRunGapsErate')):
                    self.globals['fillIntraRunGapsErate'] = 0.10
                if(not self.globals.has_key('fillIntraRunGapsMaxGap')):
                    self.globals['fillIntraRunGapsMaxGap'] = 100000
                fillIntraRunGapsErate = float(self.globals['fillIntraRunGapsErate'])
                fillIntraRunGapsMaxGap = int(self.globals['fillIntraRunGapsMaxGap'])
                tempdata = MyFile.myfile()
                fillIntraRunGaps.mainLoop(self.matches, tempdata,
                                          assemblyIdx1, assemblyIdx2,
                                          fillIntraRunGapsMaxGap, fillIntraRunGapsErate)
                self.matches = tempdata
                outprefix += '.fill'
                self.checkpoint(outprefix)

            step += 1
            print >>STDERR, 'At TrimMatchOverlaps, step=' + str(step)
            print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
            if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
                redo = 1
                print >>STDERR, "trim the overlaps"
                tempdata = MyFile.myfile()
                TrimMatchOverlaps.trimMatchOverlapsInBoth(self.matches,tempdata,'u')
                self.matches = tempdata
                outprefix += '.trim'
                self.checkpoint(outprefix)
예제 #16
0
def mainLoop(inpfile, outfile, xIdx, yIdx):
    """Copy the match records of inpfile to outfile, analyzing every gap.

    Only lines starting with 'M' are read.  For each pair of consecutive
    records analyzeGap is called with both index stores, the two records,
    outfile, and the fixed gap limit and margin; the six counts it returns
    are accumulated.  Every record is printed to outfile, the second of a
    pair only after analyzeGap has seen it (the original comment notes the
    record may have been trimmed by then).

    A summary of the totals and of several module-level counters is written
    to sys.stderr at the end.  Nothing is returned.
    """
    gapLimit = 100000  # This should be set by an ATAC global.
    gapMargin = 20     # This should be set by an ATAC global.

    recordCount = 0
    gapCountSum = 0
    closedGapSum = 0
    squeezedSum = 0
    xLenSum = 0
    yLenSum = 0
    xNonACGTSum = 0
    yNonACGTSum = 0

    inpfile.seek(0)

    previous = None
    for line in inpfile:
        if line[0] != 'M':
            continue
        current = MatchRecord.MatchRecord(line)

        if previous is not None:
            # Every record but the first closes a gap with its predecessor.
            gapStats = analyzeGap(xIdx, yIdx, previous, current,
                                  outfile, gapLimit, gapMargin)
            (gapCount, squeezed, xLen, yLen, xNotACGT, yNotACGT) = gapStats
            gapCountSum += gapCount
            squeezedSum += squeezed
            xLenSum += xLen
            yLenSum += yLen
            xNonACGTSum += xNotACGT
            yNonACGTSum += yNotACGT
            if xLen == 0 and yLen == 0:
                closedGapSum += 1

        # Output the record which was possibly trimmed.
        print >>outfile, current
        recordCount += 1
        previous = current

    totals = (recordCount, gapCountSum, closedGapSum, squeezedSum,
              xLenSum, yLenSum, xNonACGTSum, yNonACGTSum)
    sys.stderr.write(
        "countLines %d inter_run_gap_count %d closed_gap_count %d squeezed %d x_len %d y_len %d x_nonACGT %d y_nonACGT %d\n" %
        totals)

    # These counters are module-level names maintained outside this function.
    sys.stderr.write("theIsolatedSNPcount = %d\n" % theIsolatedSNPcount)
    sys.stderr.write("completefillednotXY = %d\n" % completefillednotXY)
    sys.stderr.write("completefilledXnotY = %d\n" % completefilledXnotY)
    sys.stderr.write("completefilledYnotX = %d\n" % completefilledYnotX)
    sys.stderr.write("completefilledXandY = %d\n" % completefilledXandY)
예제 #17
0
def applyBothKeepMasks( inpfile, outfile ):
    """Run the keep-mask pass for the first axis, then for the second axis.

    Each pass does three things: findCoverageIntervals fills a new keep
    mask file from inpfile, the matches are sorted (sortInXorderAP for the
    first axis, sortInYorderAP for the second), and applyOneKeepMask is
    called with the sorted matches, a destination file and the mask.  The
    first pass writes to a temporary file that is the input of the second
    pass; the second pass writes to outfile.

    Both inpfile and outfile are rewound with seek(0) on entry.
    """

    # Maybe we can think of a masking implementation where each ATAC match
    # is treated atomically.  Assume that the keep mask intervals are sorted
    # by start position.  Assume that the ATAC matches are sorted by start
    # position.  Assert that all keep mask intervals are non-overlapping and
    # were cut from only one ATAC match.  Thus the mapping from keep mask
    # intervals is a function.  Note that this requires that we do not
    # coalesce abutting keep mask intervals that originate from multiple
    # matches.  Note this still allows an ATAC match to overlap more than
    # one keep mask interval.  Ignore all keep mask intervals with zero
    # length; their creation has tie breaking problems.  See notes on 2003
    # Jul 29.

    # With debug set, intermediate files are dumped to debugfile.<n>.
    # NOTE(review): the debug dump files are opened but never closed here.
    debug = 0
    debugnum = 0
    inpfile.seek(0)
    outfile.seek(0)


    # Apply the keepMask for the first axis.
    # Make the sorted keep mask intervals for the first axis.
    processFirstAxis = 1
    keepMaskFile = MyFile.myfile()
    tmpfile2 = inpfile             # matches to be sorted
    tmpfile3 = MyFile.myfile()     # sorted matches
    tmpfile4 = MyFile.myfile()     # destination of applyOneKeepMask

    findCoverageIntervals( inpfile, keepMaskFile, processFirstAxis)
    if debug:
        debugnum += 1; debugfile = open("debugfile.%d" % debugnum, "w")
        for line in keepMaskFile: print >>debugfile, line,
            
    MatchRecord.sortInXorderAP(tmpfile2,tmpfile3)
    if debug:
        #tmpfile2.seek(0)
        #debugnum += 1; debugfile = open("debugfile.%d" % debugnum, "w")
        #for line in tmpfile2: print >>debugfile, line,
        tmpfile3.seek(0)
        debugnum += 1; debugfile = open("debugfile.%d" % debugnum, "w")
        for line in tmpfile3: print >>debugfile, line,
        
    applyOneKeepMask( tmpfile3, tmpfile4, keepMaskFile, processFirstAxis)
    if debug:
        tmpfile4.seek(0)
        debugnum += 1; debugfile = open("debugfile.%d" % debugnum, "w")
        for line in tmpfile4: print >>debugfile, line,
        
    # Apply the keepMask for the second axis.
    # Make the sorted keep mask intervals for the second axis.
    processFirstAxis = 0
    keepMaskFile = MyFile.myfile()
    tmpfile2 = tmpfile4            # output of the first-axis pass
    tmpfile3 = MyFile.myfile()
    tmpfile4 = outfile             # the second pass writes the final result

    # NOTE(review): the second-axis mask is computed from the original
    # inpfile, not from the first-axis output in tmpfile2 -- confirm that
    # this is intended.
    findCoverageIntervals( inpfile, keepMaskFile, processFirstAxis)
    if debug:
        debugnum += 1; debugfile = open("debugfile.%d" % debugnum, "w")
        for line in keepMaskFile: print >>debugfile, line,


    MatchRecord.sortInYorderAP(tmpfile2,tmpfile3)
    if debug:
        #tmpfile2.seek(0)
        #debugnum += 1; debugfile = open("debugfile.%d" % debugnum, "w")
        #for line in tmpfile2: print >>debugfile, line,
        tmpfile3.seek(0)
        debugnum += 1; debugfile = open("debugfile.%d" % debugnum, "w")
        for line in tmpfile3: print >>debugfile, line,

    applyOneKeepMask( tmpfile3, tmpfile4, keepMaskFile, processFirstAxis)
    if debug:
        tmpfile4.seek(0)
        debugnum += 1; debugfile = open("debugfile.%d" % debugnum, "w")
        for line in tmpfile4: print >>debugfile, line,