Ejemplo n.º 1
0
def GTFinder(folderName, inputfile, mummerPath):
    '''
    Align `inputfile` against reference.fasta and write a per-contig range map.

    Input : folderName (path prefix used for every file), inputfile (query
            fasta name), mummerPath (passed through to the aligner wrapper)
    Output : nothing is returned; folderName + inputfile + "GTMap.json" is
             written with
             GTMap = [ [contigName, [ [start1, end1], [start2, end2], ... ]], ... ]

    Format of one alignment record (as documented by the original author):
      1      765  |    11596    10822  |      765      775  |    84.25  |        scf7180000000702    ref_NC_001133_
    '''
    alignmentName = "groundTruthMatchFixer" + inputfile

    ### Finding the alignment
    alignerRobot.useMummerAlign(mummerPath, folderName, alignmentName,
                                inputfile, "reference.fasta", False, "",
                                False)
    records = alignerRobot.extractMumData(folderName, alignmentName + "Out")

    contigLengths = IORobot.obtainLength(folderName, inputfile)

    ### Parsing the alignment
    # groupby only merges adjacent records, so order by the grouping field
    # (second-to-last column) first.
    byContig = itemgetter(-2)
    records.sort(key=byContig)

    thres = 100
    GTMap = []
    for contigName, group in groupby(records, byContig):
        # Within one contig, order the hits by their first column before
        # handing them to the interval helpers.
        hits = sorted(group, key=itemgetter(0))
        cover = intervalunion.intervalCover(hits, thres)
        GTMap.append([
            contigName,
            intervalunion.reportMisAssemblyIntervals(
                cover, contigLengths[contigName], thres)
        ])

    with open(folderName + inputfile + "GTMap.json", 'w') as outfile:
        json.dump(GTMap, outfile)
Ejemplo n.º 2
0
def generateAssociatedReadDic(folderName):
    '''
    Assign every aligned read to a single contig and dump the mapping.

    Reads the alignment outputs "outAbun01Out", "outAbun02Out", ... (one per
    houseKeeper.globalParallelFileNum), and for each read (last column) keeps
    the record whose field -4 is largest; the contig named in field -2 of that
    record receives the read.  The result,
    { contigName : [readName, ...] } with an entry for every contig of
    improved3.fasta, is written to folderName + "contigToReadsDic.json".
    Nothing is returned.
    '''
    records = []
    for fileIndex in range(1, 1 + houseKeeper.globalParallelFileNum):
        paddedIndex = "%02d" % fileIndex  # two-digit, zero-padded file index
        records.extend(
            alignerRobot.extractMumData(folderName,
                                        "outAbun" + paddedIndex + "Out"))

    byRead = itemgetter(-1)
    records.sort(key=byRead)

    # Every known contig gets an (initially empty) read list.
    contigLengths = IORobot.obtainLength(folderName, "improved3.fasta")
    contigToReadsDic = dict((contigName, []) for contigName in contigLengths)

    for readName, group in groupby(records, byRead):
        # max() keeps the first of several equal maxima, matching a strict
        # ">" scan; a best value that is not positive selects the "" target.
        best = max(group, key=itemgetter(-4))
        target = best[-2] if best[-4] > 0 else ""
        contigToReadsDic[target].append(readName)

    with open(folderName + "contigToReadsDic.json", 'w') as outfile:
        json.dump(contigToReadsDic, outfile)
Ejemplo n.º 3
0
def loadRListDic(folderName):
    '''
    Collect, per contig, the distinct reads aligned near either contig end.

    Reads the alignment outputs "outAbun01Out", "outAbun02Out", ... and groups
    them by field -2.  A record contributes its read name (field -1) when its
    field 0 is below 10000 or its field 1 is above the contig length (from
    improved3.fasta) minus 10000.

    Output : { contigName : list of read names after abunHouseKeeper.getDistinct }
    '''
    endWindow = 10000

    records = []
    for fileIndex in range(1, 1 + houseKeeper.globalParallelFileNum):
        paddedIndex = "%02d" % fileIndex  # two-digit, zero-padded file index
        records.extend(
            alignerRobot.extractMumData(folderName,
                                        "outAbun" + paddedIndex + "Out"))

    byContig = itemgetter(-2)
    records.sort(key=byContig)

    contigLengths = IORobot.obtainLength(folderName, "improved3.fasta")

    RListDic = {}
    for contigName, group in groupby(records, byContig):
        nearEndReads = [
            record[-1] for record in group
            if record[0] < endWindow
            or record[1] > contigLengths[record[-2]] - endWindow
        ]
        RListDic[contigName] = abunHouseKeeper.getDistinct(nearEndReads)

    return RListDic
Ejemplo n.º 4
0
def findGroundTruth(folderName, mummerPath):
    '''
    Derive contig successor pairs from an alignment against the reference.

    Aligns reference.fasta with improved3_Double.fasta, keeps the records that
    isMatch() classifies as forward ('f') or reverse ('r'), orders them along
    each reference (reverse hits are placed on a separate "<refName>_r"
    track with mirrored coordinates) and reports every pair of contigs that
    are adjacent on the same track.

    Format of one alignment record (as documented by the original author):
      1      765  |    11596    10822  |      765      775  |    84.25  | ref_NC_001133_       scf7180000000702

    Output : succList = [ [contigName, nextContigName], ... ]
    '''
    alignerRobot.useMummerAlign(mummerPath, folderName, "groundTruthMatch",
                                "reference.fasta",
                                "improved3_Double.fasta", False, "", False)

    dataList = alignerRobot.extractMumData(folderName,
                                           "groundTruthMatch" + "Out")

    lenDic = IORobot.obtainLength(folderName, "improved3_Double.fasta")
    lenDicRef = IORobot.obtainLength(folderName, "reference.fasta")

    # Format of newList : [refName, refStart, refEnd, contigName]
    # No pre-sorting of dataList is needed: newList is fully sorted below.
    newList = []
    for eachitem in dataList:
        refName, contigName = eachitem[-2], eachitem[-1]
        orientation = isMatch(eachitem, lenDic)  # classify once per record

        if orientation == 'f':
            newList.append([refName, eachitem[0], eachitem[1], contigName])
        elif orientation == 'r':
            refLen = lenDicRef[refName]
            # Mirror the interval onto the reverse track: the start comes
            # from the original end and the end from the original start.
            # (Previously both ends were computed from eachitem[1], which
            # collapsed the interval to zero length.)
            newList.append([
                refName + "_r", refLen - eachitem[1], refLen - eachitem[0],
                contigName
            ])

    newList.sort()

    succList = []
    for _, items in groupby(newList, itemgetter(0)):
        tmpList = list(items)
        # Consecutive hits on the same track are successor pairs.
        for current, following in zip(tmpList, tmpList[1:]):
            succList.append([current[-1], following[-1]])

    return succList
Ejemplo n.º 5
0
def computeScore(folderName, eachMatching, lambdas, interiors, readMatching,
                 constants, isDebug, mummerLink):
    '''
	Input : lambdas, interiors, readMatching, constants
	Output : score \in real
	Algorithm : 
	1) Compute the total edits scores (from interiors, readMatching = [read2templateDic, template2readDic], and RList.fasta)
	2) Compute the abundance scores (from lambda)
	3) Combine them to give the final score 
	'''
    score = 0
    # 1)
    editScore = 0
    q = 0.01

    if not isDebug:

        #a) Perform an alignment and parse the results
        alignerRobot.useMummerAlign(mummerLink, folderName, "interiorAnchor",
                                    "RList_Double.fasta", "interiors.fasta",
                                    False, "", False)
        readAnchorDic = {}
        dataList = alignerRobot.extractMumData(folderName,
                                               "interiorAnchor" + "Out")
        thres = 30
        dataList.sort(key=itemgetter(-2))
        for key, items in groupby(dataList, itemgetter(-2)):
            maxMatch = 0
            for eachitem in items:
                if key in readMatching[0][key] and len(
                        readMatching[0]
                    [key]) > 0 and eachitem[-1] == readMatching[0][key][0]:
                    if eachitem[4] > maxMatch:
                        maxMatch = eachitem[4]
                        readAnchorDic[key] = [
                            eachitem[0], eachitem[1], eachitem[2], eachitem[3]
                        ]

        with open(folderName + "readAnchorDic.json", 'w') as outfile:
            json.dump(readAnchorDic, outfile)

    else:
        readAnchorDic = readInJSON(folderName, "readAnchorDic.json")

    #b) Perform careful edit distance computation
    interiorsDic = IORobot.loadContigsFromFile(folderName, "interiors.fasta")
    readsDic = IORobot.loadContigsFromFile(folderName, "RList_Double.fasta")

    for i in range(len(interiors)):
        tmpScore = 0
        for eachitem in readMatching[1]["Segkk" + str(i)]:
            readName = eachitem[-2]
            #print readName
            if readName in readAnchorDic:
                readStart, readEnd, templateStart, templateEnd = readAnchorDic[
                    readName]
                tmpScore += Levenshtein.distance(
                    readsDic[readName][readStart - 1:readEnd],
                    interiorsDic["Segkk" + str(i)][templateStart -
                                                   1:templateEnd])

        editScore += math.log(1.0 * q / (1 - 2 * q)) * tmpScore

    # 2)
    ### Need to correct the errors

    NiList = []

    internalReads = []
    for i in range(len(lambdas)):
        internalReads += readMatching[1]["Segkk" + str(i)][0]

    internalReadsSet = set(internalReads)

    contigToReadsDic = readInJSON(folderName, "contigToReadsDic.json")

    for i in range(len(eachMatching)):
        leftContig, rightContig = convertName(eachMatching[i][0]), convertName(
            eachMatching[i][1])
        Ni = len(set(contigToReadsDic[leftContig]) - internalReadsSet) + \
          len(set(contigToReadsDic[rightContig]) - internalReadsSet) + \
          len(readMatching[1]["Segkk" + str(i)])
        NiList.append(Ni)

    LiList = []
    contigsDic = IORobot.loadContigsFromFile(folderName,
                                             "improved3_Double.fasta")

    for i in range(len(eachMatching)):
        leftContig, rightContig = eachMatching[i][0], eachMatching[i][1]
        left, middle, right = contigsDic[leftContig], interiorsDic[
            "Segkk" + str(i)], contigsDic[rightContig]
        totalLen = len(left) + len(middle) + len(right)
        overlap = IORobot.align(left, middle, folderName, mummerLink)
        totalLen += overlap[0]
        overlap = IORobot.align(middle, right, folderName, mummerLink)
        totalLen += overlap[0]
        LiList.append(totalLen)

    abunScore = 0

    for i in range(len(lambdas)):
        abunScore += math.log(lambdas[i] / LiList[i]) * NiList[i]

    # 3)
    score = editScore + abunScore
    print score, editScore, abunScore, lambdas

    return score
Ejemplo n.º 6
0
def preparation(folderName):
    '''
	Prepare RList.fasta, contigLeft.json, contigRight.json, intermediate.fasta
	This step will not be needed in production as it should be automatically given or will follow a different logic of generation	
	'''

    CLeftList, CRightList = [], []
    RList = []
    templateList = []

    contigReadGraph = "phaseStringGraph1"
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)

    lenDic = IORobot.obtainLength(folderName, "improved3_Double.fasta")
    N1 = len(lenDic)

    kthres, edgeThres = 3, 1

    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)

    if True:
        adj = [[] for i in range(N1)]

        for i in range(N1):
            tmpList = abunGraphLib.findAllReachable(i, N1, G)

            for j in tmpList:
                if len(abunGraphLib.findAllPathK(i, j, G,
                                                 kthres)) >= edgeThres:
                    adj[i].append(j)

        print adj

    if True:
        CLeftList, CRightList = [0, 6], [4, 8]
        RList = []
        templateList = []

        numberOfFiles = houseKeeper.globalParallelFileNum
        dataList = []
        for i in range(1, 1 + numberOfFiles):
            if i < 10:
                indexOfMum = "0" + str(i)
            else:
                indexOfMum = str(i)

            dataList = dataList + alignerRobot.extractMumData(
                folderName, "outAbun" + str(indexOfMum) + "Out")

        middleList = [2]

        CLeftNameList, CRightNameList, middleNameList \
         = [abunHouseKeeper.parseIDToName(i, 'C', 0) for i in CLeftList]  \
         , [abunHouseKeeper.parseIDToName(i, 'C', 0) for i in CRightList] \
         , [abunHouseKeeper.parseIDToName(i, 'C', 0) for i in middleList]

        dataList.sort(key=itemgetter(-2))
        for key, items in groupby(dataList, itemgetter(-2)):
            print key
            if int(key[5:]) == 1:
                for eachitem in items:
                    RList.append(eachitem[-1])
                    #print eachitem[-4]

        #print "len(RList) : ", len(RList)
        RList = abunHouseKeeper.getDistinct(RList)
        print "len(RList) : ", len(RList)
        lenDic = IORobot.obtainLength(folderName, "improved3.fasta")
        print lenDic["Segkk1"]
        # print RList
        IORobot.putListToFileO(folderName, "raw_reads.fasta", "RList", RList)

        ctgList = ["Contig0_p", "Contig3_p"]
        with open(folderName + "contigLeft.json", 'w') as outfile:
            json.dump(ctgList, outfile)

        ctgList = ["Contig2_p", "Contig4_p"]
        with open(folderName + "contigRight.json", 'w') as outfile:
            json.dump(ctgList, outfile)

        contigDic = IORobot.loadContigsFromFile(folderName,
                                                "improved3_Double.fasta")

        #addNoise(contigDic["Contig1_p"])
        ### no noise
        # IORobot.writeSegOut([contigDic["Contig1_p"]], folderName, "intermediate.fasta")
        ### with noise
        noisyIntermediate = dataGenLib.createANoisyRead(
            len(contigDic["Contig1_p"]), 0.01, contigDic["Contig1_p"])
        IORobot.writeSegOut([noisyIntermediate], folderName,
                            "intermediate.fasta")
        IORobot.writeSegOut([contigDic["Contig1_p"]], folderName,
                            "intermediateNoiseless.fasta")
Ejemplo n.º 7
0
def chopUpReads(folderName, mummerLink):
    print "chopUpReads"
    interiors = []

    ### Initializtion
    read2templateDic = readInJSON(folderName, "read2templateDic.json")
    template2readDic = readInJSON(folderName, "template2readDic.json")

    for eachitem in template2readDic:
        print "Length", eachitem, len(template2readDic[eachitem])
    #assert(False)

    dataList = alignerRobot.extractMumData(folderName,
                                           "templateAnchor" + "Out")
    lenDicTemplates = IORobot.obtainLength(folderName, "templates.fasta")
    templatesDic = IORobot.loadContigsFromFile(folderName, "templates.fasta")
    readsDic = IORobot.loadContigsFromFile(folderName, "RList_Double.fasta")
    dataList.sort(key=itemgetter(-1))

    ### Set up bins
    templateBeginEndDic = {}
    ell = 50

    for key, items in groupby(dataList, itemgetter(-1)):
        begin, end = 10**9, -1
        for eachitem in items:
            if eachitem[2] < begin:
                begin = eachitem[2]

            if eachitem[3] > end:
                end = eachitem[3]

        templateBeginEndDic[key] = [begin, end]

    print templateBeginEndDic

    GTDic = IORobot.loadContigsFromFile(folderName, "GTDic.fasta")

    for i in range(len(lenDicTemplates)):
        nameOfTemplate = "Segkk" + str(i)
        begin, end = templateBeginEndDic[nameOfTemplate]
        numberOfBins = int(math.ceil((end - begin) * 1.0 / ell))
        print "numberOfBins", numberOfBins
        bins = [
            consensusBins(j, begin + j * ell, min(begin + ell * (j + 1), end))
            for j in range(numberOfBins)
        ]

        temp2readAlignDic = loadAlignment(folderName, nameOfTemplate,
                                          template2readDic[nameOfTemplate],
                                          mummerLink)
        #for eachdebug in temp2readAlignDic:
        #	print temp2readAlignDic[eachdebug]

        ### Align reads to bins
        for eachalign in template2readDic[nameOfTemplate]:
            templateStart, templateEnd, readStart, readEnd = eachalign[
                2], eachalign[3], eachalign[0], eachalign[1]
            readName = eachalign[-2]
            #print readName, templateStart, templateEnd, readStart, readEnd , nameOfTemplate
            #assert(False)

            indexOfBin = min(math.ceil((templateStart - begin) * 1.0 / ell),
                             numberOfBins - 1)
            indexOfBin = int(indexOfBin)

            while bins[indexOfBin].end < templateEnd:
                binStart, binEnd = bins[indexOfBin].begin, bins[indexOfBin].end
                readSegStart, readSegEnd = temp2readAlignDic[readName][
                    binStart], temp2readAlignDic[readName][binEnd]
                bins[indexOfBin].addToReadList(
                    [readName, readSegStart, readSegEnd])
                indexOfBin += 1

        timestart = time.time()
        returnString = localConsensus(folderName, bins, readsDic, templatesDic,
                                      nameOfTemplate)
        print time.time() - timestart
        #assert(False)
        interiors.append(returnString)

        print "TemplateDist : ", Levenshtein.distance(
            templatesDic[nameOfTemplate], GTDic[nameOfTemplate])
        print "CleanedDist : ", Levenshtein.distance(returnString,
                                                     GTDic[nameOfTemplate])
        #print returnString[10257-3:10257+3], GTDic[nameOfTemplate][10259-3:10259+3]
        #for eachedit in Levenshtein.editops(returnString, GTDic[nameOfTemplate]):
        #	print eachedit
        # assert(False)

    IORobot.writeSegOut(interiors, folderName, "interiors.fasta")
    return interiors
Ejemplo n.º 8
0
def findAnchors(folderName, prevIteration, isDebug, mummerLink):
    '''
	Input: IORobot.writeSegOut(ctgList, folderName, "templates.fasta")
		   IORobot.putListToFileO(folderName, "raw_reads.fasta", "RList", RList)
    Output : the assignmentDic and lookUpDic

	'''

    if not isDebug:
        alignerRobot.useMummerAlign(mummerLink, folderName, "templateAnchor",
                                    "RList_Double.fasta", "templates.fasta",
                                    False, "", False)
        dataList = alignerRobot.extractMumData(folderName,
                                               "templateAnchor" + "Out")

        lenDicReads = IORobot.obtainLength(folderName, "RList_Double.fasta")

        lenDicTemplates = IORobot.obtainLength(folderName, "templates.fasta")

        templatesDic = IORobot.loadContigsFromFile(folderName,
                                                   "templates.fasta")
        readsDic = IORobot.loadContigsFromFile(folderName,
                                               "RList_Double.fasta")

        #print templatesDic["Segkk0"][1144:1144+ 50]
        #print readsDic["Segkk11098"][47:47+50]
        #print Levenshtein.distance(templatesDic["Segkk0"][1144:1144+ 50], readsDic["Segkk11098"][47:47+50])
        #assert(False)

        read2templateDic = {}
        template2readDic = {}

        for eachitem in lenDicTemplates:
            template2readDic[eachitem] = []

        for eachitem in lenDicReads:
            read2templateDic[eachitem] = []

        thres = 30
        dataList.sort(key=itemgetter(-2))
        for key, items in groupby(dataList, itemgetter(-2)):
            L = lenDicReads[key]
            tmpList = []
            for eachitem in items:
                if eachitem[4] > L - thres and eachitem[2] < eachitem[3]:
                    tmpList.append(eachitem)

            if len(tmpList) >= 1:
                returnItem = resolveCompetingTemplates(folderName, tmpList,
                                                       key, templatesDic,
                                                       readsDic,
                                                       prevIteration[0])
                readName, templateName = returnItem[-2], returnItem[-1]
                read2templateDic[readName].append(returnItem[-1])
                template2readDic[templateName].append(returnItem)

        with open(folderName + "read2templateDic.json", 'w') as outfile:
            json.dump(read2templateDic, outfile)

        with open(folderName + "template2readDic.json", 'w') as outfile:
            json.dump(template2readDic, outfile)

        print len(dataList), len(lenDicReads), len(lenDicTemplates), len(
            read2templateDic)
        # assert(False)

    else:
        read2templateDic = readInJSON(folderName, "read2templateDic.json")
        template2readDic = readInJSON(folderName, "template2readDic.json")

    return [read2templateDic, template2readDic]
Ejemplo n.º 9
0
def BResolvePreparation(folderName, inList, outList, G, Grev, N1, mummerLink):
    print "BResolvePreparation"
    # format :  resolvedList, brResolvedList, inList, outList [] [[3, 1], [3, 7]] [6] [3, 15]
    # resolvedList in standard format ... just that inList, outList has unnecessary *2 for head/tail difference
    # Input : brtest/ [0, 8] [5, 13]
    # print folderName,  inList, outList

    resolvedList = []

    if len(inList) > 1 and len(outList) > 1:
        # prepare left/righ contigs
        contigLeft = []
        for eachitem in inList:
            contigLeft.append(
                abunHouseKeeper.parseIDToName(eachitem / 2, 'C', 0))

        contigRight = []
        for eachitem in outList:
            contigRight.append(
                abunHouseKeeper.parseIDToName(eachitem / 2, 'C', 0))

        print "contigLeft, contigRight", contigLeft, contigRight

        with open(folderName + "contigLeft.json", 'w') as outfile:
            json.dump(contigLeft, outfile)

        with open(folderName + "contigRight.json", 'w') as outfile:
            json.dump(contigRight, outfile)

        # prepare RList
        RListDic = loadRListDic(folderName)

        RList = []
        for eachkey in contigLeft + contigRight:
            nodeIndex = abunHouseKeeper.parseEdgeNameToID(eachkey, 'C')
            nodeName = "Segkk" + str(nodeIndex / 2)
            RList = RList + RListDic[nodeName]

        RList = abunHouseKeeper.getDistinct(RList)

        IORobot.putListToFileO(folderName, "raw_reads.fasta", "RList", RList)
        IORobot.writeToFile_Double1(folderName, "RList.fasta",
                                    "RList_Double.fasta", "contig")

        # prepare intermediate
        ### Look for a path and then join here.

        pathList = findPathList(folderName, G, N1, contigLeft, contigRight)
        paths = findAPair(pathList)
        path1, path2 = findPathSegments(folderName, paths, N1, mummerLink)

        IORobot.writeSegOut([path1], folderName, "path1.fasta")
        IORobot.writeSegOut([path2], folderName, "path2.fasta")

        alignerRobot.useMummerAlign(mummerLink, folderName, "comparison",
                                    "path1.fasta", "path2.fasta", False, "",
                                    False)
        dataList = alignerRobot.extractMumData(folderName,
                                               "comparison" + "Out")

        dataList.sort(key=itemgetter(-2))
        begin, end = 1, 100

        for key, items in groupby(dataList, itemgetter(-2)):
            maxLen = -1
            for eachitem in items:
                if eachitem[4] > maxLen:
                    begin, end = eachitem[0], eachitem[1]
                    maxLen = eachitem[4]

        path1Dic = IORobot.loadContigsFromFile(folderName, "path1.fasta")

        IORobot.writeSegOut([path1Dic["Segkk0"][begin - 1:end]], folderName,
                            "intermediate.fasta")

        ratioScore, matching, contentForBetterInteriorToFlank = EMFlow(
            folderName, mummerLink)

        #assert(False)
        if 1 / ratioScore > 1.001:
            print "kkbug score", ratioScore
            for eachsub in matching:
                resolvedList.append([
                    abunHouseKeeper.parseEdgeNameToID(eachsub[0], 'C'),
                    abunHouseKeeper.parseEdgeNameToID(eachsub[1], 'C')
                ])

    return resolvedList