def abunSplit(folderName, mummerLink, myCountDic):
    """Resolve repeats by abundance and hand the condensed graph to IORobot.

    Steps, as performed by the code below:

    1. Load the repeat pairs from ``folderName + "phaseRepeat.txt"`` (JSON)
       and filter them through ``obtainNonEmpty``.
    2. For each remaining pair, call ``determindMatch`` on its in/out lists
       and record the returned list in a ``seqGraph`` via ``addEdges``.
    3. Condense the graph and pass it to ``IORobot.extractGraphToContigs``
       together with the file names ``"abun.fasta"`` and
       ``"improved3_Double.fasta"``.

    Args:
        folderName: Path prefix of the working folder. It is concatenated
            directly with file names, so it is expected to end with a path
            separator.
        mummerLink: Forwarded unchanged to
            ``IORobot.extractGraphToContigs``.
        myCountDic: Sized container; the graph is created with
            ``2 * len(myCountDic)`` nodes (presumably two orientations per
            contig -- confirm against the caller).

    Returns:
        None.

    Raises:
        IOError: If ``phaseRepeat.txt`` cannot be opened.
        ValueError: If ``phaseRepeat.txt`` does not contain valid JSON.
    """
    # Context manager: the handle is closed even when JSON parsing fails.
    with open(folderName + "phaseRepeat.txt", "r") as json_data:
        repeatPairs = json.load(json_data)
    repeatPairs = obtainNonEmpty(repeatPairs)

    N1 = len(myCountDic) * 2
    G = graphLib.seqGraph(N1)

    for eachitem in repeatPairs:
        # Only the first two entries of each pair record are used.
        inList, outList = eachitem[0], eachitem[1]
        addEdges(G, determindMatch(inList, outList, myCountDic))

    G.condense()
    IORobot.extractGraphToContigs(G, folderName, mummerLink, "abun.fasta", "improved3_Double.fasta")
# Example #2
0
def abunSplit(folderName, mummerLink, myCountDic):
    """Build the abundance-split graph and pass it on for contig extraction.

    What the code does, in order:

    1. Reads ``folderName + "phaseRepeat.txt"`` as JSON and keeps only the
       entries returned by ``obtainNonEmpty``.
    2. Creates a ``seqGraph`` with ``2 * len(myCountDic)`` nodes.
    3. For every repeat entry, feeds its first two elements (in-list and
       out-list) to ``determindMatch`` and adds the result to the graph with
       ``addEdges``.
    4. Condenses the graph and calls ``IORobot.extractGraphToContigs`` with
       the file names ``"abun.fasta"`` and ``"improved3_Double.fasta"``.

    Args:
        folderName: Working-folder prefix; joined to file names by plain
            string concatenation, so a trailing path separator is expected.
        mummerLink: Passed through to ``IORobot.extractGraphToContigs``.
        myCountDic: Sized container whose length determines the node count
            (presumably one entry per contig -- confirm against the caller).

    Returns:
        None.

    Raises:
        IOError: If ``phaseRepeat.txt`` cannot be opened.
        ValueError: If ``phaseRepeat.txt`` is not valid JSON.
    """
    repeatFile = folderName + "phaseRepeat.txt"

    # ``with`` guarantees the file is closed; the original left it open.
    with open(repeatFile, "r") as json_data:
        loadedPairs = json.load(json_data)
    repeatPairs = obtainNonEmpty(loadedPairs)

    N1 = len(myCountDic) * 2
    G = graphLib.seqGraph(N1)

    for eachitem in repeatPairs:
        inList, outList = eachitem[0], eachitem[1]
        resolvedList = determindMatch(inList, outList, myCountDic)
        addEdges(G, resolvedList)

    G.condense()
    IORobot.extractGraphToContigs(G, folderName, mummerLink, "abun.fasta", "improved3_Double.fasta")
# Example #3
0
def abunSplit(folderName, mummerLink, myCountDic, contigReadGraph,
              contigFilename, readsetFilename):
    """Resolve repeats by abundance, collect gap content and emit contigs.

    Steps, as performed by the code below:

    1. Load the repeat pairs from ``folderName + "phaseRepeat.txt"`` (JSON)
       and filter them through ``obtainNonEmpty``.
    2. For each remaining pair, call ``determindMatch`` to obtain the
       resolved list, extend the gap list with the result of
       ``generateGapContentLookup`` for it, and add the resolved list to a
       ``seqGraph`` via ``addEdges``.
    3. Index the gap entries by the key ``"<entry[0]>_<entry[1]>"`` with
       value ``[entry[2], entry[3], entry[4]]``.
    4. Condense the graph and pass it, with the gap lookup, to
       ``IORobot.extractGraphToContigs`` together with the file names
       ``"abun.fasta"`` and ``contigFilename + "_Double.fasta"``.

    Progress information is printed to standard output.

    Args:
        folderName: Path prefix of the working folder. It is concatenated
            directly with file names, so it is expected to end with a path
            separator.
        mummerLink: Forwarded to ``generateGapContentLookup`` and
            ``IORobot.extractGraphToContigs``.
        myCountDic: Sized container; the graph is created with
            ``2 * len(myCountDic)`` nodes.
        contigReadGraph: Forwarded to ``determindMatch`` and
            ``generateGapContentLookup``.
        contigFilename: Base name of the contig file; ``"_Double.fasta"`` is
            appended before it is handed to ``IORobot``.
        readsetFilename: Forwarded to ``generateGapContentLookup``.

    Returns:
        None.

    Raises:
        IOError: If ``phaseRepeat.txt`` cannot be opened.
        ValueError: If ``phaseRepeat.txt`` does not contain valid JSON.
    """
    # Context manager: the handle is closed even when JSON parsing fails.
    with open(folderName + "phaseRepeat.txt", "r") as json_data:
        repeatPairs = json.load(json_data)
    repeatPairs = obtainNonEmpty(repeatPairs)

    N1 = len(myCountDic) * 2
    # Single-argument print(...) yields the same text on Python 2 and 3.
    print("N1 %s" % (N1,))

    G = graphLib.seqGraph(N1)

    gapContentLookUpList = []

    for eachitem in repeatPairs:
        inList, outList = eachitem[0], eachitem[1]
        resolvedList = determindMatch(inList, outList, myCountDic, folderName,
                                      contigReadGraph, N1)
        print("resolvedList %s" % (resolvedList,))
        gapContentLookUpList += generateGapContentLookup(
            folderName, mummerLink, resolvedList, contigReadGraph,
            contigFilename, readsetFilename)

        addEdges(G, resolvedList)

    # Entries are visited in sorted order, so when a key repeats the entry
    # that sorts last is the one that remains in the dictionary.
    gapContentLookUpList.sort()
    gapContentLookUpDic = {}

    for eachitem in gapContentLookUpList:
        key = str(eachitem[0]) + "_" + str(eachitem[1])
        gapContentLookUpDic[key] = [eachitem[2], eachitem[3], eachitem[4]]
        print("%s %s" % (eachitem[2:4], len(eachitem[4])))

    # some how change ASplitter here by appending necessary information

    G.condense()
    IORobot.extractGraphToContigs(G, folderName, mummerLink, "abun.fasta",
                                  contigFilename + "_Double.fasta",
                                  gapContentLookUpDic)
def abunSplit(folderName, mummerLink, myCountDic, contigReadGraph, contigFilename, readsetFilename):
    """Split repeats by abundance, gather gap content and extract contigs.

    What the code does, in order:

    1. Reads ``folderName + "phaseRepeat.txt"`` as JSON and keeps only the
       entries returned by ``obtainNonEmpty``.
    2. Creates a ``seqGraph`` with ``N1 = 2 * len(myCountDic)`` nodes.
    3. For every repeat entry, calls ``determindMatch`` on its first two
       elements, appends the output of ``generateGapContentLookup`` for the
       resolved list to a running gap list, and adds the resolved list to
       the graph with ``addEdges``.
    4. Sorts the gap list and turns it into a dictionary keyed by
       ``"<entry[0]>_<entry[1]>"`` holding ``[entry[2], entry[3], entry[4]]``.
    5. Condenses the graph and calls ``IORobot.extractGraphToContigs`` with
       the file names ``"abun.fasta"`` and
       ``contigFilename + "_Double.fasta"`` plus the gap dictionary.

    Diagnostic lines are printed to standard output along the way.

    Args:
        folderName: Working-folder prefix; joined to file names by plain
            string concatenation, so a trailing path separator is expected.
        mummerLink: Passed to ``generateGapContentLookup`` and
            ``IORobot.extractGraphToContigs``.
        myCountDic: Sized container whose length determines ``N1``.
        contigReadGraph: Passed to ``determindMatch`` and
            ``generateGapContentLookup``.
        contigFilename: Contig file base name (``"_Double.fasta"`` is
            appended for the ``IORobot`` call).
        readsetFilename: Passed to ``generateGapContentLookup``.

    Returns:
        None.

    Raises:
        IOError: If ``phaseRepeat.txt`` cannot be opened.
        ValueError: If ``phaseRepeat.txt`` is not valid JSON.
    """
    # ``with`` guarantees the file is closed; the original left it open.
    with open(folderName + "phaseRepeat.txt", "r") as json_data:
        repeatPairs = json.load(json_data)
    repeatPairs = obtainNonEmpty(repeatPairs)

    N1 = len(myCountDic) * 2
    # One pre-formatted argument keeps the output identical on Python 2 and 3.
    print("N1 %s" % (N1,))

    G = graphLib.seqGraph(N1)

    gapContentLookUpList = []

    for eachitem in repeatPairs:
        inList, outList = eachitem[0], eachitem[1]
        resolvedList = determindMatch(inList, outList, myCountDic, folderName, contigReadGraph, N1)
        print("resolvedList %s" % (resolvedList,))
        gapContentLookUpList += generateGapContentLookup(
            folderName, mummerLink, resolvedList, contigReadGraph, contigFilename, readsetFilename
        )

        addEdges(G, resolvedList)

    # Sorting first means that, for a repeated key, the last entry in sort
    # order is the one stored.
    gapContentLookUpList.sort()
    gapContentLookUpDic = {}

    for eachitem in gapContentLookUpList:
        key = str(eachitem[0]) + "_" + str(eachitem[1])
        gapContentLookUpDic[key] = [eachitem[2], eachitem[3], eachitem[4]]
        print("%s %s" % (eachitem[2:4], len(eachitem[4])))

    # some how change ASplitter here by appending necessary information

    G.condense()
    IORobot.extractGraphToContigs(
        G, folderName, mummerLink, "abun.fasta", contigFilename + "_Double.fasta", gapContentLookUpDic
    )
# Example #5
0
def readContigForAbunSplit(folderName, mummerLink, contigFilename,
                           readsetFilename, N1, contigReadGraph):
    """Read contigs out of the saved ``xResolvedGraph`` and de-duplicate them.

    Steps, as performed by the code below:

    1. Load ``mapDummyToRealDic.json`` and the graph saved under the name
       ``"xResolvedGraph"`` from ``folderName``.
    2. For the first ``N1`` graph nodes, record every pair of consecutive
       entries of ``nodeIndexList`` as ``[bk, fwd]`` and dump the list to
       ``furtherGapList.json``.
    3. Call ``generateGapContentLookup`` for those pairs and index the
       result by ``"<entry[0]>_<entry[1]>"``.
    4. Copy ``contigFilename + "_Double.fasta"`` to ``tmpWithDummy.fasta``
       and call ``IORobot.extractGraphToContigs`` with the file names
       ``"abunPre.fasta"`` and ``"tmpWithDummy.fasta"``.
    5. Call ``nonRedundantResolver.removeRedundantWithFile`` with the names
       ``"abunPre"``, ``"abunMum"`` and ``"abun"``.

    Progress information is printed to standard output.

    Args:
        folderName: Path prefix of the working folder. It is concatenated
            directly with file names, so it is expected to end with a path
            separator.
        mummerLink: Forwarded to the helper calls.
        contigFilename: Base name of the contig file (without the
            ``"_Double.fasta"`` suffix).
        readsetFilename: Forwarded to ``generateGapContentLookup``.
        N1: Number of leading graph nodes to inspect.
        contigReadGraph: Forwarded to ``generateGapContentLookup``.

    Returns:
        None.

    Raises:
        IOError: If one of the JSON/FASTA files cannot be read, written or
            copied. The copy previously went through ``cp`` in a shell,
            whose exit status was ignored.
        ValueError: If ``mapDummyToRealDic.json`` is not valid JSON.
    """
    import shutil

    # Context manager: the handle is closed even when JSON parsing fails.
    with open(folderName + "mapDummyToRealDic.json", "r") as json_data:
        mapDummyToRealDic = json.load(json_data)

    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, "xResolvedGraph")

    gapContentLookUpDic = {}

    furtherGapList = []
    for i in range(N1):
        nodeIndexList = G.graphNodesList[i].nodeIndexList
        # The range is empty for lists with fewer than two entries.
        for j in range(len(nodeIndexList) - 1):
            bk, fwd = nodeIndexList[j], nodeIndexList[j + 1]
            key = str(bk) + "_" + str(fwd)

            # NOTE(review): gapContentLookUpDic is still empty at this
            # point, so this test never filters anything out.
            if key not in gapContentLookUpDic:
                furtherGapList.append([bk, fwd])

    with open(folderName + "furtherGapList.json", "w") as f:
        json.dump(furtherGapList, f)

    furtherGapContentLookUpList = generateGapContentLookup(
        folderName, mummerLink, furtherGapList, contigReadGraph,
        contigFilename, readsetFilename, mapDummyToRealDic)

    for eachitem in furtherGapContentLookUpList:
        key = str(eachitem[0]) + "_" + str(eachitem[1])
        gapContentLookUpDic[key] = [eachitem[2], eachitem[3], eachitem[4]]
        # Single-argument print(...) yields the same text on Python 2 and 3.
        print("%s %s" % (eachitem[2:4], len(eachitem[4])))

    #segLookUp = IORobot.readContigsFromFile(folderName, "LC_n_Double.fasta")

    print("Final step: really hacking a file")
    # Copy without a shell: paths containing spaces or shell metacharacters
    # are handled correctly and a failed copy is reported instead of ignored.
    shutil.copyfile(folderName + contigFilename + "_Double.fasta",
                    folderName + "tmpWithDummy.fasta")
    # NOTE(review): the returned contigs are not used below; the call is
    # kept unchanged -- confirm it can be removed.
    contigList = IORobot.readContigsFromFile(folderName,
                                             contigFilename + "_Double.fasta")

    IORobot.extractGraphToContigs(G, folderName, mummerLink, "abunPre.fasta",
                                  "tmpWithDummy.fasta", gapContentLookUpDic,
                                  mapDummyToRealDic)

    nonRedundantResolver.removeRedundantWithFile(folderName, mummerLink,
                                                 "abunPre", "abunMum",
                                                 "abun")
def readContigForAbunSplit(folderName, mummerLink, contigFilename, readsetFilename, N1, contigReadGraph):
    """Extract contigs from the saved ``xResolvedGraph`` and drop redundancy.

    What the code does, in order:

    1. Loads ``mapDummyToRealDic.json`` and the graph stored under the name
       ``"xResolvedGraph"`` from ``folderName``.
    2. Walks the first ``N1`` graph nodes and records each pair of
       neighbouring entries of ``nodeIndexList`` as ``[bk, fwd]``; the list
       is written to ``furtherGapList.json``.
    3. Runs ``generateGapContentLookup`` on those pairs and stores the
       result in a dictionary keyed by ``"<entry[0]>_<entry[1]>"``.
    4. Copies ``contigFilename + "_Double.fasta"`` to
       ``tmpWithDummy.fasta`` and calls ``IORobot.extractGraphToContigs``
       with the file names ``"abunPre.fasta"`` and ``"tmpWithDummy.fasta"``.
    5. Calls ``nonRedundantResolver.removeRedundantWithFile`` with the names
       ``"abunPre"``, ``"abunMum"`` and ``"abun"``.

    Diagnostic lines are printed to standard output along the way.

    Args:
        folderName: Working-folder prefix; joined to file names by plain
            string concatenation, so a trailing path separator is expected.
        mummerLink: Passed through to the helper calls.
        contigFilename: Contig file base name, without ``"_Double.fasta"``.
        readsetFilename: Passed to ``generateGapContentLookup``.
        N1: How many leading graph nodes are inspected.
        contigReadGraph: Passed to ``generateGapContentLookup``.

    Returns:
        None.

    Raises:
        IOError: If a JSON/FASTA file cannot be read, written or copied.
            The copy used to run ``cp`` through a shell and ignored its
            exit status.
        ValueError: If ``mapDummyToRealDic.json`` is not valid JSON.
    """
    import shutil

    # ``with`` guarantees the file is closed; the original left it open.
    with open(folderName + "mapDummyToRealDic.json", "r") as json_data:
        mapDummyToRealDic = json.load(json_data)

    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, "xResolvedGraph")

    gapContentLookUpDic = {}

    furtherGapList = []
    for i in range(N1):
        nodeIndexList = G.graphNodesList[i].nodeIndexList
        # Nothing to pair up when the list has fewer than two entries.
        for j in range(len(nodeIndexList) - 1):
            bk, fwd = nodeIndexList[j], nodeIndexList[j + 1]
            key = str(bk) + "_" + str(fwd)

            # NOTE(review): the dictionary is empty here, so every pair
            # passes this test as the code stands.
            if key not in gapContentLookUpDic:
                furtherGapList.append([bk, fwd])

    with open(folderName + "furtherGapList.json", "w") as f:
        json.dump(furtherGapList, f)

    furtherGapContentLookUpList = generateGapContentLookup(
        folderName, mummerLink, furtherGapList, contigReadGraph, contigFilename, readsetFilename, mapDummyToRealDic
    )

    for eachitem in furtherGapContentLookUpList:
        key = str(eachitem[0]) + "_" + str(eachitem[1])
        gapContentLookUpDic[key] = [eachitem[2], eachitem[3], eachitem[4]]
        # One pre-formatted argument keeps the output identical on
        # Python 2 and 3.
        print("%s %s" % (eachitem[2:4], len(eachitem[4])))

    # segLookUp = IORobot.readContigsFromFile(folderName, "LC_n_Double.fasta")

    print("Final step: really hacking a file")
    # No shell involved: odd characters in the paths are safe and a failed
    # copy raises instead of passing silently.
    shutil.copyfile(folderName + contigFilename + "_Double.fasta", folderName + "tmpWithDummy.fasta")
    # NOTE(review): contigList is never read afterwards; the call is kept
    # unchanged -- confirm it can be removed.
    contigList = IORobot.readContigsFromFile(folderName, contigFilename + "_Double.fasta")

    IORobot.extractGraphToContigs(
        G, folderName, mummerLink, "abunPre.fasta", "tmpWithDummy.fasta", gapContentLookUpDic, mapDummyToRealDic
    )

    nonRedundantResolver.removeRedundantWithFile(folderName, mummerLink, "abunPre", "abunMum", "abun")
def abunSplitWithXResolve(folderName, mummerLink, myCountDic, contigReadGraph, contigFilename, readsetFilename):
    """Report contig reachability; the X-node resolution stages are disabled.

    As written, the only code that executes is the debug section. It loads
    the graph named ``contigReadGraph`` from ``folderName``, calls
    ``abunGraphLib.findAllReachable(i, N1, G)`` for every ``i`` in
    ``range(N1)`` (``N1 = 2 * len(myCountDic)``), inserts an edge
    ``(i, j, 1)`` into a fresh ``seqGraph`` for every returned ``j`` and
    calls ``reportEdge`` on that graph.

    The four pipeline stages that follow are each guarded by ``if False:``
    and therefore never run. They exchange data through files in
    ``folderName``:

    1. resolve repeats (``determindMatch`` and ``xNodeResolving``) and write
       ``resolvedList.json`` and ``mapDummyToRealDic.json``;
    2. build the gap lookup and write ``gapContentLookUpDic.json``;
    3. build, condense and save the graph as ``"xResolvedGraph"``;
    4. append ``>SegDum<i>`` records to ``tmpWithDummy.fasta`` and call
       ``IORobot.extractGraphToContigs`` with ``"abun.fasta"``.

    Args:
        folderName: Path prefix of the working folder. It is concatenated
            directly with file names, so it is expected to end with a path
            separator.
        mummerLink: Only used by the disabled stages.
        myCountDic: Sized container; ``N1`` is twice its length.
        contigReadGraph: Name of the graph loaded for the debug report.
        contigFilename: Only used by the disabled stages.
        readsetFilename: Only used by the disabled stages.

    Returns:
        None.
    """
    import shutil

    N1 = len(myCountDic) * 2
    # Single-argument print(...) yields the same text on Python 2 and 3.
    print("N1 %s" % (N1,))

    # Debug
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)

    adj = [abunGraphLib.findAllReachable(i, N1, G) for i in range(N1)]

    Gnew = graphLib.seqGraph(N1)

    for i in range(N1):
        for j in adj[i]:
            Gnew.insertEdge(i, j, 1)

    Gnew.reportEdge()
    # End Debug

    # Stage 1 (disabled): resolve repeats and persist the result.
    if False:
        with open(folderName + "phaseRepeat.txt", "r") as json_data:
            repeatPairs = json.load(json_data)
        repeatPairs = obtainNonEmpty(repeatPairs)

        biResolvedCombineList = []
        for eachitem in repeatPairs:
            inList, outList = eachitem[0], eachitem[1]
            resolvedList = determindMatch(inList, outList, myCountDic, folderName, contigReadGraph, N1)

            biResolvedCombineList += resolvedList

        ### Xnode repeatResolution
        xResolvedList, mapDummyToRealDic = xNodeResolving(folderName, contigReadGraph)

        ### Combine resolution
        resolvedList = xResolvedList + biResolvedCombineList
        resolvedList = abunHouseKeeper.getDistinct(resolvedList)
        print("%s %s %s %s %s" % (
            "resolvedList, len(resolvedList),len(xResolvedList), len(biResolvedCombineList) ",
            resolvedList,
            len(resolvedList),
            len(xResolvedList),
            len(biResolvedCombineList),
        ))

        with open(folderName + "resolvedList.json", "w") as f:
            json.dump(resolvedList, f)

        with open(folderName + "mapDummyToRealDic.json", "w") as f:
            json.dump(mapDummyToRealDic, f)

    # Stage 2 (disabled): build and persist the gap-content lookup.
    if False:
        with open(folderName + "resolvedList.json", "r") as json_data:
            resolvedList = json.load(json_data)

        with open(folderName + "mapDummyToRealDic.json", "r") as json_data:
            mapDummyToRealDic = json.load(json_data)

        gapContentLookUpList = generateGapContentLookup(
            folderName, mummerLink, resolvedList, contigReadGraph, contigFilename, readsetFilename, mapDummyToRealDic
        )
        gapContentLookUpDic = {}
        gapContentLookUpList.sort()

        for eachitem in gapContentLookUpList:
            key = str(eachitem[0]) + "_" + str(eachitem[1])
            gapContentLookUpDic[key] = [eachitem[2], eachitem[3], eachitem[4]]
            print("%s %s" % (eachitem[2:4], len(eachitem[4])))

        with open(folderName + "gapContentLookUpDic.json", "w") as f:
            json.dump(gapContentLookUpDic, f)

    # Stage 3 (disabled): build, condense and save the resolved graph.
    if False:
        with open(folderName + "resolvedList.json", "r") as json_data:
            resolvedList = json.load(json_data)

        with open(folderName + "mapDummyToRealDic.json", "r") as json_data:
            mapDummyToRealDic = json.load(json_data)

        G = graphLib.seqGraph(N1 + len(mapDummyToRealDic))
        addEdges(G, resolvedList)
        G.condense()

        G.saveToFile(folderName, "xResolvedGraph")

    # Stage 4 (disabled): add dummy segments and extract the contigs.
    if False:
        with open(folderName + "mapDummyToRealDic.json", "r") as json_data:
            mapDummyToRealDic = json.load(json_data)

        G = graphLib.seqGraph(0)
        G.loadFromFile(folderName, "xResolvedGraph")

        with open(folderName + "gapContentLookUpDic.json", "r") as json_data:
            gapContentLookUpDic = json.load(json_data)

        print("Final step: really hacking a file")
        # Copy without a shell so unusual characters in the paths are safe.
        shutil.copyfile(folderName + contigFilename + "_Double.fasta", folderName + "tmpWithDummy.fasta")
        contigList = IORobot.readContigsFromFile(folderName, contigFilename + "_Double.fasta")

        with open(folderName + "tmpWithDummy.fasta", "a") as dummyFasta:
            for i in range(len(mapDummyToRealDic)):
                # The map is keyed by the dummy index as a string.
                realIndex = mapDummyToRealDic[str(i)]
                dummyFasta.write(">SegDum" + str(i) + "\n")
                dummyFasta.write(contigList[realIndex] + "\n")

        IORobot.extractGraphToContigs(
            G, folderName, mummerLink, "abun.fasta", "tmpWithDummy.fasta", gapContentLookUpDic, mapDummyToRealDic
        )
# Example #8
0
def abunSplitWithXResolve(folderName, mummerLink, myCountDic, contigReadGraph,
                          contigFilename, readsetFilename):
    """Print a reachability report; all resolution stages are switched off.

    Only the debug section runs as the code stands. It loads the graph named
    ``contigReadGraph`` from ``folderName``, collects
    ``abunGraphLib.findAllReachable(i, N1, G)`` for each ``i`` in
    ``range(N1)`` (``N1 = 2 * len(myCountDic)``), inserts an edge
    ``(i, j, 1)`` into a new ``seqGraph`` for every returned ``j`` and then
    calls ``reportEdge`` on it.

    Four further stages are present but wrapped in ``if False:``, so they
    never execute. Each one reads what the previous one wrote into
    ``folderName``:

    1. ``determindMatch`` plus ``xNodeResolving`` produce
       ``resolvedList.json`` and ``mapDummyToRealDic.json``;
    2. ``generateGapContentLookup`` produces ``gapContentLookUpDic.json``;
    3. the resolved edges are condensed and saved as ``"xResolvedGraph"``;
    4. ``>SegDum<i>`` records are appended to ``tmpWithDummy.fasta`` and
       ``IORobot.extractGraphToContigs`` is called with ``"abun.fasta"``.

    Args:
        folderName: Working-folder prefix; joined to file names by plain
            string concatenation, so a trailing path separator is expected.
        mummerLink: Referenced only inside the disabled stages.
        myCountDic: Sized container whose length determines ``N1``.
        contigReadGraph: Name of the graph loaded for the debug report.
        contigFilename: Referenced only inside the disabled stages.
        readsetFilename: Referenced only inside the disabled stages.

    Returns:
        None.
    """
    import shutil

    N1 = len(myCountDic) * 2
    # One pre-formatted argument keeps the output identical on Python 2 and 3.
    print("N1 %s" % (N1,))

    # Debug
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)

    adj = [abunGraphLib.findAllReachable(i, N1, G) for i in range(N1)]

    Gnew = graphLib.seqGraph(N1)

    for i in range(N1):
        for j in adj[i]:
            Gnew.insertEdge(i, j, 1)

    Gnew.reportEdge()
    # End Debug

    # Stage 1 (disabled): resolve repeats and persist the result.
    if False:
        with open(folderName + "phaseRepeat.txt", "r") as json_data:
            repeatPairs = json.load(json_data)
        repeatPairs = obtainNonEmpty(repeatPairs)

        biResolvedCombineList = []
        for eachitem in repeatPairs:
            inList, outList = eachitem[0], eachitem[1]
            resolvedList = determindMatch(inList, outList, myCountDic,
                                          folderName, contigReadGraph, N1)

            biResolvedCombineList += resolvedList

        ### Xnode repeatResolution
        xResolvedList, mapDummyToRealDic = xNodeResolving(
            folderName, contigReadGraph)

        ### Combine resolution
        resolvedList = xResolvedList + biResolvedCombineList
        resolvedList = abunHouseKeeper.getDistinct(resolvedList)
        print("%s %s %s %s %s" % (
            "resolvedList, len(resolvedList),len(xResolvedList), len(biResolvedCombineList) ",
            resolvedList,
            len(resolvedList),
            len(xResolvedList),
            len(biResolvedCombineList),
        ))

        with open(folderName + "resolvedList.json", "w") as f:
            json.dump(resolvedList, f)

        with open(folderName + "mapDummyToRealDic.json", "w") as f:
            json.dump(mapDummyToRealDic, f)

    # Stage 2 (disabled): build and persist the gap-content lookup.
    if False:
        with open(folderName + "resolvedList.json", "r") as json_data:
            resolvedList = json.load(json_data)

        with open(folderName + "mapDummyToRealDic.json", "r") as json_data:
            mapDummyToRealDic = json.load(json_data)

        gapContentLookUpList = generateGapContentLookup(
            folderName, mummerLink, resolvedList, contigReadGraph,
            contigFilename, readsetFilename, mapDummyToRealDic)
        gapContentLookUpDic = {}
        gapContentLookUpList.sort()

        for eachitem in gapContentLookUpList:
            key = str(eachitem[0]) + "_" + str(eachitem[1])
            gapContentLookUpDic[key] = [eachitem[2], eachitem[3], eachitem[4]]
            print("%s %s" % (eachitem[2:4], len(eachitem[4])))

        with open(folderName + "gapContentLookUpDic.json", "w") as f:
            json.dump(gapContentLookUpDic, f)

    # Stage 3 (disabled): build, condense and save the resolved graph.
    if False:
        with open(folderName + "resolvedList.json", "r") as json_data:
            resolvedList = json.load(json_data)

        with open(folderName + "mapDummyToRealDic.json", "r") as json_data:
            mapDummyToRealDic = json.load(json_data)

        G = graphLib.seqGraph(N1 + len(mapDummyToRealDic))
        addEdges(G, resolvedList)
        G.condense()

        G.saveToFile(folderName, "xResolvedGraph")

    # Stage 4 (disabled): add dummy segments and extract the contigs.
    if False:
        with open(folderName + "mapDummyToRealDic.json", "r") as json_data:
            mapDummyToRealDic = json.load(json_data)

        G = graphLib.seqGraph(0)
        G.loadFromFile(folderName, "xResolvedGraph")

        with open(folderName + "gapContentLookUpDic.json", "r") as json_data:
            gapContentLookUpDic = json.load(json_data)

        print("Final step: really hacking a file")
        # No shell involved, so odd characters in the paths are safe.
        shutil.copyfile(folderName + contigFilename + "_Double.fasta",
                        folderName + "tmpWithDummy.fasta")
        contigList = IORobot.readContigsFromFile(
            folderName, contigFilename + "_Double.fasta")

        with open(folderName + "tmpWithDummy.fasta", "a") as dummyFasta:
            for i in range(len(mapDummyToRealDic)):
                # The map is keyed by the dummy index as a string.
                realIndex = mapDummyToRealDic[str(i)]
                dummyFasta.write(">SegDum" + str(i) + "\n")
                dummyFasta.write(contigList[realIndex] + "\n")

        IORobot.extractGraphToContigs(G, folderName, mummerLink, "abun.fasta",
                                      "tmpWithDummy.fasta",
                                      gapContentLookUpDic, mapDummyToRealDic)