def abunSplit(folderName, mummerLink, myCountDic):
    """
    Split repeats by abundance and write the resulting contigs.

    Input : phaseRepeat.txt (JSON, read from folderName), myCountDic,
            improved3_Double.fasta
    Output : abun.fasta (written through IORobot.extractGraphToContigs)

    Algorithm :
    1. Load the repeat pairs from phaseRepeat.txt and filter them with
       obtainNonEmpty
    2. For each repeat interior:
        a) identify the abundances associated with in/out contigs
        b) perform a split and record the split
    3. Use split results to generate contigs [may already exist in newPhasing.py ]
        a) use a graph to capture the split results
        b) use reads to fill in any gaps
        c) read out the contigs

    Parameters :
    folderName -- working directory; it is concatenated directly with file
                  names, so it must already end with a path separator
    mummerLink -- forwarded unchanged to IORobot.extractGraphToContigs
    myCountDic -- abundance lookup; the graph gets 2 * len(myCountDic) nodes

    Returns None.
    """
    # Close the handle deterministically instead of leaking it.
    with open(folderName + "phaseRepeat.txt", "r") as json_data:
        repeatPairs = json.load(json_data)
    repeatPairs = obtainNonEmpty(repeatPairs)

    N1 = len(myCountDic) * 2
    G = graphLib.seqGraph(N1)

    for eachitem in repeatPairs:
        # Each repeat entry carries its incoming list first, outgoing list second.
        inList, outList = eachitem[0], eachitem[1]
        resolvedList = determindMatch(inList, outList, myCountDic)
        addEdges(G, resolvedList)

    G.condense()
    IORobot.extractGraphToContigs(G, folderName, mummerLink, "abun.fasta", "improved3_Double.fasta")
def abunSplit(folderName, mummerLink, myCountDic):
    """
    Split repeats by abundance and write the resulting contigs.

    Input : phaseRepeat.txt (JSON, read from folderName), myCountDic,
            improved3_Double.fasta
    Output : abun.fasta (written through IORobot.extractGraphToContigs)

    Algorithm :
    1. Load the repeat pairs from phaseRepeat.txt and filter them with
       obtainNonEmpty
    2. For each repeat interior:
        a) identify the abundances associated with in/out contigs
        b) perform a split and record the split
    3. Use split results to generate contigs [may already exist in newPhasing.py ]
        a) use a graph to capture the split results
        b) use reads to fill in any gaps
        c) read out the contigs

    Parameters :
    folderName -- working directory; it is concatenated directly with file
                  names, so it must already end with a path separator
    mummerLink -- forwarded unchanged to IORobot.extractGraphToContigs
    myCountDic -- abundance lookup; the graph gets 2 * len(myCountDic) nodes

    Returns None.
    """
    # The context manager guarantees the file is closed even if parsing fails.
    with open(folderName + "phaseRepeat.txt", 'r') as json_data:
        repeatPairs = json.load(json_data)
    repeatPairs = obtainNonEmpty(repeatPairs)

    N1 = len(myCountDic) * 2
    G = graphLib.seqGraph(N1)

    for eachitem in repeatPairs:
        # Each repeat entry carries its incoming list first, outgoing list second.
        inList, outList = eachitem[0], eachitem[1]
        resolvedList = determindMatch(inList, outList, myCountDic)
        addEdges(G, resolvedList)

    G.condense()
    IORobot.extractGraphToContigs(G, folderName, mummerLink, "abun.fasta", "improved3_Double.fasta")
def abunSplit(folderName, mummerLink, myCountDic, contigReadGraph, contigFilename, readsetFilename):
    """
    Split repeats by abundance, look up gap content and write the contigs.

    Input : phaseRepeat.txt (JSON, read from folderName), myCountDic,
            <contigFilename>_Double.fasta, readsetFilename
    Output : abun.fasta (written through IORobot.extractGraphToContigs)

    Algorithm :
    1. Load the repeat pairs from phaseRepeat.txt and filter them with
       obtainNonEmpty
    2. For each repeat interior:
        a) identify the abundances associated with in/out contigs
        b) perform a split and record the split
        c) collect the gap content for the resolved pairs
    3. Use split results to generate contigs [may already exist in newPhasing.py ]
        a) use a graph to capture the split results
        b) use reads to fill in any gaps
        c) read out the contigs

    Parameters :
    folderName      -- working directory; concatenated directly with file
                       names, so it must already end with a path separator
    mummerLink      -- forwarded to the gap lookup and contig extraction
    myCountDic      -- abundance lookup; the graph gets 2 * len(myCountDic) nodes
    contigReadGraph -- forwarded to determindMatch / generateGapContentLookup
    contigFilename  -- contig file stem; "_Double.fasta" is appended to it
    readsetFilename -- forwarded to generateGapContentLookup

    Returns None. Progress information is printed to stdout.
    """
    # Close the handle deterministically instead of leaking it.
    with open(folderName + "phaseRepeat.txt", 'r') as json_data:
        repeatPairs = json.load(json_data)
    repeatPairs = obtainNonEmpty(repeatPairs)

    N1 = len(myCountDic) * 2
    # Single-argument print(...) produces the same text on Python 2 and 3.
    print("N1 %s" % (N1,))

    G = graphLib.seqGraph(N1)

    gapContentLookUpList = []
    for eachitem in repeatPairs:
        inList, outList = eachitem[0], eachitem[1]
        resolvedList = determindMatch(inList, outList, myCountDic, folderName, contigReadGraph, N1)
        print("resolvedList %s" % (resolvedList,))
        gapContentLookUpList += generateGapContentLookup(
            folderName, mummerLink, resolvedList, contigReadGraph, contigFilename, readsetFilename)

        addEdges(G, resolvedList)

    gapContentLookUpDic = {}
    # Sorting fixes the insertion order, so for duplicate "<a>_<b>" keys the
    # entry that sorts last is the one kept in the dictionary.
    gapContentLookUpList.sort()

    for eachitem in gapContentLookUpList:
        gapContentLookUpDic[str(eachitem[0]) + "_" + str(eachitem[1])] = [
            eachitem[2], eachitem[3], eachitem[4]
        ]
        print("%s %s" % (eachitem[2:4], len(eachitem[4])))

    # some how change ASplitter here by appending necessary information

    G.condense()
    IORobot.extractGraphToContigs(G, folderName, mummerLink, "abun.fasta",
                                  contigFilename + "_Double.fasta", gapContentLookUpDic)
def abunSplit(folderName, mummerLink, myCountDic, contigReadGraph, contigFilename, readsetFilename):
    """
    Split repeats by abundance, look up gap content and write the contigs.

    Input : phaseRepeat.txt (JSON, read from folderName), myCountDic,
            <contigFilename>_Double.fasta, readsetFilename
    Output : abun.fasta (written through IORobot.extractGraphToContigs)

    Algorithm :
    1. Load the repeat pairs from phaseRepeat.txt and filter them with
       obtainNonEmpty
    2. For each repeat interior:
        a) identify the abundances associated with in/out contigs
        b) perform a split and record the split
        c) collect the gap content for the resolved pairs
    3. Use split results to generate contigs [may already exist in newPhasing.py ]
        a) use a graph to capture the split results
        b) use reads to fill in any gaps
        c) read out the contigs

    Parameters :
    folderName      -- working directory; concatenated directly with file
                       names, so it must already end with a path separator
    mummerLink      -- forwarded to the gap lookup and contig extraction
    myCountDic      -- abundance lookup; the graph gets 2 * len(myCountDic) nodes
    contigReadGraph -- forwarded to determindMatch / generateGapContentLookup
    contigFilename  -- contig file stem; "_Double.fasta" is appended to it
    readsetFilename -- forwarded to generateGapContentLookup

    Returns None. Progress information is printed to stdout.
    """
    # The context manager guarantees the file is closed even if parsing fails.
    with open(folderName + "phaseRepeat.txt", "r") as json_data:
        repeatPairs = json.load(json_data)
    repeatPairs = obtainNonEmpty(repeatPairs)

    N1 = len(myCountDic) * 2
    # Single-argument print(...) produces the same text on Python 2 and 3.
    print("N1 %s" % (N1,))

    G = graphLib.seqGraph(N1)

    gapContentLookUpList = []
    for eachitem in repeatPairs:
        inList, outList = eachitem[0], eachitem[1]
        resolvedList = determindMatch(inList, outList, myCountDic, folderName, contigReadGraph, N1)
        print("resolvedList %s" % (resolvedList,))
        gapContentLookUpList += generateGapContentLookup(
            folderName, mummerLink, resolvedList, contigReadGraph, contigFilename, readsetFilename
        )

        addEdges(G, resolvedList)

    gapContentLookUpDic = {}
    # Sorting fixes the insertion order, so for duplicate "<a>_<b>" keys the
    # entry that sorts last is the one kept in the dictionary.
    gapContentLookUpList.sort()

    for eachitem in gapContentLookUpList:
        gapContentLookUpDic[str(eachitem[0]) + "_" + str(eachitem[1])] = [eachitem[2], eachitem[3], eachitem[4]]
        print("%s %s" % (eachitem[2:4], len(eachitem[4])))

    # some how change ASplitter here by appending necessary information

    G.condense()
    IORobot.extractGraphToContigs(
        G, folderName, mummerLink, "abun.fasta", contigFilename + "_Double.fasta", gapContentLookUpDic
    )
def readContigForAbunSplit(folderName, mummerLink, contigFilename, readsetFilename, N1, contigReadGraph):
    """
    Read out contigs from the saved "xResolvedGraph" and de-duplicate them.

    Steps :
    1. Load mapDummyToRealDic.json and the graph saved as "xResolvedGraph".
    2. For each of the first N1 graph nodes, collect every pair of adjacent
       entries in its nodeIndexList and dump the pairs to furtherGapList.json.
    3. Look up the gap content for those pairs (generateGapContentLookup).
    4. Copy <contigFilename>_Double.fasta to tmpWithDummy.fasta and extract
       the graph to abunPre.fasta.
    5. Run nonRedundantResolver.removeRedundantWithFile with the names
       "abunPre", "abunMum" and "abun".

    Parameters :
    folderName      -- working directory; concatenated directly with file
                       names, so it must already end with a path separator
    mummerLink      -- forwarded to the helpers that need it
    contigFilename  -- contig file stem; "_Double.fasta" is appended to it
    readsetFilename -- forwarded to generateGapContentLookup
    N1              -- number of leading graph nodes to inspect
    contigReadGraph -- forwarded to generateGapContentLookup

    Returns None. Progress information is printed to stdout.
    """
    # Close the handle deterministically instead of leaking it.
    with open(folderName + "mapDummyToRealDic.json", 'r') as json_data:
        mapDummyToRealDic = json.load(json_data)

    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, "xResolvedGraph")

    gapContentLookUpDic = {}

    furtherGapList = []
    for i in range(N1):
        nodeIndexList = G.graphNodesList[i].nodeIndexList
        # range() is empty for lists shorter than two, so no length guard is needed.
        for j in range(len(nodeIndexList) - 1):
            bk, fwd = nodeIndexList[j], nodeIndexList[j + 1]
            key = str(bk) + "_" + str(fwd)
            # gapContentLookUpDic is still empty here, so every adjacent pair
            # is collected; the check is kept for when the lookup is pre-filled.
            if key not in gapContentLookUpDic:
                furtherGapList.append([bk, fwd])

    with open(folderName + "furtherGapList.json", 'w') as f:
        json.dump(furtherGapList, f)

    furtherGapContentLookUpList = generateGapContentLookup(
        folderName, mummerLink, furtherGapList, contigReadGraph,
        contigFilename, readsetFilename, mapDummyToRealDic)

    for eachitem in furtherGapContentLookUpList:
        gapContentLookUpDic[str(eachitem[0]) + "_" + str(eachitem[1])] = [
            eachitem[2], eachitem[3], eachitem[4]
        ]
        # Single-argument print(...) produces the same text on Python 2 and 3.
        print("%s %s" % (eachitem[2:4], len(eachitem[4])))

    print("Final step: really hacking a file")
    # NOTE(review): the command is built by string concatenation and run
    # through the shell, so paths containing spaces or shell metacharacters
    # will break it.
    os.system("cp " + folderName + contigFilename + "_Double.fasta " + folderName + "tmpWithDummy.fasta")
    # NOTE(review): the result is not used below; the call is retained in case
    # readContigsFromFile has side effects -- confirm and drop if it has none.
    contigList = IORobot.readContigsFromFile(folderName, contigFilename + "_Double.fasta")

    IORobot.extractGraphToContigs(G, folderName, mummerLink, "abunPre.fasta",
                                  "tmpWithDummy.fasta", gapContentLookUpDic,
                                  mapDummyToRealDic)

    nonRedundantResolver.removeRedundantWithFile(folderName, mummerLink, "abunPre", "abunMum", "abun")
def readContigForAbunSplit(folderName, mummerLink, contigFilename, readsetFilename, N1, contigReadGraph):
    """
    Read out contigs from the saved "xResolvedGraph" and de-duplicate them.

    Steps :
    1. Load mapDummyToRealDic.json and the graph saved as "xResolvedGraph".
    2. For each of the first N1 graph nodes, collect every pair of adjacent
       entries in its nodeIndexList and dump the pairs to furtherGapList.json.
    3. Look up the gap content for those pairs (generateGapContentLookup).
    4. Copy <contigFilename>_Double.fasta to tmpWithDummy.fasta and extract
       the graph to abunPre.fasta.
    5. Run nonRedundantResolver.removeRedundantWithFile with the names
       "abunPre", "abunMum" and "abun".

    Parameters :
    folderName      -- working directory; concatenated directly with file
                       names, so it must already end with a path separator
    mummerLink      -- forwarded to the helpers that need it
    contigFilename  -- contig file stem; "_Double.fasta" is appended to it
    readsetFilename -- forwarded to generateGapContentLookup
    N1              -- number of leading graph nodes to inspect
    contigReadGraph -- forwarded to generateGapContentLookup

    Returns None. Progress information is printed to stdout.
    """
    # The context manager guarantees the file is closed even if parsing fails.
    with open(folderName + "mapDummyToRealDic.json", "r") as json_data:
        mapDummyToRealDic = json.load(json_data)

    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, "xResolvedGraph")

    gapContentLookUpDic = {}

    furtherGapList = []
    for i in range(N1):
        nodeIndexList = G.graphNodesList[i].nodeIndexList
        # range() is empty for lists shorter than two, so no length guard is needed.
        for j in range(len(nodeIndexList) - 1):
            bk, fwd = nodeIndexList[j], nodeIndexList[j + 1]
            key = str(bk) + "_" + str(fwd)
            # gapContentLookUpDic is still empty here, so every adjacent pair
            # is collected; the check is kept for when the lookup is pre-filled.
            if key not in gapContentLookUpDic:
                furtherGapList.append([bk, fwd])

    with open(folderName + "furtherGapList.json", "w") as f:
        json.dump(furtherGapList, f)

    furtherGapContentLookUpList = generateGapContentLookup(
        folderName, mummerLink, furtherGapList, contigReadGraph, contigFilename, readsetFilename, mapDummyToRealDic
    )

    for eachitem in furtherGapContentLookUpList:
        gapContentLookUpDic[str(eachitem[0]) + "_" + str(eachitem[1])] = [eachitem[2], eachitem[3], eachitem[4]]
        # Single-argument print(...) produces the same text on Python 2 and 3.
        print("%s %s" % (eachitem[2:4], len(eachitem[4])))

    print("Final step: really hacking a file")
    # NOTE(review): the command is built by string concatenation and run
    # through the shell, so paths containing spaces or shell metacharacters
    # will break it.
    os.system("cp " + folderName + contigFilename + "_Double.fasta " + folderName + "tmpWithDummy.fasta")
    # NOTE(review): the result is not used below; the call is retained in case
    # readContigsFromFile has side effects -- confirm and drop if it has none.
    contigList = IORobot.readContigsFromFile(folderName, contigFilename + "_Double.fasta")

    IORobot.extractGraphToContigs(
        G, folderName, mummerLink, "abunPre.fasta", "tmpWithDummy.fasta", gapContentLookUpDic, mapDummyToRealDic
    )

    nonRedundantResolver.removeRedundantWithFile(folderName, mummerLink, "abunPre", "abunMum", "abun")
def abunSplitWithXResolve(folderName, mummerLink, myCountDic, contigReadGraph, contigFilename, readsetFilename):
    """
    Staged abundance split combined with X-node repeat resolution.

    The function always runs a debug section that loads contigReadGraph,
    computes abunGraphLib.findAllReachable for each of the first N1 nodes
    and reports the resulting edges. The four pipeline stages that follow
    are guarded by manually toggled `if False:` switches and exchange their
    results through JSON / graph files in folderName, so each stage can be
    re-run on its own by flipping its switch:

    1. resolve repeats  -> resolvedList.json, mapDummyToRealDic.json
    2. gap lookup       -> gapContentLookUpDic.json
    3. build the graph  -> "xResolvedGraph"
    4. read out contigs -> tmpWithDummy.fasta, abun.fasta

    NOTE(review): the stage boundaries were reconstructed from the data
    dependencies between statements (every stage reloads what it needs);
    with all switches off, only the debug section executes.

    Parameters :
    folderName      -- working directory; concatenated directly with file
                       names, so it must already end with a path separator
    mummerLink      -- forwarded to the helpers that need it
    myCountDic      -- abundance lookup; N1 is 2 * len(myCountDic)
    contigReadGraph -- name of the graph to load and to pass to the resolvers
    contigFilename  -- contig file stem; "_Double.fasta" is appended to it
    readsetFilename -- forwarded to generateGapContentLookup

    Returns None. Progress information is printed to stdout.
    """
    N1 = len(myCountDic) * 2
    # Single-argument print(...) produces the same text on Python 2 and 3.
    print("N1 %s" % (N1,))

    # Debug
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)

    adj = [abunGraphLib.findAllReachable(i, N1, G) for i in range(N1)]

    Gnew = graphLib.seqGraph(N1)
    for i in range(N1):
        for j in adj[i]:
            Gnew.insertEdge(i, j, 1)

    Gnew.reportEdge()
    # End Debug

    # Stage 1: resolve repeats and persist the result.
    if False:
        with open(folderName + "phaseRepeat.txt", "r") as json_data:
            repeatPairs = json.load(json_data)
        repeatPairs = obtainNonEmpty(repeatPairs)

        biResolvedCombineList = []
        for eachitem in repeatPairs:
            inList, outList = eachitem[0], eachitem[1]
            resolvedList = determindMatch(inList, outList, myCountDic, folderName, contigReadGraph, N1)
            biResolvedCombineList += resolvedList

        ### Xnode repeatResolution
        xResolvedList, mapDummyToRealDic = xNodeResolving(folderName, contigReadGraph)

        ### Combine resolution
        resolvedList = xResolvedList + biResolvedCombineList
        resolvedList = abunHouseKeeper.getDistinct(resolvedList)

        print("%s %s %s %s %s" % (
            "resolvedList, len(resolvedList),len(xResolvedList), len(biResolvedCombineList) ",
            resolvedList,
            len(resolvedList),
            len(xResolvedList),
            len(biResolvedCombineList),
        ))

        with open(folderName + "resolvedList.json", "w") as f:
            json.dump(resolvedList, f)

        with open(folderName + "mapDummyToRealDic.json", "w") as f:
            json.dump(mapDummyToRealDic, f)

    # Stage 2: look up the gap content for the resolved pairs.
    if False:
        with open(folderName + "resolvedList.json", "r") as json_data:
            resolvedList = json.load(json_data)

        with open(folderName + "mapDummyToRealDic.json", "r") as json_data:
            mapDummyToRealDic = json.load(json_data)

        gapContentLookUpList = generateGapContentLookup(
            folderName, mummerLink, resolvedList, contigReadGraph, contigFilename, readsetFilename, mapDummyToRealDic
        )
        gapContentLookUpDic = {}
        # Sorting fixes the insertion order, so for duplicate "<a>_<b>" keys
        # the entry that sorts last is the one kept in the dictionary.
        gapContentLookUpList.sort()

        for eachitem in gapContentLookUpList:
            gapContentLookUpDic[str(eachitem[0]) + "_" + str(eachitem[1])] = [eachitem[2], eachitem[3], eachitem[4]]
            print("%s %s" % (eachitem[2:4], len(eachitem[4])))

        with open(folderName + "gapContentLookUpDic.json", "w") as f:
            json.dump(gapContentLookUpDic, f)

    # Stage 3: build, condense and save the resolved graph.
    if False:
        with open(folderName + "resolvedList.json", "r") as json_data:
            resolvedList = json.load(json_data)

        with open(folderName + "mapDummyToRealDic.json", "r") as json_data:
            mapDummyToRealDic = json.load(json_data)

        # One extra node per entry of mapDummyToRealDic on top of the N1 nodes.
        G = graphLib.seqGraph(N1 + len(mapDummyToRealDic))
        addEdges(G, resolvedList)
        G.condense()
        G.saveToFile(folderName, "xResolvedGraph")

    # Stage 4: append the dummy segments and read out the contigs.
    if False:
        with open(folderName + "mapDummyToRealDic.json", "r") as json_data:
            mapDummyToRealDic = json.load(json_data)

        G = graphLib.seqGraph(0)
        G.loadFromFile(folderName, "xResolvedGraph")

        with open(folderName + "gapContentLookUpDic.json", "r") as json_data:
            gapContentLookUpDic = json.load(json_data)

        print("Final step: really hacking a file")
        # NOTE(review): the command is built by string concatenation and run
        # through the shell, so paths containing spaces or shell
        # metacharacters will break it.
        os.system("cp " + folderName + contigFilename + "_Double.fasta " + folderName + "tmpWithDummy.fasta")
        contigList = IORobot.readContigsFromFile(folderName, contigFilename + "_Double.fasta")

        # Append one ">SegDum<i>" record per dummy, carrying the sequence of
        # the contig that the dummy maps to.
        with open(folderName + "tmpWithDummy.fasta", "a") as f:
            for i in range(len(mapDummyToRealDic)):
                realId = mapDummyToRealDic[str(i)]
                f.write(">SegDum" + str(i) + "\n")
                f.write(contigList[realId] + "\n")

        IORobot.extractGraphToContigs(
            G, folderName, mummerLink, "abun.fasta", "tmpWithDummy.fasta", gapContentLookUpDic, mapDummyToRealDic
        )
def abunSplitWithXResolve(folderName, mummerLink, myCountDic, contigReadGraph, contigFilename, readsetFilename):
    """
    Staged abundance split combined with X-node repeat resolution.

    The function always runs a debug section that loads contigReadGraph,
    computes abunGraphLib.findAllReachable for each of the first N1 nodes
    and reports the resulting edges. The four pipeline stages that follow
    are guarded by manually toggled `if False:` switches and exchange their
    results through JSON / graph files in folderName, so each stage can be
    re-run on its own by flipping its switch:

    1. resolve repeats  -> resolvedList.json, mapDummyToRealDic.json
    2. gap lookup       -> gapContentLookUpDic.json
    3. build the graph  -> "xResolvedGraph"
    4. read out contigs -> tmpWithDummy.fasta, abun.fasta

    NOTE(review): the stage boundaries were reconstructed from the data
    dependencies between statements (every stage reloads what it needs);
    with all switches off, only the debug section executes.

    Parameters :
    folderName      -- working directory; concatenated directly with file
                       names, so it must already end with a path separator
    mummerLink      -- forwarded to the helpers that need it
    myCountDic      -- abundance lookup; N1 is 2 * len(myCountDic)
    contigReadGraph -- name of the graph to load and to pass to the resolvers
    contigFilename  -- contig file stem; "_Double.fasta" is appended to it
    readsetFilename -- forwarded to generateGapContentLookup

    Returns None. Progress information is printed to stdout.
    """
    N1 = len(myCountDic) * 2
    # Single-argument print(...) produces the same text on Python 2 and 3.
    print("N1 %s" % (N1,))

    # Debug
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)

    adj = [abunGraphLib.findAllReachable(i, N1, G) for i in range(N1)]

    Gnew = graphLib.seqGraph(N1)
    for i in range(N1):
        for j in adj[i]:
            Gnew.insertEdge(i, j, 1)

    Gnew.reportEdge()
    # End Debug

    # Stage 1: resolve repeats and persist the result.
    if False:
        with open(folderName + "phaseRepeat.txt", 'r') as json_data:
            repeatPairs = json.load(json_data)
        repeatPairs = obtainNonEmpty(repeatPairs)

        biResolvedCombineList = []
        for eachitem in repeatPairs:
            inList, outList = eachitem[0], eachitem[1]
            resolvedList = determindMatch(inList, outList, myCountDic,
                                          folderName, contigReadGraph, N1)
            biResolvedCombineList += resolvedList

        ### Xnode repeatResolution
        xResolvedList, mapDummyToRealDic = xNodeResolving(
            folderName, contigReadGraph)

        ### Combine resolution
        resolvedList = xResolvedList + biResolvedCombineList
        resolvedList = abunHouseKeeper.getDistinct(resolvedList)

        print("%s %s %s %s %s" % (
            "resolvedList, len(resolvedList),len(xResolvedList), len(biResolvedCombineList) ",
            resolvedList,
            len(resolvedList),
            len(xResolvedList),
            len(biResolvedCombineList),
        ))

        with open(folderName + "resolvedList.json", 'w') as f:
            json.dump(resolvedList, f)

        with open(folderName + "mapDummyToRealDic.json", 'w') as f:
            json.dump(mapDummyToRealDic, f)

    # Stage 2: look up the gap content for the resolved pairs.
    if False:
        with open(folderName + "resolvedList.json", 'r') as json_data:
            resolvedList = json.load(json_data)

        with open(folderName + "mapDummyToRealDic.json", 'r') as json_data:
            mapDummyToRealDic = json.load(json_data)

        gapContentLookUpList = generateGapContentLookup(
            folderName, mummerLink, resolvedList, contigReadGraph,
            contigFilename, readsetFilename, mapDummyToRealDic)
        gapContentLookUpDic = {}
        # Sorting fixes the insertion order, so for duplicate "<a>_<b>" keys
        # the entry that sorts last is the one kept in the dictionary.
        gapContentLookUpList.sort()

        for eachitem in gapContentLookUpList:
            gapContentLookUpDic[str(eachitem[0]) + "_" + str(eachitem[1])] = [
                eachitem[2], eachitem[3], eachitem[4]
            ]
            print("%s %s" % (eachitem[2:4], len(eachitem[4])))

        with open(folderName + "gapContentLookUpDic.json", 'w') as f:
            json.dump(gapContentLookUpDic, f)

    # Stage 3: build, condense and save the resolved graph.
    if False:
        with open(folderName + "resolvedList.json", 'r') as json_data:
            resolvedList = json.load(json_data)

        with open(folderName + "mapDummyToRealDic.json", 'r') as json_data:
            mapDummyToRealDic = json.load(json_data)

        # One extra node per entry of mapDummyToRealDic on top of the N1 nodes.
        G = graphLib.seqGraph(N1 + len(mapDummyToRealDic))
        addEdges(G, resolvedList)
        G.condense()
        G.saveToFile(folderName, "xResolvedGraph")

    # Stage 4: append the dummy segments and read out the contigs.
    if False:
        with open(folderName + "mapDummyToRealDic.json", 'r') as json_data:
            mapDummyToRealDic = json.load(json_data)

        G = graphLib.seqGraph(0)
        G.loadFromFile(folderName, "xResolvedGraph")

        with open(folderName + "gapContentLookUpDic.json", 'r') as json_data:
            gapContentLookUpDic = json.load(json_data)

        print("Final step: really hacking a file")
        # NOTE(review): the command is built by string concatenation and run
        # through the shell, so paths containing spaces or shell
        # metacharacters will break it.
        os.system("cp " + folderName + contigFilename + "_Double.fasta " + folderName + "tmpWithDummy.fasta")
        contigList = IORobot.readContigsFromFile(
            folderName, contigFilename + "_Double.fasta")

        # Append one ">SegDum<i>" record per dummy, carrying the sequence of
        # the contig that the dummy maps to.
        with open(folderName + "tmpWithDummy.fasta", 'a') as f:
            for i in range(len(mapDummyToRealDic)):
                realId = mapDummyToRealDic[str(i)]
                f.write(">SegDum" + str(i) + "\n")
                f.write(contigList[realId] + "\n")

        IORobot.extractGraphToContigs(G, folderName, mummerLink, "abun.fasta",
                                      "tmpWithDummy.fasta", gapContentLookUpDic,
                                      mapDummyToRealDic)