def nPhaseAlgorithm(args):
    """Run the phasing step on already-prepared inputs, then plot and export.

    Attributes read from ``args``: outputFolder, strainName,
    validatedSNPAssignmentsFile, contextDepthsFile, reference, minSim,
    minOvl, minLen, maxID and longReadFile.

    Sequence of work:
      1. create <outputFolder>/<strainName>/{Phased, Phased/FastQ,
         Phased/Plots, Logs};
      2. call ``phaseTool.nPhase``;
      3. call ``nPhaseFunctions.simplifyDataVis`` and ``generateDataVis``;
      4. call ``nPhaseFunctions.generateLongReadFastQFiles``.
    After steps 2, 3 and 4 a short message is printed and handed to
    ``updateLog`` together with the Readme.txt path.

    Returns 0.
    """
    strainDir = os.path.join(args.outputFolder, args.strainName)
    phasedPath = os.path.join(strainDir, "Phased")
    phasedFastqPath = os.path.join(phasedPath, "FastQ")
    datavisFolderPath = os.path.join(phasedPath, "Plots")
    logPath = os.path.join(strainDir, "Logs")  # Currently not producing any logs

    for folder in (strainDir, phasedPath, phasedFastqPath, datavisFolderPath,
                   logPath):
        os.makedirs(folder, exist_ok=True)

    readmePath = os.path.join(strainDir, "Readme.txt")

    def announce(message):
        # Show the message on screen and pass it to updateLog for the readme.
        print(message)
        updateLog(readmePath, message)

    # Every output file name starts with
    # <strain>_<minOvl>_<minSim>_<maxID>_<minLen>
    runTag = "_".join([
        args.strainName,
        str(args.minOvl),
        str(args.minSim),
        str(args.maxID),
        str(args.minLen),
    ])

    # Run everything through nPhase
    phaseTool.nPhase(args.validatedSNPAssignmentsFile, args.strainName,
                     args.contextDepthsFile, phasedPath, args.reference,
                     args.minSim, args.minOvl, args.minLen, args.maxID)
    announce(
        "\nPhased files can be found at " + phasedPath +
        "\nThe *_variants.tsv file contains information on the consensus heterozygous variants present in each predicted haplotig.\nThe *_clusterReadNames.tsv file contains information on the reads which comprise each cluster."
    )

    # Simplify datavis
    dataVisPath = os.path.join(phasedPath, runTag + "_visDataFull.tsv")
    simpleOutPath = os.path.join(phasedPath, runTag + "_visDataSimple.tsv")
    nPhaseFunctions.simplifyDataVis(dataVisPath, simpleOutPath, 1000)

    # Generate plots
    datavisPath = os.path.join(datavisFolderPath, runTag + "_")
    nPhaseFunctions.generateDataVis(simpleOutPath, datavisPath)
    announce("\nPlot can be found at " + datavisFolderPath)

    # Generate FastQ files (kept as the last step)
    haplotigReadNameFile = os.path.join(phasedPath,
                                        runTag + "_clusterReadNames.tsv")
    fastQFilePrefix = os.path.join(phasedFastqPath, runTag + "_")
    nPhaseFunctions.generateLongReadFastQFiles(haplotigReadNameFile,
                                               args.longReadFile,
                                               fastQFilePrefix)
    announce("\nLong reads can be found in " + phasedFastqPath)

    print(
        "You can consult the readme at " + readmePath +
        " if you want a bit of guidance about your results. Please raise any issues on https://github.com/nPhasePipeline/nPhase"
    )
    return 0
def _runLoggedCommand(command, fullLogPath):
    """Run an external command and record it through ``updateLog``.

    command is the argument list handed to ``subprocess.run``; fullLogPath is
    passed to ``updateLog`` along with the command line and, when the process
    produced any, its captured stderr/stdout.

    The exit status is not inspected, so a failing tool does not stop the
    caller here. Returns the ``subprocess.CompletedProcess``.
    """
    completed = subprocess.run(command,
                               stderr=subprocess.PIPE,
                               stdout=subprocess.PIPE,
                               universal_newlines=True)
    logText = "COMMAND: " + " ".join(command) + "\n\n"
    if completed.stderr != "" or completed.stdout != "":
        logText += ("STDERR:\n\n" + completed.stderr + "\n\nSTDOUT:\n\n" +
                    completed.stdout + "\n\n")
    updateLog(fullLogPath, logText)
    return completed


def _writeHetSNPVCF(snpVCFPath, hetSNPVCFPath):
    """Copy a SNP VCF to hetSNPVCFPath, dropping lines that contain "AF=1.00".

    Lines containing "#" are copied unchanged. Every other kept line has ";"
    replaced by a tab and is cut down to its first eight tab-separated fields.
    """
    keptLines = []
    with open(snpVCFPath, "r") as snpVCF:
        for line in snpVCF:
            if "#" in line:
                keptLines.append(line)
            elif "AF=1.00" not in line:
                fields = line.strip("\n").replace(";", "\t").split("\t")
                keptLines.append("\t".join(fields[:8]) + "\n")
    with open(hetSNPVCFPath, "w") as hetSNPVCF:
        hetSNPVCF.writelines(keptLines)


def _writeHetSNPBed(positionsPath, bedPath):
    """Turn a positions file into three-column lines: name, pos-1, pos.

    Lines containing "#" are copied unchanged and lines containing "AF=1.00"
    are skipped. Every other line is split on ":" and tabs; its first field
    and the integer in its second field provide the three output columns.
    """
    bedLines = []
    with open(positionsPath, "r") as positions:
        for line in positions:
            if "#" in line:
                bedLines.append(line)
            elif "AF=1.00" not in line:
                fields = line.strip("\n").replace(":", "\t").split("\t")
                bedLines.append("\t".join(
                    [fields[0], str(int(fields[1]) - 1), fields[1]]) + "\n")
    with open(bedPath, "w") as bedFile:
        bedFile.writelines(bedLines)


def partialPipeline(args):
    """Run the pipeline, reusing any pre-computed inputs the user supplied.

    Attributes read from ``args``: outputFolder, strainName, reference,
    longReadFile, longReadPlatform, threads, shortReadFile_R1,
    shortReadFile_R2, mappedLongReads, mappedShortReads, vcfFile, minSim,
    minOvl, minLen and maxID.

    Steps that are skipped depending on ``args``:
      * long-read mapping runs only when ``args.mappedLongReads`` equals
        "noMapped"; otherwise that value is used as the long-read SAM path;
      * short-read mapping runs only when ``args.mappedShortReads`` equals
        "noMapped" and ``args.vcfFile`` equals "noVCF";
      * GATK HaplotypeCaller runs only when ``args.vcfFile`` equals "noVCF";
        otherwise ``args.vcfFile`` is used as the short-read VCF.

    External tools invoked: samtools, bwa and gatk. Their command lines and
    captured output are passed to ``updateLog`` with the Logs/fullLog.txt
    path; their exit status is not checked.

    Returns 0.
    """
    allPaths = []

    # Create folder structure
    basePath = os.path.join(args.outputFolder, args.strainName)
    mappedShortReadPath = os.path.join(basePath, "Mapped", "shortReads")
    if args.mappedLongReads == "noMapped":
        # Only needed when this run maps the long reads itself.
        mappedLongReadPath = os.path.join(basePath, "Mapped", "longReads")
        allPaths.append(mappedLongReadPath)
    variantCalledShortReadPath = os.path.join(basePath, "VariantCalls",
                                              "shortReads")
    variantCalledLongReadPath = os.path.join(basePath, "VariantCalls",
                                             "longReads")
    overlapPath = os.path.join(basePath, "Overlaps")
    phasedPath = os.path.join(basePath, "Phased")
    phasedFastqPath = os.path.join(basePath, "Phased", "FastQ")
    datavisFolderPath = os.path.join(basePath, "Phased", "Plots")
    logPath = os.path.join(basePath, "Logs")

    allPaths += [
        basePath, mappedShortReadPath, variantCalledShortReadPath,
        variantCalledLongReadPath, overlapPath, phasedPath, phasedFastqPath,
        datavisFolderPath, logPath
    ]
    for path in allPaths:
        os.makedirs(path, exist_ok=True)

    readmePath = os.path.join(basePath, "Readme.txt")
    fullLogPath = os.path.join(basePath, "Logs", "fullLog.txt")

    ########################
    # Pre-process reference #
    ########################

    # Make sure the reference is indexed (might be missing a process)
    _runLoggedCommand(["samtools", "faidx", args.reference], fullLogPath)
    _runLoggedCommand(["bwa", "index", args.reference], fullLogPath)
    _runLoggedCommand(
        ["gatk", "CreateSequenceDictionary", "-R", args.reference],
        fullLogPath)

    ########################
    # Pre-process long reads#
    ########################

    # Map long reads to reference
    if args.mappedLongReads == "noMapped":
        splitReadFlag = "260"  # This flag allows split reads
        outputLog, systemMessage = nPhaseFunctions.longReadMapping(
            args.strainName, args.longReadFile, args.reference,
            mappedLongReadPath, splitReadFlag, args.longReadPlatform,
            args.threads)
        print(systemMessage)
        updateLog(fullLogPath, outputLog)

    #########################
    # Pre-process short reads#
    #########################

    # Map short reads to reference
    if args.mappedShortReads == "noMapped" and args.vcfFile == "noVCF":
        outputLog, systemMessage = nPhaseFunctions.shortReadMapping(
            args.strainName, args.shortReadFile_R1, args.shortReadFile_R2,
            args.reference, mappedShortReadPath)
        print(systemMessage)
        updateLog(fullLogPath, outputLog)

    if args.vcfFile == "noVCF":
        # Variant call short reads and select SNPs only
        # This argument is required for GATK's variant calling and we can
        # expect most SNPs to only have two alleles anyway
        estimatedPloidy = "2"
        if args.mappedShortReads == "noMapped":
            shortReadBam = os.path.join(mappedShortReadPath,
                                        args.strainName + ".final.bam")
        else:
            # Was `==` (a no-op comparison), which left shortReadBam unbound
            # and raised NameError whenever a pre-mapped BAM was supplied.
            shortReadBam = args.mappedShortReads

        shortReadVCF = os.path.join(variantCalledShortReadPath,
                                    args.strainName + ".vcf")
        _runLoggedCommand([
            "gatk", "HaplotypeCaller", "-R", args.reference, "-ploidy",
            estimatedPloidy, "-I", shortReadBam, "-O", shortReadVCF
        ], fullLogPath)
    else:
        shortReadVCF = args.vcfFile

    shortReadSNPsVCF = os.path.join(variantCalledShortReadPath,
                                    args.strainName + ".SNPs.vcf")
    _runLoggedCommand([
        "gatk", "SelectVariants", "-R", args.reference, "--variant",
        shortReadVCF, "-O", shortReadSNPsVCF, "--select-type-to-include",
        "SNP"
    ], fullLogPath)

    # Extract heterozygous positions from VCF file
    hetSNPVCFFilePath = os.path.join(variantCalledShortReadPath,
                                     args.strainName + ".hetSNPs.vcf")
    _writeHetSNPVCF(shortReadSNPsVCF, hetSNPVCFFilePath)
    print("Identified heterozygous SNPs in short read VCF")

    shortReadPositionsOutputFilePath = os.path.join(
        mappedShortReadPath, args.strainName + ".hetSNPs.positions.tsv")
    nPhaseFunctions.getShortReadPositions(hetSNPVCFFilePath,
                                          shortReadPositionsOutputFilePath)

    shortReadSNPsBedFilePath = os.path.join(mappedShortReadPath,
                                            args.strainName + ".hetSNPs.bed")
    _writeHetSNPBed(shortReadPositionsOutputFilePath,
                    shortReadSNPsBedFilePath)
    print("Extracted heterozygous SNP info based on short read VCF")

    # Reduce long reads to their heterozygous SNPs
    if args.mappedLongReads == "noMapped":
        cleanLongReadSamFile = os.path.join(mappedLongReadPath,
                                            args.strainName + ".sorted.sam")
    else:
        cleanLongReadSamFile = args.mappedLongReads

    # The original author marked these three thresholds "Currently Not Used".
    minQ = 0.01
    minMQ = 0
    minAln = 0.5
    longReadPositionNTFile = os.path.join(
        variantCalledLongReadPath,
        args.strainName + ".hetPositions.SNPxLongReads.tsv")
    nPhaseFunctions.assignLongReadToSNPs(cleanLongReadSamFile,
                                         shortReadSNPsBedFilePath,
                                         args.reference, minQ, minMQ, minAln,
                                         longReadPositionNTFile)

    # Only keep the longest reads and get rid of duplicate heterozygous SNP
    # profiles, compensate by keeping context coverage information.
    # These three thresholds were likewise marked "Currently Not Used".
    minCov = 0
    minRatio = 0
    minTrioCov = 0
    validatedSNPAssignmentsFile = os.path.join(
        variantCalledLongReadPath,
        args.strainName + ".hetPositions.SNPxLongReads.validated.tsv")
    contextDepthsFile = os.path.join(overlapPath,
                                     args.strainName + ".contextDepths.tsv")
    nPhaseFunctions.longReadValidation(longReadPositionNTFile, minCov,
                                       minRatio, minTrioCov,
                                       validatedSNPAssignmentsFile,
                                       contextDepthsFile)

    # Run everything through nPhase
    phaseTool.nPhase(validatedSNPAssignmentsFile, args.strainName,
                     contextDepthsFile, phasedPath, args.reference,
                     args.minSim, args.minOvl, args.minLen, args.maxID)
    readmeText = "\nPhased files can be found at " + phasedPath + "\nThe *_variants.tsv file contains information on the consensus heterozygous variants present in each predicted haplotig.\nThe *_clusterReadNames.tsv file contains information on the reads which comprise each cluster."
    print(readmeText)
    updateLog(readmePath, readmeText)

    # Shared stem of every phasing output file name:
    # <strain>_<minOvl>_<minSim>_<maxID>_<minLen>
    runTag = (args.strainName + "_" + str(args.minOvl) + "_" +
              str(args.minSim) + "_" + str(args.maxID) + "_" +
              str(args.minLen))

    # Simplify datavis
    dataVisPath = os.path.join(phasedPath, runTag + "_visDataFull.tsv")
    simpleOutPath = os.path.join(phasedPath, runTag + "_visDataSimple.tsv")
    nPhaseFunctions.simplifyDataVis(dataVisPath, simpleOutPath, 1000)

    # Generate plots
    datavisPath = os.path.join(datavisFolderPath, runTag + "_")
    nPhaseFunctions.generateDataVis(simpleOutPath, datavisPath)
    readmeText = "\nPlot can be found at " + datavisFolderPath
    print(readmeText)
    updateLog(readmePath, readmeText)

    # Generate FastQ files (kept as the last step)
    haplotigReadNameFile = os.path.join(phasedPath,
                                        runTag + "_clusterReadNames.tsv")
    fastQFilePrefix = os.path.join(phasedFastqPath, runTag + "_")
    nPhaseFunctions.generateLongReadFastQFiles(haplotigReadNameFile,
                                               args.longReadFile,
                                               fastQFilePrefix)
    readmeText = "\nLong reads can be found in " + phasedFastqPath
    print(readmeText)
    updateLog(readmePath, readmeText)

    print(
        "You can consult the readme at " + readmePath +
        " if you want a bit of guidance about your results. Please raise any issues on https://github.com/nPhasePipeline/nPhase"
    )
    return 0
def nPhaseCleaning(nPhaseResultFolder,
                   longReadFilePath,
                   strainPrefix,
                   percentKept=98,
                   maxDiscordance=0,
                   deduplicate=True,
                   FFBool=False):
    """Post-process an existing nPhase result folder and write cleaned outputs.

    nPhaseResultFolder must already contain files matching
    Phased/*clusterReadNames.tsv, VariantCalls/shortReads/*.hetSNPs.vcf and
    VariantCalls/longReads/*.hetPositions.SNPxLongReads.validated.tsv; the
    first glob match of each is used and a missing match raises IndexError.

    percentKept is converted to ``1 - percentKept / 100`` and, when that value
    is below 1.0, passed to ``filterLowCoverageClusters``. FFBool chooses
    whether that filter runs before (True) or after (False)
    ``iterativeStitching``. When deduplicate is true, ``identifyGaps`` is
    applied afterwards.

    Outputs go under <nPhaseResultFolder>/Phased/Cleaned/<strainPrefix>/,
    including the sub-folders cleanFastQ and Plots. Returns None.
    """

    def firstMatch(*patternParts):
        # First glob hit below the result folder; IndexError when none exist.
        return glob.glob(os.path.join(nPhaseResultFolder, *patternParts))[0]

    filterFraction = 1 - (percentKept / 100)

    readClusterPath = firstMatch("Phased", "*clusterReadNames.tsv")
    # Not used further down, but the lookup still fails when the file is absent.
    knownHetFile = firstMatch("VariantCalls", "shortReads", "*.hetSNPs.vcf")
    readSNPsPath = firstMatch("VariantCalls", "longReads",
                              "*.hetPositions.SNPxLongReads.validated.tsv")

    outPath = os.path.join(nPhaseResultFolder, "Phased", "Cleaned",
                           strainPrefix)
    for folder in (outPath, os.path.join(outPath, "cleanFastQ"),
                   os.path.join(outPath, "Plots")):
        os.makedirs(folder, exist_ok=True)

    phasedSNPFilePath = os.path.join(outPath,
                                     strainPrefix + "_cleaned.variants.tsv")
    phasedClusterPath = os.path.join(
        outPath, strainPrefix + "_cleaned.clusterReadNames.tsv")

    readSNPs = readReadSNPs(readSNPsPath) if False else None  # placeholder replaced below
    clusterReads = readClusterReads(readClusterPath)
    readSNPs = readReadSNPs(readSNPsPath)
    clusters = rebuildClusters(clusterReads, readSNPs)
    print("Number of clusters before cleaning:", len(clusters))

    # Filter-first vs stitch-first ordering, selected by FFBool.
    shouldFilter = filterFraction < 1.0
    stitchedClusters = clusters
    if FFBool and shouldFilter:
        stitchedClusters = filterLowCoverageClusters(stitchedClusters,
                                                     filterFraction)
    stitchedClusters = iterativeStitching(stitchedClusters, maxDiscordance)
    if shouldFilter and not FFBool:
        stitchedClusters = filterLowCoverageClusters(stitchedClusters,
                                                     filterFraction)

    if deduplicate:
        stitchedClusters = identifyGaps(stitchedClusters, readSNPs, 50)

    makeVariantsFile(stitchedClusters, phasedSNPFilePath)
    makeClusterReadFile(stitchedClusters, phasedClusterPath)

    # Phasing plot: only the "SNPs" entry of each cluster is passed on.
    snpsByCluster = {
        clusterName: clusterData["SNPs"]
        for clusterName, clusterData in stitchedClusters.items()
    }
    dataText = nPhaseFunctions.giveMeFullData(snpsByCluster)
    dataVisPath = os.path.join(outPath, strainPrefix + "_phasedDataFull.tsv")
    with open(dataVisPath, "w") as dataVisFile:
        dataVisFile.write(dataText)

    plotPrefix = os.path.join(outPath, "Plots", strainPrefix + "_")

    phasedSimplePath = os.path.join(outPath,
                                    strainPrefix + "_phasedDataSimple.tsv")
    nPhaseFunctions.simplifyDataVis(dataVisPath, phasedSimplePath, 1000)
    nPhaseFunctions.generatePhasingVis(phasedSimplePath, plotPrefix)

    # Coverage (the value returned by generateCoverage is not used)
    coveragePath = os.path.join(outPath, strainPrefix + "_covVis.tsv")
    windowSize = 10000
    nPhaseFunctions.generateCoverage(phasedClusterPath, readSNPsPath,
                                     coveragePath, windowSize)
    nPhaseFunctions.generateCoverageVis(coveragePath, plotPrefix)

    # Discordance
    discordancePath = os.path.join(outPath,
                                   strainPrefix + "_discordanceVis.tsv")
    nPhaseFunctions.generateDiscordance(phasedClusterPath, readSNPsPath,
                                        discordancePath)
    nPhaseFunctions.generateDiscordanceVis(discordancePath, plotPrefix)

    fastQOut = os.path.join(outPath, "cleanFastQ", strainPrefix)
    nPhaseFunctions.generateLongReadFastQFiles(phasedClusterPath,
                                               longReadFilePath, fastQOut)

    print("Cleaning done! Your cleaning results are in", outPath)