Ejemplo n.º 1
0
def nPhaseAlgorithm(args):
    """Run the phasing step on pre-computed inputs, then build plots and FastQ files.

    Creates the output folder tree under ``<args.outputFolder>/<args.strainName>``,
    passes the validated SNP assignments and context depths to
    ``phaseTool.nPhase``, and then post-processes files located in the
    ``Phased`` folder: simplified visualisation data, plots and long-read
    FastQ files. After each stage a short note is printed and handed to
    ``updateLog`` together with the path of ``Readme.txt``.

    Args:
        args: Object exposing the attributes read here: ``outputFolder``,
            ``strainName``, ``validatedSNPAssignmentsFile``,
            ``contextDepthsFile``, ``reference``, ``minSim``, ``minOvl``,
            ``minLen``, ``maxID`` and ``longReadFile``.

    Returns:
        int: Always 0.
    """
    strainName = args.strainName

    # Output folder layout.
    basePath = os.path.join(args.outputFolder, strainName)
    phasedPath = os.path.join(basePath, "Phased")
    phasedFastqPath = os.path.join(phasedPath, "FastQ")
    datavisFolderPath = os.path.join(phasedPath, "Plots")
    logPath = os.path.join(basePath, "Logs")  # Currently not producing any logs

    for folder in (basePath, phasedPath, phasedFastqPath, datavisFolderPath,
                   logPath):
        os.makedirs(folder, exist_ok=True)

    readmePath = os.path.join(basePath, "Readme.txt")

    def noteInReadme(text):
        # Echo the note to the console and record it through updateLog.
        print(text)
        updateLog(readmePath, text)

    # Every result file name starts with
    # "<strain>_<minOvl>_<minSim>_<maxID>_<minLen>_".
    parameterParts = [
        str(value)
        for value in (args.minOvl, args.minSim, args.maxID, args.minLen)
    ]
    runTag = "_".join([strainName] + parameterParts) + "_"

    # Run everything through nPhase
    phaseTool.nPhase(args.validatedSNPAssignmentsFile, strainName,
                     args.contextDepthsFile, phasedPath, args.reference,
                     args.minSim, args.minOvl, args.minLen, args.maxID)
    noteInReadme(
        "\nPhased files can be found at " + phasedPath +
        "\nThe *_variants.tsv file contains information on the consensus heterozygous variants present in each predicted haplotig.\nThe *_clusterReadNames.tsv file contains information on the reads which comprise each cluster."
    )

    # Simplify datavis
    fullVisData = os.path.join(phasedPath, runTag + "visDataFull.tsv")
    simpleVisData = os.path.join(phasedPath, runTag + "visDataSimple.tsv")
    nPhaseFunctions.simplifyDataVis(fullVisData, simpleVisData, 1000)

    # Generate plots
    plotPrefix = os.path.join(datavisFolderPath, runTag)
    nPhaseFunctions.generateDataVis(simpleVisData, plotPrefix)
    noteInReadme("\nPlot can be found at " + datavisFolderPath)

    # Generate FastQ files (kept as the last processing step, as in the
    # original author's note: "Make this last just in case").
    clusterReadNames = os.path.join(phasedPath,
                                    runTag + "clusterReadNames.tsv")
    fastQPrefix = os.path.join(phasedFastqPath, runTag)
    nPhaseFunctions.generateLongReadFastQFiles(clusterReadNames,
                                               args.longReadFile, fastQPrefix)
    noteInReadme("\nLong reads can be found in " + phasedFastqPath)

    print("You can consult the readme at " + readmePath +
          " if you want a bit of guidance about your results. Please raise any issues on https://github.com/nPhasePipeline/nPhase")

    return 0
Ejemplo n.º 2
0
def _runLoggedCommand(command, logFilePath):
    """Run an external command and record its invocation and output.

    The command line is always logged; STDERR/STDOUT sections are added only
    when the process produced any output. The exit status is not checked.

    Args:
        command: Argument list passed to ``subprocess.run``.
        logFilePath: Path handed to ``updateLog`` together with the log text.

    Returns:
        The ``subprocess.CompletedProcess`` of the finished command.
    """
    completed = subprocess.run(command,
                               stderr=subprocess.PIPE,
                               stdout=subprocess.PIPE,
                               universal_newlines=True)
    logText = "COMMAND: " + " ".join(command) + "\n\n"
    if completed.stderr != "" or completed.stdout != "":
        logText += ("STDERR:\n\n" + completed.stderr + "\n\nSTDOUT:\n\n" +
                    completed.stdout + "\n\n")
    updateLog(logFilePath, logText)
    return completed


def partialPipeline(args):
    """Run the pipeline, skipping steps whose results were supplied by the user.

    Steps, in order: index the reference (samtools, bwa, gatk), map long reads
    unless ``args.mappedLongReads`` was given, map and variant-call short
    reads unless ``args.mappedShortReads`` / ``args.vcfFile`` were given,
    select SNPs, keep non-``AF=1.00`` records, derive a BED file of those
    positions, reduce long reads to those SNPs, validate, phase with
    ``phaseTool.nPhase``, and finally generate visualisation data, plots and
    FastQ files.

    The string ``"noMapped"`` in ``args.mappedLongReads`` /
    ``args.mappedShortReads`` and ``"noVCF"`` in ``args.vcfFile`` mean that the
    corresponding file was not provided and must be produced here.

    Args:
        args: Object exposing the attributes read here: ``outputFolder``,
            ``strainName``, ``reference``, ``mappedLongReads``,
            ``mappedShortReads``, ``vcfFile``, ``longReadFile``,
            ``longReadPlatform``, ``threads``, ``shortReadFile_R1``,
            ``shortReadFile_R2``, ``minSim``, ``minOvl``, ``minLen`` and
            ``maxID``.

    Returns:
        int: Always 0.
    """
    allPaths = []

    #Create folder structure
    basePath = os.path.join(args.outputFolder, args.strainName)
    mappedShortReadPath = os.path.join(basePath, "Mapped", "shortReads")
    if args.mappedLongReads == "noMapped":
        # Only needed when the long reads are mapped by this function.
        mappedLongReadPath = os.path.join(basePath, "Mapped", "longReads")
        allPaths.append(mappedLongReadPath)
    variantCalledShortReadPath = os.path.join(basePath, "VariantCalls",
                                              "shortReads")
    variantCalledLongReadPath = os.path.join(basePath, "VariantCalls",
                                             "longReads")
    overlapPath = os.path.join(basePath, "Overlaps")
    phasedPath = os.path.join(basePath, "Phased")
    phasedFastqPath = os.path.join(basePath, "Phased", "FastQ")
    datavisFolderPath = os.path.join(basePath, "Phased", "Plots")
    logPath = os.path.join(basePath, "Logs")  #Currently not producing any logs

    allPaths += [
        basePath, mappedShortReadPath, variantCalledShortReadPath,
        variantCalledLongReadPath, overlapPath, phasedPath, phasedFastqPath,
        datavisFolderPath, logPath
    ]

    for path in allPaths:
        os.makedirs(path, exist_ok=True)

    readmePath = os.path.join(basePath, "Readme.txt")
    fullLogPath = os.path.join(basePath, "Logs", "fullLog.txt")

    ########################
    #Pre-process reference #
    ########################

    #Make sure the reference is indexed (might be missing a process)
    _runLoggedCommand(["samtools", "faidx", args.reference], fullLogPath)
    _runLoggedCommand(["bwa", "index", args.reference], fullLogPath)
    _runLoggedCommand(
        ["gatk", "CreateSequenceDictionary", "-R", args.reference],
        fullLogPath)

    ########################
    #Pre-process long reads#
    ########################

    #Map long reads to reference
    if args.mappedLongReads == "noMapped":
        splitReadFlag = "260"  #This flag allows split reads
        outputLog, systemMessage = nPhaseFunctions.longReadMapping(
            args.strainName, args.longReadFile, args.reference,
            mappedLongReadPath, splitReadFlag, args.longReadPlatform,
            args.threads)
        print(systemMessage)
        updateLog(fullLogPath, outputLog)

    #########################
    #Pre-process short reads#
    #########################

    #Map short reads to reference
    if args.mappedShortReads == "noMapped" and args.vcfFile == "noVCF":
        outputLog, systemMessage = nPhaseFunctions.shortReadMapping(
            args.strainName, args.shortReadFile_R1, args.shortReadFile_R2,
            args.reference, mappedShortReadPath)
        print(systemMessage)
        updateLog(fullLogPath, outputLog)

    if args.vcfFile == "noVCF":
        #Variant call short reads and select SNPs only
        estimatedPloidy = "2"  #This argument is required for GATK's variant calling and we can expect most SNPs to only have two alleles anyway

        if args.mappedShortReads == "noMapped":
            shortReadBam = os.path.join(mappedShortReadPath,
                                        args.strainName + ".final.bam")
        else:
            # Must be an assignment: a comparison here would leave
            # shortReadBam unbound when a pre-mapped BAM is supplied.
            shortReadBam = args.mappedShortReads
        shortReadVCF = os.path.join(variantCalledShortReadPath,
                                    args.strainName + ".vcf")
        _runLoggedCommand([
            "gatk", "HaplotypeCaller", "-R", args.reference, "-ploidy",
            estimatedPloidy, "-I", shortReadBam, "-O", shortReadVCF
        ], fullLogPath)
    else:
        shortReadVCF = args.vcfFile

    shortReadSNPsVCF = os.path.join(variantCalledShortReadPath,
                                    args.strainName + ".SNPs.vcf")
    _runLoggedCommand([
        "gatk", "SelectVariants", "-R", args.reference, "--variant",
        shortReadVCF, "-O", shortReadSNPsVCF, "--select-type-to-include", "SNP"
    ], fullLogPath)

    #Extract heterozygous positions from VCF file
    hetSNPVCFFilePath = os.path.join(variantCalledShortReadPath,
                                     args.strainName + ".hetSNPs.vcf")

    hetSNPVCFLines = []
    with open(shortReadSNPsVCF, "r") as SNPVCFFile:
        for line in SNPVCFFile:
            if "#" in line:
                # Lines containing "#" are copied through untouched.
                hetSNPVCFLines.append(line)
            elif "AF=1.00" not in line:
                # Split the ";"-separated entries into their own columns and
                # keep only the first eight resulting fields.
                fields = line.strip("\n").replace(";", "\t").split("\t")
                hetSNPVCFLines.append("\t".join(fields[:8]) + "\n")

    with open(hetSNPVCFFilePath, "w") as hetSNPVCFFile:
        hetSNPVCFFile.writelines(hetSNPVCFLines)
    del hetSNPVCFLines  # Release the buffered text before the next stages

    print("Identified heterozygous SNPs in short read VCF")

    shortReadPositionsOutputFilePath = os.path.join(
        mappedShortReadPath, args.strainName + ".hetSNPs.positions.tsv")
    nPhaseFunctions.getShortReadPositions(hetSNPVCFFilePath,
                                          shortReadPositionsOutputFilePath)

    shortReadSNPsBedFilePath = os.path.join(mappedShortReadPath,
                                            args.strainName + ".hetSNPs.bed")

    shortReadSNPsBedLines = []
    with open(shortReadPositionsOutputFilePath,
              "r") as shortReadPositionsOutputFile:
        for line in shortReadPositionsOutputFile:
            if "#" in line:
                shortReadSNPsBedLines.append(line)
            elif "AF=1.00" not in line:
                # "<name>:<position>" becomes the three columns
                # name, position - 1, position.
                fields = line.strip("\n").replace(":", "\t").split("\t")
                shortReadSNPsBedLines.append("\t".join(
                    [fields[0], str(int(fields[1]) - 1), fields[1]]) + "\n")

    with open(shortReadSNPsBedFilePath, "w") as shortReadSNPsBedFile:
        shortReadSNPsBedFile.writelines(shortReadSNPsBedLines)
    del shortReadSNPsBedLines  # Release the buffered text

    print("Extracted heterozygous SNP info based on short read VCF")

    #Reduce long reads to their heterozygous SNPs
    if args.mappedLongReads == "noMapped":
        cleanLongReadSamFile = os.path.join(mappedLongReadPath,
                                            args.strainName + ".sorted.sam")
    else:
        cleanLongReadSamFile = args.mappedLongReads
    # The original author marks these three thresholds as currently not used.
    minQ = 0.01
    minMQ = 0
    minAln = 0.5
    longReadPositionNTFile = os.path.join(
        variantCalledLongReadPath,
        args.strainName + ".hetPositions.SNPxLongReads.tsv")
    nPhaseFunctions.assignLongReadToSNPs(cleanLongReadSamFile,
                                         shortReadSNPsBedFilePath,
                                         args.reference, minQ, minMQ, minAln,
                                         longReadPositionNTFile)

    #Only keep the longest reads and get rid of duplicate heterozygous SNP profiles, compensate by keeping context coverage information.
    # The original author marks these three thresholds as currently not used.
    minCov = 0
    minRatio = 0
    minTrioCov = 0
    validatedSNPAssignmentsFile = os.path.join(
        variantCalledLongReadPath,
        args.strainName + ".hetPositions.SNPxLongReads.validated.tsv")
    contextDepthsFile = os.path.join(overlapPath,
                                     args.strainName + ".contextDepths.tsv")
    nPhaseFunctions.longReadValidation(longReadPositionNTFile, minCov,
                                       minRatio, minTrioCov,
                                       validatedSNPAssignmentsFile,
                                       contextDepthsFile)

    #Run everything through nPhase
    phaseTool.nPhase(validatedSNPAssignmentsFile, args.strainName,
                     contextDepthsFile, phasedPath, args.reference,
                     args.minSim, args.minOvl, args.minLen, args.maxID)

    readmeText = "\nPhased files can be found at " + phasedPath + "\nThe *_variants.tsv file contains information on the consensus heterozygous variants present in each predicted haplotig.\nThe *_clusterReadNames.tsv file contains information on the reads which comprise each cluster."
    print(readmeText)
    updateLog(readmePath, readmeText)

    # Shared file-name prefix of the phasing results:
    # "<strain>_<minOvl>_<minSim>_<maxID>_<minLen>_".
    runTag = (args.strainName + "_" + str(args.minOvl) + "_" +
              str(args.minSim) + "_" + str(args.maxID) + "_" +
              str(args.minLen) + "_")

    #Simplify datavis
    dataVisPath = os.path.join(phasedPath, runTag + "visDataFull.tsv")
    simpleOutPath = os.path.join(phasedPath, runTag + "visDataSimple.tsv")
    nPhaseFunctions.simplifyDataVis(dataVisPath, simpleOutPath, 1000)

    #Generate plots
    datavisPath = os.path.join(datavisFolderPath, runTag)
    nPhaseFunctions.generateDataVis(simpleOutPath, datavisPath)

    readmeText = "\nPlot can be found at " + datavisFolderPath
    print(readmeText)
    updateLog(readmePath, readmeText)

    #Generate FastQ Files (kept last, per the original author's note)
    haplotigReadNameFile = os.path.join(phasedPath,
                                        runTag + "clusterReadNames.tsv")
    fastQFilePrefix = os.path.join(phasedFastqPath, runTag)

    nPhaseFunctions.generateLongReadFastQFiles(haplotigReadNameFile,
                                               args.longReadFile,
                                               fastQFilePrefix)

    readmeText = "\nLong reads can be found in " + phasedFastqPath
    print(readmeText)
    updateLog(readmePath, readmeText)

    print(
        "You can consult the readme at " + readmePath +
        " if you want a bit of guidance about your results. Please raise any issues on https://github.com/nPhasePipeline/nPhase"
    )

    return 0
Ejemplo n.º 3
0
def nPhaseCleaning(nPhaseResultFolder,
                   longReadFilePath,
                   strainPrefix,
                   percentKept=98,
                   maxDiscordance=0,
                   deduplicate=True,
                   FFBool=False):
    """Clean the phasing results found in an existing result folder.

    Locates the cluster read names, het SNP VCF and validated long-read SNP
    file inside ``nPhaseResultFolder`` (first glob match for each), rebuilds
    the clusters, stitches and optionally filters them, then writes the
    cleaned variants / cluster files plus phasing, coverage and discordance
    visualisations and FastQ files under
    ``<nPhaseResultFolder>/Phased/Cleaned/<strainPrefix>``.

    Args:
        nPhaseResultFolder: Folder containing the ``Phased`` and
            ``VariantCalls`` sub-folders searched here.
        longReadFilePath: Long-read file forwarded to
            ``nPhaseFunctions.generateLongReadFastQFiles``.
        strainPrefix: Name of the output sub-folder and prefix of every
            output file.
        percentKept: Percentage; converted to ``1 - percentKept / 100`` before
            being passed to ``filterLowCoverageClusters``. Filtering is
            skipped when the converted value is not below 1.0.
        maxDiscordance: Forwarded to ``iterativeStitching``.
        deduplicate: When true, the stitched clusters are passed through
            ``identifyGaps``.
        FFBool: When true, low-coverage filtering runs before stitching
            instead of after it.

    Raises:
        IndexError: If any of the three expected input files is not found.
    """
    percentKept = 1 - (percentKept / 100)

    readClusterPath = glob.glob(
        os.path.join(nPhaseResultFolder, "Phased", "*clusterReadNames.tsv"))[0]
    # Located like the other inputs; fails early if the file is missing.
    knownHetFile = glob.glob(
        os.path.join(nPhaseResultFolder, "VariantCalls", "shortReads",
                     "*.hetSNPs.vcf"))[0]
    readSNPsPath = glob.glob(
        os.path.join(nPhaseResultFolder, "VariantCalls", "longReads",
                     "*.hetPositions.SNPxLongReads.validated.tsv"))[0]
    outPath = os.path.join(nPhaseResultFolder, "Phased", "Cleaned",
                           strainPrefix)
    os.makedirs(outPath, exist_ok=True)
    os.makedirs(os.path.join(outPath, "cleanFastQ"), exist_ok=True)
    os.makedirs(os.path.join(outPath, "Plots"), exist_ok=True)

    phasedSNPFilePath = os.path.join(outPath,
                                     strainPrefix + "_cleaned.variants.tsv")
    phasedClusterPath = os.path.join(
        outPath, strainPrefix + "_cleaned.clusterReadNames.tsv")

    clusterReads = readClusterReads(readClusterPath)
    readSNPs = readReadSNPs(readSNPsPath)

    clusters = rebuildClusters(clusterReads, readSNPs)
    print("Number of clusters before cleaning:", len(clusters))

    # Stitching always runs exactly once; filtering (if enabled) runs either
    # before it (FFBool) or after it.
    applyFilter = percentKept < 1.0
    stitchInput = clusters
    if FFBool and applyFilter:
        stitchInput = filterLowCoverageClusters(clusters, percentKept)
    stitchedClusters = iterativeStitching(stitchInput, maxDiscordance)
    if not FFBool and applyFilter:
        stitchedClusters = filterLowCoverageClusters(stitchedClusters,
                                                     percentKept)

    if deduplicate:
        stitchedClusters = identifyGaps(stitchedClusters, readSNPs, 50)

    makeVariantsFile(stitchedClusters, phasedSNPFilePath)
    makeClusterReadFile(stitchedClusters, phasedClusterPath)

    # giveMeFullData is given only the "SNPs" entry of each cluster.
    simplerStitchedClusters = {
        clusterName: clusterData["SNPs"]
        for clusterName, clusterData in stitchedClusters.items()
    }

    dataText = nPhaseFunctions.giveMeFullData(simplerStitchedClusters)
    dataVisPath = os.path.join(outPath, strainPrefix + "_phasedDataFull.tsv")
    # Context manager so the handle is closed even if the write fails.
    with open(dataVisPath, "w") as dataVisFile:
        dataVisFile.write(dataText)

    # Phasing plot
    phasedSimplePath = os.path.join(outPath,
                                    strainPrefix + "_phasedDataSimple.tsv")
    nPhaseFunctions.simplifyDataVis(dataVisPath, phasedSimplePath, 1000)
    datavisPath = os.path.join(outPath, "Plots", strainPrefix + "_")
    nPhaseFunctions.generatePhasingVis(phasedSimplePath, datavisPath)

    # Coverage
    coveragePath = os.path.join(outPath, strainPrefix + "_covVis.tsv")
    windowSize = 10000
    nPhaseFunctions.generateCoverage(phasedClusterPath, readSNPsPath,
                                     coveragePath, windowSize)
    nPhaseFunctions.generateCoverageVis(coveragePath, datavisPath)

    # Discordance
    discordancePath = os.path.join(outPath,
                                   strainPrefix + "_discordanceVis.tsv")
    nPhaseFunctions.generateDiscordance(phasedClusterPath, readSNPsPath,
                                        discordancePath)
    nPhaseFunctions.generateDiscordanceVis(discordancePath, datavisPath)

    fastQOut = os.path.join(outPath, "cleanFastQ", strainPrefix)
    nPhaseFunctions.generateLongReadFastQFiles(phasedClusterPath,
                                               longReadFilePath, fastQOut)

    print("Cleaning done! Your cleaning results are in", outPath)