Beispiel #1
0
def runVariantCaller(options, continuing=False):
    """
    Run the variant caller over the regions implied by 'options'.

    If continuing == True, then we are picking up a failed job from where it
    left off: the log file is appended to rather than overwritten, and the
    regions to process are taken from options.unfinishedRegions instead of
    being recomputed with platypusutils.getRegions.

    Attributes read from 'options': verbosity, logFileName, output, nCPU and
    (only when continuing) unfinishedRegions.

    With nCPU == 1 the work is done in this process via PlatypusSingleProcess;
    otherwise one PlatypusMultiProcess is started per CPU, each writing to its
    own "<output>_temp_<index>" file, and the pieces are merged at the end.

    Raises StandardError if options.verbosity matches none of the accepted
    values (0, 1, 2, or anything >= 3).
    """
    # Seed the Python random number generator
    random.seed("Full many a flower is born to blush unseen and waste its sweetness on the desert air")

    # Validate 'verbosity' before touching the log file, so that a bad value
    # neither truncates an existing log nor leaves a file handle open.
    if options.verbosity == 0:
        consoleLevel = logging.ERROR
    elif options.verbosity == 1:
        consoleLevel = logging.WARNING
    elif options.verbosity == 2 or options.verbosity >= 3:
        # Debug goes to file only. Verbosity 2 and >= 3 currently give the
        # same console level.
        consoleLevel = logging.INFO
    else:
        raise StandardError("Value of 'verbosity' input parameter must be between 0 and 3 inclusive")

    # Set up basic logging: everything goes to the file, the console is filtered.
    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")

    log = logging.getLogger('Log')
    log.setLevel(logging.DEBUG)

    ch = logging.StreamHandler()
    ch.setFormatter(formatter)
    ch.setLevel(consoleLevel)

    # Append when resuming, so the log of the failed run is preserved.
    fh = logging.FileHandler(options.logFileName, 'a' if continuing else 'w')
    fh.setFormatter(formatter)
    fh.setLevel(logging.DEBUG)

    log.addHandler(ch)
    log.addHandler(fh)

    if continuing:
        log.info("Continuing variant calling from where we left off.")
    else:
        log.info("Beginning variant calling")

    log.info("Output will go to %s", options.output)

    if continuing:
        regions = options.unfinishedRegions
    else:
        regions = sorted(platypusutils.getRegions(options), cmp=regionSort)

    if options.nCPU == 1:
        if options.output == "-":
            # "-" is handed straight to the worker; no temporary file is
            # created, so there is nothing to merge afterwards.
            fileName = options.output
        else:
            fileName = options.output + "_temp_1.gz"

        p1 = PlatypusSingleProcess(fileName, options, regions, continuing)
        p1.run()

        if options.output != "-":
            mergeVCFFiles([fileName], options.output, log)
    else:
        fileNames = set()
        processes = []

        # Deal the regions out round-robin, one bucket per worker process.
        regionsForEachProcess = [[] for _ in range(options.nCPU)]

        for index, region in enumerate(regions):
            regionsForEachProcess[index % options.nCPU].append(region)

        for index in range(options.nCPU):
            fileName = options.output + "_temp_%s" % (index)
            fileNames.add(fileName)
            processes.append(PlatypusMultiProcess(fileName, options, regionsForEachProcess[index]))

        for process in processes:
            process.start()

        # Wait for every worker before merging their output.
        for process in processes:
            process.join()

        # Final output file
        mergeVCFFiles(fileNames, options.output, log)

    # All done. Write a message to the log, so that it's clear when the
    # program has actually finished, and not crashed.
    log.info("Finished variant calling")
Beispiel #2
0
def continueCalling(args):
    """
    This function allows the user to re-start Platypus from the partially completed output of
    a previous job. This takes a single argument: the VCF file of a previous incomplete job. Platypus
    then picks up all the options for the previous job from the VCF header, and restarts calling from the latest
    sensible position (the last integer multiple of --bufferSize on the last chromosome in the VCF).

    'args' is the command-line argument list; only --vcfFile is recognised here.

    The records of the old VCF that lie before the restart position are copied
    into a new file (the old name with ".vcf" replaced by
    "_ContinuedFromFailedProcess.vcf"), and runVariantCaller is then invoked
    with continuing=True on the remaining regions.

    Returns None. Problems with the input (no --vcfFile, no platypus options in
    the header, no variant records, multi-process job) are logged and the
    function returns without calling anything. Raises StandardError if no
    region ending at the restart position can be found.
    """
    # Create a logger
    logger = logging.getLogger("ATemporaryLog")
    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
    ch = logging.StreamHandler()
    ch.setFormatter(formatter)
    logger.addHandler(ch)
    ch.setLevel(logging.DEBUG)
    logger.setLevel(logging.DEBUG)

    # Seed the Python random number generator
    random.seed("Yet acquiescingly I did turn as he pointed: neither pride nor hope rekindling at the end descried, so much as gladness that some end might be.")
    parser = extendedoptparse.OptionParser()
    parser.add_option("--vcfFile", dest="vcfFile", help="Platypus will start again from the nearest possible co-ordinate to the end of this VCF. This must be a VCF produced by Platypus", action='store', type='string')
    (options, args) = parser.parse_args(args)

    if not options.vcfFile:
        logger.error("No input VCF was specified (--vcfFile)")
        logger.error("Quitting now.")
        return

    newOutputFileName = options.vcfFile.replace(".vcf", "_ContinuedFromFailedProcess.vcf")

    if newOutputFileName == options.vcfFile:
        # The input name contains no ".vcf", so the replace was a no-op. Writing
        # to the same path would truncate the very file we are reading from.
        newOutputFileName = options.vcfFile + "_ContinuedFromFailedProcess.vcf"

    logger.info("Platypus will now attempt to finish running a failed process, from the VCF output in file %s", options.vcfFile)
    logger.info("Complete output (old + new) will go to file %s", newOutputFileName)

    lastLine = None
    platypusOptions = None

    # First pass: recover the options of the old job and the last variant record.
    with open(options.vcfFile, 'r') as theVCF:
        for line in theVCF:

            if "platypusOptions=" in line:
                platypusOptions = parsePlatypusOptionsFromVCFHeader(line)

            # Only data records carry a position; skip headers and blank lines.
            if line.strip() and line[0] != "#":
                lastLine = line

    if platypusOptions is None:
        logger.error("Could not parse old platypus options from VCF %s", options.vcfFile)
        logger.error("Check that VCF file is a valid platypus output file")
        logger.error("Quitting now.")
        return

    if platypusOptions.nCPU != 1:
        logger.error("Platypus can only currently continue from single process jobs")
        logger.error("The VCF you specified was produced from a multi-process Platypus job (--nCPU != 1).")
        logger.error("Quitting now.")
        return

    if lastLine is None:
        logger.error("VCF %s contains no variant records, so there is no position to continue from", options.vcfFile)
        logger.error("Quitting now.")
        return

    cols = lastLine.strip().split("\t")

    lastChrom = cols[0]
    realLastPos = int(cols[1]) - 1

    # Round down to a multiple of bufferSize; calling restarts from there.
    lastPos = (realLastPos//platypusOptions.bufferSize)*platypusOptions.bufferSize

    logger.info("Previous job failed at %s:%s. Job will be re-run from %s:%s", lastChrom, realLastPos, lastChrom, lastPos)
    allRegions = sorted(platypusutils.getRegions(platypusOptions), cmp=regionSort)
    theIndex = -1

    # Find the region on lastChrom whose third field equals lastPos (presumably
    # the region end -- confirm against platypusutils.getRegions); everything
    # after it is unfinished. If several match, the last one wins.
    for index, region in enumerate(allRegions):
        if region[0] == lastChrom and region[2] == lastPos:
            theIndex = index + 1

    if theIndex == -1:
        raise StandardError("Could not find region which was unfinished in input VCF")

    logger.info("Platypus will continue calling. Output will go to file %s.", newOutputFileName)

    doneRegions = allRegions[:theIndex]
    doneChroms = set(x[0] for x in doneRegions if x[0] != lastChrom)

    # Second pass: copy the header and the finished part of the old VCF into
    # the new file. Both files are closed even if a record fails to parse.
    with open(options.vcfFile, 'r') as theVCF:
        with open(newOutputFileName, "w") as outputVCF:
            for line in theVCF:

                if line[0] == "#":
                    outputVCF.write(line)
                    continue

                cols = line.split("\t")
                chrom = cols[0]
                pos = int(cols[1]) - 1

                if chrom in doneChroms or (chrom == lastChrom and pos < lastPos):
                    outputVCF.write(line)
                else:
                    # First record at or beyond the restart point: stop copying.
                    break

    platypusOptions.unfinishedRegions = allRegions[theIndex:]
    platypusOptions.output = newOutputFileName
    runVariantCaller(platypusOptions, continuing=True)
Beispiel #3
0
def continueCalling(args):
    """
    This function allows the user to re-start Platypus from the partially completed output of
    a previous job. This takes a single argument: the VCF file of a previous incomplete job. Platypus
    then picks up all the options for the previous job from the VCF header, and restarts calling from the latest
    sensible position (the last integer multiple of --bufferSize on the last chromosome in the VCF).

    'args' is the command-line argument list; only --vcfFile is recognised here.

    The records of the old VCF that lie before the restart position are copied
    into a new file (the old name with ".vcf" replaced by
    "_ContinuedFromFailedProcess.vcf"), and runVariantCaller is then invoked
    with continuing=True on the remaining regions. Files are opened with the
    project's Open helper.

    Returns None. Problems with the input (no --vcfFile, no platypus options in
    the header, no variant records, multi-process job) are logged and the
    function returns without calling anything. Raises StandardError if no
    region ending at the restart position can be found.
    """
    # Create a logger
    logger = logging.getLogger("ATemporaryLog")
    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
    ch = logging.StreamHandler()
    ch.setFormatter(formatter)
    logger.addHandler(ch)
    ch.setLevel(logging.DEBUG)
    logger.setLevel(logging.DEBUG)

    # Seed the Python random number generator
    random.seed("Yet acquiescingly I did turn as he pointed: neither pride nor hope rekindling at the end descried, so much as gladness that some end might be.")
    parser = extendedoptparse.OptionParser()
    parser.add_option("--vcfFile", dest="vcfFile", help="Platypus will start again from the nearest possible co-ordinate to the end of this VCF. This must be a VCF produced by Platypus", action='store', type='string')
    (options, args) = parser.parse_args(args)

    if not options.vcfFile:
        logger.error("No input VCF was specified (--vcfFile)")
        logger.error("Quitting now.")
        return

    newOutputFileName = options.vcfFile.replace(".vcf", "_ContinuedFromFailedProcess.vcf")

    if newOutputFileName == options.vcfFile:
        # The input name contains no ".vcf", so the replace was a no-op. Writing
        # to the same path would truncate the very file we are reading from.
        newOutputFileName = options.vcfFile + "_ContinuedFromFailedProcess.vcf"

    logger.info("Platypus will now attempt to finish running a failed process, from the VCF output in file %s", options.vcfFile)
    logger.info("Complete output (old + new) will go to file %s", newOutputFileName)

    lastLine = None
    platypusOptions = None

    # First pass: recover the options of the old job and the last variant record.
    # try/finally rather than 'with': what Open returns is not visible here, so
    # only the close() method the original code already relied on is assumed.
    theVCF = Open(options.vcfFile, 'r')

    try:
        for line in theVCF:

            if "platypusOptions=" in line:
                platypusOptions = parsePlatypusOptionsFromVCFHeader(line)

            # Only data records carry a position; skip headers and blank lines.
            if line.strip() and line[0] != "#":
                lastLine = line
    finally:
        theVCF.close()

    if platypusOptions is None:
        logger.error("Could not parse old platypus options from VCF %s", options.vcfFile)
        logger.error("Check that VCF file is a valid platypus output file")
        logger.error("Quitting now.")
        return

    if platypusOptions.nCPU != 1:
        logger.error("Platypus can only currently continue from single process jobs")
        logger.error("The VCF you specified was produced from a multi-process Platypus job (--nCPU != 1).")
        logger.error("Quitting now.")
        return

    if lastLine is None:
        logger.error("VCF %s contains no variant records, so there is no position to continue from", options.vcfFile)
        logger.error("Quitting now.")
        return

    cols = lastLine.strip().split("\t")

    lastChrom = cols[0]
    realLastPos = int(cols[1]) - 1

    # Round down to a multiple of bufferSize; calling restarts from there.
    lastPos = (realLastPos//platypusOptions.bufferSize)*platypusOptions.bufferSize

    logger.info("Previous job failed at %s:%s. Job will be re-run from %s:%s", lastChrom, realLastPos, lastChrom, lastPos)
    allRegions = sorted(platypusutils.getRegions(platypusOptions), cmp=regionSort)
    theIndex = -1

    # Find the region on lastChrom whose third field equals lastPos (presumably
    # the region end -- confirm against platypusutils.getRegions); everything
    # after it is unfinished. If several match, the last one wins.
    for index, region in enumerate(allRegions):
        if region[0] == lastChrom and region[2] == lastPos:
            theIndex = index + 1

    if theIndex == -1:
        raise StandardError("Could not find region which was unfinished in input VCF")

    logger.info("Platypus will continue calling. Output will go to file %s.", newOutputFileName)

    doneRegions = allRegions[:theIndex]
    doneChroms = set(x[0] for x in doneRegions if x[0] != lastChrom)

    # Second pass: copy the header and the finished part of the old VCF into
    # the new file. Both files are closed even if a record fails to parse.
    theVCF = Open(options.vcfFile, 'r')

    try:
        outputVCF = Open(newOutputFileName, "w")

        try:
            for line in theVCF:

                if line[0] == "#":
                    outputVCF.write(line)
                    continue

                cols = line.split("\t")
                chrom = cols[0]
                pos = int(cols[1]) - 1

                if chrom in doneChroms or (chrom == lastChrom and pos < lastPos):
                    outputVCF.write(line)
                else:
                    # First record at or beyond the restart point: stop copying.
                    break
        finally:
            outputVCF.close()
    finally:
        theVCF.close()

    platypusOptions.unfinishedRegions = allRegions[theIndex:]
    platypusOptions.output = newOutputFileName
    runVariantCaller(platypusOptions, continuing=True)
Beispiel #4
0
def runVariantCaller(options, continuing=False):
    """
    Run the variant caller over the regions implied by 'options'.

    If continuing == True, then we are picking up a failed job from where it
    left off: the log file is appended to rather than overwritten, and the
    regions to process are taken from options.unfinishedRegions instead of
    being recomputed with platypusutils.getRegions.

    'options' is first passed through expandPaths. Attributes read afterwards:
    verbosity, logFileName, output, nCPU and (only when continuing)
    unfinishedRegions.

    With nCPU == 1 the work is done in this process via PlatypusSingleProcess;
    otherwise one PlatypusMultiProcess is started per CPU, each writing to its
    own "<output>_temp_<index>" file, and the pieces are merged at the end.

    Raises StandardError if options.verbosity matches none of the accepted
    values (0, 1, 2, or anything >= 3).
    """
    options = expandPaths(options)

    # Seed the Python random number generator
    random.seed("Full many a flower is born to blush unseen and waste its sweetness on the desert air")

    # Validate 'verbosity' before touching the log file, so that a bad value
    # neither truncates an existing log nor leaves a file handle open.
    if options.verbosity == 0:
        consoleLevel = logging.ERROR
    elif options.verbosity == 1:
        consoleLevel = logging.WARNING
    elif options.verbosity == 2 or options.verbosity >= 3:
        # Debug goes to file only. Verbosity 2 and >= 3 currently give the
        # same console level.
        consoleLevel = logging.INFO
    else:
        raise StandardError("Value of 'verbosity' input parameter must be between 0 and 3 inclusive")

    # Set up basic logging: everything goes to the file, the console is filtered.
    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")

    log = logging.getLogger('Log')
    log.setLevel(logging.DEBUG)

    ch = logging.StreamHandler()
    ch.setFormatter(formatter)
    ch.setLevel(consoleLevel)

    # Append when resuming, so the log of the failed run is preserved.
    fh = logging.FileHandler(options.logFileName, 'a' if continuing else 'w')
    fh.setFormatter(formatter)
    fh.setLevel(logging.DEBUG)

    log.addHandler(ch)
    log.addHandler(fh)

    if continuing:
        log.info("Continuing variant calling from where we left off.")
    else:
        log.info("Beginning variant calling")

    log.info("Output will go to %s", options.output)

    if continuing:
        regions = options.unfinishedRegions
    else:
        regions = sorted(platypusutils.getRegions(options), cmp=regionSort)

    if options.nCPU == 1:
        if options.output == "-":
            # "-" is handed straight to the worker; no temporary file is
            # created, so there is nothing to merge afterwards.
            fileName = options.output
        else:
            fileName = options.output + "_temp_1.gz"

        p1 = PlatypusSingleProcess(fileName, options, regions, continuing)
        p1.run()

        if options.output != "-":
            mergeVCFFiles([fileName], options.output, log)
    else:
        fileNames = set()
        processes = []

        # Deal the regions out round-robin, one bucket per worker process.
        regionsForEachProcess = [[] for _ in range(options.nCPU)]

        for index, region in enumerate(regions):
            regionsForEachProcess[index % options.nCPU].append(region)

        for index in range(options.nCPU):
            fileName = options.output + "_temp_%s" % (index)
            fileNames.add(fileName)
            processes.append(PlatypusMultiProcess(fileName, options, regionsForEachProcess[index]))

        for process in processes:
            process.start()

        # Wait for every worker before merging their output.
        for process in processes:
            process.join()

        # Final output file
        mergeVCFFiles(fileNames, options.output, log)

    # All done. Write a message to the log, so that it's clear when the
    # program has actually finished, and not crashed.
    log.info("Finished variant calling")
Beispiel #5
0
def runVariantCaller(options, continuing=False):
    """
    Run the variant caller over the regions implied by 'options'.

    If continuing == True, then we are picking up a failed job from where it
    left off: the log file is appended to rather than overwritten, and the
    regions to process are taken from options.unfinishedRegions instead of
    being recomputed with platypusutils.getRegions.

    'options' is first passed through expandPaths. Attributes read afterwards:
    verbosity, logFileName, output, nCPU and (only when continuing)
    unfinishedRegions.

    Work is always delegated to PlatypusMultiProcess workers (one per CPU),
    while this process polls them so that a KeyboardInterrupt can terminate
    them all. Unless output is "-", the per-process "<output>_temp_<index>"
    files are merged into the final output at the end.

    Raises Exception if options.verbosity matches none of the accepted values
    (0, 1, 2, or anything >= 3). Calls sys.exit(1) on KeyboardInterrupt.
    """

    options = expandPaths(options)

    # Seed the Python random number generator
    random.seed(
        "Full many a flower is born to blush unseen and waste its sweetness on the desert air"
    )

    # Validate 'verbosity' before touching the log file, so that a bad value
    # neither truncates an existing log nor leaves a file handle open.
    if options.verbosity == 0:
        consoleLevel = logging.ERROR
    elif options.verbosity == 1:
        consoleLevel = logging.WARNING
    elif options.verbosity == 2 or options.verbosity >= 3:
        # Debug goes to file only. Verbosity 2 and >= 3 currently give the
        # same console level.
        consoleLevel = logging.INFO
    else:
        raise Exception(
            "Value of 'verbosity' input parameter must be between 0 and 3 inclusive"
        )

    # Set up basic logging: everything goes to the file, the console is filtered.
    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")

    log = logging.getLogger('Log')
    log.setLevel(logging.DEBUG)

    ch = logging.StreamHandler()
    ch.setFormatter(formatter)
    ch.setLevel(consoleLevel)

    # Append when resuming, so the log of the failed run is preserved.
    fh = logging.FileHandler(options.logFileName, 'a' if continuing else 'w')
    fh.setFormatter(formatter)
    fh.setLevel(logging.DEBUG)

    log.addHandler(ch)
    log.addHandler(fh)

    if continuing:
        log.info("Continuing variant calling from where we left off.")
    else:
        log.info("Beginning variant calling")

    log.info("Output will go to %s", options.output)

    if continuing:
        regions = options.unfinishedRegions
    else:
        regions = sorted(platypusutils.getRegions(options), cmp=regionSort)

    # Always create process manager even if nCPU=1, so that we can listen for signals from the main thread
    fileNames = set()
    processes = []

    # Deal the regions out round-robin, one bucket per worker process.
    regionsForEachProcess = [[] for _ in range(options.nCPU)]

    for index, region in enumerate(regions):
        regionsForEachProcess[index % options.nCPU].append(region)

    if options.nCPU == 1 and options.output == "-":
        # Single worker writing directly to "-": no temporary file, no merge.
        processes.append(
            PlatypusMultiProcess("-", options, regionsForEachProcess[0]))
    else:
        # NOTE(review): with output == "-" and nCPU > 1 the workers write to
        # "-_temp_<index>" files which are never merged below -- confirm
        # whether that combination is meant to be supported.
        for index in range(options.nCPU):
            fileName = options.output + "_temp_%s" % (index)
            fileNames.add(fileName)
            processes.append(
                PlatypusMultiProcess(fileName, options,
                                     regionsForEachProcess[index]))

    for process in processes:
        process.start()

    # listen for signals while any process is alive. The try wraps the whole
    # loop so that an interrupt arriving while polling is_alive() is handled
    # the same way as one arriving during the sleep.
    try:
        while any(process.is_alive() for process in processes):
            time.sleep(1)
    except KeyboardInterrupt:
        print("KeyboardInterrupt detected, terminating all processes...")
        for process in processes:
            process.terminate()
        log.error("Variant calling aborted due to keyboard interrupt")
        sys.exit(1)

    # make sure all processes are finished
    for process in processes:
        process.join()

    # Final output file
    if options.output != "-":
        mergeVCFFiles(fileNames, options.output, log)

    # All done. Write a message to the log, so that it's clear when the
    # program has actually finished, and not crashed.
    log.info("Finished variant calling")