Esempio n. 1
0
def main():
    parser = argparse.ArgumentParser("Haplotype Comparison")

    # input
    parser.add_argument("-v",
                        "--version",
                        dest="version",
                        action="store_true",
                        help="Show version number and exit.")

    parser.add_argument("-r",
                        "--reference",
                        dest="ref",
                        default=None,
                        help="Specify a reference file.")

    # output
    parser.add_argument("-o",
                        "--report-prefix",
                        dest="reports_prefix",
                        default=None,
                        help="Filename prefix for report output.")
    parser.add_argument("--scratch-prefix",
                        dest="scratch_prefix",
                        default=None,
                        help="Directory for scratch files.")
    parser.add_argument("--keep-scratch",
                        dest="delete_scratch",
                        default=True,
                        action="store_false",
                        help="Filename prefix for scratch report output.")

    # add quantification args
    qfy.updateArgs(parser)

    # control preprocessing
    pre.updateArgs(parser)
    parser.add_argument(
        '--convert-gvcf-truth',
        dest='convert_gvcf_truth',
        action="store_true",
        default=False,
        help=
        'Convert the truth set from genome VCF format to a VCF before processing.'
    )
    parser.add_argument(
        '--convert-gvcf-query',
        dest='convert_gvcf_query',
        action="store_true",
        default=False,
        help=
        'Convert the query set from genome VCF format to a VCF before processing.'
    )
    parser.add_argument(
        "--preprocess-truth",
        dest="preprocessing_truth",
        action="store_true",
        default=False,
        help=
        "Preprocess truth file with same settings as query (default is to accept truth in original format)."
    )
    parser.add_argument(
        "--usefiltered-truth",
        dest="usefiltered_truth",
        action="store_true",
        default=False,
        help=
        "Use filtered variant calls in truth file (by default, only PASS calls in the truth file are used)"
    )
    parser.add_argument(
        "--preprocessing-window-size",
        dest="preprocess_window",
        default=10000,
        type=int,
        help=
        "Preprocessing window size (variants further apart than that size are not expected to interfere)."
    )
    parser.add_argument(
        "--adjust-conf-regions",
        dest="preprocessing_truth_confregions",
        action="store_true",
        default=True,
        help=
        "Adjust confident regions to include variant locations. Note this will only include variants "
        "that are included in the CONF regions already when viewing with bcftools; this option only "
        "makes sure insertions are padded correctly in the CONF regions (to capture these, both the "
        "base before and after must be contained in the bed file).")
    parser.add_argument("--no-adjust-conf-regions",
                        dest="preprocessing_truth_confregions",
                        action="store_false",
                        help="Do not adjust confident regions for insertions.")

    # detailed control of comparison
    parser.add_argument(
        "--unhappy",
        "--no-haplotype-comparison",
        dest="no_hc",
        action="store_true",
        default=False,
        help=
        "Disable haplotype comparison (only count direct GT matches as TP).")

    parser.add_argument(
        "-w",
        "--window-size",
        dest="window",
        default=50,
        type=int,
        help=
        "Minimum distance between variants such that they fall into the same superlocus."
    )

    # xcmp-specific stuff
    parser.add_argument(
        "--xcmp-enumeration-threshold",
        dest="max_enum",
        default=16768,
        type=int,
        help=
        "Enumeration threshold / maximum number of sequences to enumerate per block."
    )

    parser.add_argument(
        "--xcmp-expand-hapblocks",
        dest="hb_expand",
        default=30,
        type=int,
        help="Expand haplotype blocks by this many basepairs left and right.")
    parser.add_argument("--threads",
                        dest="threads",
                        default=multiprocessing.cpu_count(),
                        type=int,
                        help="Number of threads to use.")

    parser.add_argument(
        "--engine",
        dest="engine",
        default="xcmp",
        choices=["xcmp", "vcfeval", "scmp-somatic", "scmp-distance"],
        help="Comparison engine to use.")

    parser.add_argument(
        "--engine-vcfeval-path",
        dest="engine_vcfeval",
        required=False,
        default=Haplo.vcfeval.findVCFEval(),
        help="This parameter should give the path to the \"rtg\" executable. "
        "The default is %s" % Haplo.vcfeval.findVCFEval())

    parser.add_argument(
        "--engine-vcfeval-template",
        dest="engine_vcfeval_template",
        required=False,
        help=
        "Vcfeval needs the reference sequence formatted in its own file format "
        "(SDF -- run rtg format -o ref.SDF ref.fa). You can specify this here "
        "to save time when running hap.py with vcfeval. If no SDF folder is "
        "specified, hap.py will create a temporary one.")

    parser.add_argument(
        "--scmp-distance",
        dest="engine_scmp_distance",
        required=False,
        default=30,
        type=int,
        help=
        "For distance-based matching (vcfeval and scmp), this is the distance between variants to use."
    )

    parser.add_argument(
        "--lose-match-distance",
        dest="engine_scmp_distance",
        required=False,
        type=int,
        help=
        "For distance-based matching (vcfeval and scmp), this is the distance between variants to use."
    )

    if Tools.has_sge:
        parser.add_argument(
            "--force-interactive",
            dest="force_interactive",
            default=False,
            action="store_true",
            help=
            "Force running interactively (i.e. when JOB_ID is not in the environment)"
        )

    parser.add_argument("_vcfs", help="Two VCF files.", default=[], nargs="*")

    parser.add_argument(
        "--logfile",
        dest="logfile",
        default=None,
        help="Write logging information into file rather than to stderr")

    verbosity_options = parser.add_mutually_exclusive_group(required=False)

    verbosity_options.add_argument(
        "--verbose",
        dest="verbose",
        default=False,
        action="store_true",
        help="Raise logging level from warning to info.")

    verbosity_options.add_argument(
        "--quiet",
        dest="quiet",
        default=False,
        action="store_true",
        help="Set logging level to output errors only.")

    args, unknown_args = parser.parse_known_args()

    if not Tools.has_sge:
        args.force_interactive = True

    if args.verbose:
        loglevel = logging.INFO
    elif args.quiet:
        loglevel = logging.ERROR
    else:
        loglevel = logging.WARNING

    # reinitialize logging
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)
    logging.basicConfig(filename=args.logfile,
                        format='%(asctime)s %(levelname)-8s %(message)s',
                        level=loglevel)

    # remove some safe unknown args
    unknown_args = [
        x for x in unknown_args if x not in ["--force-interactive"]
    ]
    if len(sys.argv) < 2 or len(unknown_args) > 0:
        if unknown_args:
            logging.error("Unknown arguments specified : %s " %
                          str(unknown_args))
        parser.print_help()
        exit(1)

    print "Hap.py %s" % Tools.version
    if args.version:
        exit(0)

    if args.roc:
        args.write_vcf = True

    # sanity-check regions bed file (HAP-57)
    if args.regions_bedfile:
        logging.info("Checking input regions.")
        if bedOverlapCheck(args.regions_bedfile):
            raise Exception(
                "The regions bed file (specified using -R) has overlaps, this will not work with xcmp."
                " You can either use -T, or run the file through bedtools merge"
            )

    if args.fp_bedfile and not os.path.exists(args.fp_bedfile):
        raise Exception("FP/confident call region bed file does not exist.")

    if not args.force_interactive and "JOB_ID" not in os.environ:
        parser.print_help()
        raise Exception(
            "Please qsub me so I get approximately 1 GB of RAM per thread.")

    if not args.ref:
        args.ref = Tools.defaultReference()

    if not args.ref or not os.path.exists(args.ref):
        raise Exception("Please specify a valid reference path using -r.")

    if not args.reports_prefix:
        raise Exception("Please specify an output prefix using -o ")

    if not os.path.exists(os.path.dirname(os.path.abspath(
            args.reports_prefix))):
        raise Exception(
            "The output path does not exist. Please specify a valid output path and prefix using -o"
        )

    if os.path.basename(args.reports_prefix) == "" or os.path.isdir(
            args.reports_prefix):
        raise Exception(
            "The output path should specify a file name prefix. Please specify a valid output path "
            "and prefix using -o. For example, -o /tmp/test will create files named /tmp/test* ."
        )

    # noinspection PyProtectedMember
    if not args._vcfs or len(args._vcfs) != 2:
        raise Exception("Please specify exactly two input VCFs.")

    # noinspection PyProtectedMember
    args.vcf1 = args._vcfs[0]
    # noinspection PyProtectedMember
    args.vcf2 = args._vcfs[1]

    if not os.path.exists(args.vcf1):
        raise Exception("Input file %s does not exist." % args.vcf1)
    if not os.path.exists(args.vcf2):
        raise Exception("Input file %s does not exist." % args.vcf2)

    tempfiles = []

    # turn on allele conversion
    if (args.engine == "scmp-somatic" or args.engine == "scmp-distance") \
            and not args.somatic_allele_conversion:
        args.somatic_allele_conversion = True
        if args.engine == "scmp-distance":
            args.somatic_allele_conversion = "first"

    # somatic allele conversion should also switch off decomposition
    if args.somatic_allele_conversion and ("-D" not in sys.argv
                                           and "--decompose" not in sys.argv):
        args.preprocessing_decompose = False

    # xcmp/scmp support bcf; others don't
    if args.engine in ["xcmp", "scmp-somatic", "scmp-distance"] \
            and (args.bcf or (args.vcf1.endswith(".bcf") and args.vcf2.endswith(".bcf"))):
        internal_format_suffix = ".bcf"
    else:
        internal_format_suffix = ".vcf.gz"

    # write session info and args file
    session = sessionInfo()
    session["final_args"] = args.__dict__
    with open(args.reports_prefix + ".runinfo.json", "w") as sessionfile:
        json.dump(session, sessionfile)

    try:
        logging.info("Comparing %s and %s" % (args.vcf1, args.vcf2))

        logging.info("Preprocessing truth: %s" % args.vcf1)
        starttime = time.time()

        ttf = tempfile.NamedTemporaryFile(delete=False,
                                          dir=args.scratch_prefix,
                                          prefix="truth.pp",
                                          suffix=internal_format_suffix)
        ttf.close()

        if args.engine.endswith("somatic") and \
           args.preprocessing_truth and \
           (args.preprocessing_leftshift or args.preprocessing_norm or args.preprocessing_decompose):
            args.preprocessing_truth = False
            logging.info(
                "Turning off pre.py preprocessing for somatic comparisons")

        if args.preprocessing_truth:
            if args.filter_nonref:
                logging.info(
                    "Filtering out any variants genotyped as <NON_REF>")

        ## Only converting truth gvcf to vcf if both arguments are true
        convert_gvcf_truth = False
        if args.convert_gvcf_truth or args.convert_gvcf_to_vcf:
            logging.info("Converting genome VCF to VCF")
            convert_gvcf_truth = True

        tempfiles.append(ttf.name)
        tempfiles.append(ttf.name + ".csi")
        tempfiles.append(ttf.name + ".tbi")
        args.gender = pre.preprocess(
            args.vcf1,
            ttf.name,
            args.ref,
            args.locations,
            None if args.usefiltered_truth else "*",  # filters
            args.fixchr,
            args.regions_bedfile,
            args.targets_bedfile,
            args.preprocessing_leftshift
            if args.preprocessing_truth else False,
            args.preprocessing_decompose
            if args.preprocessing_truth else False,
            args.preprocessing_norm if args.preprocessing_truth else False,
            args.preprocess_window,
            args.threads,
            args.gender,
            args.somatic_allele_conversion,
            "TRUTH",
            filter_nonref=args.filter_nonref
            if args.preprocessing_truth else False,
            convert_gvcf_to_vcf=convert_gvcf_truth)

        args.vcf1 = ttf.name

        if args.fp_bedfile and args.preprocessing_truth_confregions:
            conf_temp = Haplo.gvcf2bed.gvcf2bed(args.vcf1, args.ref,
                                                args.fp_bedfile,
                                                args.scratch_prefix)
            tempfiles.append(conf_temp)
            args.strat_regions.append("CONF_VARS:" + conf_temp)

        h1 = vcfextract.extractHeadersJSON(args.vcf1)

        elapsed = time.time() - starttime
        logging.info("preprocess for %s -- time taken %.2f" %
                     (args.vcf1, elapsed))

        # once we have preprocessed the truth file we can resolve the locations
        # doing this here improves the time for query preprocessing below
        reference_contigs = set(fastaContigLengths(args.ref).keys())

        if not args.locations:
            # default set of locations is the overlap between truth and reference
            args.locations = list(reference_contigs
                                  & set(h1["tabix"]["chromosomes"]))
            if not args.locations:
                raise Exception(
                    "Truth and reference have no chromosomes in common!")
        elif type(args.locations) is not list:
            args.locations = args.locations.split(",")

        args.locations = sorted(args.locations)

        logging.info("Preprocessing query: %s" % args.vcf2)
        if args.filter_nonref:
            logging.info("Filtering out any variants genotyped as <NON_REF>")

        ## Only converting truth gvcf to vcf if both arguments are true
        convert_gvcf_query = False
        if args.convert_gvcf_query or args.convert_gvcf_to_vcf:
            logging.info("Converting genome VCF to VCF")
            convert_gvcf_query = True

        starttime = time.time()

        if args.pass_only:
            filtering = "*"
        else:
            filtering = args.filters_only

        qtf = tempfile.NamedTemporaryFile(delete=False,
                                          dir=args.scratch_prefix,
                                          prefix="query.pp",
                                          suffix=internal_format_suffix)
        qtf.close()
        tempfiles.append(qtf.name)
        tempfiles.append(qtf.name + ".csi")
        tempfiles.append(qtf.name + ".tbi")

        if args.engine.endswith("somatic") and \
           (args.preprocessing_leftshift or args.preprocessing_norm or args.preprocessing_decompose):
            args.preprocessing_leftshift = False
            args.preprocessing_norm = False
            args.preprocessing_decompose = False
            logging.info(
                "Turning off pre.py preprocessing (query) for somatic comparisons"
            )

        pre.preprocess(
            args.vcf2,
            qtf.name,
            args.ref,
            str(",".join(args.locations)),
            filtering,
            args.fixchr,
            args.regions_bedfile,
            args.targets_bedfile,
            args.preprocessing_leftshift,
            args.preprocessing_decompose,
            args.preprocessing_norm,
            args.preprocess_window,
            args.threads,
            args.gender,  # same gender as truth above
            args.somatic_allele_conversion,
            "QUERY",
            filter_nonref=args.filter_nonref,
            convert_gvcf_to_vcf=convert_gvcf_query)

        args.vcf2 = qtf.name
        h2 = vcfextract.extractHeadersJSON(args.vcf2)

        elapsed = time.time() - starttime
        logging.info("preprocess for %s -- time taken %.2f" %
                     (args.vcf2, elapsed))

        if not h1["tabix"]:
            raise Exception("Truth file is not indexed after preprocesing.")

        if not h2["tabix"]:
            raise Exception("Query file is not indexed after preprocessing.")

        for _xc in args.locations:
            if _xc not in h2["tabix"]["chromosomes"]:
                logging.warn("No calls for location %s in query!" % _xc)

        pool = getPool(args.threads)
        if args.threads > 1 and args.engine == "xcmp":
            logging.info("Running using %i parallel processes." % args.threads)

            # find balanced pieces
            # cap parallelism at 64 since otherwise bcftools concat below might run out
            # of file handles
            args.pieces = min(args.threads, 64)
            res = runParallel(pool, Haplo.blocksplit.blocksplitWrapper,
                              args.locations, args)

            if None in res:
                raise Exception("One of the blocksplit processes failed.")

            tempfiles += res

            args.locations = []
            for f in res:
                with open(f) as fp:
                    for l in fp:
                        ll = l.strip().split("\t", 3)
                        if len(ll) < 3:
                            continue
                        xchr = ll[0]
                        start = int(ll[1]) + 1
                        end = int(ll[2])
                        args.locations.append("%s:%i-%i" % (xchr, start, end))

        # count variants before normalisation
        if "samples" not in h1 or not h1["samples"]:
            raise Exception("Cannot read sample names from truth VCF file")

        if "samples" not in h2 or not h2["samples"]:
            raise Exception("Cannot read sample names from query VCF file")

        tf = tempfile.NamedTemporaryFile(delete=False,
                                         dir=args.scratch_prefix,
                                         prefix="hap.py.result.",
                                         suffix=internal_format_suffix)
        tf.close()
        tempfiles.append(tf.name)
        tempfiles.append(tf.name + ".tbi")
        tempfiles.append(tf.name + ".csi")
        output_name = tf.name

        if args.engine == "xcmp":
            # do xcmp
            logging.info("Using xcmp for comparison")
            res = runParallel(pool, Haplo.xcmp.xcmpWrapper, args.locations,
                              args)
            tempfiles += [x for x in res if x is not None]  # VCFs

            if None in res:
                raise Exception("One of the xcmp jobs failed.")

            if len(res) == 0:
                raise Exception(
                    "Input files/regions do not contain variants (0 haplotype blocks were processed)."
                )

            # concatenate + index
            logging.info("Concatenating variants...")
            runme_list = [x for x in res if x is not None]
            if len(runme_list) == 0:
                raise Exception("No outputs to concatenate!")

            logging.info("Concatenating...")
            bcftools.concatenateParts(output_name, *runme_list)
            logging.info("Indexing...")
            bcftools.runBcftools("index", output_name)
            # passed to quantify
            args.type = "xcmp"
            # xcmp extracts whichever field we're using into the QQ info field
            args.roc_header = args.roc
            args.roc = "IQQ"
        elif args.engine == "vcfeval":
            tempfiles += Haplo.vcfeval.runVCFEval(args.vcf1, args.vcf2,
                                                  output_name, args)
            # passed to quantify
            args.type = "ga4gh"
        elif args.engine.startswith("scmp"):
            tempfiles += Haplo.scmp.runSCmp(args.vcf1, args.vcf2, output_name,
                                            args)
            # passed to quantify
            args.type = "ga4gh"
        else:
            raise Exception("Unknown comparison engine: %s" % args.engine)

        if args.preserve_info and args.engine == "vcfeval":
            # if we use vcfeval we need to merge the INFO fields back in.
            tf = tempfile.NamedTemporaryFile(suffix=".txt", delete=False)
            tempfiles.append(tf)
            print >> tf, "TRUTH_IN"
            print >> tf, "QUERY_IN"
            tf.close()
            info_file = tempfile.NamedTemporaryFile(suffix=".vcf.gz",
                                                    delete=False)
            tempfiles.append(info_file.name)
            info_file.close()

            bcftools.runBcftools("merge", args.vcf1, args.vcf2,
                                 "--force-samples", "-m", "all", "|",
                                 "bcftools", "reheader", "-s", tf.name, "|",
                                 "bcftools", "view", "-o", info_file.name,
                                 "-O", "z")
            bcftools.runBcftools("index", info_file.name)

            merged_info_file = tempfile.NamedTemporaryFile(suffix=".vcf.gz",
                                                           delete=False)
            tempfiles.append(merged_info_file.name)
            merged_info_file.close()

            bcftools.runBcftools("merge", output_vcf, info_file.name, "-m",
                                 "all", "|", "bcftools", "view", "-s",
                                 "^TRUTH_IN,QUERY_IN", "-X", "-U", "-o",
                                 merged_info_file.name, "-O", "z")
            output_name = merged_info_file.name

        args.in_vcf = [output_name]
        args.runner = "hap.py"
        qfy.quantify(args)

    finally:
        if args.delete_scratch:
            for x in tempfiles:
                try:
                    os.remove(x)
                except:
                    pass
        else:
            logging.info("Scratch files kept : %s" % (str(tempfiles)))
Esempio n. 2
0
def partialCredit(vcfname,
                  outputname,
                  reference,
                  locations,
                  threads=1,
                  window=10000,
                  leftshift=True,
                  decompose=True,
                  haploid_x=False):
    """ Partial-credit-process a VCF file according to our args """

    pool = getPool(int(threads))
    if threads > 1:
        logging.info("Partial credit processing uses %i parallel processes." % threads)

        if not locations:
            h = extractHeadersJSON(vcfname)
            if not h["tabix"]["chromosomes"]:
                logging.warn("Empty input or not tabix indexed")
                if outputname.endswith(".bcf"):
                    runBcftools("view", "-O", "b", "-o", outputname, vcfname)
                    runBcftools("index", outputname)
                else:
                    runBcftools("view", "-O", "z", "-o", outputname, vcfname)
                    runBcftools("index", "-t", outputname)
                # just return the same file
                return
            locations = h["tabix"]["chromosomes"]
        elif type(locations) is str or type(locations) is unicode:
            locations = locations.split(",")

        # use blocksplit to subdivide input
        res = runParallel(pool,
                          blocksplitWrapper,
                          locations,
                          {"vcf": vcfname,
                           "dist": window,
                           "pieces": min(40, threads*4)})

        if None in res:
            raise Exception("One of the blocksplit processes failed.")

        locations = list(itertools.chain.from_iterable(res))
        if not len(locations):
            logging.warn("Blocksplit returned no blocks. This can happen when "
                         "an input contains no valid variants.")
            locations = [""]
    else:
        locations = [""]

    res = []
    try:
        res = runParallel(pool,
                          preprocessWrapper,
                          itertools.izip(itertools.repeat(vcfname), locations),
                          {"reference": reference,
                           "decompose": decompose,
                           "leftshift": leftshift,
                           "haploid_x": haploid_x,
                           "bcf": outputname.endswith(".bcf")})

        if None in res:
            raise Exception("One of the preprocess jobs failed")
        if not res:
            raise Exception("No blocks were processed. List of locations: %s" % str(list(locations)))

        concatenateParts(outputname, *res)
        if outputname.endswith(".vcf.gz"):
            runBcftools("index", "-f", "-t", outputname)
        else:  # use bcf
            runBcftools("index", "-f", outputname)
    finally:
        for r in res:
            try:
                os.unlink(r)
            except:
                pass
            try:
                os.unlink(r + ".tbi")
            except:
                pass
            try:
                os.unlink(r + ".csi")
            except:
                pass
Esempio n. 3
0
def main():
    parser = argparse.ArgumentParser("Haplotype Comparison")

    # input
    parser.add_argument("-v", "--version", dest="version", action="store_true",
                        help="Show version number and exit.")

    parser.add_argument("-r", "--reference", dest="ref", default=None, help="Specify a reference file.")

    # output
    parser.add_argument("-o", "--report-prefix", dest="reports_prefix",
                        default=None,
                        help="Filename prefix for report output.")
    parser.add_argument("--scratch-prefix", dest="scratch_prefix",
                        default=None,
                        help="Directory for scratch files.")
    parser.add_argument("--keep-scratch", dest="delete_scratch",
                        default=True, action="store_false",
                        help="Filename prefix for scratch report output.")


    # add quantification args
    qfy.updateArgs(parser)

    # control preprocessing
    pre.updateArgs(parser)
    parser.add_argument("--preprocess-truth", dest="preprocessing_truth", action="store_true", default=False,
                        help="Preprocess truth file with same settings as query (default is to accept truth in original format).")
    parser.add_argument("--usefiltered-truth", dest="usefiltered_truth", action="store_true", default=False,
                        help="Preprocess truth file with same settings as query (default is to accept truth in original format).")
    parser.add_argument("--preprocessing-window-size", dest="preprocess_window",
                        default=10000, type=int,
                        help="Preprocessing window size (variants further apart than that size are not expected to interfere).")

    # detailed control of comparison
    parser.add_argument("--unhappy", "--no-haplotype-comparison", dest="no_hc", action="store_true", default=False,
                        help="Disable haplotype comparison (only count direct GT matches as TP).")

    parser.add_argument("-w", "--window-size", dest="window",
                        default=50, type=int,
                        help="Minimum distance between variants such that they fall into the same superlocus.")

    # xcmp-specific stuff
    parser.add_argument("--xcmp-enumeration-threshold", dest="max_enum",
                        default=16768, type=int,
                        help="Enumeration threshold / maximum number of sequences to enumerate per block.")

    parser.add_argument("--xcmp-expand-hapblocks", dest="hb_expand",
                        default=30, type=int,
                        help="Expand haplotype blocks by this many basepairs left and right.")
    parser.add_argument("--threads", dest="threads",
                        default=multiprocessing.cpu_count(), type=int,
                        help="Number of threads to use.")

    parser.add_argument("--engine", dest="engine",
                        default="xcmp", choices=["xcmp", "vcfeval"],
                        help="Comparison engine to use.")

    parser.add_argument("--engine-vcfeval-path", dest="engine_vcfeval", required=False,
                        default=Haplo.vcfeval.findVCFEval(),
                        help="This parameter should give the path to the \"rtg\" executable. "
                             "The default is %s" % Haplo.vcfeval.findVCFEval())
    parser.add_argument("--engine-vcfeval-template", dest="engine_vcfeval_template", required=False,
                        help="Vcfeval needs the reference sequence formatted in its own file format "
                             "(SDF -- run rtg format -o ref.SDF ref.fa). You can specify this here "
                             "to save time when running hap.py with vcfeval. If no SDF folder is "
                             "specified, hap.py will create a temporary one.")

    if Tools.has_sge:
        parser.add_argument("--force-interactive", dest="force_interactive",
                            default=False, action="store_true",
                            help="Force running interactively (i.e. when JOB_ID is not in the environment)")

    parser.add_argument("_vcfs", help="Two VCF files.", default=[], nargs="*")

    parser.add_argument("--logfile", dest="logfile", default=None,
                        help="Write logging information into file rather than to stderr")

    verbosity_options = parser.add_mutually_exclusive_group(required=False)

    verbosity_options.add_argument("--verbose", dest="verbose", default=False, action="store_true",
                                   help="Raise logging level from warning to info.")

    verbosity_options.add_argument("--quiet", dest="quiet", default=False, action="store_true",
                                   help="Set logging level to output errors only.")

    args, unknown_args = parser.parse_known_args()

    if not Tools.has_sge:
        args.force_interactive = True

    if args.verbose:
        loglevel = logging.INFO
    elif args.quiet:
        loglevel = logging.ERROR
    else:
        loglevel = logging.WARNING

    # reinitialize logging
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)
    logging.basicConfig(filename=args.logfile,
                        format='%(asctime)s %(levelname)-8s %(message)s',
                        level=loglevel)

    # remove some safe unknown args
    unknown_args = [x for x in unknown_args if x not in ["--force-interactive"]]
    if len(sys.argv) < 2 or len(unknown_args) > 0:
        if unknown_args:
            logging.error("Unknown arguments specified : %s " % str(unknown_args))
        parser.print_help()
        exit(1)

    if args.version:
        print "Hap.py %s" % Tools.version
        exit(0)

    if args.roc:
        args.write_vcf = True

    # sanity-check regions bed file (HAP-57)
    if args.regions_bedfile:
        logging.info("Checking input regions.")
        if bedOverlapCheck(args.regions_bedfile):
            raise Exception("The regions bed file (specified using -R) has overlaps, this will not work with xcmp."
                            " You can either use -T, or run the file through bedtools merge")

    if args.fp_bedfile and not os.path.exists(args.fp_bedfile):
        raise Exception("FP/confident call region bed file does not exist.")

    if not args.force_interactive and "JOB_ID" not in os.environ:
        parser.print_help()
        raise Exception("Please qsub me so I get approximately 1 GB of RAM per thread.")

    if not args.ref:
        args.ref = Tools.defaultReference()

    if not os.path.exists(args.ref):
        raise Exception("Please specify a valid reference path using -r.")

    if not args.reports_prefix:
        raise Exception("Please specify an output prefix using -o ")

    if not os.path.exists(os.path.dirname(os.path.abspath(args.reports_prefix))):
        raise Exception("The output path does not exist. Please specify a valid output path and prefix using -o")

    if os.path.basename(args.reports_prefix) == "" or os.path.isdir(args.reports_prefix):
        raise Exception("The output path should specify a file name prefix. Please specify a valid output path "
                        "and prefix using -o. For example, -o /tmp/test will create files named /tmp/test* .")

    # noinspection PyProtectedMember
    if not args._vcfs or len(args._vcfs) != 2:
        raise Exception("Please specify exactly two input VCFs.")

    # noinspection PyProtectedMember
    args.vcf1 = args._vcfs[0]
    # noinspection PyProtectedMember
    args.vcf2 = args._vcfs[1]

    if not os.path.exists(args.vcf1):
        raise Exception("Input file %s does not exist." % args.vcf1)
    if not os.path.exists(args.vcf2):
        raise Exception("Input file %s does not exist." % args.vcf2)

    tempfiles = []

    # xcmp supports bcf; others don't
    if args.engine == "xcmp" and (args.bcf or (args.vcf1.endswith(".bcf") and args.vcf2.endswith(".bcf"))):
        internal_format_suffix = ".bcf"
    else:
        internal_format_suffix = ".vcf.gz"

    try:
        logging.info("Comparing %s and %s" % (args.vcf1, args.vcf2))

        logging.info("Preprocessing truth: %s" % args.vcf1)
        starttime = time.time()

        ttf = tempfile.NamedTemporaryFile(delete=False,
                                          dir=args.scratch_prefix,
                                          prefix="truth.pp",
                                          suffix=internal_format_suffix)
        ttf.close()
        tempfiles.append(ttf.name)
        tempfiles.append(ttf.name + ".csi")
        tempfiles.append(ttf.name + ".tbi")
        pre.preprocess(args.vcf1,
                       ttf.name,
                       args.ref,
                       args.locations,
                       None if args.usefiltered_truth else "*",  # filters
                       args.fixchr,
                       args.regions_bedfile,
                       args.targets_bedfile,
                       args.preprocessing_leftshift if args.preprocessing_truth else False,
                       args.preprocessing_decompose if args.preprocessing_truth else False,
                       args.preprocessing_norm if args.preprocessing_truth else False,
                       args.preprocess_window,
                       args.threads)

        args.vcf1 = ttf.name
        h1 = vcfextract.extractHeadersJSON(args.vcf1)

        elapsed = time.time() - starttime
        logging.info("preprocess for %s -- time taken %.2f" % (args.vcf1, elapsed))

        # once we have preprocessed the truth file we can resolve the locations
        # doing this here improves the time for query preprocessing below
        reference_contigs = set(fastaContigLengths(args.ref).keys())

        if not args.locations:
            # default set of locations is the overlap between truth and reference
            args.locations = list(reference_contigs & set(h1["tabix"]["chromosomes"]))
            if not args.locations:
                raise Exception("Truth and reference have no chromosomes in common!")
        elif type(args.locations) is not list:
            args.locations = [args.locations]

        args.locations = sorted(args.locations)

        logging.info("Preprocessing query: %s" % args.vcf2)
        starttime = time.time()

        if args.pass_only:
            filtering = "*"
        else:
            filtering = args.filters_only

        qtf = tempfile.NamedTemporaryFile(delete=False,
                                          dir=args.scratch_prefix,
                                          prefix="query.pp",
                                          suffix=internal_format_suffix)
        qtf.close()
        tempfiles.append(qtf.name)
        tempfiles.append(qtf.name + ".csi")
        tempfiles.append(qtf.name + ".tbi")
        pre.preprocess(args.vcf2,
                       qtf.name,
                       args.ref,
                       str(",".join(args.locations)),
                       filtering,
                       args.fixchr,
                       args.regions_bedfile,
                       args.targets_bedfile,
                       args.preprocessing_leftshift,
                       args.preprocessing_decompose,
                       args.preprocessing_norm,
                       args.preprocess_window,
                       args.threads)

        args.vcf2 = qtf.name
        h2 = vcfextract.extractHeadersJSON(args.vcf2)

        elapsed = time.time() - starttime
        logging.info("preprocess for %s -- time taken %.2f" % (args.vcf2, elapsed))

        if not h1["tabix"]:
            raise Exception("Truth file is not indexed after preprocesing.")

        if not h2["tabix"]:
            raise Exception("Query file is not indexed after preprocessing.")

        for _xc in args.locations:
            if _xc not in h2["tabix"]["chromosomes"]:
                logging.warn("No calls for location %s in query!" % _xc)

        pool = getPool(args.threads)
        if args.threads > 1 and args.engine == "xcmp":
            logging.info("Running using %i parallel processes." % args.threads)

            # find balanced pieces
            # cap parallelism at 64 since otherwise bcftools concat below might run out
            # of file handles
            args.pieces = min(args.threads, 64)
            res = runParallel(pool, Haplo.blocksplit.blocksplitWrapper, args.locations, args)

            if None in res:
                raise Exception("One of the blocksplit processes failed.")

            tempfiles += res

            args.locations = []
            for f in res:
                with open(f) as fp:
                    for l in fp:
                        ll = l.strip().split("\t", 3)
                        if len(ll) < 3:
                            continue
                        xchr = ll[0]
                        start = int(ll[1]) + 1
                        end = int(ll[2])
                        args.locations.append("%s:%i-%i" % (xchr, start, end))

        # count variants before normalisation
        if "samples" not in h1 or not h1["samples"]:
            raise Exception("Cannot read sample names from truth VCF file")

        if "samples" not in h2 or not h2["samples"]:
            raise Exception("Cannot read sample names from query VCF file")

        tf = tempfile.NamedTemporaryFile(delete=False,
                                         dir=args.scratch_prefix,
                                         prefix="hap.py.result.",
                                         suffix=internal_format_suffix)
        tf.close()
        tempfiles.append(tf.name)
        tempfiles.append(tf.name + ".tbi")
        tempfiles.append(tf.name + ".csi")
        output_name = tf.name

        if args.engine == "xcmp":
            # do xcmp
            logging.info("Using xcmp for comparison")
            res = runParallel(pool, Haplo.xcmp.xcmpWrapper, args.locations, args)
            tempfiles += [x for x in res if x is not None]  # VCFs

            if None in res:
                raise Exception("One of the xcmp jobs failed.")

            if len(res) == 0:
                raise Exception("Input files/regions do not contain variants (0 haplotype blocks were processed).")

            # concatenate + index
            logging.info("Concatenating variants...")
            runme_list = [x for x in res if x is not None]
            if len(runme_list) == 0:
                raise Exception("No outputs to concatenate!")

            logging.info("Concatenating...")
            bcftools.concatenateParts(output_name, *runme_list)
            logging.info("Indexing...")
            bcftools.runBcftools("index", output_name)
            # passed to quantify
            args.type = "xcmp"
            # xcmp extracts whichever field we're using into the QQ info field
            args.roc = "IQQ"
        elif args.engine == "vcfeval":
            tempfiles += Haplo.vcfeval.runVCFEval(args.vcf1, args.vcf2, output_name, args)
            # passed to quantify
            args.type = "ga4gh"
        else:
            raise Exception("Unknown comparison engine: %s" % args.engine)

        args.in_vcf = [output_name]
        args.runner = "hap.py"
        qfy.quantify(args)

    finally:
        if args.delete_scratch:
            for x in tempfiles:
                try:
                    os.remove(x)
                except:
                    pass
        else:
            logging.info("Scratch files kept : %s" % (str(tempfiles)))
Esempio n. 4
0
def partialCredit(vcfname, outputname, reference, locations, threads=1, window=10000, leftshift=True, decompose=True):
    """ Partial-credit-process a VCF file according to our args """

    pool = getPool(int(threads))
    if threads > 1:
        logging.info("Partial credit processing uses %i parallel processes." % threads)

        if not locations:
            h = extractHeadersJSON(vcfname)
            if not h["tabix"]["chromosomes"]:
                logging.warn("Empty input or not tabix indexed")
                if outputname.endswith(".bcf"):
                    runBcftools("view", "-O", "b", "-o", outputname, vcfname)
                    runBcftools("index", outputname)
                else:
                    runBcftools("view", "-O", "z", "-o", outputname, vcfname)
                    runBcftools("index", "-t", outputname)
                # just return the same file
                return
            locations = h["tabix"]["chromosomes"]
        elif type(locations) is str or type(locations) is unicode:
            locations = locations.split(",")

        # use blocksplit to subdivide input
        res = runParallel(
            pool, blocksplitWrapper, locations, {"vcf": vcfname, "dist": window, "pieces": min(40, threads * 4)}
        )

        if None in res:
            raise Exception("One of the blocksplit processes failed.")

        locations = list(itertools.chain.from_iterable(res))
        if not len(locations):
            logging.warn("Blocksplit returned no blocks. This can happen when " "an input contains no valid variants.")
            locations = [""]
    else:
        locations = [""]

    res = []
    try:
        res = runParallel(
            pool,
            preprocessWrapper,
            itertools.izip(itertools.repeat(vcfname), locations),
            {
                "reference": reference,
                "decompose": decompose,
                "leftshift": leftshift,
                "bcf": outputname.endswith(".bcf"),
            },
        )

        if None in res:
            raise Exception("One of the preprocess jobs failed")
        if not res:
            raise Exception("No blocks were processed. List of locations: %s" % str(list(locations)))

        concatenateParts(outputname, *res)
        if outputname.endswith(".vcf.gz"):
            runBcftools("index", "-t", outputname)
        else:  # use bcf
            runBcftools("index", outputname)
    finally:
        for r in res:
            try:
                os.unlink(r)
            except:
                pass
            try:
                os.unlink(r + ".tbi")
            except:
                pass
            try:
                os.unlink(r + ".csi")
            except:
                pass