Python fastaContigLengths Examples

Programming Language: Python

Namespace/Package Name: Tools.fastasize

Method/Function: fastaContigLengths

Examples at hotexamples.com: 8

Python fastaContigLengths - 8 examples found. These are the top rated real world Python examples of Tools.fastasize.fastaContigLengths extracted from open source projects. You can rate examples to help us improve the quality of examples.

Example #1

Show file

File: hap.py Project: Jeffleecy/hap.py

def main():
    parser = argparse.ArgumentParser("Haplotype Comparison")

    # input
    parser.add_argument("-v",
                        "--version",
                        dest="version",
                        action="store_true",
                        help="Show version number and exit.")

    parser.add_argument("-r",
                        "--reference",
                        dest="ref",
                        default=None,
                        help="Specify a reference file.")

    # output
    parser.add_argument("-o",
                        "--report-prefix",
                        dest="reports_prefix",
                        default=None,
                        help="Filename prefix for report output.")
    parser.add_argument("--scratch-prefix",
                        dest="scratch_prefix",
                        default=None,
                        help="Directory for scratch files.")
    parser.add_argument("--keep-scratch",
                        dest="delete_scratch",
                        default=True,
                        action="store_false",
                        help="Filename prefix for scratch report output.")

    # add quantification args
    qfy.updateArgs(parser)

    # control preprocessing
    pre.updateArgs(parser)
    parser.add_argument(
        '--convert-gvcf-truth',
        dest='convert_gvcf_truth',
        action="store_true",
        default=False,
        help=
        'Convert the truth set from genome VCF format to a VCF before processing.'
    )
    parser.add_argument(
        '--convert-gvcf-query',
        dest='convert_gvcf_query',
        action="store_true",
        default=False,
        help=
        'Convert the query set from genome VCF format to a VCF before processing.'
    )
    parser.add_argument(
        "--preprocess-truth",
        dest="preprocessing_truth",
        action="store_true",
        default=False,
        help=
        "Preprocess truth file with same settings as query (default is to accept truth in original format)."
    )
    parser.add_argument(
        "--usefiltered-truth",
        dest="usefiltered_truth",
        action="store_true",
        default=False,
        help=
        "Use filtered variant calls in truth file (by default, only PASS calls in the truth file are used)"
    )
    parser.add_argument(
        "--preprocessing-window-size",
        dest="preprocess_window",
        default=10000,
        type=int,
        help=
        "Preprocessing window size (variants further apart than that size are not expected to interfere)."
    )
    parser.add_argument(
        "--adjust-conf-regions",
        dest="preprocessing_truth_confregions",
        action="store_true",
        default=True,
        help=
        "Adjust confident regions to include variant locations. Note this will only include variants "
        "that are included in the CONF regions already when viewing with bcftools; this option only "
        "makes sure insertions are padded correctly in the CONF regions (to capture these, both the "
        "base before and after must be contained in the bed file).")
    parser.add_argument("--no-adjust-conf-regions",
                        dest="preprocessing_truth_confregions",
                        action="store_false",
                        help="Do not adjust confident regions for insertions.")

    # detailed control of comparison
    parser.add_argument(
        "--unhappy",
        "--no-haplotype-comparison",
        dest="no_hc",
        action="store_true",
        default=False,
        help=
        "Disable haplotype comparison (only count direct GT matches as TP).")

    parser.add_argument(
        "-w",
        "--window-size",
        dest="window",
        default=50,
        type=int,
        help=
        "Minimum distance between variants such that they fall into the same superlocus."
    )

    # xcmp-specific stuff
    parser.add_argument(
        "--xcmp-enumeration-threshold",
        dest="max_enum",
        default=16768,
        type=int,
        help=
        "Enumeration threshold / maximum number of sequences to enumerate per block."
    )

    parser.add_argument(
        "--xcmp-expand-hapblocks",
        dest="hb_expand",
        default=30,
        type=int,
        help="Expand haplotype blocks by this many basepairs left and right.")
    parser.add_argument("--threads",
                        dest="threads",
                        default=multiprocessing.cpu_count(),
                        type=int,
                        help="Number of threads to use.")

    parser.add_argument(
        "--engine",
        dest="engine",
        default="xcmp",
        choices=["xcmp", "vcfeval", "scmp-somatic", "scmp-distance"],
        help="Comparison engine to use.")

    parser.add_argument(
        "--engine-vcfeval-path",
        dest="engine_vcfeval",
        required=False,
        default=Haplo.vcfeval.findVCFEval(),
        help="This parameter should give the path to the \"rtg\" executable. "
        "The default is %s" % Haplo.vcfeval.findVCFEval())

    parser.add_argument(
        "--engine-vcfeval-template",
        dest="engine_vcfeval_template",
        required=False,
        help=
        "Vcfeval needs the reference sequence formatted in its own file format "
        "(SDF -- run rtg format -o ref.SDF ref.fa). You can specify this here "
        "to save time when running hap.py with vcfeval. If no SDF folder is "
        "specified, hap.py will create a temporary one.")

    parser.add_argument(
        "--scmp-distance",
        dest="engine_scmp_distance",
        required=False,
        default=30,
        type=int,
        help=
        "For distance-based matching (vcfeval and scmp), this is the distance between variants to use."
    )

    parser.add_argument(
        "--lose-match-distance",
        dest="engine_scmp_distance",
        required=False,
        type=int,
        help=
        "For distance-based matching (vcfeval and scmp), this is the distance between variants to use."
    )

    if Tools.has_sge:
        parser.add_argument(
            "--force-interactive",
            dest="force_interactive",
            default=False,
            action="store_true",
            help=
            "Force running interactively (i.e. when JOB_ID is not in the environment)"
        )

    parser.add_argument("_vcfs", help="Two VCF files.", default=[], nargs="*")

    parser.add_argument(
        "--logfile",
        dest="logfile",
        default=None,
        help="Write logging information into file rather than to stderr")

    verbosity_options = parser.add_mutually_exclusive_group(required=False)

    verbosity_options.add_argument(
        "--verbose",
        dest="verbose",
        default=False,
        action="store_true",
        help="Raise logging level from warning to info.")

    verbosity_options.add_argument(
        "--quiet",
        dest="quiet",
        default=False,
        action="store_true",
        help="Set logging level to output errors only.")

    args, unknown_args = parser.parse_known_args()

    if not Tools.has_sge:
        args.force_interactive = True

    if args.verbose:
        loglevel = logging.INFO
    elif args.quiet:
        loglevel = logging.ERROR
    else:
        loglevel = logging.WARNING

    # reinitialize logging
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)
    logging.basicConfig(filename=args.logfile,
                        format='%(asctime)s %(levelname)-8s %(message)s',
                        level=loglevel)

    # remove some safe unknown args
    unknown_args = [
        x for x in unknown_args if x not in ["--force-interactive"]
    ]
    if len(sys.argv) < 2 or len(unknown_args) > 0:
        if unknown_args:
            logging.error("Unknown arguments specified : %s " %
                          str(unknown_args))
        parser.print_help()
        exit(1)

    print "Hap.py %s" % Tools.version
    if args.version:
        exit(0)

    if args.roc:
        args.write_vcf = True

    # sanity-check regions bed file (HAP-57)
    if args.regions_bedfile:
        logging.info("Checking input regions.")
        if bedOverlapCheck(args.regions_bedfile):
            raise Exception(
                "The regions bed file (specified using -R) has overlaps, this will not work with xcmp."
                " You can either use -T, or run the file through bedtools merge"
            )

    if args.fp_bedfile and not os.path.exists(args.fp_bedfile):
        raise Exception("FP/confident call region bed file does not exist.")

    if not args.force_interactive and "JOB_ID" not in os.environ:
        parser.print_help()
        raise Exception(
            "Please qsub me so I get approximately 1 GB of RAM per thread.")

    if not args.ref:
        args.ref = Tools.defaultReference()

    if not args.ref or not os.path.exists(args.ref):
        raise Exception("Please specify a valid reference path using -r.")

    if not args.reports_prefix:
        raise Exception("Please specify an output prefix using -o ")

    if not os.path.exists(os.path.dirname(os.path.abspath(
            args.reports_prefix))):
        raise Exception(
            "The output path does not exist. Please specify a valid output path and prefix using -o"
        )

    if os.path.basename(args.reports_prefix) == "" or os.path.isdir(
            args.reports_prefix):
        raise Exception(
            "The output path should specify a file name prefix. Please specify a valid output path "
            "and prefix using -o. For example, -o /tmp/test will create files named /tmp/test* ."
        )

    # noinspection PyProtectedMember
    if not args._vcfs or len(args._vcfs) != 2:
        raise Exception("Please specify exactly two input VCFs.")

    # noinspection PyProtectedMember
    args.vcf1 = args._vcfs[0]
    # noinspection PyProtectedMember
    args.vcf2 = args._vcfs[1]

    if not os.path.exists(args.vcf1):
        raise Exception("Input file %s does not exist." % args.vcf1)
    if not os.path.exists(args.vcf2):
        raise Exception("Input file %s does not exist." % args.vcf2)

    tempfiles = []

    # turn on allele conversion
    if (args.engine == "scmp-somatic" or args.engine == "scmp-distance") \
            and not args.somatic_allele_conversion:
        args.somatic_allele_conversion = True
        if args.engine == "scmp-distance":
            args.somatic_allele_conversion = "first"

    # somatic allele conversion should also switch off decomposition
    if args.somatic_allele_conversion and ("-D" not in sys.argv
                                           and "--decompose" not in sys.argv):
        args.preprocessing_decompose = False

    # xcmp/scmp support bcf; others don't
    if args.engine in ["xcmp", "scmp-somatic", "scmp-distance"] \
            and (args.bcf or (args.vcf1.endswith(".bcf") and args.vcf2.endswith(".bcf"))):
        internal_format_suffix = ".bcf"
    else:
        internal_format_suffix = ".vcf.gz"

    # write session info and args file
    session = sessionInfo()
    session["final_args"] = args.__dict__
    with open(args.reports_prefix + ".runinfo.json", "w") as sessionfile:
        json.dump(session, sessionfile)

    try:
        logging.info("Comparing %s and %s" % (args.vcf1, args.vcf2))

        logging.info("Preprocessing truth: %s" % args.vcf1)
        starttime = time.time()

        ttf = tempfile.NamedTemporaryFile(delete=False,
                                          dir=args.scratch_prefix,
                                          prefix="truth.pp",
                                          suffix=internal_format_suffix)
        ttf.close()

        if args.engine.endswith("somatic") and \
           args.preprocessing_truth and \
           (args.preprocessing_leftshift or args.preprocessing_norm or args.preprocessing_decompose):
            args.preprocessing_truth = False
            logging.info(
                "Turning off pre.py preprocessing for somatic comparisons")

        if args.preprocessing_truth:
            if args.filter_nonref:
                logging.info(
                    "Filtering out any variants genotyped as <NON_REF>")

        ## Only converting truth gvcf to vcf if both arguments are true
        convert_gvcf_truth = False
        if args.convert_gvcf_truth or args.convert_gvcf_to_vcf:
            logging.info("Converting genome VCF to VCF")
            convert_gvcf_truth = True

        tempfiles.append(ttf.name)
        tempfiles.append(ttf.name + ".csi")
        tempfiles.append(ttf.name + ".tbi")
        args.gender = pre.preprocess(
            args.vcf1,
            ttf.name,
            args.ref,
            args.locations,
            None if args.usefiltered_truth else "*",  # filters
            args.fixchr,
            args.regions_bedfile,
            args.targets_bedfile,
            args.preprocessing_leftshift
            if args.preprocessing_truth else False,
            args.preprocessing_decompose
            if args.preprocessing_truth else False,
            args.preprocessing_norm if args.preprocessing_truth else False,
            args.preprocess_window,
            args.threads,
            args.gender,
            args.somatic_allele_conversion,
            "TRUTH",
            filter_nonref=args.filter_nonref
            if args.preprocessing_truth else False,
            convert_gvcf_to_vcf=convert_gvcf_truth)

        args.vcf1 = ttf.name

        if args.fp_bedfile and args.preprocessing_truth_confregions:
            conf_temp = Haplo.gvcf2bed.gvcf2bed(args.vcf1, args.ref,
                                                args.fp_bedfile,
                                                args.scratch_prefix)
            tempfiles.append(conf_temp)
            args.strat_regions.append("CONF_VARS:" + conf_temp)

        h1 = vcfextract.extractHeadersJSON(args.vcf1)

        elapsed = time.time() - starttime
        logging.info("preprocess for %s -- time taken %.2f" %
                     (args.vcf1, elapsed))

        # once we have preprocessed the truth file we can resolve the locations
        # doing this here improves the time for query preprocessing below
        reference_contigs = set(fastaContigLengths(args.ref).keys())

        if not args.locations:
            # default set of locations is the overlap between truth and reference
            args.locations = list(reference_contigs
                                  & set(h1["tabix"]["chromosomes"]))
            if not args.locations:
                raise Exception(
                    "Truth and reference have no chromosomes in common!")
        elif type(args.locations) is not list:
            args.locations = args.locations.split(",")

        args.locations = sorted(args.locations)

        logging.info("Preprocessing query: %s" % args.vcf2)
        if args.filter_nonref:
            logging.info("Filtering out any variants genotyped as <NON_REF>")

        ## Only converting truth gvcf to vcf if both arguments are true
        convert_gvcf_query = False
        if args.convert_gvcf_query or args.convert_gvcf_to_vcf:
            logging.info("Converting genome VCF to VCF")
            convert_gvcf_query = True

        starttime = time.time()

        if args.pass_only:
            filtering = "*"
        else:
            filtering = args.filters_only

        qtf = tempfile.NamedTemporaryFile(delete=False,
                                          dir=args.scratch_prefix,
                                          prefix="query.pp",
                                          suffix=internal_format_suffix)
        qtf.close()
        tempfiles.append(qtf.name)
        tempfiles.append(qtf.name + ".csi")
        tempfiles.append(qtf.name + ".tbi")

        if args.engine.endswith("somatic") and \
           (args.preprocessing_leftshift or args.preprocessing_norm or args.preprocessing_decompose):
            args.preprocessing_leftshift = False
            args.preprocessing_norm = False
            args.preprocessing_decompose = False
            logging.info(
                "Turning off pre.py preprocessing (query) for somatic comparisons"
            )

        pre.preprocess(
            args.vcf2,
            qtf.name,
            args.ref,
            str(",".join(args.locations)),
            filtering,
            args.fixchr,
            args.regions_bedfile,
            args.targets_bedfile,
            args.preprocessing_leftshift,
            args.preprocessing_decompose,
            args.preprocessing_norm,
            args.preprocess_window,
            args.threads,
            args.gender,  # same gender as truth above
            args.somatic_allele_conversion,
            "QUERY",
            filter_nonref=args.filter_nonref,
            convert_gvcf_to_vcf=convert_gvcf_query)

        args.vcf2 = qtf.name
        h2 = vcfextract.extractHeadersJSON(args.vcf2)

        elapsed = time.time() - starttime
        logging.info("preprocess for %s -- time taken %.2f" %
                     (args.vcf2, elapsed))

        if not h1["tabix"]:
            raise Exception("Truth file is not indexed after preprocesing.")

        if not h2["tabix"]:
            raise Exception("Query file is not indexed after preprocessing.")

        for _xc in args.locations:
            if _xc not in h2["tabix"]["chromosomes"]:
                logging.warn("No calls for location %s in query!" % _xc)

        pool = getPool(args.threads)
        if args.threads > 1 and args.engine == "xcmp":
            logging.info("Running using %i parallel processes." % args.threads)

            # find balanced pieces
            # cap parallelism at 64 since otherwise bcftools concat below might run out
            # of file handles
            args.pieces = min(args.threads, 64)
            res = runParallel(pool, Haplo.blocksplit.blocksplitWrapper,
                              args.locations, args)

            if None in res:
                raise Exception("One of the blocksplit processes failed.")

            tempfiles += res

            args.locations = []
            for f in res:
                with open(f) as fp:
                    for l in fp:
                        ll = l.strip().split("\t", 3)
                        if len(ll) < 3:
                            continue
                        xchr = ll[0]
                        start = int(ll[1]) + 1
                        end = int(ll[2])
                        args.locations.append("%s:%i-%i" % (xchr, start, end))

        # count variants before normalisation
        if "samples" not in h1 or not h1["samples"]:
            raise Exception("Cannot read sample names from truth VCF file")

        if "samples" not in h2 or not h2["samples"]:
            raise Exception("Cannot read sample names from query VCF file")

        tf = tempfile.NamedTemporaryFile(delete=False,
                                         dir=args.scratch_prefix,
                                         prefix="hap.py.result.",
                                         suffix=internal_format_suffix)
        tf.close()
        tempfiles.append(tf.name)
        tempfiles.append(tf.name + ".tbi")
        tempfiles.append(tf.name + ".csi")
        output_name = tf.name

        if args.engine == "xcmp":
            # do xcmp
            logging.info("Using xcmp for comparison")
            res = runParallel(pool, Haplo.xcmp.xcmpWrapper, args.locations,
                              args)
            tempfiles += [x for x in res if x is not None]  # VCFs

            if None in res:
                raise Exception("One of the xcmp jobs failed.")

            if len(res) == 0:
                raise Exception(
                    "Input files/regions do not contain variants (0 haplotype blocks were processed)."
                )

            # concatenate + index
            logging.info("Concatenating variants...")
            runme_list = [x for x in res if x is not None]
            if len(runme_list) == 0:
                raise Exception("No outputs to concatenate!")

            logging.info("Concatenating...")
            bcftools.concatenateParts(output_name, *runme_list)
            logging.info("Indexing...")
            bcftools.runBcftools("index", output_name)
            # passed to quantify
            args.type = "xcmp"
            # xcmp extracts whichever field we're using into the QQ info field
            args.roc_header = args.roc
            args.roc = "IQQ"
        elif args.engine == "vcfeval":
            tempfiles += Haplo.vcfeval.runVCFEval(args.vcf1, args.vcf2,
                                                  output_name, args)
            # passed to quantify
            args.type = "ga4gh"
        elif args.engine.startswith("scmp"):
            tempfiles += Haplo.scmp.runSCmp(args.vcf1, args.vcf2, output_name,
                                            args)
            # passed to quantify
            args.type = "ga4gh"
        else:
            raise Exception("Unknown comparison engine: %s" % args.engine)

        if args.preserve_info and args.engine == "vcfeval":
            # if we use vcfeval we need to merge the INFO fields back in.
            tf = tempfile.NamedTemporaryFile(suffix=".txt", delete=False)
            tempfiles.append(tf)
            print >> tf, "TRUTH_IN"
            print >> tf, "QUERY_IN"
            tf.close()
            info_file = tempfile.NamedTemporaryFile(suffix=".vcf.gz",
                                                    delete=False)
            tempfiles.append(info_file.name)
            info_file.close()

            bcftools.runBcftools("merge", args.vcf1, args.vcf2,
                                 "--force-samples", "-m", "all", "|",
                                 "bcftools", "reheader", "-s", tf.name, "|",
                                 "bcftools", "view", "-o", info_file.name,
                                 "-O", "z")
            bcftools.runBcftools("index", info_file.name)

            merged_info_file = tempfile.NamedTemporaryFile(suffix=".vcf.gz",
                                                           delete=False)
            tempfiles.append(merged_info_file.name)
            merged_info_file.close()

            bcftools.runBcftools("merge", output_vcf, info_file.name, "-m",
                                 "all", "|", "bcftools", "view", "-s",
                                 "^TRUTH_IN,QUERY_IN", "-X", "-U", "-o",
                                 merged_info_file.name, "-O", "z")
            output_name = merged_info_file.name

        args.in_vcf = [output_name]
        args.runner = "hap.py"
        qfy.quantify(args)

    finally:
        if args.delete_scratch:
            for x in tempfiles:
                try:
                    os.remove(x)
                except:
                    pass
        else:
            logging.info("Scratch files kept : %s" % (str(tempfiles)))

Example #2

Show file

File: hap.py Project: pkrusche/hap.py

def main():
    parser = argparse.ArgumentParser("Haplotype Comparison")

    # input
    parser.add_argument("-v", "--version", dest="version", action="store_true",
                        help="Show version number and exit.")

    parser.add_argument("-r", "--reference", dest="ref", default=None, help="Specify a reference file.")

    # output
    parser.add_argument("-o", "--report-prefix", dest="reports_prefix",
                        default=None,
                        help="Filename prefix for report output.")
    parser.add_argument("--scratch-prefix", dest="scratch_prefix",
                        default=None,
                        help="Directory for scratch files.")
    parser.add_argument("--keep-scratch", dest="delete_scratch",
                        default=True, action="store_false",
                        help="Filename prefix for scratch report output.")


    # add quantification args
    qfy.updateArgs(parser)

    # control preprocessing
    pre.updateArgs(parser)
    parser.add_argument("--preprocess-truth", dest="preprocessing_truth", action="store_true", default=False,
                        help="Preprocess truth file with same settings as query (default is to accept truth in original format).")
    parser.add_argument("--usefiltered-truth", dest="usefiltered_truth", action="store_true", default=False,
                        help="Preprocess truth file with same settings as query (default is to accept truth in original format).")
    parser.add_argument("--preprocessing-window-size", dest="preprocess_window",
                        default=10000, type=int,
                        help="Preprocessing window size (variants further apart than that size are not expected to interfere).")

    # detailed control of comparison
    parser.add_argument("--unhappy", "--no-haplotype-comparison", dest="no_hc", action="store_true", default=False,
                        help="Disable haplotype comparison (only count direct GT matches as TP).")

    parser.add_argument("-w", "--window-size", dest="window",
                        default=50, type=int,
                        help="Minimum distance between variants such that they fall into the same superlocus.")

    # xcmp-specific stuff
    parser.add_argument("--xcmp-enumeration-threshold", dest="max_enum",
                        default=16768, type=int,
                        help="Enumeration threshold / maximum number of sequences to enumerate per block.")

    parser.add_argument("--xcmp-expand-hapblocks", dest="hb_expand",
                        default=30, type=int,
                        help="Expand haplotype blocks by this many basepairs left and right.")
    parser.add_argument("--threads", dest="threads",
                        default=multiprocessing.cpu_count(), type=int,
                        help="Number of threads to use.")

    parser.add_argument("--engine", dest="engine",
                        default="xcmp", choices=["xcmp", "vcfeval"],
                        help="Comparison engine to use.")

    parser.add_argument("--engine-vcfeval-path", dest="engine_vcfeval", required=False,
                        default=Haplo.vcfeval.findVCFEval(),
                        help="This parameter should give the path to the \"rtg\" executable. "
                             "The default is %s" % Haplo.vcfeval.findVCFEval())
    parser.add_argument("--engine-vcfeval-template", dest="engine_vcfeval_template", required=False,
                        help="Vcfeval needs the reference sequence formatted in its own file format "
                             "(SDF -- run rtg format -o ref.SDF ref.fa). You can specify this here "
                             "to save time when running hap.py with vcfeval. If no SDF folder is "
                             "specified, hap.py will create a temporary one.")

    if Tools.has_sge:
        parser.add_argument("--force-interactive", dest="force_interactive",
                            default=False, action="store_true",
                            help="Force running interactively (i.e. when JOB_ID is not in the environment)")

    parser.add_argument("_vcfs", help="Two VCF files.", default=[], nargs="*")

    parser.add_argument("--logfile", dest="logfile", default=None,
                        help="Write logging information into file rather than to stderr")

    verbosity_options = parser.add_mutually_exclusive_group(required=False)

    verbosity_options.add_argument("--verbose", dest="verbose", default=False, action="store_true",
                                   help="Raise logging level from warning to info.")

    verbosity_options.add_argument("--quiet", dest="quiet", default=False, action="store_true",
                                   help="Set logging level to output errors only.")

    args, unknown_args = parser.parse_known_args()

    if not Tools.has_sge:
        args.force_interactive = True

    if args.verbose:
        loglevel = logging.INFO
    elif args.quiet:
        loglevel = logging.ERROR
    else:
        loglevel = logging.WARNING

    # reinitialize logging
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)
    logging.basicConfig(filename=args.logfile,
                        format='%(asctime)s %(levelname)-8s %(message)s',
                        level=loglevel)

    # remove some safe unknown args
    unknown_args = [x for x in unknown_args if x not in ["--force-interactive"]]
    if len(sys.argv) < 2 or len(unknown_args) > 0:
        if unknown_args:
            logging.error("Unknown arguments specified : %s " % str(unknown_args))
        parser.print_help()
        exit(1)

    if args.version:
        print "Hap.py %s" % Tools.version
        exit(0)

    if args.roc:
        args.write_vcf = True

    # sanity-check regions bed file (HAP-57)
    if args.regions_bedfile:
        logging.info("Checking input regions.")
        if bedOverlapCheck(args.regions_bedfile):
            raise Exception("The regions bed file (specified using -R) has overlaps, this will not work with xcmp."
                            " You can either use -T, or run the file through bedtools merge")

    if args.fp_bedfile and not os.path.exists(args.fp_bedfile):
        raise Exception("FP/confident call region bed file does not exist.")

    if not args.force_interactive and "JOB_ID" not in os.environ:
        parser.print_help()
        raise Exception("Please qsub me so I get approximately 1 GB of RAM per thread.")

    if not args.ref:
        args.ref = Tools.defaultReference()

    if not os.path.exists(args.ref):
        raise Exception("Please specify a valid reference path using -r.")

    if not args.reports_prefix:
        raise Exception("Please specify an output prefix using -o ")

    if not os.path.exists(os.path.dirname(os.path.abspath(args.reports_prefix))):
        raise Exception("The output path does not exist. Please specify a valid output path and prefix using -o")

    if os.path.basename(args.reports_prefix) == "" or os.path.isdir(args.reports_prefix):
        raise Exception("The output path should specify a file name prefix. Please specify a valid output path "
                        "and prefix using -o. For example, -o /tmp/test will create files named /tmp/test* .")

    # noinspection PyProtectedMember
    if not args._vcfs or len(args._vcfs) != 2:
        raise Exception("Please specify exactly two input VCFs.")

    # noinspection PyProtectedMember
    args.vcf1 = args._vcfs[0]
    # noinspection PyProtectedMember
    args.vcf2 = args._vcfs[1]

    if not os.path.exists(args.vcf1):
        raise Exception("Input file %s does not exist." % args.vcf1)
    if not os.path.exists(args.vcf2):
        raise Exception("Input file %s does not exist." % args.vcf2)

    tempfiles = []

    # xcmp supports bcf; others don't
    if args.engine == "xcmp" and (args.bcf or (args.vcf1.endswith(".bcf") and args.vcf2.endswith(".bcf"))):
        internal_format_suffix = ".bcf"
    else:
        internal_format_suffix = ".vcf.gz"

    try:
        logging.info("Comparing %s and %s" % (args.vcf1, args.vcf2))

        logging.info("Preprocessing truth: %s" % args.vcf1)
        starttime = time.time()

        ttf = tempfile.NamedTemporaryFile(delete=False,
                                          dir=args.scratch_prefix,
                                          prefix="truth.pp",
                                          suffix=internal_format_suffix)
        ttf.close()
        tempfiles.append(ttf.name)
        tempfiles.append(ttf.name + ".csi")
        tempfiles.append(ttf.name + ".tbi")
        pre.preprocess(args.vcf1,
                       ttf.name,
                       args.ref,
                       args.locations,
                       None if args.usefiltered_truth else "*",  # filters
                       args.fixchr,
                       args.regions_bedfile,
                       args.targets_bedfile,
                       args.preprocessing_leftshift if args.preprocessing_truth else False,
                       args.preprocessing_decompose if args.preprocessing_truth else False,
                       args.preprocessing_norm if args.preprocessing_truth else False,
                       args.preprocess_window,
                       args.threads)

        args.vcf1 = ttf.name
        h1 = vcfextract.extractHeadersJSON(args.vcf1)

        elapsed = time.time() - starttime
        logging.info("preprocess for %s -- time taken %.2f" % (args.vcf1, elapsed))

        # once we have preprocessed the truth file we can resolve the locations
        # doing this here improves the time for query preprocessing below
        reference_contigs = set(fastaContigLengths(args.ref).keys())

        if not args.locations:
            # default set of locations is the overlap between truth and reference
            args.locations = list(reference_contigs & set(h1["tabix"]["chromosomes"]))
            if not args.locations:
                raise Exception("Truth and reference have no chromosomes in common!")
        elif type(args.locations) is not list:
            args.locations = [args.locations]

        args.locations = sorted(args.locations)

        logging.info("Preprocessing query: %s" % args.vcf2)
        starttime = time.time()

        if args.pass_only:
            filtering = "*"
        else:
            filtering = args.filters_only

        qtf = tempfile.NamedTemporaryFile(delete=False,
                                          dir=args.scratch_prefix,
                                          prefix="query.pp",
                                          suffix=internal_format_suffix)
        qtf.close()
        tempfiles.append(qtf.name)
        tempfiles.append(qtf.name + ".csi")
        tempfiles.append(qtf.name + ".tbi")
        pre.preprocess(args.vcf2,
                       qtf.name,
                       args.ref,
                       str(",".join(args.locations)),
                       filtering,
                       args.fixchr,
                       args.regions_bedfile,
                       args.targets_bedfile,
                       args.preprocessing_leftshift,
                       args.preprocessing_decompose,
                       args.preprocessing_norm,
                       args.preprocess_window,
                       args.threads)

        args.vcf2 = qtf.name
        h2 = vcfextract.extractHeadersJSON(args.vcf2)

        elapsed = time.time() - starttime
        logging.info("preprocess for %s -- time taken %.2f" % (args.vcf2, elapsed))

        if not h1["tabix"]:
            raise Exception("Truth file is not indexed after preprocesing.")

        if not h2["tabix"]:
            raise Exception("Query file is not indexed after preprocessing.")

        for _xc in args.locations:
            if _xc not in h2["tabix"]["chromosomes"]:
                logging.warn("No calls for location %s in query!" % _xc)

        pool = getPool(args.threads)
        if args.threads > 1 and args.engine == "xcmp":
            logging.info("Running using %i parallel processes." % args.threads)

            # find balanced pieces
            # cap parallelism at 64 since otherwise bcftools concat below might run out
            # of file handles
            args.pieces = min(args.threads, 64)
            res = runParallel(pool, Haplo.blocksplit.blocksplitWrapper, args.locations, args)

            if None in res:
                raise Exception("One of the blocksplit processes failed.")

            tempfiles += res

            args.locations = []
            for f in res:
                with open(f) as fp:
                    for l in fp:
                        ll = l.strip().split("\t", 3)
                        if len(ll) < 3:
                            continue
                        xchr = ll[0]
                        start = int(ll[1]) + 1
                        end = int(ll[2])
                        args.locations.append("%s:%i-%i" % (xchr, start, end))

        # count variants before normalisation
        if "samples" not in h1 or not h1["samples"]:
            raise Exception("Cannot read sample names from truth VCF file")

        if "samples" not in h2 or not h2["samples"]:
            raise Exception("Cannot read sample names from query VCF file")

        tf = tempfile.NamedTemporaryFile(delete=False,
                                         dir=args.scratch_prefix,
                                         prefix="hap.py.result.",
                                         suffix=internal_format_suffix)
        tf.close()
        tempfiles.append(tf.name)
        tempfiles.append(tf.name + ".tbi")
        tempfiles.append(tf.name + ".csi")
        output_name = tf.name

        if args.engine == "xcmp":
            # do xcmp
            logging.info("Using xcmp for comparison")
            res = runParallel(pool, Haplo.xcmp.xcmpWrapper, args.locations, args)
            tempfiles += [x for x in res if x is not None]  # VCFs

            if None in res:
                raise Exception("One of the xcmp jobs failed.")

            if len(res) == 0:
                raise Exception("Input files/regions do not contain variants (0 haplotype blocks were processed).")

            # concatenate + index
            logging.info("Concatenating variants...")
            runme_list = [x for x in res if x is not None]
            if len(runme_list) == 0:
                raise Exception("No outputs to concatenate!")

            logging.info("Concatenating...")
            bcftools.concatenateParts(output_name, *runme_list)
            logging.info("Indexing...")
            bcftools.runBcftools("index", output_name)
            # passed to quantify
            args.type = "xcmp"
            # xcmp extracts whichever field we're using into the QQ info field
            args.roc = "IQQ"
        elif args.engine == "vcfeval":
            tempfiles += Haplo.vcfeval.runVCFEval(args.vcf1, args.vcf2, output_name, args)
            # passed to quantify
            args.type = "ga4gh"
        else:
            raise Exception("Unknown comparison engine: %s" % args.engine)

        args.in_vcf = [output_name]
        args.runner = "hap.py"
        qfy.quantify(args)

    finally:
        if args.delete_scratch:
            for x in tempfiles:
                try:
                    os.remove(x)
                except:
                    pass
        else:
            logging.info("Scratch files kept : %s" % (str(tempfiles)))

Example #3

Show file

def main():
    parser = argparse.ArgumentParser("Somatic Comparison")

    parser.add_argument("truth", help="Truth VCF file")
    parser.add_argument("query", help="Query VCF file")

    parser.add_argument("-o", "--output", dest="output", required=True,
                        help="Output file prefix for statistics and feature table (when selected)")

    parser.add_argument("-l", "--location", dest="location", default="",
                        help="Location for bcftools view (e.g. chr1)")

    parser.add_argument("-R", "--restrict-regions", dest="regions_bedfile",
                        default=None, type=str,
                        help="Restrict analysis to given (sparse) regions (using -R in bcftools).")

    parser.add_argument("-T", "--target-regions", dest="targets_bedfile",
                        default=None, type=str,
                        help="Restrict analysis to given (dense) regions (using -T in bcftools).")

    parser.add_argument("-f", "--false-positives", dest="FP",
                        help="False-positive region bed file to distinguish UNK from FP")

    parser.add_argument("-a", "--ambiguous", dest="ambi", action='append',
                        help="Ambiguous region bed file(s) to distinguish from FP (e.g. variant only observed "
                             "in some replicates)")

    parser.add_argument("--ambi-fp", dest="ambi_fp", action='store_true', default=False,
                        help="Use FP calls from ambiguous region files also.")

    parser.add_argument("--no-ambi-fp", dest="ambi_fp", action='store_false',
                        help="Do not use FP calls from ambiguous region files also.")

    parser.add_argument("--count-unk", dest="count_unk", action='store_true', default=False,
                        help="Assume the truth set covers the whole genome and only count FPs in regions "
                             "specified by the truth VCF or ambiguous/false-positive regions.")

    parser.add_argument("--no-count-unk", dest="count_unk", action='store_false',
                        help="Do not use FP calls from ambiguous region files also.")

    parser.add_argument("-e", "--explain_ambiguous", dest="explain_ambiguous", required=False,
                        default=False, action="store_true",
                        help="print a table giving the number of ambiguous events per category")

    parser.add_argument("-r", "--reference", dest="ref", default=Tools.defaultReference(),
                        help="Specify a reference file.")

    parser.add_argument("--scratch-prefix", dest="scratch_prefix",
                        default=None,
                        help="Filename prefix for scratch report output.")

    parser.add_argument("--keep-scratch", dest="delete_scratch",
                        default=True, action="store_false",
                        help="Filename prefix for scratch report output.")

    parser.add_argument("--continue", dest="cont", default=False, action="store_true",
                        help="Continue from scratch space (i.e. use VCFs in there if they already exist).")

    parser.add_argument("-P", "--include-nonpass", dest="inc_nonpass", action="store_true", default=False,
                        help="Use to include failing variants in comparison.")

    parser.add_argument("--feature-table", dest="features", default=False, choices=Somatic.FeatureSet.sets.keys(),
                        help="Select a feature table to output.")

    parser.add_argument("--bam", dest="bams", default=[], action="append",
                        help="pass one or more BAM files for feature table extraction")

    parser.add_argument("--normalize-truth", dest="normalize_truth", default=False, action="store_true",
                        help="Enable running of bcftools norm on the truth file.")

    parser.add_argument("--normalize-query", dest="normalize_query", default=False, action="store_true",
                        help="Enable running of bcftools norm on the query file.")

    parser.add_argument("-N", "--normalize-all", dest="normalize_all", default=False, action="store_true",
                        help="Enable running of bcftools norm on both truth and query file.")

    parser.add_argument("--fixchr-truth", dest="fixchr_truth", action="store_true", default=True,
                        help="Add chr prefix to truth file (default: true).")

    parser.add_argument("--fixchr-query", dest="fixchr_query", action="store_true", default=True,
                        help="Add chr prefix to query file (default: true).")

    parser.add_argument("--fix-chr-truth", dest="fixchr_truth", action="store_true", default=None,
                        help="Same as --fixchr-truth.")

    parser.add_argument("--fix-chr-query", dest="fixchr_query", action="store_true", default=None,
                        help="Same as --fixchr-query.")

    parser.add_argument("--no-fixchr-truth", dest="fixchr_truth", action="store_false", default=False,
                        help="Disable chr replacement for truth (default: false).")

    parser.add_argument("--no-fixchr-query", dest="fixchr_query", action="store_false", default=False,
                        help="Add chr prefix to query file (default: false).")

    parser.add_argument("--no-order-check", dest="disable_order_check", default=False, action="store_true",
                        help="Disable checking the order of TP features (dev feature).")

    parser.add_argument("--roc", dest="roc", default=None, choices=ROC.list(),
                        help="Create a ROC-style table. This is caller specific "
                             " - this will override the --feature-table switch!")

    parser.add_argument("--bin-afs", dest="af_strat", default=None, action="store_true",
                        help="Stratify into different AF buckets. This needs to have features available"
                             "for getting the AF both in truth and query variants.")
    parser.add_argument("--af-binsize", dest="af_strat_binsize", default=0.2,
                        help="Bin size for AF binning (should be < 1). Multiple bin sizes can be specified using a comma, "
                             "e.g. 0.1,0.2,0.5,0.2 will split at 0.1, 0.3, 0.8 and 1.0.")
    parser.add_argument("--af-truth", dest="af_strat_truth", default="I.T_ALT_RATE",
                        help="Feature name to use for retrieving AF for truth variants (TP and FN)")
    parser.add_argument("--af-query", dest="af_strat_query", default="T_AF",
                        help="Feature name to use for retrieving AF for query variants (FP/UNK/AMBI)")

    parser.add_argument("-FN", "--count-filtered-fn", dest="count_filtered_fn", action="store_true",
                        help="Count filtered vs. absent FN numbers. This requires the -P switch (to use all "
                             "variants) and either the --feature-table or --roc switch.")

    parser.add_argument("--fp-region-size", dest="fpr_size",
                        help="How to obtain the normalisation constant for FP rate. By default, this will use the FP region bed file size when using"
                             " --count-unk and the size of all reference contigs that overlap with the location specified in -l otherwise."
                             " This can be overridden with: 1) a number of nucleotides, or 2) \"auto\" to use the lengths of all contigs that have calls."
                             " The resulting value is used as fp.region.size.")

    parser.add_argument("--ci-level", dest="ci_level", default=0.95, type = float,
                        help="Confidence level for precision/recall confidence intervals (default: 0.95)")

    parser.add_argument("--logfile", dest="logfile", default=None,
                        help="Write logging information into file rather than to stderr")

    verbosity_options = parser.add_mutually_exclusive_group(required=False)

    verbosity_options.add_argument("--verbose", dest="verbose", default=False, action="store_true",
                                   help="Raise logging level from warning to info.")

    verbosity_options.add_argument("--quiet", dest="quiet", default=False, action="store_true",
                                   help="Set logging level to output errors only.")

    args = parser.parse_args()

    if args.verbose:
        loglevel = logging.INFO
    elif args.quiet:
        loglevel = logging.ERROR
    else:
        loglevel = logging.WARNING

    try:
        if type(args.af_strat_binsize) is str:
            args.af_strat_binsize = map(float, args.af_strat_binsize.split(","))
        else:
            args.af_strat_binsize = map(float, [args.af_strat_binsize])

        if not args.af_strat_binsize:
            raise Exception("Bin size list is empty")
    except:
        logging.error("Failed to parse stratification bin size: %s" % str(args.af_strat_binsize))
        exit(1)

    # reinitialize logging
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)
    logging.basicConfig(filename=args.logfile,
                        format='%(asctime)s %(levelname)-8s %(message)s',
                        level=loglevel)

    if args.normalize_all:
        args.normalize_truth = True
        args.normalize_query = True

    if args.roc:
        args.roc = ROC.make(args.roc)
        args.features = args.roc.ftname
        if not args.inc_nonpass:
            logging.warn("When creating ROCs without the -P switch, the ROC data points will only "
                         "include filtered variants (i.e. they will normally end at the caller's "
                         "quality threshold).")

    if not (args.ci_level > 0.0 and args.ci_level < 1.0):
        raise Exception("Confidence interval level must be > 0.0 and < 1.0.")

    if args.af_strat and not args.features:
        raise Exception("To stratify by AFs, a feature table must be selected -- use this switch together "
                        "with --feature-table or --roc")

    if args.count_filtered_fn and (not args.inc_nonpass or not args.features):
        raise Exception("Counting filtered / unfiltered FNs only works when a feature table is selected, "
                        "and when using unfiltered variants. Specify -P --feature-table <...> or use "
                        "--roc to select a ROC type.")

    if args.scratch_prefix:
        scratch = os.path.abspath(args.scratch_prefix)
        args.delete_scratch = False
        Tools.mkdir_p(scratch)
    else:
        scratch = tempfile.mkdtemp()

    logging.info("Scratch path is %s" % scratch)
    try:
        bams = []
        md = None
        for x in args.bams:
            bams.append(bamStats(x))

        if bams:
            bres = pandas.concat(bams).groupby("CHROM").mean()

            md = {}

            for x in bres.index:
                logging.info("Mean coverage on %s is %f" % (x, bres.loc[x]["COVERAGE"]))
                md[x] = float(bres.loc[x]["COVERAGE"]) * 3.0

        logging.info("Normalizing/reading inputs")

        ntpath = os.path.join(scratch, "normalized_truth.vcf.gz")

        if not (args.cont and os.path.exists(ntpath)):
            preprocessVCF(args.truth, ntpath, args.location,
                          True,  # pass_only
                          args.fixchr_truth,  # chrprefix
                          args.normalize_truth,  # norm,
                          args.regions_bedfile,
                          args.targets_bedfile,
                          args.ref)
        else:
            logging.info("Continuing from %s" % ntpath)

        if not (args.cont and os.path.exists(ntpath + ".csi")):
            runBcftools("index", ntpath)

        nqpath = os.path.join(scratch, "normalized_query.vcf.gz")

        if not (args.cont and os.path.exists(nqpath)):
            preprocessVCF(args.query, nqpath, args.location,
                          not args.inc_nonpass,  # pass_only
                          args.fixchr_query,  # chrprefix
                          args.normalize_query,  # norm,
                          args.regions_bedfile,
                          args.targets_bedfile,
                          args.ref)
        else:
            logging.info("Continuing from %s" % nqpath)

        if not (args.cont and os.path.exists(nqpath + ".csi")):
            runBcftools("index", nqpath)

        logging.info("Intersecting")

        tpfn_files = all([os.path.exists(os.path.join(scratch, "tpfn", "0000.vcf.gz")),
                          os.path.exists(os.path.join(scratch, "tpfn", "0001.vcf.gz")),
                          os.path.exists(os.path.join(scratch, "tpfn", "0002.vcf.gz"))])

        tpfn_r_files = all([os.path.exists(os.path.join(scratch, "tpfn", "0000.vcf.gz")),
                            os.path.exists(os.path.join(scratch, "tpfn", "0001.vcf.gz")),
                            os.path.exists(os.path.join(scratch, "tpfn", "0002.vcf.gz"))])

        if not (args.cont and tpfn_files):
            runBcftools("isec", ntpath, nqpath, "-p", os.path.join(scratch, "tpfn"), "-O", "z")
        else:
            logging.info("Continuing from %s" % os.path.join(scratch, "tpfn"))

        if args.features and not (args.cont and tpfn_r_files):
            # only need to do this for getting the feature table
            runBcftools("isec", nqpath, ntpath, "-p", os.path.join(scratch, "tpfn_r"), "-O", "z")

        logging.info("Getting FPs / Ambi / Unk")

        fppath = os.path.join(scratch, "fp.vcf.gz")
        unkpath = os.path.join(scratch, "unk.vcf.gz")
        ambipath = os.path.join(scratch, "ambi.vcf.gz")

        # get header to print to unk and ambi VCFs
        rununiquepath = os.path.join(scratch, "tpfn", "0001.vcf.gz")
        header = runBcftools("view", rununiquepath, "--header-only")

        fp = Tools.BGZipFile(fppath, True)
        fp.write(header)

        unk = Tools.BGZipFile(unkpath, True)
        unk.write(header)

        ambi = Tools.BGZipFile(ambipath, True)
        ambi.write(header)

        ambiClasses = Counter()
        ambiReasons = Counter()

        fpclasses = BedIntervalTree()
        if args.ambi:
            # can have multiple ambiguous BED files
            for aBED in args.ambi:
                # auto-label from first value after chr start end
                # new ambi files have the label in position 4
                # old ones will look weird here.
                fpclasses.addFromBed(aBED, lambda xe: xe[4], args.fixchr_truth)

        if args.FP:
            fpclasses.addFromBed(args.FP, "FP", args.fixchr_truth)

        # split VCF into FP, UNK and AMBI
        toProcess = gzip.open(rununiquepath, "rb")
        for entry in toProcess:
            if entry[0] == '#':
                continue

            fields = entry.strip().split("\t")
            chrom = fields[0]
            start = int(fields[1])
            stop = int(fields[1]) + len(fields[3])

            overlap = fpclasses.intersect(chrom, start, stop)

            is_fp = False
            is_ambi = False

            classes_this_pos = set()

            for o in overlap:
                reason = o.value[0]
                if reason == "fp" and args.ambi_fp:
                    reason = "FP"
                elif reason == "fp":
                    reason = "ambi-fp"
                elif reason == "unk":
                    reason = "ambi-unk"

                classes_this_pos.add(reason)
                try:
                    ambiReasons["%s: rep. count %s" % (reason, o.value[1])] += 1
                except IndexError:
                    ambiReasons["%s: rep. count *" % reason] += 1
                for x in o.value[3:]:
                    ambiReasons["%s: %s" % (reason, x)] += 1
                if reason == "FP":
                    is_fp = True
                else:
                    is_ambi = True

            for reason in classes_this_pos:
                ambiClasses[reason] += 1

            if is_fp:
                fp.write(entry)
            elif is_ambi:
                ambi.write(entry)
            elif not args.count_unk:
                # when we don't have FP regions, unk stuff becomes FP
                fp.write(entry)
            else:
                unk.write(entry)

        toProcess.close()

        # since 0001.vcf.gz should already be sorted, we can just convert to bgzipped vcf
        # and create index
        fp.close()
        ambi.close()
        unk.close()

        runBcftools("index", "--tbi", fppath)
        runBcftools("index", "--tbi", unkpath)
        runBcftools("index", "--tbi", ambipath)

        logging.info("Counting variants...")

        truthcounts = parseStats(runBcftools("stats", ntpath), "total.truth")
        querycounts = parseStats(runBcftools("stats", nqpath), "total.query")

        tpcounts = parseStats(runBcftools("stats", os.path.join(scratch, "tpfn", "0002.vcf.gz")), "tp")
        fncounts = parseStats(runBcftools("stats", os.path.join(scratch, "tpfn", "0000.vcf.gz")), "fn")
        fpcounts = parseStats(runBcftools("stats", fppath), "fp")
        ambicounts = parseStats(runBcftools("stats", ambipath), "ambi")
        unkcounts = parseStats(runBcftools("stats", unkpath), "unk")

        res = pandas.merge(truthcounts, querycounts, on="type")
        res = pandas.merge(res, tpcounts, on="type")
        res = pandas.merge(res, fpcounts, on="type")
        res = pandas.merge(res, fncounts, on="type")
        res = pandas.merge(res, unkcounts, on="type")
        res = pandas.merge(res, ambicounts, on="type")

        # no explicit guarantee that total.query is equal to unk + ambi + fp + tp
        # testSum = res["fp"] + res["tp"] + res["unk"] + res["ambi"]

        # filter and relabel
        res = res[res["type"] != "samples"]
        res = res[res["type"] != "multiallelic SNP sites"]
        res = res[res["type"] != "multiallelic sites"]
        res.loc[res["type"] == "SNPs", "type"] = "SNVs"

        metrics_output = makeMetricsObject("som.py.comparison")

        if args.ambi and args.explain_ambiguous:
            ac = list(ambiClasses.iteritems())
            if ac:
                ambie = pandas.DataFrame(ac, columns=["class", "count"])
                ambie.sort(["class"], inplace=True)
                pandas.set_option("display.max_rows", 1000)
                pandas.set_option("display.max_columns", 1000)
                pandas.set_option("display.width", 1000)
                pandas.set_option("display.height", 1100)
                logging.info("FP/ambiguity classes with info (multiple classes can "
                             "overlap):\n" + ambie.to_string(index=False))
                # in default mode, print result summary to stdout
                if not args.quiet and not args.verbose:
                    print "FP/ambiguity classes with info (multiple classes can " \
                          "overlap):\n" + ambie.to_string(index=False)
                ambie.to_csv(args.output + ".ambiclasses.csv")
                metrics_output["metrics"].append(dataframeToMetricsTable("ambiclasses", ambie))
            else:
                logging.info("No ambiguous variants.")

            ar = list(ambiReasons.iteritems())
            if ar:
                ambie = pandas.DataFrame(ar, columns=["reason", "count"])
                ambie.sort(["reason"], inplace=True)
                pandas.set_option("display.max_rows", 1000)
                pandas.set_option("display.max_columns", 1000)
                pandas.set_option("display.width", 1000)
                pandas.set_option("display.height", 1100)
                logging.info("Reasons for defining as ambiguous (multiple reasons can overlap):\n" + ambie.to_string(
                    formatters={'reason': '{{:<{}s}}'.format(ambie['reason'].str.len().max()).format}, index=False))
                # in default mode, print result summary to stdout
                if not args.quiet and not args.verbose:
                    print "Reasons for defining as ambiguous (multiple reasons can overlap):\n" + ambie.to_string(
                        formatters={'reason': '{{:<{}s}}'.format(ambie['reason'].str.len().max()).format}, index=False)
                ambie.to_csv(args.output + ".ambireasons.csv")
                metrics_output["metrics"].append(dataframeToMetricsTable("ambireasons", ambie))
            else:
                logging.info("No ambiguous variants.")

        if args.features:
            logging.info("Extracting features...")
            fset = Somatic.FeatureSet.make(args.features)
            fset.setChrDepths(md)

            logging.info("Collecting TP info (1)...")
            tps = fset.collect(os.path.join(scratch, "tpfn", "0002.vcf.gz"), "TP")

            # TP_r is a hint for fset, they are both TPs
            logging.info("Collecting TP info (2)...")
            tps2 = fset.collect(os.path.join(scratch, "tpfn_r", "0002.vcf.gz"), "TP_r")

            # this is slow because it tries to sort
            # ... which we don't need to do since tps1 and tps2 have the same ordering

            logging.info("Sorting...")
            tps.sort(["CHROM", "POS"], inplace=True)
            tps2.sort(["CHROM", "POS"], inplace=True)
            tps = tps.reset_index(drop=True)
            tps2 = tps2.reset_index(drop=True)

            logging.info("Merging TP info...")
            columns_tps = list(tps)
            columns_tps2 = list(tps2)

            len1 = tps.shape[0]
            len2 = tps2.shape[0]

            if len1 != len2:
                raise Exception("Cannot read TP features, lists have different lengths : %i != %i" % (len1, len2))

            if not args.disable_order_check:
                logging.info("Checking order %i / %i" % (len1, len2))

                for x in xrange(0, len1):
                    for a in ["CHROM", "POS"]:
                        if tps.loc[x][a] != tps2.loc[x][a]:
                            raise Exception("Cannot merge TP features, inputs are out of order at %s / %s" % (
                                str(tps[x:x + 1]), str(tps2[x:x + 1])))

            logging.info("Merging...")

            cdata = {
                "CHROM": tps["CHROM"],
                "POS": tps["POS"],
                "tag": tps["tag"]
            }

            tpc = pandas.DataFrame(cdata, columns=["CHROM", "POS", "tag"])

            all_columns = list(set(columns_tps + columns_tps2))
            for a in all_columns:
                if a in columns_tps and a not in columns_tps2:
                    tpc[a] = tps[a]
                elif a not in columns_tps and a in columns_tps2:
                    tpc[a] = tps2[a]
                elif a not in ["CHROM", "POS", "tag"]:
                    tpc[a] = tps2[a]
                    tpc[a + ".truth"] = tps[a]

            logging.info("Collecting FP info...")
            fps = fset.collect(fppath, "FP")
            ambs = fset.collect(ambipath, "AMBI")

            logging.info("Collecting FN info...")
            fns = fset.collect(os.path.join(scratch, "tpfn", "0000.vcf.gz"), "FN")

            renamed = {}
            tp_cols = list(tpc)
            for col in list(fns):
                if col + ".truth" in tp_cols:
                    renamed[col] = col + ".truth"
            fns.rename(columns=renamed, inplace=True)

            featurelist = [tpc, fps, fns, ambs]

            if unkpath is not None:
                logging.info("Collecting UNK info...")
                unk = fset.collect(unkpath, "UNK")
                featurelist.append(unk)

            logging.info("Making feature table...")
            featuretable = pandas.concat(featurelist)

            # reorder to make more legible
            first_columns = ["CHROM", "POS", "tag"]
            # noinspection PyTypeChecker
            all_columns = list(featuretable)

            if "REF" in all_columns:
                first_columns.append("REF")

            if "REF.truth" in all_columns:
                first_columns.append("REF.truth")

            if "ALT" in all_columns:
                first_columns.append("ALT")

            if "ALT.truth" in all_columns:
                first_columns.append("ALT.truth")

            ordered_columns = first_columns + sorted([x for x in all_columns if x not in first_columns])
            featuretable = featuretable[ordered_columns]
            # make sure positions are integers
            featuretable["POS"] = featuretable["POS"].astype(int)

            logging.info("Saving feature table...")
            featuretable.to_csv(args.output + ".features.csv", float_format='%.8f')

            if args.roc is not None:
                roc_table = args.roc.from_table(featuretable)
                roc_table.to_csv(args.output + ".roc.csv", float_format='%.8f')

            featuretable["FILTER"].fillna("", inplace=True)
            featuretable.ix[featuretable["REF"].str.len() < 1, "absent"] = True
            featuretable.ix[featuretable["tag"] == "FN", "REF"] = featuretable.ix[featuretable["tag"] == "FN",
                                                                                  "REF.truth"]
            featuretable.ix[featuretable["tag"] == "FN", "ALT"] = featuretable.ix[featuretable["tag"] == "FN",
                                                                                  "ALT.truth"]
            af_t_feature = args.af_strat_truth
            af_q_feature = args.af_strat_query
            for vtype in ["records", "SNVs", "indels"]:
                if vtype == "SNVs":
                    featuretable_this_type = featuretable[(featuretable["REF"].str.len() > 0) &
                                                          (featuretable["ALT"].str.len() ==
                                                           featuretable["REF"].str.len())]
                elif vtype == "indels":
                    featuretable_this_type = featuretable[(featuretable["REF"].str.len() != 1) |
                                                          (featuretable["ALT"].str.len() != 1)]
                else:
                    featuretable_this_type = featuretable

                if args.count_filtered_fn:
                    res.ix[res["type"] == vtype, "fp.filtered"] = featuretable_this_type[
                        (featuretable_this_type["tag"] == "FP") & (featuretable_this_type["FILTER"] != "")].shape[0]
                    res.ix[res["type"] == vtype, "tp.filtered"] = featuretable_this_type[
                        (featuretable_this_type["tag"] == "TP") & (featuretable_this_type["FILTER"] != "")].shape[0]
                    res.ix[res["type"] == vtype, "unk.filtered"] = featuretable_this_type[
                        (featuretable_this_type["tag"] == "UNK") & (featuretable_this_type["FILTER"] != "")].shape[0]
                    res.ix[res["type"] == vtype, "ambi.filtered"] = featuretable_this_type[
                        (featuretable_this_type["tag"] == "AMBI") & (featuretable_this_type["FILTER"] != "")].shape[0]

                if args.af_strat:
                    start = 0.0
                    current_binsize = args.af_strat_binsize[0]
                    next_binsize = 0
                    while start < 1.0:
                        # include 1 in last interval
                        end = min(1.000000001, start + current_binsize)
                        n_tp = featuretable_this_type[(featuretable_this_type["tag"] == "TP") &
                                                      (featuretable_this_type[af_t_feature] >= start) &
                                                      (featuretable_this_type[af_t_feature] < end)]
                        n_fn = featuretable_this_type[(featuretable_this_type["tag"] == "FN") &
                                                      (featuretable_this_type[af_t_feature] >= start) &
                                                      (featuretable_this_type[af_t_feature] < end)]
                        n_fp = featuretable_this_type[(featuretable_this_type["tag"] == "FP") &
                                                      (featuretable_this_type[af_q_feature] >= start) &
                                                      (featuretable_this_type[af_q_feature] < end)]
                        n_ambi = featuretable_this_type[(featuretable_this_type["tag"] == "AMBI") &
                                                        (featuretable_this_type[af_q_feature] >= start) &
                                                        (featuretable_this_type[af_q_feature] < end)]
                        n_unk = featuretable_this_type[(featuretable_this_type["tag"] == "UNK") &
                                                       (featuretable_this_type[af_q_feature] >= start) &
                                                       (featuretable_this_type[af_q_feature] < end)]

                        r = {"type": "%s.%f-%f" % (vtype, start, end),
                             "total.truth": n_tp.shape[0] + n_fn.shape[0],
                             "total.query": n_tp.shape[0] + n_fp.shape[0] + n_ambi.shape[0] + n_unk.shape[0],
                             "tp": n_tp.shape[0],
                             "fp": n_fp.shape[0],
                             "fn": n_fn.shape[0],
                             "unk": n_unk.shape[0],
                             "ambi": n_ambi.shape[0], }

                        if args.count_filtered_fn:
                            r["fp.filtered"] = n_fp[n_fp["FILTER"] != ""].shape[0]
                            r["tp.filtered"] = n_tp[n_tp["FILTER"] != ""].shape[0]
                            r["unk.filtered"] = n_unk[n_unk["FILTER"] != ""].shape[0]
                            r["ambi.filtered"] = n_ambi[n_ambi["FILTER"] != ""].shape[0]

                        res = pandas.concat([res, pandas.DataFrame([r])])

                        if args.roc is not None and (n_tp.shape[0] + n_fn.shape[0] + n_fp.shape[0]) > 0:
                            roc_table_strat = args.roc.from_table(pandas.concat([n_tp, n_fp, n_fn]))
                            rtname = "%s.%s.%f-%f.roc.csv" % (args.output, vtype, start, end)
                            roc_table_strat.to_csv(rtname, float_format='%.8f')
                        start += current_binsize
                        next_binsize += 1
                        if next_binsize >= len(args.af_strat_binsize):
                            next_binsize = 0
                        current_binsize = args.af_strat_binsize[next_binsize]

        # remove things where we haven't seen any variants in truth and query
        res = res[(res["total.truth"] > 0) & (res["total.query"] > 0)]
        # summary metrics with confidence intervals
        ci_alpha = 1.0 - args.ci_level
        recall = binomialCI(res["tp"], res["tp"]+res["fn"], ci_alpha)
        precision = binomialCI(res["tp"], res["tp"]+res["fp"], ci_alpha)
        res["recall"], res["recall_lower"], res["recall_upper"] = recall
        res["recall2"] = res["tp"] / (res["total.truth"])
        res["precision"], res["precision_lower"], res["precision_upper"] = precision
        res["na"] = res["unk"] / (res["total.query"])
        res["ambiguous"] = res["ambi"] / res["total.query"]

        any_fp = fpclasses.countbases(label="FP")

        fp_region_count = 0
        auto_size = True
        if args.fpr_size:
            try:
                fp_region_count = int(args.fpr_size)
                auto_size = False
            except:
                pass
        if auto_size:
            if any_fp:
                if args.location:
                    chrom, _, rest = args.location.partition(":")
                    if rest:
                        start, _, end = rest.partition("_")
                        if start:
                            start = int(start)
                        if end:
                            end = int(end)
                    else:
                        fp_region_count += fpclasses.countbases(chrom, label="FP")
                else:
                    fp_region_count = any_fp
            else:
                cs = fastaContigLengths(args.ref)
                if args.location:
                    fp_region_count = calculateLength(cs, args.location)
                else:
                    # use all locations we saw calls on
                    h1 = Tools.vcfextract.extractHeadersJSON(ntpath)
                    h1_chrs = h1["tabix"]["chromosomes"]
                    if not h1_chrs:
                        logging.warn("ntpath is empty")
                        h1_chrs = []

                    h2 = Tools.vcfextract.extractHeadersJSON(nqpath)
                    h2_chrs = h2["tabix"]["chromosomes"]
                    if not h2_chrs:
                        logging.warn("nqpath is empty")
                        h2_chrs = []

                    combined_chrs = list(set(h1_chrs + h2_chrs))
                    if len(combined_chrs) > 0:
                        qlocations = " ".join(combined_chrs)
                        fp_region_count = calculateLength(cs, qlocations)
                    else:
                        fp_region_count = 0

        res["fp.region.size"] = fp_region_count
        res["fp.rate"] = 1e6 * res["fp"] / res["fp.region.size"]

        if args.count_filtered_fn:
            res["recall.filtered"] = (res["tp"] - res["tp.filtered"]) / (res["tp"] + res["fn"])

            res["precision.filtered"] = (res["tp"] - res["tp.filtered"]) / (res["tp"] - res["tp.filtered"] +
                                                                            res["fp"] - res["fp.filtered"])

            res["fp.rate.filtered"] = 1e6 * (res["fp"] - res["fp.filtered"]) / res["fp.region.size"]

            res["na.filtered"] = (res["unk"] - res["unk.filtered"]) / (res["total.query"])
            res["ambiguous.filtered"] = (res["ambi"] - res["ambi.filtered"]) / res["total.query"]

        # HAP-162 remove inf values
        res.replace([np.inf, -np.inf], 0)
        metrics_output["metrics"].append(dataframeToMetricsTable("result", res))
        vstring = "som.py-%s" % Tools.version

        logging.info("\n" + res.to_string())
        # in default mode, print result summary to stdout
        if not args.quiet and not args.verbose:
            print "\n" + res.to_string()

        res["sompyversion"] = vstring

        vstring = " ".join(sys.argv)
        res["sompycmd"] = vstring
        res.to_csv(args.output + ".stats.csv")
        with open(args.output + ".metrics.json", "w") as fp:
            json.dump(metrics_output, fp)

    finally:
        if args.delete_scratch:
            shutil.rmtree(scratch)
        else:
            logging.info("Scratch kept at %s" % scratch)

Example #4

Show file

File: hap.py Project: goranrakocevic/hap.py

def main():
    parser = argparse.ArgumentParser("Haplotype Comparison")

    # input
    parser.add_argument('--location', '-l', dest='locations', required=False, default=None,
                        help='Add a location to the compare list (when not given, will use chr1-22, chrX, chrY).')

    parser.add_argument("-v", "--version", dest="version", action="store_true",
                        help="Show version number and exit.")

    parser.add_argument("-P", "--include-nonpass", dest="usefiltered", action="store_true", default=False,
                        help="Use to include failing query variants in comparison.")

    parser.add_argument("--include-nonpass-truth", dest="usefiltered_truth", action="store_true", default=False,
                        help="Include failing variants from the truth dataset.")

    parser.add_argument("-R", "--restrict-regions", dest="regions_bedfile",
                        default=None, type=str,
                        help="Restrict analysis to given (sparse) regions (using -R in bcftools).")

    parser.add_argument("-T", "--target-regions", dest="targets_bedfile",
                        default=None, type=str,
                        help="Restrict analysis to given (dense) regions (using -T in bcftools).")

    parser.add_argument("-r", "--reference", dest="ref", default=None, help="Specify a reference file.")

    # output
    parser.add_argument("-o", "--report-prefix", dest="reports_prefix",
                        default=None,
                        help="Filename prefix for report output.")

    # DEPRECATED: we don't write bed files after 0.2.9
    parser.add_argument("-B", "--write-bed", dest="write_bed",
                        default=False, action="store_true",
                        help="This option is deprecated. BED files will not be written anymore.")

    # add quantification args
    qfy.updateArgs(parser)

    parser.add_argument("--scratch-prefix", dest="scratch_prefix",
                        default=None,
                        help="Directory for scratch files.")

    parser.add_argument("--keep-scratch", dest="delete_scratch",
                        default=True, action="store_false",
                        help="Filename prefix for scratch report output.")

    # detailed control of comparison
    parser.add_argument("--preprocess-truth", dest="preprocessing_truth", action="store_true", default=False,
                        help="Preprocess truth file using bcftools.")

    parser.add_argument("--external-preprocessing", dest="preprocessing", action="store_true", default=False,
                        help="Perform VCF preprocessing using bcftools.")

    parser.add_argument("--bcftools-norm", dest="preprocessing_norm", action="store_true", default=False,
                        help="Enable preprocessing through bcftools norm -c x -D (requires external "
                             " preprocessing to be switched on).")

    parser.add_argument("-N", "--numeric-chromosomes", dest="numeric_chrs", action="store_true", default=None,
                        help="Use numeric chromosome names for truth and query. This is a shortcut for "
                             "-l 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y "
                             "--no-fixchr-truth --no-fixchr-query")

    parser.add_argument("-C", "--no-numeric-chromosomes", dest="numeric_chrs", action="store_false",
                        help="Use chr-prefixed chromosome names for truth and query. This is a shortcut for "
                             "-l chr1,...,chrY"
                             "--fixchr-truth --fixchr-query")

    parser.add_argument("--fixchr-truth", dest="fixchr_truth", action="store_true", default=None,
                        help="Add chr prefix to truth file (default: auto).")

    parser.add_argument("--fixchr-query", dest="fixchr_query", action="store_true", default=None,
                        help="Add chr prefix to query file (default: auto).")

    parser.add_argument("--no-fixchr-truth", dest="fixchr_truth", action="store_false",
                        help="Disable chr replacement for truth (default: auto).")

    parser.add_argument("--no-fixchr-query", dest="fixchr_query", action="store_false",
                        help="Add chr prefix to query file (default: auto).")

    parser.add_argument("--partial-credit", dest="partial_credit", action="store_true", default=None,
                        help="give credit for partially matched variants. "
                             "this is equivalent to --internal-leftshift and --internal-preprocessing.")

    parser.add_argument("--no-partial-credit", dest="partial_credit", action="store_false", default=None,
                        help="Give credit for partially matched variants. "
                             "This is equivalent to --internal-leftshift and --no-internal-preprocessing.")

    parser.add_argument("--internal-leftshift", dest="int_preprocessing_ls", action="store_true", default=None,
                        help="Switch off xcmp's internal VCF leftshift preprocessing.")

    parser.add_argument("--internal-preprocessing", dest="int_preprocessing", action="store_true", default=None,
                        help="Switch off xcmp's internal VCF leftshift preprocessing.")

    parser.add_argument("--no-internal-leftshift", dest="int_preprocessing_ls", action="store_false", default=None,
                        help="Switch off xcmp's internal VCF leftshift preprocessing.")

    parser.add_argument("--no-internal-preprocessing", dest="int_preprocessing", action="store_false", default=None,
                        help="Switch off xcmp's internal VCF leftshift preprocessing.")

    parser.add_argument("--no-haplotype-comparison", dest="no_hc", action="store_true", default=False,
                        help="Disable haplotype comparison (only count direct GT matches as TP).")

    parser.add_argument("--unhappy", dest="unhappy", action="store_true", default=False,
                        help="Combination of --no-haplotype-comparison --no-internal-preprocessing "
                             "--no-internal-leftshift.")

    parser.add_argument("--no-auto-index", dest="auto_index", action="store_false", default=True,
                        help="Disable automatic index creation for input files. "
                             "The index is only necessary at this stage if we want to auto-detect locations. "
                             "When used with -l, and when it is known that there are variants at all given locations "
                             "this is not needed and can be switched off to save time.")

    parser.add_argument("-w", "--window-size", dest="window",
                        default=50, type=int,
                        help="Minimum distance between two variants such that they fall into different haplotype "
                             "blocks")

    parser.add_argument("--enumeration-threshold", dest="max_enum",
                        default=16768, type=int,
                        help="Enumeration threshold / maximum number of sequences to enumerate per block.")

    parser.add_argument("-e", "--expand-hapblocks", dest="hb_expand",
                        default=30, type=int,
                        help="Expand haplotype blocks by this many basepairs left and right.")
    parser.add_argument("--threads", dest="threads",
                        default=multiprocessing.cpu_count(), type=int,
                        help="Number of threads to use.")

    parser.add_argument("--engine", dest="engine",
                        default="xcmp", choices=["xcmp", "vcfeval"],
                        help="Comparison engine to use.")

    parser.add_argument("--engine-vcfeval-path", dest="engine_vcfeval", required=False,
                        help="This parameter should give the path to the \"rtg\" executable.")
    parser.add_argument("--engine-vcfeval-template", dest="engine_vcfeval_template", required=False,
                        help="Vcfeval needs the reference sequence formatted in its own file format "
                             "(SDF -- run rtg format -o ref.SDF ref.fa).")

    if Tools.has_sge:
        parser.add_argument("--force-interactive", dest="force_interactive",
                            default=False, action="store_true",
                            help="Force running interactively (i.e. when JOB_ID is not in the environment)")

    parser.add_argument("_vcfs", help="Two VCF files.", default=[], nargs="*")

    parser.add_argument("--logfile", dest="logfile", default=None,
                        help="Write logging information into file rather than to stderr")

    verbosity_options = parser.add_mutually_exclusive_group(required=False)

    verbosity_options.add_argument("--verbose", dest="verbose", default=False, action="store_true",
                                   help="Raise logging level from warning to info.")

    verbosity_options.add_argument("--quiet", dest="quiet", default=False, action="store_true",
                                   help="Set logging level to output errors only.")

    args, unknown_args = parser.parse_known_args()

    if not Tools.has_sge:
        args.force_interactive = True

    if args.verbose:
        loglevel = logging.INFO
    elif args.quiet:
        loglevel = logging.ERROR
    else:
        loglevel = logging.WARNING

    # reinitialize logging
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)
    logging.basicConfig(filename=args.logfile,
                        format='%(asctime)s %(levelname)-8s %(message)s',
                        level=loglevel)

    # remove some safe unknown args
    unknown_args = [x for x in unknown_args if x not in ["--force-interactive"]]
    if len(sys.argv) < 2 or len(unknown_args) > 0:
        if unknown_args:
            logging.error("Unknown arguments specified : %s " % str(unknown_args))
        parser.print_help()
        exit(0)

    if args.version:
        print "Hap.py %s" % Tools.version
        exit(0)

    if args.write_bed:
        logging.warn("The -B / --write-bed switches are deprecated in versions 0.2.9+, ")

    if args.roc:
        args.write_vcf = True

    # disable all clever matching
    if args.unhappy:
        args.int_preprocessing = False
        args.int_preprocessing_ls = False
        args.no_hc = True

    # Counting with partial credit
    elif args.partial_credit:
        # partial_credit switch is overridden by --no-* switches
        args.int_preprocessing = True
        args.int_preprocessing_ls = True
    elif args.partial_credit is None:
        # in the default setting, we enable partial credit but only override the
        # preprocessing settings if they haven't been specified
        if args.int_preprocessing is None:
            args.int_preprocessing = True
        if args.int_preprocessing_ls is None:
            args.int_preprocessing_ls = True
    elif args.partial_credit is not None:  # explicitly set to false
        args.int_preprocessing = False
        args.int_preprocessing_ls = True

    if args.int_preprocessing is None:
        args.int_preprocessing = False
    if args.int_preprocessing_ls is None:
        args.int_preprocessing_ls = False

    logging.info("Preprocessing settings: %s / %s / %s" % ("leftshift" if args.int_preprocessing_ls else "no-leftshift",
                                                           "splitting" if args.int_preprocessing else "raw calls",
                                                           "haplocompare" if not args.no_hc else "no-haplocompare"))

    # sanity-check regions bed file (HAP-57)
    if args.regions_bedfile:
        logging.info("Checking input regions.")
        if bedOverlapCheck(args.regions_bedfile):
            raise Exception("The regions bed file (specified using -R) has overlaps, this will not work with xcmp."
                            " You can either use -T, or run the file through bedtools merge")
        args.preprocessing_truth = True
        args.preprocessing = True

    if args.targets_bedfile or args.engine != "xcmp":
        args.preprocessing_truth = True
        args.preprocessing = True

    if args.fp_bedfile and not os.path.exists(args.fp_bedfile):
        raise Exception("FP/confident call region bed file does not exist.")

    tempfiles = []

    try:
        if not args.force_interactive and "JOB_ID" not in os.environ:
            parser.print_help()
            raise Exception("Please qsub me so I get approximately 1 GB of RAM per thread.")

        if not args.ref:
            args.ref = Tools.defaultReference()

        if not os.path.exists(args.ref):
            raise Exception("Please specify a valid reference path using -r.")

        if not args.reports_prefix:
            raise Exception("Please specify an output prefix using -o ")

        if not os.path.exists(os.path.dirname(os.path.abspath(args.reports_prefix))):
            raise Exception("The output path does not exist. Please specify a valid output path and prefix using -o")

        if os.path.basename(args.reports_prefix) == "" or os.path.isdir(args.reports_prefix):
            raise Exception("The output path should specify a file name prefix. Please specify a valid output path "
                            "and prefix using -o. For example, -o /tmp/test will create files named /tmp/test* .")

        # noinspection PyProtectedMember
        if not args._vcfs or len(args._vcfs) != 2:
            raise Exception("Please specify exactly two input VCFs.")

        # noinspection PyProtectedMember
        args.vcf1 = args._vcfs[0]
        # noinspection PyProtectedMember
        args.vcf2 = args._vcfs[1]

        if not os.path.exists(args.vcf1):
            raise Exception("Input file %s does not exist." % args.vcf1)
        if not os.path.exists(args.vcf2):
            raise Exception("Input file %s does not exist." % args.vcf2)

        logging.info("Comparing %s and %s" % (args.vcf1, args.vcf2))

        # detect numeric chromosome names
        if args.numeric_chrs is None:
            cts = fastaContigLengths(args.ref)
            cts = set(cts.keys())
            numeric_names = set(map(str, range(1, 23)) + ["X", "Y", "M"])
            non_numeric_names = set(["chr" + x for x in numeric_names])
            numeric_names &= cts
            non_numeric_names &= cts
            numeric_names = len(list(numeric_names))
            non_numeric_names = len(list(non_numeric_names))
            if numeric_names != 0 and non_numeric_names == 0:
                args.numeric_chrs = True
                logging.info("Auto-detected numeric chromosome names")
            elif numeric_names == 0 and non_numeric_names != 0:
                args.numeric_chrs = False
                logging.info("Auto-detected chr-prefixed chromosome names")

        if args.numeric_chrs:
            args.fixchr_truth = False
            args.fixchr_query = False
        elif args.numeric_chrs is not None:
            args.fixchr_truth = True
            args.fixchr_query = True

        h1 = vcfextract.extractHeadersJSON(args.vcf1)
        if args.auto_index and not h1["tabix"]:
            logging.info("Creating indexed version of %s -- consider creating an index beforehand to save time here." %
                         args.vcf1)
            vtf = tempfile.NamedTemporaryFile(delete=False,
                                              dir=args.scratch_prefix,
                                              prefix="truth.ix",
                                              suffix=".vcf.gz")
            vtf.close()
            tempfiles.append(vtf.name)
            tempfiles.append(vtf.name + ".tbi")
            args.vcf1 = Tools.bcftools.makeIndex(args.vcf1, vtf.name)
            h1 = vcfextract.extractHeadersJSON(args.vcf1)

        h2 = vcfextract.extractHeadersJSON(args.vcf2)
        if args.auto_index and not h2["tabix"]:
            logging.info("Creating indexed version of %s -- consider creating an index beforehand to save time here." %
                         args.vcf2)
            vtf = tempfile.NamedTemporaryFile(delete=False,
                                              dir=args.scratch_prefix,
                                              prefix="query.ix",
                                              suffix=".vcf.gz")
            vtf.close()
            tempfiles.append(vtf.name)
            tempfiles.append(vtf.name + ".tbi")
            args.vcf2 = Tools.bcftools.makeIndex(args.vcf2, vtf.name)
            h2 = vcfextract.extractHeadersJSON(args.vcf2)

        ref_check = True
        try:
            happy_ref = args.ref
            v1r = [_h for _h in h1["fields"] if _h["key"] == "reference"]
            v2r = [_h for _h in h2["fields"] if _h["key"] == "reference"]
            if args.verbose:
                logging.info("References used: hap.py: %s / truth: %s / "
                             "query: %s" % (str(happy_ref), str(v1r), str(v2r)))

            v1_ref = ";".join([str(xxy["value"]) for xxy in v1r]).replace("file://", "")
            v2_ref = ";".join([str(xxy["value"]) for xxy in v2r]).replace("file://", "")

            if happy_ref == v1_ref and v1_ref == v2_ref:
                ref_check = True

            rids_vh = set()
            rids_v1 = set()
            rids_v2 = set()
            for refid in ["hg19", "hg38", "grc37", "grc38"]:
                if refid in happy_ref.lower():
                    rids_vh.add(refid)
                if refid in v1_ref.lower():
                    rids_v1.add(refid)
                if refid in v2_ref.lower():
                    rids_v2.add(refid)

            rids_v1 = sorted(list(rids_v1))
            rids_v2 = sorted(list(rids_v2))
            rids_vh = sorted(list(rids_vh))

            to_cmp = None
            if rids_v1:
                to_cmp = rids_v1
            if rids_v2:
                to_cmp = rids_v2
            if rids_vh:
                to_cmp = rids_vh
            if to_cmp and rids_v1 and rids_v1 != to_cmp:
                ref_check = False
            if to_cmp and rids_v2 and rids_v2 != to_cmp:
                ref_check = False
            if to_cmp and rids_vh and rids_vh != to_cmp:
                ref_check = False

        except:
            pass

        if not ref_check:
            logging.warn("Reference sequence check failed! "
                         "Please ensure that truth and query VCF use the same reference sequence as "
                         "hap.py. XCMP may fail if this is not the case, and the results will not be "
                         " accurate.")

        if args.locations is None or len(args.locations) == 0:
            # all chromosomes
            if args.numeric_chrs:
                args.locations = [x for x in map(str, range(1, 23))]
            else:
                args.locations = ["chr" + x for x in map(str, range(1, 23))]

        if type(args.locations) is not list and args.locations is not None:
            # noinspection PyUnresolvedReferences
            args.locations = args.locations.split(",")

        # HAP-143 fix the case where no chromosomes are in truth or query
        try:
            if not h1["tabix"]["chromosomes"]:
                h1["tabix"]["chromosomes"] = []
        except:
            pass
        try:
            if not h2["tabix"]["chromosomes"]:
                h2["tabix"]["chromosomes"] = []
        except:
            pass

        if not h1["tabix"]:
            args.preprocessing_truth = True
            logging.warn("Truth file is not Tabix indexed. Switching on pre-processing + chr name conversion.")
            if args.fixchr_truth is None:
                args.fixchr_truth = True
        elif args.fixchr_truth is None:
            logging.info(str(h1["tabix"]))
            # autodetect chr naming
            count_with_fix = len([__ for __ in h1["tabix"]["chromosomes"]
                                  if ("chr%s" % str(__)) in args.locations])
            count_no_fix = len([__ for __ in h1["tabix"]["chromosomes"] if str(__) in args.locations])
            logging.info("Truth: Number of chromosome names matching with / without renaming : %i / %i " % (
                count_with_fix, count_no_fix))
            if count_with_fix > count_no_fix:
                args.fixchr_truth = True
                logging.info("Will fix chromosome names (truth).")
            else:
                logging.info("Will not fix chromosome names (truth).")
                args.fixchr_truth = False

        if not h2["tabix"]:
            args.preprocessing = True
            logging.warn("Query file is not Tabix indexed. Switching on pre-processing + chr name conversion.")
            # don't overwrite setting, but if it's None, replace with True to be sure
            if args.fixchr_query is None:
                args.fixchr_query = True
        elif args.fixchr_query is None:
            # autodetect chr naming
            count_with_fix = len([__ for __ in h2["tabix"]["chromosomes"]
                                  if ("chr%s" % str(__)) in args.locations])
            count_no_fix = len([__ for __ in h2["tabix"]["chromosomes"] if str(__) in args.locations])
            logging.info("Query: Number of chromosome names matching with / without renaming : %i / %i " % (
                count_with_fix, count_no_fix))
            if count_with_fix > count_no_fix:
                args.fixchr_query = True
                logging.info("Will fix chromosome names (query).")
            else:
                logging.info("Will not fix chromosome names (query).")
                args.fixchr_query = False

        if args.fixchr_truth or args.preprocessing_norm:
            args.preprocessing_truth = True

        if args.fixchr_query or args.preprocessing_norm:
            args.preprocessing = True

        if args.preprocessing_truth:
            vtf = tempfile.NamedTemporaryFile(delete=False,
                                              dir=args.scratch_prefix,
                                              prefix="truth.pp",
                                              suffix=".vcf.gz")
            vtf.close()
            tempfiles.append(vtf.name)
            preprocessVCF(args.vcf1, vtf.name, ",".join(args.locations),
                          not args.usefiltered_truth,  # pass_only
                          args.fixchr_truth,  # chrprefix
                          args.preprocessing_norm,  # norm,
                          args.regions_bedfile,
                          args.targets_bedfile,
                          args.ref)
            args.vcf1 = vtf.name
            # get headers again if we preprocessed
            h1 = vcfextract.extractHeadersJSON(args.vcf1)

        if args.preprocessing:
            vtf = tempfile.NamedTemporaryFile(delete=False,
                                              dir=args.scratch_prefix,
                                              prefix="query.pp",
                                              suffix=".vcf.gz")
            vtf.close()
            tempfiles.append(vtf.name)
            preprocessVCF(args.vcf2, vtf.name, ",".join(args.locations),
                          False,  # query filters are handled further down in matching
                          args.fixchr_query,  # chrprefix
                          args.preprocessing_norm,  # norm,
                          args.regions_bedfile,
                          args.targets_bedfile,
                          args.ref)
            args.vcf2 = vtf.name
            # get headers again if we preprocessed
            h2 = vcfextract.extractHeadersJSON(args.vcf2)

        if not h1["tabix"]:
            raise Exception("Truth file is not Tabix indexed.")

        if not h2["tabix"]:
            raise Exception("Query file is not Tabix indexed.")

        newlocations = []

        if not h1["tabix"]["chromosomes"]:
            h1["tabix"]["chromosomes"] = []
        if not h2["tabix"]["chromosomes"]:
            h2["tabix"]["chromosomes"] = []

        for _xc in args.locations:
            xc = _xc.split(":")[0]
            if xc not in h1["tabix"]["chromosomes"]:
                logging.warn("No calls for location %s in truth!" % xc)
            if xc not in h2["tabix"]["chromosomes"]:
                logging.warn("No calls for location %s in query!" % xc)

            if (xc not in h1["tabix"]["chromosomes"]) and (xc not in h2["tabix"]["chromosomes"]):
                logging.warn("Removing location %s because neither input file has calls there." % xc)
            else:
                newlocations.append(_xc)

        if not newlocations:
            raise Exception("Location list is empty: the input files do not appear to have variants on any of %s" %
                            str(args.locations))

        args.locations = newlocations

        if args.threads > 1 and args.engine == "xcmp":
            logging.info("Running using %i parallel processes." % args.threads)
            pool = multiprocessing.Pool(int(args.threads))

            # find balanced pieces
            args.pieces = (args.threads + len(args.locations) - 1) / len(args.locations)
            res = runParallel(pool, Haplo.blocksplit.blocksplitWrapper, args.locations, args)

            if None in res:
                raise Exception("One of the blocksplit processes failed.")

            tempfiles += res

            args.locations = []
            for f in res:
                with open(f) as fp:
                    for l in fp:
                        ll = l.strip().split("\t", 3)
                        if len(ll) < 3:
                            continue
                        xchr = ll[0]
                        start = int(ll[1]) + 1
                        end = int(ll[2])
                        args.locations.append("%s:%i-%i" % (xchr, start, end))
        else:
            pool = None

        # count variants before normalisation
        if "samples" not in h1 or not h1["samples"]:
            raise Exception("Cannot read sample names from truth VCF file")

        if "samples" not in h2 or not h2["samples"]:
            raise Exception("Cannot read sample names from query VCF file")

        tf = tempfile.NamedTemporaryFile(delete=False,
                                         dir=args.scratch_prefix,
                                         prefix="hap.py.result.", suffix=".vcf.gz")
        tf.close()
        tempfiles.append(tf.name)
        output_name = tf.name

        if args.engine == "xcmp":
            # do xcmp
            logging.info("Using xcmp for comparison")
            res = runParallel(pool, Haplo.xcmp.xcmpWrapper, args.locations, args)
            tempfiles += [x[0] for x in res if x is not None]  # VCFs
            tempfiles += [x[1] for x in res if x is not None and x[1] is not None]  # beds (if any)

            if None in res:
                raise Exception("One of the xcmp jobs failed.")

            if len(res) == 0:
                raise Exception("Input files/regions do not contain variants (0 haplotype blocks were processed).")

            # concatenate + index
            logging.info("Concatenating variants...")
            runme_list = [x[0] for x in res if x is not None]
            if len(runme_list) == 0:
                raise Exception("No outputs to concatenate!")

            fo = Tools.BGZipFile(output_name, True)
            for i, x in enumerate(runme_list):
                f = gzip.GzipFile(x)
                for l in f:
                    if i == 0 or not l[0] == "#":
                        fo.write(l)
            fo.close()

            logging.info("Indexing...")
            to_run = "tabix -p vcf %s" % output_name.replace(" ", "\\ ")
            logging.info("Running '%s'" % to_run)
            subprocess.check_call(to_run, shell=True)
            # passed to quantify
            args.type = "xcmp"
        elif args.engine == "vcfeval":
            tempfiles += Haplo.vcfeval.runVCFEval(args.vcf1, args.vcf2, output_name, args)
            # passed to quantify
            args.type = "ga4gh"
        else:
            raise Exception("Unknown comparison engine: %s" % args.engine)

        logging.info("Counting variants...")

        args.in_vcf = [output_name]
        args.runner = "hap.py"
        qfy.quantify(args)

    finally:
        if args.delete_scratch:
            for x in tempfiles:
                try:
                    os.remove(x)
                except:
                    pass
        else:
            logging.info("Scratch files kept : %s" % (str(tempfiles)))

Example #5

Show file

def preprocess(vcf_input,
               vcf_output,
               reference,
               locations=None,
               filters=None,
               fixchr=None,
               regions=None,
               targets=None,
               leftshift=True,
               decompose=True,
               bcftools_norm=False,
               windowsize=10000,
               threads=1,
               gender=None,
               somatic_allele_conversion=False,
               sample="SAMPLE"):
    """ Preprocess a single VCF file

    :param vcf_input: input file name
    :param vcf_output: output file name
    :param reference: reference fasta name
    :param locations: list of locations or None
    :param filters: list of filters to apply ("*" to only allow PASS)
    :param fixchr: None for auto, or True/False -- fix chr prefix to match reference
    :param regions: regions bed file
    :param targets: targets bed file
    :param leftshift: left-shift variants
    :param decompose: decompose variants
    :param bcftools_norm: use bcftools_norm
    :param windowsize: normalisation window size
    :param threads: number of threads to for preprcessing
    :param gender: the gender of the sample ("male" / "female" / "auto" / None)
    :param somatic_allele_conversion: convert somatic alleles -- False / half / het / hemi / hom
    :param sample: when using somatic_allele_conversion, name of the output sample

    :return: the gender if auto-determined (otherwise the same value as gender parameter)
    """

    tempfiles = []
    try:
        # If the input is in BCF format, we can continue to
        # process it in bcf
        # if it is in .vcf.gz, don't try to convert it to
        # bcf because there are a range of things that can
        # go wrong there (e.g. undefined contigs and bcftools
        # segfaults)
        if vcf_input.endswith(".bcf") or vcf_output.endswith(".bcf"):
            int_suffix = ".bcf"
            int_format = "b"
            if not vcf_input.endswith(".bcf") and vcf_output.endswith(".bcf"):
                logging.warn(
                    "Turning vcf into bcf can cause problems when headers are not consistent with all "
                    "records in the file. I will run vcfcheck to see if we will run into trouble. "
                    "To save time in the future, consider converting your files into bcf using bcftools before"
                    " running pre.py.")
        else:
            int_suffix = ".vcf.gz"
            int_format = "z"

        # HAP-317 always check for BCF errors since preprocessing tools now require valid headers
        mf = subprocess.check_output("vcfcheck %s --check-bcf-errors 1" %
                                     pipes.quote(vcf_input),
                                     shell=True)

        if gender == "auto":
            logging.info(mf)
            if "female" in mf:
                gender = "female"
            else:
                gender = "male"

        h = vcfextract.extractHeadersJSON(vcf_input)
        reference_contigs = set(fastaContigLengths(reference).keys())
        reference_has_chr_prefix = hasChrPrefix(reference_contigs)

        allfilters = []
        for f in h["fields"]:
            try:
                if f["key"] == "FILTER":
                    allfilters.append(f["values"]["ID"])
            except:
                logging.warn("ignoring header: %s" % str(f))

        required_filters = None
        if filters:
            fts = filters.split(",")
            required_filters = ",".join(
                list(
                    set(["PASS", "."] +
                        [x for x in allfilters if x not in fts])))

        if fixchr is None:
            try:
                if not h["tabix"]:
                    logging.warn(
                        "input file is not tabix indexed, consider doing this in advance for performance reasons"
                    )
                    vtf = tempfile.NamedTemporaryFile(delete=False,
                                                      suffix=int_suffix)
                    vtf.close()
                    tempfiles.append(vtf.name)
                    runBcftools("view", "-o", vtf.name, "-O", int_format,
                                vcf_input)
                    runBcftools("index", vtf.name)
                    h2 = vcfextract.extractHeadersJSON(vcf_input)
                    chrlist = h2["tabix"]["chromosomes"]
                else:
                    chrlist = h["tabix"]["chromosomes"]
                vcf_has_chr_prefix = hasChrPrefix(chrlist)

                if reference_has_chr_prefix and not vcf_has_chr_prefix:
                    fixchr = True
            except:
                logging.warn("Guessing the chr prefix in %s has failed." %
                             vcf_input)

        # all these require preprocessing
        vtf = vcf_input

        if leftshift or decompose:
            vtf = tempfile.NamedTemporaryFile(delete=False, suffix=int_suffix)
            vtf.close()
            tempfiles.append(vtf.name)
            vtf = vtf.name
        else:
            vtf = vcf_output

        preprocessVCF(vcf_input,
                      vtf,
                      locations,
                      filters == "*",
                      fixchr,
                      bcftools_norm,
                      regions,
                      targets,
                      reference,
                      required_filters,
                      somatic_allele_conversion=somatic_allele_conversion,
                      sample=sample)

        if leftshift or decompose or gender == "male":
            Haplo.partialcredit.partialCredit(vtf,
                                              vcf_output,
                                              reference,
                                              locations,
                                              threads=threads,
                                              window=windowsize,
                                              leftshift=leftshift,
                                              decompose=decompose,
                                              haploid_x=gender == "male")
    finally:
        for t in tempfiles:
            try:
                os.unlink(t)
            except:
                pass

    return gender

Example #6

Show file

File: som.py Project: pkrusche/hap.py

def main():
    parser = argparse.ArgumentParser("Somatic Comparison")

    parser.add_argument("truth", help="Truth VCF file")
    parser.add_argument("query", help="Query VCF file")

    parser.add_argument("-o", "--output", dest="output", required=True,
                        help="Output file prefix for statistics and feature table (when selected)")

    parser.add_argument("-l", "--location", dest="location", default="",
                        help="Location for bcftools view (e.g. chr1)")

    parser.add_argument("-R", "--restrict-regions", dest="regions_bedfile",
                        default=None, type=str,
                        help="Restrict analysis to given (sparse) regions (using -R in bcftools).")

    parser.add_argument("-T", "--target-regions", dest="targets_bedfile",
                        default=None, type=str,
                        help="Restrict analysis to given (dense) regions (using -T in bcftools).")

    parser.add_argument("-f", "--false-positives", dest="FP",
                        help="False-positive region bed file to distinguish UNK from FP")

    parser.add_argument("-a", "--ambiguous", dest="ambi", action='append',
                        help="Ambiguous region bed file(s) to distinguish from FP (e.g. variant only observed "
                             "in some replicates)")

    parser.add_argument("--ambi-fp", dest="ambi_fp", action='store_true', default=False,
                        help="Use FP calls from ambiguous region files also.")

    parser.add_argument("--no-ambi-fp", dest="ambi_fp", action='store_false',
                        help="Do not use FP calls from ambiguous region files also.")

    parser.add_argument("--count-unk", dest="count_unk", action='store_true', default=False,
                        help="Assume the truth set covers the whole genome and only count FPs in regions "
                             "specified by the truth VCF or ambiguous/false-positive regions.")

    parser.add_argument("--no-count-unk", dest="count_unk", action='store_false',
                        help="Do not use FP calls from ambiguous region files also.")

    parser.add_argument("-e", "--explain_ambiguous", dest="explain_ambiguous", required=False,
                        default=False, action="store_true",
                        help="print a table giving the number of ambiguous events per category")

    parser.add_argument("-r", "--reference", dest="ref", default=Tools.defaultReference(),
                        help="Specify a reference file.")

    parser.add_argument("--scratch-prefix", dest="scratch_prefix",
                        default=None,
                        help="Filename prefix for scratch report output.")

    parser.add_argument("--keep-scratch", dest="delete_scratch",
                        default=True, action="store_false",
                        help="Filename prefix for scratch report output.")

    parser.add_argument("--continue", dest="cont", default=False, action="store_true",
                        help="Continue from scratch space (i.e. use VCFs in there if they already exist).")

    parser.add_argument("-P", "--include-nonpass", dest="inc_nonpass", action="store_true", default=False,
                        help="Use to include failing variants in comparison.")

    parser.add_argument("--feature-table", dest="features", default=False, choices=Somatic.FeatureSet.sets.keys(),
                        help="Select a feature table to output.")

    parser.add_argument("--bam", dest="bams", default=[], action="append",
                        help="pass one or more BAM files for feature table extraction")

    parser.add_argument("--normalize-truth", dest="normalize_truth", default=False, action="store_true",
                        help="Enable running of bcftools norm on the truth file.")

    parser.add_argument("--normalize-query", dest="normalize_query", default=False, action="store_true",
                        help="Enable running of bcftools norm on the query file.")

    parser.add_argument("-N", "--normalize-all", dest="normalize_all", default=False, action="store_true",
                        help="Enable running of bcftools norm on both truth and query file.")

    parser.add_argument("--fixchr-truth", dest="fixchr_truth", action="store_true", default=True,
                        help="Add chr prefix to truth file (default: true).")

    parser.add_argument("--fixchr-query", dest="fixchr_query", action="store_true", default=True,
                        help="Add chr prefix to query file (default: true).")

    parser.add_argument("--fix-chr-truth", dest="fixchr_truth", action="store_true", default=None,
                        help="Same as --fixchr-truth.")

    parser.add_argument("--fix-chr-query", dest="fixchr_query", action="store_true", default=None,
                        help="Same as --fixchr-query.")

    parser.add_argument("--no-fixchr-truth", dest="fixchr_truth", action="store_false", default=False,
                        help="Disable chr replacement for truth (default: false).")

    parser.add_argument("--no-fixchr-query", dest="fixchr_query", action="store_false", default=False,
                        help="Add chr prefix to query file (default: false).")
    
    parser.add_argument("--no-order-check", dest="disable_order_check", default=False, action="store_true",
                        help="Disable checking the order of TP features (dev feature).")

    parser.add_argument("--roc", dest="roc", default=None, choices=ROC.list(),
                        help="Create a ROC-style table. This is caller specific "
                             " - this will override the --feature-table switch!")

    parser.add_argument("--bin-afs", dest="af_strat", default=None, action="store_true",
                        help="Stratify into different AF buckets. This needs to have features available"
                             "for getting the AF both in truth and query variants.")
    parser.add_argument("--af-binsize", dest="af_strat_binsize", default=0.2,
                        help="Bin size for AF binning (should be < 1). Multiple bin sizes can be specified using a comma, "
                             "e.g. 0.1,0.2,0.5,0.2 will split at 0.1, 0.3, 0.8 and 1.0.")
    parser.add_argument("--af-truth", dest="af_strat_truth", default="I.T_ALT_RATE",
                        help="Feature name to use for retrieving AF for truth variants (TP and FN)")
    parser.add_argument("--af-query", dest="af_strat_query", default="T_AF",
                        help="Feature name to use for retrieving AF for query variants (FP/UNK/AMBI)")

    parser.add_argument("-FN", "--count-filtered-fn", dest="count_filtered_fn", action="store_true",
                        help="Count filtered vs. absent FN numbers. This requires the -P switch (to use all "
                             "variants) and either the --feature-table or --roc switch.")

    parser.add_argument("--fp-region-size", dest="fpr_size",
                        help="How to obtain the normalisation constant for FP rate. By default, this will use the FP region bed file size when using"
                             " --count-unk and the size of all reference contigs that overlap with the location specified in -l otherwise."
                             " This can be overridden with: 1) a number of nucleotides, or 2) \"auto\" to use the lengths of all contigs that have calls."
                             " The resulting value is used as fp.region.size.")

    parser.add_argument("--logfile", dest="logfile", default=None,
                        help="Write logging information into file rather than to stderr")

    verbosity_options = parser.add_mutually_exclusive_group(required=False)

    verbosity_options.add_argument("--verbose", dest="verbose", default=False, action="store_true",
                                   help="Raise logging level from warning to info.")

    verbosity_options.add_argument("--quiet", dest="quiet", default=False, action="store_true",
                                   help="Set logging level to output errors only.")

    args = parser.parse_args()

    if args.verbose:
        loglevel = logging.INFO
    elif args.quiet:
        loglevel = logging.ERROR
    else:
        loglevel = logging.WARNING

    try:
        if type(args.af_strat_binsize) is str:
            args.af_strat_binsize = map(float, args.af_strat_binsize.split(","))
        else:
            args.af_strat_binsize = map(float, [args.af_strat_binsize])

        if not args.af_strat_binsize:
            raise Exception("Bin size list is empty")
    except:
        logging.error("Failed to parse stratification bin size: %s" % str(args.af_strat_binsize))
        exit(1)

    # reinitialize logging
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)
    logging.basicConfig(filename=args.logfile,
                        format='%(asctime)s %(levelname)-8s %(message)s',
                        level=loglevel)

    if args.normalize_all:
        args.normalize_truth = True
        args.normalize_query = True

    if args.roc:
        args.roc = ROC.make(args.roc)
        args.features = args.roc.ftname
        if not args.inc_nonpass:
            logging.warn("When creating ROCs without the -P switch, the ROC data points will only "
                         "include filtered variants (i.e. they will normally end at the caller's "
                         "quality threshold).")

    if args.af_strat and not args.features:
        raise Exception("To stratify by AFs, a feature table must be selected -- use this switch together "
                        "with --feature-table or --roc")

    if args.count_filtered_fn and (not args.inc_nonpass or not args.features):
        raise Exception("Counting filtered / unfiltered FNs only works when a feature table is selected, "
                        "and when using unfiltered variants. Specify -P --feature-table <...> or use "
                        "--roc to select a ROC type.")

    if args.scratch_prefix:
        scratch = os.path.abspath(args.scratch_prefix)
        args.delete_scratch = False
        Tools.mkdir_p(scratch)
    else:
        scratch = tempfile.mkdtemp()

    logging.info("Scratch path is %s" % scratch)
    try:
        bams = []
        md = None
        for x in args.bams:
            bams.append(bamStats(x))

        if bams:
            bres = pandas.concat(bams).groupby("CHROM").mean()

            md = {}

            for x in bres.index:
                logging.info("Mean coverage on %s is %f" % (x, bres.loc[x]["COVERAGE"]))
                md[x] = float(bres.loc[x]["COVERAGE"]) * 3.0

        logging.info("Normalizing/reading inputs")

        ntpath = os.path.join(scratch, "normalized_truth.vcf.gz")

        if not (args.cont and os.path.exists(ntpath)):
            preprocessVCF(args.truth, ntpath, args.location,
                          True,  # pass_only
                          args.fixchr_truth,  # chrprefix
                          args.normalize_truth,  # norm,
                          args.regions_bedfile,
                          args.targets_bedfile,
                          args.ref)
        else:
            logging.info("Continuing from %s" % ntpath)

        if not (args.cont and os.path.exists(ntpath + ".csi")):
            runBcftools("index", ntpath)

        nqpath = os.path.join(scratch, "normalized_query.vcf.gz")

        if not (args.cont and os.path.exists(nqpath)):
            preprocessVCF(args.query, nqpath, args.location,
                          not args.inc_nonpass,  # pass_only
                          args.fixchr_query,  # chrprefix
                          args.normalize_query,  # norm,
                          args.regions_bedfile,
                          args.targets_bedfile,
                          args.ref)
        else:
            logging.info("Continuing from %s" % nqpath)

        if not (args.cont and os.path.exists(nqpath + ".csi")):
            runBcftools("index", nqpath)

        logging.info("Intersecting")

        tpfn_files = all([os.path.exists(os.path.join(scratch, "tpfn", "0000.vcf.gz")),
                          os.path.exists(os.path.join(scratch, "tpfn", "0001.vcf.gz")),
                          os.path.exists(os.path.join(scratch, "tpfn", "0002.vcf.gz"))])

        tpfn_r_files = all([os.path.exists(os.path.join(scratch, "tpfn", "0000.vcf.gz")),
                            os.path.exists(os.path.join(scratch, "tpfn", "0001.vcf.gz")),
                            os.path.exists(os.path.join(scratch, "tpfn", "0002.vcf.gz"))])

        if not (args.cont and tpfn_files):
            runBcftools("isec", ntpath, nqpath, "-p", os.path.join(scratch, "tpfn"), "-O", "z")
        else:
            logging.info("Continuing from %s" % os.path.join(scratch, "tpfn"))

        if args.features and not (args.cont and tpfn_r_files):
            # only need to do this for getting the feature table
            runBcftools("isec", nqpath, ntpath, "-p", os.path.join(scratch, "tpfn_r"), "-O", "z")

        logging.info("Getting FPs / Ambi / Unk")

        fppath = os.path.join(scratch, "fp.vcf.gz")
        unkpath = os.path.join(scratch, "unk.vcf.gz")
        ambipath = os.path.join(scratch, "ambi.vcf.gz")

        # get header to print to unk and ambi VCFs
        rununiquepath = os.path.join(scratch, "tpfn", "0001.vcf.gz")
        header = runBcftools("view", rununiquepath, "--header-only")

        fp = Tools.BGZipFile(fppath, True)
        fp.write(header)

        unk = Tools.BGZipFile(unkpath, True)
        unk.write(header)

        ambi = Tools.BGZipFile(ambipath, True)
        ambi.write(header)

        ambiClasses = Counter()
        ambiReasons = Counter()

        fpclasses = BedIntervalTree()
        if args.ambi:
            # can have multiple ambiguous BED files
            for aBED in args.ambi:
                # auto-label from first value after chr start end
                # new ambi files have the label in position 4
                # old ones will look weird here.
                fpclasses.addFromBed(aBED, lambda xe: xe[4], args.fixchr_truth)

        if args.FP:
            fpclasses.addFromBed(args.FP, "FP", args.fixchr_truth)

        # split VCF into FP, UNK and AMBI
        toProcess = gzip.open(rununiquepath, "rb")
        for entry in toProcess:
            if entry[0] == '#':
                continue

            fields = entry.strip().split("\t")
            chrom = fields[0]
            start = int(fields[1])
            stop = int(fields[1]) + len(fields[3])

            overlap = fpclasses.intersect(chrom, start, stop)

            is_fp = False
            is_ambi = False

            classes_this_pos = set()

            for o in overlap:
                reason = o.value[0]
                if reason == "fp" and args.ambi_fp:
                    reason = "FP"
                elif reason == "fp":
                    reason = "ambi-fp"
                elif reason == "unk":
                    reason = "ambi-unk"

                classes_this_pos.add(reason)
                try:
                    ambiReasons["%s: rep. count %s" % (reason, o.value[1])] += 1
                except IndexError:
                    ambiReasons["%s: rep. count *" % reason] += 1
                for x in o.value[3:]:
                    ambiReasons["%s: %s" % (reason, x)] += 1
                if reason == "FP":
                    is_fp = True
                else:
                    is_ambi = True

            for reason in classes_this_pos:
                ambiClasses[reason] += 1

            if is_fp:
                fp.write(entry)
            elif is_ambi:
                ambi.write(entry)
            elif not args.count_unk:
                # when we don't have FP regions, unk stuff becomes FP
                fp.write(entry)
            else:
                unk.write(entry)

        toProcess.close()

        # since 0001.vcf.gz should already be sorted, we can just convert to bgzipped vcf
        # and create index
        fp.close()
        ambi.close()
        unk.close()

        runBcftools("index", "--tbi", fppath)
        runBcftools("index", "--tbi", unkpath)
        runBcftools("index", "--tbi", ambipath)

        logging.info("Counting variants...")

        truthcounts = parseStats(runBcftools("stats", ntpath), "total.truth")
        querycounts = parseStats(runBcftools("stats", nqpath), "total.query")

        tpcounts = parseStats(runBcftools("stats", os.path.join(scratch, "tpfn", "0002.vcf.gz")), "tp")
        fncounts = parseStats(runBcftools("stats", os.path.join(scratch, "tpfn", "0000.vcf.gz")), "fn")
        fpcounts = parseStats(runBcftools("stats", fppath), "fp")
        ambicounts = parseStats(runBcftools("stats", ambipath), "ambi")
        unkcounts = parseStats(runBcftools("stats", unkpath), "unk")

        res = pandas.merge(truthcounts, querycounts, on="type")
        res = pandas.merge(res, tpcounts, on="type")
        res = pandas.merge(res, fpcounts, on="type")
        res = pandas.merge(res, fncounts, on="type")
        res = pandas.merge(res, unkcounts, on="type")
        res = pandas.merge(res, ambicounts, on="type")

        # no explicit guarantee that total.query is equal to unk + ambi + fp + tp
        # testSum = res["fp"] + res["tp"] + res["unk"] + res["ambi"]

        # filter and relabel
        res = res[res["type"] != "samples"]
        res = res[res["type"] != "multiallelic SNP sites"]
        res = res[res["type"] != "multiallelic sites"]
        res.loc[res["type"] == "SNPs", "type"] = "SNVs"

        metrics_output = makeMetricsObject("som.py.comparison")

        if args.ambi and args.explain_ambiguous:
            ac = list(ambiClasses.iteritems())
            if ac:
                ambie = pandas.DataFrame(ac, columns=["class", "count"])
                ambie.sort(["class"], inplace=True)
                pandas.set_option("display.max_rows", 1000)
                pandas.set_option("display.max_columns", 1000)
                pandas.set_option("display.width", 1000)
                pandas.set_option("display.height", 1100)
                logging.info("FP/ambiguity classes with info (multiple classes can "
                             "overlap):\n" + ambie.to_string(index=False))
                # in default mode, print result summary to stdout
                if not args.quiet and not args.verbose:
                    print "FP/ambiguity classes with info (multiple classes can " \
                          "overlap):\n" + ambie.to_string(index=False)
                ambie.to_csv(args.output + ".ambiclasses.csv")
                metrics_output["metrics"].append(dataframeToMetricsTable("ambiclasses", ambie))
            else:
                logging.info("No ambiguous variants.")

            ar = list(ambiReasons.iteritems())
            if ar:
                ambie = pandas.DataFrame(ar, columns=["reason", "count"])
                ambie.sort(["reason"], inplace=True)
                pandas.set_option("display.max_rows", 1000)
                pandas.set_option("display.max_columns", 1000)
                pandas.set_option("display.width", 1000)
                pandas.set_option("display.height", 1100)
                logging.info("Reasons for defining as ambiguous (multiple reasons can overlap):\n" + ambie.to_string(
                    formatters={'reason': '{{:<{}s}}'.format(ambie['reason'].str.len().max()).format}, index=False))
                # in default mode, print result summary to stdout
                if not args.quiet and not args.verbose:
                    print "Reasons for defining as ambiguous (multiple reasons can overlap):\n" + ambie.to_string(
                        formatters={'reason': '{{:<{}s}}'.format(ambie['reason'].str.len().max()).format}, index=False)
                ambie.to_csv(args.output + ".ambireasons.csv")
                metrics_output["metrics"].append(dataframeToMetricsTable("ambireasons", ambie))
            else:
                logging.info("No ambiguous variants.")

        if args.features:
            logging.info("Extracting features...")
            fset = Somatic.FeatureSet.make(args.features)
            fset.setChrDepths(md)

            logging.info("Collecting TP info (1)...")
            tps = fset.collect(os.path.join(scratch, "tpfn", "0002.vcf.gz"), "TP")

            # TP_r is a hint for fset, they are both TPs
            logging.info("Collecting TP info (2)...")
            tps2 = fset.collect(os.path.join(scratch, "tpfn_r", "0002.vcf.gz"), "TP_r")

            # this is slow because it tries to sort
            # ... which we don't need to do since tps1 and tps2 have the same ordering

            logging.info("Sorting...")
            tps.sort(["CHROM", "POS"], inplace=True)
            tps2.sort(["CHROM", "POS"], inplace=True)
            tps = tps.reset_index(drop=True)
            tps2 = tps2.reset_index(drop=True)

            logging.info("Merging TP info...")
            columns_tps = list(tps)
            columns_tps2 = list(tps2)

            len1 = tps.shape[0]
            len2 = tps2.shape[0]

            if len1 != len2:
                raise Exception("Cannot read TP features, lists have different lengths : %i != %i" % (len1, len2))

            if not args.disable_order_check:
                logging.info("Checking order %i / %i" % (len1, len2))

                for x in xrange(0, len1):
                    for a in ["CHROM", "POS"]:
                        if tps.loc[x][a] != tps2.loc[x][a]:
                            raise Exception("Cannot merge TP features, inputs are out of order at %s / %s" % (
                                str(tps[x:x + 1]), str(tps2[x:x + 1])))

            logging.info("Merging...")

            cdata = {
                "CHROM": tps["CHROM"],
                "POS": tps["POS"],
                "tag": tps["tag"]
            }

            tpc = pandas.DataFrame(cdata, columns=["CHROM", "POS", "tag"])

            all_columns = list(set(columns_tps + columns_tps2))
            for a in all_columns:
                if a in columns_tps and a not in columns_tps2:
                    tpc[a] = tps[a]
                elif a not in columns_tps and a in columns_tps2:
                    tpc[a] = tps2[a]
                elif a not in ["CHROM", "POS", "tag"]:
                    tpc[a] = tps2[a]
                    tpc[a + ".truth"] = tps[a]

            logging.info("Collecting FP info...")
            fps = fset.collect(fppath, "FP")
            ambs = fset.collect(ambipath, "AMBI")

            logging.info("Collecting FN info...")
            fns = fset.collect(os.path.join(scratch, "tpfn", "0000.vcf.gz"), "FN")

            renamed = {}
            tp_cols = list(tpc)
            for col in list(fns):
                if col + ".truth" in tp_cols:
                    renamed[col] = col + ".truth"
            fns.rename(columns=renamed, inplace=True)

            featurelist = [tpc, fps, fns, ambs]

            if unkpath is not None:
                logging.info("Collecting UNK info...")
                unk = fset.collect(unkpath, "UNK")
                featurelist.append(unk)

            logging.info("Making feature table...")
            featuretable = pandas.concat(featurelist)

            # reorder to make more legible
            first_columns = ["CHROM", "POS", "tag"]
            # noinspection PyTypeChecker
            all_columns = list(featuretable)

            if "REF" in all_columns:
                first_columns.append("REF")

            if "REF.truth" in all_columns:
                first_columns.append("REF.truth")

            if "ALT" in all_columns:
                first_columns.append("ALT")

            if "ALT.truth" in all_columns:
                first_columns.append("ALT.truth")

            ordered_columns = first_columns + sorted([x for x in all_columns if x not in first_columns])
            featuretable = featuretable[ordered_columns]
            # make sure positions are integers
            featuretable["POS"] = featuretable["POS"].astype(int)

            logging.info("Saving feature table...")
            featuretable.to_csv(args.output + ".features.csv", float_format='%.8f')

            if args.roc is not None:
                roc_table = args.roc.from_table(featuretable)
                roc_table.to_csv(args.output + ".roc.csv", float_format='%.8f')

            featuretable["FILTER"].fillna("", inplace=True)
            featuretable.ix[featuretable["REF"].str.len() < 1, "absent"] = True
            featuretable.ix[featuretable["tag"] == "FN", "REF"] = featuretable.ix[featuretable["tag"] == "FN",
                                                                                  "REF.truth"]
            featuretable.ix[featuretable["tag"] == "FN", "ALT"] = featuretable.ix[featuretable["tag"] == "FN",
                                                                                  "ALT.truth"]
            af_t_feature = args.af_strat_truth
            af_q_feature = args.af_strat_query
            for vtype in ["records", "SNVs", "indels"]:
                if vtype == "SNVs":
                    featuretable_this_type = featuretable[(featuretable["REF"].str.len() > 0) &
                                                          (featuretable["ALT"].str.len() ==
                                                           featuretable["REF"].str.len())]
                elif vtype == "indels":
                    featuretable_this_type = featuretable[(featuretable["REF"].str.len() != 1) |
                                                          (featuretable["ALT"].str.len() != 1)]
                else:
                    featuretable_this_type = featuretable

                if args.count_filtered_fn:
                    res.ix[res["type"] == vtype, "fp.filtered"] = featuretable_this_type[
                        (featuretable_this_type["tag"] == "FP") & (featuretable_this_type["FILTER"] != "")].shape[0]
                    res.ix[res["type"] == vtype, "tp.filtered"] = featuretable_this_type[
                        (featuretable_this_type["tag"] == "TP") & (featuretable_this_type["FILTER"] != "")].shape[0]
                    res.ix[res["type"] == vtype, "unk.filtered"] = featuretable_this_type[
                        (featuretable_this_type["tag"] == "UNK") & (featuretable_this_type["FILTER"] != "")].shape[0]
                    res.ix[res["type"] == vtype, "ambi.filtered"] = featuretable_this_type[
                        (featuretable_this_type["tag"] == "AMBI") & (featuretable_this_type["FILTER"] != "")].shape[0]

                if args.af_strat:
                    start = 0.0
                    current_binsize = args.af_strat_binsize[0]
                    next_binsize = 0
                    while start < 1.0:
                        # include 1 in last interval
                        end = min(1.000000001, start + current_binsize)
                        n_tp = featuretable_this_type[(featuretable_this_type["tag"] == "TP") &
                                                      (featuretable_this_type[af_t_feature] >= start) &
                                                      (featuretable_this_type[af_t_feature] < end)]
                        n_fn = featuretable_this_type[(featuretable_this_type["tag"] == "FN") &
                                                      (featuretable_this_type[af_t_feature] >= start) &
                                                      (featuretable_this_type[af_t_feature] < end)]
                        n_fp = featuretable_this_type[(featuretable_this_type["tag"] == "FP") &
                                                      (featuretable_this_type[af_q_feature] >= start) &
                                                      (featuretable_this_type[af_q_feature] < end)]
                        n_ambi = featuretable_this_type[(featuretable_this_type["tag"] == "AMBI") &
                                                        (featuretable_this_type[af_q_feature] >= start) &
                                                        (featuretable_this_type[af_q_feature] < end)]
                        n_unk = featuretable_this_type[(featuretable_this_type["tag"] == "UNK") &
                                                       (featuretable_this_type[af_q_feature] >= start) &
                                                       (featuretable_this_type[af_q_feature] < end)]

                        r = {"type": "%s.%f-%f" % (vtype, start, end),
                             "total.truth": n_tp.shape[0] + n_fn.shape[0],
                             "total.query": n_tp.shape[0] + n_fp.shape[0] + n_ambi.shape[0] + n_unk.shape[0],
                             "tp": n_tp.shape[0],
                             "fp": n_fp.shape[0],
                             "fn": n_fn.shape[0],
                             "unk": n_unk.shape[0],
                             "ambi": n_ambi.shape[0], }

                        if args.count_filtered_fn:
                            r["fp.filtered"] = n_fp[n_fp["FILTER"] != ""].shape[0]
                            r["tp.filtered"] = n_tp[n_tp["FILTER"] != ""].shape[0]
                            r["unk.filtered"] = n_unk[n_unk["FILTER"] != ""].shape[0]
                            r["ambi.filtered"] = n_ambi[n_ambi["FILTER"] != ""].shape[0]

                        res = pandas.concat([res, pandas.DataFrame([r])])

                        if args.roc is not None and (n_tp.shape[0] + n_fn.shape[0] + n_fp.shape[0]) > 0:
                            roc_table_strat = args.roc.from_table(pandas.concat([n_tp, n_fp, n_fn]))
                            rtname = "%s.%s.%f-%f.roc.csv" % (args.output, vtype, start, end)
                            roc_table_strat.to_csv(rtname, float_format='%.8f')
                        start += current_binsize
                        next_binsize += 1
                        if next_binsize >= len(args.af_strat_binsize):
                            next_binsize = 0
                        current_binsize = args.af_strat_binsize[next_binsize]

        # remove things where we haven't seen any variants in truth and query
        res = res[(res["total.truth"] > 0) & (res["total.query"] > 0)]
        # summary metrics
        res["recall"] = res["tp"] / (res["tp"] + res["fn"])
        res["recall2"] = res["tp"] / (res["total.truth"])
        res["precision"] = res["tp"] / (res["tp"] + res["fp"])
        res["na"] = res["unk"] / (res["total.query"])
        res["ambiguous"] = res["ambi"] / res["total.query"]

        any_fp = fpclasses.countbases(label="FP")

        fp_region_count = 0
        auto_size = True
        if args.fpr_size:
            try:
                fp_region_count = int(args.fpr_size)
                auto_size = False
            except:
                pass
        if auto_size:
            if any_fp:
                if args.location:
                    chrom, _, rest = args.location.partition(":")
                    if rest:
                        start, _, end = rest.partition("_")
                        if start:
                            start = int(start)
                        if end:
                            end = int(end)
                    else:
                        fp_region_count += fpclasses.countbases(chrom, label="FP")
                else:
                    fp_region_count = any_fp
            else:
                cs = fastaContigLengths(args.ref)
                if args.location:
                    fp_region_count = calculateLength(cs, args.location)
                else:
                    # use all locations we saw calls on
                    h1 = Tools.vcfextract.extractHeadersJSON(ntpath)
                    h1_chrs = h1["tabix"]["chromosomes"]
                    if not h1_chrs:
                        logging.warn("ntpath is empty")
                        h1_chrs = []

                    h2 = Tools.vcfextract.extractHeadersJSON(nqpath)
                    h2_chrs = h2["tabix"]["chromosomes"]
                    if not h2_chrs:
                        logging.warn("nqpath is empty")
                        h2_chrs = []
                    
                    combined_chrs = list(set(h1_chrs + h2_chrs))
                    if len(combined_chrs) > 0:
                        qlocations = " ".join(combined_chrs)
                        fp_region_count = calculateLength(cs, qlocations)
                    else:
                        fp_region_count = 0

        res["fp.region.size"] = fp_region_count
        res["fp.rate"] = 1e6 * res["fp"] / res["fp.region.size"]

        if args.count_filtered_fn:
            res["recall.filtered"] = (res["tp"] - res["tp.filtered"]) / (res["tp"] + res["fn"])

            res["precision.filtered"] = (res["tp"] - res["tp.filtered"]) / (res["tp"] - res["tp.filtered"] +
                                                                            res["fp"] - res["fp.filtered"])

            res["fp.rate.filtered"] = 1e6 * (res["fp"] - res["fp.filtered"]) / res["fp.region.size"]

            res["na.filtered"] = (res["unk"] - res["unk.filtered"]) / (res["total.query"])
            res["ambiguous.filtered"] = (res["ambi"] - res["ambi.filtered"]) / res["total.query"]

        # HAP-162 remove inf values
        res.replace([np.inf, -np.inf], 0)
        metrics_output["metrics"].append(dataframeToMetricsTable("result", res))
        vstring = "som.py-%s" % Tools.version

        logging.info("\n" + res.to_string())
        # in default mode, print result summary to stdout
        if not args.quiet and not args.verbose:
            print "\n" + res.to_string()

        res["sompyversion"] = vstring

        vstring = " ".join(sys.argv)
        res["sompycmd"] = vstring
        res.to_csv(args.output + ".stats.csv")
        with open(args.output + ".metrics.json", "w") as fp:
            json.dump(metrics_output, fp)

    finally:
        if args.delete_scratch:
            shutil.rmtree(scratch)
        else:
            logging.info("Scratch kept at %s" % scratch)

Example #7

Show file

def main():

    args = parse_args()

    if args.scratch_prefix:
        scratch = os.path.abspath(args.scratch_prefix)
        args.delete_scratch = False
        Tools.mkdir_p(scratch)
    else:
        scratch = tempfile.mkdtemp()

    logging.info("Scratch path is %s" % scratch)
    try:
        bams = []
        md = None
        for x in args.bams:
            bams.append(bamStats(x))

        if bams:
            bres = pandas.concat(bams).groupby("CHROM").mean()

            md = {}

            for x in bres.index:
                logging.info("Mean coverage on %s is %f" %
                             (x, bres.loc[x]["COVERAGE"]))
                md[x] = float(bres.loc[x]["COVERAGE"]) * 3.0

        logging.info("Normalizing/reading inputs")

        ntpath = os.path.join(scratch, "normalized_truth.vcf.gz")

        if not (args.cont and os.path.exists(ntpath)):
            preprocessVCF(
                args.truth,
                ntpath,
                args.location,
                True,  # pass_only
                args.fixchr_truth,  # chrprefix
                args.normalize_truth,  # norm,
                args.regions_bedfile,
                args.targets_bedfile,
                args.ref)
        else:
            logging.info("Continuing from %s" % ntpath)

        if not (args.cont and os.path.exists(ntpath + ".csi")):
            runBcftools("index", ntpath)

        nqpath = os.path.join(scratch, "normalized_query.vcf.gz")

        if not (args.cont and os.path.exists(nqpath)):
            preprocessVCF(
                args.query,
                nqpath,
                args.location,
                not args.inc_nonpass,  # pass_only
                args.fixchr_query,  # chrprefix
                args.normalize_query,  # norm,
                args.regions_bedfile,
                args.targets_bedfile,
                args.ref)
        else:
            logging.info("Continuing from %s" % nqpath)

        if not (args.cont and os.path.exists(nqpath + ".csi")):
            runBcftools("index", nqpath)

        logging.info("Intersecting")

        tpfn_files = all([
            os.path.exists(os.path.join(scratch, "tpfn", "0000.vcf.gz")),
            os.path.exists(os.path.join(scratch, "tpfn", "0001.vcf.gz")),
            os.path.exists(os.path.join(scratch, "tpfn", "0002.vcf.gz"))
        ])

        tpfn_r_files = all([
            os.path.exists(os.path.join(scratch, "tpfn", "0000.vcf.gz")),
            os.path.exists(os.path.join(scratch, "tpfn", "0001.vcf.gz")),
            os.path.exists(os.path.join(scratch, "tpfn", "0002.vcf.gz"))
        ])

        if not (args.cont and tpfn_files):
            runBcftools("isec", ntpath, nqpath, "-p",
                        os.path.join(scratch, "tpfn"), "-O", "z")
        else:
            logging.info("Continuing from %s" % os.path.join(scratch, "tpfn"))

        if args.features and not (args.cont and tpfn_r_files):
            # only need to do this for getting the feature table
            runBcftools("isec", nqpath, ntpath, "-p",
                        os.path.join(scratch, "tpfn_r"), "-O", "z")

        logging.info("Getting FPs / Ambi / Unk")

        fppath = os.path.join(scratch, "fp.vcf.gz")
        unkpath = os.path.join(scratch, "unk.vcf.gz")
        ambipath = os.path.join(scratch, "ambi.vcf.gz")

        # get header to print to unk and ambi VCFs
        rununiquepath = os.path.join(scratch, "tpfn", "0001.vcf.gz")
        header = runBcftools("view", rununiquepath, "--header-only")

        fp = Tools.BGZipFile(fppath, True)
        fp.write(header)

        unk = Tools.BGZipFile(unkpath, True)
        unk.write(header)

        ambi = Tools.BGZipFile(ambipath, True)
        ambi.write(header)

        ambiClasses = Counter()
        ambiReasons = Counter()

        fpclasses = BedIntervalTree()
        if args.ambi:
            # can have multiple ambiguous BED files
            for aBED in args.ambi:
                # auto-label from first value after chr start end
                # new ambi files have the label in position 4
                # old ones will look weird here.
                fpclasses.addFromBed(aBED, lambda xe: xe[4], args.fixchr_truth)

        if args.FP:
            fpclasses.addFromBed(args.FP, "FP", args.fixchr_truth)

        # split VCF into FP, UNK and AMBI
        toProcess = gzip.open(rununiquepath, "rb")
        for entry in toProcess:
            if entry[0] == '#':
                continue

            fields = entry.strip().split("\t")
            chrom = fields[0]
            start = int(fields[1])
            stop = int(fields[1]) + len(fields[3])

            overlap = fpclasses.intersect(chrom, start, stop)

            is_fp = False
            is_ambi = False

            classes_this_pos = set()

            for o in overlap:
                reason = o.value[0]
                if reason == "fp" and args.ambi_fp:
                    reason = "FP"
                elif reason == "fp":
                    reason = "ambi-fp"
                elif reason == "unk":
                    reason = "ambi-unk"

                classes_this_pos.add(reason)
                try:
                    ambiReasons["%s: rep. count %s" %
                                (reason, o.value[1])] += 1
                except IndexError:
                    ambiReasons["%s: rep. count *" % reason] += 1
                for x in o.value[3:]:
                    ambiReasons["%s: %s" % (reason, x)] += 1
                if reason == "FP":
                    is_fp = True
                else:
                    is_ambi = True

            for reason in classes_this_pos:
                ambiClasses[reason] += 1

            if is_fp:
                fp.write(entry)
            elif is_ambi:
                ambi.write(entry)
            elif not args.count_unk:
                # when we don't have FP regions, unk stuff becomes FP
                fp.write(entry)
            else:
                unk.write(entry)

        toProcess.close()

        # since 0001.vcf.gz should already be sorted, we can just convert to bgzipped vcf
        # and create index
        fp.close()
        ambi.close()
        unk.close()

        runBcftools("index", "--tbi", fppath)
        runBcftools("index", "--tbi", unkpath)
        runBcftools("index", "--tbi", ambipath)

        logging.info("Counting variants...")

        truthcounts = parseStats(runBcftools("stats", ntpath), "total.truth")
        querycounts = parseStats(runBcftools("stats", nqpath), "total.query")

        tpcounts = parseStats(
            runBcftools("stats", os.path.join(scratch, "tpfn", "0002.vcf.gz")),
            "tp")
        fncounts = parseStats(
            runBcftools("stats", os.path.join(scratch, "tpfn", "0000.vcf.gz")),
            "fn")
        fpcounts = parseStats(runBcftools("stats", fppath), "fp")
        ambicounts = parseStats(runBcftools("stats", ambipath), "ambi")
        unkcounts = parseStats(runBcftools("stats", unkpath), "unk")

        res = pandas.merge(truthcounts, querycounts, on="type")
        res = pandas.merge(res, tpcounts, on="type")
        res = pandas.merge(res, fpcounts, on="type")
        res = pandas.merge(res, fncounts, on="type")
        res = pandas.merge(res, unkcounts, on="type")
        res = pandas.merge(res, ambicounts, on="type")

        # no explicit guarantee that total.query is equal to unk + ambi + fp + tp
        # testSum = res["fp"] + res["tp"] + res["unk"] + res["ambi"]

        # filter and relabel
        res = res[res["type"] != "samples"]
        res = res[res["type"] != "multiallelic SNP sites"]
        res = res[res["type"] != "multiallelic sites"]
        res.loc[res["type"] == "SNPs", "type"] = "SNVs"

        metrics_output = makeMetricsObject("som.py.comparison")

        if args.ambi and args.explain_ambiguous:
            ac = list(ambiClasses.iteritems())
            if ac:
                ambie = pandas.DataFrame(ac, columns=["class", "count"])
                ambie.sort_values(["class"], inplace=True)
                pandas.set_option("display.max_rows", 1000)
                pandas.set_option("display.max_columns", 1000)
                pandas.set_option("display.width", 1000)
                pandas.set_option("display.height", 1100)
                logging.info(
                    "FP/ambiguity classes with info (multiple classes can "
                    "overlap):\n" + ambie.to_string(index=False))
                # in default mode, print result summary to stdout
                if not args.quiet and not args.verbose:
                    print "FP/ambiguity classes with info (multiple classes can " \
                          "overlap):\n" + ambie.to_string(index=False)
                ambie.to_csv(args.output + ".ambiclasses.csv")
                metrics_output["metrics"].append(
                    dataframeToMetricsTable("ambiclasses", ambie))
            else:
                logging.info("No ambiguous variants.")

            ar = list(ambiReasons.iteritems())
            if ar:
                ambie = pandas.DataFrame(ar, columns=["reason", "count"])
                ambie.sort_values(["reason"], inplace=True)
                pandas.set_option("display.max_rows", 1000)
                pandas.set_option("display.max_columns", 1000)
                pandas.set_option("display.width", 1000)
                pandas.set_option("display.height", 1100)
                logging.info(
                    "Reasons for defining as ambiguous (multiple reasons can overlap):\n"
                    + ambie.to_string(formatters={
                        'reason':
                        '{{:<{}s}}'.format(
                            ambie['reason'].str.len().max()).format
                    },
                                      index=False))
                # in default mode, print result summary to stdout
                if not args.quiet and not args.verbose:
                    print "Reasons for defining as ambiguous (multiple reasons can overlap):\n" + ambie.to_string(
                        formatters={
                            'reason':
                            '{{:<{}s}}'.format(
                                ambie['reason'].str.len().max()).format
                        },
                        index=False)
                ambie.to_csv(args.output + ".ambireasons.csv")
                metrics_output["metrics"].append(
                    dataframeToMetricsTable("ambireasons", ambie))
            else:
                logging.info("No ambiguous variants.")

        if args.features:
            logging.info("Extracting features...")
            fset = Somatic.FeatureSet.make(args.features)
            fset.setChrDepths(md)

            logging.info("Collecting TP info (1)...")
            tps = fset.collect(os.path.join(scratch, "tpfn", "0002.vcf.gz"),
                               "TP")

            # TP_r is a hint for fset, they are both TPs
            logging.info("Collecting TP info (2)...")
            tps2 = fset.collect(os.path.join(scratch, "tpfn_r", "0002.vcf.gz"),
                                "TP_r")

            # this is slow because it tries to sort
            # ... which we don't need to do since tps1 and tps2 have the same ordering

            logging.info("Sorting...")
            tps.sort_values(["CHROM", "POS"], inplace=True)
            tps2.sort_values(["CHROM", "POS"], inplace=True)
            tps = tps.reset_index(drop=True)
            tps2 = tps2.reset_index(drop=True)

            logging.info("Merging TP info...")
            columns_tps = list(tps)
            columns_tps2 = list(tps2)

            len1 = tps.shape[0]
            len2 = tps2.shape[0]

            if len1 != len2:
                raise Exception(
                    "Cannot read TP features, lists have different lengths : %i != %i"
                    % (len1, len2))

            if not args.disable_order_check:
                logging.info("Checking order %i / %i" % (len1, len2))

                for x in xrange(0, len1):
                    for a in ["CHROM", "POS"]:
                        if tps.loc[x][a] != tps2.loc[x][a]:
                            raise Exception(
                                "Cannot merge TP features, inputs are out of order at %s / %s"
                                % (str(tps[x:x + 1]), str(tps2[x:x + 1])))

            logging.info("Merging...")

            cdata = {
                "CHROM": tps["CHROM"],
                "POS": tps["POS"],
                "tag": tps["tag"]
            }

            tpc = pandas.DataFrame(cdata, columns=["CHROM", "POS", "tag"])

            all_columns = list(set(columns_tps + columns_tps2))
            for a in all_columns:
                if a in columns_tps and a not in columns_tps2:
                    tpc[a] = tps[a]
                elif a not in columns_tps and a in columns_tps2:
                    tpc[a] = tps2[a]
                elif a not in ["CHROM", "POS", "tag"]:
                    tpc[a] = tps2[a]
                    tpc[a + ".truth"] = tps[a]

            logging.info("Collecting FP info...")
            fps = fset.collect(fppath, "FP")
            ambs = fset.collect(ambipath, "AMBI")

            logging.info("Collecting FN info...")
            fns = fset.collect(os.path.join(scratch, "tpfn", "0000.vcf.gz"),
                               "FN")

            renamed = {}
            tp_cols = list(tpc)
            for col in list(fns):
                if col + ".truth" in tp_cols:
                    renamed[col] = col + ".truth"
            fns.rename(columns=renamed, inplace=True)

            featurelist = [tpc, fps, fns, ambs]

            if unkpath is not None:
                logging.info("Collecting UNK info...")
                unk = fset.collect(unkpath, "UNK")
                featurelist.append(unk)

            logging.info("Making feature table...")
            featuretable = pandas.concat(featurelist)

            # reorder to make more legible
            first_columns = ["CHROM", "POS", "tag"]
            # noinspection PyTypeChecker
            all_columns = list(featuretable)

            if "REF" in all_columns:
                first_columns.append("REF")

            if "REF.truth" in all_columns:
                first_columns.append("REF.truth")

            if "ALT" in all_columns:
                first_columns.append("ALT")

            if "ALT.truth" in all_columns:
                first_columns.append("ALT.truth")

            ordered_columns = first_columns + sorted(
                [x for x in all_columns if x not in first_columns])
            featuretable = featuretable[ordered_columns]
            # make sure positions are integers
            featuretable["POS"] = featuretable["POS"].astype(int)

            logging.info("Saving feature table...")
            featuretable.to_csv(args.output + ".features.csv",
                                float_format='%.8f')

            if args.roc is not None:
                roc_table = args.roc.from_table(featuretable)
                roc_table.to_csv(args.output + ".roc.csv", float_format='%.8f')

            featuretable["FILTER"].fillna("", inplace=True)
            featuretable.ix[featuretable["REF"].str.len() < 1, "absent"] = True
            featuretable.ix[featuretable["tag"] == "FN",
                            "REF"] = featuretable.ix[featuretable["tag"] ==
                                                     "FN", "REF.truth"]
            featuretable.ix[featuretable["tag"] == "FN",
                            "ALT"] = featuretable.ix[featuretable["tag"] ==
                                                     "FN", "ALT.truth"]
            af_t_feature = args.af_strat_truth
            af_q_feature = args.af_strat_query
            for vtype in ["records", "SNVs", "indels"]:
                featuretable["vtype"] = resolve_vtype(args)
                featuretable_this_type = featuretable

                if args.count_filtered_fn:
                    res.ix[res["type"] == vtype,
                           "fp.filtered"] = featuretable_this_type[
                               (featuretable_this_type["tag"] == "FP")
                               & (featuretable_this_type["FILTER"] != ""
                                  )].shape[0]
                    res.ix[res["type"] == vtype,
                           "tp.filtered"] = featuretable_this_type[
                               (featuretable_this_type["tag"] == "TP")
                               & (featuretable_this_type["FILTER"] != ""
                                  )].shape[0]
                    res.ix[res["type"] == vtype,
                           "unk.filtered"] = featuretable_this_type[
                               (featuretable_this_type["tag"] == "UNK")
                               & (featuretable_this_type["FILTER"] != ""
                                  )].shape[0]
                    res.ix[res["type"] == vtype,
                           "ambi.filtered"] = featuretable_this_type[
                               (featuretable_this_type["tag"] == "AMBI")
                               & (featuretable_this_type["FILTER"] != ""
                                  )].shape[0]

                if args.af_strat:
                    start = 0.0
                    end = 1.0
                    current_binsize = args.af_strat_binsize[0]
                    next_binsize = 0
                    while start < 1.0:
                        # include 1 in last interval
                        end = start + current_binsize
                        if end >= 1:
                            end = 1.00000001
                        if start >= end:
                            break
                        n_tp = featuretable_this_type[
                            (featuretable_this_type["tag"] == "TP")
                            & (featuretable_this_type[af_t_feature] >= start) &
                            (featuretable_this_type[af_t_feature] < end)]
                        n_fn = featuretable_this_type[
                            (featuretable_this_type["tag"] == "FN")
                            & (featuretable_this_type[af_t_feature] >= start) &
                            (featuretable_this_type[af_t_feature] < end)]
                        n_fp = featuretable_this_type[
                            (featuretable_this_type["tag"] == "FP")
                            & (featuretable_this_type[af_q_feature] >= start) &
                            (featuretable_this_type[af_q_feature] < end)]
                        n_ambi = featuretable_this_type[
                            (featuretable_this_type["tag"] == "AMBI")
                            & (featuretable_this_type[af_q_feature] >= start) &
                            (featuretable_this_type[af_q_feature] < end)]
                        n_unk = featuretable_this_type[
                            (featuretable_this_type["tag"] == "UNK")
                            & (featuretable_this_type[af_q_feature] >= start) &
                            (featuretable_this_type[af_q_feature] < end)]

                        r = {
                            "type":
                            "%s.%f-%f" % (vtype, start, end),
                            "total.truth":
                            n_tp.shape[0] + n_fn.shape[0],
                            "total.query":
                            n_tp.shape[0] + n_fp.shape[0] + n_ambi.shape[0] +
                            n_unk.shape[0],
                            "tp":
                            n_tp.shape[0],
                            "fp":
                            n_fp.shape[0],
                            "fn":
                            n_fn.shape[0],
                            "unk":
                            n_unk.shape[0],
                            "ambi":
                            n_ambi.shape[0]
                        }

                        if args.count_filtered_fn:
                            r["fp.filtered"] = n_fp[
                                n_fp["FILTER"] != ""].shape[0]
                            r["tp.filtered"] = n_tp[
                                n_tp["FILTER"] != ""].shape[0]
                            r["unk.filtered"] = n_unk[
                                n_unk["FILTER"] != ""].shape[0]
                            r["ambi.filtered"] = n_ambi[
                                n_ambi["FILTER"] != ""].shape[0]

                        res = pandas.concat([res, pandas.DataFrame([r])])

                        if args.roc is not None and (n_tp.shape[0] +
                                                     n_fn.shape[0] +
                                                     n_fp.shape[0]) > 0:
                            roc_table_strat = args.roc.from_table(
                                pandas.concat([n_tp, n_fp, n_fn]))
                            rtname = "%s.%s.%f-%f.roc.csv" % (
                                args.output, vtype, start, end)
                            roc_table_strat.to_csv(rtname, float_format='%.8f')
                        start = end
                        next_binsize += 1
                        if next_binsize >= len(args.af_strat_binsize):
                            next_binsize = 0
                        current_binsize = args.af_strat_binsize[next_binsize]

        if not args.af_strat:
            res = res[(res["total.truth"] > 0)]

        # summary metrics with confidence intervals
        ci_alpha = 1.0 - args.ci_level

        recall = binomialCI(res["tp"], res["tp"] + res["fn"], ci_alpha)
        precision = binomialCI(res["tp"], res["tp"] + res["fp"], ci_alpha)
        res["recall"], res["recall_lower"], res["recall_upper"] = recall
        res["recall2"] = res["tp"] / (res["total.truth"])
        res["precision"], res["precision_lower"], res[
            "precision_upper"] = precision
        res["na"] = res["unk"] / (res["total.query"])
        res["ambiguous"] = res["ambi"] / res["total.query"]

        any_fp = fpclasses.countbases(label="FP")

        fp_region_count = 0
        auto_size = True
        if args.fpr_size:
            try:
                fp_region_count = int(args.fpr_size)
                auto_size = False
            except:
                pass
        if auto_size:
            if any_fp:
                if args.location:
                    chrom, _, rest = args.location.partition(":")
                    if rest:
                        start, _, end = rest.partition("_")
                        if start:
                            start = int(start)
                        if end:
                            end = int(end)
                    else:
                        fp_region_count += fpclasses.countbases(chrom,
                                                                label="FP")
                else:
                    fp_region_count = any_fp
            else:
                cs = fastaContigLengths(args.ref)
                if args.location:
                    fp_region_count = calculateLength(cs, args.location)
                else:
                    # use all locations we saw calls on
                    h1 = Tools.vcfextract.extractHeadersJSON(ntpath)
                    h1_chrs = h1["tabix"]["chromosomes"]

                    if not h1_chrs:
                        logging.warn("No contigs in truth file")
                        h1_chrs = []

                    if len(h1_chrs) > 0:
                        qlocations = " ".join(h1_chrs)
                        fp_region_count = calculateLength(cs, qlocations)
                    else:
                        fp_region_count = 0

        res["fp.region.size"] = fp_region_count
        res["fp.rate"] = 1e6 * res["fp"] / res["fp.region.size"]

        if args.count_filtered_fn:
            res["recall.filtered"] = (res["tp"] - res["tp.filtered"]) / (
                res["tp"] + res["fn"])

            res["precision.filtered"] = (res["tp"] - res["tp.filtered"]) / (
                res["tp"] - res["tp.filtered"] + res["fp"] -
                res["fp.filtered"])

            res["fp.rate.filtered"] = 1e6 * (
                res["fp"] - res["fp.filtered"]) / res["fp.region.size"]

            res["na.filtered"] = (res["unk"] -
                                  res["unk.filtered"]) / (res["total.query"])
            res["ambiguous.filtered"] = (
                res["ambi"] - res["ambi.filtered"]) / res["total.query"]

        # HAP-162 remove inf values
        res.replace([np.inf, -np.inf], 0)

        metrics_output["metrics"].append(dataframeToMetricsTable(
            "result", res))
        vstring = "som.py-%s" % Tools.version

        logging.info("\n" + res.to_string())
        # in default mode, print result summary to stdout
        if not args.quiet and not args.verbose:
            print "\n" + res.to_string()

        res["sompyversion"] = vstring

        vstring = " ".join(sys.argv)
        res["sompycmd"] = vstring

        # save results
        res.to_csv(args.output + ".stats.csv")

        with open(args.output + ".metrics.json", "w") as fp:
            json.dump(metrics_output, fp)

        if args.happy_stats:
            # parse saved feature table as the one in memory has been updated
            featuretable = pandas.read_csv(args.output + ".features.csv",
                                           low_memory=False,
                                           dtype={"FILTER": str})

            # hap.py summary.csv
            summary = summary_from_featuretable(featuretable, args)
            summary.to_csv(args.output + ".summary.csv")

            #  hap.py extended.csv
            if args.af_strat:
                extended = extended_from_featuretable(featuretable, args)
                extended.to_csv(args.output + ".extended.csv",
                                index=False,
                                na_rep="NA")

    finally:
        if args.delete_scratch:
            shutil.rmtree(scratch)
        else:
            logging.info("Scratch kept at %s" % scratch)

Example #8

Show file

File: pre.py Project: pkrusche/hap.py

def preprocess(vcf_input,
               vcf_output,
               reference,
               locations=None,
               filters=None,
               fixchr=None,
               regions=None,
               targets=None,
               leftshift=True,
               decompose=True,
               bcftools_norm=False,
               windowsize=10000,
               threads=1,
               ):
    """ Preprocess a single VCF file

    :param vcf_input: input file name
    :param vcf_output: output file name
    :param reference: reference fasta name
    :param locations: list of locations or None
    :param filters: list of filters to apply ("*" to only allow PASS)
    :param fixchr: None for auto, or True/False -- fix chr prefix to match reference
    :param regions: regions bed file
    :param targets: targets bed file
    :param leftshift: left-shift variants
    :param decompose: decompose variants
    :param bcftools_norm: use bcftools_norm
    :param windowsize: normalisation window size
    :param threads: number of threads to for preprcessing
    """

    tempfiles = []
    try:
        # If the input is in BCF format, we can continue to
        # process it in bcf
        # if it is in .vcf.gz, don't try to convert it to
        # bcf because there are a range of things that can
        # go wrong there (e.g. undefined contigs and bcftools
        # segfaults)
        if vcf_input.endswith(".bcf") or vcf_output.endswith(".bcf"):
            int_suffix = ".bcf"
            int_format = "b"
            if not vcf_input.endswith(".bcf") and vcf_output.endswith(".bcf"):
                logging.warn("Turning vcf into bcf can cause problems when headers aren't consistent with all "
                             "records in the file. I will run vcfcheck to see if we will run into trouble. "
                             "To save time in the future, consider converting your files into bcf using bcftools before"
                             " running pre.py.")
                subprocess.check_call("vcfcheck %s" % vcf_input, shell=True)
        else:
            int_suffix = ".vcf.gz"
            int_format = "z"

        h = vcfextract.extractHeadersJSON(vcf_input)
        reference_contigs = set(fastaContigLengths(reference).keys())
        reference_has_chr_prefix = hasChrPrefix(reference_contigs)

        allfilters = []
        for f in h["fields"]:
            try:
                if f["key"] == "FILTER":
                    allfilters.append(f["values"]["ID"])
            except:
                logging.warn("ignoring header: %s" % str(f))

        required_filters = None
        if filters:
            fts = filters.split(",")
            required_filters = ",".join(list(set(["PASS", "."] + [x for x in allfilters if x not in fts])))

        if fixchr is None:
            try:
                if not h["tabix"]:
                    logging.warn("input file is not tabix indexed, consider doing this in advance for performance reasons")
                    vtf = tempfile.NamedTemporaryFile(delete=False,
                                                      suffix=int_suffix)
                    vtf.close()
                    tempfiles.append(vtf.name)
                    runBcftools("view", "-o", vtf.name, "-O", int_format, vcf_input)
                    runBcftools("index", vtf.name)
                    h2 = vcfextract.extractHeadersJSON(vcf_input)
                    chrlist = h2["tabix"]["chromosomes"]
                else:
                    chrlist = h["tabix"]["chromosomes"]
                vcf_has_chr_prefix = hasChrPrefix(h["tabix"]["chromosomes"])

                if reference_has_chr_prefix and not vcf_has_chr_prefix:
                    fixchr = True
            except:
                logging.warn("Guessing the chr prefix in %s has failed." % vcf_input)

        # all these require preprocessing
        vtf = vcf_input

        if leftshift or decompose:
            vtf = tempfile.NamedTemporaryFile(delete=False,
                                              suffix=int_suffix)
            vtf.close()
            tempfiles.append(vtf.name)
            vtf = vtf.name
        else:
            vtf = vcf_output

        preprocessVCF(vcf_input,
                      vtf,
                      locations,
                      filters == "*",
                      fixchr,
                      bcftools_norm,
                      regions,
                      targets,
                      reference,
                      required_filters)

        if leftshift or decompose:
            Haplo.partialcredit.partialCredit(vtf,
                                              vcf_output,
                                              reference,
                                              locations,
                                              threads=threads,
                                              window=windowsize,
                                              leftshift=leftshift,
                                              decompose=decompose)
    finally:
        for t in tempfiles:
            try:
                os.unlink(t)
            except:
                pass