Exemple #1
0
def quantify(args):
    """Run quantify on a comparison VCF and write the result tables.

    Reads its whole configuration from ``args`` (an argparse-style
    namespace).  Attributes read here:

    * ``in_vcf`` -- sequence whose first element is the input VCF path.
    * ``reports_prefix`` -- prefix for every output file name.
    * ``bcf`` -- when true, the annotated output uses the ``.bcf`` suffix.
    * ``fp_bedfile``, ``strat_tsv``, ``strat_regions`` -- sources of named
      quantification regions.
    * ``ref``, ``threads``, ``output_vtc``, ``do_roc``, ``type``, ``roc``,
      ``roc_filter``, ``roc_delta``, ``roc_regions``, ``preserve_info``,
      ``strat_fixchr``, ``write_vcf`` -- forwarded to
      ``Haplo.quantify.run_quantify``.
    * ``runner``, ``ci_alpha``, ``write_counts``, ``write_json``,
      ``quiet``, ``verbose`` -- control reporting.
    * ``vcf1``, ``vcf2``, ``roc_header``, ``engine``, ``usefiltered`` --
      optional; a missing attribute is tolerated.

    Files written directly by this function:
    ``<prefix>.summary.csv``, ``<prefix>.extended.csv`` (only with
    ``write_counts``) and ``<prefix>.metrics.json.gz`` (only with
    ``write_json``).  ``<prefix>.roc.tsv`` is handed to ``run_quantify``
    and removed again unless ``verbose`` is set.

    Raises:
        Exception: if the input VCF is missing, a region file cannot be
            found, a stratification region ID is duplicated, or the
            output VCF name would collide with the input VCF.
    """
    vcf_name = args.in_vcf[0]

    if not vcf_name or not os.path.exists(vcf_name):
        raise Exception("Cannot read input VCF.")

    logging.info("Counting variants...")

    # args.vcf1 and args.vcf2 are only available when we're running
    # inside hap.py; only the missing-attribute case is tolerated here.
    try:
        truth_or_query_is_bcf = args.vcf1.endswith(".bcf") and args.vcf2.endswith(".bcf")
    except AttributeError:
        truth_or_query_is_bcf = False

    if args.bcf or truth_or_query_is_bcf:
        internal_format_suffix = ".bcf"
    else:
        internal_format_suffix = ".vcf.gz"

    output_vcf = args.reports_prefix + internal_format_suffix
    roc_table = args.reports_prefix + ".roc.tsv"

    # Maps region ID -> region file path.
    qfyregions = {}

    if args.fp_bedfile:
        if not os.path.exists(args.fp_bedfile):
            raise Exception("FP / Confident region file not found at %s" % args.fp_bedfile)
        qfyregions["CONF"] = args.fp_bedfile

    if args.strat_tsv:
        # Relative paths that do not resolve from the working directory are
        # retried relative to the directory holding the TSV.
        strat_dir = os.path.abspath(os.path.dirname(args.strat_tsv))
        with open(args.strat_tsv) as sf:
            for line in sf:
                n, _, f = line.strip().partition("\t")
                if n in qfyregions:
                    raise Exception("Duplicate stratification region ID: %s" % n)
                if not f:
                    if n:
                        raise Exception("No file for stratification region %s" % n)
                    # blank line
                    continue
                if not os.path.exists(f):
                    f = os.path.join(strat_dir, f)
                if not os.path.exists(f):
                    raise Exception("Quantification region file %s not found" % f)
                qfyregions[n] = f

    if args.strat_regions:
        # Each entry has the form "<name>:<file>".
        for r in args.strat_regions:
            n, _, f = r.partition(":")
            if not os.path.exists(f):
                raise Exception("Quantification region file %s not found" % f)
            qfyregions[n] = f

    if vcf_name == output_vcf or vcf_name == output_vcf + internal_format_suffix:
        raise Exception("Cannot overwrite input VCF: %s would overwritten with output name %s." % (vcf_name, output_vcf))

    # roc_header is optional; fall back to the ROC feature name.
    roc_header = getattr(args, "roc_header", args.roc)

    Haplo.quantify.run_quantify(vcf_name,
                                roc_table,
                                output_vcf if args.write_vcf else False,
                                qfyregions,
                                args.ref,
                                threads=args.threads,
                                output_vtc=args.output_vtc,
                                output_rocs=args.do_roc,
                                qtype=args.type,
                                roc_val=args.roc,
                                roc_header=roc_header,
                                roc_filter=args.roc_filter,
                                roc_delta=args.roc_delta,
                                roc_regions=args.roc_regions,
                                clean_info=not args.preserve_info,
                                strat_fixchr=args.strat_fixchr)

    metrics_output = makeMetricsObject("%s.comparison" % args.runner)

    filter_handling = None
    try:
        if args.engine == "vcfeval" or not args.usefiltered:
            filter_handling = "ALL" if args.usefiltered else "PASS"
    except AttributeError:
        # if we run this through qfy, these arguments are not present
        pass

    # Best effort: the total region size is optional input for the ROC
    # computation, so a failure here must not abort the run.
    total_region_size = None
    headers = Tools.vcfextract.extractHeadersJSON(vcf_name)
    try:
        contigs_to_use = ",".join(headers["tabix"]["chromosomes"])
        contig_lengths = fastasize.fastaNonNContigLengths(args.ref)
        total_region_size = fastasize.calculateLength(contig_lengths, contigs_to_use)
        logging.info("Subset.Size for * is %i, based on these contigs: %s " % (total_region_size, str(contigs_to_use)))
    except Exception as e:
        logging.info("Could not determine total region size: %s" % str(e))

    res = Haplo.happyroc.roc(roc_table, args.reports_prefix + ".roc",
                             filter_handling=filter_handling,
                             ci_alpha=args.ci_alpha,
                             total_region_size=total_region_size)
    df = res["all"]

    # only use summary numbers
    df = df[(df["QQ"] == "*") & (df["Filter"].isin(["ALL", "PASS"]))]

    summary_columns = ["Type",
                       "Filter",
                       "TRUTH.TOTAL",
                       "TRUTH.TP",
                       "TRUTH.FN",
                       "QUERY.TOTAL",
                       "QUERY.FP",
                       "QUERY.UNK",
                       "FP.gt",
                       "FP.al",
                       "METRIC.Recall",
                       "METRIC.Precision",
                       "METRIC.Frac_NA",
                       "METRIC.F1_Score",
                       "TRUTH.TOTAL.TiTv_ratio",
                       "QUERY.TOTAL.TiTv_ratio",
                       "TRUTH.TOTAL.het_hom_ratio",
                       "QUERY.TOTAL.het_hom_ratio"]

    # Remove subtype
    summary_df = df[(df["Subtype"] == "*") & (df["Genotype"] == "*") & (df["Subset"] == "*")]
    summary_table = summary_df[summary_columns]

    summary_table.to_csv(args.reports_prefix + ".summary.csv", index=False)

    metrics_output["metrics"].append(dataframeToMetricsTable("summary.metrics",
                                                             summary_table))

    if args.write_counts:
        df.to_csv(args.reports_prefix + ".extended.csv", index=False)
        metrics_output["metrics"].append(dataframeToMetricsTable("all.metrics", df))

    # Widen pandas' display limits so to_string() below does not wrap or
    # truncate the table.
    pandas.set_option('display.max_columns', 500)
    pandas.set_option('display.width', 1000)

    essential_numbers = summary_table[summary_table["Type"].isin(
        ["SNP", "INDEL"])]

    logging.info("\n" + essential_numbers.to_string(index=False))

    # in default mode, print result summary to stdout
    # (single-argument print(...) behaves the same on Python 2 and 3)
    if not args.quiet and not args.verbose:
        print("Benchmarking Summary:")
        print(essential_numbers.to_string(index=False))

    # keep this for verbose output
    if not args.verbose:
        try:
            os.unlink(roc_table)
        except OSError:
            # the table may not exist; removing it is only cleanup
            pass

    for t in res:
        metrics_output["metrics"].append(dataframeToMetricsTable("roc." + t, res[t]))

    # gzip JSON output; encode explicitly because the gzip handle is binary
    if args.write_json:
        with gzip.open(args.reports_prefix + ".metrics.json.gz", "wb") as fp:
            fp.write(json.dumps(metrics_output).encode("utf-8"))
Exemple #2
0
def main():
    parser = argparse.ArgumentParser("Haplotype Comparison")

    # input
    parser.add_argument('--location', '-l', dest='locations', required=False, default=None,
                        help='Add a location to the compare list (when not given, will use chr1-22, chrX, chrY).')

    parser.add_argument("-v", "--version", dest="version", action="store_true",
                        help="Show version number and exit.")

    parser.add_argument("-P", "--include-nonpass", dest="usefiltered", action="store_true", default=False,
                        help="Use to include failing query variants in comparison.")

    parser.add_argument("--include-nonpass-truth", dest="usefiltered_truth", action="store_true", default=False,
                        help="Include failing variants from the truth dataset.")

    parser.add_argument("-R", "--restrict-regions", dest="regions_bedfile",
                        default=None, type=str,
                        help="Restrict analysis to given (sparse) regions (using -R in bcftools).")

    parser.add_argument("-T", "--target-regions", dest="targets_bedfile",
                        default=None, type=str,
                        help="Restrict analysis to given (dense) regions (using -T in bcftools).")

    parser.add_argument("-f", "--false-positives", dest="fp_bedfile",
                        default=None, type=str,
                        help="False positive / confident call regions (.bed or .bed.gz).")

    parser.add_argument("-r", "--reference", dest="ref", default=None, help="Specify a reference file.")

    # output
    parser.add_argument("-o", "--report-prefix", dest="reports_prefix",
                        default=None,
                        help="Filename prefix for report output.")

    parser.add_argument("-V", "--write-vcf", dest="write_vcf",
                        default=False, action="store_true",
                        help="Write an annotated VCF.")

    parser.add_argument("-B", "--write-bed", dest="write_bed",
                        default=False, action="store_true",
                        help="Write a bed file with the haplotype blocks that were used.")

    parser.add_argument("-X", "--write-counts", dest="write_counts",
                        default=True, action="store_true",
                        help="Write advanced counts and metrics.")

    parser.add_argument("--no-write-counts", dest="write_counts",
                        default=True, action="store_false",
                        help="Do not write advanced counts and metrics.")

    parser.add_argument("--raw-counts", dest="raw_counts",
                        default=False, action="store_true",
                        help="Count variants in unprocessed input VCFs and output as TOTAL.*.RAW.")

    parser.add_argument("--roc", dest="roc", default=False,
                        help="Select an INFO feature to produce a ROC on. This works best with "
                             "--no-internal-preprocessing and --no-internal-leftshift since these "
                             "flags preserve the most INFO flags from the input files.")

    parser.add_argument("--roc-filter", dest="roc_filter", default=False,
                        help="Select a filter to ignore when making ROCs.")

    parser.add_argument("--roc-reversed", dest="roc_reversed", default=False,
                        help="Change the meaning of the ROC feature to count the other way around (higher values=bad).")

    parser.add_argument("--scratch-prefix", dest="scratch_prefix",
                        default=None,
                        help="Directory for scratch files.")

    parser.add_argument("--keep-scratch", dest="delete_scratch",
                        default=True, action="store_false",
                        help="Filename prefix for scratch report output.")

    # detailed control of comparison
    parser.add_argument("--preprocess-truth", dest="preprocessing_truth", action="store_true", default=False,
                        help="Preprocess truth file using bcftools.")

    parser.add_argument("--external-preprocessing", dest="preprocessing", action="store_true", default=False,
                        help="Perform VCF preprocessing using bcftools.")

    parser.add_argument("--bcftools-norm", dest="preprocessing_norm", action="store_true", default=False,
                        help="Enable preprocessing through bcftools norm -c x -D (requires external "
                             " preprocessing to be switched on).")

    parser.add_argument("--fixchr-truth", dest="fixchr_truth", action="store_true", default=None,
                        help="Add chr prefix to truth file (default: auto).")

    parser.add_argument("--fixchr-query", dest="fixchr_query", action="store_true", default=None,
                        help="Add chr prefix to query file (default: auto).")

    parser.add_argument("--no-fixchr-truth", dest="fixchr_truth", action="store_false",
                        help="Disable chr replacement for truth (default: auto).")

    parser.add_argument("--no-fixchr-query", dest="fixchr_query", action="store_false",
                        help="Add chr prefix to query file (default: auto).")

    parser.add_argument("--partial-credit", dest="partial_credit", action="store_true", default=None,
                        help="give credit for partially matched variants. "
                             "this is equivalent to --internal-leftshift and --internal-preprocessing.")

    parser.add_argument("--no-partial-credit", dest="partial_credit", action="store_false", default=None,
                        help="Give credit for partially matched variants. "
                             "This is equivalent to --internal-leftshift and --no-internal-preprocessing.")

    parser.add_argument("--internal-leftshift", dest="int_preprocessing_ls", action="store_true", default=None,
                        help="Switch off xcmp's internal VCF leftshift preprocessing.")

    parser.add_argument("--internal-preprocessing", dest="int_preprocessing", action="store_true", default=None,
                        help="Switch off xcmp's internal VCF leftshift preprocessing.")

    parser.add_argument("--no-internal-leftshift", dest="int_preprocessing_ls", action="store_false", default=None,
                        help="Switch off xcmp's internal VCF leftshift preprocessing.")

    parser.add_argument("--no-internal-preprocessing", dest="int_preprocessing", action="store_false", default=None,
                        help="Switch off xcmp's internal VCF leftshift preprocessing.")

    parser.add_argument("--match-raw", dest="int_match_raw", action="store_true", default=False,
                        help="Add a matching step in xcmp which also matches raw variant calls. This helps"
                             " when comparing files with very different representations.")

    parser.add_argument("--no-haplotype-comparison", dest="no_hc", action="store_true", default=False,
                        help="Disable haplotype comparison (only count direct GT matches as TP).")

    parser.add_argument("--unhappy", dest="unhappy", action="store_true", default=False,
                        help="Combination of --no-haplotype-comparison --no-internal-preprocessing "
                             "--no-internal-leftshift.")

    parser.add_argument("--no-auto-index", dest="auto_index", action="store_false", default=True,
                        help="Disable automatic index creation for input files. "
                             "The index is only necessary at this stage if we want to auto-detect locations. "
                             "When used with -l, and when it is known that there are variants at all given locations "
                             "this is not needed and can be switched off to save time.")

    parser.add_argument("-w", "--window-size", dest="window",
                        default=50, type=int,
                        help="Minimum distance between two variants such that they fall into different haplotype "
                             "blocks")

    parser.add_argument("--enumeration-threshold", dest="max_enum",
                        default=16768, type=int,
                        help="Enumeration threshold / maximum number of sequences to enumerate per block.")

    parser.add_argument("-e", "--expand-hapblocks", dest="hb_expand",
                        default=30, type=int,
                        help="Expand haplotype blocks by this many basepairs left and right.")
    parser.add_argument("--threads", dest="threads",
                        default=multiprocessing.cpu_count(), type=int,
                        help="Number of threads to use.")

    parser.add_argument("--engine", dest="engine",
                        default="xcmp", choices=["xcmp", "vcfeval"],
                        help="Comparison engine to use.")

    parser.add_argument("--engine-vcfeval-path", dest="engine_vcfeval", required=False,
                        help="This parameter should give the path to the \"rtg\" executable.")
    parser.add_argument("--engine-vcfeval-template", dest="engine_vcfeval_template", required=False,
                        help="Vcfeval needs the reference sequence formatted in its own file format "
                             "(SDF -- run rtg format -o ref.SDF ref.fa).")

    if Tools.has_sge:
        parser.add_argument("--force-interactive", dest="force_interactive",
                            default=False, action="store_true",
                            help="Force running interactively (i.e. when JOB_ID is not in the environment)")

    parser.add_argument("_vcfs", help="Two VCF files.", default=[], nargs="*")

    parser.add_argument("--logfile", dest="logfile", default=None,
                        help="Write logging information into file rather than to stderr")

    verbosity_options = parser.add_mutually_exclusive_group(required=False)

    verbosity_options.add_argument("--verbose", dest="verbose", default=False, action="store_true",
                                   help="Raise logging level from warning to info.")

    verbosity_options.add_argument("--quiet", dest="quiet", default=False, action="store_true",
                                   help="Set logging level to output errors only.")

    args, unknown_args = parser.parse_known_args()

    if not Tools.has_sge:
        args.force_interactive = True

    if args.verbose:
        loglevel = logging.INFO
    elif args.quiet:
        loglevel = logging.ERROR
    else:
        loglevel = logging.WARNING

    # reinitialize logging
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)
    logging.basicConfig(filename=args.logfile,
                        format='%(asctime)s %(levelname)-8s %(message)s',
                        level=loglevel)

    # remove some safe unknown args
    unknown_args = [x for x in unknown_args if x not in ["--force-interactive"]]
    if len(sys.argv) < 2 or len(unknown_args) > 0:
        if unknown_args:
            logging.error("Unknown arguments specified : %s " % str(unknown_args))
        parser.print_help()
        exit(0)

    if args.version:
        print "Hap.py %s" % Tools.version
        exit(0)

    if args.roc:
        args.write_vcf = True

    # disable all clever matching
    if args.unhappy:
        args.int_preprocessing = False
        args.int_preprocessing_ls = False
        args.no_hc = True

    # Counting with partial credit
    elif args.partial_credit:
        # partial_credit switch is overridden by --no-* switches
        args.int_preprocessing = True
        args.int_preprocessing_ls = True
    elif args.partial_credit is None:
        # in the default setting, we enable partial credit but only override the
        # preprocessing settings if they haven't been specified
        if args.int_preprocessing is None:
            args.int_preprocessing = True
        if args.int_preprocessing_ls is None:
            args.int_preprocessing_ls = True
    elif args.partial_credit is not None:  # explicitly set to false
        args.int_preprocessing = False
        args.int_preprocessing_ls = True

    if args.int_preprocessing is None:
        args.int_preprocessing = False
    if args.int_preprocessing_ls is None:
        args.int_preprocessing_ls = False

    logging.info("Preprocessing settings: %s / %s / %s" % ("leftshift" if args.int_preprocessing_ls else "no-leftshift",
                                                           "splitting" if args.int_preprocessing else "raw calls",
                                                           "haplocompare" if not args.no_hc else "no-haplocompare"))

    # sanity-check regions bed file (HAP-57)
    if args.regions_bedfile:
        logging.info("Checking input regions.")
        if bedOverlapCheck(args.regions_bedfile):
            raise Exception("The regions bed file (specified using -R) has overlaps, this will not work with xcmp."
                            " You can either use -T, or run the file through bedtools merge")
        args.preprocessing_truth = True
        args.preprocessing = True

    if args.targets_bedfile or args.engine != "xcmp":
        args.preprocessing_truth = True
        args.preprocessing = True

    if args.fp_bedfile and not os.path.exists(args.fp_bedfile):
        raise Exception("FP/confident call region bed file does not exist.")

    tempfiles = []

    try:
        if not args.force_interactive and "JOB_ID" not in os.environ:
            parser.print_help()
            raise Exception("Please qsub me so I get approximately 1 GB of RAM per thread.")

        if not args.ref:
            args.ref = Tools.defaultReference()

        if not os.path.exists(args.ref):
            raise Exception("Please specify a valid reference path using -r.")

        if not args.reports_prefix:
            raise Exception("Please specify an output prefix using -o ")

        if not os.path.exists(os.path.dirname(args.reports_prefix)):
            raise Exception("The output path does not exist. Please specify a valid output path and prefix using -o")

        if os.path.basename(args.reports_prefix) == "" or os.path.isdir(args.reports_prefix):
            raise Exception("The output path should specify a file name prefix. Please specify a valid output path "
                            "and prefix using -o. For example, -o /tmp/test will create files named /tmp/test* .")

        # noinspection PyProtectedMember
        if not args._vcfs or len(args._vcfs) != 2:
            raise Exception("Please specify exactly two input VCFs.")

        # noinspection PyProtectedMember
        args.vcf1 = args._vcfs[0]
        # noinspection PyProtectedMember
        args.vcf2 = args._vcfs[1]

        if not os.path.exists(args.vcf1):
            raise Exception("Input file %s does not exist." % args.vcf1)
        if not os.path.exists(args.vcf2):
            raise Exception("Input file %s does not exist." % args.vcf2)

        logging.info("Comparing %s and %s" % (args.vcf1, args.vcf2))

        h1 = vcfextract.extractHeadersJSON(args.vcf1)
        if args.auto_index and not h1["tabix"]:
            logging.info("Creating indexed version of %s -- consider creating an index beforehand to save time here." %
                         args.vcf1)
            vtf = tempfile.NamedTemporaryFile(delete=False,
                                              dir=args.scratch_prefix,
                                              prefix="truth.ix",
                                              suffix=".vcf.gz")
            vtf.close()
            tempfiles.append(vtf.name)
            tempfiles.append(vtf.name + ".tbi")
            args.vcf1 = Tools.bcftools.makeIndex(args.vcf1, vtf.name)
            h1 = vcfextract.extractHeadersJSON(args.vcf1)

        h2 = vcfextract.extractHeadersJSON(args.vcf2)
        if args.auto_index and not h2["tabix"]:
            logging.info("Creating indexed version of %s -- consider creating an index beforehand to save time here." %
                         args.vcf2)
            vtf = tempfile.NamedTemporaryFile(delete=False,
                                              dir=args.scratch_prefix,
                                              prefix="query.ix",
                                              suffix=".vcf.gz")
            vtf.close()
            tempfiles.append(vtf.name)
            tempfiles.append(vtf.name + ".tbi")
            args.vcf2 = Tools.bcftools.makeIndex(args.vcf2, vtf.name)
            h2 = vcfextract.extractHeadersJSON(args.vcf2)

        ref_check = False
        try:
            happy_ref = args.ref
            v1r = [_h for _h in h1["fields"] if _h["key"] == "reference"]
            v2r = [_h for _h in h2["fields"] if _h["key"] == "reference"]
            if args.verbose:
                logging.info("References used: hap.py: %s / truth: %s / "
                             "query: %s" % (str(happy_ref), str(v1r), str(v2r)))

            v1_ref = ";".join([str(xxy["value"]) for xxy in v1r]).replace("file://", "")
            v2_ref = ";".join([str(xxy["value"]) for xxy in v2r]).replace("file://", "")

            if happy_ref == v1_ref and v1_ref == v2_ref:
                ref_check = True

            refids_found = 0
            for refid in ["hg19", "hg38", "grc37", "grc38"]:
                if refid in happy_ref.lower() and refid in v1_ref.lower() and refid in v2_ref.lower():
                    if args.verbose:
                        logging.info("Reference matches pattern: %s" % refid)
                    refids_found += 1
            if refids_found == 1:
                ref_check = True
        except:
            pass

        if not ref_check:
            logging.warn("Reference sequence check failed! "
                         "Please ensure that truth and query VCF use the same reference sequence as "
                         "hap.py. XCMP may fail if this is not the case, and the results will not be "
                         " accurate.")

        if args.locations is None or len(args.locations) == 0:
            # all chromosomes
            args.locations = ["chr" + x for x in map(str, range(1, 23))]

        if type(args.locations) is not list and args.locations is not None:
            # noinspection PyUnresolvedReferences
            args.locations = args.locations.split(",")

        if not h1["tabix"]:
            args.preprocessing_truth = True
            logging.warn("Truth file is not Tabix indexed. Switching on pre-processing + chr name conversion.")
            if args.fixchr_truth is None:
                args.fixchr_truth = True
        elif args.fixchr_truth is None:
            # autodetect chr naming
            count_with_fix = len([__ for __ in h1["tabix"]["chromosomes"]
                                 if ("chr%s" % str(__)) in args.locations])
            count_no_fix = len([__ for __ in h1["tabix"]["chromosomes"] if str(__) in args.locations])
            logging.info("Truth: Number of chromosome names matching with / without renaming : %i / %i " % (
                count_with_fix, count_no_fix))
            if count_with_fix > count_no_fix:
                args.fixchr_truth = True
                logging.info("Will fix chromosome names (truth).")
            else:
                logging.info("Will not fix chromosome names (truth).")
                args.fixchr_truth = False

        if not h2["tabix"]:
            args.preprocessing = True
            logging.warn("Query file is not Tabix indexed. Switching on pre-processing + chr name conversion.")
            # don't overwrite setting, but if it's None, replace with True to be sure
            if args.fixchr_query is None:
                args.fixchr_query = True
        elif args.fixchr_query is None:
            # autodetect chr naming
            count_with_fix = len([__ for __ in h2["tabix"]["chromosomes"]
                                 if ("chr%s" % str(__)) in args.locations])
            count_no_fix = len([__ for __ in h2["tabix"]["chromosomes"] if str(__) in args.locations])
            logging.info("Query: Number of chromosome names matching with / without renaming : %i / %i " % (
                count_with_fix, count_no_fix))
            if count_with_fix > count_no_fix:
                args.fixchr_query = True
                logging.info("Will fix chromosome names (query).")
            else:
                logging.info("Will not fix chromosome names (query).")
                args.fixchr_query = False

        if args.fixchr_truth or args.preprocessing_norm:
            args.preprocessing_truth = True

        if args.fixchr_query or args.preprocessing_norm:
            args.preprocessing = True

        if args.preprocessing_truth:
            vtf = tempfile.NamedTemporaryFile(delete=False,
                                              dir=args.scratch_prefix,
                                              prefix="truth.pp",
                                              suffix=".vcf.gz")
            vtf.close()
            tempfiles.append(vtf.name)
            preprocessVCF(args.vcf1, vtf.name, ",".join(args.locations),
                          not args.usefiltered_truth,     # pass_only
                          args.fixchr_truth,        # chrprefix
                          args.preprocessing_norm,  # norm,
                          args.regions_bedfile,
                          args.targets_bedfile,
                          args.ref)
            args.vcf1 = vtf.name
            # get headers again if we preprocessed
            h1 = vcfextract.extractHeadersJSON(args.vcf1)

        if args.preprocessing:
            vtf = tempfile.NamedTemporaryFile(delete=False,
                                              dir=args.scratch_prefix,
                                              prefix="query.pp",
                                              suffix=".vcf.gz")
            vtf.close()
            tempfiles.append(vtf.name)
            preprocessVCF(args.vcf2, vtf.name, ",".join(args.locations),
                          not args.usefiltered,     # pass_only
                          args.fixchr_query,        # chrprefix
                          args.preprocessing_norm,  # norm,
                          args.regions_bedfile,
                          args.targets_bedfile,
                          args.ref)
            args.vcf2 = vtf.name
            # get headers again if we preprocessed
            h2 = vcfextract.extractHeadersJSON(args.vcf2)

        if not h1["tabix"]:
            raise Exception("Truth file is not Tabix indexed.")

        if not h2["tabix"]:
            raise Exception("Truth file is not Tabix indexed.")

        newlocations = []

        if not h1["tabix"]["chromosomes"]:
            h1["tabix"]["chromosomes"] = []
        if not h2["tabix"]["chromosomes"]:
            h2["tabix"]["chromosomes"] = []

        for _xc in args.locations:
            xc = _xc.split(":")[0]
            if xc not in h1["tabix"]["chromosomes"]:
                logging.warn("No calls for location %s in truth!" % xc)
            if xc not in h2["tabix"]["chromosomes"]:
                logging.warn("No calls for location %s in query!" % xc)

            if (xc not in h1["tabix"]["chromosomes"]) and (xc not in h2["tabix"]["chromosomes"]):
                logging.warn("Removing location %s because neither input file has calls there." % xc)
            else:
                newlocations.append(_xc)

        if not newlocations:
            raise Exception("Location list is empty: the input files do not appear to have variants on any of %s" %
                            str(args.locations))

        args.locations = newlocations

        if args.threads > 1:
            logging.info("Running using %i parallel processes." % args.threads)
            pool = multiprocessing.Pool(int(args.threads))

            # find balanced pieces
            args.pieces = (args.threads + len(args.locations) - 1) / len(args.locations)
            res = runParallel(pool, Haplo.blocksplit.blocksplitWrapper, args.locations, args)

            if None in res:
                raise Exception("One of the blocksplit processes failed.")

            tempfiles += res

            args.locations = []
            for f in res:
                with open(f) as fp:
                    for l in fp:
                        ll = l.strip().split("\t", 3)
                        if len(ll) < 3:
                            continue
                        xchr = ll[0]
                        start = int(ll[1]) + 1
                        end = int(ll[2])
                        args.locations.append("%s:%i-%i" % (xchr, start, end))
        else:
            pool = None

        # count variants before normalisation
        if "samples" not in h1 or not h1["samples"]:
            raise Exception("Cannot read sample names from truth input file")

        if args.raw_counts:
            counts_truth = Haplo.quantify.run_quantify(args.vcf1,
                                                       None,
                                                       None,
                                                       {"CONF": args.fp_bedfile} if args.fp_bedfile else None,
                                                       args.ref,
                                                       h1["samples"][0],
                                                       locations=args.locations)
        else:
            counts_truth = None

        if "samples" not in h2 or not h2["samples"]:
            raise Exception("Cannot read sample names from truth input file")
        if args.raw_counts:
            counts_query = Haplo.quantify.run_quantify(args.vcf2,
                                                       None,
                                                       None,
                                                       {"CONF": args.fp_bedfile} if args.fp_bedfile else None,
                                                       args.ref,
                                                       h2["samples"][0],
                                                       locations=args.locations)
        else:
            counts_query = None

        tf = tempfile.NamedTemporaryFile(delete=False,
                                         dir=args.scratch_prefix,
                                         prefix="hap.py.result.", suffix=".vcf.gz")
        tf.close()
        tempfiles.append(tf.name)
        output_name = tf.name

        if args.engine == "xcmp":
            # do xcmp
            logging.info("Using xcmp for comparison")
            res = runParallel(pool, Haplo.xcmp.xcmpWrapper, args.locations, args)
            tempfiles += [x[0] for x in res if x is not None]   # VCFs
            tempfiles += [x[1] for x in res if x is not None and x[1] is not None]   # beds (if any)

            if None in res:
                raise Exception("One of the xcmp jobs failed.")

            if len(res) == 0:
                raise Exception("Input files/regions do not contain variants (0 haplotype blocks were processed).")

            # concatenate + index
            bedfiles = [x[1] for x in res if x is not None and x[1] is not None]
            if args.write_bed and bedfiles:
                runme = " ".join(["cat"] +
                                 bedfiles +
                                 [">", args.reports_prefix.replace(" ", "\\ ") + ".blocks.bed"])
                logging.info("Concatenating block files: %s..." % runme)
                subprocess.check_call(runme,
                                      shell=True)

            logging.info("Concatenating variants...")
            runme_list = [x[0] for x in res if x is not None]
            if len(runme_list) == 0:
                raise Exception("No outputs to concatenate!")

            fo = Tools.BGZipFile(output_name, True)
            for i, x in enumerate(runme_list):
                f = gzip.GzipFile(x)
                for l in f:
                    if i == 0 or not l[0] == "#":
                        fo.write(l)
            fo.close()

            logging.info("Indexing...")
            to_run = "tabix -p vcf %s" % output_name.replace(" ", "\\ ")
            logging.info("Running '%s'" % to_run)
            subprocess.check_call(to_run, shell=True)
        elif args.engine == "vcfeval":
            tempfiles += Haplo.vcfeval.runVCFEval(args.vcf1, args.vcf2, output_name, args)
        else:
            raise Exception("Unknown comparison engine: %s" % args.engine)

        if args.write_counts:
            json_name = args.reports_prefix + ".counts.json"
        else:
            tf = tempfile.NamedTemporaryFile(delete=False,
                                             dir=args.scratch_prefix,
                                             prefix="counts.",
                                             suffix=".json")
            tf.close()
            json_name = tf.name

        logging.info("Counting variants...")

        counts = Haplo.quantify.run_quantify(output_name,
                                             json_name,
                                             args.reports_prefix + ".vcf.gz" if args.write_vcf else False,
                                             {"CONF": args.fp_bedfile} if args.fp_bedfile else None,
                                             args.ref)

        df = pandas.DataFrame(counts)
        if args.write_counts:
            df.to_csv(args.reports_prefix + ".counts.csv")

        metrics_output = makeMetricsObject("hap.py.comparison")

        if args.write_counts:
            metrics_output["metrics"].append(dataframeToMetricsTable("raw.counts", df))

        # calculate precision / recall
        count_types = []
        if args.raw_counts:
            simplified_truth_counts = Haplo.quantify.simplify_counts(counts_truth, h1["samples"][0:1])
            simplified_query_counts = Haplo.quantify.simplify_counts(counts_query, h2["samples"][0:1])

            count_types += simplified_truth_counts.keys()
            count_types += simplified_query_counts.keys()
        else:
            simplified_truth_counts = None
            simplified_query_counts = None

        simplified_numbers = Haplo.quantify.simplify_counts(counts)

        count_types += simplified_numbers.keys()
        count_types = sorted(list(set(count_types)))

        for vtype in count_types:
            if vtype not in simplified_numbers:
                simplified_numbers[vtype] = {}

            simplified_numbers[vtype]["METRIC.Recall"] = 0
            simplified_numbers[vtype]["METRIC.Recall2"] = 0
            simplified_numbers[vtype]["METRIC.Precision"] = 0
            simplified_numbers[vtype]["METRIC.Frac_NA"] = 0

            try:
                simplified_numbers[vtype]["METRIC.Recall"] = \
                    float(simplified_numbers[vtype]["TRUTH.TP"]) / \
                    float(simplified_numbers[vtype]["TRUTH.TP"] + simplified_numbers[vtype]["TRUTH.FN"])
            except:
                pass

            try:
                simplified_numbers[vtype]["METRIC.Recall2"] = \
                    float(simplified_numbers[vtype]["TRUTH.TP"]) / \
                    float(simplified_numbers[vtype]["TRUTH.TOTAL"])
            except:
                pass

            try:
                simplified_numbers[vtype]["METRIC.Precision"] = \
                    float(simplified_numbers[vtype]["QUERY.TP"]) / \
                    float(simplified_numbers[vtype]["QUERY.TP"] + simplified_numbers[vtype]["QUERY.FP"])
            except:
                pass

            try:
                simplified_numbers[vtype]["METRIC.Frac_NA"] = \
                    float(simplified_numbers[vtype]["QUERY.UNK"]) / \
                    float(simplified_numbers[vtype]["QUERY.TOTAL"])
            except:
                pass

            try:
                simplified_numbers[vtype]["TRUTH.TOTAL.RAW"] = simplified_truth_counts[vtype][h1["samples"][0] +
                                                                                              ".TOTAL"]
            except:
                pass

            try:
                simplified_numbers[vtype]["QUERY.TOTAL.RAW"] = simplified_query_counts[vtype][h2["samples"][0] +
                                                                                              ".TOTAL"]
            except:
                pass

        pandas.set_option("display.width", 120)
        pandas.set_option("display.max_columns", 1000)
        df = pandas.DataFrame(simplified_numbers).transpose()
        vstring = "hap.py-%s" % Tools.version
        vstring += " ".join(sys.argv)

        df.loc[vstring] = 0

        # for x in df:
        #     # everything not a metric is a count
        #     if not x.startswith("METRIC"):
        #         df[x] = df[x].astype("int64")

        df[["TRUTH.TOTAL",
            "QUERY.TOTAL",
            "METRIC.Recall",
            "METRIC.Precision",
            "METRIC.Frac_NA"]].to_csv(args.reports_prefix + ".summary.csv")

        metrics_output["metrics"].append(dataframeToMetricsTable("summary.metrics",
                                         df[["TRUTH.TOTAL",
                                             "QUERY.TOTAL",
                                             "METRIC.Recall",
                                             "METRIC.Precision",
                                             "METRIC.Frac_NA"]]))

        if args.write_counts:
            df.to_csv(args.reports_prefix + ".extended.csv")
            metrics_output["metrics"].append(dataframeToMetricsTable("all.metrics", df))

        essential_numbers = df[["TRUTH.TOTAL",
                                "QUERY.TOTAL",
                                "METRIC.Recall",
                                "METRIC.Precision",
                                "METRIC.Frac_NA"]]

        pandas.set_option('display.max_columns', 500)
        pandas.set_option('display.width', 1000)

        essential_numbers = essential_numbers[essential_numbers.index.isin(
            ["Locations.SNP", "Locations.INDEL"])]

        logging.info("\n" + str(essential_numbers))

        # in default mode, print result summary to stdout
        if not args.quiet and not args.verbose:
            print "Benchmarking Summary:"
            print str(essential_numbers)

        if args.roc:
            vcf = args.reports_prefix + ".vcf.gz"
            res = Haplo.happyroc.roc(vcf, args.roc, args.roc_filter, args.reports_prefix + ".roc", args.roc_reversed)

            for t in res.iterkeys():
                rocdf = pandas.read_table(res[t])
                metrics_output["metrics"].append(dataframeToMetricsTable("roc." + t, rocdf))

        with open(args.reports_prefix + ".metrics.json", "w") as fp:
            json.dump(metrics_output, fp)
    finally:
        if args.delete_scratch:
            for x in tempfiles:
                try:
                    os.remove(x)
                except:
                    pass
        else:
            logging.info("Scratch files kept : %s" % (str(tempfiles)))
Exemple #3
0
def main():
    """Command-line entry point: compare a query VCF against a truth VCF.

    The function parses the command line, then works inside a scratch
    directory (``--scratch-prefix`` or a fresh temporary directory):

    1. optionally collects per-chromosome BAM coverage (``--bam``),
    2. preprocesses and indexes the truth and query VCFs,
    3. intersects them with ``bcftools isec`` into ``<scratch>/tpfn``
       (and, when a feature table is requested, the reverse intersection
       into ``<scratch>/tpfn_r``),
    4. splits the query-only records (``tpfn/0001.vcf.gz``) into FP, UNK
       and AMBI files using the ``-f`` / ``-a`` BED regions,
    5. counts variants with ``bcftools stats`` and derives recall,
       precision and NA/ambiguous fractions,
    6. writes ``<output>.stats.csv`` and ``<output>.metrics.json`` plus the
       optional ``.ambiclasses.csv`` / ``.ambireasons.csv`` /
       ``.features.csv`` / ``.roc.csv`` tables.

    The scratch directory is removed at the end unless ``--keep-scratch``
    or ``--scratch-prefix`` was given.

    Raises:
        Exception: if the two TP feature tables have different lengths or
            (unless ``--no-order-check``) disagree on CHROM/POS ordering.
    """
    parser = argparse.ArgumentParser("Somatic Comparison")

    parser.add_argument("truth", help="Truth VCF file")
    parser.add_argument("query", help="Query VCF file")

    parser.add_argument(
        "-o",
        "--output",
        dest="output",
        required=True,
        help="Output file prefix for statistics and feature table (when selected)",
    )

    parser.add_argument("-l", "--location", dest="location", default="", help="Location for bcftools view (e.g. chr1)")

    parser.add_argument(
        "-R",
        "--restrict-regions",
        dest="regions_bedfile",
        default=None,
        type=str,
        help="Restrict analysis to given (sparse) regions (using -R in bcftools).",
    )

    parser.add_argument(
        "-T",
        "--target-regions",
        dest="targets_bedfile",
        default=None,
        type=str,
        help="Restrict analysis to given (dense) regions (using -T in bcftools).",
    )

    parser.add_argument(
        "-f", "--false-positives", dest="FP", help="False-positive region bed file to distinguish UNK from FP"
    )

    parser.add_argument(
        "-a",
        "--ambiguous",
        dest="ambi",
        action="append",
        help="Ambiguous region bed file(s) to distinguish from FP (e.g. variant only observed " "in some replicates)",
    )

    parser.add_argument(
        "--ambiguous-fp",
        dest="ambi_fp",
        action="store_true",
        default=False,
        help="Use FP calls from ambiguous region files also.",
    )

    parser.add_argument(
        "-e",
        "--explain_ambiguous",
        dest="explain_ambiguous",
        required=False,
        default=False,
        action="store_true",
        help="print a table giving the number of ambiguous events per category",
    )

    parser.add_argument(
        "-r", "--reference", dest="ref", default=Tools.defaultReference(), help="Specify a reference file."
    )

    parser.add_argument(
        "--scratch-prefix", dest="scratch_prefix", default=None, help="Filename prefix for scratch report output."
    )

    # NOTE: the help text used to be a copy of the --scratch-prefix help; the
    # flag actually switches off deletion of the scratch directory.
    parser.add_argument(
        "--keep-scratch",
        dest="delete_scratch",
        default=True,
        action="store_false",
        help="Keep the scratch directory after the run instead of deleting it.",
    )

    parser.add_argument(
        "--continue",
        dest="cont",
        default=False,
        action="store_true",
        help="Continue from scratch space (i.e. use VCFs in there if they already exist).",
    )

    parser.add_argument(
        "-P",
        "--include-nonpass",
        dest="inc_nonpass",
        action="store_true",
        default=False,
        help="Use to include failing variants in comparison.",
    )

    parser.add_argument(
        "--feature-table",
        dest="features",
        default=False,
        choices=Somatic.FeatureSet.sets.keys(),
        help="Select a feature table to output.",
    )

    parser.add_argument(
        "--bam",
        dest="bams",
        default=[],
        action="append",
        help="pass one or more BAM files for feature table extraction",
    )

    parser.add_argument(
        "--normalize-truth",
        dest="normalize_truth",
        default=False,
        action="store_true",
        help="Enable running of bcftools norm on the truth file.",
    )

    parser.add_argument(
        "--normalize-query",
        dest="normalize_query",
        default=False,
        action="store_true",
        help="Enable running of bcftools norm on the query file.",
    )

    parser.add_argument(
        "-N",
        "--normalize-all",
        dest="normalize_all",
        default=False,
        action="store_true",
        help="Enable running of bcftools norm on both truth and query file.",
    )

    parser.add_argument(
        "--fix-chr-query",
        dest="fixchr_query",
        default=False,
        action="store_true",
        help="Replace numeric chromosome names in the query by chr*-type names",
    )

    parser.add_argument(
        "--fix-chr-truth",
        dest="fixchr_truth",
        default=False,
        action="store_true",
        help="Replace numeric chromosome names in the truth by chr*-type names",
    )

    parser.add_argument(
        "--no-order-check",
        dest="disable_order_check",
        default=False,
        action="store_true",
        help="Disable checking the order of TP features (dev feature).",
    )

    parser.add_argument(
        "--roc",
        dest="roc",
        default=None,
        choices=ROC.list(),
        help="Create a ROC-style table. This is caller specific " " - this will override the --feature-table switch!",
    )

    parser.add_argument(
        "--logfile", dest="logfile", default=None, help="Write logging information into file rather than to stderr"
    )

    verbosity_options = parser.add_mutually_exclusive_group(required=False)

    verbosity_options.add_argument(
        "--verbose",
        dest="verbose",
        default=False,
        action="store_true",
        help="Raise logging level from warning to info.",
    )

    verbosity_options.add_argument(
        "--quiet", dest="quiet", default=False, action="store_true", help="Set logging level to output errors only."
    )

    args = parser.parse_args()

    if args.verbose:
        loglevel = logging.INFO
    elif args.quiet:
        loglevel = logging.ERROR
    else:
        loglevel = logging.WARNING

    # In default mode (neither --quiet nor --verbose) result tables are also
    # printed to stdout.
    print_summary = not args.quiet and not args.verbose

    # reinitialize logging
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)
    logging.basicConfig(filename=args.logfile, format="%(asctime)s %(levelname)-8s %(message)s", level=loglevel)

    if args.normalize_all:
        args.normalize_truth = True
        args.normalize_query = True

    if args.roc:
        # --roc overrides --feature-table: the ROC object names the feature
        # table it needs.
        args.roc = ROC.make(args.roc)
        args.features = args.roc.ftname

    if args.scratch_prefix:
        # a user-supplied scratch location is never deleted
        scratch = os.path.abspath(args.scratch_prefix)
        args.delete_scratch = False
        Tools.mkdir_p(scratch)
    else:
        scratch = tempfile.mkdtemp()

    logging.info("Scratch path is %s" % scratch)
    try:
        bams = []
        md = None
        for x in args.bams:
            bams.append(bamStats(x))

        if bams:
            bres = pandas.concat(bams).groupby("CHROM").mean()

            # md: chromosome -> 3x the mean coverage, handed to the feature
            # set via setChrDepths() below
            md = {}

            for x in bres.index:
                logging.info("Mean coverage on %s is %f" % (x, bres.loc[x]["COVERAGE"]))
                md[x] = float(bres.loc[x]["COVERAGE"]) * 3.0

        logging.info("Normalizing/reading inputs")

        ntpath = os.path.join(scratch, "normalized_truth.vcf.gz")

        if not (args.cont and os.path.exists(ntpath)):
            preprocessVCF(
                args.truth,
                ntpath,
                args.location,
                True,  # pass_only
                args.fixchr_truth,  # chrprefix
                args.normalize_truth,  # norm,
                args.regions_bedfile,
                args.targets_bedfile,
                args.ref,
            )
        else:
            logging.info("Continuing from %s" % ntpath)

        if not (args.cont and os.path.exists(ntpath + ".csi")):
            runBcftools("index", ntpath)

        nqpath = os.path.join(scratch, "normalized_query.vcf.gz")

        if not (args.cont and os.path.exists(nqpath)):
            preprocessVCF(
                args.query,
                nqpath,
                args.location,
                not args.inc_nonpass,  # pass_only
                args.fixchr_query,  # chrprefix
                args.normalize_query,  # norm,
                args.regions_bedfile,
                args.targets_bedfile,
                args.ref,
            )
        else:
            logging.info("Continuing from %s" % nqpath)

        if not (args.cont and os.path.exists(nqpath + ".csi")):
            runBcftools("index", nqpath)

        logging.info("Intersecting")

        tpfn_dir = os.path.join(scratch, "tpfn")
        tpfn_r_dir = os.path.join(scratch, "tpfn_r")
        isec_outputs = ("0000.vcf.gz", "0001.vcf.gz", "0002.vcf.gz")

        tpfn_files = all([os.path.exists(os.path.join(tpfn_dir, name)) for name in isec_outputs])

        # Must look in tpfn_r (not tpfn): otherwise --continue could skip the
        # reverse intersection even though its outputs do not exist yet.
        tpfn_r_files = all([os.path.exists(os.path.join(tpfn_r_dir, name)) for name in isec_outputs])

        if not (args.cont and tpfn_files):
            runBcftools("isec", ntpath, nqpath, "-p", tpfn_dir, "-O", "z")
        else:
            logging.info("Continuing from %s" % tpfn_dir)

        if args.features and not (args.cont and tpfn_r_files):
            # only need to do this for getting the feature table
            runBcftools("isec", nqpath, ntpath, "-p", tpfn_r_dir, "-O", "z")

        logging.info("Getting FPs / Ambi / Unk")

        fppath = os.path.join(scratch, "fp.vcf.gz")
        unkpath = os.path.join(scratch, "unk.vcf.gz")
        ambipath = os.path.join(scratch, "ambi.vcf.gz")

        # get header to print to unk and ambi VCFs
        rununiquepath = os.path.join(tpfn_dir, "0001.vcf.gz")
        header = runBcftools("view", rununiquepath, "--header-only")

        ambiClasses = Counter()
        ambiReasons = Counter()

        fpclasses = BedIntervalTree()
        if args.ambi:
            # can have multiple ambiguous BED files
            for aBED in args.ambi:
                # auto-label from first value after chr start end
                # new ambi files have the label in position 4
                # old ones will look weird here.
                fpclasses.addFromBed(aBED, lambda xe: xe[4])

        if args.FP:
            fpclasses.addFromBed(args.FP, "FP")

        has_fp = (fpclasses.count("FP") > 0) or (fpclasses.count("fp") > 0 and args.ambi_fp)

        # split VCF into FP, UNK and AMBI
        # All four handles are closed in the finally clause so that nothing
        # is left open when classification fails part-way through.
        fp_out = Tools.BGZipFile(fppath, True)
        unk_out = None
        ambi_out = None
        toProcess = None
        try:
            fp_out.write(header)

            unk_out = Tools.BGZipFile(unkpath, True)
            unk_out.write(header)

            ambi_out = Tools.BGZipFile(ambipath, True)
            ambi_out.write(header)

            toProcess = gzip.open(rununiquepath, "rb")
            for entry in toProcess:
                if entry[0] == "#":
                    continue

                fields = entry.strip().split("\t")
                chrom = fields[0]
                start = int(fields[1])
                # interval end = POS + length of the REF allele
                stop = int(fields[1]) + len(fields[3])

                overlap = fpclasses.intersect(chrom, start, stop)

                is_fp = False
                is_ambi = False

                classes_this_pos = set()

                for o in overlap:
                    reason = o.value[0]
                    if reason == "fp" and args.ambi_fp:
                        reason = "FP"
                    elif reason == "fp":
                        reason = "ambi-fp"
                    elif reason == "unk":
                        reason = "ambi-unk"

                    classes_this_pos.add(reason)
                    try:
                        ambiReasons["%s: rep. count %s" % (reason, o.value[1])] += 1
                    except IndexError:
                        ambiReasons["%s: rep. count *" % reason] += 1
                    for x in o.value[3:]:
                        ambiReasons["%s: %s" % (reason, x)] += 1
                    if reason == "FP":
                        is_fp = True
                    else:
                        is_ambi = True

                # count every class at most once per record
                for reason in classes_this_pos:
                    ambiClasses[reason] += 1

                if is_fp:
                    fp_out.write(entry)
                elif is_ambi:
                    ambi_out.write(entry)
                elif not has_fp:
                    # when we don't have FP regions, unk stuff becomes FP
                    fp_out.write(entry)
                else:
                    unk_out.write(entry)
        finally:
            # since 0001.vcf.gz should already be sorted, we can just convert to bgzipped vcf
            # and create index
            for handle in (toProcess, fp_out, ambi_out, unk_out):
                if handle is not None:
                    handle.close()

        runBcftools("index", "--tbi", fppath)
        runBcftools("index", "--tbi", unkpath)
        runBcftools("index", "--tbi", ambipath)

        logging.info("Counting variants...")

        truthcounts = parseStats(runBcftools("stats", ntpath), "total.truth")
        querycounts = parseStats(runBcftools("stats", nqpath), "total.query")

        tpcounts = parseStats(runBcftools("stats", os.path.join(tpfn_dir, "0002.vcf.gz")), "tp")
        fncounts = parseStats(runBcftools("stats", os.path.join(tpfn_dir, "0000.vcf.gz")), "fn")
        fpcounts = parseStats(runBcftools("stats", fppath), "fp")
        ambicounts = parseStats(runBcftools("stats", ambipath), "ambi")
        unkcounts = parseStats(runBcftools("stats", unkpath), "unk")

        res = pandas.merge(truthcounts, querycounts, on="type")
        res = pandas.merge(res, tpcounts, on="type")
        res = pandas.merge(res, fpcounts, on="type")
        res = pandas.merge(res, fncounts, on="type")
        res = pandas.merge(res, unkcounts, on="type")
        res = pandas.merge(res, ambicounts, on="type")

        # no explicit guarantee that total.query is equal to unk + ambi + fp + tp
        # testSum = res["fp"] + res["tp"] + res["unk"] + res["ambi"]

        # filter and relabel
        res = res[res["type"] != "samples"]
        res = res[res["type"] != "multiallelic SNP sites"]
        res = res[res["type"] != "multiallelic sites"]
        res.loc[res["type"] == "SNPs", "type"] = "SNVs"

        res = res[(res["total.truth"] > 0) | (res["total.query"] > 0)]

        # use this to use plain row counts rather than stratified bcftools counts
        # truthcounts = countVCFRows(ntpath) # , "total.truth")
        # querycounts = countVCFRows(nqpath) # , "total.query")
        #
        # tpcounts = countVCFRows(os.path.join(scratch, "tpfn", "0002.vcf.gz"))  #, "tp")
        # fncounts = countVCFRows(os.path.join(scratch, "tpfn", "0000.vcf.gz"))  #, "fn")
        # fpcounts = countVCFRows(fppath)  #, "fp")
        # ambicounts = countVCFRows(ambipath)  #, "ambi")
        # unkcounts = countVCFRows(unkpath)  #, "unk")
        #
        # res = pandas.DataFrame({
        #     "total.truth" : [ truthcounts ],
        #     "total.query" : [ querycounts ],
        #     "tp" : [ tpcounts ],
        #     "fn" : [ fncounts ],
        #     "fp" : [ fpcounts ],
        #     "ambi" : [ ambicounts ],
        #     "unk" : [ unkcounts ]
        # })
        #
        # res["type"] = "records"

        # summary metrics
        res["recall"] = res["tp"] / (res["tp"] + res["fn"])
        res["recall2"] = res["tp"] / (res["total.truth"])
        res["precision"] = res["tp"] / (res["tp"] + res["fp"])
        res["na"] = res["unk"] / (res["total.query"])
        res["ambiguous"] = res["ambi"] / res["total.query"]

        metrics_output = makeMetricsObject("som.py.comparison")
        metrics_output["metrics"].append(dataframeToMetricsTable("result", res))
        vstring = "som.py-%s" % Tools.version

        result_text = "\n" + res.to_string()
        logging.info(result_text)
        # in default mode, print result summary to stdout
        # (single-argument print(...) behaves the same as the print statement)
        if print_summary:
            print(result_text)

        # version / command line columns are added after printing so they
        # only show up in the CSV
        res["sompyversion"] = vstring

        vstring = " ".join(sys.argv)
        res["sompycmd"] = vstring

        if args.ambi and args.explain_ambiguous:
            ac = list(ambiClasses.iteritems())
            if ac:
                ambie = pandas.DataFrame(ac, columns=["class", "count"])
                ambie.sort(["class"], inplace=True)
                pandas.set_option("display.max_rows", 1000)
                pandas.set_option("display.max_columns", 1000)
                pandas.set_option("display.width", 1000)
                pandas.set_option("display.height", 1100)
                classes_text = "FP/ambiguity classes with info (multiple classes can " "overlap):\n" + ambie.to_string(
                    index=False
                )
                logging.info(classes_text)
                # in default mode, print result summary to stdout
                if print_summary:
                    print(classes_text)
                ambie.to_csv(args.output + ".ambiclasses.csv")
                metrics_output["metrics"].append(dataframeToMetricsTable("ambiclasses", ambie))
            else:
                logging.info("No ambiguous variants.")

            ar = list(ambiReasons.iteritems())
            if ar:
                ambie = pandas.DataFrame(ar, columns=["reason", "count"])
                ambie.sort(["reason"], inplace=True)
                pandas.set_option("display.max_rows", 1000)
                pandas.set_option("display.max_columns", 1000)
                pandas.set_option("display.width", 1000)
                pandas.set_option("display.height", 1100)
                # left-align the reason column, padded to the longest reason
                reasons_text = "Reasons for defining as ambiguous (multiple reasons can overlap):\n" + ambie.to_string(
                    formatters={"reason": "{{:<{}s}}".format(ambie["reason"].str.len().max()).format}, index=False
                )
                logging.info(reasons_text)
                # in default mode, print result summary to stdout
                if print_summary:
                    print(reasons_text)
                ambie.to_csv(args.output + ".ambireasons.csv")
                metrics_output["metrics"].append(dataframeToMetricsTable("ambireasons", ambie))
            else:
                logging.info("No ambiguous variants.")

        res.to_csv(args.output + ".stats.csv")

        with open(args.output + ".metrics.json", "w") as metrics_file:
            json.dump(metrics_output, metrics_file)

        if args.features:
            logging.info("Extracting features...")
            fset = Somatic.FeatureSet.make(args.features)
            fset.setChrDepths(md)

            logging.info("Collecting TP info (1)...")
            tps = fset.collect(os.path.join(tpfn_dir, "0002.vcf.gz"), "TP")

            # TP_r is a hint for fset, they are both TPs
            logging.info("Collecting TP info (2)...")
            tps2 = fset.collect(os.path.join(tpfn_r_dir, "0002.vcf.gz"), "TP_r")

            # this is slow because it tries to sort
            # ... which we don't need to do since tps1 and tps2 have the same ordering

            logging.info("Sorting...")
            tps.sort(["CHROM", "POS"], inplace=True)
            tps2.sort(["CHROM", "POS"], inplace=True)
            tps = tps.reset_index(drop=True)
            tps2 = tps2.reset_index(drop=True)

            logging.info("Merging TP info...")
            columns_tps = list(tps)
            columns_tps2 = list(tps2)

            len1 = tps.shape[0]
            # compare against the second table (was tps, which made the
            # check below a no-op)
            len2 = tps2.shape[0]

            if len1 != len2:
                raise Exception("Cannot read TP features, lists have different lengths : %i != %i" % (len1, len2))

            if not args.disable_order_check:
                logging.info("Checking order %i / %i" % (len1, len2))

                for x in xrange(0, len1):
                    for a in ["CHROM", "POS"]:
                        if tps.loc[x][a] != tps2.loc[x][a]:
                            raise Exception(
                                "Cannot merge TP features, inputs are out of order at %s / %s"
                                % (str(tps[x : x + 1]), str(tps2[x : x + 1]))
                            )

            logging.info("Merging...")

            cdata = {"CHROM": tps["CHROM"], "POS": tps["POS"], "tag": tps["tag"]}

            tpc = pandas.DataFrame(cdata, columns=["CHROM", "POS", "tag"])

            # Columns present in only one table are copied as-is; columns
            # present in both take the tps2 (TP_r) value under the plain name
            # and the tps (TP) value under "<name>.truth".
            all_columns = list(set(columns_tps + columns_tps2))
            for a in all_columns:
                if a in columns_tps and not a in columns_tps2:
                    tpc[a] = tps[a]
                elif not a in columns_tps and a in columns_tps2:
                    tpc[a] = tps2[a]
                elif a not in ["CHROM", "POS", "tag"]:
                    tpc[a] = tps2[a]
                    tpc[a + ".truth"] = tps[a]

            # Each class is read from its own file exactly once. Previously
            # AMBI and UNK were (also) collected from the FP file, which
            # duplicated every FP record under the wrong tags.
            logging.info("Collecting FP info...")
            fps = fset.collect(fppath, "FP")
            ambs = fset.collect(ambipath, "AMBI")

            logging.info("Collecting FN info...")
            fns = fset.collect(os.path.join(tpfn_dir, "0000.vcf.gz"), "FN")

            # FN records come from the truth side: move their columns to the
            # ".truth" names wherever the TP table has such a column
            renamed = {}
            tp_cols = list(tpc)
            for col in list(fns):
                if col + ".truth" in tp_cols:
                    renamed[col] = col + ".truth"
            fns.rename(columns=renamed, inplace=True)

            logging.info("Collecting UNK info...")
            unks = fset.collect(unkpath, "UNK")

            featurelist = [tpc, fps, fns, ambs, unks]

            logging.info("Making feature table...")
            featuretable = pandas.concat(featurelist)

            # reorder to make more legible
            first_columns = ["CHROM", "POS", "tag"]
            all_columns = list(featuretable)

            if "REF" in all_columns:
                first_columns.append("REF")

            if "REF.truth" in all_columns:
                first_columns.append("REF.truth")

            if "ALT" in all_columns:
                first_columns.append("ALT")

            if "ALT.truth" in all_columns:
                first_columns.append("ALT.truth")

            ordered_columns = first_columns + sorted([x for x in all_columns if x not in first_columns])
            featuretable = featuretable[ordered_columns]
            # make sure positions are integers
            featuretable["POS"] = featuretable["POS"].astype(int)

            logging.info("Saving feature table...")
            featuretable.to_csv(args.output + ".features.csv", float_format="%.8f")

            if args.roc is not None:
                roc_table = args.roc.from_table(featuretable)
                roc_table.to_csv(args.output + ".roc.csv", float_format="%.8f")

    finally:
        if args.delete_scratch:
            shutil.rmtree(scratch)
        else:
            logging.info("Scratch kept at %s" % scratch)
Exemple #4
0
def main():
    """Command-line entry point for the somatic truth/query VCF comparison.

    Parses the command line, preprocesses the truth and query VCFs into a
    scratch directory, intersects them with ``bcftools isec``, splits the
    query-only calls into FP / AMBI / UNK using the supplied BED files and
    writes ``<output>.stats.csv`` and ``<output>.metrics.json``. Depending on
    the switches it also writes ``<output>.features.csv``, ``<output>.roc.csv``,
    per-AF-bin ROC tables and the ``.ambiclasses.csv`` / ``.ambireasons.csv``
    tables.

    The scratch directory is removed at the end unless ``--keep-scratch`` or
    ``--scratch-prefix`` was given.

    Raises:
        Exception: if the option combination is inconsistent (CI level out of
            range, AF stratification or filtered-FN counting without a
            feature table), or if the two TP feature tables cannot be merged.
    """
    parser = argparse.ArgumentParser("Somatic Comparison")

    parser.add_argument("truth", help="Truth VCF file")
    parser.add_argument("query", help="Query VCF file")

    parser.add_argument("-o", "--output", dest="output", required=True,
                        help="Output file prefix for statistics and feature table (when selected)")

    parser.add_argument("-l", "--location", dest="location", default="",
                        help="Location for bcftools view (e.g. chr1)")

    parser.add_argument("-R", "--restrict-regions", dest="regions_bedfile",
                        default=None, type=str,
                        help="Restrict analysis to given (sparse) regions (using -R in bcftools).")

    parser.add_argument("-T", "--target-regions", dest="targets_bedfile",
                        default=None, type=str,
                        help="Restrict analysis to given (dense) regions (using -T in bcftools).")

    parser.add_argument("-f", "--false-positives", dest="FP",
                        help="False-positive region bed file to distinguish UNK from FP")

    parser.add_argument("-a", "--ambiguous", dest="ambi", action='append',
                        help="Ambiguous region bed file(s) to distinguish from FP (e.g. variant only observed "
                             "in some replicates)")

    parser.add_argument("--ambi-fp", dest="ambi_fp", action='store_true', default=False,
                        help="Use FP calls from ambiguous region files also.")

    parser.add_argument("--no-ambi-fp", dest="ambi_fp", action='store_false',
                        help="Do not use FP calls from ambiguous region files also.")

    parser.add_argument("--count-unk", dest="count_unk", action='store_true', default=False,
                        help="Assume the truth set covers the whole genome and only count FPs in regions "
                             "specified by the truth VCF or ambiguous/false-positive regions.")

    parser.add_argument("--no-count-unk", dest="count_unk", action='store_false',
                        help="Do not count UNK calls separately: query-only calls outside the "
                             "false-positive / ambiguous regions are counted as FP (default).")

    parser.add_argument("-e", "--explain_ambiguous", dest="explain_ambiguous", required=False,
                        default=False, action="store_true",
                        help="print a table giving the number of ambiguous events per category")

    parser.add_argument("-r", "--reference", dest="ref", default=Tools.defaultReference(),
                        help="Specify a reference file.")

    parser.add_argument("--scratch-prefix", dest="scratch_prefix",
                        default=None,
                        help="Filename prefix for scratch report output.")

    parser.add_argument("--keep-scratch", dest="delete_scratch",
                        default=True, action="store_false",
                        help="Keep the scratch directory instead of deleting it when done.")

    parser.add_argument("--continue", dest="cont", default=False, action="store_true",
                        help="Continue from scratch space (i.e. use VCFs in there if they already exist).")

    parser.add_argument("-P", "--include-nonpass", dest="inc_nonpass", action="store_true", default=False,
                        help="Use to include failing variants in comparison.")

    parser.add_argument("--feature-table", dest="features", default=False, choices=Somatic.FeatureSet.sets.keys(),
                        help="Select a feature table to output.")

    parser.add_argument("--bam", dest="bams", default=[], action="append",
                        help="pass one or more BAM files for feature table extraction")

    parser.add_argument("--normalize-truth", dest="normalize_truth", default=False, action="store_true",
                        help="Enable running of bcftools norm on the truth file.")

    parser.add_argument("--normalize-query", dest="normalize_query", default=False, action="store_true",
                        help="Enable running of bcftools norm on the query file.")

    parser.add_argument("-N", "--normalize-all", dest="normalize_all", default=False, action="store_true",
                        help="Enable running of bcftools norm on both truth and query file.")

    # NOTE: several switches below share one dest; argparse applies the default
    # of the first action registered for a dest, so fixchr_* defaults to True.
    parser.add_argument("--fixchr-truth", dest="fixchr_truth", action="store_true", default=True,
                        help="Add chr prefix to truth file (default: true).")

    parser.add_argument("--fixchr-query", dest="fixchr_query", action="store_true", default=True,
                        help="Add chr prefix to query file (default: true).")

    parser.add_argument("--fix-chr-truth", dest="fixchr_truth", action="store_true", default=None,
                        help="Same as --fixchr-truth.")

    parser.add_argument("--fix-chr-query", dest="fixchr_query", action="store_true", default=None,
                        help="Same as --fixchr-query.")

    parser.add_argument("--no-fixchr-truth", dest="fixchr_truth", action="store_false", default=False,
                        help="Disable chr replacement for truth (default: false).")

    parser.add_argument("--no-fixchr-query", dest="fixchr_query", action="store_false", default=False,
                        help="Disable chr replacement for query (default: false).")

    parser.add_argument("--no-order-check", dest="disable_order_check", default=False, action="store_true",
                        help="Disable checking the order of TP features (dev feature).")

    parser.add_argument("--roc", dest="roc", default=None, choices=ROC.list(),
                        help="Create a ROC-style table. This is caller specific "
                             " - this will override the --feature-table switch!")

    parser.add_argument("--bin-afs", dest="af_strat", default=None, action="store_true",
                        help="Stratify into different AF buckets. This needs to have features available "
                             "for getting the AF both in truth and query variants.")
    parser.add_argument("--af-binsize", dest="af_strat_binsize", default=0.2,
                        help="Bin size for AF binning (should be < 1). Multiple bin sizes can be specified using a comma, "
                             "e.g. 0.1,0.2,0.5,0.2 will split at 0.1, 0.3, 0.8 and 1.0.")
    parser.add_argument("--af-truth", dest="af_strat_truth", default="I.T_ALT_RATE",
                        help="Feature name to use for retrieving AF for truth variants (TP and FN)")
    parser.add_argument("--af-query", dest="af_strat_query", default="T_AF",
                        help="Feature name to use for retrieving AF for query variants (FP/UNK/AMBI)")

    parser.add_argument("-FN", "--count-filtered-fn", dest="count_filtered_fn", action="store_true",
                        help="Count filtered vs. absent FN numbers. This requires the -P switch (to use all "
                             "variants) and either the --feature-table or --roc switch.")

    parser.add_argument("--fp-region-size", dest="fpr_size",
                        help="How to obtain the normalisation constant for FP rate. By default, this will use the FP region bed file size when using"
                             " --count-unk and the size of all reference contigs that overlap with the location specified in -l otherwise."
                             " This can be overridden with: 1) a number of nucleotides, or 2) \"auto\" to use the lengths of all contigs that have calls."
                             " The resulting value is used as fp.region.size.")

    parser.add_argument("--ci-level", dest="ci_level", default=0.95, type=float,
                        help="Confidence level for precision/recall confidence intervals (default: 0.95)")

    parser.add_argument("--logfile", dest="logfile", default=None,
                        help="Write logging information into file rather than to stderr")

    verbosity_options = parser.add_mutually_exclusive_group(required=False)

    verbosity_options.add_argument("--verbose", dest="verbose", default=False, action="store_true",
                                   help="Raise logging level from warning to info.")

    verbosity_options.add_argument("--quiet", dest="quiet", default=False, action="store_true",
                                   help="Set logging level to output errors only.")

    args = parser.parse_args()

    if args.verbose:
        loglevel = logging.INFO
    elif args.quiet:
        loglevel = logging.ERROR
    else:
        loglevel = logging.WARNING

    # Turn the bin size option (a float default or a comma-separated string)
    # into a real list of floats; the AF binning loop below indexes into it.
    try:
        if isinstance(args.af_strat_binsize, str):
            args.af_strat_binsize = [float(b) for b in args.af_strat_binsize.split(",")]
        else:
            args.af_strat_binsize = [float(args.af_strat_binsize)]

        if not args.af_strat_binsize:
            raise Exception("Bin size list is empty")
        # a non-positive bin size would never advance the AF binning loop
        if any(b <= 0.0 for b in args.af_strat_binsize):
            raise Exception("Bin sizes must be > 0")
    except Exception:
        logging.error("Failed to parse stratification bin size: %s" % str(args.af_strat_binsize))
        exit(1)

    # reinitialize logging
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)
    logging.basicConfig(filename=args.logfile,
                        format='%(asctime)s %(levelname)-8s %(message)s',
                        level=loglevel)

    if args.normalize_all:
        args.normalize_truth = True
        args.normalize_query = True

    if args.roc:
        # --roc replaces the ROC name by the ROC object and forces its feature table
        args.roc = ROC.make(args.roc)
        args.features = args.roc.ftname
        if not args.inc_nonpass:
            logging.warning("When creating ROCs without the -P switch, the ROC data points will only "
                            "include filtered variants (i.e. they will normally end at the caller's "
                            "quality threshold).")

    if not (args.ci_level > 0.0 and args.ci_level < 1.0):
        raise Exception("Confidence interval level must be > 0.0 and < 1.0.")

    if args.af_strat and not args.features:
        raise Exception("To stratify by AFs, a feature table must be selected -- use this switch together "
                        "with --feature-table or --roc")

    if args.count_filtered_fn and (not args.inc_nonpass or not args.features):
        raise Exception("Counting filtered / unfiltered FNs only works when a feature table is selected, "
                        "and when using unfiltered variants. Specify -P --feature-table <...> or use "
                        "--roc to select a ROC type.")

    if args.scratch_prefix:
        # a user-supplied scratch location is never deleted
        scratch = os.path.abspath(args.scratch_prefix)
        args.delete_scratch = False
        Tools.mkdir_p(scratch)
    else:
        scratch = tempfile.mkdtemp()

    logging.info("Scratch path is %s" % scratch)
    try:
        bams = []
        md = None
        for x in args.bams:
            bams.append(bamStats(x))

        if bams:
            bres = pandas.concat(bams).groupby("CHROM").mean()

            # per-chromosome value handed to fset.setChrDepths: 3x mean coverage
            md = {}

            for x in bres.index:
                logging.info("Mean coverage on %s is %f" % (x, bres.loc[x]["COVERAGE"]))
                md[x] = float(bres.loc[x]["COVERAGE"]) * 3.0

        logging.info("Normalizing/reading inputs")

        ntpath = os.path.join(scratch, "normalized_truth.vcf.gz")

        if not (args.cont and os.path.exists(ntpath)):
            preprocessVCF(args.truth, ntpath, args.location,
                          True,  # pass_only
                          args.fixchr_truth,  # chrprefix
                          args.normalize_truth,  # norm,
                          args.regions_bedfile,
                          args.targets_bedfile,
                          args.ref)
        else:
            logging.info("Continuing from %s" % ntpath)

        if not (args.cont and os.path.exists(ntpath + ".csi")):
            runBcftools("index", ntpath)

        nqpath = os.path.join(scratch, "normalized_query.vcf.gz")

        if not (args.cont and os.path.exists(nqpath)):
            preprocessVCF(args.query, nqpath, args.location,
                          not args.inc_nonpass,  # pass_only
                          args.fixchr_query,  # chrprefix
                          args.normalize_query,  # norm,
                          args.regions_bedfile,
                          args.targets_bedfile,
                          args.ref)
        else:
            logging.info("Continuing from %s" % nqpath)

        if not (args.cont and os.path.exists(nqpath + ".csi")):
            runBcftools("index", nqpath)

        logging.info("Intersecting")

        # Below, tpfn/0000 is used as FN, tpfn/0001 as the query-only calls and
        # tpfn/0002 as TP; tpfn_r is the same intersection with truth and query
        # swapped and is only needed for the feature table.
        tpfn_files = all([os.path.exists(os.path.join(scratch, "tpfn", "0000.vcf.gz")),
                          os.path.exists(os.path.join(scratch, "tpfn", "0001.vcf.gz")),
                          os.path.exists(os.path.join(scratch, "tpfn", "0002.vcf.gz"))])

        # check the reverse intersection's own directory, otherwise --continue
        # would skip creating tpfn_r whenever tpfn already exists
        tpfn_r_files = all([os.path.exists(os.path.join(scratch, "tpfn_r", "0000.vcf.gz")),
                            os.path.exists(os.path.join(scratch, "tpfn_r", "0001.vcf.gz")),
                            os.path.exists(os.path.join(scratch, "tpfn_r", "0002.vcf.gz"))])

        if not (args.cont and tpfn_files):
            runBcftools("isec", ntpath, nqpath, "-p", os.path.join(scratch, "tpfn"), "-O", "z")
        else:
            logging.info("Continuing from %s" % os.path.join(scratch, "tpfn"))

        if args.features and not (args.cont and tpfn_r_files):
            # only need to do this for getting the feature table
            runBcftools("isec", nqpath, ntpath, "-p", os.path.join(scratch, "tpfn_r"), "-O", "z")

        logging.info("Getting FPs / Ambi / Unk")

        fppath = os.path.join(scratch, "fp.vcf.gz")
        unkpath = os.path.join(scratch, "unk.vcf.gz")
        ambipath = os.path.join(scratch, "ambi.vcf.gz")

        # get header to print to unk and ambi VCFs
        rununiquepath = os.path.join(scratch, "tpfn", "0001.vcf.gz")
        header = runBcftools("view", rununiquepath, "--header-only")

        fp = Tools.BGZipFile(fppath, True)
        fp.write(header)

        unk = Tools.BGZipFile(unkpath, True)
        unk.write(header)

        ambi = Tools.BGZipFile(ambipath, True)
        ambi.write(header)

        ambiClasses = Counter()
        ambiReasons = Counter()

        fpclasses = BedIntervalTree()
        if args.ambi:
            # can have multiple ambiguous BED files
            for aBED in args.ambi:
                # auto-label from first value after chr start end
                # new ambi files have the label in position 4
                # old ones will look weird here.
                fpclasses.addFromBed(aBED, lambda xe: xe[4], args.fixchr_truth)

        if args.FP:
            fpclasses.addFromBed(args.FP, "FP", args.fixchr_truth)

        # split VCF into FP, UNK and AMBI
        toProcess = gzip.open(rununiquepath, "rb")
        for entry in toProcess:
            if entry[0] == '#':
                continue

            fields = entry.strip().split("\t")
            chrom = fields[0]
            start = int(fields[1])
            # interval end = POS + length of REF
            stop = int(fields[1]) + len(fields[3])

            overlap = fpclasses.intersect(chrom, start, stop)

            is_fp = False
            is_ambi = False

            classes_this_pos = set()

            for o in overlap:
                reason = o.value[0]
                if reason == "fp" and args.ambi_fp:
                    reason = "FP"
                elif reason == "fp":
                    reason = "ambi-fp"
                elif reason == "unk":
                    reason = "ambi-unk"

                classes_this_pos.add(reason)
                try:
                    ambiReasons["%s: rep. count %s" % (reason, o.value[1])] += 1
                except IndexError:
                    ambiReasons["%s: rep. count *" % reason] += 1
                for x in o.value[3:]:
                    ambiReasons["%s: %s" % (reason, x)] += 1
                if reason == "FP":
                    is_fp = True
                else:
                    is_ambi = True

            for reason in classes_this_pos:
                ambiClasses[reason] += 1

            # FP wins over AMBI when a record overlaps both kinds of region
            if is_fp:
                fp.write(entry)
            elif is_ambi:
                ambi.write(entry)
            elif not args.count_unk:
                # when we don't have FP regions, unk stuff becomes FP
                fp.write(entry)
            else:
                unk.write(entry)

        toProcess.close()

        # since 0001.vcf.gz should already be sorted, we can just convert to bgzipped vcf
        # and create index
        fp.close()
        ambi.close()
        unk.close()

        runBcftools("index", "--tbi", fppath)
        runBcftools("index", "--tbi", unkpath)
        runBcftools("index", "--tbi", ambipath)

        logging.info("Counting variants...")

        truthcounts = parseStats(runBcftools("stats", ntpath), "total.truth")
        querycounts = parseStats(runBcftools("stats", nqpath), "total.query")

        tpcounts = parseStats(runBcftools("stats", os.path.join(scratch, "tpfn", "0002.vcf.gz")), "tp")
        fncounts = parseStats(runBcftools("stats", os.path.join(scratch, "tpfn", "0000.vcf.gz")), "fn")
        fpcounts = parseStats(runBcftools("stats", fppath), "fp")
        ambicounts = parseStats(runBcftools("stats", ambipath), "ambi")
        unkcounts = parseStats(runBcftools("stats", unkpath), "unk")

        res = pandas.merge(truthcounts, querycounts, on="type")
        res = pandas.merge(res, tpcounts, on="type")
        res = pandas.merge(res, fpcounts, on="type")
        res = pandas.merge(res, fncounts, on="type")
        res = pandas.merge(res, unkcounts, on="type")
        res = pandas.merge(res, ambicounts, on="type")

        # no explicit guarantee that total.query is equal to unk + ambi + fp + tp
        # testSum = res["fp"] + res["tp"] + res["unk"] + res["ambi"]

        # filter and relabel
        res = res[res["type"] != "samples"]
        res = res[res["type"] != "multiallelic SNP sites"]
        res = res[res["type"] != "multiallelic sites"]
        res.loc[res["type"] == "SNPs", "type"] = "SNVs"

        metrics_output = makeMetricsObject("som.py.comparison")

        if args.ambi and args.explain_ambiguous:
            ac = list(ambiClasses.items())
            if ac:
                ambie = pandas.DataFrame(ac, columns=["class", "count"])
                ambie.sort(["class"], inplace=True)
                pandas.set_option("display.max_rows", 1000)
                pandas.set_option("display.max_columns", 1000)
                pandas.set_option("display.width", 1000)
                pandas.set_option("display.height", 1100)
                logging.info("FP/ambiguity classes with info (multiple classes can "
                             "overlap):\n" + ambie.to_string(index=False))
                # in default mode, print result summary to stdout
                if not args.quiet and not args.verbose:
                    print("FP/ambiguity classes with info (multiple classes can "
                          "overlap):\n" + ambie.to_string(index=False))
                ambie.to_csv(args.output + ".ambiclasses.csv")
                metrics_output["metrics"].append(dataframeToMetricsTable("ambiclasses", ambie))
            else:
                logging.info("No ambiguous variants.")

            ar = list(ambiReasons.items())
            if ar:
                ambie = pandas.DataFrame(ar, columns=["reason", "count"])
                ambie.sort(["reason"], inplace=True)
                pandas.set_option("display.max_rows", 1000)
                pandas.set_option("display.max_columns", 1000)
                pandas.set_option("display.width", 1000)
                pandas.set_option("display.height", 1100)
                logging.info("Reasons for defining as ambiguous (multiple reasons can overlap):\n" + ambie.to_string(
                    formatters={'reason': '{{:<{}s}}'.format(ambie['reason'].str.len().max()).format}, index=False))
                # in default mode, print result summary to stdout
                if not args.quiet and not args.verbose:
                    print("Reasons for defining as ambiguous (multiple reasons can overlap):\n" + ambie.to_string(
                        formatters={'reason': '{{:<{}s}}'.format(ambie['reason'].str.len().max()).format}, index=False))
                ambie.to_csv(args.output + ".ambireasons.csv")
                metrics_output["metrics"].append(dataframeToMetricsTable("ambireasons", ambie))
            else:
                logging.info("No ambiguous variants.")

        if args.features:
            logging.info("Extracting features...")
            fset = Somatic.FeatureSet.make(args.features)
            fset.setChrDepths(md)

            logging.info("Collecting TP info (1)...")
            tps = fset.collect(os.path.join(scratch, "tpfn", "0002.vcf.gz"), "TP")

            # TP_r is a hint for fset, they are both TPs
            logging.info("Collecting TP info (2)...")
            tps2 = fset.collect(os.path.join(scratch, "tpfn_r", "0002.vcf.gz"), "TP_r")

            # this is slow because it tries to sort
            # ... which we don't need to do since tps1 and tps2 have the same ordering

            logging.info("Sorting...")
            tps.sort(["CHROM", "POS"], inplace=True)
            tps2.sort(["CHROM", "POS"], inplace=True)
            tps = tps.reset_index(drop=True)
            tps2 = tps2.reset_index(drop=True)

            logging.info("Merging TP info...")
            columns_tps = list(tps)
            columns_tps2 = list(tps2)

            len1 = tps.shape[0]
            len2 = tps2.shape[0]

            if len1 != len2:
                raise Exception("Cannot read TP features, lists have different lengths : %i != %i" % (len1, len2))

            if not args.disable_order_check:
                logging.info("Checking order %i / %i" % (len1, len2))

                for x in xrange(0, len1):
                    for a in ["CHROM", "POS"]:
                        if tps.loc[x][a] != tps2.loc[x][a]:
                            raise Exception("Cannot merge TP features, inputs are out of order at %s / %s" % (
                                str(tps[x:x + 1]), str(tps2[x:x + 1])))

            logging.info("Merging...")

            cdata = {
                "CHROM": tps["CHROM"],
                "POS": tps["POS"],
                "tag": tps["tag"]
            }

            tpc = pandas.DataFrame(cdata, columns=["CHROM", "POS", "tag"])

            # Columns present in both tables are kept twice: the tps2 (TP_r)
            # value under the plain name and the tps (TP) value as "<name>.truth".
            all_columns = list(set(columns_tps + columns_tps2))
            for a in all_columns:
                if a in columns_tps and a not in columns_tps2:
                    tpc[a] = tps[a]
                elif a not in columns_tps and a in columns_tps2:
                    tpc[a] = tps2[a]
                elif a not in ["CHROM", "POS", "tag"]:
                    tpc[a] = tps2[a]
                    tpc[a + ".truth"] = tps[a]

            logging.info("Collecting FP info...")
            fps = fset.collect(fppath, "FP")
            ambs = fset.collect(ambipath, "AMBI")

            logging.info("Collecting FN info...")
            fns = fset.collect(os.path.join(scratch, "tpfn", "0000.vcf.gz"), "FN")

            # FN columns that have a ".truth" counterpart in the TP table are
            # renamed so that they end up in the same column
            renamed = {}
            tp_cols = list(tpc)
            for col in list(fns):
                if col + ".truth" in tp_cols:
                    renamed[col] = col + ".truth"
            fns.rename(columns=renamed, inplace=True)

            featurelist = [tpc, fps, fns, ambs]

            if unkpath is not None:
                logging.info("Collecting UNK info...")
                unk = fset.collect(unkpath, "UNK")
                featurelist.append(unk)

            logging.info("Making feature table...")
            featuretable = pandas.concat(featurelist)

            # reorder to make more legible
            first_columns = ["CHROM", "POS", "tag"]
            # noinspection PyTypeChecker
            all_columns = list(featuretable)

            if "REF" in all_columns:
                first_columns.append("REF")

            if "REF.truth" in all_columns:
                first_columns.append("REF.truth")

            if "ALT" in all_columns:
                first_columns.append("ALT")

            if "ALT.truth" in all_columns:
                first_columns.append("ALT.truth")

            ordered_columns = first_columns + sorted([x for x in all_columns if x not in first_columns])
            featuretable = featuretable[ordered_columns]
            # make sure positions are integers
            featuretable["POS"] = featuretable["POS"].astype(int)

            logging.info("Saving feature table...")
            featuretable.to_csv(args.output + ".features.csv", float_format='%.8f')

            if args.roc is not None:
                roc_table = args.roc.from_table(featuretable)
                roc_table.to_csv(args.output + ".roc.csv", float_format='%.8f')

            featuretable["FILTER"].fillna("", inplace=True)
            featuretable.ix[featuretable["REF"].str.len() < 1, "absent"] = True
            # FN rows take their REF / ALT from the ".truth" columns
            featuretable.ix[featuretable["tag"] == "FN", "REF"] = featuretable.ix[featuretable["tag"] == "FN",
                                                                                  "REF.truth"]
            featuretable.ix[featuretable["tag"] == "FN", "ALT"] = featuretable.ix[featuretable["tag"] == "FN",
                                                                                  "ALT.truth"]
            af_t_feature = args.af_strat_truth
            af_q_feature = args.af_strat_query
            for vtype in ["records", "SNVs", "indels"]:
                if vtype == "SNVs":
                    featuretable_this_type = featuretable[(featuretable["REF"].str.len() > 0) &
                                                          (featuretable["ALT"].str.len() ==
                                                           featuretable["REF"].str.len())]
                elif vtype == "indels":
                    featuretable_this_type = featuretable[(featuretable["REF"].str.len() != 1) |
                                                          (featuretable["ALT"].str.len() != 1)]
                else:
                    featuretable_this_type = featuretable

                if args.count_filtered_fn:
                    res.ix[res["type"] == vtype, "fp.filtered"] = featuretable_this_type[
                        (featuretable_this_type["tag"] == "FP") & (featuretable_this_type["FILTER"] != "")].shape[0]
                    res.ix[res["type"] == vtype, "tp.filtered"] = featuretable_this_type[
                        (featuretable_this_type["tag"] == "TP") & (featuretable_this_type["FILTER"] != "")].shape[0]
                    res.ix[res["type"] == vtype, "unk.filtered"] = featuretable_this_type[
                        (featuretable_this_type["tag"] == "UNK") & (featuretable_this_type["FILTER"] != "")].shape[0]
                    res.ix[res["type"] == vtype, "ambi.filtered"] = featuretable_this_type[
                        (featuretable_this_type["tag"] == "AMBI") & (featuretable_this_type["FILTER"] != "")].shape[0]

                if args.af_strat:
                    # walk [0, 1] in steps taken cyclically from the bin size
                    # list; one extra result row is appended per AF bin
                    start = 0.0
                    current_binsize = args.af_strat_binsize[0]
                    next_binsize = 0
                    while start < 1.0:
                        # include 1 in last interval
                        end = min(1.000000001, start + current_binsize)
                        n_tp = featuretable_this_type[(featuretable_this_type["tag"] == "TP") &
                                                      (featuretable_this_type[af_t_feature] >= start) &
                                                      (featuretable_this_type[af_t_feature] < end)]
                        n_fn = featuretable_this_type[(featuretable_this_type["tag"] == "FN") &
                                                      (featuretable_this_type[af_t_feature] >= start) &
                                                      (featuretable_this_type[af_t_feature] < end)]
                        n_fp = featuretable_this_type[(featuretable_this_type["tag"] == "FP") &
                                                      (featuretable_this_type[af_q_feature] >= start) &
                                                      (featuretable_this_type[af_q_feature] < end)]
                        n_ambi = featuretable_this_type[(featuretable_this_type["tag"] == "AMBI") &
                                                        (featuretable_this_type[af_q_feature] >= start) &
                                                        (featuretable_this_type[af_q_feature] < end)]
                        n_unk = featuretable_this_type[(featuretable_this_type["tag"] == "UNK") &
                                                       (featuretable_this_type[af_q_feature] >= start) &
                                                       (featuretable_this_type[af_q_feature] < end)]

                        r = {"type": "%s.%f-%f" % (vtype, start, end),
                             "total.truth": n_tp.shape[0] + n_fn.shape[0],
                             "total.query": n_tp.shape[0] + n_fp.shape[0] + n_ambi.shape[0] + n_unk.shape[0],
                             "tp": n_tp.shape[0],
                             "fp": n_fp.shape[0],
                             "fn": n_fn.shape[0],
                             "unk": n_unk.shape[0],
                             "ambi": n_ambi.shape[0], }

                        if args.count_filtered_fn:
                            r["fp.filtered"] = n_fp[n_fp["FILTER"] != ""].shape[0]
                            r["tp.filtered"] = n_tp[n_tp["FILTER"] != ""].shape[0]
                            r["unk.filtered"] = n_unk[n_unk["FILTER"] != ""].shape[0]
                            r["ambi.filtered"] = n_ambi[n_ambi["FILTER"] != ""].shape[0]

                        res = pandas.concat([res, pandas.DataFrame([r])])

                        if args.roc is not None and (n_tp.shape[0] + n_fn.shape[0] + n_fp.shape[0]) > 0:
                            roc_table_strat = args.roc.from_table(pandas.concat([n_tp, n_fp, n_fn]))
                            rtname = "%s.%s.%f-%f.roc.csv" % (args.output, vtype, start, end)
                            roc_table_strat.to_csv(rtname, float_format='%.8f')
                        start += current_binsize
                        next_binsize += 1
                        if next_binsize >= len(args.af_strat_binsize):
                            next_binsize = 0
                        current_binsize = args.af_strat_binsize[next_binsize]

        # remove things where we haven't seen any variants in truth and query
        res = res[(res["total.truth"] > 0) & (res["total.query"] > 0)]
        # summary metrics with confidence intervals
        ci_alpha = 1.0 - args.ci_level
        recall = binomialCI(res["tp"], res["tp"]+res["fn"], ci_alpha)
        precision = binomialCI(res["tp"], res["tp"]+res["fp"], ci_alpha)
        res["recall"], res["recall_lower"], res["recall_upper"] = recall
        res["recall2"] = res["tp"] / (res["total.truth"])
        res["precision"], res["precision_lower"], res["precision_upper"] = precision
        res["na"] = res["unk"] / (res["total.query"])
        res["ambiguous"] = res["ambi"] / res["total.query"]

        any_fp = fpclasses.countbases(label="FP")

        fp_region_count = 0
        auto_size = True
        if args.fpr_size:
            # a numeric --fp-region-size is used as is; anything else
            # (e.g. "auto") falls through to the automatic sizing below
            try:
                fp_region_count = int(args.fpr_size)
                auto_size = False
            except:
                pass
        if auto_size:
            if any_fp:
                if args.location:
                    chrom, _, rest = args.location.partition(":")
                    if rest:
                        # NOTE(review): start / end are parsed here but never
                        # used, so fp_region_count stays 0 for locations with
                        # a range part. The "_" separator also looks
                        # suspicious -- confirm the expected location syntax.
                        start, _, end = rest.partition("_")
                        if start:
                            start = int(start)
                        if end:
                            end = int(end)
                    else:
                        fp_region_count += fpclasses.countbases(chrom, label="FP")
                else:
                    fp_region_count = any_fp
            else:
                cs = fastaContigLengths(args.ref)
                if args.location:
                    fp_region_count = calculateLength(cs, args.location)
                else:
                    # use all locations we saw calls on
                    h1 = Tools.vcfextract.extractHeadersJSON(ntpath)
                    h1_chrs = h1["tabix"]["chromosomes"]
                    if not h1_chrs:
                        logging.warning("ntpath is empty")
                        h1_chrs = []

                    h2 = Tools.vcfextract.extractHeadersJSON(nqpath)
                    h2_chrs = h2["tabix"]["chromosomes"]
                    if not h2_chrs:
                        logging.warning("nqpath is empty")
                        h2_chrs = []

                    combined_chrs = list(set(h1_chrs + h2_chrs))
                    if len(combined_chrs) > 0:
                        qlocations = " ".join(combined_chrs)
                        fp_region_count = calculateLength(cs, qlocations)
                    else:
                        fp_region_count = 0

        res["fp.region.size"] = fp_region_count
        # FP rate is reported per 1e6 bases of fp.region.size
        res["fp.rate"] = 1e6 * res["fp"] / res["fp.region.size"]

        if args.count_filtered_fn:
            res["recall.filtered"] = (res["tp"] - res["tp.filtered"]) / (res["tp"] + res["fn"])

            res["precision.filtered"] = (res["tp"] - res["tp.filtered"]) / (res["tp"] - res["tp.filtered"] +
                                                                            res["fp"] - res["fp.filtered"])

            res["fp.rate.filtered"] = 1e6 * (res["fp"] - res["fp.filtered"]) / res["fp.region.size"]

            res["na.filtered"] = (res["unk"] - res["unk.filtered"]) / (res["total.query"])
            res["ambiguous.filtered"] = (res["ambi"] - res["ambi.filtered"]) / res["total.query"]

        # HAP-162 remove inf values
        # (DataFrame.replace returns a new frame, so the result must be kept)
        res = res.replace([np.inf, -np.inf], 0)
        metrics_output["metrics"].append(dataframeToMetricsTable("result", res))
        vstring = "som.py-%s" % Tools.version

        logging.info("\n" + res.to_string())
        # in default mode, print result summary to stdout
        if not args.quiet and not args.verbose:
            print("\n" + res.to_string())

        res["sompyversion"] = vstring

        vstring = " ".join(sys.argv)
        res["sompycmd"] = vstring
        res.to_csv(args.output + ".stats.csv")
        with open(args.output + ".metrics.json", "w") as fp:
            json.dump(metrics_output, fp)

    finally:
        if args.delete_scratch:
            shutil.rmtree(scratch)
        else:
            logging.info("Scratch kept at %s" % scratch)
Exemple #5
0
def quantify(args):
    """Run quantify on a VCF and write the summary / metrics tables.

    Reads its whole configuration from attributes of ``args``:

    * ``in_vcf[0]`` -- path of the VCF to quantify (must exist).
    * ``reports_prefix`` -- prefix for every output file name.
    * ``bcf`` and, when present, ``vcf1`` / ``vcf2`` -- select ``.bcf``
      versus ``.vcf.gz`` as the suffix of the annotated output VCF.
    * ``fp_bedfile``, ``strat_tsv``, ``strat_regions`` -- region files,
      collected into a ``{region id: path}`` dict for run_quantify.
    * the remaining ROC / output switches are forwarded unchanged to
      ``Haplo.quantify.run_quantify`` and ``Haplo.happyroc.roc``.

    Files written by this function itself:

    * ``<prefix>.summary.csv`` -- always.
    * ``<prefix>.extended.csv`` -- only when ``args.write_counts`` is set.
    * ``<prefix>.metrics.json.gz`` -- only when ``args.write_json`` is set.

    ``<prefix>.roc.tsv`` is handed to run_quantify as the ROC table and is
    removed again afterwards unless ``args.verbose`` is set.

    Raises:
        Exception: if the input VCF cannot be read, a region file is
            missing, a stratification region ID is duplicated or has no
            file, or the output VCF name would collide with the input.
    """
    vcf_name = args.in_vcf[0]

    if not vcf_name or not os.path.exists(vcf_name):
        raise Exception("Cannot read input VCF.")

    logging.info("Counting variants...")

    # NOTE(review): despite the name, this is only True when BOTH inputs
    # end in ".bcf" -- confirm that this is the intended condition.
    truth_or_query_is_bcf = False
    try:
        truth_or_query_is_bcf = args.vcf1.endswith(
            ".bcf") and args.vcf2.endswith(".bcf")
    except AttributeError:
        # args.vcf1 and args.vcf2 are only available when we're running
        # inside hap.py.
        pass

    if args.bcf or truth_or_query_is_bcf:
        internal_format_suffix = ".bcf"
    else:
        internal_format_suffix = ".vcf.gz"

    output_vcf = args.reports_prefix + internal_format_suffix
    roc_table = args.reports_prefix + ".roc.tsv"

    # region id -> bed file path
    qfyregions = {}

    if args.fp_bedfile:
        if not os.path.exists(args.fp_bedfile):
            raise Exception("FP / Confident region file not found at %s" %
                            args.fp_bedfile)
        qfyregions["CONF"] = args.fp_bedfile

    if args.strat_tsv:
        # one "<id>\t<bed file>" pair per line; blank lines are skipped
        strat_dir = os.path.abspath(os.path.dirname(args.strat_tsv))
        with open(args.strat_tsv) as sf:
            for l in sf:
                n, _, f = l.strip().partition("\t")
                if n in qfyregions:
                    raise Exception("Duplicate stratification region ID: %s" %
                                    n)
                if not f:
                    if n:
                        raise Exception(
                            "No file for stratification region %s" % n)
                    else:
                        continue
                if not os.path.exists(f):
                    # retry relative to the directory of the TSV itself
                    f = os.path.join(strat_dir, f)
                if not os.path.exists(f):
                    raise Exception("Quantification region file %s not found" %
                                    f)
                qfyregions[n] = f

    if args.strat_regions:
        # each entry has the form "<id>:<bed file>"
        for r in args.strat_regions:
            n, _, f = r.partition(":")
            if not os.path.exists(f):
                raise Exception("Quantification region file %s not found" % f)
            qfyregions[n] = f

    if vcf_name == output_vcf or vcf_name == output_vcf + internal_format_suffix:
        raise Exception(
            "Cannot overwrite input VCF: %s would be overwritten with output name %s."
            % (vcf_name, output_vcf))

    # roc_header falls back to the ROC feature when the attribute is absent
    roc_header = args.roc
    try:
        roc_header = args.roc_header
    except AttributeError:
        pass

    Haplo.quantify.run_quantify(vcf_name,
                                roc_table,
                                output_vcf if args.write_vcf else False,
                                qfyregions,
                                args.ref,
                                threads=args.threads,
                                output_vtc=args.output_vtc,
                                output_rocs=args.do_roc,
                                qtype=args.type,
                                roc_val=args.roc,
                                roc_header=roc_header,
                                roc_filter=args.roc_filter,
                                roc_delta=args.roc_delta,
                                roc_regions=args.roc_regions,
                                clean_info=not args.preserve_info,
                                strat_fixchr=args.strat_fixchr)

    metrics_output = makeMetricsObject("%s.comparison" % args.runner)

    filter_handling = None
    try:
        if args.engine == "vcfeval" or not args.usefiltered:
            filter_handling = "ALL" if args.usefiltered else "PASS"
    except AttributeError:
        # if we run this through qfy, these arguments are not present
        pass

    total_region_size = None
    headers = Tools.vcfextract.extractHeadersJSON(vcf_name)
    try:
        contigs_to_use = ",".join(headers["tabix"]["chromosomes"])
        contig_lengths = fastasize.fastaNonNContigLengths(args.ref)
        total_region_size = fastasize.calculateLength(contig_lengths,
                                                      contigs_to_use)
        logging.info("Subset.Size for * is %i, based on these contigs: %s " %
                     (total_region_size, str(contigs_to_use)))
    except Exception as e:
        # Best effort only: keep going with whatever total_region_size holds,
        # but leave a trace instead of failing silently.
        logging.info("Could not determine the total region size: %s" % str(e))

    res = Haplo.happyroc.roc(roc_table,
                             args.reports_prefix + ".roc",
                             filter_handling=filter_handling,
                             ci_alpha=args.ci_alpha,
                             total_region_size=total_region_size)
    df = res["all"]

    # only use summary numbers
    df = df[(df["QQ"] == "*") & (df["Filter"].isin(["ALL", "PASS"]))]

    summary_columns = [
        "Type",
        "Filter",
        "TRUTH.TOTAL",
        "TRUTH.TP",
        "TRUTH.FN",
        "QUERY.TOTAL",
        "QUERY.FP",
        "QUERY.UNK",
        "FP.gt",
        "METRIC.Recall",
        "METRIC.Precision",
        "METRIC.Frac_NA",
        "METRIC.F1_Score",
        "TRUTH.TOTAL.TiTv_ratio",
        "QUERY.TOTAL.TiTv_ratio",
        "TRUTH.TOTAL.het_hom_ratio",
        "QUERY.TOTAL.het_hom_ratio",
    ]

    # Remove subtype
    summary_df = df[(df["Subtype"] == "*") & (df["Genotype"] == "*") &
                    (df["Subset"] == "*")]

    summary_df[summary_columns].to_csv(args.reports_prefix + ".summary.csv",
                                       index=False)

    metrics_output["metrics"].append(
        dataframeToMetricsTable("summary.metrics",
                                summary_df[summary_columns]))

    if args.write_counts:
        df.to_csv(args.reports_prefix + ".extended.csv", index=False)
        metrics_output["metrics"].append(
            dataframeToMetricsTable("all.metrics", df))

    essential_numbers = summary_df[summary_columns]

    # widen the pandas display limits so to_string() does not truncate
    pandas.set_option('display.max_columns', 500)
    pandas.set_option('display.width', 1000)

    essential_numbers = essential_numbers[essential_numbers["Type"].isin(
        ["SNP", "INDEL"])]

    logging.info("\n" + essential_numbers.to_string(index=False))

    # in default mode, print result summary to stdout
    # (single-argument print(...) behaves the same on Python 2 and 3)
    if not args.quiet and not args.verbose:
        print("Benchmarking Summary:")
        print(essential_numbers.to_string(index=False))

    # keep this for verbose output
    if not args.verbose:
        try:
            os.unlink(roc_table)
        except OSError:
            # the table may never have been written; nothing to clean up
            pass

    # iterating the result mapping directly yields its keys
    for t in res:
        metrics_output["metrics"].append(
            dataframeToMetricsTable("roc." + t, res[t]))

    # gzip JSON output
    # NOTE(review): on Python 3, json.dump writes str while gzip mode "w" is
    # binary -- this call would need a text mode there; left as-is for Python 2.
    if args.write_json:
        with gzip.open(args.reports_prefix + ".metrics.json.gz", "w") as fp:
            json.dump(metrics_output, fp)
Exemple #6
0
def main():
    parser = argparse.ArgumentParser("Haplotype Comparison")

    # input
    parser.add_argument(
        '--location',
        '-l',
        dest='locations',
        required=False,
        default=None,
        help=
        'Add a location to the compare list (when not given, will use chr1-22, chrX, chrY).'
    )

    parser.add_argument("-v",
                        "--version",
                        dest="version",
                        action="store_true",
                        help="Show version number and exit.")

    parser.add_argument(
        "-P",
        "--include-nonpass",
        dest="usefiltered",
        action="store_true",
        default=False,
        help="Use to include failing query variants in comparison.")

    parser.add_argument(
        "--include-nonpass-truth",
        dest="usefiltered_truth",
        action="store_true",
        default=False,
        help="Include failing variants from the truth dataset.")

    parser.add_argument(
        "-R",
        "--restrict-regions",
        dest="regions_bedfile",
        default=None,
        type=str,
        help=
        "Restrict analysis to given (sparse) regions (using -R in bcftools).")

    parser.add_argument(
        "-T",
        "--target-regions",
        dest="targets_bedfile",
        default=None,
        type=str,
        help=
        "Restrict analysis to given (dense) regions (using -T in bcftools).")

    parser.add_argument(
        "-f",
        "--false-positives",
        dest="fp_bedfile",
        default=None,
        type=str,
        help="False positive / confident call regions (.bed or .bed.gz).")

    parser.add_argument("-r",
                        "--reference",
                        dest="ref",
                        default=None,
                        help="Specify a reference file.")

    # output
    parser.add_argument("-o",
                        "--report-prefix",
                        dest="reports_prefix",
                        default=None,
                        help="Filename prefix for report output.")

    parser.add_argument("-V",
                        "--write-vcf",
                        dest="write_vcf",
                        default=False,
                        action="store_true",
                        help="Write an annotated VCF.")

    parser.add_argument(
        "-B",
        "--write-bed",
        dest="write_bed",
        default=False,
        action="store_true",
        help="Write a bed file with the haplotype blocks that were used.")

    parser.add_argument("-X",
                        "--write-counts",
                        dest="write_counts",
                        default=True,
                        action="store_true",
                        help="Write advanced counts and metrics.")

    parser.add_argument("--no-write-counts",
                        dest="write_counts",
                        default=True,
                        action="store_false",
                        help="Do not write advanced counts and metrics.")

    parser.add_argument(
        "--raw-counts",
        dest="raw_counts",
        default=False,
        action="store_true",
        help=
        "Count variants in unprocessed input VCFs and output as TOTAL.*.RAW.")

    parser.add_argument(
        "--roc",
        dest="roc",
        default=False,
        help="Select an INFO feature to produce a ROC on. This works best with "
        "--no-internal-preprocessing and --no-internal-leftshift since these "
        "flags preserve the most INFO flags from the input files.")

    parser.add_argument("--roc-filter",
                        dest="roc_filter",
                        default=False,
                        help="Select a filter to ignore when making ROCs.")

    parser.add_argument(
        "--roc-reversed",
        dest="roc_reversed",
        default=False,
        help=
        "Change the meaning of the ROC feature to count the other way around (higher values=bad)."
    )

    parser.add_argument("--scratch-prefix",
                        dest="scratch_prefix",
                        default=None,
                        help="Directory for scratch files.")

    parser.add_argument("--keep-scratch",
                        dest="delete_scratch",
                        default=True,
                        action="store_false",
                        help="Filename prefix for scratch report output.")

    # detailed control of comparison
    parser.add_argument("--preprocess-truth",
                        dest="preprocessing_truth",
                        action="store_true",
                        default=False,
                        help="Preprocess truth file using bcftools.")

    parser.add_argument("--external-preprocessing",
                        dest="preprocessing",
                        action="store_true",
                        default=False,
                        help="Perform VCF preprocessing using bcftools.")

    parser.add_argument(
        "--bcftools-norm",
        dest="preprocessing_norm",
        action="store_true",
        default=False,
        help=
        "Enable preprocessing through bcftools norm -c x -D (requires external "
        " preprocessing to be switched on).")

    parser.add_argument("--fixchr-truth",
                        dest="fixchr_truth",
                        action="store_true",
                        default=None,
                        help="Add chr prefix to truth file (default: auto).")

    parser.add_argument("--fixchr-query",
                        dest="fixchr_query",
                        action="store_true",
                        default=None,
                        help="Add chr prefix to query file (default: auto).")

    parser.add_argument(
        "--no-fixchr-truth",
        dest="fixchr_truth",
        action="store_false",
        help="Disable chr replacement for truth (default: auto).")

    parser.add_argument("--no-fixchr-query",
                        dest="fixchr_query",
                        action="store_false",
                        help="Add chr prefix to query file (default: auto).")

    parser.add_argument(
        "--partial-credit",
        dest="partial_credit",
        action="store_true",
        default=None,
        help="give credit for partially matched variants. "
        "this is equivalent to --internal-leftshift and --internal-preprocessing."
    )

    parser.add_argument(
        "--no-partial-credit",
        dest="partial_credit",
        action="store_false",
        default=None,
        help="Give credit for partially matched variants. "
        "This is equivalent to --internal-leftshift and --no-internal-preprocessing."
    )

    parser.add_argument(
        "--internal-leftshift",
        dest="int_preprocessing_ls",
        action="store_true",
        default=None,
        help="Switch off xcmp's internal VCF leftshift preprocessing.")

    parser.add_argument(
        "--internal-preprocessing",
        dest="int_preprocessing",
        action="store_true",
        default=None,
        help="Switch off xcmp's internal VCF leftshift preprocessing.")

    parser.add_argument(
        "--no-internal-leftshift",
        dest="int_preprocessing_ls",
        action="store_false",
        default=None,
        help="Switch off xcmp's internal VCF leftshift preprocessing.")

    parser.add_argument(
        "--no-internal-preprocessing",
        dest="int_preprocessing",
        action="store_false",
        default=None,
        help="Switch off xcmp's internal VCF leftshift preprocessing.")

    parser.add_argument(
        "--match-raw",
        dest="int_match_raw",
        action="store_true",
        default=False,
        help=
        "Add a matching step in xcmp which also matches raw variant calls. This helps"
        " when comparing files with very different representations.")

    parser.add_argument(
        "--no-haplotype-comparison",
        dest="no_hc",
        action="store_true",
        default=False,
        help=
        "Disable haplotype comparison (only count direct GT matches as TP).")

    parser.add_argument(
        "--unhappy",
        dest="unhappy",
        action="store_true",
        default=False,
        help=
        "Combination of --no-haplotype-comparison --no-internal-preprocessing "
        "--no-internal-leftshift.")

    parser.add_argument(
        "--no-auto-index",
        dest="auto_index",
        action="store_false",
        default=True,
        help="Disable automatic index creation for input files. "
        "The index is only necessary at this stage if we want to auto-detect locations. "
        "When used with -l, and when it is known that there are variants at all given locations "
        "this is not needed and can be switched off to save time.")

    parser.add_argument(
        "-w",
        "--window-size",
        dest="window",
        default=50,
        type=int,
        help=
        "Minimum distance between two variants such that they fall into different haplotype "
        "blocks")

    parser.add_argument(
        "--enumeration-threshold",
        dest="max_enum",
        default=16768,
        type=int,
        help=
        "Enumeration threshold / maximum number of sequences to enumerate per block."
    )

    parser.add_argument(
        "-e",
        "--expand-hapblocks",
        dest="hb_expand",
        default=30,
        type=int,
        help="Expand haplotype blocks by this many basepairs left and right.")
    parser.add_argument("--threads",
                        dest="threads",
                        default=multiprocessing.cpu_count(),
                        type=int,
                        help="Number of threads to use.")

    parser.add_argument("--engine",
                        dest="engine",
                        default="xcmp",
                        choices=["xcmp", "vcfeval"],
                        help="Comparison engine to use.")

    parser.add_argument(
        "--engine-vcfeval-path",
        dest="engine_vcfeval",
        required=False,
        help="This parameter should give the path to the \"rtg\" executable.")
    parser.add_argument(
        "--engine-vcfeval-template",
        dest="engine_vcfeval_template",
        required=False,
        help=
        "Vcfeval needs the reference sequence formatted in its own file format "
        "(SDF -- run rtg format -o ref.SDF ref.fa).")

    if Tools.has_sge:
        parser.add_argument(
            "--force-interactive",
            dest="force_interactive",
            default=False,
            action="store_true",
            help=
            "Force running interactively (i.e. when JOB_ID is not in the environment)"
        )

    parser.add_argument("_vcfs", help="Two VCF files.", default=[], nargs="*")

    parser.add_argument(
        "--logfile",
        dest="logfile",
        default=None,
        help="Write logging information into file rather than to stderr")

    verbosity_options = parser.add_mutually_exclusive_group(required=False)

    verbosity_options.add_argument(
        "--verbose",
        dest="verbose",
        default=False,
        action="store_true",
        help="Raise logging level from warning to info.")

    verbosity_options.add_argument(
        "--quiet",
        dest="quiet",
        default=False,
        action="store_true",
        help="Set logging level to output errors only.")

    args, unknown_args = parser.parse_known_args()

    if not Tools.has_sge:
        args.force_interactive = True

    if args.verbose:
        loglevel = logging.INFO
    elif args.quiet:
        loglevel = logging.ERROR
    else:
        loglevel = logging.WARNING

    # reinitialize logging
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)
    logging.basicConfig(filename=args.logfile,
                        format='%(asctime)s %(levelname)-8s %(message)s',
                        level=loglevel)

    # remove some safe unknown args
    unknown_args = [
        x for x in unknown_args if x not in ["--force-interactive"]
    ]
    if len(sys.argv) < 2 or len(unknown_args) > 0:
        if unknown_args:
            logging.error("Unknown arguments specified : %s " %
                          str(unknown_args))
        parser.print_help()
        exit(0)

    if args.version:
        print "Hap.py %s" % Tools.version
        exit(0)

    if args.roc:
        args.write_vcf = True

    # disable all clever matching
    if args.unhappy:
        args.int_preprocessing = False
        args.int_preprocessing_ls = False
        args.no_hc = True

    # Counting with partial credit
    elif args.partial_credit:
        # partial_credit switch is overridden by --no-* switches
        args.int_preprocessing = True
        args.int_preprocessing_ls = True
    elif args.partial_credit is None:
        # in the default setting, we enable partial credit but only override the
        # preprocessing settings if they haven't been specified
        if args.int_preprocessing is None:
            args.int_preprocessing = True
        if args.int_preprocessing_ls is None:
            args.int_preprocessing_ls = True
    elif args.partial_credit is not None:  # explicitly set to false
        args.int_preprocessing = False
        args.int_preprocessing_ls = True

    if args.int_preprocessing is None:
        args.int_preprocessing = False
    if args.int_preprocessing_ls is None:
        args.int_preprocessing_ls = False

    logging.info("Preprocessing settings: %s / %s / %s" %
                 ("leftshift" if args.int_preprocessing_ls else "no-leftshift",
                  "splitting" if args.int_preprocessing else "raw calls",
                  "haplocompare" if not args.no_hc else "no-haplocompare"))

    # sanity-check regions bed file (HAP-57)
    if args.regions_bedfile:
        logging.info("Checking input regions.")
        if bedOverlapCheck(args.regions_bedfile):
            raise Exception(
                "The regions bed file (specified using -R) has overlaps, this will not work with xcmp."
                " You can either use -T, or run the file through bedtools merge"
            )
        args.preprocessing_truth = True
        args.preprocessing = True

    if args.targets_bedfile or args.engine != "xcmp":
        args.preprocessing_truth = True
        args.preprocessing = True

    if args.fp_bedfile and not os.path.exists(args.fp_bedfile):
        raise Exception("FP/confident call region bed file does not exist.")

    tempfiles = []

    try:
        if not args.force_interactive and "JOB_ID" not in os.environ:
            parser.print_help()
            raise Exception(
                "Please qsub me so I get approximately 1 GB of RAM per thread."
            )

        if not args.ref:
            args.ref = Tools.defaultReference()

        if not os.path.exists(args.ref):
            raise Exception("Please specify a valid reference path using -r.")

        if not args.reports_prefix:
            raise Exception("Please specify an output prefix using -o ")

        if not os.path.exists(
                os.path.dirname(os.path.abspath(args.reports_prefix))):
            raise Exception(
                "The output path does not exist. Please specify a valid output path and prefix using -o"
            )

        if os.path.basename(args.reports_prefix) == "" or os.path.isdir(
                args.reports_prefix):
            raise Exception(
                "The output path should specify a file name prefix. Please specify a valid output path "
                "and prefix using -o. For example, -o /tmp/test will create files named /tmp/test* ."
            )

        # noinspection PyProtectedMember
        if not args._vcfs or len(args._vcfs) != 2:
            raise Exception("Please specify exactly two input VCFs.")

        # noinspection PyProtectedMember
        args.vcf1 = args._vcfs[0]
        # noinspection PyProtectedMember
        args.vcf2 = args._vcfs[1]

        if not os.path.exists(args.vcf1):
            raise Exception("Input file %s does not exist." % args.vcf1)
        if not os.path.exists(args.vcf2):
            raise Exception("Input file %s does not exist." % args.vcf2)

        logging.info("Comparing %s and %s" % (args.vcf1, args.vcf2))

        h1 = vcfextract.extractHeadersJSON(args.vcf1)
        if args.auto_index and not h1["tabix"]:
            logging.info(
                "Creating indexed version of %s -- consider creating an index beforehand to save time here."
                % args.vcf1)
            vtf = tempfile.NamedTemporaryFile(delete=False,
                                              dir=args.scratch_prefix,
                                              prefix="truth.ix",
                                              suffix=".vcf.gz")
            vtf.close()
            tempfiles.append(vtf.name)
            tempfiles.append(vtf.name + ".tbi")
            args.vcf1 = Tools.bcftools.makeIndex(args.vcf1, vtf.name)
            h1 = vcfextract.extractHeadersJSON(args.vcf1)

        h2 = vcfextract.extractHeadersJSON(args.vcf2)
        if args.auto_index and not h2["tabix"]:
            logging.info(
                "Creating indexed version of %s -- consider creating an index beforehand to save time here."
                % args.vcf2)
            vtf = tempfile.NamedTemporaryFile(delete=False,
                                              dir=args.scratch_prefix,
                                              prefix="query.ix",
                                              suffix=".vcf.gz")
            vtf.close()
            tempfiles.append(vtf.name)
            tempfiles.append(vtf.name + ".tbi")
            args.vcf2 = Tools.bcftools.makeIndex(args.vcf2, vtf.name)
            h2 = vcfextract.extractHeadersJSON(args.vcf2)

        ref_check = True
        try:
            happy_ref = args.ref
            v1r = [_h for _h in h1["fields"] if _h["key"] == "reference"]
            v2r = [_h for _h in h2["fields"] if _h["key"] == "reference"]
            if args.verbose:
                logging.info("References used: hap.py: %s / truth: %s / "
                             "query: %s" %
                             (str(happy_ref), str(v1r), str(v2r)))

            v1_ref = ";".join([str(xxy["value"])
                               for xxy in v1r]).replace("file://", "")
            v2_ref = ";".join([str(xxy["value"])
                               for xxy in v2r]).replace("file://", "")

            if happy_ref == v1_ref and v1_ref == v2_ref:
                ref_check = True

            refids_found = 0
            rids_vh = set()
            rids_v1 = set()
            rids_v2 = set()
            for refid in ["hg19", "hg38", "grc37", "grc38"]:
                if refid in happy_ref.lower():
                    rids_vh.add(refid)
                if refid in v1_ref.lower():
                    rids_v1.add(refid)
                if refid in v2_ref.lower():
                    rids_v2.add(refid)

            rids_v1 = sorted(list(rids_v1))
            rids_v2 = sorted(list(rids_v2))
            rids_vh = sorted(list(rids_vh))

            to_cmp = None
            if rids_v1: to_cmp = rids_v1
            if rids_v2: to_cmp = rids_v2
            if rids_vh: to_cmp = rids_vh
            if to_cmp and rids_v1 and rids_v1 != to_cmp:
                ref_check = False
            if to_cmp and rids_v2 and rids_v2 != to_cmp:
                ref_check = False
            if to_cmp and rids_vh and rids_vh != to_cmp:
                ref_check = False

        except:
            pass

        if not ref_check:
            logging.warn(
                "Reference sequence check failed! "
                "Please ensure that truth and query VCF use the same reference sequence as "
                "hap.py. XCMP may fail if this is not the case, and the results will not be "
                " accurate.")

        if args.locations is None or len(args.locations) == 0:
            # all chromosomes
            args.locations = ["chr" + x for x in map(str, range(1, 23))]

        if type(args.locations) is not list and args.locations is not None:
            # noinspection PyUnresolvedReferences
            args.locations = args.locations.split(",")

        # HAP-143 fix the case where no chromosomes are in truth or query
        try:
            if not h1["tabix"]["chromosomes"]:
                h1["tabix"]["chromosomes"] = []
        except:
            pass
        try:
            if not h2["tabix"]["chromosomes"]:
                h2["tabix"]["chromosomes"] = []
        except:
            pass

        if not h1["tabix"]:
            args.preprocessing_truth = True
            logging.warn(
                "Truth file is not Tabix indexed. Switching on pre-processing + chr name conversion."
            )
            if args.fixchr_truth is None:
                args.fixchr_truth = True
        elif args.fixchr_truth is None:
            logging.info(str(h1["tabix"]))
            # autodetect chr naming
            count_with_fix = len([
                __ for __ in h1["tabix"]["chromosomes"]
                if ("chr%s" % str(__)) in args.locations
            ])
            count_no_fix = len([
                __ for __ in h1["tabix"]["chromosomes"]
                if str(__) in args.locations
            ])
            logging.info(
                "Truth: Number of chromosome names matching with / without renaming : %i / %i "
                % (count_with_fix, count_no_fix))
            if count_with_fix > count_no_fix:
                args.fixchr_truth = True
                logging.info("Will fix chromosome names (truth).")
            else:
                logging.info("Will not fix chromosome names (truth).")
                args.fixchr_truth = False

        if not h2["tabix"]:
            args.preprocessing = True
            logging.warn(
                "Query file is not Tabix indexed. Switching on pre-processing + chr name conversion."
            )
            # don't overwrite setting, but if it's None, replace with True to be sure
            if args.fixchr_query is None:
                args.fixchr_query = True
        elif args.fixchr_query is None:
            # autodetect chr naming
            count_with_fix = len([
                __ for __ in h2["tabix"]["chromosomes"]
                if ("chr%s" % str(__)) in args.locations
            ])
            count_no_fix = len([
                __ for __ in h2["tabix"]["chromosomes"]
                if str(__) in args.locations
            ])
            logging.info(
                "Query: Number of chromosome names matching with / without renaming : %i / %i "
                % (count_with_fix, count_no_fix))
            if count_with_fix > count_no_fix:
                args.fixchr_query = True
                logging.info("Will fix chromosome names (query).")
            else:
                logging.info("Will not fix chromosome names (query).")
                args.fixchr_query = False

        if args.fixchr_truth or args.preprocessing_norm:
            args.preprocessing_truth = True

        if args.fixchr_query or args.preprocessing_norm:
            args.preprocessing = True

        if args.preprocessing_truth:
            vtf = tempfile.NamedTemporaryFile(delete=False,
                                              dir=args.scratch_prefix,
                                              prefix="truth.pp",
                                              suffix=".vcf.gz")
            vtf.close()
            tempfiles.append(vtf.name)
            preprocessVCF(
                args.vcf1,
                vtf.name,
                ",".join(args.locations),
                not args.usefiltered_truth,  # pass_only
                args.fixchr_truth,  # chrprefix
                args.preprocessing_norm,  # norm,
                args.regions_bedfile,
                args.targets_bedfile,
                args.ref)
            args.vcf1 = vtf.name
            # get headers again if we preprocessed
            h1 = vcfextract.extractHeadersJSON(args.vcf1)

        if args.preprocessing:
            vtf = tempfile.NamedTemporaryFile(delete=False,
                                              dir=args.scratch_prefix,
                                              prefix="query.pp",
                                              suffix=".vcf.gz")
            vtf.close()
            tempfiles.append(vtf.name)
            preprocessVCF(
                args.vcf2,
                vtf.name,
                ",".join(args.locations),
                not args.usefiltered,  # pass_only
                args.fixchr_query,  # chrprefix
                args.preprocessing_norm,  # norm,
                args.regions_bedfile,
                args.targets_bedfile,
                args.ref)
            args.vcf2 = vtf.name
            # get headers again if we preprocessed
            h2 = vcfextract.extractHeadersJSON(args.vcf2)

        if not h1["tabix"]:
            raise Exception("Truth file is not Tabix indexed.")

        if not h2["tabix"]:
            raise Exception("Query file is not Tabix indexed.")

        newlocations = []

        if not h1["tabix"]["chromosomes"]:
            h1["tabix"]["chromosomes"] = []
        if not h2["tabix"]["chromosomes"]:
            h2["tabix"]["chromosomes"] = []

        for _xc in args.locations:
            xc = _xc.split(":")[0]
            if xc not in h1["tabix"]["chromosomes"]:
                logging.warn("No calls for location %s in truth!" % xc)
            if xc not in h2["tabix"]["chromosomes"]:
                logging.warn("No calls for location %s in query!" % xc)

            if (xc not in h1["tabix"]["chromosomes"]) and (
                    xc not in h2["tabix"]["chromosomes"]):
                logging.warn(
                    "Removing location %s because neither input file has calls there."
                    % xc)
            else:
                newlocations.append(_xc)

        if not newlocations:
            raise Exception(
                "Location list is empty: the input files do not appear to have variants on any of %s"
                % str(args.locations))

        args.locations = newlocations

        if args.threads > 1:
            logging.info("Running using %i parallel processes." % args.threads)
            pool = multiprocessing.Pool(int(args.threads))

            # find balanced pieces
            args.pieces = (args.threads + len(args.locations) - 1) / len(
                args.locations)
            res = runParallel(pool, Haplo.blocksplit.blocksplitWrapper,
                              args.locations, args)

            if None in res:
                raise Exception("One of the blocksplit processes failed.")

            tempfiles += res

            args.locations = []
            for f in res:
                with open(f) as fp:
                    for l in fp:
                        ll = l.strip().split("\t", 3)
                        if len(ll) < 3:
                            continue
                        xchr = ll[0]
                        start = int(ll[1]) + 1
                        end = int(ll[2])
                        args.locations.append("%s:%i-%i" % (xchr, start, end))
        else:
            pool = None

        # count variants before normalisation
        if "samples" not in h1 or not h1["samples"]:
            raise Exception("Cannot read sample names from truth VCF file")

        if args.raw_counts:
            counts_truth = Haplo.quantify.run_quantify(
                args.vcf1,
                None,
                None, {"CONF": args.fp_bedfile} if args.fp_bedfile else None,
                args.ref,
                h1["samples"][0],
                locations=args.locations)
        else:
            counts_truth = None

        if "samples" not in h2 or not h2["samples"]:
            raise Exception("Cannot read sample names from query VCF file")
        if args.raw_counts:
            counts_query = Haplo.quantify.run_quantify(
                args.vcf2,
                None,
                None, {"CONF": args.fp_bedfile} if args.fp_bedfile else None,
                args.ref,
                h2["samples"][0],
                locations=args.locations)
        else:
            counts_query = None

        tf = tempfile.NamedTemporaryFile(delete=False,
                                         dir=args.scratch_prefix,
                                         prefix="hap.py.result.",
                                         suffix=".vcf.gz")
        tf.close()
        tempfiles.append(tf.name)
        output_name = tf.name

        if args.engine == "xcmp":
            # do xcmp
            logging.info("Using xcmp for comparison")
            res = runParallel(pool, Haplo.xcmp.xcmpWrapper, args.locations,
                              args)
            tempfiles += [x[0] for x in res if x is not None]  # VCFs
            tempfiles += [
                x[1] for x in res if x is not None and x[1] is not None
            ]  # beds (if any)

            if None in res:
                raise Exception("One of the xcmp jobs failed.")

            if len(res) == 0:
                raise Exception(
                    "Input files/regions do not contain variants (0 haplotype blocks were processed)."
                )

            # concatenate + index
            bedfiles = [
                x[1] for x in res if x is not None and x[1] is not None
            ]
            if args.write_bed and bedfiles:
                runme = " ".join(["cat"] + bedfiles + [
                    ">",
                    args.reports_prefix.replace(" ", "\\ ") + ".blocks.bed"
                ])
                logging.info("Concatenating block files: %s..." % runme)
                subprocess.check_call(runme, shell=True)

            logging.info("Concatenating variants...")
            runme_list = [x[0] for x in res if x is not None]
            if len(runme_list) == 0:
                raise Exception("No outputs to concatenate!")

            fo = Tools.BGZipFile(output_name, True)
            for i, x in enumerate(runme_list):
                f = gzip.GzipFile(x)
                for l in f:
                    if i == 0 or not l[0] == "#":
                        fo.write(l)
            fo.close()

            logging.info("Indexing...")
            to_run = "tabix -p vcf %s" % output_name.replace(" ", "\\ ")
            logging.info("Running '%s'" % to_run)
            subprocess.check_call(to_run, shell=True)
        elif args.engine == "vcfeval":
            tempfiles += Haplo.vcfeval.runVCFEval(args.vcf1, args.vcf2,
                                                  output_name, args)
        else:
            raise Exception("Unknown comparison engine: %s" % args.engine)

        if args.write_counts:
            json_name = args.reports_prefix + ".counts.json"
        else:
            tf = tempfile.NamedTemporaryFile(delete=False,
                                             dir=args.scratch_prefix,
                                             prefix="counts.",
                                             suffix=".json")
            tf.close()
            json_name = tf.name

        logging.info("Counting variants...")

        counts = Haplo.quantify.run_quantify(
            output_name, json_name,
            args.reports_prefix + ".vcf.gz" if args.write_vcf else False,
            {"CONF": args.fp_bedfile} if args.fp_bedfile else None, args.ref)

        df = pandas.DataFrame(counts)
        if args.write_counts:
            df.to_csv(args.reports_prefix + ".counts.csv")

        metrics_output = makeMetricsObject("hap.py.comparison")

        if args.write_counts:
            metrics_output["metrics"].append(
                dataframeToMetricsTable("raw.counts", df))

        # calculate precision / recall
        count_types = []
        if args.raw_counts:
            simplified_truth_counts = Haplo.quantify.simplify_counts(
                counts_truth, h1["samples"][0:1])
            simplified_query_counts = Haplo.quantify.simplify_counts(
                counts_query, h2["samples"][0:1])

            count_types += simplified_truth_counts.keys()
            count_types += simplified_query_counts.keys()
        else:
            simplified_truth_counts = None
            simplified_query_counts = None

        simplified_numbers = Haplo.quantify.simplify_counts(counts)

        count_types += simplified_numbers.keys()
        count_types = sorted(list(set(count_types)))

        for vtype in count_types:
            if vtype not in simplified_numbers:
                simplified_numbers[vtype] = {}

            simplified_numbers[vtype]["METRIC.Recall"] = 0
            simplified_numbers[vtype]["METRIC.Recall2"] = 0
            simplified_numbers[vtype]["METRIC.Precision"] = 0
            simplified_numbers[vtype]["METRIC.Frac_NA"] = 0

            try:
                simplified_numbers[vtype]["METRIC.Recall"] = \
                    float(simplified_numbers[vtype]["TRUTH.TP"]) / \
                    float(simplified_numbers[vtype]["TRUTH.TP"] + simplified_numbers[vtype]["TRUTH.FN"])
            except:
                pass

            try:
                simplified_numbers[vtype]["METRIC.Recall2"] = \
                    float(simplified_numbers[vtype]["TRUTH.TP"]) / \
                    float(simplified_numbers[vtype]["TRUTH.TOTAL"])
            except:
                pass

            try:
                simplified_numbers[vtype]["METRIC.Precision"] = \
                    float(simplified_numbers[vtype]["QUERY.TP"]) / \
                    float(simplified_numbers[vtype]["QUERY.TP"] + simplified_numbers[vtype]["QUERY.FP"])
            except:
                pass

            try:
                simplified_numbers[vtype]["METRIC.Frac_NA"] = \
                    float(simplified_numbers[vtype]["QUERY.UNK"]) / \
                    float(simplified_numbers[vtype]["QUERY.TOTAL"])
            except:
                pass

            try:
                simplified_numbers[vtype][
                    "TRUTH.TOTAL.RAW"] = simplified_truth_counts[vtype][
                        h1["samples"][0] + ".TOTAL"]
            except:
                pass

            try:
                simplified_numbers[vtype][
                    "QUERY.TOTAL.RAW"] = simplified_query_counts[vtype][
                        h2["samples"][0] + ".TOTAL"]
            except:
                pass

        pandas.set_option("display.width", 120)
        pandas.set_option("display.max_columns", 1000)
        df = pandas.DataFrame(simplified_numbers).transpose()
        vstring = "hap.py-%s" % Tools.version
        vstring += " ".join(sys.argv)

        df.loc[vstring] = 0

        # for x in df:
        #     # everything not a metric is a count
        #     if not x.startswith("METRIC"):
        #         df[x] = df[x].astype("int64")

        df[[
            "TRUTH.TOTAL", "QUERY.TOTAL", "METRIC.Recall", "METRIC.Precision",
            "METRIC.Frac_NA"
        ]].to_csv(args.reports_prefix + ".summary.csv")

        metrics_output["metrics"].append(
            dataframeToMetricsTable(
                "summary.metrics", df[[
                    "TRUTH.TOTAL", "QUERY.TOTAL", "METRIC.Recall",
                    "METRIC.Precision", "METRIC.Frac_NA"
                ]]))

        if args.write_counts:
            df.to_csv(args.reports_prefix + ".extended.csv")
            metrics_output["metrics"].append(
                dataframeToMetricsTable("all.metrics", df))

        essential_numbers = df[[
            "TRUTH.TOTAL", "QUERY.TOTAL", "METRIC.Recall", "METRIC.Precision",
            "METRIC.Frac_NA"
        ]]

        pandas.set_option('display.max_columns', 500)
        pandas.set_option('display.width', 1000)

        essential_numbers = essential_numbers[essential_numbers.index.isin(
            ["Locations.SNP", "Locations.INDEL"])]

        logging.info("\n" + str(essential_numbers))

        # in default mode, print result summary to stdout
        if not args.quiet and not args.verbose:
            print "Benchmarking Summary:"
            print str(essential_numbers)

        if args.roc:
            vcf = args.reports_prefix + ".vcf.gz"
            res = Haplo.happyroc.roc(vcf, args.roc, args.roc_filter,
                                     args.reports_prefix + ".roc",
                                     args.roc_reversed)

            for t in res.iterkeys():
                rocdf = pandas.read_table(res[t])
                metrics_output["metrics"].append(
                    dataframeToMetricsTable("roc." + t, rocdf))

        with open(args.reports_prefix + ".metrics.json", "w") as fp:
            json.dump(metrics_output, fp)
    finally:
        if args.delete_scratch:
            for x in tempfiles:
                try:
                    os.remove(x)
                except:
                    pass
        else:
            logging.info("Scratch files kept : %s" % (str(tempfiles)))
Exemple #7
0
def _metric_ratio(numbers, numerator_key, denominator_keys):
    """ Best-effort ratio of one count over the sum of one or more counts.

    :param numbers: mapping of count name -> value for one variant type
    :param numerator_key: key of the numerator count
    :param denominator_keys: non-empty sequence of keys whose values are
                             added up to form the denominator
    :return: the ratio as a float, or 0 if a key is missing, a value is not
             numeric, or the denominator is zero
    """
    try:
        numerator = float(numbers[numerator_key])
        denominator = numbers[denominator_keys[0]]
        for key in denominator_keys[1:]:
            denominator = denominator + numbers[key]
        return numerator / float(denominator)
    except (KeyError, TypeError, ValueError, ZeroDivisionError):
        # only the failures a missing / malformed / empty count can cause are
        # swallowed; KeyboardInterrupt, SystemExit etc. still propagate
        return 0


def quantify(args):
    """ Run quantify and write tables

    Runs Haplo.quantify.run_quantify on the first VCF in args.in_vcf, derives
    recall / precision / fraction-NA per variant type from the simplified
    counts, and writes the report files below args.reports_prefix:

      <prefix>.summary.csv    always
      <prefix>.metrics.json   always
      <prefix>.counts.csv     only with args.write_counts
      <prefix>.extended.csv   only with args.write_counts

    The names <prefix>.counts.json, <prefix>.vcf.gz (with args.write_vcf),
    <prefix>.roc.tsv (with args.roc) and <prefix>.internal.summary.tsv (with
    args.verbose) are handed to run_quantify -- presumably it writes them;
    that is not visible from this function.

    Unless args.quiet or args.verbose is set, the SNP / INDEL summary rows are
    also printed to stdout.

    :param args: parsed command line namespace; this function reads in_vcf,
                 reports_prefix, roc, verbose, quiet, write_vcf, write_counts,
                 fp_bedfile, ref, threads, output_vtc, type, roc_filter,
                 roc_delta, output_filter_rocs, preserve_info and runner
    :raises Exception: if the input VCF name is empty or the file is missing
    """
    vcf_name = args.in_vcf[0]

    if not vcf_name or not os.path.exists(vcf_name):
        raise Exception("Cannot read input VCF.")

    json_name = args.reports_prefix + ".counts.json"

    logging.info("Counting variants...")

    output_vcf = args.reports_prefix + ".vcf.gz"

    # a ROC table is only requested when a ROC feature was selected
    roc_table = args.reports_prefix + ".roc.tsv" if args.roc else None

    if args.verbose:
        # verbose writes internal summary file
        # this will be what we migrate to in 0.3.0
        sum_file = args.reports_prefix + ".internal.summary.tsv"
    else:
        sum_file = None

    counts = Haplo.quantify.run_quantify(vcf_name,
                                         json_name,
                                         output_vcf if args.write_vcf else False,
                                         {"CONF": args.fp_bedfile} if args.fp_bedfile else None,
                                         args.ref,
                                         threads=args.threads,
                                         output_vtc=args.output_vtc,
                                         qtype=args.type,
                                         roc_val=args.roc,
                                         roc_file=roc_table,
                                         summary_file=sum_file,
                                         roc_filter=args.roc_filter,
                                         roc_delta=args.roc_delta,
                                         output_filter_rocs=args.output_filter_rocs,
                                         clean_info=not args.preserve_info)

    df = pandas.DataFrame(counts)

    metrics_output = makeMetricsObject("%s.comparison" % args.runner)

    if args.write_counts:
        df.to_csv(args.reports_prefix + ".counts.csv")
        metrics_output["metrics"].append(dataframeToMetricsTable("raw.counts", df))

    # calculate precision / recall
    simplified_numbers = Haplo.quantify.simplify_counts(counts)

    for numbers in simplified_numbers.values():
        # every metric is always present; it stays 0 when it cannot be computed
        numbers["METRIC.Recall"] = _metric_ratio(
            numbers, "TRUTH.TP", ("TRUTH.TP", "TRUTH.FN"))
        numbers["METRIC.Recall2"] = _metric_ratio(
            numbers, "TRUTH.TP", ("TRUTH.TOTAL",))
        numbers["METRIC.Precision"] = _metric_ratio(
            numbers, "QUERY.TP", ("QUERY.TP", "QUERY.FP"))
        numbers["METRIC.Frac_NA"] = _metric_ratio(
            numbers, "QUERY.UNK", ("QUERY.TOTAL",))

    pandas.set_option("display.width", 120)
    pandas.set_option("display.max_columns", 1000)
    # one row per variant type, one column per count / metric
    df = pandas.DataFrame(simplified_numbers).transpose()

    # extra all-zero row whose label is "<runner>-<version>" followed directly
    # by the command line, so the tables record how they were produced
    vstring = "%s-%s" % (args.runner, Tools.version)
    vstring += " ".join(sys.argv)

    df.loc[vstring] = 0

    summary_columns = ["TRUTH.TOTAL",
                       "QUERY.TOTAL",
                       "METRIC.Recall",
                       "METRIC.Precision",
                       "METRIC.Frac_NA"]

    # these ratios are optional: only report the ones that were counted
    for additional_column in ["TRUTH.TOTAL.TiTv_ratio",
                              "QUERY.TOTAL.TiTv_ratio",
                              "TRUTH.TOTAL.het_hom_ratio",
                              "QUERY.TOTAL.het_hom_ratio"]:
        if additional_column in df.columns:
            summary_columns.append(additional_column)

    df[summary_columns].to_csv(args.reports_prefix + ".summary.csv")

    metrics_output["metrics"].append(dataframeToMetricsTable("summary.metrics",
                                                             df[summary_columns]))

    if args.write_counts:
        df.to_csv(args.reports_prefix + ".extended.csv")
        metrics_output["metrics"].append(dataframeToMetricsTable("all.metrics", df))

    essential_numbers = df[summary_columns]

    # widen the display limits so the summary table is rendered in full
    pandas.set_option('display.max_columns', 500)
    pandas.set_option('display.width', 1000)

    essential_numbers = essential_numbers[essential_numbers.index.isin(
        ["Locations.SNP", "Locations.INDEL"])]

    logging.info("\n" + str(essential_numbers))

    # in default mode, print result summary to stdout
    if not args.quiet and not args.verbose:
        # single-argument parenthesised form prints identically under the
        # Python 2 print statement and the Python 3 print function
        print("Benchmarking Summary:")
        print(str(essential_numbers))

    if args.roc:
        res = Haplo.happyroc.roc(roc_table, args.reports_prefix + ".roc")

        # keep this for verbose output
        if not args.verbose:
            try:
                os.unlink(roc_table)
            except OSError:
                # best-effort cleanup: the table may already be gone
                pass

        for t in res:
            metrics_output["metrics"].append(dataframeToMetricsTable("roc." + t, res[t]))

    with open(args.reports_prefix + ".metrics.json", "w") as fp:
        json.dump(metrics_output, fp)
Exemple #8
0
def main():
    parser = argparse.ArgumentParser("Somatic Comparison")

    parser.add_argument("truth", help="Truth VCF file")
    parser.add_argument("query", help="Query VCF file")

    parser.add_argument("-o", "--output", dest="output", required=True,
                        help="Output file prefix for statistics and feature table (when selected)")

    parser.add_argument("-l", "--location", dest="location", default="",
                        help="Location for bcftools view (e.g. chr1)")

    parser.add_argument("-R", "--restrict-regions", dest="regions_bedfile",
                        default=None, type=str,
                        help="Restrict analysis to given (sparse) regions (using -R in bcftools).")

    parser.add_argument("-T", "--target-regions", dest="targets_bedfile",
                        default=None, type=str,
                        help="Restrict analysis to given (dense) regions (using -T in bcftools).")

    parser.add_argument("-f", "--false-positives", dest="FP",
                        help="False-positive region bed file to distinguish UNK from FP")

    parser.add_argument("-a", "--ambiguous", dest="ambi", action='append',
                        help="Ambiguous region bed file(s) to distinguish from FP (e.g. variant only observed "
                             "in some replicates)")

    parser.add_argument("--ambi-fp", dest="ambi_fp", action='store_true', default=False,
                        help="Use FP calls from ambiguous region files also.")

    parser.add_argument("--no-ambi-fp", dest="ambi_fp", action='store_false',
                        help="Do not use FP calls from ambiguous region files also.")

    parser.add_argument("--count-unk", dest="count_unk", action='store_true', default=False,
                        help="Assume the truth set covers the whole genome and only count FPs in regions "
                             "specified by the truth VCF or ambiguous/false-positive regions.")

    parser.add_argument("--no-count-unk", dest="count_unk", action='store_false',
                        help="Do not use FP calls from ambiguous region files also.")

    parser.add_argument("-e", "--explain_ambiguous", dest="explain_ambiguous", required=False,
                        default=False, action="store_true",
                        help="print a table giving the number of ambiguous events per category")

    parser.add_argument("-r", "--reference", dest="ref", default=Tools.defaultReference(),
                        help="Specify a reference file.")

    parser.add_argument("--scratch-prefix", dest="scratch_prefix",
                        default=None,
                        help="Filename prefix for scratch report output.")

    parser.add_argument("--keep-scratch", dest="delete_scratch",
                        default=True, action="store_false",
                        help="Filename prefix for scratch report output.")

    parser.add_argument("--continue", dest="cont", default=False, action="store_true",
                        help="Continue from scratch space (i.e. use VCFs in there if they already exist).")

    parser.add_argument("-P", "--include-nonpass", dest="inc_nonpass", action="store_true", default=False,
                        help="Use to include failing variants in comparison.")

    parser.add_argument("--feature-table", dest="features", default=False, choices=Somatic.FeatureSet.sets.keys(),
                        help="Select a feature table to output.")

    parser.add_argument("--bam", dest="bams", default=[], action="append",
                        help="pass one or more BAM files for feature table extraction")

    parser.add_argument("--normalize-truth", dest="normalize_truth", default=False, action="store_true",
                        help="Enable running of bcftools norm on the truth file.")

    parser.add_argument("--normalize-query", dest="normalize_query", default=False, action="store_true",
                        help="Enable running of bcftools norm on the query file.")

    parser.add_argument("-N", "--normalize-all", dest="normalize_all", default=False, action="store_true",
                        help="Enable running of bcftools norm on both truth and query file.")

    parser.add_argument("--fixchr-truth", dest="fixchr_truth", action="store_true", default=True,
                        help="Add chr prefix to truth file (default: true).")

    parser.add_argument("--fixchr-query", dest="fixchr_query", action="store_true", default=True,
                        help="Add chr prefix to query file (default: true).")

    parser.add_argument("--fix-chr-truth", dest="fixchr_truth", action="store_true", default=None,
                        help="Same as --fixchr-truth.")

    parser.add_argument("--fix-chr-query", dest="fixchr_query", action="store_true", default=None,
                        help="Same as --fixchr-query.")

    parser.add_argument("--no-fixchr-truth", dest="fixchr_truth", action="store_false", default=False,
                        help="Disable chr replacement for truth (default: false).")

    parser.add_argument("--no-fixchr-query", dest="fixchr_query", action="store_false", default=False,
                        help="Add chr prefix to query file (default: false).")
    
    parser.add_argument("--no-order-check", dest="disable_order_check", default=False, action="store_true",
                        help="Disable checking the order of TP features (dev feature).")

    parser.add_argument("--roc", dest="roc", default=None, choices=ROC.list(),
                        help="Create a ROC-style table. This is caller specific "
                             " - this will override the --feature-table switch!")

    parser.add_argument("--bin-afs", dest="af_strat", default=None, action="store_true",
                        help="Stratify into different AF buckets. This needs to have features available"
                             "for getting the AF both in truth and query variants.")
    parser.add_argument("--af-binsize", dest="af_strat_binsize", default=0.2,
                        help="Bin size for AF binning (should be < 1). Multiple bin sizes can be specified using a comma, "
                             "e.g. 0.1,0.2,0.5,0.2 will split at 0.1, 0.3, 0.8 and 1.0.")
    parser.add_argument("--af-truth", dest="af_strat_truth", default="I.T_ALT_RATE",
                        help="Feature name to use for retrieving AF for truth variants (TP and FN)")
    parser.add_argument("--af-query", dest="af_strat_query", default="T_AF",
                        help="Feature name to use for retrieving AF for query variants (FP/UNK/AMBI)")

    parser.add_argument("-FN", "--count-filtered-fn", dest="count_filtered_fn", action="store_true",
                        help="Count filtered vs. absent FN numbers. This requires the -P switch (to use all "
                             "variants) and either the --feature-table or --roc switch.")

    parser.add_argument("--fp-region-size", dest="fpr_size",
                        help="How to obtain the normalisation constant for FP rate. By default, this will use the FP region bed file size when using"
                             " --count-unk and the size of all reference contigs that overlap with the location specified in -l otherwise."
                             " This can be overridden with: 1) a number of nucleotides, or 2) \"auto\" to use the lengths of all contigs that have calls."
                             " The resulting value is used as fp.region.size.")

    parser.add_argument("--logfile", dest="logfile", default=None,
                        help="Write logging information into file rather than to stderr")

    verbosity_options = parser.add_mutually_exclusive_group(required=False)

    verbosity_options.add_argument("--verbose", dest="verbose", default=False, action="store_true",
                                   help="Raise logging level from warning to info.")

    verbosity_options.add_argument("--quiet", dest="quiet", default=False, action="store_true",
                                   help="Set logging level to output errors only.")

    args = parser.parse_args()

    if args.verbose:
        loglevel = logging.INFO
    elif args.quiet:
        loglevel = logging.ERROR
    else:
        loglevel = logging.WARNING

    try:
        if type(args.af_strat_binsize) is str:
            args.af_strat_binsize = map(float, args.af_strat_binsize.split(","))
        else:
            args.af_strat_binsize = map(float, [args.af_strat_binsize])

        if not args.af_strat_binsize:
            raise Exception("Bin size list is empty")
    except:
        logging.error("Failed to parse stratification bin size: %s" % str(args.af_strat_binsize))
        exit(1)

    # reinitialize logging
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)
    logging.basicConfig(filename=args.logfile,
                        format='%(asctime)s %(levelname)-8s %(message)s',
                        level=loglevel)

    if args.normalize_all:
        args.normalize_truth = True
        args.normalize_query = True

    if args.roc:
        args.roc = ROC.make(args.roc)
        args.features = args.roc.ftname
        if not args.inc_nonpass:
            logging.warn("When creating ROCs without the -P switch, the ROC data points will only "
                         "include filtered variants (i.e. they will normally end at the caller's "
                         "quality threshold).")

    if args.af_strat and not args.features:
        raise Exception("To stratify by AFs, a feature table must be selected -- use this switch together "
                        "with --feature-table or --roc")

    if args.count_filtered_fn and (not args.inc_nonpass or not args.features):
        raise Exception("Counting filtered / unfiltered FNs only works when a feature table is selected, "
                        "and when using unfiltered variants. Specify -P --feature-table <...> or use "
                        "--roc to select a ROC type.")

    if args.scratch_prefix:
        scratch = os.path.abspath(args.scratch_prefix)
        args.delete_scratch = False
        Tools.mkdir_p(scratch)
    else:
        scratch = tempfile.mkdtemp()

    logging.info("Scratch path is %s" % scratch)
    try:
        bams = []
        md = None
        for x in args.bams:
            bams.append(bamStats(x))

        if bams:
            bres = pandas.concat(bams).groupby("CHROM").mean()

            md = {}

            for x in bres.index:
                logging.info("Mean coverage on %s is %f" % (x, bres.loc[x]["COVERAGE"]))
                md[x] = float(bres.loc[x]["COVERAGE"]) * 3.0

        logging.info("Normalizing/reading inputs")

        ntpath = os.path.join(scratch, "normalized_truth.vcf.gz")

        if not (args.cont and os.path.exists(ntpath)):
            preprocessVCF(args.truth, ntpath, args.location,
                          True,  # pass_only
                          args.fixchr_truth,  # chrprefix
                          args.normalize_truth,  # norm,
                          args.regions_bedfile,
                          args.targets_bedfile,
                          args.ref)
        else:
            logging.info("Continuing from %s" % ntpath)

        if not (args.cont and os.path.exists(ntpath + ".csi")):
            runBcftools("index", ntpath)

        nqpath = os.path.join(scratch, "normalized_query.vcf.gz")

        if not (args.cont and os.path.exists(nqpath)):
            preprocessVCF(args.query, nqpath, args.location,
                          not args.inc_nonpass,  # pass_only
                          args.fixchr_query,  # chrprefix
                          args.normalize_query,  # norm,
                          args.regions_bedfile,
                          args.targets_bedfile,
                          args.ref)
        else:
            logging.info("Continuing from %s" % nqpath)

        if not (args.cont and os.path.exists(nqpath + ".csi")):
            runBcftools("index", nqpath)

        logging.info("Intersecting")

        tpfn_files = all([os.path.exists(os.path.join(scratch, "tpfn", "0000.vcf.gz")),
                          os.path.exists(os.path.join(scratch, "tpfn", "0001.vcf.gz")),
                          os.path.exists(os.path.join(scratch, "tpfn", "0002.vcf.gz"))])

        tpfn_r_files = all([os.path.exists(os.path.join(scratch, "tpfn", "0000.vcf.gz")),
                            os.path.exists(os.path.join(scratch, "tpfn", "0001.vcf.gz")),
                            os.path.exists(os.path.join(scratch, "tpfn", "0002.vcf.gz"))])

        if not (args.cont and tpfn_files):
            runBcftools("isec", ntpath, nqpath, "-p", os.path.join(scratch, "tpfn"), "-O", "z")
        else:
            logging.info("Continuing from %s" % os.path.join(scratch, "tpfn"))

        if args.features and not (args.cont and tpfn_r_files):
            # only need to do this for getting the feature table
            runBcftools("isec", nqpath, ntpath, "-p", os.path.join(scratch, "tpfn_r"), "-O", "z")

        logging.info("Getting FPs / Ambi / Unk")

        fppath = os.path.join(scratch, "fp.vcf.gz")
        unkpath = os.path.join(scratch, "unk.vcf.gz")
        ambipath = os.path.join(scratch, "ambi.vcf.gz")

        # get header to print to unk and ambi VCFs
        rununiquepath = os.path.join(scratch, "tpfn", "0001.vcf.gz")
        header = runBcftools("view", rununiquepath, "--header-only")

        fp = Tools.BGZipFile(fppath, True)
        fp.write(header)

        unk = Tools.BGZipFile(unkpath, True)
        unk.write(header)

        ambi = Tools.BGZipFile(ambipath, True)
        ambi.write(header)

        ambiClasses = Counter()
        ambiReasons = Counter()

        fpclasses = BedIntervalTree()
        if args.ambi:
            # can have multiple ambiguous BED files
            for aBED in args.ambi:
                # auto-label from first value after chr start end
                # new ambi files have the label in position 4
                # old ones will look weird here.
                fpclasses.addFromBed(aBED, lambda xe: xe[4], args.fixchr_truth)

        if args.FP:
            fpclasses.addFromBed(args.FP, "FP", args.fixchr_truth)

        # split VCF into FP, UNK and AMBI
        toProcess = gzip.open(rununiquepath, "rb")
        for entry in toProcess:
            if entry[0] == '#':
                continue

            fields = entry.strip().split("\t")
            chrom = fields[0]
            start = int(fields[1])
            stop = int(fields[1]) + len(fields[3])

            overlap = fpclasses.intersect(chrom, start, stop)

            is_fp = False
            is_ambi = False

            classes_this_pos = set()

            for o in overlap:
                reason = o.value[0]
                if reason == "fp" and args.ambi_fp:
                    reason = "FP"
                elif reason == "fp":
                    reason = "ambi-fp"
                elif reason == "unk":
                    reason = "ambi-unk"

                classes_this_pos.add(reason)
                try:
                    ambiReasons["%s: rep. count %s" % (reason, o.value[1])] += 1
                except IndexError:
                    ambiReasons["%s: rep. count *" % reason] += 1
                for x in o.value[3:]:
                    ambiReasons["%s: %s" % (reason, x)] += 1
                if reason == "FP":
                    is_fp = True
                else:
                    is_ambi = True

            for reason in classes_this_pos:
                ambiClasses[reason] += 1

            if is_fp:
                fp.write(entry)
            elif is_ambi:
                ambi.write(entry)
            elif not args.count_unk:
                # when we don't have FP regions, unk stuff becomes FP
                fp.write(entry)
            else:
                unk.write(entry)

        toProcess.close()

        # since 0001.vcf.gz should already be sorted, we can just convert to bgzipped vcf
        # and create index
        fp.close()
        ambi.close()
        unk.close()

        runBcftools("index", "--tbi", fppath)
        runBcftools("index", "--tbi", unkpath)
        runBcftools("index", "--tbi", ambipath)

        logging.info("Counting variants...")

        truthcounts = parseStats(runBcftools("stats", ntpath), "total.truth")
        querycounts = parseStats(runBcftools("stats", nqpath), "total.query")

        tpcounts = parseStats(runBcftools("stats", os.path.join(scratch, "tpfn", "0002.vcf.gz")), "tp")
        fncounts = parseStats(runBcftools("stats", os.path.join(scratch, "tpfn", "0000.vcf.gz")), "fn")
        fpcounts = parseStats(runBcftools("stats", fppath), "fp")
        ambicounts = parseStats(runBcftools("stats", ambipath), "ambi")
        unkcounts = parseStats(runBcftools("stats", unkpath), "unk")

        res = pandas.merge(truthcounts, querycounts, on="type")
        res = pandas.merge(res, tpcounts, on="type")
        res = pandas.merge(res, fpcounts, on="type")
        res = pandas.merge(res, fncounts, on="type")
        res = pandas.merge(res, unkcounts, on="type")
        res = pandas.merge(res, ambicounts, on="type")

        # no explicit guarantee that total.query is equal to unk + ambi + fp + tp
        # testSum = res["fp"] + res["tp"] + res["unk"] + res["ambi"]

        # filter and relabel
        res = res[res["type"] != "samples"]
        res = res[res["type"] != "multiallelic SNP sites"]
        res = res[res["type"] != "multiallelic sites"]
        res.loc[res["type"] == "SNPs", "type"] = "SNVs"

        metrics_output = makeMetricsObject("som.py.comparison")

        if args.ambi and args.explain_ambiguous:
            ac = list(ambiClasses.iteritems())
            if ac:
                ambie = pandas.DataFrame(ac, columns=["class", "count"])
                ambie.sort(["class"], inplace=True)
                pandas.set_option("display.max_rows", 1000)
                pandas.set_option("display.max_columns", 1000)
                pandas.set_option("display.width", 1000)
                pandas.set_option("display.height", 1100)
                logging.info("FP/ambiguity classes with info (multiple classes can "
                             "overlap):\n" + ambie.to_string(index=False))
                # in default mode, print result summary to stdout
                if not args.quiet and not args.verbose:
                    print "FP/ambiguity classes with info (multiple classes can " \
                          "overlap):\n" + ambie.to_string(index=False)
                ambie.to_csv(args.output + ".ambiclasses.csv")
                metrics_output["metrics"].append(dataframeToMetricsTable("ambiclasses", ambie))
            else:
                logging.info("No ambiguous variants.")

            ar = list(ambiReasons.iteritems())
            if ar:
                ambie = pandas.DataFrame(ar, columns=["reason", "count"])
                ambie.sort(["reason"], inplace=True)
                pandas.set_option("display.max_rows", 1000)
                pandas.set_option("display.max_columns", 1000)
                pandas.set_option("display.width", 1000)
                pandas.set_option("display.height", 1100)
                logging.info("Reasons for defining as ambiguous (multiple reasons can overlap):\n" + ambie.to_string(
                    formatters={'reason': '{{:<{}s}}'.format(ambie['reason'].str.len().max()).format}, index=False))
                # in default mode, print result summary to stdout
                if not args.quiet and not args.verbose:
                    print "Reasons for defining as ambiguous (multiple reasons can overlap):\n" + ambie.to_string(
                        formatters={'reason': '{{:<{}s}}'.format(ambie['reason'].str.len().max()).format}, index=False)
                ambie.to_csv(args.output + ".ambireasons.csv")
                metrics_output["metrics"].append(dataframeToMetricsTable("ambireasons", ambie))
            else:
                logging.info("No ambiguous variants.")

        if args.features:
            logging.info("Extracting features...")
            fset = Somatic.FeatureSet.make(args.features)
            fset.setChrDepths(md)

            logging.info("Collecting TP info (1)...")
            tps = fset.collect(os.path.join(scratch, "tpfn", "0002.vcf.gz"), "TP")

            # TP_r is a hint for fset, they are both TPs
            logging.info("Collecting TP info (2)...")
            tps2 = fset.collect(os.path.join(scratch, "tpfn_r", "0002.vcf.gz"), "TP_r")

            # this is slow because it tries to sort
            # ... which we don't need to do since tps1 and tps2 have the same ordering

            logging.info("Sorting...")
            tps.sort(["CHROM", "POS"], inplace=True)
            tps2.sort(["CHROM", "POS"], inplace=True)
            tps = tps.reset_index(drop=True)
            tps2 = tps2.reset_index(drop=True)

            logging.info("Merging TP info...")
            columns_tps = list(tps)
            columns_tps2 = list(tps2)

            len1 = tps.shape[0]
            len2 = tps2.shape[0]

            if len1 != len2:
                raise Exception("Cannot read TP features, lists have different lengths : %i != %i" % (len1, len2))

            if not args.disable_order_check:
                logging.info("Checking order %i / %i" % (len1, len2))

                for x in xrange(0, len1):
                    for a in ["CHROM", "POS"]:
                        if tps.loc[x][a] != tps2.loc[x][a]:
                            raise Exception("Cannot merge TP features, inputs are out of order at %s / %s" % (
                                str(tps[x:x + 1]), str(tps2[x:x + 1])))

            logging.info("Merging...")

            cdata = {
                "CHROM": tps["CHROM"],
                "POS": tps["POS"],
                "tag": tps["tag"]
            }

            tpc = pandas.DataFrame(cdata, columns=["CHROM", "POS", "tag"])

            all_columns = list(set(columns_tps + columns_tps2))
            for a in all_columns:
                if a in columns_tps and a not in columns_tps2:
                    tpc[a] = tps[a]
                elif a not in columns_tps and a in columns_tps2:
                    tpc[a] = tps2[a]
                elif a not in ["CHROM", "POS", "tag"]:
                    tpc[a] = tps2[a]
                    tpc[a + ".truth"] = tps[a]

            logging.info("Collecting FP info...")
            fps = fset.collect(fppath, "FP")
            ambs = fset.collect(ambipath, "AMBI")

            logging.info("Collecting FN info...")
            fns = fset.collect(os.path.join(scratch, "tpfn", "0000.vcf.gz"), "FN")

            renamed = {}
            tp_cols = list(tpc)
            for col in list(fns):
                if col + ".truth" in tp_cols:
                    renamed[col] = col + ".truth"
            fns.rename(columns=renamed, inplace=True)

            featurelist = [tpc, fps, fns, ambs]

            if unkpath is not None:
                logging.info("Collecting UNK info...")
                unk = fset.collect(unkpath, "UNK")
                featurelist.append(unk)

            logging.info("Making feature table...")
            featuretable = pandas.concat(featurelist)

            # reorder to make more legible
            first_columns = ["CHROM", "POS", "tag"]
            # noinspection PyTypeChecker
            all_columns = list(featuretable)

            if "REF" in all_columns:
                first_columns.append("REF")

            if "REF.truth" in all_columns:
                first_columns.append("REF.truth")

            if "ALT" in all_columns:
                first_columns.append("ALT")

            if "ALT.truth" in all_columns:
                first_columns.append("ALT.truth")

            ordered_columns = first_columns + sorted([x for x in all_columns if x not in first_columns])
            featuretable = featuretable[ordered_columns]
            # make sure positions are integers
            featuretable["POS"] = featuretable["POS"].astype(int)

            logging.info("Saving feature table...")
            featuretable.to_csv(args.output + ".features.csv", float_format='%.8f')

            if args.roc is not None:
                roc_table = args.roc.from_table(featuretable)
                roc_table.to_csv(args.output + ".roc.csv", float_format='%.8f')

            featuretable["FILTER"].fillna("", inplace=True)
            featuretable.ix[featuretable["REF"].str.len() < 1, "absent"] = True
            featuretable.ix[featuretable["tag"] == "FN", "REF"] = featuretable.ix[featuretable["tag"] == "FN",
                                                                                  "REF.truth"]
            featuretable.ix[featuretable["tag"] == "FN", "ALT"] = featuretable.ix[featuretable["tag"] == "FN",
                                                                                  "ALT.truth"]
            af_t_feature = args.af_strat_truth
            af_q_feature = args.af_strat_query
            for vtype in ["records", "SNVs", "indels"]:
                if vtype == "SNVs":
                    featuretable_this_type = featuretable[(featuretable["REF"].str.len() > 0) &
                                                          (featuretable["ALT"].str.len() ==
                                                           featuretable["REF"].str.len())]
                elif vtype == "indels":
                    featuretable_this_type = featuretable[(featuretable["REF"].str.len() != 1) |
                                                          (featuretable["ALT"].str.len() != 1)]
                else:
                    featuretable_this_type = featuretable

                if args.count_filtered_fn:
                    res.ix[res["type"] == vtype, "fp.filtered"] = featuretable_this_type[
                        (featuretable_this_type["tag"] == "FP") & (featuretable_this_type["FILTER"] != "")].shape[0]
                    res.ix[res["type"] == vtype, "tp.filtered"] = featuretable_this_type[
                        (featuretable_this_type["tag"] == "TP") & (featuretable_this_type["FILTER"] != "")].shape[0]
                    res.ix[res["type"] == vtype, "unk.filtered"] = featuretable_this_type[
                        (featuretable_this_type["tag"] == "UNK") & (featuretable_this_type["FILTER"] != "")].shape[0]
                    res.ix[res["type"] == vtype, "ambi.filtered"] = featuretable_this_type[
                        (featuretable_this_type["tag"] == "AMBI") & (featuretable_this_type["FILTER"] != "")].shape[0]

                if args.af_strat:
                    start = 0.0
                    current_binsize = args.af_strat_binsize[0]
                    next_binsize = 0
                    while start < 1.0:
                        # include 1 in last interval
                        end = min(1.000000001, start + current_binsize)
                        n_tp = featuretable_this_type[(featuretable_this_type["tag"] == "TP") &
                                                      (featuretable_this_type[af_t_feature] >= start) &
                                                      (featuretable_this_type[af_t_feature] < end)]
                        n_fn = featuretable_this_type[(featuretable_this_type["tag"] == "FN") &
                                                      (featuretable_this_type[af_t_feature] >= start) &
                                                      (featuretable_this_type[af_t_feature] < end)]
                        n_fp = featuretable_this_type[(featuretable_this_type["tag"] == "FP") &
                                                      (featuretable_this_type[af_q_feature] >= start) &
                                                      (featuretable_this_type[af_q_feature] < end)]
                        n_ambi = featuretable_this_type[(featuretable_this_type["tag"] == "AMBI") &
                                                        (featuretable_this_type[af_q_feature] >= start) &
                                                        (featuretable_this_type[af_q_feature] < end)]
                        n_unk = featuretable_this_type[(featuretable_this_type["tag"] == "UNK") &
                                                       (featuretable_this_type[af_q_feature] >= start) &
                                                       (featuretable_this_type[af_q_feature] < end)]

                        r = {"type": "%s.%f-%f" % (vtype, start, end),
                             "total.truth": n_tp.shape[0] + n_fn.shape[0],
                             "total.query": n_tp.shape[0] + n_fp.shape[0] + n_ambi.shape[0] + n_unk.shape[0],
                             "tp": n_tp.shape[0],
                             "fp": n_fp.shape[0],
                             "fn": n_fn.shape[0],
                             "unk": n_unk.shape[0],
                             "ambi": n_ambi.shape[0], }

                        if args.count_filtered_fn:
                            r["fp.filtered"] = n_fp[n_fp["FILTER"] != ""].shape[0]
                            r["tp.filtered"] = n_tp[n_tp["FILTER"] != ""].shape[0]
                            r["unk.filtered"] = n_unk[n_unk["FILTER"] != ""].shape[0]
                            r["ambi.filtered"] = n_ambi[n_ambi["FILTER"] != ""].shape[0]

                        res = pandas.concat([res, pandas.DataFrame([r])])

                        if args.roc is not None and (n_tp.shape[0] + n_fn.shape[0] + n_fp.shape[0]) > 0:
                            roc_table_strat = args.roc.from_table(pandas.concat([n_tp, n_fp, n_fn]))
                            rtname = "%s.%s.%f-%f.roc.csv" % (args.output, vtype, start, end)
                            roc_table_strat.to_csv(rtname, float_format='%.8f')
                        start += current_binsize
                        next_binsize += 1
                        if next_binsize >= len(args.af_strat_binsize):
                            next_binsize = 0
                        current_binsize = args.af_strat_binsize[next_binsize]

        # remove things where we haven't seen any variants in truth and query
        res = res[(res["total.truth"] > 0) & (res["total.query"] > 0)]
        # summary metrics
        res["recall"] = res["tp"] / (res["tp"] + res["fn"])
        res["recall2"] = res["tp"] / (res["total.truth"])
        res["precision"] = res["tp"] / (res["tp"] + res["fp"])
        res["na"] = res["unk"] / (res["total.query"])
        res["ambiguous"] = res["ambi"] / res["total.query"]

        any_fp = fpclasses.countbases(label="FP")

        fp_region_count = 0
        auto_size = True
        if args.fpr_size:
            try:
                fp_region_count = int(args.fpr_size)
                auto_size = False
            except:
                pass
        if auto_size:
            if any_fp:
                if args.location:
                    chrom, _, rest = args.location.partition(":")
                    if rest:
                        start, _, end = rest.partition("_")
                        if start:
                            start = int(start)
                        if end:
                            end = int(end)
                    else:
                        fp_region_count += fpclasses.countbases(chrom, label="FP")
                else:
                    fp_region_count = any_fp
            else:
                cs = fastaContigLengths(args.ref)
                if args.location:
                    fp_region_count = calculateLength(cs, args.location)
                else:
                    # use all locations we saw calls on
                    h1 = Tools.vcfextract.extractHeadersJSON(ntpath)
                    h1_chrs = h1["tabix"]["chromosomes"]
                    if not h1_chrs:
                        logging.warn("ntpath is empty")
                        h1_chrs = []

                    h2 = Tools.vcfextract.extractHeadersJSON(nqpath)
                    h2_chrs = h2["tabix"]["chromosomes"]
                    if not h2_chrs:
                        logging.warn("nqpath is empty")
                        h2_chrs = []
                    
                    combined_chrs = list(set(h1_chrs + h2_chrs))
                    if len(combined_chrs) > 0:
                        qlocations = " ".join(combined_chrs)
                        fp_region_count = calculateLength(cs, qlocations)
                    else:
                        fp_region_count = 0

        res["fp.region.size"] = fp_region_count
        res["fp.rate"] = 1e6 * res["fp"] / res["fp.region.size"]

        if args.count_filtered_fn:
            res["recall.filtered"] = (res["tp"] - res["tp.filtered"]) / (res["tp"] + res["fn"])

            res["precision.filtered"] = (res["tp"] - res["tp.filtered"]) / (res["tp"] - res["tp.filtered"] +
                                                                            res["fp"] - res["fp.filtered"])

            res["fp.rate.filtered"] = 1e6 * (res["fp"] - res["fp.filtered"]) / res["fp.region.size"]

            res["na.filtered"] = (res["unk"] - res["unk.filtered"]) / (res["total.query"])
            res["ambiguous.filtered"] = (res["ambi"] - res["ambi.filtered"]) / res["total.query"]

        # HAP-162 remove inf values
        res.replace([np.inf, -np.inf], 0)
        metrics_output["metrics"].append(dataframeToMetricsTable("result", res))
        vstring = "som.py-%s" % Tools.version

        logging.info("\n" + res.to_string())
        # in default mode, print result summary to stdout
        if not args.quiet and not args.verbose:
            print "\n" + res.to_string()

        res["sompyversion"] = vstring

        vstring = " ".join(sys.argv)
        res["sompycmd"] = vstring
        res.to_csv(args.output + ".stats.csv")
        with open(args.output + ".metrics.json", "w") as fp:
            json.dump(metrics_output, fp)

    finally:
        if args.delete_scratch:
            shutil.rmtree(scratch)
        else:
            logging.info("Scratch kept at %s" % scratch)
# Example #9
# 0
def main():

    args = parse_args()

    if args.scratch_prefix:
        scratch = os.path.abspath(args.scratch_prefix)
        args.delete_scratch = False
        Tools.mkdir_p(scratch)
    else:
        scratch = tempfile.mkdtemp()

    logging.info("Scratch path is %s" % scratch)
    try:
        bams = []
        md = None
        for x in args.bams:
            bams.append(bamStats(x))

        if bams:
            bres = pandas.concat(bams).groupby("CHROM").mean()

            md = {}

            for x in bres.index:
                logging.info("Mean coverage on %s is %f" %
                             (x, bres.loc[x]["COVERAGE"]))
                md[x] = float(bres.loc[x]["COVERAGE"]) * 3.0

        logging.info("Normalizing/reading inputs")

        ntpath = os.path.join(scratch, "normalized_truth.vcf.gz")

        if not (args.cont and os.path.exists(ntpath)):
            preprocessVCF(
                args.truth,
                ntpath,
                args.location,
                True,  # pass_only
                args.fixchr_truth,  # chrprefix
                args.normalize_truth,  # norm,
                args.regions_bedfile,
                args.targets_bedfile,
                args.ref)
        else:
            logging.info("Continuing from %s" % ntpath)

        if not (args.cont and os.path.exists(ntpath + ".csi")):
            runBcftools("index", ntpath)

        nqpath = os.path.join(scratch, "normalized_query.vcf.gz")

        if not (args.cont and os.path.exists(nqpath)):
            preprocessVCF(
                args.query,
                nqpath,
                args.location,
                not args.inc_nonpass,  # pass_only
                args.fixchr_query,  # chrprefix
                args.normalize_query,  # norm,
                args.regions_bedfile,
                args.targets_bedfile,
                args.ref)
        else:
            logging.info("Continuing from %s" % nqpath)

        if not (args.cont and os.path.exists(nqpath + ".csi")):
            runBcftools("index", nqpath)

        logging.info("Intersecting")

        tpfn_files = all([
            os.path.exists(os.path.join(scratch, "tpfn", "0000.vcf.gz")),
            os.path.exists(os.path.join(scratch, "tpfn", "0001.vcf.gz")),
            os.path.exists(os.path.join(scratch, "tpfn", "0002.vcf.gz"))
        ])

        tpfn_r_files = all([
            os.path.exists(os.path.join(scratch, "tpfn", "0000.vcf.gz")),
            os.path.exists(os.path.join(scratch, "tpfn", "0001.vcf.gz")),
            os.path.exists(os.path.join(scratch, "tpfn", "0002.vcf.gz"))
        ])

        if not (args.cont and tpfn_files):
            runBcftools("isec", ntpath, nqpath, "-p",
                        os.path.join(scratch, "tpfn"), "-O", "z")
        else:
            logging.info("Continuing from %s" % os.path.join(scratch, "tpfn"))

        if args.features and not (args.cont and tpfn_r_files):
            # only need to do this for getting the feature table
            runBcftools("isec", nqpath, ntpath, "-p",
                        os.path.join(scratch, "tpfn_r"), "-O", "z")

        logging.info("Getting FPs / Ambi / Unk")

        fppath = os.path.join(scratch, "fp.vcf.gz")
        unkpath = os.path.join(scratch, "unk.vcf.gz")
        ambipath = os.path.join(scratch, "ambi.vcf.gz")

        # get header to print to unk and ambi VCFs
        rununiquepath = os.path.join(scratch, "tpfn", "0001.vcf.gz")
        header = runBcftools("view", rununiquepath, "--header-only")

        fp = Tools.BGZipFile(fppath, True)
        fp.write(header)

        unk = Tools.BGZipFile(unkpath, True)
        unk.write(header)

        ambi = Tools.BGZipFile(ambipath, True)
        ambi.write(header)

        ambiClasses = Counter()
        ambiReasons = Counter()

        fpclasses = BedIntervalTree()
        if args.ambi:
            # can have multiple ambiguous BED files
            for aBED in args.ambi:
                # auto-label from first value after chr start end
                # new ambi files have the label in position 4
                # old ones will look weird here.
                fpclasses.addFromBed(aBED, lambda xe: xe[4], args.fixchr_truth)

        if args.FP:
            fpclasses.addFromBed(args.FP, "FP", args.fixchr_truth)

        # split VCF into FP, UNK and AMBI
        toProcess = gzip.open(rununiquepath, "rb")
        for entry in toProcess:
            if entry[0] == '#':
                continue

            fields = entry.strip().split("\t")
            chrom = fields[0]
            start = int(fields[1])
            stop = int(fields[1]) + len(fields[3])

            overlap = fpclasses.intersect(chrom, start, stop)

            is_fp = False
            is_ambi = False

            classes_this_pos = set()

            for o in overlap:
                reason = o.value[0]
                if reason == "fp" and args.ambi_fp:
                    reason = "FP"
                elif reason == "fp":
                    reason = "ambi-fp"
                elif reason == "unk":
                    reason = "ambi-unk"

                classes_this_pos.add(reason)
                try:
                    ambiReasons["%s: rep. count %s" %
                                (reason, o.value[1])] += 1
                except IndexError:
                    ambiReasons["%s: rep. count *" % reason] += 1
                for x in o.value[3:]:
                    ambiReasons["%s: %s" % (reason, x)] += 1
                if reason == "FP":
                    is_fp = True
                else:
                    is_ambi = True

            for reason in classes_this_pos:
                ambiClasses[reason] += 1

            if is_fp:
                fp.write(entry)
            elif is_ambi:
                ambi.write(entry)
            elif not args.count_unk:
                # when we don't have FP regions, unk stuff becomes FP
                fp.write(entry)
            else:
                unk.write(entry)

        toProcess.close()

        # since 0001.vcf.gz should already be sorted, we can just convert to bgzipped vcf
        # and create index
        fp.close()
        ambi.close()
        unk.close()

        runBcftools("index", "--tbi", fppath)
        runBcftools("index", "--tbi", unkpath)
        runBcftools("index", "--tbi", ambipath)

        logging.info("Counting variants...")

        truthcounts = parseStats(runBcftools("stats", ntpath), "total.truth")
        querycounts = parseStats(runBcftools("stats", nqpath), "total.query")

        tpcounts = parseStats(
            runBcftools("stats", os.path.join(scratch, "tpfn", "0002.vcf.gz")),
            "tp")
        fncounts = parseStats(
            runBcftools("stats", os.path.join(scratch, "tpfn", "0000.vcf.gz")),
            "fn")
        fpcounts = parseStats(runBcftools("stats", fppath), "fp")
        ambicounts = parseStats(runBcftools("stats", ambipath), "ambi")
        unkcounts = parseStats(runBcftools("stats", unkpath), "unk")

        res = pandas.merge(truthcounts, querycounts, on="type")
        res = pandas.merge(res, tpcounts, on="type")
        res = pandas.merge(res, fpcounts, on="type")
        res = pandas.merge(res, fncounts, on="type")
        res = pandas.merge(res, unkcounts, on="type")
        res = pandas.merge(res, ambicounts, on="type")

        # no explicit guarantee that total.query is equal to unk + ambi + fp + tp
        # testSum = res["fp"] + res["tp"] + res["unk"] + res["ambi"]

        # filter and relabel
        res = res[res["type"] != "samples"]
        res = res[res["type"] != "multiallelic SNP sites"]
        res = res[res["type"] != "multiallelic sites"]
        res.loc[res["type"] == "SNPs", "type"] = "SNVs"

        metrics_output = makeMetricsObject("som.py.comparison")

        if args.ambi and args.explain_ambiguous:
            ac = list(ambiClasses.iteritems())
            if ac:
                ambie = pandas.DataFrame(ac, columns=["class", "count"])
                ambie.sort_values(["class"], inplace=True)
                pandas.set_option("display.max_rows", 1000)
                pandas.set_option("display.max_columns", 1000)
                pandas.set_option("display.width", 1000)
                pandas.set_option("display.height", 1100)
                logging.info(
                    "FP/ambiguity classes with info (multiple classes can "
                    "overlap):\n" + ambie.to_string(index=False))
                # in default mode, print result summary to stdout
                if not args.quiet and not args.verbose:
                    print "FP/ambiguity classes with info (multiple classes can " \
                          "overlap):\n" + ambie.to_string(index=False)
                ambie.to_csv(args.output + ".ambiclasses.csv")
                metrics_output["metrics"].append(
                    dataframeToMetricsTable("ambiclasses", ambie))
            else:
                logging.info("No ambiguous variants.")

            ar = list(ambiReasons.iteritems())
            if ar:
                ambie = pandas.DataFrame(ar, columns=["reason", "count"])
                ambie.sort_values(["reason"], inplace=True)
                pandas.set_option("display.max_rows", 1000)
                pandas.set_option("display.max_columns", 1000)
                pandas.set_option("display.width", 1000)
                pandas.set_option("display.height", 1100)
                logging.info(
                    "Reasons for defining as ambiguous (multiple reasons can overlap):\n"
                    + ambie.to_string(formatters={
                        'reason':
                        '{{:<{}s}}'.format(
                            ambie['reason'].str.len().max()).format
                    },
                                      index=False))
                # in default mode, print result summary to stdout
                if not args.quiet and not args.verbose:
                    print "Reasons for defining as ambiguous (multiple reasons can overlap):\n" + ambie.to_string(
                        formatters={
                            'reason':
                            '{{:<{}s}}'.format(
                                ambie['reason'].str.len().max()).format
                        },
                        index=False)
                ambie.to_csv(args.output + ".ambireasons.csv")
                metrics_output["metrics"].append(
                    dataframeToMetricsTable("ambireasons", ambie))
            else:
                logging.info("No ambiguous variants.")

        if args.features:
            logging.info("Extracting features...")
            fset = Somatic.FeatureSet.make(args.features)
            fset.setChrDepths(md)

            logging.info("Collecting TP info (1)...")
            tps = fset.collect(os.path.join(scratch, "tpfn", "0002.vcf.gz"),
                               "TP")

            # TP_r is a hint for fset, they are both TPs
            logging.info("Collecting TP info (2)...")
            tps2 = fset.collect(os.path.join(scratch, "tpfn_r", "0002.vcf.gz"),
                                "TP_r")

            # this is slow because it tries to sort
            # ... which we don't need to do since tps1 and tps2 have the same ordering

            logging.info("Sorting...")
            tps.sort_values(["CHROM", "POS"], inplace=True)
            tps2.sort_values(["CHROM", "POS"], inplace=True)
            tps = tps.reset_index(drop=True)
            tps2 = tps2.reset_index(drop=True)

            logging.info("Merging TP info...")
            columns_tps = list(tps)
            columns_tps2 = list(tps2)

            len1 = tps.shape[0]
            len2 = tps2.shape[0]

            if len1 != len2:
                raise Exception(
                    "Cannot read TP features, lists have different lengths : %i != %i"
                    % (len1, len2))

            if not args.disable_order_check:
                logging.info("Checking order %i / %i" % (len1, len2))

                for x in xrange(0, len1):
                    for a in ["CHROM", "POS"]:
                        if tps.loc[x][a] != tps2.loc[x][a]:
                            raise Exception(
                                "Cannot merge TP features, inputs are out of order at %s / %s"
                                % (str(tps[x:x + 1]), str(tps2[x:x + 1])))

            logging.info("Merging...")

            cdata = {
                "CHROM": tps["CHROM"],
                "POS": tps["POS"],
                "tag": tps["tag"]
            }

            tpc = pandas.DataFrame(cdata, columns=["CHROM", "POS", "tag"])

            all_columns = list(set(columns_tps + columns_tps2))
            for a in all_columns:
                if a in columns_tps and a not in columns_tps2:
                    tpc[a] = tps[a]
                elif a not in columns_tps and a in columns_tps2:
                    tpc[a] = tps2[a]
                elif a not in ["CHROM", "POS", "tag"]:
                    tpc[a] = tps2[a]
                    tpc[a + ".truth"] = tps[a]

            logging.info("Collecting FP info...")
            fps = fset.collect(fppath, "FP")
            ambs = fset.collect(ambipath, "AMBI")

            logging.info("Collecting FN info...")
            fns = fset.collect(os.path.join(scratch, "tpfn", "0000.vcf.gz"),
                               "FN")

            renamed = {}
            tp_cols = list(tpc)
            for col in list(fns):
                if col + ".truth" in tp_cols:
                    renamed[col] = col + ".truth"
            fns.rename(columns=renamed, inplace=True)

            featurelist = [tpc, fps, fns, ambs]

            if unkpath is not None:
                logging.info("Collecting UNK info...")
                unk = fset.collect(unkpath, "UNK")
                featurelist.append(unk)

            logging.info("Making feature table...")
            featuretable = pandas.concat(featurelist)

            # reorder to make more legible
            first_columns = ["CHROM", "POS", "tag"]
            # noinspection PyTypeChecker
            all_columns = list(featuretable)

            if "REF" in all_columns:
                first_columns.append("REF")

            if "REF.truth" in all_columns:
                first_columns.append("REF.truth")

            if "ALT" in all_columns:
                first_columns.append("ALT")

            if "ALT.truth" in all_columns:
                first_columns.append("ALT.truth")

            ordered_columns = first_columns + sorted(
                [x for x in all_columns if x not in first_columns])
            featuretable = featuretable[ordered_columns]
            # make sure positions are integers
            featuretable["POS"] = featuretable["POS"].astype(int)

            logging.info("Saving feature table...")
            featuretable.to_csv(args.output + ".features.csv",
                                float_format='%.8f')

            if args.roc is not None:
                roc_table = args.roc.from_table(featuretable)
                roc_table.to_csv(args.output + ".roc.csv", float_format='%.8f')

            featuretable["FILTER"].fillna("", inplace=True)
            featuretable.ix[featuretable["REF"].str.len() < 1, "absent"] = True
            featuretable.ix[featuretable["tag"] == "FN",
                            "REF"] = featuretable.ix[featuretable["tag"] ==
                                                     "FN", "REF.truth"]
            featuretable.ix[featuretable["tag"] == "FN",
                            "ALT"] = featuretable.ix[featuretable["tag"] ==
                                                     "FN", "ALT.truth"]
            af_t_feature = args.af_strat_truth
            af_q_feature = args.af_strat_query
            for vtype in ["records", "SNVs", "indels"]:
                featuretable["vtype"] = resolve_vtype(args)
                featuretable_this_type = featuretable

                if args.count_filtered_fn:
                    res.ix[res["type"] == vtype,
                           "fp.filtered"] = featuretable_this_type[
                               (featuretable_this_type["tag"] == "FP")
                               & (featuretable_this_type["FILTER"] != ""
                                  )].shape[0]
                    res.ix[res["type"] == vtype,
                           "tp.filtered"] = featuretable_this_type[
                               (featuretable_this_type["tag"] == "TP")
                               & (featuretable_this_type["FILTER"] != ""
                                  )].shape[0]
                    res.ix[res["type"] == vtype,
                           "unk.filtered"] = featuretable_this_type[
                               (featuretable_this_type["tag"] == "UNK")
                               & (featuretable_this_type["FILTER"] != ""
                                  )].shape[0]
                    res.ix[res["type"] == vtype,
                           "ambi.filtered"] = featuretable_this_type[
                               (featuretable_this_type["tag"] == "AMBI")
                               & (featuretable_this_type["FILTER"] != ""
                                  )].shape[0]

                if args.af_strat:
                    start = 0.0
                    end = 1.0
                    current_binsize = args.af_strat_binsize[0]
                    next_binsize = 0
                    while start < 1.0:
                        # include 1 in last interval
                        end = start + current_binsize
                        if end >= 1:
                            end = 1.00000001
                        if start >= end:
                            break
                        n_tp = featuretable_this_type[
                            (featuretable_this_type["tag"] == "TP")
                            & (featuretable_this_type[af_t_feature] >= start) &
                            (featuretable_this_type[af_t_feature] < end)]
                        n_fn = featuretable_this_type[
                            (featuretable_this_type["tag"] == "FN")
                            & (featuretable_this_type[af_t_feature] >= start) &
                            (featuretable_this_type[af_t_feature] < end)]
                        n_fp = featuretable_this_type[
                            (featuretable_this_type["tag"] == "FP")
                            & (featuretable_this_type[af_q_feature] >= start) &
                            (featuretable_this_type[af_q_feature] < end)]
                        n_ambi = featuretable_this_type[
                            (featuretable_this_type["tag"] == "AMBI")
                            & (featuretable_this_type[af_q_feature] >= start) &
                            (featuretable_this_type[af_q_feature] < end)]
                        n_unk = featuretable_this_type[
                            (featuretable_this_type["tag"] == "UNK")
                            & (featuretable_this_type[af_q_feature] >= start) &
                            (featuretable_this_type[af_q_feature] < end)]

                        r = {
                            "type":
                            "%s.%f-%f" % (vtype, start, end),
                            "total.truth":
                            n_tp.shape[0] + n_fn.shape[0],
                            "total.query":
                            n_tp.shape[0] + n_fp.shape[0] + n_ambi.shape[0] +
                            n_unk.shape[0],
                            "tp":
                            n_tp.shape[0],
                            "fp":
                            n_fp.shape[0],
                            "fn":
                            n_fn.shape[0],
                            "unk":
                            n_unk.shape[0],
                            "ambi":
                            n_ambi.shape[0]
                        }

                        if args.count_filtered_fn:
                            r["fp.filtered"] = n_fp[
                                n_fp["FILTER"] != ""].shape[0]
                            r["tp.filtered"] = n_tp[
                                n_tp["FILTER"] != ""].shape[0]
                            r["unk.filtered"] = n_unk[
                                n_unk["FILTER"] != ""].shape[0]
                            r["ambi.filtered"] = n_ambi[
                                n_ambi["FILTER"] != ""].shape[0]

                        res = pandas.concat([res, pandas.DataFrame([r])])

                        if args.roc is not None and (n_tp.shape[0] +
                                                     n_fn.shape[0] +
                                                     n_fp.shape[0]) > 0:
                            roc_table_strat = args.roc.from_table(
                                pandas.concat([n_tp, n_fp, n_fn]))
                            rtname = "%s.%s.%f-%f.roc.csv" % (
                                args.output, vtype, start, end)
                            roc_table_strat.to_csv(rtname, float_format='%.8f')
                        start = end
                        next_binsize += 1
                        if next_binsize >= len(args.af_strat_binsize):
                            next_binsize = 0
                        current_binsize = args.af_strat_binsize[next_binsize]

        if not args.af_strat:
            res = res[(res["total.truth"] > 0)]

        # summary metrics with confidence intervals
        ci_alpha = 1.0 - args.ci_level

        recall = binomialCI(res["tp"], res["tp"] + res["fn"], ci_alpha)
        precision = binomialCI(res["tp"], res["tp"] + res["fp"], ci_alpha)
        res["recall"], res["recall_lower"], res["recall_upper"] = recall
        res["recall2"] = res["tp"] / (res["total.truth"])
        res["precision"], res["precision_lower"], res[
            "precision_upper"] = precision
        res["na"] = res["unk"] / (res["total.query"])
        res["ambiguous"] = res["ambi"] / res["total.query"]

        any_fp = fpclasses.countbases(label="FP")

        fp_region_count = 0
        auto_size = True
        if args.fpr_size:
            try:
                fp_region_count = int(args.fpr_size)
                auto_size = False
            except:
                pass
        if auto_size:
            if any_fp:
                if args.location:
                    chrom, _, rest = args.location.partition(":")
                    if rest:
                        start, _, end = rest.partition("_")
                        if start:
                            start = int(start)
                        if end:
                            end = int(end)
                    else:
                        fp_region_count += fpclasses.countbases(chrom,
                                                                label="FP")
                else:
                    fp_region_count = any_fp
            else:
                cs = fastaContigLengths(args.ref)
                if args.location:
                    fp_region_count = calculateLength(cs, args.location)
                else:
                    # use all locations we saw calls on
                    h1 = Tools.vcfextract.extractHeadersJSON(ntpath)
                    h1_chrs = h1["tabix"]["chromosomes"]

                    if not h1_chrs:
                        logging.warn("No contigs in truth file")
                        h1_chrs = []

                    if len(h1_chrs) > 0:
                        qlocations = " ".join(h1_chrs)
                        fp_region_count = calculateLength(cs, qlocations)
                    else:
                        fp_region_count = 0

        res["fp.region.size"] = fp_region_count
        res["fp.rate"] = 1e6 * res["fp"] / res["fp.region.size"]

        if args.count_filtered_fn:
            res["recall.filtered"] = (res["tp"] - res["tp.filtered"]) / (
                res["tp"] + res["fn"])

            res["precision.filtered"] = (res["tp"] - res["tp.filtered"]) / (
                res["tp"] - res["tp.filtered"] + res["fp"] -
                res["fp.filtered"])

            res["fp.rate.filtered"] = 1e6 * (
                res["fp"] - res["fp.filtered"]) / res["fp.region.size"]

            res["na.filtered"] = (res["unk"] -
                                  res["unk.filtered"]) / (res["total.query"])
            res["ambiguous.filtered"] = (
                res["ambi"] - res["ambi.filtered"]) / res["total.query"]

        # HAP-162 remove inf values
        res.replace([np.inf, -np.inf], 0)

        metrics_output["metrics"].append(dataframeToMetricsTable(
            "result", res))
        vstring = "som.py-%s" % Tools.version

        logging.info("\n" + res.to_string())
        # in default mode, print result summary to stdout
        if not args.quiet and not args.verbose:
            print "\n" + res.to_string()

        res["sompyversion"] = vstring

        vstring = " ".join(sys.argv)
        res["sompycmd"] = vstring

        # save results
        res.to_csv(args.output + ".stats.csv")

        with open(args.output + ".metrics.json", "w") as fp:
            json.dump(metrics_output, fp)

        if args.happy_stats:
            # parse saved feature table as the one in memory has been updated
            featuretable = pandas.read_csv(args.output + ".features.csv",
                                           low_memory=False,
                                           dtype={"FILTER": str})

            # hap.py summary.csv
            summary = summary_from_featuretable(featuretable, args)
            summary.to_csv(args.output + ".summary.csv")

            #  hap.py extended.csv
            if args.af_strat:
                extended = extended_from_featuretable(featuretable, args)
                extended.to_csv(args.output + ".extended.csv",
                                index=False,
                                na_rep="NA")

    finally:
        if args.delete_scratch:
            shutil.rmtree(scratch)
        else:
            logging.info("Scratch kept at %s" % scratch)