Example #1
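
This example appears to come from hap.py's somatic feature-extraction code (VarScan2 SNVs). The snippet relies on its surrounding module; below is a minimal sketch of the imports it assumes -- the Tools.vcfextract path is a guess from the call sites, the rest is standard library plus pandas:

import logging

import pandas

# assumed hap.py-internal helpers used below
from Tools.vcfextract import vcfExtract, extractHeadersJSON
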
def extractVarscan2SNVFeatures(vcfname, tag, avg_depth=None):
        """ Return a data frame with features collected from the given VCF, tagged by given type """
        records = []

        if not avg_depth:
            logging.warn("No average depths available, normalized depth features cannot be calculated")

        hdrs = extractHeadersJSON(vcfname)

        tsn = ""
        nsn = ""

        n_sample = "NORMAL"
        t_sample = "TUMOR"

        logging.info("Normal sample name : %s (prefix %s) / tumour sample name : %s (prefix %s)" % (nsn, n_sample,
                                                                                                    tsn, t_sample))

        features = ["CHROM", "POS", "REF", "ALT", "FILTER",
                    "I.SSC", "I.GPV", "I.SPV",
                    n_sample + "GT", t_sample + "GT", # Genotype
                    n_sample + "GQ", t_sample + "GQ", # Genotype quality
                    n_sample + "DP", t_sample + "DP", # Read depth
                    n_sample + "RD", t_sample + "RD", # Reference depth
                    n_sample + "AD", t_sample + "AD", # Alternative depth
                    n_sample + "FREQ", t_sample + "FREQ" # Alt. frequence (FA in MuTect)
                    ]
        
        has_warned = {}

        for vr in vcfExtract(vcfname, features):
            rec = {}
            for i, ff in enumerate(features):
                rec[ff] = vr[i]

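            # genotype columns get "." as their missing value; all other sample
            # columns are numeric and default to 0 in the loop further below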
            for q in [n_sample + "GT", t_sample + "GT"]:
                if not q in rec or rec[q] is None:
                    rec[q] = "."
                    if not ("feat:" + q) in has_warned:
                        logging.warn("Missing feature %s" % q)
                        has_warned["feat:" + q] = True

            # fix missing features
            for q in [n_sample + "GT", t_sample + "GT",
                      n_sample + "GQ", t_sample + "GQ",
                      n_sample + "DP", t_sample + "DP",
                      n_sample + "AD", t_sample + "AD",
                      n_sample + "RD", t_sample + "RD",
                      n_sample + "FREQ", t_sample + "FREQ"]:
                if not q in rec or rec[q] is None:
                    rec[q] = 0
                    if not ("feat:" + q) in has_warned:
                        logging.warn("Missing feature %s" % q)
                        has_warned["feat:" + q] = True
                else:
                    if q.endswith("FREQ"):
                        try:
                            rec[q] = float(rec[q])
                        except ValueError:
                            rec[q] = float("NaN")
                    
                    else:
                        try:
                            rec[q] = int(rec[q])
                        except ValueError:
                            rec[q] = -1

            rec["tag"] = tag

            n_DP        = float(rec[n_sample + "DP"])
            t_DP        = float(rec[t_sample + "DP"])

            n_DP_ratio = 0
            t_DP_ratio = 0

            if avg_depth:
                if rec["CHROM"] in avg_depth:
                    n_DP_ratio      = n_DP/float(avg_depth[rec["CHROM"]])
                    t_DP_ratio      = t_DP/float(avg_depth[rec["CHROM"]])
                elif not rec["CHROM"] in has_warned:
                    logging.warn("Cannot normalize depths on %s" % rec["CHROM"])
                    has_warned[rec["CHROM"]] = True
            elif not "DPnorm" in has_warned:
                logging.warn("Cannot normalize depths.")
                has_warned["DPnorm"] = True

            n_allele_ref_count = rec[n_sample + "RD"]
            alleles_alt = rec["ALT"]

            if alleles_alt == ['.']:
                n_allele_alt_count = 0
            else:
                n_allele_alt_count = rec[n_sample + "AD"]

            if n_allele_alt_count + n_allele_ref_count == 0:
                n_allele_rate = 0
            else:
                n_allele_rate = n_allele_alt_count / float(n_allele_alt_count + n_allele_ref_count)

            t_allele_ref_count = rec[t_sample + "RD"]
            alleles_alt = rec["ALT"]

            if alleles_alt == ['.']:
                t_allele_alt_count = 0
            else:
                t_allele_alt_count =  rec[t_sample + "AD"]

            if t_allele_alt_count + t_allele_ref_count == 0:
                t_allele_rate = 0
            else:
                t_allele_rate = t_allele_alt_count / float(t_allele_alt_count + t_allele_ref_count)

            # Gather the computed data into a dict
            qrec = {
                "CHROM": rec["CHROM"],
                "POS": int(rec["POS"]),
                "REF": rec["REF"],
                "ALT": ",".join(rec["ALT"]),
                "FILTER": ",".join(rec["FILTER"]),
                "SSC": rec["I.SSC"],
                "GPV": rec["I.GPV"],
                "SPV": rec["I.SPV"],                
                "N_DP": n_DP,
                "T_DP": t_DP,
                "N_DP_RATE" : n_DP_ratio,
                "T_DP_RATE" : t_DP_ratio,
                "N_GT": rec[n_sample + "GT"],
                "T_GT": rec[t_sample + "GT"],
                "N_GQ": rec[n_sample +"GQ"],
                "T_GQ": rec[t_sample +"GQ"],
                "N_AD": rec[n_sample + "AD"],
                "T_AD": rec[t_sample + "AD"],
                "N_FA": rec[n_sample + "FREQ"],
                "T_FA": rec[t_sample + "FREQ"],
                "N_ALT_RATE": n_allele_rate,
                "T_ALT_RATE": t_allele_rate,
                "tag" : tag
            }
            
            records.append(qrec)

        cols = [
            "CHROM",
            "POS",
            "REF",
            "ALT",
            "FILTER",
            "SSC",
            "GPV",
            "SPV",
            "N_DP",
            "T_DP",
            "N_DP_RATE",
            "T_DP_RATE",
            "N_GT",
            "T_GT",
            "N_GQ",
            "T_GQ",
            "N_AD",
            "T_AD",
            "N_FA",
            "T_FA",
            "N_ALT_RATE",
            "T_ALT_RATE",
            "tag"]
            

        if records:
            df = pandas.DataFrame(records, columns=cols)
        else:
            df = pandas.DataFrame(columns=cols)

        return df
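
A minimal usage sketch (hypothetical file name, tag and depth values; avg_depth maps chromosome names, as they appear in the VCF, to mean coverage):

avg_depth = {"chr1": 30.0, "chr2": 29.5}   # hypothetical per-chromosome averages
df = extractVarscan2SNVFeatures("varscan2.snvs.vcf.gz", "somatic", avg_depth)
print df[["CHROM", "POS", "N_ALT_RATE", "T_ALT_RATE"]].head()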
Example #2
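
This example is the top-level main() of the hap.py command-line tool. A sketch of the imports it assumes -- the standard-library and pandas ones follow directly from the call sites, while the hap.py-internal module paths are best guesses:

import argparse
import gzip
import json
import logging
import multiprocessing
import os
import subprocess
import sys
import tempfile

import pandas

# assumed hap.py-internal modules
import Tools
import Haplo.blocksplit
import Haplo.happyroc
import Haplo.quantify
import Haplo.vcfeval
import Haplo.xcmp
from Tools import vcfextract
# preprocessVCF, bedOverlapCheck, runParallel, makeMetricsObject and
# dataframeToMetricsTable are further hap.py helpers imported unqualified
# elsewhere in the original module.
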
def main():
    parser = argparse.ArgumentParser("Haplotype Comparison")

    # input
    parser.add_argument('--location', '-l', dest='locations', required=False, default=None,
                        help='Add a location to the compare list (when not given, will use chr1-22, chrX, chrY).')

    parser.add_argument("-v", "--version", dest="version", action="store_true",
                        help="Show version number and exit.")

    parser.add_argument("-P", "--include-nonpass", dest="usefiltered", action="store_true", default=False,
                        help="Use to include failing query variants in comparison.")

    parser.add_argument("--include-nonpass-truth", dest="usefiltered_truth", action="store_true", default=False,
                        help="Include failing variants from the truth dataset.")

    parser.add_argument("-R", "--restrict-regions", dest="regions_bedfile",
                        default=None, type=str,
                        help="Restrict analysis to given (sparse) regions (using -R in bcftools).")

    parser.add_argument("-T", "--target-regions", dest="targets_bedfile",
                        default=None, type=str,
                        help="Restrict analysis to given (dense) regions (using -T in bcftools).")

    parser.add_argument("-f", "--false-positives", dest="fp_bedfile",
                        default=None, type=str,
                        help="False positive / confident call regions (.bed or .bed.gz).")

    parser.add_argument("-r", "--reference", dest="ref", default=None, help="Specify a reference file.")

    # output
    parser.add_argument("-o", "--report-prefix", dest="reports_prefix",
                        default=None,
                        help="Filename prefix for report output.")

    parser.add_argument("-V", "--write-vcf", dest="write_vcf",
                        default=False, action="store_true",
                        help="Write an annotated VCF.")

    parser.add_argument("-B", "--write-bed", dest="write_bed",
                        default=False, action="store_true",
                        help="Write a bed file with the haplotype blocks that were used.")

    parser.add_argument("-X", "--write-counts", dest="write_counts",
                        default=True, action="store_true",
                        help="Write advanced counts and metrics.")

    parser.add_argument("--no-write-counts", dest="write_counts",
                        default=True, action="store_false",
                        help="Do not write advanced counts and metrics.")

    parser.add_argument("--raw-counts", dest="raw_counts",
                        default=False, action="store_true",
                        help="Count variants in unprocessed input VCFs and output as TOTAL.*.RAW.")

    parser.add_argument("--roc", dest="roc", default=False,
                        help="Select an INFO feature to produce a ROC on. This works best with "
                             "--no-internal-preprocessing and --no-internal-leftshift since these "
                             "flags preserve the most INFO flags from the input files.")

    parser.add_argument("--roc-filter", dest="roc_filter", default=False,
                        help="Select a filter to ignore when making ROCs.")

    parser.add_argument("--roc-reversed", dest="roc_reversed", default=False,
                        help="Change the meaning of the ROC feature to count the other way around (higher values=bad).")

    parser.add_argument("--scratch-prefix", dest="scratch_prefix",
                        default=None,
                        help="Directory for scratch files.")

    parser.add_argument("--keep-scratch", dest="delete_scratch",
                        default=True, action="store_false",
                        help="Filename prefix for scratch report output.")

    # detailed control of comparison
    parser.add_argument("--preprocess-truth", dest="preprocessing_truth", action="store_true", default=False,
                        help="Preprocess truth file using bcftools.")

    parser.add_argument("--external-preprocessing", dest="preprocessing", action="store_true", default=False,
                        help="Perform VCF preprocessing using bcftools.")

    parser.add_argument("--bcftools-norm", dest="preprocessing_norm", action="store_true", default=False,
                        help="Enable preprocessing through bcftools norm -c x -D (requires external "
                             " preprocessing to be switched on).")

    parser.add_argument("--fixchr-truth", dest="fixchr_truth", action="store_true", default=None,
                        help="Add chr prefix to truth file (default: auto).")

    parser.add_argument("--fixchr-query", dest="fixchr_query", action="store_true", default=None,
                        help="Add chr prefix to query file (default: auto).")

    parser.add_argument("--no-fixchr-truth", dest="fixchr_truth", action="store_false",
                        help="Disable chr replacement for truth (default: auto).")

    parser.add_argument("--no-fixchr-query", dest="fixchr_query", action="store_false",
                        help="Add chr prefix to query file (default: auto).")

    parser.add_argument("--partial-credit", dest="partial_credit", action="store_true", default=None,
                        help="give credit for partially matched variants. "
                             "this is equivalent to --internal-leftshift and --internal-preprocessing.")

    parser.add_argument("--no-partial-credit", dest="partial_credit", action="store_false", default=None,
                        help="Give credit for partially matched variants. "
                             "This is equivalent to --internal-leftshift and --no-internal-preprocessing.")

    parser.add_argument("--internal-leftshift", dest="int_preprocessing_ls", action="store_true", default=None,
                        help="Switch off xcmp's internal VCF leftshift preprocessing.")

    parser.add_argument("--internal-preprocessing", dest="int_preprocessing", action="store_true", default=None,
                        help="Switch off xcmp's internal VCF leftshift preprocessing.")

    parser.add_argument("--no-internal-leftshift", dest="int_preprocessing_ls", action="store_false", default=None,
                        help="Switch off xcmp's internal VCF leftshift preprocessing.")

    parser.add_argument("--no-internal-preprocessing", dest="int_preprocessing", action="store_false", default=None,
                        help="Switch off xcmp's internal VCF leftshift preprocessing.")

    parser.add_argument("--match-raw", dest="int_match_raw", action="store_true", default=False,
                        help="Add a matching step in xcmp which also matches raw variant calls. This helps"
                             " when comparing files with very different representations.")

    parser.add_argument("--no-haplotype-comparison", dest="no_hc", action="store_true", default=False,
                        help="Disable haplotype comparison (only count direct GT matches as TP).")

    parser.add_argument("--unhappy", dest="unhappy", action="store_true", default=False,
                        help="Combination of --no-haplotype-comparison --no-internal-preprocessing "
                             "--no-internal-leftshift.")

    parser.add_argument("--no-auto-index", dest="auto_index", action="store_false", default=True,
                        help="Disable automatic index creation for input files. "
                             "The index is only necessary at this stage if we want to auto-detect locations. "
                             "When used with -l, and when it is known that there are variants at all given locations "
                             "this is not needed and can be switched off to save time.")

    parser.add_argument("-w", "--window-size", dest="window",
                        default=50, type=int,
                        help="Minimum distance between two variants such that they fall into different haplotype "
                             "blocks")

    parser.add_argument("--enumeration-threshold", dest="max_enum",
                        default=16768, type=int,
                        help="Enumeration threshold / maximum number of sequences to enumerate per block.")

    parser.add_argument("-e", "--expand-hapblocks", dest="hb_expand",
                        default=30, type=int,
                        help="Expand haplotype blocks by this many basepairs left and right.")
    parser.add_argument("--threads", dest="threads",
                        default=multiprocessing.cpu_count(), type=int,
                        help="Number of threads to use.")

    parser.add_argument("--engine", dest="engine",
                        default="xcmp", choices=["xcmp", "vcfeval"],
                        help="Comparison engine to use.")

    parser.add_argument("--engine-vcfeval-path", dest="engine_vcfeval", required=False,
                        help="This parameter should give the path to the \"rtg\" executable.")
    parser.add_argument("--engine-vcfeval-template", dest="engine_vcfeval_template", required=False,
                        help="Vcfeval needs the reference sequence formatted in its own file format "
                             "(SDF -- run rtg format -o ref.SDF ref.fa).")

    if Tools.has_sge:
        parser.add_argument("--force-interactive", dest="force_interactive",
                            default=False, action="store_true",
                            help="Force running interactively (i.e. when JOB_ID is not in the environment)")

    parser.add_argument("_vcfs", help="Two VCF files.", default=[], nargs="*")

    parser.add_argument("--logfile", dest="logfile", default=None,
                        help="Write logging information into file rather than to stderr")

    verbosity_options = parser.add_mutually_exclusive_group(required=False)

    verbosity_options.add_argument("--verbose", dest="verbose", default=False, action="store_true",
                                   help="Raise logging level from warning to info.")

    verbosity_options.add_argument("--quiet", dest="quiet", default=False, action="store_true",
                                   help="Set logging level to output errors only.")

    args, unknown_args = parser.parse_known_args()

    if not Tools.has_sge:
        args.force_interactive = True

    if args.verbose:
        loglevel = logging.INFO
    elif args.quiet:
        loglevel = logging.ERROR
    else:
        loglevel = logging.WARNING

    # reinitialize logging
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)
    logging.basicConfig(filename=args.logfile,
                        format='%(asctime)s %(levelname)-8s %(message)s',
                        level=loglevel)

    # remove some safe unknown args
    unknown_args = [x for x in unknown_args if x not in ["--force-interactive"]]
    if len(sys.argv) < 2 or len(unknown_args) > 0:
        if unknown_args:
            logging.error("Unknown arguments specified : %s " % str(unknown_args))
        parser.print_help()
        exit(0)

    if args.version:
        print "Hap.py %s" % Tools.version
        exit(0)

    if args.roc:
        args.write_vcf = True

    # disable all clever matching
    if args.unhappy:
        args.int_preprocessing = False
        args.int_preprocessing_ls = False
        args.no_hc = True

    # Counting with partial credit
    elif args.partial_credit:
        # partial_credit switch is overridden by --no-* switches
        args.int_preprocessing = True
        args.int_preprocessing_ls = True
    elif args.partial_credit is None:
        # in the default setting, we enable partial credit but only override the
        # preprocessing settings if they haven't been specified
        if args.int_preprocessing is None:
            args.int_preprocessing = True
        if args.int_preprocessing_ls is None:
            args.int_preprocessing_ls = True
    elif args.partial_credit is not None:  # explicitly set to false
        args.int_preprocessing = False
        args.int_preprocessing_ls = True

    if args.int_preprocessing is None:
        args.int_preprocessing = False
    if args.int_preprocessing_ls is None:
        args.int_preprocessing_ls = False

    logging.info("Preprocessing settings: %s / %s / %s" % ("leftshift" if args.int_preprocessing_ls else "no-leftshift",
                                                           "splitting" if args.int_preprocessing else "raw calls",
                                                           "haplocompare" if not args.no_hc else "no-haplocompare"))

    # sanity-check regions bed file (HAP-57)
    if args.regions_bedfile:
        logging.info("Checking input regions.")
        if bedOverlapCheck(args.regions_bedfile):
            raise Exception("The regions bed file (specified using -R) has overlaps, this will not work with xcmp."
                            " You can either use -T, or run the file through bedtools merge")
        args.preprocessing_truth = True
        args.preprocessing = True

    if args.targets_bedfile or args.engine != "xcmp":
        args.preprocessing_truth = True
        args.preprocessing = True

    if args.fp_bedfile and not os.path.exists(args.fp_bedfile):
        raise Exception("FP/confident call region bed file does not exist.")

    tempfiles = []

    try:
        if not args.force_interactive and "JOB_ID" not in os.environ:
            parser.print_help()
            raise Exception("Please qsub me so I get approximately 1 GB of RAM per thread.")

        if not args.ref:
            args.ref = Tools.defaultReference()

        if not os.path.exists(args.ref):
            raise Exception("Please specify a valid reference path using -r.")

        if not args.reports_prefix:
            raise Exception("Please specify an output prefix using -o ")

        if not os.path.exists(os.path.dirname(os.path.abspath(args.reports_prefix))):
            raise Exception("The output path does not exist. Please specify a valid output path and prefix using -o")

        if os.path.basename(args.reports_prefix) == "" or os.path.isdir(args.reports_prefix):
            raise Exception("The output path should specify a file name prefix. Please specify a valid output path "
                            "and prefix using -o. For example, -o /tmp/test will create files named /tmp/test* .")

        # noinspection PyProtectedMember
        if not args._vcfs or len(args._vcfs) != 2:
            raise Exception("Please specify exactly two input VCFs.")

        # noinspection PyProtectedMember
        args.vcf1 = args._vcfs[0]
        # noinspection PyProtectedMember
        args.vcf2 = args._vcfs[1]

        if not os.path.exists(args.vcf1):
            raise Exception("Input file %s does not exist." % args.vcf1)
        if not os.path.exists(args.vcf2):
            raise Exception("Input file %s does not exist." % args.vcf2)

        logging.info("Comparing %s and %s" % (args.vcf1, args.vcf2))

        h1 = vcfextract.extractHeadersJSON(args.vcf1)
        if args.auto_index and not h1["tabix"]:
            logging.info("Creating indexed version of %s -- consider creating an index beforehand to save time here." %
                         args.vcf1)
            vtf = tempfile.NamedTemporaryFile(delete=False,
                                              dir=args.scratch_prefix,
                                              prefix="truth.ix",
                                              suffix=".vcf.gz")
            vtf.close()
            tempfiles.append(vtf.name)
            tempfiles.append(vtf.name + ".tbi")
            args.vcf1 = Tools.bcftools.makeIndex(args.vcf1, vtf.name)
            h1 = vcfextract.extractHeadersJSON(args.vcf1)

        h2 = vcfextract.extractHeadersJSON(args.vcf2)
        if args.auto_index and not h2["tabix"]:
            logging.info("Creating indexed version of %s -- consider creating an index beforehand to save time here." %
                         args.vcf2)
            vtf = tempfile.NamedTemporaryFile(delete=False,
                                              dir=args.scratch_prefix,
                                              prefix="query.ix",
                                              suffix=".vcf.gz")
            vtf.close()
            tempfiles.append(vtf.name)
            tempfiles.append(vtf.name + ".tbi")
            args.vcf2 = Tools.bcftools.makeIndex(args.vcf2, vtf.name)
            h2 = vcfextract.extractHeadersJSON(args.vcf2)

        ref_check = False
        try:
            happy_ref = args.ref
            v1r = [_h for _h in h1["fields"] if _h["key"] == "reference"]
            v2r = [_h for _h in h2["fields"] if _h["key"] == "reference"]
            if args.verbose:
                logging.info("References used: hap.py: %s / truth: %s / "
                             "query: %s" % (str(happy_ref), str(v1r), str(v2r)))

            v1_ref = ";".join([str(xxy["value"]) for xxy in v1r]).replace("file://", "")
            v2_ref = ";".join([str(xxy["value"]) for xxy in v2r]).replace("file://", "")

            if happy_ref == v1_ref and v1_ref == v2_ref:
                ref_check = True

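            # fall back to well-known reference identifiers: the check passes if
            # exactly one of hg19/hg38/grc37/grc38 occurs in all three reference strings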
            refids_found = 0
            for refid in ["hg19", "hg38", "grc37", "grc38"]:
                if refid in happy_ref.lower() and refid in v1_ref.lower() and refid in v2_ref.lower():
                    if args.verbose:
                        logging.info("Reference matches pattern: %s" % refid)
                    refids_found += 1
            if refids_found == 1:
                ref_check = True
        except:
            pass

        if not ref_check:
            logging.warn("Reference sequence check failed! "
                         "Please ensure that truth and query VCF use the same reference sequence as "
                         "hap.py. XCMP may fail if this is not the case, and the results will not be "
                         " accurate.")

        if args.locations is None or len(args.locations) == 0:
            # all chromosomes (chr1-22 plus chrX/chrY, matching the -l default described in the help text)
            args.locations = ["chr" + str(x) for x in range(1, 23)] + ["chrX", "chrY"]

        if type(args.locations) is not list and args.locations is not None:
            # noinspection PyUnresolvedReferences
            args.locations = args.locations.split(",")

        if not h1["tabix"]:
            args.preprocessing_truth = True
            logging.warn("Truth file is not Tabix indexed. Switching on pre-processing + chr name conversion.")
            if args.fixchr_truth is None:
                args.fixchr_truth = True
        elif args.fixchr_truth is None:
            # autodetect chr naming
            count_with_fix = len([__ for __ in h1["tabix"]["chromosomes"]
                                 if ("chr%s" % str(__)) in args.locations])
            count_no_fix = len([__ for __ in h1["tabix"]["chromosomes"] if str(__) in args.locations])
            logging.info("Truth: Number of chromosome names matching with / without renaming : %i / %i " % (
                count_with_fix, count_no_fix))
            if count_with_fix > count_no_fix:
                args.fixchr_truth = True
                logging.info("Will fix chromosome names (truth).")
            else:
                logging.info("Will not fix chromosome names (truth).")
                args.fixchr_truth = False

        if not h2["tabix"]:
            args.preprocessing = True
            logging.warn("Query file is not Tabix indexed. Switching on pre-processing + chr name conversion.")
            # don't overwrite setting, but if it's None, replace with True to be sure
            if args.fixchr_query is None:
                args.fixchr_query = True
        elif args.fixchr_query is None:
            # autodetect chr naming
            count_with_fix = len([__ for __ in h2["tabix"]["chromosomes"]
                                 if ("chr%s" % str(__)) in args.locations])
            count_no_fix = len([__ for __ in h2["tabix"]["chromosomes"] if str(__) in args.locations])
            logging.info("Query: Number of chromosome names matching with / without renaming : %i / %i " % (
                count_with_fix, count_no_fix))
            if count_with_fix > count_no_fix:
                args.fixchr_query = True
                logging.info("Will fix chromosome names (query).")
            else:
                logging.info("Will not fix chromosome names (query).")
                args.fixchr_query = False

        if args.fixchr_truth or args.preprocessing_norm:
            args.preprocessing_truth = True

        if args.fixchr_query or args.preprocessing_norm:
            args.preprocessing = True

        if args.preprocessing_truth:
            vtf = tempfile.NamedTemporaryFile(delete=False,
                                              dir=args.scratch_prefix,
                                              prefix="truth.pp",
                                              suffix=".vcf.gz")
            vtf.close()
            tempfiles.append(vtf.name)
            preprocessVCF(args.vcf1, vtf.name, ",".join(args.locations),
                          not args.usefiltered_truth,     # pass_only
                          args.fixchr_truth,        # chrprefix
                          args.preprocessing_norm,  # norm,
                          args.regions_bedfile,
                          args.targets_bedfile,
                          args.ref)
            args.vcf1 = vtf.name
            # get headers again if we preprocessed
            h1 = vcfextract.extractHeadersJSON(args.vcf1)

        if args.preprocessing:
            vtf = tempfile.NamedTemporaryFile(delete=False,
                                              dir=args.scratch_prefix,
                                              prefix="query.pp",
                                              suffix=".vcf.gz")
            vtf.close()
            tempfiles.append(vtf.name)
            preprocessVCF(args.vcf2, vtf.name, ",".join(args.locations),
                          not args.usefiltered,     # pass_only
                          args.fixchr_query,        # chrprefix
                          args.preprocessing_norm,  # norm,
                          args.regions_bedfile,
                          args.targets_bedfile,
                          args.ref)
            args.vcf2 = vtf.name
            # get headers again if we preprocessed
            h2 = vcfextract.extractHeadersJSON(args.vcf2)

        if not h1["tabix"]:
            raise Exception("Truth file is not Tabix indexed.")

        if not h2["tabix"]:
            raise Exception("Truth file is not Tabix indexed.")

        newlocations = []

        if not h1["tabix"]["chromosomes"]:
            h1["tabix"]["chromosomes"] = []
        if not h2["tabix"]["chromosomes"]:
            h2["tabix"]["chromosomes"] = []

        for _xc in args.locations:
            xc = _xc.split(":")[0]
            if xc not in h1["tabix"]["chromosomes"]:
                logging.warn("No calls for location %s in truth!" % xc)
            if xc not in h2["tabix"]["chromosomes"]:
                logging.warn("No calls for location %s in query!" % xc)

            if (xc not in h1["tabix"]["chromosomes"]) and (xc not in h2["tabix"]["chromosomes"]):
                logging.warn("Removing location %s because neither input file has calls there." % xc)
            else:
                newlocations.append(_xc)

        if not newlocations:
            raise Exception("Location list is empty: the input files do not appear to have variants on any of %s" %
                            str(args.locations))

        args.locations = newlocations

        if args.threads > 1:
            logging.info("Running using %i parallel processes." % args.threads)
            pool = multiprocessing.Pool(int(args.threads))

            # find balanced pieces
            args.pieces = (args.threads + len(args.locations) - 1) / len(args.locations)
            res = runParallel(pool, Haplo.blocksplit.blocksplitWrapper, args.locations, args)

            if None in res:
                raise Exception("One of the blocksplit processes failed.")

            tempfiles += res

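            # replace the original locations by the balanced blocks written by
            # blocksplit (BED starts are 0-based, region strings are 1-based)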
            args.locations = []
            for f in res:
                with open(f) as fp:
                    for l in fp:
                        ll = l.strip().split("\t", 3)
                        if len(ll) < 3:
                            continue
                        xchr = ll[0]
                        start = int(ll[1]) + 1
                        end = int(ll[2])
                        args.locations.append("%s:%i-%i" % (xchr, start, end))
        else:
            pool = None

        # count variants before normalisation
        if "samples" not in h1 or not h1["samples"]:
            raise Exception("Cannot read sample names from truth input file")

        if args.raw_counts:
            counts_truth = Haplo.quantify.run_quantify(args.vcf1,
                                                       None,
                                                       None,
                                                       {"CONF": args.fp_bedfile} if args.fp_bedfile else None,
                                                       args.ref,
                                                       h1["samples"][0],
                                                       locations=args.locations)
        else:
            counts_truth = None

        if "samples" not in h2 or not h2["samples"]:
            raise Exception("Cannot read sample names from truth input file")
        if args.raw_counts:
            counts_query = Haplo.quantify.run_quantify(args.vcf2,
                                                       None,
                                                       None,
                                                       {"CONF": args.fp_bedfile} if args.fp_bedfile else None,
                                                       args.ref,
                                                       h2["samples"][0],
                                                       locations=args.locations)
        else:
            counts_query = None

        tf = tempfile.NamedTemporaryFile(delete=False,
                                         dir=args.scratch_prefix,
                                         prefix="hap.py.result.", suffix=".vcf.gz")
        tf.close()
        tempfiles.append(tf.name)
        output_name = tf.name

        if args.engine == "xcmp":
            # do xcmp
            logging.info("Using xcmp for comparison")
            res = runParallel(pool, Haplo.xcmp.xcmpWrapper, args.locations, args)
            tempfiles += [x[0] for x in res if x is not None]   # VCFs
            tempfiles += [x[1] for x in res if x is not None and x[1] is not None]   # beds (if any)

            if None in res:
                raise Exception("One of the xcmp jobs failed.")

            if len(res) == 0:
                raise Exception("Input files/regions do not contain variants (0 haplotype blocks were processed).")

            # concatenate + index
            bedfiles = [x[1] for x in res if x is not None and x[1] is not None]
            if args.write_bed and bedfiles:
                runme = " ".join(["cat"] +
                                 bedfiles +
                                 [">", args.reports_prefix.replace(" ", "\\ ") + ".blocks.bed"])
                logging.info("Concatenating block files: %s..." % runme)
                subprocess.check_call(runme,
                                      shell=True)

            logging.info("Concatenating variants...")
            runme_list = [x[0] for x in res if x is not None]
            if len(runme_list) == 0:
                raise Exception("No outputs to concatenate!")

            fo = Tools.BGZipFile(output_name, True)
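            # write the full header only from the first chunk and strip "#" lines
            # from all later chunks, so the concatenation is a single valid VCF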
            for i, x in enumerate(runme_list):
                f = gzip.GzipFile(x)
                for l in f:
                    if i == 0 or not l[0] == "#":
                        fo.write(l)
            fo.close()

            logging.info("Indexing...")
            to_run = "tabix -p vcf %s" % output_name.replace(" ", "\\ ")
            logging.info("Running '%s'" % to_run)
            subprocess.check_call(to_run, shell=True)
        elif args.engine == "vcfeval":
            tempfiles += Haplo.vcfeval.runVCFEval(args.vcf1, args.vcf2, output_name, args)
        else:
            raise Exception("Unknown comparison engine: %s" % args.engine)

        if args.write_counts:
            json_name = args.reports_prefix + ".counts.json"
        else:
            tf = tempfile.NamedTemporaryFile(delete=False,
                                             dir=args.scratch_prefix,
                                             prefix="counts.",
                                             suffix=".json")
            tf.close()
            json_name = tf.name

        logging.info("Counting variants...")

        counts = Haplo.quantify.run_quantify(output_name,
                                             json_name,
                                             args.reports_prefix + ".vcf.gz" if args.write_vcf else False,
                                             {"CONF": args.fp_bedfile} if args.fp_bedfile else None,
                                             args.ref)

        df = pandas.DataFrame(counts)
        if args.write_counts:
            df.to_csv(args.reports_prefix + ".counts.csv")

        metrics_output = makeMetricsObject("hap.py.comparison")

        if args.write_counts:
            metrics_output["metrics"].append(dataframeToMetricsTable("raw.counts", df))

        # calculate precision / recall
        count_types = []
        if args.raw_counts:
            simplified_truth_counts = Haplo.quantify.simplify_counts(counts_truth, h1["samples"][0:1])
            simplified_query_counts = Haplo.quantify.simplify_counts(counts_query, h2["samples"][0:1])

            count_types += simplified_truth_counts.keys()
            count_types += simplified_query_counts.keys()
        else:
            simplified_truth_counts = None
            simplified_query_counts = None

        simplified_numbers = Haplo.quantify.simplify_counts(counts)

        count_types += simplified_numbers.keys()
        count_types = sorted(list(set(count_types)))

        for vtype in count_types:
            if vtype not in simplified_numbers:
                simplified_numbers[vtype] = {}

            simplified_numbers[vtype]["METRIC.Recall"] = 0
            simplified_numbers[vtype]["METRIC.Recall2"] = 0
            simplified_numbers[vtype]["METRIC.Precision"] = 0
            simplified_numbers[vtype]["METRIC.Frac_NA"] = 0

            try:
                simplified_numbers[vtype]["METRIC.Recall"] = \
                    float(simplified_numbers[vtype]["TRUTH.TP"]) / \
                    float(simplified_numbers[vtype]["TRUTH.TP"] + simplified_numbers[vtype]["TRUTH.FN"])
            except:
                pass

            try:
                simplified_numbers[vtype]["METRIC.Recall2"] = \
                    float(simplified_numbers[vtype]["TRUTH.TP"]) / \
                    float(simplified_numbers[vtype]["TRUTH.TOTAL"])
            except:
                pass

            try:
                simplified_numbers[vtype]["METRIC.Precision"] = \
                    float(simplified_numbers[vtype]["QUERY.TP"]) / \
                    float(simplified_numbers[vtype]["QUERY.TP"] + simplified_numbers[vtype]["QUERY.FP"])
            except:
                pass

            try:
                simplified_numbers[vtype]["METRIC.Frac_NA"] = \
                    float(simplified_numbers[vtype]["QUERY.UNK"]) / \
                    float(simplified_numbers[vtype]["QUERY.TOTAL"])
            except:
                pass

            try:
                simplified_numbers[vtype]["TRUTH.TOTAL.RAW"] = simplified_truth_counts[vtype][h1["samples"][0] +
                                                                                              ".TOTAL"]
            except:
                pass

            try:
                simplified_numbers[vtype]["QUERY.TOTAL.RAW"] = simplified_query_counts[vtype][h2["samples"][0] +
                                                                                              ".TOTAL"]
            except:
                pass

        pandas.set_option("display.width", 120)
        pandas.set_option("display.max_columns", 1000)
        df = pandas.DataFrame(simplified_numbers).transpose()
        vstring = "hap.py-%s" % Tools.version
        vstring += " ".join(sys.argv)

        df.loc[vstring] = 0

        # for x in df:
        #     # everything not a metric is a count
        #     if not x.startswith("METRIC"):
        #         df[x] = df[x].astype("int64")

        df[["TRUTH.TOTAL",
            "QUERY.TOTAL",
            "METRIC.Recall",
            "METRIC.Precision",
            "METRIC.Frac_NA"]].to_csv(args.reports_prefix + ".summary.csv")

        metrics_output["metrics"].append(dataframeToMetricsTable("summary.metrics",
                                         df[["TRUTH.TOTAL",
                                             "QUERY.TOTAL",
                                             "METRIC.Recall",
                                             "METRIC.Precision",
                                             "METRIC.Frac_NA"]]))

        if args.write_counts:
            df.to_csv(args.reports_prefix + ".extended.csv")
            metrics_output["metrics"].append(dataframeToMetricsTable("all.metrics", df))

        essential_numbers = df[["TRUTH.TOTAL",
                                "QUERY.TOTAL",
                                "METRIC.Recall",
                                "METRIC.Precision",
                                "METRIC.Frac_NA"]]

        pandas.set_option('display.max_columns', 500)
        pandas.set_option('display.width', 1000)

        essential_numbers = essential_numbers[essential_numbers.index.isin(
            ["Locations.SNP", "Locations.INDEL"])]

        logging.info("\n" + str(essential_numbers))

        # in default mode, print result summary to stdout
        if not args.quiet and not args.verbose:
            print "Benchmarking Summary:"
            print str(essential_numbers)

        if args.roc:
            vcf = args.reports_prefix + ".vcf.gz"
            res = Haplo.happyroc.roc(vcf, args.roc, args.roc_filter, args.reports_prefix + ".roc", args.roc_reversed)

            for t in res.iterkeys():
                rocdf = pandas.read_table(res[t])
                metrics_output["metrics"].append(dataframeToMetricsTable("roc." + t, rocdf))

        with open(args.reports_prefix + ".metrics.json", "w") as fp:
            json.dump(metrics_output, fp)
    finally:
        if args.delete_scratch:
            for x in tempfiles:
                try:
                    os.remove(x)
                except:
                    pass
        else:
            logging.info("Scratch files kept : %s" % (str(tempfiles)))
Example #3
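
This example is a later revision of the same main(): BED writing is deprecated, the quantification options are factored out into a qfy module, and numeric chromosome names are auto-detected. On top of the imports sketched for example #2, it assumes the following (the module path of the FASTA helper is a guess from the call site):

import qfy                                        # shared quantification arguments
from Tools.fastasize import fastaContigLengths    # contig name -> length for a FASTA reference
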
def main():
    parser = argparse.ArgumentParser("Haplotype Comparison")

    # input
    parser.add_argument('--location', '-l', dest='locations', required=False, default=None,
                        help='Add a location to the compare list (when not given, will use chr1-22, chrX, chrY).')

    parser.add_argument("-v", "--version", dest="version", action="store_true",
                        help="Show version number and exit.")

    parser.add_argument("-P", "--include-nonpass", dest="usefiltered", action="store_true", default=False,
                        help="Use to include failing query variants in comparison.")

    parser.add_argument("--include-nonpass-truth", dest="usefiltered_truth", action="store_true", default=False,
                        help="Include failing variants from the truth dataset.")

    parser.add_argument("-R", "--restrict-regions", dest="regions_bedfile",
                        default=None, type=str,
                        help="Restrict analysis to given (sparse) regions (using -R in bcftools).")

    parser.add_argument("-T", "--target-regions", dest="targets_bedfile",
                        default=None, type=str,
                        help="Restrict analysis to given (dense) regions (using -T in bcftools).")

    parser.add_argument("-r", "--reference", dest="ref", default=None, help="Specify a reference file.")

    # output
    parser.add_argument("-o", "--report-prefix", dest="reports_prefix",
                        default=None,
                        help="Filename prefix for report output.")

    # DEPRECATED: we don't write bed files after 0.2.9
    parser.add_argument("-B", "--write-bed", dest="write_bed",
                        default=False, action="store_true",
                        help="This option is deprecated. BED files will not be written anymore.")

    # add quantification args
    qfy.updateArgs(parser)

    parser.add_argument("--scratch-prefix", dest="scratch_prefix",
                        default=None,
                        help="Directory for scratch files.")

    parser.add_argument("--keep-scratch", dest="delete_scratch",
                        default=True, action="store_false",
                        help="Filename prefix for scratch report output.")

    # detailed control of comparison
    parser.add_argument("--preprocess-truth", dest="preprocessing_truth", action="store_true", default=False,
                        help="Preprocess truth file using bcftools.")

    parser.add_argument("--external-preprocessing", dest="preprocessing", action="store_true", default=False,
                        help="Perform VCF preprocessing using bcftools.")

    parser.add_argument("--bcftools-norm", dest="preprocessing_norm", action="store_true", default=False,
                        help="Enable preprocessing through bcftools norm -c x -D (requires external "
                             " preprocessing to be switched on).")

    parser.add_argument("-N", "--numeric-chromosomes", dest="numeric_chrs", action="store_true", default=None,
                        help="Use numeric chromosome names for truth and query. This is a shortcut for "
                             "-l 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y "
                             "--no-fixchr-truth --no-fixchr-query")

    parser.add_argument("-C", "--no-numeric-chromosomes", dest="numeric_chrs", action="store_false",
                        help="Use chr-prefixed chromosome names for truth and query. This is a shortcut for "
                             "-l chr1,...,chrY"
                             "--fixchr-truth --fixchr-query")

    parser.add_argument("--fixchr-truth", dest="fixchr_truth", action="store_true", default=None,
                        help="Add chr prefix to truth file (default: auto).")

    parser.add_argument("--fixchr-query", dest="fixchr_query", action="store_true", default=None,
                        help="Add chr prefix to query file (default: auto).")

    parser.add_argument("--no-fixchr-truth", dest="fixchr_truth", action="store_false",
                        help="Disable chr replacement for truth (default: auto).")

    parser.add_argument("--no-fixchr-query", dest="fixchr_query", action="store_false",
                        help="Add chr prefix to query file (default: auto).")

    parser.add_argument("--partial-credit", dest="partial_credit", action="store_true", default=None,
                        help="give credit for partially matched variants. "
                             "this is equivalent to --internal-leftshift and --internal-preprocessing.")

    parser.add_argument("--no-partial-credit", dest="partial_credit", action="store_false", default=None,
                        help="Give credit for partially matched variants. "
                             "This is equivalent to --internal-leftshift and --no-internal-preprocessing.")

    parser.add_argument("--internal-leftshift", dest="int_preprocessing_ls", action="store_true", default=None,
                        help="Switch off xcmp's internal VCF leftshift preprocessing.")

    parser.add_argument("--internal-preprocessing", dest="int_preprocessing", action="store_true", default=None,
                        help="Switch off xcmp's internal VCF leftshift preprocessing.")

    parser.add_argument("--no-internal-leftshift", dest="int_preprocessing_ls", action="store_false", default=None,
                        help="Switch off xcmp's internal VCF leftshift preprocessing.")

    parser.add_argument("--no-internal-preprocessing", dest="int_preprocessing", action="store_false", default=None,
                        help="Switch off xcmp's internal VCF leftshift preprocessing.")

    parser.add_argument("--no-haplotype-comparison", dest="no_hc", action="store_true", default=False,
                        help="Disable haplotype comparison (only count direct GT matches as TP).")

    parser.add_argument("--unhappy", dest="unhappy", action="store_true", default=False,
                        help="Combination of --no-haplotype-comparison --no-internal-preprocessing "
                             "--no-internal-leftshift.")

    parser.add_argument("--no-auto-index", dest="auto_index", action="store_false", default=True,
                        help="Disable automatic index creation for input files. "
                             "The index is only necessary at this stage if we want to auto-detect locations. "
                             "When used with -l, and when it is known that there are variants at all given locations "
                             "this is not needed and can be switched off to save time.")

    parser.add_argument("-w", "--window-size", dest="window",
                        default=50, type=int,
                        help="Minimum distance between two variants such that they fall into different haplotype "
                             "blocks")

    parser.add_argument("--enumeration-threshold", dest="max_enum",
                        default=16768, type=int,
                        help="Enumeration threshold / maximum number of sequences to enumerate per block.")

    parser.add_argument("-e", "--expand-hapblocks", dest="hb_expand",
                        default=30, type=int,
                        help="Expand haplotype blocks by this many basepairs left and right.")
    parser.add_argument("--threads", dest="threads",
                        default=multiprocessing.cpu_count(), type=int,
                        help="Number of threads to use.")

    parser.add_argument("--engine", dest="engine",
                        default="xcmp", choices=["xcmp", "vcfeval"],
                        help="Comparison engine to use.")

    parser.add_argument("--engine-vcfeval-path", dest="engine_vcfeval", required=False,
                        help="This parameter should give the path to the \"rtg\" executable.")
    parser.add_argument("--engine-vcfeval-template", dest="engine_vcfeval_template", required=False,
                        help="Vcfeval needs the reference sequence formatted in its own file format "
                             "(SDF -- run rtg format -o ref.SDF ref.fa).")

    if Tools.has_sge:
        parser.add_argument("--force-interactive", dest="force_interactive",
                            default=False, action="store_true",
                            help="Force running interactively (i.e. when JOB_ID is not in the environment)")

    parser.add_argument("_vcfs", help="Two VCF files.", default=[], nargs="*")

    parser.add_argument("--logfile", dest="logfile", default=None,
                        help="Write logging information into file rather than to stderr")

    verbosity_options = parser.add_mutually_exclusive_group(required=False)

    verbosity_options.add_argument("--verbose", dest="verbose", default=False, action="store_true",
                                   help="Raise logging level from warning to info.")

    verbosity_options.add_argument("--quiet", dest="quiet", default=False, action="store_true",
                                   help="Set logging level to output errors only.")

    args, unknown_args = parser.parse_known_args()

    if not Tools.has_sge:
        args.force_interactive = True

    if args.verbose:
        loglevel = logging.INFO
    elif args.quiet:
        loglevel = logging.ERROR
    else:
        loglevel = logging.WARNING

    # reinitialize logging
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)
    logging.basicConfig(filename=args.logfile,
                        format='%(asctime)s %(levelname)-8s %(message)s',
                        level=loglevel)

    # remove some safe unknown args
    unknown_args = [x for x in unknown_args if x not in ["--force-interactive"]]
    if len(sys.argv) < 2 or len(unknown_args) > 0:
        if unknown_args:
            logging.error("Unknown arguments specified : %s " % str(unknown_args))
        parser.print_help()
        exit(0)

    if args.version:
        print "Hap.py %s" % Tools.version
        exit(0)

    if args.write_bed:
        logging.warn("The -B / --write-bed switches are deprecated in versions 0.2.9+, ")

    if args.roc:
        args.write_vcf = True

    # disable all clever matching
    if args.unhappy:
        args.int_preprocessing = False
        args.int_preprocessing_ls = False
        args.no_hc = True

    # Counting with partial credit
    elif args.partial_credit:
        # partial_credit switch is overridden by --no-* switches
        args.int_preprocessing = True
        args.int_preprocessing_ls = True
    elif args.partial_credit is None:
        # in the default setting, we enable partial credit but only override the
        # preprocessing settings if they haven't been specified
        if args.int_preprocessing is None:
            args.int_preprocessing = True
        if args.int_preprocessing_ls is None:
            args.int_preprocessing_ls = True
    elif args.partial_credit is not None:  # explicitly set to false
        args.int_preprocessing = False
        args.int_preprocessing_ls = True

    if args.int_preprocessing is None:
        args.int_preprocessing = False
    if args.int_preprocessing_ls is None:
        args.int_preprocessing_ls = False

    logging.info("Preprocessing settings: %s / %s / %s" % ("leftshift" if args.int_preprocessing_ls else "no-leftshift",
                                                           "splitting" if args.int_preprocessing else "raw calls",
                                                           "haplocompare" if not args.no_hc else "no-haplocompare"))

    # sanity-check regions bed file (HAP-57)
    if args.regions_bedfile:
        logging.info("Checking input regions.")
        if bedOverlapCheck(args.regions_bedfile):
            raise Exception("The regions bed file (specified using -R) has overlaps, this will not work with xcmp."
                            " You can either use -T, or run the file through bedtools merge")
        args.preprocessing_truth = True
        args.preprocessing = True

    if args.targets_bedfile or args.engine != "xcmp":
        args.preprocessing_truth = True
        args.preprocessing = True

    if args.fp_bedfile and not os.path.exists(args.fp_bedfile):
        raise Exception("FP/confident call region bed file does not exist.")

    tempfiles = []

    try:
        if not args.force_interactive and "JOB_ID" not in os.environ:
            parser.print_help()
            raise Exception("Please qsub me so I get approximately 1 GB of RAM per thread.")

        if not args.ref:
            args.ref = Tools.defaultReference()

        if not os.path.exists(args.ref):
            raise Exception("Please specify a valid reference path using -r.")

        if not args.reports_prefix:
            raise Exception("Please specify an output prefix using -o ")

        if not os.path.exists(os.path.dirname(os.path.abspath(args.reports_prefix))):
            raise Exception("The output path does not exist. Please specify a valid output path and prefix using -o")

        if os.path.basename(args.reports_prefix) == "" or os.path.isdir(args.reports_prefix):
            raise Exception("The output path should specify a file name prefix. Please specify a valid output path "
                            "and prefix using -o. For example, -o /tmp/test will create files named /tmp/test* .")

        # noinspection PyProtectedMember
        if not args._vcfs or len(args._vcfs) != 2:
            raise Exception("Please specify exactly two input VCFs.")

        # noinspection PyProtectedMember
        args.vcf1 = args._vcfs[0]
        # noinspection PyProtectedMember
        args.vcf2 = args._vcfs[1]

        if not os.path.exists(args.vcf1):
            raise Exception("Input file %s does not exist." % args.vcf1)
        if not os.path.exists(args.vcf2):
            raise Exception("Input file %s does not exist." % args.vcf2)

        logging.info("Comparing %s and %s" % (args.vcf1, args.vcf2))

        # detect numeric chromosome names
        if args.numeric_chrs is None:
            cts = fastaContigLengths(args.ref)
            cts = set(cts.keys())
            numeric_names = set(map(str, range(1, 23)) + ["X", "Y", "M"])
            non_numeric_names = set(["chr" + x for x in numeric_names])
            numeric_names &= cts
            non_numeric_names &= cts
            numeric_names = len(list(numeric_names))
            non_numeric_names = len(list(non_numeric_names))
            if numeric_names != 0 and non_numeric_names == 0:
                args.numeric_chrs = True
                logging.info("Auto-detected numeric chromosome names")
            elif numeric_names == 0 and non_numeric_names != 0:
                args.numeric_chrs = False
                logging.info("Auto-detected chr-prefixed chromosome names")

        if args.numeric_chrs:
            args.fixchr_truth = False
            args.fixchr_query = False
        elif args.numeric_chrs is not None:
            args.fixchr_truth = True
            args.fixchr_query = True

        h1 = vcfextract.extractHeadersJSON(args.vcf1)
        if args.auto_index and not h1["tabix"]:
            logging.info("Creating indexed version of %s -- consider creating an index beforehand to save time here." %
                         args.vcf1)
            vtf = tempfile.NamedTemporaryFile(delete=False,
                                              dir=args.scratch_prefix,
                                              prefix="truth.ix",
                                              suffix=".vcf.gz")
            vtf.close()
            tempfiles.append(vtf.name)
            tempfiles.append(vtf.name + ".tbi")
            args.vcf1 = Tools.bcftools.makeIndex(args.vcf1, vtf.name)
            h1 = vcfextract.extractHeadersJSON(args.vcf1)

        h2 = vcfextract.extractHeadersJSON(args.vcf2)
        if args.auto_index and not h2["tabix"]:
            logging.info("Creating indexed version of %s -- consider creating an index beforehand to save time here." %
                         args.vcf2)
            vtf = tempfile.NamedTemporaryFile(delete=False,
                                              dir=args.scratch_prefix,
                                              prefix="query.ix",
                                              suffix=".vcf.gz")
            vtf.close()
            tempfiles.append(vtf.name)
            tempfiles.append(vtf.name + ".tbi")
            args.vcf2 = Tools.bcftools.makeIndex(args.vcf2, vtf.name)
            h2 = vcfextract.extractHeadersJSON(args.vcf2)

        ref_check = True
        try:
            happy_ref = args.ref
            v1r = [_h for _h in h1["fields"] if _h["key"] == "reference"]
            v2r = [_h for _h in h2["fields"] if _h["key"] == "reference"]
            if args.verbose:
                logging.info("References used: hap.py: %s / truth: %s / "
                             "query: %s" % (str(happy_ref), str(v1r), str(v2r)))

            v1_ref = ";".join([str(xxy["value"]) for xxy in v1r]).replace("file://", "")
            v2_ref = ";".join([str(xxy["value"]) for xxy in v2r]).replace("file://", "")

            if happy_ref == v1_ref and v1_ref == v2_ref:
                ref_check = True

            rids_vh = set()
            rids_v1 = set()
            rids_v2 = set()
            for refid in ["hg19", "hg38", "grc37", "grc38"]:
                if refid in happy_ref.lower():
                    rids_vh.add(refid)
                if refid in v1_ref.lower():
                    rids_v1.add(refid)
                if refid in v2_ref.lower():
                    rids_v2.add(refid)

            rids_v1 = sorted(list(rids_v1))
            rids_v2 = sorted(list(rids_v2))
            rids_vh = sorted(list(rids_vh))

            to_cmp = None
            if rids_v1:
                to_cmp = rids_v1
            if rids_v2:
                to_cmp = rids_v2
            if rids_vh:
                to_cmp = rids_vh
            if to_cmp and rids_v1 and rids_v1 != to_cmp:
                ref_check = False
            if to_cmp and rids_v2 and rids_v2 != to_cmp:
                ref_check = False
            if to_cmp and rids_vh and rids_vh != to_cmp:
                ref_check = False

        except:
            pass

        if not ref_check:
            logging.warn("Reference sequence check failed! "
                         "Please ensure that truth and query VCF use the same reference sequence as "
                         "hap.py. XCMP may fail if this is not the case, and the results will not be "
                         " accurate.")

        if args.locations is None or len(args.locations) == 0:
            # default: autosomes 1-22
            if args.numeric_chrs:
                args.locations = [x for x in map(str, range(1, 23))]
            else:
                args.locations = ["chr" + x for x in map(str, range(1, 23))]

        if type(args.locations) is not list and args.locations is not None:
            # noinspection PyUnresolvedReferences
            args.locations = args.locations.split(",")

        # HAP-143 fix the case where no chromosomes are in truth or query
        try:
            if not h1["tabix"]["chromosomes"]:
                h1["tabix"]["chromosomes"] = []
        except:
            pass
        try:
            if not h2["tabix"]["chromosomes"]:
                h2["tabix"]["chromosomes"] = []
        except:
            pass

        if not h1["tabix"]:
            args.preprocessing_truth = True
            logging.warn("Truth file is not Tabix indexed. Switching on pre-processing + chr name conversion.")
            if args.fixchr_truth is None:
                args.fixchr_truth = True
        elif args.fixchr_truth is None:
            logging.info(str(h1["tabix"]))
            # autodetect chr naming
            count_with_fix = len([__ for __ in h1["tabix"]["chromosomes"]
                                  if ("chr%s" % str(__)) in args.locations])
            count_no_fix = len([__ for __ in h1["tabix"]["chromosomes"] if str(__) in args.locations])
            logging.info("Truth: Number of chromosome names matching with / without renaming : %i / %i " % (
                count_with_fix, count_no_fix))
            if count_with_fix > count_no_fix:
                args.fixchr_truth = True
                logging.info("Will fix chromosome names (truth).")
            else:
                logging.info("Will not fix chromosome names (truth).")
                args.fixchr_truth = False

        if not h2["tabix"]:
            args.preprocessing = True
            logging.warn("Query file is not Tabix indexed. Switching on pre-processing + chr name conversion.")
            # don't overwrite setting, but if it's None, replace with True to be sure
            if args.fixchr_query is None:
                args.fixchr_query = True
        elif args.fixchr_query is None:
            # autodetect chr naming
            count_with_fix = len([__ for __ in h2["tabix"]["chromosomes"]
                                  if ("chr%s" % str(__)) in args.locations])
            count_no_fix = len([__ for __ in h2["tabix"]["chromosomes"] if str(__) in args.locations])
            logging.info("Query: Number of chromosome names matching with / without renaming : %i / %i " % (
                count_with_fix, count_no_fix))
            if count_with_fix > count_no_fix:
                args.fixchr_query = True
                logging.info("Will fix chromosome names (query).")
            else:
                logging.info("Will not fix chromosome names (query).")
                args.fixchr_query = False

        if args.fixchr_truth or args.preprocessing_norm:
            args.preprocessing_truth = True

        if args.fixchr_query or args.preprocessing_norm:
            args.preprocessing = True

        if args.preprocessing_truth:
            vtf = tempfile.NamedTemporaryFile(delete=False,
                                              dir=args.scratch_prefix,
                                              prefix="truth.pp",
                                              suffix=".vcf.gz")
            vtf.close()
            tempfiles.append(vtf.name)
            preprocessVCF(args.vcf1, vtf.name, ",".join(args.locations),
                          not args.usefiltered_truth,  # pass_only
                          args.fixchr_truth,  # chrprefix
                          args.preprocessing_norm,  # norm,
                          args.regions_bedfile,
                          args.targets_bedfile,
                          args.ref)
            args.vcf1 = vtf.name
            # get headers again if we preprocessed
            h1 = vcfextract.extractHeadersJSON(args.vcf1)

        if args.preprocessing:
            vtf = tempfile.NamedTemporaryFile(delete=False,
                                              dir=args.scratch_prefix,
                                              prefix="query.pp",
                                              suffix=".vcf.gz")
            vtf.close()
            tempfiles.append(vtf.name)
            preprocessVCF(args.vcf2, vtf.name, ",".join(args.locations),
                          False,  # query filters are handled further down in matching
                          args.fixchr_query,  # chrprefix
                          args.preprocessing_norm,  # norm,
                          args.regions_bedfile,
                          args.targets_bedfile,
                          args.ref)
            args.vcf2 = vtf.name
            # get headers again if we preprocessed
            h2 = vcfextract.extractHeadersJSON(args.vcf2)

        if not h1["tabix"]:
            raise Exception("Truth file is not Tabix indexed.")

        if not h2["tabix"]:
            raise Exception("Query file is not Tabix indexed.")

        newlocations = []

        if not h1["tabix"]["chromosomes"]:
            h1["tabix"]["chromosomes"] = []
        if not h2["tabix"]["chromosomes"]:
            h2["tabix"]["chromosomes"] = []

        for _xc in args.locations:
            xc = _xc.split(":")[0]
            if xc not in h1["tabix"]["chromosomes"]:
                logging.warn("No calls for location %s in truth!" % xc)
            if xc not in h2["tabix"]["chromosomes"]:
                logging.warn("No calls for location %s in query!" % xc)

            if (xc not in h1["tabix"]["chromosomes"]) and (xc not in h2["tabix"]["chromosomes"]):
                logging.warn("Removing location %s because neither input file has calls there." % xc)
            else:
                newlocations.append(_xc)

        if not newlocations:
            raise Exception("Location list is empty: the input files do not appear to have variants on any of %s" %
                            str(args.locations))

        args.locations = newlocations

        if args.threads > 1 and args.engine == "xcmp":
            logging.info("Running using %i parallel processes." % args.threads)
            pool = multiprocessing.Pool(int(args.threads))

            # find balanced pieces
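            # (threads + n - 1) / n is integer ceil(threads / n) with n = len(args.locations)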
            args.pieces = (args.threads + len(args.locations) - 1) / len(args.locations)
            res = runParallel(pool, Haplo.blocksplit.blocksplitWrapper, args.locations, args)

            if None in res:
                raise Exception("One of the blocksplit processes failed.")

            tempfiles += res

            args.locations = []
            for f in res:
                with open(f) as fp:
                    for l in fp:
                        ll = l.strip().split("\t", 3)
                        if len(ll) < 3:
                            continue
                        xchr = ll[0]
                        start = int(ll[1]) + 1
                        end = int(ll[2])
                        args.locations.append("%s:%i-%i" % (xchr, start, end))
        else:
            pool = None

        # count variants before normalisation
        if "samples" not in h1 or not h1["samples"]:
            raise Exception("Cannot read sample names from truth VCF file")

        if "samples" not in h2 or not h2["samples"]:
            raise Exception("Cannot read sample names from query VCF file")

        tf = tempfile.NamedTemporaryFile(delete=False,
                                         dir=args.scratch_prefix,
                                         prefix="hap.py.result.", suffix=".vcf.gz")
        tf.close()
        tempfiles.append(tf.name)
        output_name = tf.name

        if args.engine == "xcmp":
            # do xcmp
            logging.info("Using xcmp for comparison")
            res = runParallel(pool, Haplo.xcmp.xcmpWrapper, args.locations, args)
            tempfiles += [x[0] for x in res if x is not None]  # VCFs
            tempfiles += [x[1] for x in res if x is not None and x[1] is not None]  # beds (if any)

            if None in res:
                raise Exception("One of the xcmp jobs failed.")

            if len(res) == 0:
                raise Exception("Input files/regions do not contain variants (0 haplotype blocks were processed).")

            # concatenate + index
            logging.info("Concatenating variants...")
            runme_list = [x[0] for x in res if x is not None]
            if len(runme_list) == 0:
                raise Exception("No outputs to concatenate!")

            fo = Tools.BGZipFile(output_name, True)
            for i, x in enumerate(runme_list):
                f = gzip.GzipFile(x)
                for l in f:
                    if i == 0 or not l[0] == "#":
                        fo.write(l)
            fo.close()

            logging.info("Indexing...")
            to_run = "tabix -p vcf %s" % output_name.replace(" ", "\\ ")
            logging.info("Running '%s'" % to_run)
            subprocess.check_call(to_run, shell=True)
            # passed to quantify
            args.type = "xcmp"
        elif args.engine == "vcfeval":
            tempfiles += Haplo.vcfeval.runVCFEval(args.vcf1, args.vcf2, output_name, args)
            # passed to quantify
            args.type = "ga4gh"
        else:
            raise Exception("Unknown comparison engine: %s" % args.engine)

        logging.info("Counting variants...")

        args.in_vcf = [output_name]
        args.runner = "hap.py"
        qfy.quantify(args)

    finally:
        if args.delete_scratch:
            for x in tempfiles:
                try:
                    os.remove(x)
                except:
                    pass
        else:
            logging.info("Scratch files kept : %s" % (str(tempfiles)))
Example #4
def main():
    parser = argparse.ArgumentParser("Haplotype Comparison")

    # input
    parser.add_argument("-v", "--version", dest="version", action="store_true",
                        help="Show version number and exit.")

    parser.add_argument("-r", "--reference", dest="ref", default=None, help="Specify a reference file.")

    # output
    parser.add_argument("-o", "--report-prefix", dest="reports_prefix",
                        default=None,
                        help="Filename prefix for report output.")
    parser.add_argument("--scratch-prefix", dest="scratch_prefix",
                        default=None,
                        help="Directory for scratch files.")
    parser.add_argument("--keep-scratch", dest="delete_scratch",
                        default=True, action="store_false",
                        help="Filename prefix for scratch report output.")


    # add quantification args
    qfy.updateArgs(parser)

    # control preprocessing
    pre.updateArgs(parser)
    parser.add_argument("--preprocess-truth", dest="preprocessing_truth", action="store_true", default=False,
                        help="Preprocess truth file with same settings as query (default is to accept truth in original format).")
    parser.add_argument("--usefiltered-truth", dest="usefiltered_truth", action="store_true", default=False,
                        help="Preprocess truth file with same settings as query (default is to accept truth in original format).")
    parser.add_argument("--preprocessing-window-size", dest="preprocess_window",
                        default=10000, type=int,
                        help="Preprocessing window size (variants further apart than that size are not expected to interfere).")

    # detailed control of comparison
    parser.add_argument("--unhappy", "--no-haplotype-comparison", dest="no_hc", action="store_true", default=False,
                        help="Disable haplotype comparison (only count direct GT matches as TP).")

    parser.add_argument("-w", "--window-size", dest="window",
                        default=50, type=int,
                        help="Minimum distance between variants such that they fall into the same superlocus.")

    # xcmp-specific stuff
    parser.add_argument("--xcmp-enumeration-threshold", dest="max_enum",
                        default=16768, type=int,
                        help="Enumeration threshold / maximum number of sequences to enumerate per block.")

    parser.add_argument("--xcmp-expand-hapblocks", dest="hb_expand",
                        default=30, type=int,
                        help="Expand haplotype blocks by this many basepairs left and right.")
    parser.add_argument("--threads", dest="threads",
                        default=multiprocessing.cpu_count(), type=int,
                        help="Number of threads to use.")

    parser.add_argument("--engine", dest="engine",
                        default="xcmp", choices=["xcmp", "vcfeval"],
                        help="Comparison engine to use.")

    parser.add_argument("--engine-vcfeval-path", dest="engine_vcfeval", required=False,
                        default=Haplo.vcfeval.findVCFEval(),
                        help="This parameter should give the path to the \"rtg\" executable. "
                             "The default is %s" % Haplo.vcfeval.findVCFEval())
    parser.add_argument("--engine-vcfeval-template", dest="engine_vcfeval_template", required=False,
                        help="Vcfeval needs the reference sequence formatted in its own file format "
                             "(SDF -- run rtg format -o ref.SDF ref.fa). You can specify this here "
                             "to save time when running hap.py with vcfeval. If no SDF folder is "
                             "specified, hap.py will create a temporary one.")

    if Tools.has_sge:
        parser.add_argument("--force-interactive", dest="force_interactive",
                            default=False, action="store_true",
                            help="Force running interactively (i.e. when JOB_ID is not in the environment)")

    parser.add_argument("_vcfs", help="Two VCF files.", default=[], nargs="*")

    parser.add_argument("--logfile", dest="logfile", default=None,
                        help="Write logging information into file rather than to stderr")

    verbosity_options = parser.add_mutually_exclusive_group(required=False)

    verbosity_options.add_argument("--verbose", dest="verbose", default=False, action="store_true",
                                   help="Raise logging level from warning to info.")

    verbosity_options.add_argument("--quiet", dest="quiet", default=False, action="store_true",
                                   help="Set logging level to output errors only.")

    args, unknown_args = parser.parse_known_args()

    if not Tools.has_sge:
        args.force_interactive = True

    if args.verbose:
        loglevel = logging.INFO
    elif args.quiet:
        loglevel = logging.ERROR
    else:
        loglevel = logging.WARNING

    # reinitialize logging
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)
    logging.basicConfig(filename=args.logfile,
                        format='%(asctime)s %(levelname)-8s %(message)s',
                        level=loglevel)

    # remove some safe unknown args
    unknown_args = [x for x in unknown_args if x not in ["--force-interactive"]]
    if len(sys.argv) < 2 or len(unknown_args) > 0:
        if unknown_args:
            logging.error("Unknown arguments specified : %s " % str(unknown_args))
        parser.print_help()
        exit(1)

    if args.version:
        print "Hap.py %s" % Tools.version
        exit(0)

    if args.roc:
        args.write_vcf = True

    # sanity-check regions bed file (HAP-57)
    if args.regions_bedfile:
        logging.info("Checking input regions.")
        if bedOverlapCheck(args.regions_bedfile):
            raise Exception("The regions bed file (specified using -R) has overlaps, this will not work with xcmp."
                            " You can either use -T, or run the file through bedtools merge")

    if args.fp_bedfile and not os.path.exists(args.fp_bedfile):
        raise Exception("FP/confident call region bed file does not exist.")

    if not args.force_interactive and "JOB_ID" not in os.environ:
        parser.print_help()
        raise Exception("Please qsub me so I get approximately 1 GB of RAM per thread.")

    if not args.ref:
        args.ref = Tools.defaultReference()

    if not os.path.exists(args.ref):
        raise Exception("Please specify a valid reference path using -r.")

    if not args.reports_prefix:
        raise Exception("Please specify an output prefix using -o ")

    if not os.path.exists(os.path.dirname(os.path.abspath(args.reports_prefix))):
        raise Exception("The output path does not exist. Please specify a valid output path and prefix using -o")

    if os.path.basename(args.reports_prefix) == "" or os.path.isdir(args.reports_prefix):
        raise Exception("The output path should specify a file name prefix. Please specify a valid output path "
                        "and prefix using -o. For example, -o /tmp/test will create files named /tmp/test* .")

    # noinspection PyProtectedMember
    if not args._vcfs or len(args._vcfs) != 2:
        raise Exception("Please specify exactly two input VCFs.")

    # noinspection PyProtectedMember
    args.vcf1 = args._vcfs[0]
    # noinspection PyProtectedMember
    args.vcf2 = args._vcfs[1]

    if not os.path.exists(args.vcf1):
        raise Exception("Input file %s does not exist." % args.vcf1)
    if not os.path.exists(args.vcf2):
        raise Exception("Input file %s does not exist." % args.vcf2)

    tempfiles = []

    # xcmp supports bcf; others don't
    if args.engine == "xcmp" and (args.bcf or (args.vcf1.endswith(".bcf") and args.vcf2.endswith(".bcf"))):
        internal_format_suffix = ".bcf"
    else:
        internal_format_suffix = ".vcf.gz"

    try:
        logging.info("Comparing %s and %s" % (args.vcf1, args.vcf2))

        logging.info("Preprocessing truth: %s" % args.vcf1)
        starttime = time.time()

        ttf = tempfile.NamedTemporaryFile(delete=False,
                                          dir=args.scratch_prefix,
                                          prefix="truth.pp",
                                          suffix=internal_format_suffix)
        ttf.close()
        tempfiles.append(ttf.name)
        tempfiles.append(ttf.name + ".csi")
        tempfiles.append(ttf.name + ".tbi")
        pre.preprocess(args.vcf1,
                       ttf.name,
                       args.ref,
                       args.locations,
                       None if args.usefiltered_truth else "*",  # filters
                       args.fixchr,
                       args.regions_bedfile,
                       args.targets_bedfile,
                       args.preprocessing_leftshift if args.preprocessing_truth else False,
                       args.preprocessing_decompose if args.preprocessing_truth else False,
                       args.preprocessing_norm if args.preprocessing_truth else False,
                       args.preprocess_window,
                       args.threads)

        args.vcf1 = ttf.name
        h1 = vcfextract.extractHeadersJSON(args.vcf1)

        elapsed = time.time() - starttime
        logging.info("preprocess for %s -- time taken %.2f" % (args.vcf1, elapsed))

        # once we have preprocessed the truth file we can resolve the locations
        # doing this here improves the time for query preprocessing below
        reference_contigs = set(fastaContigLengths(args.ref).keys())

        if not args.locations:
            # default set of locations is the overlap between truth and reference
            args.locations = list(reference_contigs & set(h1["tabix"]["chromosomes"]))
            if not args.locations:
                raise Exception("Truth and reference have no chromosomes in common!")
        elif type(args.locations) is not list:
            args.locations = [args.locations]

        args.locations = sorted(args.locations)

        logging.info("Preprocessing query: %s" % args.vcf2)
        starttime = time.time()

        if args.pass_only:
            filtering = "*"
        else:
            filtering = args.filters_only

        qtf = tempfile.NamedTemporaryFile(delete=False,
                                          dir=args.scratch_prefix,
                                          prefix="query.pp",
                                          suffix=internal_format_suffix)
        qtf.close()
        tempfiles.append(qtf.name)
        tempfiles.append(qtf.name + ".csi")
        tempfiles.append(qtf.name + ".tbi")
        pre.preprocess(args.vcf2,
                       qtf.name,
                       args.ref,
                       str(",".join(args.locations)),
                       filtering,
                       args.fixchr,
                       args.regions_bedfile,
                       args.targets_bedfile,
                       args.preprocessing_leftshift,
                       args.preprocessing_decompose,
                       args.preprocessing_norm,
                       args.preprocess_window,
                       args.threads)

        args.vcf2 = qtf.name
        h2 = vcfextract.extractHeadersJSON(args.vcf2)

        elapsed = time.time() - starttime
        logging.info("preprocess for %s -- time taken %.2f" % (args.vcf2, elapsed))

        if not h1["tabix"]:
            raise Exception("Truth file is not indexed after preprocesing.")

        if not h2["tabix"]:
            raise Exception("Query file is not indexed after preprocessing.")

        for _xc in args.locations:
            if _xc not in h2["tabix"]["chromosomes"]:
                logging.warn("No calls for location %s in query!" % _xc)

        pool = getPool(args.threads)
        if args.threads > 1 and args.engine == "xcmp":
            logging.info("Running using %i parallel processes." % args.threads)

            # find balanced pieces
            # cap parallelism at 64 since otherwise bcftools concat below might run out
            # of file handles
            args.pieces = min(args.threads, 64)
            res = runParallel(pool, Haplo.blocksplit.blocksplitWrapper, args.locations, args)

            if None in res:
                raise Exception("One of the blocksplit processes failed.")

            tempfiles += res

            args.locations = []
            for f in res:
                with open(f) as fp:
                    for l in fp:
                        ll = l.strip().split("\t", 3)
                        if len(ll) < 3:
                            continue
                        xchr = ll[0]
                        start = int(ll[1]) + 1
                        end = int(ll[2])
                        args.locations.append("%s:%i-%i" % (xchr, start, end))

        # count variants before normalisation
        if "samples" not in h1 or not h1["samples"]:
            raise Exception("Cannot read sample names from truth VCF file")

        if "samples" not in h2 or not h2["samples"]:
            raise Exception("Cannot read sample names from query VCF file")

        tf = tempfile.NamedTemporaryFile(delete=False,
                                         dir=args.scratch_prefix,
                                         prefix="hap.py.result.",
                                         suffix=internal_format_suffix)
        tf.close()
        tempfiles.append(tf.name)
        tempfiles.append(tf.name + ".tbi")
        tempfiles.append(tf.name + ".csi")
        output_name = tf.name

        if args.engine == "xcmp":
            # do xcmp
            logging.info("Using xcmp for comparison")
            res = runParallel(pool, Haplo.xcmp.xcmpWrapper, args.locations, args)
            tempfiles += [x for x in res if x is not None]  # VCFs

            if None in res:
                raise Exception("One of the xcmp jobs failed.")

            if len(res) == 0:
                raise Exception("Input files/regions do not contain variants (0 haplotype blocks were processed).")

            # concatenate + index
            logging.info("Concatenating variants...")
            runme_list = [x for x in res if x is not None]
            if len(runme_list) == 0:
                raise Exception("No outputs to concatenate!")

            logging.info("Concatenating...")
            bcftools.concatenateParts(output_name, *runme_list)
            logging.info("Indexing...")
            bcftools.runBcftools("index", output_name)
            # passed to quantify
            args.type = "xcmp"
            # xcmp extracts whichever field we're using into the QQ info field
            args.roc = "IQQ"
        elif args.engine == "vcfeval":
            tempfiles += Haplo.vcfeval.runVCFEval(args.vcf1, args.vcf2, output_name, args)
            # passed to quantify
            args.type = "ga4gh"
        else:
            raise Exception("Unknown comparison engine: %s" % args.engine)

        args.in_vcf = [output_name]
        args.runner = "hap.py"
        qfy.quantify(args)

    finally:
        if args.delete_scratch:
            for x in tempfiles:
                try:
                    os.remove(x)
                except:
                    pass
        else:
            logging.info("Scratch files kept : %s" % (str(tempfiles)))
Example #5
def extractVarscan2SNVFeatures(vcfname, tag, avg_depth=None):
    """ Return a data frame with features collected from the given VCF, tagged by given type """
    records = []

    if not avg_depth:
        logging.warn(
            "No average depths available, normalized depth features cannot be calculated"
        )

    hdrs = extractHeadersJSON(vcfname)

    # TODO could figure this out automatically
    nsn = "NORMAL"
    tsn = "TUMOR"
    n_sample = "S.1."
    t_sample = "S.2."

    logging.info(
        "Normal sample name : %s (prefix %s) / tumour sample name : %s (prefix %s)"
        % (nsn, n_sample, tsn, t_sample))

    features = [
        "CHROM",
        "POS",
        "REF",
        "ALT",
        "FILTER",
        "I.SSC",
        "I.GPV",
        "I.SPV",
        n_sample + "GT",
        t_sample + "GT",  # Genotype
        n_sample + "GQ",
        t_sample + "GQ",  # Genotype quality
        n_sample + "DP",
        t_sample + "DP",  # Read depth
        n_sample + "RD",
        t_sample + "RD",  # Reference depth
        n_sample + "AD",
        t_sample + "AD",  # Alternative depth
        n_sample + "FREQ",
        t_sample + "FREQ"  # Alt. frequence (FA in MuTect)
    ]

    has_warned = {}

    for vr in vcfExtract(vcfname, features):
        rec = {}
        for i, ff in enumerate(features):
            rec[ff] = vr[i]

        for q in [n_sample + "GT", t_sample + "GT"]:
            if not q in rec or rec[q] is None:
                rec[q] = "."
                if not ("feat:" + q) in has_warned:
                    logging.warn("Missing feature %s" % q)
                    has_warned["feat:" + q] = True

        # fix missing features
        for q in [
                n_sample + "GT", t_sample + "GT", n_sample + "GQ",
                t_sample + "GQ", n_sample + "DP", t_sample + "DP",
                n_sample + "AD", t_sample + "AD", n_sample + "RD",
                t_sample + "RD", n_sample + "FREQ", t_sample + "FREQ"
        ]:
            if not q in rec or rec[q] is None:
                rec[q] = 0
                if not ("feat:" + q) in has_warned:
                    logging.warn("Missing feature %s" % q)
                    has_warned["feat:" + q] = True
            else:
                if q.endswith("FREQ"):
                    try:
                        rec[q] = float(rec[q])
                    except ValueError:
                        rec[q] = float("NaN")

                else:
                    try:
                        rec[q] = int(rec[q])
                    except ValueError:
                        rec[q] = -1

        rec["tag"] = tag

        n_DP = float(rec[n_sample + "DP"])
        t_DP = float(rec[t_sample + "DP"])

        n_DP_ratio = 0
        t_DP_ratio = 0

        if avg_depth:
            if rec["CHROM"] in avg_depth:
                n_DP_ratio = n_DP / float(avg_depth[rec["CHROM"]])
                t_DP_ratio = t_DP / float(avg_depth[rec["CHROM"]])
            elif not rec["CHROM"] in has_warned:
                logging.warn("Cannot normalize depths on %s" % rec["CHROM"])
                has_warned[rec["CHROM"]] = True
        elif not "DPnorm" in has_warned:
            logging.warn("Cannot normalize depths.")
            has_warned["DPnorm"] = True

        n_allele_ref_count = rec[n_sample + "RD"]
        alleles_alt = rec["ALT"]

        if alleles_alt == ['.']:
            n_allele_alt_count = 0
        else:
            n_allele_alt_count = rec[n_sample + "AD"]

        if n_allele_alt_count + n_allele_ref_count == 0:
            n_allele_rate = 0
        else:
            n_allele_rate = n_allele_alt_count / float(n_allele_alt_count +
                                                       n_allele_ref_count)

        t_allele_ref_count = rec[t_sample + "RD"]
        alleles_alt = rec["ALT"]

        if alleles_alt == ['.']:
            t_allele_alt_count = 0
        else:
            t_allele_alt_count = rec[t_sample + "AD"]

        if t_allele_alt_count + t_allele_ref_count == 0:
            t_allele_rate = 0
        else:
            t_allele_rate = t_allele_alt_count / float(t_allele_alt_count +
                                                       t_allele_ref_count)

        # Gather the computed data into a dict
        qrec = {
            "CHROM": rec["CHROM"],
            "POS": int(rec["POS"]),
            "REF": rec["REF"],
            "ALT": ",".join(rec["ALT"]),
            "FILTER": ",".join(rec["FILTER"]),
            "SSC": rec["I.SSC"],
            "GPV": rec["I.GPV"],
            "SPV": rec["I.SPV"],
            "N_DP": n_DP,
            "T_DP": t_DP,
            "N_DP_RATE": n_DP_ratio,
            "T_DP_RATE": t_DP_ratio,
            "N_GT": rec[n_sample + "GT"],
            "T_GT": rec[t_sample + "GT"],
            "N_GQ": rec[n_sample + "GQ"],
            "T_GQ": rec[t_sample + "GQ"],
            "N_AD": rec[n_sample + "AD"],
            "T_AD": rec[t_sample + "AD"],
            "N_FA": rec[n_sample + "FREQ"],
            "T_FA": rec[t_sample + "FREQ"],
            "N_ALT_RATE": n_allele_rate,
            "T_ALT_RATE": t_allele_rate,
            "tag": tag
        }

        records.append(qrec)

    cols = [
        "CHROM", "POS", "REF", "ALT", "FILTER", "SSC", "GPV", "SPV", "N_DP",
        "T_DP", "N_DP_RATE", "T_DP_RATE", "N_GT", "T_GT", "N_GQ", "T_GQ",
        "N_AD", "T_AD", "N_FA", "T_FA", "N_ALT_RATE", "T_ALT_RATE", "tag"
    ]

    if records:
        df = pandas.DataFrame(records, columns=cols)
    else:
        df = pandas.DataFrame(columns=cols)

    return df
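
For context, a plausible call site for extractVarscan2SNVFeatures: extract features from two labelled VCFs and stack the resulting frames (the file names and depth values here are made up):

import pandas

avg_depth = {"chr1": 35.0, "chr2": 34.2}  # hypothetical per-chromosome depths
tps = extractVarscan2SNVFeatures("varscan.tp.vcf.gz", "TP", avg_depth)
fps = extractVarscan2SNVFeatures("varscan.fp.vcf.gz", "FP", avg_depth)

# both frames share the same column layout, so they stack directly;
# the "tag" column keeps the TP / FP label for training or plotting
featuretable = pandas.concat([tps, fps])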
Example #6
def partialCredit(vcfname, outputname, reference, locations, threads=1, window=10000, leftshift=True, decompose=True):
    """ Partial-credit-process a VCF file according to our args """

    pool = getPool(int(threads))
    if threads > 1:
        logging.info("Partial credit processing uses %i parallel processes." % threads)

        if not locations:
            h = extractHeadersJSON(vcfname)
            if not h["tabix"]["chromosomes"]:
                logging.warn("Empty input or not tabix indexed")
                if outputname.endswith(".bcf"):
                    runBcftools("view", "-O", "b", "-o", outputname, vcfname)
                    runBcftools("index", outputname)
                else:
                    runBcftools("view", "-O", "z", "-o", outputname, vcfname)
                    runBcftools("index", "-t", outputname)
                # just return the same file
                return
            locations = h["tabix"]["chromosomes"]
        elif type(locations) is str or type(locations) is unicode:
            locations = locations.split(",")

        # use blocksplit to subdivide input
        res = runParallel(
            pool, blocksplitWrapper, locations, {"vcf": vcfname, "dist": window, "pieces": min(40, threads * 4)}
        )

        if None in res:
            raise Exception("One of the blocksplit processes failed.")

        locations = list(itertools.chain.from_iterable(res))
        if not len(locations):
            logging.warn("Blocksplit returned no blocks. This can happen when " "an input contains no valid variants.")
            locations = [""]
    else:
        locations = [""]

    res = []
    try:
        res = runParallel(
            pool,
            preprocessWrapper,
            itertools.izip(itertools.repeat(vcfname), locations),
            {
                "reference": reference,
                "decompose": decompose,
                "leftshift": leftshift,
                "bcf": outputname.endswith(".bcf"),
            },
        )

        if None in res:
            raise Exception("One of the preprocess jobs failed")
        if not res:
            raise Exception("No blocks were processed. List of locations: %s" % str(list(locations)))

        concatenateParts(outputname, *res)
        if outputname.endswith(".vcf.gz"):
            runBcftools("index", "-t", outputname)
        else:  # use bcf
            runBcftools("index", outputname)
    finally:
        for r in res:
            try:
                os.unlink(r)
            except:
                pass
            try:
                os.unlink(r + ".tbi")
            except:
                pass
            try:
                os.unlink(r + ".csi")
            except:
                pass
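
A minimal invocation sketch for partialCredit; the paths are hypothetical and the reference is assumed to be faidx-indexed. Note that the output suffix selects the index type (.tbi for .vcf.gz, .csi for .bcf), and that with threads > 1 the input must be tabix-indexed when no locations are given:

partialCredit("query.vcf.gz",     # input VCF
              "query.pp.vcf.gz",  # output; a .bcf suffix would use a CSI index
              "hg19.fa",          # reference FASTA
              None,               # locations: None -> read from the tabix header
              threads=4,
              window=10000,
              leftshift=True,
              decompose=True)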
Example #7
def extractMutectSNVFeatures(vcfname, tag, avg_depth=None):
        """ Return a data frame with features collected from the given VCF, tagged by given type """
        records = []

        if not avg_depth:
            logging.warn("No average depths available, normalized depth features cannot be calculated")

        hdrs = extractHeadersJSON(vcfname)

        tsn = ""
        nsn = ""

        t_sample = "S.1."
        n_sample = "S.2."

        try:
            samples = hdrs["samples"]
            for f in hdrs["fields"]:
                if f["key"] == "GATKCommandLine" and f["values"]["ID"].lower() == "mutect":
                    clopts = f["values"]["CommandLineOptions"]
                    # ... tumor_sample_name=HCC2218_tumour ... normal_sample_name=HCC2218_normal
                    m = re.search("tumor_sample_name=([^\s]+)", clopts)
                    if m:
                        tsn = m.group(1)
                        for i, x in enumerate(samples):
                            if x == tsn:
                                t_sample = "S.%i." % (i+1)
                                break
                    m = re.search("normal_sample_name=([^\s]+)", clopts)
                    if m:
                        nsn = m.group(1)
                        for i, x in enumerate(samples):
                            if x == nsn:
                                n_sample = "S.%i." % (i+1)
                                break

        except:
            logging.warn("Unable to detect tumour / normal sample order from VCF header")

        logging.info("Normal sample name : %s (prefix %s) / tumour sample name : %s (prefix %s)" % (nsn, n_sample,
                                                                                                    tsn, t_sample))

        features = ["CHROM", "POS", "REF", "ALT", "FILTER",
                    "I.DB", "I.TLOD", "I.NLOD", "I.ECNT",
                    "I.HCNT", "I.MAX_ED", "I.MIN_ED",
                    n_sample + "GT", t_sample + "GT",
                    n_sample + "DP", t_sample + "DP",
                    n_sample + "QSS", t_sample + "QSS",
                    n_sample + "AD", t_sample + "AD"]

        has_warned = {}

        for vr in vcfExtract(vcfname, features):
            rec = {}
            for i, ff in enumerate(features):
                rec[ff] = vr[i]

            for q in [n_sample + "GT", t_sample + "GT"]:
                if not q in rec or rec[q] is None:
                    rec[q] = "."
                    if not ("feat:" + q) in has_warned:
                        logging.warn("Missing feature %s" % q)
                        has_warned["feat:" + q] = True

            # fix missing features
            for q in ["I.DB", "I.TLOD", "I.NLOD", "I.ECNT",
                      "I.HCNT", "I.MAX_ED", "I.MIN_ED",
                      n_sample + "GT", t_sample + "GT",
                      n_sample + "DP", t_sample + "DP",
                      n_sample + "QSS", t_sample + "QSS",
                      n_sample + "AD", t_sample + "AD"]:
                if not q in rec or rec[q] is None:
                    rec[q] = 0
                    if not ("feat:" + q) in has_warned:
                        logging.warn("Missing feature %s" % q)
                        has_warned["feat:" + q] = True
                else:
                    # list features
                    if q.endswith("AD") or q.endswith("QSS"):
                        if type(rec[q]) is not list:
                            if not (q + "_PARSE_FAIL") in has_warned:
                                logging.warn("Cannot parse %s: %s" % (q, str(rec[q])))
                                has_warned[q + "_PARSE_FAIL"] = True
                            rec[q] = [0] * (1 + len(rec["ALT"]))

                        # make sure there is one float per allele (REF + each ALT)
                        for xx in range(0, 1 + len(rec["ALT"])):
                            if len(rec[q]) <= xx:
                                rec[q].append(0)
                            else:
                                try:
                                    rec[q][xx] = float(rec[q][xx])
                                except ValueError:
                                    rec[q][xx] = 0
                    else:
                        try:
                            rec[q] = int(rec[q])
                        except ValueError:
                            rec[q] = -1

            rec["tag"] = tag
            TLOD = float(rec["I.TLOD"])
            NLOD = float(rec["I.NLOD"])

            n_DP        = float(rec[n_sample + "DP"])
            t_DP        = float(rec[t_sample + "DP"])

            n_DP_ratio = 0
            t_DP_ratio = 0

            if avg_depth:
                if rec["CHROM"] in avg_depth:
                    n_DP_ratio      = n_DP/float(avg_depth[rec["CHROM"]])
                    t_DP_ratio      = t_DP/float(avg_depth[rec["CHROM"]])
                elif not rec["CHROM"] in has_warned:
                    logging.warn("Cannot normalize depths on %s" % rec["CHROM"])
                    has_warned[rec["CHROM"]] = True
            elif not "DPnorm" in has_warned:
                logging.warn("Cannot normalize depths.")
                has_warned["DPnorm"] = True

            n_allele_ref_count = rec[n_sample + "AD"][0]
            alleles_alt = rec["ALT"]

            if alleles_alt == ['.']:
                n_allele_alt_count = 0
            else:
                n_allele_alt_count = 0
                for a in xrange(0, len(alleles_alt)):
                    n_allele_alt_count += float(rec[n_sample + "AD"][a + 1])

            if n_allele_alt_count + n_allele_ref_count == 0:
                n_allele_rate = 0
            else:
                n_allele_rate = n_allele_alt_count / float(n_allele_alt_count + n_allele_ref_count)

            t_allele_ref_count = rec[t_sample + "AD"][0]
            alleles_alt = rec["ALT"]

            if alleles_alt == ['.']:
                t_allele_alt_count = 0
            else:
                t_allele_alt_count = 0
                for a in xrange(0, len(alleles_alt)):
                    t_allele_alt_count += float(rec[t_sample + "AD"][a + 1])

            if t_allele_alt_count + t_allele_ref_count == 0:
                t_allele_rate = 0
            else:
                t_allele_rate = t_allele_alt_count / float(t_allele_alt_count + t_allele_ref_count)

            # Gather the computed data into a dict
            qrec = {
                "CHROM": rec["CHROM"],
                "POS": int(rec["POS"]),
                "REF": rec["REF"],
                "ALT": ",".join(rec["ALT"]),
                "FILTER": ",".join(rec["FILTER"]),
                "DBSNP": rec["I.DB"],
                "TLOD": TLOD,
                "NLOD": NLOD,
                "N_DP": n_DP,
                "T_DP": t_DP,
                "N_DP_RATE" : n_DP_ratio,
                "T_DP_RATE" : t_DP_ratio,
                "N_GT": rec[n_sample + "GT"],
                "T_GT": rec[t_sample + "GT"],
                "N_AD": rec[n_sample + "AD"],
                "T_AD": rec[t_sample + "AD"],
                "N_QSS": rec[n_sample + "QSS"],
                "T_QSS": rec[t_sample + "QSS"],
                "N_AF": n_allele_rate,
                "T_AF": t_allele_rate,
                "ECNT": rec["I.ECNT"],
                "HCNT": rec["I.HCNT"],
                "MAX_ED": rec["I.MAX_ED"],
                "MIN_ED": rec["I.MIN_ED"],
                "tag" : tag
            }
            records.append(qrec)

        cols = ["CHROM", "POS", "REF", "ALT",
                "FILTER", "TLOD", "NLOD", "DBSNP",
                "N_DP", "T_DP", "N_DP_RATE", "T_DP_RATE", "N_GT", "T_GT",
                "N_AD", "T_AD", "N_QSS", "T_QSS",
                "N_AF", "T_AF",
                "tag"]

        if records:
            df = pandas.DataFrame(records, columns=cols)
        else:
            df = pandas.DataFrame(columns=cols)

        return df
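
The header scan above recovers the tumour/normal column order from MuTect's GATKCommandLine entry. The same matching logic in isolation, with a made-up header payload:

import re

clopts = ("analysis_type=MuTect tumor_sample_name=HCC2218_tumour "
          "normal_sample_name=HCC2218_normal")
samples = ["HCC2218_normal", "HCC2218_tumour"]  # column order from the VCF header

m = re.search(r"tumor_sample_name=([^\s]+)", clopts)
if m:
    # S.<i>. prefixes are 1-based sample column indices
    t_sample = "S.%i." % (samples.index(m.group(1)) + 1)  # -> "S.2."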
Example #8
def main():
    parser = argparse.ArgumentParser("Haplotype Comparison")

    # input
    parser.add_argument("-v", "--version", dest="version", action="store_true",
                        help="Show version number and exit.")

    parser.add_argument("-r", "--reference", dest="ref", default=None, help="Specify a reference file.")

    # output
    parser.add_argument("-o", "--report-prefix", dest="reports_prefix",
                        default=None,
                        help="Filename prefix for report output.")
    parser.add_argument("--scratch-prefix", dest="scratch_prefix",
                        default=None,
                        help="Directory for scratch files.")
    parser.add_argument("--keep-scratch", dest="delete_scratch",
                        default=True, action="store_false",
                        help="Filename prefix for scratch report output.")


    # add quantification args
    qfy.updateArgs(parser)

    # control preprocessing
    pre.updateArgs(parser)
    parser.add_argument("--preprocess-truth", dest="preprocessing_truth", action="store_true", default=False,
                        help="Preprocess truth file with same settings as query (default is to accept truth in original format).")
    parser.add_argument("--usefiltered-truth", dest="usefiltered_truth", action="store_true", default=False,
                        help="Use filtered variant calls in truth file (by default, only PASS calls in the truth file are used)")
    parser.add_argument("--preprocessing-window-size", dest="preprocess_window",
                        default=10000, type=int,
                        help="Preprocessing window size (variants further apart than that size are not expected to interfere).")
    parser.add_argument("--adjust-conf-regions", dest="preprocessing_truth_confregions", action="store_true", default=True,
                        help="Adjust confident regions to include variant locations.")
    parser.add_argument("--no-adjust-conf-regions", dest="preprocessing_truth_confregions", action="store_false",
                        help="Adjust confident regions to include variant locations.")

    # detailed control of comparison
    parser.add_argument("--unhappy", "--no-haplotype-comparison", dest="no_hc", action="store_true", default=False,
                        help="Disable haplotype comparison (only count direct GT matches as TP).")

    parser.add_argument("-w", "--window-size", dest="window",
                        default=50, type=int,
                        help="Minimum distance between variants such that they fall into the same superlocus.")

    # xcmp-specific stuff
    parser.add_argument("--xcmp-enumeration-threshold", dest="max_enum",
                        default=16768, type=int,
                        help="Enumeration threshold / maximum number of sequences to enumerate per block.")

    parser.add_argument("--xcmp-expand-hapblocks", dest="hb_expand",
                        default=30, type=int,
                        help="Expand haplotype blocks by this many basepairs left and right.")
    parser.add_argument("--threads", dest="threads",
                        default=multiprocessing.cpu_count(), type=int,
                        help="Number of threads to use.")

    parser.add_argument("--engine", dest="engine",
                        default="xcmp", choices=["xcmp", "vcfeval", "scmp-somatic"],
                        help="Comparison engine to use.")

    parser.add_argument("--engine-vcfeval-path", dest="engine_vcfeval", required=False,
                        default=Haplo.vcfeval.findVCFEval(),
                        help="This parameter should give the path to the \"rtg\" executable. "
                             "The default is %s" % Haplo.vcfeval.findVCFEval())
    parser.add_argument("--engine-vcfeval-template", dest="engine_vcfeval_template", required=False,
                        help="Vcfeval needs the reference sequence formatted in its own file format "
                             "(SDF -- run rtg format -o ref.SDF ref.fa). You can specify this here "
                             "to save time when running hap.py with vcfeval. If no SDF folder is "
                             "specified, hap.py will create a temporary one.")

    if Tools.has_sge:
        parser.add_argument("--force-interactive", dest="force_interactive",
                            default=False, action="store_true",
                            help="Force running interactively (i.e. when JOB_ID is not in the environment)")

    parser.add_argument("_vcfs", help="Two VCF files.", default=[], nargs="*")

    parser.add_argument("--logfile", dest="logfile", default=None,
                        help="Write logging information into file rather than to stderr")

    verbosity_options = parser.add_mutually_exclusive_group(required=False)

    verbosity_options.add_argument("--verbose", dest="verbose", default=False, action="store_true",
                                   help="Raise logging level from warning to info.")

    verbosity_options.add_argument("--quiet", dest="quiet", default=False, action="store_true",
                                   help="Set logging level to output errors only.")

    args, unknown_args = parser.parse_known_args()

    if not Tools.has_sge:
        args.force_interactive = True

    if args.verbose:
        loglevel = logging.INFO
    elif args.quiet:
        loglevel = logging.ERROR
    else:
        loglevel = logging.WARNING

    # reinitialize logging
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)
    logging.basicConfig(filename=args.logfile,
                        format='%(asctime)s %(levelname)-8s %(message)s',
                        level=loglevel)

    # remove some safe unknown args
    unknown_args = [x for x in unknown_args if x not in ["--force-interactive"]]
    if len(sys.argv) < 2 or len(unknown_args) > 0:
        if unknown_args:
            logging.error("Unknown arguments specified : %s " % str(unknown_args))
        parser.print_help()
        exit(1)

    if args.version:
        print "Hap.py %s" % Tools.version
        exit(0)

    if args.roc:
        args.write_vcf = True

    # sanity-check regions bed file (HAP-57)
    if args.regions_bedfile:
        logging.info("Checking input regions.")
        if bedOverlapCheck(args.regions_bedfile):
            raise Exception("The regions bed file (specified using -R) has overlaps, this will not work with xcmp."
                            " You can either use -T, or run the file through bedtools merge")

    if args.fp_bedfile and not os.path.exists(args.fp_bedfile):
        raise Exception("FP/confident call region bed file does not exist.")

    if not args.force_interactive and "JOB_ID" not in os.environ:
        parser.print_help()
        raise Exception("Please qsub me so I get approximately 1 GB of RAM per thread.")

    if not args.ref:
        args.ref = Tools.defaultReference()

    if not os.path.exists(args.ref):
        raise Exception("Please specify a valid reference path using -r.")

    if not args.reports_prefix:
        raise Exception("Please specify an output prefix using -o ")

    if not os.path.exists(os.path.dirname(os.path.abspath(args.reports_prefix))):
        raise Exception("The output path does not exist. Please specify a valid output path and prefix using -o")

    if os.path.basename(args.reports_prefix) == "" or os.path.isdir(args.reports_prefix):
        raise Exception("The output path should specify a file name prefix. Please specify a valid output path "
                        "and prefix using -o. For example, -o /tmp/test will create files named /tmp/test* .")

    # noinspection PyProtectedMember
    if not args._vcfs or len(args._vcfs) != 2:
        raise Exception("Please specify exactly two input VCFs.")

    # noinspection PyProtectedMember
    args.vcf1 = args._vcfs[0]
    # noinspection PyProtectedMember
    args.vcf2 = args._vcfs[1]

    if not os.path.exists(args.vcf1):
        raise Exception("Input file %s does not exist." % args.vcf1)
    if not os.path.exists(args.vcf2):
        raise Exception("Input file %s does not exist." % args.vcf2)

    tempfiles = []

    # turn on allele conversion
    if args.engine == "scmp-somatic" and args.somatic_allele_conversion == False:
        args.somatic_allele_conversion = True

    # somatic allele conversion should also switch off decomposition
    if args.somatic_allele_conversion is True and "--decompose" not in sys.argv:
        args.preprocessing_decompose = False

    # xcmp/scmp support bcf; others don't
    if args.engine in ["xcmp", "scmp-somatic"] and (args.bcf or (args.vcf1.endswith(".bcf") and args.vcf2.endswith(".bcf"))):
        internal_format_suffix = ".bcf"
    else:
        internal_format_suffix = ".vcf.gz"

    try:
        logging.info("Comparing %s and %s" % (args.vcf1, args.vcf2))

        logging.info("Preprocessing truth: %s" % args.vcf1)
        starttime = time.time()

        ttf = tempfile.NamedTemporaryFile(delete=False,
                                          dir=args.scratch_prefix,
                                          prefix="truth.pp",
                                          suffix=internal_format_suffix)
        ttf.close()

        if args.engine.endswith("somatic") and \
           args.preprocessing_truth and \
           (args.preprocessing_leftshift or args.preprocessing_norm or args.preprocessing_decompose):
            args.preprocessing_truth = False
            logging.info("Turning off pre.py preprocessing for somatic comparisons")

        tempfiles.append(ttf.name)
        tempfiles.append(ttf.name + ".csi")
        tempfiles.append(ttf.name + ".tbi")
        args.gender = pre.preprocess(args.vcf1,
                                     ttf.name,
                                     args.ref,
                                     args.locations,
                                     None if args.usefiltered_truth else "*",  # filters
                                     args.fixchr,
                                     args.regions_bedfile,
                                     args.targets_bedfile,
                                     args.preprocessing_leftshift if args.preprocessing_truth else False,
                                     args.preprocessing_decompose if args.preprocessing_truth else False,
                                     args.preprocessing_norm if args.preprocessing_truth else False,
                                     args.preprocess_window,
                                     args.threads,
                                     args.gender,
                                     args.somatic_allele_conversion)

        args.vcf1 = ttf.name

        if args.fp_bedfile and args.preprocessing_truth_confregions:
            conf_temp = Haplo.gvcf2bed.gvcf2bed(args.vcf1, args.ref, args.fp_bedfile, args.scratch_prefix)
            tempfiles.append(conf_temp)
            args.strat_regions.append("CONF_VARS:" + conf_temp)

        h1 = vcfextract.extractHeadersJSON(args.vcf1)

        elapsed = time.time() - starttime
        logging.info("preprocess for %s -- time taken %.2f" % (args.vcf1, elapsed))

        # once we have preprocessed the truth file we can resolve the locations
        # doing this here improves the time for query preprocessing below
        reference_contigs = set(fastaContigLengths(args.ref).keys())

        if not args.locations:
            # default set of locations is the overlap between truth and reference
            args.locations = list(reference_contigs & set(h1["tabix"]["chromosomes"]))
            if not args.locations:
                raise Exception("Truth and reference have no chromosomes in common!")
        elif type(args.locations) is not list:
            args.locations = args.locations.split(",")

        args.locations = sorted(args.locations)

        logging.info("Preprocessing query: %s" % args.vcf2)
        starttime = time.time()

        if args.pass_only:
            filtering = "*"
        else:
            filtering = args.filters_only
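        # "*" makes pre.preprocess keep only PASS/"." calls in the query;
        # otherwise args.filters_only names the filters to apply (see the
        # filters parameter of preprocess in Example #10 below)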

        qtf = tempfile.NamedTemporaryFile(delete=False,
                                          dir=args.scratch_prefix,
                                          prefix="query.pp",
                                          suffix=internal_format_suffix)
        qtf.close()
        tempfiles.append(qtf.name)
        tempfiles.append(qtf.name + ".csi")
        tempfiles.append(qtf.name + ".tbi")

        if args.engine.endswith("somatic") and \
           (args.preprocessing_leftshift or args.preprocessing_norm or args.preprocessing_decompose):
            args.preprocessing_leftshift = False
            args.preprocessing_norm = False
            args.preprocessing_decompose = False
            logging.info("Turning off pre.py preprocessing (query) for somatic comparisons")

        pre.preprocess(args.vcf2,
                       qtf.name,
                       args.ref,
                       str(",".join(args.locations)),
                       filtering,
                       args.fixchr,
                       args.regions_bedfile,
                       args.targets_bedfile,
                       args.preprocessing_leftshift,
                       args.preprocessing_decompose,
                       args.preprocessing_norm,
                       args.preprocess_window,
                       args.threads,
                       args.gender,  # same gender as truth above
                       args.somatic_allele_conversion)

        args.vcf2 = qtf.name
        h2 = vcfextract.extractHeadersJSON(args.vcf2)

        elapsed = time.time() - starttime
        logging.info("preprocess for %s -- time taken %.2f" % (args.vcf2, elapsed))

        if not h1["tabix"]:
            raise Exception("Truth file is not indexed after preprocesing.")

        if not h2["tabix"]:
            raise Exception("Query file is not indexed after preprocessing.")

        for _xc in args.locations:
            if _xc not in h2["tabix"]["chromosomes"]:
                logging.warn("No calls for location %s in query!" % _xc)

        pool = getPool(args.threads)
        if args.threads > 1 and args.engine == "xcmp":
            logging.info("Running using %i parallel processes." % args.threads)

            # find balanced pieces
            # cap parallelism at 64 since otherwise bcftools concat below might run out
            # of file handles
            args.pieces = min(args.threads, 64)
            res = runParallel(pool, Haplo.blocksplit.blocksplitWrapper, args.locations, args)

            if None in res:
                raise Exception("One of the blocksplit processes failed.")

            tempfiles += res

            args.locations = []
            for f in res:
                with open(f) as fp:
                    for l in fp:
                        ll = l.strip().split("\t", 3)
                        if len(ll) < 3:
                            continue
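                        # blocksplit writes BED lines (0-based, half-open);
                        # region strings are 1-based inclusive, hence start + 1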
                        xchr = ll[0]
                        start = int(ll[1]) + 1
                        end = int(ll[2])
                        args.locations.append("%s:%i-%i" % (xchr, start, end))

        # make sure we can read the sample names from both files
        if "samples" not in h1 or not h1["samples"]:
            raise Exception("Cannot read sample names from truth VCF file")

        if "samples" not in h2 or not h2["samples"]:
            raise Exception("Cannot read sample names from query VCF file")

        tf = tempfile.NamedTemporaryFile(delete=False,
                                         dir=args.scratch_prefix,
                                         prefix="hap.py.result.",
                                         suffix=internal_format_suffix)
        tf.close()
        tempfiles.append(tf.name)
        tempfiles.append(tf.name + ".tbi")
        tempfiles.append(tf.name + ".csi")
        output_name = tf.name

        if args.engine == "xcmp":
            # do xcmp
            logging.info("Using xcmp for comparison")
            res = runParallel(pool, Haplo.xcmp.xcmpWrapper, args.locations, args)
            tempfiles += [x for x in res if x is not None]  # VCFs

            if None in res:
                raise Exception("One of the xcmp jobs failed.")

            if len(res) == 0:
                raise Exception("Input files/regions do not contain variants (0 haplotype blocks were processed).")

            # concatenate + index
            logging.info("Concatenating variants...")
            runme_list = [x for x in res if x is not None]
            if len(runme_list) == 0:
                raise Exception("No outputs to concatenate!")

            logging.info("Concatenating...")
            bcftools.concatenateParts(output_name, *runme_list)
            logging.info("Indexing...")
            bcftools.runBcftools("index", output_name)
            # passed to quantify
            args.type = "xcmp"
            # xcmp extracts whichever field we're using into the QQ info field
            args.roc_header = args.roc
            args.roc = "IQQ"
        elif args.engine == "vcfeval":
            tempfiles += Haplo.vcfeval.runVCFEval(args.vcf1, args.vcf2, output_name, args)
            # passed to quantify
            args.type = "ga4gh"
        elif args.engine == "scmp-somatic":
            tempfiles += Haplo.scmp.runSCmp(args.vcf1, args.vcf2, output_name, args)
            # passed to quantify
            args.type = "ga4gh"
        else:
            raise Exception("Unknown comparison engine: %s" % args.engine)

        args.in_vcf = [output_name]
        args.runner = "hap.py"
        qfy.quantify(args)

    finally:
        if args.delete_scratch:
            for x in tempfiles:
                try:
                    os.remove(x)
                except:
                    pass
        else:
            logging.info("Scratch files kept : %s" % (str(tempfiles)))
Example #9
def extractMutectIndelFeatures(vcfname, tag, avg_depth=None):
        """ Return a data frame with features collected from the given VCF, tagged by given type """
        records = []

        if not avg_depth:
            logging.warn("No average depths available, normalized depth features cannot be calculated")

        hdrs = extractHeadersJSON(vcfname)

        tsn = ""
        nsn = ""

        t_sample = "S.1."
        n_sample = "S.2."

        try:
            samples = hdrs["samples"]
            for f in hdrs["fields"]:
                if f["key"] == "GATKCommandLine" and f["values"]["ID"].lower() == "mutect":
                    clopts = f["values"]["CommandLineOptions"]
                    # ... tumor_sample_name=HCC2218_tumour ... normal_sample_name=HCC2218_normal
                    m = re.search("tumor_sample_name=([^\s]+)", clopts)
                    if m:
                        tsn = m.group(1)
                        for i, x in enumerate(samples):
                            if x == tsn:
                                t_sample = "S.%i." % (i+1)
                                break
                    m = re.search("normal_sample_name=([^\s]+)", clopts)
                    if m:
                        nsn = m.group(1)
                        for i, x in enumerate(samples):
                            if x == nsn:
                                n_sample = "S.%i." % (i+1)
                                break

        except:
            logging.warn("Unable to detect tumour / normal sample order from VCF header")

        logging.info("Normal sample name : %s (prefix %s) / tumour sample name : %s (prefix %s)" % (nsn, n_sample,
                                                                                                    tsn, t_sample))
        has_warned = {}

        ##FORMAT=<ID=MM,Number=2,Type=Float,Description="Average # of mismatches per ref-/consensus indel-supporting read">
        ##FORMAT=<ID=MQS,Number=2,Type=Float,Description="Average mapping qualities of ref-/consensus indel-supporting reads">
        ##FORMAT=<ID=NQSBQ,Number=2,Type=Float,Description="Within NQS window: average quality of bases in ref-/consensus indel-supporting reads">
        ##FORMAT=<ID=NQSMM,Number=2,Type=Float,Description="Within NQS window: fraction of mismatching bases in ref/consensus indel-supporting reads">
        ##FORMAT=<ID=REnd,Number=2,Type=Integer,Description="Median/mad of indel offsets from the ends of the reads">
        ##FORMAT=<ID=RStart,Number=2,Type=Integer,Description="Median/mad of indel offsets from the starts of the reads">
        ##FORMAT=<ID=SC,Number=4,Type=Integer,Description="Strandness: counts of forward-/reverse-aligned reference and indel-supporting reads (FwdRef,RevRef,FwdIndel,RevIndel)">

        features = ["CHROM", "POS", "REF", "ALT", "FILTER",
                    n_sample + "GT", t_sample + "GT",
                    n_sample + "DP", t_sample + "DP",
                    n_sample + "AD", t_sample + "AD",
                    n_sample + "MM", t_sample + "MM",
                    n_sample + "MQS", t_sample + "MQS",
                    n_sample + "NQSBQ", t_sample + "NQSBQ",
                    n_sample + "NQSMM", t_sample + "NQSMM",
                    n_sample + "RStart", t_sample + "RStart",
                    n_sample + "REnd", t_sample + "REnd",
                    n_sample + "SC", t_sample + "SC"]

        for vr in vcfExtract(vcfname, features):
            rec = {}
            for i, ff in enumerate(features):
                rec[ff] = vr[i]

            for q in [n_sample + "GT", t_sample + "GT"]:
                if not q in rec or rec[q] is None:
                    rec[q] = "."
                    if not ("feat:" + q) in has_warned:
                        logging.warn("Missing feature %s" % q)
                        has_warned["feat:" + q] = True

            # fix missing features
            for q in [n_sample + "GT", t_sample + "GT",
                      n_sample + "DP", t_sample + "DP",
                      n_sample + "AD", t_sample + "AD",
                      n_sample + "MM", t_sample + "MM",
                      n_sample + "MQS", t_sample + "MQS",
                      n_sample + "NQSBQ", t_sample + "NQSBQ",
                      n_sample + "NQSMM", t_sample + "NQSMM",
                      n_sample + "RStart", t_sample + "RStart",
                      n_sample + "REnd", t_sample + "REnd",
                      n_sample + "SC", t_sample + "SC"]:
                if not q in rec or rec[q] is None:
                    # use type-appropriate defaults: the per-read metrics are
                    # list-valued, so a scalar 0 would break the AD indexing below
                    if q.endswith("SC"):
                        rec[q] = [-1, -1, -1, -1]
                    elif q.endswith("GT") or q.endswith("DP"):
                        rec[q] = 0
                    else:
                        rec[q] = [0, 0]
                    if not ("feat:" + q) in has_warned:
                        logging.warn("Missing feature %s" % q)
                        has_warned["feat:" + q] = True
                else:
                    if q.endswith("AD") or q.endswith("MM") or q.endswith("MQS") or \
                       q.endswith("NQSBQ") or q.endswith("NQSMM") or \
                       q.endswith("REnd") or q.endswith("RStart"):
                        if type(rec[q]) is not list:
                            if not (q + "_PARSE_FAIL") in has_warned:
                                logging.warn("Cannot parse %s: %s" % (q, str(rec[q])))
                                has_warned[q + "_PARSE_FAIL"] = True
                            # always substitute a two-element placeholder so
                            # downstream indexing cannot fail
                            rec[q] = [-1, -1]
                        else:
                            for xx in range(2):
                                if len(rec[q]) <= xx:
                                    rec[q].append(-1)
                                else:
                                    try:
                                        rec[q][xx] = float(rec[q][xx])
                                    except ValueError:
                                        rec[q][xx] = -1
                    elif q.endswith("SC"):
                        if type(rec[q]) is not list:
                            if not (q + "_PARSE_FAIL") in has_warned:
                                logging.warn("Cannot parse %s: %s" % (q, str(rec[q])))
                                has_warned[q + "_PARSE_FAIL"] = True
                            # placeholder for the four SC counts (FwdRef,RevRef,FwdIndel,RevIndel)
                            rec[q] = [-1, -1, -1, -1]
                        else:
                            for xx in range(4):
                                if len(rec[q]) <= xx:
                                    rec[q].append(-1)
                                else:
                                    try:
                                        rec[q][xx] = float(rec[q][xx])
                                    except ValueError:
                                        rec[q][xx] = -1
                    else:
                        try:
                            rec[q] = int(rec[q])
                        except ValueError:
                            rec[q] = -1

            rec["tag"] = tag

            n_DP        = float(rec[n_sample + "DP"])
            t_DP        = float(rec[t_sample + "DP"])

            n_DP_ratio = 0
            t_DP_ratio = 0

            if avg_depth:
                if rec["CHROM"] in avg_depth:
                    n_DP_ratio      = n_DP/float(avg_depth[rec["CHROM"]])
                    t_DP_ratio      = t_DP/float(avg_depth[rec["CHROM"]])
                elif not rec["CHROM"] in has_warned:
                    logging.warn("Cannot normalize depths on %s" % rec["CHROM"])
                    has_warned[rec["CHROM"]] = True
            elif not "DPnorm" in has_warned:
                logging.warn("Cannot normalize depths.")
                has_warned["DPnorm"] = True

            n_allele_ref_count = rec[n_sample + "AD"][0]
            alleles_alt = rec["ALT"]

            if alleles_alt == ['.']:
                n_allele_alt_count = 0
            else:
                n_allele_alt_count = 0
                for a in xrange(1, len(rec[n_sample + "AD"])):
                    n_allele_alt_count += float(rec[n_sample + "AD"][a])

            if n_allele_alt_count + n_allele_ref_count == 0:
                n_allele_rate = 0
            else:
                n_allele_rate = n_allele_alt_count / float(n_allele_alt_count + n_allele_ref_count)

            t_allele_ref_count = rec[t_sample + "AD"][0]
            alleles_alt = rec["ALT"]

            if alleles_alt == ['.']:
                t_allele_alt_count = 0
            else:
                t_allele_alt_count = 0
                for a in xrange(1, len(rec[t_sample + "AD"])):
                    t_allele_alt_count += float(rec[t_sample + "AD"][a])

            if t_allele_alt_count + t_allele_ref_count == 0:
                t_allele_rate = 0
            else:
                t_allele_rate = t_allele_alt_count / float(t_allele_alt_count + t_allele_ref_count)
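            # worked example: AD = [12, 5] gives ref_count 12, alt_count 5 and
            # an allele rate of 5 / 17 ~= 0.294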

            # Gather the computed data into a dict
            qrec = {
                "CHROM": rec["CHROM"],
                "POS": int(rec["POS"]),
                "REF": rec["REF"],
                "ALT": ",".join(rec["ALT"]),
                "FILTER": ",".join(rec["FILTER"]),
                "N_DP": n_DP,
                "T_DP": t_DP,
                "N_DP_RATE" : n_DP_ratio,
                "T_DP_RATE" : t_DP_ratio,
                "N_GT": rec[n_sample + "GT"],
                "T_GT": rec[t_sample + "GT"],
                "N_AD": rec[n_sample + "AD"],
                "T_AD": rec[t_sample + "AD"],
                "N_ALT_RATE": n_allele_rate,
                "T_ALT_RATE": t_allele_rate,
                "N_MM": n_sample + "MM",
                "T_MM": t_sample + "MM",
                "N_MQS": n_sample + "MQS",
                "T_MQS": t_sample + "MQS",
                "N_NQSBQ": n_sample + "NQSBQ",
                "T_NQSBQ": t_sample + "NQSBQ",
                "N_NQSMM": n_sample + "NQSMM",
                "T_NQSMM": t_sample + "NQSMM",
                "N_RStart": n_sample + "RStart",
                "T_RStart": t_sample + "RStart",
                "N_REnd": n_sample + "REnd",
                "T_REnd": t_sample + "REnd",
                "N_SC": n_sample + "SC",
                "T_SC": t_sample + "SC",
                "tag" : tag
            }
            records.append(qrec)

        cols = [
            "CHROM",
            "POS",
            "REF",
            "ALT",
            "FILTER",
            "DBSNP",
            "N_DP",
            "T_DP",
            "N_DP_RATE",
            "T_DP_RATE",
            "N_GT",
            "T_GT",
            "N_AD",
            "T_AD",
            "N_ALT_RATE",
            "T_ALT_RATE",
            "N_MM",
            "T_MM",
            "N_MQS",
            "T_MQS",
            "N_NQSBQ",
            "T_NQSBQ",
            "N_NQSMM",
            "T_NQSMM",
            "N_RStart",
            "T_RStart",
            "N_REnd",
            "T_REnd",
            "N_SC",
            "T_SC",
            "tag"]

        if records:
            df = pandas.DataFrame(records, columns=cols)
        else:
            df = pandas.DataFrame(columns=cols)

        return df
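
A minimal usage sketch for the extractor above; the VCF path and the
average-depth dictionary are hypothetical placeholders, not part of the
example:

def demo_extract_mutect_indels():
    # per-chromosome mean coverage, e.g. from a coverage summary (assumed values)
    avg_depth = {"chr1": 60.0, "chr2": 58.5}
    # tag the rows so they can be told apart after concatenating data frames
    df = extractMutectIndelFeatures("mutect.indels.vcf.gz", "TP", avg_depth)
    print df[["CHROM", "POS", "T_ALT_RATE", "N_ALT_RATE"]].head()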
Example #10
def preprocess(vcf_input,
               vcf_output,
               reference,
               locations=None,
               filters=None,
               fixchr=None,
               regions=None,
               targets=None,
               leftshift=True,
               decompose=True,
               bcftools_norm=False,
               windowsize=10000,
               threads=1,
               gender=None,
               somatic_allele_conversion=False,
               sample="SAMPLE",
               filter_nonref=True,
               convert_gvcf_to_vcf=False):
    """ Preprocess a single VCF file

    :param vcf_input: input file name
    :param vcf_output: output file name
    :param reference: reference fasta name
    :param locations: list of locations or None
    :param filters: list of filters to apply ("*" to only allow PASS)
    :param fixchr: None for auto, or True/False -- fix chr prefix to match reference
    :param regions: regions bed file
    :param targets: targets bed file
    :param leftshift: left-shift variants
    :param decompose: decompose variants
    :param bcftools_norm: use bcftools_norm
    :param windowsize: normalisation window size
    :param threads: number of threads to use for preprocessing
    :param gender: the sex of the sample ("male" / "female" / "auto" / None)
    :param somatic_allele_conversion: convert somatic alleles -- False / half / het / hemi / hom
    :param sample: when using somatic_allele_conversion, name of the output sample
    :param filter_nonref: remove any variants genotyped as <NON_REF>

    :return: the sex if auto-determined (otherwise the same value as the gender parameter)
    """

    tempfiles = []
    try:
        # If the input is in BCF format, we can continue to
        # process it in bcf
        # if it is in .vcf.gz, don't try to convert it to
        # bcf because there are a range of things that can
        # go wrong there (e.g. undefined contigs and bcftools
        # segfaults)
        if vcf_input.endswith(".bcf") or vcf_output.endswith(".bcf"):
            int_suffix = ".bcf"
            int_format = "b"
            if not vcf_input.endswith(".bcf") and vcf_output.endswith(".bcf"):
                logging.warn("Turning vcf into bcf can cause problems when headers are not consistent with all "
                             "records in the file. I will run vcfcheck to see if we will run into trouble. "
                             "To save time in the future, consider converting your files into bcf using bcftools before"
                             " running pre.py.")
        else:
            int_suffix = ".vcf.gz"
            int_format = "z"

        # HAP-317 always check for BCF errors since preprocessing tools now require valid headers
        mf = subprocess.check_output("vcfcheck %s --check-bcf-errors 1" % pipes.quote(vcf_input), shell=True)

        if gender == "auto":
            logging.info(mf)
            if "female" in mf:
                gender = "female"
            else:
                gender = "male"

        h = vcfextract.extractHeadersJSON(vcf_input)
        reference_contigs = set(fastaContigLengths(reference).keys())
        reference_has_chr_prefix = hasChrPrefix(reference_contigs)

        allfilters = []
        for f in h["fields"]:
            try:
                if f["key"] == "FILTER":
                    allfilters.append(f["values"]["ID"])
            except:
                logging.warn("ignoring header: %s" % str(f))

        required_filters = None
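        # invert the exclusion list: required_filters is the keep-list, i.e.
        # PASS, "." and every header FILTER not named in "filters"; e.g.
        # filters="LowQual" with header filters LowQual,q10 keeps "PASS,.,q10"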
        if filters:
            fts = filters.split(",")
            required_filters = ",".join(list(set(["PASS", "."] + [x for x in allfilters if x not in fts])))

        if fixchr is None:
            try:
                if not h["tabix"]:
                    logging.warn("input file is not tabix indexed, consider doing this in advance for performance reasons")
                    vtf = tempfile.NamedTemporaryFile(delete=False,
                                                      suffix=int_suffix)
                    vtf.close()
                    tempfiles.append(vtf.name)
                    runBcftools("view", "-o", vtf.name, "-O", int_format, vcf_input)
                    runBcftools("index", vtf.name)
                    h2 = vcfextract.extractHeadersJSON(vtf.name)
                    chrlist = h2["tabix"]["chromosomes"]
                else:
                    chrlist = h["tabix"]["chromosomes"]
                vcf_has_chr_prefix = hasChrPrefix(chrlist)

                if reference_has_chr_prefix and not vcf_has_chr_prefix:
                    fixchr = True
            except:
                logging.warn("Guessing the chr prefix in %s has failed." % vcf_input)

        if leftshift or decompose: # all these require preprocessing
            vtf = tempfile.NamedTemporaryFile(delete=False,
                                              suffix=int_suffix)
            vtf.close()
            tempfiles.append(vtf.name)
            vtf = vtf.name
        else:
            vtf = vcf_output

        preprocessVCF(vcf_input,
                      vtf,
                      locations,
                      filters == "*",
                      fixchr,
                      bcftools_norm,
                      regions,
                      targets,
                      reference,
                      required_filters,
                      somatic_allele_conversion=somatic_allele_conversion,
                      sample=sample,
                      filter_nonref=filter_nonref,
                      convert_gvcf=convert_gvcf_to_vcf,
                      num_threads=threads)
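        # partial-credit realignment also runs for male samples so that
        # chrX/chrY calls can be converted to haploid (haploid_x below)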

        if leftshift or decompose or gender == "male":
            Haplo.partialcredit.partialCredit(vtf,
                                              vcf_output,
                                              reference,
                                              locations,
                                              threads=threads,
                                              window=windowsize,
                                              leftshift=leftshift,
                                              decompose=decompose,
                                              haploid_x=gender == "male")
    finally:
        for t in tempfiles:
            try:
                os.unlink(t)
            except:
                pass

    return gender
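
A minimal usage sketch under assumed file names ("input.vcf.gz",
"normalized.vcf.gz" and "hg19.fa" are placeholders); locations may be
given as a comma-separated string, as the hap.py driver above does:

def demo_preprocess():
    # left-shift and decompose against the reference, inferring the sample sex
    sex = preprocess("input.vcf.gz", "normalized.vcf.gz", "hg19.fa",
                     locations="chr21", filters="*",
                     leftshift=True, decompose=True,
                     threads=4, gender="auto")
    print "inferred sample sex: %s" % sex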
Example #11
def preprocess(vcf_input,
               vcf_output,
               reference,
               locations=None,
               filters=None,
               fixchr=None,
               regions=None,
               targets=None,
               leftshift=True,
               decompose=True,
               bcftools_norm=False,
               windowsize=10000,
               threads=1,
               ):
    """ Preprocess a single VCF file

    :param vcf_input: input file name
    :param vcf_output: output file name
    :param reference: reference fasta name
    :param locations: list of locations or None
    :param filters: list of filters to apply ("*" to only allow PASS)
    :param fixchr: None for auto, or True/False -- fix chr prefix to match reference
    :param regions: regions bed file
    :param targets: targets bed file
    :param leftshift: left-shift variants
    :param decompose: decompose variants
    :param bcftools_norm: use bcftools_norm
    :param windowsize: normalisation window size
    :param threads: number of threads to use for preprocessing
    """

    tempfiles = []
    try:
        # If the input is in BCF format, we can continue to
        # process it in bcf
        # if it is in .vcf.gz, don't try to convert it to
        # bcf because there are a range of things that can
        # go wrong there (e.g. undefined contigs and bcftools
        # segfaults)
        if vcf_input.endswith(".bcf") or vcf_output.endswith(".bcf"):
            int_suffix = ".bcf"
            int_format = "b"
            if not vcf_input.endswith(".bcf") and vcf_output.endswith(".bcf"):
                logging.warn("Turning vcf into bcf can cause problems when headers aren't consistent with all "
                             "records in the file. I will run vcfcheck to see if we will run into trouble. "
                             "To save time in the future, consider converting your files into bcf using bcftools before"
                             " running pre.py.")
                subprocess.check_call("vcfcheck %s" % pipes.quote(vcf_input), shell=True)
        else:
            int_suffix = ".vcf.gz"
            int_format = "z"

        h = vcfextract.extractHeadersJSON(vcf_input)
        reference_contigs = set(fastaContigLengths(reference).keys())
        reference_has_chr_prefix = hasChrPrefix(reference_contigs)

        allfilters = []
        for f in h["fields"]:
            try:
                if f["key"] == "FILTER":
                    allfilters.append(f["values"]["ID"])
            except:
                logging.warn("ignoring header: %s" % str(f))

        required_filters = None
        if filters:
            fts = filters.split(",")
            required_filters = ",".join(list(set(["PASS", "."] + [x for x in allfilters if x not in fts])))

        if fixchr is None:
            try:
                if not h["tabix"]:
                    logging.warn("input file is not tabix indexed, consider doing this in advance for performance reasons")
                    vtf = tempfile.NamedTemporaryFile(delete=False,
                                                      suffix=int_suffix)
                    vtf.close()
                    tempfiles.append(vtf.name)
                    runBcftools("view", "-o", vtf.name, "-O", int_format, vcf_input)
                    runBcftools("index", vtf.name)
                    h2 = vcfextract.extractHeadersJSON(vtf.name)
                    chrlist = h2["tabix"]["chromosomes"]
                else:
                    chrlist = h["tabix"]["chromosomes"]
                vcf_has_chr_prefix = hasChrPrefix(chrlist)

                if reference_has_chr_prefix and not vcf_has_chr_prefix:
                    fixchr = True
            except:
                logging.warn("Guessing the chr prefix in %s has failed." % vcf_input)

        if leftshift or decompose: # all these require preprocessing
            vtf = tempfile.NamedTemporaryFile(delete=False,
                                              suffix=int_suffix)
            vtf.close()
            tempfiles.append(vtf.name)
            vtf = vtf.name
        else:
            vtf = vcf_output

        preprocessVCF(vcf_input,
                      vtf,
                      locations,
                      filters == "*",
                      fixchr,
                      bcftools_norm,
                      regions,
                      targets,
                      reference,
                      required_filters)

        if leftshift or decompose:
            Haplo.partialcredit.partialCredit(vtf,
                                              vcf_output,
                                              reference,
                                              locations,
                                              threads=threads,
                                              window=windowsize,
                                              leftshift=leftshift,
                                              decompose=decompose)
    finally:
        for t in tempfiles:
            try:
                os.unlink(t)
            except:
                pass