Python runBcftools Examples

Programming Language: Python

Namespace/Package Name: Tools.bcftools

Method/Function: runBcftools

Examples at hotexamples.com: 18

Python runBcftools - 18 examples found. These are the top rated real world Python examples of Tools.bcftools.runBcftools extracted from open source projects. You can rate examples to help us improve the quality of examples.

Example #1

Show file

File: partialcredit.py Project: xchromosome219/hap.py

def preprocessWrapper(file_and_location, args):
    starttime = time.time()
    filename, location_str = file_and_location
    if args["bcf"]:
        int_suffix = "bcf"
    else:
        int_suffix = "vcf.gz"

    tf = tempfile.NamedTemporaryFile(delete=False,
                                     prefix="input.%s" % location_str,
                                     suffix=".prep." + int_suffix)
    tf.close()

    to_run = "preprocess %s:* %s-o %s -V %i -L %i -r %s" % \
             (pipes.quote(filename),
              ("-l %s " % pipes.quote(location_str)) if location_str else "",
              tf.name,
              args["decompose"],
              args["leftshift"],
              pipes.quote(args["reference"]))

    if args["haploid_x"]:
        to_run += " --haploid-x 1"

    tfe = tempfile.NamedTemporaryFile(delete=False,
                                      prefix="stderr",
                                      suffix=".log")
    tfo = tempfile.NamedTemporaryFile(delete=False,
                                      prefix="stdout",
                                      suffix=".log")
    finished = False
    try:
        logging.info("Running '%s'" % to_run)
        subprocess.check_call(to_run, shell=True, stdout=tfo, stderr=tfe)
        finished = True
    finally:
        if finished:
            tfo.close()
            tfe.close()
            with open(tfo.name) as f:
                for l in f:
                    logging.info(l.replace("\n", ""))
            os.unlink(tfo.name)
            with open(tfe.name) as f:
                for l in f:
                    logging.warn(l.replace("\n", ""))
            os.unlink(tfe.name)
        else:
            logging.error("Preprocess command %s failed. Outputs are here %s / %s" % (to_run, tfo.name, tfe.name))
            with open(tfo.name) as f:
                for l in f:
                    logging.error(l.replace("\n", ""))
            with open(tfe.name) as f:
                for l in f:
                    logging.error(l.replace("\n", ""))

    elapsed = time.time() - starttime
    logging.info("preprocess for %s -- time taken %.2f" % (location_str, elapsed))
    runBcftools("index", tf.name)
    return tf.name

Example #2

Show file

File: partialcredit.py Project: pkrusche/hap.py

def preprocessWrapper(file_and_location, args):
    starttime = time.time()
    filename, location_str = file_and_location
    if args["bcf"]:
        int_suffix = "bcf"
    else:
        int_suffix = "vcf.gz"

    tf = tempfile.NamedTemporaryFile(delete=False, prefix="input.%s" % location_str, suffix=".prep." + int_suffix)
    tf.close()

    to_run = "preprocess %s:* %s-o %s -V %i -L %i -r %s" % (
        filename.replace(" ", "\\ "),
        ("-l %s " % location_str) if location_str else "",
        tf.name,
        args["decompose"],
        args["leftshift"],
        args["reference"],
    )

    tfe = tempfile.NamedTemporaryFile(delete=False, prefix="stderr", suffix=".log")
    tfo = tempfile.NamedTemporaryFile(delete=False, prefix="stdout", suffix=".log")
    try:
        logging.info("Running '%s'" % to_run)
        subprocess.check_call(to_run, shell=True, stdout=tfo, stderr=tfe)
    finally:
        tfo.close()
        tfe.close()
        with open(tfo.name) as f:
            for l in f:
                logging.info(l.replace("\n", ""))
        os.unlink(tfo.name)
        with open(tfe.name) as f:
            for l in f:
                logging.warn(l.replace("\n", ""))
        os.unlink(tfe.name)

    elapsed = time.time() - starttime
    logging.info("preprocess for %s -- time taken %.2f" % (location_str, elapsed))
    runBcftools("index", tf.name)
    return tf.name

Example #3

Show file

File: scmp.py Project: Illumina/hap.py

def runSCmp(vcf1, vcf2, target, args):
    """ Runs scmp, which outputs a file quantify can produce counts on
    vcf1 and vcf2 must be indexed and only contain a single sample column.
    """

    try:
        if args.engine == "scmp-distance":
            cmode = "distance"
        else:
            cmode = "alleles"

        tf = tempfile.NamedTemporaryFile(delete=False)
        tf.close()
        try:
            # change GTs so we can compare them
            vargs = ["merge", "--force-samples", vcf1, vcf2,
                     "-o", tf.name]
            runBcftools(*vargs)
            vargs = ["view", tf.name,
                     "|",
                     "scmp",
                     "-M", cmode,
                     "-", "-r", args.ref,
                     "--threads", str(args.threads),
                     "-o", target]
            if args.roc:
                vargs += ["--q", args.roc]

            vargs += ["--distance-maxdist", str(args.engine_scmp_distance)]
            runBcftools(*vargs)
        finally:
            os.remove(tf.name)

        if target.endswith(".vcf.gz"):
            runBcftools("index", "-t", target)
            return [target, target + ".tbi"]
        else:
            runBcftools("index", target)
            return [target, target + ".csi"]
    except Exception as e:
        logging.error("Exception when running scmp: %s" % str(e))
        logging.error('-'*60)
        traceback.print_exc(file=LoggingWriter(logging.ERROR))
        logging.error('-'*60)
        raise
    except BaseException as e:
        logging.error("Exception when running scmp: %s" % str(e))
        logging.error('-'*60)
        traceback.print_exc(file=LoggingWriter(logging.ERROR))
        logging.error('-'*60)
        raise

Example #4

Show file

File: scmp.py Project: zsanli/docker_hap_roc

def runSCmp(vcf1, vcf2, target, args):
    """ Runs scmp, which outputs a file quantify can produce counts on
    vcf1 and vcf2 must be indexed and only contain a single sample column.
    """

    try:
        if args.engine == "scmp-distance":
            cmode = "distance"
        else:
            cmode = "alleles"

        tf = tempfile.NamedTemporaryFile(delete=False)
        tf.close()
        try:
            # change GTs so we can compare them
            vargs = ["merge", "--force-samples", vcf1, vcf2, "-o", tf.name]
            runBcftools(*vargs)
            vargs = [
                "view", tf.name, "|", "scmp", "-M", cmode, "-", "-r", args.ref,
                "--threads",
                str(args.threads), "-o", target
            ]
            if args.roc:
                vargs += ["--q", args.roc]

            vargs += ["--distance-maxdist", str(args.engine_scmp_distance)]
            runBcftools(*vargs)
        finally:
            os.remove(tf.name)

        if target.endswith(".vcf.gz"):
            runBcftools("index", "-t", target)
            return [target, target + ".tbi"]
        else:
            runBcftools("index", target)
            return [target, target + ".csi"]
    except Exception as e:
        logging.error("Exception when running scmp: %s" % str(e))
        logging.error('-' * 60)
        traceback.print_exc(file=LoggingWriter(logging.ERROR))
        logging.error('-' * 60)
        raise
    except BaseException as e:
        logging.error("Exception when running scmp: %s" % str(e))
        logging.error('-' * 60)
        traceback.print_exc(file=LoggingWriter(logging.ERROR))
        logging.error('-' * 60)
        raise

Example #5

Show file

def runSCmp(vcf1, vcf2, target, args):
    """ Runs scmp, which outputs a file quantify can produce counts on
    vcf1 and vcf2 must be indexed and only contain a single sample column.
    """

    try:
        # change GTs so we can compare them
        vargs = [
            "merge", "--force-samples", vcf1, vcf2, "|", "scmp", "-", "-r",
            args.ref, "--threads",
            str(args.threads), "-o", target
        ]

        if args.roc:
            vargs += ["--q", args.roc]

        runBcftools(*vargs)

        if target.endswith(".vcf.gz"):
            runBcftools("index", "-t", target)
            return [target, target + ".tbi"]
        else:
            runBcftools("index", target)
            return [target, target + ".csi"]
    except Exception as e:
        logging.error("Exception when running scmp: %s" % str(e))
        logging.error('-' * 60)
        traceback.print_exc(file=LoggingWriter(logging.ERROR))
        logging.error('-' * 60)
        raise
    except BaseException as e:
        logging.error("Exception when running scmp: %s" % str(e))
        logging.error('-' * 60)
        traceback.print_exc(file=LoggingWriter(logging.ERROR))
        logging.error('-' * 60)
        raise

Example #6

Show file

def main():
    parser = argparse.ArgumentParser("Somatic VCF Feature Extraction")

    parser.add_argument("input", help="Input VCF file")

    parser.add_argument("-o",
                        "--output",
                        dest="output",
                        required=True,
                        help="Output file name. Output will be in CSV format")

    parser.add_argument("-l",
                        "--location",
                        dest="location",
                        default="",
                        help="Location for bcftools view (e.g. chr1)")

    parser.add_argument(
        "-R",
        "--restrict-regions",
        dest="regions_bedfile",
        default=None,
        type=str,
        help=
        "Restrict analysis to given (sparse) regions (using -R in bcftools).")

    parser.add_argument(
        "-T",
        "--target-regions",
        dest="targets_bedfile",
        default=None,
        type=str,
        help=
        "Restrict analysis to given (dense) regions (using -T in bcftools).")

    parser.add_argument("-P",
                        "--include-nonpass",
                        dest="inc_nonpass",
                        action="store_true",
                        default=False,
                        help="Use to include failing variants in comparison.")

    parser.add_argument(
        "--feature-table",
        dest="features",
        default="generic",
        help="Select a feature table to output. Options are: %s" %
        str(Somatic.FeatureSet.sets.keys()))

    parser.add_argument(
        "--feature-label",
        dest="label",
        default=None,
        help=
        "We will output a lable column, this value will go in there -- default is "
        "the input filename.")

    parser.add_argument(
        "--bam",
        dest="bams",
        default=[],
        action="append",
        help="pass one or more BAM files for feature table extraction")

    parser.add_argument("-r",
                        "--reference",
                        dest="ref",
                        default=Tools.defaultReference(),
                        help="Specify a reference file for normalization.")

    parser.add_argument(
        "--normalize",
        dest="normalize",
        default=False,
        action="store_true",
        help="Enable running of bcftools norm on the input file.")

    parser.add_argument(
        "--fix-chr",
        dest="fixchr",
        default=False,
        action="store_true",
        help="Replace numeric chromosome names in the query by chr*-type names"
    )

    args = parser.parse_args()

    scratch = tempfile.mkdtemp()

    try:
        logging.info("Scratch path is %s" % scratch)

        if not args.label:
            args.label = os.path.basename(args.input)

        bams = []
        md = None
        for x in args.bams:
            bams.append(bamStats(x))

        if bams:
            bres = pandas.concat(bams).groupby("CHROM").mean()
            md = {}
            for x in bres.index:
                logging.info("Mean coverage on %s is %f" %
                             (x, bres.loc[x]["COVERAGE"]))
                md[x] = float(bres.loc[x]["COVERAGE"]) * 3.0

        nqpath = os.path.join(scratch, "normalized_query.vcf.gz")

        logging.info("Preprocessing input...")
        preprocessVCF(
            args.input,
            nqpath,
            args.location,
            not args.inc_nonpass,  # pass_only
            args.fixchr,  # chrprefix
            args.normalize,  # norm,
            args.regions_bedfile,
            args.targets_bedfile,
            args.ref)

        runBcftools("index", nqpath)

        logging.info("Extracting features...")
        fset = Somatic.FeatureSet.make(args.features)
        fset.setChrDepths(md)
        featuretable = fset.collect(nqpath, args.label)

        if not args.output.endswith(".csv"):
            args.output += ".csv"
        logging.info("Saving feature table %s..." % args.output)
        featuretable.to_csv(args.output)

    finally:
        logging.info("Deleting scratch folder %s " % scratch)
        shutil.rmtree(scratch)

Example #7

Show file

File: partialcredit.py Project: zzygyx9119/hap.py

def partialCredit(vcfname,
                  outputname,
                  reference,
                  locations,
                  threads=1,
                  window=10000,
                  leftshift=True,
                  decompose=True,
                  haploid_x=False):
    """ Partial-credit-process a VCF file according to our args """

    pool = getPool(int(threads))
    if threads > 1:
        logging.info("Partial credit processing uses %i parallel processes." % threads)

        if not locations:
            h = extractHeadersJSON(vcfname)
            if not h["tabix"]["chromosomes"]:
                logging.warn("Empty input or not tabix indexed")
                if outputname.endswith(".bcf"):
                    runBcftools("view", "-O", "b", "-o", outputname, vcfname)
                    runBcftools("index", outputname)
                else:
                    runBcftools("view", "-O", "z", "-o", outputname, vcfname)
                    runBcftools("index", "-t", outputname)
                # just return the same file
                return
            locations = h["tabix"]["chromosomes"]
        elif type(locations) is str or type(locations) is unicode:
            locations = locations.split(",")

        # use blocksplit to subdivide input
        res = runParallel(pool,
                          blocksplitWrapper,
                          locations,
                          {"vcf": vcfname,
                           "dist": window,
                           "pieces": min(40, threads*4)})

        if None in res:
            raise Exception("One of the blocksplit processes failed.")

        locations = list(itertools.chain.from_iterable(res))
        if not len(locations):
            logging.warn("Blocksplit returned no blocks. This can happen when "
                         "an input contains no valid variants.")
            locations = [""]
    else:
        locations = [""]

    res = []
    try:
        res = runParallel(pool,
                          preprocessWrapper,
                          itertools.izip(itertools.repeat(vcfname), locations),
                          {"reference": reference,
                           "decompose": decompose,
                           "leftshift": leftshift,
                           "haploid_x": haploid_x,
                           "bcf": outputname.endswith(".bcf")})

        if None in res:
            raise Exception("One of the preprocess jobs failed")
        if not res:
            raise Exception("No blocks were processed. List of locations: %s" % str(list(locations)))

        concatenateParts(outputname, *res)
        if outputname.endswith(".vcf.gz"):
            runBcftools("index", "-f", "-t", outputname)
        else:  # use bcf
            runBcftools("index", "-f", outputname)
    finally:
        for r in res:
            try:
                os.unlink(r)
            except:
                pass
            try:
                os.unlink(r + ".tbi")
            except:
                pass
            try:
                os.unlink(r + ".csi")
            except:
                pass

Example #8

Show file

File: hap.py Project: Jeffleecy/hap.py

def main():
    parser = argparse.ArgumentParser("Haplotype Comparison")

    # input
    parser.add_argument("-v",
                        "--version",
                        dest="version",
                        action="store_true",
                        help="Show version number and exit.")

    parser.add_argument("-r",
                        "--reference",
                        dest="ref",
                        default=None,
                        help="Specify a reference file.")

    # output
    parser.add_argument("-o",
                        "--report-prefix",
                        dest="reports_prefix",
                        default=None,
                        help="Filename prefix for report output.")
    parser.add_argument("--scratch-prefix",
                        dest="scratch_prefix",
                        default=None,
                        help="Directory for scratch files.")
    parser.add_argument("--keep-scratch",
                        dest="delete_scratch",
                        default=True,
                        action="store_false",
                        help="Filename prefix for scratch report output.")

    # add quantification args
    qfy.updateArgs(parser)

    # control preprocessing
    pre.updateArgs(parser)
    parser.add_argument(
        '--convert-gvcf-truth',
        dest='convert_gvcf_truth',
        action="store_true",
        default=False,
        help=
        'Convert the truth set from genome VCF format to a VCF before processing.'
    )
    parser.add_argument(
        '--convert-gvcf-query',
        dest='convert_gvcf_query',
        action="store_true",
        default=False,
        help=
        'Convert the query set from genome VCF format to a VCF before processing.'
    )
    parser.add_argument(
        "--preprocess-truth",
        dest="preprocessing_truth",
        action="store_true",
        default=False,
        help=
        "Preprocess truth file with same settings as query (default is to accept truth in original format)."
    )
    parser.add_argument(
        "--usefiltered-truth",
        dest="usefiltered_truth",
        action="store_true",
        default=False,
        help=
        "Use filtered variant calls in truth file (by default, only PASS calls in the truth file are used)"
    )
    parser.add_argument(
        "--preprocessing-window-size",
        dest="preprocess_window",
        default=10000,
        type=int,
        help=
        "Preprocessing window size (variants further apart than that size are not expected to interfere)."
    )
    parser.add_argument(
        "--adjust-conf-regions",
        dest="preprocessing_truth_confregions",
        action="store_true",
        default=True,
        help=
        "Adjust confident regions to include variant locations. Note this will only include variants "
        "that are included in the CONF regions already when viewing with bcftools; this option only "
        "makes sure insertions are padded correctly in the CONF regions (to capture these, both the "
        "base before and after must be contained in the bed file).")
    parser.add_argument("--no-adjust-conf-regions",
                        dest="preprocessing_truth_confregions",
                        action="store_false",
                        help="Do not adjust confident regions for insertions.")

    # detailed control of comparison
    parser.add_argument(
        "--unhappy",
        "--no-haplotype-comparison",
        dest="no_hc",
        action="store_true",
        default=False,
        help=
        "Disable haplotype comparison (only count direct GT matches as TP).")

    parser.add_argument(
        "-w",
        "--window-size",
        dest="window",
        default=50,
        type=int,
        help=
        "Minimum distance between variants such that they fall into the same superlocus."
    )

    # xcmp-specific stuff
    parser.add_argument(
        "--xcmp-enumeration-threshold",
        dest="max_enum",
        default=16768,
        type=int,
        help=
        "Enumeration threshold / maximum number of sequences to enumerate per block."
    )

    parser.add_argument(
        "--xcmp-expand-hapblocks",
        dest="hb_expand",
        default=30,
        type=int,
        help="Expand haplotype blocks by this many basepairs left and right.")
    parser.add_argument("--threads",
                        dest="threads",
                        default=multiprocessing.cpu_count(),
                        type=int,
                        help="Number of threads to use.")

    parser.add_argument(
        "--engine",
        dest="engine",
        default="xcmp",
        choices=["xcmp", "vcfeval", "scmp-somatic", "scmp-distance"],
        help="Comparison engine to use.")

    parser.add_argument(
        "--engine-vcfeval-path",
        dest="engine_vcfeval",
        required=False,
        default=Haplo.vcfeval.findVCFEval(),
        help="This parameter should give the path to the \"rtg\" executable. "
        "The default is %s" % Haplo.vcfeval.findVCFEval())

    parser.add_argument(
        "--engine-vcfeval-template",
        dest="engine_vcfeval_template",
        required=False,
        help=
        "Vcfeval needs the reference sequence formatted in its own file format "
        "(SDF -- run rtg format -o ref.SDF ref.fa). You can specify this here "
        "to save time when running hap.py with vcfeval. If no SDF folder is "
        "specified, hap.py will create a temporary one.")

    parser.add_argument(
        "--scmp-distance",
        dest="engine_scmp_distance",
        required=False,
        default=30,
        type=int,
        help=
        "For distance-based matching (vcfeval and scmp), this is the distance between variants to use."
    )

    parser.add_argument(
        "--lose-match-distance",
        dest="engine_scmp_distance",
        required=False,
        type=int,
        help=
        "For distance-based matching (vcfeval and scmp), this is the distance between variants to use."
    )

    if Tools.has_sge:
        parser.add_argument(
            "--force-interactive",
            dest="force_interactive",
            default=False,
            action="store_true",
            help=
            "Force running interactively (i.e. when JOB_ID is not in the environment)"
        )

    parser.add_argument("_vcfs", help="Two VCF files.", default=[], nargs="*")

    parser.add_argument(
        "--logfile",
        dest="logfile",
        default=None,
        help="Write logging information into file rather than to stderr")

    verbosity_options = parser.add_mutually_exclusive_group(required=False)

    verbosity_options.add_argument(
        "--verbose",
        dest="verbose",
        default=False,
        action="store_true",
        help="Raise logging level from warning to info.")

    verbosity_options.add_argument(
        "--quiet",
        dest="quiet",
        default=False,
        action="store_true",
        help="Set logging level to output errors only.")

    args, unknown_args = parser.parse_known_args()

    if not Tools.has_sge:
        args.force_interactive = True

    if args.verbose:
        loglevel = logging.INFO
    elif args.quiet:
        loglevel = logging.ERROR
    else:
        loglevel = logging.WARNING

    # reinitialize logging
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)
    logging.basicConfig(filename=args.logfile,
                        format='%(asctime)s %(levelname)-8s %(message)s',
                        level=loglevel)

    # remove some safe unknown args
    unknown_args = [
        x for x in unknown_args if x not in ["--force-interactive"]
    ]
    if len(sys.argv) < 2 or len(unknown_args) > 0:
        if unknown_args:
            logging.error("Unknown arguments specified : %s " %
                          str(unknown_args))
        parser.print_help()
        exit(1)

    print "Hap.py %s" % Tools.version
    if args.version:
        exit(0)

    if args.roc:
        args.write_vcf = True

    # sanity-check regions bed file (HAP-57)
    if args.regions_bedfile:
        logging.info("Checking input regions.")
        if bedOverlapCheck(args.regions_bedfile):
            raise Exception(
                "The regions bed file (specified using -R) has overlaps, this will not work with xcmp."
                " You can either use -T, or run the file through bedtools merge"
            )

    if args.fp_bedfile and not os.path.exists(args.fp_bedfile):
        raise Exception("FP/confident call region bed file does not exist.")

    if not args.force_interactive and "JOB_ID" not in os.environ:
        parser.print_help()
        raise Exception(
            "Please qsub me so I get approximately 1 GB of RAM per thread.")

    if not args.ref:
        args.ref = Tools.defaultReference()

    if not args.ref or not os.path.exists(args.ref):
        raise Exception("Please specify a valid reference path using -r.")

    if not args.reports_prefix:
        raise Exception("Please specify an output prefix using -o ")

    if not os.path.exists(os.path.dirname(os.path.abspath(
            args.reports_prefix))):
        raise Exception(
            "The output path does not exist. Please specify a valid output path and prefix using -o"
        )

    if os.path.basename(args.reports_prefix) == "" or os.path.isdir(
            args.reports_prefix):
        raise Exception(
            "The output path should specify a file name prefix. Please specify a valid output path "
            "and prefix using -o. For example, -o /tmp/test will create files named /tmp/test* ."
        )

    # noinspection PyProtectedMember
    if not args._vcfs or len(args._vcfs) != 2:
        raise Exception("Please specify exactly two input VCFs.")

    # noinspection PyProtectedMember
    args.vcf1 = args._vcfs[0]
    # noinspection PyProtectedMember
    args.vcf2 = args._vcfs[1]

    if not os.path.exists(args.vcf1):
        raise Exception("Input file %s does not exist." % args.vcf1)
    if not os.path.exists(args.vcf2):
        raise Exception("Input file %s does not exist." % args.vcf2)

    tempfiles = []

    # turn on allele conversion
    if (args.engine == "scmp-somatic" or args.engine == "scmp-distance") \
            and not args.somatic_allele_conversion:
        args.somatic_allele_conversion = True
        if args.engine == "scmp-distance":
            args.somatic_allele_conversion = "first"

    # somatic allele conversion should also switch off decomposition
    if args.somatic_allele_conversion and ("-D" not in sys.argv
                                           and "--decompose" not in sys.argv):
        args.preprocessing_decompose = False

    # xcmp/scmp support bcf; others don't
    if args.engine in ["xcmp", "scmp-somatic", "scmp-distance"] \
            and (args.bcf or (args.vcf1.endswith(".bcf") and args.vcf2.endswith(".bcf"))):
        internal_format_suffix = ".bcf"
    else:
        internal_format_suffix = ".vcf.gz"

    # write session info and args file
    session = sessionInfo()
    session["final_args"] = args.__dict__
    with open(args.reports_prefix + ".runinfo.json", "w") as sessionfile:
        json.dump(session, sessionfile)

    try:
        logging.info("Comparing %s and %s" % (args.vcf1, args.vcf2))

        logging.info("Preprocessing truth: %s" % args.vcf1)
        starttime = time.time()

        ttf = tempfile.NamedTemporaryFile(delete=False,
                                          dir=args.scratch_prefix,
                                          prefix="truth.pp",
                                          suffix=internal_format_suffix)
        ttf.close()

        if args.engine.endswith("somatic") and \
           args.preprocessing_truth and \
           (args.preprocessing_leftshift or args.preprocessing_norm or args.preprocessing_decompose):
            args.preprocessing_truth = False
            logging.info(
                "Turning off pre.py preprocessing for somatic comparisons")

        if args.preprocessing_truth:
            if args.filter_nonref:
                logging.info(
                    "Filtering out any variants genotyped as <NON_REF>")

        ## Only converting truth gvcf to vcf if both arguments are true
        convert_gvcf_truth = False
        if args.convert_gvcf_truth or args.convert_gvcf_to_vcf:
            logging.info("Converting genome VCF to VCF")
            convert_gvcf_truth = True

        tempfiles.append(ttf.name)
        tempfiles.append(ttf.name + ".csi")
        tempfiles.append(ttf.name + ".tbi")
        args.gender = pre.preprocess(
            args.vcf1,
            ttf.name,
            args.ref,
            args.locations,
            None if args.usefiltered_truth else "*",  # filters
            args.fixchr,
            args.regions_bedfile,
            args.targets_bedfile,
            args.preprocessing_leftshift
            if args.preprocessing_truth else False,
            args.preprocessing_decompose
            if args.preprocessing_truth else False,
            args.preprocessing_norm if args.preprocessing_truth else False,
            args.preprocess_window,
            args.threads,
            args.gender,
            args.somatic_allele_conversion,
            "TRUTH",
            filter_nonref=args.filter_nonref
            if args.preprocessing_truth else False,
            convert_gvcf_to_vcf=convert_gvcf_truth)

        args.vcf1 = ttf.name

        if args.fp_bedfile and args.preprocessing_truth_confregions:
            conf_temp = Haplo.gvcf2bed.gvcf2bed(args.vcf1, args.ref,
                                                args.fp_bedfile,
                                                args.scratch_prefix)
            tempfiles.append(conf_temp)
            args.strat_regions.append("CONF_VARS:" + conf_temp)

        h1 = vcfextract.extractHeadersJSON(args.vcf1)

        elapsed = time.time() - starttime
        logging.info("preprocess for %s -- time taken %.2f" %
                     (args.vcf1, elapsed))

        # once we have preprocessed the truth file we can resolve the locations
        # doing this here improves the time for query preprocessing below
        reference_contigs = set(fastaContigLengths(args.ref).keys())

        if not args.locations:
            # default set of locations is the overlap between truth and reference
            args.locations = list(reference_contigs
                                  & set(h1["tabix"]["chromosomes"]))
            if not args.locations:
                raise Exception(
                    "Truth and reference have no chromosomes in common!")
        elif type(args.locations) is not list:
            args.locations = args.locations.split(",")

        args.locations = sorted(args.locations)

        logging.info("Preprocessing query: %s" % args.vcf2)
        if args.filter_nonref:
            logging.info("Filtering out any variants genotyped as <NON_REF>")

        ## Only converting truth gvcf to vcf if both arguments are true
        convert_gvcf_query = False
        if args.convert_gvcf_query or args.convert_gvcf_to_vcf:
            logging.info("Converting genome VCF to VCF")
            convert_gvcf_query = True

        starttime = time.time()

        if args.pass_only:
            filtering = "*"
        else:
            filtering = args.filters_only

        qtf = tempfile.NamedTemporaryFile(delete=False,
                                          dir=args.scratch_prefix,
                                          prefix="query.pp",
                                          suffix=internal_format_suffix)
        qtf.close()
        tempfiles.append(qtf.name)
        tempfiles.append(qtf.name + ".csi")
        tempfiles.append(qtf.name + ".tbi")

        if args.engine.endswith("somatic") and \
           (args.preprocessing_leftshift or args.preprocessing_norm or args.preprocessing_decompose):
            args.preprocessing_leftshift = False
            args.preprocessing_norm = False
            args.preprocessing_decompose = False
            logging.info(
                "Turning off pre.py preprocessing (query) for somatic comparisons"
            )

        pre.preprocess(
            args.vcf2,
            qtf.name,
            args.ref,
            str(",".join(args.locations)),
            filtering,
            args.fixchr,
            args.regions_bedfile,
            args.targets_bedfile,
            args.preprocessing_leftshift,
            args.preprocessing_decompose,
            args.preprocessing_norm,
            args.preprocess_window,
            args.threads,
            args.gender,  # same gender as truth above
            args.somatic_allele_conversion,
            "QUERY",
            filter_nonref=args.filter_nonref,
            convert_gvcf_to_vcf=convert_gvcf_query)

        args.vcf2 = qtf.name
        h2 = vcfextract.extractHeadersJSON(args.vcf2)

        elapsed = time.time() - starttime
        logging.info("preprocess for %s -- time taken %.2f" %
                     (args.vcf2, elapsed))

        if not h1["tabix"]:
            raise Exception("Truth file is not indexed after preprocesing.")

        if not h2["tabix"]:
            raise Exception("Query file is not indexed after preprocessing.")

        for _xc in args.locations:
            if _xc not in h2["tabix"]["chromosomes"]:
                logging.warn("No calls for location %s in query!" % _xc)

        pool = getPool(args.threads)
        if args.threads > 1 and args.engine == "xcmp":
            logging.info("Running using %i parallel processes." % args.threads)

            # find balanced pieces
            # cap parallelism at 64 since otherwise bcftools concat below might run out
            # of file handles
            args.pieces = min(args.threads, 64)
            res = runParallel(pool, Haplo.blocksplit.blocksplitWrapper,
                              args.locations, args)

            if None in res:
                raise Exception("One of the blocksplit processes failed.")

            tempfiles += res

            args.locations = []
            for f in res:
                with open(f) as fp:
                    for l in fp:
                        ll = l.strip().split("\t", 3)
                        if len(ll) < 3:
                            continue
                        xchr = ll[0]
                        start = int(ll[1]) + 1
                        end = int(ll[2])
                        args.locations.append("%s:%i-%i" % (xchr, start, end))

        # count variants before normalisation
        if "samples" not in h1 or not h1["samples"]:
            raise Exception("Cannot read sample names from truth VCF file")

        if "samples" not in h2 or not h2["samples"]:
            raise Exception("Cannot read sample names from query VCF file")

        tf = tempfile.NamedTemporaryFile(delete=False,
                                         dir=args.scratch_prefix,
                                         prefix="hap.py.result.",
                                         suffix=internal_format_suffix)
        tf.close()
        tempfiles.append(tf.name)
        tempfiles.append(tf.name + ".tbi")
        tempfiles.append(tf.name + ".csi")
        output_name = tf.name

        if args.engine == "xcmp":
            # do xcmp
            logging.info("Using xcmp for comparison")
            res = runParallel(pool, Haplo.xcmp.xcmpWrapper, args.locations,
                              args)
            tempfiles += [x for x in res if x is not None]  # VCFs

            if None in res:
                raise Exception("One of the xcmp jobs failed.")

            if len(res) == 0:
                raise Exception(
                    "Input files/regions do not contain variants (0 haplotype blocks were processed)."
                )

            # concatenate + index
            logging.info("Concatenating variants...")
            runme_list = [x for x in res if x is not None]
            if len(runme_list) == 0:
                raise Exception("No outputs to concatenate!")

            logging.info("Concatenating...")
            bcftools.concatenateParts(output_name, *runme_list)
            logging.info("Indexing...")
            bcftools.runBcftools("index", output_name)
            # passed to quantify
            args.type = "xcmp"
            # xcmp extracts whichever field we're using into the QQ info field
            args.roc_header = args.roc
            args.roc = "IQQ"
        elif args.engine == "vcfeval":
            tempfiles += Haplo.vcfeval.runVCFEval(args.vcf1, args.vcf2,
                                                  output_name, args)
            # passed to quantify
            args.type = "ga4gh"
        elif args.engine.startswith("scmp"):
            tempfiles += Haplo.scmp.runSCmp(args.vcf1, args.vcf2, output_name,
                                            args)
            # passed to quantify
            args.type = "ga4gh"
        else:
            raise Exception("Unknown comparison engine: %s" % args.engine)

        if args.preserve_info and args.engine == "vcfeval":
            # if we use vcfeval we need to merge the INFO fields back in.
            tf = tempfile.NamedTemporaryFile(suffix=".txt", delete=False)
            tempfiles.append(tf)
            print >> tf, "TRUTH_IN"
            print >> tf, "QUERY_IN"
            tf.close()
            info_file = tempfile.NamedTemporaryFile(suffix=".vcf.gz",
                                                    delete=False)
            tempfiles.append(info_file.name)
            info_file.close()

            bcftools.runBcftools("merge", args.vcf1, args.vcf2,
                                 "--force-samples", "-m", "all", "|",
                                 "bcftools", "reheader", "-s", tf.name, "|",
                                 "bcftools", "view", "-o", info_file.name,
                                 "-O", "z")
            bcftools.runBcftools("index", info_file.name)

            merged_info_file = tempfile.NamedTemporaryFile(suffix=".vcf.gz",
                                                           delete=False)
            tempfiles.append(merged_info_file.name)
            merged_info_file.close()

            bcftools.runBcftools("merge", output_vcf, info_file.name, "-m",
                                 "all", "|", "bcftools", "view", "-s",
                                 "^TRUTH_IN,QUERY_IN", "-X", "-U", "-o",
                                 merged_info_file.name, "-O", "z")
            output_name = merged_info_file.name

        args.in_vcf = [output_name]
        args.runner = "hap.py"
        qfy.quantify(args)

    finally:
        if args.delete_scratch:
            for x in tempfiles:
                try:
                    os.remove(x)
                except:
                    pass
        else:
            logging.info("Scratch files kept : %s" % (str(tempfiles)))

Example #9

Show file

def run_quantify(filename,
                 output_file=None,
                 write_vcf=False,
                 regions=None,
                 reference=Tools.defaultReference(),
                 locations=None,
                 threads=1,
                 output_vtc=False,
                 output_rocs=False,
                 qtype=None,
                 roc_file=None,
                 roc_val=None,
                 roc_header=None,
                 roc_filter=None,
                 roc_delta=None,
                 roc_regions=None,
                 clean_info=True,
                 strat_fixchr=False):
    """Run quantify and return parsed JSON

    :param filename: the VCF file name
    :param output_file: output file name (if None, will use a temp file)
    :param write_vcf: write annotated VCF (give filename)
    :type write_vcf: str
    :param regions: dictionary of stratification region names and file names
    :param reference: reference fasta path
    :param locations: a location to use
    :param output_vtc: enable / disable the VTC field
    :param output_rocs: enable / disable output of ROCs by QQ level
    :param roc_file: filename for a TSV file with ROC observations
    :param roc_val: field to use for ROC QQ
    :param roc_header: name of ROC value for tables
    :param roc_filter: ROC filtering settings
    :param roc_delta: ROC minimum spacing between levels
    :param roc_regions: List of regions to output full ROCs for
    :param clean_info: remove unused INFO fields
    :param strat_fixchr: fix chr naming in stratification regions
    :returns: parsed counts JSON
    """

    if not output_file:
        output_file = tempfile.NamedTemporaryFile().name

    run_str = "quantify %s -o %s" % (pipes.quote(filename),
                                     pipes.quote(output_file))
    run_str += " -r %s" % pipes.quote(reference)
    run_str += " --threads %i" % threads

    if output_vtc:
        run_str += " --output-vtc 1"
    else:
        run_str += " --output-vtc 0"

    if output_rocs:
        run_str += " --output-rocs 1"
    else:
        run_str += " --output-rocs 0"

    if qtype:
        run_str += " --type %s" % qtype

    if roc_file:
        run_str += " --output-roc %s" % pipes.quote(roc_file)

    if roc_val:
        run_str += " --qq %s" % pipes.quote(roc_val)
        if roc_header != roc_val:
            # for xcmp, we extract the QQ value into the IQQ INFO field
            # we pass the original name along here
            run_str += " --qq-header %s" % pipes.quote(roc_header)

    if roc_filter:
        run_str += " --roc-filter '%s'" % pipes.quote(roc_filter)

    if roc_delta:
        run_str += " --roc-delta %f" % roc_delta

    if clean_info:
        run_str += " --clean-info 1"
    else:
        run_str += " --clean-info 0"

    if strat_fixchr:
        run_str += " --fix-chr-regions 1"
    else:
        run_str += " --fix-chr-regions 0"

    if write_vcf:
        if not write_vcf.endswith(".vcf.gz") and not write_vcf.endswith(
                ".bcf"):
            write_vcf += ".vcf.gz"
        run_str += " -v %s" % pipes.quote(write_vcf)

    if regions:
        for k, v in regions.iteritems():
            run_str += " -R '%s:%s'" % (k, v)

    if roc_regions:
        for r in roc_regions:
            run_str += " --roc-regions '%s'" % r

    location_file = None
    if locations:
        location_file = _locations_tmp_bed_file(locations)
        run_str += " --only '%s'" % location_file

    tfe = tempfile.NamedTemporaryFile(delete=False,
                                      prefix="stderr",
                                      suffix=".log")
    tfo = tempfile.NamedTemporaryFile(delete=False,
                                      prefix="stdout",
                                      suffix=".log")

    logging.info("Running '%s'" % run_str)

    try:
        subprocess.check_call(run_str, shell=True, stdout=tfo, stderr=tfe)
    except:
        tfo.close()
        tfe.close()
        with open(tfo.name) as f:
            for l in f:
                logging.error("[stdout] " + l.replace("\n", ""))
        os.unlink(tfo.name)
        with open(tfe.name) as f:
            for l in f:
                logging.error("[stderr] " + l.replace("\n", ""))
        os.unlink(tfe.name)
        if location_file:
            os.unlink(location_file)
        raise

    tfo.close()
    tfe.close()
    with open(tfo.name) as f:
        for l in f:
            logging.info("[stdout] " + l.replace("\n", ""))
    os.unlink(tfo.name)
    with open(tfe.name) as f:
        for l in f:
            logging.info("[stderr] " + l.replace("\n", ""))
    os.unlink(tfe.name)
    if location_file:
        os.unlink(location_file)

    if write_vcf and write_vcf.endswith(".bcf"):
        runBcftools("index", write_vcf)
    elif write_vcf:
        to_run = "tabix -p vcf %s" % pipes.quote(write_vcf)
        logging.info("Running '%s'" % to_run)
        subprocess.check_call(to_run, shell=True)

Example #10

Show file

def main():
    parser = argparse.ArgumentParser("Somatic Comparison")

    parser.add_argument("truth", help="Truth VCF file")
    parser.add_argument("query", help="Query VCF file")

    parser.add_argument("-o", "--output", dest="output", required=True,
                        help="Output file prefix for statistics and feature table (when selected)")

    parser.add_argument("-l", "--location", dest="location", default="",
                        help="Location for bcftools view (e.g. chr1)")

    parser.add_argument("-R", "--restrict-regions", dest="regions_bedfile",
                        default=None, type=str,
                        help="Restrict analysis to given (sparse) regions (using -R in bcftools).")

    parser.add_argument("-T", "--target-regions", dest="targets_bedfile",
                        default=None, type=str,
                        help="Restrict analysis to given (dense) regions (using -T in bcftools).")

    parser.add_argument("-f", "--false-positives", dest="FP",
                        help="False-positive region bed file to distinguish UNK from FP")

    parser.add_argument("-a", "--ambiguous", dest="ambi", action='append',
                        help="Ambiguous region bed file(s) to distinguish from FP (e.g. variant only observed "
                             "in some replicates)")

    parser.add_argument("--ambi-fp", dest="ambi_fp", action='store_true', default=False,
                        help="Use FP calls from ambiguous region files also.")

    parser.add_argument("--no-ambi-fp", dest="ambi_fp", action='store_false',
                        help="Do not use FP calls from ambiguous region files also.")

    parser.add_argument("--count-unk", dest="count_unk", action='store_true', default=False,
                        help="Assume the truth set covers the whole genome and only count FPs in regions "
                             "specified by the truth VCF or ambiguous/false-positive regions.")

    parser.add_argument("--no-count-unk", dest="count_unk", action='store_false',
                        help="Do not use FP calls from ambiguous region files also.")

    parser.add_argument("-e", "--explain_ambiguous", dest="explain_ambiguous", required=False,
                        default=False, action="store_true",
                        help="print a table giving the number of ambiguous events per category")

    parser.add_argument("-r", "--reference", dest="ref", default=Tools.defaultReference(),
                        help="Specify a reference file.")

    parser.add_argument("--scratch-prefix", dest="scratch_prefix",
                        default=None,
                        help="Filename prefix for scratch report output.")

    parser.add_argument("--keep-scratch", dest="delete_scratch",
                        default=True, action="store_false",
                        help="Filename prefix for scratch report output.")

    parser.add_argument("--continue", dest="cont", default=False, action="store_true",
                        help="Continue from scratch space (i.e. use VCFs in there if they already exist).")

    parser.add_argument("-P", "--include-nonpass", dest="inc_nonpass", action="store_true", default=False,
                        help="Use to include failing variants in comparison.")

    parser.add_argument("--feature-table", dest="features", default=False, choices=Somatic.FeatureSet.sets.keys(),
                        help="Select a feature table to output.")

    parser.add_argument("--bam", dest="bams", default=[], action="append",
                        help="pass one or more BAM files for feature table extraction")

    parser.add_argument("--normalize-truth", dest="normalize_truth", default=False, action="store_true",
                        help="Enable running of bcftools norm on the truth file.")

    parser.add_argument("--normalize-query", dest="normalize_query", default=False, action="store_true",
                        help="Enable running of bcftools norm on the query file.")

    parser.add_argument("-N", "--normalize-all", dest="normalize_all", default=False, action="store_true",
                        help="Enable running of bcftools norm on both truth and query file.")

    parser.add_argument("--fixchr-truth", dest="fixchr_truth", action="store_true", default=True,
                        help="Add chr prefix to truth file (default: true).")

    parser.add_argument("--fixchr-query", dest="fixchr_query", action="store_true", default=True,
                        help="Add chr prefix to query file (default: true).")

    parser.add_argument("--fix-chr-truth", dest="fixchr_truth", action="store_true", default=None,
                        help="Same as --fixchr-truth.")

    parser.add_argument("--fix-chr-query", dest="fixchr_query", action="store_true", default=None,
                        help="Same as --fixchr-query.")

    parser.add_argument("--no-fixchr-truth", dest="fixchr_truth", action="store_false", default=False,
                        help="Disable chr replacement for truth (default: false).")

    parser.add_argument("--no-fixchr-query", dest="fixchr_query", action="store_false", default=False,
                        help="Add chr prefix to query file (default: false).")

    parser.add_argument("--no-order-check", dest="disable_order_check", default=False, action="store_true",
                        help="Disable checking the order of TP features (dev feature).")

    parser.add_argument("--roc", dest="roc", default=None, choices=ROC.list(),
                        help="Create a ROC-style table. This is caller specific "
                             " - this will override the --feature-table switch!")

    parser.add_argument("--bin-afs", dest="af_strat", default=None, action="store_true",
                        help="Stratify into different AF buckets. This needs to have features available"
                             "for getting the AF both in truth and query variants.")
    parser.add_argument("--af-binsize", dest="af_strat_binsize", default=0.2,
                        help="Bin size for AF binning (should be < 1). Multiple bin sizes can be specified using a comma, "
                             "e.g. 0.1,0.2,0.5,0.2 will split at 0.1, 0.3, 0.8 and 1.0.")
    parser.add_argument("--af-truth", dest="af_strat_truth", default="I.T_ALT_RATE",
                        help="Feature name to use for retrieving AF for truth variants (TP and FN)")
    parser.add_argument("--af-query", dest="af_strat_query", default="T_AF",
                        help="Feature name to use for retrieving AF for query variants (FP/UNK/AMBI)")

    parser.add_argument("-FN", "--count-filtered-fn", dest="count_filtered_fn", action="store_true",
                        help="Count filtered vs. absent FN numbers. This requires the -P switch (to use all "
                             "variants) and either the --feature-table or --roc switch.")

    parser.add_argument("--fp-region-size", dest="fpr_size",
                        help="How to obtain the normalisation constant for FP rate. By default, this will use the FP region bed file size when using"
                             " --count-unk and the size of all reference contigs that overlap with the location specified in -l otherwise."
                             " This can be overridden with: 1) a number of nucleotides, or 2) \"auto\" to use the lengths of all contigs that have calls."
                             " The resulting value is used as fp.region.size.")

    parser.add_argument("--ci-level", dest="ci_level", default=0.95, type = float,
                        help="Confidence level for precision/recall confidence intervals (default: 0.95)")

    parser.add_argument("--logfile", dest="logfile", default=None,
                        help="Write logging information into file rather than to stderr")

    verbosity_options = parser.add_mutually_exclusive_group(required=False)

    verbosity_options.add_argument("--verbose", dest="verbose", default=False, action="store_true",
                                   help="Raise logging level from warning to info.")

    verbosity_options.add_argument("--quiet", dest="quiet", default=False, action="store_true",
                                   help="Set logging level to output errors only.")

    args = parser.parse_args()

    if args.verbose:
        loglevel = logging.INFO
    elif args.quiet:
        loglevel = logging.ERROR
    else:
        loglevel = logging.WARNING

    try:
        if type(args.af_strat_binsize) is str:
            args.af_strat_binsize = map(float, args.af_strat_binsize.split(","))
        else:
            args.af_strat_binsize = map(float, [args.af_strat_binsize])

        if not args.af_strat_binsize:
            raise Exception("Bin size list is empty")
    except:
        logging.error("Failed to parse stratification bin size: %s" % str(args.af_strat_binsize))
        exit(1)

    # reinitialize logging
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)
    logging.basicConfig(filename=args.logfile,
                        format='%(asctime)s %(levelname)-8s %(message)s',
                        level=loglevel)

    if args.normalize_all:
        args.normalize_truth = True
        args.normalize_query = True

    if args.roc:
        args.roc = ROC.make(args.roc)
        args.features = args.roc.ftname
        if not args.inc_nonpass:
            logging.warn("When creating ROCs without the -P switch, the ROC data points will only "
                         "include filtered variants (i.e. they will normally end at the caller's "
                         "quality threshold).")

    if not (args.ci_level > 0.0 and args.ci_level < 1.0):
        raise Exception("Confidence interval level must be > 0.0 and < 1.0.")

    if args.af_strat and not args.features:
        raise Exception("To stratify by AFs, a feature table must be selected -- use this switch together "
                        "with --feature-table or --roc")

    if args.count_filtered_fn and (not args.inc_nonpass or not args.features):
        raise Exception("Counting filtered / unfiltered FNs only works when a feature table is selected, "
                        "and when using unfiltered variants. Specify -P --feature-table <...> or use "
                        "--roc to select a ROC type.")

    if args.scratch_prefix:
        scratch = os.path.abspath(args.scratch_prefix)
        args.delete_scratch = False
        Tools.mkdir_p(scratch)
    else:
        scratch = tempfile.mkdtemp()

    logging.info("Scratch path is %s" % scratch)
    try:
        bams = []
        md = None
        for x in args.bams:
            bams.append(bamStats(x))

        if bams:
            bres = pandas.concat(bams).groupby("CHROM").mean()

            md = {}

            for x in bres.index:
                logging.info("Mean coverage on %s is %f" % (x, bres.loc[x]["COVERAGE"]))
                md[x] = float(bres.loc[x]["COVERAGE"]) * 3.0

        logging.info("Normalizing/reading inputs")

        ntpath = os.path.join(scratch, "normalized_truth.vcf.gz")

        if not (args.cont and os.path.exists(ntpath)):
            preprocessVCF(args.truth, ntpath, args.location,
                          True,  # pass_only
                          args.fixchr_truth,  # chrprefix
                          args.normalize_truth,  # norm,
                          args.regions_bedfile,
                          args.targets_bedfile,
                          args.ref)
        else:
            logging.info("Continuing from %s" % ntpath)

        if not (args.cont and os.path.exists(ntpath + ".csi")):
            runBcftools("index", ntpath)

        nqpath = os.path.join(scratch, "normalized_query.vcf.gz")

        if not (args.cont and os.path.exists(nqpath)):
            preprocessVCF(args.query, nqpath, args.location,
                          not args.inc_nonpass,  # pass_only
                          args.fixchr_query,  # chrprefix
                          args.normalize_query,  # norm,
                          args.regions_bedfile,
                          args.targets_bedfile,
                          args.ref)
        else:
            logging.info("Continuing from %s" % nqpath)

        if not (args.cont and os.path.exists(nqpath + ".csi")):
            runBcftools("index", nqpath)

        logging.info("Intersecting")

        tpfn_files = all([os.path.exists(os.path.join(scratch, "tpfn", "0000.vcf.gz")),
                          os.path.exists(os.path.join(scratch, "tpfn", "0001.vcf.gz")),
                          os.path.exists(os.path.join(scratch, "tpfn", "0002.vcf.gz"))])

        tpfn_r_files = all([os.path.exists(os.path.join(scratch, "tpfn", "0000.vcf.gz")),
                            os.path.exists(os.path.join(scratch, "tpfn", "0001.vcf.gz")),
                            os.path.exists(os.path.join(scratch, "tpfn", "0002.vcf.gz"))])

        if not (args.cont and tpfn_files):
            runBcftools("isec", ntpath, nqpath, "-p", os.path.join(scratch, "tpfn"), "-O", "z")
        else:
            logging.info("Continuing from %s" % os.path.join(scratch, "tpfn"))

        if args.features and not (args.cont and tpfn_r_files):
            # only need to do this for getting the feature table
            runBcftools("isec", nqpath, ntpath, "-p", os.path.join(scratch, "tpfn_r"), "-O", "z")

        logging.info("Getting FPs / Ambi / Unk")

        fppath = os.path.join(scratch, "fp.vcf.gz")
        unkpath = os.path.join(scratch, "unk.vcf.gz")
        ambipath = os.path.join(scratch, "ambi.vcf.gz")

        # get header to print to unk and ambi VCFs
        rununiquepath = os.path.join(scratch, "tpfn", "0001.vcf.gz")
        header = runBcftools("view", rununiquepath, "--header-only")

        fp = Tools.BGZipFile(fppath, True)
        fp.write(header)

        unk = Tools.BGZipFile(unkpath, True)
        unk.write(header)

        ambi = Tools.BGZipFile(ambipath, True)
        ambi.write(header)

        ambiClasses = Counter()
        ambiReasons = Counter()

        fpclasses = BedIntervalTree()
        if args.ambi:
            # can have multiple ambiguous BED files
            for aBED in args.ambi:
                # auto-label from first value after chr start end
                # new ambi files have the label in position 4
                # old ones will look weird here.
                fpclasses.addFromBed(aBED, lambda xe: xe[4], args.fixchr_truth)

        if args.FP:
            fpclasses.addFromBed(args.FP, "FP", args.fixchr_truth)

        # split VCF into FP, UNK and AMBI
        toProcess = gzip.open(rununiquepath, "rb")
        for entry in toProcess:
            if entry[0] == '#':
                continue

            fields = entry.strip().split("\t")
            chrom = fields[0]
            start = int(fields[1])
            stop = int(fields[1]) + len(fields[3])

            overlap = fpclasses.intersect(chrom, start, stop)

            is_fp = False
            is_ambi = False

            classes_this_pos = set()

            for o in overlap:
                reason = o.value[0]
                if reason == "fp" and args.ambi_fp:
                    reason = "FP"
                elif reason == "fp":
                    reason = "ambi-fp"
                elif reason == "unk":
                    reason = "ambi-unk"

                classes_this_pos.add(reason)
                try:
                    ambiReasons["%s: rep. count %s" % (reason, o.value[1])] += 1
                except IndexError:
                    ambiReasons["%s: rep. count *" % reason] += 1
                for x in o.value[3:]:
                    ambiReasons["%s: %s" % (reason, x)] += 1
                if reason == "FP":
                    is_fp = True
                else:
                    is_ambi = True

            for reason in classes_this_pos:
                ambiClasses[reason] += 1

            if is_fp:
                fp.write(entry)
            elif is_ambi:
                ambi.write(entry)
            elif not args.count_unk:
                # when we don't have FP regions, unk stuff becomes FP
                fp.write(entry)
            else:
                unk.write(entry)

        toProcess.close()

        # since 0001.vcf.gz should already be sorted, we can just convert to bgzipped vcf
        # and create index
        fp.close()
        ambi.close()
        unk.close()

        runBcftools("index", "--tbi", fppath)
        runBcftools("index", "--tbi", unkpath)
        runBcftools("index", "--tbi", ambipath)

        logging.info("Counting variants...")

        truthcounts = parseStats(runBcftools("stats", ntpath), "total.truth")
        querycounts = parseStats(runBcftools("stats", nqpath), "total.query")

        tpcounts = parseStats(runBcftools("stats", os.path.join(scratch, "tpfn", "0002.vcf.gz")), "tp")
        fncounts = parseStats(runBcftools("stats", os.path.join(scratch, "tpfn", "0000.vcf.gz")), "fn")
        fpcounts = parseStats(runBcftools("stats", fppath), "fp")
        ambicounts = parseStats(runBcftools("stats", ambipath), "ambi")
        unkcounts = parseStats(runBcftools("stats", unkpath), "unk")

        res = pandas.merge(truthcounts, querycounts, on="type")
        res = pandas.merge(res, tpcounts, on="type")
        res = pandas.merge(res, fpcounts, on="type")
        res = pandas.merge(res, fncounts, on="type")
        res = pandas.merge(res, unkcounts, on="type")
        res = pandas.merge(res, ambicounts, on="type")

        # no explicit guarantee that total.query is equal to unk + ambi + fp + tp
        # testSum = res["fp"] + res["tp"] + res["unk"] + res["ambi"]

        # filter and relabel
        res = res[res["type"] != "samples"]
        res = res[res["type"] != "multiallelic SNP sites"]
        res = res[res["type"] != "multiallelic sites"]
        res.loc[res["type"] == "SNPs", "type"] = "SNVs"

        metrics_output = makeMetricsObject("som.py.comparison")

        if args.ambi and args.explain_ambiguous:
            ac = list(ambiClasses.iteritems())
            if ac:
                ambie = pandas.DataFrame(ac, columns=["class", "count"])
                ambie.sort(["class"], inplace=True)
                pandas.set_option("display.max_rows", 1000)
                pandas.set_option("display.max_columns", 1000)
                pandas.set_option("display.width", 1000)
                pandas.set_option("display.height", 1100)
                logging.info("FP/ambiguity classes with info (multiple classes can "
                             "overlap):\n" + ambie.to_string(index=False))
                # in default mode, print result summary to stdout
                if not args.quiet and not args.verbose:
                    print "FP/ambiguity classes with info (multiple classes can " \
                          "overlap):\n" + ambie.to_string(index=False)
                ambie.to_csv(args.output + ".ambiclasses.csv")
                metrics_output["metrics"].append(dataframeToMetricsTable("ambiclasses", ambie))
            else:
                logging.info("No ambiguous variants.")

            ar = list(ambiReasons.iteritems())
            if ar:
                ambie = pandas.DataFrame(ar, columns=["reason", "count"])
                ambie.sort(["reason"], inplace=True)
                pandas.set_option("display.max_rows", 1000)
                pandas.set_option("display.max_columns", 1000)
                pandas.set_option("display.width", 1000)
                pandas.set_option("display.height", 1100)
                logging.info("Reasons for defining as ambiguous (multiple reasons can overlap):\n" + ambie.to_string(
                    formatters={'reason': '{{:<{}s}}'.format(ambie['reason'].str.len().max()).format}, index=False))
                # in default mode, print result summary to stdout
                if not args.quiet and not args.verbose:
                    print "Reasons for defining as ambiguous (multiple reasons can overlap):\n" + ambie.to_string(
                        formatters={'reason': '{{:<{}s}}'.format(ambie['reason'].str.len().max()).format}, index=False)
                ambie.to_csv(args.output + ".ambireasons.csv")
                metrics_output["metrics"].append(dataframeToMetricsTable("ambireasons", ambie))
            else:
                logging.info("No ambiguous variants.")

        if args.features:
            logging.info("Extracting features...")
            fset = Somatic.FeatureSet.make(args.features)
            fset.setChrDepths(md)

            logging.info("Collecting TP info (1)...")
            tps = fset.collect(os.path.join(scratch, "tpfn", "0002.vcf.gz"), "TP")

            # TP_r is a hint for fset, they are both TPs
            logging.info("Collecting TP info (2)...")
            tps2 = fset.collect(os.path.join(scratch, "tpfn_r", "0002.vcf.gz"), "TP_r")

            # this is slow because it tries to sort
            # ... which we don't need to do since tps1 and tps2 have the same ordering

            logging.info("Sorting...")
            tps.sort(["CHROM", "POS"], inplace=True)
            tps2.sort(["CHROM", "POS"], inplace=True)
            tps = tps.reset_index(drop=True)
            tps2 = tps2.reset_index(drop=True)

            logging.info("Merging TP info...")
            columns_tps = list(tps)
            columns_tps2 = list(tps2)

            len1 = tps.shape[0]
            len2 = tps2.shape[0]

            if len1 != len2:
                raise Exception("Cannot read TP features, lists have different lengths : %i != %i" % (len1, len2))

            if not args.disable_order_check:
                logging.info("Checking order %i / %i" % (len1, len2))

                for x in xrange(0, len1):
                    for a in ["CHROM", "POS"]:
                        if tps.loc[x][a] != tps2.loc[x][a]:
                            raise Exception("Cannot merge TP features, inputs are out of order at %s / %s" % (
                                str(tps[x:x + 1]), str(tps2[x:x + 1])))

            logging.info("Merging...")

            cdata = {
                "CHROM": tps["CHROM"],
                "POS": tps["POS"],
                "tag": tps["tag"]
            }

            tpc = pandas.DataFrame(cdata, columns=["CHROM", "POS", "tag"])

            all_columns = list(set(columns_tps + columns_tps2))
            for a in all_columns:
                if a in columns_tps and a not in columns_tps2:
                    tpc[a] = tps[a]
                elif a not in columns_tps and a in columns_tps2:
                    tpc[a] = tps2[a]
                elif a not in ["CHROM", "POS", "tag"]:
                    tpc[a] = tps2[a]
                    tpc[a + ".truth"] = tps[a]

            logging.info("Collecting FP info...")
            fps = fset.collect(fppath, "FP")
            ambs = fset.collect(ambipath, "AMBI")

            logging.info("Collecting FN info...")
            fns = fset.collect(os.path.join(scratch, "tpfn", "0000.vcf.gz"), "FN")

            renamed = {}
            tp_cols = list(tpc)
            for col in list(fns):
                if col + ".truth" in tp_cols:
                    renamed[col] = col + ".truth"
            fns.rename(columns=renamed, inplace=True)

            featurelist = [tpc, fps, fns, ambs]

            if unkpath is not None:
                logging.info("Collecting UNK info...")
                unk = fset.collect(unkpath, "UNK")
                featurelist.append(unk)

            logging.info("Making feature table...")
            featuretable = pandas.concat(featurelist)

            # reorder to make more legible
            first_columns = ["CHROM", "POS", "tag"]
            # noinspection PyTypeChecker
            all_columns = list(featuretable)

            if "REF" in all_columns:
                first_columns.append("REF")

            if "REF.truth" in all_columns:
                first_columns.append("REF.truth")

            if "ALT" in all_columns:
                first_columns.append("ALT")

            if "ALT.truth" in all_columns:
                first_columns.append("ALT.truth")

            ordered_columns = first_columns + sorted([x for x in all_columns if x not in first_columns])
            featuretable = featuretable[ordered_columns]
            # make sure positions are integers
            featuretable["POS"] = featuretable["POS"].astype(int)

            logging.info("Saving feature table...")
            featuretable.to_csv(args.output + ".features.csv", float_format='%.8f')

            if args.roc is not None:
                roc_table = args.roc.from_table(featuretable)
                roc_table.to_csv(args.output + ".roc.csv", float_format='%.8f')

            featuretable["FILTER"].fillna("", inplace=True)
            featuretable.ix[featuretable["REF"].str.len() < 1, "absent"] = True
            featuretable.ix[featuretable["tag"] == "FN", "REF"] = featuretable.ix[featuretable["tag"] == "FN",
                                                                                  "REF.truth"]
            featuretable.ix[featuretable["tag"] == "FN", "ALT"] = featuretable.ix[featuretable["tag"] == "FN",
                                                                                  "ALT.truth"]
            af_t_feature = args.af_strat_truth
            af_q_feature = args.af_strat_query
            for vtype in ["records", "SNVs", "indels"]:
                if vtype == "SNVs":
                    featuretable_this_type = featuretable[(featuretable["REF"].str.len() > 0) &
                                                          (featuretable["ALT"].str.len() ==
                                                           featuretable["REF"].str.len())]
                elif vtype == "indels":
                    featuretable_this_type = featuretable[(featuretable["REF"].str.len() != 1) |
                                                          (featuretable["ALT"].str.len() != 1)]
                else:
                    featuretable_this_type = featuretable

                if args.count_filtered_fn:
                    res.ix[res["type"] == vtype, "fp.filtered"] = featuretable_this_type[
                        (featuretable_this_type["tag"] == "FP") & (featuretable_this_type["FILTER"] != "")].shape[0]
                    res.ix[res["type"] == vtype, "tp.filtered"] = featuretable_this_type[
                        (featuretable_this_type["tag"] == "TP") & (featuretable_this_type["FILTER"] != "")].shape[0]
                    res.ix[res["type"] == vtype, "unk.filtered"] = featuretable_this_type[
                        (featuretable_this_type["tag"] == "UNK") & (featuretable_this_type["FILTER"] != "")].shape[0]
                    res.ix[res["type"] == vtype, "ambi.filtered"] = featuretable_this_type[
                        (featuretable_this_type["tag"] == "AMBI") & (featuretable_this_type["FILTER"] != "")].shape[0]

                if args.af_strat:
                    start = 0.0
                    current_binsize = args.af_strat_binsize[0]
                    next_binsize = 0
                    while start < 1.0:
                        # include 1 in last interval
                        end = min(1.000000001, start + current_binsize)
                        n_tp = featuretable_this_type[(featuretable_this_type["tag"] == "TP") &
                                                      (featuretable_this_type[af_t_feature] >= start) &
                                                      (featuretable_this_type[af_t_feature] < end)]
                        n_fn = featuretable_this_type[(featuretable_this_type["tag"] == "FN") &
                                                      (featuretable_this_type[af_t_feature] >= start) &
                                                      (featuretable_this_type[af_t_feature] < end)]
                        n_fp = featuretable_this_type[(featuretable_this_type["tag"] == "FP") &
                                                      (featuretable_this_type[af_q_feature] >= start) &
                                                      (featuretable_this_type[af_q_feature] < end)]
                        n_ambi = featuretable_this_type[(featuretable_this_type["tag"] == "AMBI") &
                                                        (featuretable_this_type[af_q_feature] >= start) &
                                                        (featuretable_this_type[af_q_feature] < end)]
                        n_unk = featuretable_this_type[(featuretable_this_type["tag"] == "UNK") &
                                                       (featuretable_this_type[af_q_feature] >= start) &
                                                       (featuretable_this_type[af_q_feature] < end)]

                        r = {"type": "%s.%f-%f" % (vtype, start, end),
                             "total.truth": n_tp.shape[0] + n_fn.shape[0],
                             "total.query": n_tp.shape[0] + n_fp.shape[0] + n_ambi.shape[0] + n_unk.shape[0],
                             "tp": n_tp.shape[0],
                             "fp": n_fp.shape[0],
                             "fn": n_fn.shape[0],
                             "unk": n_unk.shape[0],
                             "ambi": n_ambi.shape[0], }

                        if args.count_filtered_fn:
                            r["fp.filtered"] = n_fp[n_fp["FILTER"] != ""].shape[0]
                            r["tp.filtered"] = n_tp[n_tp["FILTER"] != ""].shape[0]
                            r["unk.filtered"] = n_unk[n_unk["FILTER"] != ""].shape[0]
                            r["ambi.filtered"] = n_ambi[n_ambi["FILTER"] != ""].shape[0]

                        res = pandas.concat([res, pandas.DataFrame([r])])

                        if args.roc is not None and (n_tp.shape[0] + n_fn.shape[0] + n_fp.shape[0]) > 0:
                            roc_table_strat = args.roc.from_table(pandas.concat([n_tp, n_fp, n_fn]))
                            rtname = "%s.%s.%f-%f.roc.csv" % (args.output, vtype, start, end)
                            roc_table_strat.to_csv(rtname, float_format='%.8f')
                        start += current_binsize
                        next_binsize += 1
                        if next_binsize >= len(args.af_strat_binsize):
                            next_binsize = 0
                        current_binsize = args.af_strat_binsize[next_binsize]

        # remove things where we haven't seen any variants in truth and query
        res = res[(res["total.truth"] > 0) & (res["total.query"] > 0)]
        # summary metrics with confidence intervals
        ci_alpha = 1.0 - args.ci_level
        recall = binomialCI(res["tp"], res["tp"]+res["fn"], ci_alpha)
        precision = binomialCI(res["tp"], res["tp"]+res["fp"], ci_alpha)
        res["recall"], res["recall_lower"], res["recall_upper"] = recall
        res["recall2"] = res["tp"] / (res["total.truth"])
        res["precision"], res["precision_lower"], res["precision_upper"] = precision
        res["na"] = res["unk"] / (res["total.query"])
        res["ambiguous"] = res["ambi"] / res["total.query"]

        any_fp = fpclasses.countbases(label="FP")

        fp_region_count = 0
        auto_size = True
        if args.fpr_size:
            try:
                fp_region_count = int(args.fpr_size)
                auto_size = False
            except:
                pass
        if auto_size:
            if any_fp:
                if args.location:
                    chrom, _, rest = args.location.partition(":")
                    if rest:
                        start, _, end = rest.partition("_")
                        if start:
                            start = int(start)
                        if end:
                            end = int(end)
                    else:
                        fp_region_count += fpclasses.countbases(chrom, label="FP")
                else:
                    fp_region_count = any_fp
            else:
                cs = fastaContigLengths(args.ref)
                if args.location:
                    fp_region_count = calculateLength(cs, args.location)
                else:
                    # use all locations we saw calls on
                    h1 = Tools.vcfextract.extractHeadersJSON(ntpath)
                    h1_chrs = h1["tabix"]["chromosomes"]
                    if not h1_chrs:
                        logging.warn("ntpath is empty")
                        h1_chrs = []

                    h2 = Tools.vcfextract.extractHeadersJSON(nqpath)
                    h2_chrs = h2["tabix"]["chromosomes"]
                    if not h2_chrs:
                        logging.warn("nqpath is empty")
                        h2_chrs = []

                    combined_chrs = list(set(h1_chrs + h2_chrs))
                    if len(combined_chrs) > 0:
                        qlocations = " ".join(combined_chrs)
                        fp_region_count = calculateLength(cs, qlocations)
                    else:
                        fp_region_count = 0

        res["fp.region.size"] = fp_region_count
        res["fp.rate"] = 1e6 * res["fp"] / res["fp.region.size"]

        if args.count_filtered_fn:
            res["recall.filtered"] = (res["tp"] - res["tp.filtered"]) / (res["tp"] + res["fn"])

            res["precision.filtered"] = (res["tp"] - res["tp.filtered"]) / (res["tp"] - res["tp.filtered"] +
                                                                            res["fp"] - res["fp.filtered"])

            res["fp.rate.filtered"] = 1e6 * (res["fp"] - res["fp.filtered"]) / res["fp.region.size"]

            res["na.filtered"] = (res["unk"] - res["unk.filtered"]) / (res["total.query"])
            res["ambiguous.filtered"] = (res["ambi"] - res["ambi.filtered"]) / res["total.query"]

        # HAP-162 remove inf values
        res.replace([np.inf, -np.inf], 0)
        metrics_output["metrics"].append(dataframeToMetricsTable("result", res))
        vstring = "som.py-%s" % Tools.version

        logging.info("\n" + res.to_string())
        # in default mode, print result summary to stdout
        if not args.quiet and not args.verbose:
            print "\n" + res.to_string()

        res["sompyversion"] = vstring

        vstring = " ".join(sys.argv)
        res["sompycmd"] = vstring
        res.to_csv(args.output + ".stats.csv")
        with open(args.output + ".metrics.json", "w") as fp:
            json.dump(metrics_output, fp)

    finally:
        if args.delete_scratch:
            shutil.rmtree(scratch)
        else:
            logging.info("Scratch kept at %s" % scratch)

Example #11

Show file

File: quantify.py Project: pkrusche/hap.py

def run_quantify(
    filename,
    output_file=None,
    write_vcf=False,
    regions=None,
    reference=Tools.defaultReference(),
    locations=None,
    threads=1,
    output_vtc=False,
    output_rocs=False,
    qtype=None,
    roc_file=None,
    roc_val=None,
    roc_filter=None,
    roc_delta=None,
    clean_info=True,
    strat_fixchr=False,
):
    """Run quantify and return parsed JSON

    :param filename: the VCF file name
    :param output_file: output file name (if None, will use a temp file)
    :param write_vcf: write annotated VCF (give filename)
    :type write_vcf: str
    :param regions: dictionary of stratification region names and file names
    :param reference: reference fasta path
    :param locations: a location to use
    :param output_vtc: enable / disable the VTC field
    :param output_rocs: enable / disable output of ROCs by QQ level
    :param roc_file: filename for a TSV file with ROC observations
    :param roc_val: field to use for ROC QQ
    :param roc_filter: ROC filtering settings
    :param roc_delta: ROC minimum spacing between levels
    :param clean_info: remove unused INFO fields
    :param strat_fixchr: fix chr naming in stratification regions
    :returns: parsed counts JSON
    """

    if not output_file:
        output_file = tempfile.NamedTemporaryFile().name

    run_str = "quantify '%s' -o '%s'" % (filename.replace(" ", "\\ "), output_file)
    run_str += " -r '%s'" % reference.replace(" ", "\\ ")
    run_str += " --threads %i" % threads

    if output_vtc:
        run_str += " --output-vtc 1"
    else:
        run_str += " --output-vtc 0"

    if output_rocs:
        run_str += " --output-rocs 1"
    else:
        run_str += " --output-rocs 0"

    if qtype:
        run_str += " --type %s" % qtype

    if roc_file:
        run_str += " --output-roc %s" % roc_file

    if roc_val:
        run_str += " --qq %s" % roc_val

    if roc_filter:
        run_str += " --roc-filter '%s'" % roc_filter

    if roc_delta:
        run_str += " --roc-delta %f" % roc_delta

    if clean_info:
        run_str += " --clean-info 1"
    else:
        run_str += " --clean-info 0"

    if strat_fixchr:
        run_str += " --fix-chr-regions 1"
    else:
        run_str += " --fix-chr-regions 0"

    if write_vcf:
        if not write_vcf.endswith(".vcf.gz") and not write_vcf.endswith(".bcf"):
            write_vcf += ".vcf.gz"
        run_str += " -v '%s'" % write_vcf

    if regions:
        for k, v in regions.iteritems():
            run_str += " -R '%s:%s'" % (k, v)

    location_file = None
    if locations:
        location_file = _locations_tmp_bed_file(locations)
        run_str += " --only '%s'" % location_file

    tfe = tempfile.NamedTemporaryFile(delete=False, prefix="stderr", suffix=".log")
    tfo = tempfile.NamedTemporaryFile(delete=False, prefix="stdout", suffix=".log")

    logging.info("Running '%s'" % run_str)

    try:
        subprocess.check_call(run_str, shell=True, stdout=tfo, stderr=tfe)
    except:
        tfo.close()
        tfe.close()
        with open(tfo.name) as f:
            for l in f:
                logging.error("[stdout] " + l.replace("\n", ""))
        os.unlink(tfo.name)
        with open(tfe.name) as f:
            for l in f:
                logging.error("[stderr] " + l.replace("\n", ""))
        os.unlink(tfe.name)
        if location_file:
            os.unlink(location_file)
        raise

    tfo.close()
    tfe.close()
    with open(tfo.name) as f:
        for l in f:
            logging.info("[stdout] " + l.replace("\n", ""))
    os.unlink(tfo.name)
    with open(tfe.name) as f:
        for l in f:
            logging.info("[stderr] " + l.replace("\n", ""))
    os.unlink(tfe.name)
    if location_file:
        os.unlink(location_file)

    if write_vcf and write_vcf.endswith(".bcf"):
        runBcftools("index", write_vcf)
    else:
        to_run = "tabix -p vcf '%s'" % write_vcf
        logging.info("Running '%s'" % to_run)
        subprocess.check_call(to_run, shell=True)

Example #12

Show file

File: som.py Project: jaredo/hap.py

def main():
    parser = argparse.ArgumentParser("Somatic Comparison")

    parser.add_argument("truth", help="Truth VCF file")
    parser.add_argument("query", help="Query VCF file")

    parser.add_argument(
        "-o",
        "--output",
        dest="output",
        required=True,
        help="Output file prefix for statistics and feature table (when selected)",
    )

    parser.add_argument("-l", "--location", dest="location", default="", help="Location for bcftools view (e.g. chr1)")

    parser.add_argument(
        "-R",
        "--restrict-regions",
        dest="regions_bedfile",
        default=None,
        type=str,
        help="Restrict analysis to given (sparse) regions (using -R in bcftools).",
    )

    parser.add_argument(
        "-T",
        "--target-regions",
        dest="targets_bedfile",
        default=None,
        type=str,
        help="Restrict analysis to given (dense) regions (using -T in bcftools).",
    )

    parser.add_argument(
        "-f", "--false-positives", dest="FP", help="False-positive region bed file to distinguish UNK from FP"
    )

    parser.add_argument(
        "-a",
        "--ambiguous",
        dest="ambi",
        action="append",
        help="Ambiguous region bed file(s) to distinguish from FP (e.g. variant only observed " "in some replicates)",
    )

    parser.add_argument(
        "--ambiguous-fp",
        dest="ambi_fp",
        action="store_true",
        default=False,
        help="Use FP calls from ambiguous region files also.",
    )

    parser.add_argument(
        "-e",
        "--explain_ambiguous",
        dest="explain_ambiguous",
        required=False,
        default=False,
        action="store_true",
        help="print a table giving the number of ambiguous events per category",
    )

    parser.add_argument(
        "-r", "--reference", dest="ref", default=Tools.defaultReference(), help="Specify a reference file."
    )

    parser.add_argument(
        "--scratch-prefix", dest="scratch_prefix", default=None, help="Filename prefix for scratch report output."
    )

    parser.add_argument(
        "--keep-scratch",
        dest="delete_scratch",
        default=True,
        action="store_false",
        help="Filename prefix for scratch report output.",
    )

    parser.add_argument(
        "--continue",
        dest="cont",
        default=False,
        action="store_true",
        help="Continue from scratch space (i.e. use VCFs in there if they already exist).",
    )

    parser.add_argument(
        "-P",
        "--include-nonpass",
        dest="inc_nonpass",
        action="store_true",
        default=False,
        help="Use to include failing variants in comparison.",
    )

    parser.add_argument(
        "--feature-table",
        dest="features",
        default=False,
        choices=Somatic.FeatureSet.sets.keys(),
        help="Select a feature table to output.",
    )

    parser.add_argument(
        "--bam",
        dest="bams",
        default=[],
        action="append",
        help="pass one or more BAM files for feature table extraction",
    )

    parser.add_argument(
        "--normalize-truth",
        dest="normalize_truth",
        default=False,
        action="store_true",
        help="Enable running of bcftools norm on the truth file.",
    )

    parser.add_argument(
        "--normalize-query",
        dest="normalize_query",
        default=False,
        action="store_true",
        help="Enable running of bcftools norm on the query file.",
    )

    parser.add_argument(
        "-N",
        "--normalize-all",
        dest="normalize_all",
        default=False,
        action="store_true",
        help="Enable running of bcftools norm on both truth and query file.",
    )

    parser.add_argument(
        "--fix-chr-query",
        dest="fixchr_query",
        default=False,
        action="store_true",
        help="Replace numeric chromosome names in the query by chr*-type names",
    )

    parser.add_argument(
        "--fix-chr-truth",
        dest="fixchr_truth",
        default=False,
        action="store_true",
        help="Replace numeric chromosome names in the truth by chr*-type names",
    )

    parser.add_argument(
        "--no-order-check",
        dest="disable_order_check",
        default=False,
        action="store_true",
        help="Disable checking the order of TP features (dev feature).",
    )

    parser.add_argument(
        "--roc",
        dest="roc",
        default=None,
        choices=ROC.list(),
        help="Create a ROC-style table. This is caller specific " " - this will override the --feature-table switch!",
    )

    parser.add_argument(
        "--logfile", dest="logfile", default=None, help="Write logging information into file rather than to stderr"
    )

    verbosity_options = parser.add_mutually_exclusive_group(required=False)

    verbosity_options.add_argument(
        "--verbose",
        dest="verbose",
        default=False,
        action="store_true",
        help="Raise logging level from warning to info.",
    )

    verbosity_options.add_argument(
        "--quiet", dest="quiet", default=False, action="store_true", help="Set logging level to output errors only."
    )

    args = parser.parse_args()

    if args.verbose:
        loglevel = logging.INFO
    elif args.quiet:
        loglevel = logging.ERROR
    else:
        loglevel = logging.WARNING

    # reinitialize logging
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)
    logging.basicConfig(filename=args.logfile, format="%(asctime)s %(levelname)-8s %(message)s", level=loglevel)

    if args.normalize_all:
        args.normalize_truth = True
        args.normalize_query = True

    if args.roc:
        args.roc = ROC.make(args.roc)
        args.features = args.roc.ftname

    if args.scratch_prefix:
        scratch = os.path.abspath(args.scratch_prefix)
        args.delete_scratch = False
        Tools.mkdir_p(scratch)
    else:
        scratch = tempfile.mkdtemp()

    logging.info("Scratch path is %s" % scratch)
    try:
        bams = []
        md = None
        for x in args.bams:
            bams.append(bamStats(x))

        if bams:
            bres = pandas.concat(bams).groupby("CHROM").mean()

            md = {}

            for x in bres.index:
                logging.info("Mean coverage on %s is %f" % (x, bres.loc[x]["COVERAGE"]))
                md[x] = float(bres.loc[x]["COVERAGE"]) * 3.0

        logging.info("Normalizing/reading inputs")

        ntpath = os.path.join(scratch, "normalized_truth.vcf.gz")

        if not (args.cont and os.path.exists(ntpath)):
            preprocessVCF(
                args.truth,
                ntpath,
                args.location,
                True,  # pass_only
                args.fixchr_truth,  # chrprefix
                args.normalize_truth,  # norm,
                args.regions_bedfile,
                args.targets_bedfile,
                args.ref,
            )
        else:
            logging.info("Continuing from %s" % ntpath)

        if not (args.cont and os.path.exists(ntpath + ".csi")):
            runBcftools("index", ntpath)

        nqpath = os.path.join(scratch, "normalized_query.vcf.gz")

        if not (args.cont and os.path.exists(nqpath)):
            preprocessVCF(
                args.query,
                nqpath,
                args.location,
                not args.inc_nonpass,  # pass_only
                args.fixchr_query,  # chrprefix
                args.normalize_query,  # norm,
                args.regions_bedfile,
                args.targets_bedfile,
                args.ref,
            )
        else:
            logging.info("Continuing from %s" % nqpath)

        if not (args.cont and os.path.exists(nqpath + ".csi")):
            runBcftools("index", nqpath)

        logging.info("Intersecting")

        tpfn_files = all(
            [
                os.path.exists(os.path.join(scratch, "tpfn", "0000.vcf.gz")),
                os.path.exists(os.path.join(scratch, "tpfn", "0001.vcf.gz")),
                os.path.exists(os.path.join(scratch, "tpfn", "0002.vcf.gz")),
            ]
        )

        tpfn_r_files = all(
            [
                os.path.exists(os.path.join(scratch, "tpfn", "0000.vcf.gz")),
                os.path.exists(os.path.join(scratch, "tpfn", "0001.vcf.gz")),
                os.path.exists(os.path.join(scratch, "tpfn", "0002.vcf.gz")),
            ]
        )

        if not (args.cont and tpfn_files):
            runBcftools("isec", ntpath, nqpath, "-p", os.path.join(scratch, "tpfn"), "-O", "z")
        else:
            logging.info("Continuing from %s" % os.path.join(scratch, "tpfn"))

        if args.features and not (args.cont and tpfn_r_files):
            # only need to do this for getting the feature table
            runBcftools("isec", nqpath, ntpath, "-p", os.path.join(scratch, "tpfn_r"), "-O", "z")

        logging.info("Getting FPs / Ambi / Unk")

        fppath = os.path.join(scratch, "fp.vcf.gz")
        unkpath = os.path.join(scratch, "unk.vcf.gz")
        ambipath = os.path.join(scratch, "ambi.vcf.gz")

        # get header to print to unk and ambi VCFs
        rununiquepath = os.path.join(scratch, "tpfn", "0001.vcf.gz")
        header = runBcftools("view", rununiquepath, "--header-only")

        fp = Tools.BGZipFile(fppath, True)
        fp.write(header)

        unk = Tools.BGZipFile(unkpath, True)
        unk.write(header)

        ambi = Tools.BGZipFile(ambipath, True)
        ambi.write(header)

        ambiClasses = Counter()
        ambiReasons = Counter()

        fpclasses = BedIntervalTree()
        if args.ambi:
            # can have multiple ambiguous BED files
            for aBED in args.ambi:
                # auto-label from first value after chr start end
                # new ambi files have the label in position 4
                # old ones will look weird here.
                fpclasses.addFromBed(aBED, lambda xe: xe[4])

        if args.FP:
            fpclasses.addFromBed(args.FP, "FP")

        has_fp = (fpclasses.count("FP") > 0) or (fpclasses.count("fp") > 0 and args.ambi_fp)

        # split VCF into FP, UNK and AMBI
        toProcess = gzip.open(rununiquepath, "rb")
        for entry in toProcess:
            if entry[0] == "#":
                continue

            fields = entry.strip().split("\t")
            chrom = fields[0]
            start = int(fields[1])
            stop = int(fields[1]) + len(fields[3])

            overlap = fpclasses.intersect(chrom, start, stop)

            is_fp = False
            is_ambi = False

            classes_this_pos = set()

            for o in overlap:
                reason = o.value[0]
                if reason == "fp" and args.ambi_fp:
                    reason = "FP"
                elif reason == "fp":
                    reason = "ambi-fp"
                elif reason == "unk":
                    reason = "ambi-unk"

                classes_this_pos.add(reason)
                try:
                    ambiReasons["%s: rep. count %s" % (reason, o.value[1])] += 1
                except IndexError:
                    ambiReasons["%s: rep. count *" % reason] += 1
                for x in o.value[3:]:
                    ambiReasons["%s: %s" % (reason, x)] += 1
                if reason == "FP":
                    is_fp = True
                else:
                    is_ambi = True

            for reason in classes_this_pos:
                ambiClasses[reason] += 1

            if is_fp:
                fp.write(entry)
            elif is_ambi:
                ambi.write(entry)
            elif not has_fp:
                # when we don't have FP regions, unk stuff becomes FP
                fp.write(entry)
            else:
                unk.write(entry)

        toProcess.close()

        # since 0001.vcf.gz should already be sorted, we can just convert to bgzipped vcf
        # and create index
        fp.close()
        ambi.close()
        unk.close()

        runBcftools("index", "--tbi", fppath)
        runBcftools("index", "--tbi", unkpath)
        runBcftools("index", "--tbi", ambipath)

        logging.info("Counting variants...")

        truthcounts = parseStats(runBcftools("stats", ntpath), "total.truth")
        querycounts = parseStats(runBcftools("stats", nqpath), "total.query")

        tpcounts = parseStats(runBcftools("stats", os.path.join(scratch, "tpfn", "0002.vcf.gz")), "tp")
        fncounts = parseStats(runBcftools("stats", os.path.join(scratch, "tpfn", "0000.vcf.gz")), "fn")
        fpcounts = parseStats(runBcftools("stats", fppath), "fp")
        ambicounts = parseStats(runBcftools("stats", ambipath), "ambi")
        unkcounts = parseStats(runBcftools("stats", unkpath), "unk")

        res = pandas.merge(truthcounts, querycounts, on="type")
        res = pandas.merge(res, tpcounts, on="type")
        res = pandas.merge(res, fpcounts, on="type")
        res = pandas.merge(res, fncounts, on="type")
        res = pandas.merge(res, unkcounts, on="type")
        res = pandas.merge(res, ambicounts, on="type")

        # no explicit guarantee that total.query is equal to unk + ambi + fp + tp
        # testSum = res["fp"] + res["tp"] + res["unk"] + res["ambi"]

        # filter and relabel
        res = res[res["type"] != "samples"]
        res = res[res["type"] != "multiallelic SNP sites"]
        res = res[res["type"] != "multiallelic sites"]
        res.loc[res["type"] == "SNPs", "type"] = "SNVs"

        res = res[(res["total.truth"] > 0) | (res["total.query"] > 0)]

        # use this to use plain row counts rather than stratified bcftools counts
        # truthcounts = countVCFRows(ntpath) # , "total.truth")
        # querycounts = countVCFRows(nqpath) # , "total.query")
        #
        # tpcounts = countVCFRows(os.path.join(scratch, "tpfn", "0002.vcf.gz"))  #, "tp")
        # fncounts = countVCFRows(os.path.join(scratch, "tpfn", "0000.vcf.gz"))  #, "fn")
        # fpcounts = countVCFRows(fppath)  #, "fp")
        # ambicounts = countVCFRows(ambipath)  #, "ambi")
        # unkcounts = countVCFRows(unkpath)  #, "unk")
        #
        # res = pandas.DataFrame({
        #     "total.truth" : [ truthcounts ],
        #     "total.query" : [ querycounts ],
        #     "tp" : [ tpcounts ],
        #     "fn" : [ fncounts ],
        #     "fp" : [ fpcounts ],
        #     "ambi" : [ ambicounts ],
        #     "unk" : [ unkcounts ]
        # })
        #
        # res["type"] = "records"

        # summary metrics
        res["recall"] = res["tp"] / (res["tp"] + res["fn"])
        res["recall2"] = res["tp"] / (res["total.truth"])
        res["precision"] = res["tp"] / (res["tp"] + res["fp"])
        res["na"] = res["unk"] / (res["total.query"])
        res["ambiguous"] = res["ambi"] / res["total.query"]

        metrics_output = makeMetricsObject("som.py.comparison")
        metrics_output["metrics"].append(dataframeToMetricsTable("result", res))
        vstring = "som.py-%s" % Tools.version

        logging.info("\n" + res.to_string())
        # in default mode, print result summary to stdout
        if not args.quiet and not args.verbose:
            print "\n" + res.to_string()

        res["sompyversion"] = vstring

        vstring = " ".join(sys.argv)
        res["sompycmd"] = vstring

        if args.ambi and args.explain_ambiguous:
            ac = list(ambiClasses.iteritems())
            if ac:
                ambie = pandas.DataFrame(ac, columns=["class", "count"])
                ambie.sort(["class"], inplace=True)
                pandas.set_option("display.max_rows", 1000)
                pandas.set_option("display.max_columns", 1000)
                pandas.set_option("display.width", 1000)
                pandas.set_option("display.height", 1100)
                logging.info(
                    "FP/ambiguity classes with info (multiple classes can " "overlap):\n" + ambie.to_string(index=False)
                )
                # in default mode, print result summary to stdout
                if not args.quiet and not args.verbose:
                    print "FP/ambiguity classes with info (multiple classes can " "overlap):\n" + ambie.to_string(
                        index=False
                    )
                ambie.to_csv(args.output + ".ambiclasses.csv")
                metrics_output["metrics"].append(dataframeToMetricsTable("ambiclasses", ambie))
            else:
                logging.info("No ambiguous variants.")

            ar = list(ambiReasons.iteritems())
            if ar:
                ambie = pandas.DataFrame(ar, columns=["reason", "count"])
                ambie.sort(["reason"], inplace=True)
                pandas.set_option("display.max_rows", 1000)
                pandas.set_option("display.max_columns", 1000)
                pandas.set_option("display.width", 1000)
                pandas.set_option("display.height", 1100)
                logging.info(
                    "Reasons for defining as ambiguous (multiple reasons can overlap):\n"
                    + ambie.to_string(
                        formatters={"reason": "{{:<{}s}}".format(ambie["reason"].str.len().max()).format}, index=False
                    )
                )
                # in default mode, print result summary to stdout
                if not args.quiet and not args.verbose:
                    print "Reasons for defining as ambiguous (multiple reasons can overlap):\n" + ambie.to_string(
                        formatters={"reason": "{{:<{}s}}".format(ambie["reason"].str.len().max()).format}, index=False
                    )
                ambie.to_csv(args.output + ".ambireasons.csv")
                metrics_output["metrics"].append(dataframeToMetricsTable("ambireasons", ambie))
            else:
                logging.info("No ambiguous variants.")

        res.to_csv(args.output + ".stats.csv")

        with open(args.output + ".metrics.json", "w") as fp:
            json.dump(metrics_output, fp)

        if args.features:
            logging.info("Extracting features...")
            fset = Somatic.FeatureSet.make(args.features)
            fset.setChrDepths(md)

            logging.info("Collecting TP info (1)...")
            tps = fset.collect(os.path.join(scratch, "tpfn", "0002.vcf.gz"), "TP")

            # TP_r is a hint for fset, they are both TPs
            logging.info("Collecting TP info (2)...")
            tps2 = fset.collect(os.path.join(scratch, "tpfn_r", "0002.vcf.gz"), "TP_r")

            # this is slow because it tries to sort
            # ... which we don't need to do since tps1 and tps2 have the same ordering

            logging.info("Sorting...")
            tps.sort(["CHROM", "POS"], inplace=True)
            tps2.sort(["CHROM", "POS"], inplace=True)
            tps = tps.reset_index(drop=True)
            tps2 = tps2.reset_index(drop=True)

            logging.info("Merging TP info...")
            columns_tps = list(tps)
            columns_tps2 = list(tps2)

            len1 = tps.shape[0]
            len2 = tps.shape[0]

            if len1 != len2:
                raise Exception("Cannot read TP features, lists have different lengths : %i != %i" % (len1, len2))

            if not args.disable_order_check:
                logging.info("Checking order %i / %i" % (len1, len2))

                for x in xrange(0, len1):
                    for a in ["CHROM", "POS"]:
                        if tps.loc[x][a] != tps2.loc[x][a]:
                            raise Exception(
                                "Cannot merge TP features, inputs are out of order at %s / %s"
                                % (str(tps[x : x + 1]), str(tps2[x : x + 1]))
                            )

            logging.info("Merging...")

            cdata = {"CHROM": tps["CHROM"], "POS": tps["POS"], "tag": tps["tag"]}

            tpc = pandas.DataFrame(cdata, columns=["CHROM", "POS", "tag"])

            all_columns = list(set(columns_tps + columns_tps2))
            for a in all_columns:
                if a in columns_tps and not a in columns_tps2:
                    tpc[a] = tps[a]
                elif not a in columns_tps and a in columns_tps2:
                    tpc[a] = tps2[a]
                elif a not in ["CHROM", "POS", "tag"]:
                    tpc[a] = tps2[a]
                    tpc[a + ".truth"] = tps[a]

            logging.info("Collecting FP info...")
            fps = fset.collect(fppath, "FP")
            ambs = fset.collect(fppath, "AMBI")
            unks = fset.collect(fppath, "UNK")

            logging.info("Collecting FN info...")
            fns = fset.collect(os.path.join(scratch, "tpfn", "0000.vcf.gz"), "FN")

            renamed = {}
            tp_cols = list(tpc)
            for col in list(fns):
                if col + ".truth" in tp_cols:
                    renamed[col] = col + ".truth"
            fns.rename(columns=renamed, inplace=True)

            featurelist = [tpc, fps, fns, ambs, unks]

            if unkpath is not None:
                logging.info("Collecting UNK info...")
                unk = fset.collect(unkpath, "UNK")
                featurelist.append(unk)

            logging.info("Making feature table...")
            featuretable = pandas.concat(featurelist)

            # reorder to make more legible
            first_columns = ["CHROM", "POS", "tag"]
            all_columns = list(featuretable)

            if "REF" in all_columns:
                first_columns.append("REF")

            if "REF.truth" in all_columns:
                first_columns.append("REF.truth")

            if "ALT" in all_columns:
                first_columns.append("ALT")

            if "ALT.truth" in all_columns:
                first_columns.append("ALT.truth")

            ordered_columns = first_columns + sorted([x for x in all_columns if x not in first_columns])
            featuretable = featuretable[ordered_columns]
            # make sure positions are integers
            featuretable["POS"] = featuretable["POS"].astype(int)

            logging.info("Saving feature table...")
            featuretable.to_csv(args.output + ".features.csv", float_format="%.8f")

            if args.roc is not None:
                roc_table = args.roc.from_table(featuretable)
                roc_table.to_csv(args.output + ".roc.csv", float_format="%.8f")

    finally:
        if args.delete_scratch:
            shutil.rmtree(scratch)
        else:
            logging.info("Scratch kept at %s" % scratch)

Example #13

Show file

File: hap.py Project: pkrusche/hap.py

def main():
    parser = argparse.ArgumentParser("Haplotype Comparison")

    # input
    parser.add_argument("-v", "--version", dest="version", action="store_true",
                        help="Show version number and exit.")

    parser.add_argument("-r", "--reference", dest="ref", default=None, help="Specify a reference file.")

    # output
    parser.add_argument("-o", "--report-prefix", dest="reports_prefix",
                        default=None,
                        help="Filename prefix for report output.")
    parser.add_argument("--scratch-prefix", dest="scratch_prefix",
                        default=None,
                        help="Directory for scratch files.")
    parser.add_argument("--keep-scratch", dest="delete_scratch",
                        default=True, action="store_false",
                        help="Filename prefix for scratch report output.")


    # add quantification args
    qfy.updateArgs(parser)

    # control preprocessing
    pre.updateArgs(parser)
    parser.add_argument("--preprocess-truth", dest="preprocessing_truth", action="store_true", default=False,
                        help="Preprocess truth file with same settings as query (default is to accept truth in original format).")
    parser.add_argument("--usefiltered-truth", dest="usefiltered_truth", action="store_true", default=False,
                        help="Preprocess truth file with same settings as query (default is to accept truth in original format).")
    parser.add_argument("--preprocessing-window-size", dest="preprocess_window",
                        default=10000, type=int,
                        help="Preprocessing window size (variants further apart than that size are not expected to interfere).")

    # detailed control of comparison
    parser.add_argument("--unhappy", "--no-haplotype-comparison", dest="no_hc", action="store_true", default=False,
                        help="Disable haplotype comparison (only count direct GT matches as TP).")

    parser.add_argument("-w", "--window-size", dest="window",
                        default=50, type=int,
                        help="Minimum distance between variants such that they fall into the same superlocus.")

    # xcmp-specific stuff
    parser.add_argument("--xcmp-enumeration-threshold", dest="max_enum",
                        default=16768, type=int,
                        help="Enumeration threshold / maximum number of sequences to enumerate per block.")

    parser.add_argument("--xcmp-expand-hapblocks", dest="hb_expand",
                        default=30, type=int,
                        help="Expand haplotype blocks by this many basepairs left and right.")
    parser.add_argument("--threads", dest="threads",
                        default=multiprocessing.cpu_count(), type=int,
                        help="Number of threads to use.")

    parser.add_argument("--engine", dest="engine",
                        default="xcmp", choices=["xcmp", "vcfeval"],
                        help="Comparison engine to use.")

    parser.add_argument("--engine-vcfeval-path", dest="engine_vcfeval", required=False,
                        default=Haplo.vcfeval.findVCFEval(),
                        help="This parameter should give the path to the \"rtg\" executable. "
                             "The default is %s" % Haplo.vcfeval.findVCFEval())
    parser.add_argument("--engine-vcfeval-template", dest="engine_vcfeval_template", required=False,
                        help="Vcfeval needs the reference sequence formatted in its own file format "
                             "(SDF -- run rtg format -o ref.SDF ref.fa). You can specify this here "
                             "to save time when running hap.py with vcfeval. If no SDF folder is "
                             "specified, hap.py will create a temporary one.")

    if Tools.has_sge:
        parser.add_argument("--force-interactive", dest="force_interactive",
                            default=False, action="store_true",
                            help="Force running interactively (i.e. when JOB_ID is not in the environment)")

    parser.add_argument("_vcfs", help="Two VCF files.", default=[], nargs="*")

    parser.add_argument("--logfile", dest="logfile", default=None,
                        help="Write logging information into file rather than to stderr")

    verbosity_options = parser.add_mutually_exclusive_group(required=False)

    verbosity_options.add_argument("--verbose", dest="verbose", default=False, action="store_true",
                                   help="Raise logging level from warning to info.")

    verbosity_options.add_argument("--quiet", dest="quiet", default=False, action="store_true",
                                   help="Set logging level to output errors only.")

    args, unknown_args = parser.parse_known_args()

    if not Tools.has_sge:
        args.force_interactive = True

    if args.verbose:
        loglevel = logging.INFO
    elif args.quiet:
        loglevel = logging.ERROR
    else:
        loglevel = logging.WARNING

    # reinitialize logging
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)
    logging.basicConfig(filename=args.logfile,
                        format='%(asctime)s %(levelname)-8s %(message)s',
                        level=loglevel)

    # remove some safe unknown args
    unknown_args = [x for x in unknown_args if x not in ["--force-interactive"]]
    if len(sys.argv) < 2 or len(unknown_args) > 0:
        if unknown_args:
            logging.error("Unknown arguments specified : %s " % str(unknown_args))
        parser.print_help()
        exit(1)

    if args.version:
        print "Hap.py %s" % Tools.version
        exit(0)

    if args.roc:
        args.write_vcf = True

    # sanity-check regions bed file (HAP-57)
    if args.regions_bedfile:
        logging.info("Checking input regions.")
        if bedOverlapCheck(args.regions_bedfile):
            raise Exception("The regions bed file (specified using -R) has overlaps, this will not work with xcmp."
                            " You can either use -T, or run the file through bedtools merge")

    if args.fp_bedfile and not os.path.exists(args.fp_bedfile):
        raise Exception("FP/confident call region bed file does not exist.")

    if not args.force_interactive and "JOB_ID" not in os.environ:
        parser.print_help()
        raise Exception("Please qsub me so I get approximately 1 GB of RAM per thread.")

    if not args.ref:
        args.ref = Tools.defaultReference()

    if not os.path.exists(args.ref):
        raise Exception("Please specify a valid reference path using -r.")

    if not args.reports_prefix:
        raise Exception("Please specify an output prefix using -o ")

    if not os.path.exists(os.path.dirname(os.path.abspath(args.reports_prefix))):
        raise Exception("The output path does not exist. Please specify a valid output path and prefix using -o")

    if os.path.basename(args.reports_prefix) == "" or os.path.isdir(args.reports_prefix):
        raise Exception("The output path should specify a file name prefix. Please specify a valid output path "
                        "and prefix using -o. For example, -o /tmp/test will create files named /tmp/test* .")

    # noinspection PyProtectedMember
    if not args._vcfs or len(args._vcfs) != 2:
        raise Exception("Please specify exactly two input VCFs.")

    # noinspection PyProtectedMember
    args.vcf1 = args._vcfs[0]
    # noinspection PyProtectedMember
    args.vcf2 = args._vcfs[1]

    if not os.path.exists(args.vcf1):
        raise Exception("Input file %s does not exist." % args.vcf1)
    if not os.path.exists(args.vcf2):
        raise Exception("Input file %s does not exist." % args.vcf2)

    tempfiles = []

    # xcmp supports bcf; others don't
    if args.engine == "xcmp" and (args.bcf or (args.vcf1.endswith(".bcf") and args.vcf2.endswith(".bcf"))):
        internal_format_suffix = ".bcf"
    else:
        internal_format_suffix = ".vcf.gz"

    try:
        logging.info("Comparing %s and %s" % (args.vcf1, args.vcf2))

        logging.info("Preprocessing truth: %s" % args.vcf1)
        starttime = time.time()

        ttf = tempfile.NamedTemporaryFile(delete=False,
                                          dir=args.scratch_prefix,
                                          prefix="truth.pp",
                                          suffix=internal_format_suffix)
        ttf.close()
        tempfiles.append(ttf.name)
        tempfiles.append(ttf.name + ".csi")
        tempfiles.append(ttf.name + ".tbi")
        pre.preprocess(args.vcf1,
                       ttf.name,
                       args.ref,
                       args.locations,
                       None if args.usefiltered_truth else "*",  # filters
                       args.fixchr,
                       args.regions_bedfile,
                       args.targets_bedfile,
                       args.preprocessing_leftshift if args.preprocessing_truth else False,
                       args.preprocessing_decompose if args.preprocessing_truth else False,
                       args.preprocessing_norm if args.preprocessing_truth else False,
                       args.preprocess_window,
                       args.threads)

        args.vcf1 = ttf.name
        h1 = vcfextract.extractHeadersJSON(args.vcf1)

        elapsed = time.time() - starttime
        logging.info("preprocess for %s -- time taken %.2f" % (args.vcf1, elapsed))

        # once we have preprocessed the truth file we can resolve the locations
        # doing this here improves the time for query preprocessing below
        reference_contigs = set(fastaContigLengths(args.ref).keys())

        if not args.locations:
            # default set of locations is the overlap between truth and reference
            args.locations = list(reference_contigs & set(h1["tabix"]["chromosomes"]))
            if not args.locations:
                raise Exception("Truth and reference have no chromosomes in common!")
        elif type(args.locations) is not list:
            args.locations = [args.locations]

        args.locations = sorted(args.locations)

        logging.info("Preprocessing query: %s" % args.vcf2)
        starttime = time.time()

        if args.pass_only:
            filtering = "*"
        else:
            filtering = args.filters_only

        qtf = tempfile.NamedTemporaryFile(delete=False,
                                          dir=args.scratch_prefix,
                                          prefix="query.pp",
                                          suffix=internal_format_suffix)
        qtf.close()
        tempfiles.append(qtf.name)
        tempfiles.append(qtf.name + ".csi")
        tempfiles.append(qtf.name + ".tbi")
        pre.preprocess(args.vcf2,
                       qtf.name,
                       args.ref,
                       str(",".join(args.locations)),
                       filtering,
                       args.fixchr,
                       args.regions_bedfile,
                       args.targets_bedfile,
                       args.preprocessing_leftshift,
                       args.preprocessing_decompose,
                       args.preprocessing_norm,
                       args.preprocess_window,
                       args.threads)

        args.vcf2 = qtf.name
        h2 = vcfextract.extractHeadersJSON(args.vcf2)

        elapsed = time.time() - starttime
        logging.info("preprocess for %s -- time taken %.2f" % (args.vcf2, elapsed))

        if not h1["tabix"]:
            raise Exception("Truth file is not indexed after preprocesing.")

        if not h2["tabix"]:
            raise Exception("Query file is not indexed after preprocessing.")

        for _xc in args.locations:
            if _xc not in h2["tabix"]["chromosomes"]:
                logging.warn("No calls for location %s in query!" % _xc)

        pool = getPool(args.threads)
        if args.threads > 1 and args.engine == "xcmp":
            logging.info("Running using %i parallel processes." % args.threads)

            # find balanced pieces
            # cap parallelism at 64 since otherwise bcftools concat below might run out
            # of file handles
            args.pieces = min(args.threads, 64)
            res = runParallel(pool, Haplo.blocksplit.blocksplitWrapper, args.locations, args)

            if None in res:
                raise Exception("One of the blocksplit processes failed.")

            tempfiles += res

            args.locations = []
            for f in res:
                with open(f) as fp:
                    for l in fp:
                        ll = l.strip().split("\t", 3)
                        if len(ll) < 3:
                            continue
                        xchr = ll[0]
                        start = int(ll[1]) + 1
                        end = int(ll[2])
                        args.locations.append("%s:%i-%i" % (xchr, start, end))

        # count variants before normalisation
        if "samples" not in h1 or not h1["samples"]:
            raise Exception("Cannot read sample names from truth VCF file")

        if "samples" not in h2 or not h2["samples"]:
            raise Exception("Cannot read sample names from query VCF file")

        tf = tempfile.NamedTemporaryFile(delete=False,
                                         dir=args.scratch_prefix,
                                         prefix="hap.py.result.",
                                         suffix=internal_format_suffix)
        tf.close()
        tempfiles.append(tf.name)
        tempfiles.append(tf.name + ".tbi")
        tempfiles.append(tf.name + ".csi")
        output_name = tf.name

        if args.engine == "xcmp":
            # do xcmp
            logging.info("Using xcmp for comparison")
            res = runParallel(pool, Haplo.xcmp.xcmpWrapper, args.locations, args)
            tempfiles += [x for x in res if x is not None]  # VCFs

            if None in res:
                raise Exception("One of the xcmp jobs failed.")

            if len(res) == 0:
                raise Exception("Input files/regions do not contain variants (0 haplotype blocks were processed).")

            # concatenate + index
            logging.info("Concatenating variants...")
            runme_list = [x for x in res if x is not None]
            if len(runme_list) == 0:
                raise Exception("No outputs to concatenate!")

            logging.info("Concatenating...")
            bcftools.concatenateParts(output_name, *runme_list)
            logging.info("Indexing...")
            bcftools.runBcftools("index", output_name)
            # passed to quantify
            args.type = "xcmp"
            # xcmp extracts whichever field we're using into the QQ info field
            args.roc = "IQQ"
        elif args.engine == "vcfeval":
            tempfiles += Haplo.vcfeval.runVCFEval(args.vcf1, args.vcf2, output_name, args)
            # passed to quantify
            args.type = "ga4gh"
        else:
            raise Exception("Unknown comparison engine: %s" % args.engine)

        args.in_vcf = [output_name]
        args.runner = "hap.py"
        qfy.quantify(args)

    finally:
        if args.delete_scratch:
            for x in tempfiles:
                try:
                    os.remove(x)
                except:
                    pass
        else:
            logging.info("Scratch files kept : %s" % (str(tempfiles)))

Example #14

Show file

def main():

    args = parse_args()

    if args.scratch_prefix:
        scratch = os.path.abspath(args.scratch_prefix)
        args.delete_scratch = False
        Tools.mkdir_p(scratch)
    else:
        scratch = tempfile.mkdtemp()

    logging.info("Scratch path is %s" % scratch)
    try:
        bams = []
        md = None
        for x in args.bams:
            bams.append(bamStats(x))

        if bams:
            bres = pandas.concat(bams).groupby("CHROM").mean()

            md = {}

            for x in bres.index:
                logging.info("Mean coverage on %s is %f" %
                             (x, bres.loc[x]["COVERAGE"]))
                md[x] = float(bres.loc[x]["COVERAGE"]) * 3.0

        logging.info("Normalizing/reading inputs")

        ntpath = os.path.join(scratch, "normalized_truth.vcf.gz")

        if not (args.cont and os.path.exists(ntpath)):
            preprocessVCF(
                args.truth,
                ntpath,
                args.location,
                True,  # pass_only
                args.fixchr_truth,  # chrprefix
                args.normalize_truth,  # norm,
                args.regions_bedfile,
                args.targets_bedfile,
                args.ref)
        else:
            logging.info("Continuing from %s" % ntpath)

        if not (args.cont and os.path.exists(ntpath + ".csi")):
            runBcftools("index", ntpath)

        nqpath = os.path.join(scratch, "normalized_query.vcf.gz")

        if not (args.cont and os.path.exists(nqpath)):
            preprocessVCF(
                args.query,
                nqpath,
                args.location,
                not args.inc_nonpass,  # pass_only
                args.fixchr_query,  # chrprefix
                args.normalize_query,  # norm,
                args.regions_bedfile,
                args.targets_bedfile,
                args.ref)
        else:
            logging.info("Continuing from %s" % nqpath)

        if not (args.cont and os.path.exists(nqpath + ".csi")):
            runBcftools("index", nqpath)

        logging.info("Intersecting")

        tpfn_files = all([
            os.path.exists(os.path.join(scratch, "tpfn", "0000.vcf.gz")),
            os.path.exists(os.path.join(scratch, "tpfn", "0001.vcf.gz")),
            os.path.exists(os.path.join(scratch, "tpfn", "0002.vcf.gz"))
        ])

        tpfn_r_files = all([
            os.path.exists(os.path.join(scratch, "tpfn", "0000.vcf.gz")),
            os.path.exists(os.path.join(scratch, "tpfn", "0001.vcf.gz")),
            os.path.exists(os.path.join(scratch, "tpfn", "0002.vcf.gz"))
        ])

        if not (args.cont and tpfn_files):
            runBcftools("isec", ntpath, nqpath, "-p",
                        os.path.join(scratch, "tpfn"), "-O", "z")
        else:
            logging.info("Continuing from %s" % os.path.join(scratch, "tpfn"))

        if args.features and not (args.cont and tpfn_r_files):
            # only need to do this for getting the feature table
            runBcftools("isec", nqpath, ntpath, "-p",
                        os.path.join(scratch, "tpfn_r"), "-O", "z")

        logging.info("Getting FPs / Ambi / Unk")

        fppath = os.path.join(scratch, "fp.vcf.gz")
        unkpath = os.path.join(scratch, "unk.vcf.gz")
        ambipath = os.path.join(scratch, "ambi.vcf.gz")

        # get header to print to unk and ambi VCFs
        rununiquepath = os.path.join(scratch, "tpfn", "0001.vcf.gz")
        header = runBcftools("view", rununiquepath, "--header-only")

        fp = Tools.BGZipFile(fppath, True)
        fp.write(header)

        unk = Tools.BGZipFile(unkpath, True)
        unk.write(header)

        ambi = Tools.BGZipFile(ambipath, True)
        ambi.write(header)

        ambiClasses = Counter()
        ambiReasons = Counter()

        fpclasses = BedIntervalTree()
        if args.ambi:
            # can have multiple ambiguous BED files
            for aBED in args.ambi:
                # auto-label from first value after chr start end
                # new ambi files have the label in position 4
                # old ones will look weird here.
                fpclasses.addFromBed(aBED, lambda xe: xe[4], args.fixchr_truth)

        if args.FP:
            fpclasses.addFromBed(args.FP, "FP", args.fixchr_truth)

        # split VCF into FP, UNK and AMBI
        toProcess = gzip.open(rununiquepath, "rb")
        for entry in toProcess:
            if entry[0] == '#':
                continue

            fields = entry.strip().split("\t")
            chrom = fields[0]
            start = int(fields[1])
            stop = int(fields[1]) + len(fields[3])

            overlap = fpclasses.intersect(chrom, start, stop)

            is_fp = False
            is_ambi = False

            classes_this_pos = set()

            for o in overlap:
                reason = o.value[0]
                if reason == "fp" and args.ambi_fp:
                    reason = "FP"
                elif reason == "fp":
                    reason = "ambi-fp"
                elif reason == "unk":
                    reason = "ambi-unk"

                classes_this_pos.add(reason)
                try:
                    ambiReasons["%s: rep. count %s" %
                                (reason, o.value[1])] += 1
                except IndexError:
                    ambiReasons["%s: rep. count *" % reason] += 1
                for x in o.value[3:]:
                    ambiReasons["%s: %s" % (reason, x)] += 1
                if reason == "FP":
                    is_fp = True
                else:
                    is_ambi = True

            for reason in classes_this_pos:
                ambiClasses[reason] += 1

            if is_fp:
                fp.write(entry)
            elif is_ambi:
                ambi.write(entry)
            elif not args.count_unk:
                # when we don't have FP regions, unk stuff becomes FP
                fp.write(entry)
            else:
                unk.write(entry)

        toProcess.close()

        # since 0001.vcf.gz should already be sorted, we can just convert to bgzipped vcf
        # and create index
        fp.close()
        ambi.close()
        unk.close()

        runBcftools("index", "--tbi", fppath)
        runBcftools("index", "--tbi", unkpath)
        runBcftools("index", "--tbi", ambipath)

        logging.info("Counting variants...")

        truthcounts = parseStats(runBcftools("stats", ntpath), "total.truth")
        querycounts = parseStats(runBcftools("stats", nqpath), "total.query")

        tpcounts = parseStats(
            runBcftools("stats", os.path.join(scratch, "tpfn", "0002.vcf.gz")),
            "tp")
        fncounts = parseStats(
            runBcftools("stats", os.path.join(scratch, "tpfn", "0000.vcf.gz")),
            "fn")
        fpcounts = parseStats(runBcftools("stats", fppath), "fp")
        ambicounts = parseStats(runBcftools("stats", ambipath), "ambi")
        unkcounts = parseStats(runBcftools("stats", unkpath), "unk")

        res = pandas.merge(truthcounts, querycounts, on="type")
        res = pandas.merge(res, tpcounts, on="type")
        res = pandas.merge(res, fpcounts, on="type")
        res = pandas.merge(res, fncounts, on="type")
        res = pandas.merge(res, unkcounts, on="type")
        res = pandas.merge(res, ambicounts, on="type")

        # no explicit guarantee that total.query is equal to unk + ambi + fp + tp
        # testSum = res["fp"] + res["tp"] + res["unk"] + res["ambi"]

        # filter and relabel
        res = res[res["type"] != "samples"]
        res = res[res["type"] != "multiallelic SNP sites"]
        res = res[res["type"] != "multiallelic sites"]
        res.loc[res["type"] == "SNPs", "type"] = "SNVs"

        metrics_output = makeMetricsObject("som.py.comparison")

        if args.ambi and args.explain_ambiguous:
            ac = list(ambiClasses.iteritems())
            if ac:
                ambie = pandas.DataFrame(ac, columns=["class", "count"])
                ambie.sort_values(["class"], inplace=True)
                pandas.set_option("display.max_rows", 1000)
                pandas.set_option("display.max_columns", 1000)
                pandas.set_option("display.width", 1000)
                pandas.set_option("display.height", 1100)
                logging.info(
                    "FP/ambiguity classes with info (multiple classes can "
                    "overlap):\n" + ambie.to_string(index=False))
                # in default mode, print result summary to stdout
                if not args.quiet and not args.verbose:
                    print "FP/ambiguity classes with info (multiple classes can " \
                          "overlap):\n" + ambie.to_string(index=False)
                ambie.to_csv(args.output + ".ambiclasses.csv")
                metrics_output["metrics"].append(
                    dataframeToMetricsTable("ambiclasses", ambie))
            else:
                logging.info("No ambiguous variants.")

            ar = list(ambiReasons.iteritems())
            if ar:
                ambie = pandas.DataFrame(ar, columns=["reason", "count"])
                ambie.sort_values(["reason"], inplace=True)
                pandas.set_option("display.max_rows", 1000)
                pandas.set_option("display.max_columns", 1000)
                pandas.set_option("display.width", 1000)
                pandas.set_option("display.height", 1100)
                logging.info(
                    "Reasons for defining as ambiguous (multiple reasons can overlap):\n"
                    + ambie.to_string(formatters={
                        'reason':
                        '{{:<{}s}}'.format(
                            ambie['reason'].str.len().max()).format
                    },
                                      index=False))
                # in default mode, print result summary to stdout
                if not args.quiet and not args.verbose:
                    print "Reasons for defining as ambiguous (multiple reasons can overlap):\n" + ambie.to_string(
                        formatters={
                            'reason':
                            '{{:<{}s}}'.format(
                                ambie['reason'].str.len().max()).format
                        },
                        index=False)
                ambie.to_csv(args.output + ".ambireasons.csv")
                metrics_output["metrics"].append(
                    dataframeToMetricsTable("ambireasons", ambie))
            else:
                logging.info("No ambiguous variants.")

        if args.features:
            logging.info("Extracting features...")
            fset = Somatic.FeatureSet.make(args.features)
            fset.setChrDepths(md)

            logging.info("Collecting TP info (1)...")
            tps = fset.collect(os.path.join(scratch, "tpfn", "0002.vcf.gz"),
                               "TP")

            # TP_r is a hint for fset, they are both TPs
            logging.info("Collecting TP info (2)...")
            tps2 = fset.collect(os.path.join(scratch, "tpfn_r", "0002.vcf.gz"),
                                "TP_r")

            # this is slow because it tries to sort
            # ... which we don't need to do since tps1 and tps2 have the same ordering

            logging.info("Sorting...")
            tps.sort_values(["CHROM", "POS"], inplace=True)
            tps2.sort_values(["CHROM", "POS"], inplace=True)
            tps = tps.reset_index(drop=True)
            tps2 = tps2.reset_index(drop=True)

            logging.info("Merging TP info...")
            columns_tps = list(tps)
            columns_tps2 = list(tps2)

            len1 = tps.shape[0]
            len2 = tps2.shape[0]

            if len1 != len2:
                raise Exception(
                    "Cannot read TP features, lists have different lengths : %i != %i"
                    % (len1, len2))

            if not args.disable_order_check:
                logging.info("Checking order %i / %i" % (len1, len2))

                for x in xrange(0, len1):
                    for a in ["CHROM", "POS"]:
                        if tps.loc[x][a] != tps2.loc[x][a]:
                            raise Exception(
                                "Cannot merge TP features, inputs are out of order at %s / %s"
                                % (str(tps[x:x + 1]), str(tps2[x:x + 1])))

            logging.info("Merging...")

            cdata = {
                "CHROM": tps["CHROM"],
                "POS": tps["POS"],
                "tag": tps["tag"]
            }

            tpc = pandas.DataFrame(cdata, columns=["CHROM", "POS", "tag"])

            all_columns = list(set(columns_tps + columns_tps2))
            for a in all_columns:
                if a in columns_tps and a not in columns_tps2:
                    tpc[a] = tps[a]
                elif a not in columns_tps and a in columns_tps2:
                    tpc[a] = tps2[a]
                elif a not in ["CHROM", "POS", "tag"]:
                    tpc[a] = tps2[a]
                    tpc[a + ".truth"] = tps[a]

            logging.info("Collecting FP info...")
            fps = fset.collect(fppath, "FP")
            ambs = fset.collect(ambipath, "AMBI")

            logging.info("Collecting FN info...")
            fns = fset.collect(os.path.join(scratch, "tpfn", "0000.vcf.gz"),
                               "FN")

            renamed = {}
            tp_cols = list(tpc)
            for col in list(fns):
                if col + ".truth" in tp_cols:
                    renamed[col] = col + ".truth"
            fns.rename(columns=renamed, inplace=True)

            featurelist = [tpc, fps, fns, ambs]

            if unkpath is not None:
                logging.info("Collecting UNK info...")
                unk = fset.collect(unkpath, "UNK")
                featurelist.append(unk)

            logging.info("Making feature table...")
            featuretable = pandas.concat(featurelist)

            # reorder to make more legible
            first_columns = ["CHROM", "POS", "tag"]
            # noinspection PyTypeChecker
            all_columns = list(featuretable)

            if "REF" in all_columns:
                first_columns.append("REF")

            if "REF.truth" in all_columns:
                first_columns.append("REF.truth")

            if "ALT" in all_columns:
                first_columns.append("ALT")

            if "ALT.truth" in all_columns:
                first_columns.append("ALT.truth")

            ordered_columns = first_columns + sorted(
                [x for x in all_columns if x not in first_columns])
            featuretable = featuretable[ordered_columns]
            # make sure positions are integers
            featuretable["POS"] = featuretable["POS"].astype(int)

            logging.info("Saving feature table...")
            featuretable.to_csv(args.output + ".features.csv",
                                float_format='%.8f')

            if args.roc is not None:
                roc_table = args.roc.from_table(featuretable)
                roc_table.to_csv(args.output + ".roc.csv", float_format='%.8f')

            featuretable["FILTER"].fillna("", inplace=True)
            featuretable.ix[featuretable["REF"].str.len() < 1, "absent"] = True
            featuretable.ix[featuretable["tag"] == "FN",
                            "REF"] = featuretable.ix[featuretable["tag"] ==
                                                     "FN", "REF.truth"]
            featuretable.ix[featuretable["tag"] == "FN",
                            "ALT"] = featuretable.ix[featuretable["tag"] ==
                                                     "FN", "ALT.truth"]
            af_t_feature = args.af_strat_truth
            af_q_feature = args.af_strat_query
            for vtype in ["records", "SNVs", "indels"]:
                featuretable["vtype"] = resolve_vtype(args)
                featuretable_this_type = featuretable

                if args.count_filtered_fn:
                    res.ix[res["type"] == vtype,
                           "fp.filtered"] = featuretable_this_type[
                               (featuretable_this_type["tag"] == "FP")
                               & (featuretable_this_type["FILTER"] != ""
                                  )].shape[0]
                    res.ix[res["type"] == vtype,
                           "tp.filtered"] = featuretable_this_type[
                               (featuretable_this_type["tag"] == "TP")
                               & (featuretable_this_type["FILTER"] != ""
                                  )].shape[0]
                    res.ix[res["type"] == vtype,
                           "unk.filtered"] = featuretable_this_type[
                               (featuretable_this_type["tag"] == "UNK")
                               & (featuretable_this_type["FILTER"] != ""
                                  )].shape[0]
                    res.ix[res["type"] == vtype,
                           "ambi.filtered"] = featuretable_this_type[
                               (featuretable_this_type["tag"] == "AMBI")
                               & (featuretable_this_type["FILTER"] != ""
                                  )].shape[0]

                if args.af_strat:
                    start = 0.0
                    end = 1.0
                    current_binsize = args.af_strat_binsize[0]
                    next_binsize = 0
                    while start < 1.0:
                        # include 1 in last interval
                        end = start + current_binsize
                        if end >= 1:
                            end = 1.00000001
                        if start >= end:
                            break
                        n_tp = featuretable_this_type[
                            (featuretable_this_type["tag"] == "TP")
                            & (featuretable_this_type[af_t_feature] >= start) &
                            (featuretable_this_type[af_t_feature] < end)]
                        n_fn = featuretable_this_type[
                            (featuretable_this_type["tag"] == "FN")
                            & (featuretable_this_type[af_t_feature] >= start) &
                            (featuretable_this_type[af_t_feature] < end)]
                        n_fp = featuretable_this_type[
                            (featuretable_this_type["tag"] == "FP")
                            & (featuretable_this_type[af_q_feature] >= start) &
                            (featuretable_this_type[af_q_feature] < end)]
                        n_ambi = featuretable_this_type[
                            (featuretable_this_type["tag"] == "AMBI")
                            & (featuretable_this_type[af_q_feature] >= start) &
                            (featuretable_this_type[af_q_feature] < end)]
                        n_unk = featuretable_this_type[
                            (featuretable_this_type["tag"] == "UNK")
                            & (featuretable_this_type[af_q_feature] >= start) &
                            (featuretable_this_type[af_q_feature] < end)]

                        r = {
                            "type":
                            "%s.%f-%f" % (vtype, start, end),
                            "total.truth":
                            n_tp.shape[0] + n_fn.shape[0],
                            "total.query":
                            n_tp.shape[0] + n_fp.shape[0] + n_ambi.shape[0] +
                            n_unk.shape[0],
                            "tp":
                            n_tp.shape[0],
                            "fp":
                            n_fp.shape[0],
                            "fn":
                            n_fn.shape[0],
                            "unk":
                            n_unk.shape[0],
                            "ambi":
                            n_ambi.shape[0]
                        }

                        if args.count_filtered_fn:
                            r["fp.filtered"] = n_fp[
                                n_fp["FILTER"] != ""].shape[0]
                            r["tp.filtered"] = n_tp[
                                n_tp["FILTER"] != ""].shape[0]
                            r["unk.filtered"] = n_unk[
                                n_unk["FILTER"] != ""].shape[0]
                            r["ambi.filtered"] = n_ambi[
                                n_ambi["FILTER"] != ""].shape[0]

                        res = pandas.concat([res, pandas.DataFrame([r])])

                        if args.roc is not None and (n_tp.shape[0] +
                                                     n_fn.shape[0] +
                                                     n_fp.shape[0]) > 0:
                            roc_table_strat = args.roc.from_table(
                                pandas.concat([n_tp, n_fp, n_fn]))
                            rtname = "%s.%s.%f-%f.roc.csv" % (
                                args.output, vtype, start, end)
                            roc_table_strat.to_csv(rtname, float_format='%.8f')
                        start = end
                        next_binsize += 1
                        if next_binsize >= len(args.af_strat_binsize):
                            next_binsize = 0
                        current_binsize = args.af_strat_binsize[next_binsize]

        if not args.af_strat:
            res = res[(res["total.truth"] > 0)]

        # summary metrics with confidence intervals
        ci_alpha = 1.0 - args.ci_level

        recall = binomialCI(res["tp"], res["tp"] + res["fn"], ci_alpha)
        precision = binomialCI(res["tp"], res["tp"] + res["fp"], ci_alpha)
        res["recall"], res["recall_lower"], res["recall_upper"] = recall
        res["recall2"] = res["tp"] / (res["total.truth"])
        res["precision"], res["precision_lower"], res[
            "precision_upper"] = precision
        res["na"] = res["unk"] / (res["total.query"])
        res["ambiguous"] = res["ambi"] / res["total.query"]

        any_fp = fpclasses.countbases(label="FP")

        fp_region_count = 0
        auto_size = True
        if args.fpr_size:
            try:
                fp_region_count = int(args.fpr_size)
                auto_size = False
            except:
                pass
        if auto_size:
            if any_fp:
                if args.location:
                    chrom, _, rest = args.location.partition(":")
                    if rest:
                        start, _, end = rest.partition("_")
                        if start:
                            start = int(start)
                        if end:
                            end = int(end)
                    else:
                        fp_region_count += fpclasses.countbases(chrom,
                                                                label="FP")
                else:
                    fp_region_count = any_fp
            else:
                cs = fastaContigLengths(args.ref)
                if args.location:
                    fp_region_count = calculateLength(cs, args.location)
                else:
                    # use all locations we saw calls on
                    h1 = Tools.vcfextract.extractHeadersJSON(ntpath)
                    h1_chrs = h1["tabix"]["chromosomes"]

                    if not h1_chrs:
                        logging.warn("No contigs in truth file")
                        h1_chrs = []

                    if len(h1_chrs) > 0:
                        qlocations = " ".join(h1_chrs)
                        fp_region_count = calculateLength(cs, qlocations)
                    else:
                        fp_region_count = 0

        res["fp.region.size"] = fp_region_count
        res["fp.rate"] = 1e6 * res["fp"] / res["fp.region.size"]

        if args.count_filtered_fn:
            res["recall.filtered"] = (res["tp"] - res["tp.filtered"]) / (
                res["tp"] + res["fn"])

            res["precision.filtered"] = (res["tp"] - res["tp.filtered"]) / (
                res["tp"] - res["tp.filtered"] + res["fp"] -
                res["fp.filtered"])

            res["fp.rate.filtered"] = 1e6 * (
                res["fp"] - res["fp.filtered"]) / res["fp.region.size"]

            res["na.filtered"] = (res["unk"] -
                                  res["unk.filtered"]) / (res["total.query"])
            res["ambiguous.filtered"] = (
                res["ambi"] - res["ambi.filtered"]) / res["total.query"]

        # HAP-162 remove inf values
        res.replace([np.inf, -np.inf], 0)

        metrics_output["metrics"].append(dataframeToMetricsTable(
            "result", res))
        vstring = "som.py-%s" % Tools.version

        logging.info("\n" + res.to_string())
        # in default mode, print result summary to stdout
        if not args.quiet and not args.verbose:
            print "\n" + res.to_string()

        res["sompyversion"] = vstring

        vstring = " ".join(sys.argv)
        res["sompycmd"] = vstring

        # save results
        res.to_csv(args.output + ".stats.csv")

        with open(args.output + ".metrics.json", "w") as fp:
            json.dump(metrics_output, fp)

        if args.happy_stats:
            # parse saved feature table as the one in memory has been updated
            featuretable = pandas.read_csv(args.output + ".features.csv",
                                           low_memory=False,
                                           dtype={"FILTER": str})

            # hap.py summary.csv
            summary = summary_from_featuretable(featuretable, args)
            summary.to_csv(args.output + ".summary.csv")

            #  hap.py extended.csv
            if args.af_strat:
                extended = extended_from_featuretable(featuretable, args)
                extended.to_csv(args.output + ".extended.csv",
                                index=False,
                                na_rep="NA")

    finally:
        if args.delete_scratch:
            shutil.rmtree(scratch)
        else:
            logging.info("Scratch kept at %s" % scratch)

Example #15

Show file

File: som.py Project: pkrusche/hap.py

def main():
    parser = argparse.ArgumentParser("Somatic Comparison")

    parser.add_argument("truth", help="Truth VCF file")
    parser.add_argument("query", help="Query VCF file")

    parser.add_argument("-o", "--output", dest="output", required=True,
                        help="Output file prefix for statistics and feature table (when selected)")

    parser.add_argument("-l", "--location", dest="location", default="",
                        help="Location for bcftools view (e.g. chr1)")

    parser.add_argument("-R", "--restrict-regions", dest="regions_bedfile",
                        default=None, type=str,
                        help="Restrict analysis to given (sparse) regions (using -R in bcftools).")

    parser.add_argument("-T", "--target-regions", dest="targets_bedfile",
                        default=None, type=str,
                        help="Restrict analysis to given (dense) regions (using -T in bcftools).")

    parser.add_argument("-f", "--false-positives", dest="FP",
                        help="False-positive region bed file to distinguish UNK from FP")

    parser.add_argument("-a", "--ambiguous", dest="ambi", action='append',
                        help="Ambiguous region bed file(s) to distinguish from FP (e.g. variant only observed "
                             "in some replicates)")

    parser.add_argument("--ambi-fp", dest="ambi_fp", action='store_true', default=False,
                        help="Use FP calls from ambiguous region files also.")

    parser.add_argument("--no-ambi-fp", dest="ambi_fp", action='store_false',
                        help="Do not use FP calls from ambiguous region files also.")

    parser.add_argument("--count-unk", dest="count_unk", action='store_true', default=False,
                        help="Assume the truth set covers the whole genome and only count FPs in regions "
                             "specified by the truth VCF or ambiguous/false-positive regions.")

    parser.add_argument("--no-count-unk", dest="count_unk", action='store_false',
                        help="Do not use FP calls from ambiguous region files also.")

    parser.add_argument("-e", "--explain_ambiguous", dest="explain_ambiguous", required=False,
                        default=False, action="store_true",
                        help="print a table giving the number of ambiguous events per category")

    parser.add_argument("-r", "--reference", dest="ref", default=Tools.defaultReference(),
                        help="Specify a reference file.")

    parser.add_argument("--scratch-prefix", dest="scratch_prefix",
                        default=None,
                        help="Filename prefix for scratch report output.")

    parser.add_argument("--keep-scratch", dest="delete_scratch",
                        default=True, action="store_false",
                        help="Filename prefix for scratch report output.")

    parser.add_argument("--continue", dest="cont", default=False, action="store_true",
                        help="Continue from scratch space (i.e. use VCFs in there if they already exist).")

    parser.add_argument("-P", "--include-nonpass", dest="inc_nonpass", action="store_true", default=False,
                        help="Use to include failing variants in comparison.")

    parser.add_argument("--feature-table", dest="features", default=False, choices=Somatic.FeatureSet.sets.keys(),
                        help="Select a feature table to output.")

    parser.add_argument("--bam", dest="bams", default=[], action="append",
                        help="pass one or more BAM files for feature table extraction")

    parser.add_argument("--normalize-truth", dest="normalize_truth", default=False, action="store_true",
                        help="Enable running of bcftools norm on the truth file.")

    parser.add_argument("--normalize-query", dest="normalize_query", default=False, action="store_true",
                        help="Enable running of bcftools norm on the query file.")

    parser.add_argument("-N", "--normalize-all", dest="normalize_all", default=False, action="store_true",
                        help="Enable running of bcftools norm on both truth and query file.")

    parser.add_argument("--fixchr-truth", dest="fixchr_truth", action="store_true", default=True,
                        help="Add chr prefix to truth file (default: true).")

    parser.add_argument("--fixchr-query", dest="fixchr_query", action="store_true", default=True,
                        help="Add chr prefix to query file (default: true).")

    parser.add_argument("--fix-chr-truth", dest="fixchr_truth", action="store_true", default=None,
                        help="Same as --fixchr-truth.")

    parser.add_argument("--fix-chr-query", dest="fixchr_query", action="store_true", default=None,
                        help="Same as --fixchr-query.")

    parser.add_argument("--no-fixchr-truth", dest="fixchr_truth", action="store_false", default=False,
                        help="Disable chr replacement for truth (default: false).")

    parser.add_argument("--no-fixchr-query", dest="fixchr_query", action="store_false", default=False,
                        help="Add chr prefix to query file (default: false).")
    
    parser.add_argument("--no-order-check", dest="disable_order_check", default=False, action="store_true",
                        help="Disable checking the order of TP features (dev feature).")

    parser.add_argument("--roc", dest="roc", default=None, choices=ROC.list(),
                        help="Create a ROC-style table. This is caller specific "
                             " - this will override the --feature-table switch!")

    parser.add_argument("--bin-afs", dest="af_strat", default=None, action="store_true",
                        help="Stratify into different AF buckets. This needs to have features available"
                             "for getting the AF both in truth and query variants.")
    parser.add_argument("--af-binsize", dest="af_strat_binsize", default=0.2,
                        help="Bin size for AF binning (should be < 1). Multiple bin sizes can be specified using a comma, "
                             "e.g. 0.1,0.2,0.5,0.2 will split at 0.1, 0.3, 0.8 and 1.0.")
    parser.add_argument("--af-truth", dest="af_strat_truth", default="I.T_ALT_RATE",
                        help="Feature name to use for retrieving AF for truth variants (TP and FN)")
    parser.add_argument("--af-query", dest="af_strat_query", default="T_AF",
                        help="Feature name to use for retrieving AF for query variants (FP/UNK/AMBI)")

    parser.add_argument("-FN", "--count-filtered-fn", dest="count_filtered_fn", action="store_true",
                        help="Count filtered vs. absent FN numbers. This requires the -P switch (to use all "
                             "variants) and either the --feature-table or --roc switch.")

    parser.add_argument("--fp-region-size", dest="fpr_size",
                        help="How to obtain the normalisation constant for FP rate. By default, this will use the FP region bed file size when using"
                             " --count-unk and the size of all reference contigs that overlap with the location specified in -l otherwise."
                             " This can be overridden with: 1) a number of nucleotides, or 2) \"auto\" to use the lengths of all contigs that have calls."
                             " The resulting value is used as fp.region.size.")

    parser.add_argument("--logfile", dest="logfile", default=None,
                        help="Write logging information into file rather than to stderr")

    verbosity_options = parser.add_mutually_exclusive_group(required=False)

    verbosity_options.add_argument("--verbose", dest="verbose", default=False, action="store_true",
                                   help="Raise logging level from warning to info.")

    verbosity_options.add_argument("--quiet", dest="quiet", default=False, action="store_true",
                                   help="Set logging level to output errors only.")

    args = parser.parse_args()

    if args.verbose:
        loglevel = logging.INFO
    elif args.quiet:
        loglevel = logging.ERROR
    else:
        loglevel = logging.WARNING

    try:
        if type(args.af_strat_binsize) is str:
            args.af_strat_binsize = map(float, args.af_strat_binsize.split(","))
        else:
            args.af_strat_binsize = map(float, [args.af_strat_binsize])

        if not args.af_strat_binsize:
            raise Exception("Bin size list is empty")
    except:
        logging.error("Failed to parse stratification bin size: %s" % str(args.af_strat_binsize))
        exit(1)

    # reinitialize logging
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)
    logging.basicConfig(filename=args.logfile,
                        format='%(asctime)s %(levelname)-8s %(message)s',
                        level=loglevel)

    if args.normalize_all:
        args.normalize_truth = True
        args.normalize_query = True

    if args.roc:
        args.roc = ROC.make(args.roc)
        args.features = args.roc.ftname
        if not args.inc_nonpass:
            logging.warn("When creating ROCs without the -P switch, the ROC data points will only "
                         "include filtered variants (i.e. they will normally end at the caller's "
                         "quality threshold).")

    if args.af_strat and not args.features:
        raise Exception("To stratify by AFs, a feature table must be selected -- use this switch together "
                        "with --feature-table or --roc")

    if args.count_filtered_fn and (not args.inc_nonpass or not args.features):
        raise Exception("Counting filtered / unfiltered FNs only works when a feature table is selected, "
                        "and when using unfiltered variants. Specify -P --feature-table <...> or use "
                        "--roc to select a ROC type.")

    if args.scratch_prefix:
        scratch = os.path.abspath(args.scratch_prefix)
        args.delete_scratch = False
        Tools.mkdir_p(scratch)
    else:
        scratch = tempfile.mkdtemp()

    logging.info("Scratch path is %s" % scratch)
    try:
        bams = []
        md = None
        for x in args.bams:
            bams.append(bamStats(x))

        if bams:
            bres = pandas.concat(bams).groupby("CHROM").mean()

            md = {}

            for x in bres.index:
                logging.info("Mean coverage on %s is %f" % (x, bres.loc[x]["COVERAGE"]))
                md[x] = float(bres.loc[x]["COVERAGE"]) * 3.0

        logging.info("Normalizing/reading inputs")

        ntpath = os.path.join(scratch, "normalized_truth.vcf.gz")

        if not (args.cont and os.path.exists(ntpath)):
            preprocessVCF(args.truth, ntpath, args.location,
                          True,  # pass_only
                          args.fixchr_truth,  # chrprefix
                          args.normalize_truth,  # norm,
                          args.regions_bedfile,
                          args.targets_bedfile,
                          args.ref)
        else:
            logging.info("Continuing from %s" % ntpath)

        if not (args.cont and os.path.exists(ntpath + ".csi")):
            runBcftools("index", ntpath)

        nqpath = os.path.join(scratch, "normalized_query.vcf.gz")

        if not (args.cont and os.path.exists(nqpath)):
            preprocessVCF(args.query, nqpath, args.location,
                          not args.inc_nonpass,  # pass_only
                          args.fixchr_query,  # chrprefix
                          args.normalize_query,  # norm,
                          args.regions_bedfile,
                          args.targets_bedfile,
                          args.ref)
        else:
            logging.info("Continuing from %s" % nqpath)

        if not (args.cont and os.path.exists(nqpath + ".csi")):
            runBcftools("index", nqpath)

        logging.info("Intersecting")

        tpfn_files = all([os.path.exists(os.path.join(scratch, "tpfn", "0000.vcf.gz")),
                          os.path.exists(os.path.join(scratch, "tpfn", "0001.vcf.gz")),
                          os.path.exists(os.path.join(scratch, "tpfn", "0002.vcf.gz"))])

        tpfn_r_files = all([os.path.exists(os.path.join(scratch, "tpfn", "0000.vcf.gz")),
                            os.path.exists(os.path.join(scratch, "tpfn", "0001.vcf.gz")),
                            os.path.exists(os.path.join(scratch, "tpfn", "0002.vcf.gz"))])

        if not (args.cont and tpfn_files):
            runBcftools("isec", ntpath, nqpath, "-p", os.path.join(scratch, "tpfn"), "-O", "z")
        else:
            logging.info("Continuing from %s" % os.path.join(scratch, "tpfn"))

        if args.features and not (args.cont and tpfn_r_files):
            # only need to do this for getting the feature table
            runBcftools("isec", nqpath, ntpath, "-p", os.path.join(scratch, "tpfn_r"), "-O", "z")

        logging.info("Getting FPs / Ambi / Unk")

        fppath = os.path.join(scratch, "fp.vcf.gz")
        unkpath = os.path.join(scratch, "unk.vcf.gz")
        ambipath = os.path.join(scratch, "ambi.vcf.gz")

        # get header to print to unk and ambi VCFs
        rununiquepath = os.path.join(scratch, "tpfn", "0001.vcf.gz")
        header = runBcftools("view", rununiquepath, "--header-only")

        fp = Tools.BGZipFile(fppath, True)
        fp.write(header)

        unk = Tools.BGZipFile(unkpath, True)
        unk.write(header)

        ambi = Tools.BGZipFile(ambipath, True)
        ambi.write(header)

        ambiClasses = Counter()
        ambiReasons = Counter()

        fpclasses = BedIntervalTree()
        if args.ambi:
            # can have multiple ambiguous BED files
            for aBED in args.ambi:
                # auto-label from first value after chr start end
                # new ambi files have the label in position 4
                # old ones will look weird here.
                fpclasses.addFromBed(aBED, lambda xe: xe[4], args.fixchr_truth)

        if args.FP:
            fpclasses.addFromBed(args.FP, "FP", args.fixchr_truth)

        # split VCF into FP, UNK and AMBI
        toProcess = gzip.open(rununiquepath, "rb")
        for entry in toProcess:
            if entry[0] == '#':
                continue

            fields = entry.strip().split("\t")
            chrom = fields[0]
            start = int(fields[1])
            stop = int(fields[1]) + len(fields[3])

            overlap = fpclasses.intersect(chrom, start, stop)

            is_fp = False
            is_ambi = False

            classes_this_pos = set()

            for o in overlap:
                reason = o.value[0]
                if reason == "fp" and args.ambi_fp:
                    reason = "FP"
                elif reason == "fp":
                    reason = "ambi-fp"
                elif reason == "unk":
                    reason = "ambi-unk"

                classes_this_pos.add(reason)
                try:
                    ambiReasons["%s: rep. count %s" % (reason, o.value[1])] += 1
                except IndexError:
                    ambiReasons["%s: rep. count *" % reason] += 1
                for x in o.value[3:]:
                    ambiReasons["%s: %s" % (reason, x)] += 1
                if reason == "FP":
                    is_fp = True
                else:
                    is_ambi = True

            for reason in classes_this_pos:
                ambiClasses[reason] += 1

            if is_fp:
                fp.write(entry)
            elif is_ambi:
                ambi.write(entry)
            elif not args.count_unk:
                # when we don't have FP regions, unk stuff becomes FP
                fp.write(entry)
            else:
                unk.write(entry)

        toProcess.close()

        # since 0001.vcf.gz should already be sorted, we can just convert to bgzipped vcf
        # and create index
        fp.close()
        ambi.close()
        unk.close()

        runBcftools("index", "--tbi", fppath)
        runBcftools("index", "--tbi", unkpath)
        runBcftools("index", "--tbi", ambipath)

        logging.info("Counting variants...")

        truthcounts = parseStats(runBcftools("stats", ntpath), "total.truth")
        querycounts = parseStats(runBcftools("stats", nqpath), "total.query")

        tpcounts = parseStats(runBcftools("stats", os.path.join(scratch, "tpfn", "0002.vcf.gz")), "tp")
        fncounts = parseStats(runBcftools("stats", os.path.join(scratch, "tpfn", "0000.vcf.gz")), "fn")
        fpcounts = parseStats(runBcftools("stats", fppath), "fp")
        ambicounts = parseStats(runBcftools("stats", ambipath), "ambi")
        unkcounts = parseStats(runBcftools("stats", unkpath), "unk")

        res = pandas.merge(truthcounts, querycounts, on="type")
        res = pandas.merge(res, tpcounts, on="type")
        res = pandas.merge(res, fpcounts, on="type")
        res = pandas.merge(res, fncounts, on="type")
        res = pandas.merge(res, unkcounts, on="type")
        res = pandas.merge(res, ambicounts, on="type")

        # no explicit guarantee that total.query is equal to unk + ambi + fp + tp
        # testSum = res["fp"] + res["tp"] + res["unk"] + res["ambi"]

        # filter and relabel
        res = res[res["type"] != "samples"]
        res = res[res["type"] != "multiallelic SNP sites"]
        res = res[res["type"] != "multiallelic sites"]
        res.loc[res["type"] == "SNPs", "type"] = "SNVs"

        metrics_output = makeMetricsObject("som.py.comparison")

        if args.ambi and args.explain_ambiguous:
            ac = list(ambiClasses.iteritems())
            if ac:
                ambie = pandas.DataFrame(ac, columns=["class", "count"])
                ambie.sort(["class"], inplace=True)
                pandas.set_option("display.max_rows", 1000)
                pandas.set_option("display.max_columns", 1000)
                pandas.set_option("display.width", 1000)
                pandas.set_option("display.height", 1100)
                logging.info("FP/ambiguity classes with info (multiple classes can "
                             "overlap):\n" + ambie.to_string(index=False))
                # in default mode, print result summary to stdout
                if not args.quiet and not args.verbose:
                    print "FP/ambiguity classes with info (multiple classes can " \
                          "overlap):\n" + ambie.to_string(index=False)
                ambie.to_csv(args.output + ".ambiclasses.csv")
                metrics_output["metrics"].append(dataframeToMetricsTable("ambiclasses", ambie))
            else:
                logging.info("No ambiguous variants.")

            ar = list(ambiReasons.iteritems())
            if ar:
                ambie = pandas.DataFrame(ar, columns=["reason", "count"])
                ambie.sort(["reason"], inplace=True)
                pandas.set_option("display.max_rows", 1000)
                pandas.set_option("display.max_columns", 1000)
                pandas.set_option("display.width", 1000)
                pandas.set_option("display.height", 1100)
                logging.info("Reasons for defining as ambiguous (multiple reasons can overlap):\n" + ambie.to_string(
                    formatters={'reason': '{{:<{}s}}'.format(ambie['reason'].str.len().max()).format}, index=False))
                # in default mode, print result summary to stdout
                if not args.quiet and not args.verbose:
                    print "Reasons for defining as ambiguous (multiple reasons can overlap):\n" + ambie.to_string(
                        formatters={'reason': '{{:<{}s}}'.format(ambie['reason'].str.len().max()).format}, index=False)
                ambie.to_csv(args.output + ".ambireasons.csv")
                metrics_output["metrics"].append(dataframeToMetricsTable("ambireasons", ambie))
            else:
                logging.info("No ambiguous variants.")

        if args.features:
            logging.info("Extracting features...")
            fset = Somatic.FeatureSet.make(args.features)
            fset.setChrDepths(md)

            logging.info("Collecting TP info (1)...")
            tps = fset.collect(os.path.join(scratch, "tpfn", "0002.vcf.gz"), "TP")

            # TP_r is a hint for fset, they are both TPs
            logging.info("Collecting TP info (2)...")
            tps2 = fset.collect(os.path.join(scratch, "tpfn_r", "0002.vcf.gz"), "TP_r")

            # this is slow because it tries to sort
            # ... which we don't need to do since tps1 and tps2 have the same ordering

            logging.info("Sorting...")
            tps.sort(["CHROM", "POS"], inplace=True)
            tps2.sort(["CHROM", "POS"], inplace=True)
            tps = tps.reset_index(drop=True)
            tps2 = tps2.reset_index(drop=True)

            logging.info("Merging TP info...")
            columns_tps = list(tps)
            columns_tps2 = list(tps2)

            len1 = tps.shape[0]
            len2 = tps2.shape[0]

            if len1 != len2:
                raise Exception("Cannot read TP features, lists have different lengths : %i != %i" % (len1, len2))

            if not args.disable_order_check:
                logging.info("Checking order %i / %i" % (len1, len2))

                for x in xrange(0, len1):
                    for a in ["CHROM", "POS"]:
                        if tps.loc[x][a] != tps2.loc[x][a]:
                            raise Exception("Cannot merge TP features, inputs are out of order at %s / %s" % (
                                str(tps[x:x + 1]), str(tps2[x:x + 1])))

            logging.info("Merging...")

            cdata = {
                "CHROM": tps["CHROM"],
                "POS": tps["POS"],
                "tag": tps["tag"]
            }

            tpc = pandas.DataFrame(cdata, columns=["CHROM", "POS", "tag"])

            all_columns = list(set(columns_tps + columns_tps2))
            for a in all_columns:
                if a in columns_tps and a not in columns_tps2:
                    tpc[a] = tps[a]
                elif a not in columns_tps and a in columns_tps2:
                    tpc[a] = tps2[a]
                elif a not in ["CHROM", "POS", "tag"]:
                    tpc[a] = tps2[a]
                    tpc[a + ".truth"] = tps[a]

            logging.info("Collecting FP info...")
            fps = fset.collect(fppath, "FP")
            ambs = fset.collect(ambipath, "AMBI")

            logging.info("Collecting FN info...")
            fns = fset.collect(os.path.join(scratch, "tpfn", "0000.vcf.gz"), "FN")

            renamed = {}
            tp_cols = list(tpc)
            for col in list(fns):
                if col + ".truth" in tp_cols:
                    renamed[col] = col + ".truth"
            fns.rename(columns=renamed, inplace=True)

            featurelist = [tpc, fps, fns, ambs]

            if unkpath is not None:
                logging.info("Collecting UNK info...")
                unk = fset.collect(unkpath, "UNK")
                featurelist.append(unk)

            logging.info("Making feature table...")
            featuretable = pandas.concat(featurelist)

            # reorder to make more legible
            first_columns = ["CHROM", "POS", "tag"]
            # noinspection PyTypeChecker
            all_columns = list(featuretable)

            if "REF" in all_columns:
                first_columns.append("REF")

            if "REF.truth" in all_columns:
                first_columns.append("REF.truth")

            if "ALT" in all_columns:
                first_columns.append("ALT")

            if "ALT.truth" in all_columns:
                first_columns.append("ALT.truth")

            ordered_columns = first_columns + sorted([x for x in all_columns if x not in first_columns])
            featuretable = featuretable[ordered_columns]
            # make sure positions are integers
            featuretable["POS"] = featuretable["POS"].astype(int)

            logging.info("Saving feature table...")
            featuretable.to_csv(args.output + ".features.csv", float_format='%.8f')

            if args.roc is not None:
                roc_table = args.roc.from_table(featuretable)
                roc_table.to_csv(args.output + ".roc.csv", float_format='%.8f')

            featuretable["FILTER"].fillna("", inplace=True)
            featuretable.ix[featuretable["REF"].str.len() < 1, "absent"] = True
            featuretable.ix[featuretable["tag"] == "FN", "REF"] = featuretable.ix[featuretable["tag"] == "FN",
                                                                                  "REF.truth"]
            featuretable.ix[featuretable["tag"] == "FN", "ALT"] = featuretable.ix[featuretable["tag"] == "FN",
                                                                                  "ALT.truth"]
            af_t_feature = args.af_strat_truth
            af_q_feature = args.af_strat_query
            for vtype in ["records", "SNVs", "indels"]:
                if vtype == "SNVs":
                    featuretable_this_type = featuretable[(featuretable["REF"].str.len() > 0) &
                                                          (featuretable["ALT"].str.len() ==
                                                           featuretable["REF"].str.len())]
                elif vtype == "indels":
                    featuretable_this_type = featuretable[(featuretable["REF"].str.len() != 1) |
                                                          (featuretable["ALT"].str.len() != 1)]
                else:
                    featuretable_this_type = featuretable

                if args.count_filtered_fn:
                    res.ix[res["type"] == vtype, "fp.filtered"] = featuretable_this_type[
                        (featuretable_this_type["tag"] == "FP") & (featuretable_this_type["FILTER"] != "")].shape[0]
                    res.ix[res["type"] == vtype, "tp.filtered"] = featuretable_this_type[
                        (featuretable_this_type["tag"] == "TP") & (featuretable_this_type["FILTER"] != "")].shape[0]
                    res.ix[res["type"] == vtype, "unk.filtered"] = featuretable_this_type[
                        (featuretable_this_type["tag"] == "UNK") & (featuretable_this_type["FILTER"] != "")].shape[0]
                    res.ix[res["type"] == vtype, "ambi.filtered"] = featuretable_this_type[
                        (featuretable_this_type["tag"] == "AMBI") & (featuretable_this_type["FILTER"] != "")].shape[0]

                if args.af_strat:
                    start = 0.0
                    current_binsize = args.af_strat_binsize[0]
                    next_binsize = 0
                    while start < 1.0:
                        # include 1 in last interval
                        end = min(1.000000001, start + current_binsize)
                        n_tp = featuretable_this_type[(featuretable_this_type["tag"] == "TP") &
                                                      (featuretable_this_type[af_t_feature] >= start) &
                                                      (featuretable_this_type[af_t_feature] < end)]
                        n_fn = featuretable_this_type[(featuretable_this_type["tag"] == "FN") &
                                                      (featuretable_this_type[af_t_feature] >= start) &
                                                      (featuretable_this_type[af_t_feature] < end)]
                        n_fp = featuretable_this_type[(featuretable_this_type["tag"] == "FP") &
                                                      (featuretable_this_type[af_q_feature] >= start) &
                                                      (featuretable_this_type[af_q_feature] < end)]
                        n_ambi = featuretable_this_type[(featuretable_this_type["tag"] == "AMBI") &
                                                        (featuretable_this_type[af_q_feature] >= start) &
                                                        (featuretable_this_type[af_q_feature] < end)]
                        n_unk = featuretable_this_type[(featuretable_this_type["tag"] == "UNK") &
                                                       (featuretable_this_type[af_q_feature] >= start) &
                                                       (featuretable_this_type[af_q_feature] < end)]

                        r = {"type": "%s.%f-%f" % (vtype, start, end),
                             "total.truth": n_tp.shape[0] + n_fn.shape[0],
                             "total.query": n_tp.shape[0] + n_fp.shape[0] + n_ambi.shape[0] + n_unk.shape[0],
                             "tp": n_tp.shape[0],
                             "fp": n_fp.shape[0],
                             "fn": n_fn.shape[0],
                             "unk": n_unk.shape[0],
                             "ambi": n_ambi.shape[0], }

                        if args.count_filtered_fn:
                            r["fp.filtered"] = n_fp[n_fp["FILTER"] != ""].shape[0]
                            r["tp.filtered"] = n_tp[n_tp["FILTER"] != ""].shape[0]
                            r["unk.filtered"] = n_unk[n_unk["FILTER"] != ""].shape[0]
                            r["ambi.filtered"] = n_ambi[n_ambi["FILTER"] != ""].shape[0]

                        res = pandas.concat([res, pandas.DataFrame([r])])

                        if args.roc is not None and (n_tp.shape[0] + n_fn.shape[0] + n_fp.shape[0]) > 0:
                            roc_table_strat = args.roc.from_table(pandas.concat([n_tp, n_fp, n_fn]))
                            rtname = "%s.%s.%f-%f.roc.csv" % (args.output, vtype, start, end)
                            roc_table_strat.to_csv(rtname, float_format='%.8f')
                        start += current_binsize
                        next_binsize += 1
                        if next_binsize >= len(args.af_strat_binsize):
                            next_binsize = 0
                        current_binsize = args.af_strat_binsize[next_binsize]

        # remove things where we haven't seen any variants in truth and query
        res = res[(res["total.truth"] > 0) & (res["total.query"] > 0)]
        # summary metrics
        res["recall"] = res["tp"] / (res["tp"] + res["fn"])
        res["recall2"] = res["tp"] / (res["total.truth"])
        res["precision"] = res["tp"] / (res["tp"] + res["fp"])
        res["na"] = res["unk"] / (res["total.query"])
        res["ambiguous"] = res["ambi"] / res["total.query"]

        any_fp = fpclasses.countbases(label="FP")

        fp_region_count = 0
        auto_size = True
        if args.fpr_size:
            try:
                fp_region_count = int(args.fpr_size)
                auto_size = False
            except:
                pass
        if auto_size:
            if any_fp:
                if args.location:
                    chrom, _, rest = args.location.partition(":")
                    if rest:
                        start, _, end = rest.partition("_")
                        if start:
                            start = int(start)
                        if end:
                            end = int(end)
                    else:
                        fp_region_count += fpclasses.countbases(chrom, label="FP")
                else:
                    fp_region_count = any_fp
            else:
                cs = fastaContigLengths(args.ref)
                if args.location:
                    fp_region_count = calculateLength(cs, args.location)
                else:
                    # use all locations we saw calls on
                    h1 = Tools.vcfextract.extractHeadersJSON(ntpath)
                    h1_chrs = h1["tabix"]["chromosomes"]
                    if not h1_chrs:
                        logging.warn("ntpath is empty")
                        h1_chrs = []

                    h2 = Tools.vcfextract.extractHeadersJSON(nqpath)
                    h2_chrs = h2["tabix"]["chromosomes"]
                    if not h2_chrs:
                        logging.warn("nqpath is empty")
                        h2_chrs = []
                    
                    combined_chrs = list(set(h1_chrs + h2_chrs))
                    if len(combined_chrs) > 0:
                        qlocations = " ".join(combined_chrs)
                        fp_region_count = calculateLength(cs, qlocations)
                    else:
                        fp_region_count = 0

        res["fp.region.size"] = fp_region_count
        res["fp.rate"] = 1e6 * res["fp"] / res["fp.region.size"]

        if args.count_filtered_fn:
            res["recall.filtered"] = (res["tp"] - res["tp.filtered"]) / (res["tp"] + res["fn"])

            res["precision.filtered"] = (res["tp"] - res["tp.filtered"]) / (res["tp"] - res["tp.filtered"] +
                                                                            res["fp"] - res["fp.filtered"])

            res["fp.rate.filtered"] = 1e6 * (res["fp"] - res["fp.filtered"]) / res["fp.region.size"]

            res["na.filtered"] = (res["unk"] - res["unk.filtered"]) / (res["total.query"])
            res["ambiguous.filtered"] = (res["ambi"] - res["ambi.filtered"]) / res["total.query"]

        # HAP-162 remove inf values
        res.replace([np.inf, -np.inf], 0)
        metrics_output["metrics"].append(dataframeToMetricsTable("result", res))
        vstring = "som.py-%s" % Tools.version

        logging.info("\n" + res.to_string())
        # in default mode, print result summary to stdout
        if not args.quiet and not args.verbose:
            print "\n" + res.to_string()

        res["sompyversion"] = vstring

        vstring = " ".join(sys.argv)
        res["sompycmd"] = vstring
        res.to_csv(args.output + ".stats.csv")
        with open(args.output + ".metrics.json", "w") as fp:
            json.dump(metrics_output, fp)

    finally:
        if args.delete_scratch:
            shutil.rmtree(scratch)
        else:
            logging.info("Scratch kept at %s" % scratch)

Example #16

Show file

def preprocess(vcf_input,
               vcf_output,
               reference,
               locations=None,
               filters=None,
               fixchr=None,
               regions=None,
               targets=None,
               leftshift=True,
               decompose=True,
               bcftools_norm=False,
               windowsize=10000,
               threads=1,
               gender=None,
               somatic_allele_conversion=False,
               sample="SAMPLE"):
    """ Preprocess a single VCF file

    :param vcf_input: input file name
    :param vcf_output: output file name
    :param reference: reference fasta name
    :param locations: list of locations or None
    :param filters: list of filters to apply ("*" to only allow PASS)
    :param fixchr: None for auto, or True/False -- fix chr prefix to match reference
    :param regions: regions bed file
    :param targets: targets bed file
    :param leftshift: left-shift variants
    :param decompose: decompose variants
    :param bcftools_norm: use bcftools_norm
    :param windowsize: normalisation window size
    :param threads: number of threads to for preprcessing
    :param gender: the gender of the sample ("male" / "female" / "auto" / None)
    :param somatic_allele_conversion: convert somatic alleles -- False / half / het / hemi / hom
    :param sample: when using somatic_allele_conversion, name of the output sample

    :return: the gender if auto-determined (otherwise the same value as gender parameter)
    """

    tempfiles = []
    try:
        # If the input is in BCF format, we can continue to
        # process it in bcf
        # if it is in .vcf.gz, don't try to convert it to
        # bcf because there are a range of things that can
        # go wrong there (e.g. undefined contigs and bcftools
        # segfaults)
        if vcf_input.endswith(".bcf") or vcf_output.endswith(".bcf"):
            int_suffix = ".bcf"
            int_format = "b"
            if not vcf_input.endswith(".bcf") and vcf_output.endswith(".bcf"):
                logging.warn(
                    "Turning vcf into bcf can cause problems when headers are not consistent with all "
                    "records in the file. I will run vcfcheck to see if we will run into trouble. "
                    "To save time in the future, consider converting your files into bcf using bcftools before"
                    " running pre.py.")
        else:
            int_suffix = ".vcf.gz"
            int_format = "z"

        # HAP-317 always check for BCF errors since preprocessing tools now require valid headers
        mf = subprocess.check_output("vcfcheck %s --check-bcf-errors 1" %
                                     pipes.quote(vcf_input),
                                     shell=True)

        if gender == "auto":
            logging.info(mf)
            if "female" in mf:
                gender = "female"
            else:
                gender = "male"

        h = vcfextract.extractHeadersJSON(vcf_input)
        reference_contigs = set(fastaContigLengths(reference).keys())
        reference_has_chr_prefix = hasChrPrefix(reference_contigs)

        allfilters = []
        for f in h["fields"]:
            try:
                if f["key"] == "FILTER":
                    allfilters.append(f["values"]["ID"])
            except:
                logging.warn("ignoring header: %s" % str(f))

        required_filters = None
        if filters:
            fts = filters.split(",")
            required_filters = ",".join(
                list(
                    set(["PASS", "."] +
                        [x for x in allfilters if x not in fts])))

        if fixchr is None:
            try:
                if not h["tabix"]:
                    logging.warn(
                        "input file is not tabix indexed, consider doing this in advance for performance reasons"
                    )
                    vtf = tempfile.NamedTemporaryFile(delete=False,
                                                      suffix=int_suffix)
                    vtf.close()
                    tempfiles.append(vtf.name)
                    runBcftools("view", "-o", vtf.name, "-O", int_format,
                                vcf_input)
                    runBcftools("index", vtf.name)
                    h2 = vcfextract.extractHeadersJSON(vcf_input)
                    chrlist = h2["tabix"]["chromosomes"]
                else:
                    chrlist = h["tabix"]["chromosomes"]
                vcf_has_chr_prefix = hasChrPrefix(chrlist)

                if reference_has_chr_prefix and not vcf_has_chr_prefix:
                    fixchr = True
            except:
                logging.warn("Guessing the chr prefix in %s has failed." %
                             vcf_input)

        # all these require preprocessing
        vtf = vcf_input

        if leftshift or decompose:
            vtf = tempfile.NamedTemporaryFile(delete=False, suffix=int_suffix)
            vtf.close()
            tempfiles.append(vtf.name)
            vtf = vtf.name
        else:
            vtf = vcf_output

        preprocessVCF(vcf_input,
                      vtf,
                      locations,
                      filters == "*",
                      fixchr,
                      bcftools_norm,
                      regions,
                      targets,
                      reference,
                      required_filters,
                      somatic_allele_conversion=somatic_allele_conversion,
                      sample=sample)

        if leftshift or decompose or gender == "male":
            Haplo.partialcredit.partialCredit(vtf,
                                              vcf_output,
                                              reference,
                                              locations,
                                              threads=threads,
                                              window=windowsize,
                                              leftshift=leftshift,
                                              decompose=decompose,
                                              haploid_x=gender == "male")
    finally:
        for t in tempfiles:
            try:
                os.unlink(t)
            except:
                pass

    return gender

Example #17

Show file

File: ftx.py Project: Illumina/hap.py

def main():
    parser = argparse.ArgumentParser("Somatic VCF Feature Extraction")

    parser.add_argument("input", help="Input VCF file")

    parser.add_argument("-o", "--output", dest="output", required=True,
                        help="Output file name. Output will be in CSV format")

    parser.add_argument("-l", "--location", dest="location", default="",
                        help="Location for bcftools view (e.g. chr1)")

    parser.add_argument("-R", "--restrict-regions", dest="regions_bedfile",
                        default=None, type=str,
                        help="Restrict analysis to given (sparse) regions (using -R in bcftools).")

    parser.add_argument("-T", "--target-regions", dest="targets_bedfile",
                        default=None, type=str,
                        help="Restrict analysis to given (dense) regions (using -T in bcftools).")

    parser.add_argument("-P", "--include-nonpass", dest="inc_nonpass", action="store_true", default=False,
                        help="Use to include failing variants in comparison.")

    parser.add_argument("--feature-table", dest="features", default="generic",
                        help="Select a feature table to output. Options are: %s" % str(Somatic.FeatureSet.sets.keys()))

    parser.add_argument("--feature-label", dest="label", default=None,
                        help="We will output a lable column, this value will go in there -- default is "
                             "the input filename.")

    parser.add_argument("--bam", dest="bams", default=[], action="append",
                        help="pass one or more BAM files for feature table extraction")

    parser.add_argument("-r", "--reference", dest="ref", default=Tools.defaultReference(),
                        help="Specify a reference file for normalization.")

    parser.add_argument("--normalize", dest="normalize", default=False, action="store_true",
                        help="Enable running of bcftools norm on the input file.")

    parser.add_argument("--fix-chr", dest="fixchr", default=False, action="store_true",
                        help="Replace numeric chromosome names in the query by chr*-type names")

    args = parser.parse_args()

    scratch = tempfile.mkdtemp()

    try:
        logging.info("Scratch path is %s" % scratch)

        if not args.label:
            args.label = os.path.basename(args.input)

        bams = []
        md = None
        for x in args.bams:
            bams.append(bamStats(x))

        if bams:
            bres = pandas.concat(bams).groupby("CHROM").mean()
            md = {}
            for x in bres.index:
                logging.info("Mean coverage on %s is %f" % (x, bres.loc[x]["COVERAGE"]))
                md[x] = float(bres.loc[x]["COVERAGE"])*3.0

        nqpath = os.path.join(scratch, "normalized_query.vcf.gz")

        logging.info("Preprocessing input...")
        preprocessVCF(args.input, nqpath, args.location,
                      not args.inc_nonpass,  # pass_only
                      args.fixchr,  # chrprefix
                      args.normalize,  # norm,
                      args.regions_bedfile,
                      args.targets_bedfile,
                      args.ref)

        runBcftools("index", nqpath)

        logging.info("Extracting features...")
        fset = Somatic.FeatureSet.make(args.features)
        fset.setChrDepths(md)
        featuretable = fset.collect(nqpath, args.label)

        if not args.output.endswith(".csv"):
            args.output += ".csv"
        logging.info("Saving feature table %s..." % args.output)
        featuretable.to_csv(args.output)

    finally:
        logging.info("Deleting scratch folder %s " % scratch)
        shutil.rmtree(scratch)

Example #18

Show file

File: partialcredit.py Project: pkrusche/hap.py

def partialCredit(vcfname, outputname, reference, locations, threads=1, window=10000, leftshift=True, decompose=True):
    """ Partial-credit-process a VCF file according to our args """

    pool = getPool(int(threads))
    if threads > 1:
        logging.info("Partial credit processing uses %i parallel processes." % threads)

        if not locations:
            h = extractHeadersJSON(vcfname)
            if not h["tabix"]["chromosomes"]:
                logging.warn("Empty input or not tabix indexed")
                if outputname.endswith(".bcf"):
                    runBcftools("view", "-O", "b", "-o", outputname, vcfname)
                    runBcftools("index", outputname)
                else:
                    runBcftools("view", "-O", "z", "-o", outputname, vcfname)
                    runBcftools("index", "-t", outputname)
                # just return the same file
                return
            locations = h["tabix"]["chromosomes"]
        elif type(locations) is str or type(locations) is unicode:
            locations = locations.split(",")

        # use blocksplit to subdivide input
        res = runParallel(
            pool, blocksplitWrapper, locations, {"vcf": vcfname, "dist": window, "pieces": min(40, threads * 4)}
        )

        if None in res:
            raise Exception("One of the blocksplit processes failed.")

        locations = list(itertools.chain.from_iterable(res))
        if not len(locations):
            logging.warn("Blocksplit returned no blocks. This can happen when " "an input contains no valid variants.")
            locations = [""]
    else:
        locations = [""]

    res = []
    try:
        res = runParallel(
            pool,
            preprocessWrapper,
            itertools.izip(itertools.repeat(vcfname), locations),
            {
                "reference": reference,
                "decompose": decompose,
                "leftshift": leftshift,
                "bcf": outputname.endswith(".bcf"),
            },
        )

        if None in res:
            raise Exception("One of the preprocess jobs failed")
        if not res:
            raise Exception("No blocks were processed. List of locations: %s" % str(list(locations)))

        concatenateParts(outputname, *res)
        if outputname.endswith(".vcf.gz"):
            runBcftools("index", "-t", outputname)
        else:  # use bcf
            runBcftools("index", outputname)
    finally:
        for r in res:
            try:
                os.unlink(r)
            except:
                pass
            try:
                os.unlink(r + ".tbi")
            except:
                pass
            try:
                os.unlink(r + ".csi")
            except:
                pass