Example 1
def OutputDiffRefBias(diffs_from_ref, reflens, fname, xlim=(0,100), \
                      mingts=100, metric="mean", binsize=5):
    r"""Plot reflen vs. mean difference from ref bias plot

    Parameters
    ----------
    diffs_from_ref : list of int
        Difference of each allele call from the ref allele (in bp)
    reflens : list of int
        List of reference allele lengths for each call (in bp)
    fname : str
        Filename of output plot
    xlim : tuple of int, optional
        Specify the minimum and maximum x-axis range (in bp)
    mingts : int, optional
        Don't plot data points computed based on fewer than
        this many genotypes
    metric : str, optional
        Which metric to plot on the y-axis. Must be "mean" or "median"
    binsize : int, optional
        Size (in bp) of bins on the x-axis.
    """
    data = pd.DataFrame({"diff": diffs_from_ref, "ref": reflens, "count": [1]*len(reflens)})
    data["ref"] = data["ref"].apply(lambda x: int(x/binsize)*binsize)
    if metric == "mean":
        sum_fn = np.mean
    elif metric == "median":
        sum_fn = np.median
    else:
        common.WARNING("Invalid metric ({}) specified. Skipping reference bias plot".format(metric))
        return
    metric = metric.capitalize()
    summ = data.groupby("ref", as_index=False).agg({"diff": sum_fn, "count": len}).sort_values("ref")
    summ = summ[summ["count"]>=mingts] # exclude small counts
    summ = summ[(summ["ref"]>=xlim[0]) & (summ["ref"]<=xlim[1])] # filter by x range
    if summ.shape[0] == 0:
        common.WARNING("No points left to plot in reference bias plot after "
                       "filtering. Skipping")
        return
    common.MSG("Plotting ref bias plot with the following data:")
    common.MSG(summ)
    trcounts = np.cumsum(summ["count"])
    trfreqs = trcounts/np.sum(summ["count"])
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(summ["ref"], summ["diff"], marker="o", color="darkblue")
    ax.axhline(y=0, linestyle="dashed", color="gray")
    ax.set_xlabel("Reference length (bp)", size=15)
    ax.set_ylabel("{} diff from ref (bp)".format(metric), size=15)
    ax1 = ax.twinx()
    ax1.plot(summ["ref"], trfreqs, color="darkred")
    ax1.set_ylabel("Cumulative fraction of alleles", size=15)
    fig.tight_layout()
    fig.savefig(fname)
    plt.close()
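A minimal usage sketch for the function above, with synthetic allele calls. It assumes the enclosing module's imports (pandas as pd, numpy as np, matplotlib.pyplot as plt) and its common logging helpers are available, since OutputDiffRefBias relies on them; the file name and parameter values are illustrative only.

import random

random.seed(0)
diffs_from_ref = [random.choice([-6, -3, 0, 3]) for _ in range(5000)]
reflens = [random.randint(20, 80) for _ in range(5000)]
OutputDiffRefBias(diffs_from_ref, reflens, "refbias.pdf",
                  xlim=(0, 100), mingts=50, metric="median", binsize=10)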
Example 2
def MakeWriter(outfile, invcf, command):
    r"""Create a VCF writer with a dumpSTR header

    Adds a header line with the dumpSTR command used

    Parameters
    ----------
    outfile : str
       Name of the output file
    invcf : vcf.Reader object
       Input VCF. Used to grab header info
    command : str
       String command used to run dumpSTR

    Returns
    -------
    writer : vcf.Writer object
       VCF writer initialized with header of input VCF
       Set to None if we had a problem writing the file
    """
    invcf.metadata["command-DumpSTR"] = [command]
    try:
        writer = vcf.Writer(open(outfile, "w"), invcf)
    except OSError as e:
        common.WARNING(str(e))
        writer = None
    return writer
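A short usage sketch, assuming PyVCF (the vcf package this function is written against) is installed and that a file named in.vcf.gz exists; the file names and the command string are illustrative.

import sys
import vcf

reader = vcf.Reader(filename="in.vcf.gz")
writer = MakeWriter("out.vcf", reader, " ".join(sys.argv))
if writer is not None:
    for record in reader:
        writer.write_record(record)
    writer.close()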
Example 3
def GetSamples(readers, usefilenames=False):
    r"""Get list of samples used in all files being merged

    Parameters
    ----------
    readers : list of vcf.Reader objects
    usefilenames : bool, optional
       If True, add filename to sample names.
       Useful if sample names overlap across files

    Returns
    -------
    samples : list of str
       List of samples in merged list
    """
    samples = []
    for r in readers:
        if usefilenames:
            # Trim the VCF extension to build the sample prefix.
            # (str.strip removes characters, not a suffix, so it would
            # mangle filenames containing those characters.)
            fname = r.filename
            for suffix in (".vcf.gz", ".vcf.bgz", ".vcf"):
                if fname.endswith(suffix):
                    fname = fname[:-len(suffix)]
                    break
            samples = samples + [fname + ":" + s for s in r.samples]
        else:
            samples = samples + r.samples
    if len(set(samples)) != len(samples):
        common.WARNING("Duplicate samples found.")
        return []
    return samples
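A brief usage sketch (file names are illustrative, and PyVCF readers are assumed). With usefilenames=True each sample name is prefixed by its source file, which avoids collisions when the same sample appears in more than one VCF.

import vcf

readers = [vcf.Reader(filename=f) for f in ("cohort1.vcf.gz", "cohort2.vcf.gz")]
samples = GetSamples(readers, usefilenames=True)
if not samples:
    print("Duplicate samples found; rename samples before merging")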
Example 4
def LoadRegions(self, filename):
    r"""Load regions from a bgzipped, tabix-indexed BED file

    On failure, sets self.regions to None, emits a warning, and
    sets self.pass_checks to False.
    """
    if not filename.endswith(".bed.gz") and not filename.endswith(".bed.bgz"):
        #raise ValueError("Make sure %s is bgzipped and indexed"%filename)
        self.regions = None
        common.WARNING("Make sure %s is bgzipped and indexed" % filename)
        self.pass_checks = False
        return
    if not os.path.isfile(filename):
        #raise ValueError("Could not find regions BED file %s"%filename)
        self.regions = None
        common.WARNING("Could not find regions BED file %s" % filename)
        self.pass_checks = False
        return
    if not os.path.isfile(filename + ".tbi"):
        #raise ValueError("Could not find tabix index %s.tbi"%filename)
        self.regions = None
        common.WARNING("Could not find tabix index %s.tbi" % filename)
        self.pass_checks = False
        return
    self.regions = BedTool(filename)
Example 5
def CheckHipSTRFilters(invcf, args):
    r"""Check HipSTR call-level filters

    Parameters
    ----------
    invcf : vcf.Reader
        Input VCF reader. Used to check for required FORMAT fields
    args : argparse namespace
        Contains user arguments

    Returns
    -------
    checks : bool
        Set to True if all filters look ok.
        Set to False if filters are invalid
    """
    if args.hipstr_max_call_flank_indel is not None:
        if args.hipstr_max_call_flank_indel < 0 or args.hipstr_max_call_flank_indel > 1:
            common.WARNING(
                "--hipstr-max-call-flank-indel must be between 0 and 1")
            return False
        assert "DP" in invcf.formats and "DFLANKINDEL" in invcf.formats  # should always be true
    if args.hipstr_max_call_stutter is not None:
        if args.hipstr_max_call_stutter < 0 or args.hipstr_max_call_stutter > 1:
            common.WARNING("--hipstr-max-call-stutter must be between 0 and 1")
            return False
        assert "DP" in invcf.formats and "DSTUTTER" in invcf.formats  # should always be true
    if args.hipstr_min_supp_reads is not None:
        if args.hipstr_min_supp_reads < 0:
            common.WARNING("--hipstr-min-supp-reads must be >= 0")
            return False
        assert "ALLREADS" in invcf.formats and "GB" in invcf.formats
    if args.hipstr_min_call_DP is not None:
        if args.hipstr_min_call_DP < 0:
            common.WARNING("--hipstr-min-call-DP must be >= 0")
            return False
        assert "DP" in invcf.formats
    if args.hipstr_max_call_DP is not None:
        if args.hipstr_max_call_DP < 0:
            common.WARNING("--hipstr-max-call-DP must be >= 0")
            return False
        assert "DP" in invcf.formats
    if args.hipstr_min_call_DP is not None and args.hipstr_max_call_DP is not None:
        if args.hipstr_max_call_DP < args.hipstr_min_call_DP:
            common.WARNING(
                "--hipstr-max-call-DP must be >= --hipstr-min-call-DP")
            return False
    if args.hipstr_min_call_Q is not None:
        if args.hipstr_min_call_Q < 0 or args.hipstr_min_call_Q > 1:
            common.WARNING("--hipstr-min-call-Q must be between 0 and 1")
            return False
        assert "Q" in invcf.formats
    return True
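CheckHipSTRFilters and the analogous Check*Filters functions later in this listing all follow the same pattern: they read thresholds off an argparse namespace and assert that the required FORMAT fields are present in the reader. A minimal sketch of driving it directly (the file name and threshold values are illustrative, and the VCF is assumed to be HipSTR output so the asserted FORMAT fields exist):

import argparse
import vcf

reader = vcf.Reader(filename="hipstr_calls.vcf.gz")  # assumed HipSTR output
hipstr_args = argparse.Namespace(
    hipstr_max_call_flank_indel=0.15,
    hipstr_max_call_stutter=0.15,
    hipstr_min_supp_reads=None,
    hipstr_min_call_DP=10,
    hipstr_max_call_DP=1000,
    hipstr_min_call_Q=0.9,
)
if not CheckHipSTRFilters(reader, hipstr_args):
    raise SystemExit("Invalid HipSTR call-level filter settings")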
Example 6
def LoadSingleReader(vcffile, checkgz=True, region=None):
    r"""Return VCF reader

    Parameters
    ----------
    vcffile : str
        VCF file to read
    checkgz : bool, optional
        Check whether the VCF file is bgzipped and indexed
    region : str, optional
        chrom:start-end region to restrict to
    
    Returns
    -------
    reader : vcf.Reader
        VCF reader
    """
    if not os.path.isfile(vcffile):
        common.WARNING("Could not find VCF file %s" % vcffile)
        return None
    if checkgz:
        if not vcffile.endswith(".vcf.gz") and not vcffile.endswith(
                ".vcf.bgz"):
            common.WARNING("Make sure %s is bgzipped and indexed" % vcffile)
            return None
        if not os.path.isfile(vcffile + ".tbi"):
            common.WARNING("Could not find VCF index %s.tbi" % vcffile)
            return None
    if vcffile.endswith(".vcf.gz") or vcffile.endswith(".vcf.bgz"):
        reader = vcf.Reader(open(vcffile, "rb"))
    else:
        reader = vcf.Reader(open(vcffile))

    if region is None:
        return reader
    else:
        return reader.fetch(region)
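A brief usage sketch; the path and region are illustrative, and the bgzipped VCF plus its .tbi index are assumed to exist.

reader = LoadSingleReader("calls.vcf.gz", checkgz=True, region="chr1:100000-200000")
if reader is not None:
    for record in reader:
        print(record.CHROM, record.POS, record.REF)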
Example 7
def CheckLocusFilters(args, vcftype):
    r"""Perform checks on user inputs for locus-level filters

    Parameters
    ----------
    args : argparse namespace
        Contains user arguments
    vcftype : enum.
        Specifies which tool this VCF came from.
        Must be included in trh.VCFTYPES

    Returns
    -------
    checks : bool
        Set to True if all filters look ok.
        Set to False if filters are invalid
    """
    if args.min_locus_hwep is not None:
        if args.min_locus_hwep < 0 or args.min_locus_hwep > 1:
            common.WARNING("Invalid --min-locus-hwep. Must be between 0 and 1")
            return False
    if args.min_locus_het is not None:
        if args.min_locus_het < 0 or args.min_locus_het > 1:
            common.WARNING("Invalid --min-locus-het. Must be between 0 and 1")
            return False
    if args.max_locus_het is not None:
        if args.max_locus_het < 0 or args.max_locus_het > 1:
            common.WARNING("Invalid --max-locus-het. Must be between 0 and 1")
            return False
    if args.min_locus_het is not None and args.max_locus_het is not None:
        if args.max_locus_het < args.min_locus_het:
            common.WARNING(
                "Cannot have --max-locus-het less than --min-locus-het")
            return False
    if args.use_length and vcftype not in [trh.VcfTypes["hipstr"]]:
        common.WARNING(
            "--use-length is only meaningful for HipSTR, which reports sequence level differences."
        )
    if args.filter_hrun and vcftype not in [trh.VcfTypes["hipstr"]]:
        common.WARNING(
            "--filter-hrun is only relevant to HipSTR files. This filter will have no effect."
        )
    if args.filter_regions is not None:
        if args.filter_regions_names is not None:
            filter_region_files = args.filter_regions.split(",")
            filter_region_names = args.filter_regions_names.split(",")
            if len(filter_region_names) != len(filter_region_files):
                common.WARNING(
                    "Length of --filter-regions-names must match --filter-regions."
                )
                return False
    return True
Example 8
def CheckPopSTRFilters(invcf, args):
    r"""Check PopSTR call-level filters

    Parameters
    ----------
    invcf : vcf.Reader
        Input VCF reader. Used to check for required FORMAT fields
    args : argparse namespace
        Contains user arguments

    Returns
    -------
    checks : bool
        Set to True if all filters look ok.
        Set to False if filters are invalid
    """
    if args.popstr_min_call_DP is not None:
        if args.popstr_min_call_DP < 0:
            common.WARNING("--popstr-min-call-DP must be >= 0")
            return False
        assert "DP" in invcf.formats
    if args.popstr_max_call_DP is not None:
        if args.popstr_max_call_DP < 0:
            common.WARNING("--popstr-max-call-DP must be >= 0")
            return False
        assert "DP" in invcf.formats
    if args.popstr_min_call_DP is not None and args.popstr_max_call_DP is not None:
        if args.popstr_max_call_DP < args.popstr_min_call_DP:
            common.WARNING(
                "--popstr-max-call-DP must be >= --popstr-min-call-DP")
            return False
    if args.popstr_require_support is not None:
        if args.popstr_require_support < 0:
            common.WARNING("--popstr-require-support must be >= 0")
            return False
        assert "AD" in invcf.formats
    return True
Example 9
def CheckAdVNTRFilters(invcf, args):
    r"""Check adVNTR call-level filters

    Parameters
    ----------
    invcf : vcf.Reader
        Input VCF reader. Used to check for required FORMAT fields
    args : argparse namespace
        Contains user arguments

    Returns
    -------
    checks : bool
        Set to True if all filters look ok.
        Set to False if filters are invalid
    """
    if args.advntr_min_call_DP is not None:
        if args.advntr_min_call_DP < 0:
            common.WARNING("--advntr-min-call-DP must be >= 0")
            return False
        assert "DP" in invcf.formats
    if args.advntr_max_call_DP is not None:
        if args.advntr_max_call_DP < 0:
            common.WARNING("--advntr-max-call-DP must be >= 0")
            return False
        assert "DP" in invcf.formats
    if args.advntr_min_call_DP is not None and args.advntr_max_call_DP is not None:
        if args.advntr_max_call_DP < args.advntr_min_call_DP:
            common.WARNING(
                "--advntr-max-call-DP must be >= --advntr-min-call-DP")
            return False
    if args.advntr_min_spanning is not None:
        if args.advntr_min_spanning < 0:
            common.WARNING("--advntr-min-spanning must be >=0")
            return False
        assert "SR" in invcf.formats
    if args.advntr_min_flanking is not None:
        if args.advntr_min_flanking < 0:
            common.WARNING("--advntr-min-flanking must be >=0")
            return False
        assert "FR" in invcf.formats
    if args.advntr_min_ML is not None:
        if args.advntr_min_ML < 0:
            common.WARNING("--advntr-min-ML must be >= 0")
            return False
        assert "ML" in invcf.formats
    return True
Example 10
def CheckEHFilters(invcf, args):  # pragma: no cover
    r"""Check ExpansionHunter call-level filters

    Parameters
    ----------
    invcf : vcf.Reader
        Input VCF reader. Used to check for required FORMAT fields
    args : argparse namespace
        Contains user arguments

    Returns
    -------
    checks : bool
        Set to True if all filters look ok.
        Set to False if filters are invalid
    """
    if args.eh_min_ADFL is not None:
        if args.eh_min_ADFL < 0:
            common.WARNING("--eh-min-ADFL must be >= 0")
            return False
        assert "ADFL" in invcf.formats
    if args.eh_min_ADIR is not None:
        if args.eh_min_ADIR < 0:
            common.WARNING("--eh-min-ADIR must be >= 0")
            return False
        assert "ADIR" in invcf.formats
    if args.eh_min_ADSP is not None:
        if args.eh_min_ADSP < 0:
            common.WARNING("--eh-min-ADSP must be >= 0")
            return False
        assert "ADSP" in invcf.formats
    if args.eh_min_call_LC is not None:
        if args.eh_min_call_LC < 0:
            common.WARNING("--eh-min-call-LC must be >= 0")
            return False
        assert "LC" in invcf.formats
    if args.eh_max_call_LC is not None:
        if args.eh_max_call_LC < 0:
            common.WARNING("--eh-max-call-LC must be >= 0")
            return False
        assert "LC" in invcf.formats
    if args.eh_min_call_LC is not None and args.eh_max_call_LC is not None:
        if args.eh_max_call_LC < args.eh_min_call_LC:
            common.WARNING("--eh-max-call-LC must be >= --eh-min-call-LC")
            return False
    return True
Example 11
def GetInfoItem(current_records, mergelist, info_field, fail=True):
    """Get INFO item for a group of records

    Make sure it's the same across merged records
    if fail=True, die if items not the same.
    if fail=False, only do something if we have a rule on how to handle that field

    Parameters
    ----------
    current_records : list of vcf.Record
       List of records being merged
    mergelist : list of bool
       List of indicators of whether to merge each record
    info_field : str
       INFO field being merged
    fail : bool
       If True, throw error if fields don't have same value

    Returns
    -------
    infostring : str
       INFO string to add (key=value)
    """
    if not fail:
        return None  # TODO in future implement smart merging of select fields
    vals = set()
    for i in range(len(mergelist)):
        if mergelist[i]:
            if info_field in current_records[i].INFO:
                vals.add(current_records[i].INFO[info_field])
            else:
                raise ValueError("Missing info field %s" % info_field)
    if len(vals) == 1:
        return "%s=%s" % (info_field, vals.pop())
    else:
        common.WARNING("Incompatible info field value %s" % info_field)
        return None
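Because GetInfoItem only looks at each record's INFO mapping, its merge behavior can be illustrated with lightweight stand-ins for vcf.Record objects (a hypothetical sketch, not part of the module; it assumes the module's common helpers are importable for the warning path):

from types import SimpleNamespace

recs = [SimpleNamespace(INFO={"PERIOD": 4}),
        SimpleNamespace(INFO={"PERIOD": 4}),
        SimpleNamespace(INFO={"PERIOD": 5})]
print(GetInfoItem(recs, [True, True, False], "PERIOD"))  # -> "PERIOD=4"
print(GetInfoItem(recs, [True, False, True], "PERIOD"))  # warns and returns None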
Example 12
def CheckFilters(invcf, args, vcftype):
    r"""Perform checks on user input for filters

    Parameters
    ----------
    invcf : vcf.Reader
        Input VCF reader. Used to check for required FORMAT fields
    args : argparse namespace
        Contains user arguments
    vcftype : enum.
        Specifies which tool this VCF came from.
        Must be included in trh.VCFTYPES

    Returns
    -------
    checks : bool
        Set to True if all filters look ok.
        Set to False if filters are invalid
    """
    if not CheckLocusFilters(args, vcftype):
        return False

    # Check HipSTR specific filters
    if args.hipstr_max_call_flank_indel is not None or \
       args.hipstr_max_call_stutter is not None or \
       args.hipstr_min_supp_reads is not None or \
       args.hipstr_min_call_DP is not None or \
       args.hipstr_max_call_DP is not None or \
       args.hipstr_min_call_Q is not None:
        if vcftype != trh.VcfTypes["hipstr"]:
            common.WARNING("HipSTR options can only be applied to HipSTR VCFs")
            return False
        else:
            if not CheckHipSTRFilters(invcf, args):
                return False

    # Check GangSTR specific filters
    if args.gangstr_min_call_DP is not None or \
       args.gangstr_max_call_DP is not None or \
       args.gangstr_min_call_Q is not None or \
       args.gangstr_expansion_prob_het is not None or \
       args.gangstr_expansion_prob_hom is not None or \
       args.gangstr_expansion_prob_total is not None or \
       args.gangstr_filter_span_only or \
       args.gangstr_filter_spanbound_only or \
       args.gangstr_filter_badCI or \
       args.gangstr_require_support is not None or \
       args.gangstr_readlen is not None:
        if vcftype != trh.VcfTypes["gangstr"]:
            common.WARNING(
                "GangSTR options can only be applied to GangSTR VCFs")
            return False
        else:
            if not CheckGangSTRFilters(invcf, args):
                return False

    # Check adVNTR specific filters
    if args.advntr_min_call_DP is not None or \
       args.advntr_max_call_DP is not None or \
       args.advntr_min_spanning is not None or \
       args.advntr_min_flanking is not None or \
       args.advntr_min_ML is not None:
        if vcftype != trh.VcfTypes["advntr"]:
            common.WARNING("adVNTR options can only be applied to adVNTR VCFs")
            return False
        else:
            if not CheckAdVNTRFilters(invcf, args):
                return False

    # Check EH specific filters
    if args.eh_min_ADFL is not None or \
       args.eh_min_ADIR is not None or \
       args.eh_min_ADSP is not None or \
       args.eh_min_call_LC is not None or \
       args.eh_max_call_LC is not None:
        if vcftype != trh.VcfTypes["eh"]:
            common.WARNING(
                "ExpansionHunter options can only be applied to ExpansionHunter VCFs"
            )
            return False
        else:  # pragma: no cover
            if not CheckEHFilters(invcf, args):  # pragma: no cover
                return False  # pragma: no cover

    # Check popSTR specific filters
    if args.popstr_min_call_DP is not None or \
       args.popstr_max_call_DP is not None or \
       args.popstr_require_support is not None:
        if vcftype != trh.VcfTypes["popstr"]:
            common.WARNING("popSTR options can only be applied to popSTR VCFs")
            return False
        else:
            if not CheckPopSTRFilters(invcf, args):
                return False
    return True
Example 13
def getargs():  # pragma: no cover
    parser = argparse.ArgumentParser(
        __doc__, formatter_class=utils.ArgumentDefaultsHelpFormatter)
    inout_group = parser.add_argument_group("Input/output")
    inout_group.add_argument("--vcf",
                             help="Input STR VCF file",
                             type=str,
                             required=True)
    inout_group.add_argument(
        "--out",
        help="Output file prefix. Use stdout to print file to standard output.",
        type=str,
        required=True)
    inout_group.add_argument("--vcftype",
                             help="Options=%s" %
                             [str(item) for item in trh.VcfTypes.__members__],
                             type=str,
                             default="auto")
    filter_group = parser.add_argument_group("Filtering group")
    filter_group.add_argument(
        "--samples",
        help=
        "File containing a list of samples to include, or a comma-separated list of files to compute stats separately for each group of samples",
        type=str)
    filter_group.add_argument(
        "--sample-prefixes",
        help=
        "Prefixes to name output for each sample group. By default uses 1,2,3 etc.",
        type=str)
    filter_group.add_argument("--region",
                              help="Restrict to this region chrom:start-end",
                              type=str)
    stat_group_name = "Stats group"
    stat_group = parser.add_argument_group(stat_group_name)
    stat_group.add_argument(
        "--thresh",
        help=
        "Output threshold field (max allele size, used for GangSTR strinfo).",
        action="store_true")
    stat_group.add_argument("--afreq",
                            help="Output allele frequencies",
                            action="store_true")
    stat_group.add_argument("--acount",
                            help="Output allele counts",
                            action="store_true")
    stat_group.add_argument("--hwep",
                            help="Output HWE p-values per loci.",
                            action="store_true")
    stat_group.add_argument("--het",
                            help="Output heterozygosity of each locus.",
                            action="store_true")
    stat_group.add_argument("--mean",
                            help="Output mean of allele frequencies.",
                            action="store_true")
    stat_group.add_argument("--mode",
                            help="Output mode of allele frequencies.",
                            action="store_true")
    stat_group.add_argument("--var",
                            help="Output variance of allele frequencies.",
                            action="store_true")
    stat_group.add_argument("--numcalled",
                            help="Output number of samples called.",
                            action="store_true")
    stat_group.add_argument(
        "--use-length",
        help=
        "Calculate per-locus stats (het, HWE) collapsing alleles by length",
        action="store_true")
    plot_group = parser.add_argument_group("Plotting group")
    plot_group.add_argument(
        "--plot-afreq",
        help=
        "Output allele frequency plot. Will only do for a maximum of 10 TRs.",
        action="store_true")
    ver_group = parser.add_argument_group("Version")
    ver_group.add_argument("--version",
                           action="version",
                           version='{version}'.format(version=__version__))
    args = parser.parse_args()
    # If no stat selected, print an error message and terminate
    stat_dict = {}
    for grp in parser._action_groups:
        if grp.title == stat_group_name:
            stat_dict = {
                a.dest: getattr(args, a.dest, None)
                for a in grp._group_actions
            }

    if not any(stat_dict.values()):
        common.WARNING(
            "Error: Please use at least one of the flags in the Stats group. See statSTR --help for options."
        )
        return None
    return args
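getargs() returns None when no stat flag was selected, so a typical entry point checks for that before handing off to main() (a sketch of the usual wiring, assuming sys is imported at module level; it is not necessarily the exact run() helper the package uses):

if __name__ == "__main__":  # pragma: no cover
    args = getargs()
    if args is None:
        sys.exit(1)
    else:
        sys.exit(main(args))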
Example 14
def CheckGangSTRFilters(invcf, args):
    r"""Check GangSTR call-level filters

    Parameters
    ----------
    invcf : vcf.Reader
        Input VCF reader. Used to check for required FORMAT fields
    args : argparse namespace
        Contains user arguments

    Returns
    -------
    checks : bool
        Set to True if all filters look ok.
        Set to False if filters are invalid
    """
    if args.gangstr_min_call_DP is not None:
        if args.gangstr_min_call_DP < 0:
            common.WARNING("--gangstr-min-call-DP must be >= 0")
            return False
        assert "DP" in invcf.formats
    if args.gangstr_max_call_DP is not None:
        if args.gangstr_max_call_DP < 0:
            common.WARNING("--gangstr-max-call-DP must be >= 0")
            return False
        assert "DP" in invcf.formats
    if args.gangstr_min_call_DP is not None and args.gangstr_max_call_DP is not None:
        if args.gangstr_max_call_DP < args.gangstr_min_call_DP:
            common.WARNING(
                "--gangstr-max-call-DP must be >= --gangstr-min-call-DP")
            return False
    if args.gangstr_min_call_Q is not None:
        if args.gangstr_min_call_Q < 0 or args.gangstr_min_call_Q > 1:
            common.WARNING("--gangstr-min-call-Q must be between 0 and 1")
            return False
        assert "Q" in invcf.formats
    if args.gangstr_expansion_prob_het is not None:
        if args.gangstr_expansion_prob_het < 0 or args.gangstr_expansion_prob_het > 1:
            common.WARNING(
                "--gangstr-expansion-prob-het must be between 0 and 1")
            return False
        assert "QEXP" in invcf.formats
    if args.gangstr_expansion_prob_hom is not None:
        if args.gangstr_expansion_prob_hom < 0 or args.gangstr_expansion_prob_hom > 1:
            common.WARNING(
                "--gangstr-expansion-prob-hom must be between 0 and 1")
            return False
        assert "QEXP" in invcf.formats
    if args.gangstr_expansion_prob_total is not None:
        if args.gangstr_expansion_prob_total < 0 or args.gangstr_expansion_prob_total > 1:
            common.WARNING(
                "--gangstr-expansion-prob-total must be between 0 and 1")
            return False
        assert "QEXP" in invcf.formats
    if args.gangstr_require_support is not None:
        if args.gangstr_require_support < 0:
            common.WARNING("--gangstr-require-support must be >= 0")
            return False
        if args.gangstr_require_support > 0 and args.gangstr_readlen is None:
            common.WARNING(
                "Using --gangstr-require-support requires setting --gangstr-readlen"
            )
            return False
        if args.gangstr_readlen is not None and args.gangstr_readlen < 20:
            common.WARNING("--gangstr-readlen must be an integer value >= 20")
            return False
        assert "ENCLREADS" in invcf.formats and "FLNKREADS" in invcf.formats and "RC" in invcf.formats
    return True
Example 15
def main(args):
    if not os.path.exists(args.vcf):
        common.WARNING("Error: %s does not exist" % args.vcf)
        return 1

    if not os.path.exists(os.path.dirname(os.path.abspath(args.out))):
        common.WARNING(
            "Error: The directory which contains the output location {} does"
            " not exist".format(args.out))
        return 1

    if os.path.isdir(args.out) and args.out.endswith(os.sep):
        common.WARNING("Error: The output location {} is a "
                       "directory".format(args.out))
        return 1

    # Load samples
    sample_lists = []
    sample_prefixes = []
    if args.samples:
        sfiles = args.samples.split(",")
        if args.sample_prefixes:
            sample_prefixes = args.sample_prefixes.split(",")
        else:
            sample_prefixes = [str(item) for item in range(1, len(sfiles) + 1)]
        if len(sfiles) != len(sample_prefixes):
            common.MSG("--sample-prefixes must be same length as --samples")
            return 1
        for sf in sfiles:
            sample_lists.append(
                [item.strip() for item in open(sf, "r").readlines()])

    invcf = utils.LoadSingleReader(args.vcf, checkgz=False)
    if invcf is None:
        return 1
    if args.vcftype != 'auto':
        vcftype = trh.VcfTypes[args.vcftype]
    else:
        vcftype = trh.InferVCFType(invcf)

    header = ["chrom", "start", "end"]
    if args.thresh: header.extend(GetHeader("thresh", sample_prefixes))
    if args.afreq: header.extend(GetHeader("afreq", sample_prefixes))
    if args.acount: header.extend(GetHeader("acount", sample_prefixes))
    if args.hwep: header.extend(GetHeader("hwep", sample_prefixes))
    if args.het: header.extend(GetHeader("het", sample_prefixes))
    if args.mean: header.extend(GetHeader("mean", sample_prefixes))
    if args.mode: header.extend(GetHeader("mode", sample_prefixes))
    if args.var: header.extend(GetHeader("var", sample_prefixes))
    if args.numcalled: header.extend(GetHeader("numcalled", sample_prefixes))
    if args.out == "stdout":
        if args.plot_afreq:
            common.MSG("Cannot use --out stdout when generating plots")
            return 1
        outf = sys.stdout
    else:
        outf = open(args.out + ".tab", "w")
    outf.write("\t".join(header) + "\n")

    if args.region:
        if not os.path.isfile(args.vcf + ".tbi"):
            common.MSG("Make sure %s is bgzipped and indexed" % args.vcf)
            return 1
        regions = invcf.fetch(args.region)
    else:
        regions = invcf
    num_plotted = 0
    for record in regions:
        trrecord = trh.HarmonizeRecord(vcftype, record)
        if args.plot_afreq and num_plotted <= MAXPLOTS:
            PlotAlleleFreqs(trrecord,
                            args.out,
                            samplelists=sample_lists,
                            sampleprefixes=sample_prefixes)
            num_plotted += 1
        items = [
            record.CHROM, record.POS, record.POS + len(trrecord.ref_allele)
        ]
        if args.thresh:
            items.extend(GetThresh(trrecord, samplelists=sample_lists))
        if args.afreq:
            items.extend(
                GetAFreq(trrecord,
                         samplelists=sample_lists,
                         uselength=args.use_length))
        if args.acount:
            items.extend(
                GetAFreq(trrecord,
                         samplelists=sample_lists,
                         uselength=args.use_length,
                         count=True))
        if args.hwep:
            items.extend(
                GetHWEP(trrecord,
                        samplelists=sample_lists,
                        uselength=args.use_length))
        if args.het:
            items.extend(
                GetHet(trrecord,
                       samplelists=sample_lists,
                       uselength=args.use_length))
        if args.mean:
            items.extend(GetMean(trrecord, samplelists=sample_lists))
        if args.mode:
            items.extend(GetMode(trrecord, samplelists=sample_lists))
        if args.var:
            items.extend(GetVariance(trrecord, samplelists=sample_lists))
        if args.numcalled:
            items.extend(GetNumSamples(trrecord, samplelists=sample_lists))
        outf.write("\t".join([str(item) for item in items]) + "\n")
    outf.close()
    return 0
Example 16
def main(args):
    # Load VCF file
    invcf = utils.LoadSingleReader(args.vcf, checkgz=False)
    if invcf is None:
        return 1

    if not os.path.exists(os.path.dirname(os.path.abspath(args.out))):
        common.WARNING(
            "Error: The directory which contains the output location {} does"
            " not exist".format(args.out))
        return 1

    if os.path.isdir(args.out) and args.out.endswith(os.sep):
        common.WARNING("Error: The output location {} is a "
                       "directory".format(args.out))
        return 1

    # Set up record harmonizer and infer VCF type
    vcftype = trh.InferVCFType(invcf, args.vcftype)

    # Check filters all make sense
    if not CheckFilters(invcf, args, vcftype): return 1

    # Set up locus-level filter list
    try:
        filter_list = BuildLocusFilters(args, vcftype)
    except ValueError:
        return 1
    invcf.filters = {}
    for f in filter_list:
        short_doc = f.__doc__ or ''
        short_doc = short_doc.split('\n')[0].lstrip()
        invcf.filters[f.filter_name()] = _Filter(f.filter_name(), short_doc)

    # Set up call-level filters
    call_filters = BuildCallFilters(args)

    # Add new FORMAT fields
    if "FILTER" not in invcf.formats:
        invcf.formats["FILTER"] = _Format("FILTER", 1, "String",
                                          "Call-level filter")

    # Add new INFO fields
    invcf.infos["AC"] = _Info("AC",
                              -1,
                              "Integer",
                              "Alternate allele counts",
                              source=None,
                              version=None)
    invcf.infos["REFAC"] = _Info("REFAC",
                                 1,
                                 "Integer",
                                 "Reference allele count",
                                 source=None,
                                 version=None)
    invcf.infos["HET"] = _Info("HET",
                               1,
                               "Float",
                               "Heterozygosity",
                               source=None,
                               version=None)
    invcf.infos["HWEP"] = _Info("HWEP",
                                1,
                                "Float",
                                "HWE p-value for obs. vs. exp het rate",
                                source=None,
                                version=None)
    invcf.infos["HRUN"] = _Info("HRUN",
                                1,
                                "Integer",
                                "Length of longest homopolymer run",
                                source=None,
                                version=None)

    # Set up output files
    if not os.path.exists(os.path.dirname(os.path.abspath(args.out))):
        common.WARNING("Output directory does not exist")
        return 1
    outvcf = MakeWriter(args.out + ".vcf", invcf, " ".join(sys.argv))
    if outvcf is None: return 1

    # Set up sample info
    all_reasons = GetAllCallFilters(call_filters)
    sample_info = {}
    for s in invcf.samples:
        sample_info[s] = {"numcalls": 0, "totaldp": 0}
        for r in all_reasons:
            sample_info[s][r] = 0

    # Set up locus info
    loc_info = {"totalcalls": 0, "PASS": 0}
    for filt in filter_list:
        loc_info[filt.filter_name()] = 0

    # Go through each record
    record_counter = 0
    while True:
        try:
            record = next(invcf)
        except IndexError:
            common.WARNING(
                "Skipping TR that couldn't be parsed by PyVCF. Check VCF format"
            )
            if args.die_on_warning: return 1
        except StopIteration:
            break
        if args.verbose:
            common.MSG("Processing %s:%s" % (record.CHROM, record.POS))
        record_counter += 1
        if args.num_records is not None and record_counter > args.num_records:
            break
        # Call-level filters
        record = ApplyCallFilters(record, invcf, call_filters, sample_info)

        # Locus-level filters
        record.FILTER = None
        output_record = True
        for filt in filter_list:
            if filt(record) is None: continue
            if args.drop_filtered:
                output_record = False
                break
            record.add_filter(filt.filter_name())
            loc_info[filt.filter_name()] += 1
        if args.drop_filtered:
            if record.call_rate == 0: output_record = False
        if output_record:
            trrecord = trh.HarmonizeRecord(vcftype, record)
            # Recalculate locus-level INFO fields
            record.INFO["HRUN"] = utils.GetHomopolymerRun(record.REF)
            if record.num_called > 0:
                allele_freqs = trrecord.GetAlleleFreqs(
                    uselength=args.use_length)
                genotype_counts = trrecord.GetGenotypeCounts(
                    uselength=args.use_length)
                record.INFO["HET"] = utils.GetHeterozygosity(allele_freqs)
                record.INFO["HWEP"] = utils.GetHardyWeinbergBinomialTest(
                    allele_freqs, genotype_counts)
                record.INFO["AC"] = [
                    int(item * (3 * record.num_called)) for item in record.aaf
                ]
                record.INFO["REFAC"] = int(
                    (1 - sum(record.aaf)) * (2 * record.num_called))
            else:
                record.INFO["HET"] = -1
                record.INFO["HWEP"] = -1
                record.INFO["AC"] = [0] * len(record.ALT)
                record.INFO["REFAC"] = 0
            # Recalc filter
            if record.FILTER is None and not args.drop_filtered:
                record.FILTER = "PASS"
                loc_info["PASS"] += 1
                loc_info["totalcalls"] += record.num_called
            # Output the record
            outvcf.write_record(record)

    # Output log info
    WriteSampLog(sample_info, all_reasons, args.out + ".samplog.tab")
    WriteLocLog(loc_info, args.out + ".loclog.tab")

    return 0
Example 17
def main(args):
    if not os.path.exists(args.vcf):
        common.WARNING("The input vcf location %s does not exist"%args.vcf)
        return 1

    if not os.path.exists(os.path.dirname(os.path.abspath(args.out))):
        common.WARNING("Error: The directory which contains the output location {} does"
                       " not exist".format(args.out))
        return 1

    if os.path.isdir(args.out) and args.out.endswith(os.sep):
        common.WARNING("Error: The output location {} is a "
                       "directory".format(args.out))
        return 1

    # Set up reader and harmonizer
    invcf = utils.LoadSingleReader(args.vcf, checkgz = False)
    if invcf is None:
        return 1

    if args.vcftype != 'auto':
        harmonizer = trh.TRRecordHarmonizer(invcf, args.vcftype)
    else:
        harmonizer = trh.TRRecordHarmonizer(invcf)

    if len(args.quality) > 0 and not harmonizer.HasQualityScore():
        common.WARNING("Requested a quality plot, but the input vcf doesn't have "
                       "quality scores!")
        return 1

    # Check refbias options
    if args.refbias_binsize < 1:
        common.WARNING("--refbias-binsize must be >=1")
        return 1
    if args.refbias_mingts < 1:
        common.WARNING("--refbias-mingts must be >=1")
        return 1
    if args.refbias_xrange_min >= args.refbias_xrange_max:
        common.WARNING("--refbias-xrange-min ({}) cannot be >= --refbias-xrange-max ({})".format(
            args.refbias_xrange_min, args.refbias_xrange_max))
        return 1

    # Load samples
    if args.samples:
        samplelist = [item.strip()
                      for item
                      in open(args.samples, "r").readlines()
                      if item.strip() in invcf.samples]
    else: samplelist = invcf.samples

    # Figure out which quality plot to produce by default
    default_quality = False
    if len(args.quality) == 0 and harmonizer.HasQualityScore():
        default_quality = True
        if len(samplelist) <= 5:
            args.quality = [_QualityTypes.sample_stratified.value]
        else:
            args.quality = [_QualityTypes.per_locus.value]

    # Set up data to keep track of
    sample_calls = dict([(sample, 0) for sample in samplelist]) # sample->numcalls
    contigs = invcf.contigs
    if len(contigs) == 0:
        common.WARNING("Warning: no contigs found in VCF file.")
    chrom_calls = dict([(chrom, 0) for chrom in contigs]) # chrom->numcalls
    diffs_from_ref = [] # for each allele call, keep track of diff (bp) from ref
    diffs_from_ref_unit = [] # for each allele call, keep track of diff (units) from ref
    reflens = [] # for each allele call, keep track of reference length (bp)
    if _QualityTypes.per_locus.value in args.quality:
        per_locus_data = []
    if _QualityTypes.per_sample.value in args.quality:
        per_sample_data = {}
        for sample in samplelist: 
            per_sample_data[sample] = []
    if _QualityTypes.per_call.value in args.quality:
        per_call_data = []
    if _QualityTypes.sample_stratified.value in args.quality:
        sample_strat_data = {}
        for sample in samplelist: 
            sample_strat_data[sample] = []
    if _QualityTypes.locus_stratified.value in args.quality:
        locus_strat_data = {}

    # read the vcf
    numrecords = 0
    for trrecord in harmonizer:
        if args.numrecords is not None and numrecords >= args.numrecords: break
        if args.period is not None and len(trrecord.motif) != args.period: continue

        record = trrecord.vcfrecord

        # Extract stats
        chrom = record.CHROM
        rl = len(trrecord.ref_allele)
        allele_counts = trrecord.GetAlleleCounts(uselength=False, samplelist=samplelist)

        # Update data
        num_calls = 0
        if _QualityTypes.per_locus.value in args.quality:
            per_locus_data.append([])
        if _QualityTypes.locus_stratified.value in args.quality:
            locus_strat_data[trrecord.record_id] = []

        # loop over sample data
        for call in record:
            s = call.sample
            if s not in samplelist:
                continue
            if call.called:
                sample_calls[s] += 1
                num_calls += 1

            if len(args.quality) == 0:
                continue

            # set non-calls to zero quality
            if call.called:
                quality_score = trrecord.GetQualityScore(call)
            elif args.quality_ignore_no_call:
                continue
            else:
                quality_score = 0

            if _QualityTypes.per_sample.value in args.quality:
                per_sample_data[s].append(quality_score)
            if _QualityTypes.sample_stratified.value in args.quality:
                sample_strat_data[s].append(quality_score)
            if _QualityTypes.per_locus.value in args.quality:
                per_locus_data[-1].append(quality_score)
            if _QualityTypes.locus_stratified.value in args.quality:
                locus_strat_data[trrecord.record_id].append(quality_score)
            if _QualityTypes.per_call.value in args.quality:
                per_call_data.append(quality_score)

        chrom_calls[chrom] = chrom_calls.get(chrom, 0) + num_calls
        for allele in allele_counts.keys():
            allelediff = len(allele)-rl
            count = allele_counts[allele]
            reflens.extend([rl]*count)
            diffs_from_ref.extend([allelediff]*count)
            diffs_from_ref_unit.extend([allelediff/len(trrecord.motif)]*count)

        numrecords += 1

    print("Producing " + args.out + "-diffref-bias.pdf ... ", end='',
          flush=True)
    OutputDiffRefBias(diffs_from_ref, reflens, args.out + "-diffref-bias.pdf", \
                      xlim=(args.refbias_xrange_min, args.refbias_xrange_max), \
                      mingts=args.refbias_mingts, metric=args.refbias_metric, \
                      binsize=args.refbias_binsize)
    if len(samplelist) > 1:
        print("Done.\nProducing " + args.out + "-sample-callnum.pdf ... ",
              end='', flush=True)
        OutputSampleCallrate(sample_calls, args.out+"-sample-callnum.pdf")
        print("Done.")
    else:
        print("Done.\nOnly one sample, so skipping " + args.out + "-sample-callnum.pdf ...")
    if sum(1 for value in chrom_calls.values() if value > 0) > 1:
        print("Producing " + args.out + "-chrom-callnum.pdf ... ", end='',
              flush=True)
        OutputChromCallrate(chrom_calls, args.out+"-chrom-callnum.pdf")
        print("Done.\n", end='')
    else:
        print("Only one chromosome, so skipping " + args.out + "-chrom-callnum.pdf ...")
    print("Producing " + args.out + "-diffref-histogram.pdf ... ", end='',
          flush=True)
    OutputDiffRefHistogram(diffs_from_ref_unit, args.out + "-diffref-histogram.pdf")
    print("Done.")

    if default_quality:
        def quality_output_loc(quality_value):
            return args.out+"-quality.pdf"
    else:
        def quality_output_loc(quality_value):
            return args.out+"-quality-{}.pdf".format(quality_value)

    prior_qual_plot = False
    if _QualityTypes.per_sample.value in args.quality:
        print("Producing " +
              quality_output_loc(_QualityTypes.per_sample.value) +
              " ... ", end='', flush=True)
        new_per_sample_data = []
        for sample_data in per_sample_data.values():
            new_per_sample_data.append(stat.mean(sample_data))
        OutputQualityPerSample(new_per_sample_data,
                               quality_output_loc(_QualityTypes.per_sample.value))
        prior_qual_plot = True

    if _QualityTypes.sample_stratified.value in args.quality:
        if prior_qual_plot:
            print("Done.")
        print("Producing " +
              quality_output_loc(_QualityTypes.sample_stratified.value) +
              " ... ", end='', flush=True)
        OutputQualitySampleStrat(sample_strat_data,
                                 quality_output_loc(_QualityTypes.sample_stratified.value))
        prior_qual_plot = True

    if _QualityTypes.per_locus.value in args.quality:
        if prior_qual_plot:
            print("Done.")
        print("Producing " +
              quality_output_loc(_QualityTypes.per_locus.value) +
              " ... ", end='', flush=True)
        new_per_locus_data = []
        for locus_data in per_locus_data:
            new_per_locus_data.append(stat.mean(locus_data))
        OutputQualityPerLocus(new_per_locus_data,
                              quality_output_loc(_QualityTypes.per_locus.value))
        prior_qual_plot = True

    if _QualityTypes.locus_stratified.value in args.quality:
        if prior_qual_plot:
            print("Done.")
        print("Producing " +
              quality_output_loc(_QualityTypes.locus_stratified.value) +
              " ... ", end='', flush=True)
        OutputQualityLocusStrat(locus_strat_data,
                                quality_output_loc(_QualityTypes.locus_stratified.value))
        prior_qual_plot = True

    if _QualityTypes.per_call.value in args.quality:
        if prior_qual_plot:
            print("Done.")
        print("Producing " +
              quality_output_loc(_QualityTypes.per_call.value) +
              " ... ", end='', flush=True)
        OutputQualityPerCall(per_call_data,
                             quality_output_loc(_QualityTypes.per_call.value))

    if len(args.quality) == 0:
        print("This vcf does not have quality scores, so skipping all "
              "quality plots.")

    print("Done.")
    return 0
Example 18
def WriteMergedHeader(vcfw, args, readers, cmd, vcftype):
    r"""Write merged header for VCFs in args.vcfs

    Also do some checks on the VCFs to make sure merging
    is appropriate.
    Return info and format fields to use

    Parameters
    ----------
    vcfw : file object
       Writer to write the merged VCF
    args : argparse namespace
       Contains user options
    readers : list of vcf.Reader
       List of readers to merge
    cmd : str
       Command used to call this program
    vcftype : str
       Type of VCF files being merged

    Returns
    -------
    useinfo : list of (str, bool)
       List of (info field, required) to use downstream
    useformat: list of str
       List of format field strings to use downstream
    """
    def get_contigs(reader):
        return set(reader.contigs.values())

    def get_alts(reader):
        return set(reader.alts.values())

    def get_sources(reader):
        if "source" in reader.metadata:
            return set(reader.metadata["source"])
        else:
            return set()

    # Check contigs the same for all readers
    contigs = get_contigs(readers[0])
    for i in range(1, len(readers)):
        if get_contigs(readers[i]) != contigs:
            raise ValueError(
                "Different contigs found across VCF files. Make sure all "
                "files used the same reference. Consider using this "
                "command:\n\t"
                "bcftools reheader -f ref.fa.fai file.vcf.gz -o file_rh.vcf.gz"
            )
    # Write VCF format, commands, and contigs
    vcfw.write("##fileformat=VCFv4.1\n")

    # Update commands
    for r in readers:
        if "command" in r.metadata:
            for i in range(len(r.metadata["command"])):
                vcfw.write("##command=" + r.metadata["command"][i] + "\n")
    vcfw.write("##command=" + cmd + "\n")

    # Update sources
    sources = set.union(*[get_sources(reader) for reader in readers])
    for src in sources:
        vcfw.write("##source=" + src + "\n")

    for contig in contigs:
        # contigs in VCFs can contain more info than just ID and length
        # (such as URL)
        # even though pyvcf ignores all other fields.
        # in the future (e.g. when swapping to cyvcf2),
        # write  out the entire contig not just those two fields
        vcfw.write("##contig=<ID=%s,length=%s>\n" % (contig.id, contig.length))
    # Write ALT fields if present
    alts = set.union(*[get_alts(reader) for reader in readers])
    for alt in alts:
        vcfw.write("##ALT=<ID=%s,Description=\"%s\">\n" % (alt.id, alt.desc))
    # Write INFO fields, different for each tool
    useinfo = []
    for (field, reqd) in INFOFIELDS[vcftype]:
        if field not in readers[0].infos:
            common.WARNING("Expected info field %s not found. Skipping" %
                           field)
        else:
            vcfw.write(GetInfoString(readers[0].infos[field]) + "\n")
            useinfo.append((field, reqd))
    # Write GT header
    vcfw.write(
        "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n")
    # Write FORMAT fields, different for each tool
    useformat = []
    for field in FORMATFIELDS[vcftype]:
        if field not in readers[0].formats:
            common.WARNING("Expected format field %s not found. Skipping" %
                           field)
        else:
            vcfw.write(GetFormatString(readers[0].formats[field]) + "\n")
            useformat.append(field)
    # Write sample list
    samples = mergeutils.GetSamples(readers,
                                    usefilenames=args.update_sample_from_file)
    if len(samples) == 0:
        return None, None
    header_fields = [
        "CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT"
    ]
    vcfw.write("#" + "\t".join(header_fields + samples) + "\n")
    return useinfo, useformat
Example 19
def main(args):
    if not os.path.exists(os.path.dirname(os.path.abspath(args.out))):
        common.WARNING(
            "Error: The directory which contains the output location {} does"
            " not exist".format(args.out))
        return 1

    if os.path.isdir(args.out) and args.out.endswith(os.sep):
        common.WARNING("Error: The output location {} is a "
                       "directory".format(args.out))
        return 1

    ### Check and Load VCF files ###
    vcfreaders = utils.LoadReaders(args.vcfs.split(","), checkgz=True)
    if vcfreaders is None:
        return 1
    if len(vcfreaders) == 0: return 1
    contigs = vcfreaders[0].contigs
    # WriteMergedHeader will confirm that the list of contigs is the same for
    # each vcf, so just pulling it from one here is fine
    chroms = list(contigs)

    ### Check inferred type of each is the same
    vcftype = mergeutils.GetAndCheckVCFType(vcfreaders, args.vcftype)

    ### Set up VCF writer ###
    vcfw = open(args.out + ".vcf", "w")
    useinfo, useformat = WriteMergedHeader(vcfw, args, vcfreaders,
                                           " ".join(sys.argv), vcftype)
    if useinfo is None or useformat is None:
        common.WARNING("Error writing merged header. Quitting")
        return 1

    ### Walk through sorted readers, merging records as we go ###
    current_records = [next(reader) for reader in vcfreaders]
    # Check if contig ID is set in VCF header for all records
    done = mergeutils.DoneReading(current_records)
    while not done:
        for r, reader in zip(current_records, vcfreaders):
            if r is None: continue
            if r.CHROM not in chroms:
                common.WARNING(
                    ("Error: found a record in file {} with "
                     "chromosome '{}' which was not found in the contig list "
                     "({})").format(reader.filename, r.CHROM,
                                    ", ".join(chroms)))
                common.WARNING(
                    "VCF files must contain a ##contig header line for each chromosome."
                )
                common.WARNING(
                    "If this is only a technical issue and all the vcf "
                    "files were truly built against the "
                    "same reference, use bcftools "
                    "(https://github.com/samtools/bcftools) to fix the contigs"
                    ", e.g.: bcftools reheader -f hg19.fa.fai -o myvcf-reheader.vcf.gz myvcf.vcf.gz"
                )
                return 1
        is_min = mergeutils.GetMinRecords(current_records, chroms)
        if args.verbose:
            mergeutils.DebugPrintRecordLocations(current_records, is_min)
        if mergeutils.CheckMin(is_min): return 1
        MergeRecords(vcfreaders, current_records, is_min, vcfw, args, useinfo,
                     useformat)
        current_records = mergeutils.GetNextRecords(vcfreaders,
                                                    current_records, is_min)
        done = mergeutils.DoneReading(current_records)
    return 0
Example 20
def MergeRecords(readers, current_records, mergelist, vcfw, args, useinfo,
                 useformat):
    r"""Merge records from different files

    Merge all records with indicator set to True in mergelist
    Output merged record to vcfw

    Parameters
    ----------
    readers : list of vcf.Reader
       List of readers being merged
    current_records : list of vcf.Record
       List of current records for each reader
    mergelist : list of bool
       Indicates whether to include each reader in merge
    vcfw : file
       File to write output to
    args : argparse namespace
       Contains user options
    useinfo : list of (str, bool)
       List of (info field, required) to use downstream
    useformat: list of str
       List of format field strings to use downstream
    """
    output_items = []
    use_ind = [i for i in range(len(mergelist)) if mergelist[i]]
    if len(use_ind) == 0: return
    chrom = current_records[use_ind[0]].CHROM
    pos = current_records[use_ind[0]].POS
    alt_alleles = GetAltAlleles(current_records, mergelist)
    ref_allele = GetRefAllele(current_records, mergelist)
    if ref_allele is None:
        common.WARNING("Conflicting refs found at {}:{}. Skipping.".format(
            chrom, pos))
        return
    # Set common fields
    output_items.append(chrom)  # CHROM
    output_items.append(str(pos))  # POS
    output_items.append(GetID(current_records[use_ind[0]].ID))  # ID
    output_items.append(ref_allele)  # REF
    if len(alt_alleles) == 0:
        output_items.append(".")
    else:
        output_items.append(",".join(alt_alleles))  # ALT
    output_items.append(".")  # QUAL
    output_items.append(".")  # FILTER
    # Set INFO
    info_items = []
    for (field, reqd) in useinfo:
        inf = GetInfoItem(current_records, mergelist, field, fail=reqd)
        if inf is not None:
            info_items.append(inf)
    info_items = [item for item in info_items if item is not None]
    output_items.append(";".join(info_items))
    # Set FORMAT - add GT to front
    output_items.append(":".join(["GT"] + useformat))
    # Set sample info
    alleles = [ref_allele] + alt_alleles
    for i in range(len(mergelist)):
        if mergelist[i]:
            output_items.extend(
                GetSampleInfo(current_records[i], alleles, useformat))
        else:
            output_items.extend([NOCALLSTRING] *
                                len(readers[i].samples))  # NOCALL
    vcfw.write("\t".join(output_items) + "\n")
Example 21
def main(args):
    if not os.path.exists(os.path.dirname(os.path.abspath(args.out))):
        common.WARNING(
            "Error: The directory which contains the output location {} does"
            " not exist".format(args.out))
        return 1

    if os.path.isdir(args.out) and args.out.endswith(os.sep):
        common.WARNING("Error: The output location {} is a "
                       "directory".format(args.out))
        return 1

    ### Check and load VCF files ###
    vcfreaders = utils.LoadReaders([args.vcf1, args.vcf2],
                                   checkgz=True,
                                   region=args.region)
    if vcfreaders is None or len(vcfreaders) != 2:
        return 1
    contigs = vcfreaders[0].contigs
    chroms = list(contigs)

    ### Load shared samples ###
    samples = mergeutils.GetSharedSamples(vcfreaders)
    if len(samples) == 0:
        common.WARNING("No shared smaples found between vcf readers")
        return 1
    if args.samples:
        usesamples = set(
            [item.strip() for item in open(args.samples, "r").readlines()])
        samples = list(set(samples).intersection(usesamples))
    if len(samples) == 0:
        common.WARNING("No shared samples found between files")
        return 1

    ### Determine FORMAT fields we should look for ###
    if args.stratify_file is not None and args.stratify_file not in [0, 1, 2]:
        common.MSG("--stratify-file must be 0,1, or 2")
        return 1
    format_fields, format_binsizes = GetFormatFields(args.stratify_fields,
                                                     args.stratify_binsizes,
                                                     args.stratify_file,
                                                     vcfreaders)

    ### Keep track of data to summarize at the end ###
    results_dir = {
        "chrom": [],
        "start": [],
        "period": [],
        "sample": [],
        "gtstring1": [],
        "gtstring2": [],
        "gtsum1": [],
        "gtsum2": [],
        "metric-conc-seq": [],
        "metric-conc-len": [],
    }
    for ff in format_fields:
        results_dir[ff + "1"] = []
        results_dir[ff + "2"] = []

    vcftype1 = trh.GetVCFType(vcfreaders[0], args.vcftype1)
    vcftype2 = trh.GetVCFType(vcfreaders[1], args.vcftype2)

    ### Walk through sorted readers, merging records as we go ###
    current_records = [next(reader) for reader in vcfreaders]
    is_min = mergeutils.GetMinRecords(current_records, chroms)

    done = mergeutils.DoneReading(current_records)
    num_records = 0
    while not done:
        if any([item is None for item in current_records]): break
        if args.numrecords is not None and num_records >= args.numrecords:
            break
        if args.verbose:
            mergeutils.DebugPrintRecordLocations(current_records, is_min)
        if mergeutils.CheckMin(is_min): return 1
        if all(is_min):
            if (current_records[0].CHROM == current_records[1].CHROM and \
                current_records[0].POS == current_records[1].POS):
                UpdateComparisonResults(trh.HarmonizeRecord(vcftype1, current_records[0]), \
                                        trh.HarmonizeRecord(vcftype2, current_records[1]), \
                                        format_fields, samples, results_dir)
        current_records = mergeutils.GetNextRecords(vcfreaders,
                                                    current_records, is_min)
        is_min = mergeutils.GetMinRecords(current_records, chroms)
        done = mergeutils.DoneReading(current_records)
        num_records += 1

    ### Load all results to a dataframe and output full results ###
    data = pd.DataFrame(results_dir)
    data.to_csv(args.out + "-callcompare.tab", sep="\t", index=False)

    ### Overall metrics ###
    OutputOverallMetrics(data, format_fields, format_binsizes,
                         args.stratify_file, args.period, args.out)
    if not args.noplot:
        OutputBubblePlot(data,
                         args.period,
                         args.out,
                         minval=args.bubble_min,
                         maxval=args.bubble_max)

    ### Per-locus metrics ###
    OutputLocusMetrics(data, args.out, args.noplot)

    ### Per-sample metrics ###
    OutputSampleMetrics(data, args.out, args.noplot)

    return 0
Example 22
def test_WARNING():
    common.WARNING("Writing a test warning")
    common.WARNING("Writing a test warning")