Ejemplo n.º 1
0
 def add_hap_markers(self, hap_var):
     """
     Fill self.markers with the haplogroup-associated bases that differ
     from the reference sequence.

     Args:
         hap_var: Mapping of haplogroup ID to an iterable of variant
                  strings accepted by phylotree.pos_from_var and
                  phylotree.der_allele.
     Returns: nothing. For each haplogroup in hap_var, self.markers[hap]
              is replaced by a new dict keyed by position whose values
              are the bases returned by phylotree.der_allele.
     """
     for haplogroup, variants in hap_var.items():
         site_alleles = {}
         self.markers[haplogroup] = site_alleles
         for variant in variants:
             site = phylotree.pos_from_var(variant)
             allele = phylotree.der_allele(variant)
             # Skip alleles identical to the reference base at that site.
             if allele == self.refseq[site]:
                 continue
             site_alleles[site] = allele
Ejemplo n.º 2
0
 def test_pos_from_var(self):
     """Check the position phylotree.pos_from_var returns for each label."""
     # Each expected value is one less than the number embedded in the
     # label, with or without parentheses, '!' suffixes or a lower-case
     # final base.
     expected_positions = (
         ("T182C!", 181),
         ("(T195C!)", 194),
         ("C182T!!", 181),
         ("(T195C)", 194),
         ("T10454c", 10453),
         ("T14034C", 14033),
     )
     for label, position in expected_positions:
         self.assertEqual(phylotree.pos_from_var(label), position)
Ejemplo n.º 3
0
def write_variants(out, phylo, contribs, obs_tab, args):
    """
    Write a table of the variants used in this analysis and note whether the
    position is expected to be polymorphic in the sample given the set of
    identified contributors.

    One tab-separated line is written per reference position: the 1-based
    position, the A, C, G and T observation counts, the phylogeny status
    ("fixed" or "polymorphic"), the sample status ("sample_fixed" or
    "variant") and a comma-separated list of "haplogroup:variant" labels
    for the contributors' variants at that position.

    Args:
        out: File handle to write output to.
        phylo: The Phylotree object used in EM analysis
        contribs: Table of identified contributors with fields  hap#,
                  haplogroup, fraction
        obs_tab: Table of base observations; obs_at(pos) must return a
                 mapping indexable by 'A', 'C', 'G' and 'T', and
                 total_obs(pos) the total number of observations.
        args: The argparse namespace (min_var_reads and frac_var_reads are
              read here)
    Returns: nothing
    """
    haplogroups = [con[1] for con in contribs]
    variants = collections.defaultdict(list)
    for hap in haplogroups:
        for var in phylo.hap_var[hap]:
            pos = phylotree.pos_from_var(var)
            variants[pos].append("%s:%s" % (hap, var))

    polymorphic = set(phylo.polymorphic_sites(haplogroups))
    for ref_pos in range(len(phylo.refseq)):
        obs = obs_tab.obs_at(ref_pos)

        # The threshold must be based on the depth at the position being
        # reported (ref_pos), not on a position left over from the
        # variant-collection loop above.
        threshold = max(args.min_var_reads,
                        obs_tab.total_obs(ref_pos) * args.frac_var_reads)
        # A position is variant in the sample when more than one base
        # reaches the threshold.
        n_bases_present = sum(obs[base] >= threshold for base in 'ACGT')
        samp_status = "variant" if n_bases_present > 1 else "sample_fixed"

        phy_status = "polymorphic" if ref_pos in polymorphic else "fixed"

        out.write("%d\t%s\t%s\t%s\t%s\n" %
                  (ref_pos + 1, '\t'.join([str(obs[base]) for base in 'ACGT']),
                   phy_status, samp_status,
                   ','.join(variants.get(ref_pos, ()))))
Ejemplo n.º 4
0
def _check_contrib_phy_vars(phylo, obs_tab, contrib_prop, args):
    """
    Checks if each candidate contributor from contribs passes our variant base
    check. The strategy for this is to start with the highest estimated
    contributors and an empty list of variant positions. For each contributor,
    we identify the variant bases that are unique from the previous candidates.
    We check the observation table to verify that those bases are observed in
    the sample.

    Args:
        phylo: the phylotree object that holds the variant information for
               these haplogroups
        obs_tab: Table of base observations for positions in the reference.
        contrib_prop: A list of lists, one for each contributor, containing:
          - The haplogroup ID for the contributor.
          - The proportion estimate from EM
        args: argparse Namespace with user specified values for:
            min_var_reads: The minimum number of observations required to call
                           a base as present in the mixture sample (int)
            frac_var_reads: The minimum fraction of observations required to
                            call a base as present in the mixture sample
                            (float)
            var_fraction: The minimum fraction of defining variants required
                          to be observed to call a haplogroup a contributor
                          (float)
            var_count: Call a haplogroup a contributor if the number of
                       observed variants is equal or greater than var_count
                       (int or None to disable this rule)
            verbose: When true, progress details are written to stderr.
    Returns:
        contrib_prop, with haplogroups that do not pass filters removed.
    """
    used_vars = set()
    ignore_haps = set()
    verbose = args.verbose

    if verbose:
        sys.stderr.write("Checking diagnostic variants:\n")

    for hap, _ in contrib_prop:
        # get variant for this haplogroup
        uniq_vars = {(phylotree.pos_from_var(var), phylotree.der_allele(var))
                     for var in phylo.hap_var[hap]}
        uniq_vars -= used_vars
        if verbose:
            sys.stderr.write("%s (%d unique variants)\n" %
                             (hap, len(uniq_vars)))

        found_vars = set()
        for pos, der in sorted(uniq_vars):
            n_der = obs_tab.obs_at(pos, der)
            n_total = obs_tab.total_obs(pos)
            if verbose:
                var = "%d%s" % (pos + 1, der)
                sys.stderr.write("  %s: %d/%d\n" %
                                 (var.rjust(6), n_der, n_total))
            # The threshold is specific to this position's depth.
            threshold = max(args.min_var_reads, n_total * args.frac_var_reads)
            if n_der >= threshold:
                found_vars.add((pos, der))

        # A haplogroup with nothing left to check is kept; the emptiness
        # test comes first so the fraction is never computed with a zero
        # denominator.
        keep = (
            not uniq_vars
            or (args.var_count is not None
                and len(found_vars) >= args.var_count)
            or float(len(found_vars)) / len(uniq_vars) >= args.var_fraction
        )
        if keep:
            if verbose:
                # Report the global floor rather than the per-position
                # threshold: the latter is undefined when no variants were
                # examined and otherwise only reflects the last position.
                sys.stderr.write(
                    "Keeping '%s': "
                    "%d/%d unique variant bases observed at "
                    "least %d times.\n" %
                    (hap, len(found_vars), len(uniq_vars),
                     args.min_var_reads))
            # Looks good, these variants can't be used again.
            used_vars.update(found_vars)
            # Also add the ancestral bases for this haplogroup so we do not
            # mistake backmutations in another haplogroup as a novel allele.
            used_vars.update(phylo.get_ancestral(hap))
        else:
            if verbose:
                sys.stderr.write(
                    "Ignoring '%s': "
                    "only %d/%d unique variant bases observed.\n" %
                    (hap, len(found_vars), len(uniq_vars)))
            ignore_haps.add(hap)

    return [con for con in contrib_prop if con[0] not in ignore_haps]