def _get_family_info(self):
        """
        Collect the genotype filter and reporting metadata of each family.

        Every family found in ``self.args.db`` is asked for the genotype
        filter that matches ``self.model``.  Families whose filter is
        ``None`` or the literal string ``"False"`` are left out.  For the
        remaining families the filter, genotype labels, genotype columns,
        genotype depths and family id are stored, position by position, in
        ``self.family_masks``, ``self.family_gt_labels``,
        ``self.family_gt_columns``, ``self.family_dp_columns`` and
        ``self.family_ids``.
        """
        # name of the family method that builds the filter for each model;
        # a model without an entry yields no filter, hence no families.
        filter_getters = {
            "auto_rec": "get_auto_recessive_filter",
            "auto_dom": "get_auto_dominant_filter",
            "de_novo": "get_de_novo_filter",
            "mendel_violations": "get_mendelian_violation_filter",
        }

        families = subjects.get_families(self.args.db)
        self.family_ids = []
        self.family_masks = []
        self.family_gt_labels = []
        self.family_gt_columns = []
        self.family_dp_columns = []

        for family in families:
            getter_name = filter_getters.get(self.model)
            if getter_name is None:
                continue

            mask = getattr(family, getter_name)()
            if mask is None or mask == "False":
                continue

            self.family_masks.append(mask)
            self.family_gt_labels.append(family.get_genotype_labels())
            self.family_gt_columns.append(family.get_genotype_columns())
            self.family_dp_columns.append(family.get_genotype_depths())
            self.family_ids.append(family.family_id)
    def _get_family_info(self):
        """
        Gather, per family, the genotype filter and the labels needed to
        report its genotypes.

        The filter is taken from the family method that corresponds to
        ``self.model``.  A family is recorded only when that filter is
        neither ``None`` nor the string ``"False"``; its filter, genotype
        labels, genotype columns, genotype depths and id are appended to
        the parallel lists ``family_masks``, ``family_gt_labels``,
        ``family_gt_columns``, ``family_dp_columns`` and ``family_ids``
        on ``self``.
        """
        # inheritance model -> name of the family method building its filter
        getter_by_model = {
            "auto_rec": "get_auto_recessive_filter",
            "auto_dom": "get_auto_dominant_filter",
            "de_novo": "get_de_novo_filter",
            "mendel_violations": "get_mendelian_violation_filter",
        }

        families = subjects.get_families(self.args.db)
        self.family_ids = []
        self.family_masks = []
        self.family_gt_labels = []
        self.family_gt_columns = []
        self.family_dp_columns = []

        for family in families:
            method_name = getter_by_model.get(self.model)
            # an unknown model produces no filter at all
            family_filter = None
            if method_name is not None:
                family_filter = getattr(family, method_name)()

            if family_filter is None or family_filter == "False":
                continue

            self.family_masks.append(family_filter)
            self.family_gt_labels.append(family.get_genotype_labels())
            self.family_gt_columns.append(family.get_genotype_columns())
            self.family_dp_columns.append(family.get_genotype_depths())
            self.family_ids.append(family.family_id)
Example #3
0
def get_auto_dominant_candidates(c):
    """
    Report candidate variants that meet an autosomal dominant
    inheritance model.

    For each family returned by ``subjects.get_families(c)`` the variant
    query is re-run on ``c`` and this generator yields a header row
    followed by one row per variant that satisfies the family's dominant
    genotype filter.  Families whose filter is ``None`` are skipped.

    Args:
        c: database cursor.  It is used to look up the families and to run
           the variant query; its rows are accessed by column name.

    Yields:
        list of str: for each family, the header (``family_id``, the
        non-genotype query columns, the per-sample genotype labels) and
        then one result list per matching variant.
    """
    # the query is the same for every family, so build it only once
    query = "SELECT chrom, start, end, ref, alt, gene, \
                        impact, impact_severity, gt_types, gts \
                 FROM variants \
                 WHERE impact_severity != 'LOW'"

    families = subjects.get_families(c)

    for family in families:

        c.execute(query)
        # only the non-genotype columns are reported verbatim; genotypes
        # are reported per family member further down
        all_query_cols = [str(desc[0]) for desc in c.description
                          if not desc[0].startswith("gt")]

        family_genotype_mask = family.get_auto_dominant_filter()
        family_sample_gt_columns = family.get_subject_genotype_columns()
        family_sample_gt_labels = family.get_subject_genotype_labels()

        # skip this family if it cannot meet an autosomal dominant model;
        # eval() below would raise a TypeError on a None mask.
        if family_genotype_mask is None:
            continue

        # yield a header
        header = ["family_id"]
        header.extend(all_query_cols)
        header.extend(family_sample_gt_labels)
        yield header

        # yield the resulting auto_dom variants for this family
        for row in c:

            # unpack the genotype arrays.  These locals are not used
            # directly; the expressions handed to eval() below presumably
            # refer to them by name, so they must not be renamed.
            gt_types = compression.unpack_genotype_blob(row['gt_types'])
            gts = compression.unpack_genotype_blob(row['gts'])

            # NOTE(review): eval() executes expression strings produced by
            # the family object; confirm they can never contain
            # user-controlled text.
            if not eval(family_genotype_mask):
                continue

            # first report all of the non-genotype columns
            result = [str(family.family_id)]
            result.extend([str(row[col]) for col in all_query_cols])

            # now report all of the genotype columns.  A plain loop keeps
            # eval() in this function's scope.
            for col in family_sample_gt_columns:
                result.append(str(eval(col)))

            yield result
def get_actionable_mutations(parser, args):
    """
    Print somatic variants in COSMIC census genes with their DGIdb info.

    The query is run twice: once to collect the affected genes, which are
    passed to ``query_dgidb``, and once to print the report.  For every
    variant one tab-separated line is printed per family that has exactly
    two subjects, prefixed with the name of the subject taken to be the
    tumor.

    Args:
        parser: not used by this function.
        args: parsed options; only ``args.db`` is read.
    """
    t_n_pairs = gemini_subjects.get_families(args.db)

    query = "SELECT variants.chrom, start, end, ref, alt, \
                    variants.gene, impact, is_somatic, \
                    gene_summary.in_cosmic_census \
             FROM variants, gene_summary \
             WHERE variants.is_somatic = 1 \
             AND (variants.type = 'snp' \
                 OR variants.type = 'indel') \
             AND (variants.impact_severity = 'HIGH' \
                 OR variants.impact_severity = 'MED') \
             AND variants.chrom = gene_summary.chrom \
             AND variants.gene = gene_summary.gene \
             AND gene_summary.in_cosmic_census = 1"

    # first pass: gather the genes hit by the query and look them up
    gq = GeminiQuery.GeminiQuery(args.db)
    gq.run(query)
    genes = defaultdict()
    for row in gq:
        genes[row['gene']] = True
    dgidb_info = query_dgidb(genes)

    # second pass: report actionable mutations per DGIdb and COSMIC census
    gq = GeminiQuery.GeminiQuery(args.db)
    gq.run(query)

    variant_fields = ['chrom', 'start', 'end', 'ref', 'alt',
                      'gene', 'impact', 'is_somatic', 'in_cosmic_census']
    print('\t'.join(['tum_name'] + variant_fields + ['dgidb_info']))

    for row in gq:
        for pair in t_n_pairs:
            samples = pair.subjects
            if len(samples) != 2:
                continue

            # the first subject is assumed to be the tumor unless it is
            # explicitly flagged as unaffected
            tumor = samples[0]
            if tumor.affected is False:
                tumor = samples[1]

            fields = [tumor.name]
            fields.extend([row[name] for name in variant_fields])
            fields.append(str(dgidb_info[row['gene']]))
            print('\t'.join([str(field) for field in fields]))
def get_actionable_mutations(parser, args):
    """
    Report somatic SNPs/indels in COSMIC census genes together with the
    DGIdb information for the affected gene.

    Output goes to stdout as tab-separated text: a header line, then for
    each variant one line per two-subject family, starting with the name
    of the subject treated as the tumor.

    Args:
        parser: unused.
        args: parsed options; ``args.db`` names the database to query.
    """
    t_n_pairs = gemini_subjects.get_families(args.db)

    query = "SELECT variants.chrom, start, end, ref, alt, \
                    variants.gene, impact, is_somatic, \
                    gene_summary.in_cosmic_census \
             FROM variants, gene_summary \
             WHERE variants.is_somatic = 1 \
             AND (variants.type = 'snp' \
                 OR variants.type = 'indel') \
             AND (variants.impact_severity = 'HIGH' \
                 OR variants.impact_severity = 'MED') \
             AND variants.chrom = gene_summary.chrom \
             AND variants.gene = gene_summary.gene \
             AND gene_summary.in_cosmic_census = 1"

    # collect the relevant genes, then ask DGIdb about them
    gq = GeminiQuery.GeminiQuery(args.db)
    gq.run(query)
    genes = defaultdict()
    for gene_row in gq:
        genes[gene_row['gene']] = True
    dgidb_info = query_dgidb(genes)

    # the first pass consumed the results, so run the query again to report
    gq = GeminiQuery.GeminiQuery(args.db)
    gq.run(query)

    reported = ('chrom', 'start', 'end', 'ref', 'alt',
                'gene', 'impact', 'is_somatic', 'in_cosmic_census')
    header = ['tum_name']
    header.extend(reported)
    header.append('dgidb_info')
    print('\t'.join(header))

    for row in gq:
        for pair in t_n_pairs:
            members = pair.subjects
            if len(members) != 2:
                continue

            # take the second subject as tumor when the first one is
            # explicitly marked as unaffected
            if members[0].affected is False:
                tumor = members[1]
            else:
                tumor = members[0]

            values = [tumor.name]
            for column in reported:
                values.append(row[column])
            values.append(str(dgidb_info[row['gene']]))
            print('\t'.join([str(value) for value in values]))
Example #6
0
    def get_compound_hets(self):
        """
        Print candidate compound heterozygotes.

        Runs the query built by ``self.create_query()``, prints the header
        and then walks the results gene by gene.  For each gene the
        heterozygous sites of every sample are collected, paired up by
        ``self.find_valid_het_pairs`` and handed to
        ``self.filter_candidates`` together with the genotype labels and
        columns of each family.  Unphased sites are only kept when
        ``args.ignore_phasing`` is set.

        Also stores ``subjects.get_subjects(args)`` in
        ``self.subjects_dict``.
        """
        args = self.args
        gq = GeminiQuery.GeminiQuery(args.db, include_gt_cols=True)
        idx_to_sample = gq.idx_to_sample
        self.subjects_dict = subjects.get_subjects(args)

        # run the query applying any genotype filters provided by the user.
        gq.run(self.create_query())

        # genotype labels / columns of each family, keyed by family id
        family_gt_labels = {}
        family_gt_cols = {}
        for family in subjects.get_families(args.db, args.families):
            family_gt_labels[family.family_id] = family.get_genotype_labels()
            family_gt_cols[family.family_id] = family.get_genotype_columns()

        # output header
        print(self.get_header(gq.header, is_comp_het=True))

        # groupby only merges consecutive rows, so rows of one gene are
        # presumably returned next to each other -- verify create_query().
        for gene, gene_rows in groupby(gq, itemgetter("gene")):
            # sample -> gene -> heterozygous sites of that sample
            sample_hets = collections.defaultdict(
                lambda: collections.defaultdict(list))

            for row in gene_rows:
                gt_types = row["gt_types"]
                gt_bases = row["gts"]
                gt_phases = row["gt_phases"]
                site = Site(row)

                # track each sample that is heterozygous at this site.
                for idx, gt_type in enumerate(gt_types):
                    if gt_type == HET:
                        sample = idx_to_sample[idx]
                        sample_site = copy(site)
                        sample_site.phased = gt_phases[idx]

                        if sample_site.phased or args.ignore_phasing:
                            sample_site.gt = gt_bases[idx]
                            # add the site to the candidates of this
                            # sample / gene
                            gene_name = site.row["gene"]
                            sample_hets[sample][gene_name].append(sample_site)

            # all rows of this gene have been seen: evaluate its candidates
            samples_w_hetpair = self.find_valid_het_pairs(sample_hets)
            self.filter_candidates(samples_w_hetpair,
                                   family_gt_labels, family_gt_cols)
def get_de_novo_candidates(c, min_sample_depth=30):
    """
    Report candidate variants that appear to be de novo mutations in
    the child. We cannot distinguish mutations that occurred in the
    parental germline from those that occurred early in development
    in the child post-conception.

    For each family returned by ``subjects.get_families(c)`` the variant
    query is re-run on ``c`` and this generator yields a header row
    followed by one row per variant that passes the family's de novo
    genotype filter and the depth requirement.  Families whose filter is
    ``None`` are skipped.

    Args:
        c: database cursor.  It is used to look up the families and to run
           the variant query; its rows are accessed by column name.
        min_sample_depth: a variant is dropped as soon as the depth of one
            family member is below this value.

    Yields:
        list of str: for each family, the header (``family_id``, the
        non-genotype query columns, the genotype labels, the depth labels)
        and then one result list per passing variant.
    """
    # the query is the same for every family, so build it only once
    query = "SELECT chrom, start, end, ref, alt, gene, \
                        impact, impact_severity, in_dbsnp, \
                        rs_ids, aaf_1kg_all, aaf_esp_all, \
                        clinvar_sig, clinvar_disease_name, \
                        clinvar_dbsource, gt_types, \
                        gt_depths, gts \
                 FROM variants \
                 WHERE impact_severity != 'LOW' \
                 AND num_het = 1"

    families = subjects.get_families(c)

    for family in families:

        c.execute(query)
        # only the non-genotype columns are reported verbatim; genotypes
        # and depths are reported per family member further down
        all_query_cols = [str(desc[0]) for desc in c.description
                          if not desc[0].startswith("gt")]

        family_genotype_mask = family.get_de_novo_filter()
        family_sample_gt_columns = family.get_subject_genotype_columns()
        family_sample_depth_columns = family.get_subject_depth_columns()
        family_sample_gt_labels = family.get_subject_genotype_labels()
        family_sample_dp_labels = family.get_subject_depth_labels()

        # skip this family if it cannot meet a de novo model; eval() below
        # would raise a TypeError on a None mask.
        if family_genotype_mask is None:
            continue

        # yield a header
        header = ["family_id"]
        header.extend(all_query_cols)
        header.extend(family_sample_gt_labels)
        header.extend(family_sample_dp_labels)
        yield header

        # report the resulting de_novo variants for this family
        for row in c:

            # unpack the genotype arrays.  These locals are not used
            # directly; the expressions handed to eval() below presumably
            # refer to them by name, so they must not be renamed.
            gt_types = compression.unpack_genotype_blob(row['gt_types'])
            gt_depths = compression.unpack_genotype_blob(row['gt_depths'])
            gts = compression.unpack_genotype_blob(row['gts'])

            # does the variant meet a de novo model for this family?
            # if not, ignore.
            # NOTE(review): eval() executes expression strings produced by
            # the family object; confirm they can never contain
            # user-controlled text.
            if not eval(family_genotype_mask):
                continue

            # make sure each sample's genotype had sufficient coverage,
            # stopping at the first sample that falls short.
            insufficient_depth = False
            for col in family_sample_depth_columns:
                if int(eval(col)) < min_sample_depth:
                    insufficient_depth = True
                    break
            if insufficient_depth:
                continue

            # first report all of the non-genotype columns
            result = [str(family.family_id)]
            result.extend([str(row[col]) for col in all_query_cols])

            # now report all of the genotype columns.  Plain loops keep
            # eval() in this function's scope.
            for col in family_sample_gt_columns:
                result.append(str(eval(col)))

            # now report all of the depth columns
            for col in family_sample_depth_columns:
                result.append(str(eval(col)))

            yield result
Example #8
0
def get_de_novo_candidates(args):
    """
    Report candidate variants that appear to be de novo mutations in
    the child. We cannot distinguish mutations that occurred in the
    parental germline from those that occurred early in development
    in the child post-conception.

    The variant query is run once.  Every resulting variant is tested
    against the de novo genotype filter of each family; when the filter
    matches and every family member reaches ``args.min_sample_depth``, a
    tab-separated line is printed with the family id, the family's
    genotype labels, genotypes and depths, followed by the variant row.
    Families whose filter is ``None`` or the string ``"False"`` are never
    tested.

    Args:
        args: parsed options.  Reads ``db``, ``columns`` (optional column
            list for the SELECT), ``filter`` (optional WHERE clause) and
            ``min_sample_depth``.
    """
    gq = GeminiQuery.GeminiQuery(args.db, include_gt_cols=True)

    # NOTE(review): args.columns and args.filter are spliced into the SQL
    # text as-is; they are SQL fragments by design and must be trusted.
    if args.columns is not None:
        # the user only wants to report a subset of the columns
        query = "SELECT " + str(args.columns) + " FROM variants"
    else:
        # report the kitchen sink
        query = "SELECT *" + \
                ", gts, gt_types, gt_phases, gt_depths, \
                gt_ref_depths, gt_alt_depths, gt_quals" + \
                " FROM variants"

    # add any non-genotype column limits to the where clause
    if args.filter:
        query += " WHERE " + args.filter

    # collect the families that can meet a de novo model, one tuple each:
    # (family id, genotype mask, gt labels, gt columns, depth columns)
    candidate_families = []
    for family in subjects.get_families(gq.c):
        family_filter = family.get_de_novo_filter()
        # a None mask would make eval() raise a TypeError and a "False"
        # mask can never match, so neither is worth testing.
        if family_filter is None or family_filter == "False":
            continue
        gt_labels = family.get_subject_genotype_labels()
        gt_columns = family.get_subject_genotype_columns()
        depth_columns = family.get_subject_depth_columns()
        candidate_families.append((family.family_id, family_filter,
                                   gt_labels, gt_columns, depth_columns))

    # run the query applying any genotype filters provided by the user.
    gq.run(query)

    # print a header
    print("family_id\tfamily_members\tfamily_genotypes\tdepths\t" +
          str(gq.header))

    # report the resulting variants for each family
    for row in gq:

        # These locals are not used directly; the expressions handed to
        # eval() below presumably refer to them by name, so they must not
        # be renamed.
        gt_types = row['gt_types']
        gts = row['gts']
        gt_depths = row['gt_depths']

        # test the variant for each family in the db
        for (fam_id, family_genotype_mask, family_sample_gt_label,
             family_sample_gt_cols, family_sample_dp_cols) in candidate_families:

            # skip if the variant doesn't meet a de novo model
            # for this family
            # NOTE(review): eval() executes expression strings produced by
            # the family object; confirm they can never contain
            # user-controlled text.
            if not eval(family_genotype_mask):
                continue

            # make sure each sample's genotype had sufficient coverage,
            # stopping at the first sample that falls short.
            insufficient_depth = False
            for col in family_sample_dp_cols:
                if int(eval(col)) < args.min_sample_depth:
                    insufficient_depth = True
                    break
            if insufficient_depth:
                continue

            # plain loops keep eval() in this function's scope, which a
            # comprehension does not guarantee on every Python version.
            genotypes = []
            for col in family_sample_gt_cols:
                genotypes.append(str(eval(col)))
            depths = []
            for col in family_sample_dp_cols:
                depths.append(str(eval(col)))

            print(str(fam_id) + "\t" +
                  ",".join([str(s) for s in family_sample_gt_label]) + "\t" +
                  ",".join(genotypes) + "\t" +
                  ",".join(depths) + "\t" +
                  str(row))
def get_tumor_normal_pairs(args):
    """
    Return the families stored in the database ``args.db``.

    A SQLite connection is opened with ``isolation_level`` set to ``None``
    and ``sqlite3.Row`` as row factory; a cursor on that connection is
    handed to ``gemini_subjects.get_families`` and its result is returned.
    The connection is not closed here.
    """
    connection = sqlite3.connect(args.db, isolation_level=None)
    connection.row_factory = sqlite3.Row
    cursor = connection.cursor()
    return gemini_subjects.get_families(cursor)
Example #10
0
def get_auto_recessive_candidates(c):
    """
    Yield candidate variants that fit an autosomal recessive
    inheritance model.

    The variant query is executed on ``c`` once per family returned by
    ``subjects.get_families(c)``.  Families whose recessive filter is
    ``None`` produce nothing; for every other family a header row is
    yielded, followed by one row (a list of strings) per variant that
    satisfies the family's filter.
    """
    query = "SELECT chrom, start, end, ref, alt, gene, \
                        impact, impact_severity, gt_types, gts \
                 FROM variants \
                 WHERE impact_severity != 'LOW'"

    for family in subjects.get_families(c):

        c.execute(query)
        # names of the columns that do not hold genotype data
        all_query_cols = []
        for description in c.description:
            if not description[0].startswith("gt"):
                all_query_cols.append(str(description[0]))

        family_genotype_mask = family.get_auto_recessive_filter()
        family_sample_gt_columns = family.get_subject_genotype_columns()
        family_sample_gt_labels = family.get_subject_genotype_labels()

        # a family without a filter cannot meet the recessive model
        if family_genotype_mask is None:
            continue

        # header: family id, plain columns, then one label per sample
        yield ["family_id"] + all_query_cols + list(family_sample_gt_labels)

        for row in c:

            # The unpacked arrays are not used directly; the expressions
            # passed to eval() below presumably refer to them by name, so
            # these two locals must keep their names.
            gt_types = compression.unpack_genotype_blob(row['gt_types'])
            gts = compression.unpack_genotype_blob(row['gts'])

            if eval(family_genotype_mask):
                # the family id and the non-genotype columns come first
                # (all_query_cols never contains gt_types or gts) ...
                result = [str(family.family_id)]
                for col in all_query_cols:
                    result.append(str(row[col]))

                # ... followed by the genotype of each family member
                for col in family_sample_gt_columns:
                    result.append(str(eval(col)))

                yield result
def tag_somatic_mutations(args):
    """
    Identify variants that look somatic in tumor/normal pairs and flag
    them in the database.

    Every family returned by ``gemini_subjects.get_families`` that has
    exactly two subjects is treated as a tumor/normal pair.  A variant is
    counted as somatic for a pair when

    * the normal genotype is HOM_REF and the tumor genotype is neither
      HOM_REF nor UNKNOWN,
    * the normal and tumor depths (ref + alt) reach
      ``args.min_norm_depth`` and ``args.min_tumor_depth``,
    * the alternate allele frequency and count in the normal sample do
      not exceed ``args.max_norm_alt_freq`` and
      ``args.max_norm_alt_count``.

    A sample without any reads has no defined allele frequency: it is
    reported as ``NA`` for the tumor, and the candidate is rejected when
    it concerns the normal sample.

    Each candidate is printed as a tab-separated line.  Unless
    ``args.dry_run`` is set, ``is_somatic`` is then set to 1 in the
    ``variants`` table for every candidate; a summary line is printed in
    both cases.

    Args:
        args: parsed options.  Reads ``db``, ``chrom``, ``min_depth``,
            ``min_qual``, ``dry_run``, ``min_norm_depth``,
            ``min_tumor_depth``, ``max_norm_alt_freq`` and
            ``max_norm_alt_count``.
    """
    t_n_pairs = gemini_subjects.get_families(args.db)

    gq = GeminiQuery.GeminiQuery(args.db)

    query = "SELECT variant_id, chrom, start, end, \
                    ref, alt, gene, impact, gts, gt_types, \
                    gt_ref_depths, gt_alt_depths \
             FROM variants \
             WHERE depth >= " + str(args.min_depth) + \
            " AND qual >= " + str(args.min_qual)
    if args.chrom is not None:
        # NOTE(review): args.chrom is spliced into the SQL text as-is;
        # it must not come from an untrusted source.
        query += " AND chrom = '" + args.chrom + "'"

    gq.run(query)
    smp2idx = gq.sample_to_idx

    somatic_counter = 0
    somatic_v_ids = []

    if args.dry_run:
        print('\t'.join(['tum_name', 'tum_gt', 'tum_alt_freq', 'tum_alt_depth', 'tum_depth',
                         'nrm_name', 'nrm_gt', 'nrm_alt_freq', 'nrm_alt_depth', 'nrm_depth',
                         'chrom', 'start', 'end', 'ref', 'alt', 'gene']))

    for row in gq:

        # we can skip variants where all genotypes are identical
        if len(set(row['gt_types'])) == 1:
            continue

        for pair in t_n_pairs:

            samples = pair.subjects
            if len(samples) != 2:
                continue

            tumor = samples[0]
            normal = samples[1]
            # swap if we guessed the tumor incorrectly
            if tumor.affected is False:
                tumor, normal = normal, tumor

            tum_idx = smp2idx[tumor.name]
            nrm_idx = smp2idx[normal.name]

            tum_gt = row['gts'][tum_idx]
            nrm_gt = row['gts'][nrm_idx]

            tum_gt_type = row['gt_types'][tum_idx]
            nrm_gt_type = row['gt_types'][nrm_idx]

            if nrm_gt_type == tum_gt_type:
                continue

            if nrm_gt_type == UNKNOWN or tum_gt_type == UNKNOWN:
                continue

            # the genotypes only pass the smell test for somatic
            # mutations when the normal is reference and the tumor is not.
            if not (nrm_gt_type == HOM_REF and tum_gt_type != HOM_REF):
                continue

            tum_ref_depth = row['gt_ref_depths'][tum_idx]
            nrm_ref_depth = row['gt_ref_depths'][nrm_idx]

            tum_alt_depth = row['gt_alt_depths'][tum_idx]
            nrm_alt_depth = row['gt_alt_depths'][nrm_idx]

            # total observed depth
            nrm_depth = nrm_alt_depth + nrm_ref_depth
            tum_depth = tum_alt_depth + tum_ref_depth

            if nrm_depth < args.min_norm_depth \
               or tum_depth < args.min_tumor_depth:
                continue

            # the allele frequency is undefined for a sample without
            # reads; dividing would raise a ZeroDivisionError.
            tum_total = float(tum_alt_depth) + float(tum_ref_depth)
            if tum_total:
                tum_alt_freq = float(tum_alt_depth) / tum_total
            else:
                tum_alt_freq = 'NA'

            nrm_total = float(nrm_alt_depth) + float(nrm_ref_depth)
            if nrm_total:
                nrm_alt_freq = float(nrm_alt_depth) / nrm_total
            else:
                nrm_alt_freq = 'NA'

            # apply evidence thresholds.  Without reads in the normal
            # sample there is no evidence that it lacks the variant.
            if nrm_alt_freq == 'NA' \
               or nrm_alt_freq > args.max_norm_alt_freq \
               or nrm_alt_depth > args.max_norm_alt_count:
                continue

            somatic_counter += 1
            somatic_v_ids.append((1, row['variant_id']))

            print('\t'.join([str(s) for s in [
                tumor.name, tum_gt, tum_alt_freq, tum_alt_depth, tum_depth,
                normal.name, nrm_gt, nrm_alt_freq, nrm_alt_depth, nrm_depth,
                row['chrom'], row['start'], row['end'], row['ref'],
                row['alt'], row['gene']]]))

    if not args.dry_run:
        conn = sqlite3.connect(args.db)
        # isolation_level None: each statement is committed as it runs
        conn.isolation_level = None
        try:
            c = conn.cursor()

            # now set the identified mutations to True.
            update_qry = "UPDATE variants SET is_somatic = ? "
            update_qry += " WHERE variant_id = ?"
            c.executemany(update_qry, somatic_v_ids)
        finally:
            conn.close()
        print("Identified and set " + str(somatic_counter) +
              " somatic mutations")
    else:
        print("Would have identified and set " + str(somatic_counter) +
              " somatic mutations")
Example #12
0
def tag_somatic_mutations(args):
    """
    Identify variants that look somatic in tumor/normal pairs and flag
    them in the database.

    Every family returned by ``gemini_subjects.get_families`` that has
    exactly two subjects is treated as a tumor/normal pair.  A variant is
    counted as somatic for a pair when

    * the normal genotype is HOM_REF and the tumor genotype is neither
      HOM_REF nor UNKNOWN,
    * the normal and tumor depths (ref + alt) reach
      ``args.min_norm_depth`` and ``args.min_tumor_depth``,
    * the alternate allele frequency and count in the normal sample do
      not exceed ``args.max_norm_alt_freq`` and
      ``args.max_norm_alt_count`` (each threshold is only applied when
      it is set to a true value).

    A sample without any reads has no defined allele frequency and is
    reported as ``NA``; when ``args.max_norm_alt_freq`` is set, a
    candidate whose normal frequency is ``NA`` is rejected.

    Each candidate is printed as a tab-separated line.  Unless
    ``args.dry_run`` is set, ``is_somatic`` is then set to 1 in the
    ``variants`` table for every candidate; a summary line is printed in
    both cases.

    Args:
        args: parsed options.  Reads ``db``, ``chrom``, ``min_depth``,
            ``min_qual``, ``min_somatic_score``, ``dry_run``,
            ``min_norm_depth``, ``min_tumor_depth``,
            ``max_norm_alt_freq`` and ``max_norm_alt_count``.
    """
    t_n_pairs = gemini_subjects.get_families(args.db)

    gq = GeminiQuery.GeminiQuery(args.db)

    # optional restrictions, each one an " AND ..." clause or empty.
    # NOTE(review): the option values are spliced into the SQL text
    # as-is; they must not come from an untrusted source.
    depth_string, qual_string, ssc_string, chrom_string = ("", "", "", "")
    if args.min_depth:
        depth_string = " AND depth >= %s" % args.min_depth
    if args.min_qual:
        qual_string = " AND qual >= %s" % args.min_qual
    if args.min_somatic_score:
        ssc_string = " AND (type='sv' \
                         OR somatic_score >= %s)" % args.min_somatic_score
    if args.chrom:
        chrom_string = " AND chrom = '%s'" % args.chrom

    # build the query whether or not a chromosome was requested: the
    # chromosome restriction, if any, is already part of chrom_string.
    query = "SELECT variant_id, chrom, start, end, \
                    ref, alt, gene, impact, gts, gt_types, \
                    gt_ref_depths, gt_alt_depths \
             FROM variants \
             WHERE 1 \
             %s \
             %s \
             %s \
             %s" % (depth_string, qual_string, ssc_string, chrom_string)

    gq.run(query)
    smp2idx = gq.sample_to_idx

    somatic_counter = 0
    somatic_v_ids = []

    if args.dry_run:
        print('\t'.join(['tum_name', 'tum_gt', 'tum_alt_freq', 'tum_alt_depth', 'tum_depth',
                         'nrm_name', 'nrm_gt', 'nrm_alt_freq', 'nrm_alt_depth', 'nrm_depth',
                         'chrom', 'start', 'end', 'ref', 'alt', 'gene']))

    for row in gq:
        # we can skip variants where all genotypes are identical
        if len(set(row['gt_types'])) == 1:
            continue

        for pair in t_n_pairs:

            samples = pair.subjects
            if len(samples) != 2:
                continue

            tumor = samples[0]
            normal = samples[1]
            # swap if we guessed the tumor incorrectly
            if tumor.affected is False:
                tumor, normal = normal, tumor

            tum_idx = smp2idx[tumor.name]
            nrm_idx = smp2idx[normal.name]

            tum_gt = row['gts'][tum_idx]
            nrm_gt = row['gts'][nrm_idx]

            tum_gt_type = row['gt_types'][tum_idx]
            nrm_gt_type = row['gt_types'][nrm_idx]

            if nrm_gt_type == tum_gt_type:
                continue

            if nrm_gt_type == UNKNOWN or tum_gt_type == UNKNOWN:
                continue

            # the genotypes only pass the smell test for somatic
            # mutations when the normal is reference and the tumor is not.
            if not (nrm_gt_type == HOM_REF and tum_gt_type != HOM_REF):
                continue

            tum_ref_depth = row['gt_ref_depths'][tum_idx]
            nrm_ref_depth = row['gt_ref_depths'][nrm_idx]

            tum_alt_depth = row['gt_alt_depths'][tum_idx]
            nrm_alt_depth = row['gt_alt_depths'][nrm_idx]

            # total observed depth
            nrm_depth = nrm_alt_depth + nrm_ref_depth
            tum_depth = tum_alt_depth + tum_ref_depth

            if nrm_depth < args.min_norm_depth \
               or tum_depth < args.min_tumor_depth:
                continue

            # the allele frequency is undefined for a sample without reads
            tum_total = float(tum_alt_depth) + float(tum_ref_depth)
            if tum_total:
                tum_alt_freq = float(tum_alt_depth) / tum_total
            else:
                tum_alt_freq = 'NA'

            nrm_total = float(nrm_alt_depth) + float(nrm_ref_depth)
            if nrm_total:
                nrm_alt_freq = float(nrm_alt_depth) / nrm_total
            else:
                nrm_alt_freq = 'NA'

            # apply evidence thresholds.  An undefined normal frequency is
            # handled explicitly instead of comparing the 'NA' string with
            # a number: it cannot satisfy a frequency threshold.
            if args.max_norm_alt_freq and \
               (nrm_alt_freq == 'NA'
                    or nrm_alt_freq > args.max_norm_alt_freq):
                continue
            if args.max_norm_alt_count and \
               nrm_alt_depth > args.max_norm_alt_count:
                continue

            somatic_counter += 1
            somatic_v_ids.append(row['variant_id'])

            print('\t'.join([str(s) for s in [
                tumor.name, tum_gt, tum_alt_freq, tum_alt_depth, tum_depth,
                normal.name, nrm_gt, nrm_alt_freq, nrm_alt_depth, nrm_depth,
                row['chrom'], row['start'], row['end'], row['ref'],
                row['alt'], row['gene']]]))

    if not args.dry_run:
        import database
        conn, metadata = database.get_session_metadata(args.db)

        # a variant can be somatic in several pairs, but it is only one
        # row in the variants table.
        unique_v_ids = sorted(set(somatic_v_ids))

        # nothing to update (and no empty IN list) when nothing was found
        if unique_v_ids:
            # now set the identified mutations to True.
            update_qry = "UPDATE variants SET is_somatic = 1 "
            update_qry += " WHERE variant_id IN (%s)"
            update_qry %= ",".join([str(v_id) for v_id in unique_v_ids])
            res = conn.execute(update_qry)
            assert res.rowcount == len(unique_v_ids)
        print("Identified and set " + str(somatic_counter) +
              " somatic mutations")
        conn.commit()
    else:
        print("Would have identified and set " + str(somatic_counter) +
              " somatic mutations")
Example #13
0
def get_auto_dominant_candidates(args):
    """
    Report candidate variants that meet an autosomal dominant
    inheritance model.

    The variant query is run once.  Every resulting variant is tested
    against the dominant genotype filter of each family; when the filter
    matches, a tab-separated line is printed with the family id, the
    family's genotype labels and genotypes, followed by the variant row.
    Families whose filter is ``None`` or the string ``"False"`` are never
    tested.

    Args:
        args: parsed options.  Reads ``db``, ``columns`` (optional column
            list for the SELECT) and ``filter`` (optional WHERE clause).
    """
    gq = GeminiQuery.GeminiQuery(args.db, include_gt_cols=True)

    # NOTE(review): args.columns and args.filter are spliced into the SQL
    # text as-is; they are SQL fragments by design and must be trusted.
    if args.columns is not None:
        # the user only wants to report a subset of the columns
        query = "SELECT " + str(args.columns) + " FROM variants"
    else:
        # report the kitchen sink
        query = "SELECT *" + \
                ", gts, gt_types, gt_phases, gt_depths, \
                gt_ref_depths, gt_alt_depths, gt_quals" + \
                " FROM variants"

    # add any non-genotype column limits to the where clause
    if args.filter:
        query += " WHERE " + args.filter

    # collect the families that can meet a dominant model, one tuple each:
    # (family id, genotype mask, gt labels, gt columns)
    candidate_families = []
    for family in subjects.get_families(gq.c):
        family_filter = family.get_auto_dominant_filter()
        # a None mask would make eval() raise a TypeError and a "False"
        # mask can never match, so neither is worth testing.
        if family_filter is None or family_filter == "False":
            continue
        gt_labels = family.get_subject_genotype_labels()
        gt_columns = family.get_subject_genotype_columns()
        candidate_families.append((family.family_id, family_filter,
                                   gt_labels, gt_columns))

    # run the query applying any genotype filters provided by the user.
    gq.run(query)

    # print a header
    print("family_id\tfamily_members\tfamily_genotypes\t" + str(gq.header))

    # report the resulting variants for each family
    for row in gq:

        # These locals are not used directly; the expressions handed to
        # eval() below presumably refer to them by name, so they must not
        # be renamed.
        gt_types = row['gt_types']
        gts = row['gts']

        # test the variant for each family in the db
        for (fam_id, family_genotype_mask, family_sample_gt_label,
             family_sample_gt_cols) in candidate_families:

            # skip if the variant doesn't meet a dominant model
            # for this family
            # NOTE(review): eval() executes expression strings produced by
            # the family object; confirm they can never contain
            # user-controlled text.
            if not eval(family_genotype_mask):
                continue

            # a plain loop keeps eval() in this function's scope, which a
            # comprehension does not guarantee on every Python version.
            genotypes = []
            for col in family_sample_gt_cols:
                genotypes.append(str(eval(col)))

            print(str(fam_id) + "\t" +
                  ",".join([str(s) for s in family_sample_gt_label]) + "\t" +
                  ",".join(genotypes) + "\t" +
                  str(row))
Example #14
0
def tag_somatic_mutations(args):

    t_n_pairs = gemini_subjects.get_families(args.db)

    gq = GeminiQuery.GeminiQuery(args.db)

    if args.chrom is None:
        query = "SELECT variant_id, chrom, start, end, \
                        ref, alt, gene, impact, gts, gt_types, \
                        gt_ref_depths, gt_alt_depths \
                 FROM variants \
                 WHERE depth >= " + str(args.min_depth) + \
                 " AND   qual >= " + str(args.min_qual) 
    else:
        query = "SELECT variant_id, chrom, start, end, \
                ref, alt, gene, impact, gts, gt_types, \
                gt_ref_depths, gt_alt_depths \
         FROM variants \
         WHERE depth >= " + str(args.min_depth) + \
         " AND qual >= " + str(args.min_qual) + \
         " AND chrom = \'" + args.chrom + "\'"

    gq.run(query)
    smp2idx = gq.sample_to_idx

    somatic_counter = 0
    somatic_v_ids = []

    if args.dry_run:
        print'\t'.join(['tum_name', 'tum_gt', 'tum_alt_freq', 'tum_alt_depth', 'tum_depth', \
                        'nrm_name', 'nrm_gt', 'nrm_alt_freq', 'nrm_alt_depth', 'nrm_depth',
                        'chrom', 'start', 'end', 'ref', 'alt', 'gene'])

    for row in gq:

        # we can skip varinats where all genotypes are identical
        if len(set(row['gt_types'])) == 1:
            continue

        for pair in t_n_pairs:

            samples = pair.subjects
            if len(samples) != 2:
                continue

            tumor = pair.subjects[0]
            normal = pair.subjects[1]
            # swap if we guessed the tumor incorrectly
            if tumor.affected is False:
                tumor, normal = normal, tumor

            tum_idx = smp2idx[tumor.name]
            nrm_idx = smp2idx[normal.name]

            tum_gt = row['gts'][tum_idx]
            nrm_gt = row['gts'][nrm_idx]

            tum_gt_type = row['gt_types'][tum_idx]
            nrm_gt_type = row['gt_types'][nrm_idx]

            if nrm_gt_type == tum_gt_type:
                continue

            if nrm_gt_type == UNKNOWN or tum_gt_type == UNKNOWN:
                continue

            # the genotypes pass the smell test for somatic
            # mutations if in this block.
            if (nrm_gt_type == HOM_REF and tum_gt_type != HOM_REF):

               
               tum_ref_depth = row['gt_ref_depths'][tum_idx]
               nrm_ref_depth = row['gt_ref_depths'][nrm_idx]

               tum_alt_depth = row['gt_alt_depths'][tum_idx]
               nrm_alt_depth = row['gt_alt_depths'][nrm_idx]

               # total observed depth
               nrm_depth = nrm_alt_depth + nrm_ref_depth
               tum_depth = tum_alt_depth + tum_ref_depth

               if (nrm_depth < args.min_norm_depth \
                  or \
                  tum_depth < args.min_tumor_depth):
                  continue

               tum_alt_freq = float(tum_alt_depth) / \
                              (float(tum_alt_depth) + float(tum_ref_depth))

               nrm_alt_freq = float(nrm_alt_depth) / \
                              (float(nrm_alt_depth) + float(nrm_ref_depth))

               # apply evidence thresholds.
               if nrm_alt_freq > args.max_norm_alt_freq \
                  or \
                  nrm_alt_depth > args.max_norm_alt_count:
                  continue

               somatic_counter += 1
               somatic_v_ids.append((1, row['variant_id']))
               
               print'\t'.join(str(s) for s in [tumor.name,  tum_gt, tum_alt_freq, tum_alt_depth, tum_depth, \
                                   normal.name, nrm_gt, nrm_alt_freq, nrm_alt_depth, nrm_depth, \
                                   row['chrom'], row['start'], row['end'], row['ref'], row['alt'], row['gene']])

    if not args.dry_run:
        conn = sqlite3.connect(args.db)
        conn.isolation_level = None
        c = conn.cursor()

        # now set the identified mutations to True.
        update_qry = "UPDATE variants SET is_somatic = ? "
        update_qry += " WHERE variant_id = ?"
        c.executemany(update_qry, somatic_v_ids)
        print "Identified and set", somatic_counter, "somatic mutations"
    else:
        print "Would have identified and set", somatic_counter, "somatic mutations"
Example #15
0
def get_de_novo_candidates(args):
    """
    Report candidate variants that meet appear to be de novo
    mutations in the child. We cannot distinguish mutations that
    occured in the parental germline from those that occurred
    early in development in the child post-conception.
    """
    gq = GeminiQuery.GeminiQuery(args.db, include_gt_cols=True)

    if args.columns is not None:
        # the user only wants to report a subset of the columns
        query = "SELECT " + str(args.columns) + " FROM variants"
    else:
        # report the kitchen sink
        query = (
            "SELECT *"
            + ", gts, gt_types, gt_phases, gt_depths, \
                gt_ref_depths, gt_alt_depths, gt_quals"
            + " FROM variants"
        )

    # add any non-genotype column limits to the where clause
    if args.filter:
        query += " WHERE " + args.filter

    # collect family info
    families = subjects.get_families(gq.c)
    family_ids = []
    family_masks = []
    family_sample_gt_labels = []
    family_sample_gt_columns = []
    family_sample_depth_columns = []
    for family in families:
        family_masks.append(family.get_de_novo_filter())
        family_sample_gt_labels.append(family.get_subject_genotype_labels())
        family_sample_gt_columns.append(family.get_subject_genotype_columns())
        family_sample_depth_columns.append(family.get_subject_depth_columns())
        family_ids.append(family.family_id)

    # run the query applying any genotype filters provided by the user.
    gq.run(query)

    # print a header
    print "family_id\tfamily_members\tfamily_genotypes\tdepths\t",
    print gq.header

    # yield the resulting variants for this familiy
    for row in gq:

        # interrogate the genotypes present in each family member to conforming
        # to the genetic model being tested
        gt_types = row["gt_types"]
        gts = row["gts"]
        gt_depths = row["gt_depths"]

        # test the variant for each family in the db
        for idx, fam_id in enumerate(family_ids):
            family_genotype_mask = family_masks[idx]
            family_sample_gt_label = family_sample_gt_labels[idx]
            family_sample_gt_cols = family_sample_gt_columns[idx]
            family_sample_dp_cols = family_sample_depth_columns[idx]

            # skip if the variant doesn't meet a de novo model
            # for this family
            if not eval(family_genotype_mask):
                continue

            # make sure each sample's genotype had sufficient coverage.
            # otherwise, ignore
            insufficient_depth = False
            for col in family_sample_dp_cols:
                depth = int(eval(col))
                if depth < args.min_sample_depth:
                    insufficient_depth = True
                    break
            if insufficient_depth:
                continue

            print str(fam_id) + "\t" + ",".join([str(s) for s in family_sample_gt_label]) + "\t", ",".join(
                [str(eval(s)) for s in family_sample_gt_cols]
            ) + "\t", ",".join([str(eval(s)) for s in family_sample_dp_cols]) + "\t",
            print row