Esempio n. 1
0
def run_query(args):
    """Run the user's query and print every resulting row.

    ``args`` is the parsed command-line namespace.  When ``args.dgidb`` is
    set the query is executed twice: a first pass collects the genes to look
    up in DGIdb, and a second pass prints each row with its DGIdb
    information appended as a final tab-separated column.
    """
    predicates = get_row_predicates(args)
    add_required_columns_to_query(args)
    formatter = select_formatter(args)
    genotypes_needed = needs_genotypes(args)
    gene_needed = needs_gene(args)
    try:
        subjects = get_subjects(args)
    except KeyError:
        subjects = []

    # Extra GeminiQuery constructor options; only filled in for --bcolz.
    gq_kwargs = {}
    if args.bcolz:
        import gemini_bcolz
        gq_kwargs['variant_id_getter'] = gemini_bcolz.filter

    def _execute():
        # Build and run a fresh query object.  Both passes go through here so
        # the bcolz option always reaches the constructor; the original
        # second pass handed it to run() instead.
        fresh = GeminiQuery.GeminiQuery(args.db, out_format=formatter,
                                        **gq_kwargs)
        fresh.run(args.query,
                  args.gt_filter,
                  args.show_variant_samples,
                  args.sample_delim,
                  predicates,
                  genotypes_needed,
                  gene_needed,
                  args.show_families,
                  subjects=subjects)
        return fresh

    gq = _execute()

    if args.use_header and gq.header:
        print(gq.header)

    if not args.dgidb:
        for row in gq:
            print(row)
        return

    # collect the genes that need to be queried from DGIdb
    genes = {}
    for row in gq:
        genes[row['gene']] = True

    # collect info from DGIdb
    dgidb_info = query_dgidb(genes)

    # rerun the query (the first cursor is now consumed) and report the
    # results with the DGIdb info added at the end.
    for row in _execute():
        print(str(row) + "\t" + str(dgidb_info[row['gene']]))
def get_actionable_mutations(parser, args):
    """Print somatic, COSMIC-census variants together with their DGIdb info.

    The query selects somatic SNPs/indels of HIGH or MED impact severity in
    genes flagged ``in_cosmic_census``.  It is run once to collect the genes
    to send to DGIdb and a second time to print one tab-separated line per
    (variant, two-sample family) combination, led by the tumor sample name.

    ``parser`` is accepted for interface compatibility and is not used.
    """
    t_n_pairs = gemini_subjects.get_families(args.db)

    query = "SELECT variants.chrom, start, end, ref, alt, \
                    variants.gene, impact, is_somatic, \
                    gene_summary.in_cosmic_census \
             FROM variants, gene_summary \
             WHERE variants.is_somatic = 1 \
             AND (variants.type = 'snp' \
                 OR variants.type = 'indel') \
             AND (variants.impact_severity = 'HIGH' \
                 OR variants.impact_severity = 'MED') \
             AND variants.chrom = gene_summary.chrom \
             AND variants.gene = gene_summary.gene \
             AND gene_summary.in_cosmic_census = 1"

    # collect the relevant genes and query DGIdb
    gq = GeminiQuery.GeminiQuery(args.db)
    gq.run(query)
    genes = {}
    for row in gq:
        genes[row['gene']] = True
    dgidb_info = query_dgidb(genes)

    # Which sample is the tumor does not depend on the variant row, so it is
    # resolved once per pair here rather than once per row per pair.
    tumor_names = []
    for pair in t_n_pairs:
        samples = pair.subjects
        if len(samples) != 2:
            continue
        tumor, normal = samples[0], samples[1]
        # swap if we guessed the tumor incorrectly
        if tumor.affected is False:
            tumor, normal = normal, tumor
        tumor_names.append(tumor.name)

    # now rerun the query and report actionable mutations per DGIdb and
    # COSMIC census.
    gq = GeminiQuery.GeminiQuery(args.db)
    gq.run(query)
    print('\t'.join(['tum_name', 'chrom', 'start', 'end', 'ref', 'alt',
                     'gene', 'impact', 'is_somatic', 'in_cosmic_census',
                     'dgidb_info']))
    for row in gq:
        for tumor_name in tumor_names:
            print('\t'.join(str(s) for s in [tumor_name, row['chrom'],
                                             row['start'], row['end'],
                                             row['ref'], row['alt'],
                                             row['gene'], row['impact'],
                                             row['is_somatic'],
                                             row['in_cosmic_census'],
                                             str(dgidb_info[row['gene']])]))
Esempio n. 3
0
def get_fusions(args):
    """
    Identify candidate rearrangements resulting in fusion genes.

    Somatic structural variants that fall in a gene are fetched ordered by
    ``sv_event_id``.  Rows whose ``sub_type`` is not 'complex' are reported
    on their own; rows with an ``sv_mate_id`` are reported together with the
    previous such row when both share the same ``sv_event_id``.
    """
    gq = GeminiQuery.GeminiQuery(args.db, include_gt_cols=True)
    subjects_dict = subjects.get_subjects(args)

    # optional WHERE-clause fragments built from the command line args
    qual_string, ev_type_string = ("", "")
    if args.min_qual:
        qual_string = " AND qual >= %s" % args.min_qual
    if args.evidence_type:
        ev_type_string = " AND sv_evidence_type = '%s'" % args.evidence_type

    query = """SELECT variants.chrom, start, end,
                      ref, alt,
                      qual,
                      is_somatic, somatic_score,
                      type, sub_type, variants.gene, 
                      sv_strand, sv_length,
                      sv_cipos_start_left,
                      sv_cipos_start_right,
                      sv_cipos_end_left,
                      sv_cipos_end_right,
                      sv_event_id, sv_mate_id,
                      sv_tool, sv_evidence_type,
                      sv_is_precise,
                      gene_summary.strand,
                      gene_summary.transcript_min_start,
                      gene_summary.transcript_max_end,
                      gene_summary.in_cosmic_census
               FROM variants, gene_summary
               WHERE is_somatic = 1   
               AND   type = 'sv'
               AND   variants.gene is not NULL
               AND   variants.chrom = gene_summary.chrom
               AND   variants.gene = gene_summary.gene
               %s
               %s
               ORDER BY sv_event_id
            """ % (qual_string, ev_type_string)

    # last multi-line (mate-carrying) row seen, if any
    prev = None
    gq.run(query)
    for row in gq:
        # single-line variants (DEL, DUP, INV)
        if row['sub_type'] != 'complex':
            report_fusion([row], subjects_dict, args)

        # multi-line variants (BND)
        elif row['sv_mate_id']:
            # pair up with the previous row when the SV event ids match
            if (prev and row['sv_event_id'] == prev['sv_event_id']):
                report_fusion([prev, row], subjects_dict, args)
            # shift the previous
            prev = row
Esempio n. 4
0
def query_json():
    """Run the SQL given in the ``query`` GET parameter.

    Returns a dict whose ``gemini_results`` entry holds one plain dict per
    result row.
    """
    sql_text = request.GET.get('query', '').strip()

    browser_query = GeminiQuery.GeminiQuery(database)
    browser_query._set_gemini_browser(True)
    browser_query.run(sql_text)

    results = []
    for record in browser_query:
        results.append(dict(record))
    return {'gemini_results': results}
Esempio n. 5
0
def get_compound_hets(args):
    """
    Report candidate compound heterozygotes.

    Rows are consumed in query order; each time the gene changes, the
    heterozygous sites gathered for the previous gene are evaluated and the
    per-sample store is reset.
    """
    gq = GeminiQuery.GeminiQuery(args.db, include_gt_cols=True)
    idx_to_sample = gq.idx_to_sample
    subjects_dict = subjects.get_subjects(args)

    def new_store():
        # sample -> gene -> list of heterozygous sites
        return collections.defaultdict(
            lambda: collections.defaultdict(list))

    def evaluate(store, counter):
        # process the comp_hets gathered so far; returns the updated counter
        with_pairs = find_valid_het_pairs(args, store)
        return filter_candidates(args, with_pairs, subjects_dict, counter)

    # run the query applying any genotype filters provided by the user.
    gq.run(create_query(args))

    hets_by_sample = new_store()
    last_gene = None
    n_comp_hets = 0
    # output header
    print("family\tsample\tcomp_het_id\t" + str(gq.header))
    # Collect all of the genic heterozygotes for each sample / gene
    for row in gq:
        genotype_types = row['gt_types']
        genotype_bases = row['gts']
        genotype_phases = row['gt_phases']
        this_gene = row['gene']

        # gene has changed: handle the finished gene, then start afresh.
        if last_gene is not None and this_gene != last_gene:
            n_comp_hets = evaluate(hets_by_sample, n_comp_hets)
            hets_by_sample = new_store()

        site = Site(row)
        # track each sample that is heterozygous at this site.
        for idx, gt_type in enumerate(genotype_types):
            if gt_type != HET:
                continue
            sample = idx_to_sample[idx]
            candidate = copy(site)
            candidate.phased = genotype_phases[idx]

            # unphased genotypes only count when phasing is ignored
            if not (candidate.phased or args.ignore_phasing):
                continue

            candidate.gt = genotype_bases[idx]
            # add the site to the list of candidates for this sample/gene
            hets_by_sample[sample][site.row['gene']].append(candidate)
        last_gene = this_gene

    # process the last gene seen
    n_comp_hets = evaluate(hets_by_sample, n_comp_hets)
Esempio n. 6
0
def _medium_or_high_impact_variants(args):
    """Return a run query over variants whose impact severity is not LOW.

    Only variants with ``args.min_aaf <= aaf <= args.max_aaf`` are selected;
    the query is run with ``show_variant_samples=True``.
    """
    aaf_bounds = (str(args.min_aaf), str(args.max_aaf))
    template = ("SELECT variant_id, gene from variants"
                " WHERE impact_severity != 'LOW'"
                " AND aaf >= %s"
                " AND aaf <= %s")

    impact_query = GeminiQuery.GeminiQuery(args.db)
    impact_query.run(template % aaf_bounds, show_variant_samples=True)
    return impact_query
Esempio n. 7
0
File: gim.py Progetto: jsh58/gemini
    def __init__(self, args):
        """Keep ``args``, open a genotype-aware query and load family info.

        When ``args.columns`` is empty it is replaced (on ``args`` itself)
        by ``*`` plus every genotype column of the query object.
        """
        self.args = args
        query_obj = GeminiQuery.GeminiQuery(args.db, include_gt_cols=True)
        self.gq = query_obj
        self.added = []

        self.gt_cols = query_obj.gt_cols

        # default to all variant columns plus the genotype columns
        if not args.columns:
            genotype_columns = ", ".join(self.gt_cols)
            args.columns = "*," + genotype_columns
        self.set_family_info()
Esempio n. 8
0
def _get_variant_range(args):
    """Return the starting and ending variant id for a given chromosome.

    Both values are None when the query yields no rows.
    """
    gq = GeminiQuery.GeminiQuery(args.db)
    gq.run("""SELECT min(variant_id) as cmin, max(variant_id) as cmax
               FROM   variants
               WHERE  chrom = '%s'
            """ % args.chrom)
    id_range = (None, None)
    # the last row seen supplies the result
    for record in gq:
        id_range = (record['cmin'], record['cmax'])
    return id_range
Esempio n. 9
0
def _get_case_and_control_samples(args):
    """Split sample names by phenotype code.

    Returns ``(cases, controls)``: phenotype 2 names are cases, phenotype 1
    names are controls; samples with any other code are left out.
    """
    gq = GeminiQuery.GeminiQuery(args.db)
    gq.run("SELECT * from samples")
    names_by_code = {1: [], 2: []}
    for record in gq:
        code = int(record["phenotype"])
        if code in names_by_code:
            names_by_code[code].append(record["name"])
    return names_by_code[2], names_by_code[1]
Esempio n. 10
0
def run_query(args):
    """Run the query described by ``args`` and print its rows.

    The header is printed first when ``args.use_header`` is set and the
    query object has one.
    """
    row_filters = get_row_predicates(args)
    add_required_columns_to_query(args)
    gq = GeminiQuery.GeminiQuery(args.db, out_format=select_formatter(args))
    gq.run(args.query,
           args.gt_filter,
           args.show_variant_samples,
           args.sample_delim,
           row_filters,
           needs_genotypes(args))

    if args.use_header and gq.header:
        print(gq.header)

    for record in gq:
        print(record)
Esempio n. 11
0
def family_wise_predicate(args):
    """Build a row predicate requiring agreement from enough families.

    One predicate is created per family; the returned callable is true for
    a row when at least ``args.min_kindreds`` of them accept it.
    """
    formatter = select_formatter(args)
    families = get_family_dict(args)
    # NOTE(review): this query object is never used below; kept so that the
    # constructor call still happens exactly as before.
    gq = GeminiQuery.GeminiQuery(args.db, out_format=formatter)

    def predicate_for(members):
        member_names = [member.name for member in members]
        family_subjects = get_subjects_in_family(args, members).values()
        return select_subjects_predicate(family_subjects, args, member_names)

    predicates = [predicate_for(members) for members in families.values()]

    def predicate(row):
        accepted = sum(check(row) for check in predicates)
        return accepted >= args.min_kindreds
    return predicate
Esempio n. 12
0
def region(parser, args):
    """Report variants for either ``args.region`` or ``args.gene``.

    Does nothing when the database file does not exist, and exits when both
    a region and a gene are requested.  ``parser`` is not used.
    """
    if not os.path.exists(args.db):
        return

    formatter = select_formatter(args)
    gq = GeminiQuery.GeminiQuery(args.db, out_format=formatter)

    by_region = args.region is not None
    by_gene = args.gene is not None
    if by_region and by_gene:
        sys.exit('EXITING: Choose either --reg or --gene, not both.\n')
    if by_region:
        get_region(args, gq)
    elif by_gene:
        get_gene(args, gq)
Esempio n. 13
0
def query(parser, args):
    """Run ``args.query`` against ``args.db`` and print the rows.

    When no database was given, the parser's help is printed and the
    function returns; previously it fell through to ``os.path.exists(None)``,
    which raises ``TypeError``.  A database path that does not exist is
    silently ignored, as before.
    """
    if args.db is None:
        parser.print_help()
        return

    if not os.path.exists(args.db):
        return

    gq = GeminiQuery.GeminiQuery(args.db)
    gq.run(args.query, args.gt_filter, args.show_variant_samples)

    if args.use_header:
        print(gq.header)

    for row in gq:
        print(row)
Esempio n. 14
0
def get_subjects(args):
    """
    Return a dictionary mapping subject name to Subject, built from the
    samples table.  When ``args.sample_filter`` is present and non-empty it
    is appended to the query as the WHERE clause.
    """
    gq = GeminiQuery.GeminiQuery(args.db)
    sample_filter = getattr(args, 'sample_filter', None)
    query = "SELECT * FROM samples"
    if sample_filter:
        query = query + " WHERE " + sample_filter
    gq.c.execute(query)
    found = [Subject(record) for record in gq.c]
    # later rows win when two subjects share a name
    return dict((subject.name, subject) for subject in found)
Esempio n. 15
0
def _get_sample_sex(args):
    """Return a map of sample name to reported sex.

    The stored codes '1' and '2' become 'male' and 'female'; any other
    value is reported as 'unknown'.
    """
    labels = {'1': 'male', '2': 'female'}
    gq = GeminiQuery.GeminiQuery(args.db)
    gq.run("""SELECT name, sex FROM samples""")
    sex_by_sample = {}
    for record in gq:
        sex_by_sample[record['name']] = labels.get(record['sex'], 'unknown')
    return sex_by_sample
Esempio n. 16
0
def run_query(args):
    """Run the query described by ``args`` on the configured keyspace.

    The wall-clock start time taken on entry is passed along to
    ``GeminiQuery.run`` together with the remaining command line options.
    """
    started_at = time.time()
    row_filters = get_row_predicates(args)
    add_required_columns_to_query(args)
    out_format = select_formatter(args)
    want_genotypes = needs_genotypes(args)
    want_gene = needs_gene(args)
    want_sample_names = args.sample_filter or args.family_wise

    gq = GeminiQuery.GeminiQuery(args.contact_points,
                                 args.keyspace,
                                 out_format=out_format)
    # positional arguments, in the order run() receives them
    run_args = (
        args.query, args.gt_filter, args.show_variant_samples,
        args.sample_delim, row_filters, want_genotypes, want_gene,
        args.show_families, args.testing, want_sample_names, args.cores,
        started_at, args.use_header, args.exp_id, args.timeout,
        args.batch_size,
    )
    gq.run(*run_args)
Esempio n. 17
0
def get_subjects(args, skip_filter=False):
    """
    Return a dictionary mapping subject name to Subject, built from the
    samples table.  Unless ``skip_filter`` is set, a non-empty
    ``args.sample_filter`` is appended to the query as the WHERE clause.
    """
    gq = GeminiQuery.GeminiQuery(args.contact_points, args.keyspace)
    query = "SELECT * FROM samples"
    use_filter = (not skip_filter
                  and hasattr(args, 'sample_filter')
                  and args.sample_filter)
    if use_filter:
        query = query + " WHERE " + args.sample_filter

    by_name = {}
    for record in gq.run_simple_query(query):
        person = Subject(record)
        by_name[person.name] = person
    return by_name
Esempio n. 18
0
File: gim.py Progetto: jsh58/gemini
    def gen_candidates(self, group_key):
        """Yield ``(key, rows)`` groups of consecutive query results.

        ``group_key`` is either a callable applied to each row or a string,
        in which case rows are grouped by that item.  ``bcolz_candidates()``
        may return None (run the query unrestricted), a non-empty collection
        of variant ids (the query is restricted to them), or an empty one
        (nothing is yielded).
        """
        if isinstance(group_key, basestring):
            group_key = op.itemgetter(group_key)

        q = self.query
        vids = self.bcolz_candidates()
        if vids is not None:
            if len(vids) == 0:
                # No variants met the criteria.  A plain return ends the
                # generator; ``raise StopIteration`` here is turned into a
                # RuntimeError on interpreters that implement PEP 479.
                return
            q = GeminiQuery.add_variant_ids_to_query(q, vids)
        self.gq.run(q, needs_genotypes=True)

        for grp_key, grp in it.groupby(self.gq, group_key):
            yield grp_key, grp
Esempio n. 19
0
    def gen_candidates(self, group_key):
        """Yield ``(key, rows)`` groups of consecutive query results.

        A string ``group_key`` groups rows by that item; a callable is used
        as the grouping function directly.  When ``bcolz_candidates()``
        returns None the query runs as is, a non-empty result restricts the
        query to those variant ids, and an empty result yields nothing.
        """
        if isinstance(group_key, basestring):
            group_key = op.itemgetter(group_key)

        q = self.query
        vids = self.bcolz_candidates()
        if vids is None:
            self.gq.run(q, needs_genotypes=True)
        elif len(vids) > 0:
            q = GeminiQuery.add_variant_ids_to_query(q, vids)
            self.gq.run(q, needs_genotypes=True)
        else:
            # No variants met the criteria: finish the generator with a
            # plain return.  Raising StopIteration inside a generator body
            # becomes a RuntimeError under PEP 479.
            return

        for grp_key, grp in it.groupby(self.gq, group_key):
            yield grp_key, grp
Esempio n. 20
0
def summarize_query_by_sample(args):
    """Print, per sample, how many queried variants it carries.

    The output is a tab-separated table with the total count and the
    heterozygous and homozygous-alternate counts for each sample.
    """
    gq = GeminiQuery.GeminiQuery(args.db)
    gq.run(args.query, show_variant_samples=True)
    totals, hets, hom_alts = Counter(), Counter(), Counter()
    print("\t".join(["sample", "total", "num_het", "num_hom_alt"]))
    for record in gq:
        totals.update(record["variant_samples"])
        hets.update(record["HET_samples"])
        hom_alts.update(record["HOM_ALT_samples"])
    for sample in totals:
        fields = (sample,
                  totals.get(sample, 0),
                  hets.get(sample, 0),
                  hom_alts.get(sample, 0))
        print("\t".join(str(field) for field in fields))
Esempio n. 21
0
def get_subjects(args, skip_filter=False):
    """
    Return a dictionary mapping subject name to Subject, read from the
    ``samples`` table.  Unless ``skip_filter`` is set, a non-empty
    ``args.sample_filter`` is used as the text of the WHERE clause;
    otherwise the clause text is empty.
    """
    gq = GeminiQuery.GeminiQuery(args.db)

    where_text = ""
    wants_filter = (not skip_filter
                    and hasattr(args, 'sample_filter')
                    and args.sample_filter)
    if wants_filter:
        where_text += args.sample_filter

    samples_table = gq.metadata.tables["samples"]
    res = samples_table.select().where(sql.text(where_text)).execute()

    by_name = {}
    for record in res:
        person = Subject(record)
        by_name[person.name] = person
    return by_name
Esempio n. 22
0
def stats_region(chrom):
    """Return the start/end of every variant inside a region, as JSON data.

    ``chrom`` is given as an argument; ``start`` and ``end`` are read from
    the HTTP GET parameters.  All three originate from the request, so they
    are sanitised before being placed in the SQL text.

    Raises:
        ValueError: if ``start`` or ``end`` is missing or not an integer.
    """
    # int() accepts nothing but a plain integer, so these two values cannot
    # smuggle SQL into the query built below.
    start = int(request.GET.get('start', '').strip())
    end = int(request.GET.get('end', '').strip())

    # chrom ends up inside a quoted SQL string literal: double any embedded
    # single quote so it cannot terminate the literal early.
    safe_chrom = chrom.replace("'", "''")

    # construct a query
    query = "SELECT start, end from variants"
    query += " WHERE chrom = '" + safe_chrom + "'"
    query += " AND start >= " + str(start)
    query += " AND end <= " + str(end)

    # issue the query
    gq = GeminiQuery.GeminiQuery(database)
    gq._set_gemini_browser(True)
    gq.run(query)

    # return query results in JSON format
    return {'features': [dict(row) for row in gq]}
Esempio n. 23
0
def _summarize_by_gene_and_sample(args, query):
    """Write a gene-by-sample burden table to stdout as tab-separated text.

    For every row with a gene name, each heterozygous sample counts once
    and each homozygous-alternate sample counts twice towards that gene.
    """
    gq = GeminiQuery.GeminiQuery(args.db)
    gq.run(query, show_variant_samples=True)
    burden = defaultdict(Counter)
    for record in gq:
        gene = record['gene']
        if not gene:
            continue
        # Counter can't do scalar multiplication, so the homozygous
        # samples are simply added in twice.
        weighted = (Counter(record["HET_samples"])
                    + Counter(record["HOM_ALT_samples"])
                    + Counter(record["HOM_ALT_samples"]))

        del weighted['']
        burden[gene] += weighted

    # NOTE(review): DataFrame.append is deprecated/removed in recent pandas
    # releases; kept as is so the output stays unchanged.
    table = DataFrame({})
    for gene, counts in burden.items():
        gene_frame = DataFrame(counts, columns=counts.keys(), index=[gene])
        table = table.append(gene_frame)
    table = table.replace(np.NaN, 0)
    table.to_csv(sys.stdout, float_format="%d", sep="\t", index_label='gene')
Esempio n. 24
0
def de_novo():
    """Render the de novo page, with candidate rows once the form is sent.

    Without a ``submit`` GET parameter only the bare template is rendered.
    Otherwise the candidates are fetched, using the ``min-depth`` parameter
    as minimum sample depth when it is not empty.
    """
    # nothing submitted yet: show the empty form
    if not request.GET.get('submit', '').strip():
        return template('de_novo.j2', dbfile=database)

    # user clicked the "submit" button
    min_sample_depth = str(request.GET.get('min-depth', '').strip())
    igv_links = request.GET.get('igv_links')

    gq = GeminiQuery.GeminiQuery(database)

    if min_sample_depth:
        row_iter = de_novo_tool.get_de_novo_candidates(
            gq.c, int(min_sample_depth))
    else:
        row_iter = de_novo_tool.get_de_novo_candidates(gq.c)

    return template('de_novo.j2', dbfile=database,
                    rows=row_iter,
                    igv_links=igv_links)
Esempio n. 25
0
def tag_somatic_mutations(args):
    """Find variants that look somatic in tumor/normal pairs and tag them.

    Every family with exactly two subjects is treated as a tumor/normal
    pair.  A variant is a candidate for a pair when the normal genotype is
    HOM_REF, the tumor genotype is known and not HOM_REF, both samples reach
    their minimum depth, and the normal sample stays within the optional
    alternate-allele frequency/count limits.  One tab-separated line is
    printed per candidate; the column header is printed only for a dry run.
    Unless ``args.dry_run`` is set, the candidate variants get
    ``is_somatic = 1`` in the database.
    """
    t_n_pairs = gemini_subjects.get_families(args.db)

    gq = GeminiQuery.GeminiQuery(args.db)

    depth_string, qual_string, ssc_string, chrom_string = ("", "", "", "")
    if args.min_depth:
        depth_string = " AND depth >= %s" % args.min_depth
    if args.min_qual:
        qual_string = " AND qual >= %s" % args.min_qual
    if args.min_somatic_score:
        ssc_string = " AND (type='sv' \
                         OR somatic_score >= %s)" % args.min_somatic_score
    if args.chrom:
        chrom_string = " AND chrom = '%s'" % args.chrom

    # The chromosome restriction is just one more optional clause, so the
    # query is built unconditionally.  (It used to be assigned only when
    # args.chrom was None, leaving `query` unbound whenever a chromosome was
    # requested.)  The continuation lines are part of the string literal.
    query = "SELECT variant_id, chrom, start, end, \
                        ref, alt, gene, impact, gts, gt_types, \
                        gt_ref_depths, gt_alt_depths \
                 FROM variants \
                 WHERE 1 \
                 %s \
                 %s \
                 %s \
                 %s" % (depth_string, qual_string, ssc_string, chrom_string)

    gq.run(query)
    smp2idx = gq.sample_to_idx

    def alt_frequency(alt_depth, ref_depth):
        # fraction of reads supporting the alternate allele, 'NA' if no reads
        try:
            return float(alt_depth) / (float(alt_depth) + float(ref_depth))
        except ZeroDivisionError:
            return 'NA'

    somatic_counter = 0
    somatic_v_ids = []

    if args.dry_run:
        print('\t'.join(['tum_name', 'tum_gt', 'tum_alt_freq', 'tum_alt_depth', 'tum_depth',
                         'nrm_name', 'nrm_gt', 'nrm_alt_freq', 'nrm_alt_depth', 'nrm_depth',
                         'chrom', 'start', 'end', 'ref', 'alt', 'gene']))

    for row in gq:
        # we can skip variants where all genotypes are identical
        if len(set(row['gt_types'])) == 1:
            continue

        for pair in t_n_pairs:

            samples = pair.subjects
            if len(samples) != 2:
                continue

            tumor, normal = samples[0], samples[1]
            # swap if we guessed the tumor incorrectly
            if tumor.affected is False:
                tumor, normal = normal, tumor

            tum_idx = smp2idx[tumor.name]
            nrm_idx = smp2idx[normal.name]

            tum_gt = row['gts'][tum_idx]
            nrm_gt = row['gts'][nrm_idx]

            tum_gt_type = row['gt_types'][tum_idx]
            nrm_gt_type = row['gt_types'][nrm_idx]

            if nrm_gt_type == tum_gt_type:
                continue

            if nrm_gt_type == UNKNOWN or tum_gt_type == UNKNOWN:
                continue

            # the genotypes pass the smell test for somatic mutations only
            # when the normal is HOM_REF and the tumor is not.
            if not (nrm_gt_type == HOM_REF and tum_gt_type != HOM_REF):
                continue

            tum_ref_depth = row['gt_ref_depths'][tum_idx]
            nrm_ref_depth = row['gt_ref_depths'][nrm_idx]

            tum_alt_depth = row['gt_alt_depths'][tum_idx]
            nrm_alt_depth = row['gt_alt_depths'][nrm_idx]

            # total observed depth
            nrm_depth = nrm_alt_depth + nrm_ref_depth
            tum_depth = tum_alt_depth + tum_ref_depth

            if (nrm_depth < args.min_norm_depth
                    or tum_depth < args.min_tumor_depth):
                continue

            tum_alt_freq = alt_frequency(tum_alt_depth, tum_ref_depth)
            nrm_alt_freq = alt_frequency(nrm_alt_depth, nrm_ref_depth)

            # apply evidence thresholds.  An undefined ('NA') normal
            # frequency fails the frequency limit explicitly, which is what
            # the Python 2 string-vs-number comparison did implicitly.
            freq_too_high = args.max_norm_alt_freq and (
                nrm_alt_freq == 'NA'
                or nrm_alt_freq > args.max_norm_alt_freq)
            count_too_high = (args.max_norm_alt_count
                              and nrm_alt_depth > args.max_norm_alt_count)
            if freq_too_high or count_too_high:
                continue

            somatic_counter += 1
            somatic_v_ids.append(row['variant_id'])

            print('\t'.join(str(s) for s in [
                tumor.name, tum_gt, tum_alt_freq, tum_alt_depth, tum_depth,
                normal.name, nrm_gt, nrm_alt_freq, nrm_alt_depth, nrm_depth,
                row['chrom'], row['start'], row['end'], row['ref'],
                row['alt'], row['gene']]))

    if args.dry_run:
        print("Would have identified and set %s somatic mutations"
              % somatic_counter)
        return

    # A variant can be a candidate in several pairs; update each id once.
    distinct_ids = sorted(set(somatic_v_ids))
    if not distinct_ids:
        # Nothing to tag: skip the UPDATE, since an empty "IN ()" list is
        # not valid SQL on every database engine.
        print("Identified and set %s somatic mutations" % somatic_counter)
        return

    import database
    conn, metadata = database.get_session_metadata(args.db)

    # now set the identified mutations to True.
    update_qry = "UPDATE variants SET is_somatic = 1 "
    update_qry += " WHERE variant_id IN (%s)"
    update_qry %= ",".join(str(v_id) for v_id in distinct_ids)
    res = conn.execute(update_qry)
    # one row per distinct variant; somatic_counter counts (variant, pair)
    # hits and may be larger.
    assert res.rowcount == len(distinct_ids)
    print("Identified and set %s somatic mutations" % somatic_counter)
    conn.commit()
 def __init__(self, args, model):
     """Keep ``args`` and ``model`` and open a genotype-aware query."""
     self.args, self.model = args, model
     query_obj = GeminiQuery.GeminiQuery(args.db, include_gt_cols=True)
     self.gq = query_obj
Esempio n. 27
0
def get_de_novo_candidates(args):
    """
    Report candidate variants that appear to be de novo
    mutations in the child. We cannot distinguish mutations that
    occurred in the parental germline from those that occurred
    early in development in the child post-conception.

    One line is printed per (variant, family) combination whose genotypes
    satisfy the family's de novo filter and whose samples all reach
    ``args.min_sample_depth``.
    """
    gq = GeminiQuery.GeminiQuery(args.db, include_gt_cols=True)

    if args.columns is not None:
        # the user only wants to report a subset of the columns
        query = "SELECT " + str(args.columns) + " FROM variants"
    else:
        # report the kitchen sink
        query = "SELECT *" + \
                ", gts, gt_types, gt_phases, gt_depths, \
                gt_ref_depths, gt_alt_depths, gt_quals"                                                        + \
                " FROM variants"

    # add any non-genotype column limits to the where clause
    if args.filter:
        query += " WHERE " + args.filter

    # collect family info; the lists below are parallel, with one entry per
    # family that produced a usable de novo filter.
    families = subjects.get_families(gq.c)
    family_ids = []
    family_masks = []
    family_sample_gt_labels = []
    family_sample_gt_columns = []
    family_sample_depth_columns = []
    for family in families:
        family_filter = family.get_de_novo_filter()
        # families whose filter is the literal string "False" are skipped
        if family_filter != "False":
            family_masks.append(family_filter)
            family_sample_gt_labels.append(
                family.get_subject_genotype_labels())
            family_sample_gt_columns.append(
                family.get_subject_genotype_columns())
            family_sample_depth_columns.append(
                family.get_subject_depth_columns())
            family_ids.append(family.family_id)

    # run the query applying any genotype filters provided by the user.
    gq.run(query)

    # print a header
    print "family_id\tfamily_members\tfamily_genotypes\tdepths\t",
    print gq.header

    # report the resulting variants for each family
    for row in gq:

        # interrogate the genotypes present in each family member to see
        # whether they conform to the genetic model being tested.
        # NOTE(review): the strings passed to eval() below presumably refer
        # to these three locals by name (gt_types, gts, gt_depths) - do not
        # rename them without checking the code that builds the strings.
        gt_types = row['gt_types']
        gts = row['gts']
        gt_depths = row['gt_depths']

        # test the variant for each family in the db
        for idx, fam_id in enumerate(family_ids):
            family_genotype_mask = family_masks[idx]
            family_sample_gt_label = family_sample_gt_labels[idx]
            family_sample_gt_cols = family_sample_gt_columns[idx]
            family_sample_dp_cols = family_sample_depth_columns[idx]

            # skip if the variant doesn't meet a de novo model
            # for this family.
            # NOTE(review): eval() runs the string as Python code; it is
            # only safe as long as the filter text never comes from
            # untrusted input.
            if not eval(family_genotype_mask):
                continue

            # make sure each sample's genotype had sufficient coverage.
            # otherwise, ignore
            insufficient_depth = False
            for col in family_sample_dp_cols:
                depth = int(eval(col))
                if depth < args.min_sample_depth:
                    insufficient_depth = True
                    break
            if insufficient_depth:
                continue

            # family id, member labels, genotypes and depths, then the row
            print str(fam_id) + "\t" + \
               ",".join([str(s) for s in family_sample_gt_label]) + "\t", \
               ",".join([str(eval(s)) for s in family_sample_gt_cols]) + "\t", \
               ",".join([str(eval(s)) for s in family_sample_dp_cols]) + "\t",
            print row
Esempio n. 28
0
def tag_somatic_mutations(args):
    """Find variants that look somatic in tumor/normal pairs and tag them.

    Every family with exactly two subjects is treated as a tumor/normal
    pair.  A variant is a candidate for a pair when the normal genotype is
    HOM_REF, the tumor genotype is known and not HOM_REF, both samples reach
    their minimum depth, and the normal sample does not exceed the
    alternate-allele frequency and count limits.  One tab-separated line is
    printed per candidate; the column header is printed only for a dry run.
    Unless ``args.dry_run`` is set, the candidates get their ``is_somatic``
    column set in the database.
    """
    t_n_pairs = gemini_subjects.get_families(args.db)

    gq = GeminiQuery.GeminiQuery(args.db)

    # The continuation lines below are part of the string literals.
    if args.chrom is None:
        query = "SELECT variant_id, chrom, start, end, \
                        ref, alt, gene, impact, gts, gt_types, \
                        gt_ref_depths, gt_alt_depths \
                 FROM variants \
                 WHERE depth >= " + str(args.min_depth) + \
                 " AND   qual >= " + str(args.min_qual)
    else:
        query = "SELECT variant_id, chrom, start, end, \
                ref, alt, gene, impact, gts, gt_types, \
                gt_ref_depths, gt_alt_depths \
         FROM variants \
         WHERE depth >= " + str(args.min_depth) + \
         " AND qual >= " + str(args.min_qual) + \
         " AND chrom = \'" + args.chrom + "\'"

    gq.run(query)
    smp2idx = gq.sample_to_idx

    def alt_frequency(alt_depth, ref_depth):
        # Fraction of reads supporting the alternate allele.  A sample with
        # no reads at all yields 'NA' instead of a ZeroDivisionError.
        try:
            return float(alt_depth) / (float(alt_depth) + float(ref_depth))
        except ZeroDivisionError:
            return 'NA'

    somatic_counter = 0
    somatic_v_ids = []

    if args.dry_run:
        print('\t'.join(['tum_name', 'tum_gt', 'tum_alt_freq', 'tum_alt_depth', 'tum_depth',
                         'nrm_name', 'nrm_gt', 'nrm_alt_freq', 'nrm_alt_depth', 'nrm_depth',
                         'chrom', 'start', 'end', 'ref', 'alt', 'gene']))

    for row in gq:

        # we can skip variants where all genotypes are identical
        if len(set(row['gt_types'])) == 1:
            continue

        for pair in t_n_pairs:

            samples = pair.subjects
            if len(samples) != 2:
                continue

            tumor, normal = samples[0], samples[1]
            # swap if we guessed the tumor incorrectly
            if tumor.affected is False:
                tumor, normal = normal, tumor

            tum_idx = smp2idx[tumor.name]
            nrm_idx = smp2idx[normal.name]

            tum_gt = row['gts'][tum_idx]
            nrm_gt = row['gts'][nrm_idx]

            tum_gt_type = row['gt_types'][tum_idx]
            nrm_gt_type = row['gt_types'][nrm_idx]

            if nrm_gt_type == tum_gt_type:
                continue

            if nrm_gt_type == UNKNOWN or tum_gt_type == UNKNOWN:
                continue

            # the genotypes pass the smell test for somatic mutations only
            # when the normal is HOM_REF and the tumor is not.
            if not (nrm_gt_type == HOM_REF and tum_gt_type != HOM_REF):
                continue

            tum_ref_depth = row['gt_ref_depths'][tum_idx]
            nrm_ref_depth = row['gt_ref_depths'][nrm_idx]

            tum_alt_depth = row['gt_alt_depths'][tum_idx]
            nrm_alt_depth = row['gt_alt_depths'][nrm_idx]

            # total observed depth
            nrm_depth = nrm_alt_depth + nrm_ref_depth
            tum_depth = tum_alt_depth + tum_ref_depth

            if (nrm_depth < args.min_norm_depth
                    or tum_depth < args.min_tumor_depth):
                continue

            tum_alt_freq = alt_frequency(tum_alt_depth, tum_ref_depth)
            nrm_alt_freq = alt_frequency(nrm_alt_depth, nrm_ref_depth)

            # apply evidence thresholds.  With no reads in the normal sample
            # its frequency is undefined ('NA'); such a candidate cannot be
            # checked against the limit and is skipped.
            if (nrm_alt_freq == 'NA'
                    or nrm_alt_freq > args.max_norm_alt_freq
                    or nrm_alt_depth > args.max_norm_alt_count):
                continue

            somatic_counter += 1
            # (new is_somatic value, variant id): parameters for the UPDATE
            somatic_v_ids.append((1, row['variant_id']))

            print('\t'.join(str(s) for s in [
                tumor.name, tum_gt, tum_alt_freq, tum_alt_depth, tum_depth,
                normal.name, nrm_gt, nrm_alt_freq, nrm_alt_depth, nrm_depth,
                row['chrom'], row['start'], row['end'], row['ref'],
                row['alt'], row['gene']]))

    if not args.dry_run:
        conn = sqlite3.connect(args.db)
        try:
            # autocommit mode: each statement is committed as it executes
            conn.isolation_level = None
            c = conn.cursor()

            # now set the identified mutations to True.
            update_qry = "UPDATE variants SET is_somatic = ? "
            update_qry += " WHERE variant_id = ?"
            c.executemany(update_qry, somatic_v_ids)
        finally:
            # the connection used to be left open
            conn.close()
        print("Identified and set %s somatic mutations" % somatic_counter)
    else:
        print("Would have identified and set %s somatic mutations"
              % somatic_counter)
Esempio n. 29
0
def _nonsynonymous_variants(db):
    """
    Run the variant query used for nonsynonymous variants against ``db``.

    Selects ``variant_id`` and ``gene`` for every row of ``variants`` whose
    ``codon_change`` is not the string 'None', asking the query object to
    include variant samples.

    Returns the GeminiQuery object after ``run`` has been called, ready for
    the caller to iterate over.
    """
    sql = "SELECT variant_id, gene from variants WHERE codon_change != 'None'"
    variant_query = GeminiQuery.GeminiQuery(db)
    variant_query.run(sql, show_variant_samples=True)
    return variant_query
Esempio n. 30
0
def report_fusion(event, subjects_dict, args):
    """
    Report a candidate gene fusion as one tab-delimited line on stdout.

    The event is printed only if it passes every filter below; otherwise
    the function returns without printing anything.

    Single-line events (``len(event) == 1``):
      * gene_summary is queried for a gene on the same chromosome whose
        transcript span does not overlap the row's own transcript span and
        which contains the row's ``end`` coordinate; if none is found the
        event is dropped.
      * for sub_type DEL or DUP the two genes must be on the same strand;
        for sub_type INV they must be on opposite strands.

    Two-line events (``len(event) == 2``):
      * if both ends are on the same chromosome, their transcript spans
        must not overlap.
      * if the first two elements of ``sv_strand`` of the first end are
        equal, the gene strands must be equal; if they differ, the gene
        strands must differ.

    In both cases, when ``args.in_cosmic_census`` is set, at least one of
    the two genes must have a truthy ``in_cosmic_census`` value.

    Events of any other length are not reported.

    Parameters:
        event: list of one or two variant rows (indexable by column name).
            Rows are taken with ``pop()``, so a one- or two-element list
            is emptied.
        subjects_dict: not used by this function.
        args: object providing ``db`` and ``in_cosmic_census``.

    Returns:
        None.

    Output columns, in order: chrom of end 1, left start - 1, left end,
    chrom of end 2, right start - 1, right end, sv_event_id, qual,
    sv_strand[0], sv_strand[1], sub_type, gene 1, gene 2, sv_tool,
    sv_evidence_type, sv_is_precise, comma-joined variant_samples.
    """
    n_lines = len(event)

    # Anything other than a one- or two-line event has no defined ends.
    # Without this guard such events reached the print below and failed
    # with an unbound-name error on ``end1``.
    if n_lines not in (1, 2):
        return

    # filter single-line events
    if n_lines == 1:
        sv = event.pop()
        gene1_strand = sv['strand']

        # query the table to test whether the END breakpoint lies in a gene
        gq = GeminiQuery.GeminiQuery(args.db)

        # NOTE(review): the values are interpolated straight into the SQL
        # text; they appear to come from a database row rather than from
        # user input -- confirm against the caller.
        query = """SELECT gene,
                          strand,
                          in_cosmic_census
                   FROM   gene_summary
                   WHERE  gene_summary.chrom = '%s'
                   AND    (gene_summary.transcript_min_start > %s
                          OR gene_summary.transcript_max_end < %s)
                   AND    gene_summary.transcript_min_start < %s
                   AND    gene_summary.transcript_max_end > %s
                   AND    gene_summary.gene != 'None'
                   LIMIT  1
                """ % (sv['chrom'], sv['transcript_max_end'],
                       sv['transcript_min_start'], sv['end'], sv['end'])

        gq.run(query)
        gene2, gene2_strand, gene2_cosmic = (None, None, None)
        for row in gq:
            gene2 = row['gene']
            gene2_strand = row['strand']
            gene2_cosmic = row['in_cosmic_census']
            break  # just get the first gene interrupted by the breakend

        # stop if breakpoint2 is intergenic
        if gene2 is None:
            return

        # if SV is a deletion or duplication, genes must be same strand
        if sv['sub_type'] in ('DEL', 'DUP') and gene1_strand != gene2_strand:
            return
        # if SV is an inversion, genes must be opposite strands for fusion
        if sv['sub_type'] == 'INV' and gene1_strand == gene2_strand:
            return
        # check COSMIC status, if required
        if args.in_cosmic_census and not (sv['in_cosmic_census']
                                          or gene2_cosmic):
            return

        # expose the single row through the same names the two-line
        # branch uses, so the reporting code below is shared
        end1 = sv
        end2_chrom = end1['chrom']
        end2_start = sv['sv_cipos_start_right']
        end2_end = sv['sv_cipos_end_right']

    # filter multi-line events
    else:
        end1 = event.pop()
        end2 = event.pop()
        # these are gene_summary.strand values
        gene1_strand = end1['strand']
        gene2_strand = end2['strand']

        # require that the genes are non-overlapping
        if (end1['chrom'] == end2['chrom']
                and end1['transcript_max_end'] >= end2['transcript_min_start']
                and end1['transcript_min_start'] <= end2['transcript_max_end']):
            return

        same_sv_strand = end1['sv_strand'][0] == end1['sv_strand'][1]
        # if breakpoint joins same strand,
        # then genes must be same strand for fusion
        if same_sv_strand and gene1_strand != gene2_strand:
            return
        # if breakpoint joins opposite strands,
        # then genes must also be opposite strands for fusion
        if not same_sv_strand and gene1_strand == gene2_strand:
            return
        # check COSMIC status, if required
        if args.in_cosmic_census and not (end1['in_cosmic_census']
                                          or end2['in_cosmic_census']):
            return

        # store the second end for compatibility with single-line SVs
        gene2 = end2['gene']
        end2_chrom = end2['chrom']
        end2_start = end2['sv_cipos_start_right']
        end2_end = end2['sv_cipos_end_right']

    # fusion passes all filters, print
    fields = [
        end1['chrom'], end1['sv_cipos_start_left'] - 1,
        end1['sv_cipos_end_left'], end2_chrom, end2_start - 1, end2_end,
        end1['sv_event_id'], end1['qual'], end1['sv_strand'][0],
        end1['sv_strand'][1], end1['sub_type'], end1['gene'], gene2,
        end1['sv_tool'], end1['sv_evidence_type'], end1['sv_is_precise'],
        ','.join(end1['variant_samples'])
    ]
    # a single parenthesised argument prints identically whether ``print``
    # is the Python 2 statement or the Python 3 function
    print('\t'.join(map(str, fields)))
    return
Esempio n. 31
0
def get_homozygosity_runs(args):
    """
    Collect per-sample genotype tracks chromosome by chromosome and pass
    each chromosome's tracks to ``sweep_genotypes_for_rohs``.

    SNP rows with a NULL ``filter`` and ``depth >= args.min_total_depth``
    are read in ``chrom, end`` order.  For every selected sample whose
    genotype depth is at least ``args.min_genotype_depth`` the track gets:
      * the variant's ``end`` position for a HOM_ALT or HOM_REF genotype,
      * 'H' for a HET genotype,
      * 'U' for an UNKNOWN genotype.
    Whenever the chromosome changes, the tracks gathered so far are swept
    and reset; the last chromosome is swept after the loop.  That final
    sweep also runs when the query returned no rows, in which case it is
    called with a chromosome of None and an empty mapping.

    A header line is printed to stdout before the rows are read and
    progress messages go to stderr.

    Parameters:
        args: object providing ``db``, ``samples`` (comma-separated sample
            names, or None for all samples), ``min_total_depth`` and
            ``min_genotype_depth``.

    Raises:
        ValueError: if a name in ``args.samples`` is not a known sample.
    """
    gq = GeminiQuery.GeminiQuery(args.db)

    # get a mapping of sample ids to sample indices
    idx2smp = gq.index2sample
    smp2idx = gq.sample2index

    # prepare a lookup of just the samples
    # for which the user wishes to search for ROHs
    if args.samples is not None:
        sm_index = []
        for sample in args.samples.strip().split(","):
            # Only a missing key means "unknown sample"; a bare except
            # here would also trap KeyboardInterrupt and unrelated errors.
            # NOTE(review): assumes sample2index raises KeyError for a
            # missing name, as a dict does -- confirm.
            try:
                sm_index.append(smp2idx[sample])
            except KeyError:
                raise ValueError("Sample %s could not be found.\n" \
                    % (sample))
    else:
        sm_index = [smp2idx[sample] for sample in smp2idx]

    ###########################################################################
    # Phase 1. Retrieve the variants for each chrom/sample
    ###########################################################################
    # NOTE(review): min_total_depth is concatenated into the SQL text;
    # presumably it is a numeric command-line option -- confirm.
    query = "SELECT chrom, start, end, gt_types, gt_depths \
              FROM variants \
              WHERE type = 'snp' \
              AND   filter is NULL \
              AND   depth >= " + str(args.min_total_depth) + \
            " ORDER BY chrom, end"

    sys.stderr.write(
        "LOG: Querying and ordering variants by chromosomal position.\n")
    gq.run(query, needs_genotypes=True)

    # a single parenthesised argument prints identically whether ``print``
    # is the Python 2 statement or the Python 3 function
    print("\t".join([
        'chrom', 'start', 'end', 'sample', 'num_of_snps', 'density_per_kb',
        'run_length_in_bp'
    ]))

    variants_seen = 0
    samples = defaultdict(list)
    prev_chrom = None
    curr_chrom = None
    for row in gq:
        variants_seen += 1
        if variants_seen % 10000 == 0:
            sys.stderr.write("LOG: Loaded %d variants. Current variant on %s, position %d.\n" \
                % (variants_seen, row['chrom'], row['end']))

        gt_types = row['gt_types']
        gt_depths = row['gt_depths']
        curr_chrom = row['chrom']

        # the chromosome has changed. search for ROHs in the previous chrom
        if curr_chrom != prev_chrom and prev_chrom is not None:
            sweep_genotypes_for_rohs(args, prev_chrom, samples)
            samples = defaultdict(list)

        # associate the genotype for the variant with each sample
        for idx in sm_index:
            # the genotype must have had sufficient depth to be considered
            if gt_depths[idx] < args.min_genotype_depth:
                continue

            sample = idx2smp[idx]
            gt_type = gt_types[idx]

            if gt_type in (HOM_ALT, HOM_REF):
                samples[sample].append(row['end'])
            elif gt_type == HET:
                samples[sample].append('H')
            elif gt_type == UNKNOWN:
                samples[sample].append('U')

        prev_chrom = curr_chrom

    # search for ROHs in the final chromosome
    sweep_genotypes_for_rohs(args, curr_chrom, samples)