def get_variant_genes(res, args, idx_to_sample):
    """Group variant annotations by the samples that carry each variant.

    Args:
        res: iterable of result rows supporting lookup by column name. Each
            row must provide ``gt_types`` (a packed genotype blob) and the
            annotation columns listed in ``columns`` below.
        args: unused; kept so existing callers need not change.
        idx_to_sample: mapping from a position in the unpacked ``gt_types``
            array to the corresponding sample name.

    Returns:
        A ``defaultdict(list)`` keyed by sample name. Each value is a list
        of 12-tuples of strings ``(gene, variant_id, chrom, start, end,
        impact, biotype, in_dbsnp, clinvar_sig, clinvar_disease_name,
        aaf_1kg_all, aaf_esp_all)``, one per row in which that sample's
        genotype type is HET or HOM_ALT.
    """
    columns = ('gene', 'variant_id', 'chrom', 'start', 'end', 'impact',
               'biotype', 'in_dbsnp', 'clinvar_sig', 'clinvar_disease_name',
               'aaf_1kg_all', 'aaf_esp_all')
    samples = defaultdict(list)
    for r in res:
        # A row without a gene never contributes, so skip it before paying
        # for genotype decompression.
        if str(r['gene']) == "None":
            continue
        annotation = tuple(str(r[col]) for col in columns)
        gt_types = Z.unpack_genotype_blob(r['gt_types'])
        for idx, gt_type in enumerate(gt_types):
            if gt_type == HET or gt_type == HOM_ALT:
                samples[idx_to_sample[idx]].append(annotation)
    return samples
def get_variant_genes(c, args, idx_to_sample):
    """Collect, per sample, the annotated variants that the sample carries.

    Args:
        c: iterable of result rows (e.g. an executed cursor) supporting
            lookup by column name. Each row must provide ``gt_types`` (a
            packed genotype blob) and the annotation columns read below.
        args: unused; kept for interface compatibility.
        idx_to_sample: mapping from a position in the unpacked ``gt_types``
            array to the corresponding sample name.

    Returns:
        A ``defaultdict(list)`` mapping sample name to a list of 12-tuples
        of strings ``(gene, variant_id, chrom, start, end, impact, biotype,
        in_dbsnp, clinvar_sig, clinvar_disease_name, aaf_1kg_all,
        aaf_esp_all)``. A tuple is appended for every row where the sample's
        genotype type is HET or HOM_ALT and the gene is not "None".
    """
    annotation_cols = ('gene', 'variant_id', 'chrom', 'start', 'end',
                       'impact', 'biotype', 'in_dbsnp', 'clinvar_sig',
                       'clinvar_disease_name', 'aaf_1kg_all', 'aaf_esp_all')
    samples = defaultdict(list)
    for r in c:
        # The gene test does not depend on the sample, so do it once per row
        # and avoid unpacking genotypes for rows that cannot be reported.
        if str(r['gene']) == "None":
            continue
        value = tuple(str(r[name]) for name in annotation_cols)
        gt_types = Z.unpack_genotype_blob(r['gt_types'])
        for idx, gt_type in enumerate(gt_types):
            if gt_type == HET or gt_type == HOM_ALT:
                samples[idx_to_sample[idx]].append(value)
    return samples
def _report_variant_pathways(c, args, idx_to_sample):
    """Print the pathways hit by each variant, one line per carrying sample.

    For every row, the (gene, transcript) pair is looked up in the three
    pathway mappings returned by ``get_pathways(args)``; the first mapping
    that contains the pair is used. Rows with no pathways are skipped.

    Each printed line is tab-separated: chrom, start, end, ref, alt, impact,
    sample, genotype, gene, transcript, comma-joined pathways.

    Args:
        c: iterable of result rows supporting lookup by column name.
        args: passed through to ``get_pathways``.
        idx_to_sample: mapping from a position in the unpacked genotype
            arrays to the corresponding sample name.
    """
    (agn_paths, hgnc_paths, ensembl_paths) = get_pathways(args)
    # Order matters: the first mapping that knows the key wins.
    path_sources = (agn_paths, hgnc_paths, ensembl_paths)

    for r in c:
        gene = str(r['gene'])
        trans = str(r['transcript'])
        key = (gene, trans)

        pathways = []
        for source in path_sources:
            if key in source:
                pathways = _get_pathways(gene, trans, source[key],
                                         allow_none=False)
                break

        # Whether a row has pathways does not depend on the sample, so decide
        # once and skip genotype decompression for rows with nothing to say.
        if len(pathways) == 0:
            continue
        pathlist = ",".join(pathways)

        gt_types = Z.unpack_genotype_blob(r['gt_types'])
        gts = Z.unpack_genotype_blob(r['gts'])
        for idx, gt_type in enumerate(gt_types):
            if gt_type == HET or gt_type == HOM_ALT:
                print("\t".join([r['chrom'], str(r['start']), str(r['end']),
                                 r['ref'], r['alt'], r['impact'],
                                 idx_to_sample[idx], gts[idx], gene, trans,
                                 pathlist]))
def _report_variant_pathways(res, args, idx_to_sample):
    """Report, per sample, the pathways affected by each variant it carries.

    The (gene, transcript) pair of every row is resolved against the three
    mappings returned by ``get_pathways(args)``, tried in the order they are
    returned; the first one containing the pair supplies the pathways. Rows
    that resolve to no pathway produce no output.

    Output lines are tab-separated: chrom, start, end, ref, alt, impact,
    sample, genotype, gene, transcript, comma-joined pathways.

    Args:
        res: iterable of result rows supporting lookup by column name.
        args: passed through to ``get_pathways``.
        idx_to_sample: mapping from a position in the unpacked genotype
            arrays to the corresponding sample name.
    """
    (agn_paths, hgnc_paths, ensembl_paths) = get_pathways(args)

    for r in res:
        gene = str(r['gene'])
        trans = str(r['transcript'])
        key = (gene, trans)

        # First mapping that contains the key wins.
        pathways = []
        for mapping in (agn_paths, hgnc_paths, ensembl_paths):
            if key in mapping:
                pathways = _get_pathways(gene, trans, mapping[key],
                                         allow_none=False)
                break

        # Sample-independent test hoisted out of the genotype loop; rows
        # without pathways no longer pay for blob decompression.
        if len(pathways) == 0:
            continue
        pathlist = ",".join(pathways)

        gt_types = Z.unpack_genotype_blob(r['gt_types'])
        gts = Z.unpack_genotype_blob(r['gts'])
        for idx, gt_type in enumerate(gt_types):
            if gt_type == HET or gt_type == HOM_ALT:
                print("\t".join([r['chrom'], str(r['start']), str(r['end']),
                                 r['ref'], r['alt'], r['impact'],
                                 idx_to_sample[idx], gts[idx], gene, trans,
                                 pathlist]))
def get_ind_lof(c, args):
    """Print every loss-of-function SNP genotype carried by each sample.

    Runs a query joining ``variants`` and ``variant_impacts`` restricted to
    ``is_lof='1'`` SNPs, prints a tab-separated header, then prints one
    tab-separated line for each sample whose genotype type at the variant
    is HET or HOM_ALT.

    The position of the change within the transcript is taken from
    ``aa_change`` when it matches ``\\S(\\d+)\\S`` and otherwise from the
    part of ``aa_length`` before "/" (unless that part is "-"). The
    fraction ``var_trans_pct`` is only computed when a position was found;
    otherwise the string "None" is printed for it.

    Args:
        c: database cursor; used for the sample lookup, to execute the
            query, and iterated for the result rows.
        args: unused; kept for interface compatibility.
    """
    idx_to_sample = util.map_indices_to_samples(c)

    query = "SELECT v.chrom, v.start, v.end, v.ref, v.alt, \
                    v.impact, v.aa_change, v.aa_length, \
                    v.gt_types, v.gts, i.gene, \
                    i.transcript, i.biotype\
             FROM variants v, variant_impacts i \
             WHERE v.variant_id = i.variant_id \
             AND i.is_lof='1' \
             AND v.type = 'snp'"
    c.execute(query)

    # header
    print('\t'.join(['chrom', 'start', 'end', 'ref', 'alt',
                     'highest_impact', 'aa_change', 'var_trans_pos',
                     'trans_aa_length', 'var_trans_pct',
                     'sample', 'genotype', 'gene', 'transcript',
                     'trans_type']))

    for r in c:
        gt_types = Z.unpack_genotype_blob(r['gt_types'])
        gts = Z.unpack_genotype_blob(r['gts'])
        gene = str(r['gene'])
        trans = str(r['transcript'])
        aa_change = str(r['aa_change'])
        aa_length = str(r['aa_length'])

        transcript_pos = None
        transcript_pct = None
        if aa_change != 'None':
            try:
                # transcript_pos for snpEff annotated VCF
                transcript_pos = re.findall(r'\S(\d+)\S', aa_change)[0]
            except IndexError:
                # transcript_pos for VEP annotated VCF
                if aa_length != 'None' and \
                        aa_length.split("/")[0] != "-":
                    transcript_pos = aa_length.split("/")[0]

        # Without a position there is nothing to divide; previously this
        # reached float(None) and raised TypeError (e.g. aa_length "-/300").
        if transcript_pos is not None and aa_length != 'None':
            if "/" in aa_length:
                # VEP style "pos/length"
                total_length = aa_length.split("/")[1]
            else:
                # snpEff style: aa_length is the length itself
                total_length = aa_length
            transcript_pct = float(transcript_pos) / float(total_length)

        for idx, gt_type in enumerate(gt_types):
            if gt_type == HET or gt_type == HOM_ALT:
                print("\t".join([r['chrom'], str(r['start']),
                                 str(r['end']), r['ref'], r['alt'],
                                 r['impact'],
                                 r['aa_change'] or 'None',
                                 transcript_pos or 'None',
                                 r['aa_length'] or 'None',
                                 # str(None) is already 'None'
                                 str(transcript_pct),
                                 idx_to_sample[idx],
                                 gts[idx], gene, trans,
                                 r['biotype'] or 'None']))
def get_auto_dominant_candidates(c):
    """
    Report candidate variants that meet an autosomal dominant
    inheritance model.

    For each family returned by ``subjects.get_families(c)`` this yields a
    header list (``family_id``, the non-genotype query columns, then the
    family's genotype labels) followed by one list of strings per variant
    whose genotypes satisfy the family's dominant filter.

    Families whose filter is ``None`` are skipped without running the
    query, matching what ``get_auto_recessive_candidates`` does for
    families that cannot be tested.

    NOTE(review): the filter and the genotype column expressions are run
    through ``eval()`` against the local ``gt_types`` / ``gts`` arrays.
    They come from the family object; confirm they can never contain
    untrusted text.

    Args:
        c: database cursor used to load families, execute the query and
            iterate the results.

    Yields:
        list of str: a header, then result rows, per family.
    """
    families = subjects.get_families(c)

    query = "SELECT chrom, start, end, ref, alt, gene, \
                    impact, impact_severity, gt_types, gts \
             FROM variants \
             WHERE impact_severity != 'LOW'"

    for family in families:
        family_genotype_mask = family.get_auto_dominant_filter()
        # eval(None) would raise TypeError; a family without a usable
        # filter simply has nothing to report.
        if family_genotype_mask is None:
            continue
        family_sample_gt_columns = family.get_subject_genotype_columns()
        family_sample_gt_labels = family.get_subject_genotype_labels()

        # The cursor is consumed per family, so the query is re-run each time.
        c.execute(query)
        # gt* columns are excluded here, so no further filtering of
        # 'gt_types' / 'gts' is needed when reporting below.
        all_query_cols = [str(desc[0]) for desc in c.description
                          if not desc[0].startswith("gt")]

        # yield a header
        header = ["family_id"]
        header.extend(all_query_cols)
        header.extend(family_sample_gt_labels)
        yield header

        # yield the resulting auto_dom variants for this family
        for row in c:
            # The names gt_types and gts must stay as they are: the
            # eval()'d expressions refer to them.
            gt_types = compression.unpack_genotype_blob(row['gt_types'])
            gts = compression.unpack_genotype_blob(row['gts'])

            # skip if the variant doesn't meet a dominant model
            # for this family
            if not eval(family_genotype_mask):
                continue

            # first the non-genotype columns ...
            result = [str(family.family_id)]
            for col in all_query_cols:
                result.append(str(row[col]))

            # ... then the genotype columns. Kept as an explicit loop so
            # that eval() runs in this function's scope.
            for col in family_sample_gt_columns:
                result.append(str(eval(col)))
            yield result
def get_ind_lof(c, args):
    """Report each sample's loss-of-function SNP genotypes on stdout.

    Executes a join of ``variants`` and ``variant_impacts`` limited to
    ``is_lof='1'`` SNPs, prints a tab-separated header, and then one
    tab-separated line per (variant impact, sample) where the sample's
    genotype type is HET or HOM_ALT.

    ``var_trans_pos`` comes from ``aa_change`` if it matches
    ``\\S(\\d+)\\S``; failing that, from the text before "/" in
    ``aa_length`` unless that text is "-". ``var_trans_pct`` is the
    position divided by the transcript length and is reported as "None"
    whenever no position could be determined.

    Args:
        c: database cursor; used for the sample lookup, to execute the
            query, and iterated for the result rows.
        args: unused; kept for interface compatibility.
    """
    idx_to_sample = util.map_indices_to_samples(c)

    query = "SELECT v.chrom, v.start, v.end, v.ref, v.alt, \
                    v.impact, v.aa_change, v.aa_length, \
                    v.gt_types, v.gts, i.gene, \
                    i.transcript, i.biotype\
             FROM variants v, variant_impacts i \
             WHERE v.variant_id = i.variant_id \
             AND i.is_lof='1' \
             AND v.type = 'snp'"
    c.execute(query)

    # header
    header = ['chrom', 'start', 'end', 'ref', 'alt',
              'highest_impact', 'aa_change', 'var_trans_pos',
              'trans_aa_length', 'var_trans_pct',
              'sample', 'genotype', 'gene', 'transcript', 'trans_type']
    print('\t'.join(header))

    for r in c:
        gt_types = Z.unpack_genotype_blob(r['gt_types'])
        gts = Z.unpack_genotype_blob(r['gts'])
        gene = str(r['gene'])
        trans = str(r['transcript'])
        aa_change = str(r['aa_change'])
        aa_length = str(r['aa_length'])
        has_length = aa_length != 'None'

        transcript_pos = None
        transcript_pct = None
        if aa_change != 'None':
            try:
                # transcript_pos for snpEff annotated VCF
                transcript_pos = re.findall(r'\S(\d+)\S', aa_change)[0]
            except IndexError:
                # transcript_pos for VEP annotated VCF
                if has_length and aa_length.split("/")[0] != "-":
                    transcript_pos = aa_length.split("/")[0]

        # Guard against a missing position: float(None) used to raise
        # TypeError here, e.g. for a VEP aa_length such as "-/300".
        if has_length and transcript_pos is not None:
            # VEP encodes "pos/length"; snpEff stores the bare length.
            length_text = aa_length.split("/")[1] if "/" in aa_length \
                else aa_length
            transcript_pct = float(transcript_pos) / float(length_text)

        for idx, gt_type in enumerate(gt_types):
            if gt_type == HET or gt_type == HOM_ALT:
                print("\t".join([r['chrom'], str(r['start']),
                                 str(r['end']), r['ref'], r['alt'],
                                 r['impact'],
                                 r['aa_change'] or 'None',
                                 transcript_pos or 'None',
                                 r['aa_length'] or 'None',
                                 # str(None) already yields 'None'
                                 str(transcript_pct),
                                 idx_to_sample[idx],
                                 gts[idx], gene, trans,
                                 r['biotype'] or 'None']))
def get_lof_genes(res, args, idx_to_sample):
    """Map each sample to the genes in which it carries a variant.

    Args:
        res: iterable of result rows supporting lookup by column name; each
            row must provide ``gene`` and ``gt_types`` (a packed genotype
            blob).
        args: unused; kept for interface compatibility.
        idx_to_sample: mapping from a position in the unpacked ``gt_types``
            array to the corresponding sample name.

    Returns:
        A ``defaultdict(list)`` from sample name to gene names. A gene is
        appended once per row in which the sample's genotype type is HET or
        HOM_ALT, so it may appear more than once.
    """
    lof = defaultdict(list)
    for r in res:
        gene = str(r['gene'])
        # Rows without a gene contribute nothing; skip them before
        # decompressing the genotype blob.
        if gene == "None":
            continue
        gt_types = Z.unpack_genotype_blob(r['gt_types'])
        for idx, gt_type in enumerate(gt_types):
            if gt_type == HET or gt_type == HOM_ALT:
                lof[idx_to_sample[idx]].append(gene)
    return lof
def get_lof_genes(c, args, idx_to_sample):
    """Collect, for every sample, the genes hit by variants it carries.

    Args:
        c: iterable of result rows (e.g. an executed cursor) supporting
            lookup by column name; each row must provide ``gene`` and
            ``gt_types`` (a packed genotype blob).
        args: unused; kept for interface compatibility.
        idx_to_sample: mapping from a position in the unpacked ``gt_types``
            array to the corresponding sample name.

    Returns:
        A ``defaultdict(list)`` keyed by sample name. The gene of a row is
        appended for each sample whose genotype type in that row is HET or
        HOM_ALT; duplicates are kept.
    """
    lof = defaultdict(list)
    for r in c:
        gene = str(r['gene'])
        # The gene test is the same for every sample in the row, so it is
        # made once, before the genotype blob is unpacked.
        if gene == "None":
            continue
        for idx, gt_type in enumerate(Z.unpack_genotype_blob(r['gt_types'])):
            if gt_type == HET or gt_type == HOM_ALT:
                lof[idx_to_sample[idx]].append(gene)
    return lof
def __getitem__(self, key):
    """Return the value stored under ``key`` for this row.

    Three kinds of key are handled:

    * the sample-group keys (``het_samples``, ``hom_alt_samples``,
      ``hom_ref_samples``, ``unknown_samples``, ``variant_samples``),
      answered from ``self.genotype_dict``, which is filled on first use
      from ``self['gt_types']``;
    * ``info`` and any key in ``self.query.gt_cols``, which are unpacked
      from the raw row once and then served from ``self.cache``;
    * everything else, read straight from ``self.row``.
    """
    sample_group_keys = ('het_samples', 'hom_alt_samples',
                         'unknown_samples', 'variant_samples',
                         'hom_ref_samples')
    if key in sample_group_keys:
        # Build the genotype -> samples grouping lazily, once per row.
        if self.genotype_dict == {}:
            self.genotype_dict = \
                self.query._group_samples_by_genotype(self['gt_types'])
        if key == 'variant_samples':
            return self.genotype_dict[HET] + self.genotype_dict[HOM_ALT]
        if key == 'het_samples':
            return self.genotype_dict[HET]
        if key == 'hom_alt_samples':
            return self.genotype_dict[HOM_ALT]
        if key == 'hom_ref_samples':
            return self.genotype_dict[HOM_REF]
        # only 'unknown_samples' is left
        return self.genotype_dict[UNKNOWN]

    # Anything already decoded is served from the cache.
    if key in self.cache:
        return self.cache[key]

    # From here on the key is known not to be cached.
    if key == 'info':
        self.cache['info'] = \
            compression.unpack_ordereddict_blob(self.row['info'])
        return self.cache['info']

    # Plain columns need no decoding.
    if key not in self.query.gt_cols:
        return self.row[key]

    # Genotype columns are stored packed; decode once and remember.
    self.cache[key] = compression.unpack_genotype_blob(self.row[key])
    return self.cache[key]
def get_genotypes(c, args):
    """For each variant, report each sample's genotype on a separate line.

    Every output line holds the selected variant columns followed by the
    sample name and that sample's genotype, all joined with
    ``args.separator``. Previously the variant columns were separated from
    the sample/genotype pair by a single space (a side effect of a
    trailing-comma ``print``), so data lines did not line up with the
    header when the separator was anything other than a space.

    Args:
        c: database cursor; used for the sample lookup, to execute the
            query, and iterated for the result rows.
        args: object providing ``use_header`` (print a header line first)
            and ``separator`` (the column delimiter).
    """
    idx_to_sample = util.map_indices_to_samples(c)

    query = "SELECT  v.chrom, v.start, v.end, \
                     v.ref, v.alt, \
                     v.type, v.sub_type, \
                     v.aaf, v.in_dbsnp, v.gene, \
                     v.gts \
             FROM    variants v \
             ORDER BY chrom, start"
    c.execute(query)

    # Names of the reported (non gt_*) columns; the index list that comes
    # back with them is not needed here.
    (col_names, _non_gt_idxs) = \
        util.get_col_names_and_indices(c.description, ignore_gt_cols=True)
    col_names.append('sample')
    col_names.append('genotype')

    if args.use_header:
        print(args.separator.join(col_names))

    for row in c:
        gts = Z.unpack_genotype_blob(row['gts'])
        # All selected columns except the last one (v.gts). This part is the
        # same for every sample, so it is built once per row.
        variant_cols = [str(row[i]) for i in range(len(row) - 1)]
        for idx, gt in enumerate(gts):
            print(args.separator.join(variant_cols +
                                      [idx_to_sample[idx], gt]))
def get_genotypes(conn, metadata, args):
    """Print one line per (variant, sample) pair with the sample's genotype.

    Each line consists of the selected variant columns, the sample name and
    the genotype, joined with ``args.separator``. When ``args.use_header``
    is set, a header line is printed first.
    """
    idx_to_sample = util.map_indices_to_samples(metadata)

    query = "SELECT  v.chrom, v.start, v.end, \
                     v.ref, v.alt, \
                     v.type, v.sub_type, \
                     v.aaf, v.in_dbsnp, v.gene, \
                     v.gts \
             FROM    variants v \
             ORDER BY chrom, start"
    res = conn.execute(sql.text(query))

    # Only the column names are used below; the accompanying index list of
    # non-gt_* columns is ignored.
    names_and_idxs = util.get_col_names_and_indices(
        metadata.tables["variants"], ignore_gt_cols=True)
    (col_names, non_gt_idxs) = names_and_idxs
    col_names.append('sample')
    col_names.append('genotype')

    if args.use_header:
        print(args.separator.join([name for name in col_names]))

    for row in res:
        for idx, gt in enumerate(Z.unpack_genotype_blob(row['gts'])):
            # Stop one short of the end so the packed v.gts column is not
            # printed.
            variant_part = args.separator.join(
                [str(row[i]) for i in xrange(len(row) - 1)])
            sample_part = args.separator.join([idx_to_sample[idx], gt])
            print(args.separator.join((variant_part, sample_part)))
def apply_query_w_genotype_select(c, query, use_header):
    """
    Run a query whose gt* columns appear only in the SELECT clause and
    yield each result row as an OrderedDict of column -> value.

    When ``use_header`` is true, the first item yielded is an OrderedDict
    mapping every header name to itself. Genotype columns are resolved by
    evaluating the column expression against the unpacked genotype arrays
    of the current row.
    """
    # sample name -> index into the unpacked genotype arrays
    sample_to_idx = util.map_samples_to_indicies(c)

    (select_cols, all_cols_new, all_cols_orig, gt_col_map) = \
        _split_select(query, sample_to_idx)

    query = add_gt_cols_to_query(query.lower())
    c.execute(query)

    # Columns the query actually returned, minus the gt* blobs.
    all_query_cols = []
    for desc in c.description:
        if not desc[0].startswith("gt"):
            all_query_cols.append(str(desc[0]))

    if "*" in select_cols:
        select_cols.remove("*")
        # NOTE: all_cols_orig deliberately keeps its "*" here, as before.
        all_cols_new.remove("*")
        select_cols += all_query_cols

    if use_header:
        h = list(all_query_cols) + \
            list(oset(all_cols_orig) - oset(select_cols))
        yield OrderedDict(itertools.izip(h, h))

    report_cols = all_query_cols + list(oset(all_cols_new) - oset(select_cols))
    for row in c:
        # These four names must not be renamed: the eval() below resolves
        # genotype column expressions against them.
        gts = compression.unpack_genotype_blob(row["gts"])
        gt_types = compression.unpack_genotype_blob(row["gt_types"])
        gt_phases = compression.unpack_genotype_blob(row["gt_phases"])
        gt_depths = compression.unpack_genotype_blob(row["gt_depths"])

        fields = OrderedDict()
        for col in report_cols:
            if col == "*":
                continue
            if col.startswith("gt") or col.startswith("GT"):
                fields[col] = eval(col.strip())
            else:
                fields[col] = row[col]
        yield fields
def get_variant_genes(c, args, idx_to_sample):
    """Build a per-sample list of the annotated variants each sample carries.

    Args:
        c: iterable of result rows (e.g. an executed cursor) supporting
            lookup by column name. Each row must provide ``gt_types`` (a
            packed genotype blob) and the annotation columns read below.
        args: unused; kept for interface compatibility.
        idx_to_sample: mapping from a position in the unpacked ``gt_types``
            array to the corresponding sample name.

    Returns:
        A ``defaultdict(list)`` keyed by sample name whose values are lists
        of 12-tuples of strings: ``(gene, variant_id, chrom, start, end,
        impact, biotype, in_dbsnp, clinvar_sig, clinvar_disease_name,
        aaf_1kg_all, aaf_esp_all)``. Only rows with a gene other than
        "None" and genotypes of type HET or HOM_ALT are recorded.
    """
    fields = (
        "gene",
        "variant_id",
        "chrom",
        "start",
        "end",
        "impact",
        "biotype",
        "in_dbsnp",
        "clinvar_sig",
        "clinvar_disease_name",
        "aaf_1kg_all",
        "aaf_esp_all",
    )
    samples = defaultdict(list)
    for r in c:
        # Decide once per row whether it can be reported at all; this also
        # avoids unpacking genotypes for gene-less rows.
        if str(r["gene"]) == "None":
            continue
        value = tuple(str(r[field]) for field in fields)
        gt_types = Z.unpack_genotype_blob(r["gt_types"])
        for idx, gt_type in enumerate(gt_types):
            if gt_type == HET or gt_type == HOM_ALT:
                samples[idx_to_sample[idx]].append(value)
    return samples
def get_mds(conn, metadata, args):
    """
    Compute the pairwise genetic distance between each sample.

    Genotype types of all SNPs are gathered per sample. For every ordered
    pair of samples (including a sample with itself) the distance is the
    sum of squared genotype-type differences over the variants selected by
    both samples' masks, divided by the number of such variants. The
    result is printed as a tab-separated table with the header
    ``sample1 sample2 distance`` and distances rounded to 4 places.

    If a pair has no variant selected by both masks the distance is
    reported as ``nan`` rather than raising ZeroDivisionError.

    Args:
        conn: database connection exposing ``execute``.
        metadata: unused; kept for interface compatibility.
        args: unused; kept for interface compatibility.
    """
    idx_to_sample = {}
    res = conn.execute(sql.text("select sample_id, name from samples"))
    for row in res:
        # sample_id is shifted down by one to index the genotype arrays
        idx_to_sample[int(row['sample_id']) - 1] = row['name']

    query = "SELECT DISTINCT v.variant_id, v.gt_types\
             FROM variants v\
             WHERE v.type = 'snp'"
    # wrapped in sql.text like the samples query above
    res = conn.execute(sql.text(query))

    # keep a list of numeric genotype values
    # for each sample
    genotypes = collections.defaultdict(list)
    for row in res:
        gt_types = Z.unpack_genotype_blob(row['gt_types'])
        for idx, gt_type in enumerate(gt_types):
            genotypes[idx_to_sample[idx]].append(gt_type)

    mds = collections.defaultdict(float)

    # Convert each sample's genotype list to a numpy array, and build a
    # boolean mask per sample that is True wherever the genotype differs
    # from UNKNOWN.
    masks = {}
    for s in genotypes:
        sample = str(s)
        x = np.array(genotypes[sample])
        genotypes[sample] = x
        masks[sample] = np.ma.masked_where(x != UNKNOWN, x).mask

    # AND-ing two samples' masks selects the variants flagged in both.
    for sample1 in genotypes:
        for sample2 in genotypes:
            pair = (sample1, sample2)
            both_mask = masks[str(sample1)] & masks[str(sample2)]
            shared = float(np.sum(both_mask))
            if shared == 0:
                # nothing to compare; avoid dividing by zero
                mds[pair] = float('nan')
                continue

            diff = (genotypes[sample1] - genotypes[sample2])[both_mask]
            mds[pair] = float(np.sum(np.square(diff))) / shared

    # report the pairwise MDS for each sample pair.
    print("sample1\tsample2\tdistance")
    for pair in mds:
        print("\t".join([str(pair[0]), str(pair[1]),
                         str(round(mds[pair], 4))]))
def next(self):
    """
    Return the GeminiRow object for the next query result.

    Rows that fail the genotype filter are skipped. When
    ``self.for_browser`` is true the plain ``fields`` OrderedDict is
    returned instead of a GeminiRow.

    Only the cursor fetch is translated into StopIteration. Errors raised
    while building a row (a missing column, a bad genotype filter, an
    unknown genotype index) used to be swallowed by a blanket ``except``
    and looked like the normal end of the results; they now propagate.

    NOTE(review): ``self.gt_filter`` and bracketed genotype columns are
    run through ``eval()`` against the local genotype arrays. Confirm
    upstream code sanitises them.

    Raises:
        StopIteration: when the underlying cursor cannot supply a row.
    """
    # A loop is used instead of recursing into self.next() for skipped
    # records, which could exhaust the stack on long runs of skips.
    while (1):
        try:
            row = self.c.next()
        except Exception:
            raise StopIteration

        needs_genotypes = self._query_needs_genotype_info()
        if needs_genotypes:
            # These names are referenced by the eval()'d expressions below
            # and must not be renamed.
            gts = compression.unpack_genotype_blob(row['gts'])
            gt_types = compression.unpack_genotype_blob(row['gt_types'])
            gt_phases = compression.unpack_genotype_blob(row['gt_phases'])
            gt_depths = compression.unpack_genotype_blob(row['gt_depths'])
            gt_ref_depths = \
                compression.unpack_genotype_blob(row['gt_ref_depths'])
            gt_alt_depths = \
                compression.unpack_genotype_blob(row['gt_alt_depths'])
            gt_quals = compression.unpack_genotype_blob(row['gt_quals'])

            # skip the record if it does not meet the user's genotype filter
            if self.gt_filter and not eval(self.gt_filter):
                continue

        fields = OrderedDict()
        for col in self.report_cols:
            if col == "*":
                continue
            if not col.startswith("gt") and not col.startswith("GT"):
                fields[col] = row[col]
            elif '[' in col:
                # report under the name the user asked for,
                # e.g. gts[1085] is shown as gts.NA20814
                orig_col = self.gt_idx_to_name_map[col]
                fields[orig_col] = eval(col.strip())
            # whole-array requests such as "gts" or "gt_types"
            elif col == "gts":
                fields[col] = ','.join(gts)
            elif col == "gt_types":
                fields[col] = ','.join(str(t) for t in gt_types)
            elif col == "gt_phases":
                fields[col] = ','.join(str(p) for p in gt_phases)
            elif col == "gt_depths":
                fields[col] = ','.join(str(d) for d in gt_depths)
            elif col == "gt_quals":
                fields[col] = ','.join(str(d) for d in gt_quals)
            elif col == "gt_ref_depths":
                fields[col] = ','.join(str(d) for d in gt_ref_depths)
            elif col == "gt_alt_depths":
                fields[col] = ','.join(str(d) for d in gt_alt_depths)

        if self.show_variant_samples:
            # Reuse the array unpacked above when there is one.
            if not needs_genotypes:
                gt_types = compression.unpack_genotype_blob(row['gt_types'])
            variant_names = [self.idx_to_sample[i]
                             for i, t in enumerate(gt_types)
                             if t == HET or t == HOM_ALT]
            het_names = [self.idx_to_sample[i]
                         for i, t in enumerate(gt_types) if t == HET]
            hom_alt_names = [self.idx_to_sample[i]
                             for i, t in enumerate(gt_types) if t == HOM_ALT]
            fields["variant_samples"] = ",".join(variant_names)
            fields["HET_samples"] = ",".join(het_names)
            fields["HOM_ALT_samples"] = ",".join(hom_alt_names)

        if self.for_browser:
            return fields
        if needs_genotypes:
            return GeminiRow(fields, gts, gt_types, gt_phases,
                             gt_depths, gt_ref_depths, gt_alt_depths,
                             gt_quals)
        return GeminiRow(fields)
def get_compound_hets(c, args):
    """
    Report candidate compound heterozygous mutations.

    Step 1 collects, per sample and gene, every site at which the sample is
    heterozygous (subject to the ``allow_other_hets``, ``only_lof`` and
    ``ignore_phasing`` options). Step 2 prints each ordered pair of distinct
    sites within the same sample/gene; unless phasing is ignored, a pair is
    printed only when the alternate alleles sit on different haplotypes.

    Output is tab-separated with the header ``sample gene het1 het2``.

    Args:
        c: database cursor; used for the sample lookup, to execute the
            query, and iterated for the result rows.
        args: object providing ``allow_other_hets``, ``only_lof`` and
            ``ignore_phasing``.
    """
    # numpy array index -> sample name
    idx_to_sample = util.map_indicies_to_samples(c)

    comp_hets = collections.defaultdict(lambda: collections.defaultdict(list))

    query = "SELECT * FROM variants \
             WHERE impact_severity != 'LOW'"  # is_exonic - what about splice?
    c.execute(query)

    # step 1. collect all candidate heterozygotes for all
    # genes and samples.  the list will be refined in step 2.
    for row in c:
        site = Site(row)

        # filter putative sites that the user doesn't care about; done
        # before unpacking so rejected rows cost no decompression
        if site.num_hets > 1 and not args.allow_other_hets:
            continue
        if not site.is_lof and args.only_lof:
            continue

        gt_types = compression.unpack_genotype_blob(row['gt_types'])
        gt_phases = compression.unpack_genotype_blob(row['gt_phases'])
        gt_bases = compression.unpack_genotype_blob(row['gts'])

        # track each sample that is heterozygous at this site.
        for idx, gt_type in enumerate(gt_types):
            if gt_type == HET:
                sample = idx_to_sample[idx]
                sample_site = copy(site)
                sample_site.phased = gt_phases[idx]

                # require phased genotypes
                if not sample_site.phased and not args.ignore_phasing:
                    continue

                sample_site.gt = gt_bases[idx]
                # add the site to the list of candidates
                # for this sample/gene
                comp_hets[sample][site.gene].append(sample_site)

    # header
    print("sample\tgene\thet1\thet2")

    # step 2. cull the candidates for each gene/sample to those het pairs
    # where the alternate alleles are on opposite haplotypes.
    for sample in comp_hets:
        for gene in comp_hets[sample]:
            candidates = comp_hets[sample][gene]
            for site1 in candidates:
                for site2 in candidates:
                    if site1 == site2:
                        continue

                    if not args.ignore_phasing:
                        # Find which haplotype holds the alternate allele at
                        # each site, e.g. for ALT=G and gt "A|G" the index
                        # is 1; for ALT=A it is 0.
                        alt_hap_1 = site1.gt.split('|').index(site1.alt)
                        alt_hap_2 = site2.gt.split('|').index(site2.alt)
                        # same haplotype: not a compound heterozygote
                        if alt_hap_1 == alt_hap_2:
                            continue

                    # Either the alleles are on opposite haplotypes, or the
                    # user asked us not to care about phasing.
                    print("\t".join([sample, gene, str(site1), str(site2)]))
def get_auto_recessive_candidates(c):
    """
    Report candidate variants that meet an autosomal recessive
    inheritance model.

    For each family returned by ``subjects.get_families(c)`` whose
    recessive filter is not ``None``, this yields a header list
    (``family_id``, the non-genotype query columns, then the family's
    genotype labels) followed by one list of strings per variant whose
    genotypes satisfy the filter. Families with a ``None`` filter are
    skipped before the query is executed.

    NOTE(review): the filter and the genotype column expressions are run
    through ``eval()`` against the local ``gt_types`` / ``gts`` arrays.
    They come from the family object; confirm they can never contain
    untrusted text.

    Args:
        c: database cursor used to load families, execute the query and
            iterate the results.

    Yields:
        list of str: a header, then result rows, per reported family.
    """
    families = subjects.get_families(c)

    query = "SELECT chrom, start, end, ref, alt, gene, \
                    impact, impact_severity, gt_types, gts \
             FROM variants \
             WHERE impact_severity != 'LOW'"

    for family in families:
        family_genotype_mask = family.get_auto_recessive_filter()
        # skip this family if it cannot meet an autosomal_recessive model;
        # checked first so no query is run for a family we will not report.
        if family_genotype_mask is None:
            continue
        family_sample_gt_columns = family.get_subject_genotype_columns()
        family_sample_gt_labels = family.get_subject_genotype_labels()

        # The cursor is consumed per family, so the query is re-run each time.
        c.execute(query)
        # gt* columns are excluded here, so no further filtering of
        # 'gt_types' / 'gts' is needed when reporting below.
        all_query_cols = [str(desc[0]) for desc in c.description
                          if not desc[0].startswith("gt")]

        # yield a header
        header = ["family_id"]
        header.extend(all_query_cols)
        header.extend(family_sample_gt_labels)
        yield header

        # yield the resulting auto_rec variants for this family
        for row in c:
            # The names gt_types and gts must stay as they are: the
            # eval()'d expressions refer to them.
            gt_types = compression.unpack_genotype_blob(row['gt_types'])
            gts = compression.unpack_genotype_blob(row['gts'])

            # skip if the variant doesn't meet a recessive model
            # for this family
            if not eval(family_genotype_mask):
                continue

            # first the non-genotype columns ...
            result = [str(family.family_id)]
            for col in all_query_cols:
                result.append(str(row[col]))

            # ... then the genotype columns. Kept as an explicit loop so
            # that eval() runs in this function's scope.
            for col in family_sample_gt_columns:
                result.append(str(eval(col)))
            yield result
def next(self):
    """
    Return the GeminiRow object for the next query result.

    Rows are pulled from ``self.c`` until one passes both the genotype
    filter (``self.gt_filter``, if set) and every callable in
    ``self.predicates``. When ``self.for_browser`` is true the plain
    ``fields`` OrderedDict is returned instead of the GeminiRow.

    Raises StopIteration (after closing ``self.conn``) when fetching the
    next row from the cursor raises any exception.
    """
    # we use a while loop since we may skip records based upon
    # genotype filters.  if we need to skip a record, we just
    # throw a continue and keep trying. the alternative is to just
    # recursively call self.next() if we need to skip, but this
    # can quickly exceed the stack.
    while (1):
        # Only the fetch is guarded; errors raised while building the row
        # below are not caught here.
        try:
            row = self.c.next()
        except Exception as e:
            self.conn.close()
            raise StopIteration
        # Defaults used when the query does not need genotype information.
        gts = None
        gt_types = None
        gt_phases = None
        gt_depths = None
        gt_ref_depths = None
        gt_alt_depths = None
        gt_quals = None
        variant_names = []
        het_names = []
        hom_alt_names = []
        hom_ref_names = []
        unknown_names = []
        info = None

        if 'info' in self.report_cols:
            info = compression.unpack_ordereddict_blob(row['info'])

        if self._query_needs_genotype_info():
            # NOTE: these local names are visible to the eval() calls below
            # (the filter is evaluated with locals()), so they must not be
            # renamed.
            gts = compression.unpack_genotype_blob(row['gts'])
            gt_types = \
                compression.unpack_genotype_blob(row['gt_types'])
            gt_phases = \
                compression.unpack_genotype_blob(row['gt_phases'])
            gt_depths = \
                compression.unpack_genotype_blob(row['gt_depths'])
            gt_ref_depths = \
                compression.unpack_genotype_blob(row['gt_ref_depths'])
            gt_alt_depths = \
                compression.unpack_genotype_blob(row['gt_alt_depths'])
            gt_quals = \
                compression.unpack_genotype_blob(row['gt_quals'])
            # For each genotype class: the array positions with that class,
            # then the sample names at those positions.
            variant_samples = [x for x, y in enumerate(gt_types) if y == HET or
                               y == HOM_ALT]
            variant_names = [self.idx_to_sample[x] for x in variant_samples]
            het_samples = [x for x, y in enumerate(gt_types) if y == HET]
            het_names = [self.idx_to_sample[x] for x in het_samples]
            hom_alt_samples = [x for x, y in enumerate(gt_types) if y == HOM_ALT]
            hom_alt_names = [self.idx_to_sample[x] for x in hom_alt_samples]
            hom_ref_samples = [x for x, y in enumerate(gt_types) if y == HOM_REF]
            hom_ref_names = [self.idx_to_sample[x] for x in hom_ref_samples]
            unknown_samples = [x for x, y in enumerate(gt_types) if y == UNKNOWN]
            unknown_names = [self.idx_to_sample[x] for x in unknown_samples]
            # Distinct family ids (as strings) of the samples carrying the
            # variant.
            # NOTE(review): `families` is assigned only in this branch but
            # is read below whenever self.show_families is set; presumably
            # show_families implies genotype info is needed -- confirm.
            families = map(str, list(set([self.idx_to_sample_object[x].family_id for x in
                                          variant_samples])))

            # skip the record if it does not meet the user's genotype filter
            # NOTE(review): the filter text is executed with eval(); confirm
            # it is sanitised before it reaches this point.
            if self.gt_filter and not eval(self.gt_filter, locals()):
                continue

        fields = OrderedDict()
        for idx, col in enumerate(self.report_cols):
            if col == "*":
                continue
            if not col.startswith("gt") and not col.startswith("GT") and not col == "info":
                fields[col] = row[col]
            elif col == "info":
                fields[col] = self._info_dict_to_string(info)
            else:
                # reuse the original column name user requested
                # e.g. replace gts[1085] with gts.NA20814
                if '[' in col:
                    orig_col = self.gt_idx_to_name_map[col]
                    val = eval(col.strip())
                    # Convert the listed numpy scalar types to plain Python
                    # int / float; anything else is stored unchanged.
                    if type(val) in [np.int8, np.int32, np.bool_]:
                        fields[orig_col] = int(val)
                    elif type(val) in [np.float32]:
                        fields[orig_col] = float(val)
                    else:
                        fields[orig_col] = val
                else:
                    # asked for "gts" or "gt_types", e.g.
                    # the whole array is reported as a comma-joined string
                    if col == "gts":
                        fields[col] = ','.join(gts)
                    elif col == "gt_types":
                        fields[col] = \
                            ','.join(str(t) for t in gt_types)
                    elif col == "gt_phases":
                        fields[col] = \
                            ','.join(str(p) for p in gt_phases)
                    elif col == "gt_depths":
                        fields[col] = \
                            ','.join(str(d) for d in gt_depths)
                    elif col == "gt_quals":
                        fields[col] = \
                            ','.join(str(d) for d in gt_quals)
                    elif col == "gt_ref_depths":
                        fields[col] = \
                            ','.join(str(d) for d in gt_ref_depths)
                    elif col == "gt_alt_depths":
                        fields[col] = \
                            ','.join(str(d) for d in gt_alt_depths)

        # Optional summary columns, joined with the configured delimiter.
        if self.show_variant_samples:
            fields["variant_samples"] = \
                self.variant_samples_delim.join(variant_names)
            fields["HET_samples"] = \
                self.variant_samples_delim.join(het_names)
            fields["HOM_ALT_samples"] = \
                self.variant_samples_delim.join(hom_alt_names)
        if self.show_families:
            fields["families"] = self.variant_samples_delim.join(families)

        gemini_row = GeminiRow(fields, gts, gt_types, gt_phases,
                               gt_depths, gt_ref_depths, gt_alt_depths,
                               gt_quals, variant_names, het_names, hom_alt_names,
                               hom_ref_names, unknown_names, info,
                               formatter=self.formatter)

        # Every predicate is called (a list is built, so there is no
        # short-circuit); the row is skipped unless all of them accept it.
        if not all([predicate(gemini_row) for predicate in self.predicates]):
            continue

        if not self.for_browser:
            return gemini_row
        else:
            return fields
def get_mds(c, args):
    """
    Compute the pairwise genetic distance between each sample.

    Genotype types of all SNPs are gathered per sample. For every ordered
    pair of samples (including a sample with itself) the distance is the
    sum of squared genotype-type differences over the variants selected by
    both samples' masks, divided by the number of such variants. The
    result is printed as a tab-separated table with the header
    ``sample1 sample2 distance`` and distances rounded to 4 places.

    If a pair has no variant selected by both masks the distance is
    reported as ``nan`` rather than raising ZeroDivisionError.

    Args:
        c: database cursor used to execute both queries and iterate their
            results.
        args: unused; kept for interface compatibility.
    """
    idx_to_sample = {}
    c.execute("select sample_id, name from samples")
    for row in c:
        # sample_id is shifted down by one to index the genotype arrays
        idx_to_sample[int(row['sample_id']) - 1] = row['name']

    query = "SELECT DISTINCT v.variant_id, v.gt_types\
             FROM variants v\
             WHERE v.type = 'snp'"
    c.execute(query)

    # keep a list of numeric genotype values
    # for each sample
    genotypes = collections.defaultdict(list)
    for row in c:
        gt_types = Z.unpack_genotype_blob(row['gt_types'])
        for idx, gt_type in enumerate(gt_types):
            genotypes[idx_to_sample[idx]].append(gt_type)

    mds = collections.defaultdict(float)

    # Convert each sample's genotype list to a numpy array, and build a
    # boolean mask per sample that is True wherever the genotype differs
    # from UNKNOWN.
    masks = {}
    for s in genotypes:
        sample = str(s)
        x = np.array(genotypes[sample])
        genotypes[sample] = x
        masks[sample] = np.ma.masked_where(x != UNKNOWN, x).mask

    # AND-ing two samples' masks selects the variants flagged in both.
    for sample1 in genotypes:
        for sample2 in genotypes:
            pair = (sample1, sample2)
            both_mask = masks[str(sample1)] & masks[str(sample2)]
            shared = float(np.sum(both_mask))
            if shared == 0:
                # nothing to compare; avoid dividing by zero
                mds[pair] = float('nan')
                continue

            diff = (genotypes[sample1] - genotypes[sample2])[both_mask]
            mds[pair] = float(np.sum(np.square(diff))) / shared

    # report the pairwise MDS for each sample pair.
    print("sample1\tsample2\tdistance")
    for pair in mds:
        print("\t".join([str(pair[0]), str(pair[1]),
                         str(round(mds[pair], 4))]))
def filter_query(c, query, gt_filter, use_header):
    """
    Execute a base SQL query while applying filters on the returned rows
    based on filters applied to the genotype-specific columns.

    For example:
    --gt_filter "(gt_types.1478PC0011 == 1 or gt_types.1478PC0012 == 1)

    When ``use_header`` is true, the first item yielded is an OrderedDict
    mapping every header name to itself; after that one OrderedDict of
    column -> value is yielded per row that satisfies the filter.

    NOTE(review): the corrected filter and the genotype column expressions
    are executed with ``eval()``. The filter originates from the user;
    confirm it is acceptable to evaluate it, or restrict what it may
    contain.

    Args:
        c: database cursor; used for the sample lookup, to execute the
            query, and iterated for the result rows.
        query: the SQL query text.
        gt_filter: genotype filter expression using sample names.
        use_header: whether to yield a header item first.
    """

    def correct_genotype_filter(gt_filter, sample_to_idx):
        """
        This converts a "raw" genotype filter supplied by the user
        to a filter than can be eval()'ed.  Specifically, we must
        convert a _named_ genotype index to a _numerical_
        genotype index so that the appropriate value can be
        extracted for the sample from the genotype numpy arrays.

        For example, converts:
        --gt-filter "(gt_types.1478PC0011 == 1)"
        to
        (gt_types[11] == 1)
        """
        corrected_gt_filter = []
        # Split on whitespace only. The former pattern r"[\s+]+" is a
        # character class that also contains a literal '+', so an addition
        # such as "gt_depths.A + gt_depths.B" lost its operator.
        tokens = re.split(r"\s+", gt_filter)
        for token in tokens:
            if token.find("gt") >= 0 or token.find("GT") >= 0:
                corrected = _correct_genotype_col(token, sample_to_idx)
                corrected_gt_filter.append(corrected)
            else:
                corrected_gt_filter.append(token)
        return " ".join(corrected_gt_filter)

    # construct a mapping of sample names to list indices
    sample_to_idx = util.map_samples_to_indicies(c)

    gt_filter = correct_genotype_filter(gt_filter, sample_to_idx)
    (select_cols, all_cols_new, all_cols_orig, gt_col_map) = \
        _split_select(query, sample_to_idx)

    query = add_gt_cols_to_query(query.lower())

    c.execute(query)

    # what are the columns that were actually selected by the user.
    all_query_cols = [str(desc[0]) for desc in c.description
                      if not desc[0].startswith("gt")]

    if "*" in select_cols:
        select_cols.remove("*")
        all_cols_orig.remove("*")
        all_cols_new.remove("*")
        select_cols += all_query_cols

    if use_header:
        h = list(all_query_cols) + \
            list(oset(all_cols_orig) - oset(select_cols))
        yield OrderedDict(itertools.izip(h, h))

    report_cols = all_query_cols + list(oset(all_cols_new) - oset(select_cols))
    for row in c:
        # These names are referenced by the eval()'d filter and column
        # expressions and must not be renamed.
        gts = compression.unpack_genotype_blob(row["gts"])
        gt_types = compression.unpack_genotype_blob(row["gt_types"])
        gt_phases = compression.unpack_genotype_blob(row["gt_phases"])
        gt_depths = compression.unpack_genotype_blob(row["gt_depths"])

        if not eval(gt_filter):
            continue

        fields = OrderedDict()
        for col in report_cols:
            if col == "*":
                continue
            if not col.startswith("gt") and not col.startswith("GT"):
                fields[col] = row[col]
            else:
                fields[col] = eval(col.strip())
        yield fields
def get_de_novo_candidates(c, min_sample_depth=30):
    """
    Report candidate variants that appear to be de novo mutations
    in the child.  We cannot distinguish mutations that occurred in the
    parental germline from those that occurred early in development in
    the child post-conception.

    This is a generator.  For every family returned by
    subjects.get_families(c) it first yields a header (a list of column
    names) and then one list of strings per variant that satisfies the
    family's de novo filter and whose per-sample depths are all at least
    `min_sample_depth`.

    Arguments:
    c                -- cursor used both to look up the families and to
                        run the variant query.
    min_sample_depth -- minimum depth required of every depth column
                        reported by the family (default 30).
    """
    # The query does not depend on the family, so build the string once.
    query = "SELECT chrom, start, end, ref, alt, gene, \
                    impact, impact_severity, in_dbsnp, \
                    rs_ids, aaf_1kg_all, aaf_esp_all, \
                    clinvar_sig, clinvar_disease_name, \
                    clinvar_dbsource, gt_types, \
                    gt_depths, gts \
             FROM variants \
             WHERE impact_severity != 'LOW' \
             AND num_het = 1"

    families = subjects.get_families(c)

    for family in families:
        # The cursor is consumed by the row loop below, so the query has
        # to be re-executed for each family.
        c.execute(query)

        # Every column whose name starts with "gt" is excluded here, so
        # the packed genotype blobs never reach the per-row report.
        all_query_cols = [str(desc[0]) for desc in c.description
                          if not desc[0].startswith("gt")]

        family_genotype_mask = family.get_de_novo_filter()
        family_sample_gt_columns = family.get_subject_genotype_columns()
        family_sample_depth_columns = family.get_subject_depth_columns()
        family_sample_gt_labels = family.get_subject_genotype_labels()
        family_sample_dp_labels = family.get_subject_depth_labels()

        header = ["family_id"]
        header.extend(all_query_cols)
        header.extend(family_sample_gt_labels)
        header.extend(family_sample_dp_labels)
        yield header

        # report the resulting de_novo variants for this family
        for row in c:

            # unpack the genotype arrays so that we can interrogate
            # the genotypes present in each family member to check that
            # they conform to the genetic model being tested.
            # The names gt_types / gt_depths / gts must not be renamed:
            # the eval()'ed family expressions refer to them.
            gt_types = compression.unpack_genotype_blob(row['gt_types'])
            gt_depths = compression.unpack_genotype_blob(row['gt_depths'])
            gts = compression.unpack_genotype_blob(row['gts'])

            # does the variant meet a de novo model for this family?
            # if not, ignore.
            if not eval(family_genotype_mask):
                continue

            # make sure each sample's genotype had sufficient coverage.
            # otherwise, ignore.  (Plain loops are used around eval() so
            # that it runs in this function's frame.)
            for col in family_sample_depth_columns:
                if int(eval(col)) < min_sample_depth:
                    break
            else:
                # first report all of the non-genotype columns
                result = [str(family.family_id)]
                result.extend(str(row[col]) for col in all_query_cols)

                # now report all of the genotype columns
                for col in family_sample_gt_columns:
                    result.append(str(eval(col)))

                # now report all of the depth columns
                for col in family_sample_depth_columns:
                    result.append(str(eval(col)))

                yield result
def next(self):
    """
    Return the GeminiRow object for the next query result.

    Rows are pulled from self.c until one passes both self.gt_filter
    (when set) and every callable in self.predicates.

    Returns:
    a GeminiRow, or, when self.for_browser is true, the OrderedDict of
    reported fields for that row.

    Raises:
    StopIteration -- when fetching the next row from self.c raises any
                     Exception; self.conn is closed first.
    """
    # we use a while loop since we may skip records based upon
    # genotype filters.  if we need to skip a record, we just
    # throw a continue and keep trying. the alternative is to just
    # recursively call self.next() if we need to skip, but this
    # can quickly exceed the stack.
    while True:
        try:
            row = self.c.next()
        except Exception:
            # NOTE(review): this treats every fetch failure, not only
            # cursor exhaustion, as the end of the result set.
            self.conn.close()
            raise StopIteration

        # Local names below are part of the filter contract: the whole
        # local namespace is handed to eval() for self.gt_filter, so do
        # not rename them.
        gts = None
        gt_types = None
        gt_phases = None
        gt_depths = None
        gt_ref_depths = None
        gt_alt_depths = None
        gt_quals = None
        variant_names = []
        het_names = []
        hom_alt_names = []
        hom_ref_names = []
        unknown_names = []
        # Bound up front so that the show_families branch below cannot
        # hit an unbound name when genotype info was not unpacked.
        families = []
        info = None

        if 'info' in self.report_cols:
            info = compression.unpack_ordereddict_blob(row['info'])

        if self._query_needs_genotype_info():
            gts = compression.unpack_genotype_blob(row['gts'])
            gt_types = \
                compression.unpack_genotype_blob(row['gt_types'])
            gt_phases = \
                compression.unpack_genotype_blob(row['gt_phases'])
            gt_depths = \
                compression.unpack_genotype_blob(row['gt_depths'])
            gt_ref_depths = \
                compression.unpack_genotype_blob(row['gt_ref_depths'])
            gt_alt_depths = \
                compression.unpack_genotype_blob(row['gt_alt_depths'])
            gt_quals = \
                compression.unpack_genotype_blob(row['gt_quals'])

            variant_samples = [x for x, y in enumerate(gt_types)
                               if y == HET or y == HOM_ALT]
            variant_names = [self.idx_to_sample[x]
                             for x in variant_samples]
            het_samples = [x for x, y in enumerate(gt_types)
                           if y == HET]
            het_names = [self.idx_to_sample[x] for x in het_samples]
            hom_alt_samples = [x for x, y in enumerate(gt_types)
                               if y == HOM_ALT]
            hom_alt_names = [self.idx_to_sample[x]
                             for x in hom_alt_samples]
            hom_ref_samples = [x for x, y in enumerate(gt_types)
                               if y == HOM_REF]
            hom_ref_names = [self.idx_to_sample[x]
                             for x in hom_ref_samples]
            unknown_samples = [x for x, y in enumerate(gt_types)
                               if y == UNKNOWN]
            unknown_names = [self.idx_to_sample[x]
                             for x in unknown_samples]
            # distinct family ids of the samples carrying the variant
            families = map(str, list(set(
                [self.idx_to_sample_object[x].family_id
                 for x in variant_samples])))

        # skip the record if it does not meet the user's genotype filter
        # NOTE(review): gt_filter is eval()'ed with this frame's locals
        # as its namespace; only use with trusted filter expressions.
        if self.gt_filter and not eval(self.gt_filter, locals()):
            continue

        fields = OrderedDict()
        # Plain loop (not a comprehension) so that eval() below runs in
        # this function's frame and can see the genotype arrays.
        for col in self.report_cols:
            if col == "*":
                continue
            if col == "info":
                fields[col] = self._info_dict_to_string(info)
            elif not col.startswith("gt") and not col.startswith("GT"):
                fields[col] = row[col]
            elif '[' in col:
                # reuse the original column name user requested
                # e.g. replace gts[1085] with gts.NA20814
                orig_col = self.gt_idx_to_name_map[col]
                val = eval(col.strip())
                # convert numpy scalars to plain Python numbers
                if isinstance(val, (np.int8, np.int32, np.bool_)):
                    fields[orig_col] = int(val)
                elif isinstance(val, np.float32):
                    fields[orig_col] = float(val)
                else:
                    fields[orig_col] = val
            # asked for "gts" or "gt_types", e.g.
            elif col == "gts":
                fields[col] = ','.join(gts)
            elif col == "gt_types":
                fields[col] = ','.join(str(t) for t in gt_types)
            elif col == "gt_phases":
                fields[col] = ','.join(str(p) for p in gt_phases)
            elif col == "gt_depths":
                fields[col] = ','.join(str(d) for d in gt_depths)
            elif col == "gt_quals":
                fields[col] = ','.join(str(d) for d in gt_quals)
            elif col == "gt_ref_depths":
                fields[col] = ','.join(str(d) for d in gt_ref_depths)
            elif col == "gt_alt_depths":
                fields[col] = ','.join(str(d) for d in gt_alt_depths)

        if self.show_variant_samples:
            fields["variant_samples"] = \
                self.variant_samples_delim.join(variant_names)
            fields["HET_samples"] = \
                self.variant_samples_delim.join(het_names)
            fields["HOM_ALT_samples"] = \
                self.variant_samples_delim.join(hom_alt_names)
        if self.show_families:
            fields["families"] = self.variant_samples_delim.join(families)

        gemini_row = GeminiRow(fields, gts, gt_types, gt_phases,
                               gt_depths, gt_ref_depths, gt_alt_depths,
                               gt_quals, variant_names, het_names,
                               hom_alt_names, hom_ref_names,
                               unknown_names, info,
                               formatter=self.formatter)

        # every predicate is called (list, not generator) before the
        # combined result is tested
        if not all([predicate(gemini_row)
                    for predicate in self.predicates]):
            continue

        if not self.for_browser:
            return gemini_row
        else:
            return fields
def next(self):
    """
    Return the GeminiRow object for the next query result.

    Rows are pulled from self.c until one passes self.gt_filter (when
    a filter is set).

    Returns:
    the OrderedDict of reported fields when self.for_browser is true;
    otherwise a GeminiRow built from those fields (plus the unpacked
    genotype arrays when the query needs genotype information).

    Raises:
    StopIteration -- when fetching the next row from self.c fails.
    Errors raised while processing a fetched row (e.g. a missing column
    or a malformed genotype filter) propagate to the caller instead of
    silently ending the iteration.
    """
    # we use a while loop since we may skip records based upon
    # genotype filters.  if we need to skip a record, we just
    # throw a continue and keep trying. the alternative is to just
    # recursively call self.next() if we need to skip, but this
    # can quickly exceed the stack.
    while True:
        # Only the cursor fetch is guarded.  Wrapping the whole body in
        # a bare "except" used to turn every bug or bad filter into a
        # premature, silent end of the result set.
        try:
            row = self.c.next()
        except Exception:
            raise StopIteration

        # These names must stay as they are: the eval()'ed filter and
        # column expressions refer to them.
        gts = None
        gt_types = None
        gt_phases = None
        gt_depths = None

        needs_genotypes = self._query_needs_genotype_info()
        if needs_genotypes:
            gts = compression.unpack_genotype_blob(row['gts'])
            gt_types = \
                compression.unpack_genotype_blob(row['gt_types'])
            gt_phases = \
                compression.unpack_genotype_blob(row['gt_phases'])
            gt_depths = \
                compression.unpack_genotype_blob(row['gt_depths'])

        # skip the record if it does not meet the user's genotype filter
        # NOTE(review): gt_filter is eval()'ed; only use with trusted
        # filter expressions.
        if self.gt_filter and not eval(self.gt_filter):
            continue

        fields = OrderedDict()
        # Plain loop (not a comprehension) so that eval() below runs in
        # this function's frame and can see the genotype arrays.
        for col in self.report_cols:
            if col == "*":
                continue
            if not col.startswith(("gt", "GT")):
                fields[col] = row[col]
            elif '[' in col:
                # reuse the original column name user requested
                # e.g. replace gts[1085] with gts.NA20814
                orig_col = self.gt_idx_to_name_map[col]
                fields[orig_col] = eval(col.strip())
            # asked for "gts" or "gt_types", e.g.
            elif col == "gts":
                fields[col] = ','.join(gts)
            elif col == "gt_types":
                fields[col] = ','.join(str(t) for t in gt_types)
            elif col == "gt_phases":
                fields[col] = ','.join(str(p) for p in gt_phases)
            elif col == "gt_depths":
                fields[col] = ','.join(str(d) for d in gt_depths)

        if self.for_browser:
            return fields
        if needs_genotypes:
            return GeminiRow(fields, gts, gt_types, gt_phases, gt_depths)
        return GeminiRow(fields)