def sample_variants(c, args):
    """Query all variants and run the report selected by ``args.command``.

    For ``'interactions'`` a tab-separated header is printed (its width
    depends on ``args.var_mode``) and ``sample_gene_interactions`` consumes
    the cursor; nothing is returned.  For ``'lof_interactions'`` the result
    of ``get_variant_genes`` is returned.  Any other command only executes
    the query and returns None.
    """
    idx_to_sample = util.map_indicies_to_samples(c)

    sql = "SELECT variant_id, gt_types, gts, gene, impact, biotype, \
                  in_dbsnp, clinvar_sig, clinvar_disease_name, aaf_1kg_all, aaf_esp_all, chrom, \
                  start, end \
           FROM variants"
    c.execute(sql)

    if args.command == 'lof_interactions':
        return get_variant_genes(c, args, idx_to_sample)

    if args.command == 'interactions':
        # header: the variant-level columns are only reported in var_mode
        header = ['sample', 'gene', 'order_of_interaction',
                  'interacting_gene']
        if args.var_mode:
            header += ['var_id', 'chrom', 'start',
                       'end', 'impact', 'biotype', 'in_dbsnp',
                       'clinvar_sig', 'clinvar_disease_name', 'aaf_1kg_all',
                       'aaf_esp_all']
        print("\t".join(header))
        sample_gene_interactions(c, args, idx_to_sample)

    return None
def get_genotypes(c, args):
    """For each variant, report each sample's genotype on a separate line.

    Every output line holds the selected variant columns followed by the
    sample name and that sample's genotype, all joined with
    ``args.separator``.  When ``args.use_header`` is set, a header line
    built from the cursor description (plus 'sample' and 'genotype') is
    printed first.

    Fix: the variant columns and the sample/genotype columns used to be
    emitted by two print statements chained with a trailing comma, which
    put a single space -- not ``args.separator`` -- between them, so data
    rows did not line up with the header.  Each line is now one join.
    """
    idx_to_sample = util.map_indicies_to_samples(c)

    query = "SELECT v.chrom, v.start, v.end, \
                    v.ref, v.alt, \
                    v.type, v.sub_type, \
                    v.aaf, v.in_dbsnp, v.gene, \
                    v.gts \
             FROM variants v \
             ORDER BY chrom, start"
    c.execute(query)

    # Only the column names are needed here; the index list returned
    # alongside them is not used by this report.
    col_names, _unused_idxs = util.get_col_names_and_indices(
        c.description, ignore_gt_cols=True)
    col_names.append('sample')
    col_names.append('genotype')

    if args.use_header:
        print(args.separator.join(col_names))

    for row in c:
        # v.gts is stored as a zlib-compressed pickle.
        gts = np.array(cPickle.loads(zlib.decompress(row['gts'])))
        # Every selected column except the last one (v.gts); this part is
        # identical for all samples, so build it once per variant.
        variant_fields = [str(row[i]) for i in range(len(row) - 1)]
        for idx, gt in enumerate(gts):
            print(args.separator.join(
                variant_fields + [idx_to_sample[idx], gt]))
def get_gtcounts_by_sample(c, args):
    """
    Report the count of each genotype class observed for each sample.

    Prints a tab-separated header, then one line per row of the
    sample_genotype_counts table with the per-class counts and their sum.
    """
    idx_to_sample = util.map_indicies_to_samples(c)

    count_columns = [
        'num_hom_ref',
        'num_het',
        'num_hom_alt',
        'num_unknown',
        'total'
    ]

    # report.
    print('\t'.join(['sample'] + count_columns))

    c.execute("SELECT *, \
               (num_hom_ref + num_het + num_hom_alt + num_unknown) as total \
               FROM sample_genotype_counts")

    # one output line per sample.
    for record in c:
        # NOTE(review): other reports index idx_to_sample by genotype-array
        # position; confirm sample_id uses the same numbering.
        fields = [idx_to_sample[record['sample_id']]]
        fields.extend(record[column] for column in count_columns)
        print("\t".join(str(field) for field in fields))
def get_genotypes(c, args):
    """For each variant, report each sample's genotype on a separate line.

    Each line consists of the selected variant columns, the sample name
    and the sample's genotype, joined with ``args.separator``.  A header
    line (cursor column names plus 'sample' and 'genotype') is printed
    first when ``args.use_header`` is set.

    Fix: the two halves of a line were previously written by separate
    print statements linked by a trailing comma, so a single space rather
    than ``args.separator`` sat between the last variant column and the
    sample name.  The whole line is now produced by one join.
    """
    idx_to_sample = util.map_indicies_to_samples(c)

    query = "SELECT v.chrom, v.start, v.end, \
                    v.ref, v.alt, \
                    v.type, v.sub_type, \
                    v.aaf, v.in_dbsnp, v.gene, \
                    v.gts \
             FROM variants v \
             ORDER BY chrom, start"
    c.execute(query)

    # The helper also returns a list of column indices, which this report
    # does not need.
    col_names, _unused_idxs = util.get_col_names_and_indices(
        c.description, ignore_gt_cols=True)
    col_names.append('sample')
    col_names.append('genotype')

    if args.use_header:
        print(args.separator.join(col_names))

    for row in c:
        # v.gts is stored as a zlib-compressed pickle.
        gts = np.array(cPickle.loads(zlib.decompress(row['gts'])))
        # len(row) - 1 skips the trailing v.gts column.  The prefix does not
        # depend on the sample, so it is computed once per variant.
        variant_fields = [str(row[i]) for i in range(len(row) - 1)]
        for idx, gt in enumerate(gts):
            sample_fields = [idx_to_sample[idx], gt]
            print(args.separator.join(variant_fields + sample_fields))
def sample_lof_variants(c, args):
    """Select the variants flagged is_lof='1' and report their interactions.

    The cursor is left positioned on the query result and handed to
    ``sample_lof_interactions`` together with the index-to-sample mapping.
    """
    sample_names = util.map_indicies_to_samples(c)

    lof_sql = "SELECT chrom, start, end, \
                      gt_types, gts, gene \
               FROM variants \
               WHERE is_lof='1'"
    c.execute(lof_sql)

    sample_lof_interactions(c, args, sample_names)
def __init__(self, db):
    """Store *db*, connect to it, and cache the sample/index lookups.

    ``self.c`` is read after ``_connect_to_database()`` has run; it is
    presumably the cursor that call creates (not visible here).
    """
    self.db = db
    # state flags, both off until something switches them on
    self.query_executed = self.for_browser = False

    self._connect_to_database()

    # sample name -> index, e.g. self.sample_to_idx[NA20814] -> 323
    self.sample_to_idx = util.map_samples_to_indicies(self.c)
    # index -> sample name, e.g. self.idx_to_sample[323] -> NA20814
    self.idx_to_sample = util.map_indicies_to_samples(self.c)
def __init__(self, db, include_gt_cols=False):
    """Store *db*, connect to it, and cache the sample/index lookups.

    Args:
        db: path to the database file; it must already exist.
        include_gt_cols: stored on the instance as ``include_gt_cols``.

    Raises:
        AssertionError: if *db* does not exist.
    """
    # Raised explicitly instead of via an ``assert`` statement: asserts are
    # removed when Python runs with -O, which would silently skip this
    # check.  The exception type and message are unchanged for callers.
    if not os.path.exists(db):
        raise AssertionError("%s does not exist." % db)

    self.db = db
    self.query_executed = False
    self.for_browser = False
    self.include_gt_cols = include_gt_cols

    self._connect_to_database()

    # map sample names to indices. e.g. self.sample_to_idx[NA20814] -> 323
    self.sample_to_idx = util.map_samples_to_indicies(self.c)
    # and vice versa. e.g., self.idx_to_sample[323] -> NA20814
    self.idx_to_sample = util.map_indicies_to_samples(self.c)
def sample_variants(c, args):
    """Query all variants, print the report header and list interactions.

    The header gains the variant-level columns only when ``args.var_mode``
    is set; the rows themselves are produced by
    ``sample_gene_interactions``.
    """
    idx_to_sample = util.map_indicies_to_samples(c)

    sql = "SELECT variant_id, gt_types, gts, gene, impact, biotype \
           FROM variants"
    c.execute(sql)

    header = ['sample', 'gene', 'order_of_interaction',
              'interacting_gene']
    if args.var_mode:
        header += ['var_id', 'impact', 'biotype']
    print("\t".join(header))

    sample_gene_interactions(c, args, idx_to_sample)
def get_ind_lof(c, args):
    """Print, per sample, the loss-of-function SNP impacts that sample carries.

    The query joins variants with variant_impacts, keeping impacts with
    is_lof='1' on variants of type 'snp'.  After a tab-separated header,
    one line is printed for every (row, sample) pair whose genotype type
    is HET or HOM_ALT.  Missing values are reported as the string 'None'.

    Fixes relative to the previous revision:
      * an aa_change without a position matching the pattern no longer
        raises IndexError; the position is reported as 'None';
      * the transcript percentage is only computed when a position was
        actually found, instead of calling float() on None;
      * aa_change / aa_length are emitted through their str() form, so a
        non-string aa_length cannot break the join;
      * the sample-independent columns are built once per row.
    """
    idx_to_sample = util.map_indicies_to_samples(c)

    query = "SELECT v.chrom, v.start, v.end, v.ref, v.alt, \
                    v.impact, v.aa_change, v.aa_length, \
                    v.gt_types, v.gts, i.gene, \
                    i.transcript, i.biotype \
             FROM variants v, variant_impacts i \
             WHERE v.variant_id = i.variant_id \
             AND i.is_lof='1' \
             AND v.type = 'snp'"
    c.execute(query)

    # header
    print('\t'.join([
        'chrom', 'start', 'end', 'ref', 'alt', 'highest_impact', 'aa_change',
        'var_trans_pos', 'trans_aa_length', 'var_trans_pct', 'sample',
        'genotype', 'gene', 'transcript', 'trans_type'
    ]))

    for r in c:
        # genotype columns are stored as zlib-compressed pickles.
        gt_types = np.array(cPickle.loads(zlib.decompress(r['gt_types'])))
        gts = np.array(cPickle.loads(zlib.decompress(r['gts'])))
        gene = str(r['gene'])
        trans = str(r['transcript'])
        # str() turns a NULL column into the literal 'None'.
        aa_change = str(r['aa_change'])
        aa_length = str(r['aa_length'])

        transcript_pos = None
        transcript_pct = None
        if aa_change != 'None':
            # digits flanked by non-space characters, e.g. 'R123W' -> '123'
            positions = re.findall(r'\S(\d+)\S', aa_change)
            if positions:
                transcript_pos = positions[0]
        if transcript_pos is not None and aa_length != 'None':
            transcript_pct = float(transcript_pos) / float(aa_length)

        # Columns before and after the per-sample ones are the same for
        # every sample of this row.
        leading = [
            r['chrom'],
            str(r['start']),
            str(r['end']),
            r['ref'],
            r['alt'],
            r['impact'],
            aa_change or 'None',
            transcript_pos or 'None',
            aa_length or 'None',
            str(transcript_pct),
        ]
        trailing = [gene, trans, r['biotype']]

        for idx, gt_type in enumerate(gt_types):
            if gt_type == HET or gt_type == HOM_ALT:
                print("\t".join(
                    leading + [idx_to_sample[idx], gts[idx]] + trailing))
def get_ind_lof(c, args):
    """Print, per sample, the loss-of-function SNP impacts that sample carries.

    The query joins variants with variant_impacts, keeping impacts with
    is_lof='1' on variants of type 'snp'.  After a tab-separated header,
    one line is printed for every (row, sample) pair whose genotype type
    is GT_HET or GT_HOM_ALT.  Missing values are reported as 'None'.

    Fixes relative to the previous revision:
      * an aa_change without a position matching the pattern no longer
        raises IndexError; the position is reported as 'None';
      * the transcript percentage is only computed when a position was
        actually found, instead of calling float() on None;
      * aa_change / aa_length are emitted through their str() form, so a
        non-string aa_length cannot break the join;
      * the sample-independent columns are built once per row.
    """
    idx_to_sample = util.map_indicies_to_samples(c)

    query = "SELECT v.chrom, v.start, v.end, v.ref, v.alt, \
                    v.impact, v.aa_change, v.aa_length, \
                    v.gt_types, v.gts, i.gene, \
                    i.transcript, i.biotype \
             FROM variants v, variant_impacts i \
             WHERE v.variant_id = i.variant_id \
             AND i.is_lof='1' \
             AND v.type = 'snp'"
    c.execute(query)

    # header
    print('\t'.join(['chrom', 'start', 'end', 'ref', 'alt',
                     'highest_impact', 'aa_change', 'var_trans_pos',
                     'trans_aa_length', 'var_trans_pct',
                     'sample', 'genotype', 'gene', 'transcript',
                     'trans_type']))

    for r in c:
        # genotype columns are stored as zlib-compressed pickles.
        gt_types = np.array(cPickle.loads(zlib.decompress(r['gt_types'])))
        gts = np.array(cPickle.loads(zlib.decompress(r['gts'])))
        gene = str(r['gene'])
        trans = str(r['transcript'])
        # str() turns a NULL column into the literal 'None'.
        aa_change = str(r['aa_change'])
        aa_length = str(r['aa_length'])

        transcript_pos = None
        transcript_pct = None
        if aa_change != 'None':
            # digits flanked by non-space characters, e.g. 'R123W' -> '123'
            positions = re.findall(r'\S(\d+)\S', aa_change)
            if positions:
                transcript_pos = positions[0]
        if transcript_pos is not None and aa_length != 'None':
            transcript_pct = float(transcript_pos) / float(aa_length)

        # Columns before and after the per-sample ones are the same for
        # every sample of this row.
        leading = [r['chrom'], str(r['start']),
                   str(r['end']), r['ref'], r['alt'],
                   r['impact'],
                   aa_change or 'None',
                   transcript_pos or 'None',
                   aa_length or 'None',
                   str(transcript_pct)]
        trailing = [gene, trans, r['biotype']]

        for idx, gt_type in enumerate(gt_types):
            if gt_type == GT_HET or gt_type == GT_HOM_ALT:
                print("\t".join(
                    leading + [idx_to_sample[idx], gts[idx]] + trailing))
def get_ind_pathways(c, args):
    """Join variants with their impacts and report pathways per sample.

    Prints the tab-separated header here; the data lines are written by
    ``_report_variant_pathways``, which consumes the executed cursor.
    """
    sample_names = util.map_indicies_to_samples(c)

    sql = "SELECT v.chrom, v.start, v.end, v.ref, v.alt, \
                  i.impact, v.gt_types, v.gts, i.gene, \
                  i.transcript \
           FROM variants v, variant_impacts i \
           WHERE v.variant_id = i.variant_id"
    c.execute(sql)

    # header
    header = ['chrom', 'start', 'end', 'ref', 'alt',
              'highest_impact', 'sample', 'genotype',
              'gene', 'transcript', 'pathway']
    print('\t'.join(header))

    _report_variant_pathways(c, args, sample_names)
def get_variants_by_sample(c, args):
    """
    Report the number of variants observed for each sample
    where the sample had a non-ref genotype

    The total is num_het + num_hom_alt from sample_genotype_counts.
    """
    idx_to_sample = util.map_indicies_to_samples(c)

    # report.
    print('\t'.join(['sample', 'total']))

    c.execute("SELECT sample_id, \
               (num_het + num_hom_alt) as total \
               FROM sample_genotype_counts")

    for record in c:
        # NOTE(review): other reports index idx_to_sample by genotype-array
        # position; confirm sample_id uses the same numbering.
        name = idx_to_sample[record['sample_id']]
        total = record['total']
        print("\t".join([str(name), str(total)]))
def get_ind_pathways(c, args):
    """Join variants with their impacts and report pathways per sample.

    The header is printed here; ``_report_variant_pathways`` then walks
    the executed cursor and writes the data lines.
    """
    sample_names = util.map_indicies_to_samples(c)

    sql = "SELECT v.chrom, v.start, v.end, v.ref, v.alt, \
                  i.impact, v.gt_types, v.gts, i.gene, \
                  i.transcript \
           FROM variants v, variant_impacts i \
           WHERE v.variant_id = i.variant_id"
    c.execute(sql)

    # header
    variant_columns = ["chrom", "start", "end", "ref", "alt", "impact"]
    sample_columns = ["sample", "genotype"]
    annotation_columns = ["gene", "transcript", "pathway"]
    print("\t".join(variant_columns + sample_columns + annotation_columns))

    _report_variant_pathways(c, args, sample_names)
def get_ind_pathways(c, args):
    """Join variants with their impacts and report pathways per sample.

    Writes the tab-separated header, then delegates the per-row output to
    ``_report_variant_pathways`` using the executed cursor.
    """
    sample_names = util.map_indicies_to_samples(c)

    sql = "SELECT v.chrom, v.start, v.end, v.ref, v.alt, \
                  i.impact, v.gt_types, v.gts, i.gene, \
                  i.transcript \
           FROM variants v, variant_impacts i \
           WHERE v.variant_id = i.variant_id"
    c.execute(sql)

    # header
    header = ('chrom', 'start', 'end', 'ref', 'alt',
              'impact', 'sample', 'genotype',
              'gene', 'transcript', 'pathway')
    print('\t'.join(header))

    _report_variant_pathways(c, args, sample_names)
def sample_lof_variants(c, args, samples):
    """Select the is_lof='1' variants, print a header and report interactions.

    The header carries the variant-level columns only when
    ``args.var_mode`` is set.  The rows are written by
    ``sample_lof_interactions``, which receives the executed cursor, the
    index-to-sample mapping and *samples* unchanged.
    """
    idx_to_sample = util.map_indicies_to_samples(c)

    lof_sql = "SELECT chrom, start, end, \
                      gt_types, gts, gene \
               FROM variants \
               WHERE is_lof='1'"
    c.execute(lof_sql)

    # header
    header = ['sample', 'lof_gene', 'order_of_interaction',
              'interacting_gene']
    if args.var_mode:
        header += ['var_id', 'chrom', 'start',
                   'end', 'impact', 'biotype', 'in_dbsnp', 'clin_sigs',
                   'aaf_1kg_all', 'aaf_esp_all']
    print("\t".join(header))

    sample_lof_interactions(c, args, idx_to_sample, samples)
def sample_lof_variants(c, args, samples):
    """Select the is_lof='1' variants, print a header and report interactions.

    With ``args.var_mode`` set the header also lists the variant-level
    columns.  ``sample_lof_interactions`` then writes the rows from the
    executed cursor, using the index-to-sample mapping and *samples*.
    """
    idx_to_sample = util.map_indicies_to_samples(c)

    lof_sql = "SELECT chrom, start, end, \
                      gt_types, gts, gene \
               FROM variants \
               WHERE is_lof='1'"
    c.execute(lof_sql)

    # header
    header = ['sample', 'lof_gene', 'order_of_interaction',
              'interacting_gene']
    if args.var_mode:
        header += ['var_id', 'chrom', 'start',
                   'end', 'impact', 'biotype', 'in_dbsnp', 'clinvar_sig',
                   'clinvar_disease_name', 'aaf_1kg_all', 'aaf_esp_all']
    print("\t".join(header))

    sample_lof_interactions(c, args, idx_to_sample, samples)
def get_gtcounts_by_sample(c, args):
    """
    Report the count of each genotype class observed for each sample.

    A tab-separated header is printed first, followed by one line per row
    of sample_genotype_counts holding the four class counts and their sum.
    """
    idx_to_sample = util.map_indicies_to_samples(c)

    count_columns = ('num_hom_ref', 'num_het',
                     'num_hom_alt', 'num_unknown', 'total')

    # report.
    print('\t'.join(('sample',) + count_columns))

    c.execute("SELECT *, \
               (num_hom_ref + num_het + num_hom_alt + num_unknown) as total \
               FROM sample_genotype_counts")

    # one output line per sample.
    for record in c:
        # NOTE(review): other reports index idx_to_sample by genotype-array
        # position; confirm sample_id uses the same numbering.
        values = [idx_to_sample[record['sample_id']]]
        for column in count_columns:
            values.append(record[column])
        print("\t".join(str(value) for value in values))
def get_compound_hets(c, args):
    """
    Report candidate compound heterozygous mutations.

    Step 1 collects, per sample and gene, every phased heterozygous site
    among the variants with is_coding = 1 (subject to the
    ``args.allow_other_hets`` and ``args.only_lof`` filters).  Step 2
    prints each ordered pair of distinct sites of the same sample/gene
    whose alternate alleles sit on opposite haplotypes.

    Changes relative to the previous revision:
      * a pair is skipped, instead of aborting the report with ValueError,
        when a site's ALT string is not one of the sample's phased alleles;
      * the genotype blobs are only unpacked for rows that pass the site
        filters.
    """
    # build a mapping of the numpy array index to the appropriate sample name
    # e.g. 0 == 109400005
    #     37 == 147800025
    idx_to_sample = util.map_indicies_to_samples(c)

    comp_hets = collections.defaultdict(lambda: collections.defaultdict(list))

    query = "SELECT * FROM variants \
             WHERE is_coding = 1"  # is_exonic - what about splice?
    c.execute(query)

    # step 1. collect all candidate heterozygotes for all
    # genes and samples.  the list will be refined in step 2.
    for row in c:
        site = Site(row)

        # filter putative sites that the user doesn't care about
        if site.num_hets > 1 and not args.allow_other_hets:
            continue
        if not site.is_lof and args.only_lof:
            continue

        # genotype columns are stored as zlib-compressed pickles.
        gt_types = np.array(cPickle.loads(zlib.decompress(row['gt_types'])))
        gt_phases = np.array(cPickle.loads(zlib.decompress(row['gt_phases'])))
        gt_bases = np.array(cPickle.loads(zlib.decompress(row['gts'])))

        # track each sample that is heterozygous at this site.
        for idx, gt_type in enumerate(gt_types):
            if gt_type != GT_HET:
                continue
            sample = idx_to_sample[idx]
            # each sample gets its own copy so phase/genotype can differ
            sample_site = copy(site)
            sample_site.phased = gt_phases[idx]
            # require phased genotypes
            if not sample_site.phased:
                continue
            sample_site.gt = gt_bases[idx]
            # add the site to the list of candidates
            # for this sample/gene
            comp_hets[sample][site.gene].append(sample_site)

    # header
    print("sample\tgene\thet1\thet2")

    # step 2. now, cull the list of candidate heterozygotes for each
    # gene/sample to those het pairs where the alternate alleles
    # were inherited on opposite haplotypes.
    for sample in comp_hets:
        for gene in comp_hets[sample]:
            sites = comp_hets[sample][gene]
            for site1 in sites:
                for site2 in sites:
                    if site1 == site2:
                        continue

                    # expand the genotypes for this sample
                    # at each site into its composite
                    # alleles.  e.g. A|G -> ['A', 'G']
                    alleles_site1 = site1.gt.split('|')
                    alleles_site2 = site2.gt.split('|')

                    # find the haplotype on which the alternate
                    # allele was observed for this sample at each
                    # candidate het. site.
                    # e.g., if ALT=G and alleles_site1=['A', 'G']
                    # then alt_hap_1 = 1.  if ALT=A, then alt_hap_1 = 0
                    try:
                        alt_hap_1 = alleles_site1.index(site1.alt)
                        alt_hap_2 = alleles_site2.index(site2.alt)
                    except ValueError:
                        # ALT is not among this sample's alleles, so the
                        # haplotype cannot be determined for this pair.
                        continue

                    # it is only a true compound heterozygote iff
                    # the alternates are on opposite haplotypes.
                    if alt_hap_1 != alt_hap_2:
                        print("\t".join([sample,
                                         gene,
                                         str(site1),
                                         str(site2)]))
def get_compound_hets(c, args):
    """
    Report candidate compound heterozygous mutations.

    Step 1 collects, per sample and gene, every heterozygous site among
    the variants whose impact_severity is not 'LOW' (subject to the
    ``args.allow_other_hets`` and ``args.only_lof`` filters).  Unphased
    genotypes are dropped unless ``args.ignore_phasing`` is set.  Step 2
    prints ordered pairs of distinct sites of the same sample/gene: every
    pair when phasing is ignored, otherwise only pairs whose alternate
    alleles sit on opposite haplotypes.

    Changes relative to the previous revision:
      * a pair is skipped, instead of aborting the report with ValueError,
        when a site's ALT string is not one of the sample's phased alleles;
      * the allele lists are no longer computed in ignore-phasing mode,
        where they were never used;
      * the genotype blobs are only unpacked for rows that pass the site
        filters.
    """
    # build a mapping of the numpy array index to the appropriate sample name
    # e.g. 0 == 109400005
    #     37 == 147800025
    idx_to_sample = util.map_indicies_to_samples(c)

    comp_hets = collections.defaultdict(lambda: collections.defaultdict(list))

    query = "SELECT * FROM variants \
             WHERE impact_severity != 'LOW'"  # is_exonic - what about splice?
    c.execute(query)

    # step 1. collect all candidate heterozygotes for all
    # genes and samples.  the list will be refined in step 2.
    for row in c:
        site = Site(row)

        # filter putative sites that the user doesn't care about
        if site.num_hets > 1 and not args.allow_other_hets:
            continue
        if not site.is_lof and args.only_lof:
            continue

        gt_types = compression.unpack_genotype_blob(row['gt_types'])
        gt_phases = compression.unpack_genotype_blob(row['gt_phases'])
        gt_bases = compression.unpack_genotype_blob(row['gts'])

        # track each sample that is heterozygous at this site.
        for idx, gt_type in enumerate(gt_types):
            if gt_type != HET:
                continue
            sample = idx_to_sample[idx]
            # each sample gets its own copy so phase/genotype can differ
            sample_site = copy(site)
            sample_site.phased = gt_phases[idx]
            # require phased genotypes unless told otherwise
            if not sample_site.phased and not args.ignore_phasing:
                continue
            sample_site.gt = gt_bases[idx]
            # add the site to the list of candidates
            # for this sample/gene
            comp_hets[sample][site.gene].append(sample_site)

    # header
    print("sample\tgene\thet1\thet2")

    # step 2. now, cull the list of candidate heterozygotes for each
    # gene/sample to those het pairs where the alternate alleles
    # were inherited on opposite haplotypes.
    for sample in comp_hets:
        for gene in comp_hets[sample]:
            sites = comp_hets[sample][gene]
            for site1 in sites:
                for site2 in sites:
                    if site1 == site2:
                        continue

                    if args.ignore_phasing:
                        # user has asked us to not care about phasing
                        print("\t".join([sample,
                                         gene,
                                         str(site1),
                                         str(site2)]))
                        continue

                    # expand the genotypes for this sample
                    # at each site into its composite
                    # alleles.  e.g. A|G -> ['A', 'G']
                    # (only phased genotypes reach this point)
                    alleles_site1 = site1.gt.split('|')
                    alleles_site2 = site2.gt.split('|')

                    # find the haplotype on which the alternate
                    # allele was observed for this sample at each
                    # candidate het. site.
                    # e.g., if ALT=G and alleles_site1=['A', 'G']
                    # then alt_hap_1 = 1.  if ALT=A, then alt_hap_1 = 0
                    try:
                        alt_hap_1 = alleles_site1.index(site1.alt)
                        alt_hap_2 = alleles_site2.index(site2.alt)
                    except ValueError:
                        # ALT is not among this sample's alleles, so the
                        # haplotype cannot be determined for this pair.
                        continue

                    # it is only a true compound heterozygote iff
                    # the alternates are on opposite haplotypes.
                    if alt_hap_1 != alt_hap_2:
                        print("\t".join([sample,
                                         gene,
                                         str(site1),
                                         str(site2)]))
def get_ind_lof(c, args):
    """Print, per sample, the loss-of-function SNP impacts that sample carries.

    The query joins variants with variant_impacts, keeping impacts with
    is_lof='1' on variants of type 'snp'.  After a tab-separated header,
    one line is printed for every (row, sample) pair whose genotype type
    is HET or HOM_ALT.  Missing values are reported as the string 'None'.

    Fixes relative to the previous revision:
      * an aa_change without a position matching the pattern no longer
        raises IndexError; the position is reported as 'None';
      * the transcript percentage is only computed when a position was
        actually found, instead of calling float() on None;
      * aa_change / aa_length are emitted through their str() form, so a
        non-string aa_length cannot break the join;
      * the sample-independent columns are built once per row.
    """
    idx_to_sample = util.map_indicies_to_samples(c)

    query = "SELECT v.chrom, v.start, v.end, v.ref, v.alt, \
                    v.impact, v.aa_change, v.aa_length, \
                    v.gt_types, v.gts, i.gene, \
                    i.transcript, i.biotype \
             FROM variants v, variant_impacts i \
             WHERE v.variant_id = i.variant_id \
             AND i.is_lof='1' \
             AND v.type = 'snp'"
    c.execute(query)

    # header
    print("\t".join(
        [
            "chrom",
            "start",
            "end",
            "ref",
            "alt",
            "highest_impact",
            "aa_change",
            "var_trans_pos",
            "trans_aa_length",
            "var_trans_pct",
            "sample",
            "genotype",
            "gene",
            "transcript",
            "trans_type",
        ]
    ))

    for r in c:
        # genotype columns are stored as zlib-compressed pickles.
        gt_types = np.array(cPickle.loads(zlib.decompress(r["gt_types"])))
        gts = np.array(cPickle.loads(zlib.decompress(r["gts"])))
        gene = str(r["gene"])
        trans = str(r["transcript"])
        # str() turns a NULL column into the literal "None".
        aa_change = str(r["aa_change"])
        aa_length = str(r["aa_length"])

        transcript_pos = None
        transcript_pct = None
        if aa_change != "None":
            # digits flanked by non-space characters, e.g. "R123W" -> "123"
            positions = re.findall(r"\S(\d+)\S", aa_change)
            if positions:
                transcript_pos = positions[0]
        if transcript_pos is not None and aa_length != "None":
            transcript_pct = float(transcript_pos) / float(aa_length)

        # Columns before and after the per-sample ones are the same for
        # every sample of this row.
        leading = [
            r["chrom"],
            str(r["start"]),
            str(r["end"]),
            r["ref"],
            r["alt"],
            r["impact"],
            aa_change or "None",
            transcript_pos or "None",
            aa_length or "None",
            str(transcript_pct),
        ]
        trailing = [gene, trans, r["biotype"]]

        for idx, gt_type in enumerate(gt_types):
            if gt_type == HET or gt_type == HOM_ALT:
                print("\t".join(
                    leading + [idx_to_sample[idx], gts[idx]] + trailing))