def run_query(args):
    """Run the user's query and print the resulting rows.

    Builds the row predicates, formatter and genotype/gene requirements
    from ``args``, executes ``args.query`` through GeminiQuery and prints
    each row. When ``args.dgidb`` is set, the query is executed twice: once
    to collect the genes to look up in DGIdb and once to print every row
    with its DGIdb information appended after a tab.

    Args:
        args: parsed command-line arguments; the attributes read here are
            db, query, gt_filter, show_variant_samples, sample_delim,
            show_families, bcolz, use_header and dgidb.
    """
    predicates = get_row_predicates(args)
    add_required_columns_to_query(args)
    formatter = select_formatter(args)
    genotypes_needed = needs_genotypes(args)
    gene_needed = needs_gene(args)
    try:
        subjects = get_subjects(args)
    except KeyError:
        subjects = []

    # Extra constructor options; only populated when bcolz filtering is on.
    kwargs = {}
    if args.bcolz:
        from . import gemini_bcolz
        kwargs['variant_id_getter'] = gemini_bcolz.filter

    def _execute():
        # Both passes must be configured identically. The original re-run
        # handed **kwargs to run() instead of the constructor, so the
        # variant_id_getter was not applied the same way on the second pass.
        query = GeminiQuery.GeminiQuery(args.db, out_format=formatter,
                                        **kwargs)
        query.run(args.query, args.gt_filter, args.show_variant_samples,
                  args.sample_delim, predicates, genotypes_needed,
                  gene_needed, args.show_families, subjects=subjects)
        return query

    gq = _execute()

    if args.use_header and gq.header:
        print(gq.header)

    if not args.dgidb:
        for row in gq:
            print(row)
        return

    # collect a list of all the genes that need to be queried from DGIdb
    genes = {row['gene']: True for row in gq}

    # collect info from DGIdb
    dgidb_info = query_dgidb(genes)

    # rerun the query (the cursor is now consumed)
    gq = _execute()

    # report the query results with DGIdb info added at the end.
    for row in gq:
        print(str(row) + "\t" + str(dgidb_info[row['gene']]))
def get_actionable_mutations(parser, args):
    """Print somatic COSMIC-census variants annotated with DGIdb info.

    Selects somatic SNP/indel variants of HIGH or MED impact severity in
    genes flagged ``in_cosmic_census``, looks the affected genes up in
    DGIdb, and prints one tab-separated line per (variant, tumor sample)
    combination, preceded by a header line.

    Args:
        parser: not used by this function; kept for the caller's signature.
        args: parsed command-line arguments; only ``args.db`` is read.
    """
    t_n_pairs = gemini_subjects.get_families(args.db)

    query = "SELECT variants.chrom, start, end, ref, alt, \
                    variants.gene, impact, is_somatic, \
                    gene_summary.in_cosmic_census \
             FROM variants, gene_summary \
             WHERE variants.is_somatic = 1 \
             AND (variants.type = 'snp' \
                 OR variants.type = 'indel') \
             AND (variants.impact_severity = 'HIGH' \
                 OR variants.impact_severity = 'MED') \
             AND variants.chrom = gene_summary.chrom \
             AND variants.gene = gene_summary.gene \
             AND gene_summary.in_cosmic_census = 1"

    # collect the relevant genes and query DGIDB
    gq = GeminiQuery.GeminiQuery(args.db)
    gq.run(query)
    genes = {row['gene']: True for row in gq}

    # collect info from DGIdb
    dgidb_info = query_dgidb(genes)

    # The tumor sample of each pair does not depend on the variant row, so
    # resolve it once here instead of once per row. This also keeps the
    # output complete if get_families() returns a one-shot iterable.
    tumor_names = []
    for pair in t_n_pairs:
        samples = pair.subjects
        if len(samples) != 2:
            continue
        tumor, normal = samples[0], samples[1]
        # swap if we guessed the tumor incorrectly
        if tumor.affected is False:
            tumor = normal
        tumor_names.append(tumor.name)

    # now rerun the query and report actionable mutations per DGIDB and
    # COSMIC census.
    gq = GeminiQuery.GeminiQuery(args.db)
    gq.run(query)
    print('\t'.join(['tum_name', 'chrom', 'start', 'end', 'ref', 'alt',
                     'gene', 'impact', 'is_somatic', 'in_cosmic_census',
                     'dgidb_info']))
    for row in gq:
        for tumor_name in tumor_names:
            fields = [tumor_name, row['chrom'],
                      row['start'], row['end'],
                      row['ref'], row['alt'],
                      row['gene'], row['impact'],
                      row['is_somatic'],
                      row['in_cosmic_census'],
                      str(dgidb_info[row['gene']])]
            print('\t'.join(str(s) for s in fields))
def identify_low_complexity(name, in_vcf, in_bed):
    """Report how many variants in a region fall below the GMS threshold.

    Subsets ``in_vcf`` to ``in_bed``, loads the subset into a gemini
    database and prints ``name`` followed by a line with the number of
    variants whose gms_illumina, gms_solid or gms_iontorrent value is below
    50.0, the total number of variants, and the low-GMS percentage.

    Args:
        name: label passed to subset_by_region and printed first.
        in_vcf: input VCF passed to subset_by_region.
        in_bed: region file passed to subset_by_region.
    """
    gms_thresh = 50.0
    subset_vcf = subset_by_region(name, in_vcf, in_bed)
    gemini_db = create_gemini_db(subset_vcf)

    def _count(sql):
        # Run a "count(*)" query and return the single value it yields.
        gq = GeminiQuery(gemini_db)
        gq.run(sql)
        return list(gq)[0]["count(*)"]

    # Single-argument print() produces the same output on Python 2 and 3.
    print(name)
    total = _count("SELECT count(*) from variants")
    low_gms = _count("SELECT count(*) from variants WHERE gms_illumina < %s OR "
                     "gms_solid < %s OR gms_iontorrent < %s"
                     % (gms_thresh, gms_thresh, gms_thresh))
    # An empty subset used to raise ZeroDivisionError; report 0% instead.
    if float(total):
        percent = float(low_gms) / float(total) * 100.0
    else:
        percent = 0.0
    print("%s %s %s" % (low_gms, total, "%.4f" % percent))
def main():
    """Main function which parses arguments and calls relevant functions.

    For the "sample", "variant" and "table" modes the selected function
    builds a list of lines that is written, newline-joined, to
    ``arguments["output"]``. The "info" mode prints each tab-separated
    field returned by get_fields() on its own line. Any other mode does
    nothing.
    """
    # Parsing arguments
    arguments = parse_arguments()
    # Processing the presets config file
    presets = classes.Presets(arguments["presets_config"])
    # Passing the arguments and presets to a query constructor object
    queryformatter = classes.QueryConstructor(arguments, presets)
    # Creating the gemini database object
    gemini_db = GeminiQuery.GeminiQuery(arguments["input"])

    # The three table-producing modes share one write path; only the
    # function that builds the table differs.
    table_builders = {
        "sample": get_sample_variants,
        "variant": get_variant_information,
        "table": get_table,
    }

    mode = arguments["mode"]
    if mode in table_builders:
        output_table = table_builders[mode](gemini_db, arguments,
                                            queryformatter)
        with open(arguments["output"], 'w') as outputfile:
            outputfile.write('\n'.join(output_table))
    elif mode == "info":
        # Plain loop: the original built a throwaway list of None values
        # just to call print().
        for field in get_fields(gemini_db).split('\t'):
            print(field)
def _add_transcripts(self, variant_obj, gemini_variant):
    """Attach every transcript recorded for a variant to ``variant_obj``.

    Looks up all rows of the ``variant_impacts`` table that belong to the
    variant and adds one Transcript per row through
    ``variant_obj.add_transcript``. Nothing is returned.

    Args:
        variant_obj: object receiving the transcripts; only its
            ``add_transcript`` method is used here.
        gemini_variant (GeminiQueryRow): The gemini variant; its
            'variant_id' value selects the impact rows.
    """
    impacts_query = "SELECT * from variant_impacts WHERE variant_id = {0}".format(
        gemini_variant['variant_id']
    )

    impacts = GeminiQuery(self.db)
    impacts.run(impacts_query)

    for impact in impacts:
        variant_obj.add_transcript(
            Transcript(
                hgnc_symbol=impact['gene'],
                transcript_id=impact['transcript'],
                consequence=impact['impact_so'],
                biotype=impact['biotype'],
                polyphen=impact['polyphen_pred'],
                sift=impact['sift_pred'],
                HGVSc=impact['codon_change'],
                # Missing (falsy) values are rendered as empty strings.
                HGVSp=', '.join([impact['aa_change'] or '',
                                 impact['aa_length'] or ''])
            )
        )
def get_variant_type(variant_source):
    """Try to find out what type of variants exist in a variant source.

    Inspects at most the first 1000 variants and stops at the first SNP.

    Args:
        variant_source (str): Path to variant source

    Returns:
        variant_type (str): 'snv' if any inspected variant is a SNP,
            otherwise 'sv'

    Raises:
        ValueError: if get_file_type() reports neither 'vcf' nor 'gemini'.
            The original failed here with an UnboundLocalError instead.
    """
    from itertools import islice

    max_checked = 1000

    file_type = get_file_type(variant_source)

    if file_type == 'vcf':
        variants = VCF(variant_source)
    elif file_type == 'gemini':
        variants = GeminiQuery(variant_source)
        gemini_query = "SELECT * from variants"
        variants.run(gemini_query)
    else:
        raise ValueError(
            "Unsupported variant source type: {0}".format(file_type))

    # islice enforces the documented limit exactly; the original
    # "if i > 1000: break" examined 1002 records. One SNP settles the
    # answer, so there is no need to keep scanning after it.
    for variant in islice(variants, max_checked):
        if file_type == 'vcf':
            is_snv = variant.is_snp
        else:
            is_snv = variant['type'] == 'snp'
        if is_snv:
            return 'snv'

    return 'sv'
def query_json():
    """Run the request's 'query' parameter and return its rows as dicts.

    Returns:
        dict: ``{'gemini_results': [...]}`` with one plain dict per row.
    """
    sql = request.GET.get('query', '').strip()

    browser_query = GeminiQuery.GeminiQuery(database)
    browser_query._set_gemini_browser(True)
    browser_query.run(sql)

    results = []
    for row in browser_query:
        results.append(dict(row))
    return {'gemini_results': results}
def get_query_results(gemini_db, query, gt_filter="", out_format=None):
    """Run ``query`` against ``gemini_db`` and return the query object.

    Args:
        gemini_db: database handed to GeminiQuery.
        query: query string to run.
        gt_filter: genotype filter forwarded to ``run``.
        out_format: row formatter for GeminiQuery. When omitted (None), a
            new ``DefaultRowFormat(None)`` is created for this call. The
            previous default was a single instance built at definition
            time and shared by every call.

    Returns:
        The GeminiQuery object after ``run`` has been called; iterate it to
        obtain the rows.
    """
    if out_format is None:
        out_format = DefaultRowFormat(None)
    gemini = GeminiQuery(gemini_db, out_format=out_format)
    gemini.run(query, gt_filter=gt_filter)
    return gemini
def generate_phenotypes(database):
    """Group sample names by their integer phenotype code.

    Args:
        database: gemini database handed to GeminiQuery.

    Returns:
        dict: phenotype code -> list of sample names. Keys 1 and 2 are
        always present. Any other code found in the samples table gets its
        own key instead of raising KeyError as the original did.
    """
    samples = GeminiQuery(database)
    samples.run("SELECT name, phenotype FROM samples")

    phenotypes = {1: [], 2: []}
    for row in samples:
        code = int(row["phenotype"])
        phenotypes.setdefault(code, []).append(row["name"])
    return phenotypes
def _prep_priority_filter(gemini_db, data):
    """Prepare tabix indexed file with priority based filters and supporting
    information.

    Writes ``<gemini_db base>-priority.tsv`` unless that file or its ``.gz``
    counterpart already exists, then bgzips and indexes it.

    Args:
        gemini_db: path of the gemini database to query.
        data: sample dictionary; the reference file, the sample name and
            ``data["config"]`` are read from it.

    Returns:
        The value returned by ``vcfutils.bgzip_and_index``.
    """
    from gemini import GeminiQuery

    out_file = "%s-priority.tsv" % utils.splitext_plus(gemini_db)[0]
    already_prepared = (utils.file_exists(out_file)
                        or utils.file_exists(out_file + ".gz"))
    if not already_prepared:
        contigs = ref.file_contigs(dd.get_ref_file(data), data["config"])
        ref_chroms = {contig.name for contig in contigs}
        with file_transaction(data, out_file) as tx_out_file:
            gq = GeminiQuery(gemini_db)
            pops = [
                "aaf_esp_ea", "aaf_esp_aa", "aaf_esp_all",
                "aaf_1kg_amr", "aaf_1kg_eas", "aaf_1kg_sas",
                "aaf_1kg_afr", "aaf_1kg_eur", "aaf_1kg_all",
                "aaf_adj_exac_all", "aaf_adj_exac_afr", "aaf_adj_exac_amr",
                "aaf_adj_exac_eas", "aaf_adj_exac_fin", "aaf_adj_exac_nfe",
                "aaf_adj_exac_oth", "aaf_adj_exac_sas"
            ]
            attrs = (
                "chrom, start, end, ref, alt, impact_so, impact_severity, in_dbsnp, "
                "cosmic_ids, clinvar_sig, clinvar_origin, fitcons, gt_ref_depths, gt_alt_depths"
            ).split(", ")
            gq.run("SELECT %s FROM variants" % ", ".join(attrs + pops))
            sidx = gq.sample_to_idx[dd.get_sample_name(data)]

            # Output columns: the five position columns, the computed
            # filter, the remaining attributes except the two depth
            # columns, the "_all" frequencies, and the computed freq.
            all_pops = [pop for pop in pops if pop.endswith("_all")]
            header = attrs[:5] + ["filter"] + attrs[5:-2] + all_pops + ["freq"]

            with open(tx_out_file, "w") as out_handle:
                writer = csv.writer(out_handle, dialect="excel-tab")
                # The header line is written with a leading '#'.
                writer.writerow(["#" + header[0]] + header[1:])
                for row in gq:
                    ref_depth = tz.get_in(["gt_ref_depths", sidx], row, 0)
                    alt_depth = tz.get_in(["gt_alt_depths", sidx], row, 0)
                    out_vals = dict(row.row)
                    try:
                        freq = "%.2f" % (
                            float(alt_depth) / float(ref_depth + alt_depth))
                    except ZeroDivisionError:
                        freq = "0.00"
                    out_vals["freq"] = freq
                    out_vals["filter"] = _calc_priority_filter(row, pops)
                    chrom = out_vals["chrom"]
                    # Use the converted contig name only when the original
                    # name is not a reference contig but the converted one is.
                    if chrom not in ref_chroms and _hg19_to_GRCh37(
                            chrom) in ref_chroms:
                        out_vals["chrom"] = _hg19_to_GRCh37(chrom)
                    writer.writerow([out_vals[col] for col in header])
    return vcfutils.bgzip_and_index(out_file, data["config"],
                                    tabix_args="-0 -c '#' -s 1 -b 2 -e 3")
def family_wise_predicate(args):
    """Build a row predicate requiring a minimum number of matching families.

    One predicate is created per family via select_subjects_predicate.
    The returned function adds up the per-family results for a row and
    compares the total against ``args.min_kindreds``.

    Args:
        args: parsed command-line arguments; ``db`` and ``min_kindreds``
            are read here, the rest is passed on to helpers.

    Returns:
        callable: ``predicate(row)`` -> bool.
    """
    formatter = select_formatter(args)
    families = get_family_dict(args)
    # Constructed as in the original implementation; the object itself is
    # not used further in this function.
    gq = GeminiQuery.GeminiQuery(args.db, out_format=formatter)

    def _predicate_for(members):
        member_names = [member.name for member in members]
        family_subjects = get_subjects_in_family(args, members).values()
        return select_subjects_predicate(family_subjects, args, member_names)

    predicates = [_predicate_for(members) for members in families.values()]

    def predicate(row):
        matching = 0
        for family_predicate in predicates:
            matching += family_predicate(row)
        return matching >= args.min_kindreds

    return predicate
def get_query_results(gemini_db, query, gt_filter="", as_dataframe=False):
    """Run ``query`` and return its results.

    Args:
        gemini_db: database handed to GeminiQuery.
        query: query string to run.
        gt_filter: genotype filter forwarded to ``run``.
        as_dataframe: when true, collect the rows into a pandas DataFrame.

    Returns:
        The GeminiQuery object itself (to be iterated by the caller), or,
        with ``as_dataframe``, a DataFrame whose cells are the
        tab-separated pieces of each row's string form and whose columns
        are the tab-separated pieces of the query header.
    """
    results = GeminiQuery(gemini_db)
    results.run(query, gt_filter=gt_filter)

    if not as_dataframe:
        # Return results as iterator.
        return results

    # Return results as dataframe.
    records = []
    for row in results:
        records.append(str(row).split('\t'))
    column_names = results.header.split('\t')
    return pd.DataFrame(records, columns=column_names)
def _variants(self, case_id, gemini_query):
    """Yield formatted variants found in the gemini database of a case.

    Sets ``self.db`` and ``self.variant_type`` from the case before running
    the query. When the variant type is 'snv', rows for which
    ``self._is_variant`` is false are skipped. Every remaining row gets the
    next running index (starting at 1) and is formatted with
    ``self._format_variant``; falsy results are not yielded.

    Args:
        case_id (str): The case for which we want to see information
        gemini_query (str): What variants should be chosen

    Yields:
        variant_obj: the value returned by ``self._format_variant``
    """
    # Get the individuals for the case
    case_obj = self.case(case_id)
    individuals = list(case_obj.individuals)

    self.db = case_obj.variant_source
    self.variant_type = case_obj.variant_type

    gq = GeminiQuery(self.db)
    gq.run(gemini_query)

    index = 0
    for gemini_variant in gq:
        # Check if variant is non ref in the individuals
        is_variant = self._is_variant(gemini_variant, individuals)
        if self.variant_type == 'snv' and not is_variant:
            continue

        index += 1
        logger.debug("Updating index to: {0}".format(index))
        variant = self._format_variant(
            case_id=case_id,
            gemini_variant=gemini_variant,
            individual_objs=individuals,
            index=index
        )
        if variant:
            yield variant
def csv_to_query_string(csv_handle, db):
    """Print the variants matching the positions listed in a CSV file.

    Each line of ``csv_handle`` must hold four comma-separated fields:
    chrom, pos, ref, alt. Only chrom and pos are used, as
    ``chrom='<chrom>' and start=<pos>``. Positions are queried in batches
    of 100 OR-ed conditions. The header is printed once, before the rows
    of the first batch.

    Fixes over the original: a final batch of fewer than 100 positions is
    now queried instead of being dropped, the header is printed even when
    the input has fewer than 100 lines, and blank lines are skipped.

    Args:
        csv_handle: iterable of text lines.
        db: gemini database handed to GeminiQuery.
    """
    batch_size = 100

    def _run_batch(conditions, show_header):
        # Query one batch of OR-ed position conditions and print its rows.
        gq = GeminiQuery(db)
        gq.run("select * from variants where %s" % " or ".join(conditions))
        if show_header:
            print(gq.header)
        for row in gq:
            print(row)

    pending = []
    batches_run = 0
    for line in csv_handle:
        line = line.strip()
        if not line:
            continue
        # ref and alt are required by the format but not used in the query.
        chrom, pos, _ref, _alt = line.split(",")
        pending.append(
            "(chrom='{chrom}' and start={pos})".format(chrom=chrom, pos=pos))
        if len(pending) == batch_size:
            _run_batch(pending, batches_run == 0)
            batches_run += 1
            pending = []

    # Flush whatever is left after the last full batch.
    if pending:
        _run_batch(pending, batches_run == 0)
def stats_region(chrom):
    """Return start/end of the variants inside a requested region.

    ``chrom`` is given as an argument; ``start`` and ``end`` are read from
    the HTTP GET parameters.

    Because the query text is assembled from request data, the numeric
    bounds are converted with int() and single quotes in ``chrom`` are
    doubled before being placed in the SQL string.

    Args:
        chrom (str): chromosome name to filter on.

    Returns:
        dict: ``{'features': [...]}`` with one plain dict per row.

    Raises:
        ValueError: if ``start`` or ``end`` is missing or not an integer.
            Previously such values produced a malformed SQL statement.
    """
    start = int(request.GET.get('start', '').strip())
    end = int(request.GET.get('end', '').strip())
    # Doubling a single quote is the SQL escape for a quote inside a
    # string literal.
    quoted_chrom = chrom.replace("'", "''")

    # construct a query
    query = "SELECT start, end from variants"
    query += " WHERE chrom = '" + quoted_chrom + "'"
    query += " AND start >= " + str(start)
    query += " AND end <= " + str(end)

    # issue the query
    gq = GeminiQuery.GeminiQuery(database)
    gq._set_gemini_browser(True)
    gq.run(query)

    # return query results in JSON format
    return {'features': [dict(row) for row in gq]}
def summarize_gene_region(args, gene):
    """Print the unfiltered variants of one gene, ordered by start.

    For every variant whose impact severity is not "LOW", the row is
    printed followed by one indented line (sample, genotype, depth) for
    each sample whose genotype type is greater than 0. For "LOW" severity
    variants of type "indel", only chrom, start, ref and alt are printed.

    NOTE(review): the source this was restored from had lost its
    indentation. The ``elif`` is attached to the severity test here;
    confirm against the original layout.

    Args:
        args: parsed arguments; ``geminidb`` and ``chrom`` are read.
        gene: gene name used in the WHERE clause.
    """
    gq = GeminiQuery(args.geminidb)
    gq.run("SELECT chrom, start, end, ref, alt, type, "
           "num_hom_ref, num_het, num_hom_alt, "
           "aaf_1kg_all, "
           "gene, impact, impact_severity, aa_change, clinvar_sig, "
           "grc, gms_illumina, in_cse, rmsk "
           "FROM variants WHERE "
           "chrom == '{chrom}' AND gene == '{gene}' AND filter is NULL "
           "ORDER BY start".format(chrom=args.chrom, gene=gene))
    for row in gq:
        if row["impact_severity"] not in ["LOW"]:
            # Single-argument print() gives the same text on Python 2 and 3.
            print(row)
            genotypes = zip(row.gts, row.gt_types, row.gt_depths)
            for i, (gt, gt_type, gt_depth) in enumerate(genotypes):
                if gt_type > 0:
                    # Two leading spaces: the original printed " " plus the
                    # separator space added by the print statement.
                    print("  %s %s %s" % (gq.index2sample[i], gt, gt_depth))
        elif row["type"] == "indel":
            print("%s %s %s %s" % (row["chrom"], row["start"],
                                   row["ref"], row["alt"]))
def summarize_by_gene_and_sample(db, coding_only=True):
    """Count variant alleles per gene and sample.

    This is copied from GEMINI's own burden tool.

    Selects variants that are not 'synonymous_coding' and not in 1000
    Genomes (optionally only those with a codon change). Each entry of
    ``HET_samples`` counts once and each entry of ``HOM_ALT_samples``
    counts twice. Variants without a gene name are assigned via
    get_nearby_gene().

    Args:
        db: gemini database handed to GeminiQuery.
        coding_only: when true, also require ``codon_change != 'None'``.

    Returns:
        pandas.DataFrame: one row per gene, one column per sample, missing
        values filled with 0. An empty DataFrame is returned when no
        variant matches; the original raised ValueError from ``pd.concat``
        in that case.
    """
    query = ("select chrom, start, end, gene, impact, info from variants where"
             " impact != 'synonymous_coding' and in_1kg=0 ")
    if coding_only:
        query += " and codon_change != 'None'"
    gq = GeminiQuery(db)
    gq.run(query, show_variant_samples=True)

    burden = defaultdict(Counter)
    for row in gq:
        gene_name = row['gene']
        if not gene_name:
            gene_name = get_nearby_gene(row["chrom"], row["start"],
                                        row["end"])
        new_counts = Counter(row["HET_samples"])
        # Counter can't do scalar multiplication, so add the homozygous
        # samples twice to weight them by two alleles.
        hom_alt_counts = Counter(row["HOM_ALT_samples"])
        new_counts = new_counts + hom_alt_counts
        new_counts = new_counts + hom_alt_counts
        # Drop the empty-string entry; deleting an absent key from a
        # Counter does not raise.
        del new_counts['']
        burden[gene_name] += new_counts

    dfs = [pd.DataFrame(counts, columns=counts.keys(), index=[gene_name])
           for gene_name, counts in burden.items()]
    if not dfs:
        return pd.DataFrame()
    return pd.concat(dfs).fillna(0)
def variant(self, case_id, variant_id):
    """Return a specific variant of a case.

    Queries the gemini ``variants`` table by ``variant_id`` and formats the
    first row found, with all information added. Sets ``self.db`` and
    ``self.variant_type`` from the case as a side effect.

    Args:
        case_id (str): identifier passed to ``self.case``
        variant_id (int): A gemini variant id

    Returns:
        The value returned by ``self._format_variant`` for the first
        matching row, or None when the query yields no rows.
    """
    # Use the gemini id for fast lookup
    variant_id = int(variant_id)
    gemini_query = "SELECT * from variants WHERE variant_id = {0}".format(
        variant_id)

    # Get the individuals for the case
    case_obj = self.case(case_id)
    individuals = list(case_obj.individuals)

    self.db = case_obj.variant_source
    self.variant_type = case_obj.variant_type

    gq = GeminiQuery(self.db)
    gq.run(gemini_query)

    # Only the first row is of interest.
    first_row = next(iter(gq), None)
    if first_row is None:
        return None

    return self._format_variant(
        case_id=case_id,
        gemini_variant=first_row,
        individual_objs=individuals,
        index=first_row['variant_id'],
        add_all_info=True
    )
def test_gemini_db(self):
    """Check if self.db is a valid gemini database.

    The check consists of constructing a GeminiQuery for ``self.db``; any
    exception raised by that constructor propagates to the caller
    (per the original documentation, sqlite3.DatabaseError for an invalid
    database).

    Returns:
        bool: True when the construction succeeds.
    """
    GeminiQuery(self.db)
    return True
def get_impact_gene(args):
    """Return the gene of the first variant at the given position.

    The variants table is queried with ``start == args.pos - 1``.

    Args:
        args: parsed arguments; ``geminidb``, ``chrom`` and ``pos`` are read.

    Returns:
        The 'gene' value of the first row returned by the query.

    Raises:
        StopIteration: if no variant matches.
    """
    gq = GeminiQuery(args.geminidb)
    gq.run(
        "SELECT gene FROM variants WHERE chrom == '{chrom}' AND start == {pos}"
        .format(chrom=args.chrom, pos=args.pos - 1))
    # The builtin next() works with both the Python 2 and the Python 3
    # iterator protocol; calling gq.next() directly is Python 2 style.
    return next(gq)["gene"]
def extract_shared_mutations(database, reference_file=None):
    """Tabulate variants carried by samples of both phenotype groups.

    Queries all variants with ``in_1kg=0`` and keeps those whose
    ``variant_samples`` include at least one sample from phenotype group 1
    and one from group 2 (as returned by generate_phenotypes). For each
    kept variant, samples are grouped by the second "_"-separated part of
    their name; groups with exactly two samples are recorded as pairs.

    Args:
        database: gemini database handed to GeminiQuery.
        reference_file: optional file handed to check_multiple_alts for
            every two-sample group; when None the ALT value is taken as-is.

    Returns:
        pandas.DataFrame indexed by (chrom, start, end, gene, ref, alt)
        with one row per variant and pair id; variants without any valid
        pair are dropped at the end.
    """
    phenotypes = generate_phenotypes(database)
    query = GeminiQuery(database)
    query_string = ("SELECT chrom, start, end, gene, ref, alt, type, sub_type,"
                    "impact, codon_change, aa_change, vcf_id, cosmic_ids"
                    " FROM variants WHERE in_1kg=0")
    query.run(query_string, show_variant_samples=True)
    rows = list()
    for row in query:
        variants = row.variant_samples
        # Keep only variants seen in both phenotype groups.
        if any(item in phenotypes[1] for item in variants) and any(
                item in phenotypes[2] for item in variants):
            valid_groups = list()
            chrom, start, end, alt = (row["chrom"], row["start"], row["end"],
                                      row["alt"])
            # In the case of intergenic regions, get the name of the
            # closest gene
            if row["gene"] is None:
                gene = get_nearby_gene(chrom, start, end)
                # print "None substituted with", gene
            else:
                gene = row["gene"]
            # NOTE(review): groupby only merges adjacent items, so this
            # presumably relies on samples of the same pair being adjacent
            # in variant_samples - confirm.
            for gid, group in groupby(variants, lambda x: x.split("_")[1]):
                # Rename according to Pandora guidelines
                # Starts with 0: 1 + number
                # 3 digits: 20 + number
                # 4 digits: 2 + number
                # The code below prefixes "2" to 4-character ids and "20"
                # to anything shorter; ids of 5 or more characters are
                # kept unchanged.
                if len(gid) < 5:
                    newgid = "2" + gid if len(gid) == 4 else "20" + gid
                else:
                    newgid = gid
                group = list(group)
                if len(list(group)) == 2:
                    # Check if we have different ALT bases for the samples
                    # in the same pair. If this occurs, it is a false positive
                    # and should be discarded. To do so, we need a VCF file to
                    # query by base, otherwise we take the value as-is.
                    if reference_file is not None:
                        # NOTE(review): alt is rebound here, so the value
                        # (including None) carries over to later groups of
                        # the same variant and into the output row below -
                        # confirm this is intended.
                        alt = check_multiple_alts(chrom, start, end, alt,
                                                  group, reference_file)
                        if alt is None:
                            # Biallelic site for pair - discard
                            continue
                    valid_groups.append(newgid)
            cosmic_data = "Yes" if row["cosmic_ids"] else "No"
            data = [chrom, start, end, gene, row["ref"], alt, row["type"],
                    row["sub_type"], row["impact"], row["codon_change"],
                    row["aa_change"], row["vcf_id"], cosmic_data]
            # One output row per valid pair; NaN marks a variant without
            # pairs so it can be dropped below.
            if not valid_groups:
                rows.append(data + [np.nan])
            else:
                for gid in valid_groups:
                    rows.append(data + [gid])
    colnames = ["chrom", "start", "end", "gene", "ref", "alt", "type",
                "sub_type", "impact", "codon_change", "aa_change", "dbsnp_id",
                "in_cosmic", "variants_with_pairs"]
    df = pd.DataFrame.from_records(rows, columns=colnames)
    df.set_index(["chrom", "start", "end", "gene", "ref", "alt"], inplace=True)
    # Get rid of loci without pairs
    df = df.dropna(subset=["variants_with_pairs"])
    return df
def query():
    """Handle the query page: show the form, run a query, or save results.

    Behaviour depends on which GET parameter is set:

    * ``submit``: run the query and render the rows in the template. When
      IGV links are requested but the query text lacks 'chrom', 'start' or
      'end', ``igv_links_error`` is set to True.
    * ``save``: run the query and dump the rows, tab-separated, to
      ``tmp.txt`` in the static folder, then render the template with a
      reference to that file.
    * neither: render the empty form.

    An empty query renders the empty form. It is now checked before the
    query is executed; the original ran the empty query first.
    """

    def _get_fields():
        query = request.GET.get('query', '').strip()
        gt_filter = request.GET.get('gt_filter', '').strip()
        use_header = request.GET.get('use_header')
        igv_links = request.GET.get('igv_links')
        return query, gt_filter or None, use_header, igv_links

    # user clicked the "submit" button
    if request.GET.get('submit', '').strip():

        (query, gt_filter, use_header, igv_links) = _get_fields()

        if use_header:
            use_header = True
        if igv_links:
            igv_links = True

        if not query:
            return template('query.j2', dbfile=database)

        gq = GeminiQuery.GeminiQuery(database)
        gq._set_gemini_browser(True)
        gq.run(query, gt_filter)

        # IGV links need the chrom, start and end columns in the query.
        query_text = query.lower()
        missing_coordinates = ('chrom' not in query_text
                               or 'start' not in query_text
                               or 'end' not in query_text)
        igv_links_error = bool(igv_links and missing_coordinates)

        return template('query.j2', dbfile=database,
                        rows=gq,
                        igv_links=igv_links,
                        igv_links_error=igv_links_error,
                        use_header=use_header,
                        gt_filter=gt_filter,
                        query=query)

    # user clicked the "save to file" button
    elif request.GET.get('save', '').strip():

        (query, gt_filter, use_header, igv_links) = _get_fields()

        if not query:
            return template('query.j2', dbfile=database)

        gq = GeminiQuery.GeminiQuery(database)
        gq.run(query, gt_filter)

        # dump the results to a text file. this will be
        # stored in /static and a link will be given to
        # the user.
        tmp_file = '/tmp.txt'
        # The context manager closes the file even if a row fails to
        # serialise.
        with open(_static_folder + tmp_file, 'w') as tmp:
            for i, row in enumerate(gq):
                if i == 0 and use_header:
                    tmp.write('\t'.join([str(key) for key in row.keys()])
                              + '\n')
                tmp.write('\t'.join([str(row[key]) for key in row.keys()])
                          + '\n')

        return template('query.j2', dbfile=database,
                        tmp_file=tmp_file,
                        igv_links=igv_links,
                        igv_links_error=True,
                        use_header=use_header,
                        gt_filter=gt_filter,
                        query=query)

    # user did nothing.
    else:
        return template('query.j2', dbfile=database)
#!/usr/share/gemini/anaconda/bin/python -E # -*- coding: utf-8 -*- # usage: gemini_summarize.py <query> <gemini.db> import sys import locale from gemini import GeminiQuery DP_THRESHOLD = 8 GQ_THRESHOLD = 20 reload(sys) sys.setdefaultencoding(locale.getpreferredencoding()) gq = GeminiQuery(sys.argv[2], include_gt_cols=True) gq.run(sys.argv[1], None, True) header_printed = False genotype_columns = [ 'gt_depths', 'gt_ref_depths', 'gt_alt_depths', 'gts', 'gt_quals' ] for row in gq: columns = row.row.keys()[:-11] if (not header_printed): # Print file header print '\t'.join(['SAMPLE_ID'] + [s[:-1] for s in genotype_columns] + columns) header_printed = True # Output only het & hom alt variants for sample in row['variant_samples']:
def get_individuals(variant_source, case_lines=None, case_type='ped',
                    variant_mode='vcf'):
    """Get the individuals from a vcf file, gemini database, and/or a ped file.

    In 'vcf' mode the individuals come from the ped-like ``case_lines``
    when given (exactly one family is allowed), otherwise from the vcf
    header with the file's base name as case id. In 'gemini' mode they are
    read from the ``samples`` table of the database. Any other mode yields
    an empty list.

    Args:
        variant_source (str): Path to a variant source
        case_lines(Iterable): Ped like lines
        case_type(str): Format of ped lines
        variant_mode(str): 'vcf' or 'gemini'

    Returns:
        individuals (list): list with Individuals

    Raises:
        IOError: if ``case_lines`` describe more or fewer than one family.
        PedigreeError: if an individual of the ped lines is missing from
            the vcf header.
    """
    individuals = []
    ind_dict = {}

    if variant_mode == 'vcf':
        head = get_header(variant_source)
        # Dictionary with ind_id:index where index shows where in the vcf
        # the info for that individual is found
        for index, ind in enumerate(head.individuals):
            ind_dict[ind] = index

        if case_lines:
            # read individuals from ped file
            family_parser = FamilyParser(case_lines, family_type=case_type)
            families = family_parser.families
            logger.debug("Found families {0}".format(
                ','.join(list(families.keys()))))
            if len(families) != 1:
                message = "Only one family can be used with vcf adapter"
                logger.error(message)
                raise IOError(message)

            case_id = list(families.keys())[0]
            logger.debug("Family used in analysis: {0}".format(case_id))

            for ind_id in family_parser.individuals:
                ind = family_parser.individuals[ind_id]
                logger.info("Found individual {0}".format(ind.individual_id))
                # Only the header lookup may signal a missing individual. A
                # KeyError raised while building the Individual is a
                # different problem and must not be reported as such.
                try:
                    ind_index = ind_dict[ind_id]
                except KeyError:
                    # This is the case when an individual in the ped does
                    # not exist in the vcf
                    raise PedigreeError(
                        family_id=case_id,
                        individual_id=ind_id,
                        message="Individual {0} exists in ped file but not in vcf".format(ind_id)
                    )
                individuals.append(
                    Individual(
                        ind_id=ind_id,
                        case_id=case_id,
                        mother=ind.mother,
                        father=ind.father,
                        sex=str(ind.sex),
                        phenotype=str(ind.phenotype),
                        variant_source=variant_source,
                        ind_index=ind_index,
                    )
                )
        else:
            case_id = os.path.basename(variant_source)
            for ind in ind_dict:
                individual = Individual(
                    ind_id=ind,
                    case_id=case_id,
                    variant_source=variant_source,
                    ind_index=ind_dict[ind]
                )
                individuals.append(individual)
                logger.debug("Found individual {0} in {1}".format(
                    ind, variant_source))

    elif variant_mode == 'gemini':
        gq = GeminiQuery(variant_source)
        # Dictionary with sample to index in the gemini database
        ind_dict = gq.sample_to_idx
        query = "SELECT * from samples"
        gq.run(query)
        for individual in gq:
            logger.debug("Found individual {0} with family id {1}".format(
                individual['name'], individual['family_id']))
            individuals.append(
                Individual(
                    ind_id=individual['name'],
                    case_id=individual['family_id'],
                    mother=individual['maternal_id'],
                    father=individual['paternal_id'],
                    sex=individual['sex'],
                    phenotype=individual['phenotype'],
                    ind_index=ind_dict.get(individual['name']),
                    variant_source=variant_source,
                    bam_path=None)
            )

    return individuals