def _get_individuals(self):
    """Return a list with the individual objects found in db

    Returns:
        individuals (list): List of Individuals
    """
    individuals = []
    gq = GeminiQuery(self.db)
    # Dictionary with sample to index in the gemini database
    sample_to_idx = gq.sample_to_idx

    query = "SELECT * from samples"
    gq.run(query)

    for individual in gq:
        logger.info("Found individual {0} with family id {1}".format(
            individual['name'], individual['family_id']))
        individuals.append(
            Individual(
                ind_id=individual['name'],
                case_id=individual['family_id'],
                mother=individual['maternal_id'],
                father=individual['paternal_id'],
                sex=individual['sex'],
                phenotype=individual['phenotype'],
                index=sample_to_idx.get(individual['name']),
                variant_source=self.db,
                bam_path=None)
        )

    return individuals
def _get_transcripts(self, gemini_variant):
    """Yield Transcript objects for a variant

    Go through all transcripts found for the variant

    Args:
        gemini_variant (GeminiQueryRow): The gemini variant

    Yields:
        transcript (puzzle.models.Transcript)
    """
    query = "SELECT * from variant_impacts WHERE variant_id = {0}".format(
        gemini_variant['variant_id']
    )
    gq = GeminiQuery(self.db)
    gq.run(query)

    for gemini_transcript in gq:
        transcript = Transcript(
            hgnc_symbol=gemini_transcript['gene'],
            transcript_id=gemini_transcript['transcript'],
            consequence=gemini_transcript['impact_so'],
            biotype=gemini_transcript['biotype'],
            polyphen=gemini_transcript['polyphen_pred'],
            sift=gemini_transcript['sift_pred'],
            HGVSc=gemini_transcript['codon_change'],
            HGVSp=gemini_transcript['aa_change']
        )
        yield transcript
def _get_transcripts(self, gemini_variant):
    """Return the transcripts for a variant

    Gemini stores the information for the most severe transcript
    so only one transcript is connected to one variant.

    Args:
        gemini_variant (GeminiQueryRow): The gemini variant

    Returns:
        transcripts (list): List of affected transcripts
    """
    query = "SELECT * from variant_impacts WHERE variant_id = {0}".format(
        gemini_variant['variant_id']
    )
    gq = GeminiQuery(self.db)
    gq.run(query)

    transcripts = []
    for transcript in gq:
        transcripts.append(Transcript(
            hgnc_symbol=transcript['gene'],
            transcript_id=transcript['transcript'],
            consequence=transcript['impact_so'],
            biotype=transcript['biotype'],
            polyphen=transcript['polyphen_pred'],
            sift=transcript['sift_pred'],
            HGVSc=transcript['codon_change'],
            HGVSp=transcript['aa_change']
        ))
    return transcripts
def _add_transcripts(self, variant_obj, gemini_variant): """ Add all transcripts for a variant Go through all transcripts found for the variant Args: gemini_variant (GeminiQueryRow): The gemini variant Yields: transcript (puzzle.models.Transcript) """ query = "SELECT * from variant_impacts WHERE variant_id = {0}".format( gemini_variant['variant_id'] ) gq = GeminiQuery(self.db) gq.run(query) for gemini_transcript in gq: transcript = Transcript( hgnc_symbol=gemini_transcript['gene'], transcript_id=gemini_transcript['transcript'], consequence=gemini_transcript['impact_so'], biotype=gemini_transcript['biotype'], polyphen=gemini_transcript['polyphen_pred'], sift=gemini_transcript['sift_pred'], HGVSc=gemini_transcript['codon_change'], HGVSp=', '.join([gemini_transcript['aa_change'] or '', gemini_transcript['aa_length'] or '']) ) variant_obj.add_transcript(transcript)
def get_query_results(gemini_db, query, gt_filter="", out_format=DefaultRowFormat(None)):
    """ Returns results of query. """
    gemini = GeminiQuery(gemini_db, out_format=out_format)
    gemini.run(query, gt_filter=gt_filter)
    return gemini
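# A minimal usage sketch for the get_query_results helper above (not part of
# the original source). The database path, query, and gt_filter sample name
# are hypothetical placeholders; the gt_filter string follows GEMINI's
# genotype-filter convention.
def example_get_query_results():
    rows = get_query_results(
        "example_study.db",
        "SELECT chrom, start, end, gene FROM variants",
        gt_filter="gt_types.SAMPLE1 == HET")
    for row in rows:
        print(row["gene"])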
def run_query(args): predicates = get_row_predicates(args) add_required_columns_to_query(args) formatter = select_formatter(args) genotypes_needed = needs_genotypes(args) gene_needed = needs_gene(args) try: subjects = get_subjects(args) except KeyError: subjects = [] kwargs = {} if args.bcolz: from . import gemini_bcolz kwargs['variant_id_getter'] = gemini_bcolz.filter gq = GeminiQuery.GeminiQuery(args.db, out_format=formatter, **kwargs) gq.run(args.query, args.gt_filter, args.show_variant_samples, args.sample_delim, predicates, genotypes_needed, gene_needed, args.show_families, subjects=subjects) if args.use_header and gq.header: print(gq.header) if not args.dgidb: for row in gq: print(row) else: # collect a list of all the genes that need to be queried # from DGIdb genes = defaultdict() for row in gq: genes[row['gene']] = True # collect info from DGIdb dgidb_info = query_dgidb(genes) # rerun the query (the cursor is now consumed) gq = GeminiQuery.GeminiQuery(args.db, out_format=formatter) gq.run(args.query, args.gt_filter, args.show_variant_samples, args.sample_delim, predicates, genotypes_needed, gene_needed, args.show_families, subjects=subjects, **kwargs) # report the query results with DGIdb info added at the end. for row in gq: print(str(row) + "\t" + str(dgidb_info[row['gene']]))
def _prep_priority_filter(gemini_db, data): """Prepare tabix indexed file with priority based filters and supporting information """ from gemini import GeminiQuery out_file = "%s-priority.tsv" % utils.splitext_plus(gemini_db)[0] if not utils.file_exists(out_file) and not utils.file_exists(out_file + ".gz"): ref_chroms = set([x.name for x in ref.file_contigs(dd.get_ref_file(data), data["config"])]) with file_transaction(data, out_file) as tx_out_file: gq = GeminiQuery(gemini_db) pops = [ "aaf_esp_ea", "aaf_esp_aa", "aaf_esp_all", "aaf_1kg_amr", "aaf_1kg_eas", "aaf_1kg_sas", "aaf_1kg_afr", "aaf_1kg_eur", "aaf_1kg_all", "aaf_adj_exac_all", "aaf_adj_exac_afr", "aaf_adj_exac_amr", "aaf_adj_exac_eas", "aaf_adj_exac_fin", "aaf_adj_exac_nfe", "aaf_adj_exac_oth", "aaf_adj_exac_sas", ] attrs = ( "chrom, start, end, ref, alt, impact_so, impact_severity, in_dbsnp, " "cosmic_ids, clinvar_sig, clinvar_origin, fitcons, gt_ref_depths, gt_alt_depths" ).split(", ") gq.run("SELECT %s FROM variants" % ", ".join(attrs + pops)) sidx = gq.sample_to_idx[dd.get_sample_name(data)] header = attrs[:5] + ["filter"] + attrs[5:-2] + [x for x in pops if x.endswith("_all")] + ["freq"] with open(tx_out_file, "w") as out_handle: writer = csv.writer(out_handle, dialect="excel-tab") cheader = header[:] cheader[0] = "#" + cheader[0] writer.writerow(cheader) for row in gq: ref_depth = tz.get_in(["gt_ref_depths", sidx], row, 0) alt_depth = tz.get_in(["gt_alt_depths", sidx], row, 0) out_vals = dict(row.row) try: out_vals["freq"] = "%.2f" % (float(alt_depth) / float(ref_depth + alt_depth)) except ZeroDivisionError: out_vals["freq"] = "0.00" out_vals["filter"] = _calc_priority_filter(row, pops) if out_vals["chrom"] not in ref_chroms and _hg19_to_GRCh37(out_vals["chrom"]) in ref_chroms: out_vals["chrom"] = _hg19_to_GRCh37(out_vals["chrom"]) out = [out_vals[x] for x in header] writer.writerow(out) return vcfutils.bgzip_and_index(out_file, data["config"], tabix_args="-0 -c '#' -s 1 -b 2 -e 3")
def get_actionable_mutations(parser, args): t_n_pairs = gemini_subjects.get_families(args.db) query = "SELECT variants.chrom, start, end, ref, alt, \ variants.gene, impact, is_somatic, \ gene_summary.in_cosmic_census \ FROM variants, gene_summary \ WHERE variants.is_somatic = 1 \ AND (variants.type = 'snp' \ OR variants.type = 'indel') \ AND (variants.impact_severity = 'HIGH' \ OR variants.impact_severity = 'MED') \ AND variants.chrom = gene_summary.chrom \ AND variants.gene = gene_summary.gene \ AND gene_summary.in_cosmic_census = 1" # collect the relevant genes and query DGIDB gq = GeminiQuery.GeminiQuery(args.db) gq.run(query) genes = defaultdict() for row in gq: genes[row['gene']] = True # collect info from DGIdb dgidb_info = query_dgidb(genes) # now rerun the query and report actionable mutations per DGIDB and COSMIC census. gq = GeminiQuery.GeminiQuery(args.db) gq.run(query) print('\t'.join(['tum_name', 'chrom', 'start', 'end', 'ref', 'alt', \ 'gene', 'impact', 'is_somatic', 'in_cosmic_census', 'dgidb_info'])) for row in gq: for pair in t_n_pairs: samples = pair.subjects if len(samples) != 2: continue tumor = pair.subjects[0] normal = pair.subjects[1] # swap if we guessed the tumor incorrectly if tumor.affected is False: tumor, normal = normal, tumor print('\t'.join(str(s) for s in [tumor.name, row['chrom'], \ row['start'], row['end'], \ row['ref'], row['alt'], \ row['gene'], row['impact'], \ row['is_somatic'], \ row['in_cosmic_census'], \ str(dgidb_info[row['gene']])]))
def generate_phenotypes(database):
    query = GeminiQuery(database)
    query_string = "SELECT name, phenotype FROM samples"
    phenotypes = {1: list(), 2: list()}
    query.run(query_string)
    for row in query:
        phenotypes[int(row["phenotype"])].append(row["name"])
    return phenotypes
def get_query_results(gemini_db, query, gt_filter="", as_dataframe=False):
    """ Returns results of query. """
    gq = GeminiQuery(gemini_db)
    gq.run(query, gt_filter=gt_filter)

    if as_dataframe:
        # Return results as dataframe.
        df = pd.DataFrame([str(row).split('\t') for row in gq],
                          columns=gq.header.split('\t'))
        return df
    else:
        # Return results as iterator.
        return gq
def get_variant_type(variant_source):
    """Try to find out what type of variants exist in a variant source

    Args:
        variant_source (str): Path to variant source

    Returns:
        variant_type (str): 'sv' or 'snv'
    """
    file_type = get_file_type(variant_source)
    variant_type = 'sv'

    if file_type == 'vcf':
        variants = VCF(variant_source)
    elif file_type == 'gemini':
        variants = GeminiQuery(variant_source)
        gemini_query = "SELECT * from variants"
        variants.run(gemini_query)

    # Check the first 1000 variants; if any of them is a snv we set the
    # variant_type to 'snv'
    for i, variant in enumerate(variants):
        if file_type == 'vcf':
            if variant.is_snp:
                variant_type = 'snv'
        elif file_type == 'gemini':
            if variant['type'] == 'snp':
                variant_type = 'snv'
        if i > 1000:
            break

    return variant_type
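# Hypothetical call of get_variant_type above (not from the original source);
# the database path is a placeholder.
def example_get_variant_type():
    variant_type = get_variant_type("example_study.gemini.db")
    if variant_type == 'snv':
        print("source contains single nucleotide variants")
    else:
        print("source contains structural variants")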
def main(): """Main function which parses arguments and calls relevant functions""" # Parsing arguments arguments = parse_arguments() # Processing the presets config file presets = classes.Presets(arguments["presets_config"]) # Passing the arguments and presets to a query constructor object queryformatter = classes.QueryConstructor(arguments, presets) # Creating the gemini database object gemini_db = GeminiQuery.GeminiQuery(arguments["input"]) # Calling relevant function depending on the chosen mode if arguments["mode"] == "sample": output_table = get_sample_variants(gemini_db, arguments, queryformatter) with open(arguments["output"], 'w') as outputfile: outputfile.write('\n'.join(output_table)) elif arguments["mode"] == "variant": output_table = get_variant_information(gemini_db, arguments, queryformatter) with open(arguments["output"], 'w') as outputfile: outputfile.write('\n'.join(output_table)) elif arguments["mode"] == "table": output_table = get_table(gemini_db, arguments, queryformatter) with open(arguments["output"], 'w') as outputfile: outputfile.write('\n'.join(output_table)) elif arguments["mode"] == "info": print_comprehension = [ print(field) for field in get_fields(gemini_db).split('\t') ]
def _variants(self, case_id, gemini_query, filters=None): """Return variants found in the gemini database Args: case_id (str): The case for which we want to see information gemini_query (str): What variants should be chosen filters (dict): A dictionary with filters Yields: variant_obj (dict): A Variant formatted dictionary """ gq = GeminiQuery(self.db) gq.run(gemini_query) individuals = [] # Get the individuals for the case case = self.case(case_id) for individual in case.individuals: individuals.append(individual) index = 0 for gemini_variant in gq: # Check if variant is non ref in the individuals variant = None if self.variant_type == 'sv': index += 1 variant = self._format_sv_variants( gemini_variant=gemini_variant, index=index, filters=filters ) else: if self._is_variant(gemini_variant, individuals): index += 1 logger.debug("Updating index to: {0}".format(index)) variant = self._format_variants( gemini_variant=gemini_variant, index=index, filters=filters ) if variant: yield variant
def csv_to_query_string(csv_handle, db):
    query_list = []
    count = 0
    for line in csv_handle:
        chrom, pos, ref, alt = line.strip().split(",")
        query_list.append("(chrom='{chrom}' and start={pos})".format(**locals()))
        count += 1
        if count % 100 == 0:
            gq = GeminiQuery(db)
            regions = " or ".join(query_list)
            query = "select * from variants where %s" % regions
            gq.run(query)
            if count == 100:
                print gq.header
            for row in gq:
                print row
            query_list = []
def query_json():
    query = request.GET.get('query', '').strip()

    gq = GeminiQuery.GeminiQuery(database)
    gq._set_gemini_browser(True)
    gq.run(query)

    return {'gemini_results': [dict(row) for row in gq]}
def _variants(self, case_id, gemini_query): """Return variants found in the gemini database Args: case_id (str): The case for which we want to see information gemini_query (str): What variants should be chosen filters (dict): A dictionary with filters Yields: variant_obj (dict): A Variant formatted dictionary """ individuals = [] # Get the individuals for the case case_obj = self.case(case_id) for individual in case_obj.individuals: individuals.append(individual) self.db = case_obj.variant_source self.variant_type = case_obj.variant_type gq = GeminiQuery(self.db) gq.run(gemini_query) index = 0 for gemini_variant in gq: variant = None # Check if variant is non ref in the individuals is_variant = self._is_variant(gemini_variant, individuals) if self.variant_type == 'snv' and not is_variant: variant = None else: index += 1 logger.debug("Updating index to: {0}".format(index)) variant = self._format_variant(case_id=case_id, gemini_variant=gemini_variant, individual_objs=individuals, index=index) if variant: yield variant
def _run_gemini_query_and_filter(db, genes): """Use the GeminiQuery API to filter results based on severity and specific annotations :param db: GEMINI database. :type db: str. :returns: tuple -- The header line for the requested columns and all rows that pass filters. """ query = "SELECT chrom, start, end, ref, alt, vcf_id, rs_ids, cosmic_ids, filter, qual, qual_depth, depth, " \ "gene, transcript, exon, codon_change, aa_change, biotype, impact, impact_so, impact_severity, " \ "aa_length, is_lof, is_conserved, pfam_domain, in_omim, clinvar_sig, clinvar_disease_name, " \ "clinvar_origin, clinvar_causal_allele, clinvar_dbsource, clinvar_dbsource_id, clinvar_on_diag_assay, " \ "rmsk, in_segdup, strand_bias, rms_map_qual, in_hom_run, num_mapq_zero, num_reads_w_dels, grc, " \ "gms_illumina, in_cse, num_alleles, allele_count, haplotype_score, is_somatic, somatic_score, " \ "aaf_esp_ea, aaf_esp_aa, aaf_esp_aa, aaf_esp_all, aaf_1kg_amr, aaf_1kg_eas, aaf_1kg_sas, aaf_1kg_afr, " \ "aaf_1kg_eur, aaf_1kg_all, aaf_exac_all, aaf_adj_exac_all, aaf_adj_exac_afr, aaf_adj_exac_amr, " \ "aaf_adj_exac_eas, aaf_adj_exac_fin, aaf_adj_exac_nfe, aaf_adj_exac_oth, aaf_adj_exac_sas, " \ "max_aaf_all, in_esp, in_1kg, in_exac FROM variants" # "(gts).(*), (gt_depths).(*), (gt_ref_depths).(*), (gt_alt_depths).(*), " \ gq = GeminiQuery(db) gq.run(query) header = gq.header passing_rows = [] print header # Filter out variants with minor allele frequencies above the threshold but # retain any that are above the threshold but in COSMIC or in ClinVar and not listed as benign. for variant_data in gq: if genes: if not gemini_interface.var_in_gene(variant_data, genes): continue # Right now removing this. Many benign and synonymous variants are in cosmic # if _var_is_in_cosmic(variant_data): # passing_rows.append(variant_data) # continue if gemini_interface.var_is_in_clinvar(variant_data): # Removed is_benign check temporarily. Some variants not annotated with up to date annotations passing_rows.append(variant_data) continue if gemini_interface.var_is_rare(variant_data): if gemini_interface.var_is_protein_effecting(variant_data): passing_rows.append(variant_data) return header, passing_rows
def variant(self, case_id, variant_id): """Return a specific variant. We solve this by building a gemini query and send it to _variants Args: case_id (str): Path to a gemini database variant_id (int): A gemini variant id Returns: variant_obj (dict): A puzzle variant """ #Use the gemini id for fast lookup variant_id = int(variant_id) gemini_query = "SELECT * from variants WHERE variant_id = {0}".format( variant_id ) individuals = [] # Get the individuals for the case case_obj = self.case(case_id) for individual in case_obj.individuals: individuals.append(individual) self.db = case_obj.variant_source self.variant_type = case_obj.variant_type gq = GeminiQuery(self.db) gq.run(gemini_query) for gemini_variant in gq: variant = self._format_variant( case_id=case_id, gemini_variant=gemini_variant, individual_objs=individuals, index=gemini_variant['variant_id'], add_all_info = True ) return variant return None
def identify_low_complexity(name, in_vcf, in_bed):
    gms_thresh = 50.0
    subset_vcf = subset_by_region(name, in_vcf, in_bed)
    gemini_db = create_gemini_db(subset_vcf)
    print name
    gq = GeminiQuery(gemini_db)
    gq.run("SELECT count(*) from variants")
    total = list(gq)[0]["count(*)"]
    gq = GeminiQuery(gemini_db)
    gq.run("SELECT count(*) from variants WHERE gms_illumina < %s OR "
           "gms_solid < %s OR gms_iontorrent < %s" % (gms_thresh, gms_thresh, gms_thresh))
    low_gms = list(gq)[0]["count(*)"]
    print low_gms, total, "%.4f" % (float(low_gms) / float(total) * 100.0)
def summarize_by_gene_and_sample(db, coding_only=True): "This is copied from GEMINI's own burden tool" query = ("select chrom, start, end, gene, impact, info from variants where" " impact != 'synonymous_coding' and in_1kg=0 ") if coding_only: query += " and codon_change != 'None'" gq = GeminiQuery(db) gq.run(query, show_variant_samples=True) burden = defaultdict(Counter) for row in gq: gene_name = row['gene'] if not gene_name: gene_name = get_nearby_gene(row["chrom"], row["start"], row["end"]) new_counts = Counter(row["HET_samples"]) # Counter can't do scalar multiplication new_counts = new_counts + Counter(row["HOM_ALT_samples"]) new_counts = new_counts + Counter(row["HOM_ALT_samples"]) del new_counts[''] burden[gene_name] += new_counts dfs = list() for gene_name, counts in burden.items(): df = pd.DataFrame(counts, columns=counts.keys(), index=[gene_name]) dfs.append(df) df = pd.concat(dfs) df = df.fillna(0) return df
def summarize_gene_region(args, gene): gq = GeminiQuery(args.geminidb) gq.run("SELECT chrom, start, end, ref, alt, type, " "num_hom_ref, num_het, num_hom_alt, " "aaf_1kg_all, " "gene, impact, impact_severity, aa_change, clinvar_sig, " "grc, gms_illumina, in_cse, rmsk " "FROM variants WHERE " "chrom == '{chrom}' AND gene == '{gene}' AND filter is NULL " "ORDER BY start".format(chrom=args.chrom, gene=gene)) for row in gq: if row["impact_severity"] not in ["LOW"]: print row var_depths = [] novar_depths = [] for i, (gt, gt_type, gt_depth) in enumerate( zip(row.gts, row.gt_types, row.gt_depths)): if gt_type > 0: print " ", gq.index2sample[i], gt, gt_depth elif row["type"] == "indel": print row["chrom"], row["start"], row["ref"], row["alt"]
def variant(self, case_id, variant_id): """Return a specific variant. We solve this by building a gemini query and send it to _variants Args: case_id (str): Path to a gemini database variant_id (int): A gemini variant id Returns: variant_obj (dict): A puzzle variant """ variant_id = int(variant_id) gemini_query = "SELECT * from variants WHERE variant_id = {0}".format( variant_id ) individuals = [] # Get the individuals for the case for case in self.cases(): if case['name'] == case_id: for individual in case['individuals']: individuals.append(individual) gq = GeminiQuery(self.db) gq.run(gemini_query) for gemini_variant in gq: variant = self._format_variant( gemini_variant=gemini_variant, individual_objs=individuals, index=gemini_variant['variant_id'] ) return variant return None
def variant(self, case_id, variant_id): """Return a specific variant. We solve this by building a gemini query and send it to _variants Args: case_id (str): Path to a gemini database variant_id (int): A gemini variant id Returns: variant_obj (dict): A puzzle variant """ #Use the gemini id for fast lookup variant_id = int(variant_id) gemini_query = "SELECT * from variants WHERE variant_id = {0}".format( variant_id) individuals = [] # Get the individuals for the case case_obj = self.case(case_id) for individual in case_obj.individuals: individuals.append(individual) self.db = case_obj.variant_source self.variant_type = case_obj.variant_type gq = GeminiQuery(self.db) gq.run(gemini_query) for gemini_variant in gq: variant = self._format_variant(case_id=case_id, gemini_variant=gemini_variant, individual_objs=individuals, index=gemini_variant['variant_id'], add_all_info=True) return variant return None
def _variants(self, case_id, gemini_query):
    """Return variants found in the gemini database

    Args:
        case_id (str): The case for which we want to see information
        gemini_query (str): What variants should be chosen

    Yields:
        variant_obj (dict): A Variant formatted dictionary
    """
    gq = GeminiQuery(self.db)
    gq.run(gemini_query)

    individuals = []
    # Get the individuals for the case
    for case in self.cases():
        if case.name == case_id:
            for individual in case.individuals:
                individuals.append(individual)

    indexes = [individual.index for individual in individuals]

    index = 0
    for gemini_variant in gq:
        # Check if variant is non ref in the individuals
        if self._is_variant(gemini_variant, indexes):
            index += 1
            logger.debug("Updating index to: {0}".format(index))
            variant = self._format_variant(
                gemini_variant=gemini_variant,
                individual_objs=individuals,
                index=index
            )

            yield variant
def family_wise_predicate(args): formatter = select_formatter(args) families = get_family_dict(args) gq = GeminiQuery.GeminiQuery(args.db, out_format=formatter) predicates = [] for f in families.values(): family_names = [x.name for x in f] subjects = get_subjects_in_family(args, f).values() predicates.append( select_subjects_predicate(subjects, args, family_names)) def predicate(row): return sum([p(row) for p in predicates]) >= args.min_kindreds return predicate
def gemini_query(db): """Execute a GEMINI Query :param db: A GEMINI database name :type db: str :returns: GeminiRow Results -- True or False. """ query = "SELECT chrom, start, end, ref, alt, vcf_id, rs_ids, cosmic_ids, filter, qual, qual_depth, depth, " \ "type, sub_type, " \ "gene, transcript, exon, codon_change, aa_change, biotype, impact, impact_so, impact_severity, " \ "aa_length, is_lof, is_conserved, pfam_domain, in_omim, clinvar_sig, clinvar_disease_name, " \ "is_exonic, is_coding, is_splicing, " \ "clinvar_origin, clinvar_causal_allele, clinvar_dbsource, clinvar_dbsource_id, " \ "clinvar_on_diag_assay, rmsk, in_segdup, strand_bias, rms_map_qual, in_hom_run, num_mapq_zero, " \ "num_reads_w_dels, grc, gms_illumina, in_cse, num_alleles, allele_count, haplotype_score, " \ "is_somatic, somatic_score, aaf_esp_ea, aaf_esp_aa, aaf_esp_all, aaf_1kg_amr, " \ "aaf_1kg_eas, aaf_1kg_sas, aaf_1kg_afr, aaf_1kg_eur, aaf_1kg_all, aaf_exac_all, aaf_adj_exac_all, " \ "aaf_adj_exac_afr, aaf_adj_exac_amr, aaf_adj_exac_eas, aaf_adj_exac_fin, aaf_adj_exac_nfe, " \ "aaf_adj_exac_oth, aaf_adj_exac_sas, max_aaf_all, in_esp, in_1kg, in_exac, info," \ "(gts).(*), (gt_depths).(*), (gt_ref_depths).(*), (gt_alt_depths).(*) FROM variants" gq = GeminiQuery(db) gq.run(query) return gq
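# Hedged sketch (not from the original source) of consuming the GeminiQuery
# object returned by gemini_query above: print the header once, then iterate
# the rows. "example.db" is a placeholder path.
def example_consume_gemini_query():
    gq = gemini_query("example.db")
    print(gq.header)
    for row in gq:
        print(row)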
def stats_region(chrom):
    # Note: chrom is given as an argument;
    # we then extract start and end using HTML GET
    start = request.GET.get('start', '').strip()
    end = request.GET.get('end', '').strip()

    # construct a query
    query = "SELECT start, end from variants"
    query += " WHERE chrom = '" + chrom + "'"
    query += " AND start >= " + start
    query += " AND end <= " + end

    # issue the query
    gq = GeminiQuery.GeminiQuery(database)
    gq._set_gemini_browser(True)
    gq.run(query)

    # return query results in JSON format
    return {'features': [dict(row) for row in gq]}
def phase_genotypes(database):
    gq = GeminiQuery(database)
    families = subjects.get_families(database)
    gq.run("select chrom, start, end, ref, alt, gene, impact, gts, gt_types, gt_ref_depths, gt_alt_depths from variants")
    s2i = gq.sample_to_idx
    for row in gq:
        mendelian = ""
        phasable = ""
        inheritance = ""
        origin = ""
        phasedata = ""
        chrom = str(row['chrom'])
        start = str(row['start'])
        end = str(row['end'])
        ref = str(row['ref'])
        alt = str(row['alt'])
        gene = str(row['gene'])
        impact = str(row['impact'])
        for family in families:
            dad_idx = s2i[family.father_name]
            mom_idx = s2i[family.mother_name]
            dad_gt = str(row['gts'][dad_idx])
            mom_gt = str(row['gts'][mom_idx])
            dad_gt_type = row['gt_types'][dad_idx]
            mom_gt_type = row['gt_types'][mom_idx]
            dad_gt_ref_depths = str(row['gt_ref_depths'][dad_idx])
            mom_gt_ref_depths = str(row['gt_ref_depths'][mom_idx])
            dad_gt_alt_depths = str(row['gt_alt_depths'][dad_idx])
            mom_gt_alt_depths = str(row['gt_alt_depths'][mom_idx])
            #m5=re.search('((?:\w*|\.*))/((?:\w*|\.*))',dad_gt)
            m5 = string.split(dad_gt, "/")
            #m6=re.search('((?:\w*|\.*))/((?:\w*|\.*))',mom_gt)
            m6 = string.split(mom_gt, "/")
            for child in family.children:
                kid_idx = s2i[str(child.name)]
                kid_gt = str(row['gts'][kid_idx])
                kid_gt_type = row['gt_types'][kid_idx]
                kid_gt_ref_depths = str(row['gt_ref_depths'][kid_idx])
                kid_gt_alt_depths = str(row['gt_alt_depths'][kid_idx])
                if kid_gt_type == 0 and dad_gt_type == 1 and mom_gt_type == 1 or kid_gt_type == 3 and dad_gt_type == 1 and mom_gt_type == 1:
                    mendelian = "mendelian"
                    phasable = "unphasable"
                    inheritance = "unknown"
                    origin = "inherited from both parents"
                elif kid_gt_type == 0 and dad_gt_type == 0 and mom_gt_type == 0 or kid_gt_type == 1 and dad_gt_type == 1 and mom_gt_type == 1 or kid_gt_type == 3 and dad_gt_type == 3 and mom_gt_type == 3:
                    mendelian = "mendelian"
                    phasable = "unphasable"
                    inheritance = "unknown"
                    origin = "inherited from both parents"
                elif kid_gt_type == 2 or dad_gt_type == 2 or mom_gt_type == 2:
                    mendelian = "missing allele - unknown"
                    phasable = "missing allele - unknown"
                    inheritance = "missing allele - unknown"
                    origin = "missing allele - unknown"
                elif kid_gt_type == 1 and dad_gt_type == 0 and mom_gt_type == 0 or kid_gt_type == 1 and dad_gt_type == 3 and mom_gt_type == 3:
                    mendelian = "non-mendelian"
                    phasable = "unphasable"
                    inheritance = "unknown"
                    origin = "de novo or erroneous data"
                elif kid_gt_type == 0 and dad_gt_type == 3 and mom_gt_type == 3 or kid_gt_type == 3 and dad_gt_type == 0 and mom_gt_type == 0:
                    mendelian = "non-mendelian"
                    phasable = "unphasable"
                    inheritance = "unknown"
                    origin = "extremely rare de novo or bad data"
                elif kid_gt_type == 0 and dad_gt_type == 0 and mom_gt_type == 1 or kid_gt_type == 0 and dad_gt_type == 1 and mom_gt_type == 0 or kid_gt_type == 3 and dad_gt_type == 3 and mom_gt_type == 1 or kid_gt_type == 3 and dad_gt_type == 1 and mom_gt_type == 3:
                    mendelian = "mendelian"
                    phasable = "unphasable"
                    inheritance = "unknown"
                    origin = "inherited from both parents or unlikely de novo"
                elif kid_gt_type == 1 and dad_gt_type == 3 and mom_gt_type == 1:
                    mendelian = "mendelian"
                    phasable = "phasable"
                    inheritance = "homozygous alternate from dad, heterozygous allele from mom"
                    origin = "inherited from both parents or unlikely de novo"
                    ct = 0
                    if m5[0] == m6[ct]:
                        phasedata = m5[0] + " from dad " + m6[ct + 1] + " from mom"
                    else:
                        phasedata = m5[0] + " from dad " + m6[ct] + " from mom"
                elif kid_gt_type == 1 and dad_gt_type == 0 and mom_gt_type == 1:
                    mendelian = "mendelian"
                    phasable = "phasable"
                    inheritance = "homozygous reference from dad_gt_type, heterozygous allele from mom_gt_type"
                    origin = "inherited from both parents or unlikely de novo"
                    ct = 0
                    if m5[0] == m6[ct]:
                        phasedata = m5[0] + " from dad " + m6[ct + 1] + " from mom"
                    else:
                        phasedata = m5[0] + " from dad " + m6[ct] + " from mom"
                elif kid_gt_type == 1 and dad_gt_type == 1 and mom_gt_type == 3:
                    mendelian = "mendelian"
                    phasable = "phasable"
                    inheritance = "heterozygous allele from dad_gt_type, homozygous alternate from mom_gt_type"
                    origin = "inherited from both parents or unlikely de novo"
                    ct = 0
                    if m5[ct] == m6[0]:
                        phasedata = m5[ct + 1] + " from dad " + m6[0] + " from mom"
                    else:
                        phasedata = m5[ct] + " from dad " + m6[0] + " from mom"
                elif kid_gt_type == 1 and dad_gt_type == 1 and mom_gt_type == 0:
                    mendelian = "mendelian"
                    phasable = "phasable"
                    inheritance = "heterozygous allele from dad_gt_type, homozygous reference from mom_gt_type"
                    origin = "inherited from both parents or unlikely de novo"
                    ct = 0
                    if m5[ct] == m6[0]:
                        phasedata = m5[ct + 1] + " from dad " + m6[0] + " from mom"
                    else:
                        phasedata = m5[ct] + " from dad " + m6[0] + " from mom"
                elif kid_gt_type == 0 and dad_gt_type == 1 and mom_gt_type == 3 or kid_gt_type == 0 and dad_gt_type == 3 and mom_gt_type == 1 or kid_gt_type == 3 and dad_gt_type == 0 and mom_gt_type == 1 or kid_gt_type == 3 and dad_gt_type == 1 and mom_gt_type == 0:
                    mendelian = "non-mendelian"
                    phasable = "unphasable"
                    inheritance = "unknown"
                    origin = "de novo or erroneous data"
                elif kid_gt_type == 3 and dad_gt_type == 3 and mom_gt_type == 0 or kid_gt_type == 0 and dad_gt_type == 0 and mom_gt_type == 3:
                    mendelian = "non-mendelian"
                    phasable = "unphasable"
                    inheritance = "unknown"
                    origin = "de novo or erroneous data"
                elif kid_gt_type == 1 and dad_gt_type == 3 and mom_gt_type == 0:
                    mendelian = "mendelian"
                    phasable = "phasable"
                    inheritance = "homozygous alternate from dad_gt_type, homozygous reference from mom_gt_type"
                    origin = "inherited from both parents or unlikely de novo"
                    phasedata = m5[0] + " from dad " + m6[0] + " from mom"
                elif kid_gt_type == 1 and dad_gt_type == 0 and mom_gt_type == 3:
                    mendelian = "mendelian"
                    phasable = "phasable"
                    inheritance = "homozygous reference from dad_gt_type, homozygous alternate from mom_gt_type"
                    origin = "inherited from both parents or unlikely de novo"
                    phasedata = m5[0] + " from dad " + m6[0] + " from mom"
                elif kid_gt_type == 0 and dad_gt_type == 3 and mom_gt_type == 0 or kid_gt_type == 3 and dad_gt_type == 0 and mom_gt_type == 3:
                    mendelian = "non-mendelian"
                    phasable = "unphasable"
                    inheritance = "unknown"
                    origin = "de novo or erroneous data"
                try:
                    print chrom + " " + start + " " + end + " " + family.family_id + " " + child.name + " " + ref + " " + alt + " " + gene + " " + impact + " " + kid_gt_ref_depths + " " + dad_gt_ref_depths + " " + mom_gt_ref_depths + " " + kid_gt_alt_depths + " " + dad_gt_alt_depths + " " + mom_gt_alt_depths + " " + kid_gt + " " + dad_gt + " " + mom_gt + " " + str(kid_gt_type) + " " + str(dad_gt_type) + " " + str(mom_gt_type) + " " + mendelian + " " + phasable + " " + inheritance + " " + origin + " " + phasedata
                except TypeError:
                    print chrom + " " + start + " " + end + " " + family.family_id + " " + child.name + " " + ref + " " + alt + " " + gene + " " + impact + " " + kid_gt_ref_depths + " " + dad_gt_ref_depths + " " + mom_gt_ref_depths + " " + kid_gt_alt_depths + " " + dad_gt_alt_depths + " " + mom_gt_alt_depths + " " + kid_gt + " " + dad_gt + " " + mom_gt + " " + str(kid_gt_type) + " " + str(dad_gt_type) + " " + str(mom_gt_type) + " " + mendelian + " " + phasable + " " + inheritance + " " + origin
def get_impact_gene(args):
    gq = GeminiQuery(args.geminidb)
    gq.run("SELECT gene FROM variants WHERE chrom == '{chrom}' AND start == {pos}"
           .format(chrom=args.chrom, pos=args.pos - 1))
    return gq.next()["gene"]
#!/usr/share/gemini/anaconda/bin/python -E # -*- coding: utf-8 -*- # usage: gemini_summarize.py <query> <gemini.db> import sys import locale from gemini import GeminiQuery DP_THRESHOLD = 8 GQ_THRESHOLD = 20 reload(sys) sys.setdefaultencoding(locale.getpreferredencoding()) gq = GeminiQuery(sys.argv[2], include_gt_cols=True) gq.run(sys.argv[1], None, True) header_printed = False genotype_columns = [ 'gt_depths', 'gt_ref_depths', 'gt_alt_depths', 'gts', 'gt_quals' ] for row in gq: columns = row.row.keys()[:-11] if (not header_printed): # Print file header print '\t'.join(['SAMPLE_ID'] + [s[:-1] for s in genotype_columns] + columns) header_printed = True # Output only het & hom alt variants for sample in row['variant_samples']:
#!/usr/share/gemini/anaconda/bin/python -E # -*- coding: utf-8 -*- # usage: gemini_summarize.py <query> <gemini.db> import sys import locale from gemini import GeminiQuery DP_THRESHOLD = 8 GQ_THRESHOLD = 20 reload(sys) sys.setdefaultencoding(locale.getpreferredencoding()) gq = GeminiQuery(sys.argv[2], include_gt_cols=True) gq.run(sys.argv[1], None, True) header_printed = False genotype_columns = ["gt_depths", "gt_ref_depths", "gt_alt_depths", "gts", "gt_quals"] for row in gq: columns = row.row.keys()[:-3] if not header_printed: # Print file header print "\t".join(["SAMPLE_ID"] + [s[:-1] for s in genotype_columns] + columns) header_printed = True # Output only het & hom alt variants for sample in row.variant_samples: # Drop low depth and low quality variants if ( row["gt_depths"][gq.sample2index[sample]] < DP_THRESHOLD
def get_individuals(variant_source, case_lines=None, case_type='ped', variant_mode='vcf'): """Get the individuals from a vcf file, gemini database, and/or a ped file. Args: variant_source (str): Path to a variant source case_lines(Iterable): Ped like lines case_type(str): Format of ped lines Returns: individuals (generator): generator with Individuals """ individuals = [] ind_dict ={} if variant_mode == 'vcf': head = get_header(variant_source) #Dictionary with ind_id:index where index show where in vcf ind info is for index, ind in enumerate(head.individuals): ind_dict[ind] = index if case_lines: # read individuals from ped file family_parser = FamilyParser(case_lines, family_type=case_type) families = family_parser.families logger.debug("Found families {0}".format( ','.join(list(families.keys())))) if len(families) != 1: logger.error("Only one family can be used with vcf adapter") raise IOError case_id = list(families.keys())[0] logger.debug("Family used in analysis: {0}".format(case_id)) for ind_id in family_parser.individuals: ind = family_parser.individuals[ind_id] logger.info("Found individual {0}".format(ind.individual_id)) try: individual = Individual( ind_id=ind_id, case_id=case_id, mother=ind.mother, father=ind.father, sex=str(ind.sex), phenotype=str(ind.phenotype), variant_source=variant_source, ind_index=ind_dict[ind_id], ) individuals.append(individual) except KeyError as err: #This is the case when individuals in ped does not exist #in vcf raise PedigreeError( family_id=case_id, individual_id=ind_id, message="Individual {0} exists in ped file but not in vcf".format(ind_id) ) else: case_id = os.path.basename(variant_source) for ind in ind_dict: individual = Individual( ind_id=ind, case_id=case_id, variant_source=variant_source, ind_index=ind_dict[ind] ) individuals.append(individual) logger.debug("Found individual {0} in {1}".format( ind, variant_source)) elif variant_mode == 'gemini': gq = GeminiQuery(variant_source) #Dictionaru with sample to index in the gemini database ind_dict = gq.sample_to_idx query = "SELECT * from samples" gq.run(query) for individual in gq: logger.debug("Found individual {0} with family id {1}".format( individual['name'], individual['family_id'])) individuals.append( Individual( ind_id=individual['name'], case_id=individual['family_id'], mother=individual['maternal_id'], father=individual['paternal_id'], sex=individual['sex'], phenotype=individual['phenotype'], ind_index=ind_dict.get(individual['name']), variant_source=variant_source, bam_path=None) ) return individuals
def find_de_novo():
    # defines input arguments
    parser = argparse.ArgumentParser(description='Finds de novos')
    parser.add_argument('-f', '--input_file', default='',
                        help='The input file; should be a SQLite .db that gemini can read')
    parser.add_argument('-p', '--min_total_parent_depth', default='0',
                        help='The minimum total read depth for parental alleles for variants to be considered')
    parser.add_argument('-c', '--min_total_child_depth', default='0',
                        help='The minimum total read depth for child alleles for variants to be considered')
    parser.add_argument('-m', '--max_alt_parent_depth', default='0',
                        help='The maximum alternate read depth for parental alleles for de novos to be considered')
    parser.add_argument('-a', '--max_alt_child_depth', default='0',
                        help='The maximum alternate read depth for child alleles for de novos to be considered')

    # checks minimum number of arguments
    if len(sys.argv) < 1:
        parser.print_help()
        sys.exit("Where is the input file?")

    # parses arguments
    args = parser.parse_args()
    database = args.input_file
    mtpd = int(args.min_total_parent_depth)
    mtcd = int(args.min_total_child_depth)
    mapd = int(args.max_alt_parent_depth)
    macd = int(args.max_alt_child_depth)
    if database == '':
        sys.exit("You must supply an input file")

    gq = GeminiQuery(database)
    families = subjects.get_families(database)
    gq.run("select chrom, start, end, ref, alt, gene, impact, gts, gt_types, gt_ref_depths, gt_alt_depths from variants")
    s2i = gq.sample_to_idx
    for row in gq:
        mendelian = ""
        phasable = ""
        inheritance = ""
        origin = ""
        phasedata = ""
        chrom = str(row['chrom'])
        start = str(row['start'])
        end = str(row['end'])
        ref = str(row['ref'])
        alt = str(row['alt'])
        gene = str(row['gene'])
        impact = str(row['impact'])
        for family in families:
            dad_idx = s2i[family.father_name]
            mom_idx = s2i[family.mother_name]
            dad_gt = str(row['gts'][dad_idx])
            mom_gt = str(row['gts'][mom_idx])
            dad_gt_type = row['gt_types'][dad_idx]
            mom_gt_type = row['gt_types'][mom_idx]
            dad_gt_ref_depths = str(row['gt_ref_depths'][dad_idx])
            mom_gt_ref_depths = str(row['gt_ref_depths'][mom_idx])
            dad_gt_alt_depths = str(row['gt_alt_depths'][dad_idx])
            mom_gt_alt_depths = str(row['gt_alt_depths'][mom_idx])
            #m5=re.search('((?:\w*|\.*))/((?:\w*|\.*))',dad_gt)
            m5 = string.split(dad_gt, "/")
            #m6=re.search('((?:\w*|\.*))/((?:\w*|\.*))',mom_gt)
            m6 = string.split(mom_gt, "/")
            for child in family.children:
                kid_idx = s2i[str(child.name)]
                kid_gt = str(row['gts'][kid_idx])
                kid_gt_type = row['gt_types'][kid_idx]
                kid_gt_ref_depths = str(row['gt_ref_depths'][kid_idx])
                kid_gt_alt_depths = str(row['gt_alt_depths'][kid_idx])
                # code for removing variants that do not meet parameters
                if int(dad_gt_ref_depths) + int(dad_gt_alt_depths) < mtpd or int(mom_gt_ref_depths) + int(mom_gt_alt_depths) < mtpd \
                        or int(kid_gt_ref_depths) + int(kid_gt_alt_depths) < mtcd \
                        or int(dad_gt_alt_depths) > mapd or int(mom_gt_alt_depths) > mapd \
                        or int(kid_gt_alt_depths) > macd:
                    continue
                if kid_gt_type == 2 or dad_gt_type == 2 or mom_gt_type == 2:
                    continue
                elif kid_gt_type == 1 and dad_gt_type == 0 and mom_gt_type == 0 or kid_gt_type == 1 and dad_gt_type == 3 and mom_gt_type == 3:
                    mendelian = "non-mendelian"
                    phasable = "unphasable"
                    inheritance = "unknown"
                    origin = "de novo or erroneous data"
                elif kid_gt_type == 0 and dad_gt_type == 3 and mom_gt_type == 3 or kid_gt_type == 3 and dad_gt_type == 0 and mom_gt_type == 0:
                    mendelian = "non-mendelian"
                    phasable = "unphasable"
                    inheritance = "unknown"
                    origin = "extremely rare de novo or bad data"
                elif kid_gt_type == 0 and dad_gt_type == 1 and mom_gt_type == 3 or kid_gt_type == 0 and dad_gt_type == 3 and mom_gt_type == 1 or kid_gt_type == 3 and dad_gt_type == 0 and mom_gt_type == 1 or kid_gt_type == 3 and dad_gt_type == 1 and mom_gt_type == 0:
                    mendelian = "non-mendelian"
                    phasable = "unphasable"
                    inheritance = "unknown"
                    origin = "de novo or erroneous data"
                elif kid_gt_type == 3 and dad_gt_type == 3 and mom_gt_type == 0 or kid_gt_type == 0 and dad_gt_type == 0 and mom_gt_type == 3:
                    mendelian = "non-mendelian"
                    phasable = "unphasable"
                    inheritance = "unknown"
                    origin = "de novo or erroneous data"
                elif kid_gt_type == 0 and dad_gt_type == 3 and mom_gt_type == 0 or kid_gt_type == 3 and dad_gt_type == 0 and mom_gt_type == 3:
                    mendelian = "non-mendelian"
                    phasable = "unphasable"
                    inheritance = "unknown"
                    origin = "de novo or erroneous data"
                else:
                    continue
                print chrom + " " + start + " " + end + " " + family.family_id + " " + child.name + " " + ref + " " + alt + " " + gene + " " + impact + " " + kid_gt_ref_depths + " " + dad_gt_ref_depths + " " + mom_gt_ref_depths + " " + kid_gt_alt_depths + " " + dad_gt_alt_depths + " " + mom_gt_alt_depths + " " + kid_gt + " " + dad_gt + " " + mom_gt + " " + str(kid_gt_type) + " " + str(dad_gt_type) + " " + str(mom_gt_type) + " " + mendelian + " " + phasable + " " + inheritance + " " + origin
def run_gemini_query(self, id, query, genotype_filter, json_filename, mode, results_string, user_id): json_results_fh = os.path.join(STATIC_FOLDER, json_filename) results_file = "/static/%s" % json_filename self.update_state(state='PROGRESS', meta={'status': 'Setup'}) sys.stderr.write("DEBUG: Retrieving GEMINI db object\n") gdb = models.GDatabase.objects.get(id = ObjectId(id)) sys.stderr.write("DEBUG: Setup Query Object\n") gq = GeminiQuery(gdb.file, out_format=JSONRowFormat(None)) self.update_state(state='PROGRESS', meta={'status': 'Running Query'}) sys.stderr.write("DEBUG: Run GEMINi Query\n") gq.run(query, genotype_filter) sys.stderr.write("DEBUG: Getting header\n") header = gq.header js_header = [] for key in header: string = key.replace('.', '\\\\.') js_header.append(string) #The json result file is a unique name generated from the database name, query, and genotype_filter #If the json results file already exists we save some time by skipping generating the file. #We only re-executed the query to get the header object. count1 = 0 #rows = [] sys.stderr.write("DEBUG: Checking if file exists\n") self.update_state(state='PROGRESS', meta={'status': 'Checking for Existence of File'}) if not os.path.isfile(json_results_fh): sys.stderr.write("DEBUG: Opening results file\n") self.update_state(state='PROGRESS', meta={'status': 'Writing data to JSON file'}) with open(json_results_fh, "wb") as file: count = 0 file.write("""{\n"data": [\n""") for row in gq: #rows.append(row) if count == 0: file.write("%s" % row) else: file.write(",\n%s" % row) count += 1 file.write("""\n]\n}\n""") self.update_state(state='PROGRESS', meta={'status': 'File Writing Complete'}) sys.stderr.write("DEBUG: Done writing results file\n") #Save Results to database self.update_state(state='PROGRESS', meta={'status': 'Sending Results to Database'}) sys.stderr.write("DEBUG: Saving results to database\n") sys.stderr.write("DEBUG: Fetching user\n") user = models.User.objects.get(id=user_id) result_elements = results_string.split('_') sys.stderr.write("DEBUG: Creating result object\n") r = models.GResult(header = header, js_header = js_header, query = query, query_slug = result_elements[3], created_on = datetime.datetime.now, created_by = user, last_accessed = datetime.datetime.now) sys.stderr.write("DEBUG: JSON Opening file\n") file = open(json_results_fh, 'rb') sys.stderr.write("DEBUG: Adding file\n") r.json.put(file, content_type = 'application/json') sys.stderr.write("DEBUG: Saving results\n") r.save() sys.stderr.write("DEBUG: Appending results to GEMINI database entry\n") gdb.results.append(r) sys.stderr.write("DEBUG: Saving\n") gdb.save() self.update_state(state='PROGRESS', meta={'status': 'Complete'}) sys.stderr.write("DEBUG: Returning Results\n") self.update_state(state='SUCCESS', meta={'status': 'Results completed'}) return (header, js_header, results_file, gdb.file, query, genotype_filter, results_string, json_results_fh)
def extract_shared_mutations(database, reference_file=None): phenotypes = generate_phenotypes(database) query = GeminiQuery(database) query_string = ("SELECT chrom, start, end, gene, ref, alt, type, sub_type," "impact, codon_change, aa_change, vcf_id, cosmic_ids" " FROM variants WHERE in_1kg=0") query.run(query_string, show_variant_samples=True) rows = list() for row in query: variants = row.variant_samples if any(item in phenotypes[1] for item in variants) and any( item in phenotypes[2] for item in variants): valid_groups = list() chrom, start, end, alt = (row["chrom"], row["start"], row["end"], row["alt"]) # In the case of intergenic regions, get the name of the # closest gene if row["gene"] is None: gene = get_nearby_gene(chrom, start, end) # print "None subsituted with", gene else: gene = row["gene"] for gid, group in groupby(variants, lambda x: x.split("_")[1]): # Rename according to Pandora guidelines # Starts with 0: 1 + number # 3 digits: 20 + number # 4 digits: 2 + number if len(gid) < 5: newgid = "2" + gid if len(gid) == 4 else "20" + gid else: newgid = gid group = list(group) if len(list(group)) == 2: # Check if we have different ALT bases for the samples # in the same pair. If this occurs, it is a false positive # and should be discarded. To do so, we need a VCF file to # query by base, otherwise we take the value as-is. if reference_file is not None: alt = check_multiple_alts(chrom, start, end, alt, group, reference_file) if alt is None: # Biallelic site for pair - discard continue valid_groups.append(newgid) cosmic_data = "Yes" if row["cosmic_ids"] else "No" data = [chrom, start, end, gene, row["ref"], alt, row["type"], row["sub_type"], row["impact"], row["codon_change"], row["aa_change"], row["vcf_id"], cosmic_data] if not valid_groups: rows.append(data + [np.nan]) else: for gid in valid_groups: rows.append(data + [gid]) colnames = ["chrom", "start", "end", "gene", "ref", "alt", "type", "sub_type", "impact", "codon_change", "aa_change", "dbsnp_id", "in_cosmic", "variants_with_pairs"] df = pd.DataFrame.from_records(rows, columns=colnames) df.set_index(["chrom", "start", "end", "gene", "ref", "alt"], inplace=True) # Get rid of loci without pairs df = df.dropna(subset=["variants_with_pairs"]) return df
def test_gemini_db(self):
    """Check if self.db is a valid gemini database

    Raises sqlite3.DatabaseError if not a valid database
    """
    gq = GeminiQuery(self.db)
    return True
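# Hedged usage sketch (not in the original source): how test_gemini_db might
# be wrapped to distinguish a valid GEMINI/SQLite database from an invalid
# file. "adapter" is a hypothetical object exposing the method above;
# sqlite3.DatabaseError is the exception the docstring refers to.
import sqlite3

def is_valid_gemini_db(adapter):
    try:
        return adapter.test_gemini_db()
    except sqlite3.DatabaseError:
        return False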
def query(): def _get_fields(): query = request.GET.get('query', '').strip() gt_filter = request.GET.get('gt_filter', '').strip() use_header = request.GET.get('use_header') igv_links = request.GET.get('igv_links') return query, gt_filter or None, use_header, igv_links # user clicked the "submit" button if request.GET.get('submit', '').strip(): (query, gt_filter, use_header, igv_links) = _get_fields() if use_header: use_header = True if igv_links: igv_links = True gq = GeminiQuery.GeminiQuery(database) gq._set_gemini_browser(True) gq.run(query, gt_filter) if len(query) == 0: return template('query.j2', dbfile=database) if igv_links and ('chrom' not in query.lower() or 'start' not in query.lower() or 'end' not in query.lower()): return template('query.j2', dbfile=database, rows=gq, igv_links=igv_links, igv_links_error=True, use_header=use_header, gt_filter=gt_filter, query=query) else: return template('query.j2', dbfile=database, rows=gq, igv_links=igv_links, igv_links_error=False, use_header=use_header, gt_filter=gt_filter, query=query) # user clicked the "save to file" button elif request.GET.get('save', '').strip(): (query, gt_filter, use_header, igv_links) = _get_fields() gq = GeminiQuery.GeminiQuery(database) gq.run(query, gt_filter) if len(query) == 0: return template('query.j2', dbfile=database) # dump the results to a text file. this will be # stored in /static and a link will be given to # the user. tmp_file = '/tmp.txt' tmp = open(_static_folder + tmp_file, 'w') for i, row in enumerate(gq): if i == 0 and use_header: tmp.write('\t'.join([str(key) for key in row.keys()]) + '\n') tmp.write('\t'.join([str(row[key]) for key in row.keys()]) + '\n') tmp.close() return template('query.j2', dbfile=database, tmp_file=tmp_file, igv_links=igv_links, igv_links_error=True, use_header=use_header, gt_filter=gt_filter, query=query) # user did nothing. else: return template('query.j2', dbfile=database)