def get_gene_bounds(self, gene_id):
    """Return the (xpos_start, xpos_end) bounds of a gene.

    The gene record is fetched through ``self.gene_utils.get_genes``. GRCh37
    coordinates are used when the record has a truthy ``chromGrch37`` value,
    otherwise the GRCh38 coordinates are used.

    Args:
        gene_id: id of the gene to look up.

    Returns:
        A 2-tuple of xpos values produced by ``genomeloc.get_xpos`` for the
        gene start and end, or ``(None, None, None)`` when no record is found.
    """
    gene_record = self.gene_utils.get_genes([gene_id]).get(gene_id)
    if gene_record:
        # Pick the genome build whose coordinates are populated.
        if gene_record['chromGrch37']:
            build_suffix = 'Grch37'
        else:
            build_suffix = 'Grch38'

        chromosome = gene_record['chrom{}'.format(build_suffix)]
        gene_start = gene_record['start{}'.format(build_suffix)]
        gene_end = gene_record['end{}'.format(build_suffix)]

        xpos_start = genomeloc.get_xpos(chromosome, gene_start)
        xpos_end = genomeloc.get_xpos(chromosome, gene_end)
        return (xpos_start, xpos_end)

    # NOTE(review): this is a 3-tuple while the success path returns a
    # 2-tuple - confirm which arity callers actually unpack.
    return (None, None, None)
def _make_db_query(self, genotype_filter=None, variant_filter=None):
    """
    Build a coarse database query from the filters passed to get_variants.

    Caller specifies filters to get_variants, but they are evaluated later.
    Here, we just inspect those filters and see what heuristics we can apply
    to avoid a full table scan. The query built here must return a superset
    of the true get_variants results.

    Note that the full annotation isn't stored, so use the fields added by
    _add_index_fields_to_variant.

    Args:
        genotype_filter: optional genotype filter; when not None it is handed
            to _add_genotype_filter_to_variant_query together with the query.
        variant_filter: optional filter object exposing ``locations``,
            ``so_annotations``, ``genes``, ``ref_freqs`` and, optionally,
            ``exclude_genes``.

    Returns:
        dict: the query document (empty when no filter applies).
    """
    db_query = {}

    # genotype filter
    if genotype_filter is not None:
        _add_genotype_filter_to_variant_query(db_query, genotype_filter)

    if not variant_filter:
        return db_query

    if variant_filter.locations:
        location_ranges = []
        for i, location in enumerate(variant_filter.locations):
            if isinstance(location, basestring):
                # "chrom:start-end" string -> (xstart, xend); the parsed
                # form is written back onto the filter in place.
                chrom, pos_range = location.split(":")
                start, end = pos_range.split("-")
                xstart = genomeloc.get_xpos(chrom, int(start))
                xend = genomeloc.get_xpos(chrom, int(end))
                variant_filter.locations[i] = (xstart, xend)
            else:
                xstart, xend = location

            location_ranges.append({
                '$and': [
                    {'xpos': {'$gte': xstart}},
                    {'xpos': {'$lte': xend}},
                ]
            })

        db_query['$or'] = location_ranges

    if variant_filter.so_annotations:
        db_query['db_tags'] = {'$in': variant_filter.so_annotations}

    if variant_filter.genes:
        # Default to inclusion when the filter has no exclude_genes attribute
        # instead of failing with AttributeError.
        if getattr(variant_filter, 'exclude_genes', False):
            db_query['db_gene_ids'] = {'$nin': variant_filter.genes}
        else:
            db_query['db_gene_ids'] = {'$in': variant_filter.genes}

    if variant_filter.ref_freqs:
        for population, freq in variant_filter.ref_freqs:
            #if population in self._annotator.reference_population_slugs:
            db_query['db_freqs.' + population] = {'$lte': freq}

    return db_query
def _make_db_query(self, genotype_filter=None, variant_filter=None):
    """
    Build a coarse database query from the filters passed to get_variants.

    Caller specifies filters to get_variants, but they are evaluated later.
    Here, we just inspect those filters and see what heuristics we can apply
    to avoid a full table scan. The query built here must return a superset
    of the true get_variants results.

    Note that the full annotation isn't stored, so use the fields added by
    _add_index_fields_to_variant.

    Args:
        genotype_filter: optional genotype filter; when not None it is handed
            to _add_genotype_filter_to_variant_query together with the query.
        variant_filter: optional filter object exposing ``locations``,
            ``so_annotations``, ``genes``, ``ref_freqs``, ``ref_acs``,
            ``ref_hom_hemi`` and, optionally, ``exclude_genes``.

    Returns:
        dict: the query document (empty when no filter applies).
    """
    db_query = {}

    # genotype filter
    if genotype_filter is not None:
        _add_genotype_filter_to_variant_query(db_query, genotype_filter)

    if not variant_filter:
        return db_query

    logger.info(pformat(variant_filter.toJSON()))

    if variant_filter.locations:
        location_ranges = []
        for i, location in enumerate(variant_filter.locations):
            if isinstance(location, basestring):
                # "chrom:start-end" string -> (xstart, xend); the parsed
                # form is written back onto the filter in place.
                chrom, pos_range = location.split(":")
                start, end = pos_range.split("-")
                xstart = genomeloc.get_xpos(chrom, int(start))
                xend = genomeloc.get_xpos(chrom, int(end))
                variant_filter.locations[i] = (xstart, xend)
            else:
                xstart, xend = location

            location_ranges.append({
                '$and': [
                    {'xpos': {'$gte': xstart}},
                    {'xpos': {'$lte': xend}},
                ]
            })

        db_query['$or'] = location_ranges

    if variant_filter.so_annotations:
        db_query['db_tags'] = {'$in': variant_filter.so_annotations}

    if variant_filter.genes:
        # Default to inclusion when the filter has no exclude_genes attribute
        # instead of failing with AttributeError.
        if getattr(variant_filter, 'exclude_genes', False):
            db_query['db_gene_ids'] = {'$nin': variant_filter.genes}
        else:
            db_query['db_gene_ids'] = {'$in': variant_filter.genes}

    if variant_filter.ref_freqs:
        for population, freq in variant_filter.ref_freqs:
            #if population in self._annotator.reference_population_slugs:
            db_query['db_freqs.' + population] = {'$lte': freq}

    if variant_filter.ref_acs:
        for population, ac in variant_filter.ref_acs:
            db_query['db_acs.' + population] = {'$lte': ac}

    if variant_filter.ref_hom_hemi:
        # The same threshold is applied to both the hemizygous and the
        # homozygous count fields.
        for population, count in variant_filter.ref_hom_hemi:
            db_query['db_hemi.' + population] = {'$lte': count}
            db_query['db_hom.' + population] = {'$lte': count}

    return db_query
def load_from_cadd_file(cadd_file):
    """Utility function to load scores from a CADD file

    The file is read as gzip. The first two lines are skipped as headers and
    every following line is split into six tab-separated columns
    (Chrom, Pos, Ref, Alt, RawScore, PHRED). For each line, the matching
    document in ``annotator_store.variants`` (same xpos/ref/alt) that does not
    yet have ``annotation.cadd_phred`` gets that field set to the PHRED column
    exactly as read from the file. No documents are inserted (upsert=False).

    Args:
        cadd_file: path of the gzip-compressed CADD file.
    """
    # The context manager closes the gzip handle even when parsing or the
    # database update raises part-way through the file.
    with gzip.open(cadd_file) as f:
        # skip header lines
        next(f)
        next(f)

        for line in tqdm.tqdm(f):
            # Chrom Pos Ref Alt RawScore PHRED
            chrom, pos, ref, alt, _raw, phred = line.rstrip('\n').split('\t')
            xpos = genomeloc.get_xpos(chrom, int(pos))
            annotator_store.variants.update(
                {
                    'xpos': xpos,
                    'ref': ref,
                    'alt': alt,
                    # only touch variants that have no score yet
                    'annotation.cadd_phred': {'$exists': False},
                },
                {'$set': {'annotation.cadd_phred': phred}},
                upsert=False)
def add_variant_tag(row, user):
    """Apply the tag named in a spreadsheet row to the variant it describes.

    The row supplies the project link, family id, new tag name, gene symbol
    and an HGVS-like coordinate of the form ``chrom:<pos><ref>><alt>``.
    Parse failures, a missing project tag or a failed family lookup are
    printed and the function returns without changing anything.

    For the matched family and variant:
      * existing tier 1 / tier 2 / "known gene for phenotype" tags that differ
        from the new tag are deleted;
      * when more than one tag with the new project tag exists, the ones that
        belong to ``user`` are deleted;
      * the tag is then fetched or created, and a newly created tag is
        assigned to ``user``.

    Args:
        row: mapping of column name to string value for one input row.
        user: user recorded on a newly created tag.
    """
    project_link = row.get('Linking ID') or row.get('Link -> project ID')
    project_id = project_link.split('/')[4]
    family_id = (row.get('Family ID') or row.get('CMG Internal Project ID(s)')).strip()
    new_tag_name = row['New Tag'].strip()
    # The value is not used below; the lookup still raises for rows that
    # have neither gene column.
    gene_symbol = (row.get('Gene symbol') or row.get('Gene Name')).strip()

    try:
        hgvsc = row.get('HGVS') or row.get('g. coordinate')
        hgvsc = hgvsc.strip().split(",")[0]
        chrom, change = hgvsc.split(":")
        chrom = chrom.replace("chr", "")
        pos_ref, alt = change.split(">")
        pos = re.search("[0-9]+", pos_ref).group(0).strip()
        ref = re.search("[ACGT]+", pos_ref).group(0).strip()
        xpos = genomeloc.get_xpos(chrom, int(pos))
    except Exception as e:
        print("Couldn't parse HGVS: %s in row %s. %s" % (hgvsc, row, e))
        return

    try:
        project_tag = ProjectTag.objects.get(
            project__project_id=project_id, tag__icontains=new_tag_name)
    except ObjectDoesNotExist as e:
        print("project tag not found - %s %s: %s" % (project_id, new_tag_name, e))
        return

    try:
        families = get_family(family_id, project_id=project_id)
    except Exception as e:
        print("Unable to get family: %s %s" % (family_id, e))
        return

    assert len(families) == 1
    family = families[0]

    variant_key = dict(family=family, xpos=xpos, ref=ref, alt=alt)
    replaceable_keywords = ["tier 1", "tier 2", "known gene for phenotype"]

    for existing_tag in VariantTag.objects.filter(**variant_key):
        tag_text = existing_tag.project_tag.tag
        if not any(keyword in tag_text.lower() for keyword in replaceable_keywords):
            continue
        if existing_tag.project_tag.tag != project_tag.tag:
            print("Variant %s tag will be replaced with %s" % (existing_tag, project_tag.tag))
            existing_tag.delete()
        else:
            print("Variant %s already exists in %s %s" % (existing_tag, project_id, family_id))

    same_tag_entries = VariantTag.objects.filter(project_tag=project_tag, **variant_key)
    if len(same_tag_entries) > 1:
        for duplicate in same_tag_entries:
            if duplicate.user == user:
                print("Deleting extra tag: " + str(duplicate))
                duplicate.delete()

    vt, created = VariantTag.objects.get_or_create(project_tag=project_tag, **variant_key)
    if not created:
        return

    vt.user = user
    vt.save()
    print("Creating tag: %s" % (vt.toJSON(),))
def get_exac_af(chrom, pos, ref, alt):
    """Look up allele frequencies for one variant in the ExAC VCF.

    Records within ``len(ref) + len(alt)`` positions of ``pos`` are fetched
    from ``exac_vcf``; each alt allele is reduced with
    ``get_minimal_representation`` and compared with the requested variant.
    When several alleles match, an error is printed and the last match wins.

    Args:
        chrom: chromosome name, with or without a "chr" prefix.
        pos: integer position of the variant.
        ref: reference allele.
        alt: alternate allele.

    Returns:
        ``(global_af, pop_max_af, pop_max_population)``, or
        ``(None, None, None)`` when no ExAC allele matches. ``pop_max_af`` is
        -1 and ``pop_max_population`` is None when no population has AN > 0.
    """
    population_codes = ['AMR', 'EAS', 'FIN', 'NFE', 'SAS', 'AFR']
    contig = chrom.replace("chr", "")
    # NOTE(review): the query uses get_single_location while ExAC records use
    # get_xpos - presumably both yield comparable values; confirm.
    xpos = genomeloc.get_single_location(chrom, pos)
    window = len(ref) + len(alt)

    # check whether the alleles match
    matched_record = None
    matched_alt_index = None
    for record in exac_vcf.fetch(contig, pos - window, pos + window):
        record_xpos = genomeloc.get_xpos(record.CHROM, record.POS)
        for alt_index, record_alt in enumerate(record.ALT):
            minimal = get_minimal_representation(record_xpos, str(record.REF), str(record_alt))
            minimal_xpos, minimal_ref, minimal_alt = minimal
            if (minimal_xpos, minimal_ref, minimal_alt) != (xpos, ref, alt):
                continue
            if matched_record is not None:
                print("ERROR: multiple exac variants match the variant: %s %s %s %s" % (chrom, pos, ref, alt))
            matched_record = record
            matched_alt_index = alt_index

    if matched_record is None:
        return None, None, None

    # Highest per-population AF among populations that have any called alleles.
    pop_max_af = -1
    pop_max_population = None
    for code in population_codes:
        allele_number = matched_record.INFO['AN_' + code]
        if not allele_number > 0:
            continue
        pop_af = matched_record.INFO['AC_' + code][matched_alt_index] / float(matched_record.INFO['AN_' + code])
        if pop_af > pop_max_af:
            pop_max_af = pop_af
            pop_max_population = code

    if matched_record.INFO['AN_Adj'] == 0:
        # No adjusted allele number, so the adjusted count has to be zero too.
        assert float(matched_record.INFO['AC_Adj'][matched_alt_index]) == 0
        global_af = 0
    else:
        adjusted_count = float(matched_record.INFO['AC_Adj'][matched_alt_index])
        global_af = adjusted_count / float(matched_record.INFO['AN_Adj'])

    return global_af, pop_max_af, pop_max_population
def add_breakpoint_from_dict(project, bp):
    """
    Add a breakpoint to the given project based on keys from the given dict.

    The sample id is presumed to already be loaded as an existing individual
    in the project.

    If a breakpoint already exists, it is not updated or changed (even if
    data loaded is actually different). Therefore to reload it is necessary
    to delete first, but it is safe to load new samples incrementally by just
    running the load again.

    Args:
        project: project the breakpoint belongs to.
        bp: dict read with the keys chr, start, sample, depth, sample_count,
            partner, cscore, genes and cdsdist.
    """
    # Fields in dict are chr start end sample depth cscore partner genes cdsdist
    xpos = genomeloc.get_xpos(bp['chr'], int(bp['start']))
    sample_id = slugify(bp['sample'], separator='_')

    try:
        bp_record = Breakpoint.objects.get(
            project=project, xpos=xpos, individual__indiv_id=sample_id)
    except Breakpoint.DoesNotExist:
        existing = False
        bp_record = Breakpoint()
        bp_record.xpos = xpos
        bp_record.project = project
        bp_record.obs = int(bp['depth'])
        bp_record.individual = Individual.objects.get(project=project, indiv_id=sample_id)
        bp_record.sample_count = int(bp['sample_count'])
        bp_record.partner = bp['partner']
        bp_record.consensus = bp['cscore']
        bp_record.save()
    else:
        existing = True

    symbols = bp['genes'].split(',')
    distances = bp['cdsdist'].split(',')
    for gene_symbol, cds_dist in zip(symbols, distances):
        if not gene_symbol:
            continue

        # Gene rows are looked up only for a pre-existing breakpoint; a
        # freshly created breakpoint always gets new rows.
        if not existing:
            gene = BreakpointGene()
        else:
            try:
                gene = BreakpointGene.objects.get(breakpoint=bp_record, gene_symbol=gene_symbol)
            except BreakpointGene.DoesNotExist:
                gene = BreakpointGene()

        gene.breakpoint = bp_record
        gene.gene_symbol = gene_symbol
        gene.cds_dist = int(cds_dist)
        gene.save()
def load_from_cadd_file(cadd_file):
    """Utility function to load scores from a CADD file

    Reads the gzip-compressed file line by line after skipping its two header
    lines. Each data line must hold six tab-separated columns
    (Chrom, Pos, Ref, Alt, RawScore, PHRED). The PHRED value, as read from the
    file, is stored in ``annotation.cadd_phred`` of the document in
    ``annotator_store.variants`` with the same xpos/ref/alt, but only when
    that field is not present yet. Nothing is inserted (upsert=False).

    Args:
        cadd_file: path of the gzip-compressed CADD file.
    """
    score_missing = {'$exists': False}

    # "with" guarantees the gzip handle is released, including when a
    # malformed line or a failed update raises.
    with gzip.open(cadd_file) as f:
        # skip header lines
        next(f)
        next(f)

        for line in tqdm.tqdm(f):
            # Chrom Pos Ref Alt RawScore PHRED
            chrom, pos, ref, alt, _raw, phred = line.rstrip('\n').split('\t')
            xpos = genomeloc.get_xpos(chrom, int(pos))
            annotator_store.variants.update(
                {'xpos': xpos, 'ref': ref, 'alt': alt, 'annotation.cadd_phred': score_missing},
                {'$set': {'annotation.cadd_phred': phred}},
                upsert=False)
def add_breakpoint_from_dict(project, bp):
    """
    Add a breakpoint to the given project based on keys from the given dict.

    The sample id is presumed to already be loaded as an existing individual
    in the project.

    If a breakpoint already exists, it is not updated or changed (even if
    data loaded is actually different). Therefore to reload it is necessary
    to delete first, but it is safe to load new samples incrementally by just
    running the load again.

    Args:
        project: project the breakpoint belongs to.
        bp: dict read with the keys chr, start, sample, depth, sample_count,
            partner, cscore, genes and cdsdist.
    """
    # Fields in dict are chr start end sample depth cscore partner genes cdsdist
    xpos = genomeloc.get_xpos(bp['chr'], int(bp['start']))
    sample_id = slugify(bp['sample'], separator='_')

    existing = True
    try:
        stored = Breakpoint.objects.get(project=project,
                                        xpos=xpos,
                                        individual__indiv_id=sample_id)
    except Breakpoint.DoesNotExist:
        existing = False
        stored = Breakpoint()
        stored.xpos = xpos
        stored.project = project
        stored.obs = int(bp['depth'])
        stored.individual = Individual.objects.get(project=project, indiv_id=sample_id)
        stored.sample_count = int(bp['sample_count'])
        stored.partner = bp['partner']
        stored.consensus = bp['cscore']
        stored.save()

    gene_pairs = zip(bp['genes'].split(','), bp['cdsdist'].split(','))
    for gene_symbol, cds_dist in gene_pairs:
        # Empty symbols carry no gene and are skipped.
        if gene_symbol:
            gene = None
            if existing:
                # Reuse the row already stored for this breakpoint and symbol.
                try:
                    gene = BreakpointGene.objects.get(breakpoint=stored, gene_symbol=gene_symbol)
                except BreakpointGene.DoesNotExist:
                    gene = None
            if gene is None:
                gene = BreakpointGene()

            gene.breakpoint = stored
            gene.gene_symbol = gene_symbol
            gene.cds_dist = int(cds_dist)
            gene.save()
def get_exac_af(chrom, pos, ref, alt):
    """Return ExAC allele frequencies for the given variant.

    ``exac_vcf`` is queried for records in a window of
    ``len(ref) + len(alt)`` positions on either side of ``pos``. Every alt
    allele of every record is normalized through
    ``get_minimal_representation`` and compared with the query. If more than
    one allele matches, an error is printed and the last one is used.

    Args:
        chrom: chromosome name, with or without a "chr" prefix.
        pos: integer position of the variant.
        ref: reference allele.
        alt: alternate allele.

    Returns:
        ``(global_af, pop_max_af, pop_max_population)``; all three are None
        when the variant is not found. ``pop_max_af`` stays -1 (population
        None) if no population reports AN > 0.
    """
    populations = ['AMR', 'EAS', 'FIN', 'NFE', 'SAS', 'AFR']
    chrom_without_chr = chrom.replace("chr", "")
    # NOTE(review): get_single_location is used here but get_xpos for the
    # ExAC records - presumably equivalent encodings; confirm.
    query_xpos = genomeloc.get_single_location(chrom, pos)
    span = len(ref) + len(alt)
    fetch_start = pos - span
    fetch_end = pos + span

    # check whether the alleles match
    hit = None
    hit_index = None
    for record in exac_vcf.fetch(chrom_without_chr, fetch_start, fetch_end):
        record_xpos = genomeloc.get_xpos(record.CHROM, record.POS)
        for index, raw_alt in enumerate(record.ALT):
            cand_xpos, cand_ref, cand_alt = get_minimal_representation(
                record_xpos, str(record.REF), str(raw_alt))
            same_variant = cand_xpos == query_xpos and cand_ref == ref and cand_alt == alt
            if same_variant:
                if hit is not None:
                    print("ERROR: multiple exac variants match the variant: %s %s %s %s" % (chrom, pos, ref, alt))
                hit = record
                hit_index = index

    if hit is None:
        return None, None, None

    best_af = -1
    best_population = None
    for population in populations:
        an_key = 'AN_' + population
        ac_key = 'AC_' + population
        if hit.INFO[an_key] > 0:
            af = hit.INFO[ac_key][hit_index] / float(hit.INFO[an_key])
            if af > best_af:
                best_af, best_population = af, population

    if hit.INFO['AN_Adj'] != 0:
        global_af = float(hit.INFO['AC_Adj'][hit_index]) / float(hit.INFO['AN_Adj'])
    else:
        # With no adjusted allele number the adjusted count must also be zero.
        assert float(hit.INFO['AC_Adj'][hit_index]) == 0
        global_af = 0

    return global_af, best_af, best_population
def add_variant_tag(row, user):
    """Tag the variant described by one input row with the row's new tag.

    Steps:
      1. Read project id, family id, tag name and gene symbol from the row.
      2. Parse the ``chrom:<pos><ref>><alt>`` coordinate; print and return on
         any parse error.
      3. Resolve the project tag and the family; print and return on failure.
      4. Delete conflicting tier 1 / tier 2 / "known gene for phenotype" tags
         on the variant, and duplicates of the new tag owned by ``user``.
      5. Get or create the tag; a newly created tag is saved with ``user``.

    Args:
        row: mapping of column name to string value for one input row.
        user: user recorded on a newly created tag.
    """
    project_id = (row.get('Linking ID') or row.get('Link -> project ID')).split('/')[4]
    family_id = (row.get('Family ID') or row.get('CMG Internal Project ID(s)')).strip()
    new_tag_name = row['New Tag'].strip()
    # Unused afterwards, but evaluating it still rejects rows that lack
    # both gene columns.
    gene_symbol = (row.get('Gene symbol') or row.get('Gene Name')).strip()

    try:
        hgvsc = (row.get('HGVS') or row.get('g. coordinate'))
        hgvsc = hgvsc.strip().split(",")[0]
        chrom, remainder = hgvsc.split(":")
        chrom = chrom.replace("chr", "")
        pos_ref, alt = remainder.split(">")
        pos_match = re.search("[0-9]+", pos_ref)
        pos = pos_match.group(0).strip()
        ref_match = re.search("[ACGT]+", pos_ref)
        ref = ref_match.group(0).strip()
        xpos = genomeloc.get_xpos(chrom, int(pos))
    except Exception as e:
        print("Couldn't parse HGVS: %s in row %s. %s" % (hgvsc, row, e))
        return

    try:
        project_tag = ProjectTag.objects.get(project__project_id=project_id, tag__icontains=new_tag_name)
    except ObjectDoesNotExist as e:
        print("project tag not found - %s %s: %s" % (project_id, new_tag_name, e))
        return

    try:
        families = get_family(family_id, project_id=project_id)
    except Exception as e:
        print("Unable to get family: %s %s" % (family_id, e))
        return

    assert len(families) == 1
    family = families[0]

    def is_replaceable(tag_name):
        # Only these tag families are ever replaced by the new tag.
        lowered = tag_name.lower()
        return any(k in lowered for k in ["tier 1", "tier 2", "known gene for phenotype"])

    tags_on_variant = VariantTag.objects.filter(family=family, xpos=xpos, ref=ref, alt=alt)
    for current in tags_on_variant:
        if is_replaceable(current.project_tag.tag):
            if current.project_tag.tag == project_tag.tag:
                print("Variant %s already exists in %s %s" % (current, project_id, family_id))
            else:
                print("Variant %s tag will be replaced with %s" % (current, project_tag.tag))
                current.delete()

    variant_tags_by_multiple_users = VariantTag.objects.filter(
        project_tag=project_tag, family=family, xpos=xpos, ref=ref, alt=alt)
    if len(variant_tags_by_multiple_users) > 1:
        for candidate in variant_tags_by_multiple_users:
            if candidate.user == user:
                print("Deleting extra tag: " + str(candidate))
                candidate.delete()

    vt, created = VariantTag.objects.get_or_create(
        project_tag=project_tag, family=family, xpos=xpos, ref=ref, alt=alt)
    if created:
        vt.user = user
        vt.save()
        print("Creating tag: %s" % (vt.toJSON(), ))
def search_for_genes(self, gene_or_variant_ids, project_id_list, output_filename, max_af=0.01, knockouts=False, in_clinvar_only=False, include_non_coding=False):
    """
    Search for a gene across project(s) and write the matches to a TSV file.

    Ids shaped like "chrom-pos" or "chrom-pos-ref-alt" are looked up as a
    single variant; every other id is resolved to a gene whose variants are
    then searched. One row is written per variant that has at least one
    individual carrying an alt allele.

    Args:
        gene_or_variant_ids (list): 'ENSG..' gene id strings, or variant ids
            of the form "chrom-pos" / "chrom-pos-ref-alt".
        project_id_list (list): (optional) project ids to narrow down the search
        output_filename (string): output file name
        max_af (float): AF filter; gene-search variants whose highest
            reference frequency is >= this value are skipped.
        knockouts (bool): for gene searches, use
            project_analysis.get_knockouts_in_gene instead of the default
            'all_coding' variant filter.
        in_clinvar_only (bool): keep only variants whose ClinVar significance
            contains "path".
        include_non_coding (bool): clear the so_annotations restriction of
            the default variant filter.
    """
    projects = [Project.objects.get(project_id=project_id) for project_id in project_id_list]

    header = [
        "project_id", "gene", "chr", "pos", "ref", "alt", "rsID", "filter", "impact",
        "HGVS.c", "HGVS.p", "sift", "polyphen", "muttaster", "fathmm", "clinvar_id", "clinvar_clinical_sig",
        "freq_1kg_wgs_phase3", "freq_1kg_wgs_phase3_popmax",
        "freq_exac_v3", "freq_exac_v3_popmax",
        "gnomad-exomes", "gnomad-genomes",
        "families", "all_genotypes",
    ]

    # The context manager closes the output file even when a lookup or the
    # datastore raises in the middle of the search.
    with open(output_filename, 'w') as outfile:
        writer = csv.writer(outfile, delimiter='\t')
        writer.writerow(header)

        # all rare coding variants
        if not knockouts:
            variant_filter = get_default_variant_filter('all_coding', mall.get_annotator().reference_population_slugs)
            #variant_filter.set_max_AF(max_af)
            if include_non_coding:
                variant_filter.so_annotations = []
            # variant_filter only exists on this branch, so the dump stays here.
            print("All Filters: ")
            pprint(variant_filter.toJSON())

        #print("Max AF threshold: %s" % max_af)
        print("Starting search for:\n%s\nin projects:\n%s\n" % (", ".join(gene_or_variant_ids), ", ".join([p.project_id for p in projects])))

        for project in projects:
            project_id = project.project_id
            if get_project_datastore(project).project_collection_is_loaded(project):
                print("=====================")
                print("Searching project %s" % project_id)
            else:
                print("Skipping project %s - gene search is not enabled for this project" % project_id)
                continue

            # Per-project cache: indiv_id -> Individual, or 'deleted' when
            # the individual no longer exists.
            indiv_cache = {}
            for gene_or_variant_id in gene_or_variant_ids:
                chrom_pos_match = re.match("([0-9XY]{1,2})-([0-9]{1,9})", gene_or_variant_id)
                chrom_pos_ref_alt_match = re.match("([0-9XY]{1,2})-([0-9]{1,9})-([ACTG]+)-([ACTG]+)", gene_or_variant_id)

                if chrom_pos_match or chrom_pos_ref_alt_match:
                    # Single-variant lookup.
                    chrom = chrom_pos_match.group(1)
                    pos = int(chrom_pos_match.group(2))
                    xpos = genomeloc.get_xpos(chrom, pos)
                    ref = alt = None
                    if chrom_pos_ref_alt_match:
                        ref = chrom_pos_ref_alt_match.group(3)
                        alt = chrom_pos_ref_alt_match.group(4)

                    variant = get_project_datastore(project).get_single_variant(project.project_id, None, xpos, ref, alt)
                    if variant is None:
                        continue
                    variants = [variant]
                    print("-- searching %s for variant %s-%s-%s: found %s" % (project_id, xpos, ref, alt, variant))

                    worst_annotation_idx = variant.annotation['worst_vep_annotation_index']
                    print(variant.annotation["vep_annotation"][worst_annotation_idx])
                    gene_id = variant.annotation["vep_annotation"][worst_annotation_idx]['gene_id']
                    gene = get_reference().get_gene(gene_id)
                else:
                    # Gene search.
                    gene_id = get_gene_id_from_str(gene_or_variant_id, get_reference())
                    gene = get_reference().get_gene(gene_id)
                    print("-- searching %s for gene %s (%s)" % (project_id, gene["symbol"], gene_id))

                    if knockouts:
                        knockout_ids, variation = project_analysis.get_knockouts_in_gene(project, gene_id)
                        variants = variation.get_relevant_variants_for_indiv_ids(knockout_ids)
                    else:
                        variants = project_analysis.get_variants_in_gene(project, gene_id, variant_filter=variant_filter)

                for variant in variants:
                    # The AF cut-off applies to gene searches only.
                    if not chrom_pos_match and not chrom_pos_ref_alt_match and max(variant.annotation['freqs'].values()) >= max_af:
                        continue

                    add_extra_info_to_variants_project(get_reference(), project, [variant])

                    worst_annotation_idx = variant.annotation["worst_vep_index_per_gene"].get(gene_id)
                    if worst_annotation_idx is not None:
                        worst_annotation = variant.annotation["vep_annotation"][worst_annotation_idx]
                    else:
                        worst_annotation = None
                    # An absent annotation yields empty strings for all of
                    # its columns.
                    annotation_fields = worst_annotation or {}

                    all_genotypes_list = []
                    pass_filter = "N/A"
                    family_ids = set()
                    for indiv_id, genotype in variant.genotypes.items():
                        if indiv_id in indiv_cache:
                            individual = indiv_cache[indiv_id]
                            if individual == 'deleted':
                                continue
                        else:
                            try:
                                individual = Individual.objects.get(project=project, indiv_id=indiv_id)
                                indiv_cache[indiv_id] = individual
                            except ObjectDoesNotExist:
                                # this can happen when an individual is deleted from the project - from postgres, but not from mongo
                                indiv_cache[indiv_id] = 'deleted'
                                continue
                            except MultipleObjectsReturned:
                                # when several families have an individual with the same id
                                individuals = Individual.objects.filter(project=project, indiv_id=indiv_id)
                                individual = individuals[0]
                                indiv_cache[indiv_id] = individual

                        # filter value is stored in the genotypes even though it's the same for all individuals
                        pass_filter = genotype.filter
                        if genotype.num_alt > 0:
                            family_ids.add(individual.family.family_id)
                            if individual.affected == "A":
                                affected_label = "[Affected]"
                            elif individual.affected == "N":
                                affected_label = "[-]"
                            else:
                                affected_label = "[?]"
                            all_genotypes_list.append("%s/%s%s[gt:%s GQ:%s AB:%0.3f]" % (
                                individual.family.family_id,
                                indiv_id,
                                affected_label,
                                ">".join(genotype.alleles),
                                genotype.gq,
                                genotype.ab if genotype.ab is not None else float('NaN'),
                            ))

                    # Skip variants that nobody in the project carries.
                    if len(all_genotypes_list) == 0:
                        continue

                    measureset_id, clinvar_significance = get_reference().get_clinvar_info(*variant.unique_tuple())
                    if in_clinvar_only and (not clinvar_significance or "path" not in clinvar_significance.lower()):
                        continue

                    freqs = variant.annotation["freqs"]
                    # NOTE(review): "gene" is the object returned by
                    # get_reference().get_gene, so this column holds its str()
                    # form - confirm whether the symbol was intended.
                    row = [str(value) for value in [
                        project_id,
                        gene,
                        variant.chr,
                        variant.pos,
                        variant.ref,
                        variant.alt,
                        variant.vcf_id or "",
                        pass_filter,
                        variant.annotation.get("vep_consequence", ""),
                        annotation_fields.get("hgvsc", ""),
                        (annotation_fields.get("hgvsp", "") or "").replace("%3D", "="),
                        annotation_fields.get("sift", ""),
                        annotation_fields.get("polyphen", ""),
                        annotation_fields.get("mutationtaster_pred", ""),
                        ";".join(set(annotation_fields.get("fathmm_pred", "").split('%3B'))),
                        measureset_id,
                        clinvar_significance,
                        freqs.get("1kg_wgs_phase3", ""),
                        freqs.get("1kg_wgs_phase3_popmax", ""),
                        freqs.get("exac_v3", ""),
                        freqs.get("exac_v3_popmax", ""),
                        freqs.get("gnomad-exomes2", ""),
                        freqs.get("gnomad-genomes2", ""),
                        ", ".join(sorted(list(family_ids))),
                        ", ".join(all_genotypes_list),
                    ]]
                    writer.writerow(row)

    print("Wrote out %s" % output_filename)