def genes_info(request):
    """Return gene metadata for the gene IDs listed in the ``geneIds`` query parameter.

    ``geneIds`` is read from ``request.GET`` as a comma-separated string. Empty
    entries are dropped before the lookup: ``''.split(',')`` evaluates to ``['']``,
    so a missing/empty parameter or a stray comma would otherwise request the
    empty string as a gene ID.

    Args:
        request: the incoming HTTP request.

    Returns:
        The value of ``create_json_response`` applied to the ``get_genes`` result.
    """
    gene_ids = [gene_id for gene_id in request.GET.get('geneIds', '').split(',') if gene_id]
    return create_json_response(
        get_genes(gene_ids, add_dbnsfp=True, add_omim=True, add_constraints=True))
def _get_mme_genes_phenotypes(results, get_features, get_genomic_features, include_matched_symbol_genes=False,
                              **kwargs):
    """Look up the HPO terms and genes referenced by a set of matchmaker results.

    Args:
        results: results forwarded to ``_get_mme_gene_phenotype_ids``.
        get_features: forwarded to ``_get_mme_gene_phenotype_ids``.
        get_genomic_features: forwarded to ``_get_mme_gene_phenotype_ids``.
        include_matched_symbol_genes (bool): when true, every gene ID mapped to a
            symbol is included, plus genes whose semicolon-delimited
            ``dbnsfpgene.gene_names`` contains the symbol. When false, only the
            first gene ID mapped to each symbol is included.
        **kwargs: forwarded to ``_get_mme_gene_phenotype_ids``.

    Returns:
        tuple: ``(hpo_terms_by_id, genes_by_id, gene_symbols_to_ids)``.
    """
    hpo_ids, gene_ids, gene_symbols = _get_mme_gene_phenotype_ids(
        results, get_features, get_genomic_features, **kwargs)
    gene_symbols_to_ids = get_gene_ids_for_gene_symbols(gene_symbols)

    if not include_matched_symbol_genes:
        gene_ids.update({symbol_gene_ids[0] for symbol_gene_ids in gene_symbols_to_ids.values()})
    else:
        # Include all gene IDs associated with the given symbol
        for symbol_gene_ids in gene_symbols_to_ids.values():
            gene_ids.update(symbol_gene_ids)

        # Include any gene IDs whose legacy id is the given symbol
        for symbol in gene_symbols:
            at_start = Q(dbnsfpgene__gene_names__startswith='{};'.format(symbol))
            at_end = Q(dbnsfpgene__gene_names__endswith=';{}'.format(symbol))
            in_middle = Q(dbnsfpgene__gene_names__contains=';{};'.format(symbol))
            legacy_gene_ids = get_filtered_gene_ids(at_start | at_end | in_middle)
            gene_symbols_to_ids[symbol] += legacy_gene_ids
            gene_ids.update(legacy_gene_ids)

    genes_by_id = get_genes(gene_ids)

    hpo_terms_by_id = {}
    for hpo in HumanPhenotypeOntology.objects.filter(hpo_id__in=hpo_ids):
        hpo_terms_by_id[hpo.hpo_id] = hpo.name

    return hpo_terms_by_id, genes_by_id, gene_symbols_to_ids
def anvil_export(request, project_guid):
    """Return one AnVIL export row per individual, annotated with their saved variants.

    Args:
        request: the incoming HTTP request; the optional ``loadedBefore`` query
            parameter is forwarded to ``_get_loaded_before_date_project_individuals``.
        project_guid (string): GUID of a single project, or ``'all'`` to export
            every project in a category named "anvil" (case-insensitive).

    Returns:
        The value of ``create_json_response({'anvilRows': rows})``.
    """
    if project_guid == 'all':
        project_guid = None

    if project_guid:
        projects_by_guid = {project_guid: Project.objects.get(guid=project_guid)}
    else:
        projects_by_guid = {p.guid: p for p in Project.objects.filter(projectcategory__name__iexact='anvil')}

    individuals = _get_loaded_before_date_project_individuals(
        projects_by_guid.values(), loaded_before=request.GET.get('loadedBefore'))

    saved_variants_by_family = _get_saved_variants_by_family(projects_by_guid.values(), request.user)

    # Handle compound het genes
    compound_het_gene_id_by_family = {}
    for family_guid, saved_variants in saved_variants_by_family.items():
        if len(saved_variants) > 1:
            # Variants where no genotype carries two or more alt alleles
            potential_compound_het_variants = [
                variant for variant in saved_variants
                if all(gen['numAlt'] < 2 for gen in variant['genotypes'].values())
            ]
            main_gene_ids = {variant['mainTranscript']['geneId'] for variant in potential_compound_het_variants}
            if len(main_gene_ids) > 1:
                # This occurs in compound hets where some hits have primary transcripts in different genes:
                # use a gene that appears in the transcripts of every candidate variant
                for gene_id in main_gene_ids:
                    if all(gene_id in variant['transcripts'] for variant in potential_compound_het_variants):
                        compound_het_gene_id_by_family[family_guid] = gene_id

    rows = _get_json_for_individuals(
        list(individuals), project_guid=project_guid, family_fields=['family_id', 'coded_phenotype'])

    gene_ids = set()
    for row in rows:
        row['Project_ID'] = projects_by_guid[row['projectGuid']].name

        saved_variants = saved_variants_by_family[row['familyGuid']]
        row['numSavedVariants'] = len(saved_variants)
        for i, variant in enumerate(saved_variants):
            genotype = variant['genotypes'].get(row['individualGuid'], {})
            # Only report variants for which this individual carries at least one alt allele
            if genotype.get('numAlt', -1) > 0:
                gene_id = compound_het_gene_id_by_family.get(row['familyGuid']) or variant['mainTranscript']['geneId']
                gene_ids.add(gene_id)
                variant_fields = {
                    'Zygosity': 'heterozygous' if genotype['numAlt'] == 1 else 'homozygous',
                    'Chrom': variant['chrom'],
                    'Pos': variant['pos'],
                    'Ref': variant['ref'],
                    'Alt': variant['alt'],
                    'hgvsc': variant['mainTranscript']['hgvsc'],
                    'hgvsp': variant['mainTranscript']['hgvsp'],
                    'Transcript': variant['mainTranscript']['transcriptId'],
                    'geneId': gene_id,
                }
                # Columns are suffixed with the 1-based position of the variant in the family's list
                row.update({'{}-{}'.format(k, i + 1): v for k, v in variant_fields.items()})

    genes_by_id = get_genes(gene_ids)
    for row in rows:
        # Iterate over a snapshot: new "Gene-N" keys are inserted into the row inside the loop,
        # and adding keys to a dict while iterating its live view raises RuntimeError on Python 3
        for key, gene_id in list(row.items()):
            if key.startswith('geneId') and genes_by_id.get(gene_id):
                row[key.replace('geneId', 'Gene')] = genes_by_id[gene_id]['geneSymbol']

    return create_json_response({'anvilRows': rows})
def get_mme_genes_phenotypes(results, additional_genes=None):
    """Look up the HPO terms and genes referenced by matchmaker results.

    Args:
        results (list): dicts with a ``'patient'`` entry; its optional ``'features'``
            and ``'genomicFeatures'`` lists are read for HPO IDs and gene
            identifiers respectively.
        additional_genes: optional iterable of extra gene IDs/symbols to include.
            It is copied, so the caller's collection is never modified.

    Returns:
        tuple: ``(hpo_terms_by_id, genes_by_id, gene_symbols_to_ids)``.
    """
    hpo_ids = set()
    # Copy rather than alias: the updates below must not leak into the caller's collection
    genes = set(additional_genes) if additional_genes else set()
    for result in results:
        hpo_ids.update({
            feature['id'] for feature in result['patient'].get('features', []) if feature.get('id')
        })
        genes.update({
            gene_feature['gene']['id'] for gene_feature in result['patient'].get('genomicFeatures', [])
        })

    # Identifiers starting with "ENSG" are used as gene IDs directly; all others are treated as symbols
    gene_ids = {gene for gene in genes if gene.startswith('ENSG')}
    gene_symbols = {gene for gene in genes if not gene.startswith('ENSG')}
    gene_symbols_to_ids = get_gene_ids_for_gene_symbols(gene_symbols)
    # Only the first gene ID mapped to each symbol is looked up
    gene_ids.update(
        {new_gene_ids[0] for new_gene_ids in gene_symbols_to_ids.values()})
    genes_by_id = get_genes(gene_ids)

    hpo_terms_by_id = {
        hpo.hpo_id: hpo.name for hpo in HumanPhenotypeOntology.objects.filter(hpo_id__in=hpo_ids)
    }

    return hpo_terms_by_id, genes_by_id, gene_symbols_to_ids
def project_page_data(request, project_guid):
    """Returns a JSON object containing information used by the project page:
    ::

      json_response = {
         'projectsByGuid': {<project_guid>: {..}},
         'genesById': {..},
         'locusListsByGuid': {..},
         # plus any other entries returned by _get_project_child_entities
      }

    Also records the current time as the project's ``last_accessed_date``.

    Args:
        project_guid (string): GUID of the Project to retrieve data for.
    """
    project = get_project_and_check_permissions(project_guid, request.user)
    update_project_from_json(project, {'last_accessed_date': timezone.now()})

    response = _get_project_child_entities(project, request.user)

    project_json = _get_json_for_project(project, request.user)
    project_json['collaborators'] = get_json_for_project_collaborator_list(project)
    # Materialize the keys: a dict key view is not JSON-serializable on Python 3
    project_json['locusListGuids'] = list(response['locusListsByGuid'].keys())
    project_json['detailsLoaded'] = True
    project_json.update(_get_json_for_variant_tag_types(project))

    # Collect the genes of every transcript referenced by the project's discovery tags
    gene_ids = set()
    for tag in project_json['discoveryTags']:
        gene_ids.update(tag.get('transcripts', {}).keys())

    response.update({
        'projectsByGuid': {project_guid: project_json},
        'genesById': get_genes(gene_ids),
    })

    return create_json_response(response)
def get_variant_gene_breakdown(request, search_hash):
    """Return per-gene counts for a saved variant search, with metadata for those genes.

    Args:
        request: the incoming HTTP request; ``request.user`` is passed to
            ``_check_results_permission``.
        search_hash: value used to look up the ``VariantSearchResults`` record.

    Returns:
        The value of ``create_json_response`` for a dict with
        ``'searchGeneBreakdown'`` and ``'genesById'`` entries.
    """
    search_results = VariantSearchResults.objects.get(search_hash=search_hash)
    _check_results_permission(search_results, request.user)

    counts_by_gene = get_es_variant_gene_counts(search_results)

    payload = {'searchGeneBreakdown': {search_hash: counts_by_gene}}
    payload['genesById'] = get_genes(counts_by_gene.keys(), add_omim=True, add_constraints=True)
    return create_json_response(payload)
def _saved_variant_genes(variants):
    """Fetch metadata for every gene that appears in the transcripts of the given variants.

    Args:
        variants: iterable of variant dicts, each with a ``'transcripts'`` mapping
            whose keys are used as gene IDs.

    Returns:
        The ``get_genes`` result, with an empty ``'locusListGuids'`` list set on
        every truthy gene entry.
    """
    transcript_gene_ids = {
        gene_id for variant in variants for gene_id in variant['transcripts'].keys()
    }
    genes_by_id = get_genes(
        transcript_gene_ids, add_dbnsfp=True, add_omim=True, add_constraints=True, add_primate_ai=True)

    for gene_json in genes_by_id.values():
        if not gene_json:
            continue
        gene_json['locusListGuids'] = []

    return genes_by_id
def _update_gene_symbols(rows):
    """Fill in gene symbols on each row, in place.

    For every row with a known ``gene_id``, ``gene_name`` is set to that gene's
    symbol. Each row's ``extras_variant_tag_list`` is replaced by formatted
    strings built from its ``(variant_id, gene_id, tag)`` entries.

    Args:
        rows (list): row dicts; modified in place.
    """
    genes_by_id = get_genes({row['gene_id'] for row in rows if row.get('gene_id')})
    for row in rows:
        if row.get('gene_id') and genes_by_id.get(row['gene_id']):
            row['gene_name'] = genes_by_id[row['gene_id']]['geneSymbol']

        # `or {}` rather than a .get default: the check above allows for a falsy entry
        # for a gene ID, and a default only applies when the key is absent altogether
        row["extras_variant_tag_list"] = ["{variant_id} {gene_symbol} {tag}".format(
            variant_id=variant_id, gene_symbol=(genes_by_id.get(gene_id) or {}).get('geneSymbol'), tag=tag,
        ) for variant_id, gene_id, tag in row.get("extras_variant_tag_list", [])]
def _update_gene_symbols(rows):
    """Fill in gene symbols on each row, in place.

    For every row with a known ``gene_id``, ``gene_name`` is set to that gene's
    symbol. Each row's ``extras_variant_tag_list`` is replaced by formatted
    strings built from its ``(variant_id, gene_id, tag)`` entries.

    Args:
        rows (list): row dicts; modified in place.
    """
    genes_by_id = get_genes({row['gene_id'] for row in rows if row.get('gene_id')})
    for row in rows:
        if row.get('gene_id') and genes_by_id.get(row['gene_id']):
            row['gene_name'] = genes_by_id[row['gene_id']]['geneSymbol']

        # `or {}` rather than a .get default: the check above allows for a falsy entry
        # for a gene ID, and a default only applies when the key is absent altogether
        row["extras_variant_tag_list"] = ["{variant_id} {gene_symbol} {tag}".format(
            variant_id=variant_id, gene_symbol=(genes_by_id.get(gene_id) or {}).get('geneSymbol'), tag=tag,
        ) for variant_id, gene_id, tag in row.get("extras_variant_tag_list", [])]
def locus_list_info(request, locus_list_guid):
    """Return the JSON for one locus list together with metadata for its genes.

    Args:
        request: the incoming HTTP request; ``request.user`` is used for the
            permission check and when building the locus list JSON.
        locus_list_guid (string): GUID of the LocusList to retrieve.

    Returns:
        The value of ``create_json_response`` for a dict with
        ``'locusListsByGuid'`` and ``'genesById'`` entries.
    """
    user = request.user
    locus_list = LocusList.objects.get(guid=locus_list_guid)
    check_public_object_permissions(locus_list, user)

    locus_list_json = get_json_for_locus_list(locus_list, user)

    # Only items that carry a gene ID are looked up
    gene_ids = []
    for list_item in locus_list_json['items']:
        if list_item.get('geneId'):
            gene_ids.append(list_item['geneId'])

    genes_by_id = get_genes(gene_ids, add_dbnsfp=True, add_omim=True, add_constraints=True)
    return create_json_response({
        'locusListsByGuid': {locus_list_guid: locus_list_json},
        'genesById': genes_by_id,
    })
def locus_list_info(request, locus_list_guid):
    """Return the JSON for one locus list together with metadata for its genes.

    Args:
        request: the incoming HTTP request; ``request.user`` is used for the
            permission check and when building the locus list JSON.
        locus_list_guid (string): GUID of the LocusList to retrieve.

    Returns:
        The value of ``create_json_response`` for a dict with
        ``'locusListsByGuid'`` and ``'genesById'`` entries.
    """
    user = request.user
    locus_list = LocusList.objects.get(guid=locus_list_guid)
    check_public_object_permissions(locus_list, user)

    locus_list_json = get_json_for_locus_list(locus_list, user)

    # Only items that carry a gene ID are looked up
    gene_ids = []
    for list_item in locus_list_json['items']:
        if list_item.get('geneId'):
            gene_ids.append(list_item['geneId'])

    genes_by_id = get_genes(gene_ids, add_dbnsfp=True, add_omim=True, add_constraints=True)
    return create_json_response({
        'locusListsByGuid': {locus_list_guid: locus_list_json},
        'genesById': genes_by_id,
    })
def _saved_variant_genes(variants):
    """Fetch metadata for every gene that appears in the transcripts of the given variants.

    Args:
        variants (dict): mapping whose values are variant dicts, each with a
            ``'transcripts'`` mapping whose keys are used as gene IDs.

    Returns:
        The ``get_genes`` result, with an empty ``'locusLists'`` list set on every
        truthy gene entry.
    """
    transcript_gene_ids = {
        gene_id for variant in variants.values() for gene_id in variant['transcripts'].keys()
    }
    genes_by_id = get_genes(transcript_gene_ids, add_dbnsfp=True, add_omim=True, add_constraints=True)

    for gene_json in genes_by_id.values():
        if not gene_json:
            continue
        gene_json['locusLists'] = []

    return genes_by_id
def _saved_variant_genes(variants):
    """Fetch metadata for every gene that appears in the transcripts of the given variants.

    Args:
        variants: iterable whose entries are either a single variant dict or a
            list of variant dicts (handled as a group of compound hets). Each
            variant has a ``'transcripts'`` mapping whose keys are used as gene IDs.

    Returns:
        The ``get_genes`` result, with an empty ``'locusListGuids'`` list set on
        every truthy gene entry.
    """
    transcript_gene_ids = set()
    for entry in variants:
        # Treat a single variant as a one-element group so both shapes share one code path
        grouped_variants = entry if isinstance(entry, list) else [entry]
        for variant in grouped_variants:
            transcript_gene_ids.update(variant['transcripts'].keys())

    genes_by_id = get_genes(
        transcript_gene_ids, add_dbnsfp=True, add_omim=True, add_constraints=True, add_primate_ai=True)

    for gene_json in genes_by_id.values():
        if not gene_json:
            continue
        gene_json['locusListGuids'] = []

    return genes_by_id
def _get_mme_genes_phenotypes(results, get_features, get_genomic_features, **kwargs):
    """Look up the HPO terms and genes referenced by a set of matchmaker results.

    Args:
        results: results forwarded to ``_get_mme_gene_phenotype_ids``.
        get_features: forwarded to ``_get_mme_gene_phenotype_ids``.
        get_genomic_features: forwarded to ``_get_mme_gene_phenotype_ids``.
        **kwargs: forwarded to ``_get_mme_gene_phenotype_ids``.

    Returns:
        tuple: ``(hpo_terms_by_id, genes_by_id, gene_symbols_to_ids)``.
    """
    hpo_ids, gene_ids, gene_symbols = _get_mme_gene_phenotype_ids(
        results, get_features, get_genomic_features, **kwargs)

    gene_symbols_to_ids = get_gene_ids_for_gene_symbols(gene_symbols)
    # Only the first gene ID mapped to each symbol is looked up
    first_symbol_gene_ids = {symbol_gene_ids[0] for symbol_gene_ids in gene_symbols_to_ids.values()}
    gene_ids.update(first_symbol_gene_ids)
    genes_by_id = get_genes(gene_ids)

    hpo_terms_by_id = {}
    for hpo in HumanPhenotypeOntology.objects.filter(hpo_id__in=hpo_ids):
        hpo_terms_by_id[hpo.hpo_id] = hpo.name

    return hpo_terms_by_id, genes_by_id, gene_symbols_to_ids
def get_mme_genes_phenotypes(results):
    """Look up the HPO terms and genes referenced by matchmaker results.

    Args:
        results (list): dicts with a ``'patient'`` entry; its optional ``'features'``
            and ``'genomicFeatures'`` lists are read for HPO IDs and gene
            identifiers respectively.

    Returns:
        tuple: ``(hpo_terms_by_id, genes_by_id, gene_symbols_to_ids)``.
    """
    hpo_ids = set()
    genes = set()
    for result in results:
        patient = result['patient']
        for feature in patient.get('features', []):
            if feature.get('id'):
                hpo_ids.add(feature['id'])
        for gene_feature in patient.get('genomicFeatures', []):
            genes.add(gene_feature['gene']['id'])

    # Identifiers starting with "ENSG" are used as gene IDs directly; all others are treated as symbols
    gene_ids = set()
    gene_symbols = set()
    for gene in genes:
        if gene.startswith('ENSG'):
            gene_ids.add(gene)
        else:
            gene_symbols.add(gene)

    gene_symbols_to_ids = get_gene_ids_for_gene_symbols(gene_symbols)
    # Only the first gene ID mapped to each symbol is looked up
    first_symbol_gene_ids = {symbol_gene_ids[0] for symbol_gene_ids in gene_symbols_to_ids.values()}
    gene_ids.update(first_symbol_gene_ids)
    genes_by_id = get_genes(gene_ids)

    hpo_terms_by_id = {}
    for hpo in HumanPhenotypeOntology.objects.filter(hpo_id__in=hpo_ids):
        hpo_terms_by_id[hpo.hpo_id] = hpo.name

    return hpo_terms_by_id, genes_by_id, gene_symbols_to_ids
def locus_list_info(request, locus_list_guid):
    """Return the JSON for one locus list together with metadata for its genes.

    Permissions are only checked when the list is not public.

    Args:
        request: the incoming HTTP request; ``request.user`` is used for the
            permission check and when building the locus list JSON.
        locus_list_guid (string): GUID of the LocusList to retrieve.

    Returns:
        The value of ``create_json_response`` for a dict with
        ``'locusListsByGuid'`` and ``'genesById'`` entries.
    """
    user = request.user
    locus_list = LocusList.objects.get(guid=locus_list_guid)
    if not locus_list.is_public:
        check_multi_project_permissions(locus_list, user)

    locus_list_json = get_json_for_locus_list(locus_list, user)

    # Only items that carry a gene ID are looked up
    gene_ids = []
    for list_item in locus_list_json['items']:
        if list_item.get('geneId'):
            gene_ids.append(list_item['geneId'])

    genes_by_id = get_genes(gene_ids)
    return create_json_response({
        'locusListsByGuid': {locus_list_guid: locus_list_json},
        'genesById': genes_by_id,
    })
def _parse_disease_information(gene_id):
    """Split a gene's ``diseaseDesc`` text into a disease name and a description.

    The text is split on ``"]:"``. The part before it becomes the name, with
    ``"DISEASE:"`` removed and ``"[MIM:"`` replaced by ``"OMIM #"``. The part
    after it becomes the description, with semicolons removed.

    Args:
        gene_id: ID of the gene to look up via ``get_genes``.

    Returns:
        tuple: ``(disease_name, disease_description)``. Both are
        ``"No disease associations"`` when the gene has no description (empty or
        ``None``). The description is ``""`` when the text has no ``"]:"`` delimiter.
    """
    response = get_genes([gene_id], add_dbnsfp=True, add_omim=True, add_constraints=True)
    disease = response[gene_id]["diseaseDesc"]
    # Treat a missing (None) description the same as an empty one instead of failing on .split
    if not disease:
        return "No disease associations", "No disease associations"

    disease_split = disease.split("]:")
    disease_name = disease_split[0].replace("DISEASE:", "").replace("[MIM:", "OMIM #").strip()
    # Guard against text without the "]:" delimiter, where there is no second segment to read
    if len(disease_split) > 1:
        disease_description = disease_split[1].replace(";", "").strip()
    else:
        disease_description = ""
    return disease_name, disease_description
def _parse_list_items(request_json):
    """Classify the requested locus list items into genes, intervals and invalid entries.

    Items are read from ``request_json['parsedItems']['items']`` and classified,
    in priority order, as:
      - an existing interval (has ``locusListIntervalGuid``)
      - an existing gene (has both ``geneId`` and ``symbol``)
      - a new gene ID (has ``geneId`` only)
      - a new gene symbol (has ``symbol`` only)
      - otherwise a new interval described by ``chrom``/``start``/``end``

    Args:
        request_json (dict): parsed request body.

    Returns:
        tuple: ``(new_genes, existing_gene_ids, new_intervals,
        existing_interval_guids, invalid_items)``. ``invalid_items`` lists
        unparseable intervals (formatted as ``chr<chrom>:<start>-<end>``), symbols
        that map to no gene or to more than one gene, and gene IDs with no
        truthy ``get_genes`` entry.
    """
    requested_items = (request_json.get('parsedItems') or {}).get('items') or []

    existing_gene_ids = set()
    new_gene_symbols = set()
    new_gene_ids = set()
    existing_interval_guids = set()
    new_intervals = []
    invalid_items = []
    for item in requested_items:
        if item.get('locusListIntervalGuid'):
            existing_interval_guids.add(item.get('locusListIntervalGuid'))
        elif item.get('geneId'):
            if item.get('symbol'):
                existing_gene_ids.add(item.get('geneId'))
            else:
                new_gene_ids.add(item.get('geneId'))
        elif item.get('symbol'):
            new_gene_symbols.add(item.get('symbol'))
        else:
            try:
                item['start'] = int(item['start'])
                item['end'] = int(item['end'])
                if item['start'] > item['end']:
                    raise ValueError
                # Called for validation only; its return value is not used
                get_xpos(item['chrom'], int(item['start']))
                new_intervals.append(item)
            # TypeError covers null/non-scalar coordinates (e.g. int(None)), which should be
            # reported as invalid items like any other malformed interval
            except (KeyError, ValueError, TypeError):
                invalid_items.append('chr{chrom}:{start}-{end}'.format(
                    chrom=item.get('chrom', '?'), start=item.get('start', '?'), end=item.get('end', '?')
                ))

    gene_symbols_to_ids = get_gene_ids_for_gene_symbols(new_gene_symbols)
    # Symbols that match no gene, followed by symbols that match more than one gene
    invalid_items += [symbol for symbol in new_gene_symbols if not gene_symbols_to_ids.get(symbol)]
    invalid_items += [symbol for symbol in new_gene_symbols if len(gene_symbols_to_ids.get(symbol, [])) > 1]
    new_genes = get_genes(
        [gene_ids[0] for gene_ids in gene_symbols_to_ids.values() if len(gene_ids) == 1] + list(new_gene_ids),
        add_dbnsfp=True, add_omim=True, add_constraints=True)
    # Requested gene IDs whose get_genes entry is falsy are reported as invalid and dropped
    invalid_items += [gene_id for gene_id, gene in new_genes.items() if not gene]
    new_genes = {gene_id: gene for gene_id, gene in new_genes.items() if gene}
    return new_genes, existing_gene_ids, new_intervals, existing_interval_guids, invalid_items
def test_get_genes(self):
    """Check the keys returned by each gene lookup helper for a full and a sparse gene."""
    sparse_gene_id = 'ENSG00000227232'
    gene_ids = {GENE_ID, sparse_gene_id}
    user = User.objects.get(pk=1)

    # Basic lookup
    basic_genes = get_genes(gene_ids)
    self.assertSetEqual(set(basic_genes), gene_ids)
    self.assertSetEqual(set(basic_genes[GENE_ID]), GENE_FIELDS)

    # Variant display lookup adds a fixed set of extra fields to the basic ones
    display_fields = {'constraints', 'omimPhenotypes', 'mimNumber', 'cnSensitivity'}.union(GENE_FIELDS)
    display_genes = get_genes_for_variant_display(gene_ids)
    self.assertSetEqual(set(display_genes), gene_ids)
    self.assertSetEqual(set(display_genes[GENE_ID]), display_fields)

    # Variant lookup
    variant_genes = get_genes_for_variants(gene_ids)
    self.assertSetEqual(set(variant_genes), gene_ids)
    self.assertSetEqual(set(variant_genes[GENE_ID]), GENE_VARIANT_FIELDS)

    # Detail lookup
    detail_genes = get_genes_with_detail(gene_ids, user)
    self.assertSetEqual(set(detail_genes), gene_ids)
    gene = detail_genes[GENE_ID]
    self.assertSetEqual(set(gene), GENE_DETAIL_FIELDS)

    # test nested models
    self.assertSetEqual(set(gene['primateAi']), {'percentile25', 'percentile75'})
    expected_constraint_fields = {'misZ', 'misZRank', 'pli', 'pliRank', 'louef', 'louefRank', 'totalGenes'}
    self.assertSetEqual(set(gene['constraints']), expected_constraint_fields)
    self.assertSetEqual(set(gene['cnSensitivity']), {'phi', 'pts'})
    expected_omim_fields = {'mimNumber', 'phenotypeMimNumber', 'phenotypeDescription', 'phenotypeInheritance'}
    self.assertSetEqual(set(gene['omimPhenotypes'][0]), expected_omim_fields)

    # A gene without nested data gets empty placeholders
    sparse_gene = detail_genes[sparse_gene_id]
    self.assertIsNone(sparse_gene['primateAi'])
    self.assertDictEqual(sparse_gene['constraints'], {})
    self.assertDictEqual(sparse_gene['cnSensitivity'], {})
    self.assertListEqual(sparse_gene['omimPhenotypes'], [])
def _parse_anvil_metadata(project, individual_samples, user, include_collaborator=False):
    """Build the AnVIL subject, sample, family and discovery rows for a project.

    Args:
        project: project whose ``name``, ``guid`` and phenotype categories are
            copied onto every subject row.
        individual_samples (dict): maps each individual to its sample.
        user: passed to ``_get_sample_airtable_metadata``.
        include_collaborator (bool): passed to ``_get_sample_airtable_metadata``.

    Returns:
        tuple: ``(subject_rows, sample_rows, family_rows, discovery_rows)``.
    """
    samples_by_family = defaultdict(list)
    individual_id_map = {}
    sample_ids = set()
    for individual, sample in individual_samples.items():
        samples_by_family[individual.family].append(sample)
        individual_id_map[individual.id] = individual.individual_id
        sample_ids.add(sample.sample_id)

    # Per family guid: (affected individual guids, unaffected individual guids, male individual guids)
    family_individual_affected_guids = {}
    for family, family_samples in samples_by_family.items():
        family_individuals = [s.individual for s in family_samples]
        affected_guids = {
            i.guid for i in family_individuals if i.affected == Individual.AFFECTED_STATUS_AFFECTED}
        unaffected_guids = {
            i.guid for i in family_individuals if i.affected == Individual.AFFECTED_STATUS_UNAFFECTED}
        male_guids = {i.guid for i in family_individuals if i.sex == Individual.SEX_MALE}
        family_individual_affected_guids[family.guid] = (affected_guids, unaffected_guids, male_guids)

    sample_airtable_metadata = _get_sample_airtable_metadata(
        list(sample_ids), user, include_collaborator=include_collaborator)

    saved_variants_by_family = _get_parsed_saved_discovery_variants_by_family(list(samples_by_family.keys()))
    compound_het_gene_id_by_family, gene_ids = _process_saved_variants(
        saved_variants_by_family, family_individual_affected_guids)
    genes_by_id = get_genes(gene_ids)

    # Look up descriptions for every OMIM number referenced by any family in a single query
    all_mim_numbers = set()
    for family in samples_by_family.keys():
        if family.post_discovery_omim_number:
            all_mim_numbers.update(family.post_discovery_omim_number.split(','))
    mim_description_map = {}
    for omim in Omim.objects.filter(phenotype_mim_number__in=all_mim_numbers):
        mim_description_map[str(omim.phenotype_mim_number)] = omim.phenotype_description

    phenotype_categories = project.projectcategory_set.filter(name__in=PHENOTYPE_PROJECT_CATEGORIES)
    project_details = {
        'project_id': project.name,
        'project_guid': project.guid,
        'phenotype_group': '|'.join([category.name for category in phenotype_categories]),
    }

    subject_rows = []
    sample_rows = []
    family_rows = []
    discovery_rows = []
    for family, family_samples in samples_by_family.items():
        saved_variants = saved_variants_by_family[family.guid]

        # Fields shared by the subject rows of every individual in this family
        if family.pubmed_ids:
            pmid_id = family.pubmed_ids[0].replace('PMID:', '').strip()
        else:
            pmid_id = ''
        family_subject_row = {
            'family_guid': family.guid,
            'family_id': family.family_id,
            'pmid_id': pmid_id,
            'phenotype_description': (family.coded_phenotype or '').replace(',', ';').replace('\t', ' '),
            'num_saved_variants': len(saved_variants),
        }
        family_subject_row.update(project_details)
        if family.post_discovery_omim_number:
            family_mim_numbers = family.post_discovery_omim_number.split(',')
            disease_ids = ['OMIM:{}'.format(mim_number) for mim_number in family_mim_numbers]
            disease_descriptions = [mim_description_map.get(mim_number, '') for mim_number in family_mim_numbers]
            family_subject_row.update({
                'disease_id': ';'.join(disease_ids),
                'disease_description': ';'.join(disease_descriptions).replace(',', ';'),
            })

        affected_individual_guids, _, male_individual_guids = family_individual_affected_guids[family.guid]

        has_consanguinity = any(s.individual.consanguinity is True for s in family_samples)
        family_row = {
            'entity:family_id': family.family_id,
            'family_id': family.family_id,
            'consanguinity': 'Present' if has_consanguinity else 'None suspected',
        }
        # Family history is only reported when more than one sampled individual is affected
        if len(affected_individual_guids) > 1:
            family_row['family_history'] = 'Yes'
        family_rows.append(family_row)

        parsed_variants = []
        for variant in saved_variants:
            parsed_variants.append(
                _parse_anvil_family_saved_variant(variant, family, compound_het_gene_id_by_family, genes_by_id))

        for sample in family_samples:
            airtable_metadata = sample_airtable_metadata.get(sample.sample_id, {})
            dbgap_submission = airtable_metadata.get('dbgap_submission') or set()
            has_dbgap_submission = sample.sample_type in dbgap_submission

            subject_row = _get_subject_row(
                sample.individual, has_dbgap_submission, airtable_metadata, parsed_variants, individual_id_map)
            subject_row.update(family_subject_row)
            subject_rows.append(subject_row)

            sample_rows.append(_get_sample_row(sample, has_dbgap_submission, airtable_metadata))

            discovery_rows.append(_get_discovery_rows(sample, parsed_variants, male_individual_guids))

    return subject_rows, sample_rows, family_rows, discovery_rows
def genes_info(request):
    """Return gene metadata for the gene IDs listed in the ``geneIds`` query parameter.

    ``geneIds`` is read from ``request.GET`` as a comma-separated string. Empty
    entries are dropped before the lookup: ``''.split(',')`` evaluates to ``['']``,
    so a missing/empty parameter or a stray comma would otherwise request the
    empty string as a gene ID.

    Args:
        request: the incoming HTTP request.

    Returns:
        The value of ``create_json_response`` for a dict whose ``'genesById'``
        entry holds the ``get_genes`` result.
    """
    gene_ids = [gene_id for gene_id in request.GET.get('geneIds', '').split(',') if gene_id]
    return create_json_response({
        'genesById': get_genes(gene_ids, add_dbnsfp=True, add_omim=True, add_constraints=True),
    })
def anvil_export(request, project_guid):
    """Return one AnVIL export row per individual, annotated with their saved variants.

    Individuals are taken from the families returned by
    ``_get_over_year_loaded_project_families`` for the selected projects.

    Args:
        request: the incoming HTTP request; ``request.user`` is passed to
            ``_get_saved_variants_by_family``.
        project_guid (string): GUID of a single project, or ``'all'`` to export
            every project in a category named "anvil" (case-insensitive).

    Returns:
        The value of ``create_json_response({'anvilRows': rows})``.
    """
    if project_guid == 'all':
        project_guid = None

    if project_guid:
        projects_by_guid = {project_guid: Project.objects.get(guid=project_guid)}
    else:
        projects_by_guid = {p.guid: p for p in Project.objects.filter(projectcategory__name__iexact='anvil')}

    families = _get_over_year_loaded_project_families(projects_by_guid.values())
    prefetch_related_objects(families, 'individual_set')

    saved_variants_by_family = _get_saved_variants_by_family(projects_by_guid.values(), request.user)

    # Handle compound het genes
    compound_het_gene_id_by_family = {}
    for family_guid, saved_variants in saved_variants_by_family.items():
        if len(saved_variants) > 1:
            # Variants where no genotype carries two or more alt alleles
            potential_compound_het_variants = [
                variant for variant in saved_variants
                if all(gen['numAlt'] < 2 for gen in variant['genotypes'].values())
            ]
            main_gene_ids = {variant['mainTranscript']['geneId'] for variant in potential_compound_het_variants}
            if len(main_gene_ids) > 1:
                # This occurs in compound hets where some hits have primary transcripts in different genes:
                # use a gene that appears in the transcripts of every candidate variant
                for gene_id in main_gene_ids:
                    if all(gene_id in variant['transcripts'] for variant in potential_compound_het_variants):
                        compound_het_gene_id_by_family[family_guid] = gene_id

    individuals = set()
    for family in families:
        individuals.update(family.individual_set.all())
    rows = _get_json_for_individuals(
        list(individuals), project_guid=project_guid, family_fields=['family_id', 'coded_phenotype'])

    gene_ids = set()
    for row in rows:
        row['Project_ID'] = projects_by_guid[row['projectGuid']].name

        saved_variants = saved_variants_by_family[row['familyGuid']]
        row['numSavedVariants'] = len(saved_variants)
        for i, variant in enumerate(saved_variants):
            genotype = variant['genotypes'].get(row['individualGuid'], {})
            # Only report variants for which this individual carries at least one alt allele
            if genotype.get('numAlt', -1) > 0:
                gene_id = compound_het_gene_id_by_family.get(row['familyGuid']) or variant['mainTranscript']['geneId']
                gene_ids.add(gene_id)
                variant_fields = {
                    'Zygosity': 'heterozygous' if genotype['numAlt'] == 1 else 'homozygous',
                    'Chrom': variant['chrom'],
                    'Pos': variant['pos'],
                    'Ref': variant['ref'],
                    'Alt': variant['alt'],
                    'hgvsc': variant['mainTranscript']['hgvsc'],
                    'hgvsp': variant['mainTranscript']['hgvsp'],
                    'Transcript': variant['mainTranscript']['transcriptId'],
                    'geneId': gene_id,
                }
                # Columns are suffixed with the 1-based position of the variant in the family's list
                row.update({'{}-{}'.format(k, i + 1): v for k, v in variant_fields.items()})

    genes_by_id = get_genes(gene_ids)
    for row in rows:
        # Iterate over a snapshot: new "Gene-N" keys are inserted into the row inside the loop,
        # and adding keys to a dict while iterating its live view raises RuntimeError on Python 3
        for key, gene_id in list(row.items()):
            if key.startswith('geneId') and genes_by_id.get(gene_id):
                row[key.replace('geneId', 'Gene')] = genes_by_id[gene_id]['geneSymbol']

    return create_json_response({'anvilRows': rows})