def get_variants_from_variant_tuples(project, variant_tuples, user=None):
    """Resolve (xpos, ref, alt, family_id) tuples into Variant objects.

    Tuples are grouped by family so each family's variants are fetched from the
    datastore in a single get_multiple_variants call. Tuples with no stored
    variant get a freshly-annotated stub marked with extra 'created_variant'.

    Args:
        project: project object providing project_id and reference populations.
        variant_tuples: iterable of (xpos, ref, alt, family_id) tuples.
        user: optional user passed through to the datastore query.
    Returns:
        list of Variant objects, each tagged with 'family_id' and 'project_id'.
    """
    datastore = get_datastore(project)
    population_slugs = project.get_reference_population_slugs()

    # Group tuples per family. Fix: the original reused the name
    # `variant_tuples` for the per-family loop variable below, shadowing the
    # parameter; distinct names avoid that trap.
    tuples_by_family_id = {}
    for xpos, ref, alt, family_id in variant_tuples:
        tuples_by_family_id.setdefault(family_id, []).append((xpos, ref, alt))

    variants = []
    for family_id, family_tuples in tuples_by_family_id.items():
        variants_for_family = datastore.get_multiple_variants(
            project.project_id, family_id, family_tuples, user=user)
        # get_multiple_variants returns results positionally aligned with its
        # input tuples, so zip pairs each tuple with its (possibly None) hit.
        for (xpos, ref, alt), variant in zip(family_tuples, variants_for_family):
            if not variant:
                # Not in the datastore: create a stub and annotate on the fly.
                variant = Variant(xpos, ref, alt)
                get_annotator().annotate_variant(variant, population_slugs)
                variant.set_extra('created_variant', True)
            variant.set_extra('family_id', family_id)
            variant.set_extra('project_id', project.project_id)
            variants.append(variant)
    return variants
def get_variants_from_note_tuples(project, note_tuples):
    """Fetch a Variant for each (xpos, ref, alt, family_id) note tuple.

    Tuples with no stored variant get a stub annotated against the project's
    reference populations. Every returned variant carries 'family_id' and
    'project_id' extras.
    """
    variants = []
    for xpos, ref, alt, family_id in note_tuples:
        variant = get_datastore(project.project_id).get_single_variant(
            project.project_id, family_id, xpos, ref, alt
        )
        if not variant:
            variant = Variant(xpos, ref, alt)
            get_annotator().annotate_variant(variant, project.get_reference_population_slugs())
        variant.set_extra("family_id", family_id)
        variant.set_extra("project_id", project.project_id)
        variants.append(variant)
    return variants
def get_variants(self, project_id, family_id, genotype_filter=None, variant_filter=None, quality_filter=None, indivs_to_consider=None, user=None):
    """Yield variants for one family that match the given filters.

    Results are sorted by xpos and hard-capped at
    settings.VARIANT_QUERY_RESULTS_LIMIT; exceeding the cap raises Exception.
    quality_filter, indivs_to_consider and user are accepted for interface
    compatibility but not used in this implementation.
    """
    db_query = self._make_db_query(genotype_filter, variant_filter)
    collection = self._get_family_collection(project_id, family_id)
    if not collection:
        logger.error(
            "Error: mongodb collection not found for project %s family %s " % (project_id, family_id))
        return
    # Fetch LIMIT+5 so we can distinguish "exactly LIMIT results" from "more
    # than LIMIT"; the query is expressed as an $and of single-key clauses.
    for i, variant_dict in enumerate(
            collection.find({'$and': [{k: v} for k, v in db_query.items()]}).sort('xpos').limit(settings.VARIANT_QUERY_RESULTS_LIMIT + 5)):
        if i >= settings.VARIANT_QUERY_RESULTS_LIMIT:
            raise Exception(
                "ERROR: this search exceeded the %s variant result size limit. Please set additional filters and try again." % settings.VARIANT_QUERY_RESULTS_LIMIT)
        variant = Variant.fromJSON(variant_dict)
        variant.set_extra('project_id', project_id)
        variant.set_extra('family_id', family_id)
        self.add_annotations_to_variants([variant], project_id)
        # passes_variant_filter returns a tuple; [0] is the boolean verdict.
        if passes_variant_filter(variant, variant_filter)[0]:
            yield variant
def mendelian_variant_search_spec(request):
    """Serve cached Mendelian-search results for a family as JSON or CSV.

    On a cache miss the search is recomputed from the cached search spec.
    """
    project, family = get_project_and_family_for_user(request.user, request.GET)

    search_hash = request.GET.get('search_hash')
    search_spec_dict, cached_variants = cache_utils.get_cached_results(project.project_id, search_hash)
    search_spec = MendelianVariantSearchSpec.fromJSON(search_spec_dict)
    if cached_variants is None:
        # Cache miss: re-run the search from the stored spec.
        variants = api_utils.calculate_mendelian_variant_search(search_spec, family.xfamily())
    else:
        variants = [Variant.fromJSON(v) for v in cached_variants]
    add_extra_info_to_variants_family(get_reference(), family, variants)

    return_type = request.GET.get('return_type')
    if return_type == 'json' or not return_type:
        return JSONResponse({
            'is_error': False,
            'variants': [v.toJSON() for v in variants],
            'search_spec': search_spec_dict,
        })
    elif request.GET.get('return_type') == 'csv':
        response = HttpResponse(content_type='text/csv')
        response['Content-Disposition'] = 'attachment; filename="results_{}.csv"'.format(search_hash)
        writer = csv.writer(response)
        indiv_ids = family.indiv_ids_with_variant_data()
        writer.writerow(xbrowse_displays.get_variant_display_headers(
            get_mall(project.project_id), project, indiv_ids))
        for variant in variants:
            writer.writerow(xbrowse_displays.get_display_fields_for_variant(
                get_mall(project.project_id), project, variant, indiv_ids))
        return response
def get_variants_in_gene(self, project_id, family_id, gene_id, genotype_filter=None, variant_filter=None):
    """Yield a family's variants in gene_id, sorted by Variant.unique_tuple().

    A copy of variant_filter (or a fresh VariantFilter) is restricted to the
    gene before querying, so the caller's filter object is never mutated.
    Yields nothing if the family has no collection.
    """
    if variant_filter is None:
        modified_variant_filter = VariantFilter()
    else:
        modified_variant_filter = copy.deepcopy(variant_filter)
    modified_variant_filter.add_gene(gene_id)

    db_query = self._make_db_query(genotype_filter, modified_variant_filter)
    collection = self._get_family_collection(project_id, family_id)
    if not collection:
        return
    # we have to collect list in memory here because mongo can't sort on xpos,
    # as result size can get too big.
    # need to find a better way to do this.
    variants = []
    for variant_dict in collection.find(db_query).hint([('db_gene_ids', pymongo.ASCENDING), ('xpos', pymongo.ASCENDING)]):
        variant = Variant.fromJSON(variant_dict)
        self.add_annotations_to_variant(variant, project_id)
        # NOTE(review): unlike the get_variants implementations, this call does
        # not index [0]; if passes_variant_filter returns a (bool, info) tuple
        # this condition is always truthy — confirm intended.
        if passes_variant_filter(variant, modified_variant_filter):
            variants.append(variant)
    variants = sorted(variants, key=lambda v: v.unique_tuple())
    for v in variants:
        yield v
def get_variants_in_range(self, project_id, family_id, xpos_start, xpos_end):
    """Yield annotated variants whose xpos lies in [xpos_start, xpos_end].

    Raises:
        ValueError: if the family has no backing collection.
        Exception: if more than MONGO_QUERY_RESULTS_LIMIT variants match.
    """
    collection = self._get_family_collection(project_id, family_id)
    if not collection:
        raise ValueError("Family not found: " + str(family_id))
    range_query = {'$and': [{'xpos': {'$gte': xpos_start}}, {'xpos': {'$lte': xpos_end}}]}
    # Fetch a few rows past the cap so "exactly at the limit" can be told
    # apart from "over the limit".
    for i, variant_dict in enumerate(
            collection.find(range_query).limit(MONGO_QUERY_RESULTS_LIMIT + 5)):
        # Fix: was `i > MONGO_QUERY_RESULTS_LIMIT`, an off-by-one that let
        # LIMIT+1 results through without raising; `>=` enforces the cap
        # exactly, matching the sibling get_variants implementations.
        if i >= MONGO_QUERY_RESULTS_LIMIT:
            raise Exception(
                "ERROR: this search exceeded the %s variant result size limit. Please set additional filters and try again."
                % MONGO_QUERY_RESULTS_LIMIT)
        variant = Variant.fromJSON(variant_dict)
        self.add_annotations_to_variant(variant, project_id)
        yield variant
def get_project_variants_in_gene(self, project_id, gene_id, variant_filter=None):
    """Return project-wide variants in gene_id, sorted by unique_tuple().

    A deep copy of variant_filter (or a fresh VariantFilter) is restricted to
    the gene, so the caller's filter is untouched.
    """
    if variant_filter is None:
        modified_variant_filter = VariantFilter()
    else:
        modified_variant_filter = copy.deepcopy(variant_filter)
    modified_variant_filter.add_gene(gene_id)

    db_query = self._make_db_query(None, modified_variant_filter)
    # NOTE(review): this writes directly to stderr without a trailing newline,
    # while a sibling implementation uses logger.info — presumably debug
    # output; confirm whether it should be unified.
    sys.stderr.write("Project Gene Search: " + str(project_id) + " all variants query: " + str(db_query))
    collection = self._get_project_collection(project_id)
    # we have to collect list in memory here because mongo can't sort on xpos,
    # as result size can get too big.
    # need to find a better way to do this.
    variants = []
    for variant_dict in collection.find(db_query).hint([('db_gene_ids', pymongo.ASCENDING), ('xpos', pymongo.ASCENDING)]):
        variant = Variant.fromJSON(variant_dict)
        self.add_annotations_to_variant(variant, project_id)
        # NOTE(review): no [0] index on passes_variant_filter here, unlike the
        # get_variants implementations — confirm the return type.
        if passes_variant_filter(variant, modified_variant_filter):
            variants.append(variant)
    variants = sorted(variants, key=lambda v: v.unique_tuple())
    return variants
def mendelian_variant_search_spec(request):
    """Return cached Mendelian-search results for a family as JSON or CSV,
    recomputing the search on a cache miss."""
    project, family = get_project_and_family_for_user(request.user, request.GET)  # TODO: use form

    search_hash = request.GET.get('search_hash')
    spec_json, cached = cache_utils.get_cached_results(project.project_id, search_hash)
    search_spec = MendelianVariantSearchSpec.fromJSON(spec_json)
    variants = (
        api_utils.calculate_mendelian_variant_search(search_spec, family.xfamily())
        if cached is None
        else [Variant.fromJSON(v) for v in cached]
    )
    add_extra_info_to_variants_family(get_reference(), family, variants)

    return_type = request.GET.get('return_type')
    if return_type == 'json' or not return_type:
        payload = {
            'is_error': False,
            'variants': [v.toJSON() for v in variants],
            'search_spec': spec_json,
        }
        return JSONResponse(payload)
    elif request.GET.get('return_type') == 'csv':
        response = HttpResponse(content_type='text/csv')
        response['Content-Disposition'] = 'attachment; filename="results_{}.csv"'.format(search_hash)
        writer = csv.writer(response)
        indiv_ids = family.indiv_ids_with_variant_data()
        headers = xbrowse_displays.get_variant_display_headers(get_mall(project.project_id), project, indiv_ids)
        writer.writerow(headers)
        for variant in variants:
            fields = xbrowse_displays.get_display_fields_for_variant(get_mall(project.project_id), project, variant, indiv_ids)
            writer.writerow(fields)
        return response
def get_variants(self, project_id, family_id, genotype_filter=None, variant_filter=None):
    """Yield a family's variants matching the filters, sorted by xpos.

    Raises Exception when more than MONGO_QUERY_RESULTS_LIMIT rows match.
    """
    query = self._make_db_query(genotype_filter, variant_filter)
    collection = self._get_family_collection(project_id, family_id)
    if not collection:
        print("Error: mongodb collection not found for project %s family %s " % (project_id, family_id))
        return
    cursor = collection.find(query).sort('xpos').limit(MONGO_QUERY_RESULTS_LIMIT + 5)
    for idx, doc in enumerate(cursor):
        if idx >= MONGO_QUERY_RESULTS_LIMIT:
            raise Exception(
                "ERROR: this search exceeded the %s variant result size limit. Please set additional filters and try again." % MONGO_QUERY_RESULTS_LIMIT)
        candidate = Variant.fromJSON(doc)
        self.add_annotations_to_variant(candidate, project_id)
        verdict = passes_variant_filter(candidate, variant_filter)
        if verdict[0]:
            yield candidate
def get_variants_in_range(self, project_id, family_id, xpos_start, xpos_end):
    """Yield annotated variants with xpos in [xpos_start, xpos_end] (no result cap)."""
    collection = self._get_family_collection(project_id, family_id)
    if not collection:
        raise ValueError("Family not found: " + str(family_id))
    range_query = {'$and': [
        {'xpos': {'$gte': xpos_start}},
        {'xpos': {'$lte': xpos_end}},
    ]}
    for doc in collection.find(range_query):
        v = Variant.fromJSON(doc)
        self.add_annotations_to_variant(v, project_id)
        yield v
def get_single_variant(self, project_id, family_id, xpos, ref, alt):
    """Return the family's variant at (xpos, ref, alt), or None if absent.

    Fix: guard against a missing family collection. The sibling
    get_single_variant implementations return None in that case; this one
    previously called `.find_one` on None and raised AttributeError.
    """
    collection = self._get_family_collection(project_id, family_id)
    if not collection:
        return None
    variant = collection.find_one({'xpos': xpos, 'ref': ref, 'alt': alt})
    if variant:
        return Variant.fromJSON(variant)
    else:
        return None
def get_variants(self, project_id, family_id, genotype_filter=None, variant_filter=None):
    """Yield filtered family variants in xpos order (no result cap, no annotation)."""
    # NOTE: uses the module-level _make_db_query, not self._make_db_query.
    query = _make_db_query(genotype_filter, variant_filter)
    collection = self._get_family_collection(project_id, family_id)
    for doc in collection.find(query).sort('xpos'):
        candidate = Variant.fromJSON(doc)
        if passes_variant_filter(candidate, variant_filter)[0]:
            yield candidate
def get_variants_from_variant_tuples(project, variant_tuples, user=None):
    """Resolve (xpos, ref, alt, family_id) tuples into Variants, batched per family.

    Tuples missing from the datastore get a freshly-annotated stub tagged
    with the 'created_variant' extra.
    """
    datastore = get_datastore(project)
    population_slugs = project.get_reference_population_slugs()

    grouped = {}
    for xpos, ref, alt, family_id in variant_tuples:
        grouped.setdefault(family_id, []).append((xpos, ref, alt))

    results = []
    for family_id, family_tuples in grouped.items():
        fetched = datastore.get_multiple_variants(
            project.project_id, family_id, family_tuples, user=user
        )
        for (xpos, ref, alt), variant in zip(family_tuples, fetched):
            if not variant:
                variant = Variant(xpos, ref, alt)
                get_annotator().annotate_variant(variant, population_slugs)
                variant.set_extra('created_variant', True)
            variant.set_extra('family_id', family_id)
            variant.set_extra('project_id', project.project_id)
            results.append(variant)
    return results
def get_variants_from_variant_tuples(project, variant_tuples):
    """Look up each (xpos, ref, alt, family_id) tuple individually, creating
    annotated stubs for tuples the datastore does not contain."""
    results = []
    for xpos, ref, alt, family_id in variant_tuples:
        variant = get_datastore(project.project_id).get_single_variant(
            project.project_id, family_id, xpos, ref, alt
        )
        if not variant:
            variant = Variant(xpos, ref, alt)
            get_annotator().annotate_variant(variant, project.get_reference_population_slugs())
        variant.set_extra('family_id', family_id)
        variant.set_extra('project_id', project.project_id)
        results.append(variant)
    return results
def get_variants_cohort(self, project_id, cohort_id, variant_filter=None):
    """Yield a cohort's variants matching variant_filter, sorted by xpos.

    Raises Exception when more than settings.VARIANT_QUERY_RESULTS_LIMIT rows
    match.
    """
    db_query = self._make_db_query(None, variant_filter)
    collection = self._get_family_collection(project_id, cohort_id)
    # Over-fetch by 5 so hitting the cap is distinguishable from exactly
    # filling it.
    for i, variant in enumerate(collection.find(db_query).sort('xpos').limit(settings.VARIANT_QUERY_RESULTS_LIMIT + 5)):
        # Fix: was `i > LIMIT`, an off-by-one that yielded LIMIT+1 results
        # without raising; `>=` matches the other limited queries in this file.
        if i >= settings.VARIANT_QUERY_RESULTS_LIMIT:
            raise Exception("ERROR: this search exceeded the %s variant result size limit. Please set additional filters and try again." % settings.VARIANT_QUERY_RESULTS_LIMIT)
        yield Variant.fromJSON(variant)
def get_variants_cohort(self, project_id, cohort_id, variant_filter=None):
    """Yield a cohort's variants matching variant_filter, sorted by xpos.

    Raises Exception when more than MONGO_QUERY_RESULTS_LIMIT rows match.
    """
    db_query = self._make_db_query(None, variant_filter)
    collection = self._get_family_collection(project_id, cohort_id)
    # Over-fetch by 5 so hitting the cap is distinguishable from exactly
    # filling it.
    for i, variant in enumerate(collection.find(db_query).sort('xpos').limit(MONGO_QUERY_RESULTS_LIMIT + 5)):
        # Fix: was `i > LIMIT`, an off-by-one that yielded LIMIT+1 results
        # without raising; `>=` matches the other limited queries in this file.
        if i >= MONGO_QUERY_RESULTS_LIMIT:
            raise Exception("ERROR: this search exceeded the %s variant result size limit. Please set additional filters and try again." % MONGO_QUERY_RESULTS_LIMIT)
        yield Variant.fromJSON(variant)
def get_variants_in_range(self, project_id, family_id, xpos_start, xpos_end):
    """Yield annotated variants with xpos in [xpos_start, xpos_end].

    Raises:
        ValueError: if the family has no backing collection.
        Exception: if more than MONGO_QUERY_RESULTS_LIMIT variants match.
    """
    collection = self._get_family_collection(project_id, family_id)
    if not collection:
        raise ValueError("Family not found: " + str(family_id))
    range_query = {'$and': [{'xpos': {'$gte': xpos_start}}, {'xpos': {'$lte': xpos_end}}]}
    for i, variant_dict in enumerate(collection.find(range_query).limit(MONGO_QUERY_RESULTS_LIMIT + 5)):
        # Fix: was `i > LIMIT`, an off-by-one that let LIMIT+1 results through
        # without raising; `>=` enforces the cap exactly, consistent with the
        # get_variants implementations.
        if i >= MONGO_QUERY_RESULTS_LIMIT:
            raise Exception("ERROR: this search exceeded the %s variant result size limit. Please set additional filters and try again." % MONGO_QUERY_RESULTS_LIMIT)
        variant = Variant.fromJSON(variant_dict)
        self.add_annotations_to_variant(variant, project_id)
        yield variant
def get_single_variant(self, project_id, family_id, xpos, ref, alt, user=None):
    """Return the family's annotated variant at (xpos, ref, alt), or None.

    `user` is accepted for interface compatibility but unused here.
    """
    collection = self._get_family_collection(project_id, family_id)
    if not collection:
        return None
    doc = collection.find_one({'xpos': xpos, 'ref': ref, 'alt': alt})
    if not doc:
        return None
    variant = Variant.fromJSON(doc)
    self.add_annotations_to_variants([variant], project_id, family_id=family_id)
    return variant
def get_single_variant(self, project_id, family_id, xpos, ref, alt):
    """Return the family's annotated variant at (xpos, ref, alt), or None."""
    collection = self._get_family_collection(project_id, family_id)
    if not collection:
        return None
    doc = collection.find_one({'xpos': xpos, 'ref': ref, 'alt': alt})
    if not doc:
        return None
    found = Variant.fromJSON(doc)
    self.add_annotations_to_variant(found, project_id)
    return found
def get_variants(self, project_id, family_id, genotype_filter=None, variant_filter=None):
    """Yield filtered, annotated family variants in xpos order (no result cap)."""
    query = self._make_db_query(genotype_filter, variant_filter)
    collection = self._get_family_collection(project_id, family_id)
    if not collection:
        print("Error: mongodb collection not found for project %s family %s " % (project_id, family_id))
        return
    for doc in collection.find(query).sort('xpos'):
        candidate = Variant.fromJSON(doc)
        self.add_annotations_to_variant(candidate, project_id)
        if passes_variant_filter(candidate, variant_filter)[0]:
            yield candidate
def get_variants(self, project_id, variant_filter=None):
    """Yield all project variants in xpos order, optionally filtered.

    Bug fix: when variant_filter was None the original yielded each variant
    TWICE — once from the `if variant_filter is None: yield` branch (which did
    not continue) and once more from the unconditional passes_variant_filter
    check that followed. Each variant is now yielded at most once.
    """
    # Wrap the raw filter dict in a VariantFilter for passes_variant_filter.
    variant_filter_t = VariantFilter(**(variant_filter if variant_filter else {}))
    db_query = self._make_db_query(None, variant_filter)
    collection = self._get_project_collection(project_id)
    for variant_dict in collection.find(db_query).sort('xpos'):
        variant = Variant.fromJSON(variant_dict)
        if variant_filter is None or passes_variant_filter(variant, variant_filter_t)[0]:
            yield variant
def get_variants(self, project_id, family_id, genotype_filter=None, variant_filter=None):
    """Yield filtered, annotated family variants sorted by xpos, capped at
    MONGO_QUERY_RESULTS_LIMIT (raises Exception past the cap)."""
    db_query = self._make_db_query(genotype_filter, variant_filter)
    collection = self._get_family_collection(project_id, family_id)
    if not collection:
        print("Error: mongodb collection not found for project %s family %s " % (project_id, family_id))
        return
    limited_cursor = collection.find(db_query).sort('xpos').limit(MONGO_QUERY_RESULTS_LIMIT + 5)
    for position, raw in enumerate(limited_cursor):
        if position >= MONGO_QUERY_RESULTS_LIMIT:
            raise Exception("ERROR: this search exceeded the %s variant result size limit. Please set additional filters and try again." % MONGO_QUERY_RESULTS_LIMIT)
        parsed = Variant.fromJSON(raw)
        self.add_annotations_to_variant(parsed, project_id)
        if passes_variant_filter(parsed, variant_filter)[0]:
            yield parsed
def get_variants(self, project_id, family_id, genotype_filter=None, variant_filter=None, quality_filter=None, indivs_to_consider=None, user=None):
    """Yield variants for a family matching the filters, sorted by xpos.

    Capped at settings.VARIANT_QUERY_RESULTS_LIMIT (raises Exception past the
    cap). quality_filter, indivs_to_consider and user are accepted for
    interface compatibility but unused in this implementation.
    """
    db_query = self._make_db_query(genotype_filter, variant_filter)
    collection = self._get_family_collection(project_id, family_id)
    if not collection:
        logger.error("Error: mongodb collection not found for project %s family %s " % (project_id, family_id))
        return
    # The query is expressed as an $and of single-key clauses; over-fetch by 5
    # to detect "more than LIMIT results" reliably.
    for i, variant_dict in enumerate(collection.find({'$and' : [{k: v} for k, v in db_query.items()]}).sort('xpos').limit(settings.VARIANT_QUERY_RESULTS_LIMIT+5)):
        if i >= settings.VARIANT_QUERY_RESULTS_LIMIT:
            raise Exception("ERROR: this search exceeded the %s variant result size limit. Please set additional filters and try again." % settings.VARIANT_QUERY_RESULTS_LIMIT)
        variant = Variant.fromJSON(variant_dict)
        variant.set_extra('project_id', project_id)
        variant.set_extra('family_id', family_id)
        self.add_annotations_to_variants([variant], project_id)
        # passes_variant_filter returns a tuple; [0] is the boolean verdict.
        if passes_variant_filter(variant, variant_filter)[0]:
            yield variant
def get_variants_in_gene(self, project_id, gene_id, variant_filter=None):
    """Return project-wide variants in gene_id, sorted by unique_tuple().

    A copy of variant_filter (or a fresh VariantFilter) is restricted to the
    gene before querying, so the caller's filter is never mutated. Variants
    are NOT annotated by this implementation.
    """
    if variant_filter is None:
        modified_variant_filter = VariantFilter()
    else:
        modified_variant_filter = copy.deepcopy(variant_filter)
    modified_variant_filter.add_gene(gene_id)

    db_query = self._make_db_query(None, modified_variant_filter)
    collection = self._get_project_collection(project_id)
    # Collected in memory: the hinted (gene_ids, xpos) index scan does not
    # return documents in the final sort order.
    variants = []
    for variant_dict in collection.find(db_query).hint([('gene_ids', pymongo.ASCENDING), ('xpos', pymongo.ASCENDING)]):
        variant = Variant.fromJSON(variant_dict)
        # NOTE(review): no [0] index on passes_variant_filter here, unlike the
        # get_variants implementations; if it returns a tuple this condition
        # is always truthy — confirm intended.
        if passes_variant_filter(variant, modified_variant_filter):
            variants.append(variant)
    variants = sorted(variants, key=lambda v: v.unique_tuple())
    return variants
def get_project_variants_in_gene(self, project_id, gene_id, variant_filter=None):
    """Return annotated, filtered project-wide variants in gene_id, sorted by
    unique_tuple()."""
    if variant_filter is None:
        gene_filter = VariantFilter()
    else:
        gene_filter = copy.deepcopy(variant_filter)
    gene_filter.add_gene(gene_id)

    db_query = self._make_db_query(None, gene_filter)
    logger.info("Project Gene Search: " + str(project_id) + " all variants query: " + str(db_query))
    collection = self._get_project_collection(project_id)

    # Results are accumulated in memory because the hinted index scan does not
    # return documents in the final sort order.
    cursor = collection.find(db_query).hint([('db_gene_ids', pymongo.ASCENDING), ('xpos', pymongo.ASCENDING)])
    variants = [Variant.fromJSON(doc) for doc in cursor]
    self.add_annotations_to_variants(variants, project_id)
    surviving = [v for v in variants if passes_variant_filter(v, gene_filter)]
    return sorted(surviving, key=lambda v: v.unique_tuple())
def cohort_variant_search_spec(request):
    """Return cached cohort-search results as JSON, recomputing on a cache miss."""
    project, cohort = get_project_and_cohort_for_user(request.user, request.GET)  # TODO: use form

    spec_json, cached = cache_utils.get_cached_results(
        project.project_id, request.GET.get('search_hash'))
    spec = MendelianVariantSearchSpec.fromJSON(spec_json)
    if cached is None:
        # Nothing cached under this hash: run the search again.
        variant_list = api_utils.calculate_mendelian_variant_search(spec, cohort.xfamily())
    else:
        variant_list = [Variant.fromJSON(v) for v in cached]
    api_utils.add_extra_info_to_variants_cohort(get_reference(), cohort, variant_list)

    return JSONResponse({
        'is_error': False,
        'variants': [v.toJSON() for v in variant_list],
        'search_spec': spec.toJSON(),
    })
def get_variants_from_variant_tuples(project, variant_tuples):
    """Look up each (xpos, ref, alt, family_id) tuple in the datastore,
    creating annotated stub Variants for misses."""
    out = []
    for xpos, ref, alt, family_id in variant_tuples:
        variant = get_datastore(project.project_id).get_single_variant(
            project.project_id, family_id, xpos, ref, alt)
        if not variant:
            # Not stored: build a stub and annotate against the reference populations.
            variant = Variant(xpos, ref, alt)
            get_annotator().annotate_variant(
                variant, project.get_reference_population_slugs())
        variant.set_extra('family_id', family_id)
        variant.set_extra('project_id', project.project_id)
        out.append(variant)
    return out
def get_variants_in_gene(self, project_id, family_id, gene_id, genotype_filter=None, variant_filter=None):
    """Yield a family's variants in gene_id, sorted by unique_tuple().

    A copy of variant_filter (or a fresh VariantFilter) is restricted to the
    gene, so the caller's filter object is never mutated. Variants are NOT
    annotated by this implementation.
    """
    if variant_filter is None:
        modified_variant_filter = VariantFilter()
    else:
        modified_variant_filter = copy.deepcopy(variant_filter)
    modified_variant_filter.add_gene(gene_id)

    # Uses the module-level _make_db_query (not self._make_db_query).
    db_query = _make_db_query(genotype_filter, modified_variant_filter)
    collection = self._get_family_collection(project_id, family_id)
    # we have to collect list in memory here because mongo can't sort on xpos,
    # as result size can get too big.
    # need to find a better way to do this.
    variants = []
    for variant_dict in collection.find(db_query).hint([('gene_ids', pymongo.ASCENDING), ('xpos', pymongo.ASCENDING)]):
        variant = Variant.fromJSON(variant_dict)
        # NOTE(review): no [0] index on passes_variant_filter here, unlike the
        # get_variants implementations — confirm the return type.
        if passes_variant_filter(variant, modified_variant_filter):
            variants.append(variant)
    variants = sorted(variants, key=lambda v: v.unique_tuple())
    for v in variants:
        yield v
def get_variants(self, project_id, family_id, genotype_filter=None, variant_filter=None):
    """Yield filtered, annotated family variants sorted by xpos, printing
    per-stage counters to stderr after the generator is exhausted.

    Capped at MONGO_QUERY_RESULTS_LIMIT (raises Exception past the cap).
    """
    db_query = self._make_db_query(genotype_filter, variant_filter)
    collection = self._get_family_collection(project_id, family_id)
    if not collection:
        print("Error: mongodb collection not found for project %s family %s " % (project_id, family_id))
        return
    # Track how many docs the query returned vs. how many survive the filter.
    counters = OrderedDict([('returned_by_query', 0), ('passes_variant_filter', 0)])
    for i, variant_dict in enumerate(collection.find({'$and' : [{k: v} for k, v in db_query.items()]}).sort('xpos').limit(MONGO_QUERY_RESULTS_LIMIT+5)):
        if i >= MONGO_QUERY_RESULTS_LIMIT:
            raise Exception("ERROR: this search exceeded the %s variant result size limit. Please set additional filters and try again." % MONGO_QUERY_RESULTS_LIMIT)
        variant = Variant.fromJSON(variant_dict)
        self.add_annotations_to_variant(variant, project_id)
        counters["returned_by_query"] += 1
        # passes_variant_filter returns a tuple; [0] is the boolean verdict.
        if passes_variant_filter(variant, variant_filter)[0]:
            counters["passes_variant_filter"] += 1
            yield variant
    # NOTE: this runs only if the caller fully consumes the generator.
    for k, v in counters.items():
        sys.stderr.write(" %s: %s\n" % (k,v))
def get_elasticsearch_variants( self, project_id, family_id=None, variant_filter=None, genotype_filter=None, variant_id_filter=None, quality_filter=None, indivs_to_consider=None, include_all_consequences=False, user=None, max_results_limit=settings.VARIANT_QUERY_RESULTS_LIMIT, ): from xbrowse_server.base.models import Project, Family, Individual from seqr.models import Sample from seqr.utils.es_utils import _liftover_grch38_to_grch37 from xbrowse_server.mall import get_reference redis_client = None if settings.REDIS_SERVICE_HOSTNAME: try: redis_client = redis.StrictRedis(host=settings.REDIS_SERVICE_HOSTNAME, socket_connect_timeout=3) redis_client.ping() except redis.exceptions.TimeoutError as e: logger.warn("Unable to connect to redis host: {}".format(settings.REDIS_SERVICE_HOSTNAME) + str(e)) redis_client = None cache_key = "Variants___%s___%s___%s" % ( project_id, family_id, json.dumps([ variant_filter.toJSON() if variant_filter else None, genotype_filter, quality_filter, variant_id_filter, indivs_to_consider, include_all_consequences, ]) ) cached_results = redis_client and redis_client.get(cache_key) if cached_results is not None: variant_results = json.loads(cached_results) return [Variant.fromJSON(variant_json) for variant_json in variant_results] if family_id is None: project = Project.objects.get(project_id=project_id) elasticsearch_index = project.get_elasticsearch_index() logger.info("Searching in project elasticsearch index: " + str(elasticsearch_index)) else: family = Family.objects.get(project__project_id=project_id, family_id=family_id) elasticsearch_index = family.get_elasticsearch_index() project = family.project logger.info("Searching in family elasticsearch index: " + str(elasticsearch_index)) if indivs_to_consider is None and genotype_filter and not family_id: indivs_to_consider = genotype_filter.keys() individuals = Individual.objects.filter(family__project__project_id=project_id).only("indiv_id", "seqr_individual") if indivs_to_consider: 
individuals = individuals.filter(indiv_id__in=indivs_to_consider) if family_id is not None: individuals = individuals.filter(family__family_id=family_id) if not indivs_to_consider: indivs_to_consider = [i.indiv_id for i in individuals] prefetch_related_objects(individuals, "seqr_individual") es_indices = [index.rstrip('*') for index in elasticsearch_index.split(',')] samples = Sample.objects.filter( individual__in=[i.seqr_individual for i in individuals if i.seqr_individual], dataset_type=Sample.DATASET_TYPE_VARIANT_CALLS, sample_status=Sample.SAMPLE_STATUS_LOADED, elasticsearch_index__startswith=es_indices[0], loaded_date__isnull=False, ).order_by('-loaded_date') prefetch_related_objects(samples, "individual") family_individual_ids_to_sample_ids = {} for i in individuals: indiv_id = i.indiv_id sample_id = None if i.seqr_individual: sample_id = next(( sample.sample_id for sample in samples if sample.individual == i.seqr_individual and sample.elasticsearch_index.startswith(tuple(es_indices)) ), None) family_individual_ids_to_sample_ids[indiv_id] = sample_id or indiv_id query_json = self._make_db_query(genotype_filter, variant_filter) es_client = elasticsearch.Elasticsearch(host=settings.ELASTICSEARCH_SERVICE_HOSTNAME, timeout=30) mapping = es_client.indices.get_mapping(str(elasticsearch_index) + "*") index_fields = {} is_parent_child = False is_nested = False if elasticsearch_index in mapping and 'join_field' in mapping[elasticsearch_index]["mappings"]["variant"]["properties"]: # Nested indices are not sharded so all samples are in the single index logger.info("matching indices: " + str(elasticsearch_index)) is_parent_child = True elif elasticsearch_index in mapping and 'genotypes' in mapping[elasticsearch_index]["mappings"]["variant"]["properties"]: # Nested indices are not sharded so all samples are in the single index logger.info("matching indices: " + str(elasticsearch_index)) is_nested = True elif family_id is not None and 
len(family_individual_ids_to_sample_ids) > 0: # figure out which index to use # TODO add caching matching_indices = [] for raw_sample_id in family_individual_ids_to_sample_ids.values(): sample_id = _encode_name(raw_sample_id) for index_name, index_mapping in mapping.items(): if sample_id+"_num_alt" in index_mapping["mappings"]["variant"]["properties"]: matching_indices.append(index_name) index_fields.update(index_mapping["mappings"]["variant"]["properties"]) if len(matching_indices) > 0: break if not matching_indices: if family_id is not None and not family_individual_ids_to_sample_ids: logger.error("no individuals found for family %s" % (family_id)) elif not mapping: logger.error("no es mapping found for found with prefix %s" % (elasticsearch_index)) else: logger.error("%s not found in %s:\n%s" % (indiv_id, elasticsearch_index, pformat(index_mapping["mappings"]["variant"]["properties"]))) else: elasticsearch_index = ",".join(matching_indices) logger.info("matching indices: " + str(elasticsearch_index)) else: elasticsearch_index = str(elasticsearch_index)+"*" if not index_fields: for index_mapping in mapping.values(): index_fields.update(index_mapping["mappings"]["variant"]["properties"]) s = elasticsearch_dsl.Search(using=es_client, index=elasticsearch_index) #",".join(indices)) if variant_id_filter is not None: variant_id_filter_term = None for variant_id in variant_id_filter: q_obj = Q('term', **{"variantId": variant_id}) if variant_id_filter_term is None: variant_id_filter_term = q_obj else: variant_id_filter_term |= q_obj s = s.filter(variant_id_filter_term) genotype_filters = {} for key, value in query_json.items(): if key.startswith("genotypes"): indiv_id = ".".join(key.split(".")[1:-1]) sample_id = family_individual_ids_to_sample_ids.get(indiv_id) or indiv_id genotype_filter = value if type(genotype_filter) == int or type(genotype_filter) == basestring: genotype_filters[sample_id] = [('term', genotype_filter)] elif '$gte' in genotype_filter: genotype_filter 
= {k.replace("$", ""): v for k, v in genotype_filter.items()} genotype_filters[sample_id] = [('range', genotype_filter)] elif "$in" in genotype_filter: num_alt_values = genotype_filter['$in'] genotype_filters[sample_id] = [('term', num_alt_value) for num_alt_value in num_alt_values] sample_ids = [family_individual_ids_to_sample_ids.get(indiv_id) or indiv_id for indiv_id in (indivs_to_consider or [])] min_ab = None min_gq = None if quality_filter is not None and indivs_to_consider: min_ab = quality_filter.get('min_ab') if min_ab is not None and not is_nested: min_ab /= 100.0 # convert to fraction min_gq = quality_filter.get('min_gq') vcf_filter = quality_filter.get('vcf_filter') if vcf_filter is not None: s = s.filter(~Q('exists', field='filters')) if is_parent_child: quality_q = Q() if min_ab or min_gq: if min_ab is not None: # AB only relevant for hets quality_q &= Q(~Q('term', num_alt=1) | Q('range', ab={'gte': min_ab})) if min_gq is not None: quality_q &= Q('range', gq={'gte': min_gq}) if genotype_filters: # Return inner hits for all requested samples, even those without a specified genotype genotype_sample_ids = sample_ids or genotype_filters.keys() genotype_q = None for sample_id in genotype_sample_ids: sample_q = Q(Q('term', sample_id=sample_id) & quality_q) if genotype_filters.get(sample_id): q = None for (op, val) in genotype_filters[sample_id]: if q: q |= Q(op, num_alt=val) else: q = Q(op, num_alt=val) sample_q &= q if not genotype_q: genotype_q = sample_q else: genotype_q |= sample_q genotype_kwargs = {'query': genotype_q, 'min_children': len(genotype_sample_ids)} elif sample_ids: # Subquery for child docs with the requested sample IDs and quality metrics sample_id_q = Q('terms', sample_id=sample_ids) & quality_q # Only return variants where at least one of the requested samples has an alt allele s = s.filter(Q('has_child', type='genotype', query=(Q(Q('range', num_alt={'gte': 1}) & sample_id_q)))) # Return inner hits for all the requested samples 
regardless of genotype genotype_kwargs = {'query': sample_id_q, 'min_children': len(sample_ids)} else: # Return all inner hits for the variant # This case is only used by gene search, which also does not use quality filters genotype_kwargs = {'query': Q()} s = s.filter(Q('has_child', type='genotype', inner_hits={'size': genotype_kwargs.get('min_children', MAX_INNER_HITS)}, **genotype_kwargs)) if is_nested: if sample_ids and min_ab is not None: min_ab_filter_val = int(min_ab) - int(min_ab % 5) for sample_id in sample_ids: q = Q('term', samples_ab_0_to_5=sample_id) for i in range(5, min_ab_filter_val, 5): q = q | Q('term', **{'samples_ab_{}_to_{}'.format(i, i+5): sample_id}) # AB only relevant for hets s = s.filter(~Q(q) | ~Q('term', samples_num_alt_1=sample_id)) if sample_ids and min_gq is not None: min_gq_filter_val = int(min_gq) - int(min_gq % 5) for sample_id in sample_ids: q = Q('term', samples_gq_0_to_5=sample_id) for i in range(5, min_gq_filter_val, 5): q = q | Q('term', **{'samples_gq_{}_to_{}'.format(i, i+5): sample_id}) s = s.filter(~Q(q)) if genotype_filters: for sample_id, queries in genotype_filters.items(): if queries[0][0] == 'range': allowed_num_alt = range(queries[0][1]['gte'], 3) else: allowed_num_alt = [query[1] for query in queries] if 0 in allowed_num_alt: q = Q('term', samples_no_call=sample_id) if 1 not in allowed_num_alt: q = q | Q('term', samples_num_alt_1=sample_id) if 2 not in allowed_num_alt: q = q | Q('term', samples_num_alt_2=sample_id) s = s.filter(~q) else: q = Q('term', **{'samples_num_alt_{}'.format(allowed_num_alt[0]): sample_id}) for num_alt in allowed_num_alt[1:]: q = q | Q('term', **{'samples_num_alt_{}'.format(num_alt): sample_id}) s = s.filter(q) elif sample_ids: s = s.filter(Q('terms', samples_num_alt_1=sample_ids) | Q('terms', samples_num_alt_2=sample_ids)) else: for sample_id, queries in genotype_filters.items(): encoded_sample_id = _encode_name(sample_id) q = Q(queries[0][0], **{encoded_sample_id + "_num_alt": 
queries[0][1]}) for (op, val) in queries[1:]: q = q | Q(op, **{encoded_sample_id + "_num_alt": val}) s = s.filter(q) if sample_ids: atleast_one_nonref_genotype_filter = None for sample_id in sample_ids: encoded_sample_id = _encode_name(sample_id) q = Q('range', **{encoded_sample_id+"_num_alt": {'gte': 1}}) if atleast_one_nonref_genotype_filter is None: atleast_one_nonref_genotype_filter = q else: atleast_one_nonref_genotype_filter |= q s = s.filter(atleast_one_nonref_genotype_filter) if min_ab or min_gq: for sample_id in sample_ids: encoded_sample_id = _encode_name(sample_id) if min_ab: s = s.filter( ~Q('term', **{encoded_sample_id+"_num_alt": 1}) | Q('range', **{encoded_sample_id+"_ab": {'gte': min_ab}})) #logger.info("### ADDED FILTER: " + str({encoded_sample_id+"_ab": {'gte': min_ab}})) if min_gq: s = s.filter('range', **{encoded_sample_id+"_gq": {'gte': min_gq}}) #logger.info("### ADDED FILTER: " + str({encoded_sample_id+"_gq": {'gte': min_gq}})) # parse variant query annotation_groups_map = ANNOTATION_GROUPS_MAP_INTERNAL if user and user.is_staff else ANNOTATION_GROUPS_MAP for key, value in query_json.items(): if key == 'db_tags': so_annotations = query_json.get('db_tags', {}).get('$in', []) # handle clinvar filters selected_so_annotations_set = set(so_annotations) all_clinvar_filters_set = set(annotation_groups_map.get("clinvar", {}).get("children", [])) selected_clinvar_filters_set = all_clinvar_filters_set & selected_so_annotations_set all_hgmd_filters_set = set(annotation_groups_map.get("hgmd", {}).get("children", [])) selected_hgmd_filters_set = all_hgmd_filters_set & selected_so_annotations_set vep_consequences = list(selected_so_annotations_set - selected_clinvar_filters_set - selected_hgmd_filters_set) consequences_filter = Q("terms", transcriptConsequenceTerms=vep_consequences) if selected_clinvar_filters_set: clinvar_clinical_significance_terms = set() for clinvar_filter in selected_clinvar_filters_set: # translate selected filters to the 
corresponding clinvar clinical consequence terms if clinvar_filter == "pathogenic": clinvar_clinical_significance_terms.update(["Pathogenic", "Pathogenic/Likely_pathogenic"]) elif clinvar_filter == "likely_pathogenic": clinvar_clinical_significance_terms.update(["Likely_pathogenic", "Pathogenic/Likely_pathogenic"]) elif clinvar_filter == "benign": clinvar_clinical_significance_terms.update(["Benign", "Benign/Likely_benign"]) elif clinvar_filter == "likely_benign": clinvar_clinical_significance_terms.update(["Likely_benign", "Benign/Likely_benign"]) elif clinvar_filter == "vus_or_conflicting": clinvar_clinical_significance_terms.update([ "Conflicting_interpretations_of_pathogenicity", "Uncertain_significance", "not_provided", "other"]) else: raise ValueError("Unexpected clinvar filter: " + str(clinvar_filter)) consequences_filter = consequences_filter | Q("terms", clinvar_clinical_significance=list(clinvar_clinical_significance_terms)) if selected_hgmd_filters_set: hgmd_class = set() for hgmd_filter in selected_hgmd_filters_set: # translate selected filters to the corresponding hgmd clinical consequence terms if hgmd_filter == "disease_causing": hgmd_class.update(["DM"]) elif hgmd_filter == "likely_disease_causing": hgmd_class.update(["DM?"]) elif hgmd_filter == "hgmd_other": hgmd_class.update(["DP", "DFP", "FP", "FTV"]) else: raise ValueError("Unexpected hgmd filter: " + str(hgmd_filter)) consequences_filter = consequences_filter | Q("terms", hgmd_class=list(hgmd_class)) if 'intergenic_variant' in vep_consequences: # for many intergenic variants VEP doesn't add any annotations, so if user selected 'intergenic_variant', also match variants where transcriptConsequenceTerms is emtpy consequences_filter = consequences_filter | ~Q('exists', field='transcriptConsequenceTerms') s = s.filter(consequences_filter) #logger.info("==> transcriptConsequenceTerms: %s" % str(vep_consequences)) if key.startswith("genotypes"): continue if key == "db_gene_ids": db_gene_ids = 
query_json.get('db_gene_ids', {}) exclude_genes = db_gene_ids.get('$nin', []) gene_ids = exclude_genes or db_gene_ids.get('$in', []) if exclude_genes: s = s.exclude("terms", geneIds=gene_ids) else: s = s.filter("terms", geneIds=gene_ids) #logger.info("==> %s %s" % ("exclude" if exclude_genes else "include", "geneIds: " + str(gene_ids))) if key == "$or" and type(value) == list: q_terms = None for region_filter in value: xpos_filters = region_filter.get("$and", {}) # for example: $or : [{'$and': [{'xpos': {'$gte': 12345}}, {'xpos': {'$lte': 54321}}]}] xpos_filters_dict = {} for xpos_filter in xpos_filters: xpos_filter_setting = xpos_filter["xpos"] # for example {'$gte': 12345} or {'$lte': 54321} xpos_filters_dict.update(xpos_filter_setting) xpos_filter_setting = {k.replace("$", ""): v for k, v in xpos_filters_dict.items()} q = Q('range', **{"xpos": xpos_filter_setting}) if q_terms is None: q_terms = q else: q_terms |= q if q_terms is not None: s = s.filter(q_terms) #logger.info("==> xpos range: " + str({"xpos": xpos_filter_setting})) af_key_map = { "db_freqs.AF": ["AF"], "db_freqs.1kg_wgs_phase3": ["g1k_POPMAX_AF"], "db_freqs.exac_v3": ["exac_AF_POPMAX"], "db_freqs.topmed": ["topmed_AF"], "db_freqs.gnomad_exomes": ["gnomad_exomes_AF_POPMAX", "gnomad_exomes_AF_POPMAX_OR_GLOBAL"], "db_freqs.gnomad_genomes": ["gnomad_genomes_AF_POPMAX", "gnomad_genomes_AF_POPMAX_OR_GLOBAL"], "db_freqs.gnomad-exomes2": ["gnomad_exomes_AF_POPMAX", "gnomad_exomes_AF_POPMAX_OR_GLOBAL"], "db_freqs.gnomad-genomes2": ["gnomad_genomes_AF_POPMAX", "gnomad_genomes_AF_POPMAX_OR_GLOBAL"], } if key in af_key_map: for filter_key in af_key_map[key]: af_filter_setting = {k.replace("$", ""): v for k, v in value.items()} s = s.filter(Q('range', **{filter_key: af_filter_setting}) | ~Q('exists', field=filter_key)) #logger.info("==> %s: %s" % (filter_key, af_filter_setting)) ac_key_map = { "db_acs.AF": "AC", "db_acs.1kg_wgs_phase3": "g1k_AC", "db_acs.exac_v3": "exac_AC", "db_acs.topmed": "topmed_AC", 
"db_acs.gnomad_exomes": "gnomad_exomes_AC", "db_acs.gnomad_genomes": "gnomad_genomes_AC", "db_acs.gnomad-exomes2": "gnomad_exomes_AC", "db_acs.gnomad-genomes2": "gnomad_genomes_AC", } if key in ac_key_map: filter_key = ac_key_map[key] ac_filter_setting = {k.replace("$", ""): v for k, v in value.items()} s = s.filter(Q('range', **{filter_key: ac_filter_setting}) | ~Q('exists', field=filter_key)) hemi_key_map = { "db_hemi.exac_v3": "exac_AC_Hemi", "db_hemi.gnomad_exomes": "gnomad_exomes_Hemi", "db_hemi.gnomad_genomes": "gnomad_genomes_Hemi", "db_hemi.gnomad-exomes2": "gnomad_exomes_Hemi", "db_hemi.gnomad-genomes2": "gnomad_genomes_Hemi", } if key in hemi_key_map: filter_key = hemi_key_map[key] hemi_filter_setting = {k.replace("$", ""): v for k, v in value.items()} s = s.filter(Q('range', **{filter_key: hemi_filter_setting}) | ~Q('exists', field=filter_key)) hom_key_map = { "db_hom.exac_v3": "exac_AC_Hom", "db_hom.gnomad_exomes": "gnomad_exomes_Hom", "db_hom.gnomad_genomes": "gnomad_genomes_Hom", "db_hom.gnomad-exomes2": "gnomad_exomes_Hom", "db_hom.gnomad-genomes2": "gnomad_genomes_Hom", } if key in hom_key_map: filter_key = hom_key_map[key] hom_filter_setting = {k.replace("$", ""): v for k, v in value.items()} s = s.filter(Q('range', **{filter_key: hom_filter_setting}) | ~Q('exists', field=filter_key)) #s = s.sort("xpos") #logger.info("=====") #logger.info("FULL QUERY OBJ: " + pformat(s.__dict__)) #logger.info("FILTERS: " + pformat(s.to_dict())) # https://elasticsearch-py.readthedocs.io/en/master/helpers.html#elasticsearch.helpers.scan start = time.time() s = s.params(size=max_results_limit + 1) #if not include_all_consequences: # s = s.source(exclude=["sortedTranscriptConsequences"]) response = s.execute() logger.info("=====") logger.info("TOTAL: %s. Query took %s seconds" % (response.hits.total, time.time() - start)) if response.hits.total > max_results_limit + 1: raise Exception("This search matched too many variants. 
Please set additional filters and try again.") #print(pformat(response.to_dict())) project = Project.objects.get(project_id=project_id) #gene_list_map = project.get_gene_list_map() reference = get_reference() #for i, hit in enumerate(response.hits): variant_results = [] for i, hit in enumerate(response): # preserve_order=True #logger.info("HIT %s: %s %s %s" % (i, hit["variantId"], hit["geneIds"], pformat(hit.__dict__))) #print("HIT %s: %s" % (i, pformat(hit.to_dict()))) filters = ",".join(hit["filters"] or []) if "filters" in hit else "" genotypes = {} all_num_alt = [] if is_parent_child: genotypes_by_sample_id = {gen_hit['sample_id']: gen_hit for gen_hit in hit.meta.inner_hits.genotype} elif is_nested: genotypes_by_sample_id = {gen_hit['sample_id']: gen_hit for gen_hit in hit['genotypes']} for individual_id, sample_id in family_individual_ids_to_sample_ids.items(): def _get_hit_field(field): if is_parent_child or is_nested: gen_hit = genotypes_by_sample_id.get(sample_id, {}) key = field else: gen_hit = hit key = '{}_{}'.format(_encode_name(sample_id), field) return gen_hit[key] if key in gen_hit else None num_alt = _get_hit_field('num_alt') if num_alt is None: num_alt = -1 all_num_alt.append(num_alt) alleles = [] if num_alt == 0: alleles = [hit["ref"], hit["ref"]] elif num_alt == 1: alleles = [hit["ref"], hit["alt"]] elif num_alt == 2: alleles = [hit["alt"], hit["alt"]] elif num_alt == -1 or num_alt == None: alleles = [] else: raise ValueError("Invalid num_alt: " + str(num_alt)) genotypes[individual_id] = { 'ab': _get_hit_field('ab'), 'alleles': map(str, alleles), 'extras': { 'ad': _get_hit_field('ad'), 'dp': _get_hit_field('dp'), #'pl': '', }, 'filter': filters or "pass", 'gq': _get_hit_field('gq') or '', 'num_alt': num_alt, } vep_annotation = hit['sortedTranscriptConsequences'] if 'sortedTranscriptConsequences' in hit else None if vep_annotation is not None: if is_parent_child or is_nested: vep_annotation = [annot.to_dict() for annot in vep_annotation] else: 
vep_annotation = json.loads(str(vep_annotation)) gene_ids = list(hit['geneIds'] or []) worst_vep_index_per_gene = { gene_id: next((i for i, annot in enumerate(vep_annotation) if annot['gene_id'] == gene_id), None) for gene_id in gene_ids } if project.genome_version == GENOME_VERSION_GRCh37: grch38_coord = None if self.liftover_grch37_to_grch38: grch38_coord = self.liftover_grch37_to_grch38.convert_coordinate("chr%s" % hit["contig"].replace("chr", ""), int(hit["start"])) if grch38_coord and grch38_coord[0]: grch38_coord = "%s-%s-%s-%s "% (grch38_coord[0][0], grch38_coord[0][1], hit["ref"], hit["alt"]) else: grch38_coord = None else: grch38_coord = hit["variantId"] if project.genome_version == GENOME_VERSION_GRCh38: grch37_coord = None liftover_grch38_to_grch37 = _liftover_grch38_to_grch37() if liftover_grch38_to_grch37: grch37_coord = liftover_grch38_to_grch37.convert_coordinate("chr%s" % hit["contig"].replace("chr", ""), int(hit["start"])) if grch37_coord and grch37_coord[0]: grch37_coord = "%s-%s-%s-%s "% (grch37_coord[0][0], grch37_coord[0][1], hit["ref"], hit["alt"]) else: grch37_coord = None else: grch37_coord = hit["variantId"] freq_fields = { 'AF': "AF" if "AF" in index_fields else None, '1kg_wgs_AF': "g1k_AF" if "g1k_AF" in index_fields else None, '1kg_wgs_popmax_AF': "g1k_POPMAX_AF" if "g1k_POPMAX_AF" in index_fields else None, 'exac_v3_AF': "exac_AF" if "exac_AF" in index_fields else None, 'exac_v3_popmax_AF': "exac_AF_POPMAX" if "exac_AF_POPMAX" in index_fields else None, 'gnomad_exomes_AF': "gnomad_exomes_AF" if "gnomad_exomes_AF" in index_fields else None, 'gnomad_exomes_popmax_AF': "gnomad_exomes_AF_POPMAX_OR_GLOBAL" if "gnomad_exomes_AF_POPMAX_OR_GLOBAL" in index_fields else ( "gnomad_exomes_AF_POPMAX" if "gnomad_exomes_AF_POPMAX" in index_fields else None), 'gnomad_genomes_AF': "gnomad_genomes_AF" if "gnomad_genomes_AF" in index_fields else None, 'gnomad_genomes_popmax_AF': "gnomad_genomes_AF_POPMAX_OR_GLOBAL" if "gnomad_genomes_AF_POPMAX_OR_GLOBAL" 
in index_fields else ( "gnomad_genomes_AF_POPMAX" if "gnomad_genomes_AF_POPMAX" in index_fields else None), 'topmed_AF': "topmed_AF" if "topmed_AF" in index_fields else None, } result = { #u'_id': ObjectId('596d2207ff66f729285ca588'), 'alt': str(hit["alt"]) if "alt" in hit else None, 'annotation': { 'fathmm': fathmm_map.get(hit["dbnsfp_FATHMM_pred"].split(';')[0]) if "dbnsfp_FATHMM_pred" in hit and hit["dbnsfp_FATHMM_pred"] else None, 'muttaster': muttaster_map.get(hit["dbnsfp_MutationTaster_pred"].split(';')[0]) if "dbnsfp_MutationTaster_pred" in hit and hit["dbnsfp_MutationTaster_pred"] else None, 'polyphen': polyphen_map.get(hit["dbnsfp_Polyphen2_HVAR_pred"].split(';')[0]) if "dbnsfp_Polyphen2_HVAR_pred" in hit and hit["dbnsfp_Polyphen2_HVAR_pred"] else None, 'sift': sift_map.get(hit["dbnsfp_SIFT_pred"].split(';')[0]) if "dbnsfp_SIFT_pred" in hit and hit["dbnsfp_SIFT_pred"] else None, 'metasvm': metasvm_map.get(hit["dbnsfp_MetaSVM_pred"].split(';')[0]) if "dbnsfp_MetaSVM_pred" in hit and hit["dbnsfp_MetaSVM_pred"] else None, 'GERP_RS': float(hit["dbnsfp_GERP_RS"]) if "dbnsfp_GERP_RS" in hit and hit["dbnsfp_GERP_RS"] else None, 'phastCons100way_vertebrate': float(hit["dbnsfp_phastCons100way_vertebrate"]) if "dbnsfp_phastCons100way_vertebrate" in hit and hit["dbnsfp_phastCons100way_vertebrate"] else None, 'cadd_phred': hit["cadd_PHRED"] if "cadd_PHRED" in hit else None, 'dann_score': hit["dbnsfp_DANN_score"] if "dbnsfp_DANN_score" in hit else None, 'revel_score': hit["dbnsfp_REVEL_score"] if "dbnsfp_REVEL_score" in hit else None, 'eigen_phred': hit["eigen_Eigen_phred"] if "eigen_Eigen_phred" in hit else (hit["dbnsfp_Eigen_phred"] if "dbnsfp_Eigen_phred" in hit else None), 'mpc_score': hit["mpc_MPC"] if "mpc_MPC" in hit else None, 'primate_ai_score': hit["primate_ai_score"] if "primate_ai_score" in hit else None, 'splice_ai_delta_score': hit["splice_ai_delta_score"] if "splice_ai_delta_score" in hit else None, 'rsid': hit["rsid"] if "rsid" in hit else None, 
'annotation_tags': list(hit["transcriptConsequenceTerms"] or []) if "transcriptConsequenceTerms" in hit else None, 'coding_gene_ids': list(hit['codingGeneIds'] or []), 'gene_ids': list(hit['geneIds'] or []), 'vep_annotation': vep_annotation, 'vep_group': str(hit['mainTranscript_major_consequence'] or "") if "mainTranscript_major_consequence" in hit else "", 'vep_consequence': str(hit['mainTranscript_major_consequence'] or "") if "mainTranscript_major_consequence" in hit else "", 'main_transcript': {k.replace('mainTranscript_', ''): hit[k] for k in dir(hit) if k.startswith('mainTranscript_')}, 'worst_vep_annotation_index': 0, 'worst_vep_index_per_gene': worst_vep_index_per_gene, }, 'chr': hit["contig"], 'coding_gene_ids': list(hit['codingGeneIds'] or []), 'gene_ids': gene_ids, 'coverage': { 'gnomad_exome_coverage': float(hit["gnomad_exome_coverage"] or -1) if "gnomad_exome_coverage" in hit else -1, 'gnomad_genome_coverage': float(hit["gnomad_genome_coverage"] or -1) if "gnomad_genome_coverage" in hit else -1, }, 'pop_counts': { 'AC': int(hit['AC'] or 0) if 'AC' in hit else None, 'AN': int(hit['AN'] or 0) if 'AN' in hit else None, 'g1kAC': int(hit['g1k_AC'] or 0) if 'g1k_AC' in hit else None, 'g1kAN': int(hit['g1k_AN'] or 0) if 'g1k_AN' in hit else None, 'exac_v3_AC': int(hit["exac_AC_Adj"] or 0) if "exac_AC_Adj" in hit else None, 'exac_v3_Het': int(hit["exac_AC_Het"] or 0) if "exac_AC_Het" in hit else None, 'exac_v3_Hom': int(hit["exac_AC_Hom"] or 0) if "exac_AC_Hom" in hit else None, 'exac_v3_Hemi': int(hit["exac_AC_Hemi"] or 0) if "exac_AC_Hemi" in hit else None, 'exac_v3_AN': int(hit["exac_AN_Adj"] or 0) if "exac_AN_Adj" in hit else None, 'gnomad_exomes_AC': int(hit["gnomad_exomes_AC"] or 0) if "gnomad_exomes_AC" in hit else None, 'gnomad_exomes_Hom': int(hit["gnomad_exomes_Hom"] or 0) if "gnomad_exomes_Hom" in hit else None, 'gnomad_exomes_Hemi': int(hit["gnomad_exomes_Hemi"] or 0) if "gnomad_exomes_Hemi" in hit else None, 'gnomad_exomes_AN': 
int(hit["gnomad_exomes_AN"] or 0) if "gnomad_exomes_AN" in hit else None, 'gnomad_genomes_AC': int(hit["gnomad_genomes_AC"] or 0) if "gnomad_genomes_AC" in hit else None, 'gnomad_genomes_Hom': int(hit["gnomad_genomes_Hom"] or 0) if "gnomad_genomes_Hom" in hit else None, 'gnomad_genomes_Hemi': int(hit["gnomad_genomes_Hemi"] or 0) if "gnomad_genomes_Hemi" in hit else None, 'gnomad_genomes_AN': int(hit["gnomad_genomes_AN"] or 0) if "gnomad_genomes_AN" in hit else None, 'topmed_AC': float(hit["topmed_AC"] or 0) if "topmed_AC" in hit else None, 'topmed_Het': float(hit["topmed_Het"] or 0) if "topmed_Het" in hit else None, 'topmed_Hom': float(hit["topmed_Hom"] or 0) if "topmed_Hom" in hit else None, 'topmed_AN': float(hit["topmed_AN"] or 0) if "topmed_AN" in hit else None, }, 'db_freqs': {k: float(hit[v] or 0.0) if v in hit else (0.0 if v else None) for k, v in freq_fields.items()}, #'popmax_populations': { # 'exac_popmax': hit["exac_POPMAX"] or None, # 'gnomad_exomes_popmax': hit["gnomad_exomes_POPMAX"] or None, # 'gnomad_genomes_popmax': hit["gnomad_genomes_POPMAX"] or None, #}, 'db_gene_ids': list((hit["geneIds"] or []) if "geneIds" in hit else []), 'db_tags': str(hit["transcriptConsequenceTerms"] or "") if "transcriptConsequenceTerms" in hit else None, 'extras': { 'clinvar_variant_id': hit['clinvar_variation_id'] if 'clinvar_variation_id' in hit and hit['clinvar_variation_id'] else None, 'clinvar_allele_id': hit['clinvar_allele_id'] if 'clinvar_allele_id' in hit and hit['clinvar_allele_id'] else None, 'clinvar_clinsig': hit['clinvar_clinical_significance'].lower() if ('clinvar_clinical_significance' in hit) and hit['clinvar_clinical_significance'] else None, 'clinvar_gold_stars': hit['clinvar_gold_stars'] if 'clinvar_gold_stars' in hit and hit['clinvar_gold_stars'] else None, 'hgmd_class': hit['hgmd_class'] if 'hgmd_class' in hit and user and user.is_staff else None, 'hgmd_accession': hit['hgmd_accession'] if 'hgmd_accession' in hit else None, 'genome_version': 
project.genome_version, 'grch37_coords': grch37_coord, 'grch38_coords': grch38_coord, 'alt_allele_pos': 0, 'orig_alt_alleles': map(str, [a.split("-")[-1] for a in hit["originalAltAlleles"]]) if "originalAltAlleles" in hit else None }, 'genotypes': genotypes, 'pos': long(hit['start']), 'pos_end': str(hit['end']), 'ref': str(hit['ref']), 'vartype': 'snp' if len(hit['ref']) == len(hit['alt']) else "indel", 'vcf_id': None, 'xpos': long(hit["xpos"]), 'xposx': long(hit["xpos"]), } result["annotation"]["freqs"] = result["db_freqs"] result["annotation"]["pop_counts"] = result["pop_counts"] result["annotation"]["db"] = "elasticsearch" result["extras"]["svlen"] = hit["SVLEN"] if "SVLEN" in hit else None result["extras"]["svtype"] = hit["SVTYPE"] if "SVTYPE" in hit else None logger.info("Result %s: GRCh37: %s GRCh38: %s - gene ids: %s, coding gene_ids: %s" % ( i, grch37_coord, grch38_coord, result["gene_ids"], result["coding_gene_ids"])) result["extras"]["project_id"] = project_id result["extras"]["family_id"] = family_id # add gene info gene_names = {} if vep_annotation is not None: gene_names = {vep_anno["gene_id"]: vep_anno.get("gene_symbol") for vep_anno in vep_annotation if vep_anno.get("gene_symbol")} result["extras"]["gene_names"] = gene_names try: genes = {} for gene_id in result["gene_ids"]: if gene_id: genes[gene_id] = reference.get_gene_summary(gene_id) or {} #if not genes: # genes = {vep_anno["gene_id"]: {"symbol": vep_anno["gene_symbol"]} for vep_anno in vep_annotation} result["extras"]["genes"] = genes except Exception as e: exc_type, exc_obj, exc_tb = sys.exc_info() logger.warn("WARNING: got unexpected error in add_gene_names_to_variants: %s : line %s" % (e, exc_tb.tb_lineno)) variant_results.append(result) logger.info("Finished returning the %s variants: %s seconds" % (response.hits.total, time.time() - start)) if redis_client: redis_client.set(cache_key, json.dumps(variant_results)) return [Variant.fromJSON(variant_json) for variant_json in variant_results]
def get_variants_cohort(self, project_id, cohort_id, variant_filter=None):
    """Yield every stored variant for a cohort, ordered by genomic position.

    Args:
        project_id: project identifier used to locate the backing collection.
        cohort_id: cohort identifier (cohorts are stored like families, so the
            family-collection lookup applies).
        variant_filter: optional variant-level filter folded into the db query.

    Yields:
        Variant objects deserialized from the mongo documents, sorted by xpos.
    """
    # Build the query first without any genotype constraints (None).
    query = _make_db_query(None, variant_filter)
    cohort_collection = self._get_family_collection(project_id, cohort_id)
    cursor = cohort_collection.find(query).sort('xpos')
    for variant_dict in cursor:
        yield Variant.fromJSON(variant_dict)
def get_variant(self, xpos, ref, alt):
    """Construct a Variant for (xpos, ref, alt), annotate it in place,
    and return the annotated instance.
    """
    annotated = Variant(xpos, ref, alt)
    self.annotate_variant(annotated)
    return annotated
def add_or_edit_variant_note(request):
    """Add a new variant note, or edit an existing one when 'note_id' is set.

    Reads all inputs from request.GET (validated through VariantNoteForm),
    persists a VariantNote row, best-effort logs the event, and returns the
    (possibly freshly constructed) variant as JSON.
    """
    family = None
    if 'family_id' in request.GET:
        project, family = get_project_and_family_for_user(request.user, request.GET)
    else:
        project = utils.get_project_for_user(request.user, request.GET)

    form = api_forms.VariantNoteForm(project, request.GET)
    if not form.is_valid():
        return JSONResponse({
            'is_error': True,
            'error': server_utils.form_error_string(form)
        })

    # Look up the stored variant for this family.
    # NOTE(review): when 'family_id' is absent from request.GET, family stays
    # None and family.family_id below raises AttributeError — confirm whether
    # this endpoint is ever called without a family_id.
    variant = get_datastore(project.project_id).get_single_variant(
        project.project_id,
        family.family_id,
        form.cleaned_data['xpos'],
        form.cleaned_data['ref'],
        form.cleaned_data['alt'],
    )

    if not variant:
        # Variant isn't in the datastore: build a minimal stand-in so the note
        # can still be saved and returned.
        variant = Variant.fromJSON({
            'xpos' : form.cleaned_data['xpos'],
            'ref': form.cleaned_data['ref'],
            'alt': form.cleaned_data['alt'],
            'genotypes': {},
            'extras': {},
        })

    if 'note_id' in form.cleaned_data and form.cleaned_data['note_id']:
        # Edit path: the note must exist and match this project and variant.
        event_type = "edit_variant_note"
        notes = VariantNote.objects.filter(
            id=form.cleaned_data['note_id'],
            project=project,
            xpos=form.cleaned_data['xpos'],
            ref=form.cleaned_data['ref'],
            alt=form.cleaned_data['alt'],
        )
        if not notes:
            return JSONResponse({
                'is_error': True,
                'error': 'note id %s not found' % form.cleaned_data['note_id']
            })
        note = notes[0]
        note.user = request.user
        note.note = form.cleaned_data['note_text']
        note.date_saved = timezone.now()
        if family:
            note.family = family
        note.save()
    else:
        # Add path: create a fresh note (family may be None).
        event_type = "add_variant_note"
        VariantNote.objects.create(
            user=request.user,
            project=project,
            xpos=form.cleaned_data['xpos'],
            ref=form.cleaned_data['ref'],
            alt=form.cleaned_data['alt'],
            note=form.cleaned_data['note_text'],
            date_saved=timezone.now(),
            family=family,
        )

    # Decorate the variant with family-level extras (notes, tags, gene names).
    add_extra_info_to_variants_family(get_reference(), family, [variant,])

    # Best-effort event logging: any failure here is logged and swallowed so
    # the note itself is still saved and returned.
    try:
        settings.EVENTS_COLLECTION.insert({
            'event_type': event_type,
            'date': timezone.now(),
            # NOTE(review): ''.join(str) is a no-op copy of project_id —
            # presumably a leftover; the value recorded is just project_id.
            'project_id': ''.join(project.project_id),
            'family_id': family.family_id,
            'note': form.cleaned_data['note_text'],
            'xpos':form.cleaned_data['xpos'],
            'pos':variant.pos,
            'chrom': variant.chr,
            'ref':form.cleaned_data['ref'],
            'alt':form.cleaned_data['alt'],
            'gene_names': ", ".join(variant.extras['gene_names'].values()),
            'username': request.user.username,
            'email': request.user.email,
        })
    except Exception as e:
        logging.error("Error while logging %s event: %s" % (event_type, e))

    return JSONResponse({
        'is_error': False,
        'variant': variant.toJSON(),
    })
def get_single_variant_cohort(self, project_id, cohort_id, xpos, ref, alt):
    """Look up one variant for a cohort by (xpos, ref, alt).

    Args:
        project_id: project identifier used to locate the backing collection.
        cohort_id: cohort identifier (stored like a family).
        xpos, ref, alt: variant key.

    Returns:
        The deserialized Variant, or None when no matching document exists.
        (Previously a miss passed None straight into Variant.fromJSON;
        callers such as the single-variant paths in this file treat a falsy
        result as "not found".)
    """
    collection = self._get_family_collection(project_id, cohort_id)
    variant_dict = collection.find_one({'xpos': xpos, 'ref': ref, 'alt': alt})
    if variant_dict is None:
        # find_one returns None when nothing matches; propagate that instead
        # of crashing in Variant.fromJSON.
        return None
    return Variant.fromJSON(variant_dict)
def get_de_novo_variants(datastore, reference, family, variant_filter=None, quality_filter=None):
    """
    Yield variants in `family` that are consistent with de novo inheritance.

    (The previous docstring said "homozygous recessive", but the genotype
    filter built here is inheritance.get_de_novo_filter.)

    Args:
        datastore: mongo-backed datastore providing the family collection.
        reference: unused here; kept for signature compatibility with callers.
        family: family whose individuals are checked.
        variant_filter: optional variant-level filter.
        quality_filter: dict of genotype-quality thresholds; copied and
            tightened per-individual in the trio branch.
    """
    de_novo_filter = inheritance.get_de_novo_filter(family)
    db_query = datastore._make_db_query(de_novo_filter, variant_filter)

    collection = datastore._get_family_collection(family.project_id, family.family_id)
    if not collection:
        raise ValueError("Error: mongodb collection not found for project %s family %s " % (family.project_id, family.family_id))

    variant_iter = collection.find(db_query).sort('xpos')

    # get ids of parents in this family
    valid_ids = set(indiv_id for indiv_id in family.individuals)
    paternal_ids = set(i.paternal_id for i in family.get_individuals() if i.paternal_id in valid_ids)
    maternal_ids = set(i.maternal_id for i in family.get_individuals() if i.maternal_id in valid_ids)
    parental_ids = paternal_ids | maternal_ids

    # loop over all variants returned
    for variant_dict in variant_iter:
        variant = Variant.fromJSON(variant_dict)
        datastore.add_annotations_to_variant(variant, family.project_id)
        if not passes_variant_filter(variant, variant_filter)[0]:
            continue

        # handle genotype filters
        if len(parental_ids) != 2:
            # ordinary filters for non-trios
            for indiv_id in de_novo_filter.keys():
                genotype = variant.get_genotype(indiv_id)
                if not passes_genotype_filter(genotype, quality_filter):
                    break
            else:
                yield variant
        else:
            # for trios use Mark's recommended filters for de-novo variants:
            # Hard-coded thresholds:
            #   1) Child must have > 10% of combined Parental Read Depth
            #   2) MinimumChildGQscore >= 20
            #   3) MaximumParentAlleleBalance <= 5%
            # Adjustable filters:
            #   Variants should PASS
            #   Child AB should be >= 20

            # compute parental read depth for filter 1
            total_parental_read_depth = 0
            for indiv_id in parental_ids:
                genotype = variant.get_genotype(indiv_id)
                # BUGFIX: missing depth can be exported as '.', which used to
                # crash int(); treat it the same as an absent 'dp' field
                # (matching the limited variant of this function elsewhere in
                # this file).
                if genotype.extras and 'dp' in genotype.extras and genotype.extras['dp'] != '.':
                    total_parental_read_depth += int(genotype.extras['dp'])
                else:
                    # both parents must have DP to use the parental_read_depth filters
                    total_parental_read_depth = None
                    break

            for indiv_id in de_novo_filter.keys():
                quality_filter_temp = quality_filter.copy()  # copy before modifying
                if indiv_id in parental_ids:
                    # handle one of the parents
                    quality_filter_temp['max_ab'] = 5
                else:
                    # handle child
                    quality_filter_temp['min_gq'] = 20
                    if total_parental_read_depth is not None:
                        quality_filter_temp['min_dp'] = total_parental_read_depth * 0.1

                genotype = variant.get_genotype(indiv_id)
                if not passes_genotype_filter(genotype, quality_filter_temp):
                    #print("%s: %s " % (variant.chr, variant.pos))
                    break
            else:
                yield variant
def get_de_novo_variants(datastore, reference, family, variant_filter=None, quality_filter=None):
    """
    Yield variants in `family` that are consistent with de novo inheritance.

    (Docstring fix: this previously said "homozygous recessive", but the
    genotype filter built below is inheritance.get_de_novo_filter.)

    Raises:
        ValueError: when no mongodb collection exists for the family.
        Exception: when the query yields more than MONGO_QUERY_RESULTS_LIMIT
            results.
    """
    de_novo_filter = inheritance.get_de_novo_filter(family)
    db_query = datastore._make_db_query(de_novo_filter, variant_filter)

    collection = datastore._get_family_collection(family.project_id, family.family_id)
    if not collection:
        raise ValueError(
            "Error: mongodb collection not found for project %s family %s " % (family.project_id, family.family_id))

    # Cap the result set; fetch a few extra rows so the overflow check below
    # can distinguish "exactly at the limit" from "over the limit".
    MONGO_QUERY_RESULTS_LIMIT = 5000
    variant_iter = collection.find(db_query).sort('xpos').limit(
        MONGO_QUERY_RESULTS_LIMIT + 5)

    # get ids of parents in this family
    valid_ids = set(indiv_id for indiv_id in family.individuals)
    paternal_ids = set(i.paternal_id for i in family.get_individuals()
                       if i.paternal_id in valid_ids)
    maternal_ids = set(i.maternal_id for i in family.get_individuals()
                       if i.maternal_id in valid_ids)
    parental_ids = paternal_ids | maternal_ids

    # loop over all variants returned
    for i, variant_dict in enumerate(variant_iter):
        if i > MONGO_QUERY_RESULTS_LIMIT:
            raise Exception(
                "MONGO_QUERY_RESULTS_LIMIT of %s exceeded for query: %s" %
                (MONGO_QUERY_RESULTS_LIMIT, db_query))
        variant = Variant.fromJSON(variant_dict)
        datastore.add_annotations_to_variant(variant, family.project_id)
        if not passes_variant_filter(variant, variant_filter)[0]:
            continue

        # handle genotype filters
        if len(parental_ids) != 2:
            # ordinary filters for non-trios: every individual in the filter
            # must pass the (unmodified) quality filter.
            for indiv_id in de_novo_filter.keys():
                genotype = variant.get_genotype(indiv_id)
                if not passes_genotype_filter(genotype, quality_filter):
                    break
            else:
                # for/else: no individual failed, so the variant is yielded
                yield variant
        else:
            # for trios use Mark's recommended filters for de-novo variants:
            # Hard-coded thresholds:
            # 1) Child must have > 10% of combined Parental Read Depth
            # 2) MinimumChildGQscore >= 20
            # 3) MaximumParentAlleleBalance <= 5%
            # Adjustable filters:
            # Variants should PASS
            # Child AB should be >= 20

            # compute parental read depth for filter 1; '.' marks a missing
            # depth value and is treated the same as an absent 'dp' field
            total_parental_read_depth = 0
            for indiv_id in parental_ids:
                genotype = variant.get_genotype(indiv_id)
                if genotype.extras and 'dp' in genotype.extras and genotype.extras[
                        'dp'] != '.':
                    total_parental_read_depth += int(genotype.extras['dp'])
                else:
                    total_parental_read_depth = None  # both parents must have DP to use the parental_read_depth filters
                    break

            for indiv_id in de_novo_filter.keys():
                quality_filter_temp = quality_filter.copy(
                )  # copy before modifying
                if indiv_id in parental_ids:  # handle one of the parents
                    quality_filter_temp['max_ab'] = 5
                else:  # handle child
                    quality_filter_temp['min_gq'] = 20
                    if total_parental_read_depth is not None:
                        quality_filter_temp[
                            'min_dp'] = total_parental_read_depth * 0.1

                genotype = variant.get_genotype(indiv_id)
                if not passes_genotype_filter(genotype, quality_filter_temp):
                    #print("%s: %s " % (variant.chr, variant.pos))
                    break
            else:
                # for/else: all trio members passed their adjusted filters
                yield variant
def get_elasticsearch_variants(
        self,
        project_id,
        family_id=None,
        variant_filter=None,
        genotype_filter=None,
        variant_id_filter=None,
        quality_filter=None,
        indivs_to_consider=None,
        include_all_consequences=False,
        user=None,
        max_results_limit=settings.VARIANT_QUERY_RESULTS_LIMIT,
):
    """
    Query elasticsearch for variants matching the given filters and return them
    as a list of Variant objects.

    The mongo-style query dict produced by `self._make_db_query` is translated
    key-by-key into elasticsearch-dsl filters (consequence terms, per-sample
    genotype/quality fields, gene ids, xpos ranges, AF/AC/Hemi/Hom thresholds),
    each hit is converted into the legacy variant-dict shape, and results are
    cached in redis (when `self._redis_client` is set) under a key derived from
    all filter arguments.

    Raises:
        Exception: if the query matches more than `max_results_limit` variants.
    """
    from xbrowse_server.base.models import Individual
    from xbrowse_server.mall import get_reference

    # cache key covers every argument that affects the result set
    cache_key = "Variants___%s___%s___%s" % (project_id, family_id, json.dumps([
        variant_filter.toJSON() if variant_filter else None,
        genotype_filter,
        quality_filter,
        variant_id_filter,
        indivs_to_consider,
        include_all_consequences,
    ]))

    cached_results = self._redis_client and self._redis_client.get(cache_key)
    if cached_results is not None:
        variant_results = json.loads(cached_results)
        return [Variant.fromJSON(variant_json) for variant_json in variant_results]

    if indivs_to_consider is None:
        if genotype_filter:
            indivs_to_consider = genotype_filter.keys()
        else:
            indivs_to_consider = []

    # individuals whose genotype columns will be read off each hit
    if family_id is not None:
        family_individual_ids = [i.indiv_id for i in Individual.objects.filter(family__family_id=family_id).only("indiv_id")]
    else:
        family_individual_ids = [i.indiv_id for i in Individual.objects.filter(family__project__project_id=project_id).only("indiv_id")]

    from xbrowse_server.base.models import Project, Family
    from pyliftover.liftover import LiftOver

    query_json = self._make_db_query(genotype_filter, variant_filter)

    # lazily initialize liftover chains (requires network on first use)
    try:
        if self.liftover_grch38_to_grch37 is None:
            self.liftover_grch38_to_grch37 = LiftOver('hg38', 'hg19')

        if self.liftover_grch37_to_grch38 is None:
            self.liftover_grch37_to_grch38 = None  # LiftOver('hg19', 'hg38')
    except Exception as e:
        logger.info("WARNING: Unable to set up liftover. Is there a working internet connection? " + str(e))

    # resolve which elasticsearch index to search
    if family_id is None:
        project = Project.objects.get(project_id=project_id)
        elasticsearch_index = project.get_elasticsearch_index()
        logger.info("Searching in project elasticsearch index: " + str(elasticsearch_index))
    else:
        family = Family.objects.get(project__project_id=project_id, family_id=family_id)
        elasticsearch_index = family.get_elasticsearch_index()
        project = family.project
        logger.info("Searching in family elasticsearch index: " + str(elasticsearch_index))

    if family_id is not None and len(family_individual_ids) > 0:
        # figure out which index to use
        # TODO add caching

        matching_indices = []
        mapping = self._es_client.indices.get_mapping(str(elasticsearch_index) + "*")

        if family_individual_ids:
            indiv_id = _encode_name(family_individual_ids[0])
            for index_name, index_mapping in mapping.items():
                # an index matches if it has a genotype column for this individual
                if indiv_id + "_num_alt" in index_mapping["mappings"]["variant"]["properties"]:
                    matching_indices.append(index_name)

        if not matching_indices:
            if not family_individual_ids:
                logger.error("no individuals found for family %s" % (family_id))
            elif not mapping:
                logger.error("no es mapping found for found with prefix %s" % (elasticsearch_index))
            else:
                logger.error("%s not found in %s:\n%s" % (indiv_id, elasticsearch_index, pformat(index_mapping["mappings"]["variant"]["properties"])))
        else:
            # NOTE(review): this logs the pre-join prefix, not matching_indices
            logger.info("matching indices: " + str(elasticsearch_index))
            elasticsearch_index = ",".join(matching_indices)

    s = elasticsearch_dsl.Search(using=self._es_client, index=str(elasticsearch_index) + "*")  # ",".join(indices))

    # restrict to an explicit list of variant ids, OR-ed together
    if variant_id_filter is not None:
        variant_id_filter_term = None
        for variant_id in variant_id_filter:
            q_obj = Q('term', **{"variantId": variant_id})
            if variant_id_filter_term is None:
                variant_id_filter_term = q_obj
            else:
                variant_id_filter_term |= q_obj
        s = s.filter(variant_id_filter_term)

    # require at least one non-ref genotype among the considered individuals
    if indivs_to_consider:
        atleast_one_nonref_genotype_filter = None
        for sample_id in indivs_to_consider:
            encoded_sample_id = _encode_name(sample_id)
            q = Q('range', **{encoded_sample_id + "_num_alt": {'gte': 1}})
            if atleast_one_nonref_genotype_filter is None:
                atleast_one_nonref_genotype_filter = q
            else:
                atleast_one_nonref_genotype_filter |= q
        s = s.filter(atleast_one_nonref_genotype_filter)

    # per-sample quality thresholds (allele balance, GQ, PASS filter)
    if quality_filter is not None and indivs_to_consider:
        #'vcf_filter': u'pass', u'min_ab': 17, u'min_gq': 46
        min_ab = quality_filter.get('min_ab')
        if min_ab is not None:
            min_ab /= 100.0   # convert to fraction
        min_gq = quality_filter.get('min_gq')
        vcf_filter = quality_filter.get('vcf_filter')
        for sample_id in indivs_to_consider:
            encoded_sample_id = _encode_name(sample_id)

            #'vcf_filter': u'pass', u'min_ab': 17, u'min_gq': 46
            if min_ab:
                # AB threshold applies only to het calls (num_alt == 1)
                s = s.filter(
                    ~Q('term', **{encoded_sample_id + "_num_alt": 1}) |
                    Q('range', **{encoded_sample_id + "_ab": {'gte': min_ab}}))
                #logger.info("### ADDED FILTER: " + str({encoded_sample_id+"_ab": {'gte': min_ab}}))
            if min_gq:
                s = s.filter('range', **{encoded_sample_id + "_gq": {'gte': min_gq}})
                #logger.info("### ADDED FILTER: " + str({encoded_sample_id+"_gq": {'gte': min_gq}}))
            if vcf_filter is not None:
                # PASS rows have no "filters" value in the index
                s = s.filter(~Q('exists', field='filters'))
                #logger.info("### ADDED FILTER: " + str(~Q('exists', field='filters')))

    # parse variant query
    annotation_groups_map = ANNOTATION_GROUPS_MAP_INTERNAL if user and user.is_staff else ANNOTATION_GROUPS_MAP

    for key, value in query_json.items():
        if key == 'db_tags':
            so_annotations = query_json.get('db_tags', {}).get('$in', [])

            # handle clinvar filters
            selected_so_annotations_set = set(so_annotations)

            all_clinvar_filters_set = set(annotation_groups_map.get("clinvar", {}).get("children", []))
            selected_clinvar_filters_set = all_clinvar_filters_set & selected_so_annotations_set

            all_hgmd_filters_set = set(annotation_groups_map.get("hgmd", {}).get("children", []))
            selected_hgmd_filters_set = all_hgmd_filters_set & selected_so_annotations_set

            vep_consequences = list(selected_so_annotations_set - selected_clinvar_filters_set - selected_hgmd_filters_set)

            consequences_filter = Q("terms", transcriptConsequenceTerms=vep_consequences)

            if selected_clinvar_filters_set:
                clinvar_clinical_significance_terms = set()
                for clinvar_filter in selected_clinvar_filters_set:
                    # translate selected filters to the corresponding clinvar clinical consequence terms
                    if clinvar_filter == "pathogenic":
                        clinvar_clinical_significance_terms.update(["Pathogenic", "Pathogenic/Likely_pathogenic"])
                    elif clinvar_filter == "likely_pathogenic":
                        clinvar_clinical_significance_terms.update(["Likely_pathogenic", "Pathogenic/Likely_pathogenic"])
                    elif clinvar_filter == "benign":
                        clinvar_clinical_significance_terms.update(["Benign", "Benign/Likely_benign"])
                    elif clinvar_filter == "likely_benign":
                        clinvar_clinical_significance_terms.update(["Likely_benign", "Benign/Likely_benign"])
                    elif clinvar_filter == "vus_or_conflicting":
                        clinvar_clinical_significance_terms.update([
                            "Conflicting_interpretations_of_pathogenicity",
                            "Uncertain_significance",
                            "not_provided",
                            "other"
                        ])
                    else:
                        raise ValueError("Unexpected clinvar filter: " + str(clinvar_filter))

                consequences_filter = consequences_filter | Q("terms", clinvar_clinical_significance=list(clinvar_clinical_significance_terms))

            if selected_hgmd_filters_set:
                hgmd_class = set()
                for hgmd_filter in selected_hgmd_filters_set:
                    # translate selected filters to the corresponding hgmd clinical consequence terms
                    if hgmd_filter == "disease_causing":
                        hgmd_class.update(["DM"])
                    elif hgmd_filter == "likely_disease_causing":
                        hgmd_class.update(["DM?"])
                    elif hgmd_filter == "hgmd_other":
                        hgmd_class.update(["DP", "DFP", "FP", "FTV"])
                    else:
                        raise ValueError("Unexpected hgmd filter: " + str(hgmd_filter))

                consequences_filter = consequences_filter | Q("terms", hgmd_class=list(hgmd_class))

            if 'intergenic_variant' in vep_consequences:
                # for many intergenic variants VEP doesn't add any annotations, so if user selected 'intergenic_variant', also match variants where transcriptConsequenceTerms is emtpy
                consequences_filter = consequences_filter | ~Q('exists', field='transcriptConsequenceTerms')

            s = s.filter(consequences_filter)
            #logger.info("==> transcriptConsequenceTerms: %s" % str(vep_consequences))

        if key.startswith("genotypes"):
            # key looks like "genotypes.<sample_id>.num_alt"
            sample_id = ".".join(key.split(".")[1:-1])
            encoded_sample_id = _encode_name(sample_id)
            genotype_filter = value
            #logger.info("==> genotype filter: " + str(genotype_filter))
            if type(genotype_filter) == int or type(genotype_filter) == basestring:
                #logger.info("==> genotypes: %s" % str({encoded_sample_id+"_num_alt": genotype_filter}))
                s = s.filter('term', **{encoded_sample_id + "_num_alt": genotype_filter})
            elif '$gte' in genotype_filter:
                genotype_filter = {k.replace("$", ""): v for k, v in genotype_filter.items()}
                s = s.filter('range', **{encoded_sample_id + "_num_alt": genotype_filter})
                #logger.info("==> genotypes: %s" % str({encoded_sample_id+"_num_alt": genotype_filter}))
            elif "$in" in genotype_filter:
                num_alt_values = genotype_filter['$in']
                q = Q('term', **{encoded_sample_id + "_num_alt": num_alt_values[0]})
                #logger.info("==> genotypes: %s" % str({encoded_sample_id+"_num_alt": num_alt_values[0]}))
                for num_alt_value in num_alt_values[1:]:
                    q = q | Q('term', **{encoded_sample_id + "_num_alt": num_alt_value})
                    #logger.info("==> genotypes: %s" % str({encoded_sample_id+"_num_alt": num_alt_value}))
                s = s.filter(q)

        if key == "db_gene_ids":
            db_gene_ids = query_json.get('db_gene_ids', {})

            exclude_genes = db_gene_ids.get('$nin', [])
            gene_ids = exclude_genes or db_gene_ids.get('$in', [])

            if exclude_genes:
                s = s.exclude("terms", geneIds=gene_ids)
            else:
                s = s.filter("terms", geneIds=gene_ids)
            #logger.info("==> %s %s" % ("exclude" if exclude_genes else "include", "geneIds: " + str(gene_ids)))

        if key == "$or" and type(value) == list:
            # genomic region filters: OR over [xpos_min, xpos_max] ranges
            q_terms = None
            for region_filter in value:
                xpos_filters = region_filter.get("$and", {})

                # for example: $or : [{'$and': [{'xpos': {'$gte': 12345}}, {'xpos': {'$lte': 54321}}]}]
                xpos_filters_dict = {}
                for xpos_filter in xpos_filters:
                    xpos_filter_setting = xpos_filter["xpos"]  # for example {'$gte': 12345} or {'$lte': 54321}
                    xpos_filters_dict.update(xpos_filter_setting)

                xpos_filter_setting = {k.replace("$", ""): v for k, v in xpos_filters_dict.items()}
                q = Q('range', **{"xpos": xpos_filter_setting})
                if q_terms is None:
                    q_terms = q
                else:
                    q_terms |= q
            if q_terms is not None:
                s = s.filter(q_terms)

            #logger.info("==> xpos range: " + str({"xpos": xpos_filter_setting}))

        # allele-frequency thresholds: mongo key -> ES document field
        af_key_map = {
            "db_freqs.AF": "AF",
            "db_freqs.1kg_wgs_phase3": "g1k_POPMAX_AF",
            "db_freqs.exac_v3": "exac_AF_POPMAX",
            "db_freqs.topmed": "topmed_AF",
            "db_freqs.gnomad_exomes": "gnomad_exomes_AF_POPMAX",
            "db_freqs.gnomad_genomes": "gnomad_genomes_AF_POPMAX",
            "db_freqs.gnomad-exomes2": "gnomad_exomes_AF_POPMAX",
            "db_freqs.gnomad-genomes2": "gnomad_genomes_AF_POPMAX",
        }

        if key in af_key_map:
            filter_key = af_key_map[key]
            af_filter_setting = {k.replace("$", ""): v for k, v in value.items()}
            # missing field is treated as passing the threshold
            s = s.filter(Q('range', **{filter_key: af_filter_setting}) | ~Q('exists', field=filter_key))
            #logger.info("==> %s: %s" % (filter_key, af_filter_setting))

        # allele-count thresholds
        ac_key_map = {
            "db_acs.AF": "AC",
            "db_acs.1kg_wgs_phase3": "g1k_AC",
            "db_acs.exac_v3": "exac_AC",
            "db_acs.topmed": "topmed_AC",
            "db_acs.gnomad_exomes": "gnomad_exomes_AC",
            "db_acs.gnomad_genomes": "gnomad_genomes_AC",
            "db_acs.gnomad-exomes2": "gnomad_exomes_AC",
            "db_acs.gnomad-genomes2": "gnomad_genomes_AC",
        }

        if key in ac_key_map:
            filter_key = ac_key_map[key]
            ac_filter_setting = {k.replace("$", ""): v for k, v in value.items()}
            s = s.filter(Q('range', **{filter_key: ac_filter_setting}) | ~Q('exists', field=filter_key))

        # hemizygote-count thresholds
        hemi_key_map = {
            "db_hemi.exac_v3": "exac_AC_Hemi",
            "db_hemi.gnomad_exomes": "gnomad_exomes_Hemi",
            "db_hemi.gnomad_genomes": "gnomad_genomes_Hemi",
            "db_hemi.gnomad-exomes2": "gnomad_exomes_Hemi",
            "db_hemi.gnomad-genomes2": "gnomad_genomes_Hemi",
        }

        if key in hemi_key_map:
            filter_key = hemi_key_map[key]
            hemi_filter_setting = {k.replace("$", ""): v for k, v in value.items()}
            s = s.filter(Q('range', **{filter_key: hemi_filter_setting}) | ~Q('exists', field=filter_key))

        # homozygote-count thresholds
        hom_key_map = {
            "db_hom.exac_v3": "exac_AC_Hom",
            "db_hom.gnomad_exomes": "gnomad_exomes_Hom",
            "db_hom.gnomad_genomes": "gnomad_genomes_Hom",
            "db_hom.gnomad-exomes2": "gnomad_exomes_Hom",
            "db_hom.gnomad-genomes2": "gnomad_genomes_Hom",
        }

        if key in hom_key_map:
            filter_key = hom_key_map[key]
            hom_filter_setting = {k.replace("$", ""): v for k, v in value.items()}
            s = s.filter(Q('range', **{filter_key: hom_filter_setting}) | ~Q('exists', field=filter_key))

        #s = s.sort("xpos")

    #logger.info("=====")
    #logger.info("FULL QUERY OBJ: " + pformat(s.__dict__))
    #logger.info("FILTERS: " + pformat(s.to_dict()))
    # https://elasticsearch-py.readthedocs.io/en/master/helpers.html#elasticsearch.helpers.scan
    start = time.time()
    s = s.params(size=max_results_limit + 1)
    #if not include_all_consequences:
    #    s = s.source(exclude=["sortedTranscriptConsequences"])
    response = s.execute()
    logger.info("=====")
    logger.info("TOTAL: %s. Query took %s seconds" % (response.hits.total, time.time() - start))
    if response.hits.total > max_results_limit + 1:
        raise Exception("This search matched too many variants. Please set additional filters and try again.")

    #print(pformat(response.to_dict()))

    project = Project.objects.get(project_id=project_id)

    #gene_list_map = project.get_gene_list_map()

    reference = get_reference()

    #for i, hit in enumerate(response.hits):
    variant_results = []
    for i, hit in enumerate(s.scan()):  # preserve_order=True
        #logger.info("HIT %s: %s %s %s" % (i, hit["variantId"], hit["geneIds"], pformat(hit.__dict__)))
        #print("HIT %s: %s" % (i, pformat(hit.to_dict())))
        filters = ",".join(hit["filters"] or []) if "filters" in hit else ""
        genotypes = {}
        all_num_alt = []
        # build the per-individual genotype dicts from the flattened hit fields
        for individual_id in family_individual_ids:
            encoded_individual_id = _encode_name(individual_id)
            num_alt = int(hit["%s_num_alt" % encoded_individual_id]) if ("%s_num_alt" % encoded_individual_id) in hit else -1
            if num_alt is not None:
                all_num_alt.append(num_alt)

            alleles = []
            if num_alt == 0:
                alleles = [hit["ref"], hit["ref"]]
            elif num_alt == 1:
                alleles = [hit["ref"], hit["alt"]]
            elif num_alt == 2:
                alleles = [hit["alt"], hit["alt"]]
            elif num_alt == -1 or num_alt == None:
                alleles = []
            else:
                raise ValueError("Invalid num_alt: " + str(num_alt))

            genotypes[individual_id] = {
                'ab': hit["%s_ab" % encoded_individual_id] if ("%s_ab" % encoded_individual_id) in hit else None,
                'alleles': map(str, alleles),
                'extras': {
                    # NOTE(review): the value reads the "_ab" field while the
                    # condition checks "_ad" — looks like a copy/paste bug;
                    # confirm whether 'ad' should read "%s_ad".
                    'ad': hit["%s_ab" % encoded_individual_id] if ("%s_ad" % encoded_individual_id) in hit else None,
                    'dp': hit["%s_dp" % encoded_individual_id] if ("%s_dp" % encoded_individual_id) in hit else None,
                    #'pl': '',
                },
                'filter': filters or "pass",
                'gq': hit["%s_gq" % encoded_individual_id] if ("%s_gq" % encoded_individual_id in hit and hit["%s_gq" % encoded_individual_id] is not None) else '',
                'num_alt': num_alt,
            }

        # skip hits where no considered individual carries an alt allele
        if all([num_alt <= 0 for num_alt in all_num_alt]):
            #logger.info("Filtered out due to genotype: " + str(genotypes))
            #print("Filtered all_num_alt <= 0 - Result %s: GRCh38: %s:%s, cadd: %s %s - %s" % (i, hit["contig"], hit["start"], hit["cadd_PHRED"] if "cadd_PHRED" in hit else "", hit["transcriptConsequenceTerms"], all_num_alt))
            continue

        vep_annotation = json.loads(str(hit['sortedTranscriptConsequences'])) if 'sortedTranscriptConsequences' in hit else None

        # compute the lifted-over coordinate string for the "other" build
        if project.genome_version == GENOME_VERSION_GRCh37:
            grch38_coord = None
            if self.liftover_grch37_to_grch38:
                grch38_coord = self.liftover_grch37_to_grch38.convert_coordinate("chr%s" % hit["contig"].replace("chr", ""), int(hit["start"]))
                if grch38_coord and grch38_coord[0]:
                    grch38_coord = "%s-%s-%s-%s " % (grch38_coord[0][0], grch38_coord[0][1], hit["ref"], hit["alt"])
                else:
                    grch38_coord = None
        else:
            grch38_coord = hit["variantId"]

        if project.genome_version == GENOME_VERSION_GRCh38:
            grch37_coord = None
            if self.liftover_grch38_to_grch37:
                grch37_coord = self.liftover_grch38_to_grch37.convert_coordinate("chr%s" % hit["contig"].replace("chr", ""), int(hit["start"]))
                if grch37_coord and grch37_coord[0]:
                    grch37_coord = "%s-%s-%s-%s " % (grch37_coord[0][0], grch37_coord[0][1], hit["ref"], hit["alt"])
                else:
                    grch37_coord = None
        else:
            grch37_coord = hit["variantId"]

        # assemble the legacy variant-dict shape expected by Variant.fromJSON
        result = {
            #u'_id': ObjectId('596d2207ff66f729285ca588'),
            'alt': str(hit["alt"]) if "alt" in hit else None,
            'annotation': {
                'fathmm': fathmm_map.get(hit["dbnsfp_FATHMM_pred"].split(';')[0]) if "dbnsfp_FATHMM_pred" in hit and hit["dbnsfp_FATHMM_pred"] else None,
                'muttaster': muttaster_map.get(hit["dbnsfp_MutationTaster_pred"].split(';')[0]) if "dbnsfp_MutationTaster_pred" in hit and hit["dbnsfp_MutationTaster_pred"] else None,
                'polyphen': polyphen_map.get(hit["dbnsfp_Polyphen2_HVAR_pred"].split(';')[0]) if "dbnsfp_Polyphen2_HVAR_pred" in hit and hit["dbnsfp_Polyphen2_HVAR_pred"] else None,
                'sift': sift_map.get(hit["dbnsfp_SIFT_pred"].split(';')[0]) if "dbnsfp_SIFT_pred" in hit and hit["dbnsfp_SIFT_pred"] else None,
                'GERP_RS': hit["dbnsfp_GERP_RS"] if "dbnsfp_GERP_RS" in hit else None,
                'phastCons100way_vertebrate': hit["dbnsfp_phastCons100way_vertebrate"] if "dbnsfp_phastCons100way_vertebrate" in hit else None,
                'cadd_phred': hit["cadd_PHRED"] if "cadd_PHRED" in hit else None,
                'dann_score': hit["dbnsfp_DANN_score"] if "dbnsfp_DANN_score" in hit else None,
                'revel_score': hit["dbnsfp_REVEL_score"] if "dbnsfp_REVEL_score" in hit else None,
                'eigen_phred': hit["eigen_Eigen_phred"] if "eigen_Eigen_phred" in hit else (hit["dbnsfp_Eigen_phred"] if "dbnsfp_Eigen_phred" in hit else None),
                'mpc_score': hit["mpc_MPC"] if "mpc_MPC" in hit else None,
                'annotation_tags': list(hit["transcriptConsequenceTerms"] or []) if "transcriptConsequenceTerms" in hit else None,
                'coding_gene_ids': list(hit['codingGeneIds'] or []),
                'gene_ids': list(hit['geneIds'] or []),
                'vep_annotation': vep_annotation,
                'vep_group': str(hit['mainTranscript_major_consequence'] or ""),
                'vep_consequence': str(hit['mainTranscript_major_consequence'] or ""),
                'main_transcript': {k.replace('mainTranscript_', ''): hit[k] for k in dir(hit) if k.startswith('mainTranscript_')},
                'worst_vep_annotation_index': 0,
                'worst_vep_index_per_gene': {str(hit['mainTranscript_gene_id']): 0},
            },
            'chr': hit["contig"],
            'coding_gene_ids': list(hit['codingGeneIds'] or []),
            'gene_ids': list(hit['geneIds'] or []),
            'coverage': {
                'gnomad_exome_coverage': float(hit["gnomad_exome_coverage"] or -1) if "gnomad_exome_coverage" in hit else -1,
                'gnomad_genome_coverage': float(hit["gnomad_genome_coverage"] or -1) if "gnomad_genome_coverage" in hit else -1,
            },
            'pop_counts': {
                'AC': int(hit['AC'] or 0) if 'AC' in hit else None,
                'AN': int(hit['AN'] or 0) if 'AN' in hit else None,
                '1kg_AC': int(hit['g1k_AC'] or 0) if 'g1k_AC' in hit else None,
                '1kg_AN': int(hit['g1k_AN'] or 0) if 'g1k_AN' in hit else None,
                # NOTE(review): the guard checks "exac_Adj_AC" but the value
                # reads "exac_AC_Adj" — key mismatch; confirm intended field.
                'exac_v3_AC': int(hit["exac_AC_Adj"] or 0) if "exac_Adj_AC" in hit else None,
                'exac_v3_Het': int(hit["exac_AC_Het"] or 0) if "exac_AC_Het" in hit else None,
                'exac_v3_Hom': int(hit["exac_AC_Hom"] or 0) if "exac_AC_Hom" in hit else None,
                'exac_v3_Hemi': int(hit["exac_AC_Hemi"] or 0) if "exac_AC_Hemi" in hit else None,
                'gnomad_exomes_AC': int(hit["gnomad_exomes_AC"] or 0) if "gnomad_exomes_AC" in hit else None,
                'gnomad_exomes_Hom': int(hit["gnomad_exomes_Hom"] or 0) if "gnomad_exomes_Hom" in hit else None,
                'gnomad_exomes_Hemi': int(hit["gnomad_exomes_Hemi"] or 0) if "gnomad_exomes_Hemi" in hit else None,
                'gnomad_exomes_AN': int(hit["gnomad_exomes_AN"] or 0) if "gnomad_exomes_AN" in hit else None,
                'gnomad_genomes_AC': int(hit["gnomad_genomes_AC"] or 0) if "gnomad_genomes_AC" in hit else None,
                'gnomad_genomes_Hom': int(hit["gnomad_genomes_Hom"] or 0) if "gnomad_genomes_Hom" in hit else None,
                'gnomad_genomes_Hemi': int(hit["gnomad_genomes_Hemi"] or 0) if "gnomad_genomes_Hemi" in hit else None,
                'gnomad_genomes_AN': int(hit["gnomad_genomes_AN"] or 0) if "gnomad_genomes_AN" in hit else None,
                'topmed_AC': float(hit["topmed_AC"] or 0) if "topmed_AC" in hit else None,
                'topmed_Het': float(hit["topmed_Het"] or 0) if "topmed_Het" in hit else None,
                'topmed_Hom': float(hit["topmed_Hom"] or 0) if "topmed_Hom" in hit else None,
                'topmed_AN': float(hit["topmed_AN"] or 0) if "topmed_AN" in hit else None,
            },
            'db_freqs': {
                'AF': float(hit["AF"] or 0.0) if "AF" in hit else None,
                '1kg_wgs_AF': float(hit["g1k_AF"] or 0.0) if "g1k_AF" in hit else None,
                '1kg_wgs_popmax_AF': float(hit["g1k_POPMAX_AF"] or 0.0) if "g1k_POPMAX_AF" in hit else None,
                # fall back to computing AC_Adj/AN_Adj when exac_AF is absent
                'exac_v3_AF': float(hit["exac_AF"] or 0.0) if "exac_AF" in hit else (hit["exac_AC_Adj"] / float(hit["exac_AN_Adj"]) if "exac_AC_Adj" in hit and "exac_AN_Adj" in hit and int(hit["exac_AN_Adj"] or 0) > 0 else None),
                'exac_v3_popmax_AF': float(hit["exac_AF_POPMAX"] or 0.0) if "exac_AF_POPMAX" in hit else None,
                'gnomad_exomes_AF': float(hit["gnomad_exomes_AF"] or 0.0) if "gnomad_exomes_AF" in hit else None,
                'gnomad_exomes_popmax_AF': float(hit["gnomad_exomes_AF_POPMAX"] or 0.0) if "gnomad_exomes_AF_POPMAX" in hit else None,
                'gnomad_genomes_AF': float(hit["gnomad_genomes_AF"] or 0.0) if "gnomad_genomes_AF" in hit else None,
                'gnomad_genomes_popmax_AF': float(hit["gnomad_genomes_AF_POPMAX"] or 0.0) if "gnomad_genomes_AF_POPMAX" in hit else None,
                'topmed_AF': float(hit["topmed_AF"] or 0.0) if "topmed_AF" in hit else None,
            },
            #'popmax_populations': {
            #    'exac_popmax': hit["exac_POPMAX"] or None,
            #    'gnomad_exomes_popmax': hit["gnomad_exomes_POPMAX"] or None,
            #    'gnomad_genomes_popmax': hit["gnomad_genomes_POPMAX"] or None,
            #},
            'db_gene_ids': list((hit["geneIds"] or []) if "geneIds" in hit else []),
            'db_tags': str(hit["transcriptConsequenceTerms"] or "") if "transcriptConsequenceTerms" in hit else None,
            'extras': {
                'clinvar_variant_id': hit['clinvar_variation_id'] if 'clinvar_variation_id' in hit and hit['clinvar_variation_id'] else None,
                'clinvar_allele_id': hit['clinvar_allele_id'] if 'clinvar_allele_id' in hit and hit['clinvar_allele_id'] else None,
                'clinvar_clinsig': hit['clinvar_clinical_significance'].lower() if ('clinvar_clinical_significance' in hit) and hit['clinvar_clinical_significance'] else None,
                # hgmd fields are staff-only
                'hgmd_class': hit['hgmd_class'] if 'hgmd_class' in hit and user and user.is_staff else None,
                'hgmd_accession': hit['hgmd_accession'] if 'hgmd_accession' in hit else None,
                'genome_version': project.genome_version,
                'grch37_coords': grch37_coord,
                'grch38_coords': grch38_coord,
                'alt_allele_pos': 0,
                'orig_alt_alleles': map(str, [a.split("-")[-1] for a in hit["originalAltAlleles"]]) if "originalAltAlleles" in hit else None
            },
            'genotypes': genotypes,
            'pos': long(hit['start']),
            'pos_end': str(hit['end']),
            'ref': str(hit['ref']),
            'vartype': 'snp' if len(hit['ref']) == len(hit['alt']) else "indel",
            'vcf_id': None,
            'xpos': long(hit["xpos"]),
            'xposx': long(hit["xpos"]),
        }

        result["annotation"]["freqs"] = result["db_freqs"]
        result["annotation"]["pop_counts"] = result["pop_counts"]
        result["annotation"]["db"] = "elasticsearch"

        result["extras"]["svlen"] = hit["SVLEN"] if "SVLEN" in hit else None
        result["extras"]["svtype"] = hit["SVTYPE"] if "SVTYPE" in hit else None

        logger.info("Result %s: GRCh37: %s GRCh38: %s:, cadd: %s %s - gene ids: %s, coding gene_ids: %s" % (i, grch37_coord, grch38_coord, hit["cadd_PHRED"] if "cadd_PHRED" in hit else "", hit["transcriptConsequenceTerms"], result["gene_ids"], result["coding_gene_ids"]))

        result["extras"]["project_id"] = project_id
        result["extras"]["family_id"] = family_id

        # add gene info
        gene_names = {}
        if vep_annotation is not None:
            gene_names = {vep_anno["gene_id"]: vep_anno.get("gene_symbol") for vep_anno in vep_annotation if vep_anno.get("gene_symbol")}
        result["extras"]["gene_names"] = gene_names

        # attach gene summaries; coding genes preferred, all genes as fallback
        try:
            genes = {}
            for gene_id in result["coding_gene_ids"]:
                if gene_id:
                    genes[gene_id] = reference.get_gene_summary(gene_id) or {}

            if not genes:
                for gene_id in result["gene_ids"]:
                    if gene_id:
                        genes[gene_id] = reference.get_gene_summary(gene_id) or {}

            #if not genes:
            #    genes = {vep_anno["gene_id"]: {"symbol": vep_anno["gene_symbol"]} for vep_anno in vep_annotation}

            result["extras"]["genes"] = genes
        except Exception as e:
            exc_type, exc_obj, exc_tb = sys.exc_info()
            logger.warn("WARNING: got unexpected error in add_gene_names_to_variants: %s : line %s" % (e, exc_tb.tb_lineno))

        variant_results.append(result)

    logger.info("Finished returning the %s variants: %s seconds" % (response.hits.total, time.time() - start))

    if self._redis_client:
        self._redis_client.set(cache_key, json.dumps(variant_results))

    return [Variant.fromJSON(variant_json) for variant_json in variant_results]
def add_or_edit_variant_tags(request):
    """
    Django view: replace the set of tags on a single variant with the set
    checked in the submitted VariantTagsForm.

    Tags already present and still checked are kept; newly checked tags are
    created (recording user, timestamp and search_url); previously present
    tags that are no longer checked are deleted. Each add/delete is logged
    to settings.EVENTS_COLLECTION on a best-effort basis.

    Returns:
        JSONResponse with either {'is_error': True, 'error': ...} on form
        validation failure, or {'is_error': False, 'variant': <variant JSON>}.
    """
    family = None
    if 'family_id' in request.GET:
        project, family = get_project_and_family_for_user(request.user, request.GET)
    else:
        # NOTE(review): in this branch `family` stays None, but it is
        # dereferenced below (family.family_id) — confirm callers always
        # supply family_id, or add explicit handling.
        project = utils.get_project_for_user(request.user, request.GET)

    form = api_forms.VariantTagsForm(project, request.GET)
    if not form.is_valid():
        ret = {
            'is_error': True,
            'error': server_utils.form_error_string(form)
        }
        return JSONResponse(ret)

    variant = get_datastore(project.project_id).get_single_variant(
        project.project_id,
        family.family_id,
        form.cleaned_data['xpos'],
        form.cleaned_data['ref'],
        form.cleaned_data['alt'],
    )
    if not variant:
        # variant not in the datastore - create a bare placeholder
        variant = Variant(form.cleaned_data['xpos'], form.cleaned_data['ref'], form.cleaned_data['alt'])

    # start with all existing tags marked for deletion; tags that remain
    # checked are removed from this dict below
    variant_tags_to_delete = {
        variant_tag.id: variant_tag
        for variant_tag in VariantTag.objects.filter(
            family=family,
            xpos=form.cleaned_data['xpos'],
            ref=form.cleaned_data['ref'],
            alt=form.cleaned_data['alt'])
    }

    project_tag_events = {}
    for project_tag in form.cleaned_data['project_tags']:
        # retrieve tags
        tag, created = VariantTag.objects.get_or_create(
            project_tag=project_tag,
            family=family,
            xpos=form.cleaned_data['xpos'],
            ref=form.cleaned_data['ref'],
            alt=form.cleaned_data['alt'],
        )

        if not created:
            # this tag already exists so just keep it (eg. remove it from the set of tags that will be deleted)
            del variant_tags_to_delete[tag.id]
            continue

        # this a new tag, so update who saved it and when
        project_tag_events[project_tag] = "add_variant_tag"

        tag.user = request.user
        tag.date_saved = timezone.now()
        tag.search_url = form.cleaned_data['search_url']
        tag.save()

    # delete the tags that are no longer checked.
    for variant_tag in variant_tags_to_delete.values():
        project_tag_events[variant_tag.project_tag] = "delete_variant_tag"
        variant_tag.delete()

    # add the extra info after updating the tag info in the database, so that the new tag info is added to the variant JSON
    add_extra_info_to_variants_family(get_reference(), family, [variant,])

    # log tag creation/deletion events (best-effort; failures only logged)
    for project_tag, event_type in project_tag_events.items():
        try:
            settings.EVENTS_COLLECTION.insert({
                'event_type': event_type,
                'date': timezone.now(),
                # fixed: was ''.join(project.project_id) — a no-op on a str
                # that would silently mangle any non-str id
                'project_id': project.project_id,
                'family_id': family.family_id,
                'tag': project_tag.tag,
                'title': project_tag.title,
                'xpos': form.cleaned_data['xpos'],
                'pos': variant.pos,
                'chrom': variant.chr,
                'ref': form.cleaned_data['ref'],
                'alt': form.cleaned_data['alt'],
                'gene_names': ", ".join(variant.extras['gene_names'].values()),
                'username': request.user.username,
                'email': request.user.email,
                'search_url': form.cleaned_data.get('search_url'),
            })
        except Exception as e:
            logging.error("Error while logging add_variant_tag event: %s" % e)

    return JSONResponse({
        'is_error': False,
        'variant': variant.toJSON(),
    })