def test_pubs_disease_tags(self):
    ''' Check the number of disease publications against the number of
    tags.disease and report differences`. '''
    count = True  # flips to False on the first disease whose counts disagree
    msg = ''
    for disease in DiseasePublicationTest.DISEASES:
        pmids = self._get_pmids(disease)
        disease_code = disease.lower()
        # Count publications tagged with this disease; size is doubled so a
        # follow-up search can fetch every doc if the counts disagree.
        elastic = Search(search_query=ElasticQuery(BoolQuery(
            b_filter=Filter(Query.term('tags.disease', disease_code))),
            sources=['pmid']),
            idx=ElasticSettings.idx('PUBLICATION'), size=len(pmids)*2)
        res = elastic.get_count()
        msg += disease_code+'\tINDEX: '+str(res['count'])+'\tNCBI: '+str(len(pmids))
        if res['count'] != len(pmids):
            count = False
            # Mismatch: fetch the indexed PMIDs and report both directions
            # of the difference (extra in index / missing from index).
            docs = elastic.search().docs
            pmids_in_idx = [getattr(doc, 'pmid') for doc in docs]
            pmids_diff1 = [pmid for pmid in pmids_in_idx if pmid not in pmids]
            pmids_diff2 = [pmid for pmid in pmids if pmid not in pmids_in_idx]
            if len(pmids_diff1) > 0:
                msg += '\textra PMIDs: '+str(pmids_diff1)
            if len(pmids_diff2) > 0:
                msg += '\tmissing PMIDs: '+str(pmids_diff2)
        msg += '\n'
    print(msg)
    self.assertTrue(count, 'Count for disease tags')
def show_es_gene_section(gene_symbol=None, seqid=None, start_pos=None, end_pos=None):
    ''' Template inclusion tag to render a gene section given a chado
    gene feature. '''
    # Normalise the sequence id to the "chrN" form used by this index.
    if not (isinstance(seqid, str) and seqid.startswith("chr")):
        seqid = 'chr' + str(seqid)
    if gene_symbol is not None:
        # look the feature up by its gene symbol
        query = ElasticQuery.query_match("gene_symbol", gene_symbol)
    elif end_pos is None:
        # single position: the feature must span start_pos
        point_bool = BoolQuery(must_arr=[
            Query.match("seqid", seqid),
            RangeQuery("featureloc.start", lte=start_pos),
            RangeQuery("featureloc.end", gte=start_pos)
        ])
        query = ElasticQuery.bool(point_bool)
    else:
        # explicit range: the feature must lie within [start_pos, end_pos]
        span_bool = BoolQuery(must_arr=[
            Query.match("seqid", seqid),
            RangeQuery("featureloc.start", gte=start_pos),
            RangeQuery("featureloc.end", lte=end_pos)
        ])
        query = ElasticQuery.bool(span_bool)
    elastic = Search(query, idx=ElasticSettings.idx(name='GENE'))
    return {'es_genes': elastic.search().docs}
def show_es_gene_section(gene_symbol=None, seqid=None, start_pos=None, end_pos=None):
    ''' Template inclusion tag to render a gene section given a chado
    gene feature. '''
    # This index stores bare chromosome numbers, so strip any "chr" prefix.
    seqid = str(seqid).replace('chr', '')
    if gene_symbol is not None:
        # look the gene up by symbol
        query = ElasticQuery.query_match("symbol", gene_symbol)
    elif end_pos is None:
        # single position: gene must span start_pos
        point_bool = BoolQuery(must_arr=[
            Query.match("chromosome", seqid),
            RangeQuery("start", lte=start_pos),
            RangeQuery("stop", gte=start_pos)
        ])
        query = ElasticQuery.bool(point_bool)
    else:
        # explicit range: gene must lie within [start_pos, end_pos]
        span_bool = BoolQuery(must_arr=[
            Query.match("chromosome", seqid),
            RangeQuery("start", gte=start_pos),
            RangeQuery("stop", lte=end_pos)
        ])
        query = ElasticQuery.bool(span_bool)
    elastic = Search(query, idx=ElasticSettings.idx(name='GENE'))
    return {'es_genes': elastic.search().docs}
def _get_pub_docs_by_pmid(pmids, sources=None):
    """ Get the publication documents for the given list of PMIDs.
    A dictionary is returned with the key being the PMID and the value
    the publication document (restricted to *sources* fields if given). """
    query = ElasticQuery(Query.ids(pmids), sources=sources)
    # size=len(pmids) ensures every requested document is returned
    elastic = Search(query, idx=ElasticSettings.idx("PUBLICATION"), size=len(pmids))
    return {doc.doc_id(): doc for doc in elastic.search().docs}
def test_significant_terms(self):
    ''' Significant Terms Aggregation '''
    sig_agg = Agg("test_significant_terms", "significant_terms", {"field": "start"})
    result_aggs = Search(aggs=Aggs(sig_agg),
                         idx=ElasticSettings.idx('DEFAULT')).search().aggs
    self.assertTrue('test_significant_terms' in result_aggs, "returned aggregations")
def test_missing_terms_filtered_query(self):
    ''' Test filtered query with a missing terms filter. '''
    missing_filter = TermsFilter.get_missing_terms_filter("field", "group_name")
    filtered_query = ElasticQuery.filtered(Query.match_all(), missing_filter)
    hits = Search(filtered_query, idx=ElasticSettings.idx('DEFAULT')).search().docs
    self.assertTrue(len(hits) == 3, "Elastic string query retrieved all public docs")
def test_string_query(self):
    ''' Test building and running a string query. '''
    str_query = ElasticQuery.query_string("rs2476601", fields=["id"])
    result = Search(str_query, idx=ElasticSettings.idx('DEFAULT')).search()
    self.assertTrue(len(result.docs) == 1, "Elastic string query retrieved marker (rs2476601)")
    # an unknown keyword argument must raise a QueryError
    self.assertRaises(QueryError, ElasticQuery.query_string, "rs2476601", fieldssss=["id"])
def setUpModule():
    ''' Change ini config (MY_INI_FILE) to use the test suffix when
    creating pipeline indices. '''
    ini_file = os.path.join(os.path.dirname(__file__), 'test_download.ini')
    # if a previous run already generated the config, reuse it as-is
    if os.path.isfile(MY_INI_FILE):
        return
    # rewrite the template ini, swapping in the per-run index suffix
    with open(MY_INI_FILE, 'w') as new_file:
        with open(ini_file) as old_file:
            for line in old_file:
                new_file.write(line.replace('auto_tests', IDX_SUFFIX))

    '''load ensembl GTF and GENE_HISTORY'''
    INI_CONFIG = IniParser().read_ini(MY_INI_FILE)
    idx = INI_CONFIG['ENSEMBL_GENE_GTF']['index']
    # load order matters: GENE_HISTORY, then staged ENSEMBL_GENE_GTF,
    # then GENE2ENSEMBL, refreshing the index between loads
    call_command('pipeline', '--steps', 'load', sections='GENE_HISTORY',
                 dir=TEST_DATA_DIR, ini=MY_INI_FILE)
    call_command('pipeline', '--steps', 'stage', 'load', sections='ENSEMBL_GENE_GTF',
                 dir=TEST_DATA_DIR, ini=MY_INI_FILE)
    Search.index_refresh(idx)
    call_command('pipeline', '--steps', 'load', sections='GENE2ENSEMBL',
                 dir=TEST_DATA_DIR, ini=MY_INI_FILE)
    Search.index_refresh(idx)
def get_disease_tags(cls, feature_id, idx=None, idx_type=None):
    ''' function to get the aggregated list of disease_tags for a given feature id,
    aggregated from all criteria_types for a feature type
    @type feature_id: string
    @keyword feature_id: Id of the feature (gene => gene_id, region=>region_id)
    @type idx: string
    @param idx: name of the index
    @type idx_type: string
    @param idx_type: name of the idx type, each criteria is an index type
    @return: list of Disease docs, None if the feature has no tags, or []
             if the aggregation search fails
    '''
    query = ElasticQuery(Query.term("qid", feature_id))
    agg = Agg("criteria_disease_tags", "terms", {"field": "disease_tags", "size": 0})
    aggs = Aggs(agg)
    if idx_type:
        search = Search(query, aggs=aggs, idx=idx, idx_type=idx_type)
    else:
        search = Search(query, aggs=aggs, idx=idx)

    try:
        r_aggs = search.search().aggs
        buckets = r_aggs['criteria_disease_tags'].get_buckets()
        disease_tags = [dis_dict['key'].lower() for dis_dict in buckets]
    except Exception:
        # was a bare `except:` which also swallowed SystemExit/KeyboardInterrupt;
        # keep the best-effort empty result but only for real errors
        return []

    # get disease docs for the aggregated tags
    if disease_tags:
        (core, other) = Disease.get_site_diseases(dis_list=disease_tags)
        diseases = list(core)
        diseases.extend(other)
        return diseases
    return None
def _find_snp_position(snp_track, name):
    # Resolve a marker name to a 0-based {chr, start, end, name} position,
    # either from the main MARKER index (no track given) or from a
    # per-track chicp stats index named "<group>-<track>".
    if snp_track is None:
        query = ElasticQuery.query_match("id", name)
        elastic = Search(query, idx=ElasticSettings.idx('MARKER'))
        snpResult = elastic.get_json_response()
        if(len(snpResult['hits']['hits'])) > 0:
            snp = snpResult['hits']['hits'][0]['_source']
            chrom = snp['seqid'].replace('chr', "")
            position = snp['start']
            # start is converted to 0-based (BED-style) coordinates
            return {'chr': chrom, 'start': (position-1), 'end': position, 'name': name}
    else:
        # split "group-track"; falls back to "<idx>/<track>" if the idx-type
        # lookup raises a SettingsError
        mo = re.match(r"(.*)-(.*)", snp_track)
        (group, track) = mo.group(1, 2)
        try:
            snp_track_idx = ElasticSettings.idx('CP_STATS_'+group.upper(), snp_track.upper())
        except SettingsError:
            snp_track_idx = ElasticSettings.idx('CP_STATS_'+group.upper())+"/"+track
        query = ElasticQuery.query_match("name", name)
        elastic = Search(query, idx=snp_track_idx)
        snpResult = elastic.get_json_response()
        if(len(snpResult['hits']['hits'])) > 0:
            snp = snpResult['hits']['hits'][0]['_source']
            chrom = snp['seqid'].replace('chr', "")
            position = snp['start']
            return {'chr': chrom, 'start': (position-1), 'end': position, 'name': name}
    # no hit in either branch
    return {'error': 'Marker '+name+' does not exist in the currently selected dataset'}
def filter_queryset(self, request, queryset, view):
    ''' Override this method to request just the documents required from elastic. '''
    limit = view.paginator.get_limit(request)
    offset = view.paginator.get_offset(request)
    filterable = getattr(view, 'filter_fields', [])
    filters = {k: v for k, v in request.GET.items() if k in filterable}

    search_filters = self._build_filters(filters=filters)
    if search_filters is None:
        q = ElasticQuery(Query.match_all())
    else:
        q = ElasticQuery.filtered(Query.match_all(), search_filters)

    s = Search(search_query=q, idx=getattr(view, 'idx'), size=limit, search_from=offset)
    json_results = s.get_json_response()

    results = []
    for hit in json_results['hits']['hits']:
        obj = ElasticObject(initial=hit['_source'])
        obj.uuid = hit['_id']
        results.append(obj)
    view.es_count = json_results['hits']['total']
    return results
def get_gene_docs_by_ensembl_id(cls, ens_ids, sources=None):
    ''' Get the gene symbols for the corresponding array of ensembl IDs.
    A dictionary is returned with the key being the ensembl ID and the
    value the gene document. '''
    search = Search(ElasticQuery(Query.ids(ens_ids), sources=sources),
                    idx=ElasticSettings.idx('GENE', idx_type='GENE'),
                    size=len(ens_ids))
    return {doc.doc_id(): doc for doc in search.search().docs}
def filter_queryset(self, request, queryset, view):
    ''' Override this method to request just the documents required from Rserve. '''
    # Bind the default up front: the original assigned mid1 inside the try,
    # so any exception raised before that line made the except handler
    # itself fail with UnboundLocalError.
    mid1 = 'rs2476601'
    try:
        filterable = getattr(view, 'filter_fields', [])
        filters = dict([(k, v) for k, v in request.GET.items() if k in filterable])
        mid1 = filters.get('marker', 'rs2476601')
        dataset = filters.get('dataset', 'EUR').replace('-', '')
        # look up the marker's chromosome from the MARKER index
        query = ElasticQuery(BoolQuery(must_arr=[Query.term("id", mid1)]),
                             sources=['seqid', 'start'])
        elastic = Search(search_query=query,
                         idx=ElasticSettings.idx('MARKER', 'MARKER'), size=1)
        doc = elastic.search().docs[0]
        seqid = getattr(doc, 'seqid')
        # fetch population data for the marker from Rserve
        rserve = getattr(settings, 'RSERVE')
        conn = pyRserve.connect(host=rserve.get('HOST'), port=rserve.get('PORT'))
        pop_str = conn.r.get_pop(dataset, seqid, mid1)
        pops = json.loads(str(pop_str))
        populations = []
        for pop in pops:
            pops[pop]['population'] = pop
            populations.append(pops[pop])
        conn.close()
        return [ElasticObject(initial={'populations': populations, 'marker': mid1})]
    except (TypeError, ValueError, IndexError, ConnectionError):
        # best-effort: return an empty result for the requested marker
        return [ElasticObject(initial={'populations': None, 'marker': mid1})]
def test_filter(self):
    ''' Filter Aggregation '''
    metric_aggs = [Agg('test_filter', 'filter', RangeQuery('start', gt='25000')),
                   Agg('avg_start', 'avg', {"field": 'start'}),
                   Agg('min_start', 'min', {"field": 'start'}),
                   Agg('sum_start', 'sum', {"field": 'start'}),
                   Agg('stats_start', 'stats', {"field": 'start'}),
                   Agg('count_start', 'value_count', {"field": 'start'}),
                   Agg('ext_stats_start', 'extended_stats', {"field": 'start'})]
    r_aggs = Search(aggs=Aggs(metric_aggs),
                    idx=ElasticSettings.idx('DEFAULT')).search().aggs
    self.assertTrue('avg_start' in r_aggs, "returned avg aggregation")
    self.assertTrue('min_start' in r_aggs, "returned min aggregation")
    stats_keys = ["min", "max", "sum", "count", "avg"]
    self.assertTrue(all(hasattr(r_aggs['stats_start'], k) for k in stats_keys),
                    "returned min aggregation")
    # extended stats add the variance/deviation family of attributes
    stats_keys.extend(["sum_of_squares", "variance", "std_deviation",
                       "std_deviation_bounds"])
    self.assertTrue(all(hasattr(r_aggs['ext_stats_start'], k) for k in stats_keys),
                    "returned min aggregation")
def filter_queryset(self, request, queryset, view):
    ''' Override this method to request just the documents required from elastic. '''
    q_size = view.paginator.get_limit(request)
    q_from = view.paginator.get_offset(request)
    filterable = getattr(view, 'filter_fields', [])
    # removed leftover debug print() calls of filterable/request
    filters = dict([(k, v) for k, v in request.GET.items() if k in filterable])

    criteria_idx = self._get_index(filters.get('feature_type', 'GENE_CRITERIA'))
    # criteria_idx may be one index name or a list of names
    if isinstance(criteria_idx, list):
        idx = ','.join(ElasticSettings.idx(name) for name in criteria_idx)
    else:
        idx = ElasticSettings.idx(criteria_idx)

    q = ElasticQuery(Query.match_all())
    s = Search(search_query=q, idx=idx, size=q_size, search_from=q_from)
    json_results = s.get_json_response()
    results = []
    for result in json_results['hits']['hits']:
        new_obj = ElasticObject(initial=result['_source'])
        new_obj.uuid = result['_id']
        new_obj.criteria_type = result['_type']
        results.append(new_obj)
    view.es_count = json_results['hits']['total']
    return results
def test_top_hits(self):
    ''' Top Hits Aggregation '''
    agg_pair = [Agg('test_filter', 'filter', RangeQuery('start', gt='2000')),
                Agg('test_top_hits', 'top_hits', {"size": 1})]
    search = Search(aggs=Aggs(agg_pair), idx=ElasticSettings.idx('DEFAULT'))
    top_hits = search.search().aggs['test_top_hits'].get_hits()
    self.assertTrue(len(top_hits) == 1, "returned the top hit")
def test_top_hits_sub_agg(self):
    ''' Top hits as a sub-aggregation of a terms bucket per index. '''
    hits_sub = Agg('idx_top_hits', 'top_hits', {"size": 1})
    aggs = Aggs([Agg("idxs", "terms", {"field": "_index"}, sub_agg=hits_sub),
                 Agg("categories", "terms", {"field": "_type", "size": 0})])
    search = Search(aggs=aggs, idx=ElasticSettings.idx('DEFAULT'))
    buckets = search.search().aggs['idxs'].get_docs_in_buckets()
    default_idx = ElasticSettings.idx('DEFAULT')
    self.assertEqual(buckets[default_idx]['doc_count'], 3)
    self.assertEqual(len(buckets[default_idx]['docs']), 1)
def test_missing(self):
    ''' Missing Aggregation '''
    missing_agg = Agg("test_missing", "missing", {"field": "seqid"})
    result_aggs = Search(aggs=Aggs(missing_agg),
                         idx=ElasticSettings.idx('DEFAULT')).search().aggs
    self.assertTrue(getattr(result_aggs['test_missing'], 'doc_count') == 0,
                    "no missing seqid fields")
def fetch_overlapping_features(cls, build, seqid, start, end, idx=None, idx_type=None, disease_id=None):
    ''' function to create fetch overlapping features for a given stretch of region
    the build info is stored as nested document..so nested query is build
    @type build: string
    @param build: build info eg: 'GRCh38'
    @type seqid: string
    @param seqid: chromosome number
    @type start: string
    @param start: region start
    @type end: string
    @param end: region end
    @type idx: string
    @param idx: name of the index
    @type idx_type: string
    @param idx_type: name of the idx type, each criteria is an index type
    @type disease_id: string
    @param disease_id: disease code
    '''
    nbuild = build
    start_range = start
    end_range = end

    # overlap = feature spans the whole region (bool_range), or its start
    # falls inside the region, or its end falls inside the region
    bool_range = BoolQuery()
    bool_range.must(RangeQuery("build_info.start", lte=start_range)) \
              .must(RangeQuery("build_info.end", gte=end_range))
    or_filter = OrFilter(RangeQuery("build_info.start", gte=start_range, lte=end_range))
    or_filter.extend(RangeQuery("build_info.end", gte=start_range, lte=end_range)) \
             .extend(bool_range)

    bool_query = BoolQuery()
    if disease_id:
        # NOTE(review): the nested query here wraps the still-empty
        # bool_query, so the positional constraints above are not applied
        # on this branch — confirm this is intentional.
        qnested_buildinfo = Query.nested('build_info', bool_query)
        bool_query = BoolQuery()
        bool_query.must(Query.term("disease", disease_id.lower())).must(qnested_buildinfo)
        qnested = ElasticQuery(bool_query, sources=['build_info.*', 'disease_locus',
                                                    'disease', 'chr_band', 'species'])
    else:
        # constrain by build and chromosome, filtered by the overlap clauses
        bool_query.must(Query.term("build_info.build", nbuild)) \
                  .must(Query.term("build_info.seqid", seqid)) \
                  .filter(or_filter)
        qnested = ElasticQuery(Query.nested('build_info', bool_query),
                               sources=['build_info.*', 'disease_locus',
                                        'disease', 'chr_band', 'species'])

    elastic = Search(qnested, idx=idx, idx_type=idx_type)
    res = elastic.search()
    return res.docs
def test_bool_filtered_query(self):
    ''' Test building and running a filtered boolean query. '''
    bool_filter = BoolQuery(must_not_arr=[Query.term("seqid", 2)],
                            should_arr=[RangeQuery("start", gte=10050)])
    bool_filter.must([Query.term("id", "rs768019142")]) \
               .should(RangeQuery("start", gte=10054))
    filtered = ElasticQuery.filtered_bool(Query.match_all(), bool_filter,
                                          sources=["id", "seqid"])
    search = Search(filtered, idx=ElasticSettings.idx('DEFAULT'))
    self.assertTrue(search.search().hits_total == 1,
                    "Elastic filtered query retrieved marker (rs768019142)")
def region_page(request, region):
    ''' Region elastic: render the region page for the given region id. '''
    query = ElasticQuery.query_match("attr.region_id", region)
    elastic = Search(query, idx=ElasticSettings.idx(name='REGION'))
    context = elastic.get_result()
    context['title'] = "Region"
    # removed leftover debug print(context) that ran on every request
    return render(request, 'region/region.html', context, content_type='text/html')
def test_and_filtered_query(self):
    ''' Test building and running a filtered query. '''
    start_bool = BoolQuery(must_arr=[RangeQuery("start", gte=1)])
    and_filter = AndFilter(start_bool)
    and_filter.extend(RangeQuery("start", gte=1)) \
              .extend(Query.term("seqid", 1))
    filtered = ElasticQuery.filtered(Query.term("seqid", 1), and_filter)
    search = Search(filtered, idx=ElasticSettings.idx('DEFAULT'))
    self.assertTrue(search.search().hits_total >= 1,
                    "Elastic filtered query retrieved marker(s)")
def test_terms_avg_order(self):
    ''' Test average and order. '''
    agg_name = "test"
    avg_sub = Agg('avg_start', 'avg', {"field": "start"})
    terms_agg = Agg(agg_name, "terms",
                    {"field": "seqid", "size": 0, "order": {"avg_start": "desc"}},
                    sub_agg=avg_sub)
    result_aggs = Search(aggs=Aggs(terms_agg),
                         idx=ElasticSettings.idx('DEFAULT')).search().aggs
    self.assertTrue(agg_name in result_aggs, "returned test aggregations")
    self.assertGreater(result_aggs['test'].get_buckets()[0]['doc_count'], 1)
def test_url_rotate(self):
    ''' Test the url rotates from http://xxx:9200 to correct url. '''
    filtered = ElasticQuery.filtered(Query.term("seqid", 1),
                                     Filter(Query.term("id", "rs768019142")))
    search = Search(filtered, idx=ElasticSettings.idx('DEFAULT'))
    self.assertTrue(search.search().hits_total == 1,
                    "Elastic filtered query retrieved marker")
    Search.index_exists('test', 'test2')
    ElasticUrl.URL_INDEX = 0  # reset
def get_elastic_settings_with_user_uploads(cls, elastic_dict=None, new_upload_file=None):
    '''Get the updated elastic settings with user uploaded idx_types'''
    idx_key = 'CP_STATS_UD'
    idx = ElasticSettings.idx(idx_key)
    ''' Check if an index type exists in elastic and later check there is a contenttype/model for the given elastic index type. '''  # @IgnorePep8
    elastic_url = ElasticSettings.url()
    url = idx + '/_mapping'
    response = Search.elastic_request(elastic_url, url, is_post=False)
    ''' why don't we use Search.get_mapping ? I guess it's not a class method'''
    #logger.debug(response.json())
    if "error" in response.json():
        logger.warn(response.json())
        return None

    # get idx_types from _mapping
    elastic_mapping = json.loads(response.content.decode("utf-8"))
    # here if we use aliasing then idx can be different
    # this causes problems as it's effectively hardcoded
    # this should fix to handle things where aliases are deployed
    idx = list(elastic_mapping.keys())[0]
    idx_types = list(elastic_mapping[idx]['mappings'].keys())
    if elastic_dict is None:
        elastic_dict = ElasticSettings.attrs().get('IDX')
    idx_type_dict = {}

    # only include idx types that have a matching permission ContentType
    existing_ct = [ct.name for ct in ContentType.objects.filter(app_label=cls.PERMISSION_MODEL_APP_NAME)]
    for idx_type in idx_types:
        idx_type_with_suffix = idx_type + cls.PERMISSION_MODEL_TYPE_SUFFIX
        for ct in existing_ct:
            if ct.endswith(idx_type_with_suffix):
                # fetch the stored label from the type's _meta doc; fall
                # back to a generated "UD-<type>" label on any failure
                meta_url = idx + '/' + idx_type + '/_meta/_source'
                meta_response = Search.elastic_request(elastic_url, meta_url, is_post=False)
                try:
                    elastic_meta = json.loads(meta_response.content.decode("utf-8"))
                    label = elastic_meta['label']
                except:
                    label = "UD-" + idx_type
                idx_type_dict['UD-' + idx_type.upper()] = {'label': label, 'type': idx_type}

    # a freshly uploaded file won't be in elastic yet; add it explicitly
    if new_upload_file is not None:
        idx_type = new_upload_file
        label = "UD-" + idx_type
        idx_type_dict['UD-' + idx_type.upper()] = {'label': label, 'type': idx_type}

    elastic_dict['CP_STATS_UD']['idx_type'] = idx_type_dict
    return elastic_dict
def test_bool_filtered_query2(self):
    ''' Test building and running a filtered boolean query. '''
    bool_filter = BoolQuery()
    bool_filter.should(RangeQuery("start", lte=20000)) \
               .should(Query.term("seqid", 2)) \
               .must(Query.term("seqid", 1))
    str_query = Query.query_string("rs768019142", fields=["id", "seqid"])
    filtered = ElasticQuery.filtered_bool(str_query, bool_filter,
                                          sources=["id", "seqid", "start"])
    search = Search(filtered, idx=ElasticSettings.idx('DEFAULT'))
    self.assertTrue(search.search().hits_total == 1,
                    "Elastic filtered query retrieved marker (rs768019142)")
def test_filters(self):
    ''' Filters Aggregation '''
    named_filters = {'filters': {'start_gt': RangeQuery('start', gt='1000'),
                                 'start_lt': RangeQuery('start', lt='100000')}}
    filters_agg = Agg('test_filters', 'filters', named_filters)
    result_aggs = Search(aggs=Aggs(filters_agg),
                         idx=ElasticSettings.idx('DEFAULT')).search().aggs
    self.assertTrue('start_lt' in result_aggs['test_filters'].get_buckets(),
                    "returned avg aggregation")
def ajax_range_overlap_search(request, src, start, stop, search_idx, ajax):
    ''' Return count or paginated range elastic result as a JSON '''
    # count-only request: no pagination needed
    if ajax == 'count':
        count_search = Search.range_overlap_query(src, start, stop, idx=search_idx)
        return JsonResponse(count_search.get_count())
    search_from = request.POST.get("from")
    size = request.POST.get("size")
    page_search = Search.range_overlap_query(src, start, stop, search_from=search_from,
                                             size=size, idx=search_idx)
    return JsonResponse(page_search.get_json_response())
def test_terms_query(self):
    ''' Test building and running a match query. '''
    highlight = Highlight(["id"])
    terms_query = ElasticQuery(Query.terms("id", ["rs2476601", "rs768019142"]),
                               highlight=highlight)
    hits = Search(terms_query, idx=ElasticSettings.idx('DEFAULT')).search().docs
    self.assertTrue(len(hits) == 2,
                    "Elastic string query retrieved markers (rs2476601, rs768019142)")
    self.assertTrue(getattr(hits[0], 'seqid'), "Hit attribute found")
    self.assertTrue(hits[0].highlight() is not None, "highlighting found")
def ajax_search(request, query, search_idx, ajax):
    ''' Return count or paginated elastic result as a JSON '''
    # NOTE(review): `fields` is not defined in this function — presumably a
    # module-level constant listing the searchable fields; confirm it exists
    # at import time or this raises NameError.
    if ajax == 'count':
        elastic = Search.field_search_query(query, fields=fields, idx=search_idx)
        return JsonResponse(elastic.get_count())
    search_from = request.POST.get("from")
    size = request.POST.get("size")
    elastic = Search.field_search_query(query, fields=fields, search_from=search_from,
                                        size=size, idx=search_idx)
    return JsonResponse(elastic.get_json_response())
def _build_frags_query(frags_idx, chrom, segmin, segmax):
    ''' Fetch fragments overlapping [segmin, segmax] on the chromosome and
    make their coordinates relative to the segment. '''
    end_filter = Filter(RangeQuery("end", gte=segmin, lte=segmax))
    query = ElasticQuery.filtered(Query.terms("seqid", [chrom, str("chr"+chrom)]),
                                  end_filter, utils.bedFields)
    frag_search = Search(search_query=query, search_from=0, size=2000000, idx=frags_idx)
    frags = frag_search.get_result()['data']
    return utils.makeRelative(int(segmin), int(segmax), ['start', 'end'], frags)
def test_or_filtered_query(self):
    ''' Test building and running a filtered query. '''
    highlight = Highlight(["id", "seqid"])
    range_bool = BoolQuery(must_arr=[RangeQuery("start", lte=1),
                                     RangeQuery("end", gte=100000)])
    or_filter = OrFilter(RangeQuery("start", gte=1, lte=100000))
    or_filter.extend(range_bool) \
             .extend(Query.query_string("rs*", fields=["id", "seqid"]).query_wrap())
    filtered = ElasticQuery.filtered(Query.term("seqid", 1), or_filter,
                                     highlight=highlight)
    search = Search(filtered, idx=ElasticSettings.idx('DEFAULT'))
    self.assertTrue(search.search().hits_total >= 1,
                    "Elastic filtered query retrieved marker(s)")
def test_range(self):
    ''' Range Aggregation '''
    range_agg = Agg("test_range_agg", "range",
                    {"field": "start",
                     "ranges": [{"to": 10000}, {"from": 10000, "to": 15000}]})
    result_aggs = Search(aggs=Aggs(range_agg),
                         idx=ElasticSettings.idx('DEFAULT')).search().aggs
    self.assertTrue(len(result_aggs['test_range_agg'].get_buckets()) == 2,
                    "returned two buckets in range aggregations")
def get_rdm_docs(cls, idx, idx_type, qbool=Query.match_all(), sources=[], size=1):
    ''' Get a random doc from the indices. '''
    # NOTE(review): the mutable default `sources=[]` and the shared
    # `Query.match_all()` default are evaluated once at definition time;
    # safe only while nothing mutates them.
    score_function1 = ScoreFunction.create_score_function('random_score',
                                                          seed=random.randint(0, 1000000))
    search_query = ElasticQuery(FunctionScoreQuery(qbool, [score_function1],
                                                   boost_mode='replace'),
                                sources=sources)
    elastic = Search(search_query=search_query, size=size, idx=idx, idx_type=idx_type)
    try:
        return elastic.search().docs
    except IndexError:
        # retry recursively until documents are returned; could recurse
        # indefinitely if the index stays empty — TODO confirm acceptable
        return cls.get_rdm_docs(idx, idx_type, qbool, sources, size)
def test_region_idx_loader(self):
    ''' Test loader has created and populated indices. '''
    key = 'PRIVATE_REGIONS_GFF'
    if key in IDX.keys():
        idx = IDX[key]['indexName']
        Search.index_refresh(idx)
        self.assertTrue(Search.index_exists(idx=idx), 'Index exists: '+idx)
        doc_count = Search(idx=idx).get_count()['count']
        self.assertTrue(doc_count > 0,
                        "Elastic count documents in " + idx + ": " + str(doc_count))
def test_bool_nested_filter(self):
    ''' Test combined Bool filter '''
    inner_bool = BoolQuery()
    inner_bool.must(Query.match("id", "rs768019142").query_wrap()) \
              .must(Query.term("seqid", 1))
    outer_bool = BoolQuery()
    outer_bool.should(inner_bool) \
              .should(Query.term("seqid", 2))
    filtered = ElasticQuery.filtered_bool(Query.match_all(), outer_bool,
                                          sources=["id", "seqid", "start"])
    search = Search(filtered, idx=ElasticSettings.idx('DEFAULT'))
    self.assertTrue(search.search().hits_total >= 1, "Nested bool filter query")
def test_bool_query(self):
    ''' Test a bool query. '''
    bool_query = BoolQuery()
    highlight = Highlight(["id", "seqid"])
    bool_query.must(Query.term("id", "rs768019142")) \
              .must(RangeQuery("start", gt=1000)) \
              .must_not(Query.match("seqid", "2")) \
              .should(Query.match("seqid", "3")) \
              .should(Query.match("seqid", "1"))
    query = ElasticQuery.bool(bool_query, highlight=highlight)
    search = Search(query, idx=ElasticSettings.idx('DEFAULT'))
    self.assertTrue(len(search.search().docs) == 1,
                    "Elastic string query retrieved marker (rs768019142)")
def test_idx_loader(self):
    ''' Test loader has created and populated indices. '''
    for key in IDX:
        idx = IDX[key]['indexName']
        # check the index has documents, allow for the indexing to complete if necessary
        # Search.index_refresh(idx)
        self.assertTrue(Search.index_exists(idx=idx), 'Index exists: ' + idx)
        ndocs = Search(idx=idx).get_count()['count']
        self.assertTrue(
            ndocs > 0, "Elastic count documents in " + idx + ": " + str(ndocs))
def test_pub_ini_file2(self):
    ''' Test publication pipeline with a list of PMIDs. '''
    out = StringIO()
    call_command('publications', '--dir', TEST_DATA_DIR, '--steps', 'load',
                 sections='DISEASE::TEST', ini=MY_PUB_INI_FILE, stdout=out)
    # refresh the index the pipeline wrote to before querying it
    INI_CONFIG = IniParser().read_ini(MY_PUB_INI_FILE)
    idx = INI_CONFIG['DISEASE']['index']
    Search.index_refresh(idx)
    tag_query = ElasticQuery.query_string("test", fields=["tags.disease"])
    tagged_docs = Search(tag_query, idx=idx).search().docs
    self.assertGreater(len(tagged_docs), 1)
def test_bool_filtered_query4(self):
    ''' Test building and running a filtered boolean query.
    Note: ElasticQuery used to wrap match in a query object. '''
    bool_filter = BoolQuery()
    bool_filter.should(RangeQuery("start", lte=20000)) \
               .should(Query.term("seqid", 2)) \
               .must(Query.match("id", "rs768019142").query_wrap()) \
               .must(Query.term("seqid", 1))
    filtered = ElasticQuery.filtered_bool(Query.match_all(), bool_filter,
                                          sources=["id", "seqid", "start"])
    search = Search(filtered, idx=ElasticSettings.idx('DEFAULT'))
    self.assertTrue(search.search().hits_total == 1,
                    "Elastic filtered query retrieved marker (rs768019142)")
def test_update_doc(self):
    ''' Update with a partial document. '''
    idx = IDX['MARKER']['indexName']
    docs = Search(ElasticQuery(Query.term("id", "rs2476601"), sources=['id']),
                  idx=idx).search().docs
    self.assertEquals(len(docs), 1, "rs2476601 document")
    # apply a partial update then refresh so the change is searchable
    partial_doc = {"doc": {"start": 100, "end": 200}}
    Update.update_doc(docs[0], partial_doc)
    Search.index_refresh(IDX['MARKER']['indexName'])
    docs = Search(ElasticQuery(Query.term("id", "rs2476601")), idx=idx).search().docs
    self.assertEquals(len(docs), 1, "rs2476601 document")
    self.assertEquals(getattr(docs[0], 'start'), 100, "rs2476601 start")
    self.assertEquals(getattr(docs[0], 'end'), 200, "rs2476601 end")
def get_object(self):
    ''' Fetch a single document by id and return it as an ElasticObject,
    or raise Http404 when it cannot be found. '''
    query = ElasticQuery(Query.ids(self.kwargs[self.lookup_field]))
    search = Search(search_query=query, idx=getattr(self, 'idx'))
    try:
        top_hit = search.get_json_response()['hits']['hits'][0]
        obj = ElasticObject(initial=top_hit['_source'])
        obj.uuid = top_hit['_id']
        # May raise a permission denied
        self.check_object_permissions(self.request, obj)
        return obj
    except (TypeError, ValueError, IndexError):
        raise Http404
def filter_queryset(self, request, queryset, view):
    ''' Override this method to request feature locations. '''
    try:
        filterable = getattr(view, 'filter_fields', [])
        filters = dict([(k, v) for k, v in request.GET.items() if k in filterable])
        query_str = filters.get('feature', 'PTPN22')
        build = self._get_build(filters.get('build', settings.DEFAULT_BUILD))
        if query_str is None or query_str == '':
            return [ElasticObject(initial={'error': 'No feature name provided.'})]

        search_fields = ['id', 'symbol', 'dbxrefs.ensembl', 'region_name']
        sources = ['start', 'stop', 'seqid', 'chromosome', 'disease_loci']

        # pick the marker index whose build label matches the requested build;
        # the default 'MARKER' alias is preferred, otherwise scan all
        # MARKER-like indices (last match wins — no break)
        idxs = ElasticSettings.getattr('IDX')
        MARKER_IDX = ''
        if build == ElasticSettings.get_label('MARKER', label='build'):
            MARKER_IDX = 'MARKER'
        if MARKER_IDX == '':
            for idx in idxs:
                if 'MARKER' in idx:
                    if build == ElasticSettings.get_label(idx, label='build'):
                        MARKER_IDX = idx

        # search marker, region and gene indices together
        (idx, idx_type) = ElasticSettings.idx_names(MARKER_IDX, 'MARKER')
        (idx_r, idx_type_r) = ElasticSettings.idx_names('REGION', 'REGION')
        (idx_g, idx_type_g) = ElasticSettings.idx_names('GENE', 'GENE')
        idx += ',' + idx_r + ',' + idx_g
        idx_type += ',' + idx_type_r + ',' + idx_type_g

        equery = BoolQuery(must_arr=Query.query_string(query_str, fields=search_fields))
        elastic = Search(search_query=ElasticQuery(equery, sources), size=10,
                         idx=idx, idx_type=idx_type)
        docs = elastic.search().docs
        locs = []
        for doc in docs:
            # regions may need padding before a position can be derived
            if isinstance(doc, RegionDocument):
                doc = Region.pad_region_doc(doc)
            # position string looks like "chrN:start-end" or "chrN:pos"
            loc = doc.get_position(build=build).split(':')
            pos = loc[1].replace(',', '').split('-')
            locs.append(ElasticObject(
                {'feature': query_str,
                 'chr': loc[0],
                 'start': int(pos[0]),
                 'end': int(pos[1]) if len(pos) > 1 else int(pos[0]),
                 'locusString': query_str+" ("+str(loc[1])+")"}))
        return locs
    except (TypeError, ValueError, IndexError, ConnectionError):
        raise Http404
def get_object(self):
    ''' Fetch a single criteria document by id, attach its criteria type,
    and return it as an ElasticObject; raise Http404 when not found. '''
    query = ElasticQuery(Query.ids(self.kwargs[self.lookup_field]))
    search = Search(search_query=query, idx=getattr(self, 'idx'))
    try:
        top_hit = search.get_json_response()['hits']['hits'][0]
        obj = ElasticObject(initial=top_hit['_source'])
        obj.uuid = top_hit['_id']
        obj.criteria_type = top_hit['_type']
        # May raise a permission denied
        self.check_object_permissions(self.request, obj)
        return obj
    except (TypeError, ValueError, IndexError):
        raise Http404
def test_region_idx_loader(self):
    ''' Test loader has created and populated indices. '''
    key = 'PRIVATE_REGIONS_GFF'
    if key in IDX.keys():
        region_idx = IDX[key]['indexName']
        Search.index_refresh(region_idx)
        self.assertTrue(Search.index_exists(idx=region_idx), 'Index exists: ' + region_idx)
        ndocs = Search(idx=region_idx).get_count()['count']
        self.assertTrue(
            ndocs > 0, "Elastic count documents in " + region_idx + ": " + str(ndocs))
def post(self, request, *args, **kwargs):
    ''' Return study hits matched by ensembl gene id, a single marker or a
    list of markers, with gene symbols and publication details resolved. '''
    ens_id = self.request.POST.get('ens_id')
    marker = self.request.POST.get('marker')
    markers = self.request.POST.getlist('markers[]')
    # NOTE(review): if none of ens_id/marker/markers is supplied, sfilter is
    # unbound and the query below raises NameError — confirm callers always
    # send one of them.
    if ens_id:
        sfilter = Filter(Query.query_string(ens_id, fields=["genes"]).query_wrap())
    elif marker:
        sfilter = Filter(Query.query_string(marker, fields=["marker"]).query_wrap())
    elif markers:
        sfilter = Filter(Query.query_string(' '.join(markers), fields=["marker"]).query_wrap())

    query = ElasticQuery.filtered(Query.match_all(), sfilter)
    elastic = Search(query, idx=ElasticSettings.idx('REGION', 'STUDY_HITS'), size=500)
    study_hits = elastic.get_json_response()['hits']

    # collect all referenced ensembl ids and pmids for batch lookups
    ens_ids = []
    pmids = []
    for hit in study_hits['hits']:
        if 'pmid' in hit['_source']:
            pmids.append(hit['_source']['pmid'])
        if 'genes' in hit['_source']:
            for ens_id in hit['_source']['genes']:
                ens_ids.append(ens_id)
    docs = utils.get_gene_docs_by_ensembl_id(ens_ids, ['symbol'])
    pub_docs = PublicationDocument.get_pub_docs_by_pmid(pmids, sources=['authors.name', 'journal'])

    for hit in study_hits['hits']:
        genes = {}
        if 'genes' in hit['_source']:
            for ens_id in hit['_source']['genes']:
                try:
                    genes[ens_id] = getattr(docs[ens_id], 'symbol')
                except KeyError:
                    # fixed: was `genes = {ens_id: ens_id}`, which reset the
                    # dict and discarded previously resolved symbols
                    genes[ens_id] = ens_id
        hit['_source']['genes'] = genes
        if 'pmid' in hit['_source']:
            pmid = hit['_source']['pmid']
            try:
                authors = getattr(pub_docs[pmid], 'authors')
                journal = getattr(pub_docs[pmid], 'journal')
                hit['_source']['pmid'] = \
                    {'pmid': pmid,
                     'author': authors[0]['name'].rsplit(None, 1)[-1] if authors else "",
                     'journal': journal}
            except KeyError:
                hit['_source']['pmid'] = {'pmid': pmid}
    return JsonResponse(study_hits)
def test_mapping(self):
    ''' Test retrieving the mapping for an index. '''
    default_idx = ElasticSettings.idx('DEFAULT')
    elastic = Search(idx=default_idx)

    mapping = elastic.get_mapping()
    self.assertTrue(default_idx in mapping, "Database name in mapping result")
    if default_idx in mapping:
        self.assertTrue("mappings" in mapping[default_idx], "Mapping result found")

    # check using the index type
    mapping = elastic.get_mapping('marker')
    self.assertTrue(default_idx in mapping, "Database name in mapping result")

    # err check
    mapping = elastic.get_mapping('marker/xx')
    self.assertTrue('error' in mapping, "Database name in mapping result")
def _build_exon_query(chrom, segmin, segmax, genes):
    ''' Fetch the exonic structure for the given genes on this chromosome,
    with coordinates made relative to the segment. '''
    gene_exons = {}
    chrom_filter = BoolQuery()
    chrom_filter.must([Query.term("seqid", chrom)])
    for gene in genes:
        query = ElasticQuery.filtered_bool(Query.query_string(gene["gene_id"], fields=["name"]),
                                           chrom_filter, sources=utils.snpFields)
        elastic = Search(query, idx=getattr(chicp_settings, 'CP_GENE_IDX')+'/exons/',
                         search_from=0, size=2000)
        exons = elastic.get_result()['data']
        exons = utils.makeRelative(int(segmin), int(segmax), ['start', 'end'], exons)
        gene_exons[gene["gene_id"]] = sorted(exons, key=operator.itemgetter("start"))
    return gene_exons
def study_page(request, study):
    ''' Renders a study page for one or more (comma-separated) study ids. '''
    if study is None:
        messages.error(request, 'No study id given.')
        raise Http404()
    res = Search(ElasticQuery(Query.ids(study.split(','))),
                 idx=ElasticSettings.idx('STUDY', 'STUDY'),
                 size=5).search(obj_document=StudyDocument)
    if 0 < res.hits_total < 9:
        title = ', '.join(getattr(doc, 'study_name') for doc in res.docs)
        return render(request, 'study/study.html',
                      {'features': res.docs, 'title': title},
                      content_type='text/html')
    if res.hits_total == 0:
        messages.error(request, 'Study(s) '+study+' not found.')
    raise Http404()
def _build_frags_query(frags_idx, chrom, segmin, segmax):
    ''' Fetch fragments overlapping the segment on this chromosome (seqid
    matched with and without a 'chr' prefix), relative to the segment. '''
    query = ElasticQuery.filtered(Query.terms("seqid", [chrom, str("chr"+chrom)]),
                                  Filter(RangeQuery("end", gte=segmin, lte=segmax)),
                                  utils.bedFields)
    response = Search(search_query=query, search_from=0, size=10000,
                      idx=frags_idx).get_json_response()
    frags = [hit['_source'] for hit in response['hits']['hits']]
    return utils.makeRelative(int(segmin), int(segmax), ['start', 'end'], frags)
def _build_hic_query(query, targetIdx, segmin=0, segmax=0):
    ''' Run the Hi-C query; if no segment bounds were supplied, derive them
    from the hits and pad by 5%. Returns (hits, segmin, segmax). '''
    hic = []
    result = Search(query, idx=targetIdx, search_from=0, size=2000).get_result()
    if len(result['data']) > 0:
        hic = result['data']
        if segmin == 0 or segmax == 0:
            (segmin, segmax) = utils.segCoords(hic)
            extension = int(0.05*(segmax-segmin))
            segmin -= extension
            segmax += extension
        hic = utils.makeRelative(int(segmin), int(segmax),
                                 ['baitStart', 'baitEnd', 'oeStart', 'oeEnd'], hic)
    return hic, segmin, segmax
def filter_queryset(self, request, queryset, view):
    """ Override this method to request just the documents required from Rserve. """
    try:
        filterable = getattr(view, "filter_fields", [])
        filters = {k: v for k, v in request.GET.items() if k in filterable}

        mid1 = filters.get("m1")
        if mid1 is None or mid1 == "":
            return [ElasticObject(initial={"error": "No marker ID provided."})]

        dataset = filters.get("dataset", "EUR").replace("-", "")
        mid2 = filters.get("m2")
        window_size = int(filters.get("window_size", 1000000))
        # NOTE(review): dprime/rsq pass through as strings when supplied in
        # the query string; only the defaults are floats - confirm Rserve side
        dprime = filters.get("dprime", 0.0)
        rsq = filters.get("rsq", 0.8)
        # coerce any truthy value to True, otherwise keep the falsy original
        maf = filters.get("maf", False)
        if maf:
            maf = True
        build_version = filters.get("build", "GRCh38").lower()
        pos = filters.get("pos", False)
        if pos:
            pos = True

        # look up the chromosome of the first marker
        query = ElasticQuery(BoolQuery(must_arr=[Query.term("id", mid1)]),
                             sources=["seqid", "start"])
        doc = Search(search_query=query,
                     idx=ElasticSettings.idx("MARKER", "MARKER"),
                     size=1).search().docs[0]
        seqid = getattr(doc, "seqid")

        rserve = getattr(settings, "RSERVE")
        conn = pyRserve.connect(host=rserve.get("HOST"), port=rserve.get("PORT"))
        ld_str = conn.r.ld_run(dataset, seqid, mid1, marker2=mid2,
                               window_size=window_size, dprime=dprime,
                               rsq=rsq, maf=maf, position=pos,
                               build_version=build_version)
        ld_str = ld_str.replace("D.prime", "dprime").replace("R.squared", "rsquared")
        conn.close()
        return [ElasticObject(initial=json.loads(str(ld_str)))]
    except (TypeError, ValueError, IndexError, ConnectionError):
        raise Http404
def marker_page(request):
    ''' Renders a marker page: the current marker doc, old dbSNP versions,
    immunochip docs, rs-merge history and criteria. '''
    query_dict = request.GET
    marker = query_dict.get("m")
    if marker is None:
        # was 'No gene name given.' - copy/paste error from the gene page
        messages.error(request, 'No marker name given.')
        raise Http404()
    # rs-ids are searched on id/rscurrent; anything else on name
    fields = ['id', 'rscurrent'] if marker.startswith("rs") else ['name']
    sub_agg = Agg('top_hits', 'top_hits', {"size": 15})
    aggs = Aggs(Agg("types", "terms", {"field": "_type"}, sub_agg=sub_agg))
    query = ElasticQuery(Query.query_string(marker, fields=fields))
    elastic = Search(search_query=query, idx=ElasticSettings.idx('MARKER'), aggs=aggs, size=0)
    res = elastic.search()
    if res.hits_total >= 1:
        # bucket the hits by document type
        types = getattr(res.aggs['types'], 'buckets')
        marker_doc = None
        ic_docs = []
        history_docs = []
        for doc_type in types:
            hits = doc_type['top_hits']['hits']['hits']
            for hit in hits:
                doc = Document(hit)
                if 'marker' == doc_type['key']:
                    marker_doc = doc
                elif 'immunochip' == doc_type['key']:
                    ic_docs.append(doc)
                elif 'rs_merge' == doc_type['key']:
                    history_docs.append(doc)

        criteria = {}
        if marker_doc is not None:
            if ElasticSettings.idx('CRITERIA') is not None:
                criteria = views.get_criteria([marker_doc], 'marker', 'id', 'MARKER')
            marker_doc.marker_build = _get_marker_build(ElasticSettings.idx('MARKER'))

        context = {
            'marker': marker_doc,
            'old_dbsnp_docs': _get_old_dbsnps(marker),
            'ic': ic_docs,
            'history': history_docs,
            'criteria': criteria
        }
        return render(request, 'marker/marker.html', context, content_type='text/html')
    elif res.hits_total == 0:
        messages.error(request, 'Marker '+marker+' not found.')
    raise Http404()
def test_gene_history_loader(self):
    """ Test the gene history loading. """
    call_command("pipeline", "--steps", "load", sections="GENE_HISTORY",
                 dir=TEST_DATA_DIR, ini=MY_INI_FILE)

    ini_config = IniParser().read_ini(MY_INI_FILE)
    idx = ini_config["GENE_HISTORY"]["index"]
    idx_type = ini_config["GENE_HISTORY"]["index_type"]

    elastic = Search(idx=idx, idx_type=idx_type)
    Search.index_refresh(idx)
    self.assertTrue(elastic.get_count()["count"] > 1, "Count documents in the index")

    # compare the expected mapping with the one the loader created
    expected_props = Gene.gene_history_mapping(idx, idx_type, test_mode=True).mapping_properties
    actual_props = elastic.get_mapping()
    if idx not in actual_props:
        logger.error("MAPPING ERROR: " + json.dumps(actual_props))
    self._cmpMappings(actual_props[idx]["mappings"], expected_props, idx_type)
def region_page(request, region):
    ''' Renders a region page for one or more (comma-separated) region ids. '''
    if region is None:
        messages.error(request, 'No region given.')
        raise Http404()
    res = Search(ElasticQuery(Query.ids(region.split(','))),
                 idx=ElasticSettings.idx('REGION', 'REGION'), size=5).search()
    if 0 < res.hits_total < 9:
        title = ', '.join(getattr(doc, 'region_name') for doc in res.docs)
        padded_docs = [Region.pad_region_doc(doc) for doc in res.docs]
        return render(request, 'region/index.html',
                      {'features': padded_docs, 'title': title},
                      content_type='text/html')
    if res.hits_total == 0:
        messages.error(request, 'Region(s) '+region+' not found.')
    raise Http404()
def disease_page(request, disease):
    ''' Renders a disease page for one or more (comma-separated) disease codes. '''
    if disease is None:
        # None check must come before lower(); previously lower() was called
        # first and raised AttributeError instead of a clean 404
        messages.error(request, 'No disease given.')
        raise Http404()
    disease = disease.lower()
    # terms takes the list of codes directly (was double-wrapped: [split()])
    query = ElasticQuery(Query.terms("code", disease.split(',')))
    elastic = Search(query, idx=ElasticSettings.idx('DISEASE', 'DISEASE'), size=5)
    res = elastic.search()
    if res.hits_total == 0:
        messages.error(request, 'Disease(s) '+disease+' not found.')
    elif res.hits_total < 9:
        names = ', '.join([getattr(doc, 'name') for doc in res.docs])
        context = {'features': res.docs, 'title': names}
        return render(request, 'disease/index.html', context, content_type='text/html')
    raise Http404()
def _get_old_dbsnps(marker):
    ''' Get markers from old versions of DBSNP. Assumes the index key is
    prefixed by 'MARKER_'. Returns the first matching doc from each old
    index, newest index first. '''
    idx_names = sorted((ElasticSettings.idx(k) for k in ElasticSettings.getattr('IDX').keys()
                        if 'MARKER_' in k), reverse=True)
    old_docs = []
    if idx_names:
        query = ElasticQuery(Query.query_string(marker, fields=['id', 'rscurrent']))
        for idx_name in idx_names:
            docs = Search(search_query=query, idx=idx_name, idx_type='marker').search().docs
            if docs:
                doc = docs[0]
                doc.marker_build = _get_marker_build(idx_name)
                old_docs.append(doc)
    return old_docs