def _find_snp_position(snp_track, name): if snp_track is None: query = ElasticQuery.query_match("id", name) elastic = Search(query, idx=ElasticSettings.idx('MARKER')) snpResult = elastic.get_json_response() if(len(snpResult['hits']['hits'])) > 0: snp = snpResult['hits']['hits'][0]['_source'] chrom = snp['seqid'].replace('chr', "") position = snp['start'] return {'chr': chrom, 'start': (position-1), 'end': position, 'name': name} else: mo = re.match(r"(.*)-(.*)", snp_track) (group, track) = mo.group(1, 2) try: snp_track_idx = ElasticSettings.idx('CP_STATS_'+group.upper(), snp_track.upper()) except SettingsError: snp_track_idx = ElasticSettings.idx('CP_STATS_'+group.upper())+"/"+track query = ElasticQuery.query_match("name", name) elastic = Search(query, idx=snp_track_idx) snpResult = elastic.get_json_response() if(len(snpResult['hits']['hits'])) > 0: snp = snpResult['hits']['hits'][0]['_source'] chrom = snp['seqid'].replace('chr', "") position = snp['start'] return {'chr': chrom, 'start': (position-1), 'end': position, 'name': name} return {'error': 'Marker '+name+' does not exist in the currently selected dataset'}
def test_sort_query(self):
    ''' Check result ordering for a string sort spec and for a dict sort spec. '''
    match_all = ElasticQuery(Query.match_all())

    # Sort expressed as a comma separated string.
    by_string = Search(match_all, idx=ElasticSettings.idx('DEFAULT'),
                       qsort=Sort('start:asc,_score'))
    result = by_string.search()
    self._check_sort_order(result.docs)

    # Sort expressed as a dictionary.
    sort_spec = {"sort": [{"start": {"order": "asc", "mode": "avg"}}]}
    dict_sort = Sort(sort_spec)
    by_dict = Search(match_all, idx=ElasticSettings.idx('DEFAULT'), qsort=dict_sort)
    result = by_dict.search()
    self._check_sort_order(result.docs)

    # Constructing a Sort from an int is expected to raise QueryError.
    with self.assertRaises(QueryError):
        Sort(1)
def test_term_query(self):
    ''' Build and run term queries, without and with a boost. '''
    # Term query on the id field: exactly one document expected.
    id_term = Query.term("id", "rs2476601")
    id_search = Search(ElasticQuery(id_term), idx=ElasticSettings.idx('DEFAULT'))
    id_docs = id_search.search().docs
    self.assertTrue(len(id_docs) == 1, "Elastic string query retrieved marker (rs2476601)")

    # Boosted term query on seqid: more than one document expected.
    seqid_term = Query.term("seqid", "1", boost=3.0)
    seqid_search = Search(ElasticQuery(seqid_term), idx=ElasticSettings.idx('DEFAULT'))
    seqid_docs = seqid_search.search().docs
    self.assertTrue(len(seqid_docs) > 1, "Elastic string query retrieved markers on chr1")
def test_query_ids(self):
    ''' Run ids queries: a list of ids, then a single id restricted by type. '''
    # Two ids given as a list.
    both_ids = ElasticQuery(Query.ids(['1', '2']))
    found = Search(both_ids, idx=ElasticSettings.idx('DEFAULT'), size=5).search().docs
    self.assertTrue(len(found) == 2, "Elastic string query retrieved marker (rs*)")

    # Re-use the type of the first returned document to restrict the second query.
    doc_type = found[0].type()
    one_id = ElasticQuery(Query.ids('2', types=doc_type))
    found = Search(one_id, idx=ElasticSettings.idx('DEFAULT'), size=5).search().docs
    self.assertTrue(len(found) == 1, "Elastic string query retrieved marker (rs*)")
def test_update_doc(self):
    ''' Update a document with a partial document and check the stored values. '''
    idx = IDX['MARKER']['indexName']

    # Fetch the document to update; only the id source field is requested.
    docs = Search(ElasticQuery(Query.term("id", "rs2476601"), sources=['id']), idx=idx).search().docs
    self.assertEqual(len(docs), 1, "rs2476601 document")

    update_field = {"doc": {"start": 100, "end": 200}}
    Update.update_doc(docs[0], update_field)
    # Refresh the index before searching again for the updated document.
    Search.index_refresh(idx)

    docs = Search(ElasticQuery(Query.term("id", "rs2476601")), idx=idx).search().docs
    self.assertEqual(len(docs), 1, "rs2476601 document")
    self.assertEqual(getattr(docs[0], 'start'), 100, "rs2476601 start")
    self.assertEqual(getattr(docs[0], 'end'), 200, "rs2476601 end")
def test_term(self):
    ''' Terms aggregation, on its own and combined with an ids query. '''
    agg_name = "test"
    terms_agg = Agg(agg_name, "terms", {"field": "seqid", "size": 0})
    aggs = Aggs(terms_agg)

    # Aggregation only.
    agg_only = Search(aggs=aggs, idx=ElasticSettings.idx('DEFAULT'))
    r_aggs = agg_only.search().aggs
    self.assertTrue(agg_name in r_aggs, "returned test aggregations")

    # Ids query with the same terms aggregation.
    ids_query = ElasticQuery(Query.ids(['1', '2']))
    with_query = Search(search_query=ids_query, aggs=aggs,
                        idx=ElasticSettings.idx('DEFAULT'), size=5)
    r_aggs = with_query.search().aggs
    self.assertTrue(len(r_aggs[agg_name].get_buckets()) > 0, "returned test aggregation buckets")
    first_bucket = getattr(r_aggs[agg_name], 'buckets')[0]
    self.assertTrue(first_bucket['doc_count'] >= 0, "bucket document count")
def test_filter(self):
    ''' Filter aggregation combined with metric aggregations on the start field. '''
    agg = [Agg('test_filter', 'filter', RangeQuery('start', gt='25000')),
           Agg('avg_start', 'avg', {"field": 'start'}),
           Agg('min_start', 'min', {"field": 'start'}),
           Agg('sum_start', 'sum', {"field": 'start'}),
           Agg('stats_start', 'stats', {"field": 'start'}),
           Agg('count_start', 'value_count', {"field": 'start'}),
           Agg('ext_stats_start', 'extended_stats', {"field": 'start'})]
    aggs = Aggs(agg)
    search = Search(aggs=aggs, idx=ElasticSettings.idx('DEFAULT'))

    r_aggs = search.search().aggs
    self.assertTrue('avg_start' in r_aggs, "returned avg aggregation")
    self.assertTrue('min_start' in r_aggs, "returned min aggregation")

    # The stats aggregation result must expose each of these as attributes.
    stats_keys = ["min", "max", "sum", "count", "avg"]
    self.assertTrue(all(hasattr(r_aggs['stats_start'], k) for k in stats_keys),
                    "returned stats aggregation fields")

    # The extended stats result must expose the basic keys plus these extra ones.
    stats_keys.extend(["sum_of_squares", "variance", "std_deviation", "std_deviation_bounds"])
    self.assertTrue(all(hasattr(r_aggs['ext_stats_start'], k) for k in stats_keys),
                    "returned extended stats aggregation fields")
def show_es_gene_section(gene_symbol=None, seqid=None, start_pos=None, end_pos=None):
    ''' Build the context for the gene section (documented as a template inclusion tag).

    Searches the GENE index by gene symbol when one is given, otherwise by a
    single position (end_pos is None) or by a start/end region, and returns
    the matching documents under the 'es_genes' key.
    '''
    # Anything that is not already a "chr"-prefixed string gets the prefix added.
    already_prefixed = isinstance(seqid, str) and seqid.startswith("chr")
    if not already_prefixed:
        seqid = 'chr' + str(seqid)

    if gene_symbol is not None:
        # gene symbol query
        query = ElasticQuery.query_match("gene_symbol", gene_symbol)
    else:
        seqid_match = Query.match("seqid", seqid)
        if end_pos is None:
            # single position: features starting at or before it and ending at or after it
            start_range = RangeQuery("featureloc.start", lte=start_pos)
            end_range = RangeQuery("featureloc.end", gte=start_pos)
        else:
            # region: features lying within start_pos..end_pos
            start_range = RangeQuery("featureloc.start", gte=start_pos)
            end_range = RangeQuery("featureloc.end", lte=end_pos)
        query = ElasticQuery.bool(BoolQuery(must_arr=[seqid_match, start_range, end_range]))

    gene_search = Search(query, idx=ElasticSettings.idx(name='GENE'))
    return {'es_genes': gene_search.search().docs}
def test_missing_terms_filtered_query(self):
    ''' Run a match-all query filtered by a missing terms filter. '''
    missing_filter = TermsFilter.get_missing_terms_filter("field", "group_name")
    filtered = ElasticQuery.filtered(Query.match_all(), missing_filter)
    search = Search(filtered, idx=ElasticSettings.idx('DEFAULT'))
    found = search.search().docs
    self.assertTrue(len(found) == 3, "Elastic string query retrieved all public docs")
def setUp(self):
    ''' Create the request factory and fill utils.tissues for each target index. '''
    # Every test needs access to the request factory.
    self.factory = RequestFactory()

    # Read the tissue types from the gene_target mapping metadata of each index.
    for target_idx in getattr(chicp_settings, 'TARGET_IDXS'):
        mapping = Search(idx=target_idx).get_mapping(mapping_type="gene_target")
        meta = mapping[target_idx]['mappings']['gene_target']['_meta']
        utils.tissues[target_idx] = list(meta['tissue_type'].keys())
def test_significant_terms(self):
    ''' Significant terms aggregation on the start field. '''
    agg_name = 'test_significant_terms'
    sig_terms = Agg("test_significant_terms", "significant_terms", {"field": "start"})
    search = Search(aggs=Aggs(sig_terms), idx=ElasticSettings.idx('DEFAULT'))
    returned = search.search().aggs
    self.assertTrue(agg_name in returned, "returned aggregations")
def filter_queryset(self, request, queryset, view):
    ''' Fetch only the requested page of documents from elastic.

    The page size and offset come from the view's paginator, and GET
    parameters named in the view's filter_fields are turned into filters.
    Sets view.es_count to the total hit count and returns a list of
    ElasticObject instances; the queryset argument is not used.
    '''
    q_size = view.paginator.get_limit(request)
    q_from = view.paginator.get_offset(request)

    # Keep only the GET parameters the view declares as filterable.
    allowed = getattr(view, 'filter_fields', [])
    requested = {key: value for key, value in request.GET.items() if key in allowed}
    search_filters = self._build_filters(filters=requested)

    if search_filters is None:
        q = ElasticQuery(Query.match_all())
    else:
        q = ElasticQuery.filtered(Query.match_all(), search_filters)

    s = Search(search_query=q, idx=getattr(view, 'idx'), size=q_size, search_from=q_from)
    response = s.get_json_response()

    results = []
    for hit in response['hits']['hits']:
        obj = ElasticObject(initial=hit['_source'])
        obj.uuid = hit['_id']
        results.append(obj)

    view.es_count = response['hits']['total']
    return results
def test_bulk(self):
    ''' Test Bulk.load() with a missing trailing newline, a valid load and failing actions. '''
    self.set_up()
    idx = IDX['MARKER']['indexName']
    elastic = Search(ElasticQuery(Query.match_all()), idx=idx)
    hits_total1 = elastic.get_count()['count']

    json_data = '{"index": {"_index": "%s", "_type": "%s"}}\n' % \
        (idx, 'marker')
    json_data += json.dumps({"alt": "G", "start": 946, "seqid": "1", "filter": ".",
                             "ref": "A", "id": "rsXXXXX", "qual": ".", "info": "RS=XXXXX"})

    # Without a trailing newline the load is expected not to return status 200.
    resp = Bulk.load(idx, '', json_data)
    self.assertNotEqual(resp.status_code, 200)

    # note: needs a trailing line return to work
    Bulk.load(idx, '', json_data + '\n')
    Search.index_refresh(idx)
    hits_total2 = elastic.get_count()['count']
    self.assertEqual(hits_total2, hits_total1+1, "contains documents")

    # produce errors updating doc id that doesn't exist
    json_data += '{"delete": {"_index": "%s", "_type": "%s", "_id": "%s"}}\n' % \
        (idx, 'marker', 'XYZ')
    json_data += '{"update": {"_index": "%s", "_type": "%s", "_id": "%s"}}\n' % \
        (idx, 'marker', 'XYZ')
    json_data += '{"doc": {"start": 100, "end": 200}}\n'
    resp = Bulk.load(idx, '', json_data)
    self.assertTrue('errors' in resp.json() and resp.json()['errors'])
def test_string_query(self):
    ''' Build and run a string query, and check an unknown keyword is rejected. '''
    string_query = ElasticQuery.query_string("rs2476601", fields=["id"])
    search = Search(string_query, idx=ElasticSettings.idx('DEFAULT'))
    result = search.search()
    self.assertTrue(len(result.docs) == 1, "Elastic string query retrieved marker (rs2476601)")

    # A misspelt keyword argument is expected to raise QueryError.
    with self.assertRaises(QueryError):
        ElasticQuery.query_string("rs2476601", fieldssss=["id"])
def show_es_gene_section(gene_symbol=None, seqid=None, start_pos=None, end_pos=None):
    ''' Build the context for the gene section (documented as a template inclusion tag).

    Searches the GENE index by symbol when gene_symbol is given, otherwise by
    a single position (end_pos is None) or by a start/end region, and returns
    the matching documents under the 'es_genes' key.
    '''
    # The chromosome is matched without any 'chr' text in it.
    seqid = str(seqid).replace('chr', '')

    if gene_symbol is not None:
        # gene symbol query
        query = ElasticQuery.query_match("symbol", gene_symbol)
    else:
        chrom_match = Query.match("chromosome", seqid)
        if end_pos is None:
            # single position: genes starting at or before it and stopping at or after it
            start_range = RangeQuery("start", lte=start_pos)
            stop_range = RangeQuery("stop", gte=start_pos)
        else:
            # region: genes lying within start_pos..end_pos
            start_range = RangeQuery("start", gte=start_pos)
            stop_range = RangeQuery("stop", lte=end_pos)
        query = ElasticQuery.bool(BoolQuery(must_arr=[chrom_match, start_range, stop_range]))

    gene_search = Search(query, idx=ElasticSettings.idx(name='GENE'))
    return {'es_genes': gene_search.search().docs}
def test_top_hits_sub_agg(self):
    ''' Terms aggregation on _index with a top_hits sub-aggregation. '''
    top_hit = Agg('idx_top_hits', 'top_hits', {"size": 1})
    by_index = Agg("idxs", "terms", {"field": "_index"}, sub_agg=top_hit)
    by_type = Agg("categories", "terms", {"field": "_type", "size": 0})

    search = Search(aggs=Aggs([by_index, by_type]), idx=ElasticSettings.idx('DEFAULT'))
    buckets = search.search().aggs['idxs'].get_docs_in_buckets()

    # The bucket keyed by the default index: three documents counted, one top hit kept.
    default_bucket = buckets[ElasticSettings.idx('DEFAULT')]
    self.assertEqual(default_bucket['doc_count'], 3)
    self.assertEqual(len(default_bucket['docs']), 1)
def test_top_hits(self):
    ''' Top hits aggregation requested alongside a filter aggregation. '''
    range_filter = Agg('test_filter', 'filter', RangeQuery('start', gt='2000'))
    top_hits = Agg('test_top_hits', 'top_hits', {"size": 1})
    search = Search(aggs=Aggs([range_filter, top_hits]), idx=ElasticSettings.idx('DEFAULT'))

    returned = search.search().aggs
    hits = returned['test_top_hits'].get_hits()
    self.assertTrue(len(hits) == 1, "returned the top hit")
def test_missing(self):
    ''' Missing aggregation on the seqid field. '''
    missing_agg = Agg("test_missing", "missing", {"field": "seqid"})
    search = Search(aggs=Aggs(missing_agg), idx=ElasticSettings.idx('DEFAULT'))
    returned = search.search().aggs
    missing_count = getattr(returned['test_missing'], 'doc_count')
    self.assertTrue(missing_count == 0, "no missing seqid fields")
def test_terms_avg_order(self):
    ''' Terms aggregation ordered by an average sub-aggregation. '''
    agg_name = "test"
    avg_start = Agg('avg_start', 'avg', {"field": "start"})
    terms_body = {"field": "seqid", "size": 0, "order": {"avg_start": "desc"}}
    ordered_terms = Agg(agg_name, "terms", terms_body, sub_agg=avg_start)

    search = Search(aggs=Aggs(ordered_terms), idx=ElasticSettings.idx('DEFAULT'))
    r_aggs = search.search().aggs
    self.assertTrue(agg_name in r_aggs, "returned test aggregations")

    first_bucket = r_aggs['test'].get_buckets()[0]
    self.assertGreater(first_bucket['doc_count'], 1)
def test_bool_filtered_query(self):
    ''' Build and run a match-all query filtered by a boolean query. '''
    excluded = [Query.term("seqid", 2)]
    optional = [RangeQuery("start", gte=10050)]
    query_bool = BoolQuery(must_not_arr=excluded, should_arr=optional)
    # Add further clauses through the chained must()/should() calls.
    (query_bool.must([Query.term("id", "rs768019142")])
               .should(RangeQuery("start", gte=10054)))

    filtered = ElasticQuery.filtered_bool(Query.match_all(), query_bool, sources=["id", "seqid"])
    search = Search(filtered, idx=ElasticSettings.idx('DEFAULT'))
    total = search.search().hits_total
    self.assertTrue(total == 1, "Elastic filtered query retrieved marker (rs768019142)")
def test_and_filtered_query(self):
    ''' Build and run a term query filtered by an AndFilter. '''
    start_range = BoolQuery(must_arr=[RangeQuery("start", gte=1)])
    and_filter = AndFilter(start_range)
    # Extend the filter with a range query and a term query via chained calls.
    (and_filter.extend(RangeQuery("start", gte=1))
               .extend(Query.term("seqid", 1)))

    filtered = ElasticQuery.filtered(Query.term("seqid", 1), and_filter)
    search = Search(filtered, idx=ElasticSettings.idx('DEFAULT'))
    total = search.search().hits_total
    self.assertTrue(total >= 1, "Elastic filtered query retrieved marker(s)")
def test_url_rotate(self):
    ''' Test the url rotates from http://xxx:9200 to correct url.

    ElasticUrl.URL_INDEX is reset to 0 in a finally block so that a failing
    assertion cannot leave the changed index behind for other tests.
    '''
    try:
        query = ElasticQuery.filtered(Query.term("seqid", 1),
                                      Filter(Query.term("id", "rs768019142")))
        elastic = Search(query, idx=ElasticSettings.idx('DEFAULT'))
        self.assertTrue(elastic.search().hits_total == 1, "Elastic filtered query retrieved marker")
        # NOTE(review): presumably exercises the url lookup for an index/type pair - confirm.
        Search.index_exists('test', 'test2')
    finally:
        ElasticUrl.URL_INDEX = 0  # reset
def test_get_rdm_feature_id(self):
    ''' Fetch a random feature id and check it resolves to one document. '''
    gff_settings = IDX['GFF_GENERIC']
    idx = gff_settings['indexName']
    idx_type = gff_settings['indexType']

    doc_id = ElasticUtils.get_rdm_feature_id(idx, idx_type)
    self.assertTrue(isinstance(doc_id, str), 'Document id')

    # The returned id must be retrievable with an ids query.
    ids_query = ElasticQuery(Query.ids(doc_id))
    found = Search(ids_query, idx=idx).search().docs
    self.assertTrue(len(found) == 1, 'Document retrieved')
def test_filters(self):
    ''' Filters aggregation with two named range filters on the start field. '''
    filters = {'filters': {'start_gt': RangeQuery('start', gt='1000'),
                           'start_lt': RangeQuery('start', lt='100000')}}
    agg = Agg('test_filters', 'filters', filters)
    aggs = Aggs(agg)
    search = Search(aggs=aggs, idx=ElasticSettings.idx('DEFAULT'))

    r_aggs = search.search().aggs
    # The named filter must appear among the returned buckets.
    self.assertTrue('start_lt' in r_aggs['test_filters'].get_buckets(),
                    "returned start_lt filter bucket")
def test_terms_query(self):
    ''' Build and run a terms query with highlighting on the id field. '''
    id_highlight = Highlight(["id"])
    terms = Query.terms("id", ["rs2476601", "rs768019142"])
    terms_query = ElasticQuery(terms, highlight=id_highlight)

    search = Search(terms_query, idx=ElasticSettings.idx('DEFAULT'))
    found = search.search().docs
    self.assertTrue(len(found) == 2,
                    "Elastic string query retrieved markers (rs2476601, rs768019142)")

    first = found[0]
    self.assertTrue(getattr(first, 'seqid'), "Hit attribute found")
    self.assertTrue(first.highlight() is not None, "highlighting found")
def test_mapping(self):
    ''' Check the mapping of the GFF_GENERIC index, in particular the seqid field. '''
    idx = IDX['GFF_GENERIC']['indexName']
    mapping_json = Search(idx=idx).get_mapping()

    self.assertFalse('error' in mapping_json, 'No error returned from mapping request.')
    idx_mapping = mapping_json[idx]
    self.assertTrue('mappings' in idx_mapping, 'Found mappings.')

    # seqid in the gff type must be indexed as not_analyzed.
    gff_properties = mapping_json[idx]['mappings']['gff']['properties']
    seqid = gff_properties['seqid']
    self.assertTrue('not_analyzed' == seqid['index'], 'seqid in GFF is not_analyzed')
def test_bool_filtered_query2(self):
    ''' Build and run a string query filtered by a boolean query built by chaining. '''
    query_bool = BoolQuery()
    # Two should clauses and one must clause, added through chained calls.
    (query_bool.should(RangeQuery("start", lte=20000))
               .should(Query.term("seqid", 2))
               .must(Query.term("seqid", 1)))

    string_query = Query.query_string("rs768019142", fields=["id", "seqid"])
    filtered = ElasticQuery.filtered_bool(string_query, query_bool,
                                          sources=["id", "seqid", "start"])
    search = Search(filtered, idx=ElasticSettings.idx('DEFAULT'))
    total = search.search().hits_total
    self.assertTrue(total == 1, "Elastic filtered query retrieved marker (rs768019142)")
def test_or_filtered_query(self):
    ''' Build and run a term query filtered by an OrFilter, with highlighting. '''
    highlight = Highlight(["id", "seqid"])
    spanning = BoolQuery(must_arr=[RangeQuery("start", lte=1), RangeQuery("end", gte=100000)])

    or_filter = OrFilter(RangeQuery("start", gte=1, lte=100000))
    # Extend the filter with the boolean query and a wrapped string query.
    (or_filter.extend(spanning)
              .extend(Query.query_string("rs*", fields=["id", "seqid"]).query_wrap()))

    filtered = ElasticQuery.filtered(Query.term("seqid", 1), or_filter, highlight=highlight)
    search = Search(filtered, idx=ElasticSettings.idx('DEFAULT'))
    total = search.search().hits_total
    self.assertTrue(total >= 1, "Elastic filtered query retrieved marker(s)")
def region_page(request, region):
    ''' Render the region page for a region id.

    Matches region against "attr.region_id" in the REGION index, uses the
    search result as the template context with the title set to "Region",
    and renders region/region.html.
    '''
    # Local import so this block does not depend on a module-level import.
    import logging

    query = ElasticQuery.query_match("attr.region_id", region)
    elastic = Search(query, idx=ElasticSettings.idx(name='REGION'))
    context = elastic.get_result()
    context['title'] = "Region"
    # Debug-level logging replaces the stray print() that wrote to stdout on every request.
    logging.getLogger(__name__).debug("region context: %s", context)
    return render(request, 'region/region.html', context,
                  content_type='text/html')
def test_range(self):
    ''' Range aggregation on the start field with two ranges. '''
    ranges = [{"to": 10000}, {"from": 10000, "to": 15000}]
    range_agg = Agg("test_range_agg", "range", {"field": "start", "ranges": ranges})
    search = Search(aggs=Aggs(range_agg), idx=ElasticSettings.idx('DEFAULT'))

    returned = search.search().aggs
    buckets = returned['test_range_agg'].get_buckets()
    self.assertTrue(len(buckets) == 2, "returned two buckets in range aggregations")