def test_term_query(self):
    """ Exercise term queries: a unique-id lookup and a boosted chromosome lookup. """
    # A term query on a unique marker id should match exactly one document.
    marker_query = ElasticQuery(Query.term("id", "rs2476601"))
    search = Search(marker_query, idx=ElasticSettings.idx('DEFAULT'))
    self.assertTrue(len(search.search().docs) == 1,
                    "Elastic string query retrieved marker (rs2476601)")
    # A boosted term query on a chromosome should match many documents.
    chr_query = ElasticQuery(Query.term("seqid", "1", boost=3.0))
    search = Search(chr_query, idx=ElasticSettings.idx('DEFAULT'))
    self.assertTrue(len(search.search().docs) > 1,
                    "Elastic string query retrieved markers on chr1")
def test_query_ids(self):
    """ Fetch documents by id, with and without an index type restriction. """
    search = Search(ElasticQuery(Query.ids(['1', '2'])),
                    idx=ElasticSettings.idx('DEFAULT'), size=5)
    docs = search.search().docs
    self.assertTrue(len(docs) == 2, "Elastic string query retrieved marker (rs*)")
    # Repeat the lookup restricted to the type of the first hit.
    idx_type = docs[0].type()
    search = Search(ElasticQuery(Query.ids('2', types=idx_type)),
                    idx=ElasticSettings.idx('DEFAULT'), size=5)
    docs = search.search().docs
    self.assertTrue(len(docs) == 1, "Elastic string query retrieved marker (rs*)")
def test_update_doc(self):
    """ Apply a partial-document update and confirm the new field values. """
    idx = IDX['MARKER']['indexName']
    docs = Search(ElasticQuery(Query.term("id", "rs2476601"), sources=['id']),
                  idx=idx).search().docs
    self.assertEquals(len(docs), 1, "rs2476601 document")
    # Partial update: overwrite just the start/end coordinates.
    Update.update_doc(docs[0], {"doc": {"start": 100, "end": 200}})
    Search.index_refresh(IDX['MARKER']['indexName'])
    docs = Search(ElasticQuery(Query.term("id", "rs2476601")),
                  idx=idx).search().docs
    self.assertEquals(len(docs), 1, "rs2476601 document")
    self.assertEquals(getattr(docs[0], 'start'), 100, "rs2476601 start")
    self.assertEquals(getattr(docs[0], 'end'), 200, "rs2476601 end")
def test_bulk(self):
    """ Exercise Bulk.load(): rejected payload without a trailing newline,
    a successful index action, and error reporting for a missing doc id. """
    self.set_up()
    idx = IDX['MARKER']['indexName']
    elastic = Search(ElasticQuery(Query.match_all()), idx=idx)
    hits_total1 = elastic.get_count()['count']
    action = '{"index": {"_index": "%s", "_type": "%s"}}\n' % (idx, 'marker')
    doc = json.dumps({"alt": "G", "start": 946, "seqid": "1", "filter": ".",
                      "ref": "A", "id": "rsXXXXX", "qual": ".",
                      "info": "RS=XXXXX"})
    json_data = action + doc
    resp = Bulk.load(idx, '', json_data)
    self.assertNotEquals(resp.status_code, 200)
    # note: needs a trailing line return to work
    Bulk.load(idx, '', json_data + '\n')
    Search.index_refresh(idx)
    hits_total2 = elastic.get_count()['count']
    self.assertEquals(hits_total2, hits_total1+1, "contains documents")
    # produce errors updating doc id that doesn't exist
    json_data += '{"delete": {"_index": "%s", "_type": "%s", "_id": "%s"}}\n' % \
        (idx, 'marker', 'XYZ')
    json_data += '{"update": {"_index": "%s", "_type": "%s", "_id": "%s"}}\n' % \
        (idx, 'marker', 'XYZ')
    json_data += '{"doc": {"start": 100, "end": 200}}\n'
    resp = Bulk.load(idx, '', json_data)
    self.assertTrue('errors' in resp.json() and resp.json()['errors'])
def filter_queryset(self, request, queryset, view):
    ''' Override this method to request just the documents required from
    elastic, honouring the view's pagination and filterable fields. '''
    size = view.paginator.get_limit(request)
    offset = view.paginator.get_offset(request)
    filterable = getattr(view, 'filter_fields', [])
    # Keep only the GET parameters the view declares as filterable.
    filters = {k: v for k, v in request.GET.items() if k in filterable}
    search_filters = self._build_filters(filters=filters)
    if search_filters is None:
        q = ElasticQuery(Query.match_all())
    else:
        q = ElasticQuery.filtered(Query.match_all(), search_filters)
    s = Search(search_query=q, idx=getattr(view, 'idx'), size=size,
               search_from=offset)
    json_results = s.get_json_response()
    results = []
    for hit in json_results['hits']['hits']:
        obj = ElasticObject(initial=hit['_source'])
        obj.uuid = hit['_id']
        results.append(obj)
    # record the total hit count on the view for the paginator
    view.es_count = json_results['hits']['total']
    return results
def test_sort_query(self):
    """ Sort results via a string spec and a dict spec; reject bad input. """
    query = ElasticQuery(Query.match_all())
    # Sort built from a "field:direction" string.
    elastic = Search(query, idx=ElasticSettings.idx('DEFAULT'),
                     qsort=Sort('start:asc,_score'))
    self._check_sort_order(elastic.search().docs)
    # Sort built from an explicit sort dictionary.
    dict_sort = Sort({"sort": [{"start": {"order": "asc", "mode": "avg"}}]})
    elastic = Search(query, idx=ElasticSettings.idx('DEFAULT'), qsort=dict_sort)
    self._check_sort_order(elastic.search().docs)
    # A non-str/dict sort spec raises a QueryError.
    self.assertRaises(QueryError, Sort, 1)
def test_get_rdm_feature_id(self):
    """ A randomly chosen feature id should identify a retrievable document. """
    idx = IDX['GFF_GENERIC']['indexName']
    idx_type = IDX['GFF_GENERIC']['indexType']
    doc_id = ElasticUtils.get_rdm_feature_id(idx, idx_type)
    self.assertTrue(isinstance(doc_id, str), 'Document id')
    hits = Search(ElasticQuery(Query.ids(doc_id)), idx=idx).search().docs
    self.assertTrue(len(hits) == 1, 'Document retrieved')
def test_terms_query(self):
    """ Run a terms query over two marker ids with highlighting enabled. """
    query = ElasticQuery(Query.terms("id", ["rs2476601", "rs768019142"]),
                         highlight=Highlight(["id"]))
    docs = Search(query, idx=ElasticSettings.idx('DEFAULT')).search().docs
    self.assertTrue(len(docs) == 2,
                    "Elastic string query retrieved markers (rs2476601, rs768019142)")
    self.assertTrue(getattr(docs[0], 'seqid'), "Hit attribute found")
    self.assertTrue(docs[0].highlight() is not None, "highlighting found")
def test_search_count(self):
    """ A filtered count must be smaller than the full index count. """
    idx = IDX['GFF_GENERIC']['indexName']
    idx_type = IDX['GFF_GENERIC']['indexType']
    total = ElasticUtils.get_docs_count(idx, idx_type)
    self.assertGreater(total, 0, 'index count')
    # Exclude chr1 docs; the remaining count must be strictly smaller.
    query = ElasticQuery(BoolQuery(must_not_arr=[Query.term('seqid', 'chr1')]))
    filtered = ElasticUtils.get_docs_count(idx, idx_type, search_query=query)
    self.assertGreater(total, filtered, 'search query count')
def test_function_score_query(self):
    ''' Test a function score query with a query (using the start position
    as the score). Reciprocal scoring means smaller starts rank first. '''
    score_fn = ScoreFunction.create_score_function('field_value_factor',
                                                   field='start',
                                                   modifier='reciprocal')
    qs = Query.query_string("rs*", fields=["id", "seqid"])
    query = ElasticQuery(FunctionScoreQuery(qs, [score_fn],
                                            boost_mode='replace'))
    docs = Search(query, idx=ElasticSettings.idx('DEFAULT')).search().docs
    self.assertGreater(len(docs), 1, str(len(docs)))
    # results must come back in strictly ascending start order
    previous = 0
    for doc in docs:
        current = getattr(doc, 'start')
        self.assertLess(previous, current)
        previous = current
def _get_current_build_info(self, seqid, position):
    ''' Get upper & lower boundaries for a hit given the position of the marker.'''
    hapmap_idx = ElasticSettings.idx('HAPMAP', 'HAPMAP')

    def _first_doc(range_query, order):
        # first hapmap doc on this sequence matching the range, sorted by `order`
        query = ElasticQuery(BoolQuery(must_arr=[range_query,
                                                 Query.match("seqid", seqid)]))
        return Search(query, idx=hapmap_idx, qsort=Sort(order),
                      size=1).search().docs[0]

    # genetic map position at (or just beyond) the marker position
    doc = _first_doc(RangeQuery("position", gte=position), 'position:asc')
    gmap_pos = getattr(doc, "genetic_map_position")
    # first base at least 0.1 cM above -> upper boundary
    doc = _first_doc(RangeQuery("genetic_map_position", gte=(gmap_pos + 0.1)),
                     'position:asc')
    start = int(getattr(doc, "position"))
    # last base at least 0.1 cM below -> lower boundary
    doc = _first_doc(RangeQuery("genetic_map_position", lte=(gmap_pos - 0.1)),
                     'position:desc')
    end = int(getattr(doc, "position"))
    return {'build': 38, 'seqid': seqid, 'start': start, 'end': end}
def get_object(self):
    """ Retrieve a single elastic document by its id, or raise Http404. """
    query = ElasticQuery(Query.ids(self.kwargs[self.lookup_field]))
    search = Search(search_query=query, idx=getattr(self, 'idx'))
    try:
        hit = search.get_json_response()['hits']['hits'][0]
        obj = ElasticObject(initial=hit['_source'])
        obj.uuid = hit['_id']
        # May raise a permission denied
        self.check_object_permissions(self.request, obj)
        return obj
    except (TypeError, ValueError, IndexError):
        raise Http404
def test_function_score_filter(self):
    ''' Test a function score query with a filter: the score is the start
    position, so docs arrive in descending start order. '''
    score_fn = ScoreFunction.create_score_function('field_value_factor',
                                                   field='start')
    bool_filter = Filter(BoolQuery(must_arr=[RangeQuery("start", lte=50000)]))
    query = ElasticQuery(FunctionScoreQuery(bool_filter, [score_fn],
                                            boost_mode='replace'))
    docs = Search(query, idx=ElasticSettings.idx('DEFAULT')).search().docs
    self.assertGreater(len(docs), 1, str(len(docs)))
    previous = sys.maxsize
    for doc in docs:
        current = getattr(doc, 'start')
        # test that the start is equal to the score
        self.assertEqual(current, int(doc.__dict__['_meta']['_score']))
        self.assertGreater(previous, current)
        previous = current
def test_term(self):
    ''' Terms Aggregation '''
    agg_name = "test"
    aggs = Aggs(Agg(agg_name, "terms", {"field": "seqid", "size": 0}))
    search = Search(aggs=aggs, idx=ElasticSettings.idx('DEFAULT'))
    r_aggs = search.search().aggs
    self.assertTrue(agg_name in r_aggs, "returned test aggregations")
    # Ids query combined with the same terms aggregation.
    query = ElasticQuery(Query.ids(['1', '2']))
    search = Search(search_query=query, aggs=aggs,
                    idx=ElasticSettings.idx('DEFAULT'), size=5)
    r_aggs = search.search().aggs
    self.assertTrue(len(r_aggs[agg_name].get_buckets()) > 0,
                    "returned test aggregation buckets")
    self.assertTrue(getattr(r_aggs[agg_name], 'buckets')[0]['doc_count'] >= 0,
                    "bucket document count")
def _get_chr_band(self, seqid, position):
    ''' Get chr band for a given chr/position '''
    # Hard-coded MHC region on chromosome 6.
    # NOTE(review): the == 6 comparison assumes an integer seqid — confirm
    # callers never pass the chromosome as a string.
    if seqid == 6 and 24891793 <= position <= 34924245:
        return 'MHC'
    query = ElasticQuery(BoolQuery(must_arr=[
        Query.match("seqid", seqid),
        RangeQuery("start", lte=position),
        RangeQuery("stop", gte=position)
    ]))
    result = Search(query, idx=ElasticSettings.idx('BAND', 'BAND'),
                    size=1).search()
    band_doc = result.docs[0]
    # e.g. "4" + "q27" -> "4q27"
    return getattr(band_doc, "seqid") + getattr(band_doc, "name")
def test_function_score_query2(self):
    ''' Test multiple function score query with a query. '''
    fn1 = ScoreFunction.create_score_function('field_value_factor', field='start')
    fn2 = ScoreFunction.create_score_function('field_value_factor', field='start')
    qs = Query.query_string("rs*", fields=["id"])
    # Two identical score functions summed; their score replaces the
    # query score, so higher starts rank first.
    fsq = FunctionScoreQuery(qs, [fn1, fn2], score_mode='sum',
                             boost_mode='replace', min_score=1.,
                             max_boost=100000000.)
    query = ElasticQuery(fsq, sources=['start'])
    docs = Search(query, idx=ElasticSettings.idx('DEFAULT')).search().docs
    self.assertGreater(len(docs), 1, str(len(docs)))
    previous = sys.maxsize
    for doc in docs:
        current = getattr(doc, 'start')
        self.assertGreater(previous, current)
        previous = current
def test_delete_docs_by_query(self):
    ''' Test deleting docs using a query. '''
    self.set_up()
    idx = IDX['MARKER']['indexName']
    elastic = Search(ElasticQuery(Query.match_all()), idx=idx)
    initial_count = elastic.get_count()['count']
    self.assertGreater(initial_count, 0, "contains documents")
    # delete single doc
    Delete.docs_by_query(idx, query=Query.term("id", "rs2476601"))
    Search.index_refresh(idx)
    self.assertEquals(elastic.get_count()['count'], initial_count-1,
                      "contains documents")
    # delete remaining docs
    Delete.docs_by_query(idx, 'marker')
    Search.index_refresh(idx)
    self.assertEquals(elastic.get_count()['count'], 0, "contains no documents")
def get_rdm_docs(cls, idx, idx_type, qbool=None, sources=None, size=1):
    ''' Get a random doc from the indices.

    @param idx: index name to search
    @param idx_type: index document type
    @param qbool: query to randomise over; defaults to a match-all
    @param sources: document source fields to return; defaults to all
    @param size: number of random documents to return

    Fixed: the previous signature used call-time defaults
    (``qbool=Query.match_all()``, ``sources=[]``) that are built once at
    definition time and shared across every call; both are now created
    per call via ``None`` sentinels, which is behaviourally equivalent
    for callers.
    '''
    if qbool is None:
        qbool = Query.match_all()
    if sources is None:
        sources = []
    # Random scoring (new seed per call) replaces the relevance score.
    random_score = ScoreFunction.create_score_function(
        'random_score', seed=random.randint(0, 1000000))
    search_query = ElasticQuery(FunctionScoreQuery(qbool, [random_score],
                                                   boost_mode='replace'),
                                sources=sources)
    elastic = Search(search_query=search_query, size=size, idx=idx,
                     idx_type=idx_type)
    try:
        return elastic.search().docs
    except IndexError:
        # Transient failure: retry. NOTE(review): recursion is unbounded
        # if the search consistently errors — confirm this is acceptable.
        return cls.get_rdm_docs(idx, idx_type, qbool, sources, size)
def test_nested_query(self):
    ''' Test nested query with aggregations. '''
    # a nested query requires a Query instance, not a bare string
    self.assertRaises(QueryError, Query.nested, 'build_info', 'xxxx')
    qnested = ElasticQuery(Query.nested('build_info',
                                        Query.term("build_info.build", "38")))
    # aggregation tree: nested build_info -> seqid buckets -> diseases
    diseases_by_seqid = Agg('diseases_by_seqid', 'terms',
                            {"size": 0, "field": "disease"})
    disease_hits = Agg('disease_hits', 'reverse_nested', {},
                       sub_agg=diseases_by_seqid)
    seq_hits = Agg('seq_hits', 'terms',
                   {'field': 'build_info.seqid', 'size': 0},
                   sub_agg=disease_hits)
    build_info = Agg('build_info', 'nested', {"path": 'build_info'},
                     sub_agg=[seq_hits])
    elastic = Search(qnested, idx=IDX['JSON_NESTED']['indexName'],
                     aggs=Aggs(build_info))
    res = elastic.search()
    # returns just build 38 hits
    self.assertEqual(len(res.docs), 2)
    seq_buckets = getattr(res.aggs['build_info'], 'seq_hits')['buckets']
    # two seq ids
    self.assertEqual(len(seq_buckets), 2)
    for seq in seq_buckets:
        # one disease found on the sequence
        self.assertEqual(
            len(seq['disease_hits']['diseases_by_seqid']['buckets']), 1)
def add_study_data(self, **options):
    ''' add gwas stats from a study

    Reads the study CSV (column layout commented below), resolves each
    marker (falling back to the marker history index for renamed rs ids)
    and disease, validates alleles/positions, and indexes one study-hit
    document per row. Rows that fail validation are skipped and reported
    in the summary message printed at the end.

    Fixes:
    - a row whose marker could not be resolved previously fell through to
      ``result.docs[0]`` and raised IndexError; it is now skipped.
    - the strand comparison ``strand > 0`` compared str to int (TypeError
      on Python 3); the numeric value is now compared.
    '''
    study = options['study_id']
    file = options['addStudyData']
    message = ""
    print("Deleting study hits for " + study)
    Delete.docs_by_query(ElasticSettings.idx('REGION', 'STUDY_HITS'),
                         query=Query.term("dil_study_id", study))
    with open(file, newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='|')
        for row in reader:
            if row[0] == 'Marker':  # header row
                continue
            # 0 - Marker
            # 1 - disease
            # 2 - Chromosome
            # 3 - Region Start
            # 4 - Region End
            # 5 - Position
            # 6 - Strand
            # 7 - Major Allele
            # 8 - Minor allele
            # 9 - Minor allele frequency
            # 10 - Discovery P value
            # 11 - Discovery Odds ratio
            # 12 - Discovery 95% confidence interval lower limit
            # 13 - Discovery 95% confidence interval upper limit
            # 14 - Replication P value
            # 15 - Replication Odds ratio
            # 16 - Replication 95% confidence interval lower limit
            # 17 - Replication 95% confidence interval upper limit
            # 18 - Combined P value
            # 19 - Combined Odds ratio
            # 20 - Combined 95% confidence interval lower limit
            # 21 - Combined 95% confidence interval upper limit
            # 22 - PP Colocalisation
            # 23 - Gene
            # 24 - PubMed ID
            # 25 - Other Signal
            # 26 - Notes
            # 27 - Curation status/ failed quality control

            # resolve the marker; fall back to the history index for
            # markers whose rs id has been superseded
            query = ElasticQuery(Query.match("id", row[0]))
            result = Search(search_query=query,
                            idx=ElasticSettings.idx('MARKER',
                                                    'MARKER')).search()
            if result.hits_total == 0:
                result2 = Search(
                    search_query=ElasticQuery(Query.match("rshigh", row[0])),
                    idx=ElasticSettings.idx('MARKER', 'HISTORY')).search()
                if result2.hits_total > 0:
                    history_doc = result2.docs[0]
                    new_id = getattr(history_doc, "rscurrent")
                    query = ElasticQuery(Query.match("id", new_id))
                    result = Search(search_query=query,
                                    idx=ElasticSettings.idx(
                                        'MARKER', 'MARKER')).search()
            if result.hits_total != 1:
                message += "ERROR loading row of gwas data for " + row[
                    0] + " - Marker cannot be found; <br />\n"
                # FIX: skip the row; previously this fell through and
                # raised IndexError on result.docs[0]
                continue
            marker = result.docs[0]

            # resolve the disease code
            query = ElasticQuery(Query.match("code", row[1]))
            result = Search(query,
                            idx=ElasticSettings.idx('DISEASE',
                                                    'DISEASE')).search()
            if result.hits_total != 1:
                message += "ERROR loading row of gwas data for " + row[
                    0] + " - Disease cannot be found; <br />\n"
                continue
            disease = result.docs[0]

            # validate alleles and MAF
            if not re.match(r"^\w$", row[7]):
                message += "ERROR loading row of gwas data for " + row[
                    0] + " - Major allele is not set; <br />\n"
                continue
            if not re.match(r"^\w$", row[8]):
                message += "ERROR loading row of gwas data for " + row[
                    0] + " - Minor allele is not set; <br />\n"
                continue
            if float(row[9]) > 0.5:
                message += "WARNING - MAF for " + row[
                    0] + " is >0.5; <br />\n"

            # normalise a numeric strand to '+'/'-'
            strand = row[6]
            if re.match(r"\d", strand):
                # FIX: compare the numeric value (str > int raised
                # TypeError on Python 3)
                strand = '+' if int(strand) > 0 else '-'
            row[6] = strand

            # fall back to the marker's coordinates when missing, and
            # always trust the marker's position over the spreadsheet
            if not re.match(r"\d+", row[2]):
                row[2] = getattr(marker, "seqid")
            if not re.match(r"\d+", row[5]):
                row[5] = getattr(marker, "start")
            if not row[5] == getattr(marker, "start"):
                row[5] = getattr(marker, "start")

            data = {
                "chr_band": self._get_chr_band(row[2], row[5]),
                "other_signal": row[25],
                "species": "Human",
                "disease": getattr(disease, "code"),
                "notes": row[26],
                "disease_locus": "TBC",
                "dil_study_id": study,
                "marker": getattr(marker, "id"),
                "status": "N",
                "pp_probability": row[22],
                "tier": 100,
                "pmid": row[24],
                "genes": self._get_ens_gene(row[23])
            }
            data['build_info'] = [self._get_current_build_info(row[2], row[5])]
            data['p_values'] = {
                'discovery': row[10],
                'replication': row[14],
                'combined': row[18]
            }
            data['odds_ratios'] = {
                'discovery': {"or": row[11], "lower": row[12],
                              "upper": row[13]},
                'replication': {"or": row[15], "lower": row[16],
                                "upper": row[17]},
                'combined': {"or": row[19], "lower": row[20],
                             "upper": row[21]}
            }
            data['alleles'] = {
                'major': row[7],
                'minor': row[8],
                'maf': row[9]
            }
            data['suggest'] = {'input': [], 'weight': 1}
            r = Search.elastic_request(
                ElasticSettings.url(),
                ElasticSettings.idx('REGION', 'STUDY_HITS'),
                json.dumps(data))
            if r.status_code != 201:
                message += "ERROR loading row of gwas data for " + row[
                    0] + " - Failed to create document; <br />\n"
    print("\n\n" + message)
def test_count_with_query(self):
    ''' Test count the number of documents returned by a query. '''
    query = ElasticQuery(Query.term("id", "rs768019142"))
    count = Search(query, idx=ElasticSettings.idx('DEFAULT')).get_count()['count']
    self.assertTrue(count == 1, "Elastic count with a query")
def _get_ens_gene(self, gene_list):
    ''' Resolve a "__"-separated gene list to gene document ids. '''
    genes = re.sub("__", " ", gene_list)
    result = Search(ElasticQuery(Query.query_string(genes)),
                    idx=ElasticSettings.idx('GENE', 'GENE')).search()
    return [doc.doc_id() for doc in result.docs]
def test_elastic_group_name(self):
    '''
    Testing the workflow defined in: https://killin.cimr.cam.ac.uk/nextgensite/2015/08/05/region-authorization/
    Testing various elastic queries

    idx doc:
    "_source":{"attr": {"region_id": "803", "group_name": "[\"DIL\"]", "Name": "4q27"},
    "seqid": "chr4", "source": "immunobase", "type": "region",
    "score": ".", "strand": ".", "phase": ".", "start": 122061159, "end": 122684373}

    idx_query:
    Private(in given group) OR Public
    -d '{"query":{"filtered":{"filter":{"bool": {
        "should": [
            {"terms": {"group_name":["dil"]}},
            { "missing": { "field": "group_name" }}
        ]
    }}}}}'
    Private(in given group):
    -d '{"query":{"filtered":{"filter":{"terms":{"group_name":["dil"]}}}}}'
    Public:
    -d {'query': {'filtered': {'filter': {'missing': {'field': 'group_name'}},
    - 'query': {'term': {'match_all': '{}'}}}}}
    '''
    # get the groups for the given user
    response = self.client.post('/accounts/login/',
                                {'username': '******',
                                 'password': '******'})
    self.assertTrue(response.status_code, "200")
    logged_in_user = User.objects.get(
        id=self.client.session['_auth_user_id'])
    if logged_in_user and logged_in_user.is_authenticated():
        user_groups = get_user_groups(logged_in_user)
        self.assertTrue('READ' in user_groups, "user present in READ group")
        # make sure the user is not yet in DIL group
        self.assertFalse('DIL' in user_groups,
                         "user not present in DIL group")

    # with only the implicit READ group, no group filtering is possible
    group_names = get_user_groups(logged_in_user)
    if 'READ' in group_names:
        group_names.remove('READ')  # @IgnorePep8
    group_names = [x.lower() for x in group_names]
    self.assertTrue(len(group_names) == 0, "No group present")

    # Match all query, as there is no group we do a match all
    query = ElasticQuery(Query.match_all())
    expected_query_string = {"query": {"match_all": {}}}
    self.assertJSONEqual(json.dumps(query.query),
                         json.dumps(expected_query_string),
                         "Query string matched")
    Search.index_refresh(self.index_name)
    elastic = Search(query, idx=self.index_name)
    docs = elastic.search().docs
    self.assertTrue(len(docs) == 12,
                    "Elastic string query retrieved all public regions")

    # Filtered query for group names, add the user to DIL group and get the query string
    self.dil_group = Group.objects.create(name='DIL')
    logged_in_user.groups.add(self.dil_group)
    group_names = get_user_groups(logged_in_user)
    if 'READ' in group_names:
        group_names.remove('READ')  # @IgnorePep8
    group_names = [x.lower() for x in group_names]
    self.assertTrue(len(group_names) > 0, "More than 1 group present")
    self.assertTrue("dil" in group_names, "DIL group present")

    # retrieves all docs with missing field group_name - 11 docs
    terms_filter = TermsFilter.get_missing_terms_filter(
        "field", "attr.group_name")
    query = ElasticQuery.filtered(Query.match_all(), terms_filter)
    elastic = Search(query, idx=self.index_name)
    docs = elastic.search().docs
    self.assertTrue(len(docs) == 11,
                    "Elastic string query retrieved all public regions")

    # build filtered boolean query to bring all public docs + private docs 11+1 = 12 docs
    query_bool = BoolQuery()
    query_bool.should(Query.missing_terms("field", "group_name")) \
        .should(Query.terms("group_name", group_names).query_wrap())
    query = ElasticQuery.filtered_bool(Query.match_all(), query_bool)
    elastic = Search(query, idx=self.index_name)
    docs = elastic.search().docs
    self.assertTrue(len(docs) == 12,
                    "Elastic string query retrieved both public + private regions")

    # private-only filter: just the single DIL region should match
    terms_filter = TermsFilter.get_terms_filter("attr.group_name",
                                                group_names)
    query = ElasticQuery.filtered(Query.match_all(), terms_filter)
    elastic = Search(query, idx=self.index_name)
    docs = elastic.search().docs
    self.assertTrue(len(docs) == 1,
                    "Elastic string query retrieved one private regions")
    self.assertEqual(docs[0].attr['Name'], "4q27", "type matched region")
    self.assertEqual(docs[0].attr['region_id'], "803",
                     "type matched region")
    self.assertEqual(docs[0].attr['group_name'], "[\"DIL\"]",
                     "type matched region")
def _add_diseases():
    ''' Add diseases dictionary to a context '''
    search = Search(search_query=ElasticQuery(Query.match_all()),
                    size=100, idx='disease')
    return search.get_json_response()['hits']['hits']