def test_sort_query(self):
    ''' Check that query results can be sorted, and that a bad sort
    argument raises QueryError. '''
    query = ElasticQuery(Query.match_all())
    # sort expressed as a string
    search = Search(query, idx=ElasticSettings.idx('DEFAULT'),
                    qsort=Sort('start:asc,_score'))
    self._check_sort_order(search.search().docs)
    # sort expressed as a dict with an explicit mode
    sort_dict = Sort({"sort": [{"start": {"order": "asc", "mode": "avg"}}]})
    search = Search(query, idx=ElasticSettings.idx('DEFAULT'), qsort=sort_dict)
    self._check_sort_order(search.search().docs)
    self.assertRaises(QueryError, Sort, 1)
def test_term_query(self):
    ''' Run term queries: one matching a single marker, one (boosted)
    matching many markers on a chromosome. '''
    search = Search(ElasticQuery(Query.term("id", "rs2476601")),
                    idx=ElasticSettings.idx('DEFAULT'))
    self.assertTrue(len(search.search().docs) == 1,
                    "Elastic string query retrieved marker (rs2476601)")
    search = Search(ElasticQuery(Query.term("seqid", "1", boost=3.0)),
                    idx=ElasticSettings.idx('DEFAULT'))
    self.assertTrue(len(search.search().docs) > 1,
                    "Elastic string query retrieved markers on chr1")
def test_query_ids(self):
    ''' Query documents by id, first as a list and then as a single id
    restricted to an index type. '''
    idx = ElasticSettings.idx('DEFAULT')
    docs = Search(ElasticQuery(Query.ids(['1', '2'])), idx=idx, size=5).search().docs
    self.assertTrue(len(docs) == 2, "Elastic string query retrieved marker (rs*)")
    # reuse the type of the first hit to scope the next lookup
    idx_type = docs[0].type()
    docs = Search(ElasticQuery(Query.ids('2', types=idx_type)), idx=idx, size=5).search().docs
    self.assertTrue(len(docs) == 1, "Elastic string query retrieved marker (rs*)")
def test_term(self):
    ''' Terms aggregation, run alone and then combined with an ids query. '''
    agg_name = "test"
    aggs = Aggs(Agg(agg_name, "terms", {"field": "seqid", "size": 0}))
    result = Search(aggs=aggs, idx=ElasticSettings.idx('DEFAULT')).search()
    self.assertTrue(agg_name in result.aggs, "returned test aggregations")
    # same aggregation, restricted by an ids query
    query = ElasticQuery(Query.ids(['1', '2']))
    result = Search(search_query=query, aggs=aggs,
                    idx=ElasticSettings.idx('DEFAULT'), size=5).search()
    buckets = result.aggs[agg_name].get_buckets()
    self.assertTrue(len(buckets) > 0, "returned test aggregation buckets")
    self.assertTrue(getattr(result.aggs[agg_name], 'buckets')[0]['doc_count'] >= 0,
                    "bucket document count")
def get_gene_docs_by_ensembl_id(cls, ens_ids, sources=None):
    ''' Look up gene documents for an array of ensembl IDs.

    Returns a dictionary keyed by ensembl ID with the gene document
    as the value. '''
    search = Search(ElasticQuery(Query.ids(ens_ids), sources=sources),
                    idx=ElasticSettings.idx('GENE', idx_type='GENE'),
                    size=len(ens_ids))
    result = {}
    for doc in search.search().docs:
        result[doc.doc_id()] = doc
    return result
def test_pubs_disease_tags(self):
    ''' Check the number of disease publications against the number of
    tags.disease and report differences`. '''
    count = True
    msg = ''
    for disease in DiseasePublicationTest.DISEASES:
        pmids = self._get_pmids(disease)
        disease_code = disease.lower()
        query = ElasticQuery(BoolQuery(b_filter=Filter(Query.term('tags.disease', disease_code))),
                             sources=['pmid'])
        elastic = Search(search_query=query, idx=ElasticSettings.idx('PUBLICATION'),
                         size=len(pmids)*2)
        res = elastic.get_count()
        msg += disease_code+'\tINDEX: '+str(res['count'])+'\tNCBI: '+str(len(pmids))
        if res['count'] != len(pmids):
            count = False
            # report PMIDs present only in the index / only at NCBI
            idx_pmids = [getattr(doc, 'pmid') for doc in elastic.search().docs]
            extra = [pmid for pmid in idx_pmids if pmid not in pmids]
            missing = [pmid for pmid in pmids if pmid not in idx_pmids]
            if len(extra) > 0:
                msg += '\textra PMIDs: '+str(extra)
            if len(missing) > 0:
                msg += '\tmissing PMIDs: '+str(missing)
        msg += '\n'
    print(msg)
    self.assertTrue(count, 'Count for disease tags')
def test_significant_terms(self):
    ''' Significant Terms Aggregation '''
    aggs = Aggs(Agg("test_significant_terms", "significant_terms", {"field": "start"}))
    result = Search(aggs=aggs, idx=ElasticSettings.idx('DEFAULT')).search()
    self.assertTrue('test_significant_terms' in result.aggs, "returned aggregations")
def show_es_gene_section(gene_symbol=None, seqid=None, start_pos=None, end_pos=None):
    ''' Template inclusion tag to render a gene section given a chado gene feature. '''
    # normalise: strip any 'chr' prefix from the sequence id
    seqid = str(seqid).replace('chr', '')
    if gene_symbol is not None:
        # look the gene up by symbol
        query = ElasticQuery.query_match("symbol", gene_symbol)
    else:
        if end_pos is None:
            # single position (e.g. a SNP): genes whose span contains start_pos
            must = [Query.match("chromosome", seqid),
                    RangeQuery("start", lte=start_pos),
                    RangeQuery("stop", gte=start_pos)]
        else:
            # interval: genes fully contained between start_pos and end_pos
            must = [Query.match("chromosome", seqid),
                    RangeQuery("start", gte=start_pos),
                    RangeQuery("stop", lte=end_pos)]
        query = ElasticQuery.bool(BoolQuery(must_arr=must))
    elastic = Search(query, idx=ElasticSettings.idx(name='GENE'))
    return {'es_genes': elastic.search().docs}
def test_string_query(self):
    ''' Build and run a string query; an unknown keyword raises QueryError. '''
    query = ElasticQuery.query_string("rs2476601", fields=["id"])
    result = Search(query, idx=ElasticSettings.idx('DEFAULT')).search()
    self.assertTrue(len(result.docs) == 1,
                    "Elastic string query retrieved marker (rs2476601)")
    self.assertRaises(QueryError, ElasticQuery.query_string, "rs2476601", fieldssss=["id"])
def test_filter(self):
    ''' Filter aggregation plus a set of metric aggregations over start. '''
    metrics = [('avg_start', 'avg'), ('min_start', 'min'), ('sum_start', 'sum'),
               ('stats_start', 'stats'), ('count_start', 'value_count'),
               ('ext_stats_start', 'extended_stats')]
    agg_list = [Agg('test_filter', 'filter', RangeQuery('start', gt='25000'))]
    agg_list.extend(Agg(name, kind, {"field": 'start'}) for name, kind in metrics)
    r_aggs = Search(aggs=Aggs(agg_list), idx=ElasticSettings.idx('DEFAULT')).search().aggs
    self.assertTrue('avg_start' in r_aggs, "returned avg aggregation")
    self.assertTrue('min_start' in r_aggs, "returned min aggregation")
    stats_keys = ["min", "max", "sum", "count", "avg"]
    self.assertTrue(all(hasattr(r_aggs['stats_start'], k) for k in stats_keys),
                    "returned min aggregation")
    stats_keys.extend(["sum_of_squares", "variance", "std_deviation", "std_deviation_bounds"])
    self.assertTrue(all(hasattr(r_aggs['ext_stats_start'], k) for k in stats_keys),
                    "returned min aggregation")
def _get_pub_docs_by_pmid(pmids, sources=None):
    """ Get the publication documents for the given array of PubMed IDs.

    A dictionary is returned with the key being the PMID and the value
    the publication document. (Docstring previously described genes and
    ensembl IDs — a copy/paste from a gene helper; the code queries the
    PUBLICATION index by id.) """
    query = ElasticQuery(Query.ids(pmids), sources=sources)
    elastic = Search(query, idx=ElasticSettings.idx("PUBLICATION"), size=len(pmids))
    return {doc.doc_id(): doc for doc in elastic.search().docs}
def test_missing_terms_filtered_query(self):
    ''' Test filtered query with a missing terms filter. '''
    missing = TermsFilter.get_missing_terms_filter("field", "group_name")
    query = ElasticQuery.filtered(Query.match_all(), missing)
    hits = Search(query, idx=ElasticSettings.idx('DEFAULT')).search().docs
    self.assertTrue(len(hits) == 3, "Elastic string query retrieved all public docs")
def filter_queryset(self, request, queryset, view):
    ''' Override this method to request just the documents required from Rserve.

    Resolves the requested marker's position from the MARKER index, asks
    Rserve for population data at that position and returns a single
    ElasticObject wrapping the populations. On any handled failure an
    ElasticObject with 'populations': None is returned instead.
    '''
    # Default up-front so the except clause can never hit an unbound local:
    # previously a failure before mid1 was assigned raised NameError in the
    # handler instead of returning the error object.
    mid1 = 'rs2476601'
    try:
        filterable = getattr(view, 'filter_fields', [])
        filters = {k: v for k, v in request.GET.items() if k in filterable}
        mid1 = filters.get('marker', 'rs2476601')
        dataset = filters.get('dataset', 'EUR').replace('-', '')
        query = ElasticQuery(BoolQuery(must_arr=[Query.term("id", mid1)]),
                             sources=['seqid', 'start'])
        elastic = Search(search_query=query,
                         idx=ElasticSettings.idx('MARKER', 'MARKER'), size=1)
        doc = elastic.search().docs[0]
        seqid = getattr(doc, 'seqid')
        rserve = getattr(settings, 'RSERVE')
        conn = pyRserve.connect(host=rserve.get('HOST'), port=rserve.get('PORT'))
        try:
            pop_str = conn.r.get_pop(dataset, seqid, mid1)
        finally:
            conn.close()  # previously leaked if the R call raised
        pops = json.loads(str(pop_str))
        populations = []
        for pop in pops:
            pops[pop]['population'] = pop
            populations.append(pops[pop])
        return [ElasticObject(initial={'populations': populations, 'marker': mid1})]
    except (TypeError, ValueError, IndexError, ConnectionError):
        return [ElasticObject(initial={'populations': None, 'marker': mid1})]
def get_disease_tags(cls, feature_id, idx=None, idx_type=None):
    ''' Get the aggregated list of disease_tags for a given feature id,
    aggregated from all criteria_types for a feature type.
    @type feature_id: string
    @keyword feature_id: Id of the feature (gene => gene_id, region=>region_id)
    @type idx: string
    @param idx: name of the index
    @type idx_type: string
    @param idx_type: name of the idx type, each criteria is an index type
    @return: list of disease documents, [] if the search/aggregation fails,
             or None when no disease tags were found
    '''
    query = ElasticQuery(Query.term("qid", feature_id))
    aggs = Aggs(Agg("criteria_disease_tags", "terms",
                    {"field": "disease_tags", "size": 0}))
    if idx_type:
        search = Search(query, aggs=aggs, idx=idx, idx_type=idx_type)
    else:
        search = Search(query, aggs=aggs, idx=idx)
    try:
        buckets = search.search().aggs['criteria_disease_tags'].get_buckets()
        disease_tags = [dis_dict['key'].lower() for dis_dict in buckets]
    except Exception:
        # was a bare `except:` — narrowed so SystemExit/KeyboardInterrupt are
        # no longer swallowed; any search/parse failure still yields []
        return []
    # resolve the tag codes to disease documents
    if len(disease_tags) > 0:
        (core, other) = Disease.get_site_diseases(dis_list=disease_tags)
        diseases = list(core)
        diseases.extend(other)
        return diseases
    return None
def show_es_gene_section(gene_symbol=None, seqid=None, start_pos=None, end_pos=None):
    ''' Template inclusion tag to render a gene section given a chado gene feature. '''
    # normalise the sequence id to a 'chr'-prefixed string
    # (isinstance already excludes None, so no separate None test is needed)
    if not (isinstance(seqid, str) and seqid.startswith("chr")):
        seqid = 'chr' + str(seqid)
    if gene_symbol is not None:
        # look the gene up by symbol
        query = ElasticQuery.query_match("gene_symbol", gene_symbol)
    else:
        if end_pos is None:
            # single position (e.g. a SNP): genes whose span contains start_pos
            must = [Query.match("seqid", seqid),
                    RangeQuery("featureloc.start", lte=start_pos),
                    RangeQuery("featureloc.end", gte=start_pos)]
        else:
            # interval: genes fully contained between start_pos and end_pos
            must = [Query.match("seqid", seqid),
                    RangeQuery("featureloc.start", gte=start_pos),
                    RangeQuery("featureloc.end", lte=end_pos)]
        query = ElasticQuery.bool(BoolQuery(must_arr=must))
    elastic = Search(query, idx=ElasticSettings.idx(name='GENE'))
    return {'es_genes': elastic.search().docs}
def test_top_hits(self):
    ''' Top Hits Aggregation '''
    aggs = Aggs([Agg('test_filter', 'filter', RangeQuery('start', gt='2000')),
                 Agg('test_top_hits', 'top_hits', {"size": 1})])
    top = Search(aggs=aggs, idx=ElasticSettings.idx('DEFAULT')).search().aggs['test_top_hits']
    self.assertTrue(len(top.get_hits()) == 1, "returned the top hit")
def test_top_hits_sub_agg(self):
    ''' Top hits as a sub-aggregation of a terms aggregation on _index. '''
    idx = ElasticSettings.idx('DEFAULT')
    top = Agg('idx_top_hits', 'top_hits', {"size": 1})
    aggs = Aggs([Agg("idxs", "terms", {"field": "_index"}, sub_agg=top),
                 Agg("categories", "terms", {"field": "_type", "size": 0})])
    buckets = Search(aggs=aggs, idx=idx).search().aggs['idxs'].get_docs_in_buckets()
    self.assertEqual(buckets[idx]['doc_count'], 3)
    self.assertEqual(len(buckets[idx]['docs']), 1)
def test_missing(self):
    ''' Missing Aggregation '''
    aggs = Aggs(Agg("test_missing", "missing", {"field": "seqid"}))
    r_aggs = Search(aggs=aggs, idx=ElasticSettings.idx('DEFAULT')).search().aggs
    self.assertTrue(getattr(r_aggs['test_missing'], 'doc_count') == 0,
                    "no missing seqid fields")
def check_hits(resp_json):
    ''' Cross-check marker positions in the given search response against
    the MARKER index (dbSNP) and log any discrepancies greater than 1bp.

    NOTE(review): this function uses `self` (self._get_highest_build) but
    takes no `self` parameter — presumably it is defined nested inside a
    method; confirm against the enclosing scope.
    '''
    # map rsid -> immunochip document for every hit in the response
    rsids = {}
    docs = [Document(hit) for hit in resp_json['hits']['hits']]
    for doc in docs:
        rsid = getattr(doc, "id")
        if rsid is not None:
            rsids[rsid] = doc
    rsids_keys = list(rsids.keys())
    # fetch the same markers from the MARKER index in one terms-filtered query
    terms_filter = TermsFilter.get_terms_filter("id", rsids_keys)
    query = ElasticQuery.filtered(Query.match_all(), terms_filter)
    elastic = Search(query, idx=ElasticSettings.idx('MARKER', 'MARKER'),
                     size=len(rsids_keys))
    docs_by_rsid = elastic.search().docs
    for doc in docs_by_rsid:
        info = getattr(doc, "info")
        # only compare single-nucleotide variants
        if 'VC=SNV' not in info:
            continue
        rsid = getattr(doc, "id")
        ic_doc = rsids[rsid]
        pos1 = getattr(doc, "start")
        pos2 = self._get_highest_build(ic_doc)['position']
        # positions differing by more than 1bp are suspicious
        if abs(int(pos1) - int(pos2)) > 1:
            is_par = getattr(ic_doc, 'is_par')
            allele_a = getattr(ic_doc, 'allele_a')
            # ignore pseudo-autosomal markers and indels (D/I alleles)
            if is_par is None and not (allele_a == 'D' or allele_a == 'I'):
                msg = ("CHECK IC/DBSNP POSITIONS:: "+getattr(ic_doc, 'name') +
                       ' '+str(pos2)+" "+rsid+' '+str(pos1))
                # ' ('+ic_doc.doc_id()+' '+json.dumps(getattr(ic_doc, 'build_info'))+')'
                # list any markers already at the immunochip position
                query = ElasticQuery.filtered(Query.term("seqid", getattr(doc, 'seqid')),
                                              Filter(Query.term("start", pos2)))
                elastic = Search(query, idx=ElasticSettings.idx('MARKER', 'MARKER'))
                docs_by_pos = elastic.search().docs
                if len(docs_by_pos) > 0:
                    for d in docs_by_pos:
                        msg += " ("+getattr(d, "id")+":"+str(getattr(d, "start"))+")"
                # check the rs-merge history for this rsid
                query = ElasticQuery.filtered(Query.match_all(),
                                              Filter(Query.term("rslow", rsid)))
                elastic = Search(query, idx=ElasticSettings.idx('MARKER', 'HISTORY'))
                docs_by_pos = elastic.search().docs
                if len(docs_by_pos) > 0:
                    for d in docs_by_pos:
                        msg += " (rshigh:"+str(getattr(d, "rshigh")) + \
                               " build_id:"+str(getattr(d, "build_id"))+")"
                logger.error(msg)
def test_and_filtered_query(self):
    ''' Test building and running a filtered query. '''
    bool_q = BoolQuery(must_arr=[RangeQuery("start", gte=1)])
    and_f = AndFilter(bool_q)
    and_f.extend(RangeQuery("start", gte=1)).extend(Query.term("seqid", 1))
    query = ElasticQuery.filtered(Query.term("seqid", 1), and_f)
    result = Search(query, idx=ElasticSettings.idx('DEFAULT')).search()
    self.assertTrue(result.hits_total >= 1,
                    "Elastic filtered query retrieved marker(s)")
def test_bool_filtered_query(self):
    ''' Test building and running a filtered boolean query. '''
    bool_q = BoolQuery(must_not_arr=[Query.term("seqid", 2)],
                       should_arr=[RangeQuery("start", gte=10050)])
    bool_q.must([Query.term("id", "rs768019142")])
    bool_q.should(RangeQuery("start", gte=10054))
    query = ElasticQuery.filtered_bool(Query.match_all(), bool_q,
                                       sources=["id", "seqid"])
    result = Search(query, idx=ElasticSettings.idx('DEFAULT')).search()
    self.assertTrue(result.hits_total == 1,
                    "Elastic filtered query retrieved marker (rs768019142)")
def fetch_overlapping_features(cls, build, seqid, start, end, idx=None, idx_type=None, disease_id=None):
    ''' Fetch features overlapping a given stretch of region; the build
    info is stored as a nested document so a nested query is built.
    @type build: string
    @param build: build info eg: 'GRCh38'
    @type seqid: string
    @param seqid: chromosome number
    @type start: string
    @param start: region start
    @type end: string
    @param end: region end
    @type idx: string
    @param idx: name of the index
    @type idx_type: string
    @param idx_type: name of the idx type, each criteria is an index type
    @type disease_id: string
    @param disease_id: disease code
    '''
    nbuild = build
    start_range = start
    end_range = end
    # features whose span fully covers [start, end]
    bool_range = BoolQuery()
    bool_range.must(RangeQuery("build_info.start", lte=start_range)) \
        .must(RangeQuery("build_info.end", gte=end_range))
    # ... OR features whose start or end falls inside [start, end]
    or_filter = OrFilter(RangeQuery("build_info.start", gte=start_range, lte=end_range))
    or_filter.extend(RangeQuery("build_info.end", gte=start_range, lte=end_range)) \
        .extend(bool_range)
    bool_query = BoolQuery()
    if disease_id:
        # NOTE(review): the nested query here wraps the *empty* bool_query
        # created above — neither the build/seqid terms nor or_filter are
        # attached in this branch, so it matches any build_info. Confirm
        # this is intentional (i.e. disease lookups ignore coordinates).
        qnested_buildinfo = Query.nested('build_info', bool_query)
        bool_query = BoolQuery()
        bool_query.must(Query.term("disease", disease_id.lower())).must(qnested_buildinfo)
        qnested = ElasticQuery(bool_query,
                               sources=['build_info.*', 'disease_locus', 'disease', 'chr_band', 'species'])
    else:
        # coordinate-based lookup: restrict to build + chromosome and the
        # overlap filter built above
        bool_query.must(Query.term("build_info.build", nbuild)) \
            .must(Query.term("build_info.seqid", seqid)) \
            .filter(or_filter)
        qnested = ElasticQuery(Query.nested('build_info', bool_query),
                               sources=['build_info.*', 'disease_locus', 'disease', 'chr_band', 'species'])
    elastic = Search(qnested, idx=idx, idx_type=idx_type)
    res = elastic.search()
    return res.docs
def test_url_rotate(self):
    ''' Test the url rotates from http://xxx:9200 to correct url. '''
    query = ElasticQuery.filtered(Query.term("seqid", 1),
                                  Filter(Query.term("id", "rs768019142")))
    result = Search(query, idx=ElasticSettings.idx('DEFAULT')).search()
    self.assertTrue(result.hits_total == 1,
                    "Elastic filtered query retrieved marker")
    Search.index_exists('test', 'test2')
    ElasticUrl.URL_INDEX = 0  # reset
def test_terms_avg_order(self):
    ''' Order a terms aggregation by the average of a sub-aggregation. '''
    agg_name = "test"
    avg_agg = Agg('avg_start', 'avg', {"field": "start"})
    terms_agg = Agg(agg_name, "terms",
                    {"field": "seqid", "size": 0, "order": {"avg_start": "desc"}},
                    sub_agg=avg_agg)
    r_aggs = Search(aggs=Aggs(terms_agg), idx=ElasticSettings.idx('DEFAULT')).search().aggs
    self.assertTrue(agg_name in r_aggs, "returned test aggregations")
    self.assertGreater(r_aggs['test'].get_buckets()[0]['doc_count'], 1)
def test_bool_filtered_query2(self):
    ''' Test building and running a filtered boolean query. '''
    bool_q = BoolQuery()
    bool_q.should(RangeQuery("start", lte=20000))
    bool_q.should(Query.term("seqid", 2))
    bool_q.must(Query.term("seqid", 1))
    qstring = Query.query_string("rs768019142", fields=["id", "seqid"])
    query = ElasticQuery.filtered_bool(qstring, bool_q,
                                       sources=["id", "seqid", "start"])
    result = Search(query, idx=ElasticSettings.idx('DEFAULT')).search()
    self.assertTrue(result.hits_total == 1,
                    "Elastic filtered query retrieved marker (rs768019142)")
def is_region_for_disease(cls, hit, section=None, config=None, result_container=None):
    ''' Populate the result container with the region's diseases.

    Walks the region's disease loci, collects the diseases of their
    study hits and records the region against each disease. Returns the
    (possibly unchanged) result container; processing stops early when a
    study hit has status != 'N' or its disease locus is 'tbc'.
    '''
    # mutable default argument replaced with a None sentinel: the old
    # `result_container={}` dict was created once and shared across calls
    if result_container is None:
        result_container = {}
    result_container_populated = result_container
    feature_doc = hit['_source']
    feature_doc['_id'] = hit['_id']
    disease_loci = feature_doc['disease_loci']
    region_id = feature_doc['region_id']
    diseases = set()
    for disease_locus_id in disease_loci:
        query = ElasticQuery(Query.ids([disease_locus_id]), sources=['hits'])
        elastic = Search(query, idx=ElasticSettings.idx('REGION', idx_type='DISEASE_LOCUS'))
        for disease_locus_hit in elastic.search().docs:
            # renamed loop variable: previously shadowed the `hit` parameter
            for hit_id in getattr(disease_locus_hit, 'hits'):
                query = ElasticQuery(Query.ids([hit_id]))
                elastic = Search(query, idx=ElasticSettings.idx('REGION', idx_type='STUDY_HITS'))
                hit_doc = elastic.search().docs[0]
                disease = getattr(hit_doc, "disease")
                if getattr(hit_doc, "status") != 'N':
                    return result_container
                # renamed: previously rebound the outer `disease_loci` list
                locus_name = getattr(hit_doc, "disease_locus").lower()
                if locus_name == 'tbc':
                    return result_container
                diseases.add(disease)
    for disease in diseases:
        result_container_populated = cls.populate_container(
            disease, disease, fnotes=None, features=[region_id],
            diseases=[disease], result_container=result_container_populated)
    return result_container_populated
def test_filters(self):
    ''' Filters Aggregation '''
    named_filters = {'filters': {'start_gt': RangeQuery('start', gt='1000'),
                                 'start_lt': RangeQuery('start', lt='100000')}}
    aggs = Aggs(Agg('test_filters', 'filters', named_filters))
    r_aggs = Search(aggs=aggs, idx=ElasticSettings.idx('DEFAULT')).search().aggs
    self.assertTrue('start_lt' in r_aggs['test_filters'].get_buckets(),
                    "returned avg aggregation")
def test_terms_query(self):
    ''' Terms query with highlighting on the id field. '''
    query = ElasticQuery(Query.terms("id", ["rs2476601", "rs768019142"]),
                         highlight=Highlight(["id"]))
    hits = Search(query, idx=ElasticSettings.idx('DEFAULT')).search().docs
    self.assertTrue(len(hits) == 2,
                    "Elastic string query retrieved markers (rs2476601, rs768019142)")
    self.assertTrue(getattr(hits[0], 'seqid'), "Hit attribute found")
    self.assertTrue(hits[0].highlight() is not None, "highlighting found")
def test_range(self):
    ''' Range Aggregation '''
    ranges = [{"to": 10000}, {"from": 10000, "to": 15000}]
    aggs = Aggs(Agg("test_range_agg", "range", {"field": "start", "ranges": ranges}))
    r_aggs = Search(aggs=aggs, idx=ElasticSettings.idx('DEFAULT')).search().aggs
    self.assertTrue(len(r_aggs['test_range_agg'].get_buckets()) == 2,
                    "returned two buckets in range aggregations")
def test_or_filtered_query(self):
    ''' Test building and running a filtered query. '''
    bool_q = BoolQuery(must_arr=[RangeQuery("start", lte=1),
                                 RangeQuery("end", gte=100000)])
    or_f = OrFilter(RangeQuery("start", gte=1, lte=100000))
    or_f.extend(bool_q)
    or_f.extend(Query.query_string("rs*", fields=["id", "seqid"]).query_wrap())
    query = ElasticQuery.filtered(Query.term("seqid", 1), or_f,
                                  highlight=Highlight(["id", "seqid"]))
    result = Search(query, idx=ElasticSettings.idx('DEFAULT')).search()
    self.assertTrue(result.hits_total >= 1,
                    "Elastic filtered query retrieved marker(s)")
def get_rdm_docs(cls, idx, idx_type, qbool=Query.match_all(), sources=None, size=1):
    ''' Get random doc(s) from the indices.

    Uses a random_score function-score query so repeated calls return
    different documents. Retries on IndexError (empty result page).
    '''
    # mutable default `sources=[]` replaced with a None sentinel (shared
    # default-argument pitfall); downstream still receives [] when omitted
    if sources is None:
        sources = []
    score_function1 = ScoreFunction.create_score_function(
        'random_score', seed=random.randint(0, 1000000))
    search_query = ElasticQuery(FunctionScoreQuery(qbool, [score_function1],
                                                   boost_mode='replace'),
                                sources=sources)
    elastic = Search(search_query=search_query, size=size, idx=idx, idx_type=idx_type)
    try:
        return elastic.search().docs
    except IndexError:
        # NOTE(review): unbounded recursion if the search keeps raising —
        # confirm an empty index cannot reach this path repeatedly
        return cls.get_rdm_docs(idx, idx_type, qbool, sources, size)
def test_bool_filtered_query4(self):
    ''' Test building and running a filtered boolean query.
    Note: ElasticQuery used to wrap match in a query object. '''
    bool_q = BoolQuery()
    bool_q.should(RangeQuery("start", lte=20000))
    bool_q.should(Query.term("seqid", 2))
    bool_q.must(Query.match("id", "rs768019142").query_wrap())
    bool_q.must(Query.term("seqid", 1))
    query = ElasticQuery.filtered_bool(Query.match_all(), bool_q,
                                       sources=["id", "seqid", "start"])
    result = Search(query, idx=ElasticSettings.idx('DEFAULT')).search()
    self.assertTrue(result.hits_total == 1,
                    "Elastic filtered query retrieved marker (rs768019142)")
def test_pub_ini_file2(self):
    ''' Test publication pipeline with a list of PMIDs. '''
    out = StringIO()
    call_command('publications', '--dir', TEST_DATA_DIR, '--steps', 'load',
                 sections='DISEASE::TEST', ini=MY_PUB_INI_FILE, stdout=out)
    ini_config = IniParser().read_ini(MY_PUB_INI_FILE)
    idx = ini_config['DISEASE']['index']
    Search.index_refresh(idx)
    query = ElasticQuery.query_string("test", fields=["tags.disease"])
    hits = Search(query, idx=idx).search().docs
    self.assertGreater(len(hits), 1)
def test_bool_nested_filter(self):
    ''' Test combined Bool filter '''
    inner = BoolQuery()
    inner.must(Query.match("id", "rs768019142").query_wrap())
    inner.must(Query.term("seqid", 1))
    outer = BoolQuery()
    outer.should(inner)
    outer.should(Query.term("seqid", 2))
    query = ElasticQuery.filtered_bool(Query.match_all(), outer,
                                       sources=["id", "seqid", "start"])
    result = Search(query, idx=ElasticSettings.idx('DEFAULT')).search()
    self.assertTrue(result.hits_total >= 1, "Nested bool filter query")
def test_bool_query(self):
    ''' Test a bool query. '''
    bool_q = BoolQuery()
    bool_q.must(Query.term("id", "rs768019142"))
    bool_q.must(RangeQuery("start", gt=1000))
    bool_q.must_not(Query.match("seqid", "2"))
    bool_q.should(Query.match("seqid", "3"))
    bool_q.should(Query.match("seqid", "1"))
    query = ElasticQuery.bool(bool_q, highlight=Highlight(["id", "seqid"]))
    hits = Search(query, idx=ElasticSettings.idx('DEFAULT')).search().docs
    self.assertTrue(len(hits) == 1,
                    "Elastic string query retrieved marker (rs768019142)")
def test_mapping_parent_child(self):
    ''' Test creating mapping with parent child relationship.

    Builds a gene (parent) and publication (child) mapping, bulk-loads
    one document of each, then exercises has_parent and has_child
    queries against the temporary index.
    '''
    gene_mapping = MappingProperties("gene")
    gene_mapping.add_property("symbol", "string", analyzer="full_name")
    inta_mapping = MappingProperties("publication", "gene")
    load = Loader()
    idx = "test__mapping__"+SEARCH_SUFFIX
    options = {"indexName": idx, "shards": 1}
    requests.delete(ElasticSettings.url() + '/' + idx)
    # add child mappings first
    status = load.mapping(inta_mapping, "publication",
                          analyzer=Loader.KEYWORD_ANALYZER, **options)
    self.assertTrue(status, "mapping interactions")  # typo "inteactions" fixed
    status = load.mapping(gene_mapping, "gene",
                          analyzer=Loader.KEYWORD_ANALYZER, **options)
    self.assertTrue(status, "mapping genes")
    # load docs and test has_parent query
    json_data = '{"index": {"_index": "%s", "_type": "gene", "_id" : "1"}}\n' % idx
    json_data += json.dumps({"symbol": "PAX1"}) + '\n'
    json_data += '{"index": {"_index": "%s", "_type": "publication", "_id" : "2", "parent": "1"}}\n' % idx
    json_data += json.dumps({"pubmed": 1234}) + '\n'
    Bulk.load(idx, '', json_data)
    Search.index_refresh(idx)
    query = ElasticQuery.has_parent('gene', Query.match('symbol', 'PAX1'))
    docs = Search(query, idx=idx, idx_type='publication', size=500).search().docs
    # assertEquals is a deprecated unittest alias — replaced with assertEqual
    self.assertEqual(len(docs), 1)
    self.assertEqual(getattr(docs[0], 'pubmed'), 1234)
    self.assertEqual(docs[0].parent(), '1')
    self.assertRaises(QueryError, ElasticQuery.has_parent, 'gene', 'xxxxx')
    # test has_child query
    query = ElasticQuery.has_child('publication', Query.match('pubmed', 1234))
    docs = Search(query, idx=idx, idx_type='gene', size=500).search().docs
    self.assertEqual(len(docs), 1)
    self.assertEqual(getattr(docs[0], 'symbol'), 'PAX1')
    self.assertEqual(docs[0].parent(), None)
    requests.delete(ElasticSettings.url() + '/' + idx)
def filter_queryset(self, request, queryset, view):
    ''' Override this method to request feature locations.

    Resolves a feature name (gene symbol, marker id, region name ...)
    across the MARKER/REGION/GENE indices for the requested genome build
    and returns a list of ElasticObjects with chr/start/end locations.
    Raises Http404 on any handled failure.
    '''
    try:
        filterable = getattr(view, 'filter_fields', [])
        filters = dict([(k, v) for k, v in request.GET.items() if k in filterable])
        query_str = filters.get('feature', 'PTPN22')
        build = self._get_build(filters.get('build', settings.DEFAULT_BUILD))
        if query_str is None or query_str == '':
            return [ElasticObject(initial={'error': 'No feature name provided.'})]
        search_fields = ['id', 'symbol', 'dbxrefs.ensembl', 'region_name']
        sources = ['start', 'stop', 'seqid', 'chromosome', 'disease_loci']
        # pick the marker index whose build label matches the requested build
        idxs = ElasticSettings.getattr('IDX')
        MARKER_IDX = ''
        if build == ElasticSettings.get_label('MARKER', label='build'):
            MARKER_IDX = 'MARKER'
        if MARKER_IDX == '':
            # fall back to scanning all MARKER_* index keys
            for idx in idxs:
                if 'MARKER' in idx:
                    if build == ElasticSettings.get_label(idx, label='build'):
                        MARKER_IDX = idx
        # search marker, region and gene indices in one request
        (idx, idx_type) = ElasticSettings.idx_names(MARKER_IDX, 'MARKER')
        (idx_r, idx_type_r) = ElasticSettings.idx_names('REGION', 'REGION')
        (idx_g, idx_type_g) = ElasticSettings.idx_names('GENE', 'GENE')
        idx += ',' + idx_r + ',' + idx_g
        idx_type += ',' + idx_type_r + ',' + idx_type_g
        # NOTE(review): must_arr is given a single Query rather than a list —
        # presumably BoolQuery accepts both; confirm against its signature
        equery = BoolQuery(must_arr=Query.query_string(query_str, fields=search_fields))
        elastic = Search(search_query=ElasticQuery(equery, sources),
                         size=10, idx=idx, idx_type=idx_type)
        docs = elastic.search().docs
        locs = []
        for doc in docs:
            if isinstance(doc, RegionDocument):
                # regions need padding before a position can be derived
                doc = Region.pad_region_doc(doc)
            # position string is "chrN:start-end" (or "chrN:pos")
            loc = doc.get_position(build=build).split(':')
            pos = loc[1].replace(',', '').split('-')
            locs.append(ElasticObject(
                {'feature': query_str,
                 'chr': loc[0],
                 'start': int(pos[0]),
                 'end': int(pos[1]) if len(pos) > 1 else int(pos[0]),
                 'locusString': query_str+" ("+str(loc[1])+")"}))
        return locs
    except (TypeError, ValueError, IndexError, ConnectionError):
        raise Http404
def study_page(request, study):
    ''' Renders a study page. '''
    if study is None:
        messages.error(request, 'No study id given.')
        raise Http404()
    query = ElasticQuery(Query.ids(study.split(',')))
    res = Search(query, idx=ElasticSettings.idx('STUDY', 'STUDY'),
                 size=5).search(obj_document=StudyDocument)
    if res.hits_total == 0:
        messages.error(request, 'Study(s) '+study+' not found.')
    elif res.hits_total < 9:
        names = ', '.join(getattr(doc, 'study_name') for doc in res.docs)
        return render(request, 'study/study.html',
                      {'features': res.docs, 'title': names},
                      content_type='text/html')
    raise Http404()
def filter_queryset(self, request, queryset, view):
    """ Override this method to request just the documents required from Rserve.

    Resolves marker m1's position from the MARKER index, then asks Rserve
    to run an LD calculation for the requested dataset/window and returns
    the parsed result wrapped in an ElasticObject. Raises Http404 on any
    handled failure.
    """
    try:
        filterable = getattr(view, "filter_fields", [])
        filters = dict([(k, v) for k, v in request.GET.items() if k in filterable])
        mid1 = filters.get("m1")
        if mid1 is None or mid1 == "":
            return [ElasticObject(initial={"error": "No marker ID provided."})]
        dataset = filters.get("dataset", "EUR").replace("-", "")
        mid2 = filters.get("m2")
        window_size = int(filters.get("window_size", 1000000))
        # NOTE(review): when supplied via the request these are strings,
        # while the defaults are float/bool — presumably the R side coerces;
        # confirm against ld_run
        dprime = filters.get("dprime", 0.0)
        rsq = filters.get("rsq", 0.8)
        maf = filters.get("maf", False)
        if maf:
            maf = True
        build_version = filters.get("build", "GRCh38").lower()
        pos = filters.get("pos", False)
        if pos:
            pos = True
        # look up the chromosome of marker m1
        query = ElasticQuery(BoolQuery(must_arr=[Query.term("id", mid1)]),
                             sources=["seqid", "start"])
        elastic = Search(search_query=query,
                         idx=ElasticSettings.idx("MARKER", "MARKER"), size=1)
        doc = elastic.search().docs[0]
        seqid = getattr(doc, "seqid")
        # run the LD calculation on the Rserve host
        rserve = getattr(settings, "RSERVE")
        conn = pyRserve.connect(host=rserve.get("HOST"), port=rserve.get("PORT"))
        ld_str = conn.r.ld_run(
            dataset,
            seqid,
            mid1,
            marker2=mid2,
            window_size=window_size,
            dprime=dprime,
            rsq=rsq,
            maf=maf,
            position=pos,
            build_version=build_version,
        )
        # normalise the R column names for JSON consumers
        ld_str = ld_str.replace("D.prime", "dprime").replace("R.squared", "rsquared")
        conn.close()
        return [ElasticObject(initial=json.loads(str(ld_str)))]
    except (TypeError, ValueError, IndexError, ConnectionError):
        raise Http404
def marker_page(request):
    ''' Renders a marker page. '''
    # docstring and error message previously said "gene" — copy/paste from a
    # gene view; message now matches get_marker's 'No marker name given.'
    query_dict = request.GET
    marker = query_dict.get("m")
    if marker is None:
        messages.error(request, 'No marker name given.')
        raise Http404()
    # rs-prefixed names are looked up by id/rscurrent, others by name
    fields = ['id', 'rscurrent'] if marker.startswith("rs") else ['name']
    sub_agg = Agg('top_hits', 'top_hits', {"size": 15})
    aggs = Aggs(Agg("types", "terms", {"field": "_type"}, sub_agg=sub_agg))
    query = ElasticQuery(Query.query_string(marker, fields=fields))
    elastic = Search(search_query=query, idx=ElasticSettings.idx('MARKER'),
                     aggs=aggs, size=0)
    res = elastic.search()
    if res.hits_total >= 1:
        # bucket the hits by document type
        types = getattr(res.aggs['types'], 'buckets')
        marker_doc = None
        ic_docs = []
        history_docs = []
        for doc_type in types:
            hits = doc_type['top_hits']['hits']['hits']
            for hit in hits:
                doc = Document(hit)
                if 'marker' == doc_type['key']:
                    marker_doc = doc
                elif 'immunochip' == doc_type['key']:
                    ic_docs.append(doc)
                elif 'rs_merge' == doc_type['key']:
                    history_docs.append(doc)
        criteria = {}
        if marker_doc is not None:
            if ElasticSettings.idx('CRITERIA') is not None:
                criteria = views.get_criteria([marker_doc], 'marker', 'id', 'MARKER')
            marker_doc.marker_build = _get_marker_build(ElasticSettings.idx('MARKER'))
        context = {
            'marker': marker_doc,
            'old_dbsnp_docs': _get_old_dbsnps(marker),
            'ic': ic_docs,
            'history': history_docs,
            'criteria': criteria
        }
        return render(request, 'marker/marker.html', context, content_type='text/html')
    elif res.hits_total == 0:
        messages.error(request, 'Marker '+marker+' not found.')
    raise Http404()
def _get_old_dbsnps(marker):
    ''' Get markers from old versions of DBSNP. Assumes the index key is
    prefixed by 'MARKER_'. '''
    idx_keys = ElasticSettings.getattr('IDX').keys()
    old_idx_names = sorted((ElasticSettings.idx(k) for k in idx_keys if 'MARKER_' in k),
                           reverse=True)
    old_docs = []
    if len(old_idx_names) > 0:
        query = ElasticQuery(Query.query_string(marker, fields=['id', 'rscurrent']))
        for idx_name in old_idx_names:
            docs = Search(search_query=query, idx=idx_name,
                          idx_type='marker').search().docs
            if len(docs) > 0:
                # keep the first hit per old index, tagged with its build
                doc = docs[0]
                doc.marker_build = _get_marker_build(idx_name)
                old_docs.append(doc)
    return old_docs
def disease_page(request, disease):
    ''' Renders a disease page. '''
    # The None check must come before lower(): previously disease.lower()
    # ran first and raised AttributeError when disease was None, so the
    # guard (and its error message) was unreachable.
    if disease is None:
        messages.error(request, 'No disease given.')
        raise Http404()
    disease = disease.lower()
    # Query.terms takes the list of codes itself; the old call wrapped it
    # in another list ([disease.split(',')]) so lookups never matched the
    # plain code values (compare study_page/region_page which pass
    # study.split(',') / region.split(',') directly).
    query = ElasticQuery(Query.terms("code", disease.split(',')))
    elastic = Search(query, idx=ElasticSettings.idx('DISEASE', 'DISEASE'), size=5)
    res = elastic.search()
    if res.hits_total == 0:
        messages.error(request, 'Disease(s) '+disease+' not found.')
    elif res.hits_total < 9:
        names = ', '.join([getattr(doc, 'name') for doc in res.docs])
        context = {'features': res.docs, 'title': names}
        return render(request, 'disease/index.html', context, content_type='text/html')
    raise Http404()
def region_page(request, region):
    ''' Renders a region page for one or more comma-separated region ids.
    Raises Http404 when no region is given, nothing is found, or too many
    (>= 9) hits are returned. '''
    if region is None:
        messages.error(request, 'No region given.')
        raise Http404()
    lookup = ElasticQuery(Query.ids(region.split(',')))
    result = Search(lookup, idx=ElasticSettings.idx('REGION', 'REGION'), size=5).search()
    if result.hits_total == 0:
        messages.error(request, 'Region(s) '+region+' not found.')
    elif result.hits_total < 9:
        title = ', '.join([getattr(doc, 'region_name') for doc in result.docs])
        padded = [Region.pad_region_doc(doc) for doc in result.docs]
        return render(request, 'region/index.html', {'features': padded, 'title': title},
                      content_type='text/html')
    raise Http404()
def get_marker(cls, request, marker, context):
    ''' Populate *context* with marker, immunochip and rs_merge docs for the
    given marker name, plus criteria tags, title and JBrowse tracks.
    Raises Http404 when no marker is given or nothing is found. '''
    if marker is None:
        messages.error(request, 'No marker name given.')
        raise Http404()
    # rs-prefixed identifiers are searched on id/rscurrent, otherwise on name
    fields = ['id', 'rscurrent'] if marker.startswith("rs") else ['name']
    # bucket the hits by document type (_type), keeping up to 15 per type
    sub_agg = Agg('top_hits', 'top_hits', {"size": 15})
    aggs = Aggs(Agg("types", "terms", {"field": "_type"}, sub_agg=sub_agg))
    query = ElasticQuery(Query.query_string(marker, fields=fields))
    elastic = Search(search_query=query, idx=ElasticSettings.idx('MARKER'), aggs=aggs, size=0)
    res = elastic.search()
    title = ''
    if res.hits_total >= 1:
        types = getattr(res.aggs['types'], 'buckets')
        marker_doc = None
        ic_docs = []
        history_docs = []
        # sort the aggregated hits into marker / immunochip / rs_merge documents;
        # the title is taken from the last named doc seen
        for doc_type in types:
            hits = doc_type['top_hits']['hits']['hits']
            for hit in hits:
                doc = PydginDocument.factory(hit)
                if doc.get_name() is not None:
                    title = doc.get_name()
                if 'marker' == doc_type['key']:
                    marker_doc = doc
                elif 'immunochip' == doc_type['key']:
                    ic_docs.append(doc)
                elif 'rs_merge' == doc_type['key']:
                    history_docs.append(doc)
        if marker_doc is not None:
            marker_doc.marker_build = _get_marker_build(ElasticSettings.idx('MARKER'))
            criteria_disease_tags = MarkerView.criteria_disease_tags(request, [marker])
            context['criteria'] = criteria_disease_tags
            context['features'] = [marker_doc]
            context['old_dbsnp_docs'] = _get_old_dbsnps(marker)
            context['ic'] = ic_docs
            context['history'] = history_docs
            context['title'] = title
            context['jbrowse_tracks'] = "PydginRegions%2Cdbsnp146%2CEnsemblGenes"
            return context
    elif res.hits_total == 0:
        messages.error(request, 'Marker '+marker+' not found.')
        raise Http404()
def gene_page(request):
    """ Renders a gene page for one or more comma-separated gene ids.
    Raises Http404 when no gene name is given, nothing is found, or too
    many (>= 9) hits are returned. """
    gene = request.GET.get("g")
    if gene is None:
        messages.error(request, "No gene name given.")
        raise Http404()
    lookup = ElasticQuery(Query.ids(gene.split(",")))
    result = Search(lookup, idx=ElasticSettings.idx("GENE", "GENE"), size=5).search()
    if result.hits_total == 0:
        messages.error(request, "Gene(s) " + gene + " not found.")
    elif result.hits_total < 9:
        title = ", ".join([getattr(doc, "symbol") for doc in result.docs])
        context = {"genes": result.docs,
                   "title": title,
                   "criteria": get_criteria(result.docs, "gene", "symbol", "GENE")}
        return render(request, "gene/gene.html", context, content_type="text/html")
    raise Http404()
def get_overlapping_hits(self, build, seqid, start, end):
    ''' Return study-hit docs (tier <= 2) whose build_info overlaps the
    interval [start, end] on the given seqid for the given build. '''
    # hit fully spans the interval: build_info.start <= start AND build_info.end >= end
    query_bool = BoolQuery(must_arr=[RangeQuery("build_info.start", lte=start),
                                     RangeQuery("build_info.end", gte=end)])
    # OR either endpoint of the hit falls inside [start, end]
    or_filter = OrFilter(RangeQuery("build_info.start", gte=start, lte=end))
    or_filter.extend(RangeQuery("build_info.end", gte=start, lte=end)) \
        .extend(query_bool)
    # restrict the overlap test to the requested seqid and build
    range_query = FilteredQuery(BoolQuery(must_arr=[Query.term("build_info.seqid", seqid),
                                                    Query.term("build_info.build", build)]),
                                or_filter)
    query = ElasticQuery.filtered_bool(
        Query.nested("build_info", range_query),
        BoolQuery(must_arr=[RangeQuery("tier", lte=2)]),
        # sources=["disease", "marker", "chr_band", "tier", "build_info", "disease_locus"]
    )
    elastic = Search(search_query=query, idx=ElasticSettings.idx('REGION', 'STUDY_HITS'))
    return elastic.search().docs
def fetch_disease_locus(cls, hits_docs):
    ''' Fetch the disease_locus documents referenced by the given hit docs.

    Each distinct disease_locus id is looked up once; a critical message is
    logged for any referenced doc that cannot be found (or is ambiguous).
    Returns the list of disease_locus docs. '''
    region_index = ElasticSettings.idx('REGIONS', idx_type='DISEASE_LOCUS')
    disease_loc_docs = []
    locus_id_set = set()
    for doc in hits_docs.docs:
        locus_id = getattr(doc, 'disease_locus')
        if locus_id in locus_id_set:
            continue  # this locus has already been fetched
        locus_id_set.add(locus_id)
        query = ElasticQuery(Query.ids([locus_id]))
        elastic = Search(query, idx=region_index)
        disease_loc = elastic.search().docs
        if len(disease_loc) == 1:
            disease_loc_docs.append(disease_loc[0])
        else:
            # fixed typo in the log message ('for it' -> 'for id')
            logger.critical('disease_locus doc not found for id ' + locus_id)
    return disease_loc_docs
def get_region(cls, request, region, context):
    ''' Populate *context* with padded region docs, criteria disease tags
    and a title for one or more comma-separated region ids.
    Raises Http404 when no region is given, nothing is found, or too many
    (>= 9) hits are returned. '''
    if region is None:
        messages.error(request, 'No region given.')
        raise Http404()
    lookup = ElasticQuery(Query.ids(region.split(',')))
    result = Search(lookup, idx=ElasticSettings.idx('REGION', 'REGION'), size=5).search()
    if result.hits_total == 0:
        messages.error(request, 'Region(s) '+region+' not found.')
    elif result.hits_total < 9:
        context['features'] = [Region.pad_region_doc(doc) for doc in result.docs]
        feature_ids = [doc.doc_id() for doc in result.docs]
        context['criteria'] = RegionView.criteria_disease_tags(request, feature_ids)
        context['title'] = ', '.join([getattr(doc, 'region_name') for doc in result.docs])
        return context
    raise Http404()
def get_pmids(resp_json):
    ''' Collect the unique PMIDs from a scan/scroll response and assert that
    every one exists in the publication index.
    NOTE(review): references self from the enclosing scope -- presumably a
    closure inside a test method; verify against the caller. '''
    pmids = []
    for hit in resp_json['hits']['hits']:
        doc = Document(hit)
        pmids.append(getattr(doc, "pmid"))
    # de-duplicate the collected PMIDs
    pmids = list(set(pmids))
    # size is doubled so all matching publication docs can be retrieved below
    elastic = Search(search_query=ElasticQuery(BoolQuery(b_filter=Filter(Query.ids(pmids))),
                                               sources=['pmid']),
                     idx=ElasticSettings.idx('PUBLICATION'), size=len(pmids)*2)
    if len(pmids) != elastic.get_count()['count']:
        # check for differences in pmids -- report exactly which are missing
        docs = elastic.search().docs
        pmids_in_pub_idx = [getattr(doc, 'pmid') for doc in docs]
        pmids_diff = list(set(pmids) - set(pmids_in_pub_idx))
        self.assertListEqual([], pmids_diff, "PMIDs list empty ("+str(pmids_diff)+")")
    self.assertEqual(len(pmids), elastic.get_count()['count'], 'Count for region publications')
def get_comparison_results(cls, criteria_idx, criteria_idx_type, old_criteria_results,
                           primary_id_type, criteria_sub_class):
    ''' Compare each doc from the new criteria index against the matching
    old criteria result and return the non-empty comparison results. '''
    id_list = list(old_criteria_results.keys())
    search = Search(ElasticQuery(Query.ids(id_list)), idx=criteria_idx,
                    idx_type=criteria_idx_type, size=len(old_criteria_results))
    criteria_docs = search.search().docs
    print('Number of docs from new criteria elastic index for criteria type ' +
          criteria_idx_type + ' ' + str(len(criteria_docs)))
    comparison_result_list = []
    for counter, criteria_doc in enumerate(criteria_docs, start=1):
        print('==========' + str(counter) + '==========')
        print(criteria_doc.__dict__)
        current_id = getattr(criteria_doc, 'qid')
        comparison_result = cls.compare_dicts(criteria_doc.__dict__,
                                              old_criteria_results[current_id],
                                              primary_id_type, criteria_sub_class,
                                              criteria_idx_type)
        if len(comparison_result) > 0:
            comparison_result_list.append(comparison_result)
    return comparison_result_list
def get_rdm_docs(cls, idx, idx_type, qbool=Query.match_all(), sources=None, size=1):
    ''' Get random doc(s) from the indices.

    The previous signature used a mutable default argument (sources=[]);
    replaced with a None sentinel -- behaviour is unchanged for callers.
    Retries (recursively) when the search raises IndexError. '''
    if sources is None:
        sources = []
    # a fresh random seed reshuffles the ranking on every call
    score_function1 = ScoreFunction.create_score_function(
        'random_score', seed=random.randint(0, 1000000))
    search_query = ElasticQuery(FunctionScoreQuery(qbool, [score_function1], boost_mode='replace'),
                                sources=sources)
    elastic = Search(search_query=search_query, size=size, idx=idx, idx_type=idx_type)
    try:
        return elastic.search().docs
    except IndexError:
        # NOTE(review): unbounded retry -- recurses until a search succeeds
        return cls.get_rdm_docs(idx, idx_type, qbool, sources, size)
def check_hits(resp_json):
    ''' Check a scan/scroll page of marker docs: for docs that share an
    internal_id, compare their highest-build positions and log any
    position mismatches, cross-checking the marker index and DBSNP.
    NOTE(review): uses self, internal_id and logger from the enclosing
    scope -- presumably a closure inside a test method; verify caller. '''
    self.assertTrue('hits' in resp_json, 'scan and scroll hits')
    self.assertGreaterEqual(len(resp_json['hits']['hits']), 1)
    docs = [Document(hit) for hit in resp_json['hits']['hits']]
    for doc1 in docs:
        doc_internal_id = getattr(doc1, "internal_id")
        if doc_internal_id in internal_id:
            # seen this internal_id before: compare against all earlier docs
            pos1 = self._get_highest_build(doc1)
            for doc2 in internal_id[doc_internal_id]:
                pos2 = self._get_highest_build(doc2)
                if pos2['position'] != pos1['position']:
                    msg = ("DIFFERENT POSITIONS ID: "+str(doc_internal_id)+":\t" +
                           str(getattr(doc1, "name"))+": "+pos1['position']+" ("+doc1.doc_id()+")\t" +
                           str(getattr(doc2, "name"))+": "+pos2['position']+" ("+doc2.doc_id()+")\t")
                    try:
                        # look up markers at either conflicting position on the same seqid
                        terms_filter = TermsFilter.get_terms_filter("start", [pos1['position'],
                                                                              pos2['position']])
                        query = ElasticQuery.filtered(Query.term("seqid", pos1['seqid']), terms_filter)
                        elastic = Search(query, idx=ElasticSettings.idx('MARKER', 'MARKER'))
                        docs_by_pos = elastic.search().docs
                        found = False
                        for d in docs_by_pos:
                            msg += getattr(d, "id")+": "+str(getattr(d, "start"))+"\t"
                            if getattr(d, "id") == 'rs'+str(doc_internal_id):
                                found = True
                        if not found:
                            # rs id not at either position; note whether DBSNP knows it
                            msg += 'rs'+str(doc_internal_id)
                            if self._rs_exists('rs'+str(doc_internal_id)):
                                msg += ' EXISTS IN DBSNP\t'
                            else:
                                msg += ' NOT IN DBSNP\t'
                        logger.error(msg)
                    except KeyError:
                        # position/seqid key missing -- still log what was gathered
                        logger.error(msg)
            internal_id[doc_internal_id].append(doc1)
        else:
            internal_id[doc_internal_id] = [doc1]
def test_nested_query(self):
    ''' Test nested query with aggregations. '''
    # nested() must reject a non-Query second argument
    self.assertRaises(QueryError, Query.nested, 'build_info', 'xxxx')
    qnested = ElasticQuery(Query.nested('build_info', Query.term("build_info.build", "38")))
    # agg chain: nested(build_info) -> terms(seqid) -> reverse_nested -> terms(disease)
    diseases_by_seqid = Agg('diseases_by_seqid', 'terms', {"size": 0, "field": "disease"})
    disease_hits = Agg('disease_hits', 'reverse_nested', {}, sub_agg=diseases_by_seqid)
    seq_hits = Agg('seq_hits', 'terms', {'field': 'build_info.seqid', 'size': 0}, sub_agg=disease_hits)
    build_info = Agg('build_info', 'nested', {"path": 'build_info'}, sub_agg=[seq_hits])
    elastic = Search(qnested, idx=IDX['JSON_NESTED']['indexName'], aggs=Aggs(build_info))
    res = elastic.search()
    # returns just build 38 hits
    self.assertEqual(len(res.docs), 2)
    seq_hits = getattr(res.aggs['build_info'], 'seq_hits')['buckets']
    # two seq ids
    self.assertEqual(len(seq_hits), 2)
    for seq in seq_hits:
        disease_hits = seq['disease_hits']
        # one disease found on the sequence
        self.assertEqual(len(disease_hits['diseases_by_seqid']['buckets']), 1)
def show_es_gene_section(gene_symbol=None, seqid=None, start_pos=None, end_pos=None):
    ''' Template inclusion tag to render a gene section given a chado gene feature. '''
    # normalise 'chrN' -> 'N'
    seqid = str(seqid).replace('chr', '')
    if gene_symbol is not None:
        ''' gene symbol query'''
        query = ElasticQuery.query_match("symbol", gene_symbol)
    elif end_pos is None:
        ''' start and end are same, range query for snp'''
        # single position: match genes spanning start_pos (start <= pos <= stop)
        query_bool = BoolQuery(must_arr=[Query.match("chromosome", seqid),
                                         RangeQuery("start", lte=start_pos),
                                         RangeQuery("stop", gte=start_pos)])
        query = ElasticQuery.bool(query_bool)
    else:
        ''' start and end are same, range query for snp'''
        # NOTE(review): the bare string above looks copy-pasted from the branch
        # before; this branch actually handles a start/end interval and matches
        # genes contained within it (start >= start_pos AND stop <= end_pos)
        query_bool = BoolQuery(must_arr=[Query.match("chromosome", seqid),
                                         RangeQuery("start", gte=start_pos),
                                         RangeQuery("stop", lte=end_pos)])
        query = ElasticQuery.bool(query_bool)
    elastic = Search(query, idx=ElasticSettings.idx(name='GENE'))
    return {'es_genes': elastic.search().docs}
def filter_queryset(self, request, queryset, view):
    ''' Override this method to request just the documents required from Rserve. '''
    try:
        # restrict incoming GET parameters to the view's declared filter fields
        filterable = getattr(view, 'filter_fields', [])
        filters = dict([(k, v) for k, v in request.GET.items() if k in filterable])
        mid1 = filters.get('m1', 'rs2476601')  # first marker id
        dataset = filters.get('dataset', 'EUR').replace('-', '')
        mid2 = filters.get("m2")  # optional second marker id
        window_size = int(filters.get('window_size', 1000000))
        dprime = filters.get("dprime", 0.)
        rsq = filters.get("rsq", 0.8)
        maf = filters.get("maf", False)
        if maf:
            # coerce any truthy query-string value to a plain bool
            maf = True
        build_version = filters.get("build", 'GRCh38').lower()
        pos = filters.get("pos", False)
        if pos:
            pos = True
        # look up the chromosome (seqid) of the first marker
        query = ElasticQuery(BoolQuery(must_arr=[Query.term("id", mid1)]), sources=['seqid', 'start'])
        elastic = Search(search_query=query, idx=ElasticSettings.idx('MARKER', 'MARKER'), size=1)
        doc = elastic.search().docs[0]
        seqid = getattr(doc, 'seqid')
        # run the LD computation on the configured Rserve instance
        rserve = getattr(settings, 'RSERVE')
        conn = pyRserve.connect(host=rserve.get('HOST'), port=rserve.get('PORT'))
        ld_str = conn.r.ld_run(dataset, seqid, mid1, marker2=mid2,
                               window_size=window_size, dprime=dprime,
                               rsq=rsq, maf=maf, position=pos, build_version=build_version)
        # rename the R-style result keys to JSON-friendly names
        ld_str = ld_str.replace('D.prime', 'dprime').replace('R.squared', 'rsquared')
        conn.close()
        return [ElasticObject(initial=json.loads(str(ld_str)))]
    except (TypeError, ValueError, IndexError, ConnectionError):
        # best effort: any lookup/parse/connection failure yields an empty result
        return [ElasticObject(initial={'ld': None})]
def _search_engine(query_dict, user_filters, user):
    ''' Carry out a search and add results to the context object. '''
    user_query = query_dict.get("query")
    query = _gene_lookup(user_query)
    # fields returned in _source for the top-hits, grouped by feature type
    source_filter = [
        'symbol', 'synonyms', "dbxrefs.*", 'biotype', 'description',  # gene
        'id', 'rscurrent', 'rshigh',                                  # marker
        'journal', 'title', 'tags.disease',                           # publication
        'name', 'code',                                               # disease
        'study_id', 'study_name',                                     # study
        'region_name', 'marker']                                      # regions
    if re.compile(r'^[0-9 ]+$').findall(query):
        source_filter.append('pmid')  # publication - possible PMID(s)
    search_fields = []
    maxsize = 20
    if user_filters.getlist("maxsize"):
        maxsize = int(user_filters.get("maxsize"))
    # build search_fields from user input filter fields
    for it in user_filters.items():
        if len(it) == 2:
            if it[0] == 'query':
                continue
            # filter values look like 'x:field' or 'x:parent:child'
            parts = it[1].split(":")
            if len(parts) == 3:
                search_fields.append(parts[1]+"."+parts[2])
            elif len(parts) == 2:
                search_fields.append(parts[1])
    if len(search_fields) == 0:
        # no user field filters: search across all source fields plus extras
        search_fields = list(source_filter)
        search_fields.extend(['abstract', 'authors.name',  # publication
                              'authors', 'pmids',          # study
                              'markers', 'genes'])         # study/region
    source_filter.extend(['date', 'pmid', 'build_id', 'ref', 'alt',
                          'chr_band', 'disease_locus', 'disease_loci', 'region_id'])
    idx_name = query_dict.get("idx")
    idx_dict = ElasticSettings.search_props(idx_name, user)
    query_filters = _get_query_filters(user_filters, user)
    highlight = Highlight(search_fields, pre_tags="<strong>", post_tags="</strong>",
                          number_of_fragments=0)
    # top-hits per index, plus biotype and category (type) facets
    sub_agg = Agg('idx_top_hits', 'top_hits', {"size": maxsize, "_source": source_filter,
                                               "highlight": highlight.highlight['highlight']})
    aggs = Aggs([Agg("idxs", "terms", {"field": "_index"}, sub_agg=sub_agg),
                 Agg("biotypes", "terms", {"field": "biotype", "size": 0}),
                 Agg("categories", "terms", {"field": "_type", "size": 0})])
    # create score functions
    score_fns = _build_score_functions(idx_dict)
    equery = BoolQuery(must_arr=Query.query_string(query, fields=search_fields),
                       should_arr=_auth_arr(user), b_filter=query_filters,
                       minimum_should_match=1)
    search_query = ElasticQuery(FunctionScoreQuery(equery, score_fns, boost_mode='replace'))
    elastic = Search(search_query=search_query, aggs=aggs, size=0,
                     idx=idx_dict['idx'], idx_type=idx_dict['idx_type'])
    result = elastic.search()
    mappings = elastic.get_mapping()
    _update_mapping_filters(mappings, result.aggs)
    _update_biotypes(user_filters, result)
    return {'data': _top_hits(result), 'aggs': result.aggs,
            'query': user_query, 'idx_name': idx_name,
            'fields': search_fields, 'mappings': mappings,
            'hits_total': result.hits_total,
            'maxsize': maxsize, 'took': result.took}
def fetch_from_elastic(cls, idx, idx_type, feature_ids):
    ''' Lookup pydgin elastic for the given feature ids and return the docs.

    The result size now tracks len(feature_ids); the previous hard-coded
    size=5 silently truncated lookups of more than five ids. '''
    query = ElasticQuery(Query.ids(feature_ids))
    elastic = Search(query, idx=ElasticSettings.idx(idx, idx_type=idx_type),
                     size=len(feature_ids))
    docs = elastic.search().docs
    return docs
def test_string_query_with_wildcard_and_highlight(self):
    ''' Wildcard string query ("rs*") with highlighting retrieves multiple docs. '''
    highlight = Highlight("id", pre_tags="<strong>", post_tags="</strong>")
    wildcard_query = ElasticQuery.query_string("rs*", fields=["id"], highlight=highlight)
    result_docs = Search(wildcard_query, idx=ElasticSettings.idx('DEFAULT'), size=5).search().docs
    self.assertTrue(len(result_docs) > 1, "Elastic string query retrieved marker (rs*)")
def test_elastic_group_name(self):
    '''
    Testing the workflow defined in:
    https://killin.cimr.cam.ac.uk/nextgensite/2015/08/05/region-authorization/
    Testing various elastic queries

    idx doc:
    "_source":{"attr": {"region_id": "803", "group_name": "[\"DIL\"]", "Name": "4q27"},
    "seqid": "chr4", "source": "immunobase", "type": "region", "score": ".",
    "strand": ".", "phase": ".", "start": 122061159, "end": 122684373}
    idx_query:
    Private(in given group) OR Public
    -d '{"query":{"filtered":{"filter":{"bool": {
        "should": [ {"terms": {"group_name":["dil"]}},
                    { "missing": { "field": "group_name" }} ] }}}}}'
    Private(in given group):
    -d '{"query":{"filtered":{"filter":{"terms":{"group_name":["dil"]}}}}}'
    Public:
    -d {'query': {'filtered': {'filter': {'missing': {'field': 'group_name'}},
    - 'query': {'term': {'match_all': '{}'}}}}}
    '''
    # get the groups for the given user
    response = self.client.post('/accounts/login/', {
        'username': '******',
        'password': '******'
    })
    self.assertTrue(response.status_code, "200")
    logged_in_user = User.objects.get(
        id=self.client.session['_auth_user_id'])
    if logged_in_user and logged_in_user.is_authenticated():
        user_groups = get_user_groups(logged_in_user)
        self.assertTrue('READ' in user_groups, "user present in READ group")
        # make sure the user is not yet in DIL group
        self.assertFalse('DIL' in user_groups, "user not present in DIL group")
        group_names = get_user_groups(logged_in_user)
        if 'READ' in group_names:
            group_names.remove('READ')  # @IgnorePep8
        group_names = [x.lower() for x in group_names]
        self.assertTrue(len(group_names) == 0, "No group present")
        # Match all query, as there is no group we do a match all
        query = ElasticQuery(Query.match_all())
        expected_query_string = {"query": {"match_all": {}}}
        self.assertJSONEqual(json.dumps(query.query), json.dumps(expected_query_string),
                             "Query string matched")
        Search.index_refresh(self.index_name)
        elastic = Search(query, idx=self.index_name)
        docs = elastic.search().docs
        self.assertTrue(
            len(docs) == 12, "Elastic string query retrieved all public regions")
        # Filtered query for group names, add the user to DIL group and get the query string
        self.dil_group = Group.objects.create(name='DIL')
        logged_in_user.groups.add(self.dil_group)
        group_names = get_user_groups(logged_in_user)
        if 'READ' in group_names:
            group_names.remove('READ')  # @IgnorePep8
        group_names = [x.lower() for x in group_names]
        self.assertTrue(len(group_names) > 0, "More than 1 group present")
        self.assertTrue("dil" in group_names, "DIL group present")
        # retrieves all docs with missing field group_name - 11 docs
        terms_filter = TermsFilter.get_missing_terms_filter(
            "field", "attr.group_name")
        query = ElasticQuery.filtered(Query.match_all(), terms_filter)
        elastic = Search(query, idx=self.index_name)
        docs = elastic.search().docs
        self.assertTrue(
            len(docs) == 11, "Elastic string query retrieved all public regions")
        # build filtered boolean query to bring all public docs + private docs 11+1 = 12 docs
        query_bool = BoolQuery()
        query_bool.should(Query.missing_terms("field", "group_name")) \
                  .should(Query.terms("group_name", group_names).query_wrap())
        query = ElasticQuery.filtered_bool(Query.match_all(), query_bool)
        elastic = Search(query, idx=self.index_name)
        docs = elastic.search().docs
        self.assertTrue(
            len(docs) == 12, "Elastic string query retrieved both public + private regions")
        # private-only: terms filter on the user's group names - 1 doc
        terms_filter = TermsFilter.get_terms_filter("attr.group_name", group_names)
        query = ElasticQuery.filtered(Query.match_all(), terms_filter)
        elastic = Search(query, idx=self.index_name)
        docs = elastic.search().docs
        self.assertTrue(
            len(docs) == 1, "Elastic string query retrieved one private regions")
        self.assertEqual(docs[0].attr['Name'], "4q27", "type matched region")
        self.assertEqual(docs[0].attr['region_id'], "803", "type matched region")
        self.assertEqual(docs[0].attr['group_name'], "[\"DIL\"]", "type matched region")
def test_elastic_group_name(self):
    '''
    Testing the workflow defined in:
    https://killin.cimr.cam.ac.uk/nextgensite/2015/08/05/region-authorization/
    Testing various elastic queries

    idx doc:
    "_source":{"attr": {"region_id": "803", "group_name": "[\"DIL\"]", "Name": "4q27"},
    "seqid": "chr4", "source": "immunobase", "type": "region", "score": ".",
    "strand": ".", "phase": ".", "start": 122061159, "end": 122684373}
    idx_query:
    Private(in given group) OR Public
    -d '{"query":{"filtered":{"filter":{"bool": {
        "should": [ {"terms": {"group_name":["dil"]}},
                    { "missing": { "field": "group_name" }} ] }}}}}'
    Private(in given group):
    -d '{"query":{"filtered":{"filter":{"terms":{"group_name":["dil"]}}}}}'
    Public:
    -d {'query': {'filtered': {'filter': {'missing': {'field': 'group_name'}},
    - 'query': {'term': {'match_all': '{}'}}}}}

    NOTE(review): appears to duplicate another test_elastic_group_name in
    this file -- confirm and consider removing one of them.
    '''
    # get the groups for the given user
    response = self.client.post('/accounts/login/', {'username': '******', 'password': '******'})
    self.assertTrue(response.status_code, "200")
    logged_in_user = User.objects.get(id=self.client.session['_auth_user_id'])
    if logged_in_user and logged_in_user.is_authenticated():
        user_groups = get_user_groups(logged_in_user)
        self.assertTrue('READ' in user_groups, "user present in READ group")
        # make sure the user is not yet in DIL group
        self.assertFalse('DIL' in user_groups, "user not present in DIL group")
        group_names = get_user_groups(logged_in_user)
        if 'READ' in group_names :
            group_names.remove('READ')  # @IgnorePep8
        group_names = [x.lower() for x in group_names]
        self.assertTrue(len(group_names) == 0, "No group present")
        # Match all query, as there is no group we do a match all
        query = ElasticQuery(Query.match_all())
        expected_query_string = {"query": {"match_all": {}}}
        self.assertJSONEqual(json.dumps(query.query), json.dumps(expected_query_string),
                             "Query string matched")
        Search.index_refresh(self.index_name)
        elastic = Search(query, idx=self.index_name)
        docs = elastic.search().docs
        self.assertTrue(len(docs) == 12, "Elastic string query retrieved all public regions")
        # Filtered query for group names, add the user to DIL group and get the query string
        self.dil_group = Group.objects.create(name='DIL')
        logged_in_user.groups.add(self.dil_group)
        group_names = get_user_groups(logged_in_user)
        if 'READ' in group_names :
            group_names.remove('READ')  # @IgnorePep8
        group_names = [x.lower() for x in group_names]
        self.assertTrue(len(group_names) > 0, "More than 1 group present")
        self.assertTrue("dil" in group_names, "DIL group present")
        # retrieves all docs with missing field group_name - 11 docs
        terms_filter = TermsFilter.get_missing_terms_filter("field", "attr.group_name")
        query = ElasticQuery.filtered(Query.match_all(), terms_filter)
        elastic = Search(query, idx=self.index_name)
        docs = elastic.search().docs
        self.assertTrue(len(docs) == 11, "Elastic string query retrieved all public regions")
        # build filtered boolean query to bring all public docs + private docs 11+1 = 12 docs
        query_bool = BoolQuery()
        query_bool.should(Query.missing_terms("field", "group_name")) \
                  .should(Query.terms("group_name", group_names).query_wrap())
        query = ElasticQuery.filtered_bool(Query.match_all(), query_bool)
        elastic = Search(query, idx=self.index_name)
        docs = elastic.search().docs
        self.assertTrue(len(docs) == 12, "Elastic string query retrieved both public + private regions")
        # private-only: terms filter on the user's group names - 1 doc
        terms_filter = TermsFilter.get_terms_filter("attr.group_name", group_names)
        query = ElasticQuery.filtered(Query.match_all(), terms_filter)
        elastic = Search(query, idx=self.index_name)
        docs = elastic.search().docs
        self.assertTrue(len(docs) == 1, "Elastic string query retrieved one private regions")
        self.assertEqual(docs[0].attr['Name'], "4q27", "type matched region")
        self.assertEqual(docs[0].attr['region_id'], "803", "type matched region")
        self.assertEqual(docs[0].attr['group_name'], "[\"DIL\"]", "type matched region")