def _find_snp_position(snp_track, name):
    ''' Resolve a marker name to its genomic location.

    Looks the marker up in the main MARKER index when no snp track is given,
    otherwise in the per-group CP_STATS index derived from the track name.
    Returns a dict with chr/start/end/name, or an error dict when not found.
    '''
    def _hit_location(snp_result):
        ''' Build a location dict from the first hit, or None when no hits. '''
        hits = snp_result['hits']['hits']
        if len(hits) > 0:
            snp = hits[0]['_source']
            chrom = snp['seqid'].replace('chr', "")
            position = snp['start']
            return {'chr': chrom, 'start': (position-1), 'end': position, 'name': name}
        return None

    if snp_track is None:
        query = ElasticQuery.query_match("id", name)
        elastic = Search(query, idx=ElasticSettings.idx('MARKER'))
        location = _hit_location(elastic.get_json_response())
        if location is not None:
            return location
    else:
        # track names look like "<group>-<track>"
        mo = re.match(r"(.*)-(.*)", snp_track)
        (group, track) = mo.group(1, 2)
        try:
            snp_track_idx = ElasticSettings.idx('CP_STATS_'+group.upper(), snp_track.upper())
        except SettingsError:
            # fall back to the group index with the track as the doc type
            snp_track_idx = ElasticSettings.idx('CP_STATS_'+group.upper())+"/"+track
        query = ElasticQuery.query_match("name", name)
        elastic = Search(query, idx=snp_track_idx)
        location = _hit_location(elastic.get_json_response())
        if location is not None:
            return location
    return {'error': 'Marker '+name+' does not exist in the currently selected dataset'}
def show_es_gene_section(gene_symbol=None, seqid=None, start_pos=None, end_pos=None):
    ''' Template inclusion tag to render a gene section given a chado gene feature. '''
    seqid = str(seqid).replace('chr', '')
    if gene_symbol is not None:
        # gene symbol query
        query = ElasticQuery.query_match("symbol", gene_symbol)
    elif end_pos is None:
        # no end position: find genes spanning the single start position
        # (original comment wrongly said "start and end are same" here and below)
        query_bool = BoolQuery(must_arr=[Query.match("chromosome", seqid),
                                         RangeQuery("start", lte=start_pos),
                                         RangeQuery("stop", gte=start_pos)])
        query = ElasticQuery.bool(query_bool)
    else:
        # start and end differ: find genes contained within the range
        query_bool = BoolQuery(must_arr=[Query.match("chromosome", seqid),
                                         RangeQuery("start", gte=start_pos),
                                         RangeQuery("stop", lte=end_pos)])
        query = ElasticQuery.bool(query_bool)
    elastic = Search(query, idx=ElasticSettings.idx(name='GENE'))
    return {'es_genes': elastic.search().docs}
def filter_queryset(self, request, queryset, view):
    ''' Override this method to request just the documents required from elastic. '''
    q_size = view.paginator.get_limit(request)
    q_from = view.paginator.get_offset(request)

    filterable = getattr(view, 'filter_fields', [])
    # keep only the GET parameters that the view declares as filterable
    # (dict comprehension replaces dict([...]) from the original)
    filters = {k: v for k, v in request.GET.items() if k in filterable}

    search_filters = self._build_filters(filters=filters)
    if search_filters is not None:
        q = ElasticQuery.filtered(Query.match_all(), search_filters)
    else:
        q = ElasticQuery(Query.match_all())
    s = Search(search_query=q, idx=getattr(view, 'idx'), size=q_size, search_from=q_from)
    json_results = s.get_json_response()

    # wrap each hit source as an ElasticObject, keeping the document id
    results = []
    for result in json_results['hits']['hits']:
        new_obj = ElasticObject(initial=result['_source'])
        new_obj.uuid = result['_id']
        results.append(new_obj)
    # expose the total hit count for the paginator
    view.es_count = json_results['hits']['total']
    return results
def show_es_gene_section(gene_symbol=None, seqid=None, start_pos=None, end_pos=None):
    ''' Template inclusion tag to render a gene section given a chado gene feature. '''
    # normalise the sequence id to carry a 'chr' prefix; replaces a no-op
    # "seqid = seqid" branch (the "is not None" test was redundant since
    # isinstance(None, str) is already False)
    if not (isinstance(seqid, str) and seqid.startswith("chr")):
        seqid = 'chr' + str(seqid)
    if gene_symbol is not None:
        # gene symbol query
        query = ElasticQuery.query_match("gene_symbol", gene_symbol)
    elif end_pos is None:
        # no end position: find genes spanning the single start position
        query_bool = BoolQuery(must_arr=[Query.match("seqid", seqid),
                                         RangeQuery("featureloc.start", lte=start_pos),
                                         RangeQuery("featureloc.end", gte=start_pos)])
        query = ElasticQuery.bool(query_bool)
    else:
        # start and end differ: find genes contained within the range
        query_bool = BoolQuery(must_arr=[Query.match("seqid", seqid),
                                         RangeQuery("featureloc.start", gte=start_pos),
                                         RangeQuery("featureloc.end", lte=end_pos)])
        query = ElasticQuery.bool(query_bool)
    elastic = Search(query, idx=ElasticSettings.idx(name='GENE'))
    return {'es_genes': elastic.search().docs}
def test_term_query(self):
    ''' Test building and running a match query. '''
    idx = ElasticSettings.idx('DEFAULT')

    # a term query on a unique id should return exactly one marker
    search = Search(ElasticQuery(Query.term("id", "rs2476601")), idx=idx)
    self.assertTrue(len(search.search().docs) == 1,
                    "Elastic string query retrieved marker (rs2476601)")

    # a boosted term query on the chromosome should return many markers
    search = Search(ElasticQuery(Query.term("seqid", "1", boost=3.0)), idx=idx)
    self.assertTrue(len(search.search().docs) > 1,
                    "Elastic string query retrieved markers on chr1")
def test_query_ids(self):
    ''' Test by query ids. '''
    idx = ElasticSettings.idx('DEFAULT')

    # query by a list of document ids
    hits = Search(ElasticQuery(Query.ids(['1', '2'])), idx=idx, size=5).search().docs
    self.assertTrue(len(hits) == 2, "Elastic string query retrieved marker (rs*)")

    # query a single id restricted to the document's type
    idx_type = hits[0].type()
    hits = Search(ElasticQuery(Query.ids('2', types=idx_type)), idx=idx, size=5).search().docs
    self.assertTrue(len(hits) == 1, "Elastic string query retrieved marker (rs*)")
def test_update_doc(self):
    ''' Update with a partial document. '''
    idx = IDX['MARKER']['indexName']
    docs = Search(ElasticQuery(Query.term("id", "rs2476601"), sources=['id']), idx=idx).search().docs
    self.assertEqual(len(docs), 1, "rs2476601 document")

    # partial update of the start/end fields
    update_field = {"doc": {"start": 100, "end": 200}}
    Update.update_doc(docs[0], update_field)
    # reuse idx rather than re-reading IDX (was duplicated in the original)
    Search.index_refresh(idx)

    docs = Search(ElasticQuery(Query.term("id", "rs2476601")), idx=idx).search().docs
    self.assertEqual(len(docs), 1, "rs2476601 document")
    # assertEquals is a deprecated unittest alias; use assertEqual
    self.assertEqual(getattr(docs[0], 'start'), 100, "rs2476601 start")
    self.assertEqual(getattr(docs[0], 'end'), 200, "rs2476601 end")
def test_string_query(self):
    ''' Test building and running a string query. '''
    search = Search(ElasticQuery.query_string("rs2476601", fields=["id"]),
                    idx=ElasticSettings.idx('DEFAULT'))
    result = search.search()
    self.assertTrue(len(result.docs) == 1, "Elastic string query retrieved marker (rs2476601)")
    # unknown keyword arguments should be rejected
    self.assertRaises(QueryError, ElasticQuery.query_string, "rs2476601", fieldssss=["id"])
def gene_mgi_parse(cls, gene_pubs, idx):
    ''' Parse Ensembl and MGI data from JAX. '''
    # build an ensembl-id -> MGI-accession map from the tab separated input
    orthogenes_mgi = {}
    for line in gene_pubs:
        cols = line.split('\t')
        if 'MGI:' not in cols[0]:
            raise PipelineError('MGI not found '+cols[0])
        if 'ENSMUSG' not in cols[5]:
            raise PipelineError('ENSMUSG not found '+cols[5])
        orthogenes_mgi[cols[5]] = cols[0].replace('MGI:', '')

    orthogene_keys = list(orthogenes_mgi.keys())
    chunk_size = 450
    # update matching gene documents in chunks
    for i in range(0, len(orthogene_keys), chunk_size):
        chunk_gene_keys = orthogene_keys[i:i+chunk_size]
        query = ElasticQuery.filtered(
            Query.match_all(),
            TermsFilter.get_terms_filter("dbxrefs.orthologs.mmusculus.ensembl", chunk_gene_keys))
        docs = Search(query, idx=idx, size=chunk_size).search().docs

        json_parts = []
        for doc in docs:
            ens_id = doc.doc_id()
            idx_type = doc.type()
            mm = getattr(doc, 'dbxrefs')['orthologs']['mmusculus']
            mm['MGI'] = orthogenes_mgi[mm['ensembl']]
            dbxrefs = {"dbxrefs": {'orthologs': {"mmusculus": mm}}}
            json_parts.append(json.dumps({"update": {"_id": ens_id, "_type": idx_type,
                                                     "_index": idx, "_retry_on_conflict": 3}}))
            json_parts.append(json.dumps({'doc': dbxrefs}))
        if json_parts:
            # NDJSON bulk body; join + trailing newline matches the original output
            Loader().bulk_load(idx, idx_type, '\n'.join(json_parts) + '\n')
def test_bulk(self):
    ''' Test the Bulk.load(). '''
    self.set_up()
    idx = IDX['MARKER']['indexName']
    elastic = Search(ElasticQuery(Query.match_all()), idx=idx)
    hits_total1 = elastic.get_count()['count']

    json_data = '{"index": {"_index": "%s", "_type": "%s"}}\n' % \
        (idx, 'marker')
    json_data += json.dumps({"alt": "G", "start": 946, "seqid": "1", "filter": ".",
                             "ref": "A", "id": "rsXXXXX", "qual": ".", "info": "RS=XXXXX"})
    resp = Bulk.load(idx, '', json_data)
    # assertNotEquals/assertEquals are deprecated unittest aliases
    self.assertNotEqual(resp.status_code, 200)

    # note: needs a trailing line return to work
    Bulk.load(idx, '', json_data + '\n')
    Search.index_refresh(idx)
    hits_total2 = elastic.get_count()['count']
    self.assertEqual(hits_total2, hits_total1+1, "contains documents")

    # produce errors updating doc id that doesn't exist
    json_data += '{"delete": {"_index": "%s", "_type": "%s", "_id": "%s"}}\n' % \
        (idx, 'marker', 'XYZ')
    json_data += '{"update": {"_index": "%s", "_type": "%s", "_id": "%s"}}\n' % \
        (idx, 'marker', 'XYZ')
    json_data += '{"doc": {"start": 100, "end": 200}}\n'
    resp = Bulk.load(idx, '', json_data)
    self.assertTrue('errors' in resp.json() and resp.json()['errors'])
def test_missing_terms_filtered_query(self):
    ''' Test filtered query with a missing terms filter. '''
    missing_filter = TermsFilter.get_missing_terms_filter("field", "group_name")
    search = Search(ElasticQuery.filtered(Query.match_all(), missing_filter),
                    idx=ElasticSettings.idx('DEFAULT'))
    hits = search.search().docs
    self.assertTrue(len(hits) == 3, "Elastic string query retrieved all public docs")
def _check_gene_history(cls, gene_sets, config):
    '''find a way to handle this better'''
    section = config['GENE_HISTORY']
    newgene_ids = {}
    discontinued = []

    def process_hits(resp_json):
        # split history records into replaced ids and fully discontinued ids
        for doc in [Document(hit) for hit in resp_json['hits']['hits']]:
            old_id = str(getattr(doc, 'discontinued_geneid'))
            new_id = getattr(doc, 'geneid')
            if new_id is None:
                discontinued.append(old_id)
            else:
                newgene_ids[old_id] = str(new_id)

    query = ElasticQuery.filtered(Query.match_all(),
                                  TermsFilter.get_terms_filter("discontinued_geneid", gene_sets),
                                  sources=['geneid', 'discontinued_geneid'])
    ScanAndScroll.scan_and_scroll(section['index'], idx_type=section['index_type'],
                                  call_fun=process_hits, query=query)
    return (newgene_ids, discontinued)
def _ensembl_entrez_lookup(cls, ensembl_gene_sets, section):
    ''' Get an ensembl:entrez id dictionary. '''
    terms = TermsFilter.get_terms_filter("dbxrefs.ensembl", ensembl_gene_sets)
    query = ElasticQuery.filtered(Query.match_all(), terms,
                                  sources=['dbxrefs.ensembl', 'dbxrefs.entrez'])
    docs = Search(query, idx=section['index'], size=len(ensembl_gene_sets)).search().docs
    lookup = {}
    for doc in docs:
        lookup[doc.doc_id()] = getattr(doc, 'dbxrefs')['entrez']
    return lookup
def check_hits(resp_json):
    ''' Compare marker positions between the current page of docs and dbSNP,
    logging any marker whose positions disagree by more than 1bp.
    NOTE(review): closure — relies on `self` and `logger` from an enclosing
    scope; confirm this is defined inside a method. '''
    # map rsid -> document for every hit on this page
    rsids = {}
    docs = [Document(hit) for hit in resp_json['hits']['hits']]
    for doc in docs:
        rsid = getattr(doc, "id")
        if rsid is not None:
            rsids[rsid] = doc
    rsids_keys = list(rsids.keys())
    # fetch the corresponding MARKER-index docs in one terms query
    terms_filter = TermsFilter.get_terms_filter("id", rsids_keys)
    query = ElasticQuery.filtered(Query.match_all(), terms_filter)
    elastic = Search(query, idx=ElasticSettings.idx('MARKER', 'MARKER'), size=len(rsids_keys))
    docs_by_rsid = elastic.search().docs
    for doc in docs_by_rsid:
        info = getattr(doc, "info")
        # only compare single-nucleotide variants
        if 'VC=SNV' not in info:
            continue
        rsid = getattr(doc, "id")
        ic_doc = rsids[rsid]
        pos1 = getattr(doc, "start")
        # position from the highest build recorded on the page's doc
        pos2 = self._get_highest_build(ic_doc)['position']
        if abs(int(pos1) - int(pos2)) > 1:
            # skip docs flagged is_par and docs whose allele_a is D/I
            is_par = getattr(ic_doc, 'is_par')
            allele_a = getattr(ic_doc, 'allele_a')
            if is_par is None and not (allele_a == 'D' or allele_a == 'I'):
                msg = ("CHECK IC/DBSNP POSITIONS:: "+getattr(ic_doc, 'name') +
                       ' '+str(pos2)+" "+rsid+' '+str(pos1))
                # ' ('+ic_doc.doc_id()+' '+json.dumps(getattr(ic_doc, 'build_info'))+')'
                # append any other marker recorded at position pos2 on the same seqid
                query = ElasticQuery.filtered(Query.term("seqid", getattr(doc, 'seqid')),
                                              Filter(Query.term("start", pos2)))
                elastic = Search(query, idx=ElasticSettings.idx('MARKER', 'MARKER'))
                docs_by_pos = elastic.search().docs
                if len(docs_by_pos) > 0:
                    for d in docs_by_pos:
                        msg += " ("+getattr(d, "id")+":"+str(getattr(d, "start"))+")"
                # append rs-merge history entries (rslow/rshigh) for this rsid
                query = ElasticQuery.filtered(Query.match_all(), Filter(Query.term("rslow", rsid)))
                elastic = Search(query, idx=ElasticSettings.idx('MARKER', 'HISTORY'))
                docs_by_pos = elastic.search().docs
                if len(docs_by_pos) > 0:
                    for d in docs_by_pos:
                        msg += " (rshigh:"+str(getattr(d, "rshigh")) + \
                               " build_id:"+str(getattr(d, "build_id"))+")"
                logger.error(msg)
def test_bool_filtered_query(self):
    ''' Test building and running a filtered boolean query. '''
    bool_query = BoolQuery(must_not_arr=[Query.term("seqid", 2)],
                           should_arr=[RangeQuery("start", gte=10050)])
    bool_query.must([Query.term("id", "rs768019142")])
    bool_query.should(RangeQuery("start", gte=10054))
    search = Search(ElasticQuery.filtered_bool(Query.match_all(), bool_query,
                                               sources=["id", "seqid"]),
                    idx=ElasticSettings.idx('DEFAULT'))
    self.assertTrue(search.search().hits_total == 1,
                    "Elastic filtered query retrieved marker (rs768019142)")
def test_sort_query(self):
    ''' Test sorting for a query. '''
    idx = ElasticSettings.idx('DEFAULT')
    match_all = ElasticQuery(Query.match_all())

    # sort expressed as a string
    docs = Search(match_all, idx=idx, qsort=Sort('start:asc,_score')).search().docs
    self._check_sort_order(docs)

    # sort expressed as a dictionary
    dict_sort = Sort({"sort": [{"start": {"order": "asc", "mode": "avg"}}]})
    docs = Search(match_all, idx=idx, qsort=dict_sort).search().docs
    self._check_sort_order(docs)

    # invalid sort definitions are rejected
    self.assertRaises(QueryError, Sort, 1)
def test_scan_and_scroll(self):
    ''' Test scan and scroll interface. '''
    def check_hits(resp_json):
        self.assertTrue('hits' in resp_json, 'scan and scroll hits')
        self.assertGreaterEqual(len(resp_json['hits']['hits']), 1)

    idx = ElasticSettings.idx('DEFAULT')
    # scroll over the whole index, then over a query subset
    ScanAndScroll.scan_and_scroll(idx, call_fun=check_hits)
    ScanAndScroll.scan_and_scroll(idx, call_fun=check_hits,
                                  query=ElasticQuery.query_string("rs2476601", fields=["id"]))
def test_and_filtered_query(self):
    ''' Test building and running a filtered query. '''
    bool_range = BoolQuery(must_arr=[RangeQuery("start", gte=1)])
    combined = AndFilter(bool_range)
    combined.extend(RangeQuery("start", gte=1))
    combined.extend(Query.term("seqid", 1))
    search = Search(ElasticQuery.filtered(Query.term("seqid", 1), combined),
                    idx=ElasticSettings.idx('DEFAULT'))
    self.assertTrue(search.search().hits_total >= 1,
                    "Elastic filtered query retrieved marker(s)")
def test_get_rdm_feature_id(self):
    ''' Test get random feature id. '''
    idx = IDX['GFF_GENERIC']['indexName']
    doc_id = ElasticUtils.get_rdm_feature_id(idx, IDX['GFF_GENERIC']['indexType'])
    self.assertTrue(isinstance(doc_id, str), 'Document id')
    # the random id should resolve back to exactly one document
    hits = Search(ElasticQuery(Query.ids(doc_id)), idx=idx).search().docs
    self.assertTrue(len(hits) == 1, 'Document retrieved')
def test_url_rotate(self):
    ''' Test the url rotates from http://xxx:9200 to correct url. '''
    marker_filter = Filter(Query.term("id", "rs768019142"))
    search = Search(ElasticQuery.filtered(Query.term("seqid", 1), marker_filter),
                    idx=ElasticSettings.idx('DEFAULT'))
    self.assertTrue(search.search().hits_total == 1,
                    "Elastic filtered query retrieved marker")
    Search.index_exists('test', 'test2')
    ElasticUrl.URL_INDEX = 0  # reset
def region_page(request, region):
    ''' Region elastic'''
    query = ElasticQuery.query_match("attr.region_id", region)
    elastic = Search(query, idx=ElasticSettings.idx(name='REGION'))
    context = elastic.get_result()
    context['title'] = "Region"
    # removed stray debug print(context) left in this view
    return render(request, 'region/region.html', context, content_type='text/html')
def _entrez_ensembl_lookup(cls, gene_sets, section, config=None):
    ''' Get an entrez:ensembl id dictionary. '''
    (newgene_ids, discontinued_ids) = Gene._check_gene_history(gene_sets, config)
    current_ids = Gene._replace_oldids_with_newids(gene_sets, newgene_ids, discontinued_ids)
    query = ElasticQuery.filtered(Query.match_all(),
                                  TermsFilter.get_terms_filter("dbxrefs.entrez", current_ids),
                                  sources=['dbxrefs.ensembl', 'dbxrefs.entrez'])
    docs = Search(query, idx=section['index'], size=len(current_ids)).search().docs
    lookup = {}
    for doc in docs:
        lookup[getattr(doc, 'dbxrefs')['entrez']] = doc.doc_id()
    return lookup
def test_bool_filtered_query2(self):
    ''' Test building and running a filtered boolean query. '''
    bool_query = BoolQuery()
    bool_query.must(Query.term("seqid", 1))
    bool_query.should(RangeQuery("start", lte=20000))
    bool_query.should(Query.term("seqid", 2))
    string_q = Query.query_string("rs768019142", fields=["id", "seqid"])
    search = Search(ElasticQuery.filtered_bool(string_q, bool_query,
                                               sources=["id", "seqid", "start"]),
                    idx=ElasticSettings.idx('DEFAULT'))
    self.assertTrue(search.search().hits_total == 1,
                    "Elastic filtered query retrieved marker (rs768019142)")
def test_terms_query(self):
    ''' Test building and running a match query. '''
    query = ElasticQuery(Query.terms("id", ["rs2476601", "rs768019142"]),
                         highlight=Highlight(["id"]))
    hits = Search(query, idx=ElasticSettings.idx('DEFAULT')).search().docs
    self.assertTrue(len(hits) == 2,
                    "Elastic string query retrieved markers (rs2476601, rs768019142)")
    self.assertTrue(getattr(hits[0], 'seqid'), "Hit attribute found")
    self.assertTrue(hits[0].highlight() is not None, "highlighting found")
def _build_frags_query(frags_idx, chrom, segmin, segmax):
    ''' Query the fragments index for fragments overlapping the segment and
    return them with coordinates relative to the segment. '''
    seqid_terms = Query.terms("seqid", [chrom, str("chr"+chrom)])
    end_in_range = Filter(RangeQuery("end", gte=segmin, lte=segmax))
    query = ElasticQuery.filtered(seqid_terms, end_in_range, utils.bedFields)
    result = Search(search_query=query, search_from=0, size=2000000,
                    idx=frags_idx).get_result()
    frags = result['data']
    return utils.makeRelative(int(segmin), int(segmax), ['start', 'end'], frags)
def test_or_filtered_query(self):
    ''' Test building and running a filtered query. '''
    bool_query = BoolQuery(must_arr=[RangeQuery("start", lte=1),
                                     RangeQuery("end", gte=100000)])
    any_filter = OrFilter(RangeQuery("start", gte=1, lte=100000))
    any_filter.extend(bool_query)
    any_filter.extend(Query.query_string("rs*", fields=["id", "seqid"]).query_wrap())
    search = Search(ElasticQuery.filtered(Query.term("seqid", 1), any_filter,
                                          highlight=Highlight(["id", "seqid"])),
                    idx=ElasticSettings.idx('DEFAULT'))
    self.assertTrue(search.search().hits_total >= 1,
                    "Elastic filtered query retrieved marker(s)")
def get_elastic_query(cls, section=None, config=None):
    ''' function to build the elastic query object
    @type  section: string
    @keyword section: The section in the criteria.ini file
    @type  config: string
    @keyword config: The config object initialized from criteria.ini.
    @return: L{Query} or None when no query applies
    '''
    section_config = config[section]
    source_fields = []
    if 'source_fields' in section_config:
        source_fields = section_config['source_fields'].split(',')

    if 'mhc' in section:
        # Defined MHC region as chr6:25,000,000..35,000,000
        seqid = '6'
        start_range = 25000000
        end_range = 35000000
        seqid_param = section_config['seqid_param']
        start_param = section_config['start_param']
        end_param = section_config['end_param']

    if section == 'is_gene_in_mhc':
        # for region you should make a different query
        query = ElasticUtils.range_overlap_query(seqid, start_range, end_range,
                                                 source_fields, seqid_param,
                                                 start_param, end_param)
    elif section == 'is_marker_in_mhc':
        query_bool = BoolQuery()
        query_bool.must(RangeQuery("start", lte=end_range)) \
                  .must(RangeQuery("start", gte=start_range)) \
                  .must(Query.term("seqid", seqid))
        query = ElasticQuery.filtered_bool(Query.match_all(), query_bool,
                                           sources=["id", "seqid", "start"])
    elif section == 'is_region_in_mhc':
        query = ElasticQuery(Query.term("region_name", "MHC"))
    elif section == 'marker_is_gwas_significant_in_ic':
        # genome-wide significance threshold (5e-8)
        gw_sig_p = 0.00000005
        query = ElasticQuery(RangeQuery("p_value", lte=gw_sig_p))
    else:
        if len(source_fields) > 0:
            query = ElasticQuery(Query.match_all(), sources=source_fields)
        else:
            # nothing to restrict and no source filtering requested
            # (removed commented-out match_all dead code from the original)
            return None
    return query
def test_search_count(self):
    ''' Test index and search counts. '''
    idx = IDX['GFF_GENERIC']['indexName']
    idx_type = IDX['GFF_GENERIC']['indexType']
    total = ElasticUtils.get_docs_count(idx, idx_type)
    self.assertGreater(total, 0, 'index count')
    # excluding chr1 must reduce the count
    not_chr1 = ElasticQuery(BoolQuery(must_not_arr=[Query.term('seqid', 'chr1')]))
    filtered_total = ElasticUtils.get_docs_count(idx, idx_type, search_query=not_chr1)
    self.assertGreater(total, filtered_total, 'search query count')
def test_function_score_query(self):
    ''' Test a function score query with a query (using the start position as the score). '''
    score_fn = ScoreFunction.create_score_function('field_value_factor', field='start',
                                                   modifier='reciprocal')
    rs_query = Query.query_string("rs*", fields=["id", "seqid"])
    query = ElasticQuery(FunctionScoreQuery(rs_query, [score_fn], boost_mode='replace'))
    hits = Search(query, idx=ElasticSettings.idx('DEFAULT')).search().docs
    self.assertGreater(len(hits), 1, str(len(hits)))
    # reciprocal scoring orders the docs by ascending start position
    previous = 0
    for hit in hits:
        current = getattr(hit, 'start')
        self.assertLess(previous, current)
        previous = current
def test_bool_filtered_query4(self):
    ''' Test building and running a filtered boolean query.
    Note: ElasticQuery used to wrap match in a query object. '''
    bool_query = BoolQuery()
    bool_query.must(Query.match("id", "rs768019142").query_wrap())
    bool_query.must(Query.term("seqid", 1))
    bool_query.should(RangeQuery("start", lte=20000))
    bool_query.should(Query.term("seqid", 2))
    search = Search(ElasticQuery.filtered_bool(Query.match_all(), bool_query,
                                               sources=["id", "seqid", "start"]),
                    idx=ElasticSettings.idx('DEFAULT'))
    self.assertTrue(search.search().hits_total == 1,
                    "Elastic filtered query retrieved marker (rs768019142)")
def test_bool_nested_filter(self):
    ''' Test combined Bool filter '''
    inner = BoolQuery()
    inner.must(Query.match("id", "rs768019142").query_wrap())
    inner.must(Query.term("seqid", 1))
    outer = BoolQuery()
    outer.should(inner)
    outer.should(Query.term("seqid", 2))
    search = Search(ElasticQuery.filtered_bool(Query.match_all(), outer,
                                               sources=["id", "seqid", "start"]),
                    idx=ElasticSettings.idx('DEFAULT'))
    self.assertTrue(search.search().hits_total >= 1, "Nested bool filter query")
def test_bool_query(self):
    ''' Test a bool query. '''
    bool_query = BoolQuery()
    bool_query.must(Query.term("id", "rs768019142"))
    bool_query.must(RangeQuery("start", gt=1000))
    bool_query.must_not(Query.match("seqid", "2"))
    bool_query.should(Query.match("seqid", "3"))
    bool_query.should(Query.match("seqid", "1"))
    search = Search(ElasticQuery.bool(bool_query, highlight=Highlight(["id", "seqid"])),
                    idx=ElasticSettings.idx('DEFAULT'))
    self.assertTrue(len(search.search().docs) == 1,
                    "Elastic string query retrieved marker (rs768019142)")
def test_pub_ini_file2(self):
    ''' Test publication pipeline with a list of PMIDs. '''
    out = StringIO()
    call_command('publications', '--dir', TEST_DATA_DIR, '--steps', 'load',
                 sections='DISEASE::TEST', ini=MY_PUB_INI_FILE, stdout=out)
    idx = IniParser().read_ini(MY_PUB_INI_FILE)['DISEASE']['index']
    Search.index_refresh(idx)
    search = Search(ElasticQuery.query_string("test", fields=["tags.disease"]), idx=idx)
    self.assertGreater(len(search.search().docs), 1)
def _get_current_build_info(self, seqid, position):
    ''' Get upper & lower boundaries for a hit given the position of the marker.
    The three near-identical HapMap queries in the original are factored into
    a single nested helper. '''
    idx = ElasticSettings.idx('HAPMAP', 'HAPMAP')

    def _first_doc(range_q, order):
        ''' First HapMap doc matching range_q on this seqid, sorted by position. '''
        query = ElasticQuery(BoolQuery(must_arr=[range_q, Query.match("seqid", seqid)]))
        return Search(query, idx=idx, qsort=Sort('position:'+order), size=1).search().docs[0]

    # genetic-map position of the first marker at/after the hit position
    doc = _first_doc(RangeQuery("position", gte=position), 'asc')
    genetic_map_position = getattr(doc, "genetic_map_position")
    # boundary markers at least 0.1 genetic-map units above/below
    start = int(getattr(_first_doc(
        RangeQuery("genetic_map_position", gte=(genetic_map_position + 0.1)), 'asc'),
        "position"))
    end = int(getattr(_first_doc(
        RangeQuery("genetic_map_position", lte=(genetic_map_position - 0.1)), 'desc'),
        "position"))
    return {'build': 38, 'seqid': seqid, 'start': start, 'end': end}
def get_object(self):
    ''' Fetch a single document by id and wrap it as an ElasticObject;
    raise Http404 when the document cannot be retrieved. '''
    query = ElasticQuery(Query.ids(self.kwargs[self.lookup_field]))
    search = Search(search_query=query, idx=getattr(self, 'idx'))
    try:
        hit = search.get_json_response()['hits']['hits'][0]
        obj = ElasticObject(initial=hit['_source'])
        obj.uuid = hit['_id']
        # May raise a permission denied
        self.check_object_permissions(self.request, obj)
        return obj
    except (TypeError, ValueError, IndexError):
        raise Http404
def test_mapping_parent_child(self):
    ''' Test creating mapping with parent child relationship.
    Uses assertEqual throughout (assertEquals is a deprecated unittest alias). '''
    gene_mapping = MappingProperties("gene")
    gene_mapping.add_property("symbol", "string", analyzer="full_name")
    inta_mapping = MappingProperties("publication", "gene")
    load = Loader()
    idx = "test__mapping__"+SEARCH_SUFFIX
    options = {"indexName": idx, "shards": 1}
    requests.delete(ElasticSettings.url() + '/' + idx)

    # add child mappings first
    status = load.mapping(inta_mapping, "publication", analyzer=Loader.KEYWORD_ANALYZER, **options)
    self.assertTrue(status, "mapping inteactions")
    status = load.mapping(gene_mapping, "gene", analyzer=Loader.KEYWORD_ANALYZER, **options)
    self.assertTrue(status, "mapping genes")

    ''' load docs and test has parent query'''
    json_data = '{"index": {"_index": "%s", "_type": "gene", "_id" : "1"}}\n' % idx
    json_data += json.dumps({"symbol": "PAX1"}) + '\n'
    json_data += '{"index": {"_index": "%s", "_type": "publication", "_id" : "2", "parent": "1"}}\n' % idx
    json_data += json.dumps({"pubmed": 1234}) + '\n'
    Bulk.load(idx, '', json_data)
    Search.index_refresh(idx)

    query = ElasticQuery.has_parent('gene', Query.match('symbol', 'PAX1'))
    elastic = Search(query, idx=idx, idx_type='publication', size=500)
    docs = elastic.search().docs
    self.assertEqual(len(docs), 1)
    self.assertEqual(getattr(docs[0], 'pubmed'), 1234)
    self.assertEqual(docs[0].parent(), '1')
    self.assertRaises(QueryError, ElasticQuery.has_parent, 'gene', 'xxxxx')

    ''' test has child query '''
    query = ElasticQuery.has_child('publication', Query.match('pubmed', 1234))
    elastic = Search(query, idx=idx, idx_type='gene', size=500)
    docs = elastic.search().docs
    self.assertEqual(len(docs), 1)
    self.assertEqual(getattr(docs[0], 'symbol'), 'PAX1')
    self.assertEqual(docs[0].parent(), None)
    requests.delete(ElasticSettings.url() + '/' + idx)
def show_es_gene_section(gene_symbol=None, seqid=None, start_pos=None, end_pos=None):
    ''' Template inclusion tag to render a gene section given a chado gene feature. '''
    seqid = str(seqid).replace('chr', '')
    if gene_symbol is not None:
        # gene symbol query
        query = ElasticQuery.query_match("symbol", gene_symbol)
    elif end_pos is None:
        # no end position: find genes spanning the single start position
        # (original comment wrongly said "start and end are same" here and below)
        query_bool = BoolQuery(must_arr=[Query.match("chromosome", seqid),
                                         RangeQuery("start", lte=start_pos),
                                         RangeQuery("stop", gte=start_pos)])
        query = ElasticQuery.bool(query_bool)
    else:
        # start and end differ: find genes contained within the range
        query_bool = BoolQuery(must_arr=[Query.match("chromosome", seqid),
                                         RangeQuery("start", gte=start_pos),
                                         RangeQuery("stop", lte=end_pos)])
        query = ElasticQuery.bool(query_bool)
    elastic = Search(query, idx=ElasticSettings.idx(name='GENE'))
    return {'es_genes': elastic.search().docs}
def test_function_score_filter(self):
    ''' Test a function score query with a filter. '''
    score_fn = ScoreFunction.create_score_function('field_value_factor', field='start')
    start_filter = Filter(BoolQuery(must_arr=[RangeQuery("start", lte=50000)]))
    query = ElasticQuery(FunctionScoreQuery(start_filter, [score_fn], boost_mode='replace'))
    hits = Search(query, idx=ElasticSettings.idx('DEFAULT')).search().docs
    self.assertGreater(len(hits), 1, str(len(hits)))
    previous = sys.maxsize
    for hit in hits:
        current = getattr(hit, 'start')
        # test that the start is equal to the score
        self.assertEqual(current, int(hit.__dict__['_meta']['_score']))
        self.assertGreater(previous, current)
        previous = current
def post(self, request, *args, **kwargs):
    ''' Return study hits for an ensembl id or marker(s), annotating each hit
    with gene symbols and publication author/journal details. '''
    ens_id = self.request.POST.get('ens_id')
    marker = self.request.POST.get('marker')
    markers = self.request.POST.getlist('markers[]')
    # NOTE(review): if none of ens_id/marker/markers is supplied, sfilter is
    # unbound and this raises NameError — confirm callers always send one
    if ens_id:
        sfilter = Filter(Query.query_string(ens_id, fields=["genes"]).query_wrap())
    elif marker:
        sfilter = Filter(Query.query_string(marker, fields=["marker"]).query_wrap())
    elif markers:
        sfilter = Filter(Query.query_string(' '.join(markers), fields=["marker"]).query_wrap())
    query = ElasticQuery.filtered(Query.match_all(), sfilter)
    elastic = Search(query, idx=ElasticSettings.idx('REGION', 'STUDY_HITS'), size=500)
    study_hits = elastic.get_json_response()['hits']

    # collect all gene ids and pmids so both lookups are done in one batch
    ens_ids = []
    pmids = []
    for hit in study_hits['hits']:
        if 'pmid' in hit['_source']:
            pmids.append(hit['_source']['pmid'])
        if 'genes' in hit['_source']:
            for ens_id in hit['_source']['genes']:
                ens_ids.append(ens_id)
    docs = utils.get_gene_docs_by_ensembl_id(ens_ids, ['symbol'])
    pub_docs = PublicationDocument.get_pub_docs_by_pmid(pmids, sources=['authors.name', 'journal'])

    for hit in study_hits['hits']:
        genes = {}
        if 'genes' in hit['_source']:
            for ens_id in hit['_source']['genes']:
                try:
                    genes[ens_id] = getattr(docs[ens_id], 'symbol')
                except KeyError:
                    # BUG FIX: original did `genes = {ens_id: ens_id}` which
                    # discarded symbols already collected for this hit
                    genes[ens_id] = ens_id
            hit['_source']['genes'] = genes
        if 'pmid' in hit['_source']:
            pmid = hit['_source']['pmid']
            try:
                authors = getattr(pub_docs[pmid], 'authors')
                journal = getattr(pub_docs[pmid], 'journal')
                hit['_source']['pmid'] = \
                    {'pmid': pmid,
                     'author': authors[0]['name'].rsplit(None, 1)[-1] if authors else "",
                     'journal': journal}
            except KeyError:
                hit['_source']['pmid'] = {'pmid': pmid}
    return JsonResponse(study_hits)
def _build_frags_query(frags_idx, chrom, segmin, segmax):
    ''' Query the fragments index for fragments overlapping the segment and
    return them with coordinates relative to the segment. '''
    query = ElasticQuery.filtered(Query.terms("seqid", [chrom, str("chr"+chrom)]),
                                  Filter(RangeQuery("end", gte=segmin, lte=segmax)),
                                  utils.bedFields)
    fragsQuery = Search(search_query=query, search_from=0, size=10000, idx=frags_idx)
    fragsResult = fragsQuery.get_json_response()
    # collect raw _source docs (removed commented-out get_result dead code;
    # append loop replaced with a comprehension)
    frags = [hit['_source'] for hit in fragsResult['hits']['hits']]
    return utils.makeRelative(int(segmin), int(segmax), ['start', 'end'], frags)
def _build_exon_query(chrom, segmin, segmax, genes):
    ''' Fetch exonic structure for the given genes, relative to the segment. '''
    geneExons = dict()
    chrom_bool = BoolQuery()
    chrom_bool.must([Query.term("seqid", chrom)])
    if len(genes) > 0:
        exon_idx = getattr(chicp_settings, 'CP_GENE_IDX')+'/exons/'
        for gene in genes:
            query = ElasticQuery.filtered_bool(
                Query.query_string(gene["gene_id"], fields=["name"]),
                chrom_bool, sources=utils.snpFields)
            result = Search(query, idx=exon_idx, search_from=0, size=2000).get_result()
            exons = utils.makeRelative(int(segmin), int(segmax),
                                       ['start', 'end'], result['data'])
            geneExons[gene["gene_id"]] = sorted(exons, key=operator.itemgetter("start"))
    return geneExons
def _entrez_ensembl_lookup(cls, gene_sets, section, config=None):
    ''' Get an entrez:ensembl id dictionary. '''
    (newgene_ids, discontinued_ids) = Gene._check_gene_history(gene_sets, config)
    current_ids = Gene._replace_oldids_with_newids(gene_sets, newgene_ids, discontinued_ids)
    lookup = {}

    def process_hits(resp_json):
        # map entrez id -> ensembl document id for each page of results
        for doc in [Document(hit) for hit in resp_json['hits']['hits']]:
            lookup[getattr(doc, 'dbxrefs')['entrez']] = doc.doc_id()

    query = ElasticQuery.filtered(Query.match_all(),
                                  TermsFilter.get_terms_filter("dbxrefs.entrez", current_ids),
                                  sources=['dbxrefs.ensembl', 'dbxrefs.entrez'])
    ScanAndScroll.scan_and_scroll(section['index'], call_fun=process_hits, query=query)
    return lookup
def get_overlapping_hits(self, build, seqid, start, end):
    ''' Find tier<=2 study hits whose build_info overlaps [start, end] on the
    given build/seqid. Overlap = start in range, end in range, or spanning. '''
    spanning = BoolQuery(must_arr=[RangeQuery("build_info.start", lte=start),
                                   RangeQuery("build_info.end", gte=end)])
    or_filter = OrFilter(RangeQuery("build_info.start", gte=start, lte=end))
    or_filter.extend(RangeQuery("build_info.end", gte=start, lte=end)) \
             .extend(spanning)
    range_query = FilteredQuery(BoolQuery(must_arr=[Query.term("build_info.seqid", seqid),
                                                    Query.term("build_info.build", build)]),
                                or_filter)
    # removed commented-out sources=[...] dead code from the original
    query = ElasticQuery.filtered_bool(
        Query.nested("build_info", range_query),
        BoolQuery(must_arr=[RangeQuery("tier", lte=2)]))
    elastic = Search(search_query=query, idx=ElasticSettings.idx('REGION', 'STUDY_HITS'))
    return elastic.search().docs
def _check_gene_history(cls, gene_sets, section):
    ''' Look up discontinued gene ids in the history index; return a map of
    replaced ids and a list of fully discontinued ids. '''
    query = ElasticQuery.filtered(Query.match_all(),
                                  TermsFilter.get_terms_filter("discontinued_geneid", gene_sets))
    docs = Search(query, idx=section['index'], idx_type=section['index_type_history'],
                  size=1000000).search().docs
    newgene_ids = {}
    discontinued = []
    for doc in docs:
        old_id = str(getattr(doc, 'discontinued_geneid'))
        new_id = getattr(doc, 'geneid')
        if new_id is None:
            discontinued.append(old_id)
        else:
            newgene_ids[old_id] = str(new_id)
    return (newgene_ids, discontinued)
def _find_snp_position(snp_track, name):
    ''' Resolve a marker name to its chromosome/position using the snp track
    index configured in chicp_settings; returns an error dict if absent. '''
    mo = re.match(r"(.*)-(.*)", snp_track)
    (group, track) = mo.group(1, 2)
    group_cfg = getattr(chicp_settings, 'CHICP_IDX').get(group)
    snp_track_idx = group_cfg.get('INDEX')
    track_cfg = group_cfg.get('TRACKS').get(snp_track)
    # Fall back to the track name when there is no explicit TYPE configured.
    snp_track_type = track_cfg.get('TYPE') if track_cfg else track
    elastic = Search(ElasticQuery.query_match("name", name),
                     idx=snp_track_idx + '/' + snp_track_type)
    snpResult = elastic.get_result()
    if len(snpResult['data']) > 0:
        hit = snpResult['data'][0]
        position = hit['end']
        return {'chr': hit['seqid'].replace('chr', ""),
                'start': (position-1), 'end': position, 'name': name}
    return {'error': 'Marker '+name+' does not exist in the currently selected dataset'}
def _check_gene_history(cls, gene_sets, config):
    '''find a way to handle this better'''
    # Look up retired ids in the GENE_HISTORY section of the config.
    section = config['GENE_HISTORY']
    query = ElasticQuery.filtered(Query.match_all(),
                                  TermsFilter.get_terms_filter("discontinued_geneid", gene_sets),
                                  sources=['geneid', 'discontinued_geneid'])
    docs = Search(query, idx=section['index'], idx_type=section['index_type'],
                  size=len(gene_sets)).search().docs
    newgene_ids = {}
    discontinued = []
    for doc in docs:
        replacement = getattr(doc, 'geneid')
        old_id = str(getattr(doc, 'discontinued_geneid'))
        if replacement is None:
            # No replacement id: the gene was discontinued outright.
            discontinued.append(old_id)
        else:
            newgene_ids[old_id] = str(replacement)
    return (newgene_ids, discontinued)
def interaction_details(request):
    ''' Get interaction details for a given ensembl ID. '''
    query = ElasticQuery.has_parent('gene', Query.ids(request.POST.get('ens_id')))
    elastic = Search(query, idx=ElasticSettings.idx('GENE', 'INTERACTIONS'), size=500)
    interaction_hits = elastic.get_json_response()['hits']
    # Collect every interactor id so symbols can be fetched in one query.
    ens_ids = [interactor['interactor']
               for hit in interaction_hits['hits']
               for interactor in hit['_source']['interactors']]
    docs = utils.get_gene_docs_by_ensembl_id(ens_ids, ['symbol'])
    for hit in interaction_hits['hits']:
        for interactor in hit['_source']['interactors']:
            iid = interactor['interactor']
            try:
                interactor['symbol'] = getattr(docs[iid], 'symbol')
            except KeyError:
                # No gene doc found: fall back to the raw id.
                interactor['symbol'] = iid
    return JsonResponse(interaction_hits)
def interaction_details(request):
    """ Get interaction details for a given ensembl ID. """
    ens_id = request.POST.get("ens_id")
    query = ElasticQuery.has_parent("gene", Query.ids(ens_id))
    search = Search(query, idx=ElasticSettings.idx("GENE", "INTERACTIONS"), size=500)
    interaction_hits = search.get_json_response()["hits"]
    # Gather all interactor ids up front so the symbol lookup is a single call.
    ens_ids = [interactor["interactor"]
               for hit in interaction_hits["hits"]
               for interactor in hit["_source"]["interactors"]]
    docs = _get_gene_docs_by_ensembl_id(ens_ids, ["symbol"])
    for hit in interaction_hits["hits"]:
        for interactor in hit["_source"]["interactors"]:
            iid = interactor["interactor"]
            try:
                interactor["symbol"] = getattr(docs[iid], "symbol")
            except KeyError:
                # Unknown gene doc: use the id itself as the display symbol.
                interactor["symbol"] = iid
    return JsonResponse(interaction_hits)
def _update_gene(cls, genes, idx):
    ''' Use genes data to update the index, in chunks, via the bulk API. '''
    gene_keys = list(genes.keys())
    chunk_size = 450
    for offset in range(0, len(gene_keys), chunk_size):
        chunk = gene_keys[offset:offset + chunk_size]
        query = ElasticQuery.filtered(Query.match_all(),
                                      TermsFilter.get_terms_filter("dbxrefs.entrez", chunk))
        lines = []
        for doc in Search(query, idx=idx, size=chunk_size).search().docs:
            idx_type = doc.type()
            entrez = getattr(doc, 'dbxrefs')['entrez']
            action = {"update": {"_id": doc._meta['_id'], "_type": idx_type,
                                 "_index": idx, "_retry_on_conflict": 3}}
            # Bulk format: action line followed by the partial document.
            lines.append(json.dumps(action))
            lines.append(json.dumps({'doc': genes[entrez]}))
        if lines:
            Loader().bulk_load(idx, idx_type, '\n'.join(lines) + '\n')
def studies_details(request):
    """ Get studies for a given ensembl ID.

    Finds study hits referencing the gene, then decorates each hit with
    gene symbols and publication details (first-author surname, journal)
    where available.
    """
    ens_id = request.POST.get("ens_id")
    sfilter = Filter(Query.query_string(ens_id, fields=["genes"]).query_wrap())
    query = ElasticQuery.filtered(Query.match_all(), sfilter)
    elastic = Search(query, idx=ElasticSettings.idx("REGION", "STUDY_HITS"), size=500)
    study_hits = elastic.get_json_response()["hits"]

    # Collect all gene ids and pmids so lookups are done in bulk.
    ens_ids = []
    pmids = []
    for hit in study_hits["hits"]:
        if "pmid" in hit["_source"]:
            pmids.append(hit["_source"]["pmid"])
        for gene_id in hit["_source"]["genes"]:
            ens_ids.append(gene_id)
    docs = _get_gene_docs_by_ensembl_id(ens_ids, ["symbol"])
    pub_docs = _get_pub_docs_by_pmid(pmids, sources=["authors.name", "journal"])

    for hit in study_hits["hits"]:
        genes = {}
        for gene_id in hit["_source"]["genes"]:
            try:
                genes[gene_id] = getattr(docs[gene_id], "symbol")
            except KeyError:
                # BUG FIX: previously `genes = {gene_id: gene_id}` re-bound the
                # whole dict, discarding every symbol resolved so far for this
                # hit; record only the fallback entry (matches genesets_details).
                genes[gene_id] = gene_id
        hit["_source"]["genes"] = genes
        if "pmid" in hit["_source"]:
            pmid = hit["_source"]["pmid"]
            try:
                authors = getattr(pub_docs[pmid], "authors")
                journal = getattr(pub_docs[pmid], "journal")
                hit["_source"]["pmid"] = {
                    "pmid": pmid,
                    # surname = last whitespace-separated token of the name
                    "author": authors[0]["name"].rsplit(None, 1)[-1],
                    "journal": journal,
                }
            except KeyError:
                # Publication not indexed: keep just the pmid.
                hit["_source"]["pmid"] = {"pmid": pmid}
    return JsonResponse(study_hits)
def get_new_pmids(cls, pmids, idx, disease_code=None):
    ''' Find PMIDs in a list that are not in the elastic index.

    When disease_code is given, documents already in the index have the code
    appended to their tags.disease list via a bulk update (one bulk call per
    chunk). Returns the subset of pmids that were not found.
    '''
    chunk_size = 800
    pmids_found = set()
    pmids_found_add = pmids_found.add  # local bind for the loop below
    # NOTE(review): purpose of this delay is not visible here — presumably an
    # index-refresh or external rate-limit wait; confirm before changing.
    time.sleep(5)
    for i in range(0, len(pmids), chunk_size):
        pmids_slice = pmids[i:i+chunk_size]
        terms_filter = TermsFilter.get_terms_filter("pmid", pmids_slice)
        query = ElasticQuery.filtered(Query.match_all(), terms_filter, sources=['pmid', 'tags'])
        docs = Search(query, idx=idx, size=chunk_size).search().docs
        json_data = ''
        for doc in docs:
            pmids_found_add(getattr(doc, 'pmid'))
            if disease_code is not None:
                tags = getattr(doc, 'tags')
                if 'disease' in tags:
                    disease = tags['disease']
                else:
                    disease = []
                if disease_code not in disease:
                    # update disease attribute
                    disease.append(disease_code)
                    tags['disease'] = disease
                    idx_name = doc._meta['_index']
                    idx_type = doc.type()
                    # Bulk-update action line plus partial-doc line.
                    doc_data = {"update": {"_id": doc._meta['_id'], "_type": idx_type,
                                           "_index": idx_name, "_retry_on_conflict": 3}}
                    json_data += json.dumps(doc_data) + '\n'
                    json_data += json.dumps({'doc': {'tags': tags}}) + '\n'
        if json_data != '':
            # idx_name/idx_type carry over from the last updated doc in this chunk.
            Loader().bulk_load(idx_name, idx_type, json_data)
    return [pmid for pmid in pmids if pmid not in pmids_found]
def genesets_details(request):
    ''' Get pathway gene sets for a given ensembl ID. '''
    geneset_filter = Filter(Query.query_string(request.POST.get('ens_id'),
                                               fields=["gene_sets"]).query_wrap())
    query = ElasticQuery.filtered(Query.match_all(), geneset_filter)
    elastic = Search(query, idx=ElasticSettings.idx('GENE', 'PATHWAY'), size=500)
    genesets_hits = elastic.get_json_response()['hits']
    # Flatten all gene-set members so symbols are fetched in one bulk call.
    ens_ids = [member
               for hit in genesets_hits['hits']
               for member in hit['_source']['gene_sets']]
    docs = utils.get_gene_docs_by_ensembl_id(ens_ids, ['symbol'])
    for hit in genesets_hits['hits']:
        genesets = {}
        for member in hit['_source']['gene_sets']:
            try:
                genesets[member] = getattr(docs[member], 'symbol')
            except KeyError:
                # No gene doc: display the raw id.
                genesets[member] = member
        hit['_source']['gene_sets'] = genesets
    return JsonResponse(genesets_hits)
def genesets_details(request):
    """ Get pathway gene sets for a given ensembl ID. """
    ens_id = request.POST.get("ens_id")
    geneset_filter = Filter(Query.query_string(ens_id, fields=["gene_sets"]).query_wrap())
    search = Search(ElasticQuery.filtered(Query.match_all(), geneset_filter),
                    idx=ElasticSettings.idx("GENE", "PATHWAY"), size=500)
    genesets_hits = search.get_json_response()["hits"]
    # Collect every member id up front for a single symbol lookup.
    ens_ids = [member
               for hit in genesets_hits["hits"]
               for member in hit["_source"]["gene_sets"]]
    docs = _get_gene_docs_by_ensembl_id(ens_ids, ["symbol"])
    for hit in genesets_hits["hits"]:
        genesets = {}
        for member in hit["_source"]["gene_sets"]:
            try:
                genesets[member] = getattr(docs[member], "symbol")
            except KeyError:
                # Unknown gene doc: fall back to the id itself.
                genesets[member] = member
        hit["_source"]["gene_sets"] = genesets
    return JsonResponse(genesets_hits)
def _convert_entrezid2ensembl(cls, gene_sets, section, log_output_file_handler=None, log_conversion=True):
    '''Converts given set of entrez ids to ensembl ids by querying the gene index dbxrefs'''
    # Resolve retired entrez ids to their replacements first.
    (newgene_ids, discontinued_ids) = cls._check_gene_history(gene_sets, section)
    replaced_gene_sets = cls._replace_oldids_with_newids(gene_sets, newgene_ids, discontinued_ids)
    query = ElasticQuery.filtered(Query.match_all(),
                                  TermsFilter.get_terms_filter("dbxrefs.entrez", replaced_gene_sets))
    docs = Search(query, idx=section['index'], size=1000000).search().docs
    ensembl_ids = [doc._meta['_id'] for doc in docs]
    if log_conversion and log_output_file_handler is not None:
        cls._log_entrezid2ensembl_coversion(replaced_gene_sets, ensembl_ids, log_output_file_handler)
    return ensembl_ids
def _build_snp_query(snp_track, chrom, segmin, segmax):
    ''' Fetch SNPs for a segment from the configured track index.

    Coordinates are made relative to the segment and the track's display
    settings are returned with 'max' set to the highest score seen.
    Returns (snps, snpMeta).
    '''
    snps = []
    snpMeta = {}
    maxScore = -1
    if snp_track and snp_track != 'None':
        # snp_track is "<group>-<track>"; resolve the index/type from settings.
        mo = re.match(r"(.*)-(.*)", snp_track)
        (group, track) = mo.group(1, 2)
        group_cfg = getattr(chicp_settings, 'CHICP_IDX').get(group)
        snp_track_idx = group_cfg.get('INDEX')
        track_cfg = group_cfg.get('TRACKS').get(snp_track)
        snp_track_type = track_cfg.get('TYPE') if track_cfg else track
        query = ElasticQuery.filtered(Query.terms("seqid", [chrom, str("chr"+chrom)]),
                                      Filter(RangeQuery("end", gte=segmin, lte=segmax)),
                                      utils.snpFields)
        snpQuery = Search(search_query=query, search_from=0, size=2000000,
                          idx=snp_track_idx+'/'+snp_track_type)
        snps = snpQuery.get_result()['data']
        snps = utils.makeRelative(int(segmin), int(segmax), ['start', 'end'], snps)
        data_type = group_cfg.get('DATA_TYPE')
        snpSettings = getattr(chicp_settings, 'STUDY_DEFAULTS').get(data_type)
        # Track the highest score for the display scale.
        for s in snps:
            score = float(s['score'])
            if score > maxScore:
                maxScore = score
        snpSettings['max'] = maxScore
        snpMeta = snpSettings
    return snps, snpMeta
def check_hits(resp_json):
    ''' Scan-and-scroll callback: group marker docs by internal_id and report
    markers whose highest-build positions disagree between documents.

    Relies on enclosing-scope names: `internal_id` (dict of id -> docs seen),
    `self` (the TestCase) and `logger`.
    '''
    self.assertTrue('hits' in resp_json, 'scan and scroll hits')
    self.assertGreaterEqual(len(resp_json['hits']['hits']), 1)
    docs = [Document(hit) for hit in resp_json['hits']['hits']]
    for doc1 in docs:
        doc_internal_id = getattr(doc1, "internal_id")
        if doc_internal_id in internal_id:
            # Seen before: compare positions against every earlier doc.
            pos1 = self._get_highest_build(doc1)
            for doc2 in internal_id[doc_internal_id]:
                pos2 = self._get_highest_build(doc2)
                if pos2['position'] != pos1['position']:
                    msg = ("DIFFERENT POSITIONS ID: "+str(doc_internal_id)+":\t" +
                           str(getattr(doc1, "name"))+": "+pos1['position']+" ("+doc1.doc_id()+")\t" +
                           str(getattr(doc2, "name"))+": "+pos2['position']+" ("+doc2.doc_id()+")\t")
                    try:
                        # Look up markers at either conflicting position.
                        terms_filter = TermsFilter.get_terms_filter("start", [pos1['position'],
                                                                              pos2['position']])
                        query = ElasticQuery.filtered(Query.term("seqid", pos1['seqid']), terms_filter)
                        elastic = Search(query, idx=ElasticSettings.idx('MARKER', 'MARKER'))
                        docs_by_pos = elastic.search().docs
                        found = False
                        for d in docs_by_pos:
                            msg += getattr(d, "id")+": "+str(getattr(d, "start"))+"\t"
                            if getattr(d, "id") == 'rs'+str(doc_internal_id):
                                found = True
                        if not found:
                            # Marker not at either position: check dbSNP.
                            msg += 'rs'+str(doc_internal_id)
                            if self._rs_exists('rs'+str(doc_internal_id)):
                                msg += ' EXISTS IN DBSNP\t'
                            else:
                                msg += ' NOT IN DBSNP\t'
                        logger.error(msg)
                    except KeyError:
                        # Lookup failed part-way; log what was assembled so far.
                        logger.error(msg)
            internal_id[doc_internal_id].append(doc1)
        else:
            # First sighting of this internal id.
            internal_id[doc_internal_id] = [doc1]
def _build_snp_query(snp_track, chrom, segmin, segmax):
    ''' Build the SNP list and display settings for a segment.

    Queries the CP_STATS index for SNPs on `chrom` whose end falls within
    [segmin, segmax], converts coordinates to be segment-relative, and
    records the maximum score in the track's display settings.
    Returns (snps, snpMeta).
    '''
    snps = []
    snpMeta = {}
    maxScore = -1
    if snp_track and snp_track != 'None':
        # snp_track is "<group>-<track>"
        mo = re.match(r"(.*)-(.*)", snp_track)
        (group, track) = mo.group(1, 2)
        try:
            snp_track_idx = ElasticSettings.idx('CP_STATS_'+group.upper(), snp_track.upper())
        except SettingsError:
            # No per-track setting: address the type on the group index directly.
            snp_track_idx = ElasticSettings.idx('CP_STATS_'+group.upper())+"/"+track
        query = ElasticQuery.filtered(Query.terms("seqid", [chrom, str("chr"+chrom)]),
                                      Filter(RangeQuery("end", gte=segmin, lte=segmax)),
                                      utils.snpFields)
        snpQuery = Search(search_query=query, search_from=0, size=10000, idx=snp_track_idx)
        snpResult = snpQuery.get_json_response()
        # Removed dead commented-out get_result() path and the redundant
        # re-initialisation of snps; a comprehension replaces the append loop.
        snps = [hit['_source'] for hit in snpResult['hits']['hits']]
        snps = utils.makeRelative(int(segmin), int(segmax), ['start', 'end'], snps)
        data_type = ElasticSettings.get_label('CP_STATS_'+group.upper(), None, "data_type")
        snpSettings = getattr(chicp_settings, 'STUDY_DEFAULTS').get(data_type)
        for s in snps:
            if float(s['score']) > maxScore:
                maxScore = float(s['score'])
        snpSettings['max'] = maxScore
        snpMeta = snpSettings
    return snps, snpMeta