def test_region_attributes(self):
    ''' Test region attributes: every gene, study and PMID attached to a
    padded region document must resolve in its respective index. '''
    idx = ElasticSettings.idx(RegionDataTest.IDX_KEY, 'REGION')
    (idx, idx_type) = idx.split('/')
    docs = ElasticUtils.get_rdm_docs(idx, idx_type, qbool=Query.match_all(), sources=[], size=1)
    newRegion = utils.Region.pad_region_doc(docs[0])
    # (region attribute, settings index key, assertion message) -- the three
    # original copy-pasted stanzas differed only in these values
    checks = [
        ("genes", 'GENE', "All genes on region found in GENE index"),
        ("studies", 'STUDY', "All study ids for region found in STUDY index"),
        ("pmids", 'PUBLICATION', "All PMIDs for region found in PUBLICATION index"),
    ]
    for attr, idx_key, msg in checks:
        ids = getattr(newRegion, attr)
        if len(ids) > 0:
            query = ElasticQuery(Query.ids(ids))
            resultObject = Search(query, idx=ElasticSettings.idx(idx_key, idx_key),
                                  size=len(ids)).search()
            self.assertEqual(len(ids), resultObject.hits_total, msg)
def test_create_idx_type_model_permissions(self):
    ''' Create idx-type model permissions for a newly uploaded user file and
    verify the returned settings expose the new idx_type. '''
    elastic_settings_before = ElasticSettings.attrs().get('IDX')
    user_types_before = elastic_settings_before['CP_STATS_UD']['idx_type']
    self.assertEqual({}, user_types_before, 'CP_STATS_UD idx_type is empty')
    idx = "cp:hg19_userdata_bed"
    new_upload_file = "tmp_newly_uploaded_file"
    idx_type = new_upload_file
    # create the mapping for the new index type directly in elastic
    mapping_cmd = ("curl -XPUT "+ElasticSettings.url()+"/"+idx+"/_mapping/"+idx_type +
                   " -d '{\"" + idx_type +
                   "\":{ \"properties\" : {\"message\" : {\"type\" : \"string\", \"store\" : true } } }}'")
    os.system(mapping_cmd)
    # store _meta (label / owner / upload timestamp) for the new type
    meta_cmd = ("curl -XPUT "+ElasticSettings.url()+"/"+idx+"/"+idx_type +
                "/_meta -d '{\"label\": \"" + new_upload_file + "\", \"owner\": \"" +
                self.user.username + "\", \"uploaded\": \"" + str(timezone.now())+"\"}'")
    os.system(meta_cmd)
    elastic_settings_after = elastic_factory.create_idx_type_model_permissions(
        self.user, indexKey='CP_STATS_UD',
        indexTypeKey='UD-'+new_upload_file.upper(),
        new_upload_file="tmp_newly_uploaded_file")
    # elastic_settings_after = elastic_factory.get_elastic_settings_with_user_uploads(elastic_settings_before)
    user_types_after = elastic_settings_after['CP_STATS_UD']['idx_type']
    self.assertTrue(len(user_types_after) > 0, "Has user idx_types ")
    self.assertTrue('UD-TMP_NEWLY_UPLOADED_FILE' in user_types_after)
    self.assertEqual(user_types_after['UD-TMP_NEWLY_UPLOADED_FILE']['type'],
                     'tmp_newly_uploaded_file')
def tearDown(self):
    ''' Remove loaded test indices and test repository. '''
    key = 'PRIVATE_REGIONS_GFF'
    if key in IDX:
        index_url = ElasticSettings.url() + '/' + IDX[key]['indexName']
        print(index_url)
        requests.delete(index_url)
def _find_snp_position(snp_track, name):
    ''' Locate a marker's position, either from the main MARKER index (when
    no snp_track is given) or from the chosen stats track index. Returns a
    dict with chr/start/end/name, or an error dict when not found. '''
    def _location_from(result_json):
        # convert the first hit, if any, into a location dict (0-based start)
        hits = result_json['hits']['hits']
        if len(hits) > 0:
            src = hits[0]['_source']
            chrom = src['seqid'].replace('chr', "")
            position = src['start']
            return {'chr': chrom, 'start': (position-1), 'end': position, 'name': name}
        return None

    if snp_track is None:
        query = ElasticQuery.query_match("id", name)
        elastic = Search(query, idx=ElasticSettings.idx('MARKER'))
        location = _location_from(elastic.get_json_response())
        if location is not None:
            return location
    else:
        # snp_track is '<group>-<track>'; resolve its index from the settings,
        # falling back to '<group index>/<track>' when no idx_type is defined
        mo = re.match(r"(.*)-(.*)", snp_track)
        (group, track) = mo.group(1, 2)
        try:
            snp_track_idx = ElasticSettings.idx('CP_STATS_'+group.upper(), snp_track.upper())
        except SettingsError:
            snp_track_idx = ElasticSettings.idx('CP_STATS_'+group.upper())+"/"+track
        query = ElasticQuery.query_match("name", name)
        elastic = Search(query, idx=snp_track_idx)
        location = _location_from(elastic.get_json_response())
        if location is not None:
            return location
    return {'error': 'Marker '+name+' does not exist in the currently selected dataset'}
def suggester(request):
    ''' Provide auto suggestions. Ajax request returning a JSON response. '''
    params = request.GET
    idx_dict = ElasticSettings.search_props(params.get("idx"), request.user)
    suggester_idxs = ','.join(ElasticSettings.idx(k) for k in idx_dict['suggester_keys'])
    resp = Suggest.suggest(params.get("term"), suggester_idxs, name='suggest', size=8)['suggest']
    suggestions = [opts['text'] for opts in resp[0]['options']]
    return JsonResponse({"data": suggestions})
def test_create_idx_type_model_permissions(self):
    ''' Create idx_type model permissions for a newly uploaded user file and
    check the updated settings expose the new idx_type. '''
    elastic_settings_before = ElasticSettings.attrs().get('IDX')
    user_types_before = elastic_settings_before['CP_STATS_UD']['idx_type']
    self.assertEqual({}, user_types_before, 'CP_STATS_UD idx_type is empty')
    idx = "cp:hg19_userdata_bed"
    new_upload_file = "tmp_newly_uploaded_file"
    idx_type = new_upload_file
    # create a mapping for the new index type directly in elastic
    os.system(
        "curl -XPUT " + ElasticSettings.url() + "/" + idx + "/_mapping/" + idx_type +
        " -d '{\"" + idx_type +
        "\":{ \"properties\" : {\"message\" : {\"type\" : \"string\", \"store\" : true } } }}'"
    )
    # store _meta information (label / owner / upload time) for the type
    os.system("curl -XPUT " + ElasticSettings.url() + "/" + idx + "/" + idx_type +
              "/_meta -d '{\"label\": \"" + new_upload_file + "\", \"owner\": \"" +
              self.user.username + "\", \"uploaded\": \"" + str(timezone.now()) + "\"}'")
    elastic_settings_after = elastic_factory.create_idx_type_model_permissions(
        self.user, indexKey='CP_STATS_UD',
        indexTypeKey='UD-' + new_upload_file.upper(),  # @IgnorePep8
        new_upload_file="tmp_newly_uploaded_file")  # @IgnorePep8
    # elastic_settings_after = elastic_factory.get_elastic_settings_with_user_uploads(elastic_settings_before)
    user_types_after = elastic_settings_after['CP_STATS_UD']['idx_type']
    self.assertTrue(len(user_types_after) > 0, "Has user idx_types ")
    self.assertTrue('UD-TMP_NEWLY_UPLOADED_FILE' in user_types_after)
    self.assertEqual(
        user_types_after['UD-TMP_NEWLY_UPLOADED_FILE']['type'],
        'tmp_newly_uploaded_file')
def test_show(self):
    ''' Snapshot.show lists snapshots for the configured repository but not
    for an unknown repository name. '''
    repo = ElasticSettings.getattr('REPOSITORY')
    self.assertTrue(Snapshot.show(repo, '_all', False))
    self.assertTrue(Snapshot.show(repo, '_all', True))
    self.assertFalse(Snapshot.show('xyzabc', '_all', False))
def filter_queryset(self, request, queryset, view):
    ''' Override this method to request just the documents required from elastic.

    Pagination limits come from the view's paginator; an optional
    'feature_type' request filter selects the criteria index. Returns a list
    of ElasticObject results and records the total on view.es_count. '''
    q_size = view.paginator.get_limit(request)
    q_from = view.paginator.get_offset(request)
    filterable = getattr(view, 'filter_fields', [])
    filters = {k: v for k, v in request.GET.items() if k in filterable}
    criteria_idx = self._get_index(filters.get('feature_type', 'GENE_CRITERIA'))
    # a feature type may map to a single index name or a list of them
    if isinstance(criteria_idx, list):
        idx = ','.join(ElasticSettings.idx(name) for name in criteria_idx)
    else:
        idx = ElasticSettings.idx(criteria_idx)
    q = ElasticQuery(Query.match_all())
    s = Search(search_query=q, idx=idx, size=q_size, search_from=q_from)
    json_results = s.get_json_response()
    results = []
    for result in json_results['hits']['hits']:
        new_obj = ElasticObject(initial=result['_source'])
        new_obj.uuid = result['_id']
        new_obj.criteria_type = result['_type']
        results.append(new_obj)
    view.es_count = json_results['hits']['total']
    return results
def get_criteria(docs, doc_type, doc_attr, idx_type_key):
    """ Return a dictionary mapping gene name to its criteria details. """
    genes = [getattr(doc, doc_attr).lower() for doc in docs if doc.type() == doc_type]
    query = Query.terms("Name", genes)
    sources = {"exclude": ["Primary id", "Object class", "Total score"]}
    if ElasticSettings.idx("CRITERIA", idx_type_key) is None:
        return {}
    res = Search(ElasticQuery(query, sources=sources),
                 idx=ElasticSettings.idx("CRITERIA", idx_type_key),
                 size=len(genes)).search()
    criteria = {}
    skipped = ("Name", "_meta", "OD_Hs")
    for doc in res.docs:
        ordered = collections.OrderedDict(sorted(doc.__dict__.items(), key=lambda t: t[0]))
        gene_name = getattr(doc, "Name")
        gene_criteria = []
        for attr, value in ordered.items():
            # skip metadata fields and zero-valued criteria
            if attr in skipped or value.startswith("0"):
                continue
            gene_criteria.append({attr.replace("_Hs", ""): value.split(":")})
        criteria[gene_name] = gene_criteria
        # 'OD_Hs' is treated separately and reported under the short name 'OD'
        if hasattr(doc, "OD_Hs") and not getattr(doc, "OD_Hs").startswith("0"):
            criteria.setdefault(gene_name, []).append({"OD": getattr(doc, "OD_Hs").split(":")})
    return criteria
def _get_marker_build(idx_name):
    ''' Get the marker build as defined in the settings. Returns an empty
    string when the build cannot be determined. '''
    try:
        idx_key = ElasticSettings.get_idx_key_by_name(idx_name)
        build = ElasticSettings.get_label(idx_key, label='build')
    except (KeyError, SettingsError, TypeError):
        logger.error('Marker build not identified from ELASTIC settings.')
        return ''
    return build
def test_top_hits_sub_agg(self):
    ''' Terms aggregation on _index with a top_hits sub-aggregation. '''
    top_hits = Agg('idx_top_hits', 'top_hits', {"size": 1})
    aggs = Aggs([Agg("idxs", "terms", {"field": "_index"}, sub_agg=top_hits),
                 Agg("categories", "terms", {"field": "_type", "size": 0})])
    default_idx = ElasticSettings.idx('DEFAULT')
    buckets = Search(aggs=aggs, idx=default_idx).search().aggs['idxs'].get_docs_in_buckets()
    self.assertEqual(buckets[default_idx]['doc_count'], 3)
    self.assertEqual(len(buckets[default_idx]['docs']), 1)
def test_sort_query(self):
    ''' Test sorting for a query. '''
    match_all = ElasticQuery(Query.match_all())
    default_idx = ElasticSettings.idx('DEFAULT')
    # simple string sort specification
    docs = Search(match_all, idx=default_idx, qsort=Sort('start:asc,_score')).search().docs
    self._check_sort_order(docs)
    # explicit dict sort specification
    custom_sort = Sort({"sort": [{"start": {"order": "asc", "mode": "avg"}}]})
    docs = Search(match_all, idx=default_idx, qsort=custom_sort).search().docs
    self._check_sort_order(docs)
    # a non-sort argument is rejected
    self.assertRaises(QueryError, Sort, 1)
def test_term_query(self):
    ''' Test building and running a term query. '''
    default_idx = ElasticSettings.idx('DEFAULT')
    docs = Search(ElasticQuery(Query.term("id", "rs2476601")), idx=default_idx).search().docs
    self.assertTrue(len(docs) == 1, "Elastic string query retrieved marker (rs2476601)")
    docs = Search(ElasticQuery(Query.term("seqid", "1", boost=3.0)), idx=default_idx).search().docs
    self.assertTrue(len(docs) > 1, "Elastic string query retrieved markers on chr1")
def __init__(self, search_query=None, aggs=None, search_from=0, size=20,
             search_type=None, idx=None, idx_type='', qsort=None, elastic_url=None):
    ''' Set up parameters to use in the search. L{ElasticQuery} is used to
    define a search query.
    @type  search_query: L{ElasticQuery}
    @keyword search_query: The elastic query to search (default: None).
    @type  aggs: L{Aggs}
    @keyword aggs: Aggregations used in the search.
    @type  search_from: integer
    @keyword search_from: Offset used in paginations (default: 0).
    @type  size: integer
    @keyword size: maximum number of hits to return (default: 20).
    @type  search_type: bool
    @keyword search_type: Set search type = count for aggregations.
    @type  idx: string
    @keyword idx: index to search (default: default index defined in settings).
    @type  idx_type: string
    @keyword idx_type: index type (default: '').
    @type  qsort: Sort
    @keyword qsort: defines sorting for the query.
    @type  url: string
    @keyword url: Elastic URL (default: default cluster URL).
    '''
    # Resolve the default index at call time: a default argument of
    # ElasticSettings.idx('DEFAULT') would be evaluated once at import time
    # and would not reflect later settings changes.
    if idx is None:
        idx = ElasticSettings.idx('DEFAULT')
    if search_query is not None:
        if not isinstance(search_query, ElasticQuery):
            raise QueryError("not an ElasticQuery")
        self.query = search_query.query
    if aggs is not None:
        if hasattr(self, 'query'):
            self.query.update(aggs.aggs)
        else:
            self.query = aggs.aggs
    if qsort is not None:
        if not isinstance(qsort, Sort):
            raise QueryError("not a Sort")
        if hasattr(self, 'query'):
            self.query.update(qsort.qsort)
        else:
            logger.error("no query to sort")
    if elastic_url is None:
        elastic_url = ElasticSettings.url()
    self.size = size
    self.search_from = search_from
    self.search_type = search_type
    self.idx = idx
    self.idx_type = idx_type
    self.elastic_url = elastic_url
    if self.search_type is None:
        self.url = (self.idx + '/' + self.idx_type + '/_search?size=' + str(self.size) +
                    '&from='+str(self.search_from))
    else:
        self.url = (self.idx + '/' + self.idx_type + '/_search?search_type='+search_type)
def test_get_label(self):
    ''' Test method for getting the index or type label in the settings. '''
    # unknown key raises a settings error
    self.assertRaises(SettingsError, ElasticSettings.get_label, 'ABC')
    type_label = ElasticSettings.get_label('MARKER', idx_type='MARKER', label='description')
    self.assertTrue(isinstance(type_label, str))
    self.assertTrue(isinstance(ElasticSettings.get_label('MARKER'), str))
def test_scan_and_scroll(self):
    ''' Test scan and scroll interface. '''
    def check_hits(resp_json):
        # every page of results should carry at least one hit
        self.assertTrue('hits' in resp_json, 'scan and scroll hits')
        self.assertGreaterEqual(len(resp_json['hits']['hits']), 1)
    default_idx = ElasticSettings.idx('DEFAULT')
    ScanAndScroll.scan_and_scroll(default_idx, call_fun=check_hits)
    ScanAndScroll.scan_and_scroll(default_idx, call_fun=check_hits,
                                  query=ElasticQuery.query_string("rs2476601", fields=["id"]))
def get_elastic_settings_with_user_uploads(cls, elastic_dict=None, new_upload_file=None):
    '''Get the updated elastic settings with user uploaded idx_types.

    @keyword elastic_dict: elastic settings dict to update (defaults to
        ElasticSettings.attrs().get('IDX')).
    @keyword new_upload_file: name of a just-uploaded file to include even
        before its permission model exists.
    Returns the updated dict, or None when the index mapping cannot be read.
    '''
    idx_key = 'CP_STATS_UD'
    idx = ElasticSettings.idx(idx_key)
    # Check which index types exist in elastic; only those with a matching
    # contenttype/model are exposed.
    elastic_url = ElasticSettings.url()
    url = idx + '/_mapping'
    response = Search.elastic_request(elastic_url, url, is_post=False)
    # NOTE(review): Search.get_mapping is not used here, presumably because
    # it is not a class method -- confirm.
    if "error" in response.json():
        logger.warn(response.json())
        return None
    # get idx_types from _mapping
    elastic_mapping = json.loads(response.content.decode("utf-8"))
    # The mapping response is keyed by the concrete index name, which can
    # differ from 'idx' when aliases are deployed -- use the response key
    # rather than the requested name.
    idx = list(elastic_mapping.keys())[0]
    idx_types = list(elastic_mapping[idx]['mappings'].keys())
    if elastic_dict is None:
        elastic_dict = ElasticSettings.attrs().get('IDX')
    idx_type_dict = {}
    existing_ct = [ct.name for ct in ContentType.objects.filter(app_label=cls.PERMISSION_MODEL_APP_NAME)]
    for idx_type in idx_types:
        idx_type_with_suffix = idx_type + cls.PERMISSION_MODEL_TYPE_SUFFIX
        for ct in existing_ct:
            if ct.endswith(idx_type_with_suffix):
                meta_url = idx + '/' + idx_type + '/_meta/_source'
                meta_response = Search.elastic_request(elastic_url, meta_url, is_post=False)
                try:
                    elastic_meta = json.loads(meta_response.content.decode("utf-8"))
                    label = elastic_meta['label']
                except (ValueError, KeyError):
                    # missing or malformed _meta document: fall back to a
                    # default label (was a bare except, which also hid
                    # unrelated errors)
                    label = "UD-" + idx_type
                idx_type_dict['UD-' + idx_type.upper()] = {'label': label, 'type': idx_type}
    if new_upload_file is not None:
        idx_type = new_upload_file
        label = "UD-" + idx_type
        idx_type_dict['UD-' + idx_type.upper()] = {'label': label, 'type': idx_type}
    elastic_dict['CP_STATS_UD']['idx_type'] = idx_type_dict
    return elastic_dict
def tearDownModule():
    ''' Remove staged files and the test indices created by this module. '''
    stage_dir = TEST_DATA_DIR + "/STAGE"
    if os.path.exists(stage_dir):
        shutil.rmtree(stage_dir)
    # remove indices created for the tests
    ini_config = IniParser().read_ini(MY_INI_FILE)
    for section in ("GENE_HISTORY", "DBSNP"):
        requests.delete(ElasticSettings.url() + "/" + ini_config[section]["index"])
    os.remove(MY_INI_FILE)
    ens_dir = os.path.join(TEST_DATA_DIR, "DOWNLOAD", "ENSMART_GENE")
    if os.path.exists(ens_dir):
        shutil.rmtree(ens_dir)
def test_query_ids(self):
    ''' Test by query ids. '''
    default_idx = ElasticSettings.idx('DEFAULT')
    docs = Search(ElasticQuery(Query.ids(['1', '2'])), idx=default_idx, size=5).search().docs
    self.assertTrue(len(docs) == 2, "Elastic string query retrieved marker (rs*)")
    # restrict an ids query to the type of the first document
    idx_type = docs[0].type()
    docs = Search(ElasticQuery(Query.ids('2', types=idx_type)), idx=default_idx, size=5).search().docs
    self.assertTrue(len(docs) == 1, "Elastic string query retrieved marker (rs*)")
def factory(hit):
    ''' Factory method for creating specific document object based on index
    type of the hit.
    @type hit: dict
    @param hit: Elasticsearch hit.
    '''
    # map the hit's concrete index/type names back to settings keys
    (idx, idx_type) = ElasticSettings.get_idx_key_by_name(hit['_index'], idx_type_name=hit['_type'])
    if idx is None or idx_type is None:
        # unknown index/type: fall back to the generic document class
        return PydginDocument(hit)
    doc_class = ElasticSettings.get_label(idx, idx_type, label='class')
    # NOTE(review): this assumes get_label(label='class') returns a callable
    # class; elsewhere in this codebase the same label is a dotted string
    # that is resolved with import_string first -- confirm which is correct.
    return doc_class(hit) if doc_class is not None else PydginDocument(hit)
def add_arguments(self, parser):
    ''' Define command-line options for the snapshot management command. '''
    default_repo = ElasticSettings.getattr('REPOSITORY')
    parser.add_argument('--snapshot', dest='snapshot', default='_all', help='Snapshot name')
    parser.add_argument('--repo', dest='repo', default=default_repo,
                        metavar=default_repo, help='Repository name')
    parser.add_argument('--all', dest='all', action='store_true', help='List all repositories')
def factory(hit):
    ''' Factory method for creating types of documents based on their
    elasticsearch index type.
    @type hit: dict
    @param hit: elasticsearch hit.
    '''
    # resolve the settings keys for the hit's concrete index/type names
    (idx, idx_type) = ElasticSettings.get_idx_key_by_name(hit['_index'], idx_type_name=hit['_type'])
    if idx is None or idx_type is None:
        return PydginDocument(hit)
    # the 'class' label is a dotted path string; import it to get the class
    doc_class_str = ElasticSettings.get_label(idx, idx_type, label='class')
    if doc_class_str is None:
        return PydginDocument(hit)
    doc_class = import_string(doc_class_str)
    return doc_class(hit) if doc_class is not None else PydginDocument(hit)
def test_term(self):
    ''' Terms Aggregation '''
    agg_name = "test"
    seqid_aggs = Aggs(Agg(agg_name, "terms", {"field": "seqid", "size": 0}))
    default_idx = ElasticSettings.idx('DEFAULT')
    r_aggs = Search(aggs=seqid_aggs, idx=default_idx).search().aggs
    self.assertTrue(agg_name in r_aggs, "returned test aggregations")
    # Ids Query with Terms Aggregation
    ids_query = ElasticQuery(Query.ids(['1', '2']))
    r_aggs = Search(search_query=ids_query, aggs=seqid_aggs,
                    idx=default_idx, size=5).search().aggs
    self.assertTrue(len(r_aggs[agg_name].get_buckets()) > 0, "returned test aggregation buckets")
    self.assertTrue(getattr(r_aggs[agg_name], 'buckets')[0]['doc_count'] >= 0,
                    "bucket document count")
def test_mapping(self):
    ''' Test retrieving the mapping for an index. '''
    default_idx = ElasticSettings.idx('DEFAULT')
    elastic = Search(idx=default_idx)
    mapping = elastic.get_mapping()
    self.assertTrue(default_idx in mapping, "Database name in mapping result")
    if default_idx in mapping:
        self.assertTrue("mappings" in mapping[default_idx], "Mapping result found")
    # check using the index type
    mapping = elastic.get_mapping('marker')
    self.assertTrue(default_idx in mapping, "Database name in mapping result")
    # an unknown index type yields an error response
    mapping = elastic.get_mapping('marker/xx')
    self.assertTrue('error' in mapping, "Database name in mapping result")
def get_criteria_index_types(cls, idx_key):
    ''' Return the list of mapping (index) types defined for the criteria
    index identified by idx_key, or None if the mapping cannot be read. '''
    idx = ElasticSettings.idx(idx_key)
    elastic_url = ElasticSettings.url()
    url = idx + '/_mappings'
    response = Search.elastic_request(elastic_url, url, is_post=False)
    if "error" in response.json():
        logger.warn(response.json())
        return None
    # get idx_types from _mapping
    elastic_mapping = json.loads(response.content.decode("utf-8"))
    # The mapping response is keyed by the concrete index name, which can
    # differ from the requested 'idx' when aliases are deployed -- reading
    # 'elastic_mapping[idx]' directly would raise KeyError in that case.
    idx = list(elastic_mapping.keys())[0]
    idx_types = list(elastic_mapping[idx]['mappings'].keys())
    return idx_types
def setUp(self):
    ''' Create a request factory and a READ-group user, then cache the
    tissue lists for every chicp target index the user may access. '''
    # Every test needs access to the request factory.
    self.factory = RequestFactory()
    self.group, created = Group.objects.get_or_create(name='READ')  # @UnusedVariable
    self.user = User.objects.create_user(
        username='******', email='*****@*****.**', password='******')
    self.user.groups.add(self.group)
    # determine which index keys this user is authorised to read
    (idx_keys_auth, idx_type_keys_auth) = get_authenticated_idx_and_idx_types(  # @UnusedVariable
        user=self.user, idx_keys=None, idx_type_keys=None)
    for target in getattr(chicp_settings, 'CP_TARGET'):
        if 'CP_TARGET_'+target not in idx_keys_auth:
            continue
        # read the tissue types from the gene_target mapping _meta section
        elasticJSON = Search(idx=ElasticSettings.idx('CP_TARGET_'+target)).get_mapping(mapping_type="gene_target")
        tissueList = list(elasticJSON[ElasticSettings.idx('CP_TARGET_'+target)]
                          ['mappings']['gene_target']['_meta']['tissue_type'].keys())
        utils.tissues['CP_TARGET_'+target] = tissueList
def marker_page(request):
    ''' Renders a marker page. Looks the marker up by id/rscurrent (for rs
    identifiers) or by name, and gathers marker, immunochip and rs-merge
    history documents plus any criteria for the template context. '''
    query_dict = request.GET
    marker = query_dict.get("m")
    if marker is None:
        # was 'No gene name given.' -- wrong message for the marker page
        messages.error(request, 'No marker name given.')
        raise Http404()
    fields = ['id', 'rscurrent'] if marker.startswith("rs") else ['name']
    sub_agg = Agg('top_hits', 'top_hits', {"size": 15})
    aggs = Aggs(Agg("types", "terms", {"field": "_type"}, sub_agg=sub_agg))
    query = ElasticQuery(Query.query_string(marker, fields=fields))
    elastic = Search(search_query=query, idx=ElasticSettings.idx('MARKER'), aggs=aggs, size=0)
    res = elastic.search()
    if res.hits_total >= 1:
        types = getattr(res.aggs['types'], 'buckets')
        marker_doc = None
        ic_docs = []
        history_docs = []
        # bucket key identifies the document type of each top hit
        for doc_type in types:
            hits = doc_type['top_hits']['hits']['hits']
            for hit in hits:
                doc = Document(hit)
                if 'marker' == doc_type['key']:
                    marker_doc = doc
                elif 'immunochip' == doc_type['key']:
                    ic_docs.append(doc)
                elif 'rs_merge' == doc_type['key']:
                    history_docs.append(doc)
        criteria = {}
        if marker_doc is not None:
            if ElasticSettings.idx('CRITERIA') is not None:
                criteria = views.get_criteria([marker_doc], 'marker', 'id', 'MARKER')
            marker_doc.marker_build = _get_marker_build(ElasticSettings.idx('MARKER'))
        context = {
            'marker': marker_doc,
            'old_dbsnp_docs': _get_old_dbsnps(marker),
            'ic': ic_docs,
            'history': history_docs,
            'criteria': criteria
        }
        return render(request, 'marker/marker.html', context, content_type='text/html')
    elif res.hits_total == 0:
        messages.error(request, 'Marker '+marker+' not found.')
        raise Http404()
def scan_and_scroll(self, idx, call_fun=None, idx_type='', url=None, time_to_keep_scoll=1, query=None):
    ''' Scan and scroll an index and optionally provide a function argument to
    process the hits.

    @param idx: index name to scan.
    @keyword call_fun: callable invoked with each page's JSON response.
    @keyword idx_type: optional index type to restrict the scan.
    @keyword url: elastic URL (default: cluster URL from settings).
    @keyword time_to_keep_scoll: scroll context lifetime in minutes.
    @keyword query: optional L{ElasticQuery} to filter scanned documents.
    @raise QueryError: if query is not an ElasticQuery.
    '''
    if url is None:
        url = ElasticSettings.url()
    # open a scan-type search keeping the scroll context alive for
    # 'time_to_keep_scoll' minutes
    url_search_scan = (idx + '/' + idx_type + '/_search?search_type=scan&scroll=' +
                       str(time_to_keep_scoll) + 'm')
    if query is None:
        # default: match everything, page size 1000
        query = {
            "query": {"match_all": {}},
            "size": 1000
        }
    else:
        if not isinstance(query, ElasticQuery):
            raise QueryError("not a Query")
        query = query.query
    response = Search.elastic_request(url, url_search_scan, data=json.dumps(query))
    _scroll_id = response.json()['_scroll_id']
    url_scan_scroll = '_search/scroll?scroll=' + str(time_to_keep_scoll) + 'm'
    count = 0
    # request pages with the latest scroll id until an empty page signals
    # the end of the result set
    while True:
        response = Search.elastic_request(url, url_scan_scroll, data=_scroll_id)
        _scroll_id = response.json()['_scroll_id']
        hits = response.json()['hits']['hits']
        nhits = len(hits)
        if nhits == 0:
            break
        count += nhits
        if call_fun is not None:
            call_fun(response.json())
    logger.debug("Scanned No. Docs ( "+idx+"/"+idx_type+" ) = "+str(count))
def get_hits_by_study_id(cls, study_id, sources=None):
    ''' Get visible/authenticated hits for a study.

    Resolves each hit's ensembl gene IDs to symbols and attaches formatted
    location strings for the default build. '''
    if sources is None:
        # avoid a mutable default argument
        sources = []
    hits_query = ElasticQuery(BoolQuery(must_arr=Query.term('dil_study_id', study_id),
                                        b_filter=Filter(Query.missing_terms("field", "group_name"))),
                              sources=sources)
    docs = Search(hits_query, idx=ElasticSettings.idx('REGION', 'STUDY_HITS'),
                  size=1000).search().docs
    ens_ids = [gene for doc in docs if getattr(doc, 'genes') for gene in getattr(doc, 'genes')]
    gene_docs = utils.get_gene_docs_by_ensembl_id(ens_ids, ['symbol'])
    for doc in docs:
        if getattr(doc, 'genes'):
            genes = {}
            for ens_id in getattr(doc, 'genes'):
                try:
                    genes[ens_id] = getattr(gene_docs[ens_id], 'symbol')
                except KeyError:
                    # unknown ensembl id: fall back to the id itself.
                    # (previously this re-assigned the whole dict, discarding
                    # symbols already collected for this doc)
                    genes[ens_id] = ens_id
            setattr(doc, 'genes', genes)
        build_info = getattr(doc, 'build_info')
        for bi in build_info:
            if bi['build'] == settings.DEFAULT_BUILD:
                setattr(doc, "loc", "chr" + bi['seqid'] + ":" +
                        str(locale.format("%d", bi['start'], grouping=True)) + "-" +
                        str(locale.format("%d", bi['end'], grouping=True)))
                setattr(doc, "encoded_loc", "chr" + bi['seqid'] + "%3A" +
                        str(bi['start']) + ".." + str(bi['end']))
    return docs
def _get_pub_docs_by_pmid(pmids, sources=None):
    """ Fetch publication documents for an array of PubMed IDs. A dictionary
    is returned keyed by PMID with the publication document as value. """
    search = Search(ElasticQuery(Query.ids(pmids), sources=sources),
                    idx=ElasticSettings.idx("PUBLICATION"), size=len(pmids))
    return dict((doc.doc_id(), doc) for doc in search.search().docs)
def filter_queryset(self, request, queryset, view):
    ''' Override this method to request just the documents required from Rserve.

    Looks the marker up in elastic to find its chromosome, then asks Rserve
    for the population data of the selected dataset. '''
    # default set before the try block so the except handler can always
    # reference it (previously a failure before its assignment would raise
    # NameError inside the handler)
    mid1 = 'rs2476601'
    try:
        filterable = getattr(view, 'filter_fields', [])
        filters = dict([(k, v) for k, v in request.GET.items() if k in filterable])
        mid1 = filters.get('marker', 'rs2476601')
        dataset = filters.get('dataset', 'EUR').replace('-', '')
        query = ElasticQuery(BoolQuery(must_arr=[Query.term("id", mid1)]),
                             sources=['seqid', 'start'])
        elastic = Search(search_query=query, idx=ElasticSettings.idx('MARKER', 'MARKER'), size=1)
        doc = elastic.search().docs[0]
        seqid = getattr(doc, 'seqid')
        rserve = getattr(settings, 'RSERVE')
        conn = pyRserve.connect(host=rserve.get('HOST'), port=rserve.get('PORT'))
        pop_str = conn.r.get_pop(dataset, seqid, mid1)
        pops = json.loads(str(pop_str))
        populations = []
        for pop in pops:
            pops[pop]['population'] = pop
            populations.append(pops[pop])
        conn.close()
        return [ElasticObject(initial={'populations': populations, 'marker': mid1})]
    except (TypeError, ValueError, IndexError, ConnectionError):
        # marker not found / Rserve unavailable: return an empty result
        return [ElasticObject(initial={'populations': None, 'marker': mid1})]
def get_diseases(self):
    ''' Overridden get diseases for feature. Returns the disease codes
    tagged against this region in the criteria index. '''
    if not super(RegionDocument, self).get_diseases():
        return []
    idx = ElasticSettings.idx('REGION_CRITERIA')
    tags = Criteria.get_disease_tags(getattr(self, "region_id"), idx=idx)
    return [getattr(d, "code") for d in tags]
def test_gene_interactions(self):
    '''Fetch random genes from elastic and compare the same with the results
    fetched via ensembl restful query.'''
    # elastic doc example:
    # "_source":{"interaction_source": "intact", "interactors": [
    #     {"interactor": "ENSG00000206053", "pubmed": "16169070"},
    #     {"interactor": "ENSG00000101474", "pubmed": "16169070"},
    #     {"interactor": "ENSG00000065361", "pubmed": "16169070"},
    #     {"interactor": "ENSG00000085465", "pubmed": "16169070"}]}
    idx_key = 'GENE'
    idx_type_key = 'INTERACTIONS'
    idx = ElasticSettings.idx(idx_key, idx_type_key)
    (idx, idx_type) = idx.split('/')
    # Test doc count (the assertion message previously claimed 60000 while
    # the checked threshold is 23000)
    doc_count = DataIntegrityUtils.get_docs_count(idx, idx_type)
    self.assertGreater(doc_count, 23000, 'Gene doc count greater than 23000')
    # Get interaction doc - passing the interaction source and id. Also test with random id
    (child_doc_bioplex, parent_doc_bioplex) = self.get_interaction_doc("bioplex", "ENSG00000241186")
    self.check_bioplex_data(child_doc_bioplex, parent_doc_bioplex)
    (child_doc_bioplex, parent_doc_bioplex) = self.get_interaction_doc("bioplex")
    self.check_bioplex_data(child_doc_bioplex, parent_doc_bioplex)
    (child_doc_intact, parent_doc_intact) = self.get_interaction_doc("intact",
                                                                     parent_id="ENSG00000090776")
    self.check_intact_data(child_doc_intact, parent_doc_intact)
    (child_doc_intact, parent_doc_intact) = self.get_interaction_doc("intact")
    self.check_intact_data(child_doc_intact, parent_doc_intact)
def chicpeaFileUpload(request, url):
    ''' Index user-uploaded BED files and return the created SNP tracks as
    JSON. Files that do not have exactly 5 columns are skipped. '''
    filesDict = request.FILES
    files = filesDict.getlist("files[]")
    print(files)
    snpTracks = list()
    idx = getattr(chicp_settings, 'CHICP_IDX').get('userdata').get('INDEX')
    for f in files:
        # read the file once -- readlines() consumes the stream, so a second
        # readlines() call (as previously written) would return an empty list
        lines = f.readlines()
        line = lines[0].decode()
        if line.startswith("#"):
            # skip a leading comment/header line
            line = lines[1].decode()
        parts = re.split(r"\t", line)
        if re.match(r"\s", line):
            parts = re.split(r"\s", line)
        if len(parts) != 5:
            # str(...) needed: concatenating the int directly raised TypeError
            logger.warn("WARNING: unexpected number of columns (" + str(len(parts)) + "): " + line)
            continue
        f.seek(0)
        bedFile = NamedTemporaryFile(delete=False)
        bedFile.write(f.read())
        bedFile.close()
        idx_type = os.path.basename(bedFile.name)
        snpTracks.append({"value": idx_type, "text": f.name})
        os.system("curl -XDELETE '"+ElasticSettings.url()+"/"+idx+"/"+idx_type+"'")
        call_command("index_search", indexName=idx, indexType=idx_type, indexBED=bedFile.name)
        logger.debug("--indexName "+idx+" --indexType "+idx_type+" --indexBED "+bedFile.name)
        # remove the temporary file; the previous 'bedFile.delete' was a
        # no-op attribute access that never deleted anything
        os.remove(bedFile.name)
    context = dict()
    context['userSNPTracks'] = snpTracks
    return HttpResponse(json.dumps(context), content_type="application/json")
def get_diseases(self):
    ''' Overridden get diseases for feature. Returns the disease codes
    tagged against this study in the criteria index. '''
    if not super(StudyDocument, self).get_diseases():
        return []
    tags = Criteria.get_disease_tags(self.get_name(),
                                     idx=ElasticSettings.idx('STUDY_CRITERIA'))
    return [getattr(d, "code") for d in tags]
def test_filter(self):
    ''' Filter Aggregation with metric sub-aggregations on 'start'. '''
    metric_aggs = [Agg('test_filter', 'filter', RangeQuery('start', gt='25000')),
                   Agg('avg_start', 'avg', {"field": 'start'}),
                   Agg('min_start', 'min', {"field": 'start'}),
                   Agg('sum_start', 'sum', {"field": 'start'}),
                   Agg('stats_start', 'stats', {"field": 'start'}),
                   Agg('count_start', 'value_count', {"field": 'start'}),
                   Agg('ext_stats_start', 'extended_stats', {"field": 'start'})]
    r_aggs = Search(aggs=Aggs(metric_aggs), idx=ElasticSettings.idx('DEFAULT')).search().aggs
    self.assertTrue('avg_start' in r_aggs, "returned avg aggregation")
    self.assertTrue('min_start' in r_aggs, "returned min aggregation")
    stats_keys = ["min", "max", "sum", "count", "avg"]
    self.assertTrue(all(hasattr(r_aggs['stats_start'], k) for k in stats_keys),
                    "returned min aggregation")
    stats_keys.extend(["sum_of_squares", "variance", "std_deviation", "std_deviation_bounds"])
    self.assertTrue(all(hasattr(r_aggs['ext_stats_start'], k) for k in stats_keys),
                    "returned min aggregation")
def get_gene_docs_by_ensembl_id(cls, ens_ids, sources=None):
    ''' Get the gene documents for the corresponding array of ensembl IDs.
    A dictionary is returned with the key being the ensembl ID and the
    value the gene document. '''
    search = Search(ElasticQuery(Query.ids(ens_ids), sources=sources),
                    idx=ElasticSettings.idx('GENE', idx_type='GENE'), size=len(ens_ids))
    return dict((doc.doc_id(), doc) for doc in search.search().docs)
def test_string_query(self):
    ''' Test building and running a string query. '''
    search = Search(ElasticQuery.query_string("rs2476601", fields=["id"]),
                    idx=ElasticSettings.idx('DEFAULT'))
    result = search.search()
    self.assertTrue(len(result.docs) == 1, "Elastic string query retrieved marker (rs2476601)")
    # an unknown keyword argument raises a QueryError
    self.assertRaises(QueryError, ElasticQuery.query_string, "rs2476601", fieldssss=["id"])
def test_missing_terms_filtered_query(self):
    ''' Test filtered query with a missing terms filter. '''
    missing_filter = TermsFilter.get_missing_terms_filter("field", "group_name")
    search = Search(ElasticQuery.filtered(Query.match_all(), missing_filter),
                    idx=ElasticSettings.idx('DEFAULT'))
    public_docs = search.search().docs
    self.assertTrue(len(public_docs) == 3, "Elastic string query retrieved all public docs")
def _get_random_marker(self):
    ''' Get a random marker from the dbSNP elastic index. '''
    (idx, idx_type) = ElasticSettings.idx('MARKER', 'MARKER').split('/')
    # pick a random chromosome and restrict to well-weighted markers
    chromosome = random.randint(1, 10)
    qbool = BoolQuery(must_arr=[Query.term("seqid", chromosome),
                                RangeQuery("tags.weight", gte=80)])
    docs = ElasticUtils.get_rdm_docs(idx, idx_type, qbool=qbool, sources=['id', 'start'], size=1)
    return getattr(docs[0], 'id')
def gene_in_region(cls, hit, section=None, config=None, result_container=None):
    ''' Populate the result container with the genes overlapping the region hit.

    @param hit: elastic hit for a region document.
    @keyword result_container: dict of results to extend; a fresh dict is
        created when not supplied. (Previously the default was a literal {},
        which Python shares across calls -- results leaked between calls.)
    '''
    if result_container is None:
        result_container = {}
    try:
        padded_region_doc = utils.Region.pad_region_doc(Document(hit))
    except Exception:
        # padding is best-effort: log and return the container unchanged
        # (was a bare except, which also swallowed KeyboardInterrupt/SystemExit)
        logger.warn('Region padding error ')
        return result_container
    # 'build_info': {'end': 22411939, 'seqid': '1', 'build': 38, 'start': 22326008},
    # 'region_id': '1p36.12_008'}
    region_id = getattr(padded_region_doc, "region_id")
    region_name = getattr(padded_region_doc, "region_name")
    build_info = getattr(padded_region_doc, "build_info")
    diseases = getattr(padded_region_doc, "tags")['disease']
    seqid = build_info['seqid']
    start = build_info['start']
    end = build_info['end']
    gene_index = ElasticSettings.idx('GENE', idx_type='GENE')
    # find all genes whose span overlaps the padded region
    elastic = Search.range_overlap_query(seqid=seqid, start_range=start, end_range=end,
                                         idx=gene_index, field_list=['start', 'stop', '_id'],
                                         seqid_param="chromosome", end_param="stop", size=10000)
    genes = set(doc.doc_id() for doc in elastic.search().docs)
    return cls.populate_container(region_id, region_name, fnotes=None, features=genes,
                                  diseases=diseases, result_container=result_container)
def test_get_criteria_details(self):
    ''' Fetch a random marker-criteria doc and check that
    get_criteria_details returns a matching hit with feature details. '''
    config = IniParser().read_ini(MY_INI_FILE)
    idx = ElasticSettings.idx('MARKER_CRITERIA')
    available_criterias = MarkerCriteria.get_available_criterias(config=config)['marker']
    idx_type = ','.join(available_criterias)
    doc_by_idx_type = ElasticUtils.get_rdm_docs(idx, idx_type, size=1)
    self.assertTrue(len(doc_by_idx_type) == 1)
    feature_id = getattr(doc_by_idx_type[0], 'qid')
    criteria_details = MarkerCriteria.get_criteria_details(feature_id, config=config)
    hits = criteria_details['hits']
    first_hit = hits[0]
    _type = first_hit['_type']
    _index = first_hit['_index']
    _id = first_hit['_id']
    _source = first_hit['_source']
    disease_tag = _source['disease_tags'][0]
    # assertEqual, not assertTrue(a, b): assertTrue's second argument is the
    # failure message, so the previous code never compared the two values
    self.assertEqual(feature_id, _id)
    self.assertIn(_type, idx_type)
    self.assertEqual(idx, _index)
    self.assertIn(disease_tag, list(_source.keys()))
    fdetails = _source[disease_tag][0]
    self.assertIn('fid', fdetails.keys())
    self.assertIn('fname', fdetails.keys())
def association_stats(request, sources=None):
    ''' Get association statistics for a given marker ID. '''
    seqid = request.GET.get('chr').replace('chr', '')
    idx_type = request.GET.get('idx_type').upper()
    start = request.GET.get('start')
    end = request.GET.get('end')
    data = []

    def get_stats(resp_json):
        ''' Scan-and-scroll callback: collect one row per hit. '''
        for hit in resp_json['hits']['hits']:
            doc = Document(hit)
            data.append({"CHROM": getattr(doc, 'seqid'),
                         "POS": getattr(doc, 'position'),
                         "PVALUE": getattr(doc, 'p_value'),
                         "DBSNP_ID": getattr(doc, 'marker')})

    # restrict to a position range only when both bounds were supplied
    if start is not None and end is not None:
        must = [Query.query_string(seqid, fields=["seqid"]),
                RangeQuery("position", gte=start, lte=end)]
        query = ElasticQuery(BoolQuery(must_arr=must), sources=sources)
    else:
        query = ElasticQuery(Query.query_string(seqid, fields=["seqid"]), sources=sources)

    ScanAndScroll.scan_and_scroll(ElasticSettings.idx('IC_STATS', idx_type),
                                  call_fun=get_stats, query=query)
    return JsonResponse({"variants": data})
def get_interaction_doc(self, interaction_source='intact', parent_id=None):
    ''' Fetch a random (or parent-specific) interaction doc plus its parent
    gene doc; retries via the 'intact' source when the parent is missing. '''
    idx_key = 'GENE'
    parent_idx_key = 'GENE'
    (idx, idx_type) = ElasticSettings.idx(idx_key, 'INTERACTIONS').split('/')

    if parent_id:
        qbool = BoolQuery().must([Query.term("interaction_source", interaction_source),
                                  Query.term("_parent", parent_id)])
    else:
        qbool = BoolQuery().should([Query.term("interaction_source", interaction_source)])

    # random doc, or the specific one when a parent id constrains the query
    docs_by_geneid = DataIntegrityUtils.get_rdm_docs(idx, idx_type, qbool=qbool,
                                                     sources=[], size=1)
    doc = docs_by_geneid[0]

    # resolve the parent gene document
    parent_id = doc.parent()
    parent_docs = DataIntegrityUtils.fetch_from_elastic(idx_key, parent_idx_key, [parent_id])
    if not parent_docs:
        return self.get_interaction_doc("intact", parent_id)
    self.assertTrue(len(parent_docs) >= 1, "Found 1 parent")
    return doc, parent_docs[0]
def test_gene_criteria_types(self):
    """Test if the indexes have records"""
    idx_key = "GENE_CRITERIA"
    feature_type = "gene"
    idx = ElasticSettings.idx(idx_key)
    idx_types = CriteriaDataIntegrityUtils.get_criteria_index_types(idx_key)
    gene_criterias = Criteria.get_available_criterias(feature_type)
    CriteriaDataIntegrityTestUtils().test_criteria_types(idx, idx_types, gene_criterias["gene"])
    CriteriaDataIntegrityTestUtils().test_criteria_mappings(idx, idx_types)

    # get random doc for each type ['gene_in_region', 'cand_gene_in_region',
    # 'cand_gene_in_study', 'is_gene_in_mhc']
    idx_type = "gene_in_region"
    doc_by_idx_type = ElasticUtils.get_rdm_docs(idx, idx_type, size=1)
    self.assertTrue(len(doc_by_idx_type) == 1, "got back one document")
    gene_in_region_doc = doc_by_idx_type[0]

    # example doc:
    # {'score': 10, 'CRO': [{'fname': '4p11', 'fid': '4p11_005'}],
    #  '_meta': {'_type': 'gene_in_region', '_score': 0.9997835,
    #  '_index': 'pydgin_imb_criteria_gene', '_id': 'ENSG00000250753'},
    #  'disease_tags': ['CRO'], 'qid': 'ENSG00000250753'}
    # BUG FIX: debug print() calls replaced with real assertions.
    qid = getattr(gene_in_region_doc, "qid")
    self.assertIsNotNone(qid, "doc has a qid")
    disease_tags = getattr(gene_in_region_doc, "disease_tags")
    self.assertTrue(len(disease_tags) > 0, "doc has disease tags")
    # feature notes for the first disease tag, e.g.
    # [{'fid': '5q31.1_013', 'fname': '5q31.1'}]
    fnotes = getattr(gene_in_region_doc, disease_tags[0])
    region_id = fnotes[0]["fid"]
    self.assertIsNotNone(region_id, "feature note has a region id")
def test_significant_terms(self):
    ''' Significant Terms Aggregation '''
    aggs = Aggs(Agg("test_significant_terms", "significant_terms", {"field": "start"}))
    result = Search(aggs=aggs, idx=ElasticSettings.idx('DEFAULT')).search()
    self.assertTrue('test_significant_terms' in result.aggs, "returned aggregations")
def test_indices_str(self):
    ''' The indices string contains both index names in either order. '''
    idx_str = ElasticSettings.indices_str()
    gff_idx = IDX['GFF_GENERIC']['indexName']
    marker_idx = IDX['MARKER']['indexName']
    self.assertTrue(idx_str in [gff_idx + ',' + marker_idx,
                                marker_idx + ',' + gff_idx])
def show_es_gene_section(gene_symbol=None, seqid=None, start_pos=None, end_pos=None):
    ''' Template inclusion tag to render a gene section given a chado gene feature.

    :param gene_symbol: look the gene up by symbol when given
    :param seqid: chromosome, with or without a 'chr' prefix (stripped here)
    :param start_pos: position (or region start when end_pos is given)
    :param end_pos: region end; None means start_pos is a single SNP position
    :return: context dict with the matching gene docs
    '''
    seqid = str(seqid).replace('chr', '')
    if gene_symbol is not None:
        # gene symbol query
        query = ElasticQuery.query_match("symbol", gene_symbol)
    elif end_pos is None:
        # start and end are the same: find genes spanning a single SNP position
        query_bool = BoolQuery(must_arr=[Query.match("chromosome", seqid),
                                         RangeQuery("start", lte=start_pos),
                                         RangeQuery("stop", gte=start_pos)])
        query = ElasticQuery.bool(query_bool)
    else:
        # FIXED comment: this branch is a region query — genes contained
        # within [start_pos, end_pos], not a single-position lookup
        query_bool = BoolQuery(must_arr=[Query.match("chromosome", seqid),
                                         RangeQuery("start", gte=start_pos),
                                         RangeQuery("stop", lte=end_pos)])
        query = ElasticQuery.bool(query_bool)
    elastic = Search(query, idx=ElasticSettings.idx(name='GENE'))
    return {'es_genes': elastic.search().docs}
def _get_old_dbsnps(marker):
    ''' Get markers from old versions of DBSNP. Assumes the index key is
    prefixed by 'MARKER_'. '''
    idx_keys = ElasticSettings.getattr('IDX').keys()
    old_dbsnps_names = sorted((ElasticSettings.idx(k) for k in idx_keys if 'MARKER_' in k),
                              reverse=True)
    old_dbsnp_docs = []
    if not old_dbsnps_names:
        return old_dbsnp_docs

    search_query = ElasticQuery(Query.query_string(marker, fields=['id', 'rscurrent']))
    for idx_name in old_dbsnps_names:
        docs = Search(search_query=search_query, idx=idx_name, idx_type='marker').search().docs
        if docs:
            # keep the first hit, tagged with the build of the index it came from
            old_doc = docs[0]
            old_doc.marker_build = _get_marker_build(idx_name)
            old_dbsnp_docs.append(old_doc)
    return old_dbsnp_docs
def tearDownModule():
    ''' Remove the index, ini file and staging directory created for the tests. '''
    ini_config = IniParser().read_ini(MY_PUB_INI_FILE)
    requests.delete(ElasticSettings.url() + '/' + ini_config['DISEASE']['index'])
    os.remove(MY_PUB_INI_FILE)
    stage_dir = TEST_DATA_DIR + '/STAGE'
    if os.path.exists(stage_dir):
        shutil.rmtree(stage_dir)
def show_es_gene_section(gene_symbol=None, seqid=None, start_pos=None, end_pos=None):
    ''' Template inclusion tag to render a gene section given a chado gene feature.

    :param gene_symbol: look the gene up by symbol when given
    :param seqid: chromosome; normalised here to the 'chrN' form this index uses
    :param start_pos: position (or region start when end_pos is given)
    :param end_pos: region end; None means start_pos is a single SNP position
    :return: context dict with the matching gene docs
    '''
    # BUG FIX: the original had a no-op `seqid = seqid` branch; normalise in
    # one step instead. NOTE(review): a None seqid still becomes 'chrNone' —
    # callers appear to pass gene_symbol in that case; confirm.
    if not (isinstance(seqid, str) and seqid.startswith("chr")):
        seqid = 'chr' + str(seqid)

    if gene_symbol is not None:
        # gene symbol query
        query = ElasticQuery.query_match("gene_symbol", gene_symbol)
    elif end_pos is None:
        # start and end are the same: find genes spanning a single SNP position
        query_bool = BoolQuery(must_arr=[Query.match("seqid", seqid),
                                         RangeQuery("featureloc.start", lte=start_pos),
                                         RangeQuery("featureloc.end", gte=start_pos)])
        query = ElasticQuery.bool(query_bool)
    else:
        # FIXED comment: this branch is a region query — genes contained
        # within [start_pos, end_pos], not a single-position lookup
        query_bool = BoolQuery(must_arr=[Query.match("seqid", seqid),
                                         RangeQuery("featureloc.start", gte=start_pos),
                                         RangeQuery("featureloc.end", lte=end_pos)])
        query = ElasticQuery.bool(query_bool)
    elastic = Search(query, idx=ElasticSettings.idx(name='GENE'))
    return {'es_genes': elastic.search().docs}
def get_models_to_delete(self):
    '''Get models to delete'''
    idx_key = 'CP_STATS_UD'
    idx = ElasticSettings.idx(idx_key)
    ''' Check if an index type exists in elastic and later check there is a contenttype/model for the given elastic index type. '''  # @IgnorePep8
    elastic_url = ElasticSettings.url()
    url = idx + '/_mapping'
    response = Search.elastic_request(elastic_url, url, is_post=False)
    # bail out when the index/mapping request failed
    if "error" in response.json():
        logger.warn(response.json())
        return None
    # get idx_types from _mapping
    elastic_mapping = json.loads(response.content.decode("utf-8"))
    ## fix needed if we deploy aliasing for indices
    idx = list(elastic_mapping.keys())[0]
    idx_types = list(elastic_mapping[idx]['mappings'].keys())
    models2go = []
    expire_days = 7  # 1 week
    # add idx_types that have no docs
    # NOTE(review): the condition below appends types with ndocs > 0, which
    # contradicts this comment — looks like it may have been intended as
    # `ndocs == 0`; confirm against callers before changing.
    for idx_type in idx_types:
        ndocs = Search(idx=idx, idx_type=idx_type).get_count()['count']
        if (ndocs > 0):
            models2go.append(idx_type)
        # add idx_types that were not accessed for a given time period
        url = idx + '/' + idx_type + '/_meta'
        response = Search.elastic_request(elastic_url, url, is_post=False)
        elastic_meta = json.loads(response.content.decode("utf-8"))
        if '_source' in elastic_meta:
            uploaded_str_date = elastic_meta['_source']['uploaded']
            yymmdd_str = uploaded_str_date.split()[0]
            # Format: 2015-11-03 14:43:54.099645+00:00
            # NOTE(review): `dt` is rebound from the class to the parsed
            # instance below — works, but shadows the imported name.
            from datetime import datetime as dt
            dt = dt.strptime(yymmdd_str, '%Y-%m-%d')
            uploaded_date = dt.date()
            d1 = datetime.date.today()
            d2 = d1 - datetime.timedelta(days=expire_days)
            # expired uploads are scheduled for deletion
            if uploaded_date < d2:
                models2go.append(idx_type)
    return models2go
def test_search_props(self):
    ''' Check search props and that idx-type visibility follows model permissions. '''
    if 'pydgin_auth' not in settings.INSTALLED_APPS:
        return
    from pydgin_auth.elastic_model_factory import ElasticPermissionModelFactory
    from django.contrib.contenttypes.models import ContentType
    from django.contrib.auth.models import Group, User, Permission
    from django.shortcuts import get_object_or_404

    ElasticPermissionModelFactory.create_dynamic_models()
    props = ElasticSettings.search_props("ALL")
    self.assertIn('publications', props['idx'], 'publications found in idx')
    self.assertIn('MARKER', props['idx_keys'], 'MARKER found in idx_keys')
    self.assertIn('rs_merge', props['idx_type'], 'rs_merge found in idx_type')

    # CREATE DIL group and add test_dil user to that group
    dil_group, created = Group.objects.get_or_create(name='DILX')
    self.assertTrue(created)
    dil_user = User.objects.create_user(username='******',
                                        email='*****@*****.**',
                                        password='******')
    dil_user.groups.add(dil_group)
    self.assertTrue(dil_user.groups.filter(name='DILX').exists())

    # create permission for MARKER and IC on the dynamic model,
    # then retest to check if the idx type can be seen
    test_model_name = 'marker-ic_idx_type'
    content_type, created = ContentType.objects.get_or_create(model=test_model_name,
                                                              app_label="elastic")
    # get the permission ... already created
    can_read_permission = Permission.objects.get(content_type=content_type)
    self.assertEqual('can_read_marker-ic_idx_type', can_read_permission.codename,
                     "idx type permission correct")

    # as soon as the permission is set for an index, it becomes a restricted resource
    idx_types_visible = ElasticSettings.search_props("ALL")["idx_type"]
    self.assertFalse('immunochip' in idx_types_visible, 'immunochip idx type not visible')

    # now grant permission to dil_user and check if idx type is visible
    dil_group.permissions.add(can_read_permission)
    dil_user = get_object_or_404(User, pk=dil_user.id)
    idx_types_visible = ElasticSettings.search_props("ALL", dil_user)["idx_type"]
    self.assertTrue('immunochip' in idx_types_visible, 'immunochip idx type visible now')
def exists(cls, repo, snapshot):
    ''' Test if the repository/snapshot exists. '''
    url = ElasticSettings.url() + '/_snapshot/' + repo + '/' + snapshot
    # a 200 response means the snapshot is present
    return requests.get(url).status_code == 200
def delete_repository(cls, repo):
    ''' Delete a snapshot repository; log and return False on failure. '''
    url = ElasticSettings.url() + '/_snapshot/' + repo
    resp = requests.delete(url)
    if resp.status_code == 200:
        return True
    logger.error("Status (" + url + "): " + str(resp.status_code) + " :: " +
                 str(resp.json()["error"]))
    return False
def test_top_hits(self):
    ''' Top Hits Aggregation '''
    aggs = Aggs([Agg('test_filter', 'filter', RangeQuery('start', gt='2000')),
                 Agg('test_top_hits', 'top_hits', {"size": 1})])
    result = Search(aggs=aggs, idx=ElasticSettings.idx('DEFAULT')).search()
    top_hits = result.aggs['test_top_hits'].get_hits()
    self.assertTrue(len(top_hits) == 1, "returned the top hit")
def test_missing(self):
    ''' Missing Aggregation '''
    aggs = Aggs(Agg("test_missing", "missing", {"field": "seqid"}))
    r_aggs = Search(aggs=aggs, idx=ElasticSettings.idx('DEFAULT')).search().aggs
    # every doc in the default index carries a seqid
    self.assertTrue(getattr(r_aggs['test_missing'], 'doc_count') == 0,
                    "no missing seqid fields")
def get_count(self):
    ''' Return the elastic count for a query result '''
    url = self.idx + '/' + self.idx_type + '/_count?'
    # send the query body when one was built, otherwise an empty payload
    payload = json.dumps(self.query) if hasattr(self, 'query') else {}
    response = Search.elastic_request(ElasticSettings.url(), url, data=payload)
    return response.json()