def get_associations(self, target_id, disease_id, es):
    """Fetch up to 20 overall and direct target-disease associations.

    Builds a search (with ``track_total_hits`` enabled, ES7 semantics),
    optionally constrained by target and/or disease id, sorted by the
    harmonic-sum overall score. A filter aggregation collects the top
    direct associations alongside the overall hits.

    :param target_id: optional target id to filter on
    :param disease_id: optional disease id to filter on
    :param es: Elasticsearch client to run the query against
    :return: pair of dicts shaped like
        ({"total": [{"id": ..., "score": ...}], "direct": [...]},
         {"total": <hit count>, "direct": <hit count>})
        with every score capped at 1.0
    """
    search = (Search()
              .using(es)
              .index(self.es_index_assoc)
              .extra(track_total_hits=True))[:20]
    for field, value in (("target.id", target_id), ("disease.id", disease_id)):
        if value:
            search = search.query(
                ConstantScore(filter={"term": {field: value}}))
    search = search.sort("-harmonic-sum.overall")
    search._source = ['id', 'harmonic-sum.overall']
    direct_bucket = search.aggs.bucket(
        "direct_associations", "filter", term={"is_direct": "true"})
    direct_bucket.bucket(
        "top_direct_ass", "top_hits",
        sort={"harmonic-sum.overall": {"order": "desc"}},
        size=20,
        _source=['id', 'harmonic-sum.overall'])
    response = search.execute()

    def entry(hit):
        # Scores are capped at 1.0.
        return {"id": hit.id,
                "score": min(float(hit["harmonic-sum"]["overall"]), 1.0)}

    direct_hits = response.aggregations.direct_associations.top_direct_ass.hits
    return (
        {
            "total": [entry(h) for h in response.hits],
            "direct": [entry(h) for h in direct_hits],
        },
        {
            # ES7 totals are objects, see
            # https://www.elastic.co/guide/en/elasticsearch/reference/7.x/search-request-track-total-hits.html
            "total": int(response.hits.total.value),
            "direct": int(direct_hits.total.value),
        },
    )
def query_ids_by_dataset(self, index, dataset, offset, page_size,
                         start_time=None, end_time=None, polygon=None):
    """Return the ids of documents belonging to a dataset.

    Equivalent query:
    {
      "query": { "term": { "dataset.keyword": "area_of_interest" } },
      "fields": [ "_id" ]
    }

    :param index: Elasticsearch index/alias
    :param dataset: dataset field in ES
    :param offset: page offset from 0
    :param page_size: self-explanatory
    :param start_time: (optional) Greater than or equal of Timestamp field (start_time) in ISO format
    :param end_time: (optional) Less than of Timestamp field (end_time) in ISO format
    :param polygon: (optional) List[List[int]]
    :return: (total hit count, list of matching document ids)
    """
    search = Search(using=self.client, index=index)
    search = search.query(Q('term', dataset__keyword=dataset))
    if start_time is not None:
        search = search.query('range', starttime={'gte': start_time})
    if end_time is not None:
        search = search.query('range', endtime={'lt': end_time})
    if polygon is not None:
        shape = {'type': 'polygon', 'coordinates': polygon}
        search = search.query('geo_shape', location={'shape': shape})
    # Only the id field is needed in the response.
    search._source = ['id']
    if self.logger:
        self.logger.debug(search.to_dict())
    page = search[offset:offset + page_size]
    return page.count(), [hit['id'] for hit in page]
def get_associations(self, target_id, disease_id, es):
    """Fetch up to 20 overall and direct target-disease associations.

    Pre-ES7 variant: hit totals are plain numbers, so they are returned
    as-is (no ``.value`` access, no ``track_total_hits``).

    :param target_id: optional target id to filter on
    :param disease_id: optional disease id to filter on
    :param es: Elasticsearch client to run the query against
    :return: pair of dicts shaped like
        ({"total": [{"id": ..., "score": ...}], "direct": [...]},
         {"total": <hit count>, "direct": <hit count>})
        with every score capped at 1.0
    """
    search = Search().using(es).index(self.es_index_assoc)[:20]
    for field, value in (("target.id", target_id), ("disease.id", disease_id)):
        if value:
            search = search.query(
                ConstantScore(filter={"term": {field: value}}))
    search = search.sort("-harmonic-sum.overall")
    search._source = ['id', 'harmonic-sum.overall']
    direct_bucket = search.aggs.bucket(
        "direct_associations", "filter", term={"is_direct": "true"})
    direct_bucket.bucket(
        "top_direct_ass", "top_hits",
        sort={"harmonic-sum.overall": {"order": "desc"}},
        size=20,
        _source=['id', 'harmonic-sum.overall'])
    response = search.execute()

    def entry(hit):
        # Scores are capped at 1.0.
        return {"id": hit.id,
                "score": min(hit["harmonic-sum"]["overall"], 1.0)}

    direct_hits = response.aggregations.direct_associations.top_direct_ass.hits
    return (
        {
            "total": [entry(h) for h in response.hits],
            "direct": [entry(h) for h in direct_hits],
        },
        {
            "total": response.hits.total,
            "direct": direct_hits.total,
        },
    )
def overlaps(self, index, _id, terms, fields, offset, page_size):
    """Return documents that overlap a given dataset temporally and spatially.

    Looks up the document by ``_id``, then searches ``index`` for documents
    whose time span intersects it (endtime > its starttime AND
    starttime < its endtime) and whose location intersects its geo shape,
    additionally filtered by the given term constraints.

    Equivalent query shape:
    {
      "query": {
        "filtered": {
          "filter": { "geo_shape": { "location": { "shape": { ... } } } },
          "query": {
            "bool": { "must": [
              { "range": { "endtime":   { "gt": "<starttime>" } } },
              { "range": { "starttime": { "lt": "<endtime>" } } }
            ] }
          }
        }
      },
      "_source": [ "id", "metadata.trackNumber", "location" ]
    }

    :param index: Elasticsearch index/alias
    :param _id: id of the reference document
    :param terms: Dict of field -> value term constraints
    :param fields: fields to be returned by Elasticsearch
    :param offset: page offset from 0
    :param page_size: self-explanatory
    :return: (total hit count, list of matching documents as dicts)
    """
    # Fetch the reference document.
    doc = self.query_id(index, _id)
    if self.logger:
        self.logger.debug(json.dumps(doc, indent=2))
    if doc is None:
        raise RuntimeError("Failed to find dataset ID: {}".format(_id))

    # Pull the temporal and spatial attributes off the reference doc.
    starttime = doc.get('starttime', None)
    endtime = doc.get('endtime', None)
    location = doc.get('location', None)

    # Combine all term constraints into a single bool query.
    term_query = None
    for key, value in list(terms.items()):
        clause = Q('term', **{key.lower().replace('.', '__'): value})
        term_query = clause if term_query is None else term_query + clause

    # Temporal overlap: other.endtime > starttime AND other.starttime < endtime.
    time_query = Q()
    if starttime is not None:
        time_query += Q('range', endtime={'gt': starttime})
    if endtime is not None:
        time_query += Q('range', starttime={'lt': endtime})

    # Spatial overlap against the reference location, when present.
    geo_filter = None
    if location is not None:
        geo_filter = Q('geo_shape', location={'shape': location})

    search = Search(using=self.client, index=index)
    if term_query is not None:
        search = search.query(term_query)
    if time_query != Q():
        search = search.query(time_query)
    if geo_filter is not None:
        search = search.filter(geo_filter)
    search._source = fields
    if self.logger:
        self.logger.debug(search.to_dict())
    page = search[offset:offset + page_size]
    return search.count(), [hit.to_dict() for hit in page]
def query_fields(self, index, terms, fields, offset, page_size,
                 start_time=None, end_time=None, polygon=None):
    """Return list of documents by term bool query:
    {
      "query": {
        "bool": {
          "must": [
            { "term": { "dataset_type.keyword": "acquisition" } },
            { "term": { "dataset.keyword": "acquisition-S1-IW_SLC" } }
          ]
        }
      },
      "_source": [ "id", "metadata.trackNumber", "location" ]
    }

    :param index: Elasticsearch index/alias
    :param terms: Dict; "custom" fields to filter on (may be empty)
    :param fields: fields to be returned by Elasticsearch
    :param offset: page offset from 0
    :param page_size: self-explanatory
    :param start_time: (optional) Greater than or equal of Timestamp field (start_time) in ISO format
    :param end_time: (optional) Less than of Timestamp field (end_time) in ISO format
    :param polygon: (optional) List[List[int]]
    :return: (total hit count, list of matching documents as dicts)
    """
    # Combine all term constraints into a single bool query.
    q = None
    for field, val in list(terms.items()):
        clause = Q('term', **{field.lower().replace('.', '__'): val})
        q = clause if q is None else q + clause

    s = Search(using=self.client, index=index)
    # BUGFIX: guard against an empty `terms` dict — previously `q` stayed
    # None and was passed straight to .query(), which raises inside
    # elasticsearch-dsl. This mirrors the guard used in overlaps().
    if q is not None:
        s = s.query(q)
    if start_time is not None:
        s = s.query('range', **{'starttime': {'gte': start_time}})
    if end_time is not None:
        s = s.query('range', **{'endtime': {'lt': end_time}})
    if polygon is not None:
        s = s.query(
            'geo_shape', **{
                'location': {
                    'shape': {
                        'type': 'polygon',
                        'coordinates': polygon
                    }
                }
            })
    s._source = fields
    # sort by starttime in descending order; TODO: expose sort parameters out through API
    s = s.sort({"starttime": {"order": "desc"}})
    s = s[offset:offset + page_size]
    if self.logger:
        self.logger.debug(s.to_dict())
    return s.count(), [i.to_dict() for i in s]