def get_associations(self, target_id, disease_id, es):
        """Return the top associations (all and direct-only) plus hit totals.

        Searches ``self.es_index_assoc`` through ``es``. A term filter on
        ``target.id`` and/or ``disease.id`` is added for each id that is
        truthy. Hits are sorted by ``harmonic-sum.overall`` in descending
        order and limited to the first 20. A ``top_hits`` sub-aggregation
        nested under an ``is_direct`` filter collects up to 20 direct
        associations with the same ordering.

        :param target_id: value matched against ``target.id``; no filter is
            added when falsy
        :param disease_id: value matched against ``disease.id``; no filter is
            added when falsy
        :param es: Elasticsearch connection passed to ``Search().using``
        :return: tuple ``(associations, counts)`` where ``associations`` is
            ``{"total": [{"id": ..., "score": ...}], "direct": [...]}`` with
            every score capped at 1.0, and ``counts`` is
            ``{"total": int, "direct": int}`` holding the hit totals
        """
        wanted_fields = ['id', 'harmonic-sum.overall']

        search = Search().using(es).index(self.es_index_assoc)
        search = search.extra(track_total_hits=True)[:20]

        # one constant-score term filter per id that was actually supplied
        for field_name, wanted in (("target.id", target_id),
                                   ("disease.id", disease_id)):
            if wanted:
                search = search.query(
                    ConstantScore(filter={"term": {field_name: wanted}}))

        search = search.sort("-harmonic-sum.overall")
        search._source = wanted_fields

        # aggregations are attached to the search object in place
        direct_only = search.aggs.bucket("direct_associations",
                                         "filter",
                                         term={"is_direct": "true"})
        direct_only.bucket("top_direct_ass",
                           "top_hits",
                           sort={"harmonic-sum.overall": {"order": "desc"}},
                           size=20,
                           _source=list(wanted_fields))

        response = search.execute()
        direct_hits = (
            response.aggregations.direct_associations.top_direct_ass.hits)

        def as_entry(hit):
            # scores above 1.0 are clamped to 1.0
            return {
                "id": hit.id,
                "score": min(float(hit["harmonic-sum"]["overall"]), 1.0),
            }

        #{"total"=[{"id":"xxx","score":"y.y"}],"direct"=[{"id":"xxx","score":"y.y"}]}
        associations = {
            "total": [as_entry(hit) for hit in response.hits],
            "direct": [as_entry(hit) for hit in direct_hits],
        }
        #see https://www.elastic.co/guide/en/elasticsearch/reference/7.x/search-request-track-total-hits.html
        counts = {
            "total": int(response.hits.total.value),
            "direct": int(direct_hits.total.value),
        }
        return associations, counts
Example #2
0
    def query_ids_by_dataset(self,
                             index,
                             dataset,
                             offset,
                             page_size,
                             start_time=None,
                             end_time=None,
                             polygon=None):
        """
        Return the total count and one page of ids for a dataset.

        The base query is a term match on the dataset keyword, roughly:
        {
          "query": {
            "term": {
              "dataset.keyword": "area_of_interest"
            }
          },
          "_source": [
            "id"
          ]
        }
        Optional time-range and polygon clauses are added to that query when
        the corresponding argument is not None.

        :param index: Elasticsearch index/alias
        :param dataset: value matched against the dataset keyword field
        :param offset: page offset from 0
        :param page_size: number of ids to return in the page
        :param start_time: (optional) lower bound (gte) applied to the
            ``starttime`` field
        :param end_time: (optional) upper bound (lt) applied to the
            ``endtime`` field
        :param polygon: (optional) coordinates used as the polygon of a
            geo_shape query on the ``location`` field
        :return: tuple ``(count, ids)`` with the count reported by the search
            and the list of ``id`` values of the requested page
        """

        search = Search(using=self.client, index=index)
        search = search.query(Q('term', dataset__keyword=dataset))

        if start_time is not None:
            search = search.query('range', starttime={'gte': start_time})
        if end_time is not None:
            search = search.query('range', endtime={'lt': end_time})
        if polygon is not None:
            polygon_shape = {'type': 'polygon', 'coordinates': polygon}
            search = search.query('geo_shape',
                                  location={'shape': polygon_shape})

        # only the id field is requested back from Elasticsearch
        search._source = ['id']
        if self.logger:
            self.logger.debug(search.to_dict())

        page = search[offset:offset + page_size]
        total = page.count()
        ids = [hit['id'] for hit in page]
        return total, ids
Example #3
0
    def get_associations(self, target_id, disease_id, es):
        """Return the top associations (all and direct-only) plus hit totals.

        Searches ``self.es_index_assoc`` through ``es``. A term filter on
        ``target.id`` and/or ``disease.id`` is added for each id that is
        truthy. Hits are sorted by ``harmonic-sum.overall`` in descending
        order and limited to the first 20. A ``top_hits`` sub-aggregation
        nested under an ``is_direct`` filter collects up to 20 direct
        associations with the same ordering.

        :param target_id: value matched against ``target.id``; no filter is
            added when falsy
        :param disease_id: value matched against ``disease.id``; no filter is
            added when falsy
        :param es: Elasticsearch connection passed to ``Search().using``
        :return: tuple ``(associations, counts)`` where ``associations`` is
            ``{"total": [{"id": ..., "score": ...}], "direct": [...]}`` with
            every score capped at 1.0, and ``counts`` is
            ``{"total": ..., "direct": ...}`` holding the ``hits.total``
            values exactly as returned in the response
        """
        wanted_fields = ['id', 'harmonic-sum.overall']

        search = Search().using(es).index(self.es_index_assoc)[:20]

        # one constant-score term filter per id that was actually supplied
        for field_name, wanted in (("target.id", target_id),
                                   ("disease.id", disease_id)):
            if wanted:
                search = search.query(
                    ConstantScore(filter={"term": {field_name: wanted}}))

        search = search.sort("-harmonic-sum.overall")
        search._source = wanted_fields

        # aggregations are attached to the search object in place
        direct_only = search.aggs.bucket("direct_associations",
                                         "filter",
                                         term={"is_direct": "true"})
        direct_only.bucket("top_direct_ass",
                           "top_hits",
                           sort={"harmonic-sum.overall": {"order": "desc"}},
                           size=20,
                           _source=list(wanted_fields))

        response = search.execute()
        direct_hits = (
            response.aggregations.direct_associations.top_direct_ass.hits)

        def as_entry(hit):
            # scores above 1.0 are clamped to 1.0
            return {
                "id": hit.id,
                "score": min(hit["harmonic-sum"]["overall"], 1.0),
            }

        #{"total"=[{"id":"xxx","score":"y.y"}],"direct"=[{"id":"xxx","score":"y.y"}]}
        associations = {
            "total": [as_entry(hit) for hit in response.hits],
            "direct": [as_entry(hit) for hit in direct_hits],
        }
        counts = {
            "total": response.hits.total,
            "direct": direct_hits.total,
        }
        return associations, counts
Example #4
0
    def overlaps(self, index, _id, terms, fields, offset, page_size):
        """Return documents that overlap a reference document in time and space.

        The reference document is fetched with ``self.query_id(index, _id)``
        and its ``starttime``, ``endtime`` and ``location`` values drive the
        search. Conceptually the request combines (illustrative values):
        {
          "geo_shape": {
            "location": {
              "shape": {
                "type": "polygon",
                "coordinates": [[[123.234154, -33.344517],
                                 [120.578377, -32.708748],
                                 [121.122925, -31.111742],
                                 [123.731133, -31.73547],
                                 [123.234154, -33.344517]]]
              }
            }
          },
          "range": {"endtime": {"gt": "2017-04-18T21:09:23.789"}},
          "range": {"starttime": {"lt": "2017-04-18T21:09:50.741"}},
          "_source": ["id", "metadata.trackNumber", "location"]
        }
        Each part is only added when the reference document carries the
        corresponding value.

        :param index: Elasticsearch index/alias
        :param _id: id of the reference document
        :param terms: Dict of field -> value; each entry becomes a term
            clause (field names are lower-cased and '.' becomes '__')
        :param fields: fields to be returned by Elasticsearch
        :param offset: page offset from 0
        :param page_size: number of documents to return in the page
        :return: tuple ``(count, docs)`` with the count reported by the
            search and the requested page of hits converted to dicts
        :raises RuntimeError: when no document is found for ``_id``
        """

        # look up the reference document
        reference = self.query_id(index, _id)
        if self.logger:
            self.logger.debug(json.dumps(reference, indent=2))
        if reference is None:
            raise RuntimeError("Failed to find dataset ID: {}".format(_id))

        ref_start = reference.get('starttime')
        ref_end = reference.get('endtime')
        ref_location = reference.get('location')

        # combine one term clause per requested field
        term_query = None
        for raw_name, wanted in terms.items():
            clause = Q('term', **{raw_name.lower().replace('.', '__'): wanted})
            if term_query is None:
                term_query = clause
            else:
                term_query += clause

        # temporal overlap: candidate ends after the reference starts and
        # starts before the reference ends
        time_query = Q()
        if ref_start is not None:
            time_query += Q('range', endtime={'gt': ref_start})
        if ref_end is not None:
            time_query += Q('range', starttime={'lt': ref_end})

        # assemble the search, skipping every part that was not built
        search = Search(using=self.client, index=index)
        if term_query is not None:
            search = search.query(term_query)
        if time_query != Q():
            search = search.query(time_query)
        if ref_location is not None:
            # spatial overlap is applied as a filter on the location shape
            search = search.filter(
                Q('geo_shape', location={'shape': ref_location}))
        search._source = fields

        if self.logger:
            self.logger.debug(search.to_dict())

        total = search.count()
        page = search[offset:offset + page_size]
        return total, [hit.to_dict() for hit in page]
Example #5
0
    def query_fields(self,
                     index,
                     terms,
                     fields,
                     offset,
                     page_size,
                     start_time=None,
                     end_time=None,
                     polygon=None):
        """Return list of documents by term bool query:
        {
          "query": {
            "bool": {
              "must": [
                {
                  "term": {
                    "dataset_type.keyword": "acquisition"
                  }
                },
                {
                  "term": {
                    "dataset.keyword": "acquisition-S1-IW_SLC"
                  }
                }
              ]
            }
          },
          "_source": [
            "id",
            "metadata.trackNumber",
            "location"
          ]
        }
        :param index: Elasticsearch index/alias
        :param terms: Dict; "custom" fields to filter on. Field names are
            lower-cased and '.' is replaced by '__' before building the term
            clauses. An empty dict (or None) adds no term clause at all.
        :param fields: fields to be returned by Elasticsearch
        :param offset: page offset from 0
        :param page_size: number of documents to return in the page
        :param start_time: (optional) Greater than or equal of Timestamp field (start_time) in ISO format
        :param end_time: (optional) Less than of Timestamp field (end_time) in ISO format
        :param polygon: (optional) List[List[int]]
        :return: tuple ``(count, docs)`` with the count reported by the
            search and the requested page of hits converted to dicts
        """

        # combine one term clause per requested field
        q = None
        for field, val in (terms or {}).items():
            clause = Q('term', **{field.lower().replace('.', '__'): val})
            if q is None:
                q = clause
            else:
                q += clause

        s = Search(using=self.client, index=index)
        # With no terms there is nothing to combine and q stays None; passing
        # None to Search.query() is not a valid query, so only apply the term
        # query when one was built (same guard as in overlaps()).
        if q is not None:
            s = s.query(q)
        if start_time is not None:
            s = s.query('range', **{'starttime': {'gte': start_time}})
        if end_time is not None:
            s = s.query('range', **{'endtime': {'lt': end_time}})
        if polygon is not None:
            s = s.query(
                'geo_shape', **{
                    'location': {
                        'shape': {
                            'type': 'polygon',
                            'coordinates': polygon
                        }
                    }
                })

        s._source = fields
        # sort by starttime in descending order; TODO: expose sort parameters out through API
        s = s.sort({"starttime": {"order": "desc"}})
        s = s[offset:offset + page_size]

        if self.logger:
            self.logger.debug(s.to_dict())
        return s.count(), [i.to_dict() for i in s]