Ejemplo n.º 1
0
    def test_match_some_terms_from_array(self):
        """Explicit ``key`` and ``minimum_should_match`` must show up in the query.

        The expected query holds one ``term`` clause per input term, in input
        order, with every term lower-cased, and is compared against the
        returned value as a ``json.dumps`` string.
        """
        field = "variables.named_entity"

        # One "term" clause per expected (lower-cased) entity.
        should_clauses = [
            {"term": {field: entity}}
            for entity in ("los angeles", "new york", "akba")
        ]
        expected_body = {
            "query": {
                "bool": {
                    "should": should_clauses,
                    "minimum_should_match": 2,
                },
            },
        }

        actual = QueryManager.match_some_terms_from_array(
            terms=["los angeles", "NEW YORK", "AKBA"],
            key=field,
            minimum_should_match=2,
        )

        self.assertEqual(json.dumps(expected_body), actual)
Ejemplo n.º 2
0
    def test_match_some_terms_from_array_default(self):
        """Only ``terms`` is passed; the defaults must fill in the rest.

        The expected query uses the field ``variables.named_entity.keyword``
        and a ``minimum_should_match`` of half the number of terms, rounded
        up. Terms are expected lower-cased and in input order, and the result
        is compared as a ``json.dumps`` string.
        """
        terms = ["los angeles", "NEW YORK", "AKBA", "shanghai", "TOKyo"]
        default_field = "variables.named_entity.keyword"
        lowered_entities = ("los angeles", "new york", "akba", "shanghai", "tokyo")

        # Half of the supplied terms, rounded up.
        default_minimum = math.ceil(len(terms) / 2)

        expected_body = {
            "query": {
                "bool": {
                    "should": [
                        {"term": {default_field: entity}}
                        for entity in lowered_entities
                    ],
                    "minimum_should_match": default_minimum,
                },
            },
        }

        actual = QueryManager.match_some_terms_from_array(terms=terms)

        self.assertEqual(json.dumps(expected_body), actual)
Ejemplo n.º 3
0
class Augment(object):
    """Front end that builds metadata queries with a QueryManager and runs them.

    Every ``query_*`` method asks ``self.qm`` to build a query body (or takes
    one from the caller) and returns the result of ``self.qm.search`` for it.
    Extra keyword arguments are forwarded unchanged to ``self.qm.search``.
    """

    def __init__(self, es_index: str, es_host: str = "dsbox02.isi.edu", es_port: int = 9200) -> None:
        """Create the QueryManager used by all query methods.

        Args:
            es_index: elastic search index.
            es_host: elastic search host.
            es_port: elastic search port.

        """

        es_settings = {
            "es_host": es_host,
            "es_port": es_port,
            "es_index": es_index,
        }
        self.qm = QueryManager(**es_settings)

    def query_by_column(self,
                        col: pd.Series,
                        minimum_should_match: int = None,
                        **kwargs
                        ) -> typing.Optional[typing.List[dict]]:
        """Query metadata by the distinct values of a pandas Dataframe column.

        Args:
            col: pandas Dataframe column; its unique values become the terms.
            minimum_should_match: An integer ranging from 0 to the number of
                unique values in col, representing the minimum number of terms
                that should match. Forwarded as given (including None).
            **kwargs: forwarded to ``self.qm.search``.

        Returns:
            matching docs of metadata
        """

        distinct_terms = col.unique().tolist()
        query_body = self.qm.match_some_terms_from_array(
            terms=distinct_terms,
            minimum_should_match=minimum_should_match,
        )
        return self.qm.search(body=query_body, **kwargs)

    def query_by_named_entities(self,
                                named_entities: list,
                                minimum_should_match: int = None,
                                **kwargs
                                ) -> typing.Optional[typing.List[dict]]:
        """Query metadata by a list of named entities.

        The terms are matched against the ``variables.named_entity.keyword``
        field.

        Args:
            named_entities: list of named entities
            minimum_should_match: An integer ranging from 0 to the length of
                the named entities list, representing the minimum number of
                terms that should match. Forwarded as given (including None).
            **kwargs: forwarded to ``self.qm.search``.

        Returns:
            matching docs of metadata
        """

        query_body = self.qm.match_some_terms_from_array(
            terms=named_entities,
            key="variables.named_entity.keyword",
            minimum_should_match=minimum_should_match,
        )
        return self.qm.search(body=query_body, **kwargs)

    def query_by_temporal_coverage(self, start=None, end=None, **kwargs) -> typing.Optional[typing.List[dict]]:
        """Query metadata by the temporal coverage of a column.

        Args:
            start: dataset should cover date time earlier than the start date.
            end: dataset should cover date time later than the end date.
            **kwargs: forwarded to ``self.qm.search``.

        Returns:
            matching docs of metadata
        """

        query_body = self.qm.match_temporal_coverage(start=start, end=end)
        return self.qm.search(body=query_body, **kwargs)

    def query_by_datamart_id(self, datamart_id: int, **kwargs) -> typing.Optional[typing.List[dict]]:
        """Query metadata by datamart id.

        Two query bodies are built up front, one by
        ``match_global_datamart_id`` and one by ``match_variable_datamart_id``.
        The first is searched; only when its result is falsy (None or empty)
        is the second one searched and its result returned instead.

        Args:
            datamart_id: int
            **kwargs: forwarded to every ``self.qm.search`` call.

        Returns:
            matching docs of metadata
        """

        global_query = self.qm.match_global_datamart_id(datamart_id=datamart_id)
        variable_query = self.qm.match_variable_datamart_id(datamart_id=datamart_id)

        global_hits = self.qm.search(body=global_query, **kwargs)
        if global_hits:
            return global_hits
        # Nothing usable from the first query: fall back to the second.
        return self.qm.search(body=variable_query, **kwargs)

    def query_by_key_value_pairs(self,
                                 key_value_pairs: typing.List[tuple],
                                 **kwargs
                                 ) -> typing.Optional[typing.List[dict]]:
        """Query metadata by key value pairs.

        Args:
            key_value_pairs: list of key value tuple
            **kwargs: forwarded to ``self.qm.search``.

        Returns:
            matching docs of metadata
        """

        query_body = self.qm.match_key_value_pairs(key_value_pairs=key_value_pairs)
        return self.qm.search(body=query_body, **kwargs)

    def query_any_field_with_string(self, query_string, **kwargs) -> typing.Optional[typing.List[dict]]:
        """Query any field of metadata with query_string.

        Args:
            query_string: string handed to ``self.qm.match_any``.
            **kwargs: forwarded to ``self.qm.search``.

        Returns:
            matching docs of metadata
        """

        query_body = self.qm.match_any(query_string=query_string)
        return self.qm.search(body=query_body, **kwargs)

    def query_by_es_query(self, body: str, **kwargs) -> typing.Optional[typing.List[dict]]:
        """Query metadata by an elastic search query supplied by the caller.

        Args:
            body: query body, passed to ``self.qm.search`` as is.
            **kwargs: forwarded to ``self.qm.search``.

        Returns:
            matching docs of metadata
        """
        hits = self.qm.search(body=body, **kwargs)
        return hits

    @staticmethod
    def get_dataset(metadata: dict, variables: list = None, constrains: dict = None) -> typing.Optional[pd.DataFrame]:
        """Get the dataset with materializer.

        All three arguments are handed to ``Utils.materialize`` by keyword.

        Args:
            metadata: metadata dict.
            variables: forwarded to the materializer as given.
            constrains: forwarded to the materializer as given.

        Returns:
            pandas dataframe
        """

        materialize_args = {
            "metadata": metadata,
            "variables": variables,
            "constrains": constrains,
        }
        return Utils.materialize(**materialize_args)