Beispiel #1
0
    def __init__(self, es_index: str, es_host: str = "dsbox02.isi.edu", es_port: int = 9200) -> None:
        """Set up the Elasticsearch connection used by this query system.

        Args:
            es_index: name of the Elasticsearch index to query.
            es_host: hostname of the Elasticsearch server.
            es_port: port of the Elasticsearch server.
        """
        # All search operations are delegated to one QueryManager instance.
        self.qm = QueryManager(
            es_host=es_host,
            es_port=es_port,
            es_index=es_index,
        )
Beispiel #2
0
 def load_meta_and_data_by_id(datamart_id: int,
                              first_n_rows: int = None,
                              constrains=None):
     """Fetch a dataset's metadata and materialized dataframe by datamart id.

     Looks the id up in the production index; when found, materializes the
     dataset (optionally truncated to the first rows) and returns the
     (metadata, dataframe) pair, otherwise (None, None).
     """
     manager = QueryManager(es_host=ES_HOST,
                            es_port=ES_PORT,
                            es_index=PRODUCTION_ES_INDEX)
     hit = manager.get_by_id(datamart_id)
     # Guard clause: bail out early when the id is unknown or has no source.
     if not (hit and hit.get('_source')):
         return None, None
     metadata = hit['_source']
     frame = Utils.get_dataset(metadata, constrains=constrains)
     if first_n_rows:
         frame = frame.head(first_n_rows)
     return metadata, frame
    def test_match_key_value_pairs_list(self):
        """match_key_value_pairs: a list value yields a single `terms`
        clause, scalar values yield `term` clauses, all ANDed under
        bool/must."""
        query = QueryManager.match_key_value_pairs(key_value_pairs=[(
            "description",
            ["average", "temperature"]), ("title.keyword",
                                          "TAVG"), ("variables.datamart_id",
                                                    0)])
        # One must-clause per input pair, in input order.
        expected = {
            "query": {
                "bool": {
                    "must": [{
                        "terms": {
                            "description": ["average", "temperature"]
                        }
                    }, {
                        "term": {
                            "title.keyword": "TAVG"
                        }
                    }, {
                        "term": {
                            "variables.datamart_id": 0
                        }
                    }]
                }
            }
        }

        # The builder returns a JSON string, so compare serialized forms.
        self.assertEqual(json.dumps(expected), query)
Beispiel #4
0
    def test_match_temporal_coverage(self):
        """match_temporal_coverage: builds a nested range query over
        variables.temporal_coverage; a date-only start is normalized to a
        full T00:00:00 timestamp."""
        query = QueryManager.match_temporal_coverage(start="2018-09-23", end="2018-09-30T00:00:00")
        # Coverage must begin on/before the requested start (lte) and end
        # on/after the requested end (gte).
        expected = {
            "query": {
                "nested": {
                    "path": "variables",
                    "inner_hits": {"_source": ["temporal_coverage"]},
                    "query": {
                        "bool": {
                            "must": [
                                {
                                    "range": {
                                        "variables.temporal_coverage.start": {
                                            "lte": "2018-09-23T00:00:00",
                                            "format": "yyyy-MM-dd'T'HH:mm:ss"
                                        }
                                    }
                                },
                                {
                                    "range": {
                                        "variables.temporal_coverage.end": {
                                            "gte": "2018-09-30T00:00:00",
                                            "format": "yyyy-MM-dd'T'HH:mm:ss"
                                        }
                                    }
                                }
                            ]
                        }
                    }
                }
            }
        }

        # The builder returns a JSON string, so compare serialized forms.
        self.assertEqual(json.dumps(expected), query)
    def test_match_some_terms_from_array(self):
        """match_some_terms_from_array with explicit key and
        minimum_should_match: terms are lowercased into `term` should
        clauses on the given key."""
        query = QueryManager.match_some_terms_from_array(
            terms=["los angeles", "NEW YORK", "AKBA"],
            key="variables.named_entity",
            minimum_should_match=2)
        expected = {
            "query": {
                "bool": {
                    "should": [{
                        "term": {
                            "variables.named_entity": "los angeles"
                        }
                    }, {
                        "term": {
                            "variables.named_entity": "new york"
                        }
                    }, {
                        "term": {
                            "variables.named_entity": "akba"
                        }
                    }],
                    "minimum_should_match":
                    2
                }
            }
        }

        # The builder returns a JSON string, so compare serialized forms.
        self.assertEqual(json.dumps(expected), query)
    def test_match_some_terms_from_array_default(self):
        """match_some_terms_from_array defaults: key falls back to
        variables.named_entity.keyword and minimum_should_match falls back
        to half the terms, rounded up."""
        terms = ["los angeles", "NEW YORK", "AKBA", "shanghai", "TOKyo"]
        query = QueryManager.match_some_terms_from_array(terms=terms)
        expected = {
            "query": {
                "bool": {
                    "should": [{
                        "term": {
                            "variables.named_entity.keyword": "los angeles"
                        }
                    }, {
                        "term": {
                            "variables.named_entity.keyword": "new york"
                        }
                    }, {
                        "term": {
                            "variables.named_entity.keyword": "akba"
                        }
                    }, {
                        "term": {
                            "variables.named_entity.keyword": "shanghai"
                        }
                    }, {
                        "term": {
                            "variables.named_entity.keyword": "tokyo"
                        }
                    }],
                    # ceil(5 / 2) == 3: at least half of the terms must hit.
                    "minimum_should_match":
                    math.ceil(len(terms) / 2)
                }
            }
        }

        # The builder returns a JSON string, so compare serialized forms.
        self.assertEqual(json.dumps(expected), query)
Beispiel #7
0
    def test_match_any(self):
        """match_any: wraps the free-text string in a bare query_string
        clause and returns it as a dict."""
        built = QueryManager.match_any(query_string="los angeles average")
        wanted = {"query_string": {"query": "los angeles average"}}
        self.assertEqual(wanted, built)
Beispiel #8
0
    def test_match_global_datamart_id(self):
        """match_global_datamart_id: produces a plain term filter on the
        top-level datamart_id field, returned as a dict."""
        built = QueryManager.match_global_datamart_id(datamart_id=0)
        wanted = {"term": {"datamart_id": 0}}
        self.assertEqual(wanted, built)
Beispiel #9
0
    def test_match_variable_datamart_id(self):
        """match_variable_datamart_id: wraps a term filter on
        variables.datamart_id inside a nested query, returned as a dict."""
        built = QueryManager.match_variable_datamart_id(datamart_id=0)
        inner_query = {"bool": {"must": [{"term": {"variables.datamart_id": 0}}]}}
        wanted = {
            "nested": {
                "path": "variables",
                "inner_hits": {"_source": ["datamart_id"]},
                "query": inner_query,
            }
        }
        self.assertEqual(wanted, built)
Beispiel #10
0
    def test_match_match_global_datamart_id(self):
        """match_global_datamart_id (string variant): term filter on
        datamart_id wrapped in query/bool/must, serialized to JSON."""
        built = QueryManager.match_global_datamart_id(datamart_id=0)
        must_clauses = [{"term": {"datamart_id": 0}}]
        wanted = {"query": {"bool": {"must": must_clauses}}}
        # The builder returns a JSON string, so compare serialized forms.
        self.assertEqual(json.dumps(wanted), built)
Beispiel #11
0
    def test_match_all(self):
        """match_all: a match_all clause wrapped in query/bool/must,
        serialized to JSON."""
        built = QueryManager.match_all()
        wanted = {"query": {"bool": {"must": [{"match_all": {}}]}}}
        # The builder returns a JSON string, so compare serialized forms.
        self.assertEqual(json.dumps(wanted), built)
Beispiel #12
0
 def test_match_key_value_pairs_list(self):
     """match_key_value_pairs (dict variant): a list value expands into one
     `match` clause per element; scalar values get a single `match` each."""
     pairs = [
         ("description", ["average", "temperature"]),
         ("title.keyword", "TAVG"),
         ("datamart_id", 0),
     ]
     built = QueryManager.match_key_value_pairs(key_value_pairs=pairs)
     wanted = {
         "bool": {
             "must": [
                 {"match": {"description": "average"}},
                 {"match": {"description": "temperature"}},
                 {"match": {"title.keyword": "TAVG"}},
                 {"match": {"datamart_id": 0}},
             ]
         }
     }
     # This variant returns a dict, not a JSON string.
     self.assertEqual(wanted, built)
Beispiel #13
0
    def test_match_some_terms_from_array_default(self):
        """match_some_terms_from_variables_array defaults: nested
        match_phrase should-clauses on variables.named_entity with
        lowercased terms, named matches (_name), empty-tag highlighting in
        inner_hits, and minimum_should_match = ceil(len(terms) / 2)."""
        terms = ["los angeles", "NEW YORK", "AKBA", "shanghai", "TOKyo"]
        query = QueryManager.match_some_terms_from_variables_array(terms=terms)
        expected = {
            "nested": {
                "path": "variables",
                "inner_hits": {
                    "_source": ["named_entity"],
                    # Empty pre/post tags: highlighting marks matches without
                    # inserting markup into the fragments.
                    "highlight": {"fields": {"variables.named_entity": {
                        "pre_tags": [""],
                        "post_tags": [""],
                        "number_of_fragments": 0}}}
                },
                "query": {
                    "bool": {
                        "should": [
                            {
                                "match_phrase": {
                                    "variables.named_entity": {"query": "los angeles", "_name": "los angeles"}
                                }
                            },
                            {
                                "match_phrase": {
                                    "variables.named_entity": {"query": "new york", "_name": "new york"}}
                            },
                            {
                                "match_phrase": {
                                    "variables.named_entity": {"query": "akba", "_name": "akba"}}
                            },
                            {
                                "match_phrase": {
                                    "variables.named_entity": {"query": "shanghai", "_name": "shanghai"}}
                            },
                            {
                                "match_phrase": {
                                    "variables.named_entity": {"query": "tokyo", "_name": "tokyo"}}
                            }
                        ],
                        # ceil(5 / 2) == 3.
                        "minimum_should_match": 3
                    }
                }
            }
        }

        # This variant returns a dict, not a JSON string.
        self.assertEqual(expected, query)
    def test_match_temporal_coverage_invalid(self):
        """match_temporal_coverage with an unparseable start date: the
        invalid start is dropped and only the end-of-coverage range clause
        is emitted."""
        query = QueryManager.match_temporal_coverage(start="2222s",
                                                     end="2018-09-30T00:00:00")
        # No "start" range clause: "2222s" is not a valid date.
        expected = {
            "query": {
                "bool": {
                    "must": [{
                        "range": {
                            "variables.temporal_coverage.end": {
                                "gte": "2018-09-30T00:00:00",
                                "format": "yyyy-MM-dd'T'HH:mm:ss"
                            }
                        }
                    }]
                }
            }
        }

        # The builder returns a JSON string, so compare serialized forms.
        self.assertEqual(json.dumps(expected), query)
Beispiel #15
0
    def test_match_some_terms_from_array(self):
        """match_some_terms_from_variables_array with explicit key and
        minimum_should_match: nested match_phrase should-clauses with
        lowercased, _name-tagged terms, plus a top-level highlight
        section."""
        query = QueryManager.match_some_terms_from_variables_array(terms=["los angeles", "NEW YORK", "AKBA"],
                                                                   key="variables.named_entity",
                                                                   minimum_should_match=2)
        expected = {
            "query": {
                "nested": {
                    "path": "variables",
                    "inner_hits": {
                        "_source": ["named_entity"]
                    },
                    "query": {
                        "bool": {
                            "should": [
                                {
                                    "match_phrase": {
                                        "variables.named_entity": {"query": "los angeles", "_name": "los angeles"}
                                    }
                                },
                                {
                                    "match_phrase": {
                                        "variables.named_entity": {"query": "new york", "_name": "new york"}
                                    }
                                },
                                {
                                    "match_phrase": {
                                        "variables.named_entity": {"query": "akba", "_name": "akba"}
                                    }
                                }
                            ],
                            "minimum_should_match": 2
                        }
                    }
                }
            },
            # Highlighting with empty tags marks matched entities without
            # inserting markup.
            "highlight": {"fields": {"variables.named_entity": {"pre_tags": [""], "post_tags": [""]}}}
        }

        # The builder returns a JSON string, so compare serialized forms.
        self.assertEqual(json.dumps(expected), query)
Beispiel #16
0
    def test_match_key_value_pairs(self):
        """match_key_value_pairs (string variant): each scalar pair becomes
        a `match` clause; an empty nested variables clause is always
        appended after the pair matches."""
        query = QueryManager.match_key_value_pairs(key_value_pairs=[
            ("description", "average"),
            ("title.keyword", "TAVG"),
            ("datamart_id", 0)
        ])
        expected = {
            "query": {
                "bool": {
                    "must": [
                        {"match": {"description": "average"}},
                        {"match": {"title.keyword": "TAVG"}},
                        {"match": {"datamart_id": 0}},
                        # Trailing nested clause is empty because no pair
                        # targeted a variables.* field.
                        {"nested": {
                            "path": "variables",
                            "inner_hits": {"_source": []},
                            "query": {"bool": {"must": []}}}
                        }
                    ]
                }
            }
        }

        # The builder returns a JSON string, so compare serialized forms.
        self.assertEqual(json.dumps(expected), query)
Beispiel #17
0
class Augment(object):
    """Query datamart metadata from Elasticsearch and join datasets.

    Holds a QueryManager for search, a cache of prepared joiners keyed by
    joiner name, and a Profiler.
    """

    def __init__(self,
                 es_index: str,
                 es_host: str = "dsbox02.isi.edu",
                 es_port: int = 9200) -> None:
        """Set up the connection to Elasticsearch.

        Args:
            es_index: elastic search index.
            es_host: Elasticsearch host name.
            es_port: Elasticsearch port.
        """

        self.qm = QueryManager(es_host=es_host,
                               es_port=es_port,
                               es_index=es_index)
        # Prepared joiners cached by name so each is built at most once.
        self.joiners = dict()
        self.profiler = Profiler()

    def query(self,
              col: pd.Series = None,
              minimum_should_match_ratio_for_col: float = None,
              query_string: str = None,
              temporal_coverage_start: str = None,
              temporal_coverage_end: str = None,
              global_datamart_id: int = None,
              variable_datamart_id: int = None,
              key_value_pairs: typing.List[tuple] = None,
              **kwargs) -> typing.Optional[typing.List[dict]]:
        """Query metadata by any combination of the supported criteria.

        Each supplied argument contributes one sub-query; all sub-queries
        are combined conjunctively. With no criteria at all, every metadata
        document is returned.

        Args:
            col: pandas Dataframe column.
            minimum_should_match_ratio_for_col: A float ranging from 0 to 1
                indicating the ratio of unique values of the column to be
                matched.
            query_string: string to query any field in metadata
            temporal_coverage_start: start of a temporal coverage
            temporal_coverage_end: end of a temporal coverage
            global_datamart_id: match a global metadata id
            variable_datamart_id: match a variable metadata id
            key_value_pairs: match key value pairs

        Returns:
            matching docs of metadata
        """

        queries = list()

        if query_string:
            queries.append(self.qm.match_any(query_string=query_string))

        if temporal_coverage_start or temporal_coverage_end:
            queries.append(
                self.qm.match_temporal_coverage(start=temporal_coverage_start,
                                                end=temporal_coverage_end))

        # BUGFIX: 0 is a valid datamart id (the tests in this codebase query
        # datamart_id=0), so compare against None — truthiness checks would
        # silently skip id 0.
        if global_datamart_id is not None:
            queries.append(
                self.qm.match_global_datamart_id(
                    datamart_id=global_datamart_id))

        if variable_datamart_id is not None:
            queries.append(
                self.qm.match_variable_datamart_id(
                    datamart_id=variable_datamart_id))

        if key_value_pairs:
            queries.append(
                self.qm.match_key_value_pairs(key_value_pairs=key_value_pairs))

        if col is not None:
            # NOTE(review): the 0-1 ratio is forwarded directly as
            # minimum_should_match — confirm QueryManager accepts a ratio
            # here rather than an absolute count.
            queries.append(
                self.qm.match_some_terms_from_variables_array(
                    terms=col.unique().tolist(),
                    minimum_should_match=minimum_should_match_ratio_for_col))

        if not queries:
            return self._query_all()

        return self.qm.search(body=self.qm.form_conjunction_query(queries),
                              **kwargs)

    def _query_by_es_query(self, body: str,
                           **kwargs) -> typing.Optional[typing.List[dict]]:
        """Query metadata with a raw Elasticsearch query body.

        Args:
            body: query body

        Returns:
            matching docs of metadata
        """
        return self.qm.search(body=body, **kwargs)

    def _query_all(self, **kwargs) -> typing.Optional[typing.List[dict]]:
        """Query all metadata.

        Returns:
            matching docs of metadata
        """

        return self.qm.search(body=self.qm.match_all(), **kwargs)

    def join(self,
             left_df: pd.DataFrame,
             right_df: pd.DataFrame,
             left_columns: typing.List[typing.List[int]],
             right_columns: typing.List[typing.List[int]],
             left_metadata: dict = None,
             right_metadata: dict = None,
             joiner: str = "default") -> typing.Optional[pd.DataFrame]:
        """Join two dataframes with the named joiner.

          Args:
              left_df: pandas Dataframe
              right_df: pandas Dataframe
              left_metadata: metadata of left dataframe
              right_metadata: metadata of right dataframe
              left_columns: list of integers from left df for join
              right_columns: list of integers from right df for join
              joiner: string of joiner, default to be "default"

          Returns:
               Dataframe
          """

        if joiner not in self.joiners:
            self.joiners[joiner] = JoinerPrepare.prepare_joiner(joiner=joiner)

        # prepare_joiner may return a falsy joiner for unknown names; fall
        # back to the unmodified left dataframe instead of failing.
        if not self.joiners[joiner]:
            warnings.warn("No suitable joiner, return original dataframe")
            return left_df

        if not left_metadata:
            # Left df is the user provided one.
            # We will generate metadata just based on the data itself,
            # profiling and so on.
            left_metadata = Utils.generate_metadata_from_dataframe(
                data=left_df)

        left_metadata = Utils.calculate_dsbox_features(data=left_df,
                                                       metadata=left_metadata)
        right_metadata = Utils.calculate_dsbox_features(
            data=right_df, metadata=right_metadata)

        return self.joiners[joiner].join(
            left_df=left_df,
            right_df=right_df,
            left_columns=left_columns,
            right_columns=right_columns,
            left_metadata=left_metadata,
            right_metadata=right_metadata,
        )
def check_existence(materialization: dict,
                    es_index: str = PRODUCTION_ES_INDEX):
    """Check whether a dataset with the same materialization is indexed.

    Queries Elasticsearch with the materializer name and its identifying
    arguments. Only "general_materializer" (url + index) and
    "wikitables_materializer" (url + xpath) are supported.

    :param materialization: metadata dict with 'python_path' and 'arguments'.
    :param es_index: index to search, defaults to the production index.
    :return: the datamart id (int) if a match exists, else None.
    :raises Exception: when the materializer type is unsupported.
    """

    # Per-materializer argument keys that uniquely identify a dataset; this
    # table replaces two near-identical hand-written query branches.
    identifying_args = {
        "general_materializer": ("url", "index"),
        "wikitables_materializer": ("url", "xpath"),
    }

    materializer = materialization['python_path']
    if materializer not in identifying_args:
        raise Exception(
            f'Do not know how to perform existence check for materializer: {materializer}'
        )

    must = [{"match_phrase": {"materialization.python_path": materializer}}]
    for arg in identifying_args[materializer]:
        must.append({
            "match_phrase": {
                f"materialization.arguments.{arg}":
                materialization['arguments'][arg]
            }
        })
    query = {"query": {"bool": {"must": must}}}

    qm = QueryManager(es_host=ES_HOST, es_port=ES_PORT, es_index=es_index)
    res = qm.search(dumps(query))
    # TODO: warn when more than one document matches.
    if res and res[0]:
        return int(res[0].get('_id'))
    # Explicit "not found" result (the original fell off the end).
    return None
Beispiel #19
0
    def test_form_conjunction_query(self):
        """form_conjunction_query: the given sub-queries are placed, in
        order and unchanged, into a query/bool/must list and serialized to
        JSON."""
        query = QueryManager.form_conjunction_query(
            queries=[
                {'query_string': {'query': 'Budget'}},
                {'term': {'datamart_id': 125980000}},
                {'bool': {'must': [{'match': {'title': 'Government'}}]}},
                {'nested': {
                    'path': 'variables',
                    'inner_hits': {'_source': ['named_entity'],
                                   'highlight': {
                                       'fields': {
                                           'variables.named_entity': {
                                               'pre_tags': [
                                                   ''],
                                               'post_tags': [
                                                   ''],
                                               'number_of_fragments': 0}
                                       }
                                   }
                                   },
                    'query': {'bool': {
                        'should': [
                            {
                                'match_phrase': {
                                    'variables.named_entity': {
                                        'query': 'russia',
                                        '_name': 'russia'
                                    }
                                }
                            },
                            {
                                'match_phrase': {
                                    'variables.named_entity': {
                                        'query': 'saudi arabia',
                                        '_name': 'saudi arabia'}}}
                        ],
                        'minimum_should_match': 1
                    }
                    }
                }
                }
            ]
        )

        # Same four sub-queries, wrapped under query/bool/must.
        expected = {"query": {"bool": {
            "must": [
                {"query_string": {"query": "Budget"}},
                {"term": {"datamart_id": 125980000}},
                {"bool": {"must": [{"match": {"title": "Government"}}]}},
                {"nested": {"path": "variables",
                            "inner_hits": {"_source": ["named_entity"],
                                           "highlight": {
                                               "fields": {
                                                   "variables.named_entity": {
                                                       "pre_tags": [
                                                           ""],
                                                       "post_tags": [
                                                           ""],
                                                       "number_of_fragments": 0}}}},
                            "query": {"bool": {
                                "should": [{
                                    "match_phrase": {
                                        "variables.named_entity": {
                                            "query": "russia",
                                            "_name": "russia"}}},
                                    {
                                        "match_phrase": {
                                            "variables.named_entity": {
                                                "query": "saudi arabia",
                                                "_name": "saudi arabia"}}}],
                                "minimum_should_match": 1}}}}]}}}

        # The builder returns a JSON string, so compare serialized forms.
        self.assertEqual(json.dumps(expected), query)
Beispiel #20
0
class Augment(object):
    """Query datamart metadata from Elasticsearch, materialize datasets,
    and join dataframes."""

    # Fallback lower bound used when a date_range constraint has no start.
    DEFAULT_START_DATE = "1900-01-01T00:00:00"

    def __init__(self,
                 es_index: str,
                 es_host: str = "dsbox02.isi.edu",
                 es_port: int = 9200) -> None:
        """Set up the connection to Elasticsearch.

        Args:
            es_index: elastic search index.
            es_host: Elasticsearch host name.
            es_port: Elasticsearch port.
        """

        self.qm = QueryManager(es_host=es_host,
                               es_port=es_port,
                               es_index=es_index)
        # Prepared joiners cached by name so each is built at most once.
        self.joiners = dict()

    def query_by_column(self,
                        col: pd.Series,
                        minimum_should_match: int = None,
                        **kwargs) -> typing.Optional[typing.List[dict]]:
        """Query metadata by a pandas Dataframe column.

        Args:
            col: pandas Dataframe column.
            minimum_should_match: An integer ranges from 0 to length of unique value in col.
            Represent the minimum number of terms should match.

        Returns:
            matching docs of metadata
        """

        body = self.qm.match_some_terms_from_variables_array(
            terms=col.unique().tolist(),
            minimum_should_match=minimum_should_match)
        return self.qm.search(body=body, **kwargs)

    def query_by_named_entities(
            self,
            named_entities: list,
            minimum_should_match: int = None,
            **kwargs) -> typing.Optional[typing.List[dict]]:
        """Query metadata by a list of named entities.

        Args:
            named_entities: list of named entities
            minimum_should_match: An integer ranges from 0 to length of named entities list.
            Represent the minimum number of terms should match.

        Returns:
            matching docs of metadata
        """

        body = self.qm.match_some_terms_from_variables_array(
            terms=named_entities,
            key="variables.named_entity",
            minimum_should_match=minimum_should_match)
        return self.qm.search(body=body, **kwargs)

    def query_by_temporal_coverage(
            self,
            start=None,
            end=None,
            **kwargs) -> typing.Optional[typing.List[dict]]:
        """Query metadata by a temporal coverage of column.

        Args:
            start: dataset should cover date time earlier than the start date.
            end: dataset should cover date time later than the end date.

        Returns:
            matching docs of metadata
        """

        body = self.qm.match_temporal_coverage(start=start, end=end)
        return self.qm.search(body=body, **kwargs)

    def query_by_datamart_id(self, datamart_id: int,
                             **kwargs) -> typing.Optional[typing.List[dict]]:
        """Query metadata by datamart id.

        Tries the global id first; falls back to the per-variable id when
        the global search returns nothing.

        Args:
            datamart_id: int

        Returns:
            matching docs of metadata
        """

        global_body = self.qm.match_global_datamart_id(datamart_id=datamart_id)
        variable_body = self.qm.match_variable_datamart_id(
            datamart_id=datamart_id)
        return self.qm.search(body=global_body, **kwargs) or self.qm.search(
            body=variable_body, **kwargs)

    def query_by_key_value_pairs(
            self, key_value_pairs: typing.List[tuple],
            **kwargs) -> typing.Optional[typing.List[dict]]:
        """Query metadata by key value pairs.

        Args:
            key_value_pairs: list of key value tuple

        Returns:
            matching docs of metadata
        """

        body = self.qm.match_key_value_pairs(key_value_pairs=key_value_pairs)
        return self.qm.search(body=body, **kwargs)

    def query_any_field_with_string(
            self, query_string,
            **kwargs) -> typing.Optional[typing.List[dict]]:
        """Query any field of metadata with query_string.

        Args:
            query_string: free-text string matched against any field

        Returns:
            matching docs of metadata
        """

        body = self.qm.match_any(query_string=query_string)
        return self.qm.search(body=body, **kwargs)

    def query_by_es_query(self, body: str,
                          **kwargs) -> typing.Optional[typing.List[dict]]:
        """Query metadata by an elastic search query.

        Args:
            body: query body

        Returns:
            matching docs of metadata
        """
        return self.qm.search(body=body, **kwargs)

    @staticmethod
    def get_dataset(metadata: dict,
                    variables: list = None,
                    constrains: dict = None) -> typing.Optional[pd.DataFrame]:
        """Get the dataset with materializer.

        Args:
            metadata: metadata dict.
            variables: optional list of column positions to keep.
            constrains: optional constraints dict; a "date_range" entry has
                missing start/end filled with defaults.

        Returns:
            pandas dataframe
        """
        # BUGFIX: guard against the default None — the previous membership
        # test ("date_range" in constrains) raised TypeError whenever no
        # constrains were supplied.
        if constrains and "date_range" in constrains:
            if not constrains["date_range"].get("start", None):
                constrains["date_range"]["start"] = Augment.DEFAULT_START_DATE
            if not constrains["date_range"].get("end", None):
                constrains["date_range"]["end"] = datetime.now().strftime(
                    '%Y-%m-%dT%H:%M:%S')
        # NOTE(review): assumes Utils.materialize accepts constrains=None —
        # confirm, since the None path never ran before this fix.
        df = Utils.materialize(metadata=metadata, constrains=constrains)
        if variables:
            return df.iloc[:, variables]
        return df

    @staticmethod
    def get_metadata_intersection(*metadata_lst) -> list:
        """Get the intersect metadata list.

        Args:
            metadata_lst: all metadata list returned by multiple queries

        Returns:
            list of intersect metadata
        """

        metadata_dict = dict()
        metadata_sets = []
        for lst in metadata_lst:
            this_set = set()
            for x in lst:
                if x["_source"]["datamart_id"] not in metadata_dict:
                    metadata_dict[x["_source"]["datamart_id"]] = x
                this_set.add(x["_source"]["datamart_id"])
            metadata_sets.append(this_set)
        # Robustness: with no query results there is nothing to intersect
        # (indexing metadata_sets[0] would raise IndexError).
        if not metadata_sets:
            return []
        return [
            metadata_dict[datamart_id]
            for datamart_id in metadata_sets[0].intersection(
                *metadata_sets[1:])
        ]

    def join(self,
             left_df: pd.DataFrame,
             right_df: pd.DataFrame,
             left_columns: typing.List[int],
             right_columns: typing.List[int],
             left_metadata: dict = None,
             right_metadata: dict = None,
             joiner: str = "default") -> typing.Optional[pd.DataFrame]:
        """Join two dataframes with the named joiner.

          Args:
              left_df: pandas Dataframe
              right_df: pandas Dataframe
              left_metadata: metadata of left dataframe
              right_metadata: metadata of right dataframe
              left_columns: list of integers from left df for join
              right_columns: list of integers from right df for join
              joiner: string of joiner, default to be "default"

          Returns:
               Dataframe
          """

        if joiner not in self.joiners:
            self.joiners[joiner] = JoinerPrepare.prepare_joiner(joiner=joiner)

        # prepare_joiner may return a falsy joiner for unknown names; fall
        # back to the unmodified left dataframe instead of failing.
        if not self.joiners[joiner]:
            warnings.warn("No suitable joiner, return original dataframe")
            return left_df

        return self.joiners[joiner].join(
            left_df=left_df,
            right_df=right_df,
            left_columns=left_columns,
            right_columns=right_columns,
            left_metadata=left_metadata,
            right_metadata=right_metadata,
        )
Beispiel #21
0
class Augment(object):
    """Convenience query layer over a QueryManager elastic search connection.

    Each ``query_by_*`` method builds a query body via ``self.qm`` and runs
    it with ``self.qm.search``, returning the matching metadata docs (or
    ``None`` when nothing matches, per ``QueryManager.search``).
    """

    def __init__(self, es_index: str, es_host: str = "dsbox02.isi.edu", es_port: int = 9200) -> None:
        """Init method of Augment, set up connection to elastic search.

        Args:
            es_index: elastic search index.
            es_host: elastic search host name.
            es_port: elastic search port.

        Returns:

        """

        self.qm = QueryManager(es_host=es_host, es_port=es_port, es_index=es_index)

    def query_by_column(self,
                        col: pd.Series,
                        minimum_should_match: int = None,
                        **kwargs
                        ) -> typing.Optional[typing.List[dict]]:
        """Query metadata by a pandas Dataframe column.

        Args:
            col: pandas Dataframe column; its unique values become the query terms.
            minimum_should_match: An integer ranges from 0 to length of unique value in col.
            Represent the minimum number of terms should match.

        Returns:
            matching docs of metadata
        """

        body = self.qm.match_some_terms_from_array(terms=col.unique().tolist(),
                                                   minimum_should_match=minimum_should_match)
        return self.qm.search(body=body, **kwargs)

    def query_by_named_entities(self,
                                named_entities: list,
                                minimum_should_match: int = None,
                                **kwargs
                                ) -> typing.Optional[typing.List[dict]]:
        """Query metadata by a list of named entities.

        Args:
            named_entities: list of named entities, matched against the
            "variables.named_entity.keyword" field.
            minimum_should_match: An integer ranges from 0 to length of named entities list.
            Represent the minimum number of terms should match.

        Returns:
            matching docs of metadata
        """

        body = self.qm.match_some_terms_from_array(terms=named_entities,
                                                   key="variables.named_entity.keyword",
                                                   minimum_should_match=minimum_should_match)
        return self.qm.search(body=body, **kwargs)

    def query_by_temporal_coverage(self, start=None, end=None, **kwargs) -> typing.Optional[typing.List[dict]]:
        """Query metadata by a temporal coverage of column.

        Args:
            start: dataset should cover date time earlier than the start date.
            end: dataset should cover date time later than the end date.

        Returns:
            matching docs of metadata
        """

        body = self.qm.match_temporal_coverage(start=start, end=end)
        return self.qm.search(body=body, **kwargs)

    def query_by_datamart_id(self, datamart_id: int, **kwargs) -> typing.Optional[typing.List[dict]]:
        """Query metadata by datamart id.

        Tries a global-id match first and falls back to a variable-id match
        when the former returns nothing.

        Args:
            datamart_id: int

        Returns:
            matching docs of metadata
        """

        global_body = self.qm.match_global_datamart_id(datamart_id=datamart_id)
        variable_body = self.qm.match_variable_datamart_id(datamart_id=datamart_id)
        return self.qm.search(body=global_body, **kwargs) or self.qm.search(body=variable_body, **kwargs)

    def query_by_key_value_pairs(self,
                                 key_value_pairs: typing.List[tuple],
                                 **kwargs
                                 ) -> typing.Optional[typing.List[dict]]:
        """Query metadata by key-value pairs.

        Args:
            key_value_pairs: list of (key, value) tuples that must all match.

        Returns:
            matching docs of metadata
        """

        body = self.qm.match_key_value_pairs(key_value_pairs=key_value_pairs)
        return self.qm.search(body=body, **kwargs)

    def query_any_field_with_string(self, query_string, **kwargs) -> typing.Optional[typing.List[dict]]:
        """Query any field of metadata with query_string.

        Args:
            query_string: free-text string matched against any metadata field.

        Returns:
            matching docs of metadata
        """

        body = self.qm.match_any(query_string=query_string)
        return self.qm.search(body=body, **kwargs)

    def query_by_es_query(self, body: str, **kwargs) -> typing.Optional[typing.List[dict]]:
        """Query metadata by a raw elastic search query.

        Args:
            body: query body

        Returns:
            matching docs of metadata
        """
        return self.qm.search(body=body, **kwargs)

    @staticmethod
    def get_dataset(metadata: dict, variables: list = None, constrains: dict = None) -> typing.Optional[pd.DataFrame]:
        """Get the dataset with materializer.

       Args:
           metadata: metadata dict.
           variables: presumably a subset of variables to materialize — forwarded
           to Utils.materialize; verify against its implementation.
           constrains: presumably filtering constraints for materialization —
           forwarded to Utils.materialize; verify against its implementation.

       Returns:
            pandas dataframe
       """

        return Utils.materialize(metadata=metadata, variables=variables, constrains=constrains)