コード例 #1
0
class ElasticSearchProcessor(MultiPackProcessor):
    r"""This processor searches an Elasticsearch index for documents relevant
    to a query and adds each retrieved document to the input multipack as a
    new pack."""

    # pylint: disable=useless-super-delegation
    def __init__(self) -> None:
        super().__init__()

    def initialize(self, resources: Resources, configs: Config):
        # Keep handles on the shared resources and configuration, then build
        # the indexer from the user-supplied `index_config` section.
        self.resources = resources
        self.config = configs
        self.index = ElasticSearchIndexer(config=self.config.index_config)

    @classmethod
    def default_configs(cls) -> Dict[str, Any]:
        r"""Returns the default configuration, extending the parent's:

        - `"query_pack_name"`: name of the pack holding the query.
        - `"index_config"`: configuration forwarded to
          :class:`ElasticSearchIndexer`.
        - `"field"`: document field whose text becomes the pack content.
        - `"response_pack_name_prefix"`: prefix for the names of the packs
          added for each search hit.
        """
        config = super().default_configs()
        config.update({
            "query_pack_name": "query",
            "index_config": ElasticSearchIndexer.default_configs(),
            "field": "content",
            "response_pack_name_prefix": "passage"
        })
        return config

    def _process(self, input_pack: MultiPack):
        r"""Searches ElasticSearch indexer to fetch documents for a query. This
        query should be contained in the input multipack with name
        `self.config.query_pack_name`.

        This method adds new packs to `input_pack` containing the retrieved
        results. Each result is added as a `ft.onto.base_ontology.Document`.

        Args:
             input_pack: A multipack containing query as a pack.

        Raises:
            ValueError: if the query payload is not a dictionary.
        """
        query_pack = input_pack.get_pack(self.config.query_pack_name)

        # ElasticSearchQueryCreator adds a Query entry to query pack. We now
        # fetch it as the first element.
        first_query: Query = query_pack.get_single(Query)  # type: ignore
        # pylint: disable=isinstance-second-argument-not-valid-type
        # TODO: until fix: https://github.com/PyCQA/pylint/issues/3507
        # Bug fix: validate the query payload (`first_query.value`) rather
        # than the Query entry itself -- the entry is never a Dict, so the
        # original check rejected every input. `value` is what is actually
        # sent to the indexer below.
        if not isinstance(first_query.value, Dict):
            raise ValueError(
                "The query to the elastic indexer need to be a dictionary.")
        results = self.index.search(first_query.value)
        hits = results["hits"]["hits"]

        for idx, hit in enumerate(hits):
            document = hit["_source"]
            # Record the score of this hit on the query entry.
            first_query.add_result(document["doc_id"], hit["_score"])

            # One new pack per hit, named with the configured prefix.
            pack: DataPack = input_pack.add_pack(
                f"{self.config.response_pack_name_prefix}_{idx}"
            )
            pack.pack_name = document["doc_id"]

            content = document[self.config.field]
            pack.set_text(content)

            # Annotate the whole text as a single Document entry.
            Document(pack=pack, begin=0, end=len(content))
コード例 #2
0
    def setUp(self):
        """Index the sample MS MARCO passages into a fresh test index."""
        # Resolve the data-sample directory four levels above this file.
        here = os.path.dirname(__file__)
        sample_dir = 'data_samples/ms_marco_passage_retrieval'
        self.abs_data_dir = os.path.abspath(
            os.path.join(here, *([os.pardir] * 4), sample_dir))
        self.index_name = "final"
        config = {
            "batch_size": 5,
            "fields": ["doc_id", "content", "pack_info"],
            "indexer": {
                "name": "ElasticSearchIndexer",
                "hparams": {
                    "index_name": self.index_name,
                    "hosts": "localhost:9200",
                    "algorithm": "bm25"
                },
                "other_kwargs": {
                    "request_timeout": 10,
                    "refresh": True
                }
            }
        }
        self.indexer = ElasticSearchIndexer(
            config={"index_name": self.index_name})

        pipeline: Pipeline[DataPack] = Pipeline()
        pipeline.set_reader(MSMarcoPassageReader())
        pipeline.add(DataSelectorIndexProcessor(), config=config)
        pipeline.initialize()

        # Run the whole dataset through the pipeline, counting the packs.
        self.size = sum(1 for _ in pipeline.process_dataset(self.abs_data_dir))

        self.test_dir = tempfile.mkdtemp()
コード例 #3
0
 def default_configs(cls) -> Dict[str, Any]:
     """Return the processor defaults, extending the parent configuration
     with the query-pack name, indexer config, content field, and the
     prefix used to name response packs."""
     base = super().default_configs()
     base["query_pack_name"] = "query"
     base["index_config"] = ElasticSearchIndexer.default_configs()
     base["field"] = "content"
     base["response_pack_name_prefix"] = "passage"
     return base
コード例 #4
0
    def default_configs(cls) -> Dict[str, Any]:
        r"""Returns a dictionary of default hyperparameters.

        .. code-block:: python

            {
                "fields": ["doc_id", "content"],
                "indexer": {
                    "name": "ElasticSearchIndexer",
                    "hparams": ElasticSearchIndexer.default_configs(),
                    "other_kwargs": {
                        "request_timeout": 10,
                        "refresh": False
                    }
                }
            }

        Here:

        `"batch_size"`: int
            Number of examples that will be bulk added to Elasticsearch index
            (inherited from the parent configuration).

        `"fields"`: str, list
            Field name(s) that will be used as keys while indexing the
            document.

        `"indexer"`: dict

            `"name"`: str
                Name of Indexer to be used.

            `"hparams"`: dict
                Hyperparameters to be used for the index. See
                :meth:`ElasticSearchIndexer.default_configs` for more details

            `"other_kwargs"`: dict
                Keyword arguments that will be passed to
                :meth:`ElasticSearchIndexer.add_bulk` API

        """
        config = super().default_configs()
        # NOTE(review): spreading IndexProcessor.default_configs() into the
        # update looks redundant if super() already resolves to
        # IndexProcessor -- confirm against the class hierarchy.
        config.update({
            **IndexProcessor.default_configs(), "fields":
            ["doc_id", "content"],
            "indexer": {
                "name": "ElasticSearchIndexer",
                "hparams": ElasticSearchIndexer.default_configs(),
                "other_kwargs": {
                    "request_timeout": 10,
                    "refresh": False
                }
            }
        })
        return config
コード例 #5
0
    def setUp(self):
        """Build the pipeline under test and collect the expected corpus."""
        file_dir_path = os.path.dirname(__file__)
        data_dir = 'data_samples/ms_marco_passage_retrieval'
        self.abs_data_dir = os.path.abspath(
            os.path.join(file_dir_path, *([os.pardir] * 4), data_dir))
        corpus_file = os.path.join(self.abs_data_dir, 'collection.tsv')

        # Collect the passage texts we expect to find in the index later.
        # Iterate the file lazily instead of materializing it with
        # readlines(); the doc-id column is unused.
        self.expected_content = set()
        with open(corpus_file, 'r', encoding='utf-8') as f:
            for line in f:
                _, value = line.split('\t', 1)
                self.expected_content.add(value)

        self.index_name = "test_indexer"
        indexer_config = {
            "batch_size": 5,
            "fields": ["doc_id", "content", "pack_info"],
            "indexer": {
                "name": "ElasticSearchIndexer",
                "hparams": {
                    "index_name": self.index_name,
                    "hosts": "localhost:9200",
                    "algorithm": "bm25"
                },
                "other_kwargs": {
                    "request_timeout": 10,
                    "refresh": True
                }
            }
        }
        self.indexer = ElasticSearchIndexer(
            config={"index_name": self.index_name})

        self.nlp: Pipeline[DataPack] = Pipeline()
        self.reader = MSMarcoPassageReader()
        self.processor = DataSelectorIndexProcessor()
        self.nlp.set_reader(self.reader)
        self.nlp.add(self.processor, config=indexer_config)
        self.nlp.initialize()
コード例 #6
0
 def default_config(cls) -> Dict[str, Any]:
     """Default configuration for bulk indexing into Elasticsearch."""
     indexer_section = {
         "name": "ElasticSearchIndexer",
         "hparams": ElasticSearchIndexer.default_configs(),
         "other_kwargs": {
             "request_timeout": 60,
             "refresh": False
         },
     }
     return {
         "batch_size": 10000,
         "fields": ["doc_id", "content", "pack_info"],
         "indexer": indexer_section,
     }
コード例 #7
0
class ElasticSearchProcessor(MultiPackProcessor):
    r"""This processor searches for relevant documents for a query"""

    # pylint: disable=useless-super-delegation
    def __init__(self) -> None:
        super().__init__()

    def initialize(self, resources: Resources, configs: HParams):
        # Keep handles on the shared resources and configuration, then build
        # the indexer from the `index_config` hyperparameters.
        self.resources = resources
        self.config = configs
        self.index = ElasticSearchIndexer(hparams=self.config.index_config)

    @staticmethod
    def default_configs() -> Dict[str, Any]:
        """Default configuration: query-pack name, indexer hyperparameters,
        and the document field returned as pack text."""
        return {
            "query_pack_name": "query",
            "index_config": ElasticSearchIndexer.default_configs(),
            "field": "content"
        }

    def _process(self, input_pack: MultiPack):
        r"""Searches ElasticSearch indexer to fetch documents for a query. This
        query should be contained in the input multipack with name
        `self.config.query_pack_name`.

        This method adds new packs to `input_pack` containing the retrieved
        results. Each result is added as a `ft.onto.base_ontology.Document`.

        Args:
             input_pack: A multipack containing query as a pack.
        """
        query_pack = input_pack.get_pack(self.config.query_pack_name)

        # ElasticSearchQueryCreator adds a Query entry to query pack. We now
        # fetch it as the first element.
        first_query = list(query_pack.get_entries(Query))[0]
        results = self.index.search(first_query.value)
        hits = results["hits"]["hits"]
        packs = {}
        for idx, hit in enumerate(hits):
            # Fix: use a distinct name for the raw hit source so it is not
            # shadowed when the Document annotation is created below.
            source = hit["_source"]
            first_query.update_results({source["doc_id"]: hit["_score"]})
            pack = DataPack(doc_id=source["doc_id"])
            content = source[self.config.field]
            doc_annotation = Document(pack=pack, begin=0, end=len(content))
            pack.add_entry(doc_annotation)
            pack.set_text(content)
            packs[f"{self.config.response_pack_name_prefix}_{idx}"] = pack

        input_pack.update_pack(packs)
コード例 #8
0
class TestDataSelectorIndexProcessor(unittest.TestCase):
    """End-to-end test: index the sample MS MARCO passages and verify the
    indexed content matches the corpus file."""

    def setUp(self):
        file_dir_path = os.path.dirname(__file__)
        data_dir = 'data_samples/ms_marco_passage_retrieval'
        self.abs_data_dir = os.path.abspath(
            os.path.join(file_dir_path, *([os.pardir] * 4), data_dir))
        corpus_file = os.path.join(self.abs_data_dir, 'collection.tsv')

        # Collect the passage texts we expect to find in the index later.
        # Iterate the file lazily instead of materializing it with
        # readlines(); the doc-id column is unused.
        self.expected_content = set()
        with open(corpus_file, 'r', encoding='utf-8') as f:
            for line in f:
                _, value = line.split('\t', 1)
                self.expected_content.add(value)

        self.index_name = "test_indexer"
        indexer_config = {
            "batch_size": 5,
            "fields": ["doc_id", "content", "pack_info"],
            "indexer": {
                "name": "ElasticSearchIndexer",
                "hparams": {
                    "index_name": self.index_name,
                    "hosts": "localhost:9200",
                    "algorithm": "bm25"
                },
                "other_kwargs": {
                    "request_timeout": 10,
                    "refresh": True
                }
            }
        }
        self.indexer = ElasticSearchIndexer(
            config={"index_name": self.index_name})

        self.nlp: Pipeline[DataPack] = Pipeline()
        self.reader = MSMarcoPassageReader()
        self.processor = DataSelectorIndexProcessor()
        self.nlp.set_reader(self.reader)
        self.nlp.add(self.processor, config=indexer_config)
        self.nlp.initialize()

    def tearDown(self):
        # Drop the test index; ignore "already gone" style errors.
        self.indexer.elasticsearch.indices.delete(index=self.index_name,
                                                  ignore=[400, 404])

    def test_pipeline(self):
        # Count the packs produced while the processor indexes them.
        size = sum(1 for _ in self.nlp.process_dataset(self.abs_data_dir))

        retrieved_document = self.indexer.search(
            query={"query": {
                "match_all": {}
            }},
            index_name=self.index_name,
            size=size)

        hits = retrieved_document["hits"]["hits"]
        self.assertEqual(len(hits), size)
        # Set comprehension instead of set([...]) (flake8-comprehensions C403).
        results = {hit["_source"]["content"] for hit in hits}
        self.assertEqual(results, self.expected_content)
コード例 #9
0
 def initialize(self, resources: Resources, configs: Config):
     # Store the shared resources and configuration for later use.
     # NOTE(review): super().initialize() is not called here -- confirm the
     # base class requires no setup of its own.
     self.resources = resources
     self.config = configs
     # Build the Elasticsearch indexer from the nested index configuration.
     self.index = ElasticSearchIndexer(config=self.config.index_config)
コード例 #10
0
 def initialize(self, resources: Resources, configs: Config):
     # Let the base class record resources/configs, then build the indexer
     # from the nested `index_config` section of the configuration.
     super().initialize(resources, configs)
     self.index = ElasticSearchIndexer(config=self.configs.index_config)
コード例 #11
0
 def default_configs() -> Dict[str, Any]:
     """Default configuration: query-pack name, indexer configuration, and
     the document field returned from search hits."""
     defaults: Dict[str, Any] = {}
     defaults["query_pack_name"] = "query"
     defaults["index_config"] = ElasticSearchIndexer.default_configs()
     defaults["field"] = "content"
     return defaults
コード例 #12
0
    def initialize(self, resources: Resources, configs: HParams):
        # Store the shared resources and configuration for later use.
        # NOTE(review): takes an HParams object -- presumably a legacy
        # configuration API; confirm against the framework version in use.
        self.resources = resources
        self.config = configs
        # Build the Elasticsearch indexer from the nested hyperparameters.
        self.index = ElasticSearchIndexer(hparams=self.config.index_config)
コード例 #13
0
 def initialize(self, resources: Resources, configs: Config):
     # Base-class initialization records resources/configs; then build the
     # indexer from the `indexer.hparams` configuration section.
     super().initialize(resources, configs)
     self.indexer = ElasticSearchIndexer(self.configs.indexer.hparams)
コード例 #14
0
class ElasticSearchIndexerBase(IndexProcessor, ABC):
    r"""This processor implements the basic functions to add the data packs
    into an `Elasticsearch` index."""
    def initialize(self, resources: Resources, configs: Config):
        # Base-class initialization records resources/configs; then build
        # the indexer from the `indexer.hparams` configuration section.
        super().initialize(resources, configs)
        self.indexer = ElasticSearchIndexer(self.configs.indexer.hparams)

    @classmethod
    def default_configs(cls) -> Dict[str, Any]:
        r"""Returns a dictionary of default hyperparameters.

        .. code-block:: python

            {
                "fields": ["doc_id", "content"],
                "indexer": {
                    "name": "ElasticSearchIndexer",
                    "hparams": ElasticSearchIndexer.default_configs(),
                    "other_kwargs": {
                        "request_timeout": 10,
                        "refresh": False
                    }
                }
            }

        Here:

        `"batch_size"`: int
            Number of examples that will be bulk added to `Elasticsearch`
            index (inherited from the parent configuration).

        `"fields"`: str, list
            Field name(s) that will be used as keys while indexing the
            document.

        `"indexer"`: dict

            `"name"`: str
                Name of Indexer to be used.

            `"hparams"`: dict
                Hyperparameters to be used for the index. See
                :meth:`ElasticSearchIndexer.default_configs` for more details

            `"other_kwargs"`: dict
                Keyword arguments that will be passed to
                :meth:`ElasticSearchIndexer.add_bulk` API

        """
        config = super().default_configs()
        config.update({
            "fields": ["doc_id", "content"],
            "indexer": {
                "name": "ElasticSearchIndexer",
                "hparams": ElasticSearchIndexer.default_configs(),
                "other_kwargs": {
                    "request_timeout": 10,
                    "refresh": False
                }
            }
        })
        return config

    def _bulk_process(self):
        # Flush the buffered documents to Elasticsearch in a single bulk
        # request, forwarding the configured keyword arguments
        # (request_timeout, refresh).
        self.indexer.add_bulk(self.documents,
                              **self.configs.indexer.other_kwargs)
コード例 #15
0
 def default_configs(cls) -> Dict[str, Any]:
     """Extend the parent defaults with the Elasticsearch index config."""
     merged = super().default_configs()
     merged["index_config"] = ElasticSearchIndexer.default_configs()
     return merged
コード例 #16
0
ファイル: indexers_test.py プロジェクト: gxchris95/forte-1
class TestElasticSearchIndexer(unittest.TestCase):
    r"""Tests Elastic Indexer."""
    def setUp(self):
        self.indexer = ElasticSearchIndexer(
            config={"index_name": "test_index"})

    def tearDown(self):
        # Drop the test index; ignore "already gone" style errors.
        self.indexer.elasticsearch.indices.delete(
            index=self.indexer.hparams.index_name, ignore=[400, 404])

    def test_add(self):
        """A single added document should be retrievable by a match query."""
        document = {
            "key": "This document is created to test "
            "ElasticSearchIndexer"
        }
        self.indexer.add(document, refresh="wait_for")
        retrieved_document = self.indexer.search(
            query={
                "query": {
                    "match": {
                        "key": "ElasticSearchIndexer"
                    }
                },
                "_source": ["key"]
            })
        hits = retrieved_document["hits"]["hits"]
        self.assertEqual(len(hits), 1)
        self.assertEqual(hits[0]["_source"], document)

    def test_add_bulk(self):
        """Bulk-added documents should all be retrievable."""
        size = 10000
        # Set comprehensions instead of set([...]) (C403).
        documents = {
            f"This document {i} is created to test "
            f"ElasticSearchIndexer" for i in range(size)
        }
        self.indexer.add_bulk([{
            "key": document
        } for document in documents],
                              refresh="wait_for")
        retrieved_document = self.indexer.search(
            query={"query": {
                "match_all": {}
            }},
            index_name="test_index",
            size=size)
        hits = retrieved_document["hits"]["hits"]
        self.assertEqual(len(hits), size)
        results = {hit["_source"]["key"] for hit in hits}
        self.assertEqual(results, documents)

    @performance_test
    @data([100, 0.3], [500, 0.3], [1000, 0.3])
    @unpack
    def test_speed(self, size, epsilon):
        """Indexer bulk add should stay within epsilon of raw bulk()."""
        es = Elasticsearch()
        documents = [{
            "_index":
            "test_index_",
            "_type":
            "document",
            "key":
            f"This document {i} is created to test "
            f"ElasticSearchIndexer"
        } for i in range(size)]

        # Baseline: raw elasticsearch.helpers.bulk into a scratch index.
        start = time.time()
        bulk(es, documents, refresh=False)
        baseline = time.time() - start
        es.indices.delete(index="test_index_", ignore=[400, 404])

        documents = {
            f"This document {i} is created to test "
            f"ElasticSearchIndexer" for i in range(size)
        }
        start = time.time()
        self.indexer.add_bulk([{
            "key": document
        } for document in documents],
                              refresh=False)
        forte_time = time.time() - start
        self.assertLessEqual(forte_time, baseline + epsilon)
コード例 #17
0
ファイル: indexers_test.py プロジェクト: gxchris95/forte-1
 def setUp(self):
     # Fresh indexer pointed at a dedicated test index.
     index_config = {"index_name": "test_index"}
     self.indexer = ElasticSearchIndexer(config=index_config)
コード例 #18
0
 def setUp(self):
     # Fresh indexer pointed at a dedicated test index (legacy hparams API).
     index_hparams = {"index_name": "test_index"}
     self.indexer = ElasticSearchIndexer(hparams=index_hparams)