Example #1
    def setUp(self):
        self.reader = MSMarcoPassageReader()

        # Relative path; assumes the test runs from the repository root.
        self.data_dir = 'data_samples/ms_marco_passage_retrieval'

        corpus_file = os.path.join(self.data_dir, 'collection.tsv')
        self.expected_content = {}
        with open(corpus_file, 'r') as f:
            for line in f:
                # Each line is "<doc_id>\t<passage>"; split on the first tab
                # only, so tabs inside the passage text are preserved.
                key, value = line.split('\t', 1)
                self.expected_content[key] = value
Example #2
    def setUp(self):
        # Build the indexer config and the indexing pipeline.
        file_dir_path = os.path.dirname(__file__)
        data_dir = 'data_samples/ms_marco_passage_retrieval'
        self.abs_data_dir = os.path.abspath(
            os.path.join(file_dir_path, *([os.pardir] * 4), data_dir))
        self.index_name = "final"
        indexer_config = {
            "batch_size": 5,
            "fields": ["doc_id", "content", "pack_info"],
            "indexer": {
                "name": "ElasticSearchIndexer",
                "hparams": {
                    "index_name": self.index_name,
                    "hosts": "localhost:9200",
                    "algorithm": "bm25"
                },
                "other_kwargs": {
                    "request_timeout": 10,
                    "refresh": True
                }
            }
        }
        # Keep a direct indexer handle so the test can inspect or clean up
        # the index that the pipeline populates.
        self.indexer = ElasticSearchIndexer(
            config={"index_name": self.index_name})
        nlp: Pipeline[DataPack] = Pipeline()
        nlp.set_reader(MSMarcoPassageReader())
        nlp.add(DataSelectorIndexProcessor(), config=indexer_config)
        nlp.initialize()

        # Run the sample corpus through the pipeline, counting indexed packs.
        self.size = 0
        for _ in nlp.process_dataset(self.abs_data_dir):
            self.size += 1

        self.test_dir = tempfile.mkdtemp()
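
This setUp creates a temporary directory and populates a live Elasticsearch index, but the listing ends before any cleanup. A minimal tearDown sketch, assuming `shutil` is imported and that ElasticSearchIndexer exposes its underlying client as `.elasticsearch` (an assumption about Forte's API, not confirmed by the snippet):

    def tearDown(self):
        # Remove the temporary directory created in setUp.
        shutil.rmtree(self.test_dir, ignore_errors=True)
        # Drop the test index; the `.elasticsearch` attribute is assumed.
        self.indexer.elasticsearch.indices.delete(
            index=self.index_name, ignore=[400, 404])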
Example #3
import argparse
import os

import yaml

from forte.data.readers import MSMarcoPassageReader
# CreateIndexerPipeline is assumed to live in a local module of this example.


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--config_file",
                        default="./config.yml",
                        help="Config YAML filepath")
    args = parser.parse_args()

    # Load the YAML config; the with-block closes the file handle.
    with open(args.config_file, "r") as f:
        config = yaml.safe_load(f)

    file_dir_path = os.path.dirname(__file__)
    data_dir = 'data_samples/ms_marco_passage_retrieval'
    abs_data_dir = os.path.abspath(
        os.path.join(file_dir_path, *([os.pardir] * 3), data_dir))

    reader = MSMarcoPassageReader()
    nlp = CreateIndexerPipeline(reader=reader,
                                reader_config=None,
                                indexer_config=config["indexer_config"])
    nlp.create_index(abs_data_dir)
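
`main()` pulls `indexer_config` from a YAML file. A sketch of what that `config.yml` might look like, mirroring the `indexer_config` dict used in the setUp examples above (all values illustrative):

indexer_config:
  batch_size: 5
  fields: ["doc_id", "content", "pack_info"]
  indexer:
    name: ElasticSearchIndexer
    hparams:
      index_name: "final"
      hosts: "localhost:9200"
      algorithm: "bm25"
    other_kwargs:
      request_timeout: 10
      refresh: true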
Example #4
    def setUp(self):
        file_dir_path = os.path.dirname(__file__)
        data_dir = 'data_samples/ms_marco_passage_retrieval'
        self.abs_data_dir = os.path.abspath(
            os.path.join(file_dir_path, *([os.pardir] * 4), data_dir))
        corpus_file = os.path.join(self.abs_data_dir, 'collection.tsv')

        self.expected_content = set()
        with open(corpus_file, 'r') as f:
            for line in f:
                key, value = line.split('\t', 1)
                self.expected_content.add(value)

        self.index_name = "test_indexer"
        indexer_config = {
            "batch_size": 5,
            "fields": ["doc_id", "content", "pack_info"],
            "indexer": {
                "name": "ElasticSearchIndexer",
                "hparams": {
                    "index_name": self.index_name,
                    "hosts": "localhost:9200",
                    "algorithm": "bm25"
                },
                "other_kwargs": {
                    "request_timeout": 10,
                    "refresh": True
                }
            }
        }
        # Direct indexer handle for inspecting/cleaning up the test index.
        self.indexer = ElasticSearchIndexer(
            config={"index_name": self.index_name})

        self.nlp: Pipeline[DataPack] = Pipeline()
        self.reader = MSMarcoPassageReader()
        self.processor = DataSelectorIndexProcessor()
        self.nlp.set_reader(self.reader)
        self.nlp.add(self.processor, config=indexer_config)
        self.nlp.initialize()
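
The listing stops right after initialize(), so no test body is shown. A minimal test sketch reusing the process_dataset loop from Example #2 (the method name test_indexing and the final assertion are assumptions, not from the original):

    def test_indexing(self):
        # Index the sample corpus and count the packs that flow through.
        size = 0
        for _ in self.nlp.process_dataset(self.abs_data_dir):
            size += 1
        # One pack per passage in collection.tsv.
        self.assertEqual(size, len(self.expected_content))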
Example #5
    def setUp(self):
        self.pipeline = Pipeline()

        self.pipeline.set_reader(MSMarcoPassageReader())
        self.pipeline.initialize()

        root_path = os.path.abspath(
            os.path.join(
                os.path.dirname(os.path.abspath(__file__)),
                os.pardir,
                os.pardir,
                os.pardir,
                os.pardir,
            ))

        self.data_dir = os.path.join(
            root_path, "data_samples/ms_marco_passage_retrieval")

        corpus_file = os.path.join(self.data_dir, "collection.tsv")
        self.expected_content = {}
        with open(corpus_file, "r") as f:
            for line in f:
                key, value = line.split("\t", 1)
                self.expected_content[key] = value
Example #6
class MSMarcoPassageReaderTest(unittest.TestCase):
    def setUp(self):
        self.reader = MSMarcoPassageReader()

        self.data_dir = 'data_samples/ms_marco_passage_retrieval'

        corpus_file = os.path.join(self.data_dir, 'collection.tsv')
        self.expected_content = {}
        with open(corpus_file, 'r') as f:
            for line in f:
                key, value = line.split('\t', 1)
                self.expected_content[key] = value

    def test_ms_marco_passage_reader(self):
        actual_content: Dict[str, str] = {}
        for data_pack in self.reader.iter(self.data_dir):
            self.assertIsInstance(data_pack, DataPack)
            doc_entries = list(data_pack.get_entries_by_type(Document))
            # Each pack should contain exactly one Document entry.
            self.assertEqual(len(doc_entries), 1)
            doc_entry: Document = doc_entries[0]
            self.assertIsInstance(doc_entry, Document)
            actual_content[data_pack.meta.doc_id] = doc_entry.text

        self.assertDictEqual(actual_content, self.expected_content)
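
To run this test module directly, the standard unittest entry point can be appended at module level (stock boilerplate, not part of the original listing):

if __name__ == "__main__":
    unittest.main()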
Example #7
import argparse
import logging
import os

import yaml

from forte.common.configuration import Config
from forte.data.data_pack import DataPack
from forte.data.readers import MSMarcoPassageReader
from forte.pipeline import Pipeline
from forte.processors.ir import ElasticSearchTextIndexProcessor

logging.basicConfig(level=logging.INFO)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--config_file",
                        default="./config.yml",
                        help="Config YAML filepath")
    args = parser.parse_args()

    with open(args.config_file, "r") as f:
        config = yaml.safe_load(f)
    config = Config(config, default_hparams=None)

    nlp: Pipeline[DataPack] = Pipeline()
    nlp.set_reader(MSMarcoPassageReader())
    nlp.add(ElasticSearchTextIndexProcessor(), config=config.create_index)
    nlp.initialize()

    data_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                             config.data.relative_path)

    for idx, pack in enumerate(nlp.process_dataset(data_path)):
        # idx is zero-based, so "idx + 1 > 0" is always true; only the
        # modulus check is needed for the periodic progress report.
        if (idx + 1) % 10000 == 0:
            print(f"Indexed {idx + 1} packs")