class MSMarcoPassageReaderTest(unittest.TestCase):
    def setUp(self):
        self.pipeline = Pipeline()
        self.pipeline.set_reader(MSMarcoPassageReader())
        self.pipeline.initialize()

        self.data_dir = 'data_samples/ms_marco_passage_retrieval'

        corpus_file = os.path.join(self.data_dir, 'collection.tsv')
        self.expected_content = {}
        with open(corpus_file, 'r') as f:
            for line in f.readlines():
                key, value = tuple(line.split('\t', 1))
                self.expected_content[key] = value

    def test_ms_marco_passage_reader(self):
        actual_content: Dict[str, str] = {}
        for data_pack in self.pipeline.process_dataset(self.data_dir):
            self.assertIsInstance(data_pack, DataPack)
            doc_entries = list(data_pack.get(Document))
            self.assertTrue(len(doc_entries) == 1)
            doc_entry: Document = doc_entries[0]
            self.assertIsInstance(doc_entry, Document)
            actual_content[data_pack.pack_name] = doc_entry.text
        self.assertDictEqual(actual_content, self.expected_content)
class AGNewsReaderTest(unittest.TestCase):
    def setUp(self):
        self.pipeline = Pipeline()
        self.pipeline.set_reader(AGNewsReader())
        self.pipeline.initialize()

        self.sample_file: str = os.path.abspath(
            os.path.join(
                os.path.dirname(os.path.realpath(__file__)),
                *([os.path.pardir] * 4),
                "data_samples/ag_news/sample.csv"))

        self.expected_content: Dict[int, Tuple[int, str, str]] = {}
        with open(self.sample_file, "r") as file:
            for line_id, line in enumerate(file):
                data = line.strip().split(",")
                class_id, title, description = (
                    int(data[0].replace('"', "")),
                    data[1],
                    data[2],
                )
                self.expected_content[line_id] = (class_id, title, description)

        self.class_idx_to_name = {
            1: "World",
            2: "Sports",
            3: "Business",
            4: "Sci/Tech",
        }

    def test_ag_news_reader(self):
        for data_pack in self.pipeline.process_dataset(self.sample_file):
            (
                expected_class_id,
                expected_title,
                expected_desc,
            ) = self.expected_content[data_pack.pack_name]
            self.assertIsInstance(data_pack, DataPack)

            # Test Article
            doc_entries = list(data_pack.get(Document))
            self.assertTrue(len(doc_entries) == 1)
            article: Document = doc_entries[0]
            self.assertIsInstance(article, Document)
            self.assertEqual(article.text,
                             expected_title + "\n" + expected_desc)

            # Test Document Class
            doc_class = article.document_class
            self.assertTrue(len(doc_class) == 1)
            self.assertEqual(doc_class[0],
                             self.class_idx_to_name[expected_class_id])

            # Test Title
            title_entries = list(data_pack.get(Title))
            self.assertTrue(len(title_entries) == 1)
            title: Title = title_entries[0]
            self.assertEqual(title.text, expected_title)

            # Test Description
            desc_entries = list(data_pack.get(Description))
            self.assertTrue(len(desc_entries) == 1)
            description: Description = desc_entries[0]
            self.assertEqual(description.text, expected_desc)
def setUp(self):
    # Define and config the Pipeline
    self.fp = tempfile.NamedTemporaryFile(mode='w',
                                          suffix='.jsonl',
                                          delete=False)
    self.nlp = Pipeline()
    self.nlp.set_reader(ProdigyReader())
    self.create_sample_file()
def setUp(self) -> None:
    self.nlp = Pipeline()
    self.nlp.set_reader(OntonotesReader())

    dummy = DummyRelationExtractor()
    config = {"batcher": {"batch_size": 5}}
    self.nlp.add_processor(dummy, config=config)
    self.nlp.initialize()

    self.data_path = "data_samples/ontonotes/00/"
def setUp(self):
    self.nlp = Pipeline()
    self.nlp.set_reader(StringReader())
    self.nlp.add(NLTKSentenceSegmenter())

    boxer_config = {"pack_name": "question"}
    self.nlp.add(MultiPackBoxer(), boxer_config)
    self.nlp.add(MutliDocPackAdder())
    self.nlp.add(QuestionAnsweringMulti())
    self.nlp.initialize()
def setUp(self) -> None:
    self.nlp = Pipeline()
    self.reader = OntonotesReader()

    self.data_path = "examples/data_samples/ontonotes/00/"

    self.nlp.set_reader(OntonotesReader())
    self.nlp.add_processor(DummyRelationExtractor())
    self.nlp.initialize()
def write_results(pl: Pipeline, output_path: str, input_data: str):
    pl.add(
        WikiArticleWriter(),
        config={
            "output_dir": output_path,
            "zip_pack": True,
            "drop_record": True,
        },
    )
    pl.run(input_data)
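# A minimal call-site sketch for write_results (not part of the original source).
# The reader and paths below are assumptions for illustration; in the original
# Wikipedia pipeline the reader would be one that produces wiki article packs.
pl = Pipeline()
pl.set_reader(StringReader())  # placeholder reader, assumed for this sketch
write_results(pl, output_path="/tmp/wiki_output", input_data="data_samples/wiki_input")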
def setUp(self): # Define and config the Pipeline self.dataset_path = "examples/data_samples/ontonotes/00" self.nlp = Pipeline() self.nlp.set_reader(OntonotesReader()) self.nlp.add_processor(DummyPackProcessor()) self.nlp.initialize()
class TestNLTKPOSTagger(unittest.TestCase):
    def setUp(self):
        self.nltk = Pipeline()
        self.nltk.set_reader(StringReader())
        self.nltk.add_processor(NLTKSentenceSegmenter())
        self.nltk.add_processor(NLTKWordTokenizer())
        self.nltk.add_processor(NLTKPOSTagger())

    def test_pos_tagger(self):
        sentences = [
            "This tool is called Forte.",
            "The goal of this project to help you build NLP pipelines.",
            "NLP has never been made this easy before."
        ]
        pos = [
            ["DT", "NN", "VBZ", "VBN", "NNP", "."],
            ["DT", "NN", "IN", "DT", "NN", "TO", "VB", "PRP", "VB", "NNP",
             "NNS", "."],
            ["NNP", "VBZ", "RB", "VBN", "VBN", "DT", "JJ", "RB", "."]
        ]
        document = ' '.join(sentences)
        pack = self.nltk.process(document)
        for i, sentence in enumerate(pack.get(Sentence)):
            for j, token in enumerate(
                    pack.get(entry_type=Token, range_annotation=sentence)):
                self.assertEqual(token.pos, pos[i][j])
def stanford_nlp_example(lang: str, text: str):
    pl = Pipeline()
    pl.set_reader(StringReader())

    models_path = os.getcwd()
    config = HParams(
        {
            'processors': 'tokenize,pos,lemma,depparse',
            # Language code for the language to build the Pipeline
            'lang': lang,
            'use_gpu': False
        },
        StandfordNLPProcessor.default_hparams())
    pl.add_processor(processor=StandfordNLPProcessor(models_path),
                     config=config)
    pl.initialize()

    pack = pl.process(text)
    for sentence in pack.get(Sentence):
        sent_text = sentence.text
        print(colored("Sentence:", 'red'), sent_text, "\n")

        tokens = [(token.text, token.pos, token.lemma)
                  for token in pack.get(Token, sentence)]
        print(colored("Tokens:", 'red'), tokens, "\n")

        print(colored("Dependency Relations:", 'red'))
        for link in pack.get(Dependency, sentence):
            parent: Token = link.get_parent()  # type: ignore
            child: Token = link.get_child()  # type: ignore
            print(colored(child.text, 'cyan'),
                  "has relation",
                  colored(link.rel_type, 'green'),
                  "of parent",
                  colored(parent.text, 'cyan'))
        print("\n----------------------\n")
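# A minimal invocation sketch for stanford_nlp_example (not in the original source);
# the language code and the sample text are assumptions for illustration.
if __name__ == "__main__":
    stanford_nlp_example(
        "en",
        "Forte is a toolkit for building NLP pipelines. "
        "It lets you compose readers and processors.")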
def test_pipeline(self, texts): for idx, text in enumerate(texts): file_path = os.path.join(self.test_dir, f"{idx+1}.txt") with open(file_path, 'w') as f: f.write(text) nlp = Pipeline() reader_config = { "input_pack_name": "query", "output_pack_name": "output" } nlp.set_reader(reader=MultiPackSentenceReader(), config=reader_config) config = { "model": { "name": "bert-base-uncased" }, "tokenizer": { "name": "bert-base-uncased" }, "max_seq_length": 128, "query_pack_name": "query" } nlp.add_processor(BertBasedQueryCreator(), config=config) nlp.initialize() for idx, m_pack in enumerate(nlp.process_dataset(self.test_dir)): query_pack = m_pack.get_pack("query") self.assertEqual(len(query_pack.generics), 1) self.assertIsInstance(query_pack.generics[0], Query) query = query_pack.generics[0].value self.assertEqual(query.shape, (1, 768))
def test_pipeline(self, texts): for idx, text in enumerate(texts): file_path = os.path.join(self.test_dir, f"{idx+1}.txt") with open(file_path, 'w') as f: f.write(text) nlp = Pipeline() reader_config = HParams( { "input_pack_name": "input", "output_pack_name": "output" }, MultiPackSentenceReader.default_hparams()) nlp.set_reader(reader=MultiPackSentenceReader(), config=reader_config) translator_config = HParams( { "src_language": "de", "target_language": "en", "in_pack_name": "input", "out_pack_name": "result" }, None) nlp.add_processor(MicrosoftBingTranslator(), config=translator_config) nlp.initialize() english_results = ["Hey good morning", "This is Forte. A tool for NLP"] for idx, m_pack in enumerate(nlp.process_dataset(self.test_dir)): self.assertEqual(set(m_pack._pack_names), set(["input", "output", "result"])) self.assertEqual( m_pack.get_pack("result").text, english_results[idx] + "\n")
def setUp(self): # Define and config the Pipeline self.dataset_path = "data_samples/conll03" self.nlp = Pipeline() self.nlp.set_reader(CoNLL03Reader()) self.nlp.add_processor(DummyPackProcessor()) self.nlp.add_processor(DummyPackProcessor()) self.nlp.initialize()
def setUp(self) -> None:
    # Define and config the Pipeline
    self.dataset_path = "examples/ontonotes_sample_dataset/00"

    self.nlp = Pipeline()
    self.nlp.set_reader(OntonotesReader())
    self.processor = DummyRelationExtractor()
    self.nlp.add_processor(self.processor)
    self.nlp.initialize()
def setUp(self) -> None:
    self.nlp = Pipeline()
    self.nlp.set_reader(OntonotesReader())

    dummy = DummyRelationExtractor()
    config = HParams({"batcher": {"batch_size": 5}},
                     dummy.default_hparams())
    self.nlp.add_processor(dummy, config=config)
    self.nlp.initialize()

    self.data_path = \
        "forte/processors/base/tests/data_samples/ontonotes/00/"
def setUp(self):
    self.spacy = Pipeline()
    self.spacy.set_reader(StringReader())

    config = {
        "processors": "tokenize",
        # Language code for the language to build the Pipeline
        "lang": "en_core_web_sm",
        "use_gpu": False
    }
    self.spacy.add_processor(SpacyProcessor(), config=config)
    self.spacy.initialize()
def setUp(self):
    self._cache_directory = Path(os.path.join(os.getcwd(), "cache_html"))
    self.reader = HTMLReader(cache_directory=self._cache_directory,
                             append_to_cache=True)

    self.pl1 = Pipeline()
    self.pl1.set_reader(self.reader)
    self.pl1.initialize()

    self.pl2 = Pipeline()
    self.pl2.set_reader(HTMLReader(from_cache=True,
                                   cache_directory=self._cache_directory))
    self.pl2.initialize()
def setUp(self):
    self.stanford_nlp = Pipeline()
    self.stanford_nlp.set_reader(StringReader())

    models_path = os.getcwd()
    config = {
        "processors": "tokenize",
        # Language code for the language to build the Pipeline
        "lang": "en",
        "use_gpu": False
    }
    self.stanford_nlp.add_processor(StandfordNLPProcessor(models_path),
                                    config=config)
    self.stanford_nlp.initialize()
def test_attribute_masker(self):
    pl = Pipeline()
    pl.set_reader(CoNLL03Reader())

    config = {"kwargs": {Token: ["ner"]}}
    pl.add_processor(processor=AttributeMasker(), config=config)
    pl.initialize()

    for pack in pl.process_dataset("data_samples/conll03/"):
        entries = pack.get_entries_by_type(Token)
        for entry in entries:
            self.assertIsNone(entry.ner)
def setUp(self):
    p: Pipeline = Pipeline()
    p.set_reader(EmptyReader())
    p.add(EntryAnnotator())
    p.initialize()
    self.pack: DataPack = p.process(['doc1', 'doc2'])
def setUp(self): # Define and config the Pipeline self.dataset_path = "examples/" self.pl1 = Pipeline() self._cache_directory = Path(os.path.join(os.getcwd(), "cache_data")) self.pl1.set_reader(StringReader()) self.pl2 = Pipeline() self.pl2.set_reader(StringReader()) self.text = ( "The plain green Norway spruce is displayed in the gallery's " "foyer. Wentworth worked as an assistant to sculptor Henry Moore " "in the late 1960s. His reputation as a sculptor grew in the " "1980s.")
def test_caster_all_selector(self):
    """
    Test that the caster and the all-pack selector work well together.
    The caster converts a single pack to a multi pack, and the pack
    copier then creates a new pack. The all-pack selector selects all
    the packs from the multi pack. This test makes sure the pipeline
    works as expected.
    """
    mp: MultiPack
    for mp in (
        Pipeline()
        .set_reader(SentenceReader())
        .add(MultiPackBoxer())
        .add(MultiPackCopier())
        .add(DummyPackProcessor(), selector=AllPackSelector())
        .initialize()
        .process_dataset(
            os.path.join(data_samples_root, "random_texts", "0.txt")
        )
    ):
        num_pack = 0
        for pack in mp.packs:
            num_pack += 1
            entries = list(pack.get(NewType))
            self.assertEqual(len(entries), 1)
            self.assertEqual(entries[0].value, "[PACK]")
        self.assertEqual(num_pack, 2)
def setUp(self):
    root_path = os.path.abspath(
        os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            os.pardir,
            os.pardir,
            os.pardir,
        ))
    file_path: str = os.path.join(root_path,
                                  "data_samples/data_pack_dataset_test")
    reader = CoNLL03Reader()
    context_type = Sentence
    request = {Sentence: []}
    skip_k = 0

    self.input_files = ["conll03_1.conll", "conll03_2.conll"]
    self.feature_schemes = {}

    train_pl: Pipeline = Pipeline()
    train_pl.set_reader(reader)
    train_pl.initialize()
    pack_iterator: Iterator[PackType] = train_pl.process_dataset(file_path)

    self.data_source: DataPackIterator = DataPackIterator(
        pack_iterator, context_type, request, skip_k)
def setUp(self):
    # create indexer
    file_dir_path = os.path.dirname(__file__)
    data_dir = 'data_samples/ms_marco_passage_retrieval'
    self.abs_data_dir = os.path.abspath(
        os.path.join(file_dir_path, *([os.pardir] * 4), data_dir))
    self.index_name = "final"

    indexer_config = {
        "batch_size": 5,
        "fields": ["doc_id", "content", "pack_info"],
        "indexer": {
            "name": "ElasticSearchIndexer",
            "hparams": {
                "index_name": self.index_name,
                "hosts": "localhost:9200",
                "algorithm": "bm25"
            },
            "other_kwargs": {
                "request_timeout": 10,
                "refresh": True
            }
        }
    }

    self.indexer = ElasticSearchIndexer(
        config={"index_name": self.index_name})

    nlp: Pipeline[DataPack] = Pipeline()
    nlp.set_reader(MSMarcoPassageReader())
    nlp.add(DataSelectorIndexProcessor(), config=indexer_config)
    nlp.initialize()

    self.size = 0
    for _ in nlp.process_dataset(self.abs_data_dir):
        self.size += 1

    self.test_dir = tempfile.mkdtemp()
class TestNLTKSentenceSegmenter(unittest.TestCase):
    def setUp(self):
        self.nltk = Pipeline()
        self.nltk.set_reader(StringReader())
        self.nltk.add_processor(NLTKSentenceSegmenter())

    def test_segmenter(self):
        sentences = [
            "This tool is called Forte.",
            "The goal of this project to help you build NLP pipelines.",
            "NLP has never been made this easy before."
        ]
        document = ' '.join(sentences)
        pack = self.nltk.process(document)
        for idx, sentence in enumerate(pack.get(Sentence)):
            self.assertEqual(sentence.text, sentences[idx])
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--config_file",
                        default="./config.yml",
                        help="Config YAML filepath")
    args = parser.parse_args()

    # loading config
    config = yaml.safe_load(open(args.config_file, "r"))

    nlp: Pipeline[MultiPack] = Pipeline()
    nlp.set_reader(RandomDataSelector(),
                   config=config["data_selector_config"])
    nlp.add(component=MultiPackBoxer(), config=config["boxer_config"])
    nlp.add(component=NLTKWordTokenizer(), selector=AllPackSelector())
    nlp.add(component=NLTKPOSTagger(), selector=AllPackSelector())
    nlp.add(
        component=ReplacementDataAugmentProcessor(),
        config=config["da_processor_config"],
    )
    nlp.initialize()

    for _, m_pack in enumerate(nlp.process_dataset()):
        aug_pack = m_pack.get_pack("augmented_input")
        logging.info(aug_pack.text)
def _create_pipeline(config):
    nlp = Pipeline()
    nlp.set_reader(StringReader())

    # Using SpacyProcessor to segment the sentences
    nlp.add_processor(
        processor=SpacyProcessor(),
        config={
            'processors': '',
            # Language code to build the Pipeline
            'lang': "en_core_web_sm",
            'use_gpu': False
        })

    nlp.add_processor(processor=AllenNLPProcessor(), config=config)
    nlp.initialize()
    return nlp
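# A minimal usage sketch for _create_pipeline (not part of the original source).
# The AllenNLP config keys shown here are assumptions for illustration; the options
# the processor actually accepts come from its default configuration.
allennlp_config = {"processors": "tokenize,pos,srl"}
nlp = _create_pipeline(allennlp_config)
pack = nlp.process("Forte helps you build NLP pipelines.")
for sentence in pack.get(Sentence):
    print(sentence.text)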
def create_pack_iterator(self) -> Iterator[DataPack]:
    srl_train_reader = OntonotesReader(cache_in_memory=True)
    train_pl: Pipeline = Pipeline()
    train_pl.set_reader(srl_train_reader)
    train_pl.initialize()
    pack_iterator = train_pl.process_dataset(self.train_path)
    return pack_iterator
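# Illustrative consumption of the iterator returned above (a sketch, not from the
# original source). It assumes self.train_path points at an OntoNotes-style
# directory and that Sentence is available from the ontology imports.
pack_iterator = self.create_pack_iterator()
for data_pack in pack_iterator:
    for sentence in data_pack.get(Sentence):
        ...  # e.g. build SRL training features per sentence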