def test_pipeline4(self, batch_size):
    """Tests a chain of Pack->Batch->Pack."""
    nlp = Pipeline()
    reader = SentenceReader()
    nlp.set_reader(reader)
    dummy1 = DummyPackProcessor()
    nlp.add_processor(processor=dummy1)
    dummy2 = DummmyFixedSizeBatchProcessor()
    config = {"batcher": {"batch_size": batch_size}}
    nlp.add_processor(processor=dummy2, config=config)
    dummy3 = DummyPackProcessor()
    nlp.add_processor(processor=dummy3)
    nlp.initialize()
    data_path = "data_samples/random_texts/0.txt"

    num_packs = 0
    for pack in nlp.process_dataset(data_path):
        types = list(pack.get_entries_by_type(NewType))
        num_packs += 1
        self.assertEqual(len(types), 1)
        self.assertEqual(types[0].value, "[PACK][BATCH][PACK]")

    # check that all packs are yielded
    self.assertEqual(num_packs, reader.count)

class MSMarcoPassageReaderTest(unittest.TestCase):
    def setUp(self):
        self.pipeline = Pipeline()
        self.pipeline.set_reader(MSMarcoPassageReader())
        self.pipeline.initialize()

        self.data_dir = 'data_samples/ms_marco_passage_retrieval'

        corpus_file = os.path.join(self.data_dir, 'collection.tsv')
        self.expected_content = {}
        with open(corpus_file, 'r') as f:
            for line in f.readlines():
                key, value = tuple(line.split('\t', 1))
                self.expected_content[key] = value

    def test_ms_marco_passage_reader(self):
        actual_content: Dict[str, str] = {}
        for data_pack in self.pipeline.process_dataset(self.data_dir):
            self.assertIsInstance(data_pack, DataPack)
            doc_entries = list(data_pack.get(Document))
            self.assertTrue(len(doc_entries) == 1)
            doc_entry: Document = doc_entries[0]
            self.assertIsInstance(doc_entry, Document)
            actual_content[data_pack.pack_name] = doc_entry.text
        self.assertDictEqual(actual_content, self.expected_content)

def test_pipeline7(self, batch_size1, batch_size2, batch_size3):
    # Tests a chain of Batch->Batch->Batch->Pack with different batch sizes.
    nlp = Pipeline()
    reader = MultiPackSentenceReader()
    nlp.set_reader(reader)
    dummy1 = DummmyFixedSizeBatchProcessor()
    config = {"batcher": {"batch_size": batch_size1}}
    nlp.add_processor(processor=dummy1, config=config,
                      selector=FirstPackSelector())
    dummy2 = DummmyFixedSizeBatchProcessor()
    config = {"batcher": {"batch_size": batch_size2}}
    nlp.add_processor(processor=dummy2, config=config,
                      selector=FirstPackSelector())
    dummy3 = DummmyFixedSizeBatchProcessor()
    config = {"batcher": {"batch_size": batch_size3}}
    nlp.add_processor(processor=dummy3, config=config,
                      selector=FirstPackSelector())
    dummy4 = DummyPackProcessor()
    nlp.add_processor(processor=dummy4, selector=FirstPackSelector())
    nlp.initialize()
    data_path = "data_samples/random_texts/0.txt"

    num_packs = 0
    for pack in nlp.process_dataset(data_path):
        types = list(pack.get_pack("pack").get_entries_by_type(NewType))
        num_packs += 1
        self.assertEqual(len(types), 1)
        self.assertEqual(types[0].value, "[BATCH][BATCH][BATCH][PACK]")

    # check that all packs are yielded
    self.assertEqual(num_packs, reader.count)

class AGNewsReaderTest(unittest.TestCase):
    def setUp(self):
        self.pipeline = Pipeline()
        self.pipeline.set_reader(AGNewsReader())
        self.pipeline.initialize()

        self.sample_file: str = os.path.abspath(
            os.path.join(os.path.dirname(os.path.realpath(__file__)),
                         *([os.path.pardir] * 4),
                         "data_samples/ag_news/sample.csv"))

        self.expected_content: Dict[int, str] = {}
        with open(self.sample_file, "r") as file:
            for line_id, line in enumerate(file):
                data = line.strip().split(",")
                class_id, title, description = (
                    int(data[0].replace('"', "")),
                    data[1],
                    data[2],
                )
                self.expected_content[line_id] = (class_id, title, description)

        self.class_idx_to_name = {
            1: "World",
            2: "Sports",
            3: "Business",
            4: "Sci/Tech",
        }

    def test_ag_news_reader(self):
        for data_pack in self.pipeline.process_dataset(self.sample_file):
            (
                expected_class_id,
                expected_title,
                expected_desc,
            ) = self.expected_content[data_pack.pack_name]
            self.assertIsInstance(data_pack, DataPack)

            # Test Article
            doc_entries = list(data_pack.get(Document))
            self.assertTrue(len(doc_entries) == 1)
            article: Document = doc_entries[0]
            self.assertIsInstance(article, Document)
            self.assertEqual(article.text,
                             expected_title + "\n" + expected_desc)

            # Test Document Class
            doc_class = article.document_class
            self.assertTrue(len(doc_class) == 1)
            self.assertEqual(doc_class[0],
                             self.class_idx_to_name[expected_class_id])

            # Test Title
            title_entries = list(data_pack.get(Title))
            self.assertTrue(len(title_entries) == 1)
            title: Title = title_entries[0]
            self.assertEqual(title.text, expected_title)

            # Test Description
            desc_entries = list(data_pack.get(Description))
            self.assertTrue(len(desc_entries) == 1)
            description: Description = desc_entries[0]
            self.assertEqual(description.text, expected_desc)

def test_pipeline(self, texts):
    for idx, text in enumerate(texts):
        file_path = os.path.join(self.test_dir, f"{idx+1}.txt")
        with open(file_path, 'w') as f:
            f.write(text)

    nlp = Pipeline()
    reader_config = {
        "input_pack_name": "query",
        "output_pack_name": "output"
    }
    nlp.set_reader(reader=MultiPackSentenceReader(), config=reader_config)

    config = {
        "model": {
            "name": "bert-base-uncased"
        },
        "tokenizer": {
            "name": "bert-base-uncased"
        },
        "max_seq_length": 128,
        "query_pack_name": "query"
    }
    nlp.add_processor(BertBasedQueryCreator(), config=config)
    nlp.initialize()

    for idx, m_pack in enumerate(nlp.process_dataset(self.test_dir)):
        query_pack = m_pack.get_pack("query")
        self.assertEqual(len(query_pack.generics), 1)
        self.assertIsInstance(query_pack.generics[0], Query)
        query = query_pack.generics[0].value
        self.assertEqual(query.shape, (1, 768))

def test_pipeline(self, texts):
    for idx, text in enumerate(texts):
        file_path = os.path.join(self.test_dir, f"{idx+1}.txt")
        with open(file_path, 'w') as f:
            f.write(text)

    nlp = Pipeline()
    reader_config = HParams(
        {
            "input_pack_name": "input",
            "output_pack_name": "output"
        },
        MultiPackSentenceReader.default_hparams())
    nlp.set_reader(reader=MultiPackSentenceReader(), config=reader_config)

    translator_config = HParams(
        {
            "src_language": "de",
            "target_language": "en",
            "in_pack_name": "input",
            "out_pack_name": "result"
        },
        None)
    nlp.add_processor(MicrosoftBingTranslator(), config=translator_config)
    nlp.initialize()

    english_results = ["Hey good morning", "This is Forte. A tool for NLP"]
    for idx, m_pack in enumerate(nlp.process_dataset(self.test_dir)):
        self.assertEqual(set(m_pack._pack_names),
                         set(["input", "output", "result"]))
        self.assertEqual(
            m_pack.get_pack("result").text, english_results[idx] + "\n")

class CoNLL03ReaderPipelineTest(unittest.TestCase):
    def setUp(self):
        # Define and config the Pipeline
        self.dataset_path = "data_samples/conll03"

        self.nlp = Pipeline()
        self.nlp.set_reader(CoNLL03Reader())
        self.nlp.add_processor(DummyPackProcessor())
        self.nlp.add_processor(DummyPackProcessor())
        self.nlp.initialize()

    def test_process_next(self):
        doc_exists = False
        # get processed pack from dataset
        for pack in self.nlp.process_dataset(self.dataset_path):
            # get sentence from pack
            for sentence in pack.get_entries(Sentence):
                doc_exists = True
                sent_text = sentence.text
                # second method to get entry in a sentence
                tokens = [
                    token.text for token in pack.get_entries(Token, sentence)
                ]
                self.assertEqual(sent_text, " ".join(tokens))
        self.assertTrue(doc_exists)

class PipelineTest(unittest.TestCase):
    def setUp(self) -> None:
        # Define and config the Pipeline
        self.nlp = Pipeline()
        self.nlp.set_reader(OntonotesReader())
        dummy = DummyRelationExtractor()
        config = HParams({"batcher": {
            "batch_size": 5
        }}, dummy.default_hparams())
        self.nlp.add_processor(dummy, config=config)
        self.nlp.initialize()

        self.dataset_path = \
            "forte/tests/data_samples/ontonotes_sample_dataset/00"

    def test_process_next(self):
        # get processed pack from dataset
        for pack in self.nlp.process_dataset(self.dataset_path):
            # get sentence from pack
            for sentence in pack.get_entries(Sentence):
                sent_text = sentence.text

                # first method to get entry in a sentence
                for link in pack.get_entries(RelationLink, sentence):
                    parent = link.get_parent()
                    child = link.get_child()
                    print(f"{parent.text} is {link.rel_type} {child.text}")
                    pass  # some operation on link

                # second method to get entry in a sentence
                tokens = [
                    token.text for token in pack.get_entries(Token, sentence)
                ]
                self.assertEqual(sent_text, " ".join(tokens))

def test_without_attribute_masker(self):
    pl = Pipeline()
    pl.set_reader(CoNLL03Reader())
    pl.initialize()

    for pack in pl.process_dataset("data_samples/conll03/"):
        entries = pack.get_entries_by_type(Token)
        for entry in entries:
            self.assertIsNotNone(entry.ner)

def test_process_next(self):
    another_pipeline = Pipeline()
    another_pipeline.set_reader(DeserializeReader())
    another_pipeline.initialize()

    data = ["Testing Reader", "Testing Deserializer"]
    for pack in self.nlp.process_dataset(data):
        for new_pack in another_pipeline.process_dataset([pack.serialize()]):
            self.assertEqual(pack.text, new_pack.text)

def test_attribute_masker(self):
    pl = Pipeline()
    pl.set_reader(CoNLL03Reader())
    # Mask (i.e. clear) the `ner` attribute of Token entries.
    config = {"kwargs": {Token: ["ner"]}}

    pl.add_processor(processor=AttributeMasker(), config=config)
    pl.initialize()

    for pack in pl.process_dataset("data_samples/conll03/"):
        entries = pack.get_entries_by_type(Token)
        for entry in entries:
            self.assertIsNone(entry.ner)

class StringReaderPipelineTest(unittest.TestCase):
    def setUp(self):
        # Define and config the Pipeline
        self.dataset_path = "examples/"

        self.pl1 = Pipeline()
        self._cache_directory = Path(os.path.join(os.getcwd(), "cache_data"))
        self.pl1.set_reader(StringReader())

        self.pl2 = Pipeline()
        self.pl2.set_reader(StringReader())

        self.text = (
            "The plain green Norway spruce is displayed in the gallery's "
            "foyer. Wentworth worked as an assistant to sculptor Henry Moore "
            "in the late 1960s. His reputation as a sculptor grew in the "
            "1980s.")

    def test_reader(self):
        self._process()
        self._read_caching()

    def _process(self):
        doc_exists = False
        for pack in self.pl1.process_dataset([self.text]):
            doc_exists = True
            self.assertEqual(self.text, pack.text)
        self.assertTrue(doc_exists)

    def _read_caching(self):
        doc_exists = False
        # get processed pack from dataset
        for pack in self.pl2.process_dataset([self.text]):
            doc_exists = True
            self.assertEqual(self.text, pack.text)
        self.assertTrue(doc_exists)

    def tearDown(self):
        os.system("rm -r {}".format(self._cache_directory))

def test_pipeline(self, texts):
    for idx, text in enumerate(texts):
        file_path = os.path.join(self.test_dir, f"{idx+1}.txt")
        with open(file_path, 'w') as f:
            f.write(text)

    nlp = Pipeline()
    reader_config = HParams({"input_pack_name": "input",
                             "output_pack_name": "output"},
                            MultiPackSentenceReader.default_hparams())
    nlp.set_reader(reader=MultiPackSentenceReader(), config=reader_config)
    nlp.initialize()

    for idx, m_pack in enumerate(nlp.process_dataset(self.test_dir)):
        self.assertEqual(m_pack._pack_names, ["input", "output"])
        self.assertEqual(m_pack.get_pack("input").text, texts[idx] + "\n")

def test_reader_no_replace_test(self):
    # Read with no replacements
    pipeline = Pipeline()
    reader = SquadReader()
    pipeline.set_reader(reader)
    pipeline.initialize()

    data_packs: Iterable[DataPack] = pipeline.process_dataset(
        self.dataset_path)
    file_path: str = self.dataset_path
    expected_file_dict = {}
    with open(file_path, "r", encoding="utf8", errors="ignore") as file:
        expected_json = json.load(file)
        for dic in expected_json["data"]:
            title = dic["title"]
            cnt = 0
            for qa_dic in dic["paragraphs"]:
                expected_file_dict[title + str(cnt)] = qa_dic  # qas, context
                cnt += 1

    count_packs = 0
    for pack in data_packs:
        count_packs += 1
        expected_text: str = ""
        expected = expected_file_dict[pack.pack_name]

        passage = list(pack.get(Passage))
        self.assertEqual(len(passage), 1)
        expected_context = expected["context"]
        self.assertEqual(passage[0].text, expected_context)
        expected_text += expected_context

        for qid, question in enumerate(pack.get(MRCQuestion)):
            expected_qa = expected["qas"][qid]
            expected_question = expected_qa["question"]
            expected_answers = expected_qa["answers"]
            self.assertEqual(question.text, expected_question)
            if not isinstance(expected_answers, list):
                expected_answers = [expected_answers]
            answers = question.answers
            for answer, expected_answer in zip(answers, expected_answers):
                self.assertEqual(answer.text, expected_answer["text"])
            expected_text += "\n" + expected_question

        self.assertEqual(pack.text, expected_text)

def main(dataset_dir: str, ner_model_path: str, srl_model_path: str):
    pl = Pipeline()
    pl.set_reader(PlainTextReader())
    pl.add_processor(NLTKSentenceSegmenter())
    pl.add_processor(NLTKWordTokenizer())
    pl.add_processor(NLTKPOSTagger())

    ner_configs = HParams(
        {'storage_path': os.path.join(ner_model_path, 'resources.pkl')},
        CoNLLNERPredictor.default_hparams())
    pl.add_processor(CoNLLNERPredictor(), ner_configs)

    srl_configs = HParams({
        'storage_path': srl_model_path,
    }, SRLPredictor.default_hparams())
    pl.add_processor(SRLPredictor(), srl_configs)

    pl.initialize()

    for pack in pl.process_dataset(dataset_dir):
        print(colored("Document", 'red'), pack.meta.doc_id)
        for sentence in pack.get(Sentence):
            sent_text = sentence.text
            print(colored("Sentence:", 'red'), sent_text, "\n")
            # first method to get entry in a sentence
            tokens = [(token.text, token.pos) for token in
                      pack.get(Token, sentence)]
            entities = [(entity.text, entity.ner_type) for entity in
                        pack.get(EntityMention, sentence)]
            print(colored("Tokens:", 'red'), tokens, "\n")
            print(colored("EntityMentions:", 'red'), entities, "\n")

            # second method to get entry in a sentence
            print(colored("Semantic role labels:", 'red'))
            for link in pack.get(PredicateLink, sentence):
                parent: PredicateMention = link.get_parent()  # type: ignore
                child: PredicateArgument = link.get_child()  # type: ignore
                print(f" - \"{child.text}\" is role {link.arg_type} of "
                      f"predicate \"{parent.text}\"")
                entities = [
                    entity.text for entity in pack.get(EntityMention, child)
                ]
                print(" Entities in predicate argument:", entities, "\n")
            print()

        input(colored("Press ENTER to continue...\n", 'green'))

def setUp(self): """ Reading the data into data_pack object to be used in the tests """ file_dir_path = os.path.dirname(__file__) conll_ud_dir = os.path.abspath(os.path.join(file_dir_path, *([os.pardir] * 4), 'data_samples/conll_ud')) pl = Pipeline() pl.set_reader(ConllUDReader()) pl.initialize() self.data_packs: List[DataPack] = \ [data_pack for data_pack in pl.process_dataset(conll_ud_dir)] self.doc_ids = ["weblog-blogspot.com_nominations_20041117172713_ENG_" "20041117_172713", "weblog-blogspot.com_nominations_20041117172713_ENG_" "20041117_172714"]
def test_reader_no_replace_test(self):
    # Read with no replacements
    pipeline = Pipeline()
    reader = RACEMultiChoiceQAReader()
    pipeline.set_reader(reader)
    pipeline.initialize()

    data_packs: Iterable[DataPack] = pipeline.process_dataset(
        self.dataset_path)
    file_paths: Iterator[str] = reader._collect(self.dataset_path)

    count_packs = 0
    for pack, file_path in zip(data_packs, file_paths):
        count_packs += 1
        expected_text: str = ""
        with open(file_path, "r", encoding="utf8", errors='ignore') as file:
            expected = json.load(file)

        articles = list(pack.get(RaceDocument))
        self.assertEqual(len(articles), 1)
        expected_article = expected['article']
        self.assertEqual(articles[0].text, expected_article)
        expected_text += expected_article

        for qid, question in enumerate(pack.get(Question)):
            expected_question = expected['questions'][qid]
            self.assertEqual(question.text, expected_question)
            expected_answers = expected['answers'][qid]
            if not isinstance(expected_answers, list):
                expected_answers = [expected_answers]
            expected_answers = [
                reader._convert_to_int(ans) for ans in expected_answers
            ]
            self.assertEqual(question.answers, expected_answers)
            expected_text += '\n' + expected_question

            for oid, option in enumerate(question.options):
                expected_option = expected['options'][qid][oid]
                self.assertEqual(option.text, expected_option)
                expected_text += '\n' + expected_option

        self.assertEqual(pack.text, expected_text)
    self.assertEqual(count_packs, 2)

def test_pipeline1(self):
    """Tests a pack processor only."""
    nlp = Pipeline()
    reader = MultiPackSentenceReader()
    nlp.set_reader(reader)
    dummy = DummyPackProcessor()
    nlp.add_processor(dummy, selector=FirstPackSelector())
    nlp.initialize()
    data_path = "data_samples/random_texts/0.txt"

    num_packs = 0
    for pack in nlp.process_dataset(data_path):
        types = list(pack.get_pack("pack").get_entries_by_type(NewType))
        num_packs += 1
        self.assertEqual(len(types), 1)
        self.assertEqual(types[0].value, "[PACK]")

    # check that all packs are yielded
    self.assertEqual(num_packs, reader.count)

class DeserializeReaderPipelineTest(unittest.TestCase):
    def setUp(self):
        # Define and config the Pipeline
        self.nlp = Pipeline()
        self.nlp.set_reader(StringReader())
        self.nlp.initialize()

    def test_process_next(self):
        another_pipeline = Pipeline()
        another_pipeline.set_reader(DeserializeReader())
        another_pipeline.initialize()

        data = ["Testing Reader", "Testing Deserializer"]
        for pack in self.nlp.process_dataset(data):
            for new_pack in another_pipeline.process_dataset(
                    [pack.serialize()]):
                self.assertEqual(pack.text, new_pack.text)

def main(dataset_dir: str):
    config = yaml.safe_load(open("config.yml", "r"))
    config = HParams(config, default_hparams=None)

    pl = Pipeline()
    pl.set_reader(PlainTextReader())
    pl.add_processor(NLTKSentenceSegmenter())
    pl.add_processor(NLTKWordTokenizer())
    pl.add_processor(NLTKPOSTagger())
    pl.add_processor(CoNLLNERPredictor(), config=config.NER)
    pl.add_processor(SRLPredictor(), config=config.SRL)
    pl.initialize()

    for pack in pl.process_dataset(dataset_dir):
        print(colored("Document", 'red'), pack.meta.doc_id)
        for sentence in pack.get(Sentence):
            sent_text = sentence.text
            print(colored("Sentence:", 'red'), sent_text, "\n")
            # first method to get entry in a sentence
            tokens = [(token.text, token.pos) for token in
                      pack.get(Token, sentence)]
            entities = [(entity.text, entity.ner_type) for entity in
                        pack.get(EntityMention, sentence)]
            print(colored("Tokens:", 'red'), tokens, "\n")
            print(colored("EntityMentions:", 'red'), entities, "\n")

            # second method to get entry in a sentence
            print(colored("Semantic role labels:", 'red'))
            for link in pack.get(PredicateLink, sentence):
                parent: PredicateMention = link.get_parent()  # type: ignore
                child: PredicateArgument = link.get_child()  # type: ignore
                print(f" - \"{child.text}\" is role {link.arg_type} of "
                      f"predicate \"{parent.text}\"")
                entities = [
                    entity.text for entity in pack.get(EntityMention, child)
                ]
                print(" Entities in predicate argument:", entities, "\n")
            print()

        input(colored("Press ENTER to continue...\n", 'green'))

def test_pipeline2(self):
    """Tests a batch processor only."""
    nlp = Pipeline()
    reader = MultiPackSentenceReader()
    nlp.set_reader(reader)
    dummy = DummmyFixedSizeBatchProcessor()
    config = {"batcher": {"batch_size": 4}}
    nlp.add_processor(processor=dummy, config=config,
                      selector=FirstPackSelector())
    nlp.initialize()
    data_path = "data_samples/random_texts/0.txt"

    num_packs = 0
    for pack in nlp.process_dataset(data_path):
        types = list(pack.get_pack("pack").get_entries_by_type(NewType))
        num_packs += 1
        self.assertEqual(len(types), 1)
        self.assertEqual(types[0].value, "[BATCH]")

    # check that all packs are yielded
    self.assertEqual(num_packs, reader.count)

def test_process_next(self):
    # Define and config the Pipeline
    nlp = Pipeline()
    nlp.set_reader(OntonotesReader())
    dummy = DummyRelationExtractor()
    config = {"batcher": {"batch_size": 5}}
    nlp.add_processor(dummy, config=config)
    nlp.initialize()

    dataset_path = "data_samples/ontonotes/00"

    # get processed pack from dataset
    for pack in nlp.process_dataset(dataset_path):
        # get sentence from pack
        for sentence in pack.get_entries(Sentence):
            sent_text = sentence.text
            # second method to get entry in a sentence
            tokens = [
                token.text for token in pack.get_entries(Token, sentence)
            ]
            self.assertEqual(sent_text, " ".join(tokens))

def test_reader(self):
    pipeline = Pipeline()
    reader = SST2Reader()
    pipeline.set_reader(reader)
    pipeline.initialize()

    data_packs: Iterable[DataPack] = pipeline.process_dataset(
        self.dataset_path, self.n_samples_per_pack)
    for i, pack in enumerate(data_packs):
        # Test a simple case
        """
                      0
                      |
                      7
               _______|_______
              |               |
              6               5
          ____|____      _____|_____
         |         |    |           |
        Effective but too-tepid  biopic
        """
        if i == 0:
            count_root: int = 0
            count_leaf: int = 0
            root: ConstituentNode = None
            for cnode in pack.get(ConstituentNode):
                if cnode.is_root:
                    root = cnode
                    count_root += 1
                if cnode.is_leaf:
                    count_leaf += 1
            self.assertEqual(count_root, 1)
            self.assertEqual(count_leaf, 4)

            # node 0
            self.assertEqual(root.text, "Effective but too-tepid biopic")
            self.assertEqual(len(root.children_nodes), 1)

            # node 7
            root = root.children_nodes[0]
            self.assertEqual(root.text, "Effective but too-tepid biopic")
            self.assertEqual(len(root.children_nodes), 2)
            self.assertEqual(root.sentiment["pos"], 0.51389)

            left_subtree = root.children_nodes[0]
            right_subtree = root.children_nodes[1]

            # node 6
            self.assertEqual(left_subtree.text, "Effective but")
            self.assertEqual(len(left_subtree.children_nodes), 2)
            self.assertEqual(left_subtree.sentiment["pos"], 0.63889)

            # node 5
            self.assertEqual(right_subtree.text, "too-tepid biopic")
            self.assertEqual(len(right_subtree.children_nodes), 2)
            self.assertEqual(right_subtree.sentiment["pos"], 0.375)

            leaf_node_1 = left_subtree.children_nodes[0]
            leaf_node_2 = left_subtree.children_nodes[1]
            leaf_node_3 = right_subtree.children_nodes[0]
            leaf_node_4 = right_subtree.children_nodes[1]

            self.assertEqual(leaf_node_1.text, "Effective")
            self.assertEqual(leaf_node_1.is_leaf, True)
            self.assertEqual(leaf_node_1.parent_node, left_subtree)

            self.assertEqual(leaf_node_2.text, "but")
            self.assertEqual(leaf_node_2.is_leaf, True)
            self.assertEqual(leaf_node_2.parent_node, left_subtree)

            self.assertEqual(leaf_node_3.text, "too-tepid")
            self.assertEqual(leaf_node_3.is_leaf, True)
            self.assertEqual(leaf_node_3.parent_node, right_subtree)

            self.assertEqual(leaf_node_4.text, "biopic")
            self.assertEqual(leaf_node_4.is_leaf, True)
            self.assertEqual(leaf_node_4.parent_node, right_subtree)

from forte.processors import ElasticSearchIndexProcessor
from forte.pipeline import Pipeline

if __name__ == "__main__":
    nlp = Pipeline()
    nlp.set_reader(CorpusReader())
    config = tx.HParams(
        {
            "batch_size": 100000,
            "fields": ["doc_id", "content"],
            "indexer": {
                "name": "ElasticSearchIndexer",
                "hparams": {
                    "index_name": "elastic_indexer2",
                    "hosts": "localhost:9200",
                    "algorithm": "bm25"
                },
                "other_kwargs": {
                    "request_timeout": 60,
                    "refresh": False
                }
            }
        },
        default_hparams=None)
    nlp.add_processor(ElasticSearchIndexProcessor(), config=config)
    nlp.initialize()

    for idx, pack in enumerate(nlp.process_dataset(".")):
        if idx + 1 > 0 and (idx + 1) % 100000 == 0:
            print(f"Completed {idx+1} packs")

class ProdigyReaderTest(unittest.TestCase):
    def setUp(self):
        # Define and config the Pipeline
        self.fp = tempfile.NamedTemporaryFile(mode='w',
                                              suffix='.jsonl',
                                              delete=False)
        self.nlp = Pipeline()
        self.nlp.set_reader(ProdigyReader())
        self.create_sample_file()

    def tearDown(self):
        os.system("rm {}".format(self.fp.name))

    def create_sample_file(self):
        prodigy_entry = {
            "text": "Lorem ipsum dolor sit amet",
            "tokens": [{
                "text": "Lorem",
                "start": 0,
                "end": 5,
                "id": 0
            }, {
                "text": "ipsum",
                "start": 6,
                "end": 11,
                "id": 1
            }, {
                "text": "dolor",
                "start": 12,
                "end": 17,
                "id": 2
            }, {
                "text": "sit",
                "start": 18,
                "end": 21,
                "id": 3
            }, {
                "text": "amet",
                "start": 22,
                "end": 26,
                "id": 4
            }],
            "spans": [{
                "start": 0,
                "end": 5,
                "token_start": 0,
                "token_end": 1,
                "label": "sample_latin"
            }, {
                "start": 12,
                "end": 26,
                "token_start": 2,
                "token_end": 18,
                "label": "sample_latin"
            }],
            "meta": {
                "id": "doc_1",
                "sect_id": 1,
                "version": "1"
            },
            "_input_hash": 123456789,
            "_task_hash": -123456789,
            "_session_id": "abcd",
            "_view_id": "ner_manual",
            "answer": "accept"
        }

        # for entry in JSON_file:
        json.dump(prodigy_entry, self.fp)
        self.fp.write('\n')
        json.dump(prodigy_entry, self.fp)
        self.fp.write('\n')
        self.fp.close()

    def test_packs(self):
        doc_exists = False
        # get processed pack from dataset
        for pack in self.nlp.process_dataset(self.fp.name):
            # get documents from pack
            for doc in pack.get_entries(Document):
                doc_exists = True
                self.token_check(doc, pack)
                self.label_check(doc, pack)
            self.assertEqual(pack.meta.doc_id, "doc_1")
        self.assertTrue(doc_exists)

    def token_check(self, doc, pack):
        doc_text = doc.text
        # Compare document text with tokens
        tokens = [token.text for token in pack.get_entries(Token, doc)]
        self.assertEqual(tokens[2], "dolor")
        self.assertEqual(doc_text.replace(" ", ""), "".join(tokens))

    def label_check(self, doc, pack):
        # make sure that the labels are read in correctly
        labels = [
            label.ner_type for label in pack.get_entries(EntityMention, doc)
        ]
        self.assertEqual(labels, ["sample_latin", "sample_latin"])

    ElasticSearchQueryCreator
from forte.processors.elastic_search_processor import \
    ElasticSearchProcessor
from forte.pipeline import Pipeline

from reader import EvalReader
from ms_marco_evaluator import MSMarcoEvaluator

if __name__ == "__main__":
    config = yaml.safe_load(open("config.yml", "r"))
    config = tx.HParams(config, default_hparams=None)

    ms_marco_evaluator = MSMarcoEvaluator()
    nlp = Pipeline()
    nlp.set_reader(reader=EvalReader(), config=config.reader)
    nlp.add_processor(processor=ElasticSearchQueryCreator(),
                      config=config.query_creator)
    nlp.add_processor(processor=ElasticSearchProcessor(),
                      config=config.indexer)
    nlp.set_evaluator(evaluator=ms_marco_evaluator, config=config.evaluator)
    nlp.initialize()

    for idx, m_pack in enumerate(
            nlp.process_dataset(
                "./collection_and_queries/queries.dev.small.tsv")):
        if (idx + 1) % 1000 == 0:
            print(f"Processed {idx+1} examples")

    scores = nlp.evaluate()
    print(scores)

config_predict = yaml.safe_load(open("configs/config_predict.yml", "r"))
saved_model = torch.load(config_predict["model_path"])
train_state = torch.load(config_predict["train_state_path"])

reader = CoNLL03Reader()
predictor = TaggingPredictor()
evaluator = CoNLLNEREvaluator()

pl = Pipeline()
pl.set_reader(reader)
pl.add(predictor)
pl.add(evaluator)
pl.initialize()

for pack in pl.process_dataset(config_predict["test_path"]):
    print("---- pack ----")
    for instance in pack.get(Sentence):
        sent = instance.text
        output_tags = []
        # `task` is assumed to be defined elsewhere in this script
        # (e.g. chosen when the model was trained).
        if task == "ner":
            for entry in pack.get(EntityMention, instance):
                output_tags.append((entry.text, entry.ner_type))
        else:
            for entry in pack.get(Token, instance):
                output_tags.append((entry.text, entry.pos))
        print("---- example -----")
        print("sentence: ", sent)
        print("output_tags: ", output_tags)

print(evaluator.get_result())

def main():
    config = yaml.safe_load(open("config.yml", "r"))
    config = HParams(config, default_hparams=None)

    if not os.path.exists(config.indexer.model_dir):
        print(f"Creating a new index...")
        encoder = BERTEncoder(pretrained_model_name="bert-base-uncased")
        encoder.to(device)

        feature_original_types = {
            "id": ["int64", "FixedLenFeature"],
            "input_ids": ["int64", "FixedLenFeature",
                          config.indexer.max_seq_length],
            "segment_ids": ["int64", "FixedLenFeature",
                            config.indexer.max_seq_length],
            "text": ["str", "FixedLenFeature"]
        }

        hparam = {
            "allow_smaller_final_batch": True,
            "batch_size": config.indexer.batch_size,
            "dataset": {
                "data_name": "data",
                "feature_original_types": feature_original_types,
                "files": config.indexer.pickle_data_dir
            },
            "shuffle": False
        }

        print(f"Embedding the text using BERTEncoder...")
        record_data = RecordData(hparams=hparam, device=device)
        data_iterator = DataIterator(record_data)

        index = EmbeddingBasedIndexer(hparams={
            "index_type": "GpuIndexFlatIP",
            "dim": 768,
            "device": "gpu0"
        })

        for idx, batch in enumerate(data_iterator):
            ids = batch["id"]
            input_ids = batch["input_ids"]
            segment_ids = batch["segment_ids"]
            text = batch["text"]
            _, pooled_output = get_embeddings(encoder, input_ids, segment_ids)
            index.add(vectors=pooled_output,
                      meta_data={k.item(): v for k, v in zip(ids, text)})

            if (idx + 1) % 50 == 0:
                print(f"Completed {idx+1} batches of size "
                      f"{config.indexer.batch_size}")

        index.save(path=config.indexer.model_dir)

    resource = Resources()
    query_pipeline = Pipeline(resource=resource)
    query_pipeline.set_reader(MultiPackTerminalReader())

    query_pipeline.add_processor(
        processor=MachineTranslationProcessor(), config=config.translator)
    query_pipeline.add_processor(
        processor=QueryCreator(), config=config.query_creator)
    query_pipeline.add_processor(
        processor=SearchProcessor(), config=config.indexer)
    query_pipeline.add_processor(
        processor=NLTKSentenceSegmenter(),
        selector=NameMatchSelector(select_name="doc_0"))
    query_pipeline.add_processor(
        processor=NLTKWordTokenizer(),
        selector=NameMatchSelector(select_name="doc_0"))
    query_pipeline.add_processor(
        processor=NLTKPOSTagger(),
        selector=NameMatchSelector(select_name="doc_0"))
    query_pipeline.add_processor(
        processor=SRLPredictor(), config=config.SRL,
        selector=NameMatchSelector(select_name="doc_0"))
    # query_pipeline.add_processor(
    #     processor=CoNLLNERPredictor(), config=config.NER,
    #     selector=NameMatchSelector(select_name="doc_0"))
    query_pipeline.add_processor(
        processor=MachineTranslationProcessor(),
        config=config.back_translator)

    query_pipeline.initialize()

    for m_pack in query_pipeline.process_dataset():
        # update resource to be used in the next conversation
        query_pack = m_pack.get_pack("query")
        if resource.get("user_utterance"):
            resource.get("user_utterance").append(query_pack)
        else:
            resource.update(user_utterance=[query_pack])

        response_pack = m_pack.get_pack("response")
        if resource.get("bot_utterance"):
            resource.get("bot_utterance").append(response_pack)
        else:
            resource.update(bot_utterance=[response_pack])

        english_pack = m_pack.get_pack("pack")
        print(colored("English Translation of the query: ", "green"),
              english_pack.text, "\n")
        pack = m_pack.get_pack("doc_0")
        print(colored("Retrieved Document", "green"), pack.text, "\n")
        print(colored("German Translation", "green"),
              m_pack.get_pack("response").text, "\n")
        for sentence in pack.get(Sentence):
            sent_text = sentence.text
            print(colored("Sentence:", 'red'), sent_text, "\n")
            print(colored("Semantic role labels:", 'red'))
            for link in pack.get(PredicateLink, sentence):
                parent = link.get_parent()
                child = link.get_child()
                print(f" - \"{child.text}\" is role {link.arg_type} of "
                      f"predicate \"{parent.text}\"")
            print()

        input(colored("Press ENTER to continue...\n", 'green'))

    output = model.decode(
        text=text,
        char_batch=char_tensor,
        char_masks=char_masks,
        text_batch=text_tensor,
        text_mask=text_mask,
        srl_features=srl_features,
    )
    print(output)
    return {"pred_link_tag": output}


train_state = torch.load("train_state.pkl")

predictor = Predictor(
    batch_size=10,
    model=saved_model,
    predict_forward_fn=predict_forward_fn,
    feature_resource=train_state["feature_resource"],
)

pl = Pipeline()
pl.set_reader(reader)
pl.add(predictor)
pl.initialize()

for pack in pl.process_dataset("data/test"):
    print("====== pack ======")
    for instance in pack.get(Sentence):
        pass

def main():
    config = yaml.safe_load(open("config.yml", "r"))
    config = HParams(config, default_hparams=None)

    resource = Resources()
    query_pipeline = Pipeline(resource=resource)
    query_pipeline.set_reader(reader=MultiPackTerminalReader(),
                              config=config.reader)

    query_pipeline.add_processor(processor=MicrosoftBingTranslator(),
                                 config=config.translator)
    query_pipeline.add_processor(processor=BertBasedQueryCreator(),
                                 config=config.query_creator)
    query_pipeline.add_processor(processor=SearchProcessor(),
                                 config=config.indexer)
    query_pipeline.add_processor(
        processor=NLTKSentenceSegmenter(),
        selector=NameMatchSelector(
            select_name=config.indexer.response_pack_name[0]))
    query_pipeline.add_processor(
        processor=NLTKWordTokenizer(),
        selector=NameMatchSelector(
            select_name=config.indexer.response_pack_name[0]))
    query_pipeline.add_processor(
        processor=NLTKPOSTagger(),
        selector=NameMatchSelector(
            select_name=config.indexer.response_pack_name[0]))
    query_pipeline.add_processor(
        processor=SRLPredictor(), config=config.SRL,
        selector=NameMatchSelector(
            select_name=config.indexer.response_pack_name[0]))
    query_pipeline.add_processor(processor=MicrosoftBingTranslator(),
                                 config=config.back_translator)

    query_pipeline.initialize()

    for m_pack in query_pipeline.process_dataset():
        # update resource to be used in the next conversation
        query_pack = m_pack.get_pack(config.translator.in_pack_name)
        if resource.get("user_utterance"):
            resource.get("user_utterance").append(query_pack)
        else:
            resource.update(user_utterance=[query_pack])

        response_pack = m_pack.get_pack(config.back_translator.in_pack_name)
        if resource.get("bot_utterance"):
            resource.get("bot_utterance").append(response_pack)
        else:
            resource.update(bot_utterance=[response_pack])

        english_pack = m_pack.get_pack("pack")
        print(colored("English Translation of the query: ", "green"),
              english_pack.text, "\n")
        pack = m_pack.get_pack(config.indexer.response_pack_name[0])
        print(colored("Retrieved Document", "green"), pack.text, "\n")
        print(colored("German Translation", "green"),
              m_pack.get_pack("response").text, "\n")
        for sentence in pack.get(Sentence):
            sent_text = sentence.text
            print(colored("Sentence:", 'red'), sent_text, "\n")
            print(colored("Semantic role labels:", 'red'))
            for link in pack.get(PredicateLink, sentence):
                parent = link.get_parent()
                child = link.get_child()
                print(f" - \"{child.text}\" is role {link.arg_type} of "
                      f"predicate \"{parent.text}\"")
            print()

        input(colored("Press ENTER to continue...\n", 'green'))