import pytest
import stanza
import stanza_batch

# EXAMPLE_ONE (and the other EXAMPLE_* strings used below) are module-level
# fixture strings defined elsewhere in the test module.


def test_sentiment_in_sentence() -> None:
    stanza.download("en", processors="tokenize,sentiment")
    # Without the sentiment processor the attribute should not exist.
    nlp = stanza.Pipeline(lang="en", processors="tokenize", use_gpu=False)
    for document in stanza_batch.batch([EXAMPLE_ONE], nlp):
        for sentence in document.sentences:
            with pytest.raises(AttributeError):
                sentence.sentiment
    # With the sentiment processor every sentence carries an integer label.
    nlp = stanza.Pipeline(lang="en", processors="tokenize,sentiment",
                          use_gpu=False)
    for document in stanza_batch.batch([EXAMPLE_ONE], nlp):
        for sentence in document.sentences:
            assert isinstance(sentence.sentiment, int)
def load_document(self, input, language):
    if isinstance(input, str):
        # Plain text: split on newlines and run the pipeline on each
        # non-empty paragraph separately.
        paragraphs = input.split('\n')
        tokenized_paragraphs = [self.nlp(p) for p in paragraphs if p.strip()]
    elif isinstance(input, list):
        # Pre-split input: a list of paragraphs, each a list of sentences.
        paragraphs = [' '.join(para) for para in input]
        # Flatten the article into one sentence list, batch it through
        # stanza, then rebuild the original paragraph structure.
        sentence_list, sentence_paragraph = flat_article(input)
        sentence_nlp = [
            doc.sentences[0]
            for doc in batch(sentence_list, self.nlp, batch_size=32)
        ]
        article_rebuilt = unflat_article(sentence_nlp, sentence_paragraph)
        tokenized_paragraphs = [
            SimpleNamespace(sentences=para) for para in article_rebuilt
        ]
    else:
        raise NotImplementedError()
    return {
        'paragraphs': paragraphs,
        'tokenized_paragraphs': tokenized_paragraphs,
        'language': language,
    }
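# `flat_article` and `unflat_article` are not defined in this snippet. The
# sketch below is a hypothetical implementation, inferred only from how
# `load_document` calls them: flatten a list of paragraphs (each a list of
# sentence strings) into one sentence list plus a parallel list of paragraph
# indices, then regroup the processed sentences by those indices.
from typing import List, Tuple


def flat_article(article: List[List[str]]) -> Tuple[List[str], List[int]]:
    # Flatten the paragraphs into one sentence list, remembering which
    # paragraph each sentence came from.
    sentences: List[str] = []
    paragraph_indices: List[int] = []
    for para_index, paragraph in enumerate(article):
        for sentence in paragraph:
            sentences.append(sentence)
            paragraph_indices.append(para_index)
    return sentences, paragraph_indices


def unflat_article(sentences: list, paragraph_indices: List[int]) -> list:
    # Regroup the processed sentences into their original paragraphs.
    rebuilt: list = []
    for sentence, para_index in zip(sentences, paragraph_indices):
        while len(rebuilt) <= para_index:
            rebuilt.append([])
        rebuilt[para_index].append(sentence)
    return rebuilt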
from collections import Counter


def freq_dict(data_texts: list, tagging_method, nlp):
    vectors = []
    if tagging_method == 'SPACY':
        for text in data_texts:
            vector_new_tags = prep_text_new_tags(text)
            doc = nlp(" ".join(vector_new_tags[0]))
            # Count the part-of-speech tags across the document.
            pos_counts = Counter(token.pos_ for token in doc)
            vectors.append(vector_freq(pos_counts, vector_new_tags))
    elif tagging_method == 'STANZA':
        list_stza = doc_withing_double_space(data_texts)
        # https://pypi.org/project/stanza-batch/
        for doc in batch(list_stza, nlp, batch_size=100):
            vector_new_tags = prep_text_new_tags(doc.text)
            if isinstance(vector_new_tags[0], list):
                text = " ".join(vector_new_tags[0])
                sentence = join_sentences(text, nlp)
                pos_counts = Counter(word.upos for word in sentence)
                vectors.append(vector_freq(pos_counts, vector_new_tags))
    else:
        raise ValueError("You must enter a valid tagging method; "
                         "see TaggingMethod.")
    return vectors
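# A minimal usage sketch for `freq_dict`, assuming a stanza pipeline with the
# `tokenize` and `pos` processors loaded. The example texts are illustrative
# only, and the helpers it relies on (`prep_text_new_tags`, `vector_freq`,
# `doc_withing_double_space`, `join_sentences`) must be defined elsewhere.
import stanza

stanza_nlp = stanza.Pipeline(lang="en", processors="tokenize,pos",
                             use_gpu=False)
example_texts = ["The quick brown fox jumps over the lazy dog.",
                 "It was a bright cold day in April."]
pos_vectors = freq_dict(example_texts, 'STANZA', stanza_nlp)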
@pytest.mark.parametrize("include_ner", [True, False])
def test_ents_attribute_in_doc_and_sentence(include_ner: bool) -> None:
    entity_document = ["Two entities Alice and Bob"]
    processes = "tokenize"
    if include_ner:
        processes = "tokenize,ner"
    stanza.download("en", processors=processes)
    nlp = stanza.Pipeline(lang="en", processors=processes, use_gpu=False)
    for document in stanza_batch.batch(entity_document, nlp):
        if include_ner:
            assert document.ents
        else:
            assert not document.ents
        for sentence in document.sentences:
            if include_ner:
                assert sentence.ents
            else:
                assert not sentence.ents
import argparse
from pathlib import Path
from time import time
from typing import List

import GPUtil
import matplotlib.pyplot as plt
import stanza
from stanza.models.common.doc import Document

import stanza_batch

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--clear-cache", action="store_true",
                        help="Clear the GPU cache after each batch.")
    parser.add_argument("--save-fp", type=Path,
                        help="Where to save the GPU memory usage plot.")
    args = parser.parse_args()

    nlp = stanza.Pipeline(lang="en", processors="tokenize,pos,sentiment",
                          use_gpu=True)
    book_data: List[str] = []
    test_data_dir = Path(__file__, "..", "tests", "data").resolve()
    with Path(test_data_dir, "jane_austin_emma_data.txt").open("r") as emma_file:
        book_data = [line for line in emma_file]
    assert len(book_data) == 490

    t = time()
    gpu_memory_used: List[float] = []
    processed_book_data: List[Document] = []
    for document in stanza_batch.batch(book_data, nlp,
                                       clear_cache=args.clear_cache):
        processed_book_data.append(document)
        if args.save_fp:
            # Assuming the first GPU is the one being used.
            gpu_memory_used.append(GPUtil.getGPUs()[0].memoryUsed)
    print(f"Time taken: {time() - t}")

    if args.save_fp:
        number_documents_processed = range(len(processed_book_data))
        plt.plot(number_documents_processed, gpu_memory_used)
        plt.xlabel("Number of documents processed")
        plt.ylabel("GPU Memory used (MB)")
        plt.grid(True)
        plt.savefig(str(args.save_fp))
@pytest.mark.parametrize("torch_no_grad", [True, False])
@pytest.mark.parametrize("clear_cache", [True, False])
def test_batch(clear_cache: bool, torch_no_grad: bool) -> None:
    stanza.download("en", processors="tokenize,sentiment")
    nlp = stanza.Pipeline(lang="en", processors="tokenize,sentiment",
                          use_gpu=False)
    # One sample.
    count = 0
    for document in stanza_batch.batch([EXAMPLE_ONE], nlp,
                                       clear_cache=clear_cache,
                                       torch_no_grad=torch_no_grad):
        count += 1
        # Batching removes the `\n` on either side of the string.
        assert document.text == "Hello how are you"
        assert document.sentences[0].tokens[0].start_char == 0
        assert document.sentences[-1].tokens[-1].end_char == 17
        document_text = document.text
        for sentence in document.sentences:
            for token in sentence.tokens:
                assert document_text[token.start_char:token.end_char] == token.text
            assert isinstance(sentence.sentiment, int)
    assert count == 1

    # One sample that is split into three parts due to `\n\n` in the middle
    # of the string.
    count = 0
    for document in stanza_batch.batch([EXAMPLE_FOUR], nlp,
                                       clear_cache=clear_cache,
                                       torch_no_grad=torch_no_grad):
        count += 1
        # Batching removes the `\n \n\n` and adds `\n\n` in its place.
        assert document.text == "Hello how are you. Great Thanks\n\nSomething else\n\nAnother test"
        assert document.sentences[0].tokens[0].start_char == 0
        assert document.sentences[-1].tokens[-1].end_char == 61
        document_text = document.text
        for sentence in document.sentences:
            for token in sentence.tokens:
                assert document_text[token.start_char:token.end_char] == token.text
            assert isinstance(sentence.sentiment, int)
    assert count == 1

    # Multiple samples.
    text_dict = {
        0: EXAMPLE_ONE.strip(),
        1: EXAMPLE_THREE.strip(),
        2: "Hello how are you. Great Thanks\n\nSomething else\n\nAnother test",
        3: EXAMPLE_ONE.strip(),
    }
    documents = [EXAMPLE_ONE, EXAMPLE_THREE, EXAMPLE_FOUR, EXAMPLE_ONE]
    count = 0
    for index, document in enumerate(
            stanza_batch.batch(documents, nlp, batch_size=2,
                               clear_cache=clear_cache,
                               torch_no_grad=torch_no_grad)):
        count += 1
        document_text = document.text
        assert document_text == text_dict[index]
        for sentence in document.sentences:
            for token in sentence.tokens:
                assert document_text[token.start_char:token.end_char] == token.text
            assert isinstance(sentence.sentiment, int)
    assert count == len(documents)

    # One text spread across three batches.
    long_text = ("\nHi\n\nNice to meet you\n \n \nIt is a nice day\n\n"
                 "But it could be warmer\n \nBye!\n\n \n\n")
    count = 0
    for index, document in enumerate(
            stanza_batch.batch([long_text], nlp, batch_size=2,
                               clear_cache=clear_cache,
                               torch_no_grad=torch_no_grad)):
        count += 1
        document_text = document.text
        assert document_text == ("Hi\n\nNice to meet you\n\nIt is a nice day\n\n"
                                 "But it could be warmer\n\nBye!")
        for sentence in document.sentences:
            for token in sentence.tokens:
                assert document_text[token.start_char:token.end_char] == token.text
            assert isinstance(sentence.sentiment, int)
    assert count == 1

    # Real-world test across a number of samples from the Jane Austen
    # novel "Emma".
    book_data: List[str] = []
    test_data_dir = Path(__file__, "..", "data").resolve()
    with Path(test_data_dir, "jane_austin_emma_data.txt").open("r") as emma_file:
        book_data = [line for line in emma_file]
    assert len(book_data) == 490

    processed_book_data: List[Document] = [
        document
        for document in stanza_batch.batch(book_data, nlp,
                                           clear_cache=clear_cache,
                                           torch_no_grad=torch_no_grad)
    ]
    assert len(book_data) == len(processed_book_data)
    for true_data, processed_data in zip(book_data, processed_book_data):
        processed_text = processed_data.text
        assert true_data.strip() == processed_text
        for sentence in processed_data.sentences:
            for token in sentence.tokens:
                assert processed_text[token.start_char:token.end_char] == token.text
            assert isinstance(sentence.sentiment, int)
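# A minimal standalone sketch of the `stanza_batch.batch` API exercised by
# the test above: it yields one stanza Document per input string, however the
# strings are grouped into batches internally. The texts are illustrative
# only.
import stanza
import stanza_batch

en_nlp = stanza.Pipeline(lang="en", processors="tokenize", use_gpu=False)
texts = ["Hello how are you", "Nice to meet you\n\nIt is a nice day"]
for doc in stanza_batch.batch(texts, en_nlp, batch_size=32):
    print(doc.text)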
def test_misc_in_change_offsets() -> None:
    """
    Tests that we check whether "misc" exists as a key in each token within
    the change_offsets function. change_offsets is used within both the
    combine_stanza_documents and _batch_to_documents functions.
    """
    # For the _batch_to_documents function.
    stanza.download("cs", processors="tokenize,mwt")
    nlp = stanza.Pipeline(lang="cs", processors="tokenize,mwt", use_gpu=False)
    sent = "Požádal, aby mu vyhověli."
    correct_tokens = [
        [{"id": 1, "text": "Požádal", "misc": "start_char=0|end_char=7"}],
        [{"id": 2, "text": ",", "misc": "start_char=7|end_char=8"}],
        [
            {"id": (3, 4), "text": "aby", "misc": "start_char=9|end_char=12"},
            {"id": 3, "text": "aby"},
            {"id": 4, "text": "by"},
        ],
        [{"id": 5, "text": "mu", "misc": "start_char=13|end_char=15"}],
        [{"id": 6, "text": "vyhověli", "misc": "start_char=16|end_char=24"}],
        [{"id": 7, "text": ".", "misc": "start_char=24|end_char=25"}],
    ]
    docs = list(stanza_batch.batch([sent], nlp))
    assert len(docs) == 1
    for index, token in enumerate(docs[0].iter_tokens()):
        correct_token = correct_tokens[index]
        temp_token = token.to_dict()
        assert len(temp_token) == len(correct_token)
        assert temp_token == correct_token

    # For the combine_stanza_documents function.
    doc = stanza_batch.combine_stanza_documents(docs)
    for index, token in enumerate(doc.iter_tokens()):
        correct_token = correct_tokens[index]
        temp_token = token.to_dict()
        assert len(temp_token) == len(correct_token)
        assert temp_token == correct_token
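# A minimal sketch showing `stanza_batch.combine_stanza_documents` outside of
# the test: it merges the per-text Documents yielded by `stanza_batch.batch`
# into a single stanza Document with adjusted character offsets. The second
# sentence here is illustrative only.
import stanza
import stanza_batch

cs_nlp = stanza.Pipeline(lang="cs", processors="tokenize,mwt", use_gpu=False)
cs_docs = list(stanza_batch.batch(["Požádal, aby mu vyhověli.", "Dobrý den."],
                                  cs_nlp))
combined = stanza_batch.combine_stanza_documents(cs_docs)
print(combined.text)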