def test_pretokenized_multidoc():
    """Pretokenized input tokenizes to the gold tokens, both as raw text and as a Document list.

    Also checks that every token's character offsets index back into the
    document text exactly.
    """
    pipeline = stanza.Pipeline(processors='tokenize', dir=TEST_MODELS_DIR,
                               lang='en', tokenize_pretokenized=True)

    # Raw pretokenized text in, gold tokens out.
    doc = pipeline(EN_DOC_PRETOKENIZED)
    rendered = '\n\n'.join(sent.tokens_string() for sent in doc.sentences)
    assert EN_DOC_PRETOKENIZED_GOLD_TOKENS == rendered
    for sent in doc.sentences:
        for token in sent.tokens:
            assert doc.text[token._start_char: token._end_char] == token.text

    # Same check going through the multi-document (list-of-Document) entry point.
    doc = pipeline([stanza.Document([], text=EN_DOC_PRETOKENIZED_LIST)])[0]
    rendered = '\n\n'.join(sent.tokens_string() for sent in doc.sentences)
    assert EN_DOC_PRETOKENIZED_LIST_GOLD_TOKENS == rendered
    for sent in doc.sentences:
        for token in sent.tokens:
            assert doc.text[token._start_char: token._end_char] == token.text
def _tag(self, text: Union[str, List[str]]) -> List[TaggedDocument]:
    """Tag one or more documents with the stanza pipeline.

    Args:
        text: a single document string, or a list of document strings.

    Returns:
        One tagged-document dict (via ``self._to_dict``) per input document.
    """
    # Bug fix: a bare str used to be iterated character by character,
    # producing one stanza.Document per character. Normalize to a list first.
    if isinstance(text, str):
        text = [text]
    documents: List[stanza.Document] = [stanza.Document([], text=d) for d in text]
    tagged_documents: List[stanza.Document] = self.nlp(documents)
    # The pipeline may hand back a bare Document instead of a list;
    # normalize so the comprehension below always works.
    if isinstance(tagged_documents, stanza.Document):
        tagged_documents = [tagged_documents]
    return [self._to_dict(d) for d in tagged_documents]
def test_depparse_with_pretagged_doc():
    """Dependency-parsing a pre-tagged CoNLL-U document reproduces the gold parses."""
    config = {
        'processors': 'depparse',
        'dir': TEST_MODELS_DIR,
        'lang': 'en',
        'depparse_pretagged': True,
    }
    nlp = stanza.Pipeline(**config)

    doc = stanza.Document(CoNLL.conll2dict(input_str=EN_DOC_CONLLU_PRETAGGED))
    parsed = nlp(doc)

    actual = '\n\n'.join(sent.dependencies_string() for sent in parsed.sentences)
    assert EN_DOC_DEPENDENCY_PARSES_GOLD == actual
def extract_features(writer, language, corpus, sentence_list):
    """Extract word-order and dependency-length features per sentence.

    For every usable sentence, writes one CSV row per baseline:
    "observed", "optimal", and ten "random" rows.

    Args:
        writer: a csv.DictWriter-like object with a ``writerow`` method.
        language: language label copied into every row.
        corpus: corpus label copied into every row and into the row id.
        sentence_list: iterable of CoNLL-style sentences (lists of token
            columns; ``tok[1]`` is the form and ``tok[7]`` the deprel —
            presumably, per the debug print below).
    """
    sent_id = 0  # renamed from `id` to avoid shadowing the builtin
    for sentence in sentence_list:
        root = get_root(sentence)
        # First sanity check: is there a verbal root?
        if root is None:
            continue
        sentence_all = remove_punct_particles(sentence)
        sentence_open = remove_closed_class(sentence)
        # Convert back to stanza for later tree creation (lazy)
        try:
            document_all = stanza.Document(CoNLL.convert_conll([sentence_all]))
            document_open = stanza.Document(CoNLL.convert_conll([sentence_open]))
        except Exception:  # narrowed from bare except
            print("WARNING: Could not parse {0}".format(sent_id))
            continue
        try:
            dependency_tree_all = tree(document_all.sentences[0].dependencies)
            dependency_tree_open = tree(document_open.sentences[0].dependencies)
        except Exception:  # narrowed from bare except
            print("WARNING: Could not create tree for {0}".format(sent_id))
            continue
        # Second sanity check: can we make a tree?
        if len(dependency_tree_all) == 0 or len(dependency_tree_open) == 0:
            print(root)
            text = []
            for tok in sentence:
                text.append(tok[1])
                text.append(tok[7])
            print(text)
            print("WARNING: Dependencies empty! (sentence {0})".format(sent_id))
            sent_id += 1
            continue
        # Third sanity check: does it meet order_info requirements?
        root = get_root(sentence_all)  # Retrieve new verb index
        order_info = determine_order_from_constituents(root, sentence_all)
        if order_info is None:
            # NOTE(review): sent_id is intentionally not incremented on the
            # early continues above/here — matches the original behavior.
            continue
        data = {
            "language": language,
            "corpus": corpus,
            "id": "{0}_{1}".format(corpus, sent_id),
            "original_length": len(sentence),
        }
        data.update(order_info)
        data.update(head_final(sentence_all, sentence_open))
        # Bug fix: the original aliased the SAME dict for every baseline
        # (observed_data = data, etc.), so the "observed" row had already
        # been overwritten with the "optimal" values by the time it was
        # written, and random rows inherited stale keys. Copy per baseline.
        observed_data = dict(data)
        observed_data["baseline"] = "observed"
        observed_data.update(get_dep_length(sentence_all, sentence_open))
        writer.writerow(observed_data)
        optimal_data = dict(data)
        optimal_data["baseline"] = "optimal"
        optimal_data.update(
            get_optimal_dep_length(dependency_tree_all, dependency_tree_open))
        writer.writerow(optimal_data)
        for _ in range(10):
            random_data = dict(data)
            random_data["baseline"] = "random"
            random_data.update(
                get_random_dep_lengths(dependency_tree_all, dependency_tree_open))
            writer.writerow(random_data)
        sent_id += 1