def test_pretokenized_multidoc():
    """Pretokenized input tokenizes to the gold tokens, both as raw text and as a Document list.

    Also checks that every token's character offsets index back into the
    document text exactly.
    """
    pipeline = stanza.Pipeline(processors='tokenize', dir=TEST_MODELS_DIR,
                               lang='en', tokenize_pretokenized=True)

    # Raw pretokenized text in, gold tokens out.
    doc = pipeline(EN_DOC_PRETOKENIZED)
    rendered = '\n\n'.join(sent.tokens_string() for sent in doc.sentences)
    assert EN_DOC_PRETOKENIZED_GOLD_TOKENS == rendered
    for sent in doc.sentences:
        for token in sent.tokens:
            assert doc.text[token._start_char: token._end_char] == token.text

    # Same check going through the multi-document (list-of-Document) entry point.
    doc = pipeline([stanza.Document([], text=EN_DOC_PRETOKENIZED_LIST)])[0]
    rendered = '\n\n'.join(sent.tokens_string() for sent in doc.sentences)
    assert EN_DOC_PRETOKENIZED_LIST_GOLD_TOKENS == rendered
    for sent in doc.sentences:
        for token in sent.tokens:
            assert doc.text[token._start_char: token._end_char] == token.text
def _tag(self, text: Union[str, List[str]]) -> List[TaggedDocument]:
    """Tag one or more documents with the stanza pipeline.

    Args:
        text: a single document string, or a list of document strings.

    Returns:
        One tagged-document dict (via ``self._to_dict``) per input document.
    """
    # Bug fix: a bare str used to be iterated character by character,
    # producing one stanza.Document per character. Normalize to a list first.
    if isinstance(text, str):
        text = [text]
    documents: List[stanza.Document] = [stanza.Document([], text=d) for d in text]
    tagged_documents: List[stanza.Document] = self.nlp(documents)
    # The pipeline may hand back a bare Document instead of a list;
    # normalize so the comprehension below always works.
    if isinstance(tagged_documents, stanza.Document):
        tagged_documents = [tagged_documents]
    return [self._to_dict(d) for d in tagged_documents]
def test_depparse_with_pretagged_doc():
    """Dependency-parsing a pre-tagged CoNLL-U document reproduces the gold parses."""
    config = {
        'processors': 'depparse',
        'dir': TEST_MODELS_DIR,
        'lang': 'en',
        'depparse_pretagged': True,
    }
    nlp = stanza.Pipeline(**config)

    doc = stanza.Document(CoNLL.conll2dict(input_str=EN_DOC_CONLLU_PRETAGGED))
    parsed = nlp(doc)

    actual = '\n\n'.join(sent.dependencies_string() for sent in parsed.sentences)
    assert EN_DOC_DEPENDENCY_PARSES_GOLD == actual
def extract_features(writer, language, corpus, sentence_list):
    """Extract word-order and dependency-length features per sentence.

    For every usable sentence, writes one CSV row per baseline:
    "observed", "optimal", and ten "random" rows.

    Args:
        writer: a csv.DictWriter-like object with a ``writerow`` method.
        language: language label copied into every row.
        corpus: corpus label copied into every row and into the row id.
        sentence_list: iterable of CoNLL-style sentences (lists of token
            columns; ``tok[1]`` is the form and ``tok[7]`` the deprel —
            presumably, per the debug print below).
    """
    sent_id = 0  # renamed from `id` to avoid shadowing the builtin
    for sentence in sentence_list:
        root = get_root(sentence)
        # First sanity check: is there a verbal root?
        if root is None:
            continue
        sentence_all = remove_punct_particles(sentence)
        sentence_open = remove_closed_class(sentence)
        # Convert back to stanza for later tree creation (lazy)
        try:
            document_all = stanza.Document(CoNLL.convert_conll([sentence_all]))
            document_open = stanza.Document(CoNLL.convert_conll([sentence_open]))
        except Exception:  # narrowed from bare except
            print("WARNING: Could not parse {0}".format(sent_id))
            continue
        try:
            dependency_tree_all = tree(document_all.sentences[0].dependencies)
            dependency_tree_open = tree(document_open.sentences[0].dependencies)
        except Exception:  # narrowed from bare except
            print("WARNING: Could not create tree for {0}".format(sent_id))
            continue
        # Second sanity check: can we make a tree?
        if len(dependency_tree_all) == 0 or len(dependency_tree_open) == 0:
            print(root)
            text = []
            for tok in sentence:
                text.append(tok[1])
                text.append(tok[7])
            print(text)
            print("WARNING: Dependencies empty! (sentence {0})".format(sent_id))
            sent_id += 1
            continue
        # Third sanity check: does it meet order_info requirements?
        root = get_root(sentence_all)  # Retrieve new verb index
        order_info = determine_order_from_constituents(root, sentence_all)
        if order_info is None:
            # NOTE(review): sent_id is intentionally not incremented on the
            # early continues above/here — matches the original behavior.
            continue
        data = {
            "language": language,
            "corpus": corpus,
            "id": "{0}_{1}".format(corpus, sent_id),
            "original_length": len(sentence),
        }
        data.update(order_info)
        data.update(head_final(sentence_all, sentence_open))
        # Bug fix: the original aliased the SAME dict for every baseline
        # (observed_data = data, etc.), so the "observed" row had already
        # been overwritten with the "optimal" values by the time it was
        # written, and random rows inherited stale keys. Copy per baseline.
        observed_data = dict(data)
        observed_data["baseline"] = "observed"
        observed_data.update(get_dep_length(sentence_all, sentence_open))
        writer.writerow(observed_data)
        optimal_data = dict(data)
        optimal_data["baseline"] = "optimal"
        optimal_data.update(
            get_optimal_dep_length(dependency_tree_all, dependency_tree_open))
        writer.writerow(optimal_data)
        for _ in range(10):
            random_data = dict(data)
            random_data["baseline"] = "random"
            random_data.update(
                get_random_dep_lengths(dependency_tree_all, dependency_tree_open))
            writer.writerow(random_data)
        sent_id += 1