Example #1
class Word:
    """Contains attributes of each processed word in a list of
    words. Designed to be used in the ``Doc.words`` dataclass.

    >>> from cltk.core.data_types import Word
    >>> from cltk.languages.example_texts import get_example_text
    >>> get_example_text("lat")[:25]
    'Gallia est omnis divisa i'
    >>> from cltk.languages.utils import get_lang
    >>> lat = get_lang("lat")
    >>> Word(index_char_start=0, index_char_stop=6, index_token=0, string=get_example_text("lat")[0:6], pos="nom")
    Word(index_char_start=0, index_char_stop=6, index_token=0, index_sentence=None, string='Gallia', pos='nom', \
lemma=None, stem=None, scansion=None, xpos=None, upos=None, dependency_relation=None, governor=None, features={}, \
category={}, stop=None, named_entity=None, syllables=None, phonetic_transcription=None, definition=None)
    """

    index_char_start: int = None  # offset of the word's first character in the raw text
    index_char_stop: int = None  # offset one past the word's last character
    index_token: int = None  # 0-based position of the word within its sentence
    index_sentence: int = None  # 0-based position of the sentence within the document
    string: str = None  # the surface form of the word
    pos: str = None  # part-of-speech label
    lemma: str = None
    stem: str = None
    scansion: str = None
    xpos: str = None  # treebank-specific POS tag (from stanza)
    upos: str = None  # universal POS tag (from stanza)
    dependency_relation: str = None  # (from stanza)
    governor: int = None  # index of the syntactic head; -1 conventionally marks the root
    # NOTE: ``default_factory`` gives each Word its own bundle; a plain
    # ``MorphosyntacticFeatureBundle()`` default would be one shared mutable
    # instance across every Word, so mutating one word's features would
    # silently mutate them all.
    features: MorphosyntacticFeatureBundle = field(
        default_factory=MorphosyntacticFeatureBundle
    )
    category: MorphosyntacticFeatureBundle = field(
        default_factory=MorphosyntacticFeatureBundle
    )
    embedding: np.ndarray = field(repr=False, default=None)  # word vector; excluded from repr
    stop: bool = None  # True if the word is a stopword
    named_entity: bool = None
    syllables: List[str] = None
    phonetic_transcription: str = None
    definition: str = None

    def __getitem__(
        self, feature_name: Union[str, Type[MorphosyntacticFeature]]
    ) -> List[MorphosyntacticFeature]:
        """Accessor to help get morphosyntactic features from a word object."""
        return self.features[feature_name]

    def __getattr__(self, item: str):
        """Accessor to help get morphosyntactic features from a word object
        via attribute syntax, e.g. ``word.case``.

        :raises AttributeError: if ``item`` does not name a known UD feature.
        """
        # Attribute names arrive snake_case; UD feature classes are PascalCase.
        feature_name = sc.pascalcase(item)
        if feature_name in ud_mod.__dict__:
            return self.features[feature_name]
        else:
            raise AttributeError(item)
Example #2
    def stanza_to_cltk_word_type(stanza_doc):
        """Take an entire ``stanza`` document, extract
        each word, and encode it in the way expected by
        the CLTK's ``Word`` type.

        >>> from cltk.dependency.processes import StanzaProcess
        >>> from cltk.languages.example_texts import get_example_text
        >>> process_stanza = StanzaProcess(language="lat")
        >>> cltk_words = process_stanza.run(Doc(raw=get_example_text("lat"))).words
        >>> isinstance(cltk_words, list)
        True
        >>> isinstance(cltk_words[0], Word)
        True
        >>> cltk_words[0]
        Word(index_char_start=None, index_char_stop=None, index_token=0, index_sentence=0, string='Gallia', pos=noun, lemma='Gallia', stem=None, scansion=None, xpos='A1|grn1|casA|gen2', upos='NOUN', dependency_relation='nsubj', governor=1, features={Case: [nominative], Gender: [feminine], Number: [singular]}, category={F: [neg], N: [pos], V: [neg]}, stop=None, named_entity=None, syllables=None, phonetic_transcription=None, definition=None)

        """
        words_list = list()  # type: List[Word]

        for sentence_index, sentence in enumerate(stanza_doc.sentences):
            for token in sentence.tokens:
                # Only the first word of each (possibly multi-word) token is
                # used — TODO confirm whether multi-word tokens need expanding.
                stanza_word = token.words[0]  # type: stanza.pipeline.doc.Word
                # TODO: Figure out how to handle the token indexes, esp 0 (root) and None (?)
                pos: Optional[MorphosyntacticFeature] = from_ud(
                    "POS", stanza_word.pos)
                cltk_word = Word(
                    # Subtract 1 from id b/c Stanza starts its index at 1.
                    index_token=int(stanza_word.id) - 1,
                    index_sentence=sentence_index,
                    string=stanza_word.text,  # same as ``token.text``
                    pos=pos,
                    xpos=stanza_word.xpos,
                    upos=stanza_word.upos,
                    lemma=stanza_word.lemma,
                    dependency_relation=stanza_word.deprel,
                    # Stanza ``head`` of 0 (or None) marks the root; encode it
                    # as ``-1``, i.e. no governor.
                    governor=stanza_word.head - 1 if stanza_word.head else -1,
                )  # type: Word

                # Convert UD features (e.g. "Case=Nom|Number=Sing") into the
                # normalized CLTK feature bundle.
                raw_features = ([
                    tuple(f.split("=")) for f in stanza_word.feats.split("|")
                ] if stanza_word.feats else [])
                cltk_features = [
                    from_ud(feature_name, feature_value)
                    for feature_name, feature_value in raw_features
                ]
                cltk_word.features = MorphosyntacticFeatureBundle(
                    *cltk_features)
                cltk_word.category = to_categorial(cltk_word.pos)
                # Keep the raw UD feature string for downstream consumers.
                cltk_word.stanza_features = stanza_word.feats

                words_list.append(cltk_word)

        return words_list