class Word:
    """Contains attributes of each processed word in a list of words.

    Designed to be used in the ``Doc.words`` dataclass. Every field is
    optional and stays ``None`` (or an empty feature bundle) until some
    process fills it in.

    NOTE(review): the ``field(...)`` defaults below only take effect when the
    class is decorated with ``@dataclass``; the decorator is assumed to sit
    directly above this class definition -- confirm.

    >>> from cltk.core.data_types import Word
    >>> from cltk.languages.example_texts import get_example_text
    >>> get_example_text("lat")[:25]
    'Gallia est omnis divisa i'
    >>> from cltk.languages.utils import get_lang
    >>> lat = get_lang("lat")
    >>> Word(index_char_start=0, index_char_stop=6, index_token=0, string=get_example_text("lat")[0:6], pos="nom")
    Word(index_char_start=0, index_char_stop=6, index_token=0, index_sentence=None, string='Gallia', pos='nom', lemma=None, stem=None, scansion=None, xpos=None, upos=None, dependency_relation=None, governor=None, features={}, category={}, stop=None, named_entity=None, syllables=None, phonetic_transcription=None, definition=None)
    """

    index_char_start: int = None
    index_char_stop: int = None
    index_token: int = None
    index_sentence: int = None
    string: str = None
    pos: str = None
    lemma: str = None
    stem: str = None
    scansion: str = None
    xpos: str = None  # treebank-specific POS tag (from stanza)
    upos: str = None  # universal POS tag (from stanza)
    dependency_relation: str = None  # (from stanza)
    governor: int = None
    # A fresh bundle per instance: a plain ``MorphosyntacticFeatureBundle()``
    # default would be created once and shared (and mutated) by every Word.
    features: MorphosyntacticFeatureBundle = field(
        default_factory=MorphosyntacticFeatureBundle
    )
    category: MorphosyntacticFeatureBundle = field(
        default_factory=MorphosyntacticFeatureBundle
    )
    # Excluded from ``repr`` so printing a Word does not dump the whole vector.
    embedding: np.ndarray = field(repr=False, default=None)
    stop: bool = None
    named_entity: bool = None
    syllables: List[str] = None
    phonetic_transcription: str = None
    definition: str = None

    def __getitem__(
        self, feature_name: Union[str, Type[MorphosyntacticFeature]]
    ) -> List[MorphosyntacticFeature]:
        """Accessor to help get morphosyntactic features from a word object.

        ``word[feature_name]`` is forwarded unchanged to ``self.features``.
        """
        return self.features[feature_name]

    def __getattr__(self, item: str) -> List[MorphosyntacticFeature]:
        """Accessor to help get morphosyntactic features from a word object.

        Python only calls this after normal attribute lookup has failed. The
        requested name is converted to PascalCase and, if ``ud_mod`` defines
        something under that name, the lookup is answered from
        ``self.features``.

        Raises:
            AttributeError: if the PascalCase name is not defined in ``ud_mod``.
        """
        feature_name = sc.pascalcase(item)
        if feature_name not in ud_mod.__dict__:
            raise AttributeError(item)
        return self.features[feature_name]
def stanza_to_cltk_word_type(stanza_doc) -> "List[Word]":
    """Take an entire ``stanza`` document, extract each word, and encode it in the way
    expected by the CLTK's ``Word`` type.

    Sentences are visited in order and, within each sentence, tokens in order.
    Only the first word of every token (``token.words[0]``) is converted.

    Args:
        stanza_doc: Object exposing ``sentences``; each sentence exposes
            ``tokens`` and each token exposes ``words``, whose items provide
            ``id``, ``text``, ``pos``, ``xpos``, ``upos``, ``lemma``,
            ``deprel``, ``head`` and ``feats``.

    Returns:
        A flat list with one ``Word`` per token, across all sentences. An
        empty document yields an empty list.

    >>> from cltk.dependency.processes import StanzaProcess
    >>> from cltk.languages.example_texts import get_example_text
    >>> process_stanza = StanzaProcess(language="lat")
    >>> cltk_words = process_stanza.run(Doc(raw=get_example_text("lat"))).words
    >>> isinstance(cltk_words, list)
    True
    >>> isinstance(cltk_words[0], Word)
    True
    >>> cltk_words[0]
    Word(index_char_start=None, index_char_stop=None, index_token=0, index_sentence=0, string='Gallia', pos=noun, lemma='Gallia', stem=None, scansion=None, xpos='A1|grn1|casA|gen2', upos='NOUN', dependency_relation='nsubj', governor=1, features={Case: [nominative], Gender: [feminine], Number: [singular]}, category={F: [neg], N: [pos], V: [neg]}, stop=None, named_entity=None, syllables=None, phonetic_transcription=None, definition=None)
    """
    words_list = []  # type: List[Word]

    for sentence_index, sentence in enumerate(stanza_doc.sentences):
        for token in sentence.tokens:
            stanza_word = token.words[0]  # type: stanza.pipeline.doc.Word
            # TODO: Figure out how to handle the token indexes, esp 0 (root) and None (?)
            pos: Optional[MorphosyntacticFeature] = from_ud("POS", stanza_word.pos)
            cltk_word = Word(
                # subtract 1 from id b/c Stanza starts their index at 1
                index_token=int(stanza_word.id) - 1,
                index_sentence=sentence_index,
                string=stanza_word.text,  # same as ``token.text``
                pos=pos,
                xpos=stanza_word.xpos,
                upos=stanza_word.upos,
                lemma=stanza_word.lemma,
                dependency_relation=stanza_word.deprel,
                # note: if val becomes ``-1`` then no governor, ie word is root
                governor=stanza_word.head - 1 if stanza_word.head else -1,
            )  # type: Word

            # convert UD features ("Name=Value|Name=Value") to the normalized CLTK features
            raw_features = (
                [tuple(f.split("=")) for f in stanza_word.feats.split("|")]
                if stanza_word.feats
                else []
            )
            cltk_features = [
                from_ud(feature_name, feature_value)
                for feature_name, feature_value in raw_features
            ]
            cltk_word.features = MorphosyntacticFeatureBundle(*cltk_features)
            cltk_word.category = to_categorial(cltk_word.pos)
            # Keep the untouched Stanza feature string alongside the normalized bundle.
            cltk_word.stanza_features = stanza_word.feats

            words_list.append(cltk_word)

    # TODO: Per-sentence tracking of each word's parent token
    # (``cltk_word.parent``) was sketched here but never enabled; confirm
    # whether it is needed and whether all language models provide it.
    return words_list