def tag_bigram(self, untagged_string: str):
    """Tag POS with bigram tagger.

    :type untagged_string: str
    :param untagged_string: An untagged, untokenized string of text.
    :rtype: list
    """
    untagged_tokens = wordpunct_tokenize(untagged_string)
    # Load the pickled bigram tagger and tag the token list.
    pickle_path = self.available_taggers["bigram"]
    tagger = open_pickle(pickle_path)
    tagged_text = tagger.tag(untagged_tokens)
    return tagged_text
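
# Hedged usage sketch, not part of the source above: it assumes the method belongs to
# CLTK's ``POSTag`` class (``from cltk.tag.pos import POSTag``) and that the Latin
# models have already been downloaded into cltk_data. The import path and constructor
# argument are assumptions about the installed CLTK version.
from cltk.tag.pos import POSTag

pos_tagger = POSTag("latin")
# ``tag_bigram`` tokenizes with wordpunct_tokenize and returns (token, tag) pairs.
print(pos_tagger.tag_bigram("Gallia est omnis divisa in partes tres."))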
def __init__(self: object, train: List[list] = None, seed: int = 3, verbose: bool = False):
    self.models_path = BackoffLatinLemmatizer.models_path

    missing_models_message = (
        "BackoffLatinLemmatizer requires the ``latin_models_cltk`` corpus to be in "
        "cltk_data. Please load this corpus."
    )

    # Load the training sentences and the two lemma dictionaries from the
    # ``latin_models_cltk`` corpus.
    try:
        self.train = open_pickle(
            os.path.join(self.models_path, "latin_pos_lemmatized_sents.pickle")
        )
        self.LATIN_OLD_MODEL = open_pickle(
            os.path.join(self.models_path, "latin_lemmata_cltk.pickle")
        )
        self.LATIN_MODEL = open_pickle(
            os.path.join(self.models_path, "latin_model.pickle")
        )
    except FileNotFoundError as err:
        raise type(err)(missing_models_message)

    self.latin_sub_patterns = latin_sub_patterns  # Move to latin_models_cltk

    self.seed = seed
    self.VERBOSE = verbose

    def _randomize_data(train: List[list], seed: int):
        import random

        random.seed(seed)
        random.shuffle(train)
        pos_train_sents = train[:4000]
        lem_train_sents = [[(item[0], item[1]) for item in sent] for sent in train]
        train_sents = lem_train_sents[:4000]
        test_sents = lem_train_sents[4000:5000]

        return pos_train_sents, train_sents, test_sents

    self.pos_train_sents, self.train_sents, self.test_sents = _randomize_data(
        self.train, self.seed
    )
    self._define_lemmatizer()
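
# Hedged usage sketch, not part of the source above: it assumes ``BackoffLatinLemmatizer``
# is importable from ``cltk.lemmatize.latin.backoff`` (the module path is an assumption)
# and that the ``latin_models_cltk`` corpus is present so the pickles above can load.
from cltk.lemmatize.latin.backoff import BackoffLatinLemmatizer

lemmatizer = BackoffLatinLemmatizer(seed=3)
tokens = ["arma", "virumque", "cano"]
# ``lemmatize`` runs the backoff chain built by ``_define_lemmatizer`` and
# returns a list of (token, lemma) pairs.
print(lemmatizer.lemmatize(tokens))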
def __init__(self, language: str = None, lang_vars: object = None):
    """
    :param language: language for sentence tokenization
    :type language: str
    """
    self.language = language
    # Older model filenames use the legacy language name (e.g. "latin" for "lat").
    if self.language == "lat":
        self.language_old = "latin"
    self.lang_vars = lang_vars
    super().__init__(language=self.language)
    if self.language:
        self.models_path = self._get_models_path(self.language)
        try:
            self.model = open_pickle(
                os.path.join(
                    os.path.expanduser(self.models_path),
                    f"{self.language_old}_punkt.pickle",
                )
            )
        except FileNotFoundError as err:
            raise type(err)(BasePunktSentenceTokenizer.missing_models_message)
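
# Illustrative sketch of the path resolution above (the directory layout shown is an
# assumption, not taken from the source): with language="lat" the constructor maps to
# the legacy name "latin" and loads "<models_path>/latin_punkt.pickle" from cltk_data.
import os

language_old = "latin"  # set because language == "lat"
models_path = "~/cltk_data/lat/model/lat_models_cltk/tokenizers/sentence"  # assumed layout
print(os.path.join(os.path.expanduser(models_path), f"{language_old}_punkt.pickle"))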
def __init__(self: object, strict: bool = False):
    """Constructor for ``LatinPunktSentenceTokenizer``.

    :param strict: allow stricter punctuation for sentence tokenization
    :type strict: bool
    """
    self.lang_vars = LatinLanguageVars()
    self.strict = strict
    super().__init__(language="lat", lang_vars=self.lang_vars)

    fp_sentence_tok_model_dir = "lat/model/lat_models_cltk/tokenizers/sentence/"
    models_path = os.path.join(CLTK_DATA_DIR, fp_sentence_tok_model_dir)
    self.models_path = os.path.join(models_path, "latin_punkt.pickle")
    try:
        self.model = open_pickle(self.models_path)
    except FileNotFoundError as err:
        msg = (
            f"``LatinPunktSentenceTokenizer`` could not find required file "
            f"``{self.models_path}``. Download the corpus ``lat_models_cltk``."
        )
        raise FileNotFoundError(msg) from err

    # Choose which characters end a sentence for the Punkt tokenizer.
    if self.strict:
        PunktLanguageVars.sent_end_chars = STRICT_PUNCTUATION
    else:
        PunktLanguageVars.sent_end_chars = PUNCTUATION
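
# Hedged usage sketch, not part of the source above: the import path is an assumption
# about the installed CLTK version, and ``lat_models_cltk`` must already be downloaded
# so ``latin_punkt.pickle`` can be found under CLTK_DATA_DIR.
from cltk.sentence.lat import LatinPunktSentenceTokenizer

sent_tokenizer = LatinPunktSentenceTokenizer(strict=True)
text = "Gallia est omnis divisa in partes tres. Quarum unam incolunt Belgae."
# ``tokenize`` comes from the Punkt base tokenizer and returns a list of sentence
# strings; strict=True also splits on the stricter punctuation set (STRICT_PUNCTUATION).
print(sent_tokenizer.tokenize(text))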