def __init__(self: object, train: List[list] = None, seed: int = 3, verbose: bool = False):
    self.models_path = BackoffGreekLemmatizer.models_path

    missing_models_message = "BackoffGreekLemmatizer requires the ```greek_models_cltk``` to be in cltk_data. Please load this corpus."

    try:
        self.train = open_pickle(os.path.join(self.models_path, 'greek_lemmatized_sents.pickle'))
        self.GREEK_OLD_MODEL = open_pickle(os.path.join(self.models_path, 'greek_lemmata_cltk.pickle'))
        self.GREEK_MODEL = open_pickle(os.path.join(self.models_path, 'greek_model.pickle'))
    except FileNotFoundError as err:
        raise type(err)(missing_models_message)

    self.greek_sub_patterns = greek_sub_patterns  # Move to greek_models_cltk

    self.seed = seed
    self.VERBOSE = verbose

    def _randomize_data(train: List[list], seed: int):
        import random
        random.seed(seed)
        random.shuffle(train)
        pos_train_sents = train[:4000]
        lem_train_sents = [[(item[0], item[1]) for item in sent] for sent in train]
        train_sents = lem_train_sents[:4000]
        test_sents = lem_train_sents[4000:5000]
        return pos_train_sents, train_sents, test_sents

    self.pos_train_sents, self.train_sents, self.test_sents = _randomize_data(self.train, self.seed)
    self._define_lemmatizer()
def __init__(self: object, train: List[list] = None, seed: int = 3, verbose: bool = False):
    self.models_path = BackoffLatinLemmatizer.models_path

    missing_models_message = "BackoffLatinLemmatizer requires the ```latin_models_cltk``` to be in cltk_data. Please load this corpus."

    try:
        self.train = open_pickle(os.path.join(self.models_path, 'latin_pos_lemmatized_sents.pickle'))
        self.LATIN_OLD_MODEL = open_pickle(os.path.join(self.models_path, 'latin_lemmata_cltk.pickle'))
        self.LATIN_MODEL = open_pickle(os.path.join(self.models_path, 'latin_model.pickle'))
    except FileNotFoundError as err:
        raise type(err)(missing_models_message)

    self.latin_sub_patterns = latin_sub_patterns  # Move to latin_models_cltk

    self.seed = seed
    self.VERBOSE = verbose

    def _randomize_data(train: List[list], seed: int):
        import random
        random.seed(seed)
        random.shuffle(train)
        pos_train_sents = train[:4000]
        lem_train_sents = [[(item[0], item[1]) for item in sent] for sent in train]
        train_sents = lem_train_sents[:4000]
        test_sents = lem_train_sents[4000:5000]
        return pos_train_sents, train_sents, test_sents

    self.pos_train_sents, self.train_sents, self.test_sents = _randomize_data(self.train, self.seed)
    self._define_lemmatizer()
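# Usage sketch (illustrative, not part of the original file): it assumes the
# ``latin_models_cltk`` corpus has already been downloaded and that the class
# exposes a ``lemmatize()`` method over a list of tokens, as the CLTK backoff
# lemmatizers do. The import path below assumes the pre-1.0 CLTK layout.
from cltk.lemmatize.latin.backoff import BackoffLatinLemmatizer

backoff_lemmatizer = BackoffLatinLemmatizer(seed=3)
print(backoff_lemmatizer.lemmatize(['arma', 'virumque', 'cano']))  # list of (token, lemma) pairs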
def __init__(self, train, seed=3):
    self.train = train
    self.seed = seed

    rel_path = os.path.join('~/cltk_data/latin/model/latin_models_cltk/lemmata/backoff')
    path = os.path.expanduser(rel_path)

    # Check for presence of LATIN_OLD_MODEL
    file = 'latin_lemmata_cltk.pickle'
    old_model_path = os.path.join(path, file)
    if os.path.isfile(old_model_path):
        self.LATIN_OLD_MODEL = open_pickle(old_model_path)
    else:
        self.LATIN_OLD_MODEL = {}
        print('The file %s is not available in cltk_data' % file)

    # Check for presence of LATIN_MODEL
    file = 'latin_model.pickle'
    model_path = os.path.join(path, file)
    if os.path.isfile(model_path):
        self.LATIN_MODEL = open_pickle(model_path)
    else:
        self.LATIN_MODEL = {}
        print('The file %s is not available in cltk_data' % file)

    # Check for presence of misc_patterns
    self.latin_sub_patterns = latin_sub_patterns

    # Check for presence of verb_patterns
    self.latin_verb_patterns = latin_verb_patterns

    # Check for presence of latin_pps
    self.latin_pps = latin_pps

    def _randomize_data(train, seed):
        import random
        random.seed(seed)
        random.shuffle(train)
        pos_train_sents = train[:4000]
        lem_train_sents = [[(item[0], item[1]) for item in sent] for sent in train]
        train_sents = lem_train_sents[:4000]
        test_sents = lem_train_sents[4000:5000]
        return pos_train_sents, train_sents, test_sents

    self.pos_train_sents, self.train_sents, self.test_sents = _randomize_data(self.train, self.seed)
    self._define_lemmatizer()
def run(self, input_doc: Doc) -> Doc:
    """Compute the embeddings."""
    output_doc = deepcopy(input_doc)

    # For word2vec-style embeddings, used for word embeddings
    embeddings_obj = self.algorithm
    for index, word_obj in enumerate(output_doc.words):
        if not self.embedding_length:
            self.embedding_length = embeddings_obj.get_embedding_length()
        word_embedding = embeddings_obj.get_word_vector(word=word_obj.string)
        if not isinstance(word_embedding, np.ndarray):
            word_embedding = np.zeros([self.embedding_length])
        word_obj.embedding = word_embedding
        output_doc.words[index] = word_obj

    # For sentence embeddings, uses TF-IDF
    # This checks whether a file of TF-IDF weights is available
    if not self.idf_model:
        # First check whether the user has hard-coded the path as an OS variable
        fp_idf_os_env: Optional[str] = os.environ.get("WORD_IDF_FILE")
        if fp_idf_os_env:
            self.idf_model = open_pickle(path=fp_idf_os_env)
        # Check whether IDF weights are available in the CLTK repo
        elif TFIDF_MAP.get(self.language):
            model_path: str = TFIDF_MAP[self.language]
            if not os.path.isdir(model_path):
                msg = f"TF-IDF model path '{model_path}' not found. Going to try to download it ..."
                logger.warning(msg)
                dl_msg = "This part of the CLTK depends upon models from the CLTK project."
                model_url = f"https://github.com/cltk/{self.language}_models_cltk"
                download_prompt(iso_code=self.language, message=dl_msg, model_url=model_url)
            self.idf_model = open_pickle(path=f"{model_path}word_idf.pkl")

    # Min and max values are needed while generating sentence embeddings
    if self.idf_model and not self.min_idf:
        tfidf_values: ValuesView = self.idf_model.values()
        tfidf_values_array: np.array = np.array(list(tfidf_values))
        self.min_idf: np.float64 = tfidf_values_array.min()
        self.max_idf: np.float64 = tfidf_values_array.max()

    if self.idf_model:
        for index, sent_obj in enumerate(output_doc.sentences):
            output_doc.sentence_embeddings[index] = get_sent_embeddings(
                sent=sent_obj,
                idf_model=self.idf_model,
                min_idf=self.min_idf,
                max_idf=self.max_idf,
                dimensions=self.embedding_length,
            )
    return output_doc
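# Illustrative sketch only: ``get_sent_embeddings`` is not shown above, so this is
# one plausible way a TF-IDF-weighted sentence embedding could be computed from
# per-word vectors. The clamping of IDF scores to [min_idf, max_idf] and the
# treatment of unseen words are assumptions, not the CLTK implementation.
import numpy as np

def weighted_sentence_embedding(word_vectors, idf_model, min_idf, max_idf, dimensions):
    """Average word vectors, weighting each by its (clamped) IDF score."""
    total = np.zeros(dimensions)
    weight_sum = 0.0
    for word, vector in word_vectors:  # word_vectors: iterable of (str, np.ndarray)
        idf = idf_model.get(word.lower(), max_idf)  # treat unseen words as maximally rare (assumption)
        idf = min(max(idf, min_idf), max_idf)
        total += idf * vector
        weight_sum += idf
    return total / weight_sum if weight_sum else total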
def __init__(self, train, seed=3):
    self.train = train
    self.seed = seed

    rel_path = os.path.join('~/cltk_data/latin/model/latin_models_cltk/lemmata/backoff')
    path = os.path.expanduser(rel_path)

    # Check for presence of LATIN_OLD_MODEL
    file = 'latin_lemmata_cltk.pickle'
    old_model_path = os.path.join(path, file)
    if os.path.isfile(old_model_path):
        self.LATIN_OLD_MODEL = open_pickle(old_model_path)
    else:
        self.LATIN_OLD_MODEL = {}
        print('The file %s is not available in cltk_data' % file)

    # Check for presence of LATIN_MODEL
    file = 'latin_model.pickle'
    model_path = os.path.join(path, file)
    if os.path.isfile(model_path):
        self.LATIN_MODEL = open_pickle(model_path)
    else:
        self.LATIN_MODEL = {}
        print('The file %s is not available in cltk_data' % file)

    # Check for presence of misc_patterns
    self.latin_sub_patterns = latin_sub_patterns

    # Check for presence of verb_patterns
    self.latin_verb_patterns = latin_verb_patterns

    # Check for presence of latin_pps
    self.latin_pps = latin_pps

    def _randomize_data(train, seed):
        import random
        random.seed(seed)
        random.shuffle(train)
        pos_train_sents = train[:4000]
        lem_train_sents = [[(item[0], item[1]) for item in sent] for sent in train]
        train_sents = lem_train_sents[:4000]
        test_sents = lem_train_sents[4000:5000]
        return pos_train_sents, train_sents, test_sents

    self.pos_train_sents, self.train_sents, self.test_sents = _randomize_data(self.train, self.seed)
    self._define_lemmatizer()
def test_open_pickle(self):
    """Test opening a pickle. This requires ``greek_models_cltk`` to have been
    imported in ``setUp()``.
    """
    pickle_path_rel = '~/cltk_data/greek/model/greek_models_cltk/tokenizers/sentence/greek.pickle'  # pylint: disable=line-too-long
    pickle_path = os.path.expanduser(pickle_path_rel)
    a_pickle = open_pickle(pickle_path)
    self.assertTrue(a_pickle)
def _load_model(self, name):
    model = self.models.get(name, None)
    if model is None:
        pickle_path = self.available_taggers[name]
        model = open_pickle(pickle_path)
        self.models[name] = model
    return model
def __init__(
    self: object, train: List[list] = None, seed: int = 3, verbose: bool = False
):
    self.models_path = models_path

    missing_models_message = "GreekBackoffLemmatizer requires the ```grc_models_cltk``` to be in cltk_data. Please load this corpus."

    try:
        self.train = open_pickle(
            os.path.join(self.models_path, "greek_lemmatized_sents.pickle")
        )
        self.GREEK_OLD_MODEL = open_pickle(
            os.path.join(self.models_path, "greek_lemmata_cltk.pickle")
        )
        self.GREEK_MODEL = open_pickle(
            os.path.join(self.models_path, "greek_model.pickle")
        )
    except FileNotFoundError as err:
        raise type(err)(missing_models_message)

    self.greek_sub_patterns = greek_sub_patterns

    self.seed = seed
    self.VERBOSE = verbose

    def _randomize_data(train: List[list], seed: int):
        import random
        random.seed(seed)
        random.shuffle(train)
        train_size = int(0.9 * len(train))
        pos_train_sents = train[:train_size]
        lem_train_sents = [[(item[0], item[1]) for item in sent] for sent in train]
        train_sents = lem_train_sents[:train_size]
        test_sents = lem_train_sents[train_size:]
        return pos_train_sents, train_sents, test_sents

    self.pos_train_sents, self.train_sents, self.test_sents = _randomize_data(
        self.train, self.seed
    )
    self._define_lemmatizer()
def tag_ngram_123_backoff(self, untagged_string: str):
    """Tag POS with a 1-, 2-, 3-gram backoff tagger.

    :type untagged_string: str
    :param untagged_string: An untagged, untokenized string of text.
    :rtype: list of (token, tag) tuples
    """
    untagged_tokens = wordpunct_tokenize(untagged_string)
    pickle_path = self.available_taggers['ngram_123_backoff']
    tagger = open_pickle(pickle_path)
    tagged_text = tagger.tag(untagged_tokens)
    return tagged_text
def tag_unigram(self, untagged_string: str):
    """Tag POS with a unigram tagger.

    :type untagged_string: str
    :param untagged_string: An untagged, untokenized string of text.
    :rtype: list of (token, tag) tuples
    """
    untagged_tokens = wordpunct_tokenize(untagged_string)
    pickle_path = self.available_taggers['unigram']
    tagger = open_pickle(pickle_path)
    tagged_text = tagger.tag(untagged_tokens)
    return tagged_text
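# Example call (illustrative, not from the original sources): both tagger methods
# above load a pickled NLTK tagger and return ``tagger.tag(tokens)``, i.e. a list
# of (token, tag) pairs. The import path and the availability of the 'greek'
# models are assumptions.
from cltk.tag.pos import POSTag

greek_tagger = POSTag('greek')
print(greek_tagger.tag_unigram('θεοὺς μὲν αἰτῶ τῶνδ᾽ ἀπαλλαγὴν πόνων'))  # [(token, tag), ...]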
def tokenize_sentences(self, untokenized_string: str):
    """Tokenize sentences by reading the trained tokenizer and invoking
    ``PunktSentenceTokenizer()``.

    :type untokenized_string: str
    :param untokenized_string: A string containing one or more sentences.
    :rtype : list of strings
    """
    # load tokenizer
    assert isinstance(untokenized_string, str), \
        'Incoming argument must be a string.'

    if self.language == 'latin':
        self.models_path = self._get_models_path(self.language)
        try:
            self.model = open_pickle(
                os.path.expanduser(
                    os.path.join(self.models_path, 'latin_punkt.pickle')))
        except FileNotFoundError as err:
            raise type(err)(TokenizeSentence.missing_models_message + self.models_path)
        tokenizer = self.model
        tokenizer._lang_vars = self.lang_vars
    elif self.language == 'greek':
        # Workaround for regex tokenizer
        self.sent_end_chars = GreekLanguageVars.sent_end_chars
        self.sent_end_chars_regex = '|'.join(self.sent_end_chars)
        self.pattern = rf'(?<=[{self.sent_end_chars_regex}])\s'
    else:
        tokenizer = open_pickle(self.tokenizer_path)
        tokenizer = self._setup_tokenizer(tokenizer)

    # mk list of tokenized sentences
    if self.language == 'latin':
        return tokenizer.tokenize(untokenized_string)
    elif self.language == 'greek':
        return re.split(self.pattern, untokenized_string)
    else:
        tokenized_sentences = [
            sentence for sentence in tokenizer.sentences_from_text(
                untokenized_string, realign_boundaries=True)
        ]
        return tokenized_sentences
def __init__(self: object, language: str = 'latin'):
    """
    :param language: language for sentence tokenization
    :type language: str
    """
    self.lang_vars = LatinLanguageVars()
    super().__init__(language='latin', lang_vars=self.lang_vars)

    self.models_path = LatinPunktSentenceTokenizer.models_path

    try:
        self.model = open_pickle(os.path.join(self.models_path, 'latin_punkt.pickle'))
    except FileNotFoundError as err:
        raise type(err)(LatinPunktSentenceTokenizer.missing_models_message)
def tokenize_sentences(self, untokenized_string: str):
    """Tokenize sentences by reading the trained tokenizer and invoking
    ``PunktSentenceTokenizer()``.

    :type untokenized_string: str
    :param untokenized_string: A string containing one or more sentences.
    :rtype : list of strings
    """
    # load tokenizer
    assert isinstance(untokenized_string, str), \
        'Incoming argument must be a string.'

    if self.language == 'latin':
        self.models_path = self._get_models_path(self.language)
        try:
            self.model = open_pickle(
                os.path.expanduser(os.path.join(self.models_path, 'latin_punkt.pickle')))
        except FileNotFoundError as err:
            raise type(err)(TokenizeSentence.missing_models_message + self.models_path)
        tokenizer = self.model
        tokenizer._lang_vars = self.lang_vars
    elif self.language == 'greek':
        # Workaround for regex tokenizer
        self.sent_end_chars = GreekLanguageVars.sent_end_chars
        self.sent_end_chars_regex = '|'.join(self.sent_end_chars)
        self.pattern = rf'(?<=[{self.sent_end_chars_regex}])\s'
    else:
        tokenizer = open_pickle(self.tokenizer_path)
        tokenizer = self._setup_tokenizer(tokenizer)

    # mk list of tokenized sentences
    if self.language == 'latin':
        return tokenizer.tokenize(untokenized_string)
    elif self.language == 'greek':
        return re.split(self.pattern, untokenized_string)
    else:
        tokenized_sentences = [sentence for sentence in
                               tokenizer.sentences_from_text(untokenized_string, realign_boundaries=True)]
        return tokenized_sentences
def __init__(self: object, language: str = 'latin'):
    """
    :param language: language for sentence tokenization
    :type language: str
    """
    self.lang_vars = LatinLanguageVars()
    super().__init__(language='latin', lang_vars=self.lang_vars)

    self.models_path = LatinPunktSentenceTokenizer.models_path

    try:
        self.model = open_pickle(
            os.path.join(self.models_path, 'latin_punkt.pickle'))
    except FileNotFoundError as err:
        raise type(err)(LatinPunktSentenceTokenizer.missing_models_message)
def __init__(self: object, language: str = 'greek'):
    """
    :param language: language for sentence tokenization
    :type language: str
    """
    super().__init__(language='greek')
    self.models_path = GreekPunktSentenceTokenizer.models_path

    try:
        self.model = open_pickle(os.path.join(os.path.expanduser(self.models_path), 'greek_punkt.pickle'))
    except FileNotFoundError as err:
        raise type(err)(GreekPunktSentenceTokenizer.missing_models_message)

    self.lang_vars = GreekLanguageVars()
def __init__(self, language: str = None, lang_vars: object = None):
    """
    :param language: language for sentence tokenization
    :type language: str
    """
    self.language = language
    self.lang_vars = lang_vars
    super().__init__(language=self.language)

    if self.language:
        self.models_path = self._get_models_path(self.language)
        try:
            self.model = open_pickle(os.path.join(os.path.expanduser(self.models_path),
                                                  f'{self.language}_punkt.pickle'))
        except FileNotFoundError as err:
            raise type(err)(BasePunktSentenceTokenizer.missing_models_message)
def __init__(self: object, language: str = "greek"): """ :param language : language for sentence tokenization :type language: str """ super().__init__(language="greek") self.models_path = GreekPunktSentenceTokenizer.models_path try: self.model = open_pickle( os.path.join(os.path.expanduser(self.models_path), "greek_punkt.pickle")) except FileNotFoundError as err: raise type(err)(GreekPunktSentenceTokenizer.missing_models_message) self.lang_vars = GreekLanguageVars()
def tokenize_sentences(self, untokenized_string):
    """Tokenize sentences by reading the trained tokenizer and invoking
    ``PunktSentenceTokenizer()``.

    :type untokenized_string: str
    :param untokenized_string: A string containing one or more sentences.
    :rtype : list of strings
    """
    # load tokenizer
    tokenizer = open_pickle(self.tokenizer_path)
    tokenizer = self._setup_tokenizer(tokenizer)

    # mk list of tokenized sentences
    tokenized_sentences = []
    for sentence in tokenizer.sentences_from_text(untokenized_string, realign_boundaries=True):  # pylint: disable=C0301
        tokenized_sentences.append(sentence)
    return tokenized_sentences
def __init__(self, language: str = None, lang_vars: object = None):
    """
    :param language: language for sentence tokenization
    :type language: str
    """
    self.language = language
    self.lang_vars = lang_vars
    super().__init__(language=self.language)

    if self.language:
        self.models_path = self._get_models_path(self.language)
        try:
            self.model = open_pickle(
                os.path.join(os.path.expanduser(self.models_path),
                             f'{self.language}_punkt.pickle'))
        except FileNotFoundError as err:
            raise type(err)(
                BasePunktSentenceTokenizer.missing_models_message)
def __init__(self, seed: int = 3, verbose: bool = False):
    self.models_path = BackoffMHGLemmatizer.models_path

    missing_models_message = ("BackoffMHGLemmatizer requires the ```middle_high_german_models_cltk``` "
                              "to be in cltk_data. Please load this corpus.")

    self.seed = seed
    self.verbose = verbose

    self.token_to_lemmata = []
    self.lemma_to_tokens = []

    try:
        self.token_to_lemmata = open_pickle(
            os.path.join(self.models_path, "token_to_lemma.pickle"))
    except FileNotFoundError as err:
        raise type(err)(missing_models_message)

    self._define_lemmatizer()
def tokenize_sentences(self: object, untokenized_string: str):
    """Tokenize sentences by reading the trained tokenizer and invoking
    ``PunktSentenceTokenizer()``.

    :type untokenized_string: str
    :param untokenized_string: A string containing one or more sentences.
    :rtype : list of strings
    """
    # load tokenizer
    assert isinstance(untokenized_string, str), \
        'Incoming argument must be a string.'
    tokenizer = open_pickle(self.tokenizer_path)
    tokenizer = self._setup_tokenizer(tokenizer)

    # mk list of tokenized sentences
    tokenized_sentences = []
    for sentence in tokenizer.sentences_from_text(untokenized_string, realign_boundaries=True):  # pylint: disable=C0301
        tokenized_sentences.append(sentence)
    return tokenized_sentences
def __init__(self, language: str = None, lang_vars: object = None): """Constructor. :param language : language for sentences tokenization :type language: str """ super().__init__(language=language) if self.language == "lat": self.language_old = "lat" self.lang_vars = lang_vars if self.language: self.models_path = self._get_models_path(self.language) try: self.model = open_pickle( os.path.join( os.path.expanduser(self.models_path), f"{self.language_old}_punkt.pickle", )) except FileNotFoundError as err: raise type(err)(PunktSentenceTokenizer.missing_models_message)
def __init__(self: object, language: str = 'latin', strict: bool = False):
    """
    :param language: language for sentence tokenization
    :type language: str
    :param strict: allow for stricter punctuation for sentence tokenization
    :type strict: bool
    """
    self.lang_vars = LatinLanguageVars()
    self.strict = strict
    super().__init__(language='latin', lang_vars=self.lang_vars)

    self.models_path = LatinPunktSentenceTokenizer.models_path

    try:
        self.model = open_pickle(
            os.path.join(self.models_path, 'latin_punkt.pickle'))
    except FileNotFoundError as err:
        raise type(err)(LatinPunktSentenceTokenizer.missing_models_message)

    if self.strict:
        PunktLanguageVars.sent_end_chars = STRICT_PUNCTUATION
    else:
        PunktLanguageVars.sent_end_chars = PUNCTUATION
def __init__(self: object, strict: bool = False):
    """Constructor for ``LatinPunktSentenceTokenizer``.

    :param strict: allow for stricter punctuation for sentence tokenization
    :type strict: bool
    """
    self.lang_vars = LatinLanguageVars()
    self.strict = strict
    super().__init__(language="lat", lang_vars=self.lang_vars)

    fp_sentence_tok_model_dir = "lat/model/lat_models_cltk/tokenizers/sentence/"
    models_path = os.path.join(CLTK_DATA_DIR, fp_sentence_tok_model_dir)
    self.models_path = os.path.join(models_path, "latin_punkt.pickle")

    try:
        self.model = open_pickle(self.models_path)
    except FileNotFoundError as err:
        msg = f"``LatinPunktSentenceTokenizer`` could not find required file ``{self.models_path}``. Download the corpus ``lat_models_cltk``."
        raise FileNotFoundError(msg)

    if self.strict:
        PunktLanguageVars.sent_end_chars = STRICT_PUNCTUATION
    else:
        PunktLanguageVars.sent_end_chars = PUNCTUATION
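# Usage sketch (illustrative, not from the original sources): assumes the Latin
# models corpus has been downloaded and that the tokenizer inherits a
# ``tokenize()`` method from the Punkt base class. The import path varies by
# CLTK version and is an assumption.
# from cltk.sentence.lat import LatinPunktSentenceTokenizer
splitter = LatinPunktSentenceTokenizer(strict=True)
sentences = splitter.tokenize('Gallia est omnis divisa in partes tres; quarum unam incolunt Belgae.')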
# Latin Lemmatizer (OLD)
# la_corpus_importer = CorpusImporter('latin')
# la_corpus_importer.import_corpus('latin_text_latin_library')
# la_corpus_importer.import_corpus('latin_models_cltk')
# la_lemmatizer = LemmaReplacer('latin')

# Latin Lemmatizer (NEW with backoff)
# Set up training sentences
rel_path = os.path.join('/Users/christiancasey/cltk_data/latin/model/latin_models_cltk/lemmata/backoff')
path = os.path.expanduser(rel_path)

# Check for presence of latin_pos_lemmatized_sents
file = 'latin_pos_lemmatized_sents.pickle'
latin_pos_lemmatized_sents_path = os.path.join(path, file)
if os.path.isfile(latin_pos_lemmatized_sents_path):
    latin_pos_lemmatized_sents = open_pickle(latin_pos_lemmatized_sents_path)
else:
    latin_pos_lemmatized_sents = []
    print('The file %s is not available in cltk_data' % file)

la_lemmatizer = BackoffLatinLemmatizer(latin_pos_lemmatized_sents)

# Greek Lemmatizer
grc_corpus_importer = CorpusImporter('greek')
grc_corpus_importer.import_corpus('greek_models_cltk')
grc_lemmatizer = LemmaReplacer('greek')

# Initialize lemmatizers once outside of the loop,
# then select based on language inside the loop -- get_words_from_file()
tagLat = POSTag('latin')
tagGrk = POSTag('greek')
def getmodel(fname):
    return open_pickle(join(path, fname))
from extract_features import parse_tess
from cltk.utils.file_operations import open_pickle
from nltk.tokenize.punkt import PunktLanguageVars, PunktTrainer, PunktSentenceTokenizer

PunktLanguageVars.sent_end_chars = ('.', ';', ';')
PunktLanguageVars.internal_punctuation = (',', '·', ':')

text = parse_tess('tesserae/texts/grc/xenophon.anabasis.tess')

tokenizer = open_pickle('tokenizers/ancient_greek.pickle')
print('Xenophon tokens: ' + str(len(tokenizer.tokenize(text))))
print()

trainer = PunktTrainer(lang_vars=PunktLanguageVars())
trainer.INCLUDE_ALL_COLLOCS = True
trainer.INCLUDE_ABBREV_COLLOCS = True
trainer.train(text, verbose=True)
new_tokenizer = PunktSentenceTokenizer(trainer.get_params())

print('tokenizers equal? ' + str(tokenizer == new_tokenizer))
print('tokenization equal? ' + str(tokenizer.tokenize(text) == new_tokenizer.tokenize(text)))

old_tok_out = open('feature_data/old_tok.txt', mode='w')
old_tok_out.write('\n'.join(tokenizer.tokenize(text)))
new_tok_out = open('feature_data/new_tok.txt', mode='w')
new_tok_out.write('\n'.join(new_tokenizer.tokenize(text)))

'''
There seem to be very few abbreviations in the tesserae corpus. This means training
the PunktSentenceTokenizer might not yield any improvement. From the paper abstract:
"[Punkt sentence tokenization training] is based on the assumption that a large number
of ambiguities in the determination of sentence boundaries can be eliminated once
abbreviations have been identified."
'''
import os

from cltk.utils.file_operations import open_pickle
from nltk.tokenize.punkt import PunktLanguageVars, PunktTrainer, PunktSentenceTokenizer

from progress_bar import print_progress_bar
from extract_features import file_parsers

PunktLanguageVars.sent_end_chars = ('.', ';', ';')
PunktLanguageVars.internal_punctuation = (',', '·', ':')

cltk_params = open_pickle('tokenizers/ancient_greek.pickle')._params
kjohnson_params = open_pickle('feature_data/kjohnson_greek.pickle').get_params()

# Are the attributes from ~/cltk_data/greek/model/greek_models_cltk/tokenizers/sentence/greek.pickle
# the same as https://github.com/cltk/greek_training_set_sentence_cltk/blob/master/greek.pickle ?
# Yes they are
print(cltk_params.abbrev_types)
print(cltk_params.abbrev_types == kjohnson_params.abbrev_types)
print()
print(cltk_params.collocations)
print(cltk_params.collocations == kjohnson_params.collocations)
print()
print(cltk_params.sent_starters)
print(cltk_params.sent_starters == kjohnson_params.sent_starters)
print()
print(cltk_params.ortho_context)
print(cltk_params.ortho_context == kjohnson_params.ortho_context)
print()

p = PunktSentenceTokenizer()._params
print('Defaults')
print(p.abbrev_types)
print(p.collocations)
print(p.sent_starters)
lemmatizedTextList = []  # holds the versions of the title as we lemmatize them
lemmatizer = LemmaReplacer('latin')
lengthOfDataFile = 0  # number of rows in data file
numberOfFails = 0
numberOfSuccesses = 0
preprocessedTitle = ""  # a temp string where we store the ongoing preprocessing work on a title
successfulHits = []
word_tokenizer = WordTokenizer('latin')

# build standard dictionary/model
# courtesy of Patrick Burns
rel_path = os.path.join('~/cltk_data/latin/model/latin_models_cltk/lemmata/backoff')
path = os.path.expanduser(rel_path)
file = 'latin_lemmata_cltk.pickle'
old_model_path = os.path.join(path, file)
LATIN_OLD_MODEL = open_pickle(old_model_path)

# make standard lemmatizer
# as an instance of TrainLemmatizer
# courtesy of Patrick Burns
lemmatizer = TrainLemmatizer(model=LATIN_OLD_MODEL, backoff=default)

# import custom dictionary csv as a python dictionary
customDictionaryPath = os.path.join(cwd, 'customDictionary.csv')
with open(customDictionaryPath, 'r') as f:  # this closes the file at the end of the with block
    reader = csv.DictReader(f)
    for row in reader:
        customDictionaryCurrentLength += 1
        if row['lemma'] == "":
            continue  # in case a token has been added to the custom dictionary but no lemma has yet been provided for it
        customDictionary[row['token']] = row['lemma']
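# Hedged sketch of a likely next step (not in the original script): wrap the custom
# dictionary in another TrainLemmatizer so custom entries are tried first and unknown
# tokens fall back to the standard model built above. The chaining order and the
# ``lemmatize()`` call are assumptions; only the TrainLemmatizer(model=..., backoff=...)
# API is taken from the script itself.
customLemmatizer = TrainLemmatizer(model=customDictionary, backoff=lemmatizer)
# tokens = word_tokenizer.tokenize(preprocessedTitle)
# lemmatizedTextList.append(customLemmatizer.lemmatize(tokens))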
def test_open_pickle_fail_missing(self):
    """Test failure to unpickle a file that doesn't exist."""
    bad_file = 'cltk/tests/doesnt_exist.pickle'
    with self.assertRaises(FileNotFoundError):
        open_pickle(bad_file)
def test_open_pickle_fail_corrupt(self):
    """Test failure to open a corrupted pickle."""
    bad_file = 'cltk/tests/bad_pickle.pickle'
    with self.assertRaises(EOFError):
        open_pickle(bad_file)
import os

from cltk.utils.file_operations import open_pickle

from extract_features import file_parsers
from progress_bar import print_progress_bar

xeno_tokenizer = open_pickle('tokenizers/ancient_greek.pickle')
tess_tokenizer = open_pickle('feature_data/tesserae_greek.pickle')

corpus_dir = 'tesserae' + os.sep + 'texts' + os.sep + 'grc'
file_extension = 'tess'

# Obtain all the files to parse by traversing through the directory
file_names = sorted(list({current_path + os.sep + current_file_name
                          for current_path, current_dir_names, current_file_names in os.walk(corpus_dir)
                          for current_file_name in current_file_names
                          if current_file_name.endswith('.' + file_extension)}))

counter = 1
for file_name in ['tesserae/texts/grc/achilles_tatius.leucippe_et_clitophon.tess']:  # file_names:
    file_text = file_parsers[file_extension](file_name)
    x_tokens = xeno_tokenizer.tokenize(file_text)
    t_tokens = tess_tokenizer.tokenize(file_text)
    if t_tokens != x_tokens:
        xeno_out = open('feature_data/xeno_token_achilles.txt', mode='w')
        xeno_out.write('\n'.join(x_tokens))
        tess_out = open('feature_data/tess_token_achilles.txt', mode='w')
        tess_out.write('\n'.join(t_tokens))
    # print_progress_bar(counter, len(file_names))
    counter += 1

'''
I trained Punkt on the entire tesserae corpus (feature_data/tesserae_greek.pickle).
Its performance was actually worse than the tokenizer that was created from training
on just Xenophon (tokenizers/ancient_greek.pickle). The tokenizer created from training
on just Xenophon does well, except for failing to tokenize sentences where the terminal
punctuation is not followed by a space.
'''
# The greek.pickle used by cltk for ancient greek will unserialize as a PunktTrainer object.
# This script converts it into a PunktSentenceTokenizer.
import os
import pickle

from cltk.utils.file_operations import open_pickle
from nltk.tokenize.punkt import PunktLanguageVars, PunktSentenceTokenizer

lang = 'greek'
file = 'greek.pickle'

PunktLanguageVars.sent_end_chars = ('.', ';', ';')
PunktLanguageVars.internal_punctuation = (',', '·', ':')

rel_path = os.path.join('~/cltk_data', lang, 'model/' + lang + '_models_cltk/tokenizers/sentence')
path = os.path.expanduser(rel_path)
tokenizer_path = os.path.join(path, file)

trainer = open_pickle(tokenizer_path)
trainer.INCLUDE_ALL_COLLOCS = True
trainer.INCLUDE_ABBREV_COLLOCS = True
tokenizer = PunktSentenceTokenizer(trainer.get_params())

with open('ancient_greek.pickle', 'wb') as pickle_file:
    pickle_file.write(pickle.dumps(tokenizer))
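# Optional sanity check (an addition, not in the original script): reload the file
# just written and confirm it round-trips as a PunktSentenceTokenizer.
with open('ancient_greek.pickle', 'rb') as pickle_file:
    restored = pickle.load(pickle_file)
assert isinstance(restored, PunktSentenceTokenizer)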
    # (tail of the lemmatizer class's evaluate() method; the snippet is truncated above)
    lemmatizer = self._define_lemmatizer()
    return lemmatizer.evaluate(self.test_sents)


if __name__ == "__main__":
    # Set up training sentences
    rel_path = os.path.join('~/cltk_data/latin/model/latin_models_cltk/lemmata/backoff')
    path = os.path.expanduser(rel_path)

    # Check for presence of latin_pos_lemmatized_sents
    file = 'latin_pos_lemmatized_sents.pickle'
    latin_pos_lemmatized_sents_path = os.path.join(path, file)
    if os.path.isfile(latin_pos_lemmatized_sents_path):
        latin_pos_lemmatized_sents = open_pickle(latin_pos_lemmatized_sents_path)
    else:
        latin_pos_lemmatized_sents = []
        print('The file %s is not available in cltk_data' % file)

    RUN = 10
    ACCURACIES = []

    for I in range(RUN):
        LEMMATIZER = BackoffLatinLemmatizer(latin_pos_lemmatized_sents)
        ACC = LEMMATIZER.evaluate()
        ACCURACIES.append(ACC)
        print('{:.2%}'.format(ACC))

    print('\nTOTAL (Run %d) times' % RUN)
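    # Aggregate step (an addition, not in the original script): the loop above collects
    # per-run accuracies but never summarizes them, so report their mean.
    print('Average accuracy over {} runs: {:.2%}'.format(RUN, sum(ACCURACIES) / RUN))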
from cltk.utils.file_operations import open_pickle
from nltk.tokenize.punkt import PunktLanguageVars, PunktTrainer, PunktSentenceTokenizer

from extract_features import parse_tess

PunktLanguageVars.sent_end_chars = ('.', ';', ';')
PunktLanguageVars.internal_punctuation = (',', '·', ':')

text = parse_tess('tesserae/texts/grc/xenophon.anabasis.tess')

new_xeno_trainer = PunktTrainer()
# new_xeno_trainer.INCLUDE_ALL_COLLOCS = True
# new_xeno_trainer.INCLUDE_ABBREV_COLLOCS = True
new_xeno_trainer.train(text)
new_xeno_params = new_xeno_trainer.get_params()

tess_xeno_params = open_pickle('tokenizers/ancient_greek.pickle')._params

print(new_xeno_params.abbrev_types)
print(new_xeno_params.abbrev_types == tess_xeno_params.abbrev_types)
print()
print(new_xeno_params.collocations)
print(new_xeno_params.collocations == tess_xeno_params.collocations)
print()
print(new_xeno_params.sent_starters)
print(new_xeno_params.sent_starters == tess_xeno_params.sent_starters)
print()
print(new_xeno_params.ortho_context)
print(new_xeno_params.ortho_context == tess_xeno_params.ortho_context)
print()

'''
I got the internal PunktParameters object from the cltk pickle file that was trained on
Xenophon's Anabasis (https://github.com/cltk/greek_training_set_sentence_cltk/blob/master/training_sentences.txt),
and I also got the internal PunktParameters object from a PunktTrainer that I created by
training on Xenophon's Anabasis from the tesserae corpus
(https://github.com/tesserae/tesserae/blob/master/texts/grc/xenophon.anabasis.tess).
'''