def test_json_corpus_reader(self):
    """Check document, paragraph, sentence and word counts of Perseus JSON texts."""

    def size(stream):
        # Exhaust the reader's iterable so its items can be counted.
        return len(list(stream))

    # Cicero: a text with simple sections.
    cicero = get_corpus_reader(language='latin',
                               corpus_name='latin_text_perseus')
    cicero._fileids = ['cicero__on-behalf-of-aulus-caecina__latin.json']
    self.assertTrue(size(cicero.paras()) >= 1)
    self.assertTrue(size(cicero.sents()) > 400)
    self.assertTrue(size(cicero.words()) > 12200)

    # Ausonius: a text with subsections.
    ausonius = get_corpus_reader(language='latin',
                                 corpus_name='latin_text_perseus')
    ausonius._fileids = [
        'ausonius-decimus-magnus__eclogarum-liber__latin.json'
    ]
    self.assertTrue(size(ausonius.docs()) == 1)
    self.assertTrue(size(ausonius.paras()) >= 1)
    self.assertTrue(size(ausonius.sents()) > 50)
    self.assertTrue(size(ausonius.words()) > 2750)

    # Plato: the same checks against the Greek Perseus corpus.
    plato = get_corpus_reader(corpus_name='greek_text_perseus',
                              language='greek')
    plato._fileids = ['plato__apology__grc.json']
    self.assertTrue(size(plato.docs()) == 1)
    self.assertTrue(size(plato.paras()) > 1)
    self.assertTrue(size(plato.sents()) > 260)
    self.assertTrue(size(plato.words()) > 9800)
def setUpClass(cls):
    """Import the Latin Library corpus and create the readers used by the tests.

    Raises:
        Exception: if importing the test corpus fails. The underlying error
            is chained as ``__cause__`` so the real reason stays visible.
    """
    try:
        corpus_importer = CorpusImporter('latin')
        corpus_importer.import_corpus('latin_text_latin_library')
    except Exception as err:
        # `except Exception` (not a bare `except`) lets KeyboardInterrupt and
        # SystemExit propagate instead of being reported as a download error.
        raise Exception('Failure to download test corpus') from err

    cls.reader = get_corpus_reader(language='latin',
                                   corpus_name='latin_text_latin_library')
    cls.reader._fileids = ['pervig.txt']

    # TODO: additional reader instances are needed because the tests below
    # change reader internals; fix so that a single reader can be shared.
    cls.reader_2 = get_corpus_reader(language='latin',
                                     corpus_name='latin_text_latin_library')
    cls.reader_3 = get_corpus_reader(language='latin',
                                     corpus_name='latin_text_latin_library')
    cls.reader_4 = get_corpus_reader(language='latin',
                                     corpus_name='latin_text_latin_library')
def test_json_corpus_reader(self):
    """Check paragraph, sentence and word counts of two Latin Perseus JSON texts."""
    corpus = "latin_text_perseus"

    # First text: simple sections only.
    simple = get_corpus_reader(language="latin", corpus_name=corpus)
    simple._fileids = ["cicero__on-behalf-of-aulus-caecina__latin.json"]
    para_total = len(list(simple.paras()))
    self.assertTrue(para_total >= 1)
    sent_total = len(list(simple.sents()))
    self.assertTrue(sent_total > 400)
    word_total = len(list(simple.words()))
    self.assertTrue(word_total > 12000)

    # Second text: contains subsections.
    nested = get_corpus_reader(language="latin", corpus_name=corpus)
    nested._fileids = ["ausonius-decimus-magnus__eclogarum-liber__latin.json"]
    doc_total = len(list(nested.docs()))
    self.assertTrue(doc_total == 1)
    para_total = len(list(nested.paras()))
    self.assertTrue(para_total >= 1)
    sent_total = len(list(nested.sents()))
    self.assertTrue(sent_total > 50)
    word_total = len(list(nested.words()))
    self.assertTrue(word_total > 2750)
def test_json_corpus_reader(self):
    """Check paragraph, sentence and word counts of two Latin Perseus JSON texts."""

    def size(stream):
        # Exhaust the reader's iterable so its items can be counted.
        return len(list(stream))

    # Cicero: a text with simple sections.
    cicero = get_corpus_reader(language='latin',
                               corpus_name='latin_text_perseus')
    cicero._fileids = ['cicero__on-behalf-of-aulus-caecina__latin.json']
    self.assertTrue(size(cicero.paras()) >= 1)
    self.assertTrue(size(cicero.sents()) > 400)
    self.assertTrue(size(cicero.words()) > 12200)

    # Ausonius: a text with subsections.
    ausonius = get_corpus_reader(language='latin',
                                 corpus_name='latin_text_perseus')
    ausonius._fileids = ['ausonius-decimus-magnus__eclogarum-liber__latin.json']
    self.assertTrue(size(ausonius.docs()) == 1)
    self.assertTrue(size(ausonius.paras()) >= 1)
    self.assertTrue(size(ausonius.sents()) > 50)
    self.assertTrue(size(ausonius.words()) > 2750)
def setUpClass(cls):
    """Import the Latin Library corpus and create the readers used by the tests.

    Raises:
        Exception: if importing the test corpus fails. The underlying error
            is chained as ``__cause__`` so the real reason stays visible.
    """
    try:
        corpus_importer = CorpusImporter('latin')
        corpus_importer.import_corpus('latin_text_latin_library')
    except Exception as err:
        # `except Exception` (not a bare `except`) lets KeyboardInterrupt and
        # SystemExit propagate instead of being reported as a download error.
        raise Exception('Failure to download test corpus') from err

    cls.reader = get_corpus_reader(language='latin',
                                   corpus_name='latin_text_latin_library')
    cls.reader._fileids = ['pervig.txt']

    # TODO: additional reader instances are needed because the tests below
    # change reader internals; fix so that a single reader can be shared.
    cls.reader_2 = get_corpus_reader(
        language='latin', corpus_name='latin_text_latin_library')
    cls.reader_3 = get_corpus_reader(
        language='latin', corpus_name='latin_text_latin_library')
    cls.reader_4 = get_corpus_reader(
        language='latin', corpus_name='latin_text_latin_library')
def __init__(self):
    """Create the Greek tokenizers, corpus reader, lemmatizer and TF-IDF vectorizer."""
    greek = 'greek'
    self.sent_tokenizer = SentenceTokenizer()
    self.word_tokenizer = WordTokenizer(greek)
    # Reader over the Perseus Greek texts.
    self.corpus_reader = get_corpus_reader(language=greek,
                                           corpus_name='greek_text_perseus')
    self.lemmatizer = LemmaReplacer(greek)
    # NOTE(review): input="filename" presumably makes the vectorizer take
    # file paths rather than raw text (scikit-learn semantics) -- confirm.
    self.tfidf_vectorizer = TfidfVectorizer(input="filename")
def test_import_latin_library_corpus_filter_by_file_and_dir(self):
    """Filtering the Latin Library by the 'old' type (files and directories) keeps some files."""
    latin_library = get_corpus_reader(language='latin',
                                      corpus_name='latin_text_latin_library')
    assembled = assemble_corpus(latin_library, ['old'],
                                corpus_directories_by_type,
                                corpus_texts_by_type)
    # Only the filtered reader is inspected; the matched files and
    # directories returned alongside it are not used by this test.
    filtered_reader, _files_found, _dirs_found = assembled
    remaining = list(filtered_reader.fileids())
    self.assertTrue(len(remaining) > 0)
def test_import_latin_library_corpus_reader(self):
    """Import the Latin Library corpus and check that its reader lists more than 2100 files."""
    CorpusImporter('latin').import_corpus('latin_text_latin_library')
    latin_library = get_corpus_reader(language='latin',
                                      corpus_name='latin_text_latin_library')
    file_ids = list(latin_library.fileids())
    self.assertTrue(len(file_ids) > 2100)
def test_import_latin_library_corpus_reader(self):
    """Import the Latin Library corpus and check that its reader lists more than 2100 files."""
    importer = CorpusImporter('latin')
    importer.import_corpus('latin_text_latin_library')
    # Count the file ids exposed by a fresh reader over the imported corpus.
    file_count = len(list(
        get_corpus_reader(language='latin',
                          corpus_name='latin_text_latin_library').fileids()))
    self.assertTrue(file_count > 2100)
def setUpClass(self):
    """Import the Latin models and Latin Library corpus and create the shared readers.

    NOTE(review): the first parameter is named ``self`` but, as a unittest
    ``setUpClass`` hook, it presumably receives the test class (decorator not
    visible here) -- confirm.

    Raises:
        Exception: if importing either corpus fails. The underlying error is
            chained as ``__cause__`` so the real reason stays visible.
    """
    try:
        corpus_importer = CorpusImporter("latin")
        corpus_importer.import_corpus("latin_models_cltk")
        corpus_importer.import_corpus("latin_text_latin_library")
    except Exception as err:
        # `except Exception` (not a bare `except`) lets KeyboardInterrupt and
        # SystemExit propagate instead of being reported as a download error.
        raise Exception("Failure to download test corpus") from err

    self.reader = get_corpus_reader(language="latin",
                                    corpus_name="latin_text_latin_library")
    self.reader._fileids = ["pervig.txt"]

    # TODO: additional reader instances are needed because the tests below
    # change reader internals; fix so that a single reader can be shared.
    self.reader_2 = get_corpus_reader(
        language="latin", corpus_name="latin_text_latin_library")
    self.reader_3 = get_corpus_reader(
        language="latin", corpus_name="latin_text_latin_library")
    self.reader_4 = get_corpus_reader(
        language="latin", corpus_name="latin_text_latin_library")
def test_import_latin_library_corpus_filter_by_file(self):
    """Filtering the Latin Library by the 'old' type (files only) keeps some files."""
    latin_library = get_corpus_reader(language='latin',
                                      corpus_name='latin_text_latin_library')
    wanted_types = ['old']
    filtered_reader = assemble_corpus(latin_library,
                                      types_requested=wanted_types,
                                      type_files=corpus_texts_by_type)
    remaining = list(filtered_reader.fileids())
    self.assertTrue(len(remaining) > 0)
def main():
    """Search Homer for every occurrence of the lemma of a given inflected word.

    Reads one inflected word from ``sys.argv[1]``, lemmatizes it, and scans
    each sentence of books 1-24 of the Iliad and the Odyssey in the Tesserae
    Greek corpus. For every token whose lemma equals the target lemma it
    prints the work, book, sentence number, word number, the surface form
    (with a part-of-speech description when the tagger output contains that
    form) and up to three words of context on either side. Finally it prints
    how often each matching surface form occurred, sorted by form.

    Exits via ``sys.exit()`` after printing a usage hint when no word is given.
    """
    if len(sys.argv) < 2:
        print(
            "Please supply an inflected word on the command line. Example: search_by_lemma.py κύνεσσιν\n"
        )
        sys.exit()
    infl = sys.argv[1]
    lem = lemmatize(infl)[0]  # lemma of the query word
    print("searching for " + lem + " <- " + infl)

    index = {}  # surface form -> number of hits
    for work in ["iliad", "odyssey"]:
        for book in range(1, 24 + 1):  # books 1 to 24
            filename = 'texts/homer.' + work + '.part.' + str(book) + '.tess'
            reader = get_corpus_reader(corpus_name='greek_text_tesserae',
                                       language='greek')
            reader._fileids = [filename]
            sentences = [cltk_normalize(s) for s in reader.sents([filename])]
            for count_sentences, s in enumerate(sentences, start=1):
                # Remove punctuation, which the lemmatizer treats as
                # independent words.
                no_punct = re.sub(r"[,;:\.']", '', s)
                words = re.split(r"\s+", no_punct)
                for count_words, word in enumerate(lemmatize(no_punct),
                                                   start=1):
                    if lem != word:
                        continue
                    i = count_words - 1
                    w = words[i]
                    # Slice ends are exclusive and clamp to the list length,
                    # so `i + 4` keeps up to three words after the match
                    # without dropping the sentence's last word.
                    context = " ".join(words[max(i - 3, 0):i + 4])
                    # context = re.sub(re.compile("("+w+")"),r"__\1__",context)
                    # ... would surround the match with __ __
                    # Tag the words of the sentence with parts of speech, see
                    # https://github.com/cltk/tutorials/blob/master/8%20Part-of-speech%20tagging.ipynb
                    # For descriptions of what the POS tags mean, see
                    # https://linguistics.stackexchange.com/questions/12803/what-do-the-labels-mean-in-this-latin-pos-tagging
                    pos_tagged = tagger.tag_tnt(no_punct)
                    describe = w
                    for t in pos_tagged:
                        if t[0] == w:
                            describe = t[0] + " " + pos_tag_to_description(
                                t[1])
                            break
                    print(work + " " + str(book) + ", sentence " +
                          str(count_sentences) + ", word " +
                          str(count_words) + ": " + describe + " " + context)
                    index[w] = index.get(w, 0) + 1

    for w in sorted(index):
        print(str(index[w]) + " " + w)
def getWordList(selectedWork):
    """Return the words of one work from the Perseus Greek corpus.

    Args:
        selectedWork: file id of the work inside ``greek_text_perseus``.

    Returns:
        A list of the items yielded by the reader's ``words()`` for that file.
    """
    reader = get_corpus_reader(corpus_name='greek_text_perseus',
                               language='greek')
    # Restrict the reader to the requested work before reading anything. The
    # previous version first materialised every document of the whole corpus
    # into an unused variable, which only cost time and memory.
    reader._fileids = [selectedWork]
    return list(reader.words())
def test_filtered_corpus_reader_docs(self):
    """docs() for catullus.txt yields documents without the words 'Latin' or 'Library'."""
    catullus = get_corpus_reader(language='latin',
                                 corpus_name='latin_text_latin_library')
    catullus._fileids = ['catullus.txt']
    docs = list(catullus.docs())
    words = distinct_words(docs)
    # Neither of these words may remain in the reader's output.
    for banned in ('Latin', 'Library'):
        if banned in words:
            self.fail('Filtered word present!')
    self.assertTrue(len(docs) > 0)
def test_filtered_corpus_reader_paras(self):
    """paras() for catullus.txt yields paragraphs without the words 'Latin' or 'Library'."""
    catullus = get_corpus_reader(language='latin',
                                 corpus_name='latin_text_latin_library')
    catullus._fileids = ['catullus.txt']
    paras = list(catullus.paras())
    # Flatten the paragraphs into a single list of their sentences.
    flattened = []
    for para in paras:
        flattened.extend(para)
    uniq_words = distinct_words(flattened)
    # Neither of these words may remain in the reader's output.
    for banned in ('Latin', 'Library'):
        if banned in uniq_words:
            self.fail('Filtered word present!')
    self.assertTrue(len(paras) > 0)
def test_tesserae_corpus_reader(self):
    """Smoke-test each accessor of the Greek Tesserae reader on its first file."""
    # TODO: update when the corpus is added to CLTK.
    tesserae = get_corpus_reader(language="greek",
                                 corpus_name="greek_text_tesserae")
    first_file = tesserae.fileids()[0]
    # Each accessor must yield at least one item for the sample file.
    for accessor in ("docs", "texts", "paras", "sents", "words", "lines"):
        items = list(getattr(tesserae, accessor)(first_file))
        self.assertTrue(len(items) >= 1)
    self.assertTrue(tesserae.describe())
    tagged = list(tesserae.pos_tokenize(first_file))
    self.assertTrue(len(tagged) >= 1)
def test_tesserae_corpus_reader(self):
    """Smoke-test each accessor of the Greek Tesserae reader on its first file."""

    def has_items(stream):
        # True when the reader's iterable yields at least one item.
        return len(list(stream)) >= 1

    # TODO: update when the corpus is added to CLTK.
    tesserae = get_corpus_reader(language='greek',
                                 corpus_name='greek_text_tesserae')
    first_file = tesserae.fileids()[0]
    self.assertTrue(has_items(tesserae.docs(first_file)))
    self.assertTrue(has_items(tesserae.texts(first_file)))
    self.assertTrue(has_items(tesserae.paras(first_file)))
    self.assertTrue(has_items(tesserae.sents(first_file)))
    self.assertTrue(has_items(tesserae.words(first_file)))
    self.assertTrue(has_items(tesserae.lines(first_file)))
    self.assertTrue(tesserae.describe())
    self.assertTrue(has_items(tesserae.pos_tokenize(first_file)))
def choose_corpus(self, corpus_name: str = '') -> None:
    """Select a Latin corpus and load its catalogue of file ids.

    Stores three attributes on the instance: ``reader`` (the corpus reader
    for ``corpus_name``), ``catalog`` (a list of the reader's file ids) and
    ``corpus_name``. Nothing is returned or displayed.

    :param corpus_name: either 'latin_text_latin_library' or
        'latin_text_perseus'. The original documentation states these are
        listed in the ``corpus_names`` attribute.
    """
    self.reader = get_corpus_reader(language='latin', corpus_name=corpus_name)
    self.catalog = list(self.reader.fileids())
    self.corpus_name = corpus_name
def extract(name):
    """Collect cleaned sentences from one of the supported Latin corpora.

    Args:
        name: corpus name passed to ``get_corpus_reader``; one of
            "latin_text_perseus", "latin_text_tesserae" or
            "latin_text_latin_library".

    Returns:
        A list of cleaned sentences. A sentence is kept only when it has at
        least five whitespace-separated tokens and contains no "�" character.

    Raises:
        ValueError: if ``name`` is not one of the supported corpora
            (previously this surfaced as an UnboundLocalError).
    """
    reader = get_corpus_reader(language="latin", corpus_name=name)
    if name == "latin_text_perseus":
        sentences = reader.sents()
    elif name == "latin_text_tesserae":
        sentences = reader.sents(fileids=reader.fileids())
    elif name == "latin_text_latin_library":
        # This reader's sentences are joined with spaces into one string.
        sentences = (" ".join(sentence) for sentence in reader.sents())
    else:
        raise ValueError("Unsupported corpus name: {!r}".format(name))

    lines = []
    for sentence in tqdm(sentences):
        try:
            cleaned_sentence = preprocess(preprocess_like_evalatin(sentence))
            cleaned_sentence = re.sub(r"\s+", " ", cleaned_sentence).strip()
            if len(cleaned_sentence.split()) >= 5:
                if "�" not in cleaned_sentence:
                    lines.append(cleaned_sentence)
        except Exception:
            # Best effort: skip sentences that cannot be cleaned, but let
            # KeyboardInterrupt / SystemExit propagate.
            continue
    return lines
def test_filtered_corpus_reader_docs(self):
    """docs() yields filtered, non-trivial documents, including for known problem files."""
    reader = get_corpus_reader(language='latin',
                               corpus_name='latin_text_latin_library')
    reader._fileids = ['catullus.txt']
    docs = list(reader.docs())
    words = distinct_words(docs)
    # Neither of these words may remain in the reader's output.
    for banned in ('Latin', 'Library'):
        if banned in words:
            self.fail('Filtered word present!')
    self.assertGreater(len(docs), 0)

    problem_files = [
        'caesar/bc3.txt', 'hymni.txt', 'varro.frag.txt', 'varro.ll10.txt',
        'varro.ll5.txt', 'varro.ll6.txt', 'varro.ll7.txt', 'varro.ll8.txt',
        'varro.ll9.txt'
    ]
    for filename in problem_files:
        # subTest reports every failing file instead of stopping at the first;
        # unittest assertions (unlike bare `assert`) survive `python -O`.
        with self.subTest(filename=filename):
            doc = list(reader.docs([filename]))
            self.assertTrue(doc)
            self.assertGreater(len(doc[0]), 100)
from cltk.corpus.utils.importer import CorpusImporter
from cltk.corpus.readers import get_corpus_reader
from cltk.stem.lemma import LemmaReplacer
from cltk.corpus.utils.formatter import cltk_normalize
from cltk.lemmatize.greek.backoff import BackoffGreekLemmatizer
from cltk.phonology.greek.transcription import Transcriber
from cltk.tag.pos import POSTag
from cltk.tag import ner

# Import the Greek models and the Perseus Greek texts (one importer each).
corpus_importer = CorpusImporter('greek')
corpus_importer.import_corpus('greek_models_cltk')
corpus_importer2 = CorpusImporter('greek')
corpus_importer2.import_corpus('greek_text_perseus')

# Reader over the Perseus Greek corpus, restricted to the Letter to the
# Philippians.
philippians_reader = get_corpus_reader(corpus_name="greek_text_perseus", language="greek")
philippians_reader._fileids = [
    'new-testament__letter-to-the-philippians__grc.json'
]
# Debug aid: print(list(philippians_reader.sents()))
sentences = list(philippians_reader.sents())
# Only the first sentence is processed below.
sentence = cltk_normalize(sentences[0])

# Lemmatize the normalized sentence.
lemmatizer = LemmaReplacer('greek')
word_list = lemmatizer.lemmatize(sentence)

# Part-of-speech tag the same sentence with the tagger's 1-2-3-gram backoff method.
tagger = POSTag('greek')
parts_of_speech = tagger.tag_ngram_123_backoff(sentence)
def __init__(self, corpus_name):
    """Store the corpus name and catalog and open a Latin reader for the corpus.

    :param corpus_name: corpus identifier forwarded to ``get_corpus_reader``.
    """
    self.corpus_name = corpus_name
    # NOTE(review): `catalog` is neither a parameter nor a local of this
    # method; it must resolve to a name defined outside this view (e.g. at
    # module level) -- confirm, otherwise this line raises NameError.
    self.catalog = catalog
    self.reader = get_corpus_reader(language='latin', corpus_name=corpus_name)
def test_json_corpus_reader_sizes(self):
    """sizes() on the Latin Perseus reader yields more than 290 entries."""
    perseus = get_corpus_reader(language='latin',
                                corpus_name='latin_text_perseus')
    sizes = list(perseus.sizes())
    self.assertTrue(len(sizes) > 290)
def test_json_corpus_reader_sizes(self):
    """sizes() on the Latin Perseus reader yields more than 290 entries."""
    perseus = get_corpus_reader(language='latin',
                                corpus_name='latin_text_perseus')
    # Materialise the sizes so they can be counted.
    entry_count = len(list(perseus.sizes()))
    self.assertTrue(entry_count > 290)
def test_filtered_corpus_reader_sizes(self):
    """sizes() on the Latin Library reader yields at least one entry for catullus.txt."""
    catullus = get_corpus_reader(language='latin',
                                 corpus_name='latin_text_latin_library')
    catullus._fileids = ['catullus.txt']
    sizes = list(catullus.sizes())
    self.assertTrue(len(sizes) > 0)