예제 #1
0
 def setUp(self):
     """Import ``french_data_cltk`` and assert that its README.md exists."""
     importer = CorpusImporter('french')
     importer.import_corpus('french_data_cltk')
     readme = os.path.expanduser(
         os.path.join('~/cltk_data/french/text/french_data_cltk/README.md'))
     self.assertTrue(os.path.isfile(readme))
예제 #2
0
 def test_import_latin_library_corpus_reader(self):
     """Import the Latin Library corpus and check the reader's file count.

     The reader must list more than 2100 file ids.
     """
     importer = CorpusImporter('latin')
     importer.import_corpus('latin_text_latin_library')
     latin_reader = get_corpus_reader(
         corpus_name='latin_text_latin_library', language='latin')
     file_count = len(list(latin_reader.fileids()))
     self.assertTrue(file_count > 2100)
예제 #3
0
def _install(lang, lst):
    """Import every corpus in ``lst`` whose ``'location'`` is ``'remote'``.

    Args:
        lang: Language name handed to ``CorpusImporter``.
        lst: Iterable of mappings with ``'location'`` and ``'name'`` keys.
    """
    print("Downloading %s " % (lang))
    importer = CorpusImporter(lang)
    for entry in lst:
        # Entries that are not remote are skipped without any output.
        if entry['location'] != 'remote':
            continue
        print("    Downloading %s " % (entry['name']))
        importer.import_corpus(entry['name'])
예제 #4
0
파일: test_tmp.py 프로젝트: j-duff/cltk
 def test_import_nonexistant_corpus(self):
     """Check that importing an unknown corpus raises ``CorpusImportError``.

     Both the ``CorpusImporter('greek')`` construction and the import of
     ``'euclids_book_of_recipes'`` run inside the ``assertRaises`` context.
     """
     with self.assertRaises(CorpusImportError):
         CorpusImporter('greek').import_corpus('euclids_book_of_recipes')
예제 #5
0
 def setUp(self):
     """Import ``old_english_models_cltk`` and assert its README.md exists."""
     importer = CorpusImporter("old_english")
     importer.import_corpus("old_english_models_cltk")
     readme = os.path.expanduser(os.path.join(
         '~/cltk_data/old_english/model/old_english_models_cltk/README.md'))
     self.assertTrue(os.path.isfile(readme))
	def get_cltkData(self):
		"""Import the Latin and Greek model and text corpora, in that order."""
		wanted = (
			('latin', ('latin_models_cltk', 'latin_text_latin_library')),
			('greek', ('greek_models_cltk', 'greek_text_perseus')),
		)
		# One importer per language, reused for all of that language's corpora.
		for language, corpus_names in wanted:
			importer = CorpusImporter(language)
			for corpus_name in corpus_names:
				importer.import_corpus(corpus_name)
예제 #7
0
파일: test_corpus.py 프로젝트: mcneela/cltk
 def test_git_import_chinese_cbeta_txt(self):
     """Import ``chinese_text_cbeta_txt`` and assert its README.md exists."""
     importer = CorpusImporter('chinese')
     importer.import_corpus('chinese_text_cbeta_txt')
     readme = os.path.expanduser(os.path.join(
         '~/cltk_data/chinese/text/chinese_text_cbeta_txt/README.md'))
     self.assertTrue(os.path.isfile(readme))
예제 #8
0
 def test_import_latin_text_antique_digiliblt(self):
     """Import ``latin_text_antique_digiliblt`` and assert its README.md exists."""
     importer = CorpusImporter('latin')
     importer.import_corpus('latin_text_antique_digiliblt')
     readme = os.path.expanduser(os.path.join(
         '~/cltk_data/latin/text/latin_text_antique_digiliblt/README.md'))
     self.assertTrue(os.path.isfile(readme))
예제 #9
0
파일: test_corpus.py 프로젝트: mcneela/cltk
 def test_import_la_text_lac_curt(self):
     """Import ``latin_text_lacus_curtius`` and assert its README.md exists."""
     importer = CorpusImporter('latin')
     importer.import_corpus('latin_text_lacus_curtius')
     readme = os.path.expanduser(os.path.join(
         '~/cltk_data/latin/text/latin_text_lacus_curtius/README.md'))
     self.assertTrue(os.path.isfile(readme))
예제 #10
0
파일: test_corpus.py 프로젝트: mcneela/cltk
 def test_import_greek_software_tlgu(self):
     """Import ``greek_software_tlgu`` and assert its README.md exists."""
     importer = CorpusImporter('greek')
     importer.import_corpus('greek_software_tlgu')
     readme = os.path.expanduser(os.path.join(
         '~/cltk_data/greek/software/greek_software_tlgu/README.md'))
     self.assertTrue(os.path.isfile(readme))
예제 #11
0
def pos_tagger_example_latin():
    """Import the Latin models, tag a sample sentence and print the tags."""
    importer = CorpusImporter('latin')
    importer.import_corpus('latin_models_cltk')

    sample = 'Gallia est omnis divisa in partes tres'
    print(pos.POSTag('latin').tag_ngram_123_backoff(sample))
예제 #12
0
파일: test_corpus.py 프로젝트: mcneela/cltk
 def test_import_greek_text_perseus(self):
     """Import ``greek_text_perseus`` and assert its README.md exists."""
     importer = CorpusImporter('greek')
     importer.import_corpus('greek_text_perseus')
     readme = os.path.expanduser(os.path.join(
         '~/cltk_data/greek/text/greek_text_perseus/README.md'))
     self.assertTrue(os.path.isfile(readme))
예제 #13
0
파일: test_corpus.py 프로젝트: mcneela/cltk
 def test_git_import_copt_script(self):
     """Import ``coptic_text_scriptorium`` and assert its README.md exists."""
     importer = CorpusImporter('coptic')
     importer.import_corpus('coptic_text_scriptorium')
     readme = os.path.expanduser(os.path.join(
         '~/cltk_data/coptic/text/coptic_text_scriptorium/README.md'))
     self.assertTrue(os.path.isfile(readme))
예제 #14
0
파일: test_corpus.py 프로젝트: neaGaze/cltk
 def test_import_proper_names_latin(self):
     """Import ``latin_proper_names_cltk`` and assert its README.md exists."""
     importer = CorpusImporter('latin')
     importer.import_corpus('latin_proper_names_cltk')
     readme = os.path.expanduser(os.path.join(
         '~/cltk_data/latin/lexicon/latin_proper_names_cltk/README.md'))
     self.assertTrue(os.path.isfile(readme))
예제 #15
0
파일: test_corpus.py 프로젝트: mcneela/cltk
 def test_import_proper_names_greek(self):
     """Import ``greek_proper_names_cltk`` and assert its README.md exists."""
     importer = CorpusImporter('greek')
     importer.import_corpus('greek_proper_names_cltk')
     readme = os.path.expanduser(os.path.join(
         '~/cltk_data/greek/lexicon/greek_proper_names_cltk/README.md'))
     self.assertTrue(os.path.isfile(readme))
예제 #16
0
파일: test_corpus.py 프로젝트: mcneela/cltk
 def test_git_import_tib_pos_tdc(self):
     """Import ``tibetan_pos_tdc`` and assert its README.md exists."""
     importer = CorpusImporter('tibetan')
     importer.import_corpus('tibetan_pos_tdc')
     readme = os.path.expanduser(
         os.path.join('~/cltk_data/tibetan/pos/tibetan_pos_tdc/README.md'))
     self.assertTrue(os.path.isfile(readme))
예제 #17
0
파일: test_corpus.py 프로젝트: mcneela/cltk
 def test_git_import_tib_lexica_tdc(self):
     """Import ``tibetan_lexica_tdc`` and assert its README.md exists."""
     importer = CorpusImporter('tibetan')
     importer.import_corpus('tibetan_lexica_tdc')
     readme = os.path.expanduser(os.path.join(
         '~/cltk_data/tibetan/lexicon/tibetan_lexica_tdc/README.md'))
     self.assertTrue(os.path.isfile(readme))
예제 #18
0
파일: test_corpus.py 프로젝트: mcneela/cltk
 def test_import_la_treebank_pers(self):
     """Import ``latin_treebank_perseus`` and assert its README.md exists."""
     importer = CorpusImporter('latin')
     importer.import_corpus('latin_treebank_perseus')
     readme = os.path.expanduser(os.path.join(
         '~/cltk_data/latin/treebank/latin_treebank_perseus/README.md'))
     self.assertTrue(os.path.isfile(readme))
예제 #19
0
파일: test_corpus.py 프로젝트: mcneela/cltk
 def test_import_latin_models_cltk(self):
     """Import ``latin_models_cltk`` and assert its README.md exists."""
     importer = CorpusImporter('latin')
     importer.import_corpus('latin_models_cltk')
     readme = os.path.expanduser(os.path.join(
         '~/cltk_data/latin/model/latin_models_cltk/README.md'))
     self.assertTrue(os.path.isfile(readme))
예제 #20
0
파일: test_corpus.py 프로젝트: mcneela/cltk
 def test_import_lat_text_lat_lib(self):
     """Import ``latin_text_latin_library`` and assert its README.md exists."""
     importer = CorpusImporter('latin')
     importer.import_corpus('latin_text_latin_library')
     readme = os.path.expanduser(os.path.join(
         '~/cltk_data/latin/text/latin_text_latin_library/README.md'))
     self.assertTrue(os.path.isfile(readme))
예제 #21
0
파일: test_corpus.py 프로젝트: mcneela/cltk
 def test_import_lat_pos_lemma_cltk(self):
     """Import ``latin_pos_lemmata_cltk`` and assert its README.md exists."""
     importer = CorpusImporter('latin')
     importer.import_corpus('latin_pos_lemmata_cltk')
     readme = os.path.expanduser(os.path.join(
         '~/cltk_data/latin/lemma/latin_pos_lemmata_cltk/README.md'))
     self.assertTrue(os.path.isfile(readme))
예제 #22
0
 def test_import_punjabi_punjabi_text_gurban(self):
     """Check ``punjabi_text_gurban`` is listed, import it, and find its README.md."""
     importer = CorpusImporter('punjabi')
     available = importer.list_corpora
     self.assertTrue('punjabi_text_gurban' in available)
     importer.import_corpus('punjabi_text_gurban')
     readme = os.path.expanduser(os.path.join(
         '~/cltk_data/punjabi/text/punjabi_text_gurban/README.md'))
     readme_present = os.path.isfile(readme)
     self.assertTrue(readme_present)
예제 #23
0
 def test_import_lat_text_lat_lib(self):
     """Import ``latin_text_latin_library`` and assert its README.md exists."""
     importer = CorpusImporter('latin')
     importer.import_corpus('latin_text_latin_library')
     readme = os.path.expanduser(os.path.join(
         '~/cltk_data/latin/text/latin_text_latin_library/README.md'))
     self.assertTrue(os.path.isfile(readme))
예제 #24
0
 def test_git_import_chinese_cbeta_txt(self):
     """Import ``chinese_text_cbeta_txt`` and assert its README.md exists."""
     importer = CorpusImporter('chinese')
     importer.import_corpus('chinese_text_cbeta_txt')
     readme = os.path.expanduser(os.path.join(
         '~/cltk_data/chinese/text/chinese_text_cbeta_txt/README.md'))
     self.assertTrue(os.path.isfile(readme))
예제 #25
0
파일: test_corpus.py 프로젝트: zfletch/cltk
 def test_import_latin_models_cltk(self):
     """Import ``latin_models_cltk`` and assert its README.md exists.

     The README path is built from ``get_cltk_data_dir()``.
     """
     importer = CorpusImporter("latin")
     importer.import_corpus("latin_models_cltk")
     readme = os.path.expanduser(os.path.join(
         get_cltk_data_dir() + "/latin/model/latin_models_cltk/README.md"))
     self.assertTrue(os.path.isfile(readme))
예제 #26
0
 def test_import_latin_models_cltk(self):
     """Import ``latin_models_cltk`` and assert its README.md exists."""
     importer = CorpusImporter('latin')
     importer.import_corpus('latin_models_cltk')
     readme = os.path.expanduser(os.path.join(
         '~/cltk_data/latin/model/latin_models_cltk/README.md'))
     self.assertTrue(os.path.isfile(readme))
예제 #27
0
 def test_import_greek_software_tlgu(self):
     """Import ``greek_software_tlgu`` and assert its README.md exists."""
     importer = CorpusImporter('greek')
     importer.import_corpus('greek_software_tlgu')
     readme = os.path.expanduser(os.path.join(
         '~/cltk_data/greek/software/greek_software_tlgu/README.md'))
     self.assertTrue(os.path.isfile(readme))
예제 #28
0
 def test_import_la_treebank_pers(self):
     """Import ``latin_treebank_perseus`` and assert its README.md exists."""
     importer = CorpusImporter('latin')
     importer.import_corpus('latin_treebank_perseus')
     readme = os.path.expanduser(os.path.join(
         '~/cltk_data/latin/treebank/latin_treebank_perseus/README.md'))
     self.assertTrue(os.path.isfile(readme))
예제 #29
0
 def test_import_greek_text_perseus(self):
     """Import ``greek_text_perseus`` and assert its README.md exists."""
     importer = CorpusImporter('greek')
     importer.import_corpus('greek_text_perseus')
     readme = os.path.expanduser(os.path.join(
         '~/cltk_data/greek/text/greek_text_perseus/README.md'))
     self.assertTrue(os.path.isfile(readme))
예제 #30
0
 def test_import_proper_names_greek(self):
     """Import ``greek_proper_names_cltk`` and assert its README.md exists."""
     importer = CorpusImporter('greek')
     importer.import_corpus('greek_proper_names_cltk')
     readme = os.path.expanduser(os.path.join(
         '~/cltk_data/greek/lexicon/greek_proper_names_cltk/README.md'))
     self.assertTrue(os.path.isfile(readme))
예제 #31
0
 def test_git_import_tib_pos_tdc(self):
     """Import ``tibetan_pos_tdc`` and assert its README.md exists."""
     importer = CorpusImporter('tibetan')
     importer.import_corpus('tibetan_pos_tdc')
     readme = os.path.expanduser(
         os.path.join('~/cltk_data/tibetan/pos/tibetan_pos_tdc/README.md'))
     self.assertTrue(os.path.isfile(readme))
예제 #32
0
 def test_import_lat_pos_lemma_cltk(self):
     """Import ``latin_pos_lemmata_cltk`` and assert its README.md exists."""
     importer = CorpusImporter('latin')
     importer.import_corpus('latin_pos_lemmata_cltk')
     readme = os.path.expanduser(os.path.join(
         '~/cltk_data/latin/lemma/latin_pos_lemmata_cltk/README.md'))
     self.assertTrue(os.path.isfile(readme))
예제 #33
0
 def test_git_import_tib_lexica_tdc(self):
     """Import ``tibetan_lexica_tdc`` and assert its README.md exists."""
     importer = CorpusImporter('tibetan')
     importer.import_corpus('tibetan_lexica_tdc')
     readme = os.path.expanduser(os.path.join(
         '~/cltk_data/tibetan/lexicon/tibetan_lexica_tdc/README.md'))
     self.assertTrue(os.path.isfile(readme))
예제 #34
0
 def test_import_latin_text_antique_digiliblt(self):
     """Import ``latin_text_antique_digiliblt`` and assert its README.md exists."""
     importer = CorpusImporter('latin')
     importer.import_corpus('latin_text_antique_digiliblt')
     readme = os.path.expanduser(os.path.join(
         '~/cltk_data/latin/text/latin_text_antique_digiliblt/README.md'))
     self.assertTrue(os.path.isfile(readme))
예제 #35
0
 def test_git_import_copt_script(self):
     """Import ``coptic_text_scriptorium`` and assert its README.md exists."""
     importer = CorpusImporter('coptic')
     importer.import_corpus('coptic_text_scriptorium')
     readme = os.path.expanduser(os.path.join(
         '~/cltk_data/coptic/text/coptic_text_scriptorium/README.md'))
     self.assertTrue(os.path.isfile(readme))
예제 #36
0
 def setUpClass(cls):
     """Download the corpora needed by this test class.

     Imports ``latin_text_latin_library`` and ``latin_text_perseus`` for
     Latin, then ``greek_text_perseus`` for Greek.

     Raises:
         Exception: If any import fails. The underlying error is chained
             as ``__cause__`` so the real reason stays visible.
     """
     try:
         latin_importer = CorpusImporter('latin')
         latin_importer.import_corpus('latin_text_latin_library')
         latin_importer.import_corpus('latin_text_perseus')
         greek_importer = CorpusImporter('greek')
         greek_importer.import_corpus('greek_text_perseus')
     except Exception as exc:
         # A bare ``except:`` would also trap KeyboardInterrupt/SystemExit
         # and throw away the original traceback.
         raise Exception('Failure to download test corpus') from exc
예제 #37
0
 def test_import_la_text_lac_curt(self):
     """Import ``latin_text_lacus_curtius`` and assert its README.md exists."""
     importer = CorpusImporter('latin')
     importer.import_corpus('latin_text_lacus_curtius')
     readme = os.path.expanduser(os.path.join(
         '~/cltk_data/latin/text/latin_text_lacus_curtius/README.md'))
     self.assertTrue(os.path.isfile(readme))
예제 #38
0
 def test_import_latin_library_corpus_reader(self):
     """Import the Latin Library corpus and check the reader's file count.

     The reader must list more than 2100 file ids.
     """
     importer = CorpusImporter("latin")
     importer.import_corpus("latin_text_latin_library")
     latin_reader = get_corpus_reader(
         corpus_name="latin_text_latin_library", language="latin")
     file_count = len(list(latin_reader.fileids()))
     self.assertTrue(file_count > 2100)
예제 #39
0
파일: test_stem.py 프로젝트: vierth/cltk
 def setUp(self):
     """Import ``sanskrit_models_cltk`` and assert that its README.md exists.

     NOTE(review): earlier documentation says these models carry CSV files
     needed by the Indian-language tokenizers; not verifiable from here.
     """
     importer = CorpusImporter('sanskrit')
     importer.import_corpus('sanskrit_models_cltk')
     readme = os.path.expanduser(os.path.join(
         '~/cltk_data/sanskrit/model/sanskrit_models_cltk/README.md'))
     self.assertTrue(os.path.isfile(readme))
예제 #40
0
파일: test_corpus.py 프로젝트: mcneela/cltk
 def test_import_greek_models_cltk(self):
     """Import ``greek_models_cltk`` and assert its README.md exists.

     NOTE(review): earlier documentation says the corpus was already
     imported in ``setUp()``, so this would update rather than clone;
     that setup is not visible here.
     """
     importer = CorpusImporter('greek')
     importer.import_corpus('greek_models_cltk')
     readme = os.path.expanduser(os.path.join(
         '~/cltk_data/greek/model/greek_models_cltk/README.md'))
     self.assertTrue(os.path.isfile(readme))
예제 #41
0
 def setUp(self):
     """Import ``middle_high_german_models_cltk`` and assert its README.md exists.

     The README path is built from ``get_cltk_data_dir()``.
     """
     importer = CorpusImporter("middle_high_german")
     importer.import_corpus("middle_high_german_models_cltk")
     readme_suffix = '/middle_high_german/model/middle_high_german_models_cltk/README.md'
     readme = os.path.expanduser(
         os.path.join(get_cltk_data_dir() + readme_suffix))
     self.assertTrue(os.path.isfile(readme))
예제 #42
0
파일: test.py 프로젝트: decretist/Sand
def main():
    """Print the Latin corpora list, then lemmatize and print a sample sentence.

    The Latin models are imported before the lower-cased sentence is passed
    to ``LemmaReplacer('latin').lemmatize``.
    """
    importer = CorpusImporter('latin')
    print(importer.list_corpora)
    importer.import_corpus('latin_models_cltk')
    text = 'Aeneadum genetrix, hominum divomque voluptas, alma Venus, caeli subter labentia signa quae mare navigerum, quae terras frugiferentis concelebras, per te quoniam genus omne animantum concipitur visitque exortum lumina solis.'.lower()
    lemmas = LemmaReplacer('latin').lemmatize(text)
    print(lemmas)
예제 #43
0
 def test_import_greek_models_cltk(self):
     """Import ``greek_models_cltk`` and assert its README.md exists.

     NOTE(review): earlier documentation says the corpus was already
     imported in ``setUp()``, so this would update rather than clone;
     that setup is not visible here.
     """
     importer = CorpusImporter('greek')
     importer.import_corpus('greek_models_cltk')
     readme = os.path.expanduser(os.path.join(
         '~/cltk_data/greek/model/greek_models_cltk/README.md'))
     self.assertTrue(os.path.isfile(readme))
예제 #44
0
 def test_import_lat_text_lat_lib(self):
     """Import ``latin_text_latin_library`` and assert its README.md exists.

     The README path is built from ``get_cltk_data_dir()``.
     """
     importer = CorpusImporter("latin")
     importer.import_corpus("latin_text_latin_library")
     readme = os.path.expanduser(os.path.join(
         get_cltk_data_dir() + "/latin/text/latin_text_latin_library/README.md"))
     self.assertTrue(os.path.isfile(readme))
예제 #45
0
 def test_import_greek_software_tlgu(self):
     """Import ``greek_software_tlgu`` and assert its README.md exists.

     The README path is built from ``get_cltk_data_dir()``.
     """
     importer = CorpusImporter("greek")
     importer.import_corpus("greek_software_tlgu")
     readme = os.path.expanduser(os.path.join(
         get_cltk_data_dir() + "/greek/software/greek_software_tlgu/README.md"))
     self.assertTrue(os.path.isfile(readme))
예제 #46
0
 def setUp(self):
     """Import ``sanskrit_models_cltk`` and assert that its README.md exists.

     NOTE(review): earlier documentation says these models carry CSV files
     needed by the Indian-language tokenizers; not verifiable from here.
     """
     importer = CorpusImporter('sanskrit')
     importer.import_corpus('sanskrit_models_cltk')
     readme = os.path.expanduser(os.path.join(
         '~/cltk_data/sanskrit/model/sanskrit_models_cltk/README.md'))
     self.assertTrue(os.path.isfile(readme))
예제 #47
0
    def setUp(self):
        """Ensure ``latin_text_perseus`` is present, then create the test client.

        The corpus is imported only when its README.md is missing; after
        such an import the README must exist.
        """
        readme = os.path.expanduser(
            os.path.join('~/cltk_data/latin/text/latin_text_perseus/README.md'))
        already_present = os.path.isfile(readme)
        if not already_present:
            importer = CorpusImporter('latin')
            importer.import_corpus('latin_text_perseus')
            self.assertTrue(os.path.isfile(readme))

        self.app = api_json.app.test_client()
예제 #48
0
파일: ner.py 프로젝트: Akirato/cltk
def _check_latest_data(lang):
    """Import ``<lang>_models_cltk`` when the NER file for ``lang`` is missing.

    Args:
        lang: Language key; must be one of the keys of ``NER_DICT``.

    Raises:
        AssertionError: If ``lang`` is not a key of ``NER_DICT``.
    """
    assert lang in NER_DICT.keys(), \
        'Invalid language. Choose from: {}'.format(', '.join(NER_DICT.keys()))

    expected_file = os.path.expanduser(NER_DICT[lang])
    if os.path.isfile(expected_file):
        # The file is already on disk; nothing to import.
        return

    importer = CorpusImporter(lang)
    importer.import_corpus('{}_models_cltk'.format(lang))
    def setUp(self):
        """Import ``sanskrit_models_cltk`` unless its directory already exists.

        NOTE(review): earlier documentation ties this to the Indic NLP
        library and its Morfessor files; not verifiable from here.
        """
        models_dir = os.path.expanduser('~/cltk_data/sanskrit/model/sanskrit_models_cltk')
        if os.path.isdir(models_dir):
            return

        # Imported lazily, only when a download is actually required.
        from cltk.corpus.utils.importer import CorpusImporter
        importer = CorpusImporter('sanskrit')
        importer.import_corpus('sanskrit_models_cltk')
예제 #50
0
파일: tlgu.py 프로젝트: eamonnbell/cltk
 def _check_import_source():
     """Import the TLGU sources when ``tlgu.h`` is not present locally.

     Checks for
     ``~/cltk_data/greek/software/greek_software_tlgu/tlgu.h`` and, when
     it is missing, imports the ``greek_software_tlgu`` corpus.

     Raises:
         Exception: Whatever the import raised, re-raised after logging.
     """
     header = os.path.expanduser(
         '~/cltk_data/greek/software/greek_software_tlgu/tlgu.h')
     if os.path.isfile(header):
         return
     try:
         corpus_importer = CorpusImporter('greek')
         # The corpus name must match the ``greek_software_tlgu`` directory
         # checked above; the previous name ``'tlgu'`` did not.
         corpus_importer.import_corpus('greek_software_tlgu')
     except Exception as exc:
         logger.error('Failed to import TLGU: %s', exc)
         raise
예제 #51
0
파일: ner.py 프로젝트: BrillForward/cltk
def _check_latest_data(lang):
    """Import ``<lang>_models_cltk`` when the NER file for ``lang`` is missing.

    Args:
        lang: Language key; must be one of the keys of ``NER_DICT``.

    Raises:
        AssertionError: If ``lang`` is not a key of ``NER_DICT``.
    """
    assert lang in NER_DICT.keys(), \
        'Invalid language. Choose from: {}'.format(', '.join(NER_DICT.keys()))

    expected_file = os.path.expanduser(NER_DICT[lang])
    if os.path.isfile(expected_file):
        # The file is already on disk; nothing to import.
        return

    importer = CorpusImporter(lang)
    importer.import_corpus('{}_models_cltk'.format(lang))
예제 #52
0
 def _check_import_source():
     """Import the TLGU sources when ``tlgu.h`` is not present locally.

     Checks for
     ``~/cltk_data/greek/software/greek_software_tlgu/tlgu.h`` and, when
     it is missing, imports the ``greek_software_tlgu`` corpus.

     Raises:
         Exception: Whatever the import raised, re-raised after logging.
     """
     header = os.path.expanduser(
         '~/cltk_data/greek/software/greek_software_tlgu/tlgu.h')
     if os.path.isfile(header):
         return
     try:
         corpus_importer = CorpusImporter('greek')
         # The corpus name must match the ``greek_software_tlgu`` directory
         # checked above; the previous name ``'tlgu'`` did not.
         corpus_importer.import_corpus('greek_software_tlgu')
     except Exception as exc:
         logger.error('Failed to import TLGU: %s', exc)
         raise
예제 #53
0
    def setUp(self):
        """Ensure ``latin_text_perseus`` is present, then create the test client.

        The corpus is imported only when its README.md is missing; after
        such an import the README must exist.
        """
        readme = os.path.expanduser(
            os.path.join('~/cltk_data/latin/text/latin_text_perseus/README.md'))
        already_present = os.path.isfile(readme)
        if not already_present:
            importer = CorpusImporter('latin')
            importer.import_corpus('latin_text_perseus')
            self.assertTrue(os.path.isfile(readme))

        self.app = api_json.app.test_client()
예제 #54
0
 def setUpClass(cls):
     """Download the Latin Library corpus and build the shared readers.

     ``cls.reader`` is restricted to the single file ``pervig.txt``;
     ``reader_2`` to ``reader_4`` are separate, unrestricted instances.

     Raises:
         Exception: If the corpus import fails. The underlying error is
             chained as ``__cause__``.
     """
     try:
         corpus_importer = CorpusImporter('latin')
         corpus_importer.import_corpus('latin_text_latin_library')
     except Exception as exc:
         # A bare ``except:`` would also trap KeyboardInterrupt/SystemExit
         # and throw away the original traceback.
         raise Exception('Failure to download test corpus') from exc
     cls.reader = get_corpus_reader(language='latin', corpus_name='latin_text_latin_library')
     cls.reader._fileids = ['pervig.txt']
     # Additional instances are needed because the tests change reader
     # internals. TODO: remove once that is fixed.
     cls.reader_2 = get_corpus_reader(language='latin', corpus_name='latin_text_latin_library')
     cls.reader_3 = get_corpus_reader(language='latin', corpus_name='latin_text_latin_library')
     cls.reader_4 = get_corpus_reader(language='latin', corpus_name='latin_text_latin_library')
예제 #55
0
 def setUpClass(cls):
     """Download the corpora and NLTK resources needed by this test class.

     Imports two Latin and two Greek text corpora, then downloads the
     NLTK ``punkt`` and ``averaged_perceptron_tagger`` resources.

     Raises:
         Exception: If any download fails. The underlying error is
             chained as ``__cause__``.
     """
     try:
         latin_importer = CorpusImporter('latin')
         latin_importer.import_corpus('latin_text_latin_library')
         latin_importer.import_corpus('latin_text_perseus')
         greek_importer = CorpusImporter('greek')
         greek_importer.import_corpus('greek_text_perseus')
         greek_importer.import_corpus('greek_text_tesserae')
         nltk.download('punkt')
         nltk.download('averaged_perceptron_tagger')
     except Exception as exc:
         # A bare ``except:`` would also trap KeyboardInterrupt/SystemExit
         # and throw away the original traceback.
         raise Exception('Failure to download test corpus') from exc
def gen_docs(corpus, lemmatize, rm_stops=False):
    """Preprocess the Ramayana Sanskrit texts into one training text file.

    Imports the ``sanskrit_parallel_gitasupersite`` corpus when its
    directory is missing under ``~/cltk_data``, creates the
    ``ramayana/word2vec_models`` directory when needed, and passes every
    ``*_sanskrit.txt`` file of the ``ramayana`` folder through ``read_doc``,
    appending to ``word2vec_models/ramayana_sanskrit.txt``.

    Args:
        corpus: Corpus identifier; only ``'ramayana'`` is accepted.
        lemmatize: Kept for interface compatibility; not used by this function.
        rm_stops: Forwarded unchanged to ``read_doc``.

    Returns:
        ``True`` after all matching files have been processed.

    Raises:
        AssertionError: If ``corpus`` is not ``'ramayana'``.
    """
    assert corpus in ['ramayana']

    home = os.path.expanduser('~')

    # Import the corpus only when its directory is missing.
    corpus_dir = os.path.join(
        home,
        'cltk_data/sanskrit/parallel/sanskrit_parallel_gitasupersite/')
    if not os.path.exists(corpus_dir):
        print('Importing \'sanskrit_parallel_gitasupersite\'...')
        c = CorpusImporter('sanskrit')
        c.import_corpus('sanskrit_parallel_gitasupersite')
    else:
        print('Not importing corpora...')

    # Directory that receives the preprocessed text.
    models_dir = os.path.join(
        home,
        'cltk_data/sanskrit/parallel/sanskrit_parallel_gitasupersite/ramayana/word2vec_models'
    )
    if not os.path.exists(models_dir):
        os.makedirs(models_dir)

    # Trailing slash is kept so file names can be appended directly.
    path = os.path.join(
        home,
        'cltk_data/sanskrit/parallel/sanskrit_parallel_gitasupersite/ramayana/'
    )
    target = path + 'word2vec_models/ramayana_sanskrit.txt'

    for name in os.listdir(path):
        if not name.endswith('_sanskrit.txt'):
            continue

        source = path + name
        # Context managers close both handles even when read_doc raises.
        with open(source) as f:
            print('Reading ' + source)
            # Mode 'a' creates the file when it is missing and appends
            # otherwise, which is what the former 'w'/'a' switch did.
            with open(target, 'a') as f2:
                print('and storing in ' + target + '\n')
                read_doc(f, f2, rm_stops)
    return True
예제 #57
0
    def test_latin_library_reader_missing_corpus(self):
        """Check that importing the Latin Library reader raises ``OSError``.

        Only ``latin_models_cltk`` is imported beforehand; the Latin Library
        text corpus is deliberately left out, and ``builtins.input`` is
        patched to answer ``'n'`` during the reader import.

        Needs to precede (for now) the next two tests, which load the corpus.
        Provided by Patrick Burns.
        """
        corpus_importer = CorpusImporter('latin')
        # corpus_importer.import_corpus('latin_text_latin_library')
        corpus_importer.import_corpus('latin_models_cltk')

        def _import():
            with patch('builtins.input', return_value='n'):
                from cltk.corpus.readers import latinlibrary

        # The assertion has to sit outside ``_import``. Nested inside it,
        # the helper was never called and the test checked nothing.
        self.assertRaises(OSError, _import)
예제 #58
0
 def setUpClass(self):
     """Import the Greek and Latin CLTK model corpora and store sample texts.

     Sets ``greek_text`` and ``latin_text`` on the object received as the
     first argument.
     """
     # NOTE(review): the first parameter is named ``self``; if this method is
     # decorated as a classmethod (decorator not visible here) it is the class.
     corpus_importer = CorpusImporter('greek')
     corpus_importer.import_corpus('greek_models_cltk')
     corpus_importer = CorpusImporter('latin')
     corpus_importer.import_corpus('latin_models_cltk')
     # Sample Greek passage shared by the tests.
     self.greek_text = """ὅλως δ’ ἀντεχόμενοί τινες, ὡς οἴονται, δικαίου τινός (ὁ γὰρ νόμος δίκαιόν τἰ τὴν κατὰ πόλεμον δουλείαν τιθέασι δικαίαν, ἅμα δ’ οὔ φασιν· τήν τε γὰρ ἀρχὴν ἐνδέχεται μὴ δικαίαν εἶναι τῶν πολέμων, καὶ τὸν ἀνάξιον δουλεύειν οὐδαμῶς ἂν φαίη τις δοῦλον εἶναι· εἰ δὲ μή, συμβήσεται τοὺς εὐγενεστάτους εἶναι δοκοῦντας δούλους εἶναι καὶ ἐκ δούλων, ἐὰν συμβῇ πραθῆναι ληφθέντας."""  # pylint: disable=line-too-long
     # Sample Latin passage shared by the tests.
     self.latin_text = "O di inmortales! ubinam gentium sumus? in qua urbe vivimus? quam rem publicam habemus? Hic, hic sunt in nostro numero, patres conscripti, in hoc orbis terrae sanctissimo gravissimoque consilio, qui de nostro omnium interitu, qui de huius urbis atque adeo de orbis terrarum exitio cogitent! Hos ego video consul et de re publica sententiam rogo et, quos ferro trucidari oportebat, eos nondum voce volnero! Fuisti igitur apud Laecam illa nocte, Catilina, distribuisti partes Italiae, statuisti, quo quemque proficisci placeret, delegisti, quos Romae relinqueres, quos tecum educeres, discripsisti urbis partes ad incendia, confirmasti te ipsum iam esse exiturum, dixisti paulum tibi esse etiam nunc morae, quod ego viverem."  # pylint: disable=line-too-long
예제 #59
0
 def setUpClass(self):
     """Download the Sanskrit and Greek model corpora used by this test class.

     Raises:
         Exception: If any import fails. The underlying error is chained
             as ``__cause__``.
     """
     try:
         sanskrit_importer = CorpusImporter('sanskrit')
         sanskrit_importer.import_corpus('sanskrit_models_cltk')
         greek_importer = CorpusImporter('greek')
         greek_importer.import_corpus('greek_models_cltk')
     except Exception as exc:
         # A bare ``except:`` would also trap KeyboardInterrupt/SystemExit
         # and throw away the original traceback.
         raise Exception('Failure to download test corpus') from exc
예제 #60
0
 def setUpClass(cls):
     """Download the Latin Library corpus and build the shared readers.

     ``cls.reader`` is restricted to the single file ``pervig.txt``;
     ``reader_2`` to ``reader_4`` are separate, unrestricted instances.

     Raises:
         Exception: If the corpus import fails. The underlying error is
             chained as ``__cause__``.
     """
     try:
         corpus_importer = CorpusImporter('latin')
         corpus_importer.import_corpus('latin_text_latin_library')
     except Exception as exc:
         # A bare ``except:`` would also trap KeyboardInterrupt/SystemExit
         # and throw away the original traceback.
         raise Exception('Failure to download test corpus') from exc
     cls.reader = get_corpus_reader(language='latin',
                                    corpus_name='latin_text_latin_library')
     cls.reader._fileids = ['pervig.txt']
     # Additional instances are needed because the tests change reader
     # internals. TODO: remove once that is fixed.
     cls.reader_2 = get_corpus_reader(
         language='latin', corpus_name='latin_text_latin_library')
     cls.reader_3 = get_corpus_reader(
         language='latin', corpus_name='latin_text_latin_library')
     cls.reader_4 = get_corpus_reader(
         language='latin', corpus_name='latin_text_latin_library')