コード例 #1
0
 def setUpClass(self):
     """Import the Greek and Latin CLTK model corpora and attach sample texts.

     The Greek models are imported first, then the Latin ones. Afterwards a
     Greek passage is stored as ``self.greek_text`` and a Latin passage as
     ``self.latin_text``.
     """
     for language, models_name in (('greek', 'greek_models_cltk'),
                                   ('latin', 'latin_models_cltk')):
         CorpusImporter(language).import_corpus(models_name)
     self.greek_text = """ὅλως δ’ ἀντεχόμενοί τινες, ὡς οἴονται, δικαίου τινός (ὁ γὰρ νόμος δίκαιόν τἰ τὴν κατὰ πόλεμον δουλείαν τιθέασι δικαίαν, ἅμα δ’ οὔ φασιν· τήν τε γὰρ ἀρχὴν ἐνδέχεται μὴ δικαίαν εἶναι τῶν πολέμων, καὶ τὸν ἀνάξιον δουλεύειν οὐδαμῶς ἂν φαίη τις δοῦλον εἶναι· εἰ δὲ μή, συμβήσεται τοὺς εὐγενεστάτους εἶναι δοκοῦντας δούλους εἶναι καὶ ἐκ δούλων, ἐὰν συμβῇ πραθῆναι ληφθέντας."""  # pylint: disable=line-too-long
     self.latin_text = "O di inmortales! ubinam gentium sumus? in qua urbe vivimus? quam rem publicam habemus? Hic, hic sunt in nostro numero, patres conscripti, in hoc orbis terrae sanctissimo gravissimoque consilio, qui de nostro omnium interitu, qui de huius urbis atque adeo de orbis terrarum exitio cogitent! Hos ego video consul et de re publica sententiam rogo et, quos ferro trucidari oportebat, eos nondum voce volnero! Fuisti igitur apud Laecam illa nocte, Catilina, distribuisti partes Italiae, statuisti, quo quemque proficisci placeret, delegisti, quos Romae relinqueres, quos tecum educeres, discripsisti urbis partes ad incendia, confirmasti te ipsum iam esse exiturum, dixisti paulum tibi esse etiam nunc morae, quod ego viverem."  # pylint: disable=line-too-long
コード例 #2
0
ファイル: test_tag.py プロジェクト: willismonroe/cltk
    def setUp(self):
        """Import the Greek, Latin, French and Old Norse CLTK data sets.

        After each import, assert that the corresponding ``README.md`` exists
        under ``~/cltk_data``; the first missing file fails the set-up.
        """
        # (language, corpus name, README path relative to the home directory)
        required = (
            ('greek', 'greek_models_cltk',
             '~/cltk_data/greek/model/greek_models_cltk/README.md'),
            ('latin', 'latin_models_cltk',
             '~/cltk_data/latin/model/latin_models_cltk/README.md'),
            ('french', 'french_data_cltk',
             '~/cltk_data/french/text/french_data_cltk/README.md'),
            ("old_norse", "old_norse_models_cltk",
             '~/cltk_data/old_norse/model/old_norse_models_cltk/README.md'),
        )
        for language, corpus_name, readme_rel in required:
            CorpusImporter(language).import_corpus(corpus_name)
            readme_path = os.path.expanduser(readme_rel)
            self.assertTrue(os.path.isfile(readme_path))
コード例 #3
0
 def setUpClass(self):
     """Import the Sanskrit and Greek CLTK model corpora needed by the tests.

     Raises:
         Exception: if either import fails. The underlying error is chained
             as ``__cause__`` so the real reason stays visible in the
             traceback.
     """
     try:
         corpus_importer = CorpusImporter('sanskrit')
         corpus_importer.import_corpus('sanskrit_models_cltk')
         corpus_importer = CorpusImporter('greek')
         corpus_importer.import_corpus('greek_models_cltk')
     except Exception as exc:
         # Catch Exception rather than everything, so that Ctrl-C and
         # SystemExit are not turned into a "download failure".
         raise Exception('Failure to download test corpus') from exc
コード例 #4
0
 def setUpClass(cls):
     """Import the Latin and Greek text corpora needed by the tests.

     Imports ``latin_text_latin_library`` and ``latin_text_perseus`` for
     Latin, then ``greek_text_perseus`` for Greek.

     Raises:
         Exception: if any import fails. The underlying error is chained as
             ``__cause__`` so the real reason stays visible in the traceback.
     """
     try:
         corpus_importer = CorpusImporter('latin')
         corpus_importer.import_corpus('latin_text_latin_library')
         corpus_importer.import_corpus('latin_text_perseus')
         corpus_importer = CorpusImporter('greek')
         corpus_importer.import_corpus('greek_text_perseus')
     except Exception as exc:
         # Catch Exception rather than everything, so that Ctrl-C and
         # SystemExit are not turned into a "download failure".
         raise Exception('Failure to download test corpus') from exc
コード例 #5
0
 def setUpClass(self):
     """Import the Latin/Greek text corpora and the NLTK data used by the tests.

     Imports ``latin_text_latin_library`` and ``latin_text_perseus`` for
     Latin, ``greek_text_perseus`` and ``greek_text_tesserae`` for Greek, and
     then requests the ``punkt`` and ``averaged_perceptron_tagger`` packages
     through ``nltk.download``.

     Raises:
         Exception: if any step raises. The underlying error is chained as
             ``__cause__`` so the real reason stays visible in the traceback.
     """
     try:
         corpus_importer = CorpusImporter("latin")
         corpus_importer.import_corpus("latin_text_latin_library")
         corpus_importer.import_corpus("latin_text_perseus")
         corpus_importer = CorpusImporter("greek")
         corpus_importer.import_corpus("greek_text_perseus")
         corpus_importer.import_corpus("greek_text_tesserae")
         nltk.download("punkt")
         nltk.download("averaged_perceptron_tagger")
     except Exception as exc:
         # Catch Exception rather than everything, so that Ctrl-C and
         # SystemExit are not turned into a "download failure".
         raise Exception("Failure to download test corpus") from exc
コード例 #6
0
 def setUpClass(cls):
     """Import the Latin/Greek text corpora and the NLTK data used by the tests.

     Imports ``latin_text_latin_library`` and ``latin_text_perseus`` for
     Latin, ``greek_text_perseus`` and ``greek_text_tesserae`` for Greek, and
     then requests the ``punkt`` and ``averaged_perceptron_tagger`` packages
     through ``nltk.download``.

     Raises:
         Exception: if any step raises. The underlying error is chained as
             ``__cause__`` so the real reason stays visible in the traceback.
     """
     try:
         corpus_importer = CorpusImporter('latin')
         corpus_importer.import_corpus('latin_text_latin_library')
         corpus_importer.import_corpus('latin_text_perseus')
         corpus_importer = CorpusImporter('greek')
         corpus_importer.import_corpus('greek_text_perseus')
         corpus_importer.import_corpus('greek_text_tesserae')
         nltk.download('punkt')
         nltk.download('averaged_perceptron_tagger')
     except Exception as exc:
         # Catch Exception rather than everything, so that Ctrl-C and
         # SystemExit are not turned into a "download failure".
         raise Exception('Failure to download test corpus') from exc
コード例 #7
0
def main():
    """Import the Latin and Greek Perseus text corpora when they are absent.

    For each (language, corpus) pair, ``check_corpus_presence`` is consulted
    first; the corpus is imported only when that check is falsy.
    """
    wanted = (
        ('latin', 'latin_text_perseus'),
        ('greek', 'greek_text_perseus'),
    )
    for lang, corpus in wanted:
        if check_corpus_presence(lang, corpus):
            continue
        CorpusImporter(lang).import_corpus(corpus)
コード例 #8
0
 def setUp(self):
     """Import the Old Norse models and assert their README.md is on disk.

     The README path is built from ``get_cltk_data_dir()``.
     """
     CorpusImporter("old_norse").import_corpus("old_norse_models_cltk")
     readme_path = os.path.expanduser(
         get_cltk_data_dir() + '/old_norse/model/old_norse_models_cltk/README.md')
     self.assertTrue(os.path.isfile(readme_path))
コード例 #9
0
def _install(lang, lst):
    """Import every corpus in ``lst`` whose ``'location'`` is ``'remote'``.

    Args:
        lang: Language passed to ``CorpusImporter``.
        lst: Iterable of mappings with at least ``'location'`` and ``'name'``
            keys; entries with any other location are skipped.

    A progress line is printed for the language and for each imported corpus.
    """
    print("Downloading %s " % (lang))
    importer = CorpusImporter(lang)
    for entry in lst:
        if entry['location'] != 'remote':
            continue
        corpus_name = entry['name']
        print("    Downloading %s " % (corpus_name))
        importer.import_corpus(corpus_name)
コード例 #10
0
ファイル: lapos.py プロジェクト: saikswaroop/cltk
 def _is_cloned_get_make(self):
     """Ensure the Lapos tagger is present, cloning and building it if needed.

     Presence is judged by the Lapos ``README.md`` under
     ``~/cltk_data/multilingual/software/lapos``. If it is missing, the
     ``lapos`` corpus is imported from the ``apple`` branch and ``self.make()``
     is run.

     TODO: Add check for Windows and Linux as they are added.
     TODO: This could be 3 functions.

     Returns:
         True when Lapos was already present or was cloned successfully.

     Raises:
         OSError: if ``self.operating_system`` is anything other than ``'mac'``.
         CorpusImportError: if the README is still missing after the import.
     """
     if self.operating_system != 'mac':
         # branch = 'master'
         raise OSError(
             'Lapos for Linux/Windows not currently available through CLTK. Please file issue if you can fix it.'
         )
     branch = 'apple'

     readme_path = os.path.expanduser(
         '~/cltk_data/multilingual/software/lapos/README.md')
     if os.path.isfile(readme_path):
         return True

     CorpusImporter('multilingual').import_corpus('lapos', branch=branch)

     if not os.path.isfile(readme_path):
         logger.error(
             "Something went wrong with importing the Lapos tagger on the '{}' branch."
             .format(branch))
         raise CorpusImportError

     print('Cloned Lapos successfully.')
     self.make()
     return True
コード例 #11
0
ファイル: test_corpus.py プロジェクト: ykl7/cltk
 def test_import_nonexistant_corpus(self):
     """Test that importing an unknown corpus name for Greek raises
        ``CorpusImportError``.
     """
     with self.assertRaises(CorpusImportError):
         # Construction stays inside the context manager, as in the original,
         # so an error raised by either call satisfies the assertion.
         greek_importer = CorpusImporter('greek')
         greek_importer.import_corpus('euclids_book_of_recipes')
コード例 #12
0
ファイル: cltk.py プロジェクト: thePortus/dhelp
    def setup(self):
        """Download CLTK packages and trainer corpora.

        Installs the ``cltk`` package with pip when it cannot be found, then
        tries to import every corpus that ``CorpusImporter`` lists for the
        language in ``self.options['language']``. The language
        ``'ancient greek'`` is mapped to ``'greek'`` first. A corpus whose
        import raises is reported on stdout and skipped.

        Returns:
            True, once every listed corpus has been attempted.

        Raises:
            ImportError: if ``cltk`` is still not importable after the
                install attempt.

        Example:
            >>> LatinText('').setup()
        """
        # Local imports keep this block self-contained; they are only needed
        # for the on-demand installation below.
        import importlib.util
        import subprocess
        import sys

        # check if cltk is already installed, if not, install it
        # (importlib.find_loader is deprecated and pip.main is no longer a
        # public API; pip is meant to be run as a subprocess of the current
        # interpreter.)
        if importlib.util.find_spec('cltk') is None:
            # A failed install is not raised here; it surfaces as the
            # ImportError on the import just below, as before.
            subprocess.call([sys.executable, '-m', 'pip', 'install', 'cltk'])
            # Make the freshly installed package visible to the import system.
            importlib.invalidate_caches()
        # include cltk inline
        from cltk.corpus.utils.importer import CorpusImporter
        setup_language = self.options['language']
        # for ancient greek, change to 'greek' for purposes of cltk setup
        if setup_language == 'ancient greek':
            setup_language = 'greek'
        corpus_importer = CorpusImporter(setup_language)
        # loop through, attempt to download, skip any errors
        for cltk_corpus in corpus_importer.list_corpora:
            print('Downloading', cltk_corpus)
            try:
                corpus_importer.import_corpus(cltk_corpus)
            except Exception:
                # Best effort: keep going, but let Ctrl-C / SystemExit through.
                print('Problem downloading', cltk_corpus, '(skipping)')
        return True
コード例 #13
0
 def setUp(self):
     """Import the Old English models and assert their README.md is on disk."""
     CorpusImporter("old_english").import_corpus("old_english_models_cltk")
     readme_path = os.path.expanduser(
         '~/cltk_data/old_english/model/old_english_models_cltk/README.md')
     self.assertTrue(os.path.isfile(readme_path))
コード例 #14
0
def get_src_header_and_transcriber(source: str) -> Tuple[str, G2P_func]:
    """Return the output-CSV column name and a transcriber for the source language.

    For ``source == 'lat'`` the result is ``('Latin', src_func)``, where
    ``src_func`` is a memoised wrapper around a Classical/Allen ``Transcriber``.
    If building the transcriber raises ``FileNotFoundError``, the
    ``latin_models_cltk`` corpus is imported and construction is retried once.
    Any other ``source`` yields ``('Proto-Germanic', PGmc_ipa_trans)``.
    """
    if source != 'lat':
        return 'Proto-Germanic', PGmc_ipa_trans

    def _build_transcriber():
        return Transcriber(dialect="Classical", reconstruction="Allen")

    try:
        src_transcriber = _build_transcriber()
    except FileNotFoundError:
        print(
            "Did not have the corpus `latin_models_cltk`, downloading it now"
        )
        from cltk.corpus.utils.importer import CorpusImporter
        CorpusImporter('latin').import_corpus('latin_models_cltk')
        src_transcriber = _build_transcriber()

    @lru_cache(maxsize=None)
    def src_func(token):
        # Retry without syllabification when the default call hits IndexError.
        try:
            transcription = src_transcriber.transcribe(token)
        except IndexError:
            transcription = src_transcriber.transcribe(token, syllabify=False)
        transcription = transcription.strip('[]')
        # Some weird cases of failed macronization.
        return re.sub(r'(.)_', r'\1ː', transcription)

    return 'Latin', src_func
コード例 #15
0
 def test_import_latin_library_corpus_reader(self):
     """Import the Latin Library corpus and check that its corpus reader
     lists more than 2100 file ids.
     """
     CorpusImporter('latin').import_corpus('latin_text_latin_library')
     latin_reader = get_corpus_reader(
         language='latin', corpus_name='latin_text_latin_library')
     file_ids = list(latin_reader.fileids())
     self.assertTrue(len(file_ids) > 2100)
コード例 #16
0
 def test_import_lat_pos_lemma_cltk(self):
     """Import ``latin_pos_lemmata_cltk`` and assert its README.md exists."""
     CorpusImporter('latin').import_corpus('latin_pos_lemmata_cltk')
     readme_path = os.path.expanduser(
         '~/cltk_data/latin/lemma/latin_pos_lemmata_cltk/README.md')
     self.assertTrue(os.path.isfile(readme_path))
コード例 #17
0
def setup():
    """Import every Latin corpus listed by ``CorpusImporter`` except
    ``phi5`` and ``phi7``.

    Both names are removed from the listing with ``list.remove``, which
    raises ``ValueError`` if a name is not present.
    """
    latin_importer = CorpusImporter('latin')
    corpus_names = latin_importer.list_corpora
    for excluded in ('phi5', 'phi7'):
        corpus_names.remove(excluded)
    for corpus_name in corpus_names:
        latin_importer.import_corpus(corpus_name)
コード例 #18
0
def pos_tagger_example_latin():
    """Import the Latin models, tag a sample sentence with the 1-2-3-gram
    backoff tagger and print the resulting tags.
    """
    CorpusImporter('latin').import_corpus('latin_models_cltk')

    sentence = 'Gallia est omnis divisa in partes tres'
    latin_tagger = pos.POSTag('latin')
    print(latin_tagger.tag_ngram_123_backoff(sentence))
コード例 #19
0
 def test_git_import_copt_script(self):
     """Import ``coptic_text_scriptorium`` and assert its README.md exists."""
     CorpusImporter('coptic').import_corpus('coptic_text_scriptorium')
     readme_path = os.path.expanduser(
         '~/cltk_data/coptic/text/coptic_text_scriptorium/README.md')
     self.assertTrue(os.path.isfile(readme_path))
コード例 #20
0
ファイル: test_lemmatize.py プロジェクト: usmanmuhd/cltk
 def setUp(self):
     """Import ``french_data_cltk`` and assert its README.md exists."""
     CorpusImporter('french').import_corpus('french_data_cltk')
     readme_path = os.path.expanduser(
         '~/cltk_data/french/text/french_data_cltk/README.md')
     self.assertTrue(os.path.isfile(readme_path))
コード例 #21
0
ファイル: test_corpus.py プロジェクト: shwetankshrey/cltk
 def test_import_latin_models_cltk(self):
     """Import ``latin_models_cltk`` and assert its README.md exists."""
     CorpusImporter('latin').import_corpus('latin_models_cltk')
     readme_path = os.path.expanduser(
         '~/cltk_data/latin/model/latin_models_cltk/README.md')
     self.assertTrue(os.path.isfile(readme_path))
コード例 #22
0
 def test_git_import_tib_lexica_tdc(self):
     """Import ``tibetan_lexica_tdc`` and assert its README.md exists."""
     CorpusImporter('tibetan').import_corpus('tibetan_lexica_tdc')
     readme_path = os.path.expanduser(
         '~/cltk_data/tibetan/lexicon/tibetan_lexica_tdc/README.md')
     self.assertTrue(os.path.isfile(readme_path))
コード例 #23
0
 def test_git_import_chinese_cbeta_txt(self):
     """Import ``chinese_text_cbeta_txt`` and assert its README.md exists."""
     CorpusImporter('chinese').import_corpus('chinese_text_cbeta_txt')
     readme_path = os.path.expanduser(
         '~/cltk_data/chinese/text/chinese_text_cbeta_txt/README.md')
     self.assertTrue(os.path.isfile(readme_path))
コード例 #24
0
ファイル: test_corpus.py プロジェクト: shwetankshrey/cltk
 def test_import_greek_software_tlgu(self):
     """Import ``greek_software_tlgu`` and assert its README.md exists."""
     CorpusImporter('greek').import_corpus('greek_software_tlgu')
     readme_path = os.path.expanduser(
         '~/cltk_data/greek/software/greek_software_tlgu/README.md')
     self.assertTrue(os.path.isfile(readme_path))
コード例 #25
0
ファイル: test_corpus.py プロジェクト: shwetankshrey/cltk
 def test_import_latin_text_antique_digiliblt(self):
     """Import ``latin_text_antique_digiliblt`` and assert its README.md exists."""
     CorpusImporter('latin').import_corpus('latin_text_antique_digiliblt')
     readme_path = os.path.expanduser(
         '~/cltk_data/latin/text/latin_text_antique_digiliblt/README.md')
     self.assertTrue(os.path.isfile(readme_path))
コード例 #26
0
 def test_import_la_text_lac_curt(self):
     """Import ``latin_text_lacus_curtius`` and assert its README.md exists."""
     CorpusImporter('latin').import_corpus('latin_text_lacus_curtius')
     readme_path = os.path.expanduser(
         '~/cltk_data/latin/text/latin_text_lacus_curtius/README.md')
     self.assertTrue(os.path.isfile(readme_path))
コード例 #27
0
 def test_import_la_treebank_pers(self):
     """Import ``latin_treebank_perseus`` and assert its README.md exists."""
     CorpusImporter('latin').import_corpus('latin_treebank_perseus')
     readme_path = os.path.expanduser(
         '~/cltk_data/latin/treebank/latin_treebank_perseus/README.md')
     self.assertTrue(os.path.isfile(readme_path))
コード例 #28
0
    def setUp(self):
        """Import the Greek and Latin CLTK models and assert that each
        README.md exists below ``get_cltk_data_dir()``.
        """
        # (language, corpus name, README path relative to the CLTK data dir)
        model_sets = (
            ('greek', 'greek_models_cltk',
             '/greek/model/greek_models_cltk/README.md'),
            ('latin', 'latin_models_cltk',
             '/latin/model/latin_models_cltk/README.md'),
        )
        for language, corpus_name, readme_suffix in model_sets:
            CorpusImporter(language).import_corpus(corpus_name)
            # The data dir is looked up after the import, as in the original.
            readme_path = os.path.expanduser(get_cltk_data_dir() + readme_suffix)
            self.assertTrue(os.path.isfile(readme_path))
コード例 #29
0
 def test_git_import_tib_pos_tdc(self):
     """Import ``tibetan_pos_tdc`` and assert its README.md exists."""
     CorpusImporter('tibetan').import_corpus('tibetan_pos_tdc')
     readme_path = os.path.expanduser(
         '~/cltk_data/tibetan/pos/tibetan_pos_tdc/README.md')
     self.assertTrue(os.path.isfile(readme_path))
コード例 #30
0
ファイル: test_corpus.py プロジェクト: shwetankshrey/cltk
 def test_import_lat_text_lat_lib(self):
     """Import ``latin_text_latin_library`` and assert its README.md exists."""
     CorpusImporter('latin').import_corpus('latin_text_latin_library')
     readme_path = os.path.expanduser(
         '~/cltk_data/latin/text/latin_text_latin_library/README.md')
     self.assertTrue(os.path.isfile(readme_path))