Beispiel #1
0
def _check_latest_data(lang):
    """Make sure the proper-names file for ``lang`` exists, importing it if not.

    Asserts that ``lang`` is a key of ``NER_DICT``. When the file that entry
    points to (after ``~`` expansion) is missing, the ``<lang>_models_cltk``
    corpus is imported through ``FetchCorpus``.
    """
    known_langs = NER_DICT.keys()
    assert lang in known_langs, "Invalid language. Choose from: {}".format(
        ", ".join(known_langs)
    )

    names_file = os.path.expanduser(NER_DICT[lang])
    if os.path.isfile(names_file):
        # Already present locally; nothing to fetch.
        return

    FetchCorpus(lang).import_corpus(f"{lang}_models_cltk")
Beispiel #2
0
 def setUpClass(self):
     """Import the Greek and Latin model corpora and store two sample texts."""
     # Same order as before: Greek models first, then Latin models.
     for iso_code in ("grc", "lat"):
         FetchCorpus(iso_code).import_corpus(f"{iso_code}_models_cltk")

     self.greek_text = """ὅλως δ’ ἀντεχόμενοί τινες, ὡς οἴονται, δικαίου τινός (ὁ γὰρ νόμος δίκαιόν τἰ τὴν κατὰ πόλεμον δουλείαν τιθέασι δικαίαν, ἅμα δ’ οὔ φασιν· τήν τε γὰρ ἀρχὴν ἐνδέχεται μὴ δικαίαν εἶναι τῶν πολέμων, καὶ τὸν ἀνάξιον δουλεύειν οὐδαμῶς ἂν φαίη τις δοῦλον εἶναι· εἰ δὲ μή, συμβήσεται τοὺς εὐγενεστάτους εἶναι δοκοῦντας δούλους εἶναι καὶ ἐκ δούλων, ἐὰν συμβῇ πραθῆναι ληφθέντας."""  # pylint: disable=line-too-long
     self.latin_text = "O di inmortales! ubinam gentium sumus? in qua urbe vivimus? quam rem publicam habemus? Hic, hic sunt in nostro numero, patres conscripti, in hoc orbis terrae sanctissimo gravissimoque consilio, qui de nostro omnium interitu, qui de huius urbis atque adeo de orbis terrarum exitio cogitent! Hos ego video consul et de re publica sententiam rogo et, quos ferro trucidari oportebat, eos nondum voce volnero! Fuisti igitur apud Laecam illa nocte, Catilina, distribuisti partes Italiae, statuisti, quo quemque proficisci placeret, delegisti, quos Romae relinqueres, quos tecum educeres, discripsisti urbis partes ad incendia, confirmasti te ipsum iam esse exiturum, dixisti paulum tibi esse etiam nunc morae, quod ego viverem."  # pylint: disable=line-too-long
def download_cltk_models(iso_code: str) -> None:
    """Import the CLTK corpus that carries the models for ``iso_code``.

    For ``"fro"`` the corpus requested is ``fro_data_cltk``; for every other
    code it is ``<iso_code>_models_cltk``.

    Args:
        iso_code: Language code handed to ``FetchCorpus`` and used to build
            the corpus name.
    """
    corpus_kind = "data" if iso_code == "fro" else "models"
    corpus_downloader = FetchCorpus(language=iso_code)
    # One import per call: only the corpus name varies by language, so no
    # language needs the same corpus requested a second time.
    corpus_downloader.import_corpus(corpus_name=f"{iso_code}_{corpus_kind}_cltk")
def download_cltk_models_repo(iso_code: str) -> None:
    """Import ``<iso_code>_models_cltk`` plus any language-specific extra corpus.

    Progress is reported with a ``print`` before and after the imports.
    ``"lat"`` additionally imports the Lewis lexicon corpus and ``"non"`` the
    Zoëga dictionary corpus.
    """
    # Extra corpus imported after the models corpus, keyed by language code.
    extra_corpus_by_lang = {
        "lat": "cltk_lat_lewis_elementary_lexicon",
        "non": "cltk_non_zoega_dictionary",
    }

    print(f"Going to download CLTK models for '{iso_code}'.")
    fetcher = FetchCorpus(language=iso_code)
    fetcher.import_corpus(corpus_name=f"{iso_code}_models_cltk")

    extra_corpus = extra_corpus_by_lang.get(iso_code)
    if extra_corpus is not None:
        fetcher.import_corpus(corpus_name=extra_corpus)

    print(f"Finished downloading CLTK models for '{iso_code}'.")
Beispiel #5
0
 def _check_and_download_tlgu_source(self):
     """Ensure the TLGU source is present, importing it when it is missing.

     Nothing is done when ``tlgu.h`` already exists at its CLTK path.
     Otherwise a notice is printed; in interactive mode the user is asked
     for consent, while non-interactive mode proceeds without asking.

     Raises:
         CLTKException: If the user declines the download.
     """
     header_fp = make_cltk_path("grc/software/grc_software_tlgu/tlgu.h")
     if os.path.isfile(header_fp):
         return

     print("This part of the CLTK depends upon TLGU, software written by Dimitri Marinakis `<http://tlgu.carmen.gr/>`_.")
     source_url = "https://github.com/cltk/grc_software_tlgu.git"
     target_dir = os.path.dirname(header_fp)
     prompt = f"Do you want to download TLGU from '{source_url}' to '{target_dir}'?"

     # Only interactive sessions are asked; otherwise consent is assumed.
     approved = query_yes_no(question=prompt) if self.interactive else True
     if not approved:
         raise CLTKException("TLGU software required for this class to work.")

     FetchCorpus(language="grc").import_corpus(corpus_name="grc_software_tlgu")
Beispiel #6
0
def download_prompt(
    iso_code: str,
    message: str,
    model_url: str,
    interactive: bool = True,
    silent: bool = False,
):
    """Import ``<iso_code>_models_cltk``, asking the user first when interactive.

    TODO: Make ft and stanza use this fn. Consider moving to other module.

    Args:
        iso_code: Language code; selects the corpus and appears in messages.
        message: Text printed before the import (see ``silent``).
        model_url: URL shown in the confirmation question.
        interactive: When true, always print ``message`` and ask for consent.
        silent: When true and not interactive, ``message`` is not printed.

    Raises:
        CLTKException: If the user declines the download in interactive mode.
    """
    corpus_fetcher = FetchCorpus(language=iso_code)

    if interactive:
        print(message)
        consent = query_yes_no(
            f"Do you want to download '{model_url}' to '~/cltk_data/{iso_code}'?"
        )
        if not consent:
            raise CLTKException(
                f"Download of necessary model declined for '{iso_code}'. Following functions will likely fail."
            )
    elif not silent:
        print(message)

    # Reached in non-interactive mode, or after the user agreed.
    corpus_fetcher.import_corpus(corpus_name=f"{iso_code}_models_cltk")
Beispiel #7
0
 def __init__(self, interactive: bool = True):
     """Populate ``self.entries``, importing the Lewis lexicon corpus if needed.

     Args:
         interactive: Whether to ask before downloading when the lexicon
             file is missing; when false the download happens unasked.

     Raises:
         CLTKException: If the file is missing and the user declines.
     """
     self.interactive = interactive
     self.lewis_yaml_fp = make_cltk_path(
         "lat",
         "lexicon",
         "cltk_lat_lewis_elementary_lexicon",
         "lewis.yaml",
     )
     try:
         self.entries = self._load_entries()
     except FileNotFoundError:
         # Consent is assumed unless an interactive user says otherwise.
         approved = True
         if self.interactive:
             print("This part of the CLTK depends upon Lewis's *An Elementary Latin Dictionary* (1890).")
             approved = query_yes_no(question="Do you want to download this?")
         if not approved:
             raise CLTKException(
                 f"File '{self.lewis_yaml_fp}' is not found. It is required for this class."
             )
         FetchCorpus(language="lat").import_corpus(
             corpus_name="cltk_lat_lewis_elementary_lexicon"
         )
         # Second attempt now that the corpus has been imported.
         self.entries = self._load_entries()
Beispiel #8
0
 def __init__(self, interactive: bool = True):
     """Populate ``self.entries``, importing the Zoëga dictionary corpus if needed.

     Args:
         interactive: Whether to ask before downloading when the dictionary
             file is missing; when false the download happens unasked.

     Raises:
         CLTKException: If the file is missing and the user declines.
     """
     self.interactive = interactive
     self.zoega_yaml_fp = make_cltk_path(
         "non",
         "dictionary",
         "cltk_non_zoega_dictionary",
         "dictionary.yaml",
     )
     try:
         self.entries = self._load_entries()
     except FileNotFoundError:
         # Consent is assumed unless an interactive user says otherwise.
         approved = True
         if self.interactive:
             print("This part of the CLTK depends upon Zoëga's *A Concise Old Norse Dictionary* (1890).")
             approved = query_yes_no(question="Do you want to download this?")
         if not approved:
             raise CLTKException(
                 f"File '{self.zoega_yaml_fp}' is not found. It is required for this class."
             )
         FetchCorpus(language="non").import_corpus(
             corpus_name="cltk_non_zoega_dictionary"
         )
         # Second attempt now that the corpus has been imported.
         self.entries = self._load_entries()
Beispiel #9
0
    def setUp(self):
        """Import the corpora used by later tests and check each one landed.

        Greek models are cloned in order to test the pull function and other
        model tests later; the remaining corpora are imported the same way.
        After every import, the ``README.md`` expected inside the corpus
        directory under ``CLTK_DATA_DIR`` must exist, otherwise the test
        fails with a message naming the missing file.
        """
        # (language code, corpus name, corpus-type directory)
        corpora = (
            ("grc", "grc_models_cltk", "model"),
            ("lat", "lat_models_cltk", "model"),
            ("fro", "fro_data_cltk", "text"),
            ("non", "non_models_cltk", "model"),
            ("gml", "gml_models_cltk", "model"),
            ("ang", "ang_models_cltk", "model"),
        )
        for lang, corpus_name, corpus_type in corpora:
            FetchCorpus(lang).import_corpus(corpus_name)
            readme_rel = os.path.join(
                CLTK_DATA_DIR, f"{lang}/{corpus_type}/{corpus_name}/README.md"
            )
            readme_fp = os.path.expanduser(readme_rel)
            self.assertTrue(
                os.path.isfile(readme_fp),
                msg=f"README for corpus '{corpus_name}' not found at '{readme_fp}'",
            )
"""`install_corpora.py` - install free data corpora."""
import logging

from cltk.data.fetch import FetchCorpus

if __name__ == '__main__':
    # Script entry point: import the free Latin and Greek text corpora.

    LOG = logging.getLogger(__name__)
    LOG.addHandler(logging.NullHandler())
    logging.basicConfig(level=logging.INFO)
    try:
        corpus_importer = FetchCorpus('latin')
        corpus_importer.import_corpus('latin_text_latin_library')
        corpus_importer.import_corpus('latin_text_perseus')
        corpus_importer.import_corpus('latin_text_tesserae')
        corpus_importer = FetchCorpus('greek')
        corpus_importer.import_corpus('greek_text_perseus')
        corpus_importer.import_corpus('greek_text_lacus_curtius')
        logging.disable(logging.NOTSET)
    except Exception:
        # Best-effort: failures are logged with a traceback rather than
        # raised. ``Exception`` (not a bare ``except``) lets Ctrl-C and
        # ``SystemExit`` still terminate the script.
        LOG.exception('Failure to download test corpora')
Beispiel #11
0
    if anc_lang == 'pgmc':
        src_ipa_col = 'pgm_ipa'
        src_form_col = 'gem-pro'
        desc[src_ipa_col] = desc[src_form_col].apply(PGmc_ipa_trans).apply(i2t)
        show_all_segs(desc[src_ipa_col])
        desc[src_ipa_col] = desc[src_ipa_col].apply(break_false_complex,
                                                    lang='pgm')
        show_all_segs(desc[src_ipa_col])
    elif anc_lang == 'la':
        src_ipa_col = 'la_ipa'
        src_form_col = 'la'
        try:
            transcriber = LatTranscriber(dialect="Classical",
                                         reconstruction="Allen")
        except FileNotFoundError:
            lat_fetch = FetchCorpus('lat')
            lat_fetch.import_corpus('lat_models_cltk')
            transcriber = LatTranscriber(dialect="Classical",
                                         reconstruction="Allen")

        desc[src_ipa_col] = desc[src_form_col].apply(
            la_transcribe_and_tokenize, transcriber=transcriber)
        show_all_segs(desc[src_ipa_col])
        desc[src_ipa_col] = desc[src_ipa_col].apply(break_false_complex,
                                                    lang='la')
        show_all_segs(desc[src_ipa_col])
    elif anc_lang == 'sla-pro':
        src_ipa_col = 'sla_pro_ipa'
        src_form_col = 'sla-pro'
        desc[src_ipa_col] = desc[src_form_col].apply(sla_pro_transcribe).apply(
            i2t)