Esempio n. 1
0
    def _get_user_defined_corpora(self):
        """Load the user's custom, distributed corpora for this language.

        Reads ``distributed_corpora.yaml`` (``test_distributed_corpora.yaml``
        when ``self.testing`` is set) from ``CLTK_DATA_DIR`` and keeps the
        entries whose ``language`` equals ``self.language``
        (the file's value is lowercased before comparing).

        :return: One dict per matching corpus with the keys ``origin``,
            ``type``, ``name`` and ``user_defined`` (always ``True``). An
            empty list is returned when the file is missing, empty, not a
            mapping, or not valid YAML.
        :rtype: list
        :raises KeyError: If a corpus entry lacks ``language``, ``origin``
            or ``type``.
        """
        file_name = ("test_distributed_corpora.yaml"
                     if self.testing else "distributed_corpora.yaml")
        distributed_corpora_fp = os.path.normpath(CLTK_DATA_DIR + "/" +
                                                  file_name)
        try:
            with open(distributed_corpora_fp) as file_open:
                corpora_dict = yaml.safe_load(file_open)
        except FileNotFoundError:
            logger.info(
                "``~/cltk_data/distributed_corpora.yaml`` file not found.")
            return []
        except yaml.YAMLError as parse_err:
            # ``YAMLError`` is the base class, so scanner/composer errors
            # are handled the same way as parser errors.
            logger.debug("Yaml parsing error: %s", parse_err)
            return []
        if not isinstance(corpora_dict, dict):
            # ``safe_load`` returns None for an empty file and may return a
            # list or scalar for a file that is not a mapping of corpora.
            logger.debug("No corpus mapping found in '%s'.",
                         distributed_corpora_fp)
            return []

        return [
            {
                "origin": about["origin"],
                "type": about["type"],
                "name": corpus_name,
                "user_defined": True,
            }
            for corpus_name, about in corpora_dict.items()
            if about["language"].lower() == self.language
        ]
Esempio n. 2
0
    def _retrieve_morpheus_entry(self, word):
        """Return Morpheus entry for word

        Entry format: [(head word, tag, macronized form)]

        :param word: unmacronized, lowercased word
        :ptype word: string
        :return: Morpheus entry in tuples
        :rtype : list
        """
        entry = self.macron_data.get(word)
        if entry is None:
            logger.info("No Morpheus entry found for {}.".format(word))
            return None
        elif len(entry) == 0:
            logger.info("No Morpheus entry found for {}.".format(word))
        return entry
Esempio n. 3
0
 def _git_user_defined_corpus(self,
                              corpus_name,
                              corpus_type,
                              uri: str,
                              branch="master"):
     """Clone or update a git repo defined by the user.

     The repository is placed in
     ``CLTK_DATA_DIR/<self.language>/<corpus_type>/<repo name>``, where the
     repo name is the last path component of ``uri`` without a trailing
     ``.git``. If ``README.md`` is missing from that directory the repo is
     cloned (shallow, ``depth=1``); otherwise the latest changes are pulled
     from ``origin``.

     TODO: This code is very redundant with what's in import_corpus(),
     could be refactored.

     :param corpus_name: Name of the corpus; used in log messages only.
     :param corpus_type: Corpus type; used as a sub-directory name.
     :param uri: Git URI of the repository.
     :param branch: Git branch to clone.
     """
     type_dir_rel = os.path.join(CLTK_DATA_DIR, self.language, corpus_type)
     type_dir = os.path.expanduser(type_dir_rel)
     # eg, 'latin_corpus_newton_example.git'; a trailing slash is tolerated
     repo_name = uri.rstrip("/").split("/")[-1]
     # Remove the suffix only. ``str.rstrip(".git")`` strips any trailing
     # '.', 'g', 'i' or 't' characters and would mangle e.g. 'digit.git'.
     if repo_name.endswith(".git"):
         repo_name = repo_name[:-len(".git")]
     target_dir = os.path.join(type_dir, repo_name)
     target_file = os.path.join(target_dir, "README.md")
     # check if corpus already present
     # if not, clone
     if not os.path.isfile(target_file):
         os.makedirs(type_dir, exist_ok=True)
         try:
             msg = "Cloning '{}' from '{}'".format(corpus_name, uri)
             logger.info(msg)
             Repo.clone_from(uri,
                             target_dir,
                             branch=branch,
                             depth=1,
                             progress=ProgressPrinter())
         # NOTE(review): presumably git failures surface as the git
         # library's own exception types, which this clause would not
         # catch -- confirm.
         except CorpusImportError as corpus_imp_err:
             msg = "Git clone of '{}' failed: '{}'".format(
                 uri, corpus_imp_err)
             logger.error(msg)
     # if corpus is present, pull latest
     else:
         try:
             repo = Repo(target_dir)
             assert not repo.bare  # or: assert repo.exists()
             git_origin = repo.remotes.origin
             msg = "Pulling latest '{}' from '{}'.".format(corpus_name, uri)
             logger.info(msg)
             git_origin.pull()
         except CorpusImportError as corpus_imp_err:
             msg = "Git pull of '{}' failed: '{}'".format(
                 uri, corpus_imp_err)
             logger.error(msg)
Esempio n. 4
0
    def divide_works(self, corpus):
        """Run the converter with the work-breaking option on a corpus.

        Every ``.TXT`` file in the corpus' ``originals`` directory whose name
        starts with the corpus prefix is passed to ``self.convert()`` with
        ``divide_works=True``. Conversion failures are logged and do not stop
        the loop.

        TODO: Maybe incorporate this into ``convert_corpus()``
        TODO: Write test for this

        :param corpus: ``"tlg"`` or ``"phi5"``.
        :raises CLTKException: For ``"phi7"`` or any other corpus name.
        """
        # (corpus, originals dir, output dir, file prefix, ``lat`` flag);
        # ``lat`` is the optional TLGU argument passed on to convert()
        layouts = (
            ("tlg", "originals/tlg", "grc/text/tlg/individual_works", "TLG",
             False),
            ("phi5", "originals/phi5", "lat/text/phi5/individual_works",
             "LAT", True),
        )
        layout = next((entry for entry in layouts if corpus == entry[0]),
                      None)
        if layout is None:
            if corpus == "phi7":
                raise CLTKException(
                    "``phi7`` cannot be divided into individual works.")
            raise CLTKException(
                f"Invalid corpus '{corpus}'. This should never happen.")
        _, orig_rel, works_rel, file_prefix, lat = layout
        orig_dir = make_cltk_path(orig_rel)
        works_dir = make_cltk_path(works_rel)

        if not os.path.exists(works_dir):
            os.makedirs(works_dir)

        for file_name in os.listdir(orig_dir):
            is_corpus_text = (file_name.endswith(".TXT")
                              and file_name.startswith(file_prefix))
            if not is_corpus_text:
                continue
            source_path = os.path.join(orig_dir, file_name)
            destination_path = os.path.join(works_dir, file_name)
            try:
                self.convert(source_path,
                             destination_path,
                             divide_works=True,
                             lat=lat)
                logger.info("Writing files at %s to %s.", source_path,
                            works_dir)
            except Exception as err:
                logger.error("Failed to convert files: %s.", err)
Esempio n. 5
0
 def _copy_dir_recursive(src_rel, dst_rel):
     """Copy a directory tree to a new location.

     ``~`` is expanded in both paths. ``dst_rel`` must not exist yet. If the
     copy fails because the source is not a directory, it is copied as a
     single file instead; any other ``OSError`` is re-raised.
     Source: http://stackoverflow.com/a/1994840
     TODO: Move this to file_operations.py module.
     :type src_rel: str
     :param src_rel: Directory to be copied.
     :type dst_rel: str
     :param dst_rel: Directory to be created with contents of ``src_rel``.
     """
     source = os.path.expanduser(src_rel)
     target = os.path.expanduser(dst_rel)
     try:
         shutil.copytree(source, target)
     except OSError as copy_err:
         if copy_err.errno != errno.ENOTDIR:
             raise
         # source is a plain file, not a directory
         shutil.copy(source, target)
     logger.info("Files copied from %s to %s", source, target)
Esempio n. 6
0
    def _long_by_position(self, syllable: str, sentence: List[str]) -> bool:
        """Check if syllable is long by position. Returns ``True``
        if syllable is long by position Long by position
        includes contexts when:

        1. Next syllable begins with two consonants, unless those consonants are a stop + liquid combination
        2. Next syllable begins with a double consonant
        3. Syllable ends with a consonant and the next syllable begins with a consonant

        Args:
            syllable: Current syllable
            sentence: Sentence in which syllable appears

        Returns:
            Whether or not a syllable is long by position

        >>> from cltk.prosody.grc import Scansion
        >>> syllables_sentence = ["μεν", "και", "α", "πει", "ρος"]
        >>> [Scansion()._long_by_position(syllable=syllable, sentence=syllables_sentence) for syllable in syllables_sentence]
        [True, False, False, False, False]
        """
        try:
            next_syll = sentence[sentence.index(syllable) + 1]
            # Long by position by case 1
            if (next_syll[0] in self.sing_cons and next_syll[1]
                    in self.sing_cons) and (next_syll[0] not in self.stops and
                                            next_syll[1] not in self.liquids):
                return True
            # Long by position by case 2
            if syllable[-1] in self.vowels and next_syll[0] in self.doub_cons:
                return True
            # Long by position by case 3
            if syllable[-1] in self.sing_cons and (next_syll[0]
                                                   in self.sing_cons):
                return True
        except IndexError:
            logger.info(
                "IndexError while checking if syllable '%s' is long. Continuing.",
                syllable,
            )
        return False
Esempio n. 7
0
    def _macronize_word(self, word: Tuple[str, str]) -> Tuple[str, str, str]:
        """Return macronized word.

        :param word: (word, tag)

        :return: (word, tag, macronized_form)
        """
        head_word = word[0]
        tag = word[1]
        if tag is None:
            logger.info("Tagger {} could not tag {}.".format(self.tagger, head_word))
            return head_word, tag, head_word
        elif tag == "U--------":
            return (head_word, tag.lower(), head_word)
        else:
            entries = self._retrieve_morpheus_entry(head_word)
            if entries is None:
                return head_word, tag.lower(), head_word
            matched_entry = [entry for entry in entries if entry[0] == tag.lower()]
            if len(matched_entry) == 0:
                logger.info(
                    "No matching Morpheus entry found for {}.".format(head_word)
                )
                return head_word, tag.lower(), entries[0][2]
            elif len(matched_entry) == 1:
                return head_word, tag.lower(), matched_entry[0][2].lower()
            else:
                logger.info("Multiple matching entries found for {}.".format(head_word))
                return head_word, tag.lower(), matched_entry[1][2].lower()
Esempio n. 8
0
    def import_corpus(self,
                      corpus_name: str,
                      local_path: str = None,
                      branch: str = "master"):
        """Download a remote or load local corpus into dir ``~/cltk_data``.

        The corpus record is looked up by name in
        ``self.all_corpora_for_lang`` and handled by kind:

        * user-defined records are delegated to
          ``_git_user_defined_corpus()``;
        * records with ``location == "local"`` (``phi5``, ``phi7``, ``tlg``)
          are copied from ``local_path`` into ``originals/<corpus_name>``,
          replacing any previous copy;
        * all other records are cloned from their git ``origin``, or pulled
          if the corpus' ``README.md`` is already present.

        TODO: maybe add ``from git import RemoteProgress``
        TODO: refactor this, it's getting kinda long

        :param corpus_name: The name of an available corpus.
        :param local_path: A filepath, required when importing local corpora.
        :param branch: What Git branch to clone.
        :raises CorpusImportError: If no corpus or more than one corpus
            matches ``corpus_name``, if a local corpus is unsupported or
            ``local_path`` is missing, or if a remote record is malformed.
        """
        matching_corpus_list = [
            _dict for _dict in self.all_corpora_for_lang
            if _dict["name"] == corpus_name
        ]
        if not matching_corpus_list:
            raise CorpusImportError(
                f"No corpus ``{corpus_name}`` for language ``{self.language}``."
            )
        if len(matching_corpus_list) > 1:
            raise CorpusImportError(
                f"Found more than one corpus with the name ``{corpus_name}``.")
        matching_corpus = matching_corpus_list[0]

        if matching_corpus.get("user_defined"):
            # Example record:
            # {'origin': 'https://github.com/kylepjohnson/latin_corpus_newton_example.git',
            #  'type': 'text',
            #  'name': 'example_distributed_latin_corpus',
            #  'user_defined': True}
            self._git_user_defined_corpus(
                matching_corpus["name"],
                matching_corpus["type"],
                matching_corpus["origin"],
                branch=branch,  # honour the caller's branch choice
            )
            return

        if matching_corpus.get("location") == "local":
            # Example record:
            # {'location': 'local', 'name': 'phi5', 'origin': None, 'type': 'text'}
            msg = "Importing from local path: '{}'".format(local_path)
            logger.info(msg)
            # supported local corpora and the directory name each must have
            expected_dir_names = {"phi5": "PHI5", "phi7": "PHI7", "tlg": "TLG_E"}
            if corpus_name not in expected_dir_names:
                raise CorpusImportError(
                    f"Unsupported local corpus ``{corpus_name}``.")
            if not local_path:
                raise CorpusImportError(
                    f"``local_path`` is required to import ``{corpus_name}``.")
            # normalize path for checking dir
            if local_path.endswith("/"):
                local_path = local_path[:-1]
            # check for right corpus dir (reported only, not enforced)
            expected_dir_name = expected_dir_names[corpus_name]
            if os.path.split(local_path)[1] != expected_dir_name:
                logger.info("Directory must be named '%s'.",
                            expected_dir_name)
            data_dir = os.path.expanduser(CLTK_DATA_DIR)
            originals_dir = os.path.join(data_dir, "originals")
            # check for `originals` dir; if not present mkdir
            if not os.path.isdir(originals_dir):
                os.makedirs(originals_dir)
                msg = "Wrote directory at '{}'.".format(originals_dir)
                logger.info(msg)
            corpus_originals_dir = os.path.join(originals_dir, corpus_name)
            # check for `originals/<corpus_name>`; if present, delete,
            # because the copy requires that the target does not exist
            if os.path.isdir(corpus_originals_dir):
                shutil.rmtree(corpus_originals_dir)
                msg = "Removed directory at '{}'.".format(corpus_originals_dir)
                logger.info(msg)
            self._copy_dir_recursive(local_path, corpus_originals_dir)
            return

        # Example record:
        # {'type': 'text',
        #  'name': 'lat_text_perseus',
        #  'origin': 'https://github.com/cltk/lat_text_perseus.git'}
        # Every one of these fields is needed below, so any missing field
        # makes the record malformed.
        required_fields = ("type", "name", "origin")
        if not all(matching_corpus.get(field) for field in required_fields):
            raise CorpusImportError(
                f"Malformed record for ``{corpus_name}``.")
        git_uri = matching_corpus["origin"]
        type_dir_rel = os.path.join(CLTK_DATA_DIR, self.language,
                                    matching_corpus["type"])
        type_dir = os.path.expanduser(type_dir_rel)
        target_dir = os.path.join(type_dir, corpus_name)
        target_file = os.path.join(target_dir, "README.md")
        # check if corpus already present
        # if not, clone
        if not os.path.isfile(target_file):
            os.makedirs(type_dir, exist_ok=True)
            try:
                msg = "Cloning '{}' from '{}'".format(corpus_name, git_uri)
                logger.info(msg)
                Repo.clone_from(
                    git_uri,
                    target_dir,
                    branch=branch,
                    depth=1,
                    progress=ProgressPrinter(),
                )
            # NOTE(review): presumably git failures surface as the git
            # library's own exception types, which this clause would not
            # catch -- confirm.
            except CorpusImportError as corpus_imp_err:
                msg = "Git clone of '{}' failed: '{}'".format(
                    git_uri, corpus_imp_err)
                logger.error(msg)
        # if corpus is present, pull latest
        else:
            try:
                repo = Repo(target_dir)
                assert not repo.bare  # or: assert repo.exists()
                git_origin = repo.remotes.origin
                msg = "Pulling latest '{}' from '{}'.".format(
                    corpus_name, git_uri)
                logger.info(msg)
                git_origin.pull()
            except CorpusImportError as corpus_imp_err:
                msg = "Git pull of '{}' failed: '{}'".format(
                    git_uri, corpus_imp_err)
                logger.error(msg)
Esempio n. 9
0
    def _make_syllables(self, sentences_words: str) -> List[List[List[str]]]:
        """First tokenize, then divide word tokens into a list of syllables.
        Note that a syllable in this instance is defined as a vocalic
        group (i.e., vowel or a diphthong). This means that all
        syllables which are not the last syllable in the word
        will end with a vowel or diphthong.

        Todo:
            * Determine whether a CLTK syllabifier could replace this.

        Args:
            sentences_words: Text string

        Returns:
            List of list of list of syllables

        >>> from cltk.prosody.grc import Scansion
        >>> text_string = "νέος μὲν καὶ ἄπειρος, δικῶν ἔγωγε ἔτι. μὲν καὶ ἄπειρος."
        >>> Scansion()._make_syllables(text_string)
        [[['νε', 'ος'], ['μεν'], ['και'], ['α', 'πει', 'ρος'], ['δι', 'κων'], ['ε', 'γω', 'γε'], ['ε', 'τι']], [['μεν'], ['και'], ['α', 'πει', 'ρος']]]
        """
        text = self._tokenize(sentences_words)
        all_syllables = list()
        for sentence in text:
            syll_per_sent = list()
            for word in sentence:
                syll_start = 0  # Begins syllable iterator
                syll_per_word = list()
                cur_letter_in = 0  # Begins general iterator
                while cur_letter_in < len(word):
                    letter = word[cur_letter_in]
                    if (cur_letter_in != len(word) - 1) and (
                            word[cur_letter_in] +
                            word[cur_letter_in + 1]) in self.diphthongs:
                        cur_letter_in += 1
                        # Syllable ends with a diphthong
                        syll_per_word.append(word[syll_start:cur_letter_in +
                                                  1])
                        syll_start = cur_letter_in + 1
                    elif (letter in self.vowels) or (letter
                                                     in self.long_vowels):
                        # Syllable ends with a vowel
                        syll_per_word.append(word[syll_start:cur_letter_in +
                                                  1])
                        syll_start = cur_letter_in + 1
                    cur_letter_in += 1
                try:
                    last_vowel = syll_per_word[-1][-1]  # Last vowel of a word
                    # Modifies general iterator to accomodate consonants after
                    # the last syllable in a word
                    cur_letter_in = len(word) - 1
                    # Contains all of the consonants after the last vowel in a word
                    leftovers = ""
                    while word[cur_letter_in] != last_vowel:
                        if word[cur_letter_in] != ".":
                            # Adds consonants to leftovers
                            leftovers = word[cur_letter_in] + leftovers
                        cur_letter_in -= 1
                    # Adds leftovers to last syllable in a word
                    syll_per_word[-1] += leftovers
                    syll_per_sent.append(syll_per_word)
                except IndexError:
                    logger.info(
                        "IndexError while making syllables of '%s'. Continuing.",
                        word)
            all_syllables.append(syll_per_sent)
        return all_syllables
Esempio n. 10
0
 def _check_install(self):
     """Check if tlgu is installed; if not, try to install it.

     Installation runs ``make install`` in the TLGU source directory and,
     if that fails, retries with ``sudo``. When ``self.interactive`` is set
     the user is asked for confirmation before each attempt.

     :return: ``True`` when tlgu is available on exit, ``False`` when it
         could not be built because no ``gcc`` was found.
     :raises CLTKException: If the user declines the installation or the
         installation fails.
     """
     import shutil  # local import; used only for the compiler lookup

     try:
         subprocess.check_output(["which", "tlgu"])
     except subprocess.SubprocessError as sub_err:
         print("TLGU not installed.")
         logger.info("TLGU not installed: %s", sub_err)
     else:
         return True

     logger.info("Installing TLGU.")
     # ``check_output(["which", "gcc"])`` raises when gcc is absent instead
     # of returning a falsy value; ``shutil.which`` returns None.
     if shutil.which("gcc") is None:
         logger.error("GCC seems not to be installed.")
         return False

     tlgu_path = make_cltk_path("grc/software/grc_software_tlgu")
     if self.interactive:
         install_question = "Do you want to install TLGU?"
         do_install = query_yes_no(question=install_question)
         if not do_install:
             raise CLTKException(
                 "TLGU installation required for this class to work."
             )
     else:
         print("Non-interactive installation. Continuing ...")
     command = "cd {0} && make install".format(tlgu_path)
     print(f"Going to run command: ``{command}``")
     try:
         p_out = subprocess.call(command, shell=True)
     except subprocess.SubprocessError as sub_err:
         print(
             "Error executing installation. Going to check output of ``subprocess.call()`` ..."
         )
         raise CLTKException(sub_err) from sub_err
     if p_out == 0:
         msg = "TLGU installed."
         print(msg)
         logger.info(msg)
         return True

     msg = "TLGU install without sudo failed. Going to try again with sudo (usually required for Linux) ..."
     print(msg)
     logger.error(msg)
     command = "cd {0} && sudo make install".format(tlgu_path)
     if self.interactive:
         install_question = "Do you want to install TLGU? with sudo?"
         do_install = query_yes_no(question=install_question)
         if not do_install:
             raise CLTKException(
                 "TLGU installation required for this class to work."
             )
     else:
         print("Going to run command:", command)
     p_out = subprocess.call(command, shell=True)
     if p_out != 0:
         msg = "TLGU install with sudo failed."
         print(msg)
         logger.error(msg)
         raise CLTKException(
             "TLGU installation required for this class to work.")
     msg = "TLGU installed."
     print(msg)
     logger.info(msg)
     return True
Esempio n. 11
0
    def convert(
        input_path=None,
        output_path=None,
        markup=None,
        rm_newlines=False,
        divide_works=False,
        lat=False,
        extra_args=None,
    ):
        """
        Do conversion by calling the ``tlgu`` executable.

        The command is run without a shell, so paths containing spaces or
        shell metacharacters are passed to ``tlgu`` unchanged. Duplicate
        flags are dropped and the remaining flags keep their order.

        :param input_path: TLG filepath to convert.
        :param output_path: filepath of new converted text.
        :param markup: Specificity of inline markup. Default None removes all numerical markup; 'full' gives most detailed, with reference numbers included before each text line.
        :param rm_newlines: No spaces; removes line ends and hyphens before an ID code; hyphens and spaces before page and column ends are retained.
        :param divide_works: Each work (book) is output as a separate file in the form output_file-xxx.txt; if an output file is not specified, this option has no effect.
        :param lat: Primarily Latin text (PHI). Some TLG texts, notably doccan1.txt and doccan2.txt are mostly roman texts lacking explicit language change codes. Setting this option will force a change to Latin text after each citation block is encountered.
        :param extra_args: Any other tlgu args to be passed, in list form and without dashes, e.g.: ['p', 'b', 'B'].
        :raises AssertionError: If ``input_path`` is not an existing file.
        :raises OSError: If the ``tlgu`` executable cannot be started.

        """
        # setup file paths
        input_path = os.path.expanduser(input_path)
        output_path = os.path.expanduser(output_path)

        # Check input path exists. Raised explicitly instead of via
        # ``assert`` so the check is not stripped under ``python -O``; the
        # exception type is kept for existing callers.
        if not os.path.isfile(input_path):
            raise AssertionError(
                "File {0} does not exist.".format(input_path))

        # setup tlgu flags
        tlgu_options = []
        if markup == "full":
            tlgu_options.extend(["v", "w", "x", "y", "z"])
        if rm_newlines:
            tlgu_options.append("N")
        if divide_works:
            tlgu_options.append("W")
        if lat:
            tlgu_options.append("r")
        # setup extra args
        if extra_args is None:
            extra_args = []
        else:
            try:
                extra_args = list(extra_args)
            except Exception as exc:
                logger.error("Argument 'extra_args' must be a list: %s.", exc)
                raise
        # assemble all tlgu flags: de-duplicate while keeping a stable order
        tlgu_options = list(dict.fromkeys(tlgu_options + extra_args))
        tlgu_flags = ["-" + option for option in tlgu_options]
        # make tlgu call; an argument list avoids shell quoting problems
        tlgu_call = ["tlgu", *tlgu_flags, input_path, output_path]
        logger.info(" ".join(tlgu_call))
        try:
            p_out = subprocess.call(tlgu_call)
        except Exception as exc:
            logger.error("Failed to convert %s to %s: %s", input_path,
                         output_path, exc)
            raise
        # any non-zero exit status means the conversion did not succeed
        if p_out != 0:
            logger.error("Failed to convert %s to %s.", input_path,
                         output_path)