Example #1
    def find_alliteration(self):
        """
        Find alliterations in the complete verse.
        :return: a tuple (verse_alliterations, n_alliterations_lines)
        """
        if len(self.phonological_features_text) == 0:
            logger.error("No phonological transcription found")
            raise ValueError
        else:
            first_sounds = []
            for i, line in enumerate(self.phonological_features_text):
                first_sounds.append([])
                for j, short_line in enumerate(line):
                    first_sounds[i].append([])
                    for viisuord in short_line:
                        first_sounds[i][j].append(viisuord[0])

            verse_alliterations = []
            n_alliterations_lines = []
            for i, first_sound_line in enumerate(first_sounds):
                if isinstance(self.long_lines[i][0], ShortLine) and isinstance(
                        self.long_lines[i][1], ShortLine):
                    alli, counter = self.long_lines[i][0].find_alliterations(
                        self.long_lines[i][1])
                    verse_alliterations.append(alli)
                    n_alliterations_lines.append(counter)
                elif isinstance(self.long_lines[i][0], LongLine):
                    alli, counter = self.long_lines[i][0].find_alliterations()
                    verse_alliterations.append(alli)
                    n_alliterations_lines.append(counter)
            return verse_alliterations, n_alliterations_lines
Example #2
def open_pickle(path: str) -> Any:
    """Open a pickle and return loaded pickle object.
    :type path: str
    :param path: File path to the pickle file to be opened.
    :rtype: Any
    """
    try:
        with open(path, "rb") as opened_pickle:
            try:
                return pickle.load(opened_pickle)
            except Exception as pickle_error:
                logger.error(pickle_error)
                raise
    except FileNotFoundError as fnf_error:
        logger.error(fnf_error)
        raise
    except IOError as io_err:
        logger.error(io_err)
        raise
    except EOFError as eof_error:
        logger.error(eof_error)
        raise
    except pickle.UnpicklingError as unp_error:
        logger.error(unp_error)
        raise
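
A complementary sketch of the same logging-and-reraising pattern in the write direction, assuming a hypothetical ``save_pickle`` helper (not part of the excerpted module); only the standard library is used.

import logging
import pickle
from typing import Any

logger = logging.getLogger(__name__)


def save_pickle(obj: Any, path: str) -> None:
    """Serialize ``obj`` to ``path``, logging and re-raising any failure."""
    try:
        with open(path, "wb") as opened_pickle:
            pickle.dump(obj, opened_pickle)
    except (OSError, pickle.PicklingError) as dump_error:
        logger.error(dump_error)
        raise


# Round trip with the open_pickle() shown above.
save_pickle({"lemma": "amo"}, "/tmp/example.pickle")
assert open_pickle("/tmp/example.pickle") == {"lemma": "amo"}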
Example #3
    def to_phonetics(self):
        """
        Transcribing words in verse helps find alliteration.
        """
        if len(self.long_lines) == 0:
            logger.error("No text has been imported")
            self.syllabified_text = []
        else:
            transcriber = Transcriber(
                old_norse_transcription.DIPHTHONGS_IPA,
                old_norse_transcription.DIPHTHONGS_IPA_class,
                old_norse_transcription.IPA_class,
                old_norse_transcription.old_norse_rules,
            )
            transcribed_text = []
            phonological_features_text = []
            for i, long_line in enumerate(self.long_lines):
                transcribed_text.append([])
                phonological_features_text.append([])
                for short_line in long_line:
                    assert isinstance(short_line, (ShortLine, LongLine))
                    short_line.to_phonetics(transcriber)
                    transcribed_text[i].append(short_line.transcribed)
                    phonological_features_text[i].append(
                        short_line.phonological_features_text)

            self.transcribed_text = transcribed_text
            self.phonological_features_text = phonological_features_text
Example #4
def onekgreek_tei_xml_to_text():
    """Find TEI XML dir of TEI XML for the First 1k Years of Greek corpus."""
    if not bs4_installed:
        logger.error("Install `bs4` and `lxml` to parse these TEI files.")
        raise ImportError
    xml_dir = make_cltk_path("grc/text/grc_text_first1kgreek/data/*/*/*.xml")
    xml_paths = glob.glob(xml_dir)
    if not len(xml_paths):
        logger.error(
            "1K Greek corpus not installed. Use ``FetchCorpus`` to get `First1KGreek`."
        )
        raise FileNotFoundError
    xml_paths = [path for path in xml_paths if "__cts__" not in path]

    # new dir
    new_dir = make_cltk_path("grc/text/grc_text_first1kgreek_plaintext/")
    if not os.path.isdir(new_dir):
        os.makedirs(new_dir)

    for xml_path in xml_paths:
        _, xml_name = os.path.split(xml_path)
        # avoid str.rstrip() here: it strips characters, not the ".xml" suffix
        xml_name = os.path.splitext(xml_name)[0] + ".txt"
        with open(xml_path) as file_open:
            soup = BeautifulSoup(file_open, "lxml")
        body = soup.body
        text = body.get_text()
        new_plaintext_path = os.path.join(new_dir, xml_name)
        with open(new_plaintext_path, "w") as file_open:
            file_open.write(text)
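
The conversion step itself is just "parse the TEI, take the ``<body>``, extract its text". A standard-library sketch of the same idea with ``xml.etree.ElementTree`` for the case where ``bs4``/``lxml`` are unavailable; the sample TEI string below is invented for illustration.

import xml.etree.ElementTree as ET

sample_tei = (
    '<TEI xmlns="http://www.tei-c.org/ns/1.0">'
    "<text><body><p>μῆνιν ἄειδε θεά</p></body></text></TEI>"
)
root = ET.fromstring(sample_tei)
# Locate the body element regardless of namespace, then join its text nodes.
body = next(el for el in root.iter() if el.tag.endswith("body"))
print("".join(body.itertext()))  # μῆνιν ἄειδε θεά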
Example #5
 def __init__(self,
              height=None,
              backness=None,
              rounded=None,
              length=None,
              ipar=None):
     if isinstance(height, Height) or height is None:
         self.height = height
     else:
         logger.error("Incorrect argument")
         raise ValueError
     if isinstance(backness, Backness) or backness is None:
         self.backness = backness
     else:
         logger.error("Incorrect argument")
         raise ValueError
      if isinstance(rounded, bool) or rounded is None:
         self.rounded = rounded
     else:
         logger.error("Incorrect argument")
         raise TypeError
     if isinstance(length, Length) or length is None:
         self.length = length
     else:
         logger.error("Incorrect argument")
         raise ValueError
     self.ipar = ipar
Example #6
 def __init__(self,
              place=None,
              manner=None,
              voiced=None,
              ipar=None,
              geminate=None):
     if isinstance(place, Place) or place is None:
         self.place = place
     else:
          logger.error("Incorrect argument")
          raise ValueError
     if isinstance(manner, Manner) or manner is None:
         self.manner = manner
     else:
         logger.error("Incorrect argument")
         raise ValueError
      if isinstance(voiced, bool) or voiced is None:
         self.voiced = voiced
     else:
         logger.error("Incorrect argument")
         raise TypeError
      if isinstance(geminate, bool) or geminate is None:
         self.geminate = geminate
     else:
         logger.error("Incorrect argument")
         raise TypeError
     self.ipar = ipar
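
Examples #5 and #6 repeat the same validate-or-raise pattern for every constructor argument. A minimal sketch of a shared helper; the name ``validated`` and its messages are illustrative, not part of cltk.

import logging

logger = logging.getLogger(__name__)


def validated(value, expected_type, exc=ValueError):
    """Return ``value`` if it is None or an instance of ``expected_type``,
    otherwise log and raise ``exc``."""
    if value is None or isinstance(value, expected_type):
        return value
    logger.error("Incorrect argument: %r", value)
    raise exc(f"Expected {expected_type.__name__} or None, got {value!r}")


# Hypothetical use inside such an __init__:
#     self.place = validated(place, Place)
#     self.voiced = validated(voiced, bool, exc=TypeError)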
Example #7
    def divide_works(self, corpus):
        """Use the work-breaking option.
        TODO: Maybe incorporate this into ``convert_corpus()``
        TODO: Write test for this

        """
        if corpus == "tlg":
            orig_dir = make_cltk_path("originals/tlg")
            works_dir = make_cltk_path("grc/text/tlg/individual_works")
            file_prefix = "TLG"
            lat = False
        elif corpus == "phi5":
            orig_dir = make_cltk_path("originals/phi5")
            works_dir = make_cltk_path("lat/text/phi5/individual_works")
            file_prefix = "LAT"
            lat = True  # this is for the optional TLGU argument to convert()
        elif corpus == "phi7":
            raise CLTKException(
                "``phi7`` cannot be divided into individual works.")
        else:
            raise CLTKException(
                f"Invalid corpus '{corpus}'. This should never happen.")

        if not os.path.exists(works_dir):
            os.makedirs(works_dir)

        files = os.listdir(orig_dir)
        texts = [
            x for x in files
            if x.endswith(".TXT") and x.startswith(file_prefix)
        ]

        for file in texts:
            orig_file_path = os.path.join(orig_dir, file)
            new_file_path = os.path.join(works_dir, file)

            try:
                self.convert(orig_file_path,
                             new_file_path,
                             divide_works=True,
                             lat=lat)
                logger.info("Writing files at %s to %s.", orig_file_path,
                            works_dir)
            except Exception as err:
                logger.error("Failed to convert files: %s.", err)
Example #8
 def _git_user_defined_corpus(self,
                              corpus_name,
                              corpus_type,
                              uri: str,
                              branch="master"):
     """Clone or update a git repo defined by user.
     TODO: This code is very redundant with what's in import_corpus(),
     could be refactored.
     """
     type_dir_rel = os.path.join(CLTK_DATA_DIR, self.language, corpus_type)
     type_dir = os.path.expanduser(type_dir_rel)
     repo_name = uri.split("/")[-1]  # eg, 'latin_corpus_newton_example.git'
      # strip a trailing ".git" suffix (rstrip would strip characters, not the suffix)
      repo_name = repo_name[:-4] if repo_name.endswith(".git") else repo_name
     target_dir = os.path.join(type_dir, repo_name)
     target_file = os.path.join(type_dir, repo_name, "README.md")
     # check if corpus already present
     # if not, clone
     if not os.path.isfile(target_file):
         if not os.path.isdir(type_dir):
             os.makedirs(type_dir)
         try:
             msg = "Cloning '{}' from '{}'".format(corpus_name, uri)
             logger.info(msg)
             Repo.clone_from(uri,
                             target_dir,
                             branch=branch,
                             depth=1,
                             progress=ProgressPrinter())
         except CorpusImportError as corpus_imp_err:
             msg = "Git clone of '{}' failed: '{}'".format(
                 uri, corpus_imp_err)
             logger.error(msg)
     # if corpus is present, pull latest
     else:
         try:
             repo = Repo(target_dir)
             assert not repo.bare  # or: assert repo.exists()
             git_origin = repo.remotes.origin
             msg = "Pulling latest '{}' from '{}'.".format(corpus_name, uri)
             logger.info(msg)
             git_origin.pull()
         except CorpusImportError as corpus_imp_err:
             msg = "Git pull of '{}' failed: '{}'".format(
                 uri, corpus_imp_err)
             logger.error(msg)
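
The clone-or-pull decision above can be stated compactly. A sketch using only the GitPython calls already shown (``Repo.clone_from``, ``remotes.origin.pull``); the function name and the ``.git`` presence check are assumptions, not cltk API.

import os

from git import Repo


def clone_or_pull(uri: str, target_dir: str, branch: str = "master") -> Repo:
    """Clone ``uri`` into ``target_dir`` on first use, otherwise pull updates."""
    if os.path.isdir(os.path.join(target_dir, ".git")):
        repo = Repo(target_dir)
        repo.remotes.origin.pull()
    else:
        repo = Repo.clone_from(uri, target_dir, branch=branch, depth=1)
    return repo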
Example #9
 def syllabify(self, hierarchy: Dict[str, int]):
     """
     Syllables may play a role in verse classification.
     """
     if len(self.long_lines) == 0:
         logger.error("No text was imported")
         self.syllabified_text = []
     else:
         syllabifier = Syllabifier(language="non", break_geminants=True)
         syllabifier.set_hierarchy(hierarchy)
         syllabified_text = []
         for i, long_line in enumerate(self.long_lines):
             syllabified_text.append([])
             for short_line in long_line:
                  assert isinstance(short_line, (ShortLine, LongLine))
                 short_line.syllabify(syllabifier)
                 syllabified_text[i].append(short_line.syllabified)
         self.syllabified_text = syllabified_text
Example #10
    def from_regular_expression(re_rule, estimated_sound, ipa_class):
        """

        :param re_rule: pattern (first argument of re.sub)
        :param estimated_sound: an IPA character (second argument of re.sub)
        :param ipa_class: dict whose keys are IPA characters and values are Vowel or Consonant instances
        :return: corresponding Rule instance
        """
        assert len(re_rule) > 0
        if re_rule[0] == "^":
            place = Rank.first
        elif re_rule[-1] == "$":
            place = Rank.last
        else:
            place = Rank.inner

        before_pattern = r"(?<=\(\?\<\=\[)\w*"
        core_pattern = r"(?<=\))\w(?=\(\?\=)|(?<=\^)\w(?=\(\?\=)|(?<=\))\w(?=\$)"
        after_pattern = r"(?<=\(\?\=\[)\w*"
        before_search = re.search(before_pattern, re_rule)
        core_search = re.search(core_pattern, re_rule)
        after_search = re.search(after_pattern, re_rule)
        if before_search is None:
            before = None
        else:
            before = [
                ipa_class[ipar].to_abstract()
                for ipar in before_search.group(0)
            ]
        if core_search is not None:
            core = ipa_class[core_search.group(0)]
        else:
            logger.error("No core")
            raise ValueError
        if after_search is None:
            after = None
        else:
            after = [
                ipa_class[ipar].to_abstract() for ipar in after_search.group(0)
            ]
        abstract_position = AbstractPosition(place, before, after)
        return Rule(abstract_position, core, ipa_class[estimated_sound])
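
For reference, this is how the three patterns behave on a typical substitution rule; the sample rule string "(?<=[ao])b(?=[ei])" is invented for illustration.

import re

before_pattern = r"(?<=\(\?\<\=\[)\w*"
core_pattern = r"(?<=\))\w(?=\(\?\=)|(?<=\^)\w(?=\(\?\=)|(?<=\))\w(?=\$)"
after_pattern = r"(?<=\(\?\=\[)\w*"

re_rule = r"(?<=[ao])b(?=[ei])"
print(re.search(before_pattern, re_rule).group(0))  # ao  (sounds allowed before)
print(re.search(core_pattern, re_rule).group(0))    # b   (the sound the rule applies to)
print(re.search(after_pattern, re_rule).group(0))   # ei  (sounds allowed after)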
Example #11
 def _get_corpus_properties(self, corpus_name: str):
     """Check whether a corpus is available for import.
     :type corpus_name: str
     :param corpus_name: Name of available corpus.
     :rtype : str
     """
     try:
         corpora = self.all_corpora
     except NameError as name_error:
         msg = "Corpus not available for language " '"%s": %s' % (
             self.language,
             name_error,
         )
         logger.error(msg)
         raise CorpusImportError(msg)
     for corpus_properties in corpora:
         if corpus_properties["name"] == corpus_name:
             return corpus_properties
      msg = 'Corpus "%s" not available for the "%s" language.' % (
          corpus_name,
          self.language,
      )
     logger.error(msg)
     raise CorpusImportError(msg)
Example #12
def onekgreek_tei_xml_to_text_capitains():
    """Use MyCapitains program to convert TEI to plaintext."""
    file = make_cltk_path(
        "grc/text/grc_text_first1kgreek/data/tlg0627/tlg021/tlg0627.tlg021.1st1K-grc1.xml"
    )
    xml_dir = make_cltk_path("grc/text/grc_text_first1kgreek/data/*/*/*.xml")
    xml_paths = glob.glob(xml_dir)
    if not len(xml_paths):
        logger.error(
            "1K Greek corpus not installed. Use CorpusInstaller to get `First1KGreek`."
        )
        raise FileNotFoundError
    xml_paths = [path for path in xml_paths if "__cts__" not in path]

    # new dir
    new_dir = make_cltk_path("grc/text/grc_text_first1kgreek_plaintext/")
    if not os.path.isdir(new_dir):
        os.makedirs(new_dir)

    for xml_path in xml_paths:
        _, xml_name = os.path.split(xml_path)
        # avoid str.rstrip() here: it strips characters, not the ".xml" suffix
        xml_name = os.path.splitext(xml_name)[0] + ".txt"

        plain_text = ""
        with open(xml_path) as file_open:
            text = CapitainsCtsText(resource=file_open)
            for ref in text.getReffs(level=len(text.citation)):
                psg = text.getTextualNode(subreference=ref, simple=True)
                text_line = psg.export(Mimetypes.PLAINTEXT,
                                       exclude=["tei:note"])
                plain_text += text_line

        new_plaintext_path = os.path.join(new_dir, xml_name)
        with open(new_plaintext_path, "w") as file_open:
            file_open.write(plain_text)
Example #13
    def convert(
        input_path=None,
        output_path=None,
        markup=None,
        rm_newlines=False,
        divide_works=False,
        lat=False,
        extra_args=None,
    ):
        """
        Do conversion.

        :param input_path: TLG filepath to convert.
        :param output_path: filepath of new converted text.
        :param markup: Specificity of inline markup. Default None removes all numerical markup; 'full' gives most detailed, with reference numbers included before each text line.
        :param rm_newlines: No spaces; removes line ends and hyphens before an ID code; hyphens and spaces before page and column ends are retained.
        :param divide_works: Each work (book) is output as a separate file in the form output_file-xxx.txt; if an output file is not specified, this option has no effect.
        :param lat: Primarily Latin text (PHI). Some TLG texts, notably doccan1.txt and doccan2.txt are mostly roman texts lacking explicit language change codes. Setting this option will force a change to Latin text after each citation block is encountered.
        :param extra_args: Any other tlgu args to be passed, in list form and without dashes, e.g.: ['p', 'b', 'B'].

        """
        # setup file paths
        input_path = os.path.expanduser(input_path)
        output_path = os.path.expanduser(output_path)

        # check input path exists
        assert os.path.isfile(input_path), "File {0} does not exist.".format(
            input_path)

        # setup tlgu flags
        tlgu_options = []
        if markup == "full":
            full_args = ["v", "w", "x", "y", "z"]
            tlgu_options.extend(full_args)
        if rm_newlines:
            tlgu_options.append("N")
        if divide_works:
            tlgu_options.append("W")
        if lat:
            tlgu_options.append("r")
        # setup extra args
        if extra_args is None:
            extra_args = []
        else:
            try:
                extra_args = list(extra_args)
            except Exception as exc:
                logger.error("Argument 'extra_args' must be a list: %s.", exc)
                raise
        tlgu_options = tlgu_options + extra_args
        # assemble all tlgu flags
        tlgu_options = list(set(tlgu_options))
        if tlgu_options:
            tlgu_flags = "-" + " -".join(tlgu_options)
        else:
            tlgu_flags = ""
        # make tlgu call
        tlgu_call = "tlgu {0} {1} {2}".format(tlgu_flags, input_path,
                                              output_path)
        logger.info(tlgu_call)
        try:
            p_out = subprocess.call(tlgu_call, shell=True)
            if p_out == 1:
                logger.error("Failed to convert %s to %s.", input_path,
                             output_path)
        except Exception as exc:
            logger.error("Failed to convert %s to %s: %s", input_path,
                         output_path, exc)
            raise
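
The flag assembly in the middle of ``convert()`` reduces to a small pure function. A sketch of just that step; the function name and the sample options are made up.

def assemble_tlgu_flags(options):
    """Deduplicate single-letter options and render them as CLI flags."""
    options = sorted(set(options))
    return "-" + " -".join(options) if options else ""


print(assemble_tlgu_flags(["W", "N", "W"]))  # -N -W
print(repr(assemble_tlgu_flags([])))         # ''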
Example #14
 def convert_corpus(self, corpus, markup=None, lat=None):  # pylint: disable=W0613
     """Look for imported TLG or PHI files and convert them all to
     ``~/cltk_data/grc/text/tlg/<plaintext>``.
     TODO: Add markup options to input.
     TODO: Add rm_newlines, divide_works, and extra_args
     """
     orig_path = make_cltk_path("originals")
     target_path = make_cltk_path()
     assert corpus in [
         "tlg",
         "phi5",
         "phi7",
     ], "Corpus must be 'tlg', 'phi5', or 'phi7'"
     if corpus in ["tlg", "phi5", "phi7"]:
         orig_path = os.path.join(orig_path, corpus)
         if corpus in ["tlg", "phi7"]:
             if "phi7" and lat is True:
                 lat = True
                 target_path = os.path.join(target_path, "lat", "text",
                                            corpus)
             else:
                 lat = None
                 target_path = os.path.join(target_path, "grc", "text",
                                            corpus)
         else:
             target_path = os.path.join(target_path, "lat", "text", corpus)
             lat = True
     try:
         corpus_files = os.listdir(orig_path)
     except Exception as exception:
         logger.error("Failed to find TLG files: %s", exception)
         raise
     # make a list of files to be converted
     txts = [x for x in corpus_files if x.endswith("TXT")]
     # loop through list and convert one at a time
     for txt in txts:
         orig_txt_path = os.path.join(orig_path, txt)
         if markup is None:
             target_txt_dir = os.path.join(target_path, "plaintext")
         else:
             target_txt_dir = os.path.join(target_path, str(markup))
         if not os.path.isdir(target_txt_dir):
             os.makedirs(target_txt_dir)
         target_txt_path = os.path.join(target_txt_dir, txt)
         try:
             self.convert(
                 orig_txt_path,
                 target_txt_path,
                 markup=False,
                 rm_newlines=False,
                 divide_works=False,
                 lat=lat,
                 extra_args=None,
             )
         except Exception as exception:
             logger.error(
                 "Failed to convert file '%s' to '%s': %s",
                 orig_txt_path,
                 target_txt_path,
                 exception,
             )
Example #15
"""
import re
import unicodedata

from nltk.tokenize import wordpunct_tokenize

from cltk.core.cltk_logger import logger
from cltk.prosody.lat import macronizer as m

try:
    # James Tauber's greek_accentuation package
    from greek_accentuation import characters as chars
except ImportError as import_error:
    message = ('Missing "greek_accentuation" package. Install with '
               "`pip install greek-accentuation`.")
    logger.error(message)
    logger.error(import_error)
    raise

__author__ = ["Jack Duff <*****@*****.**>"]
__license__ = "MIT License. See LICENSE."

# Dictionaries of phonological reconstructions for use in transcribing.
# Allen, W. Sidney. 1965. Vox Latina.

LATIN = {
    "Classical": {
        "Allen": {
            "correspondence": {
                "p": "p",
                "t": "t̪",
Example #16
 def _check_install(self):
     """Check if tlgu installed, if not install it."""
     try:
         subprocess.check_output(["which", "tlgu"])
     except subprocess.SubprocessError as sub_err:
         print("TLGU not installed.")
         logger.info("TLGU not installed: %s", sub_err)
         logger.info("Installing TLGU.")
         if not subprocess.check_output(["which", "gcc"]):
             logger.error("GCC seems not to be installed.")
         else:
             tlgu_path = make_cltk_path("grc/software/grc_software_tlgu")
             if self.interactive:
                 install_question = "Do you want to install TLGU?"
                 do_install = query_yes_no(question=install_question)
                 if not do_install:
                     raise CLTKException(
                         "TLGU installation required for this class to work."
                     )
             else:
                 print("Non-interactive installation. Continuing ...")
             command = "cd {0} && make install".format(tlgu_path)
             print(f"Going to run command: ``{command}``")
             try:
                 p_out = subprocess.call(command, shell=True)
             except subprocess.SubprocessError as sub_err:
                 print(
                     "Error executing installation. Going to check output of ``subprocess.call()`` ..."
                 )
                 raise CLTKException(sub_err)
             if p_out == 0:
                 msg = "TLGU installed."
                 print(msg)
                 logger.info(msg)
                 return True
             else:
                 msg = "TLGU install without sudo failed. Going to try again with sudo (usually required for Linux) ..."
                 print(msg)
                 logger.error(msg)
             command = "cd {0} && sudo make install".format(tlgu_path)
             if self.interactive:
                 install_question = "Do you want to install TLGU? with sudo?"
                 do_install = query_yes_no(question=install_question)
                 if not do_install:
                     raise CLTKException(
                         "TLGU installation required for this class to work."
                     )
                 p_out = subprocess.call(command, shell=True)
             else:
                 print("Going to run command:", command)
                 p_out = subprocess.call(command, shell=True)
             if p_out == 0:
                 msg = "TLGU installed."
                 print(msg)
                 logger.info(msg)
             else:
                 msg = "TLGU install with sudo failed."
                 print(msg)
                 logger.error(msg)
                 raise CLTKException(
                     "TLGU installation required for this class to work.")
Example #17
    def import_corpus(self,
                      corpus_name: str,
                      local_path: str = None,
                      branch: str = "master"):
        """Download a remote or load local corpus into dir ``~/cltk_data``.

        TODO: maybe add ``from git import RemoteProgress``
        TODO: refactor this, it's getting kinda long

        :param corpus_name: The name of an available corpus.
        :param local_path: A filepath, required when importing local corpora.
        :param branch: What Git branch to clone.
        """

        matching_corpus_list = [
            _dict for _dict in self.all_corpora_for_lang
            if _dict["name"] == corpus_name
        ]
        if not matching_corpus_list:
            raise CorpusImportError(
                f"No corpus ``{corpus_name}`` for language ``{self.language}``."
            )
        if len(matching_corpus_list) > 1:
            raise CorpusImportError(
                f"Found more than one corpus with the name ``{corpus_name}``.")
        matching_corpus = matching_corpus_list[0]
        if matching_corpus.get("user_defined"):
            """{'origin': 'https://github.com/kylepjohnson/latin_corpus_newton_example.git',
            'type': 'text',
            'name': 'example_distributed_latin_corpus',
            'user_defined': True}
            """
            self._git_user_defined_corpus(
                matching_corpus["name"],
                matching_corpus["type"],
                matching_corpus["origin"],
            )
            return
        elif matching_corpus.get("location") == "local":
            # {'location': 'local', 'name': 'phi5', 'origin': None, 'type': 'text'}
            msg = "Importing from local path: '{}'".format(local_path)
            logger.info(msg)
            if corpus_name not in ["phi5", "phi7", "tlg"]:
                raise CorpusImportError(
                    f"Unsupported local corpus ``{corpus_name}``.")
            if corpus_name == "phi5":
                # normalize path for checking dir
                if local_path.endswith("/"):
                    local_path = local_path[:-1]
                # check for right corpus dir
                if os.path.split(local_path)[1] != "PHI5":
                    logger.info("Directory must be named 'PHI5'.")
            if corpus_name == "phi7":
                # normalize local_path for checking dir
                if local_path.endswith("/"):
                    local_path = local_path[:-1]
                # check for right corpus dir
                if os.path.split(local_path)[1] != "PHI7":
                    logger.info("Directory must be named 'PHI7'.")
            if corpus_name == "tlg":
                # normalize path for checking dir
                if local_path.endswith("/"):
                    local_path = local_path[:-1]
                # check for right corpus dir
                if os.path.split(local_path)[1] != "TLG_E":
                    logger.info("Directory must be named 'TLG_E'.")
            # move the dir-checking commands into a function
            data_dir = os.path.expanduser(CLTK_DATA_DIR)
            originals_dir = os.path.join(data_dir, "originals")
            # check for `originals` dir; if not present mkdir
            if not os.path.isdir(originals_dir):
                os.makedirs(originals_dir)
                msg = "Wrote directory at '{}'.".format(originals_dir)
                logger.info(msg)
            tlg_originals_dir = os.path.join(data_dir, "originals",
                                             corpus_name)
            # check for `originals/<corpus_name>`; if pres, delete
            if os.path.isdir(tlg_originals_dir):
                shutil.rmtree(tlg_originals_dir)
                msg = "Removed directory at '{}'.".format(tlg_originals_dir)
                logger.info(msg)
            # copy_dir requires that target
            if not os.path.isdir(tlg_originals_dir):
                self._copy_dir_recursive(local_path, tlg_originals_dir)
        else:
            """{'type': 'text',
            'name': 'lat_text_perseus',
            'origin': 'https://github.com/cltk/lat_text_perseus.git'},
            """
            if (not matching_corpus.get("type")
                    and not matching_corpus.get("name")
                    and not matching_corpus.get("origin")):
                # note: FetchCorpus is the importer class, not an exception type
                raise CorpusImportError(f"Malformed record for ``{corpus_name}``.")
            git_uri = matching_corpus["origin"]
            type_dir_rel = os.path.join(CLTK_DATA_DIR, self.language,
                                        matching_corpus["type"])
            type_dir = os.path.expanduser(type_dir_rel)
            target_dir = os.path.join(type_dir, corpus_name)
            target_file = os.path.join(type_dir, corpus_name, "README.md")
            # check if corpus already present
            # if not, clone
            if not os.path.isfile(target_file):
                if not os.path.isdir(type_dir):
                    os.makedirs(type_dir)
                try:
                    msg = "Cloning '{}' from '{}'".format(corpus_name, git_uri)
                    logger.info(msg)
                    Repo.clone_from(
                        git_uri,
                        target_dir,
                        branch=branch,
                        depth=1,
                        progress=ProgressPrinter(),
                    )
                except CorpusImportError as corpus_imp_err:
                    msg = "Git clone of '{}' failed: '{}'".format(
                        git_uri, corpus_imp_err)
                    logger.error(msg)
            # if corpus is present, pull latest
            else:
                try:
                    repo = Repo(target_dir)
                    assert not repo.bare  # or: assert repo.exists()
                    git_origin = repo.remotes.origin
                    msg = "Pulling latest '{}' from '{}'.".format(
                        corpus_name, git_uri)
                    logger.info(msg)
                    git_origin.pull()
                except CorpusImportError as corpus_imp_err:
                    msg = "Git pull of '{}' failed: '{}'".format(
                        git_uri, corpus_imp_err)
                    logger.error(msg)