Beispiel #1
0
def onekgreek_tei_xml_to_text_capitains():
    """Use MyCapitains program to convert TEI to plaintext."""
    file = os.path.expanduser(
        '~/cltk_data/greek/text/greek_text_first1kgreek/data/tlg0627/tlg021/tlg0627.tlg021.1st1K-grc1.xml')
    xml_dir = os.path.expanduser('~/cltk_data/greek/text/greek_text_first1kgreek/data/*/*/*.xml')
    xml_paths = glob.glob(xml_dir)
    if not len(xml_paths):
        logger.error('1K Greek corpus not installed. Use CorpusInstaller to get `First1KGreek`.')
        raise FileNotFoundError
    xml_paths = [path for path in xml_paths if '__cts__' not in path]

    # new dir
    new_dir = os.path.expanduser('~/cltk_data/greek/text/greek_text_first1kgreek_plaintext/')
    if not os.path.isdir(new_dir):
        os.makedirs(new_dir)

    for xml_path in xml_paths:
        _, xml_name = os.path.split(xml_path)
        xml_name = xml_name.rstrip('.xml')
        xml_name += '.txt'

        plain_text = ''
        with open(xml_path) as file_open:
            text = CapitainsCtsText(resource=file_open)
            for ref in text.getReffs(level=len(text.citation)):
                psg = text.getTextualNode(subreference=ref, simple=True)
                text_line = psg.export(Mimetypes.PLAINTEXT, exclude=["tei:note"])
                plain_text += text_line

        new_plaintext_path = os.path.join(new_dir, xml_name)
        with open(new_plaintext_path, 'w') as file_open:
            file_open.write(plain_text)
Beispiel #2
0
def onekgreek_tei_xml_to_text():
    """Find TEI XML dir of TEI XML for the First 1k Years of Greek corpus."""
    if not bs4_installed:
        logger.error('Install `bs4` and `lxml` to parse these TEI files.')
        raise ImportError
    xml_dir = os.path.expanduser('~/cltk_data/greek/text/greek_text_first1kgreek/data/*/*/*.xml')
    xml_paths = glob.glob(xml_dir)
    if not len(xml_paths):
        logger.error('1K Greek corpus not installed. Use CorpusInstaller to get `First1KGreek`.')
        raise FileNotFoundError
    xml_paths = [path for path in xml_paths if '__cts__' not in path]

    # new dir
    new_dir = os.path.expanduser('~/cltk_data/greek/text/greek_text_first1kgreek_plaintext/')
    if not os.path.isdir(new_dir):
        os.makedirs(new_dir)

    for xml_path in xml_paths:
        _, xml_name = os.path.split(xml_path)
        xml_name = xml_name.rstrip('.xml')
        xml_name += '.txt'
        with open(xml_path) as file_open:
            soup = BeautifulSoup(file_open, 'lxml')
        body = soup.body
        text = body.get_text()
        new_plaintext_path = os.path.join(new_dir, xml_name)
        with open(new_plaintext_path, 'w') as file_open:
            file_open.write(text)
Beispiel #3
0
    def find_alliteration(self):
        """
        Find alliterations in the complete verse.
        :return:
        """
        if len(self.phonological_features_text) == 0:
            logger.error("No phonological transcription found")
            raise ValueError
        else:
            first_sounds = []
            for i, line in enumerate(self.phonological_features_text):
                first_sounds.append([])
                for j, short_line in enumerate(line):
                    first_sounds[i].append([])
                    for viisuord in short_line:
                        first_sounds[i][j].append(viisuord[0])

            verse_alliterations = []
            n_alliterations_lines = []
            for i, first_sound_line in enumerate(first_sounds):
                if isinstance(self.long_lines[i][0], ShortLine) and isinstance(self.long_lines[i][1], ShortLine):
                    self.long_lines[i][0].get_first_sounds()
                    self.long_lines[i][1].get_first_sounds()
                    alli, counter = self.long_lines[i][0].find_alliterations(self.long_lines[i][1])
                    verse_alliterations.append(alli)
                    n_alliterations_lines.append(counter)
                elif isinstance(self.long_lines[i][0], LongLine):
                    self.long_lines[i][0].get_first_sounds()
                    alli, counter = self.long_lines[i][0].find_alliterations()
                    verse_alliterations.append(alli)
                    n_alliterations_lines.append(counter)
            return verse_alliterations, n_alliterations_lines
Beispiel #4
0
 def syllabify(self):
     """
     Syllables may play a role in verse classification.
     """
     if len(self.long_lines) == 0:
         logger.error("No text was imported")
         self.syllabified_text = []
     else:
         syllabifier = Syllabifier(language="old_norse", break_geminants=True)
         syllabified_text = []
         for i, line in enumerate(self.long_lines):
             syllabified_text.append([])
             for j, viisuordh in enumerate(line):
                 syllabified_text[i].append([])
                 words = []
                 for word in tokenize_old_norse_words(viisuordh):
                     # punctuation is not necessary here
                     word = word.replace(",", "")
                     word = word.replace(".", "")
                     word = word.replace(";", "")
                     word = word.replace("!", "")
                     word = word.replace("?", "")
                     word = word.replace("-", "")
                     word = word.replace(":", "")
                     if word != '':
                         words.append(syllabifier.syllabify(word.lower()))
                 syllabified_text[i][j].append(words)
         self.syllabified_text = syllabified_text
Beispiel #5
0
    def divide_works(self, corpus):
        """Use the work-breaking option.
        TODO: Maybe incorporate this into ``convert_corpus()``
        TODO: Write test for this
        """
        if corpus == 'tlg':
            orig_dir_rel = '~/cltk_data/originals/tlg'
            works_dir_rel = '~/cltk_data/greek/text/tlg/individual_works'
            file_prefix = 'TLG'
            latin = False
        elif corpus == 'phi5':
            orig_dir_rel = '~/cltk_data/originals/phi5'
            works_dir_rel = '~/cltk_data/latin/text/phi5/individual_works'
            file_prefix = 'LAT'
            latin = True  # this is for the optional TLGU argument to convert()

        orig_dir = os.path.expanduser(orig_dir_rel)
        works_dir = os.path.expanduser(works_dir_rel)
        if not os.path.exists(works_dir):
            os.makedirs(works_dir)

        files = os.listdir(orig_dir)
        texts = [x for x in files if x.endswith('.TXT') and x.startswith(file_prefix)]

        for file in texts:
            orig_file_path = os.path.join(orig_dir, file)
            new_file_path = os.path.join(works_dir, file)

            try:
                self.convert(orig_file_path, new_file_path, divide_works=True, latin=latin)
                logger.info('Writing files at %s to %s.', orig_file_path, works_dir)
            except Exception as err:
                logger.error('Failed to convert files: %s.', err)
Beispiel #6
0
def open_pickle(path: str):
    """Open a pickle and return loaded pickle object.
    :type path: str
    :param : path: File path to pickle file to be opened.
    :rtype : object
    """
    try:
        with open(path, 'rb') as opened_pickle:
            try:
                return pickle.load(opened_pickle)
            except Exception as pickle_error:
                logger.error(pickle_error)
                raise
    except FileNotFoundError as fnf_error:
        logger.error(fnf_error)
        raise
    except IOError as io_err:
        logger.error(io_err)
        raise
    except EOFError as eof_error:
        logger.error(eof_error)
        raise
    except pickle.UnpicklingError as unp_error:
        logger.error(unp_error)
        raise
Beispiel #7
0
 def list_corpora(self):
     """Show corpora available for the CLTK to download."""
     try:
         corpora = LANGUAGE_CORPORA[self.language]
         corpus_names = [corpus['name'] for corpus in corpora]
         return corpus_names
     except (NameError, KeyError) as error:
         msg = 'Corpus not available for language "{}": {}'.format(self.language, error)
         logger.error(msg)
         raise CorpusImportError(msg)
Beispiel #8
0
    def list_corpora(self):
        """Show corpora available for the CLTK to download."""
        try:
            corpora = LANGUAGE_CORPORA[self.language]
        except NameError as name_error:
            logger.error('Corpus not available for language %s: %s', (self.language, name_error))

        corpus_list = []
        for corpus in corpora:
            corpus_list.append(corpus['name'])
        return corpus_list
Beispiel #9
0
 def _check_import_source():
     """Check if tlgu imported, if not import it."""
     path_rel = '~/cltk_data/greek/software/greek_software_tlgu/tlgu.h'
     path = os.path.expanduser(path_rel)
     if not os.path.isfile(path):
         try:
             corpus_importer = CorpusImporter('greek')
             corpus_importer.import_corpus('tlgu')
         except Exception as exc:
             logger.error('Failed to import TLGU: %s', exc)
             raise
Beispiel #10
0
 def _check_install(self):
     """Check if tlgu installed, if not install it."""
     try:
         subprocess.check_output(['which', 'tlgu'])
     except Exception as exc:
         logger.info('TLGU not installed: %s', exc)
         logger.info('Installing TLGU.')
         if not subprocess.check_output(['which', 'gcc']):
             logger.error('GCC seems not to be installed.')
         else:
             tlgu_path_rel = '~/cltk_data/greek/software/greek_software_tlgu'
             tlgu_path = os.path.expanduser(tlgu_path_rel)
             if not self.testing:
                 print('Do you want to install TLGU? To continue, press Return. To exit, Control-C.')
                 input()
             else:
                 print('Automated or test build, skipping keyboard input confirmation for installation of TLGU.')
             try:
                 p_out = subprocess.call('cd {0} && make install'.format(tlgu_path), shell=True)
                 if p_out == 0:
                     logger.info('TLGU installed.')
                 else:
                     logger.error('TLGU install without sudo failed.')
             except Exception as exc:
                 logger.error('TLGU install failed: %s', exc)
             else:  # for Linux needing root access to '/usr/local/bin'
                 p_out = subprocess.call('cd {0} && sudo make install'.format(tlgu_path), shell=True)
                 if p_out == 0:
                     logger.info('TLGU installed.')
                 else:
                     logger.error('TLGU install with sudo failed.')
Beispiel #11
0
 def _check_install():
     """Check if tlgu installed, if not install it."""
     try:
         subprocess.check_output(['which', 'tlgu'])
     except Exception as exc:
         logger.info('TLGU not installed: %s', exc)
         logger.info('Installing TLGU.')
         if not subprocess.check_output(['which', 'gcc']):
             logger.error('GCC seems not to be installed.')
         else:
             tlgu_path_rel = '~/cltk_data/greek/software/greek_software_tlgu'
             tlgu_path = os.path.expanduser(tlgu_path_rel)
             try:
                 p_out = subprocess.call('cd {0} && make install'.format(tlgu_path), shell=True)
                 if p_out == 0:
                     logger.info('TLGU installed.')
                 else:
                     logger.error('TLGU install without sudo failed.')
             except Exception as exc:
                 logger.error('TLGU install failed: %s', exc)
             else:  # for Linux needing root access to '/usr/local/bin'
                 p_out = subprocess.call('cd {0} && sudo make install'.format(tlgu_path), shell=True)
                 if p_out == 0:
                     logger.info('TLGU installed.')
                 else:
                     logger.error('TLGU install with sudo failed.')
Beispiel #12
0
 def to_phonetics(self):
     """
     Transcribing words in verse helps find alliteration.
     """
     if len(self.long_lines) == 0:
         logger.error("No text was imported")
         self.syllabified_text = []
     else:
         transcriber = Transcriber(DIPHTHONGS_IPA, DIPHTHONGS_IPA_class, IPA_class, old_norse_rules)
         transcribed_text = []
         for i, line in enumerate(self.long_lines):
             transcribed_text.append([])
             for viisuordh in line:
                 transcribed_text[i].append(transcriber.main(viisuordh))
         self.transcribed_text = transcribed_text
Beispiel #13
0
 def convert_corpus(self, corpus, markup=None, break_lines=False, divide_works=False, latin=None, extra_args=None):  # pylint: disable=W0613
     """Look for imported TLG or PHI files and convert them all to
     ``~/cltk_data/greek/text/tlg/<plaintext>``.
     TODO: Should this and/or convert() be static?
     TODO: Add markup options to input.
     TODO: Do something with break_lines, divide_works, and extra_args or rm them
     """
     orig_path_rel = '~/cltk_data/originals'
     orig_path = os.path.expanduser(orig_path_rel)
     target_path_rel = '~/cltk_data'
     target_path = os.path.expanduser(target_path_rel)
     assert corpus in ['tlg', 'phi5', 'phi7'], "Corpus must be 'tlg', 'phi5', or 'phi7'"
     if corpus in ['tlg', 'phi5', 'phi7']:
         orig_path = os.path.join(orig_path, corpus)
         if corpus in ['tlg', 'phi7']:
             if 'phi7' and latin is True:
                 latin = True
                 target_path = os.path.join(target_path, 'latin', 'text', corpus)
             else:
                 latin = None
                 target_path = os.path.join(target_path, 'greek', 'text', corpus)
         else:
             target_path = os.path.join(target_path, 'latin', 'text', corpus)
             latin = True
     try:
         corpus_files = os.listdir(orig_path)
     except Exception as exception:
         logger.error("Failed to find TLG files: %s", exception)
         raise
     # make a list of files to be converted
     txts = []
     [txts.append(x) for x in corpus_files if x.endswith('TXT')]  # pylint: disable=W0106
     # loop through list and convert one at a time
     for txt in txts:
         orig_txt_path = os.path.join(orig_path, txt)
         if markup is None:
             target_txt_dir = os.path.join(target_path, 'plaintext')
         else:
             target_txt_dir = os.path.join(target_path, str(markup))
         if not os.path.isdir(target_txt_dir):
             os.makedirs(target_txt_dir)
         target_txt_path = os.path.join(target_txt_dir, txt)
         try:
             self.convert(orig_txt_path, target_txt_path, markup=None,
                          break_lines=False, divide_works=False, latin=latin,
                          extra_args=None)
         except Exception as exception:
             logger.error("Failed to convert file '%s' to '%s': %s", orig_txt_path, target_txt_path, exception)
Beispiel #14
0
 def write_concordance_from_string(self, text, name):
     """A reworkinng of write_concordance_from_file(). Refactor these."""
     list_of_lists = self._build_concordance(text)
     user_data_rel = '~/cltk_data/user_data'
     user_data = os.path.expanduser(user_data_rel)
     if not os.path.isdir(user_data):
         os.makedirs(user_data)
     file_path = os.path.join(user_data, 'concordance_' + name + '.txt')
     concordance_output = ''
     for word_list in list_of_lists:
         for line in word_list:
             concordance_output += line + '\n'
     try:
         with open(file_path, 'w') as open_file:
             open_file.write(concordance_output)
             logger.info("Wrote concordance to '%s'." % file_path)
     except IOError as io_error:
         logger.error("Failed to write concordance to '%s'." % file_path)
Beispiel #15
0
    def ratio(string_a, string_b):
        """At the most basic level, return a Levenshtein distance ratio via
        fuzzywuzzy.
        :param string_a: str
        :param string_b: str
        :return: float
        """
        from cltk.utils.cltk_logger import logger
        try:
            from fuzzywuzzy import fuzz

        except ImportError as imp_err:  # pragma: no cover
            message = "'fuzzywuzzy' library required for this module: %s. Install with `pip install fuzzywuzzy python-Levenshtein`" % imp_err
            logger.error(message)
            print(message)
            raise ImportError

        return fuzz.ratio(string_a, string_b) / 100
Beispiel #16
0
 def syllabify(self, hierarchy):
     """
     Syllables may play a role in verse classification.
     """
     if len(self.long_lines) == 0:
         logger.error("No text was imported")
         self.syllabified_text = []
     else:
         syllabifier = Syllabifier(language="old_norse", break_geminants=True)
         syllabifier.set_hierarchy(hierarchy)
         syllabified_text = []
         for i, long_line in enumerate(self.long_lines):
             syllabified_text.append([])
             for short_line in long_line:
                 assert isinstance(short_line, ShortLine) or isinstance(short_line, LongLine)
                 short_line.syllabify(syllabifier)
                 syllabified_text[i].append(short_line.syllabified)
         self.syllabified_text = syllabified_text
Beispiel #17
0
 def _check_corpus_availability(self, corpus_name):
     """Check whether a corpus is available for import.
     :type corpus_name: str
     :param corpus_name: Name of available corpus.
     :rtype : str
     """
     try:
         corpora = LANGUAGE_CORPORA[self.language]
     except NameError as name_error:
         logger.error('Corpus not available for language %s: %s', (self.language, name_error))
     corpus_properties = None
     for corpus in corpora:
         if corpus['name'] == corpus_name:
             corpus_properties = corpus
     if not corpus_properties:
         logger.info("Corpus '%s' not available for the '%s' language.",
                     corpus_name,
                     self.language)
     return corpus_properties
Beispiel #18
0
    def divide_works(self, corpus):
        """Use the work-breaking option.
        TODO: Maybe incorporate this into ``convert_corpus()``
        TODO: Write test for this
        """
        if corpus == 'tlg':
            orig_dir_rel = get_cltk_data_dir() + '/originals/tlg'
            works_dir_rel = get_cltk_data_dir(
            ) + '/greek/text/tlg/individual_works'
            file_prefix = 'TLG'
            latin = False
        elif corpus == 'phi5':
            orig_dir_rel = get_cltk_data_dir() + '/originals/phi5'
            works_dir_rel = get_cltk_data_dir(
            ) + '/latin/text/phi5/individual_works'
            file_prefix = 'LAT'
            latin = True  # this is for the optional TLGU argument to convert()

        orig_dir = os.path.expanduser(orig_dir_rel)
        works_dir = os.path.expanduser(works_dir_rel)
        if not os.path.exists(works_dir):
            os.makedirs(works_dir)

        files = os.listdir(orig_dir)
        texts = [
            x for x in files
            if x.endswith('.TXT') and x.startswith(file_prefix)
        ]

        for file in texts:
            orig_file_path = os.path.join(orig_dir, file)
            new_file_path = os.path.join(works_dir, file)

            try:
                self.convert(orig_file_path,
                             new_file_path,
                             divide_works=True,
                             latin=latin)
                logger.info('Writing files at %s to %s.', orig_file_path,
                            works_dir)
            except Exception as err:
                logger.error('Failed to convert files: %s.', err)
Beispiel #19
0
    def from_regular_expression(re_rule, estimated_sound, ipa_class):
        """

        :param re_rule: pattern (first argument of re.sub)
        :param estimated_sound: an IPA character (second argument of re.sub)
        :param ipa_class: dict whose keys are IPA characters and values are Vowel or Consonant instances
        :return: corresponding Rule instance
        """
        assert len(re_rule) > 0
        if re_rule[0] == "^":
            place = Rank.first
        elif re_rule[-1] == "$":
            place = Rank.last
        else:
            place = Rank.inner

        before_pattern = r"(?<=\(\?\<\=\[)\w*"
        core_pattern = r"(?<=\))\w(?=\(\?\=)|(?<=\^)\w(?=\(\?\=)|(?<=\))\w(?=\$)"
        after_pattern = r"(?<=\(\?\=\[)\w*"
        before_search = re.search(before_pattern, re_rule)
        core_search = re.search(core_pattern, re_rule)
        after_search = re.search(after_pattern, re_rule)
        if before_search is None:
            before = None
        else:
            before = [
                ipa_class[ipar].to_abstract()
                for ipar in before_search.group(0)
            ]
        if core_search is not None:
            core = ipa_class[core_search.group(0)]
        else:
            logger.error("No core")
            raise ValueError
        if after_search is None:
            after = None
        else:
            after = [
                ipa_class[ipar].to_abstract() for ipar in after_search.group(0)
            ]
        abstract_position = AbstractPosition(place, before, after)
        return Rule(abstract_position, core, ipa_class[estimated_sound])
Beispiel #20
0
 def syllabify(self, hierarchy):
     """
     Syllables may play a role in verse classification.
     """
     if len(self.long_lines) == 0:
         logger.error("No text was imported")
         self.syllabified_text = []
     else:
         syllabifier = Syllabifier(language="old_norse",
                                   break_geminants=True)
         syllabifier.set_hierarchy(hierarchy)
         syllabified_text = []
         for i, long_line in enumerate(self.long_lines):
             syllabified_text.append([])
             for short_line in long_line:
                 assert isinstance(short_line, ShortLine) or isinstance(
                     short_line, LongLine)
                 short_line.syllabify(syllabifier)
                 syllabified_text[i].append(short_line.syllabified)
         self.syllabified_text = syllabified_text
Beispiel #21
0
 def _get_corpus_properties(self, corpus_name):
     """Check whether a corpus is available for import.
     :type corpus_name: str
     :param corpus_name: Name of available corpus.
     :rtype : str
     """
     try:
         corpora = LANGUAGE_CORPORA[self.language]
     except NameError as name_error:
         msg = 'Corpus not available for language ' \
               '"%s": %s' % (self.language, name_error)
         logger.error(msg)
         raise CorpusImportError(msg)
     for corpus_properties in corpora:
         if corpus_properties['name'] == corpus_name:
             return corpus_properties
     msg = 'Corpus "%s" not available for the ' \
           '"%s" language.' % (corpus_name, self.language)
     logger.error(msg)
     raise CorpusImportError(msg)
Beispiel #22
0
def write_concordance_from_string(text: str, name: str) -> None:
    """A reworkinng of write_concordance_from_file(). Refactor these."""
    list_of_lists = build_concordance(text)  # type: List[List[str]]
    user_data_rel = '~/cltk_data/user_data'  # type: str
    user_data = os.path.expanduser(user_data_rel)  # type: str
    if not os.path.isdir(user_data):
        os.makedirs(user_data)
    file_path = os.path.join(user_data,
                             'concordance_' + name + '.txt')  # type: str
    concordance_output = ''  # type: str
    for word_list in list_of_lists:
        for line in word_list:
            concordance_output += line + '\n'
    try:
        with open(file_path, 'w') as open_file:
            open_file.write(concordance_output)
            logger.info("Wrote concordance to '%s'.", file_path)
    except IOError as io_error:
        logger.error("Failed to write concordance to '%s'. Error: %s",
                     file_path, io_error)
Beispiel #23
0
 def _get_corpus_properties(self, corpus_name):
     """Check whether a corpus is available for import.
     :type corpus_name: str
     :param corpus_name: Name of available corpus.
     :rtype : str
     """
     try:
         corpora = LANGUAGE_CORPORA[self.language]
     except NameError as name_error:
         msg = 'Corpus not available for language ' \
               '"%s": %s' % (self.language, name_error)
         logger.error(msg)
         raise CorpusImportError(msg)
     for corpus_properties in corpora:
         if corpus_properties['name'] == corpus_name:
             return corpus_properties
     msg = 'Corpus "%s" not available for the ' \
           '"%s" language.' % (corpus_name, self.language)
     logger.error(msg)
     raise CorpusImportError(msg)
Beispiel #24
0
 def _check_install(self):
     """Check if tlgu installed, if not install it."""
     try:
         subprocess.check_output(['which', 'tlgu'])
     except Exception as exc:
         logger.info('TLGU not installed: %s', exc)
         logger.info('Installing TLGU.')
         if not subprocess.check_output(['which', 'gcc']):
             logger.error('GCC seems not to be installed.')
         else:
             tlgu_path_rel = get_cltk_data_dir(
             ) + '/greek/software/greek_software_tlgu'
             tlgu_path = os.path.expanduser(tlgu_path_rel)
             if not self.testing:
                 print('Do you want to install TLGU?')
                 print('To continue, press Return. To exit, Control-C.')
                 input()
             else:
                 print(
                     'Automated or test build, skipping keyboard input confirmation for installation of TLGU.'
                 )
             try:
                 command = 'cd {0} && make install'.format(tlgu_path)
                 print('Going to run command:', command)
                 p_out = subprocess.call(command, shell=True)
                 if p_out == 0:
                     logger.info('TLGU installed.')
                 else:
                     logger.error('TLGU install without sudo failed.')
             except Exception as exc:
                 logger.error('TLGU install failed: %s', exc)
             else:  # for Linux needing root access to '/usr/local/bin'
                 if not self.testing:
                     print(
                         'Could not install without root access. Do you want to install TLGU with sudo?'
                     )
                     command = 'cd {0} && sudo make install'.format(
                         tlgu_path)
                     print('Going to run command:', command)
                     print('To continue, press Return. To exit, Control-C.')
                     input()
                     p_out = subprocess.call(command, shell=True)
                 else:
                     command = 'cd {0} && sudo make install'.format(
                         tlgu_path)
                     p_out = subprocess.call(command, shell=True)
                 if p_out == 0:
                     logger.info('TLGU installed.')
                 else:
                     logger.error('TLGU install with sudo failed.')
Beispiel #25
0
def onekgreek_tei_xml_to_text_capitains():
    """Use MyCapitains program to convert TEI to plaintext."""
    file = os.path.expanduser(
        get_cltk_data_dir() +
        '/greek/text/greek_text_first1kgreek/data/tlg0627/tlg021/tlg0627.tlg021.1st1K-grc1.xml'
    )
    xml_dir = os.path.normpath(
        get_cltk_data_dir() +
        '/greek/text/greek_text_first1kgreek/data/*/*/*.xml')
    xml_paths = glob.glob(xml_dir)
    if not len(xml_paths):
        logger.error(
            '1K Greek corpus not installed. Use CorpusInstaller to get `First1KGreek`.'
        )
        raise FileNotFoundError
    xml_paths = [path for path in xml_paths if '__cts__' not in path]

    # new dir
    new_dir = os.path.normpath(
        get_cltk_data_dir() + '/greek/text/greek_text_first1kgreek_plaintext/')
    if not os.path.isdir(new_dir):
        os.makedirs(new_dir)

    for xml_path in xml_paths:
        _, xml_name = os.path.split(xml_path)
        xml_name = xml_name.rstrip('.xml')
        xml_name += '.txt'

        plain_text = ''
        with open(xml_path) as file_open:
            text = CapitainsCtsText(resource=file_open)
            for ref in text.getReffs(level=len(text.citation)):
                psg = text.getTextualNode(subreference=ref, simple=True)
                text_line = psg.export(Mimetypes.PLAINTEXT,
                                       exclude=["tei:note"])
                plain_text += text_line

        new_plaintext_path = os.path.join(new_dir, xml_name)
        with open(new_plaintext_path, 'w') as file_open:
            file_open.write(plain_text)
Beispiel #26
0
    def to_phonetics(self):
        """
        Transcribing words in verse helps find alliteration.
        """
        if len(self.long_lines) == 0:
            logger.error("No text was imported")
            self.syllabified_text = []
        else:
            transcriber = Transcriber(DIPHTHONGS_IPA, DIPHTHONGS_IPA_class, IPA_class, old_norse_rules)
            transcribed_text = []
            phonological_features_text = []
            for i, long_line in enumerate(self.long_lines):
                transcribed_text.append([])
                phonological_features_text.append([])
                for short_line in long_line:
                    assert isinstance(short_line, ShortLine) or isinstance(short_line, LongLine)
                    short_line.to_phonetics(transcriber)
                    transcribed_text[i].append(short_line.transcribed)
                    phonological_features_text[i].append(short_line.phonological_features_text)

            self.transcribed_text = transcribed_text
            self.phonological_features_text = phonological_features_text
Beispiel #27
0
    def to_phonetics(self):
        """
        Transcribing words in verse helps find alliteration.
        """
        if len(self.long_lines) == 0:
            logger.error("No text was imported")
            self.syllabified_text = []
        else:
            transcriber = Transcriber(DIPHTHONGS_IPA, DIPHTHONGS_IPA_class, IPA_class, old_norse_rules)
            transcribed_text = []
            phonological_features_text = []
            for i, long_line in enumerate(self.long_lines):
                transcribed_text.append([])
                phonological_features_text.append([])
                for short_line in long_line:
                    assert isinstance(short_line, ShortLine) or isinstance(short_line, LongLine)
                    short_line.to_phonetics(transcriber)
                    transcribed_text[i].append(short_line.transcribed)
                    phonological_features_text[i].append(short_line.phonological_features_text)

            self.transcribed_text = transcribed_text
            self.phonological_features_text = phonological_features_text
Beispiel #28
0
 def _git_user_defined_corpus(self, corpus_name, corpus_type, uri:str, branch='master'):
     """Clone or update a git repo defined by user.
     TODO: This code is very redundant with what's in import_corpus(),
     could be refactored.
     """
     # git_uri = urljoin('https://github.com/cltk/', corpus_name + '.git')
     # self._download_corpus(corpus_type, corpus_name, path)
     type_dir_rel = os.path.join(CLTK_DATA_DIR, self.language, corpus_type)
     type_dir = os.path.expanduser(type_dir_rel)
     repo_name = uri.split('/')[-1]  # eg, 'latin_corpus_newton_example.git'
     repo_name = repo_name.rstrip('.git')
     target_dir = os.path.join(type_dir, repo_name)
     target_file = os.path.join(type_dir, repo_name, 'README.md')
     # check if corpus already present
     # if not, clone
     if not os.path.isfile(target_file):
         if not os.path.isdir(type_dir):
             os.makedirs(type_dir)
         try:
             msg = "Cloning '{}' from '{}'".format(corpus_name, uri)
             logger.info(msg)
             Repo.clone_from(uri, target_dir, branch=branch, depth=1,
                             progress=ProgressPrinter())
         except CorpusImportError as corpus_imp_err:
             msg = "Git clone of '{}' failed: '{}'".format(uri, corpus_imp_err)
             logger.error(msg)
     # if corpus is present, pull latest
     else:
         try:
             repo = Repo(target_dir)
             assert not repo.bare  # or: assert repo.exists()
             git_origin = repo.remotes.origin
             msg = "Pulling latest '{}' from '{}'.".format(corpus_name, uri)
             logger.info(msg)
             git_origin.pull()
         except CorpusImportError as corpus_imp_err:
             msg = "Git pull of '{}' failed: '{}'".format(uri, corpus_imp_err)
             logger.error(msg)
Beispiel #29
0
 def _git_user_defined_corpus(self, corpus_name, corpus_type, uri:str, branch='master'):
     """Clone or update a git repo defined by user.
     TODO: This code is very redundant with what's in import_corpus(),
     could be refactored.
     """
     # git_uri = urljoin('https://github.com/cltk/', corpus_name + '.git')
     # self._download_corpus(corpus_type, corpus_name, path)
     type_dir_rel = os.path.join(CLTK_DATA_DIR, self.language, corpus_type)
     type_dir = os.path.expanduser(type_dir_rel)
     repo_name = uri.split('/')[-1]  # eg, 'latin_corpus_newton_example.git'
     repo_name = repo_name.rstrip('.git')
     target_dir = os.path.join(type_dir, repo_name)
     target_file = os.path.join(type_dir, repo_name, 'README.md')
     # check if corpus already present
     # if not, clone
     if not os.path.isfile(target_file):
         if not os.path.isdir(type_dir):
             os.makedirs(type_dir)
         try:
             msg = "Cloning '{}' from '{}'".format(corpus_name, uri)
             logger.info(msg)
             Repo.clone_from(uri, target_dir, branch=branch, depth=1,
                             progress=ProgressPrinter())
         except CorpusImportError as corpus_imp_err:
             msg = "Git clone of '{}' failed: '{}'".format(uri, corpus_imp_err)
             logger.error(msg)
     # if corpus is present, pull latest
     else:
         try:
             repo = Repo(target_dir)
             assert not repo.bare  # or: assert repo.exists()
             git_origin = repo.remotes.origin
             msg = "Pulling latest '{}' from '{}'.".format(corpus_name, uri)
             logger.info(msg)
             git_origin.pull()
         except CorpusImportError as corpus_imp_err:
             msg = "Git pull of '{}' failed: '{}'".format(uri, corpus_imp_err)
             logger.error(msg)
Beispiel #30
0
def write_concordance_from_file(filepaths: Union[str, List[str]],
                                name: str) -> None:
    """This calls the modified ConcordanceIndex, taken and modified from
    the NLTK, and writes to disk a file named 'concordance_' + name at
    '~/cltk_data/user_data/'.

    TODO: Add language (here or in class), lowercase option, stemming/
    lemmatization, else?

    :type filepaths: str or list
    :param filepaths: Filepath of text(s) to be used in concordance.
    :rtype : str
    """
    assert isinstance(filepaths, (str, list))
    if isinstance(filepaths, str):
        filepath = filepaths  # type: str
        text = read_file(filepath)  # type: str
    elif isinstance(filepaths, list):
        text = ''
        for filepath in filepaths:
            text += read_file(filepath)
    list_of_lists = build_concordance(text)  # type: List[List[str]]
    user_data_rel = '~/cltk_data/user_data'  # type: str
    user_data = os.path.expanduser(user_data_rel)  # type: str
    if not os.path.isdir(user_data):
        os.makedirs(user_data)
    file_path = os.path.join(user_data, 'concordance_' + name + '.txt')
    concordance_output = ''  # type: str
    for word_list in list_of_lists:
        for line in word_list:
            concordance_output += line + '\n'
    try:
        with open(file_path, 'w') as open_file:  # type: IO
            open_file.write(concordance_output)
            logger.info("Wrote concordance to '%s'.", file_path)
    except IOError as io_error:
        logger.error("Failed to write concordance to '%s'. Error: %s",
                     file_path, io_error)
Beispiel #31
0
    def lemmatize(self, input_text, return_raw=False, return_string=False):
        """Take incoming string or list of tokens. Lookup done against a
        key-value list of lemmata-headword. If a string, tokenize with
        ``PunktLanguageVars()``. If a final period appears on a token, remove
        it, then re-add once replacement done.
        TODO: rm check for final period, change PunktLanguageVars() to nltk_tokenize_words()
        """
        assert type(input_text) in [list, str], \
            logger.error('Input must be a list or string.')
        if type(input_text) is str:
            punkt = PunktLanguageVars()
            tokens = punkt.word_tokenize(input_text)
        else:
            tokens = input_text

        lemmatized_tokens = []
        for token in tokens:
            # check for final period
            final_period = False
            if token[-1] == '.':
                final_period = True
                token = token[:-1]

            # look for token in lemma dict keys
            if token.lower() in self.lemmata.keys():
                headword = self.lemmata[token.lower()]

                # re-add final period if rm'd
                if final_period:
                    headword += '.'

                # append to return list
                if not return_raw:
                    lemmatized_tokens.append(headword)
                else:
                    lemmatized_tokens.append(token + '/' + headword)
            # if token not found in lemma-headword list
            else:
                # re-add final period if rm'd
                if final_period:
                    token += '.'

                if not return_raw:
                    lemmatized_tokens.append(token)
                else:
                    lemmatized_tokens.append(token + '/' + token)
        if not return_string:
            return lemmatized_tokens
        elif return_string:
            return ' '.join(lemmatized_tokens)
Beispiel #32
0
    def lemmatize(self, input_text, return_raw=False, return_string=False):
        """Take incoming string or list of tokens. Lookup done against a
        key-value list of lemmata-headword. If a string, tokenize with
        ``PunktLanguageVars()``. If a final period appears on a token, remove
        it, then re-add once replacement done.
        TODO: rm check for final period, change PunktLanguageVars() to nltk_tokenize_words()
        """
        assert type(input_text) in [list, str], \
            logger.error('Input must be a list or string.')
        if type(input_text) is str:
            punkt = PunktLanguageVars()
            tokens = punkt.word_tokenize(input_text)
        else:
            tokens = input_text

        lemmatized_tokens = []
        for token in tokens:
            # check for final period
            final_period = False
            if token[-1] == '.':
                final_period = True
                token = token[:-1]

            # look for token in lemma dict keys
            if token in self.lemmata.keys():
                headword = self.lemmata[token.lower()]

                # re-add final period if rm'd
                if final_period:
                    headword += '.'

                # append to return list
                if not return_raw:
                    lemmatized_tokens.append(headword)
                else:
                    lemmatized_tokens.append(token + '/' + headword)
            # if token not found in lemma-headword list
            else:
                # re-add final period if rm'd
                if final_period:
                    token += '.'

                if not return_raw:
                    lemmatized_tokens.append(token)
                else:
                    lemmatized_tokens.append(token + '/' + token)
        if not return_string:
            return lemmatized_tokens
        elif return_string:
            return ' '.join(lemmatized_tokens)
Beispiel #33
0
    def from_regular_expression(re_rule, estimated_sound, ipa_class):
        """

        :param re_rule: pattern (first argument of re.sub)
        :param estimated_sound: an IPA character (second argument of re.sub)
        :param ipa_class: dict whose keys are IPA characters and values are Vowel or Consonant instances
        :return: corresponding Rule instance
        """
        assert len(re_rule) > 0
        if re_rule[0] == "^":
            place = Rank.first
        elif re_rule[-1] == "$":
            place = Rank.last
        else:
            place = Rank.inner

        before_pattern = r"(?<=\(\?\<\=\[)\w*"
        core_pattern = r"(?<=\))\w(?=\(\?\=)|(?<=\^)\w(?=\(\?\=)|(?<=\))\w(?=\$)"
        after_pattern = r"(?<=\(\?\=\[)\w*"
        before_search = re.search(before_pattern, re_rule)
        core_search = re.search(core_pattern, re_rule)
        after_search = re.search(after_pattern, re_rule)
        if before_search is None:
            before = None
        else:
            before = [ipa_class[ipar].to_abstract() for ipar in before_search.group(0)]
        if core_search is not None:
            core = ipa_class[core_search.group(0)]
        else:
            logger.error("No core")
            raise ValueError
        if after_search is None:
            after = None
        else:
            after = [ipa_class[ipar].to_abstract() for ipar in after_search.group(0)]
        abstract_position = AbstractPosition(place, before, after)
        return Rule(abstract_position, core, ipa_class[estimated_sound])
Beispiel #34
0
    def write_concordance_from_file(self, filepaths, name):
        """This calls my modified ConcordanceIndex, taken and modified from
        the NLTK, and writes to disk a file named 'concordance_' + name at
        '~/cltk_data/user_data/'.

        TODO: Add language (here or in class), lowercase option, stemming/
        lemmatization, else?

        :type filepaths: str or list
        :param filepaths: Filepath of text(s) to be used in concordance.
        :rtype : str
        """
        assert isinstance(filepaths, (str, list))
        if isinstance(filepaths, str):
            filepath = filepaths
            text = self._read_file(filepath)
        elif isinstance(filepaths, list):
            text = ''
            for filepath in filepaths:
                text += self._read_file(filepath)
        list_of_lists = self._build_concordance(text)
        user_data_rel = '~/cltk_data/user_data'
        user_data = os.path.expanduser(user_data_rel)
        if not os.path.isdir(user_data):
            os.makedirs(user_data)
        file_path = os.path.join(user_data, 'concordance_' + name + '.txt')
        concordance_output = ''
        for word_list in list_of_lists:
            for line in word_list:
                concordance_output += line + '\n'
        try:
            with open(file_path, 'w') as open_file:
                open_file.write(concordance_output)
                logger.info("Wrote concordance to '%s'." % file_path)
        except IOError as io_error:
            logger.error("Failed to write concordance to '%s'." % file_path)
Beispiel #35
0
 def __init__(self, place=None, manner=None, voiced=None, ipar=None, geminate=None):
     if isinstance(place, Place) or place is None:
         self.place = place
     else:
         logger.error("Incorrect argument")
     if isinstance(manner, Manner) or manner is None:
         self.manner = manner
     else:
         logger.error("Incorrect argument")
         raise ValueError
     if type(voiced) == bool or voiced is None:
         self.voiced = voiced
     else:
         logger.error("Incorrect argument")
         raise TypeError
     if type(geminate) == bool or geminate is None:
         self.geminate = geminate
     else:
         logger.error("Incorrect argument")
         raise TypeError
     self.ipar = ipar
Beispiel #36
0
 def __init__(self, place=None, manner=None, voiced=None, ipar=None, geminate=None):
     if place in PLACES or place is None:
         self.place = place
     else:
         logger.error("Incorrect argument")
     if manner in MANNERS or manner is None:
         self.manner = manner
     else:
         logger.error("Incorrect argument")
         raise ValueError
     if type(voiced) == bool or voiced is None:
         self.voiced = voiced
     else:
         logger.error("Incorrect argument")
         raise TypeError
     if type(geminate) == bool or geminate is None:
         self.geminate = geminate
     else:
         logger.error("Incorrect argument")
         raise TypeError
     self.ipar = ipar
Beispiel #37
0
 def __init__(self, height=None, backness=None, rounded=None, length=None, ipar=None):
     if isinstance(height, Height) or height is None:
         self.height = height
     else:
         logger.error("Incorrect argument")
         raise ValueError
     if isinstance(backness, Backness) or backness is None:
         self.backness = backness
     else:
         logger.error("Incorrect argument")
         raise ValueError
     if type(rounded) == bool or rounded is None:
         self.rounded = rounded
     else:
         logger.error("Incorrect argument")
         raise TypeError
     if isinstance(length, Length) or length is None:
         self.length = length
     else:
         logger.error("Incorrect argument")
         raise ValueError
     self.ipar = ipar
Beispiel #38
0
def open_pickle(path: str):
    """Open a pickle and return loaded pickle object.
    :type path: str
    :param : path: File path to pickle file to be opened.
    :rtype : object
    """
    try:
        with open(path, 'rb') as opened_pickle:
            try:
                return pickle.load(opened_pickle)
            except Exception as pickle_error:
                logger.error(pickle_error)
                raise
    except FileNotFoundError as fnf_error:
        logger.error(fnf_error)
        raise
    except IOError as io_err:
        logger.error(io_err)
        raise
    except EOFError as eof_error:
        logger.error(eof_error)
        raise
Beispiel #39
0
 def __init__(self, height=None, backness=None, rounded=None, length=None, ipar=None):
     if height in HEIGHT or height is None:
         self.height = height
     else:
         logger.error("Incorrect argument")
         raise ValueError
     if backness in BACKNESS or backness is None:
         self.backness = backness
     else:
         logger.error("Incorrect argument")
         raise ValueError
     if type(rounded) == bool or rounded is None:
         self.rounded = rounded
     else:
         logger.error("Incorrect argument")
         raise TypeError
     if length in LENGTHS or length is None:
         self.length = length
     else:
         logger.error("Incorrect argument")
         raise ValueError
     self.ipar = ipar
Beispiel #40
0
    def index_corpus(self):
        """Make a Whoosh index out of a pre-processed corpus, ie TLG, PHI5,
        or PHI7.

        TLG takes almost 13 min; PHI5 1.5 min.
        To setup index parameters
        >>> # cltk_index = CLTKIndex('latin', 'phi5')  # 1.5 min, 363 docs
        >>> # cltk_index = CLTKIndex('latin', 'phi5', chunk='work')  # 2 min, 837 docs
        >>> # cltk_index = CLTKIndex('greek', 'tlg')  # 13 min, 1823 docs
        >>> # cltk_index = CLTKIndex('greek', 'tlg', chunk='work')  #15.5 min, 6625 docs

        # And to start indexing:
        >>> # cltk_index.index_corpus()

        TODO: Prevent overwriting. Ask user to rm old dir before re-indexing.
        TODO: Add option for lemmatizing.
        TODO: Add for figure out lower() options.
        TODO: Process TLG through forthcoming normalize().
        TODO: Add name to each index.
        TODO: Turn off any language-specific mods (eg, stemming, case) that
        Whoosh might be doing by default.
        """

        # Setup index dir
        schema = Schema(path=ID(stored=True),
                        author=TEXT(stored=True),
                        content=TEXT)
        try:
            _index = create_in(self.index_path, schema)
        except FileNotFoundError:
            os.makedirs(self.index_path)
            _index = create_in(self.index_path, schema)
        writer = _index.writer()

        # Setup corpus to be indexed
        if self.lang == 'greek' and self.corpus == 'tlg':
            corpus_path = os.path.expanduser('~/cltk_data/greek/text/tlg/plaintext/')
            if self.chunk == 'work':
                corpus_path = os.path.expanduser('~/cltk_data/greek/text/tlg/individual_works/')
        elif self.lang == 'latin' and self.corpus == 'phi5':
            corpus_path = os.path.expanduser('~/cltk_data/latin/text/phi5/plaintext/')
            if self.chunk == 'work':
                corpus_path = os.path.expanduser('~/cltk_data/latin/text/phi5/individual_works/')
        assert os.path.isdir(corpus_path), 'Corpus does not exist in the following location: "%s". Use CLTK Corpus Importer and TLGU to create transformed corpus.' % corpus_path  # pylint: disable=line-too-long

        files = os.listdir(corpus_path)
        if self.lang == 'greek' and self.corpus == 'tlg':
            files = [f[:-4] for f in files if f.startswith('TLG')]
            corpus_index = TLG_AUTHOR_MAP
        elif self.lang == 'latin' and self.corpus == 'phi5':
            files = [f[:-4] for f in files if f.startswith('LAT')]
            corpus_index = PHI5_AUTHOR_MAP

        time_0 = time.time()
        logger.info("Commencing indexing of %s documents of '%s' corpus." % (len(files), self.corpus))  # pylint: disable=line-too-long
        logger.info('Index will be written to: "%s".' % self.index_path)
        if self.chunk == 'author':
            for count, file in enumerate(files, 1):

                try:
                    if self.lang == 'greek' and self.corpus == 'tlg':
                        file = file[3:]
                        author = corpus_index[file]
                        path = os.path.join(corpus_path, 'TLG' + file + '.TXT')
                    if self.lang == 'latin' and self.corpus == 'phi5':
                        author = corpus_index[file]
                        path = os.path.join(corpus_path, file + '.TXT')
                except KeyError as key_error:
                    if file.startswith('LAT9999'):
                        continue
                    logger.error(key_error)
                    raise

                with open(path) as file_open:
                    content = file_open.read()
                writer.add_document(path=path,
                                    author=author,
                                    content=content)

                if count % 100 == 0:
                    logger.info('Indexed doc %s.' % count)

        if self.chunk == 'work':
            for count, file in enumerate(files, 1):
                try:
                    if self.lang == 'greek' and self.corpus == 'tlg':
                        path = os.path.join(corpus_path, file + '.TXT')
                        author = corpus_index[file[3:-8]]
                    if self.lang == 'latin' and self.corpus == 'phi5':
                        path = os.path.join(corpus_path, file + '.TXT')
                        author = corpus_index[file[:-8]]
                except KeyError as key_error:
                    if file.startswith('LAT9999'):
                        continue
                    logger.error(key_error)
                    raise

                with open(path) as file_open:
                    content = file_open.read()

                writer.add_document(path=path,
                                    author=author,
                                    content=content)
                if count % 100 == 0:
                    logger.info('Indexed doc %s.' % count)
        logger.info('Commencing to commit changes.')
        writer.commit()

        time_1 = time.time()
        elapsed = time_1 - time_0
        logger.info('Finished indexing all documents in %s seconds (averaging %s docs per sec.)' % (elapsed, (len(files) / elapsed)))  # pylint: disable=line-too-long
Beispiel #41
0
 def import_corpus(self, corpus_name, local_path=None, branch='master'):  # pylint: disable=R0912
     """Download a remote or load local corpus into dir ``~/cltk_data``.
     TODO: maybe add ``from git import RemoteProgress``
     TODO: refactor this, it's getting kinda long
     :type corpus_name: str
     :param corpus_name: The name of an available corpus.
     :param local_path: str
     :param local_path: A filepath, required when importing local corpora.
     :param branch: What Git branch to clone.
     """
     corpus_properties = self._get_corpus_properties(corpus_name)
     try:
         location = corpus_properties['location']
     except KeyError:
         # git_uri = corpus_properties['git_remote']
         git_name = corpus_properties['']
         git_uri = corpus_properties['origin']
         git_type = corpus_properties['type']
         # pass this off to a special downloader just for custom urls
         self._git_user_defined_corpus(git_name, git_type, git_uri)
         return
     corpus_type = corpus_properties['type']
     if location == 'remote':
         # git_uri = urljoin('https://github.com/cltk/', corpus_name + '.git')
         git_uri = corpus_properties['origin']
         type_dir_rel = os.path.join(CLTK_DATA_DIR, self.language, corpus_type)
         type_dir = os.path.expanduser(type_dir_rel)
         target_dir = os.path.join(type_dir, corpus_name)
         target_file = os.path.join(type_dir, corpus_name, 'README.md')
         # check if corpus already present
         # if not, clone
         if not os.path.isfile(target_file):
             if not os.path.isdir(type_dir):
                 os.makedirs(type_dir)
             try:
                 msg = "Cloning '{}' from '{}'".format(corpus_name, git_uri)
                 logger.info(msg)
                 Repo.clone_from(git_uri, target_dir, branch=branch, depth=1,
                                 progress=ProgressPrinter())
             except CorpusImportError as corpus_imp_err:
                 msg = "Git clone of '{}' failed: '{}'".format(git_uri, corpus_imp_err)
                 logger.error(msg)
         # if corpus is present, pull latest
         else:
             try:
                 repo = Repo(target_dir)
                 assert not repo.bare  # or: assert repo.exists()
                 git_origin = repo.remotes.origin
                 msg = "Pulling latest '{}' from '{}'.".format(corpus_name, git_uri)
                 logger.info(msg)
                 git_origin.pull()
             except CorpusImportError as corpus_imp_err:
                 msg = "Git pull of '{}' failed: '{}'".format(git_uri, corpus_imp_err)
                 logger.error(msg)
     elif location == 'local':
         msg = "Importing from local path: '{}'".format(local_path)
         logger.info(msg)
         if corpus_name in ('phi5', 'phi7', 'tlg'):
             if corpus_name == 'phi5':
                 # normalize path for checking dir
                 if local_path.endswith('/'):
                     local_path = local_path[:-1]
                 # check for right corpus dir
                 if os.path.split(local_path)[1] != 'PHI5':
                     logger.info("Directory must be named 'PHI5'.")
             if corpus_name == 'phi7':
                 # normalize local_path for checking dir
                 if local_path.endswith('/'):
                     local_path = local_path[:-1]
                 # check for right corpus dir
                 if os.path.split(local_path)[1] != 'PHI7':
                     logger.info("Directory must be named 'PHI7'.")
             if corpus_name == 'tlg':
                 # normalize path for checking dir
                 if local_path.endswith('/'):
                     local_path = local_path[:-1]
                 # check for right corpus dir
                 if os.path.split(local_path)[1] != 'TLG_E':
                     logger.info("Directory must be named 'TLG_E'.")
             # move the dir-checking commands into a function
             data_dir = os.path.expanduser(CLTK_DATA_DIR)
             originals_dir = os.path.join(data_dir, 'originals')
             # check for `originals` dir; if not present mkdir
             if not os.path.isdir(originals_dir):
                 os.makedirs(originals_dir)
                 msg = "Wrote directory at '{}'.".format(originals_dir)
                 logger.info(msg)
             tlg_originals_dir = os.path.join(data_dir,
                                              'originals',
                                              corpus_name)
             # check for `originals/<corpus_name>`; if pres, delete
             if os.path.isdir(tlg_originals_dir):
                 shutil.rmtree(tlg_originals_dir)
                 msg = "Removed directory at '{}'.".format(tlg_originals_dir)
                 logger.info(msg)
             # copy_dir requires that target
             if not os.path.isdir(tlg_originals_dir):
                 self._copy_dir_recursive(local_path, tlg_originals_dir)
Beispiel #42
0
"""

import logging
import os
import sys
import time

from cltk.utils.cltk_logger import logger

# TODO: Fix this
# KJ added this to fix failing build on Travis CI. Gensim seems to load boto, which in turn causes an error.
try:
    from gensim.models import Word2Vec
except AttributeError:
    logger.error(
        'Command `from gensim.models import Word2Vec` failed with AttributeError.'
    )

from cltk.corpus.utils.formatter import phi5_plaintext_cleanup
from cltk.corpus.utils.formatter import tlg_plaintext_cleanup
from cltk.corpus.utils.formatter import assemble_phi5_author_filepaths
from cltk.corpus.utils.formatter import assemble_tlg_author_filepaths
from cltk.stem.latin.j_v import JVReplacer
from cltk.stem.lemma import LemmaReplacer
from cltk.stop.latin.stops import STOPS_LIST as latin_stops
from cltk.tokenize.word import nltk_tokenize_words
from cltk.tokenize.sentence import TokenizeSentence
from cltk.tokenize.word import WordTokenizer


def gen_docs(corpus, lemmatize, rm_stops):
Beispiel #43
0
    def convert(self, input_path=None, output_path=None, markup=None,
                break_lines=False, divide_works=False, latin=False,
                extra_args=None):
        """
        :param input_path: TLG filepath to convert.
        :param output_path: filepath of new converted text.
        :param markup: Specificity of inline markup. Default None removes all
        numerical markup; 'full' gives most detailed, with reference numbers
        included before each text line.
        :param break_lines: No spaces; removes line ends and hyphens before an
         ID code; hyphens and spaces before page and column ends are retained.
        :param divide_works: Each work (book) is output as a separate file in
        the form output_file-xxx.txt; if an output file is not specified, this
         option has no effect.
        :param latin: Primarily Latin text (PHI). Some TLG texts, notably
        doccan1.txt and doccan2.txt are mostly roman texts lacking explicit
        language change codes. Setting this option will force a change to
        Latin text after each citation block is encountered.
        :param extra_args: Any other tlgu args to be passed, in list form and
        without dashes, e.g.: ['p', 'b', 'B'].
        """
        # setup file paths
        input_path = os.path.expanduser(input_path)
        output_path = os.path.expanduser(output_path)

        # check input path exists
        assert os.path.isfile(input_path), 'File {0} does not exist.'.format(input_path)

        # setup tlgu flags
        tlgu_options = []
        if markup == 'full':
            full_args = ['v', 'w', 'x', 'y', 'z']
            [tlgu_options.append(x) for x in full_args]  # pylint: disable=W0106
        if break_lines:
            tlgu_options.append('N')
        if divide_works:
            tlgu_options.append('W')
        if latin:
            tlgu_options.append('r')
        # setup extra args
        if extra_args is None:
            extra_args = []
        else:
            try:
                extra_args = list(extra_args)
            except Exception as exc:
                logger.error("Argument 'extra_args' must be a list: %s.", exc)
                raise
        tlgu_options = tlgu_options + extra_args
        # assemble all tlgu flags
        tlgu_options = list(set(tlgu_options))
        if tlgu_options:
            tlgu_flags = '-' + ' -'.join(tlgu_options)
        else:
            tlgu_flags = ''
        # make tlgu call
        tlgu_call = 'tlgu {0} {1} {2}'.format(tlgu_flags,
                                              input_path,
                                              output_path)
        logger.info(tlgu_call)
        try:
            p_out = subprocess.call(tlgu_call, shell=True)
            if p_out == 1:
                logger.error('Failed to convert %s to %s.',
                             input_path,
                             output_path)
        except Exception as exc:
            logger.error('Failed to convert %s to %s: %s',
                         input_path,
                         output_path,
                         exc)
            raise
Beispiel #44
0
 def import_corpus(self, corpus_name, local_path=None):  # pylint: disable=R0912
     """Download a remote or load local corpus into dir ``~/cltk_data``.
     TODO: maybe add ``from git import RemoteProgress``
     TODO: refactor this, it's getting kinda long
     :type corpus_name: str
     :param corpus_name: The name of an available corpus.
     :param local_path: str
     :param local_path: A filepath, required when importing local corpora.
     """
     corpus_properties = self._check_corpus_availability(corpus_name)
     location = corpus_properties['location']
     corpus_type = corpus_properties['type']
     if location == 'remote':
         git_uri = urljoin('https://github.com/cltk/', corpus_name + '.git')
         #self._download_corpus(corpus_type, corpus_name, path)
         type_dir_rel = os.path.join(CLTK_DATA_DIR, self.language,
                                     corpus_type)
         type_dir = os.path.expanduser(type_dir_rel)
         target_dir = os.path.join(type_dir, corpus_name)
         target_file = os.path.join(type_dir, corpus_name, 'README.md')
         # check if corpus already present
         # if not, clone
         if not os.path.isfile(target_file):
             if not os.path.isdir(type_dir):
                 os.makedirs(type_dir)
             try:
                 logger.info("Cloning '%s' from '%s'" %
                             (corpus_name, git_uri))
                 Repo.clone_from(git_uri, target_dir, depth=1)
             except Exception as e:
                 logger.error("Git clone of '%s' failed: '%s'",
                              (git_uri, e))
         # if corpus is present, pull latest
         else:
             try:
                 repo = Repo(target_dir)
                 assert not repo.bare  # or: assert repo.exists()
                 o = repo.remotes.origin
                 logger.info("Pulling latest '%s' from '%s'." %
                             (corpus_name, git_uri))
                 o.pull()
             except Exception as e:
                 logger.error("Git pull of '%s' failed: '%s'" %
                              (git_uri, e))
     elif location == 'local':
         logger.info("Importing from local path: '%s'", local_path)
         if corpus_name in ('phi5', 'phi7', 'tlg'):
             if corpus_name == 'phi5':
                 # normalize path for checking dir
                 if local_path.endswith('/'):
                     local_path = local_path[:-1]
                 # check for right corpus dir
                 if os.path.split(local_path)[1] != 'PHI5':
                     logger.info("Directory must be named 'PHI5'.")
             if corpus_name == 'phi7':
                 # normalize local_path for checking dir
                 if local_path.endswith('/'):
                     local_path = local_path[:-1]
                 # check for right corpus dir
                 if os.path.split(local_path)[1] != 'PHI7':
                     logger.info("Directory must be named 'PHI7'.")
             if corpus_name == 'tlg':
                 # normalize path for checking dir
                 if local_path.endswith('/'):
                     local_path = local_path[:-1]
                 # check for right corpus dir
                 if os.path.split(local_path)[1] != 'TLG_E':
                     logger.info("Directory must be named 'TLG_E'.")
             # move the dir-checking commands into a function
             data_dir = os.path.expanduser(CLTK_DATA_DIR)
             originals_dir = os.path.join(data_dir, 'originals')
             # check for `originals` dir; if not present mkdir
             if not os.path.isdir(originals_dir):
                 os.makedirs(originals_dir)
                 logger.info("Wrote directory at '%s'.", originals_dir)
             tlg_originals_dir = os.path.join(data_dir, 'originals',
                                              corpus_name)
             # check for `originals/<corpus_name>`; if pres, delete
             if os.path.isdir(tlg_originals_dir):
                 shutil.rmtree(tlg_originals_dir)
                 logger.info("Removed directory at '%s'.",
                             tlg_originals_dir)
             # copy_dir requires that target
             if not os.path.isdir(tlg_originals_dir):
                 self._copy_dir_recursive(local_path, tlg_originals_dir)
"""

from cltk.utils.cltk_logger import logger

from nltk.tokenize import wordpunct_tokenize

import re
import unicodedata

try:
    # James Tauber's greek_accentuation package
    from greek_accentuation import characters as chars
except ImportError as import_error:
    message = 'Missing "greek_accentuation" package. Install with ' \
              '`pip install greek-accentuation`.'
    logger.error(message)
    logger.error(import_error)
    raise

__author__ = ['Jack Duff <*****@*****.**>']
__license__ = 'MIT License. See LICENSE.'


# Dictionaries of phonological reconstructions for use in transcribing.
# Probert, Philomen. 2010. Phonology, in E. Bakker, A Companion to the \
# Ancient Greek Language.
# (Entries which are commented out are realized through diacritic analysis.)

GREEK = {
    'Attic': {
        'Probert': {
Beispiel #46
0
"""

from cltk.utils.cltk_logger import logger

from nltk.tokenize import wordpunct_tokenize

import re
import unicodedata

try:
    # James Tauber's greek_accentuation package
    from greek_accentuation import characters as chars
except ImportError as import_error:
    print('Missing "greek_accentuation" package. Install with ' +
          '`pip install greek-accentuation`.')
    logger.error(import_error)
    raise

__author__ = 'Jack Duff <*****@*****.**>'
__license__ = 'MIT License. See LICENSE.'

# Dictionaries of phonological reconstructions for use in transcribing.
# Probert, Philomen. 2010. Phonology, in E. Bakker, A Companion to the \
# Ancient Greek Language.
# (Entries which are commented out are realized through diacritic analysis.)

GREEK = {
    'Attic': {
        'Probert': {
            'correspondence': {
                'α': 'ɑ',
fuzzywuzzy

Good-to-haves:
python-Levenshtein

"""

import re, string
import unicodedata
from cltk.tokenize.sentence import TokenizeSentence
from cltk.utils.cltk_logger import logger

try:
    from fuzzywuzzy import fuzz
except ImportError as imp_err:
    logger.error("'fuzzywuzzy' library required for this module: %s" % imp_err)
    raise ImportError

__author__ = 'Luke Hollis <*****@*****.**>'
__license__ = 'MIT License. See LICENSE.'


class Levenshtein:
    """A wrapper class for fuzzywuzzy's Levenshtein distance calculation methods."""
    def __init__(self):
        """Initialize class. Currently empty."""
        return

    @staticmethod
    def ratio(string_a, string_b):
        """At the most basic level, return a Levenshtein distance ratio via
Beispiel #48
0
 def import_corpus(self, corpus_name, local_path=None, branch='master'):  # pylint: disable=R0912
     """Download a remote or load local corpus into dir ``~/cltk_data``.
     TODO: maybe add ``from git import RemoteProgress``
     TODO: refactor this, it's getting kinda long
     :type corpus_name: str
     :param corpus_name: The name of an available corpus.
     :param local_path: str
     :param local_path: A filepath, required when importing local corpora.
     :param branch: What Git branch to clone.
     """
     corpus_properties = self._get_corpus_properties(corpus_name)
     location = corpus_properties['location']
     corpus_type = corpus_properties['type']
     if location == 'remote':
         git_uri = urljoin('https://github.com/cltk/', corpus_name + '.git')
         # self._download_corpus(corpus_type, corpus_name, path)
         type_dir_rel = os.path.join(CLTK_DATA_DIR, self.language, corpus_type)
         type_dir = os.path.expanduser(type_dir_rel)
         target_dir = os.path.join(type_dir, corpus_name)
         target_file = os.path.join(type_dir, corpus_name, 'README.md')
         # check if corpus already present
         # if not, clone
         if not os.path.isfile(target_file):
             if not os.path.isdir(type_dir):
                 os.makedirs(type_dir)
             try:
                 msg = "Cloning '{}' from '{}'".format(corpus_name, git_uri)
                 logger.info(msg)
                 Repo.clone_from(git_uri, target_dir, branch=branch, depth=1,
                                 progress=ProgressPrinter())
             except CorpusImportError as corpus_imp_err:
                 msg = "Git clone of '{}' failed: '{}'".format(git_uri, corpus_imp_err)
                 logger.error(msg)
         # if corpus is present, pull latest
         else:
             try:
                 repo = Repo(target_dir)
                 assert not repo.bare  # or: assert repo.exists()
                 git_origin = repo.remotes.origin
                 msg = "Pulling latest '{}' from '{}'.".format(corpus_name, git_uri)
                 logger.info(msg)
                 git_origin.pull()
             except CorpusImportError as corpus_imp_err:
                 msg = "Git pull of '{}' failed: '{}'".format(git_uri, corpus_imp_err)
                 logger.error(msg)
     elif location == 'local':
         msg = "Importing from local path: '{}'".format(local_path)
         logger.info(msg)
         if corpus_name in ('phi5', 'phi7', 'tlg'):
             if corpus_name == 'phi5':
                 # normalize path for checking dir
                 if local_path.endswith('/'):
                     local_path = local_path[:-1]
                 # check for right corpus dir
                 if os.path.split(local_path)[1] != 'PHI5':
                     logger.info("Directory must be named 'PHI5'.")
             if corpus_name == 'phi7':
                 # normalize local_path for checking dir
                 if local_path.endswith('/'):
                     local_path = local_path[:-1]
                 # check for right corpus dir
                 if os.path.split(local_path)[1] != 'PHI7':
                     logger.info("Directory must be named 'PHI7'.")
             if corpus_name == 'tlg':
                 # normalize path for checking dir
                 if local_path.endswith('/'):
                     local_path = local_path[:-1]
                 # check for right corpus dir
                 if os.path.split(local_path)[1] != 'TLG_E':
                     logger.info("Directory must be named 'TLG_E'.")
             # move the dir-checking commands into a function
             data_dir = os.path.expanduser(CLTK_DATA_DIR)
             originals_dir = os.path.join(data_dir, 'originals')
             # check for `originals` dir; if not present mkdir
             if not os.path.isdir(originals_dir):
                 os.makedirs(originals_dir)
                 msg = "Wrote directory at '{}'.".format(originals_dir)
                 logger.info(msg)
             tlg_originals_dir = os.path.join(data_dir,
                                              'originals',
                                              corpus_name)
             # check for `originals/<corpus_name>`; if pres, delete
             if os.path.isdir(tlg_originals_dir):
                 shutil.rmtree(tlg_originals_dir)
                 msg = "Removed directory at '{}'.".format(tlg_originals_dir)
                 logger.info(msg)
             # copy_dir requires that target
             if not os.path.isdir(tlg_originals_dir):
                 self._copy_dir_recursive(local_path, tlg_originals_dir)
Beispiel #49
0
    def convert(self,
                input_path=None,
                output_path=None,
                markup=None,
                rm_newlines=False,
                divide_works=False,
                latin=False,
                extra_args=None):
        """
        :param input_path: TLG filepath to convert.
        :param output_path: filepath of new converted text.
        :param markup: Specificity of inline markup. Default None removes all
        numerical markup; 'full' gives most detailed, with reference numbers
        included before each text line.
        :param rm_newlines: No spaces; removes line ends and hyphens before an
         ID code; hyphens and spaces before page and column ends are retained.
        :param divide_works: Each work (book) is output as a separate file in
        the form output_file-xxx.txt; if an output file is not specified, this
         option has no effect.
        :param latin: Primarily Latin text (PHI). Some TLG texts, notably
        doccan1.txt and doccan2.txt are mostly roman texts lacking explicit
        language change codes. Setting this option will force a change to
        Latin text after each citation block is encountered.
        :param extra_args: Any other tlgu args to be passed, in list form and
        without dashes, e.g.: ['p', 'b', 'B'].
        """
        # setup file paths
        input_path = os.path.expanduser(input_path)
        output_path = os.path.expanduser(output_path)

        # check input path exists
        assert os.path.isfile(input_path), 'File {0} does not exist.'.format(
            input_path)

        # setup tlgu flags
        tlgu_options = []
        if markup == 'full':
            full_args = ['v', 'w', 'x', 'y', 'z']
            [tlgu_options.append(x) for x in full_args]  # pylint: disable=W0106
        if rm_newlines:
            tlgu_options.append('N')
        if divide_works:
            tlgu_options.append('W')
        if latin:
            tlgu_options.append('r')
        # setup extra args
        if extra_args is None:
            extra_args = []
        else:
            try:
                extra_args = list(extra_args)
            except Exception as exc:
                logger.error("Argument 'extra_args' must be a list: %s.", exc)
                raise
        tlgu_options = tlgu_options + extra_args
        # assemble all tlgu flags
        tlgu_options = list(set(tlgu_options))
        if tlgu_options:
            tlgu_flags = '-' + ' -'.join(tlgu_options)
        else:
            tlgu_flags = ''
        # make tlgu call
        tlgu_call = 'tlgu {0} {1} {2}'.format(tlgu_flags, input_path,
                                              output_path)
        logger.info(tlgu_call)
        try:
            p_out = subprocess.call(tlgu_call, shell=True)
            if p_out == 1:
                logger.error('Failed to convert %s to %s.', input_path,
                             output_path)
        except Exception as exc:
            logger.error('Failed to convert %s to %s: %s', input_path,
                         output_path, exc)
            raise
Beispiel #50
0
"""Tools for working with Levenshtein distance algorithm and distance ratio between strings.
"""

from cltk.utils.cltk_logger import logger
try:
    from fuzzywuzzy import fuzz
except ImportError as imp_err:
    message = "'fuzzywuzzy' library required for this module: %s. Install with `pip install fuzzywuzzy python-Levenshtein`" % imp_err
    logger.error(message)
    print(message)
    raise ImportError


__author__ = ['Luke Hollis <*****@*****.**>']
__license__ = 'MIT License. See LICENSE.'


class Levenshtein:
    """A wrapper class for fuzzywuzzy's Levenshtein distance calculation methods."""

    def __init__(self):
        """Initialize class. Currently empty."""
        return

    @staticmethod
    def ratio(string_a, string_b):
        """At the most basic level, return a Levenshtein distance ratio via
        fuzzywuzzy.
        :param string_a: str
        :param string_b: str
        :return: float
Beispiel #51
0
 def convert_corpus(self,
                    corpus,
                    markup=None,
                    break_lines=False,
                    divide_works=False,
                    latin=None,
                    extra_args=None):  # pylint: disable=W0613
     """Look for imported TLG or PHI files and convert them all to
     ``~/cltk_data/greek/text/tlg/<plaintext>``.
     TODO: Should this and/or convert() be static?
     TODO: Add markup options to input.
     TODO: Do something with break_lines, divide_works, and extra_args or rm them
     """
     orig_path_rel = '~/cltk_data/originals'
     orig_path = os.path.expanduser(orig_path_rel)
     target_path_rel = '~/cltk_data'
     target_path = os.path.expanduser(target_path_rel)
     assert corpus in ['tlg', 'phi5',
                       'phi7'], "Corpus must be 'tlg', 'phi5', or 'phi7'"
     if corpus in ['tlg', 'phi5', 'phi7']:
         orig_path = os.path.join(orig_path, corpus)
         if corpus in ['tlg', 'phi7']:
             if 'phi7' and latin is True:
                 latin = True
                 target_path = os.path.join(target_path, 'latin', 'text',
                                            corpus)
             else:
                 latin = None
                 target_path = os.path.join(target_path, 'greek', 'text',
                                            corpus)
         else:
             target_path = os.path.join(target_path, 'latin', 'text',
                                        corpus)
             latin = True
     try:
         corpus_files = os.listdir(orig_path)
     except Exception as exception:
         logger.error("Failed to find TLG files: %s", exception)
         raise
     # make a list of files to be converted
     txts = []
     [txts.append(x) for x in corpus_files if x.endswith('TXT')]  # pylint: disable=W0106
     # loop through list and convert one at a time
     for txt in txts:
         orig_txt_path = os.path.join(orig_path, txt)
         if markup is None:
             target_txt_dir = os.path.join(target_path, 'plaintext')
         else:
             target_txt_dir = os.path.join(target_path, str(markup))
         if not os.path.isdir(target_txt_dir):
             os.makedirs(target_txt_dir)
         target_txt_path = os.path.join(target_txt_dir, txt)
         try:
             self.convert(orig_txt_path,
                          target_txt_path,
                          markup=None,
                          break_lines=False,
                          divide_works=False,
                          latin=latin,
                          extra_args=None)
         except Exception as exception:
             logger.error("Failed to convert file '%s' to '%s': %s",
                          orig_txt_path, target_txt_path, exception)
Beispiel #52
0
TODO: Add CLTK logging to this.
"""

import logging
import os
import sys
import time

from cltk.utils.cltk_logger import logger

# TODO: Fix this
# KJ added this to fix failing build on Travis CI. Gensim seems to load boto, which in turn causes an error.
try:
    from gensim.models import Word2Vec
except AttributeError:
    logger.error('Command `from gensim.models import Word2Vec` failed with AttributeError.')


from cltk.corpus.utils.formatter import phi5_plaintext_cleanup
from cltk.corpus.utils.formatter import tlg_plaintext_cleanup
from cltk.corpus.utils.formatter import assemble_phi5_author_filepaths
from cltk.corpus.utils.formatter import assemble_tlg_author_filepaths
from cltk.stem.latin.j_v import JVReplacer
from cltk.stem.lemma import LemmaReplacer # Change lemmatizer
from cltk.stop.latin import STOPS_LIST as latin_stops
from cltk.tokenize.word import WordTokenizer
from cltk.tokenize.sentence import TokenizeSentence
from cltk.tokenize.word import WordTokenizer


def gen_docs(corpus, lemmatize, rm_stops):
Beispiel #53
0
    def index_corpus(self):
        """Make a Whoosh index out of a pre-processed corpus, ie TLG, PHI5,
        or PHI7.

        TLG takes almost 13 min; PHI5 1.5 min.
        To setup index parameters
        >>> cltk_index = CLTKIndex('latin', 'phi5')  # 1.5 min, 363 docs
        >>> cltk_index = CLTKIndex('latin', 'phi5', chunk='work')  # 2 min, 837 docs
        >>> cltk_index = CLTKIndex('greek', 'tlg')  # 13 min, 1823 docs
        >>> cltk_index = CLTKIndex('greek', 'tlg', chunk='work')  #15.5 min, 6625 docs

        # And to start indexing:
        >>> cltk_index.index_corpus()

        TODO: Prevent overwriting. Ask user to rm old dir before re-indexing.
        TODO: Add option for lemmatizing.
        TODO: Add for figure out lower() options.
        TODO: Process TLG through forthcoming normalize().
        TODO: Add name to each index.
        TODO: Turn off any language-specific mods (eg, stemming, case) that
        Whoosh might be doing by default.
        """

        # Setup index dir
        schema = Schema(path=ID(stored=True),
                        author=TEXT(stored=True),
                        content=TEXT)
        try:
            _index = create_in(self.index_path, schema)
        except FileNotFoundError:
            os.makedirs(self.index_path)
            _index = create_in(self.index_path, schema)
        writer = _index.writer()

        # Setup corpus to be indexed
        if self.lang == 'greek' and self.corpus == 'tlg':
            corpus_path = os.path.expanduser(
                '~/cltk_data/greek/text/tlg/plaintext/')
            if self.chunk == 'work':
                corpus_path = os.path.expanduser(
                    '~/cltk_data/greek/text/tlg/individual_works/')
        elif self.lang == 'latin' and self.corpus == 'phi5':
            corpus_path = os.path.expanduser(
                '~/cltk_data/latin/text/phi5/plaintext/')
            if self.chunk == 'work':
                corpus_path = os.path.expanduser(
                    '~/cltk_data/latin/text/phi5/individual_works/')
        assert os.path.isdir(corpus_path), 'Corpus does not exist in the following location: "%s". Use CLTK Corpus Importer and TLGU to create transformed corpus.' % corpus_path  # pylint: disable=line-too-long

        files = os.listdir(corpus_path)
        if self.lang == 'greek' and self.corpus == 'tlg':
            files = [f[:-4] for f in files if f.startswith('TLG')]
            corpus_index = TLG_AUTHOR_MAP
        elif self.lang == 'latin' and self.corpus == 'phi5':
            files = [f[:-4] for f in files if f.startswith('LAT')]
            corpus_index = PHI5_AUTHOR_MAP

        time_0 = time.time()
        logger.info("Commencing indexing of %s documents of '%s' corpus." % (len(files), self.corpus))  # pylint: disable=line-too-long
        logger.info('Index will be written to: "%s".' % self.index_path)
        if self.chunk == 'author':
            for count, file in enumerate(files, 1):

                try:
                    if self.lang == 'greek' and self.corpus == 'tlg':
                        file = file[3:]
                        author = corpus_index[file]
                        path = os.path.join(corpus_path, 'TLG' + file + '.TXT')
                    if self.lang == 'latin' and self.corpus == 'phi5':
                        author = corpus_index[file]
                        path = os.path.join(corpus_path, file + '.TXT')
                except KeyError as key_error:
                    if file.startswith('LAT9999'):
                        continue
                    logger.error(key_error)
                    raise

                with open(path) as file_open:
                    content = file_open.read()
                writer.add_document(path=path, author=author, content=content)

                if count % 100 == 0:
                    logger.info('Indexed doc %s.' % count)

        if self.chunk == 'work':
            for count, file in enumerate(files, 1):
                try:
                    if self.lang == 'greek' and self.corpus == 'tlg':
                        path = os.path.join(corpus_path, file + '.TXT')
                        author = corpus_index[file[3:-8]]
                    if self.lang == 'latin' and self.corpus == 'phi5':
                        path = os.path.join(corpus_path, file + '.TXT')
                        author = corpus_index[file[:-8]]
                except KeyError as key_error:
                    if file.startswith('LAT9999'):
                        continue
                    logger.error(key_error)
                    raise

                with open(path) as file_open:
                    content = file_open.read()

                writer.add_document(path=path, author=author, content=content)
                if count % 100 == 0:
                    logger.info('Indexed doc %s.' % count)
        logger.info('Commencing to commit changes.')
        writer.commit()

        time_1 = time.time()
        elapsed = time_1 - time_0
        logger.info('Finished indexing all documents in %s seconds (averaging %s docs per sec.)' % (elapsed, (len(files) / elapsed)))  # pylint: disable=line-too-long