def divide_works(self, corpus):
    """Use the work-breaking option.

    TODO: Maybe incorporate this into ``convert_corpus()``
    TODO: Write test for this

    :param corpus: Corpus identifier, either ``'tlg'`` or ``'phi5'``.
    :raises ValueError: If ``corpus`` is not a recognized name. (Previously
        an unknown corpus fell through and raised a confusing NameError on
        the unbound ``orig_dir_rel``.)
    """
    if corpus == 'tlg':
        orig_dir_rel = '~/cltk_data/originals/tlg'
        works_dir_rel = '~/cltk_data/greek/text/tlg/individual_works'
        file_prefix = 'TLG'
        latin = False
    elif corpus == 'phi5':
        orig_dir_rel = '~/cltk_data/originals/phi5'
        works_dir_rel = '~/cltk_data/latin/text/phi5/individual_works'
        file_prefix = 'LAT'
        latin = True  # this is for the optional TLGU argument to convert()
    else:
        raise ValueError("Corpus must be 'tlg' or 'phi5'; got: '{}'.".format(corpus))
    orig_dir = os.path.expanduser(orig_dir_rel)
    works_dir = os.path.expanduser(works_dir_rel)
    if not os.path.exists(works_dir):
        os.makedirs(works_dir)
    files = os.listdir(orig_dir)
    texts = [x for x in files if x.endswith('.TXT') and x.startswith(file_prefix)]
    for text_file in texts:  # renamed from `file` to avoid shadowing the builtin
        orig_file_path = os.path.join(orig_dir, text_file)
        new_file_path = os.path.join(works_dir, text_file)
        try:
            self.convert(orig_file_path, new_file_path, divide_works=True, latin=latin)
            logger.info('Writing files at %s to %s.', orig_file_path, works_dir)
        except Exception as err:
            # best-effort: keep converting remaining files after a failure
            logger.error('Failed to convert files: %s.', err)
def _check_distributed_corpora_file(self):
    """Check '~/cltk_data/distributed_corpora.yaml' for any custom,
    distributed corpora that the user wants to load locally.

    TODO: write check or try if `cltk_data` dir is not present

    :return: List of dicts (keys ``'origin'``, ``'type'``, ``'name'``) for
        corpora whose ``language`` matches ``self.language``; empty list when
        the YAML file is absent, empty, or unparsable.
    """
    if self.testing:
        distributed_corpora_fp = os.path.expanduser('~/cltk_data/test_distributed_corpora.yaml')
    else:
        distributed_corpora_fp = os.path.expanduser('~/cltk_data/distributed_corpora.yaml')
    try:
        with open(distributed_corpora_fp) as file_open:
            corpora_dict = yaml.safe_load(file_open)
    except FileNotFoundError:
        logger.info('Distributed_corpora.yaml file not found.')
        return []
    except yaml.parser.ParserError as parse_err:
        logger.debug('Yaml parsing error: %s', parse_err)
        return []
    if not corpora_dict:
        # yaml.safe_load() returns None for an empty document; iterating
        # None below raised TypeError before this guard.
        return []
    user_defined_corpora = []
    for corpus_name, about in corpora_dict.items():
        if about['language'].lower() == self.language:
            user_defined_corpus = dict()
            # user_defined_corpus['git_remote'] = about['git_remote']
            user_defined_corpus['origin'] = about['origin']
            user_defined_corpus['type'] = about['type']
            user_defined_corpus['name'] = corpus_name
            user_defined_corpora.append(user_defined_corpus)
    return user_defined_corpora
def _long_by_position(self, syllable, sentence): """Check if syllable is long by position. Long by position includes: 1) Next syllable begins with two consonants, unless those consonants are a stop + liquid combination 2) Next syllable begins with a double consonant 3) Syllable ends with a consonant and the next syllable begins with a consonant :param syllable: Current syllable :param sentence: Current sentence :return: True if syllable is long by position :rtype : bool """ try: next_syll = sentence[sentence.index(syllable) + 1] # Long by postion by case 1 if (next_syll[0] in self.sing_cons and next_syll[1] in self.sing_cons) and (next_syll[0] not in self.stops and next_syll[1] not in self.liquids): return True # Long by position by case 2 elif syllable[-1] in self.vowels and next_syll[0] in \ self.doub_cons: return True # Long by position by case 3 elif syllable[-1] in self.sing_cons and next_syll[0] in \ self.sing_cons: return True else: pass except IndexError: logger.info("IndexError while checking if syllable '%s' is long. Continuing.", syllable)
def __init__(self, language, testing=False):
    """Setup corpus importing.

    `testing` is a hack to check a tmp .yaml file to look at or local
    corpus. This keeps from overwriting local. A better idea is probably
    to refuse to overwrite the .yaml.

    :param language: Language name; lowercased before lookup.
    :param testing: Must be bool; selects the test .yaml in helpers.
    """
    self.language = language.lower()
    assert isinstance(testing, bool), '`testing` parameter must be boolean type'
    self.testing = testing
    self.user_defined_corpora = self._setup_language_variables()
    # if user_defined_corpora, then we need to add these to the corpus.py objects
    if self.user_defined_corpora:
        logger.info('User-defined corpus found for "{}" language'.format(self.language))
        try:
            logger.debug('Core corpora also found for "{}" language'.format(self.language))
            logger.debug('Combining the user-defined and the core corpora')
            self.official_corpora = LANGUAGE_CORPORA[self.language]
            # Copy before appending: the old `self.all_corpora =
            # self.official_corpora` aliased the registry list, so every
            # instantiation appended user corpora into the shared
            # module-level LANGUAGE_CORPORA entry.
            self.all_corpora = list(self.official_corpora)
            self.all_corpora.extend(self.user_defined_corpora)
        except KeyError:
            logger.debug('Nothing in the official repos '
                         'for "{}" language. Make the all_corpora solely '
                         'from the .yaml'.format(self.language))
            self.all_corpora = list(self.user_defined_corpora)
    else:
        logger.info('No user-defined corpora found for "{}" language'.format(self.language))
        # self.official_corpora = LANGUAGE_CORPORA[self.language]
        self.all_corpora = LANGUAGE_CORPORA[self.language]
def _check_distributed_corpora_file(self):
    """Check '~/cltk_data/distributed_corpora.yaml' for any custom,
    distributed corpora that the user wants to load locally.

    TODO: write check or try if `cltk_data` dir is not present

    :return: List of dicts (keys ``'git_remote'``, ``'name'``, ``'type'``)
        for corpora whose ``language`` matches ``self.language``; empty list
        when the YAML file is absent, empty, or unparsable.
    """
    if self.testing:
        distributed_corpora_fp = os.path.expanduser('~/cltk_data/test_distributed_corpora.yaml')
    else:
        distributed_corpora_fp = os.path.expanduser('~/cltk_data/distributed_corpora.yaml')
    try:
        with open(distributed_corpora_fp) as file_open:
            corpora_dict = yaml.safe_load(file_open)
    except FileNotFoundError:
        logger.info('Distributed_corpora.yaml file not found.')
        return []
    except yaml.parser.ParserError as parse_err:
        logger.debug('Yaml parsing error: %s', parse_err)
        return []
    if not corpora_dict:
        # Guard: yaml.safe_load() yields None for an empty file, which the
        # loop below cannot iterate.
        return []
    user_defined_corpora = []
    for corpus_name, about in corpora_dict.items():
        if about['language'].lower() == self.language:
            user_defined_corpus = dict()
            user_defined_corpus['git_remote'] = about['git_remote']
            user_defined_corpus['name'] = corpus_name
            user_defined_corpus['type'] = about['type']
            user_defined_corpora.append(user_defined_corpus)
    return user_defined_corpora
def _long_by_position(self, syllable, sentence): """Check if syllable is long by position. Long by position includes: 1) Next syllable begins with two consonants, unless those consonants are a stop + liquid combination 2) Next syllable begins with a double consonant 3) Syllable ends with a consonant and the next syllable begins with a consonant :param syllable: Current syllable :param sentence: Current sentence :return: True if syllable is long by position :rtype : bool """ try: next_syll = sentence[sentence.index(syllable) + 1] # Long by position by case 1 if (next_syll[0] in self.sing_cons and next_syll[1] in self.sing_cons) and (next_syll[0] not in self.stops and next_syll[1] not in self.liquids): return True # Long by position by case 2 elif syllable[-1] in self.vowels and next_syll[0] in self.doub_cons: return True # Long by position by case 3 elif syllable[-1] in self.sing_cons and (next_syll[0] in self.sing_cons): return True else: pass except IndexError: logger.info( "IndexError while checking if syllable '%s' is long. Continuing.", syllable)
def _make_syllables(self, sentences_words):
    """Divide the word tokens into a list of syllables.

    Note that a syllable in this instance is defined as a vocalic group
    (i.e., vowel or a diphthong). This means that all syllables which are
    not the last syllable in the word will end with a vowel or diphthong.

    TODO: Determine whether a CLTK syllabifier could replace this

    :param sentence_words: Input text (passed through ``self._tokenize``).
    :return: Syllabified words, nested as [sentence][word][syllable].
    :rtype : list
    """
    text = self._tokenize(sentences_words)
    all_syllables = []
    for sentence in text:
        syll_per_sent = []
        for word in sentence:
            syll_start = 0  # Begins syllable iterator
            syll_per_word = []
            cur_letter_in = 0  # Begins general iterator
            while cur_letter_in < len(word):
                letter = word[cur_letter_in]
                # Two-char lookahead: a diphthong closes the syllable and
                # consumes both characters.
                if (cur_letter_in != len(word) - 1) and \
                        (word[cur_letter_in] + word[cur_letter_in + 1]) \
                        in self.diphthongs:
                    cur_letter_in += 1
                    # Syllable ends with a diphthong
                    syll_per_word.append(word[syll_start:cur_letter_in + 1])
                    syll_start = cur_letter_in + 1
                elif (letter in self.vowels) or (letter in self.long_vowels):
                    # Syllable ends with a vowel
                    syll_per_word.append(word[syll_start:cur_letter_in + 1])
                    syll_start = cur_letter_in + 1
                cur_letter_in += 1
            try:
                last_vowel = syll_per_word[-1][-1]  # Last vowel of a word
                # Modifies general iterator to accommodate consonants after
                # the last syllable in a word: walk backwards from the end
                # of the word to the last vowel.
                cur_letter_in = len(word) - 1
                # Contains all of the consonants after the last vowel in a word
                leftovers = ''
                while word[cur_letter_in] != last_vowel:
                    if word[cur_letter_in] != '.':
                        # Adds consonants to leftovers
                        leftovers = word[cur_letter_in] + leftovers
                    cur_letter_in -= 1
                # Adds leftovers to last syllable in a word
                syll_per_word[-1] += leftovers
                syll_per_sent.append(syll_per_word)
            except IndexError:
                # Word without any vowel/diphthong: syll_per_word is empty,
                # so the word is skipped entirely.
                logger.info(
                    "IndexError while making syllables of '%s'. Continuing.", word)
        all_syllables.append(syll_per_sent)
    return all_syllables
def test_contribs_find_write_contribs(self):
    """Test contrib writing function."""
    target = 'contributors.md'
    # Clear any output left over from a previous run.
    try:
        os.remove(target)
    except FileNotFoundError:
        logger.info("No file to remove at '%s'. Continuing.", target)
    find_write_contribs()
    self.assertTrue(os.path.isfile(target))
def test_contribs_write_contribs(self):
    """Exercise Contributors.write_contribs() and check the file appears."""
    out_file = 'contributors.md'
    # Remove stale output so the assertion tests this run's write.
    try:
        os.remove(out_file)
    except FileNotFoundError:
        logger.info("No file to remove at '%s'. Continuing.", out_file)
    Contributors().write_contribs()
    self.assertTrue(os.path.isfile(out_file))
def test_contribs_find_write_contribs(self):
    """Test contrib writing function."""
    md_path = 'contributors.md'
    try:
        os.remove(md_path)
    except FileNotFoundError:
        logger.info("No file to remove at '%s'. Continuing.", md_path)
    # Run the writer and confirm it produced the markdown file.
    find_write_contribs()
    exists = os.path.isfile(md_path)
    self.assertTrue(exists)
def _what_os(self):
    """Get operating system.

    :return: One of ``'linux'``, ``'mac'``, or ``'windows'``.
    :raises OSError: For an unrecognized platform. (Previously an unknown
        platform left ``_platform`` unbound and crashed with NameError.)
    """
    if platform in ('linux', 'linux2'):
        _platform = 'linux'
    elif platform == 'darwin':
        _platform = 'mac'
    elif platform == 'win32':
        _platform = 'windows'
    else:
        logger.error("Unrecognized operating system: '%s'.", platform)
        raise OSError("Unsupported operating system: '{}'.".format(platform))
    logger.info("Detected '{}' operating system.".format(_platform))
    return _platform
def load_replacement_patterns(self):
    """Check for availability of the specified dictionary and load it.

    :return: The ``DICTIONARY`` mapping defined in the installed model file.
    :raises FileNotFoundError: If the model file is not installed at
        '~/cltk_data/<language>/model/...'.
    """
    import importlib.util  # local import: keeps the file's top imports untouched
    filename = self.dictionary + '.py'
    models = self.language + '_models_cltk'
    rel_path = os.path.join('~/cltk_data', self.language, 'model', models, 'semantics', filename)
    path = os.path.expanduser(rel_path)
    logger.info('Loading lemmata or synonyms. This may take a minute.')
    # Spec-based loading replaces running SourceFileLoader.exec_module()
    # against a hand-built ModuleType, which left __spec__/__loader__ unset.
    spec = importlib.util.spec_from_file_location(filename, path)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module.DICTIONARY
def _make_syllables(self, sentences_words):
    """Divide the word tokens into a list of syllables.

    Note that a syllable in this instance is defined as a vocalic group
    (i.e., vowel or a diphthong). This means that all syllables which are
    not the last syllable in the word will end with a vowel or diphthong.

    TODO: Determine whether a CLTK syllabifier could replace this

    :param sentence_words: Input text (tokenized via ``self._tokenize``).
    :return: Syllabified words, nested as [sentence][word][syllable].
    :rtype : list
    """
    text = self._tokenize(sentences_words)
    all_syllables = []
    for sentence in text:
        syll_per_sent = []
        for word in sentence:
            syll_start = 0  # Begins syllable iterator
            syll_per_word = []
            cur_letter_in = 0  # Begins general iterator
            while cur_letter_in < len(word):
                letter = word[cur_letter_in]
                # Check the two-character window for a diphthong first,
                # so e.g. a diphthong is not split as two vowels.
                if (cur_letter_in != len(word) - 1) and \
                        (word[cur_letter_in] + word[cur_letter_in + 1]) \
                        in self.diphthongs:
                    cur_letter_in += 1
                    # Syllable ends with a diphthong
                    syll_per_word.append(word[syll_start:cur_letter_in + 1])
                    syll_start = cur_letter_in + 1
                elif (letter in self.vowels) or (letter in self.long_vowels):
                    # Syllable ends with a vowel
                    syll_per_word.append(word[syll_start:cur_letter_in + 1])
                    syll_start = cur_letter_in + 1
                cur_letter_in += 1
            try:
                last_vowel = syll_per_word[-1][-1]  # Last vowel of a word
                # Modifies general iterator to accommodate consonants after
                # the last syllable in a word
                cur_letter_in = len(word) - 1
                # Contains all of the consonants after the last vowel in a word
                leftovers = ''
                while word[cur_letter_in] != last_vowel:
                    if word[cur_letter_in] != '.':
                        # Adds consonants to leftovers
                        leftovers = word[cur_letter_in] + leftovers
                    cur_letter_in -= 1
                # Adds leftovers to last syllable in a word
                syll_per_word[-1] += leftovers
                syll_per_sent.append(syll_per_word)
            except IndexError:
                # No vocalic group found in the word; skip it.
                logger.info("IndexError while making syllables of '%s'. Continuing.", word)
        all_syllables.append(syll_per_sent)
    return all_syllables
def write_contribs(def_dict_list):
    """Write to file, in current dir, 'contributors.md'."""
    # Assemble the markdown as a list of chunks, then join once.
    chunks = ['# Contributors\nCLTK Core authors, ordered alphabetically by first name\n\n']
    for contrib in def_dict_list:
        chunks.append('## ' + contrib + '\n')
        for module in def_dict_list[contrib]:
            chunks.append('* ' + module + '\n')
        chunks.append('\n')
    file_name = 'contributors.md'
    with open(file_name, 'w') as file_open:
        file_open.write(''.join(chunks))
    logger.info('Wrote contribs file at "%s".', file_name)
def _check_install(self):
    """Check if tlgu installed, if not install it."""
    try:
        # `which` exits non-zero (raising CalledProcessError) when the
        # binary is absent.
        subprocess.check_output(['which', 'tlgu'])
    except Exception as exc:
        logger.info('TLGU not installed: %s', exc)
        logger.info('Installing TLGU.')
        if not subprocess.check_output(['which', 'gcc']):
            logger.error('GCC seems not to be installed.')
        else:
            tlgu_path_rel = '~/cltk_data/greek/software/greek_software_tlgu'
            tlgu_path = os.path.expanduser(tlgu_path_rel)
            if not self.testing:
                print('Do you want to install TLGU? To continue, press Return. To exit, Control-C.')
                input()
            else:
                print('Automated or test build, skipping keyboard input confirmation for installation of TLGU.')
            try:
                p_out = subprocess.call('cd {0} && make install'.format(tlgu_path), shell=True)
                if p_out == 0:
                    logger.info('TLGU installed.')
                else:
                    logger.error('TLGU install without sudo failed.')
            except Exception as exc:
                logger.error('TLGU install failed: %s', exc)
            else:  # for Linux needing root access to '/usr/local/bin'
                # NOTE(review): this `else` runs whenever the first
                # `subprocess.call` did not raise — i.e. the sudo install is
                # attempted even after a successful non-sudo install; confirm
                # this is intended.
                p_out = subprocess.call('cd {0} && sudo make install'.format(tlgu_path), shell=True)
                if p_out == 0:
                    logger.info('TLGU installed.')
                else:
                    logger.error('TLGU install with sudo failed.')
def make(self):
    """Build program."""
    #! for linux install Clan
    lapos_dir = os.path.expanduser('~/cltk_data/multilingual/software/lapos')
    exit_code = subprocess.call('cd {} && make'.format(lapos_dir),
                                shell=True,
                                stdout=subprocess.DEVNULL)
    if exit_code != 0:
        print('Lapos did not build successfully.')
        logger.error('Lapos did not build successfully.')
    else:
        print('Lapos built successfully.')
        logger.info('Lapos build successfully.')
def write_contribs(def_dict_list: Dict[str, List[str]]) -> None:
    """Write to file, in current dir, 'contributors.md'."""
    header = '# Contributors\nCLTK Core authors, ordered alphabetically by first name\n\n'
    pieces = [header]
    for contrib, modules in def_dict_list.items():
        pieces.append('## ' + contrib + '\n')
        for module in modules:
            pieces.append('* ' + module + '\n')
        pieces.append('\n')
    file_name = 'contributors.md'
    with open(file_name, 'w') as file_open:
        file_open.write(''.join(pieces))
    logger.info('Wrote contribs file at "%s".', file_name)
def _check_install():
    """Check if tlgu installed, if not install it."""
    try:
        # Probe the PATH; `which` raises via check_output when tlgu is missing.
        subprocess.check_output(['which', 'tlgu'])
    except Exception as exc:
        logger.info('TLGU not installed: %s', exc)
        logger.info('Installing TLGU.')
        if not subprocess.check_output(['which', 'gcc']):
            logger.error('GCC seems not to be installed.')
        else:
            tlgu_path_rel = '~/cltk_data/greek/software/greek_software_tlgu'
            tlgu_path = os.path.expanduser(tlgu_path_rel)
            try:
                p_out = subprocess.call('cd {0} && make install'.format(tlgu_path), shell=True)
                if p_out == 0:
                    logger.info('TLGU installed.')
                else:
                    logger.error('TLGU install without sudo failed.')
            except Exception as exc:
                logger.error('TLGU install failed: %s', exc)
            else:  # for Linux needing root access to '/usr/local/bin'
                # NOTE(review): this branch runs whenever the first call did
                # not raise, so sudo install is retried even after a
                # successful non-sudo install — confirm intended.
                p_out = subprocess.call('cd {0} && sudo make install'.format(tlgu_path), shell=True)
                if p_out == 0:
                    logger.info('TLGU installed.')
                else:
                    logger.error('TLGU install with sudo failed.')
def write_contribs(self):
    """Write to file, in current dir, 'contributors.md'."""
    out_chunks = ['# Contributors\nCLTK Core authors, ordered alphabetically by first name\n\n']
    for contrib in self.credits:
        out_chunks.append('## ' + contrib + '\n')
        for module in self.credits[contrib]:
            out_chunks.append('* ' + module + '\n')
        out_chunks.append('\n')
    file_name = 'contributors.md'
    with open(file_name, 'w') as file_open:
        file_open.write(''.join(out_chunks))
    logger.info('Wrote contribs file at "%s".', file_name)
def test_write_contribs(self):
    """Test file writer for contribs module."""
    target = 'contributors.md'
    # Remove any stale output from a previous run.
    try:
        os.remove(target)
    except FileNotFoundError:
        logger.info("No file to remove at '%s'. Continuing.", target)
    # Build a small fixture dict and write it out.
    fixture = defaultdict(list)
    fixture['key'].extend(['val1', 'val2'])
    write_contribs(fixture)
    self.assertTrue(os.path.isfile(target))
def _git_user_defined_corpus(self, corpus_name, corpus_type, uri: str, branch='master'):
    """Clone or update a git repo defined by user.

    TODO: This code is very redundant with what's in import_corpus(),
    could be refactored.

    :param corpus_name: Display name of the corpus (used in log messages).
    :param corpus_type: Corpus category; becomes a subdirectory of the
        language data dir.
    :param uri: Git remote URI; its last path segment names the local dir.
    :param branch: Git branch to clone.
    """
    # git_uri = urljoin('https://github.com/cltk/', corpus_name + '.git')
    # self._download_corpus(corpus_type, corpus_name, path)
    type_dir_rel = os.path.join(CLTK_DATA_DIR, self.language, corpus_type)
    type_dir = os.path.expanduser(type_dir_rel)
    repo_name = uri.split('/')[-1]  # eg, 'latin_corpus_newton_example.git'
    # str.rstrip('.git') strips a *set* of characters ('.', 'g', 'i', 't'),
    # mangling repo names that end in any of them (e.g. 'corpus_git.git');
    # strip the suffix explicitly instead.
    if repo_name.endswith('.git'):
        repo_name = repo_name[:-len('.git')]
    target_dir = os.path.join(type_dir, repo_name)
    target_file = os.path.join(type_dir, repo_name, 'README.md')
    # check if corpus already present
    # if not, clone
    if not os.path.isfile(target_file):
        if not os.path.isdir(type_dir):
            os.makedirs(type_dir)
        try:
            msg = "Cloning '{}' from '{}'".format(corpus_name, uri)
            logger.info(msg)
            Repo.clone_from(uri, target_dir, branch=branch, depth=1,
                            progress=ProgressPrinter())
        except CorpusImportError as corpus_imp_err:
            msg = "Git clone of '{}' failed: '{}'".format(uri, corpus_imp_err)
            logger.error(msg)
    # if corpus is present, pull latest
    else:
        try:
            repo = Repo(target_dir)
            assert not repo.bare  # or: assert repo.exists()
            git_origin = repo.remotes.origin
            msg = "Pulling latest '{}' from '{}'.".format(corpus_name, uri)
            logger.info(msg)
            git_origin.pull()
        except CorpusImportError as corpus_imp_err:
            msg = "Git pull of '{}' failed: '{}'".format(uri, corpus_imp_err)
            logger.error(msg)
def _setup_language_variables(self):
    """Check for availability of corpora for a language.

    TODO: Make the selection of available languages dynamic from dirs
    within ``corpora`` which contain a ``corpora.py`` file.

    :return: User-defined corpora from the distributed-corpora .yaml
        (possibly empty when the language has official corpora).
    :raises CorpusImportError: When the language has neither official nor
        user-defined corpora.
    """
    user_defined_corpora = self._check_distributed_corpora_file()
    if self.language in AVAILABLE_LANGUAGES:
        return user_defined_corpora
    if user_defined_corpora:
        return user_defined_corpora
    msg = 'Corpora not available (either core or user-defined) for the "{}" language.'.format(self.language)
    logger.info(msg)
    raise CorpusImportError(msg)
def _retrieve_morpheus_entry(self, word): """Return Morpheus entry for word Entry format: [(head word, tag, macronized form)] :param word: unmacronized, lowercased word :ptype word: string :return: Morpheus entry in tuples :rtype : list """ entry = self.macron_data.get(word) if entry is None: logger.info('No Morpheus entry found for {}.'.format(word)) return None elif len(entry) == 0: logger.info('No Morpheus entry found for {}.'.format(word)) return entry
def write_concordance_from_string(self, text, name):
    """A reworking of write_concordance_from_file(). Refactor these.

    Builds a concordance from ``text`` and writes it to
    '~/cltk_data/user_data/concordance_<name>.txt'.
    """
    list_of_lists = self._build_concordance(text)
    user_data_rel = '~/cltk_data/user_data'
    user_data = os.path.expanduser(user_data_rel)
    if not os.path.isdir(user_data):
        os.makedirs(user_data)
    file_path = os.path.join(user_data, 'concordance_' + name + '.txt')
    concordance_output = ''
    for word_list in list_of_lists:
        for line in word_list:
            concordance_output += line + '\n'
    try:
        with open(file_path, 'w') as open_file:
            open_file.write(concordance_output)
        # Lazy %-args instead of eager string interpolation.
        logger.info("Wrote concordance to '%s'.", file_path)
    except IOError as io_error:
        # Include the captured exception (it was bound but unused before),
        # matching the typed sibling implementation's log format.
        logger.error("Failed to write concordance to '%s'. Error: %s", file_path, io_error)
def _check_corpus_availability(self, corpus_name):
    """Check whether a corpus is available for import.

    :type corpus_name: str
    :param corpus_name: Name of available corpus.
    :return: The matching corpus properties dict, or None when the corpus
        (or the language itself) is unavailable.
    :rtype : dict
    """
    try:
        corpora = LANGUAGE_CORPORA[self.language]
    except KeyError as key_error:
        # A missing language raises KeyError (the old handler caught
        # NameError, which never fires here, and then crashed below on the
        # unbound `corpora` name). Also pass logger args individually
        # rather than as a single tuple.
        logger.error('Corpus not available for language %s: %s',
                     self.language, key_error)
        return None
    corpus_properties = None
    for corpus in corpora:
        if corpus['name'] == corpus_name:
            corpus_properties = corpus
    if not corpus_properties:
        logger.info("Corpus '%s' not available for the '%s' language.",
                    corpus_name,
                    self.language)
    return corpus_properties
def write_concordance_from_string(text: str, name: str) -> None:
    """Build a concordance of ``text`` and write it to
    '~/cltk_data/user_data/concordance_<name>.txt'.

    TODO: Refactor together with write_concordance_from_file().
    """
    word_lists = build_concordance(text)  # type: List[List[str]]
    user_data = os.path.expanduser('~/cltk_data/user_data')  # type: str
    if not os.path.isdir(user_data):
        os.makedirs(user_data)
    file_path = os.path.join(user_data, 'concordance_' + name + '.txt')  # type: str
    concordance_output = ''.join(
        line + '\n' for word_list in word_lists for line in word_list)  # type: str
    try:
        with open(file_path, 'w') as open_file:
            open_file.write(concordance_output)
        logger.info("Wrote concordance to '%s'.", file_path)
    except IOError as io_error:
        logger.error("Failed to write concordance to '%s'. Error: %s", file_path, io_error)
def _copy_dir_recursive(src_rel, dst_rel):
    """Copy contents of one directory to another. `dst_rel` dir cannot exist.
    Source: http://stackoverflow.com/a/1994840
    TODO: Move this to file_operations.py module.

    :type src_rel: str
    :param src_rel: Directory to be copied.
    :type dst_rel: str
    :param dst_rel: Directory to be created with contents of ``src_rel``.
    """
    src = os.path.expanduser(src_rel)
    dst = os.path.expanduser(dst_rel)
    try:
        shutil.copytree(src, dst)
    except OSError as exc:
        # A plain file raises ENOTDIR and is copied directly;
        # anything else propagates.
        if exc.errno != errno.ENOTDIR:
            raise
        shutil.copy(src, dst)
    logger.info('Files copied from %s to %s', src, dst)
def _macronize_word(self, word): """Return macronized word. :param word: (word, tag) :ptype word: tuple :return: (word, tag, macronized_form) :rtype : tuple """ head_word = word[0] tag = word[1] if tag is None: logger.info('Tagger {} could not tag {}.'.format(self.tagger, head_word)) return head_word, tag, head_word elif tag == 'U--------': return (head_word, tag.lower(), head_word) else: entries = self._retrieve_morpheus_entry(head_word) if entries is None: return head_word, tag.lower(), head_word matched_entry = [entry for entry in entries if entry[0] == tag.lower()] if len(matched_entry) == 0: logger.info('No matching Morpheus entry found for {}.'.format(head_word)) return head_word, tag.lower(), entries[0][2] elif len(matched_entry) == 1: return head_word, tag.lower(), matched_entry[0][2].lower() else: logger.info('Multiple matching entries found for {}.'.format(head_word)) return head_word, tag.lower(), matched_entry[1][2].lower()
def _macronize_word(self, word): """Return macronized word. :param word: (word, tag) :ptype word: tuple :return: (word, tag, macronized_form) :rtype : tuple """ head_word = word[0] tag = word[1] if tag is None: logger.info('Tagger {} could not tag {}.'.format( self.tagger, head_word)) return head_word, tag, head_word elif tag == 'U--------': return (head_word, tag.lower(), head_word) else: entries = self._retrieve_morpheus_entry(head_word) if entries is None: return head_word, tag.lower(), head_word matched_entry = [ entry for entry in entries if entry[0] == tag.lower() ] if len(matched_entry) == 0: logger.info('No matching Morpheus entry found for {}.'.format( head_word)) return head_word, tag.lower(), entries[0][2] elif len(matched_entry) == 1: return head_word, tag.lower(), matched_entry[0][2].lower() else: logger.info('Multiple matching entries found for {}.'.format( head_word)) return head_word, tag.lower(), matched_entry[1][2].lower()
def _check_install(self):
    """Check if tlgu installed, if not install it."""
    try:
        # `which` exits non-zero (check_output raises) when tlgu is absent.
        subprocess.check_output(['which', 'tlgu'])
    except Exception as exc:
        logger.info('TLGU not installed: %s', exc)
        logger.info('Installing TLGU.')
        if not subprocess.check_output(['which', 'gcc']):
            logger.error('GCC seems not to be installed.')
        else:
            tlgu_path_rel = get_cltk_data_dir(
            ) + '/greek/software/greek_software_tlgu'
            tlgu_path = os.path.expanduser(tlgu_path_rel)
            # Interactive confirmation is skipped for automated/test builds.
            if not self.testing:
                print('Do you want to install TLGU?')
                print('To continue, press Return. To exit, Control-C.')
                input()
            else:
                print(
                    'Automated or test build, skipping keyboard input confirmation for installation of TLGU.'
                )
            try:
                command = 'cd {0} && make install'.format(tlgu_path)
                print('Going to run command:', command)
                p_out = subprocess.call(command, shell=True)
                if p_out == 0:
                    logger.info('TLGU installed.')
                else:
                    logger.error('TLGU install without sudo failed.')
            except Exception as exc:
                logger.error('TLGU install failed: %s', exc)
            else:  # for Linux needing root access to '/usr/local/bin'
                # NOTE(review): this `else` runs whenever the non-sudo call
                # did not raise — the sudo install is attempted even after a
                # successful non-sudo install; confirm intended.
                if not self.testing:
                    print(
                        'Could not install without root access. Do you want to install TLGU with sudo?'
                    )
                    command = 'cd {0} && sudo make install'.format(tlgu_path)
                    print('Going to run command:', command)
                    print('To continue, press Return. To exit, Control-C.')
                    input()
                    p_out = subprocess.call(command, shell=True)
                else:
                    command = 'cd {0} && sudo make install'.format(tlgu_path)
                    p_out = subprocess.call(command, shell=True)
                if p_out == 0:
                    logger.info('TLGU installed.')
                else:
                    logger.error('TLGU install with sudo failed.')
def _git_user_defined_corpus(self, corpus_name, corpus_type, uri: str, branch='master'):
    """Clone or update a git repo defined by user.

    TODO: This code is very redundant with what's in import_corpus(),
    could be refactored.

    :param corpus_name: Display name of the corpus (log messages only).
    :param corpus_type: Corpus category; subdirectory under the language dir.
    :param uri: Git remote URI; last path segment names the local clone dir.
    :param branch: Git branch to clone.
    """
    # git_uri = urljoin('https://github.com/cltk/', corpus_name + '.git')
    # self._download_corpus(corpus_type, corpus_name, path)
    type_dir_rel = os.path.join(CLTK_DATA_DIR, self.language, corpus_type)
    type_dir = os.path.expanduser(type_dir_rel)
    repo_name = uri.split('/')[-1]  # eg, 'latin_corpus_newton_example.git'
    # rstrip('.git') strips trailing '.', 'g', 'i', 't' characters (a set,
    # not a suffix) and mangles names ending in any of them; remove the
    # suffix explicitly.
    if repo_name.endswith('.git'):
        repo_name = repo_name[:-len('.git')]
    target_dir = os.path.join(type_dir, repo_name)
    target_file = os.path.join(type_dir, repo_name, 'README.md')
    # check if corpus already present
    # if not, clone
    if not os.path.isfile(target_file):
        if not os.path.isdir(type_dir):
            os.makedirs(type_dir)
        try:
            msg = "Cloning '{}' from '{}'".format(corpus_name, uri)
            logger.info(msg)
            Repo.clone_from(uri, target_dir, branch=branch, depth=1,
                            progress=ProgressPrinter())
        except CorpusImportError as corpus_imp_err:
            msg = "Git clone of '{}' failed: '{}'".format(uri, corpus_imp_err)
            logger.error(msg)
    # if corpus is present, pull latest
    else:
        try:
            repo = Repo(target_dir)
            assert not repo.bare  # or: assert repo.exists()
            git_origin = repo.remotes.origin
            msg = "Pulling latest '{}' from '{}'.".format(corpus_name, uri)
            logger.info(msg)
            git_origin.pull()
        except CorpusImportError as corpus_imp_err:
            msg = "Git pull of '{}' failed: '{}'".format(uri, corpus_imp_err)
            logger.error(msg)
def write_concordance_from_file(filepaths: Union[str, List[str]], name: str) -> None:
    """This calls the modified ConcordanceIndex, taken and modified from the
    NLTK, and writes to disk a file named 'concordance_' + name at
    '~/cltk_data/user_data/'.

    TODO: Add language (here or in class), lowercase option, stemming/
    lemmatization, else?

    :type filepaths: str or list
    :param filepaths: Filepath of text(s) to be used in concordance.
    :rtype : str
    """
    assert isinstance(filepaths, (str, list))
    if isinstance(filepaths, str):
        text = read_file(filepaths)  # type: str
    elif isinstance(filepaths, list):
        # Concatenate every input file into one text.
        text = ''
        for filepath in filepaths:
            text += read_file(filepath)
    word_lists = build_concordance(text)  # type: List[List[str]]
    user_data = os.path.expanduser('~/cltk_data/user_data')  # type: str
    if not os.path.isdir(user_data):
        os.makedirs(user_data)
    file_path = os.path.join(user_data, 'concordance_' + name + '.txt')
    concordance_output = ''.join(
        line + '\n' for word_list in word_lists for line in word_list)  # type: str
    try:
        with open(file_path, 'w') as open_file:
            open_file.write(concordance_output)
        logger.info("Wrote concordance to '%s'.", file_path)
    except IOError as io_error:
        logger.error("Failed to write concordance to '%s'. Error: %s", file_path, io_error)
def write_concordance_from_file(self, filepaths, name):
    """This calls my modified ConcordanceIndex, taken and modified from the
    NLTK, and writes to disk a file named 'concordance_' + name at
    '~/cltk_data/user_data/'.

    TODO: Add language (here or in class), lowercase option, stemming/
    lemmatization, else?

    :type filepaths: str or list
    :param filepaths: Filepath of text(s) to be used in concordance.
    :rtype : str
    """
    assert isinstance(filepaths, (str, list))
    if isinstance(filepaths, str):
        filepath = filepaths
        text = self._read_file(filepath)
    elif isinstance(filepaths, list):
        text = ''
        for filepath in filepaths:
            text += self._read_file(filepath)
    list_of_lists = self._build_concordance(text)
    user_data_rel = '~/cltk_data/user_data'
    user_data = os.path.expanduser(user_data_rel)
    if not os.path.isdir(user_data):
        os.makedirs(user_data)
    file_path = os.path.join(user_data, 'concordance_' + name + '.txt')
    concordance_output = ''
    for word_list in list_of_lists:
        for line in word_list:
            concordance_output += line + '\n'
    try:
        with open(file_path, 'w') as open_file:
            open_file.write(concordance_output)
        # Lazy %-args instead of eager string interpolation.
        logger.info("Wrote concordance to '%s'.", file_path)
    except IOError as io_error:
        # Include the captured exception (previously bound but unused),
        # matching the module-level typed implementation.
        logger.error("Failed to write concordance to '%s'. Error: %s", file_path, io_error)
def index_corpus(self):
    """Make a Whoosh index out of a pre-processed corpus, ie TLG, PHI5,
    or PHI7.

    TLG takes almost 13 min; PHI5 1.5 min.
    To setup index parameters
    >>> cltk_index = CLTKIndex('latin', 'phi5')  # 1.5 min, 363 docs
    >>> cltk_index = CLTKIndex('latin', 'phi5', chunk='work')  # 2 min, 837 docs
    >>> cltk_index = CLTKIndex('greek', 'tlg')  # 13 min, 1823 docs
    >>> cltk_index = CLTKIndex('greek', 'tlg', chunk='work')  #15.5 min, 6625 docs

    # And to start indexing:
    >>> cltk_index.index_corpus()

    TODO: Prevent overwriting. Ask user to rm old dir before re-indexing.
    TODO: Add option for lemmatizing.
    TODO: Add for figure out lower() options.
    TODO: Process TLG through forthcoming normalize().
    TODO: Add name to each index.
    TODO: Turn off any language-specific mods (eg, stemming, case) that
    Whoosh might be doing by default.
    """
    # Setup index dir
    schema = Schema(path=ID(stored=True),
                    author=TEXT(stored=True),
                    content=TEXT)
    try:
        _index = create_in(self.index_path, schema)
    except FileNotFoundError:
        # Index dir missing: create it and retry once.
        os.makedirs(self.index_path)
        _index = create_in(self.index_path, schema)
    writer = _index.writer()
    # Setup corpus to be indexed: pick the plaintext dir (per-author) or
    # the individual_works dir (per-work) for the lang/corpus pair.
    if self.lang == 'greek' and self.corpus == 'tlg':
        corpus_path = os.path.expanduser('~/cltk_data/greek/text/tlg/plaintext/')
        if self.chunk == 'work':
            corpus_path = os.path.expanduser('~/cltk_data/greek/text/tlg/individual_works/')
    elif self.lang == 'latin' and self.corpus == 'phi5':
        corpus_path = os.path.expanduser('~/cltk_data/latin/text/phi5/plaintext/')
        if self.chunk == 'work':
            corpus_path = os.path.expanduser('~/cltk_data/latin/text/phi5/individual_works/')
    assert os.path.isdir(corpus_path), 'Corpus does not exist in the following location: "%s". Use CLTK Corpus Importer and TLGU to create transformed corpus.' % corpus_path  # pylint: disable=line-too-long
    files = os.listdir(corpus_path)
    # Strip the '.TXT' extension; keep only this corpus's file prefix.
    if self.lang == 'greek' and self.corpus == 'tlg':
        files = [f[:-4] for f in files if f.startswith('TLG')]
        corpus_index = TLG_AUTHOR_MAP
    elif self.lang == 'latin' and self.corpus == 'phi5':
        files = [f[:-4] for f in files if f.startswith('LAT')]
        corpus_index = PHI5_AUTHOR_MAP
    time_0 = time.time()
    logger.info("Commencing indexing of %s documents of '%s' corpus." % (len(files), self.corpus))  # pylint: disable=line-too-long
    logger.info('Index will be written to: "%s".' % self.index_path)
    if self.chunk == 'author':
        for count, file in enumerate(files, 1):
            try:
                if self.lang == 'greek' and self.corpus == 'tlg':
                    # Drop the 'TLG' prefix for the author-map lookup,
                    # re-add it to build the on-disk filename.
                    file = file[3:]
                    author = corpus_index[file]
                    path = os.path.join(corpus_path, 'TLG' + file + '.TXT')
                if self.lang == 'latin' and self.corpus == 'phi5':
                    author = corpus_index[file]
                    path = os.path.join(corpus_path, file + '.TXT')
            except KeyError as key_error:
                # LAT9999* files have no author-map entry; skip silently.
                if file.startswith('LAT9999'):
                    continue
                logger.error(key_error)
                raise
            with open(path) as file_open:
                content = file_open.read()
            writer.add_document(path=path, author=author, content=content)
            if count % 100 == 0:
                logger.info('Indexed doc %s.' % count)
    if self.chunk == 'work':
        for count, file in enumerate(files, 1):
            try:
                if self.lang == 'greek' and self.corpus == 'tlg':
                    path = os.path.join(corpus_path, file + '.TXT')
                    # Work files carry an 8-char work suffix after the
                    # author id; slice both prefix and suffix away.
                    author = corpus_index[file[3:-8]]
                if self.lang == 'latin' and self.corpus == 'phi5':
                    path = os.path.join(corpus_path, file + '.TXT')
                    author = corpus_index[file[:-8]]
            except KeyError as key_error:
                if file.startswith('LAT9999'):
                    continue
                logger.error(key_error)
                raise
            with open(path) as file_open:
                content = file_open.read()
            writer.add_document(path=path, author=author, content=content)
            if count % 100 == 0:
                logger.info('Indexed doc %s.' % count)
    logger.info('Commencing to commit changes.')
    writer.commit()
    time_1 = time.time()
    elapsed = time_1 - time_0
    logger.info('Finished indexing all documents in %s seconds (averaging %s docs per sec.)' % (elapsed, (len(files) / elapsed)))  # pylint: disable=line-too-long
def import_corpus(self, corpus_name, local_path=None, branch='master'):  # pylint: disable=R0912
    """Download a remote or load local corpus into dir ``~/cltk_data``.

    TODO: maybe add ``from git import RemoteProgress``
    TODO: refactor this, it's getting kinda long

    :type corpus_name: str
    :param corpus_name: The name of an available corpus.
    :param local_path: str
    :param local_path: A filepath, required when importing local corpora.
    :param branch: What Git branch to clone.
    """
    corpus_properties = self._get_corpus_properties(corpus_name)
    try:
        location = corpus_properties['location']
    except KeyError:
        # User-defined corpora (from distributed_corpora.yaml) have no
        # 'location' key; they carry 'name', 'origin' and 'type' instead
        # (see _check_distributed_corpora_file).
        # git_uri = corpus_properties['git_remote']
        # BUG FIX: was corpus_properties[''] — an empty-string key that
        # always raised KeyError and broke every user-defined corpus.
        git_name = corpus_properties['name']
        git_uri = corpus_properties['origin']
        git_type = corpus_properties['type']
        # pass this off to a special downloader just for custom urls
        self._git_user_defined_corpus(git_name, git_type, git_uri)
        return
    corpus_type = corpus_properties['type']
    if location == 'remote':
        # git_uri = urljoin('https://github.com/cltk/', corpus_name + '.git')
        git_uri = corpus_properties['origin']
        type_dir_rel = os.path.join(CLTK_DATA_DIR, self.language, corpus_type)
        type_dir = os.path.expanduser(type_dir_rel)
        target_dir = os.path.join(type_dir, corpus_name)
        # README.md is used as the marker that the corpus is already cloned
        target_file = os.path.join(type_dir, corpus_name, 'README.md')
        # check if corpus already present; if not, clone
        if not os.path.isfile(target_file):
            if not os.path.isdir(type_dir):
                os.makedirs(type_dir)
            try:
                msg = "Cloning '{}' from '{}'".format(corpus_name, git_uri)
                logger.info(msg)
                Repo.clone_from(git_uri, target_dir, branch=branch, depth=1,
                                progress=ProgressPrinter())
            except CorpusImportError as corpus_imp_err:
                msg = "Git clone of '{}' failed: '{}'".format(git_uri, corpus_imp_err)
                logger.error(msg)
        # if corpus is present, pull latest
        else:
            try:
                repo = Repo(target_dir)
                assert not repo.bare  # or: assert repo.exists()
                git_origin = repo.remotes.origin
                msg = "Pulling latest '{}' from '{}'.".format(corpus_name, git_uri)
                logger.info(msg)
                git_origin.pull()
            except CorpusImportError as corpus_imp_err:
                msg = "Git pull of '{}' failed: '{}'".format(git_uri, corpus_imp_err)
                logger.error(msg)
    elif location == 'local':
        msg = "Importing from local path: '{}'".format(local_path)
        logger.info(msg)
        # The TLG/PHI corpora are proprietary and must be supplied locally.
        if corpus_name in ('phi5', 'phi7', 'tlg'):
            if corpus_name == 'phi5':
                # normalize path for checking dir
                if local_path.endswith('/'):
                    local_path = local_path[:-1]
                # check for right corpus dir
                if os.path.split(local_path)[1] != 'PHI5':
                    logger.info("Directory must be named 'PHI5'.")
            if corpus_name == 'phi7':
                # normalize local_path for checking dir
                if local_path.endswith('/'):
                    local_path = local_path[:-1]
                # check for right corpus dir
                if os.path.split(local_path)[1] != 'PHI7':
                    logger.info("Directory must be named 'PHI7'.")
            if corpus_name == 'tlg':
                # normalize path for checking dir
                if local_path.endswith('/'):
                    local_path = local_path[:-1]
                # check for right corpus dir
                if os.path.split(local_path)[1] != 'TLG_E':
                    logger.info("Directory must be named 'TLG_E'.")
            # TODO: move the dir-checking commands into a function
            data_dir = os.path.expanduser(CLTK_DATA_DIR)
            originals_dir = os.path.join(data_dir, 'originals')
            # check for `originals` dir; if not present mkdir
            if not os.path.isdir(originals_dir):
                os.makedirs(originals_dir)
                msg = "Wrote directory at '{}'.".format(originals_dir)
                logger.info(msg)
            tlg_originals_dir = os.path.join(data_dir, 'originals', corpus_name)
            # check for `originals/<corpus_name>`; if present, delete it
            # so the copy below starts from a clean target
            if os.path.isdir(tlg_originals_dir):
                shutil.rmtree(tlg_originals_dir)
                msg = "Removed directory at '{}'.".format(tlg_originals_dir)
                logger.info(msg)
            # _copy_dir_recursive requires that the target not exist
            if not os.path.isdir(tlg_originals_dir):
                self._copy_dir_recursive(local_path, tlg_originals_dir)
line_tokenize) from nltk.tokenize.stanford import StanfordTokenizer from nltk.tokenize.texttiling import TextTilingTokenizer #from nltk.tokenize.toktok import ToktokTokenizer from nltk.tokenize.treebank import TreebankWordTokenizer from nltk.tokenize.util import string_span_tokenize, regexp_span_tokenize from nltk.tokenize.stanford_segmenter import StanfordSegmenter from cltk.utils.cltk_logger import logger do_arabic = False try: import pyarabic.araby as araby do_arabic = True except ImportError: logger.info( 'Arabic not supported. Install `pyarabic` library to tokenize Arabic.') pass __author__ = [ 'Patrick J. Burns <*****@*****.**>', 'Kyle P. Johnson <*****@*****.**>', 'Natasha Voake <*****@*****.**>' ] __license__ = 'MIT License. See LICENSE.' class WordTokenizer: # pylint: disable=too-few-public-methods """Tokenize according to rules specific to a given language.""" def __init__(self, language): """Take language as argument to the class. Check availability and setup class variables."""
def convert(self, input_path=None, output_path=None, markup=None,
            break_lines=False, divide_works=False, latin=False,
            extra_args=None):
    """Run the ``tlgu`` command-line converter on a TLG/PHI file.

    :param input_path: TLG filepath to convert.
    :param output_path: filepath of new converted text.
    :param markup: Specificity of inline markup. Default None removes all
        numerical markup; 'full' gives most detailed, with reference numbers
        included before each text line.
    :param break_lines: No spaces; removes line ends and hyphens before an
        ID code; hyphens and spaces before page and column ends are retained.
    :param divide_works: Each work (book) is output as a separate file in
        the form output_file-xxx.txt; if an output file is not specified,
        this option has no effect.
    :param latin: Primarily Latin text (PHI). Some TLG texts, notably
        doccan1.txt and doccan2.txt are mostly roman texts lacking explicit
        language change codes. Setting this option will force a change to
        Latin text after each citation block is encountered.
    :param extra_args: Any other tlgu args to be passed, in list form and
        without dashes, e.g.: ['p', 'b', 'B'].
    :raises AssertionError: if ``input_path`` is not an existing file.
    """
    # setup file paths
    input_path = os.path.expanduser(input_path)
    output_path = os.path.expanduser(output_path)
    # check input path exists
    assert os.path.isfile(input_path), 'File {0} does not exist.'.format(input_path)
    # setup tlgu flags
    tlgu_options = []
    if markup == 'full':
        # was: [tlgu_options.append(x) for x in full_args]  # side-effect
        # comprehension (pylint W0106) — extend() is the idiomatic form
        tlgu_options.extend(['v', 'w', 'x', 'y', 'z'])
    if break_lines:
        tlgu_options.append('N')
    if divide_works:
        tlgu_options.append('W')
    if latin:
        tlgu_options.append('r')
    # setup extra args
    if extra_args is None:
        extra_args = []
    else:
        try:
            extra_args = list(extra_args)
        except Exception as exc:
            logger.error("Argument 'extra_args' must be a list: %s.", exc)
            raise
    tlgu_options += extra_args
    # de-duplicate; sorted() (rather than list(set(...))) makes the
    # generated command deterministic across runs
    tlgu_options = sorted(set(tlgu_options))
    if tlgu_options:
        tlgu_flags = '-' + ' -'.join(tlgu_options)
    else:
        tlgu_flags = ''
    # make tlgu call
    # NOTE(review): shell=True with interpolated paths — paths are
    # user-local, but paths containing spaces/shell metacharacters break.
    tlgu_call = 'tlgu {0} {1} {2}'.format(tlgu_flags, input_path, output_path)
    logger.info(tlgu_call)
    try:
        p_out = subprocess.call(tlgu_call, shell=True)
        if p_out == 1:
            logger.error('Failed to convert %s to %s.', input_path, output_path)
    except Exception as exc:
        logger.error('Failed to convert %s to %s: %s', input_path, output_path, exc)
        raise
def import_corpus(self, corpus_name, local_path=None):  # pylint: disable=R0912
    """Download a remote or load local corpus into dir ``~/cltk_data``.

    TODO: maybe add ``from git import RemoteProgress``
    TODO: refactor this, it's getting kinda long

    :type corpus_name: str
    :param corpus_name: The name of an available corpus.
    :param local_path: str
    :param local_path: A filepath, required when importing local corpora.
    """
    corpus_properties = self._check_corpus_availability(corpus_name)
    location = corpus_properties['location']
    corpus_type = corpus_properties['type']
    if location == 'remote':
        # all remote corpora are cloned from the cltk GitHub organization
        git_uri = urljoin('https://github.com/cltk/', corpus_name + '.git')
        # self._download_corpus(corpus_type, corpus_name, path)
        type_dir_rel = os.path.join(CLTK_DATA_DIR, self.language, corpus_type)
        type_dir = os.path.expanduser(type_dir_rel)
        target_dir = os.path.join(type_dir, corpus_name)
        # README.md is used as the marker that the corpus is already cloned
        target_file = os.path.join(type_dir, corpus_name, 'README.md')
        # check if corpus already present; if not, clone
        if not os.path.isfile(target_file):
            if not os.path.isdir(type_dir):
                os.makedirs(type_dir)
            try:
                logger.info("Cloning '%s' from '%s'" % (corpus_name, git_uri))
                Repo.clone_from(git_uri, target_dir, depth=1)
            except Exception as e:
                # BUG FIX: args were passed as a single tuple —
                # logger.error(msg, (git_uri, e)) — which raises
                # "not enough arguments for format string" at log time;
                # lazy %-args must be passed individually.
                logger.error("Git clone of '%s' failed: '%s'", git_uri, e)
        # if corpus is present, pull latest
        else:
            try:
                repo = Repo(target_dir)
                assert not repo.bare  # or: assert repo.exists()
                git_origin = repo.remotes.origin
                logger.info("Pulling latest '%s' from '%s'." % (corpus_name, git_uri))
                git_origin.pull()
            except Exception as e:
                logger.error("Git pull of '%s' failed: '%s'" % (git_uri, e))
    elif location == 'local':
        logger.info("Importing from local path: '%s'", local_path)
        # The TLG/PHI corpora are proprietary and must be supplied locally.
        if corpus_name in ('phi5', 'phi7', 'tlg'):
            if corpus_name == 'phi5':
                # normalize path for checking dir
                if local_path.endswith('/'):
                    local_path = local_path[:-1]
                # check for right corpus dir
                if os.path.split(local_path)[1] != 'PHI5':
                    logger.info("Directory must be named 'PHI5'.")
            if corpus_name == 'phi7':
                # normalize local_path for checking dir
                if local_path.endswith('/'):
                    local_path = local_path[:-1]
                # check for right corpus dir
                if os.path.split(local_path)[1] != 'PHI7':
                    logger.info("Directory must be named 'PHI7'.")
            if corpus_name == 'tlg':
                # normalize path for checking dir
                if local_path.endswith('/'):
                    local_path = local_path[:-1]
                # check for right corpus dir
                if os.path.split(local_path)[1] != 'TLG_E':
                    logger.info("Directory must be named 'TLG_E'.")
            # TODO: move the dir-checking commands into a function
            data_dir = os.path.expanduser(CLTK_DATA_DIR)
            originals_dir = os.path.join(data_dir, 'originals')
            # check for `originals` dir; if not present mkdir
            if not os.path.isdir(originals_dir):
                os.makedirs(originals_dir)
                logger.info("Wrote directory at '%s'.", originals_dir)
            tlg_originals_dir = os.path.join(data_dir, 'originals', corpus_name)
            # check for `originals/<corpus_name>`; if present, delete it
            # so the copy below starts from a clean target
            if os.path.isdir(tlg_originals_dir):
                shutil.rmtree(tlg_originals_dir)
                logger.info("Removed directory at '%s'.", tlg_originals_dir)
            # _copy_dir_recursive requires that the target not exist
            if not os.path.isdir(tlg_originals_dir):
                self._copy_dir_recursive(local_path, tlg_originals_dir)
def import_corpus(self, corpus_name, local_path=None, branch='master'):  # pylint: disable=R0912
    """Download a remote or load local corpus into dir ``~/cltk_data``.

    Remote corpora are cloned (or pulled, if already present) from the
    cltk GitHub organization; the TLG/PHI corpora are copied from a
    user-supplied ``local_path`` into ``~/cltk_data/originals/``.

    TODO: maybe add ``from git import RemoteProgress``
    TODO: refactor this, it's getting kinda long

    :type corpus_name: str
    :param corpus_name: The name of an available corpus.
    :param local_path: str
    :param local_path: A filepath, required when importing local corpora.
    :param branch: What Git branch to clone.
    """
    corpus_properties = self._get_corpus_properties(corpus_name)
    location = corpus_properties['location']
    corpus_type = corpus_properties['type']
    if location == 'remote':
        # remote corpora are assumed to live under the cltk GitHub org
        git_uri = urljoin('https://github.com/cltk/', corpus_name + '.git')
        # self._download_corpus(corpus_type, corpus_name, path)
        type_dir_rel = os.path.join(CLTK_DATA_DIR, self.language, corpus_type)
        type_dir = os.path.expanduser(type_dir_rel)
        target_dir = os.path.join(type_dir, corpus_name)
        # README.md is the marker that a previous clone completed
        target_file = os.path.join(type_dir, corpus_name, 'README.md')
        # check if corpus already present
        # if not, clone (shallow, depth=1)
        if not os.path.isfile(target_file):
            if not os.path.isdir(type_dir):
                os.makedirs(type_dir)
            try:
                msg = "Cloning '{}' from '{}'".format(corpus_name, git_uri)
                logger.info(msg)
                Repo.clone_from(git_uri, target_dir, branch=branch, depth=1,
                                progress=ProgressPrinter())
            # NOTE(review): GitPython raises GitCommandError on clone
            # failure, not CorpusImportError — this handler may never
            # fire; confirm intended exception type.
            except CorpusImportError as corpus_imp_err:
                msg = "Git clone of '{}' failed: '{}'".format(git_uri, corpus_imp_err)
                logger.error(msg)
        # if corpus is present, pull latest
        else:
            try:
                repo = Repo(target_dir)
                assert not repo.bare  # or: assert repo.exists()
                git_origin = repo.remotes.origin
                msg = "Pulling latest '{}' from '{}'.".format(corpus_name, git_uri)
                logger.info(msg)
                git_origin.pull()
            # NOTE(review): same concern as above — GitPython errors are
            # unlikely to be CorpusImportError.
            except CorpusImportError as corpus_imp_err:
                msg = "Git pull of '{}' failed: '{}'".format(git_uri, corpus_imp_err)
                logger.error(msg)
    elif location == 'local':
        msg = "Importing from local path: '{}'".format(local_path)
        logger.info(msg)
        # proprietary corpora must be supplied by the user as a local dir;
        # the expected directory name is only logged, not enforced
        if corpus_name in ('phi5', 'phi7', 'tlg'):
            if corpus_name == 'phi5':
                # normalize path for checking dir
                if local_path.endswith('/'):
                    local_path = local_path[:-1]
                # check for right corpus dir
                if os.path.split(local_path)[1] != 'PHI5':
                    logger.info("Directory must be named 'PHI5'.")
            if corpus_name == 'phi7':
                # normalize local_path for checking dir
                if local_path.endswith('/'):
                    local_path = local_path[:-1]
                # check for right corpus dir
                if os.path.split(local_path)[1] != 'PHI7':
                    logger.info("Directory must be named 'PHI7'.")
            if corpus_name == 'tlg':
                # normalize path for checking dir
                if local_path.endswith('/'):
                    local_path = local_path[:-1]
                # check for right corpus dir
                if os.path.split(local_path)[1] != 'TLG_E':
                    logger.info("Directory must be named 'TLG_E'.")
            # move the dir-checking commands into a function
            data_dir = os.path.expanduser(CLTK_DATA_DIR)
            originals_dir = os.path.join(data_dir, 'originals')
            # check for `originals` dir; if not present mkdir
            if not os.path.isdir(originals_dir):
                os.makedirs(originals_dir)
                msg = "Wrote directory at '{}'.".format(originals_dir)
                logger.info(msg)
            tlg_originals_dir = os.path.join(data_dir, 'originals', corpus_name)
            # check for `originals/<corpus_name>`; if pres, delete
            # so the copy below starts from a clean target
            if os.path.isdir(tlg_originals_dir):
                shutil.rmtree(tlg_originals_dir)
                msg = "Removed directory at '{}'.".format(tlg_originals_dir)
                logger.info(msg)
            # copy_dir requires that target not yet exist
            if not os.path.isdir(tlg_originals_dir):
                self._copy_dir_recursive(local_path, tlg_originals_dir)
from cltk.tokenize.word import WordTokenizer from cltk.stop.arabic.stops import STOPS_LIST as ARABIC_STOPS from cltk.utils.cltk_logger import logger try: import pyarabic.araby as araby except ImportError: logger.info( 'Arabic not supported. Install `pyarabic` library to strip diacritics.' ) pass def stopwords_filter(string): text = string # strip tashkeel because the stop words list contains voweled words text = araby.strip_tashkeel(text) word_tokenizer = WordTokenizer("arabic") tokens = word_tokenizer.tokenize(text) # filter stop words no_stops = [w for w in tokens if w not in ARABIC_STOPS] return no_stops
def convert(self, input_path=None, output_path=None, markup=None,
            rm_newlines=False, divide_works=False, latin=False,
            extra_args=None):
    """Run the ``tlgu`` command-line converter on a TLG/PHI file.

    :param input_path: TLG filepath to convert.
    :param output_path: filepath of new converted text.
    :param markup: Specificity of inline markup. Default None removes all
        numerical markup; 'full' gives most detailed, with reference numbers
        included before each text line.
    :param rm_newlines: No spaces; removes line ends and hyphens before an
        ID code; hyphens and spaces before page and column ends are retained.
    :param divide_works: Each work (book) is output as a separate file in
        the form output_file-xxx.txt; if an output file is not specified,
        this option has no effect.
    :param latin: Primarily Latin text (PHI). Some TLG texts, notably
        doccan1.txt and doccan2.txt are mostly roman texts lacking explicit
        language change codes. Setting this option will force a change to
        Latin text after each citation block is encountered.
    :param extra_args: Any other tlgu args to be passed, in list form and
        without dashes, e.g.: ['p', 'b', 'B'].
    :raises AssertionError: if ``input_path`` is not an existing file.
    """
    # setup file paths
    input_path = os.path.expanduser(input_path)
    output_path = os.path.expanduser(output_path)
    # check input path exists
    assert os.path.isfile(input_path), 'File {0} does not exist.'.format(
        input_path)
    # setup tlgu flags
    tlgu_options = []
    if markup == 'full':
        # was: [tlgu_options.append(x) for x in full_args]  # side-effect
        # comprehension (pylint W0106) — extend() is the idiomatic form
        tlgu_options.extend(['v', 'w', 'x', 'y', 'z'])
    if rm_newlines:
        tlgu_options.append('N')
    if divide_works:
        tlgu_options.append('W')
    if latin:
        tlgu_options.append('r')
    # setup extra args
    if extra_args is None:
        extra_args = []
    else:
        try:
            extra_args = list(extra_args)
        except Exception as exc:
            logger.error("Argument 'extra_args' must be a list: %s.", exc)
            raise
    tlgu_options += extra_args
    # de-duplicate; sorted() (rather than list(set(...))) makes the
    # generated command deterministic across runs
    tlgu_options = sorted(set(tlgu_options))
    if tlgu_options:
        tlgu_flags = '-' + ' -'.join(tlgu_options)
    else:
        tlgu_flags = ''
    # make tlgu call
    # NOTE(review): shell=True with interpolated paths — paths are
    # user-local, but paths containing spaces/shell metacharacters break.
    tlgu_call = 'tlgu {0} {1} {2}'.format(tlgu_flags, input_path, output_path)
    logger.info(tlgu_call)
    try:
        p_out = subprocess.call(tlgu_call, shell=True)
        if p_out == 1:
            logger.error('Failed to convert %s to %s.', input_path, output_path)
    except Exception as exc:
        logger.error('Failed to convert %s to %s: %s', input_path, output_path, exc)
        raise
def index_corpus(self):
    """Make a Whoosh index out of a pre-processed corpus, ie TLG, PHI5, or PHI7.

    TLG takes almost 13 min; PHI5 1.5 min.
    To setup index parameters
    >>> # cltk_index = CLTKIndex('latin', 'phi5')  # 1.5 min, 363 docs
    >>> # cltk_index = CLTKIndex('latin', 'phi5', chunk='work')  # 2 min, 837 docs
    >>> # cltk_index = CLTKIndex('greek', 'tlg')  # 13 min, 1823 docs
    >>> # cltk_index = CLTKIndex('greek', 'tlg', chunk='work')  #15.5 min, 6625 docs

    # And to start indexing:
    >>> # cltk_index.index_corpus()

    TODO: Prevent overwriting. Ask user to rm old dir before re-indexing.
    TODO: Add option for lemmatizing.
    TODO: Add for figure out lower() options.
    TODO: Process TLG through forthcoming normalize().
    TODO: Add name to each index.
    TODO: Turn off any language-specific mods (eg, stemming, case) that Whoosh
    might be doing by default.
    """
    # Setup index dir: each document stores its file path and author, and
    # indexes (without storing) the full text content.
    schema = Schema(path=ID(stored=True),
                    author=TEXT(stored=True),
                    content=TEXT)
    try:
        _index = create_in(self.index_path, schema)
    except FileNotFoundError:
        # index dir does not exist yet — create it, then retry
        os.makedirs(self.index_path)
        _index = create_in(self.index_path, schema)
    writer = _index.writer()

    # Setup corpus to be indexed: per-author plaintext by default,
    # per-work files when self.chunk == 'work'.
    # NOTE(review): corpus_path (and corpus_index below) are only assigned
    # for the greek/tlg and latin/phi5 combinations; any other
    # lang/corpus pair raises UnboundLocalError here — confirm callers
    # can never pass another combination.
    if self.lang == 'greek' and self.corpus == 'tlg':
        corpus_path = os.path.expanduser('~/cltk_data/greek/text/tlg/plaintext/')
        if self.chunk == 'work':
            corpus_path = os.path.expanduser('~/cltk_data/greek/text/tlg/individual_works/')
    elif self.lang == 'latin' and self.corpus == 'phi5':
        corpus_path = os.path.expanduser('~/cltk_data/latin/text/phi5/plaintext/')
        if self.chunk == 'work':
            corpus_path = os.path.expanduser('~/cltk_data/latin/text/phi5/individual_works/')
    assert os.path.isdir(corpus_path), 'Corpus does not exist in the following location: "%s". Use CLTK Corpus Importer and TLGU to create transformed corpus.' % corpus_path  # pylint: disable=line-too-long

    files = os.listdir(corpus_path)
    # strip the '.TXT' extension; keep only files of the expected corpus
    if self.lang == 'greek' and self.corpus == 'tlg':
        files = [f[:-4] for f in files if f.startswith('TLG')]
        corpus_index = TLG_AUTHOR_MAP
    elif self.lang == 'latin' and self.corpus == 'phi5':
        files = [f[:-4] for f in files if f.startswith('LAT')]
        corpus_index = PHI5_AUTHOR_MAP

    time_0 = time.time()
    logger.info("Commencing indexing of %s documents of '%s' corpus." % (len(files), self.corpus))  # pylint: disable=line-too-long
    logger.info('Index will be written to: "%s".' % self.index_path)
    if self.chunk == 'author':
        # one document per author file
        for count, file in enumerate(files, 1):
            try:
                if self.lang == 'greek' and self.corpus == 'tlg':
                    # drop the 'TLG' prefix to get the author-map key
                    file = file[3:]
                    author = corpus_index[file]
                    path = os.path.join(corpus_path, 'TLG' + file + '.TXT')
                if self.lang == 'latin' and self.corpus == 'phi5':
                    author = corpus_index[file]
                    path = os.path.join(corpus_path, file + '.TXT')
            except KeyError as key_error:
                # LAT9999 is a known unmapped PHI5 entry — skip silently.
                # NOTE(review): in the greek branch `file` has already been
                # stripped of its prefix, so this guard can only match
                # latin files — confirm that is intended.
                if file.startswith('LAT9999'):
                    continue
                logger.error(key_error)
                raise
            with open(path) as file_open:
                content = file_open.read()
            writer.add_document(path=path, author=author, content=content)
            # progress heartbeat every 100 documents
            if count % 100 == 0:
                logger.info('Indexed doc %s.' % count)
    if self.chunk == 'work':
        # one document per individual work file; the author-map key is the
        # filename minus corpus prefix and the 8-char work suffix
        for count, file in enumerate(files, 1):
            try:
                if self.lang == 'greek' and self.corpus == 'tlg':
                    path = os.path.join(corpus_path, file + '.TXT')
                    author = corpus_index[file[3:-8]]
                if self.lang == 'latin' and self.corpus == 'phi5':
                    path = os.path.join(corpus_path, file + '.TXT')
                    author = corpus_index[file[:-8]]
            except KeyError as key_error:
                # LAT9999 is a known unmapped PHI5 entry — skip silently
                if file.startswith('LAT9999'):
                    continue
                logger.error(key_error)
                raise
            with open(path) as file_open:
                content = file_open.read()
            writer.add_document(path=path, author=author, content=content)
            # progress heartbeat every 100 documents
            if count % 100 == 0:
                logger.info('Indexed doc %s.' % count)
    logger.info('Commencing to commit changes.')
    writer.commit()

    time_1 = time.time()
    elapsed = time_1 - time_0
    logger.info('Finished indexing all documents in %s seconds (averaging %s docs per sec.)' % (elapsed, (len(files) / elapsed)))  # pylint: disable=line-too-long
def make_syllables(self, sentences_words):
    """Divide the word tokens into a list of syllables.

    Note that a syllable in this instance is defined as a vocalic group
    (i.e., a vowel or a diphthong). This means that all syllables which are
    not the last syllable in the word will end with a vowel or diphthong;
    any consonants trailing the word's last vowel are folded into its final
    syllable.

    TODO: Determine whether Luke Hollis's module at
    `cltk.stem.latin.syllabifier` could replace this method.

    :param sentences_words: A list of sentences with tokenized words.
    :return: Syllabified words
    :rtype : list
    """
    all_syllables = []
    for sentence in sentences_words:
        syll_per_sent = []
        for word in sentence:
            syll_start = 0  # start index of the syllable being built
            syll_per_word = []
            cur_letter_in = 0  # index of the letter under inspection
            while cur_letter_in < len(word):
                letter = word[cur_letter_in]
                # The word's final letter never closes a syllable here;
                # it is swept up by the leftover-consonant pass below.
                if not cur_letter_in == len(word) - 1:
                    if word[cur_letter_in] + word[cur_letter_in + 1] in self.diphthongs:
                        cur_letter_in += 1
                        # Syllable ends with a diphthong
                        syll_per_word.append(word[syll_start:cur_letter_in + 1])
                        syll_start = cur_letter_in + 1
                    elif (letter in self.vowels) or \
                            (letter in self.long_vowels):
                        # Syllable ends with a vowel.
                        # BUG FIX: a second, byte-identical `elif` with this
                        # same condition followed here — unreachable dead
                        # code — and has been removed.
                        syll_per_word.append(word[syll_start:cur_letter_in + 1])
                        syll_start = cur_letter_in + 1
                cur_letter_in += 1
            try:
                last_vowel = syll_per_word[-1][-1]  # last vowel of the word
                # Walk backwards from the end of the word, collecting the
                # consonants that follow the last recorded vowel.
                cur_letter_in = len(word) - 1
                leftovers = u''
                while word[cur_letter_in] != last_vowel:
                    if word[cur_letter_in] != u'.':
                        # Adds consonants to leftovers
                        leftovers = word[cur_letter_in] + leftovers
                    cur_letter_in -= 1
                # Attach the trailing consonants to the word's last syllable
                syll_per_word[-1] += leftovers
                syll_per_sent.append(syll_per_word)
            except IndexError:
                # syll_per_word is empty: the word had no vowel at all;
                # the word is skipped (not appended to syll_per_sent)
                logger.info("IndexError while making syllables of '%s'. Continuing.", word)
        all_syllables.append(syll_per_sent)
    return all_syllables