def find_alliteration(self):
    """
    Find alliterations in the complete verse.

    Delegates detection to each line's ``find_alliterations`` method.

    :return: tuple of (list of alliterations per long line,
        list of alliteration counts per long line)
    :raises ValueError: if ``to_phonetics`` has not yet produced a
        phonological transcription.
    """
    if len(self.phonological_features_text) == 0:
        logger.error("No phonological transcription found")
        raise ValueError
    # NOTE: the original built a nested ``first_sounds`` table here (the
    # first phoneme of every word) that was never read afterwards — dead
    # code removed; alliteration detection is delegated entirely to the
    # ShortLine/LongLine instances.
    verse_alliterations = []
    n_alliterations_lines = []
    for i in range(len(self.phonological_features_text)):
        long_line = self.long_lines[i]
        if isinstance(long_line[0], ShortLine) and isinstance(
                long_line[1], ShortLine):
            alli, counter = long_line[0].find_alliterations(long_line[1])
            verse_alliterations.append(alli)
            n_alliterations_lines.append(counter)
        elif isinstance(long_line[0], LongLine):
            alli, counter = long_line[0].find_alliterations()
            verse_alliterations.append(alli)
            n_alliterations_lines.append(counter)
    return verse_alliterations, n_alliterations_lines
def open_pickle(path: str) -> Any:
    """Open a pickle file and return the loaded object.

    :param path: File path to pickle file to be opened.
    :return: the unpickled object.
    :raises FileNotFoundError, IOError, EOFError, pickle.UnpicklingError:
        re-raised after logging.

    .. warning:: ``pickle.load`` can execute arbitrary code; only open
       trusted pickle files.
    """
    # The original nested try/except logged most failures twice: an inner
    # ``except Exception`` logged and re-raised, then the outer specific
    # handlers logged the same error again. A single layer logs each
    # failure exactly once and still re-raises for the caller.
    try:
        with open(path, "rb") as opened_pickle:
            return pickle.load(opened_pickle)
    except (FileNotFoundError, IOError, EOFError,
            pickle.UnpicklingError) as known_error:
        logger.error(known_error)
        raise
    except Exception as unknown_error:
        # anything else pickle.load may raise (e.g. AttributeError for a
        # missing class) — still log before propagating
        logger.error(unknown_error)
        raise
def to_phonetics(self):
    """
    Transcribe each short/long line of the verse to IPA; the resulting
    transcription is what alliteration detection works on.

    Sets ``self.transcribed_text`` and ``self.phonological_features_text``.
    """
    if len(self.long_lines) == 0:
        logger.error("No text has been imported")
        # BUG FIX: the error path used to set only ``self.syllabified_text``,
        # leaving the two attributes this method is supposed to define unset
        # (``find_alliteration`` then failed with AttributeError).
        self.transcribed_text = []
        self.phonological_features_text = []
        self.syllabified_text = []  # kept for backward compatibility
    else:
        transcriber = Transcriber(
            old_norse_transcription.DIPHTHONGS_IPA,
            old_norse_transcription.DIPHTHONGS_IPA_class,
            old_norse_transcription.IPA_class,
            old_norse_transcription.old_norse_rules,
        )
        transcribed_text = []
        phonological_features_text = []
        for i, long_line in enumerate(self.long_lines):
            transcribed_text.append([])
            phonological_features_text.append([])
            for short_line in long_line:
                assert isinstance(short_line, (ShortLine, LongLine))
                short_line.to_phonetics(transcriber)
                transcribed_text[i].append(short_line.transcribed)
                phonological_features_text[i].append(
                    short_line.phonological_features_text)
        self.transcribed_text = transcribed_text
        self.phonological_features_text = phonological_features_text
def onekgreek_tei_xml_to_text():
    """Find TEI XML dir of TEI XML for the First 1k Years of Greek corpus
    and write a plaintext version of each file with BeautifulSoup/lxml.

    :raises ImportError: if ``bs4``/``lxml`` are not installed.
    :raises FileNotFoundError: if the corpus has not been downloaded.
    """
    if not bs4_installed:
        logger.error("Install `bs4` and `lxml` to parse these TEI files.")
        raise ImportError
    xml_dir = make_cltk_path("grc/text/grc_text_first1kgreek/data/*/*/*.xml")
    xml_paths = glob.glob(xml_dir)
    if not xml_paths:
        logger.error(
            "1K Greek corpus not installed. Use ``FetchCorpus`` to get `First1KGreek`."
        )
        raise FileNotFoundError
    # drop the CTS metadata files
    xml_paths = [path for path in xml_paths if "__cts__" not in path]
    # make the output dir
    new_dir = make_cltk_path("grc/text/grc_text_first1kgreek_plaintext/")
    if not os.path.isdir(new_dir):
        os.makedirs(new_dir)
    for xml_path in xml_paths:
        _, xml_name = os.path.split(xml_path)
        # BUG FIX: ``rstrip(".xml")`` strips any trailing '.', 'x', 'm', 'l'
        # characters (e.g. 'html.xml' -> 'ht'); cut the exact suffix instead.
        if xml_name.endswith(".xml"):
            xml_name = xml_name[: -len(".xml")]
        xml_name += ".txt"
        with open(xml_path) as file_open:
            soup = BeautifulSoup(file_open, "lxml")
        body = soup.body
        text = body.get_text()
        new_plaintext_path = os.path.join(new_dir, xml_name)
        with open(new_plaintext_path, "w") as file_open:
            file_open.write(text)
def __init__(self, height=None, backness=None, rounded=None, length=None, ipar=None):
    """Initialize a vowel from its phonological features.

    :param height: Height instance or None.
    :param backness: Backness instance or None.
    :param rounded: bool or None.
    :param length: Length instance or None.
    :param ipar: IPA representation of the sound.
    :raises ValueError: if height, backness or length has the wrong type.
    :raises TypeError: if rounded is not a bool (exception types kept as-is
        for backward compatibility with existing callers).
    """
    if isinstance(height, Height) or height is None:
        self.height = height
    else:
        logger.error("Incorrect argument")
        raise ValueError
    if isinstance(backness, Backness) or backness is None:
        self.backness = backness
    else:
        logger.error("Incorrect argument")
        raise ValueError
    # idiomatic type check: isinstance instead of ``type(...) == bool``
    if isinstance(rounded, bool) or rounded is None:
        self.rounded = rounded
    else:
        logger.error("Incorrect argument")
        raise TypeError
    if isinstance(length, Length) or length is None:
        self.length = length
    else:
        logger.error("Incorrect argument")
        raise ValueError
    self.ipar = ipar
def __init__(self, place=None, manner=None, voiced=None, ipar=None, geminate=None):
    """Initialize a consonant from its phonological features.

    :param place: Place instance or None.
    :param manner: Manner instance or None.
    :param voiced: bool or None.
    :param ipar: IPA representation of the sound.
    :param geminate: bool or None.
    :raises ValueError: if place or manner has the wrong type.
    :raises TypeError: if voiced or geminate is not a bool (exception types
        kept as-is for backward compatibility with existing callers).
    """
    if isinstance(place, Place) or place is None:
        self.place = place
    else:
        logger.error("Incorrect argument")
        # BUG FIX: this branch only logged, so an invalid ``place`` slipped
        # through with ``self.place`` never assigned; raise like the other
        # feature checks do.
        raise ValueError
    if isinstance(manner, Manner) or manner is None:
        self.manner = manner
    else:
        logger.error("Incorrect argument")
        raise ValueError
    # idiomatic type check: isinstance instead of ``type(...) == bool``
    if isinstance(voiced, bool) or voiced is None:
        self.voiced = voiced
    else:
        logger.error("Incorrect argument")
        raise TypeError
    if isinstance(geminate, bool) or geminate is None:
        self.geminate = geminate
    else:
        logger.error("Incorrect argument")
        raise TypeError
    self.ipar = ipar
def divide_works(self, corpus):
    """Use the work-breaking option.

    TODO: Maybe incorporate this into ``convert_corpus()``
    TODO: Write test for this
    """
    if corpus == "tlg":
        orig_dir = make_cltk_path("originals/tlg")
        works_dir = make_cltk_path("grc/text/tlg/individual_works")
        file_prefix = "TLG"
        lat = False
    elif corpus == "phi5":
        orig_dir = make_cltk_path("originals/phi5")
        works_dir = make_cltk_path("lat/text/phi5/individual_works")
        file_prefix = "LAT"
        # optional TLGU argument to convert()
        lat = True
    elif corpus == "phi7":
        raise CLTKException(
            "``phi7`` cannot be divided into individual works.")
    else:
        raise CLTKException(
            f"Invalid corpus '{corpus}'. This should never happen.")

    if not os.path.exists(works_dir):
        os.makedirs(works_dir)

    # convert every original text file matching the corpus prefix
    for filename in os.listdir(orig_dir):
        if not (filename.startswith(file_prefix)
                and filename.endswith(".TXT")):
            continue
        source_path = os.path.join(orig_dir, filename)
        destination_path = os.path.join(works_dir, filename)
        try:
            self.convert(source_path,
                         destination_path,
                         divide_works=True,
                         lat=lat)
            logger.info("Writing files at %s to %s.", source_path, works_dir)
        except Exception as err:
            logger.error("Failed to convert files: %s.", err)
def _git_user_defined_corpus(self, corpus_name, corpus_type, uri: str, branch="master"):
    """Clone or update a git repo defined by user.

    TODO: This code is very redundant with what's in import_corpus(),
    could be refactored.

    :param corpus_name: name used in log messages.
    :param corpus_type: subdirectory under the language dir (e.g. ``text``).
    :param uri: git URI to clone from.
    :param branch: git branch to clone.
    """
    type_dir_rel = os.path.join(CLTK_DATA_DIR, self.language, corpus_type)
    type_dir = os.path.expanduser(type_dir_rel)
    repo_name = uri.split("/")[-1]  # eg, 'latin_corpus_newton_example.git'
    # BUG FIX: ``rstrip(".git")`` strips any trailing '.', 'g', 'i', 't'
    # characters (e.g. 'corpus_tig.git' -> 'corpus_'); cut the exact suffix.
    if repo_name.endswith(".git"):
        repo_name = repo_name[: -len(".git")]
    target_dir = os.path.join(type_dir, repo_name)
    target_file = os.path.join(type_dir, repo_name, "README.md")
    # if the corpus is not already present, clone it
    if not os.path.isfile(target_file):
        if not os.path.isdir(type_dir):
            os.makedirs(type_dir)
        try:
            msg = "Cloning '{}' from '{}'".format(corpus_name, uri)
            logger.info(msg)
            Repo.clone_from(uri,
                            target_dir,
                            branch=branch,
                            depth=1,
                            progress=ProgressPrinter())
        except CorpusImportError as corpus_imp_err:
            # NOTE(review): ``Repo.clone_from`` raises GitPython exceptions,
            # not CorpusImportError — confirm this handler is reachable.
            msg = "Git clone of '{}' failed: '{}'".format(uri, corpus_imp_err)
            logger.error(msg)
    # if corpus is present, pull latest
    else:
        try:
            repo = Repo(target_dir)
            assert not repo.bare  # or: assert repo.exists()
            git_origin = repo.remotes.origin
            msg = "Pulling latest '{}' from '{}'.".format(corpus_name, uri)
            logger.info(msg)
            git_origin.pull()
        except CorpusImportError as corpus_imp_err:
            msg = "Git pull of '{}' failed: '{}'".format(uri, corpus_imp_err)
            logger.error(msg)
def syllabify(self, hierarchy: Dict[str, int]):
    """
    Syllabify every short/long line of the verse with an Old Norse
    syllabifier configured by the given sonority ``hierarchy``.

    Sets ``self.syllabified_text``.
    """
    if not self.long_lines:
        logger.error("No text was imported")
        self.syllabified_text = []
        return
    syllabifier = Syllabifier(language="non", break_geminants=True)
    syllabifier.set_hierarchy(hierarchy)
    result = []
    for long_line in self.long_lines:
        line_syllables = []
        for part in long_line:
            assert isinstance(part, (ShortLine, LongLine))
            part.syllabify(syllabifier)
            line_syllables.append(part.syllabified)
        result.append(line_syllables)
    self.syllabified_text = result
def from_regular_expression(re_rule, estimated_sound, ipa_class):
    """
    Build a Rule instance out of a regular-expression substitution rule.

    :param re_rule: pattern (first argument of re.sub)
    :param estimated_sound: an IPA character (second argument of re.sub)
    :param ipa_class: dict whose keys are IPA characters and values are
        Vowel or Consonant instances
    :return: corresponding Rule instance
    """
    assert len(re_rule) > 0
    # position of the rule within the word
    if re_rule.startswith("^"):
        rank = Rank.first
    elif re_rule.endswith("$"):
        rank = Rank.last
    else:
        rank = Rank.inner

    before_match = re.search(r"(?<=\(\?\<\=\[)\w*", re_rule)
    core_match = re.search(
        r"(?<=\))\w(?=\(\?\=)|(?<=\^)\w(?=\(\?\=)|(?<=\))\w(?=\$)", re_rule)
    after_match = re.search(r"(?<=\(\?\=\[)\w*", re_rule)

    # keep the original evaluation order: before, then core, then after
    if before_match is None:
        before = None
    else:
        before = [ipa_class[ipar].to_abstract()
                  for ipar in before_match.group(0)]
    if core_match is None:
        logger.error("No core")
        raise ValueError
    core = ipa_class[core_match.group(0)]
    if after_match is None:
        after = None
    else:
        after = [ipa_class[ipar].to_abstract()
                 for ipar in after_match.group(0)]

    return Rule(AbstractPosition(rank, before, after), core,
                ipa_class[estimated_sound])
def _get_corpus_properties(self, corpus_name: str):
    """Check whether a corpus is available for import.

    :param corpus_name: Name of available corpus.
    :return: the matching corpus's property dict.
    :raises CorpusImportError: if the corpora list is unavailable or the
        named corpus is not in it.
    """
    try:
        corpora = self.all_corpora
    except (NameError, AttributeError) as lookup_error:
        # BUG FIX: a missing ``all_corpora`` attribute raises AttributeError,
        # not NameError, so the original handler was dead code; catch both
        # so the CorpusImportError path actually runs.
        msg = 'Corpus not available for language "%s": %s' % (
            self.language,
            lookup_error,
        )
        logger.error(msg)
        raise CorpusImportError(msg)
    for corpus_properties in corpora:
        if corpus_properties["name"] == corpus_name:
            return corpus_properties
    msg = 'Corpus "%s" not available for the "%s" language.' % (
        corpus_name,
        self.language,
    )
    logger.error(msg)
    raise CorpusImportError(msg)
def onekgreek_tei_xml_to_text_capitains():
    """Use MyCapitains program to convert TEI to plaintext.

    :raises FileNotFoundError: if the corpus has not been downloaded.
    """
    # NOTE: removed an unused leftover ``file = make_cltk_path(...)`` local
    # that pointed at one hard-coded XML file and was never read.
    xml_dir = make_cltk_path("grc/text/grc_text_first1kgreek/data/*/*/*.xml")
    xml_paths = glob.glob(xml_dir)
    if not xml_paths:
        logger.error(
            "1K Greek corpus not installed. Use CorpusInstaller to get `First1KGreek`."
        )
        raise FileNotFoundError
    # drop the CTS metadata files
    xml_paths = [path for path in xml_paths if "__cts__" not in path]
    # make the output dir
    new_dir = make_cltk_path("grc/text/grc_text_first1kgreek_plaintext/")
    if not os.path.isdir(new_dir):
        os.makedirs(new_dir)
    for xml_path in xml_paths:
        _, xml_name = os.path.split(xml_path)
        # BUG FIX: ``rstrip(".xml")`` strips any trailing '.', 'x', 'm', 'l'
        # characters (e.g. 'html.xml' -> 'ht'); cut the exact suffix instead.
        if xml_name.endswith(".xml"):
            xml_name = xml_name[: -len(".xml")]
        xml_name += ".txt"
        plain_text = ""
        with open(xml_path) as file_open:
            text = CapitainsCtsText(resource=file_open)
            for ref in text.getReffs(level=len(text.citation)):
                psg = text.getTextualNode(subreference=ref, simple=True)
                text_line = psg.export(Mimetypes.PLAINTEXT,
                                       exclude=["tei:note"])
                plain_text += text_line
        new_plaintext_path = os.path.join(new_dir, xml_name)
        with open(new_plaintext_path, "w") as file_open:
            file_open.write(plain_text)
def convert(
    input_path=None,
    output_path=None,
    markup=None,
    rm_newlines=False,
    divide_works=False,
    lat=False,
    extra_args=None,
):
    """Do conversion of one TLG/PHI file with the ``tlgu`` command-line tool.

    :param input_path: TLG filepath to convert.
    :param output_path: filepath of new converted text.
    :param markup: Specificity of inline markup. Default None removes all
        numerical markup; 'full' gives most detailed, with reference numbers
        included before each text line.
    :param rm_newlines: No spaces; removes line ends and hyphens before an
        ID code; hyphens and spaces before page and column ends are retained.
    :param divide_works: Each work (book) is output as a separate file in
        the form output_file-xxx.txt; if an output file is not specified,
        this option has no effect.
    :param lat: Primarily Latin text (PHI). Some TLG texts, notably
        doccan1.txt and doccan2.txt are mostly roman texts lacking explicit
        language change codes. Setting this option will force a change to
        Latin text after each citation block is encountered.
    :param extra_args: Any other tlgu args to be passed, in list form and
        without dashes, e.g.: ['p', 'b', 'B'].
    """
    # setup file paths
    input_path = os.path.expanduser(input_path)
    output_path = os.path.expanduser(output_path)
    # check input path exists
    assert os.path.isfile(input_path), "File {0} does not exist.".format(
        input_path)
    # setup tlgu flags
    tlgu_options = []
    if markup == "full":
        # FIX: was a list comprehension used only for its append side effect
        tlgu_options.extend(["v", "w", "x", "y", "z"])
    if rm_newlines:
        tlgu_options.append("N")
    if divide_works:
        tlgu_options.append("W")
    if lat:
        tlgu_options.append("r")
    # setup extra args
    if extra_args is None:
        extra_args = []
    else:
        try:
            extra_args = list(extra_args)
        except Exception as exc:
            logger.error("Argument 'extra_args' must be a list: %s.", exc)
            raise
    tlgu_options = tlgu_options + extra_args
    # deduplicate all tlgu flags
    tlgu_options = list(set(tlgu_options))
    if tlgu_options:
        tlgu_flags = "-" + " -".join(tlgu_options)
    else:
        tlgu_flags = ""
    # make tlgu call
    # NOTE(review): shell=True with string-built arguments breaks on paths
    # containing spaces and is shell-injection-prone if paths are ever
    # user-supplied; consider subprocess.call([...], shell=False).
    tlgu_call = "tlgu {0} {1} {2}".format(tlgu_flags, input_path, output_path)
    logger.info(tlgu_call)
    try:
        p_out = subprocess.call(tlgu_call, shell=True)
        if p_out == 1:
            logger.error("Failed to convert %s to %s.", input_path,
                         output_path)
    except Exception as exc:
        logger.error("Failed to convert %s to %s: %s", input_path,
                     output_path, exc)
        raise
def convert_corpus(self, corpus, markup=None, lat=None):  # pylint: disable=W0613
    """Look for imported TLG or PHI files and convert them all to
    ``~/cltk_data/grc/text/tlg/<plaintext>``.

    TODO: Add markup options to input.
    TODO: Add rm_newlines, divide_works, and extra_args

    :param corpus: one of 'tlg', 'phi5', 'phi7'.
    :param markup: None for plaintext output dir; otherwise used as dir name.
    :param lat: force Latin handling (only meaningful for 'phi7').
    """
    orig_path = make_cltk_path("originals")
    target_path = make_cltk_path()
    assert corpus in [
        "tlg",
        "phi5",
        "phi7",
    ], "Corpus must be 'tlg', 'phi5', or 'phi7'"
    orig_path = os.path.join(orig_path, corpus)
    if corpus in ["tlg", "phi7"]:
        # BUG FIX: this used to read ``if "phi7" and lat is True`` — a
        # non-empty string literal is always truthy, so ``tlg`` with
        # ``lat=True`` was wrongly routed to the Latin target directory.
        if corpus == "phi7" and lat is True:
            lat = True
            target_path = os.path.join(target_path, "lat", "text", corpus)
        else:
            lat = None
            target_path = os.path.join(target_path, "grc", "text", corpus)
    else:
        target_path = os.path.join(target_path, "lat", "text", corpus)
        lat = True
    try:
        corpus_files = os.listdir(orig_path)
    except Exception as exception:
        logger.error("Failed to find TLG files: %s", exception)
        raise
    # make a list of files to be converted
    txts = [x for x in corpus_files if x.endswith("TXT")]
    # loop through list and convert one at a time
    for txt in txts:
        orig_txt_path = os.path.join(orig_path, txt)
        if markup is None:
            target_txt_dir = os.path.join(target_path, "plaintext")
        else:
            target_txt_dir = os.path.join(target_path, str(markup))
        if not os.path.isdir(target_txt_dir):
            os.makedirs(target_txt_dir)
        target_txt_path = os.path.join(target_txt_dir, txt)
        try:
            self.convert(
                orig_txt_path,
                target_txt_path,
                markup=False,
                rm_newlines=False,
                divide_works=False,
                lat=lat,
                extra_args=None,
            )
        except Exception as exception:
            logger.error(
                "Failed to convert file '%s' to '%s': %s",
                orig_txt_path,
                target_txt_path,
                exception,
            )
""" import re import unicodedata from nltk.tokenize import wordpunct_tokenize from cltk.core.cltk_logger import logger from cltk.prosody.lat import macronizer as m try: # James Tauber's greek_accentuation package from greek_accentuation import characters as chars except ImportError as import_error: message = ('Missing "greek_accentuation" package. Install with ' "`pip install greek-accentuation`.") logger.error(message) logger.error(import_error) raise __author__ = ["Jack Duff <*****@*****.**>"] __license__ = "MIT License. See LICENSE." # Dictionaries of phonological reconstructions for use in transcribing. # Allen, W. Sidney. 1965. Vox Latina. LATIN = { "Classical": { "Allen": { "correspondence": { "p": "p", "t": "t̪",
def _check_install(self):
    """Check whether the ``tlgu`` binary is on the PATH; if not, try to
    build and install it from the downloaded software corpus, asking the
    user first when ``self.interactive`` is set.

    :return: True when a non-sudo install succeeds (other successful
        paths fall through and return None).
    :raises CLTKException: if the user declines installation or the
        sudo install also fails.
    """
    try:
        # probe for the binary; ``which`` exits non-zero when absent
        subprocess.check_output(["which", "tlgu"])
    except subprocess.SubprocessError as sub_err:
        print("TLGU not installed.")
        logger.info("TLGU not installed: %s", sub_err)
        logger.info("Installing TLGU.")
        # a C compiler is required to build tlgu from source
        if not subprocess.check_output(["which", "gcc"]):
            logger.error("GCC seems not to be installed.")
        else:
            # assumes the tlgu software corpus has already been fetched here
            tlgu_path = make_cltk_path("grc/software/grc_software_tlgu")
            if self.interactive:
                install_question = "Do you want to install TLGU?"
                do_install = query_yes_no(question=install_question)
                if not do_install:
                    raise CLTKException(
                        "TLGU installation required for this class to work."
                    )
            else:
                print("Non-interactive installation. Continuing ...")
            # first attempt: plain ``make install`` (no sudo)
            command = "cd {0} && make install".format(tlgu_path)
            print(f"Going to run command: ``{command}``")
            try:
                p_out = subprocess.call(command, shell=True)
            except subprocess.SubprocessError as sub_err:
                print(
                    "Error executing installation. Going to check output of ``subprocess.call()`` ..."
                )
                raise CLTKException(sub_err)
            if p_out == 0:
                msg = "TLGU installed."
                print(msg)
                logger.info(msg)
                return True
            else:
                # non-zero exit: retry with sudo (usually needed on Linux)
                msg = "TLGU install without sudo failed. Going to try again with sudo (usually required for Linux) ..."
                print(msg)
                logger.error(msg)
                command = "cd {0} && sudo make install".format(tlgu_path)
                if self.interactive:
                    install_question = "Do you want to install TLGU? with sudo?"
                    do_install = query_yes_no(question=install_question)
                    if not do_install:
                        raise CLTKException(
                            "TLGU installation required for this class to work."
                        )
                    p_out = subprocess.call(command, shell=True)
                else:
                    print("Going to run command:", command)
                    p_out = subprocess.call(command, shell=True)
                if p_out == 0:
                    msg = "TLGU installed."
                    print(msg)
                    logger.info(msg)
                else:
                    msg = "TLGU install with sudo failed."
                    print(msg)
                    logger.error(msg)
                    raise CLTKException(
                        "TLGU installation required for this class to work."
                    )
def import_corpus(self, corpus_name: str, local_path: str = None, branch: str = "master"):
    """Download a remote or load local corpus into dir ``~/cltk_data``.

    TODO: maybe add ``from git import RemoteProgress``
    TODO: refactor this, it's getting kinda long

    :param corpus_name: The name of an available corpus.
    :param local_path: A filepath, required when importing local corpora.
    :param branch: What Git branch to clone.
    :raises CorpusImportError: unknown or ambiguous corpus name,
        unsupported local corpus, or malformed corpus record.
    """
    matching_corpus_list = [
        _dict for _dict in self.all_corpora_for_lang
        if _dict["name"] == corpus_name
    ]
    if not matching_corpus_list:
        raise CorpusImportError(
            f"No corpus ``{corpus_name}`` for language ``{self.language}``."
        )
    if len(matching_corpus_list) > 1:
        raise CorpusImportError(
            f"Found more than one corpus with the name ``{corpus_name}``.")
    matching_corpus = matching_corpus_list[0]
    if matching_corpus.get("user_defined"):
        # e.g.: {'origin': 'https://github.com/kylepjohnson/latin_corpus_newton_example.git',
        #        'type': 'text', 'name': 'example_distributed_latin_corpus',
        #        'user_defined': True}
        self._git_user_defined_corpus(
            matching_corpus["name"],
            matching_corpus["type"],
            matching_corpus["origin"],
        )
        return
    elif matching_corpus.get("location") == "local":
        # e.g.: {'location': 'local', 'name': 'phi5', 'origin': None, 'type': 'text'}
        msg = "Importing from local path: '{}'".format(local_path)
        logger.info(msg)
        if corpus_name not in ["phi5", "phi7", "tlg"]:
            raise CorpusImportError(
                f"Unsupported local corpus ``{corpus_name}``.")
        # normalize path for checking dir (drop a trailing slash)
        if local_path.endswith("/"):
            local_path = local_path[:-1]
        # check for right corpus dir name; warn (but continue) if wrong
        expected_dir = {"phi5": "PHI5", "phi7": "PHI7", "tlg": "TLG_E"}[corpus_name]
        if os.path.split(local_path)[1] != expected_dir:
            logger.info("Directory must be named '{}'.".format(expected_dir))
        # TODO: move the dir-checking commands into a function
        data_dir = os.path.expanduser(CLTK_DATA_DIR)
        originals_dir = os.path.join(data_dir, "originals")
        # check for `originals` dir; if not present mkdir
        if not os.path.isdir(originals_dir):
            os.makedirs(originals_dir)
            msg = "Wrote directory at '{}'.".format(originals_dir)
            logger.info(msg)
        tlg_originals_dir = os.path.join(data_dir, "originals", corpus_name)
        # check for `originals/<corpus_name>`; if present, delete so the
        # copy below starts fresh
        if os.path.isdir(tlg_originals_dir):
            shutil.rmtree(tlg_originals_dir)
            msg = "Removed directory at '{}'.".format(tlg_originals_dir)
            logger.info(msg)
        # _copy_dir_recursive requires that the target not exist
        if not os.path.isdir(tlg_originals_dir):
            self._copy_dir_recursive(local_path, tlg_originals_dir)
    else:
        # remote git corpus, e.g.:
        # {'type': 'text', 'name': 'lat_text_perseus',
        #  'origin': 'https://github.com/cltk/lat_text_perseus.git'}
        # BUG FIX: the malformed-record check used ``and`` (so it only
        # fired when every key was missing — impossible, since ``name``
        # matched above) and raised ``FetchCorpus``, which is not an
        # exception type; any missing key is malformed and raises
        # CorpusImportError like the rest of this method.
        if (not matching_corpus.get("type")
                or not matching_corpus.get("name")
                or not matching_corpus.get("origin")):
            raise CorpusImportError(
                f"Malformed record for ``{corpus_name}``.")
        git_uri = matching_corpus["origin"]
        type_dir_rel = os.path.join(CLTK_DATA_DIR, self.language,
                                    matching_corpus["type"])
        type_dir = os.path.expanduser(type_dir_rel)
        target_dir = os.path.join(type_dir, corpus_name)
        target_file = os.path.join(type_dir, corpus_name, "README.md")
        # check if corpus already present; if not, clone
        if not os.path.isfile(target_file):
            if not os.path.isdir(type_dir):
                os.makedirs(type_dir)
            try:
                msg = "Cloning '{}' from '{}'".format(corpus_name, git_uri)
                logger.info(msg)
                Repo.clone_from(
                    git_uri,
                    target_dir,
                    branch=branch,
                    depth=1,
                    progress=ProgressPrinter(),
                )
            except CorpusImportError as corpus_imp_err:
                # NOTE(review): ``Repo.clone_from`` raises GitPython
                # exceptions, not CorpusImportError — confirm reachable.
                msg = "Git clone of '{}' failed: '{}'".format(
                    git_uri, corpus_imp_err)
                logger.error(msg)
        # if corpus is present, pull latest
        else:
            try:
                repo = Repo(target_dir)
                assert not repo.bare  # or: assert repo.exists()
                git_origin = repo.remotes.origin
                msg = "Pulling latest '{}' from '{}'.".format(
                    corpus_name, git_uri)
                logger.info(msg)
                git_origin.pull()
            except CorpusImportError as corpus_imp_err:
                msg = "Git pull of '{}' failed: '{}'".format(
                    git_uri, corpus_imp_err)
                logger.error(msg)