コード例 #1
0
ファイル: tei.py プロジェクト: todd-cook/cltk
def onekgreek_tei_xml_to_text():
    """Convert the First 1k Years of Greek TEI XML files to plaintext.

    Globs the ``grc_text_first1kgreek`` corpus for ``*.xml`` files, drops
    every path containing ``__cts__``, parses each remaining file with
    BeautifulSoup (``lxml`` parser) and writes the text of its ``<body>``
    to ``grc_text_first1kgreek_plaintext/<name>.txt``.

    Raises:
        ImportError: If ``bs4_installed`` is false.
        FileNotFoundError: If the glob matches no XML file.
    """
    if not bs4_installed:
        logger.error("Install `bs4` and `lxml` to parse these TEI files.")
        raise ImportError
    xml_dir = make_cltk_path("grc/text/grc_text_first1kgreek/data/*/*/*.xml")
    xml_paths = glob.glob(xml_dir)
    if not xml_paths:
        logger.error(
            "1K Greek corpus not installed. Use ``FetchCorpus`` to get `First1KGreek`."
        )
        raise FileNotFoundError
    xml_paths = [path for path in xml_paths if "__cts__" not in path]

    # Output directory for the converted files.
    new_dir = make_cltk_path("grc/text/grc_text_first1kgreek_plaintext/")
    os.makedirs(new_dir, exist_ok=True)

    for xml_path in xml_paths:
        # ``str.rstrip(".xml")`` removes any trailing run of the characters
        # ".", "x", "m" and "l", which corrupts stems ending in one of them;
        # ``splitext`` removes exactly the extension.
        stem, _ = os.path.splitext(os.path.basename(xml_path))
        txt_name = stem + ".txt"
        # Explicit encoding: the corpus is Greek text, so the platform
        # default (e.g. cp1252 on Windows) must not be relied on.
        with open(xml_path, encoding="utf-8") as file_open:
            soup = BeautifulSoup(file_open, "lxml")
        body = soup.body
        if body is None:
            # Nothing to extract; skip instead of aborting the whole run.
            logger.error("No <body> found in '%s'; skipping.", xml_path)
            continue
        text = body.get_text()
        new_plaintext_path = os.path.join(new_dir, txt_name)
        with open(new_plaintext_path, "w", encoding="utf-8") as file_open:
            file_open.write(text)
コード例 #2
0
ファイル: tlgu.py プロジェクト: yelircaasi/cltk
    def divide_works(self, corpus):
        """Run ``convert()`` with the work-breaking option on a whole corpus.

        For ``"tlg"`` and ``"phi5"`` every file in the corpus' ``originals``
        directory whose name starts with the corpus prefix and ends with
        ``.TXT`` is passed to ``self.convert(..., divide_works=True)``.
        A failure on one file is logged and does not stop the loop.

        TODO: Maybe incorporate this into ``convert_corpus()``
        TODO: Write test for this

        Raises:
            CLTKException: For ``"phi7"`` or any unrecognised corpus name.
        """
        if corpus == "phi7":
            raise CLTKException(
                "``phi7`` cannot be divided into individual works.")
        if corpus not in ("tlg", "phi5"):
            raise CLTKException(
                f"Invalid corpus '{corpus}'. This should never happen.")

        if corpus == "tlg":
            source_dir = make_cltk_path("originals/tlg")
            output_dir = make_cltk_path("grc/text/tlg/individual_works")
            prefix, is_latin = "TLG", False
        else:
            source_dir = make_cltk_path("originals/phi5")
            output_dir = make_cltk_path("lat/text/phi5/individual_works")
            # ``is_latin`` feeds the optional ``lat`` argument of convert().
            prefix, is_latin = "LAT", True

        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        candidates = [
            name for name in os.listdir(source_dir)
            if name.startswith(prefix) and name.endswith(".TXT")
        ]

        for name in candidates:
            source_path = os.path.join(source_dir, name)
            destination_path = os.path.join(output_dir, name)
            try:
                self.convert(source_path,
                             destination_path,
                             divide_works=True,
                             lat=is_latin)
                logger.info("Writing files at %s to %s.", source_path,
                            output_dir)
            except Exception as err:
                logger.error("Failed to convert files: %s.", err)
コード例 #3
0
def assemble_phi5_works_filepaths():
    """Build one file path per work listed in ``PHI5_WORKS_INDEX``.

    Each path is ``<individual_works dir>/<author_code>.TXT-<work>.txt``,
    in index order (authors first, then that author's ``"works"``).
    """
    plaintext_dir = make_cltk_path("lat/text/phi5/individual_works/")
    return [
        os.path.join(plaintext_dir,
                     author_code + ".TXT" + "-" + work + ".txt")
        for author_code in PHI5_WORKS_INDEX
        for work in PHI5_WORKS_INDEX[author_code]["works"]
    ]
コード例 #4
0
ファイル: tei.py プロジェクト: todd-cook/cltk
def onekgreek_tei_xml_to_text_capitains():
    """Convert the First 1k Years of Greek TEI XML files to plaintext.

    Same input and output locations as the BeautifulSoup-based converter,
    but each file is read through ``CapitainsCtsText``: every passage at
    the deepest citation level is exported as plaintext (``tei:note``
    elements excluded) and the passages are concatenated.

    Raises:
        FileNotFoundError: If the glob matches no XML file.
    """
    xml_dir = make_cltk_path("grc/text/grc_text_first1kgreek/data/*/*/*.xml")
    xml_paths = glob.glob(xml_dir)
    if not xml_paths:
        logger.error(
            "1K Greek corpus not installed. Use CorpusInstaller to get `First1KGreek`."
        )
        raise FileNotFoundError
    xml_paths = [path for path in xml_paths if "__cts__" not in path]

    # Output directory for the converted files.
    new_dir = make_cltk_path("grc/text/grc_text_first1kgreek_plaintext/")
    os.makedirs(new_dir, exist_ok=True)

    for xml_path in xml_paths:
        # ``str.rstrip(".xml")`` removes any trailing run of the characters
        # ".", "x", "m" and "l", which corrupts stems ending in one of them;
        # ``splitext`` removes exactly the extension.
        stem, _ = os.path.splitext(os.path.basename(xml_path))
        txt_name = stem + ".txt"

        # Explicit encoding: the corpus is Greek text, so the platform
        # default must not be relied on.
        with open(xml_path, encoding="utf-8") as file_open:
            text = CapitainsCtsText(resource=file_open)
            # Collect the pieces and join once instead of repeated ``+=``.
            passages = [
                text.getTextualNode(subreference=ref, simple=True).export(
                    Mimetypes.PLAINTEXT, exclude=["tei:note"])
                for ref in text.getReffs(level=len(text.citation))
            ]
        plain_text = "".join(passages)

        new_plaintext_path = os.path.join(new_dir, txt_name)
        with open(new_plaintext_path, "w", encoding="utf-8") as file_open:
            file_open.write(plain_text)
コード例 #5
0
ファイル: tlgu.py プロジェクト: yelircaasi/cltk
 def _check_and_download_tlgu_source(self):
     """Make sure the TLGU source is present, fetching it if necessary.

     The presence of ``tlgu.h`` in the ``grc_software_tlgu`` directory is
     used as the marker. When it is missing, an interactive instance asks
     for confirmation first; a non-interactive one downloads right away.

     Raises:
         CLTKException: If the user declines the download.
     """
     path = make_cltk_path("grc/software/grc_software_tlgu/tlgu.h")
     if os.path.isfile(path):
         return

     print("This part of the CLTK depends upon TLGU, software written by Dimitri Marinakis `<http://tlgu.carmen.gr/>`_.")
     repo_url = "https://github.com/cltk/grc_software_tlgu.git"
     dl_dir = os.path.dirname(path)
     dl_question = f"Do you want to download TLGU from '{repo_url}' to '{dl_dir}'?"

     # Only an interactive session gets to say no.
     do_download = query_yes_no(question=dl_question) if self.interactive else True
     if not do_download:
         raise CLTKException(
             "TLGU software required for this class to work.")

     fetch_corpus = FetchCorpus(language="grc")
     fetch_corpus.import_corpus(corpus_name="grc_software_tlgu")
コード例 #6
0
 def __init__(self, interactive: bool = True):
     """Load the Lewis lexicon entries, downloading the corpus if missing.

     Args:
         interactive: When true, ask before downloading; when false,
             download without asking.

     Raises:
         CLTKException: If the entries cannot be loaded and the user
             declines the download.
     """
     self.interactive = interactive
     self.lewis_yaml_fp = make_cltk_path(
         "lat", "lexicon", "cltk_lat_lewis_elementary_lexicon",
         "lewis.yaml")
     try:
         self.entries = self._load_entries()
     except FileNotFoundError:
         # Non-interactive sessions always download.
         do_download = True
         if self.interactive:
             print("This part of the CLTK depends upon Lewis's *An Elementary Latin Dictionary* (1890).")
             do_download = query_yes_no(
                 question="Do you want to download this?")
         if not do_download:
             raise CLTKException(
                 f"File '{self.lewis_yaml_fp}' is not found. It is required for this class."
             )
         FetchCorpus(language="lat").import_corpus(
             corpus_name="cltk_lat_lewis_elementary_lexicon")
         # Second attempt, now that the corpus has been imported.
         self.entries = self._load_entries()
コード例 #7
0
ファイル: non.py プロジェクト: yelircaasi/cltk
 def __init__(self, interactive: bool = True):
     """Load the Zoëga dictionary entries, downloading the corpus if missing.

     Args:
         interactive: When true, ask before downloading; when false,
             download without asking.

     Raises:
         CLTKException: If the entries cannot be loaded and the user
             declines the download.
     """
     self.interactive = interactive
     self.zoega_yaml_fp = make_cltk_path("non", "dictionary",
                                         "cltk_non_zoega_dictionary",
                                         "dictionary.yaml")
     try:
         self.entries = self._load_entries()
     except FileNotFoundError:
         # Non-interactive sessions always download.
         do_download = True
         if self.interactive:
             print("This part of the CLTK depends upon Zoëga's *A Concise Old Norse Dictionary* (1890).")
             do_download = query_yes_no(
                 question="Do you want to download this?")
         if not do_download:
             raise CLTKException(
                 f"File '{self.zoega_yaml_fp}' is not found. It is required for this class."
             )
         FetchCorpus(language="non").import_corpus(
             corpus_name="cltk_non_zoega_dictionary")
         # Second attempt, now that the corpus has been imported.
         self.entries = self._load_entries()
コード例 #8
0
ファイル: file_utils.py プロジェクト: yelircaasi/cltk
def assemble_tlg_author_filepaths():
    """Return one ``<key>.TXT`` path in the TLG plaintext dir per ``TLG_INDEX`` key."""
    plaintext_dir = make_cltk_path("grc/text/tlg/plaintext/")
    file_names = (author_id + ".TXT" for author_id in TLG_INDEX)
    return [os.path.join(plaintext_dir, name) for name in file_names]
コード例 #9
0
def assemble_phi5_author_filepaths():
    """Return one ``<key>.TXT`` path in the PHI5 plaintext dir per ``PHI5_INDEX`` key."""
    plaintext_dir = make_cltk_path("lat/text/phi5/plaintext/")
    file_names = (author_id + ".TXT" for author_id in PHI5_INDEX)
    return [os.path.join(plaintext_dir, name) for name in file_names]
コード例 #10
0
ファイル: test_utils.py プロジェクト: jfaville/cltk
 def test_path(self):
     """make_cltk_path() with arguments appends them under ``~/cltk_data``."""
     actual = make_cltk_path('greek', 'perseus_corpus')
     expected = os.path.expanduser(
         os.path.join('~', 'cltk_data', 'greek', 'perseus_corpus'))
     self.assertEqual(actual, expected)
コード例 #11
0
ファイル: test_utils.py プロジェクト: jfaville/cltk
 def test_empty_path(self):
     """make_cltk_path() without arguments yields the expanded ``~/cltk_data``."""
     actual = make_cltk_path()
     expected = os.path.expanduser(os.path.join('~', 'cltk_data'))
     self.assertEqual(actual, expected)
コード例 #12
0
ファイル: test_utils.py プロジェクト: soumenganguly/cltk
 def test_path(self):
     """make_cltk_path() with arguments appends them under ``~/cltk_data``."""
     actual = make_cltk_path('greek', 'perseus_corpus')
     unexpanded = os.path.join('~', 'cltk_data', 'greek', 'perseus_corpus')
     self.assertEqual(actual, os.path.expanduser(unexpanded))
コード例 #13
0
ファイル: test_utils.py プロジェクト: soumenganguly/cltk
 def test_empty_path(self):
     """make_cltk_path() without arguments yields the expanded ``~/cltk_data``."""
     actual = make_cltk_path()
     unexpanded = os.path.join('~', 'cltk_data')
     self.assertEqual(actual, os.path.expanduser(unexpanded))
コード例 #14
0
 def test_empty_path(self):
     """make_cltk_path() without arguments equals get_cltk_data_dir()."""
     actual = make_cltk_path()
     expected = get_cltk_data_dir()
     self.assertEqual(actual, expected)
コード例 #15
0
ファイル: tlgu.py プロジェクト: yelircaasi/cltk
 def _check_install(self):
     """Check if tlgu is installed; if not, build and install it.

     ``which tlgu`` decides whether anything has to be done. When TLGU is
     missing, ``make install`` is run in the ``grc_software_tlgu``
     directory, first without and then, on failure, with ``sudo``.
     Interactive instances ask for confirmation before each attempt.

     Returns:
         ``True`` when the install without sudo succeeds; ``None`` when
         TLGU is already installed, when GCC is not found, or when the
         sudo install succeeds.

     Raises:
         CLTKException: If the user declines an install, if running the
             install command fails, or if the sudo install exits non-zero.
     """
     try:
         subprocess.check_output(["which", "tlgu"])
     except subprocess.SubprocessError as sub_err:
         print("TLGU not installed.")
         logger.info("TLGU not installed: %s", sub_err)
         logger.info("Installing TLGU.")
     else:
         # ``which`` exited with status 0: nothing to install.
         return None

     # ``check_output`` raises on a non-zero exit status instead of
     # returning an empty result, so a missing gcc must be caught here;
     # otherwise the error message below could never be reached.
     try:
         gcc_path = subprocess.check_output(["which", "gcc"])
     except subprocess.SubprocessError:
         gcc_path = b""
     if not gcc_path:
         logger.error("GCC seems not to be installed.")
         return None

     tlgu_path = make_cltk_path("grc/software/grc_software_tlgu")
     if self.interactive:
         install_question = "Do you want to install TLGU?"
         if not query_yes_no(question=install_question):
             raise CLTKException(
                 "TLGU installation required for this class to work."
             )
     else:
         print("Non-interactive installation. Continuing ...")

     # First attempt: plain ``make install``.
     command = "cd {0} && make install".format(tlgu_path)
     print(f"Going to run command: ``{command}``")
     try:
         p_out = subprocess.call(command, shell=True)
     except subprocess.SubprocessError as sub_err:
         print(
             "Error executing installation. Going to check output of ``subprocess.call()`` ..."
         )
         raise CLTKException(sub_err) from sub_err
     if p_out == 0:
         msg = "TLGU installed."
         print(msg)
         logger.info(msg)
         return True

     msg = "TLGU install without sudo failed. Going to try again with sudo (usually required for Linux) ..."
     print(msg)
     logger.error(msg)

     # Second attempt: the same command under ``sudo``.
     command = "cd {0} && sudo make install".format(tlgu_path)
     if self.interactive:
         install_question = "Do you want to install TLGU? with sudo?"
         if not query_yes_no(question=install_question):
             raise CLTKException(
                 "TLGU installation required for this class to work."
             )
     else:
         print("Going to run command:", command)
     p_out = subprocess.call(command, shell=True)
     if p_out != 0:
         msg = "TLGU install with sudo failed."
         print(msg)
         logger.error(msg)
         raise CLTKException(
             "TLGU installation required for this class to work.")
     msg = "TLGU installed."
     print(msg)
     logger.info(msg)
     return None
コード例 #16
0
ファイル: test_corpora.py プロジェクト: free-variation/cltk
 def test_tlgu_init(self):
     """Constructing a non-interactive TLGU leaves the TLGU README on disk."""
     TLGU(interactive=False)
     readme_path = make_cltk_path(
         "greek/software/greek_software_tlgu/README.md")
     readme_exists = os.path.isfile(readme_path)
     self.assertTrue(readme_exists)
コード例 #17
0
ファイル: tlgu.py プロジェクト: yelircaasi/cltk
 def convert_corpus(self, corpus, markup=None, lat=None):  # pylint: disable=W0613
     """Convert every ``*TXT`` file of an imported TLG or PHI corpus.

     Files are read from ``<cltk data>/originals/<corpus>`` and written
     to ``<cltk data>/<lang>/text/<corpus>/<subdir>``, where ``<lang>``
     is ``lat`` for ``phi5`` (and for ``phi7`` when ``lat`` is ``True``)
     and ``grc`` otherwise, and ``<subdir>`` is ``plaintext`` when
     ``markup`` is ``None`` and ``str(markup)`` otherwise. A failure on
     one file is logged and does not stop the loop.

     TODO: Add markup options to input.
     TODO: Add rm_newlines, divide_works, and extra_args

     Args:
         corpus: One of ``"tlg"``, ``"phi5"`` or ``"phi7"``.
         markup: Only selects the output sub-directory name; ``convert()``
             is always called with ``markup=False``.
         lat: Honoured only for ``phi7``; ``True`` selects Latin.

     Raises:
         AssertionError: If ``corpus`` is not one of the three names.
         Exception: Whatever ``os.listdir`` raises for the originals
             directory is logged and re-raised.
     """
     # Explicit raise instead of ``assert`` so the check survives
     # ``python -O``; the exception type is unchanged for callers.
     if corpus not in ("tlg", "phi5", "phi7"):
         raise AssertionError("Corpus must be 'tlg', 'phi5', or 'phi7'")

     orig_path = os.path.join(make_cltk_path("originals"), corpus)
     target_path = make_cltk_path()
     # The former test ``"phi7" and lat is True`` was true for any corpus
     # because the string literal is truthy, so ``tlg`` with ``lat=True``
     # ended up in the Latin tree. Compare against ``corpus`` instead.
     if corpus == "phi5" or (corpus == "phi7" and lat is True):
         lat = True
         target_path = os.path.join(target_path, "lat", "text", corpus)
     else:
         lat = None
         target_path = os.path.join(target_path, "grc", "text", corpus)

     try:
         corpus_files = os.listdir(orig_path)
     except Exception as exception:
         logger.error("Failed to find TLG files: %s", exception)
         raise

     # Files to be converted.
     txts = [x for x in corpus_files if x.endswith("TXT")]

     # The output directory does not depend on the file, so resolve it
     # once; create it only when there is something to write.
     subdir = "plaintext" if markup is None else str(markup)
     target_txt_dir = os.path.join(target_path, subdir)
     if txts:
         os.makedirs(target_txt_dir, exist_ok=True)

     for txt in txts:
         orig_txt_path = os.path.join(orig_path, txt)
         target_txt_path = os.path.join(target_txt_dir, txt)
         try:
             self.convert(
                 orig_txt_path,
                 target_txt_path,
                 markup=False,
                 rm_newlines=False,
                 divide_works=False,
                 lat=lat,
                 extra_args=None,
             )
         except Exception as exception:
             logger.error(
                 "Failed to convert file '%s' to '%s': %s",
                 orig_txt_path,
                 target_txt_path,
                 exception,
             )