Example #1
0
    def test_parse(self):
        """Verify that ``Storage.parse`` produces the expected 2018 output.

        Two scenarios are exercised:

        * parsing ``company.history`` with no filter must create the output
          directory, a ``2018`` sub-directory, a ``documents.csv`` index and
          at least one ``*company_history.txt`` file;
        * parsing ``business.risks`` restricted to ``sec_code="1376"`` must
          leave exactly two lines in ``documents.csv`` and exactly one
          ``*business_risks.txt`` file.
        """
        storage = Storage(self.ROOT)
        # Only the side effect matters here (presumably it fetches the raw
        # "F" dataset that parse() reads); the returned path is not used.
        self._download(kind="F")

        path = storage.parse("company.history")
        self.assertTrue(path.exists())
        self.assertTrue(path.joinpath("2018").exists())
        self.assertTrue(path.joinpath("2018/documents.csv").exists())
        self.assertGreater(
            len(list(path.joinpath("2018/docs").glob("*company_history.txt"))),
            0)

        path = storage.parse("business.risks", sec_code="1376")
        with path.joinpath("2018/documents.csv").open(encoding="utf-8") as f:
            # assertEqual, not the assertEquals alias removed in Python 3.12.
            self.assertEqual(len(f.readlines()), 2)
        self.assertEqual(
            len(list(path.joinpath("2018/docs").glob("*business_risks.txt"))),
            1)
Example #2
0
class CoARiJ(object):
    """
    Data management tool for the CoARiJ dataset.

    Each public method forwards its arguments, in declaration order, to
    the ``Storage`` instance created in the constructor and returns
    whatever that call returns.
    """

    def __init__(self):
        self._storage = Storage()

    def download(self, directory="", kind="F", year="", force=False):
        """Download the {kind} {year} dataset to {directory}.

        Args:
            directory (str): Directory the dataset is downloaded into.
            kind (str): 'F': raw file data, 'E': text extracted data.
            year (str): Financial year of the dataset.
            force (bool): When True, overwrite data that already exists.

        Returns:
            str: Path to the downloaded directory.

        """
        forwarded = (directory, kind, year, force)
        return self._storage.download(*forwarded)

    def parse(self,
              aspect_element,
              source_directory="",
              target_directory="",
              year="",
              edinet_code="",
              sec_code="",
              jcn="",
              normalized=True):
        """
        Parse {aspect_element} from files in {source_directory}{year} and
        save it in {target_directory}{year} as a txt/html file.

        Args:
            aspect_element (str): Target aspect.element (ex: company.history).
            source_directory (str): Source directory that holds XBRL files.
            target_directory (str): Directory the txt/html files are saved to.
            year (str): Target financial year.
            edinet_code (str): EDINET code to specify a company.
            sec_code (str): SEC code to specify a company.
            jcn (str): JCN code to specify a company.
            normalized (bool): True: extract text, False: save raw xml(html).

        Returns:
            str: Path to the directory of parsed files.

        """
        forwarded = (
            aspect_element,
            source_directory,
            target_directory,
            year,
            edinet_code,
            sec_code,
            jcn,
            normalized,
        )
        return self._storage.parse(*forwarded)

    def tokenize(self,
                 tokenizer="janome",
                 source_directory="",
                 target_directory="",
                 year="",
                 edinet_code="",
                 sec_code="",
                 jcn="",
                 aspect_element="",
                 mode="",
                 dictionary="",
                 dictionary_type=""):
        """
        Tokenize by {tokenizer} from files in {source_directory}{year} and
        save it in {target_directory}{year} as a txt/html file.

        Args:
            tokenizer (str): Japanese tokenizer ('janome' or 'sudachi').
            source_directory (str): Source directory that holds XBRL files.
            target_directory (str): Directory the txt/html files are saved to.
            year (str): Target financial year.
            edinet_code (str): EDINET code to specify a company.
            sec_code (str): SEC code to specify a company.
            jcn (str): JCN code to specify a company.
            aspect_element (str): Target aspect.element (ex: company.history).
            mode (str): Sudachi tokenizer mode.
            dictionary (str): Dictionary path for Janome.
            dictionary_type (str): Dictionary type for Janome.

        Returns:
            str: Path to the output directory, as returned by the storage.

        """
        forwarded = (
            tokenizer,
            source_directory,
            target_directory,
            year,
            edinet_code,
            sec_code,
            jcn,
            aspect_element,
            mode,
            dictionary,
            dictionary_type,
        )
        return self._storage.tokenize(*forwarded)