def test_collect(self):
    """Check that a downloaded ledger exists on disk and can collect data.

    Downloads the 2018 "XF" dataset and the ledger under
    ``{self.ROOT}/processed``, then verifies that the ledger holds more
    than one entry, that its path exists, and that collecting by EDINET
    code "E00021" yields at least one item.
    """
    storage = Storage(self.ROOT)
    # The return value is not needed; the call is kept for its download
    # side effect (presumably collect() reads this data -- TODO confirm).
    storage.download(kind="XF", year=2018)

    ledger = storage.download_ledger(directory=f"{self.ROOT}/processed")
    self.assertGreater(len(ledger.data), 1)
    # exists must be *called*: the bare bound method is always truthy,
    # which made this assertion pass unconditionally.
    self.assertTrue(Path(ledger.path).exists())

    loaded = ledger.collect(edinet_code="E00021")
    self.assertGreater(len(loaded), 0)
def test_download_extracted(self):
    """Check the on-disk layout produced by downloading the 2018 "XE" data.

    The download goes to ``{self.ROOT}/rawe``; the returned path must
    exist, be named "2018", and contain ``documents.csv`` plus a ``docs``
    directory holding more than one ``*.txt`` file.
    """
    target = f"{self.ROOT}/rawe"
    path = Storage(self.ROOT).download(directory=target, kind="XE", year=2018)

    self.assertTrue(path.exists())
    self.assertEqual(path.name, "2018")

    catalog = path.joinpath("documents.csv")
    self.assertTrue(catalog.exists())

    docs_dir = path.joinpath("docs")
    self.assertTrue(docs_dir.exists())

    text_files = list(path.joinpath("docs").glob("*.txt"))
    self.assertGreater(len(text_files), 1)
def test_parse(self):
    """Check the output of ``Storage.parse`` for two aspect.element targets.

    First parses "company.history" and verifies that the returned path
    contains ``2018/documents.csv`` and at least one
    ``*company_history.txt`` file under ``2018/docs``. Then parses
    "business.risks" restricted to ``sec_code="1376"`` and verifies that
    ``documents.csv`` has exactly two lines and that exactly one
    ``*business_risks.txt`` file was written.
    """
    storage = Storage(self.ROOT)
    # The returned path is not used; the call is kept for its download
    # side effect (presumably parse() reads this data -- TODO confirm).
    self._download(kind="F")

    path = storage.parse("company.history")
    self.assertTrue(path.exists())
    self.assertTrue(path.joinpath("2018").exists())
    self.assertTrue(path.joinpath("2018/documents.csv").exists())
    self.assertGreater(
        len(list(path.joinpath("2018/docs").glob("*company_history.txt"))), 0)

    path = storage.parse("business.risks", sec_code="1376")
    with path.joinpath("2018/documents.csv").open(encoding="utf-8") as f:
        # assertEqual replaces the deprecated assertEquals alias, which
        # was removed from unittest in Python 3.12.
        self.assertEqual(len(f.readlines()), 2)
    self.assertEqual(
        len(list(path.joinpath("2018/docs").glob("*business_risks.txt"))), 1)
def __init__(self, version="v1.0"):
    """Create the backing Storage object.

    Args:
        version (str): Version string forwarded to ``Storage``.
    """
    backend = Storage(version=version)
    self._storage = backend
class CoARiJ(object):
    """Data management tool for the CoARiJ dataset.

    Every public method forwards its arguments to an internal ``Storage``
    instance and returns that call's result unchanged.
    """

    def __init__(self, version="v1.0"):
        """Create the backing Storage object.

        Args:
            version (str): Version string forwarded to ``Storage``.
        """
        backend = Storage(version=version)
        self._storage = backend

    def download(self, directory="", kind="F", year="", force=False):
        """Download the {kind} {year} dataset to {directory}.

        Args:
            directory (str): Directory the dataset is downloaded to.
            kind (str): 'F': raw file data, 'E': text extracted data.
            year (str): Financial year of the dataset.
            force (bool): When True, overwrite data if it exists.

        Returns:
            str: Path to the downloaded directory.
        """
        # Forwarded positionally, in the same order as this signature.
        downloaded = self._storage.download(directory, kind, year, force)
        return downloaded

    def extract(self, aspect_element, year="", edinet_code="", sec_code="",
                jcn="", source_directory="", target_directory="",
                normalized=True):
        """Extract {aspect_element} from files in {source_directory}{year}.

        The result is saved in {target_directory}{year} as txt/html files.

        Args:
            aspect_element (str): Target aspect.element
                (ex: company.history).
            year (str): Target financial year.
            edinet_code (str): EDINET code to specify a company.
            sec_code (str): SEC code to specify a company.
            jcn (str): JCN code to specify a company.
            source_directory (str): Source directory that includes
                XBRL files.
            target_directory (str): Target directory where txt/html
                files are saved.
            normalized (bool): True: extract text,
                False: save raw xml (html).

        Returns:
            str: Path to the extracted files directory.
        """
        # The storage layer names this parameter ``aspect_dot_element``.
        options = {
            "aspect_dot_element": aspect_element,
            "year": year,
            "edinet_code": edinet_code,
            "sec_code": sec_code,
            "jcn": jcn,
            "source_directory": source_directory,
            "target_directory": target_directory,
            "normalized": normalized,
        }
        return self._storage.extract(**options)

    def tokenize(self, tokenizer="janome", mode="", dictionary="",
                 dictionary_type="", year="", edinet_code="", sec_code="",
                 jcn="", aspect_element="", source_directory="",
                 target_directory=""):
        """Tokenize files in {source_directory}{year} with {tokenizer}.

        The result is saved in {target_directory}{year} as txt/html files.

        Args:
            tokenizer (str): Japanese tokenizer ('janome' or 'sudachi').
            mode (str): Sudachi tokenizer mode.
            dictionary (str): Dictionary path for Janome.
            dictionary_type (str): Dictionary type for Janome.
            year (str): Target financial year.
            edinet_code (str): EDINET code to specify a company.
            sec_code (str): SEC code to specify a company.
            jcn (str): JCN code to specify a company.
            aspect_element (str): Target aspect.element
                (ex: company.history).
            source_directory (str): Source directory that includes
                XBRL files.
            target_directory (str): Target directory where txt/html
                files are saved.

        Returns:
            str: Path to the tokenized files directory.
        """
        # The storage layer names this parameter ``aspect_dot_element``.
        options = {
            "tokenizer": tokenizer,
            "mode": mode,
            "dictionary": dictionary,
            "dictionary_type": dictionary_type,
            "year": year,
            "edinet_code": edinet_code,
            "sec_code": sec_code,
            "jcn": jcn,
            "aspect_dot_element": aspect_element,
            "source_directory": source_directory,
            "target_directory": target_directory,
        }
        return self._storage.tokenize(**options)
def _download(self, kind="F"):
    """Download the 2018 dataset whose kind is "X" followed by ``kind``.

    Args:
        kind (str): Suffix appended to "X" to form the dataset kind.

    Returns:
        Whatever ``Storage.download`` returns for that request.
    """
    dataset_kind = "X" + kind
    return Storage(self.ROOT).download(kind=dataset_kind, year=2018)
def test_download_ledger(self):
    """Check that the ledger downloads with data and exists on disk.

    Downloads the ledger under ``{self.ROOT}/processed`` and verifies
    that it holds more than one entry and that its path exists.
    """
    storage = Storage(self.ROOT)
    ledger = storage.download_ledger(directory=f"{self.ROOT}/processed")
    self.assertGreater(len(ledger.data), 1)
    # exists must be *called*: the bare bound method is always truthy,
    # which made this assertion pass unconditionally.
    self.assertTrue(Path(ledger.path).exists())
def __init__(self):
    """Create the backing Storage object with its default arguments."""
    backend = Storage()
    self._storage = backend