Exemple #1
0
 def test_get_pdf_without_dir(self):
     """Fetch a PDF without passing ``save_dir`` and check the saved file.

     The test asserts that the path returned by ``DocumentClient.get_pdf``
     exists and that its base name starts with ``S100FGR9_2__`` and ends
     with ``.pdf``.
     """
     client = DocumentClient()
     file_path = client.get_pdf("S100FGR9")
     try:
         self.assertTrue(os.path.exists(file_path))
         name = os.path.basename(file_path)
         self.assertTrue(name.startswith("S100FGR9_2__"))
         self.assertTrue(name.endswith(".pdf"))
     finally:
         # Remove the download even when an assertion above fails, so a
         # failing run does not leave files behind. The existence guard
         # keeps the cleanup from masking the original assertion error.
         if os.path.exists(file_path):
             os.remove(file_path)
Exemple #2
0
 def get_pdf(self, save_dir: str = "", file_name: str = ""):
     """Download this document's PDF through a fresh ``DocumentClient``.

     ``self.document_id``, ``save_dir`` and ``file_name`` are forwarded
     positionally to ``DocumentClient.get_pdf`` and its result is returned
     unchanged.
     """
     # Imported inside the method, as in the original code.
     from xbrr.edinet.client.document_client import DocumentClient

     document_id = self.document_id
     return DocumentClient().get_pdf(document_id, save_dir, file_name)
Exemple #3
0
 def test_get_pdf(self):
     """Fetch a PDF into this test module's directory and check it exists."""
     _dir = os.path.dirname(__file__)
     client = DocumentClient()
     file_path = client.get_pdf("S100FGR9", save_dir=_dir)
     try:
         self.assertTrue(os.path.exists(file_path))
     finally:
         # Remove the download even when the assertion fails; the guard
         # keeps the cleanup from masking the assertion error.
         if os.path.exists(file_path):
             os.remove(file_path)
Exemple #4
0
    def collect(self,
                directory="",
                source_directory="",
                year="",
                edinet_code="",
                sec_code="",
                jcn="",
                file_type="xbrl"):
        """
        Collect the documents based on ledger file.

        Rows of ``self.data`` are narrowed by every non-empty filter
        (``year`` is matched against the ``fiscal_year`` column, the others
        against the column of the same name). For each remaining row the
        document is copied from the local source tree when
        ``<source>/<fiscal_year>/docs/<doc_id>.xbrl`` exists, and downloaded
        otherwise.

        NOTE(review): the local-copy branch always looks for a ``.xbrl``
        file, whatever ``file_type`` is -- confirm this is intended.

        Args:
            directory: Output root. Relative paths are resolved against the
                current working directory; empty means
                ``self.storage._default_raw_data`` under the cwd. Files go
                into a sub-directory named after the active filter values
                joined with ``_``.
            source_directory: Root that is searched for local copies,
                resolved the same way as ``directory``.
            year: Filter on ``fiscal_year`` (compared as a string).
            edinet_code: Filter on ``edinet_code``.
            sec_code: Filter on ``sec_code`` (compared as a string).
            jcn: Filter on ``jcn`` (compared as a string).
            file_type: What to download when no local copy exists; one of
                ``"pdf"``, ``"xbrl"``, ``"zip"`` or ``"csr"``.

        Returns:
            The filtered rows of ``self.data``.

        Raises:
            Exception: If no filter is given, or if a download is required
                and ``file_type`` is not supported.
        """

        def _as_filter(value):
            # None means "no filter"; str(None) would otherwise become the
            # truthy filter value "None" and silently match nothing.
            return "" if value is None else str(value)

        # joinpath() drops the cwd prefix when its argument is absolute, so
        # one expression covers the empty, relative and absolute cases.
        s_dir = Path.cwd().joinpath(
            source_directory or self.storage._default_raw_data)

        target = self.data

        filters = {
            "fiscal_year": _as_filter(year),
            "edinet_code": edinet_code,
            "sec_code": _as_filter(sec_code),
            "jcn": _as_filter(jcn),
        }

        conditions = []
        for column, value in filters.items():
            if value:
                target = target[target[column] == value]
                conditions.append(value)

        if not conditions:
            raise Exception("You have to specify at least one condition.")

        t_dir = Path.cwd().joinpath(
            directory or self.storage._default_raw_data)
        t_dir = t_dir.joinpath("_".join(conditions))
        t_dir.mkdir(parents=True, exist_ok=True)

        supported_file_types = ("pdf", "xbrl", "zip", "csr")

        for _, r in tqdm(target.iterrows(), total=target.shape[0]):
            doc_id = r["doc_id"]
            # str() guards against a non-string fiscal_year cell, which
            # joinpath() would reject.
            y_s_dir = s_dir.joinpath(str(r["fiscal_year"])).joinpath("docs")
            y_s_path = y_s_dir.joinpath(f"{doc_id}.xbrl")

            if y_s_path.exists():
                shutil.copy(str(y_s_path),
                            str(t_dir.joinpath(f"{doc_id}.xbrl")))
                continue

            client = DocumentClient()
            # Checked only once a download is actually needed, so runs that
            # are served entirely from local copies behave as before.
            if file_type not in supported_file_types:
                raise Exception(f"File type {file_type} is not supported")

            try:
                if file_type == "pdf":
                    client.get_pdf(doc_id, save_dir=t_dir)
                elif file_type == "xbrl":
                    client.get_xbrl(doc_id,
                                    save_dir=t_dir,
                                    expand_level="file")
                elif file_type == "zip":
                    client.get_xbrl(doc_id,
                                    save_dir=t_dir,
                                    expand_level="dir")
                elif isinstance(r['csr_path'], str) and r['csr_path']:
                    # file_type == "csr": rows without a CSR path are skipped.
                    print(r['csr_path'])
                    file_name = os.path.basename(r['csr_path'])
                    url = f"https://s3-ap-northeast-1.amazonaws.com/chakki.esg.csr.jp/{r['csr_path']}"
                    self.storage._download(url,
                                           t_dir.joinpath(file_name))
            except Exception as e:
                # Best effort: report the failure with its cause and move on
                # to the next document.
                print("Can not download {}. ({})".format(doc_id, e))

            time.sleep(0.1)  # to save api host

        return target