def extract_dir(dir, ds):
    """Recursively collect non-empty XBRL values found under ``dir``.

    Every ``*.xbrl`` file below ``dir`` is parsed and one dict per non-empty
    value is appended to ``ds`` with the keys ``key``, ``context_ref``,
    ``value`` and ``ishtml``.

    Args:
        dir: Directory to scan (sub-directories are visited recursively).
        ds: List that receives the extracted records; it is mutated in place.

    Returns:
        The same ``ds`` list that was passed in.
    """
    xbrl_parser = EdinetXbrlParser()
    for entry in glob(dir + "/*"):
        if os.path.isdir(entry):
            # The recursive call appends to the very same list, so its
            # return value does not need to be kept.
            extract_dir(entry, ds)
        elif entry.split(".")[-1] == "xbrl":
            print(entry)
            document = xbrl_parser.parse_file(entry)
            for key in document.get_keys():
                print("key", key)
                for datum in document.get_data_list(key):
                    text = datum.get_value()
                    if text:
                        ds.append({
                            "key": key,
                            "context_ref": datum.get_context_ref(),
                            "value": text,
                            # Flag values that look like an HTML/XML fragment.
                            "ishtml": text.startswith("<") and text.endswith(">"),
                        })
    return ds
def extract_dir(dir, items, values):
    """Recursively collect XBRL items and inline-XBRL documents under ``dir``.

    * Files ending in ``_ixbrl.htm`` are read whole and appended to ``values``
      as ``{"value": <file text>, "part": <int>}``. ``part`` is the integer
      formed by the first seven characters of the file name when they are all
      decimal digits, otherwise ``9999999``.
    * Files ending in ``.xbrl`` are parsed and each non-empty value is
      appended to ``items`` as ``{"key", "value", "ishtml"}``.

    Args:
        dir: Directory to scan (sub-directories are visited recursively).
        items: List receiving the XBRL records; mutated in place.
        values: List receiving the inline-XBRL documents; mutated in place.

    Returns:
        The ``(items, values)`` pair.
    """
    parser = EdinetXbrlParser()
    for path in glob(dir + "/*"):
        if os.path.isdir(path):
            items, values = extract_dir(path, items, values)
            continue

        if path.endswith("_ixbrl.htm"):
            # NOTE(review): the platform default encoding is used, as before;
            # confirm whether these files should be read as UTF-8 explicitly.
            with open(path) as f:
                html = f.read()
            # Take the prefix from the file name itself. Removing ``dir`` with
            # str.replace() would also delete later occurrences of that text
            # and would not cope with backslash separators.
            prefix = os.path.basename(path)[:7]
            part = int(prefix) if prefix.isdecimal() else 9999999
            values.append({"value": html, "part": part})
        elif path.endswith(".xbrl"):
            xbrl = parser.parse_file(path)
            for key in xbrl.get_keys():
                for v in xbrl.get_data_list(key):
                    s = v.get_value()
                    if not s:
                        continue
                    # Ignore line breaks when deciding whether the value is
                    # wrapped in a tag.
                    tris = s.replace("\n", "")
                    items.append({
                        "key": key,
                        "value": s,
                        # NOTE(review): the original repeated this exact test
                        # twice joined by ``or``; the second operand may have
                        # been meant to match escaped brackets - confirm.
                        "ishtml": tris.startswith("<") and tris.endswith(">"),
                    })
    return items, values
def get(self, doc_id: str) -> EdinetObjWrapper:
    """Download the document identified by ``doc_id`` and parse its XBRL.

    The document URL is built with ``_generate_doc_url`` and fetched with
    ``_download_file``. When the download yields nothing, an error is logged
    and ``None`` is returned; otherwise the XBRL file located by ``get_xbrl``
    is parsed and wrapped in an ``EdinetObjWrapper``.
    """
    # Fetch the XBRL.
    downloaded = self._download_file(self._generate_doc_url(doc_id))
    if downloaded:
        xbrl_file = get_xbrl(downloaded)
        return EdinetObjWrapper(EdinetXbrlParser().parse_file(xbrl_file))

    logging.error('エラー! ファイルが取得または開くことができませんでした')
    return None
class EdinetExtractParser(Command):
    """Command that renders matching XBRL values into HTML issue files.

    Every file directly inside the workspace download directory is parsed;
    for each key containing ``target_text`` the associated values are rendered
    through ``public/template.html`` and written to
    ``<output_dir>/<file name>_issue.html``.
    """

    def __init__(self, target_text, workspace):
        """Store the parser, the workspace directories and the search text.

        Args:
            target_text: Text to look for inside XBRL keys; stored lower-cased.
            workspace: Object exposing ``download_file_dir`` and ``output_dir``.
        """
        self.parser = EdinetXbrlParser()
        self.xbrl_dir_path = workspace.download_file_dir
        self.issues_dir_path = workspace.output_dir
        self.target_text = target_text.lower()

    # If the font bothers you, delete "font-family: 'MS Mincho';".
    def execute(self):
        """Parse every downloaded file and write the matching values as HTML."""
        super().execute()
        only_files = [
            f for f in listdir(self.xbrl_dir_path)
            if isfile(join(self.xbrl_dir_path, f))
        ]

        # Close the template promptly instead of leaking the handle.
        with open('public/template.html') as template_file:
            template = Template(template_file.read())

        for f in only_files:
            # Build the path with join() so it matches the isfile() check
            # above whether or not the directory has a trailing separator.
            edinet_xbrl_object = self.parser.parse_file(join(self.xbrl_dir_path, f))
            issue_keys = {
                key for key in edinet_xbrl_object.get_keys()
                if self.target_text in str(key)
            }
            output_path = join(self.issues_dir_path, f + "_issue.html")
            for key in issue_keys:
                for edinet_data in edinet_xbrl_object.get_data_list(key):
                    # NOTE(review): every item re-opens the same path in "w"
                    # mode, so only the last item written remains in the file;
                    # confirm whether that is intended. The ``with`` block
                    # guarantees each write is flushed and closed before the
                    # next open truncates the file.
                    with open(output_path, "w", encoding='UTF-8') as html_file:
                        html_file.write(self.generateHtml(template, edinet_data))

    @staticmethod
    def generateHtml(template, value):
        """Fill ``template``'s ``main_contents`` placeholder with the value."""
        return template.substitute({'main_contents': value.get_value()})
from edinet_xbrl.edinet_xbrl_parser import EdinetXbrlParser

# Create the parser.
parser = EdinetXbrlParser()

# Parse the XBRL file into a data container.
xbrl_file_path = "./data/jpcrp030000-asr-001_E05663-000_2018-06-30_01_2018-09-19.xbrl"
edinet_xbrl_object = parser.parse_file(xbrl_file_path)

# Balance sheet: look up the assets entry for the chosen context.
key = "jppfs_cor:Assets"
context_ref = "CurrentYearInstant"
assets_data = edinet_xbrl_object.get_data_by_context_ref(key, context_ref)
current_year_assets = assets_data.get_value()
print(current_year_assets)
def __init__(self, target_text, workspace):
    """Set up the parser, the workspace directories and the search text.

    Args:
        target_text: Text to search for; it is stored lower-cased.
        workspace: Object exposing ``download_file_dir`` and ``output_dir``.
    """
    self.parser = EdinetXbrlParser()
    # Both the input and the output location come from the workspace.
    self.xbrl_dir_path, self.issues_dir_path = (
        workspace.download_file_dir,
        workspace.output_dir,
    )
    self.target_text = target_text.lower()
def _first_reported_value(xbrl, tags, context_refs):
    """Return the first value found for any tag/context pair as an int.

    Tags are tried in order and, for each tag, contexts are tried in order.
    NaN is returned when nothing is found or when the entry found carries no
    value.
    """
    for tag in tags:
        for ref in context_refs:
            data = xbrl.get_data_by_context_ref(tag, ref)
            if data is None:
                continue
            value = data.get_value()
            # A missing or blank value cannot be converted by int(); report it
            # as "not available" instead of raising ValueError.
            if value is None or str(value).strip() == "":
                return float("nan")
            return int(value)
    return float("nan")


def parse(dir, ticker):
    """Build a table of key financial figures from the XBRL files of a ticker.

    Every file in ``<dir>/<ticker>/`` is parsed. Documents whose cover-page
    title contains "四半期報告書" (quarterly report) or "有価証券報告書"
    (annual securities report) contribute one row; other documents are
    skipped.

    Args:
        dir: Base directory that contains one folder per ticker.
        ticker: Name of the folder to read.

    Returns:
        A ``pandas.DataFrame`` sorted by ``txt`` with the columns ``txt``
        (file name without directory and extension), ``Sales``,
        ``OperatingIncome``, ``OrdinaryIncome``, ``CurrentAssets``,
        ``FixedAssets``, ``CurrentLiabilities``, ``FixedLiabilities``,
        ``SalesCF``, ``InvestmentCF`` and ``FinanceCF``. Figures that are not
        found are NaN.
    """
    # Local import: the file's top-level imports are not visible from here.
    import os

    print("START PARSING")
    folder_path = dir + "/" + ticker

    parser = EdinetXbrlParser()

    doc_title_key = "jpcrp_cor:DocumentTitleCoverPage"
    cover_context = "FilingDateInstant"
    target_titles = ("四半期報告書", "有価証券報告書")

    # Output column -> candidate XBRL tags, tried in order.
    metric_tags = {
        "Sales": ["jppfs_cor:NetSales"],
        "OperatingIncome": ["jppfs_cor:OperatingIncome"],
        "OrdinaryIncome": ["jppfs_cor:OrdinaryIncome"],
        "CurrentAssets": ["jppfs_cor:CurrentAssets"],
        "FixedAssets": ["jppfs_cor:IntangibleAssets",
                        "jppfs_cor:NoncurrentAssets"],
        "CurrentLiabilities": ["jppfs_cor:CurrentLiabilities"],
        "FixedLiabilities": ["jppfs_cor:LongTermLoansPayable",
                             "jppfs_cor:NoncurrentLiabilities"],
        "SalesCF": ["jppfs_cor:NetCashProvidedByUsedInOperatingActivities"],
        "InvestmentCF": ["jppfs_cor:NetCashProvidedByUsedInInvestmentActivities"],
        "FinanceCF": ["jppfs_cor:NetCashProvidedByUsedInFinancingActivities"],
    }
    # Contexts tried, in order, for every tag.
    context_refs = [
        "CurrentYTDDuration",
        "CurrentQuarterInstant",
        "CurrentYearInstant",
        "CurrentYearInstant_NonConsolidatedMember",
        "CurrentYearDuration",
        "CurrentQuarterInstant_NonConsolidatedMember",
        "CurrentYTDDuration_NonConsolidatedMember",
        "CurrentYearDuration_NonConsolidatedMember",
    ]
    columns = ["txt", *metric_tags]

    # Start here: list the files in the folder, visited in reverse order.
    rows = []
    for filename in reversed(glob.glob(folder_path + "/*")):
        xbrl = parser.parse_file(filename)

        # Document title from the cover page.
        title_data = xbrl.get_data_by_context_ref(doc_title_key, cover_context)
        doc_name = (title_data.get_value() or "") if title_data is not None else ""
        if not any(title in doc_name for title in target_titles):
            continue

        print("--------")
        print(filename[1:])

        # str.strip() removes a *set of characters*, so it cannot be used to
        # drop the folder prefix or the ".xbrl" suffix without risking the
        # name itself; take the base name without its extension instead.
        stem = os.path.splitext(os.path.basename(filename))[0]
        row = [stem]
        row.extend(
            _first_reported_value(xbrl, tags, context_refs)
            for tags in metric_tags.values()
        )
        rows.append(row)

    # Build the frame once; DataFrame.append no longer exists in pandas 2.x
    # and appending row by row is quadratic anyway.
    df = pd.DataFrame(rows, columns=columns)
    df = df.sort_values(by="txt")
    print("FINISH PARSING")
    return df
def get_value_dict(filename: str):
    """Parse ``tests/adapter/<filename>.xbrl`` and return its value dict.

    The parsed document is wrapped in ``EdinetObjWrapper`` and the result of
    the wrapper's ``get_value_dict()`` is returned.
    """
    fixture_path = 'tests/adapter/%s.xbrl' % filename
    parsed = EdinetXbrlParser().parse_file(fixture_path)
    return EdinetObjWrapper(parsed).get_value_dict()
def parse(xbrl_file_path):
    """Parse the XBRL file at ``xbrl_file_path`` with a new EdinetXbrlParser.

    Returns whatever ``EdinetXbrlParser.parse_file`` returns for that path.
    """
    return EdinetXbrlParser().parse_file(xbrl_file_path)