コード例 #1
0
ファイル: etl.py プロジェクト: temy13/something
def extract_dir(dir, ds):
    """Recursively collect every non-empty XBRL value found under ``dir``.

    Each file whose name ends in ``.xbrl`` is parsed with
    ``EdinetXbrlParser`` and one dict per non-empty value is appended to
    ``ds`` with the keys ``"key"``, ``"context_ref"``, ``"value"`` and
    ``"ishtml"``. Sub-directories are walked recursively.

    Args:
        dir: Directory whose direct children are scanned (``dir + "/*"``).
        ds: List that receives the extracted records; it is mutated in place.

    Returns:
        The same ``ds`` list that was passed in.
    """
    parser = EdinetXbrlParser()
    for path in glob(dir + "/*"):
        if os.path.isdir(path):
            # The recursive call appends to the shared ``ds`` list, so its
            # return value does not need to be kept.
            extract_dir(path, ds)
            continue
        if not path.endswith(".xbrl"):
            continue
        print(path)
        xbrl = parser.parse_file(path)
        for key in xbrl.get_keys():
            print("key", key)
            for v in xbrl.get_data_list(key):
                s = v.get_value()
                if not s:
                    # Skip missing or empty values.
                    continue
                ds.append({
                    "key": key,
                    "context_ref": v.get_context_ref(),
                    "value": s,
                    # Heuristic: a value wrapped in angle brackets is
                    # flagged as HTML.
                    "ishtml": s.startswith("<") and s.endswith(">")
                })
    return ds
コード例 #2
0
ファイル: etl.py プロジェクト: temy13/edinet_search
def extract_dir(dir, items, values):
    """Recursively collect XBRL items and inline-XBRL HTML under ``dir``.

    Two kinds of files are handled:

    * ``*_ixbrl.htm``: the whole file content is appended to ``values``
      as ``{"value": <html>, "part": <int>}``. ``part`` is the integer formed
      by the first seven characters of the file name when they are all
      decimal digits, otherwise ``9999999``.
    * ``*.xbrl``: the file is parsed with ``EdinetXbrlParser`` and every
      non-empty value is appended to ``items`` as
      ``{"key": ..., "value": ..., "ishtml": ...}``.

    Sub-directories are walked recursively.

    Args:
        dir: Directory whose direct children are scanned (``dir + "/*"``).
        items: List receiving the XBRL records; mutated in place.
        values: List receiving the inline-XBRL HTML records; mutated in place.

    Returns:
        The ``(items, values)`` pair.
    """
    parser = EdinetXbrlParser()
    for path in glob(dir + "/*"):
        if os.path.isdir(path):
            items, values = extract_dir(path, items, values)
            continue

        if path.endswith("_ixbrl.htm"):
            # NOTE(review): the file is read with the platform default
            # encoding, as before - confirm whether an explicit encoding
            # is required for these documents.
            with open(path) as f:
                html = f.read()
            # Take the prefix from the file name itself. Stripping ``dir``
            # with str.replace would also remove any occurrence of that text
            # inside the file name and would leave a backslash separator in
            # place on Windows.
            p = os.path.basename(path)[:7]
            part = int(p) if p.isdecimal() else 9999999
            values.append({"value": html, "part": part})
        elif path.endswith(".xbrl"):
            xbrl = parser.parse_file(path)
            for key in xbrl.get_keys():
                for v in xbrl.get_data_list(key):
                    s = v.get_value()
                    if not s:
                        # Skip missing or empty values.
                        continue
                    # Newlines are dropped only for the HTML heuristic; the
                    # stored value keeps them.
                    tris = s.replace("\n", "")
                    items.append({
                        "key": key,
                        "value": s,
                        # Flag values wrapped in raw or entity-escaped angle
                        # brackets as HTML.
                        "ishtml": (tris.startswith("<") and tris.endswith(">"))
                        or (tris.startswith("&lt;") and tris.endswith("&gt;"))
                    })

    return items, values
コード例 #3
0
    def get(self, doc_id: str) -> EdinetObjWrapper:
        """Download the document identified by ``doc_id`` and wrap its XBRL.

        The document URL is built with ``_generate_doc_url`` and fetched with
        ``_download_file``. When the download yields a path, ``get_xbrl`` is
        used to obtain the XBRL file path, which is parsed with
        ``EdinetXbrlParser`` and wrapped in an ``EdinetObjWrapper``.

        Returns:
            The wrapped parse result, or ``None`` (after logging an error)
            when ``_download_file`` returns a falsy value.
        """
        # Fetch the XBRL.
        downloaded = self._download_file(self._generate_doc_url(doc_id))
        if downloaded:
            xbrl_file = get_xbrl(downloaded)
            return EdinetObjWrapper(EdinetXbrlParser().parse_file(xbrl_file))

        logging.error('エラー! ファイルが取得または開くことができませんでした')
        return None
コード例 #4
0
class EdinetExtractParser(Command):
    """Command that renders matching XBRL values into HTML issue files.

    For every regular file in the workspace download directory, the XBRL is
    parsed, the keys containing ``target_text`` are selected, and each value
    found under those keys is rendered through ``public/template.html`` into
    ``<output_dir>/<file name>_issue.html``.
    """

    def __init__(self, target_text, workspace):
        """Store the parser, the workspace directories and the search text.

        Args:
            target_text: Substring looked for in XBRL keys; stored lower-cased.
            workspace: Object providing ``download_file_dir`` and
                ``output_dir``.
        """
        self.parser = EdinetXbrlParser()
        self.xbrl_dir_path = workspace.download_file_dir
        self.issues_dir_path = workspace.output_dir
        self.target_text = target_text.lower()

    # If the font is a concern, remove `font-family: &apos;MS Mincho&apos;;`
    # (presumably from the rendered HTML content - confirm).
    def execute(self):
        """Parse every downloaded file and write the rendered issue HTML."""
        super().execute()
        only_files = [f for f in listdir(self.xbrl_dir_path) if isfile(join(self.xbrl_dir_path, f))]

        # Read the template once and close the handle right away.
        with open('public/template.html') as template_file:
            template = Template(template_file.read())

        for f in only_files:
            # Use join() for the path, consistent with the isfile() check
            # above, so a directory without a trailing separator also works.
            edinet_xbrl_object = self.parser.parse_file(join(self.xbrl_dir_path, f))
            issue_keys = {key for key in edinet_xbrl_object.get_keys() if self.target_text in str(key)}
            output_path = join(self.issues_dir_path, f + "_issue.html")
            for key in issue_keys:
                for edinet_data in edinet_xbrl_object.get_data_list(key):
                    # Mode "w" truncates on every write, so the file ends up
                    # holding only the last value rendered for this input.
                    with open(output_path, "w", encoding='UTF-8') as html_file:
                        html_file.write(self.generateHtml(template, edinet_data))

    @staticmethod
    def generateHtml(template, value):
        """Substitute ``value.get_value()`` for ``main_contents`` in ``template``."""
        return template.substitute({'main_contents': value.get_value()})
コード例 #5
0
from edinet_xbrl.edinet_xbrl_parser import EdinetXbrlParser

## Input file and the fact to look up (balance sheet item)
xbrl_file_path = "./data/jpcrp030000-asr-001_E05663-000_2018-06-30_01_2018-09-19.xbrl"
key = "jppfs_cor:Assets"
context_ref = "CurrentYearInstant"

## Create the parser and load the XBRL file into a data container
parser = EdinetXbrlParser()
edinet_xbrl_object = parser.parse_file(xbrl_file_path)

## Fetch the data object for the key/context pair, then read and show its value
assets_data = edinet_xbrl_object.get_data_by_context_ref(key, context_ref)
current_year_assets = assets_data.get_value()
print(current_year_assets)
コード例 #6
0
 def __init__(self, target_text, workspace):
     """Keep a parser, the workspace directories and the search text.

     ``target_text`` is stored lower-cased; the download and output
     directories are read from ``workspace``.
     """
     self.parser = EdinetXbrlParser()
     self.xbrl_dir_path, self.issues_dir_path = (
         workspace.download_file_dir,
         workspace.output_dir,
     )
     lowered_text = target_text.lower()
     self.target_text = lowered_text
コード例 #7
0
def _first_metric_value(xbrl_object, element_names, context_refs):
    """Return the first fact found for ``element_names`` as an int, else NaN.

    Elements are tried in order and, for each element, the contexts in
    ``context_refs`` are tried in order. The first lookup that yields a data
    object decides the result: its value converted with ``int``, or NaN when
    that value is ``None``. NaN is also returned when no lookup succeeds.
    """
    for element_name in element_names:
        for context in context_refs:
            data = xbrl_object.get_data_by_context_ref(element_name, context)
            if data is None:
                continue
            value = data.get_value()
            # A missing value is recorded as NaN; previously it was replaced
            # by "-" and passed to int(), which always raised ValueError.
            return float("nan") if value is None else int(value)
    return float("nan")


def parse(dir, ticker):
    """Build a table of financial figures from the XBRL files of one ticker.

    Every file in ``dir/ticker`` is parsed with ``EdinetXbrlParser``. Files
    whose document title contains "四半期報告書" or "有価証券報告書" contribute
    one row holding the file name (without directory and ``.xbrl`` suffix)
    in ``txt`` plus one column per metric.

    Args:
        dir: Base directory containing one sub-directory per ticker.
        ticker: Name of the sub-directory to scan.

    Returns:
        A ``pandas.DataFrame`` with the columns ``txt``, ``Sales``,
        ``OperatingIncome``, ``OrdinaryIncome``, ``CurrentAssets``,
        ``FixedAssets``, ``CurrentLiabilities``, ``FixedLiabilities``,
        ``SalesCF``, ``InvestmentCF`` and ``FinanceCF``, sorted by ``txt``.
        Metrics that cannot be found are NaN.

    Raises:
        ValueError: If a located value cannot be converted with ``int``.
    """
    import os  # local import: only needed to derive the "txt" column

    print("START PARSING")
    folder_path = dir + "/" + ticker
    parser = EdinetXbrlParser()

    # The cover-page document title decides whether a file is processed.
    title_key = "jpcrp_cor:DocumentTitleCoverPage"
    title_context = "FilingDateInstant"

    # Output column -> candidate XBRL elements, tried in the listed order.
    metric_elements = {
        "Sales": ["jppfs_cor:NetSales"],
        "OperatingIncome": ["jppfs_cor:OperatingIncome"],
        "OrdinaryIncome": ["jppfs_cor:OrdinaryIncome"],
        "CurrentAssets": ["jppfs_cor:CurrentAssets"],
        "FixedAssets":
        ["jppfs_cor:IntangibleAssets", "jppfs_cor:NoncurrentAssets"],
        "CurrentLiabilities": ["jppfs_cor:CurrentLiabilities"],
        "FixedLiabilities":
        ["jppfs_cor:LongTermLoansPayable", "jppfs_cor:NoncurrentLiabilities"],
        "SalesCF": ["jppfs_cor:NetCashProvidedByUsedInOperatingActivities"],
        "InvestmentCF":
        ["jppfs_cor:NetCashProvidedByUsedInInvestmentActivities"],
        "FinanceCF": ["jppfs_cor:NetCashProvidedByUsedInFinancingActivities"],
    }

    # Context references tried, in order, for every candidate element.
    context_refs = [
        "CurrentYTDDuration", "CurrentQuarterInstant", "CurrentYearInstant",
        "CurrentYearInstant_NonConsolidatedMember", "CurrentYearDuration",
        "CurrentQuarterInstant_NonConsolidatedMember",
        "CurrentYTDDuration_NonConsolidatedMember",
        "CurrentYearDuration_NonConsolidatedMember"
    ]

    columns = ["txt"] + list(metric_elements)
    suffix = ".xbrl"

    # Rows are collected in a plain list and turned into a frame once;
    # DataFrame.append and pd.np no longer exist in pandas 2.x.
    rows = []
    # Files are visited in reverse glob order, as before.
    for filename in reversed(glob.glob(folder_path + "/*")):
        edinet_xbrl_object = parser.parse_file(filename)

        title_data = edinet_xbrl_object.get_data_by_context_ref(
            title_key, title_context)
        # Treat a missing title (or a title without a value) as empty text.
        doc_name = ""
        if title_data is not None:
            doc_name = title_data.get_value() or ""

        if "四半期報告書" not in doc_name and "有価証券報告書" not in doc_name:
            continue

        print("--------")
        print(filename[1:])

        # Use the bare file name without its ".xbrl" suffix. str.strip()
        # removes a *set of characters* from both ends, so it could also eat
        # leading or trailing characters of the name itself.
        name = os.path.basename(filename)
        if name.endswith(suffix):
            name = name[:-len(suffix)]

        row = [name]
        for element_names in metric_elements.values():
            row.append(
                _first_metric_value(edinet_xbrl_object, element_names,
                                    context_refs))
        rows.append(row)

    df = pd.DataFrame(rows, columns=columns).sort_values(by="txt")
    print("FINISH PARSING")
    return df
コード例 #8
0
def get_value_dict(filename: str):
    """Parse ``tests/adapter/<filename>.xbrl`` and return its value dict.

    The file is parsed with ``EdinetXbrlParser``, wrapped in an
    ``EdinetObjWrapper``, and the wrapper's ``get_value_dict()`` result is
    returned.
    """
    xbrl_object = EdinetXbrlParser().parse_file(
        'tests/adapter/%s.xbrl' % filename)
    wrapper = EdinetObjWrapper(xbrl_object)
    return wrapper.get_value_dict()
コード例 #9
0
 def parse(xbrl_file_path):
     """Parse the file at ``xbrl_file_path`` with a new ``EdinetXbrlParser``.

     Returns whatever ``EdinetXbrlParser.parse_file`` returns for that path.
     """
     return EdinetXbrlParser().parse_file(xbrl_file_path)