コード例 #1
0
def convert_tbody_to_dataframe(columns: list, fs_table: dict):
    """Convert the ``<tbody>`` of an HTML table into a pandas DataFrame.

    Args:
        columns: One entry per physical table column. Each entry is turned
            into a tuple and used as a column key; physical columns that
            share the same key are merged into one DataFrame column.
        fs_table: Mapping that provides ``'table'`` (an object exposing
            ``.tbody`` with ``find_all`` -- presumably a BeautifulSoup tag,
            confirm against the caller) and ``'header'`` (passed to
            ``extract_unit_from_header`` to obtain the table-wide unit).

    Returns:
        A DataFrame with one row per ``<tr>`` and a ``MultiIndex`` of the
        deduplicated column keys. Cells that are missing because the row
        has fewer ``<td>`` elements than expected are ``None``.
    """
    # Group the positions of the physical columns by their header key so
    # that duplicated headers collapse into a single DataFrame column.
    column_matrix = OrderedDict()
    for idx, column in enumerate(columns):
        column_matrix.setdefault(tuple(column), []).append(idx)
    deduplicated = list(column_matrix)

    df_columns = pd.MultiIndex.from_tuples(deduplicated)
    df = pd.DataFrame(columns=df_columns)

    tbody = fs_table['table'].tbody
    regex = str_to_regex('label_ko OR comment')
    str_unit = extract_unit_from_header(fs_table['header'])
    unit = str_unit_to_number_unit(str_unit)
    # NOTE(review): the capture group matches exactly one character, so only
    # single-character row units are recognised -- confirm this is intended.
    unit_regex = re.compile(r'\(단위\s*?:\s*([a-zA-Zㄱ-힣])\)')

    # The column order never changes, so compute it once instead of per row.
    column_order = df_columns.tolist()

    for idx, tr in enumerate(tbody.find_all('tr')):
        # Strip all whitespace and '=' runs from every cell text.
        extracted = [
            re.sub(r'\s+|=+', '', td.text) for td in tr.find_all('td')
        ]
        # Start at 0 so that numeric columns can be accumulated with `+=`.
        row = {key: 0 for key in deduplicated}
        for key, index_list in column_matrix.items():
            for index in index_list:
                if len(extracted) <= index:
                    # The row is shorter than the header: mark as missing.
                    # NOTE(review): this also discards anything accumulated
                    # from earlier positions of the same key -- confirm.
                    row[key] = None
                elif (isinstance(key[1], str)
                      or regex.search(' '.join(key[1]))):
                    # Text columns (plain string second level, or a second
                    # level matching the label/comment pattern) keep the
                    # cleaned cell text as is.
                    row[key] = extracted[index]
                else:
                    # Everything else is parsed as a number and summed over
                    # all physical columns that share this key.
                    row[key] += str_to_float(extracted[index], unit)

            if isinstance(row[key], float):
                if abs(row[key]) < 1e-10:
                    # Effectively-zero sums are shown as an empty cell.
                    row[key] = ''
                else:
                    row[key] = row[key] * unit

        ordered_list = [row.get(column, None) for column in column_order]

        # A row may declare its own unit inside the first cell. Only search
        # when that cell is text: it is None for short/empty rows, and
        # regex search on a non-string raises TypeError.
        first_cell = ordered_list[0]
        row_unit = None
        if isinstance(first_cell, str):
            row_unit = unit_regex.search(first_cell)
        if row_unit:
            row_unit = str_unit_to_number_unit(row_unit.group(1))
            for jdx, value in enumerate(ordered_list):
                # Text and missing cells cannot be rescaled.
                if value is None or isinstance(value, str):
                    continue
                # Undo the table-wide unit and apply the row-specific one.
                ordered_list[jdx] = value / unit * row_unit

        df.loc[idx] = ordered_list
    return df
コード例 #2
0
def get_value_from_dataset(
    classification,
    dataset,
    concept_id,
    label_ko=None,
    lang='ko',
):
    """Extract the value of one concept for each classification entry.

    Args:
        classification: A single classification dict or a list of them.
            Each one must provide ``'cls_id'``, which is used as the key
            into ``dataset``.
        dataset: Mapping from ``cls_id`` to an iterable of data items that
            expose ``concept.id``, ``value`` and ``decimals``.
        concept_id: Concept identifier to look for; compared with
            ``str_compare``.
        label_ko: Optional Korean label. When it contains a
            ``(단위:...)`` marker from which a currency unit can be
            derived, matching numeric values are rescaled (see below).
        lang: Language code forwarded to ``get_title``.

    Returns:
        A list with one entry per distinct title, in first-seen order. An
        entry is a float when the value could be parsed, the raw value
        when it could not, and NaN when no matching item (or no value) was
        found. A later classification with an already-seen title replaces
        the stored entry only when it has a non-NaN value.
    """
    def to_float_or_raw(val):
        # Return `val` as a float, or unchanged when it is not numeric.
        # TypeError covers None, which float() rejects.
        try:
            return float(val)
        except (TypeError, ValueError):
            return val

    if isinstance(classification, dict):
        classification = [classification]

    # Workaround for an error that occurs with earnings-per-share values
    # inside XBRL: derive a currency unit from the label, if present.
    currency_unit = None
    if label_ko is not None:
        regex = re.compile(r'\(단위:(.*)\)')
        unit = regex.search(label_ko)
        if unit is not None:
            currency = get_currency_str(unit.group(0))
            if currency is not None:
                currency_unit = str_unit_to_number_unit(currency)

    results = []
    added_title = []
    for cls in classification:
        value = float('nan')
        for data in dataset[cls['cls_id']]:
            if not str_compare(data.concept.id, concept_id):
                continue

            value = to_float_or_raw(data.value)
            if value is None:
                # An item without a value counts as missing.
                value = float('nan')

            # Earnings-per-share workaround. Only numeric values are
            # rescaled: multiplying a string would raise, or silently
            # repeat it when both factors happen to be ints.
            if currency_unit is not None and isinstance(value, float):
                decimals = to_float_or_raw(data.decimals)
                # Treat absent, unparseable, infinite or NaN decimals as 0.
                if (not isinstance(decimals, float)
                        or not math.isfinite(decimals)):
                    decimals = 0
                value = value * pow(10, decimals)
                value = value * currency_unit
            # Only the first matching item is used.
            break

        # math.isnan() accepts numbers only, so check the type first;
        # non-numeric raw values count as present.
        has_value = not (isinstance(value, float) and math.isnan(value))

        title = get_title(cls, lang)
        if title in added_title:
            # Same title seen before: overwrite only with a real value.
            if has_value:
                results[added_title.index(title)] = value
        else:
            results.append(value)
            added_title.append(title)
    return results