def convert_tbody_to_dataframe(columns: list, fs_table: dict):
    """Convert the ``<tbody>`` of an HTML table into a DataFrame.

    Args:
        columns: One column descriptor per cell position. Each descriptor is
            turned into a tuple and used as a MultiIndex column key; cell
            positions that share the same key are merged into one column.
        fs_table: Mapping with a ``'table'`` entry whose ``.tbody`` exposes
            ``find_all`` (presumably a BeautifulSoup tag -- confirm against
            the caller) and a ``'header'`` entry from which the table-wide
            unit is extracted.

    Returns:
        A DataFrame with the de-duplicated keys as MultiIndex columns and one
        row per ``<tr>``. Cells missing from a short row are ``None``.
    """
    # Map every distinct column key to all cell positions that feed it,
    # keeping first-seen order.
    column_matrix = OrderedDict()
    for idx, column in enumerate(columns):
        column_matrix.setdefault(tuple(column), []).append(idx)
    deduplicated = list(column_matrix)

    df_columns = pd.MultiIndex.from_tuples(deduplicated)
    df = pd.DataFrame(columns=df_columns)
    # Loop-invariant: the column order used to lay out every row.
    ordered_columns = df_columns.tolist()

    tbody = fs_table['table'].tbody
    regex = str_to_regex('label_ko OR comment')
    str_unit = extract_unit_from_header(fs_table['header'])
    unit = str_unit_to_number_unit(str_unit)
    # NOTE(review): the capture group accepts exactly one character, so only
    # single-character units can match -- confirm this is intended.
    unit_regex = re.compile(r'\(단위\s*?:\s*([a-zA-Zㄱ-힣])\)')
    # Strips whitespace runs and '=' runs from the cell text.
    cell_cleaner = re.compile(r'\s+|=+')

    for idx, tr in enumerate(tbody.find_all('tr')):
        extracted = [cell_cleaner.sub('', td.text) for td in tr.find_all('td')]
        row = {key: 0 for key in deduplicated}
        for key, index_list in column_matrix.items():
            for index in index_list:
                if len(extracted) <= index:
                    # The row has fewer cells than the header describes.
                    row[key] = None
                elif isinstance(key[1], str) or regex.search(' '.join(key[1])):
                    # Text column: keep the cleaned cell text as-is.
                    row[key] = extracted[index]
                else:
                    # Numeric column: positions sharing a key are summed.
                    row[key] += str_to_float(extracted[index], unit)
            # Post-process only after all positions of the key were summed.
            if isinstance(row[key], float):
                if abs(row[key]) < 1e-10:
                    # Effectively zero: stored as an empty string.
                    row[key] = ''
                else:
                    row[key] = row[key] * unit

        ordered_list = [row.get(column, None) for column in ordered_columns]

        # The first cell may carry a row-specific unit. It can be None for a
        # short row, so only search when it is actually text.
        first_cell = ordered_list[0] if ordered_list else None
        row_unit = (
            unit_regex.search(first_cell)
            if isinstance(first_cell, str) else None
        )
        if row_unit:
            row_unit = str_unit_to_number_unit(row_unit.group(1))
            for jdx, value in enumerate(ordered_list):
                # Text cells and missing cells cannot be rescaled.
                if value is None or isinstance(value, str):
                    continue
                # Undo the table-wide unit and apply the row's own unit.
                ordered_list[jdx] = value / unit * row_unit

        df.loc[idx] = ordered_list
    return df
def get_value_from_dataset(
    classification,
    dataset,
    concept_id,
    label_ko=None,
    lang='ko',
):
    """Extract the values of one concept from ``dataset``.

    Args:
        classification: A single classification dict or an iterable of them;
            each one must provide a ``'cls_id'`` key.
        dataset: Mapping from ``cls_id`` to an iterable of data items that
            expose ``concept.id``, ``value`` and ``decimals``.
        concept_id: Concept identifier, matched against ``data.concept.id``
            with ``str_compare``.
        label_ko: Optional label. When it contains ``(단위:...)`` and a
            currency is recognised in it, numeric values are rescaled by
            ``10 ** decimals`` and by the currency unit.
        lang: Language passed to ``get_title``.

    Returns:
        A list with one entry per distinct title, in first-seen order. An
        entry is NaN when no matching data item was found. A later
        classification with an already-seen title replaces the earlier entry
        unless its own value is NaN. Values that cannot be converted to
        ``float`` are returned unchanged.
    """
    def to_number(val):
        """Return ``float(val)``, or ``val`` unchanged if not convertible."""
        try:
            return float(val)
        except (TypeError, ValueError):
            return val

    def is_missing(val):
        """Tell whether ``val`` is the NaN placeholder for 'not found'."""
        # Non-float values (e.g. text) would make math.isnan raise.
        return isinstance(val, float) and math.isnan(val)

    if isinstance(classification, dict):
        classification = [classification]

    # Workaround for an error that occurs with earnings-per-share values
    # inside XBRL: derive a currency unit from the label, if it has one.
    currency_unit = None
    if label_ko is not None:
        unit_match = re.search(r'\(단위:(.*)\)', label_ko)
        if unit_match is not None:
            currency = get_currency_str(unit_match.group(0))
            if currency is not None:
                currency_unit = str_unit_to_number_unit(currency)

    results = []
    added_title = []
    for cls in classification:
        value = float('nan')
        for data in dataset[cls['cls_id']]:
            if not str_compare(data.concept.id, concept_id):
                continue
            value = to_number(data.value)
            # Same earnings-per-share workaround: rescale numeric values
            # only, since text values cannot be multiplied meaningfully.
            if currency_unit is not None and isinstance(value, float):
                decimals = to_number(data.decimals)
                # Treat absent, non-numeric, infinite or NaN decimals as 0.
                if not isinstance(decimals, float) or not math.isfinite(decimals):
                    decimals = 0
                value = value * pow(10, decimals)
                value = value * currency_unit
            # Only the first matching data item is used.
            break

        title = get_title(cls, lang)
        try:
            index = added_title.index(title)
        except ValueError:
            results.append(value)
            added_title.append(title)
        else:
            # Duplicate title: keep the earlier value if this one is missing.
            if not is_missing(value):
                results[index] = value
    return results