def get_value_from_dataset(
    classification,
    dataset,
    concept_id,
    label_ko=None,
    lang='ko',
):
    """Collect the values of one concept across one or more classifications.

    Args:
        classification: A classification dict, or a list of them. Each one
            must provide a ``'cls_id'`` key and is passed to ``get_title``.
        dataset: Mapping indexed by ``cls_id`` whose entries are iterables of
            data items exposing ``concept.id``, ``value`` and ``decimals``.
        concept_id: Concept identifier, matched against each item's
            ``concept.id`` with ``str_compare``.
        label_ko: Optional Korean label. When it contains a ``(단위:...)``
            annotation that ``get_currency_str`` resolves to a currency,
            matched numeric values are rescaled (see the workaround below).
        lang: Language forwarded to ``get_title``.

    Returns:
        list: One entry per distinct title, in first-seen order. An entry is
        ``float('nan')`` when no item matched the concept, a ``float`` when
        the matched value is numeric, and the raw value otherwise. When
        several classifications share a title, a later value replaces the
        stored one unless the later value is missing (``None`` or NaN).

    Raises:
        KeyError: If a classification's ``cls_id`` is not in ``dataset``.
    """
    def str_to_float(val):
        """Return ``float(val)``, or ``val`` unchanged if it is not numeric."""
        try:
            return float(val)
        except (TypeError, ValueError):
            # TypeError covers None and other non-string, non-number values.
            return val

    def is_missing(val):
        """Tell whether ``val`` carries no data (``None`` or a float NaN)."""
        # math.isnan() raises TypeError on strings, so check the type first.
        return val is None or (isinstance(val, float) and math.isnan(val))

    if isinstance(classification, dict):
        classification = [classification]

    # Workaround for an error that occurs with earnings-per-share values
    # inside XBRL: derive a numeric factor from the label's unit annotation.
    currency_unit = None
    if label_ko is not None:
        unit = re.search(r'\(단위:(.*)\)', label_ko)
        if unit is not None:
            currency = get_currency_str(unit.group(0))
            if currency is not None:
                currency_unit = str_unit_to_number_unit(currency)

    results = []
    added_title = []
    for cls in classification:
        value = float('nan')
        for data in dataset[cls['cls_id']]:
            if not str_compare(data.concept.id, concept_id):
                continue
            value = str_to_float(data.value)
            # Same earnings-per-share workaround. Only real numbers are
            # rescaled; a passed-through string must not be multiplied.
            if currency_unit is not None and isinstance(value, float):
                decimals = str_to_float(data.decimals)
                # Treat absent, non-numeric or non-finite decimals as 0.
                if (not isinstance(decimals, float)
                        or math.isinf(decimals)
                        or math.isnan(decimals)):
                    decimals = 0
                value = value * pow(10, decimals)
                value = value * currency_unit
            # Only the first matching item of a classification is used.
            break

        title = get_title(cls, lang)
        if title in added_title:
            # A repeated title only overrides the stored value with real data.
            if not is_missing(value):
                results[added_title.index(title)] = value
        else:
            results.append(value)
            added_title.append(title)
    return results
def convert_thead_into_columns(fs_tp: str, fs_table: dict, separate: bool = False,
                               lang: str = 'ko'):
    """Build DataFrame column labels from the header rows of a statement table.

    The header rows are expanded into a matrix in which a cell using
    ``rowspan``/``colspan`` fills every position it covers. Each matrix
    column is then reduced to a two-item column label.

    Args:
        fs_tp: Statement type key: ``'bs'``, ``'is'``, ``'cis'`` or ``'cf'``.
        fs_table: Mapping with a ``'table'`` entry (parsed HTML table exposing
            ``thead`` and ``tbody``) and a ``'header'`` entry that is passed to
            ``extract_date_from_header`` and ``extract_unit_from_header``.
        separate: Selects the "separate" (``True``) or "consolidated"
            (``False``) sub-label.
        lang: Language of that sub-label, ``'ko'`` or ``'en'``.

    Returns:
        list: One ``[first, second]`` list per table column. ``first`` is the
        statement title for 과목/주석 columns, otherwise ``%Y%m%d`` dates
        joined by ``'-'``. ``second`` is ``'label_ko'``, ``'comment'`` or a
        tuple of sub-labels.

    Warns:
        RuntimeWarning: When the table header has more date columns than the
        dates extracted from ``fs_table['header']``.

    Note:
        When the table has no ``<thead>``, the first ``<tbody>`` row is
        extracted from ``fs_table['table']`` and used as the header row.
    """
    def column_ko_to_en(ko):
        """Map a known Korean header to its English name; pass others through."""
        ko_to_en = {
            '과목': 'label_ko',
            '주석': 'comment'
        }
        en = ko_to_en.get(ko)
        return en if en else ko

    def join_dates(dates):
        """Format dates as ``%Y%m%d`` and join them with ``'-'``."""
        return '-'.join([date.strftime('%Y%m%d') for date in dates])

    thead = fs_table['table'].thead
    if thead is None:
        # No <thead>: move the first body row into a new one and turn its
        # <td> cells into <th> so the code below can treat it as a header.
        tt = fs_table['table'].tbody.tr.extract()
        thead = BeautifulSoup('<thead></thead>', 'html.parser')
        thead.thead.append(tt)
        for td in thead.tr.find_all('td'):
            td.name = 'th'
    header_rows = thead.find_all('tr')
    th_colspan_list = [int(th.attrs.get('colspan', 1)) for th in thead.tr.find_all('th')]
    date_info = extract_date_from_header(fs_table['header'])
    # Regular expression matching the title columns (account name / note).
    regex = str_to_regex('과목 OR 주석')

    fs_string = {
        'bs': 'Statement of financial position',
        'is': 'Income statement',
        'cis': 'Statement of comprehensive income',
        'cf': 'Statement of cash flows'
    }

    str_unit = extract_unit_from_header(fs_table['header'])
    str_unit = get_currency_str(str_unit)
    if str_unit:
        fs_string = {
            key: title + '(Unit: {})'.format(str_unit)
            for key, title in fs_string.items()
        }

    label = {
        'ko': {
            True: '별도재무제표',
            False: '연결재무제표'
        },
        'en': {
            True: 'Separate',
            False: 'Consolidated'
        }
    }

    # Number of columns: total colspan of the first header row.
    col_length = sum(th_colspan_list)
    # Number of rows: a single-row header still gets two matrix rows.
    row_length = len(header_rows)
    if row_length == 1:
        row_length = 2
    # Matrix used to resolve row-span / col-span into individual cells.
    columns_matrix = [[None] * col_length for _ in range(row_length)]
    for idx, tr in enumerate(header_rows):
        # Start at the first cell of this row that no earlier row has filled.
        start_idx = next(
            (ele_idx for ele_idx, element in enumerate(columns_matrix[idx])
             if element is None),
            0,
        )

        for jdx, th in enumerate(tr.find_all('th')):
            row_span = int(th.attrs.get('rowspan', 1))
            col_span = int(th.attrs.get('colspan', 1))
            text = re.sub(r'\s+', '', th.text)
            date_list = [datetime(1900, 1, 1)]
            if idx == 0:
                if jdx == 0:
                    # The first header cell is always the account-name column.
                    text = '과목'
                elif regex.search(text) is None:
                    # Any other non-title cell of the first row is a date
                    # column; take its dates from the table header, in order.
                    if date_info:
                        date_list = date_info.pop(0)
                    else:
                        import warnings
                        date = join_dates(date_list)
                        warnings_text = (
                            "Date data length does not match table header."
                            + "So last date was set using last data({}). ".format(date)
                        )
                        warnings.warn(warnings_text, RuntimeWarning)
                    text = join_dates(date_list)

            is_title = regex.search(text)
            if is_title:
                # Title columns always occupy two matrix rows.
                row_span = 2

            for mdx in range(row_span):
                for ndx in range(col_span):
                    new_text = text
                    if mdx == 0 and is_title:
                        # First row of a title column holds the statement title.
                        new_text = fs_string[fs_tp]
                    columns_matrix[idx + mdx][start_idx + ndx] = new_text
            # Advance by the cell width. This must not rely on the inner loop
            # variable, which is unset or stale when a span is 0.
            start_idx += col_span

    regex_3month = re.compile(r'3개월')
    regex_total = str_to_regex(r'누적 OR 금액')

    columns = []
    for column_cells in zip(*columns_matrix):
        column = [column_cells[0]]
        sec_item = []
        for idx, item in enumerate(column_cells[1:], start=1):
            if idx == 1 and (item is None or regex.search(item) is None):
                # Non-title columns get the separate/consolidated label first.
                sec_item.append(label[lang][separate])

            if item is None:
                continue
            if str_compare(column[0], item):
                # Same text as the first row (a row-spanned cell): nothing to add.
                continue
            if regex_3month.search(item):
                # "3 months" sub-column: narrow the period in the first item
                # to the three months ending on its end date.
                period = [datetime.strptime(date_str, '%Y%m%d')
                          for date_str in column[0].split('-')]
                end_dt = period[1]
                # The window starts on the first day of the month two months
                # before the end month (e.g. 20190930 -> 20190701); going back
                # three months would cover four calendar months.
                start_dt = (end_dt - relativedelta(months=2)).replace(day=1)
                column[0] = join_dates([start_dt, end_dt])
            elif regex_total.search(item):
                # Cumulative / amount sub-columns keep the full period as is.
                continue
            else:
                sec_item.append(column_ko_to_en(item))

        if sec_item[0] in ('label_ko', 'comment'):
            column.append(sec_item[0])
        else:
            column.append(tuple(sec_item))
        columns.append(column)
    return columns