def extract_related_reports(self):
    """Extract the list of reports related to this report.

    The candidates are the ``<option>`` entries of the ``<select id="family">``
    element of the report page. The page is fetched first via
    ``self._get_report()`` when ``self.html`` is still ``None``.

    Side effects: the result is stored in ``self._related_reports``, and
    ``self.info['rpt_nm']`` is filled in from the option that refers to this
    report when it has not been set yet.

    Returns
    -------
    list of RelatedReport
        Related reports sorted by ``rcp_no`` in descending order. The list is
        empty when the page has no ``family`` selector.
    """
    if self.html is None:
        self._get_report()

    family = self.html.find('select', id='family')
    if family is None:
        # find() yields None when the selector is absent; treat that as
        # "no related reports" instead of failing on None.find_all().
        self._related_reports = []
        return self._related_reports

    results = []
    for option in family.find_all('option'):
        value = option.attrs.get('value')
        # Skip entries that carry no usable target (missing or 'null' value).
        if value is None or compare_str(value, 'null'):
            continue

        rpt_nm = re.sub(r'\s+', ' ', option.text).strip()
        # The receipt number is whatever follows the first '=' of the value.
        rcp_no = value.split('=')[1]

        if compare_str(self.rcp_no, rcp_no):
            # This option describes the report itself: use it only to learn
            # the report name, never list it as "related".
            if self.info.get('rpt_nm') is None:
                self.info['rpt_nm'] = rpt_nm
            continue

        results.append(RelatedReport(rcp_no=rcp_no, rpt_nm=rpt_nm, parent=self))

    self._related_reports = sorted(results, key=lambda x: x.rcp_no, reverse=True)
    return self._related_reports
def extract_attached_reports(self):
    """Extract and return the list of reports attached to this report.

    The candidates are the ``<option>`` entries found inside the
    ``<p class="f_none">`` element of the report page. The page is fetched
    first via ``self._get_report()`` when ``self.html`` is still ``None``.
    The result is also stored in ``self._attached_reports``.

    Returns
    -------
    list of AttachedReport
        Attached reports sorted by ``rcp_no`` in descending order. The list is
        empty when the page has no ``f_none`` paragraph.
    """
    if self.html is None:
        self._get_report()

    attached = self.html.find('p', class_='f_none')
    if attached is None:
        # find() yields None when the paragraph is absent; treat that as
        # "nothing attached" instead of failing on None.find_all().
        self._attached_reports = []
        return self._attached_reports

    attached_reports = []
    for option in attached.find_all('option'):
        docs_url = option.attrs.get('value')
        # Skip entries that carry no usable target (missing or 'null' value).
        if docs_url is None or compare_str(docs_url, 'null'):
            continue

        # The option value is a query string holding rcpNo and dcmNo.
        parsed = parse_qs(docs_url)
        info = {
            'rcp_no': parsed.get('rcpNo')[0],
            'dcm_no': parsed.get('dcmNo')[0],
            'rpt_nm': re.sub(r'\s+', ' ', option.text).strip(),
            'parent': self,
        }
        attached_reports.append(AttachedReport(**info))

    self._attached_reports = sorted(attached_reports, key=lambda x: x.rcp_no, reverse=True)
    return self._attached_reports
def run_test(self):
    """Check every case of ``self.test_set`` against the fetched statements.

    The statements are requested once from ``self.crp`` using this object's
    ``start_dt``, ``separate`` and ``report_tp``. Each case is a mapping with
    the keys ``'fs_tp'``, ``'date'``, ``'column'``, ``'item'`` and
    ``'expected'``. For a case, the table ``fs[fs_tp]`` is scanned row by row;
    the value in the date column of the last row whose label (spaces removed)
    matches ``item`` is compared with ``expected``, and ``pytest.fail`` is
    called on a mismatch.
    """
    fs = self.crp.get_financial_statement(
        start_dt=self.start_dt,
        separate=self.separate,
        report_tp=self.report_tp,
    )

    for case in self.test_set:
        tp = case['fs_tp']
        date = case['date']
        column = case['column']
        item = case['item']
        expected = case['expected']

        df = fs[tp]
        # First matching column wins for both the date and the label lookup.
        date_column = find_all_columns(df=df, query=date)[0]
        label_column = find_all_columns(df=df, query=column)[0]

        actual = None
        for row in range(len(df)):
            label_text = df[label_column].iloc[row].replace(' ', '')
            if compare_str(label_text, item):
                # No break: a later matching row overrides an earlier one.
                actual = df[date_column].iloc[row]

        if actual != expected:
            message = ''.join([
                "Test failed: crp_cd='{}', fs_tp='{}', ".format(self.crp.crp_cd, tp),
                "start_dt='{}', report_tp='{}', ".format(self.start_dt, fs.info['report_tp']),
                "date='{}', column='{}',".format(date, column),
                "item='{}', actual='{}', expected='{}'".format(item, actual, expected),
            ])
            pytest.fail(message)
def get_value_from_dataset(classification, dataset, concept_id):
    """Extract the value of one concept for every classification of a dataset.

    Parameters
    ----------
    classification: dict or list of dict
        Classification(s) to look up. Each entry must provide a ``'cls_id'``
        key; a single dict is treated as a one-element list.
    dataset: dict
        Maps a ``cls_id`` to an iterable of data items, each exposing
        ``concept.id`` and ``value``.
    concept_id: str
        Identifier of the concept whose value is wanted.

    Returns
    -------
    list
        One entry per distinct English title (as returned by
        ``get_title(entry, 'en')``), in first-seen order. An entry is a float
        when the raw value converts to one, the raw value itself when it does
        not, and NaN when no data item matches ``concept_id``. When a title
        occurs again, a non-NaN value replaces the stored one.
    """
    def str_to_float(val):
        """Convert ``val`` to float, returning it unchanged if it is not numeric text."""
        try:
            return float(val)
        except ValueError:
            return val

    def is_missing(val):
        """Tell whether ``val`` is the NaN placeholder."""
        # str_to_float may hand back a non-numeric string, and math.isnan()
        # raises TypeError on a str, so only floats are tested for NaN.
        return isinstance(val, float) and math.isnan(val)

    if isinstance(classification, dict):
        classification = [classification]

    results = []
    added_title = []
    for entry in classification:
        value = float('nan')
        # Take the first data item whose concept matches.
        for data in dataset[entry['cls_id']]:
            if compare_str(data.concept.id, concept_id):
                value = str_to_float(data.value)
                break

        title = get_title(entry, 'en')
        if title in added_title:
            # Same title seen before: keep the old value unless this one is real.
            if not is_missing(value):
                results[added_title.index(title)] = value
        else:
            results.append(value)
            added_title.append(title)
    return results
def get_table_by_code(self, code: str) -> Union[Table, None]:
    """Return the table whose code matches the given code.

    Parameters
    ----------
    code: str
        Table code to look for.

    Returns
    -------
    Table or None
        The first entry of ``self.tables`` for which
        ``compare_str(table.code, code)`` is true, or None when none matches.
    """
    matching = (candidate for candidate in self.tables
                if compare_str(candidate.code, code))
    return next(matching, None)
def search_financial_statement(crp_cd: str, start_dt: str, end_dt: str = None,
                               fs_tp: Tuple[str] = ('fs', 'is', 'ci', 'cf'),
                               separate: bool = False, report_tp: str = 'annual',
                               lang: str = 'ko', separator: bool = True) -> FinancialStatement:
    """Search financial statements.

    Parameters
    ----------
    crp_cd: str
        Stock code.
    start_dt: str
        Search start date (YYYYMMDD).
    end_dt: str, optional
        Search end date (YYYYMMDD).
    fs_tp: tuple of str, optional
        'fs' statement of financial position, 'is' income statement,
        'ci' statement of comprehensive income, 'cf' statement of cash flows.
    separate: bool, optional
        Whether to use separate (non-consolidated) financial statements.
    report_tp: str, optional
        'annual' yearly, 'half' semiannual, 'quarter' quarterly.
    lang: str, optional
        'ko' Korean, 'en' English.
    separator: bool, optional
        Whether to show the thousands separator.

    Returns
    -------
    FinancialStatement
        Result of the financial statement search.

    Raises
    ------
    RuntimeError
        When the annual report search returns nothing.
    NotFoundConsolidated
        When consolidated statements are requested but cannot be found.
    """
    # Use the progress bar variant that suits the current environment.
    if is_notebook():
        from tqdm import tqdm_notebook as tqdm
    else:
        from tqdm import tqdm

    def find_reports(bsn_tp):
        """Run the report search for one report type (final reports only)."""
        return search_report(crp_cd=crp_cd, start_dt=start_dt, end_dt=end_dt,
                             bsn_tp=bsn_tp, page_count=100, fin_rpt=True)

    # Annual reports (final versions).
    reports = find_reports('A001')
    if len(reports) == 0:
        # TODO: search the financial statements through audit reports instead.
        raise RuntimeError('Could not find an annual report')

    # Seed the result from the first report that yields statements.
    statements = None
    next_index = None
    for idx, _ in enumerate(reports):
        latest_report = reports[idx]
        latest_xbrl = latest_report.xbrl

        if not latest_xbrl:
            # No XBRL file: fall back to the HTML of the report.
            statements = analyze_html(latest_report, fs_tp=fs_tp, separate=separate, lang=lang)
            # A report may only record amendments without statements; in that
            # case move on to the next report.
            if statements is not None:
                next_index = idx + 1
                break
            continue

        # An XBRL file exists: initialise the statements from it.
        if separate is False and not latest_xbrl.exist_consolidated():
            raise NotFoundConsolidated('Could not find consolidated financial statements')
        analyzed_results = analyze_xbrl(latest_report, fs_tp=fs_tp, separate=separate, lang=lang,
                                        show_abstract=False, show_class=True, show_depth=10,
                                        show_concept=True, separator=separator)
        statements = copy.deepcopy(analyzed_results)
        break

    if separate is False and all(statements[tp] is None for tp in statements):
        raise NotFoundConsolidated('Could not find consolidated financial statements')

    label_df = None

    def merge_reports(batch, description):
        """Merge every report of ``batch`` into the running statements and labels."""
        nonlocal statements, label_df
        for report in tqdm(batch, desc=description, unit='report'):
            statements, label_df = merge_fs(statements, label_df, report,
                                            fs_tp=fs_tp, separate=separate, lang=lang)

    # next_index stays None after the XBRL path, so the slice is then the full list.
    merge_reports(reports[next_index:], 'Annual reports')

    if compare_str(report_tp, 'half') or compare_str(report_tp, 'quarter'):
        half = find_reports(['A002'])
        merge_reports(half, 'Semiannual reports')

    if compare_str(report_tp, 'quarter'):
        quarter = find_reports(['A003'])
        merge_reports(quarter, 'Quarterly report')

    statements = sorting_columns(drop_empty_columns(statements))
    label_df = sorting_columns(drop_empty_columns(label_df))

    info = {
        'crp_cd': crp_cd,
        'start_dt': start_dt,
        'end_dt': end_dt,
        'separate': separate,
        'report_tp': report_tp,
        'lang': lang,
        'separator': separator
    }
    return FinancialStatement(statements, label_df, info)
def convert_thead_into_columns(fs_tp: str, fs_table: dict, separate: bool = False,
                               lang: str = 'ko'):
    """Derive DataFrame column labels from the header rows of a statement table.

    Parameters
    ----------
    fs_tp: str
        Statement type key: 'fs', 'is', 'ci' or 'cf'.
    fs_table: dict
        Must provide ``'table'`` (the parsed table element) and ``'header'``
        (handed to ``extract_date_from_header`` and
        ``extract_unit_from_header``).
    separate: bool, optional
        Selects the separate (True) or consolidated (False) label.
    lang: str, optional
        'ko' or 'en'; language of that label.

    Returns
    -------
    list of list
        One two-element list per table column. The first element is the
        statement title (for the '과목'/'주석' columns) or a date string built
        from ``%Y%m%d`` parts joined with '-'. The second element is
        'label_ko' / 'comment' for the title columns, otherwise a tuple that
        starts with the separate/consolidated label.
    """
    def column_ko_to_en(ko):
        """Map a Korean title header to its English key; other text is returned as is."""
        ko_to_en = {
            '과목': 'label_ko',
            '주석': 'comment'
        }
        en = ko_to_en.get(ko)
        return en if en else ko

    thead = fs_table['table'].thead
    if thead is None:
        # No <thead>: promote the first body row to a header row and turn its
        # <td> cells into <th> cells.
        tt = fs_table['table'].tbody.tr.extract()
        thead = BeautifulSoup('<thead></thead>', 'html.parser')
        thead.thead.append(tt)
        for td in thead.tr.find_all('td'):
            td.name = 'th'

    th_colspan_list = [int(th.attrs.get('colspan', 1)) for th in thead.tr.find_all('th')]
    # Consumed front to back, one entry per date column of the first header row.
    date_info = extract_date_from_header(fs_table['header'])

    # Regular expression for the title columns
    regex = str_to_regex('과목 OR 주석')

    fs_string = {
        'fs': 'Statement of financial position',
        'is': 'Income statement',
        'ci': 'Statement of comprehensive income',
        'cf': 'Statement of cash flows'
    }

    # Append the currency unit to every statement title when one is found.
    str_unit = extract_unit_from_header(fs_table['header'])
    str_unit = str_to_regex('원 OR USD').search(str_unit)
    if str_unit:
        str_unit = str_unit.group(0)
        str_unit = 'KRW' if compare_str('원', str_unit) else 'USD'
        for key in fs_string:
            fs_string[key] = fs_string[key] + '(Unit: {})'.format(str_unit)

    label = {
        'ko': {
            True: '별도재무제표',
            False: '연결재무제표'
        },
        'en': {
            True: 'Separate',
            False: 'Consolidated'
        }
    }

    header_rows = thead.find_all('tr')
    # Maximum number of columns
    col_length = sum(th_colspan_list)
    # Maximum number of rows; a single header row still gets a second level
    row_length = len(header_rows)
    if row_length == 1:
        row_length = 2

    # Matrix used to resolve rowspan / colspan
    columns_matrix = [[None for _ in range(col_length)] for _ in range(row_length)]
    for idx, tr in enumerate(header_rows):
        # Start at the first cell not already filled by a rowspan from above.
        start_idx = 0
        for ele_idx, element in enumerate(columns_matrix[idx]):
            if element is None:
                start_idx = ele_idx
                break

        for jdx, th in enumerate(tr.find_all('th')):
            row_span = int(th.attrs.get('rowspan', 1))
            col_span = int(th.attrs.get('colspan', 1))
            text = re.sub(r'\s+', '', th.text)
            date_list = [datetime(1900, 1, 1)]
            if idx == 0:
                if jdx == 0:
                    # The first cell of the first row is always the title column.
                    text = '과목'
                elif regex.search(text) is None:
                    # Date column: take the next date from the table header.
                    if len(date_info) > 0:
                        date_list = date_info.pop(0)
                    else:
                        import warnings
                        date = '-'.join([date.strftime('%Y%m%d') for date in date_list])
                        warnings_text = "Date data length does not match table header."\
                                        + "So last date was set using last data({}). ".format(date)
                        warnings.warn(warnings_text, RuntimeWarning)
                    text = '-'.join([date.strftime('%Y%m%d') for date in date_list])

            is_title = regex.search(text)
            if is_title:
                # Title columns always occupy both header levels.
                row_span = 2

            for mdx in range(row_span):
                for ndx in range(col_span):
                    new_text = text
                    if mdx == 0 and is_title:
                        new_text = fs_string[fs_tp]
                    columns_matrix[idx + mdx][start_idx + ndx] = new_text
            # Advance by the cell width; this does not depend on the inner loop
            # variable, which is stale or undefined when a span is 0.
            start_idx += col_span

    regex_3month = re.compile(r'3개월')
    regex_total = str_to_regex(r'누적 OR 금액')

    columns = []
    for jdx in range(len(columns_matrix[0])):
        column = []
        sec_item = []
        for idx, matrix_row in enumerate(columns_matrix):
            item = matrix_row[jdx]
            if idx == 0:
                column.append(item)
                continue
            if idx == 1 and (item is None or regex.search(item) is None):
                sec_item.append(label[lang][separate])

            if item is None:
                continue
            if compare_str(column[0], item):
                continue

            if regex_3month.search(item):
                # Narrow the period to the three months that end at its end date.
                period = [datetime.strptime(date_str, '%Y%m%d')
                          for date_str in column[0].split('-')]
                end_dt = period[1]
                # Three months ending in month M start on the first day of
                # month M-2 (e.g. 20190930 -> 20190701); going back three
                # months would cover four.
                start_dt = (end_dt - relativedelta(months=2)).replace(day=1)
                column[0] = '-'.join([date.strftime('%Y%m%d') for date in [start_dt, end_dt]])
            elif regex_total.search(item):
                # Cumulative / amount sub-headers add nothing to the label.
                pass
            else:
                sec_item.append(column_ko_to_en(item))

        if sec_item[0] in ['label_ko', 'comment']:
            column.append(sec_item[0])
        else:
            column.append(tuple(sec_item))
        columns.append(column)
    return columns