Exemple #1
0
def report_find_all(report: Report, query: dict, fs_tp: Tuple[str],
                    separate: bool) -> Tuple[int, Dict[str, Dict]]:
    """
    Search the pages of a Report with ``query`` and extract the financial
    statement tables from the first page that yields any.

    Pages returned by ``report.find_all(**query)`` are examined in order and
    the scan stops as soon as one page produces at least one table.

    Parameters
    ----------
    report: Report
        Report
    query: dict
        Search conditions, forwarded as keyword arguments to
        ``report.find_all``
    fs_tp: tuple of str
        Financial statement types to search for
    separate: bool
        Whether to search for separate financial statements

    Returns
    -------
    tuple of (int, dict or None)
        The number of entries whose ``'table'`` value is not None on the last
        page examined, and the ``search_fs_table`` result for that page
        (None when no page was examined at all).
    """
    nbsp = u'\xa0'
    found = 0
    result = None
    pages_by_key = report.find_all(**query)

    for key in pages_by_key:
        for page in pages_by_key[key]:
            # Normalise non-breaking spaces before parsing the markup.
            markup = page.html.replace(nbsp, ' ')
            soup = BeautifulSoup(markup, 'html.parser')

            candidates = soup.find_all('table', border='1')
            result = search_fs_table(tables=candidates,
                                     fs_tp=fs_tp,
                                     separate=separate)
            found = sum(result[name]['table'] is not None
                        for name in result)
            if found > 0:
                # First page with a hit wins; no need to look further.
                return found, result
    return found, result
Exemple #2
0
 def __init__(self, resp):
     """Populate paging metadata and the report list from a response mapping."""
     # Copy each paging field onto the same-named private attribute, in order.
     for field in ('page_no', 'page_count', 'total_count', 'total_page'):
         setattr(self, '_' + field, resp[field])
     # Every entry of resp['list'] is expanded as keyword arguments to Report.
     self._report_list = [Report(**entry) for entry in resp['list']]
Exemple #3
0
def merge_fs(fs_df: Dict[str, DataFrame],
             label_df: Dict[str, DataFrame],
             report: Report,
             fs_tp: Tuple[str] = ('bs', 'is', 'cis', 'cf'),
             lang: str = 'ko',
             separate: bool = False):
    """
    Merge the financial statements extracted from a Report into existing
    financial statement DataFrames.

    ``fs_df`` is updated in place (missing statements are filled in and new
    date columns are appended); ``label_df`` is updated in place as well when
    one is supplied, otherwise a new dict is created.

    Parameters
    ----------
    fs_df: dict of {str: DataFrame}
        Financial statements
    label_df: dict of {str: DataFrame}
        Labels of the values extracted while searching the statements; may be
        None, in which case it is initialised with one None entry per
        ``fs_tp``
    report: Report
        Report
    fs_tp: tuple of str, optional
        'bs' balance sheet, 'is' income statement, 'cis' comprehensive income
        statement, 'cf' cash flow statement
    lang: str, optional
        'ko' Korean, 'en' English
    separate: bool, optional
        Whether to use separate financial statements

    Returns
    -------
    tuple of dict of {str: DataFrame}
        Financial statements, extracted label lists

    Raises
    ------
    RuntimeError
        If any exception occurs while fetching or analyzing the report. The
        original exception is chained as ``__cause__``.
    """
    try:
        # Compiled once: picks out and orders the date-bearing columns.
        date_regex = re.compile(r'\d{8}')

        # Extract the financial statements from the report's web pages.
        nfs_df = analyze_html(report=report,
                              fs_tp=fs_tp,
                              lang=lang,
                              separate=separate)

        if label_df is None:
            label_df = {tp: None for tp in fs_tp}

        for tp in fs_df:
            if tp not in fs_tp:
                continue

            # Newly extracted statement for this type.
            ndf = nfs_df[tp]
            # Nothing was extracted, so there is nothing to merge.
            if ndf is None:
                continue

            # Statement the new columns are merged into; seed it with a copy
            # of the new statement when none exists yet.
            df = fs_df[tp]
            if df is None:
                fs_df[tp] = ndf.copy(deep=True)
                df = fs_df[tp]

            # Initialise the label frame when it does not exist yet.
            if label_df.get(tp) is None:
                concept_column = find_all_columns(df, r'concept_id')
                ko_column = find_all_columns(df, r'label_ko')
                # A table without a label_ko column is treated as broken and
                # discarded.
                if len(ko_column) == 0:
                    fs_df[tp] = None
                    continue
                ko_column = ko_column[0]
                date_columns = find_all_columns(df, r'\d{8}')

                label_columns = []
                if len(concept_column) == 1:
                    label_columns.append((
                        'default',
                        'concept_id',
                    ))
                label_columns.extend(date_columns)
                nlabel_columns = pd.MultiIndex.from_tuples(label_columns)
                label_df[tp] = pd.DataFrame(columns=nlabel_columns)

                if len(concept_column) == 1:
                    label_df[tp][label_columns[0]] = [
                        extract_account_title(x)
                        for x in list(df[concept_column[0]])
                    ]

                for column in date_columns:
                    label_df[tp][column] = list(df[ko_column])

            df_columns = set(df.columns.tolist())
            ndf_columns = set(ndf.columns.tolist())

            overlap = df_columns.intersection(ndf_columns)

            # Date columns present only in the new statement, newest first.
            diff = [
                x for x in (ndf_columns - overlap)
                if date_regex.search(x[0])
            ]
            diff.sort(key=lambda x: date_regex.findall(x[0])[0],
                      reverse=True)

            # No new date columns: the data is identical, nothing to add.
            if not diff:
                continue

            for column in diff:
                ndata = [None] * len(df)
                nlabels = [''] * len(df)
                if overlap:
                    ndata, nlabels = compare_df_and_ndf_value(
                        column, df, ndf, ndata, nlabels)

                # Module-level list of extra comparison callables; it is only
                # read here, so no ``global`` declaration is required.
                for compare_func in additional_comparison_function:
                    ndata, nlabels = compare_func(column, df, ndf,
                                                  label_df[tp], ndata,
                                                  nlabels)

                label_df[tp][column] = nlabels
                fs_df[tp][column] = ndata
        return fs_df, label_df
    except Exception as err:
        msg = 'An error occurred while fetching or analyzing {}.'.format(
            report.to_dict())
        # Chain explicitly so the root cause is preserved as __cause__.
        raise RuntimeError(msg) from err