Ejemplo n.º 1
0
    def test_store_data_frames(self):
        """Round-trip each statement data frame of stock 4564 through MongoDB.

        Every section obtains a frame from one of the ``_sync_*`` helpers,
        stores it with ``MongoDBRepository.put_data``, reads it back with
        ``get_data`` and prints the result. Only the last section asserts
        anything: the dividend-policy frame must carry a ``pd.PeriodIndex``
        both before it is stored and after it is read back.
        """
        stock_id = 4564
        first_year = 2013

        def round_trip(meta, frame):
            # Store the frame under the stock id, then return what is read back.
            repository = MongoDBRepository(meta)
            repository.put_data(stock_id, frame)
            return repository.get_data(stock_id)

        # --- cash flow statement ---
        cash_flow = _sync_cash_flow_statement(stock_id, first_year, to_year=2019)
        restored = round_trip(MongoDBMeta.DATAFRAME_CASH_FLOW, cash_flow)
        print(restored)
        print(restored.index)

        # --- profit statement ---
        profit_statement = _sync_profit_statement(stock_id, first_year, to_year=2019)
        restored = round_trip(MongoDBMeta.DATAFRAME_PROFIT_STATEMENT, profit_statement)
        print(restored)
        print(restored.index)

        # --- balance sheet (printed before storing; columns shown afterwards) ---
        balance_sheet = _sync_balance_sheet(stock_id, first_year, to_year=2020)
        print(balance_sheet)
        restored = round_trip(MongoDBMeta.DATAFRAME_BALANCE_SHEET, balance_sheet)
        print(restored)
        print(restored.columns)

        # --- dividend policy, print-only pass ---
        dividend = _sync_dividend_policy(stock_id, first_year)
        print(dividend)
        print(dividend.index)
        restored = round_trip(MongoDBMeta.DATAFRAME_DIVIDEND_POLICY, dividend)
        print(restored)
        print(restored.index)

        # --- dividend policy, asserting the index type survives the round trip ---
        dividend = _sync_dividend_policy(stock_id, first_year)
        print(dividend)
        print(type(dividend.index))
        self.assertIsInstance(dividend.index, pd.PeriodIndex)
        restored = round_trip(MongoDBMeta.DATAFRAME_DIVIDEND_POLICY, dividend)
        print(restored)
        print(restored.index)
        self.assertIsInstance(restored.index, pd.PeriodIndex)
Ejemplo n.º 2
0
class DividendPolicyProcessor2(StatementProcessor):
    """Fetch, cache and parse dividend-policy pages into a yearly data frame.

    Raw page content is written to and read from a ``MongoDBRepository``
    keyed by stock id. Parsing turns the HTML table found in that content
    into a frame indexed by yearly ``pd.Period`` values holding the summed
    '現金股利' and '配股' figures of each year.
    """

    # First calendar year of the generated quarterly/yearly time line.
    _FIRST_YEAR = 2013
    # The source reports years shifted by 1911 relative to calendar years
    # (presumably the Minguo calendar -- confirm against the data source).
    _YEAR_OFFSET = 1911

    def __init__(self):
        super().__init__(None)
        self.dividend_policy_fetcher = _DividendPolicyFetcher2()
        self.__repository = MongoDBRepository(MongoDBMeta.DIVIDEND_POLICY)

    def get_data_frame(self, year, season):
        """Not implemented for dividend policy; always returns ``None``."""
        pass

    def get_data_frames(self, stock_id, start_year=None, to_year=None):
        """Return the yearly dividend frame for ``stock_id``.

        The cached raw content is parsed first. If it cannot be parsed, or it
        holds no non-null data for ``to_year``, fresh content is fetched
        (and stored by ``_get_raw_data``) and parsed instead.

        :param stock_id: identifier passed to the repository and the fetcher.
        :param start_year: first calendar year to fetch; defaults to the
            current year, resolved at call time.
        :param to_year: last calendar year to fetch and the year the cache is
            checked for; defaults to the current year, resolved at call time.
        :return: the parsed data frame, or ``None`` when parsing fails.
        """
        # Resolve defaults per call: a ``datetime.now()`` default in the
        # signature would be frozen at class-definition time.
        current_year = datetime.now().year
        if start_year is None:
            start_year = current_year
        if to_year is None:
            to_year = current_year

        cache_df = self._parse_raw_data(stock_id, self.__repository.get_data(stock_id))
        if self._has_data_for_year(cache_df, to_year):
            return cache_df

        latest_raw_data = self._get_raw_data(stock_id, start_year=start_year, to_year=to_year)
        print('latest_raw_data = ', latest_raw_data)
        return self._parse_raw_data(stock_id=stock_id, raw_data=latest_raw_data)

    @staticmethod
    def _has_data_for_year(data_frame, year):
        """Tell whether ``data_frame`` has a non-null row for ``year``."""
        if data_frame is None:
            return False
        try:
            # Explicit label lookup: string row-slicing through ``frame[...]``
            # is not supported by current pandas and a year outside the
            # parsed range must count as a cache miss, not an error.
            row = data_frame.loc[pd.Period(str(year))]
        except KeyError:
            return False
        return not (row.empty or row.isnull().values.all())

    def _parse_raw_data(self, stock_id, raw_data):
        """Parse raw page content into a frame of yearly dividend sums.

        :param stock_id: unused by the parsing itself; kept for the callers.
        :param raw_data: HTML content containing the dividend table.
        :return: frame indexed by yearly ``pd.Period`` from ``_FIRST_YEAR``
            to the current year; a year whose quarters are all null yields
            NaN values, otherwise the column sums of its quarters. ``None``
            when the table cannot be located or read.
        """
        # Local import keeps this block self-contained.
        from io import StringIO

        try:
            soup = BeautifulSoup(raw_data, 'html.parser')
            table = soup.find('table', attrs={"class": "hasBorder", "width": "99%"})
            # Wrap in StringIO: passing literal HTML to read_html is deprecated.
            data_frame = pd.read_html(StringIO(str(table)))[0]
            print('dividend data_frame = ', data_frame)
        except Exception as e:
            # Deliberate best effort: missing/invalid content yields None.
            print('get', e, ' when get dividend policy')
            return None

        # The first three rows are skipped (presumably table headers).
        data_frame = data_frame.iloc[3:, :]
        period_list = [pd.Period(self.__parse_period(text)) for text in data_frame.iloc[:, 1].tolist()]
        dividend_cash_list = [float(value) for value in data_frame.iloc[:, 10].tolist()]
        dividend_cash_stock_list = [float(value) for value in data_frame.iloc[:, 13].tolist()]
        dividend_record_version = [int(value) for value in data_frame.iloc[:, 3].tolist()]
        meeting_progress = [str(value) for value in data_frame.iloc[:, 0].tolist()]

        # Keep one record per period. A later duplicate replaces the stored
        # one only if its progress text contains '股東會確認' and its record
        # version is higher.
        parse_dict = {}
        rows = zip(period_list, dividend_cash_list, dividend_cash_stock_list,
                   dividend_record_version, meeting_progress)
        for period, cash, cash_stock, version, progress in rows:
            record = [cash, cash_stock, version]
            existing = parse_dict.get(period)
            if existing is None:
                parse_dict[period] = record
                continue
            print('duplicate ', period)
            if '股東會確認' in progress and existing[2] < version:
                parse_dict[period] = record

        dict_dividend = {'現金股利': [record[0] for record in parse_dict.values()],
                         '配股': [record[1] for record in parse_dict.values()]}
        print(dict_dividend)

        years = range(self._FIRST_YEAR, datetime.now().year + 1)
        quarterly_time_line = [pd.Period(str(year) + 'Q' + str(quarter))
                               for year in years for quarter in range(1, 5)]

        df_dividend = pd.DataFrame(dict_dividend, index=list(parse_dict)).reindex(quarterly_time_line)
        # Normalise every null value to NaN.
        df_dividend = df_dividend.where(df_dividend.notnull())
        print('df_dividend = ', df_dividend)

        dic_dividend = {}
        for year in years:
            df_extract = df_dividend.loc[pd.Period(str(year) + 'Q1'):pd.Period(str(year) + 'Q4'), :]
            df_extract_sum = df_extract.sum()
            # sum() skips NaN and would report 0 for a year without any data,
            # so such a year is kept as NaN explicitly.
            if df_extract.isnull().values.all():
                dic_dividend[pd.Period(str(year))] = [float('nan')] * len(df_extract_sum)
            else:
                dic_dividend[pd.Period(str(year))] = df_extract_sum

        print('df_dividend 2 = ', dic_dividend)
        print("\n")
        df_dividend = pd.DataFrame(dic_dividend)
        df_dividend = df_dividend.T
        print('df_dividend 3 = ', df_dividend)
        return df_dividend

    def _get_raw_data(self, stock_id, start_year=None, to_year=None):
        """Fetch the raw dividend page, store it in the repository, return it.

        :param stock_id: identifier passed to the fetcher and the repository.
        :param start_year: first calendar year; defaults to the current year.
        :param to_year: last calendar year; defaults to the current year.
        :return: ``result.content`` of the fetch, or ``None`` when the fetch
            result reports ``ok`` as ``False`` (nothing is stored then).
        """
        current_year = datetime.now().year
        if start_year is None:
            start_year = current_year
        if to_year is None:
            to_year = current_year

        result = self.dividend_policy_fetcher.fetch(
            {'stock_id': stock_id,
             'start_year': start_year - self._YEAR_OFFSET,
             'to_year': to_year - self._YEAR_OFFSET})
        if result.ok is False:
            print('get content fail')
            return None
        print('result content = ', result.content)

        self.__repository.put_data(stock_id, result.content)
        return result.content

    def __parse_period(self, period_string):
        """Convert a period label of the source into a '<year>Q<n>' string.

        Labels containing '年年度' or '年下半年' map to Q4, labels containing
        '年上半年' map to Q2; any other label is split on '年第' after
        removing '季', giving the year and the quarter number. The year has
        ``_YEAR_OFFSET`` added in every case.
        """
        # Checked in this order; the first marker found decides the quarter.
        marker_to_quarter = (("年年度", "Q4"), ("年上半年", "Q2"), ("年下半年", "Q4"))
        for marker, quarter in marker_to_quarter:
            if marker in period_string:
                return str(int(period_string.replace(marker, "")) + self._YEAR_OFFSET) + quarter

        period_strings = period_string.replace("季", "").split("年第")
        return str(int(period_strings[0]) + self._YEAR_OFFSET) + "Q" + period_strings[1]