Ejemplo n.º 1
0
class StockCountProcessor:
    """Extract a yearly share-count series from stored raw HTML pages.

    Raw pages are read from the ``MongoDBMeta.STOCK_COUNT`` repository; on a
    cache miss ``fetch_stock_count_raw_data`` is called once and the
    repository is read again.
    """

    def __init__(self):
        self.__repository = MongoDBRepository(MongoDBMeta.STOCK_COUNT)

    def get_stock_count(self, stock_id, year):
        """Return the share count found for ``stock_id`` in ``year``.

        Returns:
            ``None`` when no raw data is available or the page contains no
            matching table, ``0`` when the table has no '合計' row, otherwise
            the integer in the fourth cell of the '合計' row.
        """
        raw_data = self.__repository.get_data(stock_id, {'year': year})
        if raw_data is None:
            # Cache miss: trigger a fetch, then look the data up again.
            fetch_stock_count_raw_data(stock_id, year, year)
            raw_data = self.__repository.get_data(stock_id, {'year': year})
        if raw_data is None:
            return None

        bs = BeautifulSoup(raw_data, 'html.parser')
        table = bs.find_all(has_table_width_no_class)
        if len(table) == 0:
            return None

        for row in table[0].find_all('tr'):
            # Normalise every cell: trim, drop inner spaces and thousands
            # separators so the count can be parsed with int().
            r = [
                x.get_text().strip().replace(" ", "").replace(",", "")
                for x in row.find_all('td')
            ]
            print(r)

            if len(r) > 3 and r[1] == '合計':
                return int(r[3])

        return 0

    def get_data_frame(self, stock_id, since, to=None):
        """Return a DataFrame of yearly share counts from ``since`` to ``to``.

        Args:
            stock_id: identifier passed through to ``get_stock_count``.
            since: first year (int) to query.
            to: last year (int), inclusive.  When ``None`` or smaller than
                ``since`` the current year is used.

        Returns:
            A DataFrame with one column '股數' indexed by a yearly
            ``pd.PeriodIndex``.  Leading years without data are skipped and
            the series stops at the first gap after data was found.
        """
        if to is None or to < since:
            to = datetime.now().year
        stocks = []
        end_year = since
        start_year = since
        for year in range(since, to + 1):
            stock_count = self.get_stock_count(stock_id, year)
            print("StockCountProcessor year = ", year, " stocks = ",
                  stock_count)

            if stock_count is None:
                if start_year == year:
                    # Still in the leading run of missing years: move the
                    # start of the series forward.
                    start_year = start_year + 1
                    continue
                if len(stocks) > 0:
                    # Gap after real data: keep what was collected so far.
                    break
                return None
            stocks.append(stock_count)
            end_year = year

        # pd.PeriodIndex(start=..., end=...) was removed in pandas 1.0;
        # period_range builds the same yearly index on old and new versions
        # (and yields an empty index when no year produced data).
        period_index = pd.period_range(start=pd.Period(start_year, freq='Y'),
                                       end=pd.Period(end_year, freq='Y'),
                                       freq='Y')
        return pd.DataFrame(data={'股數': stocks}, index=period_index)
Ejemplo n.º 2
0
    def test_store_data_frames(self):
        """Sync each statement frame for stock 4564, store it and read it back.

        Every section syncs one kind of frame, writes it through the matching
        ``MongoDBRepository`` and prints what the repository returns.  Only
        the final section asserts anything: the dividend frame must carry a
        ``pd.PeriodIndex`` both before it is stored and after it is loaded.
        """
        # --- cash flow statement ---
        cash_flow_frame = _sync_cash_flow_statement(4564, 2013, to_year=2019)
        cash_flow_repo = MongoDBRepository(MongoDBMeta.DATAFRAME_CASH_FLOW)
        cash_flow_repo.put_data(4564, cash_flow_frame)
        stored_cash_flow = cash_flow_repo.get_data(4564)
        print(stored_cash_flow)
        print(stored_cash_flow.index)

        # --- profit statement ---
        profit_frame = _sync_profit_statement(4564, 2013, to_year=2019)
        profit_repo = MongoDBRepository(MongoDBMeta.DATAFRAME_PROFIT_STATEMENT)
        profit_repo.put_data(4564, profit_frame)
        stored_profit = profit_repo.get_data(4564)
        print(stored_profit)
        print(stored_profit.index)

        # --- balance sheet ---
        balance_frame = _sync_balance_sheet(4564, 2013, to_year=2020)
        print(balance_frame)
        balance_repo = MongoDBRepository(MongoDBMeta.DATAFRAME_BALANCE_SHEET)
        balance_repo.put_data(4564, balance_frame)
        stored_balance = balance_repo.get_data(4564)
        print(stored_balance)
        print(stored_balance.columns)

        # --- dividend policy (print only) ---
        dividend_frame = _sync_dividend_policy(4564, 2013)
        print(dividend_frame)
        print(dividend_frame.index)
        dividend_repo = MongoDBRepository(MongoDBMeta.DATAFRAME_DIVIDEND_POLICY)
        dividend_repo.put_data(4564, dividend_frame)
        stored_dividend = dividend_repo.get_data(4564)
        print(stored_dividend)
        print(stored_dividend.index)

        # --- dividend policy again, this time checking the index type ---
        dividend_frame = _sync_dividend_policy(4564, 2013)
        print(dividend_frame)
        print(type(dividend_frame.index))
        self.assertIsInstance(dividend_frame.index, pd.PeriodIndex)
        dividend_repo = MongoDBRepository(MongoDBMeta.DATAFRAME_DIVIDEND_POLICY)
        dividend_repo.put_data(4564, dividend_frame)
        stored_dividend = dividend_repo.get_data(4564)
        print(stored_dividend)
        print(stored_dividend.index)
        self.assertIsInstance(stored_dividend.index, pd.PeriodIndex)
Ejemplo n.º 3
0
 def test_sync_performance(self):
     """Sync the performance frame for stock 2841 and print what was stored."""
     _sync_performance(2841)
     performance_repo = MongoDBRepository(MongoDBMeta.DATAFRAME_PERFORMANCE)
     stored_frame = performance_repo.get_data(2841)
     # Dump the frame plus both axes for manual inspection.
     print(stored_frame)
     print('index = ', stored_frame.index)
     print('columns = ', stored_frame.columns)
Ejemplo n.º 4
0
class TWSEPriceMeasurementTransformer:
    """Convert the stored TWSE price-measurement payload into a DataFrame."""

    def __init__(self):
        self.__in_repository = MongoDBRepository(MongoDBMeta.TWSE_PRICE_MEASUREMENT)
        self.__out_repository = _data_frame_repository

    def transform_to_dataframe(self, stock_id):
        """Build, store and return the price frame for ``stock_id``.

        Returns ``None`` when nothing is stored for the stock or the stored
        payload's ``'stat'`` is not ``'OK'``.  Otherwise each entry of
        ``payload['data']`` becomes one row, indexed by a ``pd.Period`` made
        from the first cell plus 1911.
        """
        payload = self.__in_repository.get_data(stock_id)
        print('content = ', payload)
        if payload is None or payload['stat'] != 'OK':
            return None

        _logger.info("TWSEPriceMeasurementTransformer transform " + str(stock_id))

        # (cell position, converter) pairs, applied in this order; the cells
        # at positions 5 and 7 stay as strings.
        converters = ((1, int), (2, int), (3, int),
                      (4, float), (6, float), (8, float))
        records = []
        periods = []
        for raw_items in payload['data']:
            cells = [str(item).replace(',', '') for item in raw_items]
            for position, convert in converters:
                cells[position] = convert(cells[position])
            periods.append(pd.Period(value=str(int(cells[0]) + 1911)))
            records.append(cells[1:])

        column_names = ['成交股數', '成交金額', '成交筆數', '最高價', '日期', '最低價', '日期', '收盤平均價']
        data_frame = pd.DataFrame(records, index=periods, columns=column_names)
        print(data_frame)
        self.__out_repository.put_data(stock_id, data_frame)
        return data_frame
Ejemplo n.º 5
0
 def test_sync_statements(self):
     """Re-sync the three statement frames of stock 2841 from their stored versions.

     For each statement the stored frame is loaded, handed to the matching
     sync helper, and both the stored and the returned frame are printed.
     """
     # Cash flow statement.
     stored_cash_flow = MongoDBRepository(MongoDBMeta.DATAFRAME_CASH_FLOW).get_data(2841)
     synced_cash_flow = _sync_cash_flow_statement(2841, 2013, to_year=2021,
                                                  df_cash_flow_statement=stored_cash_flow)
     print('before = ', stored_cash_flow)
     print('after = ', synced_cash_flow)

     # Profit statement.
     stored_profit = MongoDBRepository(MongoDBMeta.DATAFRAME_PROFIT_STATEMENT).get_data(2841)
     synced_profit = _sync_profit_statement(2841, 2013, df_profit_statement=stored_profit)
     print('before = ', stored_profit)
     print('after = ', synced_profit)

     # Balance sheet.
     stored_balance = MongoDBRepository(MongoDBMeta.DATAFRAME_BALANCE_SHEET).get_data(2841)
     synced_balance = _sync_balance_sheet(2841, 2013, 2019, df_balance_sheet=stored_balance)
     print('before = ', stored_balance)
     print('after = ', synced_balance)
Ejemplo n.º 6
0
class TPEXPriceMeasurementTransformer:
    """Convert the stored TPEX price-measurement HTML page into a DataFrame."""

    def __init__(self):
        self.__in_repository = MongoDBRepository(MongoDBMeta.TPEX_PRICE_MEASUREMENT)
        self.__out_repository = _data_frame_repository

    def transform_to_dataframe(self, stock_id):
        """Build, store and return the price frame for ``stock_id``.

        Rows are taken from the ``page-table-board`` table; only ``<tr>``
        elements containing a ``page-table-body-center`` cell are used.  The
        first cell plus 1911 becomes the ``pd.Period`` index, cells 1-3 are
        multiplied by 1000.

        Returns:
            The DataFrame, or ``None`` when nothing is stored for the stock,
            the table is missing, or parsing fails (the failure is logged).
        """
        record = self.__in_repository.get_data(stock_id)
        # The original message named the TWSE transformer (copy-paste slip).
        _logger.info("TPEXPriceMeasurementTransformer transform " + str(stock_id))
        if record is None:
            return None

        try:
            soup = BeautifulSoup(record, 'html.parser')
            table = soup.find('table', attrs={"class": "page-table-board"})
            if table is None:
                # Report the real cause instead of an AttributeError on None.
                _logger.error("no 'page-table-board' table found for " + str(stock_id))
                return None

            rows = []
            indexes = []
            for tr in table.find_all('tr'):
                if tr.find('td', attrs={"class": "page-table-body-center"}) is None:
                    continue
                row = [td.string.replace(',', '') for td in tr.find_all('td')]
                row[0] = int(row[0]) + 1911
                row[1] = int(row[1]) * 1000
                row[2] = int(row[2]) * 1000
                row[3] = int(row[3]) * 1000
                row[4] = float(row[4])
                row[6] = float(row[6])
                row[8] = float(row[8])
                indexes.append(pd.Period(str(row[0])))
                rows.append(row[1:])
            data_frame = pd.DataFrame(rows, index=indexes,
                                      columns=['成交股數', '成交金額', '成交筆數', '最高價', '日期', '最低價', '日期', '收盤平均價'])
            print(data_frame)
            self.__out_repository.put_data(stock_id, data_frame)
            return data_frame

        except Exception as inst:
            # Best-effort transform: log the failure and fall through to None.
            _logger.error("get exception in " + str(stock_id) + ":" + str(inst))
            traceback.print_tb(inst.__traceback__)
            return None
Ejemplo n.º 7
0
class DividendPolicyProcessor2(StatementProcessor):
    """Build a per-year dividend table from the stored dividend-policy page.

    The raw HTML is read from the ``MongoDBMeta.DIVIDEND_POLICY`` repository
    and re-fetched through ``_DividendPolicyFetcher2`` when the stored page
    has no data for the requested end year.
    """

    # First calendar year covered by the generated tables.
    _FIRST_YEAR = 2013

    def __init__(self):
        super().__init__(None)
        self.dividend_policy_fetcher = _DividendPolicyFetcher2()
        self.__repository = MongoDBRepository(MongoDBMeta.DIVIDEND_POLICY)

    def get_data_frame(self, year, season):
        """Not implemented for dividend policy; always returns ``None``."""
        pass

    def get_data_frames(self, stock_id, start_year=None, to_year=None):
        """Return the yearly dividend frame for ``stock_id``.

        Args:
            stock_id: stock identifier.
            start_year: first year to request when a fetch is needed;
                defaults to the current year *at call time*.
            to_year: year that must be present in the cached frame;
                defaults to the current year *at call time*.

        Returns:
            A DataFrame indexed by yearly periods with columns '現金股利' and
            '配股', or ``None`` when no page could be parsed.
        """
        # Resolve defaults here: ``datetime.now()`` in the signature would be
        # evaluated only once, when the class is defined.
        current_year = datetime.now().year
        if start_year is None:
            start_year = current_year
        if to_year is None:
            to_year = current_year

        cache_df = self._parse_raw_data(stock_id, self.__repository.get_data(stock_id))
        if cache_df is None or self._is_year_missing(cache_df, to_year):
            latest_raw_data = self._get_raw_data(stock_id, start_year=start_year, to_year=to_year)
            print('latest_raw_data = ', latest_raw_data)
            return self._parse_raw_data(stock_id=stock_id, raw_data=latest_raw_data)
        return cache_df

    @staticmethod
    def _is_year_missing(data_frame, year):
        """Return True when ``data_frame`` has no row, or only nulls, for ``year``."""
        # Selecting rows with ``frame['2019']`` no longer works in pandas 2;
        # look the period up in the index explicitly instead.
        period = pd.Period(str(year), freq='Y')
        if period not in data_frame.index:
            return True
        return bool(data_frame.loc[[period]].isnull().values.all())

    def _parse_raw_data(self, stock_id, raw_data):
        """Parse the dividend HTML page into a per-year DataFrame.

        Returns ``None`` when the table cannot be located or read.
        """
        # Local import so the block does not depend on a file-level import.
        from io import StringIO

        try:
            soup = BeautifulSoup(raw_data, 'html.parser')
            table = soup.find('table', attrs={"class": "hasBorder", "width": "99%"})
            # read_html on a literal HTML string is deprecated; wrap it.
            data_frame = pd.read_html(StringIO(str(table)))[0]
            print('dividend data_frame = ', data_frame)
        except Exception as e:
            print('get', e, ' when get dividend policy')
            return None

        # The first three rows are skipped -- presumably header rows; verify
        # against the page layout.
        data_frame = data_frame.iloc[3:, :]
        period_list = [pd.Period(self.__parse_period(x)) for x in data_frame.iloc[:, 1].tolist()]
        dividend_cash_list = [float(x) for x in data_frame.iloc[:, 10].tolist()]
        dividend_cash_stock_list = [float(x) for x in data_frame.iloc[:, 13].tolist()]
        dividend_record_version = [int(x) for x in data_frame.iloc[:, 3].tolist()]
        meeting_progress = [str(x) for x in data_frame.iloc[:, 0].tolist()]

        # One entry per period: [cash dividend, stock dividend, record version].
        parse_dict = {}
        for period, cash, stock, version, progress in zip(
                period_list, dividend_cash_list, dividend_cash_stock_list,
                dividend_record_version, meeting_progress):
            existing = parse_dict.get(period)
            if existing is None:
                parse_dict[period] = [cash, stock, version]
            else:
                print('duplicate ', period)
                # A duplicate replaces the stored entry only when it is
                # marked '股東會確認' and carries a higher record version.
                if '股東會確認' in progress and existing[2] < version:
                    parse_dict[period] = [cash, stock, version]

        dict_dividend = {'現金股利': [value[0] for value in parse_dict.values()],
                         '配股': [value[1] for value in parse_dict.values()]}
        print(dict_dividend)

        now = datetime.now()
        years = range(self._FIRST_YEAR, now.year + 1)
        quarter_periods = [pd.Period(str(year) + 'Q' + str(quarter))
                           for year in years for quarter in range(1, 5)]

        # Reindex onto every quarter; quarters without data become NaN.
        # (pd.np was removed in pandas 2.0, so nulls are normalised by
        # casting to float instead of applymap + pd.np.nan.)
        df_dividend = pd.DataFrame(dict_dividend, index=list(parse_dict)) \
            .reindex(quarter_periods).astype(float)
        print('df_dividend = ', df_dividend)

        yearly_rows = []
        for year in years:
            df_extract = df_dividend.loc[pd.Period(str(year) + 'Q1'):pd.Period(str(year) + 'Q4'), :]
            if df_extract.isnull().values.all():
                # No quarter of this year has any value: keep the year as NaN
                # rather than the 0.0 a plain sum would give.
                yearly_rows.append([float('nan')] * len(df_dividend.columns))
            else:
                yearly_rows.append(df_extract.sum().tolist())

        print('df_dividend 2 = ', yearly_rows)
        print("\n")
        # Explicit index and columns keep the column names even when every
        # year is empty.
        df_dividend = pd.DataFrame(
            yearly_rows,
            index=pd.period_range(start=str(self._FIRST_YEAR), end=str(now.year), freq='Y'),
            columns=df_dividend.columns)
        print('df_dividend 3 = ', df_dividend)
        return df_dividend

    def _get_raw_data(self, stock_id, start_year=None, to_year=None):
        """Fetch the dividend page, store it and return its content.

        Years are converted to the ROC calendar (minus 1911) for the fetcher.
        Returns ``None`` when the fetch result is not ok.
        """
        current_year = datetime.now().year
        if start_year is None:
            start_year = current_year
        if to_year is None:
            to_year = current_year

        result = self.dividend_policy_fetcher.fetch(
            {'stock_id': stock_id, 'start_year': start_year - 1911, 'to_year': to_year - 1911})
        if result.ok is False:
            print('get content fail')
            return None
        print('result content = ', result.content)

        self.__repository.put_data(stock_id, result.content)
        return result.content

    def __parse_period(self, period_string):
        """Translate a ROC-calendar period label into a 'YYYYQn' string.

        Full-year and second-half labels map to Q4, first-half labels to Q2,
        and 'N年第M季' labels to the given quarter M.
        """
        if "年年度" in period_string:
            return str(int(period_string.replace("年年度", "")) + 1911) + "Q4"
        if "年上半年" in period_string:
            return str(int(period_string.replace("年上半年", "")) + 1911) + "Q2"
        if "年下半年" in period_string:
            return str(int(period_string.replace("年下半年", "")) + 1911) + "Q4"
        period_strings = period_string.replace("季", "").split("年第")
        return str(int(period_strings[0]) + 1911) + "Q" + period_strings[1]
Ejemplo n.º 8
0
class SimpleIncomeStatementProcessor:
    """Produce per-quarter EPS and net-income frames from stored raw pages.

    The parsed values are treated as year-to-date figures: within a year,
    each quarter's frame holds the difference from the previous quarter.
    """

    def __init__(self):
        self.__repository = MongoDBRepository(MongoDBMeta.SIMPLE_BALANCE_SHEET)

    def get_data_frames(self,
                        stock_id,
                        since,
                        to=None,
                        source_policy=Source.CACHE_ONLY):
        """Return one row per quarter between ``since`` and ``to``.

        Args:
            stock_id: stock identifier.
            since: start of the range, passed to ``get_time_lines``.
            to: end of the range, passed to ``get_time_lines``.
            source_policy: accepted for interface compatibility; not used
                by this implementation.

        Returns:
            The concatenated quarterly frames, or ``None`` when no quarter
            could be parsed.
        """
        time_lines = get_time_lines(since=since, to=to)
        if not time_lines:
            # Nothing to process; avoids an IndexError on time_lines[0].
            return None

        year = time_lines[0].get('year')
        season = time_lines[0].get('season')
        # Seed with the previous quarter so the first row is a difference too.
        last_result = self._get_data_dict(stock_id, year, season -
                                          1) if season > 1 else None
        dfs = []

        for time_line in time_lines:
            data_dict = self._get_data_dict(stock_id, time_line.get('year'),
                                            time_line.get('season'))
            if data_dict is None:
                continue

            if last_result is not None:
                result = {
                    k: (v - last_result[k])
                    for (k, v) in data_dict.items()
                }
            else:
                result = data_dict
            print('result = ', result, ' last_result', last_result)

            # A new year starts from scratch after the fourth quarter.
            last_result = None if time_line.get('season') == 4 else data_dict
            str_period = "{}Q{}".format(time_line.get('year'),
                                        time_line.get('season'))
            # pd.PeriodIndex(start=..., end=...) was removed in pandas 1.0.
            period_index = pd.period_range(start=str_period,
                                           end=str_period,
                                           freq='Q')
            dfs.append(
                pd.DataFrame([list(result.values())],
                             columns=list(result.keys()),
                             index=period_index))

        return pd.concat(dfs) if len(dfs) > 0 else None

    def get_data_frame(self,
                       stock_id,
                       year,
                       season,
                       source_policy=Source.CACHE_ONLY):
        """Return the frame for the single quarter ``year``/``season``."""
        time_line = {'year': year, 'season': season}
        return self.get_data_frames(stock_id=stock_id,
                                    since=dict(time_line),
                                    to=dict(time_line),
                                    source_policy=source_policy)

    def _get_data_dict(self, stock_id, year, season):
        """Parse 'EPS' and '稅後淨利' for one quarter from the stored page.

        On a cache miss ``fetch_simple_balance_sheet_raw_data`` is called
        once before reading the repository again.  Returns ``None`` when no
        raw data is available or parsing fails.
        """
        try:
            dict_datas = {}
            query = {'year': year, 'season': season}
            raw_data = self.__repository.get_data(stock_id, query)
            if raw_data is None:
                fetch_simple_balance_sheet_raw_data(stock_id, year, season)
                raw_data = self.__repository.get_data(stock_id, query)
            if raw_data is None:
                # Report the real cause instead of a TypeError from the parser.
                print("no raw data in year ", year, ' and season ', season)
                return None

            bs = BeautifulSoup(raw_data, 'html.parser')
            print(' get ', bs.text)
            tables = bs.find_all('table',
                                 attrs={
                                     "class": "hasBorder",
                                     "align": "center",
                                     "width": "70%"
                                 })
            # The figures are read from the third matching table.
            table = tables[2]
            for row in table.find_all('tr'):
                r = [x.get_text() for x in row.find_all('td')]
                if not r:
                    # Rows without <td> cells would otherwise raise an
                    # IndexError and discard the whole quarter.
                    continue
                if '每股盈餘' in r[0]:
                    dict_datas['EPS'] = float(r[1])
                if '本期綜合損益總額' in r[0]:
                    dict_datas['稅後淨利'] = int(r[1].replace(',', ''))
            return dict_datas

        except Exception as inst:
            # Best-effort parse: report and treat the quarter as unavailable.
            print("get exception", inst, " when get data in year ", year,
                  ' and season ', season)
            traceback.print_tb(inst.__traceback__)
            return None
Ejemplo n.º 9
0
class ShareholderEquityProcessor(StatementProcessor):
    """Extract opening/closing total-equity balances per quarter.

    For every quarter the '期初餘額' and '期末餘額' rows of the '權益總額' column
    are read from the stored statement page.  Within a run, each quarter's
    opening balance is replaced by the previous quarter's closing balance.
    """

    def __init__(self, stock_id):
        super().__init__(stock_id)
        self.__tag = "ShareholderEquityProcessor"
        self.__repository = MongoDBRepository(MongoDBMeta.SHARE_HOLDER)
        self.items_to_get = ('期初餘額', '期末餘額')
        self.fields_to_get = ('權益總額', )

    def get_data_frames(self, since, to=None, source_policy=Source.CACHE_ONLY):
        """Return one row per quarter between ``since`` and ``to``.

        Args:
            since: start of the range, passed to ``get_time_lines``.
            to: end of the range, passed to ``get_time_lines``.
            source_policy: accepted for interface compatibility; not used
                by this implementation.

        Returns:
            The concatenated frames with (field, item) MultiIndex columns,
            or ``None`` when no quarter could be parsed.
        """
        time_lines = get_time_lines(since=since, to=to)
        dfs = []
        column_index = pd.MultiIndex.from_product(
            [self.fields_to_get, self.items_to_get], names=['first', 'second'])
        print(column_index)

        # Seed with the quarter *before* the first requested one.  Seeding
        # with the first quarter itself (as before) made that quarter's
        # opening balance equal to its own closing balance.
        last_result = None
        if len(time_lines) > 0 and time_lines[0].get('season') > 1:
            last_result = self._get_data_dict(time_lines[0].get('year'),
                                              time_lines[0].get('season') - 1)

        for time_line in time_lines:
            result = self._get_data_dict(time_line.get('year'),
                                         time_line.get('season'))
            if result is None:
                continue
            if last_result is not None:
                for key in result.keys():
                    result[key]['期初餘額'] = last_result[key]['期末餘額']
            last_result = result
            print(result)
            str_period = "{}Q{}".format(time_line.get('year'),
                                        time_line.get('season'))
            # pd.PeriodIndex(start=..., end=...) was removed in pandas 1.0.
            period_index = pd.period_range(start=str_period,
                                           end=str_period,
                                           freq='Q')
            # Flatten {field: {item: value}} in the same field-major order
            # as ``column_index``.
            data_list = []
            for inner in result.values():
                data_list.extend(inner.values())
            print(data_list)
            dfs.append(
                pd.DataFrame([data_list],
                             columns=column_index,
                             index=period_index))

        print(self.__tag, "dfs = ", dfs)
        return pd.concat(dfs) if len(dfs) > 0 else None

    def get_data_frame(self, year, season):
        """Return the frame for the single quarter ``year``/``season``."""
        return self.get_data_frames(since={'year': year, 'season': season},
                                    to={'year': year, 'season': season})

    def _get_data_dict(self, year, season):
        """Return the parsed balances for one quarter, or ``None``.

        On a cache miss ``fetch_shareholder_equity_raw_data`` is called once
        before the repository is read again.
        """
        query = {'year': year, 'season': season}
        raw_data = self.__repository.get_data(self._stock_id, query)
        if raw_data is None:
            fetch_shareholder_equity_raw_data(self._stock_id, year, season)
            raw_data = self.__repository.get_data(self._stock_id, query)
        if raw_data is None:
            return None
        return self._parse_data(raw_data)

    def _parse_data(self, content):
        """Parse the statement HTML into ``{field: {item: int}}``.

        Items missing from the page keep the value 0.  Returns ``None``
        when the table or a required header is missing, or on any parsing
        error.
        """
        try:
            bs = BeautifulSoup(content, 'html.parser')
            tables = bs.find_all('table',
                                 attrs={
                                     "class": "hasBorder",
                                     "align": "center"
                                 })

            if len(tables) < 1:
                print('ShareholderEquityProcessor - error 1')
                return None

            headers = []
            rows_data = []
            for row in tables[0].find_all('tr'):
                # Direct children of the row, ignoring bare newlines.
                columns_raw = [
                    column for column in row.contents if column != '\n'
                ]
                columns = [column.get_text().strip() for column in columns_raw]
                if len(columns) <= 1:
                    continue
                if columns_raw[0].name == 'th' and len(headers) == 0:
                    # First multi-cell <th> row supplies the column names.
                    headers = columns
                    if not all(field in headers
                               for field in self.fields_to_get):
                        print('ShareholderEquityProcessor - error 2')
                        return None
                else:
                    rows_data.append(columns)

            # Start from zeros so every (field, item) pair is present.
            result2 = {
                key: {item: 0
                      for item in self.items_to_get}
                for key in self.fields_to_get
            }
            for row_data in rows_data:
                item = row_data[0]
                if item not in self.items_to_get:
                    continue
                for field in self.fields_to_get:
                    result2[field][item] = int(
                        row_data[headers.index(field)].replace(',', ''))

            print("result = ", result2)
            return result2

        except Exception as inst:
            # Best-effort parse: report and treat the quarter as unavailable.
            print("get exception", inst)
            traceback.print_tb(inst.__traceback__)
            return None
Ejemplo n.º 10
0
class CashFlowStatementProcessor(StatementProcessor):
    """Produce per-quarter cash-flow frames from stored statement pages.

    Owner-earnings cash flow = net cash inflow from operating activities
        + acquisition of property, plant and equipment
        + other investing activities
    Free cash flow = net cash inflow from operating activities
        + net cash inflow from investing activities
    """
    def __init__(self, stock_id):
        super().__init__(stock_id)
        self.__repository = MongoDBRepository(MongoDBMeta.CASH_FLOW)
        self._fetch_fields = ('營業活動之淨現金流入', '取得不動產、廠房及設備', '其他投資活動',
                              '投資活動之淨現金流入')

    def get_data_frames(self, since, to=None, source_policy=Source.CACHE_ONLY):
        """Return one row per quarter between ``since`` and ``to``.

        Quarters are processed newest first, and rows appear in that order.
        For seasons after the first, the previous season's values are
        subtracted from the current ones before the derived columns
        '業主盈餘現金流' and '自由現金流' are computed.

        Args:
            since: start of the range, passed to ``get_time_lines``.
            to: end of the range, passed to ``get_time_lines``.
            source_policy: accepted for interface compatibility; not used
                by this implementation.

        Returns:
            The concatenated frames, or ``None`` when no quarter was parsed.
        """
        time_lines = get_time_lines(since=since, to=to)
        time_lines.reverse()

        dfs = []
        # Values of the previous season, already loaded for the subtraction
        # and reused as the "current" values on the next iteration.
        cache_data_dict = None
        for time_line in time_lines:
            print('In ', time_line)
            year = time_line.get('year')
            season = time_line.get('season')
            if cache_data_dict is None:
                data_dict = self._get_data_dict(self._fetch_fields, year,
                                                season)
            else:
                data_dict = cache_data_dict

            if data_dict is None:
                continue
            if season > 1:
                cache_data_dict = self._get_data_dict(self._fetch_fields, year,
                                                      season - 1)
                if cache_data_dict is None:
                    # Previous season unavailable: values stay unsubtracted.
                    print('get None value in year ', year, ' season ', season,
                          " data_dict = ", data_dict, " cache_data_dic = ",
                          cache_data_dict)
                else:
                    for key in self._fetch_fields:
                        data_dict[key] = data_dict.get(
                            key, 0) - cache_data_dict.get(key, 0)
            else:
                cache_data_dict = None

            data_dict['業主盈餘現金流'] = (data_dict.get('營業活動之淨現金流入', 0)
                                    + data_dict.get('取得不動產、廠房及設備', 0)
                                    + data_dict.get('其他投資活動', 0))
            data_dict['自由現金流'] = (data_dict.get('營業活動之淨現金流入', 0)
                                  + data_dict.get('投資活動之淨現金流入', 0))
            print(data_dict)
            str_period = "{}Q{}".format(year, season)
            # pd.PeriodIndex(start=..., end=...) was removed in pandas 1.0.
            period_index = pd.period_range(start=str_period,
                                           end=str_period,
                                           freq='Q')
            dfs.append(
                pd.DataFrame([list(data_dict.values())],
                             columns=list(data_dict.keys()),
                             index=period_index))
        return None if len(dfs) == 0 else pd.concat(dfs, sort=False)

    def get_data_frame(self, year, season, source_policy=Source.CACHE_ONLY):
        """Return the frame for the single quarter ``year``/``season``."""
        return self.get_data_frames(since={'year': year, 'season': season},
                                    to={'year': year, 'season': season},
                                    source_policy=source_policy)

    def _get_data_dict(self, fields, year, season):
        """Parse the requested ``fields`` for one quarter from the stored page.

        On a cache miss ``fetch_cash_flow_raw_data`` is called once before
        the repository is read again.

        Returns:
            ``{field: int}`` for every field found in the first matching
            table, or ``None`` when no raw data is available or parsing fails.
        """
        data_dict = {}
        try:
            query = {'year': year, 'season': season}
            raw_data = self.__repository.get_data(str(self._stock_id), query)
            if raw_data is None:
                fetch_cash_flow_raw_data(self._stock_id, year, season)
                raw_data = self.__repository.get_data(str(self._stock_id), query)
            if raw_data is None:
                # Report the real cause instead of a TypeError from the parser.
                print("no raw data in year ", year, ' season ', season)
                return None

            bs = BeautifulSoup(raw_data, 'html.parser')
            table = bs.find_all('table',
                                attrs={
                                    "class": "hasBorder",
                                    "align": "center"
                                })

            for row in table[0].find_all('tr'):
                r = [x.get_text() for x in row.find_all('td')]
                if len(r) == 0:
                    continue
                # The first field whose name occurs in the row label wins.
                for field in fields:
                    if field in r[0]:
                        data_dict[field] = int(r[1].replace(',', ''))
                        break

        except Exception as inst:
            # Best-effort parse: report and treat the quarter as unavailable.
            print("get exception", inst)
            traceback.print_tb(inst.__traceback__)
            return None
        return data_dict