Example 1
    def __init__(self,
                 entity_type: EntityType = EntityType.Stock,
                 exchanges=None,
                 entity_ids=None,
                 codes=None,
                 batch_size=10,
                 force_update=False,
                 sleeping_time=5,
                 default_size=findy_config['batch_size'],
                 real_time=False,
                 fix_duplicate_way='add',
                 start_timestamp=None,
                 end_timestamp=None,
                 close_hour=0,
                 close_minute=0,
                 share_para=None) -> None:
        self.default_size = default_size
        self.real_time = real_time

        self.close_hour = close_hour
        self.close_minute = close_minute

        self.fix_duplicate_way = fix_duplicate_way
        self.start_timestamp = to_pd_timestamp(start_timestamp)
        self.end_timestamp = to_pd_timestamp(end_timestamp)

        super().__init__(entity_type, exchanges, entity_ids, codes, batch_size,
                         force_update, sleeping_time, share_para=share_para)

    def format(self, content, exchange):
        df = pd.DataFrame(content)

        if df is not None:
            df.rename(columns={
                'symbol': 'code',
                'ipoyear': 'list_date',
                'marketCap': 'market_cap'
            },
                      inplace=True)

            timestamp_str = self.get_original_time_field()
            df[timestamp_str] = df[timestamp_str].apply(
                lambda x: to_pd_timestamp(x))
            df.fillna({timestamp_str: to_pd_timestamp('1980-01-01')},
                      inplace=True)

            df['timestamp'] = df[timestamp_str]
            df['entity_type'] = EntityType.Stock.value
            df['exchange'] = exchange
            df['is_active'] = True
            df['code'] = df['code'].str.strip()
            df['id'] = self.generate_domain_id(exchange, df)
            df['entity_id'] = df['id']
            df.drop_duplicates(subset=['id'], keep='last', inplace=True)

        return df
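The normalization steps in format above can be reproduced on a plain DataFrame. A minimal standalone sketch with illustrative data, using pd.to_datetime in place of the library's to_pd_timestamp and a simple string id as a stand-in for generate_domain_id:

import pandas as pd

raw = [{'symbol': 'AAA ', 'ipoyear': '1999', 'marketCap': 1.0},
       {'symbol': 'BBB', 'ipoyear': None, 'marketCap': 2.0}]

df = pd.DataFrame(raw)
df.rename(columns={'symbol': 'code',
                   'ipoyear': 'list_date',
                   'marketCap': 'market_cap'}, inplace=True)

# parse the listing date and backfill missing values with a sentinel,
# mirroring the fillna('1980-01-01') step above
df['list_date'] = df['list_date'].apply(pd.to_datetime)
df.fillna({'list_date': pd.Timestamp('1980-01-01')}, inplace=True)

df['timestamp'] = df['list_date']
df['exchange'] = 'nasdaq'
df['code'] = df['code'].str.strip()
df['id'] = 'stock_' + df['exchange'] + '_' + df['code']  # stand-in for generate_domain_id
df['entity_id'] = df['id']
df.drop_duplicates(subset=['id'], keep='last', inplace=True)
print(df[['id', 'code', 'timestamp']])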
Example 3
    def format(self, entity, df):
        df.rename(columns={'symbol': 'code', 'market': 'sector', 'province': 'state', 'employees': 'fulltime_employees',
                           'reg_capital': 'market_cap', 'setup_date': 'date_of_establishment'}, inplace=True)

        timestamp_str = self.get_original_time_field()
        df[timestamp_str] = df[timestamp_str].apply(lambda x: to_pd_timestamp(x))
        df.fillna({timestamp_str: to_pd_timestamp('1980-01-01')}, inplace=True)

        df['timestamp'] = df[timestamp_str]
        df['entity_type'] = EntityType.StockDetail.value
        df['code'] = df['code'].str.strip()
        df['id'] = self.generate_domain_id(entity, df)
        df['entity_id'] = df['id']
        df.drop_duplicates(subset=['id'], keep='last', inplace=True)

        return df

    def init_timestamps(self, entity, http_session):
        param = {
            "color": "w",
            "fc": get_fc(entity),
            "DataType": self.data_type
        }

        if self.finance_report_type in ('LiRunBiaoList', 'XianJinLiuLiangBiaoList'):
            param['ReportType'] = 1

        timestamp_json_list = call_eastmoney_api(
            http_session,
            url=self.timestamps_fetching_url,
            path_fields=self.timestamp_list_path_fields,
            params=param)

        if timestamp_json_list is not None and self.timestamp_path_fields:
            timestamps = [
                get_from_path_fields(data, self.timestamp_path_fields)
                for data in timestamp_json_list
            ]
        else:
            return []

        return [to_pd_timestamp(t) for t in timestamps]
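For reference, the path-field extraction above can be sketched standalone. The nested keys below ('Report', 'Date') are invented for illustration, and the helper assumes get_from_path_fields simply walks a list of keys into nested dicts:

import pandas as pd

def walk_path_fields(data, path_fields):
    # assumed behaviour of get_from_path_fields: follow each key in turn
    for field in path_fields:
        if data is None:
            return None
        data = data.get(field)
    return data

timestamp_json_list = [{'Report': {'Date': '2023-03-31'}},
                       {'Report': {'Date': '2023-06-30'}}]

timestamps = [walk_path_fields(item, ['Report', 'Date'])
              for item in timestamp_json_list]
print([pd.Timestamp(t) for t in timestamps])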
Example 5
    def eval_size_of_timestamp(self,
                               start_timestamp: pd.Timestamp,
                               end_timestamp: pd.Timestamp,
                               level: IntervalLevel,
                               one_day_trading_minutes):
        assert end_timestamp is not None

        time_delta = end_timestamp - to_pd_timestamp(start_timestamp)

        one_day_trading_seconds = one_day_trading_minutes * 60

        if level == IntervalLevel.LEVEL_1DAY:
            return time_delta.days

        if level == IntervalLevel.LEVEL_1WEEK:
            return int(math.ceil(time_delta.days / 7))

        if level == IntervalLevel.LEVEL_1MON:
            return int(math.ceil(time_delta.days / 30))

        if time_delta.days > 0:
            seconds = (time_delta.days + 1) * one_day_trading_seconds
            return int(math.ceil(seconds / level.to_second()))
        else:
            seconds = time_delta.total_seconds()
            return min(int(math.ceil(seconds / level.to_second())),
                       int(one_day_trading_seconds / level.to_second()))

    async def persist_index(self, df) -> None:
        df['timestamp'] = df['timestamp'].apply(lambda x: to_pd_timestamp(x))
        df['list_date'] = df['list_date'].apply(lambda x: to_pd_timestamp(x))
        df['id'] = df['code'].apply(lambda code: f'index_cn_{code}')
        df['entity_id'] = df['id']
        df['exchange'] = 'cn'
        df['entity_type'] = EntityType.Index.value

        df = df.dropna(axis=0, how='any')
        df = df.drop_duplicates(subset='id', keep='last')

        db_session = get_db_session(self.region, self.provider, Index)
        await df_to_db(region=self.region,
                       provider=self.provider,
                       data_schema=Index,
                       db_session=db_session,
                       df=df)
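The intraday branch of eval_size_of_timestamp above sizes the request from the calendar span rather than the exact trading calendar. A worked example of that arithmetic, assuming a 5-minute level (300 seconds) and 240 trading minutes per day:

import math

one_day_trading_minutes = 240      # assumed session length
level_seconds = 300                # a 5-minute interval
days = 3                           # (end_timestamp - start_timestamp).days

one_day_trading_seconds = one_day_trading_minutes * 60
seconds = (days + 1) * one_day_trading_seconds
size = int(math.ceil(seconds / level_seconds))
print(size)  # 192 five-minute bars to request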
Example 7
async def init_main_index(region: Region, provider=Provider.Exchange):
    if region == Region.CHN:
        for item in CHINA_STOCK_MAIN_INDEX:
            item['timestamp'] = to_pd_timestamp(item['timestamp'])
        df = pd.DataFrame(CHINA_STOCK_MAIN_INDEX)
    elif region == Region.US:
        for item in US_STOCK_MAIN_INDEX:
            item['timestamp'] = to_pd_timestamp(item['timestamp'])
        df = pd.DataFrame(US_STOCK_MAIN_INDEX)
    else:
        print("index not initialized, in file: init_main_index")
        df = pd.DataFrame()

    if pd_valid(df):
        await df_to_db(region=region,
                       provider=provider,
                       data_schema=Index,
                       db_session=get_db_session(region, provider, Index),
                       df=df)
Example 8
def common_filter(query: Query,
                  data_schema,
                  ids: List[str] = None,
                  entity_ids: List[str] = None,
                  entity_id: str = None,
                  codes: List[str] = None,
                  code: str = None,
                  start_timestamp=None,
                  end_timestamp=None,
                  filters=None,
                  order=None,
                  limit: int = None,
                  time_field='timestamp'):
    assert data_schema is not None

    if entity_id is not None:
        query = query.filter(data_schema.entity_id == entity_id)
    if entity_ids is not None:
        query = query.filter(data_schema.entity_id.in_(entity_ids))
    if code is not None:
        query = query.filter(data_schema.code == code)
    if codes is not None:
        query = query.filter(data_schema.code.in_(codes))
    if ids is not None:
        query = query.filter(data_schema.id.in_(ids))

    time_col = getattr(data_schema, time_field)

    if start_timestamp:
        query = query.filter(time_col >= to_pd_timestamp(start_timestamp))
    if end_timestamp:
        query = query.filter(time_col <= to_pd_timestamp(end_timestamp))

    if filters:
        for fltr in filters:
            query = query.filter(fltr)
    if order is not None:
        query = query.order_by(order)
    if limit:
        query = query.limit(limit)

    return query
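Each branch of common_filter just ANDs another condition onto the query. A minimal standalone sketch of the same chaining on a toy SQLAlchemy schema (table and column names invented for illustration, assuming SQLAlchemy 1.4+):

from datetime import datetime

from sqlalchemy import Column, DateTime, String, create_engine
from sqlalchemy.orm import declarative_base, sessionmaker

Base = declarative_base()

class Quote(Base):
    __tablename__ = 'quote'
    id = Column(String, primary_key=True)
    entity_id = Column(String)
    code = Column(String)
    timestamp = Column(DateTime)

engine = create_engine('sqlite://')
Base.metadata.create_all(engine)
session = sessionmaker(bind=engine)()

query = session.query(Quote)
query = query.filter(Quote.code == '600996')
query = query.filter(getattr(Quote, 'timestamp') >= datetime(2023, 1, 1))
print(query.all())  # [] on the empty in-memory database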
Example 9
def to_report_period_type(report_date):
    the_date = to_pd_timestamp(report_date)
    if the_date.month == 3 and the_date.day == 31:
        return ReportPeriod.season1.value
    if the_date.month == 6 and the_date.day == 30:
        return ReportPeriod.half_year.value
    if the_date.month == 9 and the_date.day == 30:
        return ReportPeriod.season3.value
    if the_date.month == 12 and the_date.day == 31:
        return ReportPeriod.year.value
    return None
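A quick standalone check of the quarter-end mapping above; the plain strings stand in for the ReportPeriod enum values:

import pandas as pd

def report_period(report_date):
    d = pd.Timestamp(report_date)
    return {(3, 31): 'season1', (6, 30): 'half_year',
            (9, 30): 'season3', (12, 31): 'year'}.get((d.month, d.day))

assert report_period('2023-03-31') == 'season1'
assert report_period('2023-06-30') == 'half_year'
assert report_period('2023-07-01') is None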
Example 10
def to_jq_report_period(timestamp):
    the_date = to_pd_timestamp(timestamp)
    report_period = to_report_period_type(timestamp)
    if report_period == ReportPeriod.year.value:
        return f'{the_date.year}'
    if report_period == ReportPeriod.season1.value:
        return f'{the_date.year}q1'
    if report_period == ReportPeriod.half_year.value:
        return f'{the_date.year}q2'
    if report_period == ReportPeriod.season3.value:
        return f'{the_date.year}q3'
    assert False

    def format(self, resp, exchange):
        df = None
        if exchange == ChnExchange.SSE.value:
            # df = pd.read_excel(io.BytesIO(resp.content), sheet_name='主板A股', dtype=str, parse_dates=['上市日期'])
            df = pd.read_csv(io.BytesIO(resp), sep='\t', encoding='GB2312', dtype=str,
                             parse_dates=['上市日期'])
            if df is not None:
                df.columns = [column.strip() for column in df.columns]
                df = df.loc[:, ['公司代码', '公司简称', '上市日期']]

        elif exchange == ChnExchange.SZSE.value:
            df = pd.read_excel(io.BytesIO(resp), sheet_name='A股列表', dtype=str, parse_dates=['A股上市日期'])
            if df is not None:
                df = df.loc[:, ['A股代码', 'A股简称', 'A股上市日期']]

        if df is not None:
            df.columns = ['code', 'name', 'list_date']

            timestamp_str = self.get_original_time_field()
            # handle the dirty data
            # 600996,贵广网络,2016-12-26,2016-12-26,sh,stock,stock_sh_600996,,次新股,贵州,,
            df.loc[df['code'] == '600996', timestamp_str] = '2016-12-26'
            # print(df[df['list_date'] == '-'])
            df[timestamp_str] = df[timestamp_str].apply(lambda x: to_pd_timestamp(x))
            df.fillna({timestamp_str: to_pd_timestamp('1980-01-01')}, inplace=True)

            df['timestamp'] = df[timestamp_str]
            df['entity_type'] = EntityType.Stock.value
            df['exchange'] = exchange
            df['is_active'] = True
            df['code'] = df['code'].str.strip()
            df['id'] = self.generate_domain_id(exchange, df)
            df['entity_id'] = df['id']
            df = df.drop_duplicates(subset=['id'], keep='last')

        return df
Example 12
    def init_timestamps(self, entity, http_session):
        param = {"color": "w", "fc": get_fc(entity)}

        timestamp_json_list = call_eastmoney_api(
            http_session,
            url=self.timestamps_fetching_url,
            path_fields=self.timestamp_list_path_fields,
            params=param)

        if self.timestamp_path_fields and timestamp_json_list:
            timestamps = [
                get_from_path_fields(data, self.timestamp_path_fields)
                for data in timestamp_json_list
            ]
            return [to_pd_timestamp(t) for t in timestamps]
        return []
Example 13
def get_recent_report_date(the_date, step=0):
    the_date = to_pd_timestamp(the_date)
    assert step >= 0
    if the_date.month >= 10:
        recent = f"{the_date.year}-09-30"
    elif the_date.month >= 7:
        recent = f"{the_date.year}-06-30"
    elif the_date.month >= 4:
        recent = f"{the_date.year}-03-31"
    else:
        recent = f"{the_date.year - 1}-12-31"

    if step == 0:
        return recent
    else:
        step = step - 1
        return get_recent_report_date(recent, step)
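The recursion above steps back one completed reporting quarter per step. A minimal standalone equivalent (pd.Timestamp standing in for to_pd_timestamp) with a couple of checked examples:

import pandas as pd

def recent_report_date(the_date, step=0):
    d = pd.Timestamp(the_date)
    if d.month >= 10:
        recent = f"{d.year}-09-30"
    elif d.month >= 7:
        recent = f"{d.year}-06-30"
    elif d.month >= 4:
        recent = f"{d.year}-03-31"
    else:
        recent = f"{d.year - 1}-12-31"
    return recent if step == 0 else recent_report_date(recent, step - 1)

assert recent_report_date('2023-08-15') == '2023-06-30'
assert recent_report_date('2023-08-15', step=2) == '2022-12-31'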
Example 14
async def get_portfolio_stocks(region: Region,
                               provider: Provider,
                               timestamp,
                               portfolio_entity=Fund,
                               code=None,
                               codes=None,
                               ids=None):
    portfolio_stock = f'{portfolio_entity.__name__}Stock'
    data_schema: PortfolioStockHistory = get_schema_by_name(portfolio_stock)
    db_session = get_db_session(region, provider, data_schema)

    latests, column_names = data_schema.query_data(
        region=region,
        provider=provider,
        db_session=db_session,
        code=code,
        end_timestamp=timestamp,
        order=data_schema.timestamp.desc(),
        limit=1)

    if latests and len(latests) > 0:
        latest_record = latests[0]
        # fetch the most recent report
        data, column_names = data_schema.query_data(
            region=region,
            provider=provider,
            db_session=db_session,
            code=code,
            codes=codes,
            ids=ids,
            end_timestamp=timestamp,
            filters=[data_schema.report_date == latest_record.report_date])

        if data and len(data) > 0:
            df = pd.DataFrame([s.__dict__ for s in data], columns=column_names)

            # the latest report is an annual or semi-annual report
            if latest_record.report_period == ReportPeriod.year or latest_record.report_period == ReportPeriod.half_year:
                return df
            # quarterly report: combine it with the annual or semi-annual report to work out the full holdings
            else:
                step = 0
                while step <= 20:
                    report_date = get_recent_report_date(
                        latest_record.report_date, step=step)

                    data, column_names = data_schema.query_data(
                        region=region,
                        provider=provider,
                        db_session=db_session,
                        code=code,
                        codes=codes,
                        ids=ids,
                        end_timestamp=timestamp,
                        filters=[
                            data_schema.report_date == to_pd_timestamp(
                                report_date)
                        ])

                    if data and len(data) > 0:
                        pre_df = pd.DataFrame.from_records(
                            [s.__dict__ for s in data], columns=column_names)
                        df = pd.concat([df, pre_df])

                        # stop once an annual or semi-annual report is reached
                        if (ReportPeriod.half_year.value
                                in pre_df['report_period'].tolist()) or (
                                    ReportPeriod.year.value
                                    in pre_df['report_period'].tolist()):
                            # keep only the most recent holding of each stock
                            df = df.drop_duplicates(subset=['stock_code'],
                                                    keep='first')
                            return df
                    step = step + 1
Example 15
    def __init__(self,
                 region: Region,
                 data_schema: Type[Mixin],
                 entity_schema: Type[EntityMixin],
                 provider: Provider = None,
                 entity_ids: List[str] = None,
                 exchanges: List[str] = None,
                 codes: List[str] = None,
                 the_timestamp: Union[str, pd.Timestamp] = None,
                 start_timestamp: Union[str, pd.Timestamp] = None,
                 end_timestamp: Union[str, pd.Timestamp] = None,
                 columns: List = None,
                 filters: List = None,
                 order: object = None,
                 limit: int = None,
                 level: IntervalLevel = None,
                 category_field: str = 'entity_id',
                 time_field: str = 'timestamp',
                 computing_window: int = None) -> None:
        self.logger = logging.getLogger(self.__class__.__name__)

        self.data_schema = data_schema
        self.entity_schema = entity_schema

        self.region = region
        self.provider = provider

        if end_timestamp is None:
            end_timestamp = now_pd_timestamp(self.region)

        self.the_timestamp = the_timestamp
        if the_timestamp:
            self.start_timestamp = the_timestamp
            self.end_timestamp = the_timestamp
        else:
            self.start_timestamp = start_timestamp
            self.end_timestamp = end_timestamp

        self.start_timestamp = to_pd_timestamp(self.start_timestamp)
        self.end_timestamp = to_pd_timestamp(self.end_timestamp)

        self.exchanges = exchanges

        if codes:
            if isinstance(codes, str):
                codes = codes.replace(' ', '')
                if codes.startswith('[') and codes.endswith(']'):
                    codes = json.loads(codes)
                else:
                    codes = codes.split(',')

        self.codes = codes
        self.entity_ids = entity_ids
        self.filters = filters
        self.order = order
        self.limit = limit

        if level:
            self.level = IntervalLevel(level)
        else:
            self.level = level

        self.category_field = category_field
        self.time_field = time_field
        self.computing_window = computing_window

        self.category_col = getattr(self.data_schema, self.category_field)
        self.time_col = getattr(self.data_schema, self.time_field)

        self.columns = columns

        # we store the data in a multi-index (category_column, timestamp) DataFrame
        if self.columns:
            # columns may be given as plain attribute names
            if isinstance(columns[0], str):
                self.columns = [getattr(data_schema, col)
                                for col in columns]

            # always add category_column and time_field for normalizing
            self.columns = list(
                set(self.columns) | {self.category_col, self.time_col})

        self.data_listeners: List[DataListener] = []

        self.data_df: pd.DataFrame = None
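The codes argument above accepts a list, a comma-separated string, or a JSON-style list literal. A small standalone sketch of that normalization step:

import json

def normalize_codes(codes):
    if isinstance(codes, str):
        codes = codes.replace(' ', '')
        if codes.startswith('[') and codes.endswith(']'):
            codes = json.loads(codes)
        else:
            codes = codes.split(',')
    return codes

print(normalize_codes('600996, 000001'))        # ['600996', '000001']
print(normalize_codes('["600996", "000001"]'))  # ['600996', '000001']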