Example #1
    def load_data(self):
        self.logger.info('load_data start')
        start_time = time.time()

        if self.entity_ids:
            self.data_df = get_data(data_schema=self.data_schema, entity_ids=self.entity_ids,
                                    provider=self.provider, columns=self.columns,
                                    start_timestamp=self.start_timestamp,
                                    end_timestamp=self.end_timestamp, filters=self.filters, order=self.order,
                                    limit=self.limit,
                                    level=self.level,
                                    index=[self.category_field, self.time_field],
                                    time_field=self.time_field)
        else:
            self.data_df = get_data(data_schema=self.data_schema, codes=self.codes,
                                    provider=self.provider, columns=self.columns,
                                    start_timestamp=self.start_timestamp,
                                    end_timestamp=self.end_timestamp, filters=self.filters, order=self.order,
                                    limit=self.limit,
                                    level=self.level,
                                    index=[self.category_field, self.time_field],
                                    time_field=self.time_field)
        cost_time = time.time() - start_time
        self.logger.info('load_data finish cost_time:{}'.format(cost_time))

        for listener in self.data_listeners:
            listener.on_data_loaded(self.data_df)
Example #2
def test_get_data():
    df = get_data(data_schema=Stock,
                  entity_ids=['stock_sz_000338', 'stock_sz_000778'],
                  provider='sina')
    assert len(df) == 2

    df = get_data(data_schema=Stock,
                  codes=['000338', '000778'],
                  provider='sina')
    assert len(df) == 2

    df = get_data(data_schema=Stock,
                  start_timestamp='2019-01-01',
                  provider='sina')
    print(f'2019 list count:{len(df.index)}')

    df = get_data(data_schema=Stock,
                  end_timestamp='2018-12-31',
                  provider='sina')
    print(f'from start to 2019 list count:{len(df.index)}')

    df = get_data(data_schema=Stock,
                  end_timestamp='2018-12-31',
                  limit=10,
                  provider='sina')
    assert len(df) == 10
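
The assertions above exercise entity_ids, codes, the timestamp bounds, and limit. For completeness, a minimal sketch of the same call narrowed by columns and filters, assuming the Stock schema exposes a filterable exchange column and the 'sina' provider is populated:

df = get_data(data_schema=Stock,
              columns=[Stock.code, Stock.name],
              # Stock.exchange as a queryable column is an assumption here
              filters=[Stock.exchange == 'sz'],
              provider='sina')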
Example #3
    def load_data(self):
        if self.entity_ids:
            self.data_df = get_data(data_schema=self.data_schema, entity_ids=self.entity_ids,
                                    provider=self.provider, columns=self.columns,
                                    start_timestamp=self.start_timestamp,
                                    end_timestamp=self.end_timestamp, filters=self.filters, order=self.order,
                                    limit=self.limit,
                                    level=self.level,
                                    time_field=self.time_field,
                                    index=self.time_field)
        else:
            self.data_df = get_data(data_schema=self.data_schema, codes=self.codes,
                                    provider=self.provider, columns=self.columns,
                                    start_timestamp=self.start_timestamp,
                                    end_timestamp=self.end_timestamp, filters=self.filters, order=self.order,
                                    limit=self.limit,
                                    level=self.level,
                                    time_field=self.time_field,
                                    index=self.time_field)

        if self.trip_timestamp:
            if self.level == IntervalLevel.LEVEL_1DAY:
                self.data_df[self.time_field] = self.data_df[self.time_field].apply(
                    lambda x: to_pd_timestamp(to_time_str(x)))

        if df_is_not_null(self.data_df):
            self.normal_data = NormalData(df=self.data_df, category_field=self.category_field,
                                          index_field=self.time_field, is_timeseries=True)
            self.data_df = self.normal_data.data_df

        for listener in self.data_listeners:
            listener.on_data_loaded(self.data_df)
Example #4
def get_trader(trader_name=None,
               return_type='df',
               start_timestamp=None,
               end_timestamp=None,
               filters=None,
               session=None,
               order=None,
               limit=None) -> List[business.Trader]:
    if trader_name:
        if filters:
            filters = filters + [business.Trader.trader_name == trader_name]
        else:
            filters = [business.Trader.trader_name == trader_name]

    return get_data(data_schema=business.Trader,
                    entity_id=None,
                    codes=None,
                    level=None,
                    provider='zvt',
                    columns=None,
                    return_type=return_type,
                    start_timestamp=start_timestamp,
                    end_timestamp=end_timestamp,
                    filters=filters,
                    session=session,
                    order=order,
                    limit=limit)
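
The trader_name branch above is the recurring pattern in these snippets for merging a caller-supplied filters list with one more condition (Example #12 repeats it for Position). As a hypothetical helper, it reduces to:

def merge_filters(filters, condition):
    # equivalent to the if/else used in Examples #4 and #12
    return filters + [condition] if filters else [condition]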
Example #5
    def on_finish_entity(self, entity):
        super().on_finish_entity(entity)

        if not self.fetch_jq_timestamp:
            return

        # fill the timestamp for report published date
        the_data_list = get_data(
            data_schema=self.data_schema,
            provider=self.provider,
            entity_id=entity.id,
            order=self.data_schema.timestamp.asc(),
            return_type='domain',
            session=self.session,
            filters=[
                self.data_schema.timestamp == self.data_schema.report_date,
                self.data_schema.timestamp >= to_pd_timestamp('2005-01-01')
            ])
        if the_data_list:
            if self.data_schema == FinanceFactor:
                for the_data in the_data_list:
                    self.fill_timestamp_with_jq(entity, the_data)
            else:
                df = get_finance_factor(
                    entity_id=entity.id,
                    columns=[
                        FinanceFactor.timestamp, FinanceFactor.report_date,
                        FinanceFactor.id
                    ],
                    filters=[
                        FinanceFactor.timestamp != FinanceFactor.report_date,
                        FinanceFactor.timestamp >=
                        to_pd_timestamp('2005-01-01'),
                        FinanceFactor.report_date >=
                        the_data_list[0].report_date,
                        FinanceFactor.report_date <=
                        the_data_list[-1].report_date,
                    ])

                if pd_is_not_null(df):
                    index_df(df, index='report_date', time_field='report_date')

                for the_data in the_data_list:
                    if (df is not None) and (
                            not df.empty) and the_data.report_date in df.index:
                        the_data.timestamp = df.at[the_data.report_date,
                                                   'timestamp']
                        self.logger.info(
                            'db fill {} {} timestamp:{} for report_date:{}'.
                            format(self.data_schema, entity.id,
                                   the_data.timestamp, the_data.report_date))
                        self.session.commit()
                    else:
                        # self.logger.info(
                        #     'waiting jq fill {} {} timestamp:{} for report_date:{}'.format(self.data_schema,
                        #                                                                    security_item.id,
                        #                                                                    the_data.timestamp,
                        #                                                                    the_data.report_date))

                        self.fill_timestamp_with_jq(entity, the_data)
Example #6
def get_top_ten_holder(entity_ids: List[str] = None,
                       entity_id: str = None,
                       codes: List[str] = None,
                       level: Union[IntervalLevel, str] = None,
                       provider: str = 'eastmoney',
                       columns: List = None,
                       return_type: str = 'df',
                       start_timestamp: Union[pd.Timestamp, str] = None,
                       end_timestamp: Union[pd.Timestamp, str] = None,
                       filters: List = None,
                       session: Session = None,
                       order=None,
                       limit: int = None,
                       index: str = 'timestamp',
                       index_is_time: bool = True,
                       time_field: str = 'timestamp'):
    return get_data(data_schema=TopTenHolder,
                    entity_ids=entity_ids,
                    entity_id=entity_id,
                    codes=codes,
                    level=level,
                    provider=provider,
                    columns=columns,
                    return_type=return_type,
                    start_timestamp=start_timestamp,
                    end_timestamp=end_timestamp,
                    filters=filters,
                    session=session,
                    order=order,
                    limit=limit,
                    index=index,
                    index_is_time=index_is_time,
                    time_field=time_field)
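
A hypothetical call to the wrapper above, using the entity id format from Example #2 and assuming 'eastmoney' data has been recorded:

df = get_top_ten_holder(entity_id='stock_sz_000338',
                        start_timestamp='2018-01-01',
                        end_timestamp='2018-12-31')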
Example #7
def get_big_deal_trading(entity_ids: List[str] = None,
                         entity_id: str = None,
                         codes: List[str] = None,
                         level: Union[IntervalLevel, str] = None,
                         provider: str = 'eastmoney',
                         columns: List = None,
                         return_type: str = 'df',
                         start_timestamp: Union[pd.Timestamp, str] = None,
                         end_timestamp: Union[pd.Timestamp, str] = None,
                         filters: List = None,
                         session: Session = None,
                         order=None,
                         limit: int = None,
                         index: str = 'timestamp',
                         time_field: str = 'timestamp'):
    return get_data(data_schema=BigDealTrading,
                    entity_ids=entity_ids,
                    entity_id=entity_id,
                    codes=codes,
                    level=level,
                    provider=provider,
                    columns=columns,
                    return_type=return_type,
                    start_timestamp=start_timestamp,
                    end_timestamp=end_timestamp,
                    filters=filters,
                    session=session,
                    order=order,
                    limit=limit,
                    index=index,
                    time_field=time_field)
Example #8
def get_stock_money_flow(entity_ids: List[str] = None,
                         entity_id: str = None,
                         codes: List[str] = None,
                         level: Union[IntervalLevel, str] = None,
                         provider: str = 'sina',
                         columns: List = None,
                         return_type: str = 'df',
                         start_timestamp: Union[pd.Timestamp, str] = None,
                         end_timestamp: Union[pd.Timestamp, str] = None,
                         filters: List = None,
                         session: Session = None,
                         order=None,
                         limit: int = None,
                         index: str = 'timestamp',
                         time_field: str = 'timestamp'):
    return get_data(data_schema=StockMoneyFlow,
                    entity_ids=entity_ids,
                    entity_id=entity_id,
                    codes=codes,
                    level=level,
                    provider=provider,
                    columns=columns,
                    return_type=return_type,
                    start_timestamp=start_timestamp,
                    end_timestamp=end_timestamp,
                    filters=filters,
                    session=session,
                    order=order,
                    limit=limit,
                    index=index,
                    time_field=time_field)
Example #9
def get_kdata(entity_id,
              level=IntervalLevel.LEVEL_1DAY.value,
              provider='eastmoney',
              columns=None,
              return_type='df',
              start_timestamp=None,
              end_timestamp=None,
              filters=None,
              session=None,
              order=None,
              limit=None):
    entity_type, exchange, code = decode_entity_id(entity_id)
    data_schema = get_kdata_schema(entity_type, level=level)

    return get_data(data_schema=data_schema,
                    entity_id=entity_id,
                    level=level,
                    provider=provider,
                    columns=columns,
                    return_type=return_type,
                    start_timestamp=start_timestamp,
                    end_timestamp=end_timestamp,
                    filters=filters,
                    session=session,
                    order=order,
                    limit=limit)
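
Since get_kdata derives the schema from the entity id via decode_entity_id and get_kdata_schema, a caller only supplies the id and a level. A hedged usage sketch, assuming daily kdata exists for the entity:

df = get_kdata(entity_id='stock_sz_000338',
               level=IntervalLevel.LEVEL_1DAY.value,
               start_timestamp='2019-01-01',
               end_timestamp='2019-06-30')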
Example #10
def follow_someone():
    import random
    year = random.randint(2012, 2014)
    mon = random.randint(1, 12)
    day = random.randint(1, 28)

    start_timestamp = f'{year}-{mon}-{day}'

    year = random.randint(2015, 2019)
    mon = random.randint(1, 12)
    day = random.randint(1, 28)

    end_timestamp = f'{year}-{mon}-{day}'

    users = get_data(provider='github',
                     data_schema=GithubUser,
                     start_timestamp=start_timestamp,
                     end_timestamp=end_timestamp,
                     return_type='domain',
                     limit=1000)

    for seed in range(0, len(GithubAccount.tokens)):
        for user in users:
            resp = request_with_auth(url=url.format(user.code),
                                     method='put',
                                     token=GithubAccount.get_token(seed=seed),
                                     headers={'Content-Length': '0'})
            if resp.status_code == 204:
                print('follow:{} ok'.format(user.code))
            else:
                print(resp.status_code)
Example #11
File: api.py Project: zilinly/zvt
def get_index(ids: List[str] = None,
              entity_ids: List[str] = None,
              entity_id: str = None,
              codes: List[str] = None,
              code: str = None,
              level: Union[IntervalLevel, str] = None,
              provider: str = 'exchange',
              columns: List = None,
              return_type: str = 'df',
              start_timestamp: Union[pd.Timestamp, str] = None,
              end_timestamp: Union[pd.Timestamp, str] = None,
              filters: List = None,
              session: Session = None,
              order=None,
              limit: int = None,
              index: Union[str, list] = 'timestamp',
              time_field: str = 'timestamp'):
    return get_data(data_schema=Index,
                    ids=ids,
                    entity_ids=entity_ids,
                    entity_id=entity_id,
                    codes=codes,
                    code=code,
                    level=level,
                    provider=provider,
                    columns=columns,
                    return_type=return_type,
                    start_timestamp=start_timestamp,
                    end_timestamp=end_timestamp,
                    filters=filters,
                    session=session,
                    order=order,
                    limit=limit,
                    index=index,
                    time_field=time_field)
Example #12
def get_position(trader_name=None,
                 return_type='df',
                 start_timestamp=None,
                 end_timestamp=None,
                 filters=None,
                 session=None,
                 order=None,
                 limit=None):
    if trader_name:
        if filters:
            filters = filters + [Position.trader_name == trader_name]
        else:
            filters = [Position.trader_name == trader_name]

    return get_data(data_schema=Position,
                    entity_id=None,
                    codes=None,
                    level=None,
                    provider='zvt',
                    columns=None,
                    return_type=return_type,
                    start_timestamp=start_timestamp,
                    end_timestamp=end_timestamp,
                    filters=filters,
                    session=session,
                    order=order,
                    limit=limit)
Example #13
    def generate_domain(self, entity, original_data):
        """
        generate a data_schema instance from entity and original_data; original_data comes from the record result

        :param entity:
        :param original_data:
        """

        got_new_data = False

        # if the domain is directly generated in record method, we just return it
        if isinstance(original_data, self.data_schema):
            got_new_data = True
            return got_new_data, original_data

        the_id = self.generate_domain_id(entity, original_data)

        # optional way
        # item = self.session.query(self.data_schema).get(the_id)

        items = get_data(data_schema=self.data_schema,
                         session=self.session,
                         provider=self.provider,
                         entity_id=entity.id,
                         filters=[self.data_schema.id == the_id],
                         return_type='domain')

        if items and not self.force_update:
            self.logger.info('ignore the data {}:{} saved before'.format(
                self.data_schema, the_id))
            return got_new_data, None

        if not items:
            timestamp_str = original_data[self.get_original_time_field()]
            timestamp = None
            try:
                timestamp = to_pd_timestamp(timestamp_str)
            except Exception as e:
                self.logger.exception(e)

            if 'name' in get_schema_columns(self.data_schema):
                domain_item = self.data_schema(id=the_id,
                                               code=entity.code,
                                               name=entity.name,
                                               entity_id=entity.id,
                                               timestamp=timestamp)
            else:
                domain_item = self.data_schema(id=the_id,
                                               code=entity.code,
                                               entity_id=entity.id,
                                               timestamp=timestamp)
            got_new_data = True
        else:
            domain_item = items[0]

        fill_domain_from_dict(domain_item, original_data, self.get_data_map())
        return got_new_data, domain_item
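
The method returns a (got_new_data, domain_item) pair, with domain_item set to None when the record was saved before and force_update is off. A hypothetical caller inside the same recorder would therefore persist only non-None items:

got_new_data, domain_item = self.generate_domain(entity, original_data)
if domain_item is not None:
    self.session.add(domain_item)
    self.session.commit()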
Example #14
    def get_latest_saved_record(self, entity):
        order = eval('self.data_schema.{}.desc()'.format(self.get_evaluated_time_field()))

        return get_data(entity_id=entity.id,
                        provider=self.provider,
                        data_schema=self.data_schema,
                        order=order,
                        limit=1,
                        return_type='domain',
                        session=self.session)
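
This snippet, like Examples #18 and #19, builds the order clause via eval on a formatted string. A hedged alternative with the same effect, assuming the time field is a mapped column on the schema, avoids eval entirely:

# getattr on the schema class yields the column, whose .desc() builds the order clause
order = getattr(self.data_schema, self.get_evaluated_time_field()).desc()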
Example #15
    def get_latest_saved_record(self, entity):
        records = get_data(provider=self.provider,
                           data_schema=self.data_schema,
                           order=self.data_schema.timestamp.desc(),
                           limit=1,
                           return_type='domain',
                           session=self.session)
        if records:
            return records[0]
        return None
Example #16
    def get_latest_saved_pipe(self):
        order = eval('self.factor_schema.{}.desc()'.format(self.time_field))

        records = get_data(provider=self.provider,
                           data_schema=self.pipe_schema,
                           order=order,
                           limit=1,
                           return_type='domain',
                           session=self.session)
        if records:
            return records[0]
        return None
Example #17
    def evaluate_start_end_size_timestamps(self, entity):
        # get latest record
        latest_record = get_data(entity_id=entity.id,
                                 provider=self.provider,
                                 data_schema=self.data_schema,
                                 order=self.data_schema.timestamp.desc(), limit=1,
                                 return_type='domain',
                                 session=self.session)
        if latest_record:
            if not self.get_remote_latest_record(entity) or (
                    latest_record[0].id == self.get_remote_latest_record(entity).id):
                return None, None, 0, None
            else:
                return None, None, 10, None

        return None, None, 1000, None
Example #18
    def get_latest_saved_record(self, entity):
        order = eval('self.data_schema.{}.desc()'.format(self.get_evaluated_time_field()))

        records = get_data(entity_id=entity.id,
                           provider=self.provider,
                           data_schema=self.data_schema,
                           order=order,
                           limit=2,
                           return_type='domain',
                           session=self.session,
                           level=self.level.value)
        if records:
            # delete unfinished kdata
            if len(records) == 2:
                if is_in_same_interval(t1=records[0].timestamp, t2=records[1].timestamp, level=self.level):
                    self.session.delete(records[0])
                    self.session.flush()
                    return records[1]
            return records[0]
        return None
Example #19
    def get_latest_saved_record(self, entity):
        order = eval('self.data_schema.{}.desc()'.format(
            self.get_evaluated_time_field()))

        records = get_data(entity_id=entity.id,
                           provider=self.provider,
                           data_schema=self.data_schema,
                           order=order,
                           limit=2,
                           return_type='domain',
                           session=self.session,
                           level=self.level.value)
        if records:
            # just keep one unfinished kdata
            if not is_finished_kdata_timestamp(records[-1].timestamp,
                                               level=self.level):
                self.session.delete(records[-1])
                return records[0]
            return records[-1]
        return None
Example #20
    def load_window_df(self, provider, data_schema):
        window_df = None
        if not self.entity_ids:
            self.entity_ids = get_entity_ids(provider='eastmoney', entity_type=self.entity_type,
                                             exchanges=self.exchanges,
                                             codes=self.codes)
        dfs = []
        for entity_id in self.entity_ids:
            df = get_data(provider=provider,
                          data_schema=data_schema,
                          start_timestamp=self.start_timestamp,
                          index=[self.category_field, self.time_field],
                          order=data_schema.timestamp.desc(),
                          entity_id=entity_id,
                          limit=self.computing_window)
            if pd_is_not_null(df):
                dfs.append(df)
        if dfs:
            window_df = pd.concat(dfs)
            window_df = window_df.sort_index(level=[0, 1])
        return window_df
Example #21
    def init_entities(self):
        items = get_data(
            data_schema=self.data_schema,
            session=self.session,
            provider=self.provider,
            entity_id='user_github_mojombo',
            filters=[self.data_schema.id == 'user_github_mojombo'],
            return_type='domain')

        first_user = GithubUser(
            id='user_github_mojombo',
            entity_id='user_github_mojombo',
            node_id='MDQ6VXNlcjE=',
            avatar_url='https://avatars0.githubusercontent.com/u/1?v=4',
            gravatar_id=None,
            site_admin=False,
            code='mojombo',
            name='Tom Preston-Werner',
            company=None,
            blog='http://tom.preston-werner.com',
            location='San Francisco',
            email=None,
            hireable=False,
            bio=None,
            public_repos=61,
            public_gists=62,
            followers=21529,
            following=11,
            timestamp=to_pd_timestamp(to_time_str("2007-10-20T05:24:19Z")),
            created_timestamp=to_pd_timestamp(
                to_time_str("2007-10-20T05:24:19Z")),
            updated_timestamp=to_pd_timestamp(
                to_time_str("2019-06-25T17:22:10Z")))

        if not items:
            self.session.add(first_user)
            self.session.commit()

        self.entities = [first_user]
Example #22
    def __init__(
            self,
            data_schema: object,
            entity_ids: List[str] = None,
            entity_type: str = 'stock',
            exchanges: List[str] = ['sh', 'sz'],
            codes: List[str] = None,
            the_timestamp: Union[str, pd.Timestamp] = None,
            start_timestamp: Union[str, pd.Timestamp] = None,
            end_timestamp: Union[str, pd.Timestamp] = None,
            columns: List = None,
            filters: List = None,
            order: object = None,
            limit: int = None,
            provider: str = 'eastmoney',
            level: Union[str, IntervalLevel] = IntervalLevel.LEVEL_1DAY,
            category_field: str = 'entity_id',
            time_field: str = 'timestamp',
            computing_window: int = 250,
            # child added arguments
            keep_all_timestamp: bool = False,
            fill_method: str = 'ffill',
            effective_number: int = 10,
            transformer: Transformer = None,
            accumulator: Accumulator = None,
            need_persist: bool = True,
            dry_run: bool = False) -> None:

        super().__init__(data_schema, entity_ids, entity_type, exchanges,
                         codes, the_timestamp, start_timestamp, end_timestamp,
                         columns, filters, order, limit, provider, level,
                         category_field, time_field, computing_window)

        self.factor_name = type(self).__name__.lower()

        self.keep_all_timestamp = keep_all_timestamp
        self.fill_method = fill_method
        self.effective_number = effective_number
        self.transformer = transformer
        self.accumulator = accumulator

        self.need_persist = need_persist
        self.dry_run = dry_run

        # factor computation result; persistable
        self.factor_df: pd.DataFrame = None
        # intermediate result; not persisted
        self.pipe_df: pd.DataFrame = None
        # result_df is the standard df used for target selection
        self.result_df: pd.DataFrame = None

        # accumulate-style computation needs the previous factor_df, e.g. some market-wide statistics
        if self.need_persist:
            # if only computing the factor, just read the factor_df inside valid_window
            if self.dry_run:
                self.factor_df = self.load_window_df(
                    provider='zvt', data_schema=self.factor_schema)
            else:
                self.factor_df = get_data(
                    provider='zvt',
                    data_schema=self.factor_schema,
                    start_timestamp=self.start_timestamp,
                    index=[self.category_field, self.time_field])

        if pd_is_not_null(self.factor_df):
            dfs = []
            for entity_id, df in self.data_df.groupby(level=0):
                if entity_id in self.factor_df.index.levels[0]:
                    df = df[df.timestamp >= self.factor_df.loc[(
                        entity_id, )].index[0]]
                dfs.append(df)

            self.data_df = pd.concat(dfs)

        self.register_data_listener(self)
Example #23
    def move_on(self, to_timestamp: Union[str, pd.Timestamp] = None,
                timeout: int = 20) -> bool:
        """
        get the data that happened before to_timestamp; if not set, get all the data up to now

        Parameters
        ----------
        to_timestamp : the timestamp up to which data is fetched
        timeout : the time to wait for the data to be ready, in seconds

        Returns
        -------
        whether new data was got
        """
        if not df_is_not_null(self.data_df):
            self.load_data()
            return False

        df = self.data_df.reset_index(level='timestamp')
        recorded_timestamps = df.groupby(level=0)['timestamp'].max()

        self.logger.info('level:{},current_timestamps:\n{}'.format(self.level, recorded_timestamps))

        changed = False
        # FIXME:we suppose history data should be there at first
        start_time = time.time()
        for category, recorded_timestamp in recorded_timestamps.items():
            while True:
                category_filter = [self.category_column == category]
                if self.filters:
                    filters = self.filters + category_filter
                else:
                    filters = category_filter

                added = get_data(data_schema=self.data_schema, provider=self.provider, columns=self.columns,
                                 start_timestamp=recorded_timestamp,
                                 end_timestamp=to_timestamp, filters=filters, level=self.level)

                if df_is_not_null(added):
                    would_added = added[added['timestamp'] != recorded_timestamp].copy()
                    if not would_added.empty:
                        added = index_df_with_category_xfield(would_added, category_field=self.category_field,
                                                              xfield=self.time_field)
                        self.logger.info('category:{},added:\n{}'.format(category, added))

                        self.data_df = pd.concat([self.data_df, added])
                        self.data_df = self.data_df.sort_index(level=[0, 1])

                        for listener in self.data_listeners:
                            listener.on_category_data_added(category=category, added_data=added)
                        changed = True
                        # if got data,just move to another category
                        break

                cost_time = time.time() - start_time
                if cost_time > timeout:
                    self.logger.warning(
                        'category:{} level:{} getting data timeout,to_timestamp:{},now:{}'.format(category, self.level,
                                                                                                  to_timestamp,
                                                                                                  now_pd_timestamp()))
                    break

        if changed:
            for listener in self.data_listeners:
                listener.on_data_changed(self.data_df)

        return changed
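
For context, a hedged sketch of how a caller might drive this polling method; reader stands in for whatever instance hosts move_on, and process is a placeholder for downstream logic:

if reader.move_on(timeout=20):
    # on_data_changed has already fired for all registered listeners
    process(reader.data_df)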
Example #24
    def move_on(self, to_timestamp: Union[str, pd.Timestamp] = None,
                timeout: int = 20) -> object:
        """
        fetch data continually in realtime
        1) get the data that happened before to_timestamp; if not set, get all the data up to now
        2) if computing_window is set, data_df is cut to save memory


        :param to_timestamp:
        :type to_timestamp:
        :param timeout:
        :type timeout: int
        :return:
        :rtype:
        """
        if not pd_is_not_null(self.data_df):
            self.load_data()
            return

        start_time = time.time()

        # FIXME:we suppose history data should be there at first
        has_got = []
        dfs = []
        changed = False
        while True:
            for entity_id, df in self.data_df.groupby(level=0):
                if entity_id in has_got:
                    continue

                recorded_timestamp = df['timestamp'].max()

                if self.computing_window:
                    df = df.iloc[-self.computing_window:]

                added_filter = [self.category_col == entity_id, self.time_col > recorded_timestamp]
                if self.filters:
                    filters = self.filters + added_filter
                else:
                    filters = added_filter

                added_df = get_data(data_schema=self.data_schema, provider=self.provider, columns=self.columns,
                                    end_timestamp=to_timestamp, filters=filters, level=self.level,
                                    index=[self.category_field, self.time_field])

                if pd_is_not_null(added_df):
                    self.logger.info('entity_id:{},added:\n{}'.format(entity_id, added_df))

                    for listener in self.data_listeners:
                        listener.on_entity_data_changed(entity=entity_id, added_data=added_df)
                    # if got data,just move to another entity_id
                    changed = True
                    has_got.append(entity_id)
                    df = pd.concat([df, added_df], sort=False)
                    dfs.append(df)
                else:
                    cost_time = time.time() - start_time
                    if cost_time > timeout:
                        # if timeout,just add the old data
                        has_got.append(entity_id)
                        dfs.append(df)
                        self.logger.warning(
                            'category:{} level:{} getting data timeout,to_timestamp:{},now:{}'.format(entity_id,
                                                                                                      self.level,
                                                                                                      to_timestamp,
                                                                                                      now_pd_timestamp()))
                        continue

            if len(has_got) == len(self.data_df.index.levels[0]):
                break

        if dfs:
            self.data_df = pd.concat(dfs, sort=False)
            self.data_df = self.data_df.sort_index(level=[0, 1])

            if changed:
                for listener in self.data_listeners:
                    listener.on_data_changed(self.data_df)
Example #25
    def __init__(self,
                 data_schema: object,
                 entity_ids: List[str] = None,
                 entity_type: str = 'stock',
                 exchanges: List[str] = ['sh', 'sz'],
                 codes: List[str] = None,
                 the_timestamp: Union[str, pd.Timestamp] = None,
                 start_timestamp: Union[str, pd.Timestamp] = None,
                 end_timestamp: Union[str, pd.Timestamp] = None,
                 columns: List = None,
                 filters: List = None,
                 order: object = None,
                 limit: int = None,
                 provider: str = 'eastmoney',
                 level: Union[str, IntervalLevel] = IntervalLevel.LEVEL_1DAY,
                 category_field: str = 'entity_id',
                 time_field: str = 'timestamp',
                 computing_window: int = None,
                 # child added arguments
                 keep_all_timestamp: bool = False,
                 fill_method: str = 'ffill',
                 effective_number: int = None,
                 transformer: Transformer = None,
                 accumulator: Accumulator = None,
                 persist_factor: bool = False,
                 dry_run: bool = False) -> None:
        """

        :param data_schema:
        :param entity_ids:
        :param entity_type:
        :param exchanges:
        :param codes:
        :param the_timestamp:
        :param start_timestamp:
        :param end_timestamp:
        :param columns:
        :param filters:
        :param order:
        :param limit:
        :param provider:
        :param level:
        :param category_field:
        :param time_field:
        :param computing_window: the window size for computing factor
        :param keep_all_timestamp: whether to fill all timestamp gaps, default False
        :param fill_method:
        :param effective_number:
        :param transformer:
        :param accumulator:
        :param persist_factor: whether persist factor
        :param dry_run: True for just computing factor, False for backtesting
        """
        self.entity_type = entity_type
        if self.entity_type == 'stock':
            self.entity_provider = 'eastmoney'
        elif self.entity_type == 'coin':
            self.entity_provider = 'ccxt'
        else:
            self.entity_provider = 'joinquant'

        super().__init__(data_schema, self.entity_provider, entity_ids, entity_type, exchanges, codes, the_timestamp,
                         start_timestamp, end_timestamp, columns, filters, order, limit, provider, level,
                         category_field, time_field, computing_window)

        self.factor_name = type(self).__name__.lower()

        self.keep_all_timestamp = keep_all_timestamp
        self.fill_method = fill_method
        self.effective_number = effective_number
        self.transformer = transformer
        self.accumulator = accumulator

        self.persist_factor = persist_factor
        self.dry_run = dry_run

        # intermediate result; not persisted
        # data_df -> pipe_df
        self.pipe_df: pd.DataFrame = None

        # factor computation result; persistable, computed from pipe_df
        # pipe_df -> factor_df
        self.factor_df: pd.DataFrame = None

        # result_df is the standard df used for target selection, computed from factor_df
        # factor_df -> result_df
        self.result_df: pd.DataFrame = None

        if self.persist_factor:
            if self.dry_run:
                # if only computing the factor, just read the factor_df inside acc_window
                if self.accumulator is not None:
                    self.factor_df = self.load_window_df(provider='zvt', data_schema=self.factor_schema,
                                                         window=accumulator.acc_window)

            else:
                self.factor_df = get_data(provider='zvt',
                                          data_schema=self.factor_schema,
                                          start_timestamp=self.start_timestamp,
                                          end_timestamp=self.end_timestamp,
                                          index=[self.category_field, self.time_field])

            # trim data_df according to the already-computed factor_df and computing_window
            if pd_is_not_null(self.data_df):
                dfs = []
                for entity_id, df in self.data_df.groupby(level=0):
                    latest_saved = get_data(provider='zvt',
                                            data_schema=self.factor_schema,
                                            entity_id=entity_id,
                                            order=self.factor_schema.timestamp.desc(),
                                            limit=1,
                                            index=[self.category_field, self.time_field], return_type='domain')
                    if latest_saved:
                        df1 = df[df.timestamp < latest_saved[0].timestamp].iloc[-self.computing_window:]
                        if pd_is_not_null(df1):
                            df = df[df.timestamp >= df1.iloc[0].timestamp]
                    dfs.append(df)

                self.data_df = pd.concat(dfs)

        self.register_data_listener(self)
Example #26
                            xref='x',
                            yref='y',
                            text=item['flag'],
                            showarrow=True,
                            align='center',
                            arrowhead=2,
                            arrowsize=1,
                            arrowwidth=2,
                            # arrowcolor='#030813',
                            ax=-10,
                            ay=-30,
                            bordercolor='#c7c7c7',
                            borderwidth=1,
                            bgcolor=color,
                            opacity=0.8))

    return annotations


if __name__ == '__main__':
    df = get_data(data_schema=Stock1dKdata,
                  provider='joinquant',
                  entity_ids=['stock_sz_000001', 'stock_sz_000002'])
    df1 = get_data(data_schema=Stock1dMaStateStats,
                   provider='zvt',
                   entity_ids=['stock_sz_000001', 'stock_sz_000002'],
                   columns=['current_count'])

    drawer = Drawer(df, df1[['current_count']])
    drawer.draw_kline()