Example #1
    def on_finish_entity(self, entity):
        super().on_finish_entity(entity)

        if not self.fetch_jq_timestamp:
            return

        # fill in the timestamp (the report's publish date) for records that still need it
        the_data_list = get_data(
            data_schema=self.data_schema,
            provider=self.provider,
            entity_id=entity.id,
            order=self.data_schema.timestamp.asc(),
            return_type='domain',
            session=self.session,
            filters=[
                self.data_schema.timestamp == self.data_schema.report_date,
                self.data_schema.timestamp >= to_pd_timestamp('2005-01-01')
            ])
        if the_data_list:
            if self.data_schema == FinanceFactor:
                for the_data in the_data_list:
                    self.fill_timestamp_with_jq(entity, the_data)
            else:
                df = get_finance_factor(
                    entity_id=entity.id,
                    columns=[
                        FinanceFactor.timestamp, FinanceFactor.report_date,
                        FinanceFactor.id
                    ],
                    filters=[
                        FinanceFactor.timestamp != FinanceFactor.report_date,
                        FinanceFactor.timestamp >= to_pd_timestamp('2005-01-01'),
                        FinanceFactor.report_date >= the_data_list[0].report_date,
                        FinanceFactor.report_date <= the_data_list[-1].report_date,
                    ])

                if pd_is_not_null(df):
                    index_df(df, index='report_date', time_field='report_date')

                for the_data in the_data_list:
                    if pd_is_not_null(df) and the_data.report_date in df.index:
                        the_data.timestamp = df.at[the_data.report_date, 'timestamp']
                        self.logger.info(
                            'db fill {} {} timestamp:{} for report_date:{}'.format(
                                self.data_schema, entity.id, the_data.timestamp,
                                the_data.report_date))
                        self.session.commit()
                    else:
                        # publish date not in the local db yet; fall back to
                        # filling the timestamp via jq
                        self.fill_timestamp_with_jq(entity, the_data)
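
The core pattern above is to index an auxiliary DataFrame by report_date and then look up each record's publish timestamp with df.at. A minimal, self-contained sketch of that lookup (the column names mirror the example; the dates are made up):

import pandas as pd

# hypothetical lookup table: report_date -> publish timestamp
df = pd.DataFrame({
    'report_date': pd.to_datetime(['2020-03-31', '2020-06-30']),
    'timestamp': pd.to_datetime(['2020-04-25', '2020-08-28']),
})
# roughly what index_df(df, index='report_date', ...) is used for here
df = df.set_index('report_date', drop=False)

report_date = pd.Timestamp('2020-06-30')
if report_date in df.index:
    publish_ts = df.at[report_date, 'timestamp']  # fast scalar lookup by label
    print(publish_ts)  # 2020-08-28 00:00:00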
Example #2
def get_data(data_schema,
             entity_ids: List[str] = None,
             entity_id: str = None,
             codes: List[str] = None,
             level: Union[IntervalLevel, str] = None,
             provider: str = None,
             columns: List = None,
             return_type: str = 'df',
             start_timestamp: Union[pd.Timestamp, str] = None,
             end_timestamp: Union[pd.Timestamp, str] = None,
             filters: List = None,
             session: Session = None,
             order=None,
             limit: int = None,
             index: str = 'timestamp',
             index_is_time: bool = True,
             time_field: str = 'timestamp'):
    assert data_schema is not None
    assert provider is not None
    assert provider in global_providers

    local_session = False
    if not session:
        session = get_db_session(provider=provider, data_schema=data_schema)
        local_session = True

    try:
        # resolve the time column from its name
        time_col = getattr(data_schema, time_field)

        if columns:
            # support passing column names as strings
            if isinstance(columns[0], str):
                columns = [getattr(data_schema, col) for col in columns]

            if time_col not in columns:
                columns.append(time_col)
            query = session.query(*columns)
        else:
            query = session.query(data_schema)

        if entity_id:
            query = query.filter(data_schema.entity_id == entity_id)
        if codes:
            query = query.filter(data_schema.code.in_(codes))
        if entity_ids:
            query = query.filter(data_schema.entity_id.in_(entity_ids))

        # different levels are always stored in different schemas, so the level
        # param is not very useful now
        if level:
            # some schemas have no level column; just ignore the filter for them
            if hasattr(data_schema, 'level'):
                if isinstance(level, IntervalLevel):
                    level = level.value
                query = query.filter(data_schema.level == level)

        query = common_filter(query,
                              data_schema=data_schema,
                              start_timestamp=start_timestamp,
                              end_timestamp=end_timestamp,
                              filters=filters,
                              order=order,
                              limit=limit,
                              time_field=time_field)

        if return_type == 'df':
            df = pd.read_sql(query.statement, query.session.bind)
            if df_is_not_null(df):
                return index_df(df,
                                drop=False,
                                index=index,
                                index_is_time=index_is_time)
            return df
        elif return_type == 'domain':
            return query.all()
        elif return_type == 'dict':
            return [item.__dict__ for item in query.all()]
    finally:
        if local_session:
            session.close()
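
A hedged usage sketch of get_data, assuming a registered provider named 'eastmoney' and the FinanceFactor schema from Example #1 (both are assumptions here, not guaranteed by the function itself); the parameter names come straight from the signature above:

# hypothetical call: the provider name, schema and entity_id format are assumptions
df = get_data(
    data_schema=FinanceFactor,
    provider='eastmoney',
    entity_id='stock_sz_000338',
    columns=[FinanceFactor.timestamp, FinanceFactor.report_date],
    start_timestamp='2019-01-01',
    order=FinanceFactor.timestamp.asc(),
    limit=100,
    return_type='df')  # 'domain' returns ORM objects, 'dict' returns plain dicts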
Example #3
    def normalize_result_df(self, df):
        if pd_is_not_null(df):
            df = df.reset_index()
            df = index_df(df)
            df = df.sort_values(by=['score', 'entity_id'])
        return df
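
For illustration, a minimal sketch of the same normalization done with plain pandas instead of the project's helpers (pd_is_not_null, index_df), assuming the result frame carries entity_id, timestamp and score columns and that index_df with its defaults builds a timestamp-based index:

import pandas as pd

df = pd.DataFrame({
    'entity_id': ['stock_sz_000002', 'stock_sz_000001'],
    'timestamp': pd.to_datetime(['2020-01-03', '2020-01-03']),
    'score': [0.7, 0.9],
})

if df is not None and not df.empty:  # the check pd_is_not_null performs
    df = df.reset_index(drop=True)
    df = df.set_index('timestamp', drop=False)  # stand-in for index_df(df)
    df = df.sort_values(by=['score', 'entity_id'])
print(df)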