def get_top_entities(data_schema: Mixin, column: str, start_timestamp=None, end_timestamp=None, pct=0.1,
                     method: WindowMethod = WindowMethod.change, return_type: TopType = None, filters=None):
    """
    get top entities in specific domain between time range

    :param data_schema: schema in domain
    :param column: schema column used for scoring
    :param start_timestamp: window start
    :param end_timestamp: window end
    :param pct: fraction of entities to keep on each side, range (0,1]
    :param method: how to score each entity over the window (change/avg/sum)
    :param return_type: which side(s) to compute; None means both
    :param filters: extra filters passed through to query_data
    :return: (positive_df, negative_df); a side not requested is None
    """
    # Accept plain strings for convenience and coerce them to the enums.
    if isinstance(method, str):
        method = WindowMethod(method)
    if isinstance(return_type, str):
        return_type = TopType(return_type)

    all_df = data_schema.query_data(start_timestamp=start_timestamp, end_timestamp=end_timestamp,
                                    columns=['entity_id', column], filters=filters)

    tops = {}
    for entity_id, df in all_df.groupby('entity_id'):
        if method == WindowMethod.change:
            start = df[column].iloc[0]
            end = df[column].iloc[-1]
            # Fix: guard the zero start (previously ZeroDivisionError) and divide
            # by abs(start) so a negative starting value does not invert the sign
            # of the change — consistent with the newer implementation.
            tops[entity_id] = (end - start) / abs(start) if start != 0 else 0
        elif method == WindowMethod.avg:
            tops[entity_id] = df[column].mean()
        elif method == WindowMethod.sum:
            tops[entity_id] = df[column].sum()

    positive_df = None
    negative_df = None
    top_index = int(len(tops) * pct)
    col = 'score'

    if return_type is None or return_type == TopType.positive:
        # from big to small, keep the top slice
        positive_tops = dict(sorted(tops.items(), key=lambda item: item[1], reverse=True)[:top_index])
        positive_df = pd.DataFrame.from_dict(positive_tops, orient='index')
        positive_df.columns = [col]
        # sort_values returns a new frame; the original discarded the result,
        # making the call a no-op — assign it back.
        positive_df = positive_df.sort_values(by=col, ascending=False)

    if return_type is None or return_type == TopType.negative:
        # from small to big, keep the bottom slice
        negative_tops = dict(sorted(tops.items(), key=lambda item: item[1])[:top_index])
        negative_df = pd.DataFrame.from_dict(negative_tops, orient='index')
        negative_df.columns = [col]
        negative_df = negative_df.sort_values(by=col)

    return positive_df, negative_df
def get_top_entities(data_schema: Mixin, column, start_timestamp=None, end_timestamp=None, pct=0.1, method='change', return_type='both', filters=None):
    """
    get top entities in specific domain between time range

    :param data_schema: schema in domain
    :param column: schema column used for scoring
    :param start_timestamp: window start
    :param end_timestamp: window end
    :param pct: fraction of entities to keep on each side, range (0,1]
    :param method: 'change', 'avg' or 'sum'
    :param return_type: 'positive', 'negative' or 'both'
    :param filters: extra filters passed through to query_data
    :return: (positive_df, negative_df); a side not requested is None
    """
    all_df = data_schema.query_data(start_timestamp=start_timestamp, end_timestamp=end_timestamp,
                                    columns=['entity_id', column], filters=filters)

    tops = {}
    for entity_id, df in all_df.groupby('entity_id'):
        if method == 'change':
            start = df[column].iloc[0]
            end = df[column].iloc[-1]
            # Fix: guard the zero start (previously ZeroDivisionError) and divide
            # by abs(start) so a negative starting value does not invert the sign
            # of the change — consistent with the enum-based implementation.
            tops[entity_id] = (end - start) / abs(start) if start != 0 else 0
        elif method == 'avg':
            tops[entity_id] = df[column].mean()
        elif method == 'sum':
            tops[entity_id] = df[column].sum()

    positive_df = None
    negative_df = None
    top_index = int(len(tops) * pct)
    col = 'score'

    if return_type == 'positive' or return_type == 'both':
        # from big to small, keep the top slice
        positive_tops = dict(sorted(tops.items(), key=lambda item: item[1], reverse=True)[:top_index])
        positive_df = pd.DataFrame.from_dict(positive_tops, orient='index')
        positive_df.columns = [col]
        # sort_values returns a new frame; the original discarded the result,
        # making the call a no-op — assign it back.
        positive_df = positive_df.sort_values(by=col, ascending=False)

    if return_type == 'negative' or return_type == 'both':
        # from small to big, keep the bottom slice
        negative_tops = dict(sorted(tops.items(), key=lambda item: item[1])[:top_index])
        negative_df = pd.DataFrame.from_dict(negative_tops, orient='index')
        negative_df.columns = [col]
        negative_df = negative_df.sort_values(by=col)

    return positive_df, negative_df
def __init__(self,
             data_schema: Mixin = FinanceFactor,
             entity_schema: EntityMixin = Stock,
             provider: str = None,
             entity_provider: str = None,
             entity_ids: List[str] = None,
             exchanges: List[str] = None,
             codes: List[str] = None,
             the_timestamp: Union[str, pd.Timestamp] = None,
             start_timestamp: Union[str, pd.Timestamp] = None,
             end_timestamp: Union[str, pd.Timestamp] = None,
             columns: List = None,
             filters: List = None,
             order: object = None,
             limit: int = None,
             level: Union[str, IntervalLevel] = IntervalLevel.LEVEL_1DAY,
             category_field: str = 'entity_id',
             time_field: str = 'timestamp',
             computing_window: int = None,
             keep_all_timestamp: bool = False,
             fill_method: str = 'ffill',
             effective_number: int = None,
             transformer: Transformer = None,
             accumulator: Accumulator = None,
             need_persist: bool = False,
             dry_run: bool = False) -> None:
    """
    Thin pass-through constructor: defaults `columns` to the schema's
    important columns, then forwards every argument positionally to the
    parent factor's __init__.

    NOTE(review): argument order in the super() call must stay in lockstep
    with the parent's signature — presumably identical to this one, but the
    parent is defined outside this file; verify before reordering.
    """
    # If the caller did not pick columns, fall back to the subset the
    # schema itself declares as important.
    if not columns:
        columns = data_schema.important_cols()

    super().__init__(data_schema, entity_schema, provider, entity_provider, entity_ids, exchanges, codes,
                     the_timestamp, start_timestamp, end_timestamp, columns, filters, order, limit, level,
                     category_field, time_field, computing_window, keep_all_timestamp, fill_method,
                     effective_number, transformer, accumulator, need_persist, dry_run)
def get_top_entities(
    data_schema: Mixin,
    column: str,
    start_timestamp=None,
    end_timestamp=None,
    pct=0.1,
    method: WindowMethod = WindowMethod.change,
    return_type: TopType = None,
    kdata_filters=None,
    show_name=False,
    data_provider=None,
):
    """
    get top entities in specific domain between time range

    :param data_schema: schema in domain
    :param column: schema column used for scoring
    :param start_timestamp: window start
    :param end_timestamp: window end
    :param pct: fraction of entities to keep on each side, range (0,1]
    :param method: how to score each entity over the window (change/avg/sum)
    :param return_type: which side(s) to compute; None means both
    :param kdata_filters: extra filters passed through to query_data
    :param show_name: show entity name
    :param data_provider: provider passed through to query_data
    :return: (positive_df, negative_df); (None, None) when no data
    """
    # Accept plain strings for convenience and coerce them to the enums.
    if isinstance(method, str):
        method = WindowMethod(method)
    if isinstance(return_type, str):
        return_type = TopType(return_type)

    # Only fetch the name column when the caller asked for it.
    if show_name:
        columns = ["entity_id", column, "name"]
    else:
        columns = ["entity_id", column]

    all_df = data_schema.query_data(
        start_timestamp=start_timestamp,
        end_timestamp=end_timestamp,
        columns=columns,
        filters=kdata_filters,
        provider=data_provider,
    )
    if not pd_is_not_null(all_df):
        return None, None

    tops = {}
    names = {}
    for entity_id, df in all_df.groupby("entity_id"):
        if method == WindowMethod.change:
            start = df[column].iloc[0]
            end = df[column].iloc[-1]
            # abs(start) keeps the sign of the change correct for a negative
            # start; a zero start scores 0 instead of dividing by zero.
            tops[entity_id] = (end - start) / abs(start) if start != 0 else 0
        elif method == WindowMethod.avg:
            tops[entity_id] = df[column].mean()
        elif method == WindowMethod.sum:
            tops[entity_id] = df[column].sum()
        if show_name:
            names[entity_id] = df["name"].iloc[0]

    positive_df = None
    negative_df = None
    top_index = int(len(tops) * pct)
    col = "score"

    if return_type is None or return_type == TopType.positive:
        # from big to small, keep the top slice
        positive_tops = dict(sorted(tops.items(), key=lambda item: item[1], reverse=True)[:top_index])
        positive_df = pd.DataFrame.from_dict(positive_tops, orient="index")
        positive_df.columns = [col]
        # sort_values returns a new frame; the original discarded the result,
        # making the call a no-op — assign it back.
        positive_df = positive_df.sort_values(by=col, ascending=False)

    if return_type is None or return_type == TopType.negative:
        # from small to big, keep the bottom slice
        negative_tops = dict(sorted(tops.items(), key=lambda item: item[1])[:top_index])
        negative_df = pd.DataFrame.from_dict(negative_tops, orient="index")
        negative_df.columns = [col]
        negative_df = negative_df.sort_values(by=col)

    # Attach entity names, mapped from the index (entity_id).
    if names:
        if pd_is_not_null(positive_df):
            positive_df["name"] = positive_df.index.map(lambda x: names[x])
        if pd_is_not_null(negative_df):
            negative_df["name"] = negative_df.index.map(lambda x: names[x])

    return positive_df, negative_df