Example no. 1
0
def get_top_entities(data_schema: Mixin, column: str, start_timestamp=None, end_timestamp=None, pct=0.1,
                     method: WindowMethod = WindowMethod.change,
                     return_type: TopType = None, filters=None):
    """
    get top entities in specific domain between time range

    :param data_schema: schema in domain (must expose ``query_data``)
    :param column: schema column to rank by
    :param start_timestamp: window start (inclusive)
    :param end_timestamp: window end (inclusive)
    :param pct: fraction of entities to keep, range (0,1]
    :param method: WindowMethod (or its string value) used to score entities
    :param return_type: TopType side(s) to return; None means both
    :param filters: extra filters forwarded to ``query_data``
    :return: (positive_df, negative_df); each is a DataFrame indexed by
             entity_id with one 'score' column, or None when not requested
    """
    # Accept plain strings for the enum parameters.
    if isinstance(method, str):
        method = WindowMethod(method)

    if isinstance(return_type, str):
        return_type = TopType(return_type)

    all_df = data_schema.query_data(start_timestamp=start_timestamp, end_timestamp=end_timestamp,
                                    columns=['entity_id', column], filters=filters)
    tops = {}
    for entity_id, df in all_df.groupby('entity_id'):
        if method == WindowMethod.change:
            start = df[column].iloc[0]
            end = df[column].iloc[-1]
            # abs() keeps the sign of the move; guard against division by zero.
            tops[entity_id] = (end - start) / abs(start) if start != 0 else 0
        elif method == WindowMethod.avg:
            tops[entity_id] = df[column].mean()
        elif method == WindowMethod.sum:
            tops[entity_id] = df[column].sum()

    positive_df = None
    negative_df = None
    top_count = int(len(tops) * pct)
    col = 'score'
    if return_type is None or return_type == TopType.positive:
        # Biggest scores first; slicing the sorted items replaces the old
        # itertools.islice + discarded (no-op) sort_values calls, and passing
        # columns= keeps the frame valid even when the slice is empty.
        ranked = sorted(tops.items(), key=lambda item: item[1], reverse=True)
        positive_df = pd.DataFrame.from_dict(dict(ranked[:top_count]), orient='index', columns=[col])
    if return_type is None or return_type == TopType.negative:
        # Smallest scores first.
        ranked = sorted(tops.items(), key=lambda item: item[1])
        negative_df = pd.DataFrame.from_dict(dict(ranked[:top_count]), orient='index', columns=[col])

    return positive_df, negative_df
Example no. 2
0
def get_top_entities(data_schema: "Mixin",
                     column,
                     start_timestamp=None,
                     end_timestamp=None,
                     pct=0.1,
                     method='change',
                     return_type='both',
                     filters=None):
    """
    get top entities in specific domain between time range

    :param data_schema: schema in domain (must expose ``query_data``)
    :param column: schema column to rank by
    :param start_timestamp: window start (inclusive)
    :param end_timestamp: window end (inclusive)
    :param pct: fraction of entities to keep, range (0,1]
    :param method: 'change', 'avg' or 'sum'
    :param return_type: 'positive', 'negative' or 'both'
    :param filters: extra filters forwarded to ``query_data``
    :return: (positive_df, negative_df); each is a DataFrame indexed by
             entity_id with one 'score' column, or None when not requested
    """
    all_df = data_schema.query_data(start_timestamp=start_timestamp,
                                    end_timestamp=end_timestamp,
                                    columns=['entity_id', column],
                                    filters=filters)
    tops = {}
    for entity_id, df in all_df.groupby('entity_id'):
        if method == 'change':
            start = df[column].iloc[0]
            end = df[column].iloc[-1]
            # abs() keeps the sign of the move; guard against division by zero.
            tops[entity_id] = (end - start) / abs(start) if start != 0 else 0
        elif method == 'avg':
            tops[entity_id] = df[column].mean()
        elif method == 'sum':
            tops[entity_id] = df[column].sum()

    positive_df = None
    negative_df = None
    top_count = int(len(tops) * pct)
    col = 'score'
    if return_type in ('positive', 'both'):
        # Biggest scores first; slicing the sorted items replaces the old
        # itertools.islice + discarded (no-op) sort_values calls, and passing
        # columns= keeps the frame valid even when the slice is empty.
        ranked = sorted(tops.items(), key=lambda item: item[1], reverse=True)
        positive_df = pd.DataFrame.from_dict(dict(ranked[:top_count]),
                                             orient='index', columns=[col])
    if return_type in ('negative', 'both'):
        # Smallest scores first.
        ranked = sorted(tops.items(), key=lambda item: item[1])
        negative_df = pd.DataFrame.from_dict(dict(ranked[:top_count]),
                                             orient='index', columns=[col])

    return positive_df, negative_df
Example no. 3
0
    def __init__(self, data_schema: Mixin = FinanceFactor, entity_schema: EntityMixin = Stock, provider: str = None,
                 entity_provider: str = None, entity_ids: List[str] = None, exchanges: List[str] = None,
                 codes: List[str] = None, the_timestamp: Union[str, pd.Timestamp] = None,
                 start_timestamp: Union[str, pd.Timestamp] = None, end_timestamp: Union[str, pd.Timestamp] = None,
                 columns: List = None, filters: List = None, order: object = None, limit: int = None,
                 level: Union[str, IntervalLevel] = IntervalLevel.LEVEL_1DAY, category_field: str = 'entity_id',
                 time_field: str = 'timestamp', computing_window: int = None, keep_all_timestamp: bool = False,
                 fill_method: str = 'ffill', effective_number: int = None, transformer: Transformer = None,
                 accumulator: Accumulator = None, need_persist: bool = False, dry_run: bool = False) -> None:
        """
        Initialize with FinanceFactor/Stock defaults.

        Adds exactly one behavior on top of the parent initializer: when the
        caller requests no ``columns``, fall back to
        ``data_schema.important_cols()``. Every argument is then forwarded
        positionally to ``super().__init__`` unchanged — see the parent class
        for each parameter's meaning.

        NOTE(review): assumes the parent initializer's positional parameter
        order matches the order used here — confirm against the base class.
        """
        # Default to the columns the schema itself marks as important.
        if not columns:
            columns = data_schema.important_cols()

        super().__init__(data_schema, entity_schema, provider, entity_provider, entity_ids, exchanges, codes,
                         the_timestamp, start_timestamp, end_timestamp, columns, filters, order, limit, level,
                         category_field, time_field, computing_window, keep_all_timestamp, fill_method,
                         effective_number, transformer, accumulator, need_persist, dry_run)
Example no. 4
0
def get_top_entities(
    data_schema: Mixin,
    column: str,
    start_timestamp=None,
    end_timestamp=None,
    pct=0.1,
    method: WindowMethod = WindowMethod.change,
    return_type: TopType = None,
    kdata_filters=None,
    show_name=False,
    data_provider=None,
):
    """
    get top entities in specific domain between time range

    :param data_schema: schema in domain (must expose ``query_data``)
    :param column: schema column to rank by
    :param start_timestamp: window start (inclusive)
    :param end_timestamp: window end (inclusive)
    :param pct: fraction of entities to keep, range (0,1]
    :param method: WindowMethod (or its string value) used to score entities
    :param return_type: TopType side(s) to return; None means both
    :param kdata_filters: extra filters forwarded to ``query_data``
    :param show_name: when True, add the entity name as a 'name' column
    :param data_provider: provider forwarded to ``query_data``
    :return: (positive_df, negative_df), each indexed by entity_id with a
             'score' column (plus 'name' when show_name is True); (None, None)
             when the query returns no data
    """
    # Accept plain strings for the enum parameters.
    if isinstance(method, str):
        method = WindowMethod(method)

    if isinstance(return_type, str):
        return_type = TopType(return_type)

    columns = ["entity_id", column, "name"] if show_name else ["entity_id", column]

    all_df = data_schema.query_data(
        start_timestamp=start_timestamp,
        end_timestamp=end_timestamp,
        columns=columns,
        filters=kdata_filters,
        provider=data_provider,
    )
    if not pd_is_not_null(all_df):
        return None, None

    tops = {}
    names = {}
    for entity_id, df in all_df.groupby("entity_id"):
        if method == WindowMethod.change:
            start = df[column].iloc[0]
            end = df[column].iloc[-1]
            # abs() keeps the sign of the move; guard against division by zero.
            tops[entity_id] = (end - start) / abs(start) if start != 0 else 0
        elif method == WindowMethod.avg:
            tops[entity_id] = df[column].mean()
        elif method == WindowMethod.sum:
            tops[entity_id] = df[column].sum()

        if show_name:
            names[entity_id] = df["name"].iloc[0]

    positive_df = None
    negative_df = None
    top_count = int(len(tops) * pct)
    col = "score"
    if return_type is None or return_type == TopType.positive:
        # Biggest scores first; slicing the sorted items replaces the old
        # itertools.islice + discarded (no-op) sort_values calls, and passing
        # columns= keeps the frame valid even when the slice is empty.
        ranked = sorted(tops.items(), key=lambda item: item[1], reverse=True)
        positive_df = pd.DataFrame.from_dict(dict(ranked[:top_count]), orient="index", columns=[col])
    if return_type is None or return_type == TopType.negative:
        # Smallest scores first.
        ranked = sorted(tops.items(), key=lambda item: item[1])
        negative_df = pd.DataFrame.from_dict(dict(ranked[:top_count]), orient="index", columns=[col])

    if names:
        # names covers every grouped entity_id, so plain indexing is safe.
        if pd_is_not_null(positive_df):
            positive_df["name"] = positive_df.index.map(lambda x: names[x])
        if pd_is_not_null(negative_df):
            negative_df["name"] = negative_df.index.map(lambda x: names[x])
    return positive_df, negative_df