def _get_sq_field_for_blender_field(field,
                                    queries,
                                    field_maps,
                                    reference=None):
    unmodified_field = find_field_in_modified_field(field)
    field_alias = alias_selector(reference_type_alias(field, reference))

    # search for the field in each field map to determine which subquery it will be in
    for query, field_map in zip(queries, field_maps):
        if query is None or unmodified_field not in field_map:
            continue

        mapped_field = field_map[unmodified_field]
        mapped_field_alias = alias_selector(
            reference_type_alias(mapped_field, reference))

        subquery_field = query[mapped_field_alias]
        # case #1: modified fields, e.g. day(timestamp) or rollup(dimension)
        return field.for_(subquery_field).as_(field_alias)

    # Need to copy the metrics if there are references so that the `get_sql` monkey patch does not conflict.
    # Given some of them might have nested metrics themselves, the clone process is performed recursively.

    definition = field.definition

    while isinstance(definition, Field):
        definition = definition.definition

    # case #2: complex blender fields
    return _deepcopy_recursive(definition).as_(field_alias)
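
The comment above refers to fireant cloning term definitions so that a per-reference `get_sql` monkey patch applied to one copy cannot leak into another. `_deepcopy_recursive` itself is not shown in this example; the toy below (not fireant code) only illustrates why a deep copy isolates such a patch.

import copy

class Term:
    def get_sql(self):
        return "metric"

shared = Term()
patched = copy.deepcopy(shared)
# Patch get_sql on the clone only, e.g. for a day-over-day reference:
patched.get_sql = lambda: "metric_dod"

assert shared.get_sql() == "metric"      # original definition untouched
assert patched.get_sql() == "metric_dod"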
Example #2
def map_blender_field_to_dataset_field(field, field_map, dataset):
    field_from_blender = find_field_in_modified_field(field)
    if field_from_blender in dataset.fields:
        return field

    if field_from_blender in field_map:
        return field.for_(field_map[field_from_blender])
Example #3
def make_reference_dimensions(dimensions, ref_dimension, offset_func,
                              field_transformer, trunc_date):
    return [
        _replace_reference_dimension(dimension, offset_func, field_transformer,
                                     trunc_date) if
        ref_dimension is find_field_in_modified_field(dimension) else dimension
        for dimension in dimensions
    ]
Example #4
    def _map_fields(fields):
        """
        TODO describe this
        """
        for field in fields:
            field_from_blender = find_field_in_modified_field(field)

            if field_from_blender in dataset.fields:
                yield field
                continue
            if field_from_blender not in field_map:
                continue

            yield field.for_(field_map[field_from_blender])
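
A standalone illustration of the three cases `_map_fields` handles, using plain strings in place of `Field` objects (all names here are made up):

dataset_fields = {'clicks'}
field_map = {'blended_votes': 'votes'}

def _map(fields):
    for f in fields:
        if f in dataset_fields:   # field already belongs to the dataset
            yield f
        elif f in field_map:      # blender field mapped to a dataset field
            yield field_map[f]
        # otherwise the dataset cannot satisfy the field, so it is skipped

assert list(_map(['clicks', 'blended_votes', 'other'])) == ['clicks', 'votes']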
Example #5
def fetch_data(
        database: Database,
        queries: List[Type[QueryBuilder]],
        dimensions: Iterable[Field],
        share_dimensions: Iterable[Field] = (),
        reference_groups=(),
) -> Tuple[int, pd.DataFrame]:
    queries = [str(query) for query in queries]

    # Indicate which dimensions need to be parsed as date types by mapping
    # each date dimension's alias to PANDAS_TO_DATETIME_FORMAT
    pandas_parse_dates = {}
    for dimension in dimensions:
        unmodified_dimension = find_field_in_modified_field(dimension)
        if unmodified_dimension.data_type == DataType.date:
            pandas_parse_dates[alias_selector(
                unmodified_dimension.alias)] = PANDAS_TO_DATETIME_FORMAT

    results = database.fetch_dataframes(*queries,
                                        parse_dates=pandas_parse_dates)
    max_rows_returned = 0
    for result_df in results:
        row_count = len(result_df)
        if row_count > max_rows_returned:
            max_rows_returned = row_count
        if row_count > database.max_result_set_size:
            logger.warning('row_count_over_max',
                           extra={
                               'row_count': len(result_df),
                               'database': str(database)
                           })
            # drop all result rows above database.max_result_set_size in place
            result_df.drop(result_df.index[database.max_result_set_size:],
                           inplace=True)

    logger.info('max_rows_returned',
                extra={
                    'row_count': max_rows_returned,
                    'database': str(database)
                })
    return max_rows_returned, reduce_result_set(results, reference_groups,
                                                dimensions, share_dimensions)
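
The in-place truncation above is plain pandas; a minimal standalone sketch (the cap value here is made up):

import pandas as pd

df = pd.DataFrame({'metric': range(10)})
max_result_set_size = 5

# Drop every row past the cap, mutating the frame in place as above
df.drop(df.index[max_result_set_size:], inplace=True)
assert len(df) == max_result_set_size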
Example #6
    def _get_sq_field_for_blender_field(field, reference=None):
        unmodified_field = find_field_in_modified_field(field)
        field_alias = alias_selector(reference_alias(field, reference))

        # search for the field in each field map to determine which subquery it will be in
        for query, field_map in zip(queries, field_maps):
            if unmodified_field not in field_map:
                continue

            mapped_field = field_map[unmodified_field]
            mapped_field_alias = alias_selector(
                reference_alias(mapped_field, reference))

            subquery_field = query[mapped_field_alias]
            # case #1: modified fields, e.g. day(timestamp) or rollup(dimension)
            return field.for_(subquery_field).as_(field_alias)

        # Need to copy the metrics if there are references so that the `get_sql` monkey patch does not conflict
        definition = copy.deepcopy(field.definition)
        # case #2: complex blender fields
        return definition.as_(field_alias)
Example #7
def _blender_join_criteria(base_query, join_query, dimensions, base_field_map,
                           join_field_map):
    """
    Build a criteria for joining this join query to the base query in datset blender queries. This should be a set of
    equality conditions like A0=B0 AND A1=B1 AND An=Bn for each mapped dimension between dataset from
    `DataSetBlender.dimension_map`.
    """
    join_criteria = None
    for dimension in dimensions:
        dimension = find_field_in_modified_field(dimension)
        if not all([dimension in base_field_map, dimension in join_field_map]):
            continue

        alias0, alias1 = [
            alias_selector(field_map[dimension].alias)
            for field_map in [base_field_map, join_field_map]
        ]

        next_criteria = base_query[alias0] == join_query[alias1]
        join_criteria = (next_criteria if join_criteria is None else
                         (join_criteria & next_criteria))

    return join_criteria
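
The loop above is a left fold that ANDs together one equality per mapped dimension. An equivalent toy sketch using plain pypika tables in place of the subqueries (table and column names are made up):

from functools import reduce
from pypika import Table

base_sq, join_sq = Table('base_sq'), Table('join_sq')
pairs = [('timestamp', 'timestamp'), ('device', 'device')]

join_criteria = reduce(
    lambda acc, p: acc & (base_sq.field(p[0]) == join_sq.field(p[1])),
    pairs[1:],
    base_sq.field(pairs[0][0]) == join_sq.field(pairs[0][1]),
)
# e.g. "base_sq"."timestamp"="join_sq"."timestamp" AND "base_sq"."device"="join_sq"."device"
print(join_criteria)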