Beispiel #1
0
    def dframe(self, query_args=None, keep_parent_ids=False, padded=False,
               index=False, reload_=False, keep_mongo_keys=False):
        """Fetch the dframe for this dataset.

        Only the default view (no query arguments and every shaping flag
        left False) is stored in, or served from, the per-instance cache.
        Any other combination of arguments is always rebuilt from the
        observations and never written to the cache.

        :param query_args: An optional QueryArgs to hold the query arguments.
        :param keep_parent_ids: Do not remove parent IDs from the dframe,
            default False.
        :param padded: Used for joining, default False.
        :param index: Return the index with dframe, default False.
        :param reload_: Force refresh of data, default False.
        :param keep_mongo_keys: Used for updating documents, default False.

        :returns: Return BambooFrame with contents based on query parameters
            passed to MongoDB. BambooFrame will not have parent ids if
            `keep_parent_ids` is False.
        """
        # Bypass the cache whenever a specific version is requested. `index`
        # adds an 'index' column and `keep_mongo_keys` is forwarded to the
        # key decoding step, so both must bypass the cache as well; otherwise
        # a variant frame could be cached and later returned for a plain
        # call (or a cached plain frame returned for a variant call).
        cacheable = not (query_args or keep_parent_ids or padded or index
                         or keep_mongo_keys)

        # use cached copy if we have already fetched it
        if cacheable and not reload_ and self.__is_cached:
            return self.__dframe

        query_args = query_args or QueryArgs()
        observations = self.observations(query_args, as_cursor=True)

        # distinct queries are wrapped directly and are never cached
        if query_args.distinct:
            return BambooFrame(observations)

        dframe = Observation.batch_read_dframe_from_cursor(
            self, observations, query_args.distinct, query_args.limit)

        dframe.decode_mongo_reserved_keys(keep_mongo_keys=keep_mongo_keys)

        # Reserved keys the caller asked to keep; each entry is either the
        # key itself or False, so drop the falsy ones. Built as a real list
        # so the callee never receives a one-shot iterator.
        requested = (keep_parent_ids and PARENT_DATASET_ID, index and INDEX)
        excluded = [key for key in requested if key]
        dframe.remove_bamboo_reserved_keys(excluded)

        if index:
            dframe = BambooFrame(dframe.rename(columns={INDEX: 'index'}))

        dframe = self.__maybe_pad(dframe, padded)

        if cacheable:
            self.__dframe = dframe

        return dframe
Beispiel #2
0
def __merge_datasets(datasets, mapping):
    """Merge two or more datasets into a single concatenated frame.

    :param datasets: Iterable of datasets whose dframes are merged.
    :param mapping: Optional dict keyed by dataset ID holding the column
        rename map to apply to that dataset's dframe before merging. A
        falsy value means no columns are renamed.

    :returns: The result of concatenating every prepared dframe with
        `ignore_index=True`.
    """
    column_maps = mapping or {}

    def prepare(dataset):
        # Fetch, optionally rename columns, then tag with the parent ID.
        frame = dataset.dframe()
        renames = column_maps.get(dataset.dataset_id)

        if renames:
            frame = BambooFrame(frame.rename(columns=renames))

        return frame.add_parent_column(dataset.dataset_id)

    prepared = [prepare(dataset) for dataset in datasets]

    return concat(prepared, ignore_index=True)
Beispiel #3
0
    def dframe(self, query_args=None, keep_parent_ids=False,
               padded=False, index=False):
        """Fetch the dframe for this dataset.

        :param query_args: An optional QueryArgs to hold the query arguments.
            When omitted (or None) a fresh QueryArgs is created for the call.
        :param keep_parent_ids: Do not remove parent IDs from the dframe,
            default False.
        :param padded: Join the result with this dataset's place holder
            dframe on the first column (or return the place holder dframe
            alone when the result has no columns), default False.
        :param index: Return the index with dframe, default False.

        :returns: Return BambooFrame with contents based on query parameters
            passed to MongoDB. BambooFrame will not have parent ids if
            `keep_parent_ids` is False.
        """
        # Build the default per call: a `QueryArgs()` default in the
        # signature is evaluated once at definition time and would be shared
        # by every call that omits the argument.
        if query_args is None:
            query_args = QueryArgs()

        observations = self.observations(query_args, as_cursor=True)

        dframe = self.__batch_read_dframe_from_cursor(
            observations, query_args.distinct, query_args.limit)

        dframe.decode_mongo_reserved_keys()

        # reserved keys the caller asked to keep in the result
        excluded = []

        if keep_parent_ids:
            excluded.append(PARENT_DATASET_ID)
        if index:
            excluded.append(INDEX)

        dframe.remove_bamboo_reserved_keys(excluded)

        if index:
            dframe = BambooFrame(dframe.rename(columns={INDEX: 'index'}))

        if padded:
            if len(dframe.columns):
                # join against the place holder frame keyed on the first
                # column of the result
                on = dframe.columns[0]
                place_holder = self.place_holder_dframe(dframe).set_index(on)
                dframe = BambooFrame(dframe.join(place_holder, on=on))
            else:
                # nothing to join on; fall back to the place holder itself
                dframe = self.place_holder_dframe()

        return dframe
Beispiel #4
0
def encode(dframe, dataset, add_index=True):
    """Encode the columns for `dataset` to slugs and add ID column.

    The ID column is the dataset_id for dataset.  This is
    used to link observations to a specific dataset.

    :param dframe: The DataFrame to encode.
    :param dataset: The Dataset to use a mapping for.
    :param add_index: Add index to the DataFrame, default True.

    :returns: A modified `dframe` as a BambooFrame.
    """
    dframe = BambooFrame(dframe)

    if add_index:
        dframe = dframe.add_index()

    dframe = dframe.add_id_column(dataset.dataset_id)

    # The rename map is computed after the index and ID columns are added,
    # so it is built from the frame's final set of columns.
    encoded_columns_map = dataset.schema.rename_map_for_dframe(dframe)

    # Re-wrap the renamed frame so the documented BambooFrame return type
    # holds even if `rename` hands back a plain DataFrame.
    return BambooFrame(dframe.rename(columns=encoded_columns_map))