def dframe(self, query_args=None, keep_parent_ids=False, padded=False,
           index=False, reload_=False, keep_mongo_keys=False):
    """Fetch the dframe for this dataset.

    :param query_args: An optional QueryArgs to hold the query arguments.
    :param keep_parent_ids: Do not remove parent IDs from the dframe,
        default False.
    :param padded: Used for joining, default False.
    :param index: Return the index with the dframe, default False.
    :param reload_: Force refresh of data, default False.
    :param keep_mongo_keys: Used for updating documents, default False.

    :returns: A BambooFrame with contents based on query parameters passed
        to MongoDB. The BambooFrame will not have parent IDs if
        `keep_parent_ids` is False.
    """
    # bypass the cache when a specific version of the data is requested
    cacheable = not (query_args or keep_parent_ids or padded)

    # use the cached copy if we have already fetched it
    if cacheable and not reload_ and self.__is_cached:
        return self.__dframe

    query_args = query_args or QueryArgs()
    observations = self.observations(query_args, as_cursor=True)

    if query_args.distinct:
        return BambooFrame(observations)

    dframe = Observation.batch_read_dframe_from_cursor(
        self, observations, query_args.distinct, query_args.limit)

    dframe.decode_mongo_reserved_keys(keep_mongo_keys=keep_mongo_keys)

    # keep the parent ID and index columns only when explicitly requested
    excluded = [keep_parent_ids and PARENT_DATASET_ID, index and INDEX]
    dframe.remove_bamboo_reserved_keys(filter(bool, excluded))

    if index:
        dframe = BambooFrame(dframe.rename(columns={INDEX: 'index'}))

    dframe = self.__maybe_pad(dframe, padded)

    if cacheable:
        self.__dframe = dframe

    return dframe

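# A minimal usage sketch for `dframe` above, not part of the module: it
# assumes a fetched `dataset` and that QueryArgs accepts `query` and `limit`
# keyword arguments; the column name in the query is hypothetical.
def example_dframe_usage(dataset):
    # no arguments: the full dframe, cached for subsequent calls
    full = dataset.dframe()

    # passing QueryArgs makes the fetch version-specific, so it bypasses
    # the cache
    filtered = dataset.dframe(
        query_args=QueryArgs(query={'rating': 'good'}, limit=10))

    return full, filtered
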
def __merge_datasets(datasets, mapping):
    """Merge two or more datasets.

    :param datasets: A list of datasets to merge.
    :param mapping: An optional dict mapping dataset IDs to column rename
        maps, applied to each dataset's dframe before concatenation.

    :returns: A DataFrame of the merged datasets, with a parent column
        linking each row to its original dataset.
    """
    dframes = []

    if not mapping:
        mapping = {}

    for dataset in datasets:
        dframe = dataset.dframe()
        column_map = mapping.get(dataset.dataset_id)

        # apply any column renames for this dataset before merging
        if column_map:
            dframe = BambooFrame(dframe.rename(columns=column_map))

        # tag rows with the dataset they came from
        dframe = dframe.add_parent_column(dataset.dataset_id)
        dframes.append(dframe)

    return concat(dframes, ignore_index=True)

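# An illustrative sketch of driving the merge helper above, assuming it is
# called from the same module; the datasets and the column rename in
# `mapping` are assumptions for the example.
def example_merge_usage(dataset_a, dataset_b):
    # rename dataset_b's hypothetical 'total' column to 'amount' so it
    # lines up with dataset_a before concatenation
    mapping = {dataset_b.dataset_id: {'total': 'amount'}}

    return __merge_datasets([dataset_a, dataset_b], mapping)
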
def dframe(self, query_args=None, keep_parent_ids=False, padded=False,
           index=False):
    """Fetch the dframe for this dataset.

    :param query_args: An optional QueryArgs to hold the query arguments.
    :param keep_parent_ids: Do not remove parent IDs from the dframe,
        default False.
    :param padded: Used for joining, default False.
    :param index: Return the index with the dframe, default False.

    :returns: A BambooFrame with contents based on query parameters passed
        to MongoDB. The BambooFrame will not have parent IDs if
        `keep_parent_ids` is False.
    """
    # avoid a mutable default argument: construct QueryArgs per call
    query_args = query_args or QueryArgs()
    observations = self.observations(query_args, as_cursor=True)

    dframe = self.__batch_read_dframe_from_cursor(
        observations, query_args.distinct, query_args.limit)

    dframe.decode_mongo_reserved_keys()

    # keep the parent ID and index columns only when explicitly requested
    excluded = []

    if keep_parent_ids:
        excluded.append(PARENT_DATASET_ID)

    if index:
        excluded.append(INDEX)

    dframe.remove_bamboo_reserved_keys(excluded)

    if index:
        dframe = BambooFrame(dframe.rename(columns={INDEX: 'index'}))

    if padded:
        if len(dframe.columns):
            # join a placeholder frame on the first column to pad out rows
            on = dframe.columns[0]
            place_holder = self.place_holder_dframe(dframe).set_index(on)
            dframe = BambooFrame(dframe.join(place_holder, on=on))
        else:
            dframe = self.place_holder_dframe()

    return dframe

def encode(dframe, dataset, add_index=True):
    """Encode the columns for `dataset` to slugs and add an ID column.

    The ID column is the dataset_id for `dataset`. This is used to link
    observations to a specific dataset.

    :param dframe: The DataFrame to encode.
    :param dataset: The Dataset to use a mapping for.
    :param add_index: Add an index to the DataFrame, default True.

    :returns: A modified `dframe` as a BambooFrame.
    """
    dframe = BambooFrame(dframe)

    if add_index:
        dframe = dframe.add_index()

    dframe = dframe.add_id_column(dataset.dataset_id)

    # rename columns to their slugged equivalents from the dataset schema
    encoded_columns_map = dataset.schema.rename_map_for_dframe(dframe)

    return dframe.rename(columns=encoded_columns_map)

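# An illustrative sketch of the encode step, assuming a saved `dataset`
# whose schema already covers the input's columns; the file path is a
# placeholder.
def example_encode_usage(dataset):
    from pandas import read_csv

    dframe = read_csv('/path/to/data.csv')  # hypothetical input file

    # columns are renamed to schema slugs, and index/ID columns are added
    return encode(dframe, dataset)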