def __maybe_pad(self, dframe, pad):
    """Pad `dframe` with placeholder columns if `pad` is True."""
    if pad:
        if len(dframe.columns):
            on = dframe.columns[0]
            place_holder = self.place_holder_dframe(dframe).set_index(on)
            dframe = BambooFrame(dframe.join(place_holder, on=on))
        else:
            dframe = self.place_holder_dframe()

    return dframe
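# A minimal, self-contained sketch of the padding join above in plain
# pandas. `place_holder` stands in for self.place_holder_dframe(dframe),
# and the column names are made up for illustration.
import pandas as pd

dframe = pd.DataFrame({'name': ['a', 'b'], 'amount': [1, 2]})

# Placeholder frame indexed on the first column, carrying a column the
# schema expects but the query did not return.
place_holder = pd.DataFrame(
    {'name': ['a', 'b'], 'rating': [float('nan')] * 2}).set_index('name')

padded = dframe.join(place_holder, on='name')
print(padded.columns.tolist())  # ['name', 'amount', 'rating']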
def dframe(self, query_args=None, keep_parent_ids=False, padded=False,
           index=False, reload_=False, keep_mongo_keys=False):
    """Fetch the dframe for this dataset.

    :param query_args: An optional QueryArgs to hold the query arguments.
    :param keep_parent_ids: Do not remove parent IDs from the dframe,
        default False.
    :param padded: Used for joining, default False.
    :param index: Return the index with the dframe, default False.
    :param reload_: Force refresh of data, default False.
    :param keep_mongo_keys: Used for updating documents, default False.

    :returns: A BambooFrame with contents based on the query parameters
        passed to MongoDB. The BambooFrame will not have parent ids if
        `keep_parent_ids` is False.
    """
    # bypass the cache if we need a specific version
    cacheable = not (query_args or keep_parent_ids or padded)

    # use the cached copy if we have already fetched it
    if cacheable and not reload_ and self.__is_cached:
        return self.__dframe

    query_args = query_args or QueryArgs()
    observations = self.observations(query_args, as_cursor=True)

    if query_args.distinct:
        return BambooFrame(observations)

    dframe = Observation.batch_read_dframe_from_cursor(
        self, observations, query_args.distinct, query_args.limit)

    dframe.decode_mongo_reserved_keys(keep_mongo_keys=keep_mongo_keys)

    excluded = [keep_parent_ids and PARENT_DATASET_ID, index and INDEX]
    dframe.remove_bamboo_reserved_keys(filter(bool, excluded))

    if index:
        dframe = BambooFrame(dframe.rename(columns={INDEX: 'index'}))

    dframe = self.__maybe_pad(dframe, padded)

    if cacheable:
        self.__dframe = dframe

    return dframe
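# Usage sketch for the caching rules above. Assumes an existing Dataset
# instance `dataset` from the surrounding codebase; only arguments named
# in the docstring are used.
dframe = dataset.dframe()                # fetched once, then cached
dframe = dataset.dframe()                # served from self.__dframe
dframe = dataset.dframe(reload_=True)    # forces a refresh of the cache
dframe = dataset.dframe(QueryArgs())     # truthy query_args: not cacheable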
def __merge_datasets(datasets, mapping):
    """Merge two or more datasets, renaming columns via `mapping`."""
    dframes = []
    mapping = mapping or {}

    for dataset in datasets:
        dframe = dataset.dframe()
        column_map = mapping.get(dataset.dataset_id)

        if column_map:
            dframe = BambooFrame(dframe.rename(columns=column_map))

        dframe = dframe.add_parent_column(dataset.dataset_id)
        dframes.append(dframe)

    return concat(dframes, ignore_index=True)
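# Shape of the `mapping` argument, as read off the lookup above:
# dataset_id -> {current column name: new column name}. The IDs and
# column names below are hypothetical.
mapping = {
    'dataset_id_left': {'amount': 'amount_left'},
    'dataset_id_right': {'amount': 'amount_right'},
}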
def dframe(self, query_args=QueryArgs(), keep_parent_ids=False,
           padded=False, index=False):
    """Fetch the dframe for this dataset.

    :param query_args: An optional QueryArgs to hold the query arguments.
    :param keep_parent_ids: Do not remove parent IDs from the dframe,
        default False.
    :param padded: Used for joining, default False.
    :param index: Return the index with the dframe, default False.

    :returns: A BambooFrame with contents based on the query parameters
        passed to MongoDB. The BambooFrame will not have parent ids if
        `keep_parent_ids` is False.
    """
    observations = self.observations(query_args, as_cursor=True)

    dframe = self.__batch_read_dframe_from_cursor(
        observations, query_args.distinct, query_args.limit)

    dframe.decode_mongo_reserved_keys()

    excluded = []

    if keep_parent_ids:
        excluded.append(PARENT_DATASET_ID)

    if index:
        excluded.append(INDEX)

    dframe.remove_bamboo_reserved_keys(excluded)

    if index:
        dframe = BambooFrame(dframe.rename(columns={INDEX: 'index'}))

    if padded:
        if len(dframe.columns):
            on = dframe.columns[0]
            place_holder = self.place_holder_dframe(dframe).set_index(on)
            dframe = BambooFrame(dframe.join(place_holder, on=on))
        else:
            dframe = self.place_holder_dframe()

    return dframe
def encode(dframe, dataset, add_index=True):
    """Encode the columns for `dataset` to slugs and add an ID column.

    The ID column is the dataset_id for `dataset`. This is used to link
    observations to a specific dataset.

    :param dframe: The DataFrame to encode.
    :param dataset: The Dataset whose schema supplies the column mapping.
    :param add_index: Add an index to the DataFrame, default True.

    :returns: A modified `dframe` as a BambooFrame.
    """
    dframe = BambooFrame(dframe)

    if add_index:
        dframe = dframe.add_index()

    dframe = dframe.add_id_column(dataset.dataset_id)
    encoded_columns_map = dataset.schema.rename_map_for_dframe(dframe)

    return BambooFrame(dframe.rename(columns=encoded_columns_map))
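# A minimal pandas sketch of the rename step above. The slug map is
# hypothetical; in bamboo it comes from
# dataset.schema.rename_map_for_dframe(dframe).
import pandas as pd

dframe = pd.DataFrame({'Submit Date!': ['2012-12-03'], 'Rating': [5]})
encoded_columns_map = {'Submit Date!': 'submit_date', 'Rating': 'rating'}

encoded = dframe.rename(columns=encoded_columns_map)
print(encoded.columns.tolist())  # ['submit_date', 'rating']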
def save(self):
    """Save this aggregation.

    If an aggregated dataset for this aggregation's groups already
    exists, store the result in that dataset; otherwise create a new
    aggregated dataset and store the aggregation there.
    """
    new_dframe = BambooFrame(self.aggregation.eval(self.columns))
    new_dframe = new_dframe.add_parent_column(self.dataset.dataset_id)

    agg_dataset = self.dataset.aggregated_dataset(self.groups)

    if agg_dataset is None:
        agg_dataset = self.dataset.new_agg_dataset(
            new_dframe, self.groups)
    else:
        agg_dframe = agg_dataset.dframe()
        new_dframe = self.__merge_dframes([agg_dframe, new_dframe])
        agg_dataset.replace_observations(new_dframe)

    self.new_dframe = new_dframe
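# The create-or-merge flow above, reduced to plain pandas. concat is a
# stand-in for the private __merge_dframes helper; group and column
# names are made up.
import pandas as pd

existing = pd.DataFrame({'food_type': ['lunch'], 'sum_amount': [10.0]})
incoming = pd.DataFrame({'food_type': ['dinner'], 'sum_amount': [4.0]})

merged = pd.concat([existing, incoming], ignore_index=True)
print(len(merged))  # 2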
def dframe(self, query=None, select=None, distinct=None,
           keep_parent_ids=False, limit=0, order_by=None, padded=False):
    """Fetch the dframe for this dataset.

    :param query: An optional query to restrict the rows returned.
    :param select: An optional select to limit the fields in the dframe.
    :param distinct: An optional field to return distinct values for.
    :param keep_parent_ids: Do not remove parent IDs from the dframe,
        default False.
    :param limit: Limit on the number of rows in the returned dframe.
    :param order_by: Sort resulting rows according to a column value and
        sign indicating ascending or descending.
    :param padded: Used for joining, default False.

    Example of `order_by`:

      - ``order_by='mycolumn'``
      - ``order_by='-mycolumn'``

    :returns: A BambooFrame with contents based on the query parameters
        passed to MongoDB. The BambooFrame will not have parent ids if
        `keep_parent_ids` is False.
    """
    observations = self.observations(
        query=query, select=select, limit=limit, order_by=order_by,
        as_cursor=True)

    dframe = self._batch_read_dframe_from_cursor(
        observations, distinct, limit)

    dframe.decode_mongo_reserved_keys()
    dframe.remove_bamboo_reserved_keys(keep_parent_ids)

    if padded:
        if len(dframe.columns):
            on = dframe.columns[0]
            place_holder = self.place_holder_dframe(dframe).set_index(on)
            dframe = BambooFrame(dframe.join(place_holder, on=on))
        else:
            dframe = self.place_holder_dframe()

    return dframe
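# One way the `order_by` convention in the docstring could translate to
# a pymongo sort argument; this is a sketch, not the project's parser.
def order_by_to_mongo_sort(order_by):
    # A leading '-' selects descending order; pymongo expects a list of
    # (column, direction) pairs with direction -1 or 1.
    if order_by.startswith('-'):
        return [(order_by[1:], -1)]

    return [(order_by, 1)]

print(order_by_to_mongo_sort('mycolumn'))   # [('mycolumn', 1)]
print(order_by_to_mongo_sort('-mycolumn'))  # [('mycolumn', -1)]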
class TestFrame(TestBase):

    def setUp(self):
        TestBase.setUp(self)
        self.dframe = self.get_data('good_eats.csv')
        self.bframe = BambooFrame(self.dframe)

    def _add_bamboo_reserved_keys(self, value=1):
        for key in BAMBOO_RESERVED_KEYS:
            column = Series([value] * len(self.bframe))
            column.name = key
            self.bframe = BambooFrame(self.bframe.join(column))

    def test_add_parent_column(self):
        value = 1
        self._add_bamboo_reserved_keys(value)

        for index, item in self.bframe[PARENT_DATASET_ID].iteritems():
            self.assertEqual(item, value)

    def test_decode_mongo_reserved_keys(self):
        prev_columns = self.bframe.columns

        for col in MONGO_RESERVED_KEYS:
            self.assertTrue(col in self.bframe.columns)

        self.bframe.decode_mongo_reserved_keys()

        for col in MONGO_RESERVED_KEYS:
            self.assertFalse(col in self.bframe.columns)

    def test_recognize_dates(self):
        bframe_with_dates = self.bframe.recognize_dates()

        for field in bframe_with_dates['submit_date']:
            self.assertTrue(isinstance(field, datetime))

    def test_recognize_dates_from_schema(self):
        schema = Schema({
            'submit_date': {
                SIMPLETYPE: DATETIME,
            },
        })
        bframe_with_dates = self.bframe.recognize_dates_from_schema(schema)

        for field in bframe_with_dates['submit_date']:
            self.assertTrue(isinstance(field, datetime))

    def test_remove_bamboo_reserved_keys(self):
        self._add_bamboo_reserved_keys()

        for key in BAMBOO_RESERVED_KEYS:
            self.assertTrue(key in self.bframe.columns)

        self.bframe.remove_bamboo_reserved_keys()

        for key in BAMBOO_RESERVED_KEYS:
            self.assertFalse(key in self.bframe.columns)

    def test_remove_bamboo_reserved_keys_exclusion(self):
        self._add_bamboo_reserved_keys()

        for key in BAMBOO_RESERVED_KEYS:
            self.assertTrue(key in self.bframe.columns)

        self.bframe.remove_bamboo_reserved_keys([PARENT_DATASET_ID])

        for key in BAMBOO_RESERVED_KEYS:
            if key == PARENT_DATASET_ID:
                self.assertTrue(key in self.bframe.columns)
            else:
                self.assertFalse(key in self.bframe.columns)

    def test_only_rows_for_parent_id(self):
        parent_id = 1
        len_parent_rows = len(self.bframe) / 2

        column = Series([parent_id] * len_parent_rows)
        column.name = PARENT_DATASET_ID
        self.bframe = BambooFrame(self.bframe.join(column))

        bframe_only = self.bframe.only_rows_for_parent_id(parent_id)

        self.assertFalse(PARENT_DATASET_ID in bframe_only.columns)
        self.assertEqual(len(bframe_only), len_parent_rows)

    def test_to_jsondict(self):
        jsondict = self.bframe.to_jsondict()
        self.assertEqual(len(jsondict), len(self.bframe))

        for col in jsondict:
            self.assertEqual(len(col), len(self.bframe.columns))

    def test_to_json(self):
        json = self.bframe.to_json()
        self.assertEqual(type(json), str)