Example no. 1
    def __maybe_pad(self, dframe, pad):
        if pad:
            if len(dframe.columns):
                # Pad by joining placeholder values keyed on the frame's
                # first column.
                on = dframe.columns[0]
                place_holder = self.place_holder_dframe(dframe).set_index(on)
                dframe = BambooFrame(dframe.join(place_holder, on=on))
            else:
                # Nothing to join on; fall back to a placeholder-only frame.
                dframe = self.place_holder_dframe()

        return dframe
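
A minimal sketch of the join this helper performs, using plain pandas with made-up column names (the real place_holder_dframe comes from the Dataset and appears to supply placeholder values to pad with):

import pandas as pd

# Stand-ins for the frame being padded and for the placeholder frame
# that place_holder_dframe would return, keyed by the first column.
dframe = pd.DataFrame({'food_type': ['lunch', 'dinner']})
place_holder = pd.DataFrame({'rating': ['', '']},
                            index=['lunch', 'dinner'])

# join(..., on='food_type') looks each row's value up in the placeholder
# index and appends the placeholder columns, as in __maybe_pad above.
padded = dframe.join(place_holder, on='food_type')
print(padded)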
Example no. 2
    def dframe(self, query_args=None, keep_parent_ids=False, padded=False,
               index=False, reload_=False, keep_mongo_keys=False):
        """Fetch the dframe for this dataset.

        :param query_args: An optional QueryArgs to hold the query arguments.
        :param keep_parent_ids: Do not remove parent IDs from the dframe,
            default False.
        :param padded: Used for joining, default False.
        :param index: Return the index with dframe, default False.
        :param reload_: Force refresh of data, default False.
        :param keep_mongo_keys: Used for updating documents, default False.

        :returns: Return BambooFrame with contents based on query parameters
            passed to MongoDB. BambooFrame will not have parent ids if
            `keep_parent_ids` is False.
        """
        # bypass cache if we need specific version
        cacheable = not (query_args or keep_parent_ids or padded)

        # use cached copy if we have already fetched it
        if cacheable and not reload_ and self.__is_cached:
            return self.__dframe

        query_args = query_args or QueryArgs()
        observations = self.observations(query_args, as_cursor=True)

        if query_args.distinct:
            return BambooFrame(observations)

        dframe = Observation.batch_read_dframe_from_cursor(
            self, observations, query_args.distinct, query_args.limit)

        dframe.decode_mongo_reserved_keys(keep_mongo_keys=keep_mongo_keys)

        # reserved keys to keep, i.e. to exclude from removal
        excluded = [keep_parent_ids and PARENT_DATASET_ID, index and INDEX]
        dframe.remove_bamboo_reserved_keys(filter(bool, excluded))

        if index:
            dframe = BambooFrame(dframe.rename(columns={INDEX: 'index'}))

        dframe = self.__maybe_pad(dframe, padded)

        if cacheable:
            self.__dframe = dframe

        return dframe
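
A hypothetical call sequence against a saved dataset; the QueryArgs(limit=...) keyword and the variable names are assumptions based on the attributes used above:

# Passing query_args makes the result non-cacheable, so a later plain
# call re-reads the collection and caches the full frame.
args = QueryArgs(limit=10)
subset = dataset.dframe(query_args=args, index=True)
full = dataset.dframe()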
Example no. 3
def __merge_datasets(datasets, mapping):
    """Merge two or more datasets."""
    dframes = []

    if not mapping:
        mapping = {}

    for dataset in datasets:
        dframe = dataset.dframe()
        column_map = mapping.get(dataset.dataset_id)

        if column_map:
            dframe = BambooFrame(dframe.rename(columns=column_map))

        dframe = dframe.add_parent_column(dataset.dataset_id)
        dframes.append(dframe)

    return concat(dframes, ignore_index=True)
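
The mapping argument appears to be keyed by dataset ID, with each value a column rename map for that dataset; a hypothetical shape:

# Rename 'amount' to 'cost' in the first dataset before concatenation;
# the second dataset's columns pass through unchanged.
mapping = {
    'dataset-id-1': {'amount': 'cost'},
}
merged = __merge_datasets([dataset1, dataset2], mapping)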
Example no. 4
File: dataset.py Project: j/bamboo
    def dframe(self, query_args=QueryArgs(), keep_parent_ids=False,
               padded=False, index=False):
        """Fetch the dframe for this dataset.

        :param query_args: An optional QueryArgs to hold the query arguments.
        :param keep_parent_ids: Do not remove parent IDs from the dframe,
            default False.
        :param padded: Used for joining, default False.
        :param index: Return the index with dframe, default False.

        :returns: Return BambooFrame with contents based on query parameters
            passed to MongoDB. BambooFrame will not have parent ids if
            `keep_parent_ids` is False.
        """
        observations = self.observations(query_args, as_cursor=True)

        dframe = self.__batch_read_dframe_from_cursor(
            observations, query_args.distinct, query_args.limit)

        dframe.decode_mongo_reserved_keys()

        excluded = []

        if keep_parent_ids:
            excluded.append(PARENT_DATASET_ID)
        if index:
            excluded.append(INDEX)

        dframe.remove_bamboo_reserved_keys(excluded)

        if index:
            dframe = BambooFrame(dframe.rename(columns={INDEX: 'index'}))

        if padded:
            if len(dframe.columns):
                on = dframe.columns[0]
                place_holder = self.place_holder_dframe(dframe).set_index(on)
                dframe = BambooFrame(dframe.join(place_holder, on=on))
            else:
                dframe = self.place_holder_dframe()

        return dframe
Example no. 5
    def test_only_rows_for_parent_id(self):
        parent_id = 1
        len_parent_rows = len(self.bframe) / 2

        # Tag only the first half of the rows with the parent ID; the
        # join leaves the remaining rows' parent ID as NaN.
        column = Series([parent_id] * len_parent_rows)
        column.name = PARENT_DATASET_ID

        self.bframe = BambooFrame(self.bframe.join(column))
        bframe_only = self.bframe.only_rows_for_parent_id(parent_id)

        # Only the tagged rows survive, and the parent ID column is
        # dropped from the result.
        self.assertFalse(PARENT_DATASET_ID in bframe_only.columns)
        self.assertEqual(len(bframe_only), len_parent_rows)
Example no. 6
def encode(dframe, dataset, add_index=True):
    """Encode the columns for `dataset` to slugs and add ID column.

    The ID column is the dataset_id for dataset.  This is
    used to link observations to a specific dataset.

    :param dframe: The DataFrame to encode.
    :param dataset: The Dataset to use a mapping for.
    :param add_index: Add index to the DataFrame, default True.

    :returns: A modified `dframe` as a BambooFrame.
    """
    dframe = BambooFrame(dframe)

    if add_index:
        dframe = dframe.add_index()

    dframe = dframe.add_id_column(dataset.dataset_id)
    encoded_columns_map = dataset.schema.rename_map_for_dframe(dframe)

    return dframe.rename(columns=encoded_columns_map)
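
A hypothetical call, assuming dataset is a saved Dataset whose schema maps the raw headers to slugs (the column names here are made up):

from pandas import DataFrame

raw = DataFrame({'Food Type': ['lunch'], 'Rating': ['good']})
encoded = encode(raw, dataset)
# encoded now carries an index column, an ID column holding
# dataset.dataset_id, and slugged column names (e.g. 'food_type').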
Example no. 7
    def save(self):
        """Save this aggregation.

        If an aggregated dataset for this aggregations group already exists
        store in this dataset, if not create a new aggregated dataset and store
        the aggregation in this new aggregated dataset.

        """
        new_dframe = BambooFrame(self.aggregation.eval(self.columns))
        new_dframe = new_dframe.add_parent_column(self.dataset.dataset_id)

        agg_dataset = self.dataset.aggregated_dataset(self.groups)

        if agg_dataset is None:
            agg_dataset = self.dataset.new_agg_dataset(
                new_dframe, self.groups)
        else:
            agg_dframe = agg_dataset.dframe()
            new_dframe = self.__merge_dframes([agg_dframe, new_dframe])
            agg_dataset.replace_observations(new_dframe)

        self.new_dframe = new_dframe
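
On its own, add_parent_column tags every row with the given dataset ID, which only_rows_for_parent_id (exercised in the tests below) later uses to slice merged frames apart; a small sketch with a made-up ID:

from pandas import DataFrame

frame = BambooFrame(DataFrame({'sum_': [3.0, 4.5]}))
frame = frame.add_parent_column('abc123')  # 'abc123' is hypothetical
# frame now has a PARENT_DATASET_ID column holding 'abc123' in each row.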
Example no. 8
    def dframe(self, query=None, select=None, distinct=None,
               keep_parent_ids=False, limit=0, order_by=None, padded=False):
        """Fetch the dframe for this dataset.

        :param query: An optional query to restrict the returned rows.
        :param select: An optional select to limit the fields in the dframe.
        :param distinct: If set, return distinct values for this column.
        :param keep_parent_ids: Do not remove parent IDs from the dframe,
            default False.
        :param limit: Limit on the number of rows in the returned dframe.
        :param padded: Used for joining, default False.
        :param order_by: Sort resulting rows according to a column value and
            sign indicating ascending or descending.

        Example of `order_by`:

          - ``order_by='mycolumn'``
          - ``order_by='-mycolumn'``

        :returns: Return BambooFrame with contents based on query parameters
            passed to MongoDB. BambooFrame will not have parent ids if
            `keep_parent_ids` is False.
        """
        observations = self.observations(
            query=query, select=select, limit=limit, order_by=order_by,
            as_cursor=True)

        dframe = self._batch_read_dframe_from_cursor(
            observations, distinct, limit)

        dframe.decode_mongo_reserved_keys()
        dframe.remove_bamboo_reserved_keys(keep_parent_ids)

        if padded:
            if len(dframe.columns):
                on = dframe.columns[0]
                place_holder = self.place_holder_dframe(dframe).set_index(on)
                dframe = BambooFrame(dframe.join(place_holder, on=on))
            else:
                dframe = self.place_holder_dframe()

        return dframe
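
A hypothetical call using the documented order_by forms (the column name is made up):

# Ten highest-rated rows: the '-' prefix requests a descending sort.
top = dataset.dframe(order_by='-rating', limit=10)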
Example no. 9
    def _add_bamboo_reserved_keys(self, value=1):
        for key in BAMBOO_RESERVED_KEYS:
            column = Series([value] * len(self.bframe))
            column.name = key
            self.bframe = BambooFrame(self.bframe.join(column))
Example no. 10
    def setUp(self):
        TestBase.setUp(self)
        self.dframe = self.get_data('good_eats.csv')
        self.bframe = BambooFrame(self.dframe)
Example no. 11
class TestFrame(TestBase):

    def setUp(self):
        TestBase.setUp(self)
        self.dframe = self.get_data('good_eats.csv')
        self.bframe = BambooFrame(self.dframe)

    def _add_bamboo_reserved_keys(self, value=1):
        for key in BAMBOO_RESERVED_KEYS:
            column = Series([value] * len(self.bframe))
            column.name = key
            self.bframe = BambooFrame(self.bframe.join(column))

    def test_add_parent_column(self):
        value = 1
        self._add_bamboo_reserved_keys(value)
        for index, item in self.bframe[PARENT_DATASET_ID].iteritems():
            self.assertEqual(item, value)

    def test_decode_mongo_reserved_keys(self):
        for col in MONGO_RESERVED_KEYS:
        for col in MONGO_RESERVED_KEYS:
            self.assertTrue(col in self.bframe.columns)
        self.bframe.decode_mongo_reserved_keys()
        for col in MONGO_RESERVED_KEYS:
            self.assertFalse(col in self.bframe.columns)

    def test_recognize_dates(self):
        bframe_with_dates = self.bframe.recognize_dates()
        for field in bframe_with_dates['submit_date']:
            self.assertTrue(isinstance(field, datetime))

    def test_recognize_dates_from_schema(self):
        schema = Schema({
            'submit_date': {
                SIMPLETYPE: DATETIME
            }
        })
        bframe_with_dates = self.bframe.recognize_dates_from_schema(schema)
        for field in bframe_with_dates['submit_date']:
            self.assertTrue(isinstance(field, datetime))

    def test_remove_bamboo_reserved_keys(self):
        self._add_bamboo_reserved_keys()
        for key in BAMBOO_RESERVED_KEYS:
            self.assertTrue(key in self.bframe.columns)
        self.bframe.remove_bamboo_reserved_keys()
        for key in BAMBOO_RESERVED_KEYS:
            self.assertFalse(key in self.bframe.columns)

    def test_remove_bamboo_reserved_keys_exclusion(self):
        self._add_bamboo_reserved_keys()
        for key in BAMBOO_RESERVED_KEYS:
            self.assertTrue(key in self.bframe.columns)
        self.bframe.remove_bamboo_reserved_keys([PARENT_DATASET_ID])
        for key in BAMBOO_RESERVED_KEYS:
            if key == PARENT_DATASET_ID:
                self.assertTrue(key in self.bframe.columns)
            else:
                self.assertFalse(key in self.bframe.columns)

    def test_only_rows_for_parent_id(self):
        parent_id = 1
        len_parent_rows = len(self.bframe) / 2

        column = Series([parent_id] * len_parent_rows)
        column.name = PARENT_DATASET_ID

        self.bframe = BambooFrame(self.bframe.join(column))
        bframe_only = self.bframe.only_rows_for_parent_id(parent_id)

        self.assertFalse(PARENT_DATASET_ID in bframe_only.columns)
        self.assertEqual(len(bframe_only), len_parent_rows)

    def test_to_jsondict(self):
        jsondict = self.bframe.to_jsondict()
        self.assertEqual(len(jsondict), len(self.bframe))
        for col in jsondict:
            self.assertEqual(len(col), len(self.bframe.columns))

    def test_to_json(self):
        json = self.bframe.to_json()
        self.assertEqual(type(json), str)