def parse_columns(dataset, formula, name, dframe=None, no_index=False):
    """Parse a formula and return columns resulting from its functions.

    Parse a formula into a list of functions then apply those functions to
    the DataFrame and return the resulting columns.

    :param dataset: The dataset to parse the formula for.
    :param formula: The formula to parse.
    :param name: Name of the formula.
    :param dframe: A DataFrame to apply functions to.
    :param no_index: Drop the index on result columns.
    """
    functions = Parser.parse_functions(formula)
    dependent_columns = Parser.dependent_columns(formula, dataset)

    # make select from dependent_columns
    if dframe is None:
        select = {col: 1 for col in dependent_columns or [MONGO_ID]}
        dframe = dataset.dframe(
            query_args=QueryArgs(select=select),
            keep_mongo_keys=True).set_index(MONGO_ID_ENCODED)

        if not dependent_columns:
            # constant column, use dummy
            dframe['dummy'] = 0

    return __build_columns(dataset, dframe, functions, name, no_index)
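# Hypothetical usage sketch: deriving a new column from a formula. The
# column name and formula syntax here are assumptions for illustration;
# `dataset` must already be saved so its dframe can be fetched.
columns = parse_columns(dataset, 'amount * 2', 'double_amount')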
def eval(self, row, dataset):
    # fetch the full column, then score this row's value against it
    col = self.value.value
    query_args = QueryArgs(select={col: 1})
    column = dataset.dframe(query_args=query_args)[col]
    field = self.value.field(row)

    return percentileofscore(column, field)
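# Self-contained sketch of the percentile step above using only pandas
# and scipy; the sample values are invented for illustration.
from pandas import Series
from scipy.stats import percentileofscore

column = Series([10, 20, 30, 40, 50])

# percentileofscore ranks a value against a distribution: 30 is the 3rd
# of 5 values, so it sits at the 60th percentile.
print(percentileofscore(column, 30))  # 60.0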
def test_plot_select(self):
    column = 'community_pop'
    select = {column: 1}
    result = self.controller.plot(self.dataset_id,
                                  select=json.dumps(select))
    dframe = self.dataset.dframe(QueryArgs(select=select))

    self.__test_result(result, dframe)
def test_plot(self):
    result = self.controller.plot(self.dataset_id)
    dframe = self.dataset.dframe(query_args=QueryArgs(
        select=self.dataset.schema.numerics_select))
    dframe = dframe.dropna()

    self.__test_result(result, dframe)
def __parse_query_args(self, limit, order_by, query, select,
                       distinct=None, dataset=None):
    limit = parse_int(limit, 0)
    query = self.__parse_query(query)
    select = self.__parse_select(select)

    return QueryArgs(query=query, select=select, distinct=distinct,
                     limit=limit, order_by=order_by, dataset=dataset)
def observations(self, query_args=None, as_cursor=False):
    """Return observations for this dataset.

    :param query_args: An optional QueryArgs to hold the query arguments.
    :param as_cursor: Return the observations as a cursor.
    """
    return Observation.find(self, query_args or QueryArgs(),
                            as_cursor=as_cursor)
def test_find_with_select(self):
    self.__save_records()
    query_args = QueryArgs(select={"rating": 1})
    rows = Observation.find(self.dataset, query_args)

    self.assertTrue(isinstance(rows, list))

    row = self.__decode(rows[0])

    self.assertEqual(sorted(row.keys()), ['_id', 'rating'])
def rolling(self, win_type, window):
    """Calculate a rolling window over all numeric columns.

    :param win_type: The type of window, see `pandas.rolling_window`.
    :param window: The number of observations used for calculating the
        window.

    :returns: A DataFrame of the rolling window calculated for this
        dataset.
    """
    dframe = self.dframe(QueryArgs(select=self.schema.numerics_select))

    return rolling_window(dframe, window, win_type)
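# Standalone sketch of the rolling computation with current pandas,
# where DataFrame.rolling replaces the old pandas.rolling_window used
# above; the numeric data is invented for illustration.
from pandas import DataFrame

dframe = DataFrame({'amount': [1.0, 2.0, 3.0, 4.0, 5.0]})

# 3-observation moving average over every numeric column
print(dframe.rolling(window=3).mean())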
def resample(self, date_column, interval, how, query=None):
    """Resample a dataset given a new time frame.

    :param date_column: The date column to use as the index for
        resampling.
    :param interval: The interval code for resampling.
    :param how: How to aggregate in the resample.
    :param query: An optional query to restrict the rows resampled.

    :returns: A DataFrame of the resampled DataFrame for this dataset.
    """
    query_args = QueryArgs(query=query)
    dframe = self.dframe(query_args).set_index(date_column)
    resampled = dframe.resample(interval, how=how)

    return resampled.reset_index()
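# Standalone sketch of the resample step with current pandas, where the
# aggregation is a method on the resampler rather than a how= keyword;
# dates and values are invented for illustration.
from pandas import DataFrame, date_range

dframe = DataFrame({
    'submit_date': date_range('2014-01-01', periods=6, freq='D'),
    'amount': [1, 2, 3, 4, 5, 6],
})

# weekly mean, then restore the date column as resample() does above
resampled = dframe.set_index('submit_date').resample('W').mean()
print(resampled.reset_index())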
def test_plot_index(self):
    dataset_id = self._post_file()
    dataset = Dataset.find_one(dataset_id)
    column = 'amount'
    select = {column: 1}
    result = self.controller.plot(dataset_id, select=json.dumps(select),
                                  index='submit_date')
    dframe = dataset.dframe(QueryArgs(select=select))

    self.__test_result(result, dframe)
def dframe(self, query_args=None, keep_parent_ids=False, padded=False,
           index=False, reload_=False, keep_mongo_keys=False):
    """Fetch the dframe for this dataset.

    :param query_args: An optional QueryArgs to hold the query arguments.
    :param keep_parent_ids: Do not remove parent IDs from the dframe,
        default False.
    :param padded: Used for joining, default False.
    :param index: Return the index with dframe, default False.
    :param reload_: Force refresh of data, default False.
    :param keep_mongo_keys: Used for updating documents, default False.

    :returns: Return DataFrame with contents based on query parameters
        passed to MongoDB. DataFrame will not have parent ids if
        `keep_parent_ids` is False.
    """
    # bypass cache if we need a specific version
    cacheable = not (query_args or keep_parent_ids or padded)

    # use cached copy if we have already fetched it
    if cacheable and not reload_ and self.__is_cached:
        return self.__dframe

    query_args = query_args or QueryArgs()
    observations = self.observations(query_args, as_cursor=True)

    # a distinct query returns the distinct values directly
    if query_args.distinct:
        return DataFrame(observations)

    dframe = Observation.batch_read_dframe_from_cursor(
        self, observations, query_args.distinct, query_args.limit)

    dframe = df_mongo_decode(dframe, keep_mongo_keys=keep_mongo_keys)

    # reserved keys to keep, if requested
    excluded = [keep_parent_ids and PARENT_DATASET_ID, index and INDEX]
    dframe = remove_reserved_keys(dframe, filter(bool, excluded))

    if index:
        dframe.rename(columns={INDEX: 'index'}, inplace=True)

    dframe = self.__maybe_pad(dframe, padded)

    if cacheable:
        self.__dframe = dframe

    return dframe
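# Hypothetical usage sketch: fetching a filtered, projected dframe.
# Assumes `dataset` is a saved Dataset with 'food_type' and 'amount'
# columns; the query uses the MongoDB operator syntax seen elsewhere in
# the codebase. Note that passing query_args makes the result
# non-cacheable, per the cacheable check above.
query_args = QueryArgs(query={'food_type': 'lunch'},
                       select={'amount': 1},
                       limit=10,
                       order_by='amount')
dframe = dataset.dframe(query_args=query_args)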
def count(self, query_args=None):
    """Return the count of rows matching query in dataset.

    :param query_args: An optional QueryArgs to hold the query arguments.
    """
    query_args = query_args or QueryArgs()
    obs = self.observations(query_args, as_cursor=True)

    # a distinct query returns a list rather than a cursor
    count = len(obs) if query_args.distinct else obs.count()

    limit = query_args.limit
    if limit > 0 and count > limit:
        count = limit

    return count
def __create_aggregator(dataset, formula, name, groups, dframe=None):
    # TODO this should work with index eventually
    columns = parse_columns(dataset, formula, name, dframe, no_index=True)

    dependent_columns = Parser.dependent_columns(formula, dataset)
    aggregation = Parser.parse_aggregation(formula)

    # get dframe with only the necessary columns
    select = combine_dicts({group: 1 for group in groups},
                           {col: 1 for col in dependent_columns})

    # ensure at least one column (MONGO_ID) for the count aggregation
    query_args = QueryArgs(select=select or {MONGO_ID: 1})
    dframe = dataset.dframe(query_args=query_args,
                            keep_mongo_keys=not select)

    return Aggregator(dframe, groups, aggregation, name, columns)
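# Standalone sketch of the select construction above: group columns and
# formula dependencies are merged into one MongoDB projection, which is
# what combine_dicts is assumed to do here (column names invented).
groups = ['food_type']
dependent_columns = ['amount']

select = dict({group: 1 for group in groups},
              **{col: 1 for col in dependent_columns})
print(select)  # {'food_type': 1, 'amount': 1}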
def find(cls, dataset, include_aggs=True, only_aggs=False):
    """Return the calculations for `dataset`.

    :param dataset: The dataset to retrieve the calculations for.
    :param include_aggs: Include aggregations, default True.
    :param only_aggs: Exclude non-aggregations, default False.
    """
    query = {DATASET_ID: dataset.dataset_id}

    if not include_aggs:
        query[cls.AGGREGATION] = None

    if only_aggs:
        query[cls.AGGREGATION] = {'$ne': None}

    query_args = QueryArgs(query=query, order_by='name')

    return super(cls, cls).find(query_args)
def test_delete_with_query(self):
    dataset_id = self._post_file()
    query = {'food_type': 'caffeination'}
    dataset = Dataset.find_one(dataset_id)
    dframe = dataset.dframe(query_args=QueryArgs(query=query))
    len_after_delete = len(dataset.dframe()) - len(dframe)
    query = json.dumps(query)

    result = json.loads(self.controller.delete(dataset_id, query=query))
    message = result[Datasets.SUCCESS]

    self.assertTrue('deleted dataset' in message)
    self.assertTrue(query in message)
    self.assertEqual(result[Dataset.ID], dataset_id)

    dframe = Dataset.find_one(dataset_id).dframe()

    self.assertEqual(len(dframe), len_after_delete)
def action(dataset, query=query, select=select, limit=limit):
    if not dataset.is_ready:
        raise ArgumentError('dataset is not finished importing')

    limit = parse_int(limit, 0)
    query = self.__parse_query(query)
    select = self.__parse_select(select, required=True)
    groups = dataset.split_groups(group)

    # validate group columns (raises on an invalid column)
    [valid_column(dataset, c) for c in groups]

    # if select, append groups to select
    if select:
        select.update(dict(zip(groups, [1] * len(groups))))

    query_args = QueryArgs(query=query, select=select, limit=limit,
                           order_by=order_by)
    dframe = dataset.dframe(query_args)

    return dataset.summarize(dframe, groups=groups,
                             no_cache=query or select, flat=flat)
def __update_is_valid(dataset, new_dframe):
    """Check if the update is valid.

    Check whether this is a right-hand side of any joins and deny the
    update if the update would produce an invalid join as a result.

    :param dataset: The dataset to validate the update for.
    :param new_dframe: The update dframe to check.

    :returns: True if the update is valid, False otherwise.
    """
    select = {on: 1 for on in dataset.on_columns_for_rhs_of_joins
              if on in new_dframe.columns and on in dataset.columns}
    dframe = dataset.dframe(query_args=QueryArgs(select=select))

    for on in select.keys():
        merged_join_column = concat([new_dframe[on], dframe[on]])

        # a duplicated key in the merged column means the join would break
        if len(merged_join_column) != merged_join_column.nunique():
            return False

    return True
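# Standalone sketch of the uniqueness check above: a join column remains
# valid only if the combined existing and new key values contain no
# duplicates (sample values invented).
from pandas import Series, concat

existing = Series([1, 2, 3])
update = Series([4, 2])  # 2 collides with an existing key

merged = concat([update, existing])
print(len(merged) != merged.nunique())  # True -> update would be rejected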
def find(cls, dataset, query_args=None, as_cursor=False,
         include_deleted=False):
    """Return observation rows matching parameters.

    :param dataset: Dataset to return rows for.
    :param query_args: An optional QueryArgs to hold the query arguments.
    :param as_cursor: Return the observations as a cursor.
    :param include_deleted: If True, return deleted records, default
        False.

    :raises: `JSONError` if the query could not be parsed.

    :returns: A list of dictionaries matching the passed in `query` and
        other parameters.
    """
    encoding = cls.encoding(dataset) or {}
    query_args = query_args or QueryArgs()
    query_args.query = parse_timestamp_query(query_args.query,
                                             dataset.schema)
    query_args.encode(encoding, {DATASET_ID: dataset.dataset_id})

    if not include_deleted:
        query = query_args.query
        query[cls.DELETED_AT] = 0
        query_args.query = query

    # exclude the deleted-at column from the results
    query_args.select = query_args.select or {cls.DELETED_AT: 0}

    distinct = query_args.distinct
    records = super(cls, cls).find(query_args, as_dict=True,
                                   as_cursor=(as_cursor or distinct))

    return (records.distinct(encoding.get(distinct, distinct))
            if distinct else records)
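# Minimal sketch of the soft-delete filter above with plain dicts (field
# name invented): live rows carry a zero deleted-at marker, so the query
# matches only those, and a projection value of 0 excludes the marker
# column from the returned rows.
user_query = {'rating': 'delectible'}

query = dict(user_query)
query['deleted_at'] = 0      # match only rows that are not soft-deleted

select = {'deleted_at': 0}   # project the marker out of the results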
def setUp(self):
    TestBase.setUp(self)
    self.dataset = Dataset()
    self.dataset.save(self.test_dataset_ids['good_eats.csv'])
    self.query_args = QueryArgs(query={"rating": "delectible"})
def parent_ids(self):
    query_args = QueryArgs(select={PARENT_DATASET_ID: 1},
                           distinct=PARENT_DATASET_ID)

    return self.observations(query_args)
def find(cls, dataset_id):
    """Return datasets for `dataset_id`."""
    query_args = QueryArgs(query={DATASET_ID: dataset_id})

    return super(cls, cls).find(query_args)