def _extract_key_values(self, dict_, unique_keys): '''\ Extract the values for the keys in ``unique_keys`` from the ``dict_``. ``dict_`` A dict like object ``unique_keys`` A list of strings Returns: A ``tuple`` of values of the unique_keys. Raises: :exc:`KeyError` if a unique column has no value. ''' res = [] for k in unique_keys: v = deep_get(dict_, k) # deep_get doesn't raise KeyErrors, so do it here. if v is None: raise KeyError("Unique key %s missing from dict: %s" % (k, dict_)) else: res.append(v) return tuple(res)
def sorted_extract(drilldown):
    """Return the sorted list of per-cell value rows of a drilldown.

    For every cell in ``drilldown`` a list
    ``[from.name, to.name, num_entries, amount]`` is built by looking up
    each of these keys with ``deep_get``; the list of rows is returned
    in ascending order.
    """
    fields = ('from.name', 'to.name', 'num_entries', 'amount')
    rows = [[deep_get(cell, field) for field in fields]
            for cell in drilldown]
    rows.sort()
    return rows
def _cell_id_for_row(self, row, query_dimensions):
    """Compute the cell id of ``row`` for the given dimensions.

    ``row``
        The entry to look the dimension values up in
        (via ``util.deep_get``).
    ``query_dimensions``
        An iterable of dimension keys.

    For every dimension one or two key parts are collected:

    * a dict value with a truthy ``from.day`` contributes ``from.day``
      and ``to.day``,
    * any other dict value contributes its ``_id`` or, failing that,
      its ``name`` (a dict with neither contributes nothing),
    * a non-dict value contributes itself.

    Returns: whatever ``util.hash_values`` yields for the list of
    utf8 encoded key parts.
    """
    parts = []
    for dimension in query_dimensions:
        value = util.deep_get(row, dimension)
        if not isinstance(value, dict):
            parts.append(value)
            continue
        start_day = util.deep_get(value, 'from.day')
        if start_day:
            parts.extend([start_day, util.deep_get(value, 'to.day')])
        elif '_id' in value:
            parts.append(value['_id'])
        elif 'name' in value:
            parts.append(value['name'])
    encoded_parts = [unicode(part).encode('utf8') for part in parts]
    return util.hash_values(encoded_parts)
def assert_order(result, keys, expect):
    """Assert that the drilldown cells of ``result`` come in order.

    ``result``
        A mapping with a ``'drilldown'`` sequence of cells.
    ``keys``
        The keys to read from every cell (via ``deep_get``).
    ``expect``
        The expected values. With a single key this is compared to the
        flat list of values; with several keys it is compared to the
        per-cell tuples produced by ``zip``.
    """
    columns = [[deep_get(cell, key) for cell in result['drilldown']]
               for key in keys]
    if len(columns) == 1:
        actual = columns[0]
    else:
        actual = zip(*columns)
    message = ('Not the expected order. result: %s, expected: %s' %
               (actual, expect))
    h.assert_equal(actual, expect, message)
def make_new_cell(cell_id):
    """Build the initial cell for the current ``row``.

    This is a closure: ``row``, ``query_dimensions``,
    ``used_time_dimensions`` and ``self`` come from the enclosing scope.

    ``cell_id``
        Stored as the ``_id`` of the new cell.

    Returns: a ``dict`` with ``_id``, one entry per usable dimension,
    ``amount`` (``0.0`` if the row has no amount or it is ``None``) and
    ``num_entries`` set to ``1``.
    """
    new_cell = {'_id': cell_id}
    for key in query_dimensions:
        # handle dates specially, collect year and month
        if key == 'time':
            if 'year' in used_time_dimensions:
                new_cell['year'] = int(
                    util.deep_get(row, 'time.from.year'))
            if 'month' in used_time_dimensions:
                # only the last two characters of the value are used
                new_cell['month'] = int(
                    util.deep_get(row, 'time.from.month')[-2:])
            continue

        value = util.deep_get(row, key)
        if isinstance(value, dict):
            if util.deep_get(value, 'from.day'):
                # a dict with a 'from.day' is copied over unchanged
                new_cell[key] = row[key]
                continue
            # keep only the well known keys of the dimension value
            subdict = {}
            for subkey in ('name', 'label', 'color', '_id', 'ref',
                           'taxonomy'):
                if subkey in value:
                    subdict[subkey] = value[subkey]
            if not subdict.get('name'):
                # create a name so we can rely on it,
                # e.g. in queries
                # NOTE(review): raises KeyError if the value has
                # neither a name nor an '_id' -- confirm callers
                # never pass such a value.
                subdict['name'] = str(subdict['_id'])
            new_cell[key] = subdict
        elif isinstance(value, self.simpletypes):
            new_cell[key] = value

    # dict.get() only falls back to its default for a *missing* key;
    # an explicit None amount must become 0.0 as well so that later
    # additions to the cell do not fail.
    amount = row.get('amount')
    new_cell['amount'] = 0.0 if amount is None else amount
    new_cell['num_entries'] = 1
    return new_cell
def _sort(self, cells, order): ''' sort the *cells* by one or more *order* criteria. ``cells`` A list of cells ``order`` See :meth:`query` Returns: The sorted `list` of cells ''' if order is not None: for (dimension, direction) in reversed(order): key_getter = lambda cell: deep_get(cell, dimension) cells = sorted(cells, key=key_getter, reverse=direction) return cells
def compute(self):
    """
    Create the cube.

    This processes all entries of the dataset, aggregates cells
    based on the dimensions of the cube and saves them into a
    mongodb collection (``self.db[self.collection_name]``). An
    already computed collection is dropped first. The number of
    cells is stored on the dataset under
    ``cubes.<name>.num_cells`` and the dataset is saved.

    Raises: :exc:`AssertionError` if the key values collected for
    a row are not hashable.
    """
    log.debug("compute cube for dataset '%s', cube name: '%s', "
              "dimensions: '%s'",
              self.dataset.name, self.name, ', '.join(self.dimensions))
    begin = time.time()

    # Query fields: we query for all dimensions of the cube, but
    # 'year' and 'month' are not fields of an entry. If one of them
    # is used we have to query for 'time' instead and derive year
    # and month from it. 'amount' is always queried.
    query_dimensions = set(self.dimensions)
    used_time_dimensions = query_dimensions.intersection(
        ['year', 'month'])
    additional_dimensions = ['amount']
    if used_time_dimensions:
        query_dimensions = query_dimensions - used_time_dimensions
        additional_dimensions.append('time')
    query_dimensions = query_dimensions.union(additional_dimensions)

    cursor = _aggregation_query(self.dataset, {},
                                fields=list(query_dimensions),
                                as_class=dict)

    cells = {}
    for row in cursor:
        # Build the hashable key identifying the cell of this row.
        # NOTE(review): 'amount' (and 'time', if queried) are part
        # of query_dimensions and therefore of the key, so rows are
        # only merged when these match too -- confirm this is
        # intended.
        cell_key_values = []
        for dimension in query_dimensions:
            value = deep_get(row, dimension)
            if isinstance(value, dict):
                from_day = deep_get(value, 'from.day')
                if from_day:
                    value = (from_day, deep_get(value, 'to.day'))
                elif '_id' in value:
                    value = str(value['_id'])
                elif 'name' in value:
                    value = value['name']
            cell_key_values.append(value)
        cell_key = tuple(cell_key_values)

        try:
            cell = cells.get(cell_key, None)
        except TypeError:
            raise AssertionError("Value must be hash()able: %s" %
                                 repr(cell_key))

        # A missing *or* None amount counts as 0.0. dict.get() with a
        # default only covers the missing key, so normalise once here
        # and use the result for new and for existing cells alike.
        amount = row.get('amount') or 0.0

        if cell is not None:
            cell['amount'] += amount
            cell['num_entries'] += 1
            # cell['entries'].append(row['_id'])
            continue

        new_cell = {}
        for key in query_dimensions:
            # handle dates specially, collect year and month
            if key == 'time':
                if 'year' in used_time_dimensions:
                    new_cell['year'] = int(
                        deep_get(row, 'time.from.year'))
                if 'month' in used_time_dimensions:
                    # only the last two characters are used
                    new_cell['month'] = int(
                        deep_get(row, 'time.from.month')[-2:])
                continue

            value = deep_get(row, key)
            if isinstance(value, dict):
                if deep_get(value, 'from.day'):
                    # a dict with a 'from.day' is copied unchanged
                    new_cell[key] = row[key]
                    continue
                # keep only the well known keys of the value
                subdict = {}
                for subkey in ('name', 'label', 'color', '_id', 'ref',
                               'taxonomy'):
                    if subkey in value:
                        subdict[subkey] = value[subkey]
                if not subdict.get('name'):
                    # create a name so we can rely on it,
                    # e.g. in queries
                    subdict['name'] = subdict['_id']
                new_cell[key] = subdict
            elif isinstance(value, self.simpletypes):
                new_cell[key] = value

        new_cell['amount'] = amount
        # new_cell['entries'] = [row['_id']]
        new_cell['num_entries'] = 1
        cells[cell_key] = new_cell

    # remove a collection if there is one
    if self.is_computed():
        self.db.drop_collection(self.collection_name)
    collection = self.db[self.collection_name]
    for cell in cells.itervalues():
        collection.insert(cell)
    # Indexes are currently not created:
    #for dimension in query_dimensions.union(used_time_dimensions):
    #    collection.ensure_index([(dimension, ASCENDING)])
    #    collection.ensure_index([(dimension, DESCENDING)])

    self.dataset['cubes'][self.name]['num_cells'] = len(cells)
    self.dataset.save()
    log.debug("Done. Took: %ds", int(time.time() - begin))