def sample(self, owner, cube, sample_size=None, fields=None,
           date=None, query=None):
    '''
    Draw a sample of objects at random from the cube.

    Query syntax parsing is handled by `pql`.

    :param cube: cube name
    :param owner: username of cube owner
    :param sample_size: size of the sample
    :param fields: fields that should be returned
    :param date: date (metrique date range) that should be queried.
                 If date==None then the most recent versions of the
                 objects will be queried.
    :param query: query used to filter the sample population
    '''
    self.requires_read(owner, cube)
    if sample_size is None:
        # fall back to the module-level default, as sample_cube() does;
        # random.sample() would raise a TypeError on None
        sample_size = SAMPLE_SIZE
    fields = self.get_fields(owner, cube, fields)
    query = query_add_date(query, date)
    spec = parse_pql_query(query)
    _cube = self.timeline(owner, cube)
    _docs = _cube.find(spec, fields=fields)
    n = _docs.count()
    if n <= sample_size:
        docs = tuple(_docs)
    else:
        to_sample = sorted(set(random.sample(xrange(n), sample_size)))
        docs = [_docs[i] for i in to_sample]
    return docs
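# Usage sketch (illustrative; `qh`, the owner/cube names and the pql
# strings are hypothetical, not taken from this codebase):
#
#   docs = qh.sample('alice', 'commit', sample_size=50,
#                    fields='author', query='files > 1')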
def find(self, owner, cube, query, fields=None, date=None,
         sort=None, one=False, explain=False, merge_versions=True,
         skip=0, limit=0):
    '''
    Wrapper around pymongo's find() command.

    Query syntax parsing is handled by `pql`.

    :param cube: cube name
    :param owner: username of cube owner
    :param query: the query in pql
    :param fields: fields that should be returned (comma-separated)
    :param date: date (metrique date range) that should be queried.
                 If date==None then the most recent versions of the
                 objects will be queried.
    :param explain: return the execution plan instead of results
    :param merge_versions: merge consecutive versions whose field
                           values are equal
    :param one: return back only the first matching object
    :param sort: return back results sorted
    :param skip: number of matched results to skip (not returned)
    :param limit: maximum number of matched results to return
    '''
    self.requires_read(owner, cube)
    sort = self.check_sort(sort)
    fields = self.get_fields(owner, cube, fields)
    # version merging only applies to date-range queries over an
    # explicit field subset; returning unique _ids defeats the merge
    if date is None or fields is None or ('_id' in fields and
                                          fields['_id']):
        merge_versions = False
    query = query or ''
    query = query_add_date(query, date)
    spec = parse_pql_query(query)
    _cube = self.timeline(owner, cube)
    if explain:
        result = _cube.find(spec, fields=fields, sort=sort,
                            skip=skip, limit=limit).explain()
    elif one:
        result = _cube.find_one(spec, fields=fields, sort=sort,
                                skip=skip, limit=limit)
    elif merge_versions:
        # merge_versions ignores sort (for now)
        result = self._merge_versions(_cube, spec, fields,
                                      skip=skip, limit=limit)
    else:
        result = tuple(_cube.find(spec, fields=fields, sort=sort,
                                  skip=skip, limit=limit))
    return result
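# Usage sketch (illustrative; `qh` and all names/queries below are
# hypothetical). A plain date-range query, then an execution-plan
# request:
#
#   docs = qh.find('alice', 'commit', 'author == "jdoe"',
#                  fields='author, message',
#                  date='2013-01-01~2013-12-31', limit=10)
#   plan = qh.find('alice', 'commit', 'author == "jdoe"',
#                  explain=True)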
def count(self, owner, cube, query, date=None):
    '''
    Wrapper around pymongo's find().count() command.

    Query syntax parsing is handled by `pql`.

    :param cube: cube name
    :param owner: username of cube owner
    :param query: the query in pql
    :param date: date (metrique date range) that should be queried.
                 If date==None then the most recent versions of the
                 objects will be queried.
    '''
    self.requires_read(owner, cube)
    query = query or ''
    query = query_add_date(query, date)
    # FIXME: move logging into parse_pql_query, after the
    # logging refactor
    spec = parse_pql_query(query)
    _cube = self.timeline(owner, cube)
    docs = _cube.find(spec=spec)
    return docs.count() if docs else 0
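# Usage sketch (illustrative; names and query are hypothetical):
#
#   n = qh.count('alice', 'commit', 'author == "jdoe"',
#                date='2013-01-01~2013-12-31')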
def sample_cube(self, owner, cube, sample_size=None, query=None):
    '''
    Take a pseudo-random sampling of objects from a given cube.

    :param cube: cube name
    :param owner: username of cube owner
    :param sample_size: number of objects to sample
    :param query: high-level query used to create the population
                  to sample from
    '''
    if not (owner and cube):
        self._raise(400, "owner and cube required")
    if sample_size is None:
        sample_size = SAMPLE_SIZE
    query = set_default(query, '', null_ok=True)
    spec = parse_pql_query(query)
    _cube = self.timeline(owner, cube)
    docs = _cube.find(spec)
    n = docs.count()
    if n <= sample_size:
        docs = tuple(docs)
    else:
        to_sample = sorted(set(random.sample(xrange(n), sample_size)))
        docs = [docs[i] for i in to_sample]
    return docs
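# Usage sketch (illustrative; names are hypothetical). With query=None
# the population is the whole timeline collection:
#
#   docs = qh.sample_cube('alice', 'commit', sample_size=10)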
def distinct(self, owner, cube, field, query=None, date=None):
    '''
    Return a distinct (unique) list of field values across the
    entire cube dataset.

    Query syntax parsing is handled by `pql`.

    :param cube: cube name
    :param owner: username of cube owner
    :param field: field to get distinct token values from
    :param query: pql query to run as a pre-filter
    :param string date: metrique date(range)

    If a query is provided, rather than running
    collection.distinct(field) directly, run distinct on a
    find() cursor.
    '''
    self.requires_read(owner, cube)
    if isinstance(query, basestring):
        query = query_add_date(query, date)
        spec = parse_pql_query(query)
        result = self.timeline(owner, cube).find(spec).distinct(field)
    else:
        result = self.timeline(owner, cube).distinct(field)
    return result
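# Usage sketch (illustrative; names and queries are hypothetical):
#
#   authors = qh.distinct('alice', 'commit', 'author')
#   # pre-filtered variant, run against a find() cursor:
#   recent = qh.distinct('alice', 'commit', 'author',
#                        query='files > 1',
#                        date='2013-01-01~2013-12-31')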
def remove_objects(self, owner, cube, query, date=None):
    '''
    Remove all objects matching the query (or id list) from the
    given cube.

    :param owner: username of cube owner
    :param cube: cube name
    :param query: pql query string, or a list/tuple of _ids
    :param string date: metrique date(range)
    '''
    self.requires_admin(owner, cube)
    if not query:
        return []
    if isinstance(query, basestring):
        query = query_add_date(query, date)
        spec = parse_pql_query(query)
    elif isinstance(query, (list, tuple)):
        spec = {'_id': {'$in': query}}
    else:
        raise ValueError(
            'Expected query string or list of ids, got: %s' % type(query))
    _cube = self.timeline(owner, cube, admin=True)
    return _cube.remove(spec)
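# Usage sketch (illustrative; names, query and ids are hypothetical):
#
#   removed = qh.remove_objects('alice', 'commit', 'author == "jdoe"')
#   # or remove by an explicit list of _ids:
#   removed = qh.remove_objects('alice', 'commit', ['id1', 'id2'])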
def history(self, owner, cube, query, by_field=None, date_list=None):
    '''
    Run a pql/mongodb query on the given cube and return back the
    aggregate historical counts of matching results, binned by date.

    Query syntax parsing is handled by `pql`.

    :param cube: cube name
    :param owner: username of cube owner
    :param query: the query in pql
    :param by_field: which field to slice/dice and aggregate on
    :param date_list: list of dates used to bin the results
    '''
    self.requires_read(owner, cube)
    date_list = sorted(map(dt2ts, date_list))
    query = '%s and _start < %s and (_end >= %s or _end == None)' % (
        query, max(date_list), min(date_list))
    spec = parse_pql_query(query)
    _cube = self.timeline(owner, cube)
    agg = [{'$match': spec},
           {'$group': {
               # without by_field, everything collapses into a single
               # group keyed by the literal string 'id'
               '_id': '$%s' % by_field if by_field else 'id',
               'starts': {'$push': '$_start'},
               'ends': {'$push': '$_end'}}}]
    logger.debug('Aggregation: %s' % agg)
    data = _cube.aggregate(agg)['result']
    # accumulate the counts
    res = defaultdict(lambda: defaultdict(int))
    for group in data:
        starts = sorted(group['starts'])
        ends = sorted([x for x in group['ends'] if x is not None])
        _id = group['_id']
        ind = 0
        # assuming date_list is sorted
        for date in date_list:
            while ind < len(starts) and starts[ind] < date:
                ind += 1
            # count versions that started before this date...
            res[date][_id] = ind
        ind = 0
        for date in date_list:
            while ind < len(ends) and ends[ind] < date:
                ind += 1
            # ...minus those that had already ended by then
            res[date][_id] -= ind
    # convert to the return form
    ret = []
    for date, value in res.items():
        if by_field:
            vals = []
            for field_val, count in value.items():
                vals.append({by_field: field_val, "count": count})
            ret.append({"date": date, "values": vals})
        else:
            ret.append({"date": date, "count": value['id']})
    return ret
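# Usage sketch (illustrative; `qh`, the names and the bin dates are
# hypothetical). date_list entries are converted with dt2ts(), so
# datetime objects are a reasonable input:
#
#   from datetime import datetime
#   bins = [datetime(2013, m, 1) for m in (1, 2, 3)]
#   hist = qh.history('alice', 'commit', 'files > 0',
#                     by_field='author', date_list=bins)
#   # -> [{'date': <ts>, 'values': [{'author': ..., 'count': ...}]},
#   #     ...]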