Exemple #1
0
    def find(self, owner, cube, query, fields=None, date=None,
             sort=None, one=False, explain=False, merge_versions=True,
             skip=0, limit=0):
        '''
        Run a pql query against the cube's timeline collection.

        :param owner: username of cube owner
        :param cube: cube name
        :param query: the query in pql
        :param fields: fields that should be returned
        :param date: metrique date (range) to query; None means
                     "most recent versions only"
        :param sort: sort specification for the results
        :param one: return only the first matching object
        :param explain: return the execution plan instead of results
        :param merge_versions: merge versions whose field values are equal
        :param skip: number of matched results to skip
        :param limit: max number of matched results to return
        '''
        self.cube_exists(owner, cube)
        self.requires_owner_read(owner, cube)

        sort = self.check_sort(sort)
        fields = self.get_fields(owner, cube, fields)

        # version merging only applies when querying a date range with an
        # explicit field map that excludes _id
        if date is None or fields is None or fields.get('_id'):
            merge_versions = False

        spec = parse_pql_query(query_add_date(query or '', date))

        _cube = self.timeline(owner, cube)
        if explain:
            return _cube.find(spec, fields=fields, sort=sort,
                              skip=skip, limit=limit).explain()
        if one:
            return _cube.find_one(spec, fields=fields, sort=sort,
                                  skip=skip, limit=limit)
        if merge_versions:
            # merge_versions ignores sort (for now)
            return self._merge_versions(_cube, spec, fields,
                                        skip=skip, limit=limit)
        return tuple(_cube.find(spec, fields=fields, sort=sort,
                                skip=skip, limit=limit))
Exemple #2
0
    def remove_objects(self, owner, cube, query, date=None):
        '''
        Remove all matching objects (docs) from the given
        cube (mongodb collection).

        :param owner: username of cube owner
        :param cube: cube name
        :param query: pql query string, or a list/tuple of object ids
        :param string date: metrique date(range)
        '''
        self.cube_exists(owner, cube)
        self.requires_owner_admin(owner, cube)
        if not query:
            # nothing to match; nothing removed
            return []

        if isinstance(query, basestring):
            # pql string: restrict it to the requested date range first
            spec = parse_pql_query(query_add_date(query, date))
        elif isinstance(query, (list, tuple)):
            # explicit sequence of object ids
            spec = {'_id': {'$in': query}}
        else:
            raise ValueError(
                'Expected query string or list of ids, got: %s' % type(query))

        return self.timeline(owner, cube, admin=True).remove(spec)
Exemple #3
0
    def sample(self, owner, cube, sample_size=None, fields=None,
               date=None, query=None):
        '''
        Draws a sample of objects at random from the cube.

        Query syntax parsing is handled by `pql`.

        :param cube: cube name
        :param owner: username of cube owner
        :param sample_size: Size of the sample; if None, all
                            matching objects are returned
        :param fields: Fields that should be returned
        :param date: date (metrique date range) that should be queried
                        If date==None then the most recent versions of
                        the objects will be queried
        :param query: query used to filter sampleset
        '''
        self.requires_read(owner, cube)
        fields = self.get_fields(owner, cube, fields)
        query = query_add_date(query, date)
        spec = parse_pql_query(query)
        _cube = self.timeline(owner, cube)
        _docs = _cube.find(spec, fields=fields)
        n = _docs.count()
        # FIX: sample_size defaults to None; previously that fell through
        # to random.sample(xrange(n), None) and raised TypeError. Treat
        # None as "no sampling" and return the full result set.
        if sample_size is None or n <= sample_size:
            docs = tuple(_docs)
        else:
            to_sample = sorted(set(random.sample(xrange(n), sample_size)))
            docs = [_docs[i] for i in to_sample]
        return docs
Exemple #4
0
    def history(self, owner, cube, query, by_field=None, date_list=None):
        '''
        For each date in date_list, count the object versions that were
        "live" at that date (started before it and not yet ended),
        optionally grouped by the values of `by_field`.

        :param owner: username of cube owner
        :param cube: cube name
        :param query: pql query used to pre-filter the timeline
        :param by_field: field whose values the counts are grouped by;
                         if None a single aggregate count per date is
                         returned
        :param date_list: dates (converted via dt2ts) used to bin results
        '''
        self.cube_exists(owner, cube)
        self.requires_owner_read(owner, cube)

        date_list = sorted(map(dt2ts, date_list))
        # only match versions that could overlap the queried date window
        spec = parse_pql_query(
            '%s and _start < %s and (_end >= %s or _end == None)' % (
                query, max(date_list), min(date_list)))

        _cube = self.timeline(owner, cube)

        # one group per by_field value (or a single literal 'id' group),
        # each carrying all matched versions' _start/_end timestamps
        agg = [{'$match': spec},
               {'$group':
                {'_id': '$%s' % by_field if by_field else 'id',
                 'starts': {'$push': '$_start'},
                 'ends': {'$push': '$_end'}}
                }]
        logger.debug('Aggregation: %s' % agg)
        data = _cube.aggregate(agg)['result']

        # accumulate the counts
        res = defaultdict(lambda: defaultdict(int))
        for group in data:
            starts = sorted(group['starts'])
            # a None _end means "still current" -- it never ends a count
            ends = sorted([e for e in group['ends'] if e is not None])
            _id = group['_id']
            ind = 0
            # assuming date_list is sorted
            for date in date_list:
                # ind becomes the number of versions started before `date`
                while ind < len(starts) and starts[ind] < date:
                    ind += 1
                res[date][_id] = ind
            ind = 0
            for date in date_list:
                # subtract versions that had already ended before `date`
                while ind < len(ends) and ends[ind] < date:
                    ind += 1
                res[date][_id] -= ind

        # convert to the return form
        ret = []
        for date, value in res.items():
            if by_field:
                vals = []
                for field_val, count in value.items():
                    vals.append({by_field: field_val,
                                 "count": count})
                ret.append({"date": date,
                            "values": vals})
            else:
                # no grouping: all counts live under the literal 'id' key
                ret.append({"date": date,
                            "count": value['id']})
        return ret
Exemple #5
0
    def find(self, owner, cube, query, fields=None, date=None,
             sort=None, one=False, explain=False, merge_versions=True,
             skip=0, limit=0):
        '''
        Wrapper around pymongo's find() command.

        Query syntax parsing is handled by `pql`.

        :param cube: cube name
        :param owner: username of cube owner
        :param query: The query in pql
        :param fields: Fields that should be returned (comma-separated)
        :param date: date (metrique date range) that should be queried.
                    If date==None then the most recent versions of the
                    objects will be queried.
        :param explain: return execution plan instead of results
        :param merge_versions: merge versions where fields values equal
        :param one: return back only first matching object
        :param sort: return back results sorted
        :param skip: number of results matched to skip and not return
        :param limit: number of results matched to return of total found
        '''
        self.requires_read(owner, cube)

        sort = self.check_sort(sort)
        fields = self.get_fields(owner, cube, fields)

        # merging versions only makes sense when a date range is queried
        # with an explicit field map that excludes _id
        if date is None or fields is None or ('_id' in fields and
                                              fields['_id']):
            merge_versions = False

        query = query or ''
        query = query_add_date(query, date)
        spec = parse_pql_query(query)

        _cube = self.timeline(owner, cube)
        if explain:
            result = _cube.find(spec, fields=fields, sort=sort,
                                skip=skip, limit=limit).explain()
        elif one:
            result = _cube.find_one(spec, fields=fields, sort=sort,
                                    skip=skip, limit=limit)
        elif merge_versions:
            # merge_versions ignores sort (for now)
            result = self._merge_versions(_cube, spec, fields,
                                          skip=skip, limit=limit)
        else:
            # materialize the cursor so callers get a stable sequence
            result = tuple(_cube.find(spec, fields=fields, sort=sort,
                                      skip=skip, limit=limit))
        return result
Exemple #6
0
 def sample(self, owner, cube, sample_size=None, fields=None,
            date=None, query=None):
     '''
     Draw a pseudo-random sample of objects from the cube.

     :param owner: username of cube owner
     :param cube: cube name
     :param sample_size: size of the sample; if None, all matching
                         objects are returned
     :param fields: fields that should be returned
     :param date: metrique date (range) to query; None means "most
                  recent versions only"
     :param query: pql query used to filter the sample population
     '''
     self.cube_exists(owner, cube)
     self.requires_owner_read(owner, cube)
     fields = self.get_fields(owner, cube, fields)
     query = query_add_date(query, date)
     spec = parse_pql_query(query)
     _cube = self.timeline(owner, cube)
     _docs = _cube.find(spec, fields=fields)
     n = _docs.count()
     # FIX: sample_size defaults to None; previously that fell through
     # to random.sample(xrange(n), None) and raised TypeError. Treat
     # None as "no sampling" and return the full result set.
     if sample_size is None or n <= sample_size:
         docs = tuple(_docs)
     else:
         to_sample = sorted(set(random.sample(xrange(n), sample_size)))
         docs = [_docs[i] for i in to_sample]
     return docs
Exemple #7
0
 def sample_timeline(self, owner, cube, sample_size=None, query=None):
     # Pull a pseudo-random subset of timeline docs for owner/cube;
     # falls back to SAMPLE_SIZE when no size is requested.
     if not (owner and cube):
         self._raise(400, "owner and cube required")
     sample_size = SAMPLE_SIZE if sample_size is None else sample_size
     spec = parse_pql_query(set_default(query, '', null_ok=True))
     docs = self.timeline(owner, cube).find(spec)
     total = docs.count()
     if total <= sample_size:
         return tuple(docs)
     picks = sorted(set(random.sample(xrange(total), sample_size)))
     return [docs[i] for i in picks]
Exemple #8
0
    def count(self, owner, cube, query, date=None):
        '''
        Wrapper around pymongo's find().count() command.

        Query syntax parsing is handled by `pql`.

        :param cube: cube name
        :param owner: username of cube owner
        :param query: The query in pql
        :param date: date (metrique date range) that should be queried;
                     if date==None then the most recent versions of the
                     objects will be queried
        '''
        self.requires_read(owner, cube)

        # FIXME: logging move to parse_pql_query, after
        # logging refactor
        spec = parse_pql_query(query_add_date(query or '', date))
        cursor = self.timeline(owner, cube).find(spec=spec)
        if not cursor:
            return 0
        return cursor.count()
Exemple #9
0
    def sample_cube(self, owner, cube, sample_size=None, query=None):
        '''
        Take a pseudo-random sampling of objects from a given cube.

        :param cube: cube name
        :param owner: username of cube owner
        :param sample_size: number of objects to sample; defaults to
                            SAMPLE_SIZE when None
        :param query: high-level query used to create population to sample
        '''
        if not (owner and cube):
            self._raise(400, "owner and cube required")
        sample_size = SAMPLE_SIZE if sample_size is None else sample_size
        spec = parse_pql_query(set_default(query, '', null_ok=True))
        cursor = self.timeline(owner, cube).find(spec)
        total = cursor.count()
        if total <= sample_size:
            # population no bigger than the sample; return everything
            return tuple(cursor)
        chosen = sorted(set(random.sample(xrange(total), sample_size)))
        return [cursor[i] for i in chosen]
Exemple #10
0
    def distinct(self, owner, cube, field, query=None, date=None):
        '''
        Return back a distinct (unique) list of field values
        across the entire cube dataset.

        Query syntax parsing is handled by `pql`.

        :param cube: cube name
        :param owner: username of cube owner
        :param field: field to get distinct token values from
        :param query: pql query to run as a pre-filter
        :param string date: metrique date(range)

        If query is provided, rather than running collection.distinct(field)
        directly, run on a find cursor.
        '''
        self.requires_read(owner, cube)
        _cube = self.timeline(owner, cube)
        if not isinstance(query, basestring):
            # no pre-filter: distinct over the whole collection
            return _cube.distinct(field)
        spec = parse_pql_query(query_add_date(query, date))
        return _cube.find(spec).distinct(field)
Exemple #11
0
    def remove_objects(self, owner, cube, query, date=None):
        '''
        Remove all the objects from the given cube.

        :param owner: username of cube owner
        :param cube: cube name
        :param string query: pql query string (or list/tuple of object ids)
        :param string date: metrique date(range)
        '''
        self.requires_admin(owner, cube)
        if not query:
            # nothing to match; nothing removed
            return []

        if isinstance(query, basestring):
            # pql string: restrict to the requested date range first
            query = query_add_date(query, date)
            spec = parse_pql_query(query)
        elif isinstance(query, (list, tuple)):
            # explicit sequence of object ids
            spec = {'_id': {'$in': query}}
        else:
            raise ValueError(
                'Expected query string or list of ids, got: %s' % type(query))

        _cube = self.timeline(owner, cube, admin=True)
        return _cube.remove(spec)
Exemple #12
0
    def history(self, owner, cube, query, by_field=None, date_list=None):
        '''
        Run a pql mongodb based query on the given cube and return back the
        aggregate historical counts of matching results.

        Query sytax parsing is handled by `pql`.

        :param cube: cube name
        :param owner: username of cube owner
        :param query: The query in pql
        :param by_field: Which field to slice/dice and aggregate from
        :param date: list of dates that should be used to bin the results
        '''
        self.requires_read(owner, cube)

        date_list = sorted(map(dt2ts, date_list))
        query = '%s and _start < %s and (_end >= %s or _end == None)' % (
                query, max(date_list), min(date_list))
        spec = parse_pql_query(query)
        _cube = self.timeline(owner, cube)

        agg = [{'$match': spec},
               {'$group':
                {'_id': '$%s' % by_field if by_field else 'id',
                 'starts': {'$push': '$_start'},
                 'ends': {'$push': '$_end'}}
                }]
        logger.debug('Aggregation: %s' % agg)
        data = _cube.aggregate(agg)['result']

        # accumulate the counts
        res = defaultdict(lambda: defaultdict(int))
        for group in data:
            starts = sorted(group['starts'])
            ends = sorted([x for x in group['ends'] if x is not None])
            _id = group['_id']
            ind = 0
            # assuming date_list is sorted
            for date in date_list:
                while ind < len(starts) and starts[ind] < date:
                    ind += 1
                res[date][_id] = ind
            ind = 0
            for date in date_list:
                while ind < len(ends) and ends[ind] < date:
                    ind += 1
                res[date][_id] -= ind

        # convert to the return form
        ret = []
        for date, value in res.items():
            if by_field:
                vals = []
                for field_val, count in value.items():
                    vals.append({by_field: field_val,
                                 "count": count})
                ret.append({"date": date,
                            "values": vals})
            else:
                ret.append({"date": date,
                            "count": value['id']})
        return ret