def find(cube, query, fields=None, date=None, most_recent=True):
    logger.debug('Running Find')
    if date is not None:
        # Timeline query: data lives under the 'fields' subdocument, so
        # rewrite each field name in the query to 'fields.<name>'. Word
        # boundaries keep one field name from being rewritten inside another.
        all_fields = get_fields(cube, '__all__')
        for f in all_fields:
            query = re.sub(r'\b%s\b' % re.escape(f), 'fields.%s' % f, query)
        # add the date constraint
        query = query + ' and ' + _get_date_pql_string(date)
    pql_parser = pql.SchemaFreeParser()
    try:
        # FIXME: make it a schema aware parser
        spec = pql_parser.parse(query)
    except Exception as e:
        raise ValueError("Invalid Query (%s)" % str(e))
    c = get_cube(cube)
    _cube = c.get_collection(timeline=(date is not None))
    logger.debug('Query: %s' % spec)
    fields = get_fields(cube, fields)
    if date is not None:
        # Project the nested timeline fields back up to the top level.
        project_d = dict([(f, '$fields.%s' % f) for f in fields])
        project_d.update(dict(_id='$id', _start='$start', _end='$end'))
        if most_recent:
            # Keep only the most recent version of each object id.
            docs = _cube.aggregate([
                {'$match': spec},
                {'$sort': {'start': -1}},
                {'$group': {'_id': '$id',
                            'fields': {'$first': '$fields'},
                            'start': {'$first': '$start'},
                            'end': {'$first': '$end'},
                            'id': {'$first': '$id'}}},
                {'$project': project_d}])
        else:
            docs = _cube.aggregate([
                {'$match': spec},
                {'$project': project_d}])
        docs = docs['result']
    else:
        docs = _cube.find(spec, fields)
        docs.batch_size(10000000)  # hard limit is 16M...
        docs = [d for d in docs]
    return docs
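# A minimal usage sketch for find(); the cube name and query below are
# hypothetical and assume a warehouse cube with an 'author' field already
# exists on this server:
def _example_find():
    # No 'date' given, so this hits the warehouse collection directly;
    # pql parses the query string into a mongo spec before querying.
    return find('gitrepo_commit', 'author == "jdoe@example.com"')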
def index_warehouse(cube, fields, force=False):
    '''
    NOTE: _id key index is generated automatically by mongo
    '''
    c = get_cube(cube)
    _cube = c.get_collection(admin=True)
    fields = get_fields(cube, fields)
    result = {}
    for field in fields:
        name = '%s-tokens' % field
        if force or c.get_field_property('index', field):
            logger.info(' %s... Indexing Warehouse (%s)%s' %
                        (YELLOW, field, ENDC))
            key = [(field, -1)]
            result[field] = _cube.ensure_index(key, name=name)
        else:
            result[field] = -1
    return result
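# A minimal usage sketch for index_warehouse(); the cube name is hypothetical
# and '__all__' is assumed to expand to every known field via get_fields():
def _example_index_warehouse():
    # force=True (re)indexes every field, not just those flagged with the
    # 'index' field property.
    return index_warehouse('gitrepo_commit', '__all__', force=True)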
def extract(cube, **kwargs):
    logger.info(' Starting Update operation!')
    logger.info(' %sCube: %s%s' % (YELLOW, cube, ENDC))
    c = get_cube(cube)
    logger.debug('%sExtract - Start%s' % (YELLOW, ENDC))
    _fields = kwargs.get('fields')
    fields = get_fields(cube, _fields)
    if fields:
        # Run the cube's extract_func once per resolved field.
        result = {}
        for field in fields:
            kwargs['field'] = field
            logger.debug('%sField: %s%s' % (YELLOW, field, ENDC))
            result[field] = c.extract_func(**kwargs)
            logger.info('Extract - Complete: (%s.%s): %s' %
                        (cube, field, result[field]))
    else:
        # No fields resolved; run a single cube-wide extract.
        result = c.extract_func(**kwargs)
        logger.info('Extract - Complete: (%s): %s' % (cube, result))
    return result
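# A minimal usage sketch for extract(); the cube name is hypothetical and any
# keyword arguments are passed straight through to the cube's extract_func():
def _example_extract():
    # No 'fields' kwarg, so the cube is extracted in a single pass.
    return extract('gitrepo_commit')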
def fetch(cube, fields, skip=0, limit=0, ids=None):
    logger.debug('Running Fetch (skip:%s, limit:%s, ids:%s)' % (
        skip, limit, len(ids) if ids else 0))
    logger.debug('... Fields: %s' % fields)
    c = get_cube(cube)
    _cube = c.get_collection()
    fields = get_fields(cube, fields)
    logger.debug('Return Fields: %s' % fields)
    sort = [('_id', 1)]
    if ids:
        spec = {'_id': {'$in': parse_ids(ids)}}
    else:
        spec = {}
    docs = _cube.find(spec, fields, skip=skip, limit=limit, sort=sort)
    docs.batch_size(10000000)  # hard limit is 16M...
    return [d for d in docs]
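# A minimal usage sketch for fetch(); the cube name is hypothetical and
# '__all__' is assumed to resolve to all fields via get_fields():
def _example_fetch():
    # Page through the warehouse 100 documents at a time, sorted by _id.
    return fetch('gitrepo_commit', '__all__', skip=0, limit=100)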