def get(self):
    # default to None; get_argument() raises if the argument is missing
    cube = self.get_argument('cube', None)
    details = self.get_argument('details', None)
    if cube is None:
        # return a list of cubes
        return get_cubes()
    else:
        # return a list of fields in a cube
        if details:
            result = get_cube(cube).fields
        else:
            result = sorted(get_cube(cube).fields.keys())
        return result

def save_doc(cube, field, tokens, id=None):
    '''
    All subclasses use this method to 'save' a document into the warehouse
    '''
    c = get_cube(cube)
    if field not in c.fields:
        raise ValueError("Invalid field (%s)" % field)
    container = c.get_field_property('container', field, False)
    if tokens and container:
        if type(tokens) is not list:
            raise TypeError("Tokens type must be list()")
        else:
            tokens = sorted(tokens)
    # normalize empty lists -> None
    if not tokens:
        tokens = None
    if id is None:
        id = ObjectId()
    now = datetime.now(UTC)
    spec_now = {'_id': id}
    update_now = {'$set': {field: tokens, '_mtime': now}}
    _cube = c.get_collection(admin=True)
    _cube.update(spec_now, update_now, upsert=True)
    return 1  # eg, one document added

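# Illustrative usage, a minimal sketch: the 'bugs' cube and 'status' field
# are hypothetical and assume a cube registered with get_cube().
#
#   >>> save_doc('bugs', 'status', ['NEW', 'ASSIGNED'], id=1001)
#   1
#
# This upserts {'status': [...], '_mtime': <utcnow>} onto the warehouse
# document with _id 1001; container-field tokens are sorted before saving.
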
def _activity_import(cube, ids):
    c = get_cube(cube)
    timeline = c.get_collection(timeline=True, admin=True)
    timeline.ensure_index([('id', 1), ('start', 1)])
    time_docs = timeline.find({'id': {'$in': ids}},
                              sort=[('id', 1), ('start', 1)])

    h = c.get_collection(timeline=False, admin=False,
                         cube='%s_activity' % c.name)
    act_docs = h.find({'id': {'$in': ids}}, sort=[('id', 1), ('when', -1)])
    act_docs_iter = iter(act_docs)

    act_id = -1
    last_doc_id = -1
    activities = []
    for time_doc in time_docs:
        tid = time_doc['id']
        # we want to update only the oldest version of the object
        if tid != last_doc_id:
            last_doc_id = tid
            while act_id <= tid:
                try:
                    act_doc = act_docs_iter.next()
                    act_id = act_doc['id']
                    activities.append(act_doc)
                except StopIteration:
                    break
            acts = [act for act in activities if act['id'] == tid]
            _activity_import_doc(c, time_doc, acts, timeline)
            activities = [act for act in activities if act['id'] > tid]

def activity_import(cube, ids=None):
    logger.debug('Running activity history import')
    c = get_cube(cube)
    h = c.get_collection(timeline=False, admin=True,
                         cube='%s_activity' % c.name)
    h.ensure_index([('id', 1), ('when', -1)])
    if ids is None:
        # Run on all the ids
        t = c.get_collection(timeline=True, admin=True)
        docs = t.find({'current': True}, fields=['id'])
        logger.debug('Found %s docs' % docs.count())

        ids = []
        for done, doc in enumerate(docs):
            ids.append(doc['id'])
            if done % 100000 == 0:
                _activity_import(cube, ids)
                ids = []
                logger.debug(' ... %s done' % done)
        _activity_import(cube, ids)
        logger.debug(' ... %s done' % (done + 1))
    elif type(ids) is list:
        _activity_import(cube, ids)
    elif isinstance(ids, basestring):
        ids = map(int, ids.split(','))
        _activity_import(cube, ids)

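# Illustrative usage, a sketch with hypothetical cube/ids:
#
#   >>> activity_import('bugs')                 # all current timeline ids
#   >>> activity_import('bugs', [1001, 1002])   # an explicit id list
#   >>> activity_import('bugs', '1001,1002')    # comma-separated string form
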
def last_known_warehouse_mtime(cube, field=None, value=None):
    '''get the last known warehouse object mtime'''
    c = get_cube(cube)
    _cube = c.get_collection()
    start = None
    if field:
        # we need to check the etl_activity collection
        if value:
            spec = {'cube': cube, field: value}
        else:
            spec = {'cube': cube, field: {'$exists': True}}
        doc = c.c_etl_activity.find_one(spec, ['%s.mtime' % field])
        if doc:
            start = doc[field]['mtime']
    else:
        # get the most recent _mtime of all objects in the cube
        mtime = '_mtime'
        spec = {}
        doc = _cube.find_one(spec, [mtime], sort=[(mtime, -1)])
        if doc:
            start = doc[mtime]

    logger.debug('... Last field mtime: %s' % start)
    return start

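# Illustrative usage, a sketch (hypothetical names):
#
#   >>> last_known_warehouse_mtime('bugs')            # newest _mtime overall
#   >>> last_known_warehouse_mtime('bugs', 'status')  # last etl touch of field
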
def save_objects(cube, objs):
    if not objs:
        raise ValueError("Empty objects list")
    c = get_cube(cube)
    expected_fields = set(c.fields.keys())
    # we always expect the _id to be defined as well
    expected_fields.add('_id')
    _cube = c.get_collection(admin=True)
    if type(objs) not in [list, tuple]:
        raise TypeError("Expected list or tuple, got type(%s)" % type(objs))
    now = datetime.now(UTC)
    for x, obj in enumerate(objs):
        if not obj:
            raise ValueError("Empty object")
        elif not isinstance(obj, dict):
            raise TypeError(
                "Expected objects as dict, got type(%s)" % type(obj))
        obj_fields = set(obj.keys())
        if not obj_fields <= expected_fields:
            raise ValueError(
                "Object includes unexpected fields.\n"
                "Unexpected: %s" % (obj_fields - expected_fields))
        else:
            objs[x].update({'_mtime': now})
    _cube.insert(objs, manipulate=False)
    return len(objs)

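# Illustrative usage, a sketch; every key must be a defined cube field
# (or '_id'); all names are hypothetical:
#
#   >>> save_objects('bugs', [{'_id': 1001, 'status': 'NEW'}])
#   1
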
def _snapshot(cube, ids):
    c = get_cube(cube)
    w = c.get_collection(admin=False, timeline=False)
    t = c.get_collection(admin=True, timeline=True)
    docs = w.find({'_id': {'$in': ids}}, sort=[('_id', 1)])

    logger.debug('Snapshot Timeline Index: Start')
    t.ensure_index([('current', 1), ('id', 1)])
    logger.debug('... Snapshot Timeline Index: Done')

    time_docs = t.find({'current': True, 'id': {'$in': ids}},
                       sort=[('id', 1)])
    time_docs_iter = iter(time_docs)
    tid = -1

    batch_insert = []
    for doc in docs:
        _id = doc.pop('_id')
        _mtime = doc.pop('_mtime')

        # time_doc will contain first doc that has id >= _id,
        # it might be a document where id > _id
        while tid < _id:
            try:
                time_doc = time_docs_iter.next()
                tid = time_doc['id']
            except StopIteration:
                break

        store_new_doc = False
        if _id == tid:
            if doc != time_doc['fields']:
                store_new_doc = True
                spec_now = {'_id': time_doc['_id']}
                # a single $set must carry both keys; two '$set' keys in
                # one dict would silently drop the first
                update_now = {'$set': {'current': False, 'end': _mtime}}
                t.update(spec_now, update_now, upsert=True)
        else:
            store_new_doc = True

        if store_new_doc:
            new_doc = {'fields': doc,
                       'id': _id,
                       'start': _mtime,
                       'end': None,
                       'current': True}
            batch_insert.append(new_doc)
        if len(batch_insert) > 1000:
            t.insert(batch_insert)
            batch_insert = []
    if len(batch_insert) > 0:
        t.insert(batch_insert)

def get_last_id(cube, field):
    '''get the largest _id among documents that define the given field'''
    c = get_cube(cube)
    spec = {field: {'$exists': True}}
    _cube = c.get_collection()
    logger.debug(" ... %s.%s._get_last_id spec: %s" % (
        _cube.db.name, _cube.collection, spec))
    last_id = _cube.find_one(spec, {'_id': 1}, sort=[('_id', -1)])
    if last_id:
        value = last_id['_id']
    else:
        value = None
    logger.debug(" ... ... Last ID: %s" % last_id)
    return value

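# Illustrative usage, a sketch (hypothetical names):
#
#   >>> get_last_id('bugs', 'status')
#
# Returns the largest _id among docs defining 'status', or None.
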
def find(cube, query, fields=None, date=None, most_recent=True):
    logger.debug('Running Find')
    if date is not None:
        # we will be doing a timeline query so we need to rename the fields
        # WARNING: might not work if some field is a substring of other field
        all_fields = get_fields(cube, '__all__')
        for f in all_fields:
            query = re.sub(f, 'fields.%s' % f, query)
        # add the date constraint
        query = query + ' and ' + _get_date_pql_string(date)

    pql_parser = pql.SchemaFreeParser()
    try:
        # FIXME: make it a schema aware parser
        spec = pql_parser.parse(query)
    except Exception as e:
        raise ValueError("Invalid Query (%s)" % str(e))

    c = get_cube(cube)
    _cube = c.get_collection(timeline=(date is not None))
    logger.debug('Query: %s' % spec)

    fields = get_fields(cube, fields)
    if date is not None:
        project_d = dict([(f, '$fields.%s' % f) for f in fields])
        project_d.update(dict(_id='$id', _start='$start', _end='$end'))
        if most_recent:
            docs = _cube.aggregate([
                {'$match': spec},
                {'$sort': {'start': -1}},
                {'$group': {'_id': '$id',
                            'fields': {'$first': '$fields'},
                            'start': {'$first': '$start'},
                            'end': {'$first': '$end'},
                            'id': {'$first': '$id'}}},
                {'$project': project_d}])
        else:
            docs = _cube.aggregate([{'$match': spec},
                                    {'$project': project_d}])
        docs = docs['result']
    else:
        docs = _cube.find(spec, fields)
        docs.batch_size(10000000)  # hard limit is 16M...
        docs = [d for d in docs]
    return docs

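# Illustrative usage, a sketch; the PQL queries and names are hypothetical,
# and the date string form is whatever _get_date_pql_string() expects:
#
#   >>> find('bugs', 'status == "NEW"', fields='status')   # warehouse query
#   >>> find('bugs', 'status == "NEW"', fields='status',
#   ...      date='2013-01-01')                             # timeline query
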
def index_warehouse(cube, fields, force=False):
    '''
    NOTE: _id key index is generated automatically by mongo
    '''
    c = get_cube(cube)
    _cube = c.get_collection(admin=True)

    fields = get_fields(cube, fields)
    result = {}
    for field in fields:
        name = '%s-tokens' % field
        if force or c.get_field_property('index', field):
            logger.info(' %s... Indexing Warehouse (%s)%s' %
                        (YELLOW, field, ENDC))
            key = [(field, -1)]
            result[field] = _cube.ensure_index(key, name=name)
        else:
            result[field] = -1
    return result

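# Illustrative usage, a sketch ('__all__' expansion is handled by
# get_fields; names hypothetical):
#
#   >>> index_warehouse('bugs', '__all__')             # honor 'index' property
#   >>> index_warehouse('bugs', 'status', force=True)  # index regardless
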
def count(cube, query):
    logger.debug('Running Count')

    pql_parser = pql.SchemaFreeParser()
    try:
        # FIXME: make it a schema aware parser
        spec = pql_parser.parse(query)
    except Exception as e:
        raise ValueError("Invalid Query (%s)" % str(e))

    c = get_cube(cube)
    _cube = c.get_collection()
    logger.debug('Query: %s' % spec)

    docs = _cube.find(spec)
    if docs:
        return docs.count()
    else:
        return 0

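# Illustrative usage, a sketch (hypothetical query):
#
#   >>> count('bugs', 'status == "NEW"')  # -> number of matching docs
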
def _extract_func(cube, field, **kwargs):
    c = get_cube(cube)
    # id_x if None will become ObjectId()
    id_x = c.get_field_property('id_x', field)
    # raw_x if None will become field
    raw_x = c.get_field_property('raw_x', field, field)
    # convert if None will skip convert step
    convert = c.get_field_property('convert', field)
    # _type will be default if not set
    _type = c.get_field_property('type', field)

    saved = 0
    failed = []
    for item in c._reader:
        if not item:
            continue
        # id_x/raw_x may be callables or item keys
        try:
            id_ = id_x(item)
        except TypeError:
            id_ = item[id_x]
        try:
            raw = raw_x(item)
        except TypeError:
            raw = item[raw_x]
        tokens = type_cast(raw, _type)
        if convert:
            tokens = convert(tokens)
        saved += save_doc(c.name, field, tokens, id_)
        if not saved:
            failed.append(id_)
    result = {'saved': saved}
    if failed:
        result.update({'failed_ids': failed})
    return result

def snapshot(cube, ids=None):
    logger.debug('Running snapshot')
    if ids is None:
        # Run on all the ids
        c = get_cube(cube)
        w = c.get_collection(admin=False, timeline=False)
        docs = w.find(fields=['_id'])
        logger.debug('Found %s docs' % docs.count())

        ids_to_snapshot = []
        for done, doc in enumerate(docs):
            ids_to_snapshot.append(doc['_id'])
            if done % 100000 == 0:
                _snapshot(cube, ids_to_snapshot)
                ids_to_snapshot = []
                logger.debug(' ... %s done' % done)
        _snapshot(cube, ids_to_snapshot)
    elif type(ids) is list:
        _snapshot(cube, ids)
    elif isinstance(ids, basestring):
        ids = map(int, ids.split(','))
        _snapshot(cube, ids)

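# Illustrative usage, a sketch; id handling mirrors activity_import():
#
#   >>> snapshot('bugs')               # snapshot every warehouse doc
#   >>> snapshot('bugs', [1001, 1002])
#   >>> snapshot('bugs', '1001,1002')
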
def get_tokens(cube, qspec, return_field=None):
    '''
    shortcut for finding fields tokens; return a list of tokens
    which map to raw_pattern and return_field/compare_field
    '''
    c = get_cube(cube)
    _cube = c.get_collection()
    if return_field is None:
        return_field = '_id'
        spec = {}
    else:
        spec = {return_field: {'$exists': True}}
    for compare_field, raw_pattern in qspec.iteritems():
        spec.update({compare_field: raw_pattern})

    rf = {return_field: 1}
    docs = _cube.find(spec, rf, manipulate=False)
    docs.batch_size(10000000)  # hard limit is 16M...

    _tokens = []
    if docs:
        for doc in docs:
            tokens = doc.get(return_field)
            if not tokens:
                continue
            elif type(tokens) is list:
                _tokens.extend(tokens)
            else:
                _tokens.append(tokens)
    if not _tokens:
        _tokens = None
    elif len(_tokens) == 1:  # 'is 1' would test identity, not equality
        _tokens = _tokens[0]
    return _tokens

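# Illustrative usage, a sketch (qspec keys and values are hypothetical):
#
#   >>> get_tokens('bugs', {'status': 'NEW'}, return_field='assignee')
#
# Returns None, a single token, or a list of tokens, depending on how
# many values matched.
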
def extract(cube, **kwargs):
    logger.info(' Starting Update operation!')
    logger.info(' %sCube: %s%s' % (YELLOW, cube, ENDC))
    c = get_cube(cube)

    logger.debug('%sExtract - Start%s' % (YELLOW, ENDC))

    _fields = kwargs.get('fields')
    fields = get_fields(cube, _fields)
    if fields:
        result = {}
        for field in fields:
            kwargs['field'] = field
            logger.debug('%sField: %s%s' % (YELLOW, field, ENDC))
            result[field] = c.extract_func(**kwargs)
            logger.info('Extract - Complete: (%s.%s): %s' %
                        (cube, field, result[field]))
    else:
        result = c.extract_func(**kwargs)
        logger.info('Extract - Complete: (%s): %s' % (cube, result))
    return result

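# Illustrative usage, a sketch; the per-field result shape comes from the
# cube's extract_func (eg, {'saved': N}); names and values hypothetical:
#
#   >>> extract('bugs', fields='status')
#   {'status': {'saved': 42}}
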
def fetch(cube, fields, skip=0, limit=0, ids=None):
    # guard len() here; ids may be None
    logger.debug('Running Fetch (skip:%s, limit:%s, ids:%s)' % (
        skip, limit, len(ids) if ids else 0))
    logger.debug('... Fields: %s' % fields)

    c = get_cube(cube)
    _cube = c.get_collection()

    fields = get_fields(cube, fields)
    logger.debug('Return Fields: %s' % fields)

    sort = [('_id', 1)]
    if ids:
        spec = {'_id': {'$in': parse_ids(ids)}}
    else:
        spec = {}
    docs = _cube.find(spec, fields, skip=skip, limit=limit, sort=sort)
    docs.batch_size(10000000)  # hard limit is 16M...
    return [d for d in docs]

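# Illustrative usage, a sketch (assumes parse_ids() accepts the given id
# form; names hypothetical):
#
#   >>> fetch('bugs', 'status', limit=10)
#   >>> fetch('bugs', 'status', ids=[1001, 1002])
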
def _extract_func(cube, **kwargs):
    '''
    SQL import method
    '''
    c = get_cube(cube)
    field = kwargs.get('field')
    if not field:
        raise ValueError("Field argument required")
    force = int(kwargs.get('force', 0))
    id_delta = kwargs.get('id_delta', None)
    if id_delta:
        if force:
            raise RuntimeError(
                "force and id_delta can't be used simultaneously")
        else:
            touch = False
    else:
        touch = True

    db = c.get_field_property('db', field)
    table = c.get_field_property('table', field)
    db_table = '%s.%s' % (db, table)
    column = c.get_field_property('column', field)
    table_column = '%s.%s' % (table, column)

    # max number of rows to return per call (ie, LIMIT)
    row_limit = c.get_field_property('row_limit', field, c.row_limit)
    try:
        row_limit = int(row_limit)
    except (TypeError, ValueError):
        raise ValueError("row_limit must be a number")

    sql_where = []
    sql_groupby = ''
    _sql = c.get_field_property('sql', field)
    if not _sql:
        sql = 'SELECT %s, %s.%s FROM %s' % (
            table_column, table, field, db_table)
    else:
        sql = 'SELECT %s, %s FROM ' % (table_column, _sql[0])
        _from = [db_table]
        # FIXME: THIS IS UGLY! use a dict... or sqlalchemy
        if _sql[1]:
            _from.extend(_sql[1])
        sql += ', '.join(_from)
        sql += ' '
        if _sql[2]:
            sql += ' '.join(_sql[2])
        sql += ' '
        if _sql[3]:
            sql_where.append('(%s)' % ' OR '.join(_sql[3]))
        try:
            if _sql[4]:
                sql_groupby = _sql[4]
        except IndexError:
            pass

    delta_filter = []
    delta_filter_sql = None
    if force:
        # force full update
        _delta = False
    else:
        _delta = c.get_field_property('delta', field, True)
    if _delta:
        # delta is enabled
        # the following deltas are mutually exclusive
        if id_delta:
            delta_sql = "(%s IN (%s))" % (table_column, id_delta)
            delta_filter.append(delta_sql)
        elif c.get_field_property('delta_new_ids', field):
            # if delta_new_ids is on, but there is no 'last_id',
            # then we need to do a FULL run...
            last_id = get_last_id(c.name, field)
            if last_id:
                # FIXME: any reason to ensure we know what the _id is
                # typecasted as?
                try:
                    last_id = int(last_id)
                except (TypeError, ValueError):
                    pass
                if type(last_id) in [INT_TYPE, FLOAT_TYPE]:
                    last_id_sql = "%s > %s" % (table_column, last_id)
                else:
                    last_id_sql = "%s > '%s'" % (table_column, last_id)
                delta_filter.append(last_id_sql)

            mtime_columns = c.get_field_property('delta_mtime', field)
            if mtime_columns:
                if isinstance(mtime_columns, basestring):
                    mtime_columns = [mtime_columns]
                last_update_dt = last_known_warehouse_mtime(c.name, field)
                if last_update_dt:
                    last_update_dt = last_update_dt.strftime(
                        '%Y-%m-%d %H:%M:%S %z')
                    dt_format = "yyyy-MM-dd HH:mm:ss z"
                    for _column in mtime_columns:
                        _sql = "%s > parseTimestamp('%s', '%s')" % (
                            _column, last_update_dt, dt_format)
                        delta_filter.append(_sql)

    if delta_filter:
        delta_filter_sql = ' OR '.join(delta_filter)
        sql_where.append('(%s)' % delta_filter_sql)

    if sql_where:
        sql += ' WHERE %s ' % ' AND '.join(sql_where)

    if sql_groupby:
        sql += ' GROUP BY %s ' % sql_groupby

    if c.get_field_property('sort', field, True):
        sql += " ORDER BY %s ASC" % table_column

    # whether to query for distinct rows only or not; default, no
    if c.get_field_property('distinct', field, False):
        sql = re.sub('^SELECT', 'SELECT DISTINCT', sql)

    start = 0
    saved = 0
    _stop = False
    rows = []
    failed = []

    # FIXME: prefetch the next set of rows while importing to mongo
    logger.debug('... ... Starting SQL fetchall routine!')

    container = c.get_field_property('container', field)

    if touch:
        now = datetime.now(UTC)
        spec_mtime = {'cube': cube}
        update_mtime = {'$set': {field: {'mtime': now}}}

    while not _stop:
        rows = c._sql_fetchall(sql, start, field, row_limit)
        k = len(rows)
        if k > 0:
            logger.debug('... ... Starting Processor')
            grouped = c.grouper(rows)
            logger.debug('... ... Saving docs now!')
            t0 = time.time()
            _id_k = 0
            for _id in grouped.iterkeys():
                _id_k += 1
                for field in grouped[_id].iterkeys():
                    tokens = grouped[_id][field]
                    if not tokens:
                        tokens = None
                    elif container and type(tokens) is not list:
                        tokens = [tokens]
                    elif not container and type(tokens) is list:
                        if len(tokens) > 1:
                            raise TypeError(
                                "Tokens contains too many values (%s); "
                                "(set container=True?)" % (tokens))
                        else:
                            tokens = tokens[0]
                    try:
                        saved += save_doc(c.name, field, tokens, _id)
                    except Exception as e:
                        logger.error('Error saving (%s) %s: %s' %
                                     (tokens, _id, e))
                        saved = 0
                    if not saved:
                        failed.append(_id)
            t1 = time.time()
            logger.info('... ... Saved %i docs (%i/sec)' %
                        (k, k / (t1 - t0)))
        else:
            logger.debug('... ... No rows; nothing to process')
        if k < row_limit:
            _stop = True
        else:
            start += k
            # theoretically, k == row_limit
            if k != row_limit:
                logger.warn(
                    "rows count seems incorrect! "
                    "row_limit: %s, rows returned: %s" % (row_limit, k))

    result = {'saved': saved}
    if failed:
        result.update({'failed_ids': failed})
    else:
        if touch:
            # update the mtimestamp for when this field was last touched
            # to the moment we started updating
            c.c_etl_activity.update(spec_mtime, update_mtime, upsert=True)
    return result

def aggregate(cube, pipeline):
    logger.debug('Running Aggregation')
    logger.debug('Pipeline (%s): %s' % (type(pipeline), pipeline))
    c = get_cube(cube)
    _cube = c.get_collection()
    return _cube.aggregate(pipeline)

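# Illustrative usage, a sketch; stages follow mongodb's aggregation
# framework (field names hypothetical):
#
#   >>> aggregate('bugs', [
#   ...     {'$match': {'status': 'NEW'}},
#   ...     {'$group': {'_id': '$status', 'n': {'$sum': 1}}}])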