Example #1
def get(self):
    # default to None so the cube-listing branch below is reachable
    cube = self.get_argument('cube', None)
    details = self.get_argument('details', None)
    if cube is None:
        # return a list of cubes
        return get_cubes()
    else:
        # return a list of fields in a cube
        if details:
            result = get_cube(cube).fields
        else:
            result = sorted(get_cube(cube).fields.keys())
        return result
Example #2
def save_doc(cube, field, tokens, id=None):
    '''
    All subclasses use this method to 'save' a document into the warehouse
    '''
    c = get_cube(cube)
    if field not in c.fields:
        raise ValueError("Invalid field (%s)" % field)

    container = c.get_field_property('container', field, False)

    if tokens and container:
        if type(tokens) is not list:
            raise TypeError("Tokens type must be list()")
        else:
            tokens = sorted(tokens)

    # normalize empty lists -> None
    if not tokens:
        tokens = None

    if id is None:
        id = ObjectId()

    now = datetime.now(UTC)
    spec_now = {'_id': id}
    update_now = {'$set': {field: tokens, '_mtime': now}}

    _cube = c.get_collection(admin=True)
    _cube.update(spec_now, update_now, upsert=True)

    return 1  # eg, one document added
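A minimal usage sketch (the cube and field names here are hypothetical, not part of the project):

# scalar field: stored as-is; id given explicitly
save_doc('bug', 'status', 'NEW', id=42)
# container field: must be a list, which save_doc() sorts before storing
save_doc('bug', 'cc', ['eve', 'adam'], id=42)
# id=None: a fresh ObjectId() is generated for the warehouse document
save_doc('bug', 'summary', 'some text')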
Example #3
def _activity_import(cube, ids):
    c = get_cube(cube)

    timeline = c.get_collection(timeline=True, admin=True)
    timeline.ensure_index([('id', 1), ('start', 1)])
    time_docs = timeline.find({'id': {'$in': ids}},
                              sort=[('id', 1), ('start', 1)])

    h = c.get_collection(timeline=False, admin=False,
                         cube='%s_activity' % c.name)
    act_docs = h.find({'id': {'$in': ids}}, sort=[('id', 1), ('when', -1)])
    act_docs_iter = iter(act_docs)

    act_id = -1
    last_doc_id = -1
    activities = []
    for time_doc in time_docs:
        tid = time_doc['id']
        # we want to update only the oldest version of the object
        if tid != last_doc_id:
            last_doc_id = tid
            while act_id <= tid:
                try:
                    act_doc = act_docs_iter.next()
                    act_id = act_doc['id']
                    activities.append(act_doc)
                except StopIteration:
                    break
            acts = [act for act in activities if act['id'] == tid]
            _activity_import_doc(c, time_doc, acts, timeline)
            activities = [act for act in activities if act['id'] > tid]
Example #4
def activity_import(cube, ids=None):
    logger.debug('Running activity history import')
    c = get_cube(cube)
    h = c.get_collection(timeline=False, admin=True,
                         cube='%s_activity' % c.name)
    h.ensure_index([('id', 1), ('when', -1)])
    if ids is None:
        # Run on all the ids
        t = c.get_collection(timeline=True, admin=True)
        docs = t.find({'current': True}, fields=['id'])
        logger.debug('Found %s docs' % docs.count())

        ids = []
        done = -1  # guard: 'done' is referenced after the loop even if the cursor is empty
        for done, doc in enumerate(docs):
            ids.append(doc['id'])
            if done % 100000 == 0:
                _activity_import(cube, ids)
                ids = []
                logger.debug(' ... %s done' % done)
        _activity_import(cube, ids)
        logger.debug(' ... %s done' % (done + 1))
    elif type(ids) is list:
        _activity_import(cube, ids)
    elif isinstance(ids, basestring):
        ids = map(int, ids.split(','))
        _activity_import(cube, ids)
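The ids argument is accepted in three forms, as the branches above show; a quick sketch with a hypothetical cube name:

activity_import('bug')             # ids=None: walk all current timeline docs in batches
activity_import('bug', [1, 2, 3])  # explicit list of object ids
activity_import('bug', '1,2,3')    # comma-separated string, cast to ints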
Example #5
def last_known_warehouse_mtime(cube, field=None, value=None):
    '''get the last known warehouse object mtime'''
    c = get_cube(cube)
    _cube = c.get_collection()

    start = None
    if field:
        # we need to check the etl_activity collection
        if value:
            spec = {'cube': cube, field: value}
            doc = c.c_etl_activity.find_one(spec, ['%s.mtime' % field])
        else:
            spec = {'cube': cube, field: {'$exists': True}}
            doc = _cube.find_one(spec, ['%s._mtime' % field])
        if doc:
            start = doc[field]['mtime']
    else:
        # get the most recent _mtime of all objects in the cube
        mtime = '_mtime'
        spec = {}
        doc = _cube.find_one(spec, [mtime], sort=[(mtime, -1)])
        if doc:
            start = doc[mtime]

    logger.debug('... Last field mtime: %s' % start)
    return start
Example #6
def save_objects(cube, objs):
    if not objs:
        raise ValueError("Empty objects list")
    c = get_cube(cube)
    expected_fields = set(c.fields.keys())
    expected_fields.add('_id')  # we always expect the _id to be defined as well
    _cube = c.get_collection(admin=True)

    if not type(objs) in [list, tuple]:
        raise TypeError("Expected list or tuple, got type(%s)" % type(objs))

    now = datetime.now(UTC)
    for x, obj in enumerate(objs):
        if not obj:
            raise ValueError("Empty object")
        elif not isinstance(obj, dict):
            raise TypeError(
                "Expected objects as dict, got type(%s)" % type(obj))

        obj_fields = set(obj.keys())
        if not obj_fields <= expected_fields:
            raise ValueError(
                "Object includes unexpected fields.\n"
                "Unexpected: %s" % (obj_fields - expected_fields))
        else:
            objs[x].update({'_mtime': now})

    _cube.insert(objs, manipulate=False)
    return len(objs)
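A usage sketch, assuming 'status' and 'owner' are fields declared on a hypothetical 'bug' cube:

objs = [
    {'_id': 1, 'status': 'NEW', 'owner': 'adam'},
    {'_id': 2, 'status': 'CLOSED'},
]
# every object is stamped with '_mtime' and bulk-inserted; returns len(objs)
saved = save_objects('bug', objs)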
Example #7
def _snapshot(cube, ids):
    c = get_cube(cube)
    w = c.get_collection(admin=False, timeline=False)
    t = c.get_collection(admin=True, timeline=True)

    docs = w.find({'_id': {'$in': ids}}, sort=[('_id', 1)])

    logger.debug('Snapshot Timeline Index: Start')
    t.ensure_index([('current', 1), ('id', 1)])
    logger.debug('... Snapshot Timeline Index: Done')

    time_docs = t.find({'current': True, 'id': {'$in': ids}},
                       sort=[('id', 1)])
    time_docs_iter = iter(time_docs)
    tid = -1

    batch_insert = []
    for doc in docs:
        _id = doc.pop('_id')
        _mtime = doc.pop('_mtime')

        # time_doc will contain first doc that has id >= _id,
        # it might be a document where id > _id
        while tid < _id:
            try:
                time_doc = time_docs_iter.next()
                tid = time_doc['id']
            except StopIteration:
                break

        store_new_doc = False
        if _id == tid:
            if doc != time_doc['fields']:
                store_new_doc = True
                spec_now = {'_id': time_doc['_id']}
                # a single dict cannot carry two '$set' keys; combine both updates
                update_now = {'$set': {'current': False, 'end': _mtime}}
                t.update(spec_now, update_now, upsert=True)
        else:
            store_new_doc = True

        if store_new_doc:
            new_doc = {'fields': doc,
                       'id': _id,
                       'start': _mtime,
                       'end': None,
                       'current': True}
            batch_insert.append(new_doc)
        if len(batch_insert) > 1000:
            t.insert(batch_insert)
            batch_insert = []
    if len(batch_insert) > 0:
        t.insert(batch_insert)
Example #8
def get_last_id(cube, field):
    '''
    Return the most recently inserted _id in the warehouse, limited to
    documents where the given field exists.
    '''
    c = get_cube(cube)
    spec = {field: {'$exists': True}}
    _cube = c.get_collection()
    logger.debug(" ... %s.%s._get_last_id spec: %s" % (
        _cube.db.name, _cube.collection, spec))
    last_id = _cube.find_one(spec, {'_id': 1}, sort=[('_id', -1)])
    if last_id:
        value = last_id['_id']
    else:
        value = None
    logger.debug(" ... ... Last ID: %s" % last_id)
    return value
Example #9
def find(cube, query, fields=None, date=None, most_recent=True):
    logger.debug('Running Find')
    if date is not None:
        # we will be doing a timeline query so we need to rename the fields
        # WARNING: might not work if some field is a substring of other field
        all_fields = get_fields(cube, '__all__')
        for f in all_fields:
            query = re.sub(f, 'fields.%s' % f, query)
        # add the date constraint
        query = query + ' and ' + _get_date_pql_string(date)
    pql_parser = pql.SchemaFreeParser()
    try:
        # FIXME: make it a schema aware parser
        spec = pql_parser.parse(query)
    except Exception as e:
        raise ValueError("Invalid Query (%s)" % str(e))

    c = get_cube(cube)
    _cube = c.get_collection(timeline=(date is not None))

    logger.debug('Query: %s' % spec)

    fields = get_fields(cube, fields)

    if date is not None:
        project_d = dict([(f, '$fields.%s' % f) for f in fields])
        project_d.update(dict(_id='$id', _start='$start', _end='$end'))
        if most_recent:
            docs = _cube.aggregate([{'$match': spec},
                                    {'$sort': {'start': -1}},
                                    {'$group': {'_id': '$id',
                                                'fields': {'$first':
                                                           '$fields'},
                                                'start': {'$first': '$start'},
                                                'end':  {'$first': '$end'},
                                                'id': {'$first': '$id'}}},
                                    {'$project': project_d}])
        else:
            docs = _cube.aggregate([{'$match': spec},
                                    {'$project': project_d}])
        docs = docs['result']
    else:
        docs = _cube.find(spec, fields)
        docs.batch_size(10000000)  # hard limit is 16M...
    docs = [d for d in docs]
    return docs
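A usage sketch; queries are pql strings (Python-expression syntax), and the cube, field names, fields format and date format below are assumptions for illustration:

# warehouse query: a plain find() against the current documents
docs = find('bug', "status == 'NEW'", fields='status, owner')
# timeline query: field names are rewritten to 'fields.<name>', a date
# constraint is appended, and an aggregation keeps only the most recent
# version of each matching object
docs = find('bug', "status == 'NEW'", fields='status', date='2013-01-01')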
Example #10
def index_warehouse(cube, fields, force=False):
    '''
    NOTE: _id key index is generated automatically by mongo
    '''
    c = get_cube(cube)
    _cube = c.get_collection(admin=True)

    fields = get_fields(cube, fields)
    result = {}
    for field in fields:
        name = '%s-tokens' % field
        if force or c.get_field_property('index', field):
            logger.info(' %s... Indexing Warehouse (%s)%s' %
                        (YELLOW, field, ENDC))
            key = [(field, -1)]
            result[field] = _cube.ensure_index(key, name=name)
        else:
            result[field] = -1
    return result
Example #11
def count(cube, query):
    logger.debug('Running Count')
    pql_parser = pql.SchemaFreeParser()
    try:
        # FIXME: make it a schema aware parser
        spec = pql_parser.parse(query)
    except Exception as e:
        raise ValueError("Invalid Query (%s)" % str(e))

    c = get_cube(cube)
    _cube = c.get_collection()

    logger.debug('Query: %s' % spec)

    docs = _cube.find(spec)
    if docs:
        return docs.count()
    else:
        return 0
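Usage mirrors find(); a sketch with a hypothetical cube and pql query:

n = count('bug', "status == 'NEW' and owner == 'adam'")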
Example #12
def _extract_func(cube, field, **kwargs):
    c = get_cube(cube)
    # id_x if None will become ObjectID()
    id_x = c.get_field_property('id_x', field)
    # raw_x if None will become field
    raw_x = c.get_field_property('raw_x', field, field)
    # convert if None will skip convert step
    convert = c.get_field_property('convert', field)
    # _type will be default if not set
    _type = c.get_field_property('type', field)

    saved = 0
    failed = []
    for item in c._reader:
        if not item:
            continue

        try:
            id_ = id_x(item)
        except TypeError:
            id_ = item[id_x]

        try:
            raw = raw_x(item)
        except TypeError:
            raw = item[raw_x]

        tokens = type_cast(raw, _type)
        if convert:
            tokens = convert(tokens)

        saved += save_doc(c.name, field, tokens, id_)
        if not saved:
            failed.append(id_)

    result = {'saved': saved}
    if failed:
        result.update({'failed_ids': failed})
    return result
Example #13
def snapshot(cube, ids=None):
    logger.debug('Running snapshot')
    if ids is None:
        # Run on all the ids
        c = get_cube(cube)
        w = c.get_collection(admin=False, timeline=False)
        docs = w.find(fields=['_id'])
        logger.debug('Found %s docs' % docs.count())

        ids_to_snapshot = []
        for done, doc in enumerate(docs):
            ids_to_snapshot.append(doc['_id'])
            if done % 100000 == 0:
                _snapshot(cube, ids_to_snapshot)
                ids_to_snapshot = []
                logger.debug(' ... %s done' % done)
        _snapshot(cube, ids_to_snapshot)
    elif type(ids) is list:
        _snapshot(cube, ids)
    elif isinstance(ids, basestring):
        ids = map(int, ids.split(','))
        _snapshot(cube, ids)
Example #14
def get_tokens(cube, qspec, return_field=None):
    '''
    shortcut for finding fields tokens;
    return a list of tokens which map to raw_pattern and
    return_field/compare_field
    '''
    c = get_cube(cube)
    _cube = c.get_collection()
    if return_field is None:
        return_field = '_id'
        spec = {}
    else:
        spec = {return_field: {'$exists': True}}

    for compare_field, raw_pattern in qspec.iteritems():
        spec.update({compare_field: raw_pattern})

    rf = {return_field: 1}

    docs = _cube.find(spec, rf, manipulate=False)
    docs.batch_size(10000000)  # hard limit is 16M...

    _tokens = []
    if docs:
        for doc in docs:
            tokens = doc.get(return_field)
            if not tokens:
                continue
            elif type(tokens) is list:
                _tokens.extend(tokens)
            else:
                _tokens.append(tokens)

    if not _tokens:
        _tokens = None
    elif len(_tokens) == 1:
        _tokens = _tokens[0]

    return _tokens
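A usage sketch (cube and field names hypothetical): qspec maps compare fields to raw patterns, and return_field picks what comes back.

# _ids of all docs whose 'status' equals 'NEW'
ids = get_tokens('bug', {'status': 'NEW'})
# 'owner' tokens of the same docs; one hit comes back as a scalar,
# several as a flat list, none as None
owners = get_tokens('bug', {'status': 'NEW'}, return_field='owner')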
Example #15
def extract(cube, **kwargs):
    logger.info(' Starting Update operation!')
    logger.info(' %sCube: %s%s' % (YELLOW, cube, ENDC))
    c = get_cube(cube)

    logger.debug('%sExtract - Start%s' % (YELLOW, ENDC))

    _fields = kwargs.get('fields')
    fields = get_fields(cube, _fields)

    if fields:
        result = {}
        for field in fields:
            kwargs['field'] = field
            logger.debug('%sField: %s%s' % (YELLOW, field, ENDC))
            result[field] = c.extract_func(**kwargs)
            logger.info('Extract - Complete: (%s.%s): %s' %
                        (cube, field, result[field]))
    else:
        result = c.extract_func(**kwargs)
        logger.info('Extract - Complete: (%s): %s' % (cube, result))

    return result
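A usage sketch with a hypothetical cube; the fields format is whatever get_fields() accepts (assumed here to be a comma-separated string):

extract('bug')                          # run extract_func for every field
extract('bug', fields='status, owner')  # or only for selected fields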
Example #16
def fetch(cube, fields, skip=0, limit=0, ids=None):
    logger.debug('Running Fetch (skip:%s, limit:%s, ids:%s)' % (
        skip, limit, len(ids) if ids else 0))
    logger.debug('... Fields: %s' % fields)

    c = get_cube(cube)
    _cube = c.get_collection()

    fields = get_fields(cube, fields)
    logger.debug('Return Fields: %s' % fields)

    sort = [('_id', 1)]

    if ids:
        spec = {'_id': {'$in': parse_ids(ids)}}
    else:
        spec = {}

    docs = _cube.find(spec, fields,
                      skip=skip, limit=limit,
                      sort=sort)
    docs.batch_size(10000000)  # hard limit is 16M...
    return [d for d in docs]
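A usage sketch (cube/field names hypothetical; the ids format is whatever parse_ids() accepts, assumed here to be a comma-separated string):

docs = fetch('bug', fields='status, owner', skip=0, limit=100)  # page through the warehouse
docs = fetch('bug', fields='__all__', ids='1,2,3')              # fetch specific _ids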
Example #17
def _extract_func(cube, **kwargs):
    '''
    SQL import method
    '''
    c = get_cube(cube)
    field = kwargs.get('field')
    if not field:
        raise ValueError("Field argument required")
    force = int(kwargs.get('force', 0))
    id_delta = kwargs.get('id_delta', None)

    if id_delta:
        if force:
            raise RuntimeError(
                "force and id_delta can't be used simultaneously")
        else:
            touch = False
    else:
        touch = True

    db = c.get_field_property('db', field)
    table = c.get_field_property('table', field)
    db_table = '%s.%s' % (db, table)
    column = c.get_field_property('column', field)
    table_column = '%s.%s' % (table, column)

    # max number of rows to return per call (ie, LIMIT)
    row_limit = c.get_field_property('row_limit', field, c.row_limit)
    try:
        row_limit = int(row_limit)
    except (TypeError, ValueError):
        raise ValueError("row_limit must be a number")

    sql_where = []
    sql_groupby = ''
    _sql = c.get_field_property('sql', field)
    if not _sql:
        sql = 'SELECT %s, %s.%s FROM %s' % (table_column, table, field,
                                            db_table)
    else:
        sql = 'SELECT %s, %s FROM ' % (table_column, _sql[0])
        _from = [db_table]

        # FIXME: THIS IS UGLY! use a dict... or sqlalchemy
        if _sql[1]:
            _from.extend(_sql[1])
        sql += ', '.join(_from)
        sql += ' '

        if _sql[2]:
            sql += ' '.join(_sql[2])
        sql += ' '

        if _sql[3]:
            sql_where.append('(%s)' % ' OR '.join(_sql[3]))

        try:
            if _sql[4]:
                sql_groupby = _sql[4]
        except IndexError:
            pass

    delta_filter = []
    delta_filter_sql = None

    # force full update
    if force:
        _delta = False
    else:
        _delta = c.get_field_property('delta', field, True)

    if _delta:
        # delta is enabled
        # the following deltas are mutually exclusive
        if id_delta:
            delta_sql = "(%s IN (%s))" % (table_column, id_delta)
            delta_filter.append(delta_sql)
        elif c.get_field_property('delta_new_ids', field):
            # if delta_new_ids is on, but there is no 'last_id',
            # then we need to do a FULL run...
            last_id = get_last_id(c.name, field)
            if last_id:
                # FIXME: any reason to ensure we know what the _id is typecasted as?
                try:
                    last_id = int(last_id)
                except (TypeError, ValueError):
                    pass

                if type(last_id) in [INT_TYPE, FLOAT_TYPE]:
                    last_id_sql = "%s > %s" % (table_column, last_id)
                else:
                    last_id_sql = "%s > '%s'" % (table_column, last_id)
                delta_filter.append(last_id_sql)

            mtime_columns = c.get_field_property('delta_mtime', field)
            if mtime_columns:
                if isinstance(mtime_columns, basestring):
                    mtime_columns = [mtime_columns]
                last_update_dt = last_known_warehouse_mtime(c.name, field)
                if last_update_dt:
                    last_update_dt = last_update_dt.strftime(
                        '%Y-%m-%d %H:%M:%S %z')
                    dt_format = "yyyy-MM-dd HH:mm:ss z"
                    for _column in mtime_columns:
                        _sql = "%s > parseTimestamp('%s', '%s')" % (
                            _column, last_update_dt, dt_format)
                        delta_filter.append(_sql)

    if delta_filter:
        delta_filter_sql = ' OR '.join(delta_filter)
        sql_where.append('(%s)' % delta_filter_sql)

    if sql_where:
        sql += ' WHERE %s ' % ' AND '.join(sql_where)

    if sql_groupby:
        sql += ' GROUP BY %s ' % sql_groupby

    if c.get_field_property('sort', field, True):
        sql += " ORDER BY %s ASC" % table_column

    # whether to query for distinct rows only or not; default, no
    if c.get_field_property('distinct', field, False):
        sql = re.sub('^SELECT', 'SELECT DISTINCT', sql)

    start = 0
    saved = 0
    _stop = False
    rows = []
    failed = []

    # FIXME: prefetch the next set of rows while importing to mongo
    logger.debug('... ... Starting SQL fetchall routine!')

    container = c.get_field_property('container', field)

    if touch:
        now = datetime.now(UTC)
        spec_mtime = {'cube': cube}
        update_mtime = {'$set': {field: {'mtime': now}}}

    while not _stop:
        rows = c._sql_fetchall(sql, start, field, row_limit)
        k = len(rows)
        if k > 0:
            logger.debug('... ... Starting Processor')
            grouped = c.grouper(rows)
            logger.debug('... ... Saving docs now!')
            t0 = time.time()
            _id_k = 0
            for _id in grouped.iterkeys():
                _id_k += 1
                for field in grouped[_id].iterkeys():
                    tokens = grouped[_id][field]
                    if not tokens:
                        tokens = None
                    elif container and type(tokens) is not list:
                        tokens = [tokens]
                    elif not container and type(tokens) is list:
                        if len(tokens) > 1:
                            raise TypeError(
                                "Tokens contains too many values (%s); "
                                "(set container=True?)" % (tokens))
                        else:
                            tokens = tokens[0]

                    try:
                        saved += save_doc(c.name, field, tokens, _id)
                    except Exception as e:
                        logger.error('Error saving (%s) %s: %s' %
                                     (tokens, _id, e))
                        saved = 0
                    if not saved:
                        failed.append(_id)
            t1 = time.time()
            logger.info('... ... Saved %i docs (%i/sec)' % (k, k / (t1 - t0)))
        else:
            logger.debug('... ... No rows; nothing to process')

        if k < row_limit:
            _stop = True
        else:
            start += k
            if k != row_limit:  # theoretically, k == row_limit
                logger.warn(
                    "rows count seems incorrect! row_limit: %s, row returned: %s"
                    % (row_limit, k))

    result = {'saved': saved}
    if failed:
        result.update({'failed_ids': failed})
    else:
        if touch:
            # update the mtimestamp for when this field was last touched
            # to the moment we started updating
            c.c_etl_activity.update(spec_mtime, update_mtime, upsert=True)
    return result
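The 'sql' field property is consumed purely by position above; based on that indexing it appears to be a tuple shaped roughly like the sketch below (the layout and sample values are guesses, not the project's documented format):

sql_property = ('other.value',                   # _sql[0]: extra SELECT expression
                ['db.other_table'],              # _sql[1]: additional FROM tables
                ['LEFT JOIN x ON x.id = t.id'],  # _sql[2]: join/other clauses
                ['x.deleted = 0'],               # _sql[3]: OR-joined WHERE fragments
                'db.table.column')               # _sql[4]: optional GROUP BY (may be absent)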
Example #18
def _extract_func(cube, **kwargs):
    '''
    SQL import method
    '''
    c = get_cube(cube)
    field = kwargs.get('field')
    if not field:
        raise ValueError("Field argument required")
    force = int(kwargs.get('force', 0))
    id_delta = kwargs.get('id_delta', None)

    if id_delta:
        if force:
            raise RuntimeError("force and id_delta can't be used simultaneously")
        else:
            touch = False
    else:
        touch = True

    db = c.get_field_property('db', field)
    table = c.get_field_property('table', field)
    db_table = '%s.%s' % (db, table)
    column = c.get_field_property('column', field)
    table_column = '%s.%s' % (table, column)

    # max number of rows to return per call (ie, LIMIT)
    row_limit = c.get_field_property('row_limit', field, c.row_limit)
    try:
        row_limit = int(row_limit)
    except (TypeError, ValueError):
        raise ValueError("row_limit must be a number")

    sql_where = []
    sql_groupby = ''
    _sql = c.get_field_property('sql', field)
    if not _sql:
        sql = 'SELECT %s, %s.%s FROM %s' % (
            table_column, table, field, db_table)
    else:
        sql = 'SELECT %s, %s FROM ' % (table_column, _sql[0])
        _from = [db_table]

        # FIXME: THIS IS UGLY! use a dict... or sqlalchemy
        if _sql[1]:
            _from.extend(_sql[1])
        sql += ', '.join(_from)
        sql += ' '

        if _sql[2]:
            sql += ' '.join(_sql[2])
        sql += ' '

        if _sql[3]:
            sql_where.append('(%s)' % ' OR '.join(_sql[3]))

        try:
            if _sql[4]:
                sql_groupby = _sql[4]
        except IndexError:
            pass

    delta_filter = []
    delta_filter_sql = None

    # force full update
    if force:
        _delta = False
    else:
        _delta = c.get_field_property('delta', field, True)

    if _delta:
        # delta is enabled
        # the following deltas are mutually exclusive
        if id_delta:
            delta_sql = "(%s IN (%s))" % (table_column, id_delta)
            delta_filter.append(delta_sql)
        elif c.get_field_property('delta_new_ids', field):
            # if delta_new_ids is on, but there is no 'last_id',
            # then we need to do a FULL run...
            last_id = get_last_id(c.name, field)
            if last_id:
                # FIXME: any reason to ensure we know what the _id is typecasted as?
                try:
                    last_id = int(last_id)
                except (TypeError, ValueError):
                    pass

                if type(last_id) in [INT_TYPE, FLOAT_TYPE]:
                    last_id_sql = "%s > %s" % (table_column, last_id)
                else:
                    last_id_sql = "%s > '%s'" % (table_column, last_id)
                delta_filter.append(last_id_sql)

            mtime_columns = c.get_field_property('delta_mtime', field)
            if mtime_columns:
                if isinstance(mtime_columns, basestring):
                    mtime_columns = [mtime_columns]
                last_update_dt = last_known_warehouse_mtime(c.name, field)
                if last_update_dt:
                    last_update_dt = last_update_dt.strftime('%Y-%m-%d %H:%M:%S %z')
                    dt_format = "yyyy-MM-dd HH:mm:ss z"
                    for _column in mtime_columns:
                        _sql = "%s > parseTimestamp('%s', '%s')" % (
                            _column, last_update_dt, dt_format)
                        delta_filter.append(_sql)

    if delta_filter:
        delta_filter_sql = ' OR '.join(delta_filter)
        sql_where.append('(%s)' % delta_filter_sql)

    if sql_where:
        sql += ' WHERE %s ' % ' AND '.join(sql_where)

    if sql_groupby:
        sql += ' GROUP BY %s ' % sql_groupby

    if c.get_field_property('sort', field, True):
        sql += " ORDER BY %s ASC" % table_column

    # whether to query for distinct rows only or not; default, no
    if c.get_field_property('distinct', field, False):
        sql = re.sub('^SELECT', 'SELECT DISTINCT', sql)

    start = 0
    saved = 0
    _stop = False
    rows = []
    failed = []

    # FIXME: prefetch the next set of rows while importing to mongo
    logger.debug('... ... Starting SQL fetchall routine!')

    container = c.get_field_property('container', field)

    if touch:
        now = datetime.now(UTC)
        spec_mtime = {'cube': cube}
        update_mtime = {'$set': {field: {'mtime': now}}}

    while not _stop:
        rows = c._sql_fetchall(sql, start, field, row_limit)
        k = len(rows)
        if k > 0:
            logger.debug('... ... Starting Processor')
            grouped = c.grouper(rows)
            logger.debug('... ... Saving docs now!')
            t0 = time.time()
            _id_k = 0
            for _id in grouped.iterkeys():
                _id_k += 1
                for field in grouped[_id].iterkeys():
                    tokens = grouped[_id][field]
                    if not tokens:
                        tokens = None
                    elif container and type(tokens) is not list:
                        tokens = [tokens]
                    elif not container and type(tokens) is list:
                        if len(tokens) > 1:
                            raise TypeError(
                                "Tokens contains too many values (%s); "
                                "(set container=True?)" % (tokens))
                        else:
                            tokens = tokens[0]

                    try:
                        saved += save_doc(c.name, field, tokens, _id)
                    except Exception as e:
                        logger.error(
                            'Error saving (%s) %s: %s' % (tokens, _id, e))
                        saved = 0
                    if not saved:
                        failed.append(_id)
            t1 = time.time()
            logger.info('... ... Saved %i docs (%i/sec)' % (
                k, k / (t1 - t0)))
        else:
            logger.debug('... ... No rows; nothing to process')

        if k < row_limit:
            _stop = True
        else:
            start += k
            if k != row_limit:  # theoretically, k == row_limit
                logger.warn(
                    "rows count seems incorrect! row_limit: %s, row returned: %s" % (
                        row_limit, k))

    result = {'saved': saved}
    if failed:
        result.update({'failed_ids': failed})
    else:
        if touch:
            # update the mtimestamp for when this field was last touched
            # to the moment we started updating
            c.c_etl_activity.update(spec_mtime, update_mtime, upsert=True)
    return result
Example #19
def aggregate(cube, pipeline):
    logger.debug('Running Aggregation')
    logger.debug('Pipeline (%s): %s' % (type(pipeline), pipeline))
    c = get_cube(cube)
    _cube = c.get_collection()
    return _cube.aggregate(pipeline)
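A usage sketch with a hypothetical cube and field; the pipeline is passed straight to mongo's aggregation framework:

pipeline = [{'$match': {'status': {'$exists': True}}},
            {'$group': {'_id': '$status', 'k': {'$sum': 1}}}]
result = aggregate('bug', pipeline)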