def _extract_func(cube, field, **kwargs):
    c = get_cube(cube)
    # id_x: if None, will become ObjectID()
    id_x = c.get_field_property('id_x', field)
    # raw_x: if None, will become field
    raw_x = c.get_field_property('raw_x', field, field)
    # convert: if None, the convert step is skipped
    convert = c.get_field_property('convert', field)
    # _type: will be the default if not set
    _type = c.get_field_property('type', field)
    saved = 0
    failed = []
    for item in c._reader:
        if not item:
            continue
        # id_x/raw_x may be callables or plain keys into the item
        try:
            id_ = id_x(item)
        except TypeError:
            id_ = item[id_x]
        try:
            raw = raw_x(item)
        except TypeError:
            raw = item[raw_x]
        tokens = type_cast(raw, _type)
        if convert:
            tokens = convert(tokens)
        # track each save individually, so one earlier success
        # doesn't mask a later failure
        _saved = save_doc(c.name, field, tokens, id_)
        if _saved:
            saved += _saved
        else:
            failed.append(id_)
    result = {'saved': saved}
    if failed:
        result.update({'failed_ids': failed})
    return result
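
# ---------------------------------------------------------------------------
# A minimal configuration sketch (an assumption, not from the original
# source) of the four field properties the reader-based extractor above
# consumes: 'id_x', 'raw_x', 'convert' and 'type'. The field name, mapping
# layout and sample values below are hypothetical, purely illustrative.
EXAMPLE_FIELD_PROPERTIES = {
    'summary': {
        # callable; the extractor falls back to item[id_x] on TypeError
        'id_x': lambda item: item['id'],
        # plain key: looked up as item['summary_raw']
        'raw_x': 'summary_raw',
        # optional hook applied after type_cast()
        'convert': lambda tokens: tokens.strip().lower(),
        # target type handed to type_cast()
        'type': unicode,
    },
}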

def _extract_func(cube, **kwargs):
    ''' SQL import method '''
    c = get_cube(cube)
    field = kwargs.get('field')
    if not field:
        raise ValueError("Field argument required")
    force = int(kwargs.get('force', 0))
    id_delta = kwargs.get('id_delta', None)
    if id_delta:
        if force:
            raise RuntimeError(
                "force and id_delta can't be used simultaneously")
        else:
            touch = False
    else:
        touch = True

    db = c.get_field_property('db', field)
    table = c.get_field_property('table', field)
    db_table = '%s.%s' % (db, table)
    column = c.get_field_property('column', field)
    table_column = '%s.%s' % (table, column)

    # max number of rows to return per call (i.e. LIMIT)
    row_limit = c.get_field_property('row_limit', field, c.row_limit)
    try:
        row_limit = int(row_limit)
    except (TypeError, ValueError):
        raise ValueError("row_limit must be a number")

    sql_where = []
    sql_groupby = ''
    _sql = c.get_field_property('sql', field)
    if not _sql:
        sql = 'SELECT %s, %s.%s FROM %s' % (
            table_column, table, field, db_table)
    else:
        sql = 'SELECT %s, %s FROM ' % (table_column, _sql[0])
        _from = [db_table]
        # FIXME: THIS IS UGLY! use a dict... or sqlalchemy
        if _sql[1]:
            _from.extend(_sql[1])
        sql += ', '.join(_from)
        sql += ' '
        if _sql[2]:
            sql += ' '.join(_sql[2])
        sql += ' '
        if _sql[3]:
            sql_where.append('(%s)' % ' OR '.join(_sql[3]))
        try:
            if _sql[4]:
                sql_groupby = _sql[4]
        except IndexError:
            pass

    delta_filter = []
    delta_filter_sql = None
    if force:
        # force a full update
        _delta = False
    else:
        _delta = c.get_field_property('delta', field, True)
    if _delta:
        # delta is enabled
        # the following deltas are mutually exclusive
        if id_delta:
            delta_sql = "(%s IN (%s))" % (table_column, id_delta)
            delta_filter.append(delta_sql)
        elif c.get_field_property('delta_new_ids', field):
            # if delta_new_ids is on, but there is no 'last_id',
            # then we need to do a FULL run...
            last_id = get_last_id(c.name, field)
            if last_id:
                # FIXME: any reason to ensure we know what the _id is
                # typecast as?
                try:
                    last_id = int(last_id)
                except (TypeError, ValueError):
                    pass
                if type(last_id) in [INT_TYPE, FLOAT_TYPE]:
                    last_id_sql = "%s > %s" % (table_column, last_id)
                else:
                    last_id_sql = "%s > '%s'" % (table_column, last_id)
                delta_filter.append(last_id_sql)

            mtime_columns = c.get_field_property('delta_mtime', field)
            if mtime_columns:
                if isinstance(mtime_columns, basestring):
                    mtime_columns = [mtime_columns]
                last_update_dt = last_known_warehouse_mtime(c.name, field)
                if last_update_dt:
                    last_update_dt = last_update_dt.strftime(
                        '%Y-%m-%d %H:%M:%S %z')
                    dt_format = "yyyy-MM-dd HH:mm:ss z"
                    for _column in mtime_columns:
                        mtime_sql = "%s > parseTimestamp('%s', '%s')" % (
                            _column, last_update_dt, dt_format)
                        delta_filter.append(mtime_sql)

    if delta_filter:
        delta_filter_sql = ' OR '.join(delta_filter)
        sql_where.append('(%s)' % delta_filter_sql)

    if sql_where:
        sql += ' WHERE %s ' % ' AND '.join(sql_where)

    if sql_groupby:
        sql += ' GROUP BY %s ' % sql_groupby

    if c.get_field_property('sort', field, True):
        sql += " ORDER BY %s ASC" % table_column

    # whether to query for distinct rows only or not; default, no
    if c.get_field_property('distinct', field, False):
        sql = re.sub('^SELECT', 'SELECT DISTINCT', sql)

    start = 0
    saved = 0
    _stop = False
    rows = []
    failed = []

    # FIXME: prefetch the next set of rows while importing to mongo
    logger.debug('... ... Starting SQL fetchall routine!')

    container = c.get_field_property('container', field)

    if touch:
        now = datetime.now(UTC)
        spec_mtime = {'cube': cube}
        update_mtime = {'$set': {field: {'mtime': now}}}

    while not _stop:
        rows = c._sql_fetchall(sql, start, field, row_limit)
        k = len(rows)
        if k > 0:
            logger.debug('... ... Starting Processor')
            grouped = c.grouper(rows)
            logger.debug('... ... Saving docs now!')
            t0 = time.time()
            _id_k = 0
            for _id in grouped.iterkeys():
                _id_k += 1
                # use a distinct name here; shadowing the outer `field`
                # would corrupt subsequent _sql_fetchall() calls
                for _field in grouped[_id].iterkeys():
                    tokens = grouped[_id][_field]
                    if not tokens:
                        tokens = None
                    elif container and type(tokens) is not list:
                        tokens = [tokens]
                    elif not container and type(tokens) is list:
                        if len(tokens) > 1:
                            raise TypeError(
                                "Tokens contains too many values (%s); "
                                "(set container=True?)" % (tokens))
                        else:
                            tokens = tokens[0]

                    # track each save individually, so one failure doesn't
                    # zero out the running total
                    try:
                        _saved = save_doc(c.name, _field, tokens, _id)
                    except Exception as e:
                        logger.error(
                            'Error saving (%s) %s: %s' % (tokens, _id, e))
                        _saved = 0
                    if _saved:
                        saved += _saved
                    else:
                        failed.append(_id)
            t1 = time.time()
            logger.info('... ... Saved %i docs (%i/sec)' % (
                k, k / (t1 - t0)))
        else:
            logger.debug('... ... No rows; nothing to process')

        if k < row_limit:
            _stop = True
        else:
            start += k
            if k != row_limit:  # theoretically, k == row_limit
                logger.warn(
                    "row count seems incorrect! "
                    "row_limit: %s, rows returned: %s" % (row_limit, k))

    result = {'saved': saved}
    if failed:
        result.update({'failed_ids': failed})
    elif touch:
        # update the mtime timestamp for when this field was last touched
        # to the moment we started updating
        c.c_etl_activity.update(spec_mtime, update_mtime, upsert=True)
    return result
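
# ---------------------------------------------------------------------------
# A hedged sketch (an assumption, not from the original source) of the
# positional 'sql' field property tuple the query builder above indexes as
# _sql[0].._sql[4]; the FIXME suggests replacing it with a dict or
# sqlalchemy. All table/column names below are made up.
EXAMPLE_SQL_PROPERTY = (
    'bugs.short_desc',                                   # [0] extra SELECT expression(s)
    ['bugsdb.profiles'],                                 # [1] additional FROM tables
    ['LEFT JOIN attachments ON bugs.id = attachments.bug_id'],  # [2] JOIN clauses
    ["bugs.status = 'OPEN'", "bugs.status = 'NEW'"],     # [3] OR'd WHERE conditions
    'bugs.id',                                           # [4] optional GROUP BY expression
)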
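
# ---------------------------------------------------------------------------
# Hypothetical invocation sketch (the cube name, field and id list are
# assumptions): how a caller might drive the SQL import method above.
# Guarded so it only runs when this module is executed directly.
if __name__ == '__main__':
    # full refresh of one field, ignoring any delta bookkeeping
    print _extract_func('example_cube', field='status', force=1)
    # targeted re-import of a few known ids (can't be combined with force)
    print _extract_func('example_cube', field='status', id_delta='1, 2, 3')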