def get_all_signatures(db_config, sql):
    """
    RETURN ALL SIGNATURES FROM PERFHERDER DATABASE
    """
    db = MySQL(db_config)
    with db:
        return db.query(text(sql))
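# A minimal usage sketch (not from the original source): it assumes a Data-style
# db_config with host/port/username/password/schema, and the query and table names
# below are hypothetical placeholders for a Perfherder-shaped database.
def example_get_all_signatures():
    db_config = Data(
        host="localhost",
        port=3306,
        username="reader",
        password="secret",
        schema="treeherder"
    )
    sql = "SELECT id, signature_hash FROM performance_signature LIMIT 10"  # ILLUSTRATIVE QUERY
    return get_all_signatures(db_config, sql)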
def setUpClass(cls):
    Log.start(settings.debug)
    with Timer("setup database"):
        try:
            with MySQL(schema=None, kwargs=settings.database) as db:
                db.query("drop database testing")
        except Exception as e:
            if "Can't drop database " in e:
                pass
            else:
                Log.warning("problem removing db", cause=e)
        MySQL.execute_file("tests/resources/database.sql", schema=None, kwargs=settings.database)
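# A sketch of the shape of `settings.database` that setUpClass appears to expect
# (all values are placeholders, not from the original source): the connection is made
# with schema=None so the "testing" schema can be dropped and rebuilt from database.sql.
example_settings = wrap({
    "debug": {"trace": True},
    "database": {
        "host": "localhost",
        "port": 3306,
        "username": "user",
        "password": "password",
        "schema": "testing"
    }
})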
def __init__(self, kwargs=None):
    self.settings = kwargs
    self.settings.exclude = set(self.settings.exclude)
    self.settings.show_foreign_keys = coalesce(self.settings.show_foreign_keys, True)
    self.all_nested_paths = None
    self.nested_path_to_join = None
    self.columns = None

    with Explanation("scan database", debug=DEBUG):
        self.db = MySQL(**kwargs.database)
        with self.db:
            with self.db.transaction():
                self._scan_database()
def test_lean_inline_all(self):
    config = set_default(
        {
            "extract": {"ids": "select * from fact_table"},
            "snowflake": {
                "show_foreign_keys": False,
                "reference_only": ["inner1.value", "inner2.value"]
            }
        },
        config_template
    )
    db = MySQL(**config.snowflake.database)
    data = [10, 11, 12, 13, 15, 16, 17, 18, 19, 20, 21, 22]
    Extract(kwargs=config).extract(db=db, start_point=Null, first_value=Null, data=data, please_stop=Null)

    result = File(filename).read_json()
    for r in result:
        r.etl = None
    expected = expected_results["lean_inline_all"]
    # CHECK BOTH DIRECTIONS, SO A SUBSET-STYLE COMPARISON IMPLIES FULL EQUALITY
    self.assertEqual(result, expected, "expecting identical")
    self.assertEqual(expected, result, "expecting identical")
def extract(please_stop):
    with MySQL(**settings.snowflake.database) as db:
        with db.transaction():
            for kwargs in extractor.queue:
                if please_stop:
                    break
                try:
                    extractor.extract(db=db, please_stop=please_stop, **kwargs)
                except Exception as e:
                    Log.warning("Could not extract", cause=e)
                    extractor.queue.add(kwargs)
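# A hedged sketch of how this worker might be launched with mo-threads' Thread.run,
# which injects the please_stop Signal into the target; the thread names, the worker
# count, and the join-at-the-end pattern are assumptions, not the project's own driver.
def example_run_extract_workers(num_threads):
    threads = [
        Thread.run("extract worker " + text_type(i), extract)
        for i in range(num_threads)
    ]
    for t in threads:
        # WAIT FOR EACH WORKER TO DRAIN extractor.queue AND EXIT
        t.join()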
def __init__(self, host, port, username, password, debug=False, schema=None, preamble=None, readonly=False, kwargs=None):
    from pyLibrary.sql.mysql import MySQL

    self.settings = kwargs
    self._db = MySQL(kwargs)
def test_complex(self):
    config = config_template
    db = MySQL(**config.snowflake.database)
    Extract(kwargs=config).extract(db=db, start_point=Null, first_value=Null, data=[10], please_stop=Null)

    result = File(filename).read_json()
    result[0].etl = None
    expected = expected_results["complex"]
    self.assertEqual(result, expected, "expecting identical")
    self.assertEqual(expected, result, "expecting identical")
def test_lean(self):
    config = set_default({"snowflake": {"show_foreign_keys": False}}, config_template)
    db = MySQL(**config.snowflake.database)
    Extract(kwargs=config).extract(db=db, start_point=Null, first_value=Null, data=[10], please_stop=Null)

    result = File(filename).read_json()
    result[0].etl = None
    expected = expected_results["lean"]
    self.assertEqual(result, expected, "expecting identical")
    self.assertEqual(expected, result, "expecting identical")
def test_inline(self):
    config = set_default(
        {"snowflake": {"reference_only": ["inner1.value", "inner2.value"]}},
        config_template
    )
    db = MySQL(**config.snowflake.database)
    Extract(kwargs=config).extract(db=db, start_point=Null, first_value=Null, data=[10], please_stop=Null)

    result = File(filename).read_json()
    result[0].etl = None
    expected = expected_results["inline"]
    self.assertEqual(result, expected, "expecting identical")
    self.assertEqual(expected, result, "expecting identical")
def get_dataum(db_config, signature_id):
    db = MySQL(db_config)
    with db:
        return db.query(expand_template(datum_sql, quote_list(listwrap(signature_id))))
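# An illustrative call (not from the original source): listwrap() lets the function
# accept a single signature id or a list of ids, which quote_list() then substitutes
# into the datum_sql template. The ids shown are placeholders.
def example_get_dataum(db_config):
    single = get_dataum(db_config, 42)
    several = get_dataum(db_config, [42, 43, 44])
    return single, several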
def __init__(self, kwargs=None):
    self.settings = kwargs
    self.schema = SnowflakeSchema(self.settings.snowflake)
    self._extract = extract = kwargs.extract

    # SOME PREP
    get_git_revision()

    # VERIFY WE DO NOT HAVE TOO MANY OTHER PROCESSES WORKING ON STUFF
    with MySQL(**kwargs.snowflake.database) as db:
        processes = None
        try:
            processes = jx.filter(
                db.query("show processlist"),
                {"and": [
                    {"neq": {"Command": "Sleep"}},
                    {"neq": {"Info": "show processlist"}}
                ]}
            )
        except Exception as e:
            Log.warning("no database", cause=e)

        if processes:
            if DEBUG:
                Log.warning("Processes are running\n{{list|json}}", list=processes)
            else:
                Log.error("Processes are running\n{{list|json}}", list=processes)

    extract.type = listwrap(extract.type)
    extract.start = listwrap(extract.start)
    extract.batch = listwrap(extract.batch)
    extract.field = listwrap(extract.field)
    if any(len(extract.type) != len(other) for other in [extract.start, extract.batch, extract.field]):
        Log.error("Expecting same number of dimensions for `type`, `start`, `batch`, and `field` in the `extract` inner object")
    for i, t in enumerate(extract.type):
        if t == "time":
            extract.start[i] = Date(extract.start[i])
            extract.batch[i] = Duration(extract.batch[i])
        elif t == "number":
            pass
        else:
            Log.error('Expecting `extract.type` to be "number" or "time"')

    extract.threads = coalesce(extract.threads, 1)
    self.done_pulling = Signal()
    self.queue = Queue("all batches", max=2 * coalesce(extract.threads, 1), silent=True)

    self.bucket = s3.Bucket(self.settings.destination)
    self.notify = aws.Queue(self.settings.notify)
    Thread.run("get records", self.pull_all_remaining)
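# A hypothetical `extract` configuration block showing why the constructor insists
# that `type`, `start`, `batch`, and `field` have the same number of entries: each
# position describes one dimension of the ETL key (here a time dimension and a
# numeric id dimension). The field names and values are illustrative only.
example_extract_config = wrap({
    "extract": {
        "last": "output/last_run.json",
        "type": ["time", "number"],
        "start": ["1jan2015", 0],
        "batch": ["day", 1000],
        "field": ["push_date", "id"],
        "threads": 2
    }
})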
def pull_all_remaining(self, please_stop):
    try:
        try:
            content = File(self.settings.extract.last).read_json()
            if len(content) == 1:
                Log.note("Got a manually generated file {{filename}}", filename=self.settings.extract.last)
                start_point = tuple(content[0])
                first_value = [self._extract.start[0] + (start_point[0] * DAY), start_point[1]]
            else:
                Log.note("Got a machine generated file {{filename}}", filename=self.settings.extract.last)
                start_point, first_value = content
                start_point = tuple(start_point)
            Log.note("First value is {{start1|date}}, {{start2}}", start1=first_value[0], start2=first_value[1])
        except Exception as _:
            Log.error(
                "Expecting a file {{filename}} with the last good S3 bucket etl id in array form eg: [[954, 0]]",
                filename=self.settings.extract.last
            )
            start_point = tuple(self._extract.start)
            first_value = Null

        counter = Counter(start=0)
        for t, s, b, f, i in reversed(list(zip(  # list() SO reversed() ALSO WORKS ON PYTHON 3
            self._extract.type,
            self._extract.start,
            self._extract.batch,
            listwrap(first_value) + DUMMY_LIST,
            range(len(self._extract.start))
        ))):
            if t == "time":
                counter = DurationCounter(start=s, duration=b, child=counter)
                first_value[i] = Date(f)
            else:
                counter = BatchCounter(start=s, size=b, child=counter)

        batch_size = self._extract.batch.last() * 2 * self.settings.extract.threads
        with MySQL(**self.settings.snowflake.database) as db:
            while not please_stop:
                sql = self._build_list_sql(db, first_value, batch_size + 1)
                pending = []
                counter.reset(start_point)
                with Timer("Grab a block of ids for processing"):
                    with closing(db.db.cursor()) as cursor:
                        acc = []
                        cursor.execute(sql)
                        count = 0
                        for row in cursor:
                            detail_key = counter.next(row)
                            key = tuple(detail_key[:-1])
                            count += 1
                            if key != start_point:
                                if first_value:
                                    if not acc:
                                        Log.error("not expected, {{filename}} is probably set too far in the past", filename=self.settings.extract.last)
                                    pending.append({"start_point": start_point, "first_value": first_value, "data": acc})
                                acc = []
                                start_point = key
                                first_value = row
                            acc.append(row[-1])  # ASSUME LAST COLUMN IS THE FACT TABLE id

                Log.note("adding {{num}} for processing", num=len(pending))
                self.queue.extend(pending)

                if count < batch_size:
                    self.queue.add(THREAD_STOP)
                    break
    except Exception as e:
        Log.warning("Problem pulling data", cause=e)
    finally:
        self.done_pulling.go()
        Log.note("pulling new data is done")
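# A sketch of the two shapes pull_all_remaining() accepts in settings.extract.last
# (contents are illustrative): a manually generated file holds a single etl key,
# as in the error message above, while a machine generated file holds
# [start_point, first_value].
example_manual_last = [[954, 0]]
example_machine_last = [
    [954, 0],                           # start_point
    ["2017-08-12 00:00:00", 12345678]   # first_value: (date, fact table id), ASSUMED SHAPE
]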
class SnowflakeSchema(object):
    @override
    def __init__(self, kwargs=None):
        self.settings = kwargs
        self.settings.exclude = set(self.settings.exclude)
        self.settings.show_foreign_keys = coalesce(self.settings.show_foreign_keys, True)
        self.all_nested_paths = None
        self.nested_path_to_join = None
        self.columns = None

        with Explanation("scan database", debug=DEBUG):
            self.db = MySQL(**kwargs.database)
            with self.db:
                with self.db.transaction():
                    self._scan_database()

    def get_sql(self, get_ids):
        sql = self._compose_sql(get_ids)

        # ORDERING
        sort = []
        ordering = []
        for ci, c in enumerate(self.columns):
            if c.sort:
                sort.append(quote_column(c.column_alias) + SQL_IS_NOT_NULL)
                sort.append(quote_column(c.column_alias))
                ordering.append(ci)

        union_all_sql = SQL_UNION_ALL.join(sql)
        union_all_sql = (
            SQL_SELECT + SQL_STAR +
            SQL_FROM + sql_alias(sql_iso(union_all_sql), quote_column('a')) +
            SQL_ORDERBY + sql_list(sort)
        )
        return union_all_sql

    def _scan_database(self):
        # GET ALL RELATIONS
        raw_relations = self.db.query("""
            SELECT
                table_schema,
                table_name,
                referenced_table_schema,
                referenced_table_name,
                referenced_column_name,
                constraint_name,
                column_name,
                ordinal_position
            FROM
                information_schema.key_column_usage
            WHERE
                referenced_column_name IS NOT NULL
        """, param=self.settings.database)

        if not raw_relations:
            Log.error("No relations in the database")

        for r in self.settings.add_relations:
            try:
                a, b = map(strings.trim, r.split("->"))
                a = a.split(".")
                b = b.split(".")
                raw_relations.append(Data(
                    table_schema=a[0],
                    table_name=a[1],
                    referenced_table_schema=b[0],
                    referenced_table_name=b[1],
                    referenced_column_name=b[2],
                    constraint_name=Random.hex(20),
                    column_name=a[2],
                    ordinal_position=1
                ))
            except Exception as e:
                Log.error("Could not parse {{line|quote}}", line=r, cause=e)

        relations = jx.select(raw_relations, [
            {"name": "constraint.name", "value": "constraint_name"},
            {"name": "table.schema", "value": "table_schema"},
            {"name": "table.name", "value": "table_name"},
            {"name": "column.name", "value": "column_name"},
            {"name": "referenced.table.schema", "value": "referenced_table_schema"},
            {"name": "referenced.table.name", "value": "referenced_table_name"},
            {"name": "referenced.column.name", "value": "referenced_column_name"},
            {"name": "ordinal_position", "value": "ordinal_position"}
        ])

        # GET ALL TABLES
        raw_tables = self.db.query("""
            SELECT
                t.table_schema,
                t.table_name,
                c.constraint_name,
                c.constraint_type,
                k.column_name,
                k.ordinal_position
            FROM
                information_schema.tables t
            LEFT JOIN
                information_schema.table_constraints c ON c.table_name=t.table_name AND c.table_schema=t.table_schema AND (constraint_type='UNIQUE' OR constraint_type='PRIMARY KEY')
            LEFT JOIN
                information_schema.key_column_usage k ON k.constraint_name=c.constraint_name AND k.table_name=t.table_name AND k.table_schema=t.table_schema
            ORDER BY
                t.table_schema,
                t.table_name,
                c.constraint_name,
                k.ordinal_position,
                k.column_name
        """, param=self.settings.database)

        # ORGANIZE, AND PICK ONE UNIQUE CONSTRAINT FOR LINKING
        tables = UniqueIndex(keys=["name", "schema"])
        for t, c in jx.groupby(raw_tables, ["table_name", "table_schema"]):
            c = wrap(list(c))
            best_index = Null
            is_referenced = False
            is_primary = False
            for g, w in jx.groupby(c, "constraint_name"):
                if not g.constraint_name:
                    continue
                w = list(w)
                ref = False
                for r in relations:
                    if r.table.name == t.table_name and r.table.schema == t.table_schema and r.constraint.name == g.constraint_name:
                        ref = True
                is_prime = w[0].constraint_type == "PRIMARY"

                reasons_this_one_is_better = [
                    best_index == None,  # WE DO NOT HAVE A CANDIDATE YET
                    is_prime and not is_primary,  # PRIMARY KEYS ARE GOOD TO HAVE
                    is_primary == is_prime and ref and not is_referenced,  # REFERENCED UNIQUE TUPLES ARE GOOD TOO
                    is_primary == is_prime and ref == is_referenced and len(w) < len(best_index)  # THE SHORTER THE TUPLE, THE BETTER
                ]
                if any(reasons_this_one_is_better):
                    is_primary = is_prime
                    is_referenced = ref
                    best_index = w

            tables.add({
                "name": t.table_name,
                "schema": t.table_schema,
                "id": [b.column_name for b in best_index]
            })

        fact_table = tables[self.settings.fact_table, self.settings.database.schema]
        ids_table = {
            "alias": "t0",
            "name": "__ids__",
            "schema": fact_table.schema,
            "id": fact_table.id
        }
        relations.extend(
            wrap({
                "constraint": {"name": "__link_ids_to_fact_table__"},
                "table": ids_table,
                "column": {"name": c},
                "referenced": {
                    "table": fact_table,
                    "column": {"name": c}
                },
                "ordinal_position": i
            })
            for i, c in enumerate(fact_table.id)
        )
        tables.add(ids_table)

        # GET ALL COLUMNS
        raw_columns = self.db.query("""
            SELECT
                column_name,
                table_schema,
                table_name,
                ordinal_position,
                data_type
            FROM
                information_schema.columns
        """, param=self.settings.database)

        reference_only_tables = [
            r.split(".")[0]
            for r in self.settings.reference_only
            if len(r.split(".")) == 2
        ]
        reference_all_tables = [
            r.split(".")[0]
            for r in self.settings.reference_only
            if len(r.split(".")) == 1
        ]
        foreign_column_table_schema_triples = {
            (r.column.name, r.table.name, r.table.schema)
            for r in relations
        }
        referenced_column_table_schema_triples = {
            (r.referenced.column.name, r.referenced.table.name, r.referenced.table.schema)
            for r in relations
        }
        related_column_table_schema_triples = foreign_column_table_schema_triples | referenced_column_table_schema_triples

        columns = UniqueIndex(["column.name", "table.name", "table.schema"])
        for c in raw_columns:
            if c.table_name in reference_only_tables:
                if c.table_name + "." + c.column_name in self.settings.reference_only:
                    include = True
                    reference = True
                    foreign = False
                elif c.column_name in tables[(c.table_name, c.table_schema)].id:
                    include = self.settings.show_foreign_keys
                    reference = False
                    foreign = False
                else:
                    include = False
                    reference = False
                    foreign = False
            elif c.table_name in reference_all_tables:
                # TABLES USED FOR REFERENCE, NO NESTED DOCUMENTS EXPECTED
                if c.column_name in tables[(c.table_name, c.table_schema)].id:
                    include = self.settings.show_foreign_keys
                    reference = True
                    foreign = False
                elif (c.column_name, c.table_name, c.table_schema) in foreign_column_table_schema_triples:
                    include = False
                    reference = False
                    foreign = True
                else:
                    include = True
                    reference = False
                    foreign = False
            elif c.column_name in tables[(c.table_name, c.table_schema)].id:
                include = self.settings.show_foreign_keys
                reference = False
                foreign = False
            elif (c.column_name, c.table_name, c.table_schema) in foreign_column_table_schema_triples:
                include = False
                reference = False
                foreign = True
            elif (c.column_name, c.table_name, c.table_schema) in referenced_column_table_schema_triples:
                include = self.settings.show_foreign_keys
                reference = False
                foreign = False
            else:
                include = True
                reference = False
                foreign = False

            rel = {
                "column": {"name": c.column_name, "type": c.data_type},
                "table": {"name": c.table_name, "schema": c.table_schema},
                "ordinal_position": c.ordinal_position,
                "is_id": c.column_name in tables[(c.table_name, c.table_schema)].id,
                "include": include,      # TRUE IF THIS COLUMN IS OUTPUTTED
                "reference": reference,  # TRUE IF THIS COLUMN REPRESENTS THE ROW
                "foreign": foreign       # TRUE IF THIS COLUMN POINTS TO ANOTHER ROW
            }
            columns.add(rel)

        # ITERATE OVER ALL PATHS
        todo = FlatList()
        output_columns = FlatList()
        nested_path_to_join = {}
        all_nested_paths = [["."]]

        def follow_paths(position, path, nested_path, done_relations, no_nested_docs):
            if position.name in self.settings.exclude:
                return
            if DEBUG:
                Log.note("Trace {{path}}", path=path)
            if position.name != "__ids__":
                # USED TO CONFIRM WE CAN ACCESS THE TABLE (WILL THROW ERROR IF IT FAILS)
                self.db.query("SELECT * FROM " + quote_column(position.name, position.schema) + " LIMIT 1")

            if position.name in reference_all_tables:
                no_nested_docs = True
            if position.name in reference_only_tables:
                return

            curr_join_list = copy(nested_path_to_join[nested_path[0]])

            # INNER OBJECTS
            referenced_tables = list(
                jx.groupby(
                    jx.filter(
                        relations,
                        {"eq": {"table.name": position.name, "table.schema": position.schema}}
                    ),
                    "constraint.name"
                )
            )
            for g, constraint_columns in referenced_tables:
                g = unwrap(g)
                constraint_columns = deepcopy(constraint_columns)
                if g["constraint.name"] in done_relations:
                    continue
                if any(cc for cc in constraint_columns if cc.referenced.table.name in self.settings.exclude):
                    continue

                done_relations.add(g["constraint.name"])

                many_to_one_joins = nested_path_to_join[nested_path[0]]
                index = len(many_to_one_joins)
                alias = "t" + text_type(index)
                for c in constraint_columns:
                    c.referenced.table.alias = alias
                    c.table = position
                many_to_one_joins.append({
                    "join_columns": constraint_columns,
                    "path": path,
                    "nested_path": nested_path
                })

                # referenced_table_path = join_field(split_field(path) + ["/".join(constraint_columns.referenced.table.name)])
                # HANDLE THE COMMON *id SUFFIX
                name = []
                for a, b in zip(constraint_columns.column.name, constraint_columns.referenced.table.name):
                    if a.startswith(b):
                        name.append(b)
                    elif a.endswith("_id"):
                        name.append(a[:-3])
                    else:
                        name.append(a)
                referenced_column_path = join_field(split_field(path) + ["/".join(name)])
                col_pointer_name = relative_field(referenced_column_path, nested_path[0])
                # insert into nested1 VALUES (100, 10, 'aaa', -1);
                # id.about.time.nested1 .ref=10
                # id.about.time.nested1 .ref.name
                for col in columns:
                    if col.table.name == constraint_columns[0].referenced.table.name and col.table.schema == constraint_columns[0].referenced.table.schema:
                        col_full_name = concat_field(col_pointer_name, literal_field(col.column.name))

                        if col.is_id and col.table.name == fact_table.name and col.table.schema == fact_table.schema:
                            # ALWAYS SHOW THE ID OF THE FACT
                            c_index = len(output_columns)
                            output_columns.append({
                                "table_alias": alias,
                                "column_alias": "c" + text_type(c_index),
                                "column": col,
                                "sort": True,
                                "path": referenced_column_path,
                                "nested_path": nested_path,
                                "put": col_full_name
                            })
                        elif col.column.name == constraint_columns[0].column.name:
                            c_index = len(output_columns)
                            output_columns.append({
                                "table_alias": alias,
                                "column_alias": "c" + text_type(c_index),
                                "column": col,
                                "sort": False,
                                "path": referenced_column_path,
                                "nested_path": nested_path,
                                "put": col_full_name if self.settings.show_foreign_keys else None
                            })
                        elif col.is_id:
                            c_index = len(output_columns)
                            output_columns.append({
                                "table_alias": alias,
                                "column_alias": "c" + text_type(c_index),
                                "column": col,
                                "sort": False,
                                "path": referenced_column_path,
                                "nested_path": nested_path,
                                "put": col_full_name if self.settings.show_foreign_keys else None
                            })
                        elif col.reference:
                            c_index = len(output_columns)
                            output_columns.append({
                                "table_alias": alias,
                                "column_alias": "c" + text_type(c_index),
                                "column": col,
                                "sort": False,
                                "path": referenced_column_path,
                                "nested_path": nested_path,
                                "put": col_pointer_name if not self.settings.show_foreign_keys else col_full_name  # REFERENCE FIELDS CAN REPLACE THE WHOLE OBJECT BEING REFERENCED
                            })
                        elif col.include:
                            c_index = len(output_columns)
                            output_columns.append({
                                "table_alias": alias,
                                "column_alias": "c" + text_type(c_index),
                                "column": col,
                                "sort": False,
                                "path": referenced_column_path,
                                "nested_path": nested_path,
                                "put": col_full_name
                            })

                if position.name in reference_only_tables:
                    continue

                todo.append(Data(
                    position=copy(constraint_columns[0].referenced.table),
                    path=referenced_column_path,
                    nested_path=nested_path,
                    done_relations=copy(done_relations),
                    no_nested_docs=no_nested_docs
                ))

            # NESTED OBJECTS
            if not no_nested_docs:
                for g, constraint_columns in jx.groupby(
                    jx.filter(
                        relations,
                        {"eq": {"referenced.table.name": position.name, "referenced.table.schema": position.schema}}
                    ),
                    "constraint.name"
                ):
                    g = unwrap(g)
                    constraint_columns = deepcopy(constraint_columns)
                    if g["constraint.name"] in done_relations:
                        continue
                    done_relations.add(g["constraint.name"])

                    many_table = set(constraint_columns.table.name)
                    if not (many_table - self.settings.exclude):
                        continue

                    referenced_column_path = join_field(split_field(path) + ["/".join(many_table)])
                    new_nested_path = [referenced_column_path] + nested_path
                    all_nested_paths.append(new_nested_path)

                    # if new_path not in self.settings.include:
                    #     Log.note("Exclude nested path {{path}}", path=new_path)
                    #     continue
                    one_to_many_joins = nested_path_to_join[referenced_column_path] = copy(curr_join_list)
                    index = len(one_to_many_joins)
                    alias = "t" + text_type(index)
                    for c in constraint_columns:
                        c.table.alias = alias
                        c.referenced.table = position
                    one_to_many_joins.append(set_default({}, g, {
                        "children": True,
                        "join_columns": constraint_columns,
                        "path": path,
                        "nested_path": nested_path
                    }))

                    # insert into nested1 VALUES (100, 10, 'aaa', -1);
                    # id.about.time.nested1 .ref=10
                    # id.about.time.nested1 .ref.name
                    for col in columns:
                        if col.table.name == constraint_columns[0].table.name and col.table.schema == constraint_columns[0].table.schema:
                            col_full_name = join_field(
                                split_field(referenced_column_path)[len(split_field(new_nested_path[0])):]
                                + [literal_field(col.column.name)]
                            )

                            if col.column.name == constraint_columns[0].column.name:
                                c_index = len(output_columns)
                                output_columns.append({
                                    "table_alias": alias,
                                    "column_alias": "c" + text_type(c_index),
                                    "column": col,
                                    "sort": col.is_id,
                                    "path": referenced_column_path,
                                    "nested_path": new_nested_path,
                                    "put": col_full_name if self.settings.show_foreign_keys else None
                                })
                            elif col.is_id:
                                c_index = len(output_columns)
                                output_columns.append({
                                    "table_alias": alias,
                                    "column_alias": "c" + text_type(c_index),
                                    "column": col,
                                    "sort": col.is_id,
                                    "path": referenced_column_path,
                                    "nested_path": new_nested_path,
                                    "put": col_full_name if self.settings.show_foreign_keys else None
                                })
                            else:
                                c_index = len(output_columns)
                                output_columns.append({
                                    "table_alias": alias,
                                    "column_alias": "c" + text_type(c_index),
                                    "column": col,
                                    "sort": col.is_id,
                                    "path": referenced_column_path,
                                    "nested_path": new_nested_path,
                                    "put": col_full_name if col.include else None
                                })

                    todo.append(Data(
                        position=constraint_columns[0].table,
                        path=referenced_column_path,
                        nested_path=new_nested_path,
                        done_relations=copy(done_relations),
                        no_nested_docs=no_nested_docs
                    ))

        path = "."
        nested_path = [path]
        nested_path_to_join["."] = [{
            "path": path,
            "join_columns": [{"referenced": {"table": ids_table}}],
            "nested_path": nested_path
        }]

        todo.append(Data(
            position=ids_table,
            path=path,
            nested_path=nested_path,
            done_relations=set(),
            no_nested_docs=False
        ))

        while todo:
            item = todo.pop(0)
            follow_paths(**item)

        self.all_nested_paths = all_nested_paths
        self.nested_path_to_join = nested_path_to_join
        self.columns = output_columns

    def _compose_sql(self, get_ids):
        """
        :param get_ids: SQL to get the ids, and used to select the documents returned
        :return:
        """
        sql = []
        for nested_path in self.all_nested_paths:
            # MAKE THE REQUIRED JOINS
            sql_joins = []

            for i, curr_join in enumerate(self.nested_path_to_join[nested_path[0]]):
                curr_join = wrap(curr_join)
                rel = curr_join.join_columns[0]
                if i == 0:
                    sql_joins.append(
                        SQL_FROM +
                        sql_alias(sql_iso(get_ids), quote_column(rel.referenced.table.alias))
                    )
                elif curr_join.children:
                    full_name = quote_column(rel.table.name, rel.table.schema)
                    sql_joins.append(
                        SQL_JOIN +
                        sql_alias(full_name, quote_column(rel.table.alias)) +
                        SQL_ON +
                        sql_and(
                            quote_column(const_col.column.name, rel.table.alias) + "=" + quote_column(const_col.referenced.column.name, rel.referenced.table.alias)
                            for const_col in curr_join.join_columns
                        )
                    )
                else:
                    full_name = quote_column(rel.referenced.table.name, rel.referenced.table.schema)
                    sql_joins.append(
                        SQL_LEFT_JOIN +
                        sql_alias(full_name, quote_column(rel.referenced.table.alias)) +
                        SQL_ON +
                        sql_and(
                            quote_column(const_col.referenced.column.name, rel.referenced.table.alias) + "=" + quote_column(const_col.column.name, rel.table.alias)
                            for const_col in curr_join.join_columns
                        )
                    )

            # ONLY SELECT WHAT WE NEED, NULL THE REST
            selects = []
            not_null_column_seen = False
            for ci, c in enumerate(self.columns):
                if c.column_alias[1:] != text_type(ci):
                    Log.error("expecting consistency")
                if c.nested_path[0] == nested_path[0]:
                    s = sql_alias(quote_column(c.column.column.name, c.table_alias), quote_column(c.column_alias))
                    if s == None:
                        Log.error("bug")
                    selects.append(s)
                    not_null_column_seen = True
                elif startswith_field(nested_path[0], c.path):
                    # PARENT ID REFERENCES
                    if c.column.is_id:
                        s = sql_alias(quote_column(c.column.column.name, c.table_alias), quote_column(c.column_alias))
                        selects.append(s)
                        not_null_column_seen = True
                    else:
                        selects.append(sql_alias(SQL_NULL, quote_column(c.column_alias)))
                else:
                    selects.append(sql_alias(SQL_NULL, quote_column(c.column_alias)))

            if not_null_column_seen:
                sql.append(SQL_SELECT + sql_list(selects) + "".join(sql_joins))

        return sql
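# A minimal usage sketch (assumed, not from the original source): the schema scan runs
# in the constructor, and get_sql() only needs a SQL fragment that enumerates the fact
# table ids to extract. `settings.snowflake`, the fragment, and the ids are placeholders,
# and the fragment may need to be wrapped in the library's SQL type before use.
def example_snowflake_sql(settings):
    schema = SnowflakeSchema(kwargs=settings.snowflake)
    get_ids = "SELECT id FROM fact_table WHERE id IN (10, 11)"  # ASSUMED id QUERY
    return schema.get_sql(get_ids)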
def update_repo(repo, settings):
    with MySQL(settings.database) as db:
        try:
            pull_repo(repo)

            # GET LATEST DATE
            existing_range = db.query("""
                SELECT
                    max(`date`) `max`,
                    min(`date`) `min`,
                    min(revision) min_rev,
                    max(revision) max_rev
                FROM
                    changesets
                WHERE
                    repos={{repos}}
            """, {"repos": repo.name})[0]

            ranges = wrap([
                {"min": coalesce(existing_range.max, convert.milli2datetime(0)) + timedelta(days=1)},
                {"max": existing_range.min}
            ])
            for r in ranges:
                for g, docs in qb.groupby(get_changesets(date_range=r, repo=repo), size=100):
                    for doc in docs:
                        doc.file_changes = None
                        doc.file_adds = None
                        doc.file_dels = None
                        doc.description = doc.description[0:16000]
                    db.insert_list("changesets", docs)
                    db.flush()

            missing_revisions = find_holes(
                db,
                "changesets",
                "revision",
                {"min": 0, "max": existing_range.max_rev + 1},
                {"term": {"repos": repo.name}}
            )
            for _range in missing_revisions:
                for g, docs in qb.groupby(get_changesets(revision_range=_range, repo=repo), size=100):
                    for doc in docs:
                        doc.file_changes = None
                        doc.file_adds = None
                        doc.file_dels = None
                        doc.description = doc.description[0:16000]
                    db.insert_list("changesets", docs)
                    db.flush()
        except Exception as e:
            Log.warning("Failure to pull from {{repos.name}}", {"repos": repo}, e)
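# A hedged driver sketch (not from the original source): iterate the configured
# repositories and pull each one. `settings.repos` is an assumption about how this
# module is wired up; listwrap() tolerates a single repo or a list.
def example_update_all_repos(settings):
    for repo in listwrap(settings.repos):
        update_repo(repo, settings)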