def add_cset_entries(self, ordered_rev_list, timestamp=False, number_forward=True):
    '''
    Adds a list of revisions to the table. Assumes ordered_rev_list is an ordered list
    based on how changesets are found in the changelog. Going forwards or backwards is
    dealt with by flipping the list.
    :param ordered_rev_list: Order given from changeset log searching.
    :param timestamp: If False, records are kept indefinitely, but if holes exist:
                      (delete, None, delete, None) those delete entries with None's
                      around them will not be deleted.
    :param number_forward: If True, this function will number the revision list by going
                           forward from max(revnum); otherwise it goes backwards from
                           min(revnum), then adds X to all revnums and self.next_revnum,
                           where X is the length of ordered_rev_list.
    :return:
    '''
    with self.conn.transaction() as t:
        current_min = t.get_one("SELECT min(revnum) FROM csetlog")[0]
        current_max = t.get_one("SELECT max(revnum) FROM csetlog")[0]
        if not current_min or not current_max:
            current_min = 0
            current_max = 0

        direction = -1
        start = current_min - 1
        if number_forward:
            direction = 1
            start = current_max + 1
            ordered_rev_list = ordered_rev_list[::-1]

        insert_list = [
            (start + direction * count, rev, int(time.time()) if timestamp else -1)
            for count, rev in enumerate(ordered_rev_list)
        ]

        # In case of overlapping requests
        fmt_insert_list = []
        for cset_entry in insert_list:
            tmp = self._get_one_revision(t, cset_entry)
            if not tmp:
                fmt_insert_list.append(cset_entry)

        for _, tmp_insert_list in jx.chunk(fmt_insert_list, size=SQL_CSET_BATCH_SIZE):
            t.execute(
                "INSERT INTO csetLog (revnum, revision, timestamp)" + " VALUES " +
                sql_list(
                    quote_set((revnum, revision, timestamp))
                    for revnum, revision, timestamp in tmp_insert_list
                )
            )

        # Move the revision numbers forward if needed
        self.recompute_table_revnums()

    # Start a maintenance run if needed
    if self.check_for_maintenance():
        self.maintenance_signal.go()
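# A self-contained sketch of just the revnum numbering performed above (the revision
# strings are hypothetical): going forward numbers from current_max + 1 upward and
# reverses the input list; going backward numbers from current_min - 1 downward.
def number_revisions_sketch(ordered_rev_list, current_min, current_max, number_forward):
    direction, start = -1, current_min - 1
    if number_forward:
        direction, start = 1, current_max + 1
        ordered_rev_list = ordered_rev_list[::-1]
    return [(start + direction * count, rev) for count, rev in enumerate(ordered_rev_list)]

# e.g. number_revisions_sketch(["rev_c", "rev_b", "rev_a"], 0, 10, True)
#      == [(11, "rev_a"), (12, "rev_b"), (13, "rev_c")]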
def test_chunk(self):
    data = []
    for g, d in jx.chunk(data, size=5):
        assert False

    data = [1, 2, 3]
    for g, d in jx.chunk(data, size=5):
        assert d == [1, 2, 3]

    data = [1, 2, 3, 4, 5]
    for g, d in jx.chunk(data, size=5):
        assert d == [1, 2, 3, 4, 5]

    data = [1, 2, 3, 4, 5, 6]
    for g, d in jx.chunk(data, size=5):
        assert d == [1, 2, 3, 4, 5] or d == [6]

    data = [1, 2, 3, 4, 5, 6, 7, 8, 9]
    for g, d in jx.chunk(data, size=5):
        assert d == [1, 2, 3, 4, 5] or d == [6, 7, 8, 9]
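# A minimal sketch of the contract these tests assert: jx.chunk(data, size=n) yields
# (group_index, sub_list) pairs, with the last sub_list holding any remainder. This is
# NOT the jx implementation, only an illustration of the observed behavior.
def chunk_sketch(data, size):
    for group, start in enumerate(range(0, len(data), size)):
        yield group, data[start:start + size]

# e.g. list(chunk_sketch([1, 2, 3, 4, 5, 6], size=5)) == [(0, [1, 2, 3, 4, 5]), (1, [6])]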
def save_money(self, remaining_budget, net_new_utility):
    remove_spot_requests = wrap([])

    # FIRST CANCEL THE PENDING REQUESTS
    if remaining_budget < 0:
        requests = self._get_managed_spot_requests()
        for r in requests:
            if r.status.code in PENDING_STATUS_CODES | PROBABLY_NOT_FOR_A_WHILE | MIGHT_HAPPEN:
                remove_spot_requests.append(r.id)
                net_new_utility += self.settings.utility[r.launch_specification.instance_type].utility
                remaining_budget += r.price

    instances = jx.sort(self.running_instances(), "markup.estimated_value")

    remove_list = wrap([])
    for s in instances:
        if remaining_budget >= 0:
            break
        remove_list.append(s)
        net_new_utility += coalesce(s.markup.type.utility, 0)
        remaining_budget += coalesce(s.request.bid_price, s.markup.price_80, s.markup.current_price)

    if not remove_list:
        return remaining_budget, net_new_utility

    # SEND SHUTDOWN TO EACH INSTANCE
    Log.warning("Shutdown {{instances}} to save money!", instances=remove_list.id)
    if ALLOW_SHUTDOWN:
        for g, removals in jx.chunk(remove_list, size=20):
            for i, t in [
                (i, Thread.run("teardown " + i.id, self.instance_manager.teardown, i, please_stop=False))
                for i in removals
            ]:
                try:
                    t.join()
                except Exception:
                    Log.note("Problem with shutdown of {{id}}", id=i.id)

        remove_spot_requests.extend(remove_list.spot_instance_request_id)

        # TERMINATE INSTANCES
        self.ec2_conn.terminate_instances(instance_ids=remove_list.id)

        # TERMINATE SPOT REQUESTS
        self.ec2_conn.cancel_spot_instance_requests(request_ids=remove_spot_requests)

    return remaining_budget, net_new_utility
def _execute_backlog(self):
    if not self.backlog:
        return

    backlog, self.backlog = self.backlog, []
    for i, g in jx.chunk(backlog, size=MAX_BATCH_SIZE):
        sql = self.preamble + ";\n".join(g)
        try:
            self.debug and Log.note("Execute block of SQL:\n{{sql|indent}}", sql=sql)
            self.cursor.execute(sql)
            self.cursor.close()
            self.cursor = self.db.cursor()
        except Exception as e:
            Log.error("Problem executing SQL:\n{{sql|indent}}", sql=sql, cause=e, stack_depth=1)
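# A self-contained sketch of the batching above: queued statements are taken in groups
# of MAX_BATCH_SIZE and each group is joined into one multi-statement string, prefixed
# by the preamble. The statements and sizes here are illustrative only.
def batch_statements_sketch(statements, preamble="", batch_size=3):
    batches = []
    for start in range(0, len(statements), batch_size):
        batches.append(preamble + ";\n".join(statements[start:start + batch_size]))
    return batches

# e.g. batch_statements_sketch(["INSERT INTO a VALUES (1)", "INSERT INTO a VALUES (2)"])
#      == ["INSERT INTO a VALUES (1);\nINSERT INTO a VALUES (2)"]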
def md5(source, chunk_size=CHUNK_SIZE):
    md5s = []
    for g, data in jx.chunk(source.read_bytes(), size=chunk_size):
        md5s.append(hashlib.md5(data).digest())

    if len(md5s) == 0:
        return '"d41d8cd98f00b204e9800998ecf8427e"'
    elif len(md5s) == 1:
        return quote(md5s[0].encode("hex"))
    else:
        Log.warning("not known to work")
        new_md5 = hashlib.md5(b"".join(md5s))
        return unicode(new_md5.hexdigest() + b"-" + str(len(md5s)))
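# The multi-chunk branch above follows the digest-of-digests pattern used for S3
# multipart ETags: md5 of the concatenated per-part digests, suffixed with "-<part count>".
# A self-contained Python 3 sketch of that pattern (the names here are illustrative,
# not the function above, which relies on jx and Python 2 string handling).
import hashlib

def multipart_etag_sketch(payload, chunk_size):
    digests = [
        hashlib.md5(payload[i:i + chunk_size]).digest()
        for i in range(0, len(payload), chunk_size)
    ]
    if len(digests) <= 1:
        return hashlib.md5(payload).hexdigest()
    combined = hashlib.md5(b"".join(digests)).hexdigest()
    return "{}-{}".format(combined, len(digests))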
def _insert_loop(self, please_stop=None):
    bad_count = 0
    while not please_stop:
        try:
            messages = wrap(self.queue.pop_all())
            if not messages:
                Till(seconds=PAUSE_AFTER_GOOD_INSERT).wait()
                continue

            for g, mm in jx.chunk(messages, size=self.batch_size):
                scrubbed = []
                for i, message in enumerate(mm):
                    if message is THREAD_STOP:
                        please_stop.go()
                        continue
                    try:
                        chain = flatten_causal_chain(message.value)
                        scrubbed.append(
                            {
                                "value": [
                                    _deep_json_to_string(link, depth=3)
                                    for link in chain
                                ]
                            }
                        )
                    except Exception as e:
                        Log.warning("Problem adding to scrubbed list", cause=e)

                self.es.extend(scrubbed)
                bad_count = 0
        except Exception as f:
            Log.warning("Problem inserting logs into ES", cause=f)
            bad_count += 1
            if bad_count > MAX_BAD_COUNT:
                Log.warning(
                    "Given up trying to write debug logs to ES index {{index}}",
                    index=self.es.settings.index,
                )
                break
            Till(seconds=PAUSE_AFTER_BAD_INSERT).wait()

    # CONTINUE TO DRAIN THIS QUEUE
    while not please_stop:
        try:
            Till(seconds=PAUSE_AFTER_GOOD_INSERT).wait()
            self.queue.pop_all()
        except Exception as e:
            Log.warning("Should not happen", cause=e)
def unescape_name(esc_name):
    if not isinstance(esc_name, ApiName):
        Log.error("expecting an api name")
    if len(esc_name.values) > 1:
        Log.error("do not know how to handle")

    try:
        parts = text(esc_name).split("_")
        result = parts[:1]
        for i, (p, q) in jx.chunk(parts[1:], 2):
            if len(p) == 0:
                result.append("_")
            else:
                result.append(hex2chr(p))
            result.append(q)
        name = "".join(result)
        return name
    except Exception:
        return esc_name.values[0]
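# A self-contained sketch of the decoding loop above, assuming escape_name replaces a
# disallowed character c with "_" + hex(ord(c)) + "_" and a literal "_" with "__".
# That encoding is an assumption here, and hex2chr_sketch is a stand-in for the real helper.
def hex2chr_sketch(hex_pair):
    return chr(int(hex_pair, 16))

def unescape_sketch(escaped):
    parts = escaped.split("_")
    result, rest = parts[:1], parts[1:]
    for start in range(0, len(rest), 2):
        p, q = rest[start:start + 2]
        result.append("_" if p == "" else hex2chr_sketch(p))
        result.append(q)
    return "".join(result)

# e.g. unescape_sketch("a_2e_b") == "a.b" and unescape_sketch("a__b") == "a_b"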
def merge_shards(self):
    shards = []
    tables = list(self.container.client.list_tables(self.container.dataset))
    current_view = Null  # VIEW THAT POINTS TO PRIMARY SHARD
    primary_shard_name = None  # PRIMARY SHARD
    api_name = escape_name(self.short_name)

    for table_item in tables:
        table = table_item.reference
        table_api_name = ApiName(table.table_id)
        if text(table_api_name).startswith(text(api_name)):
            if table_api_name == api_name:
                if table_item.table_type != "VIEW":
                    Log.error("expecting {{table}} to be a view", table=api_name)
                current_view = self.container.client.get_table(table)
                view_sql = current_view.view_query
                primary_shard_name = _extract_primary_shard_name(view_sql)
            elif SUFFIX_PATTERN.match(text(table_api_name)[len(text(api_name)):]):
                try:
                    known_table = self.container.client.get_table(table)
                    shards.append(known_table)
                except Exception as e:
                    Log.warning("could not merge table {{table}}", table=table, cause=e)

    if not current_view:
        Log.error("expecting {{table}} to be a view pointing to a table", table=api_name)

    shard_flakes = [
        Snowflake.parse(
            big_query_schema=shard.schema,
            es_index=text(self.container.full_name + ApiName(shard.table_id)),
            top_level_fields=self.top_level_fields,
            partition=self.partition,
        )
        for shard in shards
    ]
    total_flake = snowflakes.merge(
        shard_flakes,
        es_index=text(self.full_name),
        top_level_fields=self.top_level_fields,
        partition=self.partition,
    )

    for i, s in enumerate(shards):
        if ApiName(s.table_id) == primary_shard_name:
            if total_flake == shard_flakes[i]:
                # USE THE CURRENT PRIMARY SHARD AS A DESTINATION
                del shards[i]
                del shard_flakes[i]
                break
    else:
        name = self.short_name + "_" + "".join(Random.sample(ALLOWED, 20))
        primary_shard_name = escape_name(name)
        self.container.create_table(
            table=name,
            schema=total_flake.schema,
            sharded=False,
            read_only=False,
            kwargs=self.config,
        )

    primary_full_name = self.container.full_name + primary_shard_name

    selects = []
    for flake, table in zip(shard_flakes, shards):
        q = ConcatSQL(
            SQL_SELECT,
            JoinSQL(ConcatSQL(SQL_COMMA, SQL_CR), gen_select(total_flake, flake)),
            SQL_FROM,
            quote_column(ApiName(table.dataset_id, table.table_id)),
        )
        selects.append(q)

    Log.note("inserting into table {{table}}", table=text(primary_shard_name))
    matched = []
    unmatched = []
    for sel, shard, flake in zip(selects, shards, shard_flakes):
        if flake == total_flake:
            matched.append((sel, shard, flake))
        else:
            unmatched.append((sel, shard, flake))

    # EVERYTHING THAT IS IDENTICAL TO PRIMARY CAN BE MERGED WITH SIMPLE UNION ALL
    if matched:
        for g, merge_chunk in jx.chunk(matched, MAX_MERGE):
            command = ConcatSQL(
                SQL_INSERT,
                quote_column(primary_full_name),
                JoinSQL(
                    SQL_UNION_ALL,
                    (
                        sql_query({
                            "from": self.container.full_name + ApiName(shard.table_id)
                        })
                        for _, shard, _ in merge_chunk
                    ),
                ),
            )
            DEBUG and Log.note("{{sql}}", sql=text(command))
            job = self.container.query_and_wait(command)
            Log.note("job {{id}} state = {{state}}", id=job.job_id, state=job.state)

            if job.errors:
                Log.error(
                    "\n{{sql}}\nDid not fill table:\n{{reason|json|indent}}",
                    sql=command.sql,
                    reason=job.errors,
                )

            for _, shard, _ in merge_chunk:
                self.container.client.delete_table(shard)

    # ALL OTHER SCHEMAS MISMATCH
    for s, shard, _ in unmatched:
        try:
            command = ConcatSQL(SQL_INSERT, quote_column(primary_full_name), s)
            DEBUG and Log.note("{{sql}}", sql=text(command))
            job = self.container.query_and_wait(command)
            Log.note(
                "from {{shard}}, job {{id}}, state {{state}}",
                id=job.job_id,
                shard=shard.table_id,
                state=job.state,
            )

            if job.errors:
                if all(" does not have a schema." in m for m in wrap(job.errors).message):
                    pass  # NOTHING TO DO
                else:
                    Log.error(
                        "\n{{sql}}\nDid not fill table:\n{{reason|json|indent}}",
                        sql=command.sql,
                        reason=job.errors,
                    )

            self.container.client.delete_table(shard)
        except Exception as e:
            Log.warning("failure to merge {{shard}}", shard=shard, cause=e)

    # REMOVE OLD VIEW
    view_full_name = self.container.full_name + api_name
    if current_view:
        self.container.client.delete_table(current_view)

    # CREATE NEW VIEW
    self.container.create_view(view_full_name, primary_full_name)