def _extract_primary_shard_name(view_sql):
    # TODO: USE REAL PARSER
    e = view_sql.lower().find("from ")
    return ApiName(view_sql[e + 5:].strip().split(".")[-1].strip("`"))
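
# A minimal sketch of _extract_primary_shard_name(), assuming the alias view body
# has the simple "SELECT * FROM <shard>" form the surrounding code expects
# (the project/dataset/shard names below are made up):
#
#     _extract_primary_shard_name(
#         "SELECT * FROM `my-project.my_dataset.testing_a1b2c3`"
#     )  # -> ApiName("testing_a1b2c3")
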
def merge_shards(self):
    # MERGE ALL SHARDS (AND THE CURRENT PRIMARY) INTO ONE DESTINATION SHARD,
    # THEN RE-POINT THE ALIAS VIEW AT THAT DESTINATION
    shards = []
    tables = list(self.container.client.list_tables(self.container.dataset))
    current_view = Null  # VIEW THAT POINTS TO PRIMARY SHARD
    primary_shard_name = None  # PRIMARY SHARD
    api_name = escape_name(self.short_name)
    for table_item in tables:
        table = table_item.reference
        table_api_name = ApiName(table.table_id)
        if text(table_api_name).startswith(text(api_name)):
            if table_api_name == api_name:
                if table_item.table_type != "VIEW":
                    Log.error("expecting {{table}} to be a view", table=api_name)
                current_view = self.container.client.get_table(table)
                view_sql = current_view.view_query
                primary_shard_name = _extract_primary_shard_name(view_sql)
            elif SUFFIX_PATTERN.match(text(table_api_name)[len(text(api_name)):]):
                try:
                    known_table = self.container.client.get_table(table)
                    shards.append(known_table)
                except Exception as e:
                    Log.warning("could not merge table {{table}}", table=table, cause=e)

    if not current_view:
        Log.error("expecting {{table}} to be a view pointing to a table", table=api_name)

    shard_flakes = [
        Snowflake.parse(
            big_query_schema=shard.schema,
            es_index=text(self.container.full_name + ApiName(shard.table_id)),
            top_level_fields=self.top_level_fields,
            partition=self.partition,
        )
        for shard in shards
    ]
    total_flake = snowflakes.merge(
        shard_flakes,
        es_index=text(self.full_name),
        top_level_fields=self.top_level_fields,
        partition=self.partition,
    )

    for i, s in enumerate(shards):
        if ApiName(s.table_id) == primary_shard_name:
            if total_flake == shard_flakes[i]:
                # USE THE CURRENT PRIMARY SHARD AS A DESTINATION
                del shards[i]
                del shard_flakes[i]
                break
    else:
        name = self.short_name + "_" + "".join(Random.sample(ALLOWED, 20))
        primary_shard_name = escape_name(name)
        self.container.create_table(
            table=name,
            schema=total_flake.schema,
            sharded=False,
            read_only=False,
            kwargs=self.config,
        )

    primary_full_name = self.container.full_name + primary_shard_name

    selects = []
    for flake, table in zip(shard_flakes, shards):
        q = ConcatSQL(
            SQL_SELECT,
            JoinSQL(ConcatSQL(SQL_COMMA, SQL_CR), gen_select(total_flake, flake)),
            SQL_FROM,
            quote_column(ApiName(table.dataset_id, table.table_id)),
        )
        selects.append(q)

    Log.note("inserting into table {{table}}", table=text(primary_shard_name))

    matched = []
    unmatched = []
    for sel, shard, flake in zip(selects, shards, shard_flakes):
        if flake == total_flake:
            matched.append((sel, shard, flake))
        else:
            unmatched.append((sel, shard, flake))

    # EVERYTHING THAT IS IDENTICAL TO PRIMARY CAN BE MERGED WITH SIMPLE UNION ALL
    if matched:
        for g, merge_chunk in jx.chunk(matched, MAX_MERGE):
            command = ConcatSQL(
                SQL_INSERT,
                quote_column(primary_full_name),
                JoinSQL(
                    SQL_UNION_ALL,
                    (
                        sql_query({"from": self.container.full_name + ApiName(shard.table_id)})
                        for _, shard, _ in merge_chunk
                    ),
                ),
            )
            DEBUG and Log.note("{{sql}}", sql=text(command))
            job = self.container.query_and_wait(command)
            Log.note("job {{id}} state = {{state}}", id=job.job_id, state=job.state)

            if job.errors:
                Log.error(
                    "\n{{sql}}\nDid not fill table:\n{{reason|json|indent}}",
                    sql=command.sql,
                    reason=job.errors,
                )

            for _, shard, _ in merge_chunk:
                self.container.client.delete_table(shard)

    # ALL OTHER SCHEMAS MISMATCH
    for s, shard, _ in unmatched:
        try:
            command = ConcatSQL(SQL_INSERT, quote_column(primary_full_name), s)
            DEBUG and Log.note("{{sql}}", sql=text(command))
            job = self.container.query_and_wait(command)
            Log.note(
                "from {{shard}}, job {{id}}, state {{state}}",
                id=job.job_id,
                shard=shard.table_id,
                state=job.state,
            )

            if job.errors:
                if all(" does not have a schema." in m for m in wrap(job.errors).message):
                    pass  # NOTHING TO DO
                else:
                    Log.error(
                        "\n{{sql}}\nDid not fill table:\n{{reason|json|indent}}",
                        sql=command.sql,
                        reason=job.errors,
                    )

            self.container.client.delete_table(shard)
        except Exception as e:
            Log.warning("failure to merge {{shard}}", shard=shard, cause=e)

    # REMOVE OLD VIEW
    view_full_name = self.container.full_name + api_name
    if current_view:
        self.container.client.delete_table(current_view)

    # CREATE NEW VIEW
    self.container.create_view(view_full_name, primary_full_name)
def __init__(
    self,
    table,
    typed,
    read_only,
    sharded,
    container,
    id=Null,
    partition=Null,
    cluster=Null,
    top_level_fields=Null,
    kwargs=None,
):
    self.short_name = table
    self.typed = typed
    self.read_only = read_only
    self.cluster = cluster
    self.id = id
    self.top_level_fields = top_level_fields
    self.config = Data(  # USED TO REPLICATE THIS
        typed=typed,
        read_only=read_only,
        sharded=sharded,
        id=id,
        partition=partition,
        cluster=cluster,
        top_level_fields=top_level_fields,
    )
    esc_name = escape_name(table)
    self.full_name = container.full_name + esc_name
    self.alias_view = alias_view = container.client.get_table(text(self.full_name))
    self.partition = partition
    self.container = container

    if not sharded:
        if not read_only and alias_view.table_type == "VIEW":
            Log.error("Expecting a table, not a view")
        self.shard = alias_view
        self._flake = Snowflake.parse(
            alias_view.schema,
            text(self.full_name),
            self.top_level_fields,
            partition,
        )
    else:
        if alias_view.table_type != "VIEW":
            Log.error("Sharded tables require a view")
        current_view = container.client.get_table(text(self.full_name))
        view_sql = current_view.view_query
        shard_name = _extract_primary_shard_name(view_sql)
        try:
            self.shard = container.client.get_table(text(container.full_name + shard_name))
            self._flake = Snowflake.parse(
                alias_view.schema,
                text(self.full_name),
                self.top_level_fields,
                partition,
            )
        except Exception as e:
            Log.warning("view {{name}} is invalid", name=shard_name, cause=e)
            self._flake = Snowflake.parse(
                alias_view.schema,
                text(self.full_name),
                self.top_level_fields,
                partition,
            )
            # REMOVE STALE VIEW
            container.client.delete_table(current_view)
            # MAKE NEW VIEW POINTING TO NEW SHARD
            self._create_new_shard()
            container.create_view(
                self.full_name,
                self.container.full_name + ApiName(self.shard.table_id),
            )

    self.last_extend = Date.now() - EXTEND_LIMIT
def create_dataset(project_id, dataset, client):
    full_name = ApiName(project_id) + escape_name(dataset)
    _dataset = bigquery.Dataset(text(full_name))
    _dataset.location = "US"
    return client.create_dataset(_dataset)
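
# Illustrative use of create_dataset(); the project and dataset names are made up,
# and an authenticated google.cloud.bigquery client is assumed:
#
#     from google.cloud import bigquery
#     client = bigquery.Client(project="my-project")
#     create_dataset("my-project", "testing", client)
#     # -> creates and returns dataset `my-project.testing`, located in "US"
#     # (the dataset id is passed through escape_name() before it reaches BigQuery)
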
def _gen_select(source_path, source_tops, source_flake, total_path, total_tops, total_flake):
    if total_flake == source_flake and not total_tops:
        return [
            quote_column(source_path + escape_name(k))
            for k in jx.sort(total_flake.keys())
        ]

    if NESTED_TYPE in total_flake:
        # PROMOTE EVERYTHING TO REPEATED
        v = source_flake.get(NESTED_TYPE)
        t = total_flake.get(NESTED_TYPE)

        if not v:
            # CONVERT INNER OBJECT TO ARRAY OF ONE STRUCT
            inner = [
                ConcatSQL(
                    SQL_SELECT_AS_STRUCT,
                    JoinSQL(
                        ConcatSQL(SQL_COMMA, SQL_CR),
                        _gen_select(
                            source_path,
                            Null,
                            source_flake,
                            total_path + REPEATED,
                            Null,
                            t,
                        ),
                    ),
                )
            ]
        else:
            row_name = "row" + text(len(source_path.values))
            ord_name = "ordering" + text(len(source_path.values))
            inner = [
                ConcatSQL(
                    SQL_SELECT_AS_STRUCT,
                    JoinSQL(
                        ConcatSQL(SQL_COMMA, SQL_CR),
                        _gen_select(ApiName(row_name), Null, v, ApiName(row_name), Null, t),
                    ),
                    SQL_FROM,
                    sql_call("UNNEST", quote_column(source_path + REPEATED)),
                    SQL_AS,
                    SQL(row_name),
                    SQL(" WITH OFFSET AS "),
                    SQL(ord_name),
                    SQL_ORDERBY,
                    SQL(ord_name),
                )
            ]

        return [sql_alias(sql_call("ARRAY", *inner), REPEATED)]

    selection = []
    for k, t in jx.sort(total_flake.items(), 0):
        k_total_tops = total_tops if is_text(total_tops) else total_tops[k]
        k_tops = source_tops if is_text(source_tops) else source_tops[k]
        v = source_flake.get(k)
        if is_text(k_total_tops):
            # DO NOT INCLUDE TOP_LEVEL_FIELDS
            pass
        elif t == v and not k_total_tops and not k_tops:
            selection.append(
                ConcatSQL(
                    quote_column(source_path + escape_name(k)),
                    SQL_AS,
                    quote_column(escape_name(k)),
                )
            )
        elif is_data(t):
            if not v:
                selects = _gen_select(
                    source_path + escape_name(k),
                    source_tops,
                    {},
                    total_path + escape_name(k),
                    k_total_tops,
                    t,
                )
            elif is_data(v):
                selects = _gen_select(
                    source_path + escape_name(k),
                    source_tops,
                    v,
                    total_path + escape_name(k),
                    k_total_tops,
                    t,
                )
            else:
                raise Log.error(
                    "Datatype mismatch on {{field}}: Can not merge {{type}} into {{main}}",
                    field=join_field(source_path + escape_name(k)),
                    type=v,
                    main=t,
                )

            if selects:
                inner = [
                    ConcatSQL(
                        SQL_SELECT_AS_STRUCT,
                        JoinSQL(ConcatSQL(SQL_COMMA, SQL_CR), selects),
                    )
                ]
                selection.append(sql_alias(sql_call("", *inner), escape_name(k)))
        elif is_text(t):
            if is_text(k_tops):
                # THE SOURCE HAS THIS PROPERTY AS A TOP_LEVEL_FIELD
                selection.append(
                    ConcatSQL(SQL(k_tops), SQL_AS, quote_column(escape_name(k)))
                )
            elif v == t:
                selection.append(
                    ConcatSQL(
                        quote_column(total_path + escape_name(k)),
                        SQL_AS,
                        quote_column(escape_name(k)),
                    )
                )
            else:
                if v:
                    Log.note(
                        "Datatype mismatch on {{field}}: Can not merge {{type}} into {{main}}",
                        field=join_field(source_path + escape_name(k)),
                        type=v,
                        main=t,
                    )
                selection.append(
                    ConcatSQL(
                        sql_call(
                            "CAST",
                            ConcatSQL(SQL_NULL, SQL_AS, SQL(json_type_to_bq_type[t])),
                        ),
                        SQL_AS,
                        quote_column(escape_name(k)),
                    )
                )
        else:
            Log.error("not expected")

    return selection
def gen_select(total_flake, flake):
    def _gen_select(jx_path, es_path, total_tops, total_flake, source_tops, source_flake):
        if total_flake == source_flake and total_tops == source_tops:
            if not jx_path:
                # TOP LEVEL FIELDS
                return [
                    quote_column(escape_name(k))
                    for k in total_flake.keys()
                    if not is_text(total_tops[k])
                ]
            else:
                Log.error("should not happen")

        if NESTED_TYPE in total_flake:
            k = NESTED_TYPE
            # PROMOTE EVERYTHING TO REPEATED
            v = source_flake.get(k)
            t = total_flake.get(k)

            if not v:
                # CONVERT INNER OBJECT TO ARRAY OF ONE STRUCT
                inner = [
                    ConcatSQL(
                        SQL_SELECT_AS_STRUCT,
                        JoinSQL(
                            ConcatSQL(SQL_COMMA, SQL_CR),
                            _gen_select(jx_path, es_path + REPEATED, Null, t, Null, source_flake),
                        ),
                    )
                ]
            else:
                row_name = "row" + text(len(jx_path))
                ord_name = "ordering" + text(len(jx_path))
                inner = [
                    ConcatSQL(
                        SQL_SELECT_AS_STRUCT,
                        JoinSQL(
                            ConcatSQL(SQL_COMMA, SQL_CR),
                            _gen_select([row_name], ApiName(row_name), Null, t, Null, v),
                        ),
                        SQL_FROM,
                        sql_call("UNNEST", quote_column(es_path + escape_name(k))),
                        SQL_AS,
                        SQL(row_name),
                        SQL(" WITH OFFSET AS "),
                        SQL(ord_name),
                        SQL_ORDERBY,
                        SQL(ord_name),
                    )
                ]

            return [sql_alias(sql_call("ARRAY", *inner), escape_name(k))]

        selection = []
        for k, t in jx.sort(total_flake.items(), 0):
            k_total_tops = total_tops if is_text(total_tops) else total_tops[k]
            k_tops = source_tops if is_text(source_tops) else source_tops[k]
            v = source_flake.get(k)
            if is_text(k_total_tops):
                # DO NOT INCLUDE TOP_LEVEL_FIELDS
                pass
            elif t == v and k_total_tops == k_tops:
                selection.append(
                    ConcatSQL(
                        quote_column(es_path + escape_name(k)),
                        SQL_AS,
                        quote_column(escape_name(k)),
                    )
                )
            elif is_data(t):
                if not v:
                    selects = _gen_select(
                        jx_path + [k],
                        es_path + escape_name(k),
                        k_total_tops,
                        t,
                        source_tops,
                        {},
                    )
                elif is_data(v):
                    selects = _gen_select(
                        jx_path + [k],
                        es_path + escape_name(k),
                        k_total_tops,
                        t,
                        source_tops,
                        v,
                    )
                else:
                    raise Log.error(
                        "Datatype mismatch on {{field}}: Can not merge {{type}} into {{main}}",
                        field=join_field(jx_path + [k]),
                        type=v,
                        main=t,
                    )

                inner = [
                    ConcatSQL(
                        SQL_SELECT_AS_STRUCT,
                        JoinSQL(ConcatSQL(SQL_COMMA, SQL_CR), selects),
                    )
                ]
                selection.append(sql_alias(sql_call("", *inner), escape_name(k)))
            elif is_text(t):
                if is_text(k_tops):
                    # THE SOURCE HAS THIS PROPERTY AS A TOP_LEVEL_FIELD
                    selection.append(
                        ConcatSQL(SQL(k_tops), SQL_AS, quote_column(escape_name(k)))
                    )
                elif v == t:
                    selection.append(
                        ConcatSQL(
                            quote_column(es_path + escape_name(k)),
                            SQL_AS,
                            quote_column(escape_name(k)),
                        )
                    )
                else:
                    if v:
                        Log.note(
                            "Datatype mismatch on {{field}}: Can not merge {{type}} into {{main}}",
                            field=join_field(jx_path + [k]),
                            type=v,
                            main=t,
                        )
                    selection.append(
                        ConcatSQL(
                            sql_call(
                                "CAST",
                                ConcatSQL(SQL_NULL, SQL_AS, SQL(json_type_to_bq_type[t])),
                            ),
                            SQL_AS,
                            quote_column(escape_name(k)),
                        )
                    )
            else:
                Log.error("not expected")

        return selection

    output = _gen_select(
        [],
        ApiName(),
        total_flake.top_level_fields,
        total_flake.schema,
        flake.top_level_fields,
        flake.schema,
    )

    tops = []
    for path, name in total_flake.top_level_fields.leaves():
        source = flake.top_level_fields[path]
        if source:
            # ALREADY TOP LEVEL FIELD
            source = SQL(source)
        else:
            # PULL OUT TOP LEVEL FIELD
            column = first(flake.leaves(path))
            source = SQL(column.es_column)
        tops.append(ConcatSQL(source, SQL_AS, quote_column(ApiName(name))))

    return tops + output
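
# Rough shape of what gen_select() returns (a sketch, not verified output): a list
# of SQL fragments, one per column of total_flake, preceded by the top-level
# fields.  Columns the shard already has pass through as `<col>` AS `<col>`;
# columns the shard lacks become typed placeholders like
# CAST(NULL AS <bq type>) AS `<col>`; nested objects are rebuilt with
# SELECT AS STRUCT, and inner objects under a repeated field are promoted to
# one-element arrays via ARRAY(SELECT AS STRUCT ...).  merge_shards() joins these
# fragments with commas to build each INSERT ... SELECT.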