def find_dataset(dataset, client):
    esc_name = escape_name(dataset)
    datasets = list(client.list_datasets())
    for _dataset in datasets:
        if ApiName(_dataset.dataset_id) == esc_name:
            return _dataset.reference
def __init__(self, dataset, account_info, kwargs):
    self.client = connect(account_info)
    self.short_name = dataset
    esc_name = escape_name(dataset)
    self.full_name = ApiName(account_info.project_id) + esc_name
    self.dataset = find_dataset(dataset, self.client)
    if not self.dataset:
        self.dataset = create_dataset(account_info.project_id, dataset, self.client)
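# USAGE SKETCH (hypothetical; `Dataset` is assumed to be the class that owns this
# constructor, and `account_info` is assumed to carry a project_id plus
# service-account credentials, since those are the only fields read here):
#
#     >>> container = Dataset("my_dataset", account_info, kwargs=Null)
#     >>> container.dataset   # found by find_dataset(), or newly created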
# NOTE: THIS HELPER READS self AND APPENDS TO top_fields, SO IT IS A NESTED
# FUNCTION THAT CAPTURES BOTH FROM THE ENCLOSING METHOD'S SCOPE
def _schema_to_bq_schema(jx_path, es_path, schema):
    output = []
    nt = schema.get(NESTED_TYPE)
    if nt:
        schema = {NESTED_TYPE: nt}
    for t, sub_schema in jx.sort(schema.items(), 0):
        bqt = typed_to_bq_type.get(t, {"field_type": "RECORD", "mode": "NULLABLE"})
        full_name = es_path + escape_name(t)
        top_field = self._top_level_fields.get(text(full_name))
        if is_text(sub_schema):
            new_field_type = json_type_to_bq_type.get(sub_schema, sub_schema)
            if new_field_type != bqt["field_type"]:
                # OVERRIDE TYPE
                bqt = bqt.copy()
                bqt["field_type"] = new_field_type
            fields = ()
        else:
            fields = _schema_to_bq_schema(jx_path + (t,), full_name, sub_schema)

        if top_field:
            if fields:
                Log.error("not expecting a structure")
            if self._partition.field == top_field:
                if bqt["field_type"] != "TIMESTAMP":
                    Log.error("Partition field must be of time type")
            struct = SchemaField(name=top_field, fields=fields, **bqt)
            top_fields.append(struct)
        elif not fields and bqt["field_type"] == "RECORD":
            # THIS CAN HAPPEN WHEN WE MOVE A PRIMITIVE FIELD TO top_fields
            pass
        else:
            struct = SchemaField(name=text(escape_name(t)), fields=fields, **bqt)
            output.append(struct)
    return output
def __init__(self, dataset, account_info, kwargs):
    creds = service_account.Credentials.from_service_account_info(info=account_info)
    self.client = bigquery.Client(project=account_info.project_id, credentials=creds)
    self.short_name = dataset
    esc_name = escape_name(dataset)
    self.full_name = ApiName(account_info.project_id) + esc_name
    datasets = list(self.client.list_datasets())
    for _dataset in datasets:
        if ApiName(_dataset.dataset_id) == esc_name:
            self.dataset = _dataset.reference
            break
    else:
        _dataset = bigquery.Dataset(text(self.full_name))
        _dataset.location = "US"
        self.dataset = self.client.create_dataset(_dataset)
def delete_table(self, name):
    api_name = escape_name(name)
    tables = list(self.client.list_tables(self.dataset))
    for table_item in tables:
        table = table_item.reference
        table_api_name = ApiName(table.table_id)
        if text(table_api_name).startswith(text(api_name)):
            if table_api_name == api_name:
                if table_item.table_type != "VIEW":
                    Log.error("expecting {{table}} to be a view", table=api_name)
                self.client.delete_table(table)
            elif SUFFIX_PATTERN.match(text(table_api_name)[len(text(api_name)):]):
                try:
                    self.client.delete_table(table)
                except Exception as e:
                    Log.warning("could not delete table {{table}}", table=table, cause=e)
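# ILLUSTRATION of the prefix/suffix test above (assumes SUFFIX_PATTERN, defined
# elsewhere in this module, matches the random suffix that sharding appends):
#
#     >>> api_name = escape_name("testing")
#     >>> shard_name = ApiName("testing_" + "".join(Random.sample(ALLOWED, 20)))
#     >>> bool(SUFFIX_PATTERN.match(text(shard_name)[len(text(api_name)):]))
#     True
#
# so delete_table() removes the view plus every shard it spawned, and nothing else.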
def to_bq(self, schema, not_null=False, boolean=False, many=True):
    var_name = self.var
    if var_name == GUID:
        return BQLScript(
            data_type=STRING,
            expr=quote_column(escape_name(GUID)),
            frum=self,
            miss=FALSE,
            many=False,
            schema=schema,
        )
    cols = schema.leaves(var_name)
    if not cols:
        # DOES NOT EXIST
        return BQLScript(
            data_type=OBJECT,
            expr=SQL_NULL,
            frum=self,
            miss=TRUE,
            many=False,
            schema=schema,
        )
    elif len(cols) == 1:
        col = first(cols)
        return BQLScript(
            data_type=col.jx_type,
            expr=quote_column(ApiName(*split_field(col.es_column))),
            frum=self,
            miss=MissingOp(self),
            many=False,
            schema=schema,
        )
    else:
        coalesce = []
        for col in cols:
            rel_path = untype_path(relative_field(col.name, var_name))
            if rel_path == '.':
                coalesce.append(Variable(col.name))
            else:
                Log.error("structure not supported")
        return CoalesceOp(coalesce).to_bq(schema)
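# ILLUSTRATION (hedged; the exact quoting depends on escape_name): a variable that
# resolves to exactly one stored column compiles to a quoted column reference,
# while an unknown variable compiles to SQL NULL with miss=TRUE:
#
#     >>> Variable("a.b").to_bq(schema).expr       # quoted, typed `a`.`b` path
#     >>> Variable("no.such.path").to_bq(schema).miss   # TRUE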
def columns(self):
    if not self._columns:
        now = Date.now()
        columns = []

        def parse_schema(schema, tops, es_type_info, jx_path, nested_path, es_path):
            if is_text(schema):
                json_type = schema
                expected_es_type = json_type_to_bq_type[json_type]
                if es_type_info and es_type_info != expected_es_type:
                    Log.error(
                        "expecting {{path}} to be of type {{expected_type}} not of type {{observed_type}}",
                        path=jx_path,
                        expected_type=expected_es_type,
                        observed_type=es_type_info,
                    )
                c = jx_base.Column(
                    name=join_field(jx_path),
                    es_column=coalesce(tops, text(es_path)),
                    es_index=self.es_index,
                    es_type=coalesce(es_type_info, expected_es_type),
                    jx_type=json_type,
                    nested_path=nested_path,
                    last_updated=now,
                )
                columns.append(c)
            else:
                c = jx_base.Column(
                    name=join_field(jx_path),
                    es_column=text(es_path),
                    es_index=self.es_index,
                    es_type="RECORD",
                    jx_type=OBJECT,
                    cardinality=1,
                    nested_path=nested_path,
                    last_updated=now,
                )
                columns.append(c)
                count = len(columns)
                for k, s in schema.items():
                    if k == NESTED_TYPE:
                        c.jx_type = NESTED
                        parse_schema(
                            s,
                            tops if is_text(tops) else tops[k],
                            es_type_info if is_text(es_type_info) else es_type_info[k],
                            jx_path + (k,),
                            (jx_path,) + nested_path,
                            es_path + escape_name(k),
                        )
                    else:
                        parse_schema(
                            s,
                            tops if is_text(tops) else tops[k],
                            es_type_info if is_text(es_type_info) else es_type_info[k],
                            jx_path + (k,),
                            nested_path,
                            es_path + escape_name(k),
                        )
                if is_text(tops) and len(columns) > count + 1:
                    Log.error(
                        "too many top level fields at {{field}}:",
                        field=join_field(jx_path),
                    )

        parse_schema(
            self.schema,
            self.top_level_fields,
            self._es_type_info,
            (),
            (".",),
            ApiName(),
        )
        self._columns = columns

        self._top_level_fields = OrderedDict()  # FORCE ORDERING
        for path, field in jx.sort(wrap(self.top_level_fields).leaves(), 0):
            leaves = self.leaves(path)
            if not leaves:
                continue
            if len(leaves) > 1:
                Log.error("expecting {{path}} to have just one primitive value", path=path)
            specific_path = first(leaves).name
            self._top_level_fields[
                ".".join(text(escape_name(step)) for step in split_field(specific_path))
            ] = field
        self._partition = Partition(kwargs=self.partition, flake=self)
    return self._columns
def create_dataset(project_id, dataset, client):
    full_name = ApiName(project_id) + escape_name(dataset)
    _dataset = bigquery.Dataset(text(full_name))
    _dataset.location = "US"
    return client.create_dataset(_dataset)
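# USAGE SKETCH: find_dataset() and create_dataset() combine into the find-or-create
# idiom used by the constructor above (hypothetical project and dataset names):
#
#     >>> ref = (
#     ...     find_dataset("my_dataset", client)
#     ...     or create_dataset("my-project", "my_dataset", client)
#     ... )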
def merge_shards(self):
    shards = []
    tables = list(self.container.client.list_tables(self.container.dataset))
    current_view = Null  # VIEW THAT POINTS TO PRIMARY SHARD
    primary_shard_name = None  # PRIMARY SHARD
    api_name = escape_name(self.short_name)
    for table_item in tables:
        table = table_item.reference
        table_api_name = ApiName(table.table_id)
        if text(table_api_name).startswith(text(api_name)):
            if table_api_name == api_name:
                if table_item.table_type != "VIEW":
                    Log.error("expecting {{table}} to be a view", table=api_name)
                current_view = self.container.client.get_table(table)
                view_sql = current_view.view_query
                primary_shard_name = _extract_primary_shard_name(view_sql)
            elif SUFFIX_PATTERN.match(text(table_api_name)[len(text(api_name)):]):
                try:
                    known_table = self.container.client.get_table(table)
                    shards.append(known_table)
                except Exception as e:
                    Log.warning("could not merge table {{table}}", table=table, cause=e)

    if not current_view:
        Log.error("expecting {{table}} to be a view pointing to a table", table=api_name)

    shard_flakes = [
        Snowflake.parse(
            big_query_schema=shard.schema,
            es_index=text(self.container.full_name + ApiName(shard.table_id)),
            top_level_fields=self.top_level_fields,
            partition=self.partition,
        )
        for shard in shards
    ]
    total_flake = snowflakes.merge(
        shard_flakes,
        es_index=text(self.full_name),
        top_level_fields=self.top_level_fields,
        partition=self.partition,
    )

    for i, s in enumerate(shards):
        if ApiName(s.table_id) == primary_shard_name:
            if total_flake == shard_flakes[i]:
                # USE THE CURRENT PRIMARY SHARD AS A DESTINATION
                del shards[i]
                del shard_flakes[i]
                break
    else:
        name = self.short_name + "_" + "".join(Random.sample(ALLOWED, 20))
        primary_shard_name = escape_name(name)
        self.container.create_table(
            table=name,
            schema=total_flake.schema,
            sharded=False,
            read_only=False,
            kwargs=self.config,
        )

    primary_full_name = self.container.full_name + primary_shard_name

    selects = []
    for flake, table in zip(shard_flakes, shards):
        q = ConcatSQL(
            SQL_SELECT,
            JoinSQL(ConcatSQL(SQL_COMMA, SQL_CR), gen_select(total_flake, flake)),
            SQL_FROM,
            quote_column(ApiName(table.dataset_id, table.table_id)),
        )
        selects.append(q)

    Log.note("inserting into table {{table}}", table=text(primary_shard_name))
    matched = []
    unmatched = []
    for sel, shard, flake in zip(selects, shards, shard_flakes):
        if flake == total_flake:
            matched.append((sel, shard, flake))
        else:
            unmatched.append((sel, shard, flake))

    # EVERYTHING THAT IS IDENTICAL TO PRIMARY CAN BE MERGED WITH SIMPLE UNION ALL
    if matched:
        for g, merge_chunk in jx.chunk(matched, MAX_MERGE):
            command = ConcatSQL(
                SQL_INSERT,
                quote_column(primary_full_name),
                JoinSQL(
                    SQL_UNION_ALL,
                    (
                        sql_query({"from": self.container.full_name + ApiName(shard.table_id)})
                        for _, shard, _ in merge_chunk
                    ),
                ),
            )
            DEBUG and Log.note("{{sql}}", sql=text(command))
            job = self.container.query_and_wait(command)
            Log.note("job {{id}} state = {{state}}", id=job.job_id, state=job.state)
            if job.errors:
                Log.error(
                    "\n{{sql}}\nDid not fill table:\n{{reason|json|indent}}",
                    sql=command.sql,
                    reason=job.errors,
                )
            for _, shard, _ in merge_chunk:
                self.container.client.delete_table(shard)

    # ALL OTHER SCHEMAS MISMATCH
    for s, shard, _ in unmatched:
        try:
            command = ConcatSQL(SQL_INSERT, quote_column(primary_full_name), s)
            DEBUG and Log.note("{{sql}}", sql=text(command))
            job = self.container.query_and_wait(command)
            Log.note(
                "from {{shard}}, job {{id}}, state {{state}}",
                id=job.job_id,
                shard=shard.table_id,
                state=job.state,
            )
            if job.errors:
                if all(" does not have a schema." in m for m in wrap(job.errors).message):
                    pass  # NOTHING TO DO
                else:
                    Log.error(
                        "\n{{sql}}\nDid not fill table:\n{{reason|json|indent}}",
                        sql=command.sql,
                        reason=job.errors,
                    )
            self.container.client.delete_table(shard)
        except Exception as e:
            Log.warning("failure to merge {{shard}}", shard=shard, cause=e)

    # REMOVE OLD VIEW
    view_full_name = self.container.full_name + api_name
    if current_view:
        self.container.client.delete_table(current_view)

    # CREATE NEW VIEW
    self.container.create_view(view_full_name, primary_full_name)
def __init__(
    self,
    table,
    typed,
    read_only,
    sharded,
    container,
    id=Null,
    partition=Null,
    cluster=Null,
    top_level_fields=Null,
    kwargs=None,
):
    self.short_name = table
    self.typed = typed
    self.read_only = read_only
    self.cluster = cluster
    self.id = id
    self.top_level_fields = top_level_fields
    self.config = Data(  # USED TO REPLICATE THIS
        typed=typed,
        read_only=read_only,
        sharded=sharded,
        id=id,
        partition=partition,
        cluster=cluster,
        top_level_fields=top_level_fields,
    )
    esc_name = escape_name(table)
    self.full_name = container.full_name + esc_name
    self.alias_view = alias_view = container.client.get_table(text(self.full_name))
    self.partition = partition
    self.container = container

    if not sharded:
        if not read_only and alias_view.table_type == "VIEW":
            Log.error("Expecting a table, not a view")
        self.shard = alias_view
        self._flake = Snowflake.parse(
            alias_view.schema,
            text(self.full_name),
            self.top_level_fields,
            partition,
        )
    else:
        if alias_view.table_type != "VIEW":
            Log.error("Sharded tables require a view")
        current_view = container.client.get_table(text(self.full_name))
        view_sql = current_view.view_query
        shard_name = _extract_primary_shard_name(view_sql)
        try:
            self.shard = container.client.get_table(text(container.full_name + shard_name))
            self._flake = Snowflake.parse(
                alias_view.schema,
                text(self.full_name),
                self.top_level_fields,
                partition,
            )
        except Exception as e:
            Log.warning("view {{name}} is invalid", name=shard_name, cause=e)
            self._flake = Snowflake.parse(
                alias_view.schema,
                text(self.full_name),
                self.top_level_fields,
                partition,
            )
            # REMOVE STALE VIEW
            container.client.delete_table(current_view)
            # MAKE NEW VIEW POINTING TO NEW SHARD
            self._create_new_shard()
            container.create_view(
                self.full_name,
                self.container.full_name + ApiName(self.shard.table_id),
            )
    self.last_extend = Date.now() - EXTEND_LIMIT
def create_table(
    self,
    table,
    schema=None,
    typed=True,
    read_only=True,  # TO PREVENT ACCIDENTAL WRITING
    sharded=False,
    partition=Null,  # PARTITION RULES
    cluster=None,  # TUPLE OF FIELDS TO SORT DATA
    top_level_fields=Null,
    kwargs=None,
):
    if kwargs.lookup != None or kwargs.flake != None:
        Log.error("expecting schema, not lookup")
    full_name = self.full_name + escape_name(table)
    if not schema:
        # WE MUST HAVE SOMETHING
        if typed:
            schema = copy(DEFAULT_TYPED_SCHEMA)
        else:
            schema = copy(DEFAULT_SCHEMA)
    flake = Snowflake(text(full_name), top_level_fields, partition, schema=schema)
    if read_only:
        Log.error("Can not create a table for read-only use")

    if sharded:
        shard_name = escape_name(table + "_" + "".join(Random.sample(ALLOWED, 20)))
        shard_api_name = self.full_name + shard_name
        _shard = bigquery.Table(text(shard_api_name), schema=flake.to_bq_schema())
        _shard.time_partitioning = unwrap(flake._partition.bq_time_partitioning)
        _shard.clustering_fields = [
            c.es_column
            for f in listwrap(cluster)
            for c in [first(flake.leaves(f))]
            if c
        ] or None
        self.shard = self.client.create_table(_shard)
        self.create_view(full_name, shard_api_name)
    else:
        _table = bigquery.Table(text(full_name), schema=flake.to_bq_schema())
        _table.time_partitioning = unwrap(flake._partition.bq_time_partitioning)
        _table.clustering_fields = [
            l.es_column for f in listwrap(cluster) for l in flake.leaves(f)
        ] or None
        self.client.create_table(_table)
        Log.note("created table {{table}}", table=_table.table_id)

    return Table(
        table=table,
        typed=typed,
        read_only=read_only,
        sharded=sharded,
        partition=partition,
        top_level_fields=top_level_fields,
        kwargs=kwargs,
        container=self,
    )
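# USAGE SKETCH (hypothetical values; this codebase appears to use an @override/kwargs
# convention, and the partition shape shown is an assumption, not a documented
# contract): create a writable, sharded, typed table partitioned on a timestamp field:
#
#     >>> table = container.create_table(
#     ...     table="my_table",
#     ...     read_only=False,
#     ...     sharded=True,
#     ...     partition={"field": "etl.timestamp", "expire": "2year"},
#     ... )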
def _typed_encode(value, schema):
    if is_many(value):
        output = []
        update = {}
        nest_added = False
        child_schema = schema.get(NESTED_TYPE)
        if not child_schema:
            child_schema = schema[NESTED_TYPE] = {}
        for r in value:
            v, m, n = _typed_encode(r, child_schema)
            output.append(v)
            if m:  # m IS None WHEN NO SCHEMA WAS ADDED
                update.update(m)
            nest_added |= n
        if update:
            return {text(REPEATED): output}, {NESTED_TYPE: update}, True
        else:
            return {text(REPEATED): output}, None, nest_added
    elif NESTED_TYPE in schema:
        if not value:
            return {text(REPEATED): []}, None, False
        else:
            return _typed_encode([value], schema)
    elif is_data(value):
        output = {}
        update = {}
        nest_added = False
        for k, v in value.items():
            child_schema = schema.get(k)
            if not child_schema:
                child_schema = schema[k] = {}
            result, more_update, n = _typed_encode(v, child_schema)
            output[text(escape_name(k))] = result
            if more_update:
                update.update({k: more_update})
            nest_added |= n
        return output, update, nest_added
    elif is_text(schema):
        v, inserter_type, json_type = schema_type(value)
        if schema != json_type:
            Log.error(
                "Can not convert {{existing_type}} to {{expected_type}}",
                existing_type=json_type,
                expected_type=schema,
            )
        return v, None, False
    elif value is None:
        return {text(escape_name(t)): None for t, child_schema in schema.items()}, None, False
    else:
        v, inserter_type, json_type = schema_type(value)
        child_schema = schema.get(inserter_type)
        update = None
        if not child_schema:
            if schema.get(TIME_TYPE):
                # ATTEMPT TO CONVERT TO TIME, IF EXPECTING TIME
                try:
                    v = parse(v).format(TIMESTAMP_FORMAT)
                    return {text(escape_name(TIME_TYPE)): v}, update, False
                except Exception as e:
                    Log.warning(
                        "Failed attempt to convert {{value}} to TIMESTAMP string",
                        value=v,
                        cause=e,
                    )
            schema[inserter_type] = json_type
            update = {inserter_type: json_type}
        return {text(escape_name(inserter_type)): v}, update, False
typed_to_bq_type = {
    BOOLEAN_TYPE: {"field_type": "BOOLEAN", "mode": "NULLABLE"},
    NUMBER_TYPE: {"field_type": "FLOAT64", "mode": "NULLABLE"},
    INTEGER_TYPE: {"field_type": "INT64", "mode": "NULLABLE"},
    TIME_TYPE: {"field_type": "TIMESTAMP", "mode": "NULLABLE"},
    STRING_TYPE: {"field_type": "STRING", "mode": "NULLABLE"},
    NESTED_TYPE: {"field_type": "RECORD", "mode": "REPEATED"},
}

REPEATED = escape_name(NESTED_TYPE)
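# ILLUSTRATION: _schema_to_bq_schema() consults this mapping to build SchemaField
# objects; a typed NUMBER_TYPE leaf becomes, roughly,
#
#     SchemaField(name=text(escape_name(NUMBER_TYPE)), fields=(),
#                 field_type="FLOAT64", mode="NULLABLE")
#
# and any type not listed here falls back to {"field_type": "RECORD", "mode": "NULLABLE"}.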
def _gen_select(source_path, source_tops, source_flake, total_path, total_tops, total_flake):
    if total_flake == source_flake and not total_tops:
        return [
            quote_column(source_path + escape_name(k))
            for k in jx.sort(total_flake.keys())
        ]

    if NESTED_TYPE in total_flake:
        # PROMOTE EVERYTHING TO REPEATED
        v = source_flake.get(NESTED_TYPE)
        t = total_flake.get(NESTED_TYPE)

        if not v:
            # CONVERT INNER OBJECT TO ARRAY OF ONE STRUCT
            inner = [
                ConcatSQL(
                    SQL_SELECT_AS_STRUCT,
                    JoinSQL(
                        ConcatSQL(SQL_COMMA, SQL_CR),
                        _gen_select(
                            source_path,
                            Null,
                            source_flake,
                            total_path + REPEATED,
                            Null,
                            t,
                        ),
                    ),
                )
            ]
        else:
            row_name = "row" + text(len(source_path.values))
            ord_name = "ordering" + text(len(source_path.values))
            inner = [
                ConcatSQL(
                    SQL_SELECT_AS_STRUCT,
                    JoinSQL(
                        ConcatSQL(SQL_COMMA, SQL_CR),
                        _gen_select(ApiName(row_name), Null, v, ApiName(row_name), Null, t),
                    ),
                    SQL_FROM,
                    sql_call("UNNEST", quote_column(source_path + REPEATED)),
                    SQL_AS,
                    SQL(row_name),
                    SQL(" WITH OFFSET AS "),
                    SQL(ord_name),
                    SQL_ORDERBY,
                    SQL(ord_name),
                )
            ]
        return [sql_alias(sql_call("ARRAY", *inner), REPEATED)]

    selection = []
    for k, t in jx.sort(total_flake.items(), 0):
        k_total_tops = total_tops if is_text(total_tops) else total_tops[k]
        k_tops = source_tops if is_text(source_tops) else source_tops[k]
        v = source_flake.get(k)
        if is_text(k_total_tops):
            # DO NOT INCLUDE TOP_LEVEL_FIELDS
            pass
        elif t == v and not k_total_tops and not k_tops:
            selection.append(
                ConcatSQL(
                    quote_column(source_path + escape_name(k)),
                    SQL_AS,
                    quote_column(escape_name(k)),
                )
            )
        elif is_data(t):
            if not v:
                selects = _gen_select(
                    source_path + escape_name(k),
                    source_tops,
                    {},
                    total_path + escape_name(k),
                    k_total_tops,
                    t,
                )
            elif is_data(v):
                selects = _gen_select(
                    source_path + escape_name(k),
                    source_tops,
                    v,
                    total_path + escape_name(k),
                    k_total_tops,
                    t,
                )
            else:
                raise Log.error(
                    "Datatype mismatch on {{field}}: Can not merge {{type}} into {{main}}",
                    field=join_field(source_path + escape_name(k)),
                    type=v,
                    main=t,
                )
            if selects:
                inner = [
                    ConcatSQL(
                        SQL_SELECT_AS_STRUCT,
                        JoinSQL(ConcatSQL(SQL_COMMA, SQL_CR), selects),
                    )
                ]
                selection.append(sql_alias(sql_call("", *inner), escape_name(k)))
        elif is_text(t):
            if is_text(k_tops):
                # THE SOURCE HAS THIS PROPERTY AS A TOP_LEVEL_FIELD
                selection.append(
                    ConcatSQL(SQL(k_tops), SQL_AS, quote_column(escape_name(k)))
                )
            elif v == t:
                selection.append(
                    ConcatSQL(
                        quote_column(total_path + escape_name(k)),
                        SQL_AS,
                        quote_column(escape_name(k)),
                    )
                )
            else:
                if v:
                    Log.note(
                        "Datatype mismatch on {{field}}: Can not merge {{type}} into {{main}}",
                        field=join_field(source_path + escape_name(k)),
                        type=v,
                        main=t,
                    )
                selection.append(
                    ConcatSQL(
                        sql_call(
                            "CAST",
                            ConcatSQL(SQL_NULL, SQL_AS, SQL(json_type_to_bq_type[t])),
                        ),
                        SQL_AS,
                        quote_column(escape_name(k)),
                    )
                )
        else:
            Log.error("not expected")
    return selection
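# ILLUSTRATION (hypothetical names): when the merged schema marks a field REPEATED
# but a source shard stores a single inner object, the first branch above emits SQL
# of the shape
#
#     ARRAY(SELECT AS STRUCT `a`.`x` AS `x`) AS <REPEATED name>
#
# promoting the object to a one-element array of struct so that all shards can be
# UNION ALLed under one schema.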
def _typed_encode(value, schema):
    """
    RETURN TRIPLE
    output - THE ENCODED VALUE
    update - THE ADDITIONAL SCHEMA OVER schema PROVIDED
    nested - True IF NESTING IS REQUIRED (CONSIDERED SERIOUS SCHEMA CHANGE)
    """
    if is_many(value):
        if len(value) == 0:
            return None, None, False
        output = []
        update = {}
        nest_added = False
        child_schema = schema.get(NESTED_TYPE)
        if not child_schema:
            nest_added = True
            child_schema = schema[NESTED_TYPE] = {}
        for r in value:
            v, m, n = _typed_encode(r, child_schema)
            output.append(v)
            set_default(update, m)
            nest_added |= n
        if update:
            return {text(REPEATED): output}, {NESTED_TYPE: update}, nest_added
        else:
            return {text(REPEATED): output}, None, nest_added
    elif NESTED_TYPE in schema:
        if not value:
            return {text(REPEATED): []}, None, False
        else:
            return _typed_encode([value], schema)
    elif is_data(value):
        output = {}
        update = {}
        nest_added = False
        for k, v in value.items():
            child_schema = schema.get(k)
            if not child_schema:
                child_schema = schema[k] = {}
            result, more_update, n = _typed_encode(v, child_schema)
            if result != None:
                output[text(escape_name(k))] = result
            set_default(update, {k: more_update})
            nest_added |= n
        return output, update or None, nest_added
    elif is_text(schema):
        v, inserter_type, json_type = schema_type(value)
        if schema != json_type:
            Log.error(
                "Can not convert {{existing_type}} to {{expected_type}}",
                existing_type=json_type,
                expected_type=schema,
            )
        return v, None, False
    elif value == None:
        return {
            text(escape_name(t)): None for t, child_schema in schema.items()
        } or None, None, False
    else:
        try:
            v, inserter_type, json_type = schema_type(value)
        except Exception as e:
            # LAST DESPERATE ATTEMPT
            return _typed_encode(value.__data__(), schema)
        child_schema = schema.get(inserter_type)
        update = None
        if not child_schema:
            if schema.get(TIME_TYPE):
                # ATTEMPT TO CONVERT TO TIME, IF EXPECTING TIME
                try:
                    v = parse(v).format(TIMESTAMP_FORMAT)
                    return {text(escape_name(TIME_TYPE)): v}, update, False
                except Exception as e:
                    Log.warning(
                        "Failed attempt to convert {{value}} to TIMESTAMP string",
                        value=v,
                        cause=e,
                    )
            schema[inserter_type] = json_type
            update = {inserter_type: json_type}
        return {text(escape_name(inserter_type)): v}, update, False
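# ILLUSTRATION (hedged; the concrete column names depend on escape_name and the
# *_TYPE constants): encoding {"a": 1, "b": "x"} against an empty schema returns
# roughly
#
#     output = {<a>: {<number>: 1}, <b>: {<string>: "x"}}   # typed, escaped columns
#     update = {"a": {NUMBER_TYPE: "number"}, "b": {STRING_TYPE: "string"}}
#     nested = False
#
# so callers can merge `update` into the stored schema, and treat nested=True as
# the signal that a (serious) REPEATED level was added.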
def delete_table(self, name):
    full_name = self.full_name + escape_name(name)
    self.client.delete_table(full_name)