def metadata_validate():
    # find the dataset and tables of the PR
    dataset_table_ids = get_table_dataset_id()

    # print dataset tables info
    tprint("TABLES FOUND")
    pprint(dataset_table_ids)
    tprint()

    # iterate over each table in dataset of the PR
    for table_id in dataset_table_ids.keys():
        dataset_id = dataset_table_ids[table_id]["dataset_id"]
        source_bucket_name = dataset_table_ids[table_id]["source_bucket_name"]
        try:
            # validate the table metadata
            md = Metadata(dataset_id=dataset_id, table_id=table_id)
            md.validate()
            tprint(f"SUCCESS VALIDATE {dataset_id}.{table_id}")
            tprint()
        except Exception:
            tprint(f"ERROR ON {dataset_id}.{table_id}")
            traceback.print_exc()
            tprint()
def cli_validate_metadata(ctx, dataset_id, table_id):
    m = Metadata(dataset_id, table_id, **ctx.obj)

    try:
        m.validate()
        msg, color = "Local metadata is valid.", "green"
    except BaseDosDadosException as e:
        msg = (
            f"Local metadata is invalid. Please check the traceback below for"
            f" more information on how to fix it:\n\n{repr(e)}"
        )
        color = "red"

    click.echo(click.style(msg, fg=color))
def cli_is_updated_metadata(ctx, dataset_id, table_id):
    m = Metadata(dataset_id, table_id, **ctx.obj)

    if m.is_updated():
        msg, color = "Local metadata is updated.", "green"
    else:
        msg = (
            "Local metadata is out of date. Please run `basedosdados metadata"
            " create` with the flag `if_exists=replace` to get the updated"
            " data."
        )
        color = "red"

    click.echo(click.style(msg, fg=color))
def cli_publish_metadata(
    ctx,
    dataset_id,
    table_id,
    all,
    if_exists,
    update_locally,
):
    m = Metadata(dataset_id, table_id, **ctx.obj)

    try:
        m.publish(all=all, if_exists=if_exists, update_locally=update_locally)
        msg, color = "Local metadata has been published.", "green"
    except (CKANAPIError, BaseDosDadosException, AssertionError) as e:
        msg = (
            f"Local metadata couldn't be published due to an error. Please"
            f" check the traceback below for more information on how to"
            f" fix it:\n\n{repr(e)}"
        )
        color = "red"

    click.echo(click.style(msg, fg=color))
def cli_create_metadata(
    ctx,
    dataset_id,
    table_id,
    if_exists,
    columns,
    partition_columns,
    force_columns,
    table_only,
):
    m = Metadata(dataset_id, table_id, **ctx.obj).create(
        if_exists=if_exists,
        columns=columns,
        partition_columns=partition_columns,
        force_columns=force_columns,
        table_only=table_only,
    )

    click.echo(
        click.style(
            f"Metadata file was created at `{m.filepath}`",
            fg="green",
        )
    )
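# A minimal sketch of driving the Metadata class directly instead of through
# the CLI wrappers above. The method names and keyword arguments mirror the
# calls made by the CLI functions; the dataset/table identifiers are
# placeholders, and the remaining create() arguments are left to their
# defaults here.
def example_metadata_workflow():
    md = Metadata("example_dataset", "example_table")  # hypothetical ids
    md.create(if_exists="replace")  # write/overwrite the local config files
    md.validate()                   # raises BaseDosDadosException if invalid
    if not md.is_updated():
        md.publish(if_exists="replace", update_locally=True)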
class Dataset(Base):
    """
    Manage datasets in BigQuery.
    """

    def __init__(self, dataset_id, **kwargs):
        super().__init__(**kwargs)

        self.dataset_id = dataset_id.replace("-", "_")
        self.dataset_folder = Path(self.metadata_path / self.dataset_id)
        self.metadata = Metadata(self.dataset_id, **kwargs)

    @property
    def dataset_config(self):
        return self._load_yaml(
            self.metadata_path / self.dataset_id / "dataset_config.yaml"
        )

    def _loop_modes(self, mode="all"):
        mode = ["prod", "staging"] if mode == "all" else [mode]
        dataset_tag = lambda m: f"_{m}" if m == "staging" else ""

        return (
            {
                "client": self.client[f"bigquery_{m}"],
                "id": f"{self.client[f'bigquery_{m}'].project}.{self.dataset_id}{dataset_tag(m)}",
            }
            for m in mode
        )

    def _setup_dataset_object(self, dataset_id):
        dataset = bigquery.Dataset(dataset_id)
        ## TODO: not being used since 1.6.0 - need to redo the description that goes to BigQuery
        # dataset.description = self._render_template(
        #     Path("dataset/dataset_description.txt"), self.dataset_config
        # )
        return dataset

    def _write_readme_file(self):
        readme_content = (
            f"Como capturar os dados de {self.dataset_id}?\n\n"
            f"Para capturar esses dados, basta verificar o link dos dados originais "
            f"indicado em dataset_config.yaml no item website.\n\n"
            f"Caso tenha sido utilizado algum código de captura ou tratamento, "
            f"estes estarão contidos em code/. Se o dado publicado for em sua versão "
            f"bruta, não existirá a pasta code/.\n\n"
            f"Os dados publicados estão disponíveis em: "
            f"https://basedosdados.org/dataset/{self.dataset_id.replace('_', '-')}"
        )

        readme_path = Path(self.metadata_path / self.dataset_id / "README.md")
        with open(readme_path, "w") as readmefile:
            readmefile.write(readme_content)

    def init(self, replace=False):
        """Initialize dataset folder at metadata_path at `metadata_path/<dataset_id>`.

        The folder should contain:

        * `dataset_config.yaml`
        * `README.md`

        Args:
            replace (bool): Optional. Whether to replace existing folder.

        Raises:
            FileExistsError: If dataset folder already exists and replace is False
        """
        # Create dataset folder
        try:
            self.dataset_folder.mkdir(exist_ok=replace, parents=True)
        except FileExistsError:
            raise FileExistsError(
                f"Dataset folder {str(self.dataset_folder.stem)} already exists. "
                "Set replace=True to replace current files."
            )

        # create dataset_config.yaml with metadata
        self.metadata.create(if_exists="replace")

        # create README.md file
        self._write_readme_file()

        # Add code folder
        (self.dataset_folder / "code").mkdir(exist_ok=replace, parents=True)

        return self

    def publicize(self, mode="all"):
        """Changes IAM configuration to make the BigQuery dataset public.

        Args:
            mode (str): Which dataset to publicize [prod|staging|all].
        """
        for m in self._loop_modes(mode):
            dataset = m["client"].get_dataset(m["id"])
            entries = dataset.access_entries

            entries.extend(
                [
                    bigquery.AccessEntry(
                        role="roles/bigquery.dataViewer",
                        entity_type="iamMember",
                        entity_id="allUsers",
                    ),
                    bigquery.AccessEntry(
                        role="roles/bigquery.metadataViewer",
                        entity_type="iamMember",
                        entity_id="allUsers",
                    ),
                    bigquery.AccessEntry(
                        role="roles/bigquery.user",
                        entity_type="iamMember",
                        entity_id="allUsers",
                    ),
                ]
            )
            dataset.access_entries = entries
            m["client"].update_dataset(dataset, ["access_entries"])

    def create(self, mode="all", if_exists="raise"):
        """Creates BigQuery datasets given `dataset_id`.

        It can create two datasets:

        * `<dataset_id>` (mode = 'prod')
        * `<dataset_id>_staging` (mode = 'staging')

        If `mode` is all, it creates both.

        Args:
            mode (str): Optional. Which dataset to create [prod|staging|all].
            if_exists (str): Optional. What to do if dataset exists

                * raise : Raises Conflict exception
                * replace : Drop all tables and replace dataset
                * update : Update dataset description
                * pass : Do nothing

        Raises:
            Warning: Dataset already exists and if_exists is set to `raise`
        """
        if if_exists == "replace":
            self.delete(mode)
        elif if_exists == "update":
            self.update()
            return

        # Set dataset_id to the ID of the dataset to create.
        for m in self._loop_modes(mode):
            # Construct a full Dataset object to send to the API.
            dataset_obj = self._setup_dataset_object(m["id"])

            # Send the dataset to the API for creation.
            # Raises google.api_core.exceptions.Conflict if the Dataset
            # already exists within the project.
            try:
                m["client"].create_dataset(dataset_obj)  # Make an API request.
            except Conflict:
                if if_exists == "pass":
                    return
                else:
                    raise Conflict(f"Dataset {self.dataset_id} already exists")

        # Make prod dataset public
        self.publicize()

    def delete(self, mode="all"):
        """Deletes dataset in BigQuery. Toggle mode to choose which dataset to delete.

        Args:
            mode (str): Optional. Which dataset to delete [prod|staging|all]
        """
        for m in self._loop_modes(mode):
            m["client"].delete_dataset(m["id"], delete_contents=True, not_found_ok=True)

    def update(self, mode="all"):
        """Update dataset description. Toggle mode to choose which dataset to update.

        Args:
            mode (str): Optional. Which dataset to update [prod|staging|all]
        """
        for m in self._loop_modes(mode):
            # Send the dataset to the API to update its description.
            m["client"].update_dataset(
                self._setup_dataset_object(m["id"]), fields=["description"]
            )  # Make an API request.
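# A minimal usage sketch for the Dataset class above, assuming the client
# configuration handled by Base (credentials, buckets, metadata_path) is
# already in place. The dataset id is a placeholder.
def example_dataset_workflow():
    ds = Dataset("example_dataset")          # hypothetical dataset_id
    ds.init(replace=False)                   # scaffold the local metadata folder
    ds.create(mode="all", if_exists="pass")  # create prod and staging datasets
    ds.publicize()                           # grant public read access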
def table_approve():
    # find the dataset and tables of the PR
    dataset_table_ids = get_table_dataset_id()

    # print dataset tables info
    tprint("TABLES FOUND")
    pprint(dataset_table_ids)
    tprint()

    # iterate over each table in dataset of the PR
    for table_id in dataset_table_ids.keys():
        dataset_id = dataset_table_ids[table_id]["dataset_id"]
        source_bucket_name = dataset_table_ids[table_id]["source_bucket_name"]

        # create a BigQuery table in prod
        try:
            # push the table to BigQuery
            tprint(f"STARTING TABLE APPROVE ON {dataset_id}.{table_id}")
            push_table_to_bq(
                dataset_id=dataset_id,
                table_id=table_id,
                source_bucket_name=source_bucket_name,
                destination_bucket_name=os.environ.get("BUCKET_NAME_DESTINATION"),
                backup_bucket_name=os.environ.get("BUCKET_NAME_BACKUP"),
            )
            pretty_log(dataset_id, table_id, source_bucket_name)
            tprint()
        except Exception:
            tprint(f"DATA ERROR ON {dataset_id}.{table_id}")
            traceback.print_exc(file=sys.stderr)
            tprint()

        # publish metadata in prod
        try:
            # create table metadata object
            md = Metadata(dataset_id=dataset_id, table_id=table_id)

            # check if the corresponding dataset metadata already exists in CKAN
            if not md.dataset_metadata_obj.exists_in_ckan():
                # validate dataset metadata
                md.dataset_metadata_obj.validate()
                tprint(f"SUCCESS VALIDATE {dataset_id}")

                # publish dataset metadata to CKAN
                md.dataset_metadata_obj.publish()

                # poll CKAN until the dataset metadata is published: dataset
                # metadata must be accessible for table metadata to be
                # published too
                tprint(f"WAITING FOR {dataset_id} METADATA PUBLICATION...")
                MAX_RETRIES = 80
                retry_count = 0
                while not md.dataset_metadata_obj.exists_in_ckan():
                    time.sleep(15)
                    retry_count += 1
                    if retry_count >= MAX_RETRIES:
                        break

                if md.dataset_metadata_obj.exists_in_ckan():
                    tprint(f"SUCCESS PUBLISH {dataset_id}")
                else:
                    tprint(f"ERROR PUBLISH {dataset_id}")

            # validate table metadata
            md.validate()
            tprint(f"SUCCESS VALIDATE {dataset_id}.{table_id}")

            # publish table metadata to CKAN
            md.publish(if_exists="replace")
            tprint(f"SUCCESS PUBLISHED {dataset_id}.{table_id}")
            tprint()
        except Exception:
            tprint(f"METADATA ERROR ON {dataset_id}.{table_id}")
            traceback.print_exc(file=sys.stderr)
            tprint()
class Table(Base):
    """
    Manage tables in Google Cloud Storage and BigQuery.
    """

    def __init__(self, dataset_id, table_id, **kwargs):
        super().__init__(**kwargs)

        self.table_id = table_id.replace("-", "_")
        self.dataset_id = dataset_id.replace("-", "_")
        self.dataset_folder = Path(self.metadata_path / self.dataset_id)
        self.table_folder = self.dataset_folder / table_id
        self.table_full_name = dict(
            prod=f"{self.client['bigquery_prod'].project}.{self.dataset_id}.{self.table_id}",
            staging=f"{self.client['bigquery_staging'].project}.{self.dataset_id}_staging.{self.table_id}",
        )
        self.table_full_name.update(dict(all=deepcopy(self.table_full_name)))
        self.metadata = Metadata(self.dataset_id, self.table_id, **kwargs)

    @property
    def table_config(self):
        return self._load_yaml(self.table_folder / "table_config.yaml")

    def _get_table_obj(self, mode):
        return self.client[f"bigquery_{mode}"].get_table(self.table_full_name[mode])

    def _is_partitioned(self):
        ## check if the table is partitioned; the split is needed because of a
        ## change in the type of partitions in pydantic
        partitions = self.table_config["partitions"]
        if partitions:
            partitions = partitions.split(",")

        if partitions is None:
            return False
        elif isinstance(partitions, list):
            # check if any None inside list.
            # False if it is the case Ex: [None, 'partition']
            # True otherwise Ex: ['partition1', 'partition2']
            return all(item is not None for item in partitions)

    def _load_schema(self, mode="staging"):
        """Load schema from table_config.yaml

        Args:
            mode (str): Which dataset to load the schema for [prod|staging].
        """

        self._check_mode(mode)

        json_path = self.table_folder / f"schema-{mode}.json"
        columns = self.table_config["columns"]

        if mode == "staging":
            new_columns = []
            for c in columns:
                # if is_in_staging is None, it must default to True
                is_in_staging = (
                    True if c.get("is_in_staging") is None else c["is_in_staging"]
                )
                # append columns declared in table_config.yaml to the schema
                # only if is_in_staging is True
                if is_in_staging and not c.get("is_partition"):
                    c["type"] = "STRING"
                    new_columns.append(c)

            del columns
            columns = new_columns

        elif mode == "prod":
            schema = self._get_table_obj(mode).schema

            # get field names for fields at schema and at table_config.yaml
            column_names = [c["name"] for c in columns]
            schema_names = [s.name for s in schema]

            # check if there are mismatched fields
            not_in_columns = [name for name in schema_names if name not in column_names]
            not_in_schema = [name for name in column_names if name not in schema_names]

            # raise if field is not in table_config
            if not_in_columns:
                raise BaseDosDadosException(
                    "Column {error_columns} was not found in table_config.yaml. Are you sure that "
                    "all your column names between table_config.yaml, publish.sql and "
                    "{project_id}.{dataset_id}.{table_id} are the same?".format(
                        error_columns=not_in_columns,
                        project_id=self.table_config["project_id_prod"],
                        dataset_id=self.table_config["dataset_id"],
                        table_id=self.table_config["table_id"],
                    )
                )

            # raise if field is not in schema
            elif not_in_schema:
                raise BaseDosDadosException(
                    "Column {error_columns} was not found in publish.sql. Are you sure that "
                    "all your column names between table_config.yaml, publish.sql and "
                    "{project_id}.{dataset_id}.{table_id} are the same?".format(
                        error_columns=not_in_schema,
                        project_id=self.table_config["project_id_prod"],
                        dataset_id=self.table_config["dataset_id"],
                        table_id=self.table_config["table_id"],
                    )
                )

            else:
                # if field is in schema, get field_type and field_mode
                for c in columns:
                    for s in schema:
                        if c["name"] == s.name:
                            c["type"] = s.field_type
                            c["mode"] = s.mode
                            break

        ## force utf-8, write schema-{mode}.json
        json.dump(columns, (json_path).open("w", encoding="utf-8"))

        # load the newly created schema
        return self.client[f"bigquery_{mode}"].schema_from_json(str(json_path))

    def _make_publish_sql(self):
        """Create publish.sql with columns and bigquery_type"""

        ### publish.sql header and instructions
        publish_txt = """
        /*
        Query para publicar a tabela.

        Esse é o lugar para:
            - modificar nomes, ordem e tipos de colunas
            - dar join com outras tabelas
            - criar colunas extras (e.g. logs, proporções, etc.)

        Qualquer coluna definida aqui deve também existir em `table_config.yaml`.

        # Além disso, sinta-se à vontade para alterar alguns nomes obscuros
        # para algo um pouco mais explícito.

        TIPOS:
            - Para modificar tipos de colunas, basta substituir STRING por outro tipo válido.
            - Exemplo: `SAFE_CAST(column_name AS NUMERIC) column_name`
            - Mais detalhes: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types
        */
        """

        # remove triple quotes extra space
        publish_txt = inspect.cleandoc(publish_txt)
        publish_txt = textwrap.dedent(publish_txt)

        # add create table statement
        project_id_prod = self.client["bigquery_prod"].project
        publish_txt += (
            f"\n\nCREATE VIEW {project_id_prod}.{self.dataset_id}.{self.table_id} AS\nSELECT \n"
        )

        # sort columns by is_partition, partition columns come first
        if self._is_partitioned():
            columns = sorted(
                self.table_config["columns"],
                key=lambda k: (k["is_partition"] is not None, k["is_partition"]),
                reverse=True,
            )
        else:
            columns = self.table_config["columns"]

        # add columns in publish.sql
        for col in columns:
            name = col["name"]
            bigquery_type = (
                "STRING"
                if col["bigquery_type"] is None
                else col["bigquery_type"].upper()
            )
            publish_txt += f"SAFE_CAST({name} AS {bigquery_type}) {name},\n"

        ## remove last comma
        publish_txt = publish_txt[:-2] + "\n"

        # add from statement
        project_id_staging = self.client["bigquery_staging"].project
        publish_txt += (
            f"FROM {project_id_staging}.{self.dataset_id}_staging.{self.table_id} AS t"
        )

        # save publish.sql in table_folder
        (self.table_folder / "publish.sql").open("w", encoding="utf-8").write(publish_txt)

    def _make_template(self, columns, partition_columns, if_table_config_exists):
        # create table_config.yaml with metadata
        self.metadata.create(
            if_exists=if_table_config_exists,
            columns=partition_columns + columns,
            partition_columns=partition_columns,
            table_only=False,
        )

        self._make_publish_sql()

    def _sheet_to_df(self, columns_config_url):
        url = columns_config_url.replace("edit#gid=", "export?format=csv&gid=")
        try:
            return pd.read_csv(StringIO(requests.get(url).content.decode("utf-8")))
        except Exception:
            raise BaseDosDadosException(
                "Check if your Google Sheet sharing setting is: "
                "'Anyone on the internet with this link can view'"
            )

    def table_exists(self, mode):
        """Check if table exists in BigQuery.

        Args:
            mode (str): Which dataset to check [prod|staging].
        """
        try:
            ref = self._get_table_obj(mode=mode)
        except google.api_core.exceptions.NotFound:
            ref = None

        return ref is not None

    def update_columns(self, columns_config_url):
        """Fills columns in table_config.yaml automatically using a public Google Sheets URL.
        Also regenerates publish.sql and autofills the type using bigquery_type.

        The URL must be in the format
        https://docs.google.com/spreadsheets/d/<table_key>/edit#gid=<table_gid>.

        The sheet must contain the columns:
            - nome: column name
            - descricao: column description
            - tipo: column BigQuery type
            - unidade_medida: column measurement unit
            - dicionario: column related dictionary
            - nome_diretorio: column related directory in the format <dataset_id>.<table_id>:<column_name>

        Args:
            columns_config_url (str): Google Sheets URL.
        """
        ruamel = ryaml.YAML()
        ruamel.preserve_quotes = True
        ruamel.indent(mapping=4, sequence=6, offset=4)
        table_config_yaml = ruamel.load(
            (self.table_folder / "table_config.yaml").open(encoding="utf-8")
        )

        if (
            "edit#gid=" not in columns_config_url
            or "https://docs.google.com/spreadsheets/d/" not in columns_config_url
            or not columns_config_url.split("=")[1].isdigit()
        ):
            raise BaseDosDadosException(
                "The Google Sheets URL is not in the correct format. "
                "The URL must be in the format "
                "https://docs.google.com/spreadsheets/d/<table_key>/edit#gid=<table_gid>"
            )

        df = self._sheet_to_df(columns_config_url)
        df = df.fillna("NULL")

        if "nome" not in df.columns.tolist():
            raise BaseDosDadosException(
                "Column 'nome' not found in the Google sheet. "
                "The sheet must contain the column name: 'nome'"
            )
        elif "descricao" not in df.columns.tolist():
            raise BaseDosDadosException(
                "Column 'descricao' not found in the Google sheet. "
                "The sheet must contain the column description: 'descricao'"
            )
        elif "tipo" not in df.columns.tolist():
            raise BaseDosDadosException(
                "Column 'tipo' not found in the Google sheet. "
                "The sheet must contain the column type: 'tipo'"
            )
        elif "unidade_medida" not in df.columns.tolist():
            raise BaseDosDadosException(
                "Column 'unidade_medida' not found in the Google sheet. "
                "The sheet must contain the column measurement unit: 'unidade_medida'"
            )
        elif "dicionario" not in df.columns.tolist():
            raise BaseDosDadosException(
                "Column 'dicionario' not found in the Google sheet. "
                "The sheet must contain the column dictionary: 'dicionario'"
            )
        elif "nome_diretorio" not in df.columns.tolist():
            raise BaseDosDadosException(
                "Column 'nome_diretorio' not found in the Google sheet. "
                "The sheet must contain the column directory name: 'nome_diretorio'"
            )

        columns_parameters = zip(
            df["nome"].tolist(),
            df["descricao"].tolist(),
            df["tipo"].tolist(),
            df["unidade_medida"].tolist(),
            df["dicionario"].tolist(),
            df["nome_diretorio"].tolist(),
        )

        for (
            name,
            description,
            tipo,
            unidade_medida,
            dicionario,
            nome_diretorio,
        ) in columns_parameters:
            for col in table_config_yaml["columns"]:
                if col["name"] == name:
                    col["description"] = (
                        col["description"] if description == "NULL" else description
                    )
                    col["bigquery_type"] = (
                        col["bigquery_type"] if tipo == "NULL" else tipo.lower()
                    )
                    col["measurement_unit"] = (
                        col["measurement_unit"]
                        if unidade_medida == "NULL"
                        else unidade_medida
                    )
                    col["covered_by_dictionary"] = (
                        "no" if dicionario == "NULL" else "yes"
                    )

                    dataset = nome_diretorio.split(".")[0]
                    col["directory_column"]["dataset_id"] = (
                        col["directory_column"]["dataset_id"]
                        if dataset == "NULL"
                        else dataset
                    )

                    table = nome_diretorio.split(".")[-1].split(":")[0]
                    col["directory_column"]["table_id"] = (
                        col["directory_column"]["table_id"]
                        if table == "NULL"
                        else table
                    )

                    column = nome_diretorio.split(".")[-1].split(":")[-1]
                    col["directory_column"]["column_name"] = (
                        col["directory_column"]["column_name"]
                        if column == "NULL"
                        else column
                    )

        with open(
            self.table_folder / "table_config.yaml", "w", encoding="utf-8"
        ) as config_file:
            ruamel.dump(table_config_yaml, config_file)

        # regenerate publish.sql
        self._make_publish_sql()

    def init(
        self,
        data_sample_path=None,
        if_folder_exists="raise",
        if_table_config_exists="raise",
        source_format="csv",
        columns_config_url=None,
    ):
        """Initialize table folder at metadata_path at `metadata_path/<dataset_id>/<table_id>`.

        The folder should contain:

        * `table_config.yaml`
        * `publish.sql`

        You can also point to a sample of the data to auto complete columns names.

        Args:
            data_sample_path (str, pathlib.PosixPath): Optional.
                Data sample path to auto complete columns names.
                It supports Comma Delimited CSV.
            if_folder_exists (str): Optional.
                What to do if table folder exists

                * 'raise' : Raises FileExistsError
                * 'replace' : Replace folder
                * 'pass' : Do nothing
            if_table_config_exists (str): Optional.
                What to do if table_config.yaml and publish.sql exist

                * 'raise' : Raises FileExistsError
                * 'replace' : Replace files with blank template
                * 'pass' : Do nothing
            source_format (str): Optional.
                Data source format. Only 'csv' is supported. Defaults to 'csv'.
            columns_config_url (str): Optional.
                Google Sheets URL. The URL must be in the format
                https://docs.google.com/spreadsheets/d/<table_key>/edit#gid=<table_gid>.
                The sheet must contain the column name: "coluna" and column description: "descricao".

        Raises:
            FileExistsError: If folder exists and replace is False.
            NotImplementedError: If data sample is not in supported type or format.
        """
        if not self.dataset_folder.exists():
            raise FileExistsError(
                f"Dataset folder {self.dataset_folder} does not exist. "
                "Create a dataset before adding tables."
            )

        try:
            self.table_folder.mkdir(exist_ok=(if_folder_exists == "replace"))
        except FileExistsError:
            if if_folder_exists == "raise":
                raise FileExistsError(
                    f"Table folder already exists for {self.table_id}."
                )
            elif if_folder_exists == "pass":
                return self

        if not data_sample_path and if_table_config_exists != "pass":
            raise BaseDosDadosException(
                "You must provide a path to correctly create config files"
            )

        partition_columns = []
        if isinstance(
            data_sample_path,
            (
                str,
                Path,
            ),
        ):
            # Check if partitioned and get data sample and partition columns
            data_sample_path = Path(data_sample_path)

            if data_sample_path.is_dir():
                data_sample_path = [
                    f
                    for f in data_sample_path.glob("**/*")
                    if f.is_file() and f.suffix == ".csv"
                ][0]

                partition_columns = [
                    k.split("=")[0]
                    for k in data_sample_path.as_posix().split("/")
                    if "=" in k
                ]

            columns = Datatype(self, source_format).header(data_sample_path)

        else:
            columns = ["column_name"]

        if if_table_config_exists == "pass":
            # Check if config files exist before passing
            if (
                Path(self.table_folder / "table_config.yaml").is_file()
                and Path(self.table_folder / "publish.sql").is_file()
            ):
                pass
            # Raise if no sample to determine columns
            elif not data_sample_path:
                raise BaseDosDadosException(
                    "You must provide a path to correctly create config files"
                )
            else:
                self._make_template(columns, partition_columns, if_table_config_exists)

        elif if_table_config_exists == "raise":
            # Check if config files already exist
            if (
                Path(self.table_folder / "table_config.yaml").is_file()
                and Path(self.table_folder / "publish.sql").is_file()
            ):
                raise FileExistsError(
                    f"table_config.yaml and publish.sql already exist at {self.table_folder}"
                )
            # if config files don't exist, create them
            else:
                self._make_template(columns, partition_columns, if_table_config_exists)

        else:
            # if_table_config_exists == "replace": recreate the config files from the template
            self._make_template(columns, partition_columns, if_table_config_exists)

        if columns_config_url is not None:
            self.update_columns(columns_config_url)

        return self

    def create(
        self,
        path=None,
        job_config_params=None,
        force_dataset=True,
        if_table_exists="raise",
        if_storage_data_exists="raise",
        if_table_config_exists="raise",
        source_format="csv",
        columns_config_url=None,
    ):
        """Creates a BigQuery table at the staging dataset.

        If you add a path, it automatically saves the data in the storage,
        creates a datasets folder and BigQuery location, besides creating the
        table and its configuration files.

        The new table should be located at `<dataset_id>_staging.<table_id>` in BigQuery.

        It looks for data saved in Storage at
        `<bucket_name>/staging/<dataset_id>/<table_id>/*` and builds the table.

        It currently supports the types:

        - Comma Delimited CSV

        Data can also be partitioned following the hive partitioning scheme
        `<key1>=<value1>/<key2>=<value2>` - for instance, `year=2012/country=BR`.
        The partition is automatically detected by searching for `partitions`
        on the `table_config.yaml`.

        Args:
            path (str or pathlib.PosixPath): Where to find the file that you want to
                upload to create a table with.
            job_config_params (dict): Optional.
                Job configuration params from bigquery.
            if_table_exists (str): Optional.
                What to do if table exists

                * 'raise' : Raises Conflict exception
                * 'replace' : Replace table
                * 'pass' : Do nothing
            force_dataset (bool): Creates `<dataset_id>` folder and BigQuery Dataset if it doesn't exist.
            if_table_config_exists (str): Optional.
                What to do if config files already exist

                * 'raise' : Raises FileExistsError
                * 'replace' : Replace with blank template
                * 'pass' : Do nothing
            if_storage_data_exists (str): Optional.
                What to do if data already exists on your bucket:

                * 'raise' : Raises Conflict exception
                * 'replace' : Replace table
                * 'pass' : Do nothing
            source_format (str): Optional.
                Data source format. Only 'csv' is supported. Defaults to 'csv'.
            columns_config_url (str): Optional.
                Google Sheets URL. The URL must be in the format
                https://docs.google.com/spreadsheets/d/<table_key>/edit#gid=<table_gid>.
                The sheet must contain the column name: "coluna" and column description: "descricao".
        """

        if path is None:
            # Look if table data already exists at Storage
            data = self.client["storage_staging"].list_blobs(
                self.bucket_name, prefix=f"staging/{self.dataset_id}/{self.table_id}"
            )

            # Raise: Cannot create table without external data
            if not data:
                raise BaseDosDadosException(
                    "You must provide a path for uploading data"
                )

        # Add data to storage
        if isinstance(
            path,
            (
                str,
                Path,
            ),
        ):
            Storage(self.dataset_id, self.table_id, **self.main_vars).upload(
                path, mode="staging", if_exists=if_storage_data_exists
            )

        # Create Dataset if it doesn't exist
        if force_dataset:
            dataset_obj = Dataset(self.dataset_id, **self.main_vars)

            try:
                dataset_obj.init()
            except FileExistsError:
                pass

            dataset_obj.create(if_exists="pass")

        self.init(
            data_sample_path=path,
            if_folder_exists="replace",
            if_table_config_exists=if_table_config_exists,
            columns_config_url=columns_config_url,
        )

        table = bigquery.Table(self.table_full_name["staging"])
        table.external_data_configuration = Datatype(
            self, source_format, "staging", partitioned=self._is_partitioned()
        ).external_config

        # Look up if table already exists
        table_ref = None
        try:
            table_ref = self.client["bigquery_staging"].get_table(
                self.table_full_name["staging"]
            )
        except google.api_core.exceptions.NotFound:
            pass

        if isinstance(table_ref, google.cloud.bigquery.table.Table):
            if if_table_exists == "pass":
                return None
            elif if_table_exists == "raise":
                raise FileExistsError(
                    "Table already exists, choose replace if you want to overwrite it"
                )

        if if_table_exists == "replace":
            self.delete(mode="staging")

        self.client["bigquery_staging"].create_table(table)

    def update(self, mode="all", not_found_ok=True):
        """Updates BigQuery schema and description.

        Args:
            mode (str): Optional.
                Which table to update [prod|staging|all]
            not_found_ok (bool): Optional.
                What to do if table is not found
        """
        self._check_mode(mode)

        mode = ["prod", "staging"] if mode == "all" else [mode]
        for m in mode:
            try:
                table = self._get_table_obj(m)
            except google.api_core.exceptions.NotFound:
                continue

            # if m == "staging":
            table.description = self._render_template(
                Path("table/table_description.txt"), self.table_config
            )

            # save table description
            open(
                self.metadata_path
                / self.dataset_id
                / self.table_id
                / "table_description.txt",
                "w",
                encoding="utf-8",
            ).write(table.description)

            if m == "prod":
                table.schema = self._load_schema(m)

            self.client[f"bigquery_{m}"].update_table(
                table, fields=["description", "schema"]
            )

    def publish(self, if_exists="raise"):
        """Creates BigQuery table at production dataset.

        Table should be located at `<dataset_id>.<table_id>`.

        It creates a view that uses the query from
        `<metadata_path>/<dataset_id>/<table_id>/publish.sql`.

        Make sure that all columns from the query also exist at
        `<metadata_path>/<dataset_id>/<table_id>/table_config.yaml`, including
        the partitions.

        Args:
            if_exists (str): Optional.
                What to do if table exists.

                * 'raise' : Raises Conflict exception
                * 'replace' : Replace table
                * 'pass' : Do nothing

        Todo:

            * Check if all required fields are filled
        """

        if if_exists == "replace":
            self.delete(mode="prod")

        self.client["bigquery_prod"].query(
            (self.table_folder / "publish.sql").open("r", encoding="utf-8").read()
        ).result()

        self.update("prod")

    def delete(self, mode):
        """Deletes table in BigQuery.

        Args:
            mode (str): Which table to delete [prod|staging]
        """
        self._check_mode(mode)

        if mode == "all":
            for m, n in self.table_full_name[mode].items():
                self.client[f"bigquery_{m}"].delete_table(n, not_found_ok=True)
        else:
            self.client[f"bigquery_{mode}"].delete_table(
                self.table_full_name[mode], not_found_ok=True
            )

    def append(self, filepath, partitions=None, if_exists="replace", **upload_args):
        """Appends new data to an existing BigQuery table.

        As long as the data has the same schema, it appends the data in the
        filepath to the existing table.

        Args:
            filepath (str or pathlib.PosixPath): Where to find the file that you want to
                upload to create a table with.
            partitions (str, pathlib.PosixPath, dict): Optional.
                Hive structured partition as a string or dict

                * str : `<key>=<value>/<key2>=<value2>`
                * dict: `dict(key=value, key2=value2)`
            if_exists (str): Optional.
                What to do if data with same name exists in storage

                * 'raise' : Raises Conflict exception
                * 'replace' : Replace table
                * 'pass' : Do nothing
        """
        if not self.table_exists("staging"):
            raise BaseDosDadosException(
                "You cannot append to a table that does not exist"
            )

        Storage(self.dataset_id, self.table_id, **self.main_vars).upload(
            filepath,
            mode="staging",
            partitions=partitions,
            if_exists=if_exists,
            **upload_args,
        )
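# A minimal end-to-end sketch for the Table class above, assuming credentials,
# buckets, and metadata_path are already configured via Base. The ids and the
# local CSV paths are placeholders; only keyword arguments shown in the
# signatures above are used.
def example_table_workflow():
    tb = Table("example_dataset", "example_table")  # hypothetical ids
    tb.create(
        path="data/example_table.csv",        # hypothetical local sample
        if_table_exists="replace",
        if_storage_data_exists="replace",
        if_table_config_exists="replace",
    )
    tb.publish(if_exists="replace")            # create the prod view from publish.sql
    tb.append("data/example_table_new.csv")    # upload more data with the same schema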