def read_sql(query, billing_project_id=None, from_file=False, reauth=False):
    """Load data from BigQuery using a query. Just a wrapper around pandas.read_gbq.

    Args:
        query (str): Valid SQL Standard Query to basedosdados.
        billing_project_id (str): Optional.
            Project that will be billed. Find your Project ID here https://console.cloud.google.com/projectselector2/home/dashboard
        from_file (boolean): Optional.
            Uses the credentials from file, located in `~/.basedosdados/credentials/`.
        reauth (boolean): Optional.
            Re-authorize Google Cloud Project in case you need to change user or reset configurations.

    Returns:
        pd.DataFrame: Query result
    """

    try:
        return pandas_gbq.read_gbq(
            query,
            credentials=credentials(from_file=from_file, reauth=reauth),
            project_id=billing_project_id,
        )
    except (OSError, ValueError):
        raise BaseDosDadosException(
            "\nWe are not sure which Google Cloud project should be billed.\n"
            "First, you should make sure that you have a Google Cloud project.\n"
            "If you don't have one, set one up following these steps: \n"
            "\t1. Go to this link https://console.cloud.google.com/projectselector2/home/dashboard\n"
            "\t2. Agree with Terms of Service if asked\n"
            "\t3. Click in Create Project\n"
            "\t4. Put a cool name in your project\n"
            "\t5. Hit create\n"
            "\n"
            "Copy the Project ID (notice that it is not the Project Name).\n"
            "Now, you have two options:\n"
            "1. Add an argument to your function pointing to the billing project id.\n"
            "   Like `bd.read_table('br_ibge_pib', 'municipios', billing_project_id=<YOUR_PROJECT_ID>)`\n"
            "2. You can set a project_id in the environment by running the following command in your terminal: `gcloud config set project <YOUR_PROJECT_ID>`.\n"
            "   Bear in mind that you need `gcloud` installed.")
    except GenericGBQException as e:
        if "Reason: 403" in str(e):
            raise BaseDosDadosException(
                "\nYou still don't have a Google Cloud Project.\n"
                "Set one up following these steps: \n"
                "1. Go to this link https://console.cloud.google.com/projectselector2/home/dashboard\n"
                "2. Agree with Terms of Service if asked\n"
                "3. Click in Create Project\n"
                "4. Put a cool name in your project\n"
                "5. Hit create\n"
                "6. Rerun this command with the flag `reauth=True`. \n"
                "   Like `read_table('br_ibge_pib', 'municipios', reauth=True)`"
            )
        else:
            raise e
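# Illustrative usage of read_sql (a sketch: it assumes the function above is
# exposed at the package top level as basedosdados.read_sql, and
# <YOUR_PROJECT_ID> is a placeholder for a real billing project).
import basedosdados as bd

df = bd.read_sql(
    "SELECT * FROM `basedosdados.br_ibge_pib.municipios` LIMIT 10",
    billing_project_id="<YOUR_PROJECT_ID>",  # placeholder: your GCP project
)
print(df.head())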
def append(self, filepath, partitions=None, if_exists="replace", **upload_args):
    """Appends new data to an existing BigQuery table.

    As long as the data has the same schema, it appends the data in the
    filepath to the existing table.

    Args:
        filepath (str or pathlib.PosixPath): Where to find the file that you want to upload to create a table with
        partitions (str, pathlib.PosixPath, dict): Optional.
            Hive structured partition as a string or dict

            * str : `<key>=<value>/<key2>=<value2>`
            * dict: `dict(key=value, key2=value2)`
        if_exists (str): Optional.
            What to do if data with same name exists in storage

            * 'raise' : Raises Conflict exception
            * 'replace' : Replace table
            * 'pass' : Do nothing
    """
    if not self.table_exists("staging"):
        raise BaseDosDadosException(
            "You cannot append to a table that does not exist")
    else:
        Storage(self.dataset_id, self.table_id, **self.main_vars).upload(
            filepath,
            mode="staging",
            partitions=partitions,
            if_exists=if_exists,
            **upload_args,
        )
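# Sketch of Table.append usage (assumes this method belongs to the Table class
# exposed by basedosdados and that the staging table already exists; the ids
# and file path below are placeholders).
from basedosdados import Table

tb = Table(dataset_id="br_exemplo", table_id="minha_tabela")
tb.append(
    filepath="data/minha_tabela_novos_dados.csv",  # placeholder path
    if_exists="replace",
)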
def read_table(
    dataset_id,
    table_id,
    query_project_id="basedosdados",
    billing_project_id=None,
    limit=None,
    from_file=False,
    reauth=False,
    use_bqstorage_api=False,
):
    """Load data from BigQuery using dataset_id and table_id.

    Args:
        dataset_id (str): Optional.
            Dataset id available in basedosdados. It should always come with table_id.
        table_id (str): Optional.
            Table id available in basedosdados.dataset_id.
            It should always come with dataset_id.
        query_project_id (str): Optional.
            Which project the table lives. You can change this if you want to query different projects.
        billing_project_id (str): Optional.
            Project that will be billed. Find your Project ID here https://console.cloud.google.com/projectselector2/home/dashboard
        limit (int): Optional.
            Number of rows to read from table.
        from_file (boolean): Optional.
            Uses the credentials from file, located in `~/.basedosdados/credentials/`.
        reauth (boolean): Optional.
            Re-authorize Google Cloud Project in case you need to change user or reset configurations.
        use_bqstorage_api (boolean): Optional.
            Use the BigQuery Storage API to download query results quickly, but at an increased cost
            (https://cloud.google.com/bigquery/docs/reference/storage/).
            To use this API, first enable it in the Cloud Console
            (https://console.cloud.google.com/apis/library/bigquerystorage.googleapis.com).
            You must also have the bigquery.readsessions.create permission on the project you are billing queries to.

    Returns:
        pd.DataFrame: Query result
    """

    if (dataset_id is not None) and (table_id is not None):
        query = f"""
        SELECT *
        FROM `{query_project_id}.{dataset_id}.{table_id}`"""

        if limit is not None:
            query += f" LIMIT {limit}"
    else:
        raise BaseDosDadosException(
            "Both table_id and dataset_id should be filled.")

    return read_sql(
        query,
        billing_project_id=billing_project_id,
        from_file=from_file,
        reauth=reauth,
        use_bqstorage_api=use_bqstorage_api,
    )
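# Sketch: reading a table slice with the BigQuery Storage API enabled (assumes
# read_table is exposed as basedosdados.read_table; the billing project id is
# a placeholder and the Storage API must be enabled in the Cloud Console first).
import basedosdados as bd

df = bd.read_table(
    dataset_id="br_ibge_pib",
    table_id="municipios",
    billing_project_id="<YOUR_PROJECT_ID>",
    limit=100,
    use_bqstorage_api=True,
)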
def _sheet_to_df(self, columns_config_url):
    """Download a public Google Sheet as a pandas DataFrame."""
    url = columns_config_url.replace("edit#gid=", "export?format=csv&gid=")
    try:
        return pd.read_csv(
            StringIO(requests.get(url).content.decode("utf-8")))
    except Exception:
        raise BaseDosDadosException(
            "Check that your Google Sheet sharing setting is: "
            "'Anyone on the internet with this link can view'."
        )
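# Minimal sketch of the URL rewrite _sheet_to_df relies on: a public
# "edit#gid=" sheet link is turned into a CSV export link. The sheet key and
# gid below are placeholders.
url = "https://docs.google.com/spreadsheets/d/<table_key>/edit#gid=0"
csv_url = url.replace("edit#gid=", "export?format=csv&gid=")
# -> https://docs.google.com/spreadsheets/d/<table_key>/export?format=csv&gid=0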
def _sets_savepath(savepath):
    """Sets savepath accordingly"""
    savepath = Path(savepath)

    if savepath.suffix == ".csv":
        # make sure that the parent directory exists
        savepath.parent.mkdir(parents=True, exist_ok=True)
    else:
        raise BaseDosDadosException(
            "Only .csv files are supported, "
            f"your filename has a different extension: {savepath.suffix}")

    return savepath
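# Sketch of _sets_savepath behavior (assumes the helper above is in scope):
# a .csv path is accepted and its parent directory is created, anything else
# raises BaseDosDadosException.
savepath = _sets_savepath("output/municipios.csv")   # ok, creates output/
# _sets_savepath("output/municipios.parquet")        # would raise: only .csv is supported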
def read_table(
    dataset_id,
    table_id,
    query_project_id="basedosdados",
    billing_project_id=None,
    limit=None,
    from_file=False,
    reauth=False,
):
    """Load data from BigQuery using dataset_id and table_id.

    Args:
        dataset_id (str): Optional.
            Dataset id available in basedosdados. It should always come with table_id.
        table_id (str): Optional.
            Table id available in basedosdados.dataset_id.
            It should always come with dataset_id.
        query_project_id (str): Optional.
            Which project the table lives. You can change this if you want to query different projects.
        billing_project_id (str): Optional.
            Project that will be billed. Find your Project ID here https://console.cloud.google.com/projectselector2/home/dashboard
        from_file (boolean): Optional.
            Uses the credentials from file, located in `~/.basedosdados/credentials/`.
        reauth (boolean): Optional.
            Re-authorize Google Cloud Project in case you need to change user or reset configurations.
        limit (int): Optional.
            Number of rows to read from table.

    Returns:
        pd.DataFrame: Query result
    """

    if (dataset_id is not None) and (table_id is not None):
        query = f"""
        SELECT *
        FROM `{query_project_id}.{dataset_id}.{table_id}`"""

        if limit is not None:
            query += f" LIMIT {limit}"
    else:
        raise BaseDosDadosException(
            "Both table_id and dataset_id should be filled.")

    return read_sql(query,
                    billing_project_id=billing_project_id,
                    from_file=from_file,
                    reauth=reauth)
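# Sketch: previewing a few rows of a directory table with this older
# read_table signature (no BigQuery Storage API flag). The billing project id
# is a placeholder.
import basedosdados as bd

preview = bd.read_table(
    dataset_id="br_suporte",
    table_id="diretorio_municipios",
    billing_project_id="<YOUR_PROJECT_ID>",
    limit=5,
)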
def upload(
    self,
    path,
    mode="all",
    partitions=None,
    if_exists="raise",
    **upload_args,
):
    """Upload to storage at `<bucket_name>/<mode>/<dataset_id>/<table_id>`. You can:

    * Add a single **file** setting `path = <file_path>`.

    * Add a **folder** with multiple files setting `path = <folder_path>`.
      *The folder should just contain the files and no folders.*

    * Add **partitioned files** setting `path = <folder_path>`.
      This folder must follow the hive partitioning scheme i.e.
      `<table_id>/<key>=<value>/<key2>=<value2>/<partition>.csv`
      (ex: `mytable/country=brasil/year=2020/mypart.csv`).

    *Remember all files must follow a single schema.* Otherwise, things
    might fail in the future.

    There are 6 modes:

    * `raw` : should contain raw files from datasource
    * `staging` : should contain pre-treated files ready to upload to BigQuery
    * `header`: should contain the header of the tables
    * `auxiliary_files`: should contain auxiliary files from each table
    * `architecture`: should contain the architecture sheet of the tables
    * `all`: if no treatment is needed, use `all`.

    Args:
        path (str or pathlib.PosixPath): Where to find the file or
            folder that you want to upload to storage
        mode (str): Folder of which dataset to update [raw|staging|header|auxiliary_files|architecture|all]
        partitions (str, pathlib.PosixPath, or dict): Optional.
            *If adding a single file*, use this to add it to a specific partition.

            * str : `<key>=<value>/<key2>=<value2>`
            * dict: `dict(key=value, key2=value2)`
        if_exists (str): Optional.
            What to do if data exists

            * 'raise' : Raises Conflict exception
            * 'replace' : Replace table
            * 'pass' : Do nothing
        upload_args (): Extra arguments accepted by [`google.cloud.storage.blob.Blob.upload_from_file`](https://googleapis.dev/python/storage/latest/blobs.html?highlight=upload_from_filename#google.cloud.storage.blob.Blob.upload_from_filename)
    """

    if (self.dataset_id is None) or (self.table_id is None):
        raise Exception("You need to pass dataset_id and table_id")

    path = Path(path)

    if path.is_dir():
        paths = [
            f for f in path.glob("**/*")
            if f.is_file() and f.suffix == ".csv"
        ]

        parts = [
            (filepath.as_posix()
             .replace(path.as_posix() + "/", "")
             .replace(str(filepath.name), ""))
            for filepath in paths
        ]
    else:
        paths = [path]
        parts = [partitions or None]

    self._check_mode(mode)

    mode = (
        ["raw", "staging", "header", "auxiliary_files", "architecture"]
        if mode == "all"
        else [mode]
    )

    for m in mode:
        for filepath, part in tqdm(list(zip(paths, parts)), desc="Uploading files"):
            blob_name = self._build_blob_name(filepath.name, m, part)
            blob = self.bucket.blob(blob_name)

            if not blob.exists() or if_exists == "replace":
                upload_args["timeout"] = upload_args.get("timeout", None)
                blob.upload_from_filename(str(filepath), **upload_args)
            elif if_exists == "pass":
                pass
            else:
                raise BaseDosDadosException(
                    f"Data already exists at {self.bucket_name}/{blob_name}. "
                    "If you are using Storage.upload then set if_exists to "
                    "'replace' to overwrite data \n"
                    "If you are using Table.create then set if_storage_data_exists "
                    "to 'replace' to overwrite data.")
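# Sketch of Storage.upload: sending a hive-partitioned folder to the staging
# area of the bucket (assumes this method belongs to the Storage class exposed
# by basedosdados; the ids and local folder are placeholders).
from basedosdados import Storage

st = Storage(dataset_id="br_exemplo", table_id="minha_tabela")
st.upload(
    "data/minha_tabela/",  # e.g. contains country=brasil/year=2020/part.csv
    mode="staging",
    if_exists="replace",
)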
def download(
    savepath,
    query=None,
    dataset_id=None,
    table_id=None,
    query_project_id="basedosdados",
    billing_project_id=None,
    limit=None,
    from_file=False,
    reauth=False,
    compression="GZIP",
):
    """Download table or query result from basedosdados BigQuery (or other).

    * Using a **query**:

        `download('select * from `basedosdados.br_suporte.diretorio_municipios` limit 10')`

    * Using **dataset_id & table_id**:

        `download(dataset_id='br_suporte', table_id='diretorio_municipios')`

    You can also choose the compression type:

    `download(dataset_id='br_suporte', table_id='diretorio_municipios', compression='GZIP')`

    Args:
        savepath (str, pathlib.PosixPath):
            savepath must be a file path. Only supports `.csv`.
        query (str): Optional.
            Valid SQL Standard Query to basedosdados. If query is available,
            dataset_id and table_id are not required.
        dataset_id (str): Optional.
            Dataset id available in basedosdados. It should always come with table_id.
        table_id (str): Optional.
            Table id available in basedosdados.dataset_id.
            It should always come with dataset_id.
        query_project_id (str): Optional.
            Which project the table lives. You can change this if you want to query different projects.
        billing_project_id (str): Optional.
            Project that will be billed. Find your Project ID here https://console.cloud.google.com/projectselector2/home/dashboard
        limit (int): Optional.
            Number of rows.
        from_file (boolean): Optional.
            Uses the credentials from file, located in `~/.basedosdados/credentials/`.
        reauth (boolean): Optional.
            Re-authorize Google Cloud Project in case you need to change user or reset configurations.
        compression (str): Optional.
            Compression type. Only `GZIP` is available for now.

    Raises:
        Exception: If either table_id, dataset_id or query are empty.
    """

    if (query is None) and ((table_id is None) or (dataset_id is None)):
        raise BaseDosDadosException(
            "Either table_id, dataset_id or query should be filled.")

    client = google_client(query_project_id, billing_project_id, from_file, reauth)

    # makes sure that savepath is a filepath and not a folder
    savepath = _sets_savepath(savepath)

    # if query is not defined (so it won't be overwritten) and if
    # table is a view or external or if limit is specified,
    # convert it to a query.
    if not query and (
            not _is_table(client, dataset_id, table_id, query_project_id)
            or limit):
        query = f"""
        SELECT *
          FROM {query_project_id}.{dataset_id}.{table_id}
        """

        if limit is not None:
            query += f" limit {limit}"

    if query:
        # sql queries produce anonymous tables, whose names
        # can be found within `job._properties`
        job = client["bigquery"].query(query)

        # views may take longer: wait for job to finish.
        _wait_for(job)

        dest_table = job._properties["configuration"]["query"]["destinationTable"]

        project_id = dest_table["projectId"]
        dataset_id = dest_table["datasetId"]
        table_id = dest_table["tableId"]
    else:
        # plain tables are downloaded directly from the project they live in
        project_id = query_project_id

    _direct_download(client, dataset_id, table_id, savepath, project_id, compression)
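# Sketch: downloading a query result straight to a .csv with this version of
# download (savepath must be a .csv file path; GZIP is the only compression
# accepted). The billing project id is a placeholder.
import basedosdados as bd

bd.download(
    savepath="data/diretorio_municipios.csv",
    query="SELECT * FROM `basedosdados.br_suporte.diretorio_municipios` LIMIT 10",
    billing_project_id="<YOUR_PROJECT_ID>",
)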
def create(
    self,
    path=None,
    job_config_params=None,
    force_dataset=True,
    if_table_exists="raise",
    if_storage_data_exists="raise",
    if_table_config_exists="raise",
    source_format="csv",
):
    """Creates a BigQuery table at the staging dataset.

    If you add a path, it automatically saves the data in the storage,
    creates a datasets folder and BigQuery location, besides creating the
    table and its configuration files.

    The new table should be located at `<dataset_id>_staging.<table_id>` in BigQuery.

    It looks for data saved in Storage at `<bucket_name>/staging/<dataset_id>/<table_id>/*`
    and builds the table.

    It currently supports the types:

    - Comma Delimited CSV

    Data can also be partitioned following the hive partitioning scheme
    `<key1>=<value1>/<key2>=<value2>`, for instance, `year=2012/country=BR`.

    Args:
        path (str or pathlib.PosixPath): Where to find the file that you want to upload to create a table with
        job_config_params (dict): Optional.
            Job configuration params from bigquery
        if_table_exists (str): Optional.
            What to do if table exists

            * 'raise' : Raises Conflict exception
            * 'replace' : Replace table
            * 'pass' : Do nothing
        force_dataset (bool): Creates `<dataset_id>` folder and BigQuery Dataset if it doesn't exist.
        if_table_config_exists (str): Optional.
            What to do if config files already exist

            * 'raise': Raises FileExistsError
            * 'replace': Replace with blank template
            * 'pass': Do nothing
        if_storage_data_exists (str): Optional.
            What to do if data already exists on your bucket:

            * 'raise' : Raises Conflict exception
            * 'replace' : Replace table
            * 'pass' : Do nothing
        source_format (str): Optional.
            Data source format. Only 'csv' is supported. Defaults to 'csv'.
    """

    if path is None:
        # Look if table data already exists at Storage
        data = self.client["storage_staging"].list_blobs(
            self.bucket_name,
            prefix=f"staging/{self.dataset_id}/{self.table_id}")

        # Raise: Cannot create table without external data
        if not data:
            raise BaseDosDadosException(
                "You must provide a path for uploading data")

    # Add data to storage
    if isinstance(path, (str, Path)):
        Storage(self.dataset_id, self.table_id, **self.main_vars).upload(
            path,
            mode="staging",
            if_exists=if_storage_data_exists,
        )

    # Create Dataset if it doesn't exist
    if force_dataset:
        dataset_obj = Dataset(self.dataset_id, **self.main_vars)

        try:
            dataset_obj.init()
        except FileExistsError:
            pass

        dataset_obj.create(if_exists="pass")

    self.init(
        data_sample_path=path,
        if_folder_exists="replace",
        if_table_config_exists=if_table_config_exists,
    )

    table = bigquery.Table(self.table_full_name["staging"])

    table.external_data_configuration = Datatype(
        self, source_format, "staging",
        partitioned=self._is_partitioned()).external_config

    # Lookup if table already exists
    table_ref = None
    try:
        table_ref = self.client["bigquery_staging"].get_table(
            self.table_full_name["staging"])
    except google.api_core.exceptions.NotFound:
        pass

    if isinstance(table_ref, google.cloud.bigquery.table.Table):
        if if_table_exists == "pass":
            return None

        elif if_table_exists == "raise":
            raise FileExistsError(
                "Table already exists, choose replace if you want to overwrite it"
            )

    if if_table_exists == "replace":
        self.delete(mode="staging")

    self.client["bigquery_staging"].create_table(table)
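# Sketch of Table.create: uploading a local CSV, creating the staging table
# and its config files in one call (assumes the Table class exposed by
# basedosdados; ids and path are placeholders; the 'replace' flags keep the
# call idempotent on re-runs).
from basedosdados import Table

tb = Table(dataset_id="br_exemplo", table_id="minha_tabela")
tb.create(
    path="data/minha_tabela.csv",
    if_table_exists="replace",
    if_storage_data_exists="replace",
    if_table_config_exists="replace",
)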
def init(
    self,
    data_sample_path=None,
    if_folder_exists="raise",
    if_table_config_exists="raise",
    source_format="csv",
):
    """Initialize table folder at metadata_path at `metadata_path/<dataset_id>/<table_id>`.

    The folder should contain:

    * `table_config.yaml`
    * `publish.sql`

    You can also point to a sample of the data to auto complete columns names.

    Args:
        data_sample_path (str, pathlib.PosixPath): Optional.
            Data sample path to auto complete columns names.
            It supports Comma Delimited CSV.
        if_folder_exists (str): Optional.
            What to do if table folder exists

            * 'raise' : Raises FileExistsError
            * 'replace' : Replace folder
            * 'pass' : Do nothing
        if_table_config_exists (str): Optional.
            What to do if table_config.yaml and publish.sql exist

            * 'raise' : Raises FileExistsError
            * 'replace' : Replace files with blank template
            * 'pass' : Do nothing
        source_format (str): Optional.
            Data source format. Only 'csv' is supported. Defaults to 'csv'.

    Raises:
        FileExistsError: If folder exists and replace is False.
        NotImplementedError: If data sample is not in supported type or format.
    """
    if not self.dataset_folder.exists():
        raise FileExistsError(
            f"Dataset folder {self.dataset_folder} does not exist. "
            "Create a dataset before adding tables.")

    try:
        self.table_folder.mkdir(exist_ok=(if_folder_exists == "replace"))
    except FileExistsError:
        if if_folder_exists == "raise":
            raise FileExistsError(
                f"Table folder already exists for {self.table_id}. ")
        elif if_folder_exists == "pass":
            return self

    if not data_sample_path and if_table_config_exists != "pass":
        raise BaseDosDadosException(
            "You must provide a path to correctly create config files")

    partition_columns = []
    if isinstance(data_sample_path, (str, Path)):
        # Check if partitioned and get data sample and partition columns
        data_sample_path = Path(data_sample_path)

        if data_sample_path.is_dir():
            data_sample_path = [
                f for f in data_sample_path.glob("**/*")
                if f.is_file() and f.suffix == ".csv"
            ][0]

            partition_columns = [
                k.split("=")[0]
                for k in data_sample_path.as_posix().split("/")
                if "=" in k
            ]

        columns = Datatype(self, source_format).header(data_sample_path)
    else:
        columns = ["column_name"]

    if if_table_config_exists == "pass":
        # Check if config files exist before passing
        if (Path(self.table_folder / "table_config.yaml").is_file()
                and Path(self.table_folder / "publish.sql").is_file()):
            pass
        # Raise if no sample to determine columns
        elif not data_sample_path:
            raise BaseDosDadosException(
                "You must provide a path to correctly create config files")
        else:
            self._make_template(columns, partition_columns)

    elif if_table_config_exists == "raise":
        # Check if config files already exist
        if (Path(self.table_folder / "table_config.yaml").is_file()
                and Path(self.table_folder / "publish.sql").is_file()):
            raise FileExistsError(
                f"table_config.yaml and publish.sql already exist at {self.table_folder}"
            )
        # if config files don't exist, create them
        else:
            self._make_template(columns, partition_columns)

    else:
        # 'replace': overwrite config files with a blank template
        self._make_template(columns, partition_columns)

    return self
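# Sketch of Table.init: generating table_config.yaml and publish.sql from a
# data sample (assumes the dataset folder was already created, e.g. via
# Dataset.init; ids and sample path are placeholders).
from basedosdados import Table

tb = Table(dataset_id="br_exemplo", table_id="minha_tabela")
tb.init(
    data_sample_path="data/minha_tabela.csv",
    if_folder_exists="replace",
    if_table_config_exists="replace",
)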
def download(
    savepath,
    query=None,
    dataset_id=None,
    table_id=None,
    query_project_id="basedosdados",
    billing_project_id=None,
    limit=None,
    from_file=False,
    reauth=False,
    **pandas_kwargs,
):
    """Download table or query result from basedosdados BigQuery (or other).

    * Using a **query**:

        `download('select * from `basedosdados.br_suporte.diretorio_municipios` limit 10')`

    * Using **dataset_id & table_id**:

        `download(dataset_id='br_suporte', table_id='diretorio_municipios')`

    You can also add arguments to modify save parameters:

    `download(dataset_id='br_suporte', table_id='diretorio_municipios', index=False, sep='|')`

    Args:
        savepath (str, pathlib.PosixPath):
            If savepath is a folder, it saves a file as `savepath / table_id.csv` or
            `savepath / query_result.csv` if table_id is not available.
            If savepath is a file, saves data to file.
        query (str): Optional.
            Valid SQL Standard Query to basedosdados. If query is available,
            dataset_id and table_id are not required.
        dataset_id (str): Optional.
            Dataset id available in basedosdados. It should always come with table_id.
        table_id (str): Optional.
            Table id available in basedosdados.dataset_id.
            It should always come with dataset_id.
        query_project_id (str): Optional.
            Which project the table lives. You can change this if you want to query different projects.
        billing_project_id (str): Optional.
            Project that will be billed. Find your Project ID here https://console.cloud.google.com/projectselector2/home/dashboard
        limit (int): Optional.
            Number of rows.
        from_file (boolean): Optional.
            Uses the credentials from file, located in `~/.basedosdados/credentials/`.
        reauth (boolean): Optional.
            Re-authorize Google Cloud Project in case you need to change user or reset configurations.
        pandas_kwargs (): Extra arguments accepted by [pandas.to_csv](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_csv.html)

    Raises:
        Exception: If either table_id, dataset_id or query are empty.
    """

    savepath = Path(savepath)

    # make sure that path exists
    if savepath.is_dir():
        savepath.mkdir(parents=True, exist_ok=True)
    else:
        savepath.parent.mkdir(parents=True, exist_ok=True)

    if (dataset_id is not None) and (table_id is not None):
        table = read_table(
            dataset_id,
            table_id,
            query_project_id=query_project_id,
            billing_project_id=billing_project_id,
            limit=limit,
            reauth=reauth,
            from_file=from_file,
        )

    elif query is not None:
        query += f" limit {limit}" if limit is not None else ""

        table = read_sql(
            query,
            billing_project_id=billing_project_id,
            from_file=from_file,
            reauth=reauth,
        )

    elif query is None:
        raise BaseDosDadosException(
            "Either table_id, dataset_id or query should be filled.")

    if savepath.is_dir():
        if table_id is not None:
            savepath = savepath / (table_id + ".csv")
        else:
            savepath = savepath / ("query_result.csv")

    table.to_csv(savepath, **pandas_kwargs)
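# Sketch: this pandas-based download accepts to_csv keyword arguments, so the
# saved file can be customized (an existing folder savepath gets
# <table_id>.csv appended; the billing project id is a placeholder).
import basedosdados as bd

bd.download(
    savepath=".",  # existing folder: file is saved as ./diretorio_municipios.csv
    dataset_id="br_suporte",
    table_id="diretorio_municipios",
    billing_project_id="<YOUR_PROJECT_ID>",
    limit=10,
    index=False,
    sep="|",
)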
def _load_schema(self, mode="staging"):
    """Load schema from table_config.yaml

    Args:
        mode (str): Which dataset to create [prod|staging].
    """

    self._check_mode(mode)

    json_path = self.table_folder / f"schema-{mode}.json"
    columns = self.table_config["columns"]

    if mode == "staging":
        new_columns = []
        for c in columns:
            # if is_in_staging is None, it defaults to True
            is_in_staging = (True if c.get("is_in_staging") is None else
                             c["is_in_staging"])
            # append columns declared in table_config.yaml to the schema
            # only if is_in_staging is True and the column is not a partition
            if is_in_staging and not c.get("is_partition"):
                c["type"] = "STRING"
                new_columns.append(c)

        del columns
        columns = new_columns

    elif mode == "prod":
        schema = self._get_table_obj(mode).schema

        # get field names for fields at schema and at table_config.yaml
        column_names = [c["name"] for c in columns]
        schema_names = [s.name for s in schema]

        # check if there are mismatched fields
        not_in_columns = [
            name for name in schema_names if name not in column_names
        ]
        not_in_schema = [
            name for name in column_names if name not in schema_names
        ]

        # raise if field is not in table_config
        if not_in_columns:
            raise BaseDosDadosException(
                "Column {error_columns} was not found in table_config.yaml. Are you sure that "
                "all your column names between table_config.yaml, publish.sql and "
                "{project_id}.{dataset_id}.{table_id} are the same?".format(
                    error_columns=not_in_columns,
                    project_id=self.table_config["project_id_prod"],
                    dataset_id=self.table_config["dataset_id"],
                    table_id=self.table_config["table_id"],
                ))

        # raise if field is not in schema
        elif not_in_schema:
            raise BaseDosDadosException(
                "Column {error_columns} was not found in publish.sql. Are you sure that "
                "all your column names between table_config.yaml, publish.sql and "
                "{project_id}.{dataset_id}.{table_id} are the same?".format(
                    error_columns=not_in_schema,
                    project_id=self.table_config["project_id_prod"],
                    dataset_id=self.table_config["dataset_id"],
                    table_id=self.table_config["table_id"],
                ))

        else:
            # if field is in schema, get field_type and field_mode
            for c in columns:
                for s in schema:
                    if c["name"] == s.name:
                        c["type"] = s.field_type
                        c["mode"] = s.mode
                        break

    # force utf-8, write schema_{mode}.json
    json.dump(columns, (json_path).open("w", encoding="utf-8"))

    # load newly created schema
    return self.client[f"bigquery_{mode}"].schema_from_json(str(json_path))
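# Sketch of the staging-schema rule above: columns with is_in_staging set to
# False and partition columns are dropped, and the remaining columns are
# forced to STRING. Pure-Python illustration with made-up column entries
# (not part of the library).
columns = [
    {"name": "id", "is_in_staging": True, "is_partition": False},
    {"name": "ano", "is_in_staging": True, "is_partition": True},
    {"name": "tmp", "is_in_staging": False, "is_partition": False},
]

staging = []
for c in columns:
    is_in_staging = True if c.get("is_in_staging") is None else c["is_in_staging"]
    if is_in_staging and not c.get("is_partition"):
        c["type"] = "STRING"
        staging.append(c)

# staging -> only "id", typed STRING; "ano" is a partition and "tmp" is excluded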
def update_columns(self, columns_config_url):
    """Fills columns in table_config.yaml automatically using a public Google Sheets URL.
    It also regenerates publish.sql and autofills the type using bigquery_type.

    The URL must be in the format https://docs.google.com/spreadsheets/d/<table_key>/edit#gid=<table_gid>.

    The sheet must contain the columns:
        - nome: column name
        - descricao: column description
        - tipo: column bigquery type
        - unidade_medida: column measurement unit
        - dicionario: column related dictionary
        - nome_diretorio: column related directory in the format <dataset_id>.<table_id>:<column_name>

    Args:
        columns_config_url (str): Google Sheets URL.
    """
    ruamel = ryaml.YAML()
    ruamel.preserve_quotes = True
    ruamel.indent(mapping=4, sequence=6, offset=4)
    table_config_yaml = ruamel.load(
        (self.table_folder / "table_config.yaml").open(encoding="utf-8"))

    if ("edit#gid=" not in columns_config_url
            or "https://docs.google.com/spreadsheets/d/" not in columns_config_url
            or not columns_config_url.split("=")[1].isdigit()):
        raise BaseDosDadosException(
            "The Google sheet url is not in the correct format. "
            "The url must be in the format https://docs.google.com/spreadsheets/d/<table_key>/edit#gid=<table_gid>"
        )

    df = self._sheet_to_df(columns_config_url)

    df = df.fillna("NULL")

    if "nome" not in df.columns.tolist():
        raise BaseDosDadosException(
            "Column 'nome' not found in the Google sheet. "
            "The sheet must contain the column name: 'nome'")
    elif "descricao" not in df.columns.tolist():
        raise BaseDosDadosException(
            "Column 'descricao' not found in the Google sheet. "
            "The sheet must contain the column description: 'descricao'")
    elif "tipo" not in df.columns.tolist():
        raise BaseDosDadosException(
            "Column 'tipo' not found in the Google sheet. "
            "The sheet must contain the column type: 'tipo'")
    elif "unidade_medida" not in df.columns.tolist():
        raise BaseDosDadosException(
            "Column 'unidade_medida' not found in the Google sheet. "
            "The sheet must contain the column measurement unit: 'unidade_medida'"
        )
    elif "dicionario" not in df.columns.tolist():
        raise BaseDosDadosException(
            "Column 'dicionario' not found in the Google sheet. "
            "The sheet must contain the column dictionary: 'dicionario'")
    elif "nome_diretorio" not in df.columns.tolist():
        raise BaseDosDadosException(
            "Column 'nome_diretorio' not found in the Google sheet. "
            "The sheet must contain the column directory name: 'nome_diretorio'"
        )

    columns_parameters = zip(
        df["nome"].tolist(),
        df["descricao"].tolist(),
        df["tipo"].tolist(),
        df["unidade_medida"].tolist(),
        df["dicionario"].tolist(),
        df["nome_diretorio"].tolist(),
    )

    for (
            name,
            description,
            tipo,
            unidade_medida,
            dicionario,
            nome_diretorio,
    ) in columns_parameters:
        for col in table_config_yaml["columns"]:
            if col["name"] == name:
                col["description"] = (col["description"]
                                      if description == "NULL" else description)

                col["bigquery_type"] = (col["bigquery_type"]
                                        if tipo == "NULL" else tipo.lower())

                col["measurement_unit"] = (col["measurement_unit"]
                                           if unidade_medida == "NULL" else
                                           unidade_medida)

                col["covered_by_dictionary"] = ("no" if dicionario == "NULL"
                                                else "yes")

                dataset = nome_diretorio.split(".")[0]
                col["directory_column"]["dataset_id"] = (
                    col["directory_column"]["dataset_id"]
                    if dataset == "NULL" else dataset)

                table = nome_diretorio.split(".")[-1].split(":")[0]
                col["directory_column"]["table_id"] = (
                    col["directory_column"]["table_id"]
                    if table == "NULL" else table)

                column = nome_diretorio.split(".")[-1].split(":")[-1]
                col["directory_column"]["column_name"] = (
                    col["directory_column"]["column_name"]
                    if column == "NULL" else column)

    with open(self.table_folder / "table_config.yaml", "w",
              encoding="utf-8") as config_file:
        ruamel.dump(table_config_yaml, config_file)

    # regenerate publish.sql
    self._make_publish_sql()