def upload_to_bd(tipo, filepath):
    # append the local file to the matching CAGED staging table
    if tipo == "estabelecimentos":
        tb = bd.Table(table_id="microdados_estabelecimentos", dataset_id="br_me_caged")
    else:
        tb = bd.Table(table_id="microdados_movimentacoes", dataset_id="br_me_caged")
    tb.append(filepath, if_exists="replace")
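# A minimal usage sketch for upload_to_bd above, assuming `import basedosdados as bd`
# has already run with valid credentials. The CSV path below is hypothetical and
# stands in for a partition file produced earlier in the pipeline.
upload_to_bd("estabelecimentos", "output/estabelecimentos/ano=2023/mes=1/data.csv")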
def create_table_bq(
    context,
    file_path,
    table_config="replace",
    publish_config="pass",
    table_id=None,
    dataset_id=None,
):
    # fall back to the IDs configured in the basedosdados Dagster resource
    if not table_id:
        table_id = context.resources.basedosdados_config["table_id"]
    if not dataset_id:
        dataset_id = context.resources.basedosdados_config["dataset_id"]
    tb = bd.Table(dataset_id=dataset_id, table_id=table_id)
    # keep only the path up to and including the table_id folder
    _file_path = file_path.split(table_id)[0] + table_id
    context.log.debug(_file_path)
    context.log.debug(table_id)
    tb.create(
        path=Path(_file_path),
        if_table_exists="replace",
        if_storage_data_exists="replace",
        if_table_config_exists=table_config,
    )
    tb.publish(if_exists=publish_config)
def get_filtered_download_dict(tipo, download_dict, bucket_name="basedosdados"):
    def get_year_month(b):
        ano = b.split("ano=")[1].split("/")[0]
        mes = b.split("mes=")[1].split("/")[0]
        return f"{ano}/{mes}/"

    tb = bd.Table(dataset_id="br_me_caged", table_id=f"microdados_{tipo}")
    # list the year/month partitions already uploaded to the staging bucket
    blobs = list(
        tb.client["storage_staging"]
        .bucket(bucket_name)
        .list_blobs(prefix=f"staging/{tb.dataset_id}/{tb.table_id}/")
    )
    blobs = list(set(get_year_month(b.name) for b in blobs))
    # drop from check_download the partitions that are already in storage
    check_download_pop = [
        d for d in download_dict[tipo]["check_download"] if d in blobs
    ]
    for year_month in check_download_pop:
        download_dict[tipo]["check_download"].pop(year_month, None)
    # if every must_download partition is already in storage, nothing is left to download
    must_download_pop = [
        d for d in download_dict[tipo]["must_download"] if d not in blobs
    ]
    if not must_download_pop:
        download_dict[tipo]["must_download"] = {}
    return download_dict
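# A sketch of the download_dict shape assumed by get_filtered_download_dict above:
# check_download and must_download are dicts keyed by the "{ano}/{mes}/" strings that
# get_year_month produces. The months and URL values here are hypothetical, and the
# call itself needs GCP credentials to list the staging bucket.
download_dict = {
    "estabelecimentos": {
        "check_download": {"2023/1/": "ftp://example/estab_2023_1.7z"},
        "must_download": {"2023/2/": "ftp://example/estab_2023_2.7z"},
    }
}
download_dict = get_filtered_download_dict("estabelecimentos", download_dict)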
def push_table_to_bq(
    dataset_id,
    table_id,
    source_bucket_name="basedosdados-dev",
    destination_bucket_name="basedosdados",
    backup_bucket_name="basedosdados-backup",
):
    # copy proposed data between storage buckets:
    # create a backup of the old data, delete it, then copy the new data into the destination bucket
    modes = ["staging", "raw", "auxiliary_files", "architecture", "header"]
    for mode in modes:
        try:
            sync_bucket(
                source_bucket_name=source_bucket_name,
                dataset_id=dataset_id,
                table_id=table_id,
                destination_bucket_name=destination_bucket_name,
                backup_bucket_name=backup_bucket_name,
                mode=mode,
            )
            tprint()
        except Exception as error:
            tprint(f"DATA ERROR ON {mode}.{dataset_id}.{table_id}")
            traceback.print_exc(file=sys.stderr)
            tprint()

    # load the table_config.yaml to get the metadata IDs
    table_config, configs_path = load_configs(dataset_id, table_id)
    # adjust the correct project ID in the publish SQL
    replace_project_id_publish_sql(configs_path, dataset_id, table_id)

    # create a Table object for the selected dataset and table IDs
    tb = bd.Table(dataset_id=dataset_id, table_id=table_id)
    # delete the table from staging and prod if it exists
    tb.delete("all")
    # create the staging table in BigQuery
    tb.create(
        path=None,
        if_table_exists="replace",
        if_storage_data_exists="pass",
        if_table_config_exists="pass",
    )
    # publish the table in prod BigQuery
    tb.publish(if_exists="replace")
    # update the table description
    tb.update("prod")
    # update the dataset description
    Dataset(dataset_id).update(mode="prod")
    # save the table header in storage
    save_header_files(dataset_id, table_id)
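# A hypothetical invocation of push_table_to_bq above, using the default buckets and
# a dataset/table pair that appears elsewhere in this file; it assumes the staging
# data already exists in the basedosdados-dev bucket and that credentials are set up.
push_table_to_bq(dataset_id="br_me_caged", table_id="microdados_movimentacoes")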
def push_table_to_bq(
    dataset_id,
    table_id,
    source_bucket_name="basedosdados-dev",
    destination_bucket_name="basedosdados",
    backup_bucket_name="basedosdados-staging",
):
    ### Copies proposed data between storage buckets.
    ### Creates a backup of the old data, then deletes it and copies the new data into the destination bucket.
    sync_bucket(
        source_bucket_name,
        dataset_id,
        table_id,
        destination_bucket_name,
        backup_bucket_name,
    )

    ### load the table_config.yaml to get the metadata IDs
    table_config, configs_path = load_configs(dataset_id, table_id)
    ### adjust the correct project ID in the publish SQL
    replace_project_id_publish_sql(configs_path, dataset_id, table_id)

    ### create a Table object for the selected dataset and table IDs
    tb = bd.Table(dataset_id=dataset_id, table_id=table_id)
    ### delete the table from staging and prod if it exists
    tb.delete("all")
    ### create the staging table in BigQuery
    tb.create(
        path=None,
        if_table_exists="replace",
        if_storage_data_exists="pass",
        if_table_config_exists="pass",
    )
    ### publish the table in prod BigQuery
    tb.publish(if_exists="replace")
    ### update the table description
    tb.update("prod")
    ### update the dataset description
    Dataset(dataset_id).update("prod")
def create_table_bq_v2(
    context,
    file_path,
    table_config="replace",
    publish_config="pass",
    table_id=None,
):
    if not table_id:
        table_id = context.resources.basedosdados_config["table_id"]
    dataset_id = context.resources.basedosdados_config["dataset_id"]
    context.log.debug(f"Filepath: {file_path}")
    tb = bd.Table(dataset_id=dataset_id, table_id=table_id)
    tb.create(
        path=Path(file_path),
        if_table_exists="replace",
        if_storage_data_exists="replace",
        if_table_config_exists=table_config,
    )
    tb.publish(if_exists=publish_config)
    # delete the local file after upload
    Path(file_path).unlink(missing_ok=True)
    df.rename(columns = rename_m, inplace = True)

    return df


indice = icv(codes[0], drop_m, rename_m)
variacao_mensal = icv(codes[1], drop_m, rename_m)
variacao_anual = icv(codes[2], drop_a, rename_a)

icv_mes = pd.merge(
    indice,
    variacao_mensal,
    how = 'left',
    left_on = ['ano', 'mes'],
    right_on = ['ano', 'mes'],
)

icv_mes.to_csv(path_output + 'mes/mes.csv', index=False, encoding = 'utf-8', na_rep = '')
variacao_anual.to_csv(path_output + 'ano/ano.csv', index=False, encoding = 'utf-8', na_rep = '')

print(icv_mes.head())
print(variacao_anual.head())

#------------------------------------------------------------------------------#
# BigQuery
#------------------------------------------------------------------------------#

table_ids = ['ano', 'mes']

for table_id in table_ids:
    tb = bd.Table(dataset_id = 'br_sp_saopaulo_dieese_icv', table_id = table_id)
    tb.create(
        path = path_dados + 'output/{}'.format(table_id),
        if_storage_data_exists = 'replace',
        if_table_config_exists = 'replace',
        if_table_exists = 'replace',
    )
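# A possible follow-up step, mirroring the publish call used in the snippets above:
# after tb.create builds the staging tables, tb.publish would expose the prod views.
# This is a sketch only, not part of the original script.
for table_id in table_ids:
    tb = bd.Table(dataset_id = 'br_sp_saopaulo_dieese_icv', table_id = table_id)
    tb.publish(if_exists = 'replace')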