def get_table_description(
    dataset_id=None,
    table_id=None,
    query_project_id="basedosdados",
    from_file=False,
    verbose=True,
):
    """Show or return the full description of a table.

    Args:
        dataset_id (str): Optional.
            Dataset id available in basedosdados. It should always come
            with table_id.
        table_id (str): Optional.
            Table id available in basedosdados.dataset_id. It should always
            come with dataset_id.
        query_project_id (str): Optional.
            Project where the table lives. Change it to query a different
            project.
        from_file (bool): Optional.
            Forwarded to `credentials(from_file=...)` when building the
            BigQuery client.
        verbose (bool): Optional.
            If set to True, information is printed to the screen. If set to
            False, data is returned as a `str`.
    """
    auth = credentials(from_file=from_file)
    bq_client = bigquery.Client(credentials=auth, project=query_project_id)

    table_path = f"{dataset_id}.{table_id}"
    table_obj = bq_client.get_table(table_path)

    # Printing vs. returning is delegated to _handle_output via `verbose`.
    return _handle_output(verbose=verbose, output_type="str", df=table_obj)
def list_dataset_tables(
    dataset_id,
    query_project_id="basedosdados",
    from_file=False,
    filter_by=None,
    with_description=False,
    verbose=True,
):
    """Fetch table_id for tables available at the specified dataset_id.

    Prints the information on screen or returns it as a list.

    Args:
        dataset_id (str): Required.
            Dataset id available in basedosdados.
        query_project_id (str): Optional.
            Project where the table lives. Change it to query a different
            project.
        from_file (bool): Optional.
            Forwarded to `credentials(from_file=...)` when building the
            BigQuery client.
        filter_by (str): Optional.
            String to be matched in the table_id. It is passed as-is to
            `Series.str.contains`.
        with_description (bool): Optional.
            If True, fetch short table descriptions for each table that
            matches the search criteria. This issues one `get_table` call
            per listed table.
        verbose (bool): Optional.
            If set to True, information is printed to the screen. If set to
            False, a list object is returned.

    Example:
        list_dataset_tables(
            dataset_id='br_ibge_censo2010',
            filter_by='renda',
            with_description=True,
        )
    """
    client = bigquery.Client(
        credentials=credentials(from_file=from_file), project=query_project_id
    )

    dataset = client.get_dataset(dataset_id)

    tables = pd.DataFrame(
        [table.table_id for table in client.list_tables(dataset)],
        columns=["table_id"],
    )

    if filter_by:
        # .copy() detaches the filtered frame from the original so that adding
        # the "description" column below does not raise SettingWithCopyWarning.
        tables = tables.loc[tables["table_id"].str.contains(filter_by)].copy()

    if with_description:
        tables["description"] = [
            _get_header(client.get_table(f"{dataset_id}.{table}").description)
            for table in tables["table_id"]
        ]

    return _handle_output(
        verbose=verbose,
        output_type="list",
        df=tables,
        col_name="table_id",
    )
def get_table_size(
    dataset_id,
    table_id,
    billing_project_id,
    query_project_id="basedosdados",
    from_file=False,
    verbose=True,
):
    """Use a query to get the number of rows and size (in Mb) of a table.

    Prints information on screen in markdown friendly format.

    WARNING: this query may cost a lot depending on the table.

    Args:
        dataset_id (str): Required.
            Dataset id available in basedosdados. It should always come
            with table_id.
        table_id (str): Required.
            Table id available in basedosdados.dataset_id. It should always
            come with dataset_id.
        billing_project_id (str): Required.
            Project that will be billed. Find your Project ID here
            https://console.cloud.google.com/projectselector2/home/dashboard
        query_project_id (str): Optional.
            Project where the table lives. Change it to query a different
            project.
        from_file (bool): Optional.
            Forwarded to `credentials(from_file=...)` when building the
            BigQuery client.
        verbose (bool): Optional.
            If set to True, information is printed to the screen. If set to
            False, data is returned as a `list` of `dict`s.

    Example:
        get_table_size(
            dataset_id='br_ibge_censo2010',
            table_id='pessoa_renda_setor_censitario',
            billing_project_id='yourprojectid'
        )
    """
    billing_client = bigquery.Client(
        credentials=credentials(from_file=from_file), project=billing_project_id
    )

    # The table path is wrapped in backticks so that ids containing dashes or
    # other special characters are parsed as a single identifier. The count
    # column is aliased instead of relying on the auto-generated name.
    query = (
        "SELECT COUNT(*) AS num_rows "
        f"FROM `{query_project_id}.{dataset_id}.{table_id}`"
    )

    job = billing_client.query(query, location="US")

    num_rows = job.to_dataframe().loc[0, "num_rows"]

    # NOTE(review): size_mb is derived from the bytes processed by the COUNT
    # query above, not from the table's storage metadata -- confirm that this
    # is the intended measure.
    size_mb = round(job.total_bytes_processed / 1024 / 1024, 2)

    table_data = pd.DataFrame(
        [
            {
                "project_id": query_project_id,
                "dataset_id": dataset_id,
                "table_id": table_id,
                "num_rows": num_rows,
                "size_mb": size_mb,
            }
        ]
    )

    return _handle_output(verbose=verbose, output_type="records", df=table_data)
def list_datasets(
    query_project_id="basedosdados",
    filter_by=None,
    with_description=False,
    from_file=False,
    verbose=True,
):
    """Fetch the dataset_id of datasets available at query_project_id.

    Prints information on screen or returns it as a list.

    Args:
        query_project_id (str): Optional.
            Project whose datasets are listed. Change it to query a
            different project.
        filter_by (str): Optional.
            String to be matched in dataset_id. It is passed as-is to
            `Series.str.contains`.
        with_description (bool): Optional.
            If True, fetch a short dataset description for each dataset.
            This issues one `get_dataset` call per listed dataset.
        from_file (bool): Optional.
            Forwarded to `credentials(from_file=...)` when building the
            BigQuery client.
        verbose (bool): Optional.
            If set to True, information is printed to the screen. If set to
            False, a list object is returned.

    Example:
        list_datasets(
            filter_by='sp',
            with_description=True,
        )
    """
    client = bigquery.Client(
        credentials=credentials(from_file=from_file), project=query_project_id
    )

    datasets = pd.DataFrame(
        [dataset.dataset_id for dataset in client.list_datasets()],
        columns=["dataset_id"],
    )

    if filter_by:
        # .copy() detaches the filtered frame from the original so that adding
        # the "description" column below does not raise SettingWithCopyWarning.
        datasets = datasets.loc[
            datasets["dataset_id"].str.contains(filter_by)
        ].copy()

    if with_description:
        datasets["description"] = [
            _get_header(client.get_dataset(dataset).description)
            for dataset in datasets["dataset_id"]
        ]

    return _handle_output(
        verbose=verbose,
        output_type="list",
        df=datasets,
        col_name="dataset_id",
    )
def get_table_columns(
    dataset_id=None,
    table_id=None,
    query_project_id="basedosdados",
    from_file=False,
    verbose=True,
):
    """Fetch name, type and description of every column in a table.

    Prints information on screen.

    Args:
        dataset_id (str): Optional.
            Dataset id available in basedosdados. It should always come
            with table_id.
        table_id (str): Optional.
            Table id available in basedosdados.dataset_id. It should always
            come with dataset_id.
        query_project_id (str): Optional.
            Project where the table lives. Change it to query a different
            project.
        from_file (bool): Optional.
            Forwarded to `credentials(from_file=...)` when building the
            BigQuery client.
        verbose (bool): Optional.
            If set to True, information is printed to the screen. If set to
            False, data is returned as a `list` of `dict`s.

    Example:
        get_table_columns(
            dataset_id='br_ibge_censo2010',
            table_id='pessoa_renda_setor_censitario'
        )
    """
    auth = credentials(from_file=from_file)
    bq_client = bigquery.Client(credentials=auth, project=query_project_id)

    table_path = f"{dataset_id}.{table_id}"
    schema = bq_client.get_table(table_path).schema

    # One (name, type, description) row per schema field, in schema order.
    rows = []
    for column in schema:
        rows.append((column.name, column.field_type, column.description))

    header = ["name", "field_type", "description"]
    columns_frame = pd.DataFrame(rows, columns=header)

    return _handle_output(verbose=verbose, output_type="records", df=columns_frame)
def read_sql(
    query,
    billing_project_id=None,
    from_file=False,
    reauth=False,
    use_bqstorage_api=False,
):
    """Load data from BigQuery using a query. Just a wrapper around pandas.read_gbq

    Args:
        query (sql):
            Valid SQL Standard Query to basedosdados
        billing_project_id (str): Optional.
            Project that will be billed. Find your Project ID here
            https://console.cloud.google.com/projectselector2/home/dashboard
        from_file (boolean): Optional.
            Uses the credentials from file, located in
            `~/.basedosdados/credentials/`
        reauth (boolean): Optional.
            Re-authorize Google Cloud Project in case you need to change
            user or reset configurations.
        use_bqstorage_api (boolean): Optional.
            Use the BigQuery Storage API to download query results quickly,
            but at an increased cost
            (https://cloud.google.com/bigquery/docs/reference/storage/).
            To use this API, first enable it in the Cloud Console
            (https://console.cloud.google.com/apis/library/bigquerystorage.googleapis.com).
            You must also have the bigquery.readsessions.create permission
            on the project you are billing queries to.

    Returns:
        pd.DataFrame:
            Query result

    Raises:
        BaseDosDadosAccessDeniedException: a GenericGBQException whose
            message contains "Reason: 403".
        BaseDosDadosInvalidProjectIDException: a GenericGBQException whose
            message reports a 400 POST error mentioning the project id.
        BaseDosDadosAuthorizationException: a PyDataCredentialsError was
            raised while running the query.
        BaseDosDadosNoBillingProjectIDException: an OSError/ValueError whose
            message indicates that no project id could be determined.
    """
    try:
        read_client = bigquery_storage_v1.client.BigQueryReadClient

        # Set a two hours timeout on read_rows. The patch is applied only once
        # per process: re-wrapping on every call would stack one more wrapper
        # layer around read_rows each time read_sql runs.
        if not getattr(read_client, "_basedosdados_timeout_patched", False):
            read_client.read_rows = partialmethod(
                read_client.read_rows,
                timeout=3600 * 2,
            )
            read_client._basedosdados_timeout_patched = True

        return pandas_gbq.read_gbq(
            query,
            credentials=credentials(from_file=from_file, reauth=reauth),
            project_id=billing_project_id,
            use_bqstorage_api=use_bqstorage_api,
        )

    except GenericGBQException as e:
        message = str(e)
        if "Reason: 403" in message:
            raise BaseDosDadosAccessDeniedException from e

        # re.search (not re.match) so the pattern is found anywhere in the
        # message, consistent with the substring check used for 403 above.
        if re.search(r"Reason: 400 POST .* [Pp]roject[ ]*I[Dd]", message):
            raise BaseDosDadosInvalidProjectIDException from e

        raise

    except PyDataCredentialsError as e:
        raise BaseDosDadosAuthorizationException from e

    except (OSError, ValueError) as e:
        message = str(e)
        no_billing_id = (
            "Could not determine project ID" in message
            or "reading from stdin while output is captured" in message
        )
        if no_billing_id:
            raise BaseDosDadosNoBillingProjectIDException from e
        raise