def metadata_validate():
    # find the dataset and tables of the PR
    dataset_table_ids = get_table_dataset_id()

    # print dataset tables info
    tprint("TABLES FOUND")
    pprint(dataset_table_ids)
    tprint()

    # iterate over each table in dataset of the PR
    for table_id in dataset_table_ids.keys():
        dataset_id = dataset_table_ids[table_id]["dataset_id"]
        source_bucket_name = dataset_table_ids[table_id]["source_bucket_name"]

        try:
            # create the table metadata object
            md = Metadata(dataset_id=dataset_id, table_id=table_id)

            md.validate()
            tprint(f"SUCESS VALIDATE {dataset_id}.{table_id}")
            tprint()

        except Exception as error:
            tprint(f"ERROR ON {dataset_id}.{table_id}")
            traceback.print_exc()
            tprint()
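For reference, a minimal sketch (not part of the original source) of the shape that `get_table_dataset_id()` is assumed to return, based on the keys accessed above; the table, dataset, and bucket names are hypothetical placeholders:

example_dataset_table_ids = {
    "some_table": {
        "dataset_id": "some_dataset",
        "source_bucket_name": "some-source-bucket",
    },
}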
Example #2
def cli_validate_metadata(ctx, dataset_id, table_id):
    m = Metadata(dataset_id, table_id, **ctx.obj)

    try:
        m.validate()
        msg, color = "Local metadata is valid.", "green"
    except BaseDosDadosException as e:
        msg = (
            f"Local metadata is invalid. Please check the traceback below for"
            f" more information on how to fix it:\n\n{repr(e)}")
        color = "red"

    click.echo(click.style(msg, fg=color))
Example #3
def cli_is_updated_metadata(ctx, dataset_id, table_id):
    m = Metadata(dataset_id, table_id, **ctx.obj)

    if m.is_updated():
        msg, color = "Local metadata is updated.", "green"
    else:
        msg = (
            "Local metadata is out of date. Please run `basedosdados metadata"
            " create` with the flag `if_exists=replace` to get the updated"
            " data."
        )
        color = "red"

    click.echo(click.style(msg, fg=color))
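The same checks can be run outside the CLI. A minimal sketch, assuming `Metadata` is importable from the `basedosdados` package and that local metadata exists for the hypothetical ids used here:

from basedosdados import Metadata

m = Metadata(dataset_id="some_dataset", table_id="some_table")

try:
    m.validate()
    print("Local metadata is valid.")
except Exception as e:
    print(f"Local metadata is invalid: {e!r}")

if not m.is_updated():
    print("Local metadata is out of date; recreate it with if_exists='replace'.")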
Example #4
    def __init__(self, dataset_id, table_id, **kwargs):
        super().__init__(**kwargs)

        self.table_id = table_id.replace("-", "_")
        self.dataset_id = dataset_id.replace("-", "_")
        self.dataset_folder = Path(self.metadata_path / self.dataset_id)
        self.table_folder = self.dataset_folder / table_id
        self.table_full_name = dict(
            prod=f"{self.client['bigquery_prod'].project}.{self.dataset_id}.{self.table_id}",
            staging=f"{self.client['bigquery_staging'].project}.{self.dataset_id}_staging.{self.table_id}",
        )
        self.table_full_name.update(dict(all=deepcopy(self.table_full_name)))
        self.metadata = Metadata(self.dataset_id, self.table_id, **kwargs)
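For a concrete picture of the mapping built above, a hedged illustration assuming hypothetical projects `prod-project`/`staging-project` and ids `some_dataset`/`some_table`; note that the `all` entry is a deep copy of the dict before it was added:

expected_table_full_name = {
    "prod": "prod-project.some_dataset.some_table",
    "staging": "staging-project.some_dataset_staging.some_table",
    "all": {
        "prod": "prod-project.some_dataset.some_table",
        "staging": "staging-project.some_dataset_staging.some_table",
    },
}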
Example #5
def cli_publish_metadata(
    ctx,
    dataset_id,
    table_id,
    all,
    if_exists,
    update_locally,
):
    m = Metadata(dataset_id, table_id, **ctx.obj)

    try:
        m.publish(all=all, if_exists=if_exists, update_locally=update_locally)
        msg, color = "Local metadata has been published.", "green"
    except (CKANAPIError, BaseDosDadosException, AssertionError) as e:
        msg = (f"Local metadata couldn't be published due to an error. Pleas"
               f"e check the traceback below for more information on how to "
               f"fix it:\n\n{repr(e)}")
        color = "red"

    click.echo(click.style(msg, fg=color))
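The same publish call can be made outside of click; a minimal sketch with hypothetical ids and argument values, using only the parameters forwarded above:

from basedosdados import Metadata

m = Metadata(dataset_id="some_dataset", table_id="some_table")

# publish the table metadata to CKAN, replacing any existing record and
# keeping the local config file in sync
m.publish(all=False, if_exists="replace", update_locally=True)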
Example #6
def cli_create_metadata(
    ctx,
    dataset_id,
    table_id,
    if_exists,
    columns,
    partition_columns,
    force_columns,
    table_only,
):

    m = Metadata(dataset_id, table_id, **ctx.obj).create(
        if_exists=if_exists,
        columns=columns,
        partition_columns=partition_columns,
        force_columns=force_columns,
        table_only=table_only,
    )

    click.echo(
        click.style(
            f"Metadata file was created at `{m.filepath}`",
            fg="green",
        ))
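A direct-Python sketch of the same create call, with hypothetical ids, column names, and argument values; the parameters mirror the ones forwarded above, and `create` is assumed to return an object exposing `filepath`, as the example relies on:

from basedosdados import Metadata

m = Metadata(dataset_id="some_dataset", table_id="some_table").create(
    if_exists="replace",
    columns=["year", "state_id", "value"],
    partition_columns=["year"],
    force_columns=False,
    table_only=False,
)
print(f"Metadata file was created at `{m.filepath}`")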
Example #7
class Dataset(Base):
    """
    Manage datasets in BigQuery.
    """

    def __init__(self, dataset_id, **kwargs):
        super().__init__(**kwargs)

        self.dataset_id = dataset_id.replace("-", "_")
        self.dataset_folder = Path(self.metadata_path / self.dataset_id)
        self.metadata = Metadata(self.dataset_id, **kwargs)

    @property
    def dataset_config(self):

        return self._load_yaml(
            self.metadata_path / self.dataset_id / "dataset_config.yaml"
        )

    def _loop_modes(self, mode="all"):

        mode = ["prod", "staging"] if mode == "all" else [mode]
        dataset_tag = lambda m: f"_{m}" if m == "staging" else ""

        return (
            {
                "client": self.client[f"bigquery_{m}"],
                "id": f"{self.client[f'bigquery_{m}'].project}.{self.dataset_id}{dataset_tag(m)}",
            }
            for m in mode
        )

    def _setup_dataset_object(self, dataset_id):

        dataset = bigquery.Dataset(dataset_id)
        ## TODO: not being used since 1.6.0 - need to redo the description that goes to BigQuery
        # dataset.description = self._render_template(
        #     Path("dataset/dataset_description.txt"), self.dataset_config
        # )

        return dataset

    def _write_readme_file(self):

        readme_content = (
            f"Como capturar os dados de {self.dataset_id}?\n\nPara cap"
            f"turar esses dados, basta verificar o link dos dados orig"
            f"inais indicado em dataset_config.yaml no item website.\n"
            f"\nCaso tenha sido utilizado algum código de captura ou t"
            f"ratamento, estes estarão contidos em code/. Se o dado pu"
            f"blicado for em sua versão bruta, não existirá a pasta co"
            f"de/.\n\nOs dados publicados estão disponíveis em: https:"
            f"//basedosdados.org/dataset/{self.dataset_id.replace('_','-')}"
        )

        readme_path = Path(self.metadata_path / self.dataset_id / "README.md")

        with open(readme_path, "w") as readmefile:
            readmefile.write(readme_content)

    def init(self, replace=False):
        """Initialize dataset folder at metadata_path at `metadata_path/<dataset_id>`.

        The folder should contain:

        * `dataset_config.yaml`
        * `README.md`

        Args:
            replace (bool): Optional. Whether to replace the existing folder.

        Raises:
            FileExistsError: If dataset folder already exists and replace is False
        """

        # Create dataset folder
        try:
            self.dataset_folder.mkdir(exist_ok=replace, parents=True)
        except FileExistsError:
            raise FileExistsError(
                f"Dataset {str(self.dataset_folder.stem)} folder already exists. "
                "Set replace=True to replace current files."
            )

        # create dataset_config.yaml with metadata
        self.metadata.create(if_exists="replace")

        # create README.md file
        self._write_readme_file()

        # Add code folder
        (self.dataset_folder / "code").mkdir(exist_ok=replace, parents=True)

        return self

    def publicize(self, mode="all"):
        """Changes IAM configuration to turn BigQuery dataset public.

        Args:
            mode (str): Which dataset to publicize [prod|staging|all].
        """

        for m in self._loop_modes(mode):

            dataset = m["client"].get_dataset(m["id"])
            entries = dataset.access_entries

            entries.extend(
                [
                    bigquery.AccessEntry(
                        role="roles/bigquery.dataViewer",
                        entity_type="iamMember",
                        entity_id="allUsers",
                    ),
                    bigquery.AccessEntry(
                        role="roles/bigquery.metadataViewer",
                        entity_type="iamMember",
                        entity_id="allUsers",
                    ),
                    bigquery.AccessEntry(
                        role="roles/bigquery.user",
                        entity_type="iamMember",
                        entity_id="allUsers",
                    ),
                ]
            )
            dataset.access_entries = entries

            m["client"].update_dataset(dataset, ["access_entries"])

    def create(self, mode="all", if_exists="raise"):
        """Creates BigQuery datasets given `dataset_id`.

        It can create two datasets:

        * `<dataset_id>` (mode = 'prod')
        * `<dataset_id>_staging` (mode = 'staging')

        If `mode` is all, it creates both.

        Args:
            mode (str): Optional. Which dataset to create [prod|staging|all].
            if_exists (str): Optional. What to do if dataset exists

                * raise : Raises Conflict exception
                * replace : Drop all tables and replace dataset
                * update : Update dataset description
                * pass : Do nothing

        Raises:
            google.api_core.exceptions.Conflict: If the dataset already exists and if_exists is set to `raise`
        """

        if if_exists == "replace":
            self.delete(mode)
        elif if_exists == "update":

            self.update()
            return

        # Set dataset_id to the ID of the dataset to create.
        for m in self._loop_modes(mode):

            # Construct a full Dataset object to send to the API.
            dataset_obj = self._setup_dataset_object(m["id"])

            # Send the dataset to the API for creation.
            # Raises google.api_core.exceptions.Conflict if the Dataset
            # already exists within the project.
            try:
                m["client"].create_dataset(dataset_obj)  # Make an API request.
            except Conflict:

                if if_exists == "pass":
                    return
                else:
                    raise Conflict(f"Dataset {self.dataset_id} already exists")

        # Make the datasets public
        self.publicize()

    def delete(self, mode="all"):
        """Deletes dataset in BigQuery. Toogle mode to choose which dataset to delete.

        Args:
            mode (str): Optional.  Which dataset to delete [prod|staging|all]
        """

        for m in self._loop_modes(mode):

            m["client"].delete_dataset(m["id"], delete_contents=True, not_found_ok=True)

    def update(self, mode="all"):
        """Update dataset description. Toogle mode to choose which dataset to update.

        Args:
            mode (str): Optional. Which dataset to update [prod|staging|all]
        """

        for m in self._loop_modes(mode):

            # Send the dataset update to the API.
            dataset = m["client"].update_dataset(
                self._setup_dataset_object(m["id"]), fields=["description"]
            )  # Make an API request.
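A minimal usage sketch for the class above (not from the original source; the dataset id is a hypothetical placeholder and `Dataset` is assumed to be importable from `basedosdados`):

from basedosdados import Dataset

ds = Dataset(dataset_id="some_dataset")
ds.init(replace=False)                   # scaffold metadata_path/<dataset_id>/
ds.create(mode="all", if_exists="pass")  # create prod and staging datasets; also publicizes them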
Example #8
    def __init__(self, dataset_id, **kwargs):
        super().__init__(**kwargs)

        self.dataset_id = dataset_id.replace("-", "_")
        self.dataset_folder = Path(self.metadata_path / self.dataset_id)
        self.metadata = Metadata(self.dataset_id, **kwargs)
Example #9
def table_approve():
    # find the dataset and tables of the PR
    dataset_table_ids = get_table_dataset_id()

    # print dataset tables info
    tprint("TABLES FOUND")
    pprint(dataset_table_ids)
    tprint()
    # iterate over each table in dataset of the PR
    for table_id in dataset_table_ids.keys():
        dataset_id = dataset_table_ids[table_id]["dataset_id"]
        source_bucket_name = dataset_table_ids[table_id]["source_bucket_name"]

        # create a BigQuery table in prod
        try:
            # push the table to bigquery
            tprint(f"STARTING TABLE APPROVE ON {dataset_id}.{table_id}")
            push_table_to_bq(
                dataset_id=dataset_id,
                table_id=table_id,
                source_bucket_name=source_bucket_name,
                destination_bucket_name=os.environ.get(
                    "BUCKET_NAME_DESTINATION"),
                backup_bucket_name=os.environ.get("BUCKET_NAME_BACKUP"),
            )
            pretty_log(dataset_id, table_id, source_bucket_name)
            tprint()
        except Exception as error:
            tprint(f"DATA ERROR ON {dataset_id}.{table_id}")
            traceback.print_exc(file=sys.stderr)
            tprint()
        # publish metadata in prod
        try:
            # create table metadata object
            md = Metadata(dataset_id=dataset_id, table_id=table_id)

            # check if correspondent dataset metadata already exists in CKAN
            if not md.dataset_metadata_obj.exists_in_ckan():
                # validate dataset metadata
                md.dataset_metadata_obj.validate()
                tprint(f"SUCESS VALIDATE {dataset_id}")

                # publish dataset metadata to CKAN
                md.dataset_metadata_obj.publish()

                # retry until the dataset metadata is available in CKAN:
                # the dataset metadata must be accessible before the table
                # metadata can be published
                tprint(f"WAITING FOR {dataset_id} METADATA TO BE PUBLISHED...")
                MAX_RETRIES = 80
                retry_count = 0
                while not md.dataset_metadata_obj.exists_in_ckan():
                    time.sleep(15)
                    retry_count += 1
                    if retry_count >= MAX_RETRIES:
                        break

                if md.dataset_metadata_obj.exists_in_ckan():
                    tprint(f"SUCESS PUBLISH {dataset_id}")
                else:
                    tprint(f"ERROR PUBLISH {dataset_id}")

            # validate table metadata
            md.validate()
            tprint(f"SUCESS VALIDATE {dataset_id}.{table_id}")
            # publish table metadata to CKAN
            md.publish(if_exists="replace")
            tprint(f"SUCESS PUBLISHED {dataset_id}.{table_id}")
            tprint()
        except Exception as error:
            tprint(f"METADATA ERROR ON {dataset_id}.{table_id}")
            traceback.print_exc(file=sys.stderr)
            tprint()
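The wait-for-CKAN loop above could be factored into a small helper; a hedged sketch in which the helper name, signature, and defaults are hypothetical and only `exists_in_ckan()` comes from the code above:

import time


def wait_for_ckan(metadata_obj, max_retries=80, interval=15):
    """Poll CKAN until the dataset metadata is visible or the retries run out."""
    for _ in range(max_retries):
        if metadata_obj.exists_in_ckan():
            return True
        time.sleep(interval)
    return metadata_obj.exists_in_ckan()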
Example #10
class Table(Base):
    """
    Manage tables in Google Cloud Storage and BigQuery.
    """
    def __init__(self, dataset_id, table_id, **kwargs):
        super().__init__(**kwargs)

        self.table_id = table_id.replace("-", "_")
        self.dataset_id = dataset_id.replace("-", "_")
        self.dataset_folder = Path(self.metadata_path / self.dataset_id)
        self.table_folder = self.dataset_folder / table_id
        self.table_full_name = dict(
            prod=f"{self.client['bigquery_prod'].project}.{self.dataset_id}.{self.table_id}",
            staging=f"{self.client['bigquery_staging'].project}.{self.dataset_id}_staging.{self.table_id}",
        )
        self.table_full_name.update(dict(all=deepcopy(self.table_full_name)))
        self.metadata = Metadata(self.dataset_id, self.table_id, **kwargs)

    @property
    def table_config(self):
        return self._load_yaml(self.table_folder / "table_config.yaml")

    def _get_table_obj(self, mode):
        return self.client[f"bigquery_{mode}"].get_table(
            self.table_full_name[mode])

    def _is_partitioned(self):
        ## check if the table is partitioned; the split is needed because of a change in the partitions type in pydantic
        partitions = self.table_config["partitions"]
        if partitions:
            partitions = partitions.split(",")

        if partitions is None:
            return False

        elif isinstance(partitions, list):
            # check if any None inside list.
            # False if it is the case Ex: [None, 'partition']
            # True otherwise          Ex: ['partition1', 'partition2']
            return all(item is not None for item in partitions)

    def _load_schema(self, mode="staging"):
        """Load schema from table_config.yaml

        Args:
            mode (bool): Which dataset to create [prod|staging].
        """

        self._check_mode(mode)

        json_path = self.table_folder / f"schema-{mode}.json"
        columns = self.table_config["columns"]

        if mode == "staging":
            new_columns = []
            for c in columns:
                # if is_in_staging is None, it defaults to True
                is_in_staging = (
                    True if c.get("is_in_staging") is None else c["is_in_staging"]
                )
                # append columns declared in table_config.yaml to the schema only if is_in_staging is True
                if is_in_staging and not c.get("is_partition"):
                    c["type"] = "STRING"
                    new_columns.append(c)

            del columns
            columns = new_columns

        elif mode == "prod":
            schema = self._get_table_obj(mode).schema

            # get field names for fields at schema and at table_config.yaml
            column_names = [c["name"] for c in columns]
            schema_names = [s.name for s in schema]

            # check if there are mismatched fields
            not_in_columns = [
                name for name in schema_names if name not in column_names
            ]
            not_in_schema = [
                name for name in column_names if name not in schema_names
            ]

            # raise if field is not in table_config
            if not_in_columns:
                raise BaseDosDadosException(
                    "Column {error_columns} was not found in table_config.yaml. Are you sure that "
                    "all your column names between table_config.yaml, publish.sql and "
                    "{project_id}.{dataset_id}.{table_id} are the same?".
                    format(
                        error_columns=not_in_columns,
                        project_id=self.table_config["project_id_prod"],
                        dataset_id=self.table_config["dataset_id"],
                        table_id=self.table_config["table_id"],
                    ))

            # raise if field is not in schema
            elif not_in_schema:
                raise BaseDosDadosException(
                    "Column {error_columns} was not found in publish.sql. Are you sure that "
                    "all your column names between table_config.yaml, publish.sql and "
                    "{project_id}.{dataset_id}.{table_id} are the same?".
                    format(
                        error_columns=not_in_schema,
                        project_id=self.table_config["project_id_prod"],
                        dataset_id=self.table_config["dataset_id"],
                        table_id=self.table_config["table_id"],
                    ))

            else:
                # if field is in schema, get field_type and field_mode
                for c in columns:
                    for s in schema:
                        if c["name"] == s.name:
                            c["type"] = s.field_type
                            c["mode"] = s.mode
                            break
        ## write schema-{mode}.json, forcing utf-8 encoding
        json.dump(columns, (json_path).open("w", encoding="utf-8"))

        # load new created schema
        return self.client[f"bigquery_{mode}"].schema_from_json(str(json_path))

    def _make_publish_sql(self):
        """Create publish.sql with columns and bigquery_type"""

        ### publish.sql header and instructions
        publish_txt = """
        /*
        Query para publicar a tabela.

        Esse é o lugar para:
            - modificar nomes, ordem e tipos de colunas
            - dar join com outras tabelas
            - criar colunas extras (e.g. logs, proporções, etc.)

        Qualquer coluna definida aqui deve também existir em `table_config.yaml`.

        # Além disso, sinta-se à vontade para alterar alguns nomes obscuros
        # para algo um pouco mais explícito.

        TIPOS:
            - Para modificar tipos de colunas, basta substituir STRING por outro tipo válido.
            - Exemplo: `SAFE_CAST(column_name AS NUMERIC) column_name`
            - Mais detalhes: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types
        */
        """

        # remove triple quotes extra space
        publish_txt = inspect.cleandoc(publish_txt)
        publish_txt = textwrap.dedent(publish_txt)

        # add create table statement
        project_id_prod = self.client["bigquery_prod"].project
        publish_txt += f"\n\nCREATE VIEW {project_id_prod}.{self.dataset_id}.{self.table_id} AS\nSELECT \n"

        # sort columns by is_partition; partition columns come first

        if self._is_partitioned():
            columns = sorted(
                self.table_config["columns"],
                key=lambda k:
                (k["is_partition"] is not None, k["is_partition"]),
                reverse=True,
            )
        else:
            columns = self.table_config["columns"]

        # add columns in publish.sql
        for col in columns:
            name = col["name"]
            bigquery_type = ("STRING" if col["bigquery_type"] is None else
                             col["bigquery_type"].upper())

            publish_txt += f"SAFE_CAST({name} AS {bigquery_type}) {name},\n"
        ## remove last comma
        publish_txt = publish_txt[:-2] + "\n"

        # add from statement
        project_id_staging = self.client["bigquery_staging"].project
        publish_txt += (
            f"FROM {project_id_staging}.{self.dataset_id}_staging.{self.table_id} AS t"
        )

        # save publish.sql in table_folder
        (self.table_folder / "publish.sql").open(
            "w", encoding="utf-8").write(publish_txt)

    def _make_template(self, columns, partition_columns,
                       if_table_config_exists):
        # create table_config.yaml with metadata
        self.metadata.create(
            if_exists=if_table_config_exists,
            columns=partition_columns + columns,
            partition_columns=partition_columns,
            table_only=False,
        )

        self._make_publish_sql()

    def _sheet_to_df(self, columns_config_url):
        url = columns_config_url.replace("edit#gid=", "export?format=csv&gid=")
        try:
            return pd.read_csv(
                StringIO(requests.get(url).content.decode("utf-8")))
        except Exception:
            raise BaseDosDadosException(
                "Check if your Google sheet sharing setting is: 'Anyone on the internet with this link can view'"
            )

    def table_exists(self, mode):
        """Check if table exists in BigQuery.

        Args:
            mode (str): Which dataset to check [prod|staging].
        """

        try:
            ref = self._get_table_obj(mode=mode)
        except google.api_core.exceptions.NotFound:
            ref = None

        if ref:
            return True
        else:
            return False

    def update_columns(self, columns_config_url):
        """
        Fills columns in table_config.yaml automatically using a public Google Sheets URL. It also regenerates
        publish.sql and autofills the column type using bigquery_type.

        The URL must be in the format https://docs.google.com/spreadsheets/d/<table_key>/edit#gid=<table_gid>.
        The sheet must contain the columns:
            - nome: column name
            - descricao: column description
            - tipo: column bigquery type
            - unidade_medida: column measurement unit
            - dicionario: column related dictionary
            - nome_diretorio: column related directory in the format <dataset_id>.<table_id>:<column_name>

        Args:
            columns_config_url (str): google sheets URL.
        """
        ruamel = ryaml.YAML()
        ruamel.preserve_quotes = True
        ruamel.indent(mapping=4, sequence=6, offset=4)
        table_config_yaml = ruamel.load(
            (self.table_folder / "table_config.yaml").open(encoding="utf-8"))
        if ("edit#gid=" not in columns_config_url
                or "https://docs.google.com/spreadsheets/d/"
                not in columns_config_url
                or not columns_config_url.split("=")[1].isdigit()):
            raise BaseDosDadosException(
                "The Google sheet URL is not in the correct format. "
                "The URL must be in the format https://docs.google.com/spreadsheets/d/<table_key>/edit#gid=<table_gid>"
            )

        df = self._sheet_to_df(columns_config_url)
        df = df.fillna("NULL")

        if "nome" not in df.columns.tolist():
            raise BaseDosDadosException(
                "Column 'nome' not found in Google the google sheet. "
                "The sheet must contain the column name: 'nome'")
        elif "descricao" not in df.columns.tolist():
            raise BaseDosDadosException(
                "Column 'descricao' not found in Google the google sheet. "
                "The sheet must contain the column description: 'descricao'")
        elif "tipo" not in df.columns.tolist():
            raise BaseDosDadosException(
                "Column 'tipo' not found in Google the google sheet. "
                "The sheet must contain the column type: 'tipo'")
        elif "unidade_medida" not in df.columns.tolist():
            raise BaseDosDadosException(
                "Column 'unidade_medida' not found in Google the google sheet. "
                "The sheet must contain the column measurement unit: 'unidade_medida'"
            )
        elif "dicionario" not in df.columns.tolist():
            raise BaseDosDadosException(
                "Column 'dicionario' not found in Google the google sheet. "
                "The sheet must contain the column dictionary: 'dicionario'")
        elif "nome_diretorio" not in df.columns.tolist():
            raise BaseDosDadosException(
                "Column 'nome_diretorio' not found in Google the google sheet. "
                "The sheet must contain the column dictionary name: 'nome_diretorio'"
            )

        columns_parameters = zip(
            df["nome"].tolist(),
            df["descricao"].tolist(),
            df["tipo"].tolist(),
            df["unidade_medida"].tolist(),
            df["dicionario"].tolist(),
            df["nome_diretorio"].tolist(),
        )

        for (
                name,
                description,
                tipo,
                unidade_medida,
                dicionario,
                nome_diretorio,
        ) in columns_parameters:
            for col in table_config_yaml["columns"]:
                if col["name"] == name:

                    col["description"] = (col["description"] if description
                                          == "NULL" else description)

                    col["bigquery_type"] = (col["bigquery_type"] if tipo
                                            == "NULL" else tipo.lower())

                    col["measurement_unit"] = (col["measurement_unit"]
                                               if unidade_medida == "NULL" else
                                               unidade_medida)

                    col["covered_by_dictionary"] = ("no" if dicionario
                                                    == "NULL" else "yes")

                    dataset = nome_diretorio.split(".")[0]
                    col["directory_column"]["dataset_id"] = (
                        col["directory_column"]["dataset_id"]
                        if dataset == "NULL" else dataset)

                    table = nome_diretorio.split(".")[-1].split(":")[0]
                    col["directory_column"]["table_id"] = (
                        col["directory_column"]["table_id"]
                        if table == "NULL" else table)

                    column = nome_diretorio.split(".")[-1].split(":")[-1]
                    col["directory_column"]["column_name"] = (
                        col["directory_column"]["column_name"]
                        if column == "NULL" else column)

        ruamel.dump(
            table_config_yaml,
            open(self.table_folder / "table_config.yaml",
                 "w",
                 encoding="utf-8"),
        )

        # regenerate publish.sql
        self._make_publish_sql()

    def init(
        self,
        data_sample_path=None,
        if_folder_exists="raise",
        if_table_config_exists="raise",
        source_format="csv",
        columns_config_url=None,
    ):
        """Initialize table folder at metadata_path at `metadata_path/<dataset_id>/<table_id>`.

        The folder should contain:

        * `table_config.yaml`
        * `publish.sql`

        You can also point to a sample of the data to auto complete column names.

        Args:
            data_sample_path (str, pathlib.PosixPath): Optional.
                Data sample path to auto complete column names.
                It supports Comma Delimited CSV.
            if_folder_exists (str): Optional.
                What to do if table folder exists

                * 'raise' : Raises FileExistsError
                * 'replace' : Replace folder
                * 'pass' : Do nothing
            if_table_config_exists (str): Optional
                What to do if table_config.yaml and publish.sql exists

                * 'raise' : Raises FileExistsError
                * 'replace' : Replace files with blank template
                * 'pass' : Do nothing
            source_format (str): Optional
                Data source format. Only 'csv' is supported. Defaults to 'csv'.

            columns_config_url (str): google sheets URL.
                The URL must be in the format https://docs.google.com/spreadsheets/d/<table_key>/edit#gid=<table_gid>.
                The sheet must contain the column name: "coluna" and column description: "descricao"

        Raises:
            FileExistsError: If folder exists and replace is False.
            NotImplementedError: If data sample is not in supported type or format.
        """
        if not self.dataset_folder.exists():

            raise FileExistsError(
                f"Dataset folder {self.dataset_folder} folder does not exists. "
                "Create a dataset before adding tables.")

        try:
            self.table_folder.mkdir(exist_ok=(if_folder_exists == "replace"))
        except FileExistsError:
            if if_folder_exists == "raise":
                raise FileExistsError(
                    f"Table folder already exists for {self.table_id}. ")
            elif if_folder_exists == "pass":
                return self

        if not data_sample_path and if_table_config_exists != "pass":
            raise BaseDosDadosException(
                "You must provide a path to correctly create config files")

        partition_columns = []
        if isinstance(
                data_sample_path,
            (
                str,
                Path,
            ),
        ):
            # Check if partitioned and get data sample and partition columns
            data_sample_path = Path(data_sample_path)

            if data_sample_path.is_dir():

                data_sample_path = [
                    f for f in data_sample_path.glob("**/*")
                    if f.is_file() and f.suffix == ".csv"
                ][0]

                partition_columns = [
                    k.split("=")[0]
                    for k in data_sample_path.as_posix().split("/") if "=" in k
                ]

            columns = Datatype(self, source_format).header(data_sample_path)

        else:

            columns = ["column_name"]

        if if_table_config_exists == "pass":
            # Check if config files exists before passing
            if (Path(self.table_folder / "table_config.yaml").is_file()
                    and Path(self.table_folder / "publish.sql").is_file()):
                pass
            # Raise if no sample to determine columns
            elif not data_sample_path:
                raise BaseDosDadosException(
                    "You must provide a path to correctly create config files")
            else:
                self._make_template(columns, partition_columns,
                                    if_table_config_exists)

        elif if_table_config_exists == "raise":

            # Check if config files already exist
            if (Path(self.table_folder / "table_config.yaml").is_file()
                    and Path(self.table_folder / "publish.sql").is_file()):

                raise FileExistsError(
                    f"table_config.yaml and publish.sql already exists at {self.table_folder}"
                )
            # if config files don't exist, create them
            else:
                self._make_template(columns, partition_columns,
                                    if_table_config_exists)

        else:
            # if_table_config_exists == "replace": recreate the config files from the blank template
            self._make_template(columns, partition_columns,
                                if_table_config_exists)

        if columns_config_url is not None:
            self.update_columns(columns_config_url)

        return self

    def create(
        self,
        path=None,
        job_config_params=None,
        force_dataset=True,
        if_table_exists="raise",
        if_storage_data_exists="raise",
        if_table_config_exists="raise",
        source_format="csv",
        columns_config_url=None,
    ):
        """Creates BigQuery table at staging dataset.

        If you add a path, it automatically saves the data in the storage,
        creates a datasets folder and BigQuery location, besides creating the
        table and its configuration files.

        The new table should be located at `<dataset_id>_staging.<table_id>` in BigQuery.

        It looks for data saved in Storage at `<bucket_name>/staging/<dataset_id>/<table_id>/*`
        and builds the table.

        It currently supports the types:

        - Comma Delimited CSV

        Data can also be partitioned following the hive partitioning scheme
        `<key1>=<value1>/<key2>=<value2>` - for instance,
        `year=2012/country=BR`. The partition is automatically detected
        by searching for `partitions` on the `table_config.yaml`.

        Args:
            path (str or pathlib.PosixPath): Where to find the file that you want to upload to create a table with
            job_config_params (dict): Optional.
                Job configuration params from bigquery
            if_table_exists (str): Optional
                What to do if table exists

                * 'raise' : Raises Conflict exception
                * 'replace' : Replace table
                * 'pass' : Do nothing
            force_dataset (bool): Creates `<dataset_id>` folder and BigQuery Dataset if it doesn't exist.
            if_table_config_exists (str): Optional.
                What to do if config files already exist

                 * 'raise' : Raises FileExistsError
                 * 'replace' : Replace with blank template
                 * 'pass' : Do nothing
            if_storage_data_exists (str): Optional.
                What to do if data already exists on your bucket:

                * 'raise' : Raises Conflict exception
                * 'replace' : Replace table
                * 'pass' : Do nothing
            source_format (str): Optional
                Data source format. Only 'csv' is supported. Defaults to 'csv'.

            columns_config_url (str): google sheets URL.
                The URL must be in the format https://docs.google.com/spreadsheets/d/<table_key>/edit#gid=<table_gid>.
                The sheet must contain at least the columns 'nome' (column name) and 'descricao' (column description)

        """

        if path is None:

            # Check if table data already exists in Storage
            data = self.client["storage_staging"].list_blobs(
                self.bucket_name,
                prefix=f"staging/{self.dataset_id}/{self.table_id}")

            # Raise: Cannot create table without external data
            if not data:
                raise BaseDosDadosException(
                    "You must provide a path for uploading data")

        # Add data to storage
        if isinstance(
                path,
            (
                str,
                Path,
            ),
        ):

            Storage(self.dataset_id, self.table_id,
                    **self.main_vars).upload(path,
                                             mode="staging",
                                             if_exists=if_storage_data_exists)

        # Create Dataset if it doesn't exist
        if force_dataset:

            dataset_obj = Dataset(self.dataset_id, **self.main_vars)

            try:
                dataset_obj.init()
            except FileExistsError:
                pass

            dataset_obj.create(if_exists="pass")

        self.init(
            data_sample_path=path,
            if_folder_exists="replace",
            if_table_config_exists=if_table_config_exists,
            columns_config_url=columns_config_url,
        )

        table = bigquery.Table(self.table_full_name["staging"])

        table.external_data_configuration = Datatype(
            self, source_format, "staging",
            partitioned=self._is_partitioned()).external_config

        # Check if the table already exists
        table_ref = None
        try:
            table_ref = self.client["bigquery_staging"].get_table(
                self.table_full_name["staging"])

        except google.api_core.exceptions.NotFound:
            pass

        if isinstance(table_ref, google.cloud.bigquery.table.Table):

            if if_table_exists == "pass":

                return None

            elif if_table_exists == "raise":

                raise FileExistsError(
                    "Table already exists, choose replace if you want to overwrite it"
                )

        if if_table_exists == "replace":

            self.delete(mode="staging")

        self.client["bigquery_staging"].create_table(table)

    def update(self, mode="all", not_found_ok=True):
        """Updates BigQuery schema and description.

        Args:
            mode (str): Optional.
                Which table to update [prod|staging|all]
            not_found_ok (bool): Optional.
                What to do if table is not found
        """

        self._check_mode(mode)

        mode = ["prod", "staging"] if mode == "all" else [mode]
        for m in mode:

            try:
                table = self._get_table_obj(m)
            except google.api_core.exceptions.NotFound:
                continue

            # if m == "staging":

            table.description = self._render_template(
                Path("table/table_description.txt"), self.table_config)

            # save table description
            open(
                self.metadata_path / self.dataset_id / self.table_id /
                "table_description.txt",
                "w",
                encoding="utf-8",
            ).write(table.description)

            if m == "prod":
                table.schema = self._load_schema(m)

                self.client[f"bigquery_{m}"].update_table(
                    table, fields=["description", "schema"])

    def publish(self, if_exists="raise"):
        """Creates BigQuery table at production dataset.

        Table should be located at `<dataset_id>.<table_id>`.

        It creates a view that uses the query from
        `<metadata_path>/<dataset_id>/<table_id>/publish.sql`.

        Make sure that all columns from the query also exists at
        `<metadata_path>/<dataset_id>/<table_id>/table_config.yaml`, including
        the partitions.

        Args:
            if_exists (str): Optional.
                What to do if table exists.

                * 'raise' : Raises Conflict exception
                * 'replace' : Replace table
                * 'pass' : Do nothing

        Todo:

            * Check if all required fields are filled
        """

        if if_exists == "replace":
            self.delete(mode="prod")

        self.client["bigquery_prod"].query(
            (self.table_folder / "publish.sql").open(
                "r", encoding="utf-8").read()).result()

        self.update("prod")

    def delete(self, mode):
        """Deletes table in BigQuery.

        Args:
            mode (str): Which table to delete [prod|staging|all]
        """

        self._check_mode(mode)

        if mode == "all":
            for m, n in self.table_full_name[mode].items():
                self.client[f"bigquery_{m}"].delete_table(n, not_found_ok=True)
        else:
            self.client[f"bigquery_{mode}"].delete_table(
                self.table_full_name[mode], not_found_ok=True)

    def append(self,
               filepath,
               partitions=None,
               if_exists="replace",
               **upload_args):
        """Appends new data to existing BigQuery table.

        As long as the data has the same schema, it appends the data in the
        filepath to the existing table.

        Args:
            filepath (str or pathlib.PosixPath): Where to find the file that you want to upload to create a table with
            partitions (str, pathlib.PosixPath, dict): Optional.
                Hive structured partition as a string or dict

                * str : `<key>=<value>/<key2>=<value2>`
                * dict: `dict(key=value, key2=value2)`
            if_exists (str): Optional.
                What to do if data with same name exists in storage

                * 'raise' : Raises Conflict exception
                * 'replace' : Replace table
                * 'pass' : Do nothing
        """
        if not self.table_exists("staging"):
            raise BaseDosDadosException(
                "You cannot append to a table that does not exist")
        else:
            Storage(self.dataset_id, self.table_id, **self.main_vars).upload(
                filepath,
                mode="staging",
                partitions=partitions,
                if_exists=if_exists,
                **upload_args,
            )
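Finally, a minimal end-to-end sketch tying the methods above together (not from the original source; the paths and ids are hypothetical placeholders and `Table` is assumed to be importable from `basedosdados`):

from basedosdados import Table

tb = Table(dataset_id="some_dataset", table_id="some_table")

# upload a local CSV to the staging bucket, scaffold table_config.yaml and
# publish.sql, and create the external table at <dataset_id>_staging.<table_id>
tb.create(
    path="/tmp/some_table.csv",
    if_table_exists="replace",
    if_storage_data_exists="replace",
    if_table_config_exists="replace",
)

# create the prod view defined in publish.sql
tb.publish(if_exists="replace")

# later, append more data with the same schema
tb.append(filepath="/tmp/more_rows.csv", if_exists="replace")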