Example 1
    def create_schema(self, check_exists=False):
        super().create_schema(check_exists)
        if check_exists and self.does_table_exist():
            return
        if "init_query" in self.config:
            qstring = read_string("sql/{}.sql".format(
                self.config["init_query"]))
            qparams = self.get_query_params(self.date)
            qstring = qstring.format(**qparams)
            self.run_query(self.date, qstring)
        else:
            # Run an empty query (LIMIT 0) so the destination schema is
            # created without scanning any data.
            start_date = "1970-01-01"
            qstring = read_string("sql/{}.sql".format(self.config["query"]))
            qparams = self.get_query_params(start_date)
            qstring = qstring.format(**qparams)
            LIMIT_REGEX = r"(.*)(LIMIT\s+[0-9]+)(.*)"
            if re.match(LIMIT_REGEX, qstring, re.IGNORECASE):
                # re.sub returns a new string, so assign the result back;
                # the original dropped it, leaving qstring unchanged.
                qstring = re.sub(LIMIT_REGEX,
                                 r"\1 LIMIT 0 \3",
                                 qstring,
                                 flags=re.IGNORECASE)
            else:
                qstring += " LIMIT 0"
            self.run_query(start_date, qstring)
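
The LIMIT rewrite is easy to check in isolation. A minimal sketch with a hypothetical helper name (force_limit_zero is not part of the original code):

import re

LIMIT_REGEX = r"(.*)(LIMIT\s+[0-9]+)(.*)"

def force_limit_zero(qstring):
    # Hypothetical helper: rewrite any existing LIMIT clause to LIMIT 0,
    # or append one, so the query defines a schema but returns no rows.
    if re.match(LIMIT_REGEX, qstring, re.IGNORECASE):
        return re.sub(LIMIT_REGEX, r"\1 LIMIT 0 \3", qstring,
                      flags=re.IGNORECASE)
    return qstring + " LIMIT 0"

print(force_limit_zero("SELECT * FROM t LIMIT 100"))  # ... LIMIT 0 ...
print(force_limit_zero("SELECT * FROM t"))            # ... LIMIT 0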
Example 2
def build_query(config: Dict[str, Any], start_date: str, end_date: str) -> str:
    """Build query based on configs and args.

    :rtype: str
    :param config: the config of the query
    :param start_date: the start date string for the query
    :param end_date: the end date string for the query
    :return: the composed query string
    """
    query = ""
    if "udf" in config:
        for udf in config["udf"]:
            query += read_string("udf/{}.sql".format(udf))
    if "udf_js" in config:
        for udf_js in config["udf_js"]:
            query += read_string("udf_js/{}.sql".format(udf_js))
    if "query" in config:
        query += read_string("sql/{}.sql".format(config["query"])).format(
            project=config["project"],
            dataset=config["dataset"],
            table=config["table"],
            start_date=start_date,
            end_date=end_date,
        )
    return query
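
For reference, a hypothetical call might look like this (the config values and the sql/udf file names are illustrative, not from the source):

# Illustrative config; assumes sql/daily_events.sql exists and contains
# {project}, {dataset}, {table}, {start_date} and {end_date} placeholders.
config = {
    "project": "my-project",
    "dataset": "analytics",
    "table": "daily_events",
    "udf": ["mode_last"],       # prepends udf/mode_last.sql
    "query": "daily_events",    # appends sql/daily_events.sql
}
query = build_query(config, "2020-01-01", "2020-01-02")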
Example 3
    def daily_cleanup(self, d):
        if self.is_write_append():
            if "cleanup_query" in self.config:
                qstring = read_string("sql/{}.sql".format(self.config["cleanup_query"]))
                qparams = self.get_query_params(d)
                query_job = self.client.query(qstring.format(**qparams))
                query_job.result()
                log.info("Custom cleanup done.")
            elif "execution_date_field" in self.config["params"]:
                qstring = read_string("sql/cleanup_generic.sql")
                qparams = self.get_query_params(d)
                query_job = self.client.query(qstring.format(**qparams))
                query_job.result()
                log.info("Generic cleanup done.")
Example 4
    def run_query(self, date, qstring=None):
        if qstring is None:
            qstring = read_string("sql/{}.sql".format(self.config["query"]))
            qparams = self.get_query_params(date)
            qstring = qstring.format(**qparams)
        table_ref = self.client.dataset(
            self.config["params"]["dataset"]).table(
                self.config["params"]["dest"])
        job_config = bigquery.QueryJobConfig()
        job_config.write_disposition = (
            bigquery.WriteDisposition.WRITE_APPEND if self.is_write_append()
            else bigquery.WriteDisposition.WRITE_TRUNCATE)
        if ("allow_field_addition" in self.config
                and self.config["allow_field_addition"]):
            job_config.schema_update_options = [
                bigquery.SchemaUpdateOption.ALLOW_FIELD_ADDITION
            ]
        job_config.destination = table_ref
        if "partition_field" in self.config:
            job_config.time_partitioning = bigquery.TimePartitioning(
                type_=bigquery.TimePartitioningType.DAY,
                field=self.config["partition_field"],
            )

        query = self.client.query(qstring, job_config=job_config)
        query.result()

        if "create_view_alt" in self.config and self.config["create_view_alt"]:
            self.create_view("_view")
Example 5
def test_read_string():
    STR = "test1"
    fname = _create_temp_file()
    with open(fname, "w") as f:
        f.write(STR)
    data = read_string(fname)
    assert data == STR
    os.remove(fname)
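
The test implies that read_string simply returns a file's contents as a string. A minimal sketch of such a helper (the real implementation is not shown in the source):

def read_string(fname):
    # Minimal sketch consistent with the test above; the real helper
    # (not shown in the source) may differ, e.g. in encoding handling.
    with open(fname, "r") as f:
        return f.read()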
Example 6
    def create_schema(self, check_exists=False):
        udfs = []
        if "udf" in self.config:
            udfs += [("udf_%s" % x, read_string("udf/{}.sql".format(x)))
                     for x in self.config["udf"]]
        if "udf_js" in self.config:
            udfs += [("udf_js_%s" % x, read_string("udf_js/{}.sql".format(x)))
                     for x in self.config["udf_js"]]
        for udf, qstring in udfs:
            if check_exists and self.does_routine_exist(udf):
                continue
            qstring = qstring % (
                self.config["params"]["project"],
                self.config["params"]["dataset"],
            )
            # Issue the DDL query to create the routine and wait for it
            # to complete.
            query_job = self.client.query(qstring)
            query_job.result()
            log.info("Created routine {}".format(query_job.ddl_target_routine))
Example 7
    def create_view(self, postfix=None):
        qstring = read_string("sql/{}.sql".format(self.config["query"]))
        shared_dataset_ref = self.client.dataset(
            self.config["params"]["dataset"])
        view_ref = shared_dataset_ref.table(self.config["params"]["dest"] +
                                            (postfix if postfix else ""))
        view = bigquery.Table(view_ref)
        qparams = self.get_query_params(self.date)
        view.view_query = qstring.format(**qparams)
        if self.does_table_exist(postfix):
            view = self.client.update_table(view, ["view_query"])  # API request
        else:
            view = self.client.create_table(view)  # API request
        log.info("Successfully created view at {}".format(view.full_table_id))
Example 8
    def create_schema(self, check_exists=False):
        super().create_schema(check_exists)
        if check_exists and self.does_table_exist():
            return
        qstring = read_string("sql/{}.sql".format(self.config["query"]))
        shared_dataset_ref = self.client.dataset(self.config["params"]["dataset"])
        view_ref = shared_dataset_ref.table(self.config["params"]["dest"])
        view = bigquery.Table(view_ref)
        qparams = self.get_query_params(self.date)
        view.view_query = qstring.format(**qparams)
        if self.does_table_exist():
            view = self.client.update_table(view, ["view_query"])  # API request
        else:
            view = self.client.create_table(view)  # API request

        log.info("Successfully created view at {}".format(view.full_table_id))
Example 9
    def run_query(self, date, qstring=None):
        if qstring is None:
            qstring = read_string("sql/{}.sql".format(self.config["query"]))
            qparams = self.get_query_params(date)
            qstring = qstring.format(**qparams)
        table_ref = self.client.dataset(self.config["params"]["dataset"]).table(
            self.config["params"]["dest"]
        )
        job_config = bigquery.QueryJobConfig()
        job_config.write_disposition = (
            bigquery.WriteDisposition.WRITE_APPEND
            if self.is_write_append()
            else bigquery.WriteDisposition.WRITE_TRUNCATE
        )
        job_config.destination = table_ref
        if "partition_field" in self.config:
            job_config.time_partitioning = bigquery.TimePartitioning(
                type_=bigquery.TimePartitioningType.DAY,
                field=self.config["partition_field"],
            )

        query = self.client.query(qstring, job_config=job_config)
        query.result()
Example 10
    def extract_via_fs(
        self,
        source: str,
        config: Dict[str, Any],
        stage: str = "raw",
        date: Optional[datetime.datetime] = None,
    ) -> DataFrame:
        """Extract data from file system and convert into DataFrame.

        The logic is based on task config, see `configs/*.py`

        :rtype: DataFrame
        :param source: name of the data source to be extracted,
            specified in task config, see `configs/*.py`
        :param config: config of the data source to be extracted,
            specified in task config, see `configs/*.py`
        :param stage: the stage of the loaded data, could be raw/staging/production.
        :param date: the date part of the data file name,
            will use `self.current_date` if not specified
        :return: the extracted DataFrame
        """
        # extract paged raw files
        if "paths" in config:
            fpaths = config["paths"]
        elif stage == "raw":
            fpaths = self.get_filepaths(source, config, stage, "fs", date)
        else:
            fpaths = [self.get_filepath(source, config, stage, "fs", date)]
        if "iterator" in config:
            extracted = None if "iterator" not in config else dict()
            for fpath in fpaths:
                raw = read_string(fpath)
                it = get_file_ext(fpath)
                self.raw[it] = raw
                extracted[it] = convert_df(raw, config)
            log.info("%s-%s-%s/%s x %d iterators extracted from file system" %
                     (
                         stage,
                         self.task,
                         source,
                         (self.current_date if date is None else date).date(),
                         len(fpaths),
                     ))
        else:
            extracted = None
            for fpath in fpaths:
                raw = read_string(fpath)
                if extracted is None:
                    self.raw[source] = [raw]
                    extracted = convert_df(raw, config)
                else:
                    self.raw[source] += [raw]
                    # DataFrame.append was removed in pandas 2.0; use concat
                    # instead (assumes `import pandas` at module level).
                    extracted = pandas.concat(
                        [extracted, convert_df(raw, config)])
            extracted = extracted.reset_index(drop=True)
            log.info("%s-%s-%s/%s x %d pages extracted from file system" % (
                stage,
                self.task,
                source,
                (self.current_date if date is None else date).date(),
                len(fpaths),
            ))
        return extracted
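
A sketch of the config variants this method distinguishes; the key names come from the code above, the values are illustrative:

# Key names come from extract_via_fs() above; values are illustrative.
explicit_paths_config = {
    "paths": ["raw/task/source.1.json", "raw/task/source.2.json"],
}
iterator_config = {
    "iterator": True,  # only membership is checked; each file becomes
                       # a DataFrame keyed by its file extension
}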