Example #1
    def execute(self, *args):
        super().execute()

        param_valid = EssentialParameters(self.__class__.__name__,
                                          [self._table_schema])
        param_valid()

        cache_list = []
        inserts = False
        # initial if_exists
        if_exists = self.REPLACE if self._replace is True else self.APPEND
        with open(self._s.cache_file, "r", encoding="utf-8") as f:
            for l_str in f:
                l_dict = ast.literal_eval(l_str)
                cache_list.append(l_dict)
                if len(cache_list) == self.BULK_LINE_CNT:
                    df = pandas.DataFrame(
                        self.__create_insert_data(cache_list))
                    if inserts is True:
                        # if_exists after the first insert execution
                        if_exists = self.APPEND
                    dest_tbl = self._dataset + "." + self._tblname
                    self._logger.info("Start insert %s rows to %s" %
                                      (len(cache_list), dest_tbl))
                    df.to_gbq(
                        dest_tbl,
                        project_id=self._project_id,
                        if_exists=if_exists,
                        table_schema=self._table_schema,
                        location=self._location,
                        credentials=ServiceAccount.auth(self._credentials),
                    )
                    cache_list.clear()
                    inserts = True
            if len(cache_list) > 0:
                df = pandas.DataFrame(self.__create_insert_data(cache_list))
                if inserts is True:
                    # if_exists after the first insert execution
                    if_exists = self.APPEND
                dest_tbl = self._dataset + "." + self._tblname
                self._logger.info("Start insert %s rows to %s" %
                                  (len(cache_list), dest_tbl))
                df.to_gbq(
                    dest_tbl,
                    project_id=self._project_id,
                    if_exists=if_exists,
                    table_schema=self._table_schema,
                    location=self._location,
                    credentials=ServiceAccount.auth(self._credentials),
                )
        self._s.remove()
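
The pattern worth extracting here is the chunked upload: rows are buffered until BULK_LINE_CNT is reached, each full buffer is flushed with to_gbq, and if_exists is forced to "append" after the first flush so only the very first one can replace the table. Below is a minimal standalone sketch of that pattern, assuming a dict-per-line cache file; the helper name, chunk size default, and key file path are illustrative, not cliboa's API:

    import ast

    import pandas
    from google.oauth2 import service_account

    def upload_in_chunks(cache_path, dest_tbl, project_id, key_path,
                         bulk_line_cnt=10000, replace=True):
        """Flush dict-per-line rows to BigQuery in fixed-size chunks.

        Hypothetical helper distilled from the example above; only the
        first flush may replace the destination table.
        """
        credentials = service_account.Credentials.from_service_account_file(key_path)
        if_exists = "replace" if replace else "append"
        buf = []

        def flush():
            nonlocal if_exists
            pandas.DataFrame(buf).to_gbq(
                dest_tbl,
                project_id=project_id,
                if_exists=if_exists,
                credentials=credentials,
            )
            if_exists = "append"  # every flush after the first appends
            buf.clear()

        with open(cache_path, "r", encoding="utf-8") as f:
            for line in f:
                buf.append(ast.literal_eval(line))
                if len(buf) == bulk_line_cnt:
                    flush()
        if buf:  # flush the final partial chunk
            flush()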
Example #2
    def __exec_insert(self, insert_rows, is_inserted, if_exists):
        """
        Execute insert into a BigQuery table
        Args:
            insert_rows: rows to insert
            is_inserted: if the data is already inserted or not
            if_exists: replace or append
        """
        df = pandas.DataFrame(self.__format_insert_data(insert_rows))
        if is_inserted is True:
            # if_exists after the first insert execution
            if_exists = self.APPEND
        dest_tbl = self._dataset + "." + self._tblname
        self._logger.info("Start insert %s rows to %s" %
                          (len(insert_rows), dest_tbl))
        if isinstance(self._credentials, str):
            self._logger.warning((
                "DeprecationWarning: "
                "In the near future, "
                "the `credentials` will be changed to accept only dictionary types. "
            ))
            key_filepath = self._credentials
        else:
            key_filepath = self._source_path_reader(self._credentials)

        df.to_gbq(
            dest_tbl,
            project_id=self._project_id,
            if_exists=if_exists,
            table_schema=self._table_schema,
            location=self._location,
            credentials=ServiceAccount.auth(key_filepath),
        )
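
The isinstance branch above is the migration path for `credentials`: a bare string is still accepted as a key file path (hence the deprecation warning), while the newer dictionary form is resolved through `_source_path_reader`. The same branch recurs in several examples below; as a standalone helper it might look like this (the function name and logger setup are assumptions, not cliboa's API):

    import logging

    logger = logging.getLogger(__name__)

    def resolve_key_filepath(credentials, source_path_reader):
        """Hypothetical mirror of the branch above: a bare string is a
        key file path (deprecated), anything else is resolved through
        the step's _source_path_reader."""
        if isinstance(credentials, str):
            logger.warning(
                "DeprecationWarning: in the near future, `credentials` "
                "will be changed to accept only dictionary types."
            )
            return credentials
        return source_path_reader(credentials)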
Example #3
    def _save_to_cache(self):
        self._logger.info("Save data to memory")
        if isinstance(self._credentials, str):
            self._logger.warning(
                (
                    "DeprecationWarning: "
                    "In the near future, "
                    "the `credentials` will be changed to accept only dictionary types. "
                    "Please see more information "
                    "https://github.com/BrainPad/cliboa/blob/master/docs/modules/bigquery_read.md"
                )
            )
            key_filepath = self._credentials
        else:
            key_filepath = self._source_path_reader(self._credentials)
        query = ("SELECT * FROM %s.%s" % (self._dataset, self._tblname)
                 if self._query is None else self._query)
        df = pandas.read_gbq(
            query=query,
            dialect="standard",
            location=self._location,
            project_id=self._project_id,
            credentials=ServiceAccount.auth(key_filepath),
        )
        ObjectStore.put(self._key, df)
Example #4
    def _save_to_cache(self):
        self._logger.info("Save data to memory")
        query = ("SELECT * FROM %s.%s" % (self._dataset, self._tblname)
                 if self._query is None else self._query)
        df = pandas.read_gbq(
            query=query,
            dialect="standard",
            location=self._location,
            project_id=self._project_id,
            credentials=ServiceAccount.auth(self._credentials),
        )
        ObjectStore.put(self._key, df)
Example #5
    def execute(self, *args):
        super().execute()
        valid = EssentialParameters(self.__class__.__name__, [self._key])
        valid()

        df = pandas.read_gbq(
            query=self._get_query(),
            dialect="standard",
            location=self._location,
            project_id=self._project_id,
            credentials=ServiceAccount.auth(self._credentials),
        )
        ObjectStore.put(self._key, df)
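
ObjectStore is the in-memory handoff between steps in a scenario: this step puts the DataFrame under self._key, and a later step (such as the file deletion in Example #7 below) gets a value back by the same key. A minimal illustration; the key name is made up and the import path is an assumption:

    import pandas
    from cliboa.util.cache import ObjectStore  # import path is an assumption

    df = pandas.DataFrame({"a": [1, 2]})    # stand-in for a read_gbq result
    ObjectStore.put("sales_df", df)         # producer step stores the result
    df_again = ObjectStore.get("sales_df")  # a later consumer step reads it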
Example #6
    def __exec_insert(self, insert_rows, is_inserted, if_exists):
        """
        Execute insert into a BigQuery table
        Args:
            insert_rows: rows to insert
            is_inserted: if the data is already inserted or not
            if_exists: replace or append
        """
        df = pandas.DataFrame(self.__format_insert_data(insert_rows))
        if is_inserted is True:
            # if_exists after the first insert execution
            if_exists = self.APPEND
        dest_tbl = self._dataset + "." + self._tblname
        self._logger.info("Start insert %s rows to %s" %
                          (len(insert_rows), dest_tbl))
        df.to_gbq(
            dest_tbl,
            project_id=self._project_id,
            if_exists=if_exists,
            table_schema=self._table_schema,
            location=self._location,
            credentials=ServiceAccount.auth(self._credentials),
        )
Example #7
    def execute(self, *args):
        for k, v in self.__dict__.items():
            self._logger.info("%s : %s" % (k, v))
        dl_files = ObjectStore.get(self._symbol)

        if len(dl_files) > 0:
            self._logger.info("Delete files %s" % dl_files)
            c = storage.Client(
                super().get_step_argument("project_id"),
                credentials=ServiceAccount.auth(
                    super().get_step_argument("credentials")),
            )
            bucket = c.get_bucket(super().get_step_argument("bucket"))
            for blob in bucket.list_blobs(
                    prefix=super().get_step_argument("prefix"),
                    delimiter=super().get_step_argument("delimiter"),
            ):
                for dl_f in dl_files:
                    if dl_f == blob.name:
                        blob.delete()
                        break
        else:
            self._logger.info("No files to delete.")
Example #8
    def execute(self, *args):
        for k, v in self.__dict__.items():
            self._logger.info("%s : %s" % (k, v))
        super().execute()

        valid = EssentialParameters(self.__class__.__name__,
                                    [self._src_pattern])
        valid()

        c = storage.Client(self._project_id,
                           credentials=ServiceAccount.auth(self._credentials))
        bucket = c.get_bucket(self._bucket)
        dl_files = []
        r = re.compile(self._src_pattern)  # compile once, not per blob
        for blob in bucket.list_blobs(prefix=self._prefix,
                                      delimiter=self._delimiter):
            if not r.fullmatch(blob.name):
                continue
            dl_files.append(blob.name)
            blob.download_to_filename(
                os.path.join(self._dest_dir, os.path.basename(blob.name)))

        ObjectStore.put(self._step, dl_files)
Example #9
    def execute(self, *args):
        super().execute()
        valid = EssentialParameters(self.__class__.__name__, [self._key])
        valid()

        if isinstance(self._credentials, str):
            self._logger.warning(
                (
                    "DeprecationWarning: "
                    "In the near future, "
                    "the `credentials` will be changed to accept only dictionary types. "
                )
            )
            key_filepath = self._credentials
        else:
            key_filepath = self._source_path_reader(self._credentials)
        df = pandas.read_gbq(
            query=self._get_query(),
            dialect="standard",
            location=self._location,
            project_id=self._project_id,
            credentials=ServiceAccount.auth(key_filepath),
        )
        ObjectStore.put(self._key, df)
Example #10
    def execute(self, *args):
        super().execute()

        param_valid = EssentialParameters(self.__class__.__name__,
                                          [self._table_schema])
        param_valid()

        cache_list = []
        inserts = False
        # initial if_exists
        if_exists = self.REPLACE if self._replace is True else self.APPEND

        if isinstance(self._credentials, str):
            self._logger.warning((
                "DeprecationWarning: "
                "In the near future, "
                "the `credentials` will be changed to accept only dictionary types. "
            ))
            key_filepath = self._credentials
        else:
            key_filepath = self._source_path_reader(self._credentials)

        with open(self._s.cache_file, "r", encoding="utf-8") as f:
            for l_str in f:
                l_dict = ast.literal_eval(l_str)
                cache_list.append(l_dict)
                if len(cache_list) == self.BULK_LINE_CNT:
                    df = pandas.DataFrame(
                        self.__create_insert_data(cache_list))
                    if inserts is True:
                        # if_exists after the first insert execution
                        if_exists = self.APPEND
                    dest_tbl = self._dataset + "." + self._tblname
                    self._logger.info("Start insert %s rows to %s" %
                                      (len(cache_list), dest_tbl))
                    df.to_gbq(
                        dest_tbl,
                        project_id=self._project_id,
                        if_exists=if_exists,
                        table_schema=self._table_schema,
                        location=self._location,
                        credentials=ServiceAccount.auth(key_filepath),
                    )
                    cache_list.clear()
                    inserts = True
            if len(cache_list) > 0:
                df = pandas.DataFrame(self.__create_insert_data(cache_list))
                if inserts is True:
                    # if_exists after the first insert execution
                    if_exists = self.APPEND
                dest_tbl = self._dataset + "." + self._tblname
                self._logger.info("Start insert %s rows to %s" %
                                  (len(cache_list), dest_tbl))
                df.to_gbq(
                    dest_tbl,
                    project_id=self._project_id,
                    if_exists=if_exists,
                    table_schema=self._table_schema,
                    location=self._location,
                    credentials=ServiceAccount.auth(key_filepath),
                )
        self._s.remove()
Example #11
    def test_auth_no_credentials(self):
        assert ServiceAccount.auth(None) is None
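
Taken together, the examples show the contract of ServiceAccount.auth: given a key file path it returns credentials that both pandas-gbq and google-cloud-storage accept, and given None it returns None (as this test asserts), which lets the client libraries fall back to application-default credentials. A minimal standalone use, with an illustrative key path and project id; the import path is an assumption:

    import pandas
    from cliboa.util.gcp import ServiceAccount  # import path is an assumption

    credentials = ServiceAccount.auth("/path/to/service_account.json")
    # ServiceAccount.auth(None) would return None, letting read_gbq fall
    # back to application-default credentials.

    df = pandas.read_gbq(
        query="SELECT 1 AS one",
        dialect="standard",
        project_id="my-project",  # illustrative
        credentials=credentials,
    )
    print(df)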