Code example #1
File: csv.py  Project: cocoa-maemae/cliboa
    def execute(self, *args):
        valid = EssentialParameters(
            self.__class__.__name__,
            [self._src_dir, self._src_pattern, self._dest_dir],
        )
        valid()

        if not self._columns and not self._column_numbers:
            raise InvalidParameter(
                "Specifying either 'column' or 'column_numbers' is essential.")
        if self._columns and self._column_numbers:
            raise InvalidParameter(
                "Cannot specify both 'column' and 'column_numbers'.")

        files = super().get_target_files(self._src_dir, self._src_pattern)
        if len(files) == 0:
            raise FileNotFound("The specified csv file not found.")

        for f in files:
            _, filename = os.path.split(f)
            dest_path = os.path.join(self._dest_dir, filename)
            if self._columns:
                Csv.extract_columns_with_names(f, dest_path, self._columns)
            elif self._column_numbers:
                if isinstance(self._column_numbers, int):
                    remain_column_numbers = [self._column_numbers]
                else:
                    column_numbers = self._column_numbers.split(",")
                    remain_column_numbers = [int(n) for n in column_numbers]
                Csv.extract_columns_with_numbers(f, dest_path,
                                                 remain_column_numbers)
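
The column_numbers parameter above is accepted either as a single int or as a comma-separated string. A minimal standalone sketch of that normalization (normalize_column_numbers is an illustrative name, not part of cliboa):

def normalize_column_numbers(column_numbers):
    """Return a list of ints from either an int or a comma-separated string."""
    if isinstance(column_numbers, int):
        return [column_numbers]
    # e.g. "1,3,5" -> [1, 3, 5]
    return [int(n) for n in column_numbers.split(",")]

assert normalize_column_numbers(2) == [2]
assert normalize_column_numbers("1,3,5") == [1, 3, 5]
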
Code example #2
File: csv.py  Project: BrainPad/cliboa
    def execute(self, *args):
        # essential parameters check
        valid = EssentialParameters(
            self.__class__.__name__,
            [self._src_dir, self._dest_dir, self._dest_pattern],
        )
        valid()

        if self._dest_pattern:
            self._logger.warning(
                "'dest_pattern' will be unavailable in the near future."
                + "'dest_pattern' will change to 'dest_name'."
            )

        if not self._src_pattern and not self._src_filenames:
            raise InvalidParameter(
                "Specifying either 'src_pattern' or 'src_filenames' is essential."
            )
        if self._src_pattern and self._src_filenames:
            raise InvalidParameter(
                "Cannot specify both 'src_pattern' and 'src_filenames'."
            )

        if self._src_pattern:
            files = File().get_target_files(self._src_dir, self._src_pattern)
        else:
            files = []
            for file in self._src_filenames:
                files.append(os.path.join(self._src_dir, file))

        if len(files) == 0:
            raise FileNotFound("No files are found.")
        elif len(files) == 1:
            self._logger.warning("Two or more input files are required.")

        file = files.pop(0)
        df1 = pandas.read_csv(
            file,
            dtype=str,
            encoding=self._encoding,
        )

        for file in files:
            df2 = pandas.read_csv(
                file,
                dtype=str,
                encoding=self._encoding,
            )
            df1 = pandas.concat([df1, df2])

        df1.to_csv(
            os.path.join(self._dest_dir, self._dest_pattern),
            encoding=self._encoding,
            index=False,
        )
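
The merge step reads every CSV as strings and concatenates the frames row-wise before writing a single output file. A self-contained sketch of that pattern (concat_csv and the file paths are illustrative):

import pandas

def concat_csv(src_paths, dest_path, encoding="utf-8"):
    # Read each CSV with string dtypes so values are not re-interpreted.
    frames = [pandas.read_csv(p, dtype=str, encoding=encoding) for p in src_paths]
    # Stack the frames row-wise and write one CSV without the index column.
    pandas.concat(frames).to_csv(dest_path, encoding=encoding, index=False)

concat_csv(["a.csv", "b.csv"], "merged.csv")
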
Code example #3
File: gcp.py  Project: BrainPad/cliboa
    def execute(self, *args):
        super().execute()
        if not self._key and not self._bucket:
            raise InvalidParameter("Specifying either 'key' or 'bucket' is essential.")
        if self._key and self._bucket:
            raise InvalidParameter("Cannot specify both 'key' and 'bucket'.")

        # fetch records and save to on-memory
        if self._key:
            valid = EssentialParameters(self.__class__.__name__, [self._tblname])
            valid()
            self._save_to_cache()
        elif self._bucket:
            valid = EssentialParameters(self.__class__.__name__, [self._dest_dir])
            valid()
            self._save_as_file_via_gcs()
Code example #4
File: csv.py  Project: cocoa-maemae/cliboa
    def execute(self, *args):
        # essential parameters check
        valid = EssentialParameters(
            self.__class__.__name__,
            [self._src_dir, self._dest_dir, self._dest_pattern],
        )
        valid()

        if not self._src_pattern and not self._src_filenames:
            raise InvalidParameter(
                "Specifying either 'src_pattern' or 'src_filenames' is essential."
            )
        if self._src_pattern and self._src_filenames:
            raise InvalidParameter(
                "Cannot specify both 'src_pattern' and 'src_filenames'.")

        if self._src_pattern:
            files = File().get_target_files(self._src_dir, self._src_pattern)
        else:
            files = []
            for file in self._src_filenames:
                files.append(os.path.join(self._src_dir, file))

        if len(files) < 2:
            raise InvalidCount("Two or more input files are required.")

        file = files.pop(0)
        df1 = pandas.read_csv(
            file,
            dtype=str,
            encoding=self._encoding,
        )

        for file in files:
            df2 = pandas.read_csv(
                file,
                dtype=str,
                encoding=self._encoding,
            )
            df1 = pandas.concat([df1, df2])

        df1.to_csv(
            os.path.join(self._dest_dir, self._dest_pattern),
            encoding=self._encoding,
            index=False,
        )
Code example #5
File: file.py  Project: BrainPad/cliboa
    def execute(self, *args):
        valid = EssentialParameters(
            self.__class__.__name__,
            [self._src_dir, self._src_pattern, self._dest_pattern],
        )
        valid()

        if self._dest_pattern:
            self._logger.warning(
                "'dest_pattern' will be unavailable in the near future. "
                "Basically, every class which extends FileBaseTransform will accept "
                "plural input files, and output files will keep the same names as the "
                "input files.\n"
                "At that time, if 'dest_dir' is given, transformed files will be "
                "created in the given directory.\n"
                "If not, original files will be updated by transformed files."
            )

        files = super().get_target_files(self._src_dir, self._src_pattern)
        self._logger.info("Files found %s" % files)
        if len(files) == 0:
            raise InvalidCount("No files are found.")

        dest_dir = self._dest_dir if self._dest_dir is not None else self._src_dir
        dest_path = os.path.join(dest_dir, self._dest_pattern + ".%s" % self._format)

        if self._format == "tar":
            with tarfile.open(dest_path, "w") as tar:
                for file in files:
                    arcname = (os.path.join(self._dest_pattern,
                                            os.path.basename(file))
                               if self._create_dir else os.path.basename(file))
                    tar.add(file, arcname=arcname)
        elif self._format == "zip":
            with zipfile.ZipFile(dest_path, "w") as zp:
                for file in files:
                    arcname = (os.path.join(self._dest_pattern,
                                            os.path.basename(file))
                               if self._create_dir else os.path.basename(file))
                    zp.write(file, arcname=arcname)
        else:
            raise InvalidParameter(
                "'format' must be set to one of the following: [tar, zip]")
Code example #6
    def _source_path_reader(self, src, encoding="utf-8"):
        """
        If src is a dict, returns the path to a temporary file containing the
        content specified in src; otherwise returns src as-is.
        """
        if src is None:
            return src
        if isinstance(src, dict) and "content" in src:
            with tempfile.NamedTemporaryFile(mode="w",
                                             encoding=encoding,
                                             delete=False) as fp:
                fp.write(src["content"])
                return fp.name
        elif isinstance(src, dict) and "file" in src:
            if not os.path.exists(src["file"]):
                raise FileNotFound(src)
            return src["file"]
        else:
            raise InvalidParameter("The parameter is invalid.")
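
Callers can therefore supply credentials either inline or as an existing file path. A hedged usage fragment from inside a subclass method (the values are illustrative):

        # Inline content is written to a NamedTemporaryFile and its path is returned.
        key_path = self._source_path_reader({"content": '{"type": "service_account"}'})

        # An existing file path is checked with os.path.exists and returned as-is.
        key_path = self._source_path_reader({"file": "/path/to/credentials.json"})
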
Code example #7
File: validator.py  Project: 45deg/cliboa
    def __call__(self):
        for p in self.__param_list:
            if not p:
                raise InvalidParameter(
                    "The essential parameter is not specified in %s." % self.__cls_name
                )
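
Taken together with the call sites in the earlier examples, the constructor presumably just stores the class name and the list of parameters to check. A minimal sketch consistent with that usage (an assumption for illustration, not the exact cliboa source):

class InvalidParameter(Exception):
    """Stand-in for cliboa's InvalidParameter exception."""

class EssentialParameters(object):
    def __init__(self, cls_name, param_list):
        # Name of the calling step class, used only in the error message.
        self.__cls_name = cls_name
        # Parameters that must all be truthy.
        self.__param_list = param_list

    def __call__(self):
        for p in self.__param_list:
            if not p:
                raise InvalidParameter(
                    "The essential parameter is not specified in %s." % self.__cls_name
                )

try:
    EssentialParameters("SampleStep", ["/in", "*.csv", None])()
except InvalidParameter as e:
    print(e)  # The essential parameter is not specified in SampleStep.
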
Code example #8
File: gcp.py  Project: BrainPad/cliboa
    def _save_as_file_via_gcs(self):
        self._logger.info("Save data as a file via GCS")
        os.makedirs(self._dest_dir, exist_ok=True)

        ymd_hms = datetime.now().strftime("%Y%m%d%H%M%S%f")
        path = "%s-%s" % (StringUtil().random_str(self._RANDOM_STR_LENGTH), ymd_hms,)
        prefix = "%s/%s/%s" % (self._dataset, self._tblname, path)

        if isinstance(self._credentials, str):
            self._logger.warning(
                (
                    "DeprecationWarning: "
                    "In the near future, "
                    "the `credentials` will be changed to accept only dictionary types. "
                    "Please see more information "
                    "https://github.com/BrainPad/cliboa/blob/master/docs/modules/bigquery_read.md"
                )
            )
            key_filepath = self._credentials
        else:
            key_filepath = self._source_path_reader(self._credentials)
        gbq_client = BigQuery.get_bigquery_client(key_filepath)
        if self._dataset and self._tblname:
            table_ref = gbq_client.dataset(self._dataset).table(self._tblname)
        elif self._dataset and not self._tblname:
            tmp_tbl = (
                "tmp_"
                + StringUtil().random_str(self._RANDOM_STR_LENGTH)
                + "_"
                + ymd_hms
            )
            table_ref = gbq_client.dataset(self._dataset).table(tmp_tbl)
        gcs_client = Gcs.get_gcs_client(key_filepath)
        gcs_bucket = gcs_client.bucket(self._bucket)

        # extract job config settings
        ext_job_config = BigQuery.get_extract_job_config()
        ext_job_config.compression = BigQuery.get_compression_type()
        ext = ".csv"
        if self._filename:
            _, ext = os.path.splitext(self._filename)
            support_ext = [".csv", ".json"]
            if ext not in support_ext:
                raise InvalidParameter("%s is not supported as filename." % ext)
        ext_job_config.destination_format = BigQuery.get_destination_format(ext)

        comp_format_and_ext = {"GZIP": ".gz"}
        comp_ext = comp_format_and_ext.get(str(BigQuery.get_compression_type()))
        if self._filename:
            dest_gcs = "gs://%s/%s/%s%s" % (
                self._bucket,
                prefix,
                self._filename,
                comp_ext,
            )
        else:
            dest_gcs = "gs://%s/%s/*%s%s" % (self._bucket, prefix, ext, comp_ext)

        # Execute query.
        if self._query:
            query_job_config = BigQuery.get_query_job_config()
            query_job_config.destination = table_ref
            query_job_config.write_disposition = BigQuery.get_write_disposition()
            query_job = gbq_client.query(
                self._query, location=self._location, job_config=query_job_config
            )
            query_job.result()

        # Extract to GCS
        extract_job = gbq_client.extract_table(
            table_ref, dest_gcs, job_config=ext_job_config, location=self._location
        )
        extract_job.result()

        # Download from gcs
        for blob in gcs_bucket.list_blobs(prefix=prefix):
            dest = os.path.join(self._dest_dir, os.path.basename(blob.name))
            blob.download_to_filename(dest)

        # Cleanup temporary table
        if self._query:
            gbq_client.delete_table(table_ref)

        # Cleanup temporary files
        for blob in gcs_bucket.list_blobs(prefix=prefix):
            blob.delete()
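
The destination URI built above appends the extension matching the configured compression type (.gz for GZIP) and uses a wildcard when no filename is given. A small pure-Python illustration of that logic (build_dest_gcs and the values are illustrative):

comp_format_and_ext = {"GZIP": ".gz"}

def build_dest_gcs(bucket, prefix, filename, ext, compression):
    # Mirror the URI construction used in _save_as_file_via_gcs.
    comp_ext = comp_format_and_ext.get(compression)
    if filename:
        return "gs://%s/%s/%s%s" % (bucket, prefix, filename, comp_ext)
    return "gs://%s/%s/*%s%s" % (bucket, prefix, ext, comp_ext)

print(build_dest_gcs("my-bucket", "ds/tbl/abc-20240101", None, ".csv", "GZIP"))
# gs://my-bucket/ds/tbl/abc-20240101/*.csv.gz
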
Code example #9
    def _save_as_file_via_gcs(self):
        self._logger.info("Save data as a file via GCS")
        os.makedirs(self._dest_dir, exist_ok=True)

        ymd_hms = datetime.now().strftime("%Y%m%d%H%M%S%f")
        path = "%s-%s" % (
            StringUtil().random_str(self._RANDOM_STR_LENGTH),
            ymd_hms,
        )
        prefix = "%s/%s/%s" % (self._dataset, self._tblname, path)

        gbq_client = BigQuery.get_bigquery_client(self._credentials)
        if self._dataset and self._tblname:
            table_ref = gbq_client.dataset(self._dataset).table(self._tblname)
        elif self._dataset and not self._tblname:
            tmp_tbl = ("tmp_" +
                       StringUtil().random_str(self._RANDOM_STR_LENGTH) + "_" +
                       ymd_hms)
            table_ref = gbq_client.dataset(self._dataset).table(tmp_tbl)

        gcs_client = Gcs.get_gcs_client(self._credentials)
        gcs_bucket = gcs_client.get_bucket(self._bucket)

        # extract job config settings
        ext_job_config = BigQuery.get_extract_job_config()
        ext_job_config.compression = BigQuery.get_compression_type()
        ext = ".csv"
        if self._filename:
            _, ext = os.path.splitext(self._filename)
            support_ext = [".csv", ".json"]
            if ext not in support_ext:
                raise InvalidParameter("%s is not supported as filename." %
                                       ext)
        ext_job_config.destination_format = BigQuery.get_destination_format(
            ext)

        comp_format_and_ext = {"GZIP": ".gz"}
        comp_ext = comp_format_and_ext.get(str(BigQuery.get_compression_type()))
        if self._filename:
            dest_gcs = "gs://%s/%s/%s%s" % (
                self._bucket,
                prefix,
                self._filename,
                comp_ext,
            )
        else:
            dest_gcs = "gs://%s/%s/*%s%s" % (self._bucket, prefix, ext,
                                             comp_ext)

        # Execute query.
        if self._query:
            query_job_config = BigQuery.get_query_job_config()
            query_job_config.destination = table_ref
            query_job_config.write_disposition = BigQuery.get_write_disposition()
            query_job = gbq_client.query(self._query,
                                         location=self._location,
                                         job_config=query_job_config)
            query_job.result()

        # Extract to GCS
        extract_job = gbq_client.extract_table(table_ref,
                                               dest_gcs,
                                               job_config=ext_job_config,
                                               location=self._location)
        extract_job.result()

        # Download from gcs
        for blob in gcs_bucket.list_blobs(prefix=prefix):
            dest = os.path.join(self._dest_dir, os.path.basename(blob.name))
            blob.download_to_filename(dest)

        # Cleanup temporary table
        if self._query:
            gbq_client.delete_table(table_ref)

        # Cleanup temporary files
        for blob in gcs_bucket.list_blobs(prefix=prefix):
            blob.delete()