Example #1
    def execute(self, *args):
        super().execute()

        valid = EssentialParameters(
            self.__class__.__name__,
            [self._collection, self._src_dir, self._src_pattern],
        )
        valid()

        files = super().get_target_files(self._src_dir, self._src_pattern)
        if len(files) == 0:
            raise FileNotFound("No files are found.")

        if isinstance(self._credentials, str):
            self._logger.warning((
                "DeprecationWarning: "
                "In the near future, "
                "the `credentials` will be changed to accept only dictionary types. "
                "Please see more information "
                "https://github.com/BrainPad/cliboa/blob/master/docs/modules/firestore_document_create.md"  # noqa
            ))
            key_filepath = self._credentials
        else:
            key_filepath = self._source_path_reader(self._credentials)
        firestore_client = Firestore.get_firestore_client(key_filepath)

        for file in files:
            with open(file) as f:
                fname = os.path.splitext(os.path.basename(file))[0]
                doc = firestore_client.collection(
                    self._collection).document(fname)
                doc.set(json.load(f))
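Each matched file becomes one Firestore document, and the document ID is the file name with its extension stripped. A quick standalone check of that derivation (the path below is made up):

import os

file = "/tmp/data/user_123.json"
fname = os.path.splitext(os.path.basename(file))[0]
print(fname)  # -> user_123, used as the Firestore document ID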
Example #2
    def execute(self, *args):
        valid = EssentialParameters(
            self.__class__.__name__,
            [self._src_dir, self._src_pattern, self._dest_dir],
        )
        valid()

        if not self._columns and not self._column_numbers:
            raise InvalidParameter(
                "Specifying either 'columns' or 'column_numbers' is essential.")
        if self._columns and self._column_numbers:
            raise InvalidParameter(
                "Cannot specify both 'columns' and 'column_numbers'.")

        files = super().get_target_files(self._src_dir, self._src_pattern)
        if len(files) == 0:
            raise FileNotFound("The specified csv file not found.")

        for f in files:
            _, filename = os.path.split(f)
            dest_path = os.path.join(self._dest_dir, filename)
            if self._columns:
                Csv.extract_columns_with_names(f, dest_path, self._columns)
            elif self._column_numbers:
                if isinstance(self._column_numbers, int):
                    remain_column_numbers = [self._column_numbers]
                else:
                    remain_column_numbers = [
                        int(n) for n in self._column_numbers.split(",")
                    ]
                Csv.extract_columns_with_numbers(f, dest_path,
                                                 remain_column_numbers)
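The `column_numbers` branch accepts either a single int or a comma-separated string such as "1,3,5". A minimal sketch of that normalization; the function name is illustrative, not part of cliboa:

def normalize_column_numbers(column_numbers):
    # A bare int selects one column; a string like "1,3,5" selects several.
    if isinstance(column_numbers, int):
        return [column_numbers]
    return [int(n) for n in column_numbers.split(",")]

print(normalize_column_numbers(2))        # -> [2]
print(normalize_column_numbers("1,3,5"))  # -> [1, 3, 5]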
Example #3
    def execute(self, *args):
        input_valid = IOInput(self._io)
        input_valid()

        files = glob(self._src_path)
        if len(files) > 1:
            raise CliboaException("Only one input file is allowed.")

        if len(files) == 0:
            raise FileNotFound("The specified csv file not found.")

        with open(files[0], "r", encoding=self._encoding) as f:

            # save only the specified columns of each row
            if self._columns:
                reader = csv.DictReader(f, delimiter=",")
                for row in reader:
                    # extract only the specified columns
                    row_dict = {}
                    for c in self._columns:
                        value = row.get(c)
                        if not value:
                            continue
                        row_dict[c] = value
                    self._s.save(row_dict)
            else:
                reader = csv.reader(f)
                header = next(reader, None)
                for row in reader:
                    row_dict = dict(zip(header, row))
                    self._s.save(row_dict)

        # cache downloaded file names
        ObjectStore.put(self._step, files)
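In the no-columns branch, each data row is zipped with the header row to build the dict that gets saved. In isolation:

header = ["id", "name"]
row = ["1", "alice"]
print(dict(zip(header, row)))  # -> {'id': '1', 'name': 'alice'}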
Example #4
    def execute(self, *args):
        for k, v in self.__dict__.items():
            self._logger.info("%s : %s" % (k, v))
        super().execute()

        valid = EssentialParameters(self.__class__.__name__,
                                    [self._src_dir, self._src_pattern])
        valid()

        session = None
        if self._access_key and self._secret_key:
            session = Session(self._access_key, self._secret_key, self._region)

        s3 = session.resource("s3") if session else boto3.resource("s3")
        bucket = s3.Bucket(self._bucket)
        files = super().get_target_files(self._src_dir, self._src_pattern)
        if len(files) == 0:
            raise FileNotFound(
                "No files matching the specified pattern %s were found."
                % os.path.join(self._src_dir, self._src_pattern))
        for f in files:
            bucket.upload_file(
                Key=os.path.join(self._key, os.path.basename(f)),
                Filename=f)
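The upload key is the configured key prefix joined with each file's base name. Note that os.path.join uses the host OS separator, so this sketch assumes a POSIX host (S3 keys conventionally use "/"):

import os

key = os.path.join("exports/daily", os.path.basename("/tmp/out/report.csv"))
print(key)  # -> exports/daily/report.csv on POSIX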
Example #5
    def execute(self, *args):
        # essential parameters check
        valid = EssentialParameters(
            self.__class__.__name__,
            [self._src_dir, self._dest_dir, self._dest_pattern],
        )
        valid()

        if self._dest_pattern:
            self._logger.warning(
                "'dest_pattern' will be unavailable in the near future. "
                "'dest_pattern' will change to 'dest_name'."
            )

        if not self._src_pattern and not self._src_filenames:
            raise InvalidParameter(
                "Specifying either 'src_pattern' or 'src_filenames' is essential."
            )
        if self._src_pattern and self._src_filenames:
            raise InvalidParameter(
                "Cannot specify both 'src_pattern' and 'src_filenames'."
            )

        if self._src_pattern:
            files = File().get_target_files(self._src_dir, self._src_pattern)
        else:
            files = []
            for file in self._src_filenames:
                files.append(os.path.join(self._src_dir, file))

        if len(files) == 0:
            raise FileNotFound("No files are found.")
        elif len(files) == 1:
            self._logger.warning("Two or more input files are required.")

        file = files.pop(0)
        df1 = pandas.read_csv(
            file,
            dtype=str,
            encoding=self._encoding,
        )

        for file in files:
            df2 = pandas.read_csv(
                file,
                dtype=str,
                encoding=self._encoding,
            )
            df1 = pandas.concat([df1, df2])

        df1.to_csv(
            os.path.join(self._dest_dir, self._dest_pattern),
            encoding=self._encoding,
            index=False,
        )
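Concatenating inside the loop re-copies the accumulated frame on every iteration. Collecting the frames first and concatenating once is the usual pandas idiom; a sketch of that alternative, not a claim about cliboa's implementation:

import pandas

def concat_csv_files(files, encoding="utf-8"):
    # Read every file, then concatenate a single time at the end.
    frames = [pandas.read_csv(f, dtype=str, encoding=encoding) for f in files]
    return pandas.concat(frames, ignore_index=True)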
Example #6
    def _property_path_reader(self, src, encoding="utf-8"):
        """
        Returns the resource contents read from the path if src starts
        with "path:"; otherwise returns src as-is.
        """
        if src[:5].upper() == "PATH:":
            fpath = src[5:]
            if not os.path.exists(fpath):
                raise FileNotFound(src)
            with open(fpath, mode="r", encoding=encoding) as f:
                return f.read()
        return src
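The "path:" prefix check is case-insensitive, and everything after the prefix is treated as a file path. A self-contained restatement of the contract; the free function and the built-in FileNotFoundError stand in for the method and cliboa's FileNotFound:

import os
import tempfile

def property_path_reader(src, encoding="utf-8"):
    # "path:..." (any case) means "read and return the file contents";
    # anything else is returned unchanged.
    if src[:5].upper() == "PATH:":
        fpath = src[5:]
        if not os.path.exists(fpath):
            raise FileNotFoundError(src)
        with open(fpath, mode="r", encoding=encoding) as f:
            return f.read()
    return src

with tempfile.NamedTemporaryFile(mode="w", delete=False) as fp:
    fp.write("secret-token")
print(property_path_reader("path:" + fp.name))  # -> secret-token
print(property_path_reader("inline-value"))     # -> inline-value
os.remove(fp.name)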
Example #7
    def _property_path_reader(self, src, encoding="utf-8"):
        """
        Returns the resource contents read from the path if src starts
        with "path:"; otherwise returns src as-is.
        """
        self._logger.warning(
            "DeprecationWarning: Will be removed in the near future")
        if src[:5].upper() == "PATH:":
            fpath = src[5:]
            if not os.path.exists(fpath):
                raise FileNotFound(src)
            with open(fpath, mode="r", encoding=encoding) as f:
                return f.read()
        return src
Example #8
    def _source_path_reader(self, src, encoding="utf-8"):
        """
        Returns a path to a temporary file containing the content specified
        in src if src is a dict; otherwise returns src as-is.
        """
        if src is None:
            return src
        if isinstance(src, dict) and "content" in src:
            with tempfile.NamedTemporaryFile(mode="w",
                                             encoding=encoding,
                                             delete=False) as fp:
                fp.write(src["content"])
                return fp.name
        elif isinstance(src, dict) and "file" in src:
            if not os.path.exists(src["file"]):
                raise FileNotFound(src)
            return src["file"]
        else:
            raise InvalidParameter("The parameter is invalid.")
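The dict forms are the interesting part: {"content": ...} is materialized into a temporary file, while {"file": ...} is validated and passed through. A standalone sketch, with built-in exceptions standing in for cliboa's FileNotFound and InvalidParameter:

import os
import tempfile

def source_path_reader(src, encoding="utf-8"):
    if src is None:
        return src
    if isinstance(src, dict) and "content" in src:
        # Inline content: write it to a temp file and hand back its path.
        with tempfile.NamedTemporaryFile(mode="w", encoding=encoding,
                                         delete=False) as fp:
            fp.write(src["content"])
            return fp.name
    elif isinstance(src, dict) and "file" in src:
        if not os.path.exists(src["file"]):
            raise FileNotFoundError(src)
        return src["file"]
    raise ValueError("The parameter is invalid.")

path = source_path_reader({"content": '{"type": "service_account"}'})
print(open(path).read())  # -> {"type": "service_account"}
os.remove(path)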
Example #9
    def execute(self, *args):
        for k, v in self.__dict__.items():
            self._logger.info("%s : %s" % (k, v))

        BaseBigQuery.execute(self)
        FileRead.execute(self)

        param_valid = EssentialParameters(self.__class__.__name__,
                                          [self.__table_schema])
        param_valid()

        files = super().get_target_files(self._src_dir, self._src_pattern)
        if len(files) > 1:
            raise InvalidFileCount("Only one input file is allowed.")
        if len(files) == 0:
            raise FileNotFound("The specified csv file was not found.")

        insert_rows = []
        is_inserted = False
        # initial write disposition for if_exists
        if_exists = self.REPLACE if self.__replace else self.APPEND
        self.__columns = [
            name_and_type["name"] for name_and_type in self.__table_schema
        ]
        with open(files[0], "r", encoding=self._encoding) as f:
            reader = csv.DictReader(f, delimiter=",")
            for r in reader:
                # extract only the specified columns
                row_dict = {}
                for c in self.__columns:
                    value = r.get(c)
                    if not value:
                        continue
                    row_dict[c] = value
                insert_rows.append(row_dict)

                if len(insert_rows) == self.BULK_LINE_CNT:
                    self.__exec_insert(insert_rows, is_inserted, if_exists)
                    insert_rows.clear()
                    is_inserted = True
            if len(insert_rows) > 0:
                self.__exec_insert(insert_rows, is_inserted, if_exists)
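The insert loop is a standard batch-and-flush pattern: buffer rows up to a fixed count, flush, and flush the remainder after the loop ends. The same shape in isolation, with an illustrative batch size and callback:

BATCH_SIZE = 3  # illustrative; the class above uses its own BULK_LINE_CNT

def batch_insert(rows, flush):
    batch = []
    for row in rows:
        batch.append(row)
        if len(batch) == BATCH_SIZE:
            flush(list(batch))
            batch.clear()
    if batch:  # remainder smaller than one full batch
        flush(list(batch))

batch_insert(range(7), print)  # -> [0, 1, 2] then [3, 4, 5] then [6]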
Example #10
    def execute(self, *args):
        super().execute()

        valid = EssentialParameters(
            self.__class__.__name__,
            [self._collection, self._src_dir, self._src_pattern],
        )
        valid()

        files = super().get_target_files(self._src_dir, self._src_pattern)
        if len(files) == 0:
            raise FileNotFound("No files are found.")

        firestore_client = Firestore.get_firestore_client(self._credentials)

        for file in files:
            with open(file) as f:
                fname = os.path.splitext(os.path.basename(file))[0]
                doc = firestore_client.collection(
                    self._collection).document(fname)
                doc.set(json.load(f))
Example #11
    def __call__(self, scenario_file):
        if not os.path.isfile(scenario_file):
            raise FileNotFound("scenario.yml %s does not exist" %
                               scenario_file)
Example #12
    def execute(self, *args):
        # essential parameters check
        valid = EssentialParameters(
            self.__class__.__name__,
            [self._src_dir, self._src_pattern, self._tblname])
        valid()

        files = super().get_target_files(self._src_dir, self._src_pattern)
        self._logger.info("Files found %s" % files)

        if len(files) == 0:
            raise FileNotFound("No csv file was found.")

        files.sort()

        def func():
            # Find csv columns from all csv files
            csv_columns = []
            for file in files:
                with open(file, "r", encoding=self._encoding) as f:
                    reader = csv.DictReader(f)
                    for col in reader.fieldnames:
                        csv_columns.append(col)
            csv_columns = sorted(set(csv_columns), key=csv_columns.index)
            escaped_columns = ['"%s"' % fn for fn in csv_columns]

            if self._refresh:
                # Drop the table in advance if refresh is True
                self._sqlite_adptr.execute("DROP TABLE IF EXISTS %s" %
                                           self._tblname)
                self._sqlite_adptr.commit()
                self._create_table(self._tblname, escaped_columns)
            else:
                self._create_table(self._tblname, escaped_columns)

                if self._force_insert:
                    self._alter_table(self._tblname, escaped_columns)
                else:
                    # Make sure the csv columns and the db table columns
                    # are exactly the same
                    db_columns = self._get_column_names(self._tblname)
                    if escaped_columns != db_columns:
                        raise Exception(
                            "Csv columns %s were not matched to table columns %s."
                            % (csv_columns, db_columns))

            for file in files:
                with open(file, mode="r", encoding=self._encoding) as f:
                    reader = csv.DictReader(f)

                    replace = bool(self._primary_key)

                    # Put all csv records into the table.
                    self._logger.info("Insert all csv records into table[%s]" %
                                      self._tblname)
                    params = []
                    for row in reader:
                        params.append(row)
                        if len(params) == self.COMMIT_COUNT:
                            self._sqlite_adptr.execute_many_insert(
                                self._tblname, csv_columns, params, replace)
                            self._sqlite_adptr.commit()
                            params.clear()
                    if len(params) > 0:
                        self._sqlite_adptr.execute_many_insert(
                            self._tblname, csv_columns, params, replace)
                        self._sqlite_adptr.commit()

            if self._index:
                # Create the index at the end: adding it after the bulk
                # insert performs better when the inserted data is large.
                self._logger.info("Add index")
                self._sqlite_adptr.add_index(self._tblname, self._index)
                self._sqlite_adptr.commit()

        super().execute(func)
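The sorted(set(...), key=list.index) line above is an order-preserving de-duplication: set removes duplicates and sorting by first index restores first-seen order. A tiny demonstration:

cols = ["id", "name", "id", "date", "name"]
print(sorted(set(cols), key=cols.index))  # -> ['id', 'name', 'date']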
Example #13
    def execute(self, *args):
        # essential parameters check
        valid = EssentialParameters(
            self.__class__.__name__,
            [self._src_dir, self._src_pattern, self._tblname])
        valid()

        files = super().get_target_files(self._src_dir, self._src_pattern)
        self._logger.info("Files found %s" % files)

        if len(files) > 1:
            raise Exception("Only one input file is allowed.")

        if len(files) == 0:
            raise FileNotFound("No csv file was found.")

        def func():
            if self._refresh:
                # Drop the table in advance if refresh is True
                self._sqlite_adptr.execute("DROP TABLE IF EXISTS %s" %
                                           self._tblname)
                self._sqlite_adptr.commit()

            with codecs.open(files[0], mode="r", encoding=self._encoding) as f:
                reader = csv.DictReader(f)
                # Table columns will be the same as the csv column names.
                escaped_columns = ['"%s"' % fn for fn in reader.fieldnames]

                self._logger.info("Create table [%s]" % self._tblname)
                if self._primary_key is None:
                    sql = "CREATE TABLE IF NOT EXISTS %s (%s)"
                    self._sqlite_adptr.execute(
                        sql % (self._tblname,
                               " TEXT, ".join(escaped_columns) + " TEXT"))
                else:
                    sql = "CREATE TABLE IF NOT EXISTS %s (%s, PRIMARY KEY(%s))"
                    self._sqlite_adptr.execute(sql % (
                        self._tblname,
                        " TEXT, ".join(escaped_columns) + " TEXT",
                        self._primary_key,
                    ))
                self._sqlite_adptr.commit()

                # Put all csv records into the table.
                self._logger.info("Insert all csv records into table[%s]" %
                                  self._tblname)
                params = []
                for row in reader:
                    params.append(row)
                    if len(params) == self.COMMIT_COUNT:
                        self._sqlite_adptr.execute_many_insert(
                            self._tblname, reader.fieldnames, params, False)
                        self._sqlite_adptr.commit()
                        params.clear()
                if len(params) > 0:
                    self._sqlite_adptr.execute_many_insert(
                        self._tblname, reader.fieldnames, params, False)
                    self._sqlite_adptr.commit()

            if self._index:
                # Create the index at the end: adding it after the bulk
                # insert performs better when the inserted data is large.
                self._logger.info("Add index")
                self._sqlite_adptr.add_index(self._tblname, self._index)
                self._sqlite_adptr.commit()

        super().execute(func)
Example #14
    def execute(self, *args):
        BaseBigQuery.execute(self)

        param_valid = EssentialParameters(self.__class__.__name__,
                                          [self._table_schema])
        param_valid()

        files = super().get_target_files(self._src_dir, self._src_pattern)
        if len(files) == 0:
            raise FileNotFound("The specified csv file not found.")
        self._logger.info("insert target files %s" % files)

        is_inserted = False
        # initial write disposition for if_exists
        if_exists = self._REPLACE if self._replace else self._APPEND
        self._columns = [
            name_and_type["name"] for name_and_type in self._table_schema
        ]
        for file in files:
            insert_rows = []
            with open(file, "r", encoding=self._encoding) as f:
                reader = (
                    csv.DictReader(f, delimiter=",")
                    if self._has_header
                    else csv.reader(f, delimiter=",")
                )
                if self._has_header:
                    for r in reader:
                        # extract only the specified columns
                        contents = {}
                        for c in self._columns:
                            value = r.get(c)
                            if not value:
                                continue
                            contents[c] = value
                        insert_rows.append(contents)

                        # bulk insert
                        if len(insert_rows) == self._BULK_LINE_CNT:
                            self._exec_insert(insert_rows, is_inserted,
                                              if_exists)
                            insert_rows.clear()
                            is_inserted = True
                    if len(insert_rows) > 0:
                        self._exec_insert(insert_rows, is_inserted, if_exists)
                        is_inserted = True

                else:
                    # csv headers do not exist
                    for row in reader:
                        contents = {}
                        for i, c in enumerate(self._columns):
                            contents[c] = row[i]
                        insert_rows.append(contents)

                        # bulk insert
                        if len(insert_rows) == self._BULK_LINE_CNT:
                            self._exec_insert(insert_rows, is_inserted,
                                              if_exists)
                            insert_rows.clear()
                            is_inserted = True
                    if len(insert_rows) > 0:
                        self._exec_insert(insert_rows, is_inserted, if_exists)
                        is_inserted = True
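When the csv has no header, values are mapped onto the schema columns by position. A self-contained sketch of that branch; the column names and input are made up:

import csv
import io

columns = ["id", "name"]
for row in csv.reader(io.StringIO("1,alice\n2,bob\n")):
    print({c: row[i] for i, c in enumerate(columns)})
# -> {'id': '1', 'name': 'alice'} then {'id': '2', 'name': 'bob'}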
Example #15
    def execute(self, *args):
        # essential parameters check
        valid = EssentialParameters(
            self.__class__.__name__,
            [self._src_dir, self._src_pattern, self._tblname])
        valid()

        files = super().get_target_files(self._src_dir, self._src_pattern)
        self._logger.info("Files found %s" % files)

        if len(files) == 0:
            raise FileNotFound("No csv file was found.")

        files.sort()

        def func():
            # Find csv columns from all csv files
            csv_columns = []
            for file in files:
                csv_columns.extend(Csv.get_column_names(file))
            csv_columns = sorted(set(csv_columns), key=csv_columns.index)

            if self._refresh:
                # Drop the table in advance if refresh is True
                self._sqlite_adptr.drop_table(self._tblname)
                self._sqlite_adptr.create_table(self._tblname, csv_columns,
                                                self._primary_key)
            else:
                self._sqlite_adptr.create_table(self._tblname, csv_columns,
                                                self._primary_key)

                if self._force_insert:
                    db_columns = self._sqlite_adptr.get_column_names(
                        self._tblname)
                    result = list(set(csv_columns) - set(db_columns))
                    self._sqlite_adptr.add_columns(self._tblname, result)
                else:
                    # Make sure the csv columns and the db table columns
                    # are exactly the same
                    db_columns = self._sqlite_adptr.get_column_names(
                        self._tblname)
                    if (self._sqlite_adptr.escape_columns(csv_columns)
                            != self._sqlite_adptr.escape_columns(db_columns)):
                        raise CliboaException(
                            "Csv columns %s were not matched to table columns %s."
                            % (csv_columns, db_columns))

            for file in files:
                self._sqlite_adptr.import_table(file,
                                                self._tblname,
                                                refresh=False,
                                                encoding=self._encoding)

            if self._index:
                # Create the index at the end: adding it after the bulk
                # insert performs better when the inserted data is large.
                self._sqlite_adptr.add_index(self._tblname, self._index)

        super().execute(func)
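The force_insert branch above computes the columns present in the csv but missing from the table with a set difference before calling add_columns. Note that set arithmetic does not preserve order:

csv_columns = ["id", "name", "added_at"]
db_columns = ["id", "name"]
print(list(set(csv_columns) - set(db_columns)))  # -> ['added_at'] (order unspecified)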