def execute(self, *args):
    super().execute()

    valid = EssentialParameters(
        self.__class__.__name__,
        [self._collection, self._src_dir, self._src_pattern],
    )
    valid()

    files = super().get_target_files(self._src_dir, self._src_pattern)
    if len(files) == 0:
        raise FileNotFound("No files are found.")

    if isinstance(self._credentials, str):
        self._logger.warning((
            "DeprecationWarning: "
            "In the near future, "
            "the `credentials` will be changed to accept only dictionary types. "
            "Please see more information "
            "https://github.com/BrainPad/cliboa/blob/master/docs/modules/firestore_document_create.md"  # noqa
        ))
        key_filepath = self._credentials
    else:
        key_filepath = self._source_path_reader(self._credentials)
    firestore_client = Firestore.get_firestore_client(key_filepath)

    for file in files:
        with open(file) as f:
            fname = os.path.splitext(os.path.basename(file))[0]
            doc = firestore_client.collection(self._collection).document(fname)
            doc.set(json.load(f))
def execute(self, *args):
    valid = EssentialParameters(
        self.__class__.__name__,
        [self._src_dir, self._src_pattern, self._dest_dir],
    )
    valid()

    if not self._columns and not self._column_numbers:
        raise InvalidParameter(
            "Specifying either 'column' or 'column_numbers' is essential.")
    if self._columns and self._column_numbers:
        raise InvalidParameter(
            "Cannot specify both 'column' and 'column_numbers'.")

    files = super().get_target_files(self._src_dir, self._src_pattern)
    if len(files) == 0:
        raise FileNotFound("The specified csv file not found.")

    for f in files:
        _, filename = os.path.split(f)
        dest_path = os.path.join(self._dest_dir, filename)
        if self._columns:
            Csv.extract_columns_with_names(f, dest_path, self._columns)
        elif self._column_numbers:
            if isinstance(self._column_numbers, int) is True:
                remain_column_numbers = []
                remain_column_numbers.append(self._column_numbers)
            else:
                column_numbers = self._column_numbers.split(",")
                remain_column_numbers = [int(n) for n in column_numbers]
            Csv.extract_columns_with_numbers(f, dest_path, remain_column_numbers)
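# Hedged sketch (not part of the cliboa source): the 'column_numbers' parameter above
# accepts either a single int or a comma-separated string such as "1,3,5". The helper
# below reproduces that parsing in isolation; the name normalize_column_numbers is
# hypothetical and only for illustration.
def normalize_column_numbers(column_numbers):
    # A single int becomes a one-element list; a string is split on commas and each
    # piece converted to int, mirroring the branch in execute() above.
    if isinstance(column_numbers, int):
        return [column_numbers]
    return [int(n) for n in column_numbers.split(",")]


assert normalize_column_numbers(2) == [2]
assert normalize_column_numbers("1,3,5") == [1, 3, 5]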
def execute(self, *args):
    input_valid = IOInput(self._io)
    input_valid()

    files = glob(self._src_path)
    if len(files) > 1:
        raise CliboaException("Input file must be only one.")
    if len(files) == 0:
        raise FileNotFound("The specified csv file not found.")

    with open(files[0], "r", encoding=self._encoding) as f:
        # save per one column
        if self._columns:
            reader = csv.DictReader(f, delimiter=",")
            for row in reader:
                # extract only the specified columns
                row_dict = {}
                for c in self._columns:
                    if not row.get(c):
                        continue
                    row_dict[c] = row.get(c)
                self._s.save(row_dict)
        else:
            reader = csv.reader(f)
            header = next(reader, None)
            for row in reader:
                row_dict = dict(zip(header, row))
                self._s.save(row_dict)

    # cache downloaded file names
    ObjectStore.put(self._step, files)
def execute(self, *args):
    for k, v in self.__dict__.items():
        self._logger.info("%s : %s" % (k, v))
    super().execute()

    valid = EssentialParameters(self.__class__.__name__, [self._src_dir, self._src_pattern])
    valid()

    session = None
    if self._access_key and self._secret_key:
        session = Session(self._access_key, self._secret_key, self._region)
    s3 = session.resource("s3") if session else boto3.resource("s3")
    bucket = s3.Bucket(self._bucket)

    files = super().get_target_files(self._src_dir, self._src_pattern)
    if len(files) == 0:
        raise FileNotFound(
            "Files matching to the specified pattern %s is not found."
            % os.path.join(self._src_dir, self._src_pattern))
    else:
        for f in files:
            bucket.upload_file(Key=os.path.join(self._key, os.path.basename(f)), Filename=f)
def execute(self, *args):
    # essential parameters check
    valid = EssentialParameters(
        self.__class__.__name__,
        [self._src_dir, self._dest_dir, self._dest_pattern],
    )
    valid()

    if self._dest_pattern:
        self._logger.warning(
            "'dest_pattern' will be unavailable in the near future. "
            "'dest_pattern' will change to 'dest_name'."
        )

    if not self._src_pattern and not self._src_filenames:
        raise InvalidParameter(
            "Specifying either 'src_pattern' or 'src_filenames' is essential."
        )
    if self._src_pattern and self._src_filenames:
        raise InvalidParameter(
            "Cannot specify both 'src_pattern' and 'src_filenames'."
        )

    if self._src_pattern:
        files = File().get_target_files(self._src_dir, self._src_pattern)
    else:
        files = []
        for file in self._src_filenames:
            files.append(os.path.join(self._src_dir, file))

    if len(files) == 0:
        raise FileNotFound("No files are found.")
    elif len(files) == 1:
        self._logger.warning("Two or more input files are required.")

    file = files.pop(0)
    df1 = pandas.read_csv(
        file,
        dtype=str,
        encoding=self._encoding,
    )

    for file in files:
        df2 = pandas.read_csv(
            file,
            dtype=str,
            encoding=self._encoding,
        )
        df1 = pandas.concat([df1, df2])

    df1.to_csv(
        os.path.join(self._dest_dir, self._dest_pattern),
        encoding=self._encoding,
        index=False,
    )
def _property_path_reader(self, src, encoding="utf-8"):
    """
    Returns the resource contents read from the path if src starts with "path:",
    otherwise returns src as is.
    """
    if src[:5].upper() == "PATH:":
        fpath = src[5:]
        if os.path.exists(fpath) is False:
            raise FileNotFound(src)
        with open(fpath, mode="r", encoding=encoding) as f:
            return f.read()
    return src
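# Hedged sketch (illustration only, not part of cliboa): the "path:" prefix convention
# handled by _property_path_reader above, shown in isolation. The file path below is
# hypothetical.
src = "path:/etc/myapp/api_key.txt"
assert src[:5].upper() == "PATH:"          # prefixed values are read from the file
assert src[5:] == "/etc/myapp/api_key.txt"
# A value without the "path:" prefix would be returned unchanged by the helper.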
def _property_path_reader(self, src, encoding="utf-8"):
    """
    Returns the resource contents read from the path if src starts with "path:",
    otherwise returns src as is.
    """
    self._logger.warning(
        "DeprecationWarning: Will be removed in the near future")
    if src[:5].upper() == "PATH:":
        fpath = src[5:]
        if os.path.exists(fpath) is False:
            raise FileNotFound(src)
        with open(fpath, mode="r", encoding=encoding) as f:
            return f.read()
    return src
def _source_path_reader(self, src, encoding="utf-8"):
    """
    Returns the path to a temporary file holding the content given in src
    if src is a dict, otherwise returns src as is.
    """
    if src is None:
        return src
    if isinstance(src, dict) and "content" in src:
        with tempfile.NamedTemporaryFile(mode="w", encoding=encoding, delete=False) as fp:
            fp.write(src["content"])
            return fp.name
    elif isinstance(src, dict) and "file" in src:
        if os.path.exists(src["file"]) is False:
            raise FileNotFound(src)
        return src["file"]
    else:
        raise InvalidParameter("The parameter is invalid.")
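# Hedged sketch (illustration only): the two dict shapes _source_path_reader above accepts.
# "content" is written to a NamedTemporaryFile and the temp path is returned; "file" must
# point at an existing file. The values below are hypothetical.
import tempfile

inline_src = {"content": '{"type": "service_account"}'}
file_src = {"file": "/path/to/service_account.json"}

with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8", delete=False) as fp:
    fp.write(inline_src["content"])
    key_filepath = fp.name  # what the helper would hand on to the client library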
def execute(self, *args):
    for k, v in self.__dict__.items():
        self._logger.info("%s : %s" % (k, v))

    BaseBigQuery.execute(self)
    FileRead.execute(self)

    param_valid = EssentialParameters(self.__class__.__name__, [self.__table_schema])
    param_valid()

    files = super().get_target_files(self._src_dir, self._src_pattern)
    if len(files) > 1:
        raise InvalidFileCount("Input file must be only one.")
    if len(files) == 0:
        raise FileNotFound("The specified csv file not found.")

    insert_rows = []
    is_inserted = False
    # initial if_exists
    if_exists = self.REPLACE if self.__replace is True else self.APPEND
    self.__columns = [
        name_and_type["name"] for name_and_type in self.__table_schema
    ]
    with open(files[0], "r", encoding=self._encoding) as f:
        reader = csv.DictReader(f, delimiter=",")
        for r in reader:
            # extract only the specified columns
            row_dict = {}
            for c in self.__columns:
                if not r.get(c):
                    continue
                row_dict[c] = r.get(c)
            insert_rows.append(row_dict)
            if len(insert_rows) == self.BULK_LINE_CNT:
                self.__exec_insert(insert_rows, is_inserted, if_exists)
                insert_rows.clear()
                is_inserted = True
        if len(insert_rows) > 0:
            self.__exec_insert(insert_rows, is_inserted, if_exists)
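# Hedged sketch (illustration only): the chunked-insert pattern used above (and by the
# sqlite loaders below), extracted into a standalone generator. BULK_LINE_CNT and
# COMMIT_COUNT in the real steps play the role of `size`; the name `chunked` is
# hypothetical.
def chunked(rows, size):
    # Yield lists of at most `size` rows, so callers can insert and commit per chunk
    # instead of holding every csv record in memory at once.
    batch = []
    for row in rows:
        batch.append(row)
        if len(batch) == size:
            yield batch
            batch = []
    if batch:
        yield batch


assert list(chunked(range(5), 2)) == [[0, 1], [2, 3], [4]]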
def execute(self, *args):
    super().execute()

    valid = EssentialParameters(
        self.__class__.__name__,
        [self._collection, self._src_dir, self._src_pattern],
    )
    valid()

    files = super().get_target_files(self._src_dir, self._src_pattern)
    if len(files) == 0:
        raise FileNotFound("No files are found.")

    firestore_client = Firestore.get_firestore_client(self._credentials)
    for file in files:
        with open(file) as f:
            fname = os.path.splitext(os.path.basename(file))[0]
            doc = firestore_client.collection(self._collection).document(fname)
            doc.set(json.load(f))
def __call__(self, scenario_file):
    exists_scenario_file = os.path.isfile(scenario_file)
    if not exists_scenario_file:
        raise FileNotFound("scenario.yml %s does not exist" % scenario_file)
def execute(self, *args):
    # essential parameters check
    valid = EssentialParameters(
        self.__class__.__name__,
        [self._src_dir, self._src_pattern, self._tblname])
    valid()

    files = super().get_target_files(self._src_dir, self._src_pattern)
    self._logger.info("Files found %s" % files)
    if len(files) == 0:
        raise FileNotFound("No csv file was found.")
    files.sort()

    def func():
        # Find csv columns from all csv files
        csv_columns = []
        for file in files:
            with open(file, "r", encoding=self._encoding) as f:
                reader = csv.DictReader(f)
                for col in reader.fieldnames:
                    csv_columns.append(col)
        csv_columns = sorted(set(csv_columns), key=csv_columns.index)
        escaped_columns = ['"%s"' % fn for fn in csv_columns]

        if self._refresh is True:
            # Drop table in advance, If refresh is True
            self._sqlite_adptr.execute("DROP TABLE IF EXISTS %s" % self._tblname)
            self._sqlite_adptr.commit()
            self._create_table(self._tblname, escaped_columns)
        else:
            self._create_table(self._tblname, escaped_columns)
            if self._force_insert is True:
                self._alter_table(self._tblname, escaped_columns)
            else:
                # Make sure if csv columns and db table names are exactly the same
                db_columns = self._get_column_names(self._tblname)
                if escaped_columns != db_columns:
                    raise Exception(
                        "Csv columns %s were not matched to table column %s."
                        % (csv_columns, db_columns))

        for file in files:
            with open(file, mode="r", encoding=self._encoding) as f:
                reader = csv.DictReader(f)
                replace = True if self._primary_key else False
                # Put all csv records into the table.
                self._logger.info("Insert all csv records into table[%s]" % self._tblname)
                params = []
                for row in reader:
                    params.append(row)
                    if len(params) == self.COMMIT_COUNT:
                        self._sqlite_adptr.execute_many_insert(
                            self._tblname, csv_columns, params, replace)
                        self._sqlite_adptr.commit()
                        params.clear()
                if len(params) > 0:
                    self._sqlite_adptr.execute_many_insert(
                        self._tblname, csv_columns, params, replace)
                    self._sqlite_adptr.commit()

        if self._index and len(self._index) > 0:
            """
            Create index
            (Add the index at the end for better performance when insert data is large)
            """
            self._logger.info("Add index")
            self._sqlite_adptr.add_index(self._tblname, self._index)
            self._sqlite_adptr.commit()

    super().execute(func)
def execute(self, *args):
    # essential parameters check
    valid = EssentialParameters(
        self.__class__.__name__,
        [self._src_dir, self._src_pattern, self._tblname])
    valid()

    files = super().get_target_files(self._src_dir, self._src_pattern)
    self._logger.info("Files found %s" % files)
    if len(files) > 1:
        raise Exception("Input file must be only one.")
    if len(files) == 0:
        raise FileNotFound("No csv file was found.")

    def func():
        if self._refresh is True:
            # Drop table in advance, If refresh is True
            self._sqlite_adptr.execute("DROP TABLE IF EXISTS %s" % self._tblname)
            self._sqlite_adptr.commit()

        with codecs.open(files[0], mode="r", encoding=self._encoding) as f:
            reader = csv.DictReader(f)
            # Table columns will be the same with csv column names.
            escaped_columns = ['"%s"' % fn for fn in reader.fieldnames]

            self._logger.info("Create table [%s]" % self._tblname)
            if self._primary_key is None:
                sql = "CREATE TABLE IF NOT EXISTS %s (%s)"
                self._sqlite_adptr.execute(
                    sql % (self._tblname, " TEXT, ".join(escaped_columns) + " TEXT"))
            else:
                sql = "CREATE TABLE IF NOT EXISTS %s (%s, PRIMARY KEY(%s))"
                self._sqlite_adptr.execute(sql % (
                    self._tblname,
                    " TEXT, ".join(escaped_columns) + " TEXT",
                    self._primary_key,
                ))
            self._sqlite_adptr.commit()

            # Put all csv records into the table.
            self._logger.info("Insert all csv records into table[%s]" % self._tblname)
            params = []
            for row in reader:
                params.append(row)
                if len(params) == self.COMMIT_COUNT:
                    self._sqlite_adptr.execute_many_insert(
                        self._tblname, reader.fieldnames, params, False)
                    self._sqlite_adptr.commit()
                    params.clear()
            if len(params) > 0:
                self._sqlite_adptr.execute_many_insert(
                    self._tblname, reader.fieldnames, params, False)
                self._sqlite_adptr.commit()

        if self._index and len(self._index) > 0:
            """
            Create index
            (Add the index at the end for better performance when insert data is large)
            """
            self._logger.info("Add index")
            self._sqlite_adptr.add_index(self._tblname, self._index)
            self._sqlite_adptr.commit()

    super().execute(func)
def execute(self, *args):
    BaseBigQuery.execute(self)

    param_valid = EssentialParameters(self.__class__.__name__, [self._table_schema])
    param_valid()

    files = super().get_target_files(self._src_dir, self._src_pattern)
    if len(files) == 0:
        raise FileNotFound("The specified csv file not found.")
    self._logger.info("insert target files %s" % files)

    is_inserted = False
    # initial if_exists
    if_exists = self._REPLACE if self._replace is True else self._APPEND
    self._columns = [
        name_and_type["name"] for name_and_type in self._table_schema
    ]
    for file in files:
        insert_rows = []
        with open(file, "r", encoding=self._encoding) as f:
            reader = (csv.DictReader(f, delimiter=",")
                      if self._has_header is True
                      else csv.reader(f, delimiter=","))
            if self._has_header is True:
                for r in reader:
                    # extract only the specified columns
                    contents = {}
                    for c in self._columns:
                        if not r.get(c):
                            continue
                        contents[c] = r.get(c)
                    insert_rows.append(contents)
                    # bulk insert
                    if len(insert_rows) == self._BULK_LINE_CNT:
                        self._exec_insert(insert_rows, is_inserted, if_exists)
                        insert_rows.clear()
                        is_inserted = True
                if len(insert_rows) > 0:
                    self._exec_insert(insert_rows, is_inserted, if_exists)
                    is_inserted = True
            else:
                # csv headers do not exist
                for row in reader:
                    contents = {}
                    for i, c in enumerate(self._columns):
                        contents[c] = row[i]
                    insert_rows.append(contents)
                    # bulk insert
                    if len(insert_rows) == self._BULK_LINE_CNT:
                        self._exec_insert(insert_rows, is_inserted, if_exists)
                        insert_rows.clear()
                        is_inserted = True
                if len(insert_rows) > 0:
                    self._exec_insert(insert_rows, is_inserted, if_exists)
                    is_inserted = True
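# Hedged sketch (illustration only): how the header-less branch above maps positional
# csv values onto the schema column names. The column names and row are made up.
columns = ["id", "name", "age"]
row = ["1", "alice", "30"]
contents = {c: row[i] for i, c in enumerate(columns)}
assert contents == {"id": "1", "name": "alice", "age": "30"}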
def execute(self, *args):
    # essential parameters check
    valid = EssentialParameters(
        self.__class__.__name__,
        [self._src_dir, self._src_pattern, self._tblname])
    valid()

    files = super().get_target_files(self._src_dir, self._src_pattern)
    self._logger.info("Files found %s" % files)
    if len(files) == 0:
        raise FileNotFound("No csv file was found.")
    files.sort()

    def func():
        # Find csv columns from all csv files
        csv_columns = []
        for file in files:
            csv_columns.extend(Csv.get_column_names(file))
        csv_columns = sorted(set(csv_columns), key=csv_columns.index)

        if self._refresh is True:
            # Drop table in advance, If refresh is True
            self._sqlite_adptr.drop_table(self._tblname)
            self._sqlite_adptr.create_table(self._tblname, csv_columns, self._primary_key)
        else:
            self._sqlite_adptr.create_table(self._tblname, csv_columns, self._primary_key)
            if self._force_insert is True:
                db_columns = self._sqlite_adptr.get_column_names(self._tblname)
                result = list(set(csv_columns) - set(db_columns))
                self._sqlite_adptr.add_columns(self._tblname, result)
            else:
                # Make sure if csv columns and db table names are exactly the same
                db_columns = self._sqlite_adptr.get_column_names(self._tblname)
                if self._sqlite_adptr.escape_columns(
                        csv_columns) != self._sqlite_adptr.escape_columns(db_columns):
                    raise CliboaException(
                        "Csv columns %s were not matched to table column %s."
                        % (csv_columns, db_columns))

        for file in files:
            self._sqlite_adptr.import_table(
                file, self._tblname, refresh=False, encoding=self._encoding)

        if self._index and len(self._index) > 0:
            """
            Create index
            (Add the index at the end for better performance when insert data is large)
            """
            self._sqlite_adptr.add_index(self._tblname, self._index)

    super().execute(func)
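# Hedged sketch (illustration only): the force_insert reconciliation above adds any csv
# column that the existing table lacks, via a plain set difference. Column names here are
# hypothetical.
csv_columns = ["id", "name", "address"]
db_columns = ["id", "name"]
missing = list(set(csv_columns) - set(db_columns))
assert missing == ["address"]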