def __init__(
    self,
    args: Namespace,
    sources: Dict[str, Any],
    schema: List[Tuple[str, np.generic]],
    destinations: Dict[str, Any],
    stage: str,
    task: str,
):
    """Initialize parameters and client libraries for the ETL task.

    :param args: args passed from command line, see `get_arg_parser()`
    :param sources: data sources to be extracted,
        specified in task config, see `configs/*.py`
    :param schema: the target schema to load to.
    :param destinations: destinations to load data to,
        specified in task config, see `configs/*.py`
    :param stage: the stage of the loaded data, could be staging/production.
    :param task: the name of the task.
    """
    # Clear cached files
    if args.rm:
        for source in sources:
            files = []
            files += glob.glob(
                get_path_format(True).format(
                    prefix=destinations["fs"]["prefix"],
                    stage="raw",
                    task=args.task,
                    source=source,
                ))
            files += glob.glob(
                get_path_format(True).format(
                    prefix=destinations["fs"]["prefix"],
                    stage=stage,
                    task=args.task,
                    source=source,
                ))
            for f in files:
                log.info("Removing cached file: %s" % f)
                os.remove(f)
    self.task = task
    self.stage = stage
    self.args = args
    self.period = args.period
    self.current_date = args.date
    self.last_month = lookback_dates(args.date, args.period)
    self.sources = sources
    coltypes = []
    for coltype in schema:
        coltypes += [Column(coltype[0], [IsDtypeValidation(coltype[1])])]
    self.schema = Schema(coltypes)
    self.raw_schema = schema
    self.destinations = destinations
    self.raw = dict()
    self.extracted_base = dict()
    self.extracted = dict()
    self.transformed = dict()
    self.gcs = storage.Client()
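# A minimal sketch of the expected `schema` argument (hypothetical column
# names, assuming Column/Schema/IsDtypeValidation come from pandas_schema):
#
#     schema = [
#         ("country", np.object_),  # -> Column("country", [IsDtypeValidation(np.object_)])
#         ("count", np.int64),      # -> Column("count", [IsDtypeValidation(np.int64)])
#     ]
#
# Each (name, dtype) tuple is wrapped in a Column with an IsDtypeValidation,
# and the combined Schema is presumably used to validate extracted DataFrames.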
def get_latest_filepath(self, source: str, config: Dict[str, Any],
                        stage: str, dest: str) -> str:
    """Get the file path of the `latest` data file for a source.

    :rtype: str
    :param source: name of the data source, specified in task config,
        see `configs/*.py`
    :param config: config of the data source, specified in task config,
        see `configs/*.py`
    :param stage: the stage of the loaded data,
        could be raw/staging/production.
    :param dest: name of the destination to load data to,
        specified in task config, see `configs/*.py`
    :return: the file path of the `latest` data file
    """
    filename = "latest.{ext}".format(
        ext=self.get_dest_ext(self.destinations))
    return get_path_format().format(
        stage=stage,
        task=self.task,
        source=source,
        prefix=self.destinations[dest]["prefix"],
        filename=filename,
    )
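# Illustration (hypothetical values): with a filesystem prefix of "/data/" and
# a "jsonl" destination extension, and assuming get_path_format() yields
# "{prefix}{stage}-{task}-{source}/{filename}", this would return something
# like "/data/staging-mytask-mysource/latest.jsonl".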
def get_filepath(
    self,
    source: str,
    config: Dict[str, Any],
    stage: str,
    dest: str,
    page: Union[int, str] = None,
    date: datetime.datetime = None,
) -> str:
    """Get data file path.

    The format would be {prefix}{stage}-{task}-{source}/{filename}

    :rtype: str
    :param source: name of the data source to be extracted,
        specified in task config, see `configs/*.py`
    :param config: config of the data source to be extracted,
        specified in task config, see `configs/*.py`
    :param stage: the stage of the loaded data,
        could be raw/staging/production.
    :param dest: name of the destination to load data to,
        specified in task config, see `configs/*.py`
    :param page: the page part of the data file name
    :param date: the date part of the data file name,
        will use `self.current_date` if not specified
    :return: the data file path
    """
    if config["type"] == "gcs":
        if dest == "gcs":
            prefix = config["prefix"]
        else:
            prefix = self.destinations[dest]["prefix"]
        fpath = prefix + config["path"] + config["filename"]
        if page is not None:
            # page may be an int, so cast before substituting the wildcard
            fpath = fpath.replace("*", str(page))
        return fpath
    else:
        return get_path_format().format(
            stage=stage,
            task=self.task,
            source=source,
            prefix=self.destinations[dest]["prefix"],
            filename=self.get_filename(source, config, stage, dest, page, date),
        )
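# Usage sketch (hypothetical configs and values):
#
#     # "gcs"-type source: the path comes from the source config itself, and
#     # the page number replaces the "*" wildcard in the configured filename.
#     # cfg = {"type": "gcs", "prefix": "gs://in/", "path": "d/", "filename": "data-*.json"}
#     # self.get_filepath("src", cfg, "raw", "gcs", page=2) -> "gs://in/d/data-2.json"
#
#     # Any other source type: the path is assembled from get_path_format()
#     # and get_filename(), i.e. "{prefix}{stage}-{task}-{source}/{filename}".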
def convert_latest_file(
    self,
    config: Dict[str, Any],
    source: str,
    stage: str,
    date: datetime.datetime = None,
):
    """Copy the most recent dated data file of a source to its `latest` path.

    The logic is based on task config (see `configs/*.py`).

    :param config: the corresponding source config
    :param source: the name of the data source
    :param stage: the stage of the data
    :param date: the date of the data
    """
    # find the latest file
    files = glob.glob(
        get_path_format(True).format(
            prefix=self.destinations["fs"]["prefix"],
            stage=stage,
            task=self.task,
            source=source,
        ))
    latest_ds = None
    latest_file = None
    for file in files:
        fn = get_path_prefix(os.path.basename(file))[0:-1]
        try:
            ds = datetime.datetime.strptime(fn, DEFAULT_DATE_FORMAT)
        except ValueError:
            # could be "latest"
            continue
        if latest_ds is None or ds > latest_ds:
            latest_ds = ds
            latest_file = file
    # nothing to copy if no dated file was found
    if latest_file is None:
        return
    # copy to latest filepath
    latest_dest_file = self.get_latest_filepath(source, config, stage, "fs")
    copyfile(latest_file, latest_dest_file)
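# Sketch of the effect (hypothetical paths, assuming DEFAULT_DATE_FORMAT is a
# plain date such as "%Y-%m-%d"): given cached files
#
#     /data/staging-mytask-src/2019-08-01.jsonl
#     /data/staging-mytask-src/2019-09-01.jsonl
#     /data/staging-mytask-src/latest.jsonl
#
# the 2019-09-01 file has the most recent parseable date, so it is copied over
# /data/staging-mytask-src/latest.jsonl; names that don't parse as dates
# (e.g. "latest") are skipped.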
def get_filepaths(
    self,
    source: str,
    config: Dict[str, Any],
    stage: str,
    dest: str,
    date: datetime.datetime = None,
) -> List[str]:
    """Get existing data file paths with wildcard page number.

    :rtype: list[str]
    :param source: name of the data source to be extracted,
        specified in task config, see `configs/*.py`
    :param config: config of the data source to be extracted,
        specified in task config, see `configs/*.py`
    :param stage: the stage of the loaded data,
        could be raw/staging/production.
    :param dest: name of the destination to load data to,
        specified in task config, see `configs/*.py`
    :param date: the date part of the data file name,
        will use `self.current_date` if not specified
    :return: a list of data file paths
    """
    if config["type"] == "gcs":
        if dest == "gcs":
            prefix = config["prefix"]
        else:
            prefix = self.destinations[dest]["prefix"]
        return glob.glob(prefix + config["path"] + config["filename"])
    else:
        return glob.glob(get_path_format().format(
            stage=stage,
            task=self.task,
            source=source,
            prefix=self.destinations[dest]["prefix"],
            filename=self.get_filename(source, config, stage, dest, "*", date),
        ))
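# Usage sketch (hypothetical values): for a paged, non-GCS source this passes
# page="*" to get_filename() and globs the result, so it returns every cached
# page for the date, e.g.
#
#     self.get_filepaths("src", cfg, "raw", "fs")
#     # -> ["/data/raw-mytask-src/2019-09-01.1.jsonl",
#     #     "/data/raw-mytask-src/2019-09-01.2.jsonl"]
#
# (the exact filename layout depends on get_filename(), which is not shown here).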