Esempio n. 1
0
    def __init__(
        self,
        args: Namespace,
        sources: Dict[str, Any],
        schema: List[Tuple[str, np.generic]],
        destinations: Dict[str, Any],
        stage: str,
        task: str,
    ):
        """Set up task state and client libraries for an ETL task.

        When ``args.rm`` is truthy, every file matching the cache glob
        pattern (built from ``get_path_format(True)``) is deleted for each
        source, for both the ``"raw"`` stage and the given ``stage``.

        :param args: args passed from command line,
            see `get_arg_parser()`
        :param sources: data sources to be extracted,
            specified in task config, see `configs/*.py`
        :param schema: the target schema to load to, as a list of
            (column name, dtype) pairs.
        :param destinations: destinations to load data to,
            specified in task config, see `configs/*.py`
        :param stage: the stage of the loaded data, could be staging/production.
        :param task: the name of the task.
        """
        if args.rm:
            # Drop cached files before any state is initialised.
            for source in sources:
                # Collect the matches of both stages first, then delete.
                cached = [
                    path
                    for cache_stage in ("raw", stage)
                    for path in glob.glob(
                        get_path_format(True).format(
                            prefix=destinations["fs"]["prefix"],
                            stage=cache_stage,
                            task=args.task,
                            source=source,
                        ))
                ]
                for path in cached:
                    log.info("Removing cached file: %s" % path)
                    os.remove(path)

        # Task identity and run parameters.
        self.task = task
        self.stage = stage
        self.args = args
        self.period = args.period
        self.current_date = args.date
        self.last_month = lookback_dates(args.date, args.period)
        self.sources = sources

        # One dtype-validated column per (name, dtype) entry of the schema;
        # the untouched input is kept alongside as `raw_schema`.
        self.schema = Schema([
            Column(entry[0], [IsDtypeValidation(entry[1])])
            for entry in schema
        ])
        self.raw_schema = schema
        self.destinations = destinations

        # Per-stage data containers, filled in by later steps.
        self.raw = {}
        self.extracted_base = {}
        self.extracted = {}
        self.transformed = {}

        # Storage client (presumably Google Cloud Storage -- confirm
        # against the file's imports).
        self.gcs = storage.Client()
Esempio n. 2
0
 def get_latest_filepath(self, source: str, config: Dict[str, Any],
                         stage: str, dest: str) -> str:
     """Get the path of the "latest" data file for a source.

     The file name is ``latest.<ext>``, where the extension comes from
     ``self.get_dest_ext(self.destinations)``.

     :rtype: str
     :param source: name of the data source
     :param config: config of the data source (not used here)
     :param stage: the stage of the data
     :param dest: name of the destination whose prefix is used
     :return: the formatted file path
     """
     extension = self.get_dest_ext(self.destinations)
     latest_name = "latest.{ext}".format(ext=extension)
     path_format = get_path_format()
     return path_format.format(
         stage=stage,
         task=self.task,
         source=source,
         prefix=self.destinations[dest]["prefix"],
         filename=latest_name,
     )
Esempio n. 3
0
    def get_filepath(
        self,
        source: str,
        config: Dict[str, Any],
        stage: str,
        dest: str,
        page: Union[int, str] = None,
        date: datetime.datetime = None,
    ) -> str:
        """Get data file path.

        For sources whose config type is ``"gcs"`` the path is
        ``prefix + config["path"] + config["filename"]``, with every ``*``
        in it replaced by ``page`` when a page is given.

        Otherwise the format would be
        {prefix}{stage}-{task}-{source}/{filename}

        :rtype: str
        :param source: name of the data source to be extracted,
            specified in task config, see `configs/*.py`
        :param config: config of the data source to be extracted,
            specified in task config, see `configs/*.py`
        :param stage: the stage of the loaded data, could be raw/staging/production.
        :param dest: name of the destination to load data to,
            specified in task config, see `configs/*.py`
        :param page: the page part of the data file name (int or str)
        :param date: the date part of the data file name,
            passed through to `self.get_filename`
        :return: the data file path
        """
        if config["type"] == "gcs":
            # A "gcs" destination uses the source's own prefix; any other
            # destination uses the prefix configured for that destination.
            if dest == "gcs":
                prefix = config["prefix"]
            else:
                prefix = self.destinations[dest]["prefix"]
            fpath = prefix + config["path"] + config["filename"]
            if page is not None:
                # `page` may be an int; str.replace() only accepts strings,
                # so convert explicitly to avoid a TypeError.
                fpath = fpath.replace("*", str(page))
            return fpath

        return get_path_format().format(
            stage=stage,
            task=self.task,
            source=source,
            prefix=self.destinations[dest]["prefix"],
            filename=self.get_filename(source, config, stage, dest, page,
                                       date),
        )
Esempio n. 4
0
    def convert_latest_file(
        self,
        config: Dict[str, Any],
        source: str,
        stage: str,
        date: datetime.datetime = None,
    ):
        """Copy the most recent dated data file to the "latest" file path.

        Files matching the glob pattern for the given stage/task/source
        under the ``"fs"`` destination are scanned; the one whose name
        parses (via ``DEFAULT_DATE_FORMAT``) to the newest date is copied to
        the path returned by `get_latest_filepath`.

        :param config: the corresponding source config
        :param source: the name of the data source
        :param stage: the stage of the data
        :param date: the date of the data (not used here)
        :raises FileNotFoundError: if no file with a parseable date is found
        """
        # find the latest file
        pattern = get_path_format(True).format(
            prefix=self.destinations["fs"]["prefix"],
            stage=stage,
            task=self.task,
            source=source,
        )
        latest_ds = None
        latest_file = None
        for file in glob.glob(pattern):
            # The last character of the path prefix is dropped before parsing
            # (presumably a trailing separator -- confirm in get_path_prefix).
            fn = get_path_prefix(os.path.basename(file))[0:-1]
            try:
                ds = datetime.datetime.strptime(fn, DEFAULT_DATE_FORMAT)
            except ValueError:
                # could be "latest"
                continue
            if latest_ds is None or ds > latest_ds:
                latest_ds = ds
                latest_file = file

        if latest_file is None:
            # Without this guard copyfile(None, ...) fails with an opaque
            # TypeError; name what was searched for instead.
            raise FileNotFoundError(
                "No dated data file found matching: %s" % pattern)

        # copy to latest filepath
        latest_dest_file = self.get_latest_filepath(source, config, stage,
                                                    "fs")
        copyfile(latest_file, latest_dest_file)
Esempio n. 5
0
    def get_filepaths(
        self,
        source: str,
        config: Dict[str, Any],
        stage: str,
        dest: str,
        date: datetime.datetime = None,
    ) -> List[str]:
        """List existing data files, matching any page number.

        A glob pattern is built and expanded against the file system. For
        ``"gcs"`` typed sources the pattern is
        ``prefix + config["path"] + config["filename"]``; for every other
        type it is the regular path format with ``"*"`` as the page.

        :rtype: list[str]
        :param source: name of the data source to be extracted,
            specified in task config, see `configs/*.py`
        :param config: config of the data source to be extracted,
            specified in task config, see `configs/*.py`
        :param stage: the stage of the loaded data, could be raw/staging/production.
        :param dest: name of the destination to load data to,
            specified in task config, see `configs/*.py`
        :param date: the date part of the data file name,
            passed through to `self.get_filename`
        :return: a list of data file paths
        """
        if config["type"] == "gcs":
            # A "gcs" destination uses the source's own prefix; any other
            # destination uses the prefix configured for that destination.
            prefix = (config["prefix"] if dest == "gcs"
                      else self.destinations[dest]["prefix"])
            pattern = prefix + config["path"] + config["filename"]
        else:
            pattern = get_path_format().format(
                stage=stage,
                task=self.task,
                source=source,
                prefix=self.destinations[dest]["prefix"],
                filename=self.get_filename(source, config, stage, dest, "*",
                                           date),
            )
        return glob.glob(pattern)