Example #1
0
    def __init__(
        self,
        args: Namespace,
        sources: Dict[str, Any],
        schema: List[Tuple[str, np.generic]],
        destinations: Dict[str, Any],
        stage: str,
        task: str,
    ):
        """Set up parameters and client libraries for the ETL task.

        :param args: args passed from command line,
        see `get_arg_parser()`
        :param sources: data source to be extracted,
        specified in task config, see `configs/*.py`
        :param schema: the target schema to load to.
        :param destinations: destinations to load data to,
        specified in task config, see `configs/*.py`
        :param stage: the stage of the loaded data, could be staging/production.
        :param task: the name of the task.
        """
        # Drop cached files from previous runs when --rm was requested,
        # covering both the "raw" stage and the current stage.
        if args.rm:
            for source in sources:
                cached_files = []
                for cached_stage in ("raw", stage):
                    cached_files.extend(
                        glob.glob(
                            get_path_format(True).format(
                                prefix=destinations["fs"]["prefix"],
                                stage=cached_stage,
                                task=args.task,
                                source=source,
                            )))
                for cached_file in cached_files:
                    log.info("Removing cached file: %s" % cached_file)
                    os.remove(cached_file)
        self.task = task
        self.stage = stage
        self.args = args
        self.period = args.period
        self.current_date = args.date
        self.last_month = lookback_dates(args.date, args.period)
        self.sources = sources
        # Wrap each (name, dtype) pair into a validated schema column.
        self.schema = Schema([
            Column(name, [IsDtypeValidation(dtype)]) for name, dtype in schema
        ])
        self.raw_schema = schema
        self.destinations = destinations
        self.raw = {}
        self.extracted_base = {}
        self.extracted = {}
        self.transformed = {}
        self.gcs = storage.Client()
Example #2
0
 def extract(self):
     """Inherit from super class and extract latest fb_index for later use."""
     super().extract()
     source = "bukalapak"
     requested = self.args.source
     if not requested or source in requested.split(","):
         config = self.sources[source]
         yesterday = lookback_dates(self.current_date, 1)
         # Pick the extractor matching the destination: filesystem vs GCS.
         extract_fn = (self.extract_via_fs
                       if self.args.dest == "fs" else self.extract_via_gcs)
         self.extracted[source + "_base"] = extract_fn(
             source, config, "raw", yesterday)
Example #3
0
 def get_backfill_dates(self):
     """Return formatted backfill dates from config, or None if unconfigured."""
     if "backfill_days" not in self.config:
         return
     # Parse the reference date once; it is invariant across backfill days.
     base_date = datetime.datetime.strptime(
         self.date, utils.config.DEFAULT_DATE_FORMAT)
     return [
         lookback_dates(base_date, offset).strftime(
             utils.config.DEFAULT_DATE_FORMAT)
         for offset in self.config["backfill_days"]
     ]
Example #4
0
    def extract_via_api(
        self,
        source: str,
        config: Dict[str, Any],
        stage: str = "raw",
        date: datetime.datetime = None,
    ) -> Union[DataFrame, Dict[str, DataFrame]]:
        """Extract data from API and convert into DataFrame.

        The logic is based on task config, see `configs/*.py`

        :rtype: DataFrame
        :param source: name of the data source to be extracted,
            specified in task config, see `configs/*.py`
        :param config: config of the data source to be extracted,
            specified in task config, see `configs/*.py`
        :param stage: the stage of the loaded data, could be raw/staging/production.
        :param date: the date part of the data file name,
            will use `self.current_date` if not specified
        :return: the extracted `DataFrame`
        """
        # Resolve the query window; format BOTH endpoints with the source's
        # date format. (Previously the explicit-`date` path passed raw
        # datetime objects into the URL, unlike the default path.)
        if date is None:
            start_date = self.last_month.strftime(config["date_format"])
            end_date = self.current_date.strftime(config["date_format"])
        else:
            start_date = lookback_dates(date, self.period).strftime(
                config["date_format"])
            end_date = date.strftime(config["date_format"])
        # Seconds to sleep between consecutive requests (simple rate limiting).
        request_interval = config.get("request_interval", 1)
        if "iterator" in config:
            # One request per configured iterator value; results keyed by the
            # stringified iterator.
            raw = dict()
            extracted = dict()
            for it in config["iterator"]:
                log.debug("waiting for %s iterator %d" % (source, it))
                time.sleep(request_interval)
                it = str(it)
                url = config["url"].format(
                    api_key=config["api_key"],
                    start_date=start_date,
                    end_date=end_date,
                    iterator=it,
                )
                r = requests.get(url, allow_redirects=True)
                raw[it] = r.text
                extracted[it] = convert_df(raw[it], config)
            self.raw[source] = raw
            log.info("%s-%s-%s/%s x %d iterators extracted from API" %
                     (stage, self.task, source, self.current_date.date(),
                      len(extracted)))
            return extracted
        elif "page_size" in config:
            # Paged API: fetch page 1 first to learn the total page count.
            limit = config["page_size"]
            url = config["url"].format(
                api_key=config["api_key"],
                start_date=start_date,
                end_date=end_date,
                page=1,
                limit=limit,
            )
            r = requests.get(url, allow_redirects=True)
            raw = [r.text]
            extracted = convert_df(raw[0], config)
            # Check for a missing page count BEFORE converting: the previous
            # `int(json_extract(...))` raised on None ahead of the None check.
            page_count = json_extract(raw[0], config["json_path_page_count"])
            count = int(page_count) if page_count is not None else None
            if count is None or count <= 1:
                self.raw[source] = raw
                log.info("%s-%s-%s/%s x 1 page extracted from API" %
                         (stage, self.task, source, self.current_date.date()))
                return extracted
            # Pages are 1-based, so fetch pages 2..count inclusive
            # (`range(2, count)` previously dropped the final page).
            for page in range(2, count + 1):
                log.debug("waiting for %s page %d" % (source, page))
                time.sleep(request_interval)
                url = config["url"].format(
                    api_key=config["api_key"],
                    start_date=start_date,
                    end_date=end_date,
                    page=page,
                    limit=limit,
                )
                r = requests.get(url, allow_redirects=True)
                raw += [r.text]
                extracted = extracted.append(convert_df(raw[page - 1], config))
            extracted = extracted.reset_index(drop=True)
            self.raw[source] = raw
            log.info(
                "%s-%s-%s/%s x %d pages extracted from API" %
                (stage, self.task, source, self.current_date.date(), count))
            return extracted
        else:
            # Single-shot API: one request, no paging.
            url = config["url"].format(api_key=config["api_key"],
                                       start_date=start_date,
                                       end_date=end_date)
            r = requests.get(url, allow_redirects=True)
            raw = r.text
            self.raw[source] = raw
            # Log the actual `stage` argument (was hard-coded to "raw",
            # inconsistent with the other branches).
            log.info("%s-%s-%s/%s extracted from API" %
                     (stage, self.task, source, self.current_date.date()))
            return convert_df(raw, config)
Example #5
0
 def get_latest_date(self):
     """Return the most recent date for which data is assumed available.

     The latest date passed is assumed to be one day behind "now"; a
     configured `days_behind` extends that lag.
     """
     if "days_behind" in self.config:
         lookback_period = self.config["days_behind"] + 1
     else:
         lookback_period = 1
     return lookback_dates(datetime.datetime.utcnow(), lookback_period).date()
Example #6
0
"""Adjust ETL task."""
import datetime
from argparse import Namespace
from typing import Dict, Any, List, Tuple
from tasks import base
import numpy as np
from utils.config import get_configs, get_arg_parser
import logging

from utils.marshalling import lookback_dates

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Default CLI argument values: `date` presumably falls back to yesterday
# (UTC) via lookback_dates — TODO confirm lookback_dates subtracts days.
DEFAULTS = {"date": lookback_dates(datetime.datetime.utcnow(), 1)}


class AdjustEtlTask(base.EtlTask):
    """ETL task to compute Adjust from events."""
    def __init__(
        self,
        args: Namespace,
        sources: Dict[str, Any],
        schema: List[Tuple[str, np.generic]],
        destinations: Dict[str, Any],
    ):
        """Initialize Adjust ETL task.

        :param args: args passed from command line,
        see `get_arg_parser()`
        :param sources: data source to be extracted,
        specified in task config, see `configs/*.py`