Example no. 1
0
            class MyTaskC(Task):
                """Example task that reads selected columns from MyTaskA's output.

                NOTE(review): b_reviews is declared as a requirement but never read
                in run() — confirm whether it is needed only for scheduling order.
                """

                requires = Requires()
                a_reviews = Requirement(MyTaskA)
                b_reviews = Requirement(MyTaskB)

                def run(self):
                    # Load only columns "a" and "b" from the upstream target;
                    # self.input() is a dict keyed by requirement name.

                    dsk = self.input()["a_reviews"].read_dask(
                        columns=["a", "b"])

                    # Presumably a helper that materializes/consumes the dask
                    # dataframe — its definition is not visible here; verify.
                    read_dask_dataframe(dsk)
Example no. 2
0
class LimitFactLoad(Task):
    """
    This task outputs a local ParquetTarget in ./data/limitfact/ for further load into the database.
    """

    # BUG FIX: the docstring above was originally placed AFTER LOCAL_ROOT, which
    # made it a dead bare-string statement instead of the class __doc__.

    # Root directory for this task's Parquet output.
    LOCAL_ROOT = "./data/limitfact/"

    # When True, only the first dask partition is written (fast sample run).
    subset = BoolParameter(default=True)
    requires = Requires()
    limit_fact = Requirement(LimitFactVer)
    output = TargetOutput(target_class=ParquetTarget,
                          file_pattern=LOCAL_ROOT,
                          ext="")

    def run(self):
        """Read the upstream data as a dask dataframe and write gzip Parquet."""
        # Pin ambiguous columns to strings so dask does not mis-infer dtypes
        # differently across partitions.
        dtype_dic = {
            "MTN": "object",
            "CUST_ACCT": "object",
            "LIMITING_DT": "object",
            "LIMIT_TYPE": "object",
        }
        # self.input() is a dict keyed by requirement name.
        dsk = self.input()["limit_fact"].read_dask(dtype=dtype_dic)

        if self.subset:
            # Keep only the first partition for a quick, small run.
            dsk = dsk.get_partition(0)

        self.output().write_dask(dsk, compression="gzip", compute=True)
Example no. 3
0
class AccountDimLoad(Task):
    """
    This task outputs a local ParquetTarget in ./data/accountdim/ for further load into the database.
    """

    # BUG FIX: the docstring above was originally placed AFTER LOCAL_ROOT, which
    # made it a dead bare-string statement instead of the class __doc__.

    # Root directory for this task's Parquet output.
    LOCAL_ROOT = "./data/accountdim/"

    # When True, only the first dask partition is written (fast sample run).
    subset = BoolParameter(default=True)
    requires = Requires()
    account_dim = Requirement(AccountDimVer)
    output = TargetOutput(target_class=ParquetTarget,
                          file_pattern=LOCAL_ROOT,
                          ext="")

    def run(self):
        """Read the upstream data as a dask dataframe and write gzip Parquet."""
        # Pin ambiguous columns to strings so dask does not mis-infer dtypes
        # differently across partitions.
        dtype_dic = {
            "CUST_ACCT": "object",
            "SEGMENT_NAME": "object",
            "SVC_PLAN": "object",
        }
        # self.input() is a dict keyed by requirement name.
        dsk = self.input()["account_dim"].read_dask(dtype=dtype_dic)

        if self.subset:
            # Keep only the first partition for a quick, small run.
            dsk = dsk.get_partition(0)

        self.output().write_dask(dsk, compression="gzip", compute=True)
Example no. 4
0
class LineDimLoad(Task):
    """
    This task outputs a local ParquetTarget in ./data/linedim/ for further load into the database.
    """

    # BUG FIX: the docstring above was originally placed AFTER LOCAL_ROOT, which
    # made it a dead bare-string statement instead of the class __doc__.

    # Root directory for this task's Parquet output.
    LOCAL_ROOT = "./data/linedim/"

    # When True, only the first dask partition is written (fast sample run).
    subset = BoolParameter(default=True)
    requires = Requires()
    line_dim = Requirement(LineDimVer)
    output = TargetOutput(target_class=ParquetTarget,
                          file_pattern=LOCAL_ROOT,
                          ext="")

    def run(self):
        """Read the upstream data as a dask dataframe and write gzip Parquet."""
        # Pin ambiguous columns to strings so dask does not mis-infer dtypes
        # differently across partitions.
        dtype_dic = {
            "MDN": "object",
            "DEVICE_GROUPING": "object",
            "SALES_CHANNEL": "object",
        }
        # self.input() is a dict keyed by requirement name.
        dsk = self.input()["line_dim"].read_dask(dtype=dtype_dic)

        if self.subset:
            # Keep only the first partition for a quick, small run.
            dsk = dsk.get_partition(0)

        self.output().write_dask(dsk, compression="gzip", compute=True)
Example no. 5
0
class ByMdn(Task):
    """Read mdn/event_time from the cleaned data, drop incomplete rows, and
    write the result as gzip-compressed Parquet under LOCAL_ROOT."""

    LOCAL_ROOT = "file://data/amplitude/by_mdn/"

    requires = Requires()
    clean_files = Requirement(CleanandProcessData)

    output = TargetOutput(target_class=ParquetTarget,
                          file_pattern=LOCAL_ROOT,
                          ext="")

    def print_results(self):
        # Materialize the written Parquet output and echo it for inspection.
        frame = self.output().read_dask()
        print(frame.compute())

    def run(self):
        # Load only the two columns needed downstream, parsing event_time
        # as a datetime; self.input() is a dict keyed by requirement name.
        frame = self.input()["clean_files"].read_dask(
            columns=["mdn", "event_time"], parse_dates=["event_time"])

        # Rows missing either column are dropped before writing.
        cleaned = frame.dropna()

        self.output().write_dask(cleaned, compression="gzip")
        self.print_results()
Example no. 6
0
class ArchiveGzFile(ExternalTask):
    """
    Requires - Output from VerifyFileArrived
    Run - Copy the File to Archive Directory
    Output - File Exists in the Archive Directory
    """

    S3_DEST_PATH = ("s3://cscie29vsf/archive/"
                    )  # Destination S3 Path, as a constant, target directory
    client = S3Client()

    requires = Requires()
    verify_files = Requirement(VerifyFileArrived)

    def output(self):
        # return the S3Target of the files
        return S3Target(self.S3_DEST_PATH, client=self.client)

    def run(self):
        # Use self.output() and self.input() targets to atomically copy
        # the file.
        # BUG FIX: with the Requires()/Requirement() pattern, self.input()
        # returns a dict keyed by requirement name (as every other task in
        # this file uses it), so the source target must be looked up by
        # "verify_files" before taking .path.
        src = self.input()["verify_files"].path
        self.client.copy(src, self.output().path)
Example no. 7
0
class CleanandProcessData(Task):
    """
    Requires - File from the Data Directory (txt format)
    Run - Clean and Process the Files and convert to a csv/parquet file
    Output - Parquet File with the salted output
    """

    # Root directory for this task's local Parquet output.
    LOCAL_ROOT = "file://data/amplitude/"

    # Expected column order of the raw pipe-delimited amplitude export.
    # NOTE(review): not referenced in run() below — presumably consumed by
    # read_dask or by callers elsewhere in the project; confirm.
    col_names = [
        "mdn",
        "app",
        "amplitude_id",
        "device_id",
        "user_id",
        "event_time",
        "client_event_time",
        "client_upload_time",
        "server_upload_time",
        "event_id",
        "session_id",
        "event_type",
        "amplitude_event_type",
        "version_name",
        "os_name",
        "os_version",
        "device_brand",
        "device_manufacturer",
        "device_model",
        "device_carrier",
        "country",
        "language",
        "location_lat",
        "location_lng",
        "ip_address",
        "event_properties",
        "user_properties",
        "region",
        "city",
        "dma",
        "device_family",
        "device_type",
        "platform",
        "uuid",
        "paying",
        "start_version",
        "user_creation_time",
        "library",
        "idfa",
        "adid",
    ]

    requires = Requires()
    verify_files = Requirement(VerifyFileArrived)
    # archive_files is required for scheduling only; its target is not read here.
    archive_files = Requirement(ArchiveGzFile)

    output = TargetOutput(target_class=ParquetTarget,
                          file_pattern=LOCAL_ROOT,
                          ext="")

    def run(self):
        # Pin ambiguous columns to strings so dask does not mis-infer dtypes
        # differently across partitions.
        dtype_dic = {
            "amplitude_id": "object",
            "os_version": "object",
            "mdn": "object"
        }
        # Read the gzip'd, pipe-delimited source; blocksize=None keeps one
        # partition per file (gzip is not splittable). self.input() is a
        # dict keyed by requirement name.
        dsk = self.input()["verify_files"].read_dask(
            parse_dates=[
                "event_time",
                "client_event_time",
                "client_upload_time",
                "server_upload_time",
            ],
            dtype=dtype_dic,
            delimiter="|",
            compression="gzip",
            blocksize=None,
        )
        # Write the cleaned frame as gzip-compressed Parquet.
        self.output().write_dask(dsk, compression="gzip", compute=True)
Example no. 8
0
 class NewByMdn(ByMdn):
     """Variant of ByMdn that reads NewCleanedData and writes under path + "mdn"."""

     # Overrides ByMdn.LOCAL_ROOT; `path` is presumably a module-level
     # base-directory string — confirm where it is defined.
     LOCAL_ROOT = path + "mdn"
     clean_files = Requirement(NewCleanedData)
     output = TargetOutput(file_pattern=LOCAL_ROOT,
                           target_class=ParquetTarget,
                           ext="")
Example no. 9
0
 class NewCleanedData(CleanandProcessData):
     """Variant of CleanandProcessData that reads NewAmplitudeFiles and writes under `path`."""

     verify_files = Requirement(NewAmplitudeFiles)
     # Overrides the parent's output target; `path` is presumably a
     # module-level base-directory string — confirm where it is defined.
     output = TargetOutput(target_class=ParquetTarget,
                           file_pattern=path,
                           ext="")