class MyTaskC(Task):
    requires = Requires()
    a_reviews = Requirement(MyTaskA)
    b_reviews = Requirement(MyTaskB)

    def run(self):
        # Read only the needed columns from MyTaskA's parquet output
        dsk = self.input()["a_reviews"].read_dask(columns=["a", "b"])
        dsk.compute()  # materialize the dask dataframe (stand-in for real work)
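For orientation, the Requires/Requirement descriptor pair above replaces the dependency boilerplate that vanilla luigi would otherwise need; a rough hand-written equivalent (a sketch of the effect, not the descriptors' actual implementation; MyTaskCVanilla is a hypothetical name):

class MyTaskCVanilla(Task):
    # Hypothetical vanilla-luigi spelling of the same wiring: requires()
    # returns a dict, so self.input() is keyed the same way as above
    def requires(self):
        return {"a_reviews": MyTaskA(), "b_reviews": MyTaskB()}

    def run(self):
        dsk = self.input()["a_reviews"].read_dask(columns=["a", "b"])
        dsk.compute()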
class LimitFactLoad(Task):
    """
    This task outputs a local ParquetTarget in ./data/limitfact/
    for further loading into the database.
    """

    LOCAL_ROOT = "./data/limitfact/"

    subset = BoolParameter(default=True)

    requires = Requires()
    limit_fact = Requirement(LimitFactVer)

    output = TargetOutput(target_class=ParquetTarget, file_pattern=LOCAL_ROOT, ext="")

    def run(self):
        # Read these columns as strings so dask does not mis-infer dtypes
        dtype_dic = {
            "MTN": "object",
            "CUST_ACCT": "object",
            "LIMITING_DT": "object",
            "LIMIT_TYPE": "object",
        }
        dsk = self.input()["limit_fact"].read_dask(dtype=dtype_dic)
        if self.subset:
            # Keep only the first partition for a quick development run
            dsk = dsk.get_partition(0)
        self.output().write_dask(dsk, compression="gzip", compute=True)
class AccountDimLoad(Task):
    """
    This task outputs a local ParquetTarget in ./data/accountdim/
    for further loading into the database.
    """

    LOCAL_ROOT = "./data/accountdim/"

    subset = BoolParameter(default=True)

    requires = Requires()
    account_dim = Requirement(AccountDimVer)

    output = TargetOutput(target_class=ParquetTarget, file_pattern=LOCAL_ROOT, ext="")

    def run(self):
        dtype_dic = {
            "CUST_ACCT": "object",
            "SEGMENT_NAME": "object",
            "SVC_PLAN": "object",
        }
        dsk = self.input()["account_dim"].read_dask(dtype=dtype_dic)
        if self.subset:
            dsk = dsk.get_partition(0)
        self.output().write_dask(dsk, compression="gzip", compute=True)
class LineDimLoad(Task):
    """
    This task outputs a local ParquetTarget in ./data/linedim/
    for further loading into the database.
    """

    LOCAL_ROOT = "./data/linedim/"

    subset = BoolParameter(default=True)

    requires = Requires()
    line_dim = Requirement(LineDimVer)

    output = TargetOutput(target_class=ParquetTarget, file_pattern=LOCAL_ROOT, ext="")

    def run(self):
        dtype_dic = {
            "MDN": "object",
            "DEVICE_GROUPING": "object",
            "SALES_CHANNEL": "object",
        }
        dsk = self.input()["line_dim"].read_dask(dtype=dtype_dic)
        if self.subset:
            dsk = dsk.get_partition(0)
        self.output().write_dask(dsk, compression="gzip", compute=True)
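The three load tasks above are deliberately parallel in shape, so they fan out naturally from a single wrapper; a minimal sketch using plain luigi (LoadAll is a hypothetical name, not part of the pipeline above):

from luigi import WrapperTask

class LoadAll(WrapperTask):
    # Hypothetical convenience wrapper: completes once all three load tasks have run
    def requires(self):
        return [LimitFactLoad(), AccountDimLoad(), LineDimLoad()]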
class ByMdn(Task):
    LOCAL_ROOT = "file://data/amplitude/by_mdn/"

    requires = Requires()
    clean_files = Requirement(CleanandProcessData)

    output = TargetOutput(target_class=ParquetTarget, file_pattern=LOCAL_ROOT, ext="")

    def print_results(self):
        print(self.output().read_dask().compute())

    def run(self):
        dsk = self.input()["clean_files"].read_dask(
            columns=["mdn", "event_time"], parse_dates=["event_time"]
        )
        out = dsk.dropna()
        self.output().write_dask(out, compression="gzip")
        self.print_results()
class ArchiveGzFile(Task):
    """
    Requires - Output from VerifyFileArrived
    Run - Copy the file to the archive directory
    Output - File exists in the archive directory
    """

    # Destination S3 path (the archive directory), as a constant
    S3_DEST_PATH = "s3://cscie29vsf/archive/"

    client = S3Client()

    requires = Requires()
    verify_files = Requirement(VerifyFileArrived)

    def output(self):
        # Return the S3Target of the archived files
        return S3Target(self.S3_DEST_PATH, client=self.client)

    def run(self):
        # Use the input/output targets to atomically copy the file;
        # self.input() is a dict keyed by requirement name
        self.client.copy(self.input()["verify_files"].path, self.output().path)
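The archive step rides on luigi's S3 contrib wrappers, so the same server-side copy can be exercised standalone; a sketch, assuming AWS credentials are available to boto3 and with a hypothetical source key:

from luigi.contrib.s3 import S3Client

client = S3Client()  # credentials resolved from the environment / boto3 config
# Server-side copy into the archive prefix; the source key here is hypothetical
client.copy("s3://cscie29vsf/landing/file.gz", "s3://cscie29vsf/archive/file.gz")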
class CleanandProcessData(Task):
    """
    Requires - File from the Data Directory (txt format)
    Run - Clean and Process the Files and convert to a csv/parquet file
    Output - Parquet File with the salted output
    """

    LOCAL_ROOT = "file://data/amplitude/"

    col_names = [
        "mdn", "app", "amplitude_id", "device_id", "user_id",
        "event_time", "client_event_time", "client_upload_time",
        "server_upload_time", "event_id", "session_id", "event_type",
        "amplitude_event_type", "version_name", "os_name", "os_version",
        "device_brand", "device_manufacturer", "device_model",
        "device_carrier", "country", "language", "location_lat",
        "location_lng", "ip_address", "event_properties",
        "user_properties", "region", "city", "dma", "device_family",
        "device_type", "platform", "uuid", "paying", "start_version",
        "user_creation_time", "library", "idfa", "adid",
    ]

    requires = Requires()
    verify_files = Requirement(VerifyFileArrived)
    archive_files = Requirement(ArchiveGzFile)

    output = TargetOutput(target_class=ParquetTarget, file_pattern=LOCAL_ROOT, ext="")

    def run(self):
        dtype_dic = {
            "amplitude_id": "object",
            "os_version": "object",
            "mdn": "object",
        }
        dsk = self.input()["verify_files"].read_dask(
            parse_dates=[
                "event_time",
                "client_event_time",
                "client_upload_time",
                "server_upload_time",
            ],
            dtype=dtype_dic,
            delimiter="|",
            compression="gzip",
            blocksize=None,
        )
        self.output().write_dask(dsk, compression="gzip", compute=True)
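Once the task has run, its output is inspectable outside the scheduler through the same target API the tasks use internally; a sketch of a spot check, assuming the parquet output already exists:

# Hypothetical spot check: reuse the task's own ParquetTarget to read results back
task = CleanandProcessData()
if task.output().exists():
    print(task.output().read_dask().head())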
class NewCleanedData(CleanandProcessData):
    verify_files = Requirement(NewAmplitudeFiles)
    output = TargetOutput(target_class=ParquetTarget, file_pattern=path, ext="")


class NewByMdn(ByMdn):
    LOCAL_ROOT = path + "mdn"

    clean_files = Requirement(NewCleanedData)
    output = TargetOutput(file_pattern=LOCAL_ROOT, target_class=ParquetTarget, ext="")
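Because the two subclasses only override class attributes, the rerouted chain reuses ByMdn.run and CleanandProcessData.run verbatim; a minimal sketch of kicking it off with the local scheduler:

from luigi import build

# Scheduling NewByMdn pulls in NewCleanedData and, transitively, NewAmplitudeFiles
build([NewByMdn()], local_scheduler=True)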