class ETLAnalysis(Task):
    """Abstract Luigi task for analyzing Covid data at different levels:
    by country, by year, by month, and by week. The concrete analysis tasks
    subclass this task.

    The analysis requires the cleanup task and its parquet output in order to
    perform the analysis and display the results. Subclasses override the
    single ``perform_analysis`` method. Each analysis is a separate Luigi
    task that computes its result and writes it to parquet; to display the
    result in the terminal or answer a quiz, the output is read back from
    the written parquet file.

    Parameters:
        subset: bool, True to process just one partition, False to process
            the entire dataset, default: True
        analysis_path: str, base directory to store output files

    Output:
        Dataframe stored in compressed Parquet format in
        {task.analysis_path}/{task.sub_dir}/subset-{task.subset}/
    """

    subset = BoolParameter(default=True)
    analysis_path = Parameter(default="./data/covid/")

    requires = Requires()
    input_data = Requirement(CovidDataGlobalCleanupTask)

    # The output references a "sub_dir" parameter, which is expected to be
    # defined in a subclass.
    output = TargetOutput(
        "{task.analysis_path}{task.sub_dir}",
        ext="subset-{task.subset}/",
        target_class=ParquetTarget,
        flag="_SUCCESS",
    )

    def perform_analysis(self, df):
        """Implemented by subclasses."""
        raise NotImplementedError

    def run(self):
        """Reads the data points needed for the analysis (Country_Region and
        Date) and calls the subclass's ``perform_analysis`` to do the
        calculations.
        """
        analysis_dataframe = self.input()["input_data"].read_dask()

        # Invoke perform_analysis from the implementing subclass; it returns
        # only the aggregated analysis column and the calculated column.
        output_dataframe = self.perform_analysis(analysis_dataframe)

        # Write the parquet output with gzip compression.
        self.output().write_dask(
            output_dataframe, write_index=True, compression="gzip"
        )
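
# A minimal sketch of a concrete subclass, assuming a hypothetical sub_dir
# value and column names; it only illustrates the override pattern, not the
# actual analysis tasks (e.g. ByCountryCovidAnalysis), which may aggregate
# differently.
class ExampleByCountryAnalysis(ETLAnalysis):
    sub_dir = "by_country/"

    def perform_analysis(self, df):
        # Group the cleaned data by country and sum a (hypothetical)
        # "Confirmed" column, returning only the aggregated result.
        return df.groupby("Country_Region")[["Confirmed"]].sum()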
class VaccineDataGlobalCleanupTask(Task):
    """Luigi task to clean the vaccine time series data.

    The input comes from an external task that points at the files in git.
    The cleaning below removes rows with a missing country or date and fills
    missing dose counts with zero before converting them to integers. The
    default parameters can be overridden for testing, and are overridden in
    all of the test cases.

    Parameters:
        subset: bool, True to process one partition, False to process the
            entire dataset, default: True
        data_root: str, base directory to store cleaned output files

    Output:
        Dataframe stored in compressed Parquet format
    """

    # default parameters
    subset = BoolParameter(default=True)
    data_root = Parameter(default="./data/vaccine/")

    # External task completion is required to work with git / CSVTarget.
    requires = Requires()
    input_data = Requirement(VaccineDataGlobalTask)

    # TargetOutput returns a ParquetTarget.
    output = TargetOutput(
        "{task.data_root}",
        ext="subset-{task.subset}/",
        target_class=ParquetTarget,
        flag="_SUCCESS",
        storage_options=None,
    )

    def run(self):
        """Clean the vaccine data from the task input and store the dataframe
        in Parquet format in the data directory.
        """
        # The columns Doses_admin, People_partially_vaccinated, and
        # People_fully_vaccinated are all integers. Because they contain
        # missing values, they are first read as floats, NaNs are filled
        # with 0, and the columns are then converted to int. A dict of
        # {col: dtype} can be passed as the dtype argument in places like
        # read_dask and astype.
        number_columns = [
            "Doses_admin",
            "People_partially_vaccinated",
            "People_fully_vaccinated",
        ]

        # Parse the Date column as a pandas datetime using parse_dates.
        vdg_dask = self.input()["input_data"].read_dask(
            parse_dates=["Date"], dtype={c: "float" for c in number_columns}
        )

        if self.subset:
            vdg_dask = vdg_dask.get_partition(0)

        # Perform data cleaning:
        # remove rows with a blank country
        vdg_dask = vdg_dask[~vdg_dask.Country_Region.isnull()]
        # filter out rows with a missing date
        vdg_dask = vdg_dask[~vdg_dask.Date.isnull()]

        # The index could be set to Country_Region so the output reads back
        # with meaningful divisions:
        # vdg_dask = vdg_dask.set_index("Country_Region")

        vdg_dask[number_columns] = vdg_dask[number_columns].fillna(0).astype(int)

        # Write the parquet output with gzip compression.
        vdg_output = vdg_dask
        self.output().write_dask(vdg_output, compression="gzip")
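
# A toy sketch of the float -> fillna(0) -> int conversion used above, shown
# on a small in-memory dask DataFrame; the column name matches the vaccine
# data, the values are made up for illustration.
import pandas as pd
import dask.dataframe as dd

_pdf = pd.DataFrame(
    {"Country_Region": ["A", "B"], "Doses_admin": [1.0, float("nan")]}
)
_ddf = dd.from_pandas(_pdf, npartitions=1)
_ddf["Doses_admin"] = _ddf["Doses_admin"].fillna(0).astype(int)
# print(_ddf.compute().dtypes)  # Doses_admin is now int64, NaN became 0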
class ByCountryMonthVaccine(ETLAnalysisPrint):
    """Defines the requirement, ByCountryMonthVaccineAnalysis, and prints its
    results.
    """

    input_data = Requirement(ByCountryMonthVaccineAnalysis)
class CovidDataGlobalCleanupTask(Task):
    """Luigi task to clean the Covid time series data.

    The input comes from an external task that points at the files in git.
    The cleaning below removes rows with a missing country and fills missing
    confirmed-case counts with zero before converting them to integers. The
    default parameters can be overridden for testing, and are overridden in
    all of the test cases.

    Parameters:
        subset: bool, True to process one partition, False to process the
            entire dataset, default: True
        data_root: str, base directory to store cleaned output files

    Output:
        Dataframe stored in compressed Parquet format
    """

    # default parameters
    subset = BoolParameter(default=True)
    data_root = Parameter(default="./data/covid/")

    # External task completion is required to work with git / CSVTarget.
    requires = Requires()
    input_data = Requirement(CovidDataGlobalTask)

    # TargetOutput returns a ParquetTarget.
    output = TargetOutput(
        "{task.data_root}",
        ext="subset-{task.subset}/",
        target_class=ParquetTarget,
        flag="_SUCCESS",
        storage_options=None,
    )

    def run(self):
        """Clean the Covid data from the task input and store the dataframe
        in Parquet format in the data directory.
        """
        # The 460-plus daily date columns hold confirmed case counts and are
        # all integers. Because they contain missing values, they are first
        # read as floats, NaNs are filled with 0, and the columns are then
        # converted to int. A dict of {col: dtype} can be passed as the dtype
        # argument in places like read_dask and astype.
        est = timezone("EST")
        cur_date = datetime.datetime.now(est)
        logging.info(cur_date)

        number_of_days = (
            cur_date
            - datetime.datetime.strptime("1/22/20", "%m/%d/%y").astimezone(est)
        ).days
        logging.info(number_of_days)

        # Build the list of date column names ("1/22/20" style), going back
        # from today to the start of the series.
        number_columns = list()
        for days in range(1, number_of_days):
            number_columns.append(
                (datetime.datetime.now(est) - datetime.timedelta(days=days)).strftime(
                    "%-m/%-d/%y"
                )
            )
        logging.info(number_columns)

        cdg_dask = self.input()["input_data"].read_dask(
            dtype={c: "float" for c in number_columns}
        )

        if self.subset:
            cdg_dask = cdg_dask.get_partition(0)

        # Perform data cleaning:
        # remove rows with a blank country (the second column)
        cdg_dask = cdg_dask[~cdg_dask[cdg_dask.columns[1]].isnull()]

        # The index could be set to Country_Region so the output reads back
        # with meaningful divisions:
        # cdg_dask = cdg_dask.set_index("Country_Region")

        cdg_dask[number_columns] = cdg_dask[number_columns].fillna(0).astype(int)

        # Write the parquet output with gzip compression.
        cdg_output = cdg_dask
        self.output().write_dask(cdg_output, compression="gzip")
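
# A minimal sketch of building the wide date-column names without the
# platform-specific "%-m/%-d/%y" strftime directive (a glibc extension that
# is not available on Windows). It mirrors the "1/22/20" header format used
# above; this is an illustration, not the task's actual implementation.
import pandas as pd

def jhu_date_columns(start="2020-01-22", end=None):
    # One column name per day from the start of the series to "end"
    # (defaults to now), formatted without zero-padding.
    end = end or pd.Timestamp.now()
    dates = pd.date_range(start=start, end=end, freq="D")
    return [f"{d.month}/{d.day}/{d.year % 100}" for d in dates]

# e.g. jhu_date_columns()[:3] -> ['1/22/20', '1/23/20', '1/24/20']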
class ByCountryCovid(ETLAnalysisPrint):
    """Defines the requirement, ByCountryCovidAnalysis, and prints its
    results.
    """

    input_data = Requirement(ByCountryCovidAnalysis)
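
# A minimal end-to-end usage sketch, assuming these tasks live in one module,
# default parameters are acceptable, and the print tasks emit their results
# when run under the local scheduler.
import luigi

if __name__ == "__main__":
    luigi.build(
        [ByCountryCovid(), ByCountryMonthVaccine()],
        local_scheduler=True,
    )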