Example #1
class ETLAnalysis(Task):
    """Created an abstract class for conducting analysis of covid data
    at different levels - by country, by year, by month and by week.  This is a luigi
    task and sub-classed by the different levels of covid data analysis tasks.  The analysis
    abstract class requires Cleanup and the parquet files for performing
    the analysis and display.

    This abstract class has one analysis method to override / implement in their
    respective tasks.

    Each analysis should be a separate Luigi task, which computes its analysis and writes
    the result to parquet. To display to the terminal or answer a quiz, the output should
    be read back from the written parquet file.

    Parameters:
        subset: bool, True to process just one partition, False to process
            the entire dataset, default: True
        analysis_path: str, base directory to store output files

    Output:
        Dataframe stored in compressed Parquet format in
            {task.analysis_path}/{task.sub_dir}/subset-{task.subset}/
    """

    subset = BoolParameter(default=True)
    analysis_path = Parameter(default="./data/covid/")

    requires = Requires()
    input_data = Requirement(CovidDataGlobalCleanupTask)

    # the output references a "sub_dir" parameter, which is expected to be defined
    # in a subclass
    output = TargetOutput(
        "{task.analysis_path}{task.sub_dir}",
        ext="subset-{task.subset}/",
        target_class=ParquetTarget,
        flag="_SUCCESS",
    )

    def perform_analysis(self, df):
        """ this method will be implemented by sub-classes. """
        raise NotImplementedError

    def run(self):
        """
        Uses the data points we need for analysis -> Country_Region and Date
        calls the implemented perform_analysis method to do the calculations
        """
        analysis_dataframe = self.input()["input_data"].read_dask()

        # invoke perform_analysis implemented by the sub-class; the result keeps
        # only the aggregated analysis column and the calculated column
        output_dataframe = self.perform_analysis(analysis_dataframe)
        # write_dask parquet file output with gzip compression.
        self.output().write_dask(output_dataframe, write_index=True, compression="gzip")
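
A minimal sketch of a concrete analysis sub-class, assuming the cleaned data exposes a Country_Region column and a Doses_admin column as in the vaccine cleanup task below; the class name, sub_dir value, and aggregation are illustrative only, not part of the original code.

class ByCountryVaccineSketch(ETLAnalysis):
    """Hypothetical example: latest cumulative doses administered per country."""

    sub_dir = "by_country_sketch/"  # picked up by the inherited TargetOutput pattern
    input_data = Requirement(VaccineDataGlobalCleanupTask)

    def perform_analysis(self, df):
        # df is the cleaned dask dataframe; keep only the grouping column and
        # the aggregated value, as described in run() above.
        return df.groupby("Country_Region")["Doses_admin"].max().to_frame()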
class VaccineDataGlobalCleanupTask(Task):
    """Luigi Task to clean Vaccine time series data. The input is from
    External Task that specifies files in GIT. The cleaning from below code handles
    removing rows with null date and doses administered values are non-zero.
    The default parameters can be overridden for testing and I have overridden for
    all test cases.

    Parameters:
        subset: bool, True to process one partition, False to process the entire dataset
                    default: True
        data_root: str, base directory to store cleaned output files

    Output:
        Dataframe stored in compressed Parquet format

    """

    # default parameters
    subset = BoolParameter(default=True)
    data_root = Parameter(default="./data/vaccine/")

    # External task completion is required, to work with GIT / CSVTarget
    requires = Requires()
    input_data = Requirement(VaccineDataGlobalTask)

    # TargetOutput returns ParquetTarget
    output = TargetOutput(
        "{task.data_root}",
        ext="subset-{task.subset}/",
        target_class=ParquetTarget,
        flag="_SUCCESS",
        storage_options=None,
    )

    def run(self):
        """
        Cleans the vaccine data from the Task input and stores the dataframe in Parquet format.

        :return:
            File content is stored in the data directory
        """

        # The columns ["Doses_admin", "People_partially_vaccinated", "People_fully_vaccinated"]
        # are all integers. However, given there are missing values, you must first
        # read them as floats, fill nan's as 0, then convert to int.
        # You can provide a dict of {col: dtype} when providing the dtype arg in places like
        # read_parquet and astype.
        number_columns = [
            "Doses_admin",
            "People_partially_vaccinated",
            "People_fully_vaccinated",
        ]
        # Ensure that the date column is parsed as a pandas datetime using parse_dates
        vdg_dask = self.input()["input_data"].read_dask(
            parse_dates=["Date"], dtype={c: "float"
                                         for c in number_columns})

        if self.subset:
            vdg_dask = vdg_dask.get_partition(0)

        # perform data cleaning
        # Remove any blank countries
        vdg_dask = vdg_dask[~vdg_dask.Country_Region.isnull()]
        # Filter out invalid dates
        vdg_dask = vdg_dask[~vdg_dask.Date.isnull()]

        # You should set the index to Country_Region and ensure the output reads back with meaningful divisions
        # vdg_dask = vdg_dask.set_index("Country_Region")
        vdg_dask[number_columns] = vdg_dask[number_columns].fillna(0).astype(int)

        # write_dask parquet file output with gzip compression.
        vdg_output = vdg_dask
        self.output().write_dask(vdg_output, compression="gzip")
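
Once the task has run, the cleaned output can be read back through the same target, mirroring how ETLAnalysis.run reads its input above. A short usage sketch (not part of the original code):

task = VaccineDataGlobalCleanupTask(subset=True)
cleaned = task.output().read_dask()  # dask dataframe with the filled integer columns
print(cleaned.head())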
class ByCountryMonthVaccine(ETLAnalysisPrint):
    """
    Defines the requirement - ByCountryMonthVaccineAnalysis - and prints the results.
    """

    input_data = Requirement(ByCountryMonthVaccineAnalysis)
Example #4
class CovidDataGlobalCleanupTask(Task):
    """Luigi Task to clean Covid time series data. The input is from
    External Task that specifies files in GIT. The cleaning from below code handles
    removing rows with null date and confirmed cases values are non-zero.
    The default parameters can be overridden for testing and I have overridden for
    all test cases.

    Parameters:
        subset: bool, True to process one partition, False to process the entire dataset
                    default: True
        data_root: str, base directory to store cleaned output files

    Output:
        Dataframe stored in compressed Parquet format

    """

    # default parameters
    subset = BoolParameter(default=True)
    data_root = Parameter(default="./data/covid/")

    # External task completion is required, to work with GIT / CSVTarget
    requires = Requires()
    input_data = Requirement(CovidDataGlobalTask)

    # TargetOutput returns ParquetTarget
    output = TargetOutput(
        "{task.data_root}",
        ext="subset-{task.subset}/",
        target_class=ParquetTarget,
        flag="_SUCCESS",
        storage_options=None,
    )

    def run(self):
        """
        Cleans the covid data from the Task input and stores the dataframe in Parquet format.

        :return:
            File content is stored in the data directory
        """

        # The 460-plus date columns contain confirmed covid case counts and are
        # all integers. However, given there are missing values, you must first
        # read them as floats, fill NaNs with 0, then convert to int.
        # You can provide a dict of {col: dtype} when providing the dtype arg in places like
        # read_parquet and astype.
        est = timezone("EST")
        cur_date = datetime.datetime.now(est)
        logging.info(cur_date)
        number_of_days = (
            cur_date - datetime.datetime.strptime("1/22/20", "%m/%d/%y").astimezone(est)
        ).days
        logging.info(number_of_days)
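        # Build one date-formatted column name per day back to 1/22/20.
        # Note: the %-m/%-d strftime codes used below are a glibc extension
        # and are not available on Windows.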
        number_columns = list()
        for days in range(1, number_of_days):
            number_columns.append(
                (datetime.datetime.now(est) - datetime.timedelta(days=days)).strftime(
                    "%-m/%-d/%y"
                )
            )
        logging.info(number_columns)
        # Ensure that the date column is parsed as a pandas datetime using parse_dates
        cdg_dask = self.input()["input_data"].read_dask(
            dtype={c: "float" for c in number_columns}
        )

        if self.subset:
            cdg_dask = cdg_dask.get_partition(0)

        # perform data cleaning
        # Remove rows where the country column (the second column) is blank
        cdg_dask = cdg_dask[~cdg_dask[cdg_dask.columns[1]].isnull()]

        # You should set the index to Country_Region and ensure the output reads back with meaningful divisions
        # vdg_dask = vdg_dask.set_index("Country_Region")
        cdg_dask[number_columns] = cdg_dask[number_columns].fillna(0).astype(int)

        # write_dask parquet file output with gzip compression.
        cdg_output = cdg_dask
        self.output().write_dask(cdg_output, compression="gzip")
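
The read-as-float, fill, then cast pattern described in the comments above can be demonstrated on a tiny throwaway dask dataframe (the column name is made up for the illustration):

import pandas as pd
import dask.dataframe as dd

pdf = pd.DataFrame({"1/23/20": [1.0, None, 3.0]})      # missing value is read as a float NaN
ddf = dd.from_pandas(pdf, npartitions=1)
ddf["1/23/20"] = ddf["1/23/20"].fillna(0).astype(int)  # fill NaN, then cast to int
print(ddf.compute())                                   # 1, 0, 3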
Example #5
class ByCountryCovid(ETLAnalysisPrint):
    """
    Defines the requirement - ByCountryCovidAnalysis - and prints the results.
    """

    input_data = Requirement(ByCountryCovidAnalysis)
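
An end-to-end run might look like the following sketch, assuming ETLAnalysisPrint reads the analysis parquet back and prints it, as the ETLAnalysis docstring describes:

import luigi

if __name__ == "__main__":
    # Pulls the whole chain: external data -> cleanup -> analysis -> print.
    luigi.build([ByCountryCovid()], local_scheduler=True)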