Ejemplo n.º 1
0
def write_data_by_file_extension(data: dd = None, file_path: Path = None):
    """
    Materialise a lazy dataframe and write it in the format implied by the
    extension of ``file_path``.

    The extension is matched case-insensitively: ``.csv`` is written with
    ``to_csv`` and ``.parquet`` with ``to_parquet``, both with
    ``index=False``.

    :param data: object whose ``compute()`` returns a frame exposing
        ``to_csv`` / ``to_parquet`` (presumably a dask dataframe -- confirm
        against callers)
    :param file_path: path of output file
    :return: None
    :raises ValueError: if the extension of ``file_path`` is not supported
    """
    writer_name_by_extension = {
        '.csv': 'to_csv',
        '.parquet': 'to_parquet',
    }
    _, extension = os.path.splitext(file_path)
    writer_name = writer_name_by_extension.get(extension.lower())
    # Reject unsupported targets before paying for compute().
    if writer_name is None:
        raise ValueError(f"File extension {extension} not recognized")
    getattr(data.compute(), writer_name)(file_path, index=False)
Ejemplo n.º 2
0
    def remove_outliers(self, data: dataframe, threshold: float):
        """
        Drop every row in which any continuous column lies more than
        ``threshold`` standard deviations away from that column's mean.

        The input is materialised with ``compute(num_workers=self.workers)``,
        filtered in memory, re-indexed from zero and handed back through
        ``dataframe.from_pandas`` with ``self.workers`` partitions.

        Rows holding a missing value in a continuous column are dropped as
        well, because their z-score is NaN and fails the comparison.

        :param data: lazy dataframe to filter (presumably a dask dataframe --
            confirm against callers)
        :param threshold: largest absolute z-score a row may have
            (inclusive) in each column listed in ``self.cols["CONTINUOUS"]``
        :return: the filtered rows, wrapped by ``dataframe.from_pandas``
        """
        data = data.compute(num_workers=self.workers)
        continuous = data[self.cols["CONTINUOUS"]]

        mean = continuous.mean(axis=0)
        # A constant column has a standard deviation of 0, which would turn
        # every z-score into 0/0 = NaN and silently discard all rows.
        # Dividing by 1 instead leaves those z-scores at 0.
        std_dev = continuous.std(axis=0).replace(0, 1)

        z_scores = (continuous - mean) / std_dev
        # One row-wise mask replaces temporary "z<col>" columns, which could
        # collide with real column names. `.values` makes the mask
        # positional, so a non-unique index cannot cause misalignment.
        within_threshold = z_scores.abs().le(threshold).all(axis=1).values

        return dataframe.from_pandas(
            data[within_threshold].reset_index(drop=True),
            npartitions=self.workers)
Ejemplo n.º 3
0
def null_data_check(data: dd = None) -> bool:
    """
    Report whether the dataframe contains at least one null value.

    The frame is materialised with ``compute()`` before it is inspected.

    :param data: object whose ``compute()`` returns a frame exposing
        ``isnull()`` (presumably a dask dataframe -- confirm against callers)
    :return: True if any cell is null, otherwise False
    """
    frame = data.compute()
    # ndarray.any() yields numpy.bool_; convert it so the annotated return
    # type holds and identity checks such as `is True` work.
    return bool(frame.isnull().values.any())