Example #1
    def gbt(df, columns, input_col, **kwargs):
        """
        Runs a gradient boosting tree classifier on the input DataFrame.
        :param df: PySpark dataframe to analyze.
        :param columns: List of columns to select for prediction.
        :param input_col: Column to predict.
        :return: Tuple of the DataFrame with predictions and the fitted gradient boosting tree model.
        """

        if not is_dataframe(df):
            raise TypeError("Spark dataframe expected")

        columns = parse_columns(df, columns)

        if not is_str(input_col):
            raise TypeError("Error, input column must be a string")

        data = df.select(columns)
        feats = data.columns
        feats.remove(input_col)

        df = string_to_index(df, input_cols=input_col)
        df = vector_assembler(df, input_cols=feats, output_col="features")

        model = GBTClassifier(**kwargs)

        df = df.cols.rename(name_col(input_col, "index_to_string"), "label")

        gbt_model = model.fit(df)
        df_model = gbt_model.transform(df)
        return df_model, gbt_model
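
For context, here is a minimal sketch of the same pipeline written directly against the PySpark ML API, assuming a running SparkSession; the data and column names are illustrative and not part of the original helper:

from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import GBTClassifier

spark = SparkSession.builder.getOrCreate()

# Illustrative data: two numeric features and a string label column.
df = spark.createDataFrame(
    [(1.0, 2.0, "yes"), (3.0, 4.0, "no"), (5.0, 6.0, "yes")],
    ["feat_a", "feat_b", "label_col"],
)

# Index the string label into a numeric "label" column.
indexed = StringIndexer(inputCol="label_col", outputCol="label").fit(df).transform(df)

# Assemble the feature columns into a single "features" vector column.
assembled = VectorAssembler(inputCols=["feat_a", "feat_b"], outputCol="features").transform(indexed)

# Fit the classifier and add a "prediction" column.
model = GBTClassifier(labelCol="label", featuresCol="features", maxIter=10).fit(assembled)
predictions = model.transform(assembled)
predictions.select("features", "label", "prediction").show()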
Example #2
    def decision_tree(df, columns, input_col, **kargs):
        """
        Runs a decision tree classifier on the input DataFrame.
        :param df: PySpark dataframe to analyze.
        :param columns: List of columns to select for prediction.
        :param input_col: Column to predict.
        :return: Tuple of the DataFrame with predictions and the fitted decision tree model.
        """

        if not is_dataframe(df):
            raise TypeError("Spark dataframe expected")

        columns = parse_columns(df, columns)

        if not is_str(input_col):
            raise TypeError("Error, input column must be a string")

        data = df.select(columns)
        feats = data.columns
        feats.remove(input_col)

        df = string_to_index(df, input_cols=input_col)
        df = vector_assembler(df, input_cols=feats)

        model = DecisionTreeClassifier(**kargs)

        df = df.cols.rename(name_col(input_col, "index"), "label")

        dt_model = model.fit(df)
        df_model = dt_model.transform(df)
        return df_model, dt_model
Example #3
def normalizer(df, input_cols, p=2.0):
    """
    Transforms a dataset of Vector rows, normalizing each Vector to have unit norm. It takes a parameter p, which
    specifies the p-norm used for normalization (p=2 by default).
    :param df: Dataframe to be transformed.
    :param input_cols: Columns to be normalized.
    :param p: p-norm used for normalization.
    :return: Dataframe with normalized columns.
    """

    # The columns argument must be a string or a list of strings
    if not is_(input_cols, (str, list)):
        RaiseIt.type_error(input_cols, [str, list])

    if is_str(input_cols):
        input_cols = [input_cols]

    df = df.cols.cast(input_cols, "vector")

    # TODO https://developer.ibm.com/code/2018/04/10/improve-performance-ml-pipelines-wide-dataframes-apache-spark-2-3/
    normal = [
        Normalizer(inputCol=col_name,
                   outputCol=name_col(col_name, "normalized"),
                   p=p) for col_name in list(set(input_cols))
    ]

    pipeline = Pipeline(stages=normal)

    df = pipeline.fit(df).transform(df)

    return df
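
As a point of reference, a minimal sketch of the underlying PySpark API that this helper wraps, assuming a running SparkSession. The helper builds one Normalizer per input column, while this illustrative example normalizes a single assembled vector column:

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import Normalizer, VectorAssembler

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1.0, 2.0), (3.0, 4.0)], ["x", "y"])

# The helper first casts each column to a vector; here a VectorAssembler
# plays that role so the example runs on plain numeric columns.
assembler = VectorAssembler(inputCols=["x", "y"], outputCol="xy")
normalizer = Normalizer(inputCol="xy", outputCol="xy_normalized", p=2.0)

pipeline = Pipeline(stages=[assembler, normalizer])
pipeline.fit(df).transform(df).show(truncate=False)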
Example #4
def print_json(value):
    """
    Print a human-readable JSON
    :param value: JSON string or object to be printed
    :return: None
    """
    pp = pprint.PrettyPrinter(indent=2)
    if is_str(value):
        value = value.replace("'", "\"")
        value = json.loads(value)

    pp.pprint(value)
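
A short, stdlib-only illustration of what the function does with a single-quoted JSON-like string; note that the naive quote replacement is fragile for values that themselves contain quotes:

import json
import pprint

value = "{'name': 'optimus', 'cols': ['a', 'b']}"

# Same steps as print_json: swap single quotes for double quotes,
# parse the string, then pretty-print the resulting object.
parsed = json.loads(value.replace("'", '"'))
pprint.PrettyPrinter(indent=2).pprint(parsed)
# {'cols': ['a', 'b'], 'name': 'optimus'}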
Example #5
def get_output_cols(input_cols, output_cols):
    """
    Construct output column names
    :param input_cols: Input column name(s).
    :param output_cols: Output column name(s), or a suffix to append when several input columns are given.
    :return: List of output column names.
    """

    if is_list(input_cols) and is_list(output_cols):
        if len(input_cols) != len(output_cols):
            RaiseIt.length_error(input_cols, output_cols)
    elif is_list(input_cols) and is_str(output_cols):
        if len(input_cols) > 1:
            output_cols = list([i + output_cols for i in input_cols])
        else:
            output_cols = val_to_list(output_cols)
    elif is_str(input_cols) and is_str(output_cols):
        output_cols = val_to_list(output_cols)
    elif output_cols is None:
        output_cols = input_cols

    return output_cols
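
To make the naming rules concrete, here is a standalone re-statement of the same logic with a few sample calls; it is an illustration only and does not use the library's helpers:

def sketch_output_cols(input_cols, output_cols):
    # List of inputs plus a single string: treat the string as a suffix.
    if isinstance(input_cols, list) and isinstance(output_cols, str):
        if len(input_cols) > 1:
            return [c + output_cols for c in input_cols]
        return [output_cols]
    # Single input column plus a single output column.
    if isinstance(input_cols, str) and isinstance(output_cols, str):
        return [output_cols]
    # No output names given: reuse the input names.
    if output_cols is None:
        return input_cols
    return output_cols

print(sketch_output_cols(["a", "b"], "_norm"))  # ['a_norm', 'b_norm']
print(sketch_output_cols("a", "a_out"))         # ['a_out']
print(sketch_output_cols(["a", "b"], None))     # ['a', 'b']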
Example #6
def set_name(self, value=None):
    """
    Create a temp view for a DataFrame; the name is also used in the JSON profiling output
    :param self:
    :param value: Name for the temp view.
    :return:
    """
    if not is_str(value):
        RaiseIt.type_error(value, ["string"])

    if len(value) == 0:
        RaiseIt.value_error(value, ["> 0"])

    self._name = value

    self.createOrReplaceTempView(value)
Example #7
def table_name(self, name=None):
    """
    Create a temp view for a DataFrame
    :param self:
    :param name: Name for the temp view.
    :return:
    """
    if not is_str(name):
        RaiseIt.type_error(name, ["string"])

    if len(name) == 0:
        RaiseIt.value_error(name, ["> 0"])

    self.createOrReplaceTempView(name)
    return self
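
For reference, this is the plain PySpark pattern the method wraps: registering a DataFrame under a name so it can be queried with SQL. The sketch assumes a running SparkSession; the data and view name are illustrative:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "value"])

# Register the DataFrame as a temp view and query it by name.
df.createOrReplaceTempView("my_table")
spark.sql("SELECT id, value FROM my_table WHERE id > 1").show()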
Example #8
def infer(value):
    """
    Infer a Spark data type from a value
    :param value: value to be inferred
    :return: Spark data type
    """
    result = None
    if value is None:
        result = "null"

    elif is_bool(value):
        result = "bool"

    elif isint(value):
        result = "int"

    elif isfloat(value):
        result = "float"

    elif is_list(value):
        result = ArrayType(infer(value[0]))

    elif is_datetime(value):
        result = "datetime"

    elif is_date(value):
        result = "date"

    elif is_binary(value):
        result = "binary"

    elif is_str(value):
        if str_to_boolean(value):
            result = "bool"
        elif str_to_date(value):
            result = "string"  # date
        elif str_to_array(value):
            result = "string"  # array
        else:
            result = "string"

    return parse_spark_class_dtypes(result)
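
Below is a simplified, standalone sketch of the same idea using plain isinstance checks mapped directly to PySpark type objects; the mapping is an assumption for illustration and skips the string-parsing branches of the original:

import datetime
from pyspark.sql.types import (ArrayType, BooleanType, DateType, DoubleType,
                               IntegerType, NullType, StringType, TimestampType)

def sketch_infer(value):
    if value is None:
        return NullType()
    if isinstance(value, bool):  # check bool before int: bool is a subclass of int
        return BooleanType()
    if isinstance(value, int):
        return IntegerType()
    if isinstance(value, float):
        return DoubleType()
    if isinstance(value, list):
        return ArrayType(sketch_infer(value[0]))
    if isinstance(value, datetime.datetime):  # datetime before date: datetime is a subclass of date
        return TimestampType()
    if isinstance(value, datetime.date):
        return DateType()
    return StringType()

print(sketch_infer(3))           # IntegerType
print(sketch_infer([1.5, 2.5]))  # ArrayType of DoubleType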
Example #9
def parse_columns(df,
                  cols_args,
                  get_args=False,
                  is_regex=None,
                  filter_by_column_dtypes=None,
                  accepts_missing_cols=False,
                  invert=False):
    """
    Return a list of columns and check that the columns exist in the dataframe.
    Accepts '*' as a parameter, in which case a list of all columns in the dataframe is returned.
    Also accepts a regex.
    If a list of tuples is passed, the first element of each tuple is taken as the column name and the remaining
    elements as params. These params can be used to create custom transformation functions. You can find an
    example in cols().cast()
    :param df: Dataframe in which the columns are going to be checked
    :param cols_args: Accepts '*' as a param to return all the columns in the dataframe
    :param get_args: If True, also return the params extracted from tuples
    :param is_regex: Use True if cols_args is a regex
    :param filter_by_column_dtypes: Data type(s) used to filter the resulting column list
    :param accepts_missing_cols: If True, do not check whether the columns exist in the dataframe
    :param invert: Invert the final selection, for example to select all non-integer columns

    :return: A list of column names
    """

    attrs = None

    # If the columns value is '*', get all of the dataframe's columns
    if is_regex is True:
        r = re.compile(cols_args[0])
        cols = list(filter(r.match, df.columns))

    elif cols_args == "*" or cols_args is None:
        cols = df.columns

    # In case we have a list of tuples, the first element of each tuple is taken as the column name
    # and the rest as params. The params can be used in a custom function, for example:
    # def func(attrs):  # attrs receives (1, 2) and (3, 4)
    #     return attrs[0] + 1
    # df.cols().apply([('col_1', 1, 2), ('col_2', 3, 4)], func)

    # Verify if we have a list of tuples
    elif is_tuple(cols_args) or is_list_of_tuples(cols_args):
        cols_args = val_to_list(cols_args)
        # Extract the column name (first element) and the params (rest)
        cols = [i[0] for i in cols_args]
        attrs = [i[1:] for i in cols_args]
    else:
        # if not a list convert to list
        cols = val_to_list(cols_args)
        # Get col name from index
        cols = [c if is_str(c) else df.columns[c] for c in cols]

    # Check for missing columns
    if accepts_missing_cols is False:
        check_for_missing_columns(df, cols)

    # Filter by column data type
    if filter_by_column_dtypes is not None:
        filter_by_column_dtypes = val_to_list(filter_by_column_dtypes)

    columns_residual = None

    # If necessary filter the columns by data type
    if filter_by_column_dtypes:
        # Get columns for every data type

        columns_filtered = filter_col_name_by_dtypes(df,
                                                     filter_by_column_dtypes)

        # Intersect the columns passed to the function with the columns that match the requested data types
        final_columns = list(OrderedSet(cols).intersection(columns_filtered))

        # These columns did not match the requested data types
        columns_residual = list(
            OrderedSet(cols) - OrderedSet(columns_filtered))
    else:
        final_columns = cols

    # Return cols, or cols and params
    cols_params = []

    if invert:
        final_columns = list(OrderedSet(cols) - OrderedSet(final_columns))

    if get_args is True:
        cols_params = final_columns, attrs
    elif get_args is False:
        cols_params = final_columns
    else:
        RaiseIt.value_error(get_args, ["True", "False"])

    if columns_residual:
        logger.print("%s %s %s", ",".join(escape_columns(columns_residual)),
                     "column(s) were not processed because they are not of type",
                     ",".join(filter_by_column_dtypes))

    return cols_params
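
The tuple-splitting branch is the least obvious part, so here is a standalone illustration of how a list of tuples is separated into column names and per-column params (first element = name, rest = params); the values are illustrative:

cols_args = [("col_1", 1, 2), ("col_2", 3, 4)]

# First element of each tuple is the column name, the rest are params.
cols = [t[0] for t in cols_args]
attrs = [t[1:] for t in cols_args]

print(cols)   # ['col_1', 'col_2']
print(attrs)  # [(1, 2), (3, 4)]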
Example #10
    def create(self,
               obj,
               method,
               suffix=None,
               output="df",
               additional_method=None,
               *args,
               **kwargs):
        """
        This is a helper function that outputs Python tests for Spark DataFrames.
        :param obj: Object to be tested
        :param method: Method to be tested
        :param suffix: The test name will be created using the method param; suffix adds a string in case you want
        to customize the test name.
        :param output: Can be 'df', 'json' or 'dict'
        :param additional_method: Additional method to call on the result
        :param args: Arguments to be used in the method
        :param kwargs: Keyword arguments to be used in the method
        :return:
        """

        buffer = []

        def add_buffer(value):
            buffer.append("\t" + value)

        # Create name
        name = []

        if method is not None:
            name.append(method.replace(".", "_"))

        if additional_method is not None:
            name.append(additional_method)

        if suffix is not None:
            name.append(suffix)

        test_name = "_".join(name)

        func_test_name = "test_" + test_name + "()"

        print("Creating {test} test function...".format(test=func_test_name))
        logger.print(func_test_name)

        if output != "dict":
            add_buffer("@staticmethod\n")
            func_test_name = "test_" + test_name + "()"
        else:
            func_test_name = "test_" + test_name + "(self)"

        filename = test_name + ".test"

        add_buffer("def " + func_test_name + ":\n")

        source = "source_df"
        if obj is None:
            # Use the main df
            df_func = self.df
        elif isinstance(obj, pyspark.sql.dataframe.DataFrame):

            source_df = "\tsource_df=op.create.df(" + obj.export() + ")\n"
            df_func = obj
            add_buffer(source_df)
        else:
            source = get_var_name(obj)
            df_func = obj

        # Process simple arguments
        _args = []
        for v in args:
            if is_str(v):
                _args.append("'" + v + "'")
            elif is_numeric(v):
                _args.append(str(v))

            elif is_list(v):
                if is_list_of_strings(v):
                    lst = ["'" + x + "'" for x in v]
                elif is_list_of_numeric(v):
                    lst = [str(x) for x in v]
                elif is_list_of_tuples(v):
                    lst = [str(x) for x in v]

                _args.append('[' + ','.join(lst) + ']')
            elif is_function(v):
                _args.append(v.__qualname__)

            else:
                _args.append(get_var_name(v))

            # else:
            #     import marshal
            #     code_string = marshal.dumps(v.__code__)
            #     add_buffer("\tfunction = '" + code_string + "'\n")
            # import marshal, types
            #
            # code = marshal.loads(code_string)
            # func = types.FunctionType(code, globals(), "some_func_name")

        _args = ','.join(_args)
        _kwargs = []

        # print(_args)
        # Process keywords arguments
        for k, v in kwargs.items():
            if is_str(v):
                v = "'" + v + "'"
            _kwargs.append(k + "=" + str(v))

        # Separator if we have positional and keyword arguments
        separator = ""
        if (not is_list_empty(args)) & (not is_list_empty(kwargs)):
            separator = ","

        if method is None:
            add_buffer("\tactual_df = source_df\n")
        else:
            am = ""
            if additional_method:
                am = "." + additional_method + "()"

            add_buffer("\tactual_df =" + source + "." + method + "(" + _args +
                       separator + ','.join(_kwargs) + ")" + am + "\n")

        # Apply function to the dataframe
        if method is None:
            df_result = self.op.create.df(*args, **kwargs)
        else:
            # Here we construct the method to be applied to the source object
            for f in method.split("."):
                df_func = getattr(df_func, f)

            df_result = df_func(*args, **kwargs)

        # Additional Methods
        if additional_method is not None:
            df_result = getattr(df_result, additional_method)()

        if output == "df":

            df_result.table()
            expected = "\texpected_df = op.create.df(" + df_result.export() + ")\n"
        elif output == "json":
            print(df_result)

            if is_str(df_result):
                df_result = "'" + df_result + "'"
            else:
                df_result = str(df_result)
            add_buffer("\tactual_df =json_enconding(actual_df)\n")

            expected = "\texpected_value =json_enconding(" + df_result + ")\n"
        elif output == "dict":
            print(df_result)

            expected = "\texpected_value =" + df_result + "\n"
        else:
            expected = "\t\n"

        add_buffer(expected)

        # Output
        if output == "df":
            add_buffer(
                "\tassert (expected_df.collect() == actual_df.collect())\n")
        elif output == "json":
            add_buffer("\tassert(expected_value == actual_df)\n")
        elif output == "dict":
            add_buffer(
                "\tself.assertDictEqual(deep_sort(expected_value),  deep_sort(actual_df))\n"
            )

        filename = self.path + "//" + filename
        if not os.path.exists(os.path.dirname(filename)):
            try:
                os.makedirs(os.path.dirname(filename))
            except OSError as exc:  # Guard against race condition
                if exc.errno != errno.EEXIST:
                    raise

        # Write the generated test to disk
        with open(filename, 'w', encoding='utf-8') as test_file:
            for b in buffer:
                test_file.write(b)
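
To show roughly what this helper emits, here is a standalone sketch of how a test function is assembled as text; the method call and argument names inside the strings are illustrative, not taken from a real run:

# Mimics the buffer/add_buffer pattern used by create() above.
buffer = []

def add_buffer(value):
    buffer.append("\t" + value)

test_name = "cols_upper"
add_buffer("@staticmethod\n")
add_buffer("def test_" + test_name + "():\n")
add_buffer("\tactual_df = source_df.cols.upper('name')\n")
add_buffer("\texpected_df = op.create.df(...)\n")
add_buffer("\tassert (expected_df.collect() == actual_df.collect())\n")

# Print the generated test function instead of writing it to a file.
print("".join(buffer))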