Ejemplo n.º 1
0
    def rename(columns_old_new=None, func=None):
        """
        Rename one or more columns of the dataFrame.
        :param columns_old_new: List of tuples. Each tuple has the following form: (oldColumnName, newColumnName).
                                oldColumnName may be a column name (string) or a position (integer) in the schema.
        :param func: can be lower, upper or any string transformation function. When it is a function it is
                     applied to every column name and columns_old_new is not used.
        :return: dataFrame with the renamed columns; the dataFrame itself when neither argument applies.
        """

        # A transformation function takes precedence over the explicit mapping
        if is_function(func):
            aliased = [F.col(name).alias(func(name)) for name in self.columns]
            return self.select(aliased)

        # Nothing usable was supplied: hand back the dataFrame untouched
        if not is_list_of_tuples(columns_old_new):
            return self

        # Check that the 1st element of each tuple refers to a valid column
        validate_columns_names(self, columns_old_new)

        renamed = self
        for pair in columns_old_new:
            source = pair[0]
            if is_str(source):
                renamed = renamed.withColumnRenamed(source, pair[1])
            elif is_int(source):
                # Positions are always resolved against the original schema
                renamed = renamed.withColumnRenamed(self.schema.names[source], pair[1])

        return renamed
Ejemplo n.º 2
0
    def years_between(col_name, new_col, date_format):
        """
        Compute the number of years between a date column and the current date (e.g. an age from a birth date).
        :param  col_name: Name of the column holding the dates (e.g. birth dates).
        :param  new_col: Name of the new column that receives the resulting number of years.
        :param  date_format: String date format used to parse the values of the column provided.
        :return: Result of apply_expr with new_col cast to "float".
        """
        # Asserting that the column is in the dataFrame:
        validate_columns_names(self, col_name)

        # Intermediate date format the parsed timestamp is rendered with
        format_dt = "yyyy-MM-dd"  # Some SimpleDateFormat string

        def _years_between(new_col, attr):
            _date_format = attr[0]
            _col_name = attr[1]

            parsed = F.unix_timestamp(_col_name, _date_format).cast("timestamp")
            months = F.months_between(F.date_format(parsed, format_dt), F.current_date())

            # Round numerically instead of using F.format_number: format_number returns a
            # string with thousands separators (e.g. "1,234.5678"), which the cast to
            # float below would turn into null for values >= 1000.
            return F.round(F.abs(months / 12), 4).alias(new_col)

        return apply_expr(new_col, _years_between,
                          [date_format, col_name]).cols.cast(new_col, "float")
Ejemplo n.º 3
0
 def drop_by_dtypes(col_name, data_type=None):
     """
     Drop the rows selected by a cell data type filter on a column
     :param col_name: Column in which the filter is going to be applied
     :param data_type: filter by string, integer, float or boolean
     :return: Spark DataFrame
     """
     validate_columns_names(self, col_name)

     row_ops = self.rows
     # fbdt builds the data type filter expression for the column
     dtype_filter = fbdt(col_name, data_type)
     return row_ops.drop(dtype_filter)
Ejemplo n.º 4
0
    def date_transform(col_name, new_col, current_format, output_format):
        """
        Transform the date format of a column
        :param  col_name: Name of the date column to be transformed.
        :param  new_col: Name of the column that receives the re-formatted dates.
        :param  current_format: current string date format of the column specified. Values that do not
                                follow this format are expected to come out as null, because parsing
                                them with the given format fails.
        :param  output_format: output date string format to be expected.
        :return: Result of apply_expr for the generated expression.
        """

        # Asserting that the column is in the dataFrame:
        validate_columns_names(self, col_name)

        def _date_transform(new_col, attr):
            source, fmt_in, fmt_out = attr[0], attr[1], attr[2]

            # Parse with the current format, then render with the requested one
            as_timestamp = F.unix_timestamp(source, fmt_in).cast("timestamp")
            return F.date_format(as_timestamp, fmt_out).alias(new_col)

        expr_args = [col_name, current_format, output_format]
        return apply_expr(new_col, _date_transform, expr_args)