Code Example #1
File: import_csv.py Project: Haleyo/spark-tk
def import_csv(path, delimiter=",", header=False, schema=None, datetime_format="yyyy-MM-dd'T'HH:mm:ss.SSSX", tc=TkContext.implicit):
    """
    Creates a frame with data from a csv file.

    Parameters
    ----------

    :param path: (str) Full path to the csv file
    :param delimiter: (Optional[str]) A string which indicates the separation of data fields.  This is usually a
                      single character and could be a non-visible character, such as a tab. The default delimiter
                      is a comma (,).
    :param header: (Optional[bool]) Boolean value indicating if the first line of the file will be used to name columns
                   (unless a schema is provided), and not be included in the data.  The default value is false.
    :param schema: (Optional[list[tuple(str, type)] or list[str]]) There are different options for specifying a schema:

    * Provide the full schema for the frame as a list of tuples (string column name and data type)
    * Provide the column names as a list of strings.  Column data types will be inferred, based on the data.  The column names specified will override column names that are found in the header row.
    * None, where the schema is automatically inferred based on the data.  Columns are named based on the header, or will be named generically ("C0", "C1", "C2", etc).

    :param datetime_format: (str) String specifying how date/time columns are formatted, using the java.text.SimpleDateFormat
                        specified at https://docs.oracle.com/javase/7/docs/api/java/text/SimpleDateFormat.html
    :return: (Frame) Frame that contains the data from the csv file

    Examples
    --------

    Load a frame from a csv file by specifying the path to the file and the delimiter

        >>> file_path = "../datasets/cities.csv"

        >>> frame = tc.frame.import_csv(file_path, "|", header=True)
        -etc-

        >>> frame.inspect()
        [#]  rank  city         population_2013  population_2010  change  county
        ============================================================================
        [0]     1  Portland              609456           583776  4.40%   Multnomah
        [1]     2  Salem                 160614           154637  3.87%   Marion
        [2]     3  Eugene                159190           156185  1.92%   Lane
        [3]     4  Gresham               109397           105594  3.60%   Multnomah
        [4]     5  Hillsboro              97368            91611  6.28%   Washington
        [5]     6  Beaverton              93542            89803  4.16%   Washington
        [6]    15  Grants Pass            35076            34533  1.57%   Josephine
        [7]    16  Oregon City            34622            31859  8.67%   Clackamas
        [8]    17  McMinnville            33131            32187  2.93%   Yamhill
        [9]    18  Redmond                27427            26215  4.62%   Deschutes

        >>> frame.schema
        [('rank', <type 'int'>), ('city', <type 'str'>), ('population_2013', <type 'int'>), ('population_2010', <type 'int'>), ('change', <type 'str'>), ('county', <type 'str'>)]

    The schema parameter can be used to specify a custom schema (column names and data types) or column names (and the
    data types are inferred based on the data).  Here, we will specify the column names, which will override the
    header from the csv file.

        >>> column_names = ["Rank", "City", "2013", "2010", "Percent_Change", "County"]
        >>> frame = tc.frame.import_csv(file_path, "|", header=True, schema=column_names)
        -etc-

        >>> frame.schema
        [('Rank', <type 'int'>), ('City', <type 'str'>), ('2013', <type 'int'>), ('2010', <type 'int'>), ('Percent_Change', <type 'str'>), ('County', <type 'str'>)]
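
    Alternatively, the full schema (column names paired with data types) can be provided,
    which skips type inference during the load.  A minimal sketch, reusing the same file:

        >>> schema = [("rank", int), ("city", str), ("population_2013", int),
        ...           ("population_2010", int), ("change", str), ("county", str)]

        >>> frame = tc.frame.import_csv(file_path, "|", header=True, schema=schema)
        -etc-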

        <hide>
        >>> file_path = "../datasets/unicode.csv"
        >>> schema = [("a", unicode), ("b", unicode), ("c", unicode)]
        >>> frame = tc.frame.import_csv(file_path, schema=schema, header=False)
        -etc-

        >>> frame.inspect()
        [#]  a  b  c
        ============
        [0]  à  ë  ñ
        [1]  ã  ê  ü

        </hide>

    """
    TkContext.validate(tc)
    require_type.non_empty_str(path, "path")
    require_type.non_empty_str(delimiter, "delimiter")
    require_type(bool, header, "header")
    require_type(str, datetime_format, "datetime_format")

    infer_schema = True
    column_names = []   # custom column names

    if schema is not None:
        if not isinstance(schema, list):
            raise TypeError("Unsupported type %s for schema parameter." % type(schema))
        elif all(isinstance(item, basestring) for item in schema):
            # schema is just column names
            column_names = schema
            schema = None
        else:
            infer_schema = False   # if a custom schema is provided, don't waste time inferring the schema during load
            sparktk_schema.validate(schema)

    header_str = str(header).lower()
    infer_schema_str = str(infer_schema).lower()
    pyspark_schema = None

    if schema is not None:
        fields = []
        for column in schema:
            if column[1] in dtypes._data_type_to_pyspark_type_table:
                fields.append(StructField(column[0], dtypes._data_type_to_pyspark_type_table[column[1]], True))
            else:
                raise TypeError("Unsupported type {0} in schema for column {1}.".format(column[1], column[0]))
        pyspark_schema = StructType(fields)

    df = tc.sql_context.read.format(
        "com.databricks.spark.csv.org.trustedanalytics.sparktk").options(
            delimiter=delimiter,
            header=header_str,
            dateformat=datetime_format,
            inferschema=infer_schema_str).load(path, schema=pyspark_schema)

    df_schema = []

    if schema is None:
        for i, column in enumerate(df.schema.fields):
            try:
                datatype = dtypes.dtypes.get_primitive_type_from_pyspark_type(type(column.dataType))
            except ValueError:
                raise TypeError("Unsupported data type ({0}) for column {1}.".format(str(column.dataType), column.name))
            column_name = column_names[i] if (i < len(column_names)) else column.name
            df_schema.append((column_name, datatype))
    else:
        df_column_count = len(df.schema.fields)
        custom_column_count = len(schema)
        if (df_column_count != custom_column_count):
            raise ValueError("Bad schema value.  The number of columns in the custom schema ({0}) must match the"
                             "number of columns in the csv file data ({1}).".format(custom_column_count, df_column_count))
        df_schema = schema

    def cast_datetime(row):
        """
        The spark data frame gives us datetime objects.  Convert them to long (ms since epoch) for our frame.
        """
        data = []
        for column_index in xrange(0, len(df_schema)):
            if df_schema[column_index][1] == dtypes.datetime and isinstance(row[column_index], datetime):
                data.append(long(dtypes.datetime_to_ms(row[column_index])))
            else:
                data.append(row[column_index])
        return data

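    # convert the DataFrame's underlying Scala RDD into a Python-consumable RDD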
    jrdd = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.scalaToPython(df._jdf.rdd())
    rdd = RDD(jrdd, tc.sc)

    if any(c[1] == dtypes.datetime for c in df_schema):
        # If any columns are date/time we must do this map
        rdd = df.rdd.map(cast_datetime)

    from sparktk.frame.frame import Frame  # circular dependency, so import late
    return Frame(tc, rdd, df_schema)
Code Example #2
File: map_columns.py Project: mapleNvg/spark-tk
def map_columns(self, func, schema):
    """
    Create a new frame from the output of a UDF which is evaluated over each row of the current frame.

    Notes
    -----

    1.  The row |UDF| ('func') must return a value in the same format as
        specified by the schema.

    Parameters
    ----------

    :param func: (UDF) Function which takes the values in the row and produces a value, or collection of values, for the new cell(s).
    :param schema: (List[(str,type)]) Schema for the column(s) being added.

    Examples
    --------

    Given our frame, let's create a new frame with the name and a column showing how many years the person has been over 18

        >>> frame = tc.frame.create([['Fred',39,16,'555-1234'],
        ...                          ['Susan',33,3,'555-0202'],
        ...                          ['Thurston',65,26,'555-4510'],
        ...                          ['Judy',44,14,'555-2183']],
        ...                         schema=[('name', str), ('age', int), ('tenure', int), ('phone', str)])

        >>> frame.inspect()
        [#]  name      age  tenure  phone
        ====================================
        [0]  Fred       39      16  555-1234
        [1]  Susan      33       3  555-0202
        [2]  Thurston   65      26  555-4510
        [3]  Judy       44      14  555-2183

        >>> adult = frame.map_columns(lambda row: [row.name, row.age - 18], [('name', str), ('adult_years', int)])

        >>> adult.inspect()
        [#]  name      adult_years
        ==========================
        [0]  Fred               21
        [1]  Susan              15
        [2]  Thurston           47
        [3]  Judy               26


    Note that the function returns a list, and therefore the schema also needs to be a list.

    It is not necessary to use lambda syntax; any function will do, as long as it takes a
    single row argument.  We can also call other local functions within it.
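
    For instance, a minimal sketch using a named function instead of a lambda (the
    function name is illustrative), reusing the frame from above:

        >>> def name_and_adult_years(row):
        ...     return [row.name, row.age - 18]

        >>> adult2 = frame.map_columns(name_and_adult_years, [('name', str), ('adult_years', int)])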

    (see also the 'add_columns' frame operation)
    """

    schema_helper.validate(schema)
    row = Row(self.schema)

    def map_columns_func(r):
        row._set_data(r)
        return func(row)

    if isinstance(schema, list):
        rdd = self._python.rdd.map(lambda r: map_columns_func(r))
    else:
        rdd = self._python.rdd.map(lambda r: [map_columns_func(r)])
    return self._tc.frame.create(rdd, schema)
Code Example #3
def add_columns(self, func, schema):
    """
    Add columns to current frame.

    Assigns data to column based on evaluating a function for each row.

    Notes
    -----

    1.  The row |UDF| ('func') must return a value in the same format as
        specified by the schema.

    Parameters
    ----------

    :param func: (UDF) Function which takes the values in the row and produces a value, or collection of values, for the new cell(s).
    :param schema: (List[(str,type)]) Schema for the column(s) being added.

    Examples
    --------

    Given our frame, let's add a column which shows how many years the person has been over 18

        >>> frame = tc.frame.create([['Fred',39,16,'555-1234'],
        ...                          ['Susan',33,3,'555-0202'],
        ...                          ['Thurston',65,26,'555-4510'],
        ...                          ['Judy',44,14,'555-2183']],
        ...                         schema=[('name', str), ('age', int), ('tenure', int), ('phone', str)])

        >>> frame.inspect()
        [#]  name      age  tenure  phone
        ====================================
        [0]  Fred       39      16  555-1234
        [1]  Susan      33       3  555-0202
        [2]  Thurston   65      26  555-4510
        [3]  Judy       44      14  555-2183

        >>> frame.add_columns(lambda row: row.age - 18, ('adult_years', int))

        >>> frame.inspect()
        [#]  name      age  tenure  phone     adult_years
        =================================================
        [0]  Fred       39      16  555-1234           21
        [1]  Susan      33       3  555-0202           15
        [2]  Thurston   65      26  555-4510           47
        [3]  Judy       44      14  555-2183           26


    Multiple columns can be added at the same time.  Let's add percentage of
    life and percentage of adult life in one call, which is more efficient.

        >>> frame.add_columns(lambda row: [row.tenure / float(row.age), row.tenure / float(row.adult_years)],
        ...                   [("of_age", float), ("of_adult", float)])

        >>> frame.inspect(round=2)
        [#]  name      age  tenure  phone     adult_years  of_age  of_adult
        ===================================================================
        [0]  Fred       39      16  555-1234           21    0.41      0.76
        [1]  Susan      33       3  555-0202           15    0.09      0.20
        [2]  Thurston   65      26  555-4510           47    0.40      0.55
        [3]  Judy       44      14  555-2183           26    0.32      0.54

    Note that the function returns a list, and therefore the schema also needs to be a list.

    It is not necessary to use lambda syntax; any function will do, as long as it takes a
    single row argument.  We can also call other local functions within it.

    Let's add a column which shows a portion of the person's name based on their adult tenure percentage.

        >>> def percentage_of_string(string, percentage):
        ...     '''returns a substring of the given string according to the given percentage'''
        ...     substring_len = int(percentage * len(string))
        ...     return string[:substring_len]

        >>> def add_name_by_adult_tenure(row):
        ...     return percentage_of_string(row.name, row.of_adult)

        >>> frame.add_columns(add_name_by_adult_tenure, ('tenured_name', unicode))

        >>> frame.inspect(columns=['name', 'of_adult', 'tenured_name'], round=2)
        [#]  name      of_adult  tenured_name
        =====================================
        [0]  Fred          0.76  Fre
        [1]  Susan         0.20  S
        [2]  Thurston      0.55  Thur
        [3]  Judy          0.54  Ju


    Let's add a name based on tenure percentage of age.

        >>> frame.add_columns(lambda row: percentage_of_string(row.name, row.of_age),
        ...                   ('tenured_name_age', unicode))

        >>> frame.inspect(round=2)
        [#]  name      age  tenure  phone     adult_years  of_age  of_adult
        ===================================================================
        [0]  Fred       39      16  555-1234           21    0.41      0.76
        [1]  Susan      33       3  555-0202           15    0.09      0.20
        [2]  Thurston   65      26  555-4510           47    0.40      0.55
        [3]  Judy       44      14  555-2183           26    0.32      0.54
        <blankline>
        [#]  tenured_name  tenured_name_age
        ===================================
        [0]  Fre           F
        [1]  S
        [2]  Thur          Thu
        [3]  Ju            J


    """

    schema_helper.validate(schema)
    schema_helper.validate_is_mergeable(self._tc, self.schema, schema)

    row = Row(self.schema)

    def add_columns_func(r):
        row._set_data(r)
        return func(row)

    if isinstance(schema, list):
        self._python.rdd = self._python.rdd.map(
            lambda r: r + add_columns_func(r))
        self._python.schema.extend(schema)
    else:
        self._python.rdd = self._python.rdd.map(
            lambda r: r + [add_columns_func(r)])
        self._python.schema.append(schema)
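
The final branch above is why the shape of the schema argument matters: a single
(name, type) tuple appends one column, while a list of tuples extends the schema with
several.  A minimal sketch contrasting the two call shapes, reusing the frame from the
docstring (the new column names are illustrative):

    >>> frame.add_columns(lambda row: row.age * 12, ('age_months', int))
    >>> frame.add_columns(lambda row: [row.age + 1, row.age + 2],
    ...                   [('next_year', int), ('year_after', int)])
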
Code Example #4
def import_csv(path,
               delimiter=",",
               header=False,
               infer_schema=True,
               schema=None,
               datetime_format="yyyy-MM-dd'T'HH:mm:ss.SSSX",
               tc=TkContext.implicit):
    """
    Creates a frame with data from a csv file.

    Parameters
    ----------

    :param path: (str) Full path to the csv file
    :param delimiter: (Optional[str]) A string which indicates the separation of data fields.  This is usually a
                      single character and could be a non-visible character, such as a tab. The default delimiter
                      is a comma (,).
    :param header: (Optional[bool]) Boolean value indicating if the first line of the file will be used to name columns,
                   and not be included in the data.  The default value is false.
    :param infer_schema: (Optional[bool]) Boolean value indicating if the column types will be automatically inferred.
                        It requires one extra pass over the data and is true by default.
    :param schema: (Optional[List[tuple(str, type)]]) Optionally specify the schema for the dataset.  Number of
                    columns specified in the schema must match the number of columns in the csv file provided.  If the
                    value from the csv file cannot be converted to the data type specified by the schema (for example,
                    if the csv file has a string, and the schema specifies an int), the value will show up as missing
                    (None) in the frame.
    :param datetime_format: (str) String specifying how date/time columns are formatted, using the java.text.SimpleDateFormat
                        specified at https://docs.oracle.com/javase/7/docs/api/java/text/SimpleDateFormat.html
    :return: (Frame) Frame that contains the data from the csv file

    Examples
    --------
    Load a frame from a csv file by specifying the path to the file, the delimiter, and
    options specifying that there is a header and that the schema should be inferred
    from the data.

        >>> file_path = "../datasets/cities.csv"

        >>> frame = tc.frame.import_csv(file_path, "|", header=True, infer_schema=True)
        -etc-

        >>> frame.inspect()
        [#]  rank  city         population_2013  population_2010  change  county
        ============================================================================
        [0]     1  Portland              609456           583776  4.40%   Multnomah
        [1]     2  Salem                 160614           154637  3.87%   Marion
        [2]     3  Eugene                159190           156185  1.92%   Lane
        [3]     4  Gresham               109397           105594  3.60%   Multnomah
        [4]     5  Hillsboro              97368            91611  6.28%   Washington
        [5]     6  Beaverton              93542            89803  4.16%   Washington
        [6]    15  Grants Pass            35076            34533  1.57%   Josephine
        [7]    16  Oregon City            34622            31859  8.67%   Clackamas
        [8]    17  McMinnville            33131            32187  2.93%   Yamhill
        [9]    18  Redmond                27427            26215  4.62%   Deschutes

        >>> frame.schema
        [('rank', <type 'int'>), ('city', <type 'str'>), ('population_2013', <type 'int'>), ('population_2010', <type 'int'>), ('change', <type 'str'>), ('county', <type 'str'>)]

        <hide>
        >>> file_path = "../datasets/unicode.csv"
        >>> schema = [("a", unicode), ("b", unicode), ("c", unicode)]
        >>> frame = tc.frame.import_csv(file_path, schema=schema, header=False, infer_schema=False)
        -etc-

        >>> frame.inspect()
        [#]  a  b  c
        ============
        [0]  à  ë  ñ
        [1]  ã  ê  ü

        </hide>

    """

    if schema is not None:
        infer_schema = False  # if a custom schema is provided, don't waste time inferring the schema during load
        sparktk_schema.validate(schema)
    if not isinstance(header, bool):
        raise ValueError(
            "header parameter must be a boolean, but is {0}.".format(
                type(header)))
    if not isinstance(infer_schema, bool):
        raise ValueError(
            "infer_schema parameter must be a boolean, but is {0}.".format(
                type(infer_schema)))
    TkContext.validate(tc)

    header_str = str(header).lower()
    infer_schema_str = str(infer_schema).lower()
    pyspark_schema = None

    if (not infer_schema) and (schema is not None):
        fields = []
        for column in schema:
            if column[1] in dtypes._data_type_to_pyspark_type_table:
                fields.append(
                    StructField(
                        column[0],
                        dtypes._data_type_to_pyspark_type_table[column[1]],
                        True))
            else:
                raise TypeError(
                    "Unsupported type {0} in schema for column {1}.".format(
                        column[1], column[0]))
        pyspark_schema = StructType(fields)

    df = tc.sql_context.read.format(
        "com.databricks.spark.csv.org.trustedanalytics.sparktk").options(
            delimiter=delimiter,
            header=header_str,
            dateformat=datetime_format,
            inferschema=infer_schema_str).load(path, schema=pyspark_schema)

    df_schema = []

    if schema is None:
        for column in df.schema.fields:
            try:
                datatype = dtypes.dtypes.get_primitive_type_from_pyspark_type(
                    type(column.dataType))
            except ValueError:
                raise TypeError(
                    "Unsupported data type ({0}) for column {1}.".format(
                        str(column.dataType), column.name))
            df_schema.append((column.name, datatype))
    else:
        df_column_count = len(df.schema.fields)
        custom_column_count = len(schema)
        if (df_column_count != custom_column_count):
            raise ValueError(
                "Bad schema value.  The number of columns in the custom schema ({0}) must match the"
                "number of columns in the csv file data ({1}).".format(
                    custom_column_count, df_column_count))
        df_schema = schema

    def cast_datetime(row):
        """
        The spark data frame gives us datetime objects.  Convert them to long (ms since epoch) for our frame.
        """
        data = []
        for column_index in xrange(0, len(df_schema)):
            if df_schema[column_index][1] == dtypes.datetime and isinstance(
                    row[column_index], datetime):
                data.append(long(dtypes.datetime_to_ms(row[column_index])))
            else:
                data.append(row[column_index])
        return data

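    # convert the DataFrame's underlying Scala RDD into a Python-consumable RDD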
    jrdd = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.scalaToPython(
        df._jdf.rdd())
    rdd = RDD(jrdd, tc.sc)

    if any(c[1] == dtypes.datetime for c in df_schema):
        # If any columns are date/time we must do this map
        rdd = df.rdd.map(cast_datetime)

    from sparktk.frame.frame import Frame  # circular dependency, so import late
    return Frame(tc, rdd, df_schema)
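
Note how cast_datetime converts dtypes.datetime columns from datetime objects to long
values (milliseconds since epoch) when building the frame.  A sketch of triggering that
path, assuming a hypothetical file with an ISO-8601 timestamp column and assuming
dtypes.datetime is accepted as a schema type:

    >>> schema = [("id", int), ("created", dtypes.datetime)]
    >>> frame = tc.frame.import_csv("../datasets/events.csv", schema=schema,
    ...                             datetime_format="yyyy-MM-dd'T'HH:mm:ss.SSSX")
    -etc-
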
Code Example #5
File: add_columns.py Project: ashaarunkumar/spark-tk
def add_columns(self, func, schema):
    """
    Add columns to current frame.

    Assigns data to column based on evaluating a function for each row.

    Notes
    -----

    1.  The row |UDF| ('func') must return a value in the same format as
        specified by the schema.

    Parameters
    ----------

    :param func: (UDF) Function which takes the values in the row and produces a value, or collection of values, for the new cell(s).
    :param schema: (List[(str,type)]) Schema for the column(s) being added.

    Examples
    --------

    Given our frame, let's add a column which shows how many years the person has been over 18

        >>> frame = tc.frame.create([['Fred',39,16,'555-1234'],
        ...                          ['Susan',33,3,'555-0202'],
        ...                          ['Thurston',65,26,'555-4510'],
        ...                          ['Judy',44,14,'555-2183']],
        ...                         schema=[('name', str), ('age', int), ('tenure', int), ('phone', str)])

        >>> frame.inspect()
        [#]  name      age  tenure  phone
        ====================================
        [0]  Fred       39      16  555-1234
        [1]  Susan      33       3  555-0202
        [2]  Thurston   65      26  555-4510
        [3]  Judy       44      14  555-2183

        >>> frame.add_columns(lambda row: row.age - 18, ('adult_years', int))

        >>> frame.inspect()
        [#]  name      age  tenure  phone     adult_years
        =================================================
        [0]  Fred       39      16  555-1234           21
        [1]  Susan      33       3  555-0202           15
        [2]  Thurston   65      26  555-4510           47
        [3]  Judy       44      14  555-2183           26


    Multiple columns can be added at the same time.  Let's add percentage of
    life and percentage of adult life in one call, which is more efficient.

        >>> frame.add_columns(lambda row: [row.tenure / float(row.age), row.tenure / float(row.adult_years)],
        ...                   [("of_age", float), ("of_adult", float)])

        >>> frame.inspect(round=2)
        [#]  name      age  tenure  phone     adult_years  of_age  of_adult
        ===================================================================
        [0]  Fred       39      16  555-1234           21    0.41      0.76
        [1]  Susan      33       3  555-0202           15    0.09      0.20
        [2]  Thurston   65      26  555-4510           47    0.40      0.55
        [3]  Judy       44      14  555-2183           26    0.32      0.54

    Note that the function returns a list, and therefore the schema also needs to be a list.

    It is not necessary to use lambda syntax; any function will do, as long as it takes a
    single row argument.  We can also call other local functions within it.

    Let's add a column which shows a portion of the person's name based on their adult tenure percentage.

        >>> def percentage_of_string(string, percentage):
        ...     '''returns a substring of the given string according to the given percentage'''
        ...     substring_len = int(percentage * len(string))
        ...     return string[:substring_len]

        >>> def add_name_by_adult_tenure(row):
        ...     return percentage_of_string(row.name, row.of_adult)

        >>> frame.add_columns(add_name_by_adult_tenure, ('tenured_name', unicode))

        >>> frame.inspect(columns=['name', 'of_adult', 'tenured_name'], round=2)
        [#]  name      of_adult  tenured_name
        =====================================
        [0]  Fred          0.76  Fre
        [1]  Susan         0.20  S
        [2]  Thurston      0.55  Thur
        [3]  Judy          0.54  Ju


    Let's add a name based on tenure percentage of age.  We know we're only going to use
    columns 'name' and 'of_age'.

        >>> frame.add_columns(lambda row: percentage_of_string(row.name, row.of_age),
        ...                   ('tenured_name_age', unicode))

        >>> frame.inspect(round=2)
        [#]  name      age  tenure  phone     adult_years  of_age  of_adult
        ===================================================================
        [0]  Fred       39      16  555-1234           21    0.41      0.76
        [1]  Susan      33       3  555-0202           15    0.09      0.20
        [2]  Thurston   65      26  555-4510           47    0.40      0.55
        [3]  Judy       44      14  555-2183           26    0.32      0.54
        <blankline>
        [#]  tenured_name  tenured_name_age
        ===================================
        [0]  Fre           F
        [1]  S
        [2]  Thur          Thu
        [3]  Ju            J


    """

    schema_helper.validate(schema)
    schema_helper.validate_is_mergeable(self._tc, self.schema, schema)

    row = Row(self.schema)

    def add_columns_func(r):
        row._set_data(r)
        return func(row)
    if isinstance(schema, list):
        self._python.rdd = self._python.rdd.map(lambda r: r + add_columns_func(r))
        self._python.schema.extend(schema)
    else:
        self._python.rdd = self._python.rdd.map(lambda r: r + [add_columns_func(r)])
        self._python.schema.append(schema)
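
Unlike map_columns (Code Example #2), which builds and returns a new frame, add_columns
mutates the current frame in place, as the assignments to self._python.rdd and
self._python.schema above show.  A minimal sketch of the difference, reusing the
docstring's frame:

    >>> frame.add_columns(lambda row: row.age - 18, ('adult_years', int))   # frame itself gains the column
    >>> names = frame.map_columns(lambda row: [row.name], [('name', str)])  # frame is unchanged; names is a new frame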