Beispiel #1
0
def count(self, where=None):
    """
    Return the number of rows in the frame, optionally restricted by a predicate.

    Parameters
    ----------

    :param where: (UDF) Optional function which receives a row and returns a boolean; when supplied,
                  only the rows for which it answers True are counted
    :return: (int) Number of rows counted

    When no predicate is supplied every row is counted.  When a UDF predicate is supplied, only the rows
    that satisfy it are counted.

    Examples
    --------

        >>> frame = tc.frame.create([['Fred',39,16,'555-1234'],
        ...                          ['Susan',33,3,'555-0202'],
        ...                          ['Thurston',65,26,'555-4510'],
        ...                          ['Judy',44,14,'555-2183']],
        ...                         schema=[('name', str), ('age', int), ('tenure', int), ('phone', str)])

        >>> frame.inspect()
        [#]  name      age  tenure  phone
        ====================================
        [0]  Fred       39      16  555-1234
        [1]  Susan      33       3  555-0202
        [2]  Thurston   65      26  555-4510
        [3]  Judy       44      14  555-2183

        >>> frame.count()
        4

        >>> frame.count(lambda row: row.age > 35)
        3

    """
    if not where:
        # No predicate: ask whichever backend currently holds the data for its total row count.
        if self._is_scala:
            return int(self._scala.rowCount())
        return self.rdd.count()

    # A single Row wrapper is re-pointed at each raw record before the UDF sees it.
    udf_row = Row(self.schema)

    def matches(raw):
        udf_row._set_data(raw)
        return where(udf_row)

    qualifying = self._python.rdd.filter(lambda raw: matches(raw))
    return qualifying.count()
def filter(self, predicate):
    """
    Keep only the rows which satisfy a predicate.

    The current frame is modified in place: rows for which the predicate answers True are retained and
    every other row is removed.

    Parameters
    ----------

    :param predicate: (UDF) Function which evaluates a row to a boolean; rows that answer False are dropped
                      from the frame.

    Examples
    --------
        >>> frame = tc.frame.create([['Fred',39,16,'555-1234'],
        ...                          ['Susan',33,3,'555-0202'],
        ...                          ['Thurston',65,26,'555-4510'],
        ...                          ['Judy',44,14,'555-2183']],
        ...                         schema=[('name', str), ('age', int), ('tenure', int), ('phone', str)])

        >>> frame.inspect()
        [#]  name      age  tenure  phone
        ====================================
        [0]  Fred       39      16  555-1234
        [1]  Susan      33       3  555-0202
        [2]  Thurston   65      26  555-4510
        [3]  Judy       44      14  555-2183

        >>> frame.filter(lambda row: row.tenure >= 15)  # keep only people with 15 or more years tenure

        >>> frame.inspect()
        [#]  name      age  tenure  phone
        ====================================
        [0]  Fred       39      16  555-1234
        [1]  Thurston   65      26  555-4510

    More information on a |UDF| can be found at :doc:`/ds_apir`.
    """
    # One Row wrapper is reused; it is re-pointed at each raw record before the UDF is called.
    udf_row = Row(self.schema)

    def keep(raw):
        udf_row._set_data(raw)
        return predicate(udf_row)

    retained = self._python.rdd.filter(keep)
    self._python.rdd = retained
Beispiel #3
0
def drop_rows(self, predicate):
    """
    Remove every row of the current frame for which the predicate holds.

    The frame is modified in place; this is the complement of keeping the rows that satisfy the predicate.

    Parameters
    ----------
    :param predicate: (UDF) Function which evaluates a row to a boolean; rows that answer True are dropped from
                      the frame.

    Examples
    --------

        >>> frame = tc.frame.create([['Fred',39,16,'555-1234'],
        ...                          ['Susan',33,3,'555-0202'],
        ...                          ['Thurston',65,26,'555-4510'],
        ...                          ['Judy',44,14,'555-2183']],
        ...                         schema=[('name', str), ('age', int), ('tenure', int), ('phone', str)])

        >>> frame.inspect()
        [#]  name      age  tenure  phone
        ====================================
        [0]  Fred       39      16  555-1234
        [1]  Susan      33       3  555-0202
        [2]  Thurston   65      26  555-4510
        [3]  Judy       44      14  555-2183

        >>> frame.drop_rows(lambda row: row.name[-1] == 'n')  # drop people whose name ends in 'n'

        >>> frame.inspect()
        [#]  name  age  tenure  phone
        ================================
        [0]  Fred   39      16  555-1234
        [1]  Judy   44      14  555-2183

    More information on a |UDF| can be found at :doc:`/ds_apir`.
    """
    # One Row wrapper is reused; it is re-pointed at each raw record before the UDF is called.
    udf_row = Row(self.schema)

    def survives(raw):
        udf_row._set_data(raw)
        # A row survives exactly when the caller's predicate does NOT select it for removal.
        return not predicate(udf_row)

    remaining = self._python.rdd.filter(survives)
    self._python.rdd = remaining
Beispiel #4
0
def count(self, where):
    """
    Return the number of rows which satisfy a predicate.

    Parameters
    ----------

    :param where: (UDF) Function which evaluates a row to a boolean
    :return: (int) Number of rows matching qualifications.

    Only the rows for which the UDF predicate answers True contribute to the count.

    Examples
    --------

        >>> frame = tc.frame.create([['Fred',39,16,'555-1234'],
        ...                          ['Susan',33,3,'555-0202'],
        ...                          ['Thurston',65,26,'555-4510'],
        ...                          ['Judy',44,14,'555-2183']],
        ...                         schema=[('name', str), ('age', int), ('tenure', int), ('phone', str)])

        >>> frame.inspect()
        [#]  name      age  tenure  phone
        ====================================
        [0]  Fred       39      16  555-1234
        [1]  Susan      33       3  555-0202
        [2]  Thurston   65      26  555-4510
        [3]  Judy       44      14  555-2183

        >>> frame.count(lambda row: row.age > 35)
        3

    """
    # One Row wrapper is reused; it is re-pointed at each raw record before the UDF is called.
    udf_row = Row(self.schema)

    def matches(raw):
        udf_row._set_data(raw)
        return where(udf_row)

    qualifying = self._python.rdd.filter(lambda raw: matches(raw))
    return qualifying.count()
Beispiel #5
0
def map_columns(self, func, schema):
    """
    Create a new frame from the output of a UDF which is applied to each row of the current frame.

    Notes
    -----

    1.  The row |UDF| ('func') must return a value in the same format as
        specified by the schema.

    Parameters
    ----------

    :param func: (UDF) Function which takes the values in the row and produces a value, or collection of values, for the new cell(s).
    :param schema: (List[(str,type)]) Schema for the column(s) of the new frame.
    :return: The frame built from the mapped rows and the given schema.

    Examples
    --------

    Given our frame, let's create a new frame with the name and a column with how many years the person has been over 18

        >>> frame = tc.frame.create([['Fred',39,16,'555-1234'],
        ...                          ['Susan',33,3,'555-0202'],
        ...                          ['Thurston',65,26,'555-4510'],
        ...                          ['Judy',44,14,'555-2183']],
        ...                         schema=[('name', str), ('age', int), ('tenure', int), ('phone', str)])

        >>> frame.inspect()
        [#]  name      age  tenure  phone
        ====================================
        [0]  Fred       39      16  555-1234
        [1]  Susan      33       3  555-0202
        [2]  Thurston   65      26  555-4510
        [3]  Judy       44      14  555-2183

        >>> adult = frame.map_columns(lambda row: [row.name, row.age - 18], [('name', str), ('adult_years', int)])

        >>> adult.inspect()
        [#]  name      adult_years
        ==========================
        [0]  Fred               21
        [1]  Susan              15
        [2]  Thurston           47
        [3]  Judy               26


    Note that the function returns a list, and therefore the schema also needs to be a list.

    It is not necessary to use lambda syntax, any function will do, as long as it takes a single row argument.  We
    can also call other local functions within.

    (see also the 'add_columns' frame operation)
    """

    schema_helper.validate(schema)

    # One Row wrapper is reused; it is re-pointed at each raw record before the UDF is called.
    udf_row = Row(self.schema)

    # A list schema means the UDF already returns a list of cells; otherwise its single
    # return value is wrapped so that every mapped record is a list.
    returns_cells = isinstance(schema, list)

    def to_cells(raw):
        udf_row._set_data(raw)
        produced = func(udf_row)
        return produced if returns_cells else [produced]

    mapped = self._python.rdd.map(to_cells)
    return self._tc.frame.create(mapped, schema)
Beispiel #6
0
def add_columns(self, func, schema):
    """
    Append one or more computed columns to the current frame.

    The value(s) of the new column(s) are obtained by evaluating a function on each row.  The frame is
    modified in place.

    Notes
    -----

    1.  The row |UDF| ('func') must return a value in the same format as
        specified by the schema.

    Parameters
    ----------

    :param func: (UDF) Function which takes the values in the row and produces a value, or collection of values, for the new cell(s).
    :param schema: (List[(str,type)]) Schema for the column(s) being added.

    Examples
    --------

    Given our frame, let's add a column which has how many years the person has been over 18

        >>> frame = tc.frame.create([['Fred',39,16,'555-1234'],
        ...                          ['Susan',33,3,'555-0202'],
        ...                          ['Thurston',65,26,'555-4510'],
        ...                          ['Judy',44,14,'555-2183']],
        ...                         schema=[('name', str), ('age', int), ('tenure', int), ('phone', str)])

        >>> frame.inspect()
        [#]  name      age  tenure  phone
        ====================================
        [0]  Fred       39      16  555-1234
        [1]  Susan      33       3  555-0202
        [2]  Thurston   65      26  555-4510
        [3]  Judy       44      14  555-2183

        >>> frame.add_columns(lambda row: row.age - 18, ('adult_years', int))

        >>> frame.inspect()
        [#]  name      age  tenure  phone     adult_years
        =================================================
        [0]  Fred       39      16  555-1234           21
        [1]  Susan      33       3  555-0202           15
        [2]  Thurston   65      26  555-4510           47
        [3]  Judy       44      14  555-2183           26


    Multiple columns can be added at the same time.  Let's add percentage of
    life and percentage of adult life in one call, which is more efficient.

        >>> frame.add_columns(lambda row: [row.tenure / float(row.age), row.tenure / float(row.adult_years)],
        ...                   [("of_age", float), ("of_adult", float)])

        >>> frame.inspect(round=2)
        [#]  name      age  tenure  phone     adult_years  of_age  of_adult
        ===================================================================
        [0]  Fred       39      16  555-1234           21    0.41      0.76
        [1]  Susan      33       3  555-0202           15    0.09      0.20
        [2]  Thurston   65      26  555-4510           47    0.40      0.55
        [3]  Judy       44      14  555-2183           26    0.32      0.54

    Note that the function returns a list, and therefore the schema also needs to be a list.

    It is not necessary to use lambda syntax, any function will do, as long as it takes a single row argument.  We
    can also call other local functions within.

    Let's add a column which shows the amount of person's name based on their adult tenure percentage.

        >>> def percentage_of_string(string, percentage):
        ...     '''returns a substring of the given string according to the given percentage'''
        ...     substring_len = int(percentage * len(string))
        ...     return string[:substring_len]

        >>> def add_name_by_adult_tenure(row):
        ...     return percentage_of_string(row.name, row.of_adult)

        >>> frame.add_columns(add_name_by_adult_tenure, ('tenured_name', unicode))

        >>> frame.inspect(columns=['name', 'of_adult', 'tenured_name'], round=2)
        [#]  name      of_adult  tenured_name
        =====================================
        [0]  Fred          0.76  Fre
        [1]  Susan         0.20  S
        [2]  Thurston      0.55  Thur
        [3]  Judy          0.54  Ju


    Let's add a name based on tenure percentage of age.

        >>> frame.add_columns(lambda row: percentage_of_string(row.name, row.of_age),
        ...                   ('tenured_name_age', unicode))

        >>> frame.inspect(round=2)
        [#]  name      age  tenure  phone     adult_years  of_age  of_adult
        ===================================================================
        [0]  Fred       39      16  555-1234           21    0.41      0.76
        [1]  Susan      33       3  555-0202           15    0.09      0.20
        [2]  Thurston   65      26  555-4510           47    0.40      0.55
        [3]  Judy       44      14  555-2183           26    0.32      0.54
        <blankline>
        [#]  tenured_name  tenured_name_age
        ===================================
        [0]  Fre           F
        [1]  S
        [2]  Thur          Thu
        [3]  Ju            J


    """

    schema_helper.validate(schema)
    schema_helper.validate_is_mergeable(self._tc, self.schema, schema)

    # One Row wrapper is reused; it is re-pointed at each raw record before the UDF is called.
    udf_row = Row(self.schema)

    # A list schema means the UDF returns a list of new cells; otherwise it returns one cell.
    adds_many = isinstance(schema, list)

    def widen(raw):
        udf_row._set_data(raw)
        produced = func(udf_row)
        # Each output record is the original record followed by the new cell(s).
        return raw + (produced if adds_many else [produced])

    widened = self._python.rdd.map(widen)
    self._python.rdd = widened

    # Keep the frame's schema in step with the wider records.
    if adds_many:
        self._python.schema.extend(schema)
    else:
        self._python.schema.append(schema)
Beispiel #7
0
def copy(self, columns=None, where=None):
    """
    New frame with copied columns.

    Parameters
    ----------

    :param columns: (str, List[str], or dictionary(str,str))  If not None, the copy will only include the
                    columns specified.  If dict, the string pairs represent a column renaming
                    { source_column_name : destination_column_name }
    :param where: (UDF) Optionally provide a where function.  If not None, only those rows for which the UDF
                  evaluates to True will be copied.
    :return: (Frame) New Frame object.

    Copies specified columns into a new Frame object, optionally renaming them and/or filtering them.
    Useful for frame query.

    A ValueError is raised when 'where' is neither None nor a function, or when 'columns' is not a
    str, list, dict, or None.

    Examples
    --------

    <hide>
    >>> schema = [("name", str), ("age", int), ("years", int)]
    >>> data = [["Thurston",64,26],["Judy",44,14],["Emily",37,5],["Frank",50,18],["Joe",43,11],["Ruth",52,21]]
    >>> frame = tc.frame.create(data, schema)
    </hide>

    Consider the following frame of employee names, age, and years of service:

        >>> frame.inspect()
        [#]  name      age  years
        =========================
        [0]  Thurston   64     26
        [1]  Judy       44     14
        [2]  Emily      37      5
        [3]  Frank      50     18
        [4]  Joe        43     11
        [5]  Ruth       52     21

        >>> frame.schema
        [('name', <type 'str'>), ('age', <type 'int'>), ('years', <type 'int'>)]

    To create a duplicate copy of the frame, use the copy operation with no parameters:

        >>> duplicate = frame.copy()
        <progress>

        >>> duplicate.inspect()
        [#]  name      age  years
        =========================
        [0]  Thurston   64     26
        [1]  Judy       44     14
        [2]  Emily      37      5
        [3]  Frank      50     18
        [4]  Joe        43     11
        [5]  Ruth       52     21

    Using the copy operation, we can also limit the new frame to just include the 'name' column:

        >>> names = frame.copy("name")
        <progress>

        >>> names.inspect()
        [#]  name
        =============
        [0]  Thurston
        [1]  Judy
        [2]  Emily
        [3]  Frank
        [4]  Joe
        [5]  Ruth

    We could also include a UDF to filter the data that is included in the new frame, and also provide
    a dictionary to rename the column(s) in the new frame.  Here we will use copy to create a frame of
    names for the employees that have over 20 years of service and also rename the 'name' column to
    'first_name':

        >>> names = frame.copy({"name" : "first_name"}, lambda row: row.years > 20)
        <progress>

        >>> names.inspect()
        [#]  first_name
        ===============
        [0]  Thurston
        [1]  Ruth

    """
    new_rdd = self._python.rdd

    if where is not None and not isinstance(where, types.FunctionType):
        raise ValueError(
            "Unsupported type for 'where' parameter.  Must be a function or None, but is: {0}"
            .format(type(where)))

    if isinstance(columns, str):
        columns = [columns]

    if columns is None:
        # range() is available on both Python 2 and Python 3, unlike xrange().
        column_indices = range(len(self._python.schema))
    elif isinstance(columns, (list, dict)):
        # 'in' tests list elements or dict keys, so one comprehension serves both cases.
        # NOTE: requested names that are not in the frame's schema are silently ignored, and the
        # selected columns keep the frame's schema order rather than the order given in 'columns'.
        column_indices = [
            i for i, column in enumerate(self._python.schema)
            if column[0] in columns
        ]
    else:
        raise ValueError(
            "Unsupported type for 'columns' parameter. Expected str, list, dict, or None, but was: {0}"
            .format(type(columns)))

    is_projection = len(column_indices) < len(self._python.schema)

    if where is not None or is_projection:
        # One Row wrapper serves both closures; each re-points it at the raw record before reading.
        row = Row(self._python.schema)

        if where is not None:
            # Keep only the rows accepted by the caller's UDF.
            def copy_func(r):
                row._set_data(r)
                return where(row)

            new_rdd = new_rdd.filter(copy_func)

        if is_projection:
            # Map rows to only include the specified columns.
            def map_func(r):
                row._set_data(r)
                return [row[i] for i in column_indices]

            new_rdd = new_rdd.map(map_func)

    new_schema = [self._python.schema[i] for i in column_indices]

    # If columns are being renamed through a dictionary, alter the schema.  Every entry of
    # new_schema was selected because its name is a key of 'columns', so the lookup cannot miss.
    if isinstance(columns, dict):
        new_schema = [(columns[column[0]], column[1]) for column in new_schema]

    # Return a new frame with the filtered rdd and new schema.
    return self._tc.frame.create(new_rdd, new_schema)
Beispiel #8
0
def add_columns(self, func, schema, columns_accessed=None):
    """
    Append one or more computed columns to the current frame.

    The value(s) of the new column(s) are obtained by evaluating a function on each row.  The frame is
    modified in place.

    Parameters
    ----------

    :param func: (UDF) Function which takes a row and produces a value, or list of values, for the new cell(s).
    :param schema: A (name, type) pair for a single new column, or a list of such pairs for several.
    :param columns_accessed: Optional list of the column names the UDF reads (see **Optimization** below).
                             NOTE(review): the body of this function does not read this argument.

    Notes
    -----
    1)  The row |UDF| ('func') must return a value in the same format as
        specified by the schema.
        See :doc:`/ds_apir`.
    2)  Unicode in column names is not supported and will likely cause the
        drop_frames() method (and others) to fail!

    Examples
    --------
    Given our frame, let's add a column which has how many years the person has been over 18

    .. code::

        >>> frame = tc.to_frame([['Fred',39,16,'555-1234'],
        ...                      ['Susan',33,3,'555-0202'],
        ...                      ['Thurston',65,26,'555-4510'],
        ...                      ['Judy',44,14,'555-2183']],
        ...                     schema=[('name', str), ('age', int), ('tenure', int), ('phone', str)])

        >>> frame.inspect()
        [#]  name      age  tenure  phone
        ====================================
        [0]  Fred       39      16  555-1234
        [1]  Susan      33       3  555-0202
        [2]  Thurston   65      26  555-4510
        [3]  Judy       44      14  555-2183

        >>> frame.add_columns(lambda row: row.age - 18, ('adult_years', int))

        >>> frame.inspect()
        [#]  name      age  tenure  phone     adult_years
        =================================================
        [0]  Fred       39      16  555-1234           21
        [1]  Susan      33       3  555-0202           15
        [2]  Thurston   65      26  555-4510           47
        [3]  Judy       44      14  555-2183           26


    Multiple columns can be added at the same time.  Let's add percentage of
    life and percentage of adult life in one call, which is more efficient.

    .. code::

        >>> frame.add_columns(lambda row: [row.tenure / float(row.age), row.tenure / float(row.adult_years)], [("of_age", float), ("of_adult", float)])

        >>> frame.inspect(round=2)
        [#]  name      age  tenure  phone     adult_years  of_age  of_adult
        ===================================================================
        [0]  Fred       39      16  555-1234           21    0.41      0.76
        [1]  Susan      33       3  555-0202           15    0.09      0.20
        [2]  Thurston   65      26  555-4510           47    0.40      0.55
        [3]  Judy       44      14  555-2183           26    0.32      0.54

    Note that the function returns a list, and therefore the schema also needs to be a list.

    It is not necessary to use lambda syntax, any function will do, as long as it takes a single row argument.  We
    can also call other local functions within.

    Let's add a column which shows the amount of person's name based on their adult tenure percentage.

        >>> def percentage_of_string(string, percentage):
        ...     '''returns a substring of the given string according to the given percentage'''
        ...     substring_len = int(percentage * len(string))
        ...     return string[:substring_len]

        >>> def add_name_by_adult_tenure(row):
        ...     return percentage_of_string(row.name, row.of_adult)

        >>> frame.add_columns(add_name_by_adult_tenure, ('tenured_name', unicode))

        <skip>
        >>> frame
        Frame "example_frame"
        row_count = 4
        schema = [name:unicode, age:int32, tenure:int32, phone:unicode, adult_years:int32, of_age:float32, of_adult:float32, tenured_name:unicode]
        status = ACTIVE  (last_read_date = -etc-)
        </skip>

        >>> frame.inspect(columns=['name', 'of_adult', 'tenured_name'], round=2)
        [#]  name      of_adult  tenured_name
        =====================================
        [0]  Fred          0.76  Fre
        [1]  Susan         0.20  S
        [2]  Thurston      0.55  Thur
        [3]  Judy          0.54  Ju


    **Optimization** - If we know up front which columns our row function will access, we
    can tell add_columns to speed up the execution by working on only the limited feature
    set rather than the entire row.

    Let's add a name based on tenure percentage of age.  We know we're only going to use
    columns 'name' and 'of_age'.

    .. code::

        >>> frame.add_columns(lambda row: percentage_of_string(row.name, row.of_age),
        ...                   ('tenured_name_age', unicode),
        ...                   columns_accessed=['name', 'of_age'])

        >>> frame.inspect(round=2)
        [#]  name      age  tenure  phone     adult_years  of_age  of_adult
        ===================================================================
        [0]  Fred       39      16  555-1234           21    0.41      0.76
        [1]  Susan      33       3  555-0202           15    0.09      0.20
        [2]  Thurston   65      26  555-4510           47    0.40      0.55
        [3]  Judy       44      14  555-2183           26    0.32      0.54
        <blankline>
        [#]  tenured_name  tenured_name_age
        ===================================
        [0]  Fre           F
        [1]  S
        [2]  Thur          Thu
        [3]  Ju            J

    More information on a row |UDF| can be found at :doc:`/ds_apir`

    """
    # For further examples, see :ref:`example_frame.add_columns`.

    # One Row wrapper is reused; it is re-pointed at each raw record before the UDF is called.
    udf_row = Row(self.schema)

    # A list schema means the UDF returns a list of new cells; otherwise it returns one cell.
    adds_many = isinstance(schema, list)

    def widen(raw):
        udf_row._set_data(raw)
        produced = func(udf_row)
        # Each output record is the original record followed by the new cell(s).
        return raw + (produced if adds_many else [produced])

    widened = self._python.rdd.map(widen)
    self._python.rdd = widened

    # Keep the frame's schema in step with the wider records.
    if adds_many:
        self._python.schema.extend(schema)
    else:
        self._python.schema.append(schema)