def count(self, where=None):
    """
    Count every row in the frame, or only the rows accepted by a predicate.

    Parameters
    ----------

    :param where: (UDF) Optional function which evaluates a row to a boolean to determine if it
                  should be counted
    :return: (int) Number of rows counted

    With no predicate every row is counted; with a predicate only the rows for which the UDF
    answers True are counted.

    Examples
    --------

        >>> frame = tc.frame.create([['Fred',39,16,'555-1234'],
        ...                          ['Susan',33,3,'555-0202'],
        ...                          ['Thurston',65,26,'555-4510'],
        ...                          ['Judy',44,14,'555-2183']],
        ...                          schema=[('name', str), ('age', int), ('tenure', int), ('phone', str)])

        >>> frame.inspect()
        [#]  name      age  tenure  phone
        ====================================
        [0]  Fred       39      16  555-1234
        [1]  Susan      33       3  555-0202
        [2]  Thurston   65      26  555-4510
        [3]  Judy       44      14  555-2183

        >>> frame.count()
        4

        >>> frame.count(lambda row: row.age > 35)
        3

    """
    if not where:
        # No predicate supplied: ask the Scala-side frame when `_is_scala` is set,
        # otherwise count the rdd directly.
        if self._is_scala:
            return int(self._scala.rowCount())
        return self.rdd.count()

    # A single Row wrapper is re-pointed at each record instead of building one per record.
    row_view = Row(self.schema)

    def is_counted(record):
        row_view._set_data(record)
        return where(row_view)

    return self._python.rdd.filter(is_counted).count()
def filter(self, predicate):
    """
    Keep only the rows which satisfy a predicate.

    Modifies the current frame: rows accepted by the predicate are saved, everything else is
    deleted.

    Parameters
    ----------

    :param predicate: (UDF) Function which evaluates a row to a boolean; rows that answer False
                      are dropped from the frame.

    Examples
    --------

        >>> frame = tc.frame.create([['Fred',39,16,'555-1234'],
        ...                          ['Susan',33,3,'555-0202'],
        ...                          ['Thurston',65,26,'555-4510'],
        ...                          ['Judy',44,14,'555-2183']],
        ...                          schema=[('name', str), ('age', int), ('tenure', int), ('phone', str)])

        >>> frame.inspect()
        [#]  name      age  tenure  phone
        ====================================
        [0]  Fred       39      16  555-1234
        [1]  Susan      33       3  555-0202
        [2]  Thurston   65      26  555-4510
        [3]  Judy       44      14  555-2183

        >>> frame.filter(lambda row: row.tenure >= 15)  # keep only people with 15 or more years tenure

        >>> frame.inspect()
        [#]  name      age  tenure  phone
        ====================================
        [0]  Fred       39      16  555-1234
        [1]  Thurston   65      26  555-4510

    More information on a |UDF| can be found at :doc:`/ds_apir`.
    """
    # A single Row wrapper is re-pointed at each record before the UDF sees it.
    row_view = Row(self.schema)

    def keep(record):
        row_view._set_data(record)
        return predicate(row_view)

    surviving = self._python.rdd.filter(keep)
    self._python.rdd = surviving
def drop_rows(self, predicate):
    """
    Remove every row of the current frame which satisfies a predicate.

    Parameters
    ----------

    :param predicate: (UDF) Function which evaluates a row to a boolean; rows that answer True
                      are dropped from the frame.

    Examples
    --------

        >>> frame = tc.frame.create([['Fred',39,16,'555-1234'],
        ...                          ['Susan',33,3,'555-0202'],
        ...                          ['Thurston',65,26,'555-4510'],
        ...                          ['Judy',44,14,'555-2183']],
        ...                          schema=[('name', str), ('age', int), ('tenure', int), ('phone', str)])

        >>> frame.inspect()
        [#]  name      age  tenure  phone
        ====================================
        [0]  Fred       39      16  555-1234
        [1]  Susan      33       3  555-0202
        [2]  Thurston   65      26  555-4510
        [3]  Judy       44      14  555-2183

        >>> frame.drop_rows(lambda row: row.name[-1] == 'n')  # drop people whose name ends in 'n'

        >>> frame.inspect()
        [#]  name  age  tenure  phone
        ================================
        [0]  Fred   39      16  555-1234
        [1]  Judy   44      14  555-2183

    More information on a |UDF| can be found at :doc:`/ds_apir`.
    """
    # A single Row wrapper is re-pointed at each record before the UDF sees it.
    row_view = Row(self.schema)

    def survives(record):
        row_view._set_data(record)
        # The rdd filter keeps what answers True, so the drop predicate is inverted.
        return not predicate(row_view)

    remaining = self._python.rdd.filter(survives)
    self._python.rdd = remaining
def count(self, where):
    """
    Count the rows accepted by a predicate.

    Parameters
    ----------

    :param where: (UDF) Function which evaluates a row to a boolean
    :return: (int) Number of rows matching qualifications.

    Only the rows for which the UDF predicate answers True contribute to the count.

    Examples
    --------

        >>> frame = tc.frame.create([['Fred',39,16,'555-1234'],
        ...                          ['Susan',33,3,'555-0202'],
        ...                          ['Thurston',65,26,'555-4510'],
        ...                          ['Judy',44,14,'555-2183']],
        ...                          schema=[('name', str), ('age', int), ('tenure', int), ('phone', str)])

        >>> frame.inspect()
        [#]  name      age  tenure  phone
        ====================================
        [0]  Fred       39      16  555-1234
        [1]  Susan      33       3  555-0202
        [2]  Thurston   65      26  555-4510
        [3]  Judy       44      14  555-2183

        >>> frame.count(lambda row: row.age > 35)
        3

    """
    # A single Row wrapper is re-pointed at each record before the UDF sees it.
    row_view = Row(self.schema)

    def is_counted(record):
        row_view._set_data(record)
        return where(row_view)

    qualified = self._python.rdd.filter(is_counted)
    return qualified.count()
def map_columns(self, func, schema):
    """
    Create a new frame from the output of a UDF which is applied to each row of the current frame.

    Notes
    -----

    1.  The row |UDF| ('func') must return a value in the same format as
        specified by the schema.

    Parameters
    ----------

    :param func: (UDF) Function which takes the values in the row and produces a value, or
                 collection of values, for the new cell(s).
    :param schema: (List[(str,type)]) Schema for the column(s) being added.

    Examples
    --------

    Given our frame, let's create a new frame with the name and a column with how many years the
    person has been over 18

        >>> frame = tc.frame.create([['Fred',39,16,'555-1234'],
        ...                          ['Susan',33,3,'555-0202'],
        ...                          ['Thurston',65,26,'555-4510'],
        ...                          ['Judy',44,14,'555-2183']],
        ...                          schema=[('name', str), ('age', int), ('tenure', int), ('phone', str)])

        >>> frame.inspect()
        [#]  name      age  tenure  phone
        ====================================
        [0]  Fred       39      16  555-1234
        [1]  Susan      33       3  555-0202
        [2]  Thurston   65      26  555-4510
        [3]  Judy       44      14  555-2183

        >>> adult = frame.map_columns(lambda row: [row.name, row.age - 18], [('name', str), ('adult_years', int)])

        >>> adult.inspect()
        [#]  name      adult_years
        ==========================
        [0]  Fred               21
        [1]  Susan              15
        [2]  Thurston           47
        [3]  Judy               26

    Note that the function returns a list, and therefore the schema also needs to be a list.

    It is not necessary to use lambda syntax, any function will do, as long as it takes a single
    row argument.  We can also call other local functions within.

    (see also the 'add_columns' frame operation)
    """
    schema_helper.validate(schema)

    # A single Row wrapper is re-pointed at each record before the UDF sees it.
    row_view = Row(self.schema)

    def udf_result(record):
        row_view._set_data(record)
        return func(row_view)

    def udf_result_as_row(record):
        # Single-column schema: wrap the lone value so every output record is a list.
        return [udf_result(record)]

    to_new_row = udf_result if isinstance(schema, list) else udf_result_as_row
    mapped_rdd = self._python.rdd.map(to_new_row)
    return self._tc.frame.create(mapped_rdd, schema)
def add_columns(self, func, schema):
    """
    Add columns to current frame.

    Each new cell is filled by evaluating a function against the row it belongs to.

    Notes
    -----

    1.  The row |UDF| ('func') must return a value in the same format as
        specified by the schema.

    Parameters
    ----------

    :param func: (UDF) Function which takes the values in the row and produces a value, or
                 collection of values, for the new cell(s).
    :param schema: (List[(str,type)]) Schema for the column(s) being added.

    Examples
    --------

    Given our frame, let's add a column which has how many years the person has been over 18

        >>> frame = tc.frame.create([['Fred',39,16,'555-1234'],
        ...                          ['Susan',33,3,'555-0202'],
        ...                          ['Thurston',65,26,'555-4510'],
        ...                          ['Judy',44,14,'555-2183']],
        ...                          schema=[('name', str), ('age', int), ('tenure', int), ('phone', str)])

        >>> frame.inspect()
        [#]  name      age  tenure  phone
        ====================================
        [0]  Fred       39      16  555-1234
        [1]  Susan      33       3  555-0202
        [2]  Thurston   65      26  555-4510
        [3]  Judy       44      14  555-2183

        >>> frame.add_columns(lambda row: row.age - 18, ('adult_years', int))

        >>> frame.inspect()
        [#]  name      age  tenure  phone     adult_years
        =================================================
        [0]  Fred       39      16  555-1234           21
        [1]  Susan      33       3  555-0202           15
        [2]  Thurston   65      26  555-4510           47
        [3]  Judy       44      14  555-2183           26

    Multiple columns can be added at the same time.  Let's add percentage of
    life and percentage of adult life in one call, which is more efficient.

        >>> frame.add_columns(lambda row: [row.tenure / float(row.age), row.tenure / float(row.adult_years)],
        ...                   [("of_age", float), ("of_adult", float)])

        >>> frame.inspect(round=2)
        [#]  name      age  tenure  phone     adult_years  of_age  of_adult
        ===================================================================
        [0]  Fred       39      16  555-1234           21    0.41      0.76
        [1]  Susan      33       3  555-0202           15    0.09      0.20
        [2]  Thurston   65      26  555-4510           47    0.40      0.55
        [3]  Judy       44      14  555-2183           26    0.32      0.54

    Note that the function returns a list, and therefore the schema also needs to be a list.

    It is not necessary to use lambda syntax, any function will do, as long as it takes a single
    row argument.  We can also call other local functions within.

    Let's add a column which shows the amount of person's name based on their adult tenure
    percentage.

        >>> def percentage_of_string(string, percentage):
        ...     '''returns a substring of the given string according to the given percentage'''
        ...     substring_len = int(percentage * len(string))
        ...     return string[:substring_len]

        >>> def add_name_by_adult_tenure(row):
        ...     return percentage_of_string(row.name, row.of_adult)

        >>> frame.add_columns(add_name_by_adult_tenure, ('tenured_name', unicode))

        >>> frame.inspect(columns=['name', 'of_adult', 'tenured_name'], round=2)
        [#]  name      of_adult  tenured_name
        =====================================
        [0]  Fred          0.76  Fre
        [1]  Susan         0.20  S
        [2]  Thurston      0.55  Thur
        [3]  Judy          0.54  Ju

    Let's add a name based on tenure percentage of age.

        >>> frame.add_columns(lambda row: percentage_of_string(row.name, row.of_age),
        ...                   ('tenured_name_age', unicode))

        >>> frame.inspect(round=2)
        [#]  name      age  tenure  phone     adult_years  of_age  of_adult
        ===================================================================
        [0]  Fred       39      16  555-1234           21    0.41      0.76
        [1]  Susan      33       3  555-0202           15    0.09      0.20
        [2]  Thurston   65      26  555-4510           47    0.40      0.55
        [3]  Judy       44      14  555-2183           26    0.32      0.54
        <blankline>
        [#]  tenured_name  tenured_name_age
        ===================================
        [0]  Fre           F
        [1]  S
        [2]  Thur          Thu
        [3]  Ju            J

    """
    schema_helper.validate(schema)
    schema_helper.validate_is_mergeable(self._tc, self.schema, schema)

    # A single Row wrapper is re-pointed at each record before the UDF sees it.
    row_view = Row(self.schema)

    def new_cells(record):
        row_view._set_data(record)
        return func(row_view)

    if not isinstance(schema, list):
        # Single column: the UDF yields one value, appended to the record as a one-item list.
        widened = self._python.rdd.map(lambda record: record + [new_cells(record)])
        self._python.rdd = widened
        self._python.schema.append(schema)
        return

    # Several columns: the UDF yields a list which is concatenated onto the record.
    widened = self._python.rdd.map(lambda record: record + new_cells(record))
    self._python.rdd = widened
    self._python.schema.extend(schema)
def copy(self, columns=None, where=None):
    """
    New frame with copied columns.

    Parameters
    ----------

    :param columns: (str, List[str], or dictionary(str,str))  If not None, the copy will only
                    include the columns specified.  If dict, the string pairs represent a column
                    renaming { source_column_name : destination_column_name }.  Selected columns
                    keep the order they have in this frame's schema; names not present in the
                    schema are ignored.
    :param where: (UDF) Optionally provide a where function.  If not None, only those rows for
                  which the UDF evaluates to True will be copied.  Any callable is accepted.
    :return: (Frame) New Frame object.
    :raises ValueError: if 'where' is neither None nor callable, or if 'columns' is not a str,
                        list, dict, or None.

    Copies specified columns into a new Frame object, optionally renaming them and/or filtering
    them.  Useful for frame query.

    Examples
    --------

        <hide>
        >>> schema = [("name", str), ("age", int), ("years", int)]
        >>> data = [["Thurston",64,26],["Judy",44,14],["Emily",37,5],["Frank",50,18],["Joe",43,11],["Ruth",52,21]]
        >>> frame = tc.frame.create(data, schema)
        </hide>

    Consider the following frame of employee names, age, and years of service:

        >>> frame.inspect()
        [#]  name      age  years
        =========================
        [0]  Thurston   64     26
        [1]  Judy       44     14
        [2]  Emily      37      5
        [3]  Frank      50     18
        [4]  Joe        43     11
        [5]  Ruth       52     21

        >>> frame.schema
        [('name', <type 'str'>), ('age', <type 'int'>), ('years', <type 'int'>)]

    To create a duplicate copy of the frame, use the copy operation with no parameters:

        >>> duplicate = frame.copy()
        <progress>

        >>> duplicate.inspect()
        [#]  name      age  years
        =========================
        [0]  Thurston   64     26
        [1]  Judy       44     14
        [2]  Emily      37      5
        [3]  Frank      50     18
        [4]  Joe        43     11
        [5]  Ruth       52     21

    Using the copy operation, we can also limit the new frame to just include the 'name' column:

        >>> names = frame.copy("name")
        <progress>

        >>> names.inspect()
        [#]  name
        =============
        [0]  Thurston
        [1]  Judy
        [2]  Emily
        [3]  Frank
        [4]  Joe
        [5]  Ruth

    We could also include a UDF to filter the data that is included in the new frame, and also
    provide a dictionary to rename the column(s) in the new frame.  Here we will use copy to
    create a frame of names for the employees that have over 20 years of service and also rename
    of the 'name' column to 'first_name':

        >>> names = frame.copy({"name" : "first_name"}, lambda row: row.years > 20)
        <progress>

        >>> names.inspect()
        [#]  first_name
        ===============
        [0]  Thurston
        [1]  Ruth

    """
    # Validate arguments before touching the rdd.  callable() admits bound methods,
    # functools.partial objects and callable instances, not just plain functions.
    if where is not None and not callable(where):
        raise ValueError(
            "Unsupported type for 'where' parameter. Must be a function or None, but is: {0}"
            .format(type(where)))

    if isinstance(columns, str):
        columns = [columns]

    schema = self._python.schema

    if columns is None:
        column_indices = list(range(len(schema)))
    elif isinstance(columns, (list, dict)):
        # `in` tests list items or dict keys alike, so one branch serves both.
        column_indices = [i for i, column in enumerate(schema) if column[0] in columns]
    else:
        raise ValueError(
            "Unsupported type for 'columns' parameter. Expected str, list, dict, or None, but was: {0}"
            .format(type(columns)))

    new_rdd = self._python.rdd

    if where is not None:
        # Filter first, while each record still has the full original schema.
        filter_row = Row(schema)

        def copy_func(r):
            filter_row._set_data(r)
            return where(filter_row)

        new_rdd = new_rdd.filter(copy_func)

    if len(column_indices) < len(schema):
        # Map rows to only include the specified columns.  A separate Row is used so
        # the filter and map closures never share a rebound variable.
        select_row = Row(schema)

        def map_func(r):
            select_row._set_data(r)
            return [select_row[i] for i in column_indices]

        new_rdd = new_rdd.map(map_func)

    new_schema = [schema[i] for i in column_indices]

    # If columns are being renamed through a dictionary, alter the schema
    if isinstance(columns, dict):
        new_schema = [(columns[column[0]], column[1])
                      for column in new_schema
                      if column[0] in columns]

    # return new frame with the filtered rdd and new schema
    return self._tc.frame.create(new_rdd, new_schema)
def add_columns(self, func, schema, columns_accessed=None):
    """
    Add columns to current frame.

    Each new cell is filled by evaluating a function against the row it belongs to.

    Notes
    -----

    1)  The row |UDF| ('func') must return a value in the same format as
        specified by the schema.
        See :doc:`/ds_apir`.
    2)  Unicode in column names is not supported and will likely cause the
        drop_frames() method (and others) to fail!

    Parameters
    ----------

    :param func: (UDF) Function which takes a row and produces a value, or list of values, for
                 the new cell(s).
    :param schema: Schema for the column(s) being added: a single (name, type) entry, or a list
                   of such entries when 'func' returns a list.
    :param columns_accessed: Optional list of the column names 'func' reads (see
                             **Optimization** below).  NOTE: this implementation never reads the
                             argument; 'func' always receives the full row.

    Examples
    --------

    Given our frame, let's add a column which has how many years the person has been over 18

    .. code::

        >>> frame = tc.to_frame([['Fred',39,16,'555-1234'],
        ...                      ['Susan',33,3,'555-0202'],
        ...                      ['Thurston',65,26,'555-4510'],
        ...                      ['Judy',44,14,'555-2183']],
        ...                      schema=[('name', str), ('age', int), ('tenure', int), ('phone', str)])

        >>> frame.inspect()
        [#]  name      age  tenure  phone
        ====================================
        [0]  Fred       39      16  555-1234
        [1]  Susan      33       3  555-0202
        [2]  Thurston   65      26  555-4510
        [3]  Judy       44      14  555-2183

        >>> frame.add_columns(lambda row: row.age - 18, ('adult_years', int))

        >>> frame.inspect()
        [#]  name      age  tenure  phone     adult_years
        =================================================
        [0]  Fred       39      16  555-1234           21
        [1]  Susan      33       3  555-0202           15
        [2]  Thurston   65      26  555-4510           47
        [3]  Judy       44      14  555-2183           26

    Multiple columns can be added at the same time.  Let's add percentage of
    life and percentage of adult life in one call, which is more efficient.

    .. code::

        >>> frame.add_columns(lambda row: [row.tenure / float(row.age), row.tenure / float(row.adult_years)], [("of_age", float), ("of_adult", float)])
        >>> frame.inspect(round=2)
        [#]  name      age  tenure  phone     adult_years  of_age  of_adult
        ===================================================================
        [0]  Fred       39      16  555-1234           21    0.41      0.76
        [1]  Susan      33       3  555-0202           15    0.09      0.20
        [2]  Thurston   65      26  555-4510           47    0.40      0.55
        [3]  Judy       44      14  555-2183           26    0.32      0.54

    Note that the function returns a list, and therefore the schema also needs to be a list.

    It is not necessary to use lambda syntax, any function will do, as long as it takes a single
    row argument.  We can also call other local functions within.

    Let's add a column which shows the amount of person's name based on their adult tenure
    percentage.

        >>> def percentage_of_string(string, percentage):
        ...     '''returns a substring of the given string according to the given percentage'''
        ...     substring_len = int(percentage * len(string))
        ...     return string[:substring_len]

        >>> def add_name_by_adult_tenure(row):
        ...     return percentage_of_string(row.name, row.of_adult)

        >>> frame.add_columns(add_name_by_adult_tenure, ('tenured_name', unicode))

        <skip>
        >>> frame
        Frame "example_frame"
        row_count = 4
        schema = [name:unicode, age:int32, tenure:int32, phone:unicode, adult_years:int32, of_age:float32, of_adult:float32, tenured_name:unicode]
        status = ACTIVE  (last_read_date = -etc-)
        </skip>

        >>> frame.inspect(columns=['name', 'of_adult', 'tenured_name'], round=2)
        [#]  name      of_adult  tenured_name
        =====================================
        [0]  Fred          0.76  Fre
        [1]  Susan         0.20  S
        [2]  Thurston      0.55  Thur
        [3]  Judy          0.54  Ju

    **Optimization** - If we know up front which columns our row function will access, we can
    tell add_columns to speed up the execution by working on only the limited feature set rather
    than the entire row.

    Let's add a name based on tenure percentage of age.  We know we're only going to use columns
    'name' and 'of_age'.

    .. code::

        >>> frame.add_columns(lambda row: percentage_of_string(row.name, row.of_age),
        ...                   ('tenured_name_age', unicode),
        ...                   columns_accessed=['name', 'of_age'])

        >>> frame.inspect(round=2)
        [#]  name      age  tenure  phone     adult_years  of_age  of_adult
        ===================================================================
        [0]  Fred       39      16  555-1234           21    0.41      0.76
        [1]  Susan      33       3  555-0202           15    0.09      0.20
        [2]  Thurston   65      26  555-4510           47    0.40      0.55
        [3]  Judy       44      14  555-2183           26    0.32      0.54
        <blankline>
        [#]  tenured_name  tenured_name_age
        ===================================
        [0]  Fre           F
        [1]  S
        [2]  Thur          Thu
        [3]  Ju            J

    More information on a row |UDF| can be found at :doc:`/ds_apir`

    """
    # For further examples, see :ref:`example_frame.add_columns`.

    # A single Row wrapper is re-pointed at each record before the UDF sees it.
    row_view = Row(self.schema)

    def new_cells(record):
        row_view._set_data(record)
        return func(row_view)

    if not isinstance(schema, list):
        # Single column: the UDF yields one value, appended to the record as a one-item list.
        widened = self._python.rdd.map(lambda record: record + [new_cells(record)])
        self._python.rdd = widened
        self._python.schema.append(schema)
        return

    # Several columns: the UDF yields a list which is concatenated onto the record.
    widened = self._python.rdd.map(lambda record: record + new_cells(record))
    self._python.rdd = widened
    self._python.schema.extend(schema)