def __init__(self, tc, source, schema=None, validate_schema=False):
    self._tc = tc
    if self._is_scala_frame(source):
        self._frame = source
    elif self.is_scala_rdd(source):
        scala_schema = schema_to_scala(tc.sc, schema)
        self._frame = self.create_scala_frame(tc.sc, source, scala_schema)
    elif self.is_scala_dataframe(source):
        self._frame = self.create_scala_frame_from_scala_dataframe(tc.sc, source)
    elif isinstance(source, DataFrame):
        self._frame = self.create_scala_frame_from_scala_dataframe(tc.sc, source._jdf)
    elif isinstance(source, PythonFrame):
        self._frame = source
    else:
        if not isinstance(source, RDD):
            if not isinstance(source, list) or (len(source) > 0 and any(not isinstance(row, (list, tuple)) for row in source)):
                raise TypeError("Invalid data source. The data parameter must be a 2-dimensional list (list of row data) or an RDD.")
            inferred_schema = False
            if isinstance(schema, list):
                if all(isinstance(item, basestring) for item in schema):
                    # check if schema is just a list of column names (versus string and data type tuples)
                    schema = self._infer_schema(source, schema)
                    inferred_schema = True
                elif not all(isinstance(item, tuple) and len(item) == 2 and isinstance(item[0], basestring) for item in schema):
                    raise TypeError("Invalid schema. Expected a list of tuples (str, type) with the column name and data type, but received type %s." % type(schema))
                # check for duplicate column names
                column_names = [col[0] for col in schema]
                duplicate_column_names = set([col for col in column_names if column_names.count(col) > 1])
                if len(duplicate_column_names) > 0:
                    raise ValueError("Invalid schema, column names cannot be duplicated: %s" % ", ".join(duplicate_column_names))
            elif schema is None:
                schema = self._infer_schema(source)
                inferred_schema = True
            else:
                # Schema is not a list or None
                raise TypeError("Invalid schema type: %s. Expected a list of tuples (str, type) with the column name and data type." % type(schema))
            for item in schema:
                if not self._is_supported_datatype(item[1]):
                    if inferred_schema:
                        raise TypeError("The %s data type was found when inferring the schema, and it is not a "
                                        "supported data type. Instead, specify a schema that uses a supported data "
                                        "type, and enable validate_schema so that the data is converted to the proper "
                                        "data type.\n\nInferred schema: %s\n\nSupported data types: %s"
                                        % (str(item[1]), str(schema), dtypes.dtypes))
                    else:
                        raise TypeError("Invalid schema. %s is not a supported data type.\n\nSupported data types: %s"
                                        % (str(item[1]), dtypes.dtypes))
            source = tc.sc.parallelize(source)
        if schema and validate_schema:
            # Validate schema by going through the data and checking the data type and attempting to parse it
            validate_schema_result = self.validate_pyrdd_schema(source, schema)
            source = validate_schema_result.validated_rdd
            logger.debug("%s values were unable to be parsed to the schema's data type." % validate_schema_result.bad_value_count)
        self._frame = PythonFrame(source, schema)
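# Usage sketch (not part of the sparktk source): the constructor above accepts a Scala
# Frame, a Scala RDD, a Scala or pyspark DataFrame, a PythonFrame, a pyspark RDD, or a
# plain 2-dimensional Python list. A minimal sketch of the list path, assuming a running
# TkContext `tc` is available (e.g. via `import sparktk; tc = sparktk.TkContext()`):

data = [["Alice", 33], ["Bob", 40]]

# explicit schema: a list of (column name, data type) tuples
frame1 = tc.frame.create(data, schema=[("name", str), ("age", int)])

# column names only: the data type of each column is inferred ("age" becomes int here)
frame2 = tc.frame.create(data, schema=["name", "age"])

# no schema at all: columns are named C0, C1, ... and types are inferred
frame3 = tc.frame.create(data)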
class Frame(object):
    def __init__(self, tc, source, schema=None, validate_schema=False):
        self._tc = tc
        if self._is_scala_frame(source):
            self._frame = source
        elif self.is_scala_rdd(source):
            scala_schema = schema_to_scala(tc.sc, schema)
            self._frame = self.create_scala_frame(tc.sc, source, scala_schema)
        elif self.is_scala_dataframe(source):
            self._frame = self.create_scala_frame_from_scala_dataframe(tc.sc, source)
        elif isinstance(source, DataFrame):
            self._frame = self.create_scala_frame_from_scala_dataframe(tc.sc, source._jdf)
        elif isinstance(source, PythonFrame):
            self._frame = source
        else:
            if not isinstance(source, RDD):
                if not isinstance(source, list) or (len(source) > 0 and any(not isinstance(row, (list, tuple)) for row in source)):
                    raise TypeError("Invalid data source. The data parameter must be a 2-dimensional list (list of row data) or an RDD.")
                inferred_schema = False
                if isinstance(schema, list):
                    if all(isinstance(item, basestring) for item in schema):
                        # check if schema is just a list of column names (versus string and data type tuples)
                        schema = self._infer_schema(source, schema)
                        inferred_schema = True
                    elif not all(isinstance(item, tuple) and len(item) == 2 and isinstance(item[0], basestring) for item in schema):
                        raise TypeError("Invalid schema. Expected a list of tuples (str, type) with the column name and data type, but received type %s." % type(schema))
                    # check for duplicate column names
                    column_names = [col[0] for col in schema]
                    duplicate_column_names = set([col for col in column_names if column_names.count(col) > 1])
                    if len(duplicate_column_names) > 0:
                        raise ValueError("Invalid schema, column names cannot be duplicated: %s" % ", ".join(duplicate_column_names))
                elif schema is None:
                    schema = self._infer_schema(source)
                    inferred_schema = True
                else:
                    # Schema is not a list or None
                    raise TypeError("Invalid schema type: %s. Expected a list of tuples (str, type) with the column name and data type." % type(schema))
                for item in schema:
                    if not self._is_supported_datatype(item[1]):
                        if inferred_schema:
                            raise TypeError("The %s data type was found when inferring the schema, and it is not a "
                                            "supported data type. Instead, specify a schema that uses a supported data "
                                            "type, and enable validate_schema so that the data is converted to the proper "
                                            "data type.\n\nInferred schema: %s\n\nSupported data types: %s"
                                            % (str(item[1]), str(schema), dtypes.dtypes))
                        else:
                            raise TypeError("Invalid schema. %s is not a supported data type.\n\nSupported data types: %s"
                                            % (str(item[1]), dtypes.dtypes))
                source = tc.sc.parallelize(source)
            if schema and validate_schema:
                # Validate schema by going through the data and checking the data type and attempting to parse it
                validate_schema_result = self.validate_pyrdd_schema(source, schema)
                source = validate_schema_result.validated_rdd
                logger.debug("%s values were unable to be parsed to the schema's data type." % validate_schema_result.bad_value_count)
            # If the schema contains the matrix data type, then apply the type coercer to convert list[list] to numpy ndarray
            map_source = MatrixCoercion.schema_is_coercible(source, list(schema))
            self._frame = PythonFrame(map_source, schema)

    def _merge_types(self, type_list_a, type_list_b):
        """
        Merges two lists of data types

        :param type_list_a: First list of data types to merge
        :param type_list_b: Second list of data types to merge
        :return: List of merged data types
        """
        if not isinstance(type_list_a, list) or not isinstance(type_list_b, list):
            raise TypeError("Unable to generate schema, because schema is not a list.")
        if len(type_list_a) != len(type_list_b):
            raise ValueError("Length of each row must be the same (found rows with lengths: %s and %s)." % (len(type_list_a), len(type_list_b)))
        return [dtypes._DataTypes.merge_types(type_list_a[i], type_list_b[i]) for i in xrange(0, len(type_list_a))]

    def _infer_types_for_row(self, row):
        """
        Returns a list of data types for the data in the specified row

        :param row: List or Row of data
        :return: List of data types
        """
        inferred_types = []
        for item in row:
            if item is None:
                inferred_types.append(int)
            elif not isinstance(item, list):
                inferred_types.append(type(item))
            else:
                inferred_types.append(dtypes.vector(len(item)))
        return inferred_types

    def _infer_schema(self, data, column_names=[], sample_size=100):
        """
        Infers the schema based on the data in the RDD.

        :param sc: Spark Context
        :param data: Data used to infer schema
        :param column_names: Optional column names to use in the schema. If no column names are provided,
                             columns are given numbered names. If there are more columns in the RDD than
                             there are in the column_names list, remaining columns will be numbered.
        :param sample_size: Number of rows to check when inferring the schema. Defaults to 100.
        :return: Schema
        """
        inferred_schema = []
        if isinstance(data, list):
            if len(data) > 0:
                # get the schema for the first row
                data_types = self._infer_types_for_row(data[0])
                sample_size = min(sample_size, len(data))
                for i in xrange(1, sample_size):
                    data_types = self._merge_types(data_types, self._infer_types_for_row(data[i]))
                for i, data_type in enumerate(data_types):
                    column_name = "C%s" % i
                    if len(column_names) > i:
                        column_name = column_names[i]
                    inferred_schema.append((column_name, data_type))
        else:
            raise TypeError("Unable to infer schema, because the data provided is not a list.")
        return inferred_schema

    def _is_supported_datatype(self, data_type):
        """
        Returns True if the specified data_type is supported.
        """
        supported_primitives = [int, float, long, str, unicode]
        if data_type in supported_primitives:
            return True
        elif data_type is dtypes.datetime:
            return True
        elif type(data_type) is dtypes.vector:
            return True
        elif data_type is dtypes.matrix:
            return True
        else:
            return False

    def validate_pyrdd_schema(self, pyrdd, schema):
        if isinstance(pyrdd, RDD):
            schema_length = len(schema)
            num_bad_values = self._tc.sc.accumulator(0)

            def validate_schema(row, accumulator):
                data = []
                if len(row) != schema_length:
                    raise ValueError("Length of the row (%s) does not match the schema length (%s)." % (len(row), len(schema)))
                for index, column in enumerate(schema):
                    data_type = column[1]
                    try:
                        if row[index] is not None:
                            data.append(dtypes.dtypes.cast(row[index], data_type))
                    except:
                        data.append(None)
                        accumulator += 1
                return data

            validated_rdd = pyrdd.map(lambda row: validate_schema(row, num_bad_values))
            # Force the rdd to load, so that we can get a bad value count
            validated_rdd.count()
            return SchemaValidationReturn(validated_rdd, num_bad_values.value)
        else:
            raise TypeError("Unable to validate schema, because the pyrdd provided is not an RDD.")

    @staticmethod
    def create_scala_frame(sc, scala_rdd, scala_schema):
        """call constructor in JVM"""
        return sc._jvm.org.trustedanalytics.sparktk.frame.Frame(scala_rdd, scala_schema, False)

    @staticmethod
    def create_scala_frame_from_scala_dataframe(sc, scala_dataframe):
        """call constructor in JVM"""
        return sc._jvm.org.trustedanalytics.sparktk.frame.Frame(scala_dataframe)

    @staticmethod
    def _from_scala(tc, scala_frame):
        """creates a python Frame for the given scala Frame"""
        return Frame(tc, scala_frame)

    def _frame_to_scala(self, python_frame):
        """converts a PythonFrame to a Scala Frame"""
        scala_schema = schema_to_scala(self._tc.sc, python_frame.schema)
        scala_rdd = self._tc.sc._jvm.org.trustedanalytics.sparktk.frame.rdd.PythonJavaRdd.pythonToScala(python_frame.rdd._jrdd, scala_schema)
        return self.create_scala_frame(self._tc.sc, scala_rdd, scala_schema)

    def _is_scala_frame(self, item):
        return self._tc._jutils.is_jvm_instance_of(item, self._tc.sc._jvm.org.trustedanalytics.sparktk.frame.Frame)

    def is_scala_rdd(self, item):
        return self._tc._jutils.is_jvm_instance_of(item, self._tc.sc._jvm.org.apache.spark.rdd.RDD)

    def is_scala_dataframe(self, item):
        return self._tc._jutils.is_jvm_instance_of(item, self._tc.sc._jvm.org.apache.spark.sql.DataFrame)

    def is_python_rdd(self, item):
        return isinstance(item, RDD)

    @property
    def _is_scala(self):
        """answers whether the current frame is backed by a Scala Frame"""
        return self._is_scala_frame(self._frame)

    @property
    def _is_python(self):
        """answers whether the current frame is backed by a _PythonFrame"""
        return not self._is_scala

    @property
    def _scala(self):
        """gets frame backend as Scala Frame, causes conversion if it is currently not"""
        if self._is_python:
            # If the schema contains the matrix data type, then apply type_coercer_pymlib to convert
            # ndarray to pymlib DenseMatrix for serialization purposes on the Java side
            self._frame.rdd = MatrixCoercion.schema_is_coercible(self._frame.rdd, list(self._frame.schema), True)
            # convert PythonFrame to a Scala Frame
            scala_schema = schema_to_scala(self._tc.sc, self._frame.schema)
            scala_rdd = self._tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.pythonToScala(self._frame.rdd._jrdd, scala_schema)
            self._frame = self.create_scala_frame(self._tc.sc, scala_rdd, scala_schema)
        return self._frame

    @property
    def _python(self):
        """gets frame backend as _PythonFrame, causes conversion if it is currently not"""
        if self._is_scala:
            # convert Scala Frame to a PythonFrame
            scala_schema = self._frame.schema()
            java_rdd = self._tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.scalaToPython(self._frame.rdd())
            python_schema = schema_to_python(self._tc.sc, scala_schema)
            python_rdd = RDD(java_rdd, self._tc.sc)
            # If the schema contains the matrix data type, then apply the type coercer to convert list[list] to numpy ndarray
            map_python_rdd = MatrixCoercion.schema_is_coercible(python_rdd, list(python_schema))
            self._frame = PythonFrame(map_python_rdd, python_schema)
        return self._frame

    ##########################################################################
    # API
    ##########################################################################

    @property
    def rdd(self):
        """pyspark RDD (causes conversion if currently backed by a Scala RDD)"""
        return self._python.rdd

    @property
    def schema(self):
        if self._is_scala:
            return schema_to_python(self._tc.sc, self._frame.schema())  # need ()'s on schema because it's a def in scala
        return self._frame.schema

    @property
    def dataframe(self):
        return DataFrame(self._scala.dataframe(), self._tc.sql_context)

    @property
    def column_names(self):
        """
        Column identifications in the current frame.

        :return: list of names of all the frame's columns

        Returns the names of the columns of the current frame.

        Examples
        --------
        <skip>
        >>> frame.column_names
        [u'name', u'age', u'tenure', u'phone']
        </skip>
        """
        return [name for name, data_type in self.schema]

    def append_csv_file(self, file_name, schema, separator=','):
        self._scala.appendCsvFile(file_name, schema_to_scala(self._tc.sc, schema), separator)

    def export_to_csv(self, file_name):
        self._scala.exportToCsv(file_name)

    # Frame Operations

    from sparktk.frame.ops.add_columns import add_columns
    from sparktk.frame.ops.append import append
    from sparktk.frame.ops.assign_sample import assign_sample
    from sparktk.frame.ops.bin_column import bin_column
    from sparktk.frame.ops.binary_classification_metrics import binary_classification_metrics
    from sparktk.frame.ops.box_cox import box_cox
    from sparktk.frame.ops.categorical_summary import categorical_summary
    from sparktk.frame.ops.collect import collect
    from sparktk.frame.ops.column_median import column_median
    from sparktk.frame.ops.column_mode import column_mode
    from sparktk.frame.ops.column_summary_statistics import column_summary_statistics
    from sparktk.frame.ops.copy import copy
    from sparktk.frame.ops.correlation import correlation
    from sparktk.frame.ops.correlation_matrix import correlation_matrix
    from sparktk.frame.ops.count import count
    from sparktk.frame.ops.covariance import covariance
    from sparktk.frame.ops.covariance_matrix import covariance_matrix
    from sparktk.frame.ops.cumulative_percent import cumulative_percent
    from sparktk.frame.ops.cumulative_sum import cumulative_sum
    from sparktk.frame.ops.dot_product import dot_product
    from sparktk.frame.ops.drop_columns import drop_columns
    from sparktk.frame.ops.drop_duplicates import drop_duplicates
    from sparktk.frame.ops.drop_rows import drop_rows
    from sparktk.frame.ops.ecdf import ecdf
    from sparktk.frame.ops.entropy import entropy
    from sparktk.frame.ops.export_data import export_to_jdbc, export_to_json, export_to_hbase, export_to_hive
    from sparktk.frame.ops.filter import filter
    from sparktk.frame.ops.flatten_columns import flatten_columns
    from sparktk.frame.ops.group_by import group_by
    from sparktk.frame.ops.histogram import histogram
    from sparktk.frame.ops.inspect import inspect
    from sparktk.frame.ops.join_inner import join_inner
    from sparktk.frame.ops.join_left import join_left
    from sparktk.frame.ops.join_right import join_right
    from sparktk.frame.ops.join_outer import join_outer
    from sparktk.frame.ops.map_columns import map_columns
    from sparktk.frame.ops.matrix_covariance_matrix import matrix_covariance_matrix
    from sparktk.frame.ops.matrix_pca import matrix_pca
    from sparktk.frame.ops.matrix_svd import matrix_svd
    from sparktk.frame.ops.multiclass_classification_metrics import multiclass_classification_metrics
    from sparktk.frame.ops.power_iteration_clustering import power_iteration_clustering
    from sparktk.frame.ops.quantile_bin_column import quantile_bin_column
    from sparktk.frame.ops.quantiles import quantiles
    from sparktk.frame.ops.rename_columns import rename_columns
    from sparktk.frame.ops.reverse_box_cox import reverse_box_cox
    from sparktk.frame.ops.save import save
    from sparktk.frame.ops.sort import sort
    from sparktk.frame.ops.sortedk import sorted_k
    from sparktk.frame.ops.take import take
    from sparktk.frame.ops.tally import tally
    from sparktk.frame.ops.tally_percent import tally_percent
    from sparktk.frame.ops.timeseries_augmented_dickey_fuller_test import timeseries_augmented_dickey_fuller_test
    from sparktk.frame.ops.timeseries_breusch_godfrey_test import timeseries_breusch_godfrey_test
    from sparktk.frame.ops.timeseries_breusch_pagan_test import timeseries_breusch_pagan_test
    from sparktk.frame.ops.timeseries_durbin_watson_test import timeseries_durbin_watson_test
    from sparktk.frame.ops.timeseries_from_observations import timeseries_from_observations
    from sparktk.frame.ops.timeseries_slice import timeseries_slice
    from sparktk.frame.ops.to_pandas import to_pandas
    from sparktk.frame.ops.topk import top_k
    from sparktk.frame.ops.unflatten_columns import unflatten_columns
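# The validate_pyrdd_schema pattern above (map over each row, attempt to cast each value,
# count failures in a Spark accumulator, then force an action so the count is populated)
# also works outside sparktk. A minimal standalone sketch with plain PySpark -- the schema
# format and the cast rule here are simplified stand-ins for dtypes.dtypes.cast:

from pyspark import SparkContext

sc = SparkContext.getOrCreate()
schema = [("name", str), ("age", int)]
rows = sc.parallelize([["Alice", "33"], ["Bob", "forty"]])
bad_values = sc.accumulator(0)

def cast_row(row, accumulator):
    data = []
    for value, (_, data_type) in zip(row, schema):
        try:
            data.append(data_type(value) if value is not None else None)
        except (TypeError, ValueError):
            data.append(None)   # replace unparseable values with None...
            accumulator += 1    # ...and count them for the driver
    return data

validated = rows.map(lambda row: cast_row(row, bad_values))
validated.count()        # accumulators are only updated after an action runs
print(bad_values.value)  # 1 -- "forty" could not be cast to int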
class Frame(object):
    def __init__(self, tc, source, schema=None, validate_schema=False):
        self._tc = tc
        if self._is_scala_frame(source):
            self._frame = source
        elif self.is_scala_rdd(source):
            scala_schema = schema_to_scala(tc.sc, schema)
            self._frame = self.create_scala_frame(tc.sc, source, scala_schema)
        else:
            if not isinstance(source, RDD):
                if isinstance(schema, list):
                    if all(isinstance(item, basestring) for item in schema):
                        # check if schema is just a list of column names (versus string and data type tuples)
                        schema = self._infer_schema(source, schema)
                    elif not all(isinstance(item, tuple) and len(item) == 2 and isinstance(item[0], str) for item in schema):
                        raise TypeError("Invalid schema. Expected a list of tuples (str, type) with the column name and data type, but received type %s." % type(schema))
                    else:
                        for item in schema:
                            if not self._is_supported_datatype(item[1]):
                                raise TypeError("Invalid schema. %s is not a supported data type." % str(item[1]))
                elif schema is None:
                    schema = self._infer_schema(source)
                else:
                    # Schema is not a list or None
                    raise TypeError("Invalid schema type: %s. Expected a list of tuples (str, type) with the column name and data type." % type(schema))
                source = tc.sc.parallelize(source)
            if schema and validate_schema:
                # Validate schema by going through the data and checking the data type and attempting to parse it
                validate_schema_result = self.validate_pyrdd_schema(source, schema)
                source = validate_schema_result.validated_rdd
                logger.debug("%s values were unable to be parsed to the schema's data type." % validate_schema_result.bad_value_count)
            self._frame = PythonFrame(source, schema)

    def _merge_types(self, type_list_a, type_list_b):
        """
        Merges two lists of data types

        :param type_list_a: First list of data types to merge
        :param type_list_b: Second list of data types to merge
        :return: List of merged data types
        """
        if not isinstance(type_list_a, list) or not isinstance(type_list_b, list):
            raise TypeError("Unable to generate schema, because schema is not a list.")
        if len(type_list_a) != len(type_list_b):
            raise ValueError("Length of each row must be the same (found rows with lengths: %s and %s)." % (len(type_list_a), len(type_list_b)))
        return [dtypes._DataTypes.merge_types(type_list_a[i], type_list_b[i]) for i in xrange(0, len(type_list_a))]

    def _infer_types_for_row(self, row):
        """
        Returns a list of data types for the data in the specified row

        :param row: List or Row of data
        :return: List of data types
        """
        inferred_types = []
        for item in row:
            if not isinstance(item, list):
                inferred_types.append(type(item))
            else:
                inferred_types.append(dtypes.vector(len(item)))
        return inferred_types

    def _infer_schema(self, data, column_names=[], sample_size=100):
        """
        Infers the schema based on the data in the RDD.

        :param sc: Spark Context
        :param data: Data used to infer schema
        :param column_names: Optional column names to use in the schema. If no column names are provided,
                             columns are given numbered names. If there are more columns in the RDD than
                             there are in the column_names list, remaining columns will be numbered.
        :param sample_size: Number of rows to check when inferring the schema. Defaults to 100.
        :return: Schema
        """
        inferred_schema = []
        if isinstance(data, list):
            if len(data) > 0:
                # get the schema for the first row
                data_types = self._infer_types_for_row(data[0])
                sample_size = min(sample_size, len(data))
                for i in xrange(1, sample_size):
                    data_types = self._merge_types(data_types, self._infer_types_for_row(data[i]))
                for i, data_type in enumerate(data_types):
                    column_name = "C%s" % i
                    if len(column_names) > i:
                        column_name = column_names[i]
                    inferred_schema.append((column_name, data_type))
        else:
            raise TypeError("Unable to infer schema, because the data provided is not a list.")
        return inferred_schema

    def _is_supported_datatype(self, data_type):
        """
        Returns True if the specified data_type is supported.
        """
        supported_primitives = [int, float, long, str, unicode]
        if data_type in supported_primitives:
            return True
        elif data_type is dtypes.datetime:
            return True
        elif type(data_type) is dtypes.vector:
            return True
        else:
            return False

    def validate_pyrdd_schema(self, pyrdd, schema):
        if isinstance(pyrdd, RDD):
            schema_length = len(schema)
            num_bad_values = self._tc.sc.accumulator(0)

            def validate_schema(row, accumulator):
                data = []
                if len(row) != schema_length:
                    raise ValueError("Length of the row (%s) does not match the schema length (%s)." % (len(row), len(schema)))
                for index, column in enumerate(schema):
                    data_type = column[1]
                    try:
                        if row[index] is not None:
                            data.append(dtypes.dtypes.cast(row[index], data_type))
                    except:
                        data.append(None)
                        accumulator += 1
                return data

            validated_rdd = pyrdd.map(lambda row: validate_schema(row, num_bad_values))
            # Force the rdd to load, so that we can get a bad value count
            validated_rdd.count()
            return SchemaValidationReturn(validated_rdd, num_bad_values.value)
        else:
            raise TypeError("Unable to validate schema, because the pyrdd provided is not an RDD.")

    @staticmethod
    def create_scala_frame(sc, scala_rdd, scala_schema):
        """call constructor in JVM"""
        return sc._jvm.org.trustedanalytics.sparktk.frame.Frame(scala_rdd, scala_schema, False)

    @staticmethod
    def load(tc, scala_frame):
        """creates a python Frame for the given scala Frame"""
        return Frame(tc, scala_frame)

    def _frame_to_scala(self, python_frame):
        """converts a PythonFrame to a Scala Frame"""
        scala_schema = schema_to_scala(self._tc.sc, python_frame.schema)
        scala_rdd = self._tc.sc._jvm.org.trustedanalytics.sparktk.frame.rdd.PythonJavaRdd.pythonToScala(python_frame.rdd._jrdd, scala_schema)
        return self.create_scala_frame(self._tc.sc, scala_rdd, scala_schema)

    def _is_scala_frame(self, item):
        return self._tc._jutils.is_jvm_instance_of(item, self._tc.sc._jvm.org.trustedanalytics.sparktk.frame.Frame)

    def is_scala_rdd(self, item):
        return self._tc._jutils.is_jvm_instance_of(item, self._tc.sc._jvm.org.apache.spark.rdd.RDD)

    def is_python_rdd(self, item):
        return isinstance(item, RDD)

    @property
    def _is_scala(self):
        """answers whether the current frame is backed by a Scala Frame"""
        return self._is_scala_frame(self._frame)

    @property
    def _is_python(self):
        """answers whether the current frame is backed by a _PythonFrame"""
        return not self._is_scala

    @property
    def _scala(self):
        """gets frame backend as Scala Frame, causes conversion if it is currently not"""
        if self._is_python:
            # convert PythonFrame to a Scala Frame
            scala_schema = schema_to_scala(self._tc.sc, self._frame.schema)
            scala_rdd = self._tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.pythonToScala(self._frame.rdd._jrdd, scala_schema)
            self._frame = self.create_scala_frame(self._tc.sc, scala_rdd, scala_schema)
        return self._frame

    @property
    def _python(self):
        """gets frame backend as _PythonFrame, causes conversion if it is currently not"""
        if self._is_scala:
            # convert Scala Frame to a PythonFrame
            scala_schema = self._frame.schema()
            java_rdd = self._tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.scalaToPython(self._frame.rdd())
            python_schema = schema_to_python(self._tc.sc, scala_schema)
            python_rdd = RDD(java_rdd, self._tc.sc)
            self._frame = PythonFrame(python_rdd, python_schema)
        return self._frame

    ##########################################################################
    # API
    ##########################################################################

    @property
    def rdd(self):
        """pyspark RDD (causes conversion if currently backed by a Scala RDD)"""
        return self._python.rdd

    @property
    def schema(self):
        if self._is_scala:
            return schema_to_python(self._tc.sc, self._frame.schema())  # need ()'s on schema because it's a def in scala
        return self._frame.schema

    @property
    def column_names(self):
        """
        Column identifications in the current frame.

        :return: list of names of all the frame's columns

        Returns the names of the columns of the current frame.

        Examples
        --------

        >>> frame.column_names
        [u'name', u'age', u'tenure', u'phone']

        """
        return [name for name, data_type in self.schema]

    @property
    def row_count(self):
        """
        Number of rows in the current frame.

        :return: The number of rows in the frame

        Counts all of the rows in the frame.

        Examples
        --------
        Get the number of rows:

        <hide>
        >>> frame = tc.frame.create([[item] for item in range(0, 4)],[("a", int)])
        </hide>

        >>> frame.row_count
        4

        """
        if self._is_scala:
            return int(self._scala.rowCount())
        return self.rdd.count()

    def append_csv_file(self, file_name, schema, separator=','):
        self._scala.appendCsvFile(file_name, schema_to_scala(self._tc.sc, schema), separator)

    def export_to_csv(self, file_name):
        self._scala.exportToCsv(file_name)

    # Frame Operations

    from sparktk.frame.ops.add_columns import add_columns
    from sparktk.frame.ops.append import append
    from sparktk.frame.ops.assign_sample import assign_sample
    from sparktk.frame.ops.bin_column import bin_column
    from sparktk.frame.ops.binary_classification_metrics import binary_classification_metrics
    from sparktk.frame.ops.categorical_summary import categorical_summary
    from sparktk.frame.ops.column_median import column_median
    from sparktk.frame.ops.column_mode import column_mode
    from sparktk.frame.ops.column_summary_statistics import column_summary_statistics
    from sparktk.frame.ops.copy import copy
    from sparktk.frame.ops.correlation import correlation
    from sparktk.frame.ops.correlation_matrix import correlation_matrix
    from sparktk.frame.ops.count import count
    from sparktk.frame.ops.covariance import covariance
    from sparktk.frame.ops.covariance_matrix import covariance_matrix
    from sparktk.frame.ops.cumulative_percent import cumulative_percent
    from sparktk.frame.ops.cumulative_sum import cumulative_sum
    from sparktk.frame.ops.dot_product import dot_product
    from sparktk.frame.ops.drop_columns import drop_columns
    from sparktk.frame.ops.drop_duplicates import drop_duplicates
    from sparktk.frame.ops.drop_rows import drop_rows
    from sparktk.frame.ops.ecdf import ecdf
    from sparktk.frame.ops.entropy import entropy
    from sparktk.frame.ops.export_data import export_to_jdbc, export_to_json, export_to_hbase, export_to_hive
    from sparktk.frame.ops.filter import filter
    from sparktk.frame.ops.flatten_columns import flatten_columns
    from sparktk.frame.ops.histogram import histogram
    from sparktk.frame.ops.inspect import inspect
    from sparktk.frame.ops.join_inner import join_inner
    from sparktk.frame.ops.join_left import join_left
    from sparktk.frame.ops.join_right import join_right
    from sparktk.frame.ops.join_outer import join_outer
    from sparktk.frame.ops.multiclass_classification_metrics import multiclass_classification_metrics
    from sparktk.frame.ops.quantile_bin_column import quantile_bin_column
    from sparktk.frame.ops.quantiles import quantiles
    from sparktk.frame.ops.rename_columns import rename_columns
    from sparktk.frame.ops.save import save
    from sparktk.frame.ops.sort import sort
    from sparktk.frame.ops.sortedk import sorted_k
    from sparktk.frame.ops.take import take
    from sparktk.frame.ops.tally import tally
    from sparktk.frame.ops.tally_percent import tally_percent
    from sparktk.frame.ops.timeseries_from_observations import timeseries_from_observations
    from sparktk.frame.ops.timeseries_slice import timeseries_slice
    from sparktk.frame.ops.topk import top_k
    from sparktk.frame.ops.unflatten_columns import unflatten_columns
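# The inference helpers above sample up to 100 rows, type each cell, and merge the per-row
# type lists column by column. dtypes._DataTypes.merge_types implements the real promotion
# rules; a simplified standalone sketch of the same idea (ints and floats merge to float,
# anything else falls back to string), not the sparktk implementation:

def infer_types_for_row(row):
    return [type(item) for item in row]

def merge_types(a, b):
    if a is b:
        return a
    if {a, b} == {int, float}:
        return float  # ints and floats merge to float
    return str        # otherwise fall back to string

def infer_schema(data, sample_size=100):
    types = infer_types_for_row(data[0])
    for row in data[1:sample_size]:
        types = [merge_types(t, u) for t, u in zip(types, infer_types_for_row(row))]
    return [("C%s" % i, t) for i, t in enumerate(types)]

print(infer_schema([[1, "a"], [2.5, "b"]]))  # [('C0', float), ('C1', str)]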
class Frame(object):
    def __init__(self, tc, source, schema=None):
        self._tc = tc
        if self._is_scala_frame(source):
            self._frame = source
        elif self.is_scala_rdd(source):
            scala_schema = schema_to_scala(tc.sc, schema)
            self._frame = self.create_scala_frame(tc.sc, source, scala_schema)
        else:
            if not isinstance(source, RDD):
                source = tc.sc.parallelize(source)
            if schema:
                self.validate_pyrdd_schema(source, schema)
            self._frame = PythonFrame(source, schema)

    def validate_pyrdd_schema(self, pyrdd, schema):
        pass

    @staticmethod
    def create_scala_frame(sc, scala_rdd, scala_schema):
        """call constructor in JVM"""
        return sc._jvm.org.trustedanalytics.sparktk.frame.Frame(scala_rdd, scala_schema)

    @staticmethod
    def load(tc, scala_frame):
        """creates a python Frame for the given scala Frame"""
        return Frame(tc, scala_frame)

    def _frame_to_scala(self, python_frame):
        """converts a PythonFrame to a Scala Frame"""
        scala_schema = schema_to_scala(self._tc.sc, python_frame.schema)
        scala_rdd = self._tc.sc._jvm.org.trustedanalytics.sparktk.frame.rdd.PythonJavaRdd.pythonToScala(python_frame.rdd._jrdd, scala_schema)
        return self.create_scala_frame(self._tc.sc, scala_rdd, scala_schema)

    def _is_scala_frame(self, item):
        return self._tc._jutils.is_jvm_instance_of(item, self._tc.sc._jvm.org.trustedanalytics.sparktk.frame.Frame)

    def is_scala_rdd(self, item):
        return self._tc._jutils.is_jvm_instance_of(item, self._tc.sc._jvm.org.apache.spark.rdd.RDD)

    def is_python_rdd(self, item):
        return isinstance(item, RDD)

    @property
    def _is_scala(self):
        """answers whether the current frame is backed by a Scala Frame"""
        return self._is_scala_frame(self._frame)

    @property
    def _is_python(self):
        """answers whether the current frame is backed by a _PythonFrame"""
        return not self._is_scala

    @property
    def _scala(self):
        """gets frame backend as Scala Frame, causes conversion if it is currently not"""
        if self._is_python:
            # convert PythonFrame to a Scala Frame
            scala_schema = schema_to_scala(self._tc.sc, self._frame.schema)
            scala_rdd = self._tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.pythonToScala(self._frame.rdd._jrdd, scala_schema)
            self._frame = self.create_scala_frame(self._tc.sc, scala_rdd, scala_schema)
        return self._frame

    @property
    def _python(self):
        """gets frame backend as _PythonFrame, causes conversion if it is currently not"""
        if self._is_scala:
            # convert Scala Frame to a PythonFrame
            scala_schema = self._frame.schema()
            java_rdd = self._tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.scalaToPython(self._frame.rdd())
            python_schema = schema_to_python(self._tc.sc, scala_schema)
            python_rdd = RDD(java_rdd, self._tc.sc)
            self._frame = PythonFrame(python_rdd, python_schema)
        return self._frame

    ##########################################################################
    # API
    ##########################################################################

    @property
    def rdd(self):
        """pyspark RDD (causes conversion if currently backed by a Scala RDD)"""
        return self._python.rdd

    @property
    def schema(self):
        if self._is_scala:
            return schema_to_python(self._tc.sc, self._frame.schema())  # need ()'s on schema because it's a def in scala
        return self._frame.schema

    @property
    def column_names(self):
        """
        Column identifications in the current frame.

        :return: list of names of all the frame's columns

        Returns the names of the columns of the current frame.

        Examples
        --------

        .. code::

            >>> frame.column_names
            [u'name', u'age', u'tenure', u'phone']

        """
        return [name for name, data_type in self.schema]

    @property
    def row_count(self):
        """
        Number of rows in the current frame.

        :return: The number of rows in the frame

        Counts all of the rows in the frame.

        Examples
        --------
        Get the number of rows:

        <hide>
        frame = tc.to_frame([[item] for item in range(0, 4)],[("a", int)])
        </hide>

        .. code::

            >>> frame.row_count
            4

        """
        if self._is_scala:
            return int(self._scala.rowCount())
        return self.rdd.count()

    def append_csv_file(self, file_name, schema, separator=','):
        self._scala.appendCsvFile(file_name, schema_to_scala(self._tc.sc, schema), separator)

    def export_to_csv(self, file_name):
        self._scala.exportToCsv(file_name)

    # Frame Operations

    from sparktk.frame.ops.add_columns import add_columns
    from sparktk.frame.ops.assign_sample import assign_sample
    from sparktk.frame.ops.bin_column import bin_column
    from sparktk.frame.ops.binary_classification_metrics import binary_classification_metrics
    from sparktk.frame.ops.categorical_summary import categorical_summary
    from sparktk.frame.ops.column_median import column_median
    from sparktk.frame.ops.column_mode import column_mode
    from sparktk.frame.ops.column_summary_statistics import column_summary_statistics
    from sparktk.frame.ops.correlation import correlation
    from sparktk.frame.ops.correlation_matrix import correlation_matrix
    from sparktk.frame.ops.count import count
    from sparktk.frame.ops.covariance import covariance
    from sparktk.frame.ops.covariance_matrix import covariance_matrix
    from sparktk.frame.ops.cumulative_percent import cumulative_percent
    from sparktk.frame.ops.cumulative_sum import cumulative_sum
    from sparktk.frame.ops.dot_product import dot_product
    from sparktk.frame.ops.drop_columns import drop_columns
    from sparktk.frame.ops.drop_duplicates import drop_duplicates
    from sparktk.frame.ops.drop_rows import drop_rows
    from sparktk.frame.ops.ecdf import ecdf
    from sparktk.frame.ops.entropy import entropy
    from sparktk.frame.ops.filter import filter
    from sparktk.frame.ops.flatten_columns import flatten_columns
    from sparktk.frame.ops.histogram import histogram
    from sparktk.frame.ops.inspect import inspect
    from sparktk.frame.ops.multiclass_classification_metrics import multiclass_classification_metrics
    from sparktk.frame.ops.quantile_bin_column import quantile_bin_column
    from sparktk.frame.ops.quantiles import quantiles
    from sparktk.frame.ops.rename_columns import rename_columns
    from sparktk.frame.ops.save import save
    from sparktk.frame.ops.sort import sort
    from sparktk.frame.ops.sortedk import sorted_k
    from sparktk.frame.ops.take import take
    from sparktk.frame.ops.tally import tally
    from sparktk.frame.ops.tally_percent import tally_percent
    from sparktk.frame.ops.topk import top_k
    from sparktk.frame.ops.unflatten_columns import unflatten_columns
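# The _scala/_python properties above implement a lazy backend switch: the frame holds
# whichever representation was touched last and converts only on demand, so repeated
# access on the same side is free. A toy sketch of the pattern, independent of Spark
# (the int/str pair stands in for the Scala/Python backends):

class TwoBackendValue(object):
    def __init__(self, value):
        self._value = value  # either an int ("scala side") or a str ("python side")

    @property
    def as_str(self):
        if not isinstance(self._value, str):
            self._value = str(self._value)  # convert once, then stay converted
        return self._value

    @property
    def as_int(self):
        if isinstance(self._value, str):
            self._value = int(self._value)
        return self._value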
class Frame(object):
    def __init__(self, tc, source, schema=None, validate_schema=False):
        """(Private constructor -- use tc.frame.create or other methods available from the TkContext)"""
        self._tc = tc
        if self._is_scala_frame(source):
            self._frame = source
        elif self._is_scala_rdd(source):
            scala_schema = schema_to_scala(tc.sc, schema)
            self._frame = self._create_scala_frame(tc.sc, source, scala_schema)
        elif self._is_scala_dataframe(source):
            self._frame = self._create_scala_frame_from_scala_dataframe(tc.sc, source)
        elif isinstance(source, DataFrame):
            self._frame = self._create_scala_frame_from_scala_dataframe(tc.sc, source._jdf)
        elif isinstance(source, PythonFrame):
            self._frame = source
        else:
            if not isinstance(source, RDD):
                if not isinstance(source, list) or (len(source) > 0 and any(not isinstance(row, (list, tuple)) for row in source)):
                    raise TypeError("Invalid data source. The data parameter must be a 2-dimensional list (list of row data) or an RDD.")
                inferred_schema = False
                if isinstance(schema, list):
                    if all(isinstance(item, basestring) for item in schema):
                        # check if schema is just a list of column names (versus string and data type tuples)
                        schema = self._infer_schema(source, schema)
                        inferred_schema = True
                    elif not all(isinstance(item, tuple) and len(item) == 2 and isinstance(item[0], basestring) for item in schema):
                        raise TypeError("Invalid schema. Expected a list of tuples (str, type) with the column name and data type, but received type %s." % type(schema))
                    # check for duplicate column names
                    column_names = [col[0] for col in schema]
                    duplicate_column_names = set([col for col in column_names if column_names.count(col) > 1])
                    if len(duplicate_column_names) > 0:
                        raise ValueError("Invalid schema, column names cannot be duplicated: %s" % ", ".join(duplicate_column_names))
                elif schema is None:
                    schema = self._infer_schema(source)
                    inferred_schema = True
                else:
                    # Schema is not a list or None
                    raise TypeError("Invalid schema type: %s. Expected a list of tuples (str, type) with the column name and data type." % type(schema))
                for item in schema:
                    if not self._is_supported_datatype(item[1]):
                        if inferred_schema:
                            raise TypeError("The %s data type was found when inferring the schema, and it is not a "
                                            "supported data type. Instead, specify a schema that uses a supported data "
                                            "type, and enable validate_schema so that the data is converted to the proper "
                                            "data type.\n\nInferred schema: %s\n\nSupported data types: %s"
                                            % (str(item[1]), str(schema), dtypes.dtypes))
                        else:
                            raise TypeError("Invalid schema. %s is not a supported data type.\n\nSupported data types: %s"
                                            % (str(item[1]), dtypes.dtypes))
                source = tc.sc.parallelize(source)
            if schema and validate_schema:
                # Validate schema by going through the data and checking the data type and attempting to parse it
                validate_schema_result = self.validate_pyrdd_schema(source, schema)
                source = validate_schema_result.validated_rdd
                logger.debug("%s values were unable to be parsed to the schema's data type." % validate_schema_result.bad_value_count)
            # If the schema contains the matrix data type, then apply the type coercer to convert list[list] to numpy ndarray
            map_source = schema_is_coercible(source, list(schema))
            self._frame = PythonFrame(map_source, schema)

    def _merge_types(self, type_list_a, type_list_b):
        """
        Merges two lists of data types

        :param type_list_a: First list of data types to merge
        :param type_list_b: Second list of data types to merge
        :return: List of merged data types
        """
        if not isinstance(type_list_a, list) or not isinstance(type_list_b, list):
            raise TypeError("Unable to generate schema, because schema is not a list.")
        if len(type_list_a) != len(type_list_b):
            raise ValueError("Length of each row must be the same (found rows with lengths: %s and %s)." % (len(type_list_a), len(type_list_b)))
        return [dtypes._DataTypes.merge_types(type_list_a[i], type_list_b[i]) for i in xrange(0, len(type_list_a))]

    def _infer_types_for_row(self, row):
        """
        Returns a list of data types for the data in the specified row

        :param row: List or Row of data
        :return: List of data types
        """
        inferred_types = []
        for item in row:
            if item is None:
                inferred_types.append(int)
            elif not isinstance(item, list):
                inferred_types.append(type(item))
            else:
                inferred_types.append(dtypes.vector(len(item)))
        return inferred_types

    def _infer_schema(self, data, column_names=[], sample_size=100):
        """
        Infers the schema based on the data in the RDD.

        :param sc: Spark Context
        :param data: Data used to infer schema
        :param column_names: Optional column names to use in the schema. If no column names are provided,
                             columns are given numbered names. If there are more columns in the RDD than
                             there are in the column_names list, remaining columns will be numbered.
        :param sample_size: Number of rows to check when inferring the schema. Defaults to 100.
        :return: Schema
        """
        inferred_schema = []
        if isinstance(data, list):
            if len(data) > 0:
                # get the schema for the first row
                data_types = self._infer_types_for_row(data[0])
                sample_size = min(sample_size, len(data))
                for i in xrange(1, sample_size):
                    data_types = self._merge_types(data_types, self._infer_types_for_row(data[i]))
                for i, data_type in enumerate(data_types):
                    column_name = "C%s" % i
                    if len(column_names) > i:
                        column_name = column_names[i]
                    inferred_schema.append((column_name, data_type))
        else:
            raise TypeError("Unable to infer schema, because the data provided is not a list.")
        return inferred_schema

    def _is_supported_datatype(self, data_type):
        """
        Returns True if the specified data_type is supported.
        """
        supported_primitives = [int, float, long, str, unicode]
        if data_type in supported_primitives:
            return True
        elif data_type is dtypes.datetime:
            return True
        elif type(data_type) is dtypes.vector:
            return True
        elif data_type is dtypes.matrix:
            return True
        else:
            return False

    def validate_pyrdd_schema(self, pyrdd, schema):
        if isinstance(pyrdd, RDD):
            schema_length = len(schema)
            num_bad_values = self._tc.sc.accumulator(0)

            def validate_schema(row, accumulator):
                data = []
                if len(row) != schema_length:
                    raise ValueError("Length of the row (%s) does not match the schema length (%s)." % (len(row), len(schema)))
                for index, column in enumerate(schema):
                    data_type = column[1]
                    try:
                        if row[index] is not None:
                            data.append(dtypes.dtypes.cast(row[index], data_type))
                    except:
                        data.append(None)
                        accumulator += 1
                return data

            validated_rdd = pyrdd.map(lambda row: validate_schema(row, num_bad_values))
            # Force the rdd to load, so that we can get a bad value count
            validated_rdd.count()
            return SchemaValidationReturn(validated_rdd, num_bad_values.value)
        else:
            raise TypeError("Unable to validate schema, because the pyrdd provided is not an RDD.")

    @staticmethod
    def _create_scala_frame(sc, scala_rdd, scala_schema):
        """call constructor in JVM"""
        return sc._jvm.org.trustedanalytics.sparktk.frame.Frame(scala_rdd, scala_schema, False)

    @staticmethod
    def _create_scala_frame_from_scala_dataframe(sc, scala_dataframe):
        """call constructor in JVM"""
        return sc._jvm.org.trustedanalytics.sparktk.frame.Frame(scala_dataframe)

    @staticmethod
    def _from_scala(tc, scala_frame):
        """creates a python Frame for the given scala Frame"""
        return Frame(tc, scala_frame)

    def _frame_to_scala(self, python_frame):
        """converts a PythonFrame to a Scala Frame"""
        scala_schema = schema_to_scala(self._tc.sc, python_frame.schema)
        scala_rdd = self._tc.sc._jvm.org.trustedanalytics.sparktk.frame.rdd.PythonJavaRdd.pythonToScala(python_frame.rdd._jrdd, scala_schema)
        return self._create_scala_frame(self._tc.sc, scala_rdd, scala_schema)

    def _is_scala_frame(self, item):
        return self._tc._jutils.is_jvm_instance_of(item, self._tc.sc._jvm.org.trustedanalytics.sparktk.frame.Frame)

    def _is_scala_rdd(self, item):
        return self._tc._jutils.is_jvm_instance_of(item, self._tc.sc._jvm.org.apache.spark.rdd.RDD)

    def _is_scala_dataframe(self, item):
        return self._tc._jutils.is_jvm_instance_of(item, self._tc.sc._jvm.org.apache.spark.sql.DataFrame)

    def _is_python_rdd(self, item):
        return isinstance(item, RDD)

    @property
    def _is_scala(self):
        """answers whether the current frame is backed by a Scala Frame"""
        answer = self._is_scala_frame(self._frame)
        logger.info("frame._is_scala reference: %s" % answer)
        return answer

    @property
    def _is_python(self):
        """answers whether the current frame is backed by a _PythonFrame"""
        answer = not self._is_scala_frame(self._frame)
        logger.info("frame._is_python reference: %s" % answer)
        return answer

    @property
    def _scala(self):
        """gets frame backend as Scala Frame, causes conversion if it is currently not"""
        if self._is_python:
            logger.info("frame._scala reference: converting frame backend from Python to Scala")
            # If the schema contains the matrix data type, then apply type_coercer_pymlib to convert
            # ndarray to pymlib DenseMatrix for serialization purposes on the Java side
            self._frame.rdd = schema_is_coercible(self._frame.rdd, list(self._frame.schema), True)
            # convert PythonFrame to a Scala Frame
            scala_schema = schema_to_scala(self._tc.sc, self._frame.schema)
            scala_rdd = self._tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.pythonToScala(self._frame.rdd._jrdd, scala_schema)
            self._frame = self._create_scala_frame(self._tc.sc, scala_rdd, scala_schema)
        else:
            logger.info("frame._scala reference: frame already has a scala backend")
        return self._frame

    @property
    def _python(self):
        """gets frame backend as _PythonFrame, causes conversion if it is currently not"""
        if self._is_scala:
            logger.info("frame._python reference: converting frame backend from Scala to Python")
            # convert Scala Frame to a PythonFrame
            scala_schema = self._frame.schema()
            java_rdd = self._tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.scalaToPython(self._frame.rdd())
            python_schema = schema_to_python(self._tc.sc, scala_schema)
            python_rdd = RDD(java_rdd, self._tc.sc)
            # If the schema contains the matrix data type, then apply the type coercer to convert list[list] to numpy ndarray
            map_python_rdd = schema_is_coercible(python_rdd, list(python_schema))
            self._frame = PythonFrame(map_python_rdd, python_schema)
        else:
            logger.info("frame._python reference: frame already has a python backend")
        return self._frame

    ##########################################################################
    # API
    ##########################################################################

    @property
    def rdd(self):
        """pyspark RDD (causes conversion if currently backed by a Scala RDD)"""
        return self._python.rdd

    @property
    def dataframe(self):
        """pyspark DataFrame (causes conversion through Scala)"""
        return DataFrame(self._scala.dataframe(), self._tc.sql_context)

    @property
    def schema(self):
        if self._is_scala:
            return schema_to_python(self._tc.sc, self._frame.schema())  # need ()'s on schema because it's a def in scala
        return self._frame.schema

    @property
    def column_names(self):
        """
        Column identifications in the current frame.

        :return: list of names of all the frame's columns

        Returns the names of the columns of the current frame.

        Examples
        --------
        <skip>
        >>> frame.column_names
        [u'name', u'age', u'tenure', u'phone']
        </skip>
        """
        return [name for name, data_type in self.schema]

    # Frame Operations

    from sparktk.frame.ops.add_columns import add_columns
    from sparktk.frame.ops.append import append
    from sparktk.frame.ops.assign_sample import assign_sample
    from sparktk.frame.ops.bin_column import bin_column
    from sparktk.frame.ops.binary_classification_metrics import binary_classification_metrics
    from sparktk.frame.ops.box_cox import box_cox
    from sparktk.frame.ops.categorical_summary import categorical_summary
    from sparktk.frame.ops.collect import collect
    from sparktk.frame.ops.column_median import column_median
    from sparktk.frame.ops.column_mode import column_mode
    from sparktk.frame.ops.column_summary_statistics import column_summary_statistics
    from sparktk.frame.ops.copy import copy
    from sparktk.frame.ops.correlation import correlation
    from sparktk.frame.ops.correlation_matrix import correlation_matrix
    from sparktk.frame.ops.count import count
    from sparktk.frame.ops.covariance import covariance
    from sparktk.frame.ops.covariance_matrix import covariance_matrix
    from sparktk.frame.ops.cumulative_percent import cumulative_percent
    from sparktk.frame.ops.cumulative_sum import cumulative_sum
    from sparktk.frame.ops.dot_product import dot_product
    from sparktk.frame.ops.drop_columns import drop_columns
    from sparktk.frame.ops.drop_duplicates import drop_duplicates
    from sparktk.frame.ops.drop_rows import drop_rows
    from sparktk.frame.ops.ecdf import ecdf
    from sparktk.frame.ops.entropy import entropy
    from sparktk.frame.ops.export_to_csv import export_to_csv
    from sparktk.frame.ops.export_to_jdbc import export_to_jdbc
    from sparktk.frame.ops.export_to_json import export_to_json
    from sparktk.frame.ops.export_to_hbase import export_to_hbase
    from sparktk.frame.ops.export_to_hive import export_to_hive
    from sparktk.frame.ops.export_to_tensorflow import export_to_tensorflow
    from sparktk.frame.ops.filter import filter
    from sparktk.frame.ops.flatten_columns import flatten_columns
    from sparktk.frame.ops.group_by import group_by
    from sparktk.frame.ops.histogram import histogram
    from sparktk.frame.ops.inspect import inspect, get_inspect
    from sparktk.frame.ops.join_cross import join_cross
    from sparktk.frame.ops.join_inner import join_inner
    from sparktk.frame.ops.join_left import join_left
    from sparktk.frame.ops.join_right import join_right
    from sparktk.frame.ops.join_outer import join_outer
    from sparktk.frame.ops.map_columns import map_columns
    from sparktk.frame.ops.matrix_covariance_matrix import matrix_covariance_matrix
    from sparktk.frame.ops.matrix_pca import matrix_pca
    from sparktk.frame.ops.matrix_svd import matrix_svd
    from sparktk.frame.ops.multiclass_classification_metrics import multiclass_classification_metrics
    from sparktk.frame.ops.power_iteration_clustering import power_iteration_clustering
    from sparktk.frame.ops.quantile_bin_column import quantile_bin_column
    from sparktk.frame.ops.quantiles import quantiles
    from sparktk.frame.ops.rename_columns import rename_columns
    from sparktk.frame.ops.reverse_box_cox import reverse_box_cox
    from sparktk.frame.ops.save import save
    from sparktk.frame.ops.sort import sort
    from sparktk.frame.ops.sortedk import sorted_k
    from sparktk.frame.ops.take import take
    from sparktk.frame.ops.tally import tally
    from sparktk.frame.ops.tally_percent import tally_percent
    from sparktk.frame.ops.timeseries_augmented_dickey_fuller_test import timeseries_augmented_dickey_fuller_test
    from sparktk.frame.ops.timeseries_breusch_godfrey_test import timeseries_breusch_godfrey_test
    from sparktk.frame.ops.timeseries_breusch_pagan_test import timeseries_breusch_pagan_test
    from sparktk.frame.ops.timeseries_durbin_watson_test import timeseries_durbin_watson_test
    from sparktk.frame.ops.timeseries_from_observations import timeseries_from_observations
    from sparktk.frame.ops.timeseries_slice import timeseries_slice
    from sparktk.frame.ops.to_pandas import to_pandas
    from sparktk.frame.ops.topk import top_k
    from sparktk.frame.ops.unflatten_columns import unflatten_columns
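# schema_is_coercible round-trips matrix columns between representations: list-of-lists
# values coming from the JVM become numpy ndarrays on the Python side, and ndarrays become
# pyspark mllib DenseMatrix objects before rows are shipped back to Scala. A toy sketch of
# the two directions for a single cell -- whether the real helper uses exactly this
# DenseMatrix layout is an assumption; it actually maps this over every matrix column of
# every row in the RDD:

import numpy as np
from pyspark.mllib.linalg import DenseMatrix

def to_python(cell):
    """list[list] coming from Scala -> numpy ndarray"""
    return np.array(cell, dtype=np.float64)

def to_scala(cell):
    """numpy ndarray -> mllib DenseMatrix (values stored column-major)"""
    rows, cols = cell.shape
    return DenseMatrix(rows, cols, cell.ravel(order='F'))

m = to_python([[1.0, 2.0], [3.0, 4.0]])
print(to_scala(m).toArray())  # back to the same 2x2 array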
def __init__(self, tc, source, schema=None, validate_schema=False):
    self._tc = tc
    if self._is_scala_frame(source):
        self._frame = source
    elif self.is_scala_rdd(source):
        scala_schema = schema_to_scala(tc.sc, schema)
        self._frame = self.create_scala_frame(tc.sc, source, scala_schema)
    elif self.is_scala_dataframe(source):
        self._frame = self.create_scala_frame_from_scala_dataframe(tc.sc, source)
    elif isinstance(source, DataFrame):
        self._frame = self.create_scala_frame_from_scala_dataframe(tc.sc, source._jdf)
    elif isinstance(source, PythonFrame):
        self._frame = source
    else:
        if not isinstance(source, RDD):
            if not isinstance(source, list) or (len(source) > 0 and any(not isinstance(row, (list, tuple)) for row in source)):
                raise TypeError("Invalid data source. The data parameter must be a 2-dimensional list (list of row data) or an RDD.")
            inferred_schema = False
            if isinstance(schema, list):
                if all(isinstance(item, basestring) for item in schema):
                    # check if schema is just a list of column names (versus string and data type tuples)
                    schema = self._infer_schema(source, schema)
                    inferred_schema = True
                elif not all(isinstance(item, tuple) and len(item) == 2 and isinstance(item[0], basestring) for item in schema):
                    raise TypeError("Invalid schema. Expected a list of tuples (str, type) with the column name and data type, but received type %s." % type(schema))
            elif schema is None:
                schema = self._infer_schema(source)
                inferred_schema = True
            else:
                # Schema is not a list or None
                raise TypeError("Invalid schema type: %s. Expected a list of tuples (str, type) with the column name and data type." % type(schema))
            for item in schema:
                if not self._is_supported_datatype(item[1]):
                    if inferred_schema:
                        raise TypeError("The %s data type was found when inferring the schema, and it is not a "
                                        "supported data type. Instead, specify a schema that uses a supported data "
                                        "type, and enable validate_schema so that the data is converted to the proper "
                                        "data type.\n\nInferred schema: %s\n\nSupported data types: %s"
                                        % (str(item[1]), str(schema), dtypes.dtypes))
                    else:
                        raise TypeError("Invalid schema. %s is not a supported data type.\n\nSupported data types: %s"
                                        % (str(item[1]), dtypes.dtypes))
            source = tc.sc.parallelize(source)
        if schema and validate_schema:
            # Validate schema by going through the data and checking the data type and attempting to parse it
            validate_schema_result = self.validate_pyrdd_schema(source, schema)
            source = validate_schema_result.validated_rdd
            logger.debug("%s values were unable to be parsed to the schema's data type." % validate_schema_result.bad_value_count)
        self._frame = PythonFrame(source, schema)
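# With validate_schema=True, unparseable cells are replaced by None rather than failing
# the load, and the replacement count is logged. A hedged usage sketch, assuming a running
# TkContext `tc`:

frame = tc.frame.create([["1"], ["oops"]], schema=[("a", int)], validate_schema=True)
# frame.take(2) would show [[1], [None]] -- "oops" could not be cast to int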
class Frame(object):

    def __init__(self, tc, source, schema=None):
        self._tc = tc
        if self._is_scala_frame(source):
            self._frame = source
        elif self.is_scala_rdd(source):
            scala_schema = schema_to_scala(tc.sc, schema)
            self._frame = self.create_scala_frame(tc.sc, source, scala_schema)
        else:
            if not isinstance(source, RDD):
                source = tc.sc.parallelize(source)
            if schema:
                self.validate_pyrdd_schema(source, schema)
            self._frame = PythonFrame(source, schema)

    def validate_pyrdd_schema(self, pyrdd, schema):
        # no-op stub in this version; later versions walk the data and parse
        # each value to its declared type
        pass

    @staticmethod
    def create_scala_frame(sc, scala_rdd, scala_schema):
        """calls the Frame constructor in the JVM"""
        return sc._jvm.org.trustedanalytics.sparktk.frame.Frame(scala_rdd, scala_schema)

    @staticmethod
    def load(tc, scala_frame):
        """creates a python Frame for the given scala Frame"""
        return Frame(tc, scala_frame)

    def _frame_to_scala(self, python_frame):
        """converts a PythonFrame to a Scala Frame"""
        scala_schema = schema_to_scala(self._tc.sc, python_frame.schema)
        scala_rdd = self._tc.sc._jvm.org.trustedanalytics.sparktk.frame.rdd.PythonJavaRdd.pythonToScala(python_frame.rdd._jrdd, scala_schema)
        return self.create_scala_frame(self._tc.sc, scala_rdd, scala_schema)

    def _is_scala_frame(self, item):
        return self._tc._jutils.is_jvm_instance_of(item, self._tc.sc._jvm.org.trustedanalytics.sparktk.frame.Frame)

    def is_scala_rdd(self, item):
        return self._tc._jutils.is_jvm_instance_of(item, self._tc.sc._jvm.org.apache.spark.rdd.RDD)

    def is_python_rdd(self, item):
        return isinstance(item, RDD)

    @property
    def _is_scala(self):
        """answers whether the current frame is backed by a Scala Frame"""
        return self._is_scala_frame(self._frame)

    @property
    def _is_python(self):
        """answers whether the current frame is backed by a _PythonFrame"""
        return not self._is_scala

    @property
    def _scala(self):
        """gets the frame backend as a Scala Frame, converting it first if it is not currently Scala"""
        if self._is_python:
            # convert PythonFrame to a Scala Frame
            scala_schema = schema_to_scala(self._tc.sc, self._frame.schema)
            scala_rdd = self._tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.pythonToScala(self._frame.rdd._jrdd, scala_schema)
            self._frame = self.create_scala_frame(self._tc.sc, scala_rdd, scala_schema)
        return self._frame

    @property
    def _python(self):
        """gets the frame backend as a _PythonFrame, converting it first if it is not currently Python"""
        if self._is_scala:
            # convert Scala Frame to a PythonFrame
            scala_schema = self._frame.schema()
            java_rdd = self._tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.scalaToPython(self._frame.rdd())
            python_schema = schema_to_python(self._tc.sc, scala_schema)
            python_rdd = RDD(java_rdd, self._tc.sc)
            self._frame = PythonFrame(python_rdd, python_schema)
        return self._frame

    ##########################################################################
    # API
    ##########################################################################

    @property
    def rdd(self):
        """pyspark RDD (causes conversion if currently backed by a Scala RDD)"""
        return self._python.rdd

    @property
    def schema(self):
        if self._is_scala:
            return schema_to_python(self._tc.sc, self._frame.schema())  # need ()'s on schema because it's a def in scala
        return self._frame.schema

    @property
    def column_names(self):
        """
        Column names in the current frame.

        :return: list of names of all the frame's columns

        Examples
        --------

        .. code::

            >>> frame.column_names
            [u'name', u'age', u'tenure', u'phone']
        """
        return [name for name, data_type in self.schema]

    @property
    def row_count(self):
        """
        Number of rows in the current frame.

        :return: the number of rows in the frame

        Counts all of the rows in the frame.

        Examples
        --------

        Get the number of rows:

        <hide>
        frame = tc.to_frame([[item] for item in range(0, 4)], [("a", int)])
        </hide>

        .. code::

            >>> frame.row_count
            4
        """
        if self._is_scala:
            return int(self._scala.rowCount())
        return self.rdd.count()

    def append_csv_file(self, file_name, schema, separator=','):
        self._scala.appendCsvFile(file_name, schema_to_scala(self._tc.sc, schema), separator)

    def export_to_csv(self, file_name):
        self._scala.exportToCsv(file_name)

    # Frame Operations

    from sparktk.frame.ops.add_columns import add_columns
    from sparktk.frame.ops.assign_sample import assign_sample
    from sparktk.frame.ops.bin_column import bin_column
    from sparktk.frame.ops.binary_classification_metrics import binary_classification_metrics
    from sparktk.frame.ops.categorical_summary import categorical_summary
    from sparktk.frame.ops.column_median import column_median
    from sparktk.frame.ops.column_mode import column_mode
    from sparktk.frame.ops.column_summary_statistics import column_summary_statistics
    from sparktk.frame.ops.correlation import correlation
    from sparktk.frame.ops.correlation_matrix import correlation_matrix
    from sparktk.frame.ops.count import count
    from sparktk.frame.ops.covariance import covariance
    from sparktk.frame.ops.covariance_matrix import covariance_matrix
    from sparktk.frame.ops.cumulative_percent import cumulative_percent
    from sparktk.frame.ops.cumulative_sum import cumulative_sum
    from sparktk.frame.ops.dot_product import dot_product
    from sparktk.frame.ops.drop_columns import drop_columns
    from sparktk.frame.ops.drop_duplicates import drop_duplicates
    from sparktk.frame.ops.drop_rows import drop_rows
    from sparktk.frame.ops.ecdf import ecdf
    from sparktk.frame.ops.entropy import entropy
    from sparktk.frame.ops.filter import filter
    from sparktk.frame.ops.flatten_columns import flatten_columns
    from sparktk.frame.ops.histogram import histogram
    from sparktk.frame.ops.inspect import inspect
    from sparktk.frame.ops.multiclass_classification_metrics import multiclass_classification_metrics
    from sparktk.frame.ops.quantile_bin_column import quantile_bin_column
    from sparktk.frame.ops.quantiles import quantiles
    from sparktk.frame.ops.rename_columns import rename_columns
    from sparktk.frame.ops.save import save
    from sparktk.frame.ops.sort import sort
    from sparktk.frame.ops.sortedk import sorted_k
    from sparktk.frame.ops.take import take
    from sparktk.frame.ops.tally import tally
    from sparktk.frame.ops.tally_percent import tally_percent
    from sparktk.frame.ops.topk import top_k
    from sparktk.frame.ops.unflatten_columns import unflatten_columns
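# Usage sketch (illustrative; not part of the original source): exercising the
# Frame API defined above. `tc` is assumed to be an existing TkContext and the
# row data is hypothetical.

frame = Frame(tc, [["fred", 39], ["susan", 33]], [("name", str), ("age", int)])
print(frame.column_names)  # ['name', 'age']
print(frame.row_count)     # 2 (delegates to Scala's rowCount when Scala-backed)
print(frame.schema)        # [('name', <type 'str'>), ('age', <type 'int'>)]
frame.rdd.collect()        # touching .rdd converts the backend to a PythonFrame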