Exemple #1
0
    def __init__(self, tc, source, schema=None, validate_schema=False):
        """Build a frame from a Scala frame, a Scala RDD, a Python RDD, or local row data.

        :param tc: context object; ``tc.sc`` is used for schema conversion and to parallelize local data
        :param source: a Scala frame (used as-is), a Scala RDD, a Python ``RDD``, or any other
                       object, which is handed to ``tc.sc.parallelize``
        :param schema: ``None`` (schema is inferred), a list of column names (data types are
                       inferred), or a list of ``(name, data type)`` tuples.  The schema is only
                       checked here when ``source`` is not already an ``RDD``.
        :param validate_schema: when true and a schema is present, the data is passed through
                                ``validate_pyrdd_schema`` and the validated RDD replaces the source
        :raises TypeError: if the schema is not a list or None, is not a well-formed list of
                           ``(name, data type)`` tuples, or names an unsupported data type
        """
        self._tc = tc
        if self._is_scala_frame(source):
            self._frame = source
        elif self.is_scala_rdd(source):
            scala_schema = schema_to_scala(tc.sc, schema)
            self._frame = self.create_scala_frame(tc.sc, source, scala_schema)
        else:
            if not isinstance(source, RDD):
                if isinstance(schema, list):
                    if all(isinstance(item, basestring) for item in schema):
                        # check if schema is just a list of column names (versus string and data type tuples)
                        schema = self._infer_schema(source, schema)
                    elif not all(isinstance(item, tuple) and
                                  len(item) == 2 and
                                  isinstance(item[0], basestring) for item in schema):
                        # Column names are checked against basestring (same as the names-only check above)
                        # so unicode names are accepted too.  The message carries a %s placeholder;
                        # without one the % operator itself fails and hides this message.
                        raise TypeError("Invalid schema.  Expected a list of tuples (str, type) with the column name and data type, but received type %s." % type(schema))
                    else:
                        for item in schema:
                            if not self._is_supported_datatype(item[1]):
                                raise TypeError("Invalid schema.  %s is not a supported data type." % str(item[1]))
                elif schema is None:
                    schema = self._infer_schema(source)
                else:
                    # Schema is not a list or None
                    raise TypeError("Invalid schema type: %s.  Expected a list of tuples (str, type) with the column name and data type." % type(schema))
                source = tc.sc.parallelize(source)
            if schema and validate_schema:
                # Validate schema by going through the data and checking the data type and attempting to parse it
                validate_schema_result = self.validate_pyrdd_schema(source, schema)
                source = validate_schema_result.validated_rdd
                logger.debug("%s values were unable to be parsed to the schema's data type." % validate_schema_result.bad_value_count)

            self._frame = PythonFrame(source, schema)
Exemple #2
0
 def _python(self):
     """Return the frame backend as a PythonFrame, converting it first when the backend is Scala."""
     if not self._is_scala:
         return self._frame

     # The backend is a Scala frame: carry its schema and RDD over and rebuild it as a PythonFrame.
     schema_in_scala = self._frame.schema()
     sc = self._tc.sc
     rdd_bridge = sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd
     rdd_in_java = rdd_bridge.scalaToPython(self._frame.rdd())
     schema_in_python = schema_to_python(sc, schema_in_scala)
     rdd_in_python = RDD(rdd_in_java, sc)
     self._frame = PythonFrame(rdd_in_python, schema_in_python)
     return self._frame
Exemple #3
0
 def __init__(self, tc, source, schema=None):
     """Build a frame from a Scala frame, a Scala RDD, a Python RDD, or data that can be parallelized.

     :param tc: context object; ``tc.sc`` is used for schema conversion and parallelizing
     :param source: a Scala frame (kept as-is), a Scala RDD, a Python ``RDD``, or any other
                    object, which is handed to ``tc.sc.parallelize``
     :param schema: optional schema; when truthy it is passed with the data to
                    ``validate_pyrdd_schema`` on the Python path
     """
     self._tc = tc

     if self._is_scala_frame(source):
         self._frame = source
         return

     if self.is_scala_rdd(source):
         converted_schema = schema_to_scala(tc.sc, schema)
         self._frame = self.create_scala_frame(tc.sc, source, converted_schema)
         return

     # Python path: anything that is not already an RDD gets parallelized first.
     python_rdd = source if isinstance(source, RDD) else tc.sc.parallelize(source)
     if schema:
         self.validate_pyrdd_schema(python_rdd, schema)
     self._frame = PythonFrame(python_rdd, schema)
Exemple #4
0
 def _python(self):
     """Return the frame backend as a PythonFrame, converting it first when the backend is Scala."""
     if not self._is_scala:
         return self._frame

     # The backend is a Scala frame: carry its schema and RDD over to the Python side.
     schema_in_scala = self._frame.schema()
     sc = self._tc.sc
     rdd_bridge = sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd
     rdd_in_java = rdd_bridge.scalaToPython(self._frame.rdd())
     schema_in_python = schema_to_python(sc, schema_in_scala)
     rdd_in_python = RDD(rdd_in_java, sc)

     # Matrix columns: per the original note, the coercion step turns list[list] values into
     # numpy ndarrays when the schema contains the matrix data type.
     coerced_rdd = MatrixCoercion.schema_is_coercible(rdd_in_python, list(schema_in_python))
     self._frame = PythonFrame(coerced_rdd, schema_in_python)
     return self._frame
Exemple #5
0
    def __init__(self, tc, source, schema=None, validate_schema=False):
        """(Private constructor -- use tc.frame.create or other methods available from the TkContext)

        :param tc: context object; ``tc.sc`` is used for schema conversion and to parallelize local data
        :param source: a Scala frame, Scala RDD, Scala dataframe, ``DataFrame`` (its ``_jdf`` is used),
                       ``PythonFrame``, Python ``RDD``, or a list of rows (each row a list or tuple)
        :param schema: ``None`` (schema is inferred), a list of column names (data types are
                       inferred), or a list of ``(name, data type)`` tuples.  The schema is only
                       checked here when ``source`` is a list of rows.
        :param validate_schema: when true and a schema is present, the data is passed through
                                ``validate_pyrdd_schema`` and the validated RDD replaces the source
        :raises TypeError: if the data is not a list of rows or an RDD, the schema is malformed,
                           or a data type is not supported
        :raises ValueError: if the schema repeats a column name
        """
        self._tc = tc
        if self._is_scala_frame(source):
            self._frame = source
        elif self._is_scala_rdd(source):
            scala_schema = schema_to_scala(tc.sc, schema)
            self._frame = self._create_scala_frame(tc.sc, source, scala_schema)
        elif self._is_scala_dataframe(source):
            self._frame = self._create_scala_frame_from_scala_dataframe(
                tc.sc, source)
        elif isinstance(source, DataFrame):
            self._frame = self._create_scala_frame_from_scala_dataframe(
                tc.sc, source._jdf)
        elif isinstance(source, PythonFrame):
            self._frame = source
        else:
            if not isinstance(source, RDD):
                if not isinstance(source, list) or (len(source) > 0 and any(
                        not isinstance(row, (list, tuple)) for row in source)):
                    raise TypeError(
                        "Invalid data source.  The data parameter must be a 2-dimensional list (list of row data) or an RDD."
                    )

                inferred_schema = False
                if isinstance(schema, list):
                    if all(isinstance(item, basestring) for item in schema):
                        # check if schema is just a list of column names (versus string and data type tuples)
                        schema = self._infer_schema(source, schema)
                        inferred_schema = True
                    elif not all(
                            isinstance(item, tuple) and len(item) == 2
                            and isinstance(item[0], basestring)
                            for item in schema):
                        raise TypeError(
                            "Invalid schema.  Expected a list of tuples (str, type) with the column name and data type, but received type %s."
                            % type(schema))
                    # check for duplicate column names in a single pass
                    seen_column_names = set()
                    duplicate_column_names = set()
                    for col in schema:
                        column_name = col[0]
                        if column_name in seen_column_names:
                            duplicate_column_names.add(column_name)
                        else:
                            seen_column_names.add(column_name)
                    if duplicate_column_names:
                        # sorted so the error message is deterministic
                        raise ValueError(
                            "Invalid schema, column names cannot be duplicated: %s"
                            % ", ".join(sorted(duplicate_column_names)))
                elif schema is None:
                    schema = self._infer_schema(source)
                    inferred_schema = True
                else:
                    # Schema is not a list or None
                    raise TypeError(
                        "Invalid schema type: %s.  Expected a list of tuples (str, type) with the column name and data type."
                        % type(schema))
                for item in schema:
                    if not self._is_supported_datatype(item[1]):
                        if inferred_schema:
                            raise TypeError(
                                "The %s data type was found when inferring the schema, and it is not a "
                                "supported data type.  Instead, specify a schema that uses a supported data "
                                "type, and enable validate_schema so that the data is converted to the proper "
                                "data type.\n\nInferred schema: %s\n\nSupported data types: %s"
                                % (str(item[1]), str(schema), dtypes.dtypes))
                        else:
                            raise TypeError(
                                "Invalid schema.  %s is not a supported data type.\n\nSupported data types: %s"
                                % (str(item[1]), dtypes.dtypes))

                source = tc.sc.parallelize(source)
            if schema and validate_schema:
                # Validate schema by going through the data and checking the data type and attempting to parse it
                validate_schema_result = self.validate_pyrdd_schema(
                    source, schema)
                source = validate_schema_result.validated_rdd
                logger.debug(
                    "%s values were unable to be parsed to the schema's data type."
                    % validate_schema_result.bad_value_count)

            # If schema contains matrix datatype, then apply type_coercer to convert list[list] to numpy ndarray.
            # An RDD source may arrive with schema=None (no schema branch runs for RDDs); list(None)
            # would raise TypeError, so the coercion step is skipped in that case.
            if schema is not None:
                map_source = schema_is_coercible(source, list(schema))
            else:
                map_source = source
            self._frame = PythonFrame(map_source, schema)
Exemple #6
0
    def __init__(self, tc, source, schema=None, validate_schema=False):
        """Build a frame from one of several kinds of source.

        :param tc: context object; ``tc.sc`` is used for schema conversion and to parallelize local data
        :param source: a Scala frame, Scala RDD, Scala dataframe, ``DataFrame`` (its ``_jdf`` is used),
                       ``PythonFrame``, Python ``RDD``, or a list of rows (each row a list or tuple)
        :param schema: ``None`` (schema is inferred), a list of column names (data types are
                       inferred), or a list of ``(name, data type)`` tuples.  The schema is only
                       checked here when ``source`` is a list of rows.
        :param validate_schema: when true and a schema is present, the data is passed through
                                ``validate_pyrdd_schema`` and the validated RDD replaces the source
        :raises TypeError: if the data is not a list of rows or an RDD, the schema is malformed,
                           or a data type is not supported
        """
        self._tc = tc

        # Sources that already are, or convert directly to, a frame backend need no schema checks.
        if self._is_scala_frame(source):
            self._frame = source
            return
        if self.is_scala_rdd(source):
            converted_schema = schema_to_scala(tc.sc, schema)
            self._frame = self.create_scala_frame(tc.sc, source, converted_schema)
            return
        if self.is_scala_dataframe(source):
            self._frame = self.create_scala_frame_from_scala_dataframe(tc.sc, source)
            return
        if isinstance(source, DataFrame):
            self._frame = self.create_scala_frame_from_scala_dataframe(tc.sc, source._jdf)
            return
        if isinstance(source, PythonFrame):
            self._frame = source
            return

        if not isinstance(source, RDD):
            # Local data must be a list whose rows are all lists or tuples (an empty list passes).
            rows_are_valid = isinstance(source, list) and all(
                isinstance(row, (list, tuple)) for row in source)
            if not rows_are_valid:
                raise TypeError(
                    "Invalid data source.  The data parameter must be a 2-dimensional list (list of row data) or an RDD.")

            schema_was_inferred = False
            if schema is None:
                schema = self._infer_schema(source)
                schema_was_inferred = True
            elif not isinstance(schema, list):
                # Schema is neither a list nor None
                raise TypeError(
                    "Invalid schema type: %s.  Expected a list of tuples (str, type) with the column name and data type."
                    % type(schema))
            elif all(isinstance(entry, basestring) for entry in schema):
                # Only column names were given, so the data types are inferred from the data
                schema = self._infer_schema(source, schema)
                schema_was_inferred = True
            elif not all(isinstance(entry, tuple)
                         and len(entry) == 2
                         and isinstance(entry[0], basestring)
                         for entry in schema):
                raise TypeError(
                    "Invalid schema.  Expected a list of tuples (str, type) with the column name and data type, but received type %s."
                    % type(schema))

            # Reject the first column whose data type is not supported.
            for column in schema:
                if self._is_supported_datatype(column[1]):
                    continue
                if schema_was_inferred:
                    raise TypeError(
                        "The %s data type was found when inferring the schema, and it is not a "
                        "supported data type.  Instead, specify a schema that uses a supported data "
                        "type, and enable validate_schema so that the data is converted to the proper "
                        "data type.\n\nInferred schema: %s\n\nSupported data types: %s"
                        % (str(column[1]), str(schema), dtypes.dtypes))
                raise TypeError(
                    "Invalid schema.  %s is not a supported data type.\n\nSupported data types: %s"
                    % (str(column[1]), dtypes.dtypes))

            source = tc.sc.parallelize(source)

        if schema and validate_schema:
            # Run the data through schema validation and continue with the validated RDD
            validation = self.validate_pyrdd_schema(source, schema)
            source = validation.validated_rdd
            logger.debug(
                "%s values were unable to be parsed to the schema's data type."
                % validation.bad_value_count)

        self._frame = PythonFrame(source, schema)