def as_h2o_frame(self, dataframe, framename=None):
    """
    Transforms given Spark RDD or DataFrame to H2OFrame.

    A DataFrame is converted directly. An RDD for which
    ``_is_of_simple_type`` is false goes to the complex-type converter.
    Otherwise the converter is picked from the first element (string,
    boolean, float) or, for integers, from the minimum and maximum of
    the RDD.

    Parameters
    ----------
      dataframe : Spark RDD or DataFrame
      framename : Optional name for resulting H2OFrame

    Returns
    -------
      H2OFrame which contains data of original input Spark data structure.
      None when the RDD is of a simple type that none of the converters
      below handles.

    Raises
    ------
      ValueError : when the input is neither a DataFrame nor an RDD, or
                   when the integers in the RDD do not fit a Java long
    """
    if isinstance(dataframe, DataFrame):
        return fc._as_h2o_frame_from_dataframe(self, dataframe, framename)
    if not isinstance(dataframe, RDD):
        # Previously this case silently returned None.
        raise ValueError('The as_h2o_frame method expects Spark DataFrame or RDD as the input only!')

    # First check if the type T in RDD[T] is one of the python "primitive" types
    # String, Boolean, Int and Double (Python Long is converted to java.lang.BigInteger)
    if not _is_of_simple_type(dataframe):
        return fc._as_h2o_frame_from_complex_type(self, dataframe, framename)

    first = _get_first(dataframe)
    if isinstance(first, str):
        return fc._as_h2o_frame_from_RDD_String(self, dataframe, framename)
    # bool is a subclass of int, so it has to be tested before the integer branch.
    if isinstance(first, bool):
        return fc._as_h2o_frame_from_RDD_Bool(self, dataframe, framename)

    # min() and max() each run a Spark job; evaluate them once and reuse the results.
    lowest = dataframe.min()
    highest = dataframe.max()

    if isinstance(lowest, int) and isinstance(highest, int):
        if lowest >= self._jvm.Integer.MIN_VALUE and highest <= self._jvm.Integer.MAX_VALUE:
            return fc._as_h2o_frame_from_RDD_Int(self, dataframe, framename)
        # Python 3 ints are unbounded, so the Java long range must be checked explicitly.
        if lowest >= self._jvm.Long.MIN_VALUE and highest <= self._jvm.Long.MAX_VALUE:
            return fc._as_h2o_frame_from_RDD_Long(self, dataframe, framename)
        raise ValueError('Numbers in RDD Too Big')

    if isinstance(first, float):
        return fc._as_h2o_frame_from_RDD_Float(self, dataframe, framename)

    # ``long`` exists only on Python 2; referencing it directly raises NameError on Python 3.
    try:
        long_type = long  # noqa: F821
    except NameError:
        long_type = None
    if long_type is not None and isinstance(highest, long_type):
        raise ValueError('Numbers in RDD Too Big')

    # No converter matches this element type (same result as the original fall-through).
    return None
def as_h2o_frame(self, dataframe, framename=None, full_cols=100):
    """
    Transforms given Spark RDD or DataFrame to H2OFrame.

    A DataFrame is converted directly. An RDD for which
    ``_is_of_simple_type`` is false goes to the complex-type converter.
    Otherwise the converter is picked from the first element (string,
    boolean, float) or, for integers, from the minimum and maximum of
    the RDD.

    Parameters
    ----------
      dataframe : Spark RDD or DataFrame
      framename : Optional name for resulting H2OFrame
      full_cols : number of first n columns which are sent to the client together with the data

    Returns
    -------
      H2OFrame which contains data of original input Spark data structure.
      None when the RDD is of a simple type that none of the converters
      below handles.

    Raises
    ------
      ValueError : when the input is neither a DataFrame nor an RDD, or
                   when the integers in the RDD do not fit a Java long
    """
    # The module-level rebinding of ``long`` below is kept from the original
    # because other code in the module may rely on it.
    global long

    if isinstance(dataframe, DataFrame):
        return fc._as_h2o_frame_from_dataframe(self, dataframe, framename, full_cols)
    if not isinstance(dataframe, RDD):
        raise ValueError('The as_h2o_frame method expects Spark DataFrame or RDD as the input only!')

    # First check if the type T in RDD[T] is one of the python "primitive" types
    # String, Boolean, Int and Double (Python Long is converted to java.lang.BigInteger)
    if not _is_of_simple_type(dataframe):
        return fc._as_h2o_frame_from_complex_type(self, dataframe, framename, full_cols)

    first = _get_first(dataframe)

    # Make this code compatible with python 3.6 and python 2.7
    if sys.version_info > (3,):
        long = int

    if isinstance(first, str):
        return fc._as_h2o_frame_from_RDD_String(self, dataframe, framename, full_cols)
    # bool is a subclass of int, so it has to be tested before the integer branch.
    if isinstance(first, bool):
        return fc._as_h2o_frame_from_RDD_Bool(self, dataframe, framename, full_cols)

    # min() and max() each run a Spark job; the original re-ran them for
    # every comparison. Evaluate them once and reuse the results.
    lowest = dataframe.min()
    highest = dataframe.max()

    both_int = isinstance(lowest, int) and isinstance(highest, int)
    both_long = isinstance(lowest, long) and isinstance(highest, long)
    if both_int or both_long:
        if lowest >= self._jvm.Integer.MIN_VALUE and highest <= self._jvm.Integer.MAX_VALUE:
            return fc._as_h2o_frame_from_RDD_Int(self, dataframe, framename, full_cols)
        if lowest >= self._jvm.Long.MIN_VALUE and highest <= self._jvm.Long.MAX_VALUE:
            return fc._as_h2o_frame_from_RDD_Long(self, dataframe, framename, full_cols)
        raise ValueError('Numbers in RDD Too Big')

    if isinstance(first, float):
        return fc._as_h2o_frame_from_RDD_Float(self, dataframe, framename, full_cols)

    # No converter matches this element type (same result as the original fall-through).
    return None
def as_h2o_frame(self, dataframe, framename=None):
    """
    Transforms given Spark RDD or DataFrame to H2OFrame.

    A DataFrame is converted directly. An RDD for which
    ``_is_of_simple_type`` is false goes to the complex-type converter.
    Otherwise the converter is picked from the first element (string,
    boolean, float) or, for integers, from the maximum of the RDD.

    Parameters
    ----------
      dataframe : Spark RDD or DataFrame
      framename : Optional name for resulting H2OFrame

    Returns
    -------
      H2OFrame which contains data of original input Spark data structure.
      None when the RDD is of a simple type that none of the converters
      below handles.

    Raises
    ------
      ValueError : when the input is neither a DataFrame nor an RDD, or
                   when the integers in the RDD do not fit a 64-bit
                   signed integer
    """
    if isinstance(dataframe, DataFrame):
        return fc._as_h2o_frame_from_dataframe(self, dataframe, framename)
    if not isinstance(dataframe, RDD):
        # Previously this case silently returned None.
        raise ValueError('The as_h2o_frame method expects Spark DataFrame or RDD as the input only!')

    # First check if the type T in RDD[T] is one of the python "primitive" types
    # String, Boolean, Int and Double (Python Long is converted to java.lang.BigInteger)
    if not _is_of_simple_type(dataframe):
        return fc._as_h2o_frame_from_complex_type(self, dataframe, framename)

    first = _get_first(dataframe)
    if isinstance(first, str):
        return fc._as_h2o_frame_from_RDD_String(self, dataframe, framename)
    # bool is a subclass of int, so it has to be tested before the integer branch.
    if isinstance(first, bool):
        return fc._as_h2o_frame_from_RDD_Bool(self, dataframe, framename)

    # max() runs a Spark job; evaluate it once and reuse the result below.
    highest = dataframe.max()
    if isinstance(highest, int):
        # Bounds of a 64-bit signed integer, the range the Long converter's name implies.
        java_long_min = -(2 ** 63)
        java_long_max = 2 ** 63 - 1
        # The original looked only at the maximum, so an out-of-range minimum
        # (or, on Python 3, an out-of-range maximum) went undetected.
        lowest = dataframe.min()
        if lowest < java_long_min or highest > java_long_max:
            raise ValueError('Numbers in RDD Too Big')
        return fc._as_h2o_frame_from_RDD_Long(self, dataframe, framename)

    if isinstance(first, float):
        return fc._as_h2o_frame_from_RDD_Float(self, dataframe, framename)

    # ``long`` exists only on Python 2; referencing it directly raises NameError on Python 3.
    try:
        long_type = long  # noqa: F821
    except NameError:
        long_type = None
    if long_type is not None and isinstance(highest, long_type):
        raise ValueError('Numbers in RDD Too Big')

    # No converter matches this element type (same result as the original fall-through).
    return None
def as_h2o_frame(self, dataframe, framename=None, full_cols=100):
    """
    Transforms given Spark RDD or DataFrame to H2OFrame.

    A DataFrame is converted directly. An empty RDD is replaced by an
    empty DataFrame with an empty schema and converted as a DataFrame.
    An RDD for which ``_is_of_simple_type`` is false goes to the
    complex-type converter. Otherwise the converter is picked from the
    first element (string, boolean, float) or, for integers, from the
    minimum and maximum of the RDD.

    Parameters
    ----------
      dataframe : Spark RDD or DataFrame
      framename : Optional name for resulting H2OFrame
      full_cols : number of first n columns which are sent to the client together with the data

    Returns
    -------
      H2OFrame which contains data of original input Spark data structure.
      None when the RDD is of a simple type that none of the converters
      below handles.

    Raises
    ------
      ValueError : when the input is neither a DataFrame nor an RDD, or
                   when the integers in the RDD do not fit a Java long
    """
    # The module-level rebinding of ``long`` below is kept from the original
    # because other code in the module may rely on it.
    global long

    if isinstance(dataframe, DataFrame):
        return fc._as_h2o_frame_from_dataframe(self, dataframe, framename, full_cols)
    if not isinstance(dataframe, RDD):
        raise ValueError('The as_h2o_frame method expects Spark DataFrame or RDD as the input only!')

    if dataframe.isEmpty():
        # An empty RDD has no first element to inspect; convert an empty DataFrame instead.
        schema = StructType([])
        empty = self._spark_session.createDataFrame(
            self._spark_session.sparkContext.emptyRDD(), schema)
        return fc._as_h2o_frame_from_dataframe(self, empty, framename, full_cols)

    # First check if the type T in RDD[T] is one of the python "primitive" types
    # String, Boolean, Int and Double (Python Long is converted to java.lang.BigInteger)
    if not _is_of_simple_type(dataframe):
        return fc._as_h2o_frame_from_complex_type(self, dataframe, framename, full_cols)

    first = _get_first(dataframe)

    # Make this code compatible with python 3.6 and python 2.7
    if sys.version_info > (3,):
        long = int

    if isinstance(first, str):
        return fc._as_h2o_frame_from_RDD_String(self, dataframe, framename, full_cols)
    # bool is a subclass of int, so it has to be tested before the integer branch.
    if isinstance(first, bool):
        return fc._as_h2o_frame_from_RDD_Bool(self, dataframe, framename, full_cols)

    # min() and max() each run a Spark job; the original re-ran them for
    # every comparison. Evaluate them once and reuse the results.
    lowest = dataframe.min()
    highest = dataframe.max()

    both_int = isinstance(lowest, int) and isinstance(highest, int)
    both_long = isinstance(lowest, long) and isinstance(highest, long)
    if both_int or both_long:
        if lowest >= self._jvm.Integer.MIN_VALUE and highest <= self._jvm.Integer.MAX_VALUE:
            return fc._as_h2o_frame_from_RDD_Int(self, dataframe, framename, full_cols)
        if lowest >= self._jvm.Long.MIN_VALUE and highest <= self._jvm.Long.MAX_VALUE:
            return fc._as_h2o_frame_from_RDD_Long(self, dataframe, framename, full_cols)
        raise ValueError('Numbers in RDD Too Big')

    if isinstance(first, float):
        return fc._as_h2o_frame_from_RDD_Float(self, dataframe, framename, full_cols)

    # No converter matches this element type (same result as the original fall-through).
    return None