def test_parquet_write(self):
    with self.temp_dir() as tmp:
        pdf = self.test_pdf
        expected = ks.DataFrame(pdf)

        # Write out partitioned by one column
        expected.to_parquet(tmp, mode="overwrite", partition_cols="i32")
        # Reset column order, as once the data is written out, Spark rearranges
        # partition columns to appear first.
        actual = ks.read_parquet(tmp)[self.test_column_order]
        self.assert_eq(
            actual.sort_values(by="f").to_spark().toPandas(),
            expected.sort_values(by="f").to_spark().toPandas(),
        )

        # Write out partitioned by two columns
        expected.to_parquet(tmp, mode="overwrite", partition_cols=["i32", "bhello"])
        # Reset column order, as once the data is written out, Spark rearranges
        # partition columns to appear first.
        actual = ks.read_parquet(tmp)[self.test_column_order]
        self.assert_eq(
            actual.sort_values(by="f").to_spark().toPandas(),
            expected.sort_values(by="f").to_spark().toPandas(),
        )
def test_dot(self):
    kser = ks.Series([90, 91, 85], index=[2, 4, 1])
    pser = kser.to_pandas()
    kser_other = ks.Series([90, 91, 85], index=[2, 4, 1])
    pser_other = kser_other.to_pandas()

    self.assert_eq(kser.dot(kser_other), pser.dot(pser_other))

    kser_other = ks.Series([90, 91, 85], index=[1, 2, 4])
    pser_other = pd.Series([90, 91, 85], index=[1, 2, 4])
    self.assert_eq(kser.dot(kser_other), pser.dot(pser_other))

    # length of index is different
    kser_other = ks.Series([90, 91, 85, 100], index=[2, 4, 1, 0])
    with self.assertRaisesRegex(ValueError, "matrices are not aligned"):
        kser.dot(kser_other)

    # dot with a DataFrame is not supported for now because of performance issues;
    # we raise a ValueError with a proper message instead.
    kdf = ks.DataFrame([[0, 1], [-2, 3], [4, -5]], index=[2, 4, 1])
    with self.assertRaisesRegex(ValueError, r"Series\.dot\(\) is currently not supported*"):
        kser.dot(kdf)

    # for MultiIndex
    midx = pd.MultiIndex(
        [["lama", "cow", "falcon"], ["speed", "weight", "length"]],
        [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]],
    )
    kser = ks.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], index=midx)
    pser = kser.to_pandas()
    kser_other = ks.Series([-450, 20, 12, -30, -250, 15, -320, 100, 3], index=midx)
    pser_other = kser_other.to_pandas()
    self.assert_eq(kser.dot(kser_other), pser.dot(pser_other))
def test_default_index(self):
    sdf = self.spark.range(1000)
    self.assert_eq(
        ks.DataFrame(sdf).sort_index(),
        pd.DataFrame({'id': list(range(1000))}))
def test_index_drop(self):
    pidx = pd.DataFrame({'a': ['a', 'b', 'c']}, index=[1, 2, 3]).index
    kidx = ks.DataFrame({'a': ['a', 'b', 'c']}, index=[1, 2, 3]).index

    self.assert_eq(pidx.drop(1), kidx.drop(1))
    self.assert_eq(pidx.drop([1, 2]), kidx.drop([1, 2]))
def test_default_index_distributed(self):
    with ks.option_context("compute.default_index_type", "distributed"):
        sdf = self.spark.range(1000)
        pdf = ks.DataFrame(sdf).to_pandas()
        self.assertEqual(len(set(pdf.index)), len(pdf))
def test_add_suffix(self):
    pdf = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [3, 4, 5, 6]})
    kdf = ks.DataFrame({'A': [1, 2, 3, 4], 'B': [3, 4, 5, 6]})
    self.assert_eq(pdf.add_suffix('_col'), kdf.add_suffix('_col'))
def apply_batch(self, func, args=(), **kwds):
    """
    Apply a function that takes pandas DataFrame and outputs pandas DataFrame.
    The pandas DataFrame given to the function is of a batch used internally.

    See also `Transform and apply a function
    <https://koalas.readthedocs.io/en/latest/user_guide/transform_apply.html>`_.

    .. note:: the `func` is unable to access the whole input frame. Koalas
        internally splits the input frame into multiple batches and calls `func`
        with each batch multiple times. Therefore, operations such as global
        aggregations are impossible. See the example below.

        >>> # This case does not return the length of the whole frame but of the
        ... # batch internally used.
        ... def length(pdf) -> ks.DataFrame[int]:
        ...     return pd.DataFrame([len(pdf)])
        ...
        >>> df = ks.DataFrame({'A': range(1000)})
        >>> df.koalas.apply_batch(length)  # doctest: +SKIP
            c0
        0   83
        1   83
        2   83
        ...
        10  83
        11  83

    .. note:: this API executes the function once to infer the type, which is
        potentially expensive, for instance, when the dataset is created after
        aggregations or sorting.

        To avoid this, specify the return type in ``func``, for instance, as below:

        >>> def plus_one(x) -> ks.DataFrame[float, float]:
        ...     return x + 1

        If the return type is specified, the output column names become
        `c0, c1, c2 ... cn`. These names are positionally mapped to the returned
        DataFrame in ``func``.

        To specify the column names, you can assign them in a pandas friendly
        style as below:

        >>> def plus_one(x) -> ks.DataFrame["a": float, "b": float]:
        ...     return x + 1

        >>> pdf = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 4, 5]})
        >>> def plus_one(x) -> ks.DataFrame[zip(pdf.dtypes, pdf.columns)]:
        ...     return x + 1

    Parameters
    ----------
    func : function
        Function to apply to each pandas frame.
    args : tuple
        Positional arguments to pass to `func` in addition to the array/series.
    **kwds
        Additional keyword arguments to pass as keyword arguments to `func`.

    Returns
    -------
    DataFrame

    See Also
    --------
    DataFrame.apply: For row/columnwise operations.
    DataFrame.applymap: For elementwise operations.
    DataFrame.aggregate: Only perform aggregating type operations.
    DataFrame.transform: Only perform transforming type operations.
    Series.koalas.transform_batch: Transform each pandas chunk of a Series.

    Examples
    --------
    >>> df = ks.DataFrame([(1, 2), (3, 4), (5, 6)], columns=['A', 'B'])
    >>> df
       A  B
    0  1  2
    1  3  4
    2  5  6

    >>> def query_func(pdf) -> ks.DataFrame[int, int]:
    ...     return pdf.query('A == 1')
    >>> df.koalas.apply_batch(query_func)
       c0  c1
    0   1   2

    >>> def query_func(pdf) -> ks.DataFrame["A": int, "B": int]:
    ...     return pdf.query('A == 1')
    >>> df.koalas.apply_batch(query_func)
       A  B
    0  1  2

    You can also omit the type hints so Koalas infers the return schema as below:

    >>> df.koalas.apply_batch(lambda pdf: pdf.query('A == 1'))
       A  B
    0  1  2

    You can also specify extra arguments.

    >>> def calculation(pdf, y, z) -> ks.DataFrame[int, int]:
    ...     return pdf ** y + z
    >>> df.koalas.apply_batch(calculation, args=(10,), z=20)
            c0        c1
    0       21      1044
    1    59069   1048596
    2  9765645  60466196

    You can also use ``np.ufunc`` and built-in functions as input.

    >>> df.koalas.apply_batch(np.add, args=(10,))
        A   B
    0  11  12
    1  13  14
    2  15  16

    >>> (df * -1).koalas.apply_batch(abs)
       A  B
    0  1  2
    1  3  4
    2  5  6
    """
    # TODO: the code here partially duplicates `DataFrame.apply`. Can we deduplicate?
    from databricks.koalas.groupby import GroupBy
    from databricks.koalas.frame import DataFrame
    from databricks import koalas as ks

    if not isinstance(func, types.FunctionType):
        assert callable(func), "the first argument should be a callable function."
        f = func
        func = lambda *args, **kwargs: f(*args, **kwargs)

    spec = inspect.getfullargspec(func)
    return_sig = spec.annotations.get("return", None)
    should_infer_schema = return_sig is None
    should_use_map_in_pandas = LooseVersion(pyspark.__version__) >= "3.0"

    original_func = func
    func = lambda o: original_func(o, *args, **kwds)

    self_applied = DataFrame(self._kdf._internal.resolved_copy)

    if should_infer_schema:
        # Here we execute the function with the first `limit` records to get the
        # return type. If there are fewer records than the limit, the pandas
        # result is returned directly as a shortcut.
        limit = ks.get_option("compute.shortcut_limit")
        pdf = self_applied.head(limit + 1)._to_internal_pandas()
        applied = func(pdf)
        if not isinstance(applied, pd.DataFrame):
            raise ValueError(
                "The given function should return a frame; however, "
                "the return type was %s." % type(applied))
        kdf = ks.DataFrame(applied)
        if len(pdf) <= limit:
            return kdf

        return_schema = kdf._internal.to_internal_spark_frame.schema
        if should_use_map_in_pandas:
            output_func = GroupBy._make_pandas_df_builder_func(
                self_applied, func, return_schema, retain_index=True)
            sdf = self_applied._internal.to_internal_spark_frame.mapInPandas(
                lambda iterator: map(output_func, iterator), schema=return_schema)
        else:
            sdf = GroupBy._spark_group_map_apply(
                self_applied, func, (F.spark_partition_id(),),
                return_schema, retain_index=True)

        # If the schema is inferred, we can restore the index too.
        internal = kdf._internal.with_new_sdf(sdf)
    else:
        return_type = infer_return_type(original_func)
        return_schema = return_type.tpe
        is_return_dataframe = isinstance(return_type, DataFrameType)
        if not is_return_dataframe:
            raise TypeError(
                "The given function should specify a frame as its type "
                "hints; however, the return type was %s." % return_sig)

        if should_use_map_in_pandas:
            output_func = GroupBy._make_pandas_df_builder_func(
                self_applied, func, return_schema, retain_index=False)
            sdf = self_applied._internal.to_internal_spark_frame.mapInPandas(
                lambda iterator: map(output_func, iterator), schema=return_schema)
        else:
            sdf = GroupBy._spark_group_map_apply(
                self_applied, func, (F.spark_partition_id(),),
                return_schema, retain_index=False)

        # Otherwise, the result loses the index.
        internal = InternalFrame(spark_frame=sdf, index_map=None)

    return DataFrame(internal)
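A minimal usage sketch (not part of the source above; the frame and function names are illustrative) contrasting the two branches: without a return-type hint, `apply_batch` runs the function once on up to `compute.shortcut_limit` rows to infer the schema; with a hint, that inference pass is skipped and unnamed columns become `c0, c1, ...`.

import pandas as pd
import databricks.koalas as ks

kdf = ks.DataFrame({"A": range(10), "B": range(10)})

# Schema inferred from a sample execution; the index is preserved.
kdf.koalas.apply_batch(lambda pdf: pdf + 1)

# Schema taken from the type hint; no inference pass, but the index is lost.
def plus_one(pdf) -> ks.DataFrame[int, int]:
    return pdf + 1

kdf.koalas.apply_batch(plus_one)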
"min_samples_split": [2, 3, 5], "min_samples_leaf": [1, 2, 5], "bootstrap": [True, False], "criterion": ["gini", "entropy"], "n_estimators": [5, 10, 15, 20] } grid_search = GridSearchCV(RandomForestClassifier(), param_grid=parameter_grid) best_model = grid_search.fit(X, y) # COMMAND ---------- # MAGIC %md ####Pandas UDF # COMMAND ---------- # MAGIC %md ###Koalas - Pandas like API for PySpark # COMMAND ---------- import databricks.koalas as ks boston_data = load_boston() features = boston_data.feature_names boston_pd['MEDV'] = boston_data.target boston_pd = ks.DataFrame(boston_data.data, columns=boston_data.feature_names) boston_pd.info() boston_pd.head() boston_pd.isnull().sum() boston_pd.describe()
import databricks.koalas as ks

df = ks.DataFrame({
    'a': [1, 2, 3, 4, 4, 3, 2, 1],
    # 'a': pd.Categorical([1, 2, 3, 4, 4, 3, 2, 1]),
    'b': list('abcdabcd'),
    # 'c': pd.Categorical(list('abcdabcd')),
    'c': list('abcdabcd')
})
userRelatedEntitiesObj["school_id"] = userSchool userRelatedEntitiesObj["school_name"] = userSchoolName userRelatedEntitiesObj["school_externalId"] = userSchoolUDISE userRelatedEntitiesObj["organisation_name"] = orgName except KeyError: pass if userRelatedEntitiesObj: userIntegratedAppEntitiesArr.append(userRelatedEntitiesObj) searchObj = {} searchObj["id"] = ch searchObj["channel"] = rootOrgId searchObj["parent_channel"] = "SHIKSHALOKAM" userId_obs_status_df_after.append(searchObj) df_user_org = ks.DataFrame(userId_obs_status_df_after) df_user_org = df_user_org.to_spark() if len(userIntegratedAppEntitiesArr) > 0: df_user_rel_entities = ks.DataFrame(userIntegratedAppEntitiesArr) df_user_rel_entities = df_user_rel_entities.to_spark() # roles dataframe from mongodb roles_cursorMongo = userRolesCollec.aggregate([{ "$project": { "_id": { "$toString": "$_id" }, "title": 1 } }])
def test_transform(self):
    pdf = pd.DataFrame(
        {
            'a': [1, 2, 3, 4, 5, 6],
            'b': [1, 1, 2, 3, 5, 8],
            'c': [1, 4, 9, 16, 25, 36]
        },
        columns=['a', 'b', 'c'])
    kdf = koalas.DataFrame(pdf)

    self.assert_eq(
        kdf.groupby("b").transform(lambda x: x + 1).sort_index(),
        pdf.groupby("b").transform(lambda x: x + 1).sort_index())
    self.assert_eq(
        kdf.groupby(['a', 'b']).transform(lambda x: x * x).sort_index(),
        pdf.groupby(['a', 'b']).transform(lambda x: x * x).sort_index())
    self.assert_eq(
        kdf.groupby(['b'])['a'].transform(lambda x: x).sort_index(),
        pdf.groupby(['b'])['a'].transform(lambda x: x).sort_index())

    # multi-index columns
    columns = pd.MultiIndex.from_tuples([('x', 'a'), ('x', 'b'), ('y', 'c')])
    pdf.columns = columns
    kdf.columns = columns

    self.assert_eq(
        kdf.groupby(("x", "b")).transform(lambda x: x + 1).sort_index(),
        pdf.groupby(("x", "b")).transform(lambda x: x + 1).sort_index())
    self.assert_eq(
        kdf.groupby([('x', 'a'), ('x', 'b')]).transform(lambda x: x * x).sort_index(),
        pdf.groupby([('x', 'a'), ('x', 'b')]).transform(lambda x: x * x).sort_index())

    set_option('compute.shortcut_limit', 1000)
    try:
        pdf = pd.DataFrame(
            {
                'a': [1, 2, 3, 4, 5, 6] * 300,
                'b': [1, 1, 2, 3, 5, 8] * 300,
                'c': [1, 4, 9, 16, 25, 36] * 300
            },
            columns=['a', 'b', 'c'])
        kdf = koalas.DataFrame(pdf)

        self.assert_eq(
            kdf.groupby("b").transform(lambda x: x + 1).sort_index(),
            pdf.groupby("b").transform(lambda x: x + 1).sort_index())
        self.assert_eq(
            kdf.groupby(['a', 'b']).transform(lambda x: x * x).sort_index(),
            pdf.groupby(['a', 'b']).transform(lambda x: x * x).sort_index())
        self.assert_eq(
            kdf.groupby(['b'])['a'].transform(lambda x: x).sort_index(),
            pdf.groupby(['b'])['a'].transform(lambda x: x).sort_index())

        with self.assertRaisesRegex(TypeError, "<class 'int'> object is not callable"):
            kdf.groupby("b").transform(1)

        # multi-index columns
        columns = pd.MultiIndex.from_tuples([('x', 'a'), ('x', 'b'), ('y', 'c')])
        pdf.columns = columns
        kdf.columns = columns

        self.assert_eq(
            kdf.groupby(("x", "b")).transform(lambda x: x + 1).sort_index(),
            pdf.groupby(("x", "b")).transform(lambda x: x + 1).sort_index())
        self.assert_eq(
            kdf.groupby([('x', 'a'), ('x', 'b')]).transform(lambda x: x * x).sort_index(),
            pdf.groupby([('x', 'a'), ('x', 'b')]).transform(lambda x: x * x).sort_index())
    finally:
        reset_option('compute.shortcut_limit')
import dask.dataframe as dd

dask_dataframe = dd.from_pandas(raw_data, npartitions=1)

try:
    CleanData(dask_dataframe, lazy=True).compute()
except pa.errors.SchemaErrors as exc:
    print(exc.failure_cases)

# %% [markdown] slideshow={"slide_type": "slide"}
# #### Koalas

# %%
import databricks.koalas as ks

koalas_dataframe = ks.DataFrame(raw_data)

try:
    CleanData(koalas_dataframe, lazy=True).compute()
except pa.errors.SchemaErrors as exc:
    print(exc.failure_cases)

# %% [markdown] slideshow={"slide_type": "slide"}
# #### Modin

# %%
import modin.pandas as mpd

modin_dataframe = mpd.DataFrame(raw_data)

try:
regr = MLPRegressor(max_iter=10, hidden_layer_sizes=(100, 50, 25, 10, 5), verbose=True)
regr.fit(X_train, y_train)
mlflow.sklearn.log_model(regr, "model")

#### Our model is trained, so we can use it on Koalas DataFrames
from databricks.koalas.mlflow import load_model

run_info = client.list_run_infos(exp)[-1]
model = load_model("runs:/{run_id}/model".format(run_id=run_info.run_uuid))

# Prediction and score
df = ks.DataFrame(X_test)
df["prediction"] = model.predict(df)

stop = datetime.now()
print("Preparation and inference time (ML): ", (stop - start).seconds, "s")

# %%
##### 7th change: we therefore have to recompute the score ourselves
from databricks.koalas.config import set_option, reset_option
set_option("compute.ops_on_diff_frames", True)

# Score: the coefficient R^2 is defined as (1 - u/v), where u is the residual sum of
# squares ((y_true - y_pred) ** 2).sum() and v is the total sum of squares
# ((y_true - y_true.mean()) ** 2).sum()
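A minimal sketch (assuming `y_test` holds the true targets aligned row-for-row with `X_test`; the names `y_true`/`y_pred` are illustrative) of recomputing R^2 by hand on Koalas objects. `compute.ops_on_diff_frames` was enabled above precisely so that the two series can be combined:

# %%
y_true = ks.Series(list(y_test))
y_pred = df["prediction"]
u = ((y_true - y_pred) ** 2).sum()          # residual sum of squares
v = ((y_true - y_true.mean()) ** 2).sum()   # total sum of squares
print("R^2:", 1 - u / v)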
try:
    userRoles = userSubType
except KeyError:
    userRoles = ''

try:
    if userRoles:
        userInfoObj["designation"] = userRoles
except KeyError:
    userInfoObj["designation"] = ''

try:
    userInfoObj["organisation_name"] = userObj["orgname"]
except KeyError:
    userInfoObj["organisation_name"] = ''

user_info_arr.append(userInfoObj)

user_df = ks.DataFrame(user_info_arr)
user_df = user_df.to_spark()

final_projects_df = projects_df_cols.join(
    user_df, projects_df_cols["createdBy"] == user_df["id"], "inner"
).drop(user_df["id"])
final_projects_df = final_projects_df.dropDuplicates()
final_projects_df.coalesce(1).write.format("json").mode("overwrite").save(
    config.get("OUTPUT_DIR", "projects_folder") + "/"
)

for filename in os.listdir(config.get("OUTPUT_DIR", "projects_folder") + "/"):
def _parse_schema_errors(schema_errors: List[Dict[str, Any]]):
    """Parse schema error dicts to produce data for the error message."""
    error_counts = defaultdict(int)  # type: ignore
    check_failure_cases = []

    column_order = [
        "schema_context",
        "column",
        "check",
        "check_number",
        "failure_case",
        "index",
    ]

    for schema_error_dict in schema_errors:
        reason_code = schema_error_dict["reason_code"]
        err = schema_error_dict["error"]

        error_counts[reason_code] += 1
        check_identifier = (
            None
            if err.check is None
            else err.check
            if isinstance(err.check, str)
            else err.check.error
            if err.check.error is not None
            else err.check.name
            if err.check.name is not None
            else str(err.check)
        )

        if err.failure_cases is not None:
            if "column" in err.failure_cases:
                column = err.failure_cases["column"]
            else:
                column = (
                    err.schema.name
                    if reason_code == "schema_component_check"
                    else None
                )

            failure_cases = err.failure_cases.assign(
                schema_context=err.schema.__class__.__name__,
                check=check_identifier,
                check_number=err.check_index,
                # If the column key is a tuple (for MultiIndex column names),
                # explicitly wrap `column` in a list of the same length as the
                # number of failure cases.
                column=(
                    [column] * err.failure_cases.shape[0]
                    if isinstance(column, tuple)
                    else column
                ),
            )
            check_failure_cases.append(failure_cases[column_order])

    # NOTE: this is a hack to support koalas and modin
    concat_fn = pd.concat
    if any(
            type(x).__module__.startswith("databricks.koalas")
            for x in check_failure_cases):
        # pylint: disable=import-outside-toplevel
        import databricks.koalas as ks

        concat_fn = ks.concat
        check_failure_cases = [
            x if isinstance(x, ks.DataFrame) else ks.DataFrame(x)
            for x in check_failure_cases
        ]
    elif any(
            type(x).__module__.startswith("modin.pandas")
            for x in check_failure_cases):
        # pylint: disable=import-outside-toplevel
        import modin.pandas as mpd

        concat_fn = mpd.concat
        check_failure_cases = [
            x if isinstance(x, mpd.DataFrame) else mpd.DataFrame(x)
            for x in check_failure_cases
        ]

    failure_cases = (
        concat_fn(check_failure_cases)
        .reset_index(drop=True)
        .sort_values("schema_context", ascending=False)
        .drop_duplicates()
    )
    return error_counts, failure_cases
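A small illustration (with hypothetical frames) of why the `concat_fn` dispatch above is needed: `pd.concat` does not accept Koalas objects, so the Koalas variant is swapped in whenever any failure-case frame comes from `databricks.koalas`.

import databricks.koalas as ks

parts = [ks.DataFrame({"failure_case": ["a"]}), ks.DataFrame({"failure_case": ["b"]})]
# ks.concat plays the role here that pd.concat plays for plain pandas frames.
combined = ks.concat(parts).reset_index(drop=True)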
                                          'output_prefix',
                                          'region'])
logger.info(f'Resolved options are: {args}')

TRANSACTION_ID = 'TransactionID'

transactions = glueContext.create_dynamic_frame.from_catalog(
    database=args['database'], table_name=args['transaction_table'])
identities = glueContext.create_dynamic_frame.from_catalog(
    database=args['database'], table_name=args['identity_table'])
s3 = boto3.resource('s3', region_name=args['region'])

train_data_ratio = 0.8

# extract out transactions for test/validation
n_train = int(transactions.count() * train_data_ratio)
test_ids = ks.DataFrame(transactions.select_fields(TRANSACTION_ID).toDF())[n_train:].to_spark()

get_fraud_frac = lambda series: 100 * sum(series) / len(series)
isfraud_df: DynamicFrame = transactions.select_fields("isFraud")
logger.info("Percent fraud for train transactions: {}".format(
    sum_col(transactions.toDF(), "isFraud")))

dump_df_to_s3(test_ids, 'test', header=False)

id_cols = args['id_cols']
cat_cols = args['cat_cols']
features_df, labels_df = get_features_and_labels(transactions.toDF(), id_cols, cat_cols)

logger.info('Dumping features and labels for training...')
dump_df_to_s3(features_df, 'features')
dump_df_to_s3(labels_df, 'tags')

features_graph_df = features_df.withColumn(
    'props_values:String',
    to_json(struct(list(filter(lambda x: (x != TRANSACTION_ID), features_df.schema.names)))))
features_graph_df = features_graph_df.select('TransactionID', 'props_values:String')
Forums_and_Messages_Exploded.head(5)

# COMMAND ----------

DiscScores = Forums_and_Messages_Exploded.groupby(['UserId', 'Name']).size().reset_index(name='Count')
DiscScores.head(5)

# COMMAND ----------

DiscScores['DiscScore'] = DiscScores.groupby('Name')['Count'].rank(pct=True)
DiscScores = DiscScores.drop(['Count'], axis='columns')

# COMMAND ----------

# convert competition scores and kernel scores to pandas dfs
CompScores_df = ks.DataFrame(CompScores).to_pandas()
KernelScores_df = ks.DataFrame(KernelScores).to_pandas()

# COMMAND ----------

DiscScores.head(3)

# COMMAND ----------

CompScores_df.head(3)

# COMMAND ----------

KernelScores_df.head(3)

# COMMAND ----------
def test_missing(self):
    kdf = koalas.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7, 8, 9]})

    # DataFrameGroupBy functions
    missing_functions = inspect.getmembers(
        _MissingPandasLikeDataFrameGroupBy, inspect.isfunction)
    unsupported_functions = [
        name for (name, type_) in missing_functions
        if type_.__name__ == 'unsupported_function'
    ]
    for name in unsupported_functions:
        with self.assertRaisesRegex(
                PandasNotImplementedError,
                "method.*GroupBy.*{}.*not implemented( yet\\.|\\. .+)".format(name)):
            getattr(kdf.groupby('a'), name)()

    deprecated_functions = [
        name for (name, type_) in missing_functions
        if type_.__name__ == 'deprecated_function'
    ]
    for name in deprecated_functions:
        with self.assertRaisesRegex(
                PandasNotImplementedError,
                "method.*GroupBy.*{}.*is deprecated".format(name)):
            getattr(kdf.groupby('a'), name)()

    # SeriesGroupBy functions
    missing_functions = inspect.getmembers(
        _MissingPandasLikeSeriesGroupBy, inspect.isfunction)
    unsupported_functions = [
        name for (name, type_) in missing_functions
        if type_.__name__ == 'unsupported_function'
    ]
    for name in unsupported_functions:
        with self.assertRaisesRegex(
                PandasNotImplementedError,
                "method.*GroupBy.*{}.*not implemented( yet\\.|\\. .+)".format(name)):
            getattr(kdf.a.groupby('a'), name)()

    deprecated_functions = [
        name for (name, type_) in missing_functions
        if type_.__name__ == 'deprecated_function'
    ]
    for name in deprecated_functions:
        with self.assertRaisesRegex(
                PandasNotImplementedError,
                "method.*GroupBy.*{}.*is deprecated".format(name)):
            getattr(kdf.a.groupby('a'), name)()

    # DataFrameGroupBy properties
    missing_properties = inspect.getmembers(
        _MissingPandasLikeDataFrameGroupBy, lambda o: isinstance(o, property))
    unsupported_properties = [
        name for (name, type_) in missing_properties
        if type_.fget.__name__ == 'unsupported_property'
    ]
    for name in unsupported_properties:
        with self.assertRaisesRegex(
                PandasNotImplementedError,
                "property.*GroupBy.*{}.*not implemented( yet\\.|\\. .+)".format(name)):
            getattr(kdf.groupby('a'), name)

    deprecated_properties = [
        name for (name, type_) in missing_properties
        if type_.fget.__name__ == 'deprecated_property'
    ]
    for name in deprecated_properties:
        with self.assertRaisesRegex(
                PandasNotImplementedError,
                "property.*GroupBy.*{}.*is deprecated".format(name)):
            getattr(kdf.groupby('a'), name)

    # SeriesGroupBy properties
    missing_properties = inspect.getmembers(
        _MissingPandasLikeSeriesGroupBy, lambda o: isinstance(o, property))
    unsupported_properties = [
        name for (name, type_) in missing_properties
        if type_.fget.__name__ == 'unsupported_property'
    ]
    for name in unsupported_properties:
        with self.assertRaisesRegex(
                PandasNotImplementedError,
                "property.*GroupBy.*{}.*not implemented( yet\\.|\\. .+)".format(name)):
            getattr(kdf.a.groupby('a'), name)

    deprecated_properties = [
        name for (name, type_) in missing_properties
        if type_.fget.__name__ == 'deprecated_property'
    ]
    for name in deprecated_properties:
        with self.assertRaisesRegex(
                PandasNotImplementedError,
                "property.*GroupBy.*{}.*is deprecated".format(name)):
            getattr(kdf.a.groupby('a'), name)
def test_no_matched_index(self):
    with self.assertRaisesRegex(ValueError, "Index names must be exactly matched"):
        ks.DataFrame({"a": [1, 2, 3]}).set_index("a") + ks.DataFrame(
            {"b": [1, 2, 3]}
        ).set_index("b")
    .option('header', True)\
    .option('sep', ';')\
    .schema("id INT, name STRING, surname STRING, age INT")\
    .load('user.csv')

import databricks.koalas as ks

df = ks.read_csv('user.csv', sep=";", header=0)
print(df[df['age'] > 30])

# Koalas to Spark
df.to_spark()
# Koalas to pandas
pdf = df.to_pandas()
# pandas to Koalas
ks.from_pandas(pdf)
# Spark to Koalas
ks.DataFrame(df.to_spark())

from pyspark.sql.types import *

schema = StructType(
    [StructField('id', IntegerType()),
     StructField('name', StringType()),
     StructField('surname', StringType()),
     StructField('age', IntegerType(), False)])

df = spark.read.format('csv').option('header', 'True').option('sep', ';').schema(schema).load(
    'user.csv')
df.show()

df.groupby('surname').pivot('name').agg(avg(col('age'))).show()
df.filter(col('surname').isNull()).show()

df_aux = df.filter(col('age') < 30)
# Koalas
import databricks.koalas as ks

kdf = ks.read_parquet(
    "/databricks-datasets/learning-spark-v2/sf-airbnb/sf-airbnb-clean.parquet")
kdf.head()

# COMMAND ----------

# MAGIC %md
# MAGIC ### Converting a Koalas DataFrame to/from a Spark DataFrame

# COMMAND ----------

# Creating a Koalas DataFrame from a PySpark DataFrame
kdf = ks.DataFrame(df)

# COMMAND ----------

# Alternative way of creating a Koalas DataFrame from a PySpark DataFrame
kdf = df.to_koalas()

# COMMAND ----------

# MAGIC %md
# MAGIC ### Value Counts

# COMMAND ----------

# To get value counts of the different property types with PySpark
display(df.groupby("property_type").count().orderBy("count", ascending=False))
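# COMMAND ----------

# A possible Koalas equivalent of the PySpark value counts above (a sketch;
# `property_type` is the column used in the preceding cell)
kdf["property_type"].value_counts()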
def test_default_index_distributed_sequence(self):
    with ks.option_context("compute.default_index_type", "distributed-sequence"):
        sdf = self.spark.range(1000)
        self.assert_eq(ks.DataFrame(sdf), pd.DataFrame({"id": list(range(1000))}))
def test_dot(self):
    pser = pd.Series([90, 91, 85], index=[2, 4, 1])
    kser = ks.from_pandas(pser)

    pser_other = pd.Series([90, 91, 85], index=[2, 4, 1])
    kser_other = ks.from_pandas(pser_other)
    self.assert_eq(kser.dot(kser_other), pser.dot(pser_other))

    kser_other = ks.Series([90, 91, 85], index=[1, 2, 4])
    pser_other = pd.Series([90, 91, 85], index=[1, 2, 4])
    self.assert_eq(kser.dot(kser_other), pser.dot(pser_other))

    # length of index is different
    kser_other = ks.Series([90, 91, 85, 100], index=[2, 4, 1, 0])
    with self.assertRaisesRegex(ValueError, "matrices are not aligned"):
        kser.dot(kser_other)

    # for MultiIndex
    midx = pd.MultiIndex(
        [["lama", "cow", "falcon"], ["speed", "weight", "length"]],
        [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]],
    )
    pser = pd.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], index=midx)
    kser = ks.from_pandas(pser)
    pser_other = pd.Series([-450, 20, 12, -30, -250, 15, -320, 100, 3], index=midx)
    kser_other = ks.from_pandas(pser_other)
    self.assert_eq(kser.dot(kser_other), pser.dot(pser_other))

    pser = pd.Series([0, 1, 2, 3])
    kser = ks.from_pandas(pser)

    # DataFrame "other" without Index/MultiIndex as columns
    pdf = pd.DataFrame([[0, 1], [-2, 3], [4, -5], [6, 7]])
    kdf = ks.from_pandas(pdf)
    self.assert_eq(kser.dot(kdf), pser.dot(pdf))

    # DataFrame "other" with Index as columns
    pdf.columns = pd.Index(["x", "y"])
    kdf = ks.from_pandas(pdf)
    self.assert_eq(kser.dot(kdf), pser.dot(pdf))

    pdf.columns = pd.Index(["x", "y"], name="cols_name")
    kdf = ks.from_pandas(pdf)
    self.assert_eq(kser.dot(kdf), pser.dot(pdf))

    pdf = pdf.reindex([1, 0, 2, 3])
    kdf = ks.from_pandas(pdf)
    self.assert_eq(kser.dot(kdf), pser.dot(pdf))

    # DataFrame "other" with MultiIndex as columns
    pdf.columns = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y")])
    kdf = ks.from_pandas(pdf)
    self.assert_eq(kser.dot(kdf), pser.dot(pdf))

    pdf.columns = pd.MultiIndex.from_tuples(
        [("a", "x"), ("b", "y")], names=["cols_name1", "cols_name2"])
    kdf = ks.from_pandas(pdf)
    self.assert_eq(kser.dot(kdf), pser.dot(pdf))

    kser = ks.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}).b
    pser = kser.to_pandas()
    kdf = ks.DataFrame({"c": [7, 8, 9]})
    pdf = kdf.to_pandas()
    self.assert_eq(kser.dot(kdf), pser.dot(pdf))
def test_missing(self):
    kdf = ks.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})

    # Index functions
    missing_functions = inspect.getmembers(_MissingPandasLikeIndex, inspect.isfunction)
    unsupported_functions = [
        name for (name, type_) in missing_functions
        if type_.__name__ == 'unsupported_function'
    ]
    for name in unsupported_functions:
        with self.assertRaisesRegex(
                PandasNotImplementedError,
                "method.*Index.*{}.*not implemented( yet\\.|\\. .+)".format(name)):
            getattr(kdf.set_index('a').index, name)()

    deprecated_functions = [
        name for (name, type_) in missing_functions
        if type_.__name__ == 'deprecated_function'
    ]
    for name in deprecated_functions:
        with self.assertRaisesRegex(
                PandasNotImplementedError,
                "method.*Index.*{}.*is deprecated".format(name)):
            getattr(kdf.set_index('a').index, name)()

    # MultiIndex functions
    missing_functions = inspect.getmembers(_MissingPandasLikeMultiIndex, inspect.isfunction)
    unsupported_functions = [
        name for (name, type_) in missing_functions
        if type_.__name__ == 'unsupported_function'
    ]
    for name in unsupported_functions:
        with self.assertRaisesRegex(
                PandasNotImplementedError,
                "method.*Index.*{}.*not implemented( yet\\.|\\. .+)".format(name)):
            getattr(kdf.set_index(['a', 'b']).index, name)()

    deprecated_functions = [
        name for (name, type_) in missing_functions
        if type_.__name__ == 'deprecated_function'
    ]
    for name in deprecated_functions:
        with self.assertRaisesRegex(
                PandasNotImplementedError,
                "method.*Index.*{}.*is deprecated".format(name)):
            getattr(kdf.set_index(['a', 'b']).index, name)()

    # Index properties
    missing_properties = inspect.getmembers(
        _MissingPandasLikeIndex, lambda o: isinstance(o, property))
    unsupported_properties = [
        name for (name, type_) in missing_properties
        if type_.fget.__name__ == 'unsupported_property'
    ]
    for name in unsupported_properties:
        with self.assertRaisesRegex(
                PandasNotImplementedError,
                "property.*Index.*{}.*not implemented( yet\\.|\\. .+)".format(name)):
            getattr(kdf.set_index('a').index, name)

    deprecated_properties = [
        name for (name, type_) in missing_properties
        if type_.fget.__name__ == 'deprecated_property'
    ]
    for name in deprecated_properties:
        with self.assertRaisesRegex(
                PandasNotImplementedError,
                "property.*Index.*{}.*is deprecated".format(name)):
            getattr(kdf.set_index('a').index, name)

    # MultiIndex properties
    missing_properties = inspect.getmembers(
        _MissingPandasLikeMultiIndex, lambda o: isinstance(o, property))
    unsupported_properties = [
        name for (name, type_) in missing_properties
        if type_.fget.__name__ == 'unsupported_property'
    ]
    for name in unsupported_properties:
        with self.assertRaisesRegex(
                PandasNotImplementedError,
                "property.*Index.*{}.*not implemented( yet\\.|\\. .+)".format(name)):
            getattr(kdf.set_index(['a', 'b']).index, name)

    deprecated_properties = [
        name for (name, type_) in missing_properties
        if type_.fget.__name__ == 'deprecated_property'
    ]
    for name in deprecated_properties:
        with self.assertRaisesRegex(
                PandasNotImplementedError,
                "property.*Index.*{}.*is deprecated".format(name)):
            getattr(kdf.set_index(['a', 'b']).index, name)
def test_merge(self):
    left_kdf = koalas.DataFrame({'A': [1, 2]})
    right_kdf = koalas.DataFrame({'B': ['x', 'y']}, index=[1, 2])

    # Assert that only 'on' or 'left_index' and 'right_index' parameters are set
    msg = "At least 'on' or 'left_index' and 'right_index' have to be set"
    with self.assertRaises(SparkPandasMergeError, msg=msg):
        left_kdf.merge(right_kdf)
    msg = "Only 'on' or 'left_index' and 'right_index' can be set"
    with self.assertRaises(SparkPandasMergeError, msg=msg):
        left_kdf.merge(right_kdf, on='id', left_index=True)

    # Assert that a valid option for the 'how' parameter is used
    msg = ("The 'how' parameter has to be amongst the following values: "
           "['inner', 'left', 'right', 'full', 'outer']")
    with self.assertRaises(ValueError, msg=msg):
        left_kdf.merge(right_kdf, how='foo', left_index=True, right_index=True)

    # Assert inner join
    res = left_kdf.merge(right_kdf, left_index=True, right_index=True)
    self.assert_eq(res, pd.DataFrame({'A': [2], 'B': ['x']}))

    # Assert inner join on a non-default column
    left_kdf_with_id = koalas.DataFrame({'A': [1, 2], 'id': [0, 1]})
    right_kdf_with_id = koalas.DataFrame({'B': ['x', 'y'], 'id': [0, 1]}, index=[1, 2])
    res = left_kdf_with_id.merge(right_kdf_with_id, on='id')
    # Explicitly set columns to also assure their correct order with Python 3.5
    self.assert_eq(
        res,
        pd.DataFrame({'A': [1, 2], 'id': [0, 1], 'B': ['x', 'y']},
                     columns=['A', 'id', 'B']))

    # Assert left join
    res = left_kdf.merge(right_kdf, left_index=True, right_index=True, how='left')
    # FIXME Replace None with np.nan once #263 is solved
    self.assert_eq(res, pd.DataFrame({'A': [1, 2], 'B': [None, 'x']}))

    # Assert right join
    res = left_kdf.merge(right_kdf, left_index=True, right_index=True, how='right')
    self.assert_eq(res, pd.DataFrame({'A': [2, np.nan], 'B': ['x', 'y']}))

    # Assert full outer join
    res = left_kdf.merge(right_kdf, left_index=True, right_index=True, how='outer')
    # FIXME Replace None with np.nan once #263 is solved
    self.assert_eq(res, pd.DataFrame({'A': [1, 2, np.nan], 'B': [None, 'x', 'y']}))

    # Assert full outer join also works with the 'full' keyword
    res = left_kdf.merge(right_kdf, left_index=True, right_index=True, how='full')
    # FIXME Replace None with np.nan once #263 is solved
    self.assert_eq(res, pd.DataFrame({'A': [1, 2, np.nan], 'B': [None, 'x', 'y']}))

    # Assert suffixes create the expected column names
    res = left_kdf.merge(koalas.DataFrame({'A': [3, 4]}),
                         left_index=True, right_index=True,
                         suffixes=('_left', '_right'))
    self.assert_eq(res, pd.DataFrame({'A_left': [1, 2], 'A_right': [3, 4]}))
def test_loc(self):
    kdf = self.kdf
    pdf = self.pdf

    self.assert_eq(kdf.loc[5:5], pdf.loc[5:5])
    self.assert_eq(kdf.loc[3:8], pdf.loc[3:8])
    self.assert_eq(kdf.loc[:8], pdf.loc[:8])
    self.assert_eq(kdf.loc[3:], pdf.loc[3:])
    self.assert_eq(kdf.loc[[5]], pdf.loc[[5]])
    self.assert_eq(kdf.loc[:], pdf.loc[:])

    # TODO?: self.assert_eq(kdf.loc[[3, 4, 1, 8]], pdf.loc[[3, 4, 1, 8]])
    # TODO?: self.assert_eq(kdf.loc[[3, 4, 1, 9]], pdf.loc[[3, 4, 1, 9]])
    # TODO?: self.assert_eq(kdf.loc[np.array([3, 4, 1, 9])], pdf.loc[np.array([3, 4, 1, 9])])

    self.assert_eq(kdf.a.loc[5:5], pdf.a.loc[5:5])
    self.assert_eq(kdf.a.loc[3:8], pdf.a.loc[3:8])
    self.assert_eq(kdf.a.loc[:8], pdf.a.loc[:8])
    self.assert_eq(kdf.a.loc[3:], pdf.a.loc[3:])
    self.assert_eq(kdf.a.loc[[5]], pdf.a.loc[[5]])

    # TODO?: self.assert_eq(kdf.a.loc[[3, 4, 1, 8]], pdf.a.loc[[3, 4, 1, 8]])
    # TODO?: self.assert_eq(kdf.a.loc[[3, 4, 1, 9]], pdf.a.loc[[3, 4, 1, 9]])
    # TODO?: self.assert_eq(kdf.a.loc[np.array([3, 4, 1, 9])],
    #                       pdf.a.loc[np.array([3, 4, 1, 9])])

    self.assert_eq(kdf.a.loc[[]], pdf.a.loc[[]])
    self.assert_eq(kdf.a.loc[np.array([])], pdf.a.loc[np.array([])])

    self.assert_eq(kdf.loc[1000:], pdf.loc[1000:])
    self.assert_eq(kdf.loc[-2000:-1000], pdf.loc[-2000:-1000])

    self.assert_eq(kdf.loc[5], pdf.loc[5])
    self.assert_eq(kdf.loc[9], pdf.loc[9])
    self.assert_eq(kdf.a.loc[5], pdf.a.loc[5])
    self.assert_eq(kdf.a.loc[9], pdf.a.loc[9])

    self.assertRaises(KeyError, lambda: kdf.loc[10])
    self.assertRaises(KeyError, lambda: kdf.a.loc[10])

    # monotonically increasing index test
    pdf = pd.DataFrame({"a": [1, 2, 3, 4, 5, 6, 7, 8, 9]},
                       index=[0, 1, 1, 2, 2, 2, 4, 5, 6])
    kdf = ks.from_pandas(pdf)

    self.assert_eq(kdf.loc[:2], pdf.loc[:2])
    self.assert_eq(kdf.loc[:3], pdf.loc[:3])
    self.assert_eq(kdf.loc[3:], pdf.loc[3:])
    self.assert_eq(kdf.loc[4:], pdf.loc[4:])
    self.assert_eq(kdf.loc[3:2], pdf.loc[3:2])
    self.assert_eq(kdf.loc[-1:2], pdf.loc[-1:2])
    self.assert_eq(kdf.loc[3:10], pdf.loc[3:10])

    # monotonically decreasing index test
    pdf = pd.DataFrame({"a": [1, 2, 3, 4, 5, 6, 7, 8, 9]},
                       index=[6, 5, 5, 4, 4, 4, 2, 1, 0])
    kdf = ks.from_pandas(pdf)

    self.assert_eq(kdf.loc[:4], pdf.loc[:4])
    self.assert_eq(kdf.loc[:3], pdf.loc[:3])
    self.assert_eq(kdf.loc[3:], pdf.loc[3:])
    self.assert_eq(kdf.loc[2:], pdf.loc[2:])
    self.assert_eq(kdf.loc[2:3], pdf.loc[2:3])
    self.assert_eq(kdf.loc[2:-1], pdf.loc[2:-1])
    self.assert_eq(kdf.loc[10:3], pdf.loc[10:3])

    # test when the key type is string and the given value is not in the index
    pdf = pd.DataFrame({"a": [1, 2, 3]}, index=["a", "b", "d"])
    kdf = ks.from_pandas(pdf)

    self.assert_eq(kdf.loc["a":"z"], pdf.loc["a":"z"])

    # KeyError when the index is not monotonically increasing or decreasing
    # and the specified values don't exist in the index
    kdf = ks.DataFrame([[1, 2], [4, 5], [7, 8]],
                       index=["cobra", "viper", "sidewinder"])
    self.assertRaises(KeyError, lambda: kdf.loc["cobra":"koalas"])
    self.assertRaises(KeyError, lambda: kdf.loc["koalas":"viper"])

    kdf = ks.DataFrame([[1, 2], [4, 5], [7, 8]], index=[10, 30, 20])
    self.assertRaises(KeyError, lambda: kdf.loc[0:30])
    self.assertRaises(KeyError, lambda: kdf.loc[10:100])
def test_ranges(self):
    self.assert_eq(
        (ks.range(10) + ks.range(10)).sort_index(),
        (ks.DataFrame({'id': list(range(10))})
         + ks.DataFrame({'id': list(range(10))})).sort_index())
def test_default_index(self):
    sdf = self.spark.range(1000)
    pdf = ks.DataFrame(sdf).to_pandas()
    self.assertEqual(len(set(pdf.index)), len(pdf))