Code example #1
    def test_parquet_write(self):
        with self.temp_dir() as tmp:
            pdf = self.test_pdf
            expected = ks.DataFrame(pdf)

            # Write out partitioned by one column
            expected.to_parquet(tmp, mode="overwrite", partition_cols="i32")
            # Reset column order, as once the data is written out, Spark rearranges partition
            # columns to appear first.
            actual = ks.read_parquet(tmp)[self.test_column_order]
            self.assert_eq(
                actual.sort_values(by="f").to_spark().toPandas(),
                expected.sort_values(by="f").to_spark().toPandas(),
            )

            # Write out partitioned by two columns
            expected.to_parquet(tmp, mode="overwrite", partition_cols=["i32", "bhello"])
            # Reset column order, as once the data is written out, Spark rearranges partition
            # columns to appear first.
            actual = ks.read_parquet(tmp)[self.test_column_order]
            self.assert_eq(
                actual.sort_values(by="f").to_spark().toPandas(),
                expected.sort_values(by="f").to_spark().toPandas(),
            )
Code example #2
    def test_dot(self):
        kser = ks.Series([90, 91, 85], index=[2, 4, 1])
        pser = kser.to_pandas()
        kser_other = ks.Series([90, 91, 85], index=[2, 4, 1])
        pser_other = kser_other.to_pandas()

        self.assert_eq(kser.dot(kser_other), pser.dot(pser_other))

        kser_other = ks.Series([90, 91, 85], index=[1, 2, 4])
        pser_other = pd.Series([90, 91, 85], index=[1, 2, 4])

        self.assert_eq(kser.dot(kser_other), pser.dot(pser_other))

        # length of index is different
        kser_other = ks.Series([90, 91, 85, 100], index=[2, 4, 1, 0])
        with self.assertRaisesRegex(ValueError, "matrices are not aligned"):
            kser.dot(kser_other)

        # dot with a DataFrame is not supported for now due to a performance issue;
        # we raise a ValueError with a proper message instead.
        kdf = ks.DataFrame([[0, 1], [-2, 3], [4, -5]], index=[2, 4, 1])

        with self.assertRaisesRegex(ValueError, r"Series\.dot\(\) is currently not supported*"):
            kser.dot(kdf)

        # for MultiIndex
        midx = pd.MultiIndex(
            [["lama", "cow", "falcon"], ["speed", "weight", "length"]],
            [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]],
        )
        kser = ks.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], index=midx)
        pser = kser.to_pandas()
        kser_other = ks.Series([-450, 20, 12, -30, -250, 15, -320, 100, 3], index=midx)
        pser_other = kser_other.to_pandas()

        self.assert_eq(kser.dot(kser_other), pser.dot(pser_other))
Code example #3
 def test_default_index(self):
     sdf = self.spark.range(1000)
     self.assert_eq(
         ks.DataFrame(sdf).sort_index(),
         pd.DataFrame({'id': list(range(1000))}))
Code example #4
File: test_indexes.py  Project: kinghows/koalas
    def test_index_drop(self):
        pidx = pd.DataFrame({'a': ['a', 'b', 'c']}, index=[1, 2, 3]).index
        kidx = ks.DataFrame({'a': ['a', 'b', 'c']}, index=[1, 2, 3]).index

        self.assert_eq(pidx.drop(1), kidx.drop(1))
        self.assert_eq(pidx.drop([1, 2]), kidx.drop([1, 2]))
Code example #5
 def test_default_index_distributed(self):
     with ks.option_context("compute.default_index_type", "distributed"):
         sdf = self.spark.range(1000)
         pdf = ks.DataFrame(sdf).to_pandas()
         self.assertEqual(len(set(pdf.index)), len(pdf))
Code example #6
 def test_add_suffix(self):
     pdf = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [3, 4, 5, 6]})
     kdf = ks.DataFrame({'A': [1, 2, 3, 4], 'B': [3, 4, 5, 6]})
     self.assert_eq(pdf.add_suffix('_col'), kdf.add_suffix('_col'))
Code example #7
File: test_indexes.py  Project: digitalminellc/koalas
    def test_index_drop(self):
        pidx = pd.DataFrame({"a": ["a", "b", "c"]}, index=[1, 2, 3]).index
        kidx = ks.DataFrame({"a": ["a", "b", "c"]}, index=[1, 2, 3]).index

        self.assert_eq(pidx.drop(1), kidx.drop(1))
        self.assert_eq(pidx.drop([1, 2]), kidx.drop([1, 2]))
Code example #8
File: accessors.py  Project: Rasha27/koalas
    def apply_batch(self, func, args=(), **kwds):
        """
        Apply a function that takes pandas DataFrame and outputs pandas DataFrame. The pandas
        DataFrame given to the function is of a batch used internally.

        See also `Transform and apply a function
        <https://koalas.readthedocs.io/en/latest/user_guide/transform_apply.html>`_.

        .. note:: the `func` is unable to access the whole input frame. Koalas internally
            splits the input frame into multiple batches and calls `func` with each batch
            multiple times. Therefore, operations such as global aggregations are impossible.
            See the example below.

            >>> # This case does not return the length of the whole frame but the length of
            ... # each batch used internally.
            ... def length(pdf) -> ks.DataFrame[int]:
            ...     return pd.DataFrame([len(pdf)])
            ...
            >>> df = ks.DataFrame({'A': range(1000)})
            >>> df.koalas.apply_batch(length)  # doctest: +SKIP
                c0
            0   83
            1   83
            2   83
            ...
            10  83
            11  83

        .. note:: this API executes the function once to infer the type which is
            potentially expensive, for instance, when the dataset is created after
            aggregations or sorting.

            To avoid this, specify return type in ``func``, for instance, as below:

            >>> def plus_one(x) -> ks.DataFrame[float, float]:
            ...     return x + 1

            If the return type is specified, the output column names become
            `c0, c1, c2 ... cn`. These names are positionally mapped to the returned
            DataFrame in ``func``.

            To specify the column names, you can assign them in a pandas friendly style as below:

            >>> def plus_one(x) -> ks.DataFrame["a": float, "b": float]:
            ...     return x + 1

            >>> pdf = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 4, 5]})
            >>> def plus_one(x) -> ks.DataFrame[zip(pdf.dtypes, pdf.columns)]:
            ...     return x + 1


        Parameters
        ----------
        func : function
            Function to apply to each pandas frame.
        args : tuple
            Positional arguments to pass to `func` in addition to the
            DataFrame.
        **kwds
            Additional keyword arguments to pass as keyword arguments to
            `func`.

        Returns
        -------
        DataFrame

        See Also
        --------
        DataFrame.apply: For row/columnwise operations.
        DataFrame.applymap: For elementwise operations.
        DataFrame.aggregate: Only perform aggregating type operations.
        DataFrame.transform: Only perform transforming type operations.
        Series.koalas.transform_batch: transform the data as pandas Series chunks.

        Examples
        --------
        >>> df = ks.DataFrame([(1, 2), (3, 4), (5, 6)], columns=['A', 'B'])
        >>> df
           A  B
        0  1  2
        1  3  4
        2  5  6

        >>> def query_func(pdf) -> ks.DataFrame[int, int]:
        ...     return pdf.query('A == 1')
        >>> df.koalas.apply_batch(query_func)
           c0  c1
        0   1   2

        >>> def query_func(pdf) -> ks.DataFrame["A": int, "B": int]:
        ...     return pdf.query('A == 1')
        >>> df.koalas.apply_batch(query_func)
           A  B
        0  1  2

        You can also omit the type hints so Koalas infers the return schema as below:

        >>> df.koalas.apply_batch(lambda pdf: pdf.query('A == 1'))
           A  B
        0  1  2

        You can also specify extra arguments.

        >>> def calculation(pdf, y, z) -> ks.DataFrame[int, int]:
        ...     return pdf ** y + z
        >>> df.koalas.apply_batch(calculation, args=(10,), z=20)
                c0        c1
        0       21      1044
        1    59069   1048596
        2  9765645  60466196

        You can also use ``np.ufunc`` and built-in functions as input.

        >>> df.koalas.apply_batch(np.add, args=(10,))
            A   B
        0  11  12
        1  13  14
        2  15  16

        >>> (df * -1).koalas.apply_batch(abs)
           A  B
        0  1  2
        1  3  4
        2  5  6

        """
        # TODO: codes here partially duplicate `DataFrame.apply`. Can we deduplicate?

        from databricks.koalas.groupby import GroupBy
        from databricks.koalas.frame import DataFrame
        from databricks import koalas as ks

        if not isinstance(func, types.FunctionType):
            assert callable(
                func), "the first argument should be a callable function."
            f = func
            func = lambda *args, **kwargs: f(*args, **kwargs)

        spec = inspect.getfullargspec(func)
        return_sig = spec.annotations.get("return", None)
        should_infer_schema = return_sig is None
        should_use_map_in_pandas = LooseVersion(pyspark.__version__) >= "3.0"

        original_func = func
        func = lambda o: original_func(o, *args, **kwds)

        self_applied = DataFrame(self._kdf._internal.resolved_copy)

        if should_infer_schema:
            # Here we execute the function on the first `limit` rows to infer the return type.
            # If there are no more rows than the limit, the pandas result is returned directly as a shortcut.
            limit = ks.get_option("compute.shortcut_limit")
            pdf = self_applied.head(limit + 1)._to_internal_pandas()
            applied = func(pdf)
            if not isinstance(applied, pd.DataFrame):
                raise ValueError(
                    "The given function should return a frame; however, "
                    "the return type was %s." % type(applied))
            kdf = ks.DataFrame(applied)
            if len(pdf) <= limit:
                return kdf

            return_schema = kdf._internal.to_internal_spark_frame.schema
            if should_use_map_in_pandas:
                output_func = GroupBy._make_pandas_df_builder_func(
                    self_applied, func, return_schema, retain_index=True)
                sdf = self_applied._internal.to_internal_spark_frame.mapInPandas(
                    lambda iterator: map(output_func, iterator),
                    schema=return_schema)
            else:
                sdf = GroupBy._spark_group_map_apply(
                    self_applied,
                    func, (F.spark_partition_id(), ),
                    return_schema,
                    retain_index=True)

            # If schema is inferred, we can restore indexes too.
            internal = kdf._internal.with_new_sdf(sdf)
        else:
            return_type = infer_return_type(original_func)
            return_schema = return_type.tpe
            is_return_dataframe = isinstance(return_type, DataFrameType)
            if not is_return_dataframe:
                raise TypeError(
                    "The given function should specify a frame as its type "
                    "hints; however, the return type was %s." % return_sig)

            if should_use_map_in_pandas:
                output_func = GroupBy._make_pandas_df_builder_func(
                    self_applied, func, return_schema, retain_index=False)
                sdf = self_applied._internal.to_internal_spark_frame.mapInPandas(
                    lambda iterator: map(output_func, iterator),
                    schema=return_schema)
            else:
                sdf = GroupBy._spark_group_map_apply(
                    self_applied,
                    func, (F.spark_partition_id(), ),
                    return_schema,
                    retain_index=False)

            # Otherwise, it loses index.
            internal = InternalFrame(spark_frame=sdf, index_map=None)

        return DataFrame(internal)
Code example #9
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

parameter_grid = {
    "min_samples_split": [2, 3, 5],
    "min_samples_leaf": [1, 2, 5],
    "bootstrap": [True, False],
    "criterion": ["gini", "entropy"],
    "n_estimators": [5, 10, 15, 20]
}
grid_search = GridSearchCV(RandomForestClassifier(), param_grid=parameter_grid)
best_model = grid_search.fit(X, y)

# COMMAND ----------

# MAGIC %md ####Pandas UDF

# COMMAND ----------

# MAGIC %md ###Koalas - Pandas like API for PySpark

# COMMAND ----------

import databricks.koalas as ks

from sklearn.datasets import load_boston

boston_data = load_boston()
features = boston_data.feature_names

boston_pd = ks.DataFrame(boston_data.data, columns=boston_data.feature_names)
boston_pd['MEDV'] = boston_data.target

boston_pd.info()
boston_pd.head()
boston_pd.isnull().sum()
boston_pd.describe()
Code example #10
import databricks.koalas as ks

df = ks.DataFrame({
    'a': [1, 2, 3, 4, 4, 3, 2, 1],
    # 'a': pd.Categorical([1, 2, 3, 4, 4, 3, 2, 1]),
    'b': list('abcdabcd'),
    # 'c': pd.Categorical(list('abcdabcd')),
    'c': list('abcdabcd')
})
Code example #11
            userRelatedEntitiesObj["school_id"] = userSchool
            userRelatedEntitiesObj["school_name"] = userSchoolName
            userRelatedEntitiesObj["school_externalId"] = userSchoolUDISE
            userRelatedEntitiesObj["organisation_name"] = orgName
        except KeyError:
            pass
        if userRelatedEntitiesObj:
            userIntegratedAppEntitiesArr.append(userRelatedEntitiesObj)

        searchObj = {}
        searchObj["id"] = ch
        searchObj["channel"] = rootOrgId
        searchObj["parent_channel"] = "SHIKSHALOKAM"
        userId_obs_status_df_after.append(searchObj)

df_user_org = ks.DataFrame(userId_obs_status_df_after)
df_user_org = df_user_org.to_spark()

if len(userIntegratedAppEntitiesArr) > 0:
    df_user_rel_entities = ks.DataFrame(userIntegratedAppEntitiesArr)
    df_user_rel_entities = df_user_rel_entities.to_spark()

# roles dataframe from mongodb
roles_cursorMongo = userRolesCollec.aggregate([{
    "$project": {
        "_id": {
            "$toString": "$_id"
        },
        "title": 1
    }
}])
Code example #12
File: test_groupby.py  Project: floscha/koalas
    def test_transform(self):
        pdf = pd.DataFrame(
            {
                'a': [1, 2, 3, 4, 5, 6],
                'b': [1, 1, 2, 3, 5, 8],
                'c': [1, 4, 9, 16, 25, 36]
            },
            columns=['a', 'b', 'c'])
        kdf = koalas.DataFrame(pdf)
        self.assert_eq(
            kdf.groupby("b").transform(lambda x: x + 1).sort_index(),
            pdf.groupby("b").transform(lambda x: x + 1).sort_index())
        self.assert_eq(
            kdf.groupby(['a', 'b']).transform(lambda x: x * x).sort_index(),
            pdf.groupby(['a', 'b']).transform(lambda x: x * x).sort_index())
        self.assert_eq(
            kdf.groupby(['b'])['a'].transform(lambda x: x).sort_index(),
            pdf.groupby(['b'])['a'].transform(lambda x: x).sort_index())

        # multi-index columns
        columns = pd.MultiIndex.from_tuples([('x', 'a'), ('x', 'b'),
                                             ('y', 'c')])
        pdf.columns = columns
        kdf.columns = columns

        self.assert_eq(
            kdf.groupby(("x", "b")).transform(lambda x: x + 1).sort_index(),
            pdf.groupby(("x", "b")).transform(lambda x: x + 1).sort_index())
        self.assert_eq(
            kdf.groupby([('x', 'a'),
                         ('x', 'b')]).transform(lambda x: x * x).sort_index(),
            pdf.groupby([('x', 'a'),
                         ('x', 'b')]).transform(lambda x: x * x).sort_index())

        set_option('compute.shortcut_limit', 1000)
        try:
            pdf = pd.DataFrame(
                {
                    'a': [1, 2, 3, 4, 5, 6] * 300,
                    'b': [1, 1, 2, 3, 5, 8] * 300,
                    'c': [1, 4, 9, 16, 25, 36] * 300
                },
                columns=['a', 'b', 'c'])
            kdf = koalas.DataFrame(pdf)
            self.assert_eq(
                kdf.groupby("b").transform(lambda x: x + 1).sort_index(),
                pdf.groupby("b").transform(lambda x: x + 1).sort_index())
            self.assert_eq(
                kdf.groupby(['a',
                             'b']).transform(lambda x: x * x).sort_index(),
                pdf.groupby(['a',
                             'b']).transform(lambda x: x * x).sort_index())
            self.assert_eq(
                kdf.groupby(['b'])['a'].transform(lambda x: x).sort_index(),
                pdf.groupby(['b'])['a'].transform(lambda x: x).sort_index())
            with self.assertRaisesRegex(
                    TypeError, "<class 'int'> object is not callable"):
                kdf.groupby("b").transform(1)

            # multi-index columns
            columns = pd.MultiIndex.from_tuples([('x', 'a'), ('x', 'b'),
                                                 ('y', 'c')])
            pdf.columns = columns
            kdf.columns = columns

            self.assert_eq(
                kdf.groupby(
                    ("x", "b")).transform(lambda x: x + 1).sort_index(),
                pdf.groupby(
                    ("x", "b")).transform(lambda x: x + 1).sort_index())
            self.assert_eq(
                kdf.groupby([('x', 'a'), ('x', 'b')
                             ]).transform(lambda x: x * x).sort_index(),
                pdf.groupby([('x', 'a'), ('x', 'b')
                             ]).transform(lambda x: x * x).sort_index())
        finally:
            reset_option('compute.shortcut_limit')
Code example #13
import dask.dataframe as dd

dask_dataframe = dd.from_pandas(raw_data, npartitions=1)

try:
    CleanData(dask_dataframe, lazy=True).compute()
except pa.errors.SchemaErrors as exc:
    print(exc.failure_cases)

# %% [markdown] slideshow={"slide_type": "slide"}
# #### Koalas

# %%
import databricks.koalas as ks

koalas_dataframe = ks.DataFrame(raw_data)

try:
    CleanData(koalas_dataframe, lazy=True).compute()
except pa.errors.SchemaErrors as exc:
    print(exc.failure_cases)

# %% [markdown] slideshow={"slide_type": "slide"}
# #### Modin

# %%
import modin.pandas as mpd

modin_dataframe = mpd.DataFrame(raw_data)

try:
Code example #14
    regr = MLPRegressor(max_iter=10,
                        hidden_layer_sizes=(100, 50, 25, 10, 5),
                        verbose=True)

    regr.fit(X_train, y_train)

    mlflow.sklearn.log_model(regr, "model")

#### Our model is trained, so we can use it on Koalas dataframes
from databricks.koalas.mlflow import load_model
run_info = client.list_run_infos(exp)[-1]

model = load_model("runs:/{run_id}/model".format(run_id=run_info.run_uuid))

# Prediction and score
df = ks.DataFrame(X_test)
df["prediction"] = model.predict(df)

stop = datetime.now()

print("Preparation and inference time (ML): ", (stop - start).seconds, "s")

# %%
##### 7th change: we therefore have to recompute the score ourselves

from databricks.koalas.config import set_option, reset_option

set_option("compute.ops_on_diff_frames", True)

# Score: the coefficient R^2 is defined as (1 - u/v), where u is the residual sum of squares
# ((y_true - y_pred) ** 2).sum() and v is the total sum of squares ((y_true - y_true.mean()) ** 2).sum()
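
# A minimal sketch of that manual R^2 computation, assuming `y_test` holds the target values
# that pair with `X_test` (that name is not in the original excerpt), and that `df` and `y_true`
# share the default 0..n-1 index; it relies on compute.ops_on_diff_frames being enabled above
# so that Series from different frames can be combined.
y_true = ks.Series(list(y_test))
y_pred = df["prediction"]

u = ((y_true - y_pred) ** 2).sum()         # residual sum of squares
v = ((y_true - y_true.mean()) ** 2).sum()  # total sum of squares
r2 = 1 - u / v
print("R^2:", r2)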
Code example #15
        try:
            userRoles = userSubType
        except KeyError:
            userRoles = ''
        try:
            if userRoles:
                userInfoObj["designation"] = userRoles
        except KeyError:
            userInfoObj["designation"] = ''
        try:
            userInfoObj["organisation_name"] = userObj["orgname"]
        except KeyError:
            userInfoObj["organisation_name"] = ''
        user_info_arr.append(userInfoObj)

user_df = ks.DataFrame(user_info_arr)
user_df = user_df.to_spark()

final_projects_df = projects_df_cols.join(
    user_df,
    projects_df_cols["createdBy"] == user_df["id"],
    "inner"
).drop(user_df["id"])

final_projects_df = final_projects_df.dropDuplicates()

final_projects_df.coalesce(1).write.format("json").mode("overwrite").save(
    config.get("OUTPUT_DIR", "projects_folder") + "/"
)

for filename in os.listdir(config.get("OUTPUT_DIR", "projects_folder")+"/"):
Code example #16
File: errors.py  Project: tfwillems/pandera
    def _parse_schema_errors(schema_errors: List[Dict[str, Any]]):
        """Parse schema error dicts to produce data for error message."""
        error_counts = defaultdict(int)  # type: ignore
        check_failure_cases = []

        column_order = [
            "schema_context",
            "column",
            "check",
            "check_number",
            "failure_case",
            "index",
        ]

        for schema_error_dict in schema_errors:
            reason_code = schema_error_dict["reason_code"]
            err = schema_error_dict["error"]

            error_counts[reason_code] += 1
            check_identifier = (
                None if err.check is None
                else err.check if isinstance(err.check, str)
                else err.check.error if err.check.error is not None
                else err.check.name if err.check.name is not None
                else str(err.check)
            )

            if err.failure_cases is not None:
                if "column" in err.failure_cases:
                    column = err.failure_cases["column"]
                else:
                    column = (err.schema.name if reason_code
                              == "schema_component_check" else None)

                failure_cases = err.failure_cases.assign(
                    schema_context=err.schema.__class__.__name__,
                    check=check_identifier,
                    check_number=err.check_index,
                    # if the column key is a tuple (for MultiIndex column
                    # names), explicitly wrap `column` in a list of the
                    # same length as the number of failure cases.
                    column=([column] * err.failure_cases.shape[0]
                            if isinstance(column, tuple) else column),
                )
                check_failure_cases.append(failure_cases[column_order])

        # NOTE: this is a hack to support koalas and modin
        concat_fn = pd.concat
        if any(
                type(x).__module__.startswith("databricks.koalas")
                for x in check_failure_cases):
            # pylint: disable=import-outside-toplevel
            import databricks.koalas as ks

            concat_fn = ks.concat
            check_failure_cases = [
                x if isinstance(x, ks.DataFrame) else ks.DataFrame(x)
                for x in check_failure_cases
            ]
        elif any(
                type(x).__module__.startswith("modin.pandas")
                for x in check_failure_cases):
            # pylint: disable=import-outside-toplevel
            import modin.pandas as mpd

            concat_fn = mpd.concat
            check_failure_cases = [
                x if isinstance(x, mpd.DataFrame) else mpd.DataFrame(x)
                for x in check_failure_cases
            ]

        failure_cases = (
            concat_fn(check_failure_cases)
            .reset_index(drop=True)
            .sort_values("schema_context", ascending=False)
            .drop_duplicates()
        )
        return error_counts, failure_cases
Code example #17
                           'output_prefix',
                           'region'])

logger.info(f'Resolved options are: {args}')

TRANSACTION_ID = 'TransactionID'

transactions = glueContext.create_dynamic_frame.from_catalog(database=args['database'], table_name=args['transaction_table'])
identities = glueContext.create_dynamic_frame.from_catalog(database=args['database'], table_name=args['identity_table'])

s3 = boto3.resource('s3', region_name=args['region'])

train_data_ratio = 0.8
# extract out transactions for test/validation
n_train = int(transactions.count()*train_data_ratio)
test_ids = ks.DataFrame(transactions.select_fields(TRANSACTION_ID).toDF())[n_train:].to_spark()
get_fraud_frac = lambda series: 100 * sum(series)/len(series)
isfraud_df: DynamicFrame = transactions.select_fields("isFraud")
logger.info("Percent fraud for train transactions: {}".format(sum_col(transactions.toDF(), "isFraud")))
dump_df_to_s3(test_ids, 'test', header=False)

id_cols = args['id_cols']
cat_cols = args['cat_cols']
features_df, labels_df = get_features_and_labels(transactions.toDF(), id_cols, cat_cols)

logger.info(f'Dumping features and labels for training...')
dump_df_to_s3(features_df, 'features')
dump_df_to_s3(labels_df, 'tags')

featurs_graph_df = features_df.withColumn('props_values:String', to_json(struct(list(filter(lambda x: (x != TRANSACTION_ID), features_df.schema.names)))))
featurs_graph_df = featurs_graph_df.select('TransactionID','props_values:String')
Code example #18
Forums_and_Messages_Exploded.head(5)

# COMMAND ----------

DiscScores = Forums_and_Messages_Exploded.groupby(['UserId', 'Name']).size().reset_index(name = 'Count')
DiscScores.head(5)

# COMMAND ----------

DiscScores['DiscScore'] = DiscScores.groupby('Name')['Count'].rank(pct=True)
DiscScores = DiscScores.drop(['Count'], axis = 'columns')

# COMMAND ----------

# convert competition scores and kernel scores to pandas dfs
CompScores_df = ks.DataFrame(CompScores).toPandas()
KernelScores_df = ks.DataFrame(KernelScores).toPandas()

# COMMAND ----------

DiscScores.head(3)

# COMMAND ----------

CompScores_df.head(3)

# COMMAND ----------

KernelScores_df.head(3)

# COMMAND ----------
Code example #19
    def test_missing(self):
        kdf = koalas.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7, 8, 9]})

        # DataFrameGroupBy functions
        missing_functions = inspect.getmembers(
            _MissingPandasLikeDataFrameGroupBy, inspect.isfunction)
        unsupported_functions = [
            name for (name, type_) in missing_functions
            if type_.__name__ == 'unsupported_function'
        ]
        for name in unsupported_functions:
            with self.assertRaisesRegex(
                    PandasNotImplementedError,
                    "method.*GroupBy.*{}.*not implemented( yet\\.|\\. .+)".
                    format(name)):
                getattr(kdf.groupby('a'), name)()

        deprecated_functions = [
            name for (name, type_) in missing_functions
            if type_.__name__ == 'deprecated_function'
        ]
        for name in deprecated_functions:
            with self.assertRaisesRegex(
                    PandasNotImplementedError,
                    "method.*GroupBy.*{}.*is deprecated".format(name)):
                getattr(kdf.groupby('a'), name)()

        # SeriesGroupBy functions
        missing_functions = inspect.getmembers(_MissingPandasLikeSeriesGroupBy,
                                               inspect.isfunction)
        unsupported_functions = [
            name for (name, type_) in missing_functions
            if type_.__name__ == 'unsupported_function'
        ]
        for name in unsupported_functions:
            with self.assertRaisesRegex(
                    PandasNotImplementedError,
                    "method.*GroupBy.*{}.*not implemented( yet\\.|\\. .+)".
                    format(name)):
                getattr(kdf.a.groupby('a'), name)()

        deprecated_functions = [
            name for (name, type_) in missing_functions
            if type_.__name__ == 'deprecated_function'
        ]
        for name in deprecated_functions:
            with self.assertRaisesRegex(
                    PandasNotImplementedError,
                    "method.*GroupBy.*{}.*is deprecated".format(name)):
                getattr(kdf.a.groupby('a'), name)()

        # DataFrameGroupBy properties
        missing_properties = inspect.getmembers(
            _MissingPandasLikeDataFrameGroupBy,
            lambda o: isinstance(o, property))
        unsupported_properties = [
            name for (name, type_) in missing_properties
            if type_.fget.__name__ == 'unsupported_property'
        ]
        for name in unsupported_properties:
            with self.assertRaisesRegex(
                    PandasNotImplementedError,
                    "property.*GroupBy.*{}.*not implemented( yet\\.|\\. .+)".
                    format(name)):
                getattr(kdf.groupby('a'), name)
        deprecated_properties = [
            name for (name, type_) in missing_properties
            if type_.fget.__name__ == 'deprecated_property'
        ]
        for name in deprecated_properties:
            with self.assertRaisesRegex(
                    PandasNotImplementedError,
                    "property.*GroupBy.*{}.*is deprecated".format(name)):
                getattr(kdf.groupby('a'), name)

        # SeriesGroupBy properties
        missing_properties = inspect.getmembers(
            _MissingPandasLikeSeriesGroupBy, lambda o: isinstance(o, property))
        unsupported_properties = [
            name for (name, type_) in missing_properties
            if type_.fget.__name__ == 'unsupported_property'
        ]
        for name in unsupported_properties:
            with self.assertRaisesRegex(
                    PandasNotImplementedError,
                    "property.*GroupBy.*{}.*not implemented( yet\\.|\\. .+)".
                    format(name)):
                getattr(kdf.a.groupby('a'), name)
        deprecated_properties = [
            name for (name, type_) in missing_properties
            if type_.fget.__name__ == 'deprecated_property'
        ]
        for name in deprecated_properties:
            with self.assertRaisesRegex(
                    PandasNotImplementedError,
                    "property.*GroupBy.*{}.*is deprecated".format(name)):
                getattr(kdf.a.groupby('a'), name)
Code example #20
 def test_no_matched_index(self):
     with self.assertRaisesRegex(ValueError, "Index names must be exactly matched"):
         ks.DataFrame({"a": [1, 2, 3]}).set_index("a") + ks.DataFrame(
             {"b": [1, 2, 3]}
         ).set_index("b")
Code example #21
                  .option('header', True)\
                  .option('sep', ';')\
                  .schema("id INT, name STRING, surname STRING, age INT ")\
                  .load('user.csv')

    import databricks.koalas as ks
    df = ks.read_csv('user.csv', sep=";", header=0)
    print(df[df['age'] > 30])

    # Koalas to Spark and Koalas to pandas
    sdf = df.to_spark()
    pdf = df.to_pandas()

    # pandas to Koalas
    ks.from_pandas(pdf)
    # Spark to Koalas
    ks.DataFrame(sdf)


    from pyspark.sql.types import *
    schema = StructType(
        [StructField('id', IntegerType()), StructField('name', StringType()), StructField('surname', StringType()),
         StructField('age', IntegerType(), False)])
    df = spark.read.format('csv').option('header', 'True').option('sep', ';').schema(schema).load(
        'user.csv')
    df.show()

    df.groupby('surname').pivot('name').agg(avg(col('age'))).show()

    df.filter(col('surname').isNull()).show()
    df_aux = df.filter(col('age') < 30)
Code example #22
# Koalas
import databricks.koalas as ks

kdf = ks.read_parquet(
    "/databricks-datasets/learning-spark-v2/sf-airbnb/sf-airbnb-clean.parquet")
kdf.head()

# COMMAND ----------

# MAGIC %md
# MAGIC ### Converting a Koalas DataFrame to/from a Spark DataFrame

# COMMAND ----------

# Creating a Koalas DataFrame from PySpark DataFrame
kdf = ks.DataFrame(df)

# COMMAND ----------

# Alternative way of creating a Koalas DataFrame from PySpark DataFrame
kdf = df.to_koalas()
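
# COMMAND ----------

# Going the other way is symmetric: a minimal sketch, assuming `kdf` from the cell above,
# that converts the Koalas DataFrame back into a PySpark DataFrame
sdf = kdf.to_spark()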

# COMMAND ----------

# MAGIC %md
# MAGIC ### Value Counts

# COMMAND ----------

# To get value counts of the different property types with PySpark
display(df.groupby("property_type").count().orderBy("count", ascending=False))
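
# COMMAND ----------

# For comparison, a minimal sketch of the same counts with Koalas, assuming `kdf` holds the
# same columns as `df`; value_counts sorts by count in descending order by default
kdf["property_type"].value_counts()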
Code example #23
 def test_default_index_distributed_sequence(self):
     with ks.option_context("compute.default_index_type",
                            "distributed-sequence"):
         sdf = self.spark.range(1000)
         self.assert_eq(ks.DataFrame(sdf),
                        pd.DataFrame({"id": list(range(1000))}))
Code example #24
    def test_dot(self):
        pser = pd.Series([90, 91, 85], index=[2, 4, 1])
        kser = ks.from_pandas(pser)
        pser_other = pd.Series([90, 91, 85], index=[2, 4, 1])
        kser_other = ks.from_pandas(pser_other)

        self.assert_eq(kser.dot(kser_other), pser.dot(pser_other))

        kser_other = ks.Series([90, 91, 85], index=[1, 2, 4])
        pser_other = pd.Series([90, 91, 85], index=[1, 2, 4])

        self.assert_eq(kser.dot(kser_other), pser.dot(pser_other))

        # length of index is different
        kser_other = ks.Series([90, 91, 85, 100], index=[2, 4, 1, 0])
        with self.assertRaisesRegex(ValueError, "matrices are not aligned"):
            kser.dot(kser_other)

        # for MultiIndex
        midx = pd.MultiIndex(
            [["lama", "cow", "falcon"], ["speed", "weight", "length"]],
            [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]],
        )
        pser = pd.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], index=midx)
        kser = ks.from_pandas(pser)
        pser_other = pd.Series([-450, 20, 12, -30, -250, 15, -320, 100, 3],
                               index=midx)
        kser_other = ks.from_pandas(pser_other)
        self.assert_eq(kser.dot(kser_other), pser.dot(pser_other))

        pser = pd.Series([0, 1, 2, 3])
        kser = ks.from_pandas(pser)

        # DataFrame "other" without Index/MultiIndex as columns
        pdf = pd.DataFrame([[0, 1], [-2, 3], [4, -5], [6, 7]])
        kdf = ks.from_pandas(pdf)
        self.assert_eq(kser.dot(kdf), pser.dot(pdf))

        # DataFrame "other" with Index as columns
        pdf.columns = pd.Index(["x", "y"])
        kdf = ks.from_pandas(pdf)
        self.assert_eq(kser.dot(kdf), pser.dot(pdf))
        pdf.columns = pd.Index(["x", "y"], name="cols_name")
        kdf = ks.from_pandas(pdf)
        self.assert_eq(kser.dot(kdf), pser.dot(pdf))

        pdf = pdf.reindex([1, 0, 2, 3])
        kdf = ks.from_pandas(pdf)
        self.assert_eq(kser.dot(kdf), pser.dot(pdf))

        # DataFrame "other" with MultiIndex as columns
        pdf.columns = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y")])
        kdf = ks.from_pandas(pdf)
        self.assert_eq(kser.dot(kdf), pser.dot(pdf))
        pdf.columns = pd.MultiIndex.from_tuples(
            [("a", "x"), ("b", "y")], names=["cols_name1", "cols_name2"])
        kdf = ks.from_pandas(pdf)
        self.assert_eq(kser.dot(kdf), pser.dot(pdf))

        kser = ks.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}).b
        pser = kser.to_pandas()
        kdf = ks.DataFrame({"c": [7, 8, 9]})
        pdf = kdf.to_pandas()
        self.assert_eq(kser.dot(kdf), pser.dot(pdf))
Code example #25
File: test_indexes.py  Project: kinghows/koalas
    def test_missing(self):
        kdf = ks.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})

        # Index functions
        missing_functions = inspect.getmembers(_MissingPandasLikeIndex, inspect.isfunction)
        unsupported_functions = [name for (name, type_) in missing_functions
                                 if type_.__name__ == 'unsupported_function']
        for name in unsupported_functions:
            with self.assertRaisesRegex(
                    PandasNotImplementedError,
                    "method.*Index.*{}.*not implemented( yet\\.|\\. .+)".format(name)):
                getattr(kdf.set_index('a').index, name)()

        deprecated_functions = [name for (name, type_) in missing_functions
                                if type_.__name__ == 'deprecated_function']
        for name in deprecated_functions:
            with self.assertRaisesRegex(PandasNotImplementedError,
                                        "method.*Index.*{}.*is deprecated".format(name)):
                getattr(kdf.set_index('a').index, name)()

        # MultiIndex functions
        missing_functions = inspect.getmembers(_MissingPandasLikeMultiIndex, inspect.isfunction)
        unsupported_functions = [name for (name, type_) in missing_functions
                                 if type_.__name__ == 'unsupported_function']
        for name in unsupported_functions:
            with self.assertRaisesRegex(
                    PandasNotImplementedError,
                    "method.*Index.*{}.*not implemented( yet\\.|\\. .+)".format(name)):
                getattr(kdf.set_index(['a', 'b']).index, name)()

        deprecated_functions = [name for (name, type_) in missing_functions
                                if type_.__name__ == 'deprecated_function']
        for name in deprecated_functions:
            with self.assertRaisesRegex(PandasNotImplementedError,
                                        "method.*Index.*{}.*is deprecated".format(name)):
                getattr(kdf.set_index(['a', 'b']).index, name)()

        # Index properties
        missing_properties = inspect.getmembers(_MissingPandasLikeIndex,
                                                lambda o: isinstance(o, property))
        unsupported_properties = [name for (name, type_) in missing_properties
                                  if type_.fget.__name__ == 'unsupported_property']
        for name in unsupported_properties:
            with self.assertRaisesRegex(
                    PandasNotImplementedError,
                    "property.*Index.*{}.*not implemented( yet\\.|\\. .+)".format(name)):
                getattr(kdf.set_index('a').index, name)

        deprecated_properties = [name for (name, type_) in missing_properties
                                 if type_.fget.__name__ == 'deprecated_property']
        for name in deprecated_properties:
            with self.assertRaisesRegex(PandasNotImplementedError,
                                        "property.*Index.*{}.*is deprecated".format(name)):
                getattr(kdf.set_index('a').index, name)

        # MultiIndex properties
        missing_properties = inspect.getmembers(_MissingPandasLikeMultiIndex,
                                                lambda o: isinstance(o, property))
        unsupported_properties = [name for (name, type_) in missing_properties
                                  if type_.fget.__name__ == 'unsupported_property']
        for name in unsupported_properties:
            with self.assertRaisesRegex(
                    PandasNotImplementedError,
                    "property.*Index.*{}.*not implemented( yet\\.|\\. .+)".format(name)):
                getattr(kdf.set_index(['a', 'b']).index, name)

        deprecated_properties = [name for (name, type_) in missing_properties
                                 if type_.fget.__name__ == 'deprecated_property']
        for name in deprecated_properties:
            with self.assertRaisesRegex(PandasNotImplementedError,
                                        "property.*Index.*{}.*is deprecated".format(name)):
                getattr(kdf.set_index(['a', 'b']).index, name)
Code example #26
    def test_merge(self):
        left_kdf = koalas.DataFrame({'A': [1, 2]})
        right_kdf = koalas.DataFrame({'B': ['x', 'y']}, index=[1, 2])

        # Assert only 'on' or 'left_index' and 'right_index' parameters are set
        msg = "At least 'on' or 'left_index' and 'right_index' have to be set"
        with self.assertRaises(SparkPandasMergeError, msg=msg):
            left_kdf.merge(right_kdf)
        msg = "Only 'on' or 'left_index' and 'right_index' can be set"
        with self.assertRaises(SparkPandasMergeError, msg=msg):
            left_kdf.merge(right_kdf, on='id', left_index=True)

        # Assert a valid option for the 'how' parameter is used
        msg = (
            "The 'how' parameter has to be amongst the following values: ['inner', 'left', "
            + "'right', 'full', 'outer']")
        with self.assertRaises(ValueError, msg=msg):
            left_kdf.merge(right_kdf,
                           how='foo',
                           left_index=True,
                           right_index=True)

        # Assert inner join
        res = left_kdf.merge(right_kdf, left_index=True, right_index=True)
        self.assert_eq(res, pd.DataFrame({'A': [2], 'B': ['x']}))

        # Assert inner join on non-default column
        left_kdf_with_id = koalas.DataFrame({'A': [1, 2], 'id': [0, 1]})
        right_kdf_with_id = koalas.DataFrame({
            'B': ['x', 'y'],
            'id': [0, 1]
        },
                                             index=[1, 2])
        res = left_kdf_with_id.merge(right_kdf_with_id, on='id')
        # Explicitly set columns to also assure their correct order with Python 3.5
        self.assert_eq(
            res,
            pd.DataFrame({
                'A': [1, 2],
                'id': [0, 1],
                'B': ['x', 'y']
            },
                         columns=['A', 'id', 'B']))

        # Assert left join
        res = left_kdf.merge(right_kdf,
                             left_index=True,
                             right_index=True,
                             how='left')
        # FIXME Replace None with np.nan once #263 is solved
        self.assert_eq(res, pd.DataFrame({'A': [1, 2], 'B': [None, 'x']}))

        # Assert right join
        res = left_kdf.merge(right_kdf,
                             left_index=True,
                             right_index=True,
                             how='right')
        self.assert_eq(res, pd.DataFrame({'A': [2, np.nan], 'B': ['x', 'y']}))

        # Assert full outer join
        res = left_kdf.merge(right_kdf,
                             left_index=True,
                             right_index=True,
                             how='outer')
        # FIXME Replace None with np.nan once #263 is solved
        self.assert_eq(
            res, pd.DataFrame({
                'A': [1, 2, np.nan],
                'B': [None, 'x', 'y']
            }))

        # Assert full outer join also works with 'full' keyword
        res = left_kdf.merge(right_kdf,
                             left_index=True,
                             right_index=True,
                             how='full')
        # FIXME Replace None with np.nan once #263 is solved
        self.assert_eq(
            res, pd.DataFrame({
                'A': [1, 2, np.nan],
                'B': [None, 'x', 'y']
            }))

        # Assert suffixes create the expected column names
        res = left_kdf.merge(koalas.DataFrame({'A': [3, 4]}),
                             left_index=True,
                             right_index=True,
                             suffixes=('_left', '_right'))
        self.assert_eq(res, pd.DataFrame({
            'A_left': [1, 2],
            'A_right': [3, 4]
        }))
Code example #27
    def test_loc(self):
        kdf = self.kdf
        pdf = self.pdf

        self.assert_eq(kdf.loc[5:5], pdf.loc[5:5])
        self.assert_eq(kdf.loc[3:8], pdf.loc[3:8])
        self.assert_eq(kdf.loc[:8], pdf.loc[:8])
        self.assert_eq(kdf.loc[3:], pdf.loc[3:])
        self.assert_eq(kdf.loc[[5]], pdf.loc[[5]])
        self.assert_eq(kdf.loc[:], pdf.loc[:])

        # TODO?: self.assert_eq(kdf.loc[[3, 4, 1, 8]], pdf.loc[[3, 4, 1, 8]])
        # TODO?: self.assert_eq(kdf.loc[[3, 4, 1, 9]], pdf.loc[[3, 4, 1, 9]])
        # TODO?: self.assert_eq(kdf.loc[np.array([3, 4, 1, 9])], pdf.loc[np.array([3, 4, 1, 9])])

        self.assert_eq(kdf.a.loc[5:5], pdf.a.loc[5:5])
        self.assert_eq(kdf.a.loc[3:8], pdf.a.loc[3:8])
        self.assert_eq(kdf.a.loc[:8], pdf.a.loc[:8])
        self.assert_eq(kdf.a.loc[3:], pdf.a.loc[3:])
        self.assert_eq(kdf.a.loc[[5]], pdf.a.loc[[5]])

        # TODO?: self.assert_eq(kdf.a.loc[[3, 4, 1, 8]], pdf.a.loc[[3, 4, 1, 8]])
        # TODO?: self.assert_eq(kdf.a.loc[[3, 4, 1, 9]], pdf.a.loc[[3, 4, 1, 9]])
        # TODO?: self.assert_eq(kdf.a.loc[np.array([3, 4, 1, 9])],
        #                       pdf.a.loc[np.array([3, 4, 1, 9])])

        self.assert_eq(kdf.a.loc[[]], pdf.a.loc[[]])
        self.assert_eq(kdf.a.loc[np.array([])], pdf.a.loc[np.array([])])

        self.assert_eq(kdf.loc[1000:], pdf.loc[1000:])
        self.assert_eq(kdf.loc[-2000:-1000], pdf.loc[-2000:-1000])

        self.assert_eq(kdf.loc[5], pdf.loc[5])
        self.assert_eq(kdf.loc[9], pdf.loc[9])
        self.assert_eq(kdf.a.loc[5], pdf.a.loc[5])
        self.assert_eq(kdf.a.loc[9], pdf.a.loc[9])

        self.assertRaises(KeyError, lambda: kdf.loc[10])
        self.assertRaises(KeyError, lambda: kdf.a.loc[10])

        # monotonically increasing index test
        pdf = pd.DataFrame({"a": [1, 2, 3, 4, 5, 6, 7, 8, 9]}, index=[0, 1, 1, 2, 2, 2, 4, 5, 6])
        kdf = ks.from_pandas(pdf)

        self.assert_eq(kdf.loc[:2], pdf.loc[:2])
        self.assert_eq(kdf.loc[:3], pdf.loc[:3])
        self.assert_eq(kdf.loc[3:], pdf.loc[3:])
        self.assert_eq(kdf.loc[4:], pdf.loc[4:])
        self.assert_eq(kdf.loc[3:2], pdf.loc[3:2])
        self.assert_eq(kdf.loc[-1:2], pdf.loc[-1:2])
        self.assert_eq(kdf.loc[3:10], pdf.loc[3:10])

        # monotonically decreasing index test
        pdf = pd.DataFrame({"a": [1, 2, 3, 4, 5, 6, 7, 8, 9]}, index=[6, 5, 5, 4, 4, 4, 2, 1, 0])
        kdf = ks.from_pandas(pdf)

        self.assert_eq(kdf.loc[:4], pdf.loc[:4])
        self.assert_eq(kdf.loc[:3], pdf.loc[:3])
        self.assert_eq(kdf.loc[3:], pdf.loc[3:])
        self.assert_eq(kdf.loc[2:], pdf.loc[2:])
        self.assert_eq(kdf.loc[2:3], pdf.loc[2:3])
        self.assert_eq(kdf.loc[2:-1], pdf.loc[2:-1])
        self.assert_eq(kdf.loc[10:3], pdf.loc[10:3])

        # test when type of key is string and given value is not included in key
        pdf = pd.DataFrame({"a": [1, 2, 3]}, index=["a", "b", "d"])
        kdf = ks.from_pandas(pdf)

        self.assert_eq(kdf.loc["a":"z"], pdf.loc["a":"z"])

        # KeyError when index is not monotonic increasing or decreasing
        # and specified values don't exist in index
        kdf = ks.DataFrame([[1, 2], [4, 5], [7, 8]], index=["cobra", "viper", "sidewinder"])

        self.assertRaises(KeyError, lambda: kdf.loc["cobra":"koalas"])
        self.assertRaises(KeyError, lambda: kdf.loc["koalas":"viper"])

        kdf = ks.DataFrame([[1, 2], [4, 5], [7, 8]], index=[10, 30, 20])

        self.assertRaises(KeyError, lambda: kdf.loc[0:30])
        self.assertRaises(KeyError, lambda: kdf.loc[10:100])
Code example #28
 def test_ranges(self):
     self.assert_eq((ks.range(10) + ks.range(10)).sort_index(),
                    (ks.DataFrame({'id': list(range(10))}) +
                     ks.DataFrame({'id': list(range(10))})).sort_index())
Code example #29
 def test_default_index(self):
     sdf = self.spark.range(1000)
     pdf = ks.DataFrame(sdf).to_pandas()
     self.assertEqual(len(set(pdf.index)), len(pdf))
Code example #30
 def test_no_matched_index(self):
     with self.assertRaisesRegex(ValueError,
                                 "Index names must be exactly matched"):
         ks.DataFrame({'a': [1, 2, 3]}).set_index('a') + \
             ks.DataFrame({'b': [1, 2, 3]}).set_index('b')