def _unfold(key, col):
    """
    Return the row selection and column selection as a pair.

    If the col parameter is not None, the key should be a row selection and
    the column selection will be the col parameter itself. Otherwise, check
    that the key contains a column selection and that the selection is
    acceptable.
    """
    from databricks.koalas.series import Series

    if col is not None:
        if isinstance(key, tuple):
            if len(key) > 1:
                raise SparkPandasIndexingError('Too many indexers')
            key = key[0]
        rows_sel = key
        cols_sel = col._scol
    elif isinstance(key, tuple):
        if len(key) != 2:
            raise SparkPandasIndexingError("Only accepts pairs of candidates")
        rows_sel, cols_sel = key

        # make cols_sel a 1-tuple of string if a single string
        if isinstance(cols_sel, (str, Series)):
            cols_sel = _make_col(cols_sel)
        elif isinstance(cols_sel, slice) and cols_sel != slice(None):
            raise SparkPandasNotImplementedError(
                description="Can only select columns either by name or reference or all",
                pandas_function="loc",
                spark_target_function="select, where, withColumn")
        elif isinstance(cols_sel, slice) and cols_sel == slice(None):
            cols_sel = None
    else:
        rows_sel = key
        cols_sel = None

    return rows_sel, cols_sel
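# Hedged, dependency-free sketch of the unfolding contract above. The real
# helper returns Spark column objects; plain strings stand in for them here,
# and _unfold_sketch is a hypothetical name used for illustration only.
def _unfold_sketch(key, col=None):
    if col is not None:
        # row selection only; the column selection is the given col
        key = key[0] if isinstance(key, tuple) else key
        return key, col
    if isinstance(key, tuple):
        # a (rows, cols) pair
        rows_sel, cols_sel = key
        return rows_sel, cols_sel
    return key, None


print(_unfold_sketch((slice(1, 2), "col")))  # (slice(1, 2, None), 'col')
print(_unfold_sketch(slice(1, 2), col="c"))  # (slice(1, 2, None), 'c')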
def _select_rows(self, rows_sel):
    from databricks.koalas.indexes import Index

    if isinstance(rows_sel, tuple) and len(rows_sel) > 1:
        raise SparkPandasIndexingError("Too many indexers")
    elif isinstance(rows_sel, Index):
        assert isinstance(rows_sel.spark_type, BooleanType), rows_sel.spark_type
        return rows_sel._scol, None, None
    elif isinstance(rows_sel, slice):
        if rows_sel == slice(None):
            # If slice is None - select everything, so nothing to do
            return None, None, None
        elif (rows_sel.start is not None) or (rows_sel.step is not None):
            iLocIndexer._raiseNotImplemented("Cannot use start or step with Spark.")
        elif not isinstance(rows_sel.stop, int):
            raise TypeError(
                "cannot do slice indexing with these indexers [{}] of {}".format(
                    rows_sel.stop, type(rows_sel.stop)))
        else:
            return None, rows_sel.stop, None
    elif isinstance(rows_sel, int):
        sdf = self._internal.sdf
        return (sdf[self._sequence_col] == rows_sel), None, 0
    else:
        iLocIndexer._raiseNotImplemented(
            ".iloc requires numeric slice or conditional "
            "boolean Index, got {}".format(type(rows_sel)))
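# A minimal usage sketch (assumes a running PySpark session with koalas
# installed). In this revision, iloc only supports a bare stop-only slice,
# which maps onto Spark's limit(), or a single integer position.
import databricks.koalas as ks

kdf = ks.DataFrame({"a": [1, 2, 3, 4]})
print(kdf.iloc[:2])  # supported: becomes sdf.limit(2)
# kdf.iloc[1:3]      # not supported here: start/step raise a NotImplemented error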
def _unfold(key, kseries):
    """
    Return the row selection and column selection as a pair.

    If the kseries parameter is not None, the key should be a row selection
    and the column selection will be the kseries parameter itself.

    >>> s = ks.Series([1, 2, 3], name='a')
    >>> _unfold(slice(1, 2), s)
    (slice(1, 2, None), 0    1
    1    2
    2    3
    Name: a, dtype: int64)

    >>> _unfold((slice(1, 2), slice(None)), None)
    (slice(1, 2, None), slice(None, None, None))

    >>> _unfold((slice(1, 2), s), None)
    (slice(1, 2, None), 0    1
    1    2
    2    3
    Name: a, dtype: int64)

    >>> _unfold((slice(1, 2), 'col'), None)
    (slice(1, 2, None), 'col')
    """
    if kseries is not None:
        if isinstance(key, tuple):
            if len(key) > 1:
                raise SparkPandasIndexingError('Too many indexers')
            key = key[0]
        rows_sel = key
        cols_sel = kseries
    elif isinstance(key, tuple):
        if len(key) != 2:
            raise SparkPandasIndexingError("Only accepts pairs of candidates")
        rows_sel, cols_sel = key
    else:
        rows_sel = key
        cols_sel = None

    return rows_sel, cols_sel
def _select_rows(self, rows_sel):
    from databricks.koalas.series import Series

    if isinstance(rows_sel, Series):
        assert isinstance(rows_sel.spark_type, BooleanType), rows_sel.spark_type
        return rows_sel._scol, None, None
    elif isinstance(rows_sel, slice):
        assert len(self._internal.index_columns) > 0
        if rows_sel.step is not None:
            LocIndexer._raiseNotImplemented("Cannot use step with Spark.")
        if rows_sel == slice(None):
            # If slice is None - select everything, so nothing to do
            return None, None, None
        elif len(self._internal.index_columns) == 1:
            sdf = self._internal.sdf
            index = self._kdf_or_kser.index
            index_column = index.to_series()
            index_data_type = index_column.spark_type
            start = rows_sel.start
            stop = rows_sel.stop

            # get natural order from '__natural_order__' from start to stop
            # to keep natural order.
            start_and_stop = (
                sdf.select(index_column._scol, NATURAL_ORDER_COLUMN_NAME)
                .where(
                    (index_column._scol == F.lit(start).cast(index_data_type))
                    | (index_column._scol == F.lit(stop).cast(index_data_type))
                )
                .collect()
            )

            start = [row[1] for row in start_and_stop if row[0] == start]
            start = start[0] if len(start) > 0 else None

            stop = [row[1] for row in start_and_stop if row[0] == stop]
            stop = stop[-1] if len(stop) > 0 else None

            cond = []
            if start is not None:
                cond.append(
                    F.col(NATURAL_ORDER_COLUMN_NAME) >= F.lit(start).cast(LongType()))
            if stop is not None:
                cond.append(
                    F.col(NATURAL_ORDER_COLUMN_NAME) <= F.lit(stop).cast(LongType()))

            # if index order is not monotonic increasing or decreasing
            # and specified values don't exist in index, raise KeyError
            if (start is None and rows_sel.start is not None) or (
                    stop is None and rows_sel.stop is not None):
                inc, dec = (
                    sdf.select(
                        index_column._is_monotonic()._scol.alias("__increasing__"),
                        index_column._is_monotonic_decreasing()._scol.alias("__decreasing__"),
                    )
                    .select(
                        F.min(F.coalesce("__increasing__", F.lit(True))),
                        F.min(F.coalesce("__decreasing__", F.lit(True))),
                    )
                    .first()
                )
                if start is None and rows_sel.start is not None:
                    start = rows_sel.start
                    if inc is not False:
                        cond.append(index_column._scol >= F.lit(start).cast(index_data_type))
                    elif dec is not False:
                        cond.append(index_column._scol <= F.lit(start).cast(index_data_type))
                    else:
                        raise KeyError(rows_sel.start)
                if stop is None and rows_sel.stop is not None:
                    stop = rows_sel.stop
                    if inc is not False:
                        cond.append(index_column._scol <= F.lit(stop).cast(index_data_type))
                    elif dec is not False:
                        cond.append(index_column._scol >= F.lit(stop).cast(index_data_type))
                    else:
                        raise KeyError(rows_sel.stop)

            if len(cond) > 0:
                return reduce(lambda x, y: x & y, cond), None, None
        else:
            LocIndexer._raiseNotImplemented("Cannot use slice for MultiIndex with Spark.")
    elif is_list_like(rows_sel) and not isinstance(rows_sel, tuple):
        rows_sel = list(rows_sel)
        if len(rows_sel) == 0:
            return F.lit(False), None, None
        elif len(self._internal.index_columns) == 1:
            index_column = self._kdf_or_kser.index.to_series()
            index_data_type = index_column.spark_type
            if len(rows_sel) == 1:
                return (
                    index_column._scol == F.lit(rows_sel[0]).cast(index_data_type),
                    None,
                    None,
                )
            else:
                return (
                    index_column._scol.isin(
                        [F.lit(r).cast(index_data_type) for r in rows_sel]),
                    None,
                    None,
                )
        else:
            LocIndexer._raiseNotImplemented("Cannot select with MultiIndex with Spark.")
    else:
        if not isinstance(rows_sel, tuple):
            rows_sel = (rows_sel,)
        if len(rows_sel) > len(self._internal.index_map):
            raise SparkPandasIndexingError("Too many indexers")

        rows = [scol == value
                for scol, value in zip(self._internal.index_scols, rows_sel)]
        return (
            reduce(lambda x, y: x & y, rows),
            None,
            len(self._internal.index_map) - len(rows_sel),
        )
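# Usage sketch for the single-level label-slice path above (assumes PySpark +
# koalas). The start/stop labels are resolved to '__natural_order__' bounds,
# so the rows between the two labels come back in their natural order.
import databricks.koalas as ks

kdf = ks.DataFrame({"a": [1, 2, 3]}, index=[10, 20, 30])
print(kdf.loc[10:20])  # rows whose natural order lies between labels 10 and 20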
def __getitem__(self, key):
    from databricks.koalas.frame import DataFrame
    from databricks.koalas.series import Series

    if self._is_series:
        if isinstance(key, Series) and key._kdf is not self._kdf_or_kser._kdf:
            kdf = self._kdf_or_kser.to_frame()
            kdf["__temp_col__"] = key
            return type(self)(kdf[self._kdf_or_kser.name])[kdf["__temp_col__"]]

        cond, limit, remaining_index = self._select_rows(key)
        if cond is None and limit is None:
            return self._kdf_or_kser

        column_labels = self._internal.column_labels
        column_scols = self._internal.column_scols
        returns_series = True
    else:
        assert self._is_df
        if isinstance(key, tuple):
            if len(key) != 2:
                raise SparkPandasIndexingError("Only accepts pairs of candidates")
            rows_sel, cols_sel = key
        else:
            rows_sel = key
            cols_sel = None

        if isinstance(rows_sel, Series) and rows_sel._kdf is not self._kdf_or_kser:
            kdf = self._kdf_or_kser.copy()
            kdf["__temp_col__"] = rows_sel
            return type(self)(kdf)[kdf["__temp_col__"], cols_sel][
                list(self._kdf_or_kser.columns)]

        cond, limit, remaining_index = self._select_rows(rows_sel)
        column_labels, column_scols, returns_series = self._select_cols(cols_sel)

        if cond is None and limit is None and returns_series:
            return Series(
                self._internal.copy(
                    scol=column_scols[0], column_labels=[column_labels[0]]),
                anchor=self._kdf_or_kser,
            )

    if remaining_index is not None:
        index_scols = self._internal.index_scols[-remaining_index:]
        index_map = self._internal.index_map[-remaining_index:]
    else:
        index_scols = self._internal.index_scols
        index_map = self._internal.index_map

    if self._internal.column_label_names is None:
        column_label_names = None
    else:
        # Manage column index names
        level = column_labels_level(column_labels)
        column_label_names = self._internal.column_label_names[-level:]

    try:
        sdf = self._internal._sdf
        if cond is not None:
            sdf = sdf.drop(NATURAL_ORDER_COLUMN_NAME).filter(cond)
        if limit is not None:
            if limit >= 0:
                sdf = sdf.limit(limit)
            else:
                sdf = sdf.limit(sdf.count() + limit)
        sdf = sdf.select(index_scols + column_scols)
    except AnalysisException:
        raise KeyError("[{}] don't exist in columns".format(
            [col._jc.toString() for col in column_scols]))

    internal = _InternalFrame(
        sdf=sdf,
        index_map=index_map,
        column_labels=column_labels,
        column_label_names=column_label_names,
    )
    kdf = DataFrame(internal)

    if returns_series:
        kdf_or_kser = Series(
            kdf._internal.copy(scol=kdf._internal.column_scols[0]), anchor=kdf)
    else:
        kdf_or_kser = kdf

    if remaining_index is not None and remaining_index == 0:
        pdf_or_pser = kdf_or_kser.head(2).to_pandas()
        length = len(pdf_or_pser)
        if length == 0:
            raise KeyError(name_like_string(key))
        elif length == 1:
            return pdf_or_pser.iloc[0]
        else:
            return kdf_or_kser
    else:
        return kdf_or_kser
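# Sketch of the remaining_index == 0 path above (assumes PySpark + koalas):
# when the row key consumes every index level, the result is materialized with
# head(2) and a scalar is returned if exactly one row matched.
import databricks.koalas as ks

kdf = ks.DataFrame({"a": [1, 2, 3]}, index=["x", "y", "z"])
print(kdf.loc["y", "a"])  # one row, one column -> the scalar 2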
def __getitem__(self, key):
    from databricks.koalas.frame import DataFrame
    from databricks.koalas.series import Series

    if self._is_series:
        if isinstance(key, tuple):
            if len(key) > 1:
                raise SparkPandasIndexingError('Too many indexers')
            key = key[0]

        if isinstance(key, Series) and key._kdf is not self._kdf_or_kser._kdf:
            kdf = self._kdf_or_kser.to_frame()
            kdf['__temp_col__'] = key
            return type(self)(kdf[self._kdf_or_kser.name])[kdf['__temp_col__']]

        cond, limit = self._select_rows(key)
        if cond is None and limit is None:
            return self._kdf_or_kser

        column_index = self._internal.column_index
        column_scols = self._internal.column_scols
        returns_series = True
    else:
        assert self._is_df
        if isinstance(key, tuple):
            if len(key) != 2:
                raise SparkPandasIndexingError("Only accepts pairs of candidates")
            rows_sel, cols_sel = key
        else:
            rows_sel = key
            cols_sel = None

        if isinstance(rows_sel, Series) and rows_sel._kdf is not self._kdf_or_kser:
            kdf = self._kdf_or_kser.copy()
            kdf['__temp_col__'] = rows_sel
            return type(self)(kdf)[kdf['__temp_col__'], cols_sel][
                list(self._kdf_or_kser.columns)]

        cond, limit = self._select_rows(rows_sel)
        column_index, column_scols, returns_series = self._select_cols(cols_sel)

        if cond is None and limit is None and returns_series:
            return Series(
                self._internal.copy(
                    scol=column_scols[0], column_index=[column_index[0]]),
                anchor=self._kdf_or_kser)

    try:
        sdf = self._internal._sdf
        if cond is not None:
            sdf = sdf.drop(NATURAL_ORDER_COLUMN_NAME).filter(cond)
        if limit is not None:
            if limit >= 0:
                sdf = sdf.limit(limit)
            else:
                sdf = sdf.limit(sdf.count() + limit)
        sdf = sdf.select(self._internal.index_scols + column_scols)

        if self._internal.column_index_names is None:
            column_index_names = None
        else:
            # Manage column index names
            level = column_index_level(column_index)
            column_index_names = self._internal.column_index_names[-level:]

        internal = _InternalFrame(
            sdf=sdf,
            index_map=self._internal.index_map,
            column_index=column_index,
            column_index_names=column_index_names)
        kdf = DataFrame(internal)
    except AnalysisException:
        raise KeyError('[{}] don\'t exist in columns'.format(
            [col._jc.toString() for col in column_scols]))

    if returns_series:
        return Series(
            kdf._internal.copy(scol=kdf._internal.column_scols[0]),
            anchor=kdf)
    else:
        return kdf
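# Sketch of the '__temp_col__' alignment trick above (assumes PySpark + koalas
# and the 'compute.ops_on_diff_frames' option, which cross-frame operations
# require): a boolean Series from another frame is attached as a temporary
# column so both operands share one Spark plan before filtering.
import databricks.koalas as ks

ks.set_option("compute.ops_on_diff_frames", True)
kser = ks.Series([1, 2, 3], name="a")
mask = ks.Series([True, False, True])
print(kser.loc[mask])  # filtered through the temporary key column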
def _select_rows(self, rows_sel):
    from databricks.koalas.indexes import Index

    if isinstance(rows_sel, tuple) and len(rows_sel) > 1:
        raise SparkPandasIndexingError("Too many indexers")
    elif isinstance(rows_sel, Index):
        assert isinstance(rows_sel.spark_type, BooleanType), rows_sel.spark_type
        return rows_sel._scol, None, None
    elif isinstance(rows_sel, slice):

        def verify_type(i):
            if not isinstance(i, int):
                raise TypeError(
                    "cannot do slice indexing with these indexers [{}] of {}".format(
                        i, type(i)))

        has_negative = False
        start = rows_sel.start
        if start is not None:
            verify_type(start)
            if start == 0:
                start = None
            elif start < 0:
                has_negative = True

        stop = rows_sel.stop
        if stop is not None:
            verify_type(stop)
            if stop == 0:
                stop = None
            elif stop < 0:
                has_negative = True

        step = rows_sel.step
        if step is not None:
            verify_type(step)
            if step == 0:
                raise ValueError("slice step cannot be zero")
        else:
            step = 1

        if start is None and step == 1:
            return None, stop, None

        sdf = self._internal.spark_frame
        sequence_scol = sdf[self._sequence_col]

        if has_negative or (step < 0 and start is None):
            cnt = sdf.count()

        cond = []
        if start is not None:
            if start < 0:
                start = start + cnt
            if step >= 0:
                cond.append(sequence_scol >= F.lit(start).cast(LongType()))
            else:
                cond.append(sequence_scol <= F.lit(start).cast(LongType()))
        if stop is not None:
            if stop < 0:
                stop = stop + cnt
            if step >= 0:
                cond.append(sequence_scol < F.lit(stop).cast(LongType()))
            else:
                cond.append(sequence_scol > F.lit(stop).cast(LongType()))
        if step != 1:
            if step > 0:
                start = start or 0
            else:
                start = start or (cnt - 1)
            cond.append(((sequence_scol - start) % F.lit(step).cast(LongType())) == F.lit(0))

        return reduce(lambda x, y: x & y, cond), None, None
    elif isinstance(rows_sel, int):
        sdf = self._internal.spark_frame
        return (sdf[self._sequence_col] == rows_sel), None, 0
    else:
        iLocIndexer._raiseNotImplemented(
            ".iloc requires numeric slice or conditional "
            "boolean Index, got {}".format(type(rows_sel))
        )
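# Sketch of the extended iloc slices above (assumes PySpark + koalas):
# negative bounds are normalized against count(), and a step becomes a
# modulo condition on the internal sequence column.
import databricks.koalas as ks

kdf = ks.DataFrame({"a": [1, 2, 3, 4, 5]})
print(kdf.iloc[-2:])  # last two rows: start = -2 + count()
print(kdf.iloc[::2])  # every other row: (sequence - 0) % 2 == 0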
def __getitem__(self, key):
    from databricks.koalas.frame import DataFrame
    from databricks.koalas.series import Series

    if self._is_series:
        if isinstance(key, Series) and key._kdf is not self._kdf_or_kser._kdf:
            kdf = self._kdf_or_kser.to_frame()
            kdf["__temp_col__"] = key
            return type(self)(kdf[self._kdf_or_kser.name])[kdf["__temp_col__"]]

        cond, limit, remaining_index = self._select_rows(key)
        if cond is None and limit is None:
            return self._kdf_or_kser

        column_labels = self._internal.column_labels
        data_spark_columns = self._internal.data_spark_columns
        returns_series = True
    else:
        assert self._is_df
        if isinstance(key, tuple):
            if len(key) != 2:
                raise SparkPandasIndexingError("Only accepts pairs of candidates")
            rows_sel, cols_sel = key
        else:
            rows_sel = key
            cols_sel = None

        if isinstance(rows_sel, Series) and rows_sel._kdf is not self._kdf_or_kser:
            kdf = self._kdf_or_kser.copy()
            kdf["__temp_col__"] = rows_sel
            return type(self)(kdf)[kdf["__temp_col__"], cols_sel][
                list(self._kdf_or_kser.columns)]

        cond, limit, remaining_index = self._select_rows(rows_sel)
        column_labels, data_spark_columns, returns_series = self._select_cols(cols_sel)

        if cond is None and limit is None and returns_series:
            return self._kdf_or_kser._kser_for(column_labels[0])

    if remaining_index is not None:
        index_scols = self._internal.index_spark_columns[-remaining_index:]
        index_map = OrderedDict(list(self._internal.index_map.items())[-remaining_index:])
    else:
        index_scols = self._internal.index_spark_columns
        index_map = self._internal.index_map

    if len(column_labels) > 0:
        column_labels = column_labels.copy()
        column_labels_level = max(
            len(label) if label is not None else 1 for label in column_labels)
        none_column = 0
        for i, label in enumerate(column_labels):
            if label is None:
                label = (str(none_column),)
                none_column += 1
            if len(label) < column_labels_level:
                label = tuple(list(label) + ([""]) * (column_labels_level - len(label)))
            column_labels[i] = label

        if self._internal.column_label_names is None:
            column_label_names = None
        else:
            # Manage column index names
            column_label_names = self._internal.column_label_names[-column_labels_level:]
    else:
        column_label_names = None

    try:
        sdf = self._internal._sdf
        if cond is not None:
            sdf = sdf.drop(NATURAL_ORDER_COLUMN_NAME).filter(cond)
        if limit is not None:
            if limit >= 0:
                sdf = sdf.limit(limit)
            else:
                sdf = sdf.limit(sdf.count() + limit)
        data_columns = sdf.select(data_spark_columns).columns
        sdf = sdf.select(index_scols + data_spark_columns)
    except AnalysisException:
        raise KeyError(
            "[{}] don't exist in columns".format(
                [col._jc.toString() for col in data_spark_columns]))

    internal = _InternalFrame(
        spark_frame=sdf,
        index_map=index_map,
        column_labels=column_labels,
        data_spark_columns=[scol_for(sdf, col) for col in data_columns],
        column_label_names=column_label_names,
    )
    kdf = DataFrame(internal)

    if returns_series:
        kdf_or_kser = Series(
            kdf._internal.copy(spark_column=kdf._internal.data_spark_columns[0]),
            anchor=kdf)
    else:
        kdf_or_kser = kdf

    if remaining_index is not None and remaining_index == 0:
        pdf_or_pser = kdf_or_kser.head(2).to_pandas()
        length = len(pdf_or_pser)
        if length == 0:
            raise KeyError(name_like_string(key))
        elif length == 1:
            return pdf_or_pser.iloc[0]
        else:
            return kdf_or_kser
    else:
        return kdf_or_kser
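# Sketch of the returns_series fast path above (assumes PySpark + koalas):
# with no row condition or limit, loc returns a Series anchored to the
# existing DataFrame via _kser_for instead of building a new _InternalFrame.
import databricks.koalas as ks

kdf = ks.DataFrame({"a": [1, 2], "b": [3, 4]})
kser = kdf.loc[:, "a"]  # cond is None and limit is None -> anchored Series
print(kser)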
def _select_rows(self, rows_sel):
    from databricks.koalas.indexes import MultiIndex
    from databricks.koalas.series import Series

    if isinstance(rows_sel, Series):
        assert isinstance(rows_sel.spark_type, BooleanType), rows_sel.spark_type
        return rows_sel._scol, None, None
    elif isinstance(rows_sel, slice):
        assert len(self._internal.index_spark_column_names) > 0
        if rows_sel.step is not None:
            LocIndexer._raiseNotImplemented("Cannot use step with Spark.")
        if rows_sel == slice(None):
            # If slice is None - select everything, so nothing to do
            return None, None, None
        elif len(self._internal.index_spark_column_names) == 1:
            sdf = self._internal.spark_frame
            index = self._kdf_or_kser.index
            index_column = index.to_series()
            index_data_type = index_column.spark_type
            start = rows_sel.start
            stop = rows_sel.stop

            # get natural order from '__natural_order__' from start to stop
            # to keep natural order.
            start_and_stop = (
                sdf.select(index_column._scol, NATURAL_ORDER_COLUMN_NAME)
                .where(
                    (index_column._scol == F.lit(start).cast(index_data_type))
                    | (index_column._scol == F.lit(stop).cast(index_data_type))
                )
                .collect()
            )

            start = [row[1] for row in start_and_stop if row[0] == start]
            start = start[0] if len(start) > 0 else None

            stop = [row[1] for row in start_and_stop if row[0] == stop]
            stop = stop[-1] if len(stop) > 0 else None

            cond = []
            if start is not None:
                cond.append(
                    F.col(NATURAL_ORDER_COLUMN_NAME) >= F.lit(start).cast(LongType()))
            if stop is not None:
                cond.append(
                    F.col(NATURAL_ORDER_COLUMN_NAME) <= F.lit(stop).cast(LongType()))

            # if index order is not monotonic increasing or decreasing
            # and specified values don't exist in index, raise KeyError
            if (start is None and rows_sel.start is not None) or (
                    stop is None and rows_sel.stop is not None):
                inc = index_column.is_monotonic_increasing
                if inc is False:
                    dec = index_column.is_monotonic_decreasing

                if start is None and rows_sel.start is not None:
                    start = rows_sel.start
                    if inc is not False:
                        cond.append(index_column._scol >= F.lit(start).cast(index_data_type))
                    elif dec is not False:
                        cond.append(index_column._scol <= F.lit(start).cast(index_data_type))
                    else:
                        raise KeyError(rows_sel.start)
                if stop is None and rows_sel.stop is not None:
                    stop = rows_sel.stop
                    if inc is not False:
                        cond.append(index_column._scol <= F.lit(stop).cast(index_data_type))
                    elif dec is not False:
                        cond.append(index_column._scol >= F.lit(stop).cast(index_data_type))
                    else:
                        raise KeyError(rows_sel.stop)

            return reduce(lambda x, y: x & y, cond), None, None
        else:
            index = self._kdf_or_kser.index
            index_data_type = [f.dataType for f in index.to_series().spark_type]

            start = rows_sel.start
            if start is not None:
                if not isinstance(start, tuple):
                    start = (start,)
                if len(start) == 0:
                    start = None
            stop = rows_sel.stop
            if stop is not None:
                if not isinstance(stop, tuple):
                    stop = (stop,)
                if len(stop) == 0:
                    stop = None

            depth = max(
                len(start) if start is not None else 0,
                len(stop) if stop is not None else 0)
            if depth == 0:
                return None, None, None
            elif (depth > len(self._internal.index_map)
                  or not index.droplevel(
                      list(range(len(self._internal.index_map))[depth:])).is_monotonic):
                raise KeyError(
                    "Key length ({}) was greater than MultiIndex sort depth".format(depth))

            conds = []
            if start is not None:
                cond = F.lit(True)
                for scol, value, dt in list(
                        zip(self._internal.index_spark_columns, start,
                            index_data_type))[::-1]:
                    compare = MultiIndex._comparator_for_monotonic_increasing(dt)
                    cond = F.when(scol.eqNullSafe(F.lit(value).cast(dt)), cond).otherwise(
                        compare(scol, F.lit(value).cast(dt), spark.Column.__gt__))
                conds.append(cond)
            if stop is not None:
                cond = F.lit(True)
                for scol, value, dt in list(
                        zip(self._internal.index_spark_columns, stop,
                            index_data_type))[::-1]:
                    compare = MultiIndex._comparator_for_monotonic_increasing(dt)
                    cond = F.when(scol.eqNullSafe(F.lit(value).cast(dt)), cond).otherwise(
                        compare(scol, F.lit(value).cast(dt), spark.Column.__lt__))
                conds.append(cond)

            return reduce(lambda x, y: x & y, conds), None, None
    elif is_list_like(rows_sel) and not isinstance(rows_sel, tuple):
        rows_sel = list(rows_sel)
        if len(rows_sel) == 0:
            return F.lit(False), None, None
        elif len(self._internal.index_spark_column_names) == 1:
            index_column = self._kdf_or_kser.index.to_series()
            index_data_type = index_column.spark_type
            if len(rows_sel) == 1:
                return (
                    index_column._scol == F.lit(rows_sel[0]).cast(index_data_type),
                    None,
                    None,
                )
            else:
                return (
                    index_column._scol.isin(
                        [F.lit(r).cast(index_data_type) for r in rows_sel]),
                    None,
                    None,
                )
        else:
            LocIndexer._raiseNotImplemented("Cannot select with MultiIndex with Spark.")
    else:
        if not isinstance(rows_sel, tuple):
            rows_sel = (rows_sel,)
        if len(rows_sel) > len(self._internal.index_map):
            raise SparkPandasIndexingError("Too many indexers")

        rows = [scol == value
                for scol, value in zip(self._internal.index_spark_columns, rows_sel)]
        return (
            reduce(lambda x, y: x & y, rows),
            None,
            len(self._internal.index_map) - len(rows_sel),
        )
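# Sketch of the MultiIndex slice path above (assumes PySpark + koalas): a
# non-tuple bound is wrapped into a 1-tuple, so a partial key slices on the
# first level as long as the index is sorted to that depth.
import databricks.koalas as ks
import pandas as pd

pdf = pd.DataFrame(
    {"v": [10, 20, 30]},
    index=pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)]))
kdf = ks.from_pandas(pdf)
print(kdf.loc["a":"a"])  # rows whose first index level equals 'a'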
def __setitem__(self, key, value):
    from databricks.koalas.frame import DataFrame
    from databricks.koalas.series import Series, _col

    if self._is_series:
        if (isinstance(key, Series) and key._kdf is not self._kdf_or_kser._kdf) or (
                isinstance(value, Series) and value._kdf is not self._kdf_or_kser._kdf):
            kdf = self._kdf_or_kser.to_frame()
            temp_natural_order = verify_temp_column_name(kdf, "__temp_natural_order__")
            temp_key_col = verify_temp_column_name(kdf, "__temp_key_col__")
            temp_value_col = verify_temp_column_name(kdf, "__temp_value_col__")
            kdf[temp_natural_order] = F.monotonically_increasing_id()
            if isinstance(key, Series):
                kdf[temp_key_col] = key
            if isinstance(value, Series):
                kdf[temp_value_col] = value
            kdf = kdf.sort_values(temp_natural_order).drop(temp_natural_order)

            kser = kdf[self._kdf_or_kser.name]
            if isinstance(key, Series):
                key = kdf[temp_key_col]
            if isinstance(value, Series):
                value = kdf[temp_value_col]

            type(self)(kser)[key] = value

            self._kdf_or_kser._internal = kser._internal
            self._kdf_or_kser._kdf = kser._kdf
            return

        if isinstance(value, DataFrame):
            raise ValueError("Incompatible indexer with DataFrame")

        cond, limit, remaining_index = self._select_rows(key)
        if cond is None:
            cond = F.lit(True)
        if limit is not None:
            cond = cond & (self._internal.spark_frame[self._sequence_col] < F.lit(limit))

        if isinstance(value, Series):
            if remaining_index is not None and remaining_index == 0:
                raise ValueError(
                    "No axis named {} for object type {}".format(key, type(value)))
            value = value._scol
        else:
            value = F.lit(value)
        scol = (
            F.when(cond, value)
            .otherwise(self._internal.spark_column)
            .alias(name_like_string(self._kdf_or_kser.name or "0"))
        )
        internal = self._internal.copy(spark_column=scol)
        self._kdf_or_kser._internal = internal
    else:
        assert self._is_df

        if isinstance(key, tuple):
            if len(key) != 2:
                raise SparkPandasIndexingError("Only accepts pairs of candidates")
            rows_sel, cols_sel = key
        else:
            rows_sel = key
            cols_sel = None

        if isinstance(value, DataFrame):
            if len(value.columns) == 1:
                value = _col(value)
            else:
                raise ValueError("Only a dataframe with one column can be assigned")

        if (isinstance(rows_sel, Series) and rows_sel._kdf is not self._kdf_or_kser) or (
                isinstance(value, Series) and value._kdf is not self._kdf_or_kser):
            kdf = self._kdf_or_kser.copy()
            temp_natural_order = verify_temp_column_name(kdf, "__temp_natural_order__")
            temp_key_col = verify_temp_column_name(kdf, "__temp_key_col__")
            temp_value_col = verify_temp_column_name(kdf, "__temp_value_col__")
            kdf[temp_natural_order] = F.monotonically_increasing_id()
            if isinstance(rows_sel, Series):
                kdf[temp_key_col] = rows_sel
            if isinstance(value, Series):
                kdf[temp_value_col] = value
            kdf = kdf.sort_values(temp_natural_order)

            if isinstance(rows_sel, Series):
                rows_sel = kdf[temp_key_col]
            if isinstance(value, Series):
                value = kdf[temp_value_col]

            type(self)(kdf)[rows_sel, cols_sel] = value

            self._kdf_or_kser._internal = kdf[list(self._kdf_or_kser.columns)]._internal
            return

        cond, limit, remaining_index = self._select_rows(rows_sel)
        missing_keys = []
        _, data_spark_columns, _ = self._select_cols(cols_sel, missing_keys=missing_keys)

        if cond is None:
            cond = F.lit(True)
        if limit is not None:
            cond = cond & (self._internal.spark_frame[self._sequence_col] < F.lit(limit))

        if isinstance(value, Series):
            if remaining_index is not None and remaining_index == 0:
                raise ValueError("Incompatible indexer with Series")
            if len(data_spark_columns) > 1:
                raise ValueError("shape mismatch")
            value = value._scol
        else:
            value = F.lit(value)

        new_data_spark_columns = []
        for new_scol, spark_column_name in zip(
                self._internal.data_spark_columns,
                self._internal.data_spark_column_names):
            for scol in data_spark_columns:
                if new_scol._jc.equals(scol._jc):
                    new_scol = F.when(cond, value).otherwise(scol).alias(spark_column_name)
                    break
            new_data_spark_columns.append(new_scol)

        column_labels = self._internal.column_labels.copy()
        for label in missing_keys:
            if isinstance(label, str):
                label = (label,)
            if len(label) < self._internal.column_labels_level:
                label = tuple(
                    list(label) + ([""] * (self._internal.column_labels_level - len(label))))
            elif len(label) > self._internal.column_labels_level:
                raise KeyError(
                    "Key length ({}) exceeds index depth ({})".format(
                        len(label), self._internal.column_labels_level))
            column_labels.append(label)
            new_data_spark_columns.append(F.when(cond, value).alias(name_like_string(label)))

        internal = self._internal.with_new_columns(new_data_spark_columns, column_labels)
        self._kdf_or_kser._internal = internal
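# Sketch of the DataFrame __setitem__ flow above (assumes PySpark + koalas):
# the row condition and column selection turn into a
# F.when(cond, value).otherwise(old) rewrite of the matching data columns.
import databricks.koalas as ks

kdf = ks.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
kdf.loc[kdf["a"] > 1, "b"] = 0  # rows where a > 1 get b overwritten with 0
print(kdf)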
def _select_rows(self, rows_sel):
    from databricks.koalas.indexes import Index

    if isinstance(rows_sel, tuple) and len(rows_sel) > 1:
        raise SparkPandasIndexingError("Too many indexers")
    elif isinstance(rows_sel, Index):
        assert isinstance(rows_sel.spark_type, BooleanType), rows_sel.spark_type
        return rows_sel._scol, None, None
    elif isinstance(rows_sel, slice):

        def verify_type(i):
            if not isinstance(i, int):
                raise TypeError(
                    "cannot do slice indexing with these indexers [{}] of {}".format(
                        i, type(i)))

        has_negative = False
        start = rows_sel.start
        if start is not None:
            verify_type(start)
            if start == 0:
                start = None
            elif start < 0:
                has_negative = True

        stop = rows_sel.stop
        if stop is not None:
            verify_type(stop)
            if stop == 0:
                stop = None
            elif stop < 0:
                has_negative = True

        step = rows_sel.step
        if step is not None:
            verify_type(step)
            if step == 0:
                raise ValueError("slice step cannot be zero")
        else:
            step = 1

        if start is None and step == 1:
            return None, stop, None

        sdf = self._internal.spark_frame
        sequence_scol = sdf[self._sequence_col]

        if has_negative or (step < 0 and start is None):
            cnt = sdf.count()

        cond = []
        if start is not None:
            if start < 0:
                start = start + cnt
            if step >= 0:
                cond.append(sequence_scol >= F.lit(start).cast(LongType()))
            else:
                cond.append(sequence_scol <= F.lit(start).cast(LongType()))
        if stop is not None:
            if stop < 0:
                stop = stop + cnt
            if step >= 0:
                cond.append(sequence_scol < F.lit(stop).cast(LongType()))
            else:
                cond.append(sequence_scol > F.lit(stop).cast(LongType()))
        if step != 1:
            if step > 0:
                start = start or 0
            else:
                start = start or (cnt - 1)
            cond.append(((sequence_scol - start) % F.lit(step).cast(LongType())) == F.lit(0))

        return reduce(lambda x, y: x & y, cond), None, None
    elif isinstance(rows_sel, int):
        sdf = self._internal.spark_frame
        return (sdf[self._sequence_col] == rows_sel), None, 0
    elif isinstance(rows_sel, Iterable):
        sdf = self._internal.spark_frame
        if any(isinstance(key, (int, np.int, np.int64, np.int32)) and key < 0
               for key in rows_sel):
            offset = sdf.count()
        else:
            offset = 0

        new_rows_sel = []
        for key in list(rows_sel):
            if not isinstance(key, (int, np.int, np.int64, np.int32)):
                raise TypeError(
                    "cannot do positional indexing with these indexers [{}] of {}".format(
                        key, type(key)))
            if key < 0:
                key = key + offset
            new_rows_sel.append(key)

        if len(new_rows_sel) != len(set(new_rows_sel)):
            raise NotImplementedError(
                "Duplicated row selection is not currently supported; "
                "however, normalised index was [%s]" % new_rows_sel)

        sequence_scol = sdf[self._sequence_col]
        cond = []
        for key in new_rows_sel:
            cond.append(sequence_scol == F.lit(int(key)).cast(LongType()))

        if len(cond) == 0:
            cond = [F.lit(False)]
        return reduce(lambda x, y: x | y, cond), None, None
    else:
        iLocIndexer._raiseNotImplemented(
            ".iloc requires numeric slice, conditional "
            "boolean Index or a sequence of positions as int, "
            "got {}".format(type(rows_sel)))
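# Sketch of the Iterable branch above (assumes PySpark + koalas): each
# position becomes an equality test on the sequence column, OR-ed together;
# negative positions are normalized with count() and duplicates are rejected.
import databricks.koalas as ks

kdf = ks.DataFrame({"a": [1, 2, 3, 4]})
print(kdf.iloc[[0, -1]])  # first and last rows
# kdf.iloc[[0, 0]]        # would raise NotImplementedError: duplicated selection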