def sdc_pandas_series_operator_binop(self, other):
    """
    Pandas Series operator :attr:`pandas.Series.binop` implementation

    Note: Currently implemented for numeric Series only.
        Differs from Pandas in returning Series with fixed dtype :obj:`float64`

    .. only:: developer

    **Test**: python -m sdc.runtests -k sdc.tests.test_series.TestSeries.test_series_op1*
              python -m sdc.runtests -k sdc.tests.test_series.TestSeries.test_series_op2*
              python -m sdc.runtests -k sdc.tests.test_series.TestSeries.test_series_operator_binop*

    Parameters
    ----------
    series: :obj:`pandas.Series`
        Input series
    other: :obj:`pandas.Series` or :obj:`scalar`
        Series or scalar value to be used as a second argument of binary operation

    Returns
    -------
    :obj:`pandas.Series`
        The result of the operation
    """
    _func_name = 'Operator binop().'
    # FIX: pass _func_name instead of a second copy of the literal, for
    # consistency with sdc_pandas_series_binop and the other overloads below
    ty_checker = TypeChecker(_func_name)

    self_is_series, other_is_series = isinstance(self, SeriesType), isinstance(other, SeriesType)
    if not (self_is_series or other_is_series):
        return None

    # this overload is not for string series
    self_is_string_series = self_is_series and isinstance(self.dtype, types.UnicodeType)
    other_is_string_series = other_is_series and isinstance(other.dtype, types.UnicodeType)
    if self_is_string_series or other_is_string_series:
        return None

    if not isinstance(self, (SeriesType, types.Number)):
        ty_checker.raise_exc(self, 'pandas.series or scalar', 'self')

    if not isinstance(other, (SeriesType, types.Number)):
        ty_checker.raise_exc(other, 'pandas.series or scalar', 'other')

    operands_are_series = self_is_series and other_is_series
    if operands_are_series:
        # indexes are alignable if both are None/numeric or their types are comparable
        none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self))
                                   and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other)))
        series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes
        if not series_indexes_comparable:
            raise TypingError('{} Not implemented for series with not-comparable indexes. \
            Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index))

    series_data_comparable = check_types_comparable(self, other)
    if not series_data_comparable:
        raise TypingError('{} Not supported for not-comparable operands. \
        Given: self={}, other={}'.format(_func_name, self, other))

    # specializations for numeric series only
    if not operands_are_series:
        # one operand is a scalar: broadcast it against the series data
        def _series_operator_binop_scalar_impl(self, other):
            if self_is_series == True:  # noqa
                result_data = numpy.empty(len(self._data), dtype=numpy.float64)
                result_data[:] = self._data + numpy.float64(other)
                return pandas.Series(result_data, index=self._index, name=self._name)
            else:
                result_data = numpy.empty(len(other._data), dtype=numpy.float64)
                result_data[:] = numpy.float64(self) + other._data
                return pandas.Series(result_data, index=other._index, name=other._name)

        return _series_operator_binop_scalar_impl

    else:   # both operands are numeric series
        # optimization for series with default indexes, that can be aligned differently
        if (isinstance(self.index, types.NoneType) and isinstance(other.index, types.NoneType)):
            def _series_operator_binop_none_indexes_impl(self, other):
                if (len(self._data) == len(other._data)):
                    result_data = astype(self._data, numpy.float64)
                    result_data = result_data + other._data
                    return pandas.Series(result_data)
                else:
                    # different lengths: pad the shorter operand with NaN before adding
                    left_size, right_size = len(self._data), len(other._data)
                    min_data_size = min(left_size, right_size)
                    max_data_size = max(left_size, right_size)
                    result_data = numpy.empty(max_data_size, dtype=numpy.float64)
                    if (left_size == min_data_size):
                        result_data[:min_data_size] = self._data
                        result_data[min_data_size:] = numpy.nan
                        result_data = result_data + other._data
                    else:
                        result_data[:min_data_size] = other._data
                        result_data[min_data_size:] = numpy.nan
                        result_data = self._data + result_data

                    return pandas.Series(result_data)

            return _series_operator_binop_none_indexes_impl
        else:
            # for numeric indexes find common dtype to be used when creating joined index
            if none_or_numeric_indexes:
                ty_left_index_dtype = types.int64 if isinstance(self.index, types.NoneType) else self.index.dtype
                ty_right_index_dtype = types.int64 if isinstance(other.index, types.NoneType) else other.index.dtype
                numba_index_common_dtype = find_common_dtype_from_numpy_dtypes(
                    [ty_left_index_dtype, ty_right_index_dtype], [])

            def _series_operator_binop_common_impl(self, other):
                left_index, right_index = self.index, other.index

                # check if indexes are equal and series don't have to be aligned
                if sdc_check_indexes_equal(left_index, right_index):
                    result_data = numpy.empty(len(self._data), dtype=numpy.float64)
                    result_data[:] = self._data + other._data

                    if none_or_numeric_indexes == True:  # noqa
                        result_index = astype(left_index, numba_index_common_dtype)
                    else:
                        result_index = self._index

                    return pandas.Series(result_data, index=result_index)

                # TODO: replace below with core join(how='outer', return_indexers=True) when implemented
                joined_index, left_indexer, right_indexer = sdc_join_series_indexes(left_index, right_index)
                result_size = len(joined_index)
                left_values = numpy.empty(result_size, dtype=numpy.float64)
                right_values = numpy.empty(result_size, dtype=numpy.float64)
                for i in numba.prange(result_size):
                    left_pos, right_pos = left_indexer[i], right_indexer[i]
                    # -1 in an indexer means the label is absent on that side
                    left_values[i] = self._data[left_pos] if left_pos != -1 else numpy.nan
                    right_values[i] = other._data[right_pos] if right_pos != -1 else numpy.nan

                result_data = left_values + right_values
                return pandas.Series(result_data, joined_index)

            return _series_operator_binop_common_impl

    return None
def sdc_pandas_series_binop(self, other, level=None, fill_value=None, axis=0):
    """
    Intel Scalable Dataframe Compiler User Guide
    ********************************************

    Pandas API: pandas.Series.binop

    Limitations
    -----------
    Parameters ``level`` and ``axis`` are currently unsupported by Intel Scalable Dataframe Compiler

    Examples
    --------
    .. literalinclude:: ../../../examples/series/series_binop.py
       :language: python
       :lines: 27-
       :caption:
       :name: ex_series_binop

    .. command-output:: python ./series/series_binop.py
       :cwd: ../../../examples

    Intel Scalable Dataframe Compiler Developer Guide
    *************************************************
    Pandas Series method :meth:`pandas.Series.binop` implementation.

    .. only:: developer
        Test: python -m sdc.runtests sdc.tests.test_series.TestSeries.test_series_op5
    """
    _func_name = 'Method binop().'
    ty_checker = TypeChecker(_func_name)
    self_is_series, other_is_series = isinstance(self, SeriesType), isinstance(other, SeriesType)
    if not (self_is_series or other_is_series):
        return None

    # this overload is not for string series
    self_is_string_series = self_is_series and isinstance(self.dtype, types.UnicodeType)
    other_is_string_series = other_is_series and isinstance(other.dtype, types.UnicodeType)
    if self_is_string_series or other_is_string_series:
        return None

    if not isinstance(self, (SeriesType, types.Number)):
        ty_checker.raise_exc(self, 'pandas.series or scalar', 'self')

    if not isinstance(other, (SeriesType, types.Number)):
        ty_checker.raise_exc(other, 'pandas.series or scalar', 'other')

    operands_are_series = self_is_series and other_is_series
    if operands_are_series:
        # indexes are alignable if both are None/numeric, or their types are comparable
        none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self))
                                   and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other)))
        series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes
        if not series_indexes_comparable:
            raise TypingError('{} Not implemented for series with not-comparable indexes. \
            Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index))

    series_data_comparable = check_types_comparable(self, other)
    if not series_data_comparable:
        raise TypingError('{} Not supported for not-comparable operands. \
        Given: self={}, other={}'.format(_func_name, self, other))

    if not isinstance(level, types.Omitted) and level is not None:
        ty_checker.raise_exc(level, 'None', 'level')

    if not isinstance(fill_value, (types.Omitted, types.Number, types.NoneType)) and fill_value is not None:
        ty_checker.raise_exc(fill_value, 'number', 'fill_value')
    # compile-time flag: was fill_value omitted / None? (decides NaN vs user value below)
    fill_value_is_none = isinstance(fill_value, (types.NoneType, types.Omitted)) or fill_value is None

    if not isinstance(axis, types.Omitted) and axis != 0:
        ty_checker.raise_exc(axis, 'int', 'axis')

    # specializations for numeric series only
    if not operands_are_series:
        # one operand is a scalar: fill NaNs in the series data, then broadcast
        def _series_binop_scalar_impl(self, other, level=None, fill_value=None, axis=0):
            if self_is_series == True:  # noqa
                numpy_like.fillna(self._data, inplace=True, value=fill_value)
                result_data = numpy.empty(len(self._data), dtype=numpy.float64)
                result_data[:] = self._data + numpy.float64(other)
                return pandas.Series(result_data, index=self._index, name=self._name)
            else:
                numpy_like.fillna(other._data, inplace=True, value=fill_value)
                result_data = numpy.empty(len(other._data), dtype=numpy.float64)
                result_data[:] = numpy.float64(self) + other._data
                return pandas.Series(result_data, index=other._index, name=other._name)

        return _series_binop_scalar_impl

    else:   # both operands are numeric series
        # optimization for series with default indexes, that can be aligned differently
        if (isinstance(self.index, types.NoneType) and isinstance(other.index, types.NoneType)):
            def _series_binop_none_indexes_impl(self, other, level=None, fill_value=None, axis=0):
                numpy_like.fillna(self._data, inplace=True, value=fill_value)
                numpy_like.fillna(other._data, inplace=True, value=fill_value)
                if (len(self._data) == len(other._data)):
                    result_data = numpy_like.astype(self._data, numpy.float64)
                    result_data = result_data + other._data
                    return pandas.Series(result_data)
                else:
                    # different lengths: pad the shorter operand with _fill_value before adding
                    left_size, right_size = len(self._data), len(other._data)
                    min_data_size = min(left_size, right_size)
                    max_data_size = max(left_size, right_size)
                    result_data = numpy.empty(max_data_size, dtype=numpy.float64)
                    _fill_value = numpy.nan if fill_value_is_none == True else fill_value  # noqa
                    if (left_size == min_data_size):
                        result_data[:min_data_size] = self._data
                        for i in range(min_data_size, len(result_data)):
                            result_data[i] = _fill_value
                        result_data = result_data + other._data
                    else:
                        result_data[:min_data_size] = other._data
                        for i in range(min_data_size, len(result_data)):
                            result_data[i] = _fill_value
                        result_data = self._data + result_data

                    return pandas.Series(result_data)

            return _series_binop_none_indexes_impl
        else:
            # runtime index-equality check only makes sense for range-like indexes
            left_index_is_range = isinstance(self.index, (RangeIndexType, types.NoneType))
            right_index_is_range = isinstance(other.index, (RangeIndexType, types.NoneType))
            check_index_equal = left_index_is_range and right_index_is_range
            # guard .dtype access: a None (default) index is treated as a RangeIndex
            self_index_dtype = RangeIndexType.dtype if isinstance(self.index, types.NoneType) else self.index.dtype
            other_index_dtype = RangeIndexType.dtype if isinstance(other.index, types.NoneType) else other.index.dtype
            index_dtypes_match = self_index_dtype == other_index_dtype
            if not index_dtypes_match:
                numba_index_common_dtype = find_common_dtype_from_numpy_dtypes(
                    [self_index_dtype, other_index_dtype], [])
            else:
                numba_index_common_dtype = self_index_dtype

            def _series_binop_common_impl(self, other, level=None, fill_value=None, axis=0):
                left_index, right_index = self.index, other.index
                numpy_like.fillna(self._data, inplace=True, value=fill_value)
                numpy_like.fillna(other._data, inplace=True, value=fill_value)
                if check_index_equal == True:  # noqa
                    equal_indexes = numpy_like.array_equal(left_index, right_index)
                else:
                    equal_indexes = False

                # indexes equal: no alignment needed, add element-wise
                if (left_index is right_index or equal_indexes):
                    result_data = numpy.empty(len(self._data), dtype=numpy.float64)
                    result_data[:] = self._data + other._data

                    if index_dtypes_match == False:  # noqa
                        result_index = numpy_like.astype(left_index, numba_index_common_dtype)
                    else:
                        result_index = left_index.values if left_index_is_range == True else left_index  # noqa

                    return pandas.Series(result_data, index=result_index)

                # TODO: replace below with core join(how='outer', return_indexers=True) when implemented
                joined_index, left_indexer, right_indexer = sdc_join_series_indexes(left_index, right_index)
                result_size = len(joined_index)
                left_values = numpy.empty(result_size, dtype=numpy.float64)
                right_values = numpy.empty(result_size, dtype=numpy.float64)
                _fill_value = numpy.nan if fill_value_is_none == True else fill_value  # noqa
                for i in range(result_size):
                    left_pos, right_pos = left_indexer[i], right_indexer[i]
                    # -1 in an indexer means the label is absent on that side
                    left_values[i] = self._data[left_pos] if left_pos != -1 else _fill_value
                    right_values[i] = other._data[right_pos] if right_pos != -1 else _fill_value

                result_data = left_values + right_values
                return pandas.Series(result_data, joined_index)

            return _series_binop_common_impl

    return None
def sdc_pandas_series_operator_comp_binop(self, other):
    """
    Pandas Series operator :attr:`pandas.Series.comp_binop` implementation

    .. only:: developer

    **Test**: python -m sdc.runtests -k sdc.tests.test_series.TestSeries.test_series_op7*
              python -m sdc.runtests -k sdc.tests.test_series.TestSeries.test_series_operator_comp_binop*

    Parameters
    ----------
    series: :obj:`pandas.Series`
        Input series
    other: :obj:`pandas.Series` or :obj:`scalar`
        Series or scalar value to be used as a second argument of binary operation

    Returns
    -------
    :obj:`pandas.Series`
        The result of the operation
    """
    _func_name = 'Operator comp_binop().'
    # FIX: pass _func_name instead of a second copy of the literal, for
    # consistency with sdc_pandas_series_comp_binop and the other overloads
    ty_checker = TypeChecker(_func_name)

    self_is_series, other_is_series = isinstance(self, SeriesType), isinstance(other, SeriesType)
    if not (self_is_series or other_is_series):
        return None

    # comparisons additionally allow string scalars (unlike the arithmetic overload)
    if not isinstance(self, (SeriesType, types.Number, types.UnicodeType)):
        ty_checker.raise_exc(self, 'pandas.series or scalar', 'self')

    if not isinstance(other, (SeriesType, types.Number, types.UnicodeType)):
        ty_checker.raise_exc(other, 'pandas.series or scalar', 'other')

    operands_are_series = self_is_series and other_is_series
    if operands_are_series:
        # indexes are alignable if both are None/numeric or their types are comparable
        none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self))
                                   and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other)))
        series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes
        if not series_indexes_comparable:
            raise TypingError('{} Not implemented for series with not-comparable indexes. \
            Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index))

    series_data_comparable = check_types_comparable(self, other)
    if not series_data_comparable:
        raise TypingError('{} Not supported for not-comparable operands. \
        Given: self={}, other={}'.format(_func_name, self, other))

    if not operands_are_series:
        # one operand is a scalar: compare it against the series data element-wise
        def _series_operator_comp_binop_scalar_impl(self, other):
            if self_is_series == True:  # noqa
                return pandas.Series(self._data < other, index=self._index, name=self._name)
            else:
                return pandas.Series(self < other._data, index=other._index, name=other._name)

        return _series_operator_comp_binop_scalar_impl

    else:
        # optimization for series with default indexes, that can be aligned differently
        if (isinstance(self.index, types.NoneType) and isinstance(other.index, types.NoneType)):
            def _series_operator_comp_binop_none_indexes_impl(self, other):
                left_size, right_size = len(self._data), len(other._data)
                if (left_size == right_size):
                    return pandas.Series(self._data < other._data)
                else:
                    raise ValueError("Can only compare identically-labeled Series objects")

            return _series_operator_comp_binop_none_indexes_impl
        else:
            # for numeric indexes find common dtype to be used when creating result index
            if none_or_numeric_indexes:
                ty_left_index_dtype = types.int64 if isinstance(self.index, types.NoneType) else self.index.dtype
                ty_right_index_dtype = types.int64 if isinstance(other.index, types.NoneType) else other.index.dtype
                numba_index_common_dtype = find_common_dtype_from_numpy_dtypes(
                    [ty_left_index_dtype, ty_right_index_dtype], [])

            def _series_operator_comp_binop_common_impl(self, other):
                left_index, right_index = self.index, other.index

                if sdc_check_indexes_equal(left_index, right_index):
                    if none_or_numeric_indexes == True:  # noqa
                        new_index = astype(left_index, numba_index_common_dtype)
                    else:
                        new_index = self._index
                    return pandas.Series(self._data < other._data, new_index)
                else:
                    # comparison of differently-labeled series is not supported (as in pandas)
                    raise ValueError("Can only compare identically-labeled Series objects")

            return _series_operator_comp_binop_common_impl

    return None
def hpat_arrays_append_overload(A, B):
    """Function for appending underlying arrays (A and B) or list/tuple of arrays B to an array A"""

    # index-type operands must be unboxed to plain arrays via .values inside the impls
    use_A_array = isinstance(A, (RangeIndexType, Int64IndexType))
    use_B_array = isinstance(B, (RangeIndexType, Int64IndexType))
    if isinstance(A, (types.Array, RangeIndexType, Int64IndexType)):
        if isinstance(B, (types.Array, RangeIndexType, Int64IndexType)):
            # single array/index appended to array/index
            def _append_single_numeric_impl(A, B):
                _A = A.values if use_A_array == True else A  # noqa
                _B = B.values if use_B_array == True else B  # noqa
                return numpy.concatenate((_A, _B,))

            return _append_single_numeric_impl
        elif (isinstance(B, (types.UniTuple, types.List))
                and isinstance(B.dtype, (types.Array, RangeIndexType, Int64IndexType))):
            B_dtype_is_index = isinstance(B.dtype, (RangeIndexType, Int64IndexType))
            numba_common_dtype = find_common_dtype_from_numpy_dtypes([A.dtype, B.dtype.dtype], [])

            # TODO: refactor to use numpy.concatenate when Numba supports building a tuple at runtime
            def _append_list_numeric_impl(A, B):
                # preallocate the result at the total size, then copy chunks in order
                total_length = len(A) + numpy.array([len(arr) for arr in B]).sum()
                new_data = numpy.empty(total_length, numba_common_dtype)

                stop = len(A)
                _A = numpy.array(A) if use_A_array == True else A  # noqa
                new_data[:stop] = _A
                for arr in B:
                    _arr = arr.values if B_dtype_is_index == True else arr  # noqa
                    start = stop
                    stop = start + len(_arr)
                    new_data[start:stop] = _arr
                return new_data

            return _append_list_numeric_impl

    elif A == string_array_type:
        if B == string_array_type:
            def _append_single_string_array_impl(A, B):
                # pre-allocate by total element count and total character count
                total_size = len(A) + len(B)
                total_chars = num_total_chars(A) + num_total_chars(B)
                new_data = sdc.str_arr_ext.pre_alloc_string_array(total_size, total_chars)

                pos = 0
                pos += append_string_array_to(new_data, pos, A)
                pos += append_string_array_to(new_data, pos, B)
                return new_data

            return _append_single_string_array_impl
        elif (isinstance(B, (types.UniTuple, types.List)) and B.dtype == string_array_type):
            def _append_list_string_array_impl(A, B):
                array_list = [A] + list(B)
                total_size = numpy.array([len(arr) for arr in array_list]).sum()
                total_chars = numpy.array([num_total_chars(arr) for arr in array_list]).sum()

                new_data = sdc.str_arr_ext.pre_alloc_string_array(total_size, total_chars)

                pos = 0
                pos += append_string_array_to(new_data, pos, A)
                for arr in B:
                    pos += append_string_array_to(new_data, pos, arr)

                return new_data

            return _append_list_string_array_impl
def sdc_pandas_series_comp_binop(self, other, level=None, fill_value=None, axis=0):
    """
    Intel Scalable Dataframe Compiler User Guide
    ********************************************

    Pandas API: pandas.Series.comp_binop

    Limitations
    -----------
    Parameters ``level`` and ``axis`` are currently unsupported by Intel Scalable Dataframe Compiler

    Examples
    --------
    .. literalinclude:: ../../../examples/series/series_comp_binop.py
       :language: python
       :lines: 27-
       :caption:
       :name: ex_series_comp_binop

    .. command-output:: python ./series/series_comp_binop.py
       :cwd: ../../../examples

    Intel Scalable Dataframe Compiler Developer Guide
    *************************************************
    Pandas Series method :meth:`pandas.Series.comp_binop` implementation.

    .. only:: developer
        Test: python -m sdc.runtests -k sdc.tests.test_series.TestSeries.test_series_op8
    """
    _func_name = 'Method comp_binop().'
    ty_checker = TypeChecker(_func_name)
    ty_checker.check(self, SeriesType)

    if not (isinstance(level, types.Omitted) or level is None):
        ty_checker.raise_exc(level, 'None', 'level')

    if not isinstance(fill_value, (types.Omitted, types.Number, types.NoneType)) and fill_value is not None:
        ty_checker.raise_exc(fill_value, 'number', 'fill_value')

    if not (isinstance(axis, types.Omitted) or axis == 0):
        ty_checker.raise_exc(axis, 'int', 'axis')

    self_is_series, other_is_series = isinstance(self, SeriesType), isinstance(other, SeriesType)
    if not (self_is_series or other_is_series):
        return None

    # comparisons additionally allow string scalars (unlike the arithmetic method)
    if not isinstance(self, (SeriesType, types.Number, types.UnicodeType)):
        ty_checker.raise_exc(self, 'pandas.series or scalar', 'self')

    if not isinstance(other, (SeriesType, types.Number, types.UnicodeType)):
        ty_checker.raise_exc(other, 'pandas.series or scalar', 'other')

    operands_are_series = self_is_series and other_is_series
    if operands_are_series:
        # indexes are alignable if both are None/numeric or their types are comparable
        none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self))
                                   and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other)))
        series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes
        if not series_indexes_comparable:
            raise TypingError('{} Not implemented for series with not-comparable indexes. \
            Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index))

    series_data_comparable = check_types_comparable(self, other)
    if not series_data_comparable:
        raise TypingError('{} Not supported for not-comparable operands. \
        Given: self={}, other={}'.format(_func_name, self, other))

    # compile-time flag: was fill_value omitted / None?
    fill_value_is_none = isinstance(fill_value, (types.NoneType, types.Omitted)) or fill_value is None
    if not operands_are_series:
        # one operand is a scalar: fill NaNs in the series data, then compare element-wise
        def _series_comp_binop_scalar_impl(self, other, level=None, fill_value=None, axis=0):
            if self_is_series == True:  # noqa
                numpy_like.fillna(self._data, inplace=True, value=fill_value)
                return pandas.Series(self._data < other, index=self._index, name=self._name)
            else:
                numpy_like.fillna(other._data, inplace=True, value=fill_value)
                return pandas.Series(self < other._data, index=other._index, name=other._name)

        return _series_comp_binop_scalar_impl

    else:
        # optimization for series with default indexes, that can be aligned differently
        if (isinstance(self.index, types.NoneType) and isinstance(other.index, types.NoneType)):
            def _series_comp_binop_none_indexes_impl(self, other, level=None, fill_value=None, axis=0):
                numpy_like.fillna(self._data, inplace=True, value=fill_value)
                numpy_like.fillna(other._data, inplace=True, value=fill_value)
                left_size, right_size = len(self._data), len(other._data)
                if (left_size == right_size):
                    return pandas.Series(self._data < other._data)
                else:
                    raise ValueError("Can only compare identically-labeled Series objects")

            return _series_comp_binop_none_indexes_impl
        else:
            left_index_is_range = isinstance(self.index, (RangeIndexType, types.NoneType))
            # FIX: guard .dtype access - types.NoneType has no .dtype attribute, so
            # a default (None) index would fail at typing time; substitute the default
            # RangeIndex dtype instead, mirroring sdc_pandas_series_binop
            self_index_dtype = RangeIndexType.dtype if isinstance(self.index, types.NoneType) else self.index.dtype
            other_index_dtype = RangeIndexType.dtype if isinstance(other.index, types.NoneType) else other.index.dtype
            index_dtypes_match = self_index_dtype == other_index_dtype
            if not index_dtypes_match:
                numba_index_common_dtype = find_common_dtype_from_numpy_dtypes(
                    [self_index_dtype, other_index_dtype], [])
            else:
                numba_index_common_dtype = self_index_dtype

            def _series_comp_binop_common_impl(self, other, level=None, fill_value=None, axis=0):
                numpy_like.fillna(self._data, inplace=True, value=fill_value)
                numpy_like.fillna(other._data, inplace=True, value=fill_value)
                left_index, right_index = self.index, other.index
                if (left_index is right_index or numpy_like.array_equal(left_index, right_index)):
                    if index_dtypes_match == False:  # noqa
                        new_index = numpy_like.astype(left_index, numba_index_common_dtype)
                    else:
                        new_index = left_index.values if left_index_is_range == True else left_index  # noqa
                    return pandas.Series(self._data < other._data, new_index)
                else:
                    # comparison of differently-labeled series is not supported (as in pandas)
                    raise ValueError("Can only compare identically-labeled Series objects")

            return _series_comp_binop_common_impl

    return None
def sdc_join_series_indexes_overload(left, right):
    """Function for joining arrays left and right in a way similar to pandas.join 'outer' algorithm"""

    # check that both operands are of types used for representing Pandas indexes
    if not (isinstance(left, sdc_pandas_index_types) and isinstance(right, sdc_pandas_index_types)
            and not isinstance(left, types.NoneType)
            and not isinstance(right, types.NoneType)):
        return None

    # index-typed operands need unboxing to plain arrays via .values
    convert_left = isinstance(left, (RangeIndexType, Int64IndexType))
    convert_right = isinstance(right, (RangeIndexType, Int64IndexType))

    def _convert_to_arrays_impl(left, right):
        _left = left.values if convert_left == True else left  # noqa
        _right = right.values if convert_right == True else right  # noqa
        return sdc_join_series_indexes(_left, _right)

    if isinstance(left, RangeIndexType) and isinstance(right, RangeIndexType):
        # fast path: equal range indexes join to themselves with identity indexers
        def sdc_join_range_indexes_impl(left, right):
            if (left is right or numpy_like.array_equal(left, right)):
                joined = left.values
                lidx = numpy.arange(len(joined))
                ridx = lidx
                return joined, lidx, ridx
            else:
                return sdc_join_series_indexes(left.values, right.values)

        return sdc_join_range_indexes_impl

    elif (isinstance(left, (RangeIndexType, Int64IndexType, types.Array))
          and isinstance(right, (RangeIndexType, Int64IndexType, types.Array))
          and not (isinstance(left, types.Array) and isinstance(right, types.Array))):
        # mixed index/array operands: unbox both to arrays and recurse
        return _convert_to_arrays_impl

    # TODO: remove code duplication below and merge numeric and StringArray impls into one
    # needs equivalents of numpy.arsort and _hpat_ensure_array_capacity for StringArrays
    elif isinstance(left, types.Array) and isinstance(right, types.Array):
        numba_common_dtype = find_common_dtype_from_numpy_dtypes([left.dtype, right.dtype], [])
        if isinstance(numba_common_dtype, types.Number):

            def sdc_join_series_indexes_impl(left, right):

                # allocate result arrays
                lsize = len(left)
                rsize = len(right)
                # 1.1x is an estimate only; _hpat_ensure_array_capacity grows on demand
                est_total_size = int(1.1 * (lsize + rsize))

                lidx = numpy.empty(est_total_size, numpy.int64)
                ridx = numpy.empty(est_total_size, numpy.int64)
                joined = numpy.empty(est_total_size, numba_common_dtype)

                # collect NaN positions so NaNs can be forced to sort last on each side
                left_nan = []
                right_nan = []
                for i in range(lsize):
                    if numpy.isnan(left[i]):
                        left_nan.append(i)
                for i in range(rsize):
                    if numpy.isnan(right[i]):
                        right_nan.append(i)

                # sort arrays saving the old positions
                sorted_left = numpy_like.argsort(left, kind='mergesort')
                sorted_right = numpy_like.argsort(right, kind='mergesort')
                # put the position of the nans in an increasing sequence
                sorted_left[lsize - len(left_nan):] = left_nan
                sorted_right[rsize - len(right_nan):] = right_nan

                # merge the two sorted sides; -1 in lidx/ridx marks "absent on that side"
                i, j, k = 0, 0, 0
                while (i < lsize and j < rsize):
                    joined = _hpat_ensure_array_capacity(k + 1, joined)
                    lidx = _hpat_ensure_array_capacity(k + 1, lidx)
                    ridx = _hpat_ensure_array_capacity(k + 1, ridx)

                    left_index = left[sorted_left[i]]
                    right_index = right[sorted_right[j]]

                    if (left_index < right_index) or numpy.isnan(right_index):
                        joined[k] = left_index
                        lidx[k] = sorted_left[i]
                        ridx[k] = -1
                        i += 1
                        k += 1
                    elif (left_index > right_index) or numpy.isnan(left_index):
                        joined[k] = right_index
                        lidx[k] = -1
                        ridx[k] = sorted_right[j]
                        j += 1
                        k += 1
                    else:
                        # find ends of sequences of equal index values in left and right
                        ni, nj = i, j
                        while (ni < lsize and left[sorted_left[ni]] == left_index):
                            ni += 1
                        while (nj < rsize and right[sorted_right[nj]] == right_index):
                            nj += 1

                        # join the blocks found into results (cross-product of duplicates)
                        for s in numpy.arange(i, ni, 1):
                            block_size = nj - j
                            to_joined = numpy.repeat(left_index, block_size)
                            to_lidx = numpy.repeat(sorted_left[s], block_size)
                            to_ridx = numpy.array([sorted_right[k] for k in numpy.arange(j, nj, 1)], numpy.int64)

                            joined = _hpat_ensure_array_capacity(k + block_size, joined)
                            lidx = _hpat_ensure_array_capacity(k + block_size, lidx)
                            ridx = _hpat_ensure_array_capacity(k + block_size, ridx)

                            joined[k:k + block_size] = to_joined
                            lidx[k:k + block_size] = to_lidx
                            ridx[k:k + block_size] = to_ridx

                            k += block_size
                        i = ni
                        j = nj

                # fill the end of joined with remaining part of left or right
                if i < lsize:
                    block_size = lsize - i
                    joined = _hpat_ensure_array_capacity(k + block_size, joined)
                    lidx = _hpat_ensure_array_capacity(k + block_size, lidx)
                    ridx = _hpat_ensure_array_capacity(k + block_size, ridx)
                    ridx[k:k + block_size] = numpy.repeat(-1, block_size)
                    while i < lsize:
                        joined[k] = left[sorted_left[i]]
                        lidx[k] = sorted_left[i]
                        i += 1
                        k += 1

                elif j < rsize:
                    block_size = rsize - j
                    joined = _hpat_ensure_array_capacity(k + block_size, joined)
                    lidx = _hpat_ensure_array_capacity(k + block_size, lidx)
                    ridx = _hpat_ensure_array_capacity(k + block_size, ridx)
                    lidx[k:k + block_size] = numpy.repeat(-1, block_size)
                    while j < rsize:
                        joined[k] = right[sorted_right[j]]
                        ridx[k] = sorted_right[j]
                        j += 1
                        k += 1

                # k is the true result length; capacity may be larger
                return joined[:k], lidx[:k], ridx[:k]

            return sdc_join_series_indexes_impl
        else:
            return None

    elif (left == string_array_type and right == string_array_type):
        def sdc_join_series_indexes_impl(left, right):

            # allocate result arrays
            lsize = len(left)
            rsize = len(right)
            est_total_size = int(1.1 * (lsize + rsize))

            lidx = numpy.empty(est_total_size, numpy.int64)
            ridx = numpy.empty(est_total_size, numpy.int64)

            # use Series.sort_values since argsort for StringArrays not implemented
            original_left_series = pandas.Series(left)
            original_right_series = pandas.Series(right)

            # sort arrays saving the old positions
            left_series = original_left_series.sort_values(kind='mergesort')
            right_series = original_right_series.sort_values(kind='mergesort')
            sorted_left = left_series._index
            sorted_right = right_series._index

            i, j, k = 0, 0, 0
            while (i < lsize and j < rsize):
                lidx = _hpat_ensure_array_capacity(k + 1, lidx)
                ridx = _hpat_ensure_array_capacity(k + 1, ridx)

                left_index = left[sorted_left[i]]
                right_index = right[sorted_right[j]]

                if (left_index < right_index):
                    lidx[k] = sorted_left[i]
                    ridx[k] = -1
                    i += 1
                    k += 1
                elif (left_index > right_index):
                    lidx[k] = -1
                    ridx[k] = sorted_right[j]
                    j += 1
                    k += 1
                else:
                    # find ends of sequences of equal index values in left and right
                    ni, nj = i, j
                    while (ni < lsize and left[sorted_left[ni]] == left_index):
                        ni += 1
                    while (nj < rsize and right[sorted_right[nj]] == right_index):
                        nj += 1

                    # join the blocks found into results (cross-product of duplicates)
                    for s in numpy.arange(i, ni, 1):
                        block_size = nj - j
                        to_lidx = numpy.repeat(sorted_left[s], block_size)
                        to_ridx = numpy.array([sorted_right[k] for k in numpy.arange(j, nj, 1)], numpy.int64)

                        lidx = _hpat_ensure_array_capacity(k + block_size, lidx)
                        ridx = _hpat_ensure_array_capacity(k + block_size, ridx)

                        lidx[k:k + block_size] = to_lidx
                        ridx[k:k + block_size] = to_ridx

                        k += block_size
                    i = ni
                    j = nj

            # fill the end of joined with remaining part of left or right
            if i < lsize:
                block_size = lsize - i
                lidx = _hpat_ensure_array_capacity(k + block_size, lidx)
                ridx = _hpat_ensure_array_capacity(k + block_size, ridx)
                ridx[k:k + block_size] = numpy.repeat(-1, block_size)
                while i < lsize:
                    lidx[k] = sorted_left[i]
                    i += 1
                    k += 1

            elif j < rsize:
                block_size = rsize - j
                lidx = _hpat_ensure_array_capacity(k + block_size, lidx)
                ridx = _hpat_ensure_array_capacity(k + block_size, ridx)
                lidx[k:k + block_size] = numpy.repeat(-1, block_size)
                while j < rsize:
                    ridx[k] = sorted_right[j]
                    j += 1
                    k += 1

            # count total number of characters and allocate joined array
            total_joined_size = k
            num_chars_in_joined = 0
            for i in numpy.arange(total_joined_size):
                if lidx[i] != -1:
                    num_chars_in_joined += len(left[lidx[i]])
                elif ridx[i] != -1:
                    num_chars_in_joined += len(right[ridx[i]])

            joined = pre_alloc_string_array(total_joined_size, num_chars_in_joined)

            # iterate over joined and fill it with indexes using lidx and ridx indexers
            for i in numpy.arange(total_joined_size):
                if lidx[i] != -1:
                    joined[i] = left[lidx[i]]
                    if (str_arr_is_na(left, lidx[i])):
                        str_arr_set_na(joined, i)
                elif ridx[i] != -1:
                    joined[i] = right[ridx[i]]
                    if (str_arr_is_na(right, ridx[i])):
                        str_arr_set_na(joined, i)
                else:
                    str_arr_set_na(joined, i)

            # NOTE(review): unlike the numeric impl above, lidx/ridx are returned at full
            # capacity rather than sliced to k - verify callers only read the first k entries
            return joined, lidx, ridx

        return sdc_join_series_indexes_impl

    return None
def hpat_arrays_append_overload(A, B):
    """Function for appending underlying arrays (A and B) or list/tuple of arrays B to an array A"""
    # NOTE(review): this appears to be a second definition of hpat_arrays_append_overload in
    # the same file (an earlier revision without RangeIndexType/Int64IndexType support) - the
    # later definition shadows this one at import time; confirm whether it should be removed

    if isinstance(A, types.Array):
        if isinstance(B, types.Array):
            # single array appended to array
            def _append_single_numeric_impl(A, B):
                return numpy.concatenate((A, B,))

            return _append_single_numeric_impl
        elif isinstance(B, (types.UniTuple, types.List)):
            # TODO: this heavily relies on B being a homogeneous tuple/list - find a better way
            # to resolve common dtype of heterogeneous sequence of arrays
            numba_common_dtype = find_common_dtype_from_numpy_dtypes([A.dtype, B.dtype.dtype], [])

            # TODO: refactor to use numpy.concatenate when Numba supports building a tuple at runtime
            def _append_list_numeric_impl(A, B):
                # preallocate the result at the total size, then copy chunks in order
                total_length = len(A) + numpy.array([len(arr) for arr in B]).sum()
                new_data = numpy.empty(total_length, numba_common_dtype)

                stop = len(A)
                new_data[:stop] = A
                for arr in B:
                    start = stop
                    stop = start + len(arr)
                    new_data[start:stop] = arr
                return new_data

            return _append_list_numeric_impl

    elif A == string_array_type:
        if B == string_array_type:
            def _append_single_string_array_impl(A, B):
                # pre-allocate by total element count and total character count
                total_size = len(A) + len(B)
                total_chars = num_total_chars(A) + num_total_chars(B)
                new_data = sdc.str_arr_ext.pre_alloc_string_array(total_size, total_chars)

                pos = 0
                pos += append_string_array_to(new_data, pos, A)
                pos += append_string_array_to(new_data, pos, B)
                return new_data

            return _append_single_string_array_impl
        elif (isinstance(B, (types.UniTuple, types.List)) and B.dtype == string_array_type):
            def _append_list_string_array_impl(A, B):
                array_list = [A] + list(B)
                total_size = numpy.array([len(arr) for arr in array_list]).sum()
                total_chars = numpy.array([num_total_chars(arr) for arr in array_list]).sum()

                new_data = sdc.str_arr_ext.pre_alloc_string_array(total_size, total_chars)

                pos = 0
                pos += append_string_array_to(new_data, pos, A)
                for arr in B:
                    pos += append_string_array_to(new_data, pos, arr)

                return new_data

            return _append_list_string_array_impl
def sdc_join_series_indexes_overload(left, right):
    """Function for joining arrays left and right in a way similar to pandas.join 'outer' algorithm

    Returns a jit-compilable implementation producing a tuple (joined, lidx, ridx):
    joined holds the merged index values, while lidx/ridx map every joined position
    to a position in left/right respectively, with -1 marking a missing value on
    that side. Returns None for unsupported argument types.
    """
    # TODO: eliminate code duplication by merging implementations for numeric and StringArray
    # requires equivalents of numpy.arsort and _hpat_ensure_array_capacity for StringArrays
    if (isinstance(left, types.Array) and isinstance(right, types.Array)):
        numba_common_dtype = find_common_dtype_from_numpy_dtypes([left.dtype, right.dtype], [])
        if isinstance(numba_common_dtype, types.Number):
            def sdc_join_series_indexes_impl(left, right):
                # allocate result arrays
                # estimated capacity; _hpat_ensure_array_capacity grows these on demand
                lsize = len(left)
                rsize = len(right)
                est_total_size = int(1.1 * (lsize + rsize))
                lidx = numpy.empty(est_total_size, numpy.int64)
                ridx = numpy.empty(est_total_size, numpy.int64)
                joined = numpy.empty(est_total_size, numba_common_dtype)

                # sort arrays saving the old positions
                # (stable mergesort so equal values keep their original relative order)
                sorted_left = numpy.argsort(left, kind='mergesort')
                sorted_right = numpy.argsort(right, kind='mergesort')

                # classic two-pointer merge: i/j walk the sorted views, k writes results
                i, j, k = 0, 0, 0
                while (i < lsize and j < rsize):
                    joined = _hpat_ensure_array_capacity(k + 1, joined)
                    lidx = _hpat_ensure_array_capacity(k + 1, lidx)
                    ridx = _hpat_ensure_array_capacity(k + 1, ridx)

                    left_index = left[sorted_left[i]]
                    right_index = right[sorted_right[j]]

                    if (left_index < right_index):
                        # value present only in left
                        joined[k] = left_index
                        lidx[k] = sorted_left[i]
                        ridx[k] = -1
                        i += 1
                        k += 1
                    elif (left_index > right_index):
                        # value present only in right
                        joined[k] = right_index
                        lidx[k] = -1
                        ridx[k] = sorted_right[j]
                        j += 1
                        k += 1
                    else:
                        # find ends of sequences of equal index values in left and right
                        ni, nj = i, j
                        while (ni < lsize and left[sorted_left[ni]] == left_index):
                            ni += 1
                        while (nj < rsize and right[sorted_right[nj]] == right_index):
                            nj += 1

                        # join the blocks found into results
                        # (cartesian product of the equal runs, as pandas outer join does)
                        for s in numpy.arange(i, ni, 1):
                            block_size = nj - j
                            to_joined = numpy.repeat(left_index, block_size)
                            to_lidx = numpy.repeat(sorted_left[s], block_size)
                            # NOTE(review): the comprehension variable k shadows the outer
                            # counter but is scoped to the comprehension and does not leak
                            to_ridx = numpy.array([sorted_right[k] for k in numpy.arange(j, nj, 1)], numpy.int64)

                            joined = _hpat_ensure_array_capacity(k + block_size, joined)
                            lidx = _hpat_ensure_array_capacity(k + block_size, lidx)
                            ridx = _hpat_ensure_array_capacity(k + block_size, ridx)

                            joined[k:k + block_size] = to_joined
                            lidx[k:k + block_size] = to_lidx
                            ridx[k:k + block_size] = to_ridx
                            k += block_size
                        i = ni
                        j = nj

                # fill the end of joined with remaining part of left or right
                if i < lsize:
                    block_size = lsize - i
                    joined = _hpat_ensure_array_capacity(k + block_size, joined)
                    lidx = _hpat_ensure_array_capacity(k + block_size, lidx)
                    ridx = _hpat_ensure_array_capacity(k + block_size, ridx)
                    ridx[k:k + block_size] = numpy.repeat(-1, block_size)
                    while i < lsize:
                        joined[k] = left[sorted_left[i]]
                        lidx[k] = sorted_left[i]
                        i += 1
                        k += 1
                elif j < rsize:
                    block_size = rsize - j
                    joined = _hpat_ensure_array_capacity(k + block_size, joined)
                    lidx = _hpat_ensure_array_capacity(k + block_size, lidx)
                    ridx = _hpat_ensure_array_capacity(k + block_size, ridx)
                    lidx[k:k + block_size] = numpy.repeat(-1, block_size)
                    while j < rsize:
                        joined[k] = right[sorted_right[j]]
                        ridx[k] = sorted_right[j]
                        j += 1
                        k += 1

                # trim to the number of entries actually written
                return joined[:k], lidx[:k], ridx[:k]

            return sdc_join_series_indexes_impl
        else:
            # TODO: support joining indexes with common dtype=object - requires Numba
            # support of such numpy arrays in nopython mode, for now just return None
            return None

    elif (left == string_array_type and right == string_array_type):
        def sdc_join_series_indexes_impl(left, right):
            # allocate result arrays
            lsize = len(left)
            rsize = len(right)
            est_total_size = int(1.1 * (lsize + rsize))
            lidx = numpy.empty(est_total_size, numpy.int64)
            ridx = numpy.empty(est_total_size, numpy.int64)

            # use Series.sort_values since argsort for StringArrays not implemented
            original_left_series = pandas.Series(left)
            original_right_series = pandas.Series(right)

            # sort arrays saving the old positions
            left_series = original_left_series.sort_values(kind='mergesort')
            right_series = original_right_series.sort_values(kind='mergesort')
            # after sort_values the series index holds the original positions
            sorted_left = left_series._index
            sorted_right = right_series._index

            # two-pointer merge over the sorted views; only lidx/ridx are built here,
            # the joined string array itself is allocated and filled afterwards
            i, j, k = 0, 0, 0
            while (i < lsize and j < rsize):
                lidx = _hpat_ensure_array_capacity(k + 1, lidx)
                ridx = _hpat_ensure_array_capacity(k + 1, ridx)

                left_index = left[sorted_left[i]]
                right_index = right[sorted_right[j]]

                if (left_index < right_index):
                    lidx[k] = sorted_left[i]
                    ridx[k] = -1
                    i += 1
                    k += 1
                elif (left_index > right_index):
                    lidx[k] = -1
                    ridx[k] = sorted_right[j]
                    j += 1
                    k += 1
                else:
                    # find ends of sequences of equal index values in left and right
                    ni, nj = i, j
                    while (ni < lsize and left[sorted_left[ni]] == left_index):
                        ni += 1
                    while (nj < rsize and right[sorted_right[nj]] == right_index):
                        nj += 1

                    # join the blocks found into results
                    for s in numpy.arange(i, ni, 1):
                        block_size = nj - j
                        to_lidx = numpy.repeat(sorted_left[s], block_size)
                        to_ridx = numpy.array([sorted_right[k] for k in numpy.arange(j, nj, 1)], numpy.int64)

                        lidx = _hpat_ensure_array_capacity(k + block_size, lidx)
                        ridx = _hpat_ensure_array_capacity(k + block_size, ridx)

                        lidx[k:k + block_size] = to_lidx
                        ridx[k:k + block_size] = to_ridx
                        k += block_size
                    i = ni
                    j = nj

            # fill the end of joined with remaining part of left or right
            if i < lsize:
                block_size = lsize - i
                lidx = _hpat_ensure_array_capacity(k + block_size, lidx)
                ridx = _hpat_ensure_array_capacity(k + block_size, ridx)
                ridx[k:k + block_size] = numpy.repeat(-1, block_size)
                while i < lsize:
                    lidx[k] = sorted_left[i]
                    i += 1
                    k += 1
            elif j < rsize:
                block_size = rsize - j
                lidx = _hpat_ensure_array_capacity(k + block_size, lidx)
                ridx = _hpat_ensure_array_capacity(k + block_size, ridx)
                lidx[k:k + block_size] = numpy.repeat(-1, block_size)
                while j < rsize:
                    ridx[k] = sorted_right[j]
                    j += 1
                    k += 1

            # count total number of characters and allocate joined array
            total_joined_size = k
            num_chars_in_joined = 0
            for i in numpy.arange(total_joined_size):
                if lidx[i] != -1:
                    num_chars_in_joined += len(left[lidx[i]])
                elif ridx[i] != -1:
                    num_chars_in_joined += len(right[ridx[i]])

            joined = pre_alloc_string_array(total_joined_size, num_chars_in_joined)

            # iterate over joined and fill it with indexes using lidx and ridx indexers
            # (NA flags are propagated explicitly via str_arr_is_na/str_arr_set_na)
            for i in numpy.arange(total_joined_size):
                if lidx[i] != -1:
                    joined[i] = left[lidx[i]]
                    if (str_arr_is_na(left, lidx[i])):
                        str_arr_set_na(joined, i)
                elif ridx[i] != -1:
                    joined[i] = right[ridx[i]]
                    if (str_arr_is_na(right, ridx[i])):
                        str_arr_set_na(joined, i)
                else:
                    str_arr_set_na(joined, i)

            return joined, lidx, ridx

        return sdc_join_series_indexes_impl

    return None
def hpat_arrays_append_overload(A, B):
    """Function for appending underlying arrays (A and B) or list/tuple of arrays B to an array A

    Dispatches on the numba types of A and B and returns a jit-compilable
    implementation; returns None (implicitly) for unsupported type combinations.
    """
    if not isinstance(A, sdc_pandas_df_column_types):
        return None

    # this function should work with arrays, not indexes, but until all indexes support
    # common API (e.g. append is not supported for types.Array indexes) it is simpler to support
    # indexes here rather than branch depending on index types on call site
    # TO-DO: clean-up when Float64Index and StringArrayIndex are supported
    # if not (isinstance(B, sdc_pandas_df_column_types) or isinstance(B.dtype, sdc_pandas_df_column_types)):
    #     return None

    # accepted B types: plain numpy arrays plus the supported index types
    # (the two tuples are currently identical; kept separate for clarity of intent)
    valid_num_single_B_dtype = (types.Array, ) + sdc_pandas_index_types
    valid_num_seq_B_dtypes = (types.Array, ) + sdc_pandas_index_types

    if isinstance(A, types.Array):
        if isinstance(B, valid_num_single_B_dtype):
            # index objects need .values to expose their underlying numpy array
            convert_B = not isinstance(B, types.Array)

            def _append_single_numeric_impl(A, B):
                # '== False' comparison kept deliberately - presumably needed for Numba
                # to prune the dead branch at compile time; verify before changing
                _B = B if convert_B == False else B.values  # noqa
                return numpy.concatenate((A, _B,))

            return _append_single_numeric_impl

        elif (isinstance(B, (types.UniTuple, types.List)) and isinstance(B.dtype, valid_num_seq_B_dtypes)):
            # TODO: this relies on B being homogeneous; common dtype resolved at compile time
            numba_common_dtype = find_common_dtype_from_numpy_dtypes([A.dtype, B.dtype.dtype], [])
            convert_B = not isinstance(B.dtype, types.Array)

            # TODO: refactor to use numpy.concatenate when Numba supports building a tuple at runtime
            def _append_list_numeric_impl(A, B):
                total_length = len(A) + numpy.array([len(arr) for arr in B]).sum()
                new_data = numpy.empty(total_length, numba_common_dtype)

                # copy A first, then append each array from B at the running offset
                stop = len(A)
                new_data[:stop] = A
                for arr in B:
                    start = stop
                    stop = start + len(arr)
                    if convert_B == False:  # noqa
                        new_data[start:stop] = arr
                    else:
                        new_data[start:stop] = arr.values
                return new_data

            return _append_list_numeric_impl

    elif A == string_array_type:
        if B == string_array_type:
            def _append_single_string_array_impl(A, B):
                # string arrays are pre-allocated from both element and char counts
                total_size = len(A) + len(B)
                total_chars = num_total_chars(A) + num_total_chars(B)
                new_data = sdc.str_arr_ext.pre_alloc_string_array(total_size, total_chars)

                pos = 0
                pos += append_string_array_to(new_data, pos, A)
                pos += append_string_array_to(new_data, pos, B)
                return new_data

            return _append_single_string_array_impl

        elif (isinstance(B, (types.UniTuple, types.List)) and B.dtype == string_array_type):
            def _append_list_string_array_impl(A, B):
                array_list = [A] + list(B)
                total_size = numpy.array([len(arr) for arr in array_list]).sum()
                total_chars = numpy.array([num_total_chars(arr) for arr in array_list]).sum()
                new_data = sdc.str_arr_ext.pre_alloc_string_array(total_size, total_chars)

                pos = 0
                pos += append_string_array_to(new_data, pos, A)
                for arr in B:
                    pos += append_string_array_to(new_data, pos, arr)
                return new_data

            return _append_list_string_array_impl
def _sdc_internal_join_ovld(left, right):
    """Outer-join two index arrays, returning (joined, lidx, ridx).

    Like sdc_join_series_indexes_overload, but the numeric branch additionally
    handles NaN index values by forcing them to the end of the sort order.
    Returns None for unsupported argument types.
    """
    if isinstance(left, types.Array) and isinstance(right, types.Array):
        numba_common_dtype = find_common_dtype_from_numpy_dtypes([left.dtype, right.dtype], [])

        if isinstance(numba_common_dtype, types.Number):
            def sdc_join_series_indexes_impl(left, right):
                # allocate result arrays
                # estimated capacity; grown on demand by _hpat_ensure_array_capacity
                lsize = len(left)
                rsize = len(right)
                est_total_size = int(1.1 * (lsize + rsize))
                lidx = numpy.empty(est_total_size, numpy.int64)
                ridx = numpy.empty(est_total_size, numpy.int64)
                joined = numpy.empty(est_total_size, numba_common_dtype)

                # collect original positions of NaN values on each side
                left_nan = []
                right_nan = []
                for i in range(lsize):
                    if numpy.isnan(left[i]):
                        left_nan.append(i)
                for i in range(rsize):
                    if numpy.isnan(right[i]):
                        right_nan.append(i)

                # sort arrays saving the old positions
                sorted_left = numpy_like.argsort(left, kind='mergesort')
                sorted_right = numpy_like.argsort(right, kind='mergesort')
                # put the position of the nans in an increasing sequence
                # (argsort places NaNs last; overwrite that tail so NaN positions
                # appear in ascending original order)
                sorted_left[lsize-len(left_nan):] = left_nan
                sorted_right[rsize-len(right_nan):] = right_nan

                # two-pointer merge: i/j walk the sorted views, k writes results
                i, j, k = 0, 0, 0
                while (i < lsize and j < rsize):
                    joined = _hpat_ensure_array_capacity(k + 1, joined)
                    lidx = _hpat_ensure_array_capacity(k + 1, lidx)
                    ridx = _hpat_ensure_array_capacity(k + 1, ridx)

                    left_index = left[sorted_left[i]]
                    right_index = right[sorted_right[j]]

                    # NaN comparisons are always False, so the isnan checks make the
                    # non-NaN side win and NaN entries never pair with anything
                    if (left_index < right_index) or numpy.isnan(right_index):
                        joined[k] = left_index
                        lidx[k] = sorted_left[i]
                        ridx[k] = -1
                        i += 1
                        k += 1
                    elif (left_index > right_index) or numpy.isnan(left_index):
                        joined[k] = right_index
                        lidx[k] = -1
                        ridx[k] = sorted_right[j]
                        j += 1
                        k += 1
                    else:
                        # find ends of sequences of equal index values in left and right
                        ni, nj = i, j
                        while (ni < lsize and left[sorted_left[ni]] == left_index):
                            ni += 1
                        while (nj < rsize and right[sorted_right[nj]] == right_index):
                            nj += 1

                        # join the blocks found into results
                        # (cartesian product of the equal runs, as pandas outer join does)
                        for s in numpy.arange(i, ni, 1):
                            block_size = nj - j
                            to_joined = numpy.repeat(left_index, block_size)
                            to_lidx = numpy.repeat(sorted_left[s], block_size)
                            # NOTE(review): the comprehension variable k shadows the outer
                            # counter but is scoped to the comprehension and does not leak
                            to_ridx = numpy.array([sorted_right[k] for k in numpy.arange(j, nj, 1)], numpy.int64)

                            joined = _hpat_ensure_array_capacity(k + block_size, joined)
                            lidx = _hpat_ensure_array_capacity(k + block_size, lidx)
                            ridx = _hpat_ensure_array_capacity(k + block_size, ridx)

                            joined[k:k + block_size] = to_joined
                            lidx[k:k + block_size] = to_lidx
                            ridx[k:k + block_size] = to_ridx
                            k += block_size
                        i = ni
                        j = nj

                # fill the end of joined with remaining part of left or right
                if i < lsize:
                    block_size = lsize - i
                    joined = _hpat_ensure_array_capacity(k + block_size, joined)
                    lidx = _hpat_ensure_array_capacity(k + block_size, lidx)
                    ridx = _hpat_ensure_array_capacity(k + block_size, ridx)
                    ridx[k: k + block_size] = numpy.repeat(-1, block_size)
                    while i < lsize:
                        joined[k] = left[sorted_left[i]]
                        lidx[k] = sorted_left[i]
                        i += 1
                        k += 1
                elif j < rsize:
                    block_size = rsize - j
                    joined = _hpat_ensure_array_capacity(k + block_size, joined)
                    lidx = _hpat_ensure_array_capacity(k + block_size, lidx)
                    ridx = _hpat_ensure_array_capacity(k + block_size, ridx)
                    lidx[k: k + block_size] = numpy.repeat(-1, block_size)
                    while j < rsize:
                        joined[k] = right[sorted_right[j]]
                        ridx[k] = sorted_right[j]
                        j += 1
                        k += 1

                # trim to the number of entries actually written
                return joined[:k], lidx[:k], ridx[:k]

            return sdc_join_series_indexes_impl
        else:
            # non-numeric common dtype not supported
            return None

    elif (left == string_array_type and right == string_array_type):
        def sdc_join_series_indexes_impl(left, right):
            # allocate result arrays
            lsize = len(left)
            rsize = len(right)
            est_total_size = int(1.1 * (lsize + rsize))
            lidx = numpy.empty(est_total_size, numpy.int64)
            ridx = numpy.empty(est_total_size, numpy.int64)

            # use Series.sort_values since argsort for StringArrays not implemented
            original_left_series = pandas.Series(left)
            original_right_series = pandas.Series(right)

            # sort arrays saving the old positions
            left_series = original_left_series.sort_values(kind='mergesort')
            right_series = original_right_series.sort_values(kind='mergesort')
            # after sort_values the series index holds the original positions
            sorted_left = left_series._index
            sorted_right = right_series._index

            # two-pointer merge; only lidx/ridx are built here, the joined string
            # array itself is allocated and filled afterwards
            i, j, k = 0, 0, 0
            while (i < lsize and j < rsize):
                lidx = _hpat_ensure_array_capacity(k + 1, lidx)
                ridx = _hpat_ensure_array_capacity(k + 1, ridx)

                left_index = left[sorted_left[i]]
                right_index = right[sorted_right[j]]

                if (left_index < right_index):
                    lidx[k] = sorted_left[i]
                    ridx[k] = -1
                    i += 1
                    k += 1
                elif (left_index > right_index):
                    lidx[k] = -1
                    ridx[k] = sorted_right[j]
                    j += 1
                    k += 1
                else:
                    # find ends of sequences of equal index values in left and right
                    ni, nj = i, j
                    while (ni < lsize and left[sorted_left[ni]] == left_index):
                        ni += 1
                    while (nj < rsize and right[sorted_right[nj]] == right_index):
                        nj += 1

                    # join the blocks found into results
                    for s in numpy.arange(i, ni, 1):
                        block_size = nj - j
                        to_lidx = numpy.repeat(sorted_left[s], block_size)
                        to_ridx = numpy.array([sorted_right[k] for k in numpy.arange(j, nj, 1)], numpy.int64)

                        lidx = _hpat_ensure_array_capacity(k + block_size, lidx)
                        ridx = _hpat_ensure_array_capacity(k + block_size, ridx)

                        lidx[k:k + block_size] = to_lidx
                        ridx[k:k + block_size] = to_ridx
                        k += block_size
                    i = ni
                    j = nj

            # fill the end of joined with remaining part of left or right
            if i < lsize:
                block_size = lsize - i
                lidx = _hpat_ensure_array_capacity(k + block_size, lidx)
                ridx = _hpat_ensure_array_capacity(k + block_size, ridx)
                ridx[k: k + block_size] = numpy.repeat(-1, block_size)
                while i < lsize:
                    lidx[k] = sorted_left[i]
                    i += 1
                    k += 1
            elif j < rsize:
                block_size = rsize - j
                lidx = _hpat_ensure_array_capacity(k + block_size, lidx)
                ridx = _hpat_ensure_array_capacity(k + block_size, ridx)
                lidx[k: k + block_size] = numpy.repeat(-1, block_size)
                while j < rsize:
                    ridx[k] = sorted_right[j]
                    j += 1
                    k += 1

            # count total number of characters and allocate joined array
            total_joined_size = k
            num_chars_in_joined = 0
            for i in numpy.arange(total_joined_size):
                if lidx[i] != -1:
                    num_chars_in_joined += len(left[lidx[i]])
                elif ridx[i] != -1:
                    num_chars_in_joined += len(right[ridx[i]])

            joined = pre_alloc_string_array(total_joined_size, num_chars_in_joined)

            # iterate over joined and fill it with indexes using lidx and ridx indexers
            # (NA flags are propagated explicitly via str_arr_is_na/str_arr_set_na)
            for i in numpy.arange(total_joined_size):
                if lidx[i] != -1:
                    joined[i] = left[lidx[i]]
                    if (str_arr_is_na(left, lidx[i])):
                        str_arr_set_na(joined, i)
                elif ridx[i] != -1:
                    joined[i] = right[ridx[i]]
                    if (str_arr_is_na(right, ridx[i])):
                        str_arr_set_na(joined, i)
                else:
                    str_arr_set_na(joined, i)

            return joined, lidx, ridx

        return sdc_join_series_indexes_impl

    return None