def _aggregate(self, operation):
        """ Aggregate the grouped data and wrap the outcome in a DataFrame

        Parameters
        ----------
        operation
            the aggregation to apply; forwarded unchanged to
            weld_groupby_aggregate

        Returns
        -------
        DataFrame
            one Series per entry of self.columns, indexed by the group-by
            columns: an Index when grouping by a single column,
            a MultiIndex otherwise

        """
        by_weld_types = [str(numpy_to_weld_type(by_type))
                         for by_type in self.by_types]
        columns_weld_types = [str(numpy_to_weld_type(column_type))
                              for column_type in self.columns_types]

        aggregated_data = weld_groupby_aggregate(self.expr, by_weld_types,
                                                 columns_weld_types, operation)

        if len(self.by) != 1:
            # several group-by columns: one array of labels per column
            arrays = []
            for position, _ in enumerate(self.by):
                by_column = weld_get_column(aggregated_data, position, True)
                by_weld_type = numpy_to_weld_type(self.by_types[position])
                arrays.append(LazyResult(by_column, by_weld_type, 1))

            new_index = MultiIndex.from_arrays(arrays, self.by)
        else:
            # a single group-by column becomes a plain Index named after it
            new_index = Index(weld_get_column(aggregated_data, 0, True),
                              self.by_types[0], self.by[0])

        new_data = OrderedDict()
        for position, column_name in enumerate(self.columns):
            aggregated_column = weld_get_column(aggregated_data, position, False)
            new_data[column_name] = Series(aggregated_column,
                                           self.columns_types[position],
                                           new_index, column_name)

        return DataFrame(new_data, new_index)
    def __getitem__(self, item):
        """ Retrieve a portion of the Index

        Parameters
        ----------
        item : slice or LazyResult
            if slice, returns a sliced Index;
            if LazyResult, returns a filtered Index only with the labels corresponding to
            True in the Series

        Returns
        -------
        Index
            carries the same name as this Index

        Raises
        ------
        ValueError
            if item is a LazyResult whose weld type is not bool
        TypeError
            if item is neither a slice nor a LazyResult

        """
        if isinstance(item, slice):
            item = replace_slice_defaults(item)

            # update_rows is applied to this Index itself; the returned Index
            # is built from this Index's own expression
            self.update_rows(item)

            # pass the name along so the result does not fall back to the default name
            return Index(self.expr, self.dtype, self.name)
        elif isinstance(item, LazyResult):
            if str(item.weld_type) != str(numpy_to_weld_type('bool')):
                raise ValueError(
                    'expected LazyResult of bool to filter Index elements')

            return Index(weld_filter(self.expr, item.expr), self.dtype,
                         self.name)
        else:
            raise TypeError(
                'expected slice or LazyResult of bool in Index.__getitem__')
Beispiel #3
0
def get_weld_type(data):
    """ Return the weld type describing the given data

    Parameters
    ----------
    data : LazyResult or np.ndarray
        if LazyResult, its weld_type attribute is returned;
        if np.ndarray, the weld type is derived from its dtype

    Returns
    -------
    the weld type of the data

    Raises
    ------
    TypeError
        if data is neither a LazyResult nor a np.ndarray

    """
    if isinstance(data, LazyResult):
        return data.weld_type

    if isinstance(data, np.ndarray):
        return numpy_to_weld_type(data.dtype)

    raise TypeError('expected LazyResult or np.ndarray')
def read_csv(path):
    """ Read a csv file as a DataFrame

    Each column of the file becomes a LazyResult built around a placeholder
    weld object; the index is a range from 0 to the length of one of
    the columns.

    Parameters
    ----------
    path : str
        path of the file

    Returns
    -------
    DataFrame

    Raises
    ------
    ValueError
        if the file has no columns, since the index length is derived from a column

    """
    table = csv_weld.Table(path)

    new_columns = {}
    for column_name in table.columns:
        column = table.columns[column_name]
        # the second returned value (the placeholder input name) is not needed here
        weld_obj, _ = LazyResult.generate_placeholder_weld_object(
            column.data_id, column.encoder, column.decoder)
        new_columns[column_name] = LazyResult(
            weld_obj, numpy_to_weld_type(column.dtype), 1)

    if not new_columns:
        raise ValueError('expected at least one column in the csv file')

    # any column works to determine the number of rows; avoid indexing
    # dict.keys(), which is only subscriptable on Python 2
    any_column = next(iter(new_columns.values()))
    index_weld_obj = weld_range(
        0, 'len({})'.format(any_column.expr.weld_code), 1)
    index_weld_obj.update(any_column.expr)

    return DataFrame(new_columns, Index(index_weld_obj, np.dtype(np.int64)))
    def __init__(self, data, dtype, name=None):
        """ Create an Index over raw or lazy data

        Parameters
        ----------
        data : np.ndarray or WeldObject
            the labels, either raw or as a weld expression
        dtype : np.dtype
            numpy type of the labels; converted to a weld type for the parent constructor
        name : str, optional
            name of the Index; defaults to 'Index'

        Raises
        ------
        TypeError
            if data is neither a np.ndarray nor a WeldObject

        """
        if not isinstance(data, (np.ndarray, WeldObject)):
            raise TypeError(
                'expected np.ndarray or WeldObject in Index.__init__')

        super(Index, self).__init__(data, numpy_to_weld_type(dtype), 1)

        self.dtype = dtype
        self.name = 'Index' if name is None else name
Beispiel #6
0
    def test_getitem_slice(self):
        """ Slicing a Series with [:2] must match a Series holding the first two values """
        int64 = np.dtype(np.int64)

        # wrap the raw array in a LazyResult so the Series is built from an expression
        source = LazyResult(np.array([1, 2, 3]), numpy_to_weld_type('int64'), 1)
        series = Series(source.expr, int64, RangeIndex(0, 3, 1))

        expected_result = Series(np.array([1, 2]), int64, RangeIndex(0, 2, 1))

        test_equal_series(expected_result, series[:2])
    def test_getitem_filter(self):
        """ Filtering an Index with a bool LazyResult keeps only the labels marked True """
        # np.bool_ is used instead of the np.bool alias, which NumPy deprecated
        # and later removed
        bool_dtype = np.dtype(np.bool_)
        to_filter = LazyResult(
            np.array([True, False, True], dtype=bool_dtype),
            numpy_to_weld_type(bool_dtype), 1)
        result = pdw.Index(np.array([1, 2, 3]), np.dtype(np.int64))[to_filter]

        expected_result = pdw.Index(np.array([1, 3]), np.dtype(np.int64))

        np.testing.assert_array_equal(
            evaluate_if_necessary(expected_result).data,
            evaluate_if_necessary(result).data)
    def __init__(self, file_id, column_name, dimensions, shape, attributes, expression, dtype):
        """ Store the description of a column and set it up as a LazyResult

        Parameters
        ----------
        file_id
            stored as-is on the instance
        column_name
            stored as-is on the instance
        dimensions
            stored as-is on the instance
        shape
            stored as-is on the instance
        attributes
            stored as-is; also passed to _infer_dtype together with dtype
        expression
            handed to LazyResult.__init__ as the data of this result
        dtype
            declared type; the stored dtype is the one returned by _infer_dtype

        """
        # when reading data with netCDF4, the values are multiplied by the scale_factor if it exists,
        # which means that even if data is of type int, the scale factor is often float making the result a float;
        # hence the dtype actually used is the inferred one
        actual_dtype = self._infer_dtype(dtype, attributes)
        LazyResult.__init__(self, expression, numpy_to_weld_type(actual_dtype), 1)

        self.file_id = file_id
        self.column_name = column_name
        self.dimensions = dimensions
        self.shape = shape
        self.attributes = attributes
        self.dtype = actual_dtype

        # the param used to lazy_slice_rows; slice(None) is the same as [:]
        self.tuple_slices = slice(None)
        self._slice = None
def duplicate_elements_indices(array, n, cartesian=False):
    """ Expands array by repeating each element n times, returning indices

    Parameters
    ----------
    array : np.ndarray or LazyResult
        the source data
    n : long or LazyResult or np.ndarray
        how many times to repeat each element; if LazyResult or np.ndarray,
        will use its length; a number must be a long (a plain int is rejected)
    cartesian : bool
        True if used internally by cartesian_product to signify the operation
        has been done once already and hence must behave slightly different by using the number
        in the array instead of the index of that number (since at this point the array already contains indexes)

    Returns
    -------
    LazyResult
        the expanded array containing the indices, not the elements

    Raises
    ------
    NotImplementedError
        if array is neither a LazyResult nor a np.ndarray
    TypeError
        if n is not a LazyResult, a np.ndarray or a long

    Examples
    --------
    >>> duplicate_elements_indices(np.array([1, 2, 3]), 2L)
    [0, 0, 1, 1, 2, 2]

    """
    # unwrap the source data and work out its weld type
    if isinstance(array, LazyResult):
        weld_type, source = array.weld_type, array.expr
    elif isinstance(array, np.ndarray):
        weld_type, source = numpy_to_weld_type(array.dtype), array
    else:
        raise NotImplementedError

    # normalize the repeat count
    if isinstance(n, LazyResult):
        repeats = n.expr
    elif isinstance(n, np.ndarray):
        repeats = len(n)
    elif isinstance(n, long):
        repeats = n
    else:
        raise TypeError(
            'expected either a long value or a LazyResult to use its length')

    indices = _duplicate_elements_indices(source, repeats, weld_type, cartesian)

    return LazyResult(indices, WeldLong(), 1)
    def __getitem__(self, item):
        """ Retrieve a portion of the MultiIndex

        Only the labels are sliced or filtered; the levels and names are passed
        to the new MultiIndex unchanged.

        Parameters
        ----------
        item : slice or LazyResult
            if slice, returns a sliced MultiIndex;
            if LazyResult, returns a filtered MultiIndex only with the labels corresponding to
            True in the LazyResult

        Returns
        -------
        MultiIndex

        Raises
        ------
        ValueError
            if item is a LazyResult whose weld type is not bool
        TypeError
            if item is neither a slice nor a LazyResult

        """
        # TODO: filter unnecessary levels too, both slice and LazyResult
        if isinstance(item, slice):
            item = replace_slice_defaults(item)

            new_labels = [
                LazyResult(weld_subset(get_expression_or_raw(label), item),
                           get_weld_type(label), 1) for label in self.labels
            ]

            return MultiIndex(self.levels, new_labels, self.names)
        elif isinstance(item, LazyResult):
            if str(item.weld_type) != str(numpy_to_weld_type('bool')):
                raise ValueError(
                    'expected LazyResult of bool to filter MultiIndex elements')

            new_labels = []
            for label in self.labels:
                label, weld_type = get_weld_info(label, True, True)

                new_labels.append(
                    LazyResult(weld_filter(label, item.expr), weld_type, 1))

            return MultiIndex(self.levels, new_labels, self.names)
        else:
            raise TypeError(
                'expected slice or LazyResult of bool in MultiIndex.__getitem__'
            )
    def test_getitem_filter(self):
        """ Filtering a MultiIndex with a bool LazyResult keeps the levels and filters the labels """
        levels = [
            LazyResult(np.array([1, 2]), WeldLong(), 1),
            LazyResult(np.array([3, 4]), WeldLong(), 1)
        ]
        names = ['a', 'b']

        # np.bool_ is used instead of the np.bool alias, which NumPy deprecated
        # and later removed
        bool_dtype = np.dtype(np.bool_)
        to_filter = LazyResult(
            np.array([True, False, True, False], dtype=bool_dtype),
            numpy_to_weld_type(bool_dtype), 1)
        result = pdw.MultiIndex.from_product(levels, names)[to_filter]

        expected_result = pdw.MultiIndex([
            LazyResult(np.array([1, 2]), WeldLong(), 1),
            LazyResult(np.array([3, 4]), WeldLong(), 1)
        ], [
            LazyResult(np.array([0, 1]), WeldLong(), 1),
            LazyResult(np.array([0, 0]), WeldLong(), 1)
        ], ['a', 'b'])

        test_equal_multiindex(expected_result, result)
Beispiel #12
0
    def __getitem__(self, item):
        """ Lazy operation to select a subset of the series

        Has consequences! When slicing, any previous and/or following operations on
        the data within will be done only on this subset of the data

        Parameters
        ----------
        item : slice or LazyResult
            if slice, a slice of the data for the number of desired rows; currently
            must contain a stop value and will not work as expected for
            start != 0 and stride != 1;
            if LazyResult, returns a filtered Series only with the elements corresponding to
            True in the item LazyResult

        Returns
        -------
        Series
            with the same dtype and name, and the index sliced or filtered the same way

        Raises
        ------
        ValueError
            if item is a LazyResult whose weld type is not bool
        TypeError
            if item is neither a slice nor a LazyResult

        """
        if isinstance(item, slice):
            item = replace_slice_defaults(item)

            # applied to this Series itself; the returned Series is built from
            # this Series' own expression
            self.update_rows(item)

            new_index = self.index[item]

            return Series(self.expr, self.dtype, new_index, self.name)
        elif isinstance(item, LazyResult):
            if str(item.weld_type) != str(numpy_to_weld_type('bool')):
                raise ValueError(
                    'expected LazyResult of bool to filter Series elements')

            new_index = self.index[item]

            return Series(weld_filter(self.expr, item.expr), self.dtype,
                          new_index, self.name)
        else:
            raise TypeError(
                'expected a slice or a Series of bool in Series.__getitem__')
    def _element_wise_op(self, array, value, operation):
        """ Build a WeldObject applying `element operation value` to every element

        Parameters
        ----------
        array : WeldObject or raw data
            the input; passed to WeldObject.update, and when it is itself
            a WeldObject it is referenced by its obj_id and recorded as a dependency
        value
            right-hand operand, substituted textually into the weld code
        operation
            the operator, substituted textually between each element and value

        Returns
        -------
        WeldObject
            with weld_code set to a loop merging the results into an appender
            of the weld type matching self.dtype

        """
        result_obj = WeldObject(Variable.encoder, Variable.decoder)

        array_name = result_obj.update(array)
        if isinstance(array, WeldObject):
            array_name = array.obj_id
            result_obj.dependencies[array_name] = array

        substitutions = {
            'array': array_name,
            'value': value,
            'operation': operation,
            'type': numpy_to_weld_type(self.dtype),
        }

        result_obj.weld_code = """
        result(
            for(%(array)s, 
                appender[%(type)s], 
                |b: appender[%(type)s], i: i64, n: %(type)s| 
                    merge(b, n %(operation)s %(value)s)
            )
        )""" % substitutions

        return result_obj
    def __getitem__(self, item):
        """ Retrieve a portion of the DataFrame

        Has consequences! When slicing, any previous and/or following operations on
        the data within will be done only on this subset of the data

        Parameters
        ----------
        item : str or slice or list of str or LazyResult
            if str, returns a column as a Series;
            if slice, returns a sliced DataFrame;
            if list, returns a DataFrame with only the columns from the list;
            if LazyResult, returns a filtered DataFrame only with the rows corresponding to
            True in the LazyResult

        Returns
        -------
        Series or DataFrame

        Raises
        ------
        ValueError
            if item is a LazyResult whose weld type is not bool
        TypeError
            if item is a list containing something other than str, or
            if item is none of the supported types

        """
        if isinstance(item, str):
            element = self.data[item]

            data, dtype = get_weld_info(element, expression=True, dtype=True)

            return Series(data, dtype, self.index, item)
        elif isinstance(item, slice):
            item = replace_slice_defaults(item)

            new_data = {}
            for column_name in self:
                # making series because Series has the proper method to slice something; re-use the code above
                series = self[str(column_name)]
                # the actual slice handled by Series getitem
                new_data[column_name] = series[item]

            # index slice handled by index
            new_index = self.index[item]

            return DataFrame(new_data, new_index)
        elif isinstance(item, list):
            new_data = {}

            for column_name in item:
                if not isinstance(column_name, str):
                    raise TypeError(
                        'expected a list of column names as strings')

                new_data[column_name] = self.data[column_name]

            return DataFrame(new_data, self.index)
        elif isinstance(item, LazyResult):
            if str(item.weld_type) != str(numpy_to_weld_type('bool')):
                raise ValueError(
                    'expected series of bool to filter DataFrame rows')

            # filter the index first so that every filtered column is paired
            # with the filtered index instead of the original, unfiltered one
            new_index = self.index[item]

            new_data = {}
            for column_name in self:
                data, dtype = get_weld_info(self.data[column_name],
                                            expression=True,
                                            dtype=True)

                new_data[column_name] = Series(weld_filter(data, item.expr),
                                               dtype, new_index, column_name)

            return DataFrame(new_data, new_index)
        else:
            raise TypeError(
                'expected a str, slice, list, or Series in DataFrame.__getitem__'
            )