Esempio n. 1
0
    def generate_placeholder_weld_object(data_id, encoder, decoder):
        """ Build a WeldObject standing in for data known only by its id

        Parameters
        ----------
        data_id : str
            normally obtained from generate_data_id; this is not verified here
        encoder : WeldObjectEncoder
        decoder : WeldObjectDecoder

        Returns
        -------
        tuple
            (WeldObject, input id): the placeholder, whose weld_code is nothing
            but its single input, and the generated _inpX id of that input

        """
        placeholder = WeldObject(encoder, decoder)
        # registering data_id in the context of the new object hands back the
        # generated _inpX id, which is how the data_id gets linked to it
        input_name = placeholder.update(data_id)
        # a brand new object must always produce an id; None means a bug elsewhere
        assert input_name is not None
        # evaluating the placeholder just evaluates its input
        placeholder.weld_code = '%s' % input_name

        return placeholder, input_name
Esempio n. 2
0
def weld_count(array):
    """ Compute the number of elements in the array

    Parameters
    ----------
    array : np.ndarray or WeldObject
        input array

    Returns
    -------
    WeldObject
        representation of this computation

    """
    counter = WeldObject(_encoder, _decoder)

    # update is always called; for WeldObject inputs the name used in the
    # code is the object's own id and the object is tracked as a dependency
    array_name = counter.update(array)
    if isinstance(array, WeldObject):
        array_name = array.obj_id
        counter.dependencies[array_name] = array

    code = """
    len(
        %(array)s
    )"""
    replacements = {'array': array_name}
    counter.weld_code = code % replacements

    return counter
Esempio n. 3
0
def weld_range(start, stop, step):
    """ Build the vector described by the given range parameters

    Parameters
    ----------
    start : int
    stop : int
    step : int

    Returns
    -------
    WeldObject
        representation of this computation

    """
    range_obj = WeldObject(_encoder, _decoder)

    code = """
    result(
        for(
            rangeiter(%(start)s, %(stop)s, %(step)s),
            appender[i64],
            |b: appender[i64], i: i64, e: i64| 
                merge(b, e)
        )
    )"""

    # every bound is wrapped in an explicit i64 cast inside the Weld code
    bounds = {}
    bounds['start'] = 'i64(%s)' % start
    bounds['stop'] = 'i64(%s)' % stop
    bounds['step'] = 'i64(%s)' % step

    range_obj.weld_code = code % bounds

    return range_obj
Esempio n. 4
0
def weld_unique(array, type):
    """ Compute the distinct elements of the array

    The generated code merges every element into a dictmerger as a key,
    takes the keys back out of the resulting dictionary and sorts them.

    Parameters
    ----------
    array : np.ndarray or WeldObject
        input array
    type : WeldType
        of the input array

    Returns
    -------
    WeldObject
        representation of this computation

    """
    unique_obj = WeldObject(_encoder, _decoder)

    array_name = unique_obj.update(array)
    if isinstance(array, WeldObject):
        # lazy inputs are referenced by their own id and tracked as dependency
        array_name = array.obj_id
        unique_obj.dependencies[array_name] = array

    code = """
    sort(
        map(
            tovec(
                result(
                    for(
                        map(
                            %(array)s,
                            |e| 
                                {e, 0}
                        ),
                        dictmerger[%(type)s, i32, +],
                        |b, i, e| 
                            merge(b, e)
                    )
                )
            ),
            |e| 
                e.$0
        ),
        |x: %(type)s| 
            x
    )"""

    replacements = {'array': array_name, 'type': type}
    unique_obj.weld_code = code % replacements

    return unique_obj
Esempio n. 5
0
def weld_subset(array, slice_):
    """ Return a subset of the input array

    Missing slice fields follow the usual Python defaults: a start of None
    means 0, a step of None means 1 and a stop of None means "until the end
    of the array". Negative values are not given any special treatment.

    Parameters
    ----------
    array : np.array or WeldObject
        1-dimensional array
    slice_ : slice
        subset to return

    Returns
    -------
    WeldObject
        representation of this computation

    """
    weld_obj = WeldObject(NumPyEncoder(), NumPyDecoder())

    array_var = weld_obj.update(array)
    if isinstance(array, WeldObject):
        array_var = array.obj_id
        weld_obj.dependencies[array_var] = array

    # slice(0, 5) carries step=None and slice(None, 5) carries start=None;
    # without these defaults they end up as 'i64(None)' or a TypeError below
    start = 0 if slice_.start is None else slice_.start
    step = 1 if slice_.step is None else slice_.step

    if slice_.stop is None:
        # the length is only known to Weld, so let it compute the span
        span = 'len(%s) - i64(%s)' % (array_var, start)
    else:
        span = 'i64(%s)' % (slice_.stop - start)

    if step == 1:
        weld_template = """
        slice(
            %(array)s,
            %(slice_start)s,
            %(slice_stop)s
        )"""
    else:
        # NOTE(review): this branch hands (stop - start) to iter exactly as
        # before; confirm whether Weld's iter expects an end index here
        # rather than a length
        weld_template = """
        result(
            for(
                iter(%(array)s, %(slice_start)s, %(slice_stop)s, %(slice_step)s),
                appender,
                |b, i, n| 
                    merge(b, n)
            )  
        )"""

    weld_obj.weld_code = weld_template % {
        'array': array_var,
        'slice_start': 'i64(%s)' % start,
        'slice_stop': span,
        'slice_step': 'i64(%s)' % step
    }

    return weld_obj
Esempio n. 6
0
def weld_element_wise_op(array, scalar, operation, weld_type):
    """ Combine every element of the array with a scalar through operation

    Parameters
    ----------
    array : np.ndarray or WeldObject
        input array
    scalar : str or scalar type
        right-hand operand; must have the same type as the array values. A str
        is inserted into the code as is, anything else is wrapped in a cast to
        weld_type (so that e.g. a native Python int can be passed)
    operation : {+, -, *, /}
    weld_type : WeldType
        type of each element in the input array

    Returns
    -------
    WeldObject
        representation of this computation

    """
    op_obj = WeldObject(_encoder, _decoder)

    array_name = op_obj.update(array)

    if isinstance(array, WeldObject):
        array_name = array.obj_id
        op_obj.dependencies[array_name] = array

    # str scalars are taken to already be Weld literals; native Python
    # values still need the explicit cast
    if isinstance(scalar, str):
        scalar_code = scalar
    else:
        scalar_code = "%s(%s)" % (weld_type, str(scalar))

    code = """
    result(
        for(%(array)s, 
            appender, 
            |b, i, n| 
                merge(b, n %(operation)s %(scalar)s)
        )
    )"""

    replacements = {'array': array_name,
                    'scalar': scalar_code,
                    'operation': operation}
    op_obj.weld_code = code % replacements

    return op_obj
Esempio n. 7
0
def weld_compare(array, scalar, operation, weld_type):
    """ Compare every element of the array against a scalar

    Parameters
    ----------
    array : np.ndarray or WeldObject
        input array
    scalar : str or scalar type
        value to compare with; must have the same type as the array values. A
        str is inserted into the code as is, anything else is wrapped in a cast
        to weld_type (so that e.g. a native Python int can be passed)
    operation : {<, <=, ==, !=, >=, >}
    weld_type : WeldType
        type of each element in the input array

    Returns
    -------
    WeldObject
        representation of this computation

    """
    compare_obj = WeldObject(_encoder, _decoder)

    array_name = compare_obj.update(array)
    if isinstance(array, WeldObject):
        array_name = array.obj_id
        compare_obj.dependencies[array_name] = array

    # str scalars are taken to already be Weld literals; native Python
    # values still need the explicit cast
    if isinstance(scalar, str):
        scalar_code = scalar
    else:
        scalar_code = "%s(%s)" % (weld_type, str(scalar))

    code = """
    map(
        %(array)s,
        |a: %(type)s| 
            a %(operation)s %(scalar)s
    )"""

    replacements = {'array': array_name,
                    'scalar': scalar_code,
                    'operation': operation,
                    'type': weld_type}
    compare_obj.weld_code = code % replacements

    return compare_obj
def _cartesian_product_indices(arrays):
    """ Lazily build the index columns of the cartesian product of arrays

    One column is produced per input array; all of them are then packed
    into a single WeldObject whose code merges the columns, in order, into
    an appender[vec[i64]]. At least two arrays are required.
    """
    # one (lazy) column per input; the first two are seeded directly
    columns = [0] * len(arrays)
    columns[0] = duplicate_elements_indices(arrays[0], arrays[1])
    columns[1] = duplicate_array_indices(arrays[1], arrays[0])

    for pos in range(2, len(arrays)):
        # every column computed so far is expanded by the newly added array
        for prev in range(pos):
            columns[prev] = duplicate_elements_indices(columns[prev],
                                                       arrays[pos],
                                                       cartesian=True)

        # the new column starts from its own array and is then expanded by
        # each of the arrays in between
        expanded = duplicate_array_indices(arrays[pos], arrays[0])
        for prev in range(1, pos):
            expanded = duplicate_array_indices(expanded,
                                               arrays[prev],
                                               cartesian=True)
        columns[pos] = expanded

    product_obj = WeldObject(_encoder, _decoder)
    # the final output depends on every column
    for column in columns:
        product_obj.update(column.expr)
        product_obj.dependencies[column.expr.obj_id] = column.expr

    # a single vec[vec[i64]], which will result in a np.ndarray of ndim=2
    column_ids = [column.expr.obj_id for column in columns]
    lines = ['let res = {%s};\n' % ', '.join(column_ids),
             'let a_0 = appender[vec[i64]];\n']
    for k in range(len(columns)):
        lines.append('let a_%s = merge(a_%s, res.$%s);\n' % (str(k + 1), str(k), str(k)))
    lines.append('result(a_%s)\n' % str(len(columns)))

    # the code is complete as built, there are no placeholders to fill in
    product_obj.weld_code = ''.join(lines)

    return product_obj
Esempio n. 9
0
def weld_udf(weld_template, mapping):
    """ Apply weld_code given arrays and scalars as input

    The mapping passed in is left untouched; the substitution of arrays by
    their Weld variable names happens on a copy.

    Parameters
    ----------
    weld_template : str
        the code that will be recorded for execution
    mapping : dict
        maps placeholders to either arrays (np.array or WeldObject) or scalars

    Returns
    -------
    WeldObject
        representation of the inputted weld_code computation

    Examples
    -------
    >>> array = np.array([1, 3, 4])
    >>> weld_template = "map(%(array)s, |e| e + %(scalar)s)"
    >>> mapping = {'array': array, 'scalar': '2L'}
    >>> result = weld_udf(weld_template, mapping)
    >>> LazyResult(result, WeldLong(), 1).evaluate()
    [3 5 6]

    """
    weld_obj = WeldObject(_encoder, _decoder)

    # work on a copy so the caller's dict keeps its arrays and can be reused
    replacements = dict(mapping)

    # swap arrays for their weld var's (array_var in other methods)
    for k, v in mapping.items():
        # does not need to be registered if not np.array or weldobject
        if not isinstance(v, (np.ndarray, WeldObject)):
            continue

        array_var = weld_obj.update(v)

        if isinstance(v, WeldObject):
            array_var = v.obj_id
            weld_obj.dependencies[array_var] = v

        replacements[k] = array_var

    weld_obj.weld_code = weld_template % replacements

    return weld_obj
Esempio n. 10
0
    def _create_columns(self, header_df):
        """ Create a lazy Column for every column found in header_df

        Each column is also registered as lazy data under the Weld input name
        generated for its data id. Returns a dict of column name to Column.
        """
        from weld.weldobject import WeldObject

        created = {}
        for name in header_df:
            data_id = LazyResult.generate_data_id(name)
            dtype = header_df[name].dtype
            created[name] = Column(name, self, data_id, dtype)
            # register the column under its Weld input name
            input_name = WeldObject.generate_input_name(data_id)
            LazyResult.register_lazy_data(input_name, created[name])

        return created
Esempio n. 11
0
def weld_aggregate(array, operation, weld_type):
    """ Reduce the elements of the array to a single value through operation

    Arguments
    ---------
    array : WeldObject or np.ndarray
        input array
    operation : {'+', '*', 'min', 'max'}
        operation to apply
    weld_type : WeldType
        type of each element in the input array

    Returns
    -------
    WeldObject
        representation of this computation

    """
    aggregate_obj = WeldObject(_encoder, _decoder)

    array_name = aggregate_obj.update(array)
    if isinstance(array, WeldObject):
        # lazy inputs are referenced by their own id and tracked as dependency
        array_name = array.obj_id
        aggregate_obj.dependencies[array_name] = array

    code = """
    result(
        for(
            %(array)s,
            merger[%(type)s, %(operation)s],
            |b, i, e| 
                merge(b, e)
        )
    )"""

    replacements = {'array': array_name,
                    'type': weld_type,
                    'operation': operation}
    aggregate_obj.weld_code = code % replacements

    return aggregate_obj
Esempio n. 12
0
def weld_mean(array, weld_type):
    """ Compute the mean of the array

    The generated code sums the elements with a merger of weld_type and
    divides the sum, cast to f64, by the length of the array.

    Parameters
    ----------
    array : np.ndarray or WeldObject
        input array
    weld_type : WeldType
        type of each element in the input array

    Returns
    -------
    WeldObject
        representation of this computation

    """
    mean_obj = WeldObject(_encoder, _decoder)

    array_name = mean_obj.update(array)
    if isinstance(array, WeldObject):
        # lazy inputs are referenced by their own id and tracked as dependency
        array_name = array.obj_id
        mean_obj.dependencies[array_name] = array

    code = """
    f64(
        result(
            for(
                %(array)s,
                merger[%(type)s, +],
                |b, i, n|
                    merge(b, n)
            )
        )
    ) / f64(len(%(array)s))"""

    replacements = {'array': array_name, 'type': weld_type}
    mean_obj.weld_code = code % replacements

    return mean_obj
Esempio n. 13
0
    def _copy(self):
        """ Return a new LazyResult wrapping a copy of this one's expression

        The context and argtypes are deep-copied while the dependencies
        dict is only copied shallowly; encoder, decoder and code are reused.
        """
        source = self.expr

        clone = WeldObject(source.encoder, source.decoder)
        clone.weld_code = source.weld_code
        clone.context = deepcopy(source.context)
        # shallow on purpose: the dependency objects themselves stay shared
        clone.dependencies = source.dependencies.copy()
        clone.argtypes = deepcopy(source.argtypes)

        return LazyResult(clone, self.weld_type, self.dim)
Esempio n. 14
0
def weld_get_column(grouped_df, index, is_index=False):
    """ Extract a single column out of a grouped DataFrame

    Parameters
    ----------
    grouped_df : WeldObject
        DataFrame which has been grouped through weld_groupby
    index : int
        position of the column; the mapping name-to-index is maintained by DataFrameGroupBy
    is_index : bool
        True when the requested column belongs to the index, in which case it
        is read from the first field of every element instead of the second

    Returns
    -------
    WeldObject
        representation of the computation

    """
    column_obj = WeldObject(_encoder, _decoder)

    # the input must be a WeldObject, for which update is expected to give None
    assert column_obj.update(grouped_df) is None

    grouped_name = grouped_df.obj_id
    column_obj.dependencies[grouped_name] = grouped_df

    code = """
    map(
        %(grouped_df)s,
        |e|
            e.$%(index)s
    )"""

    # field 0 of each element is used for index columns, field 1 otherwise
    field_format = '0.$%s' if is_index else '1.$%s'
    replacements = {'grouped_df': grouped_name,
                    'index': field_format % index}
    column_obj.weld_code = code % replacements

    return column_obj
Esempio n. 15
0
def weld_index_to_values(levels, labels):
    """ Construct the actual index from levels ('values') and labels ('indices')

    Parameters
    ----------
    levels : np.array or WeldObject
        the possible values
    labels : np.array or WeldObject
        the indices to the levels for the actual index values

    Returns
    -------
    WeldObject
        representation of the computation

    Examples
    --------
    >>> levels = np.array([1.0, 2.5, 3.0])
    >>> labels = np.array([0, 0, 1, 2])
    >>> print(LazyResult(weld_index_to_values(levels, labels), WeldDouble(), 1).evaluate(verbose=False))
    [1. 1. 2.5 3.]

    """
    # TODO: fix this temporary hack
    # object arrays are converted to strings before being handed to Weld; the
    # builtin str is used because the np.str alias no longer exists in
    # recent NumPy releases (it was only ever an alias of the builtin)
    if isinstance(levels, np.ndarray) and levels.dtype == 'object':
        levels = levels.astype(str)

    weld_obj = WeldObject(_encoder, _decoder)

    levels_var = weld_obj.update(levels)
    if isinstance(levels, WeldObject):
        levels_var = levels.obj_id
        weld_obj.dependencies[levels_var] = levels

    labels_var = weld_obj.update(labels)
    if isinstance(labels, WeldObject):
        labels_var = labels.obj_id
        weld_obj.dependencies[labels_var] = labels

    # every label is used as a position into levels
    weld_template = """
    result(
        for(
            %(labels)s,
            appender,
            |b, i, n|
                merge(b, lookup(%(levels)s, n))
        )
    )"""

    weld_obj.weld_code = weld_template % {'labels': labels_var,
                                          'levels': levels_var}

    return weld_obj
Esempio n. 16
0
def weld_standard_deviation(array, weld_type):
    """ Compute the standard deviation of the array

    The generated code sums the squared differences to the mean and divides
    by the length of the array minus one before taking the square root.

    Parameters
    ----------
    array : np.ndarray or WeldObject
        input array
    weld_type : WeldType
        type of each element in the input array; only needed for the mean

    Returns
    -------
    WeldObject
        representation of this computation

    """
    std_obj = WeldObject(_encoder, _decoder)

    array_name = std_obj.update(array)
    if isinstance(array, WeldObject):
        array_name = array.obj_id
        std_obj.dependencies[array_name] = array

    # the mean is a lazy computation of its own and becomes a dependency;
    # being a WeldObject, it is always referenced through its own id
    mean_dependency = weld_mean(array, weld_type)
    std_obj.update(mean_dependency)
    mean_name = mean_dependency.obj_id
    std_obj.dependencies[mean_name] = mean_dependency

    code = """
    sqrt(
        result(
            for(
                %(array)s,
                merger[f64, +],
                |b, i, n|
                    merge(b, pow(f64(n) - %(mean)s, 2.0))
            )
        ) / f64(len(%(array)s) - 1L)
    )"""

    # the template has no type placeholder, so only these two are needed
    replacements = {'array': array_name, 'mean': mean_name}
    std_obj.weld_code = code % replacements

    return std_obj
Esempio n. 17
0
    def _element_wise_op(self, array, value, operation):
        """ Build the WeldObject combining every element of array with value

        The element type used in the generated code is derived from
        self.dtype; value and operation are inserted into the code as given.
        """
        op_obj = WeldObject(Variable.encoder, Variable.decoder)

        array_name = op_obj.update(array)

        if isinstance(array, WeldObject):
            # lazy inputs are referenced by their own id and tracked as dependency
            array_name = array.obj_id
            op_obj.dependencies[array_name] = array

        code = """
        result(
            for(%(array)s, 
                appender[%(type)s], 
                |b: appender[%(type)s], i: i64, n: %(type)s| 
                    merge(b, n %(operation)s %(value)s)
            )
        )"""

        replacements = {'array': array_name,
                        'value': value,
                        'operation': operation,
                        'type': numpy_to_weld_type(self.dtype)}
        op_obj.weld_code = code % replacements

        return op_obj
Esempio n. 18
0
def weld_array_op(array1, array2, operation):
    """ Combine two arrays element by element through operation

    Their lengths and types are assumed to be the same.
    TODO: what happens if not?

    Parameters
    ----------
    array1 : np.ndarray or WeldObject
        input array, left-hand operand
    array2 : np.ndarray or WeldObject
        second input array, right-hand operand
    operation : {+, -, *, /, &&, ||}

    Returns
    -------
    WeldObject
        representation of this computation

    """
    op_obj = WeldObject(_encoder, _decoder)

    left_name = op_obj.update(array1)
    if isinstance(array1, WeldObject):
        left_name = array1.obj_id
        op_obj.dependencies[left_name] = array1

    right_name = op_obj.update(array2)
    if isinstance(array2, WeldObject):
        right_name = array2.obj_id
        op_obj.dependencies[right_name] = array2

    # the arrays are zipped, so n.$0 and n.$1 are the paired elements
    code = """
    result(
        for(zip(%(array1)s, %(array2)s), 
            appender, 
            |b, i, n| 
                merge(b, n.$0 %(operation)s n.$1)
        )
    )"""

    replacements = {'array1': left_name,
                    'array2': right_name,
                    'operation': operation}
    op_obj.weld_code = code % replacements

    return op_obj
Esempio n. 19
0
    def _create_columns(self, header_df):
        """ Create a Column for every column found in header_df

        Each column is registered as lazy data under the Weld input name
        generated for its data id and is then read eagerly, the data being
        stored in LazyResult.input_mapping under that same name. Returns a dict
        of column name to Column.
        """
        from weld.weldobject import WeldObject

        created = {}
        for name in header_df:
            data_id = LazyResult.generate_data_id(name)
            dtype = header_df[name].dtype
            column = Column(name, self, data_id, dtype)

            input_name = WeldObject.generate_input_name(data_id)
            LazyResult.register_lazy_data(input_name, column)

            # force read it eagerly
            eager_data = column.eager_read()
            LazyResult.input_mapping[str(input_name)] = eager_data

            created[name] = column

        return created
def _duplicate_elements_indices(array, n, weld_type, cartesian=False):
    """ Build the WeldObject repeating, for each element of array, an i64 value

    Parameters
    ----------
    array : np.ndarray or WeldObject
        array to iterate over
    n : np.ndarray, WeldObject or scalar
        bound on the number of repetitions; for an array or WeldObject its
        length is used, anything else is inserted into the code as given
    weld_type : WeldType
        type of the elements of array; ignored if cartesian, when i64 is used
    cartesian : bool
        if True the element value itself is merged, otherwise its index

    Returns
    -------
    WeldObject
        representation of this computation

    """
    weld_obj = WeldObject(_encoder, _decoder)

    array_var = weld_obj.update(array)

    if isinstance(array, WeldObject):
        array_var = array.obj_id
        weld_obj.dependencies[array_var] = array

    if isinstance(n, WeldObject):
        weld_obj.update(n)
        weld_obj.dependencies[n.obj_id] = n
        n = 'len(%s)' % n.obj_id
    elif isinstance(n, np.ndarray):
        # keep the name of n apart from array_var: reusing array_var here
        # made the loop below iterate over n instead of over array
        n_var = weld_obj.update(n)
        n = 'len(%s)' % n_var

    weld_template = """
    result(
        for(
            %(array)s,
            appender[i64],
            |b: appender[i64], i: i64, n: %(type)s| 
                iterate(
                    {b, %(index_or_value)s, 1L}, 
                    |p| 
                        {{merge(b, p.$1), p.$1, p.$2 + 1L}, 
                        p.$2 < %(n)s}
                ).$0
        )  
    )"""

    weld_obj.weld_code = weld_template % {
        'array': array_var,
        'n': 'i64(%s)' % n,
        'index_or_value': 'n' if cartesian else 'i',
        'type': 'i64' if cartesian else weld_type
    }

    return weld_obj
Esempio n. 21
0
def weld_filter(array, bool_array):
    """ Keep only the elements of array flagged True in bool_array

    Parameters
    ----------
    array : np.ndarray or WeldObject
        input array
    bool_array : np.ndarray / WeldObject
        array of bool, holding True at the positions of the elements of array
        which shall be part of the result

    Returns
    -------
    WeldObject
        representation of this computation

    """
    filter_obj = WeldObject(_encoder, _decoder)

    array_name = filter_obj.update(array)
    if isinstance(array, WeldObject):
        array_name = array.obj_id
        filter_obj.dependencies[array_name] = array

    mask_name = filter_obj.update(bool_array)
    if isinstance(bool_array, WeldObject):
        mask_name = bool_array.obj_id
        filter_obj.dependencies[mask_name] = bool_array

    # the arrays are zipped, so e.$0 is the element and e.$1 its flag
    code = """
    result(
        for(
            zip(%(array)s, %(bool_array)s),
            appender,
            |b, i, e| 
                if (e.$1, 
                    merge(b, e.$0), 
                    b)
        )
    )"""

    replacements = {'array': array_name, 'bool_array': mask_name}
    filter_obj.weld_code = code % replacements

    return filter_obj
Esempio n. 22
0
class Scalar(WeldBase):
    """ Scalar value whose operators build expressions instead of computing

    Every operator formats self and the other operand into a code string and
    passes that string to self.updated, returning whatever updated returns.
    """

    def __init__(self, value, weldty):
        self.value = value
        self.weldty = weldty
        self.weldobj = WeldObject(NumpyArrayEncoder(), NumpyArrayDecoder())
        # NOTE(review): the value is registered with WeldInt regardless of
        # weldty - confirm this is intended
        self.weldobj.update(value, WeldInt)

    # Algebraic operators

    def __add__(self, other):  # self + other
        return self.updated("{!s} + {!s}".format(self, other))

    def __mul__(self, other):  # self * other
        return self.updated("{!s} * {!s}".format(self, other))

    def __sub__(self, other):  # self - other
        return self.updated("{!s} - {!s}".format(self, other))

    def __div__(self, other):  # self / other
        return self.updated("{!s} / {!s}".format(self, other))

    # Python 3 dispatches the / operator to __truediv__ and never looks at
    # __div__; the alias keeps division working on both major versions
    __truediv__ = __div__

    def __pow__(self, other, modulo=None):  # self ** other
        # modulo is accepted for signature compatibility but not used
        return self.updated("pow({!s}, {!s})".format(self, other))

    # Comparison operators; note that these return the result of updated
    # rather than a bool

    def __eq__(self, other):  # self == other
        return self.updated("{!s} == {!s}".format(self, other))

    def __ne__(self, other):  # self != other
        return self.updated("{!s} != {!s}".format(self, other))

    def __lt__(self, other):  # self < other
        return self.updated("{!s} < {!s}".format(self, other))

    def __le__(self, other):  # self <= other
        return self.updated("{!s} <= {!s}".format(self, other))

    def __ge__(self, other):  # self >= other
        return self.updated("{!s} >= {!s}".format(self, other))

    def __gt__(self, other):  # self > other
        return self.updated("{!s} > {!s}".format(self, other))
def _array_to_labels(array, levels, levels_type):
    """ Build the WeldObject translating the values of array into labels

    The generated code maps every level to its position within levels through
    a dictionary and then looks up each element of array in that dictionary.
    levels_type is the Weld type of the levels, used for the dictionary keys.
    """
    labels_obj = WeldObject(_encoder, _decoder)

    array_name = labels_obj.update(array)
    if isinstance(array, WeldObject):
        array_name = array.obj_id
        labels_obj.dependencies[array_name] = array

    levels_name = labels_obj.update(levels)
    if isinstance(levels, WeldObject):
        levels_name = levels.obj_id
        labels_obj.dependencies[levels_name] = levels

    code = """
    let indices = result(
        for(%(levels)s,
            appender[i64],
            |b, i, e|
                merge(b, i)
        )
    );
    let indices_dict = result(
        for(zip(%(levels)s, indices),
            dictmerger[%(type)s, i64, +],
            |b, i, e|
                merge(b, {e.$0, e.$1})        
        )
    );
    result(   
        for(
            %(array)s,
            appender[i64],
            |b, i, e|
                merge(b, lookup(indices_dict, e))
        )
    )"""

    replacements = {'array': array_name,
                    'levels': levels_name,
                    'type': levels_type}
    labels_obj.weld_code = code % replacements

    return labels_obj
Esempio n. 24
0
def weld_groupby(by, by_types, columns, columns_types):
    """ Group the columns by the values of the by columns

    Parameters
    ----------
    by : list of np.ndarray or list of WeldObject
        the data to group by
    by_types : list of WeldType
        corresponding to by
    columns : list of np.ndarray or list of WeldObject
        the data to group
    columns_types : list of WeldType
        corresponding to columns

    Returns
    -------
    WeldObject
        representation of the computation

    """
    groupby_obj = WeldObject(_encoder, _decoder)

    def register(inputs):
        # add each input to the object and collect the names to use in the code
        names = []
        for item in inputs:
            name = groupby_obj.update(item)
            if isinstance(item, WeldObject):
                name = item.obj_id
                groupby_obj.dependencies[name] = item
            names.append(name)
        return names

    by_names = register(by)
    column_names = register(columns)

    code = """
    let columns = result(
            for(
                %(columns)s,
                appender,
                |b, i, e|
                    merge(b, e)
            )
    );
    tovec(
        result(
            for(
                zip(%(by)s, columns),
                groupmerger[%(by_types)s, %(columns_types)s],
                |b, i, e|
                    merge(b, {%(to_merge_keys)s, %(to_merge_values)s})
            )
        )
    )"""

    if len(by_names) > 1:
        by_code = ', '.join(by_names)
    else:
        by_code = '%s' % by_names[0]

    # several columns are zipped together, a single one is used directly
    if len(column_names) > 1:
        columns_code = 'zip(%s)' % ', '.join(column_names)
    else:
        columns_code = '%s' % column_names[0]

    # the keys are the first len(by_types) fields of each zipped element
    keys_code = '{%s}' % ', '.join(['e.$%s' % str(k) for k in range(len(by_types))])

    # the values follow the keys; with several columns they form a nested struct
    if len(columns_types) > 1:
        values_code = '{%s}' % ', '.join(['e.$%s.$%s' % (str(len(by_types)), str(k))
                                          for k in range(len(columns_types))])
    else:
        values_code = '{e.$%s}' % str(len(by_types))

    by_types_code = '{%s}' % ', '.join([str(k) for k in by_types])

    if len(columns_types) > 1:
        columns_types_code = '{%s}' % ', '.join([str(k) for k in columns_types])
    else:
        columns_types_code = '{%s}' % columns_types[0]

    groupby_obj.weld_code = code % {'by': by_code,
                                    'columns': columns_code,
                                    'by_types': by_types_code,
                                    'to_merge_keys': keys_code,
                                    'to_merge_values': values_code,
                                    'columns_types': columns_types_code}

    return groupby_obj
Esempio n. 25
0
def weld_merge_single_index(indexes, cache=True):
    """ Returns bool arrays for which indexes shall be kept

    A single Weld computation walks over both indexes at once and produces
    both bool arrays, padded with False to the same length, as one
    2-dimensional result. The returned objects each pick their own row out of
    that result and cut it back to the length of the corresponding index.

    Parameters
    ----------
    indexes : list of np.array or WeldObject
        the two indexes to merge; only the first two elements are used
    cache : bool
        flag to indicate whether to cache result as intermediate result; if
        False, the combined computation is added as a dependency of each
        returned object instead

    Returns
    -------
    list of WeldObject
        representation of the computations, one for each of the two indexes

    Examples
    -------
    >>> index1 = np.array([1, 3, 4, 5, 6])
    >>> index2 = np.array([2, 3, 5])
    >>> result = weld_merge_single_index([index1, index2])
    >>> LazyResult(result[0], WeldBit(), 1).evaluate(verbose=False)
    [False True False True False]
    >>> LazyResult(result[1], WeldBit(), 1).evaluate(verbose=False)
    [False True True]

    """
    # object of the combined computation; all indexes are registered as its inputs
    weld_obj = WeldObject(_encoder, _decoder)
    weld_ids = []
    for array in indexes:
        array_var = weld_obj.update(array)
        if isinstance(array, WeldObject):
            array_var = array.obj_id
            weld_obj.dependencies[array_var] = array
        weld_ids.append(array_var)

    # the first iterate advances through both arrays until either is exhausted;
    # the following two pad the bool arrays with False up to maxlen
    weld_template = """
    let len1 = len(%(array1)s);
    let len2 = len(%(array2)s);
    # bool arrays shall be padded until maxLen so that result can be cached as np.ndarray of ndim=2
    let maxlen = if(len1 > len2, len1, len2);
    let res = iterate({0L, 0L, appender[bool], appender[bool]},
            |p|
                let val1 = lookup(%(array1)s, p.$0);
                let val2 = lookup(%(array2)s, p.$1);
                let iter_output = 
                    if(val1 == val2,
                        {p.$0 + 1L, p.$1 + 1L, merge(p.$2, true), merge(p.$3, true)},
                        if(val1 < val2,  
                            {p.$0 + 1L, p.$1, merge(p.$2, false), p.$3},
                            {p.$0, p.$1 + 1L, p.$2, merge(p.$3, false)}
                        )
                    );
                    
                {
                    iter_output,
                    iter_output.$0 < len1 && 
                    iter_output.$1 < len2
                }
    );
    # iterate over remaining un-checked elements in both arrays
    let res = if (res.$0 < maxlen, iterate(res,
            |p|
                {
                    {p.$0 + 1L, p.$1, merge(p.$2, false), p.$3},
                    p.$0 + 1L < maxlen
                }
    ), res);
    let res = if (res.$1 < maxlen, iterate(res,
            |p|
                {
                    {p.$0, p.$1 + 1L, p.$2, merge(p.$3, false)},
                    p.$1 + 1L < maxlen
                }
    ), res);
    let b = appender[vec[bool]];
    let c = merge(b, result(res.$2));
    result(merge(c, result(res.$3)))"""

    weld_obj.weld_code = weld_template % {'array1': weld_ids[0],
                                          'array2': weld_ids[1]}
    # this has both required bool arrays into 1 ndarray; note that arrays have been padded with False until of same len
    # TODO: this could still be a single vec/array with the arrays concatenated instead to avoid decoder with ndim=2 mallocs
    result = LazyResult(weld_obj, WeldBit(), 2)

    # creating the actual results to return
    weld_objects = []
    # from here on weld_ids holds, per returned object, the name of the combined result
    weld_ids = []
    weld_col_ids = []

    if cache:
        # register the combined result as an intermediate result and let both
        # returned objects refer to it through its id
        id_ = LazyResult.generate_intermediate_id('sindex_merge')
        weld_input_id = WeldObject.generate_input_name(id_)
        LazyResult.register_intermediate_result(weld_input_id, result)

        for i in range(2):
            weld_obj = WeldObject(_encoder, _decoder)

            result_var = weld_obj.update(id_)
            # the id is not a WeldObject, so a variable name is expected back
            assert result_var is not None

            weld_objects.append(weld_obj)
            weld_ids.append(result_var)
    else:
        # without caching, the combined computation is a plain dependency of
        # both returned objects
        for i in range(2):
            weld_obj = WeldObject(_encoder, _decoder)

            result_var = weld_obj.update(result.expr)
            # update is expected to give None for WeldObject inputs
            assert result_var is None
            result_var = result.expr.obj_id
            weld_obj.dependencies[result_var] = result.expr

            weld_objects.append(weld_obj)
            weld_ids.append(result_var)

    # need 1 array from each resulting tables to get actual length
    for i in range(2):
        array_var = weld_objects[i].update(indexes[i])
        if isinstance(indexes[i], WeldObject):
            array_var = indexes[i].obj_id
            weld_objects[i].dependencies[array_var] = indexes[i]
        weld_col_ids.append(array_var)

    # pick row i of the combined result and drop the padding again
    weld_templ = """slice(lookup(%(array)s, %(i)s), 0L, len(%(col)s))"""

    for i in range(2):
        weld_objects[i].weld_code = weld_templ % {'array': weld_ids[i],
                                                  'i': str(i) + 'L',
                                                  'col': weld_col_ids[i]}

    return weld_objects
Esempio n. 26
0
def weld_merge_triple_index(indexes, cache=True):
    """ Merge-joins two 3-level indexes and yields, per DataFrame, a bool mask of matching rows

    Both index triples are walked in lockstep: the levels are compared in order and, when
    all three are equal, True is recorded for both sides and both advance; otherwise False
    is recorded for the side holding the smaller value and only that side advances.
    Both masks are then padded with False up to the length of the longer input so that
    they fit in a single 2-dimensional result.

    Note it does NOT work correctly with duplicate elements; indexes MUST be already sorted

    Parameters
    ----------
    indexes : list of list
        of np.array or WeldObject
        list of len 2; its elements are the lists with the 3 label arrays
        of the first and second DataFrame MultiIndex, respectively
    cache : bool
        flag to indicate whether to register the combined masks as intermediate result

    Returns
    -------
    list of WeldObject
        representation of the computations, one for each DataFrame; each one takes
        its own row of the combined result, sliced to the length of the first label
        array of that DataFrame

    """
    assert len(indexes) == 2
    assert len(indexes[0]) == len(indexes[1]) == 3

    # flatten into [first0, first1, first2, second0, second1, second2]
    labels = []
    for level_arrays in indexes:
        labels.extend(level_arrays)

    # a single weld object computes both masks; remember the weld name of every input
    merge_obj = WeldObject(_encoder, _decoder)
    input_names = []

    for label_array in labels:
        name = merge_obj.update(label_array)
        if isinstance(label_array, WeldObject):
            # lazy inputs are referenced through their obj_id and tracked as dependencies
            name = label_array.obj_id
            merge_obj.dependencies[name] = label_array

        input_names.append(name)

    merge_code = """
    let len1 = len(%(array1)s);
    let len2 = len(%(array4)s);
    # bool arrays shall be padded until maxLen so that result can be cached as np.ndarray of ndim=2
    let maxlen = if(len1 > len2, len1, len2);
    let indexes1 = {%(array1)s, %(array2)s, %(array3)s};
    let indexes2 = {%(array4)s, %(array5)s, %(array6)s};
    let res = if(len1 > 0L && len2 > 0L,
                iterate({0L, 0L, appender[bool], appender[bool]},
                |p|
                    let val1 = {lookup(indexes1.$0, p.$0), lookup(indexes1.$1, p.$0), lookup(indexes1.$2, p.$0)};
                    let val2 = {lookup(indexes2.$0, p.$1), lookup(indexes2.$1, p.$1), lookup(indexes2.$2, p.$1)};
                    
                    let iter_output = 
                        if(val1.$0 == val2.$0,
                            if(val1.$1 == val2.$1,
                                if(val1.$2 == val2.$2,
                                    {p.$0 + 1L, p.$1 + 1L, merge(p.$2, true), merge(p.$3, true)},
                                    if(val1.$2 < val2.$2,
                                        {p.$0 + 1L, p.$1, merge(p.$2, false), p.$3},
                                        {p.$0, p.$1 + 1L, p.$2, merge(p.$3, false)}
                                    )
                                ),
                                if(val1.$1 < val2.$1,
                                    {p.$0 + 1L, p.$1, merge(p.$2, false), p.$3},
                                    {p.$0, p.$1 + 1L, p.$2, merge(p.$3, false)}
                                )
                            ),
                            if(val1.$0 < val2.$0,
                                {p.$0 + 1L, p.$1, merge(p.$2, false), p.$3},
                                {p.$0, p.$1 + 1L, p.$2, merge(p.$3, false)}
                            )
                        );
                    {
                        iter_output,
                        iter_output.$0 < len1 && 
                        iter_output.$1 < len2
                    }
                ),
                {0L, 0L, appender[bool], appender[bool]}
    );
    # iterate over remaining un-checked elements in both arrays and append False until maxLen
    let res = if(res.$0 < maxlen, iterate(res,
            |p|
                {
                    {p.$0 + 1L, p.$1, merge(p.$2, false), p.$3},
                    p.$0 + 1L < maxlen
                }
    ), res);
    let res = if(res.$1 < maxlen, iterate(res,
            |p|
                {
                    {p.$0, p.$1 + 1L, p.$2, merge(p.$3, false)},
                    p.$1 + 1L < maxlen
                }
    ), res);
    let b = appender[vec[bool]];
    let c = merge(b, result(res.$2));
    result(merge(c, result(res.$3)))"""

    template_args = {
        'array1': input_names[0],
        'array2': input_names[1],
        'array3': input_names[2],
        'array4': input_names[3],
        'array5': input_names[4],
        'array6': input_names[5],
    }
    merge_obj.weld_code = merge_code % template_args

    # lazy result holding both (padded) masks as one 2-dimensional bool array
    both_masks = LazyResult(merge_obj, WeldBit(), 2)

    # one weld object per DataFrame, each reading from the combined result
    mask_objs = []
    mask_source_names = []

    if cache:
        # register the combined result as an intermediate result and let the
        # per-DataFrame objects refer to it through the generated id
        intermediate_id = LazyResult.generate_intermediate_id('mindex_merge')
        LazyResult.register_intermediate_result(WeldObject.generate_input_name(intermediate_id),
                                                both_masks)

        for _ in range(2):
            mask_obj = WeldObject(_encoder, _decoder)

            source_name = mask_obj.update(intermediate_id)
            assert source_name is not None

            mask_objs.append(mask_obj)
            mask_source_names.append(source_name)
    else:
        # no caching: depend directly on the weld object of the combined result
        for _ in range(2):
            mask_obj = WeldObject(_encoder, _decoder)

            source_name = mask_obj.update(both_masks.expr)
            assert source_name is None
            source_name = both_masks.expr.obj_id
            mask_obj.dependencies[source_name] = both_masks.expr

            mask_objs.append(mask_obj)
            mask_source_names.append(source_name)

    # the first label array of each DataFrame provides the real (un-padded) length
    length_names = []
    for side in range(2):
        first_level = labels[side * 3]
        length_name = mask_objs[side].update(first_level)
        if isinstance(first_level, WeldObject):
            length_name = first_level.obj_id
            mask_objs[side].dependencies[length_name] = first_level
        length_names.append(length_name)

    # take row `i` of the combined result and cut the padding off
    mask_code = """slice(lookup(%(array)s, %(i)s), 0L, len(%(col)s))"""

    for side, mask_obj in enumerate(mask_objs):
        mask_obj.weld_code = mask_code % {'array': mask_source_names[side],
                                          'i': str(side) + 'L',
                                          'col': length_names[side]}

    return mask_objs
Esempio n. 27
0
def weld_groupby_mean(grouped_df, by_types, columns_types):
    """ Computes, for every group, the mean of each column

    For each entry of the grouped data, every column is summed with a merger
    and the sum is divided by the number of elements of the group; both are
    converted to f64, hence all resulting columns are f64.

    Parameters
    ----------
    grouped_df : WeldObject
        DataFrame which has been grouped through weld_groupby
    by_types : list of WeldType
        corresponding to by
    columns_types : list of WeldType
        corresponding to columns

    Returns
    -------
    WeldObject
        representation of the computation

    """
    def as_struct(parts):
        # weld struct literal/type out of its comma separated members
        return '{%s}' % ', '.join(parts)

    mean_obj = WeldObject(_encoder, _decoder)

    # grouped_df is a WeldObject, for which update returns no name (asserted below);
    # it is therefore referenced through its obj_id and tracked as a dependency
    df_name = mean_obj.update(grouped_df)

    assert df_name is None

    df_name = grouped_df.obj_id
    mean_obj.dependencies[df_name] = grouped_df

    mean_code = """
    tovec(
        result(
            for(
                %(grouped_df)s,
                dictmerger[%(by_types)s, %(columns_types)s, +],
                |b, i, e|
                    let group_res = for(e.$1,
                        %(mergers)s,
                        |c, j, f|
                            %(merger_ops)s
                    );
                    
                    merge(b, {e.$0, %(merger_res)s})
            )
        )
    )"""

    # NOTE: the inner aggregation should be the following, but it is unsupported by Weld:
    #     let merged =
    #         result(
    #             for(e.$1,
    #                 merger[%(columns_types)s, %(operation)s],
    #                 |c, j, f|
    #                     merge(c, f)
    #             )
    #         );

    n_columns = len(columns_types)

    key_struct = as_struct(map(str, by_types))
    # means are always floating point, regardless of the column type
    value_struct = as_struct(['f64'] * n_columns)
    # one sum-merger per column, typed like the column
    merger_struct = as_struct('merger[%s, +]' % str(column_type) for column_type in columns_types)
    merge_struct = as_struct('merge(c.$%s, f.$%s)' % (str(pos), str(pos)) for pos in range(n_columns))
    # sum / count of the group for each column
    mean_struct = as_struct('f64(result(group_res.$%s)) / f64(len(e.$1))' % str(pos) for pos in range(n_columns))

    mean_obj.weld_code = mean_code % {'grouped_df': df_name,
                                      'mergers': merger_struct,
                                      'merger_ops': merge_struct,
                                      'merger_res': mean_struct,
                                      'by_types': key_struct,
                                      'columns_types': value_struct}

    return mean_obj
Esempio n. 28
0
 def __init__(self, value, weldty):
     """Keep the value and its Weld type and hand the value to a fresh WeldObject.

     Parameters
     ----------
     value
         data to wrap; passed to ``WeldObject.update`` together with ``WeldInt``
     weldty
         Weld type associated with the value; only stored on the instance here
         (NOTE(review): ``update`` receives ``WeldInt`` rather than ``weldty`` -
         confirm this is intended)

     """
     self.value, self.weldty = value, weldty
     weld_object = WeldObject(NumpyArrayEncoder(), NumpyArrayDecoder())
     self.weldobj = weld_object
     # the name returned by update is not needed here
     weld_object.update(value, WeldInt)
Esempio n. 29
0
def compute_year_month(time):
    """ Applies the udf_yearmonth C UDF to time through its map method

    The shared library ``udf_yearmonth.so`` is loaded from the current
    working directory before the mapping is created.

    Parameters
    ----------
    time
        object exposing ``map(weld_template, mapping)``

    Returns
    -------
    whatever ``time.map`` returns for the UDF template

    """
    # the UDF binary is expected next to where the process has been started
    udf_library = os.getcwd() + '/udf_yearmonth.so'
    WeldObject.load_binary(udf_library)

    # no extra template arguments are supplied besides the %(self)s placeholder
    return time.map("cudf[udf_yearmonth, vec[vec[i8]]](%(self)s)", {})
def cartesian_product_indices(arrays, cache=True):
    """ Computes the cartesian product of all arrays, expressed as indices

    The whole product is computed once as a 2-dimensional result; every
    returned element looks up its own row in that result.

    Parameters
    ----------
    arrays : list of (np.ndarray or LazyResult)
        arrays taking part in the product; at least 2 are required
    cache : bool, optional
        flag to indicate whether to register the whole product as intermediate result

    Returns
    -------
    list of LazyResult
        one per input array, holding the indices into that array

    Raises
    ------
    ValueError
        if fewer than 2 arrays are given

    Examples
    --------
    >>> cartesian_product_indices([np.array([1, 2]), np.array([3, 4])])
    [[0, 0, 1, 1], [0, 1, 0, 1]]

    See also
    --------
    pandas.MultiIndex

    """
    n_arrays = len(arrays)
    if n_arrays < 2:
        raise ValueError('expected at least 2 arrays')

    # the entire product, i.e. all index columns, as a single 2-dimensional result
    product = LazyResult(_cartesian_product_indices(arrays), WeldLong(), 2)

    # one weld object per input array, each reading from the whole product
    column_objs = []
    source_names = []
    if cache:
        # register the product as intermediate result and refer to it through the generated id
        intermediate_id = LazyResult.generate_intermediate_id('cartesian_product')
        LazyResult.register_intermediate_result(WeldObject.generate_input_name(intermediate_id),
                                                product)

        for _ in range(n_arrays):
            column_obj = WeldObject(_encoder, _decoder)

            source_name = column_obj.update(intermediate_id)
            assert source_name is not None

            column_objs.append(column_obj)
            source_names.append(source_name)
    else:
        # no caching: depend directly on the weld object computing the product
        for _ in range(n_arrays):
            column_obj = WeldObject(_encoder, _decoder)

            source_name = column_obj.update(product.expr)
            assert source_name is None
            source_name = product.expr.obj_id
            column_obj.dependencies[source_name] = product.expr

            column_objs.append(column_obj)
            source_names.append(source_name)

    # row `i` of the product holds the indices for the i-th array
    lookup_code = """lookup(%(array)s, %(i)sL)"""
    for position, (column_obj, source_name) in enumerate(zip(column_objs, source_names)):
        column_obj.weld_code = lookup_code % {
            'array': source_name,
            'i': str(position)
        }

    return [LazyResult(column_obj, WeldLong(), 1) for column_obj in column_objs]