Ejemplo n.º 1
0
def weld_merge_triple_index(indexes, cache=True):
    """ Computes the bool keep-masks for merging two 3-level MultiIndexes

    Both inputs are walked in lockstep; a row is marked True only when all three
    of its labels equal the labels of the row currently pointed at in the other input.

    Note duplicates are NOT handled correctly and the indexes MUST be already sorted

    Parameters
    ----------
    indexes : list of list
        of np.array or WeldObject
        exactly 2 lists, each holding the 3 label arrays of the first and
        second DataFrame MultiIndex, respectively
    cache : bool
        flag to indicate whether to cache result as intermediate result

    Returns
    -------
    list of WeldObject
        2 objects, one bool array per DataFrame; each is sliced to the length
        of the first label array of its own index

    """
    assert len(indexes) == 2
    assert len(indexes[0]) == len(indexes[1]) == 3

    def _bind(target, value):
        # register value as an input of target; a WeldObject input is referenced
        # through its own obj_id and recorded as a dependency
        name = target.update(value)
        if isinstance(value, WeldObject):
            name = value.obj_id
            target.dependencies[name] = value
        return name

    # the 6 label arrays in order: 3 of the first index followed by 3 of the second
    labels = []
    for level_arrays in indexes:
        labels.extend(level_arrays)

    # single weld object computing both (padded) bool arrays
    merge_obj = WeldObject(_encoder, _decoder)
    input_names = [_bind(merge_obj, label) for label in labels]

    merge_code = """
    let len1 = len(%(array1)s);
    let len2 = len(%(array4)s);
    # bool arrays shall be padded until maxLen so that result can be cached as np.ndarray of ndim=2
    let maxlen = if(len1 > len2, len1, len2);
    let indexes1 = {%(array1)s, %(array2)s, %(array3)s};
    let indexes2 = {%(array4)s, %(array5)s, %(array6)s};
    let res = if(len1 > 0L && len2 > 0L,
                iterate({0L, 0L, appender[bool], appender[bool]},
                |p|
                    let val1 = {lookup(indexes1.$0, p.$0), lookup(indexes1.$1, p.$0), lookup(indexes1.$2, p.$0)};
                    let val2 = {lookup(indexes2.$0, p.$1), lookup(indexes2.$1, p.$1), lookup(indexes2.$2, p.$1)};
                    
                    let iter_output = 
                        if(val1.$0 == val2.$0,
                            if(val1.$1 == val2.$1,
                                if(val1.$2 == val2.$2,
                                    {p.$0 + 1L, p.$1 + 1L, merge(p.$2, true), merge(p.$3, true)},
                                    if(val1.$2 < val2.$2,
                                        {p.$0 + 1L, p.$1, merge(p.$2, false), p.$3},
                                        {p.$0, p.$1 + 1L, p.$2, merge(p.$3, false)}
                                    )
                                ),
                                if(val1.$1 < val2.$1,
                                    {p.$0 + 1L, p.$1, merge(p.$2, false), p.$3},
                                    {p.$0, p.$1 + 1L, p.$2, merge(p.$3, false)}
                                )
                            ),
                            if(val1.$0 < val2.$0,
                                {p.$0 + 1L, p.$1, merge(p.$2, false), p.$3},
                                {p.$0, p.$1 + 1L, p.$2, merge(p.$3, false)}
                            )
                        );
                    {
                        iter_output,
                        iter_output.$0 < len1 && 
                        iter_output.$1 < len2
                    }
                ),
                {0L, 0L, appender[bool], appender[bool]}
    );
    # iterate over remaining un-checked elements in both arrays and append False until maxLen
    let res = if(res.$0 < maxlen, iterate(res,
            |p|
                {
                    {p.$0 + 1L, p.$1, merge(p.$2, false), p.$3},
                    p.$0 + 1L < maxlen
                }
    ), res);
    let res = if(res.$1 < maxlen, iterate(res,
            |p|
                {
                    {p.$0, p.$1 + 1L, p.$2, merge(p.$3, false)},
                    p.$1 + 1L < maxlen
                }
    ), res);
    let b = appender[vec[bool]];
    let c = merge(b, result(res.$2));
    result(merge(c, result(res.$3)))"""

    merge_obj.weld_code = merge_code % dict(array1=input_names[0],
                                            array2=input_names[1],
                                            array3=input_names[2],
                                            array4=input_names[3],
                                            array5=input_names[4],
                                            array6=input_names[5])

    # both bool arrays live in this one 2-dimensional result
    merged = LazyResult(merge_obj, WeldBit(), 2)

    table_objs = []
    mask_names = []

    if cache:
        # register the merged result once and let both outputs read it by id
        intermediate_id = LazyResult.generate_intermediate_id('mindex_merge')
        input_name = WeldObject.generate_input_name(intermediate_id)
        LazyResult.register_intermediate_result(input_name, merged)

        for _ in range(2):
            table_obj = WeldObject(_encoder, _decoder)

            mask_name = table_obj.update(intermediate_id)
            assert mask_name is not None

            table_objs.append(table_obj)
            mask_names.append(mask_name)
    else:
        # no caching: each output depends directly on the merge expression
        for _ in range(2):
            table_obj = WeldObject(_encoder, _decoder)

            mask_name = table_obj.update(merged.expr)
            assert mask_name is None
            mask_name = merged.expr.obj_id
            table_obj.dependencies[mask_name] = merged.expr

            table_objs.append(table_obj)
            mask_names.append(mask_name)

    # the first label array of each index provides the un-padded length of its mask
    length_names = [_bind(table_objs[pos], labels[pos * 3]) for pos in range(2)]

    slice_code = """slice(lookup(%(array)s, %(i)s), 0L, len(%(col)s))"""

    for pos, table_obj in enumerate(table_objs):
        table_obj.weld_code = slice_code % {'array': mask_names[pos],
                                            'i': str(pos) + 'L',
                                            'col': length_names[pos]}

    return table_objs
Ejemplo n.º 2
0
def cartesian_product_indices(arrays, cache=True):
    """ Builds the cartesian product of all arrays, expressed as positions

    The product is computed once as a 2-dimensional result; every returned
    LazyResult looks up one row of it, i.e. the positions into one input array.

    Parameters
    ----------
    arrays : list of (np.ndarray or LazyResult)
        the arrays taking part in the product; at least 2 are required
    cache : bool, optional
        flag to indicate whether to cache result as intermediate result

    Returns
    -------
    list of LazyResult
        one lazy array of indices per input array

    Raises
    ------
    ValueError
        if fewer than 2 arrays are passed

    Examples
    --------
    Values shown are those of the evaluated results

    >>> cartesian_product_indices([np.array([1, 2]), np.array([3, 4])])
    [[0, 0, 1, 1], [0, 1, 0, 1]]

    See also
    --------
    pandas.MultiIndex

    """
    n_arrays = len(arrays)
    if n_arrays < 2:
        raise ValueError('expected at least 2 arrays')

    # holds the complete product: one row of indices per input array
    product = LazyResult(_cartesian_product_indices(arrays), WeldLong(), 2)

    # one weld object per output column, each reading from the shared product
    column_objs = []
    product_names = []
    if cache:
        intermediate_id = LazyResult.generate_intermediate_id('cartesian_product')
        input_name = WeldObject.generate_input_name(intermediate_id)
        LazyResult.register_intermediate_result(input_name, product)

        for _ in range(n_arrays):
            column_obj = WeldObject(_encoder, _decoder)

            product_name = column_obj.update(intermediate_id)
            assert product_name is not None

            column_objs.append(column_obj)
            product_names.append(product_name)
    else:
        for _ in range(n_arrays):
            column_obj = WeldObject(_encoder, _decoder)

            product_name = column_obj.update(product.expr)
            assert product_name is None
            product_name = product.expr.obj_id
            column_obj.dependencies[product_name] = product.expr

            column_objs.append(column_obj)
            product_names.append(product_name)

    lookup_code = """lookup(%(array)s, %(i)sL)"""
    for position, column_obj in enumerate(column_objs):
        column_obj.weld_code = lookup_code % {'array': product_names[position],
                                              'i': str(position)}

    columns = []
    for column_obj in column_objs:
        columns.append(LazyResult(column_obj, WeldLong(), 1))

    return columns
Ejemplo n.º 3
0
def weld_merge_single_index(indexes, cache=True):
    """ Returns bool arrays for which indexes shall be kept

    The two indexes are walked in lockstep: equal values mark True in both
    outputs and advance both positions, otherwise the side holding the smaller
    value is marked False and advanced. Because of that, the indexes MUST be
    already sorted and duplicate elements are NOT handled correctly.

    An empty index is supported; its counterpart is then entirely False.

    Parameters
    ----------
    indexes : list of np.array or WeldObject
        the two indexes to merge; only the first two entries are merged
    cache : bool
        flag to indicate whether to cache result as intermediate result

    Returns
    -------
    list of WeldObject
        representation of the computations; 2 objects, each a bool array of
        the same length as the corresponding input index

    Examples
    --------
    >>> index1 = np.array([1, 3, 4, 5, 6])
    >>> index2 = np.array([2, 3, 5])
    >>> result = weld_merge_single_index([index1, index2])
    >>> LazyResult(result[0], WeldBit(), 1).evaluate(verbose=False)
    [False True False True False]
    >>> LazyResult(result[1], WeldBit(), 1).evaluate(verbose=False)
    [False True True]

    """
    weld_obj = WeldObject(_encoder, _decoder)
    weld_ids = []
    for array in indexes:
        array_var = weld_obj.update(array)
        # WeldObject inputs are referenced by their own id and tracked as dependencies
        if isinstance(array, WeldObject):
            array_var = array.obj_id
            weld_obj.dependencies[array_var] = array
        weld_ids.append(array_var)

    weld_template = """
    let len1 = len(%(array1)s);
    let len2 = len(%(array2)s);
    # bool arrays shall be padded until maxLen so that result can be cached as np.ndarray of ndim=2
    let maxlen = if(len1 > len2, len1, len2);
    # iterate evaluates its body before checking the flag, so skip it when either array is empty
    let res = if(len1 > 0L && len2 > 0L,
                iterate({0L, 0L, appender[bool], appender[bool]},
                |p|
                    let val1 = lookup(%(array1)s, p.$0);
                    let val2 = lookup(%(array2)s, p.$1);
                    let iter_output =
                        if(val1 == val2,
                            {p.$0 + 1L, p.$1 + 1L, merge(p.$2, true), merge(p.$3, true)},
                            if(val1 < val2,
                                {p.$0 + 1L, p.$1, merge(p.$2, false), p.$3},
                                {p.$0, p.$1 + 1L, p.$2, merge(p.$3, false)}
                            )
                        );

                    {
                        iter_output,
                        iter_output.$0 < len1 &&
                        iter_output.$1 < len2
                    }
                ),
                {0L, 0L, appender[bool], appender[bool]}
    );
    # iterate over remaining un-checked elements in both arrays
    let res = if (res.$0 < maxlen, iterate(res,
            |p|
                {
                    {p.$0 + 1L, p.$1, merge(p.$2, false), p.$3},
                    p.$0 + 1L < maxlen
                }
    ), res);
    let res = if (res.$1 < maxlen, iterate(res,
            |p|
                {
                    {p.$0, p.$1 + 1L, p.$2, merge(p.$3, false)},
                    p.$1 + 1L < maxlen
                }
    ), res);
    let b = appender[vec[bool]];
    let c = merge(b, result(res.$2));
    result(merge(c, result(res.$3)))"""

    weld_obj.weld_code = weld_template % {'array1': weld_ids[0],
                                          'array2': weld_ids[1]}
    # this has both required bool arrays into 1 ndarray; note that arrays have been padded with False until of same len
    # TODO: this could still be a single vec/array with the arrays concatenated instead to avoid decoder with ndim=2 mallocs
    result = LazyResult(weld_obj, WeldBit(), 2)

    # creating the actual results to return
    weld_objects = []
    weld_ids = []
    weld_col_ids = []

    if cache:
        # register the merged result once; both outputs then read it through its id
        id_ = LazyResult.generate_intermediate_id('sindex_merge')
        weld_input_id = WeldObject.generate_input_name(id_)
        LazyResult.register_intermediate_result(weld_input_id, result)

        for _ in range(2):
            weld_obj = WeldObject(_encoder, _decoder)

            result_var = weld_obj.update(id_)
            assert result_var is not None

            weld_objects.append(weld_obj)
            weld_ids.append(result_var)
    else:
        # no caching: each output depends directly on the merge expression
        for _ in range(2):
            weld_obj = WeldObject(_encoder, _decoder)

            result_var = weld_obj.update(result.expr)
            assert result_var is None
            result_var = result.expr.obj_id
            weld_obj.dependencies[result_var] = result.expr

            weld_objects.append(weld_obj)
            weld_ids.append(result_var)

    # need 1 array from each resulting tables to get actual length
    for i in range(2):
        array_var = weld_objects[i].update(indexes[i])
        if isinstance(indexes[i], WeldObject):
            array_var = indexes[i].obj_id
            weld_objects[i].dependencies[array_var] = indexes[i]
        weld_col_ids.append(array_var)

    # row i of the merged result, cut back from the padded length to the real one
    weld_templ = """slice(lookup(%(array)s, %(i)s), 0L, len(%(col)s))"""

    for i in range(2):
        weld_objects[i].weld_code = weld_templ % {'array': weld_ids[i],
                                                  'i': str(i) + 'L',
                                                  'col': weld_col_ids[i]}

    return weld_objects