Example #1
0
 def _map_fields(self, field_map, field_sources, field_sinks):
     """
     Apply 'field_map' to each entry of 'field_sources', writing the mapped
     data to the corresponding entry of 'field_sinks' when sinks are given.

     If 'field_sinks' is None, a new array is created for each source and the
     mapped arrays are returned as a tuple; otherwise the sinks receive the
     mapped data and None is returned.
     """
     if field_sinks is None:
         # No sinks provided: build and return the mapped arrays.
         mapped = tuple(
             ops.map_valid(
                 val.array_from_parameter(self, 'left_field_sources', source),
                 field_map)
             for source in field_sources)
         return mapped

     if val.is_field_parameter(field_sinks[0]):
         # Sinks are groups or fields: write mapped data via the field API.
         for source, sink in zip(field_sources, field_sinks):
             source_ = val.array_from_parameter(self, 'left_field_sources',
                                                source)
             mapped_ = ops.map_valid(source_, field_map)
             sink_ = val.field_from_parameter(self, 'left_field_sinks', sink)
             sink_.data.write(mapped_)
     else:
         # Sinks are raw arrays: map directly into them.
         for source, sink in zip(field_sources, field_sinks):
             source_ = val.array_from_parameter(self, 'left_field_sources',
                                                source)
             sink_ = val.array_from_parameter(self, 'left_field_sinks', sink)
             ops.map_valid(source_, field_map, sink_)
     return None
Example #2
0
    def get_spans(self, field=None, fields=None):
        """
        Calculate a set of spans that indicate contiguous equal values.
        The entries in the result array correspond to the inclusive start and
        exclusive end of the span (the ith span is represented by element i and
        element i+1 of the result array). The last entry of the result array is
        the length of the source field.

        Only one of 'field' or 'fields' may be set. If 'fields' is used and more
        than one field specified, the fields are effectively zipped and the check
        for spans is carried out on each corresponding tuple in the zipped field.

        Example:
            field: [1, 2, 2, 1, 1, 1, 3, 4, 4, 4, 2, 2, 2, 2, 2]
            result: [0, 1, 3, 6, 7, 10, 15]

        :param field: a single group/field/array to calculate spans over
        :param fields: a sequence of group/field/arrays to be zipped and
            have spans calculated over the zipped tuples
        :return: the span boundaries, as produced by per._get_spans
        :raises ValueError: if neither or both of 'field' and 'fields' are set
        """
        if field is None and fields is None:
            raise ValueError("One of 'field' and 'fields' must be set")
        if field is not None and fields is not None:
            raise ValueError("Only one of 'field' and 'fields' may be set")

        raw_field = None
        if field is not None:
            raw_field = val.array_from_parameter(self, 'field', field)

        # raw_fields is always a list (empty when 'fields' is unset); the
        # single-field case is signalled to per._get_spans via raw_field.
        raw_fields = []
        if fields is not None:
            raw_fields = [
                val.array_from_parameter(self, "'fields[{}]'".format(i_f), f)
                for i_f, f in enumerate(fields)]
        return per._get_spans(raw_field, raw_fields)
Example #3
0
    def apply_index(self, index_to_apply, src, dest=None):
        """
        Apply an index to a src field. The indexed field is written to dest if
        it is set, and returned from the function call. If the field is an
        IndexedStringField, the indices and values are returned separately.

        :param index_to_apply: the index to be applied to the source field
        :param src: the field to be indexed
        :param dest: optional - a field to write the indexed data to
        :return: the indexed values; for an IndexedStringField, a
            (dest_indices, dest_values) pair
        """
        index_to_apply_ = val.array_from_parameter(self, 'index_to_apply',
                                                   index_to_apply)
        writer_ = None
        if dest is not None:
            writer_ = val.field_from_parameter(self, 'writer', dest)
        if isinstance(src, fld.IndexedStringField):
            # Indexed strings are stored as (indices, values); both must be
            # remapped together.
            src_ = val.field_from_parameter(self, 'reader', src)
            dest_indices, dest_values = \
                ops.apply_indices_to_index_values(index_to_apply_,
                                                  src_.indices[:],
                                                  src_.values[:])
            if writer_ is not None:
                writer_.indices.write(dest_indices)
                writer_.values.write(dest_values)
            return dest_indices, dest_values
        else:
            reader_ = val.array_from_parameter(self, 'reader', src)
            # Index with the validated array (was the raw parameter), so
            # group/field index parameters behave consistently with the
            # indexed-string branch above.
            result = reader_[index_to_apply_]
            # Explicit None check (was truthiness): an "empty" dest field must
            # still be written to.
            if writer_ is not None:
                writer_.data.write(result)
            return result
Example #4
0
    def _apply_spans_src(self, predicate, spans, src, dest=None):
        """
        Run 'predicate' over each span of 'src' described by 'spans', writing
        one result per span.

        :param predicate: a callable of (spans, src_array, results) that fills
            'results' with one value per span
        :param spans: span boundaries; spans[-1] must equal the source length
        :param src: the group/field/array the spans were calculated over
        :param dest: optional - a field to write the per-span results to; its
            data dtype is used for the result array when set
        :return: the per-span results as a numpy array
        :raises ValueError: if the source length does not match spans[-1]
        """
        assert (dest is None or isinstance(dest, fld.Field))
        src_ = val.array_from_parameter(self, 'src', src)
        # Validate the validated array (the raw parameter may be a
        # group/field); spans[-1] is the exclusive end of the last span, i.e.
        # the expected source length.
        if len(src_) != spans[-1]:
            error_msg = (
                "'src' (length {}) must cover the full range described by "
                "'spans' (expected length {})")
            raise ValueError(error_msg.format(len(src_), spans[-1]))

        if dest is not None:
            dest_f = val.field_from_parameter(self, 'dest', dest)
            out_dtype = dest_f.data.dtype
        else:
            dest_f = None
            out_dtype = src_.dtype

        results = np.zeros(len(spans) - 1, dtype=out_dtype)
        predicate(spans, src_, results)
        if dest_f is not None:
            dest_f.data.write(results)
        return results
Example #5
0
    def ordered_merge_left(self,
                           left_on,
                           right_on,
                           right_field_sources=tuple(),
                           left_field_sinks=None,
                           left_to_right_map=None,
                           left_unique=False,
                           right_unique=False):
        """
        Generate the results of a left join and apply it to the fields described
        in the tuple 'right_field_sources'. If 'left_field_sinks' is set, the
        mapped values are written to the fields / arrays set there.
        Note: in order to achieve best scalability, you should use groups /
        fields rather than numpy arrays and provide a tuple of groups/fields to
        left_field_sinks, so that the session can compute the merge and apply
        the mapping in a streaming fashion.
        :param left_on: the group/field/numpy array that contains the left key
        values
        :param right_on: the group/field/numpy array that contains the right key
        values
        :param left_to_right_map: a group/field/numpy array that the map is
        written to. If it is a numpy array, it must be the size of the resulting
        merge
        :param right_field_sources: a tuple of group/fields/numpy arrays that
        contain the fields to be joined
        :param left_field_sinks: optional - a tuple of group/fields/numpy arrays
        that the mapped fields should be written to
        :param left_unique: a hint to indicate whether the 'left_on' field
        contains unique values
        :param right_unique: a hint to indicate whether the 'right_on' field
        contains unique values
        :return: If left_field_sinks is not set, a tuple of the output fields is
        returned
        :raises ValueError: if sources and sinks differ in length, or if the
        right key is not flagged as unique
        """
        if left_field_sinks is not None:
            if len(right_field_sources) != len(left_field_sinks):
                msg = (
                    "{} and {} should be of the same length but are length {} and {} "
                    "respectively")
                # The template takes the two parameter names followed by their
                # lengths (previously only the lengths were supplied, so
                # raising this error crashed with an IndexError).
                raise ValueError(
                    msg.format('right_field_sources', 'left_field_sinks',
                               len(right_field_sources),
                               len(left_field_sinks)))
        val.all_same_basic_type('right_field_sources', right_field_sources)
        if left_field_sinks and len(left_field_sinks) > 0:
            val.all_same_basic_type('left_field_sinks', left_field_sinks)

        # The merge can be streamed only when keys, sources and sinks are all
        # fields/groups and a map destination is provided.
        streamable = val.is_field_parameter(left_on) and \
                     val.is_field_parameter(right_on) and \
                     val.is_field_parameter(right_field_sources[0]) and \
                     left_field_sinks is not None and \
                     val.is_field_parameter(left_field_sinks[0]) and \
                     left_to_right_map is not None

        # A left join maps each left row to at most one right row, which
        # requires the right key to be unique regardless of the left key.
        if not right_unique:
            raise ValueError("Right key must not have duplicates")

        if not left_unique:
            if streamable:
                has_unmapped = \
                    ops.ordered_map_to_right_right_unique_streamed(
                        left_on, right_on, left_to_right_map)
                result = left_to_right_map
            else:
                result = np.zeros(len(left_on), dtype=np.int64)
                left_data = val.array_from_parameter(self, "left_on", left_on)
                right_data = val.array_from_parameter(self, "right_on",
                                                      right_on)
                has_unmapped = \
                    ops.ordered_map_to_right_right_unique(
                        left_data, right_data, result)
        else:
            result = np.zeros(len(left_on), dtype=np.int64)
            left_data = val.array_from_parameter(self, "left_on", left_on)
            right_data = val.array_from_parameter(self, "right_on", right_on)
            has_unmapped = ops.ordered_map_to_right_both_unique(
                left_data, right_data, result)

        if streamable:
            self._streaming_map_fields(result, right_field_sources,
                                       left_field_sinks)
            return None
        return self._map_fields(result, right_field_sources, left_field_sinks)
Example #6
0
    def ordered_merge_inner(self,
                            left_on,
                            right_on,
                            left_field_sources=tuple(),
                            left_field_sinks=None,
                            right_field_sources=tuple(),
                            right_field_sinks=None,
                            left_unique=False,
                            right_unique=False):
        """
        Generate the results of an inner join and apply them to the fields in
        'left_field_sources' and 'right_field_sources'. If the corresponding
        sinks are set, the mapped values are written to them.

        :param left_on: the group/field/array containing the left key values
        :param right_on: the group/field/array containing the right key values
        :param left_field_sources: a tuple of group/fields/arrays containing
        the left-side fields to be joined
        :param left_field_sinks: optional - a tuple of group/fields/arrays the
        mapped left-side fields are written to
        :param right_field_sources: a tuple of group/fields/arrays containing
        the right-side fields to be joined
        :param right_field_sinks: optional - a tuple of group/fields/arrays the
        mapped right-side fields are written to
        :param left_unique: a hint to indicate whether 'left_on' contains
        unique values
        :param right_unique: a hint to indicate whether 'right_on' contains
        unique values
        :return: the mapped left and/or right output fields for whichever of
        the sinks were not set; None if both were set
        :raises ValueError: if sources and sinks differ in length
        """
        if left_field_sinks is not None:
            if len(left_field_sources) != len(left_field_sinks):
                msg = (
                    "{} and {} should be of the same length but are length {} and {} "
                    "respectively")
                # The template takes the two parameter names followed by their
                # lengths (previously only the lengths were supplied, so
                # raising this error crashed with an IndexError).
                raise ValueError(
                    msg.format('left_field_sources', 'left_field_sinks',
                               len(left_field_sources),
                               len(left_field_sinks)))
        val.all_same_basic_type('left_field_sources', left_field_sources)
        if left_field_sinks and len(left_field_sinks) > 0:
            val.all_same_basic_type('left_field_sinks', left_field_sinks)

        if right_field_sinks is not None:
            if len(right_field_sources) != len(right_field_sinks):
                msg = (
                    "{} and {} should be of the same length but are length {} and {} "
                    "respectively")
                raise ValueError(
                    msg.format('right_field_sources', 'right_field_sinks',
                               len(right_field_sources),
                               len(right_field_sinks)))
        val.all_same_basic_type('right_field_sources', right_field_sources)
        if right_field_sinks and len(right_field_sinks) > 0:
            val.all_same_basic_type('right_field_sinks', right_field_sinks)

        left_data = val.array_from_parameter(self, 'left_on', left_on)
        right_data = val.array_from_parameter(self, 'right_on', right_on)

        inner_length = ops.ordered_inner_map_result_size(left_data, right_data)

        left_to_inner = np.zeros(inner_length, dtype=np.int64)
        right_to_inner = np.zeros(inner_length, dtype=np.int64)
        if left_unique is False:
            if right_unique is False:
                ops.ordered_inner_map(left_data, right_data, left_to_inner,
                                      right_to_inner)
            else:
                # Only the right key is unique: reuse the left-unique variant
                # with the sides swapped.
                ops.ordered_inner_map_left_unique(right_data, left_data,
                                                  right_to_inner,
                                                  left_to_inner)
        else:
            if right_unique is False:
                ops.ordered_inner_map_left_unique(left_data, right_data,
                                                  left_to_inner,
                                                  right_to_inner)
            else:
                ops.ordered_inner_map_both_unique(left_data, right_data,
                                                  left_to_inner,
                                                  right_to_inner)

        rtn_left_sinks = self._map_fields(left_to_inner, left_field_sources,
                                          left_field_sinks)
        rtn_right_sinks = self._map_fields(right_to_inner, right_field_sources,
                                           right_field_sinks)

        if rtn_left_sinks:
            if rtn_right_sinks:
                return rtn_left_sinks, rtn_right_sinks
            else:
                return rtn_left_sinks
        else:
            return rtn_right_sinks