Beispiel #1
0
    def equal_to(self, *fields):
        """
        Continues a Join transformation.

        Defines the Tuple fields of the second join DataSet that should be used as join keys.
        Note: Fields can only be selected as join keys on Tuple DataSets.

        The second DataSet is mapped to ``(key_tuple, record)`` pairs and ``key2`` is set to the
        positions ``0..len(fields)-1`` (presumably indexing into that key tuple).

        :param fields: The indexes of the Tuple fields of the second join DataSet that should be used as keys.
            If the first argument is a plain function or a KeySelectorFunction, it is used to compute a
            single-element key tuple instead.
        :return: An incomplete Join Transformation.
        """
        first = fields[0]
        is_plain_function = isinstance(first, TYPES.FunctionType)
        is_key_selector = isinstance(first, KeySelectorFunction)
        # A KeySelectorFunction takes precedence over a plain function.
        if is_key_selector:
            extract_key = lambda record: (first.get_key(record), )
        elif is_plain_function:
            extract_key = lambda record: (first(record), )
        else:
            extract_key = lambda record: tuple([record[index] for index in fields])

        join_info = self._info
        keyed_other = join_info.other_set.map(lambda record: (extract_key(record), record))
        other_info = keyed_other._info
        other_info.types = _createKeyValueTypeInfo(len(fields))
        join_info.other = other_info
        other_info.parallelism = join_info.parallelism
        other_info.children.append(join_info)
        join_info.key2 = tuple(range(len(fields)))
        self._env._sets.append(join_info)
        return JoinOperator(self._env, join_info)
Beispiel #2
0
    def equal_to(self, *fields):
        """
        Continues a Join transformation.

        Defines the Tuple fields of the second join DataSet that should be used as join keys.
        Note: Fields can only be selected as join keys on Tuple DataSets.

        The second DataSet is mapped to ``(key_tuple, record)`` pairs and ``key2`` is set to the
        positions ``0..len(fields)-1`` (presumably indexing into that key tuple).

        :param fields: The indexes of the Tuple fields of the second join DataSet that should be used as keys.
            If the first argument is a plain function or a KeySelectorFunction, it is used to compute a
            single-element key tuple instead.
        :return: An incomplete Join Transformation.
        """
        head = fields[0]
        head_is_function = isinstance(head, TYPES.FunctionType)
        head_is_selector = isinstance(head, KeySelectorFunction)
        # A KeySelectorFunction takes precedence over a plain function.
        if head_is_selector:
            key_of = lambda row: (head.get_key(row),)
        elif head_is_function:
            key_of = lambda row: (head(row),)
        else:
            key_of = lambda row: tuple([row[index] for index in fields])

        info = self._info
        other_with_keys = info.other_set.map(lambda row: (key_of(row), row))
        other = other_with_keys._info
        other.types = _createKeyValueTypeInfo(len(fields))
        info.other = other
        other.parallelism = info.parallelism
        other.children.append(info)
        info.key2 = tuple(range(len(fields)))
        self._env._sets.append(info)
        return JoinOperator(self._env, info)
Beispiel #3
0
    def _finalize(self):
        """
        Rewrites the grouping and its chained sort operations to use a prepended key tuple.

        The parent operator's ``map`` is replaced so that every record ``x`` becomes
        ``(key_tuple, x)``. ``key_tuple`` holds, in this order:

        1. the values of all index-based grouping/sort fields (de-duplicated, ascending),
        2. the result of the grouping key function, if the grouping uses one,
        3. the results of the function-based sort keys, in chain order.

        Afterwards ``grouping.keys`` and the ``field`` of every sort operation are
        overwritten with the matching positions inside ``key_tuple``.
        """
        function_types = (TYPES.FunctionType, KeySelectorFunction)
        grouping = self._child_chain[0]
        sortings = self._child_chain[1:]
        # Evaluated once; grouping.keys is not modified until the final rewrite below.
        group_by_function = isinstance(grouping.keys[0], function_types)

        # Set of used index keys to prevent duplicates and determine the final index.
        index_keys = set()
        if not group_by_function:
            index_keys.update(grouping.keys)

        # Sorts addressing a field by index / sorts using a key function.
        index_sorts = []
        function_sorts = []
        for sort_op in sortings:
            if isinstance(sort_op.field, function_types):
                function_sorts.append(sort_op)
            else:
                index_keys.add(sort_op.field)
                index_sorts.append(sort_op)

        used_keys = sorted(index_keys)

        # Build the extractors; ``k=key`` binds the index at definition time.
        extractors = [lambda x, k=key: x[k] for key in used_keys]
        # NOTE(review): function keys are invoked directly as ``extract(x)``;
        # assumes KeySelectorFunction instances are callable - confirm.
        if group_by_function:
            extractors.append(grouping.keys[0])
        # Capture the key functions now; the ``field`` attributes are overwritten below.
        extractors.extend([sort_op.field for sort_op in function_sorts])

        grouping.parent.operator.map = lambda x: (
            tuple([extract(x) for extract in extractors]), x)
        grouping.parent.types = _createKeyValueTypeInfo(len(extractors))

        # Re-point keys/fields at positions inside the key tuple.
        next_offset = len(used_keys)
        if group_by_function:
            grouping.keys = (next_offset, )
            next_offset += 1
        else:
            grouping.keys = tuple(
                [used_keys.index(key) for key in grouping.keys])

        for sort_op in index_sorts:
            sort_op.field = used_keys.index(sort_op.field)

        for sort_op in function_sorts:
            sort_op.field = next_offset
            next_offset += 1
Beispiel #4
0
 def _distinct(self, fields):
     """
     Appends a DISTINCT operation keyed on the given fields to this DataSet.

     Also overwrites this DataSet's type info with a key/value type info
     sized by the number of fields.

     :param fields: The key fields stored on the new operation.
     :return: The DataSet wrapping the new DISTINCT operation.
     """
     parent_info = self._info
     parent_info.types = _createKeyValueTypeInfo(len(fields))

     distinct_info = OperationInfo()
     distinct_set = DataSet(self._env, distinct_info)
     distinct_info.identifier = _Identifier.DISTINCT
     distinct_info.parent = parent_info
     distinct_info.keys = fields

     # Register the new operation with both its parent and the environment.
     parent_info.children.append(distinct_info)
     self._env._sets.append(distinct_info)
     return distinct_set
Beispiel #5
0
 def _distinct(self, fields):
     """
     Appends a DISTINCT operation keyed on the given fields to this DataSet.

     Also overwrites this DataSet's type info with a key/value type info
     sized by the number of fields.

     :param fields: The key fields stored on the new operation.
     :return: The DataSet wrapping the new DISTINCT operation.
     """
     source_info = self._info
     source_info.types = _createKeyValueTypeInfo(len(fields))

     op_info = OperationInfo()
     result = DataSet(self._env, op_info)
     op_info.identifier = _Identifier.DISTINCT
     op_info.parent = source_info
     op_info.keys = fields

     # Link the operation into the plan: parent -> child, and environment registry.
     source_info.children.append(op_info)
     self._env._sets.append(op_info)
     return result
Beispiel #6
0
    def _finalize(self):
        """
        Rewrites the grouping and its chained sort operations to use a prepended key tuple.

        The parent operator's ``map`` is replaced so that every record ``x`` becomes
        ``(key_tuple, x)``. ``key_tuple`` holds, in this order:

        1. the values of all index-based grouping/sort fields (de-duplicated, ascending),
        2. the result of the grouping key function, if the grouping uses one,
        3. the results of the function-based sort keys, in chain order.

        Afterwards ``grouping.keys`` and the ``field`` of every sort operation are
        overwritten with the matching positions inside ``key_tuple``.
        """
        key_function_types = (TYPES.FunctionType, KeySelectorFunction)
        grouping = self._child_chain[0]
        sortings = self._child_chain[1:]
        # Evaluated once; grouping.keys is not modified until the final rewrite below.
        grouped_by_function = isinstance(grouping.keys[0], key_function_types)

        # Set of used index keys to prevent duplicates and determine the final index.
        index_keys = set()
        if not grouped_by_function:
            index_keys.update(grouping.keys)

        # Sorts addressing a field by index / sorts using a key function.
        index_sorts = []
        ksl_sorts = []
        for sorting in sortings:
            if isinstance(sorting.field, key_function_types):
                ksl_sorts.append(sorting)
            else:
                index_keys.add(sorting.field)
                index_sorts.append(sorting)

        used_keys = sorted(index_keys)

        # Build the extractors; ``k=key`` binds the index at definition time.
        extractors = [lambda x, k=key: x[k] for key in used_keys]
        # NOTE(review): function keys are invoked directly as ``extract(x)``;
        # assumes KeySelectorFunction instances are callable - confirm.
        if grouped_by_function:
            extractors.append(grouping.keys[0])
        # Capture the key functions now; the ``field`` attributes are overwritten below.
        extractors.extend([sorting.field for sorting in ksl_sorts])

        grouping.parent.operator.map = lambda x: (tuple([extract(x) for extract in extractors]), x)
        grouping.parent.types = _createKeyValueTypeInfo(len(extractors))

        # Re-point keys/fields at positions inside the key tuple.
        ksl_offset = len(used_keys)
        if grouped_by_function:
            grouping.keys = (ksl_offset,)
            ksl_offset += 1
        else:
            grouping.keys = tuple([used_keys.index(key) for key in grouping.keys])

        for sorting in index_sorts:
            sorting.field = used_keys.index(sorting.field)

        for sorting in ksl_sorts:
            sorting.field = ksl_offset
            ksl_offset += 1
Beispiel #7
0
    def _finalize(self):
        """
        Rewrites the parent operator so that grouping works on a prepended key tuple.

        Every record ``x`` emitted by the parent is turned into ``(key_tuple, x)``.
        ``key_tuple`` is a single-element tuple when the first grouping key is a plain
        function or a KeySelectorFunction, otherwise it holds ``x[key]`` for each key.
        ``grouping.keys`` is then replaced by the positions ``0..len(keys)-1``.
        """
        grouping = self._child_chain[0]
        original_keys = grouping.keys
        is_plain_function = isinstance(original_keys[0], TYPES.FunctionType)
        is_key_selector = isinstance(original_keys[0], KeySelectorFunction)
        # A KeySelectorFunction takes precedence over a plain function.
        if is_key_selector:
            extract_key = lambda record: (original_keys[0].get_key(record),)
        elif is_plain_function:
            extract_key = lambda record: (original_keys[0](record),)
        else:
            extract_key = lambda record: tuple([record[key] for key in original_keys])

        parent = grouping.parent
        parent.operator.map = lambda record: (extract_key(record), record)
        parent.types = _createKeyValueTypeInfo(len(original_keys))
        # The extractors above keep using ``original_keys``; only the grouping is re-pointed.
        grouping.keys = tuple(range(len(original_keys)))
Beispiel #8
0
    def _finalize(self):
        """
        Rewrites the parent operator so that grouping works on a prepended key tuple.

        Every record ``x`` emitted by the parent is turned into ``(key_tuple, x)``.
        ``key_tuple`` is a single-element tuple when the first grouping key is a plain
        function or a KeySelectorFunction, otherwise it holds ``x[key]`` for each key.
        ``grouping.keys`` is then replaced by the positions ``0..len(keys)-1``.
        """
        grouping = self._child_chain[0]
        group_keys = grouping.keys
        uses_function = isinstance(group_keys[0], TYPES.FunctionType)
        uses_selector = isinstance(group_keys[0], KeySelectorFunction)
        # A KeySelectorFunction takes precedence over a plain function.
        if uses_selector:
            key_of = lambda row: (group_keys[0].get_key(row), )
        elif uses_function:
            key_of = lambda row: (group_keys[0](row), )
        else:
            key_of = lambda row: tuple([row[key] for key in group_keys])

        parent_info = grouping.parent
        parent_info.operator.map = lambda row: (key_of(row), row)
        parent_info.types = _createKeyValueTypeInfo(len(group_keys))
        # The extractors above keep using ``group_keys``; only the grouping is re-pointed.
        grouping.keys = tuple(range(len(group_keys)))
Beispiel #9
0
    def _partition_by_hash(self, fields):
        """
        Hash-partitions a DataSet on the specified key fields.
        Important: This operation shuffles the whole DataSet over the network and can take significant amount of time.

        Also overwrites this DataSet's type info with a key/value type info sized by the number of fields.

        :param fields: The field indexes on which the DataSet is hash-partitioned.
        :return: The partitioned DataSet.
        """
        parent_info = self._info
        parent_info.types = _createKeyValueTypeInfo(len(fields))

        partition_info = OperationInfo()
        partitioned = DataSet(self._env, partition_info)
        partition_info.identifier = _Identifier.PARTITION_HASH
        partition_info.parent = parent_info
        partition_info.keys = fields

        # Register the new operation with both its parent and the environment.
        parent_info.children.append(partition_info)
        self._env._sets.append(partition_info)
        return partitioned
Beispiel #10
0
    def _partition_by_hash(self, fields):
        """
        Hash-partitions a DataSet on the specified key fields.
        Important: This operation shuffles the whole DataSet over the network and can take significant amount of time.

        Also overwrites this DataSet's type info with a key/value type info sized by the number of fields.

        :param fields: The field indexes on which the DataSet is hash-partitioned.
        :return: The partitioned DataSet.
        """
        source_info = self._info
        source_info.types = _createKeyValueTypeInfo(len(fields))

        op_info = OperationInfo()
        result = DataSet(self._env, op_info)
        op_info.identifier = _Identifier.PARTITION_HASH
        op_info.parent = source_info
        op_info.keys = fields

        # Link the operation into the plan: parent -> child, and environment registry.
        source_info.children.append(op_info)
        self._env._sets.append(op_info)
        return result
Beispiel #11
0
    def equal_to(self, *fields):
        """
        Continues a CoGroup transformation.

        Defines the Tuple fields of the second co-grouped DataSet that should be used as grouping keys.
        Note: Fields can only be selected as grouping keys on Tuple DataSets.

        The second DataSet is mapped to ``(key_tuple, record)`` pairs before being attached to the operation.

        :param fields: The indexes of the Tuple fields of the second co-grouped DataSet that should be used as keys.
            If the first argument is a plain function or a KeySelectorFunction, it is used to compute a
            single-element key tuple instead.
        :return: An incomplete CoGroup transformation.
        """
        first = fields[0]
        is_plain_function = isinstance(first, TYPES.FunctionType)
        is_key_selector = isinstance(first, KeySelectorFunction)
        # A KeySelectorFunction takes precedence over a plain function.
        if is_key_selector:
            extract_key = lambda record: (first.get_key(record),)
        elif is_plain_function:
            extract_key = lambda record: (first(record),)
        else:
            extract_key = lambda record: tuple([record[index] for index in fields])

        cogroup_info = self._info
        keyed_other = cogroup_info.other_set.map(lambda record: (extract_key(record), record))
        other_info = keyed_other._info
        other_info.types = _createKeyValueTypeInfo(len(fields))
        cogroup_info.other = other_info
        other_info.children.append(cogroup_info)
        # NOTE(review): key2 keeps the caller's raw fields rather than positions
        # inside the prepended key tuple - confirm this is what downstream expects.
        cogroup_info.key2 = fields
        return CoGroupOperatorUsing(self._env, cogroup_info)
Beispiel #12
0
    def equal_to(self, *fields):
        """
        Continues a CoGroup transformation.

        Defines the Tuple fields of the second co-grouped DataSet that should be used as grouping keys.
        Note: Fields can only be selected as grouping keys on Tuple DataSets.

        The second DataSet is mapped to ``(key_tuple, record)`` pairs before being attached to the operation.

        :param fields: The indexes of the Tuple fields of the second co-grouped DataSet that should be used as keys.
            If the first argument is a plain function or a KeySelectorFunction, it is used to compute a
            single-element key tuple instead.
        :return: An incomplete CoGroup transformation.
        """
        head = fields[0]
        head_is_function = isinstance(head, TYPES.FunctionType)
        head_is_selector = isinstance(head, KeySelectorFunction)
        # A KeySelectorFunction takes precedence over a plain function.
        if head_is_selector:
            key_of = lambda row: (head.get_key(row), )
        elif head_is_function:
            key_of = lambda row: (head(row), )
        else:
            key_of = lambda row: tuple([row[index] for index in fields])

        info = self._info
        other_with_keys = info.other_set.map(lambda row: (key_of(row), row))
        other = other_with_keys._info
        other.types = _createKeyValueTypeInfo(len(fields))
        info.other = other
        other.children.append(info)
        # NOTE(review): key2 keeps the caller's raw fields rather than positions
        # inside the prepended key tuple - confirm this is what downstream expects.
        info.key2 = fields
        return CoGroupOperatorUsing(self._env, info)