Example #1
def _union(rdd: RDD, other: RDD, func):
    num_partition = max(rdd.getNumPartitions(), other.getNumPartitions())

    def _func(pair):
        # cogroup yields (values_from_self, values_from_other); a key missing
        # on one side arrives as an empty iterable.
        iter1, iter2 = pair
        val1 = list(iter1)
        val2 = list(iter2)
        if not val1:
            return val2[0]
        if not val2:
            return val1[0]
        return func(val1[0], val2[0])

    return _map_value(rdd.cogroup(other, num_partition), _func)
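The `_map_value` helper is not shown here; a minimal sketch, assuming it simply applies a function to each value of a pair RDD, together with a hypothetical usage:

from pyspark import SparkContext

def _map_value(rdd, func):
    # Assumed behaviour: apply func to the value of every (key, value) pair.
    return rdd.mapValues(func)

sc = SparkContext.getOrCreate()
a = sc.parallelize([("k1", 1), ("k2", 2)])
b = sc.parallelize([("k2", 10), ("k3", 3)])
# Keys present on both sides are combined with func; one-sided keys pass through.
print(sorted(_union(a, b, lambda x, y: x + y).collect()))
# [('k1', 1), ('k2', 12), ('k3', 3)]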
Example #2
@classmethod
def from_rdd(cls, rdd: RDD, job_id: str, namespace: str, name: str):
    # Wrap an existing RDD in an RDDTable, inheriting its partition count.
    partitions = rdd.getNumPartitions()
    return RDDTable(session_id=job_id,
                    namespace=namespace,
                    name=name,
                    partitions=partitions,
                    rdd=rdd)
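Usage is a one-liner; the job id and names below are hypothetical:

table = RDDTable.from_rdd(rdd, job_id="job_20210101", namespace="experiment", name="train_input")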
Example #3
def _tmp_table_from_rdd(self, rdd: RDD, name=None):
    """
    Create a temporary table; its namespace is bound to the job id.
    """
    rdd = materialize(rdd)
    name = name or str(uuid.uuid1())
    return RDDTable(session_id=self._session_id,
                    namespace=self._namespace,
                    name=name,
                    partitions=rdd.getNumPartitions(),
                    rdd=rdd,
                    dtable=None)
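The `materialize` helper is not shown; in this style of codebase it typically persists the RDD and forces evaluation. A minimal sketch under that assumption:

from pyspark import RDD, StorageLevel

def materialize(rdd: RDD) -> RDD:
    # Assumed behaviour: cache and force computation so the temporary
    # table is backed by evaluated data rather than a lazy lineage.
    rdd = rdd.persist(StorageLevel.MEMORY_AND_DISK)
    rdd.count()
    return rdd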
Example #4
from numbers import Number

def _check_data(train: RDD = None, test: RDD = None) -> RDD:
    # Data-type check: validate whichever RDD is supplied and return it.
    if isinstance(train, RDD):
        # Every record must be at least (user, item, rating) with a numeric rating.
        is_legal_train = train.map(
            lambda u: len(u) >= 3 and u[0] is not None and u[1] is not None
            and isinstance(u[2], Number)).reduce(lambda u1, u2: u1 and u2)
        if not is_legal_train:
            raise ValueError(
                "Parameter train should be an RDD<(user, item, rating)>")
        return train

    if isinstance(test, RDD):
        # Every record must be at least (user, item) with non-null fields.
        is_legal_test = test.map(lambda u: len(u) >= 2 and u[0] is not None
                                 and u[1] is not None).reduce(
                                     lambda u1, u2: u1 and u2)
        if not is_legal_test:
            raise ValueError(
                "Parameter test should be an RDD<(user, item)>")
        return test

    raise ValueError("Either train or test must be supplied as an RDD.")
Example #5
def repar_rdd(rdd: RDD,
              rdd_count: int,
              example_per_par=100000,
              coalesce_only=True):
    """
    repar rdd based on number of example. if coalesce_only is False and expected
    partition is greater than current partition then nothing will happen
    """
    num_partition = rdd.getNumPartitions()
    expect_partition = max(1, int(rdd_count / example_per_par))

    if expect_partition < num_partition:
        rdd = rdd.coalesce(expect_partition)
    elif expect_partition > num_partition and coalesce_only is False:
        rdd = rdd.repartition(expect_partition)

    return rdd
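For example, shrinking an over-partitioned RDD (values here are hypothetical, `sc` as above):

rdd = sc.parallelize(range(1000000), numSlices=64)
# 1,000,000 examples / 100,000 per partition => 10 expected partitions,
# fewer than the current 64, so a shuffle-free coalesce is applied.
rdd = repar_rdd(rdd, rdd_count=1000000)
print(rdd.getNumPartitions())  # 10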
Example #6
    def __call__(self, rdd: RDD, **kwargs: Any) -> RDD:
        """
        Performs a single step of an algorithm, running all operations in sequence
        and ensuring data is partitioned correctly.

        Any additional keyword arguments passed to this function will be available
        in all life-cycle functions of the step:
        - `group`
        - `emit_by_group`
        - `broadcast`
        - `step`

        **DO NOT OVERRIDE WHEN DEFINING CUSTOM STEPS.**
        """
        if rdd.getNumPartitions() != self._n_partitions:
            rdd = rdd.repartition(self._n_partitions)

        step_cls: Type[Step] = self.__class__
        rdd = step_cls.group(
            rdd, **kwargs
        ).cache()  # cache because we use it twice (emit and step)

        def unwrap_emit(kv: Tuple[Any, Iterable[Any]]) -> Optional[Tuple[Any, Any]]:
            k, v = kv
            new_v = step_cls.emit_by_group(k, v, **kwargs)
            return new_v

        emitted = list(rdd.map(unwrap_emit, preservesPartitioning=True).collect())
        to_broadcast = step_cls.broadcast(emitted, **kwargs)
        broadcast: Broadcast = self._sc.broadcast(to_broadcast)

        def unwrap_step(kv: Tuple[Any, Iterable[Any]]) -> Iterable[Any]:
            k, v = kv
            for new_v in step_cls.step(k, v, broadcast, **kwargs):
                yield new_v

        rdd = rdd.flatMap(unwrap_step, preservesPartitioning=True)
        return rdd
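The four life-cycle hooks are resolved on the concrete subclass, so a custom step only overrides those. A hypothetical subclass sketch (everything beyond the four hook names is an assumption about the surrounding framework):

class GroupCount(Step):
    @staticmethod
    def group(rdd, **kwargs):
        # Key each record by its first field and gather records per key.
        return rdd.map(lambda x: (x[0], x)).groupByKey()

    @staticmethod
    def emit_by_group(k, v, **kwargs):
        # Emit one summary per group, e.g. its size.
        return (k, sum(1 for _ in v))

    @staticmethod
    def broadcast(emitted, **kwargs):
        # Fold the collected per-group summaries into one broadcastable value.
        return dict(emitted)

    @staticmethod
    def step(k, v, broadcast, **kwargs):
        # Produce output records, reading the broadcast group sizes.
        sizes = broadcast.value
        for record in v:
            yield (k, record, sizes[k])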
Example #7
def _zipWithIndex(rdd: RDD, to_rows: bool = False) -> RDD:
    """
    Modified from
    https://github.com/davies/spark/blob/cebe5bfe263baf3349353f1473f097396821514a/python/pyspark/rdd.py

    """
    # Compute each partition's starting global offset from per-partition counts.
    starts = [0]
    if rdd.getNumPartitions() > 1:
        nums = rdd.mapPartitions(lambda it: [sum(1 for i in it)]).collect()
        for i in range(len(nums) - 1):
            starts.append(starts[-1] + nums[i])

    def func1(k, it):  # pragma: no cover
        # Yield (global_index, value) pairs.
        for i, v in enumerate(it, starts[k]):
            yield i, v

    def func2(k, it):  # pragma: no cover
        # Append the global index as the last column of each row.
        for i, v in enumerate(it, starts[k]):
            yield list(v) + [i]

    if not to_rows:
        return rdd.mapPartitionsWithIndex(func1)
    else:
        return rdd.mapPartitionsWithIndex(func2)
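Usage sketch with `sc` as above; with to_rows=True each element must itself be iterable, since the index is appended as a final column:

rdd = sc.parallelize(["a", "b", "c"], 2)
print(_zipWithIndex(rdd).collect())
# [(0, 'a'), (1, 'b'), (2, 'c')] -- indices are global across partitions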
Example #8
def _join(rdd: RDD, other: RDD, func=None):
    num_partitions = max(rdd.getNumPartitions(), other.getNumPartitions())
    rtn_rdd = rdd.join(other, numPartitions=num_partitions)
    if func is not None:
        # Joined values are (left, right) tuples; combine them with func.
        rtn_rdd = _map_value(rtn_rdd, lambda x: func(x[0], x[1]))
    return rtn_rdd
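Assuming the same `_map_value` helper sketched under Example #1, usage looks like:

a = sc.parallelize([("k", 1)])
b = sc.parallelize([("k", 10)])
print(_join(a, b).collect())                      # [('k', (1, 10))]
print(_join(a, b, lambda x, y: x + y).collect())  # [('k', 11)]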
Example #9
def _subtract_by_key(rdd: RDD, other: RDD):
    return rdd.subtractByKey(other, rdd.getNumPartitions())
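Usage sketch: keep the pairs whose key does not appear in the other RDD, preserving the left RDD's partition count (`sc` as above):

a = sc.parallelize([("k1", 1), ("k2", 2)])
b = sc.parallelize([("k2", 99)])
print(_subtract_by_key(a, b).collect())  # [('k1', 1)]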