Example #1
    def test_multiple_python_java_RDD_conversions(self):
        # Regression test for SPARK-5361
        data = [("1", {"director": "David Lean"}), ("2", {"director": "Andrew Dominik"})]
        data_rdd = self.sc.parallelize(data)
        data_java_rdd = data_rdd._to_java_object_rdd()
        data_python_rdd = self.sc._jvm.SerDeUtil.javaToPython(data_java_rdd)
        converted_rdd = RDD(data_python_rdd, self.sc)
        self.assertEqual(2, converted_rdd.count())

        # conversion between python and java RDD threw exceptions
        data_java_rdd = converted_rdd._to_java_object_rdd()
        data_python_rdd = self.sc._jvm.SerDeUtil.javaToPython(data_java_rdd)
        converted_rdd = RDD(data_python_rdd, self.sc)
        self.assertEqual(2, converted_rdd.count())
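
Outside of a test class, the same round trip can be sketched as below, assuming a live SparkContext; note that _to_java_object_rdd, _jvm and SerDeUtil are private Spark internals, so this is illustrative rather than a supported API:

from pyspark import SparkContext
from pyspark.rdd import RDD

sc = SparkContext.getOrCreate()
rdd = sc.parallelize([("1", {"director": "David Lean"})])
java_rdd = rdd._to_java_object_rdd()                               # Python -> Java
round_tripped = RDD(sc._jvm.SerDeUtil.javaToPython(java_rdd), sc)  # Java -> Python
assert round_tripped.count() == 1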
Example #2
def train_model(data: RDD, l=1.0) -> MLNaiveBayesModel:
    # Per label: (document count, element-wise sum of term-frequency vectors).
    aggregated = data.flatMap(lambda x:
                              [(label, x['features']) for label in x['labels']]) \
        .combineByKey(lambda v: (1, v),
                      lambda c, v: (c[0] + 1, c[1] + v),
                      lambda c1, c2: (c1[0] + c2[0], c1[1] + c2[1])) \
        .sortBy(lambda x: x[0]) \
        .collect()
    num_labels = len(aggregated)
    num_documents = data.count()
    num_features = aggregated[0][1][1].size
    labels = np.zeros(num_labels)
    # pi holds log-priors, so it must be a float array (an int dtype would
    # truncate the log values).
    pi = np.zeros(num_labels)
    theta = np.zeros((num_labels, num_features))
    pi_log_denom = math.log(num_documents + num_labels * l)
    for i, (label, (n, sum_term_freq)) in enumerate(aggregated):
        labels[i] = label
        pi[i] = math.log(n + l) - pi_log_denom
        # Laplace-smoothed per-feature log-likelihoods.
        sum_term_freq_dense = sum_term_freq.toarray()
        theta_log_denom = math.log(sum_term_freq.sum() + num_features * l)
        theta[i, :] = np.log(sum_term_freq_dense + l) - theta_log_denom
    return MLNaiveBayesModel(labels, pi, theta)
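
A minimal way to exercise train_model, assuming each input row is a dict with numeric 'labels' and a SciPy CSR row for 'features' (an assumption inferred from the toarray() call; a CSR matrix's .size counts stored entries, so the toy rows below keep every entry non-zero), and that train_model and MLNaiveBayesModel are importable from the surrounding project:

import numpy as np
from pyspark import SparkContext
from scipy.sparse import csr_matrix

sc = SparkContext.getOrCreate()
rows = [
    {'labels': [0], 'features': csr_matrix(np.array([[1.0, 2.0, 2.0]]))},
    {'labels': [1], 'features': csr_matrix(np.array([[4.0, 3.0, 1.0]]))},
]
model = train_model(sc.parallelize(rows), l=1.0)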
Example #3
    def __call__(self, head: RDD):
        if self.distinct and not self.approximate:
            head = head.distinct()
        if self.explained:
            self._log.info("toDebugString():\n%s", head.toDebugString().decode())
        if not self.approximate or not self.distinct:
            return head.count()
        # Both approximate and distinct: estimate the distinct count
        # (HyperLogLog) instead of materializing distinct().count().
        return head.countApproxDistinct()
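
For context, a minimal sketch of the wrapper object such a __call__ could belong to; the class name and constructor are hypothetical, and only the attributes read above are implied by the snippet:

import logging

class Counter:  # hypothetical name
    def __init__(self, distinct: bool = False, approximate: bool = False,
                 explained: bool = False):
        self.distinct = distinct        # deduplicate before counting
        self.approximate = approximate  # allow an estimated distinct count
        self.explained = explained      # log the RDD lineage before counting
        self._log = logging.getLogger(type(self).__name__)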
Example #4
    def test_multiple_python_java_RDD_conversions(self):
        # Regression test for SPARK-5361
        data = [
            (u'1', {u'director': u'David Lean'}),
            (u'2', {u'director': u'Andrew Dominik'})
        ]
        data_rdd = self.sc.parallelize(data)
        data_java_rdd = data_rdd._to_java_object_rdd()
        data_python_rdd = self.sc._jvm.SerDeUtil.javaToPython(data_java_rdd)
        converted_rdd = RDD(data_python_rdd, self.sc)
        self.assertEqual(2, converted_rdd.count())

        # conversion between python and java RDD threw exceptions
        data_java_rdd = converted_rdd._to_java_object_rdd()
        data_python_rdd = self.sc._jvm.SerDeUtil.javaToPython(data_java_rdd)
        converted_rdd = RDD(data_python_rdd, self.sc)
        self.assertEqual(2, converted_rdd.count())
Example #5
    def evaluate(self, labels_and_predictions: RDD):
        # True positives: rows whose top-n predictions share more than
        # self._intersect_n elements with the true labels.
        tp = labels_and_predictions \
            .map(lambda x: (set(x[0]),
                            set(p for p, w in x[1][:self._pred_n]))) \
            .filter(lambda x: len(x[0].intersection(x[1])) > self._intersect_n)
        accuracy = 100.0 * tp.count() / labels_and_predictions.count()
        if self._verbose:
            print('accuracy: ', accuracy)
        self._results.append(accuracy)
        return accuracy
Example #6
    def __blocking_matrix(self,
                          train: RDD = None,
                          test: RDD = None,
                          similarity=None) -> RDD:
        """
        Divide the matrix into blocks to reduce the number of keys.
        :param train: RDD<(Hashable, Hashable, float)>
            = RDD<bucket, item, rating>
        :param test: RDD<(Hashable, Hashable)>
            = RDD<bucket, item>
        :param similarity: RDD<(Hashable, Hashable, float)>
            RDD<bucket, bucket, similarity>
        :return: RDD<(int, int)(Hashable, Hashable, float)>
            = RDD<(bucket_block, item_block), (bucket, item, rating)> or
              RDD<(bucket_block, bucket_block), (bucket, bucket, similarity)>
        """
        seed = self._seed
        n_bucket_block = self._n_bucket_block
        n_item_block = self._n_item_block
        n_cross_block = self._n_cross_block

        if train is not None:
            train = train.map(lambda u: ((hash2int(
                u[0], max_value=n_cross_block, seed=seed
            ), hash2int(u[1], max_value=n_item_block, seed=seed)), u)).cache()
            train.count()
            return train

        if test is not None:
            test = test.map(lambda u: ((hash2int(
                u[0], max_value=n_bucket_block, seed=seed
            ), hash2int(u[1], max_value=n_item_block, seed=seed)), u)).cache()
            test.count()
            return test

        if similarity is not None:
            # Symmetrize the similarity triples, then key each one by its
            # (bucket block, cross block) pair.
            similarity = similarity \
                .flatMap(lambda u: [(u[0], u[1], u[2]), (u[1], u[0], u[2])]) \
                .map(lambda u: ((hash2int(u[0], max_value=n_bucket_block, seed=seed),
                                 hash2int(u[1], max_value=n_cross_block, seed=seed)), u)) \
                .cache()
            similarity.count()
            return similarity
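
hash2int is not shown on this page; a plausible sketch of such a helper (an assumption, not the project's actual implementation) hashes a key deterministically into the range [0, max_value):

import hashlib
from typing import Hashable

def hash2int(key: Hashable, max_value: int, seed: int = 0) -> int:
    # Deterministic across processes, unlike the built-in hash() under
    # PYTHONHASHSEED randomization.
    digest = hashlib.md5('{}:{}'.format(seed, key).encode()).hexdigest()
    return int(digest, 16) % max_value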
Example #8
    def evaluate(self, labels_and_predictions: RDD) -> float:
        tp = labels_and_predictions \
            .map(lambda x:
                 (set(x[0]),
                  set(features for features, weights in x[1][:self._pred_n]))) \
            .filter(lambda x:
                    len(x[0].intersection(x[1])) >= self._intersect_n)
        accuracy = 100.0 * tp.count() / labels_and_predictions.count()
        if self._verbose:
            print('accuracy: ', accuracy)
        self._results.append(accuracy)
        return accuracy
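
A usage sketch for evaluate, where each element pairs the true labels with a ranked list of (prediction, weight) tuples; the Evaluator name and its constructor are hypothetical, as only the attributes used above are implied:

from pyspark import SparkContext

sc = SparkContext.getOrCreate()
pairs = sc.parallelize([
    (['a', 'b'], [('a', 0.9), ('c', 0.5)]),  # overlap {'a'}: counted
    (['x'],      [('y', 0.8), ('z', 0.1)]),  # no overlap: not counted
])
metric = Evaluator(pred_n=2, intersect_n=1)  # hypothetical constructor
print(metric.evaluate(pairs))                # 50.0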
Example #9
    def run(self, rdd: RDD) -> RDD:  # type: ignore
        rdd = rdd.cache()

        n_points = rdd.count()
        m = n_points / self.n_partitions
        optimal_p = math.log(n_points * self.n_partitions) / m

        rdd = self.assign_buckets(  # type: ignore
            rdd, p=optimal_p, key_func=_label_first_coord_and_type
        )
        rdd = self.sort_and_assign_labels(rdd)  # type: ignore

        return rdd
Example #10
    def run(
        self,
        rdd: RDD,
        key_func: Callable[[Tuple[Any, ...]], Tuple[Any, ...]] = lambda x: x
    ) -> RDD:  # type: ignore
        rdd = rdd.cache()

        n_points = rdd.count()
        m = n_points / self.n_partitions
        optimal_p = math.log(n_points * self.n_partitions) / m

        rdd = self.assign_buckets(rdd, p=optimal_p,
                                  key_func=key_func)  # type: ignore
        rdd = self.sort(rdd, key_func=key_func)  # type: ignore

        return rdd
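
Both run variants derive the bucket-sampling probability the same way; as a standalone sketch of that arithmetic:

import math

n_points, n_partitions = 1_000_000, 64
m = n_points / n_partitions                        # expected points per partition
optimal_p = math.log(n_points * n_partitions) / m  # ~0.00115 for these sizes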
Example #11
def partition_per_row(rdd: RDD) -> RDD:
    """Place each row in an RDD into a separate partition.

    Only useful if each row represents something large to be computed over,
    perhaps an external resource such as a multi-GB training dataset. The Spark
    part of the dataset is expected to be tiny and to easily fit in a single
    partition.
    """
    num_rows = rdd.count()
    # Help out mypy. Also don't use `identity`, as it somehow fails serialization
    partition_fn = cast(Callable[[int], int], lambda x: x)

    return (
        # bring everything together and assign each row a partition id
        rdd.repartition(1).mapPartitions(lambda rows: enumerate(rows))
        # Partition by the new partition id
        .partitionBy(num_rows, partition_fn)
        # Drop the partition id, giving back the original shape
        .map(lambda pair: pair[1]))
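
A quick usage sketch, assuming a live SparkContext:

from pyspark import SparkContext

sc = SparkContext.getOrCreate()
one_per_row = partition_per_row(sc.parallelize(['model_a', 'model_b', 'model_c']))
assert one_per_row.getNumPartitions() == 3
# Every partition now holds exactly one row.
assert all(len(rows) == 1 for rows in one_per_row.glom().collect())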
Example #12
def mean(rdd: RDD) -> float:
	return rdd.sum() / float(rdd.count())
Example #13
def kurtosis(rdd: RDD, mean: float, stdev: float) -> float:
	return rdd.map(lambda x: pow(x-mean, 4)).sum() / (pow(stdev, 4)*rdd.count())
Example #14
def skewness(rdd: RDD, mean: float, stdev: float) -> float:
	return rdd.map(lambda x: pow(x-mean, 3)).sum() / (pow(stdev, 3)*rdd.count())
Example #15
def stdev(rdd: RDD, mean: float) -> float:
	return sqrt(rdd.map(lambda x: pow(x-mean, 2)).sum() / rdd.count())
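
These four moment helpers compose naturally; a small end-to-end sketch, assuming a live SparkContext and the definitions above (stdev additionally needs from math import sqrt):

from pyspark import SparkContext

sc = SparkContext.getOrCreate()
values = sc.parallelize([1.0, 2.0, 2.0, 3.0, 7.0])
mu = mean(values)                    # 3.0
sigma = stdev(values, mu)            # sqrt(22 / 5), about 2.098
print(skewness(values, mu, sigma))   # about 1.17: long right tail
print(kurtosis(values, mu, sigma))   # about 2.83: population kurtosis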