Example #1
 def _compute_tfid(texts: RDD) -> RDD:
     tf = HashingTF().transform(texts.map(lambda t: t.words))
     tf.cache()
     idf = IDF().fit(tf)
     tfidfs = idf.transform(tf)
     text_tfs = texts.zip(tfidfs)
     return text_tfs.map(lambda t: t[0].set_tfidf(t[1]))
Example #2
def train_model(data: RDD, l=1.0) -> MLNaiveBayesModel:
    aggregated = data.flatMap(lambda x:
                              [(l, x['features']) for l in x['labels']]) \
        .combineByKey(lambda v: (1, v),
                      lambda c, v: (c[0] + 1, c[1] + v),
                      lambda c1, c2: (c1[0] + c2[0], c1[1] + c2[1])) \
        .sortBy(lambda x: x[0]) \
        .collect()
    num_labels = len(aggregated)
    num_documents = data.count()
    num_features = aggregated[0][1][1].size
    labels = np.zeros(num_labels)
    pi = np.zeros(num_labels)  # log class priors are floats; an int dtype would truncate them
    theta = np.zeros((num_labels, num_features))
    pi_log_denom = math.log(num_documents + num_labels * l)
    i = 0
    for (label, (n, sum_term_freq)) in aggregated:
        labels[i] = label
        pi[i] = math.log(n + l) - pi_log_denom

        sum_term_freq_dense = sum_term_freq.toarray()
        theta_log_denom = math.log(sum_term_freq.sum() + num_features * l)
        theta[i, :] = np.log(sum_term_freq_dense + l) - theta_log_denom
        i += 1
    return MLNaiveBayesModel(labels, pi, theta)
Example #3
    def __init_parameters(self, train: RDD):
        """
        _n_buckets/_n_items:
            The number of distinct buckets/items in the train RDD.
        _bucket_block_size/_cross_block_size/_item_block_size:
            The size of blocks when dividing buckets/cross buckets/items into blocks.
        _n_bucket_block/_n_cross_block/_n_item_block:
            The number of blocks when dividing buckets/cross buckets/items into blocks.
        """
        self._n_buckets = train.map(lambda u: u[0]).distinct().count()
        if self._n_buckets <= self._k:
            self._k = float("inf")

        # For the bucket dimension.
        if self._bucket_block_size is None:
            # Interpret bucket_block_size from n_bucket_block
            self._bucket_block_size = self._n_buckets // self._n_bucket_block + 1
        else:
            self._n_bucket_block = self._n_buckets // self._bucket_block_size + 1

        # For the cross dimension.
        if self._cross_block_size is None:
            self._cross_block_size = self._n_buckets // self._n_cross_block + 1
        else:
            self._n_cross_block = self._n_buckets // self._cross_block_size + 1

        # For the item dimension
        self._n_items = train.map(lambda u: u[1]).distinct().count()
        if self._item_block_size is None:
            self._item_block_size = self._n_items // self._n_item_block + 1
        else:
            self._n_item_block = self._n_items // self._item_block_size + 1
        return self
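A small worked example of the block-size arithmetic above, with hypothetical numbers (not the class defaults): whichever of block size and block count is missing is derived from the other via integer division plus one, so neither can be zero.

# Hypothetical numbers illustrating the block-size <-> block-count relation used above.
n_buckets = 10

# Given a target number of blocks, derive the block size:
n_bucket_block = 3
bucket_block_size = n_buckets // n_bucket_block + 1   # 10 // 3 + 1 == 4
assert bucket_block_size == 4

# Given a block size, derive the number of blocks:
bucket_block_size = 4
n_bucket_block = n_buckets // bucket_block_size + 1   # 10 // 4 + 1 == 3
assert n_bucket_block == 3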
Example #4
def join_multiple_keys(left: RDD, right: RDD, n: int) -> RDD:
    """
    Join RDDs with multiple keys.
        ((key1, key2, ...), value_left) x (key_i, value_right_i) ->
        ((key1, key2, ...), (value_left, value_right_1, value_right_2, ...))
    :param left: RDD<tuple<int>, value>
    :param right: RDD<int, value>
    :param n: int, the length of the key in left-RDD
    :return: joint RDD.
    """
    left = left.map(
        lambda u: (-1, (u[0], (u[1],)))
    )  # (_, (tuple<key>, tuple<value>))
    right = right.map(
        lambda u: (u[0], (u[1],))
    ).cache()  # (_, tuple<value>)
    for key_order in range(n):
        left = left.map(
            # bind key_order as a default argument so each chained lambda keeps its own key index
            lambda u, key_order=key_order: (u[1][0][key_order], u[1])  # (_, (tuple<key>, tuple<value>))
        ).join(
            right  # (_, ((tuple<key>, tuple<value>), tuple<value>))
        ).map(
            lambda u: (-1, (u[1][0][0], u[1][0][1] + u[1][1]))
        )  # (_, (tuple<key>, tuple<value>))

    left = left.map(
        lambda u: u[1]
    )  # (tuple<key>, tuple<value>)
    return left
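A minimal usage sketch, assuming a live SparkContext and the join_multiple_keys definition above in scope; the composite key (1, 2) collects one right-hand value per key component.

from pyspark import SparkContext

sc = SparkContext.getOrCreate()
left = sc.parallelize([((1, 2), "a")])        # composite key (1, 2)
right = sc.parallelize([(1, "x"), (2, "y")])  # one value per single key
joined = join_multiple_keys(left, right, n=2)
print(joined.collect())  # [((1, 2), ('a', 'x', 'y'))]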
Example #5
    def _evaluate(self, rdd: RDD, **kwargs):
        yaml_model = self.master_network.to_yaml()
        optimizer = deserialize_optimizer(self.master_optimizer)
        loss = self.master_loss
        weights = self.master_network.get_weights()
        weights = rdd.context.broadcast(weights)
        custom_objects = self.custom_objects
        metrics = self.master_metrics

        def _evaluate(model, optimizer, loss, custom_objects, metrics, kwargs,
                      data_iterator):
            model = model_from_yaml(model, custom_objects)
            model.compile(optimizer, loss, metrics)
            model.set_weights(weights.value)
            feature_iterator, label_iterator = tee(data_iterator, 2)
            x_test = np.asarray([x for x, y in feature_iterator])
            y_test = np.asarray([y for x, y in label_iterator])
            return [model.evaluate(x_test, y_test, **kwargs)]

        if self.num_workers:
            rdd = rdd.repartition(self.num_workers)
        results = rdd.mapPartitions(
            partial(_evaluate, yaml_model, optimizer, loss, custom_objects,
                    metrics, kwargs))
        if not metrics:
            # if no metrics, we can just return the scalar corresponding to the loss value
            return results.mean()
        else:
            # if we do have metrics, we want to return a list of [loss value, metric value] - to match the keras API
            loss_value = results.map(lambda x: x[0]).mean()
            metric_value = results.map(lambda x: x[1]).mean()
            return [loss_value, metric_value]
Example #6
def naive_multiplication_rdd(mat_a: pyspark.RDD, mat_b: pyspark.RDD, is_triangle=False):
    """
    Naive distributed matrix multiplication: mat_a is the left matrix, mat_b the right matrix.
    :param mat_a: left matrix, as an RDD of MatrixEntry
    :param mat_b: right matrix, as an RDD of MatrixEntry
    :param is_triangle: if True, mat_a holds one triangle of a symmetric matrix and is mirrored
        (averaging duplicate entries) before the multiplication
    :return: RDD of MatrixEntry for the product
    """
    if is_triangle:
        left_rdd = (
            mat_a.flatMap(lambda x: [((x.j, x.i), x.value), ((x.i, x.j), x.value)])
                .aggregateByKey(zeroValue=(0.0, 0.0),
                                seqFunc=lambda x, y: (x[0]+y, x[1]+1),
                                combFunc=lambda x, y: (x[0] + y[0], x[1]+y[1]))
                .mapValues(lambda x: x[0] / x[1])
                .map(lambda x: (x[0][0], (x[0][1], x[1])))
        )
    else:
        left_rdd = mat_a.map(lambda x: (x.j, (x.i, x.value)))

    right_rdd = mat_b.map(lambda x: (x.i, (x.j, x.value)))
    combined_rdd = (left_rdd.join(right_rdd).map(lambda x: x[1])
        .map(lambda x: ((x[0][0], x[1][0]), x[0][1]*x[1][1]))
        .reduceByKey(lambda x, y: x+y)
        .map(lambda x: distributed.MatrixEntry(i=x[0][0], j=x[0][1], value=x[1]))
    )
    return combined_rdd
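A hedged usage sketch (not part of the original): multiplying a 2x2 matrix by the identity with MatrixEntry records from pyspark.mllib, with the function above in scope.

from pyspark import SparkContext
from pyspark.mllib.linalg.distributed import MatrixEntry

sc = SparkContext.getOrCreate()
# A = [[1, 2], [3, 4]], B = I, so the product equals A.
mat_a = sc.parallelize([MatrixEntry(0, 0, 1.0), MatrixEntry(0, 1, 2.0),
                        MatrixEntry(1, 0, 3.0), MatrixEntry(1, 1, 4.0)])
mat_b = sc.parallelize([MatrixEntry(0, 0, 1.0), MatrixEntry(1, 1, 1.0)])
product = naive_multiplication_rdd(mat_a, mat_b)
print(sorted((e.i, e.j, e.value) for e in product.collect()))
# [(0, 0, 1.0), (0, 1, 2.0), (1, 0, 3.0), (1, 1, 4.0)]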
Example #7
    def build_vocabularies(self, rows: RDD):
        """
        Process rows to gather values and paths with their frequencies.
        :param rows: row structure is ((key, doc), val) where:
            * key: str with the path context
            * doc: file name
            * val: number of occurrences of key in doc
        """

        def _flatten_row(row: Row):
            # 2: removes the namespace v. from the string to parse it as tuple
            k = Vocabulary2Id._unstringify_path_context(row)
            return [(k[0], 1), (k[1], 1), (k[2], 1)]

        rows = rows \
            .flatMap(_flatten_row) \
            .reduceByKey(operator.add) \
            .persist()

        values = rows.filter(lambda x: isinstance(x[0], str)).collect()
        paths = rows.filter(lambda x: isinstance(x[0], tuple)).collect()

        value2index = {w: i for i, (w, _) in enumerate(values)}
        path2index = {w: i for i, (w, _) in enumerate(paths)}
        value2freq = {w: freq for w, freq in values}
        path2freq = {w: freq for w, freq in paths}

        rows.unpersist()

        return value2index, path2index, value2freq, path2freq
Example #8
 def __call__(self, head: RDD):
     if self.distinct and not self.approximate:
         head = head.distinct()
     if self.explained:
         self._log.info("toDebugString():\n%s", head.toDebugString().decode())
     if not self.approximate or not self.distinct:
         return head.count()
     return head.countApproxDistinct()
Example #9
def _flat_map(rdd: RDD, func):
    from itertools import chain

    def _fn(x):
        return func(x[0], x[1])

    def _func(_, iterator):
        return chain.from_iterable(map(fail_on_stopiteration(_fn), iterator))

    return rdd.mapPartitionsWithIndex(_func, preservesPartitioning=False)
Example #10
 def evaluate(self, labels_and_predictions: RDD):
     TP = labels_and_predictions.map(lambda x:
                                 (set(x[0]), set([p for p, w in x[1][:self._pred_n]]))). \
                                 filter(lambda x:
                                        len(x[0].intersection(x[1])) > self._intersect_n)
     accuracy = 100.0 * TP.count() / labels_and_predictions.count()
     if self._verbose:
         print('accuracy: ', accuracy)
     self._results.append(accuracy)
     return accuracy
Example #12
def cStress(rdd: RDD) -> RDD:

    # TODO: TWH Temporary
    ecg_sampling_frequency = 64.0
    rip_sampling_frequency = 64.0
    accel_sampling_frequency = 64.0 / 6.0

    # Timestamp correct datastreams
    ecg_corrected = rdd.map(lambda ds: (
        ds['participant'],
        timestamp_correct(datastream=ds['ecg'],
                          sampling_frequency=ecg_sampling_frequency)))
    rip_corrected = rdd.map(lambda ds: (
        ds['participant'],
        timestamp_correct(datastream=ds['rip'],
                          sampling_frequency=rip_sampling_frequency)))

    accelx_corrected = rdd.map(lambda ds: (
        ds['participant'],
        timestamp_correct(datastream=ds['accelx'],
                          sampling_frequency=accel_sampling_frequency)))
    accely_corrected = rdd.map(lambda ds: (
        ds['participant'],
        timestamp_correct(datastream=ds['accely'],
                          sampling_frequency=accel_sampling_frequency)))
    accelz_corrected = rdd.map(lambda ds: (
        ds['participant'],
        timestamp_correct(datastream=ds['accelz'],
                          sampling_frequency=accel_sampling_frequency)))

    accel_group = accelx_corrected.join(accely_corrected).join(
        accelz_corrected).map(fix_two_joins)
    accel = accel_group.map(lambda ds: (
        ds[0],
        autosense_sequence_align(datastreams=[ds[1][0], ds[1][1], ds[1][2]],
                                 sampling_frequency=accel_sampling_frequency)))

    # Accelerometer Feature Computation
    accel_features = accel.map(
        lambda ds: (ds[0], accelerometer_features(ds[1], window_length=10.0)))

    # rip features
    peak_valley = rip_corrected.map(
        lambda ds: (ds[0], rip.compute_peak_valley(rip=ds[1])))
    rip_features = peak_valley.map(
        lambda ds: (ds[0], rip_feature_computation(ds[1][0], ds[1][1])))

    # r-peak datastream computation
    ecg_rr_rdd = ecg_corrected.map(lambda ds: (ds[
        0], compute_rr_intervals(ds[1], ecg_sampling_frequency)))
    ecg_features = ecg_rr_rdd.map(lambda ds: (ds[
        0], ecg_feature_computation(ds[1], window_size=60, window_offset=60)))

    # return rip_features.join(ecg_features).join(accel_features).map(fix_two_joins)
    return ecg_features
Example #13
 def __preprocessRdd(self, rdd: RDD):
     rddc = rddCorrector()
     rdd = rdd.map(lambda l: rddc.correct(l))
     if rdd is not None:
         if not rdd.isEmpty():
             rdd = rdd.map(lambda l: l.replace("<tweet>", ""))
             rdd = rdd.map(lambda l: l.replace("</tweet>", ""))
             df = DataFrameWorks().convertDataFrame(rdd, self.__spark)
             df = CleanText().clean(df, self.__spark)
             return df
     return None
Example #14
 def evaluate(truth: RDD, prediction: RDD) -> float:
     """
     Calculate RMSE between truth and predictions.
     :param truth: RDD<Hashable, Hashable, float> = RDD<(bucket, item, rating)>
     :param prediction: RDD<Hashable, Hashable, float> = RDD<(bucket, item, rating)>
     :return: float = RMSE
     """
     truth = truth.map(lambda u: ((u[0], u[1]), u[2]))
     prediction = prediction.map(lambda u: ((u[0], u[1]), u[2]))
     return truth.join(prediction).map(lambda u:
                                       (u[1][0] - u[1][1])**2).mean()**0.5
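A quick check of the RMSE helper on two ratings, assuming evaluate is reachable as a plain function (it takes no self) and a SparkContext is available.

from pyspark import SparkContext

sc = SparkContext.getOrCreate()
truth = sc.parallelize([("u1", "i1", 3.0), ("u1", "i2", 4.0)])
prediction = sc.parallelize([("u1", "i1", 2.0), ("u1", "i2", 4.0)])
# squared errors 1.0 and 0.0 -> mean 0.5 -> RMSE = sqrt(0.5) ~= 0.7071
print(evaluate(truth, prediction))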
Example #15
    def convertDataFrame(self, rdd: RDD, SqlObject) -> DataFrame:
        """RDD to DataFrame"""
        #rdd = rdd.map(lambda l: l.replace("½",""))

        rdd = rdd.map(lambda l: (l[:19], l[19:]))
        schema = [StructField("id", StringType(), False),
                  StructField("rawData", StringType(), False),
                  StructField("preprocessedData", ArrayType(elementType=StringType(), containsNull=True), True),
                  StructField("sentiment", FloatType(), True)]
        final_struct = StructType(fields=schema)
        rdd = rdd.map(lambda l: (l[0], l[1], [None], None))
        return SqlObject.createDataFrame(rdd, schema=final_struct)
Example #16
def java_to_python_rdd(sc, rdd, is_pair, is_json):
    jrdd = sc._jvm.SerDe.javaToPython(rdd)
    output = RDD(jrdd, sc)
    if is_pair:
        if is_json:
            return output.map(lambda x: (x.split("\t")[0], json.loads(x.split("\t")[1])))
        else:
            return output.map(lambda x: (x.split("\t")[0], x.split("\t")[1]))

    if is_json:
        return output.map(lambda x: json.loads(x))
    return output
Example #17
    def run(self, rdd: RDD) -> RDD:  # type: ignore
        rdd = rdd.cache()

        n_points = rdd.count()
        m = n_points / self.n_partitions
        optimal_p = math.log(n_points * self.n_partitions) / m

        rdd = self.assign_buckets(  # type: ignore
            rdd, p=optimal_p, key_func=_label_first_coord_and_type
        )
        rdd = self.sort_and_assign_labels(rdd)  # type: ignore

        return rdd
Example #18
 def __call__(self, head: RDD):
     if self.keymap is None:
         return head.coalesce(self.partitions, self.shuffle)
     # partitionBy the key extracted using self.keymap
     try:
         # this checks if keymap is an identity
         probe = self.keymap("probe")
     except:  # noqa: E722
         probe = None
     if probe != "probe":
         head = head.map(lambda x: (self.keymap(x), x))
     return head \
         .partitionBy(self.partitions) \
         .map(lambda x: x[1])
Example #19
def _save_as_func(rdd: RDD, name, namespace, partition, persistent):
    from arch.api import session
    dup = session.table(name=name,
                        namespace=namespace,
                        partition=partition,
                        persistent=persistent)

    def _func(_, it):
        eggroll_util.maybe_create_eggroll_client()
        dup.put_all(list(it))
        return 1,

    rdd.mapPartitionsWithIndex(_func, preservesPartitioning=False).collect()
    return dup
Example #20
def _union(rdd: RDD, other: RDD, func):
    num_partition = max(rdd.getNumPartitions(), other.getNumPartitions())

    def _func(pair):
        iter1, iter2 = pair
        val1 = list(iter1)
        val2 = list(iter2)
        if not val1:
            return val2[0]
        if not val2:
            return val1[0]
        return func(val1[0], val2[0])

    return _map_value(rdd.cogroup(other, num_partition), _func)
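A sketch of what _union computes on toy key-value RDDs, under the assumption that the _map_value helper referenced above simply wraps mapValues; paste it into the same module as the _union definition.

from pyspark import SparkContext

sc = SparkContext.getOrCreate()

def _map_value(rdd, func):  # assumed behaviour of the helper used above
    return rdd.mapValues(func)

a = sc.parallelize([("k1", 1), ("k2", 2)])
b = sc.parallelize([("k2", 20), ("k3", 30)])
merged = _union(a, b, func=lambda x, y: x + y)
print(sorted(merged.collect()))  # [('k1', 1), ('k2', 22), ('k3', 30)]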
Example #21
def transform_online_retail(
    sc: SparkSession,
    raw_rdd: RDD,
    schema: str,
    max_month: Optional[int] = None,
) -> DataFrame:
    """Method to transform online retail dataset to its correct dataformats, specific
    for online retail

    :return:
    """

    # initial transformation of the raw RDD
    raw_rdd = raw_rdd.map(lambda retail: (
        retail[0],  # InvoiceNo
        retail[1],  # StockCode
        retail[2] if retail[2] != '' else None,  # Description
        int(retail[3]),  # Quantity
        datetime.strptime(retail[4], '%d/%m/%Y %H:%M') if
        int(retail[4].split('/')[1]) < max_month
        else datetime.strptime(retail[4], '%m/%d/%Y %H:%M'),  # InvoiceDate
        float(retail[5]),  # UnitPrice
        int(retail[6]) if retail[6] != '' else None,  # CustomerID
        retail[7] if retail[7] != '' else None)  # Country
    )

    return sc.createDataFrame(
        raw_rdd,
        schema=schema
    )
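A hedged usage sketch: the column names come from the comments in the function above, but the exact schema string, types, and sample row are assumptions for illustration.

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
raw_rdd = spark.sparkContext.parallelize([
    ("536365", "85123A", "WHITE HANGING HEART", "6",
     "01/12/2010 08:26", "2.55", "17850", "United Kingdom"),
])
retail_schema = (
    "InvoiceNo string, StockCode string, Description string, Quantity int, "
    "InvoiceDate timestamp, UnitPrice double, CustomerID int, Country string"
)
# max_month=13 keeps day-first date parsing for every row in this sample
df = transform_online_retail(spark, raw_rdd, retail_schema, max_month=13)
df.show(truncate=False)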
Example #22
def extract_claims(wikidata_items: RDD, b_property_map: Broadcast, b_item_map: Broadcast):
    def parse_item_claims(item):
        item_id = item['id']
        item_map = b_item_map.value
        if item_id not in item_map:
            return []
        item_label = item_map[item_id]
        property_map = b_property_map.value
        if 'enwiki' in item['sitelinks']:
            title = item['sitelinks']['enwiki']['title']
        else:
            title = None
        item_claims = []
        for property_id, property_claims in item['claims'].items():
            if property_id in property_map:
                property_name = property_map[property_id]
                for claim in property_claims:
                    mainsnak = claim['mainsnak']
                    if 'datatype' in mainsnak and 'datavalue' in mainsnak:
                        datatype = mainsnak['datatype']
                        datavalue = mainsnak['datavalue']
                        if datatype in datatype_parsers:
                            wiki_object = datatype_parsers[datatype](datavalue)
                            if wiki_object is not None:
                                item_claims.append(
                                    Claim(item_label, property_name, wiki_object, datatype, title, property_id, item_id)
                                )
        return item_claims

    return wikidata_items.flatMap(parse_item_claims)
Example #23
 def __init__(self, dt_index, rdd, jtsrdd=None, sc=None):
     if jtsrdd is None:
         # Construct from a Python RDD object and a Python DateTimeIndex
         jvm = rdd.ctx._jvm
         jrdd = rdd._reserialize(_TimeSeriesSerializer())._jrdd.mapToPair( \
             jvm.com.cloudera.sparkts.BytesToKeyAndSeries())
         self._jtsrdd = jvm.com.cloudera.sparkts.api.java.JavaTimeSeriesRDDFactory.timeSeriesRDD( \
             dt_index._jdt_index, jrdd)
         RDD.__init__(self, rdd._jrdd, rdd.ctx)
     else:
         # Construct from a py4j.JavaObject pointing to a JavaTimeSeriesRDD and a Python SparkContext
         jvm = sc._jvm
         jrdd = jtsrdd.map( \
             jvm.com.cloudera.sparkts.KeyAndSeriesToBytes())
         RDD.__init__(self, jrdd, sc, _TimeSeriesSerializer())
         self._jtsrdd = jtsrdd
Example #24
    def predict(self, x):
        """
        Predict the label of one or more examples.

        :param x:  Data point (feature vector),
                   or an RDD of data points (feature vectors).
        """
        SerDe = self._sc._jvm.SerDe
        ser = PickleSerializer()
        if isinstance(x, RDD):
            # Bulk prediction
            first = x.take(1)
            if not first:
                return self._sc.parallelize([])
            if not isinstance(first[0], Vector):
                x = x.map(_convert_to_vector)
            jPred = self._java_model.predict(x._to_java_object_rdd()).toJavaRDD()
            jpyrdd = self._sc._jvm.PythonRDD.javaToPython(jPred)
            return RDD(jpyrdd, self._sc, BatchedSerializer(ser, 1024))

        else:
            # Assume x is a single data point.
            bytes = bytearray(ser.dumps(_convert_to_vector(x)))
            vec = self._sc._jvm.SerDe.loads(bytes)
            return self._java_model.predict(vec)
Example #25
    def __compute_signature(self, data: RDD) -> RDD:
        """
        Compute signature for items.
        :param data: RDD<(Hashable, Iterator<Hashable>)>
            = RDD<(item, content)>
        :return: RDD<(Hashable, tuple<int>)>
            = RDD<(item, signature)>
        """
        hashing_range = self.__hashing_range
        signature_length = self.__signature_length
        random_seed = self.__seed
        min_hash_func = self.__min_hash

        def _signature(key_values: (Hashable, Iterator)) -> (Hashable, tuple):
            """
            Compute signature for each item
            :return (Hashable, tuple<int>)
                = (item, signature)
            """
            item, content = key_values
            signature = [hashing_range for _ in range(signature_length)]
            for element in content:
                for index_i, hashed_value in enumerate(
                        min_hash_func(element, signature_length, hashing_range,
                                      random_seed)):
                    signature[index_i] = min(hashed_value, signature[index_i])
            return item, tuple(signature)

        return data.map(_signature)
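The private __min_hash helper is not shown above; as a rough illustration of the same signature logic outside Spark, here is a pure-Python sketch with a hypothetical hash family standing in for it.

import hashlib

def min_hash_func(element, signature_length, hashing_range, random_seed):
    # hypothetical stand-in for the private __min_hash helper: one hash per signature slot
    for i in range(signature_length):
        digest = hashlib.md5(f"{random_seed}-{i}-{element}".encode()).hexdigest()
        yield int(digest, 16) % hashing_range

def signature(content, signature_length=4, hashing_range=2 ** 32, random_seed=0):
    sig = [hashing_range] * signature_length
    for element in content:
        for i, hashed_value in enumerate(
                min_hash_func(element, signature_length, hashing_range, random_seed)):
            sig[i] = min(hashed_value, sig[i])
    return tuple(sig)

print(signature({"a", "b", "c"}))
print(signature({"a", "b", "d"}))  # similar sets tend to agree in many signature positions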
Example #26
def _java2py(sc, r, encoding="bytes"):
    if isinstance(r, JavaObject):
        clsName = r.getClass().getSimpleName()
        # convert RDD into JavaRDD
        if clsName != 'JavaRDD' and clsName.endswith("RDD"):
            r = r.toJavaRDD()
            clsName = 'JavaRDD'

        if clsName == 'JavaRDD':
            jrdd = sc._jvm.SerDe.javaToPython(r)
            return RDD(jrdd, sc)

        if clsName == 'DataFrame':
            return DataFrame(r, get_spark_sql_context(sc))

        if clsName == 'Dataset':
            return DataFrame(r, get_spark_sql_context(sc))

        if clsName in _picklable_classes:
            r = sc._jvm.org.apache.spark.bigdl.api.python.BigDLSerDe.dumps(r)
        elif isinstance(r, (JavaArray, JavaList, JavaMap)):
            try:
                r = sc._jvm.org.apache.spark.bigdl.api.python.BigDLSerDe.dumps(
                    r)
            except Py4JJavaError:
                pass  # not picklable

    if isinstance(r, (bytearray, bytes)):
        r = PickleSerializer().loads(bytes(r), encoding=encoding)
    return r
Example #27
    def apply(self,
              data_points: RDD,
              fault_tolerant: bool = False) -> np.ndarray:
        """Label PySpark RDD of data points with LFs.

        Parameters
        ----------
        data_points
            PySpark RDD containing data points to be labeled by LFs
        fault_tolerant
            Output ``-1`` if LF execution fails?

        Returns
        -------
        np.ndarray
            Matrix of labels emitted by LFs
        """
        f_caller = _FunctionCaller(fault_tolerant)

        def map_fn(args: Tuple[DataPoint, int]) -> RowData:
            return apply_lfs_to_data_point(*args,
                                           lfs=self._lfs,
                                           f_caller=f_caller)

        labels = data_points.zipWithIndex().map(map_fn).collect()
        return self._numpy_from_row_data(labels)
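If this apply method is (or mirrors) snorkel's SparkLFApplier, a usage sketch would look roughly like the following; the labeling function and data points are invented for illustration.

from pyspark import SparkContext
from pyspark.sql import Row
from snorkel.labeling import labeling_function
from snorkel.labeling.apply.spark import SparkLFApplier

@labeling_function()
def lf_long_text(x):
    return 1 if len(x.text) > 10 else -1  # -1 means abstain

sc = SparkContext.getOrCreate()
rdd = sc.parallelize([Row(text="short"),
                      Row(text="a much longer sentence")])
L = SparkLFApplier([lf_long_text]).apply(rdd)  # numpy array of shape (2, 1)
print(L)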
Example #28
def _java2py(sc: SparkContext,
             r: "JavaObjectOrPickleDump",
             encoding: str = "bytes") -> Any:
    if isinstance(r, JavaObject):
        clsName = r.getClass().getSimpleName()
        # convert RDD into JavaRDD
        if clsName != "JavaRDD" and clsName.endswith("RDD"):
            r = r.toJavaRDD()
            clsName = "JavaRDD"

        assert sc._jvm is not None

        if clsName == "JavaRDD":
            jrdd = sc._jvm.org.apache.spark.ml.python.MLSerDe.javaToPython(
                r)  # type: ignore[attr-defined]
            return RDD(jrdd, sc)

        if clsName == "Dataset":
            return DataFrame(r, SparkSession(sc)._wrapped)

        if clsName in _picklable_classes:
            r = sc._jvm.org.apache.spark.ml.python.MLSerDe.dumps(
                r)  # type: ignore[attr-defined]
        elif isinstance(r, (JavaArray, JavaList)):
            try:
                r = sc._jvm.org.apache.spark.ml.python.MLSerDe.dumps(
                    r)  # type: ignore[attr-defined]
            except Py4JJavaError:
                pass  # not picklable

    if isinstance(r, (bytearray, bytes)):
        r = CPickleSerializer().loads(bytes(r), encoding=encoding)
    return r
Example #29
def _java2py(sc, r, encoding="bytes"):
    if isinstance(r, JavaObject):
        clsName = r.getClass().getSimpleName()
        # convert RDD into JavaRDD
        if clsName != "JavaRDD" and clsName.endswith("RDD"):
            r = r.toJavaRDD()
            clsName = "JavaRDD"

        if clsName == "JavaRDD":
            jrdd = sc._jvm.org.apache.spark.ml.python.MLSerDe.javaToPython(r)
            return RDD(jrdd, sc)

        if clsName == "Dataset":
            return DataFrame(r, SQLContext.getOrCreate(sc))

        if clsName in _picklable_classes:
            r = sc._jvm.org.apache.spark.ml.python.MLSerDe.dumps(r)
        elif isinstance(r, (JavaArray, JavaList)):
            try:
                r = sc._jvm.org.apache.spark.ml.python.MLSerDe.dumps(r)
            except Py4JJavaError:
                pass  # not picklable

    if isinstance(r, (bytearray, bytes)):
        r = PickleSerializer().loads(bytes(r), encoding=encoding)
    return r
Example #30
    def SpatialRangeQuery(self, spatialRDD: SpatialRDD, rangeQueryWindow: BaseGeometry, considerBoundaryIntersection: bool, usingIndex: bool):
        """

        :param spatialRDD:
        :param rangeQueryWindow:
        :param considerBoundaryIntersection:
        :param usingIndex:
        :return:
        """

        jvm = spatialRDD._jvm
        sc = spatialRDD._sc

        jvm_geom = GeometryAdapter.create_jvm_geometry_from_base_geometry(jvm, rangeQueryWindow)

        srdd = jvm.\
            RangeQuery.SpatialRangeQuery(
            spatialRDD._srdd,
            jvm_geom,
            considerBoundaryIntersection,
            usingIndex
        )

        serialized = JvmGeoSparkPythonConverter(jvm).translate_spatial_rdd_to_python(srdd)

        return RDD(serialized, sc, GeoSparkPickler())
Example #31
    def create_python_rdd(self, jrdd, serializer):
        """Creates a Python RDD from a RDD from Scala.

        Args:
            jrdd (org.apache.spark.api.java.JavaRDD): The RDD that came from Scala.
            serializer (:class:`~geopyspark.AvroSerializer` or pyspark.serializers.AutoBatchedSerializer(AvroSerializer)):
                An instance of ``AvroSerializer`` that is either alone, or wrapped by ``AutoBatchedSerializer``.

        Returns:
            ``pyspark.RDD``
        """

        if isinstance(serializer, AutoBatchedSerializer):
            return RDD(jrdd, self.pysc, serializer)
        else:
            return RDD(jrdd, self.pysc, AutoBatchedSerializer(serializer))
Example #32
 def __call__(self, rdd: RDD) -> RDD:
     def select_fields(row):
         return Row(**{f: getattr(row, f) for f in self.fields})
     res = rdd.map(select_fields)
     if self.explained:
         self._log.info("toDebugString():\n%s", res.toDebugString().decode())
     return res
Example #33
 def __init__(self, dt_index, rdd, jtsrdd=None, sc=None):
     if jtsrdd is None:
         # Construct from a Python RDD object and a Python DateTimeIndex
         jvm = rdd.ctx._jvm
         jrdd = rdd._reserialize(_TimeSeriesSerializer())._jrdd.map( \
             jvm.com.cloudera.sparkts.BytesToKeyAndSeries())
         self._jtsrdd = jvm.com.cloudera.sparkts.TimeSeriesRDD( \
             dt_index._jdt_index, jrdd.rdd())
         RDD.__init__(self, rdd._jrdd, rdd.ctx)
     else:
         # Construct from a py4j.JavaObject pointing to a TimeSeriesRDD and a Python SparkContext
         jvm = sc._jvm
         jrdd = jvm.org.apache.spark.api.java.JavaRDD(jtsrdd, None).map( \
             jvm.com.cloudera.sparkts.KeyAndSeriesToBytes())
         RDD.__init__(self, jrdd, sc, _TimeSeriesSerializer())
         self._jtsrdd = jtsrdd
Example #34
class ByteTileSchemaTest(BaseTestClass):
    tiles = [
        Tile.from_numpy_array(np.int8([0, 0, 1, 1]).reshape(2, 2), -128),
        Tile.from_numpy_array(np.int8([1, 2, 3, 4]).reshape(2, 2), -128),
        Tile.from_numpy_array(np.int8([5, 6, 7, 8]).reshape(2, 2), -128)
    ]

    sc = BaseTestClass.pysc._jsc.sc()
    tw = BaseTestClass.pysc._jvm.geopyspark.geotrellis.tests.schemas.ByteArrayTileWrapper

    java_rdd = tw.testOut(sc)
    ser = ProtoBufSerializer(tile_decoder, tile_encoder)

    rdd = RDD(java_rdd, BaseTestClass.pysc, AutoBatchedSerializer(ser))
    collected = rdd.collect()

    def test_encoded_tiles(self):
        expected_encoded = [to_pb_tile(x) for x in self.collected]

        for actual, expected in zip(self.tiles, expected_encoded):
            cells = actual.cells
            rows, cols = cells.shape

            self.assertEqual(expected.cols, cols)
            self.assertEqual(expected.rows, rows)
            self.assertEqual(expected.cellType.nd, actual.no_data_value)
            self.assertEqual(expected.cellType.dataType,
                             mapped_data_types[actual.cell_type])

    def test_decoded_tiles(self):
        for actual, expected in zip(self.collected, self.tiles):
            self.assertTrue((actual.cells == expected.cells).all())
            self.assertTrue(actual.cells.dtype == expected.cells.dtype)
            self.assertEqual(actual.cells.shape, expected.cells.shape)
Example #35
 def from_rdd(cls, rdd: RDD, job_id: str, namespace: str, name: str):
     partitions = rdd.getNumPartitions()
     return RDDTable(session_id=job_id,
                     namespace=namespace,
                     name=name,
                     partitions=partitions,
                     rdd=rdd)
Example #36
def extract_item_map(wikidata_items: RDD):
    def parse_item(item):
        if 'en' in item['labels']:
            label = item['labels']['en']['value']
            return item['id'], label
        else:
            return None
    return wikidata_items.map(parse_item).filter(lambda i: i is not None).collectAsMap()
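A small usage sketch with two fake Wikidata-style items, trimmed down to the fields parse_item actually reads (Q42 is a real item id, the rest is illustrative).

from pyspark import SparkContext

sc = SparkContext.getOrCreate()
items = sc.parallelize([
    {'id': 'Q42', 'labels': {'en': {'value': 'Douglas Adams'}}},
    {'id': 'Q99999999', 'labels': {}},  # no English label -> dropped by the filter
])
print(extract_item_map(items))  # {'Q42': 'Douglas Adams'}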
Example #37
    def test_multiple_python_java_RDD_conversions(self):
        # Regression test for SPARK-5361
        data = [
            (u'1', {u'director': u'David Lean'}),
            (u'2', {u'director': u'Andrew Dominik'})
        ]
        data_rdd = self.sc.parallelize(data)
        data_java_rdd = data_rdd._to_java_object_rdd()
        data_python_rdd = self.sc._jvm.SerDeUtil.javaToPython(data_java_rdd)
        converted_rdd = RDD(data_python_rdd, self.sc)
        self.assertEqual(2, converted_rdd.count())

        # conversion between python and java RDD threw exceptions
        data_java_rdd = converted_rdd._to_java_object_rdd()
        data_python_rdd = self.sc._jvm.SerDeUtil.javaToPython(data_java_rdd)
        converted_rdd = RDD(data_python_rdd, self.sc)
        self.assertEqual(2, converted_rdd.count())
Example #38
def extract_item_page_map(wikidata_items: RDD):
    def parse_item_page(item):
        item_id = item['id']
        if 'enwiki' in item['sitelinks']:
            return [(item_id, item['sitelinks']['enwiki']['title'])]
        else:
            return []
    return wikidata_items.flatMap(parse_item_page).collectAsMap()
Example #39
    def __init__(self, rdd, file_type='CSV', t_rdd=None, sc=None):
        if rdd is not None:
            jvm = rdd.ctx._jvm
            java_import(jvm, ClassNames.BYTES_TO_STRING)
            java_import(jvm, ClassNames.TRANSFORMABLE_RDD)

            self.__set_file_type(jvm, file_type)
            self.spark_context = rdd.ctx
            java_rdd = rdd._reserialize(BuddySerializer())._jrdd.map(jvm.BytesToString())
            self._transformable_rdd = jvm.JavaTransformableRDD(java_rdd, self.__file_type)
            RDD.__init__(self, rdd._jrdd, rdd.ctx)
        else:
            jvm = sc._jvm
            java_import(jvm, ClassNames.STRING_TO_BYTES)
            self.spark_context = sc
            self.__set_file_type(jvm, file_type)
            self._transformable_rdd = t_rdd
            rdd = t_rdd.map(jvm.StringToBytes())
            RDD.__init__(self, rdd, sc, BuddySerializer())
Example #40
def extract_claim_types(wikidata_items: RDD):
    def parse_types(item):
        value_types = []
        for property_claims in item['claims'].values():
            for c in property_claims:
                mainsnak = c['mainsnak']
                if 'datatype' in mainsnak:
                    value_types.append(mainsnak['datatype'])
        return value_types

    return set(wikidata_items.flatMap(parse_types).distinct().collect())
Example #41
 def predict(self, x):
     """
     Predict the label of one or more examples.
     :param x:  Data point (feature vector),
                or an RDD of data points (feature vectors).
     """
     pythonAPI = self._sc._jvm.PythonMLLibAPI()
     if isinstance(x, RDD):
         # Bulk prediction
         if x.count() == 0:
             return self._sc.parallelize([])
         dataBytes = _get_unmangled_double_vector_rdd(x, cache=False)
         jSerializedPreds = \
             pythonAPI.predictDecisionTreeModel(self._java_model,
                                                dataBytes._jrdd)
         serializedPreds = RDD(jSerializedPreds, self._sc, NoOpSerializer())
         return serializedPreds.map(lambda bytes: _deserialize_double(bytearray(bytes)))
     else:
         # Assume x is a single data point.
         x_ = _serialize_double_vector(x)
         return pythonAPI.predictDecisionTreeModel(self._java_model, x_)
Example #42
 def evaluate(self, labels_and_predictions: RDD) -> float:
     tp = labels_and_predictions \
         .map(lambda x:
              (set(x[0]),
               set(features for features, weights in x[1][:self._pred_n]))) \
         .filter(lambda x:
                 len(x[0].intersection(x[1])) >= self._intersect_n)
     accuracy = 100.0 * tp.count() / labels_and_predictions.count()
     if self._verbose:
         print('accuracy: ', accuracy)
     self._results.append(accuracy)
     return accuracy
Example #43
def clean_claims(claims: RDD, b_item_map: Broadcast):
    def clean(claim):
        item_map = b_item_map.value
        if claim.datatype == 'wikibase-item':
            if claim.object in item_map:
                claim = claim._replace(object=item_map[claim.object])
                return claim
            else:
                return None
        elif claim.datatype == 'quantity':
            unit = claim.object.unit
            unit = unit.split('/')[-1]
            if unit in item_map:
                claim = claim._replace(object=item_map[unit])
                return claim
            else:
                return None
        return claim

    dt_filter = {'wikibase-item', 'string', 'monolingualtext', 'quantity', 'time'}

    return claims.filter(lambda c: c.datatype in dt_filter).map(clean).filter(lambda c: c is not None)
Example #44
 def evaluate(self, labels_and_predictions: RDD):
     result = labels_and_predictions.map(lambda p: _hamming_loss(p[0], p[1])). \
         mean()
     self._results.append(result)
     return result
Example #45
def shuffle_and_split(data: RDD, fold_n: int, seed: int = 0) -> list:
    fold_weights = [1 / fold_n] * fold_n
    return data.randomSplit(fold_weights, seed)
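A quick usage check, assuming a live SparkContext and the function above in scope: randomSplit partitions the data, so the fold counts add back up to the original size.

from pyspark import SparkContext

sc = SparkContext.getOrCreate()
data = sc.parallelize(range(100))
folds = shuffle_and_split(data, fold_n=5, seed=42)
assert len(folds) == 5
assert sum(fold.count() for fold in folds) == 100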
Example #46
 def test_null_in_rdd(self):
     jrdd = self.sc._jvm.PythonUtils.generateRDDWithNull(self.sc._jsc)
     rdd = RDD(jrdd, self.sc, UTF8Deserializer())
     self.assertEqual([u"a", None, u"b"], rdd.collect())
     rdd = RDD(jrdd, self.sc, NoOpSerializer())
     self.assertEqual([b"a", None, b"b"], rdd.collect())