def _compute_tfidf(texts: RDD) -> RDD:
    """Attach a TF-IDF vector to each text; returns an RDD, not the IDFModel."""
    tf = HashingTF().transform(texts.map(lambda t: t.words))
    tf.cache()
    idf = IDF().fit(tf)
    tfidfs = idf.transform(tf)
    # zip is safe here: tfidfs is derived from texts without repartitioning.
    text_tfs = texts.zip(tfidfs)
    return text_tfs.map(lambda t: t[0].set_tfidf(t[1]))

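# Usage sketch for _compute_tfidf. `Doc` is a hypothetical stand-in for the
# codebase's real text container; the only contract _compute_tfidf relies on
# is a `.words` attribute and a `.set_tfidf(...)` method. Assumes HashingTF
# and IDF come from pyspark.mllib.feature and a SparkContext `sc` exists.
class Doc(object):
    def __init__(self, words):
        self.words = words
        self.tfidf = None

    def set_tfidf(self, vec):
        self.tfidf = vec
        return self

# docs = sc.parallelize([Doc(["spark", "rdd"]), Doc(["tf", "idf", "spark"])])
# docs_with_tfidf = _compute_tfidf(docs).collect()
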
def train_model(data: RDD, l=1.0) -> MLNaiveBayesModel:
    # Aggregate per label: (document count, element-wise sum of feature vectors).
    aggregated = data.flatMap(lambda x: [(label, x['features']) for label in x['labels']]) \
        .combineByKey(lambda v: (1, v),
                      lambda c, v: (c[0] + 1, c[1] + v),
                      lambda c1, c2: (c1[0] + c2[0], c1[1] + c2[1])) \
        .sortBy(lambda x: x[0]) \
        .collect()
    num_labels = len(aggregated)
    num_documents = data.count()
    num_features = aggregated[0][1][1].size

    labels = np.zeros(num_labels)
    pi = np.zeros(num_labels)  # log priors are floats; an int dtype would truncate them
    theta = np.zeros((num_labels, num_features))

    pi_log_denom = math.log(num_documents + num_labels * l)
    for i, (label, (n, sum_term_freq)) in enumerate(aggregated):
        labels[i] = label
        pi[i] = math.log(n + l) - pi_log_denom
        sum_term_freq_dense = sum_term_freq.toarray()
        theta_log_denom = math.log(sum_term_freq.sum() + num_features * l)
        theta[i, :] = np.log(sum_term_freq_dense + l) - theta_log_denom
    return MLNaiveBayesModel(labels, pi, theta)

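# Sketch of how the (labels, pi, theta) parameters learned above are used at
# prediction time in multinomial naive Bayes: the log-posterior of class c is
# pi[c] + theta[c] . x, and the argmax wins. This illustrates the math only;
# MLNaiveBayesModel's actual interface is defined elsewhere.
import numpy as np

def predict_label(labels, pi, theta, x):
    """Return the most likely label for a dense term-frequency vector x."""
    log_posteriors = pi + theta.dot(x)
    return labels[int(np.argmax(log_posteriors))]
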
def __init_parameters(self, train: RDD):
    """
    _n_buckets/_n_items: the number of distinct buckets/items in the train RDD.
    _bucket_block_size/_cross_block_size/_item_block_size: the block sizes used
        when dividing buckets/cross buckets/items into blocks.
    _n_bucket_block/_n_cross_block/_n_item_block: the number of blocks when
        dividing buckets/cross buckets/items into blocks.
    """
    self._n_buckets = train.map(lambda u: u[0]).distinct().count()
    if self._n_buckets <= self._k:
        self._k = float("inf")

    # For the bucket dimension.
    if self._bucket_block_size is None:
        # Derive bucket_block_size from n_bucket_block.
        self._bucket_block_size = self._n_buckets // self._n_bucket_block + 1
    else:
        self._n_bucket_block = self._n_buckets // self._bucket_block_size + 1

    # For the cross dimension.
    if self._cross_block_size is None:
        self._cross_block_size = self._n_buckets // self._n_cross_block + 1
    else:
        self._n_cross_block = self._n_buckets // self._cross_block_size + 1

    # For the item dimension.
    self._n_items = train.map(lambda u: u[1]).distinct().count()
    if self._item_block_size is None:
        self._item_block_size = self._n_items // self._n_item_block + 1
    else:
        self._n_item_block = self._n_items // self._item_block_size + 1
    return self

def join_multiple_keys(left: RDD, right: RDD, n: int) -> RDD:
    """
    Join RDDs with multiple keys.
    ((key1, key2, ...), value_left) x (key_i, value_right_i)
        -> ((key1, key2, ...), (value_left, value_right_1, value_right_2, ...))

    :param left: RDD<tuple<int>, value>
    :param right: RDD<int, value>
    :param n: int, the length of the key in the left RDD
    :return: joined RDD.
    """
    left = left.map(
        lambda u: (-1, (u[0], (u[1],)))
    )  # (_, (tuple<key>, tuple<value>))
    right = right.map(
        lambda u: (u[0], (u[1],))
    ).cache()  # (_, tuple<value>)
    for key_order in range(n):
        # Bind key_order via a default argument: Spark serializes lambdas
        # lazily, so a plain closure would see only the loop's final value.
        left = left.map(
            lambda u, k=key_order: (u[1][0][k], u[1])  # (_, (tuple<key>, tuple<value>))
        ).join(
            right  # (_, ((tuple<key>, tuple<value>), tuple<value>))
        ).map(
            lambda u: (-1, (u[1][0][0], u[1][0][1] + u[1][1]))
        )  # (_, (tuple<key>, tuple<value>))
    left = left.map(
        lambda u: u[1]
    )  # (tuple<key>, tuple<value>)
    return left

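# Worked example for join_multiple_keys, assuming a SparkContext `sc`:
#
#   left = sc.parallelize([((1, 10), "a"), ((2, 20), "b")])
#   right = sc.parallelize([(1, "u1"), (10, "i10"), (2, "u2"), (20, "i20")])
#   join_multiple_keys(left, right, n=2).collect()
#   # -> [((1, 10), ("a", "u1", "i10")), ((2, 20), ("b", "u2", "i20"))]
#
# Each pass through the loop joins on one position of the composite key and
# appends that key's right-hand value to the growing value tuple.
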
def _evaluate(self, rdd: RDD, **kwargs):
    yaml_model = self.master_network.to_yaml()
    optimizer = deserialize_optimizer(self.master_optimizer)
    loss = self.master_loss
    weights = self.master_network.get_weights()
    weights = rdd.context.broadcast(weights)
    custom_objects = self.custom_objects
    metrics = self.master_metrics

    def _evaluate_partition(yaml_model, optimizer, loss, custom_objects,
                            metrics, kwargs, data_iterator):
        model = model_from_yaml(yaml_model, custom_objects)
        model.compile(optimizer, loss, metrics)
        model.set_weights(weights.value)
        feature_iterator, label_iterator = tee(data_iterator, 2)
        x_test = np.asarray([x for x, y in feature_iterator])
        y_test = np.asarray([y for x, y in label_iterator])
        return [model.evaluate(x_test, y_test, **kwargs)]

    if self.num_workers:
        rdd = rdd.repartition(self.num_workers)

    results = rdd.mapPartitions(
        partial(_evaluate_partition, yaml_model, optimizer, loss,
                custom_objects, metrics, kwargs))

    if not metrics:
        # With no metrics, just return the scalar loss value.
        return results.mean()
    else:
        # With metrics, return [loss value, metric value] to match the Keras API.
        loss_value = results.map(lambda x: x[0]).mean()
        metric_value = results.map(lambda x: x[1]).mean()
        return [loss_value, metric_value]

def naive_multiplication_rdd(mat_a: pyspark.RDD, mat_b: pyspark.RDD, is_triangle=False):
    """
    Naive distributed matrix multiplication over RDDs of MatrixEntry records.

    :param mat_a: the left matrix
    :param mat_b: the right matrix
    :param is_triangle: if True, mat_a stores only one triangle of a symmetric
        matrix and is mirrored (averaging the doubled diagonal) before use
    :return: RDD of MatrixEntry records for mat_a * mat_b
    """
    if is_triangle:
        left_rdd = (
            mat_a.flatMap(lambda x: [((x.j, x.i), x.value), ((x.i, x.j), x.value)])
            .aggregateByKey(zeroValue=(0.0, 0.0),
                            seqFunc=lambda x, y: (x[0] + y, x[1] + 1),
                            combFunc=lambda x, y: (x[0] + y[0], x[1] + y[1]))
            .mapValues(lambda x: x[0] / x[1])  # diagonal entries were emitted twice
            .map(lambda x: (x[0][0], (x[0][1], x[1])))
        )
    else:
        left_rdd = mat_a.map(lambda x: (x.j, (x.i, x.value)))

    right_rdd = mat_b.map(lambda x: (x.i, (x.j, x.value)))
    combined_rdd = (left_rdd.join(right_rdd).map(lambda x: x[1])
                    .map(lambda x: ((x[0][0], x[1][0]), x[0][1] * x[1][1]))
                    .reduceByKey(lambda x, y: x + y)
                    .map(lambda x: distributed.MatrixEntry(i=x[0][0], j=x[0][1], value=x[1]))
                    )
    return combined_rdd

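# Worked example for naive_multiplication_rdd, assuming `sc` and
# pyspark.mllib.linalg.distributed.MatrixEntry:
#
#   a = sc.parallelize([distributed.MatrixEntry(0, 0, 2.0),
#                       distributed.MatrixEntry(0, 1, 1.0)])
#   b = sc.parallelize([distributed.MatrixEntry(0, 0, 3.0),
#                       distributed.MatrixEntry(1, 0, 4.0)])
#   naive_multiplication_rdd(a, b).collect()
#   # -> [MatrixEntry(0, 0, 10.0)]  since 2*3 + 1*4 = 10
#
# The join keys entries of A by column and entries of B by row, so each
# matched pair contributes one product to the (row, column) sum.
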
def build_vocabularies(self, rows: RDD):
    """
    Process rows to gather values and paths with their frequencies.

    :param rows: row structure is ((key, doc), val) where:
        * key: str with the path context
        * doc: file name
        * val: number of occurrences of key in doc
    """
    def _flatten_row(row: Row):
        # Strips the namespace "v." prefix so the string can be parsed as a tuple.
        k = Vocabulary2Id._unstringify_path_context(row)
        return [(k[0], 1), (k[1], 1), (k[2], 1)]

    rows = rows \
        .flatMap(_flatten_row) \
        .reduceByKey(operator.add) \
        .persist()

    values = rows.filter(lambda x: isinstance(x[0], str)).collect()
    paths = rows.filter(lambda x: isinstance(x[0], tuple)).collect()

    value2index = {w: i for i, (w, _) in enumerate(values)}
    path2index = {w: i for i, (w, _) in enumerate(paths)}
    value2freq = {w: freq for w, freq in values}
    path2freq = {w: freq for w, freq in paths}

    rows.unpersist()
    return value2index, path2index, value2freq, path2freq

def __call__(self, head: RDD):
    if self.distinct and not self.approximate:
        head = head.distinct()
    if self.explained:
        self._log.info("toDebugString():\n%s", head.toDebugString().decode())
    if not self.approximate or not self.distinct:
        return head.count()
    return head.countApproxDistinct()

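# Decision table for the counter above:
#   distinct=False                   -> head.count()
#   distinct=True, approximate=False -> head.distinct().count()   (exact)
#   distinct=True, approximate=True  -> head.countApproxDistinct()
# countApproxDistinct estimates the distinct count with HyperLogLog++,
# trading a small relative error for skipping the shuffle that distinct()
# would require.
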
def _flat_map(rdd: RDD, func):
    from itertools import chain

    def _fn(x):
        return func(x[0], x[1])

    def _func(_, iterator):
        return chain.from_iterable(map(fail_on_stopiteration(_fn), iterator))

    # mapPartitionsWithIndex is lazy; the result must be returned, otherwise
    # the flat map would never run.
    return rdd.mapPartitionsWithIndex(_func, preservesPartitioning=False)

def evaluate(self, labels_and_predictions: RDD) -> float:
    tp = labels_and_predictions \
        .map(lambda x: (set(x[0]), set(p for p, w in x[1][:self._pred_n]))) \
        .filter(lambda x: len(x[0].intersection(x[1])) > self._intersect_n)
    accuracy = 100.0 * tp.count() / labels_and_predictions.count()
    if self._verbose:
        print('accuracy: ', accuracy)
    self._results.append(accuracy)
    return accuracy

def cStress(rdd: RDD) -> RDD:
    # TODO: TWH Temporary
    ecg_sampling_frequency = 64.0
    rip_sampling_frequency = 64.0
    accel_sampling_frequency = 64.0 / 6.0

    # Timestamp-correct the datastreams.
    ecg_corrected = rdd.map(lambda ds: (
        ds['participant'],
        timestamp_correct(datastream=ds['ecg'], sampling_frequency=ecg_sampling_frequency)))
    rip_corrected = rdd.map(lambda ds: (
        ds['participant'],
        timestamp_correct(datastream=ds['rip'], sampling_frequency=rip_sampling_frequency)))
    accelx_corrected = rdd.map(lambda ds: (
        ds['participant'],
        timestamp_correct(datastream=ds['accelx'], sampling_frequency=accel_sampling_frequency)))
    accely_corrected = rdd.map(lambda ds: (
        ds['participant'],
        timestamp_correct(datastream=ds['accely'], sampling_frequency=accel_sampling_frequency)))
    accelz_corrected = rdd.map(lambda ds: (
        ds['participant'],
        timestamp_correct(datastream=ds['accelz'], sampling_frequency=accel_sampling_frequency)))

    accel_group = accelx_corrected.join(accely_corrected).join(accelz_corrected).map(fix_two_joins)
    accel = accel_group.map(lambda ds: (
        ds[0],
        autosense_sequence_align(datastreams=[ds[1][0], ds[1][1], ds[1][2]],
                                 sampling_frequency=accel_sampling_frequency)))

    # Accelerometer feature computation.
    accel_features = accel.map(
        lambda ds: (ds[0], accelerometer_features(ds[1], window_length=10.0)))

    # RIP features.
    peak_valley = rip_corrected.map(
        lambda ds: (ds[0], rip.compute_peak_valley(rip=ds[1])))
    rip_features = peak_valley.map(
        lambda ds: (ds[0], rip_feature_computation(ds[1][0], ds[1][1])))

    # R-peak datastream computation.
    ecg_rr_rdd = ecg_corrected.map(
        lambda ds: (ds[0], compute_rr_intervals(ds[1], ecg_sampling_frequency)))
    ecg_features = ecg_rr_rdd.map(
        lambda ds: (ds[0], ecg_feature_computation(ds[1], window_size=60, window_offset=60)))

    # return rip_features.join(ecg_features).join(accel_features).map(fix_two_joins)
    return ecg_features

def __preprocessRdd(self, rdd: RDD):
    rddc = rddCorrector()
    rdd = rdd.map(lambda l: rddc.correct(l))
    if rdd is not None and not rdd.isEmpty():
        rdd = rdd.map(lambda l: l.replace("<tweet>", ""))
        rdd = rdd.map(lambda l: l.replace("</tweet>", ""))
        df = DataFrameWorks().convertDataFrame(rdd, self.__spark)
        df = CleanText().clean(df, self.__spark)
        return df
    return None

def evaluate(truth: RDD, prediction: RDD) -> float:
    """
    Calculate the RMSE between truth and predictions.

    :param truth: RDD<Hashable, Hashable, float> = RDD<(bucket, item, rating)>
    :param prediction: RDD<Hashable, Hashable, float> = RDD<(bucket, item, rating)>
    :return: float = RMSE
    """
    truth = truth.map(lambda u: ((u[0], u[1]), u[2]))
    prediction = prediction.map(lambda u: ((u[0], u[1]), u[2]))
    return truth.join(prediction).map(lambda u: (u[1][0] - u[1][1]) ** 2).mean() ** 0.5

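# Worked example for the RMSE above, assuming a SparkContext `sc`:
#
#   truth = sc.parallelize([("u1", "i1", 4.0), ("u1", "i2", 2.0)])
#   pred = sc.parallelize([("u1", "i1", 3.0), ("u1", "i2", 4.0)])
#   evaluate(truth, pred)
#   # -> sqrt(((4-3)**2 + (2-4)**2) / 2) = sqrt(2.5) ≈ 1.581
#
# Note the inner join: pairs present in only one of the two RDDs are
# silently dropped before the squared errors are averaged.
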
def convertDataFrame(self, rdd: RDD, SqlObject) -> DataFrame:
    """RDD to DataFrame."""
    # rdd = rdd.map(lambda l: l.replace("½", ""))
    rdd = rdd.map(lambda l: (l[:19], l[19:]))
    schema = [StructField("id", StringType(), False),
              StructField("rawData", StringType(), False),
              StructField("preprocessedData",
                          ArrayType(elementType=StringType(), containsNull=True), True),
              StructField("sentiment", FloatType(), True)]
    final_struct = StructType(fields=schema)
    rdd = rdd.map(lambda l: (l[0], l[1], [None], None))
    return SqlObject.createDataFrame(rdd, schema=final_struct)

def java_to_python_rdd(sc, rdd, is_pair, is_json):
    jrdd = sc._jvm.SerDe.javaToPython(rdd)
    output = RDD(jrdd, sc)
    if is_pair:
        pairs = output.map(lambda x: x.split("\t"))
        if is_json:
            return pairs.map(lambda p: (p[0], json.loads(p[1])))
        return pairs.map(lambda p: (p[0], p[1]))
    if is_json:
        return output.map(json.loads)
    return output

def run(self, rdd: RDD) -> RDD:  # type: ignore
    rdd = rdd.cache()
    n_points = rdd.count()
    m = n_points / self.n_partitions
    optimal_p = math.log(n_points * self.n_partitions) / m
    rdd = self.assign_buckets(  # type: ignore
        rdd, p=optimal_p, key_func=_label_first_coord_and_type
    )
    rdd = self.sort_and_assign_labels(rdd)  # type: ignore
    return rdd

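# Note on optimal_p above: with n points over t partitions the expected
# partition size is m = n/t, and sampling each point with probability
# p = ln(n*t)/m is the standard balls-in-bins rate that keeps every bucket's
# sample non-empty with high probability in sample-based range partitioning.
# This reading is inferred from the formula itself, not from surrounding docs.
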
def __call__(self, head: RDD):
    if self.keymap is None:
        return head.coalesce(self.partitions, self.shuffle)

    # partitionBy the key extracted using self.keymap.
    try:
        # This checks whether keymap is the identity.
        probe = self.keymap("probe")
    except:  # noqa: E722
        probe = None
    if probe != "probe":
        head = head.map(lambda x: (self.keymap(x), x))
    return head \
        .partitionBy(self.partitions) \
        .map(lambda x: x[1])

def _save_as_func(rdd: RDD, name, namespace, partition, persistent):
    from arch.api import session
    dup = session.table(name=name, namespace=namespace, partition=partition,
                        persistent=persistent)

    def _func(_, it):
        eggroll_util.maybe_create_eggroll_client()
        dup.put_all(list(it))
        return 1,

    # collect() forces the lazy mapPartitionsWithIndex so every partition is written.
    rdd.mapPartitionsWithIndex(_func, preservesPartitioning=False).collect()
    return dup

def _union(rdd: RDD, other: RDD, func):
    num_partition = max(rdd.getNumPartitions(), other.getNumPartitions())

    def _func(pair):
        iter1, iter2 = pair
        val1 = list(iter1)
        val2 = list(iter2)
        if not val1:
            return val2[0]
        if not val2:
            return val1[0]
        return func(val1[0], val2[0])

    return _map_value(rdd.cogroup(other, num_partition), _func)

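# Behaviour sketch for _union, assuming `sc` and that _map_value applies a
# function to the value side of a pair RDD (as its name suggests):
#
#   a = sc.parallelize([("k1", 1), ("k2", 2)])
#   b = sc.parallelize([("k2", 10), ("k3", 30)])
#   _union(a, b, func=lambda x, y: x + y).collect()
#   # -> [("k1", 1), ("k2", 12), ("k3", 30)]
#
# Keys present on one side keep their value; keys present on both sides are
# merged with func. Only the first value per key and side is considered.
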
def transform_online_retail(
        sc: SparkSession,
        raw_rdd: RDD,
        schema: str,
        max_month: Optional[int] = None,
) -> DataFrame:
    """Transform the online-retail dataset into its correct data formats.

    The InvoiceDate column mixes day-first and month-first timestamps; rows
    whose middle field is below ``max_month`` are assumed to be day-first.

    :return: the typed DataFrame
    """
    def parse_invoice_date(raw: str) -> datetime:
        # Guard against max_month=None: comparing an int against None would
        # raise a TypeError under Python 3, so fall back to month-first.
        if max_month is not None and int(raw.split('/')[1]) < max_month:
            return datetime.strptime(raw, '%d/%m/%Y %H:%M')
        return datetime.strptime(raw, '%m/%d/%Y %H:%M')

    # Initial transformation of the raw RDD.
    raw_rdd = raw_rdd.map(lambda retail: (
        retail[0],                                      # InvoiceNo
        retail[1],                                      # StockCode
        retail[2] if retail[2] != '' else None,         # Description
        int(retail[3]),                                 # Quantity
        parse_invoice_date(retail[4]),                  # InvoiceDate
        float(retail[5]),                               # UnitPrice
        int(retail[6]) if retail[6] != '' else None,    # CustomerID
        retail[7] if retail[7] != '' else None)         # Country
    )
    return sc.createDataFrame(raw_rdd, schema=schema)

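# Example of the date disambiguation above: with max_month=8, the raw value
# "05/07/2011 10:12" has middle field 7 < 8 and is parsed day-first
# (%d/%m/%Y), while "05/09/2011 10:12" falls through to month-first
# (%m/%d/%Y). When max_month is None every row takes the month-first branch.
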
def extract_claims(wikidata_items: RDD, b_property_map: Broadcast, b_item_map: Broadcast):
    def parse_item_claims(item):
        item_id = item['id']
        item_map = b_item_map.value
        if item_id not in item_map:
            return []
        item_label = item_map[item_id]
        property_map = b_property_map.value
        if 'enwiki' in item['sitelinks']:
            title = item['sitelinks']['enwiki']['title']
        else:
            title = None

        item_claims = []
        for property_id, property_claims in item['claims'].items():
            if property_id not in property_map:
                continue
            property_name = property_map[property_id]
            for claim in property_claims:
                mainsnak = claim['mainsnak']
                if 'datatype' in mainsnak and 'datavalue' in mainsnak:
                    datatype = mainsnak['datatype']
                    datavalue = mainsnak['datavalue']
                    if datatype in datatype_parsers:
                        wiki_object = datatype_parsers[datatype](datavalue)
                        if wiki_object is not None:
                            item_claims.append(
                                Claim(item_label, property_name, wiki_object,
                                      datatype, title, property_id, item_id)
                            )
        return item_claims

    return wikidata_items.flatMap(parse_item_claims)

def __init__(self, dt_index, rdd, jtsrdd=None, sc=None):
    if jtsrdd is None:
        # Construct from a Python RDD object and a Python DateTimeIndex.
        jvm = rdd.ctx._jvm
        jrdd = rdd._reserialize(_TimeSeriesSerializer())._jrdd.mapToPair(
            jvm.com.cloudera.sparkts.BytesToKeyAndSeries())
        self._jtsrdd = jvm.com.cloudera.sparkts.api.java.JavaTimeSeriesRDDFactory.timeSeriesRDD(
            dt_index._jdt_index, jrdd)
        RDD.__init__(self, rdd._jrdd, rdd.ctx)
    else:
        # Construct from a py4j.JavaObject pointing to a JavaTimeSeriesRDD
        # and a Python SparkContext.
        jvm = sc._jvm
        jrdd = jtsrdd.map(
            jvm.com.cloudera.sparkts.KeyAndSeriesToBytes())
        RDD.__init__(self, jrdd, sc, _TimeSeriesSerializer())
        self._jtsrdd = jtsrdd

def predict(self, x):
    """
    Predict the label of one or more examples.

    :param x: Data point (feature vector), or an RDD of data points
        (feature vectors).
    """
    ser = PickleSerializer()
    if isinstance(x, RDD):
        # Bulk prediction.
        first = x.take(1)
        if not first:
            return self._sc.parallelize([])
        if not isinstance(first[0], Vector):
            x = x.map(_convert_to_vector)
        jPred = self._java_model.predict(x._to_java_object_rdd()).toJavaRDD()
        jpyrdd = self._sc._jvm.PythonRDD.javaToPython(jPred)
        return RDD(jpyrdd, self._sc, BatchedSerializer(ser, 1024))
    else:
        # Assume x is a single data point.
        buf = bytearray(ser.dumps(_convert_to_vector(x)))
        vec = self._sc._jvm.SerDe.loads(buf)
        return self._java_model.predict(vec)

def __compute_signature(self, data: RDD) -> RDD:
    """
    Compute the MinHash signature for each item.

    :param data: RDD<(Hashable, Iterator<Hashable>)> = RDD<(item, content)>
    :return: RDD<(Hashable, tuple<int>)> = RDD<(item, signature)>
    """
    hashing_range = self.__hashing_range
    signature_length = self.__signature_length
    random_seed = self.__seed
    min_hash_func = self.__min_hash

    def _signature(key_values: Tuple[Hashable, Iterator]) -> Tuple[Hashable, tuple]:
        """
        Compute the signature for one item.

        :return: (Hashable, tuple<int>) = (item, signature)
        """
        item, content = key_values
        signature = [hashing_range for _ in range(signature_length)]
        for element in content:
            for index_i, hashed_value in enumerate(
                    min_hash_func(element, signature_length, hashing_range, random_seed)):
                signature[index_i] = min(hashed_value, signature[index_i])
        return item, tuple(signature)

    return data.map(_signature)

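# The MinHash property the signature computation relies on: for two content
# sets A and B, each signature position agrees with probability equal to the
# Jaccard similarity |A ∩ B| / |A ∪ B|. A rough similarity estimate from two
# signatures produced above (helper name is illustrative, not part of the
# class above):
def estimate_jaccard(sig_a: tuple, sig_b: tuple) -> float:
    """Fraction of signature positions on which two items agree."""
    matches = sum(1 for a, b in zip(sig_a, sig_b) if a == b)
    return matches / len(sig_a)
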
def _java2py(sc, r, encoding="bytes"):
    if isinstance(r, JavaObject):
        clsName = r.getClass().getSimpleName()
        # Convert RDD into JavaRDD.
        if clsName != 'JavaRDD' and clsName.endswith("RDD"):
            r = r.toJavaRDD()
            clsName = 'JavaRDD'

        if clsName == 'JavaRDD':
            jrdd = sc._jvm.SerDe.javaToPython(r)
            return RDD(jrdd, sc)

        if clsName in ('DataFrame', 'Dataset'):
            return DataFrame(r, get_spark_sql_context(sc))

        if clsName in _picklable_classes:
            r = sc._jvm.org.apache.spark.bigdl.api.python.BigDLSerDe.dumps(r)
        elif isinstance(r, (JavaArray, JavaList, JavaMap)):
            try:
                r = sc._jvm.org.apache.spark.bigdl.api.python.BigDLSerDe.dumps(r)
            except Py4JJavaError:
                pass  # not picklable

    if isinstance(r, (bytearray, bytes)):
        r = PickleSerializer().loads(bytes(r), encoding=encoding)
    return r

def apply(self, data_points: RDD, fault_tolerant: bool = False) -> np.ndarray:
    """Label PySpark RDD of data points with LFs.

    Parameters
    ----------
    data_points
        PySpark RDD containing data points to be labeled by LFs
    fault_tolerant
        Output ``-1`` if LF execution fails?

    Returns
    -------
    np.ndarray
        Matrix of labels emitted by LFs
    """
    f_caller = _FunctionCaller(fault_tolerant)

    def map_fn(args: Tuple[DataPoint, int]) -> RowData:
        return apply_lfs_to_data_point(*args, lfs=self._lfs, f_caller=f_caller)

    labels = data_points.zipWithIndex().map(map_fn).collect()
    return self._numpy_from_row_data(labels)

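# Usage sketch, assuming this is Snorkel's SparkLFApplier (the signature
# matches snorkel.labeling.apply.spark) and a SparkContext `sc`:
#
#   from snorkel.labeling import labeling_function
#
#   @labeling_function()
#   def lf_mentions_spark(x):
#       return 1 if "spark" in x.text else -1
#
#   applier = SparkLFApplier([lf_mentions_spark])
#   L = applier.apply(sc.parallelize(data_points))  # shape (n_points, n_lfs)
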
def _java2py(sc: SparkContext, r: "JavaObjectOrPickleDump", encoding: str = "bytes") -> Any:
    if isinstance(r, JavaObject):
        clsName = r.getClass().getSimpleName()
        # Convert RDD into JavaRDD.
        if clsName != "JavaRDD" and clsName.endswith("RDD"):
            r = r.toJavaRDD()
            clsName = "JavaRDD"

        assert sc._jvm is not None

        if clsName == "JavaRDD":
            jrdd = sc._jvm.org.apache.spark.ml.python.MLSerDe.javaToPython(r)  # type: ignore[attr-defined]
            return RDD(jrdd, sc)

        if clsName == "Dataset":
            return DataFrame(r, SparkSession(sc)._wrapped)

        if clsName in _picklable_classes:
            r = sc._jvm.org.apache.spark.ml.python.MLSerDe.dumps(r)  # type: ignore[attr-defined]
        elif isinstance(r, (JavaArray, JavaList)):
            try:
                r = sc._jvm.org.apache.spark.ml.python.MLSerDe.dumps(r)  # type: ignore[attr-defined]
            except Py4JJavaError:
                pass  # not picklable

    if isinstance(r, (bytearray, bytes)):
        r = CPickleSerializer().loads(bytes(r), encoding=encoding)
    return r

def _java2py(sc, r, encoding="bytes"):
    if isinstance(r, JavaObject):
        clsName = r.getClass().getSimpleName()
        # Convert RDD into JavaRDD.
        if clsName != "JavaRDD" and clsName.endswith("RDD"):
            r = r.toJavaRDD()
            clsName = "JavaRDD"

        if clsName == "JavaRDD":
            jrdd = sc._jvm.org.apache.spark.ml.python.MLSerDe.javaToPython(r)
            return RDD(jrdd, sc)

        if clsName == "Dataset":
            return DataFrame(r, SQLContext.getOrCreate(sc))

        if clsName in _picklable_classes:
            r = sc._jvm.org.apache.spark.ml.python.MLSerDe.dumps(r)
        elif isinstance(r, (JavaArray, JavaList)):
            try:
                r = sc._jvm.org.apache.spark.ml.python.MLSerDe.dumps(r)
            except Py4JJavaError:
                pass  # not picklable

    if isinstance(r, (bytearray, bytes)):
        r = PickleSerializer().loads(bytes(r), encoding=encoding)
    return r

def SpatialRangeQuery(self, spatialRDD: SpatialRDD, rangeQueryWindow: BaseGeometry,
                      considerBoundaryIntersection: bool, usingIndex: bool):
    """
    Run a spatial range query against a SpatialRDD.

    :param spatialRDD: the SpatialRDD to query
    :param rangeQueryWindow: shapely geometry defining the query window
    :param considerBoundaryIntersection: True to return geometries that merely
        intersect the window, False to return only fully contained ones
    :param usingIndex: whether to use the RDD's spatial index, if built
    :return: pyspark.RDD of matching geometries
    """
    jvm = spatialRDD._jvm
    sc = spatialRDD._sc
    jvm_geom = GeometryAdapter.create_jvm_geometry_from_base_geometry(jvm, rangeQueryWindow)
    srdd = jvm.RangeQuery.SpatialRangeQuery(
        spatialRDD._srdd, jvm_geom, considerBoundaryIntersection, usingIndex
    )
    serialized = JvmGeoSparkPythonConverter(jvm).translate_spatial_rdd_to_python(srdd)
    return RDD(serialized, sc, GeoSparkPickler())

def create_python_rdd(self, jrdd, serializer):
    """Creates a Python RDD from an RDD that came from Scala.

    Args:
        jrdd (org.apache.spark.api.java.JavaRDD): The RDD that came from Scala.
        serializer (:class:`~geopyspark.AvroSerializer` or
            pyspark.serializers.AutoBatchedSerializer(AvroSerializer)):
            An instance of ``AvroSerializer``, either alone or wrapped by
            ``AutoBatchedSerializer``.

    Returns:
        ``pyspark.RDD``
    """
    if isinstance(serializer, AutoBatchedSerializer):
        return RDD(jrdd, self.pysc, serializer)
    return RDD(jrdd, self.pysc, AutoBatchedSerializer(serializer))

def __call__(self, rdd: RDD) -> RDD:
    def select_fields(row):
        return Row(**{f: getattr(row, f) for f in self.fields})

    res = rdd.map(select_fields)
    if self.explained:
        self._log.info("toDebugString():\n%s", res.toDebugString().decode())
    return res

def __init__(self, dt_index, rdd, jtsrdd=None, sc=None):
    if jtsrdd is None:
        # Construct from a Python RDD object and a Python DateTimeIndex.
        jvm = rdd.ctx._jvm
        jrdd = rdd._reserialize(_TimeSeriesSerializer())._jrdd.map(
            jvm.com.cloudera.sparkts.BytesToKeyAndSeries())
        self._jtsrdd = jvm.com.cloudera.sparkts.TimeSeriesRDD(
            dt_index._jdt_index, jrdd.rdd())
        RDD.__init__(self, rdd._jrdd, rdd.ctx)
    else:
        # Construct from a py4j.JavaObject pointing to a TimeSeriesRDD
        # and a Python SparkContext.
        jvm = sc._jvm
        jrdd = jvm.org.apache.spark.api.java.JavaRDD(jtsrdd, None).map(
            jvm.com.cloudera.sparkts.KeyAndSeriesToBytes())
        RDD.__init__(self, jrdd, sc, _TimeSeriesSerializer())
        self._jtsrdd = jtsrdd

class ByteTileSchemaTest(BaseTestClass):
    tiles = [
        Tile.from_numpy_array(np.int8([0, 0, 1, 1]).reshape(2, 2), -128),
        Tile.from_numpy_array(np.int8([1, 2, 3, 4]).reshape(2, 2), -128),
        Tile.from_numpy_array(np.int8([5, 6, 7, 8]).reshape(2, 2), -128)
    ]

    sc = BaseTestClass.pysc._jsc.sc()
    tw = BaseTestClass.pysc._jvm.geopyspark.geotrellis.tests.schemas.ByteArrayTileWrapper

    java_rdd = tw.testOut(sc)
    ser = ProtoBufSerializer(tile_decoder, tile_encoder)

    rdd = RDD(java_rdd, BaseTestClass.pysc, AutoBatchedSerializer(ser))
    collected = rdd.collect()

    def test_encoded_tiles(self):
        expected_encoded = [to_pb_tile(x) for x in self.collected]

        for actual, expected in zip(self.tiles, expected_encoded):
            cells = actual.cells
            rows, cols = cells.shape

            self.assertEqual(expected.cols, cols)
            self.assertEqual(expected.rows, rows)
            self.assertEqual(expected.cellType.nd, actual.no_data_value)
            self.assertEqual(expected.cellType.dataType,
                             mapped_data_types[actual.cell_type])

    def test_decoded_tiles(self):
        for actual, expected in zip(self.collected, self.tiles):
            self.assertTrue((actual.cells == expected.cells).all())
            self.assertTrue(actual.cells.dtype == expected.cells.dtype)
            self.assertEqual(actual.cells.shape, expected.cells.shape)

def from_rdd(cls, rdd: RDD, job_id: str, namespace: str, name: str):
    partitions = rdd.getNumPartitions()
    return RDDTable(session_id=job_id,
                    namespace=namespace,
                    name=name,
                    partitions=partitions,
                    rdd=rdd)

def extract_item_map(wikidata_items: RDD):
    def parse_item(item):
        if 'en' in item['labels']:
            return item['id'], item['labels']['en']['value']
        return None

    return wikidata_items.map(parse_item).filter(lambda i: i is not None).collectAsMap()

def test_multiple_python_java_RDD_conversions(self):
    # Regression test for SPARK-5361
    data = [
        (u'1', {u'director': u'David Lean'}),
        (u'2', {u'director': u'Andrew Dominik'})
    ]
    data_rdd = self.sc.parallelize(data)
    data_java_rdd = data_rdd._to_java_object_rdd()
    data_python_rdd = self.sc._jvm.SerDeUtil.javaToPython(data_java_rdd)
    converted_rdd = RDD(data_python_rdd, self.sc)
    self.assertEqual(2, converted_rdd.count())

    # Conversion between Python and Java RDDs used to throw exceptions here.
    data_java_rdd = converted_rdd._to_java_object_rdd()
    data_python_rdd = self.sc._jvm.SerDeUtil.javaToPython(data_java_rdd)
    converted_rdd = RDD(data_python_rdd, self.sc)
    self.assertEqual(2, converted_rdd.count())

def extract_item_page_map(wikidata_items: RDD):
    def parse_item_page(item):
        item_id = item['id']
        if 'enwiki' in item['sitelinks']:
            return [(item_id, item['sitelinks']['enwiki']['title'])]
        return []

    return wikidata_items.flatMap(parse_item_page).collectAsMap()

def __init__(self, rdd, file_type='CSV', t_rdd=None, sc=None):
    if rdd is not None:
        jvm = rdd.ctx._jvm
        java_import(jvm, ClassNames.BYTES_TO_STRING)
        java_import(jvm, ClassNames.TRANSFORMABLE_RDD)
        self.__set_file_type(jvm, file_type)
        self.spark_context = rdd.ctx
        java_rdd = rdd._reserialize(BuddySerializer())._jrdd.map(jvm.BytesToString())
        self._transformable_rdd = jvm.JavaTransformableRDD(java_rdd, self.__file_type)
        RDD.__init__(self, rdd._jrdd, rdd.ctx)
    else:
        jvm = sc._jvm
        java_import(jvm, ClassNames.STRING_TO_BYTES)
        self.spark_context = sc
        self.__set_file_type(jvm, file_type)
        self._transformable_rdd = t_rdd
        rdd = t_rdd.map(jvm.StringToBytes())
        RDD.__init__(self, rdd, sc, BuddySerializer())

def extract_claim_types(wikidata_items: RDD):
    def parse_types(item):
        value_types = []
        for property_claims in item['claims'].values():
            for c in property_claims:
                mainsnak = c['mainsnak']
                if 'datatype' in mainsnak:
                    value_types.append(mainsnak['datatype'])
        return value_types

    return set(wikidata_items.flatMap(parse_types).distinct().collect())

def predict(self, x):
    """
    Predict the label of one or more examples.

    :param x: Data point (feature vector), or an RDD of data points
        (feature vectors).
    """
    pythonAPI = self._sc._jvm.PythonMLLibAPI()
    if isinstance(x, RDD):
        # Bulk prediction.
        if x.count() == 0:
            return self._sc.parallelize([])
        dataBytes = _get_unmangled_double_vector_rdd(x, cache=False)
        jSerializedPreds = \
            pythonAPI.predictDecisionTreeModel(self._java_model, dataBytes._jrdd)
        serializedPreds = RDD(jSerializedPreds, self._sc, NoOpSerializer())
        return serializedPreds.map(lambda b: _deserialize_double(bytearray(b)))
    else:
        # Assume x is a single data point.
        x_ = _serialize_double_vector(x)
        return pythonAPI.predictDecisionTreeModel(self._java_model, x_)

def evaluate(self, labels_and_predictions: RDD) -> float:
    tp = labels_and_predictions \
        .map(lambda x: (set(x[0]), set(features for features, weights in x[1][:self._pred_n]))) \
        .filter(lambda x: len(x[0].intersection(x[1])) >= self._intersect_n)
    accuracy = 100.0 * tp.count() / labels_and_predictions.count()
    if self._verbose:
        print('accuracy: ', accuracy)
    self._results.append(accuracy)
    return accuracy

def clean_claims(claims: RDD, b_item_map: Broadcast):
    def clean(claim):
        item_map = b_item_map.value
        if claim.datatype == 'wikibase-item':
            if claim.object in item_map:
                return claim._replace(object=item_map[claim.object])
            return None
        elif claim.datatype == 'quantity':
            unit = claim.object.unit.split('/')[-1]
            if unit in item_map:
                return claim._replace(object=item_map[unit])
            return None
        return claim

    dt_filter = {'wikibase-item', 'string', 'monolingualtext', 'quantity', 'time'}
    return claims.filter(lambda c: c.datatype in dt_filter) \
        .map(clean) \
        .filter(lambda c: c is not None)

def evaluate(self, labels_and_predictions: RDD):
    result = labels_and_predictions \
        .map(lambda p: _hamming_loss(p[0], p[1])) \
        .mean()
    self._results.append(result)
    return result

def shuffle_and_split(data: RDD, fold_n: int, seed: int = 0) -> list:
    fold_weights = [1 / fold_n] * fold_n
    return data.randomSplit(fold_weights, seed)

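# Usage sketch: k-fold cross-validation on top of shuffle_and_split, assuming
# an RDD `data` and train/evaluate helpers defined elsewhere. randomSplit
# makes the folds disjoint, so each union below covers all but the test fold.
#
#   folds = shuffle_and_split(data, fold_n=5, seed=42)
#   for i, test_fold in enumerate(folds):
#       train = None
#       for j, fold in enumerate(folds):
#           if j != i:
#               train = fold if train is None else train.union(fold)
#       # model = train_model(train); score = evaluate(model, test_fold)
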
def test_null_in_rdd(self):
    jrdd = self.sc._jvm.PythonUtils.generateRDDWithNull(self.sc._jsc)
    rdd = RDD(jrdd, self.sc, UTF8Deserializer())
    self.assertEqual([u"a", None, u"b"], rdd.collect())
    rdd = RDD(jrdd, self.sc, NoOpSerializer())
    self.assertEqual([b"a", None, b"b"], rdd.collect())
