Example #1
    def loadLabeledPoints(sc, path, minPartitions=None):
        """
        Load labeled points saved using RDD.saveAsTextFile.

        :param sc: Spark context
        :param path: file or directory path in any Hadoop-supported file
                     system URI
        :param minPartitions: min number of partitions
        :return: labeled data stored as an RDD of LabeledPoint

        >>> from tempfile import NamedTemporaryFile
        >>> from pyspark.mllib.util import MLUtils
        >>> examples = [LabeledPoint(1.1, Vectors.sparse(3, [(0, -1.23), (2, 4.56e-7)])), \
                        LabeledPoint(0.0, Vectors.dense([1.01, 2.02, 3.03]))]
        >>> tempFile = NamedTemporaryFile(delete=True)
        >>> tempFile.close()
        >>> sc.parallelize(examples, 1).saveAsTextFile(tempFile.name)
        >>> loaded = MLUtils.loadLabeledPoints(sc, tempFile.name).collect()
        >>> type(loaded[0]) == LabeledPoint
        True
        >>> print(examples[0])
        (1.1,(3,[0,2],[-1.23,4.56e-07]))
        >>> type(examples[1]) == LabeledPoint
        True
        >>> print(examples[1])
        (0.0,[1.01,2.02,3.03])
        """
        minPartitions = minPartitions or min(sc.defaultParallelism, 2)
        jrdd = sc._jvm.PythonMLLibAPI().loadLabeledPoints(
            sc._jsc, path, minPartitions)
        jpyrdd = sc._jvm.SerDe.javaToPython(jrdd)
        return RDD(jpyrdd, sc, AutoBatchedSerializer(PickleSerializer()))
Example #2
def joinWithCassandraTable(dstream, keyspace, table, selected_columns=None,
                           join_columns=None):
    """Joins a DStream (a stream of RDDs) with a Cassandra table

    Arguments:
        @param dstream(DStream):
            The DStream to join. Equal to self when invoking
            joinWithCassandraTable on a monkey-patched RDD.
        @param keyspace(string):
            The keyspace to join on.
        @param table(string):
            The CQL table to join on.
        @param selected_columns(string):
            The columns to select from the Cassandra table.
        @param join_columns(string):
            The columns used to join on from the Cassandra table.
    """

    ssc = dstream._ssc
    ctx = ssc._sc
    gw = ctx._gateway

    selected_columns = as_java_array(
        gw, "String", selected_columns) if selected_columns else None
    join_columns = as_java_array(gw, "String",
                                 join_columns) if join_columns else None

    h = helper(ctx)
    dstream = h.joinWithCassandraTable(dstream._jdstream, keyspace, table,
                                       selected_columns,
                                       join_columns)
    dstream = h.pickleRows(dstream)
    dstream = h.javaDStream(dstream)

    return DStream(dstream, ssc, AutoBatchedSerializer(PickleSerializer()))
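A minimal usage sketch of the helper above, assuming the pyspark-cassandra helpers it relies on (`helper`, `as_java_array`) are importable and a reachable Cassandra cluster exists; the keyspace, table, and column names below are illustrative only, not taken from the original source:

# Hedged sketch: join a DStream of primary-key rows against a Cassandra table.
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

sc = SparkContext(appName="cassandra-join-sketch")
ssc = StreamingContext(sc, batchDuration=5)

# Each element carries the key columns of the target table.
key_stream = ssc.queueStream([sc.parallelize([{"user_id": 1}, {"user_id": 2}])])

joined = joinWithCassandraTable(
    key_stream, "ks", "users",
    selected_columns=["user_id", "name"],  # columns pulled from Cassandra
    join_columns=["user_id"])              # columns used as the join key

joined.pprint()
ssc.start()
ssc.awaitTermination()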
Example #3
 def __init__(self,
              aggregator,
              memory_limit=512,
              serializer=None,
              localdirs=None,
              scale=1,
              partitions=59,
              batch=1000):
     Merger.__init__(self, aggregator)
     self.memory_limit = memory_limit
     # default serializer is only used for tests
     self.serializer = serializer or AutoBatchedSerializer(
         PickleSerializer())
     self.localdirs = localdirs or _get_local_dirs(str(id(self)))
     # number of partitions used when spilling data to disk
     self.partitions = partitions
     # check memory usage after this many items have been merged
     self.batch = batch
     # scale is used to scale down the hash of the key for the recursive hash map
     self.scale = scale
     # unpartitioned merged data
     self.data = {}
     # partitioned merged data, list of dicts
     self.pdata = []
     # number of chunks spilled to disk
     self.spills = 0
     # randomize the hash of the key; id(o) is the address of o (aligned to 8 bytes)
     self._seed = id(self) + 7
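To show where these knobs come into play, here is a hedged sketch of how an ExternalMerger is typically driven; it mirrors what combineByKey does internally, but pyspark.shuffle is a private module, so the exact API may differ across Spark versions:

# Hedged sketch using pyspark's internal shuffle API: count word occurrences
# through an ExternalMerger, which spills to disk once memory_limit is exceeded.
from pyspark.shuffle import Aggregator, ExternalMerger

agg = Aggregator(lambda v: v,              # createCombiner
                 lambda c, v: c + v,       # mergeValue
                 lambda c1, c2: c1 + c2)   # mergeCombiners

merger = ExternalMerger(agg, memory_limit=512)
merger.mergeValues(iter([("a", 1), ("b", 1), ("a", 1)]))  # feed (key, value) pairs
print(sorted(merger.items()))                             # [('a', 2), ('b', 1)]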
Example #4
class SpatialKeySchemaTest(BaseTestClass):
    expected_keys = {'col': 7, 'row': 3}

    sc = BaseTestClass.pysc._jsc.sc()
    ew = BaseTestClass.pysc._jvm.geopyspark.geotrellis.tests.schemas.SpatialKeyWrapper

    java_rdd = ew.testOut(sc)
    ser = ProtoBufSerializer(spatial_key_decoder, spatial_key_encoder)

    rdd = RDD(java_rdd, BaseTestClass.pysc, AutoBatchedSerializer(ser))
    collected = rdd.first()._asdict()

    @pytest.fixture(autouse=True)
    def tearDown(self):
        yield
        BaseTestClass.pysc._gateway.close()

    def result_checker(self, actual_keys, expected_keys):
        self.assertDictEqual(actual_keys, expected_keys)

    def test_encoded_keys(self):
        actual_encoded = [spatial_key_encoder(x) for x in self.rdd.collect()]
        proto_spatial_key = keyMessages_pb2.ProtoSpatialKey()

        proto_spatial_key.col = 7
        proto_spatial_key.row = 3

        expected_encoded = proto_spatial_key.SerializeToString()

        self.assertEqual(actual_encoded[0], expected_encoded)

    def test_decoded_extents(self):
        self.assertDictEqual(self.collected, self.expected_keys)
Example #5
class SpatialKeySchemaTest(BaseTestClass):
    expected_keys = {'col': 7, 'row': 3}

    sc = BaseTestClass.geopysc.pysc._jsc.sc()
    ew = BaseTestClass.geopysc.pysc._jvm.geopyspark.geotrellis.tests.schemas.SpatialKeyWrapper

    tup = ew.testOut(sc)
    java_rdd = tup._1()
    ser = AvroSerializer(tup._2())

    rdd = RDD(java_rdd, BaseTestClass.geopysc.pysc, AutoBatchedSerializer(ser))
    collected = rdd.first()

    @pytest.fixture(autouse=True)
    def tearDown(self):
        yield
        BaseTestClass.geopysc.pysc._gateway.close()

    def result_checker(self, actual_keys, expected_keys):
        self.assertDictEqual(actual_keys, expected_keys)

    def test_encoded_keys(self):
        encoded = self.rdd.map(lambda s: s)
        actual_encoded = encoded.first()

        self.result_checker(actual_encoded, self.expected_keys)

    def test_decoded_extents(self):
        self.assertDictEqual(self.collected, self.expected_keys)
Example #6
class ByteTileSchemaTest(BaseTestClass):
    tiles = [
        Tile.from_numpy_array(np.int8([0, 0, 1, 1]).reshape(2, 2), -128),
        Tile.from_numpy_array(np.int8([1, 2, 3, 4]).reshape(2, 2), -128),
        Tile.from_numpy_array(np.int8([5, 6, 7, 8]).reshape(2, 2), -128)
    ]

    sc = BaseTestClass.pysc._jsc.sc()
    tw = BaseTestClass.pysc._jvm.geopyspark.geotrellis.tests.schemas.ByteArrayTileWrapper

    java_rdd = tw.testOut(sc)
    ser = ProtoBufSerializer(tile_decoder, tile_encoder)

    rdd = RDD(java_rdd, BaseTestClass.pysc, AutoBatchedSerializer(ser))
    collected = rdd.collect()

    def test_encoded_tiles(self):
        expected_encoded = [to_pb_tile(x) for x in self.collected]

        for actual, expected in zip(self.tiles, expected_encoded):
            cells = actual.cells
            rows, cols = cells.shape

            self.assertEqual(expected.cols, cols)
            self.assertEqual(expected.rows, rows)
            self.assertEqual(expected.cellType.nd, actual.no_data_value)
            self.assertEqual(expected.cellType.dataType,
                             mapped_data_types[actual.cell_type])

    def test_decoded_tiles(self):
        for actual, expected in zip(self.collected, self.tiles):
            self.assertTrue((actual.cells == expected.cells).all())
            self.assertTrue(actual.cells.dtype == expected.cells.dtype)
            self.assertEqual(actual.cells.shape, expected.cells.shape)
Example #7
class ProjectedExtentSchemaTest(BaseTestClass):
    projected_extents = [
        {'epsg': 2004, 'extent': {'xmax': 1.0, 'xmin': 0.0, 'ymax': 1.0, 'ymin': 0.0}, 'proj4': None},
        {'epsg': 2004, 'extent': {'xmax': 3.0, 'xmin': 1.0, 'ymax': 4.0, 'ymin': 2.0}, 'proj4': None},
        {'epsg': 2004, 'extent': {'xmax': 7.0, 'xmin': 5.0, 'ymax': 8.0, 'ymin': 6.0}, 'proj4': None}]

    sc = BaseTestClass.geopysc.pysc._jsc.sc()
    ew = BaseTestClass.geopysc.pysc._jvm.geopyspark.geotrellis.tests.schemas.ProjectedExtentWrapper

    tup = ew.testOut(sc)
    java_rdd = tup._1()
    ser = AvroSerializer(tup._2())

    rdd = RDD(java_rdd, BaseTestClass.geopysc.pysc, AutoBatchedSerializer(ser))
    collected = rdd.collect()

    @pytest.fixture(autouse=True)
    def tearDown(self):
        yield
        BaseTestClass.geopysc.pysc._gateway.close()

    def result_checker(self, actual_pe, expected_pe):
        for actual, expected in zip(actual_pe, expected_pe):
            self.assertDictEqual(actual, expected)

    def test_encoded_pextents(self):
        encoded = self.rdd.map(lambda s: s)
        actual_encoded = encoded.collect()

        self.result_checker(actual_encoded, self.projected_extents)

    def test_decoded_pextents(self):
        self.result_checker(self.collected, self.projected_extents)
Example #8
class ByteTileSchemaTest(BaseTestClass):
    tiles = [
        {'data': np.array([0, 0, 1, 1]).reshape(2, 2), 'no_data_value': -128},
        {'data': np.array([1, 2, 3, 4]).reshape(2, 2), 'no_data_value': -128},
        {'data': np.array([5, 6, 7, 8]).reshape(2, 2), 'no_data_value': -128}
    ]

    sc = BaseTestClass.geopysc.pysc._jsc.sc()
    tw = BaseTestClass.geopysc.pysc._jvm.geopyspark.geotrellis.tests.schemas.ByteArrayTileWrapper

    tup = tw.testOut(sc)
    java_rdd = tup._1()
    ser = AvroSerializer(tup._2(), AvroRegistry.tile_decoder, AvroRegistry.tile_encoder)

    rdd = RDD(java_rdd, BaseTestClass.geopysc.pysc, AutoBatchedSerializer(ser))
    collected = rdd.collect()

    def test_encoded_tiles(self):
        encoded = self.rdd.map(lambda s: AvroRegistry.tile_encoder(s))
        actual_encoded = encoded.collect()

        expected_encoded = [
            {'bands': [{'cols': 2, 'rows': 2, 'cells': bytearray([0, 0, 1, 1]), 'noDataValue': -128}]},
            {'bands': [{'cols': 2, 'rows': 2, 'cells': bytearray([1, 2, 3, 4]), 'noDataValue': -128}]},
            {'bands': [{'cols': 2, 'rows': 2, 'cells': bytearray([5, 6, 7, 8]), 'noDataValue': -128}]}
        ]

        for actual, expected in zip(actual_encoded, expected_encoded):
            self.assertEqual(actual, expected)

    def test_decoded_tiles(self):
        for actual, expected in zip(self.collected, self.tiles):
            self.assertTrue((actual['data'] == expected['data']).all())
Example #9
    def registerFunction(self, name, f, returnType=StringType()):
        """Registers a lambda function as a UDF so it can be used in SQL statements.

        In addition to a name and the function itself, the return type can be optionally specified.
        When the return type is not given, it defaults to a string and conversion will be done
        automatically.  For any other return type, the produced object must match the specified type.

        :param name: name of the UDF
        :param f: python function or lambda to register
        :param returnType: a :class:`DataType` object

        >>> sqlContext.registerFunction("stringLengthString", lambda x: len(x))
        >>> sqlContext.sql("SELECT stringLengthString('test')").collect()
        [Row(c0=u'4')]

        >>> from pyspark.sql.types import IntegerType
        >>> sqlContext.registerFunction("stringLengthInt", lambda x: len(x), IntegerType())
        >>> sqlContext.sql("SELECT stringLengthInt('test')").collect()
        [Row(c0=4)]

        >>> from pyspark.sql.types import IntegerType
        >>> sqlContext.udf.register("stringLengthInt", lambda x: len(x), IntegerType())
        >>> sqlContext.sql("SELECT stringLengthInt('test')").collect()
        [Row(c0=4)]
        """
        func = lambda _, it: map(lambda x: f(*x), it)
        ser = AutoBatchedSerializer(PickleSerializer())
        command = (func, None, ser, ser)
        pickled_cmd, bvars, env, includes = _prepare_for_python_RDD(
            self._sc, command, self)
        self._ssql_ctx.udf().registerPython(name, bytearray(pickled_cmd), env,
                                            includes, self._sc.pythonExec,
                                            self._sc.pythonVer, bvars,
                                            self._sc._javaAccumulator,
                                            returnType.json())
Example #10
class TupleSchemaTest(BaseTestClass):
    extent = {
        'epsg': 2004,
        'extent': {
            'xmax': 1.0,
            'xmin': 0.0,
            'ymax': 1.0,
            'ymin': 0.0
        },
        'proj4': None
    }

    arr = np.int8([0, 0, 1, 1]).reshape(2, 2)
    bands = [arr, arr, arr]
    multiband_tile = np.array(bands)
    multiband_dict = Tile(multiband_tile, 'BYTE', -128)

    sc = BaseTestClass.pysc._jsc.sc()
    ew = BaseTestClass.pysc._jvm.geopyspark.geotrellis.tests.schemas.TupleWrapper

    java_rdd = ew.testOut(sc)

    decoder = create_partial_tuple_decoder(key_type="ProjectedExtent")
    encoder = create_partial_tuple_encoder(key_type="ProjectedExtent")

    ser = ProtoBufSerializer(decoder, encoder)
    rdd = RDD(java_rdd, BaseTestClass.pysc, AutoBatchedSerializer(ser))
    collected = rdd.collect()

    @pytest.mark.skipif(
        'TRAVIS' in os.environ,
        reason="Encoding using methods in Main causes issues on Travis")
    def test_encoded_tuples(self):
        proto_tuple = tupleMessages_pb2.ProtoTuple()

        self.extent['extent'] = Extent(**self.extent['extent'])
        proto_extent = to_pb_projected_extent(ProjectedExtent(**self.extent))
        proto_multiband = to_pb_multibandtile(self.multiband_dict)

        proto_tuple.projectedExtent.CopyFrom(proto_extent)
        proto_tuple.tiles.CopyFrom(proto_multiband)

        bs = proto_tuple.SerializeToString()
        expected_encoded = [self.ser.dumps(x) for x in self.collected]

        for expected in expected_encoded:
            self.assertEqual(bs, expected)

    def test_decoded_tuples(self):
        expected_tuples = [(self.extent, self.multiband_dict),
                           (self.extent, self.multiband_dict),
                           (self.extent, self.multiband_dict)]

        for actual, expected in zip(self.collected, expected_tuples):
            (actual_extent, actual_tile) = actual
            (expected_extent, expected_tile) = expected

            self.assertTrue((actual_tile.cells == expected_tile.cells).all())
            self.assertDictEqual(actual_extent._asdict(), expected_extent)
Example #11
def _to_java_object_rdd(rdd: RDD) -> JavaObject:
    """Return a JavaRDD of Object by unpickling

    It will convert each Python object into a Java object by Pickle, whether the
    RDD is serialized in batch or not.
    """
    rdd = rdd._reserialize(AutoBatchedSerializer(CPickleSerializer()))  # type: ignore[attr-defined]
    return rdd.ctx._jvm.org.apache.spark.mllib.api.python.SerDe.pythonToJava(rdd._jrdd, True)  # type: ignore[attr-defined]
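As a quick illustration (assuming the function above is in scope and a live SparkContext is available), the returned handle behaves like any other JavaRDD on the JVM side:

# Hedged sketch: convert a Python RDD and operate on it from the JVM side.
from pyspark import SparkContext

sc = SparkContext(appName="to-java-object-rdd-sketch")
rdd = sc.parallelize([(i, i * i) for i in range(1000)])

jrdd = _to_java_object_rdd(rdd)  # JavaRDD whose elements are now Java objects
print(jrdd.count())              # any JavaRDD method can be called via py4j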
Example #12
def _to_java_object_rdd(rdd):
    """ Return an JavaRDD of Object by unpickling

    It will convert each Python object into Java object by Pyrolite, whenever the
    RDD is serialized in batch or not.
    """
    rdd = rdd._reserialize(AutoBatchedSerializer(PickleSerializer()))
    return rdd.ctx._jvm.SerDe.pythonToJava(rdd._jrdd, True)
Example #13
class TemporalProjectedExtentSchemaTest(BaseTestClass):
    extents = [
        Extent(0.0, 0.0, 1.0, 1.0),
        Extent(1.0, 2.0, 3.0, 4.0),
        Extent(5.0, 6.0, 7.0, 8.0),
    ]

    time = datetime.datetime.strptime("2016-08-24T09:00:00Z",
                                      '%Y-%m-%dT%H:%M:%SZ')

    expected_tpextents = [
        TemporalProjectedExtent(epsg=2004, extent=extents[0],
                                instant=time)._asdict(),
        TemporalProjectedExtent(epsg=2004, extent=extents[1],
                                instant=time)._asdict(),
        TemporalProjectedExtent(epsg=2004, extent=extents[2],
                                instant=time)._asdict()
    ]

    sc = BaseTestClass.pysc._jsc.sc()
    ew = BaseTestClass.pysc._jvm.geopyspark.geotrellis.tests.schemas.TemporalProjectedExtentWrapper

    java_rdd = ew.testOut(sc)
    ser = ProtoBufSerializer(temporal_projected_extent_decoder,
                             temporal_projected_extent_encoder)

    rdd = RDD(java_rdd, BaseTestClass.pysc, AutoBatchedSerializer(ser))
    collected = [tpex._asdict() for tpex in rdd.collect()]

    @pytest.fixture(scope='class', autouse=True)
    def tearDown(self):
        yield
        BaseTestClass.pysc._gateway.close()

    def result_checker(self, actual_tpe, expected_tpe):
        for actual, expected in zip(actual_tpe, expected_tpe):
            self.assertDictEqual(actual, expected)

    def test_encoded_tpextents(self):
        actual_encoded = [
            temporal_projected_extent_encoder(x) for x in self.rdd.collect()
        ]

        for x in range(0, len(self.expected_tpextents)):
            self.expected_tpextents[x]['extent'] = Extent(
                **self.expected_tpextents[x]['extent'])

        expected_encoded = [
            to_pb_temporal_projected_extent(TemporalProjectedExtent(**ex)).SerializeToString() \
            for ex in self.expected_tpextents
        ]

        for actual, expected in zip(actual_encoded, expected_encoded):
            self.assertEqual(actual, expected)

    def test_decoded_tpextents(self):
        self.result_checker(self.collected, self.expected_tpextents)
Example #14
def _to_java_object_rdd(rdd: RDD) -> JavaObject:
    """Return an JavaRDD of Object by unpickling

    It will convert each Python object into Java object by Pickle, whenever the
    RDD is serialized in batch or not.
    """
    rdd = rdd._reserialize(AutoBatchedSerializer(CPickleSerializer()))
    assert rdd.ctx._jvm is not None
    return rdd.ctx._jvm.org.apache.spark.ml.python.MLSerDe.pythonToJava(
        rdd._jrdd, True)
Example #15
    def create_tuple_serializer(self, schema, key_type, value_type):
        decoder = \
                self.avroregistry.create_partial_tuple_decoder(key_type=key_type,
                                                               value_type=value_type)

        encoder = \
                self.avroregistry.create_partial_tuple_encoder(key_type=key_type,
                                                               value_type=value_type)

        return AutoBatchedSerializer(AvroSerializer(schema, decoder, encoder))
Example #16
class MultibandSchemaTest(BaseTestClass):
    arr = np.int8([0, 0, 1, 1]).reshape(2, 2)
    no_data = -128
    arr_dict = Tile(arr, 'BYTE', no_data)
    band_dicts = [arr_dict, arr_dict, arr_dict]

    bands = [arr, arr, arr]
    multiband_tile = np.array(bands)
    multiband_dict = Tile(multiband_tile, 'BYTE', no_data)

    sc = BaseTestClass.pysc._jsc.sc()
    mw = BaseTestClass.pysc._jvm.geopyspark.geotrellis.tests.schemas.ArrayMultibandTileWrapper

    java_rdd = mw.testOut(sc)
    ser = ProtoBufSerializer(multibandtile_decoder, multibandtile_encoder)

    rdd = RDD(java_rdd, BaseTestClass.pysc, AutoBatchedSerializer(ser))
    collected = rdd.collect()

    @pytest.fixture(autouse=True)
    def tearDown(self):
        yield
        BaseTestClass.pysc._gateway.close()

    def test_encoded_multibands(self):
        actual_encoded = [multibandtile_encoder(x) for x in self.collected]

        proto_tile = tileMessages_pb2.ProtoTile()
        cell_type = tileMessages_pb2.ProtoCellType()

        cell_type.nd = self.no_data
        cell_type.hasNoData = True
        cell_type.dataType = 1

        proto_tile.cols = 2
        proto_tile.rows = 2
        proto_tile.sint32Cells.extend(self.arr.flatten().tolist())
        proto_tile.cellType.CopyFrom(cell_type)

        proto_multiband = tileMessages_pb2.ProtoMultibandTile()
        proto_multiband.tiles.extend([proto_tile, proto_tile, proto_tile])
        bs = proto_multiband.SerializeToString()

        expected_encoded = [bs, bs, bs]

        for actual, expected in zip(actual_encoded, expected_encoded):
            self.assertEqual(actual, expected)

    def test_decoded_multibands(self):
        expected_multibands = [
            self.multiband_dict, self.multiband_dict, self.multiband_dict
        ]

        for actual, expected in zip(self.collected, expected_multibands):
            self.assertTrue((actual.cells == expected.cells).all())
Example #17
def wrap_function(func, profiler=None):
    def pickle_command(command):
        # the serialized command will be compressed by broadcast
        ser = CloudPickleSerializer()
        pickled_command = ser.dumps(command)
        return pickled_command

    ser = AutoBatchedSerializer(PickleSerializer())
    command = (func, profiler, NoOpSerializer(), ser)
    pickled_command = pickle_command(command)
    return bytearray(pickled_command)
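Since wrap_function only pickles locally, it can be exercised without a running JVM. A small sketch, assuming the pyspark.serializers imports used by the function above are already in scope; the callable name is made up:

# Hedged sketch: the wrapped callable follows the (split_index, iterator) convention.
def times_two(split, iterator):
    return (x * 2 for x in iterator)

payload = wrap_function(times_two)
print(type(payload), len(payload))  # a bytearray ready to hand to the JVM side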
Example #18
def _regression_train_wrapper(sc, train_func, modelClass, data, initial_weights):
    initial_weights = initial_weights or [0.0] * len(data.first().features)
    ser = PickleSerializer()
    initial_bytes = bytearray(ser.dumps(_convert_to_vector(initial_weights)))
    # use AutoBatchedSerializer before caching to reduce the memory
    # overhead in the JVM
    cached = data._reserialize(AutoBatchedSerializer(ser)).cache()
    ans = train_func(_to_java_object_rdd(cached), initial_bytes)
    assert len(ans) == 2, "JVM call result had unexpected length"
    weights = ser.loads(str(ans[0]))
    return modelClass(weights, ans[1])
Example #19
def _to_java_object_rdd(rdd):
    """ 
    Return a JavaRDD of Object by unpickling
    It will convert each Python object into Java object by Pyrolite, whenever the
    RDD is serialized in batch or not.
    
    :param rdd: The spark rdd
    """
    rdd = rdd._reserialize(AutoBatchedSerializer(PickleSerializer()))
    return rdd.ctx._jvm.org.apache.spark.mllib.api.python.SerDe.pythonToJava(
        rdd._jrdd, True)
Example #20
class FeatureCellValueSchemaTest(BaseTestClass):
    sc = BaseTestClass.pysc._jsc.sc()
    fw = BaseTestClass.pysc._jvm.geopyspark.geotrellis.tests.schemas.FeatureCellValueWrapper

    java_rdd = fw.testOut(sc)
    ser = ProtoBufSerializer(feature_cellvalue_decoder,
                             feature_cellvalue_encoder)

    rdd = RDD(java_rdd, BaseTestClass.pysc, AutoBatchedSerializer(ser))

    point = Point(0, 2)
    line_1 = LineString(
        [point, Point(1, 3),
         Point(2, 4),
         Point(3, 5),
         Point(4, 6)])
    line_2 = LineString(
        [Point(5, 7),
         Point(6, 8),
         Point(7, 9),
         Point(8, 10),
         Point(9, 11)])
    multi_line = MultiLineString([line_1, line_2])

    features = [
        Feature(point, CellValue(2, 1)),
        Feature(line_1, CellValue(1, 0)),
        Feature(multi_line, CellValue(1, 0))
    ]

    collected = rdd.collect()

    @pytest.fixture(autouse=True)
    def tearDown(self):
        yield
        BaseTestClass.pysc._gateway.close()

    def test_decoder(self):
        geoms = [g.geometry for g in self.collected]
        ms = [m.properties for m in self.collected]

        for x in self.features:
            self.assertTrue(x.geometry in geoms)
            self.assertTrue(x.properties in ms)

    def test_encoder(self):
        expected_encoded = [
            to_pb_feature_cellvalue(f).SerializeToString()
            for f in self.features
        ]
        actual_encoded = [feature_cellvalue_encoder(f) for f in self.collected]

        for x in expected_encoded:
            self.assertTrue(x in actual_encoded)
Example #21
 def test_hash_serializer(self):
     hash(NoOpSerializer())
     hash(UTF8Deserializer())
     hash(CPickleSerializer())
     hash(MarshalSerializer())
     hash(AutoSerializer())
     hash(BatchedSerializer(CPickleSerializer()))
     hash(AutoBatchedSerializer(MarshalSerializer()))
     hash(PairDeserializer(NoOpSerializer(), UTF8Deserializer()))
     hash(CartesianDeserializer(NoOpSerializer(), UTF8Deserializer()))
     hash(CompressedSerializer(CPickleSerializer()))
     hash(FlattenedValuesSerializer(CPickleSerializer()))
Example #22
 def _prepare(cls, ratings):
     assert isinstance(ratings, RDD), "ratings should be RDD"
     first = ratings.first()
     if not isinstance(first, Rating):
         if isinstance(first, (tuple, list)):
             ratings = ratings.map(lambda x: Rating(*x))
         else:
             raise ValueError(
                 "rating should be RDD of Rating or tuple/list")
     # serialize with AutoBatchedSerializer before caching to reduce
     # per-object overhead in the JVM
     cached = ratings._reserialize(AutoBatchedSerializer(
         PickleSerializer())).cache()
     return cached._to_java_object_rdd()
Example #23
 def _create_judf(self):
     f = self.func  # put it in closure `func`
     func = lambda _, it: map(lambda x: f(*x), it)
     ser = AutoBatchedSerializer(PickleSerializer())
     command = (func, None, ser, ser)
     sc = SparkContext._active_spark_context
     pickled_command, broadcast_vars, env, includes = _prepare_for_python_RDD(sc, command, self)
     ssql_ctx = sc._jvm.SQLContext(sc._jsc.sc())
     jdt = ssql_ctx.parseDataType(self.returnType.json())
     fname = f.__name__ if hasattr(f, '__name__') else f.__class__.__name__
     judf = sc._jvm.UserDefinedPythonFunction(fname, bytearray(pickled_command), env, includes,
                                              sc.pythonExec, sc.pythonVer, broadcast_vars,
                                              sc._javaAccumulator, jdt)
     return judf
Example #24
def _java2py(sc, r):
    if isinstance(r, JavaObject):
        clsName = r.getClass().getSimpleName()
        if clsName in ("RDD", "JavaRDD"):
            if clsName == "RDD":
                r = r.toJavaRDD()
            jrdd = sc._jvm.SerDe.javaToPython(r)
            return RDD(jrdd, sc, AutoBatchedSerializer(PickleSerializer()))

        elif clsName in _picklable_classes:
            r = sc._jvm.SerDe.dumps(r)

    if isinstance(r, bytearray):
        r = PickleSerializer().loads(str(r))
    return r
Example #25
    def create_python_rdd(self, jrdd, serializer):
        """Creates a Python RDD from a RDD from Scala.

        Args:
            jrdd (org.apache.spark.api.java.JavaRDD): The RDD that came from Scala.
            serializer (:class:`~geopyspark.AvroSerializer` or pyspark.serializers.AutoBatchedSerializer(AvroSerializer)):
                An instance of ``AvroSerializer`` that is either alone, or wrapped by ``AutoBatchedSerializer``.

        Returns:
            ``pyspark.RDD``
        """

        if isinstance(serializer, AutoBatchedSerializer):
            return RDD(jrdd, self.pysc, serializer)
        else:
            return RDD(jrdd, self.pysc, AutoBatchedSerializer(serializer))
Example #26
class ExtentSchemaTest(BaseTestClass):
    ew = BaseTestClass.pysc._gateway.jvm.geopyspark.geotrellis.tests.schemas.ExtentWrapper
    java_rdd = ew.testOut(BaseTestClass.pysc._jsc.sc())
    ser = ProtoBufSerializer(extent_decoder, extent_encoder)
    rdd = RDD(java_rdd, BaseTestClass.pysc, AutoBatchedSerializer(ser))
    collected = rdd.collect()

    expected_extents = [{
        "xmin": 0.0,
        "ymin": 0.0,
        "xmax": 1.0,
        "ymax": 1.0
    }, {
        "xmin": 1.0,
        "ymin": 2.0,
        "xmax": 3.0,
        "ymax": 4.0
    }, {
        "xmin": 5.0,
        "ymin": 6.0,
        "xmax": 7.0,
        "ymax": 8.0
    }]

    @pytest.fixture(scope='class', autouse=True)
    def tearDown(self):
        yield
        BaseTestClass.pysc._gateway.close()

    def result_checker(self, actual_result, expected_result):
        for actual, expected in zip(actual_result, expected_result):
            self.assertDictEqual(actual, expected)

    def test_decoded_extents(self):
        actual_decoded = [
            from_pb_extent(ex)._asdict() for ex in self.collected
        ]
        self.result_checker(actual_decoded, self.expected_extents)

    def test_encoded_extents(self):
        expected_encoded = [
            to_pb_extent(Extent(**x)).SerializeToString()
            for x in self.expected_extents
        ]
        actual_encoded = [extent_encoder(x) for x in self.collected]
        for actual, expected in zip(actual_encoded, expected_encoded):
            self.assertEqual(actual, expected)
Example #27
def pyspark_SizeEstimator(sdf):
    """Return the memory size in bytes of PySpark DataFrame.

    It first converts each Python object into a Java object via Pyrolite, whether or not
    the RDD is serialized in batch. Then it uses the Java utility `SizeEstimator` to
    obtain the object size.

    """
    sdf.persist(StorageLevel.MEMORY_ONLY)
    s_rdd = sdf.rdd
    rdd = s_rdd._reserialize(AutoBatchedSerializer(PickleSerializer()))
    java_obj = s_rdd.ctx._jvm.org.apache.spark.mllib.api.python.SerDe.pythonToJava(
        rdd._jrdd, True)
    size = SparkContext._jvm.org.apache.spark.util.SizeEstimator.estimate(
        java_obj)

    return size
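A hedged usage sketch for the estimator above, assuming a local Spark installation and that the imports used by pyspark_SizeEstimator are in scope; the DataFrame contents are arbitrary:

# Hedged sketch: estimate the in-memory footprint of a small DataFrame.
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").appName("size-estimate").getOrCreate()
sdf = spark.createDataFrame([(i, float(i)) for i in range(1000)], ["id", "value"])

print("estimated size in bytes:", pyspark_SizeEstimator(sdf))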
Example #28
class MultibandSchemaTest(BaseTestClass):
    arr = np.array(bytearray([0, 0, 1, 1])).reshape(2, 2)
    no_data = -128
    arr_dict = {'data': arr, 'no_data_value': no_data}
    band_dicts = [arr_dict, arr_dict, arr_dict]

    bands = [arr, arr, arr]
    multiband_tile = np.array(bands)
    multiband_dict = {'data': multiband_tile, 'no_data_value': no_data}

    sc = BaseTestClass.geopysc.pysc._jsc.sc()
    mw = BaseTestClass.geopysc.pysc._jvm.geopyspark.geotrellis.tests.schemas.ArrayMultibandTileWrapper

    tup = mw.testOut(sc)
    java_rdd = tup._1()

    ser = AvroSerializer(tup._2(),
                         AvroRegistry.tile_decoder,
                         AvroRegistry.tile_encoder)

    rdd = RDD(java_rdd, BaseTestClass.geopysc.pysc, AutoBatchedSerializer(ser))
    collected = rdd.collect()

    @pytest.fixture(autouse=True)
    def tearDown(self):
        yield
        BaseTestClass.geopysc.pysc._gateway.close()

    def test_encoded_multibands(self):
        encoded = self.rdd.map(lambda s: AvroRegistry.tile_encoder(s))

        actual_encoded = encoded.collect()[0]
        expected_encoded = AvroRegistry.tile_encoder(self.multiband_dict)

        for actual, expected in zip(actual_encoded['bands'], expected_encoded['bands']):
            self.assertEqual(actual, expected)

    def test_decoded_multibands(self):
        expected_multibands = [
            self.multiband_dict,
            self.multiband_dict,
            self.multiband_dict
        ]

        for actual, expected in zip(self.collected, expected_multibands):
            self.assertTrue((actual['data'] == expected['data']).all())
Example #29
 def train(cls,
           rdd,
           k,
           maxIterations=100,
           runs=1,
           initializationMode="k-means||"):
     """Train a k-means clustering model."""
     sc = rdd.context
     ser = PickleSerializer()
     # cache serialized data to avoid object overhead in the JVM
     cached = rdd.map(_convert_to_vector)._reserialize(
         AutoBatchedSerializer(ser)).cache()
     model = sc._jvm.PythonMLLibAPI().trainKMeansModel(
         _to_java_object_rdd(cached), k, maxIterations, runs,
         initializationMode)
     bytes = sc._jvm.SerDe.dumps(model.clusterCenters())
     centers = ser.loads(str(bytes))
     return KMeansModel([c.toArray() for c in centers])
Example #30
class ExtentSchemaTest(BaseTestClass):
    ew = BaseTestClass.geopysc._jvm.geopyspark.geotrellis.tests.schemas.ExtentWrapper

    tup = ew.testOut(BaseTestClass.geopysc.sc)
    java_rdd = tup._1()
    ser = AvroSerializer(tup._2())

    rdd = RDD(java_rdd, BaseTestClass.geopysc.pysc, AutoBatchedSerializer(ser))
    collected = rdd.collect()

    expected_extents = [{
        "xmin": 0.0,
        "ymin": 0.0,
        "xmax": 1.0,
        "ymax": 1.0
    }, {
        "xmin": 1.0,
        "ymin": 2.0,
        "xmax": 3.0,
        "ymax": 4.0
    }, {
        "xmin": 5.0,
        "ymin": 6.0,
        "xmax": 7.0,
        "ymax": 8.0
    }]

    @pytest.fixture(scope='class', autouse=True)
    def tearDown(self):
        yield
        BaseTestClass.geopysc.pysc._gateway.close()

    def result_checker(self, actual_result, expected_result):
        for actual, expected in zip(actual_result, expected_result):
            self.assertDictEqual(actual, expected)

    def test_encoded_extents(self):
        encoded = self.rdd.map(lambda s: s)
        actual_encoded = encoded.collect()

        self.result_checker(actual_encoded, self.expected_extents)

    def test_decoded_extents(self):
        self.result_checker(self.collected, self.expected_extents)