Code Example #1
File: udf.py Project: mattomatic/flint
def _fn_and_type(udf_column):
    '''Get the python function and sql data type from a spark udf column
    :return: A tuple of (function, dataType)
    '''
    ser = PickleSerializer()
    b = udf_column._jc.expr().func().command()
    return ser.loads(b)
Code Example #2
File: tree.py Project: 312268112/spark
    def predict(self, x):
        """
        Predict the label of one or more examples.

        :param x:  Data point (feature vector),
                   or an RDD of data points (feature vectors).
        """
        SerDe = self._sc._jvm.SerDe
        ser = PickleSerializer()
        if isinstance(x, RDD):
            # Bulk prediction
            first = x.take(1)
            if not first:
                return self._sc.parallelize([])
            if not isinstance(first[0], Vector):
                x = x.map(_convert_to_vector)
            jPred = self._java_model.predict(x._to_java_object_rdd()).toJavaRDD()
            jpyrdd = self._sc._jvm.PythonRDD.javaToPython(jPred)
            return RDD(jpyrdd, self._sc, BatchedSerializer(ser, 1024))

        else:
            # Assume x is a single data point.
            bytes = bytearray(ser.dumps(_convert_to_vector(x)))
            vec = self._sc._jvm.SerDe.loads(bytes)
            return self._java_model.predict(vec)
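For comparison, the same two prediction paths can be exercised through the public MLlib API without touching the serializer directly. A minimal sketch, assuming a running SparkContext named sc and a made-up toy dataset:

from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree

data = sc.parallelize([LabeledPoint(0.0, [0.0]), LabeledPoint(1.0, [1.0])] * 10)
model = DecisionTree.trainClassifier(data, numClasses=2, categoricalFeaturesInfo={})
print(model.predict([1.0]))                                     # single data point
print(model.predict(sc.parallelize([[0.0], [1.0]])).collect())  # bulk RDD path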
Code Example #3
File: test_algorithms.py Project: Brett-A/spark
 def test_als_ratings_serialize(self):
     ser = PickleSerializer()
     r = Rating(7, 1123, 3.14)
     jr = self.sc._jvm.org.apache.spark.mllib.api.python.SerDe.loads(bytearray(ser.dumps(r)))
     nr = ser.loads(bytes(self.sc._jvm.org.apache.spark.mllib.api.python.SerDe.dumps(jr)))
     self.assertEqual(r.user, nr.user)
     self.assertEqual(r.product, nr.product)
     self.assertAlmostEqual(r.rating, nr.rating, 2)
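The Python-only half of that round trip can be checked without a JVM, since Rating pickles cleanly through PickleSerializer on its own. A minimal sketch, assuming only that pyspark is importable:

from pyspark.mllib.recommendation import Rating
from pyspark.serializers import PickleSerializer

ser = PickleSerializer()
r = Rating(7, 1123, 3.14)
assert ser.loads(ser.dumps(r)) == r  # Rating is a namedtuple, so equality holds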
Code Example #4
File: regression.py Project: 312268112/spark
def _regression_train_wrapper(sc, train_func, modelClass, data, initial_weights):
    initial_weights = initial_weights or [0.0] * len(data.first().features)
    ser = PickleSerializer()
    initial_bytes = bytearray(ser.dumps(_convert_to_vector(initial_weights)))
    # use AutoBatchedSerializer before cache to reduce the memory
    # overhead in JVM
    cached = data._reserialize(AutoBatchedSerializer(ser)).cache()
    ans = train_func(cached._to_java_object_rdd(), initial_bytes)
    assert len(ans) == 2, "JVM call result had unexpected length"
    weights = ser.loads(str(ans[0]))
    return modelClass(weights, ans[1])
Code Example #5
File: context.py Project: iAmGhost/spark
 def broadcast(self, value):
     """
     Broadcast a read-only variable to the cluster, returning a
     L{Broadcast<pyspark.broadcast.Broadcast>}
     object for reading it in distributed functions. The variable will be
     sent to each cluster only once.
     """
     pickleSer = PickleSerializer()
     pickled = pickleSer.dumps(value)
     jbroadcast = self._jsc.broadcast(bytearray(pickled))
     return Broadcast(jbroadcast.id(), value, jbroadcast, self._pickled_broadcast_vars)
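In everyday code this mechanism is reached through the public API; the pickling shown above happens behind sc.broadcast. A small usage sketch, assuming a running SparkContext named sc:

lookup = sc.broadcast({"a": 1, "b": 2})   # pickled once, shipped to executors
total = sc.parallelize(["a", "b", "a"]).map(lambda w: lookup.value[w]).sum()
assert total == 4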
Code Example #6
File: clustering.py Project: 312268112/spark
 def train(cls, rdd, k, maxIterations=100, runs=1, initializationMode="k-means||"):
     """Train a k-means clustering model."""
     sc = rdd.context
     ser = PickleSerializer()
     # cache serialized data to avoid object overhead in the JVM
     cached = rdd.map(_convert_to_vector)._reserialize(AutoBatchedSerializer(ser)).cache()
     model = sc._jvm.PythonMLLibAPI().trainKMeansModel(
         cached._to_java_object_rdd(), k, maxIterations, runs, initializationMode)
     bytes = sc._jvm.SerDe.dumps(model.clusterCenters())
     centers = ser.loads(str(bytes))
     return KMeansModel([c.toArray() for c in centers])
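The public entry point for the same flow is pyspark.mllib.clustering.KMeans.train, which performs this serialization internally. A hedged sketch, assuming a running SparkContext named sc (the runs argument is omitted because later Spark releases dropped it):

from pyspark.mllib.clustering import KMeans

points = sc.parallelize([[0.0, 0.0], [1.0, 1.0], [9.0, 8.0], [8.0, 9.0]])
model = KMeans.train(points, k=2, maxIterations=10, initializationMode="random")
print(model.clusterCenters)  # typically two centers near (0.5, 0.5) and (8.5, 8.5)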
Code Example #7
File: feature.py Project: Ludwsam/spark
    def findSynonyms(self, x, num):
        """
        :param x: a word or a vector representation of word
        :param num: number of synonyms to find
        :return: array of (word, cosineSimilarity)

        Find synonyms of a word

        Note: local use only
        """
        # TODO: make findSynonyms usable in RDD operations from python side
        ser = PickleSerializer()
        if type(x) == str:
            jlist = self._java_model.findSynonyms(x, num)
        else:
            bytes = bytearray(ser.dumps(_convert_to_vector(x)))
            vec = self._sc._jvm.SerDe.loads(bytes)
            jlist = self._java_model.findSynonyms(vec, num)
        words, similarity = ser.loads(str(self._sc._jvm.SerDe.dumps(jlist)))
        return zip(words, similarity)
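The public mllib API wraps this plumbing; the sketch below (assuming a running SparkContext sc and an artificial toy corpus) shows findSynonyms end to end:

from pyspark.mllib.feature import Word2Vec

corpus = sc.parallelize([["spark", "is", "fast"], ["spark", "is", "fun"]] * 50)
model = Word2Vec().setVectorSize(10).setMinCount(1).setSeed(42).fit(corpus)
print(list(model.findSynonyms("spark", 2)))  # [(word, cosineSimilarity), ...]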
Code Example #8
File: test_linalg.py Project: apache/spark
 def _test_serialize(self, v):
     ser = PickleSerializer()
     self.assertEqual(v, ser.loads(ser.dumps(v)))
     jvec = self.sc._jvm.org.apache.spark.mllib.api.python.SerDe.loads(bytearray(ser.dumps(v)))
     nv = ser.loads(bytes(self.sc._jvm.org.apache.spark.mllib.api.python.SerDe.dumps(jvec)))
     self.assertEqual(v, nv)
     vs = [v] * 100
     jvecs = self.sc._jvm.org.apache.spark.mllib.api.python.SerDe.loads(bytearray(ser.dumps(vs)))
     nvs = ser.loads(bytes(self.sc._jvm.org.apache.spark.mllib.api.python.SerDe.dumps(jvecs)))
     self.assertEqual(vs, nvs)
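The first assertion in that test (the pure-Python round trip) needs no JVM at all and makes a handy smoke test on its own. A minimal sketch assuming only a local pyspark install:

from pyspark.mllib.linalg import SparseVector
from pyspark.serializers import PickleSerializer

ser = PickleSerializer()
v = SparseVector(4, {1: 1.0, 3: 5.5})
assert v == ser.loads(ser.dumps(v))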
Code Example #9
File: common.py Project: MelissaCP1/ExamenU2
def _py2java(sc, obj):
    """ Convert Python object into Java """
    if isinstance(obj, RDD):
        obj = _to_java_object_rdd(obj)
    elif isinstance(obj, DataFrame):
        obj = obj._jdf
    elif isinstance(obj, SparkContext):
        obj = obj._jsc
    elif isinstance(obj, list):
        obj = ListConverter().convert([_py2java(sc, x) for x in obj],
                                      sc._gateway._gateway_client)
    elif isinstance(obj, JavaObject):
        pass
    elif isinstance(obj, (int, long, float, bool, bytes, unicode)):
        pass
    else:
        data = bytearray(PickleSerializer().dumps(obj))
        obj = sc._jvm.org.apache.spark.ml.python.MLSerDe.loads(data)
    return obj
Code Example #10
    def __init__(self, master=None, appName=None, sparkHome=None, pyFiles=None,
                 environment=None, batchSize=0, serializer=PickleSerializer(), conf=None,
                 gateway=None, jsc=None):
        """
        Create a new SparkContext. At least the master and app name should be set,
        either through the named parameters here or through C{conf}.

        :param master: Cluster URL to connect to
               (e.g. mesos://host:port, spark://host:port, local[4]).
        :param appName: A name for your job, to display on the cluster web UI.
        :param sparkHome: Location where Spark is installed on cluster nodes.
        :param pyFiles: Collection of .zip or .py files to send to the cluster
               and add to PYTHONPATH.  These can be paths on the local file
               system or HDFS, HTTP, HTTPS, or FTP URLs.
        :param environment: A dictionary of environment variables to set on
               worker nodes.
        :param batchSize: The number of Python objects represented as a single
               Java object. Set 1 to disable batching, 0 to automatically choose
               the batch size based on object sizes, or -1 to use an unlimited
               batch size
        :param serializer: The serializer for RDDs.
        :param conf: A L{SparkConf} object setting Spark properties.
        :param gateway: Use an existing gateway and JVM, otherwise a new JVM
               will be instantiated.


        >>> from pyspark.context import SparkContext
        >>> sc = SparkContext('local', 'test')

        >>> sc2 = SparkContext('local', 'test2') # doctest: +IGNORE_EXCEPTION_DETAIL
        Traceback (most recent call last):
            ...
        ValueError:...
        """
        self._callsite = first_spark_call() or CallSite(None, None, None)
        SparkContext._ensure_initialized(self, gateway=gateway)
        try:
            self._do_init(master, appName, sparkHome, pyFiles, environment, batchSize, serializer,
                          conf, jsc)
        except:
            # If an error occurs, clean up in order to allow future SparkContext creation:
            self.stop()
            raise
Code Example #11
def _py2java(sc: SparkContext, obj: Any) -> JavaObject:
    """Convert Python object into Java"""
    if isinstance(obj, RDD):
        obj = _to_java_object_rdd(obj)
    elif isinstance(obj, DataFrame):
        obj = obj._jdf
    elif isinstance(obj, SparkContext):
        obj = obj._jsc  # type: ignore[attr-defined]
    elif isinstance(obj, list):
        obj = [_py2java(sc, x) for x in obj]
    elif isinstance(obj, JavaObject):
        pass
    elif isinstance(obj, (int, float, bool, bytes, str)):
        pass
    else:
        data = bytearray(PickleSerializer().dumps(obj))
        obj = sc._jvm.org.apache.spark.mllib.api.python.SerDe.loads(
            data)  # type: ignore[attr-defined]
    return obj
Code Example #12
    def __init__(self,
                 master=None,
                 appName=None,
                 sparkHome=None,
                 pyFiles=None,
                 environment=None,
                 batchSize=0,
                 serializer=PickleSerializer(),
                 conf=None,
                 gateway=None,
                 jsc=None,
                 profiler_cls=BasicProfiler):

        pyspark.SparkContext.__init__(self,
                                      master=master,
                                      appName=appName,
                                      sparkHome=sparkHome,
                                      pyFiles=pyFiles,
                                      environment=environment,
                                      batchSize=batchSize,
                                      serializer=serializer,
                                      conf=conf,
                                      gateway=gateway,
                                      jsc=jsc,
                                      profiler_cls=profiler_cls)

        config = configparser.ConfigParser()
        config.read(os.path.expanduser("~/.aws/credentials"))

        aws_profile = 'default'  # your AWS profile to use

        access_id = config.get(aws_profile, "aws_access_key_id")
        access_key = config.get(aws_profile, "aws_secret_access_key")

        hadoopConf = self._jsc.hadoopConfiguration()

        hadoopConf.set("fs.s3a.impl",
                       "org.apache.hadoop.fs.s3native.NativeS3FileSystem")
        hadoopConf.set("fs.s3a.awsAccessKeyId", access_id)
        hadoopConf.set("fs.s3a.awsSecretAccessKey", access_key)
        hadoopConf.set("mapreduce.fileoutputcommitter.marksuccessfuljobs",
                       "false")
Code Example #13
    def test_serialize(self):
        from scipy.sparse import lil_matrix

        ser = PickleSerializer()
        lil = lil_matrix((4, 1))
        lil[1, 0] = 1
        lil[3, 0] = 2
        sv = SparseVector(4, {1: 1, 3: 2})
        self.assertEqual(sv, _convert_to_vector(lil))
        self.assertEqual(sv, _convert_to_vector(lil.tocsc()))
        self.assertEqual(sv, _convert_to_vector(lil.tocoo()))
        self.assertEqual(sv, _convert_to_vector(lil.tocsr()))
        self.assertEqual(sv, _convert_to_vector(lil.todok()))

        def serialize(l):
            return ser.loads(ser.dumps(_convert_to_vector(l)))
        self.assertEqual(sv, serialize(lil))
        self.assertEqual(sv, serialize(lil.tocsc()))
        self.assertEqual(sv, serialize(lil.tocsr()))
        self.assertEqual(sv, serialize(lil.todok()))
Code Example #14
File: test_rdd.py Project: zhining-lu/spark
 def test_zip_with_different_serializers(self):
     a = self.sc.parallelize(range(5))
     b = self.sc.parallelize(range(100, 105))
     self.assertEqual(
         a.zip(b).collect(), [(0, 100), (1, 101), (2, 102), (3, 103),
                              (4, 104)])
     a = a._reserialize(BatchedSerializer(PickleSerializer(), 2))
     b = b._reserialize(MarshalSerializer())
     self.assertEqual(
         a.zip(b).collect(), [(0, 100), (1, 101), (2, 102), (3, 103),
                              (4, 104)])
     # regression test for SPARK-4841
     path = os.path.join(SPARK_HOME, "python/test_support/hello/hello.txt")
     t = self.sc.textFile(path)
     cnt = t.count()
     self.assertEqual(cnt, t.zip(t).count())
     rdd = t.map(str)
     self.assertEqual(cnt, t.zip(rdd).count())
     # regression test for bug in _reserializer()
     self.assertEqual(cnt, t.zip(rdd).count())
Code Example #15
def read_udfs(pickleSer, infile):
    num_udfs = read_int(infile)
    udfs = {}
    call_udf = []
    for i in range(num_udfs):
        arg_offsets, udf = read_single_udf(pickleSer, infile)
        udfs['f%d' % i] = udf
        args = ["a[%d]" % o for o in arg_offsets]
        call_udf.append("f%d(%s)" % (i, ", ".join(args)))
    # Create function like this:
    #   lambda a: (f0(a0), f1(a1, a2), f2(a3))
    # In the special case of a single UDF this will return a single result rather
    # than a tuple of results; this is the format that the JVM side expects.
    mapper_str = "lambda a: (%s)" % (", ".join(call_udf))
    mapper = eval(mapper_str, udfs)

    func = lambda _, it: map(mapper, it)
    ser = BatchedSerializer(PickleSerializer(), 100)
    # profiling is not supported for UDF
    return func, None, ser, ser
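The mapper construction is plain string building plus eval against the udfs namespace. A standalone illustration with hypothetical stand-in functions f0 and f1:

udfs = {'f0': lambda x: x + 1, 'f1': lambda x, y: x * y}
call_udf = ["f0(a[0])", "f1(a[1], a[2])"]
mapper = eval("lambda a: (%s)" % ", ".join(call_udf), udfs)
assert mapper((1, 2, 3)) == (2, 6)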
Code Example #16
 def predictAll(self, user_product):
     assert isinstance(user_product,
                       RDD), "user_product should be RDD of (user, product)"
     first = user_product.first()
     if isinstance(first, list):
         user_product = user_product.map(tuple)
         first = tuple(first)
     assert type(first) is tuple and len(first) == 2, \
         "user_product should be RDD of (user, product)"
     if any(isinstance(x, str) for x in first):
         user_product = user_product.map(lambda u_p: (int(u_p[0]), int(u_p[1])))
         first = tuple(map(int, first))
     assert all(
         type(x) is int
         for x in first), "user and product in user_product should be int"
     sc = self._context
     tuplerdd = sc._jvm.SerDe.asTupleRDD(
         user_product._to_java_object_rdd().rdd())
     jresult = self._java_model.predict(tuplerdd).toJavaRDD()
     return RDD(sc._jvm.PythonRDD.javaToPython(jresult), sc,
                AutoBatchedSerializer(PickleSerializer()))
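Driving predictAll through the public API looks like the sketch below; it assumes a running SparkContext sc and an artificial ratings set.

from pyspark.mllib.recommendation import ALS, Rating

ratings = sc.parallelize([Rating(1, 1, 5.0), Rating(1, 2, 1.0),
                          Rating(2, 1, 4.0), Rating(2, 2, 2.0)])
model = ALS.train(ratings, rank=4, iterations=5)
print(model.predictAll(sc.parallelize([(1, 2), (2, 1)])).collect())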
Code Example #17
File: dataframe.py Project: lorenzfischer/spark
    def collect(self):
        """Return a list that contains all of the rows.

        Each object in the list is a Row; the fields can be accessed as
        attributes.

        >>> df.collect()
        [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')]
        """
        with SCCallSiteSync(self._sc) as css:
            bytesInJava = self._jdf.javaToPython().collect().iterator()
        tempFile = NamedTemporaryFile(delete=False, dir=self._sc._temp_dir)
        tempFile.close()
        self._sc._writeToFile(bytesInJava, tempFile.name)
        # Read the data into Python and deserialize it:
        with open(tempFile.name, 'rb') as tempFile:
            rs = list(
                BatchedSerializer(PickleSerializer()).load_stream(tempFile))
        os.unlink(tempFile.name)
        cls = _create_cls(self.schema)
        return [cls(r) for r in rs]
Code Example #18
 def __init__(self, aggregator, memory_limit=512, serializer=None,
              localdirs=None, scale=1, partitions=59, batch=1000):
     Merger.__init__(self, aggregator)
     self.memory_limit = memory_limit
     # default serializer is only used for tests
     self.serializer = serializer or AutoBatchedSerializer(PickleSerializer())
     self.localdirs = localdirs or _get_local_dirs(str(id(self)))
     # number of partitions used when spilling data to disk
     self.partitions = partitions
     # check the memory after # of items merged
     self.batch = batch
     # scale is used to scale down the hash of key for recursive hash map
     self.scale = scale
     # unpartitioned merged data
     self.data = {}
     # partitioned merged data, list of dicts
     self.pdata = []
     # number of chunks dumped into disks
     self.spills = 0
     # randomize the hash of key, id(o) is the address of o (aligned by 8)
     self._seed = id(self) + 7
Code Example #19
    def createStream(ssc, storageLevel=StorageLevel.MEMORY_AND_DISK_SER_2):
        """
        Create an input stream that pulls messages from an Event Hub.

        :param ssc:  StreamingContext object
        :param storageLevel:  RDD storage level.
        :return: A DStream object
        """
        jlevel = ssc._sc._getJavaStorageLevel(storageLevel)

        try:
            helperClass = ssc._jvm.java.lang.Thread.currentThread().getContextClassLoader() \
                .loadClass("com.ge.predix.predixinsights.eventhub.EventHubUtilsPythonHelper")
            helper = helperClass.newInstance()
            jstream = helper.createStream(ssc._jssc, jlevel)
        except Py4JJavaError as e:
            if 'ClassNotFoundException' in str(e.java_exception):
                EHUtils._printErrorMsg(ssc.sparkContext)
            raise e

        return DStream(jstream, ssc, PickleSerializer())
Code Example #20
File: common.py Project: ssattari/BigDL
def _java2py(gateway, r, encoding="bytes"):
    if isinstance(r, JavaObject):
        clsName = r.getClass().getSimpleName()
        # convert RDD into JavaRDD
        if clsName != 'JavaRDD' and clsName.endswith("RDD"):
            r = r.toJavaRDD()
            clsName = 'JavaRDD'

        if clsName == 'JavaRDD':
            jrdd = gateway.jvm.org.apache.spark.bigdl.api.python.BigDLSerDe.javaToPython(
                r)
            return RDD(jrdd, get_spark_context())

        if clsName == 'DataFrame':
            return DataFrame(r, get_spark_sql_context(get_spark_context()))

        if clsName == 'Dataset':
            return DataFrame(r, get_spark_sql_context(get_spark_context()))

        if clsName == "ImageFrame[]":
            return r

        if clsName in _picklable_classes:
            r = gateway.jvm.org.apache.spark.bigdl.api.python.BigDLSerDe.dumps(
                r)
        elif isinstance(r, (JavaArray, JavaList)) and len(r) != 0 \
                and isinstance(r[0], JavaObject) \
                and r[0].getClass().getSimpleName() in ['DataFrame', 'Dataset']:
            spark = get_spark_sql_context(get_spark_context())
            r = list(map(lambda x: DataFrame(x, spark), r))
        elif isinstance(r, (JavaArray, JavaList, JavaMap)):
            try:
                r = gateway.jvm.org.apache.spark.bigdl.api.python.BigDLSerDe.dumps(
                    r)
            except Py4JJavaError:
                pass  # not pickable

        if isinstance(r, (bytearray, bytes)):
            r = PickleSerializer().loads(bytes(r), encoding=encoding)
    return r
Code Example #21
    def __init__(self,
                 keyspace,
                 table,
                 ctx,
                 row_format=None,
                 split_size=None,
                 fetch_size=None,
                 consistency_level=None):
        self.keyspace = keyspace
        self.table = table

        if not row_format:
            row_format = RowFormat.ROW
        elif row_format < 0 or row_format >= len(RowFormat.values):
            raise ValueError("invalid row_format %s" % row_format)

        jvm = ctx._jvm
        ReadConf = jvm.ReadConf

        split_size = split_size or ReadConf.DefaultSplitSize()
        fetch_size = fetch_size or ReadConf.DefaultFetchSize()
        consistency_level = jvm.ConsistencyLevel.values()[consistency_level] \
         if consistency_level else ReadConf.DefaultConsistencyLevel()

        read_conf = ReadConf(
            split_size,
            fetch_size,
            consistency_level,
        )

        row_format = ctx._jvm.RowFormat.values()[row_format]
        reader_factory = ctx._jvm.PickleRowReaderFactory(row_format)
        jrdd = (
            ctx._cjcs
            .cassandraTable(keyspace, table, reader_factory)
            .withReadConf(read_conf)
        )

        super(CassandraRDD, self).__init__(jrdd, ctx, PickleSerializer())
Code Example #22
    def registerFunction(self, name, f, returnType=StringType()):
        """Registers a lambda function as a UDF so it can be used in SQL statements.

        In addition to a name and the function itself, the return type can be optionally specified.
        When the return type is not given it defaults to a string and conversion will automatically
        be done.  For any other return type, the produced object must match the specified type.

        :param name: name of the UDF
        :param f: python function
        :param returnType: a :class:`DataType` object

        >>> sqlContext.registerFunction("stringLengthString", lambda x: len(x))
        >>> sqlContext.sql("SELECT stringLengthString('test')").collect()
        [Row(c0=u'4')]

        >>> from pyspark.sql.types import IntegerType
        >>> sqlContext.registerFunction("stringLengthInt", lambda x: len(x), IntegerType())
        >>> sqlContext.sql("SELECT stringLengthInt('test')").collect()
        [Row(c0=4)]

        >>> from pyspark.sql.types import IntegerType
        >>> sqlContext.udf.register("stringLengthInt", lambda x: len(x), IntegerType())
        >>> sqlContext.sql("SELECT stringLengthInt('test')").collect()
        [Row(c0=4)]
        """
        func = lambda _, it: map(lambda x: f(*x), it)
        ser = AutoBatchedSerializer(PickleSerializer())
        command = (func, None, ser, ser)
        pickled_cmd, bvars, env, includes = _prepare_for_python_RDD(self._sc, command, self)
        self._ssql_ctx.udf().registerPython(name,
                                            bytearray(pickled_cmd),
                                            env,
                                            includes,
                                            self._sc.pythonExec,
                                            self._sc.pythonVer,
                                            bvars,
                                            self._sc._javaAccumulator,
                                            returnType.json())
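On Spark 2.x and later the same registration is normally done through the public spark.udf.register call rather than this internal _prepare_for_python_RDD path; a hedged equivalent:

from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType

spark = SparkSession.builder.master("local[1]").appName("udf-demo").getOrCreate()
spark.udf.register("stringLengthInt", lambda s: len(s), IntegerType())
spark.sql("SELECT stringLengthInt('test')").show()  # shows 4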
Code Example #23
    def __init__(self, jrdd, jrdd_deserializer = BatchedSerializer(PickleSerializer())):
        """
        Constructor
        :param jrdd:               A JavaRDD reference passed from the parent
                                   RDD object
        :param jrdd_deserializer:  The deserializer used in Python workers
                                   created from PythonRDD to execute a
                                   serialized Python function and RDD

        """

        self.name = "EdgeRDD"
        self.jrdd = jrdd
        self.is_cached = False
        self.is_checkpointed = False
        self.ctx = SparkContext._active_spark_context
        self.jedge_rdd_deserializer = jrdd_deserializer
        self.id = jrdd.id()
        self.partitionFunc = None
        self.bypass_serializer = False
        self.preserve_partitioning = False

        self.jedge_rdd = self.getJavaEdgeRDD(jrdd, jrdd_deserializer)
Code Example #24
def _java2py(sc, r):
    if isinstance(r, JavaObject):
        clsName = r.getClass().getSimpleName()
        # convert RDD into JavaRDD
        if clsName != 'JavaRDD' and clsName.endswith("RDD"):
            r = r.toJavaRDD()
            clsName = 'JavaRDD'

        if clsName == 'JavaRDD':
            jrdd = sc._jvm.SerDe.javaToPython(r)
            return RDD(jrdd, sc)

        if clsName in _picklable_classes:
            r = sc._jvm.SerDe.dumps(r)
        elif isinstance(r, (JavaArray, JavaList)):
            try:
                r = sc._jvm.SerDe.dumps(r)
            except Py4JJavaError:
                pass  # not pickable

    if isinstance(r, bytearray):
        r = PickleSerializer().loads(str(r))
    return r
Code Example #25
    def hadoopRDD(self, inputFormatClass, keyClass, valueClass, keyConverter=None,
                  valueConverter=None, conf=None):
        """
        Read an 'old' Hadoop InputFormat with arbitrary key and value class, from an arbitrary
        Hadoop configuration, which is passed in as a Python dict.
        This will be converted into a Configuration in Java.
        The mechanism is the same as for sc.sequenceFile.

        @param inputFormatClass: fully qualified classname of Hadoop InputFormat
               (e.g. "org.apache.hadoop.mapred.TextInputFormat")
        @param keyClass: fully qualified classname of key Writable class
               (e.g. "org.apache.hadoop.io.Text")
        @param valueClass: fully qualified classname of value Writable class
               (e.g. "org.apache.hadoop.io.LongWritable")
        @param keyConverter: (None by default)
        @param valueConverter: (None by default)
        @param conf: Hadoop configuration, passed in as a dict
               (None by default)
        """
        jconf = self._dictToJavaMap(conf)
        jrdd = self._jvm.PythonRDD.hadoopRDD(self._jsc, inputFormatClass, keyClass, valueClass,
                                             keyConverter, valueConverter, jconf)
        return RDD(jrdd, self, PickleSerializer())
Code Example #26
def _mml_py2java(sc, obj):
    """ Convert Python object into Java """
    if isinstance(obj, JavaParams):
        obj._transfer_params_to_java()
        obj = obj._java_obj
    elif isinstance(obj, PipelineModel):
        obj = obj._to_java()
    elif isinstance(obj, RDD):
        obj = _to_java_object_rdd(obj)
    elif isinstance(obj, DataFrame):
        obj = obj._jdf
    elif isinstance(obj, SparkContext):
        obj = obj._jsc
    elif isinstance(obj, list):
        obj = [_mml_py2java(sc, x) for x in obj]
    elif isinstance(obj, JavaObject):
        pass
    elif isinstance(obj, (int, float, bool, bytes, str)):
        pass
    else:
        data = bytearray(PickleSerializer().dumps(obj))
        obj = sc._jvm.org.apache.spark.ml.python.MLSerDe.loads(data)
    return obj
Code Example #27
def read_udfs(pickleSer, infile):
    num_udfs = read_int(infile)
    if num_udfs == 1:
        # fast path for single UDF
        _, udf = read_single_udf(pickleSer, infile)
        mapper = lambda a: udf(*a)
    else:
        udfs = {}
        call_udf = []
        for i in range(num_udfs):
            arg_offsets, udf = read_single_udf(pickleSer, infile)
            udfs['f%d' % i] = udf
            args = ["a[%d]" % o for o in arg_offsets]
            call_udf.append("f%d(%s)" % (i, ", ".join(args)))
        # Create function like this:
        #   lambda a: (f0(a0), f1(a1, a2), f2(a3))
        mapper_str = "lambda a: (%s)" % (", ".join(call_udf))
        mapper = eval(mapper_str, udfs)

    func = lambda _, it: map(mapper, it)
    ser = BatchedSerializer(PickleSerializer(), 100)
    # profiling is not supported for UDF
    return func, None, ser, ser
Code Example #28
def joinWithNeo4jTable(dstream,
                       keyspace,
                       table,
                       selected_columns=None,
                       join_columns=None):
    """Joins a DStream (a stream of RDDs) with a Neo4j table

    Arguments:
        @param dstream(DStream):
            The DStream to join. Equals self when invoking
            joinWithNeo4jTable on a monkey patched RDD.
        @param keyspace(string):
            The keyspace to join on.
        @param table(string):
            The CQL table to join on.
        @param selected_columns(string):
            The columns to select from the Neo4j table.
        @param join_columns(string):
            The columns used to join on from the Neo4j table.
    """

    ssc = dstream._ssc
    ctx = ssc._sc
    gw = ctx._gateway

    selected_columns = as_java_array(
        gw, "String", selected_columns) if selected_columns else None
    join_columns = as_java_array(gw, "String",
                                 join_columns) if join_columns else None

    h = helper(ctx)
    dstream = h.joinWithNeo4jTable(dstream._jdstream, keyspace, table,
                                   selected_columns, join_columns)
    dstream = h.pickleRows(dstream)
    dstream = h.javaDStream(dstream)

    return DStream(dstream, ssc, AutoBatchedSerializer(PickleSerializer()))
Code Example #29
def read_udfs(pickleSer, infile, eval_type):
    runner_conf = {}

    if eval_type in (PythonEvalType.SQL_SCALAR_PANDAS_UDF,
                     PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF,
                     PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF,
                     PythonEvalType.SQL_WINDOW_AGG_PANDAS_UDF):

        # Load conf used for pandas_udf evaluation
        num_conf = read_int(infile)
        for i in range(num_conf):
            k = utf8_deserializer.loads(infile)
            v = utf8_deserializer.loads(infile)
            runner_conf[k] = v

        # NOTE: if timezone is set here, that implies respectSessionTimeZone is True
        timezone = runner_conf.get("spark.sql.session.timeZone", None)
        safecheck = runner_conf.get("spark.sql.execution.pandas.arrowSafeTypeConversion",
                                    "false").lower() == 'true'
        # NOTE: this is duplicated from wrap_grouped_map_pandas_udf
        assign_cols_by_name = runner_conf.get(
            "spark.sql.legacy.execution.pandas.groupedMap.assignColumnsByName", "true")\
            .lower() == "true"

        ser = ArrowStreamPandasSerializer(timezone, safecheck, assign_cols_by_name)
    else:
        ser = BatchedSerializer(PickleSerializer(), 100)

    num_udfs = read_int(infile)
    udfs = {}
    call_udf = []
    mapper_str = ""
    if eval_type == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF:
        # Create function like this:
        #   lambda a: f([a[0]], [a[0], a[1]])

        # We assume there is only one UDF here because grouped map doesn't
        # support combining multiple UDFs.
        assert num_udfs == 1

        # See FlatMapGroupsInPandasExec for how arg_offsets are used to
        # distinguish between grouping attributes and data attributes
        arg_offsets, udf = read_single_udf(
            pickleSer, infile, eval_type, runner_conf, udf_index=0)
        udfs['f'] = udf
        split_offset = arg_offsets[0] + 1
        arg0 = ["a[%d]" % o for o in arg_offsets[1: split_offset]]
        arg1 = ["a[%d]" % o for o in arg_offsets[split_offset:]]
        mapper_str = "lambda a: f([%s], [%s])" % (", ".join(arg0), ", ".join(arg1))
    else:
        # Create function like this:
        #   lambda a: (f0(a[0]), f1(a[1], a[2]), f2(a[3]))
        # In the special case of a single UDF this will return a single result rather
        # than a tuple of results; this is the format that the JVM side expects.
        for i in range(num_udfs):
            arg_offsets, udf = read_single_udf(
                pickleSer, infile, eval_type, runner_conf, udf_index=i)
            udfs['f%d' % i] = udf
            args = ["a[%d]" % o for o in arg_offsets]
            call_udf.append("f%d(%s)" % (i, ", ".join(args)))
        mapper_str = "lambda a: (%s)" % (", ".join(call_udf))

    mapper = eval(mapper_str, udfs)
    func = lambda _, it: map(mapper, it)

    # profiling is not supported for UDF
    return func, None, ser, ser
Code Example #30
from pyspark.broadcast import Broadcast, _broadcastRegistry
from pyspark.java_gateway import local_connect_and_auth
from pyspark.taskcontext import BarrierTaskContext, TaskContext
from pyspark.files import SparkFiles
from pyspark.rdd import PythonEvalType
from pyspark.serializers import write_with_length, write_int, read_long, read_bool, \
    write_long, read_int, SpecialLengths, UTF8Deserializer, PickleSerializer, \
    BatchedSerializer, ArrowStreamPandasSerializer
from pyspark.sql.types import to_arrow_type, StructType
from pyspark.util import _get_argspec, fail_on_stopiteration
from pyspark import shuffle

if sys.version >= '3':
    basestring = str

pickleSer = PickleSerializer()
utf8_deserializer = UTF8Deserializer()


def report_times(outfile, boot, init, finish):
    write_int(SpecialLengths.TIMING_DATA, outfile)
    write_long(int(1000 * boot), outfile)
    write_long(int(1000 * init), outfile)
    write_long(int(1000 * finish), outfile)


def add_path(path):
    # worker can be used, so do not add path multiple times
    if path not in sys.path:
        # overwrite system packages
        sys.path.insert(1, path)
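The framing that report_times writes can be checked locally with an in-memory buffer; this sketch assumes the helpers defined above plus pyspark's read_int/read_long:

import io
import time
from pyspark.serializers import read_int, read_long, SpecialLengths

buf = io.BytesIO()
now = time.time()
report_times(buf, now, now + 0.1, now + 0.2)
buf.seek(0)
assert read_int(buf) == SpecialLengths.TIMING_DATA            # marker written first
boot_ms, init_ms, finish_ms = (read_long(buf) for _ in range(3))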
Code Example #31
File: worker.py Project: XpressAI/spark
def read_udfs(pickleSer, infile, eval_type):
    runner_conf = {}

    if eval_type in (PythonEvalType.SQL_SCALAR_PANDAS_UDF,
                     PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF,
                     PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF,
                     PythonEvalType.SQL_MAP_PANDAS_ITER_UDF,
                     PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF,
                     PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF,
                     PythonEvalType.SQL_WINDOW_AGG_PANDAS_UDF):

        # Load conf used for pandas_udf evaluation
        num_conf = read_int(infile)
        for i in range(num_conf):
            k = utf8_deserializer.loads(infile)
            v = utf8_deserializer.loads(infile)
            runner_conf[k] = v

        # NOTE: if timezone is set here, that implies respectSessionTimeZone is True
        timezone = runner_conf.get("spark.sql.session.timeZone", None)
        safecheck = runner_conf.get(
            "spark.sql.execution.pandas.convertToArrowArraySafely",
            "false").lower() == 'true'
        # Used by SQL_GROUPED_MAP_PANDAS_UDF and SQL_SCALAR_PANDAS_UDF when returning StructType
        assign_cols_by_name = runner_conf.get(
            "spark.sql.legacy.execution.pandas.groupedMap.assignColumnsByName", "true")\
            .lower() == "true"

        if eval_type == PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF:
            ser = CogroupUDFSerializer(timezone, safecheck,
                                       assign_cols_by_name)
        else:
            # Scalar Pandas UDF handles struct type arguments as pandas DataFrames instead of
            # pandas Series. See SPARK-27240.
            df_for_struct = (
                eval_type == PythonEvalType.SQL_SCALAR_PANDAS_UDF
                or eval_type == PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF
                or eval_type == PythonEvalType.SQL_MAP_PANDAS_ITER_UDF)
            ser = ArrowStreamPandasUDFSerializer(timezone, safecheck,
                                                 assign_cols_by_name,
                                                 df_for_struct)
    else:
        ser = BatchedSerializer(PickleSerializer(), 100)

    num_udfs = read_int(infile)

    is_scalar_iter = eval_type == PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF
    is_map_iter = eval_type == PythonEvalType.SQL_MAP_PANDAS_ITER_UDF

    if is_scalar_iter or is_map_iter:
        if is_scalar_iter:
            assert num_udfs == 1, "One SCALAR_ITER UDF expected here."
        if is_map_iter:
            assert num_udfs == 1, "One MAP_ITER UDF expected here."

        arg_offsets, udf = read_single_udf(pickleSer,
                                           infile,
                                           eval_type,
                                           runner_conf,
                                           udf_index=0)

        def func(_, iterator):
            num_input_rows = 0

            def map_batch(batch):
                nonlocal num_input_rows

                udf_args = [batch[offset] for offset in arg_offsets]
                num_input_rows += len(udf_args[0])
                if len(udf_args) == 1:
                    return udf_args[0]
                else:
                    return tuple(udf_args)

            iterator = map(map_batch, iterator)
            result_iter = udf(iterator)

            num_output_rows = 0
            for result_batch, result_type in result_iter:
                num_output_rows += len(result_batch)
                # This assert is for Scalar Iterator UDF to fail fast.
                # The length of the entire input can only be explicitly known
                # by consuming the input iterator in user side. Therefore,
                # it's very unlikely the output length is higher than
                # input length.
                assert is_map_iter or num_output_rows <= num_input_rows, \
                    "Pandas SCALAR_ITER UDF outputted more rows than input rows."
                yield (result_batch, result_type)

            if is_scalar_iter:
                try:
                    next(iterator)
                except StopIteration:
                    pass
                else:
                    raise RuntimeError(
                        "pandas iterator UDF should exhaust the input "
                        "iterator.")

                if num_output_rows != num_input_rows:
                    raise RuntimeError(
                        "The length of output in Scalar iterator pandas UDF should be "
                        "the same with the input's; however, the length of output was %d and the "
                        "length of input was %d." %
                        (num_output_rows, num_input_rows))

        # profiling is not supported for UDF
        return func, None, ser, ser

    def extract_key_value_indexes(grouped_arg_offsets):
        """
        Helper function to extract the key and value indexes from arg_offsets for the grouped and
        cogrouped pandas udfs. See BasePandasGroupExec.resolveArgOffsets for equivalent scala code.

        Parameters
        ----------
        grouped_arg_offsets:  list
            List containing the key and value indexes of columns of the
            DataFrames to be passed to the udf. It consists of n repeating groups where n is the
            number of DataFrames.  Each group has the following format:
                group[0]: length of group
                group[1]: length of key indexes
                group[2 .. group[1]+2]: key attributes
                group[group[1]+3 .. group[0]]: value attributes
        """
        parsed = []
        idx = 0
        while idx < len(grouped_arg_offsets):
            offsets_len = grouped_arg_offsets[idx]
            idx += 1
            offsets = grouped_arg_offsets[idx:idx + offsets_len]
            split_index = offsets[0] + 1
            offset_keys = offsets[1:split_index]
            offset_values = offsets[split_index:]
            parsed.append([offset_keys, offset_values])
            idx += offsets_len
        return parsed

    if eval_type == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF:
        # We assume there is only one UDF here because grouped map doesn't
        # support combining multiple UDFs.
        assert num_udfs == 1

        # See FlatMapGroupsInPandasExec for how arg_offsets are used to
        # distinguish between grouping attributes and data attributes
        arg_offsets, f = read_single_udf(pickleSer,
                                         infile,
                                         eval_type,
                                         runner_conf,
                                         udf_index=0)
        parsed_offsets = extract_key_value_indexes(arg_offsets)

        # Create function like this:
        #   mapper a: f([a[0]], [a[0], a[1]])
        def mapper(a):
            keys = [a[o] for o in parsed_offsets[0][0]]
            vals = [a[o] for o in parsed_offsets[0][1]]
            return f(keys, vals)
    elif eval_type == PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF:
        # We assume there is only one UDF here because cogrouped map doesn't
        # support combining multiple UDFs.
        assert num_udfs == 1
        arg_offsets, f = read_single_udf(pickleSer,
                                         infile,
                                         eval_type,
                                         runner_conf,
                                         udf_index=0)

        parsed_offsets = extract_key_value_indexes(arg_offsets)

        def mapper(a):
            df1_keys = [a[0][o] for o in parsed_offsets[0][0]]
            df1_vals = [a[0][o] for o in parsed_offsets[0][1]]
            df2_keys = [a[1][o] for o in parsed_offsets[1][0]]
            df2_vals = [a[1][o] for o in parsed_offsets[1][1]]
            return f(df1_keys, df1_vals, df2_keys, df2_vals)
    else:
        udfs = []
        for i in range(num_udfs):
            udfs.append(
                read_single_udf(pickleSer,
                                infile,
                                eval_type,
                                runner_conf,
                                udf_index=i))

        def mapper(a):
            result = tuple(
                f(*[a[o] for o in arg_offsets]) for (arg_offsets, f) in udfs)
            # In the special case of a single UDF this will return a single result rather
            # than a tuple of results; this is the format that the JVM side expects.
            if len(result) == 1:
                return result[0]
            else:
                return result

    func = lambda _, it: map(mapper, it)

    # profiling is not supported for UDF
    return func, None, ser, ser
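To make extract_key_value_indexes concrete, here is a worked parse of a hypothetical grouped_arg_offsets value, done inline with the same slicing the helper uses:

# one DataFrame: group length 4, one key index (column 0), value columns 1 and 2
grouped_arg_offsets = [4, 1, 0, 1, 2]
offsets = grouped_arg_offsets[1:1 + grouped_arg_offsets[0]]    # [1, 0, 1, 2]
split_index = offsets[0] + 1                                   # 2
keys, values = offsets[1:split_index], offsets[split_index:]   # [0], [1, 2]
assert [keys, values] == [[0], [1, 2]]  # what the helper appends to parsed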
Code Example #32
    def __init__(self,
                 master=None,
                 appName=None,
                 sparkHome=None,
                 pyFiles=None,
                 environment=None,
                 batchSize=1024,
                 serializer=PickleSerializer(),
                 conf=None,
                 gateway=None,
                 sparkContext=None,
                 duration=None):
        """
        Create a new StreamingContext. At least the master, app name, and duration
        should be set, either through the named parameters here or through C{conf}.

        @param master: Cluster URL to connect to
               (e.g. mesos://host:port, spark://host:port, local[4]).
        @param appName: A name for your job, to display on the cluster web UI.
        @param sparkHome: Location where Spark is installed on cluster nodes.
        @param pyFiles: Collection of .zip or .py files to send to the cluster
               and add to PYTHONPATH.  These can be paths on the local file
               system or HDFS, HTTP, HTTPS, or FTP URLs.
        @param environment: A dictionary of environment variables to set on
               worker nodes.
        @param batchSize: The number of Python objects represented as a single
               Java object.  Set 1 to disable batching or -1 to use an
               unlimited batch size.
        @param serializer: The serializer for RDDs.
        @param conf: A L{SparkConf} object setting Spark properties.
        @param gateway: Use an existing gateway and JVM, otherwise a new JVM
               will be instantiated.
        @param sparkContext: L{SparkContext} object.
        @param duration: A L{Duration} object for SparkStreaming.

        """

        if not isinstance(duration, Duration):
            raise TypeError(
                "Input should be pyspark.streaming.duration.Duration object")

        if sparkContext is None:
            # Create the Python SparkContext
            self._sc = SparkContext(master=master,
                                    appName=appName,
                                    sparkHome=sparkHome,
                                    pyFiles=pyFiles,
                                    environment=environment,
                                    batchSize=batchSize,
                                    serializer=serializer,
                                    conf=conf,
                                    gateway=gateway)
        else:
            self._sc = sparkContext

        # Start py4j callback server.
        # The callback server is needed only by Spark Streaming; therefore the
        # callback server is started in StreamingContext.
        SparkContext._gateway.restart_callback_server()
        self._set_clean_up_handler()
        self._jvm = self._sc._jvm
        self._jssc = self._initialize_context(self._sc._jsc,
                                              duration._jduration)
Code Example #33
File: shuffle.py Project: swisscom-bigdata/spark
def _compressed_serializer(self, serializer=None):
    # always use PickleSerializer to simplify implementation
    ser = PickleSerializer()
    return AutoBatchedSerializer(CompressedSerializer(ser))
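A quick local round trip through the same serializer stack needs no SparkContext; the stream helpers below are part of pyspark.serializers in the releases this snippet targets:

import io
from pyspark.serializers import (AutoBatchedSerializer, CompressedSerializer,
                                 PickleSerializer)

ser = AutoBatchedSerializer(CompressedSerializer(PickleSerializer()))
buf = io.BytesIO()
ser.dump_stream(iter(range(1000)), buf)   # batched, zlib-compressed, pickled
buf.seek(0)
assert list(ser.load_stream(buf)) == list(range(1000))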
Code Example #34
File: test_algorithms.py Project: Brett-A/spark
 def test_als_ratings_id_long_error(self):
     ser = PickleSerializer()
     r = Rating(1205640308657491975, 50233468418, 1.0)
     # rating user id exceeds max int value, should fail when pickled
     self.assertRaises(Py4JJavaError, self.sc._jvm.org.apache.spark.mllib.api.python.SerDe.loads,
                       bytearray(ser.dumps(r)))
Code Example #35
File: context.py Project: mkolod/incubator-spark
    def __init__(self, master, jobName, sparkHome=None, pyFiles=None,
        environment=None, batchSize=1024, serializer=PickleSerializer()):
        """
        Create a new SparkContext.

        @param master: Cluster URL to connect to
               (e.g. mesos://host:port, spark://host:port, local[4]).
        @param jobName: A name for your job, to display on the cluster web UI
        @param sparkHome: Location where Spark is installed on cluster nodes.
        @param pyFiles: Collection of .zip or .py files to send to the cluster
               and add to PYTHONPATH.  These can be paths on the local file
               system or HDFS, HTTP, HTTPS, or FTP URLs.
        @param environment: A dictionary of environment variables to set on
               worker nodes.
        @param batchSize: The number of Python objects represented as a single
               Java object.  Set 1 to disable batching or -1 to use an
               unlimited batch size.
        @param serializer: The serializer for RDDs.


        >>> from pyspark.context import SparkContext
        >>> sc = SparkContext('local', 'test')

        >>> sc2 = SparkContext('local', 'test2') # doctest: +IGNORE_EXCEPTION_DETAIL
        Traceback (most recent call last):
            ...
        ValueError:...
        """
        SparkContext._ensure_initialized(self)

        self.master = master
        self.jobName = jobName
        self.sparkHome = sparkHome or None # None becomes null in Py4J
        self.environment = environment or {}
        self._batchSize = batchSize  # -1 represents an unlimited batch size
        self._unbatched_serializer = serializer
        if batchSize == 1:
            self.serializer = self._unbatched_serializer
        else:
            self.serializer = BatchedSerializer(self._unbatched_serializer,
                                                batchSize)

        # Create the Java SparkContext through Py4J
        empty_string_array = self._gateway.new_array(self._jvm.String, 0)
        self._jsc = self._jvm.JavaSparkContext(master, jobName, sparkHome,
                                              empty_string_array)

        # Create a single Accumulator in Java that we'll send all our updates through;
        # they will be passed back to us through a TCP server
        self._accumulatorServer = accumulators._start_update_server()
        (host, port) = self._accumulatorServer.server_address
        self._javaAccumulator = self._jsc.accumulator(
                self._jvm.java.util.ArrayList(),
                self._jvm.PythonAccumulatorParam(host, port))

        self.pythonExec = os.environ.get("PYSPARK_PYTHON", 'python')
        # Broadcast's __reduce__ method stores Broadcast instances here.
        # This allows other code to determine which Broadcast instances have
        # been pickled, so it can determine which Java broadcast objects to
        # send.
        self._pickled_broadcast_vars = set()

        SparkFiles._sc = self
        root_dir = SparkFiles.getRootDirectory()
        sys.path.append(root_dir)

        # Deploy any code dependencies specified in the constructor
        self._python_includes = list()
        for path in (pyFiles or []):
            self.addPyFile(path)

        # Create a temporary directory inside spark.local.dir:
        local_dir = self._jvm.org.apache.spark.util.Utils.getLocalDir()
        self._temp_dir = \
            self._jvm.org.apache.spark.util.Utils.createTempDir(local_dir).getAbsolutePath()
Code Example #36
File: EntityRecognition4.py Project: felixjff/wdps
# java_path = "C:\Program Files\Java\jre1.8.0_191\bin\java.exe"
# os.environ['JAVAHOME'] = java_path

record_attribute = "WARC-Record-ID"
# Here we use a smaller test file due to computation time. Use the sample.warc.gz for real testing.
in_file = "C:/Users/klm85310/Documents/WDPS/sample.warc.gz"
stanford = 'C:/Users/klm85310/Documents/WDPS/stanford-ner-2017-06-09/stanford-ner-2017-06-09'

# Create Spark Context -- Remove this when running on cluster
# sc = SparkContext.getOrCreate()

conf = SparkConf().setAppName("Entity Recognition").setMaster("local[*]")
sc = SparkContext(
    conf=conf,
    serializer=PickleSerializer(),  # Default serializer
    # Fixed batch size of 64 -> BatchedSerializer instead of AutoBatchedSerializer
    batchSize=64)

st = StanfordNERTagger(stanford +
                       '/classifiers/english.all.3class.distsim.crf.ser.gz',
                       stanford + '/stanford-ner.jar',
                       encoding='utf-8')

rdd_whole_warc_file = rdd = sc.newAPIHadoopFile(
    in_file,
    "org.apache.hadoop.mapreduce.lib.input.TextInputFormat",
    "org.apache.hadoop.io.LongWritable",
    "org.apache.hadoop.io.Text",
    conf={"textinputformat.record.delimiter": "WARC/1.0"})
Code Example #37
    def __init__(self,
                 master=None,
                 appName=None,
                 sparkHome=None,
                 pyFiles=None,
                 environment=None,
                 batchSize=1024,
                 serializer=PickleSerializer(),
                 conf=None,
                 gateway=None):
        """
        Create a new SparkContext. At least the master and app name should be set,
        either through the named parameters here or through C{conf}.

        @param master: Cluster URL to connect to
               (e.g. mesos://host:port, spark://host:port, local[4]).
        @param appName: A name for your job, to display on the cluster web UI.
        @param sparkHome: Location where Spark is installed on cluster nodes.
        @param pyFiles: Collection of .zip or .py files to send to the cluster
               and add to PYTHONPATH.  These can be paths on the local file
               system or HDFS, HTTP, HTTPS, or FTP URLs.
        @param environment: A dictionary of environment variables to set on
               worker nodes.
        @param batchSize: The number of Python objects represented as a single
               Java object.  Set 1 to disable batching or -1 to use an
               unlimited batch size.
        @param serializer: The serializer for RDDs.
        @param conf: A L{SparkConf} object setting Spark properties.
        @param gateway: Use an existing gateway and JVM, otherwise a new JVM
               will be instantiated.


        >>> from pyspark.context import SparkContext
        >>> sc = SparkContext('local', 'test')

        >>> sc2 = SparkContext('local', 'test2') # doctest: +IGNORE_EXCEPTION_DETAIL
        Traceback (most recent call last):
            ...
        ValueError:...
        """
        if rdd._extract_concise_traceback() is not None:
            self._callsite = rdd._extract_concise_traceback()
        else:
            tempNamedTuple = namedtuple("Callsite", "function file linenum")
            self._callsite = tempNamedTuple(function=None,
                                            file=None,
                                            linenum=None)
        SparkContext._ensure_initialized(self, gateway=gateway)

        self.environment = environment or {}
        self._conf = conf or SparkConf(_jvm=self._jvm)
        self._batchSize = batchSize  # -1 represents an unlimited batch size
        self._unbatched_serializer = serializer
        if batchSize == 1:
            self.serializer = self._unbatched_serializer
        else:
            self.serializer = BatchedSerializer(self._unbatched_serializer,
                                                batchSize)

        # Set any parameters passed directly to us on the conf
        if master:
            self._conf.setMaster(master)
        if appName:
            self._conf.setAppName(appName)
        if sparkHome:
            self._conf.setSparkHome(sparkHome)
        if environment:
            for key, value in environment.iteritems():
                self._conf.setExecutorEnv(key, value)

        # Check that we have at least the required parameters
        if not self._conf.contains("spark.master"):
            raise Exception("A master URL must be set in your configuration")
        if not self._conf.contains("spark.app.name"):
            raise Exception(
                "An application name must be set in your configuration")

        # Read back our properties from the conf in case we loaded some of them from
        # the classpath or an external config file
        self.master = self._conf.get("spark.master")
        self.appName = self._conf.get("spark.app.name")
        self.sparkHome = self._conf.get("spark.home", None)
        for (k, v) in self._conf.getAll():
            if k.startswith("spark.executorEnv."):
                varName = k[len("spark.executorEnv."):]
                self.environment[varName] = v

        # Create the Java SparkContext through Py4J
        self._jsc = self._initialize_context(self._conf._jconf)

        # Create a single Accumulator in Java that we'll send all our updates through;
        # they will be passed back to us through a TCP server
        self._accumulatorServer = accumulators._start_update_server()
        (host, port) = self._accumulatorServer.server_address
        self._javaAccumulator = self._jsc.accumulator(
            self._jvm.java.util.ArrayList(),
            self._jvm.PythonAccumulatorParam(host, port))

        self.pythonExec = os.environ.get("PYSPARK_PYTHON", 'python')

        # Broadcast's __reduce__ method stores Broadcast instances here.
        # This allows other code to determine which Broadcast instances have
        # been pickled, so it can determine which Java broadcast objects to
        # send.
        self._pickled_broadcast_vars = set()

        SparkFiles._sc = self
        root_dir = SparkFiles.getRootDirectory()
        sys.path.append(root_dir)

        # Deploy any code dependencies specified in the constructor
        self._python_includes = list()
        for path in (pyFiles or []):
            self.addPyFile(path)

        # Create a temporary directory inside spark.local.dir:
        local_dir = self._jvm.org.apache.spark.util.Utils.getLocalDir(
            self._jsc.sc().conf())
        self._temp_dir = \
            self._jvm.org.apache.spark.util.Utils.createTempDir(local_dir).getAbsolutePath()
Code Example #38
File: context.py Project: Ditto0/Sparks
    def __init__(self,
                 master=None,
                 appName=None,
                 sparkHome=None,
                 pyFiles=None,
                 environment=None,
                 batchSize=0,
                 serializer=PickleSerializer(),
                 conf=None,
                 gateway=None,
                 jsc=None,
                 profiler_cls=BasicProfiler):
        """
        Create a new SparkContext. At least the master and app name should be set,
        either through the named parameters here or through C{conf}.

        :param master: Cluster URL to connect to
               (e.g. mesos://host:port, spark://host:port, local[4]).
        :param appName: A name for your job, to display on the cluster web UI.
        :param sparkHome: Location where Spark is installed on cluster nodes.
        :param pyFiles: Collection of .zip or .py files to send to the cluster
               and add to PYTHONPATH.  These can be paths on the local file
               system or HDFS, HTTP, HTTPS, or FTP URLs.
        :param environment: A dictionary of environment variables to set on
               worker nodes.
        :param batchSize: The number of Python objects represented as a single
               Java object. Set 1 to disable batching, 0 to automatically choose
               the batch size based on object sizes, or -1 to use an unlimited
               batch size
        :param serializer: The serializer for RDDs.
        :param conf: A L{SparkConf} object setting Spark properties.
        :param gateway: Use an existing gateway and JVM, otherwise a new JVM
               will be instantiated.
        :param jsc: The JavaSparkContext instance (optional).
        :param profiler_cls: A class of custom Profiler used to do profiling
               (default is pyspark.profiler.BasicProfiler).


        >>> from pyspark.context import SparkContext
        >>> sc = SparkContext('local', 'test')

        >>> sc2 = SparkContext('local', 'test2') # doctest: +IGNORE_EXCEPTION_DETAIL
        Traceback (most recent call last):
            ...
        ValueError:...
        """
        self._callsite = first_spark_call() or CallSite(None, None, None)
        if gateway is not None and gateway.gateway_parameters.auth_token is None:
            allow_insecure_env = os.environ.get(
                "PYSPARK_ALLOW_INSECURE_GATEWAY", "0")
            if allow_insecure_env == "1" or allow_insecure_env.lower(
            ) == "true":
                warnings.warn(
                    "You are passing in an insecure Py4j gateway.  This "
                    "presents a security risk, and will be completely forbidden in Spark 3.0"
                )
            else:
                raise ValueError(
                    "You are trying to pass an insecure Py4j gateway to Spark. This"
                    " presents a security risk.  If you are sure you understand and accept this"
                    " risk, you can set the environment variable"
                    " 'PYSPARK_ALLOW_INSECURE_GATEWAY=1', but"
                    " note this option will be removed in Spark 3.0")

        SparkContext._ensure_initialized(self, gateway=gateway, conf=conf)
        try:
            self._do_init(master, appName, sparkHome, pyFiles, environment,
                          batchSize, serializer, conf, jsc, profiler_cls)
        except:
            # If an error occurs, clean up in order to allow future SparkContext creation:
            self.stop()
            raise
Code Example #39
File: mllibutils.py Project: zyyjalyt/spark
def make_serializer():
    return PickleSerializer()
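Typical usage of the returned serializer is a simple dumps/loads round trip; a minimal sketch:

ser = make_serializer()
payload = ser.dumps({"a": 1, "b": [2, 3]})
assert ser.loads(payload) == {"a": 1, "b": [2, 3]}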