Example #1
	def __init__(self, loadDefaults=True, _jvm=None):
		super(SparkConf, self).__init__()
		from pyspark.context import SparkContext
		SparkContext._ensure_initialized()
		_jvm = _jvm or SparkContext._jvm
		self._jconf = _jvm.SparkConf(loadDefaults)
Example #2
File: sql.py Project: heyook/spark
def _test():
    import doctest
    from array import array
    from pyspark.context import SparkContext

    globs = globals().copy()
    # The small batch size here ensures that we see multiple batches,
    # even in these small test examples:
    sc = SparkContext("local[4]", "PythonTest", batchSize=2)
    globs["sc"] = sc
    globs["sqlCtx"] = SQLContext(sc)
    globs["rdd"] = sc.parallelize(
        [{"field1": 1, "field2": "row1"}, {"field1": 2, "field2": "row2"}, {"field1": 3, "field2": "row3"}]
    )
    globs["nestedRdd1"] = sc.parallelize(
        [{"f1": array("i", [1, 2]), "f2": {"row1": 1.0}}, {"f1": array("i", [2, 3]), "f2": {"row2": 2.0}}]
    )
    globs["nestedRdd2"] = sc.parallelize(
        [
            {"f1": [[1, 2], [2, 3]], "f2": set([1, 2]), "f3": (1, 2)},
            {"f1": [[2, 3], [3, 4]], "f2": set([2, 3]), "f3": (2, 3)},
        ]
    )
    (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
    globs["sc"].stop()
    if failure_count:
        exit(-1)
Example #3
def _test():
    import doctest
    import os
    import tempfile
    import py4j
    from pyspark.context import SparkContext
    from pyspark.sql import SparkSession, Row
    import pyspark.sql.readwriter

    os.chdir(os.environ["SPARK_HOME"])

    globs = pyspark.sql.readwriter.__dict__.copy()
    sc = SparkContext("local[4]", "PythonTest")
    try:
        spark = SparkSession.builder.enableHiveSupport().getOrCreate()
    except py4j.protocol.Py4JError:
        spark = SparkSession(sc)

    globs["tempfile"] = tempfile
    globs["os"] = os
    globs["sc"] = sc
    globs["spark"] = spark
    globs["df"] = spark.read.parquet("python/test_support/sql/parquet_partitioned")
    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.readwriter,
        globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF,
    )
    sc.stop()
    if failure_count:
        exit(-1)
Example #4
File: sql.py Project: 7472741/spark
def _test():
    import doctest
    from array import array
    from pyspark.context import SparkContext
    globs = globals().copy()
    # The small batch size here ensures that we see multiple batches,
    # even in these small test examples:
    sc = SparkContext('local[4]', 'PythonTest', batchSize=2)
    globs['sc'] = sc
    globs['sqlCtx'] = SQLContext(sc)
    globs['rdd'] = sc.parallelize([{"field1" : 1, "field2" : "row1"},
        {"field1" : 2, "field2": "row2"}, {"field1" : 3, "field2": "row3"}])
    jsonStrings = ['{"field1": 1, "field2": "row1", "field3":{"field4":11}}',
       '{"field1" : 2, "field2": "row2", "field3":{"field4":22}}',
       '{"field1" : 3, "field2": "row3", "field3":{"field4":33}}']
    globs['jsonStrings'] = jsonStrings
    globs['json'] = sc.parallelize(jsonStrings)
    globs['nestedRdd1'] = sc.parallelize([
        {"f1" : array('i', [1, 2]), "f2" : {"row1" : 1.0}},
        {"f1" : array('i', [2, 3]), "f2" : {"row2" : 2.0}}])
    globs['nestedRdd2'] = sc.parallelize([
        {"f1" : [[1, 2], [2, 3]], "f2" : set([1, 2]), "f3" : (1, 2)},
        {"f1" : [[2, 3], [3, 4]], "f2" : set([2, 3]), "f3" : (2, 3)}])
    (failure_count, test_count) = doctest.testmod(globs=globs,optionflags=doctest.ELLIPSIS)
    globs['sc'].stop()
    if failure_count:
        exit(-1)
Example #5
    def _ensure_initialized(cls):
        SparkContext._ensure_initialized()
        gw = SparkContext._gateway

        java_import(gw.jvm, "org.apache.spark.streaming.*")
        java_import(gw.jvm, "org.apache.spark.streaming.api.java.*")
        java_import(gw.jvm, "org.apache.spark.streaming.api.python.*")

        # start callback server
        # getattr will fallback to JVM, so we cannot test by hasattr()
        if "_callback_server" not in gw.__dict__ or gw._callback_server is None:
            gw.callback_server_parameters.eager_load = True
            gw.callback_server_parameters.daemonize = True
            gw.callback_server_parameters.daemonize_connections = True
            gw.callback_server_parameters.port = 0
            gw.start_callback_server(gw.callback_server_parameters)
            cbport = gw._callback_server.server_socket.getsockname()[1]
            gw._callback_server.port = cbport
            # gateway with real port
            gw._python_proxy_port = gw._callback_server.port
            # get the GatewayServer object in JVM by ID
            jgws = JavaObject("GATEWAY_SERVER", gw._gateway_client)
            # update the port of CallbackClient with real port
            jgws.resetCallbackClient(jgws.getCallbackClient().getAddress(), gw._python_proxy_port)

        # register serializer for TransformFunction
        # it happens before creating SparkContext when loading from checkpointing
        cls._transformerSerializer = TransformFunctionSerializer(
            SparkContext._active_spark_context, CloudPickleSerializer(), gw)
Example #6
def _test():
    import doctest
    from pyspark.context import SparkContext
    from pyspark.sql import Row, SQLContext
    import pyspark.sql.dataframe

    globs = pyspark.sql.dataframe.__dict__.copy()
    sc = SparkContext("local[4]", "PythonTest")
    globs["sc"] = sc
    globs["sqlContext"] = SQLContext(sc)
    globs["df"] = sc.parallelize([(2, "Alice"), (5, "Bob")]).toDF(
        StructType([StructField("age", IntegerType()), StructField("name", StringType())])
    )
    globs["df2"] = sc.parallelize([Row(name="Tom", height=80), Row(name="Bob", height=85)]).toDF()
    globs["df4"] = sc.parallelize(
        [
            Row(name="Alice", age=10, height=80),
            Row(name="Bob", age=5, height=None),
            Row(name="Tom", age=None, height=None),
            Row(name=None, age=None, height=None),
        ]
    ).toDF()

    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.dataframe,
        globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF,
    )
    globs["sc"].stop()
    if failure_count:
        exit(-1)
Example #7
def _test():
    import doctest
    from pyspark.context import SparkContext
    from pyspark.sql import Row, SQLContext
    import pyspark.sql.readwriter
    globs = pyspark.sql.readwriter.__dict__.copy()
    sc = SparkContext('local[4]', 'PythonTest')
    globs['sc'] = sc
    globs['sqlContext'] = SQLContext(sc)
    globs['df'] = sc.parallelize([(2, 'Alice'), (5, 'Bob')]) \
        .toDF(StructType([StructField('age', IntegerType()),
                          StructField('name', StringType())]))
    jsonStrings = [
        '{"field1": 1, "field2": "row1", "field3":{"field4":11}}',
        '{"field1" : 2, "field3":{"field4":22, "field5": [10, 11]},'
        '"field6":[{"field7": "row2"}]}',
        '{"field1" : null, "field2": "row3", '
        '"field3":{"field4":33, "field5": []}}'
    ]
    globs['jsonStrings'] = jsonStrings
    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.readwriter, globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF)
    globs['sc'].stop()
    if failure_count:
        exit(-1)
Example #8
def _test():
    import doctest
    import os
    import tempfile
    import py4j
    from pyspark.context import SparkContext
    from pyspark.sql import SparkSession, Row
    import pyspark.sql.readwriter

    os.chdir(os.environ["SPARK_HOME"])

    globs = pyspark.sql.readwriter.__dict__.copy()
    sc = SparkContext('local[4]', 'PythonTest')
    try:
        spark = SparkSession.builder.enableHiveSupport().getOrCreate()
    except py4j.protocol.Py4JError:
        spark = SparkSession(sc)

    globs['tempfile'] = tempfile
    globs['os'] = os
    globs['sc'] = sc
    globs['spark'] = spark
    globs['df'] = spark.read.parquet('python/test_support/sql/parquet_partitioned')
    globs['sdf'] = \
        spark.read.format('text').stream('python/test_support/sql/streaming')

    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.readwriter, globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF)
    sc.stop()
    if failure_count:
        exit(-1)
Example #9
def _test():
    import os
    import doctest
    from pyspark.context import SparkContext
    from pyspark.sql import Row, SQLContext
    import pyspark.sql.context

    os.chdir(os.environ["SPARK_HOME"])

    globs = pyspark.sql.context.__dict__.copy()
    sc = SparkContext("local[4]", "PythonTest")
    globs["sc"] = sc
    globs["sqlContext"] = SQLContext(sc)
    globs["rdd"] = rdd = sc.parallelize(
        [Row(field1=1, field2="row1"), Row(field1=2, field2="row2"), Row(field1=3, field2="row3")]
    )
    globs["df"] = rdd.toDF()
    jsonStrings = [
        '{"field1": 1, "field2": "row1", "field3":{"field4":11}}',
        '{"field1" : 2, "field3":{"field4":22, "field5": [10, 11]},' '"field6":[{"field7": "row2"}]}',
        '{"field1" : null, "field2": "row3", ' '"field3":{"field4":33, "field5": []}}',
    ]
    globs["jsonStrings"] = jsonStrings
    globs["json"] = sc.parallelize(jsonStrings)
    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.context, globs=globs, optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE
    )
    globs["sc"].stop()
    if failure_count:
        exit(-1)
Example #10
def _test():
    import doctest
    from pyspark.context import SparkContext
    from pyspark.sql import Row, SQLContext
    import pyspark.sql.group
    globs = pyspark.sql.group.__dict__.copy()
    sc = SparkContext('local[4]', 'PythonTest')
    globs['sc'] = sc
    globs['sqlContext'] = SQLContext(sc)
    globs['df'] = sc.parallelize([(2, 'Alice'), (5, 'Bob')]) \
        .toDF(StructType([StructField('age', IntegerType()),
                          StructField('name', StringType())]))
    globs['df3'] = sc.parallelize([Row(name='Alice', age=2, height=80),
                                   Row(name='Bob', age=5, height=85)]).toDF()
    globs['df4'] = sc.parallelize([Row(course="dotNET", year=2012, earnings=10000),
                                   Row(course="Java",   year=2012, earnings=20000),
                                   Row(course="dotNET", year=2012, earnings=5000),
                                   Row(course="dotNET", year=2013, earnings=48000),
                                   Row(course="Java",   year=2013, earnings=30000)]).toDF()

    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.group, globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF)
    globs['sc'].stop()
    if failure_count:
        exit(-1)
Example #11
def _test():
    import doctest
    from pyspark.context import SparkContext
    from pyspark.sql import Row, SQLContext
    import pyspark.sql.dataframe
    globs = pyspark.sql.dataframe.__dict__.copy()
    sc = SparkContext('local[4]', 'PythonTest')
    globs['sc'] = sc
    globs['sqlContext'] = SQLContext(sc)
    globs['df'] = sc.parallelize([(2, 'Alice'), (5, 'Bob')])\
        .toDF(StructType([StructField('age', IntegerType()),
                          StructField('name', StringType())]))
    globs['df2'] = sc.parallelize([Row(name='Tom', height=80), Row(name='Bob', height=85)]).toDF()
    globs['df3'] = sc.parallelize([Row(name='Alice', age=2),
                                   Row(name='Bob', age=5)]).toDF()
    globs['df4'] = sc.parallelize([Row(name='Alice', age=10, height=80),
                                  Row(name='Bob', age=5, height=None),
                                  Row(name='Tom', age=None, height=None),
                                  Row(name=None, age=None, height=None)]).toDF()

    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.dataframe, globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF)
    globs['sc'].stop()
    if failure_count:
        exit(-1)
Example #12
class SparkTestingBaseTestCase(unittest2.TestCase):

    """Basic common test case for Spark. Provides a Spark context as sc.
    For non local mode testing you can either override sparkMaster
    or set the enviroment property SPARK_MASTER for non-local mode testing."""

    @classmethod
    def getMaster(cls):
        return os.getenv('SPARK_MASTER', "local[4]")

    def setUp(self):
        """Setup a basic Spark context for testing"""
        self.sc = SparkContext(self.getMaster())
        self.sql_context = HiveContext(self.sc)
        quiet_py4j()

    def tearDown(self):
        """
        Tear down the basic panda spark test case. This stops the running
        context and does a hack to prevent Akka rebinding on the same port.
        """
        self.sc.stop()
        # To avoid Akka rebinding to the same port, since it doesn't unbind
        # immediately on shutdown
        self.sc._jvm.System.clearProperty("spark.driver.port")
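A hedged usage sketch of the base class above (test class and test names are hypothetical): a concrete test simply subclasses it and uses the ready-made self.sc and self.sql_context.

class WordCountTest(SparkTestingBaseTestCase):
    # Hypothetical test reusing the base class above.
    def test_word_pairs(self):
        rdd = self.sc.parallelize(["a", "b", "a"])
        counts = rdd.map(lambda w: (w, 1)).reduceByKey(lambda x, y: x + y).collectAsMap()
        self.assertEqual(counts["a"], 2)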
Example #13
def _test():
    import doctest
    from pyspark.context import SparkContext
    from pyspark.sql import Row, SQLContext
    import pyspark.sql.context
    globs = pyspark.sql.context.__dict__.copy()
    sc = SparkContext('local[4]', 'PythonTest')
    globs['sc'] = sc
    globs['sqlContext'] = SQLContext(sc)
    globs['rdd'] = rdd = sc.parallelize(
        [Row(field1=1, field2="row1"),
         Row(field1=2, field2="row2"),
         Row(field1=3, field2="row3")]
    )
    globs['df'] = rdd.toDF()
    jsonStrings = [
        '{"field1": 1, "field2": "row1", "field3":{"field4":11}}',
        '{"field1" : 2, "field3":{"field4":22, "field5": [10, 11]},'
        '"field6":[{"field7": "row2"}]}',
        '{"field1" : null, "field2": "row3", '
        '"field3":{"field4":33, "field5": []}}'
    ]
    globs['jsonStrings'] = jsonStrings
    globs['json'] = sc.parallelize(jsonStrings)
    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.context, globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE)
    globs['sc'].stop()
    if failure_count:
        exit(-1)
Example #14
class PyVertexRDDTestCase(unittest.TestCase):
    """
    Test collect, take, count, mapValues, diff,
    filter, mapVertexPartitions, innerJoin and leftJoin
    for VertexRDD
    """

    def setUp(self):
        class_name = self.__class__.__name__
        conf = SparkConf().set("spark.default.parallelism", 1)
        self.sc = SparkContext(appName=class_name, conf=conf)
        self.sc.setCheckpointDir("/tmp")

    def tearDown(self):
        self.sc.stop()

    def collect(self):
        vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertices = VertexRDD(vertexData)
        results = vertices.take(1)
        self.assertEqual(results, [(3, ("rxin", "student"))])

    def take(self):
        vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertices = VertexRDD(vertexData)
        results = vertices.collect()
        self.assertEqual(results, [(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])

    def count(self):
        vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertices = VertexRDD(vertexData)
        results = vertices.count()
        self.assertEqual(results, 2)

    def mapValues(self):
        vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertices = VertexRDD(vertexData)
        results = vertices.mapValues(lambda x: x + ":" + x)
        self.assertEqual(results, [(3, ("rxin:rxin", "student:student")),
                                   (7, ("jgonzal:jgonzal", "postdoc:postdoc"))])

    def innerJoin(self):
        vertexData0 = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertexData1 = self.sc.parallelize([(1, ("rxin", "student")), (2, ("jgonzal", "postdoc"))])
        vertices0 = VertexRDD(vertexData0)
        vertices1 = VertexRDD(vertexData1)
        results = vertices0.innerJoin(vertices1).collect()
        self.assertEqual(results, [])

    def leftJoin(self):
        vertexData0 = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertexData1 = self.sc.parallelize([(1, ("rxin", "student")), (2, ("jgonzal", "postdoc"))])
        vertices0 = VertexRDD(vertexData0)
        vertices1 = VertexRDD(vertexData1)
        results = vertices0.diff(vertices1)
        self.assertEqual(results, 2)
Example #15
class PySparkTestCase(unittest.TestCase):

    def setUp(self):
        self._old_sys_path = list(sys.path)
        class_name = self.__class__.__name__
        self.sc = SparkContext('local[4]', class_name, batchSize=2)

    def tearDown(self):
        self.sc.stop()
        sys.path = self._old_sys_path
Example #16
class PySparkTestCase(unittest.TestCase):
    def setUp(self):
        self._old_sys_path = list(sys.path)
        class_name = self.__class__.__name__
        self.sc = SparkContext("local[4]", class_name, batchSize=2)

    def tearDown(self):
        self.sc.stop()
        sys.path = self._old_sys_path
        # To avoid Akka rebinding to the same port, since it doesn't unbind
        # immediately on shutdown
        self.sc._jvm.System.clearProperty("spark.driver.port")
Example #17
    def __init__(self, loadDefaults=True, _jvm=None):
        """
        Create a new Spark configuration.

        @param loadDefaults: whether to load values from Java system
               properties (True by default)
        @param _jvm: internal parameter used to pass a handle to the
               Java VM; does not need to be set by users
        """
        from pyspark.context import SparkContext
        SparkContext._ensure_initialized()
        _jvm = _jvm or SparkContext._jvm
        self._jconf = _jvm.SparkConf(loadDefaults)
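For context, a minimal sketch of how a SparkConf built this way is typically used (assuming a local PySpark installation; the master, app name and option are illustrative):

from pyspark import SparkConf, SparkContext

# Build a configuration; loadDefaults=True also picks up Java system properties.
conf = SparkConf(loadDefaults=True) \
    .setMaster("local[2]") \
    .setAppName("conf-demo") \
    .set("spark.ui.showConsoleProgress", "false")

sc = SparkContext(conf=conf)  # the conf is copied; later changes do not affect the running context
print(sc.getConf().get("spark.app.name"))
sc.stop()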
Example #18
    def __init__(self, millis, _jvm=None):
        """
        Create new Duration.

        @param millis: milliseconds

        """
        self._millis = millis

        from pyspark.context import SparkContext
        SparkContext._ensure_initialized()
        _jvm = _jvm or SparkContext._jvm
        self._jduration = _jvm.Duration(millis)
Example #19
class PySparkTestCase(unittest.TestCase):
    def setUp(self):
        class_name = self.__class__.__name__
        self.sc = SparkContext('local', class_name)
        self.sc._jvm.System.setProperty("spark.ui.showConsoleProgress", "false")
        log4j = self.sc._jvm.org.apache.log4j
        log4j.LogManager.getRootLogger().setLevel(log4j.Level.FATAL)

    def tearDown(self):
        self.sc.stop()
        # To avoid Akka rebinding to the same port, since it doesn't unbind
        # immediately on shutdown
        self.sc._jvm.System.clearProperty("spark.driver.port")
Example #20
class PySparkTestCase(unittest.TestCase):
    def setUp(self):
        class_name = self.__class__.__name__
        self.sc = SparkContext('local', class_name)

    def tearDown(self):
        self.sc.stop()

    def test_should_be_able_to_word_count(self):
        rdd = self.sc.parallelize(["This is a text", "Another text", "More text", "a text"])
        result = python_word_count.wordcount(rdd)
        expected = [('a', 2), ('This', 1), ('text', 4), ('is', 1), ('Another', 1), ('More', 1)]
        self.assertEquals(expected, result.collect())
Example #21
    def _ensure_initialized(cls):
        SparkContext._ensure_initialized()
        gw = SparkContext._gateway

        java_import(gw.jvm, "org.apache.spark.streaming.*")
        java_import(gw.jvm, "org.apache.spark.streaming.api.java.*")
        java_import(gw.jvm, "org.apache.spark.streaming.api.python.*")

        # start callback server
        # getattr will fallback to JVM, so we cannot test by hasattr()
        if "_callback_server" not in gw.__dict__ or gw._callback_server is None:
            gw.callback_server_parameters.eager_load = True
            gw.callback_server_parameters.daemonize = True
            gw.callback_server_parameters.daemonize_connections = True
            gw.callback_server_parameters.port = 0
            gw.start_callback_server(gw.callback_server_parameters)
            cbport = gw._callback_server.server_socket.getsockname()[1]
            gw._callback_server.port = cbport
            # gateway with real port
            gw._python_proxy_port = gw._callback_server.port
            # get the GatewayServer object in JVM by ID
            jgws = JavaObject("GATEWAY_SERVER", gw._gateway_client)
            # update the port of CallbackClient with real port
            gw.jvm.PythonDStream.updatePythonGatewayPort(jgws, gw._python_proxy_port)
            _py4j_cleaner = Py4jCallbackConnectionCleaner(gw)
            _py4j_cleaner.start()

        # register serializer for TransformFunction
        # it happens before creating SparkContext when loading from checkpointing
        if cls._transformerSerializer is None:
            transformer_serializer = TransformFunctionSerializer()
            transformer_serializer.init(
                SparkContext._active_spark_context, CloudPickleSerializer(), gw)
            # SPARK-12511 streaming driver with checkpointing unable to finalize leading to OOM
            # There is an issue that Py4J's PythonProxyHandler.finalize blocks forever.
            # (https://github.com/bartdag/py4j/pull/184)
            #
            # Py4j will create a PythonProxyHandler in Java for "transformer_serializer" when
            # calling "registerSerializer". If we call "registerSerializer" twice, the second
            # PythonProxyHandler will override the first one, then the first one will be GCed and
            # trigger "PythonProxyHandler.finalize". To avoid that, we should not call
            # "registerSerializer" more than once, so that "PythonProxyHandler" in Java side won't
            # be GCed.
            #
            # TODO Once Py4J fixes this issue, we should upgrade Py4j to the latest version.
            transformer_serializer.gateway.jvm.PythonDStream.registerSerializer(
                transformer_serializer)
            cls._transformerSerializer = transformer_serializer
        else:
            cls._transformerSerializer.init(
                SparkContext._active_spark_context, CloudPickleSerializer(), gw)
Example #22
class PyEdgeRDDTestCase(unittest.TestCase):
    """
    Test collect, take, count, mapValues,
    filter and innerJoin for EdgeRDD
    """

    def setUp(self):
        class_name = self.__class__.__name__
        conf = SparkConf().set("spark.default.parallelism", 1)
        self.sc = SparkContext(appName=class_name, conf=conf)
        self.sc.setCheckpointDir("/tmp")

    def tearDown(self):
        self.sc.stop()

    # TODO
    def collect(self):
        vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertices = VertexRDD(vertexData)
        results = vertices.collect()
        self.assertEqual(results, [(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])

    # TODO
    def take(self):
        vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertices = VertexRDD(vertexData)
        results = vertices.collect()
        self.assertEqual(results, [(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])

    # TODO
    def count(self):
        vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertices = VertexRDD(vertexData)
        results = vertices.collect()
        self.assertEqual(results, 2)

    # TODO
    def mapValues(self):
        vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertices = VertexRDD(vertexData)
        results = vertices.collect()
        self.assertEqual(results, 2)

    # TODO
    def filter(self):
        return

    # TODO
    def innerJoin(self):
        vertexData0 = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertexData1 = self.sc.parallelize([(1, ("rxin", "student")), (2, ("jgonzal", "postdoc"))])
        vertices0 = VertexRDD(vertexData0)
        vertices1 = VertexRDD(vertexData1)
        results = vertices0.diff(vertices1)
        self.assertEqual(results, 2)
Example #23
def _test():
    import doctest
    from pyspark.context import SparkContext
    globs = globals().copy()
    # The small batch size here ensures that we see multiple batches,
    # even in these small test examples:
    sc = SparkContext('local[4]', 'PythonTest', batchSize=2)
    globs['sc'] = sc
    globs['sqlCtx'] = SQLContext(sc)
    globs['rdd'] = sc.parallelize([{"field1" : 1, "field2" : "row1"},
        {"field1" : 2, "field2": "row2"}, {"field1" : 3, "field2": "row3"}])
    (failure_count, test_count) = doctest.testmod(globs=globs,optionflags=doctest.ELLIPSIS)
    globs['sc'].stop()
    if failure_count:
        exit(-1)
Example #24
    def _ensure_initialized(cls):
        SparkContext._ensure_initialized()
        gw = SparkContext._gateway

        java_import(gw.jvm, "org.apache.spark.streaming.*")
        java_import(gw.jvm, "org.apache.spark.streaming.api.java.*")
        java_import(gw.jvm, "org.apache.spark.streaming.api.python.*")

        from pyspark.java_gateway import ensure_callback_server_started
        ensure_callback_server_started(gw)

        # register serializer for TransformFunction
        # it happens before creating SparkContext when loading from checkpointing
        cls._transformerSerializer = TransformFunctionSerializer(
            SparkContext._active_spark_context, CloudPickleSerializer(), gw)
Example #25
class SparkTestingBaseTestCase(unittest2.TestCase):

    """Basic common test case for Spark. Provides a Spark context as sc.
    For non local mode testing you can either override sparkMaster
    or set the enviroment property SPARK_MASTER for non-local mode testing."""

    @classmethod
    def getMaster(cls):
        return os.getenv('SPARK_MASTER', "local[4]")

    def setUp(self):
        """Setup a basic Spark context for testing"""
        self.sc = SparkContext(self.getMaster())
        quiet_py4j()

    def tearDown(self):
        """
        Tear down the basic panda spark test case. This stops the running
        context and does a hack to prevent Akka rebinding on the same port.
        """
        self.sc.stop()
        # To avoid Akka rebinding to the same port, since it doesn't unbind
        # immediately on shutdown
        self.sc._jvm.System.clearProperty("spark.driver.port")

    def assertRDDEquals(self, expected, result):
        return self.compareRDD(expected, result) == []

    def compareRDD(self, expected, result):
        expectedKeyed = expected.map(lambda x: (x, 1))\
                                .reduceByKey(lambda x, y: x + y)
        resultKeyed = result.map(lambda x: (x, 1))\
                            .reduceByKey(lambda x, y: x + y)
        return expectedKeyed.cogroup(resultKeyed)\
                            .map(lambda x: tuple(map(list, x[1])))\
                            .filter(lambda x: x[0] != x[1]).take(1)

    def assertRDDEqualsWithOrder(self, expected, result):
        return self.compareRDDWithOrder(expected, result) == []

    def compareRDDWithOrder(self, expected, result):
        def indexRDD(rdd):
            return rdd.zipWithIndex().map(lambda x: (x[1], x[0]))
        indexExpected = indexRDD(expected)
        indexResult = indexRDD(result)
        return indexExpected.cogroup(indexResult)\
                            .map(lambda x: tuple(map(list, x[1])))\
                            .filter(lambda x: x[0] != x[1]).take(1)
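A hedged sketch of how the comparison helpers above might be used (class and test names are hypothetical). Note that assertRDDEquals as written returns a boolean rather than raising, so the test wraps it in assertTrue:

class RDDComparisonTest(SparkTestingBaseTestCase):
    def test_same_elements_ignoring_order(self):
        expected = self.sc.parallelize([1, 2, 3])
        result = self.sc.parallelize([3, 2, 1])
        # Order-insensitive comparison via the cogroup-based helper.
        self.assertTrue(self.assertRDDEquals(expected, result))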
Example #26
        def getOrCreate(self):
            """Gets an existing :class:`SparkSession` or, if there is no existing one, creates a
            new one based on the options set in this builder.

            This method first checks whether there is a valid global default SparkSession, and if
            so, returns that one. If no valid global default SparkSession exists, the method
            creates a new SparkSession and assigns it as the global default.

            >>> s1 = SparkSession.builder.config("k1", "v1").getOrCreate()
            >>> s1.conf.get("k1") == "v1"
            True

            In case an existing SparkSession is returned, the config options specified
            in this builder will be applied to the existing SparkSession.

            >>> s2 = SparkSession.builder.config("k2", "v2").getOrCreate()
            >>> s1.conf.get("k1") == s2.conf.get("k1")
            True
            >>> s1.conf.get("k2") == s2.conf.get("k2")
            True
            """
            with self._lock:
                from pyspark.context import SparkContext
                from pyspark.conf import SparkConf
                session = SparkSession._instantiatedContext
                if session is None:
                    sparkConf = SparkConf()
                    for key, value in self._options.items():
                        sparkConf.set(key, value)
                    sc = SparkContext.getOrCreate(sparkConf)
                    session = SparkSession(sc)
                for key, value in self._options.items():
                    session.conf.set(key, value)
                return session
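A short sketch of the reuse behaviour described in the docstring (assuming a local Spark install; the config key is chosen only for illustration):

from pyspark.sql import SparkSession

first = SparkSession.builder.appName("demo").getOrCreate()
again = SparkSession.builder.config("spark.sql.shuffle.partitions", "4").getOrCreate()

assert first is again                                         # the existing session is returned
assert again.conf.get("spark.sql.shuffle.partitions") == "4"  # and the new option is applied to it
first.stop()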
Example #27
 def test_stop_only_streaming_context(self):
     self.sc = SparkContext(master=self.master, appName=self.appName)
     self.ssc = StreamingContext(sparkContext=self.sc, duration=self.batachDuration)
     self._addInputStream(self.ssc)
     self.ssc.start()
     self.ssc.stop(False)
     self.assertEqual(len(self.sc.parallelize(range(5), 5).glom().collect()), 5)
Example #28
  def __init__(self):
    # Setup PySpark. This is needed until PySpark becomes available on PyPI,
    # after which we can simply add it to requirements.txt.
    _setup_pyspark()
    from pyspark.conf import SparkConf
    from pyspark.context import SparkContext
    from pyspark.serializers import MarshalSerializer

    # Create a temporary .zip lib file for Metis, which will be copied over to
    # Spark workers so they can unpickle Metis functions and objects.
    metis_lib_file = tempfile.NamedTemporaryFile(suffix='.zip', delete=False)
    metis_lib_file.close()
    _copy_lib_for_spark_workers(metis_lib_file.name)

    # Also ship the Metis lib file so worker nodes can deserialize Metis
    # internal data structures.
    conf = SparkConf()
    conf.setMaster(app.config['SPARK_MASTER'])
    conf.setAppName('chronology:metis')
    parallelism = int(app.config.get('SPARK_PARALLELISM', 0))
    if parallelism:
      conf.set('spark.default.parallelism', parallelism)
    self.context = SparkContext(conf=conf,
                                pyFiles=[metis_lib_file.name],
                                serializer=MarshalSerializer())

    # Delete temporary Metis lib file.
    os.unlink(metis_lib_file.name)

    # We'll use this to parallelize fetching events in KronosSource.
    # The default of 8 is from:
    # https://spark.apache.org/docs/latest/configuration.html
    self.parallelism = parallelism or 8
Example #29
File: tests.py Project: 31z4/spark
 def setUp(self):
     class_name = self.__class__.__name__
     conf = SparkConf().set("spark.default.parallelism", 1)
     self.sc = SparkContext(appName=class_name, conf=conf)
     self.sc.setCheckpointDir("/tmp")
     # TODO: decrease duration to speed up tests
     self.ssc = StreamingContext(self.sc, self.duration)
Example #30
def _test():
    import doctest
    from pyspark.context import SparkContext
    from pyspark.sql import Row, SQLContext
    import pyspark.sql.dataframe
    globs = pyspark.sql.dataframe.__dict__.copy()
    sc = SparkContext('local[4]', 'PythonTest')
    globs['sc'] = sc
    globs['sqlCtx'] = SQLContext(sc)
    globs['df'] = sc.parallelize([Row(name='Alice', age=2), Row(name='Bob', age=5)]).toDF()
    globs['df2'] = sc.parallelize([Row(name='Tom', height=80), Row(name='Bob', height=85)]).toDF()
    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.dataframe, globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE)
    globs['sc'].stop()
    if failure_count:
        exit(-1)
Example #31
        # try again if port unavailable
        if check == notfound:
            port += 1

    # return the first available port
    return port


# this is the deprecated equivalent of ADD_JARS
add_files = None
if os.environ.get("ADD_FILES") is not None:
    add_files = os.environ.get("ADD_FILES").split(',')

if os.environ.get("SPARK_EXECUTOR_URI"):
    SparkContext.setSystemProperty("spark.executor.uri",
                                   os.environ["SPARK_EXECUTOR_URI"])

# setup mesos-based connection
conf = (SparkConf().setMaster(os.environ["SPARK_MASTER"]))

# set the UI port
conf.set("spark.ui.port", ui_get_available_port())

# configure docker containers as executors
conf.setSparkHome(os.environ.get("SPARK_HOME"))
conf.set("spark.mesos.executor.docker.image",
         "lab41/spark-mesos-dockerworker-ipython")
conf.set("spark.mesos.executor.home", "/usr/local/spark-1.4.1-bin-hadoop2.4")
conf.set("spark.executorEnv.MESOS_NATIVE_LIBRARY",
         "/usr/local/lib/libmesos.so")
conf.set("spark.network.timeout", "100")
Example #32
parser = argparse.ArgumentParser()
parser.add_argument('--JOB_DATE', dest='JOB_DATE')
parser.add_argument('--S3_BUCKET', dest='S3_BUCKET')
parser.add_argument('--REGION', dest='REGION')
args = parser.parse_args()
print(args)
JOB_DATE = args.JOB_DATE
S3_BUCKET = args.S3_BUCKET
REGION = args.REGION

READ_PATH = 'data/' + JOB_DATE
S3_READ_PATH = 's3://' + S3_BUCKET + '/' + READ_PATH
WRITE_PATH = 'curated/' + JOB_DATE
S3_WRITE_PATH = 's3://' + S3_BUCKET + '/' + WRITE_PATH

sc = SparkContext.getOrCreate()
spark = SparkSession(sc)


def does_s3key_exist(bucket, key, ext):
    s3 = boto3.resource('s3')
    bucket = s3.Bucket(bucket)
    objects = bucket.objects.all()
    FOUND = 0
    for object in objects:
        if object.key.startswith(key) and object.key.endswith(ext):
            FOUND = 1
    return FOUND


if does_s3key_exist(S3_BUCKET, READ_PATH, '.csv') == 1:
Example #33
import pyspark
from pyspark.context import SparkContext
from pyspark import SparkConf

conf = SparkConf().setMaster("local")
sc = SparkContext(conf=conf)
sc.setLogLevel("ERROR")

# Load the adjacency list file
AdjList1 = sc.textFile("02AdjacencyList.txt")
print AdjList1.collect()

AdjList2 = AdjList1.map(lambda line: line.split(" "))
print AdjList2.collect()
AdjList3 = AdjList2.map(lambda x: (int(x[0]), [int(y) for y in x[1:]]))
AdjList3.persist()
print AdjList3.collect()

nNumOfNodes = AdjList3.count()
print "Total Number of nodes"
print nNumOfNodes

# Initialize each page's rank; since we use mapValues, the resulting RDD will have the same partitioner as links
print "Initialization"
PageRankValues = AdjList3.mapValues(lambda v: 1 / float(nNumOfNodes))
print PageRankValues.collect()

# Run 30 iterations
print "Run 30 Iterations"
for i in range(1, 30):
    print "Number of Iterations"
Example #34
def main():
    sc = SparkContext()
    glueContext = GlueContext(sc)
    spark = glueContext.spark_session
    spark.conf.set("spark.sql.session.timeZone", "GMT+07:00")
    # get dynamic frame source
    dyf_crm_contacts = glueContext.create_dynamic_frame.from_catalog(
        database='crm_native', table_name='contacts')

    print('dyf_crm_contacts::schema')
    dyf_crm_contacts.printSchema()
    dyf_crm_contacts = dyf_crm_contacts.resolveChoice(specs=[('_key',
                                                              'cast:long')])
    try:
        df_flag = spark.read.parquet(
            "s3a://dts-odin/flag/student_status/user_profile/communication/email.parquet"
        )
        read_from_index = df_flag.collect()[0]['flag']
        print('read from index: ', read_from_index)
        dyf_crm_contacts = Filter.apply(
            frame=dyf_crm_contacts, f=lambda x: x["_key"] > read_from_index)
    except:
        print('read flag file error ')

    dyf_crm_contacts = dyf_crm_contacts.select_fields(
        ['_key', 'Id', 'Email', 'Email2'])
    dy_source_voxy_cache = dyf_crm_contacts.toDF()
    dy_source_voxy_cache = dy_source_voxy_cache.cache()
    dyf_crm_contacts = DynamicFrame.fromDF(dy_source_voxy_cache, glueContext,
                                           'dyf_crm_contacts')

    today = date.today()
    d4 = today.strftime("%Y-%m-%d")
    print("d4 =", d4)

    print('the number of new contacts: ', dyf_crm_contacts.count())

    if (dyf_crm_contacts.count() > 0):

        # print('Chay vao day nhe------------------')
        # print('dyf_crm_contacts::----------------')
        # dyf_crm_contacts.printSchema()
        # try:
        #--------------------------------------------------------------------------------------------------------------#
        dyf_crm_contacts = Filter.apply(
            frame=dyf_crm_contacts,
            f=lambda x: x["Id"] is not None and x["Id"] != '' and x[
                "Email"] is not None and x["Email"] != '')
        # --------------------------------------------------------------------------------------------------------------#

        # --------------------------------------------------------------------------------------------------------------#

        dy_crm_contacts = dyf_crm_contacts.toDF()
        dy_crm_contacts = dy_crm_contacts.withColumn('communication_type',
                                                     f.lit(2))
        dy_crm_contacts = dy_crm_contacts.withColumn('is_primary', f.lit(0))
        dy_crm_contacts = dy_crm_contacts.withColumn('is_deleted', f.lit(0))
        dy_crm_contacts = dy_crm_contacts.withColumn('last_update_date',
                                                     f.lit(d4))
        dyf_crm_contacts = DynamicFrame.fromDF(dy_crm_contacts, glueContext,
                                               'dyf_crm_contacts')

        dyf_crm_contacts = dyf_crm_contacts.resolveChoice(
            specs=[('last_update_date', 'cast:string')])

        applymapping2 = ApplyMapping.apply(
            frame=dyf_crm_contacts,
            mappings=[
                ("Id", "int", "user_id", "bigint"),
                ("communication_type", 'int', 'communication_type', 'int'),
                ("is_primary", 'int', 'is_primary', 'int'),
                ("is_deleted", 'int', 'is_deleted', 'int'),
                ("Email", 'string', 'comunication', 'string'),
                ("last_update_date", 'string', 'last_update_date', 'timestamp')
            ])

        #
        #
        resolvechoice2 = ResolveChoice.apply(
            frame=applymapping2,
            choice="make_cols",
            transformation_ctx="resolvechoice2")
        dropnullfields6 = DropNullFields.apply(
            frame=resolvechoice2, transformation_ctx="dropnullfields2")

        print('dropnullfields6::schema')
        dropnullfields6.printSchema()
        dropnullfields6.show(5)

        datasink1 = glueContext.write_dynamic_frame.from_jdbc_conf(
            frame=dropnullfields6,
            catalog_connection="glue_redshift",
            connection_options={
                "dbtable": "user_communication",
                "database": "dts_odin"
            },
            redshift_tmp_dir="s3n://dts-odin/temp/user/communication/fullname/",
            transformation_ctx="datasink4")

        dyf_crm_contacts = Filter.apply(
            frame=dyf_crm_contacts,
            f=lambda x: x["Email2"] is not None and x["Email2"] != '')

        applymapping2 = ApplyMapping.apply(
            frame=dyf_crm_contacts,
            mappings=[
                ("Id", "int", "user_id", "bigint"),
                ("communication_type", 'int', 'communication_type', 'int'),
                ("is_primary", 'int', 'is_primary', 'int'),
                ("is_deleted", 'int', 'is_deleted', 'int'),
                ("Email2", 'string', 'comunication', 'string'),
                ("last_update_date", 'string', 'last_update_date', 'timestamp')
            ])

        #
        #
        resolvechoice2 = ResolveChoice.apply(
            frame=applymapping2,
            choice="make_cols",
            transformation_ctx="resolvechoice2")
        dropnullfields6 = DropNullFields.apply(
            frame=resolvechoice2, transformation_ctx="dropnullfields2")

        datasink1 = glueContext.write_dynamic_frame.from_jdbc_conf(
            frame=dropnullfields6,
            catalog_connection="glue_redshift",
            connection_options={
                "dbtable": "user_communication",
                "database": "dts_odin"
            },
            redshift_tmp_dir="s3n://dts-odin/temp/user/communication/fullname/",
            transformation_ctx="datasink4")

        df_datasource = dyf_crm_contacts.toDF()
        flag = df_datasource.agg({"_key": "max"}).collect()[0][0]
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        df.write.parquet(
            "s3a://dts-odin/flag/student_status/user_profile/communication/email.parquet",
            mode="overwrite")
Example #35
#!/usr/bin/python
# -*- coding: utf-8 -*-
from datetime import datetime
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

import pyspark
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql import Row
sc = SparkContext('local')
spark = SparkSession(sc)
bucket = spark._jsc.hadoopConfiguration().get("fs.gs.system.bucket")
project = spark._jsc.hadoopConfiguration().get("fs.gs.project.id")
todays_date = datetime.strftime(datetime.today(), "%Y-%m-%d-%H-%M-%S")

accum = sc.accumulator(0)

print "begin to map input"

train_set = sc.textFile("gs://dataproc-0e3e0110-db09-4037-98cc-dc355651aba0-asia-southeast1/tensorflow/data/picfeed/train_set_combine").map(lambda row: row.split("\t")).map(lambda p: Row(uid=p[0], urlid=p[1], ts=p[2], label=p[3]))
combine_uinfo = sc.textFile("gs://dataproc-0e3e0110-db09-4037-98cc-dc355651aba0-asia-southeast1/tensorflow/data/picfeed/data_files_combine_toterm_new/part-00000").map(lambda row: row.split("\t", 1))

print "finish to map input"

def process_uinfo(line):
    if len(line) != 2:
        return Row(urlid=line, urlinfo="")
    return Row(urlid=line[0], urlinfo=line[1])
Example #36
#!/usr/bin/env python
# coding: utf-8

# In[ ]:


from pyspark.context import SparkContext
from pyspark.conf import SparkConf
sc = SparkContext.getOrCreate(SparkConf())
import re
text_file = sc.textFile("./README.txt")
counts = text_file.flatMap(lambda line: line.split(" ")).map(lambda word: (re.sub("[^a-zA-Z\\d]","",word), 1)).reduceByKey(lambda a, b: a + b)
counts.saveAsTextFile("./output_python")

Example #37
from pyspark.context import SparkContext
from pyspark.conf import SparkConf
from awsglue.dynamicframe import DynamicFrame
from awsglue.utils import getResolvedOptions
from awsglue.transforms import *

from awsglue.context import GlueContext
from awsglue.job import Job
import sys

args = getResolvedOptions(sys.argv, ['TempDir', 'JOB_NAME'])

conf = SparkConf()

conf.set("spark.sql.parquet.compression.codec", "snappy")
conf.set("spark.sql.parquet.writeLegacyFormat", "true")

sc = SparkContext()

glueContext = GlueContext(sc)

spark = glueContext.spark_session

job = Job(glueContext)

job.init(args['JOB_NAME'], args)

input_file_path = "s3://troy-dwh-external/user_behavior/2016_funnel.csv"

df = spark.read.option("header","true")\
 .option("inferSchema","true")\
 .option("quote","\"")\
 .option("escape","\"").csv(input_file_path)
Example #38
class ContextWrapper(object):
    def __init__(self):
        pass

    def set_context(self, java_gateway):
        spark_context_wrapper = java_gateway.entry_point.sparkContextWrapper()
        j_spark_conf = spark_context_wrapper.sparkConf()
        p_spark_conf = SparkConf(_jvm=java_gateway.jvm, _jconf=j_spark_conf)
        j_spark_context = spark_context_wrapper.javaContext()
        self._context = SparkContext(jsc=j_spark_context,
                                     gateway=java_gateway,
                                     conf=p_spark_conf)

    def set_sql_context(self, java_gateway):
        from pyspark.sql import SQLContext
        spark_context_wrapper = java_gateway.entry_point.sparkContextWrapper()
        self._sql_context = SQLContext(
            self._context,
            sparkSession=spark_context_wrapper.sparkSession(False),
            jsqlContext=spark_context_wrapper.sqlContext())

    def set_hive_context(self, java_gateway):
        from pyspark.sql import HiveContext
        spark_context_wrapper = java_gateway.entry_point.sparkContextWrapper()
        self._hive_context = HiveContext(self._context,
                                         spark_context_wrapper.hiveContext())

    def set_session(self, java_gateway):
        from pyspark.sql import SparkSession
        self._session = SparkSession.builder.config(
            conf=self._context.getConf()).getOrCreate()

    def set_hive_session(self, java_gateway):
        from pyspark.sql import SparkSession
        self._session = SparkSession.builder.config(
            conf=self._context.getConf()).enableHiveSupport().getOrCreate()

    def set_streaming_context(self, java_gateway):
        from pyspark.streaming import StreamingContext
        spark_context_wrapper = java_gateway.entry_point.sparkContextWrapper()
        self._streaming_context = StreamingContext(
            self._context,
            java_gateway.entry_point.sparkStreamingWrapper().
            getDurationSeconds())

    @property
    def context(self):
        return self._context

    @property
    def sql_context(self):
        return self._sql_context

    @property
    def hive_context(self):
        return self._hive_context

    @property
    def session(self):
        return self._session

    @property
    def streaming_context(self):
        return self._streaming_context
Example #39
    from pyspark.sql import DataFrame
    from py4j.java_collections import MapConverter
    if isinstance(df, DataFrame):
        intp.saveDFToCsv(
            df._jdf, path, hasheader, isOverwrite,
            MapConverter().convert(option, gateway._gateway_client))
    else:
        print(str(df))


java_import(gateway.jvm, "scala.Tuple2")

jsc = intp.getJavaSparkContext()
jconf = intp.getSparkConf()
conf = SparkConf(_jvm=gateway.jvm, _jconf=jconf)
sc = SparkContext(jsc=jsc, gateway=gateway, conf=conf)
sqlc = HiveContext(sc, intp.sqlContext())
sqlContext = sqlc
spark = SparkSession(sc, intp.getSparkSession())

##add pyfiles
try:
    pyfile = sys.argv[4]
    pyfiles = pyfile.split(',')
    for i in range(len(pyfiles)):
        if "" != pyfiles[i]:
            sc.addPyFile(pyfiles[i])
except Exception as e:
    print("add pyfile error: " + pyfile)

Example #40
    from inception.imagenet_data import ImagenetData

    print("argv:", argv)
    sys.argv = argv

    FLAGS = tf.app.flags.FLAGS
    FLAGS._parse_flags()
    print("FLAGS:", FLAGS.__dict__['__flags'])

    dataset = ImagenetData(subset=FLAGS.subset)
    assert dataset.data_files()
    if tf.gfile.Exists(FLAGS.eval_dir):
        tf.gfile.DeleteRecursively(FLAGS.eval_dir)
    tf.gfile.MakeDirs(FLAGS.eval_dir)

    cluster_spec, server = TFNode.start_cluster_server(ctx)

    inception_eval.evaluate(dataset)


if __name__ == '__main__':
    sc = SparkContext(conf=SparkConf().setAppName("grid_imagenet_eval"))
    num_executors = int(sc._conf.get("spark.executor.instances"))
    num_ps = 0

    #cluster = TFCluster.reserve(sc, num_executors, num_ps, False, TFCluster.InputMode.TENSORFLOW)
    #cluster.start(main_fun, sys.argv)
    cluster = TFCluster.run(sc, main_fun, sys.argv, num_executors, num_ps,
                            False, TFCluster.InputMode.TENSORFLOW)
    cluster.shutdown()
Example #41
 def test_failed_sparkcontext_creation(self):
     # Regression test for SPARK-1550
     self.sc.stop()
     self.assertRaises(Exception, lambda: SparkContext("an-invalid-master-name"))
     self.sc = SparkContext("local")
Example #42
class PySparkStreamingTestCase(unittest.TestCase):

    timeout = 20  # seconds
    duration = 1

    def setUp(self):
        class_name = self.__class__.__name__
        conf = SparkConf().set("spark.default.parallelism", 1)
        self.sc = SparkContext(appName=class_name, conf=conf)
        self.sc.setCheckpointDir("/tmp")
        # TODO: decrease duration to speed up tests
        self.ssc = StreamingContext(self.sc, self.duration)

    def tearDown(self):
        self.ssc.stop()

    def wait_for(self, result, n):
        start_time = time.time()
        while len(result) < n and time.time() - start_time < self.timeout:
            time.sleep(0.01)
        if len(result) < n:
            print "timeout after", self.timeout

    def _take(self, dstream, n):
        """
        Return the first `n` elements in the stream (will start and stop).
        """
        results = []

        def take(_, rdd):
            if rdd and len(results) < n:
                results.extend(rdd.take(n - len(results)))

        dstream.foreachRDD(take)

        self.ssc.start()
        self.wait_for(results, n)
        return results

    def _collect(self, dstream, n, block=True):
        """
        Collect each RDDs into the returned list.

        :return: list, which will have the collected items.
        """
        result = []

        def get_output(_, rdd):
            if rdd and len(result) < n:
                r = rdd.collect()
                if r:
                    result.append(r)

        dstream.foreachRDD(get_output)

        if not block:
            return result

        self.ssc.start()
        self.wait_for(result, n)
        return result

    def _test_func(self, input, func, expected, sort=False, input2=None):
        """
        @param input: dataset for the test. This should be list of lists.
        @param func: wrapped function. This function should return PythonDStream object.
        @param expected: expected output for this testcase.
        """
        if not isinstance(input[0], RDD):
            input = [self.sc.parallelize(d, 1) for d in input]
        input_stream = self.ssc.queueStream(input)
        if input2 and not isinstance(input2[0], RDD):
            input2 = [self.sc.parallelize(d, 1) for d in input2]
        input_stream2 = self.ssc.queueStream(
            input2) if input2 is not None else None

        # Apply test function to stream.
        if input2:
            stream = func(input_stream, input_stream2)
        else:
            stream = func(input_stream)

        result = self._collect(stream, len(expected))
        if sort:
            self._sort_result_based_on_key(result)
            self._sort_result_based_on_key(expected)
        self.assertEqual(expected, result)

    def _sort_result_based_on_key(self, outputs):
        """Sort the list based on first value."""
        for output in outputs:
            output.sort(key=lambda x: x[0])
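A hedged illustration of how _test_func above is meant to be driven by a concrete test (the subclass name and transformation are hypothetical):

class BasicOperationTests(PySparkStreamingTestCase):
    def test_map(self):
        input = [[1, 2, 3], [4, 5]]

        def func(dstream):
            return dstream.map(lambda x: x * 2)

        expected = [[2, 4, 6], [8, 10]]
        self._test_func(input, func, expected)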
app_name = "Log2Graph"

delimiter = "\t"
input_file_name = "/Users/woodie/Downloads/sfexpress_rawdata_first2500k.txt"
node_info_fields = [
    "id", "main_business", "oversea", "industry_lv1", "industry_lv2",
    "industry_lv3", "area_code", "area_desc", "area_city", "coop_month"
]
transc_info_fields = ["transc_id", "ship_timestamp", "deliver_timestamp"]
item_info_fields = ["item_info"]
src_node_fields = ["src_" + field for field in node_info_fields]
trg_node_fields = ["trg_" + field for field in node_info_fields]

# Init Spark Context as running in local mode
sc = SparkContext("local")
# Create a basic Spark Session
spark = SparkSession \
 .builder \
 .appName(app_name) \
 .getOrCreate()
# Specify properties of fields,
# including field name and related data type
log_fields = src_node_fields + transc_info_fields + trg_node_fields + item_info_fields

# ------------------------------------------
# Pipeline of the Workflow

# Load rawdata from local file system
# And split each row by specific delimiter
source = sc.textFile(input_file_name) \
Example #44
        `features`
        """
        return self._call_java("userFactors")

    @property
    @since("1.4.0")
    def itemFactors(self):
        """
        a DataFrame that stores item factors in two columns: `id` and
        `features`
        """
        return self._call_java("itemFactors")


if __name__ == "__main__":
    import doctest
    from pyspark.context import SparkContext
    from pyspark.sql import SQLContext
    globs = globals().copy()
    # The small batch size here ensures that we see multiple batches,
    # even in these small test examples:
    sc = SparkContext("local[2]", "ml.recommendation tests")
    sqlContext = SQLContext(sc)
    globs['sc'] = sc
    globs['sqlContext'] = sqlContext
    (failure_count, test_count) = doctest.testmod(globs=globs,
                                                  optionflags=doctest.ELLIPSIS)
    sc.stop()
    if failure_count:
        exit(-1)
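For context, a hedged sketch of how the userFactors / itemFactors DataFrames described above are obtained from an ALS fit, reusing the sqlContext created in the harness (toy data; column names are illustrative):

from pyspark.ml.recommendation import ALS

ratings = sqlContext.createDataFrame(
    [(0, 0, 4.0), (0, 1, 2.0), (1, 1, 3.0)],
    ["user", "item", "rating"])
model = ALS(rank=2, maxIter=5, userCol="user",
            itemCol="item", ratingCol="rating").fit(ratings)
model.userFactors.show()  # columns: id, features
model.itemFactors.show()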
Example #45
            # skip comment
            if s.strip().startswith("#"):
                continue

            if final_code:
                final_code += "\n" + s
            else:
                final_code = s

        if sc is None:
            jsc = kernel.javaSparkContext()
            if jsc != None:
                jconf = kernel.sparkConf()
                conf = SparkConf(_jvm=gateway.jvm, _jconf=jconf)
                sc = SparkContext(jsc=jsc, gateway=gateway, conf=conf)

        if final_code:
            compiled_code = compile(final_code, "<string>", "exec")
            #sc.setJobGroup(jobGroup, "Spark Kernel")
            eval(compiled_code)

        state.markSuccess(code_info.codeId(), output.get())
    except Py4JJavaError:
        excInnerError = traceback.format_exc(
        )  # format_tb() does not return the inner exception
        innerErrorStart = excInnerError.find("Py4JJavaError:")
        if innerErrorStart > -1:
            excInnerError = excInnerError[innerErrorStart:]
        state.markFailure(code_info.codeId(),
                          excInnerError + str(sys.exc_info()))
Example #46
#!/usr/bin/python
# -*- coding: utf-8 -*-
from datetime import datetime
import sys

reload(sys)
sys.setdefaultencoding('utf-8')
import pyspark
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql.functions import array

sc = SparkContext('local')
spark = SparkSession(sc)
print "begin to map input"
fieldSchema = StructType([
    StructField("label", IntegerType(), True),
    StructField("pdef", DoubleType(), True),
    StructField("pbeau", DoubleType(), True),
    StructField("pnum", IntegerType(), True),
    StructField("s_term", StringType(), True),
    StructField("sumclick", LongType(), True),
    StructField("sumshow", LongType(), True),
    StructField("ts", LongType(), True),
    StructField("uid", LongType(), True),
    StructField("urlid", LongType(), True),
    StructField("user_s_term", StringType(), True)
])
train_set_join_user_model = spark.read.csv(
Example #47
  print("num_images: ", num_images)
  print("num_labels: ", num_labels)
  print("samples: ", samples)


if __name__ == "__main__":
  import argparse

  from pyspark.context import SparkContext
  from pyspark.conf import SparkConf

  parser = argparse.ArgumentParser()
  parser.add_argument("--format", help="output format", choices=["csv", "csv2", "pickle", "tf", "tfr"], default="csv")
  parser.add_argument("--num-partitions", help="Number of output partitions", type=int, default=10)
  parser.add_argument("--output", help="HDFS directory to save examples in parallelized format", default="mnist_data")
  parser.add_argument("--read", help="read previously saved examples", action="store_true")
  parser.add_argument("--verify", help="verify saved examples after writing", action="store_true")

  args = parser.parse_args()
  print("args:", args)

  sc = SparkContext(conf=SparkConf().setAppName("mnist_parallelize"))

  if not args.read:
    # Note: these files are inside the mnist.zip file
    writeMNIST(sc, "mnist/train-images-idx3-ubyte.gz", "mnist/train-labels-idx1-ubyte.gz", args.output + "/train", args.format, args.num_partitions)
    writeMNIST(sc, "mnist/t10k-images-idx3-ubyte.gz", "mnist/t10k-labels-idx1-ubyte.gz", args.output + "/test", args.format, args.num_partitions)

  if args.read or args.verify:
    readMNIST(sc, args.output + "/train", args.format)
Example #48
'''
@Author: Matheus Barros
Date: 23/04/2021

'''

from pyspark.context import SparkContext, SparkConf
from pyspark.sql.context import SQLContext
from pyspark.sql.session import SparkSession

#PARALLELIZING WITH 2 CORES
conf = SparkConf().setAppName("rdd basic").setMaster("local[2]")

sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
spark = SparkSession(sc)

RDD = sc.parallelize([1, 2, 3, 4])
RDD_map = RDD.map(lambda x: x * x)
RDD_map = RDD_map.collect()

print(RDD_map)

RDD1 = sc.parallelize([1, 2, 3, 4])
RDD_filter = RDD1.filter(lambda x: x > 2)
RDD_filter = RDD_filter.collect()

print(RDD_filter)

RDD2 = sc.parallelize(["hello world", "how are you"])
    print("Error: Default Python used is Python%s" % sys.version_info.major)
    print("\tSet env variable PYSPARK_PYTHON to Python2 binary and re-run it.")
    sys.exit(1)


import os
import platform
import pyspark
from pyspark.context import SparkContext
from pyspark.storagelevel import StorageLevel

# this is the equivalent of ADD_JARS
add_files = os.environ.get("ADD_FILES").split(',') if os.environ.get("ADD_FILES") is not None else None

if os.environ.get("SPARK_EXECUTOR_URI"):
    SparkContext.setSystemProperty("spark.executor.uri", os.environ["SPARK_EXECUTOR_URI"])

sc = SparkContext(appName="PySparkShell", pyFiles=add_files)

print("""Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 1.0.2
      /_/
""")
print("Using Python version %s (%s, %s)" % (
    platform.python_version(),
    platform.python_build()[0],
    platform.python_build()[1]))
print("SparkContext available as sc.")
Example #50
0
 def __init__(self, nbCores = 2):
     self.sc = SparkContext("local[{}]".format(nbCores))
     self.sqlContext = SQLContext(self.sc)
     self.spark = SparkSession(self.sc)
Example #51
0
# -*- coding: utf-8 -*-
from pyspark.context import SparkContext
import re

if __name__ == "__main__":
    spark = SparkContext("local", "dataAnalyse_floorandage")

    data = spark.textFile("./transaction/transaction_bj.txt")

    def reduceAge(str):
        if str != '未知':
            age = 2020 - int(str)
            if age < 5:
                return "0~5年"
            elif age < 15:
                return "5~15年"
            elif age < 30:
                return "15~30年"
            else:
                return "30年以上"
        else:
            return str

    # Data cleaning for the average price, transaction price and listing price
    # Some listings have no average price, so the transaction year gets extracted by mistake; it should be recomputed from the transaction price and floor area, but that is skipped for now due to time constraints
    def cleanData(line):
        line[4] = line[4][:3]
        line[5] = reduceAge(line[5])
        line[10] = re.findall(r"\d+", line[10])[0]
        line[11] = re.findall(r"\d+", line[11])[0]
        # if float(line[10]) < 2100:
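Example #51 is cut off before cleanData is applied; presumably it is mapped over the split lines of the transaction file. Here is a small, self-contained sketch of that split-then-clean pattern with made-up sample rows (the field layout is illustrative, not the author's schema).

import re
from pyspark.context import SparkContext

sc = SparkContext("local", "clean_sketch")
raw = sc.parallelize(["houseA,12层,约1200万", "houseB,7层,约860万"])

def clean(fields):
    fields[1] = re.findall(r"\d+", fields[1])[0]   # keep only the digits: "12层" -> "12"
    fields[2] = re.findall(r"\d+", fields[2])[0]   # "约1200万" -> "1200"
    return fields

print(raw.map(lambda line: line.split(",")).map(clean).collect())
sc.stop()
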
Example #52
0
class TestRDDFunctions(PySparkTestCase):

    def test_failed_sparkcontext_creation(self):
        # Regression test for SPARK-1550
        self.sc.stop()
        self.assertRaises(Exception, lambda: SparkContext("an-invalid-master-name"))
        self.sc = SparkContext("local")

    def test_save_as_textfile_with_unicode(self):
        # Regression test for SPARK-970
        x = u"\u00A1Hola, mundo!"
        data = self.sc.parallelize([x])
        tempFile = tempfile.NamedTemporaryFile(delete=True)
        tempFile.close()
        data.saveAsTextFile(tempFile.name)
        raw_contents = ''.join(input(glob(tempFile.name + "/part-0000*")))
        self.assertEqual(x, unicode(raw_contents.strip(), "utf-8"))

    def test_save_as_textfile_with_utf8(self):
        x = u"\u00A1Hola, mundo!"
        data = self.sc.parallelize([x.encode("utf-8")])
        tempFile = tempfile.NamedTemporaryFile(delete=True)
        tempFile.close()
        data.saveAsTextFile(tempFile.name)
        raw_contents = ''.join(input(glob(tempFile.name + "/part-0000*")))
        self.assertEqual(x, unicode(raw_contents.strip(), "utf-8"))

    def test_transforming_cartesian_result(self):
        # Regression test for SPARK-1034
        rdd1 = self.sc.parallelize([1, 2])
        rdd2 = self.sc.parallelize([3, 4])
        cart = rdd1.cartesian(rdd2)
        result = cart.map(lambda (x, y): x + y).collect()

    def test_transforming_pickle_file(self):
        # Regression test for SPARK-2601
        data = self.sc.parallelize(["Hello", "World!"])
        tempFile = tempfile.NamedTemporaryFile(delete=True)
        tempFile.close()
        data.saveAsPickleFile(tempFile.name)
        pickled_file = self.sc.pickleFile(tempFile.name)
        pickled_file.map(lambda x: x).collect()

    def test_cartesian_on_textfile(self):
        # Regression test for
        path = os.path.join(SPARK_HOME, "python/test_support/hello.txt")
        a = self.sc.textFile(path)
        result = a.cartesian(a).collect()
        (x, y) = result[0]
        self.assertEqual("Hello World!", x.strip())
        self.assertEqual("Hello World!", y.strip())

    def test_deleting_input_files(self):
        # Regression test for SPARK-1025
        tempFile = tempfile.NamedTemporaryFile(delete=False)
        tempFile.write("Hello World!")
        tempFile.close()
        data = self.sc.textFile(tempFile.name)
        filtered_data = data.filter(lambda x: True)
        self.assertEqual(1, filtered_data.count())
        os.unlink(tempFile.name)
        self.assertRaises(Exception, lambda: filtered_data.count())

    def testAggregateByKey(self):
        data = self.sc.parallelize([(1, 1), (1, 1), (3, 2), (5, 1), (5, 3)], 2)

        def seqOp(x, y):
            x.add(y)
            return x

        def combOp(x, y):
            x |= y
            return x

        sets = dict(data.aggregateByKey(set(), seqOp, combOp).collect())
        self.assertEqual(3, len(sets))
        self.assertEqual(set([1]), sets[1])
        self.assertEqual(set([2]), sets[3])
        self.assertEqual(set([1, 3]), sets[5])

    def test_itemgetter(self):
        rdd = self.sc.parallelize([range(10)])
        from operator import itemgetter
        self.assertEqual([1], rdd.map(itemgetter(1)).collect())
        self.assertEqual([(2, 3)], rdd.map(itemgetter(2, 3)).collect())

    def test_namedtuple_in_rdd(self):
        from collections import namedtuple
        Person = namedtuple("Person", "id firstName lastName")
        jon = Person(1, "Jon", "Doe")
        jane = Person(2, "Jane", "Doe")
        theDoes = self.sc.parallelize([jon, jane])
        self.assertEquals([jon, jane], theDoes.collect())

    def test_large_broadcast(self):
        N = 100000
        data = [[float(i) for i in range(300)] for i in range(N)]
        bdata = self.sc.broadcast(data)  # 270MB
        m = self.sc.parallelize(range(1), 1).map(lambda x: len(bdata.value)).sum()
        self.assertEquals(N, m)

    def test_zip_with_different_serializers(self):
        a = self.sc.parallelize(range(5))
        b = self.sc.parallelize(range(100, 105))
        self.assertEqual(a.zip(b).collect(), [(0, 100), (1, 101), (2, 102), (3, 103), (4, 104)])
        a = a._reserialize(BatchedSerializer(PickleSerializer(), 2))
        b = b._reserialize(MarshalSerializer())
        self.assertEqual(a.zip(b).collect(), [(0, 100), (1, 101), (2, 102), (3, 103), (4, 104)])

    def test_zip_with_different_number_of_items(self):
        a = self.sc.parallelize(range(5), 2)
        # different number of partitions
        b = self.sc.parallelize(range(100, 106), 3)
        self.assertRaises(ValueError, lambda: a.zip(b))
        # different number of batched items in JVM
        b = self.sc.parallelize(range(100, 104), 2)
        self.assertRaises(Exception, lambda: a.zip(b).count())
        # different number of items in one pair
        b = self.sc.parallelize(range(100, 106), 2)
        self.assertRaises(Exception, lambda: a.zip(b).count())
        # same total number of items, but different distributions
        a = self.sc.parallelize([2, 3], 2).flatMap(range)
        b = self.sc.parallelize([3, 2], 2).flatMap(range)
        self.assertEquals(a.count(), b.count())
        self.assertRaises(Exception, lambda: a.zip(b).count())
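The testAggregateByKey case above exercises aggregateByKey with an empty set as the zero value. Stripped of the test harness, the same call looks like this minimal sketch (not part of the original suite):

from pyspark.context import SparkContext

sc = SparkContext("local[2]", "aggregate_sketch")
pairs = sc.parallelize([(1, 1), (1, 1), (3, 2), (5, 1), (5, 3)], 2)
# zeroValue=set(), seqOp folds each value into the per-partition set, combOp merges partitions
sets = pairs.aggregateByKey(set(), lambda acc, v: acc | {v}, lambda a, b: a | b)
print(dict(sets.collect()))   # {1: {1}, 3: {2}, 5: {1, 3}}
sc.stop()
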
Example #53
0
    def getStepSize(self):
        """
        Gets the value of stepSize or its default value.
        """
        return self.getOrDefault(self.stepSize)


class GBTRegressionModel(JavaModel):
    """
    Model fitted by GBTRegressor.
    """


if __name__ == "__main__":
    import doctest
    from pyspark.context import SparkContext
    from pyspark.sql import SQLContext
    globs = globals().copy()
    # The small batch size here ensures that we see multiple batches,
    # even in these small test examples:
    sc = SparkContext("local[2]", "ml.regression tests")
    sqlContext = SQLContext(sc)
    globs['sc'] = sc
    globs['sqlContext'] = sqlContext
    (failure_count, test_count) = doctest.testmod(globs=globs,
                                                  optionflags=doctest.ELLIPSIS)
    sc.stop()
    if failure_count:
        exit(-1)
Example #54
0
                       for l, p in zip(labels, preds)] 
            tf_feed.batch_results(results)
            print("acc: {0}".format(acc))

      if sv.should_stop() or step >= args.steps:
        tf_feed.terminate()

    # Ask for all the services to stop.
    print("{0} stopping supervisor".format(datetime.now().isoformat()))
    sv.stop()




if __name__ == '__main__':
    sc = SparkContext(conf=SparkConf().setAppName("read hive with model and  save to hdfs "))
    hive_context = HiveContext(sc)
    executors = sc._conf.get("spark.executor.instances")
    num_executors = int(executors) if executors is not None else 1
    num_ps = 1

    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", help="input hdfs path")
    parser.add_argument("-o", "--output", help="output hdfs path")
    parser.add_argument("-m", "--model", help="HDFS path to save/load model during train/inference",
                        default="mnist_model")
    parser.add_argument("-tb", "--tensorboard", help="launch tensorboard process", default=False)
    parser.add_argument("-b", "--batch_size", help="number of records per batch", type=int, default=100)
    parser.add_argument("-e", "--epochs", help="number of epochs", type=int, default=1)
    parser.add_argument("-s", "--steps", help="maximum number of steps", type=int, default=1000)
    parser.add_argument("-X", "--mode", help="train|inference", default="train")
Example #55
0
def main():
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session
    # ETL for the "attended LS/SC" and "successfully attended LS/SC" statuses:
    ############# Load data from the mdl_logsservice_in_out table
    mdl_logsservice_in_out = glueContext.create_dynamic_frame.from_catalog(
        database="topicalms", table_name="mdl_logsservice_in_out_cutoff")

    # Select only the required fields
    mdl_logsservice_in_out = mdl_logsservice_in_out.select_fields([
        '_key', 'id', 'userid', 'roomid', 'time_in', 'time_out', 'date_in',
        'action'
    ])
    mdl_logsservice_in_out = mdl_logsservice_in_out.resolveChoice(
        specs=[('_key', 'cast:long')])

    df_flag_1 = spark.read.parquet(
        "s3://dts-odin/flag/flag_LS_LSSC_CutOff.parquet")
    max_key = df_flag_1.collect()[0]['flag']
    print("max_key:  ", max_key)
    # Only keep records with _key greater than the saved max_key; do not load the full table
    mdl_logsservice_in_out = Filter.apply(frame=mdl_logsservice_in_out,
                                          f=lambda x: x["_key"] > max_key)
    # data = mdl_logsservice_in_out.toDF()
    # data = data.cache()
    # mdl_logsservice_in_out = DynamicFrame.fromDF(data, glueContext, "mdl_logsservice_in_out")
    print("Count data 1:  ", mdl_logsservice_in_out.count())
    # mdl_logsservice_in_out.toDF().show()
    if (mdl_logsservice_in_out.count() > 0):
        try:
            mdl_tpebbb = glueContext.create_dynamic_frame.from_catalog(
                database="topicalms", table_name="mdl_tpebbb")
            mdl_tpebbb = mdl_tpebbb.select_fields(
                ['id', 'timeavailable', 'calendar_code',
                 'roomtype']).rename_field('id', 'room_id')
            mdl_tpe_calendar_teach = glueContext.create_dynamic_frame.from_catalog(
                database="topicalms", table_name="mdl_tpe_calendar_teach")
            mdl_tpe_calendar_teach = mdl_tpe_calendar_teach.select_fields(
                ['status', 'calendar_code',
                 'type_class']).rename_field('calendar_code', 'code_calendar')

            mdl_logsservice_room_start = glueContext.create_dynamic_frame.from_catalog(
                database="topicalms", table_name="mdl_logsservice_room_start")
            mdl_logsservice_room_start = mdl_logsservice_room_start.select_fields(
                ['roomid', 'timecreated']).rename_field('roomid', 'id_room')

            mdl_role_assignments = glueContext.create_dynamic_frame.from_catalog(
                database="topicalms", table_name="mdl_role_assignments")
            mdl_role_assignments = mdl_role_assignments.select_fields(
                ['userid', 'roleid']).rename_field('userid', 'user_id')

            # Filter the data
            mdl_tpe_calendar_teach = Filter.apply(frame=mdl_tpe_calendar_teach,
                                                  f=lambda x: x["status"] >= 0)

            data_tpe_bbb = Filter.apply(
                frame=mdl_tpebbb,
                f=lambda x: x["roomtype"] == 'ROOM' and
                (x["calendar_code"] is not None and x["calendar_code"] != ''))

            join_calendar_teach = Join.apply(data_tpe_bbb,
                                             mdl_tpe_calendar_teach,
                                             'calendar_code',
                                             'code_calendar').drop_fields([
                                                 'calendar_code',
                                                 'code_calendar'
                                             ])

            data_in_out = Filter.apply(
                frame=mdl_logsservice_in_out,
                f=lambda x: x["time_out"] is not None and
                (x["userid"] is not None and x["userid"] != '') and
                (x["roomid"] is not None and x["roomid"] != ''))

            data_mdl_role_assignments = Filter.apply(
                frame=mdl_role_assignments,
                f=lambda x: x["roleid"] == '5' and x["user_id"] is not None)

            join_data_role = Join.apply(data_in_out, data_mdl_role_assignments,
                                        'userid', 'user_id')

            # Map LS/SC attendance records to class information
            join_data_tpebbb = Join.apply(join_data_role, join_calendar_teach,
                                          'roomid', 'room_id')

            mdl_logsservice_room_start = Filter.apply(
                frame=mdl_logsservice_room_start,
                f=lambda x: x["id_room"] is not None and x["id_room"] != '')

            df_data_roomstart = mdl_logsservice_room_start.toDF()
            df_data_tpebbb = join_data_tpebbb.toDF()
            print("Count data 222:  ", df_data_tpebbb.count())
            # df_data_tpebbb.show()
            # Map LS/SC attendance records to room-start information
            join_bbb = df_data_tpebbb.join(
                df_data_roomstart,
                df_data_tpebbb.roomid == df_data_roomstart.id_room,
                'left_outer')

            data_bbb = DynamicFrame.fromDF(join_bbb, glueContext, "data_bbb")

            # convert data
            df_bbb = data_bbb.toDF()
            df_bbb = df_bbb.withColumn(
                'time_start',
                when(f.col("timecreated").isNull(),
                     df_bbb['timeavailable']).otherwise(df_bbb['timecreated']))
            df_bbb = df_bbb.withColumn(
                'timein',
                when(df_bbb.time_in < df_bbb.time_start,
                     df_bbb['time_start']).otherwise(df_bbb['time_in']))
            df_bbb = df_bbb.withColumn('time_study',
                                       when((df_bbb.time_out < df_bbb.time_in) | (df_bbb.time_out < df_bbb.time_start),
                                            f.lit(0)).otherwise(df_bbb.time_out - df_bbb.timein)) \
                .withColumn('id_time', from_unixtime(unix_timestamp(df_bbb.date_in, "yyyy-MM-dd"), "yyyyMMdd")) \
                .withColumn('date_login', from_unixtime(df_bbb.timein)) \
                .withColumn('date_logout', from_unixtime(df_bbb.time_out))

            # df_bbb.cache()
            data_lssc_bbb = DynamicFrame.fromDF(df_bbb, glueContext,
                                                "data_lssc_bbb")

            data_lssc_bbb = data_lssc_bbb.resolveChoice(specs=[('time_study',
                                                                'cast:long')])
            data_lssc_bbb.printSchema()
            # Select the fields and data types to push into the database
            applymapping = ApplyMapping.apply(
                frame=data_lssc_bbb,
                mappings=[("id", "string", "id", "bigint"),
                          ("userid", "string", "student_id", "string"),
                          ("roomid", 'string', 'room_id', 'string'),
                          ("id_time", 'string', 'date_id', 'bigint'),
                          ("date_login", "string", "time_in", "timestamp"),
                          ("date_logout", "string", "time_out", "timestamp"),
                          ("time_study", "long", "time_study", "long"),
                          ("type_class", "string", "class_type", "string"),
                          ("date_in", "string", "created_time", "timestamp"),
                          ("action", "string", "action", "string")])

            resolvechoice = ResolveChoice.apply(
                frame=applymapping,
                choice="make_cols",
                transformation_ctx="resolvechoice2")
            dropnullfields = DropNullFields.apply(
                frame=resolvechoice, transformation_ctx="dropnullfields")

            print("Count data:  ", dropnullfields.count())
            datasink5 = glueContext.write_dynamic_frame.from_jdbc_conf(
                frame=dropnullfields,
                catalog_connection="glue_redshift",
                connection_options={
                    "dbtable":
                    "fact_lich_su_hoc",
                    "database":
                    "dts_odin",
                    "postactions":
                    """ call proc_insert_lssc_thanh_cong_dau_tien()"""
                },
                redshift_tmp_dir=
                "s3n://dts-odin/topicalms/mdl_logsservice_in_out/",
                transformation_ctx="datasink5")

            df_lssc = dropnullfields.toDF()
            # Save the status-change records for "attended LS/SC":
            print("Count data data_df_lsscStudy:  ")
            df_lssc = df_lssc.groupby(
                'student_id', 'room_id', 'date_id', 'class_type').agg(
                    f.sum('time_study').alias("measure2_tmp"),
                    f.count('room_id').alias("measure1"))
            df_lssc = df_lssc.withColumn(
                'to_status_id',
                when(df_lssc.class_type == 'LS',
                     f.lit(30)).otherwise(f.lit(31)))
            df_lssc = df_lssc.withColumn('measure2', df_lssc.measure2_tmp / 60)
            print('co_hoc_lssc schema1: ')
            df_lssc.printSchema()
            data_df_lsscStudy = DynamicFrame.fromDF(df_lssc, glueContext,
                                                    "data_df_lsscStudy")
            data_df_lsscStudy = data_df_lsscStudy.resolveChoice(
                specs=[('measure1', 'cast:double')])
            data_df_lsscStudy = data_df_lsscStudy.resolveChoice(
                specs=[('measure2', 'cast:double')])
            print('co_hoc_lssc schema: ')
            data_df_lsscStudy.printSchema()
            applymappingStudy = ApplyMapping.apply(
                frame=data_df_lsscStudy,
                mappings=[("student_id", "string", "student_id", "bigint"),
                          ("date_id", "bigint", "change_status_date_id",
                           "bigint"),
                          ("to_status_id", "int", "to_status_id", "bigint"),
                          ("measure1", 'double', 'measure1', 'double'),
                          ("measure2", 'double', 'measure2', 'double')])

            resolvechoiceStudy = ResolveChoice.apply(
                frame=applymappingStudy,
                choice="make_cols",
                transformation_ctx="resolvechoiceStudy")
            dropnullfieldsStudy = DropNullFields.apply(
                frame=resolvechoiceStudy,
                transformation_ctx="dropnullfieldsStudy")
            dropnullfieldsStudy.printSchema()
            # dropnullfieldsStudy.toDF().show()
            # Insert the "successfully attended LS or SC" status
            datasinkStudy = glueContext.write_dynamic_frame.from_jdbc_conf(
                frame=dropnullfieldsStudy,
                catalog_connection="glue_redshift",
                connection_options={
                    "dbtable":
                    "temp_mapping_status",
                    "database":
                    "dts_odin",
                    "postactions":
                    """ insert into mapping_changed_status_student(student_id, change_status_date_id, to_status_id, measure1, measure2)
                                                                                                            select student_id, change_status_date_id, to_status_id, measure1, measure2 from temp_mapping_status;
                                                                                                            update mapping_changed_status_student set user_id = (select user_id from user_map where source_type = 2 and source_id = student_id)
                                                                                                                where user_id is null;
                                                                                                        DROP TABLE IF EXISTS temp_mapping_status
                                                                                                                 """
                },
                redshift_tmp_dir=
                "s3n://dts-odin/topicalms/mdl_logsservice_in_out/",
                transformation_ctx="datasinkStudy")

            # Save the status-change records for "successfully attended LS/SC":
            # Successful attendance: study time >= 36 minutes
            df_lssc = dropnullfields.toDF()
            df_lssc = df_lssc.groupby(
                'student_id', 'room_id', 'date_id',
                'class_type').agg(f.sum('time_study').alias("sum_time_study"))
            df_lssc = df_lssc.where('sum_time_study >= 2160')
            df_lssc = df_lssc.groupby(
                'student_id', 'date_id', 'class_type').agg(
                    f.sum('sum_time_study').alias("measure2_tmp"),
                    f.count('room_id').alias("measure1"))
            df_lssc = df_lssc.withColumn(
                'to_status_id',
                when(df_lssc.class_type == 'LS',
                     f.lit(11)).otherwise(f.lit(12)))
            df_lssc = df_lssc.withColumn('measure2', df_lssc.measure2_tmp / 60)
            data_df_lssc = DynamicFrame.fromDF(df_lssc, glueContext,
                                               "data_df_lssc")

            data_df_lssc = data_df_lssc.resolveChoice(specs=[('measure1',
                                                              'cast:double')])
            data_df_lssc = data_df_lssc.resolveChoice(specs=[('measure2',
                                                              'cast:double')])
            print('data_df_lssc schema: ')
            data_df_lssc.printSchema()
            applymappingSuccess = ApplyMapping.apply(
                frame=data_df_lssc,
                mappings=[("student_id", "string", "student_id", "bigint"),
                          ("date_id", "bigint", "change_status_date_id",
                           "bigint"),
                          ("to_status_id", "int", "to_status_id", "bigint"),
                          ("measure1", 'double', 'measure1', 'double'),
                          ("measure2", 'double', 'measure2', 'double')])

            resolvechoiceSuccess = ResolveChoice.apply(
                frame=applymappingSuccess,
                choice="make_cols",
                transformation_ctx="resolvechoiceSuccess")
            dropnullfieldsSuccess = DropNullFields.apply(
                frame=resolvechoiceSuccess,
                transformation_ctx="dropnullfieldsSuccess")
            dropnullfieldsSuccess.printSchema()
            # dropnullfieldsSuccess.toDF().show()
            ## Insert the "successfully attended LS or SC" status
            datasinkSuccess = glueContext.write_dynamic_frame.from_jdbc_conf(
                frame=dropnullfieldsSuccess,
                catalog_connection="glue_redshift",
                connection_options={
                    "dbtable":
                    "temp_mapping_status",
                    "database":
                    "dts_odin",
                    "postactions":
                    """ insert into mapping_changed_status_student(student_id, change_status_date_id, to_status_id, measure1, measure2)
                                                                                                select student_id, change_status_date_id, to_status_id, measure1, measure2 from temp_mapping_status;
                                                                                                update mapping_changed_status_student set user_id = (select user_id from user_map where source_type = 2 and source_id = student_id)
                                                                                                    where user_id is null;
                                                                                                DROP TABLE IF EXISTS temp_mapping_status
                                                                                                     """
                },
                redshift_tmp_dir=
                "s3n://dts-odin/topicalms/mdl_logsservice_in_out/",
                transformation_ctx="datasinkSuccess")

            # data_df_lssc.printSchema()
            # df_lssc = data_df_lssc.toDF()
            # print "Count data:  ", dropnullfields1.count()
            # dropnullfields1.toDF().show()
            # Write the flag
            # Get the max _key in the data source
            datasourceTmp = mdl_logsservice_in_out.toDF()
            flag = datasourceTmp.agg({"_key": "max"}).collect()[0][0]

            flag_data = [flag]
            df = spark.createDataFrame(flag_data, "long").toDF('flag')

            # Overwrite the _key flag in S3
            df.write.parquet("s3a://dts-odin/flag/flag_LS_LSSC_CutOff.parquet",
                             mode="overwrite")

        except Exception as e:
            print("No new data")
            print(e)
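The job above keeps state between runs through a one-row parquet "flag" file: it reads the last processed _key, filters the source down to newer records, and overwrites the flag once the batch is written. A minimal sketch of that incremental-load pattern, with a local path and toy data standing in for the S3 flag file and the Glue catalog tables:

from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession

sc = SparkContext("local", "flag_sketch")
spark = SparkSession(sc)

flag_path = "/tmp/flag_demo.parquet"   # stand-in for the S3 flag location
spark.createDataFrame([(0,)], ["flag"]).write.parquet(flag_path, mode="overwrite")

source = spark.createDataFrame([(1, "a"), (2, "b"), (3, "c")], ["_key", "value"])

max_key = spark.read.parquet(flag_path).collect()[0]["flag"]   # last processed key
new_rows = source.where(source["_key"] > max_key)              # only records newer than the flag
new_rows.show()

new_flag = new_rows.agg({"_key": "max"}).collect()[0][0]       # highest key seen in this run
spark.createDataFrame([(new_flag,)], ["flag"]).write.parquet(flag_path, mode="overwrite")
sc.stop()
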
Example #56
0
 def setUp(self):
     self._old_sys_path = list(sys.path)
     class_name = self.__class__.__name__
     self.sc = SparkContext('local[4]', class_name, batchSize=2)
Example #57
0
from __future__ import print_function
from pyspark.conf import SparkConf
from pyspark.context import SparkContext

config = SparkConf()
config.setMaster("local[*]")
config.setAppName("ParallelizeJOB")

sc = SparkContext(conf=config)
dataRDD = sc.parallelize([100, 200, 300, 400])

#<class 'pyspark.rdd.RDD'>
print(type(dataRDD))

dataRDD.foreach(lambda eachElement: print(eachElement))
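One caveat: foreach runs on the executors, so the prints above only reach the console because the master is local[*]; on a cluster they end up in the executor logs. To print on the driver, the usual approach is to collect (or take) first, as in this short sketch:

for element in dataRDD.collect():   # brings the elements back to the driver
    print(element)
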
Example #58
0
    'input_table',
    'output_database',
    'output_table',
    'output_path'
]


args = getResolvedOptions(sys.argv, params)
region = args['region']
input_database = args['input_database']
input_table = args['input_table']
output_database = args['output_database']
output_table = args['output_table']
output_path = args['output_path']

glue_context = GlueContext(SparkContext.getOrCreate())
spark = glue_context.spark_session
job = Job(glue_context)
job.init(args['JOB_NAME'], args)


# Create DynamicFrame from Data Catalog
dyf = glue_context.create_dynamic_frame.from_catalog(
    database=input_database,
    table_name=input_table,
    transformation_ctx='dyf'
)

# Resolve choice type with make_struct
dyf = ResolveChoice.apply(
    frame=dyf,
Example #59
0
from pyspark.context import SparkContext
from pyspark.sql import SparkSession, Row
from pyspark.ml.linalg import Vectors
from pyspark.ml.clustering import GaussianMixture

if __name__ == "__main__":
    spark = SparkSession \
        .builder \
        .appName("ChiSqSelectorExample") \
        .getOrCreate()
    rawData = spark.sparkContext.textFile("file:///home/tianlei/iris.txt")

    def f(x):
        rel = {}
        rel['features'] = Vectors.dense(float(x[0]), float(x[1]), float(x[2]),
                                        float(x[3]))
        return rel

    df = spark.sparkContext.textFile("file:///usr/local/spark/iris.txt").map(
        lambda line: line.split(',')).map(lambda p: Row(**f(p))).toDF()
    # Build a simple GaussianMixture object, set its number of clusters to 3, and keep the defaults for the other parameters.
    gm = GaussianMixture().setK(3).setPredictionCol(
        "Prediction").setProbabilityCol("Probability")
    gmm = gm.fit(df)
    # After processing the dataset with transform(), print it to see each sample's predicted cluster and probability vector
    # (for clarity most rows are omitted here; only three are selected):
    result = gmm.transform(df)
    result.show(150, False)
    # Once the model is fitted, its parameters can be inspected. Unlike KMeans, a GMM does not report cluster centers directly;
    # instead it reports the parameters of each mixture component (a multivariate Gaussian). In the ML implementation,
    # each mixture component is stored as a MultivariateGaussian (in the org.apache.spark.ml.stat.distribution package).
    # The weights member of GaussianMixtureModel gives each component's weight,
    # and the gaussians member gives each component's parameters (mean vector and covariance matrix):
    for i in range(3):
        print("Component " + str(i) + " : weight is " + str(gmm.weights[i]) +
Example #60
0
# coding=utf-8

import sys
# pyspark cannot be found on PYTHONPATH, so add its path manually here
sys.path.append('/usr/local/spark-2.1.1-bin-hadoop2.7/python')

from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession

from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

sc = SparkContext('local', 'logistic_regression')
spark = SparkSession(sc)

# Load training data
data = spark.read.format("libsvm").load("../data/mllib/sample_libsvm_data.txt")

# Split the data into train and test
splits = data.randomSplit([0.6, 0.4], 1234)
train = splits[0]
test = splits[1]

# create the trainer and set its parameters
nb = NaiveBayes(smoothing=1.0, modelType="multinomial")

# train the model
model = nb.fit(train)

# select example rows to display.
predictions = model.transform(test)
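The fragment stops right after transform(). A natural next step, and presumably why MulticlassClassificationEvaluator is imported above, is to score the predictions; this short sketch assumes the standard "label" and "prediction" columns produced by the code above rather than reproducing the original file's ending.

evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                              predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))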