Code example #1
File: context.py Project: olegz/spark-1
def _test():
    import doctest
    from pyspark.context import SparkContext
    from pyspark.sql import Row, SQLContext
    import pyspark.sql.context
    globs = pyspark.sql.context.__dict__.copy()
    sc = SparkContext('local[4]', 'PythonTest')
    globs['sc'] = sc
    globs['sqlCtx'] = sqlCtx = SQLContext(sc)
    globs['rdd'] = rdd = sc.parallelize([
        Row(field1=1, field2="row1"),
        Row(field1=2, field2="row2"),
        Row(field1=3, field2="row3")
    ])
    globs['df'] = sqlCtx.createDataFrame(rdd)
    jsonStrings = [
        '{"field1": 1, "field2": "row1", "field3":{"field4":11}}',
        '{"field1" : 2, "field3":{"field4":22, "field5": [10, 11]},'
        '"field6":[{"field7": "row2"}]}',
        '{"field1" : null, "field2": "row3", '
        '"field3":{"field4":33, "field5": []}}'
    ]
    globs['jsonStrings'] = jsonStrings
    globs['json'] = sc.parallelize(jsonStrings)
    (failure_count, test_count) = doctest.testmod(pyspark.sql.context,
                                                  globs=globs,
                                                  optionflags=doctest.ELLIPSIS)
    globs['sc'].stop()
    if failure_count:
        exit(-1)
Code example #2
File: context.py Project: fangfangchen-spark/spark
def _test():
    import doctest
    from pyspark.context import SparkContext
    from pyspark.sql import Row, SQLContext
    import pyspark.sql.context
    globs = pyspark.sql.context.__dict__.copy()
    sc = SparkContext('local[4]', 'PythonTest')
    globs['sc'] = sc
    globs['sqlContext'] = SQLContext(sc)
    globs['rdd'] = rdd = sc.parallelize(
        [Row(field1=1, field2="row1"),
         Row(field1=2, field2="row2"),
         Row(field1=3, field2="row3")]
    )
    globs['df'] = rdd.toDF()
    jsonStrings = [
        '{"field1": 1, "field2": "row1", "field3":{"field4":11}}',
        '{"field1" : 2, "field3":{"field4":22, "field5": [10, 11]},'
        '"field6":[{"field7": "row2"}]}',
        '{"field1" : null, "field2": "row3", '
        '"field3":{"field4":33, "field5": []}}'
    ]
    globs['jsonStrings'] = jsonStrings
    globs['json'] = sc.parallelize(jsonStrings)
    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.context, globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE)
    globs['sc'].stop()
    if failure_count:
        exit(-1)
Code example #3
File: group.py Project: wonyonyon/spark
def _test():
    import doctest
    from pyspark.context import SparkContext
    from pyspark.sql import Row, SQLContext
    import pyspark.sql.group
    globs = pyspark.sql.group.__dict__.copy()
    sc = SparkContext('local[4]', 'PythonTest')
    globs['sc'] = sc
    globs['sqlContext'] = SQLContext(sc)
    globs['df'] = sc.parallelize([(2, 'Alice'), (5, 'Bob')]) \
        .toDF(StructType([StructField('age', IntegerType()),
                          StructField('name', StringType())]))
    globs['df3'] = sc.parallelize([
        Row(name='Alice', age=2, height=80),
        Row(name='Bob', age=5, height=85)
    ]).toDF()
    globs['df4'] = sc.parallelize([
        Row(course="dotNET", year=2012, earnings=10000),
        Row(course="Java", year=2012, earnings=20000),
        Row(course="dotNET", year=2012, earnings=5000),
        Row(course="dotNET", year=2013, earnings=48000),
        Row(course="Java", year=2013, earnings=30000)
    ]).toDF()

    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.group,
        globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE
        | doctest.REPORT_NDIFF)
    globs['sc'].stop()
    if failure_count:
        exit(-1)
Code example #4
File: context.py Project: jbkang/spark
def _test():
    import os
    import doctest
    from pyspark.context import SparkContext
    from pyspark.sql import Row, SQLContext
    import pyspark.sql.context

    os.chdir(os.environ["SPARK_HOME"])

    globs = pyspark.sql.context.__dict__.copy()
    sc = SparkContext("local[4]", "PythonTest")
    globs["sc"] = sc
    globs["sqlContext"] = SQLContext(sc)
    globs["rdd"] = rdd = sc.parallelize(
        [Row(field1=1, field2="row1"), Row(field1=2, field2="row2"), Row(field1=3, field2="row3")]
    )
    globs["df"] = rdd.toDF()
    jsonStrings = [
        '{"field1": 1, "field2": "row1", "field3":{"field4":11}}',
        '{"field1" : 2, "field3":{"field4":22, "field5": [10, 11]},' '"field6":[{"field7": "row2"}]}',
        '{"field1" : null, "field2": "row3", ' '"field3":{"field4":33, "field5": []}}',
    ]
    globs["jsonStrings"] = jsonStrings
    globs["json"] = sc.parallelize(jsonStrings)
    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.context, globs=globs, optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE
    )
    globs["sc"].stop()
    if failure_count:
        exit(-1)
Code example #5
File: group.py Project: JoeHorn/spark
def _test():
    import doctest
    from pyspark.context import SparkContext
    from pyspark.sql import Row, SQLContext
    import pyspark.sql.group
    globs = pyspark.sql.group.__dict__.copy()
    sc = SparkContext('local[4]', 'PythonTest')
    globs['sc'] = sc
    globs['sqlContext'] = SQLContext(sc)
    globs['df'] = sc.parallelize([(2, 'Alice'), (5, 'Bob')]) \
        .toDF(StructType([StructField('age', IntegerType()),
                          StructField('name', StringType())]))
    globs['df3'] = sc.parallelize([Row(name='Alice', age=2, height=80),
                                   Row(name='Bob', age=5, height=85)]).toDF()
    globs['df4'] = sc.parallelize([Row(course="dotNET", year=2012, earnings=10000),
                                   Row(course="Java",   year=2012, earnings=20000),
                                   Row(course="dotNET", year=2012, earnings=5000),
                                   Row(course="dotNET", year=2013, earnings=48000),
                                   Row(course="Java",   year=2013, earnings=30000)]).toDF()

    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.group, globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF)
    globs['sc'].stop()
    if failure_count:
        exit(-1)
Code example #6
File: sql.py Project: 7472741/spark
def _test():
    import doctest
    from array import array
    from pyspark.context import SparkContext
    globs = globals().copy()
    # The small batch size here ensures that we see multiple batches,
    # even in these small test examples:
    sc = SparkContext('local[4]', 'PythonTest', batchSize=2)
    globs['sc'] = sc
    globs['sqlCtx'] = SQLContext(sc)
    globs['rdd'] = sc.parallelize([{"field1" : 1, "field2" : "row1"},
        {"field1" : 2, "field2": "row2"}, {"field1" : 3, "field2": "row3"}])
    jsonStrings = ['{"field1": 1, "field2": "row1", "field3":{"field4":11}}',
       '{"field1" : 2, "field2": "row2", "field3":{"field4":22}}',
       '{"field1" : 3, "field2": "row3", "field3":{"field4":33}}']
    globs['jsonStrings'] = jsonStrings
    globs['json'] = sc.parallelize(jsonStrings)
    globs['nestedRdd1'] = sc.parallelize([
        {"f1" : array('i', [1, 2]), "f2" : {"row1" : 1.0}},
        {"f1" : array('i', [2, 3]), "f2" : {"row2" : 2.0}}])
    globs['nestedRdd2'] = sc.parallelize([
        {"f1" : [[1, 2], [2, 3]], "f2" : set([1, 2]), "f3" : (1, 2)},
        {"f1" : [[2, 3], [3, 4]], "f2" : set([2, 3]), "f3" : (2, 3)}])
    (failure_count, test_count) = doctest.testmod(globs=globs,optionflags=doctest.ELLIPSIS)
    globs['sc'].stop()
    if failure_count:
        exit(-1)
Code example #7
def _test():
    import os
    import doctest
    from pyspark.context import SparkContext
    from pyspark.sql import Row, SQLContext
    import pyspark.sql.context

    os.chdir(os.environ["SPARK_HOME"])

    globs = pyspark.sql.context.__dict__.copy()
    sc = SparkContext('local[4]', 'PythonTest')
    globs['sc'] = sc
    globs['sqlContext'] = SQLContext(sc)
    globs['rdd'] = rdd = sc.parallelize([
        Row(field1=1, field2="row1"),
        Row(field1=2, field2="row2"),
        Row(field1=3, field2="row3")
    ])
    globs['df'] = rdd.toDF()
    jsonStrings = [
        '{"field1": 1, "field2": "row1", "field3":{"field4":11}}',
        '{"field1" : 2, "field3":{"field4":22, "field5": [10, 11]},'
        '"field6":[{"field7": "row2"}]}',
        '{"field1" : null, "field2": "row3", '
        '"field3":{"field4":33, "field5": []}}'
    ]
    globs['jsonStrings'] = jsonStrings
    globs['json'] = sc.parallelize(jsonStrings)
    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.context,
        globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE)
    globs['sc'].stop()
    if failure_count:
        exit(-1)
Code example #8
File: dataframe.py Project: lorenzfischer/spark
def _test():
    import doctest
    from pyspark.context import SparkContext
    from pyspark.sql import Row, SQLContext
    import pyspark.sql.dataframe
    globs = pyspark.sql.dataframe.__dict__.copy()
    sc = SparkContext('local[4]', 'PythonTest')
    globs['sc'] = sc
    globs['sqlCtx'] = SQLContext(sc)
    globs['df'] = sc.parallelize(
        [Row(name='Alice', age=2),
         Row(name='Bob', age=5)]).toDF()
    globs['df2'] = sc.parallelize(
        [Row(name='Tom', height=80),
         Row(name='Bob', height=85)]).toDF()
    globs['df3'] = sc.parallelize([
        Row(name='Alice', age=2, height=80),
        Row(name='Bob', age=5, height=85)
    ]).toDF()
    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.dataframe,
        globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE)
    globs['sc'].stop()
    if failure_count:
        exit(-1)
Code example #9
File: dataframe.py Project: EugenCepoi/spark
def _test():
    import doctest
    from pyspark.context import SparkContext
    from pyspark.sql import Row, SQLContext
    import pyspark.sql.dataframe
    globs = pyspark.sql.dataframe.__dict__.copy()
    sc = SparkContext('local[4]', 'PythonTest')
    globs['sc'] = sc
    globs['sqlContext'] = SQLContext(sc)
    globs['df'] = sc.parallelize([(2, 'Alice'), (5, 'Bob')])\
        .toDF(StructType([StructField('age', IntegerType()),
                          StructField('name', StringType())]))
    globs['df2'] = sc.parallelize([Row(name='Tom', height=80), Row(name='Bob', height=85)]).toDF()
    globs['df3'] = sc.parallelize([Row(name='Alice', age=2),
                                   Row(name='Bob', age=5)]).toDF()
    globs['df4'] = sc.parallelize([Row(name='Alice', age=10, height=80),
                                  Row(name='Bob', age=5, height=None),
                                  Row(name='Tom', age=None, height=None),
                                  Row(name=None, age=None, height=None)]).toDF()

    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.dataframe, globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF)
    globs['sc'].stop()
    if failure_count:
        exit(-1)
Code example #10
def _test():
    import doctest
    from pyspark.context import SparkContext
    from pyspark.sql import Row, SQLContext
    import pyspark.sql.dataframe
    globs = pyspark.sql.dataframe.__dict__.copy()
    sc = SparkContext('local[4]', 'PythonTest')
    globs['sc'] = sc
    globs['sqlContext'] = SQLContext(sc)
    globs['df'] = sc.parallelize([(2, 'Alice'), (5, 'Bob')])\
        .toDF(StructType([StructField('age', IntegerType()),
                          StructField('name', StringType())]))
    globs['df2'] = sc.parallelize(
        [Row(name='Tom', height=80),
         Row(name='Bob', height=85)]).toDF()
    globs['df3'] = sc.parallelize([
        Row(name='Alice', age=2, height=80),
        Row(name='Bob', age=5, height=85)
    ]).toDF()

    globs['df4'] = sc.parallelize([
        Row(name='Alice', age=10, height=80),
        Row(name='Bob', age=5, height=None),
        Row(name='Tom', age=None, height=None),
        Row(name=None, age=None, height=None)
    ]).toDF()

    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.dataframe,
        globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE
        | doctest.REPORT_NDIFF)
    globs['sc'].stop()
    if failure_count:
        exit(-1)
Code example #11
File: dataframe.py Project: wso2/wso2-spark
def _test():
    import doctest
    from pyspark.context import SparkContext
    from pyspark.sql import Row, SQLContext
    import pyspark.sql.dataframe

    globs = pyspark.sql.dataframe.__dict__.copy()
    sc = SparkContext("local[4]", "PythonTest")
    globs["sc"] = sc
    globs["sqlContext"] = SQLContext(sc)
    globs["df"] = sc.parallelize([(2, "Alice"), (5, "Bob")]).toDF(
        StructType([StructField("age", IntegerType()), StructField("name", StringType())])
    )
    globs["df2"] = sc.parallelize([Row(name="Tom", height=80), Row(name="Bob", height=85)]).toDF()
    globs["df4"] = sc.parallelize(
        [
            Row(name="Alice", age=10, height=80),
            Row(name="Bob", age=5, height=None),
            Row(name="Tom", age=None, height=None),
            Row(name=None, age=None, height=None),
        ]
    ).toDF()

    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.dataframe,
        globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF,
    )
    globs["sc"].stop()
    if failure_count:
        exit(-1)
Code example #12
File: sql.py Project: heyook/spark
def _test():
    import doctest
    from array import array
    from pyspark.context import SparkContext

    globs = globals().copy()
    # The small batch size here ensures that we see multiple batches,
    # even in these small test examples:
    sc = SparkContext("local[4]", "PythonTest", batchSize=2)
    globs["sc"] = sc
    globs["sqlCtx"] = SQLContext(sc)
    globs["rdd"] = sc.parallelize(
        [{"field1": 1, "field2": "row1"}, {"field1": 2, "field2": "row2"}, {"field1": 3, "field2": "row3"}]
    )
    globs["nestedRdd1"] = sc.parallelize(
        [{"f1": array("i", [1, 2]), "f2": {"row1": 1.0}}, {"f1": array("i", [2, 3]), "f2": {"row2": 2.0}}]
    )
    globs["nestedRdd2"] = sc.parallelize(
        [
            {"f1": [[1, 2], [2, 3]], "f2": set([1, 2]), "f3": (1, 2)},
            {"f1": [[2, 3], [3, 4]], "f2": set([2, 3]), "f3": (2, 3)},
        ]
    )
    (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
    globs["sc"].stop()
    if failure_count:
        exit(-1)
Code example #13
class PyEdgeRDDTestCase(unittest.TestCase):
    """
    Test collect, take, count, mapValues,
    filter and innerJoin for EdgeRDD
    """

    def setUp(self):
        class_name = self.__class__.__name__
        conf = SparkConf().set("spark.default.parallelism", 1)
        self.sc = SparkContext(appName=class_name, conf=conf)
        self.sc.setCheckpointDir("/tmp")

    def tearDown(self):
        self.sc.stop()

    # TODO
    def collect(self):
        vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertices = VertexRDD(vertexData)
        results = vertices.collect()
        self.assertEqual(results, [(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])

    # TODO
    def take(self):
        vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertices = VertexRDD(vertexData)
        results = vertices.collect()
        self.assertEqual(results, [(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])

    # TODO
    def count(self):
        vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertices = VertexRDD(vertexData)
        results = vertices.count()
        self.assertEqual(results, 2)

    # TODO
    def mapValues(self):
        vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertices = VertexRDD(vertexData)
        results = vertices.collect()
        self.assertEqual(results, 2)

    # TODO
    def filter(self):
        return

    # TODO
    def innerJoin(self):
        vertexData0 = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertexData1 = self.sc.parallelize([(1, ("rxin", "student")), (2, ("jgonzal", "postdoc"))])
        vertices0 = VertexRDD(vertexData0)
        vertices1 = VertexRDD(vertexData1)
        results = vertices0.innerJoin(vertices1).collect()
        self.assertEqual(results, [])
Code example #14
File: sql.py Project: twneale/spark
def _test():
    import doctest
    from array import array
    from pyspark.context import SparkContext
    globs = globals().copy()
    # The small batch size here ensures that we see multiple batches,
    # even in these small test examples:
    sc = SparkContext('local[4]', 'PythonTest', batchSize=2)
    globs['sc'] = sc
    globs['sqlCtx'] = SQLContext(sc)
    globs['rdd'] = sc.parallelize([{
        "field1": 1,
        "field2": "row1"
    }, {
        "field1": 2,
        "field2": "row2"
    }, {
        "field1": 3,
        "field2": "row3"
    }])
    jsonStrings = [
        '{"field1": 1, "field2": "row1", "field3":{"field4":11}}',
        '{"field1" : 2, "field3":{"field4":22, "field5": [10, 11]}, "field6":[{"field7": "row2"}]}',
        '{"field1" : null, "field2": "row3", "field3":{"field4":33, "field5": []}}'
    ]
    globs['jsonStrings'] = jsonStrings
    globs['json'] = sc.parallelize(jsonStrings)
    globs['nestedRdd1'] = sc.parallelize([{
        "f1": array('i', [1, 2]),
        "f2": {
            "row1": 1.0
        }
    }, {
        "f1": array('i', [2, 3]),
        "f2": {
            "row2": 2.0
        }
    }])
    globs['nestedRdd2'] = sc.parallelize([{
        "f1": [[1, 2], [2, 3]],
        "f2": set([1, 2]),
        "f3": (1, 2)
    }, {
        "f1": [[2, 3], [3, 4]],
        "f2": set([2, 3]),
        "f3": (2, 3)
    }])
    (failure_count, test_count) = doctest.testmod(globs=globs,
                                                  optionflags=doctest.ELLIPSIS)
    globs['sc'].stop()
    if failure_count:
        exit(-1)
Code example #15
    def execute(self):
        print("execute ", self.__class__)
        from pyspark.context import SparkContext
        from pyspark.sql import SparkSession
        sc = SparkContext(appName='test PySparkTask')
        b = sc.broadcast([1, 2, 3, 4, 5])
        sc.parallelize([0, 0]).flatMap(lambda x: b.value).collect()
        spark = SparkSession.builder \
            .master("local") \
            .appName("Word Count") \
            .getOrCreate()
        data = [('Alice', 1), ('Monica', 2)]
        spark.createDataFrame(data).collect()
        spark.createDataFrame(data, ['name', 'age']).collect()
Code example #16
File: session.py Project: FUHENG0571/S
def _test():
    import os
    import doctest
    import sys
    from pyspark.context import SparkContext
    from pyspark.sql import Row
    import pyspark.sql.session

    os.chdir(os.environ["SPARK_HOME"])

    globs = pyspark.sql.session.__dict__.copy()
    sc = SparkContext('local[4]', 'PythonTest')
    globs['sc'] = sc
    globs['spark'] = SparkSession(sc)
    globs['rdd'] = rdd = sc.parallelize([
        Row(field1=1, field2="row1"),
        Row(field1=2, field2="row2"),
        Row(field1=3, field2="row3")
    ])
    globs['df'] = rdd.toDF()
    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.session,
        globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE)
    globs['sc'].stop()
    if failure_count:
        sys.exit(-1)
Code example #17
def spark_run():
    conf = SparkConf()\
        .setAppName("Tissue detector")\
        .setMaster("spark://spark-master:7077")
    #conf.set('spark.scheduler.mode', 'FAIR')
    sc = SparkContext(conf=conf)
    spark = SparkSession(sc)

    srcdir = '/data/promort/rois.test'
    slidedir='/data/promort/prom2/slides'
    tissuedir='/data/promort/tissue.test' # must have normal and tumor subdirs
    suf = 'tissue'
    
    dlist = os.scandir(srcdir)
    basenames = [e.name for e in dlist if e.is_dir()]

    # build job list
    job_list = []
    for basename in basenames:
        slide = os.path.join(slidedir, basename + '.mrxs')
        job_list.append((slide, tissuedir, basename, suf))
    procs = sc.defaultParallelism
    rdd = sc.parallelize(job_list, numSlices=procs)
    # run tissue detector for each slide
    rdd.foreachPartition(tissue_kernel())
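
The call rdd.foreachPartition(tissue_kernel()) invokes tissue_kernel once on the driver, so it is presumably a factory that returns the per-partition worker. A minimal hedged sketch of that shape (the tissue-detection logic itself is not part of the snippet above, and the name run_partition is illustrative):

def tissue_kernel():
    # Hypothetical factory: returns the function that foreachPartition will
    # call with an iterator of (slide, tissuedir, basename, suf) job tuples.
    def run_partition(jobs):
        for slide, tissuedir, basename, suf in jobs:
            # A real implementation would open `slide` and write the detected
            # tissue mask under `tissuedir` here.
            pass
    return run_partition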
Code example #18
def spark_run():
    conf = SparkConf()\
        .setAppName("Mask generator")\
        .setMaster("spark://spark-master:7077")
    #conf.set('spark.scheduler.mode', 'FAIR')
    sc = SparkContext(conf=conf)
    spark = SparkSession(sc)

    coredir = '/data/o/svs_review/cores'
    frdir = '/data/o/svs_review/focus_regions'
    slidedir='/data/o/slides'
    maskdir='/data/o/masks' # must have normal and tumor subdirs
    work = [['NORMAL', 'cores.csv', coredir],
            ['TUMOR', 'focus_regions.csv', frdir]]
    suf = 'mask'
    
    # build job list
    job_list = []
    dlist = os.scandir(coredir)
    basenames = [e.name for e in dlist if e.is_dir()]
    for basename in basenames:
        for lab, csvfile, srcdir in work:
            slide = os.path.join(slidedir, basename + '.mrxs')
            csvroot = os.path.join(srcdir, basename)
            job_list.append((slide, csvroot, maskdir, basename, lab,
                             suf, csvfile))
    # run jobs
    procs = sc.defaultParallelism
    rdd = sc.parallelize(job_list, numSlices=procs)
    rdd.foreach(mask_kernel)
Code example #19
def run():
    try:
        from private_data import cass_pass
    except ImportError:
        cass_pass = getpass('Insert Cassandra password: ')
    conf = SparkConf()
    # conf.set('spark.scheduler.mode', 'FAIR')
    sc = SparkContext(conf=conf)
    spark = SparkSession(sc)
    
    parts_0 = 48 # get list of patches
    parts_1 = 24 # extract patches
    parts_2 = 24 # write to cassandra

    samples = next(os.walk(os.path.join(masks_root,'normal')))[2]
    samples = [s.split('_')[0] for s in samples]
    par_samples = sc.parallelize(samples, numSlices=parts_0)
    cols = ['sample_name', 'sample_rep', 'x', 'y', 'label', 'data', 'patch_id']
    data = par_samples\
        .map(get_job_list)\
        .flatMapValues(lambda x: x)\
        .repartition(parts_1)\
        .flatMap(get_tiles)
    # save to Cassandra tables
    data.coalesce(parts_2)\
        .foreachPartition(write_to_cassandra(cass_pass))
Code example #20
File: readwriter.py Project: ZhangQingcheng/spark
def _test():
    import doctest
    from pyspark.context import SparkContext
    from pyspark.sql import Row, SQLContext
    import pyspark.sql.readwriter
    globs = pyspark.sql.readwriter.__dict__.copy()
    sc = SparkContext('local[4]', 'PythonTest')
    globs['sc'] = sc
    globs['sqlContext'] = SQLContext(sc)
    globs['df'] = sc.parallelize([(2, 'Alice'), (5, 'Bob')]) \
        .toDF(StructType([StructField('age', IntegerType()),
                          StructField('name', StringType())]))
    jsonStrings = [
        '{"field1": 1, "field2": "row1", "field3":{"field4":11}}',
        '{"field1" : 2, "field3":{"field4":22, "field5": [10, 11]},'
        '"field6":[{"field7": "row2"}]}',
        '{"field1" : null, "field2": "row3", '
        '"field3":{"field4":33, "field5": []}}'
    ]
    globs['jsonStrings'] = jsonStrings
    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.readwriter, globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF)
    globs['sc'].stop()
    if failure_count:
        exit(-1)
Code example #21
def _test():
    import doctest
    from pyspark.context import SparkContext
    from pyspark.sql import Row, SQLContext
    import pyspark.sql.readwriter
    globs = pyspark.sql.readwriter.__dict__.copy()
    sc = SparkContext('local[4]', 'PythonTest')
    globs['sc'] = sc
    globs['sqlContext'] = SQLContext(sc)
    globs['df'] = sc.parallelize([(2, 'Alice'), (5, 'Bob')]) \
        .toDF(StructType([StructField('age', IntegerType()),
                          StructField('name', StringType())]))
    jsonStrings = [
        '{"field1": 1, "field2": "row1", "field3":{"field4":11}}',
        '{"field1" : 2, "field3":{"field4":22, "field5": [10, 11]},'
        '"field6":[{"field7": "row2"}]}',
        '{"field1" : null, "field2": "row3", '
        '"field3":{"field4":33, "field5": []}}'
    ]
    globs['jsonStrings'] = jsonStrings
    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.readwriter,
        globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE
        | doctest.REPORT_NDIFF)
    globs['sc'].stop()
    if failure_count:
        exit(-1)
Code example #22
File: dataframe.py Project: Liuchang0812/spark
def _test():
    import doctest
    from pyspark.context import SparkContext
    from pyspark.sql import Row, SQLContext
    import pyspark.sql.dataframe
    globs = pyspark.sql.dataframe.__dict__.copy()
    sc = SparkContext('local[4]', 'PythonTest')
    globs['sc'] = sc
    globs['sqlCtx'] = SQLContext(sc)
    globs['df'] = sc.parallelize([Row(name='Alice', age=2), Row(name='Bob', age=5)]).toDF()
    globs['df2'] = sc.parallelize([Row(name='Tom', height=80), Row(name='Bob', height=85)]).toDF()
    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.dataframe, globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE)
    globs['sc'].stop()
    if failure_count:
        exit(-1)
Code example #23
def main():
    '''
    Some examples of standard Spark functions.
    '''

    # Starting a spark session
    sc = SparkContext()
    sc.setLogLevel("OFF")
    spark = SparkSession.builder.master("local").getOrCreate()

    # Creating a dataframe from data
    l = [(1, 'a', 'b', 'c', 'd'), (1, 'a', 'b', 'c', 'd')]
    df0 = spark.createDataFrame(l, ['col1', 'col2', 'col3', 'col4', 'col5'])

    # Creating a dataframe using rdd
    l = [(2, 'f', 'g'), (2, 'f', 'g')]
    rdd = sc.parallelize(l)
    schema = StructType([
        StructField("col6", IntegerType(), True),
        StructField("col7", StringType(), True),
        StructField("col8", StringType(), True)
    ])
    df1 = spark.createDataFrame(rdd, schema)

    # Joining both df0 and df1
    indexedDf0 = add_column_index(df0)
    indexedDf1 = add_column_index(df1)
    df2 = indexedDf0.join(indexedDf1, indexedDf1.idx == indexedDf0.idx,
                          'inner').drop("idx")
    df2.write.csv("/tmp/file.csv",
                  mode='overwrite',
                  header=True,
                  nullValue='NA',
                  quoteAll=False)

    # Read a CSV file into a dataframe (multiLine=True to avoid splitting data on '\n')
    df = spark.read.csv("/tmp/file.csv",
                        header=True,
                        quote='"',
                        escape='"',
                        multiLine=True)

    # Print the Schema
    df.printSchema()

    # Count the number of rows
    print('Number of rows: {}'.format(df.count()))

    # Show columns
    print('Columns:  {}'.format(df.columns))

    # Display the data
    df.show()

    # Count Total Number of records in csv files in a given path
    path = '/tmp'
    count_total(spark, path)
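
main() above joins df0 and df1 on an "idx" column produced by add_column_index, which is not shown. A plausible sketch of such a helper, assuming it simply appends a sequential row index (the implementation below is illustrative, not the original):

from pyspark.sql import functions as F
from pyspark.sql.window import Window

def add_column_index(df):
    # Append a sequential "idx" column so two DataFrames of equal length
    # can be joined positionally, as done in main() above.
    w = Window.orderBy(F.monotonically_increasing_id())
    return df.withColumn("idx", F.row_number().over(w))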
Code example #24
File: data_generator.py Project: cyrilsx/spark_sql
def generate():
    ss = SparkSession.builder.appName("generator").getOrCreate()
    df = ss.read.json("../ds/person.json")
    resultsDf = df.select("results")

    # Reuse the SparkContext that backs the existing SparkSession.
    sc = ss.sparkContext

    resultsDf.printSchema()
    nestedArray = resultsDf.select(fn.explode('results'))
    nestedArray.select("col.*").show()
    profiles = nestedArray.select("col.id", "col.name")\
        .rdd \
        .map(lambda e: {"id": e.id, "name": e.name, "birth_year": random.randint(1960, 2000), "hometown": "NA"})\
        .collect()

    with open("../ds/profile.json", "w") as f:
        json.dump(profiles, f)
Code example #25
def main():
    conf = SparkConf().setAppName('Home run count').setMaster('local')
    sc = SparkContext(conf=conf)
    spark = SparkSession.builder.appName("pyspark sql").getOrCreate()

    batting_path = "hdfs://localhost:8020/user/baseball/Batting.csv"
    batting_path="hdfs://localhost:9000/data/Batting.csv"

    data = spark.read.csv(batting_path, inferSchema=True, header=True)
    df = data.filter(data.yearID == '2018').select('playerID', 'teamID')

    # filter players that play on two or more teams
    players_vertices = df.groupBy("playerID").count().filter("count > 1").select("playerID")

    edges = df.withColumnRenamed("playerID", "src")
    edges = edges.withColumnRenamed("teamID", "dst")
    edges=players_vertices.join(edges, players_vertices.playerID == edges.src, "inner").select("src","dst")

    players_vertices=players_vertices.withColumnRenamed("playerID","id")
    teams_vertices = edges.select("dst").distinct().withColumnRenamed("dst","id")
    vertices = players_vertices.union(teams_vertices)
    # add one column with auto increasing id
    vertices = vertices.withColumn('num', monotonically_increasing_id())

    graph=GraphFrame(vertices,edges)

    # motif 1
    motif = graph.find("(a)-[]->(b); (a)-[]->(c)").filter("c.num > b.num")
    calculate(motif)
    # motif 2
    motif = graph.find("(a)-[]->(b); (a)-[]->(c);(d)-[]->(b); (d)-[]->(c)").filter("c.num > b.num and d.num > a.num")
    calculate(motif)
    # motif 3
    motif = graph.find("(a)-[]->(b); (a)-[]->(c);(d)-[]->(b); (d)-[]->(c);(e)-[]->(b); (e)-[]->(c)").filter(
        "c.num > b.num and d.num > a.num and e.num > d.num").distinct()
    calculate(motif)
    # motif 4
    motif = graph.find(
        "(a)-[]->(b); (a)-[]->(c);(d)-[]->(b); (d)-[]->(c);(e)-[]->(b); (e)-[]->(c);(f)-[]->(b);(f)-[]->(c)").filter(
        "c.num > b.num and d.num > a.num and e.num > d.num and f.num > e.num").distinct()
    calculate(motif)

    output_path = "/user/Wang/graphframe"

    # format the output
    final_result=[]
    for key in result.keys():
        line=""
        key_split=key.split("_")
        for i in range(len(key_split)):
            line += " " + key_split[i]
        for team in result[key]:
            line += " " + team
        final_result.append(line)

    data = sc.parallelize(final_result)
    data.saveAsTextFile(output_path)
Code example #26
def main():
    parser = argparse.ArgumentParser(description="Find Dependency inclusions")
    parser.add_argument('--path', type=str)
    parser.add_argument('--cores', type=str)
    args = parser.parse_args()

    sc = SparkContext(appName="DDM")
    sc.getConf().set("spark.executor.cores", args.cores)
    sc.getConf().set("spark.driver.cores", args.cores)
    sc.getConf().set("spark.worker.cores", args.cores)
    sc.getConf().set("spark.deploy.defaultCores", args.cores)
    sc.getConf().set("spark.driver.memory", "15g")
    global number_of_columns
    data = []
    file_headers = []
    for file in os.listdir(args.path):
        if file.endswith(".csv"):
            rdd = sc.textFile(os.path.join(args.path, file)).map(lambda line: line[1:-1].split("\";\""))

            file_data = rdd.collect()
            file_header = file_data[0]
            del file_data[0]
            file_data = [(number_of_columns, x) for x in file_data]
            data += file_data
            file_headers += file_header
            number_of_columns = number_of_columns + len(file_header)

    header_dummies = list(range(0, number_of_columns))
    rdd = sc.parallelize(data)
    values_as_key = rdd.flatMap(lambda el: list(zip(el[1], range(el[0], el[0] + len(el[1])))))
    unique_values = values_as_key.map(lambda x: (x[0], x[1])).groupByKey().mapValues(set)
    unique_values = unique_values.map(lambda x: (tuple(x[1]), 0)).reduceByKey(sum_func)
    matrix_per_key = unique_values.map(lambda x: make_candidate_matrix(x[0]))
    result_matrix = matrix_per_key.reduce(lambda x, y: matrix_and(x, y))

    assert len(result_matrix) == number_of_columns

    output = []
    for i in range(0, number_of_columns):
        assert len(result_matrix[i]) == number_of_columns
        output.append([])

    for i in range(0, len(result_matrix)):
        for j in range(0, len(result_matrix[i])):
            if i != j and result_matrix[i][j]:
                output[j].append(file_headers[i])

    for i in range(0, len(output)):
        row = output[i]
        if len(row) != 0:
            output_string = str(row[0])
            for j in range(1, len(row)):
                output_string += (", " + str(row[j]))
            print(str(file_headers[i]) + " < " + output_string)

    sc.stop()
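
The script above relies on a module-level number_of_columns counter (presumably starting at 0) and three helpers that are not shown: sum_func, make_candidate_matrix and matrix_and. A hedged sketch of what they might look like, inferred from how main() prunes inclusion-dependency candidates:

number_of_columns = 0  # grows as each CSV header is read in main()

def sum_func(a, b):
    # Combiner for reduceByKey; the reduced value is never inspected,
    # so any associative function works here.
    return a + b

def make_candidate_matrix(columns_with_value):
    # One boolean matrix per distinct column set: candidate[i][j] stays True
    # only if column i could still be included in column j, i.e. whenever
    # column i contains the shared values, column j contains them too.
    cols = set(columns_with_value)
    return [[(i not in cols) or (j in cols) for j in range(number_of_columns)]
            for i in range(number_of_columns)]

def matrix_and(a, b):
    # Element-wise AND of two candidate matrices.
    return [[x and y for x, y in zip(row_a, row_b)] for row_a, row_b in zip(a, b)]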
Code example #27
def rdd_basic():

    conf = SparkConf().setAppName("appName").setMaster("local")
    sc = SparkContext(conf=conf)

    # Create a RDD for the above array
    rdd = sc.parallelize([
        "10", "21", "90", "34", "40", "98", "21", "44", "59", "21", "90", "34",
        "29", "19", "21", "34", "29", "49", "78"
    ])

    # Display the array
    print("The RDD elements are:", rdd.collect())

    # Display the first element of the array
    print("The first element of the RDD:", rdd.first())

    # Display the sorted output (ascending and descending) through an RDD
    sort_asc_rdd = rdd.sortBy(lambda x: x)
    sort_desc_rdd = rdd.sortBy(lambda x: x, ascending=False)
    print("The ascending order of the RDD:", sort_asc_rdd.collect())
    print("The descending order of the RDD:", sort_desc_rdd.collect())

    # Display the distinct elements of the array using an RDD
    distinct_rdd = rdd.distinct()
    print("The distinct RDD list:", distinct_rdd.collect())

    # Display distinct elements without keeping the new RDD in a variable.
    print("The distinct elements are:", rdd.distinct().collect())

    # Display maximum and minimum of given array using RDD.
    print("The maximum value of the RDD element is:", rdd.max())
    print("The minimum value of the RDD element is:", rdd.min())

    # Display top 5 list elements using RDD
    print("The top 5 element of the RDD are :", rdd.top(5))

    # Combine above array with a new array { 30,35,45,60,75,85} and display output.
    new_rdd = sc.parallelize(["30", "35", "45", "60", "75", "85"])
    # print(newRdd.collect())
    print("After join the new elements:", rdd.union(new_rdd).collect())
Code example #28
def initialize():
    global items, inputfile, sc, filterThreshold, t, totalEdges, cost_dict, strict_totalNodes, adjacency_listMain
    t = time.time()
    sc_conf = SparkConf()
    sc_conf.setAppName("Task1")
    sc_conf.setMaster('local[*]')
    # sc_conf.set("spark.driver.bindAddress", "127.0.0.1")
    sc = SparkContext(conf=sc_conf)
    sc.setLogLevel("ERROR")
    csvread = sc.textFile(inputfile)
    columnName = csvread.first().split(',')
    # print(columnName)
    items = csvread.map(lambda line: line.split(",")).filter(
        lambda line: (line) != columnName)

    # Getting user and their business count
    user_business = items.groupByKey().mapValues(set).collect()
    tuple_edge_list = []

    for i in range(0, len(user_business) - 1):
        for j in range(i + 1, len(user_business)):
            inter = user_business[i][1] & user_business[j][1]
            if len(inter) >= filterThreshold:
                tuple_edge_list.append(
                    (str(user_business[i][0]), str(user_business[j][0])))
                tuple_edge_list.append(
                    (str(user_business[j][0]), str(user_business[i][0])))

    totalEdges = float(len(tuple_edge_list) / 2)
    adjacency_list = sc.parallelize(tuple_edge_list).groupByKey().mapValues(
        list).collectAsMap()
    adjacency_listMain = copy.deepcopy(adjacency_list)
    totalNodes = list(adjacency_list.keys())

    # ------------------------Newly added line------------------------
    strict_totalNodes = copy.deepcopy(totalNodes)
    # print(len(totalNodes))

    # ----------------------Part 1---------------------
    bfs(totalNodes, adjacency_list)
    print("Writing Betweenness to File....")

    # Converting into sorted List Initial Betweenness
    list_val = list(cost_dict.items())

    list_val.sort(key=lambda x: (-x[1], x[0]))
    writeToFile(list_val)
    totalNodes = copy.deepcopy(strict_totalNodes)
    # print(len(totalNodes))
    # ----------------------Part 2----------------------
    print("Creating Partitions....")
    create_components(list_val, adjacency_listMain, totalNodes, totalEdges)
    # ---------------------EoC---------------------------

    print("Duration: " + str(time.time() - t))
Code example #29
File: main.py Project: kaist-dmlab/TensorSparkML
    def insert(self, in_local_path, out_hdfs_path, num_partitions=10):
        # FOR TESTING
        input = np.random.rand(101, 3)
        input[0] = np.array([100, 200, 300])

        #input = np.loadtxt(in_local_path, delimiter=',', dtype=str)
        header = input[0]
        data = input[1:]

        sc = SparkContext(conf=SparkConf().setAppName("data_setup"))

        # Split Data into Partitions
        #header_rdd = sc.parallelize(np.array([header for i in range(0,num_partitions)]), num_partitions).cache()
        header_rdd = sc.parallelize(header, 1).cache()
        data_rdd = sc.parallelize(data, num_partitions).cache()

        # Save Data Partitions
        header_rdd.map(self.to_csv).saveAsTextFile(out_hdfs_path +
                                                   "/csv/header")
        data_rdd.map(self.to_csv).saveAsTextFile(out_hdfs_path + "/csv/data")
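
The insert() method assumes a to_csv helper on the same class, which is not shown. A minimal sketch of what it might look like (assumed, not from the source), rendering one record as a comma-separated line for saveAsTextFile:

    def to_csv(self, row):
        # Hypothetical helper: `row` is either a scalar header value or a
        # 1-D data row; join its fields into one comma-separated line.
        import numpy as np
        return ','.join(str(v) for v in np.atleast_1d(row))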
Code example #30
File: dataframe.py Project: wyx0578/spark
def _test():
    import doctest
    from pyspark.context import SparkContext
    from pyspark.sql import Row, SQLContext
    import pyspark.sql.dataframe
    globs = pyspark.sql.dataframe.__dict__.copy()
    sc = SparkContext('local[4]', 'PythonTest')
    globs['sc'] = sc
    globs['sqlCtx'] = sqlCtx = SQLContext(sc)
    rdd2 = sc.parallelize([Row(name='Alice', age=2), Row(name='Bob', age=5)])
    rdd3 = sc.parallelize(
        [Row(name='Tom', height=80),
         Row(name='Bob', height=85)])
    globs['df'] = sqlCtx.inferSchema(rdd2)
    globs['df2'] = sqlCtx.inferSchema(rdd3)
    (failure_count, test_count) = doctest.testmod(pyspark.sql.dataframe,
                                                  globs=globs,
                                                  optionflags=doctest.ELLIPSIS)
    globs['sc'].stop()
    if failure_count:
        exit(-1)
Code example #31
class PySparkTestCase(unittest.TestCase):
    def setUp(self):
        class_name = self.__class__.__name__
        self.sc = SparkContext('local', class_name)

    def tearDown(self):
        self.sc.stop()

    def test_should_be_able_to_word_count(self):
        rdd = self.sc.parallelize(["This is a text", "Another text", "More text", "a text"])
        result = python_word_count.wordcount(rdd)
        expected = [('a', 2), ('This', 1), ('text', 4), ('is', 1), ('Another', 1), ('More', 1)]
        self.assertEqual(expected, result.collect())
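
The python_word_count module under test is not included here. A minimal wordcount that would produce the counts asserted above might look like this (note that collect() does not guarantee tuple order, so the exact expected list is fragile):

# python_word_count.py (hypothetical)
def wordcount(rdd):
    # Split each line into words and count occurrences per word.
    return (rdd.flatMap(lambda line: line.split())
               .map(lambda word: (word, 1))
               .reduceByKey(lambda a, b: a + b))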
Code example #32
File: context.py Project: zhengruifeng/spark
def _test() -> None:
    import os
    import sys
    import doctest
    import tempfile
    from pyspark.context import SparkContext
    from pyspark.sql import Row, SQLContext
    import pyspark.sql.context

    os.chdir(os.environ["SPARK_HOME"])

    globs = pyspark.sql.context.__dict__.copy()
    sc = SparkContext("local[4]", "PythonTest")
    globs["tempfile"] = tempfile
    globs["os"] = os
    globs["sc"] = sc
    globs["sqlContext"] = SQLContext(sc)
    globs["rdd"] = rdd = sc.parallelize([
        Row(field1=1, field2="row1"),
        Row(field1=2, field2="row2"),
        Row(field1=3, field2="row3"),
    ])
    globs["df"] = rdd.toDF()
    jsonStrings = [
        '{"field1": 1, "field2": "row1", "field3":{"field4":11}}',
        '{"field1" : 2, "field3":{"field4":22, "field5": [10, 11]},"field6":[{"field7": "row2"}]}',
        '{"field1" : null, "field2": "row3", "field3":{"field4":33, "field5": []}}',
    ]
    globs["jsonStrings"] = jsonStrings
    globs["json"] = sc.parallelize(jsonStrings)
    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.context,
        globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE,
    )
    globs["sc"].stop()
    if failure_count:
        sys.exit(-1)
Code example #33
File: sql.py Project: EronWright/spark
def _test():
    import doctest
    from pyspark.context import SparkContext
    globs = globals().copy()
    # The small batch size here ensures that we see multiple batches,
    # even in these small test examples:
    sc = SparkContext('local[4]', 'PythonTest', batchSize=2)
    globs['sc'] = sc
    globs['sqlCtx'] = SQLContext(sc)
    globs['rdd'] = sc.parallelize([{"field1" : 1, "field2" : "row1"},
        {"field1" : 2, "field2": "row2"}, {"field1" : 3, "field2": "row3"}])
    (failure_count, test_count) = doctest.testmod(globs=globs,optionflags=doctest.ELLIPSIS)
    globs['sc'].stop()
    if failure_count:
        exit(-1)
Code example #34
class PyVertexRDDTestCase(unittest.TestCase):
    """
    Test collect, take, count, mapValues, diff,
    filter, mapVertexPartitions, innerJoin and leftJoin
    for VertexRDD
    """

    def setUp(self):
        class_name = self.__class__.__name__
        conf = SparkConf().set("spark.default.parallelism", 1)
        self.sc = SparkContext(appName=class_name, conf=conf)
        self.sc.setCheckpointDir("/tmp")

    def tearDown(self):
        self.sc.stop()

    def collect(self):
        vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertices = VertexRDD(vertexData)
        results = vertices.take(1)
        self.assertEqual(results, [(3, ("rxin", "student"))])

    def take(self):
        vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertices = VertexRDD(vertexData)
        results = vertices.collect()
        self.assertEqual(results, [(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])

    def count(self):
        vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertices = VertexRDD(vertexData)
        results = vertices.count()
        self.assertEqual(results, 2)

    def mapValues(self):
        vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertices = VertexRDD(vertexData)
        results = vertices.mapValues(lambda x: x + ":" + x)
        self.assertEqual(results, [(3, ("rxin:rxin", "student:student")),
                                   (7, ("jgonzal:jgonzal", "postdoc:postdoc"))])

    def innerJoin(self):
        vertexData0 = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertexData1 = self.sc.parallelize([(1, ("rxin", "student")), (2, ("jgonzal", "postdoc"))])
        vertices0 = VertexRDD(vertexData0)
        vertices1 = VertexRDD(vertexData1)
        results = vertices0.innerJoin(vertices1).collect()
        self.assertEqual(results, [])

    def leftJoin(self):
        vertexData0 = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertexData1 = self.sc.parallelize([(1, ("rxin", "student")), (2, ("jgonzal", "postdoc"))])
        vertices0 = VertexRDD(vertexData0)
        vertices1 = VertexRDD(vertexData1)
        results = vertices0.diff(vertices1)
        self.assertEqual(results, 2)
Code example #35
def run_sim(data, function, n):
    sc = SparkContext()

    input_data = sc.parallelize([data() for _ in range(n)])
    transformed_data = input_data.map(lambda x: function(x))
    kvp_data = transformed_data.map(lambda x: (x, 1))
    
    counts = kvp_data.reduceByKey(lambda a, b: a+b)
    counts = counts.collectAsMap()
    
    num_true = counts.get(True, 0)
    num_false = counts.get(False, 0)
    
    return num_true/n
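
A hedged usage example for run_sim: a Monte Carlo estimate where `data` draws one sample and `function` maps it to True or False (the driver below is illustrative):

import random

if __name__ == "__main__":
    # Estimate P(U < 0.5) for U ~ Uniform(0, 1); the result should be near 0.5.
    p = run_sim(lambda: random.random(), lambda x: x < 0.5, 10000)
    print("estimated probability:", p)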
Code example #36
File: column.py Project: chenc10/Spark-PAF-INFOCOM18
def _test():
    import doctest
    from pyspark.context import SparkContext
    from pyspark.sql import SQLContext
    import pyspark.sql.column
    globs = pyspark.sql.column.__dict__.copy()
    sc = SparkContext('local[4]', 'PythonTest')
    globs['sc'] = sc
    globs['sqlContext'] = SQLContext(sc)
    globs['df'] = sc.parallelize([(2, 'Alice'), (5, 'Bob')]) \
        .toDF(StructType([StructField('age', IntegerType()),
                          StructField('name', StringType())]))

    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.column, globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF)
    globs['sc'].stop()
    if failure_count:
        exit(-1)
Code example #37
File: column.py Project: 15652101501/spark
def _test():
    import doctest
    from pyspark.context import SparkContext
    from pyspark.sql import SQLContext
    import pyspark.sql.column
    globs = pyspark.sql.column.__dict__.copy()
    sc = SparkContext('local[4]', 'PythonTest')
    globs['sc'] = sc
    globs['sqlContext'] = SQLContext(sc)
    globs['df'] = sc.parallelize([(2, 'Alice'), (5, 'Bob')]) \
        .toDF(StructType([StructField('age', IntegerType()),
                          StructField('name', StringType())]))

    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.column, globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF)
    globs['sc'].stop()
    if failure_count:
        exit(-1)
Code example #38
File: column.py Project: Julian/spark
def _test():
    import doctest
    from pyspark.context import SparkContext
    from pyspark.sql import SQLContext
    import pyspark.sql.column

    globs = pyspark.sql.column.__dict__.copy()
    sc = SparkContext("local[4]", "PythonTest")
    globs["sc"] = sc
    globs["sqlContext"] = SQLContext(sc)
    globs["df"] = sc.parallelize([(2, "Alice"), (5, "Bob")]).toDF(
        StructType([StructField("age", IntegerType()), StructField("name", StringType())])
    )

    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.column,
        globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF,
    )
    globs["sc"].stop()
    if failure_count:
        exit(-1)
Code example #39
File: session.py Project: ChineseDr/spark
def _test():
    import os
    import doctest
    from pyspark.context import SparkContext
    from pyspark.sql import Row
    import pyspark.sql.session

    os.chdir(os.environ["SPARK_HOME"])

    globs = pyspark.sql.session.__dict__.copy()
    sc = SparkContext("local[4]", "PythonTest")
    globs["sc"] = sc
    globs["spark"] = SparkSession(sc)
    globs["rdd"] = rdd = sc.parallelize(
        [Row(field1=1, field2="row1"), Row(field1=2, field2="row2"), Row(field1=3, field2="row3")]
    )
    globs["df"] = rdd.toDF()
    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.session, globs=globs, optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE
    )
    globs["sc"].stop()
    if failure_count:
        exit(-1)
Code example #40
def _test():
    import doctest
    from pyspark.context import SparkContext
    globs = globals().copy()
    # The small batch size here ensures that we see multiple batches,
    # even in these small test examples:
    sc = SparkContext('local[4]', 'PythonTest', batchSize=2)
    globs['sc'] = sc
    globs['sqlCtx'] = SQLContext(sc)
    globs['rdd'] = sc.parallelize([{
        "field1": 1,
        "field2": "row1"
    }, {
        "field1": 2,
        "field2": "row2"
    }, {
        "field1": 3,
        "field2": "row3"
    }])
    (failure_count, test_count) = doctest.testmod(globs=globs,
                                                  optionflags=doctest.ELLIPSIS)
    globs['sc'].stop()
    if failure_count:
        exit(-1)
Code example #41
    def _create_model(self, java_model):
        return Word2VecModel(java_model)


class Word2VecModel(JavaModel):
    """
    Model fitted by Word2Vec.
    """


if __name__ == "__main__":
    import doctest
    from pyspark.context import SparkContext
    from pyspark.sql import Row, SQLContext
    globs = globals().copy()
    # The small batch size here ensures that we see multiple batches,
    # even in these small test examples:
    sc = SparkContext("local[2]", "ml.feature tests")
    sqlContext = SQLContext(sc)
    globs['sc'] = sc
    globs['sqlContext'] = sqlContext
    testData = sc.parallelize([Row(id=0, label="a"), Row(id=1, label="b"),
                               Row(id=2, label="c"), Row(id=3, label="a"),
                               Row(id=4, label="a"), Row(id=5, label="c")], 2)
    globs['stringIndDf'] = sqlContext.createDataFrame(testData)
    (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
    sc.stop()
    if failure_count:
        exit(-1)
Code example #42
File: tests.py Project: giworld/spark
class TestStreamingContextSuite(unittest.TestCase):
    """
    Should we have conf property in  SparkContext?
    @property
    def conf(self):
        return self._conf

    """
    def setUp(self):
        self.master = "local[2]"
        self.appName = self.__class__.__name__
        self.batachDuration = Milliseconds(500)
        self.sparkHome = "SomeDir"
        self.envPair = {"key": "value"}
        self.ssc = None
        self.sc = None

    def tearDown(self):
        # Do not call pyspark.streaming.context.StreamingContext.stop directly because
        # we do not wait to shutdown py4j client.
        # We need to change this to simply call streamingContext.stop()
        #self.ssc._jssc.stop()
        if self.ssc is not None:
            self.ssc.stop()
        if self.sc is not None:
            self.sc.stop()
        # Why does it take a long time to terminate StreamingContext and SparkContext?
        # Should we change the sleep time if this depends on machine spec?
        time.sleep(1)

    @classmethod
    def tearDownClass(cls):
        # Make sure to shut down the callback server
        SparkContext._gateway._shutdown_callback_server()

    def test_from_no_conf_constructor(self):
        self.ssc = StreamingContext(master=self.master, appName=self.appName,
                               duration=self.batachDuration)
        # Alternative call master: ssc.sparkContext.master
        # I try to make code close to Scala.
        self.assertEqual(self.ssc.sparkContext._conf.get("spark.master"), self.master)
        self.assertEqual(self.ssc.sparkContext._conf.get("spark.app.name"), self.appName)

    def test_from_no_conf_plus_spark_home(self):
        self.ssc = StreamingContext(master=self.master, appName=self.appName, 
                               sparkHome=self.sparkHome, duration=self.batachDuration)
        self.assertEqual(self.ssc.sparkContext._conf.get("spark.home"), self.sparkHome)

    def test_from_no_conf_plus_spark_home_plus_env(self):
        self.ssc = StreamingContext(master=self.master, appName=self.appName, 
                               sparkHome=self.sparkHome, environment=self.envPair,
                               duration=self.batachDuration)
        self.assertEqual(self.ssc.sparkContext._conf.get("spark.executorEnv.key"), self.envPair["key"])

    def test_from_existing_spark_context(self):
        self.sc = SparkContext(master=self.master, appName=self.appName)
        self.ssc = StreamingContext(sparkContext=self.sc, duration=self.batachDuration)

    def test_existing_spark_context_with_settings(self):
        conf = SparkConf()
        conf.set("spark.cleaner.ttl", "10")
        self.sc = SparkContext(master=self.master, appName=self.appName, conf=conf)
        self.ssc = StreamingContext(sparkContext=self.sc, duration=self.batachDuration)
        self.assertEqual(int(self.ssc.sparkContext._conf.get("spark.cleaner.ttl")), 10)

    def test_from_conf_with_settings(self):
        conf = SparkConf()
        conf.set("spark.cleaner.ttl", "10")
        conf.setMaster(self.master)
        conf.setAppName(self.appName)
        self.ssc = StreamingContext(conf=conf, duration=self.batachDuration)
        self.assertEqual(int(self.ssc.sparkContext._conf.get("spark.cleaner.ttl")), 10)

    def test_stop_only_streaming_context(self):
        self.sc = SparkContext(master=self.master, appName=self.appName)
        self.ssc = StreamingContext(sparkContext=self.sc, duration=self.batachDuration)
        self._addInputStream(self.ssc)
        self.ssc.start()
        self.ssc.stop(False)
        self.assertEqual(len(self.sc.parallelize(range(5), 5).glom().collect()), 5)

    def test_stop_multiple_times(self):
        self.ssc = StreamingContext(master=self.master, appName=self.appName,
                               duration=self.batachDuration)
        self._addInputStream(self.ssc)
        self.ssc.start()
        self.ssc.stop()
        self.ssc.stop()

    def _addInputStream(self, s):
        # Make sure each length of input is over 3 and 
        # numSlice is 2 due to deserializer problem in pyspark.streaming
        test_inputs = map(lambda x: range(1, x), range(5, 101))
        test_stream = s._testInputStream(test_inputs, 2)
        # Register fake output operation
        result = list()
        test_stream._test_output(result)
Code example #43
File: tests.py Project: 31z4/spark
class PySparkStreamingTestCase(unittest.TestCase):

    timeout = 20  # seconds
    duration = 1

    def setUp(self):
        class_name = self.__class__.__name__
        conf = SparkConf().set("spark.default.parallelism", 1)
        self.sc = SparkContext(appName=class_name, conf=conf)
        self.sc.setCheckpointDir("/tmp")
        # TODO: decrease duration to speed up tests
        self.ssc = StreamingContext(self.sc, self.duration)

    def tearDown(self):
        self.ssc.stop()

    def wait_for(self, result, n):
        start_time = time.time()
        while len(result) < n and time.time() - start_time < self.timeout:
            time.sleep(0.01)
        if len(result) < n:
            print("timeout after", self.timeout)

    def _take(self, dstream, n):
        """
        Return the first `n` elements in the stream (will start and stop).
        """
        results = []

        def take(_, rdd):
            if rdd and len(results) < n:
                results.extend(rdd.take(n - len(results)))

        dstream.foreachRDD(take)

        self.ssc.start()
        self.wait_for(results, n)
        return results

    def _collect(self, dstream, n, block=True):
        """
        Collect each RDDs into the returned list.

        :return: list, which will have the collected items.
        """
        result = []

        def get_output(_, rdd):
            if rdd and len(result) < n:
                r = rdd.collect()
                if r:
                    result.append(r)

        dstream.foreachRDD(get_output)

        if not block:
            return result

        self.ssc.start()
        self.wait_for(result, n)
        return result

    def _test_func(self, input, func, expected, sort=False, input2=None):
        """
        @param input: dataset for the test. This should be list of lists.
        @param func: wrapped function. This function should return PythonDStream object.
        @param expected: expected output for this testcase.
        """
        if not isinstance(input[0], RDD):
            input = [self.sc.parallelize(d, 1) for d in input]
        input_stream = self.ssc.queueStream(input)
        if input2 and not isinstance(input2[0], RDD):
            input2 = [self.sc.parallelize(d, 1) for d in input2]
        input_stream2 = self.ssc.queueStream(input2) if input2 is not None else None

        # Apply test function to stream.
        if input2:
            stream = func(input_stream, input_stream2)
        else:
            stream = func(input_stream)

        result = self._collect(stream, len(expected))
        if sort:
            self._sort_result_based_on_key(result)
            self._sort_result_based_on_key(expected)
        self.assertEqual(expected, result)

    def _sort_result_based_on_key(self, outputs):
        """Sort the list based on first value."""
        for output in outputs:
            output.sort(key=lambda x: x[0])
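
A hedged example of a concrete test built on this base class, in the style of Spark's streaming suites (the class and data below are illustrative, not from the source):

class BasicOperationTests(PySparkStreamingTestCase):

    def test_map(self):
        # Each inner list becomes one batch; _test_func collects the stream's
        # per-batch output and compares it with `expected`.
        input = [[1, 2, 3, 4], [5, 6, 7, 8]]
        expected = [[2, 4, 6, 8], [10, 12, 14, 16]]
        self._test_func(input, lambda dstream: dstream.map(lambda x: x * 2), expected)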
Code example #44
File: context.py Project: giworld/spark
class StreamingContext(object):
    """
    Main entry point for Spark Streaming functionality. A StreamingContext represents the
    connection to a Spark cluster, and can be used to create L{DStream}s and
    broadcast variables on that cluster.
    """

    def __init__(self, master=None, appName=None, sparkHome=None, pyFiles=None,
                 environment=None, batchSize=1024, serializer=PickleSerializer(), conf=None,
                 gateway=None, sparkContext=None, duration=None):
        """
        Create a new StreamingContext. At least the master, the app name, and the duration
        should be set, either through the named parameters here or through C{conf}.

        @param master: Cluster URL to connect to
               (e.g. mesos://host:port, spark://host:port, local[4]).
        @param appName: A name for your job, to display on the cluster web UI.
        @param sparkHome: Location where Spark is installed on cluster nodes.
        @param pyFiles: Collection of .zip or .py files to send to the cluster
               and add to PYTHONPATH.  These can be paths on the local file
               system or HDFS, HTTP, HTTPS, or FTP URLs.
        @param environment: A dictionary of environment variables to set on
               worker nodes.
        @param batchSize: The number of Python objects represented as a single
               Java object.  Set 1 to disable batching or -1 to use an
               unlimited batch size.
        @param serializer: The serializer for RDDs.
        @param conf: A L{SparkConf} object setting Spark properties.
        @param gateway: Use an existing gateway and JVM, otherwise a new JVM
               will be instantiated.
        @param sparkContext: L{SparkContext} object.
        @param duration: A L{Duration} object for SparkStreaming.

        """

        if not isinstance(duration, Duration):
            raise TypeError("Input should be pyspark.streaming.duration.Duration object")

        if sparkContext is None:
            # Create the Python SparkContext
            self._sc = SparkContext(master=master, appName=appName, sparkHome=sparkHome,
                                    pyFiles=pyFiles, environment=environment, batchSize=batchSize,
                                    serializer=serializer, conf=conf, gateway=gateway)
        else:
            self._sc = sparkContext

        # Start the py4j callback server.
        # The callback server is needed only by Spark Streaming; therefore it
        # is started in StreamingContext.
        SparkContext._gateway.restart_callback_server()
        self._set_clean_up_handler()
        self._jvm = self._sc._jvm
        self._jssc = self._initialize_context(self._sc._jsc, duration._jduration)

    # Initialize the JavaStreamingContext in a separate method to allow subclass-specific initialization
    def _initialize_context(self, jspark_context, jduration):
        return self._jvm.JavaStreamingContext(jspark_context, jduration)

    def _set_clean_up_handler(self):
        """ set clean up hander using atexit """

        def clean_up_handler():
            SparkContext._gateway.shutdown()

        atexit.register(clean_up_handler)
        # atexit handlers are not called when the program is killed by a signal
        # that is not handled by Python.
        for sig in (SIGINT, SIGTERM):
            signal(sig, clean_up_handler)

    @property
    def sparkContext(self):
        """
        Return SparkContext which is associated with this StreamingContext.
        """
        return self._sc

    def start(self):
        """
        Start the execution of the streams.
        """
        self._jssc.start()

    def awaitTermination(self, timeout=None):
        """
        Wait for the execution to stop.
        @param timeout: time to wait in milliseconds
        """
        if timeout is None:
            self._jssc.awaitTermination()
        else:
            self._jssc.awaitTermination(timeout)

    def remember(self, duration):
        """
        Set each DStream in this context to remember the RDDs it generated in the last given
        duration. DStreams remember RDDs only for a limited duration of time and release them
        for garbage collection. This method allows the developer to specify how long to remember
        the RDDs (if the developer wishes to query old data outside the DStream computation).
        @param duration: pyspark.streaming.duration.Duration object.
               Minimum duration that each DStream should remember its RDDs.
        """
        if not isinstance(duration, Duration):
            raise TypeError("Input should be pyspark.streaming.duration.Duration object")

        self._jssc.remember(duration._jduration)

    # TODO: add storageLevel
    def socketTextStream(self, hostname, port):
        """
        Create an input stream from a TCP source hostname:port. Data is received using
        a TCP socket and the received bytes are interpreted as UTF-8 encoded, '\n' delimited
        lines.
        """
        return DStream(self._jssc.socketTextStream(hostname, port), self, UTF8Deserializer())

    def textFileStream(self, directory):
        """
        Create an input stream that monitors a Hadoop-compatible file system
        for new files and reads them as text files. Files must be written to the
        monitored directory by "moving" them from another location within the same
        file system. File names starting with . are ignored.
        """
        return DStream(self._jssc.textFileStream(directory), self, UTF8Deserializer())

    def stop(self, stopSparkContext=True, stopGraceFully=False):
        """
        Stop the execution of the streams immediately (does not wait for all received data
        to be processed).
        """
        self._jssc.stop(stopSparkContext, stopGraceFully)
        if stopSparkContext:
            self._sc.stop()

        # Shut down only the callback server here; the py4j gateway itself is
        # shut down by the clean-up handler registered above.
        SparkContext._gateway._shutdown_callback_server()
        
    def _testInputStream(self, test_inputs, numSlices=None):
        """
        This function is only for unittest.
        It requires a list as input, and returns the i_th element at the i_th batch
        under manual clock.
        """
        test_rdds = list()
        test_rdd_deserializers = list()
        for test_input in test_inputs:
            test_rdd = self._sc.parallelize(test_input, numSlices)
            test_rdds.append(test_rdd._jrdd)
            test_rdd_deserializers.append(test_rdd._jrdd_deserializer)
        # All deserializers have to be the same.
        # TODO: add deserializer validation
        jtest_rdds = ListConverter().convert(test_rdds, SparkContext._gateway._gateway_client)
        jinput_stream = self._jvm.PythonTestInputStream(self._jssc, jtest_rdds).asJavaDStream()

        return DStream(jinput_stream, self, test_rdd_deserializers[0])
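
A minimal, hypothetical usage sketch of this experimental API follows. The module paths, the millisecond unit of Duration's constructor, and the exact set of DStream operations (the sketch uses the transformations and the (time, rdd) foreachRDD callback exercised by the tests earlier in this document) are assumptions rather than facts taken from the listing above.

from pyspark.streaming.context import StreamingContext   # assumed module path
from pyspark.streaming.duration import Duration          # constructor argument assumed to be milliseconds

ssc = StreamingContext(master="local[2]", appName="NetworkWordCount",
                       duration=Duration(1000))          # roughly 1-second batches (assumed)
lines = ssc.socketTextStream("localhost", 9999)
counts = (lines.flatMap(lambda line: line.split(" "))
               .map(lambda word: (word, 1))
               .reduceByKey(lambda a, b: a + b))

def dump(time, rdd):
    # Print each batch; Python 2 print statements are used to match the rest of this document.
    print rdd.collect()

counts.foreachRDD(dump)
ssc.start()
ssc.awaitTermination()
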
Code example #45
class StreamingContext(object):
    """
    Main entry point for Spark Streaming functionality. A StreamingContext represents the
    connection to a Spark cluster and can be used to create L{DStream}s from various
    input sources.
    """
    def __init__(self,
                 master=None,
                 appName=None,
                 sparkHome=None,
                 pyFiles=None,
                 environment=None,
                 batchSize=1024,
                 serializer=PickleSerializer(),
                 conf=None,
                 gateway=None,
                 sparkContext=None,
                 duration=None):
        """
        Create a new StreamingContext. At least the master, app name, and duration
        must be set, either through the named parameters here or through C{conf}.

        @param master: Cluster URL to connect to
               (e.g. mesos://host:port, spark://host:port, local[4]).
        @param appName: A name for your job, to display on the cluster web UI.
        @param sparkHome: Location where Spark is installed on cluster nodes.
        @param pyFiles: Collection of .zip or .py files to send to the cluster
               and add to PYTHONPATH.  These can be paths on the local file
               system or HDFS, HTTP, HTTPS, or FTP URLs.
        @param environment: A dictionary of environment variables to set on
               worker nodes.
        @param batchSize: The number of Python objects represented as a single
               Java object.  Set 1 to disable batching or -1 to use an
               unlimited batch size.
        @param serializer: The serializer for RDDs.
        @param conf: A L{SparkConf} object setting Spark properties.
        @param gateway: Use an existing gateway and JVM, otherwise a new JVM
               will be instantiated.
        @param sparkContext: L{SparkContext} object.
        @param duration: A L{Duration} object for SparkStreaming.

        """

        if not isinstance(duration, Duration):
            raise TypeError(
                "Input should be pyspark.streaming.duration.Duration object")

        if sparkContext is None:
            # Create the Python SparkContext
            self._sc = SparkContext(master=master,
                                    appName=appName,
                                    sparkHome=sparkHome,
                                    pyFiles=pyFiles,
                                    environment=environment,
                                    batchSize=batchSize,
                                    serializer=serializer,
                                    conf=conf,
                                    gateway=gateway)
        else:
            self._sc = sparkContext

        # Start the py4j callback server.
        # The callback server is needed only by Spark Streaming; therefore it
        # is started in StreamingContext.
        SparkContext._gateway.restart_callback_server()
        self._set_clean_up_handler()
        self._jvm = self._sc._jvm
        self._jssc = self._initialize_context(self._sc._jsc,
                                              duration._jduration)

    # Initialize the JavaStreamingContext in a separate method to allow subclass-specific initialization
    def _initialize_context(self, jspark_context, jduration):
        return self._jvm.JavaStreamingContext(jspark_context, jduration)

    def _set_clean_up_handler(self):
        """ set clean up hander using atexit """
        def clean_up_handler():
            SparkContext._gateway.shutdown()

        atexit.register(clean_up_handler)
        # atexit handlers are not called when the program is killed by a signal
        # that is not handled by Python.
        for sig in (SIGINT, SIGTERM):
            signal(sig, clean_up_handler)

    @property
    def sparkContext(self):
        """
        Return SparkContext which is associated with this StreamingContext.
        """
        return self._sc

    def start(self):
        """
        Start the execution of the streams.
        """
        self._jssc.start()

    def awaitTermination(self, timeout=None):
        """
        Wait for the execution to stop.
        @param timeout: time to wait in milliseconds
        """
        if timeout is None:
            self._jssc.awaitTermination()
        else:
            self._jssc.awaitTermination(timeout)

    def remember(self, duration):
        """
        Set each DStream in this context to remember the RDDs it generated in the last given
        duration. DStreams remember RDDs only for a limited duration of time and release them
        for garbage collection. This method allows the developer to specify how long to remember
        the RDDs (if the developer wishes to query old data outside the DStream computation).
        @param duration: pyspark.streaming.duration.Duration object.
               Minimum duration that each DStream should remember its RDDs.
        """
        if not isinstance(duration, Duration):
            raise TypeError(
                "Input should be pyspark.streaming.duration.Duration object")

        self._jssc.remember(duration._jduration)

    # TODO: add storageLevel
    def socketTextStream(self, hostname, port):
        """
        Create an input stream from a TCP source hostname:port. Data is received using
        a TCP socket and the received bytes are interpreted as UTF-8 encoded, '\n' delimited
        lines.
        """
        return DStream(self._jssc.socketTextStream(hostname, port), self,
                       UTF8Deserializer())

    def textFileStream(self, directory):
        """
        Create an input stream that monitors a Hadoop-compatible file system
        for new files and reads them as text files. Files must be written to the
        monitored directory by "moving" them from another location within the same
        file system. File names starting with . are ignored.
        """
        return DStream(self._jssc.textFileStream(directory), self,
                       UTF8Deserializer())

    def stop(self, stopSparkContext=True, stopGraceFully=False):
        """
        Stop the execution of the streams immediately (does not wait for all received data
        to be processed).
        """
        self._jssc.stop(stopSparkContext, stopGraceFully)
        if stopSparkContext:
            self._sc.stop()

        # Shut down only the callback server here; the py4j gateway itself is
        # shut down by the clean-up handler registered above.
        SparkContext._gateway._shutdown_callback_server()

    def _testInputStream(self, test_inputs, numSlices=None):
        """
        This function is only for unittest.
        It requires a list as input, and returns the i_th element at the i_th batch
        under manual clock.
        """
        test_rdds = list()
        test_rdd_deserializers = list()
        for test_input in test_inputs:
            test_rdd = self._sc.parallelize(test_input, numSlices)
            test_rdds.append(test_rdd._jrdd)
            test_rdd_deserializers.append(test_rdd._jrdd_deserializer)
        # All deserializers have to be the same.
        # TODO: add deserializer validation
        jtest_rdds = ListConverter().convert(
            test_rdds, SparkContext._gateway._gateway_client)
        jinput_stream = self._jvm.PythonTestInputStream(
            self._jssc, jtest_rdds).asJavaDStream()

        return DStream(jinput_stream, self, test_rdd_deserializers[0])
Code example #46
class SparkExecutor(Executor):
  def __init__(self):
    # Setup PySpark. This is needed until PySpark becomes available on PyPI,
    # after which we can simply add it to requirements.txt.
    _setup_pyspark()
    from pyspark.conf import SparkConf
    from pyspark.context import SparkContext
    from pyspark.serializers import MarshalSerializer

    # Create a temporary .zip lib file for Metis, which will be copied over to
    # Spark workers so they can unpickle Metis functions and objects.
    metis_lib_file = tempfile.NamedTemporaryFile(suffix='.zip', delete=False)
    metis_lib_file.close()
    _copy_lib_for_spark_workers(metis_lib_file.name)

    # Also ship the Metis lib file so worker nodes can deserialize Metis
    # internal data structures.
    conf = SparkConf()
    conf.setMaster(app.config['SPARK_MASTER'])
    conf.setAppName('chronology:metis')
    parallelism = int(app.config.get('SPARK_PARALLELISM', 0))
    if parallelism:
      conf.set('spark.default.parallelism', parallelism)
    self.context = SparkContext(conf=conf,
                                pyFiles=[metis_lib_file.name],
                                serializer=MarshalSerializer())

    # Delete temporary Metis lib file.
    os.unlink(metis_lib_file.name)

    # We'll use this to parallelize fetching events in KronosSource.
    # The default of 8 is from:
    # https://spark.apache.org/docs/latest/configuration.html
    self.parallelism = parallelism or 8

  def __getstate__(self):
    # Don't pickle the `SparkContext` object.
    state = self.__dict__.copy()
    del state['context']
    return state

  def finalize(self, rdd):
    return rdd.collect()

  def execute_aggregate(self, node):
    def finalize(event):
      # `event` is of the form (key, event).
      return node.finalize_func(event[1])

    return (self.execute(node.source)
            .map(node.group_func)
            .reduceByKey(node.reduce_func)
            .map(finalize))

  def execute_filter(self, node):
    return self.execute(node.source).filter(generate_filter(node.condition))

  def execute_join(self, node):
    left_alias = node.left.alias or 'left'
    right_alias = node.right.alias or 'right'

    def merge(events):
      event1, event2 = events
      if isinstance(event1, types.StringType):
        # Join case: events = (key, (event1, event2))
        event1, event2 = event2
        event = deepcopy(event1)
        event.update(event2)
      else:
        # Cartesian case: events = (event1, event2)
        event = {}
        for key, value in event1.iteritems():
          event['%s.%s' % (left_alias, key)] = value
        for key, value in event2.iteritems():
          event['%s.%s' % (right_alias, key)] = value
      return event

    def get_equijoin_key_values(condition):
      # condition must be a *leaf* condition.
      if getattr(condition, 'op', None) != Condition.Op.EQ:
        return None

      # Get properties being accessed by left and right side of the
      # conditional.
      left_properties = get_properties_accessed_by_value(condition.left)
      right_properties = get_properties_accessed_by_value(condition.right)

      if not (left_properties and right_properties):
        return None

      # Only return getters if the two sides of the conditional read from different
      # sources. You can't use this optimization if, say, the condition is
      # (left.x + right.y == 10).
      # XXX: This isn't kosher for non-deterministic functions.
      if (all(p.startswith('%s.' % left_alias) for p in left_properties) and
          all(p.startswith('%s.' % right_alias) for p in right_properties)):
        return {'left': condition.left, 'right': condition.right}

      if (all(p.startswith('%s.' % right_alias) for p in left_properties) and
          all(p.startswith('%s.' % left_alias) for p in right_properties)):
        return {'left': condition.right, 'right': condition.left}

      return None

    def map_equijoin(alias, key_values):
      def map(event):
        new_event = {}
        for key, value in event.iteritems():
          new_event['%s.%s' % (alias, key)] = value
        key = json.dumps([get_value(new_event, value) for value in key_values])
        return (key, new_event)
      return map

    def setup_join():
      eq_join_key_values = []

      # TODO(usmanm): Right now we only optimize if the conditional is an EQ or
      # if its an AND and has some EQ in the top level. We don't do any
      # recursive searching in condition trees. Improve that.
      condition = node.condition
      _type = getattr(condition, 'type', None)
      if _type == Condition.Type.AND:
        filter_conditions = []
        for c in condition.conditions:
          values = get_equijoin_key_values(c)
          if values:
            eq_join_key_values.append(values)
          else:
            filter_conditions.append(c)
        if filter_conditions:
          condition.conditions = filter_conditions
        else:
          condition = None
      elif _type != Condition.Type.OR:  # Ignore ORs for now.
        value = get_equijoin_key_values(condition)
        if value:
          eq_join_key_values.append(value)
          condition = None

      return eq_join_key_values, (generate_filter(condition)
                                  if condition else None)

    eq_join_key_values, filter_function = setup_join()

    if eq_join_key_values:
      mapped_left = (self.execute(node.left)
                     .map(map_equijoin(
                       left_alias,
                       [value['left'] for value in eq_join_key_values])))
      mapped_right = (self.execute(node.right)
                      .map(map_equijoin(
                        right_alias,
                        [value['right'] for value in eq_join_key_values])))
      joined = mapped_left.join(mapped_right).map(merge)
    else:
      # Naive O(n^2) cartesian product.
      joined = (self.execute(node.left).cartesian(self.execute(node.right))
                .map(merge))

    if filter_function:
      joined = joined.filter(filter_function)
    return joined

  def execute_limit(self, node):
    # TODO(usmanm): Is there a better way than to collect and redistribute all
    # events?
    return self.context.parallelize(self.execute(node.source).take(node.limit))

  def execute_order_by(self, node):
    return (self.execute(node.source)
            .keyBy(lambda e: tuple(get_value(e, field)
                                   for field in node.fields))
            .sortByKey(ascending=node.order == node.ResultOrder.ASCENDING)
            .map(lambda e: e[1]))

  def execute_project(self, node):
    def project(event):
      if node.merge:
        new_event = deepcopy(event)
      else:
        new_event = {}
      for field in node.fields:
        new_event[field.alias] = get_value(event, field)
      return new_event
    return self.execute(node.source).map(project)
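
The equi-join path above keys both sides of the join by the expressions in the equality condition so that RDD.join() (a shuffle on the key) can replace the O(n^2) cartesian product. A standalone sketch of that idea, assuming an existing SparkContext `sc` and using made-up event data:

left = sc.parallelize([{'user_id': 1, 'page': '/a'}, {'user_id': 2, 'page': '/b'}])
right = sc.parallelize([{'user_id': 1, 'name': 'alice'}, {'user_id': 3, 'name': 'carol'}])

# Key both sides by the value appearing in the equality condition
# (here: left.user_id == right.user_id), mirroring map_equijoin above.
keyed_left = left.map(lambda e: (e['user_id'], e))
keyed_right = right.map(lambda e: (e['user_id'], e))

def merge(kv):
    # kv is (key, (left_event, right_event)); combine the two dicts into one event.
    combined = dict(kv[1][0])
    combined.update(kv[1][1])
    return combined

# join() only pairs events whose keys match, instead of comparing every
# left event with every right event.
joined = keyed_left.join(keyed_right).map(merge)
print joined.collect()   # one merged event for user_id 1
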
Code example #47
File: IPM_SGD.py  Project: natviv/Spark_Page_Rank
def avg_model(sgd, slices):
    sgd.coef_ /= slices
    sgd.intercept_ /= slices
    return sgd
 
if __name__ == "__main__":
    if len(sys.argv) < 2:
        print >> sys.stderr, \
            "Usage: PythonLR <master> <iterations>"
        exit(-1)
    
 
    sc = SparkContext(sys.argv[1], "PythonLR")
    ITERATIONS = int(sys.argv[2]) if len(sys.argv) > 2 else ITERATIONS
    slices = int(sys.argv[3]) if len(sys.argv) == 4 else 2
    data = generate_data(N)
    print len(data)
 
    # initializing SGD
    sgd = lm.SGDClassifier(loss='log')
    for ii in range(ITERATIONS):
        sgd = sc.parallelize(data, numSlices=slices) \
                .mapPartitions(lambda x: train(x, sgd)) \
                .reduce(lambda x, y: merge(x, y))
        sgd = avg_model(sgd, slices) # averaging weight vector for IPM update
        print "Iteration %d:" % (ii + 1)
        print "Model: "
        print sgd.coef_
        print sgd.intercept_
        print ""
Code example #48
File: visualize-words.py  Project: agilemobiledev/w2v
#Feat = np.load('mllib-scripts/w2v_may1_may19_june1_june11.npy')
#words = np.load('mllib-scripts/word_may1_may19_june1_june11.npy')
#wordToModel = 'data'
#maxWordsVis = 10

print "\n================================================="
print "Size of the Word2Vec matrix is: ", Feat.shape 
print "Number of words in the models: ", words.shape
print "=================================================\n"

## Spark Context
sc = SparkContext('local','visualize-words') 

## Read the Word2Vec model
# For a large model, the matrix should be read from (and stored on) HDFS instead.
Feat = sc.parallelize(Feat)

# map feature matrix to spark vectors
from pyspark.mllib.linalg import Vectors
Feat = Feat.map(lambda vec: (Vectors.dense(vec),))

## Define a df with feature matrix
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
dfFeat = sqlContext.createDataFrame(Feat,["features"])
dfFeat.printSchema()

## PCA to project the feature matrix to a lower-dimensional space
from pyspark.ml.feature import PCA
numComponents = 3
pca = PCA(k=numComponents, inputCol="features", outputCol="pcaFeatures")
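
The snippet stops after constructing the PCA estimator. A likely continuation, shown here as a sketch rather than as part of the original file, fits the model and projects the feature vectors:

pcaModel = pca.fit(dfFeat)
dfComp = pcaModel.transform(dfFeat).select("pcaFeatures")
dfComp.show(truncate=False)
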
Code example #49
File: tests.py  Project: CodEnFisH/cogngin
class TestRDDFunctions(PySparkTestCase):

    def test_failed_sparkcontext_creation(self):
        # Regression test for SPARK-1550
        self.sc.stop()
        self.assertRaises(Exception, lambda: SparkContext("an-invalid-master-name"))
        self.sc = SparkContext("local")

    def test_save_as_textfile_with_unicode(self):
        # Regression test for SPARK-970
        x = u"\u00A1Hola, mundo!"
        data = self.sc.parallelize([x])
        tempFile = tempfile.NamedTemporaryFile(delete=True)
        tempFile.close()
        data.saveAsTextFile(tempFile.name)
        raw_contents = ''.join(input(glob(tempFile.name + "/part-0000*")))
        self.assertEqual(x, unicode(raw_contents.strip(), "utf-8"))

    def test_save_as_textfile_with_utf8(self):
        x = u"\u00A1Hola, mundo!"
        data = self.sc.parallelize([x.encode("utf-8")])
        tempFile = tempfile.NamedTemporaryFile(delete=True)
        tempFile.close()
        data.saveAsTextFile(tempFile.name)
        raw_contents = ''.join(input(glob(tempFile.name + "/part-0000*")))
        self.assertEqual(x, unicode(raw_contents.strip(), "utf-8"))

    def test_transforming_cartesian_result(self):
        # Regression test for SPARK-1034
        rdd1 = self.sc.parallelize([1, 2])
        rdd2 = self.sc.parallelize([3, 4])
        cart = rdd1.cartesian(rdd2)
        result = cart.map(lambda (x, y): x + y).collect()

    def test_transforming_pickle_file(self):
        # Regression test for SPARK-2601
        data = self.sc.parallelize(["Hello", "World!"])
        tempFile = tempfile.NamedTemporaryFile(delete=True)
        tempFile.close()
        data.saveAsPickleFile(tempFile.name)
        pickled_file = self.sc.pickleFile(tempFile.name)
        pickled_file.map(lambda x: x).collect()

    def test_cartesian_on_textfile(self):
        # Regression test for
        path = os.path.join(SPARK_HOME, "python/test_support/hello.txt")
        a = self.sc.textFile(path)
        result = a.cartesian(a).collect()
        (x, y) = result[0]
        self.assertEqual("Hello World!", x.strip())
        self.assertEqual("Hello World!", y.strip())

    def test_deleting_input_files(self):
        # Regression test for SPARK-1025
        tempFile = tempfile.NamedTemporaryFile(delete=False)
        tempFile.write("Hello World!")
        tempFile.close()
        data = self.sc.textFile(tempFile.name)
        filtered_data = data.filter(lambda x: True)
        self.assertEqual(1, filtered_data.count())
        os.unlink(tempFile.name)
        self.assertRaises(Exception, lambda: filtered_data.count())

    def testAggregateByKey(self):
        data = self.sc.parallelize([(1, 1), (1, 1), (3, 2), (5, 1), (5, 3)], 2)

        def seqOp(x, y):
            x.add(y)
            return x

        def combOp(x, y):
            x |= y
            return x

        sets = dict(data.aggregateByKey(set(), seqOp, combOp).collect())
        self.assertEqual(3, len(sets))
        self.assertEqual(set([1]), sets[1])
        self.assertEqual(set([2]), sets[3])
        self.assertEqual(set([1, 3]), sets[5])

    def test_itemgetter(self):
        rdd = self.sc.parallelize([range(10)])
        from operator import itemgetter
        self.assertEqual([1], rdd.map(itemgetter(1)).collect())
        self.assertEqual([(2, 3)], rdd.map(itemgetter(2, 3)).collect())

    def test_namedtuple_in_rdd(self):
        from collections import namedtuple
        Person = namedtuple("Person", "id firstName lastName")
        jon = Person(1, "Jon", "Doe")
        jane = Person(2, "Jane", "Doe")
        theDoes = self.sc.parallelize([jon, jane])
        self.assertEquals([jon, jane], theDoes.collect())

    def test_large_broadcast(self):
        N = 100000
        data = [[float(i) for i in range(300)] for i in range(N)]
        bdata = self.sc.broadcast(data)  # 270MB
        m = self.sc.parallelize(range(1), 1).map(lambda x: len(bdata.value)).sum()
        self.assertEquals(N, m)

    def test_zip_with_different_serializers(self):
        a = self.sc.parallelize(range(5))
        b = self.sc.parallelize(range(100, 105))
        self.assertEqual(a.zip(b).collect(), [(0, 100), (1, 101), (2, 102), (3, 103), (4, 104)])
        a = a._reserialize(BatchedSerializer(PickleSerializer(), 2))
        b = b._reserialize(MarshalSerializer())
        self.assertEqual(a.zip(b).collect(), [(0, 100), (1, 101), (2, 102), (3, 103), (4, 104)])

    def test_zip_with_different_number_of_items(self):
        a = self.sc.parallelize(range(5), 2)
        # different number of partitions
        b = self.sc.parallelize(range(100, 106), 3)
        self.assertRaises(ValueError, lambda: a.zip(b))
        # different number of batched items in JVM
        b = self.sc.parallelize(range(100, 104), 2)
        self.assertRaises(Exception, lambda: a.zip(b).count())
        # different number of items in one pair
        b = self.sc.parallelize(range(100, 106), 2)
        self.assertRaises(Exception, lambda: a.zip(b).count())
        # same total number of items, but different distributions
        a = self.sc.parallelize([2, 3], 2).flatMap(range)
        b = self.sc.parallelize([3, 2], 2).flatMap(range)
        self.assertEquals(a.count(), b.count())
        self.assertRaises(Exception, lambda: a.zip(b).count())

    def test_histogram(self):
        # empty
        rdd = self.sc.parallelize([])
        self.assertEquals([0], rdd.histogram([0, 10])[1])
        self.assertEquals([0, 0], rdd.histogram([0, 4, 10])[1])
        self.assertRaises(ValueError, lambda: rdd.histogram(1))

        # out of range
        rdd = self.sc.parallelize([10.01, -0.01])
        self.assertEquals([0], rdd.histogram([0, 10])[1])
        self.assertEquals([0, 0], rdd.histogram((0, 4, 10))[1])

        # in range with one bucket
        rdd = self.sc.parallelize(range(1, 5))
        self.assertEquals([4], rdd.histogram([0, 10])[1])
        self.assertEquals([3, 1], rdd.histogram([0, 4, 10])[1])

        # in range with one bucket exact match
        self.assertEquals([4], rdd.histogram([1, 4])[1])

        # out of range with two buckets
        rdd = self.sc.parallelize([10.01, -0.01])
        self.assertEquals([0, 0], rdd.histogram([0, 5, 10])[1])

        # out of range with two uneven buckets
        rdd = self.sc.parallelize([10.01, -0.01])
        self.assertEquals([0, 0], rdd.histogram([0, 4, 10])[1])

        # in range with two buckets
        rdd = self.sc.parallelize([1, 2, 3, 5, 6])
        self.assertEquals([3, 2], rdd.histogram([0, 5, 10])[1])

        # in range with two bucket and None
        rdd = self.sc.parallelize([1, 2, 3, 5, 6, None, float('nan')])
        self.assertEquals([3, 2], rdd.histogram([0, 5, 10])[1])

        # in range with two uneven buckets
        rdd = self.sc.parallelize([1, 2, 3, 5, 6])
        self.assertEquals([3, 2], rdd.histogram([0, 5, 11])[1])

        # mixed range with two uneven buckets
        rdd = self.sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.0, 11.01])
        self.assertEquals([4, 3], rdd.histogram([0, 5, 11])[1])

        # mixed range with four uneven buckets
        rdd = self.sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.01, 12.0, 199.0, 200.0, 200.1])
        self.assertEquals([4, 2, 1, 3], rdd.histogram([0.0, 5.0, 11.0, 12.0, 200.0])[1])

        # mixed range with uneven buckets and NaN
        rdd = self.sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.01, 12.0,
                                   199.0, 200.0, 200.1, None, float('nan')])
        self.assertEquals([4, 2, 1, 3], rdd.histogram([0.0, 5.0, 11.0, 12.0, 200.0])[1])

        # out of range with infinite buckets
        rdd = self.sc.parallelize([10.01, -0.01, float('nan'), float("inf")])
        self.assertEquals([1, 2], rdd.histogram([float('-inf'), 0, float('inf')])[1])

        # invalid buckets
        self.assertRaises(ValueError, lambda: rdd.histogram([]))
        self.assertRaises(ValueError, lambda: rdd.histogram([1]))
        self.assertRaises(ValueError, lambda: rdd.histogram(0))
        self.assertRaises(TypeError, lambda: rdd.histogram({}))

        # without buckets
        rdd = self.sc.parallelize(range(1, 5))
        self.assertEquals(([1, 4], [4]), rdd.histogram(1))

        # without buckets single element
        rdd = self.sc.parallelize([1])
        self.assertEquals(([1, 1], [1]), rdd.histogram(1))

        # without bucket no range
        rdd = self.sc.parallelize([1] * 4)
        self.assertEquals(([1, 1], [4]), rdd.histogram(1))

        # without buckets basic two
        rdd = self.sc.parallelize(range(1, 5))
        self.assertEquals(([1, 2.5, 4], [2, 2]), rdd.histogram(2))

        # without buckets with more requested than elements
        rdd = self.sc.parallelize([1, 2])
        buckets = [1 + 0.2 * i for i in range(6)]
        hist = [1, 0, 0, 0, 1]
        self.assertEquals((buckets, hist), rdd.histogram(5))

        # invalid RDDs
        rdd = self.sc.parallelize([1, float('inf')])
        self.assertRaises(ValueError, lambda: rdd.histogram(2))
        rdd = self.sc.parallelize([float('nan')])
        self.assertRaises(ValueError, lambda: rdd.histogram(2))

        # string
        rdd = self.sc.parallelize(["ab", "ac", "b", "bd", "ef"], 2)
        self.assertEquals([2, 2], rdd.histogram(["a", "b", "c"])[1])
        self.assertEquals((["ab", "ef"], [5]), rdd.histogram(1))
        self.assertRaises(TypeError, lambda: rdd.histogram(2))

        # mixed RDD
        rdd = self.sc.parallelize([1, 4, "ab", "ac", "b"], 2)
        self.assertEquals([1, 1], rdd.histogram([0, 4, 10])[1])
        self.assertEquals([2, 1], rdd.histogram(["a", "b", "c"])[1])
        self.assertEquals(([1, "b"], [5]), rdd.histogram(1))
        self.assertRaises(TypeError, lambda: rdd.histogram(2))
Code example #50
File: tests.py  Project: fireflyc/spark
class TestRDDFunctions(PySparkTestCase):

    def test_failed_sparkcontext_creation(self):
        # Regression test for SPARK-1550
        self.sc.stop()
        self.assertRaises(Exception, lambda: SparkContext("an-invalid-master-name"))
        self.sc = SparkContext("local")

    def test_save_as_textfile_with_unicode(self):
        # Regression test for SPARK-970
        x = u"\u00A1Hola, mundo!"
        data = self.sc.parallelize([x])
        tempFile = tempfile.NamedTemporaryFile(delete=True)
        tempFile.close()
        data.saveAsTextFile(tempFile.name)
        raw_contents = ''.join(input(glob(tempFile.name + "/part-0000*")))
        self.assertEqual(x, unicode(raw_contents.strip(), "utf-8"))

    def test_transforming_cartesian_result(self):
        # Regression test for SPARK-1034
        rdd1 = self.sc.parallelize([1, 2])
        rdd2 = self.sc.parallelize([3, 4])
        cart = rdd1.cartesian(rdd2)
        result = cart.map(lambda (x, y): x + y).collect()

    def test_transforming_pickle_file(self):
        # Regression test for SPARK-2601
        data = self.sc.parallelize(["Hello", "World!"])
        tempFile = tempfile.NamedTemporaryFile(delete=True)
        tempFile.close()
        data.saveAsPickleFile(tempFile.name)
        pickled_file = self.sc.pickleFile(tempFile.name)
        pickled_file.map(lambda x: x).collect()

    def test_cartesian_on_textfile(self):
        # Regression test for
        path = os.path.join(SPARK_HOME, "python/test_support/hello.txt")
        a = self.sc.textFile(path)
        result = a.cartesian(a).collect()
        (x, y) = result[0]
        self.assertEqual("Hello World!", x.strip())
        self.assertEqual("Hello World!", y.strip())

    def test_deleting_input_files(self):
        # Regression test for SPARK-1025
        tempFile = tempfile.NamedTemporaryFile(delete=False)
        tempFile.write("Hello World!")
        tempFile.close()
        data = self.sc.textFile(tempFile.name)
        filtered_data = data.filter(lambda x: True)
        self.assertEqual(1, filtered_data.count())
        os.unlink(tempFile.name)
        self.assertRaises(Exception, lambda: filtered_data.count())

    def testAggregateByKey(self):
        data = self.sc.parallelize([(1, 1), (1, 1), (3, 2), (5, 1), (5, 3)], 2)

        def seqOp(x, y):
            x.add(y)
            return x

        def combOp(x, y):
            x |= y
            return x

        sets = dict(data.aggregateByKey(set(), seqOp, combOp).collect())
        self.assertEqual(3, len(sets))
        self.assertEqual(set([1]), sets[1])
        self.assertEqual(set([2]), sets[3])
        self.assertEqual(set([1, 3]), sets[5])
Code example #51
__author__ = "Chenweijia"

import os
import sys

# SPARK_HOME and the PySpark path must be configured before pyspark is imported.
os.environ["SPARK_HOME"] = "F:/spark"
sys.path.append("F:/spark/python")

import pandas
from operator import add
from pyspark.sql import *
from pyspark.context import SparkContext

sc = SparkContext("local", "test")
l = [("Alice", 1)]
sqlContext = SQLContext(sc)
rdd = sc.parallelize(l)
Person = Row("name", "age")
person = rdd.map(lambda r: Person(*r))
df2 = sqlContext.createDataFrame(person)
df = sqlContext.createDataFrame(rdd, ["name", "age"])
from pyspark.sql.types import *

schema = StructType([StructField("name", StringType(), True), StructField("age", IntegerType(), True)])
df3 = sqlContext.createDataFrame(rdd, schema)
# print rdd
# print sqlContext.createDataFrame(l).collect()
# print sqlContext.createDataFrame(l, ['name', 'age']).collect()
# print sqlContext.createDataFrame(rdd).collect()
# print df.collect()
# print df2.collect()
# print df3.collect()
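
A small follow-up sketch (not in the original script): all three DataFrames above describe the same single row, and the explicitly typed one carries the declared schema.

df3.printSchema()        # name: string, age: integer (from the explicit schema)
print df.collect()       # [Row(name=u'Alice', age=1)]
print df2.collect()      # [Row(name=u'Alice', age=1)]
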
Code example #52
st = RDD.mapValues(lambda x: (x, 1)).reduceByKey(lambda x, y:
                                                 (x[0] + y[0], x[1] + y[1]))
task1 = st.mapValues(lambda x: x[0] / x[1]).sortByKey(False)
task1f = task1.top(statecount, key=lambda x: x[1])

t1_output_header = 'state,stars'
with open(output_path1, 'w') as file_op:
    file_op.write(t1_output_header)
    for item in task1f:
        file_op.write("\n%s" % str(item).replace("(", "").replace(
            ")", "").replace("'", "").replace(" ", ""))

t2 = time.time()
t2a = task1.collect()
print("Task 2B m1:", sorted(t2a, key=lambda x: x[1], reverse=True)[:5])
t3 = time.time() - t2

t0 = time.time()
t2b = sc.parallelize(task1f).take(5)
print("Task 2B m2: ", t2b)
t1 = time.time() - t0

json_fileop = {}
json_fileop['m1'] = t3
json_fileop['m2'] = t1
json_fileop[
    'explanation'] = "The collect() function takes more time because it brings all the elements of the dataset into the driver (master node) memory, so a lot of time is spent moving the whole RDD to the driver. The take() function takes less time because it just selects the first n items from the RDD."
with open(output_path2, "w") as f:
    out = json.dump(OrderedDict(json_fileop), f)
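
A tiny illustration of the point made in the 'explanation' string above, using the same SparkContext `sc` and made-up data: take(n) only evaluates as many partitions as it needs, while collect() materialises the whole RDD on the driver.

big = sc.parallelize(range(10 ** 5), 100)
print(big.take(5))          # scans only the first partition(s)
print(len(big.collect()))   # pulls all 100,000 elements back to the driver
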
Code example #53
File: tests.py  Project: ArchangelSeraphim/spark
class TestRDDFunctions(PySparkTestCase):

    def test_failed_sparkcontext_creation(self):
        # Regression test for SPARK-1550
        self.sc.stop()
        self.assertRaises(Exception, lambda: SparkContext("an-invalid-master-name"))
        self.sc = SparkContext("local")

    def test_save_as_textfile_with_unicode(self):
        # Regression test for SPARK-970
        x = u"\u00A1Hola, mundo!"
        data = self.sc.parallelize([x])
        tempFile = tempfile.NamedTemporaryFile(delete=True)
        tempFile.close()
        data.saveAsTextFile(tempFile.name)
        raw_contents = ''.join(input(glob(tempFile.name + "/part-0000*")))
        self.assertEqual(x, unicode(raw_contents.strip(), "utf-8"))

    def test_transforming_cartesian_result(self):
        # Regression test for SPARK-1034
        rdd1 = self.sc.parallelize([1, 2])
        rdd2 = self.sc.parallelize([3, 4])
        cart = rdd1.cartesian(rdd2)
        result = cart.map(lambda (x, y): x + y).collect()

    def test_transforming_pickle_file(self):
        # Regression test for SPARK-2601
        data = self.sc.parallelize(["Hello", "World!"])
        tempFile = tempfile.NamedTemporaryFile(delete=True)
        tempFile.close()
        data.saveAsPickleFile(tempFile.name)
        pickled_file = self.sc.pickleFile(tempFile.name)
        pickled_file.map(lambda x: x).collect()

    def test_cartesian_on_textfile(self):
        # Regression test for
        path = os.path.join(SPARK_HOME, "python/test_support/hello.txt")
        a = self.sc.textFile(path)
        result = a.cartesian(a).collect()
        (x, y) = result[0]
        self.assertEqual("Hello World!", x.strip())
        self.assertEqual("Hello World!", y.strip())

    def test_deleting_input_files(self):
        # Regression test for SPARK-1025
        tempFile = tempfile.NamedTemporaryFile(delete=False)
        tempFile.write("Hello World!")
        tempFile.close()
        data = self.sc.textFile(tempFile.name)
        filtered_data = data.filter(lambda x: True)
        self.assertEqual(1, filtered_data.count())
        os.unlink(tempFile.name)
        self.assertRaises(Exception, lambda: filtered_data.count())

    def testAggregateByKey(self):
        data = self.sc.parallelize([(1, 1), (1, 1), (3, 2), (5, 1), (5, 3)], 2)

        def seqOp(x, y):
            x.add(y)
            return x

        def combOp(x, y):
            x |= y
            return x

        sets = dict(data.aggregateByKey(set(), seqOp, combOp).collect())
        self.assertEqual(3, len(sets))
        self.assertEqual(set([1]), sets[1])
        self.assertEqual(set([2]), sets[3])
        self.assertEqual(set([1, 3]), sets[5])

    def test_itemgetter(self):
        rdd = self.sc.parallelize([range(10)])
        from operator import itemgetter
        self.assertEqual([1], rdd.map(itemgetter(1)).collect())
        self.assertEqual([(2, 3)], rdd.map(itemgetter(2, 3)).collect())

    def test_namedtuple_in_rdd(self):
        from collections import namedtuple
        Person = namedtuple("Person", "id firstName lastName")
        jon = Person(1, "Jon", "Doe")
        jane = Person(2, "Jane", "Doe")
        theDoes = self.sc.parallelize([jon, jane])
        self.assertEquals([jon, jane], theDoes.collect())

    def test_large_broadcast(self):
        N = 100000
        data = [[float(i) for i in range(300)] for i in range(N)]
        bdata = self.sc.broadcast(data)  # 270MB
        m = self.sc.parallelize(range(1), 1).map(lambda x: len(bdata.value)).sum()
        self.assertEquals(N, m)
Code example #54
File: feature.py  Project: amplab/iolap
    def _create_model(self, java_model):
        return Word2VecModel(java_model)


class Word2VecModel(JavaModel):
    """
    Model fitted by Word2Vec.
    """


if __name__ == "__main__":
    import doctest
    from pyspark.context import SparkContext
    from pyspark.sql import Row, SQLContext
    globs = globals().copy()
    # The small batch size here ensures that we see multiple batches,
    # even in these small test examples:
    sc = SparkContext("local[2]", "ml.feature tests")
    sqlContext = SQLContext(sc)
    globs['sc'] = sc
    globs['sqlContext'] = sqlContext
    testData = sc.parallelize([Row(id=0, label="a"), Row(id=1, label="b"),
                               Row(id=2, label="c"), Row(id=3, label="a"),
                               Row(id=4, label="a"), Row(id=5, label="c")], 2)
    globs['stringIndDf'] = sqlContext.createDataFrame(testData)
    (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
    sc.stop()
    if failure_count:
        exit(-1)
Code example #55
File: tests.py  Project: calhank/reddiculous
class PyGraphXTestCase(unittest.TestCase):
    """
    Test vertices, edges, partitionBy, numEdges, numVertices,
    inDegrees, outDegrees, degrees, triplets, mapVertices,
    mapEdges, mapTriplets, reverse, subgraph, groupEdges,
    joinVertices, outerJoinVertices, collectNeighborIds,
    collectNeighbors, mapReduceTriplets, triangleCount for Graph
    """

    def setUp(self):
        class_name = self.__class__.__name__
        conf = SparkConf().set("spark.default.parallelism", 1)
        self.sc = SparkContext(appName=class_name, conf=conf)
        self.sc.setCheckpointDir("/tmp")

    def tearDown(self):
        self.sc.stop()

    def collect(self):
        vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertices = VertexRDD(vertexData)
        results = vertices.collect()
        self.assertEqual(results, [(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])

    def take(self):
        vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertices = VertexRDD(vertexData)
        results = vertices.collect()
        self.assertEqual(results, [(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])

    def count(self):
        vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertices = VertexRDD(vertexData)
        results = vertices.collect()
        self.assertEqual(results, 2)

    def mapValues(self):
        vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertices = VertexRDD(vertexData)
        results = vertices.collect()
        self.assertEqual(results, 2)

    def diff(self):
        vertexData0 = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertexData1 = self.sc.parallelize([(1, ("rxin", "student")), (2, ("jgonzal", "postdoc"))])
        vertices0 = VertexRDD(vertexData0)
        vertices1 = VertexRDD(vertexData1)
        results = vertices0.diff(vertices1)
        self.assertEqual(results, 2)

    def innerJoin(self):
        vertexData0 = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertexData1 = self.sc.parallelize([(1, ("rxin", "student")), (2, ("jgonzal", "postdoc"))])
        vertices0 = VertexRDD(vertexData0)
        vertices1 = VertexRDD(vertexData1)
        results = vertices0.diff(vertices1)
        self.assertEqual(results, 2)

    def leftJoin(self):
        vertexData0 = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        vertexData1 = self.sc.parallelize([(1, ("rxin", "student")), (2, ("jgonzal", "postdoc"))])
        vertices0 = VertexRDD(vertexData0)
        vertices1 = VertexRDD(vertexData1)
        results = vertices0.diff(vertices1)
        self.assertEqual(results, 2)
Code example #56
# values()
m = sc.parallelize([(1, 2), (3, 4)]).values()
m.collect()

# variance()
sc.parallelize([1, 2, 3]).variance()

# zip(other)
x = sc.parallelize(range(0,5))
y = sc.parallelize(range(1000, 1005))
x.zip(y).collect()

# zipWithIndex()
sc.parallelize(["a", "b", "c", "d"], 3).zipWithIndex().collect()

# zipWithUniqueId()
sc.parallelize(["a", "b", "c", "d", "e"], 3).zipWithUniqueId().collect()


### BROADCAST
from pyspark.context import SparkContext
sc = SparkContext('local', 'test')
b = sc.broadcast([1, 2, 3, 4, 5])
b.value
sc.parallelize([0, 0]).flatMap(lambda x: b.value).collect()
b.unpersist()

large_broadcast = sc.broadcast(range(10000))


df.groupBy("word").count().show()

###############
# Quick code to understand complete example

# Q1.0
from pyspark.sql import Row
from pyspark.sql.functions import split, explode

# Q2.0
lines = ["Good morning. Nice day", "OK bye bye", "Good work", "Good day"]

# Q3.0 Transform the list to an RDD and apply the Row()
#      function to each element

lines = sc.parallelize(lines).map(lambda x: Row(x))

# Q4.0 Convert it to a DataFrame with the column name 'value'
lines = sqlContext.createDataFrame(lines, ['value'])
lines.collect()
lines.show(truncate=False)

# Q5.0  What do split and explode do?
#             explode: Returns a new row for each element in the given array

lines.select(split(lines.value, " ")).show(truncate=False)
lines.select(explode(split(lines.value, " "))).show(truncate=False)
lines.select(explode(split(lines.value,
                           " ")).alias("word")).show(truncate=False)

# Q6.0