Example #1
        def m(line):
            val = fmt(line['VAL'])

            point = fmt(val / per) * p

            if top != 0 and point > top:
                point = top

            return Row(PK=line['PK'], VAL=line['VAL'], POINT=int(point))
Example #2
 def test_udf_with_array_type(self):
     d = [Row(l=range(3), d={"key": range(5)})]
     rdd = self.sc.parallelize(d)
     self.sqlCtx.inferSchema(rdd).registerTempTable("test")
     self.sqlCtx.registerFunction("copylist", lambda l: list(l), ArrayType(IntegerType()))
     self.sqlCtx.registerFunction("maplen", lambda d: len(d), IntegerType())
     [(l1, l2)] = self.sqlCtx.sql("select copylist(l), maplen(d) from test").collect()
     self.assertEqual(range(3), l1)
     self.assertEqual(1, l2)
Example #3
    def test_infer_schema(self):
        d = [Row(l=[], d={}),
             Row(l=[Row(a=1, b='s')], d={"key": Row(c=1.0, d="2")}, s="")]
        rdd = self.sc.parallelize(d)
        df = self.sqlCtx.inferSchema(rdd)
        self.assertEqual([], df.map(lambda r: r.l).first())
        self.assertEqual([None, ""], df.map(lambda r: r.s).collect())
        df.registerTempTable("test")
        result = self.sqlCtx.sql("SELECT l[0].a from test where d['key'].d = '2'")
        self.assertEqual(1, result.head()[0])

        df2 = self.sqlCtx.inferSchema(rdd, 1.0)
        self.assertEqual(df.schema(), df2.schema())
        self.assertEqual({}, df2.map(lambda r: r.d).first())
        self.assertEqual([None, ""], df2.map(lambda r: r.s).collect())
        df2.registerTempTable("test2")
        result = self.sqlCtx.sql("SELECT l[0].a from test2 where d['key'].d = '2'")
        self.assertEqual(1, result.head()[0])
Example #4
 def test_map_large_month_increase_mtd__first(self):
     res = invoice.map_large_month_increase_mtd(('test_vendor_id', [
         Row(invoice_id='test_invoice',
             invoice_date=datetime.date(2020, 1, 1),
             total_amount=Decimal(1),
             canonical_vendor_id='test_vendor_id'),
     ]))
     with self.assertRaises(StopIteration):
         next(res)
Example #5
def start():
    conf = SparkConf().setAppName("Test").setMaster("local")
    sc = SparkContext(conf=conf)
    sql = SQLContext(sc)
    ll = ["23040010", "23040011", "23040012", "23040013", "23040010"]
    n_rdd = sc.parallelize(ll).map(lambda row: Row(row))
    df = sql.createDataFrame(n_rdd, ["nums"])
    df.withColumn("NewItem", columntransform(df["nums"])).show()
    return None
Example #6
 def ece_idf(self, mergeRDD):
     dataDF = mergeRDD.map(lambda p: Row(**{'edu_city_exp': p[1]})).toDF()
     ece_hashingTF = HashingTF(inputCol='edu_city_exp',
                               outputCol='eceFeatures',
                               numFeatures=64)
     featuresData = ece_hashingTF.transform(dataDF)
     ece_idf = IDF(inputCol='eceFeatures', outputCol='ecefeatures')
     ece_idfModel = ece_idf.fit(featuresData)
     return ece_idfModel
Example #7
def process(time, rdd):

    try:
        
        # Instance of SparkSession
        spark = getSparkSessionInstance(rdd.context.getConf())

        # Convert RDD[String] to RDD[Row] to dataframe
        rowRdd = rdd.map(lambda w: Row(word = w))
        workingDataFrame = spark.createDataFrame(rowRdd)

        # Formatting/header
        header = rdd.first()
        data = rdd.filter(lambda row : row != header).toDF(header)

        # Vectorize relevant input data
        assembler = VectorAssembler(
            inputCols=['L1', 'L2', 'L3', 'L4', 'L5', 'L6', 'L7', 'L8', 'L9', 'L10',
                       'L11', 'L12', 'L13', 'L14', 'L15', 'L16', 'L17', 'L18', 'L19', 'L20',
                       'T1', 'T2', 'T3', 'T4', 'T5', 'T6', 'T7', 'T8', 'T9', 'T10',
                       'T11', 'T12', 'T13', 'T14', 'T15', 'T16', 'T17', 'T18', 'T19', 'T20',
                       'Humidity', 'Temperature', 'PositionX', 'PositionY', 'PositiionZ',
                       'Ambient1', 'Ambient2', 'Ambient3'],
            outputCol='features')

        # Transform data
        output = assembler.transform(data)

        # Working data, post transformation
        final_data = output.select('features','Status')

        # Training/testing split
        train_status,test_status = final_data.randomSplit([0.7,0.3])

        # Create LogReg
        lr_status = LogisticRegression(labelCol = 'Status')

        # Fit model to training data
        fitted_status_model = lr_status.fit(train_status)

        # Summary thus far
        training_sum = fitted_status_model.summary

        # Predictions thus far
        training_sum.predictions.describe()

        # Evaluate model
        pred_and_labels = fitted_status_model.evaluate(test_status) 

        # Final predictions
        predictions = pred_and_labels.predictions.select("prediction")

        # Prepare to evaluate accuracy
        status_eval = BinaryClassificationEvaluator(rawPredictionCol = 'prediction', labelCol = 'Status')

        # Evaluate accuracy
        auc = status_eval.evaluate(pred_and_labels.predictions)

        # Save/export data
        predictions.coalesce(1).write.format('csv').save("#redacted#/data.csv", mode = "append")
        data.coalesce(1).write.format('csv').save("#redacted#/data.csv", mode = "append")

    except Exception:
        pass
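
A minimal wiring sketch for the process() handler above, assuming a Spark Streaming job; the StreamingContext, host, and port below are illustrative placeholders and are not part of the original example.

    from pyspark.streaming import StreamingContext

    ssc = StreamingContext(sc, batchDuration=10)      # assumes an existing SparkContext `sc`
    lines = ssc.socketTextStream("localhost", 9999)   # placeholder source stream
    lines.foreachRDD(process)                         # foreachRDD also accepts (time, rdd) functions
    ssc.start()
    ssc.awaitTermination()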
Example #8
 def test_parquet_with_udt(self):
     from pyspark.sql.tests import ExamplePoint
     row = Row(label=1.0, point=ExamplePoint(1.0, 2.0))
     df0 = self.sc.parallelize([row]).toDF()
     output_dir = os.path.join(self.tempdir.name, "labeled_point")
     df0.saveAsParquetFile(output_dir)
     df1 = self.sqlCtx.parquetFile(output_dir)
     point = df1.head().point
     self.assertEquals(point, ExamplePoint(1.0, 2.0))
Example #9
def test_transform_happy_path(spark):
    df_test_case = spark.createDataFrame([
        Row(ForecastSiteCode=3008,
            ObservationTime=1,
            ObservationDate=datetime(2018, 12, 1, 4, 15, 0),
            WindDirection=12,
            WindSpeed=2,
            WindGust=37,
            Visibility=20000,
            ScreenTemperature=2.8,
            Pressure=998,
            SignificantWeatherCode=11,
            SiteName='FAIR ISLE (3008)',
            Latitude=59.53,
            Longitude=-1.63,
            Region='Orkney & Shetland',
            Country='SCOTLAND'),
        Row(ForecastSiteCode=3005,
            ObservationTime=2,
            ObservationDate=datetime(2019, 12, 1, 4, 15, 0),
            WindDirection=13,
            WindSpeed=1,
            WindGust=34,
            Visibility=30000,
            ScreenTemperature=5.8,
            Pressure=997,
            SignificantWeatherCode=11,
            SiteName='LERWICK (S. SCREEN) (3005)',
            Latitude=59.53,
            Longitude=-1.63,
            Region='Highland & Eilean Siar',
            Country='IRELAND')
    ])

    expected = pd.DataFrame([[
        datetime(2018, 12, 1, 4, 15, 0), 2.8, 'Orkney & Shetland', 2018
    ], [datetime(2019, 12, 1, 4, 15, 0), 5.8, 'Highland & Eilean Siar', 2019]],
                            columns=[
                                'ObservationDate', 'ScreenTemperature',
                                'Region', 'Year'
                            ])

    result = transform(df_test_case, logger).toPandas()
    assert_frame_equal(result, expected, check_dtype=False)
Example #10
 def test_convert_row_to_dict(self):
     row = Row(l=[Row(a=1, b='s')], d={"key": Row(c=1.0, d="2")})
     self.assertEqual(1, row.asDict()['l'][0].a)
     df = self.sc.parallelize([row]).toDF()
     df.registerTempTable("test")
     row = self.sqlCtx.sql("select l, d from test").head()
     self.assertEqual(1, row.asDict()["l"][0].a)
     self.assertEqual(1.0, row.asDict()['d']['key'].c)
Example #11
def test_spark_with_batch_spec_passthrough(tmp_path_factory, spark_session):
    base_directory: str = str(
        tmp_path_factory.mktemp("basic_spark_datasource_v013_filesystem_data_connector")
    )
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "test-A.csv",
        ],
    )
    basic_datasource: Datasource = instantiate_class_from_config(
        yaml.load(
            f"""
        class_name: Datasource

        execution_engine:
            class_name: SparkDFExecutionEngine
            spark_config:
                spark.master: local[*]
                spark.executor.memory: 6g
                spark.driver.memory: 6g
                spark.ui.showConsoleProgress: false
                spark.sql.shuffle.partitions: 2
                spark.default.parallelism: 4
        data_connectors:
            simple_filesystem_data_connector:
                class_name: InferredAssetFilesystemDataConnector
                base_directory: {base_directory}
                batch_spec_passthrough:
                    reader_options:
                        header: True
                glob_directive: '*'
                default_regex:
                    pattern: (.+)\\.csv
                    group_names:
                    - data_asset_name
            """,
        ),
        runtime_environment={"name": "my_datasource"},
        config_defaults={"module_name": "great_expectations.datasource"},
    )

    data_connector_name: str = "simple_filesystem_data_connector"
    data_asset_name: str = "test-A"

    batch_request: dict = {
        "datasource_name": "my_datasource",
        "data_connector_name": data_connector_name,
        "data_asset_name": data_asset_name,
    }

    batch = basic_datasource.get_batch_list_from_batch_request(
        BatchRequest(**batch_request)
    )
    # check that the batch_spec_passthrough has worked
    assert batch[0].data.dataframe.head() == Row(x="1", y="2")
Example #12
    def setUp(self):
        self.schema_1_field = StructType(
            fields=[StructField(name='field1', dataType=IntegerType())])
        self.schema_2_fields = StructType(fields=[
            StructField(name='field1', dataType=IntegerType()),
            StructField(name='field2', dataType=StringType())
        ])

        self.df = sql_context.createDataFrame(
            [Row(field1=42, field2='value2')], schema=self.schema_2_fields)
Example #13
    def setUpClass(cls) -> None:
        cls.spark = SparkSession.builder \
            .master("local[*]") \
            .appName("aap-pyspark-pytest") \
            .getOrCreate()

        my_schema = StructType([
            StructField("Id", StringType()),
            StructField("EventDate", StringType())
        ])

        my_rows = [
            Row("101", "4/5/2020"),
            Row("102", "8/9/2020"),
            Row("103", "3/5/2020"),
            Row("104", "9/10/2020")
        ]
        my_rdd = cls.spark.sparkContext.parallelize(my_rows, 2)
        cls.my_df = cls.spark.createDataFrame(my_rdd, my_schema)
Example #14
def write_data_to_hive(hsql, sql, table_name, result_rdd, struct_fields, partition=False):
    """Using SQL sentence to write data to hive"""
    schema = get_schema(struct_fields)
    row_rdd = result_rdd.map(lambda p: Row(*p))
    dataframe = hsql.createDataFrame(row_rdd, schema)
    dataframe.registerTempTable(table_name)
    if partition:
        hsql.sql("set hive.exec.dynamic.partition.mode=nonstrict")
        hsql.sql("set hive.exec.dynamic.partition=true")
    hsql.sql(sql)
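
A minimal usage sketch for write_data_to_hive(), assuming a HiveContext named hive_ctx and that the module's get_schema() helper (not shown here) accepts the field list below; all table and field names are illustrative.

    result_rdd = sc.parallelize([("alice", 3), ("bob", 5)])
    insert_sql = "INSERT INTO TABLE target_db.user_counts SELECT * FROM tmp_user_counts"
    write_data_to_hive(hive_ctx, insert_sql, "tmp_user_counts", result_rdd,
                       struct_fields=["name", "cnt"], partition=False)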
Example #15
 def nl_idf(self, mergeRDD):
     dataDF = mergeRDD.map(
         lambda p: Row(**{'leibie and name': p[2]})).toDF()
     nl_hashingTF = HashingTF(inputCol='leibie and name',
                              outputCol='nlFeatures',
                              numFeatures=256)
     featuresData = nl_hashingTF.transform(dataDF)
     nl_idf = IDF(inputCol='nlFeatures', outputCol='nlfeatures')
     nl_idfModel = nl_idf.fit(featuresData)
     return nl_idfModel
Example #16
    def filter_words(row):
        words = []
        tokens = row['header'].lower().split(
            ' ') + row['content'].lower().split(' ')

        for word in tokens:
            if word not in words_to_ignore:
                words.append(Row(label=row['label'], word=word))

        return words
Example #17
    def test_aggregator(self):
        df = self.df
        g = df.groupBy()
        self.assertEqual([99, 100], sorted(g.agg({'key': 'max', 'value': 'count'}).collect()[0]))
        self.assertEqual([Row(**{"AVG(key#0)": 49.5})], g.mean().collect())

        from pyspark.sql import Dsl
        self.assertEqual((0, u'99'), tuple(g.agg(Dsl.first(df.key), Dsl.last(df.value)).first()))
        self.assertTrue(95 < g.agg(Dsl.approxCountDistinct(df.key)).first()[0])
        self.assertEqual(100, g.agg(Dsl.countDistinct(df.value)).first()[0])
Example #18
 def test_infer_schema_with_udt(self):
     from pyspark.sql.tests import ExamplePoint, ExamplePointUDT
     row = Row(label=1.0, point=ExamplePoint(1.0, 2.0))
     df = self.sc.parallelize([row]).toDF()
     schema = df.schema
     field = [f for f in schema.fields if f.name == "point"][0]
     self.assertEqual(type(field.dataType), ExamplePointUDT)
     df.registerTempTable("labeled_point")
     point = self.sqlCtx.sql("SELECT point FROM labeled_point").head().point
     self.assertEqual(point, ExamplePoint(1.0, 2.0))
Example #19
def changeToDF_v1(rdd, table_view):
    def f(x):
        d = {}
        for i in range(len(x)):
            d[str(i)] = x[i]
        return d

    newDF = rdd.map(lambda x: Row(**f(x))).toDF()
    newDF.createOrReplaceTempView(table_view)
    return newDF
Example #20
 def test_vendor_not_seen_in_a_while(self):
     res = invoice.map_vendor_not_seen_in_a_while(('test_vendor_id', [
         Row(invoice_id='invoice_1',
             invoice_date=datetime.date(2020, 1, 1),
             canonical_vendor_id='test_vendor_id'),
         Row(invoice_id='invoice_2',
             invoice_date=datetime.date(2020, 4, 1),
             canonical_vendor_id='test_vendor_id'),
     ]))
     self.assertEqual(next(res), (
         datetime.date(2020, 4, 1),
         "First new bill in 3 months from vendor test_vendor_id",
         'vendor_not_seen_in_a_while',
         'invoice',
         'invoice_2',
         'test_vendor_id',
     ))
     with self.assertRaises(StopIteration):
         next(res)
Example #21
    def test_self_join(self):
        # SPARK-34319: self-join with FlatMapCoGroupsInPandas
        df = self.spark.createDataFrame([(1, 1)], ("column", "value"))

        row = df.groupby("ColUmn").cogroup(df.groupby("COLUMN")).applyInPandas(
            lambda r, l: r + l, "column long, value long")

        row = row.join(row).first()

        self.assertEqual(row.asDict(), Row(column=2, value=2).asDict())
Example #22
 def setUpClass(cls):
     ReusedPySparkTestCase.setUpClass()
     cls.tempdir = tempfile.NamedTemporaryFile(delete=False)
     os.unlink(cls.tempdir.name)
     print("type", type(cls.sc))
     print("type", type(cls.sc._jsc))
     _scala_HiveContext =\
         cls.sc._jvm.org.apache.spark.sql.hive.test.TestHiveContext(cls.sc._jsc.sc())
     cls.sqlCtx = HiveContext(cls.sc, _scala_HiveContext)
     cls.testData = [Row(key=i, value=str(i)) for i in range(100)]
     cls.df = cls.sc.parallelize(cls.testData).toDF()
Example #23
def message_to_row(descriptor, message):
    field_map = {}
    for field_tuple in message.ListFields():
        field_map[field_tuple[0].name] = field_tuple[1]

    values = {}
    for field_descriptor in sorted(descriptor.fields, key=lambda x: x.name):
        values[field_descriptor.name] = __get_field_value(
            field_descriptor, field_map)

    return Row(**values)
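
A minimal usage sketch for message_to_row(), assuming a hypothetical compiled protobuf class UserEvent and an active SparkSession named spark; __get_field_value is the example's own helper and is not reproduced here.

    msgs = [UserEvent(user_id="u1", score=3), UserEvent(user_id="u2", score=7)]
    rows = [message_to_row(UserEvent.DESCRIPTOR, m) for m in msgs]
    df = spark.createDataFrame(rows)   # one column per sorted protobuf field name
    df.show()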
Example #24
    def test_serialize_nested_array_and_map(self):
        d = [Row(l=[Row(a=1, b='s')], d={"key": Row(c=1.0, d="2")})]
        rdd = self.sc.parallelize(d)
        df = self.sqlCtx.createDataFrame(rdd)
        row = df.head()
        self.assertEqual(1, len(row.l))
        self.assertEqual(1, row.l[0].a)
        self.assertEqual("2", row.d["key"].d)

        l = df.map(lambda x: x.l).first()
        self.assertEqual(1, len(l))
        self.assertEqual('s', l[0].b)

        d = df.map(lambda x: x.d).first()
        self.assertEqual(1, len(d))
        self.assertEqual(1.0, d["key"].c)

        row = df.map(lambda x: x.d["key"]).first()
        self.assertEqual(1.0, row.c)
        self.assertEqual("2", row.d)
Example #25
    def test_save_async(self):
        data = [Row(id=i) for i in range(10)]
        df = self.spark.createDataFrame(data)

        path = os.path.join(self.tempdir, "saved_async")
        f = (df.write.format("json").saveAsync(path))

        self.assertIsNone(f.result())
        loaded = self.spark.read.format("json").load(path)
        self.assertEqual(loaded.count(), 10)
        self.assertEqual(sorted(loaded.collect()), data)
Example #26
def single_instance_hanler(user_features_1, user_features_2):
    """
    Merge the per-app occurrence counts from two records of a single user.
    :param user_features_1: app list from one user record, e.g. [0,1,1,0]
    :param user_features_2: app list from another user record, e.g. [1,1,0,0]
    :return: element-wise sum of the app occurrence counts, e.g. [1,2,1,0]
    """

    a = np.array([user_features_1.feats, user_features_2.feats])
    applist_feature = np.sum(a, axis=0).tolist()
    return Row(label=user_features_1.label, feats=applist_feature)
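
A minimal usage sketch, assuming an RDD keyed by user id whose values are Rows with label and feats fields; the data below is illustrative.

    records = sc.parallelize([
        ("user_1", Row(label=1, feats=[0, 1, 1, 0])),
        ("user_1", Row(label=1, feats=[1, 1, 0, 0])),
    ])
    merged = records.reduceByKey(single_instance_hanler)
    # merged.collect()[0][1].feats == [1, 2, 1, 0]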
Example #27
 def __call__(self, head):
     rows = head.map(lambda x: Row(
         sha1=x.blob_id, repo=x.repository_id, commit=x.commit_hash, path=x.path))
     if self.explained:
         self._log.info("toDebugString():\n%s", rows.toDebugString().decode())
     rows.toDF() \
         .write \
         .format("org.apache.spark.sql.cassandra") \
         .mode("append") \
         .options(table=self.table, keyspace=self.keyspace) \
         .save()
Example #28
    def saveRdd2Hive(rdd):
        # Handle empty RDDs
        if rdd.isEmpty():
            return

        rowRdd = rdd.map(
            lambda eachRow: Row(eachRow[0], eachRow[1], eachRow[2], eachRow[3]))
        dataFrame = sqlContext.createDataFrame(rowRdd, schema)
        dataFrame.registerTempTable("tempTable")
        dataFrame.show()
        hiveContext.sql("INSERT INTO userbehaviors SELECT * FROM tempTable")
Example #29
 def test_map_no_invoice_received__quarterly__end_of_month(self):
     res = invoice.map_no_invoice_received(('test_vendor_id', [
         Row(invoice_id='test_invoice_1',
             invoice_date=datetime.date(2020, 1, 25),
             canonical_vendor_id='test_vendor_id'),
         Row(invoice_id='test_invoice_2',
             invoice_date=datetime.date(2020, 4, 25),
             canonical_vendor_id='test_vendor_id'),
         Row(invoice_id='test_invoice_3',
             invoice_date=datetime.date(2020, 7, 25),
             canonical_vendor_id='test_vendor_id'),
     ]))
     self.assertEqual(list(res), [(
         datetime.date(2020, 10, i),
         f"test_vendor_id generally charges between on 25 day of each month invoices are sent. On 2020-10-25, an invoice from test_vendor_id has not been received",
         'no_invoice_received',
         'vendor',
         None,
         'test_vendor_id',
     ) for i in range(25, 32)])
Example #30
def Trans_DF1():
    spark = SparkSession.builder.getOrCreate()
    sc = SparkContext.getOrCreate()
    peopleDF = sc.textFile("people.txt")
    peopleDF = peopleDF.map(lambda line: line.split(",")).map(
        lambda x: Row(**f(x))).toDF()
    peopleDF.createOrReplaceTempView("people")
    personDF = spark.sql("select * from people")
    personDF.rdd.map(lambda t: "Name:" + t[0] + "," + "Age:" + t[1]).foreach(
        print)
    return personDF