def m(line):
    # fmt, per, p and top are expected to be bound in the enclosing scope
    val = fmt(line['VAL'])
    point = fmt(val / per) * p
    if top != 0 and point > top:
        point = top
    return Row(PK=line['PK'], VAL=line['VAL'], POINT=int(point))
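# A minimal usage sketch for m (hypothetical): score_input is assumed to be an
# RDD whose elements are dicts or Rows carrying 'PK' and 'VAL' fields, with an
# active SparkSession so that toDF() works.
scored_df = score_input.map(m).toDF()
scored_df.show()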
def test_udf_with_array_type(self):
    d = [Row(l=list(range(3)), d={"key": list(range(5))})]
    rdd = self.sc.parallelize(d)
    self.sqlCtx.inferSchema(rdd).registerTempTable("test")
    self.sqlCtx.registerFunction("copylist", lambda l: list(l), ArrayType(IntegerType()))
    self.sqlCtx.registerFunction("maplen", lambda d: len(d), IntegerType())
    [(l1, l2)] = self.sqlCtx.sql("select copylist(l), maplen(d) from test").collect()
    self.assertEqual(list(range(3)), l1)
    self.assertEqual(1, l2)
def test_infer_schema(self):
    d = [Row(l=[], d={}),
         Row(l=[Row(a=1, b='s')], d={"key": Row(c=1.0, d="2")}, s="")]
    rdd = self.sc.parallelize(d)
    df = self.sqlCtx.inferSchema(rdd)
    self.assertEqual([], df.map(lambda r: r.l).first())
    self.assertEqual([None, ""], df.map(lambda r: r.s).collect())
    df.registerTempTable("test")
    result = self.sqlCtx.sql("SELECT l[0].a from test where d['key'].d = '2'")
    self.assertEqual(1, result.head()[0])

    df2 = self.sqlCtx.inferSchema(rdd, 1.0)
    self.assertEqual(df.schema(), df2.schema())
    self.assertEqual({}, df2.map(lambda r: r.d).first())
    self.assertEqual([None, ""], df2.map(lambda r: r.s).collect())
    df2.registerTempTable("test2")
    result = self.sqlCtx.sql("SELECT l[0].a from test2 where d['key'].d = '2'")
    self.assertEqual(1, result.head()[0])
def test_map_large_month_increase_mtd__first(self):
    res = invoice.map_large_month_increase_mtd(('test_vendor_id', [
        Row(invoice_id='test_invoice', invoice_date=datetime.date(2020, 1, 1),
            total_amount=Decimal(1), canonical_vendor_id='test_vendor_id'),
    ]))
    with self.assertRaises(StopIteration):
        next(res)
def start():
    conf = SparkConf().setAppName("Test").setMaster("local")
    sc = SparkContext(conf=conf)
    sql = SQLContext(sc)
    ll = ["23040010", "23040011", "23040012", "23040013", "23040010"]
    n_rdd = sc.parallelize(ll).map(lambda row: Row(row))
    df = sql.createDataFrame(n_rdd, ["nums"])
    # columntransform is a column-level UDF defined elsewhere
    df.withColumn("NewItem", columntransform(df["nums"])).show()
    return None
def ece_idf(self, mergeRDD):
    dataDF = mergeRDD.map(lambda p: Row(**{'edu_city_exp': p[1]})).toDF()
    ece_hashingTF = HashingTF(inputCol='edu_city_exp', outputCol='eceFeatures',
                              numFeatures=64)
    featuresData = ece_hashingTF.transform(dataDF)
    ece_idf = IDF(inputCol='eceFeatures', outputCol='ecefeatures')
    ece_idfModel = ece_idf.fit(featuresData)
    return ece_idfModel
def process(time, rdd):
    try:
        # Instance of SparkSession
        spark = getSparkSessionInstance(rdd.context.getConf())
        # Convert RDD[String] to RDD[Row] to dataframe
        rowRdd = rdd.map(lambda w: Row(word=w))
        workingDataFrame = spark.createDataFrame(rowRdd)
        # Formatting/header
        header = rdd.first()
        data = rdd.filter(lambda row: row != header).toDF(header)
        # Vectorize relevant input data
        assembler = VectorAssembler(
            inputCols=['L1', 'L2', 'L3', 'L4', 'L5', 'L6', 'L7', 'L8', 'L9', 'L10',
                       'L11', 'L12', 'L13', 'L14', 'L15', 'L16', 'L17', 'L18', 'L19', 'L20',
                       'T1', 'T2', 'T3', 'T4', 'T5', 'T6', 'T7', 'T8', 'T9', 'T10',
                       'T11', 'T12', 'T13', 'T14', 'T15', 'T16', 'T17', 'T18', 'T19', 'T20',
                       'Humidity', 'Temperature', 'PositionX', 'PositionY', 'PositiionZ',
                       'Ambient1', 'Ambient2', 'Ambient3'],
            outputCol='features')
        # Transform data
        output = assembler.transform(data)
        # Working data, post transformation
        final_data = output.select('features', 'Status')
        # Training/testing split
        train_status, test_status = final_data.randomSplit([0.7, 0.3])
        # Create LogReg
        lr_status = LogisticRegression(labelCol='Status')
        # Fit model to training data
        fitted_status_model = lr_status.fit(train_status)
        # Summary thus far
        training_sum = fitted_status_model.summary
        # Predictions thus far
        training_sum.predictions.describe()
        # Evaluate model
        pred_and_labels = fitted_status_model.evaluate(test_status)
        # Final predictions
        predictions = pred_and_labels.predictions.select("prediction")
        # Prepare to evaluate accuracy
        status_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                                    labelCol='Status')
        # Evaluate accuracy
        auc = status_eval.evaluate(pred_and_labels.predictions)
        # Save/export data
        predictions.coalesce(1).write.format('csv').save("#redacted#/data.csv", mode="append")
        data.coalesce(1).write.format('csv').save("#redacted#/data.csv", mode="append")
    except Exception:
        # Swallow per-batch failures (e.g. empty RDDs) so the streaming job keeps running
        pass
def test_parquet_with_udt(self):
    from pyspark.sql.tests import ExamplePoint
    row = Row(label=1.0, point=ExamplePoint(1.0, 2.0))
    df0 = self.sc.parallelize([row]).toDF()
    output_dir = os.path.join(self.tempdir.name, "labeled_point")
    df0.saveAsParquetFile(output_dir)
    df1 = self.sqlCtx.parquetFile(output_dir)
    point = df1.head().point
    self.assertEqual(point, ExamplePoint(1.0, 2.0))
def test_transform_happy_path(spark):
    df_test_case = spark.createDataFrame([
        Row(ForecastSiteCode=3008, ObservationTime=1,
            ObservationDate=datetime(2018, 12, 1, 4, 15, 0),
            WindDirection=12, WindSpeed=2, WindGust=37, Visibility=20000,
            ScreenTemperature=2.8, Pressure=998, SignificantWeatherCode=11,
            SiteName='FAIR ISLE (3008)', Latitude=59.53, Longitude=-1.63,
            Region='Orkney & Shetland', Country='SCOTLAND'),
        Row(ForecastSiteCode=3005, ObservationTime=2,
            ObservationDate=datetime(2019, 12, 1, 4, 15, 0),
            WindDirection=13, WindSpeed=1, WindGust=34, Visibility=30000,
            ScreenTemperature=5.8, Pressure=997, SignificantWeatherCode=11,
            SiteName='LERWICK (S. SCREEN) (3005)', Latitude=59.53, Longitude=-1.63,
            Region='Highland & Eilean Siar', Country='IRELAND')
    ])
    expected = pd.DataFrame(
        [[datetime(2018, 12, 1, 4, 15, 0), 2.8, 'Orkney & Shetland', 2018],
         [datetime(2019, 12, 1, 4, 15, 0), 5.8, 'Highland & Eilean Siar', 2019]],
        columns=['ObservationDate', 'ScreenTemperature', 'Region', 'Year'])
    result = transform(df_test_case, logger).toPandas()
    assert_frame_equal(result, expected, check_dtype=False)
def test_convert_row_to_dict(self):
    row = Row(l=[Row(a=1, b='s')], d={"key": Row(c=1.0, d="2")})
    self.assertEqual(1, row.asDict()['l'][0].a)
    df = self.sc.parallelize([row]).toDF()
    df.registerTempTable("test")
    row = self.sqlCtx.sql("select l, d from test").head()
    self.assertEqual(1, row.asDict()["l"][0].a)
    self.assertEqual(1.0, row.asDict()['d']['key'].c)
def test_spark_with_batch_spec_passthrough(tmp_path_factory, spark_session):
    base_directory: str = str(
        tmp_path_factory.mktemp("basic_spark_datasource_v013_filesystem_data_connector")
    )
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "test-A.csv",
        ],
    )
    basic_datasource: Datasource = instantiate_class_from_config(
        yaml.load(
            f"""
class_name: Datasource

execution_engine:
    class_name: SparkDFExecutionEngine
    spark_config:
        spark.master: local[*]
        spark.executor.memory: 6g
        spark.driver.memory: 6g
        spark.ui.showConsoleProgress: false
        spark.sql.shuffle.partitions: 2
        spark.default.parallelism: 4

data_connectors:
    simple_filesystem_data_connector:
        class_name: InferredAssetFilesystemDataConnector
        base_directory: {base_directory}
        batch_spec_passthrough:
            reader_options:
                header: True
        glob_directive: '*'
        default_regex:
            pattern: (.+)\\.csv
            group_names:
                - data_asset_name
""",
        ),
        runtime_environment={"name": "my_datasource"},
        config_defaults={"module_name": "great_expectations.datasource"},
    )
    data_connector_name: str = "simple_filesystem_data_connector"
    data_asset_name: str = "test-A"
    batch_request: dict = {
        "datasource_name": "my_datasource",
        "data_connector_name": data_connector_name,
        "data_asset_name": data_asset_name,
    }
    batch = basic_datasource.get_batch_list_from_batch_request(
        BatchRequest(**batch_request)
    )
    # check that the batch_spec_passthrough has worked
    assert batch[0].data.dataframe.head() == Row(x="1", y="2")
def setUp(self):
    self.schema_1_field = StructType(
        fields=[StructField(name='field1', dataType=IntegerType())])
    self.schema_2_fields = StructType(fields=[
        StructField(name='field1', dataType=IntegerType()),
        StructField(name='field2', dataType=StringType())
    ])
    self.df = sql_context.createDataFrame(
        [Row(field1=42, field2='value2')], schema=self.schema_2_fields)
def setUpClass(cls) -> None:
    cls.spark = SparkSession.builder \
        .master("local[*]") \
        .appName("aap-pyspark-pytest") \
        .getOrCreate()

    my_schema = StructType([
        StructField("Id", StringType()),
        StructField("EventDate", StringType())
    ])
    my_rows = [
        Row("101", "4/5/2020"),
        Row("102", "8/9/2020"),
        Row("103", "3/5/2020"),
        Row("104", "9/10/2020")
    ]
    my_rdd = cls.spark.sparkContext.parallelize(my_rows, 2)
    cls.my_df = cls.spark.createDataFrame(my_rdd, my_schema)
def write_data_to_hive(hsql, sql, table_name, result_rdd, struct_fields, partition=False):
    """Use a SQL statement to write data to Hive."""
    schema = get_schema(struct_fields)
    row_rdd = result_rdd.map(lambda p: Row(*p))
    dataframe = hsql.createDataFrame(row_rdd, schema)
    dataframe.registerTempTable(table_name)
    if partition:
        hsql.sql("set hive.exec.dynamic.partition.mode=nonstrict")
        hsql.sql("set hive.exec.dynamic.partition=true")
    hsql.sql(sql)
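# get_schema is defined elsewhere; a plausible sketch, assuming struct_fields
# is a list of (name, dataType) pairs (a hypothetical shape):
from pyspark.sql.types import StructField, StructType

def get_schema(struct_fields):
    return StructType([StructField(name, data_type, True)
                       for name, data_type in struct_fields])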
def nl_idf(self, mergeRDD):
    dataDF = mergeRDD.map(
        lambda p: Row(**{'leibie and name': p[2]})).toDF()
    nl_hashingTF = HashingTF(inputCol='leibie and name', outputCol='nlFeatures',
                             numFeatures=256)
    featuresData = nl_hashingTF.transform(dataDF)
    nl_idf = IDF(inputCol='nlFeatures', outputCol='nlfeatures')
    nl_idfModel = nl_idf.fit(featuresData)
    return nl_idfModel
def filter_words(row):
    words = []
    tokens = row['header'].lower().split(' ') + row['content'].lower().split(' ')
    for word in tokens:
        if word not in words_to_ignore:
            words.append(Row(label=row['label'], word=word))
    return words
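# filter_words emits a list of Rows per input row, so it pairs naturally with
# flatMap; a sketch, assuming labeled_rdd yields rows with 'label', 'header'
# and 'content' fields and words_to_ignore is in scope (hypothetical names):
labeled_words = labeled_rdd.flatMap(filter_words).toDF()
labeled_words.groupBy("word").count().show()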
def test_aggregator(self):
    df = self.df
    g = df.groupBy()
    self.assertEqual([99, 100],
                     sorted(g.agg({'key': 'max', 'value': 'count'}).collect()[0]))
    self.assertEqual([Row(**{"AVG(key#0)": 49.5})], g.mean().collect())

    from pyspark.sql import Dsl
    self.assertEqual((0, u'99'),
                     tuple(g.agg(Dsl.first(df.key), Dsl.last(df.value)).first()))
    self.assertTrue(95 < g.agg(Dsl.approxCountDistinct(df.key)).first()[0])
    self.assertEqual(100, g.agg(Dsl.countDistinct(df.value)).first()[0])
def test_infer_schema_with_udt(self):
    from pyspark.sql.tests import ExamplePoint, ExamplePointUDT
    row = Row(label=1.0, point=ExamplePoint(1.0, 2.0))
    df = self.sc.parallelize([row]).toDF()
    schema = df.schema
    field = [f for f in schema.fields if f.name == "point"][0]
    self.assertEqual(type(field.dataType), ExamplePointUDT)
    df.registerTempTable("labeled_point")
    point = self.sqlCtx.sql("SELECT point FROM labeled_point").head().point
    self.assertEqual(point, ExamplePoint(1.0, 2.0))
def changeToDF_v1(rdd, table_view):
    def f(x):
        d = {}
        for i in range(len(x)):
            d[str(i)] = x[i]
        return d

    newDF = rdd.map(lambda x: Row(**f(x))).toDF()
    newDF.createOrReplaceTempView(table_view)
    return newDF
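# Example use of changeToDF_v1 (a sketch, assuming an active SparkSession
# named spark): f names the columns by stringified position, so they come out
# as "0", "1", ... and need backticks in Spark SQL.
pairs_df = changeToDF_v1(spark.sparkContext.parallelize([(1, 'a'), (2, 'b')]), "pairs")
spark.sql("SELECT `0`, `1` FROM pairs").show()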
def test_vendor_not_seen_in_a_while(self):
    res = invoice.map_vendor_not_seen_in_a_while(('test_vendor_id', [
        Row(invoice_id='invoice_1', invoice_date=datetime.date(2020, 1, 1),
            canonical_vendor_id='test_vendor_id'),
        Row(invoice_id='invoice_2', invoice_date=datetime.date(2020, 4, 1),
            canonical_vendor_id='test_vendor_id'),
    ]))
    self.assertEqual(next(res), (
        datetime.date(2020, 4, 1),
        "First new bill in 3 months from vendor test_vendor_id",
        'vendor_not_seen_in_a_while',
        'invoice',
        'invoice_2',
        'test_vendor_id',
    ))
    with self.assertRaises(StopIteration):
        next(res)
def test_self_join(self):
    # SPARK-34319: self-join with FlatMapCoGroupsInPandas
    df = self.spark.createDataFrame([(1, 1)], ("column", "value"))
    row = df.groupby("ColUmn").cogroup(df.groupby("COLUMN")).applyInPandas(
        lambda r, l: r + l, "column long, value long")
    row = row.join(row).first()
    self.assertEqual(row.asDict(), Row(column=2, value=2).asDict())
def setUpClass(cls):
    ReusedPySparkTestCase.setUpClass()
    cls.tempdir = tempfile.NamedTemporaryFile(delete=False)
    os.unlink(cls.tempdir.name)
    print("type", type(cls.sc))
    print("type", type(cls.sc._jsc))
    _scala_HiveContext = \
        cls.sc._jvm.org.apache.spark.sql.hive.test.TestHiveContext(cls.sc._jsc.sc())
    cls.sqlCtx = HiveContext(cls.sc, _scala_HiveContext)
    cls.testData = [Row(key=i, value=str(i)) for i in range(100)]
    cls.df = cls.sc.parallelize(cls.testData).toDF()
def message_to_row(descriptor, message):
    field_map = {}
    for field_tuple in message.ListFields():
        field_map[field_tuple[0].name] = field_tuple[1]
    values = {}
    for field_descriptor in sorted(descriptor.fields, key=lambda x: x.name):
        values[field_descriptor.name] = __get_field_value(
            field_descriptor, field_map)
    return Row(**values)
def test_serialize_nested_array_and_map(self):
    d = [Row(l=[Row(a=1, b='s')], d={"key": Row(c=1.0, d="2")})]
    rdd = self.sc.parallelize(d)
    df = self.sqlCtx.createDataFrame(rdd)
    row = df.head()
    self.assertEqual(1, len(row.l))
    self.assertEqual(1, row.l[0].a)
    self.assertEqual("2", row.d["key"].d)

    l = df.map(lambda x: x.l).first()
    self.assertEqual(1, len(l))
    self.assertEqual('s', l[0].b)

    d = df.map(lambda x: x.d).first()
    self.assertEqual(1, len(d))
    self.assertEqual(1.0, d["key"].c)

    row = df.map(lambda x: x.d["key"]).first()
    self.assertEqual(1.0, row.c)
    self.assertEqual("2", row.d)
def test_save_async(self):
    data = [Row(id=i) for i in range(10)]
    df = self.spark.createDataFrame(data)
    path = os.path.join(self.tempdir, "saved_async")
    f = df.write.format("json").saveAsync(path)
    self.assertIsNone(f.result())
    loaded = self.spark.read.format("json").load(path)
    self.assertEqual(loaded.count(), 10)
    self.assertEqual(sorted(loaded.collect()), data)
def single_instance_hanler(user_features_1, user_features_2):
    """
    Merge the per-app occurrence counts of a single user.

    :param user_features_1: app list from one user record, e.g. [0, 1, 1, 0]
    :param user_features_2: app list from another user record, e.g. [1, 1, 0, 0]
    :return: element-wise sum of the occurrence counts, e.g. [1, 2, 1, 0]
    """
    a = np.array([user_features_1.feats, user_features_2.feats])
    applist_feature = np.sum(a, axis=0).tolist()
    return Row(label=user_features_1.label, feats=applist_feature)
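# single_instance_hanler is an associative two-Row merge, so it fits
# reduceByKey; a sketch, assuming user_rows is an RDD of
# (user_id, Row(label=..., feats=[...])) pairs (hypothetical names):
per_user_counts = user_rows.reduceByKey(single_instance_hanler)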
def __call__(self, head):
    rows = head.map(lambda x: Row(
        sha1=x.blob_id, repo=x.repository_id, commit=x.commit_hash, path=x.path))
    if self.explained:
        self._log.info("toDebugString():\n%s", rows.toDebugString().decode())
    rows.toDF() \
        .write \
        .format("org.apache.spark.sql.cassandra") \
        .mode("append") \
        .options(table=self.table, keyspace=self.keyspace) \
        .save()
def saveRdd2Hive(rdd):
    # Skip empty RDDs
    if rdd.isEmpty():
        return
    rowRdd = rdd.map(lambda eachRow: Row(eachRow[0], eachRow[1],
                                         eachRow[2], eachRow[3]))
    dataFrame = sqlContext.createDataFrame(rowRdd, schema)
    dataFrame.registerTempTable("tempTable")
    dataFrame.show()
    hiveContext.sql("INSERT INTO userbehaviors SELECT * FROM tempTable")
def test_map_no_invoice_received__quarterly__end_of_month(self):
    res = invoice.map_no_invoice_received(('test_vendor_id', [
        Row(invoice_id='test_invoice_1', invoice_date=datetime.date(2020, 1, 25),
            canonical_vendor_id='test_vendor_id'),
        Row(invoice_id='test_invoice_2', invoice_date=datetime.date(2020, 4, 25),
            canonical_vendor_id='test_vendor_id'),
        Row(invoice_id='test_invoice_3', invoice_date=datetime.date(2020, 7, 25),
            canonical_vendor_id='test_vendor_id'),
    ]))
    self.assertEqual(list(res), [(
        datetime.date(2020, 10, i),
        "test_vendor_id generally charges between on 25 day of each month "
        "invoices are sent. On 2020-10-25, an invoice from test_vendor_id has "
        "not been received",
        'no_invoice_received',
        'vendor',
        None,
        'test_vendor_id',
    ) for i in range(25, 32)])
def Trans_DF1():
    spark = SparkSession.builder.getOrCreate()
    sc = SparkContext.getOrCreate()
    peopleDF = sc.textFile("people.txt")
    # f (defined elsewhere) maps the split fields of a line into a
    # {column_name: value} dict for Row(**...)
    peopleDF = peopleDF.map(lambda line: line.split(",")).map(
        lambda x: Row(**f(x))).toDF()
    peopleDF.createOrReplaceTempView("people")
    personDF = spark.sql("select * from people")
    personDF.rdd.map(
        lambda t: "Name:" + t[0] + "," + "Age:" + t[1]).foreach(print)
    return personDF
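# A plausible sketch of the helper f used in Trans_DF1 (the real f is defined
# elsewhere; the key names here are assumptions based on how Name and Age are
# read above). On Spark 3.x, Row(**kwargs) preserves the given field order.
def f(x):
    return {'Name': x[0], 'Age': x[1]}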