# MAGIC
# MAGIC Complex outputs are helpful when you need to return multiple values from your UDF. The UDF design pattern involves returning a single column that you then drill down into to pull out the desired data.

# COMMAND ----------

# MAGIC %md-sandbox
# MAGIC Start by determining the desired output. This will look like a schema with a high-level `StructType` containing numerous `StructFields`.
# MAGIC
# MAGIC <img alt="Side Note" title="Side Note" style="vertical-align: text-bottom; position: relative; height:1.75em; top:0.05em; transform:rotate(15deg)" src="https://files.training.databricks.com/static/images/icon-note.webp"/> For a refresher on this, see the lesson **Applying Schemas to JSON Data** in the ETL Part 1 module.

# COMMAND ----------

from pyspark.sql.types import FloatType, StructType, StructField

mathOperationsSchema = StructType([
  StructField("sum", FloatType(), True),
  StructField("multiplication", FloatType(), True),
  StructField("division", FloatType(), True)
])

# COMMAND ----------

# MAGIC %md
# MAGIC Create a function that returns a tuple of your desired output.

# COMMAND ----------

def manual_math(x, y):
    return (float(x + y), float(x * y), x / float(y))

manual_math(1, 2)
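# COMMAND ----------

# MAGIC %md
# MAGIC The cell below is an illustrative sketch, not part of the original lesson: it registers `manual_math`
# MAGIC as a UDF that returns `mathOperationsSchema`, then drills into the resulting struct column with dot
# MAGIC notation. The example DataFrame and its column names `x` and `y` are assumptions for demonstration only.

# COMMAND ----------

from pyspark.sql.functions import col, udf

# register the Python function with the struct schema so Spark returns one struct column
manualMathUDF = udf(manual_math, mathOperationsSchema)

# assumed demo DataFrame with two numeric columns
exampleDF = spark.createDataFrame([(1, 2), (3, 4)], ["x", "y"])

# apply the UDF, then select the individual struct fields with dot notation
(exampleDF
  .withColumn("mathOperations", manualMathUDF(col("x"), col("y")))
  .select("mathOperations.sum", "mathOperations.multiplication", "mathOperations.division")
  .show())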
def test_select_subset_of_columns_as_entity_primary_keys( spark: SparkSession, composite_entity_schema: StructType, customer_feature_schema: StructType, ): entity_data = [ (1001, 8001, datetime(year=2020, month=9, day=2)), (2001, 8002, datetime(year=2020, month=9, day=2)), ] entity_df = spark.createDataFrame( spark.sparkContext.parallelize(entity_data), composite_entity_schema) feature_table_data = [ ( 1001, datetime(year=2020, month=9, day=1), datetime(year=2020, month=9, day=2), 100.0, ), ( 2001, datetime(year=2020, month=9, day=1), datetime(year=2020, month=9, day=1), 400.0, ), ] feature_table_df = spark.createDataFrame( spark.sparkContext.parallelize(feature_table_data), customer_feature_schema) feature_table = FeatureTable( name="transactions", features=[Field("daily_transactions", "double")], entities=[Field("customer_id", "int32")], max_age=86400, ) feature_table_df = filter_feature_table_by_time_range( feature_table_df, feature_table, "event_timestamp", entity_df, "event_timestamp", ) joined_df = as_of_join( entity_df, "event_timestamp", feature_table_df, feature_table, ) expected_joined_schema = StructType([ StructField("customer_id", IntegerType()), StructField("driver_id", IntegerType()), StructField("event_timestamp", TimestampType()), StructField("transactions__daily_transactions", FloatType()), ]) expected_joined_data = [ ( 1001, 8001, datetime(year=2020, month=9, day=2), 100.0, ), ( 2001, 8002, datetime(year=2020, month=9, day=2), 400.0, ), ] expected_joined_df = spark.createDataFrame( spark.sparkContext.parallelize(expected_joined_data), expected_joined_schema) assert_dataframe_equal(joined_df, expected_joined_df)
def test_implicit_type_conversion(spark: SparkSession, ): test_data_dir = path.join(pathlib.Path(__file__).parent.absolute(), "data") entity_source = { "file": { "format": { "json_class": "CSVFormat" }, "path": f"file://{path.join(test_data_dir, 'single_customer.csv')}", "event_timestamp_column": "event_timestamp", "options": { "inferSchema": "true", "header": "true" }, } } transaction_source = { "file": { "format": { "json_class": "CSVFormat" }, "path": f"file://{path.join(test_data_dir, 'transactions.csv')}", "event_timestamp_column": "event_timestamp", "created_timestamp_column": "created_timestamp", "options": { "inferSchema": "true", "header": "true" }, } } transaction_table = { "name": "transactions", "entities": [{ "name": "customer_id", "type": "int32" }], "features": [{ "name": "daily_transactions", "type": "float" }], "max_age": 86400, } joined_df = retrieve_historical_features( spark, entity_source, [transaction_source], [transaction_table], ) expected_joined_schema = StructType([ StructField("customer_id", IntegerType()), StructField("event_timestamp", TimestampType()), StructField("transactions__daily_transactions", FloatType()), ]) expected_joined_data = [ ( 1001, datetime(year=2020, month=9, day=2), 100.0, ), ] expected_joined_df = spark.createDataFrame( spark.sparkContext.parallelize(expected_joined_data), expected_joined_schema) assert_dataframe_equal(joined_df, expected_joined_df)
for col in gdelt.columns[1:]:
    gdelt = gdelt.withColumn(col, F.lower(F.col(col)))

print('Getting only GDELT FRA Events')
# keep only events where both actor codes start with "fra"
gdelt = gdelt.where('Actor1Code RLIKE "^fra.*"')
gdelt = gdelt.where('Actor2Code RLIKE "^fra.*"')

print('Normalizing Date')
date_normalizer = partial(utils.normalize_date,
                          starting_date='20170101',
                          ending_date='20190101',
                          date_format='%Y%m%d')
pudf_normalizer = F.pandas_udf(lambda ds: ds.apply(date_normalizer),
                               FloatType(), F.PandasUDFType.SCALAR)
# date_normalizer = F.udf(lambda d: normalize_date(d, '20170101', '20190101', '%Y%m%d'), FloatType())
gdelt = gdelt.withColumn('date', pudf_normalizer(gdelt.SQLDATE)).drop('SQLDATE')

print('GDELT to dummies')
dummies_var = []  # one list of dummy-column expressions per categorical column
for col in cols_to_take[1:]:
    categories = gdelt.select(col).distinct().rdd.flatMap(
        lambda x: x).collect()
    dummies_var.append([
        F.when(F.col(col) == category, 1).otherwise(0).alias(col.lower() + '_' + category)
        for category in categories
    ])
gdelt = gdelt.select('date', *list(chain.from_iterable(dummies_var)))
import pyspark
from pyspark import SparkContext
from pyspark.sql.types import FloatType

spark = pyspark.sql.SparkSession.builder \
    .master("local") \
    .appName("movies") \
    .getOrCreate()

df = spark.read.csv(path="./comparison.csv", header=True)

df = df.withColumn("similarity", df["similarity"].cast(FloatType())) \
    .orderBy("similarity", ascending=[0]) \
    .collect()

sc = SparkContext.getOrCreate()
rdd = sc.parallelize(df)


def seq_op(acc, row):
    similarity = row["similarity"]

    if row["first"] in acc:
        acc[row["first"]].append((row["second"], similarity))
    else:
        acc[row["first"]] = [(row["second"], similarity)]

    if row["second"] in acc:
        acc[row["second"]].append((row["first"], similarity))
    else:
        acc[row["second"]] = [(row["first"], similarity)]

    return acc
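# Hedged sketch (not in the original source): seq_op above looks like the sequence function of an
# RDD aggregate. A matching combiner and the aggregate call could look like the following; the
# names comb_op and similarity_map are assumptions for illustration.
def comb_op(acc1, acc2):
    # merge the partial dictionaries produced by seq_op on different partitions
    for key, pairs in acc2.items():
        acc1.setdefault(key, []).extend(pairs)
    return acc1


similarity_map = rdd.aggregate({}, seq_op, comb_op)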
from pyspark.sql import SparkSession from pyspark.sql import functions as func from pyspark.sql.types import StructType, StructField, IntegerType, FloatType spark = ( SparkSession.builder.appName("TotalSpentByCustomer") .master("local[*]") .getOrCreate() ) # Create schema when reading customer-orders customerOrderSchema = StructType( [ StructField("cust_id", IntegerType(), True), StructField("item_id", IntegerType(), True), StructField("amount_spent", FloatType(), True), ] ) # Load up the data into spark dataset customersDF = spark.read.schema(customerOrderSchema).csv("customer-orders.csv") totalByCustomer = customersDF.groupBy("cust_id").agg( func.round(func.sum("amount_spent"), 2).alias("total_spent") ) totalByCustomerSorted = totalByCustomer.sort("total_spent") totalByCustomerSorted.show(totalByCustomerSorted.count()) spark.stop()
# put the whole pipeline together with cross-validation
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=3)

# Split the data into train and test
splits = joinedDF.randomSplit([0.8, 0.2], 1234)
train = splits[0]
test = splits[1]

# Run cross-validation, and choose the best set of parameters.
cvModel = crossval.fit(train)

prediction = cvModel.transform(test)
evaluator.evaluate(prediction)

# need a function to extract the probability of the positive class
# (the second element of the probability vector)
secondelement = udf(lambda v: float(v[1]), FloatType())
selected = prediction.withColumn(
    "prob_of_cancel",
    secondelement(prediction["probability"])).select(
        "ID", "prob_of_cancel", "prediction", 'label')
selected.coalesce(1).write.csv(
    path=
    "s3n://leesa.east2.training/predictions/flight_cancellation_predictions.csv",
    mode='overwrite',
    header=True)
"float": FloatType, "double": DoubleType, "boolean": BooleanType, "struct": StructType, "array": ArrayType, "bigint": LongType, "date": DateType, "byte": ByteType, "short": ShortType, "datetime": TimestampType, "binary": BinaryType, "null": NullType } SPARK_DTYPES_DICT_OBJECTS = \ {"string": StringType(), "int": IntegerType(), "float": FloatType(), "double": DoubleType(), "boolean": BooleanType(), "struct": StructType(), "array": ArrayType(StringType()), "bigint": LongType(), "date": DateType(), "byte": ByteType(), "short": ShortType(), "datetime": TimestampType(), "binary": BinaryType(), "null": NullType() } # Profiler PROFILER_TYPES = { "int", "float", "string", "bool", "date", "null", "array", "double" } PROFILER_LEGEND_TYPES = { "string": "ABC", "int": "#", "integer": "#", "float": "##.#", "double": "##.#",
spark = SparkSession.builder.master("local").appName("Return").getOrCreate()

df = spark.read.csv(
    r'D:\LiBao\data_20200904\NSM_GlobalSelect_Nasdaq\NSM-2016-01-05-TAS-Data-1-of-1-a1.csv',
    header=True)
df_global_market = spark.read.csv(
    r'D:\LiBao\data_20200904\NMS-2\NMS-2016-01-05-TAS-Data-1-of-1-a1.csv',
    header=True)
df_capital_market = spark.read.csv(
    r'D:\LiBao\data_20200904\NAQ\NAQ-2016-01-05-TAS-Data-1-of-1-a1.csv',
    header=True)
df = df.union(df_global_market).union(df_capital_market)

# rename the first column and the sequence number column
df = df.withColumnRenamed('#RIC', 'Ticker')
df = df.withColumnRenamed('Seq. No.', 'SeqNo')

# change the type of the data
df = df.withColumn('Price', df['Price'].cast(FloatType()))
df = df.withColumn('Volume', df['Volume'].cast(FloatType()))
df = df.withColumn('SeqNo', df['SeqNo'].cast(IntegerType()))

# select the trade entries
trade_df = df.where("Type=='Trade'")

# convert trading time into hours and minutes
trade_df = trade_df.withColumn('Hour', hour(trade_df['Exch Time']))
trade_df = trade_df.withColumn('Minute', minute(trade_df['Exch Time']))

# subtract 870 from the time (in minutes) to get the minute indicator
trade_df = trade_df.withColumn(
    'MinuteIndicator',
    trade_df['Hour'] * 60 + trade_df['Minute'] - 870)

# select the data during trading hours with trading volume larger than 0, and drop any missing values on trade price
trade_df = trade_df.\
    filter((trade_df['MinuteIndicator'] >= 0)
           & (trade_df['MinuteIndicator'] <= 390)
           & (trade_df['Volume'] > 0)).\
    dropna(subset=('Price'))

# delete rows whose ticker contains '![/'; all these rows are minute-by-minute summaries rather than real trades
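# A minimal sketch (not in the original fragment) of the filter described by the comment above,
# assuming a literal substring match on the renamed 'Ticker' column is what is wanted:
trade_df = trade_df.filter(~trade_df['Ticker'].contains('![/'))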
# dropoff lat float(chunk_row[4]), # dropoff long float(chunk_row[5])) except ValueError: pass # parse the datasets into row tuples yellow_rows = yellow.mapPartitions(parse_yellow) citi_rows = citi.mapPartitions(parse_citi) # define dataframe schemas yellow_schema = StructType([ StructField('dropoff_time', TimestampType(), True), StructField('dropoff_lat', FloatType(), True), StructField('dropoff_lng', FloatType(), True) ]) citi_schema = StructType([ StructField('station_id', IntegerType(), True), StructField('ride_id', StringType(), True), StructField('start_time', TimestampType(), True) ]) # instantiate the dataframes yellow_df = sqlContext.createDataFrame(yellow_rows, yellow_schema) citi_df = sqlContext.createDataFrame(citi_rows, citi_schema) # filtering function to check if the taxi dropoff location is within 0.25 miles of citibike station
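# Hedged sketch (not from the original source): one way the filtering function described by the
# comment above could be written. The haversine formula and the 0.25-mile threshold are standard,
# but the function name, signature, and the availability of station coordinates are assumptions.
from math import asin, cos, radians, sin, sqrt


def within_quarter_mile(dropoff_lat, dropoff_lng, station_lat, station_lng):
    # haversine great-circle distance in miles (mean earth radius ~3958.8 mi)
    lat1, lng1, lat2, lng2 = map(radians, (dropoff_lat, dropoff_lng, station_lat, station_lng))
    a = sin((lat2 - lat1) / 2) ** 2 + cos(lat1) * cos(lat2) * sin((lng2 - lng1) / 2) ** 2
    return 2 * 3958.8 * asin(sqrt(a)) <= 0.25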
# train the model
model = ALS.train(
    dfRates.rdd, 20, 20)  # you could tune these numbers, but these are reasonable choices
print("trained ...")

# use this model to predict what the user would rate accommodations that she has not rated
allPredictions = None
for USER_ID in range(0, 100):
    dfUserRatings = dfRates.filter(
        dfRates.userId == USER_ID).rdd.map(lambda r: r.accoId).collect()
    rddPotential = dfAccos.rdd.filter(lambda x: x[0] not in dfUserRatings)
    pairsPotential = rddPotential.map(lambda x: (USER_ID, x[0]))
    predictions = model.predictAll(pairsPotential).map(
        lambda p: (str(p[0]), str(p[1]), float(p[2])))
    predictions = predictions.takeOrdered(5, key=lambda x: -x[2])  # top 5
    print("predicted for user={0}".format(USER_ID))
    if allPredictions is None:
        allPredictions = predictions
    else:
        allPredictions.extend(predictions)

# write them
schema = StructType([
    StructField("userId", StringType(), True),
    StructField("accoId", StringType(), True),
    StructField("prediction", FloatType(), True)
])

dfToSave = sqlContext.createDataFrame(allPredictions, schema)
dfToSave.write.jdbc(url=jdbcUrl, table='Recommendation', mode='overwrite')
def test_as_spark_type(self): type_mapper = { # binary np.character: BinaryType(), np.bytes_: BinaryType(), np.string_: BinaryType(), bytes: BinaryType(), # integer np.int8: ByteType(), np.byte: ByteType(), np.int16: ShortType(), np.int32: IntegerType(), np.int64: LongType(), np.int: LongType(), int: LongType(), # floating np.float32: FloatType(), np.float: DoubleType(), np.float64: DoubleType(), float: DoubleType(), # string np.str: StringType(), np.unicode_: StringType(), str: StringType(), # bool np.bool: BooleanType(), bool: BooleanType(), # datetime np.datetime64: TimestampType(), datetime.datetime: TimestampType(), # DateType datetime.date: DateType(), # DecimalType decimal.Decimal: DecimalType(38, 18), # ArrayType np.ndarray: ArrayType(StringType()), List[bytes]: ArrayType(BinaryType()), List[np.character]: ArrayType(BinaryType()), List[np.bytes_]: ArrayType(BinaryType()), List[np.string_]: ArrayType(BinaryType()), List[bool]: ArrayType(BooleanType()), List[np.bool]: ArrayType(BooleanType()), List[datetime.date]: ArrayType(DateType()), List[np.int8]: ArrayType(ByteType()), List[np.byte]: ArrayType(ByteType()), List[decimal.Decimal]: ArrayType(DecimalType(38, 18)), List[float]: ArrayType(DoubleType()), List[np.float]: ArrayType(DoubleType()), List[np.float64]: ArrayType(DoubleType()), List[np.float32]: ArrayType(FloatType()), List[np.int32]: ArrayType(IntegerType()), List[int]: ArrayType(LongType()), List[np.int]: ArrayType(LongType()), List[np.int64]: ArrayType(LongType()), List[np.int16]: ArrayType(ShortType()), List[str]: ArrayType(StringType()), List[np.unicode_]: ArrayType(StringType()), List[datetime.datetime]: ArrayType(TimestampType()), List[np.datetime64]: ArrayType(TimestampType()), } for numpy_or_python_type, spark_type in type_mapper.items(): self.assertEqual(as_spark_type(numpy_or_python_type), spark_type) with self.assertRaisesRegex(TypeError, "Type uint64 was not understood."): as_spark_type(np.dtype("uint64"))
def classify_spark(training, testing, target_domains, target_domains_dict):

    # Add a catch-all "_other" class at the end of the label list
    target_domains_dict["_other"] = len(target_domains)
    target_domains.append("_other")

    feature_list = [c for c in training.columns if c.startswith("_")]
    assembler = VectorAssembler(inputCols=feature_list,
                                outputCol="features",
                                handleInvalid="skip")
    str2idx = udf(lambda s: float(target_domains_dict[s]), FloatType())
    idx2str = udf(lambda f: target_domains[int(f)], StringType())

    training = assembler.transform(training)
    testing = assembler.transform(testing)

    training = training.withColumn("label_idx", str2idx("label"))
    testing = testing.withColumn("label_idx", str2idx("label"))

    bins = np.zeros(len(target_domains))
    freqs = {
        row["label_idx"]: row["count"]
        for row in training.select("label_idx")
                           .groupBy("label_idx").count().collect()
    }
    for i in freqs:
        bins[int(i)] = freqs[i]
    class_weights = np.sum(bins) / (len(bins) * bins)
    idx2cw = udf(lambda f: float(class_weights[int(f)]), FloatType())
    training = training.withColumn("weight", idx2cw("label_idx"))

    #model = pyspark.ml.classification.DecisionTreeClassifier(labelCol="label_idx",
    #    featuresCol="features", predictionCol="prediction_idx")
    model = pyspark.ml.classification.LogisticRegression(
        labelCol="label_idx",
        weightCol="weight",
        featuresCol="features",
        predictionCol="prediction_idx")
    model_fit = model.fit(training)
    training_predictions = model_fit.transform(training)
    testing_predictions = model_fit.transform(testing)

    training_predictions = training_predictions.withColumn(
        "prediction", idx2str("prediction_idx"))
    testing_predictions = testing_predictions.withColumn(
        "prediction", idx2str("prediction_idx"))

    labels_training = training_predictions.select("label").toPandas().values
    labels_test = testing_predictions.select("label").toPandas().values
    pred_training = training_predictions.select("prediction").toPandas().values
    pred_test = testing_predictions.select("prediction").toPandas().values

    training_report = classification_report(labels_training,
                                            pred_training,
                                            output_dict=True)
    testing_report = classification_report(labels_test,
                                           pred_test,
                                           output_dict=True)

    return model_fit, training_report, testing_report
df = sc.read.parquet('../data/userdata1.parquet')
print(df)

# Handle duplicate values
print(df.drop_duplicates().count())

# Handling missing data
print(df.fillna(0).show())
print(df.dropna().show())

# fill missing values in specific columns
print(df.fillna({'cc': '6767119071901597'}).show())

# Changing data type in the DF
df1 = df.withColumn("salary", df["salary"].cast(FloatType()))
print(df1.show())
print(df1.printSchema())

# replace null values with the mean salary
mean_salary = df1.agg(F.avg(df1.salary)).first()[0]
df1 = df1.fillna({'salary': mean_salary})

# drop empty string literals and cast to a numeric type
# (use "long": 16-digit card numbers overflow a 32-bit integer)
df = df.withColumn("cc", F.when(df.cc != '', df.cc).otherwise('0'))
df = df.withColumn("cc", df.cc.cast("long"))
print(df.printSchema())

# replace empty String literal with something
df = df.withColumn("birthdate", F.when(df.birthdate != '', df.birthdate).otherwise("05/05/2020"))
# use 'MM' for the month -- lowercase 'mm' means minutes in Spark date patterns
df = df.withColumn("birthdate", F.to_date(df.birthdate, 'MM/dd/yyyy'))
def test_prepare_data_compress_sparse(self): util.clear_training_cache() expected_metadata = \ { 'float': { 'spark_data_type': FloatType, 'is_sparse_vector_only': False, 'intermediate_format': constants.NOCHANGE, 'max_size': 1, 'shape': 1 }, 'dense': { 'spark_data_type': DenseVector, 'is_sparse_vector_only': False, 'intermediate_format': constants.ARRAY, 'max_size': 2, 'shape': 2 }, 'sparse': { 'spark_data_type': SparseVector, 'is_sparse_vector_only': True, 'intermediate_format': constants.CUSTOM_SPARSE, 'max_size': 1, 'shape': 2 }, 'mixed': { 'spark_data_type': DenseVector, 'is_sparse_vector_only': False, 'intermediate_format': constants.ARRAY, 'max_size': 2, 'shape': 2 }, } with mock.patch('horovod.spark.common.util._get_metadata', side_effect=util._get_metadata) as mock_get_metadata: with spark_session('test_prepare_data') as spark: data = [[ 0.0, DenseVector([1.0, 1.0]), SparseVector(2, {1: 1.0}), DenseVector([1.0, 1.0]) ], [ 1.0, DenseVector([1.0, 1.0]), SparseVector(2, {1: 1.0}), SparseVector(2, {1: 1.0}) ]] schema = StructType([ StructField('float', FloatType()), StructField('dense', VectorUDT()), StructField('sparse', VectorUDT()), StructField('mixed', VectorUDT()) ]) df = create_test_data_from_schema(spark, data, schema) with local_store() as store: with util.prepare_data(num_processes=2, store=store, df=df, feature_columns=['dense', 'sparse', 'mixed'], label_columns=['float'], compress_sparse=True) as dataset_idx: mock_get_metadata.assert_called() assert dataset_idx == 0 train_rows, val_rows, metadata, avg_row_size = util.get_dataset_properties(dataset_idx) self.assertDictEqual(metadata, expected_metadata)
""" """ from pyspark.sql import functions as F from pyspark.sql import SparkSession from pyspark.sql.types import StructType, StructField, FloatType, IntegerType spark = SparkSession.builder.appName('CustomerAmount').getOrCreate() schema = StructType([ StructField('user_id', IntegerType(), True), StructField('order_id', IntegerType(), True), StructField('value', FloatType(), True) ]) df = spark.read.csv('datasets/customer-orders.csv', schema=schema) df.describe().show() df.show(5) # +-------+--------+-----+ # |user_id|order_id|value| # +-------+--------+-----+ # | 44| 8602|37.19| # | 35| 5368|65.89| # | 2| 3391|40.64| # | 47| 6694|14.98| # | 29| 680|13.08| # +-------+--------+-----+ df_users = df.groupby('user_id')\ .agg(F.sum('value').alias('total'))\
X21_test = scalerX21.transform(X21)
pred21 = model_d21.predict(X21_test)
prediction21 = scalery21.inverse_transform(pred21)
df21['prediction'] = prediction21

df_new = pd.concat([df8, df9, df10, df11, df12, df13, df14, df15, df16, df17, df18, df19, df20, df21])

df_new['ml_score'] = (df_new['prediction'] / 30) * (31 - df_new['day_num']) + df_new['usage_till_date']
df_new['ma_score'] = df_new['first7day_avg'] * (31 - df_new['day_num']) + df_new['usage_till_date']

p_schema = StructType([
    StructField('concat_agmnt_no', StringType(), True),
    StructField('billing_start_date', StringType(), True),
    StructField('billing_end_date', StringType(), True),
    StructField('day_num', IntegerType(), True),
    StructField('usage_till_date', FloatType(), True),
    StructField('first7day_avg', FloatType(), True),
    StructField('last_1bc_usg', FloatType(), True),
    StructField('ml_score', FloatType(), True),
    StructField('ma_score', FloatType(), True)
])

df1 = df_new[['concat_agmnt_no', 'billing_start_date', 'billing_end_date', 'day_num',
              'usage_till_date', 'first7day_avg', 'last_1bc_usg', 'ml_score', 'ma_score']]
df2 = sqlContext.createDataFrame(df1, p_schema)
df2.registerTempTable("test_temp_df")
def select_relevant_columns(df, discrete_action: bool = True, include_possible_actions: bool = True): """Select all the relevant columns and perform type conversions.""" if not discrete_action and include_possible_actions: raise NotImplementedError( "currently we don't support include_possible_actions") select_col_list = [ # pyre-fixme[16]: Module `functions` has no attribute `col`. col("reward").cast(FloatType()), # pyre-fixme[16]: Module `functions` has no attribute `col`. col("state_features").cast(ArrayType(FloatType())), # pyre-fixme[16]: Module `functions` has no attribute `col`. col("state_features_presence").cast(ArrayType(BooleanType())), # pyre-fixme[16]: Module `functions` has no attribute `col`. col("next_state_features").cast(ArrayType(FloatType())), # pyre-fixme[16]: Module `functions` has no attribute `col`. col("next_state_features_presence").cast(ArrayType(BooleanType())), # pyre-fixme[16]: Module `functions` has no attribute `col`. col("not_terminal").cast(BooleanType()), # pyre-fixme[16]: Module `functions` has no attribute `col`. col("action_probability").cast(FloatType()), # pyre-fixme[16]: Module `functions` has no attribute `col`. col("mdp_id").cast(LongType()), # pyre-fixme[16]: Module `functions` has no attribute `col`. col("sequence_number").cast(LongType()), # pyre-fixme[16]: Module `functions` has no attribute `col`. col("step").cast(LongType()), # pyre-fixme[16]: Module `functions` has no attribute `col`. col("time_diff").cast(LongType()), # pyre-fixme[16]: Module `functions` has no attribute `col`. col("metrics").cast(ArrayType(FloatType())), # pyre-fixme[16]: Module `functions` has no attribute `col`. col("metrics_presence").cast(ArrayType(BooleanType())), ] if discrete_action: select_col_list += [ # pyre-fixme[16]: Module `functions` has no attribute `col`. col("action").cast(LongType()), # pyre-fixme[16]: Module `functions` has no attribute `col`. col("next_action").cast(LongType()), ] else: select_col_list += [ # pyre-fixme[16]: Module `functions` has no attribute `col`. col("action").cast(ArrayType(FloatType())), # pyre-fixme[16]: Module `functions` has no attribute `col`. col("next_action").cast(ArrayType(FloatType())), # pyre-fixme[16]: Module `functions` has no attribute `col`. col("action_presence").cast(ArrayType(BooleanType())), # pyre-fixme[16]: Module `functions` has no attribute `col`. col("next_action_presence").cast(ArrayType(BooleanType())), ] if include_possible_actions: select_col_list += [ # pyre-fixme[16]: Module `functions` has no attribute `col`. col("possible_actions_mask").cast(ArrayType(LongType())), # pyre-fixme[16]: Module `functions` has no attribute `col`. col("possible_next_actions_mask").cast(ArrayType(LongType())), ] return df.select(*select_col_list)
"int": "int", "float": "float", "double": "double", "bool": "boolean", "boolean": "boolean", "struct": "struct", "array": "array", "date": "date", "long": "long" # "vector": "vector" } SPARK_DTYPES_DICT_OBJECTS = { "string": StringType(), "int": IntegerType(), "float": FloatType(), "double": DoubleType(), "boolean": BooleanType(), "struct": StructType(), "array": ArrayType, "date": DateType() } SPARK_DTYPES_DICT = { "string": StringType, "int": IntegerType, "float": FloatType, "double": DoubleType, "boolean": BooleanType, "struct": StructType, "array": ArrayType,
from pyspark.sql.functions import udf from pyspark.sql.types import ArrayType, IntegerType, FloatType, StructType, Row, StructField def prediction(height, width, nChannels, data): array = np.ndarray(shape=(height, width, nChannels), dtype=np.uint8, buffer=data, strides=(width * nChannels, nChannels, 1)) out_scores, out_boxes, out_classes = predict_util(array) return Row('classes', 'scores')(out_classes.tolist(), out_scores.tolist()) schema = StructType([ StructField("classes", ArrayType(IntegerType()), False), StructField("scores", ArrayType(FloatType()), False) ]) prediction_udf = udf(prediction, schema) # COMMAND ---------- # MAGIC %md # MAGIC # Assert prediction `classes` on `test.jpg` # COMMAND ---------- display( images_df.where( "image.origin='dbfs:/mnt/roy/object-detection/images/test.jpg'"). withColumn(
from pyspark.sql import SparkSession from pyspark.sql import functions as func from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType spark = SparkSession.builder.appName("MinTemperatures").getOrCreate() schema = StructType([ \ StructField("stationID", StringType(), True), \ StructField("date", IntegerType(), True), \ StructField("measure_type", StringType(), True), \ StructField("temperature", FloatType(), True) \ ]) # // Read the file as dataframe df = spark.read.schema(schema).csv("data/1800.csv") df.printSchema() # Filter out all but TMIN entries minTemps = df.filter(df.measure_type == "TMIN") # Select only stationID and temperature stationTemps = minTemps.select("stationID", "temperature") # Aggregate to find minimum temperature for every station minTempsByStation = stationTemps.groupBy("stationID").min("temperature") minTempsByStation.show() # Convert temperature to fahrenheit and sort the dataset minTempsByStationF = minTempsByStation.withColumn("temperature", func.round(func.col("min(temperature)") * 0.1 * (9.0 / 5.0) + 32.0, 2))\ .select("stationID", "temperature").sort("temperature")
new_test_data = dict() for company in test_data: company_data = test_data[company] test_df = vector_assembler.transform(company_data) test_df = scalar.transform(test_df) test_df = test_df.select(['scaledFeatures', TARGET]) new_test_data[company] = test_df return train_df, new_test_data error_pct_udf = udf( lambda arr: (float(abs(arr[0] - arr[1])) * float(100)) / float(arr[0]), FloatType()) def train_and_pred(train, test_data, tech_only=False): # train the linear regression model lr_model = LinearRegression(featuresCol='scaledFeatures', labelCol=TARGET, maxIter=300, regParam=1, elasticNetParam=1).fit(train) print('Coefficients: {}'.format(str(lr_model.coefficients))) print('Intercept: {}'.format(str(lr_model.intercept))) # summarize the training trainingSummary = lr_model.summary print('Training r2 = {}'.format(float(trainingSummary.r2)))
_SMALLINT_TYPE = __short_type.simpleString() __int_type = IntegerType() _INT_TYPE = __int_type.simpleString() assert _INT_TYPE == int.__name__ assert __int_type.typeName().startswith(_INT_TYPE) __long_type = LongType() _BIGINT_TYPE = __long_type.simpleString() assert __long_type.typeName() == 'long' _INT_TYPES = \ [_TINYINT_TYPE, _SMALLINT_TYPE, _INT_TYPE, _BIGINT_TYPE] __float_type = FloatType() _FLOAT_TYPE = __float_type.simpleString() assert _FLOAT_TYPE == __float_type.typeName() __double_type = DoubleType() _DOUBLE_TYPE = __double_type.simpleString() assert _DOUBLE_TYPE == __double_type.typeName() _FLOAT_TYPES = [_FLOAT_TYPE, _DOUBLE_TYPE] _NUM_TYPES = _INT_TYPES + _FLOAT_TYPES _POSSIBLE_CAT_TYPES = [_BOOL_TYPE, _STR_TYPE] + _NUM_TYPES _POSSIBLE_FEATURE_TYPES = _POSSIBLE_CAT_TYPES + _NUM_TYPES __date_type = DateType()
def align_diff_frames(resolve_func, this, that, fillna=True, how="full", preserve_order_column=False): """ This method aligns two different DataFrames with a given `func`. Columns are resolved and handled within the given `func`. To use this, `compute.ops_on_diff_frames` should be True, for now. :param resolve_func: Takes aligned (joined) DataFrame, the column of the current DataFrame, and the column of another DataFrame. It returns an iterable that produces Series. >>> from databricks.koalas.config import set_option, reset_option >>> >>> set_option("compute.ops_on_diff_frames", True) >>> >>> kdf1 = ks.DataFrame({'a': [9, 8, 7, 6, 5, 4, 3, 2, 1]}) >>> kdf2 = ks.DataFrame({'a': [9, 8, 7, 6, 5, 4, 3, 2, 1]}) >>> >>> def func(kdf, this_column_labels, that_column_labels): ... kdf # conceptually this is A + B. ... ... # Within this function, Series from A or B can be performed against `kdf`. ... this_label = this_column_labels[0] # this is ('a',) from kdf1. ... that_label = that_column_labels[0] # this is ('a',) from kdf2. ... new_series = (kdf[this_label] - kdf[that_label]).rename(str(this_label)) ... ... # This new series will be placed in new DataFrame. ... yield (new_series, this_label) >>> >>> >>> align_diff_frames(func, kdf1, kdf2).sort_index() a 0 0 1 0 2 0 3 0 4 0 5 0 6 0 7 0 8 0 >>> reset_option("compute.ops_on_diff_frames") :param this: a DataFrame to align :param that: another DataFrame to align :param fillna: If True, it fills missing values in non-common columns in both `this` and `that`. Otherwise, it returns as are. :param how: join way. In addition, it affects how `resolve_func` resolves the column conflict. - full: `resolve_func` should resolve only common columns from 'this' and 'that' DataFrames. For instance, if 'this' has columns A, B, C and that has B, C, D, `this_columns` and 'that_columns' in this function are B, C and B, C. - left: `resolve_func` should resolve columns including that columns. For instance, if 'this' has columns A, B, C and that has B, C, D, `this_columns` is B, C but `that_columns` are B, C, D. - inner: Same as 'full' mode; however, internally performs inner join instead. :return: Aligned DataFrame """ assert how == "full" or how == "left" or how == "inner" this_column_labels = this._internal.column_labels that_column_labels = that._internal.column_labels common_column_labels = set(this_column_labels).intersection( that_column_labels) # 1. Perform the join given two dataframes. combined = combine_frames(this, that, how=how, preserve_order_column=preserve_order_column) # 2. Apply the given function to transform the columns in a batch and keep the new columns. combined_column_labels = combined._internal.column_labels that_columns_to_apply = [] this_columns_to_apply = [] additional_that_columns = [] columns_to_keep = [] column_labels_to_keep = [] for combined_label in combined_column_labels: for common_label in common_column_labels: if combined_label == tuple(["this", *common_label]): this_columns_to_apply.append(combined_label) break elif combined_label == tuple(["that", *common_label]): that_columns_to_apply.append(combined_label) break else: if how == "left" and combined_label in [ tuple(["that", *label]) for label in that_column_labels ]: # In this case, we will drop `that_columns` in `columns_to_keep` but passes # it later to `func`. `func` should resolve it. # Note that adding this into a separate list (`additional_that_columns`) # is intentional so that `this_columns` and `that_columns` can be paired. 
additional_that_columns.append(combined_label) elif fillna: columns_to_keep.append( F.lit(None).cast(FloatType()).alias(str(combined_label))) column_labels_to_keep.append(combined_label) else: columns_to_keep.append( combined._internal.spark_column_for(combined_label)) column_labels_to_keep.append(combined_label) that_columns_to_apply += additional_that_columns # Should extract columns to apply and do it in a batch in case # it adds new columns for example. if len(this_columns_to_apply) > 0 or len(that_columns_to_apply) > 0: kser_set, column_labels_applied = zip(*resolve_func( combined, this_columns_to_apply, that_columns_to_apply)) columns_applied = [c.spark.column for c in kser_set] column_labels_applied = list(column_labels_applied) else: columns_applied = [] column_labels_applied = [] applied = combined[columns_applied + columns_to_keep] applied.columns = pd.MultiIndex.from_tuples(column_labels_applied + column_labels_to_keep, names=combined.columns.names) # 3. Restore the names back and deduplicate columns. this_labels = OrderedDict() # Add columns in an order of its original frame. for this_label in this_column_labels: for new_label in applied._internal.column_labels: if new_label[1:] not in this_labels and this_label == new_label[1:]: this_labels[new_label[1:]] = new_label # After that, we will add the rest columns. other_labels = OrderedDict() for new_label in applied._internal.column_labels: if new_label[1:] not in this_labels: other_labels[new_label[1:]] = new_label kdf = applied[list(this_labels.values()) + list(other_labels.values())] kdf.columns = kdf.columns.droplevel() return kdf
def test_join_with_composite_entity( spark: SparkSession, composite_entity_schema: StructType, rating_feature_schema: StructType, ): entity_data = [ (1001, 8001, datetime(year=2020, month=9, day=1)), (1001, 8002, datetime(year=2020, month=9, day=3)), (1001, 8003, datetime(year=2020, month=9, day=1)), (2001, 8001, datetime(year=2020, month=9, day=2)), ] entity_df = spark.createDataFrame( spark.sparkContext.parallelize(entity_data), composite_entity_schema) feature_table_data = [ ( 1001, 8001, datetime(year=2020, month=9, day=1), datetime(year=2020, month=9, day=1), 3.0, 5.0, ), ( 1001, 8002, datetime(year=2020, month=9, day=1), datetime(year=2020, month=9, day=1), 4.0, 3.0, ), ( 2001, 8001, datetime(year=2020, month=9, day=1), datetime(year=2020, month=9, day=1), 4.0, 4.5, ), ] feature_table_df = spark.createDataFrame( spark.sparkContext.parallelize(feature_table_data), rating_feature_schema, ) feature_table = FeatureTable( name="ratings", features=[ Field("customer_rating", "double"), Field("driver_rating", "double") ], entities=[Field("customer_id", "int32"), Field("driver_id", "int32")], max_age=86400, ) feature_table_df = filter_feature_table_by_time_range( feature_table_df, feature_table, "event_timestamp", entity_df, "event_timestamp", ) joined_df = as_of_join( entity_df, "event_timestamp", feature_table_df, feature_table, ) expected_joined_schema = StructType([ StructField("customer_id", IntegerType()), StructField("driver_id", IntegerType()), StructField("event_timestamp", TimestampType()), StructField("ratings__customer_rating", FloatType()), StructField("ratings__driver_rating", FloatType()), ]) expected_joined_data = [ ( 1001, 8001, datetime(year=2020, month=9, day=1), 3.0, 5.0, ), (1001, 8002, datetime(year=2020, month=9, day=3), None, None), (1001, 8003, datetime(year=2020, month=9, day=1), None, None), ( 2001, 8001, datetime(year=2020, month=9, day=2), 4.0, 4.5, ), ] expected_joined_df = spark.createDataFrame( spark.sparkContext.parallelize(expected_joined_data), expected_joined_schema) assert_dataframe_equal(joined_df, expected_joined_df)
# Write a config file to create the Spark Job """ from pyspark.sql import SparkSession, SQLContext, HiveContext from pyspark.sql.types import StructField, StructType, StringType, IntegerType, FloatType, LongType from pyspark.sql.functions import * #When Loading id as IntegerType, the schema returns null #Load id as LongType and the schema load works data_schema_acquisition = [ StructField("id", LongType(), False), StructField("channel", StringType(), False), StructField("seller", StringType(), False), StructField("interest_rate", FloatType(), False), StructField("balance", LongType(), False), StructField("loan_term", IntegerType(), False), StructField("origination_date", StringType(), False), StructField("first_payment_date", StringType(), False), StructField("ltv", IntegerType(), False), StructField("cltv", IntegerType(), False), StructField("borrower_count", IntegerType(), False), StructField("dti", IntegerType(), False), StructField("borrower_credit_score", IntegerType(), False), StructField("first_time_homebuyer", StringType(), False), StructField("loan_purpose", StringType(), False), StructField("property_type", StringType(), False), StructField("unit_count", IntegerType(), False), StructField("occupancy_status", StringType(), False), StructField("property_state", StringType(), False),
def test_multiple_join( spark: SparkSession, composite_entity_schema: StructType, customer_feature_schema: StructType, driver_feature_schema: StructType, ): entity_data = [ (1001, 8001, datetime(year=2020, month=9, day=2)), (1001, 8002, datetime(year=2020, month=9, day=2)), (2001, 8002, datetime(year=2020, month=9, day=3)), ] entity_df = spark.createDataFrame( spark.sparkContext.parallelize(entity_data), composite_entity_schema) customer_table_data = [ ( 1001, datetime(year=2020, month=9, day=1), datetime(year=2020, month=9, day=1), 100.0, ), ( 2001, datetime(year=2020, month=9, day=1), datetime(year=2020, month=9, day=1), 200.0, ), ] customer_table_df = spark.createDataFrame( spark.sparkContext.parallelize(customer_table_data), customer_feature_schema) customer_table = FeatureTable( name="transactions", features=[Field("daily_transactions", "double")], entities=[Field("customer_id", "int32")], max_age=86400, ) customer_table_df = filter_feature_table_by_time_range( customer_table_df, customer_table, "event_timestamp", entity_df, "event_timestamp", ) driver_table_data = [ ( 8001, datetime(year=2020, month=8, day=31), datetime(year=2020, month=8, day=31), 200, ), ( 8001, datetime(year=2020, month=9, day=1), datetime(year=2020, month=9, day=1), 300, ), ( 8002, datetime(year=2020, month=9, day=1), datetime(year=2020, month=9, day=1), 600, ), ( 8002, datetime(year=2020, month=9, day=1), datetime(year=2020, month=9, day=2), 500, ), ] driver_table_df = spark.createDataFrame( spark.sparkContext.parallelize(driver_table_data), driver_feature_schema) driver_table = FeatureTable( name="bookings", features=[Field("completed_bookings", "int32")], entities=[Field("driver_id", "int32")], max_age=7 * 86400, ) driver_table_df = filter_feature_table_by_time_range( driver_table_df, driver_table, "event_timestamp", entity_df, "event_timestamp", ) joined_df = join_entity_to_feature_tables( entity_df, "event_timestamp", [customer_table_df, driver_table_df], [customer_table, driver_table], ) expected_joined_schema = StructType([ StructField("customer_id", IntegerType()), StructField("driver_id", IntegerType()), StructField("event_timestamp", TimestampType()), StructField("transactions__daily_transactions", FloatType()), StructField("bookings__completed_bookings", IntegerType()), ]) expected_joined_data = [ ( 1001, 8001, datetime(year=2020, month=9, day=2), 100.0, 300, ), ( 1001, 8002, datetime(year=2020, month=9, day=2), 100.0, 500, ), ( 2001, 8002, datetime(year=2020, month=9, day=3), None, 500, ), ] expected_joined_df = spark.createDataFrame( spark.sparkContext.parallelize(expected_joined_data), expected_joined_schema) assert_dataframe_equal(joined_df, expected_joined_df)
#+-------+---------+-----+----+

li = [23, 34, 56]  # list of elements
df.filter(df['column_name'].isin(li))  # Keep rows where the column matches any element of the list
df.filter(df['column_name'].isin(li) == False)  # Keep rows where the column matches no element of the list

import numpy as np
import pyspark.sql.functions as func
from pyspark.sql.functions import col
from pyspark.sql.types import FloatType

def median(values_list):
    med = np.median(values_list)
    return float(med)

udf_median = func.udf(median, FloatType())
df_grouped = df.groupby(['a', 'd']).agg(udf_median(func.collect_list(col('c'))).alias('median'))
df_grouped.show()

#Extract a column from a dataframe to a list
sea_lists = [row[0] for row in dataframe_with_sea.collect()]

#Truncate a column to a fixed number of decimal places
from pyspark.sql.functions import pow, lit
from pyspark.sql.types import LongType

num_places = 3
m = pow(lit(10), num_places).cast(LongType())
df = sc.parallelize([(0.6643, ), (0.6446, )]).toDF(["x"])
df.withColumn("trunc", (col("x") * m).cast(LongType()) / m)
# COMMAND ---------- ratings_df.show() # COMMAND ---------- movie_ratings=ratings_df.drop('timestamp') # COMMAND ---------- # Data type convert from pyspark.sql.types import IntegerType, FloatType movie_ratings = movie_ratings.withColumn("userId", movie_ratings["userId"].cast(IntegerType())) movie_ratings = movie_ratings.withColumn("movieId", movie_ratings["movieId"].cast(IntegerType())) movie_ratings = movie_ratings.withColumn("rating", movie_ratings["rating"].cast(FloatType())) # COMMAND ---------- movie_ratings.show() # COMMAND ---------- # MAGIC %md # MAGIC ### ALS Model Selection and Evaluation # MAGIC # MAGIC With the ALS model, we can use a grid search to find the optimal hyperparameters. # COMMAND ---------- # import package
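# Hedged sketch (not part of the original notebook): a typical ALS + grid-search setup on
# movie_ratings. The column names come from the casts above; the split ratio, seed, and the
# parameter values below are assumptions for illustration.
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

(training, test) = movie_ratings.randomSplit([0.8, 0.2], seed=42)

als = ALS(userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop")

# candidate hyperparameter values to search over
param_grid = (ParamGridBuilder()
              .addGrid(als.rank, [10, 20])
              .addGrid(als.regParam, [0.05, 0.1])
              .build())

evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")

cv = CrossValidator(estimator=als, estimatorParamMaps=param_grid,
                    evaluator=evaluator, numFolds=3)
cv_model = cv.fit(training)

print("Test RMSE:", evaluator.evaluate(cv_model.bestModel.transform(test)))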
def fe_DaysSinceRegistration(df):
    # assuming "ts" and "registration" are epoch timestamps in milliseconds, dividing the
    # difference by 86,400,000 (ms per day) yields the number of days since registration
    days_active = udf(lambda x, y: float((x - y) / (86400000.0)), FloatType())
    df = df.withColumn("DaysSinceRegistration",
                       days_active("ts", "registration"))
    return df