def register_udf(spark):
    udf = spark.udf
    udf.register('get_year_month', get_year_month, returnType=StringType())
    udf.register('filter_shixi', filter_shixi, returnType=BooleanType())
def make_not_terminal_udf():
    """ Return true iff next_action is not an empty map """

    def get_not_terminal(next_action):
        return len(next_action) > 0

    return udf(get_not_terminal, BooleanType())
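# Hedged usage sketch for make_not_terminal_udf: a tiny DataFrame with a
# map-typed next_action column, where the empty map marks a terminal step.
# The local SparkSession and the sample rows are illustrative assumptions,
# not part of the original pipeline.
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").appName("not_terminal_demo").getOrCreate()
demo_df = spark.createDataFrame([({"a": 1.0},), ({},)], ["next_action"])
not_terminal = make_not_terminal_udf()
# rows with a non-empty next_action map are flagged True (not terminal)
demo_df.withColumn("not_terminal", not_terminal("next_action")).show()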
    false for bad alert, and true for good alert.

    Examples
    ----------
    >>> pdf = pd.read_parquet('datatest')
    >>> classification = sso_fink_candidates_(pdf['roid'])
    >>> print(len(pdf[classification]['objectId'].values))
    3
    >>> assert 'ZTF21acqeepb' in pdf[classification]['objectId'].values
    """
    f_roid = roid.astype(int) == 2

    return f_roid


@pandas_udf(BooleanType(), PandasUDFType.SCALAR)
def sso_fink_candidates(roid) -> pd.Series:
    """ Pandas UDF version of sso_fink_candidates_ for Spark

    Parameters
    ----------
    roid: Spark DataFrame Column
        Column containing the Solar System label

    Returns
    ----------
    out: pandas.Series of bool
        Return a Pandas DataFrame with the appropriate flag:
        false for bad alert, and true for good alert.
    """
StructField("coolantTemp", FloatType(), False), StructField("intakeAirTemp", FloatType(), False), StructField("intakeAirFlowSpeed", FloatType(), False), StructField("batteryPercentage", FloatType(), False), StructField("batteryVoltage", FloatType(), False), StructField("speed", FloatType(), False), StructField("engineVibrationAmplitude", FloatType(), False), StructField("throttlePos", FloatType(), False), StructField("tirePressure11", FloatType(), False), StructField("tirePressure12", FloatType(), False), StructField("tirePressure21", FloatType(), False), StructField("tirePressure22", FloatType(), False), StructField("accelerometer11Value", FloatType(), False), StructField("accelerometer12Value", FloatType(), False), StructField("accelerometer21Value", FloatType(), False), StructField("accelerometer22Value", FloatType(), False), StructField("controlUnitFirmware", IntegerType(), False), StructField("failureOccurred", BooleanType(), False) ]) # parsing the JSONs based on the schema unlabeled = rows.map(lambda x: json.loads(x[1], schema)).drop( "failureOccurred") # make the predictions predictionsDF = model.transform(unlabeled) # start the streaming ssc.start() ssc.awaitTermination()
return partial # Boilerplate for generating example main_summary tables def generate_search_count(engine='google', source='urlbar', count=4): return { 'engine': engine, 'source': source, 'count': count, } addons_type = ArrayType(StructType([ StructField('addon_id', StringType(), False), StructField('blocklisted', BooleanType(), True), StructField('name', StringType(), True), StructField('user_disabled', BooleanType(), True), StructField('app_disabled', BooleanType(), True), StructField('version', StringType(), True), StructField('scope', LongType(), True), StructField('type', StringType(), True), StructField('foreign_install', BooleanType(), True), StructField('has_binary_components', BooleanType(), True), StructField('install_day', LongType(), True), StructField('update_day', LongType(), True), StructField('signed_state', LongType(), True), StructField('is_system', BooleanType(), True), StructField('is_web_extension', BooleanType(), True), StructField('multiprocess_compatible', BooleanType(), True), ]))
schema = StructType() \
    .add("RecordNumber", IntegerType(), True) \
    .add("Zipcode", IntegerType(), True) \
    .add("ZipCodeType", StringType(), True) \
    .add("City", StringType(), True) \
    .add("State", StringType(), True) \
    .add("LocationType", StringType(), True) \
    .add("Lat", DoubleType(), True) \
    .add("Long", DoubleType(), True) \
    .add("Xaxis", IntegerType(), True) \
    .add("Yaxis", DoubleType(), True) \
    .add("Zaxis", DoubleType(), True) \
    .add("WorldRegion", StringType(), True) \
    .add("Country", StringType(), True) \
    .add("LocationText", StringType(), True) \
    .add("Location", StringType(), True) \
    .add("Decommisioned", BooleanType(), True) \
    .add("TaxReturnsFiled", StringType(), True) \
    .add("EstimatedPopulation", IntegerType(), True) \
    .add("TotalWages", IntegerType(), True) \
    .add("Notes", StringType(), True)

df_with_schema = spark.read.format("csv") \
    .option("header", True) \
    .schema(schema) \
    .load("C:/apps/sparkbyexamples/src/pyspark-examples/resources/zipcodes.csv")
df_with_schema.printSchema()

df2.write.option("header", True) \
    .csv("/tmp/spark_output/zipcodes123")
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, to_json, col, unbase64, base64, split, expr
from pyspark.sql.types import StructField, StructType, StringType, BooleanType, ArrayType, DateType

# TO-DO: create a StructType for the Kafka redis-server topic which has all changes made to Redis - before Spark 3.0.0, schema inference is not automatic
redisServerSchema = StructType([
    StructField("key", StringType()),
    StructField("value", StringType()),
    StructField("expiredType", StringType()),
    StructField("expiredValue", StringType()),
    StructField("existType", StringType()),
    StructField("ch", StringType()),
    StructField("incr", BooleanType()),
    StructField(
        "zSetEntries",
        ArrayType(
            StructType([
                StructField("element", StringType()),
                StructField("score", StringType())
            ])))
])

# TO-DO: create a StructType for the Customer JSON that comes from Redis - before Spark 3.0.0, schema inference is not automatic
customersSchema = StructType([
    StructField("customerName", StringType()),
    StructField("email", StringType()),
    StructField("phone", StringType()),
    StructField("birthDay", StringType())
])

# TO-DO: create a StructType for the Kafka stedi-events topic which has the Customer Risk JSON that comes from Redis - before Spark 3.0.0, schema inference is not automatic
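# Hedged sketch for the TO-DO above: one possible StructType for the stedi-events
# Customer Risk JSON. The field names (customer, score, riskDate) and their types
# are assumptions about that payload, not taken from this file.
customerRiskSchema = StructType([
    StructField("customer", StringType()),
    StructField("score", StringType()),
    StructField("riskDate", DateType())
])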
def test_as_spark_type_pandas_on_spark_dtype(self): type_mapper = { # binary np.character: (np.character, BinaryType()), np.bytes_: (np.bytes_, BinaryType()), np.string_: (np.bytes_, BinaryType()), bytes: (np.bytes_, BinaryType()), # integer np.int8: (np.int8, ByteType()), np.byte: (np.int8, ByteType()), np.int16: (np.int16, ShortType()), np.int32: (np.int32, IntegerType()), np.int64: (np.int64, LongType()), np.int: (np.int64, LongType()), int: (np.int64, LongType()), # floating np.float32: (np.float32, FloatType()), np.float: (np.float64, DoubleType()), np.float64: (np.float64, DoubleType()), float: (np.float64, DoubleType()), # string np.str: (np.unicode_, StringType()), np.unicode_: (np.unicode_, StringType()), str: (np.unicode_, StringType()), # bool np.bool: (np.bool, BooleanType()), bool: (np.bool, BooleanType()), # datetime np.datetime64: (np.datetime64, TimestampType()), datetime.datetime: (np.dtype("datetime64[ns]"), TimestampType()), # DateType datetime.date: (np.dtype("object"), DateType()), # DecimalType decimal.Decimal: (np.dtype("object"), DecimalType(38, 18)), # ArrayType np.ndarray: (np.dtype("object"), ArrayType(StringType())), List[bytes]: (np.dtype("object"), ArrayType(BinaryType())), List[np.character]: (np.dtype("object"), ArrayType(BinaryType())), List[np.bytes_]: (np.dtype("object"), ArrayType(BinaryType())), List[np.string_]: (np.dtype("object"), ArrayType(BinaryType())), List[bool]: (np.dtype("object"), ArrayType(BooleanType())), List[np.bool]: (np.dtype("object"), ArrayType(BooleanType())), List[datetime.date]: (np.dtype("object"), ArrayType(DateType())), List[np.int8]: (np.dtype("object"), ArrayType(ByteType())), List[np.byte]: (np.dtype("object"), ArrayType(ByteType())), List[decimal.Decimal]: (np.dtype("object"), ArrayType(DecimalType(38, 18))), List[float]: (np.dtype("object"), ArrayType(DoubleType())), List[np.float]: (np.dtype("object"), ArrayType(DoubleType())), List[np.float64]: (np.dtype("object"), ArrayType(DoubleType())), List[np.float32]: (np.dtype("object"), ArrayType(FloatType())), List[np.int32]: (np.dtype("object"), ArrayType(IntegerType())), List[int]: (np.dtype("object"), ArrayType(LongType())), List[np.int]: (np.dtype("object"), ArrayType(LongType())), List[np.int64]: (np.dtype("object"), ArrayType(LongType())), List[np.int16]: (np.dtype("object"), ArrayType(ShortType())), List[str]: (np.dtype("object"), ArrayType(StringType())), List[np.unicode_]: (np.dtype("object"), ArrayType(StringType())), List[datetime.datetime]: (np.dtype("object"), ArrayType(TimestampType())), List[np.datetime64]: (np.dtype("object"), ArrayType(TimestampType())), # CategoricalDtype CategoricalDtype(categories=["a", "b", "c"]): ( CategoricalDtype(categories=["a", "b", "c"]), LongType(), ), } for numpy_or_python_type, (dtype, spark_type) in type_mapper.items(): self.assertEqual(as_spark_type(numpy_or_python_type), spark_type) self.assertEqual(pandas_on_spark_type(numpy_or_python_type), (dtype, spark_type)) with self.assertRaisesRegex(TypeError, "Type uint64 was not understood."): as_spark_type(np.dtype("uint64")) with self.assertRaisesRegex(TypeError, "Type object was not understood."): as_spark_type(np.dtype("object")) with self.assertRaisesRegex(TypeError, "Type uint64 was not understood."): pandas_on_spark_type(np.dtype("uint64")) with self.assertRaisesRegex(TypeError, "Type object was not understood."): pandas_on_spark_type(np.dtype("object"))
def process_gdelt_data(spark, datestr):
    """
    Processes the gdelt dataset by day. Transforms datatypes, adds a month
    column, and reduces the dataset to the necessary columns.
    Maps the country code to the ISO code which is used in the google and
    oxford datasets. Determines if the url contains covid related content.
    Writes the cleaned data back to S3 partitioned by country code.

    spark: spark session
    datestr: day to be processed, needs to be a string in the format YYYYMMDD
    """
    # get schema of gdelt data
    schema = schema_helper.gdelt_s0_schema

    # read in original data from S3
    gdelt_file = 'gdelt/' + datestr + '.export.CSV'
    df_gdelt = spark.read.option("delimiter", "\t") \
        .csv(folder_s0 + gdelt_file, header=False, schema=schema)

    # convert date column to date type
    datefunc = F.udf(lambda x: datetime.strptime(x, '%Y%m%d'), DateType())
    df_gdelt = df_gdelt.withColumn('date', datefunc(col('SQLDATE')))

    # create month column
    df_gdelt = df_gdelt.withColumn("month", F.month("date"))

    # reduce dataset by selecting specific columns
    df_gdelt_reduced = df_gdelt.select(
        "GLOBALEVENTID", "date", "month", "Year",
        "Actor1Code", "Actor1Name", "Actor1CountryCode",
        "Actor2Code", "Actor2Name", "Actor2CountryCode",
        "IsRootEvent", "EventCode", "EventBaseCode", "EventRootCode",
        "QuadClass", "GoldsteinScale", "NumMentions", "NumSources",
        "NumArticles", "AvgTone", "ActionGeo_Type",
        "ActionGeo_CountryCode", "ActionGeo_ADM1Code", "SOURCEURL")

    # determine if the url contains covid related content
    # and create column covid (True = contains keywords)
    covidFunc = F.udf(lambda x: containsCovidContent(x), BooleanType())
    df_gdelt_reduced = df_gdelt_reduced. \
        withColumn('covid', covidFunc(col('SOURCEURL')))

    # rename column Year to year
    df_gdelt_reduced = df_gdelt_reduced.withColumnRenamed("Year", "year")

    # map the FIPS based country code to ISO
    df_mapping = read_fips2iso_mapping(spark)
    df_gdelt_reduced = df_gdelt_reduced \
        .join(df_mapping, on=['ActionGeo_CountryCode'], how='left')

    # write data back to S3 partitioned by country_code
    df_gdelt_reduced.write.mode('overwrite') \
        .partitionBy("country_code") \
        .parquet(folder_s1 + "gdelt/gdelt.parquet")
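# Optional alternative to the strptime UDF in process_gdelt_data (a hedged,
# self-contained sketch): Spark's built-in to_date parses yyyyMMdd strings
# without Python UDF overhead. The local SparkSession and sample value below
# are illustrative assumptions.
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.master("local[1]").appName("to_date_demo").getOrCreate()
demo = spark.createDataFrame([("20200315",)], ["SQLDATE"])
demo.withColumn("date", F.to_date(F.col("SQLDATE"), "yyyyMMdd")).show()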
def main(context): """Main function takes a Spark SQL context.""" # skips to task 10 if results from 9 are stored if os.path.isfile("full_sentiment_data.parquet/._SUCCESS.crc"): full_sentiment_data = context.read.parquet( "full_sentiment_data.parquet") else: # TASK 1 if os.path.isfile("comments.parquet/._SUCCESS.crc") and os.path.isfile( "submissions.parquet/._SUCCESS.crc") and os.path.isfile( "labeled_data.parquet/._SUCCESS.crc"): # print("WE HERE") comments = context.read.parquet("comments.parquet") submissions = context.read.parquet("submissions.parquet") labeled_data = context.read.parquet("labeled_data.parquet") else: comments = context.read.json("comments-minimal.json.bz2") comments.write.parquet("comments.parquet") submissions = context.read.json("submissions.json.bz2") submissions.write.parquet("submissions.parquet") labeled_data = context.read.csv("labeled_data.csv", header='true') labeled_data.write.parquet("labeled_data.parquet") # Create temporary views for later comments.createGlobalTempView("comments") labeled_data.createGlobalTempView("labeled_data") submissions.createGlobalTempView("submissions") # TASK 4 context.registerFunction("sanitize", modified_sanitize, ArrayType(StringType())) # TASK 5 if os.path.isfile("joined_data.parquet/._SUCCESS.crc"): joined_data = context.read.parquet("joined_data.parquet") else: joined_data = generate_joined_data(labeled_data, comments, context) joined_data.write.parquet("joined_data.parquet") # code to run sanitize on joined data if os.path.isfile("ngrams.parquet/._SUCCESS.crc"): ngrams = context.read.parquet("ngrams.parquet") else: joined_data.createOrReplaceTempView("joined_data") ngram_sql = """ SELECT Input_id, labeldem, labelgop, labeldjt, sanitize(body) AS body FROM joined_data""" ngrams = context.sql(ngram_sql) ngrams.write.parquet("ngrams.parquet") # TASK 6A # REFERENCE: # https://spark.apache.org/docs/latest/ml-features.html#countvectorizer # couldn't figure out how to save this simply, so it always has to be ran? Ah well, was fast to run ¯\_(ツ)_/¯ vectorizer = CountVectorizer(inputCol="body", outputCol="features", minDF=MIN_DF, binary=True) model = vectorizer.fit(ngrams) # TASK 6B if os.path.isdir("result.parquet"): result = context.read.parquet("result.parquet") else: result = model.transform(ngrams) result.write.parquet("result.parquet") if os.path.isdir("sentiment_data.parquet"): sentiment_data = context.read.parquet("sentiment_data.parquet") else: djt_sentiment_sql = """ SELECT *, if (labeldjt = 1, 1, 0) AS pos_label, if (labeldjt = -1, 1, 0) AS neg_label FROM result""" result.createOrReplaceTempView("result") sentiment_data = context.sql(djt_sentiment_sql) sentiment_data.write.parquet("sentiment_data.parquet") # sentiment_data.show() # TASK 7 if os.path.isfile( "project2/pos.model/bestModel/data/._SUCCESS.crc" ) and os.path.isfile( "project2/neg.model/bestModel/data/._SUCCESS.crc"): pos_model = CrossValidatorModel.load("project2/pos.model") neg_model = CrossValidatorModel.load("project2/neg.model") else: # Initialize two logistic regression models. # Replace labelCol with the column containing the label, and featuresCol with the column containing the features. poslr = LogisticRegression(labelCol="pos_label", featuresCol="features", maxIter=10) neglr = LogisticRegression(labelCol="neg_label", featuresCol="features", maxIter=10) # This is a binary classifier so we need an evaluator that knows how to deal with binary classifiers. 
posEvaluator = BinaryClassificationEvaluator(labelCol="pos_label") negEvaluator = BinaryClassificationEvaluator(labelCol="neg_label") # There are a few parameters associated with logistic regression. We do not know what they are a priori. # We do a grid search to find the best parameters. We can replace [1.0] with a list of values to try. # We will assume the parameter is 1.0. Grid search takes forever. posParamGrid = ParamGridBuilder().addGrid(poslr.regParam, [1.0]).build() negParamGrid = ParamGridBuilder().addGrid(neglr.regParam, [1.0]).build() # We initialize a 5 fold cross-validation pipeline. posCrossval = CrossValidator(estimator=poslr, evaluator=posEvaluator, estimatorParamMaps=posParamGrid, numFolds=5) negCrossval = CrossValidator(estimator=neglr, evaluator=negEvaluator, estimatorParamMaps=negParamGrid, numFolds=5) # Although crossvalidation creates its own train/test sets for # tuning, we still need a labeled test set, because it is not # accessible from the crossvalidator (argh!) # Split the data 50/50 posTrain, posTest = sentiment_data.randomSplit([0.5, 0.5]) negTrain, negTest = sentiment_data.randomSplit([0.5, 0.5]) # Train the models print("Training positive classifier...") pos_model = posCrossval.fit(posTrain) print("Training negative classifier...") neg_model = negCrossval.fit(negTrain) # Once we train the models, we don't want to do it again. We can save the models and load them again later. pos_model.save("project2/pos.model") neg_model.save("project2/neg.model") # TASK 8 # had to downsample because of RAM issues, seemed like the correct place to do it albeit a little redundant reloading # (Windows ate up way too much RAM on the desktop we used, and both of our laptops are too much of potatoes to run any of this) comments = context.read.parquet("comments.parquet").sample( False, 0.2, None) submissions = context.read.parquet("submissions.parquet").sample( False, 0.2, None) comments.createOrReplaceTempView("comments") submissions.createOrReplaceTempView("submissions") full_comments_data = generate_full_comments_data( submissions, comments, context) # full_comments_data.filter("state is not null").show() # TASK 9 # task 4 redone # reregisters function in case of exception not happening for previous task 4 context.registerFunction("sanitize", modified_sanitize, ArrayType(StringType())) # task 5 redone sanitized_full_comments = generate_sanitized_full_comments( context, full_comments_data) # sanitized_full_comments.show() # task 6A result_full_data = model.transform(sanitized_full_comments) # result_full_data.show() # classification part of task 9 pos_result = pos_model.transform(result_full_data) # pos_result.show() neg_result = neg_model.transform(result_full_data) # neg_result.show() # probability threshold application from task 9 context.registerFunction("first_element", lambda x: float(x[1]), FloatType()) threshold_sql = """ SELECT a.comment_id AS comment_id, a.submission_id AS submission_id, a.timestamp AS timestamp, a.state AS state, a.title AS title, if (first_element(a.probability) > 0.2, 1, 0) AS pos, if (first_element(b.probability) > 0.25, 1, 0) AS neg, a.comment_score AS comment_score, a.submission_score AS submission_score FROM pos_result a INNER JOIN neg_result b ON a.comment_id = b.comment_id """ pos_result.createOrReplaceTempView("pos_result") neg_result.createOrReplaceTempView("neg_result") # pos_result.printSchema() # full_sentiment_data = context.sql(threshold_sql).explain() full_sentiment_data = context.sql(threshold_sql) 
full_sentiment_data.write.parquet("full_sentiment_data.parquet") # full_sentiment_data.show() # full_sentiment_data.show(20, False) # exit(1) # TASK 10 # part 1 percent_sql = """ SELECT AVG(pos) * 100.0 AS Positive, AVG(neg) * 100.0 AS Negative FROM full_sentiment_data""" full_sentiment_data.createOrReplaceTempView("full_sentiment_data") task10_1 = context.sql(percent_sql) # task10_1.show() if os.path.isdir("raw_percentages.csv"): shutil.rmtree("raw_percentages.csv") task10_1.repartition(1).write.format("com.databricks.spark.csv").option( "header", "true").save("raw_percentages.csv") # part 2 percent_by_day_sql = """ SELECT FROM_UNIXTIME(timestamp, 'YYYY-MM-dd') AS date, AVG(pos) * 100.0 AS Positive, AVG(neg) * 100.0 AS Negative FROM full_sentiment_data GROUP BY date ORDER BY date""" # full_sentiment_data.createOrReplaceTempView("full_sentiment_data") task10_2 = context.sql(percent_by_day_sql) # task10_2.show() if os.path.isdir("time_data.csv"): shutil.rmtree("time_data.csv") task10_2.repartition(1).write.format("com.databricks.spark.csv").option( "header", "true").save("time_data.csv") # Task 3 context.registerFunction("valid_state", lambda x: x in STATES, BooleanType()) task10_3_sql = """ SELECT state AS state, AVG(pos) * 100.0 AS Positive, AVG(neg) * 100.0 AS Negative FROM full_sentiment_data WHERE (valid_state(state)) GROUP BY state """ # full_sentiment_data.createOrReplaceTempView("full_sentiment_data") task10_3 = context.sql(task10_3_sql) if os.path.isdir("state_data.csv"): shutil.rmtree("state_data.csv") task10_3.repartition(1).write.format("com.databricks.spark.csv").option( "header", "true").save("state_data.csv") # part 4 task_10_4_sql_submission = """ SELECT submission_score, AVG(pos) * 100.0 AS Positive, AVG(neg) * 100.0 AS Negative FROM full_sentiment_data GROUP BY submission_score """ task_10_4_sql_comment = """ SELECT comment_score, AVG(pos) * 100.0 AS Positive, AVG(neg) * 100.0 AS Negative FROM full_sentiment_data GROUP BY comment_score """ # full_sentiment_data.createOrReplaceTempView("full_sentiment_data") percent_submission = context.sql(task_10_4_sql_submission) # full_sentiment_data.createOrReplaceTempView("full_sentiment_data") percent_comment = context.sql(task_10_4_sql_comment) if os.path.isdir("submission_score.csv"): shutil.rmtree("submission_score.csv") percent_submission.repartition(1).write.format( "com.databricks.spark.csv").option("header", "true").save("submission_score.csv") if os.path.isdir("comment_score.csv"): shutil.rmtree("comment_score.csv") percent_comment.repartition(1).write.format( "com.databricks.spark.csv").option("header", "true").save("comment_score.csv") # FOR 4 in, to get the top ten pos and neg submissions top_pos_submissions_sql = """ SELECT submission_id, submission_score, AVG(pos) * 100.0 AS Positive, AVG(neg) * 100.0 AS Negative FROM full_sentiment_data GROUP BY submission_score, submission_id ORDER BY Positive DESC LIMIT 10 """ top_neg_submissions_sql = """ SELECT submission_id, submission_score, AVG(pos) * 100.0 AS Positive, AVG(neg) * 100.0 AS Negative FROM full_sentiment_data GROUP BY submission_score, submission_id ORDER BY Negative DESC LIMIT 10 """ # full_sentiment_data.show() top_pos_submissions = context.sql(top_pos_submissions_sql) # top_pos_submissions.show() top_neg_submissions = context.sql(top_neg_submissions_sql) # top_neg_submissions.show() if os.path.isdir("top_pos_submissions.csv"): shutil.rmtree("top_pos_submissions.csv") top_pos_submissions.repartition(1).write.format( "com.databricks.spark.csv").option( "header", 
"true").save("top_pos_submissions.csv") if os.path.isdir("top_neg_submissions.csv"): shutil.rmtree("top_neg_submissions.csv") top_neg_submissions.repartition(1).write.format( "com.databricks.spark.csv").option( "header", "true").save("top_neg_submissions.csv")
StructField("medium", StringType(), True), StructField("campaign", StringType(), True), StructField("content", StringType(), True) ]), True), StructField("channel", StringType(), True), StructField("client_id", StringType(), True), StructField("country", StringType(), True), StructField("default_search_engine", StringType(), True), StructField("distribution_id", StringType(), True), StructField("locale", StringType(), True), StructField("normalized_channel", StringType(), True), StructField("profile_creation_date", LongType(), True), StructField("submission_date_s3", StringType(), False), StructField("subsession_length", LongType(), True), StructField("subsession_start_date", StringType(), True), StructField("sync_configured", BooleanType(), True), StructField("sync_count_desktop", IntegerType(), True), StructField("sync_count_mobile", IntegerType(), True), StructField("timestamp", LongType(), True), StructField(SPBE + "total_uri_count", IntegerType(), True), StructField(SPBE + "unique_domains_count", IntegerType(), True) ]) default_sample = { "app_version": "57.0.0", "attribution": { "source": "source-value", "medium": "medium-value", "campaign": "campaign-value", "content": "content-value" },
STRING = 'string'
INTEGER = 'integer'
DOUBLE = 'double'
BOOLEAN = 'boolean'
TIMESTAMP = 'timestamp'

try:
    from pyspark.sql.types import (StructType, StructField, StringType,
                                   IntegerType, DoubleType, BooleanType)

    SPARK_DTYPE_MAPPING = {
        STRING: StringType(),
        INTEGER: IntegerType(),
        DOUBLE: DoubleType(),
        BOOLEAN: BooleanType()
    }
except (ModuleNotFoundError, NameError):
    pass

MODELS = {
    'SubmissionProducer': {
        'schema': {
            'id': STRING,
            'subreddit': STRING,
            'subreddit_subscribers': INTEGER,
            'title': STRING,
            'author': STRING,
            'created_utc': DOUBLE,
            'over_18': BOOLEAN,
            'selftext': STRING
        },
import requests
import boto3

bucket = "podcast-mp3-bucket"


# wrap the download logic in the function referenced by the UDF below
def download_to_s3(url, key):
    try:
        s3 = boto3.resource('s3')
        mp3data = requests.get(url).content
        s3Object = s3.Object(bucket, key)
        s3Object.put(Body=mp3data)
        # success
        return True
    except Exception:
        # error
        return False


url_to_s3_udf = udf(lambda x, z: download_to_s3(x, z), BooleanType())

# Get all podcast episodes in Elasticsearch
dfES = spark.read.format('org.elasticsearch.spark.sql')\
    .option('es.nodes', '10.0.0.6:9200, 10.0.0.14:9200, 10.0.0.10:9200')\
    .option('es.resource', "podcast_v1")\
    .option('es.read.metadata', 'true')\
    .load()

dfES = dfES.select(
    col("_metadata").getItem("_id").alias("idES"),
    col("downloaded"),
    col("audiourl"))

# Get list of MP3 files in S3
dfS3 = spark.read.format("binaryFile")\
    .option("pathGlobFilter", "*")\
# init spark with 2 shuffle partitions
spark = SparkSession.builder.master("local").appName("HouseScrape").config(
    "spark.sql.shuffle.partitions", 2).config(
        'spark.sql.warehouse.dir', 'file:///C:/path/to/my/').getOrCreate()

# read data from certain csv files
lands = spark.read.format('csv').option('header', 'true').load('Downloads/*')

# drop the empty data
lands = lands.na.drop()

# filter the data
lands = lands.filter((lands["主要用途"] == "住家用")
                     & (lands["建物型態"].contains("住宅大樓")))

# filter the data with udf
filter_udf = udf(filter_13, BooleanType())
lands = lands.filter(filter_udf(lands["總樓層數"]))

# create a new column for city name
lands = lands.withColumn("縣市名稱", (lands["土地區段位置建物區段門牌"]).substr(1, 3))

# select the columns that match the requirement
lands = lands.select(lands["縣市名稱"], lands['交易年月日'], lands['鄉鎮市區'],
                     lands['建物型態'])

# rename the columns for further requirements
lands = lands.withColumnRenamed('縣市名稱', 'city') \
    .withColumnRenamed('交易年月日', 'date') \
    .withColumnRenamed('鄉鎮市區', 'district') \
    .withColumnRenamed('建物型態', 'building_state')

# convert the date from the Taiwanese (ROC) calendar to the AD date via udf
converter_udf = udf(covert_date, StringType())
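# Hedged sketch of what the covert_date helper referenced above might look like:
# the dataset's 交易年月日 values are assumed to be ROC-calendar strings such as
# '1080101', where the ROC year is the AD year minus 1911. The exact input
# format is an assumption, not confirmed by this script.
def covert_date_sketch(tw_date):
    if not tw_date or len(tw_date) < 6:
        return None
    year = int(tw_date[:-4]) + 1911  # ROC year -> AD year
    return "{}-{}-{}".format(year, tw_date[-4:-2], tw_date[-2:])

# e.g. covert_date_sketch('1080315') -> '2019-03-15'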
from pyspark.sql.functions import isnan, when, count, col

business_df.select([
    count(when(isnan(c) | col(c).isNull(), c)).alias(c)
    for c in business_df.columns
]).show()

# Drop rows with null values in categories column
business_df = business_df.dropna(subset='categories')
# business_df.show(5)

# Only keeping the businesses that are restaurants or are food related
from pyspark.sql.functions import udf
from pyspark.sql.types import BooleanType


@udf(returnType=BooleanType())
def filter_restaurants(col):
    for category in col.split(','):
        if 'restaurant' in category.lower() or 'food' in category.lower():
            return True
    return False


@udf(returnType=BooleanType())
def filter_non_restaurants(col):
    category = ' '.join(col.split(','))
    if 'restaurant' not in category.lower() and 'food' not in category.lower():
        return True
    return False
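# Hedged usage sketch: splitting business_df with the two category UDFs above.
# business_df and its 'categories' column come from the surrounding notebook;
# the result variable names are illustrative.
restaurants_df = business_df.filter(filter_restaurants(col('categories')))
non_restaurants_df = business_df.filter(filter_non_restaurants(col('categories')))
# restaurants_df.count(), non_restaurants_df.count()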
    'USAWV', 'USAWY', 'USSR', 'UZBK', 'VANU', 'VCAN', 'VEN', 'VI', 'VIETN',
    'WAFR', 'WALLIS', 'WASIA', 'WEEC', 'WEIND', 'WESTW', 'WEUR', 'WORLD',
    'WSOMOA', 'YEMAR', 'YUG', 'ZAIRE', 'ZAMBIA', 'ZIMBAB'
])

Filter = Filter_regions.transform(limit_df)
Result_df = Filter.select('DocID', 'Topics').repartition(4)


def inter(a, b):
    match = set(a).issubset(set(b))
    return match


inter_udf = udf(inter, BooleanType())


def threshold(value):
    if value == True:
        return 1
    else:
        return 0


threshold_udf = udf(threshold, IntegerType())

df1 = Result_df.join(
    Result_df.alias("Result_df1").select(
        col("DocID").alias("DocID2"), col("Topics").alias("Topics2")),
    col("DocID") < col("DocID2"), 'inner') \
    .withColumn('Intersect_Score', inter_udf(col('Topics'), col('Topics2'))) \
    .withColumn('True_match', threshold_udf(col('Intersect_Score')))
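# Design note (hedged): the Boolean-to-integer step handled by threshold_udf
# above can also be done without a Python UDF by casting the Boolean column.
# The self-contained sketch below only demonstrates that cast on toy data; the
# local SparkSession is an illustrative assumption.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType

spark = SparkSession.builder.master("local[1]").appName("bool_cast_demo").getOrCreate()
spark.createDataFrame([(True,), (False,)], ["Intersect_Score"]) \
    .withColumn("True_match", col("Intersect_Score").cast(IntegerType())) \
    .show()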
SCHEMA_TRANSACTIONS = StructType([
    StructField('dt', DateType()),
    StructField('payer_account', StringType()),
    StructField('beneficiary_account', StringType()),
    StructField('amount', DoubleType())
])

SCHEMA_ACCOUNT_INFO = StructType([
    StructField('account', StringType()),
    StructField('name', StringType()),
    StructField('country', StringType())
])

SCHEMA_COUNTRIES = StructType([
    StructField('country', StringType()),
    StructField('allowed', BooleanType())
])

ACCOUNT_INFO_ROWS = [("NL99INGB9999999999", "John Muller BV", "NL"),
                     ("NL88RABO8888888888", "Kris Geusebroek NV", "NL"),
                     ("NL29ABNA5612457383", "Super mooie laptops BV", "NL"),
                     ("BE59587979732526", "Ahmet Erdem Belgian Investment", "BE"),
                     ("BE31199386628955", "Vlaamse Patat", "BE"),
                     ("BE29587431928864", "Gauffre Belgique", "BE"),
                     ("PL84109024029551596171791699", "Polski Beat", "PL"),
                     ("PL75109024026862879594797792", "Zywiec", "PL"),
                     ("NK1", "Kim Jong Un Industries", "NK"),
                     ("NK2", "Kim Jong Un Investment", "NK")]


def generate_transactions(number):
from pyspark.sql.types import BooleanType
import re


def regex_filter(x):
    regexs = [r'\d+']
    if x and x.strip():
        for r in regexs:
            if re.match(r, x, re.IGNORECASE):
                return True
    return False


filter_udf = udf(regex_filter, BooleanType())

data_filter = data.filter(filter_udf(data.listing_id))

# COMMAND ----------

# Create separate dataframes based on the outliers in room pricing:
# data_filter_pos --> greater than average price
# data_filter_neg --> lower than average price
data_filter_pos = data_filter.where(
    col("listing_id").isin([
        "7921556", "18479564", "10452642", "14859885", "6794333", "12382366",
        "3629096", "16031982", "17494091", "7330060"
    ]))

data_filter_neg = data_filter.where(
"short": "short", "binary": "binary", "null": "null" # "vector": "vector" } SPARK_DTYPES_DICT = {"string": StringType, "int": IntegerType, "float": FloatType, "double": DoubleType, "boolean": BooleanType, "struct": StructType, "array": ArrayType, "bigint": LongType, "date": DateType, "byte": ByteType, "short": ShortType, "datetime": TimestampType, "binary": BinaryType, "null": NullType } SPARK_DTYPES_DICT_OBJECTS = \ {"string": StringType(), "int": IntegerType(), "float": FloatType(), "double": DoubleType(), "boolean": BooleanType(), "struct": StructType(), "array": ArrayType(StringType()), "bigint": LongType(), "date": DateType(), "byte": ByteType(), "short": ShortType(), "datetime": TimestampType(), "binary": BinaryType(), "null": NullType() } # Profiler PROFILER_TYPES = {"int", "float", "string", "bool", "date", "null", "array", "double"} PROFILER_LEGEND_TYPES = {"string": "ABC", "int": "#", "integer": "#", "float": "##.#", "double": "##.#", "bigint": "#"} PROFILER_COLUMN_TYPES = {"categorical", "numeric", "date", "bool", "null", "array"} # Strings and Function Messages JUST_CHECKING = "Just check that Spark and all necessary environments vars are present..." STARTING_SPARK = "Starting or getting SparkSession and SparkContext..." STARTING_OPTIMUS = "Transform and Roll out..." SUCCESS = "Optimus successfully imported. Have fun :)."
lambdaDF.show()
lambdaDF.count()

# Python lambdas cannot be used directly in a DataFrame query; they must be
# wrapped in a Spark _User Defined Function_ (UDF). A UDF is a special wrapper
# around a function, allowing the function to be used in a DataFrame query,
# and it requires both the function and the return type to be defined.

# In[ ]:

from pyspark.sql.types import BooleanType
from pyspark.sql.functions import udf

less_ten = udf(lambda s: s < 10, BooleanType())
lambdaDF = subDF.filter(less_ten(subDF.age))
lambdaDF.show()
lambdaDF.count()

# Let's try another example below.

# In[ ]:

# Let's collect the even values less than 10
even = udf(lambda s: s % 2 == 0, BooleanType())
evenDF = lambdaDF.filter(even(lambdaDF.age))
evenDF.show()
evenDF.count()
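# In[ ]:

# Hedged alternative to the udf(...) calls above: the same filter written with
# the @udf decorator. subDF is the DataFrame used in the preceding cells; the
# function name is illustrative.
from pyspark.sql.functions import udf
from pyspark.sql.types import BooleanType

@udf(returnType=BooleanType())
def less_ten_dec(s):
    return s < 10

subDF.filter(less_ten_dec(subDF.age)).count()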
This script is used to:
    1. Generate predictions for a set of reviews using a given lexicon.
    2. Find the following metrics given the true values and predictions:
       Accuracy, Precision, Recall, F1 Score
    3. Store the values for all combinations in a CSV.
"""
from itertools import product

from gcloud import storage
from pyspark.sql.functions import concat_ws, split, explode, sum, avg, udf
from pyspark.sql.types import StructType, StringType, DoubleType, BooleanType


def score_to_sentiment(score):
    return score >= 0


scoreUDF = udf(score_to_sentiment, BooleanType())


def generate_predictions(review_parquet, lexicon_csv):
    # TODO: Check if columns exist
    amazon = spark.read.parquet(review_parquet)

    # TODO: Change lexicon structure so that 'review_word' is used instead of 'word'
    struct = StructType().add(field="word", data_type=StringType()).add(
        field="score", data_type=DoubleType())
    words = spark.read.csv(lexicon_csv, schema=struct)

    # TODO: Should we create reviewID while processing it (instead of here) to be on the safer side?
    amazon = amazon.withColumn('reviewID',
                               concat_ws('-', amazon.asin, amazon.reviewerID))

    exploded_words = amazon.withColumn(
def test_fillna(self): schema = StructType( [ StructField("name", StringType(), True), StructField("age", IntegerType(), True), StructField("height", DoubleType(), True), StructField("spy", BooleanType(), True), ] ) # fillna shouldn't change non-null values row = self.spark.createDataFrame([("Alice", 10, 80.1, True)], schema).fillna(50).first() self.assertEqual(row.age, 10) # fillna with int row = self.spark.createDataFrame([("Alice", None, None, None)], schema).fillna(50).first() self.assertEqual(row.age, 50) self.assertEqual(row.height, 50.0) # fillna with double row = self.spark.createDataFrame([("Alice", None, None, None)], schema).fillna(50.1).first() self.assertEqual(row.age, 50) self.assertEqual(row.height, 50.1) # fillna with bool row = self.spark.createDataFrame([("Alice", None, None, None)], schema).fillna(True).first() self.assertEqual(row.age, None) self.assertEqual(row.spy, True) # fillna with string row = self.spark.createDataFrame([(None, None, None, None)], schema).fillna("hello").first() self.assertEqual(row.name, "hello") self.assertEqual(row.age, None) # fillna with subset specified for numeric cols row = ( self.spark.createDataFrame([(None, None, None, None)], schema) .fillna(50, subset=["name", "age"]) .first() ) self.assertEqual(row.name, None) self.assertEqual(row.age, 50) self.assertEqual(row.height, None) self.assertEqual(row.spy, None) # fillna with subset specified for string cols row = ( self.spark.createDataFrame([(None, None, None, None)], schema) .fillna("haha", subset=["name", "age"]) .first() ) self.assertEqual(row.name, "haha") self.assertEqual(row.age, None) self.assertEqual(row.height, None) self.assertEqual(row.spy, None) # fillna with subset specified for bool cols row = ( self.spark.createDataFrame([(None, None, None, None)], schema) .fillna(True, subset=["name", "spy"]) .first() ) self.assertEqual(row.name, None) self.assertEqual(row.age, None) self.assertEqual(row.height, None) self.assertEqual(row.spy, True) # fillna with dictionary for boolean types row = self.spark.createDataFrame([Row(a=None), Row(a=True)]).fillna({"a": True}).first() self.assertEqual(row.a, True)
def read_dataset(filename): spark = init_spark() schema = StructType([ StructField('ID', StringType(), True), StructField('Source', StringType(), True), StructField('TMC', StringType(), True), StructField('Severity', IntegerType(), True), StructField('Start_Time', TimestampType(), True), StructField('End_Time', TimestampType(), True), StructField('Start_Lat', FloatType(), True), StructField('Start_Lng', FloatType(), True), StructField('End_Lat', FloatType(), True), StructField('End_Lng', FloatType(), True), StructField('Distance(mi)', FloatType(), True), StructField('Description', StringType(), True), StructField('Number', StringType(), True), StructField('Street', StringType(), True), StructField('Side', StringType(), True), StructField('City', StringType(), True), StructField('County', StringType(), True), StructField('State', StringType(), True), StructField('Zipcode', StringType(), True), StructField('Country', StringType(), True), StructField('Timezone', StringType(), True), StructField('Airport_Code', StringType(), True), StructField('Weather_Timestamp', TimestampType(), True), StructField('Temperature(F)', FloatType(), True), StructField('Wind_Chill(F)', FloatType(), True), StructField('Humidity(%)', FloatType(), True), StructField('Pressure(in)', FloatType(), True), StructField('Visibility(mi)', FloatType(), True), StructField('Wind_Direction', StringType(), True), StructField('Wind_Speed(mph)', FloatType(), True), StructField('Precipitation(in)', FloatType(), True), StructField('Weather_Condition', StringType(), True), StructField('Amenity', BooleanType(), True), StructField('Bump', BooleanType(), True), StructField('Crossing', BooleanType(), True), StructField('Give_Way', BooleanType(), True), StructField('Junction', BooleanType(), True), StructField('No_Exit', BooleanType(), True), StructField('Railway', BooleanType(), True), StructField('Roundabout', BooleanType(), True), StructField('Station', BooleanType(), True), StructField('Stop', BooleanType(), True), StructField('Traffic_Calming', BooleanType(), True), StructField('Traffic_Signal', BooleanType(), True), StructField('Turning_Loop', BooleanType(), True), StructField('Sunrise_Sunset', StringType(), True), StructField('Civil_Twilight', StringType(), True), StructField('Nautical_Twilight', StringType(), True), StructField('Astronomical_Twilight', StringType(), True) ]) total_accidents_data = spark.read.schema(schema).csv(filename, header=True, mode="DROPMALFORMED", encoding="ISO-8859-1", inferSchema=True) # dropped meaningless and 1 class only columns final_result = total_accidents_data.drop('Country').drop('Turning_Loop').drop('id', 'Source', 'Description')\ .withColumn('start_year', sql_func.date_format(total_accidents_data.Start_Time, 'y')) final_result = final_result.filter(((col('start_year') == 2019))) final_result.show() print(final_result.count()) return final_result
simpleData = [("James", 34, "2006-01-01", "true", "M", 3000.60), ("Michael", 33, "1980-01-10", "true", "F", 3300.80), ("Robert", 37, "06-01-1992", "false", "M", 5000.50)] columns = [ "firstname", "age", "jobStartDate", "isGraduated", "gender", "salary" ] df = spark.createDataFrame(data=simpleData, schema=columns) df.printSchema() df.show(truncate=False) from pyspark.sql.functions import col from pyspark.sql.types import StringType, BooleanType, DateType df2 = df.withColumn("age",col("age").cast(StringType())) \ .withColumn("isGraduated",col("isGraduated").cast(BooleanType())) \ .withColumn("jobStartDate",col("jobStartDate").cast(DateType())) df2.printSchema() df3 = df2.selectExpr("cast(age as int) age", "cast(isGraduated as string) isGraduated", "cast(jobStartDate as string) jobStartDate") df3.printSchema() df3.show(truncate=False) df3.createOrReplaceTempView("CastExample") df4 = spark.sql( "SELECT STRING(age),BOOLEAN(isGraduated),DATE(jobStartDate) from CastExample" ) df4.printSchema() df4.show(truncate=False)
def make_not_terminal_udf(actions: List[str]):
    """ Return true iff next_action is not terminal (i.e. its index is < len(actions)) """

    def get_not_terminal(next_action):
        return next_action < len(actions)

    return udf(get_not_terminal, BooleanType())
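# Hedged usage sketch for this index-based variant: next_action holds the index
# of the next action, and len(actions) serves as the sentinel for terminal
# steps. The SparkSession, action list and rows below are illustrative
# assumptions, not from the original workflow.
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").appName("not_terminal_idx_demo").getOrCreate()
actions = ["up", "down"]
demo_df = spark.createDataFrame([(0,), (1,), (2,)], ["next_action"])  # 2 == len(actions) -> terminal
demo_df.withColumn("not_terminal", make_not_terminal_udf(actions)("next_action")).show()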
"timestamp": (int( (timestamp - epoch).total_seconds() * nanoseconds_per_second)) } return date_snippet def search_row(engine='hooli', count=1, source='searchbar'): return Row(engine=text_type(engine), source=text_type(source), count=count) schema = StructType([ StructField("document_id", StringType(), True), StructField("client_id", StringType(), True), StructField("timestamp", StringType(), True), StructField("is_default_browser", BooleanType(), True), StructField( "search_counts", ArrayType( StructType([ StructField("engine", StringType(), True), StructField("source", StringType(), True), StructField("count", LongType(), True) ]), True), True), StructField("country", StringType(), True), StructField("profile_creation_date", LongType(), True), StructField("normalized_channel", StringType(), True), StructField("os", StringType(), True), StructField("subsession_length", LongType(), True), StructField("submission_date_s3", StringType(), True), ])
def select_relevant_columns(df,
                            discrete_action: bool = True,
                            include_possible_actions: bool = True):
    """ Select all the relevant columns and perform type conversions. """
    if not discrete_action and include_possible_actions:
        raise NotImplementedError(
            "currently we don't support include_possible_actions")

    select_col_list = [
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        col("reward").cast(FloatType()),
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        col("state_features").cast(ArrayType(FloatType())),
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        col("state_features_presence").cast(ArrayType(BooleanType())),
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        col("next_state_features").cast(ArrayType(FloatType())),
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        col("next_state_features_presence").cast(ArrayType(BooleanType())),
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        col("not_terminal").cast(BooleanType()),
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        col("action_probability").cast(FloatType()),
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        col("mdp_id").cast(LongType()),
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        col("sequence_number").cast(LongType()),
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        col("step").cast(LongType()),
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        col("time_diff").cast(LongType()),
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        col("metrics").cast(ArrayType(FloatType())),
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        col("metrics_presence").cast(ArrayType(BooleanType())),
    ]

    if discrete_action:
        select_col_list += [
            # pyre-fixme[16]: Module `functions` has no attribute `col`.
            col("action").cast(LongType()),
            # pyre-fixme[16]: Module `functions` has no attribute `col`.
            col("next_action").cast(LongType()),
        ]
    else:
        select_col_list += [
            # pyre-fixme[16]: Module `functions` has no attribute `col`.
            col("action").cast(ArrayType(FloatType())),
            # pyre-fixme[16]: Module `functions` has no attribute `col`.
            col("next_action").cast(ArrayType(FloatType())),
            # pyre-fixme[16]: Module `functions` has no attribute `col`.
            col("action_presence").cast(ArrayType(BooleanType())),
            # pyre-fixme[16]: Module `functions` has no attribute `col`.
            col("next_action_presence").cast(ArrayType(BooleanType())),
        ]

    if include_possible_actions:
        select_col_list += [
            # pyre-fixme[16]: Module `functions` has no attribute `col`.
            col("possible_actions_mask").cast(ArrayType(LongType())),
            # pyre-fixme[16]: Module `functions` has no attribute `col`.
            col("possible_next_actions_mask").cast(ArrayType(LongType())),
        ]

    return df.select(*select_col_list)
"isinf": lambda c: c == float("inf"), "isnan": F.isnan, "isnat": lambda c: NotImplemented, # Koalas and PySpark does not have Nat concept. "log": F.log, "log10": F.log10, "log1p": F.log1p, "log2": F.pandas_udf(lambda s: np.log2(s), DoubleType()), "logical_not": lambda c: ~(c.cast(BooleanType())), "matmul": lambda _: NotImplemented, # Can return a NumPy array in pandas. "negative": lambda c: c * -1, "positive": lambda c: c, "rad2deg": F.pandas_udf(lambda s: np.rad2deg(s), DoubleType()), "radians": F.radians, "reciprocal": F.pandas_udf(lambda s: np.reciprocal(s), DoubleType()), "rint": F.pandas_udf(lambda s: np.rint(s), DoubleType()), "sign":
StructField("geoNetwork_subContinent", StringType(), True), StructField("geoNetwork_country", StringType(), True), StructField("geoNetwork_region", StringType(), True), StructField("geoNetwork_metro", StringType(), True), StructField("geoNetwork_city", StringType(), True), StructField("geoNetwork_cityId", IntegerType(), True), StructField("geoNetwork_networkDomain", StringType(), True), StructField("geoNetwork_latitude", DoubleType(), True), StructField("geoNetwork_longitude", DoubleType(), True), StructField("geoNetwork_networkLocation", StringType(), True), StructField("device_browser", StringType(), True), StructField("device_browserVersion", DoubleType(), True), StructField("device_browserSize", StringType(), True), StructField("device_operatingSystem", StringType(), True), StructField("device_operatingSystemVersion", StringType(), True), StructField("device_isMobile", BooleanType(), True), StructField("device_mobileDeviceBranding", StringType(), True), StructField("device_mobileDeviceModel", StringType(), True), StructField("device_mobileInputSelector", StringType(), True), StructField("device_mobileDeviceInfo", StringType(), True), StructField("device_mobileDeviceMarketingName", StringType(), True), StructField("device_flashVersion", IntegerType(), True), StructField("device_javaEnabled", StringType(), True), StructField("device_language", StringType(), True), StructField("device_screenColors", StringType(), True), StructField("device_screenResolution", StringType(), True), StructField("device_deviceCategory", StringType(), True), StructField("totals_transactionRevenue", StringType(), True), StructField("landingPage", StringType(), True), StructField("hits_type", StringType(), True), StructField("touchpoints", ArrayType(StringType()), True),
StructField( "build", ArrayType( StructType([StructField("application_name", StringType(), True)]), True), True), StructField( "settings", ArrayType(StructType([StructField("locale", StringType(), True)]), True), True), StructField( "active_addons", ArrayType( MapType( StringType(), StructType([ StructField("blocklisted", BooleanType(), True), StructField("type", StringType(), True), StructField("signed_state", LongType(), True), StructField("user_disabled", BooleanType(), True), StructField("app_disabled", BooleanType(), True), StructField("is_system", BooleanType(), True) ]), True), True)) ]) default_sample = { "client_id": "client-id", "normalized_channel": "release", "build": [{ "application_name": "Firefox"