def import_twitter_data(spark_session, tweets_file_path):
    """Imports the twitter data and returns resulting DataFrame.

    Args:
        spark_session -- An active SparkSession.
        tweets_file_path -- A file path.
    """
    tweets_schema = types.StructType([
        types.StructField('id', types.LongType()),
        types.StructField('timestamp', types.LongType(), nullable=False),
        types.StructField('postalCode', types.StringType()),
        types.StructField('lon', types.DoubleType(), nullable=False),
        types.StructField('lat', types.DoubleType(), nullable=False),
        types.StructField('tweet', types.StringType(), nullable=False),
        types.StructField('user_id', types.LongType()),
        types.StructField('application', types.StringType()),
        types.StructField('source', types.StringType())
    ])
    tweets_df = spark_session.read.csv(tweets_file_path,
                                       escape='"',
                                       header='true',
                                       schema=tweets_schema,
                                       mode='DROPMALFORMED')
    tweets_df = tweets_df.select(['timestamp', 'lon', 'lat', 'tweet'])
    return tweets_df

def test_df() -> DataFrame:
    """Return some data."""
    spark = package_spark
    schema = st.StructType([
        st.StructField('id', st.LongType()),
        st.StructField('money', st.StringType()),
        st.StructField('timestamp', st.LongType()),
        st.StructField('structtype', st.StructType([
            st.StructField('number1', st.StringType()),
            st.StructField('number2', st.StringType()),
            st.StructField('number3', st.StringType()),
        ])),
        st.StructField('rootstructtype', st.StructType([
            st.StructField('nestedstructtype', st.StructType([
                st.StructField('fieldtype', st.StringType()),
            ])),
        ])),
        st.StructField('arraytype', st.ArrayType(st.StringType())),
    ])
    test_df = spark.createDataFrame([
        [1, '$100.000', 17, (1, 2, 3), ((2,),), ['meta', 'data']],
        [1, '$200.000', 17, (3, 2, 1), ((2,),), ['meta', 'data']],
        [1, '$10.000', 16, (1, 3, 2), ((2,),), ['meta', 'data']],
        [2, '-$100', 17, (3, 1, 2), ((2,),), ['meta', 'data']],
        [2, '$100', 14, (2, 1, 3), ((2,),), ['meta', 'data']],
    ], [
        'id', 'money', 'timestamp', 'structtype', 'rootstructtype', 'arraytype'
    ])
    return spark.createDataFrame(test_df.rdd, schema)

def benchmarkCalculatePiUsingDF(spark, samples, parallelism, jobLogger):
    def inside(p):
        x, y = random.random(), random.random()
        return x * x + y * y < 1

    jobLogger.info(
        '****************************************************************')
    jobLogger.info(
        'Starting benchmark test calculating Pi via dataframe manipulations '
        'with {0:,} samples'.format(samples))

    start_time = timer()
    # Note that the random seed for each of the columns must be different,
    # otherwise each column will have identical values on each row
    pi_df = (spark.range(0, samples, numPartitions=parallelism)
             .withColumn('x', F.rand(seed=8675309))
             .withColumn('y', F.rand(seed=17760704))
             .withColumn(
                 'within_circle',
                 F.when(
                     (F.pow(F.col('x'), F.lit(2)) + F.pow(F.col('y'), F.lit(2)) <= 1.0),
                     F.lit(1).cast(T.LongType())
                 ).otherwise(F.lit(0).cast(T.LongType())))
             .agg(
                 F.sum('within_circle').alias('count_within_circle'),
                 F.count('*').alias('count_samples')))
    res = pi_df.collect()
    pi_val = 4.0 * (res[0].count_within_circle) / (res[0].count_samples)
    end_time = timer()
    return (end_time - start_time), pi_val

def test_datatype(self):
    first = T.StructType([
        T.StructField('f1', T.BooleanType()),
        T.StructField('f2', T.ByteType()),
        T.StructField('f3', T.IntegerType()),
        T.StructField('f4', T.LongType()),
    ])
    second = T.StructType([
        T.StructField('f3', T.IntegerType()),
        T.StructField('f2', T.ByteType()),
        T.StructField('f4', T.LongType()),
        T.StructField('f1', T.BooleanType()),
    ])
    SparklyTest().assertRowsEqual(first, second, ignore_order=True)
    with self.assertRaises(AssertionError):
        self.assertEqual(first, second)

    # change entry (f4, T.LongType)
    second = T.StructType([
        T.StructField('f3', T.IntegerType()),
        T.StructField('f2', T.ByteType()),
        T.StructField('f4', T.StringType()),
        T.StructField('f1', T.BooleanType()),
    ])
    with self.assertRaises(AssertionError):
        SparklyTest().assertRowsEqual(first, second, ignore_order=True)

class Ensembler:
    _PRIMITIVE_TYPE_MAP = {
        pb2.Ensembler.ResultType.DOUBLE: types.DoubleType(),
        pb2.Ensembler.ResultType.FLOAT: types.FloatType(),
        pb2.Ensembler.ResultType.INTEGER: types.LongType(),
        pb2.Ensembler.ResultType.LONG: types.LongType(),
        pb2.Ensembler.ResultType.STRING: types.StringType(),
    }

    _CAST_TYPE_MAP = {
        pb2.Ensembler.ResultType.INTEGER: types.IntegerType(),
    }

    def __init__(self,
                 ensembler_uri: str,
                 result_column_name: str,
                 result_type: types.DataType,
                 cast_type: types.DataType):
        self._ensembler_uri = ensembler_uri
        self._result_column_name = result_column_name
        self._result_type = result_type
        self._cast_type = cast_type

    def ensemble(self, combined_df: DataFrame, spark: SparkSession) -> DataFrame:
        udf = mlflow.pyfunc.spark_udf(
            spark,
            self._ensembler_uri,
            self._result_type
        )
        return combined_df.withColumn(
            self._result_column_name,
            udf(struct(combined_df.columns))
            if self._cast_type is None
            else udf(struct(combined_df.columns)).cast(self._cast_type)
        )

    @classmethod
    def from_config(cls, config: pb2.Ensembler) -> 'Ensembler':
        result_type = None
        cast_type = None
        if config.result.type == pb2.Ensembler.ResultType.ARRAY:
            if config.result.item_type in cls._PRIMITIVE_TYPE_MAP:
                result_type = types.ArrayType(
                    cls._PRIMITIVE_TYPE_MAP.get(config.result.item_type)
                )
                if config.result.item_type in cls._CAST_TYPE_MAP:
                    cast_type = types.ArrayType(
                        cls._CAST_TYPE_MAP.get(config.result.item_type)
                    )
            else:
                raise ValueError(f'unknown item type for array: {config.result.item_type}')
        else:
            result_type = cls._PRIMITIVE_TYPE_MAP.get(config.result.type)
            cast_type = cls._CAST_TYPE_MAP.get(config.result.type)

        if result_type is None:
            raise ValueError(f'unknown result type: {config.result.type}')

        return Ensembler(config.uri, config.result.column_name, result_type, cast_type)

def main():
    # do things...
    schema = types.StructType([
        types.StructField('ID', types.StringType(), False),
        types.StructField('DATE', types.LongType(), False),
        types.StructField('TYPE', types.StringType(), False),
        types.StructField('VALUE1', types.LongType(), False),
        types.StructField('MFlag', types.StringType(), True),
        types.StructField('QFlag', types.StringType(), True)
    ])
    t = spark.read.csv(inputs, schema=schema)
    t = t.where(col("QFlag").isNull())
    # t.show()
    p = t.filter((col("TYPE") == "TMAX") | (col("TYPE") == "TMIN")) \
        .groupby('DATE', 'ID').agg(
            (2 * max("VALUE1") - sum("VALUE1")).alias("Range"))
    max_table = p.groupby('DATE').agg(max("Range").alias('MaxRange'))
    cond = [
        p['DATE'] == max_table['DATE'],
        p['Range'] == max_table['MaxRange']
    ]
    df_result = p.join(max_table, cond, 'inner').select(
        p['DATE'], p['ID'], p['Range']).sort(col("DATE"))
    df_result.show()

def test_futureLeftJoin(self):
    import pyspark.sql.types as pyspark_types
    price = self.price()
    vol = self.vol()
    expected_pdf = test_utils.make_pdf([
        (1000, 7, 0.5, 400, 1050),
        (1000, 3, 1.0, 300, 1050),
        (1050, 3, 1.5, 500, 1100),
        (1050, 7, 2.0, 600, 1100),
        (1100, 3, 2.5, 700, 1150),
        (1100, 7, 3.0, 800, 1150),
        (1150, 3, 3.5, 900, 1200),
        (1150, 7, 4.0, 1000, 1200),
        (1200, 3, 4.5, 1100, 1250),
        (1200, 7, 5.0, 1200, 1250),
        (1250, 3, 5.5, None, None),
        (1250, 7, 6.0, None, None),
    ], ["time", "id", "price", "volume", "time2"])
    new_pdf = price.futureLeftJoin(
        vol.withColumn("time2", vol.time.cast(pyspark_types.LongType())),
        tolerance=pd.Timedelta("100ns"),
        key=["id"],
        strict_lookahead=True).toPandas()
    new_pdf1 = price.futureLeftJoin(
        vol.withColumn("time2", vol.time.cast(pyspark_types.LongType())),
        tolerance=pd.Timedelta("100ns"),
        key="id",
        strict_lookahead=True).toPandas()
    test_utils.assert_same(new_pdf, new_pdf1)
    test_utils.assert_same(new_pdf, expected_pdf)

def main(inputs, output):
    pathfunction = functions.udf(path_to_hour, returnType=types.StringType())
    comments_schema = types.StructType([
        types.StructField('language', types.StringType()),
        types.StructField('title', types.StringType()),
        types.StructField('views', types.LongType()),
        types.StructField('bytes', types.LongType()),
    ])
    wikipage = spark.read.csv(inputs, schema=comments_schema, sep=' ').withColumn(
        'hour', pathfunction(functions.input_file_name()))
    filtered_page = wikipage.filter(
        (wikipage.language == 'en') & (wikipage.title != 'Main Page') &
        (~wikipage.title.startswith('Special:'))).cache()
    max_view = filtered_page.groupBy(wikipage.hour).agg(
        functions.max(wikipage.views).alias('total_views'))
    conditions = [
        filtered_page.views == max_view.total_views,
        filtered_page.hour == max_view.hour
    ]
    # regular join:
    join_page = filtered_page.join(max_view, conditions).select(
        filtered_page.hour, 'title', 'views')
    # broadcast join as follows:
    join_page = filtered_page.join(functions.broadcast(max_view),
                                   conditions).select(filtered_page.hour,
                                                      'title', 'views')
    join_page.sort('hour', 'title').write.json(output, mode='overwrite')
    join_page.explain()

def main(input_stream, sentiment_model_file):
    # main logic starts here
    headline_schema = types.StructType([
        types.StructField('title', types.StringType()),
        types.StructField('score', types.LongType()),
        types.StructField('num_comments', types.LongType()),
    ])

    # load the sentiment model
    sentiment_model = PipelineModel.load(sentiment_model_file)

    # load the headline stream
    headlines_stream = spark.readStream.format('json').schema(
        headline_schema).load(input_stream)

    # match the schema our sentiment model needs
    headlines_stream = prepare_for_process(headlines_stream)

    # make the prediction: 0 = lowest, 1 = neutral, 2 = highest sentiment
    predictions_df = sentiment_model.transform(headlines_stream)
    predictions_df = predictions_df.select(predictions_df['title'],
                                           predictions_df['prediction'])

    predictions_df.writeStream.format('console').outputMode('append').option(
        'truncate', False).start().awaitTermination(600)

def my_compute_function(ctx, site_counts, **domains):
    data = []
    for domain_name, domain_df in domains.items():
        row_count = domain_df.count()
        data.append((domain_name.lower(), row_count))

    # Create dataframe with row counts for each domain
    df = ctx.spark_session.createDataFrame(data, ['domain', 'parsed_row_count'])

    try:
        # Join in row counts from DATA_COUNT csv
        for col_name in site_counts.columns:
            site_counts = site_counts.withColumnRenamed(col_name, col_name.upper())
        df = df.join(site_counts,
                     df.domain == F.lower(site_counts.TABLE_NAME),
                     'left')
        df = df.withColumn("delta_row_count", df.ROW_COUNT - df.parsed_row_count)
        df = df.selectExpr("domain",
                           "cast(ROW_COUNT as long) as loaded_row_count",
                           "parsed_row_count",
                           "delta_row_count")
    except Exception:
        schema = T.StructType([
            T.StructField("domain", T.StringType(), True),
            T.StructField("loaded_row_count", T.LongType(), True),
            T.StructField("parsed_row_count", T.LongType(), True),
            T.StructField("delta_row_count", T.DoubleType(), True),
        ])
        df = ctx.spark_session.createDataFrame([], schema)
    return df

def createDeltaBackedState(tableName, overwrite=False):
    from delta.tables import DeltaTable
    import pyspark.sql.types as T

    db_location = "dbfs:/home/[email protected]/streamingWorkshop/db"
    db_table_name = "sw_db." + tableName
    checkpoint_location = db_location + "/checkpointTables/" + db_table_name

    delta_schema = (T.StructType([
        T.StructField("item_id", T.LongType()),
        T.StructField("timestamp", T.TimestampType()),
        T.StructField("sales", T.LongType())
    ]))

    # Create an empty Delta table if it does not exist. This is required for
    # the MERGE to work in the first mini batch.
    if overwrite or not DeltaTable.isDeltaTable(
            spark, db_location + "/" + db_table_name):
        (spark.createDataFrame([], delta_schema)
         .write.mode("overwrite")
         .option("overwriteSchema", "true")
         .format("delta")
         .saveAsTable(db_table_name))
    spark.sql(
        f"ALTER TABLE {db_table_name} SET TBLPROPERTIES (delta.autoOptimize.optimizeWrite = true, delta.autoOptimize.autoCompact = false)"
    )

def test_undefined_field(self):
    with six.assertRaisesRegex(self, KeyError, 'f2'):
        schema_has(
            T.StructType([T.StructField('f1', T.IntegerType())]),
            T.StructType([T.StructField('f2', T.LongType())]),
        )

    with six.assertRaisesRegex(self, KeyError, r'f1\.element\.s2'):
        schema_has(
            T.StructType([
                T.StructField(
                    'f1',
                    T.ArrayType(T.StructType([T.StructField('s1', T.IntegerType())])),
                ),
            ]),
            T.StructType([
                T.StructField(
                    'f1',
                    T.ArrayType(T.StructType([T.StructField('s2', T.LongType())])),
                ),
            ]),
        )

    with six.assertRaisesRegex(self, TypeError,
                               'element is IntegerType, expected LongType'):
        schema_has(
            T.ArrayType(T.IntegerType()),
            T.ArrayType(T.LongType()),
        )

def df_regex_make(wikiqtsv):
    # make wikiq tsv into a dataframe
    tsv2df = reader.csv(wikiqtsv,
                        sep="\t",
                        inferSchema=False,
                        header=True,
                        mode="PERMISSIVE",
                        quote="")
    #tsv2df = tsv2df.repartition(args.num_partitions)

    # basic structure
    struct = types.StructType().add("anon", types.StringType(), True)
    struct = struct.add("articleid", types.LongType(), True)
    struct = struct.add("date_time", types.TimestampType(), True)
    struct = struct.add("deleted", types.BooleanType(), True)
    struct = struct.add("editor", types.StringType(), True)
    struct = struct.add("editor_id", types.LongType(), True)
    struct = struct.add("minor", types.BooleanType(), True)
    struct = struct.add("namespace", types.LongType(), True)
    struct = struct.add("revert", types.BooleanType(), True)
    struct = struct.add("reverteds", types.StringType(), True)
    struct = struct.add("revid", types.LongType(), True)
    struct = struct.add("sha1", types.StringType(), True)
    struct = struct.add("text_chars", types.LongType(), True)
    struct = struct.add("title", types.StringType(), True)

    # structure the df to get a df with columns of metadata and regexes
    regex_one_df = df_structurize(tsv2df, struct)

    return regex_one_df

def specifySchema():
    wiki_schema = types.StructType([
        # commented-out fields won't be read
        types.StructField('lang', types.StringType(), True),
        types.StructField('page_name', types.StringType(), True),
        types.StructField('viewcount', types.LongType(), True),
        types.StructField('bytes', types.LongType(), True),
    ])
    return wiki_schema

def main(inputs, keyspace, table):
    if table == "yelp_business":
        business_schema = types.StructType([
            types.StructField('business_id', types.StringType(), True),
            types.StructField('name', types.StringType(), True),
            types.StructField('neighborhood', types.StringType(), True),
            types.StructField('address', types.StringType(), True),
            types.StructField('city', types.StringType(), True),
            types.StructField('state', types.StringType(), True),
            types.StructField('postal_code', types.StringType(), True),
            types.StructField('latitude', types.FloatType(), True),
            types.StructField('longitude', types.FloatType(), True),
            types.StructField('stars', types.FloatType(), True),
            types.StructField('review_count', types.LongType(), True),
            types.StructField('is_open', types.IntegerType(), True)
        ])
        business = spark.read.json(inputs, schema=business_schema)
        df = business.drop('neighborhood').filter(business.is_open == 1)
        df.cache()
        business_data = sc.textFile(inputs).map(json_key_value_1).map(
            lambda x: Row(x[0], x[1], x[2], x[3]))
        df_1 = business_data.toDF()
        df_2 = df_1.withColumnRenamed("_1", "bus_id").withColumnRenamed(
            "_2", "attributes").withColumnRenamed(
                "_3", "categories").withColumnRenamed("_4", "hours")
        df_2.cache()
        result = df.join(df_2, df.business_id == df_2.bus_id,
                         how='inner').drop(df_2.bus_id)
    elif table == "yelp_checkin":
        checkin_data = sc.textFile(inputs).map(json_key_value_2).map(
            lambda x: Row(str(uuid.uuid1()), x[0], x[1]))
        df = checkin_data.toDF().cache()
        df_1 = df.withColumnRenamed("_1", "id").withColumnRenamed(
            "_2", "time").withColumnRenamed("_3", "business_id")
        result = df_1
    elif table == "yelp_review":
        reviews_schema = types.StructType([
            types.StructField('business_id', types.StringType(), True),
            types.StructField('cool', types.LongType(), True),
            types.StructField('date', types.DateType(), True),
            types.StructField('funny', types.LongType(), True),
            types.StructField('review_id', types.StringType(), True),
            types.StructField('stars', types.LongType(), True),
            types.StructField('text', types.StringType(), True),
            types.StructField('useful', types.LongType(), True),
            types.StructField('user_id', types.StringType(), True)
        ])
        reviews = spark.read.json(inputs, schema=reviews_schema)
        uuidUdf = udf(lambda: str(uuid.uuid1()), types.StringType())
        result = reviews.withColumn("id", uuidUdf())

    result.repartition(300).write.format(
        "org.apache.spark.sql.cassandra").options(
            table=table, keyspace=keyspace).save()

def test_type_mismatch(self):
    with six.assertRaisesRegex(self, AssertionError,
                               'Cannot compare heterogeneous types'):
        schema_has(
            T.StructType([T.StructField('f1', T.IntegerType())]),
            T.ArrayType(T.IntegerType()),
        )

    with six.assertRaisesRegex(self, AssertionError,
                               'Cannot compare heterogeneous types'):
        schema_has(
            T.ArrayType(T.IntegerType()),
            {'f1': T.IntegerType()},
        )

    with six.assertRaisesRegex(self, TypeError,
                               'f1 is IntegerType, expected LongType'):
        schema_has(
            T.StructType([T.StructField('f1', T.IntegerType())]),
            T.StructType([T.StructField('f1', T.LongType())]),
        )

    with six.assertRaisesRegex(
            self,
            TypeError,
            r'f1\.element\.s1 is IntegerType, expected LongType',
    ):
        schema_has(
            T.StructType([
                T.StructField(
                    'f1',
                    T.ArrayType(T.StructType([T.StructField('s1', T.IntegerType())])),
                ),
            ]),
            T.StructType([
                T.StructField(
                    'f1',
                    T.ArrayType(T.StructType([T.StructField('s1', T.LongType())])),
                ),
            ]),
        )

    with six.assertRaisesRegex(self, TypeError,
                               'element is IntegerType, expected LongType'):
        schema_has(
            T.ArrayType(T.IntegerType()),
            T.ArrayType(T.LongType()),
        )

    with six.assertRaisesRegex(self, TypeError,
                               'key is StringType, expected LongType'):
        schema_has(
            T.MapType(T.StringType(), T.IntegerType()),
            T.MapType(T.LongType(), T.IntegerType()),
        )

    with six.assertRaisesRegex(self, TypeError,
                               'value is IntegerType, expected LongType'):
        schema_has(
            T.MapType(T.StringType(), T.IntegerType()),
            T.MapType(T.StringType(), T.LongType()),
        )

def test_arrays_nested_subset(self):
    schema_has(
        T.ArrayType(T.ArrayType(T.StructType([
            T.StructField('f1', T.ArrayType(T.LongType())),
            T.StructField('f2', T.ArrayType(T.StringType())),
        ]))),
        T.ArrayType(T.ArrayType(T.StructType([
            T.StructField('f1', T.ArrayType(T.LongType()))
        ]))),
    )

def test_validate_work_success(self):
    validator = TransformationsValidator(
        TransformationOperations(CONFIG_PATH), self.data_structure_pyspark)
    fields = validator.validate(
        ["src_ip", "dst_ip", "packet_size", "sampling_rate"])
    self.assertEqual(
        fields,
        types.StructType([
            types.StructField('src_ip', types.StringType()),
            types.StructField('dst_ip', types.StringType()),
            types.StructField('packet_size', types.LongType()),
            types.StructField('sampling_rate', types.LongType())
        ]), 'StructType should be equal')

def load_prices(spark):
    data = [
        (10, 1546300799000, 37.50, 37.51),
        (10, 1546300802000, 37.51, 37.52),
        (10, 1546300806000, 37.50, 37.51),
    ]
    schema = T.StructType([
        T.StructField("id", T.LongType()),
        T.StructField("timestamp", T.LongType()),
        T.StructField("bid", T.DoubleType()),
        T.StructField("ask", T.DoubleType()),
    ])
    return spark.createDataFrame(data, schema)

def test_data(spark_session):
    sa = (
        T.StructType()
        .add('id', T.IntegerType(), False, None)
        .add('tag', T.StringType(), False, None)
        .add('a1', T.IntegerType(), False, None)
        .add('a2', T.IntegerType(), False, None)
    )
    sb = (
        T.StructType()
        .add('id', T.IntegerType(), False, None)
        .add('tag', T.StringType(), False, None)
        .add('b1', T.LongType(), False, None)
        .add('b2', T.IntegerType(), False, None)
    )
    sc = (
        T.StructType()
        .add('id', T.IntegerType(), False, None)
        .add('tag', T.StringType(), False, None)
        .add('c1', T.LongType(), False, None)
        .add('c2', T.FloatType(), False, None)
        .add('c3', T.IntegerType(), False, None)
    )
    da = [(1, 'a', 1, 1), (2, 'a', 1, 1), (3, 'a', 1, 1)]
    db = [(1, 'b', 2, 2), (2, 'b', 2, 2), (3, 'b', 2, 2)]
    dc = [(1, 'c', 3, 0.010, 1), (2, 'c', 3, 3.0, 0), (3, 'c', 3, 3.0, 1)]
    return {
        "dfa": spark_session.createDataFrame(da, sa),
        "dfb": spark_session.createDataFrame(db, sb),
        "dfc": spark_session.createDataFrame(dc, sc),
    }

def load_trades(spark):
    data = [
        (10, 1546300800000, 37.50, 100.000),
        (10, 1546300801000, 37.51, 100.000),
        (20, 1546300804000, 12.67, 300.000),
        (10, 1546300807000, 37.50, 200.000),
    ]
    schema = T.StructType([
        T.StructField("id", T.LongType()),
        T.StructField("timestamp", T.LongType()),
        T.StructField("price", T.DoubleType()),
        T.StructField("quantity", T.DoubleType()),
    ])
    return spark.createDataFrame(data, schema)

def add_colocated_variants(self):
    empty_field = t.StructField('colocated_variants',
                                t.ArrayType(t.StructType()))
    f = self.get_output_struct_field(
        "colocated_variants",
        create_if_nonexistent=empty_field).elementType
    f.add(t.StructField("seq_region_name", t.StringType()))
    f.add(t.StructField("strand", t.IntegerType()))
    f.add(t.StructField("start", t.LongType()))
    f.add(t.StructField("end", t.LongType()))
    f.add(t.StructField("id", t.StringType()))
    f.add(t.StructField("allele_string", t.StringType()))
    return self

def test_validate_with_correct_two_level_subtree(self):
    validator = TransformationsValidator(
        TransformationOperations({
            "country": "./GeoLite2-Country.mmdb",
            "city": "./GeoLite2-City.mmdb",
            "asn": "./GeoLite2-ASN.mmdb"
        }), self.data_structure_pyspark)

    syntaxtree = SyntaxTree()
    syntaxtree.operation = "sum"
    syntaxtree.children = ["sampling_rate", "packet_size"]

    main_syntax_tree = SyntaxTree()
    main_syntax_tree.operation = "mult"
    main_syntax_tree.children = [syntaxtree, "sampling_rate"]

    fields = validator.validate(
        [FieldTransformation("result", main_syntax_tree), "dst_ip"])

    self.assertEqual(
        fields,
        types.StructType([
            types.StructField('result', types.LongType()),
            types.StructField('dst_ip', types.StringType())
        ]))

def run(rucio_path, dbs_path, output, verbose):
    start = time.time()
    spark = SparkSession.builder.appName("rucio_dumps_test").getOrCreate()
    csvreader = spark.read.format("csv") \
        .option("nullValue", "null") \
        .option("mode", "FAILFAST")
    avroreader = spark.read.format("avro")
    rucio_info = avroreader.load(rucio_path) \
        .withColumn("filename", fn.input_file_name())
    logger.debug("Rucio data types")
    logger.debug(rucio_info.dtypes)
    # rucio_info.show(5, False)
    dbs_files = csvreader.schema(schemas.schema_files()) \
        .load(dbs_path) \
        .select("f_logical_file_name", "f_dataset_id")
    # dbs_files.show(5, False)
    rucio_df = (rucio_info
                .withColumn("tmp1", fn.substring_index("filename", "/rucio/", -1))
                .withColumn("tally_date", fn.substring_index("tmp1", "/", 1))
                .withColumn('create_day',
                            fn.date_format(
                                fn.to_date((rucio_info.CREATED_AT / fn.lit(1000))
                                           .cast(types.LongType())
                                           .cast(types.TimestampType())),
                                'yyyyMMdd'))
                .withColumn('tally_day',
                            fn.date_format(fn.to_date("tally_date", "yyyy-MM-dd"),
                                           'yyyyMMdd'))
                .select("RSE_ID", "BYTES", "NAME", "SCOPE", "tally_day", "create_day"))
    # rucio_df.show(5, False)
    rucio_df = rucio_df \
        .join(dbs_files, dbs_files.f_logical_file_name == rucio_df.NAME) \
        .groupBy("RSE_ID", "f_dataset_id", "SCOPE", "tally_day", "create_day") \
        .agg(fn.sum("BYTES").alias("rep_size"))
    # rucio_df.show(5, False)
    rucio_df.write.option("compression", "snappy").parquet(output, mode="overwrite")
    end = time.time()
    logger.info("Elapsed Time: {min} min, {sec} sec.".format(
        min=(end - start) // 60, sec=(end - start) % 60))

def recommend(num, user_id, spark, ratings_model):
    user_df = spark.createDataFrame([user_id], types.LongType())
    user_df = user_df.select(user_df['value'].alias('user_id'))
    rec_df_raw = ratings_model.recommendForUserSubset(
        user_df, num).select('recommendations')
    rec_rdd = rec_df_raw.rdd \
        .flatMap(lambda x: x['recommendations']) \
        .map(lambda x: (x['business_id'], x['rating'])) \
        .map(lambda x: Row(business_id=x[0], rating=x[1]))
    if rec_rdd.isEmpty():
        return []
    rec_df = spark.createDataFrame(rec_rdd) \
        .withColumn('user_id', functions.lit(user_id)) \
        .withColumn('timestamp', functions.current_timestamp())
    try:
        rec_df.write.format('jdbc').options(
            url='jdbc:mysql://localhost/YelpRecommender',
            driver='com.mysql.jdbc.Driver',
            dbtable='Recommend',
            user='******',
            password='******').mode('append').save()
    except Exception as e:
        print('recommend() function in use_model.py\n', str(e))
    # rec_df.show()
    l = list(
        rec_df.select('business_id').rdd.map(
            lambda x: (x['business_id'])).collect())
    return l

def main(inputs, output):
    # main logic starts here
    wiki_schema = types.StructType([
        types.StructField('language', types.StringType()),
        types.StructField('title', types.StringType()),
        types.StructField('views', types.IntegerType()),
        types.StructField('size', types.LongType()),
    ])

    # reading data
    wikiData = spark.read.csv(inputs, schema=wiki_schema, sep=" ").withColumn(
        'hour', path_to_hour(functions.input_file_name()))

    # filtering data
    filteredWikiData = wikiData[(wikiData['language'] == 'en')
                                & (wikiData['title'] != 'Main_Page')
                                & (wikiData['title'] != 'Special:Page')].cache()

    # finding max views per hour
    maxCount = filteredWikiData.groupBy('hour').agg(
        functions.max(filteredWikiData['views']).alias('max'))

    # joining data to obtain hour and title
    joinData = filteredWikiData.join(
        maxCount, filteredWikiData.views == maxCount.max).select(
            filteredWikiData["hour"], filteredWikiData["title"],
            filteredWikiData["views"])

    # sorting data based on hour and storing it in a json file
    joinData.sort(functions.asc('hour')).write.json(output, mode='overwrite')

def _generate_select_expression_for_extended_string_to_long(source_column, name):
    """
    More robust conversion from StringType to LongType.
    Is able to additionally handle (compared to implicit Spark conversion):

    * Preceding whitespace
    * Trailing whitespace
    * Preceding and trailing whitespace
    * Underscores as thousand separators

    Hint
    ----
    Please have a look at the tests to get a better feeling for how it behaves under
    tests/unit/transformer/test_mapper_custom_data_types.py::TestExtendedStringConversions
    and
    tests/data/test_fixtures/mapper_custom_data_types_fixtures.py

    Example
    -------
    >>> from spooq.transformer import Mapper
    >>>
    >>> input_df.head(3)
    [Row(input_string=" 21474836470 "),
     Row(input_string="Hello"),
     Row(input_string="21_474_836_470")]
    >>> mapping = [("output_value", "input_string", "extended_string_to_long")]
    >>> output_df = Mapper(mapping).transform(input_df)
    >>> output_df.head(3)
    [Row(output_value=21474836470),
     Row(output_value=None),
     Row(output_value=21474836470)]
    """
    return F.regexp_replace(F.trim(source_column), "_", "").cast(T.LongType()).alias(name)

def _generate_select_expression_for_timestamp_s_to_s(source_column, name):
    """
    This constructor is used for Unix timestamps. In addition to being cast and
    renamed, the values are cleaned: if a value is not between `01.01.1970` and
    `31.12.2099`, NULL will be returned instead.
    Cast to :any:`pyspark.sql.types.LongType`

    Example
    -------
    >>> from pyspark.sql import Row
    >>> from spooq.transformer import Mapper
    >>>
    >>> input_df = spark.createDataFrame([
    >>>     Row(time_sec=1581540839),  # 02/12/2020 @ 8:53pm (UTC)
    >>>     Row(time_sec=-4887839),    # Invalid!
    >>>     Row(time_sec=4737139200)   # 02/12/2120 @ 12:00am (UTC)
    >>> ])
    >>>
    >>> mapping = [("unix_ts", "time_sec", "timestamp_s_to_s")]
    >>> output_df = Mapper(mapping).transform(input_df)
    >>> output_df.head(3)
    [Row(unix_ts=1581540839), Row(unix_ts=None), Row(unix_ts=None)]

    Note
    ----
    *input* in **seconds**
    *output* in **seconds**
    """
    return (F.when(source_column.between(MIN_TIMESTAMP_SEC, MAX_TIMESTAMP_SEC),
                   source_column)
            .otherwise(F.lit(None))
            .cast(T.LongType())
            .alias(name))

def _numpy_to_spark_mapping():
    """Returns a mapping from numpy to pyspark.sql type. Caches the mapping
    dictionary in order to avoid instantiation of multiple objects in each call."""

    # Refer to the cache attribute of the function via a variable holding its name,
    # instead of 'dot' notation, to avoid copy/paste/typo mistakes
    cache_attr_name = 'cached_numpy_to_pyspark_types_map'
    if not hasattr(_numpy_to_spark_mapping, cache_attr_name):
        import pyspark.sql.types as T

        setattr(_numpy_to_spark_mapping, cache_attr_name, {
            np.int8: T.ByteType(),
            np.uint8: T.ShortType(),
            np.int16: T.ShortType(),
            np.uint16: T.IntegerType(),
            np.int32: T.IntegerType(),
            np.int64: T.LongType(),
            np.float32: T.FloatType(),
            np.float64: T.DoubleType(),
            np.string_: T.StringType(),
            np.str_: T.StringType(),
            np.unicode_: T.StringType(),
            np.bool_: T.BooleanType(),
        })

    return getattr(_numpy_to_spark_mapping, cache_attr_name)

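# A minimal usage sketch (not part of the original module): resolve the Spark SQL
# type that corresponds to a numpy dtype via the cached mapping built above.
import numpy as np
import pyspark.sql.types as T

spark_type = _numpy_to_spark_mapping()[np.int64]
assert isinstance(spark_type, T.LongType)
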
def main():
    args = parseArguments()
    spark = SparkSession.builder.getOrCreate()
    Logger = spark._jvm.org.apache.log4j.Logger
    joblogger = Logger.getLogger(__name__)
    joblogger.info(
        '****************************************************************')
    joblogger.info('')
    joblogger.info('Starting creation of test data file with {0} rows and {1} '
                   'partitions at {2}'.format(args.rows, args.partitions,
                                              args.outfile))
    joblogger.info('')
    joblogger.info(
        '****************************************************************')

    udfGetUUID = F.udf(getUUID, T.StringType())
    df = (spark.range(0, args.rows, numPartitions=args.partitions)
          .withColumn('value', udfGetUUID())
          .withColumn('prefix2', F.substring(F.col('value'), 1, 2))
          .withColumn('prefix4', F.substring(F.col('value'), 1, 4))
          .withColumn('prefix8', F.substring(F.col('value'), 1, 8))
          .withColumn('float_val', F.rand(seed=8675309) * 1000000)
          .withColumn('integer_val', F.col('float_val').cast(T.LongType()))
          .drop('id'))
    df.write.csv(args.outfile, mode='overwrite', header=True)
    joblogger.info('Done writing to {0}'.format(args.outfile))