def __init__(self, input_file):
    super(CSVExtractor, self).__init__()
    self.input_file = input_file
    self.spark = SparkSession.Builder()\
        .enableHiveSupport()\
        .appName('spooq.extractor: {nm}'.format(nm=self.name))\
        .getOrCreate()
def __init__(
    self,
    db_name,
    table_name,
    partition_definitions=[{
        "column_name": "dt",
        "column_type": "IntegerType",
        "default_value": None
    }],
    clear_partition=True,
    repartition_size=40,
    auto_create_table=True,
    overwrite_partition_value=True,
):
    super(HiveLoader, self).__init__()
    self._assert_partition_definitions_is_valid(partition_definitions)
    self.partition_definitions = partition_definitions
    self.db_name = db_name
    self.table_name = table_name
    self.full_table_name = db_name + "." + table_name
    self.repartition_size = repartition_size
    if clear_partition and not overwrite_partition_value:
        raise ValueError(
            "clear_partition is only supported if overwrite_partition_value is also enabled. "
            "This would otherwise result in clearing partitions on the basis of dynamic values "
            "(from the dataframe) instead of explicitly defining the partition(s) to clear."
        )
    self.clear_partition = clear_partition
    self.overwrite_partition_value = overwrite_partition_value
    self.auto_create_table = auto_create_table
    self.spark = (SparkSession.Builder().enableHiveSupport().appName(
        "spooq.extractor: {nm}".format(nm=self.name)).getOrCreate())
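# Hedged usage sketch (not part of the original): constructing the loader defined above.
# The import path and the database/table/partition values are assumptions for illustration.
from spooq.loader import HiveLoader  # hypothetical import path

loader = HiveLoader(
    db_name="analytics",          # hypothetical database
    table_name="daily_events",    # hypothetical table
    partition_definitions=[{
        "column_name": "dt",
        "column_type": "IntegerType",
        "default_value": 20201231,  # explicit partition value to write/clear
    }],
    clear_partition=True,
    repartition_size=40,
)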
def getSparkSession():
    # conf = SparkConf().setAppName("sanity_check_framework").setMaster("yarn-client")
    # sc = SparkContext(conf=conf)
    # spark = SQLContext(sc)
    spark = SparkSession.Builder().master("yarn-client").appName("Data Validation").enableHiveSupport().getOrCreate()
    return spark
import sys

from pyspark.sql import SparkSession
from pyspark.sql.functions import desc
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator


def main():
    spark = SparkSession.Builder().getOrCreate()

    # load dataset
    # datapath = os.path.dirname(os.path.dirname(os.path.abspath(sys.argv[0])))
    # dataset = spark.read.format('libsvm').json(datapath+'/data/business.json')
    filename = '/Users/nicolasg-chausseau/Downloads/yelp_dataset/business_MTL_ONLY.json'
    # filename = '/Users/nicolasg-chausseau/Downloads/yelp_dataset/review_MTL_ONLY.json'
    dataset = spark.read.format('libsvm').json(filename)
    print(dataset)

    # get longitude and latitude
    ll = dataset.select(dataset.categories[0], dataset.longitude, dataset.latitude)
    ll = ll.withColumnRenamed('categories[0]', 'categories')
    ll.show()
    print(ll.schema.names)
    # for item in ll.schema.names:
    #     print(item)
    #     for item2 in item:
    #         print(item2)
    # sys.exit()  # early debug exit, disabled so the clustering below can run

    # convert ll to dense vectors
    # data = ll.rdd.map(lambda x: (Vectors.dense(float(x[0]), float(x[1])),)).collect()
    assembler = VectorAssembler(inputCols=['longitude', 'latitude'], outputCol='features')
    df = assembler.transform(ll)

    # set KMeans k and seed
    kmeans = KMeans(k=4, seed=1)
    # generate model
    model = kmeans.fit(df)

    # make predictions
    predictions = model.transform(df)
    predictions.show(20)

    # evaluate clustering by computing the Silhouette score
    evaluator = ClusteringEvaluator()
    silhouette = evaluator.evaluate(predictions)
    print("Silhouette with squared euclidean distance = " + str(silhouette))

    # number of locations in each cluster
    print('Number of businesses in each cluster: ')
    predictions.groupBy('prediction').count().sort(desc('count')).show()

    # show which cluster has the most restaurants
    print('Number of restaurants per cluster')
    predictions.where(predictions.categories == 'Restaurants').groupBy(
        'prediction').count().sort(desc('count')).show()

    # show the result
    centers = model.clusterCenters()
    print("Cluster Centers: ")
    for center in centers:
        print(center)
def __init__(self, jdbc_options, cache=True):
    super(JDBCExtractor, self).__init__()
    self._assert_jdbc_options(jdbc_options)
    self.jdbc_options = jdbc_options
    self.cache = cache
    self.spark = (SparkSession.Builder().enableHiveSupport().appName(
        "spooq.extractor: {nm}".format(nm=self.name)).getOrCreate())
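# Hedged sketch (not part of the original) of the jdbc_options argument expected above.
# The exact keys checked by _assert_jdbc_options are not shown in this snippet; these are
# the standard Spark JDBC reader options, with hypothetical connection values.
jdbc_options = {
    "url": "jdbc:postgresql://db-host:5432/sales_db",  # hypothetical host and database
    "driver": "org.postgresql.Driver",
    "user": "reader",                                  # hypothetical credentials
    "password": "secret",
}
# A dict like this would then be passed to the extractor's constructor, e.g.
# SomeJDBCExtractor(jdbc_options=jdbc_options, cache=True).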
def getSparkSession():
    spark = SparkSession.Builder().master("yarn-client").appName("Data Validation") \
        .config("hive.exec.dynamic.partition", "true") \
        .config("hive.exec.dynamic.partition.mode", "nonstrict") \
        .config("hive.warehouse.data.skipTrash", "true") \
        .enableHiveSupport() \
        .getOrCreate()
    return spark
def spark_session_builder(self) -> SparkSession.Builder:
    builder = SparkSession.Builder()
    joined_config: Dict[str, str] = self.spark_options().copy()
    joined_config.update(self.default_configuration())
    for key, value in joined_config.items():
        builder.config(key, value)
    return builder
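# Hedged usage sketch (not part of the original): the enclosing class is not shown above,
# so SessionFactory and the contents of spark_options()/default_configuration() are assumed.
# It only illustrates that the returned Builder can be customised further before getOrCreate().
factory = SessionFactory()
spark = (
    factory.spark_session_builder()
    .appName("example-job")  # hypothetical application name
    .getOrCreate()
)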
def __init__(self, appName):
    self.__spark = SparkSession.Builder().appName(appName).getOrCreate()
    self.__database = None
    self.__user = None
    self.__password = None
    self.__host = None
    self.__port = None
    self.__connect = None
    self.__cursor = None
def __init__(self, input_path=None, base_path=None, partition=None):
    super(JSONExtractor, self).__init__()
    self.input_path = self._get_path(input_path=input_path, base_path=base_path, partition=partition)
    self.base_path = base_path
    self.partition = partition
    self.spark = (SparkSession.Builder().enableHiveSupport().appName(
        "spooq.extractor: {nm}".format(nm=self.name)).getOrCreate())
def _create_session(self):
    spark_conf = SparkConf().setAppName(self._app_name)
    spark_conf.set('spark.executor.memory', f'{self._memory}g')
    spark_conf.set('spark.executor.cores', f'{self._cores}')
    spark_conf.set('spark.driver.memory', f'{self._driver_memory}g')
    spark_conf.set('spark.driver.extraClassPath', '/home/ripper/postgresql-42.2.19.jar')
    spark_conf.set('spark.jars.packages', 'org.postgresql:postgresql:42.2.19')
    return SparkSession.Builder().config(conf=spark_conf).getOrCreate()
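# Hedged follow-up sketch (not part of the original): with the PostgreSQL driver configured
# above, the returned session can read a table over JDBC. Assume `spark` is the SparkSession
# returned by _create_session(); host, database, table and credentials are hypothetical.
df = (
    spark.read.format("jdbc")
    .option("url", "jdbc:postgresql://localhost:5432/mydb")
    .option("dbtable", "public.my_table")
    .option("user", "postgres")
    .option("password", "postgres")
    .load()
)
df.show(5)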
def main():
    spark = SparkSession.Builder() \
        .appName("APP_NAME") \
        .config("hive.support.concurrency", "false") \
        .config("spark.sql.crossJoin.enabled", "true") \
        .enableHiveSupport() \
        .master("yarn") \
        .getOrCreate()
    sc = spark.sparkContext
    spark.udf.register("func", func)
    spark.sql("select cast(func('EUR', 'USD', '2004-07-01') as double) as rate, src_sys_nm from radar.src_sys").show()
import time

from pyspark.sql import SparkSession
from pyspark.mllib.recommendation import ALS


def alsrecommend(n):
    # initialize Spark
    spark = SparkSession.Builder().appName('sql').master('local').getOrCreate()
    # MySQL connection properties (adjust as needed)
    prop = {'user': '******', 'password': '******', 'driver': 'com.mysql.jdbc.Driver'}
    # database URL (adjust as needed)
    url = 'jdbc:mysql://192.168.222.3:3306/laoba'
    # read the table
    data = spark.read.jdbc(url=url, table='score', properties=prop)
    scoreData = data.createOrReplaceTempView("scoredata")
    result = spark.sql("select user_id, film_id, score from scoredata")
    users = spark.sql("select user_id from scoredata")
    users = [int(row.user_id) for row in users.distinct().collect()]
    # print(users)
    score = result.rdd.map(lambda x: (x[0], x[1], x[2]))
    # score.take(10)
    model = ALS.train(score, 20, 25, 0.01)
    recresult = []
    for i in users:
        recresult.extend(model.recommendProducts(i, 10))
    # print(recresult)
    a = 1
    resultdata = []
    for row in recresult:
        resultdata.append((a, row.user, row.product, round(row.rating, 2)))
        a += 1
    # resultdata = [(row.index, row.user, row.product, round(row.rating, 2)) for row in recresult]
    # print(resultdata)
    # print(len(resultdata))
    result_df = spark.createDataFrame(resultdata, schema=["id", "user_id", "movie_id", "rating"])
    # result_df.show()
    result_df.write.jdbc(url=url, table="recommend", mode="overwrite", properties=prop)
    spark.stop()
    print("Already running...........%d" % n)
    time.sleep(10)
from pyspark.sql import SparkSession
from termcolor import colored


def data_anlysis():
    inputFile = r"data/vehicles.csv"
    # inputFile = 'tesvehicles.json'
    spark = SparkSession.Builder().appName('VH').getOrCreate()
    df = spark.read.csv(inputFile)
    print("Loading vehicles from " + inputFile)
    # prev_count = count_files_in_folder(inputPath)
    # input = hiveCtx.read.json(inputFile)
    # input.registerTempTable("vehicles")
    df.show()  # show() prints the top rows itself and returns None
    print(colored("2. filter out now span data:", "blue", attrs=["reverse", "blink"]))
    print('schema:')
    df.printSchema()  # printSchema() also prints directly rather than returning a string
# read and write json files
# pip3 install langdetect
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from langdetect import detect
from pyspark.sql.types import *
import json

spark = SparkSession.Builder().appName("json").master("local[2]").getOrCreate()
sc = spark.sparkContext

# data = json.load(open("/home/user/workarea/projects/learn-pyspark/data/colors.json", "r"))
# rdd = sc.parallelize(data)

# register a udf for language detection
def detect_tweet_lang(s):
    return detect(s)

spark.udf.register("detect_tweet_lang", detect_tweet_lang)

"""
data = spark.read.format("json").\
    option("multiline", "true").\
    option("mode", "FAILFAST").\
    load("/home/user/workarea/projects/learn-pyspark/data/source/tweets.json")
    raise


if __name__ == "__main__":
    """Submit Spark application in Step:
    --jars s3_bucket/program/postgresql-42.1.4.jar \
    --py-files s3_bucket/program/para2dis-0.1.egg \
    py/est_distance_spark.py s3_bucket:program/local.conf
    """
    if len(sys.argv) != 2:
        sys.exit(1)
    else:
        # We use a property file to configure the environment
        conf_on_s3 = sys.argv[1]
        spark = SparkSession.Builder().appName('distance').getOrCreate()
        get_file_on_s3(conf_on_s3, local_file)
        config = configparser.ConfigParser()
        config.read(local_file)
        DRIVER = config.get('database', 'driver')
        URL = config.get('database', 'url')
        USER = config.get('database', 'user')
        PASSWORD = config.get('database', 'password')
        TODB = config.get('database', 'isUsed')
        prop = {'driver': DRIVER, 'user': USER, 'password': PASSWORD}
        df = spark.read.jdbc(URL, main_table, properties=prop)
# @author: Yang, zhen-peng (Arvin)
## Spark Application - execute with spark-submit: spark-submit app.py

# Imports
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql import types
from pyspark.sql import functions as func

# Module Constants
# note: Builder() and getOrCreate(), don't miss ()
APP_NAME = "Spark SQL Demo"
spark = SparkSession.Builder() \
    .appName(APP_NAME) \
    .master("local") \
    .getOrCreate()

# Closure Functions

# Main functionality
def main(spark: SparkSession):
    sc = spark.sparkContext
    # load data from textFile, return text file RDD
    t005t = sc.textFile("C:\\200836_az_fi1_105104.T005T_p1.TXT") \
        .map(lambda line: line.split("|")) \
        .map(lambda t: Row(mandt=t[0], spras=t[1], land1=t[2], landx=t[3], natio=t[4]))
    # create DataFrame by RDD
# spark certification practice - Spark Definitive Guide chapter-6
from pyspark.sql import SparkSession
from pyspark.sql.functions import desc, col, window, column, date_format, pow, round, bround, corr, coalesce
import time

spark = SparkSession.Builder().appName("test13").master("local[3]").getOrCreate()

df = spark.read.format("csv")\
    .option("header", "true")\
    .option("inferSchema", "true")\
    .load("/home/user/workarea/projects/Spark-The-Definitive-Guide/data/retail-data/by-day/2010-12-01.csv")

df.printSchema()
df.createOrReplaceTempView("dfTable")

# converting to spark datatypes
from pyspark.sql.functions import lit
df.select(lit(5), lit("five"), lit(5.0))

# use of boolean expressions
df.where(col("InvoiceNo") == "536365")\
    .select("InvoiceNo", "Description")\
    .show(5, False)

# another way
from pyspark.sql import SparkSession

spark = SparkSession.Builder().appName('Exercise').getOrCreate()
df = spark.read.csv('sanket.csv', inferSchema=True, header=True)
df.show(3)

from pyspark.sql.functions import col
df.groupBy("category") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.feature import HashingTF, IDF, StringIndexer
from pyspark.ml import Pipeline

# for tokenization using regular expression
regexTokenizer = RegexTokenizer(inputCol="article", outputCol="words", pattern="\\W")
# Stop words remover
add_stopwords = ["http", "https", "amp", "rt"]
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filterwords").setStopWords(add_stopwords)
# bag of words count
countVectors = CountVectorizer(inputCol="filterwords", outputCol="features", vocabSize=10000, minDF=5)
# TF-IDF
hashingTF = HashingTF(inputCol="filterwords", outputCol="rawFeatures", numFeatures=10000)
def get_user_business(rating, user_mean, item_mean, rating_global_mean):
    return rating - (user_mean + item_mean - rating_global_mean)


def get_final_ratings(i, user_mean, item_mean, global_average_rating):
    final_ratings = i + user_mean + item_mean - global_average_rating
    return final_ratings


# sparkConf.set("spark.sql.crossJoin.enabled", "true")
# Then get or create SparkSession by passing this SparkConf
# val sparkSession = SparkSession.builder().config(sparkConf).getOrCreate()
spark = SparkSession.Builder().getOrCreate()
spark.conf.set("spark.sql.crossJoin.enabled", "true")

seed = 1  # int(sys.argv[SEED])
# datapath = os.path.dirname(os.path.dirname(os.path.abspath(sys.argv[0])))
# rdd = spark.read.json(datapath+'/data/review_truncated_RAW.json').rdd
# filename = '/Users/nicolasg-chausseau/Downloads/yelp_dataset/review.json'
# filename = '../data/review_50K_0.json'
filename = 'review_50K_0.json'
# filename = '/Users/nicolasg-chausseau/Downloads/yelp_dataset/review_MTL_ONLY.json'
# filename = '/Users/nicolasg-chausseau/big_data_project_yelp/data/review_truncated_RAW.json'
rdd = spark.read.json(filename).limit(100).rdd  # datapath+'/data/review_truncated_RAW.json'
# TODO: put the limit above back to 100,000
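# Hedged worked example (not part of the original) of the two helpers above: the model is
# trained on the residual left after removing user and business bias, and predictions are
# mapped back by adding the biases again. With a hypothetical rating of 5.0, user mean 4.2,
# business mean 3.8 and global mean 3.7:
residual = get_user_business(5.0, 4.2, 3.8, 3.7)       # 5.0 - (4.2 + 3.8 - 3.7) = 0.7
restored = get_final_ratings(residual, 4.2, 3.8, 3.7)  # 0.7 + 4.2 + 3.8 - 3.7 = 5.0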
from boto3 import s3
from pyspark.sql import SparkSession

if __name__ == "__main__":
    sc = SparkSession.Builder().appName("Teste").getOrCreate()
    parquetFile = sc.read.parquet("/home/diego/Documentos/parquet.parquet")
    parquetFile.printSchema()
    view = parquetFile.createOrReplaceTempView("teste")
    sc.sql("select * from teste").show()
    sc.stop()
# spark certification practice - chapter 08: Joins
from pyspark.sql import SparkSession

spark = SparkSession.Builder().appName("chapter7").master("local[3]").getOrCreate()

# create datasets
person = spark.createDataFrame([
    (0, "Bill Chambers", 0, [100]),
    (1, "Matei Zaharia", 1, [500, 250, 100]),
    (2, "Michael Armbrust", 1, [250, 100])])\
    .toDF("id", "name", "graduate_program", "spark_status")

graduateProgram = spark.createDataFrame([
    (0, "Masters", "School of Information", "UC Berkeley"),
    (2, "Masters", "EECS", "UC Berkeley"),
    (1, "Ph.D.", "EECS", "UC Berkeley")])\
    .toDF("id", "degree", "department", "school")

sparkStatus = spark.createDataFrame([
    (500, "Vice President"),
    (250, "PMC Member"),
    (100, "Contributor")])\
    .toDF("id", "status")

person.show()
graduateProgram.show()
sparkStatus.show()
from pyspark.sql import SparkSession
from pyspark_extensions.helpers import test, init_logger

spark = SparkSession.Builder().appName('etl').master("local[4]").getOrCreate()
spark.sql('select 1').transform(test()).show()

logger = init_logger(spark, __name__)()
logger.info("info")
logger.warn("warn")
logger.error("error")
def __init__(self, appName):
    self.__df = None
    self.__spark = SparkSession.Builder().appName(appName).getOrCreate()
    self.__jdbcUrl = None
    self.__properties = None
# -*- coding: UTF-8 -*-
# author: [email protected]
# date: 2016-12-27

# imports
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark import StorageLevel
from pyspark.sql import types

# Module Constants
APP_NAME = __file__
spark = SparkSession.Builder() \
    .appName(APP_NAME) \
    .config("hive.support.concurrency", "false") \
    .config("spark.sql.crossJoin.enabled", "true") \
    .enableHiveSupport() \
    .master("yarn") \
    .getOrCreate()

# Closure Functions

# Main functionality
def main(spark):
    # sql declaration
    bsad = "select bukrs,belnr,gjahr,buzei, blart,rebzg,cpudt,budat, shkzg,sgtxt,kunnr,prctr, kostl,waers,dmbtr, " \
           "monat, wrbtr,dmbe2 from 200836_az_fi1_1051041.bsad"
    bsid = "select bukrs,belnr,gjahr,buzei, blart,rebzg,cpudt,budat, shkzg,sgtxt,kunnr,prctr, kostl,waers,dmbtr, " \
           "monat, wrbtr,dmbe2 from 200836_az_fi1_1051041.bsid"
def main():
    spark = SparkSession.Builder().getOrCreate()
    seed = int(sys.argv[SEED])
    datapath = os.path.dirname(os.path.dirname(os.path.abspath(sys.argv[0])))
    rdd = spark.read.json(datapath + '/data/review.json').limit(100000).rdd
    df = spark.createDataFrame(rdd)
    (training, test) = df.randomSplit([0.8, 0.2], seed)

    userIdRdd1 = test.select('user_id').rdd.distinct().zipWithIndex().map(lambda x: (x[0][0], x[1]))
    businessIdRdd1 = test.select('business_id').rdd.distinct().zipWithIndex().map(lambda x: (x[0][0], x[1]))

    # convert to dataframe
    userIdDf2 = spark.createDataFrame(userIdRdd1)\
        .withColumnRenamed('_1', 'user_id') \
        .withColumnRenamed('_2', 'user_id_indexed')
    businessIdDf2 = spark.createDataFrame(businessIdRdd1) \
        .withColumnRenamed('_1', 'business_id') \
        .withColumnRenamed('_2', 'business_id_indexed')

    # join user id zipped with index and business id with index
    test = test.join(userIdDf2, ['user_id'], 'left').join(businessIdDf2, ['business_id'], 'left')

    # get user mean
    user_mean = training.groupBy('user_id').mean('stars').withColumnRenamed('avg(stars)', 'user-mean')
    # get item mean
    business_mean = training.groupBy('business_id').mean('stars').withColumnRenamed('avg(stars)', 'business-mean')

    # join user mean df and training df
    training = training.join(user_mean, ['user_id']) \
        .select(training['user_id'], training['business_id'], training['stars'], user_mean['user-mean'])
    # join item mean df and training df
    training = training.join(business_mean, ['business_id']) \
        .select(training['user_id'], training['business_id'], training['stars'],
                user_mean['user-mean'], business_mean['business-mean'])

    # get global average
    rating_global_average = training.groupBy().avg('stars').head()[0]

    # add user item interaction to training column
    training = training.withColumn(
        'user-business-interaction',
        get_user_business(training['stars'], user_mean['user-mean'],
                          business_mean['business-mean'], rating_global_average))

    # convert distinct user ids and business ids to integer
    userIdRdd = training.select('user_id').rdd.distinct().zipWithIndex().map(lambda x: (x[0][0], x[1]))
    businessIdRdd = training.select('business_id').rdd.distinct().zipWithIndex().map(lambda x: (x[0][0], x[1]))

    # convert to dataframe
    userIdDf = spark.createDataFrame(userIdRdd)\
        .withColumnRenamed('_1', 'user_id') \
        .withColumnRenamed('_2', 'user_id_indexed')
    businessIdDf = spark.createDataFrame(businessIdRdd) \
        .withColumnRenamed('_1', 'business_id') \
        .withColumnRenamed('_2', 'business_id_indexed')

    # join user id zipped with index and business id with index
    training = training.join(userIdDf, ['user_id'], 'left').join(businessIdDf, ['business_id'], 'left')

    als = ALS(maxIter=5,
              rank=70,
              regParam=0.01,
              userCol='user_id_indexed',
              itemCol='business_id_indexed',
              ratingCol='user-business-interaction',
              coldStartStrategy='drop')
    als.setSeed(seed)
    model = als.fit(training)

    # Evaluate the model by computing the RMSE on the test data
    predictions = model.transform(test)
    predictions = predictions.join(user_mean, ['user_id'], 'left')
    predictions = predictions.join(business_mean, ['business_id'], 'left')
    rating_global_mean = training.groupBy().mean('stars').head()[0]
    predictions = predictions.na.fill(rating_global_mean)

    final_stars = predictions.withColumn(
        'final-stars',
        get_final_ratings(predictions['prediction'], predictions['user-mean'],
                          predictions['business-mean'], rating_global_mean))
    high_stars = final_stars.where(final_stars['final-stars'] >= 3)
    low_stars = final_stars.where(final_stars['final-stars'] < 3)

    evaluator = RegressionEvaluator(metricName='rmse', labelCol='stars', predictionCol='final-stars')
    final_stars_rmse = evaluator.evaluate(final_stars)
    print('final stars rmse', float(final_stars_rmse))
    high_stars_rmse = evaluator.evaluate(high_stars)
    print('number of high stars', high_stars.count())
    print('high stars rmse', float(high_stars_rmse))
    print('number of low stars', low_stars.count())
    low_stars_rmse = evaluator.evaluate(low_stars)
    print('low stars rmse', float(low_stars_rmse))
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

spark = SparkSession.Builder().appName('Example').getOrCreate()

sales_df = spark.read \
    .option("inferSchema", "true") \
    .option("header", "true") \
    .csv("sales.csv")

result = sales_df.groupBy("COUNTRY_CODE")\
    .sum("AMOUNT")\
    .orderBy(desc("sum(AMOUNT)"))
result.show()
from pyspark.sql import SparkSession spark = SparkSession.Builder().appName("rdd-transform").master( "local[2]").getOrCreate() spark.conf.set("logLineage", 'true') sc = spark.sparkContext #sc.setLogLevel("INFO") #set log lineage to true #--conf spark.logLineage=true #narrow transformations - doesnt require a shuffle #wide transformations - require a shuffle list1 = [1, 2, 3, 3, 6, 7, 8, 12, 6, 23, 45, 76, 9, 10] list2 = [1, 2, 3] list3 = [3, 4, 5] r1 = sc.parallelize(list1, 20) print(type(r1)) #r1.persist() #print(r1.collect()) r3 = r1.map(lambda x: x**2).filter(lambda x: x > 5)
from para2dis.Prior import Prior
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession.Builder().appName('test').getOrCreate()
    pri = Prior()
    spark.stop()
def main():
    spark = SparkSession.Builder().getOrCreate()
    seed = 1  # int(sys.argv[SEED])
    # datapath = os.path.dirname(os.path.dirname(os.path.abspath(sys.argv[0])))
    # rdd = spark.read.json(datapath+'/data/review_truncated_RAW.json').rdd
    filename = '/Users/nicolasg-chausseau/Downloads/yelp_dataset/review.json'
    # filename = '/Users/nicolasg-chausseau/Downloads/yelp_dataset/review_MTL_ONLY.json'
    # filename = '/Users/nicolasg-chausseau/big_data_project_yelp/data/review_truncated_RAW.json'
    rdd = spark.read.json(filename).limit(150000).rdd  # datapath+'/data/review_truncated_RAW.json'
    df = spark.createDataFrame(rdd)
    (training, test) = df.randomSplit([0.8, 0.2], seed)

    userIdRdd1 = test.select('user_id').rdd.distinct().zipWithIndex().map(lambda x: (x[0][0], x[1]))
    businessIdRdd1 = test.select('business_id').rdd.distinct().zipWithIndex().map(lambda x: (x[0][0], x[1]))

    # convert to dataframe
    userIdDf2 = spark.createDataFrame(userIdRdd1)\
        .withColumnRenamed('_1', 'user_id') \
        .withColumnRenamed('_2', 'user_id_indexed')
    businessIdDf2 = spark.createDataFrame(businessIdRdd1) \
        .withColumnRenamed('_1', 'business_id') \
        .withColumnRenamed('_2', 'business_id_indexed')

    # join user id zipped with index and business id with index
    test = test.join(userIdDf2, ['user_id'], 'left').join(businessIdDf2, ['business_id'], 'left')

    # get user mean
    user_mean = training.groupBy('user_id').mean('stars').withColumnRenamed('avg(stars)', 'user-mean')
    # get item mean
    business_mean = training.groupBy('business_id').mean('stars').withColumnRenamed('avg(stars)', 'business-mean')

    # ------------------- NIC stats: ----------------------------
    # num reviews per user:
    usercount = userIdRdd1.count()
    print(usercount)
    numReviews = rdd.count()
    print(numReviews)
    numReviewsPerUser = numReviews / usercount
    # numReviewsPerUser = training.rdd.map(lambda x: (x['user_id'], 1)).reduceByKey(lambda a, b: a+b).map(lambda x: x[1]).reduce(lambda a, b: a+b)
    # numReviewsPerUser = training.groupBy("user_id")
    #     .agg(
    #         func.mean("DOWNSTREAM_SIZE").alias("Mean"),
    #         func.stddev("DOWNSTREAM_SIZE").alias("Stddev"),
    #         func.count(func.lit(1)).alias("Num Of Records")
    #     )
    #     .show(20, False)
    print(numReviewsPerUser)

    # NIC: stats: num reviews per business:
    businesscount = businessIdRdd1.count()
    print(businesscount)
    numReviews = rdd.count()
    print(numReviews)
    numReviewsPerBusiness = numReviews / businesscount
    # numReviewsPerBusiness = training.rdd.map(lambda x: (x['business_id'], 1)).reduce(lambda a, b: a+b).avg()
    print("numReviewsPerUser ==> ", numReviewsPerUser)
    print("numReviewsPerBusiness ==> ", numReviewsPerBusiness)
    # ------------------- /NIC stats: ----------------------------

    # join user mean df and training df
    training = training.join(user_mean, ['user_id']) \
        .select(training['user_id'], training['business_id'], training['stars'], user_mean['user-mean'])
    # join item mean df and training df
    training = training.join(business_mean, ['business_id']) \
        .select(training['user_id'], training['business_id'], training['stars'],
                user_mean['user-mean'], business_mean['business-mean'])

    # get global average
    rating_global_average = training.groupBy().avg('stars').head()[0]

    # add user item interaction to training column
    training = training.withColumn('user-business-interaction',
                                   get_user_business(training['stars'],
                                                     user_mean['user-mean'],
                                                     business_mean['business-mean'],
                                                     rating_global_average))

    # convert distinct user ids and business ids to integer
    userIdRdd = training.select('user_id').rdd.distinct().zipWithIndex().map(lambda x: (x[0][0], x[1]))
    businessIdRdd = training.select('business_id').rdd.distinct().zipWithIndex().map(lambda x: (x[0][0], x[1]))

    # convert to dataframe
    userIdDf = spark.createDataFrame(userIdRdd)\
        .withColumnRenamed('_1', 'user_id') \
        .withColumnRenamed('_2', 'user_id_indexed')
    businessIdDf = spark.createDataFrame(businessIdRdd) \
        .withColumnRenamed('_1', 'business_id') \
        .withColumnRenamed('_2', 'business_id_indexed')

    # join user id zipped with index and business id with index
    training = training.join(userIdDf, ['user_id'], 'left').join(businessIdDf, ['business_id'], 'left')

    als = ALS(maxIter=6,
              rank=10,  # ORIGINAL
              # rank=3,
              regParam=0.01,
              # regParam=0.1,
              userCol='user_id_indexed',
              itemCol='business_id_indexed',
              ratingCol='user-business-interaction',
              coldStartStrategy='drop')
    als.setSeed(seed)
    model = als.fit(training)

    # Evaluate the model by computing the RMSE on the test data
    predictions = model.transform(test)
    predictions = predictions.join(user_mean, ['user_id'], 'left')
    predictions = predictions.join(business_mean, ['business_id'], 'left')
    rating_global_mean = training.groupBy().mean('stars').head()[0]
    predictions = predictions.na.fill(rating_global_mean)

    final_stars = predictions.withColumn('final-stars',
                                         get_final_ratings(predictions['prediction'],
                                                           predictions['user-mean'],
                                                           predictions['business-mean'],
                                                           rating_global_mean))

    evaluator = RegressionEvaluator(metricName='rmse', labelCol='stars', predictionCol='final-stars')
    rmse = evaluator.evaluate(final_stars)
    print(float(rmse))
    print("numReviewsPerUser ==> ", numReviewsPerUser)
    print("numReviewsPerBusiness ==> ", numReviewsPerBusiness)
from pyspark.sql import SparkSession
from pyspark.sql.functions import concat, col, udf
from pyspark.sql.types import IntegerType
from pyspark.ml.linalg import Vectors, VectorUDT  # in the DataFrame-based ML API, ml.linalg is used
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans, BisectingKMeans, GaussianMixture
import numpy as np

spark = SparkSession.Builder().master('local').appName('twitter').getOrCreate()
sc = spark.sparkContext

train_df = spark.read.csv('/home/luminar/Downloads/twitter datas/train.csv', header=True, inferSchema=True)
train_df.show()
test_df = spark.read.csv('/home/luminar/Downloads/twitter datas/test.csv', header=True, inferSchema=True)
test_df.show()

train_df.filter(col('_c11').isNotNull()).show(truncate=False)

adc = [c for c in train_df.columns if c.startswith("_") or c.endswith("Text")]
print(adc)
print(*adc)

train_df = train_df.fillna('').withColumn("ST", concat(*adc))
train_df.show()
train_df.filter(col('ItemID') == 9481).show(truncate=False)
train_df.select('ST').show(truncate=False)