Example #1
 def __init__(self, input_file):
     super(CSVExtractor, self).__init__()
     self.input_file = input_file
     self.spark = SparkSession.Builder()\
         .enableHiveSupport()\
         .appName('spooq.extractor: {nm}'.format(nm=self.name))\
         .getOrCreate()
Example #2
 def __init__(
     self,
     db_name,
     table_name,
     partition_definitions=[{
         "column_name": "dt",
         "column_type": "IntegerType",
         "default_value": None
     }],
     clear_partition=True,
     repartition_size=40,
     auto_create_table=True,
     overwrite_partition_value=True,
 ):
     super(HiveLoader, self).__init__()
     self._assert_partition_definitions_is_valid(partition_definitions)
     self.partition_definitions = partition_definitions
     self.db_name = db_name
     self.table_name = table_name
     self.full_table_name = db_name + "." + table_name
     self.repartition_size = repartition_size
     if clear_partition and not overwrite_partition_value:
         raise ValueError(
             "clear_partition is only supported if overwrite_partition_value is also enabled. "
             "This would otherwise result in clearing partitions on the basis of dynamic values "
             "(from the dataframe) instead of explicitly defined partition(s) to clear."
         )
     self.clear_partition = clear_partition
     self.overwrite_partition_value = overwrite_partition_value
     self.auto_create_table = auto_create_table
     self.spark = (SparkSession.Builder().enableHiveSupport().appName(
         "spooq.extractor: {nm}".format(nm=self.name)).getOrCreate())
Example #3
def getSparkSession():

    #conf = SparkConf().setAppName("sanity_check_framework").setMaster("yarn-client")
    #sc = SparkContext(conf=conf)
    #spark = SQLContext(sc)
    spark = SparkSession.Builder().master("yarn-client").appName("Data Validation").enableHiveSupport().getOrCreate()
    return spark
Example #4
def main():
    spark = SparkSession.Builder().getOrCreate()
    # load dataset
    # datapath = os.path.dirname(os.path.dirname(os.path.abspath(sys.argv[0])))
    # dataset = spark.read.format('libsvm').json(datapath+'/data/business.json')

    filename = '/Users/nicolasg-chausseau/Downloads/yelp_dataset/business_MTL_ONLY.json'
    # filename = '/Users/nicolasg-chausseau/Downloads/yelp_dataset/review_MTL_ONLY.json'
    dataset = spark.read.json(filename)  # .json() sets the format itself, so format('libsvm') was redundant
    print(dataset)

    # get longitude and latitude
    ll = dataset.select(dataset.categories[0], dataset.longitude,
                        dataset.latitude)
    ll = ll.withColumnRenamed('categories[0]', 'categories')

    ll.show()

    print(ll.schema.names)
    # for item in ll.schema.names:
    #   print(item)
    #   for item2 in item:
    #     print(item2)
    sys.exit()  # NOTE: exits here, so the clustering code below never runs
    # convert ll to dense vectors
    # data =ll.rdd.map(lambda x:(Vectors.dense(float(x[0]), float(x[1])),)).collect()
    assembler = VectorAssembler(inputCols=['longitude', 'latitude'],
                                outputCol='features')

    df = assembler.transform(ll)

    # set KMeans k and seed
    kmeans = KMeans(k=4, seed=1)

    # generate model
    model = kmeans.fit(df)

    # Make predictions
    predictions = model.transform(df)
    predictions.show(20)
    # Evaluate clustering by computing Silhouette score
    evaluator = ClusteringEvaluator()

    silhouette = evaluator.evaluate(predictions)
    print("Silhouette with squared euclidean distance = " + str(silhouette))

    # number of location in each cluster
    print('Number of business in each cluster: ')
    predictions.groupBy('prediction').count().sort(desc('count')).show()

    # show in which cluster do we have more restaurants
    print('Number of restaurant per clusters')
    predictions.where(predictions.categories == 'Restaurants').groupBy(
        'prediction').count().sort(desc('count')).show()

    # Shows the result.
    centers = model.clusterCenters()
    print("Cluster Centers: ")
    for center in centers:
        print(center)
Example #5
 def __init__(self, jdbc_options, cache=True):
     super(JDBCExtractor, self).__init__()
     self._assert_jdbc_options(jdbc_options)
     self.jdbc_options = jdbc_options
     self.cache = cache
     self.spark = (SparkSession.Builder().enableHiveSupport().appName(
         "spooq.extractor: {nm}".format(nm=self.name)).getOrCreate())
Example #6
def getSparkSession():
    spark = SparkSession.Builder().master("yarn-client").appName("Data Validation") \
        .config("hive.exec.dynamic.partition", "true") \
        .config("hive.exec.dynamic.partition.mode", "nonstrict") \
        .config("hive.warehouse.data.skipTrash", "true") \
        .enableHiveSupport() \
        .getOrCreate()

    return spark
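
# A hypothetical follow-up sketch (not part of the original snippet): the dynamic
# partition settings above are what make a partitioned Hive insert like this work.
# The table name and the DataFrame are made up for illustration only.
def write_partitioned(df, table_name="my_db.my_partitioned_table"):
    # relies on hive.exec.dynamic.partition(.mode) being enabled on the session above
    df.write.mode("overwrite").insertInto(table_name)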
Example #7
    def spark_session_builder(self) -> SparkSession.Builder:
        builder = SparkSession.Builder()

        joined_config: Dict[str, str] = self.spark_options().copy()
        joined_config.update(self.default_configuration())
        for key, value in joined_config.items():
            builder.config(key, value)

        return builder
Example #8
 def __init__(self, appName):
     self.__spark = SparkSession.Builder().appName(appName).getOrCreate()
     self.__database = None
     self.__user = None
     self.__password = None
     self.__host = None
     self.__port = None
     self.__connect = None
     self.__cursor = None
Example #9
 def __init__(self, input_path=None, base_path=None, partition=None):
     super(JSONExtractor, self).__init__()
     self.input_path = self._get_path(input_path=input_path,
                                      base_path=base_path,
                                      partition=partition)
     self.base_path = base_path
     self.partition = partition
     self.spark = (SparkSession.Builder().enableHiveSupport().appName(
         "spooq.extractor: {nm}".format(nm=self.name)).getOrCreate())
Example #10
 def _create_session(self):
     spark_conf = SparkConf().setAppName(self._app_name)
     spark_conf.set('spark.executor.memory', f'{self._memory}g')
     spark_conf.set('spark.executor.cores', f'{self._cores}')
     spark_conf.set('spark.driver.memory', f'{self._driver_memory}g')
     spark_conf.set('spark.driver.extraClassPath',
                    '/home/ripper/postgresql-42.2.19.jar')
     spark_conf.set('spark.jars.packages',
                    'org.postgresql:postgresql:42.2.19')
     return SparkSession.Builder().config(conf=spark_conf).getOrCreate()
Example #11
def main():
    spark = SparkSession.Builder() \
    .appName("APP_NAME") \
    .config("hive.support.concurrency", "false") \
    .config("spark.sql.crossJoin.enabled", "true") \
    .enableHiveSupport() \
    .master("yarn") \
    .getOrCreate()
    sc = spark.sparkContext


    spark.udf.register("func", func)
    spark.sql("select cast(func('EUR', 'USD', '2004-07-01') as double) as rate,src_sys_nm from radar.src_sys").show()
Example #12
def alsrecommend(n):
    # initialize Spark
    spark = SparkSession.Builder().appName('sql').master('local').getOrCreate()
    # MySQL connection settings (edit as needed)
    prop = {'user': '******',
            'password': '******',
            'driver': 'com.mysql.jdbc.Driver'}
    # database URL (edit as needed)
    url = 'jdbc:mysql://192.168.222.3:3306/laoba'
    # read the score table
    data = spark.read.jdbc(url=url, table='score', properties=prop)
    scoreData = data.createOrReplaceTempView("scoredata")
    result = spark.sql("select user_id,film_id,score from scoredata")
    users = spark.sql("select user_id from scoredata")
    users = [int(row.user_id)for row in users.distinct().collect()]
    #print(users)
    score = result.rdd.map(lambda x :(x[0],x[1],x[2]))
    #score.take(10)
    model = ALS.train(score,20,25,0.01)

    recresult = []
    for i in users:
        recresult.extend(model.recommendProducts(i,10))
    #print(recresult)
    a =1
    resultdata =[]
    for row in recresult:
        resultdata.append((a,row.user,row.product,round(row.rating,2)))
        a+=1

    #resultdata = [(row.index,row.user,row.product,round(row.rating, 2)) for row in recresult]
    #print(resultdata)
    #print(len(resultdata))
    result_df = spark.createDataFrame(resultdata,schema=["id","user_id","movie_id","rating"])
    #result_df.show()
    result_df.write.jdbc(url=url,table="recommend",mode="overwrite",properties=prop)
    spark.stop()
    print("Already running...........%d"%n)
    time.sleep(10)
Example #13
def data_anlysis():

    inputFile = r"data/vehicles.csv"

    # inputFile = 'tesvehicles.json'
    spark = SparkSession.Builder().appName('VH').getOrCreate()
    df = spark.read.csv(inputFile)

    print("Loading vehicles from " + inputFile)

    # prev_count = count_files_in_folder(inputPath)
    # input = hiveCtx.read.json(inputFile)
    # input.registerTempTable("vehicles")
    df.show()  # show() prints the top rows itself and returns None

    print(
        colored("2. filter out now span data:",
                "blue",
                attrs=["reverse", "blink"]))
    print('schema:')
    df.printSchema()  # printSchema() also prints directly and returns None
Example #14
# read and write json files
#pip3 install langdetect

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from langdetect import detect
from pyspark.sql.types import *

import json

spark = SparkSession.Builder().appName("json").master("local[2]").getOrCreate()

sc = spark.sparkContext

#data = json.load(open("/home/user/workarea/projects/learn-pyspark/data/colors.json","r"))

#rdd = sc.parallelize(data)

#register a udf for language detection


def detect_tweet_lang(s):
    return detect(s)


spark.udf.register("detect_tweet_lang", detect_tweet_lang)
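
# A minimal usage sketch (not part of the original snippet): the registered UDF can
# be called from Spark SQL once a temporary view exists; the tiny DataFrame below
# is made up purely for illustration.
demo = spark.createDataFrame([("hello world",), ("bonjour tout le monde",)], ["text"])
demo.createOrReplaceTempView("tweets_demo")
spark.sql("select text, detect_tweet_lang(text) as lang from tweets_demo").show()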
"""
data = spark.read.format("json").\
        option("multiline","true").\
        option("mode","FAILFAST").\
        load("/home/user/workarea/projects/learn-pyspark/data/source/tweets.json")
Example #15
            raise


if __name__ == "__main__":
    """Submit Spark application in Step:
       --jars s3_bucket/program/postgresql-42.1.4.jar \
       --py-files s3_bucket/program/para2dis-0.1.egg \
       py/est_distance_spark.py s3_bucket:program/local.conf
   """
    if len(sys.argv) != 2:
        sys.exit(1)
    else:
        # We use a property file to configure the environment
        conf_on_s3 = sys.argv[1]

    spark = SparkSession.Builder().appName('distance').getOrCreate()

    get_file_on_s3(conf_on_s3, local_file)

    config = configparser.ConfigParser()
    config.read(local_file)

    DRIVER = config.get('database', 'driver')
    URL = config.get('database', 'url')
    USER = config.get('database', 'user')
    PASSWORD = config.get('database', 'password')
    TODB = config.get('database', 'isUsed')

    prop = {'driver': DRIVER, 'user': USER, 'password': PASSWORD}

    df = spark.read.jdbc(URL, main_table, properties=prop)
Example #16
# @author: Yang, zhen-peng (Arvin)

## Spark Application - execute with spark-submit:spark-submit app.py

# Imports
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql import types
from pyspark.sql import functions as func


# Module Constants
# note: Builder() and getOrCreate(), don't miss ()
APP_NAME = "Spark SQL Demo"
spark = SparkSession.Builder() \
    .appName(APP_NAME) \
    .master("local") \
    .getOrCreate()


# Closure Functions


# Main functionality
def main(spark: SparkSession):

    sc = spark.sparkContext
    # load data from textFile, return text file RDD
    t005t = sc.textFile("C:\\200836_az_fi1_105104.T005T_p1.TXT") \
        .map(lambda line: line.split("|")) \
        .map(lambda t: Row(mandt=t[0], spras=t[1], land1=t[2], landx=t[3], natio=t[4]))
    # create DataFrame by RDD
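    # a minimal sketch of the step announced above (the original snippet is cut off
    # here): build the DataFrame from the Row RDD and take a quick look at it
    t005t_df = spark.createDataFrame(t005t)
    t005t_df.printSchema()
    t005t_df.show(5)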
Example #17
#spark certification practice - Spark Definitive Guide chapter-6

from pyspark.sql import SparkSession
from pyspark.sql.functions import desc, col, window, column, date_format, pow, round, bround, corr, coalesce
import time

spark = SparkSession.Builder().appName("test13").master(
    "local[3]").getOrCreate()

df=spark.read.format("csv")\
        .option("header","true")\
        .option("inferSchema","true")\
        .load("/home/user/workarea/projects/Spark-The-Definitive-Guide/data/retail-data/by-day/2010-12-01.csv")

df.printSchema()
df.createOrReplaceTempView("dfTable")

#converting to spark datatypes

from pyspark.sql.functions import lit

df.select(lit(5), lit("five"), lit(5.0))

#use of boolean expressions

df.where(col("InvoiceNo") == "536365")\
        .select("InvoiceNo","Description")\
        .show(5,False)

#another way
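
#a minimal sketch of one possible "another way" (the original snippet is cut off
#here): the same predicate can be written as a SQL expression string
df.where("InvoiceNo = 536365")\
        .select("InvoiceNo","Description")\
        .show(5,False)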
Example #18
from pyspark.sql import SparkSession

spark = SparkSession.Builder().appName('Exercise').getOrCreate()

df = spark.read.csv('sanket.csv', inferSchema=True, header=True)
df.show(3)


from pyspark.sql.functions import col

df.groupBy("category") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()
    
from pyspark.ml.feature import RegexTokenizer,StopWordsRemover,CountVectorizer  
from pyspark.ml.feature import HashingTF, IDF, StringIndexer
from pyspark.ml import Pipeline

#for tokenization using regular expression
regexTokenizer = RegexTokenizer(inputCol="article", outputCol="words", pattern="\\W")

# Stop words remover
add_stopwords = ["http","https","amp","rt"] 
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filterwords").setStopWords(add_stopwords)

# bag of words count
countVectors = CountVectorizer(inputCol="filterwords", outputCol="features", vocabSize=10000, minDF=5)

# TF-IDF 
hashingTF = HashingTF(inputCol="filterwords", outputCol="rawFeatures", numFeatures=10000)
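
# IDF, StringIndexer and Pipeline are imported above but the snippet stops before
# using them; a minimal sketch of how these stages are typically chained together
# (assuming "category" is the label column, as in the groupBy above):
idf = IDF(inputCol="rawFeatures", outputCol="idfFeatures", minDocFreq=5)
labelIndexer = StringIndexer(inputCol="category", outputCol="label")
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, hashingTF, idf, labelIndexer])
pipelineModel = pipeline.fit(df)
dataset = pipelineModel.transform(df)
dataset.select("category", "label", "idfFeatures").show(5)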
Example #19

def get_user_business(rating, user_mean, item_mean, rating_global_mean):
    return rating - (user_mean + item_mean - rating_global_mean)


def get_final_ratings(i, user_mean, item_mean, global_average_rating):
    final_ratings = i + user_mean + item_mean - global_average_rating
    return final_ratings
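
# A quick worked example of the two helpers above (illustrative numbers only):
# with rating = 4.0, user_mean = 3.5, item_mean = 3.8 and global mean = 3.7,
# get_user_business gives 4.0 - (3.5 + 3.8 - 3.7) = 0.4, and feeding an ALS
# prediction of 0.4 back through get_final_ratings gives 0.4 + 3.5 + 3.8 - 3.7 = 4.0.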


# sparkConf.set("spark.sql.crossJoin.enabled", "true")
#Then get or create SparkSession by passing this SparkConf
# val sparkSession = SparkSession.builder().config(sparkConf).getOrCreate()

spark = SparkSession.Builder().getOrCreate()
spark.conf.set("spark.sql.crossJoin.enabled", "true")

seed = 1  # int(sys.argv[SEED])
# datapath = os.path.dirname(os.path.dirname(os.path.abspath(sys.argv[0])))
# rdd = spark.read.json(datapath+'/data/review_truncated_RAW.json').rdd

# filename = '/Users/nicolasg-chausseau/Downloads/yelp_dataset/review.json'
# filename = '../data/review_50K_0.json'
filename = 'review_50K_0.json'
# filename = '/Users/nicolasg-chausseau/Downloads/yelp_dataset/review_MTL_ONLY.json'
# filename = '/Users/nicolasg-chausseau/big_data_project_yelp/data/review_truncated_RAW.json'
rdd = spark.read.json(filename).limit(
    100).rdd  # datapath+'/data/review_truncated_RAW.json'
# TODO: put the limit above back to 100,000
Example #20
from boto3 import s3
from pyspark.sql import SparkSession

if __name__ == "__main__":
    sc = SparkSession.Builder().appName("Teste").getOrCreate()
    parquetFile = sc.read.parquet("/home/diego/Documentos/parquet.parquet")
    parquetFile.printSchema()

    view = parquetFile.createOrReplaceTempView("teste")

    sc.sql("select * from teste").show()

    sc.stop()
Example #21
#spark certification practice - chapter 08: Joins

from pyspark.sql import SparkSession
spark = SparkSession.Builder().appName("chapter7").master(
    "local[3]").getOrCreate()

#create datasets

person = spark.createDataFrame([
    (0, "Bill Chambers", 0, [100]),
    (1, "Matei Zaharia", 1, [500, 250, 100]),
    (2, "Michael Armbrust", 1, [250, 100])])\
        .toDF("id", "name", "graduate_program", "spark_status")


graduateProgram = spark.createDataFrame([
    (0, "Masters", "School of Information", "UC Berkeley"),
    (2, "Masters", "EECS", "UC Berkeley"),
    (1, "Ph.D.", "EECS", "UC Berkeley")])\
        .toDF("id", "degree", "department", "school")

sparkStatus = spark.createDataFrame([
    (500, "Vice President"),
    (250, "PMC Member"),
    (100, "Contributor")])\
        .toDF("id", "status")

person.show()
graduateProgram.show()
sparkStatus.show()
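
# The snippet stops before the joins themselves; a minimal sketch of the basic
# inner join these frames are set up for (the join expression is an assumption
# based on the column names above):
joinExpression = person["graduate_program"] == graduateProgram["id"]
person.join(graduateProgram, joinExpression).show()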
Example #22
from pyspark.sql import SparkSession
from pyspark_extensions.helpers import test, init_logger

spark = SparkSession.Builder().appName('etl').master("local[4]").getOrCreate()

spark.sql('select 1').transform(test()).show()

logger = init_logger(spark, __name__)()
logger.info("info")
logger.warn("warn")
logger.error("error")
Example #23
 def __init__(self, appName):
     self.__df = None
     self.__spark = SparkSession.Builder().appName(appName).getOrCreate()
     self.__jdbcUrl = None
     self.__properties = None
Example #24
# -*- coding: UTF-8 -*-
# author: [email protected]
# date: 2016-12-27

# imports
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark import StorageLevel
from pyspark.sql import types

# Module Constants
APP_NAME = __file__
spark = SparkSession.Builder() \
    .appName(APP_NAME) \
    .config("hive.support.concurrency", "false") \
    .config("spark.sql.crossJoin.enabled", "true") \
    .enableHiveSupport() \
    .master("yarn") \
    .getOrCreate()

# Closure Functions


# Main functionality
def main(spark):

    # sql declaration
    bsad = "select bukrs,belnr,gjahr,buzei, blart,rebzg,cpudt,budat, shkzg,sgtxt,kunnr,prctr, kostl,waers,dmbtr, " \
           "monat, wrbtr,dmbe2 from 200836_az_fi1_1051041.bsad"
    bsid = "select bukrs,belnr,gjahr,buzei, blart,rebzg,cpudt,budat, shkzg,sgtxt,kunnr,prctr, kostl,waers,dmbtr, " \
           "monat, wrbtr,dmbe2 from 200836_az_fi1_1051041.bsid"
Example #25
def main():
    spark = SparkSession.Builder().getOrCreate()
    seed = int(sys.argv[SEED])
    datapath = os.path.dirname(os.path.dirname(os.path.abspath(sys.argv[0])))
    rdd = spark.read.json(datapath + '/data/review.json').limit(100000).rdd
    df = spark.createDataFrame(rdd)
    (training, test) = df.randomSplit([0.8, 0.2], seed)
    userIdRdd1 = test.select('user_id').rdd.distinct().zipWithIndex().map(
        lambda x: (x[0][0], x[1]))
    businessIdRdd1 = test.select('business_id').rdd.distinct().zipWithIndex(
    ).map(lambda x: (x[0][0], x[1]))

    # convert to dataframe
    userIdDf2 = spark.createDataFrame(userIdRdd1)\
                    .withColumnRenamed('_1', 'user_id') \
                    .withColumnRenamed('_2', 'user_id_indexed')
    businessIdDf2 = spark.createDataFrame(businessIdRdd1) \
                        .withColumnRenamed('_1', 'business_id') \
                        .withColumnRenamed('_2', 'business_id_indexed')

    # join user id zipped with index and business id with index
    test = test.join(userIdDf2, ['user_id'],
                     'left').join(businessIdDf2, ['business_id'], 'left')

    # get user mean
    user_mean = training.groupBy('user_id').mean('stars').withColumnRenamed(
        'avg(stars)', 'user-mean')

    # get item mean
    business_mean = training.groupBy('business_id').mean(
        'stars').withColumnRenamed('avg(stars)', 'business-mean')

    # join user mean df and training df
    training = training.join(user_mean, ['user_id']) \
            .select(training['user_id'], training['business_id'], training['stars'], user_mean['user-mean'])

    # join item mean df and training df
    training = training.join(business_mean, ['business_id']) \
            .select(training['user_id'], training['business_id'], training['stars'],
                    user_mean['user-mean'], business_mean['business-mean'])

    # get global average
    rating_global_average = training.groupBy().avg('stars').head()[0]

    # add user item interaction to training column
    training = training.withColumn(
        'user-business-interaction',
        get_user_business(training['stars'], user_mean['user-mean'],
                          business_mean['business-mean'],
                          rating_global_average))

    # convert distinct user ids and business ids to integer
    userIdRdd = training.select('user_id').rdd.distinct().zipWithIndex().map(
        lambda x: (x[0][0], x[1]))
    businessIdRdd = training.select('business_id').rdd.distinct().zipWithIndex(
    ).map(lambda x: (x[0][0], x[1]))

    # convert to dataframe
    userIdDf = spark.createDataFrame(userIdRdd)\
                    .withColumnRenamed('_1', 'user_id') \
                    .withColumnRenamed('_2', 'user_id_indexed')
    businessIdDf = spark.createDataFrame(businessIdRdd) \
                        .withColumnRenamed('_1', 'business_id') \
                        .withColumnRenamed('_2', 'business_id_indexed')
    # join user id zipped with index and business id with index
    training = training.join(userIdDf, ['user_id'],
                             'left').join(businessIdDf, ['business_id'],
                                          'left')
    als = ALS(maxIter=5,
              rank=70,
              regParam=0.01,
              userCol='user_id_indexed',
              itemCol='business_id_indexed',
              ratingCol='user-business-interaction',
              coldStartStrategy='drop')
    als.setSeed(seed)
    model = als.fit(training)

    # Evaluate the model by computing the RMSE on the test data
    predictions = model.transform(test)

    predictions = predictions.join(user_mean, ['user_id'], 'left')
    predictions = predictions.join(business_mean, ['business_id'], 'left')
    rating_global_mean = training.groupBy().mean('stars').head()[0]
    predictions = predictions.na.fill(rating_global_mean)
    final_stars = predictions.withColumn(
        'final-stars',
        get_final_ratings(predictions['prediction'], predictions['user-mean'],
                          predictions['business-mean'], rating_global_mean))
    high_stars = final_stars.where(final_stars['final-stars'] >= 3)
    low_stars = final_stars.where(final_stars['final-stars'] < 3)

    evaluator = RegressionEvaluator(metricName='rmse',
                                    labelCol='stars',
                                    predictionCol='final-stars')

    final_stars_rmse = evaluator.evaluate(final_stars)
    print('final stars rmse', float(final_stars_rmse))

    high_stars_rmse = evaluator.evaluate(high_stars)
    print('number of high stars', high_stars.count())
    print('high stars rmse', float(high_stars_rmse))

    print('number of low stars', low_stars.count())
    low_stars_rmse = evaluator.evaluate(low_stars)
    print('low stars rmse', float(low_stars_rmse))
Example #26
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

spark = SparkSession.Builder().appName('Example').getOrCreate()

sales_df = spark.read \
     .option("inferSchema", "true") \
     .option("header", "true") \
     .csv("sales.csv")

result = sales_df.groupBy("COUNTRY_CODE")\
                 .sum("AMOUNT")\
                 .orderBy(desc("sum(AMOUNT)"))

result.show()
Example #27
from pyspark.sql import SparkSession

spark = SparkSession.Builder().appName("rdd-transform").master(
    "local[2]").getOrCreate()

spark.conf.set("logLineage", 'true')

sc = spark.sparkContext

#sc.setLogLevel("INFO")

#set log lineage to true
#--conf spark.logLineage=true

#narrow transformations - doesnt require a shuffle

#wide transformations  - require a shuffle

list1 = [1, 2, 3, 3, 6, 7, 8, 12, 6, 23, 45, 76, 9, 10]
list2 = [1, 2, 3]
list3 = [3, 4, 5]

r1 = sc.parallelize(list1, 20)

print(type(r1))

#r1.persist()

#print(r1.collect())

r3 = r1.map(lambda x: x**2).filter(lambda x: x > 5)
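
#r3 above is a purely narrow chain (map + filter); as an illustrative sketch of a
#wide transformation on the same RDD that does force a shuffle (not part of the
#original snippet):
r4 = r1.map(lambda x: (x % 2, x)).reduceByKey(lambda a, b: a + b)
print(r3.collect())
print(r4.collect())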
Example #28
from para2dis.Prior import Prior
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession.Builder().appName('test').getOrCreate()

    pri = Prior()

    spark.stop()
Example #29
def main():
  spark = SparkSession.Builder().getOrCreate()
  seed = 1  # int(sys.argv[SEED])
  # datapath = os.path.dirname(os.path.dirname(os.path.abspath(sys.argv[0])))
  # rdd = spark.read.json(datapath+'/data/review_truncated_RAW.json').rdd

  filename = '/Users/nicolasg-chausseau/Downloads/yelp_dataset/review.json'
  # filename = '/Users/nicolasg-chausseau/Downloads/yelp_dataset/review_MTL_ONLY.json'
  # filename = '/Users/nicolasg-chausseau/big_data_project_yelp/data/review_truncated_RAW.json'
  rdd = spark.read.json(filename).limit(150000).rdd # datapath+'/data/review_truncated_RAW.json'

  df = spark.createDataFrame(rdd)
  (training, test) = df.randomSplit([0.8, 0.2], seed)
  userIdRdd1 = test.select('user_id').rdd.distinct().zipWithIndex().map(lambda x: (x[0][0], x[1]))
  businessIdRdd1 = test.select('business_id').rdd.distinct().zipWithIndex().map(lambda x: (x[0][0], x[1]))

  # convert to dataframe
  userIdDf2 = spark.createDataFrame(userIdRdd1)\
                  .withColumnRenamed('_1', 'user_id') \
                  .withColumnRenamed('_2', 'user_id_indexed')
  businessIdDf2 = spark.createDataFrame(businessIdRdd1) \
                      .withColumnRenamed('_1', 'business_id') \
                      .withColumnRenamed('_2', 'business_id_indexed')

  # join user id zipped with index and business id with index
  test = test.join(userIdDf2, ['user_id'], 'left').join(businessIdDf2, ['business_id'], 'left')

  # get user mean
  user_mean = training.groupBy('user_id').mean('stars').withColumnRenamed('avg(stars)', 'user-mean')

  # get item mean
  business_mean = training.groupBy('business_id').mean('stars').withColumnRenamed('avg(stars)', 'business-mean')

  # ------------------- NIC stats: ----------------------------
  # num reviews per user:
  usercount = userIdRdd1.count()
  print(usercount)
  numReviews = rdd.count()
  print(numReviews)
  numReviewsPerUser = numReviews / usercount

  # numReviewsPerUser = training.rdd.map(lambda x: (x['user_id'], 1)).reduceByKey(lambda a, b: a+b).map(lambda x: x[1]).reduce(lambda a,b: a+b)
  # numReviewsPerUser = training.groupBy("user_id")
  # .agg(
  #     func.mean("DOWNSTREAM_SIZE").alias("Mean"),
  #     func.stddev("DOWNSTREAM_SIZE").alias("Stddev"),
  #     func.count(func.lit(1)).alias("Num Of Records")
  # )
  # .show(20, False)
  print(numReviewsPerUser)

  # NIC: stats: num reviews per business:
  businesscount = businessIdRdd1.count()
  print(businesscount)
  numReviews = rdd.count()
  print(numReviews)
  numReviewsPerBusiness = numReviews / businesscount

  # numReviewsPerBusiness = training.rdd.map(lambda x: (x['business_id'], 1)).reduce(lambda a, b: a+b).avg()
  print("numReviewsPerUser ==> ", numReviewsPerUser)
  print("numReviewsPerBusiness ==> ", numReviewsPerBusiness)
  # ------------------- /NIC stats: ----------------------------

  # join user mean df and training df
  training = training.join(user_mean, ['user_id']) \
          .select(training['user_id'], training['business_id'], training['stars'], user_mean['user-mean'])

  # join item mean df and training df
  training = training.join(business_mean, ['business_id']) \
          .select(training['user_id'], training['business_id'], training['stars'],
                  user_mean['user-mean'], business_mean['business-mean'])

  # get global average
  rating_global_average = training.groupBy().avg('stars').head()[0]

  # add user item interaction to training column
  training = training.withColumn('user-business-interaction',
                                  get_user_business(training['stars'],
                                                user_mean['user-mean'],
                                                business_mean['business-mean'],
                                                rating_global_average))

  # convert distinct user ids and business ids to integer
  userIdRdd = training.select('user_id').rdd.distinct().zipWithIndex().map(lambda x: (x[0][0], x[1]))
  businessIdRdd = training.select('business_id').rdd.distinct().zipWithIndex().map(lambda x: (x[0][0], x[1]))

  # convert to dataframe
  userIdDf = spark.createDataFrame(userIdRdd)\
                  .withColumnRenamed('_1', 'user_id') \
                  .withColumnRenamed('_2', 'user_id_indexed')
  businessIdDf = spark.createDataFrame(businessIdRdd) \
                      .withColumnRenamed('_1', 'business_id') \
                      .withColumnRenamed('_2', 'business_id_indexed')
  # join user id zipped with index and business id with index
  training = training.join(userIdDf, ['user_id'], 'left').join(businessIdDf, ['business_id'], 'left')
  als = ALS(maxIter=6,
            rank=10,  # ORIGINAL
            # rank=3,
            regParam=0.01,
            # regParam=0.1,
            userCol='user_id_indexed',
            itemCol='business_id_indexed',
            ratingCol='user-business-interaction',
            coldStartStrategy='drop')
  als.setSeed(seed)
  model = als.fit(training)

  # Evaluate the model by computing the RMSE on the test data
  predictions = model.transform(test)

  predictions = predictions.join(user_mean, ['user_id'],'left')
  predictions = predictions.join(business_mean, ['business_id'], 'left')
  rating_global_mean = training.groupBy().mean('stars').head()[0]
  predictions = predictions.na.fill(rating_global_mean)
  final_stars = predictions.withColumn('final-stars', get_final_ratings(predictions['prediction'],
                                          predictions['user-mean'],
                                          predictions['business-mean'],
                                          rating_global_mean))
  evaluator = RegressionEvaluator(metricName='rmse',
                                  labelCol='stars',
                                  predictionCol='final-stars')
  rmse = evaluator.evaluate(final_stars)
  print(float(rmse))
  print("numReviewsPerUser ==> ", numReviewsPerUser)
  print("numReviewsPerBusiness ==> ", numReviewsPerBusiness)
Example #30
from pyspark.sql import SparkSession
from pyspark.sql.functions import concat, col, udf
from pyspark.sql.types import IntegerType
from pyspark.ml.linalg import Vectors, VectorUDT  # for DataFrames, ml.linalg is used (not mllib.linalg)
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans, BisectingKMeans, GaussianMixture
import numpy as np

spark = SparkSession.Builder().master('local').appName('twitter').getOrCreate()
sc = spark.sparkContext

train_df = spark.read.csv('/home/luminar/Downloads/twitter datas/train.csv',
                          header=True,
                          inferSchema=True)
train_df.show()
test_df = spark.read.csv('/home/luminar/Downloads/twitter datas/test.csv',
                         header=True,
                         inferSchema=True)
test_df.show()

train_df.filter(col('_c11').isNotNull()).show(truncate=False)

adc = [c for c in train_df.columns if c.startswith("_") or c.endswith("Text")]
print(adc)
print(*adc)

train_df = train_df.fillna('').withColumn("ST", concat(*adc))
train_df.show()

train_df.filter(col('ItemID') == 9481).show(truncate=False)
train_df.select('ST').show(truncate=False)
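
# The clustering imports at the top (VectorAssembler, KMeans) are never reached in
# this truncated snippet; a minimal sketch of how they would typically be wired up.
# `numeric_cols` is a placeholder -- the real feature columns depend on the dataset.
numeric_cols = ['ItemID']
assembler = VectorAssembler(inputCols=numeric_cols, outputCol='features')
features_df = assembler.transform(train_df)
kmeans_model = KMeans(k=3, seed=1).fit(features_df)
kmeans_model.transform(features_df).select('features', 'prediction').show(5)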