def item_based():
    # Item-based CF: build per-user and per-business rating lookups from the
    # training data, then predict a rating for every validation pair.
    input_file = sc.textFile(train_file)
    train_data = input_file.map(lambda x: x.split(',')).filter(
        lambda x: x[0] != "user_id").persist(
        StorageLevel(True, True, False, False))
    input_file2 = sc.textFile(val_file)
    val_data = input_file2.map(lambda x: x.split(',')).filter(
        lambda x: x[0] != "user_id").persist(
        StorageLevel(True, True, False, False))
    t_users = train_data.map(lambda a: a[0]).distinct().collect()
    t_businesses = train_data.map(lambda a: a[1]).distinct().collect()
    R = len(t_users)
    C = len(t_businesses)
    users_dict = {}
    for u in range(0, R):
        users_dict[t_users[u]] = u
    businesses_dict = {}
    for u in range(0, C):
        businesses_dict[t_businesses[u]] = u
    t_characteristic_matrix = train_data.map(
        lambda x: (x[0], ([businesses_dict[x[1]]], [x[2]]))).reduceByKey(
        lambda x, y: (x[0] + y[0], x[1] + y[1])).persist(
        StorageLevel(True, True, False, False))
    t2 = t_characteristic_matrix.map(lambda x: trans(x))
    dum = {}
    for u in t2.collect():
        dum[u[0]] = u[1]
    ti_characteristic_matrix = train_data.map(
        lambda x: (businesses_dict[x[1]], ([users_dict[x[0]]], [x[2]]))).reduceByKey(
        lambda x, y: (x[0] + y[0], x[1] + y[1])).persist(
        StorageLevel(True, True, False, False))
    ti2 = ti_characteristic_matrix.map(lambda x: trans(x))
    dim = {}
    for u in ti2.collect():
        dim[u[0]] = u[1]
    pres = val_data.map(
        lambda x: (x[0], x[1],
                   i_predict((x[0], x[1], x[2]), dum, dim, businesses_dict))).persist(
        StorageLevel(True, True, False, False))
    ans_file = open(output_file, 'w')
    ans_file.write("user_id, business_id, prediction\n")
    for c in pres.collect():
        ans_file.write(c[0] + "," + c[1] + "," + str(c[2]) + "\n")
    ans_file.close()
    return
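The `trans` helper called above (and again in `user_based` below) is not included in these snippets. A plausible sketch, assuming it only zips the collected column indices with their ratings into a lookup dict (hypothetical reconstruction, not the original code):

# Hypothetical reconstruction of trans; the original helper is not shown here.
def trans(x):
    key, (indices, ratings) = x
    # map each collected column index to its rating (as float) for O(1) lookup
    return (key, {i: float(r) for i, r in zip(indices, ratings)})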
def createCheckInDataPerUser(self):
    review_user = self.sqlContext.sql(
        "SELECT business_id, user_id FROM reviews")
    business_loc = self.sqlContext.sql(
        "SELECT business_id, latitude, longitude FROM business")
    review_user.registerTempTable("reviews_user")
    business_loc.registerTempTable("business_loc")
    self.df_join_reviewAndBusiness = self.sqlContext.sql(
        "SELECT r.user_id, b.latitude, b.longitude FROM reviews_user r "
        "JOIN business_loc b ON r.business_id = b.business_id"
    ).rdd.groupBy(lambda x: x.user_id).persist(
        StorageLevel(True, True, False, True, 1))
    # self.df_join_reviewAndBusiness.repartition(1).saveAsTextFile("user.json")
    self.user_centers = self.df_join_reviewAndBusiness.map(
        getCentersOfUser, preservesPartitioning=True)
    schema_2 = StructType([
        StructField("latitude", FloatType(), True),
        StructField("longitude", FloatType(), True)
    ])
    schema = StructType([
        StructField("cluster_centers", ArrayType(schema_2), True),
        StructField("user_id", StringType(), True)
    ])
    df = self.sqlContext.createDataFrame(self.user_centers.repartition(1),
                                         schema)
    df.save("center.json", "json")
def special_show(self, n=2000, truncate=False, vertical=False,
                 auto_sample=True, seed=None):
    """Special version of show; this changes the default number of rows
    to 2000 and samples the result. Caches the input if not already cached.
    """
    if vertical:
        raise Exception("this doesn't work in fancy notebook mode")
    do_cache = auto_sample and self.storageLevel == StorageLevel(
        False, False, False, False, 1)
    try:
        if do_cache:
            self.cache()
        sampled_df = self
        do_sample = False
        if auto_sample:
            total_count = self.count()
            do_sample = (n < total_count) and auto_sample
            if do_sample:
                fraction = (n * 1.1) / total_count
                sampled_df = self.sample(withReplacement=False,
                                         fraction=fraction,
                                         seed=seed).limit(n)
        pandas_df = sampled_df.toPandas()
        return DataFrameResult(pandas_df, self, do_sample)
    finally:
        if do_cache:
            self.unpersist()
def get_sensor_table(sparkContext, sqlContext):
    start = CONFIG['stat_from'].strftime('%Y%m%d')
    end = CONFIG['stat_to'].strftime('%Y%m%d')
    table = sqlContext.sql('''
        SELECT search_word, product_id,
               sum(VIEW) AS exposure_count,
               sum(click) AS click_count
        FROM (
            SELECT if(a.doc_type = 'global_mall' or a.doc_type = 'global_pop_mall',
                      b.product_id, a.p_material_id) AS product_id,
                   CASE WHEN (a.event_id = 4) THEN 1 ELSE 0 END AS VIEW,
                   CASE WHEN (a.event_id = 3) THEN 1 ELSE 0 END AS click,
                   a.search_word
            FROM (
                SELECT event_id, search_word, p_material_id, doc_type
                FROM (
                    SELECT event_id,
                           regexp_extract(p_params, '^(.*?)&(.*?)$', 1) AS search_word,
                           regexp_extract(p_material_id, '(.*p)?(\\\d+).*', 2) AS p_material_id,
                           regexp_extract(p_material_link, '^.*&type=(.*?)&.*', 1) AS doc_type
                    FROM rawdata.event_ros_p1
                    WHERE DAY >= '%s' AND DAY <= '%s'
                      AND p_material_page = 'product_search_list'
                      AND p_params IS NOT NULL
                      AND (event_id = 4 OR event_id = 3)
                ) st
                WHERE search_word IS NOT NULL AND search_word != ''
                  AND p_material_id IS NOT NULL
                  AND p_material_id rlike '^\\\d+$'
            ) a
            LEFT JOIN mysql.jumei_mall b
              ON a.p_material_id = b.mall_id
              AND (a.doc_type = 'global_mall' OR a.doc_type = 'global_pop_mall')
            WHERE if(a.doc_type = 'global_mall' or a.doc_type = 'global_pop_mall',
                     b.product_id, a.p_material_id) is not null
              AND a.search_word IS NOT NULL AND a.search_word != ''
        ) t
        WHERE product_id IS NOT NULL
        GROUP BY search_word, product_id
    ''' % (start, end))
    table.persist(StorageLevel(True, True, False, False, 1))
    if CONFIG['do_save_table']:
        table.write.saveAsTable('recommend.ecpm_sensor' + CONFIG['table_suffix'],
                                mode='overwrite')
    return table
def user_based():
    # User-based CF: build per-user rating lookups and a business -> users
    # index from the training data, then predict ratings for validation pairs.
    input_file = sc.textFile(train_file)
    train_data = input_file.map(lambda x: x.split(',')).filter(
        lambda x: x[0] != "user_id").persist(
        StorageLevel(True, True, False, False))
    input_file2 = sc.textFile(val_file)
    val_data = input_file2.map(lambda x: x.split(',')).filter(
        lambda x: x[0] != "user_id").persist(
        StorageLevel(True, True, False, False))
    t_businesses = train_data.map(lambda a: a[1]).distinct().collect()
    ncolumns = len(t_businesses)
    businesses_dict = {}
    for u in range(0, ncolumns):
        businesses_dict[t_businesses[u]] = u
    t_characteristic_matrix = train_data.map(
        lambda x: (x[0], ([businesses_dict[x[1]]], [x[2]]))).reduceByKey(
        lambda x, y: (x[0] + y[0], x[1] + y[1])).map(lambda x: trans(x))
    dum = {}
    for u in t_characteristic_matrix.collect():
        dum[u[0]] = u[1]
    businesses_users = train_data.map(lambda x: (x[1], [x[0]])).reduceByKey(
        lambda x, y: x + y)
    dbu = {}
    for bu in businesses_users.collect():
        dbu[bu[0]] = bu[1]
    pres = val_data.map(
        lambda x: (x[0], x[1], predict((x[0], x[1]), dum, dbu, businesses_dict)))
    ans_file = open(output_file, 'w')
    ans_file.write("user_id, business_id, prediction\n")
    for c in pres.collect():
        ans_file.write(c[0] + "," + c[1] + "," + str(c[2]) + "\n")
    ans_file.close()
    return
def getStorageLevel(self):
    """
    Get the RDD's current storage level.

    >>> rdd1 = sc.parallelize([1,2])
    >>> rdd1.getStorageLevel()
    StorageLevel(False, False, False, 1)
    """
    java_storage_level = self._jrdd.getStorageLevel()
    storage_level = StorageLevel(java_storage_level.useDisk(),
                                 java_storage_level.useMemory(),
                                 java_storage_level.deserialized(),
                                 java_storage_level.replication())
    return storage_level
def special_show(self, n=2000, truncate=False, vertical=False,
                 auto_sample=True, seed=None):
    """Special version of a Spark dataframe's `show`.

    This changes the default number of rows to show to 2000 and samples
    the result. Caches the input if not already cached.

    Parameters
    ----------
    - self: a dataframe
    - n (int): number of rows from the dataframe to show (default 2000)
    - truncate (bool): whether to truncate rows (default False)
    - vertical (bool): support fancy notebook mode (default False)
    - auto_sample (bool): whether to sample the dataframe (default True)
    - seed (int): seed for sampling (default None)
    """
    if vertical:
        raise Exception("this doesn't work in fancy notebook mode")
    do_cache = auto_sample and self.storageLevel == StorageLevel(
        False, False, False, False, 1)
    try:
        if do_cache:
            self.cache()
        sampled_df = self
        do_sample = False
        if auto_sample:
            total_count = self.count()
            do_sample = (n < total_count) and auto_sample
            if do_sample:
                fraction = (n * 1.1) / total_count
                sampled_df = self.sample(withReplacement=False,
                                         fraction=fraction,
                                         seed=seed).limit(n)
        pandas_df = sampled_df.toPandas()
        return DataFrameResult(pandas_df, self, do_sample)
    finally:
        if do_cache:
            self.unpersist()
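A short usage sketch for `special_show`; how it is attached to `DataFrame` is not shown in the snippet, so the binding below is an assumption:

# Illustrative only: assumes an existing SparkSession `spark` and that
# special_show has been bound onto DataFrame, e.g.:
# pyspark.sql.DataFrame.special_show = special_show
df = spark.range(1000000).toDF("value")
result = df.special_show(n=500, seed=42)  # samples ~500 rows into a pandas result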
def get_order_table(sparkContext, sqlContext):
    start = CONFIG['stat_from'].strftime('%Y-%m-%d')
    end = CONFIG['stat_to'].strftime('%Y-%m-%d')
    table = sqlContext.sql('''
        select sell_label, product_id as productId,
               sum(quantity * deal_price) as sales_amount
        from bi_datawarehouse.int_paid_orders
        where data_date >= '%s' AND data_date <= '%s'
          and sell_label is not null and sell_label != ""
          and sell_type = 'mSearch'
        group by sell_label, product_id
        order by sales_amount desc
    ''' % (start, end))
    table.persist(StorageLevel(True, True, False, False, 1))
    if CONFIG['do_save_table']:
        table.write.saveAsTable('recommend.ecpm_order' + CONFIG['table_suffix'],
                                mode='overwrite')
    return table
# Wide vs. narrow dependencies:
# "https://github.com/rohgar/scala-spark-4/wiki/Wide-vs-Narrow-Dependencies"

# pandas DataFrame to Spark DataFrame
from pyspark.sql import SparkSession
sqlContext = SparkSession \
    .builder \
    .appName("dataFrame") \
    .getOrCreate()
spark_df = sqlContext.createDataFrame(df)

# pandas DataFrame to Spark RDD
spark.createDataFrame(df).rdd

# Set the cache (storage) level
from pyspark.storagelevel import StorageLevel
StorageLevel.DISK_ONLY = StorageLevel(True, False, False, False)
StorageLevel.DISK_ONLY_2 = StorageLevel(True, False, False, False, 2)
StorageLevel.MEMORY_ONLY = StorageLevel(False, True, False, False)
StorageLevel.MEMORY_ONLY_2 = StorageLevel(False, True, False, False, 2)
StorageLevel.MEMORY_AND_DISK = StorageLevel(True, True, False, False)
StorageLevel.MEMORY_AND_DISK_2 = StorageLevel(True, True, False, False, 2)
StorageLevel.OFF_HEAP = StorageLevel(True, True, True, False, 1)

# Use a random forest in Spark
from pyspark.mllib.tree import RandomForest, RandomForestModel
model = RandomForest.trainClassifier(trainingData, numClasses=2,
                                     categoricalFeaturesInfo={},
                                     numTrees=3,
                                     featureSubsetStrategy="auto",
                                     impurity='gini', maxDepth=4, maxBins=32)

# Save a Spark RDD as a local text file
rdd.saveAsTextFile(file_name)

# Take a specified number of elements from an RDD as a list
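A brief hedged example of persisting with one of the named levels above and of the final note, taking a fixed number of elements back as a list; it assumes an existing SparkContext `sc` and illustrative values:

# Illustrative: assumes an existing SparkContext `sc`
rdd = sc.parallelize(range(100))
rdd.persist(StorageLevel.MEMORY_AND_DISK)  # cache at a named storage level
first_ten = rdd.take(10)                   # first 10 elements as a Python list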
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.storagelevel import StorageLevel

conf = SparkConf().setAppName("wordcount").setMaster("local")
sc = SparkContext(conf=conf)
sc.setCheckpointDir("./chk")
lines = sc.textFile("./text")
words = lines.flatMap(lambda line: line.split(" "))
pairs = words.map(lambda word: (word, 1))
pairs.persist(storageLevel=StorageLevel(True, True, False, False, 3))
result = pairs.reduceByKey(lambda a, b: a + b)
result.checkpoint()
sorted_result = result.sortBy(lambda kv: kv[1], False)
# sorted_result = result.map(lambda kv: (kv[1], kv[0])).sortByKey(False) \
#     .map(lambda kv: (kv[1], kv[0]))
temp = sorted_result.take(3)
# num = result.count()
# temp = result.collect()
print(temp)
# result.saveAsTextFile("./my_result")
# initiate
sc = SparkContext('local[*]', 'inf553_hw2_1')
sc.setLogLevel("OFF")
case_mark = int(sys.argv[1])
S = int(sys.argv[2])
input_file = sc.textFile(sys.argv[3])  # readfile
output_file = sys.argv[4]
data = input_file.distinct().map(lambda x: x.split(',')).filter(
    lambda x: x[0] != "user_id")
# create basket
if case_mark == 1:
    baskets = data.groupByKey().map(lambda x: (x[0], list(x[1]))).persist(
        StorageLevel(True, True, False, False))
elif case_mark == 2:
    baskets = data.map(lambda x: (x[1], x[0])).groupByKey().map(
        lambda x: (x[0], list(x[1]))).persist(
        StorageLevel(True, True, False, False))
else:
    exit(-1)
# baskets = baskets.coalesce(1)
# data.unpersist()
N = baskets.count()

# SON algorithm
# Pass 1
# Pass 1 Map
data_review = input_file_review.map(lambda a: json.loads(a)).map(
    lambda a: (a['business_id'], a['stars']))
input_file_business = sc.textFile(sys.argv[2])
data_business = input_file_business.map(lambda a: json.loads(a)).map(
    lambda a: (a['business_id'], a['state']))
data = data_review.join(data_business)
mstatestar = data.map(lambda x: (x[1][1], (x[1][0], 1))).reduceByKey(
    lambda x, y: (x[0] + y[0], x[1] + y[1])).sortByKey()
mstateavgstar = mstatestar.map(
    lambda x: (x[0], format(float(x[1][0]) / x[1][1]))).sortBy(
    lambda x: x[1], ascending=False).persist(
    StorageLevel(True, True, False, False))

# 1-collect
begin_time_1 = time.time()
m1 = mstateavgstar.collect()
for i in range(5):
    print(m1[i])
end_time_1 = time.time()

# 2-take
begin_time_2 = time.time()
m2 = mstateavgstar.take(5)
print(m2)
    sim = float(len(inter) / len(un))
    return (pairs[0], pairs[1], sim)


if __name__ == "__main__":
    time1 = time.time()
    conf = SparkConf().setAppName('inf553_hw3_1').setMaster('local[*]')
    sc = SparkContext(conf=conf)  # initiate
    sc.setLogLevel("OFF")
    input_file = sc.textFile(sys.argv[1])  # readfile
    output_file = sys.argv[2]
    data = input_file.map(lambda x: x.split(',')).filter(
        lambda x: x[0] != "user_id").persist(
        StorageLevel(True, True, False, False))
    users = data.map(lambda a: a[0]).distinct().collect()
    nrows = len(users)
    users_dict = {}
    for u in range(0, nrows):
        users_dict[users[u]] = u
    characteristic_matrix = data.map(
        lambda x: (x[1], [users_dict[x[0]]])).reduceByKey(
        lambda x, y: x + y).persist(StorageLevel(True, True, False, False))
    d_characteristic_matrix = {}
    cm = characteristic_matrix.map(lambda x: (x[0], set(x[1]))).collect()
    for i in cm:
        d_characteristic_matrix[i[0]] = i[1]
import os
import sys
import re
import boto3
from datetime import datetime
from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.storagelevel import StorageLevel

MEMORY_AND_DISK = StorageLevel(True, True, False, False)


def splitter(array, n=50):
    assert n > 0
    i = 0
    result = []
    for e in array:
        if i < n:
            result.append(e)
            i += 1
        if i >= n:
            yield result
            i = 0
            del result[:]
    if len(result):
        yield result


def gen_partition_statement(partition_tuples, target_root, run_id=None):
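A small usage sketch for the `splitter` generator above (values are illustrative); the generator reuses the same list object between yields, so copy each chunk if it needs to be kept:

# Illustrative usage: chunk a range into groups of 4, copying each chunk.
chunks = [list(chunk) for chunk in splitter(range(10), n=4)]
print(chunks)  # expected: [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]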
from pyspark.context import SparkContext
from pyspark.storagelevel import StorageLevel
import json
import sys

sc = SparkContext('local[*]', 'inf553_hw1_1')  # initiate
input_file = sc.textFile(sys.argv[1])  # readfile
data = input_file.map(lambda x: json.loads(x)).map(
    lambda x: (x['review_id'], (x['user_id'], x['business_id'], x['useful'],
                                x['stars'], len(x['text'])))).persist(
    StorageLevel(True, True, False, False))  # deal with json
museful = data.filter(lambda x: x[1][2] > 0).count()
mfivestar = data.filter(lambda x: x[1][3] == 5.0).count()
mlongestreview = data.map(lambda x: (x[1][4], 1)).top(1)
muser = data.map(lambda x: (x[1][0], 1)).reduceByKey(
    lambda x, y: x + y).sortByKey().persist(
    StorageLevel(True, True, False, False))
musernum = muser.count()
muserreview = muser.takeOrdered(20, lambda x: -x[1])
mbusiness = data.map(lambda x: ((x[1][1]), 1)).reduceByKey(
    lambda x, y: x + y).sortByKey().persist(
    StorageLevel(True, True, False, False))
def join_dict_of_rdd(rdd_dict: Dict[str, RDD]) -> RDD:
    """Join a dictionary of RDDs, but not as a traditional join:

    1. Stack all RDDs together and make a paired RDD
    2. reduceByKey, value is a list of dicts
    3. Fill all fields accordingly, and return an RDD of dicts

    Args:
        rdd_dict (Dict[str, RDD]): RDD dict

    Returns:
        RDD: Joined RDD
    """
    # create loss multiplier for inputs
    rdd_list = []
    loss_multiplier_list = []

    def _add_loss_multiplier(inp: dict, problem: str) -> dict:
        lm_name = '{}_loss_multiplier'.format(problem)
        inp[lm_name] = 1
        return inp

    for p, rdd in rdd_dict.items():
        loss_multiplier_list.append('{}_loss_multiplier'.format(p))
        rdd_list.append(
            rdd.map(lambda x, p=p: _add_loss_multiplier(x, problem=p)))

    # union rdds
    sc: SparkContext = SparkContext.getOrCreate()
    all_problem_rdd = sc.union(rdd_list)

    # make pair rdd
    def _make_pair_rdd(inp_dict: dict) -> Tuple[str, dict]:
        if 'record_id' not in inp_dict:
            raise KeyError(
                "Chaining problems with & without "
                "providing 'record_id' in inputs. Received keys: {}".format(
                    inp_dict.keys()))
        return (inp_dict['record_id'], inp_dict)

    all_problem_rdd = all_problem_rdd.map(_make_pair_rdd)

    # reduce by key, fill out dict correspondingly
    loss_multiplier_list_b = sc.broadcast(loss_multiplier_list)

    def _merge_dicts(left_dict: dict, right_dict: dict):
        left_dict.update(right_dict)
        return left_dict

    def _add_dummpy_loss_multiplier(inp: dict) -> dict:
        # set loss multiplier to inform which problem
        # is available in this record
        lml = loss_multiplier_list_b.value
        for lm in lml:
            if lm not in inp:
                inp[lm] = 0
        return inp

    # MEMORY_AND_DISK
    all_problem_rdd = all_problem_rdd.persist(
        storageLevel=StorageLevel(True, True, False, False))
    all_problem_rdd = all_problem_rdd.reduceByKey(_merge_dicts).map(
        lambda x: x[1]).map(_add_dummpy_loss_multiplier)
    return all_problem_rdd
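For orientation, a minimal hedged driver for `join_dict_of_rdd`; the problem names and records below are invented for illustration:

# Illustrative only: hypothetical problem names and records.
from pyspark import SparkContext

sc = SparkContext.getOrCreate()
rdd_dict = {
    'cls': sc.parallelize([{'record_id': 'r1', 'cls_label': 1}]),
    'ner': sc.parallelize([{'record_id': 'r1', 'ner_tags': ['O', 'B']}]),
}
joined = join_dict_of_rdd(rdd_dict)
# each merged record carries cls_loss_multiplier / ner_loss_multiplier flags
# (1 if that problem contributed to the record, 0 otherwise)
print(joined.collect())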
if __name__ == '__main__':
    global table_create
    table_create = False
    # create a local StreamingContext with * working threads
    # and a batch interval of 20 seconds
    sc = SparkContext('local[*]', 'TwitterStream')
    ssc = StreamingContext(sc, 20)
    # read data from the port
    with open('config.yaml', 'r') as stream:
        details = yaml.safe_load(stream)
    lines = ssc.socketTextStream(details['host'], details['port'],
                                 storageLevel=StorageLevel(
                                     False, True, False, False, 1))
    # split each tweet into words
    words = lines.flatMap(lambda line: line.split(' '))
    # do processing for each RDD generated in each interval
    words.foreachRDD(process_rdd)
    # start the streaming computation
    ssc.start()
    # wait for the streaming to finish
    ssc.awaitTermination()
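The `process_rdd` callback referenced above is not part of this snippet; a minimal hedged sketch of such a handler (word counting is an assumption, not necessarily the original logic):

# Hypothetical handler; the original process_rdd is not shown in this snippet.
def process_rdd(rdd):
    if rdd.isEmpty():
        return
    # count the words in this 20-second batch and print the top 10
    counts = rdd.map(lambda w: (w, 1)).reduceByKey(lambda a, b: a + b)
    for word, count in counts.takeOrdered(10, key=lambda kv: -kv[1]):
        print(word, count)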
from pyspark.sql.functions import _to_seq, _to_java_column
from pyspark.sql import SparkSession, Column
from pyspark.sql.functions import broadcast

# from marketing_mart.CRM.delivery_diner.ddls import *
# from marketing_mart.helpers import write_and_partition
# from marketing_mart.CRM.diner_last_address.ddls import *

logging.basicConfig(level=logging.INFO)
formatter = logging.Formatter(
    "%(asctime)s %(levelname)s:%(name)s: %(message)s")
root_logger = logging.getLogger("[CRM ETL Delivery Diner]")

ACTIVE_LOOK_BACK = 380
# MEMORY_AND_DISK = StorageLevel(True, True, False, False)
MEMORY_ONLY = StorageLevel(False, True, False, False)

geom_table = 'source_mysql_core.geom'
customer_table = 'source_mysql_core.customer'
postal_code_dim_table = 'integrated_core.postal_code_dim'
diner_order_agg_table = 'integrated_diner.diner_order_agg'
diner_last_address_table = 'migrated_marketing_reporting.diner_last_address'
login_user_table = 'source_mysql_core.login_user'

# self.sc.conf.set("mapreduce.fileoutputcommitter.algorithm.version", "2")
# spark.conf.set("spark.kryoserializer.buffer.mb", "300")
# spark.conf.set("spark.kryoserializer.buffer.max", "300297910")

cbsa_rest_query = """
SELECT DISTINCT c.cust_id AS restaurant_id
    , g.g AS wkt
time1 = time.time()
sc = SparkContext('local[*]', 'inf553_hw2_2')  # initiate
sc.setLogLevel("OFF")
T = int(sys.argv[1])
S = int(sys.argv[2])
input_file = sc.textFile(sys.argv[3])  # readfile
output_file = sys.argv[4]
data = input_file.map(lambda x: x.split(',')).filter(
    lambda x: x[0] != "user_id")
# create basket
baskets = data.groupByKey().map(lambda x: (x[0], list(x[1]))).filter(
    lambda x: len(x[1]) > T).persist(StorageLevel(True, True, False, False))
# baskets = baskets.coalesce(4, True).persist(StorageLevel(True, True, False, False))
N = baskets.count()

# Pass 1
# Pass 1 Map
can_freq_is = baskets.mapPartitions(apriori)
# Pass 1 Reduce
all_can_freq_is = can_freq_is.distinct().map(lambda x: alterStr(x)).sortBy(
    lambda x: x).sortBy(lambda x: len(x))
if baskets.getNumPartitions() == 1:
    results = all_can_freq_is
def persist(self, storageLevel=StorageLevel(True, True, False, False, 1)):
    raise NotImplementedError()