def main():
    pic_plot()
    # return  # early exit left over from debugging; uncomment to skip the ALS run below

    sc = SparkContext()
    data = sc.textFile('/home/z/Documents/python/EE627_HW8/re_u.data')
    pdata = sc.parallelize(data.take(100000))
    ratings = pdata.map(lambda l: l.split(',')) \
                   .map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))
    # pdb.set_trace()
    sc.setCheckpointDir('target')  # need to add this!!!
    rank = 20
    numIter = 30
    model = ALS.train(ratings, rank, numIter)
    testdata = ratings.map(lambda p: (p[0], p[1]))
    predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
    ratesAndPreds = ratings.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
    MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1]) ** 2).mean()
    print("Mean Squared Error = " + str(MSE))
def main():
    args = parseArgs()
    sc = SparkContext(args.master, appName='Alternating least squares')
    if not args.verbose:
        sc.setLogLevel("ERROR")
    sc.setCheckpointDir('checkpoint/')
    folds = readFolds(args.data, args.folds, sc)
    cross_val_rmses = []
    for k in range(len(folds)):
        train, test = createTrainTestData(folds, k, args.N)
        print "Initiating fold %d with %d train samples and %d test samples" % (
            k, train.count(), test.count())
        start = time()
        model = ALS.train(train, args.d, iterations=args.iter, lambda_=args.reg)
        testRMSE = testModel(model, test)
        now = time() - start
        print "Fold: %d\tTime: %f\tTestRMSE: %f" % (k, now, testRMSE)
        cross_val_rmses.append(testRMSE)
        train.unpersist()
        test.unpersist()
    print "%d-fold cross validation error is: %f " % (args.folds, np.mean(cross_val_rmses))
def spark_context(request):
    """ fixture for creating a spark context

    Args:
        request: pytest.FixtureRequest object
    """
    conf = SparkConf().setMaster("local[2]").setAppName("SparkTest")
    sc = SparkContext(conf=conf)
    sc.setCheckpointDir('checkpoint')  # Stackoverflow error
    request.addfinalizer(lambda: sc.stop())
    quiet_py4j()
    return sc
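# A hedged usage sketch: how a fixture like spark_context() above might be
# registered and consumed with pytest. The @pytest.fixture registration, the
# session scope, and test_even_count are illustrative assumptions, and the
# undefined quiet_py4j() helper is omitted here.
import pytest
from pyspark import SparkConf, SparkContext


@pytest.fixture(scope="session")
def spark_context(request):
    conf = SparkConf().setMaster("local[2]").setAppName("SparkTest")
    sc = SparkContext(conf=conf)
    sc.setCheckpointDir('checkpoint')
    request.addfinalizer(lambda: sc.stop())
    return sc


def test_even_count(spark_context):
    # pytest injects the fixture by name; the work runs on the local[2] context
    rdd = spark_context.parallelize(range(100))
    assert rdd.filter(lambda x: x % 2 == 0).count() == 50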
def start():
    sc = SparkContext(appName='NetworkWordCount')
    # one time unit is 1 (second)
    sc.setCheckpointDir('/tmp/spark')
    ssc = StreamingContext(sc, TIME_UNIT)
    lines = ssc.socketTextStream("10.5.24.137", 9999)
    words = lines.flatMap(lambda line: line.split(" "))
    pairs = words.map(lambda word: (word, 1))
    # window length: 3*TIME_UNIT, slide interval: 2*TIME_UNIT
    wordCounts = pairs.reduceByKeyAndWindow(lambda x, y: x + y, 3*TIME_UNIT, 2*TIME_UNIT)
    print wordCounts
    wordCounts.pprint()
    ssc.start()             # Start the computation
    ssc.awaitTermination()  # Wait for the computation to terminate
def setup_context():
    global SPARK_CONTEXT
    global SQL_CONTEXT
    config = config_pyspark_submit_args()
    SPARK_CONTEXT = SparkContext(conf=config)
    SQL_CONTEXT = SQLContext(SPARK_CONTEXT)
    logging.getLogger('py4j').setLevel(logging.ERROR)
    SPARK_CONTEXT.setLogLevel("ERROR")
    SPARK_CONTEXT.setCheckpointDir(f"{DATA_PATH}/checkpoint/")
def start():
    sc = SparkContext(appName='NetworkWordCount')
    ssc = StreamingContext(sc, 1)
    # a checkpoint directory must be set for stateful operations
    sc.setCheckpointDir('/tmp')
    lines = ssc.socketTextStream("10.5.24.137", 9999)
    words = lines.flatMap(lambda line: line.split(" "))
    pairs = words.map(lambda word: (word, 1))

    def update_count(new_value, total_value):
        return sum(new_value, total_value or 0)

    # use updateStateByKey to keep a stateful count for each input key
    # (each key's count keeps accumulating across batches)
    total_count = pairs.updateStateByKey(updateFunc=update_count)
    total_count.pprint()
    ssc.start()             # Start the computation
    ssc.awaitTermination()  # Wait for the computation to terminate
def main():
    # Configure Spark
    if not os.path.isdir("checkpoints"):
        os.mkdir("checkpoints")
    conf = SparkConf().setMaster('local').setAppName('connected components')
    sc = SparkContext(conf=conf)
    sqlcontext = SQLContext(sc)
    sc.setCheckpointDir("checkpoints")  # The directory for the checkpoint files

    filename = "q1.txt"

    # Get data in proper format
    data = getData(sc, filename)
    edges = get_edges(data, sqlcontext)
    vertices = get_vertices(data, sqlcontext)
    graph = GraphFrame(vertices, edges)
    connected_components(graph=graph)
def run_tree_join(ACCESS_KEY, SECRET_KEY, REDIS_SERVER, REDIS_PORT, CHECKPOINT_REMOTE_DIR): sc = SparkContext(appName='TreeJoin') sc.setCheckpointDir(CHECKPOINT_REMOTE_DIR) ss = SparkSession(sc).builder.getOrCreate() # ss.conf.set("spark.sql.shuffle.partitions", 4) # file_download_path = 's3a://heyyall/test_folder' # file_download_path = 's3a://heyyall/big_test' # file_download_path = 'RC_2011-01_my_slice_2' file_download_path = 's3a://heyyall/reddit_data/RC_2011-01' reddit_schema = StructType([ StructField('archived', BooleanType()), StructField('author', StringType()), StructField('author_flair_css_class', StringType()), StructField('author_flair_text', StringType()), StructField('body', StringType()), StructField('controversiality', IntegerType()), StructField('created_utc', StringType()), StructField('distinguished', StringType()), StructField('downs', IntegerType()), StructField('edited', StringType()), StructField('gilded', IntegerType()), StructField('id', StringType()), StructField('link_id', StringType()), StructField('name', StringType()), StructField('parent_id', StringType()), StructField('retrieved_on', LongType()), StructField('score', IntegerType()), StructField('score_hidden', BooleanType()), StructField('subreddit', StringType()), StructField('subreddit_id', StringType()), StructField('ups', IntegerType()) ]) clean_data = get_clean_data(ss, file_download_path, reddit_schema) joined_links = link_join(clean_data) joined_links = joined_links.repartition('match_group') write_to_redis(joined_links, REDIS_SERVER, REDIS_PORT)
def process(name): CLOUDSQL_INSTANCE_IP = '' #(database server IP) CLOUDSQL_DB_NAME = 'recommendation_spark' CLOUDSQL_USER = '******' CLOUDSQL_PWD = 'tiger' # CE conf = SparkConf().setAppName("train_model") sc = SparkContext(conf=conf) sqlContext = SQLContext(sc) USER_ID=name jdbcDriver = 'com.mysql.jdbc.Driver' jdbcUrl = 'jdbc:mysql://%s:3306/%s?user=%s&password=%s' % (CLOUDSQL_INSTANCE_IP, CLOUDSQL_DB_NAME, CLOUDSQL_USER, CLOUDSQL_PWD) # checkpointing helps prevent stack overflow errors sc.setCheckpointDir('checkpoint/') # Read the ratings and accommodations data from Cloud SQL dfRates = sqlContext.read.format('jdbc').options(driver=jdbcDriver, url=jdbcUrl, dbtable='Rating', useSSL='false').load() dfAccos = sqlContext.read.format('jdbc').options(driver=jdbcDriver, url=jdbcUrl, dbtable='Accommodation', useSSL='false').load() print("read ...") # train the model model = ALS.train(dfRates.rdd, 20, 20) # tuning number print("trained ...") # use this model to predict what the user would rate accommodations that she has not rated allPredictions = None dfUserRatings = dfRates.filter(dfRates.userId == USER_ID).rdd.map(lambda r: r.accoId).collect() rddPotential = dfAccos.rdd.filter(lambda x: x[0] not in dfUserRatings) pairsPotential = rddPotential.map(lambda x: (USER_ID, x[0])) predictions = model.predictAll(pairsPotential).map(lambda p: (str(p[0]), str(p[1]), float(p[2]))) predictions = predictions.takeOrdered(5, key=lambda x: -x[2]) # top 5 print("predicted for user={0}".format(USER_ID)) if (allPredictions == None): allPredictions = predictions else: allPredictions.extend(predictions) # write them schema = StructType([StructField("userId", StringType(), True), StructField("accoId", StringType(), True), StructField("prediction", FloatType(), True)]) dfToSave = sqlContext.createDataFrame(allPredictions, schema) dfToSave.write.jdbc(url=jdbcUrl, table='Recommendation', mode='overwrite')
def main():
    args = parseArgs()
    sc = SparkContext(args.master, appName='Alternating least squares')
    sess = SparkSession(sc)
    if not args.verbose:
        sc.setLogLevel("ERROR")
    sc.setCheckpointDir('checkpoint/')
    folds = readFolds(args.data, args.folds, sc, sess)
    cross_val_rmses = []
    for k in range(len(folds)):
        train, test = createTrainTestData(folds, k, args.N)
        print "Initiating fold %d with %d train samples and %d test samples" % (
            k, train.count(), test.count())
        start = time()
        als = ALS(maxIter=args.iter, regParam=args.reg, userCol="userId",
                  itemCol="itemId", ratingCol="rating", coldStartStrategy='drop')
        model = als.fit(train)
        predictions = model.transform(test)
        predictions.show()
        evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                        predictionCol="prediction")
        testRMSE = evaluator.evaluate(predictions)
        now = time() - start
        print "Fold: %d\tTime: %f\tTestRMSE: %f" % (k, now, testRMSE)
        cross_val_rmses.append(testRMSE)
        train.unpersist()
        test.unpersist()
    print "%d-fold cross validation error is: %f " % (args.folds, np.mean(cross_val_rmses))
def main(): sc = SparkContext() dataDir = '/home/z/Documents/python/EE627_project/data/data_in_matrixForm/' matrix_in_name = dataDir + 'user_track.txt' test_name = dataDir + 'testTrack_hierarchy.txt' t = strftime('%Y%m%d%H%M', gmtime()) title = 'mf_track_estimated' + t + '.txt' output_file = dataDir + title f_out = open(output_file, 'w') data = sc.textFile(matrix_in_name) ratings = data.map(lambda l: l.split('|'))\ .map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2]))) sc.setCheckpointDir('target') # need to add this!!! rank = 150 numIter = 30 model = ALS.train(ratings, rank, numIter) # Save and load model model.save(sc, "target/tmp/album_rank150num30")
def get_most_liked_courses(n):
    """Get the top n courses with the most likes and ratings"""
    input = {
        'sort_mode': 'interesting',
        'count': n,
    }
    return m.Course.search(params=input)[0]


def save_recommendations_to_mongo():
    log.info('Saving recommendations to database...')
    for user in m.User.objects:
        try:
            user.recommended_courses = engine.recommend_user(
                str(user.id), _PARAMS['num_courses'])
            user.save()
        except Exception as e:
            log.error(e)


if __name__ == '__main__':
    mongoengine.connect(c.MONGO_DB_RMC)
    sc = SparkContext()
    sc.setCheckpointDir('data/recommendation/checkpoint/')
    engine = RecommendationEngine(sc)
    engine.train()
    engine.load_data()
    save_recommendations_to_mongo()
import sys from pyspark import SparkConf, SparkContext from pyspark.mllib.recommendation import ALS, Rating def loadMovieNames(): movieNames = {} with open("ml-100k/u.ITEM") as f: for line in f: fields = line.split('|') movieNames[int(fields[0])] = fields[1].decode('ascii', 'ignore') return movieNames conf = SparkConf().setMaster("local[*]").setAppName("MovieRecommendationsALS") sc = SparkContext(conf = conf) sc.setCheckpointDir('checkpoint') print "\nLoading movie names..." nameDict = loadMovieNames() #umadeup: data create on top of u.data (3 rows) data = sc.textFile("C:/Users/seeth_000/UdemySpark/ml-100k/umadeup.data") ratings = data.map(lambda l: l.split()).map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2]))).cache() # Build the recommendation model using Alternating Least Squares print "\nTraining recommendation model..." rank = 10 # Lowered numIterations to ensure it works on lower-end systems numIterations = 6 model = ALS.train(ratings, rank, numIterations) userID = int(sys.argv[1])
from pyspark import SQLContext, SparkContext
from graphframes.examples import Graphs
from graphframes import *
import random
import sys

sc = SparkContext()
sqlContext = SQLContext(sc)
sc.setCheckpointDir("/home/shaanzie/sparkchecks/")

v = sqlContext.createDataFrame([("a", "Alice", 34),
                                ("b", "Bob", 36),
                                ("c", "Charlie", 30),
                                ("d", "David", 29),
                                ("e", "Esther", 32),
                                ("f", "Fanny", 36),
                                ("g", "Gabby", 60)], ["id", "name", "age"])
# Edge DataFrame
e = sqlContext.createDataFrame([("a", "b", "friend"),
                                ("b", "c", "follow"),
                                ("c", "b", "follow"),
                                ("f", "c", "follow"),
                                ("e", "f", "follow"),
                                ("e", "d", "friend"),
                                ("d", "a", "friend"),
                                ("a", "e", "friend")], ["src", "dst", "relationship"])
# Create a GraphFrame
g = GraphFrame(v, e)
result = g.connectedComponents()
result.select("id", "component").orderBy("component").show()
SparkContext.setSystemProperty('spark.executor.memory', '2560m')  # 2560m
SparkContext.setSystemProperty('spark.executor.cores', '8')
# SparkContext.setSystemProperty('spark.executor.memoryOverhead', '1536m')
SparkContext.setSystemProperty("spark.scheduler.mode", "FAIR")
SparkContext.setSystemProperty('spark.memory.fraction', '0.8')
SparkContext.setSystemProperty('spark.memory.storageFraction', '0.1')
SparkContext.setSystemProperty("spark.default.parallelism", "256")
SparkContext.setSystemProperty("spark.num.executors", "1")
SparkContext.setSystemProperty("spark.local.dir", "/tmp")

conf = SparkConf().setAppName('MoviesRec: Predictions')
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
sc.setCheckpointDir('/ML/movies/checkpoint/')

df = sqlContext.read.load(path='/ML/movies/data/*',
                          format='com.databricks.spark.csv',
                          delimiter=',',
                          inferSchema='true',
                          header="true").cache()
df = df.drop('timestamp')

oldColumns = df.schema.names
newColumns = ["userId", "itemId", "rating"]
df = reduce(
    lambda df, idx: df.withColumnRenamed(oldColumns[idx], newColumns[idx]),
    range(len(oldColumns)), df)
df = df.withColumn("userId", df["userId"].cast("string"))
import time import os from pyspark import SparkContext from pyspark.streaming import StreamingContext from utils import * sc = SparkContext('local', 'test') sc.setLogLevel("ERROR") sc.setCheckpointDir("/tmp") # for stable state ssc = StreamingContext(sc, 0.01) rddQ = [] for filename in os.listdir("data/split"): rddQ.append(sc.textFile("data/split/" + filename)) # rddQ.append(sc.textFile("data/split/aa")) result = [] def update_result(rdd): global result result = rdd.top(10) # processing dstream = ssc.queueStream(rddQ) dstream = sclean(dstream) dstream = scount(dstream) dstream\ .map(lambda x: (x[1],x[0]))\ .foreachRDD(lambda rdd: update_result(rdd))
def load_spark_context():
    conf = SparkConf().setMaster("local[*]").setAppName(
        "MovieRecommendationsALS")
    sc = SparkContext(conf=conf)
    sc.setCheckpointDir('checkpoint')
    return sc
'''
This exercise demonstrates the effect that checkpointing can have on an
iterative routine.
'''
import sys
from pyspark import SparkConf, SparkContext

sc = SparkContext()
sc.setCheckpointDir("file:///Users/isang-geon/tmp/checkpointdir")
rddofints = sc.parallelize([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
try:
    # This will build a very long lineage for rddofints.
    for i in range(1000):
        rddofints = rddofints.map(lambda x: x + 1)
        if i % 10 == 0:
            print("Looped " + str(i) + " times")
            rddofints.checkpoint()
            rddofints.count()
except Exception as e:
    print("Exception: " + str(e))
    print("RDD Debug String: ")
    print(rddofints.toDebugString())
    sys.exit()

print("RDD Debug String: ")
print(rddofints.toDebugString())
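# For comparison, a minimal sketch of the same lineage-truncation idea with the
# DataFrame API, where DataFrame.checkpoint() materializes the data and cuts the
# logical plan. The local checkpoint path, the column name "v", and the loop
# bounds are illustrative assumptions only.
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[2]").appName("df-checkpoint").getOrCreate()
spark.sparkContext.setCheckpointDir("file:///tmp/checkpointdir")

df = spark.range(10).toDF("v")
for i in range(100):
    df = df.selectExpr("v + 1 as v")   # each pass grows the logical plan
    if i % 10 == 0:
        df = df.checkpoint()           # materialize and truncate the plan/lineage
print(df.count())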
inters_dst_dst = DataFrame.intersect(df1.select("dst"), df2.select("dst")) inters_dst_dst_list = inters_dst_dst.select("dst").rdd.map(lambda row : row.dst).collect() query_list = [] query_list += split_query(inters_dst_dst_list, 'src', 'or', 'dst') query_list += split_query(inters_dst_dst_list, 'dst', 'or', 'dst') inters_e = reduce(DataFrame.unionAll, [e.filter(query) for query in query_list]) inters_v = inters_dst_dst.select(col("dst").alias("id")) return inters_v, inters_e sc = SparkContext() sc.setCheckpointDir("/user/pnda/checkpoint") sqlContext = sql.SQLContext(sc) #load vertices v = sqlContext.read.parquet("/user/pnda/result/vertex") #load edges e = sqlContext.read.parquet("/user/pnda/result/edge") eg = e.groupBy("src").count().sort("count", ascending=[0,1]).head(2) user_1 = eg[0].src user_2 = eg[1].src df1 = get_n_level_connection(user_1, 3, True) df2 = get_n_level_connection(user_2, 3, True)
# In[15]:

import math
import pyspark.sql.functions as psf
from pyspark.sql.types import DoubleType

dot_udf = psf.udf(lambda x, y: float(x.dot(y)), DoubleType())
s = data.alias("i").join(data.alias("j"), psf.col("i.id") < psf.col("j.id"))\
    .select(
        psf.col("i.id").alias("src"),
        psf.col("j.id").alias("dst"),
        dot_udf("i.norm", "j.norm").alias("relationship"))\
    .sort("src", "dst")


# In[ ]:

# run in the spark shell
v = featurized_data.select("id", "features")
e = s.filter("relationship > 0.8")

from graphframes import *
g = GraphFrame(v, e)
g.vertices.show()
g.edges.show()

results = g.pageRank(resetProbability=0.15, maxIter=10)  # pageRank
results.vertices.select("id", "pagerank").show()
results.edges.select("src", "dst", "weight").show()

results = g.triangleCount()  # triangleCount
results.select("id", "count")

sc.setCheckpointDir("_checkpoint")
results = g.connectedComponents()  # connectedComponents
results.show()
location = "hdfs" try: if "avatar" in platform.node(): location = "local" except: pass try: if "avatar" in socket.gethostname(): location = "local" except: pass print "### location %s" % location sc = SparkContext(appName="CRF") sc.setCheckpointDir("/tmp") year = 2015 mode = sys.argv[1] tag = sys.argv[2] month = int(sys.argv[3]) day = int(sys.argv[4]) hour = int(sys.argv[5]) partNum = None try: partNum = int(sys.argv[6]) except: pass limit = None try: limit = int(sys.argv[7]) except:
from pyspark.sql.types import * import pendulum import pandas import requests conf = SparkConf().setAppName("PySparkStreaming") \ .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")\ .set("spark.default.parallelism", 6) \ .set("spark.speculation", "true") \ .set("spark.speculation.interval", "1s") \ .set("spark.streaming.kafka.maxRatePerPartition", 300) \ .set("spark.sql.autoBroadcastJoinThreshold", -1) sc = SparkContext(conf=conf) sc.setLogLevel("WARN") sc.setCheckpointDir("/tmp/spark-streaming") spark = SparkSession \ .builder \ .appName("sparkSQL_car_gps") \ .getOrCreate() spark.sql("SET spark.sql.shuffle.partitions=6") grid_schema = StructType([ StructField("grid", IntegerType(), False), StructField("block", IntegerType(), False) ]) def grid_block(lng_x, lat_y): if not (121.34 <= lng_x < 121.68 and 24.92 <= lat_y < 25.2):
sys.path.append(local_path + "/../lib") sys.path.append(local_path + "/../") sys.path.append(local_path) from pyspark import SQLContext, SparkConf, HiveContext from pyspark import SparkContext from ml import diff_feature_cls, diff_train_cls_pos def run(sc, sql_context, is_hive): diff_feature_cls.main(sc, sql_context, is_hive=True) diff_train_cls_pos.main(sc, sql_context, is_hive=True) if __name__ == "__main__": conf = SparkConf() conf.set("spark.executor.instances", "4") conf.set("spark.executor.cores", "4") conf.set("spark.executor.memory", "32g") sc = SparkContext(appName="bintrade_candidate", master="yarn-client", conf=conf) sc.setCheckpointDir("checkpoint/") sqlContext = HiveContext(sc) sqlContext.setConf("spark.sql.shuffle.partitions", "32") sqlContext.sql("use fex") run(sc, sqlContext, is_hive=True)
testRDD = sc.textFile(testFile) header = inputRDD.first() #extract header inputRDD = inputRDD.filter(lambda row: row != header) header2 = testRDD.first() #extract header testRDD = testRDD.filter(lambda row: row != header2) inputRDD = inputRDD.map(lambda line: line.split(',')).map( lambda x: ((int(x[0]), int(x[1])), float(x[2]))) testRDD = testRDD.map(lambda line: line.split(',')).map( lambda x: ((int(x[0]), int(x[1])), 1)) input1 = inputRDD.subtractByKey(testRDD) input = input1.map(lambda x: Rating(x[0][0], x[0][1], x[1])) sc.setCheckpointDir('/tmp') rank = 8 numIterations = 10 lmbda = 0.1 numBlocks = 16 nonnegative = True model = ALS.train(input, rank, numIterations, lmbda, nonnegative=True, seed=42) testRDD = testRDD.map(lambda x: (x[0][0], x[0][1])).distinct() predictions = model.predictAll(testRDD).map(
def computeEntityTSData(): ##### Spark Functions # input: json_line # [(entity: {date, count})] def deriveEntityToDate(json_line): entity_dates = [] json_obj = json.loads(json_line) created_string = float(json_obj['created_utc']) created_date = datetime.fromtimestamp(created_string).date() annotations = json_obj['entity_texts'] for annotation in annotations: date_count = {} date_count[created_date] = 1 entity_dates.append((annotation, date_count)) return entity_dates def combineDateCounts(date_counts1, date_counts2): date_counts = date_counts1 for date in date_counts2: if date in date_counts: date_counts[date] += date_counts2[date] else: date_counts[date] = date_counts2[date] return date_counts ###### Execution code conf = SparkConf().setAppName("NER Diffusion - Exploratory Plots") conf.set("spark.python.worker.memory","10g") conf.set("spark.driver.memory","15g") conf.set("spark.executor.memory","10g") conf.set("spark.default.parallelism", "12") conf.set("spark.mesos.coarse", "true") conf.set("spark.driver.maxResultSize", "10g") # Added the core limit to avoid resource allocation overruns conf.set("spark.cores.max", "5") conf.setMaster("mesos://zk://scc-culture-slave9.lancs.ac.uk:2181/mesos") conf.set("spark.executor.uri", "hdfs://scc-culture-mind.lancs.ac.uk/lib/spark-1.3.0-bin-hadoop2.4.tgz") conf.set("spark.broadcast.factory", "org.apache.spark.broadcast.TorrentBroadcastFactory") sc = SparkContext(conf=conf) sc.setCheckpointDir("hdfs://scc-culture-mind.lancs.ac.uk/data/checkpointing") # use sample directory for testing # distFile = sc.textFile("hdfs://scc-culture-mind.lancs.ac.uk/user/derczynskil/RC_2015-01") distFile = sc.textFile("hdfs://scc-culture-mind.lancs.ac.uk/reddit/annotated") # Point to local file until data has finished uploading to HDFS # distFile = sc.textFile("/home/derczynskil/annotated/") distFile.cache() # Step 1: Derive the time-sensitive map of when entities appeared print("----Loading entity time-series") entity_citation_dates = distFile\ .flatMap(deriveEntityToDate)\ .reduceByKey(combineDateCounts) entity_citation_dates.cache() # print(entity_citation_dates.collect()) print("----Deriving the count of entity citations") entity_citation_counts = entity_citation_dates\ .map(lambda x: (x[0], len(x[1])))\ .map(lambda x: (x[1], x[0]))\ .sortByKey(False)\ .map(lambda x: (x[1], x[0]))\ .collect() # Write to local disk print("------Writing the output to a file") outputString = "" for (entity, count) in entity_citation_counts: outputString += str(entity.encode('utf-8')).replace("'", "") + "\t" + str(count) + "\n" # print(outputString) outputFile = open("data/entity_mention_frequencies.csv", "w") outputFile.write(outputString) outputFile.close() # Write the time-series output to local disk print("------Writing the ts output to a file") outputString = "" for (entity, date_to_count) in entity_citation_dates.collect(): outputString += str(entity.encode('utf-8')).replace("'", "") for date in date_to_count: outputString += "\t" + str(date) + "|" + str(date_to_count[date]) outputString += "\n" # print(outputString) outputFile = open("data/entity_mention_ts.csv", "w") outputFile.write(outputString) outputFile.close() # stop the Spark context from running sc.stop()
from pyspark.sql import SQLContext from pyspark.ml import Pipeline from pyspark.ml.regression import GBTRegressor from utils import * from pyspark import SparkContext, SparkConf file_path = "./data.csv" checkpoint_dir = "./CheckpointDir/" conf = SparkConf().setAppName("Car Price Prediction").setMaster("local[*]") sc = SparkContext(conf=conf) print(sc.getConf().getAll()) sc.setCheckpointDir(checkpoint_dir) spark = SQLContext(sc) data = spark.read.csv(path=file_path, header=True, quote='"', sep=",", inferSchema=True) data_test, data_train = data.randomSplit(weights=[0.3, 0.7], seed=10) get_indexer_input = get_indexer_input(data) def model_training(data_train, indexer_input): x_cols = list( set(data_train.columns) - set(indexer_input.keys() + ["Price"])) str_ind_cols = ['indexed_' + column for column in indexer_input.keys()] indexers = indexer_input.values() pipeline_tr = Pipeline(stages=indexers) data_tr = pipeline_tr.fit(data_train).transform(data_train)
def computeGlobalCascadeIsomorphicDistribution(): ###### Spark Tranformation Functions # Returns: [(reply_id, orig_post_id)] - for flatMap - to avoid issue of 0 cardinality of the list def deriveReplyMap(json_line): reply_tuples = [] json_obj = json.loads(json_line) if 'parent_id' in json_obj: orig_post_id = json_obj['parent_id'] reply_post_id = json_obj['name'] reply_tuples.append((reply_post_id, orig_post_id)) return reply_tuples def combineReplies(replies1, replies2): replies = replies1 + replies2 return replies def deriveEntityToPosts(json_line): # Get the broadcast maps that need to be used # orig_replies_map = orig_replies_map_broadcast.value reply_orig_map = reply_orig_map_broadcast.value orig_replies_map = orig_replies_map_broadcast.value entity_posts = [] json_obj = json.loads(json_line) post_id = json_obj['name'] entities = json_obj['entity_texts'] # get the replies to this post for entity in entities: try: # ensure that the post appears in a chain - in order to filter out singleton citations if post_id in orig_replies_map or post_id in reply_orig_map: # if len(orig_replies_map.lookup(post_id)) > 0 \ # or len(reply_orig_map.lookup(post_id)) > 0: # New code to use rdd lookup to save RDD partitioning entity_posts.append((str(entity), [str(post_id)])) except: pass return entity_posts ###### Graph Isomorphism Functions # Input: (entity,[chains]) - tuple # output: (entity, [connected_chain]) - tuple def induce_type1_cascades(tuple): entity = tuple[0] chains = tuple[1] # entity_posts = tuple[1]['posts'] # Get the entity posts entity_citation_posts = entity_posts_map_broadcast.value entity_posts = entity_citation_posts[entity] # log the connected chains of the entity connected_chains = [] for chain in chains: # new chain formed by filtering the chain with only entity citing posts # Filter the chain down to include only posts citing the entity new_chain = set() for chain_edge in chain: source = chain_edge.split("->")[0] target = chain_edge.split("->")[1] if source in entity_posts and target in entity_posts: new_chain.add(chain_edge) # too inefficient, better to convert to matrix form and then run this the graph # 1. Induce maps between node label and ids node_to_index = {} index_to_node = {} index = -1 for chain_edge in new_chain: source = chain_edge.split("->")[0] target = chain_edge.split("->")[1] if source not in node_to_index: index += 1 node_to_index[source] = index index_to_node[index] = source if target not in node_to_index: index += 1 node_to_index[target] = index index_to_node[index] = target # 2. Populate the nd matrix dim = len(node_to_index) if dim > 1: M = np.zeros(shape=(dim, dim)) for chain_edge in new_chain: source = chain_edge.split("->")[0] source_index = node_to_index[source] target = chain_edge.split("->")[1] target_index = node_to_index[target] M[source_index, target_index] = 1 # 3. 
Induce the connected components from the matrix # print(str(dim)) # print(str(M)) Msp = csgraph.csgraph_from_dense(M, null_value=0) n_components, labels = csgraph.connected_components(Msp, directed=True) # print("Number of connected components = " + str(n_components)) # print("Components labels = " + str(labels)) # get the components and their chains for i in range(0, n_components): # print("Component: " + str(i)) component_chain = [] # get the nodes in that component # print(labels) c_nodes = [j for j in range(len(labels)) if labels.item(j) is i] # print(c_nodes) # Only log the component if more than two nodes are in in if len(c_nodes) > 1: # build the canonical edges for source_id in c_nodes: for target_id in c_nodes: if int(M[(source_id, target_id)]) is 1: component_chain.append(str(source_id) + "->" + str(target_id)) if len(component_chain) > 0: connected_chains.append(component_chain) canonical_chains = connected_chains print("Canonical Chains:") print(canonical_chains) # return back to the function the mapping between the entity and the connected chains return (entity, canonical_chains) # Input: (entity, [post]) # Output: [(entity, {cascades: [cascade]})] def computePerEntityCascadesAndPosts(tuple): entity = tuple[0] posts = tuple[1] # Get the broadcast maps that need to be used orig_replies_map = orig_replies_map_broadcast.value reply_orig_map = reply_orig_map_broadcast.value # Get the cascade graphs for each entity entity_chains = [] chain_posts = set() # get the chain that each post was in for post in posts: # Do this to speed up computation by ensuring that we haven't already recorded the post in another chain if post not in chain_posts: chain_posts.add(post) chain = [] to_process = [] # Starting from the seed post in a possible chain # get the replies to the post - down the chain if post in orig_replies_map: replies = orig_replies_map[post] to_process += replies for reply in replies: chain.append(reply + "->" + post) # Get the post that the post replied to if post in reply_orig_map: orig_post = reply_orig_map[post] to_process += orig_post chain.append(post + "->" + orig_post) # Go through each post that is to be processed while len(to_process) > 0: to_process_post = to_process.pop() # log that the post has been processed in an already found chain chain_posts.add(to_process_post) # get the replies to this post if to_process_post in orig_replies_map: replies = orig_replies_map[to_process_post] to_process += replies for reply in replies: chain.append(reply + "->" + to_process_post) # get the post that this post relied to if to_process_post in reply_orig_map: orig_post = reply_orig_map[to_process_post] to_process += orig_post chain.append(to_process_post + "->" + orig_post) # log the chain for the entity entity_chains.append(chain) # Return the entity chains return (entity, entity_chains) ###### Execution code conf = SparkConf().setAppName("NER Diffusion - Cascade Pattern Mining") conf.set("spark.python.worker.memory","10g") conf.set("spark.driver.memory","15g") conf.set("spark.executor.memory","10g") conf.set("spark.default.parallelism", "12") conf.set("spark.mesos.coarse", "true") conf.set("spark.driver.maxResultSize", "10g") conf.set("spark.cores.max", "15") conf.setMaster("mesos://zk://scc-culture-slave9.lancs.ac.uk:2181/mesos") conf.set("spark.executor.uri", "hdfs://scc-culture-mind.lancs.ac.uk/lib/spark-1.3.0-bin-hadoop2.4.tgz") conf.set("spark.broadcast.factory", "org.apache.spark.broadcast.TorrentBroadcastFactory") sc = SparkContext(conf=conf) 
sc.setCheckpointDir("hdfs://scc-culture-mind.lancs.ac.uk/data/checkpointing") # use sample directory for testing annotationFile = sc.textFile("hdfs://scc-culture-mind.lancs.ac.uk/reddit/annotated-sample") # annotationFile = sc.textFile("hdfs://scc-culture-mind.lancs.ac.uk/reddit/annotated") annotationFile.cache() # thinnedFile = sc.textFile("hdfs://scc-culture-mind.lancs.ac.uk/reddit/thinned-json") thinnedFile = sc.textFile("hdfs://scc-culture-mind.lancs.ac.uk/reddit/thinned-json-sample") thinnedFile.cache() # Load the reply graphs from the thinnedFile print("Loading replies map") reply_map_rdd = thinnedFile\ .flatMap(deriveReplyMap) # Collect as a map and broadcast this to the cluster reply_orig_map = reply_map_rdd\ .collectAsMap() print("Reply Orig Map Size = " + str(len(reply_orig_map))) # print(reply_orig_map) reply_orig_map_broadcast = sc.broadcast(reply_orig_map) # # get the: {orig, [reply]} dictionary orig_replies_map = reply_map_rdd\ .map(lambda x: (x[1], [x[0]]))\ .reduceByKey(combineReplies)\ .collectAsMap() print("Orig Replies Map Size = " + str(len(orig_replies_map))) orig_replies_map_broadcast = sc.broadcast(orig_replies_map) # Load the entity to post map # input: json_line of annotations of each post # output: [(entity, [post])] of posts where entities appeared print("Loading entity to posts rdd") entity_posts_rdd = annotationFile\ .flatMap(deriveEntityToPosts)\ .reduceByKey(lambda p1, p2: p1 + p2) entity_posts_rdd_map = entity_posts_rdd.collectAsMap() print("Entity to posts map size = " + str(len(entity_posts_rdd_map))) entity_posts_map_broadcast = sc.broadcast(entity_posts_rdd_map) # print("Entity to posts Map Size = " + str(entity_posts_rdd.count())) # entity_posts_rdd_sample = sc.parallelize(entity_posts_rdd.take(100)) # # print(entity_posts_rdd_map) # print("Computing entity cascades") # Output: (entity, {"cascades": entity_chains, "posts": posts}) entity_cascades_rdd = entity_posts_rdd\ .map(computePerEntityCascadesAndPosts) # # # .reduceByKey(lambda chain1, chain2: chain1 + chain2) entity_cascades_rdd_map = entity_cascades_rdd.collect() print("Entity cascades Map Size = " + str(len(entity_cascades_rdd_map))) # print("Inducing distribution of cascade shapes - Global") # Induce global distributiion # Returns: [(entity, [chain])] canonical_cascade_patterns = entity_cascades_rdd\ .map(induce_type1_cascades) # # canonical_cascade_patterns_distribution_map = canonical_cascade_patterns\ # # .take(10) # # print("Entity connected cacades Map Size = " + str(len(canonical_cascade_patterns_distribution_map))) # # print(canonical_cascade_patterns_distribution_map) print("Top Patterns:") canonical_cascade_patterns_distribution = canonical_cascade_patterns\ .flatMap(lambda x: x[1] if len(x) is 2 else ["null"])\ .map(lambda x: (str(x), 1))\ .filter(lambda x: "null" not in x[0])\ .reduceByKey(lambda count1, count2: count1 + count2)\ .map(lambda x: (x[1], x[0]))\ .sortByKey(False)\ .map(lambda x: (x[1], x[0])) top_patterns = canonical_cascade_patterns_distribution.collect() # print(top_patterns) # # Write the patterns to local disk for local isomorphism computation outputString = "" for (pattern, freq) in top_patterns: outputString += str(pattern) + "\t" + str(freq) + "\n" outputFile = open("data/cascade_shapes.tsv", "w") outputFile.write(outputString) outputFile.close() # stop the Spark context from running sc.stop()
import sys import json from operator import add from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating from pyspark import SQLContext, SparkContext, SparkConf reload(sys) sys.setdefaultencoding('utf-8') conf = SparkConf().setAppName("chencheng's task").setMaster("spark://anti-spam-spark-001.yz.momo.com:8081,anti-spam-spark-002.yz.momo.com:8081") sc = SparkContext(conf=conf) sc.setCheckpointDir("hdfs://antispam/user/hadoop/output/chencheng/checkpoint") user_artist_data1 = sc.textFile("hdfs://antispam/user/hadoop/output/chencheng/crux/data/female/2016031[0-9]18/") user_artist_data2 = sc.textFile("hdfs://antispam/user/hadoop/output/chencheng/crux/data/female/2016032[0-3]18/") #user_artist_data2 = sc.textFile("hdfs://antispam/user/hadoop/output/chencheng/crux/data/male/2016032[0-1]18/") user_artist_data= user_artist_data1.union(user_artist_data2) ratings = user_artist_data.map(lambda x: json.loads(x))\ .filter(lambda x: x[0][0] and x[0][1])\ .map(lambda x: Rating(int(x[0][0]), int(x[0][1]), float(x[1]))) ratings.checkpoint() ratings.cache() rank = 30 numIterations = 25 ALS.checkpointInterval = 2 model = ALS.train(ratings, rank, numIterations,lambda_=0.03,nonnegative=True) model.save(sc,"hdfs://antispam/user/hadoop/output/chencheng/model/als_female_parameters/30/als_female_0310-23_003_")
const=True, default=False, help= 'Include extra (non-restaurant businesses) rows for training the model.') parser.add_argument('--withTrainTestSplit', required=False, type=str2bool, nargs='?', const=True, default=False) args = vars(parser.parse_args()) conf = SparkConf().setAppName("eda").setMaster("local[*]") sc = SparkContext(conf=conf) sc.setCheckpointDir("checkpoint_dir/") outFile_name = 'businesses' start = timer() business_json_df = ( pd.read_json('yelp_dataset/yelp_academic_dataset_business.json', lines=True) # Set the index to business_id; this will be used for joining later. .set_index('business_id').dropna(subset=['attributes', 'categories'])) # Filter out non-restaurant businesses. mask = (business_json_df['categories'].str.contains(r"Food|Restaurant|Bar", case=False) | business_json_df['name'].str.contains("Restaurant|Cuisine", case=False))
if error < min_error: min_error = error best_rank = rank best_lambda = lambda_i print 'The best model was trained with rank %s, lambda %f' % (best_rank, best_lambda) # Test model = ALS.train(training_RDD, best_rank, seed=seed, iterations=iterations, lambda_=best_lambda) predictions = model.predictAll(test_for_predict_RDD).map(lambda r: ((r[0], r[1]), r[2])) rates_and_preds = test_RDD.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions) error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean()) print 'For testing data the RMSE is %s' % (error) ''' # Using the complete dataset to build the final model; re-do the above # Load the complete dataset file complete_ratings_file = os.path.join('./datasets', 'ml-latest', 'ratings.csv') complete_ratings_raw_data = sc.textFile(complete_ratings_file) complete_ratings_raw_data_header = complete_ratings_raw_data.take(1)[0] # Parse complete_ratings_data = complete_ratings_raw_data.filter(lambda line: line!=complete_ratings_raw_data_header)\ .map(lambda line: line.split(",")).map(lambda tokens: (int(tokens[0]),int(tokens[1]),float(tokens[2]))).cache() print "There are %s recommendations in the complete dataset" % (complete_ratings_data.count()) # to avoid stackover flow sc.setCheckpointDir('checkpoint/') training_RDD, test_RDD = complete_ratings_data.randomSplit([7, 3], seed=0L) complete_model = ALS.train(training_RDD, best_rank, seed=seed,
from pyspark.ml.evaluation import RegressionEvaluator from pyspark.storagelevel import StorageLevel import sys reload(sys) sys.setdefaultencoding('utf8') print("------ CONFIG -------") # CONF CLUSTER conf=SparkConf()\ .set('spark.network.timeout','5000000s')\ .set('spark.executor.heartbeatInterval','4500000s') sc = SparkContext(conf=conf) sc.setCheckpointDir("checkpointdir2") sqlContext = SQLContext(sc) print("------ reading data -------") data = sqlContext.read.parquet("data.parquet") print(sc.uiWebUrl) print("------ train test split --------") (trainingData, testData) = data.randomSplit([0.7, 0.3]) print("--------- Indexing categorical values for OHE------------") codeIndexer = StringIndexer(inputCol='code', outputCol='code_index').setHandleInvalid("keep") lieuIndexer = StringIndexer(inputCol='lieu', outputCol='lieu_index').setHandleInvalid("keep")
import findspark  # this needs to be the first import
findspark.init()

import networkx as nx
from pyspark import SparkConf
from pyspark import SparkContext

from snpp.cores.lowrank import partition_graph

conf = SparkConf().setMaster("local[2]").setAppName("SparkTest")
sc = SparkContext(conf=conf)
sc.setCheckpointDir('checkpoint')  # Stackoverflow error

train_graph_path = 'data/{}/train_graph.pkl'.format('slashdot')
g = nx.read_gpickle(train_graph_path)

partition_graph(g, k=40, sc=sc, lambda_=0.1, iterations=20, seed=123456)

sc.stop()
##### Main Execution Code conf = SparkConf().setAppName("Subreddit extraction") conf.set("spark.python.worker.memory","10g") conf.set("spark.driver.memory","15g") conf.set("spark.executor.memory","10g") conf.set("spark.default.parallelism", "12") conf.set("spark.mesos.coarse", "true") conf.set("spark.driver.maxResultSize", "10g") # Added the core limit to avoid resource allocation overruns conf.set("spark.cores.max", "10") conf.setMaster("mesos://zk://scc-culture-slave4.lancs.ac.uk:2181/mesos") conf.set("spark.executor.uri", "hdfs://scc-culture-mind.lancs.ac.uk/lib/spark-1.3.0-bin-hadoop2.4.tgz") conf.set("spark.broadcast.factory", "org.apache.spark.broadcast.TorrentBroadcastFactory") sc = SparkContext(conf=conf) sc.setCheckpointDir("hdfs://scc-culture-mind.lancs.ac.uk/data/checkpointing") # get the HDFS url of the dataset dataset = "reddit" hdfsUrl = inlocation # broadcast the name of the dataset to the cluster print("----Broadcasting the name of the dataset being processed") datasetName = sc.broadcast(dataset) # run a map-reduce job to first compile the RDD for the dataset loaded from the file print("-----Dataset file: " + hdfsUrl) rawPostsFile = sc.textFile(hdfsUrl, minPartitions=12) # clean the posts and write them into HDFS from their respective paritions print("Writing to HDFS")
structfields.append( StructField(column_names[i], StringType(), True)) elif column_types[i] == "date": structfields.append(StructField(column_names[i], DateType(), True)) else: raise ValueError("column type undefined: " + column_types[i]) schema = StructType(structfields) print("...SCHEMA OF " + name + " IS " + str(schema)) # Read in files location_file = open(location_file_name, "r") locations = location_file.read().splitlines() sc.setCheckpointDir(locations[2] + "/" + name) rdd = sc.textFile(csv_file, use_unicode=False) rdd.checkpoint() rdd2 = rdd.map(lambda line: parse(column_types, line)) df = sqlContext.createDataFrame(rdd2, schema) # Convert to Parquet start = time.time() df.write.parquet(locations[3] + name) end = time.time() # Record time and before/after file sizes output_file_name = './output_' + compression_scheme + '_' + dict_encoding + '/' + name + '_' + compression_scheme + '_' + dict_encoding + '.txt' if os.path.exists(output_file_name): append_write = 'a' # append if already exists else:
def computeRmse(model, evalSet): evalSetUserProduct = evalSet.map(lambda x: (x[0], x[1])) predictions = model.predictAll(evalSetUserProduct).map(lambda r: ((r[0], r[1]), r[2])) ratesAndPreds = evalSet.map(lambda r: ((r[0], r[1]), r[2])).join(predictions) validationRmse = math.sqrt(ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()) return validationRmse if __name__ == "__main__": conf = SparkConf() \ .setAppName("YelpReviewALS") \ .set("spark.executor.memory", "1g") sc = SparkContext('local', conf=conf) reviewRDD = sc.textFile("../../../data/review_large.txt") sc.setCheckpointDir("checkpoints/") if len(sys.argv) < 2: printUsage() exit(1) if sys.argv[1] == '-e': evalModel = True else: evalModel = False if sys.argv[1] == '-c': runValidation = True else: runValidation = False
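# A hedged illustration of how a helper like computeRmse() above might be called
# end to end; the in-memory (user, item, rating) triples and the rank/iteration
# values are made-up assumptions for this sketch.
import math  # used inside computeRmse
from pyspark import SparkContext
from pyspark.mllib.recommendation import ALS

sc = SparkContext("local[2]", "rmse-demo")
sc.setCheckpointDir("checkpoints/")

ratings = sc.parallelize([
    (1, 10, 4.0), (1, 11, 2.0),
    (2, 10, 5.0), (2, 12, 1.0),
])
model = ALS.train(ratings, rank=5, iterations=10)
print("RMSE on the training triples: %f" % computeRmse(model, ratings))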
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 12 14:12:44 2017

@author: Administrator
"""
from pyspark import SparkContext, SparkConf
from pyspark.mllib.recommendation import ALS
from math import sqrt
from operator import add

conf = SparkConf().setAppName("MovieRecommendation").set(
    "spark.executor.memory", "4g")
sc = SparkContext(conf=conf)
# Set a checkpoint directory so that a large number of ALS iterations does not
# fail; the exact mechanism is unclear, but this workaround turned up after a
# lot of searching.
sc.setCheckpointDir("D:/WorkSpace/Spyder/checkpoint")


# Parse the user rating data
def HandleRating(line):
    user = line.strip().split(",")
    return int(user[3]) % 10, (int(user[0]), int(user[1]), float(user[2]))


# Parse the movie data, returning [(movie id, movie title)]
def HandleMovie(line):
    movie = line.strip().split(",")
    return int(movie[0]), movie[1]


# Parse the movie data, returning [(movie id, (movie genres))]; mainly used to
# handle the cold-start problem by recommending movies based on genre
def train(): conf = SparkConf() \ .setAppName("project") \ .setMaster("local[*]") \ .set("spark.driver.memory","4g") sc = SparkContext(conf=conf) # check model dir if not os.path.exists(model_dir): os.mkdir(model_dir) # rename raw_data = sc.textFile(train_file).map(json.loads).persist( StorageLevel.MEMORY_AND_DISK) u_table1 = raw_data.map(lambda x: x['user_id']).distinct().collect() u_set1 = set(u_table1) b_table1 = raw_data.map(lambda x: x['business_id']).distinct().collect() b_set1 = set(b_table1) user_avg = support.getAvg(user_avg_file) business_avg = support.getAvg(business_avg_file) u_set2 = set(user_avg.keys()) b_set2 = set(business_avg.keys()) b_table3 = sc.textFile(business_json).map( json.loads).map(lambda x: x['business_id']).collect() b_set3 = set(b_table3) u_table = list(u_set1.union(u_set2)) b_table = list(b_set1.union(b_set2).union(b_set3)) u_d = {u_table[i]: i for i in range(len(u_table))} b_d = {b_table[i]: i for i in range(len(b_table))} # agmentation business_avg = support.getAvg(business_avg_file) n_b_avg = {b_d[k]: business_avg[k] for k in business_avg} # get stopwords stopwords = sc.textFile(stopwords_file).collect() b_profile = sc.textFile(business_json) \ .map(json.loads) \ .map(lambda x: (x['business_id'], x['categories'])) \ .map(lambda x: (b_d[x[0]], x[1])) \ .mapValues(lambda v: processCategories(v, stopwords)) \ .collectAsMap() b_list = list(sorted(b_profile.keys())) b_length = len(b_profile) jaccard_sim = sc.parallelize(b_list) \ .flatMap(lambda x: getJS(x, b_profile, b_list)) \ .reduceByKey(lambda x, y: x + y) \ .mapValues(lambda vs: {k: v for k, v in vs}) \ .collect() agm_data = raw_data.map(lambda r: (r['user_id'], r['business_id'], r['stars'])) \ .map(lambda x: (u_d[x[0]], b_d[x[1]], x[2])) \ .map(lambda x: (x[0], [(x[1], x[2])])) \ .reduceByKey(lambda x, y: x + y) \ .mapValues(lambda vs: processValues(vs, jaccard_sim, n_b_avg)) \ .flatMap(lambda x: [(x[0], b, star) for b, star in x[1]]) \ .persist(StorageLevel.MEMORY_AND_DISK) # asl agm_train = agm_data.map(lambda x: ((u_table[x[0]], b_table[x[1]]), x[2])).collect() support.writeDownRenameTable(agm_train, agm_train_file) lonely_user = agm_data.map(lambda x: (x[0], 1)) \ .reduceByKey(lambda x, y: x + y) \ .filter(lambda x: x[1] < LONELY_USER_THRESHOLD) \ .map(lambda x: x[0]) \ .collect() lonely_business = agm_data.map(lambda x: (x[1], 1)) \ .reduceByKey(lambda x, y: x + y) \ .filter(lambda x: x[1] < LONELY_BUSINESS_THRESHOLD) \ .map(lambda x: x[0]) \ .collect() stars_data = agm_data.filter(lambda x: x[0] not in lonely_user and x[1] not in lonely_business) \ .map(lambda x: Rating(x[0], x[1], x[2])).persist(StorageLevel.MEMORY_AND_DISK) sc.setCheckpointDir(checkpoint_file) ALS.checkpointInterval = 2 modelRDD = ALS.train(ratings=stars_data, rank=1, iterations=70, lambda_=0.01, nonnegative=True) saveAlsModel(modelRDD, u_table, b_table, als_model_file)
def computeExposureGraphForTop500Entities(): ###### Spark Tranformation Functions # Returns: [(reply_id, orig_post_id)] - for flatMap - to avoid issue of 0 cardinality of the list def deriveReplyMap(json_line): reply_tuples = [] json_obj = json.loads(json_line) if 'parent_id' in json_obj: orig_post_id = str(json_obj['parent_id'].encode("utf-8")) reply_post_id = str(json_obj['name'].encode("utf-8")) reply_tuples.append((reply_post_id, orig_post_id)) return reply_tuples def combineReplies(replies1, replies2): replies = replies1 + replies2 return replies def derivePostDetails(json_line): # get the entity posts # entity_posts_map = entity_posts_map_broadcast.value # entity_posts = [] # for entity in entity_posts_map: # entity_posts += entity_posts_map[entity] # entity_posts_set = set(entity_posts) # for entity in entity_posts_map: # entity_posts_map[entity] # get the set of posts that cite the top-500 entities # entity_posts_set = [entity_posts_map[entity] for entity in entity_posts_map] # need to log: postid, userid, time of post json_obj = json.loads(json_line) post_id = str(json_obj['name'].encode("utf-8")) # if post_id in entity_posts_set: created_string = float(json_obj['created_utc']) created_date = datetime.fromtimestamp(created_string).date() user_id = str(json_obj['author'].encode("utf-8")) post_dict = {'user_id': user_id, 'post_id': post_id, 'created_date': created_date} return (post_id, post_dict) # else: # return ("null", {}) def deriveEntityToPosts(json_line): # Get the broadcast maps that need to be used orig_replies_map = orig_replies_map_broadcast.value reply_orig_map = reply_orig_map_broadcast.value top_500_entities = set(top_500_entities_broadcast.value) entity_posts = [] json_obj = json.loads(json_line) post_id = json_obj['name'] entities = json_obj['entity_texts'] for entity in entities: # ensure that the entities is one that we want to process if entity in top_500_entities: try: # ensure that the post appears in a chain - in order to filter out singleton citations if post_id in orig_replies_map or post_id in reply_orig_map: entity_posts.append((str(entity), [str(post_id)])) except: pass return entity_posts # input: tuple x where x[0] = entity name, x[1] = entity posts def derive_per_entity_exposure_distribution(x): entity = x[0] posts = x[1] # get the posts for this entity from the broadcast variable # entity_posts_list = entity_posts_map_broadcast.value[entity] entity_posts_list = posts post_details_map = post_details_broadcast.value # time order the posts of the entity print("Generating time-ordered posts and users citing the entity") date_to_posts = {} post_users = [] for post in entity_posts_list: # check that the post has been by an existing user post_user = post_details_map[post]['user_id'] if '[deleted]' not in post_user and post in post_details_map: post_users.append(post_user) print(post_user) post_date = post_details_map[post]['created_date'] # get the timestamp if post_date in date_to_posts: date_to_posts[post_date] += [post] else: date_to_posts[post_date] = [post] # get the user specific replies entity_users = set(post_users) print("Collecting the user specific time series interaction graph") reply_orig_map = reply_orig_map_broadcast.value # Get the: {user, {date, interacted_user}} map user_to_interaction_dates = {} for reply_post_id in reply_orig_map: if post_details_map[reply_post_id]['user_id'] in entity_users: # Check that we have the post details from the reply map if reply_orig_map[reply_post_id] in post_details_map and reply_post_id in post_details_map: 
user_id = post_details_map[reply_post_id]['user_id'] interaction_date = post_details_map[reply_post_id]['created_date'] interacted_user = str(post_details_map[reply_orig_map[reply_post_id]]['user_id']) if '[deleted]' not in interacted_user: if user_id in user_to_interaction_dates: interaction_dates = user_to_interaction_dates[user_id] if interaction_date in interaction_dates: interaction_dates[interaction_date] += [interacted_user] else: interaction_dates[interaction_date] = [interacted_user] user_to_interaction_dates[user_id] = interaction_dates else: user_to_interaction_dates[user_id] = {interaction_date: [interacted_user]} # Go through and derive the point of activation of each user activation_points = {} activated_users = set() for user in entity_users: # get times when they interacted with people # dict: {date, [user_id]} if user in user_to_interaction_dates: ts_interactions = user_to_interaction_dates[user] # go through the posts of the entity each day for date in sorted(date_to_posts): # only do the computation if the user is not activated already if user not in activated_users: # Go through each post on the sorted dates date_posts = date_to_posts[date] for post in date_posts: # is the post by the user - if so, this will be the user's first citation of the entity: activated if post_details_map[post]['user_id'] is user: # get how many times the user was exposed to the entity before adopting it prior_post_authors = [] for prior_date in sorted(date_to_posts): if prior_date < date: # get all the users who authored posts before this one for prior_post in date_to_posts[prior_date]: prior_post_authors.append(post_details_map[prior_post]['user_id']) # get all of the posts that were authored by people that the person had replied to in the past prior_users = set() for ts_date in sorted(ts_interactions): if ts_date < date: print("TS Date = " + str(ts_date)) print(str(ts_interactions)) for prior_user in ts_interactions[ts_date]: prior_users.add(prior_user) exposure_count = 0 # count how many times the user was exposed to the entity (given that they had interacted with the users beforehand for prior_author in prior_post_authors: if prior_author in prior_users: exposure_count += 1 # log the exposure count if exposure_count in activation_points: activation_points[exposure_count] += 1 else: activation_points[exposure_count] = 1 # Exit the posts dates loop activated_users.add(user) break # Return the mapping between the entity and the activation point distribution return (entity, activation_points) ###### Execution code conf = SparkConf().setAppName("NER Diffusion - Exposure Dynamics") conf.set("spark.python.worker.memory","10g") conf.set("spark.driver.memory","15g") conf.set("spark.executor.memory","10g") conf.set("spark.default.parallelism", "12") conf.set("spark.mesos.coarse", "true") conf.set("spark.driver.maxResultSize", "10g") conf.set("spark.cores.max", "15") conf.setMaster("mesos://zk://scc-culture-slave9.lancs.ac.uk:2181/mesos") conf.set("spark.executor.uri", "hdfs://scc-culture-mind.lancs.ac.uk/lib/spark-1.3.0-bin-hadoop2.4.tgz") conf.set("spark.broadcast.factory", "org.apache.spark.broadcast.TorrentBroadcastFactory") sc = SparkContext(conf=conf) sc.setCheckpointDir("hdfs://scc-culture-mind.lancs.ac.uk/data/checkpointing") # use sample directory for testing # annotationFile = sc.textFile("hdfs://scc-culture-mind.lancs.ac.uk/reddit/annotated-sample") annotationFile = sc.textFile("hdfs://scc-culture-mind.lancs.ac.uk/reddit/annotated") annotationFile.cache() thinnedFile = 
sc.textFile("hdfs://scc-culture-mind.lancs.ac.uk/reddit/thinned-json") # thinnedFile = sc.textFile("hdfs://scc-culture-mind.lancs.ac.uk/reddit/thinned-json-sample") thinnedFile.cache() # Top 500 entities file top500EntitiesFile = sc.textFile("hdfs://scc-culture-mind.lancs.ac.uk/reddit/entities/top500_entities.csv") top500Entities = top500EntitiesFile.map(lambda x: str(x.encode("utf-8"))).collect() print("Top Entities Loaded. Total = " + str(len(top500Entities))) top_500_entities_broadcast = sc.broadcast(top500Entities) # print(str(len(top500Entities))) # print(top500Entities) # Load the reply graphs from the thinnedFile reply_map_rdd = thinnedFile\ .flatMap(deriveReplyMap) reply_map_rdd.cache() reply_orig_map = reply_map_rdd\ .collectAsMap() print("Reply Orig Map Size = " + str(len(reply_orig_map))) # print(reply_orig_map) # TODO: Convert code to HBase format as this fails to broadcast with Java heap space memory error reply_orig_map_broadcast = sc.broadcast(reply_orig_map) # # get the: {orig, [reply]} dictionary orig_replies_map = reply_map_rdd\ .map(lambda x: (x[1], [x[0]]))\ .reduceByKey(combineReplies)\ .collectAsMap() print("Orig Replies Map Size = " + str(len(orig_replies_map))) orig_replies_map_broadcast = sc.broadcast(orig_replies_map) # Load the entity to post map - restricted to the top-500 entities # input: json_line of annotations of each post # output: [(entity, [post])] of posts where entities appeared entity_posts_rdd = annotationFile\ .flatMap(deriveEntityToPosts)\ .reduceByKey(lambda p1, p2: p1 + p2) entity_posts_rdd_map = entity_posts_rdd.collectAsMap() # entity_posts_map_broadcast = sc.broadcast(entity_posts_rdd_map) print("Entity to posts Map Size = " + str(len(entity_posts_rdd_map))) # print(entity_posts_rdd_map) # Get the post details - restricted to the top-500 entities for now post_details = thinnedFile\ .map(derivePostDetails)\ .filter(lambda x: x[0] is not "null")\ .collectAsMap() post_details_broadcast = sc.broadcast(post_details) print("Post Details Size = " + str(len(post_details))) # print(post_details) # Compute the exposure curves for each entity in the dataset entity_exposure_curves_rdd = entity_posts_rdd\ .map(derive_per_entity_exposure_distribution) entity_exposure_curves_rdd_distribution = entity_exposure_curves_rdd\ .collect() # print(entity_exposure_curves_rdd_distribution) print("Entity Exposure Curves Distribution Size = " + str(len(entity_exposure_curves_rdd_distribution))) output_string = "" for (entity, dist_dictionary) in entity_exposure_curves_rdd_distribution: output_string += str(entity) for exposure_count in dist_dictionary: output_string += "\t" + str(exposure_count) + ", " + str(dist_dictionary[exposure_count]) output_string += "\n" print(output_string) outputFile = open("data/exposure_curves.tsv", "w") outputFile.write(output_string) outputFile.close() # stop the Spark context from running sc.stop()
#ss = SparkSession.builder \ # .master("local") \ # .config("spark.some.config.option", "some-value") \ # .getOrCreate() ss = SQLContext(sc) thres = int(sys.argv[1]) data = sc.textFile( sys.argv[2]).mapPartitionsWithIndex(lambda idx, row: islice( row, 1, None) if idx == 0 else row).map(lambda x: x.split(',')) b = data.map(lambda x: (x[1], x[0])).repartition(15).cache() edges = b.join(b).filter(lambda x: x[1][0] != x[1][1]).map( lambda x: (x[1], x[0])).aggregateByKey( [], lambda x, y: x + [y], lambda x, y: x + y).filter( lambda x: len(set(x[1])) >= thres).map(lambda x: x[0]) sc.setCheckpointDir("s/") vertices = ss.createDataFrame( edges.flatMap(lambda x: list(x)).distinct().map(lambda x: [x]), ["id"]) g_edge = ss.createDataFrame(edges, ["src", "dst"]) print(vertices.count()) print(g_edge.count()) g = GraphFrame(vertices, g_edge) ans = g.labelPropagation( maxIter=5).rdd.map(lambda x: (x[1], x[0])).aggregateByKey( [], lambda x, y: x + [y], lambda x, y: x + y).map(lambda x: sorted(x[1])).sortBy( lambda x: (len(x), x[0])).collect() with open(sys.argv[3], "w") as file: for i in ans:
# createDirectStream() to fetch data from topic
print('starting streaming:' + str(datetime.now()))

# Process the dstream: pick the value as Kafka sends (key, value) pairs,
# split the data into lines/rows, format each row, select the required
# columns, and create the tuple ((symbol, type), amount) so it can be
# reduced by key later.
stockvol = filestream.map(lambda x: x[1]).flatMap(
    lambda x: [line for line in x.splitlines()]).flatMap(parseOrder).map(
        lambda o: ((o['symbol'], o['type']), o['amount']))

# Keep track of the number of records received from Kafka, for logging purposes.
noofrecords = filestream.map(lambda x: x[1]).flatMap(
    lambda x: [line for line in x.splitlines()]).flatMap(parseOrder).map(
        lambda o: ((o['symbol'], o['type']), o['amount']))
noofrecords.foreachRDD(saveOfunc)

# Create a windowed dstream (window duration / slide interval) to collect RDDs in it.
stockvol_window = stockvol.window(10, 10)
stockvol_aggr = stockvol_window.reduceByKey(add)

# Transform the windowed dstream to find the highest trade-volume stock
# in each RDD of the window batch and save it to HDFS.
stockvol_highest = stockvol_aggr.transform(volhigh)
stockvol_highest.foreachRDD(savefunc)

sc.setCheckpointDir("hdfs:///user/cloudera/checkpoint/")
ssc.start()
ssc.awaitTermination()
from pyspark import SparkConf, SparkContext
from pyspark.mllib.recommendation import ALS, Rating


def loadMovieNames():
    movieNames = {}
    # read as ASCII and drop undecodable bytes (in Python 3, fields[1] is already str)
    with open("ml-100k/u.ITEM", encoding='ascii', errors='ignore') as f:
        for line in f:
            fields = line.split('|')
            movieNames[int(fields[0])] = fields[1]
    return movieNames


conf = SparkConf().setMaster("local[*]").setAppName("MovieRecommendationsALS")
sc = SparkContext(conf=conf)
sc.setCheckpointDir('checkpoint')

print("\nLoading movie names...")
nameDict = loadMovieNames()

# umadeup: data created on top of u.data (3 rows)
data = sc.textFile("C:/SparkCourse/ml-100k/umadeup.data")
ratings = data.map(lambda l: l.split()).map(
    lambda l: Rating(int(l[0]), int(l[1]), float(l[2]))).cache()

# Build the recommendation model using Alternating Least Squares
print("\nTraining recommendation model...")
rank = 10
# Lowered numIterations to ensure it works on lower-end systems
numIterations = 6
model = ALS.train(ratings, rank, numIterations)
def init_spark_context():
    conf = SparkConf().setAppName("MovieRatings").set("spark.executor.memory", "4g")
    sc = SparkContext(conf=conf)
    sc.setCheckpointDir('/tmp/checkpoint/')
    return sc
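# A closing usage note: a minimal hedged sketch of how a helper like
# init_spark_context() tends to be consumed for ALS training. The ratings file
# path, rank, and iteration count are illustrative assumptions only.
from pyspark.mllib.recommendation import ALS, Rating

sc = init_spark_context()  # checkpoint dir is already configured by the helper

# Hypothetical "user,movie,rating" CSV; adjust the path to real data.
ratings = sc.textFile("ratings.csv") \
    .map(lambda line: line.split(",")) \
    .map(lambda t: Rating(int(t[0]), int(t[1]), float(t[2])))

# With a checkpoint directory set, a larger iteration count avoids the
# StackOverflowError caused by very deep lineage chains.
model = ALS.train(ratings, rank=10, iterations=20)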