def initialize():
    global sc, spark, items, inputfile, buckets_user, buckets_business, partition, totalSize, t, mainThreshold
    print("Initializing...")
    t = time.time()
    candidateList = []
    frequentList = []

    sc_conf = SparkConf()
    sc_conf.setAppName("Task1")
    sc_conf.setMaster('local[*]')
    sc_conf.set("spark.driver.bindAddress", "127.0.0.1")
    sc = SparkContext(conf=sc_conf)
    sc.setLogLevel("ERROR")

    # Read the CSV, drop the header row and split every line into fields
    csvread = sc.textFile(inputfile)
    columnName = csvread.first().split(',')
    items = csvread.map(lambda line: line.split(",")).filter(
        lambda line: line != columnName)

    # Baskets per user, keeping only users with more than mainThreshold items
    buckets_user = items.groupByKey().mapValues(list).filter(
        lambda x: len(x[1]) > mainThreshold).mapPartitionsWithIndex(
        removeDuplicateEntriesAfter)
    print("Without Duplicates Done..")

    # withoutDuplicates = checkM.mapPartitionsWithIndex(
    #     removeDuplicateEntries).groupByKey().mapValues(list)

    if case == 1:
        # buckets_user = withoutDuplicates.mapPartitionsWithIndex(
        #     createBuckets).groupByKey().mapValues(list).filter(
        #         lambda x: len(x[1]) > mainThreshold)
        callSonPhase1(buckets_user)
        print("Initializing Phase 2.....")
        # SON phase 2: count the phase-1 candidates in every partition,
        # sum the counts and keep the itemsets that meet the support threshold
        finalFreq = buckets_user.mapPartitionsWithIndex(
            lambda partition_index, iter_row: phase2(partition_index, iter_row)
        ).reduceByKey(lambda x, y: x + y).filter(
            lambda x: x[1] >= threshold).map(lambda x: makeList(x[0]))
        finalOutput = finalFreq.collect()
        x = sorted(finalOutput, key=lambda item: (len(list(item)), list(item)))
        printingFreq(x)

    if case == 2:
        # NOTE: withoutDuplicates is built by the commented-out pipeline above;
        # it has to be re-enabled for this branch to run
        buckets_business = withoutDuplicates.mapPartitionsWithIndex(
            createBuckets_case2).groupByKey().mapValues(list)
        callSonPhase1(buckets_business)
        print("Initializing Phase 2.....")
        finalFreq = buckets_business.mapPartitionsWithIndex(
            lambda partition_index, iter_row: phase2(partition_index, iter_row)
        ).reduceByKey(lambda x, y: x + y).filter(
            lambda x: x[1] >= threshold).map(lambda x: makeList(x[0]))
        finalOutput = finalFreq.collect()
        x = sorted(finalOutput, key=lambda item: (len(list(item)), list(item)))
        printingFreq(x)
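The phase-2 pass above relies on helpers (phase2, makeList, printingFreq) that are defined elsewhere in the script. For orientation only, here is a minimal sketch of what a SON phase-2 counter usually looks like; the global candidateList of phase-1 candidates and the (key, basket) record layout are assumptions, not the author's actual code.

def phase2_sketch(partition_index, iter_row):
    # Count every phase-1 candidate inside this partition's baskets and emit
    # (candidate, local_count) pairs for the reduceByKey that follows.
    baskets = [set(basket) for _, basket in iter_row]
    counts = {}
    for cand in candidateList:
        # singletons may be plain strings, larger itemsets tuples
        cand_items = cand if isinstance(cand, tuple) else (cand,)
        for basket in baskets:
            if all(item in basket for item in cand_items):
                counts[cand] = counts.get(cand, 0) + 1
    return iter(counts.items())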
def initialize():
    global items, inputfile, sc, filterThreshold, t, totalEdges, cost_dict, strict_totalNodes, adjacency_listMain
    t = time.time()
    sc_conf = SparkConf()
    sc_conf.setAppName("Task1")
    sc_conf.setMaster('local[*]')
    # sc_conf.set("spark.driver.bindAddress", "127.0.0.1")
    sc = SparkContext(conf=sc_conf)
    sc.setLogLevel("ERROR")

    csvread = sc.textFile(inputfile)
    columnName = csvread.first().split(',')
    items = csvread.map(lambda line: line.split(",")).filter(
        lambda line: line != columnName)

    # Getting each user and the set of businesses they reviewed
    user_business = items.groupByKey().mapValues(set).collect()

    # Build an undirected edge list: connect two users whose business sets
    # share at least filterThreshold items (both directions are stored)
    tuple_edge_list = []
    for i in range(0, len(user_business) - 1):
        for j in range(i + 1, len(user_business)):
            inter = user_business[i][1] & user_business[j][1]
            if len(inter) >= filterThreshold:
                tuple_edge_list.append(
                    (str(user_business[i][0]), str(user_business[j][0])))
                tuple_edge_list.append(
                    (str(user_business[j][0]), str(user_business[i][0])))

    totalEdges = float(len(tuple_edge_list) / 2)

    adjacency_list = sc.parallelize(tuple_edge_list).groupByKey().mapValues(
        list).collectAsMap()
    adjacency_listMain = copy.deepcopy(adjacency_list)
    totalNodes = list(adjacency_list.keys())
    strict_totalNodes = copy.deepcopy(totalNodes)

    # ----------------------Part 1---------------------
    bfs(totalNodes, adjacency_list)
    print("Writing Betweenness to File....")

    # Convert the betweenness dict into a list sorted by descending score, then key
    list_val = list(cost_dict.items())
    list_val.sort(key=lambda x: (-x[1], x[0]))
    writeToFile(list_val)

    totalNodes = copy.deepcopy(strict_totalNodes)

    # ----------------------Part 2----------------------
    print("Creating Partitions....")
    create_components(list_val, adjacency_listMain, totalNodes, totalEdges)

    print("Duration: " + str(time.time() - t))
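The bfs() helper and the cost_dict edge-betweenness accumulator are defined elsewhere in this script. Purely for context, a hypothetical single-source pass of the Girvan-Newman betweenness computation is sketched below; the function name, variable names, and credit-splitting convention are illustrative assumptions, not the author's implementation.

from collections import defaultdict, deque

def betweenness_from_source(root, adjacency_list, cost_dict):
    # BFS from root: record level, number of shortest paths, and parents.
    level = {root: 0}
    num_paths = defaultdict(int)
    num_paths[root] = 1
    parents = defaultdict(list)
    order = []
    queue = deque([root])
    while queue:
        node = queue.popleft()
        order.append(node)
        for nbr in adjacency_list.get(node, []):
            if nbr not in level:
                level[nbr] = level[node] + 1
                queue.append(nbr)
            if level.get(nbr) == level[node] + 1:
                num_paths[nbr] += num_paths[node]
                parents[nbr].append(node)
    # Walk back up, splitting credit across parents in proportion to path counts.
    # Each edge is seen from both endpoints over all sources, so halve the credit.
    node_credit = defaultdict(lambda: 1.0)
    for node in reversed(order):
        for parent in parents[node]:
            share = node_credit[node] * num_paths[parent] / num_paths[node]
            edge = tuple(sorted((parent, node)))
            cost_dict[edge] = cost_dict.get(edge, 0.0) + share / 2.0
            node_credit[parent] += share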
def initialize():
    global sc, spark, items, inputfile
    print("Initializing...")
    sc_conf = SparkConf()
    sc_conf.setAppName("Task1")
    sc_conf.setMaster('local[*]')
    sc_conf.set("spark.driver.bindAddress", "127.0.0.1")
    sc = SparkContext(conf=sc_conf)
    sc.setLogLevel("ERROR")
    jsonread = sc.textFile(inputfile)
    items = jsonread.map(json.loads)
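A quick usage sketch (not part of the original function): once items holds parsed JSON records, fields can be pulled out directly. The 'user_id' field name below is an assumption about the input file, not something this snippet guarantees.

# Hypothetical usage: count distinct users, assuming each record has a 'user_id' field.
print(items.map(lambda rec: rec.get('user_id')).distinct().count())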
import pyspark
from pyspark.context import SparkContext, SparkConf
from pyspark.sql import SQLContext  # , HiveContext
# from pyspark.storagelevel import StorageLevel
import atexit
from pyspark_cassandra import CassandraSparkContext
from datetime import tzinfo, timedelta, datetime
from pytz import timezone

conf = SparkConf()
# conf.setMaster("local")
conf.setAppName("My app")
conf.set("spark.cassandra.connection.host", "10.0.40.42")
sc = CassandraSparkContext(conf=conf)
atexit.register(lambda: sc.stop())

rdd = sc.cassandraTable("el_test", "cockpit2_testTogether")

# for( d in range 2015-10-01 ~ 2015-10-10 ) do:
#
#   SELECT url,date,site,cnts,cnt from cockpit2_allTogether
#   where `date` = d and site = giga and tags contains resort:android
#
#   after this query, every row has to be updated with a new value for cnts:
#
#   UPDATE cnts = ga_videoPlays + sda_downloads + fb_socialFacebookLikes
#               + fb_socialFacebookShares + fb_socialFacebookComments
#               + tw_socialTwitterShares + ga_socialGooglePlusShares
#               + gigya_socialComments


def filterDateRage(_from, _to, col):
    loc = timezone('Europe/Berlin')
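The filterDateRage body is cut off above after the timezone lookup. Purely as a hypothetical completion, assuming col is a 'YYYY-MM-DD' date string taken from each row's date column, it might look like the sketch below.

def filter_date_range_sketch(_from, _to, col):
    # Hypothetical helper, not the original implementation: keep rows whose
    # date column falls inside [_from, _to], interpreted in Europe/Berlin time.
    loc = timezone('Europe/Berlin')
    start = loc.localize(datetime.strptime(_from, '%Y-%m-%d'))
    end = loc.localize(datetime.strptime(_to, '%Y-%m-%d'))
    row_date = loc.localize(datetime.strptime(col, '%Y-%m-%d'))
    return start <= row_date <= end

# e.g. rdd.filter(lambda row: filter_date_range_sketch('2015-10-01', '2015-10-10', row['date']))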
# /opt/spark/bin/pyspark --master local[1] --jars /opt/symetry/lib/sym-spark-assembly.jar --driver-java-options -Dsym.lic.loc=/opt/symetry/sym.lic
# execfile('/Users/mike/rtds/master/RTLM/ScalaProjects/sym-shell/src/com/sml/examples/python/amazonExample.py')

import os
import sys
import pyspark
from pyspark.context import SparkContext
from pyspark.context import SparkConf
from pyspark.sql import SQLContext, HiveContext
from pyspark.storagelevel import StorageLevel

print("amazonExample.py start")

conf = SparkConf()
conf.setAppName('amazonExample')
sc = SparkContext(conf=conf)
gateway = sc._gateway
sym = gateway.jvm.com.sml.shell

# Find the access keys for EC2.
awsAccessKeyId = os.environ['AWS_ACCESS_KEY']
awsSecretAccessKey = os.environ['AWS_SECRET_KEY']
# print("awsAccessKeyId=" + awsAccessKeyId)
# print("awsSecretAccessKey=" + awsSecretAccessKey)

sc._jsc.hadoopConfiguration().set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
sc._jsc.hadoopConfiguration().set("fs.s3a.access.key", awsAccessKeyId)
sc._jsc.hadoopConfiguration().set("fs.s3a.secret.key", awsSecretAccessKey)

myrdd = sc.textFile('s3a://sml-oregon/datasets/susy/SUSYmini.csv')
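A short usage sketch (not in the original script) to confirm the s3a credentials and path resolve: parse the CSV rows and count them.

# Hypothetical smoke test: split each CSV line and count the rows read from S3.
parsed = myrdd.map(lambda line: line.split(','))
print("SUSYmini.csv rows: %d" % parsed.count())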
#
import os, sys

# set OS environment variable
os.environ["SPARK_HOME"] = '/usr/local/Cellar/apache-spark/2.2.0/libexec'
# add Spark library to Python
sys.path.insert(0, os.path.join(os.environ["SPARK_HOME"], 'python'))

# import package
import pyspark
from pyspark.context import SparkContext, SparkConf
import atexit


def stop_my_spark():
    global sc
    sc.stop()
    del sc

# Register exit
atexit.register(stop_my_spark)

# Configure and start Spark ... but only once.
if 'sc' not in globals():
    conf = SparkConf()
    conf.setAppName('MyFirstSpark')  ## you may want to change this
    conf.setMaster('local[2]')
    sc = SparkContext(conf=conf)

print("Launched Spark version %s with ID %s" % (sc.version, sc.applicationId))
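A trivial check (not in the original snippet) that the lazily created SparkContext is usable:

# Hypothetical smoke test: run a small job against the new context.
print(sc.parallelize(range(10)).sum())  # expected output: 45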
import pyspark
from pyspark.context import SparkContext, SparkConf
from pyspark.sql import SQLContext  # , HiveContext
# from pyspark.storagelevel import StorageLevel
import atexit
from pyspark_cassandra import CassandraSparkContext
from datetime import tzinfo, timedelta, datetime
from pytz import timezone

conf = SparkConf()
# conf.setMaster("local")
conf.setAppName("My app")
conf.set("spark.cassandra.connection.host", "10.0.40.42")
sc = CassandraSparkContext(conf=conf)
atexit.register(lambda: sc.stop())

rdd = sc.cassandraTable("el_test", "cockpit2_testIndexes")

# for( d in range 2015-10-01 ~ 2015-10-10 ) do:
#
#   SELECT url,date,site,cnts,cnt from cockpit2_allTogether
#   where `date` = d and site = giga and tags contains resort:android
#
#   after this query, every row has to be updated with a new value for cnts:
#
#   UPDATE cnts = ga_videoPlays + sda_downloads + fb_socialFacebookLikes
#               + fb_socialFacebookShares + fb_socialFacebookComments
#               + tw_socialTwitterShares + ga_socialGooglePlusShares
#               + gigya_socialComments


def filterDateRage(_from, _to, col):
    loc = timezone('Europe/Berlin')
def initialize():
    global sc, spark, items, inputfile, t, m, gidDict, bids, hashedList, n, b, r, candidateTuple, listvala, listvalb
    print("Initializing...")
    t = time.time()
    sc_conf = SparkConf()
    sc_conf.setAppName("Task1")
    sc_conf.setMaster('local[*]')
    # sc_conf.set("spark.driver.bindAddress", "127.0.0.1")
    sc = SparkContext(conf=sc_conf)
    sc.setLogLevel("ERROR")

    # columns are user_id, business_id, stars
    csvread = sc.textFile(inputfile)
    columnName = csvread.first().split(',')
    items = csvread.map(lambda line: line.split(",")).filter(
        lambda line: line != columnName)

    # Assign every user a row index for the characteristic matrix
    userids = list(set(sorted(items.keys().collect())))
    k = 0
    for user in userids:
        if user not in gidDict:
            gidDict[user] = k
            k = k + 1

    bids = list(set(sorted(items.values().collect())))
    m = len(userids)
    listvala = random.sample(range(1, m), n)
    listvalb = random.sample(range(1, m), n)

    # (business_id, [user_ids]) baskets, plus their MinHash signatures
    bid_uid = items.map(lambda x: ((x[1], x[0]), 1)).reduceByKey(
        lambda x, y: x + y).map(lambda x: x[0]).groupByKey().mapValues(list)
    bid_uid_hashed = bid_uid.map(lambda x: initialHash(x))

    dict_uniques = {}
    for each in bid_uid.collect():
        dict_uniques[each[0]] = set(each[1])

    bid_uid_hashed2 = bid_uid_hashed.map(lambda x: hashing(x))

    # Split each signature column into b bands of r rows
    start = 0
    end = r
    hashedListSet = bid_uid_hashed2.collect()
    c = 1
    print("Finding similar pairs...")
    dictionEvery = {}
    while end <= n:
        tempDict = []
        for each in hashedListSet:
            templist = sorted(each[1][start:end])
            tempDict.append((tuple(templist), each[0]))
        dictionEvery[c] = tempDict
        c = c + 1
        start = end
        end = end + r

    # Businesses that share a band hash become candidate pairs
    candidateTuple = []
    print("Working on Bands 1 to " + str(b))
    for i in range(1, b + 1):
        dictionBand = dictionEvery[i]
        mapper = sc.parallelize(dictionBand).groupByKey().mapValues(
            list).filter(lambda x: len(x[1]) > 1)
        justBid = mapper.map(lambda c: c[1]).collect()
        candidateTuple.append(justBid)

    candidatepairs = []
    for each in candidateTuple:
        for e in each:
            candidatepairs.extend(combinations(sorted(e), 2))

    # Verify every candidate pair with the exact Jaccard similarity
    lines = []
    print("Finding final Jaccard Similarity")
    finalPairs = []
    for each in candidatepairs:
        set1 = dict_uniques[each[0]]
        set2 = dict_uniques[each[1]]
        inter = set1 & set2
        jaccard = float(len(inter)) / float(len(set1.union(set2)))
        if jaccard >= 0.5:
            lines.append([each[0], each[1], jaccard])
            finalPairs.append(each)

    answer = writeToFile(lines)
    # calculatingPreRec(lines)
    print("Total Items Printed: " + str(answer))
    print("Duration: " + str(time.time() - t))
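The initialHash and hashing helpers used above are defined elsewhere in the script. As a rough, hypothetical sketch of the MinHash step they typically perform, assuming each record is (business_id, [user_row_indices]) and using the listvala, listvalb, m, and n values initialised above:

def hashing_sketch(record):
    # Hypothetical MinHash signature: one minimum per (a*x + b) % m hash function.
    bid, user_rows = record
    signature = []
    for i in range(n):
        a, b_coeff = listvala[i], listvalb[i]
        signature.append(min((a * u + b_coeff) % m for u in user_rows))
    return (bid, signature)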
def initialize():
    global sc, spark, inputfile, t, items, validationfile, dictUid, dictBid, list_unaccounted, dict_code_uid, dict_code_bid, case
    t = time.time()
    sc_conf = SparkConf()
    sc_conf.setAppName("Task2")
    sc_conf.setMaster('local[*]')
    # sc_conf.set("spark.driver.bindAddress", "127.0.0.1")
    sc = SparkContext(conf=sc_conf)
    sc.setLogLevel("ERROR")

    csvread = sc.textFile(inputfile)
    columnName = csvread.first().split(',')
    items = csvread.map(lambda line: line.split(",")).filter(
        lambda line: line != columnName)

    # ------------Reading evaluation data-----------
    csvread2 = sc.textFile(validationfile)
    columnName2 = csvread2.first().split(',')
    validationData = csvread2.map(lambda line: line.split(",")).filter(
        lambda line: line != columnName2)

    # calling case 3:
    if case == 3:
        implement_case3(items, validationData)
        print("Duration: " + str(time.time() - t))
        return

    # calling case 2:
    if case == 2:
        implement_case2(items, validationData)
        print("Duration: " + str(time.time() - t))
        return

    # ------------Preprocessing data for training the model-----------
    if case == 1:
        bid_uid = items.map(lambda u: (u[0], u[1]))
        keys = list(set(bid_uid.keys().collect()))
        values = list(set(bid_uid.values().collect()))
        dictUid = dict(zip(keys, range(0, len(keys))))
        dictBid = dict(zip(values, range(0, len(values))))
        for k, v in dictUid.items():
            dict_code_uid[v] = k
        for k, v in dictBid.items():
            dict_code_bid[v] = k

        ratings = items.map(lambda l: Rating(int(dictUid[l[0]]),
                                             int(dictBid[l[1]]),
                                             float(l[2])))

        # Training the model on train data
        rank = 2
        lambd = 0.5
        numIterations = 10
        model = ALS.train(ratings, rank, numIterations, lambd)

        print("Total entries in validation data: " + str(validationData.count()))

        # Map validation users/businesses to their integer codes; pairs whose
        # user or business never appears in training are flagged 0
        test_on_validation = validationData.map(lambda p: mapData(p))
        validationRating = test_on_validation.filter(
            lambda p: p[0] == 1).map(lambda r: (r[1][0], r[1][1], r[1][2]))
        accountedPairs = test_on_validation.filter(lambda p: p[0] == 1).map(
            lambda r: (r[1][0], r[1][1]))
        UnaccountedPairs = test_on_validation.filter(lambda p: p[0] == 0).map(
            lambda r: ((r[1][0], r[1][1]), 2.75))

        # ----------------------Evaluate the model on training data----------------------
        # testdata = ratings.map(lambda p: (p[0], p[1]))
        # predictions = model.predictAll(testdata).map(
        #     lambda r: ((r[0], r[1]), r[2]))
        # ratesAndPreds = ratings.map(lambda r: (
        #     (r[0], r[1]), r[2])).join(predictions)
        # MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
        # print("Mean Squared Error = " + str(MSE))

        # ----------------------Evaluate the model on testing data----------------------
        predictions = model.predictAll(accountedPairs).map(
            lambda r: ((r[0], r[1]), r[2]))
        # Pairs unseen in training fall back to the default rating of 2.75
        finalpred = predictions.union(UnaccountedPairs)

        ratesAndPreds = validationRating.map(
            lambda r: ((r[0], r[1]), r[2])).join(finalpred)
        MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1]) ** 2).mean()
        writeToFile(finalpred)
        rmse = math.sqrt(MSE)
        print("Root Mean Squared Error = " + str(rmse))
        print("Duration: " + str(time.time() - t))
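The mapData helper applied to the validation rows is defined elsewhere in this script. A hypothetical sketch of its shape, assuming a validation row is [user_id, business_id, stars] and dictUid/dictBid are the training-time code maps built above:

def mapData_sketch(p):
    # Flag 1 when both the user and the business were seen during training,
    # returning their integer codes; flag 0 otherwise so the pair later
    # receives the default 2.75 rating instead of an ALS prediction.
    if p[0] in dictUid and p[1] in dictBid:
        return (1, (dictUid[p[0]], dictBid[p[1]], float(p[2])))
    return (0, (p[0], p[1], 0.0))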
#
# This configuration works for Spark on an HDP cluster submitted through YARN
#
import os, sys

# set OS environment variable
os.environ["SPARK_HOME"] = '/usr/hdp/2.4.2.0-258/spark'
# add Spark library to Python
sys.path.insert(0, os.path.join(os.environ["SPARK_HOME"], 'python'))

# import package
import pyspark
from pyspark.context import SparkContext, SparkConf
import atexit


def stop_my_spark():
    global sc
    sc.stop()
    del sc

# Register exit
atexit.register(stop_my_spark)

# Configure and start Spark ... but only once.
if 'sc' not in globals():
    conf = SparkConf()
    conf.setAppName('MyFirstSpark')  ## you may want to change this
    conf.setMaster('yarn-client')
    sc = SparkContext(conf=conf)

print("Launched Spark version %s with ID %s" % (sc.version, sc.applicationId))
print("http://arc.insight.gsu.edu:8088/cluster/app/%s" % (sc.applicationId))