def initialize():
    global sc, spark, items1, items2, inputfile1, inputfile2
    print("Initializing...")
    sc_conf = SparkConf()
    sc_conf.setAppName("Task1")
    sc_conf.setMaster('local[*]')
    sc_conf.set("spark.driver.bindAddress", "127.0.0.1")
    sc = SparkContext(conf=sc_conf)
    sc.setLogLevel("ERROR")
    jsonread1 = sc.textFile(inputfile1)
    items1 = jsonread1.map(json.loads)
    jsonread2 = sc.textFile(inputfile2)
    items2 = jsonread2.map(json.loads)
def main(args):
    sc = SparkContext(appName="PGM")
    graph1 = sc.textFile(args.IN[0]).map(line_to_edge)
    graph2 = sc.textFile(args.IN[1]).map(line_to_edge)
    graph_name = args.IN[0]
    seed_num = args.sn
    PARTS = args.PARTS
    G1 = deep_copy(graph1, PARTS)
    G2 = deep_copy(graph2, PARTS)
    IsBucket = ""
    matchtype = ""
    if args.inseeds:
        seeds = sc.textFile(args.inseeds).map(line_to_edge)
        matchtype = "_seeded_"
        ETA = 0
    else:
        matchtype = "_seedless_"
        start = time()
        seeds = dinoise.seed_generator(sc, G1, G2, seed_num, PARTS)
        stop = time()
        ETA = round(float(stop - start) / 60, 4)
    stats = evaluate_output(graph_name + matchtype + IsBucket, G1, G2, seeds,
                            "seeds_log.csv", ETA, PARTS)
    if not args.bucketing:
        start = time()
        res = dinoise.distributed_noisy_seeds(sc, G1, G2, seeds, PARTS)
        stop = time()
    else:
        start = time()
        res = dinoise_w_bucketing.distributed_noisy_seeds(sc, G1, G2, seeds, PARTS)
        IsBucket = "_bucket_"
        stop = time()
    ETB = round(float(stop - start) / 60, 4)
    stats = evaluate_output(graph_name + matchtype + IsBucket, G1, G2, res,
                            "results_log.csv", ETB, PARTS)
    sc.stop()
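# NOTE: line_to_edge is used above but not shown in this snippet. A minimal sketch of such a
# helper, assuming every input line holds two whitespace-separated node ids, might be:
def line_to_edge(line):
    # hypothetical parser: "12 34" -> (12, 34)
    u, v = line.split()[:2]
    return (int(u), int(v))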
def set_tweets_data(self, path):
    sc = SparkContext.getOrCreate()
    # Read the data and split each line into columns.
    self.tweets = sc.textFile(path).map(
        lambda line: line[1:-1].split('","'))
    # Keep only the selected columns.
    self.tweets = self.tweets.map(lambda row: (row[1], row[10], row[24]))
class InvertedDictionary:
    def __init__(self, fileConfig):
        self.sc = SparkContext()
        self.sqlContext = HiveContext(self.sc)
        self.invertedDict = {}
        self.conn = boto.connect_s3(host='s3.amazonaws.com')
        self.bucket = fileConfig['s3_bucket']
        self.folder = fileConfig['s3_folder']

    def buildInvertedDict(self):
        bucket = self.conn.get_bucket(self.bucket)
        for idx, file in enumerate(bucket.list(self.folder, '/')):
            if idx == 0:
                continue
            fileId = file.key.split('/')[1]
            rdd = self.sc.textFile('s3a://{}/{}/{}'.format(
                self.bucket, self.folder, fileId))
            output = rdd.flatMap(lambda words: [re.sub(r'[^a-z\']', '', word)
                                                for word in words.lower().split()]) \
                        .map(lambda word: (word, fileId))
            for row in output.collect():
                word, docId = str(row[0]), int(row[1])
                if word not in self.invertedDict:
                    self.invertedDict[word] = [docId]
                elif docId not in self.invertedDict[word]:
                    self.invertedDict[word].append(docId)
def initialize():
    global sc, spark, items, inputfile, buckets_user, buckets_business, partition, totalSize, t, mainThreshold
    print("Initializing...")
    t = time.time()
    candidateList = []
    frequentList = []
    sc_conf = SparkConf()
    sc_conf.setAppName("Task1")
    sc_conf.setMaster('local[*]')
    sc_conf.set("spark.driver.bindAddress", "127.0.0.1")
    sc = SparkContext(conf=sc_conf)
    sc.setLogLevel("ERROR")
    csvread = sc.textFile(inputfile)
    columnName = csvread.first().split(',')
    items = csvread.map(lambda line: line.split(",")).filter(
        lambda line: (line) != columnName)
    buckets_user = items.groupByKey().mapValues(list).filter(
        lambda x: len(x[1]) > mainThreshold).mapPartitionsWithIndex(
        removeDuplicateEntriesAfter)
    print("Without Duplicates Done..")
    # withoutDuplicates = checkM.mapPartitionsWithIndex(
    #     removeDuplicateEntries).groupByKey().mapValues(list)
    if (case == 1):
        # buckets_user = withoutDuplicates.mapPartitionsWithIndex(
        #     createBuckets).groupByKey().mapValues(list).filter(lambda x: len(x[1]) > mainThreshold)
        callSonPhase1(buckets_user)
        print("Initializing Phase 2.....")
        finalFreq = buckets_user.mapPartitionsWithIndex(
            lambda partition_index, iter_row: phase2(partition_index, iter_row)
        ).reduceByKey(lambda x, y: x + y).filter(
            lambda x: x[1] >= threshold).map(lambda x: makeList(x[0]))
        # print((finalFreq.collect()))
        finalOutput = (finalFreq.collect())
        x = sorted(finalOutput, key=lambda item: (len(list(item)), list(item)))
        # print(x)
        printingFreq(x)
    if (case == 2):
        buckets_business = withoutDuplicates.mapPartitionsWithIndex(
            createBuckets_case2).groupByKey().mapValues(list)
        callSonPhase1(buckets_business)
        print("Initializing Phase 2.....")
        finalFreq = buckets_business.mapPartitionsWithIndex(
            lambda partition_index, iter_row: phase2(partition_index, iter_row)
        ).reduceByKey(lambda x, y: x + y).filter(
            lambda x: x[1] >= threshold).map(lambda x: makeList(x[0]))
        # print((finalFreq.collect()))
        finalOutput = (finalFreq.collect())
        x = sorted(finalOutput, key=lambda item: (len(list(item)), list(item)))
        # print(x)
        printingFreq(x)
def main():
    parser = argparse.ArgumentParser(description="Find Dependency inclusions")
    parser.add_argument('--path', type=str)
    parser.add_argument('--cores', type=str)
    args = parser.parse_args()
    sc = SparkContext(appName="DDM")
    # Note: setting these on sc.getConf() after the context exists has no effect;
    # they would have to be set on a SparkConf passed to the SparkContext constructor.
    sc.getConf().set("spark.executor.cores", args.cores)
    sc.getConf().set("spark.driver.cores", args.cores)
    sc.getConf().set("spark.worker.cores", args.cores)
    sc.getConf().set("spark.deploy.defaultCores", args.cores)
    sc.getConf().set("spark.driver.memory", "15g")
    global number_of_columns
    number_of_columns = 0  # running offset of columns across all input files
    data = []
    file_headers = []
    for file in os.listdir(args.path):
        if file.endswith(".csv"):
            rdd = sc.textFile(os.path.join(args.path, file)).map(
                lambda line: line[1:-1].split("\";\""))
            file_data = rdd.collect()
            file_header = file_data[0]
            del file_data[0]
            file_data = [(number_of_columns, x) for x in file_data]
            data += file_data
            file_headers += file_header
            number_of_columns = number_of_columns + len(file_header)
    header_dummies = list(range(0, number_of_columns))
    rdd = sc.parallelize(data)
    values_as_key = rdd.flatMap(lambda el: list(zip(el[1], range(el[0], el[0] + len(el[1])))))
    unique_values = values_as_key.map(lambda x: (x[0], x[1])).groupByKey().mapValues(set)
    unique_values = unique_values.map(lambda x: (tuple(x[1]), 0)).reduceByKey(sum_func)
    matrix_per_key = unique_values.map(lambda x: make_candidate_matrix(x[0]))
    result_matrix = matrix_per_key.reduce(lambda x, y: matrix_and(x, y))
    assert len(result_matrix) == number_of_columns
    output = []
    for i in range(0, number_of_columns):
        assert len(result_matrix[i]) == number_of_columns
        output.append([])
    for i in range(0, len(result_matrix)):
        for j in range(0, len(result_matrix[i])):
            if i != j and result_matrix[i][j]:
                output[j].append(file_headers[i])
    for i in range(0, len(output)):
        row = output[i]
        if len(row) != 0:
            output_string = str(row[0])
            for j in range(1, len(row)):
                output_string += (", " + str(row[j]))
            print(str(file_headers[i]) + " < " + output_string)
    sc.stop()
def main():
    sc = SparkContext(appName='TextSimillarity')
    sqlcont = SQLContext(sc)
    rdd = sc.textFile("test.csv")
    header = rdd.first()
    newrdd = rdd.filter(lambda x: x != header) \
        .map(lambda x: x.split(',')) \
        .map(lambda x: Row(description_x=x[1], description_y=x[2]))
    new_df = sqlcont.createDataFrame(newrdd)
    calculate_simillarity(new_df)
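# NOTE: calculate_simillarity is not shown above. A hedged sketch of what such a function
# might do, assuming a simple per-row token-overlap (Jaccard) score is acceptable:
def calculate_simillarity(df):
    def jaccard(a, b):
        # hypothetical scorer: overlap of lower-cased word sets
        sa, sb = set(a.lower().split()), set(b.lower().split())
        return len(sa & sb) / float(len(sa | sb)) if (sa or sb) else 0.0

    scored = df.rdd.map(lambda row: (row.description_x, row.description_y,
                                     jaccard(row.description_x, row.description_y)))
    for x, y, score in scored.collect():
        print(x, y, score)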
def initialize():
    global items, inputfile, sc, filterThreshold, t, totalEdges, cost_dict, strict_totalNodes, adjacency_listMain
    t = time.time()
    sc_conf = SparkConf()
    sc_conf.setAppName("Task1")
    sc_conf.setMaster('local[*]')
    # sc_conf.set("spark.driver.bindAddress", "127.0.0.1")
    sc = SparkContext(conf=sc_conf)
    sc.setLogLevel("ERROR")
    csvread = sc.textFile(inputfile)
    columnName = csvread.first().split(',')
    # print(columnName)
    items = csvread.map(lambda line: line.split(",")).filter(
        lambda line: (line) != columnName)
    # Getting user and their business count
    user_business = items.groupByKey().mapValues(set).collect()
    tuple_edge_list = []
    for i in range(0, len(user_business) - 1):
        for j in range(i + 1, len(user_business)):
            inter = user_business[i][1] & user_business[j][1]
            if len(inter) >= filterThreshold:
                tuple_edge_list.append(
                    (str(user_business[i][0]), str(user_business[j][0])))
                tuple_edge_list.append(
                    (str(user_business[j][0]), str(user_business[i][0])))
    totalEdges = float(len(tuple_edge_list) / 2)
    adjacency_list = sc.parallelize(tuple_edge_list).groupByKey().mapValues(
        list).collectAsMap()
    adjacency_listMain = copy.deepcopy(adjacency_list)
    totalNodes = list(adjacency_list.keys())
    # ------------------------Newly added line------------------------
    strict_totalNodes = copy.deepcopy(totalNodes)
    # print(len(totalNodes))
    # ----------------------Part 1---------------------
    bfs(totalNodes, adjacency_list)
    print("Writing Betweenness to File....")
    # Converting into sorted List Initial Betweenness
    list_val = list(cost_dict.items())
    list_val.sort(key=lambda x: (-x[1], x[0]))
    writeToFile(list_val)
    totalNodes = copy.deepcopy(strict_totalNodes)
    # print(len(totalNodes))
    # ----------------------Part 2----------------------
    print("Creating Partitions....")
    create_components(list_val, adjacency_listMain, totalNodes, totalEdges)
    # ---------------------EoC---------------------------
    print("Duration: " + str(time.time() - t))
def main(args):
    sc = SparkContext(appName="PGM")
    graph1 = sc.textFile(args.IN[0]).map(line_to_edge)
    graph2 = sc.textFile(args.IN[1]).map(line_to_edge)
    graph_name = args.IN[0]
    seed_num = args.sn
    PARTS = args.PARTS
    G1 = deep_copy(graph1, PARTS)
    G2 = deep_copy(graph2, PARTS)
    if args.inseeds:
        seeds = sc.textFile(args.inseeds).map(line_to_edge)
        matchtype = "seeded"
        ETA = 0
    else:
        matchtype = "seedless"
        start = time()
        seeds = seed_generator(sc, G1, G2, seed_num, PARTS)
        stop = time()
        ETA = round(float(stop - start) / 60, 4)
    seeds2 = seeds.map(lambda pair: str(pair[0]) + " " + str(pair[1]))
    seeds2.coalesce(1).saveAsTextFile(args.OUT + "bucketing_segen_seeds")
    start = time()
    res = distributed_noisy_seeds(sc, G1, G2, seeds, PARTS)
    stop = time()
    ETB = round(float(stop - start) / 60, 4)
    res2 = res.map(lambda pair: str(pair[0]) + " " + str(pair[1]))
    res2.coalesce(1).saveAsTextFile(args.OUT + matchtype + "_bucketing_matching")
    print("\nSeGen time :" + str(ETA) + " min ")
    print("DiNoiSe time :" + str(ETB) + " min\n")
    # stop the context before returning
    sc.stop()
    return [ETA, ETB]
def load():
    """
    This function should prepare the local configuration and initiate a SparkContext.
    Then a file must be read and the String RDD must be transformed into an Article RDD.
    :return: an RDD of Articles
    :see: Hint: use article.parseLine for the transformation
    """
    logger.info("Prepare spark context and load data")
    conf = SparkConf().setMaster('local').setAppName("Ranking App") \
        .set("spark.executor.memory", "3g").set("spark.driver.memory", "3g") \
        .set("spark.python.worker.memory", "3g") \
        .set("spark.driver.maxResultSize", 0)
    sc = SparkContext(conf=conf)
    return sc.textFile("data/wikipedia.dat").map(lambda x: parse_line(x))
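# NOTE: parse_line is not defined in this snippet. A minimal sketch, assuming each line of
# wikipedia.dat is "<title> <space-separated article text>" (both the Article type and the
# field layout are hypothetical):
from collections import namedtuple

Article = namedtuple("Article", ["title", "text"])

def parse_line(line):
    # hypothetical parser: first token is the title, the remainder is the article body
    title, _, text = line.partition(" ")
    return Article(title, text)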
def select(self, col_names=None, filepath=None, sampler=None, sampling_rate=1,
           task="classification"):
    sc = SparkContext(conf=SparkConf().setAppName("select"))
    # Column selection
    if col_names is None:
        print("============No Column Names Specified============")
        data_rdd = sc.textFile(filepath).map(self.parse_xy)
    else:
        header_rdd = sc.textFile(filepath + "/header").map(self.parse_header)
        header_list = sum(header_rdd.collect(), [])
        # print("header list: ", header_list)
        columns_ind = [header_list.index(col) for col in col_names]  # [Y, X1, X2, ...]
        X_col_ind = columns_ind[1:]
        Y_col_ind = columns_ind[0]
        # print("columns_index: ", columns_ind)
        # TODO: pass the column indices to the map function more cleanly / factor this out
        data_rdd = sc.textFile(filepath + "/data").map(lambda vec: (
            [[float(x) for x in vec.split(',')][i] for i in X_col_ind],
            [float(x) for x in vec.split(',')][Y_col_ind]))
    if sampler is None:
        print("============No Sampling Method Specified============")
    elif sampler == "random":
        data_rdd = data_rdd.sample(False, sampling_rate)  # note: the sample size is only approximate
    print("data: ", data_rdd.collect()[0])
    print("len(data): ", len(data_rdd.collect()))
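# NOTE: self.parse_xy and self.parse_header are not shown above. Hedged sketches (written here
# as plain functions), assuming comma-separated data rows laid out as "Y,X1,X2,..." and a
# comma-separated header file:
def parse_header(line):
    # hypothetical: "Y,X1,X2" -> ["Y", "X1", "X2"]
    return line.strip().split(',')

def parse_xy(line):
    # hypothetical: "Y,X1,X2,..." -> ([X1, X2, ...], Y) with float-valued fields
    vals = [float(x) for x in line.split(',')]
    return (vals[1:], vals[0])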
def get_top_3():
    """
    For each site, report the top three uids by visit count, together with their counts.
    :return:
    """
    spark = SparkContext("local", "get_top3")
    data = spark.textFile("data.txt")
    data.map(lambda line: (line.split(" ")[2], line.split(" ")[3])) \
        .groupByKey() \
        .flatMap(lambda info: get_site_uid_cnt(info)) \
        .groupByKey() \
        .map(lambda info: get_top3_uid_and_cnt(info)) \
        .foreach(print)
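# NOTE: the two helpers used above are not shown. Hedged sketches of what they might look
# like, assuming the first groupByKey yields (uid, [site, ...]) and the second yields
# (site, [(uid, visits), ...]) -- both shapes are assumptions, not taken from the source:
def get_site_uid_cnt(info):
    # hypothetical: count this uid's visits per site and emit (site, (uid, visits)) pairs
    uid, sites = info
    counts = {}
    for site in sites:
        counts[site] = counts.get(site, 0) + 1
    return [(site, (uid, cnt)) for site, cnt in counts.items()]

def get_top3_uid_and_cnt(info):
    # hypothetical: keep the three most frequent uids for the site
    site, uid_cnts = info
    return (site, sorted(uid_cnts, key=lambda uc: -uc[1])[:3])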
def run():
    sc = SparkContext("local", "WordCount")  # initialize the context
    filename_hdfs = "/sparklearn/data/小王子.txt"
    filename_linux = "file:///media/sl/D/java/idea/a2019/sparklearn/data/小王子.txt"
    filename_win = "D:\\java\\idea\\a2019\\sparklearn\\data\\小王子.txt"
    data = sc.textFile(filename_linux)  # the input file is UTF-8 encoded
    stopwords_linux = "/media/sl/D/java/idea/a2019/sparklearn/data/stopwords-master/百度停用词表.txt"
    stopwords_win = r'D:\java\idea\a2019\sparklearn\data\stopwords-master\百度停用词表.txt'
    with open(stopwords_linux, 'r', encoding='utf-8') as f:
        x = f.readlines()
    stop = [i.replace('\n', '') for i in x]
    stop.extend([
        ',', '的', '我', '他', '', '。', ' ', '\n', '?', ';', ':', '-', '(', ')',
        '!', '1909', '1920', '325', 'B612', 'II', 'III', 'IV', 'V', 'VI', '—',
        '‘', '’', '“', '”', '…', '、'
    ])  # also stop punctuation and a few tokens specific to this text
    data = data.flatMap(lambda line: jieba.cut(line, cut_all=False)) \
        .filter(lambda w: w not in stop) \
        .map(lambda w: (w, 1)) \
        .reduceByKey(lambda w0, w1: w0 + w1) \
        .sortBy(lambda x: x[1], ascending=False)
    print(data.take(100))
class MainApp(object):
    def __init__(self):
        pass

    def init(self):
        os.environ["SPARK_HOME"] = "/Users/abhinavrungta/Desktop/setups/spark-1.5.2"
        # os.environ['AWS_ACCESS_KEY_ID'] = <YOURKEY>
        # os.environ['AWS_SECRET_ACCESS_KEY'] = <YOURKEY>
        conf = SparkConf()
        conf.setMaster("local[10]")
        conf.setAppName("PySparkShell")
        conf.set("spark.executor.memory", "2g")
        conf.set("spark.driver.memory", "1g")
        self.sc = SparkContext(conf=conf)
        self.sqlContext = SQLContext(self.sc)

    def loadData(self):
        category_list = self.sc.textFile(
            "/Users/abhinavrungta/Desktop/uf-study/snc/github/SNC-WEB/src/yahoo/ydata-ymovies-user-movie-ratings-train-v1_0.txt"
        ).map(lambda line: (int(line.split(',')[0]), int(line.split(',')[1]),
                            float(line.split(',')[2]), long(line.split(',')[3])))
        category_schema = StructType([
            StructField("userid", IntegerType(), True),
            StructField("movieid", IntegerType(), True),
            StructField("rating", FloatType(), True),
            StructField("time", LongType(), True)
        ])
        category_list = self.sqlContext.createDataFrame(category_list, category_schema)
        category_list.registerTempTable("data")
        movie_list = self.sqlContext.sql(
            "SELECT movieid, COUNT(movieid) AS ct FROM data GROUP BY movieid")
        movie_list.registerTempTable("movie")
        movieid = movie_list.sort(movie_list.ct.desc()).first().movieid
        # movieid = category_list.first().movieid
        category_list = self.sqlContext.sql(
            "SELECT * FROM data WHERE movieid = {0}".format(movieid))
        category_list.registerTempTable("data")
        user_list = self.sqlContext.sql("SELECT DISTINCT userid FROM data LIMIT 50")
        print(user_list.count())
        user_list.show()
        user_list.registerTempTable("users")
        category_list = self.sqlContext.sql(
            "SELECT d.userid AS userid, d.movieid AS movieid, d.rating AS rating, d.time AS time "
            "FROM data d, users u WHERE d.userid = u.userid").repartition(1)
        # category_list = self.sqlContext.createDataFrame(category_list, category_schema)
        category_list = category_list.map(lambda line: str(line.userid) + "," + str(line.movieid)
                                          + "," + str(line.rating) + "," + str(line.time))
        category_list = category_list.repartition(1)
        category_list.saveAsTextFile("data.txt")
def main():
    from pyspark.context import SparkContext
    from operator import add

    # Clean up previous results that can cause failure
    if os.path.isdir(os.path.join(os.getcwd(), "wc_out")):
        shutil.rmtree(os.path.join(os.getcwd(), "wc_out"))
    # print(os.environ.get("SPARK_HOME"))
    sc = SparkContext(appName="HelloWorld")
    f = sc.textFile("test_word_count.txt")
    wc = f.flatMap(lambda x: x.split(' ')).map(lambda x: (x, 1)).reduceByKey(add)
    wc.saveAsTextFile("wc_out")
    # Check if results files exist and raise error if they do not.
    if os.path.isdir(os.path.join(os.getcwd(), "wc_out")):
        print("Test succeeded, results files are present for word count.")
    else:
        # print("Test failed, no results files were found for word count.")
        raise ValueError("test_word_count.py has Failed.")
import sys

import pyspark
from pyspark.context import SparkContext
from pyspark import SparkConf

# Parse the input parameters
input_file_name = sys.argv[1]
number_k_for_topK = int(sys.argv[2])
print(input_file_name)
print(str(number_k_for_topK))

# Prepare the Spark context
conf = SparkConf().setMaster("local") \
    .setAppName("Word Count Spark") \
    .set("spark.executor.memory", "4g") \
    .set("spark.executor.instances", 1)
sc = SparkContext(conf=conf)
book = sc.textFile(input_file_name)

# WordCount
words_counted = book.flatMap(lambda line: line.split(" ")) \
    .map(lambda word: (word, 1)) \
    .reduceByKey(lambda x, y: x + y)
print(words_counted)

# Output the top-K most frequent words
topk = words_counted.sortBy(lambda keyvalue: -keyvalue[1]).take(number_k_for_topK)
print(topk)
    out_s += "\n"
    output_mark += 1
    out_s = out_s.strip("\n")
    return out_s


if __name__ == "__main__":
    time1 = time.time()
    sc = SparkContext('local[*]', 'inf553_hw2_2')  # initiate
    sc.setLogLevel("OFF")
    T = int(sys.argv[1])
    S = int(sys.argv[2])
    input_file = sc.textFile(sys.argv[3])  # read file
    output_file = sys.argv[4]
    data = input_file.map(lambda x: x.split(',')).filter(
        lambda x: x[0] != "user_id")
    # create baskets
    baskets = data.groupByKey().map(lambda x: (x[0], list(x[1]))).filter(
        lambda x: len(x[1]) > T).persist(StorageLevel(True, True, False, False))
    # baskets = baskets.coalesce(4, True).persist(StorageLevel(True, True, False, False))
    N = baskets.count()
    # Pass 1
    # Pass 1 Map
from __future__ import print_function

import sys
from operator import add

from pyspark import SparkConf
from pyspark.context import SparkContext
from pyspark.sql import SparkSession

if __name__ == "__main__":
    config = SparkConf().setAppName("wordCount").setMaster("local")
    sc = SparkContext(conf=config)
    lines = sc.textFile("./src/main/python/wordCount/hello.txt")
    words = lines.flatMap(lambda line: line.split(" "))
    wordCountMap = words.map(lambda word: (word, 1))
    # count = wordCountMap.reduceByKey(lambda preCount, count: preCount + count)
    # output = count.collect()
    # print(output)
    # use countByKey instead
    count = wordCountMap.countByKey()
    print(count)
                        default="mnist_model")
    parser.add_argument("--export_dir", help="path to export saved_model",
                        default="mnist_export")
    parser.add_argument("--tensorboard", help="launch tensorboard process",
                        action="store_true")
    args = parser.parse_args()
    print("args:", args)

    # create RDD of input data
    def parse(ln):
        vec = [int(x) for x in ln.split(',')]
        return (vec[1:], vec[0])

    images_labels = sc.textFile(args.images_labels).map(parse)

    cluster = TFCluster.run(sc, main_fun, args, args.cluster_size, num_ps=0,
                            tensorboard=args.tensorboard,
                            input_mode=TFCluster.InputMode.SPARK,
                            master_node='chief')
    # Note: need to feed extra data to ensure that each worker receives sufficient data to
    # complete epochs, to compensate for variability in partition sizes and spark scheduling
    cluster.train(images_labels, args.epochs)
    cluster.shutdown()
from pyspark.context import SparkContext
from pyspark.sql import SQLContext, SparkSession
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.sql.types import DoubleType
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Setup
# spark = SparkSession.builder.master("spark://master:7077").appName("adult").config("spark.some.config.option", "akki").getOrCreate().enableHiveSupport()
sc = SparkContext()
sqlContext = SQLContext(sc)

# Load and parse the data file into an RDD of LabeledPoint.
data = sc.textFile('C:/Users/akshaykumar.kore/Downloads/data/adults.csv').map(
    lambda line: line.split(","))
# data = sc.read.csv("C:/Users/akshaykumar.kore/Downloads/data/adult1.csv", header=True, mode="DROPMALFORMED")
data = data.toDF()
data = data.na.fill(0)

categoricalColumns = ["_2", "_4", "_6", "_7", "_8", "_9", "_10", "_14"]
stages = []  # stages in our Pipeline
for categoricalCol in categoricalColumns:
    # Category Indexing with StringIndexer
    stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol + "Index")
    # Use OneHotEncoder to convert categorical variables into binary SparseVectors
    encoder = OneHotEncoder(inputCol=categoricalCol + "Index", outputCol=categoricalCol + "classVec")
with zipfile.ZipFile(small_dataset_path, "r") as z:
    z.extractall(datasets_path)

if not os.path.exists(complete_dataset_path):
    with zipfile.ZipFile(complete_dataset_path, "r") as z:
        z.extractall(datasets_path)

small_ratings_file = os.path.join(datasets_path, 'ml-latest-small', 'ratings.csv')

# initiate a SparkContext
from pyspark.context import SparkContext
sc = SparkContext('local', 'movie-recommender-engine')

# give small_ratings_file as input to the sc
small_ratings_raw_data = sc.textFile(small_ratings_file)
small_ratings_raw_data_header = small_ratings_raw_data.take(1)[0]

# Parse the raw data into a new RDD - Ratings.
small_ratings_data = small_ratings_raw_data.filter(lambda line: line != small_ratings_raw_data_header) \
    .map(lambda line: line.split(",")).map(lambda tokens: (tokens[0], tokens[1], tokens[2])).cache()

small_movies_file = os.path.join(datasets_path, 'ml-latest-small', 'movies.csv')
small_movies_raw_data = sc.textFile(small_movies_file)
small_movies_raw_data_header = small_movies_raw_data.take(1)[0]

# Parse the raw data into a new RDD - Movies.
small_movies_data = small_movies_raw_data.filter(lambda line: line != small_movies_raw_data_header) \
    .map(lambda line: line.split(",")).map(lambda tokens: (tokens[0], tokens[1])).cache()
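# NOTE (illustrative only): a parsed ratings RDD like the one above is typically fed to a
# collaborative-filtering trainer. A minimal sketch with MLlib's ALS, using placeholder
# hyper-parameters, might look like:
from pyspark.mllib.recommendation import ALS, Rating

ratings = small_ratings_data.map(lambda t: Rating(int(t[0]), int(t[1]), float(t[2])))
model = ALS.train(ratings, rank=8, iterations=10, lambda_=0.1)  # rank/iterations/lambda_ are placeholders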
import sys
import time
import math

import utils
from pyspark.context import SparkContext

if (len(sys.argv) > 1):
    hdfs_file_path = "/user/lsde02/data/%s/*.gz" % sys.argv[1]
else:
    hdfs_file_path = "/user/lsde02/data/1901/*.gz"
hdfs_results_path = "/user/lsde02/results/"
start_time = time.strftime("%Y-%m-%d-%H-%M-%S")

sc = SparkContext()
context = sc.textFile(hdfs_file_path)
stations = context.flatMap(lambda x: [utils.extract(record) for record in x.splitlines()])
stations = stations.filter(lambda x: 'longitude' in x[1] and 'latitude' in x[1])
stations.persist()

# Do computations on month level
month_data = stations.map(lambda x: ((x[0][0], x[0][1], x[0][3]),
                                     (x[1]['temp'], x[1]['wind-speed'], x[1]['sky-condition'],
                                      x[1]['visibility'], x[1]['wind-direction'])))
# Accumulate running sums and counts per measurement, plus the sin/cos components of the
# wind direction so it can be averaged with atan2 below.
month_data = month_data.combineByKey(
    lambda value: (value[0], 1, value[1], 1, value[2], 1, value[3], 1,
                   math.sin(value[4] * math.pi / 180.), math.cos(value[4] * math.pi / 180.)),
    lambda x, value: (x[0] + value[0], x[1] + 1, x[2] + value[1], x[3] + 1,
                      x[4] + value[2], x[5] + 1, x[6] + value[3], x[7] + 1,
                      x[8] + math.sin(value[4] * math.pi / 180.),
                      x[9] + math.cos(value[4] * math.pi / 180.)),
    lambda x, y: (x[0] + y[0], x[1] + y[1], x[2] + y[2], x[3] + y[3], x[4] + y[4],
                  x[5] + y[5], x[6] + y[6], x[7] + y[7], x[8] + y[8], x[9] + y[9]))
month_data = month_data.map(lambda (label, (x1, c1, x2, c2, x3, c3, x4, c4, x5a, x5b)):
                            (label, (x1 / c1, x2 / c2, x3 / c3, x4 / c4, math.atan2(x5a, x5b))))
month_data = month_data.coalesce(1, True)
month_data.saveAsTextFile("%s%s-%s" % (hdfs_results_path, start_time, 'all'))
def run(host, database, collection, start_time=None, end_time=None, center=None, degree=None):
    response = tangelo.empty_response()

    # Bail with error if any of the required arguments is missing.
    missing = map(lambda x: x[0], filter(lambda x: x[1] is None,
                                         zip(["start_time", "end_time", "center", "degree"],
                                             [start_time, end_time, center, degree])))
    if len(missing) > 0:
        response["error"] = "missing required arguments: %s" % (", ".join(missing))
        return response

    # Cast the arguments to the right types.
    #
    # The degree is the degree of separation between the center element and the
    # retrieved nodes - an integer.
    try:
        degree = int(degree)
    except ValueError:
        response["error"] = "argument 'degree' must be an integer"
        return response

    # The start time is the number of milliseconds since the epoch (which is how
    # JavaScript dates are constructed, and therefore how dates are stored in
    # MongoDB) - an integer.
    try:
        start_time = datetime.datetime.strptime(start_time, "%Y-%m-%d")
    except ValueError:
        response["error"] = "argument 'start_time' must be in YYYY-MM-DD format"
        return response

    # The end time is another date - an integer.
    try:
        end_time = datetime.datetime.strptime(end_time, "%Y-%m-%d")
    except ValueError:
        response["error"] = "argument 'end_time' must be in YYYY-MM-DD format"
        return response

    # Get a handle to the database collection.
    if SparkContext._active_spark_context is None:
        sc = SparkContext('spark://impaladev.darpa.mil:7077', 'Enron Emailers')
    else:
        sc = SparkContext._active_spark_context

    enronData = sc.textFile('hdfs://localhost:8020/user/bigdata/pgill/enron/email_graph_fixed.txt') \
        .map(lambda line: line.split('\t')).cache()

    def withinTimespan(record):
        recordDate = datetime.datetime.strptime(record[2], "%Y-%m-%d")
        return recordDate >= start_time and recordDate < end_time

    def emptyRecords(record):
        return record[0] != "" and record[1] != ""

    def orderRecord(record):
        if record[1] < record[0]:
            record[0], record[1] = record[1], record[0]
        return record

    enronSpan = enronData.filter(withinTimespan).filter(emptyRecords).map(orderRecord) \
        .map(lambda rec: (rec[0], rec[1])).distinct().cache()

    # Start a set of all interlocutors we're interested in - that includes the
    # center emailer.
    talkers = set([center])

    # Also start a table of distances from the center.
    distance = {center: 0}

    current_talkers = list(talkers)
    all_results = []
    for i in range(degree):
        def emailsInvolved(record):
            return any(keyword in record for keyword in current_talkers)

        results = enronSpan.filter(emailsInvolved).collect()

        # Collect the names.
        current_talkers = list(itertools.chain(*map(lambda x: [x[1], x[0]], results)))
        current_talkers = list(set(current_talkers))
        talkers = talkers.union(current_talkers)

        # Compute updates to everyone's distance from center.
        for t in current_talkers:
            if t not in distance:
                distance[t] = i + 1

        # save the cursor.
        all_results.append(results)

    # Construct a canonical graph structure from the set of talkers and the list
    # of emails.
    #
    # Start with an index map of the talkers.
    talkers = list(talkers)
    talker_index = {name: index for (index, name) in enumerate(talkers)}

    # Create a chained iterable from all the rewound partial results.
    all_results = itertools.chain(*all_results)

    # Create a list of graph edges suitable for use by D3 - replace each record
    # in the data with one that carries an index into the emailers list.
    edges = []
    ident = 0
    for result in all_results:
        source = result[0]
        target = result[1]
        ident += 1
        rec = {
            "source": talker_index[source],
            "target": talker_index[target],
            "id": str(ident)
        }
        edges.append(rec)

    talkers = [{"email": n, "distance": distance[n]} for n in talkers]

    # Stuff the graph data into the response object, and return it.
    response["result"] = {
        "nodes": talkers,
        "edges": edges
    }
    return response
if args.format == "tfr":
    images = sc.newAPIHadoopFile(args.images,
                                 "org.tensorflow.hadoop.io.TFRecordFileInputFormat",
                                 keyClass="org.apache.hadoop.io.BytesWritable",
                                 valueClass="org.apache.hadoop.io.NullWritable")

    def toNumpy(bytestr):
        example = tf.train.Example()
        example.ParseFromString(bytestr)
        features = example.features.feature
        image = numpy.array(features['image'].int64_list.value)
        label = numpy.array(features['label'].int64_list.value)
        return (image, label)

    dataRDD = images.map(lambda x: toNumpy(str(x[0])))
else:
    if args.format == "csv":
        images = sc.textFile(args.images).map(lambda ln: [int(x) for x in ln.split(',')])
        labels = sc.textFile(args.labels).map(lambda ln: [float(x) for x in ln.split(',')])
    else:  # args.format == "pickle":
        images = sc.pickleFile(args.images)
        labels = sc.pickleFile(args.labels)
    print("zipping images and labels")
    dataRDD = images.zip(labels)

cluster = TFCluster.run(sc, mnist_dist.map_fun, args, args.cluster_size, num_ps,
                        args.tensorboard, TFCluster.InputMode.SPARK)
if args.mode == "train":
    cluster.train(dataRDD, args.epochs)
else:
    labelRDD = cluster.inference(dataRDD)
    labelRDD.saveAsTextFile(args.output)
cluster.shutdown()
import os
import sys
from pprint import pprint
from operator import add

import pyspark
from pyspark.context import SparkContext

sc = SparkContext()
file = "SampleData3.txt"
# For each line keep the gene_/disease_ tokens, flatten the per-line token lists,
# then count token frequencies and sort by count descending.
wordcounts = sc.textFile(file) \
    .map(lambda l: ((l.split(" ")[0],
                     len([x for x in l.split(" ")[1:] if ("gene_" in x or "disease_" in x)])),
                    [x for x in l.split(" ")[1:] if ("gene_" in x or "disease_" in x)])) \
    .flatMap(lambda x: x[1]) \
    .map(lambda x: (x, 1)) \
    .reduceByKey(lambda x, y: x + y) \
    .map(lambda x: (x[1], x[0])) \
    .sortByKey(False)
    else:
        try:
            id1 = int(data[0].strip())
            id2 = int(data[1].strip())
            return (id1, id2)
        except:
            return (-1, "error")


filePath = '/home/piyush/datasets/audioscrobbler/'

conf = SparkConf().setAppName("audio_scrobbler")
sc = SparkContext(conf=conf)

# parse raw user artist data
userArtistDataFile = filePath + 'user_artist_data.txt'
rawUserArtistData = sc.textFile(userArtistDataFile)

# parse Artist data file
artistDataFile = filePath + 'artist_data.txt'
rawArtistData = sc.textFile(artistDataFile)
artistById = rawArtistData.map(parseArtistByIdData).filter(lambda kv: kv[0] != -1)

# parse artist alias file
artistAliasDataFile = filePath + 'artist_alias.txt'
rawArtistAliasData = sc.textFile(artistAliasDataFile)
artistAlias = rawArtistAliasData.map(parseArtistAliasData).filter(lambda kv: kv[0] != -1).collectAsMap()

# broadcast variable
bArtistAlias = sc.broadcast(artistAlias)
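# NOTE: parseArtistByIdData (and parseArtistAliasData, whose tail is shown above) are not
# given in full. A hedged sketch, assuming the usual Audioscrobbler "id<TAB>name" layout:
def parseArtistByIdData(line):
    # hypothetical parser: "1234\tSome Artist" -> (1234, "Some Artist"); (-1, "error") on bad rows
    data = line.split('\t')
    try:
        return (int(data[0].strip()), data[1].strip())
    except (IndexError, ValueError):
        return (-1, "error")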
class TestRDDFunctions(PySparkTestCase): def test_failed_sparkcontext_creation(self): # Regression test for SPARK-1550 self.sc.stop() self.assertRaises(Exception, lambda: SparkContext("an-invalid-master-name")) self.sc = SparkContext("local") def test_save_as_textfile_with_unicode(self): # Regression test for SPARK-970 x = u"\u00A1Hola, mundo!" data = self.sc.parallelize([x]) tempFile = tempfile.NamedTemporaryFile(delete=True) tempFile.close() data.saveAsTextFile(tempFile.name) raw_contents = ''.join(input(glob(tempFile.name + "/part-0000*"))) self.assertEqual(x, unicode(raw_contents.strip(), "utf-8")) def test_transforming_cartesian_result(self): # Regression test for SPARK-1034 rdd1 = self.sc.parallelize([1, 2]) rdd2 = self.sc.parallelize([3, 4]) cart = rdd1.cartesian(rdd2) result = cart.map(lambda (x, y): x + y).collect() def test_transforming_pickle_file(self): # Regression test for SPARK-2601 data = self.sc.parallelize(["Hello", "World!"]) tempFile = tempfile.NamedTemporaryFile(delete=True) tempFile.close() data.saveAsPickleFile(tempFile.name) pickled_file = self.sc.pickleFile(tempFile.name) pickled_file.map(lambda x: x).collect() def test_cartesian_on_textfile(self): # Regression test for path = os.path.join(SPARK_HOME, "python/test_support/hello.txt") a = self.sc.textFile(path) result = a.cartesian(a).collect() (x, y) = result[0] self.assertEqual("Hello World!", x.strip()) self.assertEqual("Hello World!", y.strip()) def test_deleting_input_files(self): # Regression test for SPARK-1025 tempFile = tempfile.NamedTemporaryFile(delete=False) tempFile.write("Hello World!") tempFile.close() data = self.sc.textFile(tempFile.name) filtered_data = data.filter(lambda x: True) self.assertEqual(1, filtered_data.count()) os.unlink(tempFile.name) self.assertRaises(Exception, lambda: filtered_data.count()) def testAggregateByKey(self): data = self.sc.parallelize([(1, 1), (1, 1), (3, 2), (5, 1), (5, 3)], 2) def seqOp(x, y): x.add(y) return x def combOp(x, y): x |= y return x sets = dict(data.aggregateByKey(set(), seqOp, combOp).collect()) self.assertEqual(3, len(sets)) self.assertEqual(set([1]), sets[1]) self.assertEqual(set([2]), sets[3]) self.assertEqual(set([1, 3]), sets[5]) def test_itemgetter(self): rdd = self.sc.parallelize([range(10)]) from operator import itemgetter self.assertEqual([1], rdd.map(itemgetter(1)).collect()) self.assertEqual([(2, 3)], rdd.map(itemgetter(2, 3)).collect()) def test_namedtuple_in_rdd(self): from collections import namedtuple Person = namedtuple("Person", "id firstName lastName") jon = Person(1, "Jon", "Doe") jane = Person(2, "Jane", "Doe") theDoes = self.sc.parallelize([jon, jane]) self.assertEquals([jon, jane], theDoes.collect()) def test_large_broadcast(self): N = 100000 data = [[float(i) for i in range(300)] for i in range(N)] bdata = self.sc.broadcast(data) # 270MB m = self.sc.parallelize(range(1), 1).map(lambda x: len(bdata.value)).sum() self.assertEquals(N, m)
# read data as CSV for Dataframe analysis
# /Volumes/work/data/kaggle/ssi.csv
# read data normally
"""
sqlContext = SQLContext(sc)
df = sqlContext.read.format('com.databricks.spark.csv').options(header='false').load(BASE_DATA_PATH + '/ssi.csv')
# summarize(df)
print df.show()
#points = df.map(lambda row: LabeledPoint(input[row.C4],[float(row.C0),float(row.C1),float(row.C2),float(row.C3)]))

values using Dataframe
Final weights: [-137.221167143,12.555647803,53.629362055,109.314252441]
Final intercept: 0.0
"""
points = sc.textFile(BASE_DATA_PATH + "/ssi.csv").map(parsePoint)
model = LogisticRegressionWithSGD.train(points, 10)

print("Final weights: " + str(model.weights))
print("Final intercept: " + str(model.intercept))
"""
Final weights: [-137.221167143,12.555647803,53.629362055,109.314252441]
Final intercept: 0.0
"""
sc.stop()
import os
import sys
import platform

import py4j
import pyspark
from pyspark.context import SparkContext
from pyspark.sql import SQLContext, HiveContext
from pyspark.storagelevel import StorageLevel

# init - create SparkContext on Azure HDInsight
# on Azure HDInsight the master defaults to yarn
sc = SparkContext(appName="wc")

print "sys.argv[1]: ", sys.argv[1]

wc = sc.textFile(sys.argv[1]) \
    .map(lambda x: x.replace(',', ' ').replace('.', ' ').replace('-', ' ').lower()) \
    .flatMap(lambda x: x.split(" ")) \
    .map(lambda x: (x, 1)) \
    .reduceByKey(lambda x, y: x + y)

print wc.collect()
import os

import py4j
import pyspark
from pyspark.context import SparkContext

sc = SparkContext()

# Control our logLevel. This overrides any user-defined log settings.
# Valid log levels include: ALL, DEBUG, ERROR, FATAL, INFO, OFF, TRACE, WARN
sc.setLogLevel("FATAL")

# Assumption: spark_home points at the Spark installation, taken from SPARK_HOME.
spark_home = os.environ["SPARK_HOME"]

text_file = sc.textFile(spark_home + "/README.md")
word_counts = text_file \
    .flatMap(lambda line: line.split()) \
    .map(lambda word: (word, 1)) \
    .reduceByKey(lambda a, b: a + b)

print word_counts.collect()
from pyspark.context import SparkContext
from pyspark import SparkConf
from collections import OrderedDict
import sys
import json

conf = SparkConf()
conf.setMaster("local[*]")
sc = SparkContext(conf=conf)
path = sys.argv[1]

output = []
top_20_cust = []
top_20_business = []

review_file = sc.textFile(path)
rdd_json = review_file.map(json.loads).map(
    lambda x: (x['user_id'], x['business_id'], x['useful'], x['text'], x['stars']))

useful = rdd_json.filter(lambda x: x[2] > 0).count()
stars = rdd_json.filter(lambda x: x[4] == 5.0).count()
longest_review = rdd_json.map(lambda x: len(x[3])).max()

rdd_cust = rdd_json.map(lambda x: (x[0], 1)) \
    .reduceByKey(lambda x, y: x + y) \
    .sortBy(lambda x: (-x[1], x[0]))

rdd_business = rdd_json.map(lambda x: (x[1], 1)) \
    .reduceByKey(lambda x, y: x + y) \
    .sortBy(lambda x: (-x[1], x[0]))
        directories += str(i)
        if i < int(sys.argv[2]):
            directories += ","
    directories += "}"
    hdfs_file_path = "/user/lsde02/data/%s/*.gz" % directories
    forced_partitions = (int(sys.argv[2]) + 1 - int(sys.argv[1])) * 12
else:
    hdfs_file_path = "/user/lsde02/data/*/*.gz"
    forced_partitions = 1500

hdfs_results_path = "/user/lsde02/results/"
start_time = time.strftime("%Y-%m-%d-%H-%M-%S")
print "Started processing: %s" % hdfs_file_path

sc = SparkContext()
context = sc.textFile(hdfs_file_path, forced_partitions)
stations = context.flatMap(lambda x: [utils.extract(record) for record in x.splitlines()])
# stations = stations.filter(lambda x: 'fixed-weather-station' in x[1] or )

# Do computations on month level
month_data = stations.map(lambda x: ((x[0][0], x[0][1], x[0][3]),
                                     (utils.get_attribute(x[1], 'temp'),
                                      utils.get_attribute(x[1], 'windspeed'),
                                      utils.get_attribute(x[1], 'sky-condition'),
                                      utils.get_attribute(x[1], 'visibility'),
                                      utils.get_attribute(x[1], 'wind-direction'),
                                      utils.get_attribute(x[1], 'latitude'),
                                      utils.get_attribute(x[1], 'longitude'))))
month_data = month_data.combineByKey(
    lambda value: (value[0] if value[0] != None else 0, 1 if value[0] != None else 0,
                   value[1] if value[1] != None else 0, 1 if value[1] != None else 0,
                   value[2] if value[2] != None else 0, 1 if value[2] != None else 0,
                   value[3] if value[3] != None else 0, 1 if value[3] != None else 0,
                   math.sin(value[4] * math.pi / 180.0) if value[4] != None else 0,
                   math.cos(value[4] * math.pi / 180.0) if value[4] != None else 0,
                   value[0] * value[0] if value[0] != None else 0,
                   value[1] * value[1] if value[1] != None else 0,
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.rdd import RDD
from pyspark.files import SparkFiles
from pyspark.storagelevel import StorageLevel
from pyspark.accumulators import Accumulator, AccumulatorParam
from pyspark.broadcast import Broadcast
from pyspark.serializers import MarshalSerializer, PickleSerializer
from pyspark.status import *
from pyspark.profiler import Profiler, BasicProfiler

conf = SparkConf().setAppName("hotelhodsrrequest_parsing")
sc = SparkContext(conf=conf)

textFile = sc.textFile("/user/hive/warehouse/ehotel.db/hotelhodsrrequest/year=2016/month=01/day=*/*.gz")

# filter by PCC
PCCArray = textFile.map(lambda line: line.split("|")).filter(lambda line: line[4] == 'B7ZB')

# GroupBy Date
shopperdaycount = PCCArray.map(lambda line: (line[0].split(" ")[0], 1)).reduceByKey(lambda a, b: a + b)
shopperdaycount.saveAsTextFile('/user/sg952655/Totalshopperdaycount/')
shopperdaycounttxt = sc.textFile("/user/sg952655/Totalshopperdaycount/*")
preview0 = shopperdaycounttxt.collect()

# Duplicate record count with sessionid, transactionid, propertycode as unique key
filteredduplicaterecordcount = PCCArray.map(lambda line: (line[1] + " " + line[2] + " " + line[6], 1)) \
    .reduceByKey(lambda a, b: a + b).filter(lambda line: line[1] > 1)
Orderedfilteredduplicaterecordcount = filteredduplicaterecordcount.map(lambda line: (line[1], line[0]))
Orderedfilteredduplicaterecordcount.saveAsTextFile('/user/sg952655/TotalOrderedfilteredduplicaterecordcount/')
Orderedfilteredduplicaterecordcounttxt = sc.textFile("/user/sg952655/TotalOrderedfilteredduplicaterecordcount/*")
print('\nRunning example of classification using GradientBoostedTrees\n')
testClassification(trainingData, testData)

print('\nRunning example of regression using GradientBoostedTrees\n')
testRegression(trainingData, testData)

sc.stop()

from pyspark.mllib.classification import SVMWithSGD, SVMModel
from pyspark.mllib.regression import LabeledPoint

train = sc.textFile("train.csv")

# Load and parse the data
def parsePoint(line):
    values = [float(x) for x in line.split(',')]
    return LabeledPoint(values[-1], values[:-1])

# data = sc.textFile("data/mllib/sample_svm_data.txt")
parsedData = train.map(parsePoint)

# Build the model
model = SVMWithSGD.train(parsedData, iterations=100)

# Evaluating the model on training data
labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
trainErr = labelsAndPreds.filter(lambda vp: vp[0] != vp[1]).count() / float(parsedData.count())
print("Training Error = " + str(trainErr))
# calculate predictions on train data & calculate Mean Squared Error & R2 score
y_pred = lm.predict(features)
print("Mean Squared Error = ", sklearn.metrics.mean_squared_error(labels, y_pred))
print("R2 score = ", sklearn.metrics.r2_score(labels, y_pred))

# calculate predictions on test data & calculate Mean Squared Error & R2 score
y_pred_test = lm.predict(testfeatures)
print("Mean Squared Error = ", sklearn.metrics.mean_squared_error(testlabels, y_pred_test))
print("R2 score = ", sklearn.metrics.r2_score(testlabels, y_pred_test))

# plot graph of prediction vs ground truth
plt.scatter(labels, y_pred, color='black')
plt.xlabel('Ground Truth')
plt.ylabel('Prediction')
plt.show()
plt.savefig('result.png')

# load the prepared dataset
data = sc.textFile("data/testFeatLabs.csv")

printStat = [
    'Below High School Education level',
    'High School Education level',
    'Some College Education level',
    'Bachelors Degree and above Education Level'
]

# run Lasso regression to predict the value for each class
for ind in range(4):
    print('\nTraining for', printStat[ind], ':')
    parsedData = data.filter(filterRows).map(lambda x: parsePoint(x, ind))
    trainAndTest(parsedData)
'''
Created on Oct 30, 2015

@author: dyerke
'''
from pyspark.context import SparkContext
from pyspark.conf import SparkConf

if __name__ == '__main__':
    m_hostname = "dyerke-Inspiron-7537"
    #
    conf = SparkConf()
    conf.setAppName("MyTestApp")
    conf.setMaster("spark://" + m_hostname + ":7077")
    conf.setSparkHome("/usr/local/spark")
    conf.set("spark.driver.host", m_hostname)
    logFile = "/usr/local/spark/README.md"  # Should be some file on your system
    #
    sc = SparkContext(conf=conf)
    logData = sc.textFile(logFile).cache()
    #
    countAs = logData.filter(lambda x: 'a' in x).count()
    countBs = logData.filter(lambda x: 'b' in x).count()
    #
    print("Lines with a: %i, lines with b: %i" % (countAs, countBs))
    sc.stop()
from pyspark.context import SparkContext
from collections import OrderedDict  # preserve the order of the json file output
import sys
import time
import json
import pyspark

file_path1 = sys.argv[1]
file_path2 = sys.argv[2]
output_path1 = sys.argv[3]
output_path2 = sys.argv[4]

sc = SparkContext("local[*]")
sc.setLogLevel("ERROR")
tf1 = sc.textFile(file_path1)
tf2 = sc.textFile(file_path2)

data1 = tf1.map(lambda x: (json.loads(x)["business_id"], json.loads(x)["stars"]))
data2 = tf2.map(
    lambda x: (json.loads(x)["business_id"], json.loads(x)["state"])).persist(
    pyspark.StorageLevel.MEMORY_AND_DISK_2)
statecount = data2.map(lambda x: x[1]).distinct().count()

RDD = data1.join(data2).map(lambda x: (x[1][1], x[1][0])).persist(
    pyspark.StorageLevel.MEMORY_AND_DISK_2)
st = RDD.mapValues(lambda x: (x, 1)).reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1]))
task1 = st.mapValues(lambda x: x[0] / x[1]).sortByKey(False)
task1f = task1.top(statecount, key=lambda x: x[1])
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from sparkpackage.sales_dto import SalesDTO


def print_lines(line):
    print line.product_name


config = SparkConf()
config.setAppName("CSVReaderJOB")
config.setMaster("local[*]")
context = SparkContext(conf=config)

textFileRDD = context.textFile(
    '/home/dharshekthvel/ac/code/scalatrainingintellij/data/sales.csv')

# Broadcast
# amazon_product = context.broadcast(SalesDTO("AMAZON_PRODUCT"))
# mappedRDD = textFileRDD.map(lambda x : amazon_product.value)
# mappedRDD.foreach(lambda x : print_lines(x))

# Accumulator (an accumulator requires an initial value, e.g. 0)
accumulator = context.accumulator(0)
import pyspark
from pyspark.context import SparkContext
from pyspark import SparkConf

conf = SparkConf()
sc = SparkContext(conf=conf)
sc.setLogLevel("ERROR")

# Load the adjacency list file
AdjList1 = sc.textFile("/home/rob/Assignment4/02AdjacencyList.txt")
print AdjList1.collect()

AdjList2 = AdjList1.map(lambda line: line)  # 1. Replace the lambda function with yours
AdjList3 = AdjList2.map(lambda x: x)        # 2. Replace the lambda function with yours
AdjList3.persist()
print AdjList3.collect()

nNumOfNodes = AdjList3.count()
print "Total Number of nodes"
print nNumOfNodes

# Initialize each page's rank; since we use mapValues, the resulting RDD will have the same
# partitioner as links
print "Initialization"
PageRankValues = AdjList3.mapValues(lambda v: v)  # 3. Replace the lambda function with yours
print PageRankValues.collect()

# Run 30 iterations
print "Run 30 Iterations"
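# NOTE (illustrative sketch, not the assignment's own solution): assuming AdjList3 ends up as
# (page, [neighbour, ...]) pairs and PageRankValues as (page, rank) pairs, one common way to
# write the damped iterations is:
for _ in range(30):
    contribs = AdjList3.join(PageRankValues).flatMap(
        lambda node_links_rank: [(dst, node_links_rank[1][1] / len(node_links_rank[1][0]))
                                 for dst in node_links_rank[1][0]])
    PageRankValues = contribs.reduceByKey(lambda a, b: a + b) \
                             .mapValues(lambda rank: 0.15 / nNumOfNodes + 0.85 * rank)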
class TestRDDFunctions(PySparkTestCase): def test_failed_sparkcontext_creation(self): # Regression test for SPARK-1550 self.sc.stop() self.assertRaises(Exception, lambda: SparkContext("an-invalid-master-name")) self.sc = SparkContext("local") def test_save_as_textfile_with_unicode(self): # Regression test for SPARK-970 x = u"\u00A1Hola, mundo!" data = self.sc.parallelize([x]) tempFile = tempfile.NamedTemporaryFile(delete=True) tempFile.close() data.saveAsTextFile(tempFile.name) raw_contents = ''.join(input(glob(tempFile.name + "/part-0000*"))) self.assertEqual(x, unicode(raw_contents.strip(), "utf-8")) def test_save_as_textfile_with_utf8(self): x = u"\u00A1Hola, mundo!" data = self.sc.parallelize([x.encode("utf-8")]) tempFile = tempfile.NamedTemporaryFile(delete=True) tempFile.close() data.saveAsTextFile(tempFile.name) raw_contents = ''.join(input(glob(tempFile.name + "/part-0000*"))) self.assertEqual(x, unicode(raw_contents.strip(), "utf-8")) def test_transforming_cartesian_result(self): # Regression test for SPARK-1034 rdd1 = self.sc.parallelize([1, 2]) rdd2 = self.sc.parallelize([3, 4]) cart = rdd1.cartesian(rdd2) result = cart.map(lambda (x, y): x + y).collect() def test_transforming_pickle_file(self): # Regression test for SPARK-2601 data = self.sc.parallelize(["Hello", "World!"]) tempFile = tempfile.NamedTemporaryFile(delete=True) tempFile.close() data.saveAsPickleFile(tempFile.name) pickled_file = self.sc.pickleFile(tempFile.name) pickled_file.map(lambda x: x).collect() def test_cartesian_on_textfile(self): # Regression test for path = os.path.join(SPARK_HOME, "python/test_support/hello.txt") a = self.sc.textFile(path) result = a.cartesian(a).collect() (x, y) = result[0] self.assertEqual("Hello World!", x.strip()) self.assertEqual("Hello World!", y.strip()) def test_deleting_input_files(self): # Regression test for SPARK-1025 tempFile = tempfile.NamedTemporaryFile(delete=False) tempFile.write("Hello World!") tempFile.close() data = self.sc.textFile(tempFile.name) filtered_data = data.filter(lambda x: True) self.assertEqual(1, filtered_data.count()) os.unlink(tempFile.name) self.assertRaises(Exception, lambda: filtered_data.count()) def testAggregateByKey(self): data = self.sc.parallelize([(1, 1), (1, 1), (3, 2), (5, 1), (5, 3)], 2) def seqOp(x, y): x.add(y) return x def combOp(x, y): x |= y return x sets = dict(data.aggregateByKey(set(), seqOp, combOp).collect()) self.assertEqual(3, len(sets)) self.assertEqual(set([1]), sets[1]) self.assertEqual(set([2]), sets[3]) self.assertEqual(set([1, 3]), sets[5]) def test_itemgetter(self): rdd = self.sc.parallelize([range(10)]) from operator import itemgetter self.assertEqual([1], rdd.map(itemgetter(1)).collect()) self.assertEqual([(2, 3)], rdd.map(itemgetter(2, 3)).collect()) def test_namedtuple_in_rdd(self): from collections import namedtuple Person = namedtuple("Person", "id firstName lastName") jon = Person(1, "Jon", "Doe") jane = Person(2, "Jane", "Doe") theDoes = self.sc.parallelize([jon, jane]) self.assertEquals([jon, jane], theDoes.collect()) def test_large_broadcast(self): N = 100000 data = [[float(i) for i in range(300)] for i in range(N)] bdata = self.sc.broadcast(data) # 270MB m = self.sc.parallelize(range(1), 1).map(lambda x: len(bdata.value)).sum() self.assertEquals(N, m) def test_zip_with_different_serializers(self): a = self.sc.parallelize(range(5)) b = self.sc.parallelize(range(100, 105)) self.assertEqual(a.zip(b).collect(), [(0, 100), (1, 101), (2, 102), (3, 103), (4, 104)]) a = 
a._reserialize(BatchedSerializer(PickleSerializer(), 2)) b = b._reserialize(MarshalSerializer()) self.assertEqual(a.zip(b).collect(), [(0, 100), (1, 101), (2, 102), (3, 103), (4, 104)]) def test_zip_with_different_number_of_items(self): a = self.sc.parallelize(range(5), 2) # different number of partitions b = self.sc.parallelize(range(100, 106), 3) self.assertRaises(ValueError, lambda: a.zip(b)) # different number of batched items in JVM b = self.sc.parallelize(range(100, 104), 2) self.assertRaises(Exception, lambda: a.zip(b).count()) # different number of items in one pair b = self.sc.parallelize(range(100, 106), 2) self.assertRaises(Exception, lambda: a.zip(b).count()) # same total number of items, but different distributions a = self.sc.parallelize([2, 3], 2).flatMap(range) b = self.sc.parallelize([3, 2], 2).flatMap(range) self.assertEquals(a.count(), b.count()) self.assertRaises(Exception, lambda: a.zip(b).count()) def test_histogram(self): # empty rdd = self.sc.parallelize([]) self.assertEquals([0], rdd.histogram([0, 10])[1]) self.assertEquals([0, 0], rdd.histogram([0, 4, 10])[1]) self.assertRaises(ValueError, lambda: rdd.histogram(1)) # out of range rdd = self.sc.parallelize([10.01, -0.01]) self.assertEquals([0], rdd.histogram([0, 10])[1]) self.assertEquals([0, 0], rdd.histogram((0, 4, 10))[1]) # in range with one bucket rdd = self.sc.parallelize(range(1, 5)) self.assertEquals([4], rdd.histogram([0, 10])[1]) self.assertEquals([3, 1], rdd.histogram([0, 4, 10])[1]) # in range with one bucket exact match self.assertEquals([4], rdd.histogram([1, 4])[1]) # out of range with two buckets rdd = self.sc.parallelize([10.01, -0.01]) self.assertEquals([0, 0], rdd.histogram([0, 5, 10])[1]) # out of range with two uneven buckets rdd = self.sc.parallelize([10.01, -0.01]) self.assertEquals([0, 0], rdd.histogram([0, 4, 10])[1]) # in range with two buckets rdd = self.sc.parallelize([1, 2, 3, 5, 6]) self.assertEquals([3, 2], rdd.histogram([0, 5, 10])[1]) # in range with two bucket and None rdd = self.sc.parallelize([1, 2, 3, 5, 6, None, float('nan')]) self.assertEquals([3, 2], rdd.histogram([0, 5, 10])[1]) # in range with two uneven buckets rdd = self.sc.parallelize([1, 2, 3, 5, 6]) self.assertEquals([3, 2], rdd.histogram([0, 5, 11])[1]) # mixed range with two uneven buckets rdd = self.sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.0, 11.01]) self.assertEquals([4, 3], rdd.histogram([0, 5, 11])[1]) # mixed range with four uneven buckets rdd = self.sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.01, 12.0, 199.0, 200.0, 200.1]) self.assertEquals([4, 2, 1, 3], rdd.histogram([0.0, 5.0, 11.0, 12.0, 200.0])[1]) # mixed range with uneven buckets and NaN rdd = self.sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.01, 12.0, 199.0, 200.0, 200.1, None, float('nan')]) self.assertEquals([4, 2, 1, 3], rdd.histogram([0.0, 5.0, 11.0, 12.0, 200.0])[1]) # out of range with infinite buckets rdd = self.sc.parallelize([10.01, -0.01, float('nan'), float("inf")]) self.assertEquals([1, 2], rdd.histogram([float('-inf'), 0, float('inf')])[1]) # invalid buckets self.assertRaises(ValueError, lambda: rdd.histogram([])) self.assertRaises(ValueError, lambda: rdd.histogram([1])) self.assertRaises(ValueError, lambda: rdd.histogram(0)) self.assertRaises(TypeError, lambda: rdd.histogram({})) # without buckets rdd = self.sc.parallelize(range(1, 5)) self.assertEquals(([1, 4], [4]), rdd.histogram(1)) # without buckets single element rdd = self.sc.parallelize([1]) self.assertEquals(([1, 1], [1]), rdd.histogram(1)) # without bucket no range rdd = 
self.sc.parallelize([1] * 4) self.assertEquals(([1, 1], [4]), rdd.histogram(1)) # without buckets basic two rdd = self.sc.parallelize(range(1, 5)) self.assertEquals(([1, 2.5, 4], [2, 2]), rdd.histogram(2)) # without buckets with more requested than elements rdd = self.sc.parallelize([1, 2]) buckets = [1 + 0.2 * i for i in range(6)] hist = [1, 0, 0, 0, 1] self.assertEquals((buckets, hist), rdd.histogram(5)) # invalid RDDs rdd = self.sc.parallelize([1, float('inf')]) self.assertRaises(ValueError, lambda: rdd.histogram(2)) rdd = self.sc.parallelize([float('nan')]) self.assertRaises(ValueError, lambda: rdd.histogram(2)) # string rdd = self.sc.parallelize(["ab", "ac", "b", "bd", "ef"], 2) self.assertEquals([2, 2], rdd.histogram(["a", "b", "c"])[1]) self.assertEquals((["ab", "ef"], [5]), rdd.histogram(1)) self.assertRaises(TypeError, lambda: rdd.histogram(2)) # mixed RDD rdd = self.sc.parallelize([1, 4, "ab", "ac", "b"], 2) self.assertEquals([1, 1], rdd.histogram([0, 4, 10])[1]) self.assertEquals([2, 1], rdd.histogram(["a", "b", "c"])[1]) self.assertEquals(([1, "b"], [5]), rdd.histogram(1)) self.assertRaises(TypeError, lambda: rdd.histogram(2))
trg_node_fields = ["trg_" + field for field in node_info_fields]

# Init Spark Context as running in local mode
sc = SparkContext("local")

# Create a basic Spark Session
spark = SparkSession \
    .builder \
    .appName(app_name) \
    .getOrCreate()

# Specify properties of fields,
# including field name and related data type
log_fields = src_node_fields + transc_info_fields + trg_node_fields + item_info_fields

# ------------------------------------------
# Pipeline of the Workflow

# Load raw data from the local file system
# and split each row by the specific delimiter
source = sc.textFile(input_file_name) \
    .map(lambda x: x.split(delimiter))

# DataFrame for logistics data
log_df = spark.createDataFrame(source, log_fields)

log_df.groupBy("src_area_city").count().write.csv("src_area_city")
log_df.groupBy("src_industry_lv1").count().write.csv("src_industry_lv1")
log_df.groupBy("src_industry_lv3").count().write.csv("src_industry_lv3")
log_df.groupBy("trg_area_city").count().write.csv("trg_area_city")
log_df.groupBy("trg_industry_lv1").count().write.csv("trg_industry_lv1")
log_df.groupBy("trg_industry_lv3").count().write.csv("trg_industry_lv3")
sc = SparkContext(conf=conf)

# read data as CSV for Dataframe analysis
# /Volumes/work/data/kaggle/ssi.csv
# read data normally
'''
sqlContext = SQLContext(sc)
df = sqlContext.read.format('com.databricks.spark.csv').options(header='false').load(BASE_DATA_PATH + '/ssi.csv')
# summarize(df)
print df.show()
#points = df.map(lambda row: LabeledPoint(input[row.C4],[float(row.C0),float(row.C1),float(row.C2),float(row.C3)]))

values using Dataframe
Final weights: [-137.221167143,12.555647803,53.629362055,109.314252441]
Final intercept: 0.0
'''
points = sc.textFile(BASE_DATA_PATH + '/ssi.csv').map(parsePoint)
model = LogisticRegressionWithSGD.train(points, 10)

print("Final weights: " + str(model.weights))
print("Final intercept: " + str(model.intercept))
'''
Final weights: [-137.221167143,12.555647803,53.629362055,109.314252441]
Final intercept: 0.0
'''
sc.stop()
sc = SparkContext(conf=conf)
gateway = sc._gateway
sym = gateway.jvm.com.sml.shell

# Find the access keys for EC2.
awsAccessKeyId = os.environ['AWS_ACCESS_KEY']
awsSecretAccessKey = os.environ['AWS_SECRET_KEY']
# print("awsAccessKeyId=" + awsAccessKeyId)
# print("awsSecretAccessKey=" + awsSecretAccessKey)

sc._jsc.hadoopConfiguration().set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
sc._jsc.hadoopConfiguration().set("fs.s3a.access.key", awsAccessKeyId)
sc._jsc.hadoopConfiguration().set("fs.s3a.secret.key", awsSecretAccessKey)

myrdd = sc.textFile('s3a://sml-oregon/datasets/susy/SUSYmini.csv')

# Convert pyspark RDD to JavaRDD
# _to_java_object_rdd
myJavaRdd = myrdd._jrdd

# The first line of the CSV file holds the names of the attributes
attributeNames = myrdd.first().split(",")
# The attributeTypes have to be given
attributeTypes = ["B"] + ["C"] * (len(attributeNames) - 1)

# The IP address of the host; if empty, the project is not persisted. (Not persisted here)
# sym.SymShellConfig.set("RedisHost","charm")
# sym.SymShellConfig.set("RedisPort",6379)

# 1) Create the Project here
projectName = "susyExampleInPython"
if args.format == "tfr":  # HDFS ==> numpy array
    images = sc.newAPIHadoopFile(args.images,
                                 "org.tensorflow.hadoop.io.TFRecordFileInputFormat",
                                 keyClass="org.apache.hadoop.io.BytesWritable",
                                 valueClass="org.apache.hadoop.io.NullWritable")

    def toNumpy(bytestr):
        example = tf.train.Example()
        example.ParseFromString(bytestr)
        features = example.features.feature
        image = numpy.array(features['image'].int64_list.value)
        label = numpy.array(features['label'].int64_list.value)
        return (image, label)

    dataRDD = images.map(lambda x: toNumpy(str(x[0])))
else:
    if args.format == "csv":  # HDFS ==> numpy array
        images = sc.textFile(args.images).map(lambda ln: [int(x) for x in ln.split(',')])
        labels = sc.textFile(args.labels).map(lambda ln: [float(x) for x in ln.split(',')])
    else:  # args.format == "pickle":  HDFS ==> numpy array
        images = sc.pickleFile(args.images)
        labels = sc.pickleFile(args.labels)
    print("zipping images and labels")
    # print(type(labels))
    # print(labels.count())
    dataRDD = images.zip(labels)  # image + label

# cluster = TFCluster.reserve(sc, args.cluster_size, num_ps, args.tensorboard, TFCluster.InputMode.SPARK)
# cluster.start(mnist_dist.map_fun, args)
cluster = TFCluster.run(sc, mnist_dist.map_fun, args, args.cluster_size, num_ps,
                        args.tensorboard, TFCluster.InputMode.SPARK)
if args.mode == "train" or args.mode == "retrain":
    cluster.train(dataRDD, args.epochs)
import sys

from pyspark.context import SparkContext

sc = SparkContext(...)
lines = sc.textFile(sys.argv[2], 1)
counts = lines.flatMap(lambda x: x.split(' ')) \
    .map(lambda x: (x, 1)) \
    .reduceByKey(lambda x, y: x + y)
for (word, count) in counts.collect():
    print "%s:%i" % (word, count)