import findspark findspark.init() import pyspark from pyspark import SQLContext from pyspark import SparkContext SparkContext.setSystemProperty('spark.cleaner.periodicGC.interval', '2') SparkContext.setSystemProperty('spark.executor.memory', '2400m') SparkContext.setSystemProperty('spark.driver.cores', '2') SparkContext.setSystemProperty('spark.driver.memory', '2g') SparkContext.setSystemProperty("spark.driver.maxResultSize", "2g") sc = pyspark.SparkContext(master='spark://192.168.11.239:7077', appName='pipeline_tests') sqlContext = SQLContext(sc) from pyspark.sql.types import StringType from datetime import datetime import pyspark.sql.functions as F #avoid conflicts with regular python functions from pyspark.sql.functions import udf from pyspark.ml import Pipeline from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler from pyspark.ml.feature import PCA, StandardScaler, MinMaxScaler, MaxAbsScaler, Normalizer import numpy as np import time from pipeline_tester import PipelineTester ######CLEAN df = sqlContext.read.csv("/datasets/crimes.csv", header='true') #Define date derivatives df = (df.withColumn(
#! /usr/bin/python3
"""Demonstrate persisting an RDD with an explicit StorageLevel."""
import pyspark

sc = pyspark.SparkContext("local", "storagelevel app")
rdd1 = sc.parallelize([1, 2])
# MEMORY_AND_DISK lets partitions spill to disk when they do not fit in memory.
rdd1.persist(pyspark.StorageLevel.MEMORY_AND_DISK)
# BUGFIX: the original called rdd1.getStorageLevel() once and discarded the
# result; the only useful call is the one whose value is printed.
print(rdd1.getStorageLevel())
# Spark + Redis setup for a tweet hashtag trend-analysis streaming job.
import pyspark

# NOTE(review): `sc` conventionally names a SparkContext, but here it is a
# SparkConf; it is passed as `conf=` below, so behavior is correct.
sc = pyspark.SparkConf()\
    .setMaster("local[*]")\
    .set("spark.driver.memory","8g")\
    .set("spark.executor.memory","8g")\
    .set("spark.debug.maxToStringFields", 10000)\
    .set('spark.sql.debug.maxToStringFields', 2000)\
    .set("spark.jars","/Users/yeeun/Apache/spark-2.4.4-bin-hadoop2.7/jars/spark-redis-2.4.0-jar-with-dependencies.jar")
sparkContext = pyspark.SparkContext(conf=sc)
# NOTE(review): SparkSession(sparkContext) constructs a session whose instance
# is then discarded — `.builder` is a class attribute, and getOrCreate() reuses
# the context created above. Verify this is the intended construction.
spark = pyspark.sql.SparkSession(sparkContext).builder\
    .appName("pysaprk_python")\
    .config("spark.redis.host", "localhost")\
    .config("spark.redis.port", "6379")\
    .getOrCreate()
from pyspark.streaming import StreamingContext
from pyspark import StorageLevel
from itertools import chain
import redis, json, time

# Direct Redis client (DB 0) used alongside the spark-redis connector.
myRedis = redis.Redis(host='127.0.0.1', port=6379, db=0)
# Socket endpoint presumably used by a later streaming receiver — confirm.
IP = "127.0.0.1"
Port = 5559
# tweet hashtag Trend analysis
schema = ['text', 'is_quote_status', 'entities.hashtags.text as hashtag']
"""Estimate pi by Monte Carlo sampling to verify a working PySpark install."""
import pyspark

sc = pyspark.SparkContext(appName="test_pyspark")
import random

NUM_SAMPLES = 1000


def inside(p):
    """Return True when a fresh random point falls inside the unit quarter-circle.

    The argument is ignored; each call draws its own random coordinates.
    """
    px, py = random.random(), random.random()
    return px * px + py * py < 1


count = sc.parallelize(range(0, NUM_SAMPLES)).filter(inside).count()
pi = 4 * count / NUM_SAMPLES
print()
print(pi)
print()
print("Pyspark is working")
print()
from pyspark.sql import SparkSession, SQLContext import pyspark.sql.functions as fn from pyspark.sql.functions import col import pyspark import os import subprocess import math from pyspark.sql.types import StructType, StringType, IntegerType private_ip = 'namenode_ip' private_ip = private_ip.replace('.', '-') context = pyspark.SparkContext('local[*]') sesh = SparkSession(context) df_mongo = sesh.read.json( 'hdfs://ip-{}.ec2.internal:9000/user/ubuntu/metadata/metadata.json'.format( private_ip)) # drop these columns from metadata df_mongo = df_mongo.drop('id')\ .drop('_id')\ .drop('brand')\ .drop('categories')\ .drop('description')\ .drop('related')\ .drop('salesRank')\ .drop('title')\ .drop('imUrl')\ .dropna()\
## Assignment 1 - Word Count Using PySpark ##
## Author - Akshay Patel ##
## Usage - Copy paste all code in pyspark ##
## until graph is rendered in window ##
######################################################
import matplotlib.pyplot as plt
from string import punctuation
from operator import add
import numpy as np
import re
import time
import pyspark

sc = pyspark.SparkContext(appName="App")

################ Word Count Application On Shakespeare's File #################


def wordCount(wordListRDD):
    """Build a pair RDD of word counts from an RDD of words.

    Args:
        wordListRDD (RDD of str): An RDD consisting of words.

    Returns:
        RDD of (str, int): An RDD consisting of (word, count) tuples.
    """
    # Pair every word with an initial count of one, then sum counts per key.
    return wordListRDD.map(lambda token: (token, 1)).reduceByKey(add)
# Daily Citibike aggregation: read yesterday's raw CSVs from HDFS, convert to
# pandas, write an aggregated CSV back, and create a backup directory.
import findspark
findspark.init()
import pyspark
import random
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.sql import SQLContext
from hdfs import InsecureClient
from datetime import date, timedelta

# Yesterday's date as YYYYMMDD. NOTE: this rebinds the imported name `date`.
yesterday = date.today() - timedelta(1)
date = yesterday.strftime('%Y%m%d')

sc = pyspark.SparkContext(appName="Citibike_aggregation_" + date)

# NOTE(review): `sqlContext` is never defined in the visible code — presumably
# SQLContext(sc) was intended; confirm against the full source.
csvDf = sqlContext.read.format("csv").option("header", "true").option(
    "inferschema", "true").option("mode", "DROPMALFORMED").load(
        'hdfs://ubuntuclient2.psudata.dev:8020/user/hdfs/citibike/raw_data/' +
        date + "*.csv")

# Pull the whole day into driver memory as a pandas DataFrame.
pd_df = csvDf.toPandas()

# NOTE(review): `client_hdfs` is undefined here — likely an InsecureClient(...)
# constructed elsewhere; verify against the full source.
with client_hdfs.write(
        '/user/hdfs/citibike/daily_aggregated_data/aggregated_' + date + '.csv',
        encoding='utf-8') as writer:
    pd_df.to_csv(writer, index=False)

# NOTE(review): `subprocess` is used but not imported in the visible code.
mkdir_command = 'hdfs dfs -mkdir /user/hdfs/citibike/daily_backup_data/' + date
cmd_mkdir = mkdir_command.split()
subprocess.check_output(cmd_mkdir)
import sys import csv import math import pyspark sc = pyspark.SparkContext("local") rd = sys.argv[1] td = sys.argv[2] ratings = sc.textFile(rd) ratings = ratings.map(lambda x: x.split(",")).map(lambda x: (x[0], (x[1], x[2]))) test = sc.textFile(td) test=test.map(lambda x: x.split(",")).filter(lambda x: x[0]!='userId')\ .map(lambda x: (x[0],x[1])) raw=ratings.filter(lambda x: x[0]!='userId')\ .map(lambda x: ((x[0],x[1][0]),x[1][1])) test_label = test.map(lambda x: ((x[0], x[1]), 'test')) train_test = raw.leftOuterJoin(test_label) # train data # (userid,(movieid,rating)) rating=train_test.filter(lambda x: x[1][1]!='test')\ .map(lambda x: (x[0][0],(x[0][1],x[1][0]))) # test data # (userid,(movieid,rating)) test_rating=train_test.filter(lambda x: x[1][1]=='test')\
'''
initSession.py establishes a unified spark context and session to be used in
subsequent scripts.
'''
import pyspark
from pyspark.sql import SparkSession

# Create the context first; the builder's getOrCreate() then reuses it, so
# `sc` and `spark` share one underlying SparkContext.
sc = pyspark.SparkContext('local[*]', "temp")
spark = SparkSession.builder.master("local[*]").appName("temp").getOrCreate()
def main(argv): sc = pyspark.SparkContext() # Two SDG Goals goal = [ "Proportion of students at the end of lower secondary education achieving at least a minimum proficiency level in reading, both sexes (%)", "Proportion of students at the end of lower secondary education achieving at least a minimum proficiency level in mathematics, both sexes (%)" ] # Two different Y label files for two SDG goals files = ['hdfs:/data/SDG4_data.csv', 'hdfs:/data/SDG4_data_Math.csv'] for i in range(2): rdd_SDG_data = sc.textFile(files[i]) # Here filtered out data for a particular Goal def filter_goal(x): split_x = x.split(',') gol = goal[i] if (((split_x[1]+",").replace('"','')+split_x[2].replace('"',''))== gol)\ and split_x[-4] != "" and split_x[-3] !="" : return True else: return False # Here Mapping data for a (country, (year, value)) def map_cntry_year(x): split_x = x.split(',') return (split_x[-7].replace('"', ''), (split_x[-4].replace('"', ''), split_x[-3])) # Filtered out country having data of both 2015 and 2018 year def map_year_2015_2018(x): yr_2015, yr_2018 = 0.0, 0.0 if x[1][0][0] == "2015": yr_2015 = float(x[1][0][1]) else: yr_2018 = float(x[1][0][1]) if x[1][1][0] == "2015": yr_2015 = float(x[1][1][1]) else: yr_2018 = float(x[1][1][1]) return (x[0], (yr_2015, yr_2018)) mapped_sdg_data = rdd_SDG_data.filter(lambda x:filter_goal(x)).map(lambda x:map_cntry_year(x))\ .groupByKey().map(lambda x : (x[0], list(x[1]))).filter(lambda x:len(x[1])==2)\ .map(lambda x: map_year_2015_2018(x)) #list down all countries having data points, total 57 country are there list_cntry_code = mapped_sdg_data.map(lambda x: x[0]).collect() brdcast_cntry = sc.broadcast(list_cntry_code) # Feedback variable files for year 2018 of students rdd_fb_data_2018 = sc.textFile('hdfs:/data/cy07_msu_stu_qqq.csv') # Feedback variable files for year 2015 of students rdd_fb_data_2015 = sc.textFile('hdfs:/data/cy6_ms_cmb_stu_qqq.csv') # map (k,v) as (country,[list of Feedback variables]) def map_cnt_STFB(x): l = 
x.split(',') # 298 are total features return (l[1], (l[14:18] + l[20:41] + l[44:52] + l[55:320])) # List down all the Feedback variable list, there are 298 features in total to be used for hypothesis testing brdcast_FB_var = sc.broadcast( rdd_fb_data_2018.map(lambda x: map_cnt_STFB(x)).map( lambda x: x[1]).take(1)) # print(len((brdcast_FB_var.value)[0])) # There were multiple entries for each country for each Feedback variable # Filled the missing value with the mean and as output, (country,(Feedback_var name,value)) def take_avg_per_cntry_FB_var(x): N = len(x[1]) dict_count_empty = {} dict_count_val = {} for j in range(N): l = x[1][j] for ele in range(len(l)): if not l[ele]: if ele in dict_count_empty.keys(): dict_count_empty[ele] += 1.0 else: dict_count_empty[ele] = 1.0 else: if ele in dict_count_val.keys(): dict_count_val[ele] += float(l[ele]) else: dict_count_val[ele] = float(l[ele]) FB_var_list = (brdcast_FB_var.value)[0] ans = [] for i in dict_count_val.keys(): if i not in dict_count_empty.keys(): ans.append((x[0], (FB_var_list[i], dict_count_val[i] / N))) else: ans.append((x[0], (FB_var_list[i], (dict_count_val[i] + ( (dict_count_val[i] / (N - dict_count_empty[i])) * dict_count_empty[i])) / N))) return ans # Filter Feedback variable data for countries for which we have SDG goal value def filter_country(x): cntry_list = brdcast_cntry.value l = x.split(',') if l[1] in cntry_list: return True else: return False filter_FB_2018 = rdd_fb_data_2018.filter(lambda x: filter_country(x)).map(lambda x:map_cnt_STFB(x)).groupByKey()\ .map(lambda x : (x[0], list(x[1])))\ .flatMap(lambda x:take_avg_per_cntry_FB_var(x)) filter_FB_2015 = rdd_fb_data_2015.filter(lambda x: filter_country(x)).map(lambda x:map_cnt_STFB(x)).groupByKey()\ .map(lambda x : (x[0], list(x[1])))\ .flatMap(lambda x:take_avg_per_cntry_FB_var(x)) # Map (k,v) pair as (Feedback_varname, (yr, its value)) def merge_2018(k): yr_2018 = k[1][0] FB_var = k[1][1] return (FB_var[0], (yr_2018[1], FB_var[1])) # Map 
(k,v) pair as (Feedback_varname, (yr, its value)) def merge_2015(k): yr_2015 = k[1][0] FB_var = k[1][1] return (FB_var[0], (yr_2015[0], FB_var[1])) # Merge all SDG goal output with Feedback variable data merge_year_2018_data = mapped_sdg_data.join(filter_FB_2018).map( lambda k: merge_2018(k)) merge_year_2015_data = mapped_sdg_data.join(filter_FB_2015).map( lambda k: merge_2015(k)) # Merged all years data with key value as Feedback variable name and in output, (Feedback var, [(goalval, Feedback_var value) for each country]) total_merge_data = merge_year_2015_data.join(merge_year_2018_data).groupByKey()\ .map(lambda x : (x[0], list(x[1]))) # Standandardize data to calculate Beta value for each Feedback variable def standard_data_fun(x): key = x[0] val = x[1] ans = [] N = len(val) arrx = [] arry = [] for i in range(N): arrx.append(val[i][0][1]) arrx.append(val[i][1][1]) arry.append(val[i][0][0]) arry.append(val[i][1][0]) x_mean = np.mean(arrx) y_mean = np.mean(arry) x_std = np.std(arrx) y_std = np.std(arry) ans = [] tot = len(arrx) beta_val = 0.0 for i in range(tot): beta_val += ((arry[i] - y_mean) / y_std) * ( (arrx[i] - x_mean) / x_std) ans.append( ((arry[i] - y_mean) / y_std, (arrx[i] - x_mean) / x_std)) return ((key, beta_val / (tot - 1)), ans) def calc_beta_val(x): sum = 0.0 N = len(x[1]) for i in range(len(x[1])): sum += x[1][i][0] * x[1][i][1] return (x[0], sum / N - 1) standardize_data = total_merge_data.map(lambda x: standard_data_fun(x)) # Seperated all Feedback variables having 20 most and least co-related variable brdcast_top_20_feature = sc.broadcast( standardize_data.sortBy(lambda x: -x[0][1]).map( lambda x: x[0][0]).take(20)) brdcast_bottom_20_feature = sc.broadcast( standardize_data.sortBy(lambda x: x[0][1]).map( lambda x: x[0][0]).take(20)) # print(top_20_feature) # print("#########################################") # print(bottom_20_feature) def calc_p_value(x): dof = len(x[1]) - 2 rss = 0.0 for i in range(len(x[1])): val = x[1][i][1] - (x[0][1] * 
x[1][i][0]) rss += val * val s_square = rss / dof denominator = dof + 2 plt_beta = stats.t.cdf( x[0][1] / math.sqrt((s_square / denominator)), dof) if plt_beta < 0.5: return ((x[0][0]), (x[0][1], 2 * plt_beta * 1000)) else: return ((x[0][0]), (x[0][1], (1 - plt_beta) * 2 * 1000)) pvalue_20_pos_cor = standardize_data.filter(lambda x: x[0][ 0] in brdcast_top_20_feature.value).map(lambda x: calc_p_value(x)) pvalue_20_neg_cor = standardize_data.filter( lambda x: x[0][0] in brdcast_bottom_20_feature.value).map( lambda x: calc_p_value(x)) print(" For goal ##", goal[i]) print("#########################################") print() print("Top 20 most important feature ") print() print(pvalue_20_pos_cor.collect()) print() print("#########################################") print() print("Bottom 20 most important feature ") print() print(pvalue_20_neg_cor.collect()) print()
for i in range(number_of_batches): for j in range((i+1), number_of_batches): combinations.append((i,j)) else: combinations = generate_combinations(number_of_batches, int(args.combinations)) print(combinations) combGraph = nx.Graph() for e in combinations: combGraph.add_edge(e[0],e[1]) print("len(combinations)",len(combinations)) details["combinations"] = combinations details["number_of_combinations"] = len(combinations) partition_size = len(combinations) # In[13]: app_name = dataset_name + "_" + str(number_of_batches) + "_" + str(len(combinations)) sc = pyspark.SparkContext(appName=app_name) sc.addPyFile("node2vec.py") # In[83]: train_edge_true = sc.textFile(input_path + "train_edges_true.txt") .map(lambda x: (int(x.split(' ')[0]), int(x.split(' ')[1]))) .map(lambda x: (min(x[0],x[1]), max(x[0],x[1]), 1)) train_edge_false= sc.textFile(input_path + "train_edges_false.txt") .map(lambda x: (int(x.split(' ')[0]), int(x.split(' ')[1]))) .map(lambda x: (min(x[0],x[1]), max(x[0],x[1]), 0)) train_edges = sc.union([train_edge_true, train_edge_false]) del train_edge_true del train_edge_false # In[84]: nodes = train_edges.flatMap(lambda x: [x[0],x[1]]).distinct().persist() nodes_count = len(nodes.collect()) #todo optimize
import pyspark as ps sc = ps.SparkContext("local") import sys case_num = int(sys.argv[1]) ratings_path = sys.argv[2] users_path = sys.argv[3] support = int(sys.argv[4]) # A-Priori Algorithm def Get_Candidates(data, threshold): local_candidate = dict() candidate_set = set() data_sets = list() items_set = dict() for i in data: data_sets.append(set(i)) for i in data_sets: for j in i: if items_set.has_key(j): items_set[j] += 1 else: items_set[j] = 1 for i in items_set.keys(): if items_set[i] >= threshold: candidate_set.add(frozenset([i])) k = 2 local_candidate[k - 1] = candidate_set
# Streaming packet dissector: consume raw Ethernet frames from Kafka and
# dissect them with scapy inside a Spark Streaming job.
import pyspark
import pyspark.streaming as pyspark_streaming
import pyspark.streaming.kafka as pyspark_kafka
import scapy.all as scapy

# -----------------------------------------------------------------------------
# Main program
# -----------------------------------------------------------------------------
if __name__ == "__main__":
    #
    # Setup
    #
    #-- define spark usual and streaming contexts
    cont_0 = pyspark.SparkContext(appName="pkt_dissector")
    cont_0.setLogLevel("ERROR")
    # 5-second micro-batch interval.
    s_cont_0 = pyspark_streaming.StreamingContext(cont_0, 5)

    #-- kafka integration (notice, that we receive packets as a bytes struct)
    brokers = "192.168.122.71:9092,192.168.122.72:9092,192.168.122.73:9092"
    # Direct (receiver-less) stream from topic "test1"; keep the payload as
    # raw bytes so scapy can parse it below.
    kafka_dstream = pyspark_kafka.KafkaUtils.createDirectStream(
        s_cont_0, ["test1"], {"metadata.broker.list": brokers},
        valueDecoder=lambda x: bytes(x))

    #
    # Lazy evaluation rules
    #
    #-- Kafka message comes as a 2-tuple: (key, value). The code below will
    #-- select the actual message (i.e. packet) and dissects it.
    pkts = kafka_dstream.map(lambda x: scapy.Ether(x[1]))
# Installing some required python packages and models print( "\n**INFO** :Теперь установка некоторых необходимых пакетов и моделей Python\n" ) from pyspark.mllib.linalg import Vectors from pyspark.ml.classification import LogisticRegression from pyspark.ml.param import Param, Params from pyspark.sql import SQLContext print("Done!") import pyspark sqlContext = pyspark.SQLContext(pyspark.SparkContext()) print( "\n**INFO** :Подготовьте данные обучения из списка (метка, характеристики) кортежей.\n" ) # print("Done!") training = sqlContext.createDataFrame([(1.0, Vectors.dense([0.0, 1.1, 0.1])), (0.0, Vectors.dense([2.0, 1.0, -1.0])), (0.0, Vectors.dense([2.0, 1.3, 1.0])), (1.0, Vectors.dense([0.0, 1.2, -0.5]))], ["label", "features"]) print("Done!") print( "\n**INFO** :# Создать экземпляр LogisticRegression. Этот экземпляр является оценщиком.\n" ) # Распечатайте параметры, документацию и любое значение по умолчанию lr = LogisticRegression(maxIter=10, regParam=0.01) # распечатайте параметры, документацию и любые значения по умолчанию. print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")
#import findspark
#findspark.init()
"""Map each element to the characters of 'alpha' and concatenate the lists."""
import pyspark

sc = pyspark.SparkContext("local", "MyApp")
rdd = sc.parallelize(range(5))
# Every element maps to the list ['a', 'l', 'p', 'h', 'a'].
letters = rdd.map(lambda x: [abet for abet in 'alpha'])
# BUGFIX: reduce() already returns a plain Python list to the driver; the
# original then called .collect() on that list, raising AttributeError.
answer = letters.reduce(lambda x, y: x + y)
print(answer)
import operator
import pyspark

# def main():
#     '''Program entry point'''

# Initialize a spark context; the `with` block stops it automatically on exit.
with pyspark.SparkContext("local", "PySparkWordCounts") as sc:
    # Get a RDD containing lines from this script file.
    # BUGFIX: use a raw string so the Windows-path backslashes are never
    # interpreted as escape sequences (invalid escapes are deprecated).
    lines = sc.textFile(r"C:\Govi\Python\jaffa.txt")
    # Split each line into words and assign a frequency of 1 to each word.
    words = lines.flatMap(lambda line: line.split(" ")).map(lambda word: (word, 1))
    # Count the frequency for words.
    counts = words.reduceByKey(operator.add)
    # Sort the counts in descending order based on the word frequency.
    sorted_counts = counts.sortBy(lambda x: x[1], False)
    # Get an iterator over the counts to print a word and its frequency.
    for word, count in sorted_counts.toLocalIterator():
        print(u"{} --> {}".format(word, count))

# if __name__ == "__main__":
#     main()
#######################################################################
# Word count example in pyspark
#######################################################################
import pyspark

sc = pyspark.SparkContext(appName = 'wordcount')

################################################################
# Read in the data
################################################################
# BUGFIX: the original list was missing commas after "one room;" and
# "chairs," — Python's implicit string concatenation silently merged
# adjacent literals, producing 6 elements instead of 8.
lines = ["Dorothy lived in the midst of the great Kansas prairies,",
         "with Uncle Henry, who was a farmer, and Aunt Em, who was",
         "the farmer's wife. Their house was small, for the lumber",
         "to build it had to be carried by wagon many miles. There",
         "were four walls, a floor and a roof, which made one room;",
         "and this room contained a rusty looking cookstove, a",
         "cupboard for the dishes, a table, three or four chairs,",
         "and the beds."]

rdd = sc.parallelize(lines)

# TO DO: look at the first 3 lines

################################################################
# Pre-processing the data
################################################################

# TO DO: transform the RDD into a new one (also called 'rdd')
all_args['features'] = arguments.features all_args['id'] = arguments.id all_args['labels'] = arguments.labels # dtu_cluster_path = 'file:///home/micsas/workspace/distributions/dist_workflow' # local_path = "file:/home/svanhmic/workspace/DABAI/Workflows/dist_workflow" # visma_cluster_path = 'file:/home/ml/deployments/workflows' py_files = [ '/shared.zip', '/examples.zip', '/cleaning.zip', '/classification.zip', '/semisupervised.zip' ] spark_conf = pyspark.SparkConf(loadDefaults=False) (spark_conf.set('spark.executor.cores', 4).set('spark.executor.memory', '1G').set('spark.executors', 2)) sc = pyspark.SparkContext(appName=arguments.job_name) job_module = importlib.import_module('{:s}'.format(arguments.job_name)) # sc = pyspark.SparkContext( # appName=arguments.job_name, pyFiles=[arguments.cluster_path+py_file for py_file in py_files], conf=spark_conf) # job_module = importlib.import_module('{:s}'.format(arguments.job_name)) try: data_frame = job_module.run(sc, **all_args) # data_frame.printSchema() # data_frame.show() rdd = data_frame.toJSON( ) # .saveAsTextFile('hdfs:///tmp/cleaning.txt') js = rdd.collect() # print(js) if arguments.job_name == 'cleaning': print("""{"cluster":[""" + ','.join(js) + """]}""") elif arguments.job_name == 'classification':
from pyspark.sql.types import *
import os
import sys  # BUGFIX: sys.exit() was used below without importing sys
import numpy as np
import pyspark as ps

dlds = '/Users/travis.howe/Downloads/'

# to run this in the command line, simply python galvanize_individual.py
# in the spark shell, i don't need to initialize SparkContext (done automatically)
conf = ps.SparkConf().setMaster('local[4]').setAppName('My App')
sc = ps.SparkContext(conf=conf)

# file_rdd = sc.textFile('../sample_data_files/cookie_data.txt')
rdd1 = sc.parallelize(range(5))
rdd1.saveAsTextFile(dlds + 'thing.txt')
sys.exit()

# BUGFIX: `print rdd1.reduce(...)` was a Python-2 print statement (a syntax
# error under Python 3). NOTE(review): this line is unreachable after
# sys.exit(); kept for parity with the original.
print(rdd1.reduce(lambda x, y: x + y))

# start with page 67, aggregate
"""Data pipeline for extracting basic metrics""" import sys import random import ast import re # this code allows you to run the following code locally import findspark findspark.init() import pyspark # noqa: E402 from pyspark import SparkContext # noqa: E402 sc = pyspark.SparkContext(appName="hw") # the code is run on the Cavium cluster of UMICH ARC-TS # Some libraries are pre-loaded as part of the cluster configuration # e.g. SparkContext # If you are to run this code on a local machine, # pay attention to those libraries: https://arc-ts.umich.edu/cavium/user-guide/ # or contact the authors random.seed(42) sys.setrecursionlimit(999999) def inject_one_depth(node): res = 1 for subnode in node['children']: inject_one_depth(subnode) # inplace res = max(res, subnode['replies_depth']) node['replies_depth'] = res
data.replace("<br />", "") data = re.sub("[^\w]", " ", data) return data.split() def getLabelPoint(inputList): return LabeledPoint(inputList[1],inputList[0]) if __name__=="__main__": #model = py4j.java_gateway.launch_gateway finalDict={} details = {} if len(gb.glob("./diction*.pkl")) == 0: configuartion=py.SparkConf() # setting the Spark Configuration sContext=py.SparkContext(conf=configuartion) # setting the Spark context sContext.defaultParallelism print ("Data preprocessing start time:", datetime.datetime.now().time()) traindataPos = sContext.parallelize(gb.glob("/home/vyassu/Downloads/Telegram Desktop/aclImdb/train/pos/*.txt")) posData = traindataPos.flatMap(getdata) testdataPos = sContext.parallelize(gb.glob("/home/vyassu/Downloads/Telegram Desktop/aclImdb/test/pos/*.txt")) postestData = testdataPos.flatMap(getdata) newposData = traindataPos + testdataPos traindataNeg = sContext.parallelize(gb.glob("/home/vyassu/Downloads/Telegram Desktop/aclImdb/train/neg/*.txt")) negData = traindataNeg.flatMap(getdata) testdataNeg = sContext.parallelize(gb.glob("/home/vyassu/Downloads/Telegram Desktop/aclImdb/test/neg/*.txt")) negtestData = testdataNeg.flatMap(getdata)
def main(base_path): APP_NAME = "train_spark_mllib_model.py" # SparkSession이 없으면 환경 생성 try: sc and spark except NameError as e: import findspark findspark.init() import pyspark import pyspark.sql sc = pyspark.SparkContext() spark = pyspark.sql.SparkSession(sc).builder.appName( APP_NAME).getOrCreate() # # { # "ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00","CRSDepTime":"2015-12-31T03:05:00.000-08:00", # "Carrier":"WN","DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,"Dest":"SAN","Distance":368.0, # "FlightDate":"2015-12-30T16:00:00.000-08:00","FlightNum":"6109","Origin":"TUS" # } # from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType, DateType, TimestampType from pyspark.sql.types import StructType, StructField from pyspark.sql.functions import udf schema = StructType([ StructField("ArrDelay", DoubleType(), True), StructField("CRSArrTime", TimestampType(), True), StructField("CRSDepTime", TimestampType(), True), StructField("Carrier", StringType(), True), StructField("DayOfMonth", IntegerType(), True), StructField("DayOfWeek", IntegerType(), True), StructField("DayOfYear", IntegerType(), True), StructField("DepDelay", DoubleType(), True), StructField("Dest", StringType(), True), StructField("Distance", DoubleType(), True), StructField("FlightDate", DateType(), True), StructField("FlightNum", StringType(), True), StructField("Origin", StringType(), True), StructField("Route", StringType(), True), StructField("TailNum", StringType(), True), StructField("EngineManufacturer", StringType(), True), StructField("EngineModel", StringType(), True), StructField("Manufacturer", StringType(), True), StructField("ManufacturerYear", StringType(), True), StructField("OwnerState", StringType(), True), ]) input_path = "{}/data/simple_flight_delay_features_airplanes.json".format( base_path) features = spark.read.json(input_path, schema=schema) features.first() # # 예정된 도착/출발 시간 추가 # from pyspark.sql.functions import hour 
features_with_hour = features.withColumn("CRSDepHourOfDay", hour(features.CRSDepTime)) features_with_hour = features_with_hour.withColumn( "CRSArrHourOfDay", hour(features.CRSArrTime)) features_with_hour.select("CRSDepTime", "CRSDepHourOfDay", "CRSArrTime", "CRSArrHourOfDay").show() # # Spark ML을 사용하기 전 특징에 널 값이 있는지 확인 # null_counts = [ (column, features_with_hour.where(features_with_hour[column].isNull()).count()) for column in features_with_hour.columns ] cols_with_nulls = filter(lambda x: x[1] > 0, null_counts) print("\nNull Value Report") print("-----------------") print(tabulate(cols_with_nulls, headers=["Column", "Nulls"])) # # pysmark.ml.feature.Bucketizer를 사용해서 ArrDelay를 on-time, slightly late, very late (0, 1, 2)으로 구간화 # from pyspark.ml.feature import Bucketizer # 구간화 모델 설정 splits = [-float("inf"), -15.0, 0, 30.0, float("inf")] arrival_bucketizer = Bucketizer(splits=splits, inputCol="ArrDelay", outputCol="ArrDelayBucket") # 모델 저장 arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format( base_path) arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path) # 모델 적용 ml_bucketized_features = arrival_bucketizer.transform(features_with_hour) ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show() # # pyspark.ml.feature에서 특징 도구 임포트 # from pyspark.ml.feature import StringIndexer, VectorAssembler # 범주 필드를 인덱스로 전환 string_columns = ["Carrier", "Origin", "Dest", "Route", "TailNum"] for column in string_columns: string_indexer = StringIndexer(inputCol=column, outputCol=column + "_index") string_indexer_model = string_indexer.fit(ml_bucketized_features) ml_bucketized_features = string_indexer_model.transform( ml_bucketized_features) # 파이프라인 모델 저장 string_indexer_output_path = "{}/models/string_indexer_model_4.0.{}.bin".format( base_path, column) string_indexer_model.write().overwrite().save( string_indexer_output_path) # 연속 수치형 필드를 명목형 필드의 인덱스와 결합해서 하나의 특징 벡터로 만듦 numeric_columns = [ "DepDelay", "Distance", "DayOfYear", 
"CRSDepHourOfDay", "CRSArrHourOfDay" ] index_columns = [column + "_index" for column in string_columns] vector_assembler = VectorAssembler(inputCols=numeric_columns + index_columns, outputCol="Features_vec") final_vectorized_features = vector_assembler.transform( ml_bucketized_features) # 수치 벡터 어셈블러를 저장 vector_assembler_path = "{}/models/numeric_vector_assembler_5.0.bin".format( base_path) vector_assembler.write().overwrite().save(vector_assembler_path) # 인덱스 열 제거 for column in index_columns: final_vectorized_features = final_vectorized_features.drop(column) # 확정된 특징 검사 final_vectorized_features.show() # # 분류 모델 교차 검증, 훈련, 평가: 4 개의 지표에 대해 5회 반복 # from collections import defaultdict scores = defaultdict(list) feature_importances = defaultdict(list) metric_names = ["accuracy", "weightedPrecision", "weightedRecall", "f1"] split_count = 3 for i in range(1, split_count + 1): print("\nRun {} out of {} of test/train splits in cross validation...". format( i, split_count, )) # 테스트/훈련 데이터 분할 training_data, test_data = final_vectorized_features.randomSplit( [0.8, 0.2]) # 전체 데이터에 대해 랜덤 포레스트 분류 모델을 인스턴스화하고 적합시키기 from pyspark.ml.classification import RandomForestClassifier rfc = RandomForestClassifier( featuresCol="Features_vec", labelCol="ArrDelayBucket", predictionCol="Prediction", maxBins=4896, ) model = rfc.fit(training_data) # 오래된 모델 대신 새 모델을 저장 model_output_path = "{}/models/spark_random_forest_classifier.flight_delays.baseline.bin".format( base_path) model.write().overwrite().save(model_output_path) # 테스트 데이터로 모델 평가 predictions = model.transform(test_data) # 이 테스트/훈련 데이터 분할의 결과를 각 지표별로평가 from pyspark.ml.evaluation import MulticlassClassificationEvaluator for metric_name in metric_names: evaluator = MulticlassClassificationEvaluator( labelCol="ArrDelayBucket", predictionCol="Prediction", metricName=metric_name) score = evaluator.evaluate(predictions) scores[metric_name].append(score) print("{} = {}".format(metric_name, score)) # # 특징 중요도 수집 # feature_names = 
vector_assembler.getInputCols() feature_importance_list = model.featureImportances for feature_name, feature_importance in zip(feature_names, feature_importance_list): feature_importances[feature_name].append(feature_importance) # # 지표별 평균과 표준편차 평가 및 표로 출력 # import numpy as np score_averages = defaultdict(float) # 표 데이터 계산 average_stds = [] # ha for metric_name in metric_names: metric_scores = scores[metric_name] average_accuracy = sum(metric_scores) / len(metric_scores) score_averages[metric_name] = average_accuracy std_accuracy = np.std(metric_scores) average_stds.append((metric_name, average_accuracy, std_accuracy)) # 표 출력 print("\nExperiment Log") print("--------------") print(tabulate(average_stds, headers=["Metric", "Average", "STD"])) # # 점수를 실행 사이에 존재하는 점수 로그에 유지 # import pickle # 점수 로그를 적재하거나 빈 로그를 초기화 try: score_log_filename = "{}/models/score_log.pickle".format(base_path) score_log = pickle.load(open(score_log_filename, "rb")) if not isinstance(score_log, list): score_log = [] except IOError: score_log = [] # 기존 점수 로그 계산 score_log_entry = { metric_name: score_averages[metric_name] for metric_name in metric_names } # 각 지표에 대한 점수 변화를 계산하고 디스플레이 try: last_log = score_log[-1] except (IndexError, TypeError, AttributeError): last_log = score_log_entry experiment_report = [] for metric_name in metric_names: run_delta = score_log_entry[metric_name] - last_log[metric_name] experiment_report.append((metric_name, run_delta)) print("\nExperiment Report") print("-----------------") print(tabulate(experiment_report, headers=["Metric", "Score"])) # 기존 평균 점수를 로그에 추가 score_log.append(score_log_entry) # 다음 번 실행을 위해 로그 유지 pickle.dump(score_log, open(score_log_filename, "wb")) # # 특징 중요도 변화를 분석하고 보고 # # 각 특징에 대한 평균 계산 feature_importance_entry = defaultdict(float) for feature_name, value_list in feature_importances.items(): average_importance = sum(value_list) / len(value_list) feature_importance_entry[feature_name] = average_importance # 특징 중요도를 내림차순으로 정렬하고 출력 import 
operator sorted_feature_importances = sorted(feature_importance_entry.items(), key=operator.itemgetter(1), reverse=True) print("\nFeature Importances") print("-------------------") print(tabulate(sorted_feature_importances, headers=['Name', 'Importance'])) # # 이번 실행 결과인 특징 중요도를 이전 실행 결과와 비교 # # 특징 중요도 로그를 적재하거나 빈 로그를 초기화 try: feature_log_filename = "{}/models/feature_log.pickle".format(base_path) feature_log = pickle.load(open(feature_log_filename, "rb")) if not isinstance(feature_log, list): feature_log = [] except IOError: feature_log = [] # 각 특징에 대한 점수 변화를 계산하고 디스플레이 try: last_feature_log = feature_log[-1] except (IndexError, TypeError, AttributeError): last_feature_log = defaultdict(float) for feature_name, importance in feature_importance_entry.items(): last_feature_log[feature_name] = importance # 변동 값 계산 feature_deltas = {} for feature_name in feature_importances.keys(): run_delta = feature_importance_entry[feature_name] - last_feature_log[ feature_name] feature_deltas[feature_name] = run_delta # 특징 변동 값을 정렬해 가장 큰 변동이 있는 특징을 먼저 나오게 한다 import operator sorted_feature_deltas = sorted(feature_deltas.items(), key=operator.itemgetter(1), reverse=True) # 정렬된 특징 변동 값 디스플레이 print("\nFeature Importance Delta Report") print("-------------------------------") print(tabulate(sorted_feature_deltas, headers=["Feature", "Delta"])) # 로그에 기존 평균 변동 값을 추가 feature_log.append(feature_importance_entry) # 다음 실행을 위해 로그 유지 pickle.dump(feature_log, open(feature_log_filename, "wb"))
import sys, json, time, collections, operator, math, pickle # hyper-parameter rareword_threshold = 0.000001 if __name__ == "__main__": start_time = time.time() # parse commandline argument train_file_path = sys.argv[1] # input model_file_path = sys.argv[2] # output stopwords_path = sys.argv[3] conf = pyspark.SparkConf().setAppName("Task2Train").setMaster("local[*]") sc = pyspark.SparkContext(conf=conf) sc.setLogLevel("ERROR") stopwords = set(sc.textFile(stopwords_path).collect()) stopwords_broadcast = sc.broadcast(stopwords) def only_keep_letter_space(str): whitelist = set('abcdefghijklmnopqrstuvwxyz ') return "".join(filter(whitelist.__contains__, str.lower())) rawRDD = sc.textFile(train_file_path) \ .map(lambda x: json.loads(x)) \ .map(lambda x: ((x["user_id"], x["business_id"]), x["text"])) \ .mapValues(lambda x: only_keep_letter_space(x)) \ .mapValues(lambda x: list(filter(lambda x: x not in stopwords_broadcast.value, x.split())))
# 1. an empty line at the bottom. # 2. the labels are in the form of strings. We want an integer. # Download dataset (if not already downloaded) filename = "iris.data" temp_filename = filename + '_temp' url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data" if not os.path.isfile(filename): if os.path.isfile(temp_filename): os.remove(temp_filename) download_file(url, temp_filename) os.rename(temp_filename, filename) # We use pyspark to filter empty lines sc = pyspark.SparkContext(master='local[*]', appName='iris') data = sc.textFile('iris.data') filtered_data = data.filter(lambda x: len(x) > 0) # Define Input Schema input_schema = Schema() input_schema.add_double_column('Sepal length') input_schema.add_double_column('Sepal width') input_schema.add_double_column('Petal length') input_schema.add_double_column('Petal width') input_schema.add_categorical_column( "Species", ["Iris-setosa", "Iris-versicolor", "Iris-virginica"]) # Define Transform Process tp = TransformProcess(input_schema) tp.categorical_to_integer("Species")
#!/usr/bin/env python import pyspark import tarfile from io import BytesIO import sys def extractFiles(incoming_bytes): tar = tarfile.open(fileobj=BytesIO(incoming_bytes), mode="r:gz") return [tar.extractfile(x).read() for x in tar if x.isfile()] if len(sys.argv) != 3: raise Exception("Exactly 2 arguments are required: <outputUri>") sc = pyspark.SparkContext(appName="ana") gzfiles = sc.binaryFiles(sys.argv[1]) values = gzfiles.flatMap(lambda x : extractFiles(x[1])).filter(lambda x: len(x)>1).map(lambda x: x.decode("latin-1")) words = values.flatMap(lambda line: line.split(" ")).filter(lambda x: (len(x) > 1) & x.isalpha()) anagrams = words.map(lambda word: (''.join(sorted(word.lower())), set([word.lower()]))).reduceByKey(lambda a,b: a.union(b) ).filter(lambda x: len(x[1]) > 1).map(lambda x: x[1]) anagrams.saveAsTextFile(sys.argv[2]) sc.stop()
def main():
    """Run distributed image inference over S3-hosted PNGs with MXNet on Spark.

    Fetches PNG keys from S3, pads them so they divide evenly into batches,
    runs batched prediction on the executors, and optionally writes the
    predictions back to S3.
    """
    # Note: It is important to import the libraries needed within the function
    # so Spark does not attempt serializing the libraries to all the workers,
    # otherwise it could fail during Serialization/Deserialization
    # using the pickle methods.
    from mxinfer import load_images
    from mxinfer import predict

    from utils import get_args
    from utils import get_s3client
    from utils import fetch_s3_keys
    from utils import download_objects
    from utils import upload_file

    args = get_args()
    logger.info('received arguments:{}'.format(args))

    conf = SparkConf().setAppName(
        "Distributed Inference using MXNet and Spark")
    # we will set the number of cores per executor to 1 to force Spark to create
    # only one task per executor since MXNet efficiently uses all the cpus on the
    # system for inference
    conf.set('spark.executor.cores', '1')

    sc = pyspark.SparkContext(conf=conf)
    logger.info("Spark Context created")

    s3_client = get_s3client(args['access_key'], args['secret_key'])
    keys = fetch_s3_keys(args['bucket'], args['prefix'], s3_client)

    # filter out only png images.
    # you can also choose to check the content-type headers by doing
    # a head call against each S3-Key
    # BUGFIX: materialize the filter into a list — in Python 3 `filter`
    # returns a lazy iterator, so the len()/extend()/slicing calls below
    # would raise TypeError. (Also correct on Python 2.)
    keys = list(filter(lambda x: x.endswith('.png'), keys))

    # number of keys
    n_keys = len(keys)
    if n_keys < args['batch']:
        args['batch'] = n_keys
    n_partitions = n_keys // args['batch']

    logger.info('number of keys from s3: {}'.format(n_keys))

    # if keys cannot be divided by args['batch'], pad the tail with keys
    # from the front so the final batch is full; the padding is dropped
    # again after collect().
    if (n_partitions * args['batch'] != n_keys):
        keys.extend(keys[:args['batch'] - (n_keys - n_partitions * args['batch'])])

    logger.debug('Keys:{}'.format(keys))
    n_partitions = len(keys) // args['batch']
    logger.info("number of keys:{}, n_partitions:{}".format(
        len(keys), n_partitions))

    # we will create partitions of args['batch']
    rdd = sc.parallelize(keys, numSlices=n_partitions)
    logger.info('created rdd with {} partitions'.format(
        rdd.getNumPartitions()))

    sc.broadcast(args['bucket'])
    rdd = rdd.mapPartitions(lambda k: download_objects(args['bucket'], k))
    rdd = rdd.mapPartitions(load_images)

    sc.broadcast(args)
    rdd = rdd.mapPartitions(lambda imgs: predict(imgs, args))

    output = rdd.collect()

    # drop the extra keys that we added to fill the last batch
    keys = keys[:n_keys]
    output = output[:n_keys]

    logger.info("predictions: {}".format(output))

    if args['output_s3_key'] and args['output_s3_bucket']:
        with open('/tmp/' + args['output_s3_key'], 'w+') as f:
            for k, o in zip(keys, output):
                f.write("Key %s: Prediction: %s\n" % (k, o))
        upload_file(args['output_s3_bucket'], args['output_s3_key'],
                    '/tmp/' + args['output_s3_key'], s3_client)
#!/usr/bin/env python import pyspark import sys inputUri = 'gs://dataproc-ed3c3d29-fb10-47bb-aca7-dcc358c68973-us-central1/input/map.txt' outputUri = 'gs://dataproc-ed3c3d29-fb10-47bb-aca7-dcc358c68973-us-central1/output' grid = [] def parseRDD(line): parsedLine = list(line) row = [] for i in range(len(parsedLine)): row.append(0) grid.append(row) return parsedLine sc = pyspark.SparkContext() lines = sc.textFile(inputUri) words = lines.flatMap(parseRDD) print(grid) wordCounts = words.map(lambda word: (word, 1)).reduceByKey( lambda count1, count2: count1 + count2) wordCounts.saveAsTextFile(outputUri)
import pyspark import sys print("Hello, world!") master_url = sys.argv[1] if len(sys.argv) >= 2 else "local" sc = pyspark.SparkContext(master_url) msg = "Hello, pyspark!" rdd = sc.parallelize(list(msg)) print(''.join(rdd.collect()))
import pyspark sc = pyspark.SparkContext(appName="myAppName") quijote = sc.textFile("quijote.txt") palabrasrdd = quijote.flatMap(lambda x: x.split(" ")) # Se cuenta las lineas que llevan la cadena Quijote filtro1 = palabrasrdd.filter(lambda l: "Quijote" in l).count() # Se cuenta las lineas que llevan la cadena Sancho filtro2 = palabrasrdd.filter(lambda l: "Sancho" in l).count() # Se cuenta las lineas que llevan la cadena Rocinante filtro3 = palabrasrdd.filter(lambda l: "Rocinante" in l).count() print("Quijote: " + str(filtro1)) print("Sancho: " + str(filtro2)) print("Rocinante: " + str(filtro3))
def py_sc(self): return pyspark.SparkContext(master='local[*]', appName='test')