# Count the raw trade rows and report the latest trade date, ignoring the
# configured market holidays.
import pandas as pd  # was used below (pd.set_option) without being imported
from pyspark import SparkContext, SparkConf
from pyspark.sql import DataFrame, HiveContext  # SQLContext
# NOTE: the wildcard import below replaces builtins such as max() with the
# Spark column functions of the same name; max('Date') further down relies on it.
from pyspark.sql.functions import *
from pyspark.sql.types import *

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# spark://10.160.5.48:7077 or local[*] to use as many threads as cores
sconf = (SparkConf()
         .setMaster("local[32]")
         .setAppName("TradeDataCount")
         .set("spark.driver.maxResultSize", "32g")
         .set("spark.shuffle.consolidateFiles", "true"))
sc = SparkContext(conf=sconf)
hc = HiveContext(sc)  # a SQL/Hive context must exist before RDD.toDF() can be used

dataset = sc.pickleFile('raw_dataset_rdd.pickle')
dataset = dataset.toDF()

# better to exclude files when doing initial parsing?
holidays = ["2016-05-30", "2016-07-04", "2016-09-05"]

data = (dataset
        .select(['ProductName', 'Maturity', 'Date', 'TimeStamp',
                 hour("TimeStamp").alias("Hour"), 'Price', 'Quantity'])
        # ~isin(...) is the idiomatic spelling of "isin(...) == False"
        .where(~dataset.Date.isin(holidays))
        .cache())
print("Raw data is %d rows." % data.count())

# first() returns a one-field Row; index it explicitly instead of relying on
# %-formatting treating the Row as an argument tuple.
max_date = data.select(max('Date')).first()
print("Max date is %s." % max_date[0])

data.registerTempTable("RawData")
# check holidays and weather index gone
# Train a gradient-boosted-trees regressor on pickled bag-of-words data and
# print the root mean squared error on the test split.
import math
import pickle
import sys

from pyspark import SparkConf, SparkContext
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
from pyspark.mllib.util import MLUtils

# Must be set before the SparkContext is created.
SparkContext.setSystemProperty("hadoop.home.dir", "C:\\spark-1.5.1-bin-hadoop2.6\\")

conf = SparkConf().setAppName('random-forest')
sc = SparkContext(conf=conf)

# Directory holding the bow_train/ and bow_test/ pickle outputs
# (renamed from `input`, which shadowed the builtin).
input_dir = sys.argv[1]


# Load and parse the data
def parsePoint(line):
    """Turn a (features, label) record into a LabeledPoint."""
    return LabeledPoint(float(line[1]), line[0])


def hasFeatures(point):
    """Return True when the point carries at least one feature.

    The original predicate also evaluated len(point.label); the label is a
    float, so that raised TypeError for exactly the records the filter was
    meant to discard (those with empty features).
    """
    return len(point.features) != 0


train = sc.pickleFile(input_dir + '/bow_train/part-00000')
test = sc.pickleFile(input_dir + '/bow_test/part-00000')

parsedtrain = train.map(parsePoint).filter(hasFeatures)
parsedtest = test.map(parsePoint).filter(hasFeatures).cache()

model = GradientBoostedTrees.trainRegressor(
    parsedtrain, categoricalFeaturesInfo={}, numIterations=1)

predictions = model.predict(parsedtest.map(lambda x: x.features))
labelsAndPredictions = parsedtest.map(lambda lp: lp.label).zip(predictions)
# Index-based access instead of a tuple-parameter lambda, which is Python 2 only.
val_err = (labelsAndPredictions
           .map(lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1]))
           .sum() / float(parsedtest.count()))
parsedtest.unpersist()

RMSE = math.sqrt(val_err)
print("Root Mean Squared Error Test= " + str(RMSE))
class SparkFEProcess:
    """Add cross-feature count statistics to the pickled action-log data.

    For each configured combination of columns the class counts how often
    the combination occurs in the training set, derives a min-max scaled
    ratio and a percentile bin from that count, joins both onto the train
    and test frames, and finally saves the two frames as pickle files.
    """

    # Percentile cut points used to bin the count of a cross feature.
    _DEFAULT_PERCENTS = [0, 50, 75, 90, 95, 100]
    _PERCENTS_BY_FEATURE = {
        "uid_author_id": [0, 90, 95, 98, 100],
        "uid_music_id": [0, 75, 90, 95, 98, 100],
        "uid_device": [0, 25, 50, 75, 90, 100],
        "author_id_user_city": [0, 75, 90, 95, 98, 100],
        "author_id_music_id": [0, 75, 90, 95, 98, 100],
    }

    def __init__(self):
        self.parser = self.init_config()
        sparkConf = SparkConf().setAppName("feature engineering on spark of explore_spark_cross") \
            .set("spark.ui.showConsoleProgress", "false")
        self.sc = SparkContext(conf=sparkConf)
        # NOTE(review): the broadcast handle is discarded, so nothing in this
        # class reads the broadcast copy of the parser.
        self.sc.broadcast(self.parser)
        self.init_logger()
        # # initialise related parameters
        # # bins_dict stores the binning scheme per column, for use on the test data
        # self.bins_dict={}

    def init_config(self):
        """Read ``resource/config.ini`` and return the parser.

        The workspace root is everything in this file's directory path that
        precedes the first ``featureEngineering`` component.
        """
        current_path = os.path.dirname(os.path.realpath(__file__))
        workspace_path = current_path.split('featureEngineering')[0]
        config_file = workspace_path + 'resource/config.ini'
        parser = configparser.ConfigParser()
        parser.read(config_file)
        return parser

    def init_logger(self):
        """Raise the log4j level of the "org", "akka" and root loggers to ERROR."""
        logger = self.sc._jvm.org.apache.log4j
        logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
        logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)
        logger.LogManager.getRootLogger().setLevel(logger.Level.ERROR)

    def read_rdd(self, fileName):
        """Return a text RDD for ``fileName`` under the configured HDFS data path.

        Any exception is printed and swallowed, in which case the method
        returns None.
        """
        try:
            file_path = self.parser.get("hdfs_path", "hdfs_data_path") + fileName
            data_rdd = self.sc.textFile(file_path)
            return data_rdd
        except Exception as e:
            print(e)

    def data_describe(self):
        """Load the pickled train/test action logs as DataFrames.

        Both frames get the fixed schema below; the two
        ``duration_time_count_*`` columns are dropped again after loading.

        :return: ``(df_actLog_train, df_actLog_test)``
        """
        sqlContext = SQLContext(self.sc)
        rootPath = self.parser.get("hdfs_path", "hdfs_data_path")
        print('start to read actLog_single ,and to deal with cross_feature')
        train_file_path = rootPath + 'actLog_train_single'
        test_file_path = rootPath + 'actLog_test_single'
        actLog_train_rdd = self.sc.pickleFile(train_file_path)
        actLog_test_rdd = self.sc.pickleFile(test_file_path)
        # column names and types, in the order of the pickled tuples
        labels = [
            ('duration_time', typ.IntegerType()),
            ('device', typ.IntegerType()),
            ('music_id', typ.IntegerType()),
            ('item_city', typ.IntegerType()),
            ('author_id', typ.IntegerType()),
            ('item_id', typ.IntegerType()),
            ('user_city', typ.IntegerType()),
            ('uid', typ.IntegerType()),
            ('channel', typ.IntegerType()),
            ('finish', typ.IntegerType()),
            ('like', typ.IntegerType()),
            ('time_day', typ.IntegerType()),
            ('item_pub_month', typ.IntegerType()),
            ('item_pub_day', typ.LongType()),
            ('item_pub_hour', typ.IntegerType()),
            ('item_pub_minute', typ.IntegerType()),
            ('uid_count_bin', typ.IntegerType()),
            ('user_city_count_bin', typ.IntegerType()),
            ('user_city_count_ratio', typ.DoubleType()),
            ('item_id_count_bin', typ.IntegerType()),
            ('item_id_count_ratio', typ.DoubleType()),
            ('author_id_count_bin', typ.IntegerType()),
            ('author_id_count_ratio', typ.DoubleType()),
            ('item_city_count_bin', typ.IntegerType()),
            ('item_city_count_ratio', typ.DoubleType()),
            # NOTE(review): the trailing ':' looks like a typo, but it is part
            # of the column name consumers see, so it is left unchanged here.
            ('music_id_count_bin:', typ.IntegerType()),
            ('music_id_count_ratio', typ.DoubleType()),
            ('device_count_bin', typ.IntegerType()),
            ('device_count_ratio', typ.DoubleType()),
            # step1_single produced these two extra fields; they are not
            # needed, so they are dropped right after loading
            ('duration_time_count_bin', typ.IntegerType()),
            ('duration_time_count_ratio', typ.DoubleType())
        ]
        actionLogSchema = typ.StructType(
            [typ.StructField(e[0], e[1], True) for e in labels])

        df_actLog_train = sqlContext.createDataFrame(actLog_train_rdd, actionLogSchema)
        df_actLog_test = sqlContext.createDataFrame(actLog_test_rdd, actionLogSchema)
        df_actLog_train = df_actLog_train.drop('duration_time_count_bin').drop(
            'duration_time_count_ratio')
        df_actLog_test = df_actLog_test.drop('duration_time_count_bin').drop(
            'duration_time_count_ratio')
        # df_actLog_train.show(5,truncate=False)
        df_actLog_train.printSchema()
        # df_actLog_test.show(5,truncate=False)
        df_actLog_test.printSchema()
        return df_actLog_train, df_actLog_test

    def bining(self, sqlContext, df, col, percent_list):
        """Replace ``col`` by its percentile bin index and rename it ``col + '_bin'``.

        The frame is converted to pandas, so it has to fit in driver memory.

        :param sqlContext: context used to build the returned Spark frame
        :param df: Spark DataFrame containing ``col``
        :param col: name of the column to bin
        :param percent_list: percentiles (0-100) whose values become the bin edges
        :return: a new Spark DataFrame created from the binned pandas frame
        """
        pandas_df = df.toPandas()
        # e.g. the 20th percentile is the value that at least 20% of the data
        # is less than or equal to
        bins = [np.percentile(pandas_df.loc[:, col], percent)
                for percent in percent_list]
        print(col + '查看分箱')
        print(bins)
        pandas_df.loc[:, col] = np.digitize(pandas_df.loc[:, col], bins, right=True)
        # rename the column on the pandas side
        pandas_df.rename(columns={col: col + '_bin'}, inplace=True)
        df_spark = sqlContext.createDataFrame(pandas_df)
        # df_spark.show()
        return df_spark

    # def city_col_deal(self,df,col):
    #     df_city_score=df.groupBy(col).avg('finish', 'like') \
    #         .withColumnRenamed("avg(finish)","avg_finish").withColumnRenamed("avg(like)","avg_like")
    #     df_city_score=df_city_score.withColumn(col+'_score', df_city_score.avg_finish*0.7+df_city_score.avg_like*0.3)\
    #         .select(col,fn.bround(col+'_score', scale=4).alias(col+'_score'))
    #     return df_city_score

    def dropUnuseCols(self, df, unuse_col):
        """Return ``df`` with every column named in ``unuse_col`` dropped."""
        for col in unuse_col:
            df = df.drop(col)
        return df

    @staticmethod
    def _percent_list_for(new_feature):
        """Return the percentile cut points used to bin ``new_feature``'s count.

        The original code used a run of independent ``if`` statements whose
        final ``else`` belonged only to the last one, so every custom list
        except the last was overwritten by the default. A lookup with a
        default restores the intended per-feature lists.
        """
        return list(SparkFEProcess._PERCENTS_BY_FEATURE.get(
            new_feature, SparkFEProcess._DEFAULT_PERCENTS))

    def _add_cross_feature(self, sqlContext, df_train, df_test, group_cols):
        """Join ``<cross>_count_bin`` / ``<cross>_count_ratio`` for one column combination.

        The statistics are computed from ``df_train`` only and left-joined
        onto both frames; the temporary concatenated key column is dropped.

        :return: ``(df_train, df_test)`` with the two new columns
        """
        dims = len(group_cols)
        new_feature = '_'.join(group_cols)
        count_col = new_feature + '_count'
        print("开始处理%d维交叉变量" % dims)
        df_train = df_train.withColumn(
            new_feature,
            fn.concat_ws('_', *[df_train[c].cast(typ.StringType()) for c in group_cols]))
        df_test = df_test.withColumn(
            new_feature,
            fn.concat_ws('_', *[df_test[c].cast(typ.StringType()) for c in group_cols]))
        df_count = df_train.groupby(new_feature).count() \
            .withColumnRenamed('count', count_col)
        # category-preference ratio: min-max scaled count (one job for both bounds)
        count_min, count_max = df_count.agg(
            fn.min(count_col), fn.max(count_col)).first()
        # F.bround("Rank", scale=4)
        df_count = df_count.withColumn(
            new_feature + '_count_ratio',
            fn.bround(
                ((df_count[count_col] - fn.lit(count_min)) /
                 ((fn.lit(count_max) - fn.lit(count_min)).cast(typ.IntegerType()))),
                scale=3))
        df_count = self.bining(sqlContext, df_count, count_col,
                               self._percent_list_for(new_feature))
        print("查看df%d_2" % dims)
        df_count.show(1, truncate=False)
        df_train = df_train.join(df_count, new_feature, 'left').drop(new_feature)
        print("train")
        df_train.show(1, truncate=False)  # ratio is a continuous variable in the range 0-1
        df_train.printSchema()
        # join first, then drop the key
        df_test = df_test.join(df_count, new_feature, 'left').drop(new_feature)
        print("test")
        df_test.show(1, truncate=False)
        return df_train, df_test

    def data_explore(self, df_train, df_test):
        """Add all cross-feature statistics, fill nulls and save both frames.

        :param df_train: training DataFrame; the counts are computed from it
        :param df_test: test DataFrame; receives the training statistics
        """
        sqlContext = SQLContext(self.sc)
        print('--------2、统计特征:count、ratio、nunique、ctr相关特征')
        print("计算交叉特征的count、类别偏好的ratio")
        count_feats_list = []

        print('cross count')
        users = ['uid']
        authors = ['author_id', 'item_city', 'channel', 'music_id', 'device']
        count_feats_list.extend([[u_col, a_col] for u_col in users for a_col in authors])

        users = ['author_id']
        authors = ['channel', 'user_city', 'item_city', 'music_id']
        count_feats_list.extend([[u_col, a_col] for u_col in users for a_col in authors])

        count_feats_list.append(['uid', 'channel', 'device'])
        count_feats_list.append(['author_id', 'item_city', 'music_id'])
        print("计算count的字段有以下这些")
        print(count_feats_list)

        for group_cols in count_feats_list:
            print("根据上述保存的df_train 和df_test 再处理2维交叉变量")
            # only 2- and 3-column combinations are handled
            if len(group_cols) in (2, 3):
                df_train, df_test = self._add_cross_feature(
                    sqlContext, df_train, df_test, group_cols)

        print("交叉特征处理结束")
        print("查看train的表结构")
        df_train.printSchema()
        # print("删除没有必要的列")
        # unuse_col=['item_city','user_city','device','author_id','music_id',]
        # 'uid' and 'item_id' must not be dropped; they are needed when submitting results
        # df_train=self.dropUnuseCols(df_train,unuse_col)
        # df_test=self.dropUnuseCols(df_test,unuse_col)
        print("表中含有为null的字段,主要产生在leftjoin的时候")
        # The cross-feature statistics are computed from df_train itself, so
        # the left join always finds a match there; only the test frame needs
        # the cross-feature columns filled.
        df_train = df_train.na.fill({
            'user_city_count_bin': 1,
            'user_city_count_ratio': 0
        })
        # user_city_count_bin and device_count_bin are the two fields missed in step1_single
        df_test = df_test.na.fill({
            'user_city_count_bin': 1, 'user_city_count_ratio': 0,
            'device_count_bin': -1, 'device_count_ratio': 0,
            'uid_author_id_count_bin': 1, 'uid_author_id_count_ratio': 0,
            'uid_item_city_count_bin': 1, 'uid_item_city_count_ratio': 0,
            'uid_channel_count_bin': 1, 'uid_channel_count_ratio': 0,
            'uid_music_id_count_bin': 1, 'uid_music_id_count_ratio': 0,
            'uid_device_count_bin': 1, 'uid_device_count_ratio': 0,
            'author_id_channel_count_bin': 1, 'author_id_channel_count_ratio': 0,
            'author_id_user_city_count_bin': 1, 'author_id_user_city_count_ratio': 0,
            'author_id_item_city_count_bin': 1, 'author_id_item_city_count_ratio': 0,
            'author_id_music_id_count_bin': 1, 'author_id_music_id_count_ratio': 0,
            'uid_channel_device_count_bin': 1, 'uid_channel_device_count_ratio': 0,
            'author_id_item_city_music_id_count_bin': 1,
            'author_id_item_city_music_id_count_ratio': 0
        })

        print("查看test缺失值")
        df_test.agg(*[(1 - (fn.count(c) / fn.count('*'))).alias(c + '_missing')
                      for c in df_test.columns]).show()
        print("查看train缺失值")  # just in case a field was overlooked
        df_train.agg(*[(1 - (fn.count(c) / fn.count('*'))).alias(c + '_missing')
                       for c in df_train.columns]).show()

        print('-------5.保存数据预处理结果-------')
        test_file_path = self.parser.get(
            "hdfs_path", "hdfs_data_path") + 'actLog_test_single_cross'
        # remove any previous output first; saveAsPickleFile refuses to overwrite
        os.system("hadoop fs -rm -r {}".format(test_file_path))
        df_test.rdd.map(tuple).saveAsPickleFile(test_file_path)

        del df_test
        gc.collect()

        train_file_path = self.parser.get(
            "hdfs_path", "hdfs_data_path") + 'actLog_train_single_cross'
        os.system("hadoop fs -rm -r {}".format(train_file_path))
        df_train.rdd.map(tuple).saveAsPickleFile(train_file_path)
from PredictionsHandlerFlask import NewsPrediction
import json

if __name__ == "__main__":
    # Print every news item stored in one pickled stream directory.
    conf = SparkConf()
    #conf.set('spark.shuffle.blockTransferService', 'nio')
    conf.set('spark.files.fetchTimeout', '180')
    conf.set('spark.files.overwrite', 'yes')
    conf.set('spark.akka.timeout', '180')
    conf.set('spark.task.maxFailures', '30000')
    conf.set('spark.akka.frameSize', '500')
    conf.set('spark.network.timeout', '180')

    dataDirectory = 'hdfs://157.26.83.52/user/wdroz/stream2'

    # Close the file handle deterministically instead of leaking it.
    with open('myClassifierOnevsOne.p', 'rb') as classifierFile:
        myClassifierOnevsOne = pickle.load(classifierFile)

    dataSetMaker = DataSetMakerV2(n=200000)
    sc = SparkContext(conf=conf)
    newsRDD = sc.pickleFile(dataDirectory + '/2015-05-040')
    print('%d news' % newsRDD.count())
    for news in newsRDD.collect():
        try:
            print(str(news))
        except Exception as printError:
            # Printing stays best effort, but say that an item was skipped
            # rather than hiding it, and let KeyboardInterrupt/SystemExit
            # through (a bare except would swallow those too).
            print('skipped one unprintable news item (%s)' % type(printError).__name__)
# The code from here on is wrapped in a try-finally block to ensure sc.stop() cleans up in # the event of an exception or a ctrl-C termination try: sc = SparkContext(conf=config, appName="acka630") vecdir = sys.argv[2] if sys.argv[2][-1] == "/" else sys.argv[2] + "/" metadir = sys.argv[3] # Load TFIDF vectors from the specified directory, and cache as these will be used # each time a new subject/fold is run tfidfVectorsAll = sc.parallelize([], 16) for tfidf in os.listdir(vecdir): if tfidf[:5] != 'TFIDF': continue vectors = sc.pickleFile(vecdir + tfidf) tfidfVectorsAll = tfidfVectorsAll.union(vectors) numVectors = tfidfVectorsAll.cache().count() print "numVectors:", numVectors # Generate a list of all file IDs and collect to Python fileIdList = tfidfVectorsAll.keys().collect() # Filter the metadata by file ID to just keep relevant file-subject pairs metaData = sc.pickleFile(metadir) \ .filter(lambda x: int(x[0]) in fileIdList)
def main():
    """Train four MLlib classifiers on the pickled data set and print their scores.

    The features are standardised, the data is split 70/30, and logistic
    regression (LBFGS and SGD), a decision tree and a random forest are
    each scored with this file's ``test``, ``fone`` and ``accuracy`` helpers.
    """
    appName = "BadOrGood;zl"
    conf = (SparkConf()
            .setAppName(appName)
            .set("spark.executor.memory", "5g")
            .set("spark.executor.cores", "3")
            # The key was misspelled "spark.executor.instance", which Spark
            # does not recognise, so the setting had no effect.
            .set("spark.executor.instances", "3")
            )
    sc = SparkContext(conf=conf)
    try:
        hc = HiveContext(sc)

        #fetch data
        #filepath = '/sshomework_zl/BadOrGood/AllDataRowrdd'
        #fetchDataToFile(hc, filepath)

        #load data
        # AllDataRawrdd = sc.pickleFile(filepath) \
        #     .map( lambda _: {'label':int(_.status), 'feature':extractFeature(_)} ) \
        #     .repartition(10)
        AllDataRawrdd = sc.pickleFile('/pickleData').repartition(10)

        #standardizer for train and test data
        model = StandardScaler(True, True) \
            .fit(AllDataRawrdd.map(lambda rec: Vectors.dense(rec['feature'])))
        labels = AllDataRawrdd.map(lambda rec: rec['label'])
        featureTransformed = model.transform(AllDataRawrdd.map(lambda rec: rec['feature']))
        AllDataRawrdd = labels \
            .zip(featureTransformed) \
            .map(lambda pair: {'label': pair[0], 'feature': pair[1]})

        #sampling
        trainDataRawrdd, testDataRawrdd = AllDataRawrdd.randomSplit(
            weights=[0.7, 0.3], seed=100)
        trainDatardd = trainDataRawrdd.map(
            lambda rec: LabeledPoint(rec['label'], rec['feature'])).persist()
        testDatardd = testDataRawrdd.map(
            lambda rec: {'label': rec['label'], 'feature': list(rec['feature'])}).persist()

        #prediction & test
        lrmLBFGS = LogisticRegressionWithLBFGS.train(
            trainDatardd, iterations=3000, regParam=0.01, regType="l1")
        resultrdd = test(lrmLBFGS, testDatardd)
        lrmLBFGSFone = fone(resultrdd)
        lrmLBFGSac = accuracy(resultrdd)

        lrmSGD = LogisticRegressionWithSGD.train(
            trainDatardd, iterations=3000, step=0.1, regParam=0.01, regType="l1")
        resultrdd = test(lrmSGD, testDatardd)
        lrmSGDFone = fone(resultrdd)
        lrmSGDac = accuracy(resultrdd)

        dt = DecisionTree.trainClassifier(trainDatardd, 2, {}, maxDepth=10)
        resultrdd = test(dt, testDatardd)
        dtFone = fone(resultrdd)
        dtac = accuracy(resultrdd)

        rf = RandomForest.trainClassifier(trainDatardd, 2, {}, 10)
        resultrdd = test(rf, testDatardd)
        rfFone = fone(resultrdd)
        rfac = accuracy(resultrdd)

        # Single-argument print(...) gives identical output on Python 2 and 3.
        print("LR_LBFGS f1 is : %f, ac is : %f" % (lrmLBFGSFone, lrmLBFGSac))
        print("LR_SGD f1 is : %f, ac is : %f" % (lrmSGDFone, lrmSGDac))
        print("Decision Tree f1 is: %f, ac is : %f" % (dtFone, dtac))
        print("Random Forest f1 is: %f, ac is : %f" % (rfFone, rfac))

        print(lrmLBFGS.weights)
        print(lrmSGD.weights)
    finally:
        # Release the context even when training or scoring fails.
        sc.stop()
def create_wordbag(x):
    """Return the stemmed words of ``x['eval_content']`` whose POS tag is not in ``unneeded``.

    Returns an empty list when the record has no content.
    """
    wordbag = []
    if x['eval_content'] is None:
        return wordbag
    # NOTE(review): a tagger is constructed for every record; if that is
    # expensive, consider building it once per partition.
    twitter = Twitter()
    for text in twitter.pos(x['eval_content'], stem=True):
        tag = text[1]
        if tag in unneeded:
            continue
        word = text[0]
        wordbag.append(word)
    return wordbag


# 'eval_id' was listed twice (first and last column, same value). Duplicate
# column names make any later reference to 'eval_id' ambiguous, so the
# redundant trailing copy is removed.
documents = sqlContext.createDataFrame(
    sc.pickleFile('merged_file/part-00000').map(
        lambda x: [x['eval_id'],
                   x['no'],
                   create_wordbag(x),
                   x['professor'],
                   x['lec_code'][:4],
                   x['lec_code'][5],
                   x['eval_total']]),
    ['eval_id', 'no', 'words', 'prof_name', 'department', 'grade', 'eval_total'])

#users = sqlContext.createDataFrame(sc.pickleFile('merged_file').map(lambda x : (x['mb_no'],x['lec_code'][:4])),['user','department']).orderBy('department')
#for u in users.select('department','user').take(10000):
#    print u

'''
professors = documents.select('prof_name').distinct()
department = documents.select('department').distinct()
#grade 1/2/3/4
eval_total = documents.select('eval_total').distinct() # 1/2/3/4/5
for e in eval_total.collect():
    print e
'''
## **** you can jump this part. directly load data in the next part ****** ## print "calculating tfidf ..." tfidf, tags = create_tfidf(sc) dimention = 1000 print "reducing tfidf to " + str(dimention) + "..." save_file = './data/10k_reducedRDD' # just this if you want to load reduced directly in the following section. # reduced = reduce_tfidf(tfidf, dimention) # # use below to save the reduce tfidf RDD # if os.path.exists(save_file): # shutil.rmtree(save_file, ignore_errors=True) # reduced.saveAsPickleFile(save_file) # use below to load reduced tfidf RDD directly. reduced = sc.pickleFile(save_file) # processed = sc.pickleFile('./data/10k_processedRDD') #tune parameters below K = 50 KeepAllTag = False # you can run KNN to get the results by yourself, 1k posts in 60s, 10k posts in 20 mins. KNN(reduced,tags, K, KeepAllTag) # or... you can load the results for KNN directly, there is 1k and 10k version in ./data confution_matrix = load_obj('10k_confution_matrix') perfomanceMatrix = load_obj('10k_perfomanceMatrix') # need some one use this data to make some figures for showing our results. # comment next line out, if you want to save. (note: change the path accordingly) # reduced.saveAsPickleFile('./data/1k_reducedRDD')
from pyspark import SparkContext

# Read the pickled sample RDD and bring its elements back to the driver.
sc = SparkContext()
tmpFile = "sparkdata/srm.data.samples"

# The second positional argument of pickleFile is the minimum partition count.
samplesRDD = sc.pickleFile(tmpFile, 5)
r = samplesRDD.collect()
from gensim.models.doc2vec import Doc2Vec from math import exp from threading import Thread, Event sc = SparkContext() sqlContext = SQLContext(sc) # this is a large object we cache it on each worker node gmod_broadcast = sc.broadcast( Doc2Vec.load("/data/_hndata/doc2vec_model/hn") ) tfidf_model = RandomForestModel.load(sc, "hdfs:///hndata/hnrrmodel_tfidf") doc2vec_model = RandomForestModel.load(sc, "hdfs:///hndata/rrscoremodel") doc2vec_model2 = RandomForestModel.load(sc, "hdfs:///hndata/rrscoremodel2") tf = sc.pickleFile("hdfs:///hndata/tf_pickle") idf = IDF().fit(tf) hashingTF = HashingTF(1000) def pred_tfidf(docs): sents = sc.parallelize(docs).map(lambda d: d.strip().split()) new_tf = hashingTF.transform(sents) tfidf = idf.transform(new_tf) return tfidf_model.predict(tfidf) def pred_doc2vec(docs, takelog=True, cased=False): sents = sc.parallelize(docs) \ .map(lambda d: (d.lower() if not cased else d).strip().split()) def loadDoc2vec(sents):
if __name__ == "__main__":
    # Load the pickled user profiles and compute page-view percentiles.
    #print(ageToGroup)
    print(dayInterval)

    aviser = [
        'an', 'ba', 'dt', 'fb', 'havis', 'oa', 'rb',
        'ta', 'tb', 'nordlys', 'firda', 'glomdalen',
        'mossavis', 'ringblad', 'sb', 'sa',
        'tk', 'op', 'ostlendingen',
    ]

    conf = SparkConf()
    conf = conf.setAppName('konsumprofiler')
    conf = conf.setMaster("local[8]")
    conf = conf.set('spark.app.id', '200')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    user_map = sc.pickleFile('/home/erlenda/data/konsum/konsumprofil-rdd')
    print(user_map.first())

    # One page-view figure per user, collected to the driver.
    total_pvs = user_map.map(lambda profile: profile.pvs).collect()
    #total_visits=user_map.map(lambda x:(x.a_user_key,1.)).collect()
    #print(total_pvs[:1000])

    percs_pageviews = np.percentile(total_pvs, [20., 40., 60., 80.])
    #percs_visits=np.percentile(total_visits,[20.,40.,60.,80.])
    print('Quantiles pageviews:', percs_pageviews)

    percs_pageviews_top = np.percentile(total_pvs, [95., 98., 99.])
    #print('Quantiles visits:',percs_visits)
from pyspark.mllib.feature import IDF
import datetime
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD, LinearRegressionModel
from pyspark.mllib.feature import Normalizer

# Load the pickled train/validation sets and pair the L2-normalised training
# features with their scores.
conf = SparkConf()
conf.setMaster('yarn-client')
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

path = "/Users/sradhakr/Desktop/Assignment3/Assignment3"
# NOTE(review): there is no '/' between `path` and the RDD name, so these
# resolve to ".../Assignment3trainDataRDD" etc. Left as is because the data
# may have been saved under exactly that name; confirm.
train_featureScoreTimeRDD = sc.pickleFile(path + 'trainDataRDD', 10)
val_featureScoreTimeRDD = sc.pickleFile(path + 'valDataRDD', 10)
# (the original loaded both RDDs a second time here; the repeat was redundant)

norm = Normalizer(2)

# Records are read as (feature, score, ...) and accessed by position, which
# also avoids the Python 2 only tuple-parameter lambdas.
train_featuresRDD = train_featureScoreTimeRDD.map(lambda rec: rec[0])
# The scores must come from the source RDD: the original took them from
# train_featuresRDD, whose elements are bare feature vectors, not pairs.
train_scoresRDD = train_featureScoreTimeRDD.map(lambda rec: rec[1])
trainfeatureScoreNormRDD = norm.transform(train_featuresRDD).zip(train_scoresRDD)
# POS tags whose words are left out of the word bag.
unneeded = [u'Unknown', u'KoreanParticle', u'Hashtag', u'ScreenName', u'Number',
            u'Alpha', u'Foreign', u'Punctuation', u'Suffix', u'Eomi', u'PreEomi',
            u'Josa', u'Exclamation']


def create_wordbag(x):
    """Return the stemmed words of ``x['eval_content']`` whose POS tag is not in ``unneeded``.

    Returns an empty list when the record has no content.
    """
    wordbag = []
    if x['eval_content'] is None:
        return wordbag
    twitter = Twitter()
    for text in twitter.pos(x['eval_content'], stem=True):
        tag = text[1]
        if tag in unneeded:
            continue
        word = text[0]
        wordbag.append(word)
    return wordbag


documents = sc.pickleFile('merged_file').map(lambda x: (x['no'], create_wordbag(x)))

htf = HashingTF()
tf_id = documents.mapValues(htf.transform)
# The term-frequency RDD is traversed twice below (IDF fit, then transform).
# Without caching, the POS tagging and hashing would be recomputed for the
# second pass.
tf_id.cache()
#for a in tf_id.take(100):
#    print a
#tf = htf.transform(documents.values())
#tf.cache()

idf = IDF().fit(tf_id.values())
#idf.cache()
# Only the vectors are transformed here, so the document ids are not carried
# along in tfidf_id.
tfidf_id = idf.transform(tf_id.values())
#tf_id.mapValues(idf.transform)

print(type(tfidf_id))
(cv_data_rdd, out_cv_data)]: url = sparkutil.util.s3n_url(S3_BUCKET, S3_PATH, name) sparkutil.util.s3n_delete(url) rdd.saveAsPickleFile(url) pickle.dump({'url': url}, open(name, 'w')) sc.stop() @jobs_limit(1) @transform(spark_run_experiments, suffix('.samples'), '.samples.pickle') def get_samples((exp_samples, exp_cvdata, exp_inits), out_filename): sample_metadata = pickle.load(open(exp_samples, 'r')) sc = SparkContext() results_rdd = sc.pickleFile(sample_metadata['url']) sparkutil.util.save_rdd_elements(results_rdd, out_filename, S3_BUCKET, S3_PATH) sc.stop() @jobs_limit(1) @transform(spark_run_experiments, suffix('.samples'), '.cvdata.pickle') def get_cvdata((exp_samples, exp_cvdata, exp_inits), out_filename): cvdata_metadata = pickle.load(open(exp_cvdata, 'r')) sc = SparkContext() results_rdd = sc.pickleFile(cvdata_metadata['url']) pickle.dump(results_rdd.collect(), open(out_filename, 'w'))
import json

if __name__ == "__main__":
    # Print every news item stored in one pickled stream directory.
    conf = SparkConf()
    #conf.set('spark.shuffle.blockTransferService', 'nio')
    conf.set('spark.files.fetchTimeout', '180')
    conf.set('spark.files.overwrite', 'yes')
    conf.set('spark.akka.timeout', '180')
    conf.set('spark.task.maxFailures', '30000')
    conf.set('spark.akka.frameSize', '500')
    conf.set('spark.network.timeout', '180')

    dataDirectory = 'hdfs://157.26.83.52/user/wdroz/stream2'

    # Close the file handle deterministically instead of leaking it.
    with open('myClassifierOnevsOne.p', 'rb') as classifierFile:
        myClassifierOnevsOne = pickle.load(classifierFile)

    dataSetMaker = DataSetMakerV2(n=200000)
    sc = SparkContext(conf=conf)
    newsRDD = sc.pickleFile(dataDirectory + '/2015-05-040')
    print('%d news' % newsRDD.count())
    for news in newsRDD.collect():
        try:
            print(str(news))
        except Exception as printError:
            # Printing stays best effort, but say that an item was skipped
            # rather than hiding it, and let KeyboardInterrupt/SystemExit
            # through (a bare except would swallow those too).
            print('skipped one unprintable news item (%s)' % type(printError).__name__)
# Build a broadcast word -> feature-vector dictionary for the words that occur
# in the cleaned reviews.
sqlContext = SQLContext(sc)
path = "/Users/sradhakr/Desktop/Assignment3/Assignment3"
df = sqlContext.read.json(path + '/reviews_Pet_Supplies_p1.json')
reviewDF = df.select("overall", "reviewText", "reviewTime")


def removePunctuation(text):
    """Replace every character that is not an ASCII letter with a space."""
    return re.sub("[^a-zA-Z]", " ", text)


# Go through .rdd explicitly: DataFrame.map only exists in Spark 1.x, while
# DataFrame.rdd.map works on 1.x and later versions alike.
cleanedReviewRDD = reviewDF.rdd.map(
    lambda row: (row.overall,
                 removePunctuation(row.reviewText).lower().split(),
                 row.reviewTime))

reviewRDD = sc.pickleFile(path + '/P2CleanedRDD', 10)
uniqueWordsRDD = (reviewRDD
                  .flatMap(lambda words: words)
                  .distinct()
                  .map(lambda word: (word, 1)))
word2VecRDD = sqlContext.read.parquet(path + "/word2vec/data")
# Joined records look like (word, (1, features)); positional access replaces
# the Python 2 only nested tuple-parameter lambda.
wordsFeaturesDict = sc.broadcast(
    uniqueWordsRDD.join(word2VecRDD.rdd)
    .map(lambda joined: (joined[0], joined[1][1]))
    .collectAsMap())


def getFeature(word):
    """Return the feature vector of ``word`` as a numpy array, or ``[]`` if unknown."""
    knownFeatures = wordsFeaturesDict.value
    # `in` replaces dict.has_key, which exists only on Python 2.
    if word in knownFeatures:
        return np.array(knownFeatures[word])
    return []
year_end = gui.endYear
print(year_end)
num_of_segments = 5
predict_end = gui.predYear  # predict till year

conf = SparkConf()
conf.setMaster("local[4]")
conf.setAppName("damu1000")
conf.set("spark.executor.memory", "4g")
sc = SparkContext(conf=conf)

# read data
lines = sc.pickleFile(".//result")

# filter by country, indicator and period. sort by period
# Records are accessed by position (index 3 = period, index 4 = value), which
# also avoids the Python 2 only tuple-parameter lambdas.
lines = (lines
         .filter(lambda x: x[0] == country and x[2] == indicator and x[4] != ''
                 and x[3] >= year_start and x[3] <= year_end)
         .sortBy(lambda rec: rec[3], True)
         .cache())

# (the original ran an extra lines.take(1) here and discarded the result)
if not lines.take(1):
    print("Index not present for this country. Stopping")
    sc.stop()  # release the context before leaving
    sys.exit()

print(lines.collect())

x = lines.map(lambda rec: rec[3])  # x values: the period
y = lines.map(lambda rec: float(rec[4]))  # y values: the indicator value as float

# num_of_segments = x.count() / 5 #averaging at around 5 points per segment
#--------------------------------------- Find out "break" points in pattern--------------------------------------------------------------------
# assign indexes to y values, increment by 1 and 2 so that elements can be joined to find out diff later.
from pyspark import SparkContext,SparkConf

# Total the runs scored by each batsman and print them ordered by name.
sc = SparkContext(conf=SparkConf().setAppName("Batsman Runs"))


def _batsman_and_runs(delivery):
    """Map one delivery record to a (batsman, runs) pair."""
    return (delivery['batsman'], int(delivery['batsman_runs']))


def _add_runs(left, right):
    """Combine two partial run totals."""
    return left + right


batsmanData = sc.pickleFile("deliveries.pickle")
batsmanData = batsmanData.map(_batsman_and_runs)
batsmanData = batsmanData.reduceByKey(_add_runs)
batsmanData = batsmanData.sortByKey()

for item in batsmanData.collect():
    print(item)
def main(comment_dir, submission_dir, output_dir, author, n): # spark specific setup conf = SparkConf().setAppName('Subreddit Recommender') sc = SparkContext(conf=conf) model = None author_id_rdd = None subreddit_id_rdd = None MODEL_PFP = output_dir + '/pickles/model' AUTHOR_ID_PFP = output_dir + '/pickles/author_id_rdd' SUBREDDIT_ID_PFP = output_dir + '/pickles/subreddit_id_rdd' if os.path.isdir(MODEL_PFP) and os.path.isdir(SUBREDDIT_ID_PFP) and os.path.isdir(AUTHOR_ID_PFP): print 'Loading model...', model = MatrixFactorizationModel.load(sc, MODEL_PFP) author_id_rdd = sc.pickleFile(AUTHOR_ID_PFP) subreddit_id_rdd = sc.pickleFile(SUBREDDIT_ID_PFP) print 'Done!' else: print 'Model not found :(' print 'This will take a while...' # ((author, subreddit), comment_rank) comment_rdd = do_it(sc, comment_dir) # ((author, subreddit), submission_rank) submission_rdd = do_it(sc, submission_dir) # ((author, subreddit),(comment_rank, submission_rank)) total_rdd = submission_rdd.fullOuterJoin(comment_rdd) # (author, subreddit, comment_rank + submission_rank) sum_rdd = total_rdd.map(combine_join_results).cache() author_id_rdd, subreddit_id_rdd, translated_rdd = hash_rating(sum_rdd, sc) print 'Training...', model = ALS.train(translated_rdd, 1) print 'Saving...', model.save(sc, MODEL_PFP) author_id_rdd.saveAsPickleFile(AUTHOR_ID_PFP) subreddit_id_rdd.saveAsPickleFile(SUBREDDIT_ID_PFP) print 'Done!' print 'Getting recommendations...' 
wanted_author_id = author_id_rdd.filter(lambda (a, a_id): str(a) == str(author)).collect() wanted_author_id = int(wanted_author_id[0][1]) products_ratings = model.recommendProducts(wanted_author_id, int(n)) wanted_subreddit_ids = map(lambda x: x.product, products_ratings) wanted_subredits = subreddit_id_rdd.filter(lambda (sub, s_id): s_id in wanted_subreddit_ids).collect() wanted_subredits = map(lambda (sub, s_id): sub, wanted_subredits) print 'author:', author print 'Recommended subreddits:' print wanted_subredits fp_out = open(output_dir + '/recommendation.txt', 'w') fp_out.write('Recommendations for /u/' + author + ':\n') for i, s in enumerate(wanted_subredits): fp_out.write(str(i) + ': /r/' + str(s) + '\n') fp_out.close()
'linear': LinearClassifier(3, 32, 32, 10, 20), 'nn' : NNClassifier(3, 32, 32, 10, 5), 'cnn' : CNNClassifier(3, 32, 32, 10, 3), } classifier = classifiers[name] """ set spark context and RDDs """ master = open("/root/spark-ec2/cluster-url").read().strip() slaves = sum(1 for line in open("/root/spark-ec2/slaves")) conf = SparkConf() conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") conf.set("spark.eventLog.enabled", "TRUE") conf.set("spark.default.parallelism", str(slaves * 2)) conf.set("spark.akka.frameSize", "50") sc = SparkContext(master=master, environment={'PYTHONPATH':os.getcwd()}, conf=conf) trainData = sc.pickleFile("s3n://61c-cnn/" + data, slaves * 4)\ .persist(StorageLevel.MEMORY_AND_DISK_SER) """ run clssifier """ log = open('ec2-' + name + data.strip('train') + '.log', 'w') sys.stdout = Log(sys.stdout, log) if name == 'cnn': classifier.load('snapshot/' + name + '/') s = time() classifier.train(trainData, [], datanum, is_ec2=True) e = time() """ skip validation """ print '[CS61C Project 4] training performane: %.2f imgs / sec' % \ ((datanum * classifier.iternum) / (e - s)) print '[CS61C Project 4] time elapsed: %.2f min' % ((e - s) / 60.0) trainData.unpersist()
dist = dist + p if len(v1) == 0: dist = sum(v[1] ** 2 for v in v2) return dist # ---------------------------------------------------------------- if __name__ == "__main__": conf = SparkConf() conf.setMaster("local[2]") conf.setAppName("ItemBased") conf.set("spark.executor.memory", "4g") sc = SparkContext(conf=conf) sourceFile = sys.argv[1] if len(sys.argv) > 1 else "data/sample1k.txt" similarity = sys.argv[2] if len(sys.argv) > 1 else "cos_sim" rawdata = sc.textFile(sourceFile) users = ( rawdata.map(toRowKey) .aggregateByKey([], fill_row, assemble_row) .map(sort_row) .sortBy(lambda x: len(x[1]), ascending=False) ) testdata = users.take(10) sim = sc.pickleFile(similarity) error = evaluate(testdata, sim) print(error)
# Index the cleaned reviews and split them into text and score/time views.
conf = SparkConf()
conf.setMaster('yarn-client')
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

df = sqlContext.read.json(path + '/reviews_Pet_Supplies_p1.json')
reviewDF = df.select("overall", "reviewText", "reviewTime")


def removePunctuation(text):
    """Replace every character that is not an ASCII letter with a space."""
    return re.sub("[^a-zA-Z]", " ", text)


# Go through .rdd explicitly: DataFrame.map only exists in Spark 1.x, while
# DataFrame.rdd.map works on 1.x and later versions alike.
cleanedReviewRDD = reviewDF.rdd.map(
    lambda row: (row.overall,
                 removePunctuation(row.reviewText).lower().split(),
                 row.reviewTime))

wordsClustersRDD = sc.pickleFile(path + '/WordClustersRDD', 10)

# Records become (index, (score, words, time)). Positional access replaces
# the tuple-parameter lambdas, which are Python 2 only syntax.
reviewRDDWithIndex = (cleanedReviewRDD
                      .zipWithIndex()
                      .map(lambda pair: (pair[1], pair[0]))
                      .cache())
reviewTextWithIndex = reviewRDDWithIndex.map(
    lambda rec: (rec[0], rec[1][1]))
reviewScoreTimeWithIndex = reviewRDDWithIndex.map(
    lambda rec: (rec[0], (rec[1][0], rec[1][2])))


def getKey(item):
    """Sort key: the first element of ``item``."""
    return item[0]


def createSparseVector(histogram):
    # Walks the (index, count) pairs of `histogram` in ascending index order
    # and collects indices and counts into two parallel lists.
    # NOTE(review): no return statement is visible in this excerpt; the
    # function may continue beyond what is shown here.
    indexList = []
    countList = []
    for histogramIndex, count in sorted(histogram, key=getKey):
        indexList.append(histogramIndex)
        countList.append(count)
from pyspark import SparkConf, SparkContext

# Sum the extras (wide, bye, leg-bye and no-ball runs) recorded against each
# bowler in the pickled deliveries RDD and print one (bowler, extras) pair per
# line, ordered by bowler key.
sc = SparkContext(conf=SparkConf().setAppName("Bowler Extras"))


def _bowler_extras(delivery):
    """Turn one delivery record into a (bowler, extras conceded) pair."""
    wides = int(delivery['wide_runs'])
    byes = int(delivery['bye_runs'])
    leg_byes = int(delivery['legbye_runs'])
    no_balls = int(delivery['noball_runs'])
    return (delivery['bowler'], wides + byes + leg_byes + no_balls)


bowlerData = sc.pickleFile('deliveries.pickle')
bowlerData = (
    bowlerData
    .map(_bowler_extras)
    .reduceByKey(lambda total, extras: total + extras)
    .sortByKey()
)

for item in bowlerData.collect():
    print(item)
def _str_to_bool(value):
    """Interpret a command-line string as a boolean flag value.

    ``argparse`` with ``type=bool`` treats every non-empty string (including
    "False") as True; this keeps that behaviour for everything except the
    usual explicit negatives.
    """
    return str(value).strip().lower() not in ('', '0', 'false', 'f', 'no',
                                              'n', 'off')


def _mean_weights(weights_rdd):
    """Keep the 2-element records of *weights_rdd* and average values per key."""
    pairs = weights_rdd.filter(lambda x: len(x) == 2)
    return pairs.groupByKey().mapValues(lambda x: sum(x) / float(len(x)))


def main():
    """Command-line entry point for the distributed RNN-LSTM job.

    Parses the arguments, starts a SparkContext, then either trains on the
    input dataset and averages the per-partition weights (default), or, with
    ``--loadPickle``, reloads previously pickled raw weights and runs the
    operations listed in ``--loadOp`` ('reduce' and/or 'save', separated by
    '|'). Always terminates the process with ``sys.exit(0)``.
    """
    # Parse arguments
    parser = argparse.ArgumentParser(
        description='Distributed RNN-LSTM built on Spark and Tensorflow')
    parser.add_argument('-i', '--input', type=str, required=True,
                        help='Path to dataset')
    parser.add_argument('-t', '--target', type=str, required=True,
                        help='Path to target classes')
    parser.add_argument('-m', '--master', type=str,
                        help='host of master node', default='local')
    parser.add_argument('-sem', '--sparkexecmemory', type=str,
                        help='Spark executor memory', default='4g')
    parser.add_argument('-p', '--partitions', type=int,
                        help='Number of minibatch for dataset', default=4)
    parser.add_argument('-hl', '--numHidden', type=int,
                        help='Number of hidden layers', default=1)
    parser.add_argument('-e', '--epoch', type=int,
                        help='Number of training epoch', default=1)
    parser.add_argument('-o', '--output', type=str,
                        help='Output path', default='temp')
    # type=bool would turn "-lp False" into True, so parse the text instead.
    parser.add_argument('-lp', '--loadPickle', type=_str_to_bool,
                        help='Load weights from a pickle file', default=False)
    parser.add_argument('-lo', '--loadOp', type=str,
                        help='Operation to execute after load',
                        default='reduce')
    parser.add_argument('-gp', '--graphPath', type=str,
                        help='Graph path', default='tmp/graph_default')
    args = vars(parser.parse_args())

    input_path = args['input']
    target_path = args['target']
    master_host = args['master']
    sem = args['sparkexecmemory']
    partitions = args['partitions']
    hidden = args['numHidden']
    epoch = args['epoch']
    output = args['output']
    load = args['loadPickle']
    load_op = args['loadOp'].split('|')

    global COUNT_RUN
    COUNT_RUN = 1

    # Initialize spark.
    # The original modulo yields 0 whenever partitions is a multiple of the
    # core count, which would request "local[0]"; fall back to every core.
    cpu_total = multiprocessing.cpu_count()
    workers = (partitions % cpu_total) or cpu_total
    workers_master = '[%d]' % workers
    # The "[n]" thread suffix is only valid for the plain local master; a
    # cluster URL such as spark://host:7077 must be passed through untouched.
    if master_host == 'local':
        master_url = master_host + workers_master
    else:
        master_url = master_host
    conf = SparkConf().setMaster(master_url).setAppName("RNN-LSTM").set(
        "spark.executor.memory", sem)
    print('Total workers:  %s' % workers_master)
    print('Spark executor memory:  %s' % sem)
    sc = SparkContext(conf=conf)
    quiet_logs(sc)

    with open(target_path, 'rb') as t_f:
        target = json.load(t_f)
    target = map_target(target)

    if not load:
        # Read dataset into RDD as csv
        training_rdd = textToRDDCsv(sc, input_path, partitions)
        minibatch_rdd = training_rdd.partitionBy(partitions + 1)
        # FOR NOW OK
        # It is simple to extend multilayer lstm to support different
        # settings on multiple layers
        multilayer_props = [
            dict(layer_name='1', layer_type='lstm', dim_size=-1,
                 num_hidden=hidden, normalize=True)
        ]
        start = time.time()
        weights_rdd = minibatch_rdd.mapPartitions(
            lambda x: train_rnn(
                x, multilayer_props, epoch=epoch, target=target), True)
        # Return weights and average them row by row
        weights_mean_rdd = _mean_weights(weights_rdd)
        if output == 'temp':
            print('No output directory defined using temp')
        weights_mean_rdd.collect()
        elapsed = time.time() - start
        print('RNN-LSTM - Total Processing Time (with weight averaging): %f'
              % elapsed)
        print('RNN-LSTM - Total Processing Time (with repartition) %f' % (
            time.time() - start))
    else:
        weights_mean_rdd = None
        if 'reduce' in load_op:
            print('REDUCING')
            weights_rdd = sc.pickleFile(input_path + '_raw', partitions)
            print(weights_rdd.collect())
            # Mean row by row
            weights_mean_rdd = _mean_weights(weights_rdd)
            for i, d in enumerate(weights_mean_rdd.collect()):
                print('%s  -  %s' % (i, d))
            print('')
            print('')
        if 'save' in load_op:
            if output == 'temp':
                print('No output directory defined using temp')
            if weights_mean_rdd is None:
                # 'save' requested without 'reduce': the averaged weights were
                # never computed on this path, so build them from the raw dump
                # instead of failing with a NameError.
                weights_mean_rdd = _mean_weights(
                    sc.pickleFile(input_path + '_raw', partitions))
            weights_mean_rdd.saveAsPickleFile(output + '_mean')
            print(weights_mean_rdd.collect())
    # SHOULD CONTINUE
    sys.exit(0)
# start_map = time() # RDD_IDF = sc.pickleFile('IDF_RAW.RDD') ## RDD_IDF = RDD_IDF.map(lambda (word, count): (word, ( N / count))) # RDD_IDF = RDD_IDF.map(lambda (word, count): (word, np.log( N / count))) # save_name = 'IDF.RDD' # call(["rm", "-rf", save_name]) # RDD_IDF.saveAsPickleFile(save_name) # finish_map = time() # print "Mapped, took ",finish_map - start_map # Part 1f - create TF.IDF if MODE == 'TF.IDF': print "" print "CREATING TF.IDFs" print >> runtimes_file,"CREATING TF.IDF" RDD_IDF = sc.pickleFile('IDF.RDD') TFIDF.create_tfidf(sc, RDD_IDF, diag_file, runtimes_file, stop_after, batch_size, report_diagnostics) # Part 1d/e/f - Do the whole thing for 'stop_after' files if MODE == 'ALL': print "" print "CREATING TFs" print >> runtimes_file,"CREATING TFs" TF.create_tf(data_path, sc, diag_file, runtimes_file, stop_after, batch_size, report_diagnostics) print "" print "CREATING IDF" print >> runtimes_file,"CREATING IDF" RDD_IDF = sc.parallelize([]) RDD_IDF = IDF.create_idf(sc, RDD_IDF, diag_file, runtimes_file, stop_after, batch_size, report_diagnostics) print "" print "CREATING TF.IDFs"
else: print 'Using Pre-Pickled Files\n' # End Timer for this phase WordFreq_Time = time() - WordFreq_Time print('############ Processing Completed ##############') print('################################################\n') print('################################################') print('############## Word Freq to IDF RDD ############\n') # Start Timer IDF_Time = time() # Ascertain if Section has already been completed if len(getDirectory(directory[3])) < 1: allFolders = getDirectory(directory[2]) # Load in Word Frequency Pickles into one RDD IDF = sc.union([sc.pickleFile(i) for i in allFolders]) # Rearrange RDD into correct the correct format IDF = IDF.flatMap(lambda x_y: [(pair[0], [[x_y[0], str(pair[1])]]) for pair in x_y[1]]) \ .reduceByKey(add) \ .map(lambda x_y1: (x_y1[0], len(x_y1[1]), float(N), x_y1[1])) \ .map(lambda x_y_z_a: (x_y_z_a[0], np.log2(x_y_z_a[2] / x_y_z_a[1]), x_y_z_a[3])) \ .repartition(8) # Save IDF RDD as a Pickle File IDF.saveAsPickleFile(directory[4], 50) else: print 'Using Pre-Pickled Files\n' # End Timer for this phase IDF_Time = time() - IDF_Time print('############ Processing Completed ##############') print('################################################\n')
# cluster(group_res[1][1], group_res[1][0], st, global_dict.value) # testing group with length of 9 """ # print("Test clustering") # group_res = group_rdd.collect() # # cluster_two_pass(group_res[1][1], group_res[1][0], st, global_dict.value) # testing group with length of 9 # cluster(group_res[1][1], group_res[1][0], st, global_dict.value) # testing group with length of 9 print("Working on clustering") cluster_rdd = group_rdd.map( lambda x: cluster_two_pass(x[1], x[0], st, global_dict.value)) cluster_rdd.saveAsPickleFile( path_save_res) # save all the cluster to the hard drive cluster_rdd_reload = sc.pickleFile( path_save_res).collect() # here we have all the clusters in memory # first_dict = cluster_rdd_reload[0] print("clustering done") # plot all the clusters # plot_cluster(cluster_rdd_reload, 2, time_series_dict, 5) """ ##### query Current implementation: if we want to find k best matches, we give the first k best matches for given sequence length range The following line is for testing querying on one cluster # query_result = query(query_sequence, cluster_rdd_reload[0], k, time_series_dict.value) """ # # '(001-SART-August2017-MB)_(211-Current-Item:-3)_(A-DC1)_(64434.0)_(105950.0)'
from pyspark.mllib.linalg import Vectors
import sys
import math

# Spark entry points for the tf-idf regression job.
conf = SparkConf().setAppName('tf-idf')
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

# Positional arguments: path of the pickled training RDD, then the test RDD.
train = sys.argv[1]
test = sys.argv[2]


def parsePoint(line):
    """Build a LabeledPoint from a record: label = float(line[1]), features = line[0]."""
    label = float(line[1])
    features = line[0]
    return LabeledPoint(label, features)


# Load both datasets and convert every record to a LabeledPoint.
train_data = sc.pickleFile(train)
parsedData = train_data.map(parsePoint)

test_data = sc.pickleFile(test)
parsedtestData = test_data.map(parsePoint)

# Cross-validation settings and the running "best so far" bookkeeping.
num_iterations = 100
step_size = [0.1, 10, 20, 300]
best_error = 1000000
best_model = [0]
best_step = 0
best_test_error = 0
best_split = []
best_RMSE = 0
class SparkFEProcess:
    """Feature-engineering job (step 2): positive-rate features for crossed columns.

    Reads the pickled step-1 train/test RDDs, adds, for each crossed feature
    and each target ("like", "finish"), the share of training rows whose
    target equals 1, and writes both frames back as pickle files.
    """

    def __init__(self):
        """Load the configuration, start the SparkContext and quiet the logs."""
        self.parser = self.init_config()
        sparkConf = SparkConf().setAppName("feature engineering on spark of explore_spark_2") \
            .set("spark.ui.showConsoleProgress", "false")
        self.sc = SparkContext(conf=sparkConf)
        # NOTE(review): the broadcast handle is discarded, so this call has no
        # visible consumer in this class -- confirm it is still needed.
        self.sc.broadcast(self.parser)
        self.init_logger()

    def init_config(self):
        """Return a ConfigParser loaded from ``<workspace>/resource/config.ini``.

        The workspace is the part of this file's directory path that precedes
        'featureEngineering'. A missing file yields an empty parser because
        ``ConfigParser.read`` ignores unreadable paths.
        """
        current_path = os.path.dirname(os.path.realpath(__file__))
        workspace_path = current_path.split('featureEngineering')[0]
        config_file = workspace_path + 'resource/config.ini'
        parser = configparser.ConfigParser()
        parser.read(config_file)
        return parser

    def init_logger(self):
        """Set the 'org', 'akka' and root log4j loggers to ERROR level."""
        logger = self.sc._jvm.org.apache.log4j
        logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
        logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)
        logger.LogManager.getRootLogger().setLevel(logger.Level.ERROR)

    def read_rdd(self, fileName):
        """Return a text RDD for *fileName* under the configured HDFS data path.

        Any exception is printed and swallowed, in which case None is returned.
        """
        try:
            file_path = self.parser.get("hdfs_path", "hdfs_data_path") + fileName
            data_rdd = self.sc.textFile(file_path)
            return data_rdd
        except Exception as e:
            print(e)

    @staticmethod
    def _pos_neg_column(group_cols, target):
        """Name of the positive-rate column for a crossed feature and a target."""
        return '_'.join(group_cols) + "_" + target + "_pos_neg"

    def data_describe(self):
        """Load the step-1 pickled RDDs and return ``(df_train, df_test)``.

        Both frames are built with the same explicit, all-nullable schema.
        """
        sqlContext = SQLContext(self.sc)
        print('starto read data after explore_saprk_step1_cross:')
        rootPath = self.parser.get("hdfs_path", "hdfs_data_path")
        # The original message named the train set although the test set is read.
        print('start to read actLog_test_single_cross')
        test_file_path = rootPath + 'actLog_test_single_cross'
        actLog_test_rdd = self.sc.pickleFile(test_file_path)
        # Column names and types; compare against the data to check they fit.
        labels = [
            ('duration_time', typ.IntegerType()),
            ('device', typ.IntegerType()),
            ('music_id', typ.IntegerType()),
            ('item_city', typ.IntegerType()),
            ('author_id', typ.IntegerType()),
            ('item_id', typ.IntegerType()),
            ('user_city', typ.IntegerType()),
            ('uid', typ.IntegerType()),
            ('channel', typ.IntegerType()),
            ('finish', typ.IntegerType()),
            ('like', typ.IntegerType()),
            ('time_day', typ.IntegerType()),
            ('item_pub_month', typ.IntegerType()),
            ('item_pub_day', typ.LongType()),
            ('item_pub_hour', typ.IntegerType()),
            ('item_pub_minute', typ.IntegerType()),
            ('uid_count_bin', typ.IntegerType()),
            ('user_city_count_bin', typ.IntegerType()),
            ('user_city_count_ratio', typ.DoubleType()),
            ('item_id_count_bin', typ.IntegerType()),
            ('item_id_count_ratio', typ.DoubleType()),
            ('author_id_count_bin', typ.IntegerType()),
            ('author_id_count_ratio', typ.DoubleType()),
            ('item_city_count_bin', typ.IntegerType()),
            ('item_city_count_ratio', typ.DoubleType()),
            ('music_id_count_bin', typ.IntegerType()),
            ('music_id_count_ratio', typ.DoubleType()),
            ('device_count_bin', typ.IntegerType()),
            ('device_count_ratio', typ.DoubleType()),
            ('uid_author_id_count_bin', typ.IntegerType()),
            ('uid_author_id_count_ratio', typ.DoubleType()),
            ('uid_item_city_count_bin', typ.IntegerType()),
            ('uid_item_city_count_ratio', typ.DoubleType()),
            ('uid_channel_count_bin', typ.IntegerType()),
            ('uid_channel_count_ratio', typ.DoubleType()),
            ('uid_music_id_count_bin', typ.IntegerType()),
            ('uid_music_id_count_ratio', typ.DoubleType()),
            ('uid_device_count_bin', typ.IntegerType()),
            ('uid_device_count_ratio', typ.DoubleType()),
            ('author_id_channel_count_bin', typ.IntegerType()),
            ('author_id_channel_count_ratio', typ.DoubleType()),
            ('author_id_user_city_count_bin', typ.IntegerType()),
            ('author_id_user_city_count_ratio', typ.DoubleType()),
            ('author_id_item_city_count_bin', typ.IntegerType()),
            ('author_id_item_city_count_ratio', typ.DoubleType()),
            ('author_id_music_id_count_bin', typ.IntegerType()),
            ('author_id_music_id_count_ratio', typ.DoubleType()),
            ('uid_channel_device_count_bin', typ.IntegerType()),  # changed to uid_channel_device
            ('uid_channel_device_count_ratio', typ.DoubleType()),  # changed to uid_channel_device
            ('author_id_item_city_music_id_count_bin', typ.IntegerType()),
            ('author_id_item_city_music_id_count_ratio', typ.DoubleType()),
        ]
        actionLogSchema = typ.StructType(
            [typ.StructField(name, dtype, True) for name, dtype in labels])

        df_actLog_test = sqlContext.createDataFrame(actLog_test_rdd, actionLogSchema)
        df_actLog_test.show(1, truncate=False)

        print('start to read actLog_train_single_cross')
        train_file_path = rootPath + 'actLog_train_single_cross'
        actLog_train_rdd = self.sc.pickleFile(train_file_path)
        df_actLog_train = sqlContext.createDataFrame(actLog_train_rdd, actionLogSchema)
        df_actLog_train.show(1, truncate=False)

        return df_actLog_train, df_actLog_test

    def data_explore(self, df_train, df_test):
        """Add positive-rate features for crossed columns and save both frames.

        For every pair in ``posneg_feats_list`` the columns are concatenated
        into a temporary key; for each target in ("like", "finish") the ratio
        (training rows with target == 1) / (training rows) per key is joined
        onto train and test. Keys unseen in training become 0 on the test
        frame. Results are written with ``saveAsPickleFile`` after removing
        any previous output directory.
        """
        print("对item_pub_hour进行离散化")

        def hourBin(x):
            # The schema allows nulls; comparing None with an int raises on
            # Python 3, so keep a missing hour missing.
            if x is None:
                return None
            if x >= 23 or x <= 2:
                return 1
            elif 3 <= x < 8:
                return 2
            elif 8 <= x < 12:
                return 3
            else:
                return 4

        converHourBin = udf(lambda x: hourBin(x), typ.IntegerType())
        df_train = df_train.withColumn("item_pub_hour", converHourBin(df_train.item_pub_hour))
        df_test = df_test.withColumn("item_pub_hour", converHourBin(df_test.item_pub_hour))

        print("--------1、针对uid,authorid,musicid等组合的正负样本数量统计特征--------")
        print("交叉特征的正负样本数量统计")
        posneg_feats_list = []

        print('cross count')
        users = ['uid']
        authors = ['author_id', 'item_city', 'channel', 'music_id']  # ,'item_pub_hour'
        posneg_feats_list.extend([[u_col, a_col] for u_col in users for a_col in authors])

        print("计算以下交叉特征的正负样本比例")
        # Crosses of two or more columns are supported.
        print(posneg_feats_list)

        ratio_cols = []  # generated *_pos_neg columns, reported on below
        for group_cols in posneg_feats_list:
            new_feature = '_'.join(group_cols)
            # Positive rates come from df_train only; the test frame just
            # joins them and has unseen keys filled with 0.
            print(new_feature)
            if len(group_cols) >= 2:
                print("开始处理%d维交叉变量" % len(group_cols))
                # Every column must be an argument of concat_ws itself. The
                # original 3-way branch closed concat_ws after two columns and
                # handed the third one to withColumn, which raises TypeError.
                df_train = df_train.withColumn(
                    new_feature,
                    fn.concat_ws('_', *[df_train[c].cast(typ.StringType()) for c in group_cols]))
                df_test = df_test.withColumn(
                    new_feature,
                    fn.concat_ws('_', *[df_test[c].cast(typ.StringType()) for c in group_cols]))

            for target in ["like", "finish"]:
                count_col = new_feature + '_count'
                pos_count_col = new_feature + "_count_" + target + "_1"
                ratio_col = self._pos_neg_column(group_cols, target)

                df3 = df_train.select(new_feature, target) \
                    .groupby(new_feature).count() \
                    .withColumnRenamed('count', count_col)
                df4 = df_train.select(new_feature, target) \
                    .where(df_train[target] == 1) \
                    .groupby(new_feature).count() \
                    .withColumnRenamed('count', pos_count_col)
                # Keys without any positive row get a positive count of 0.
                df3 = df3.join(df4, new_feature, 'left').na.fill(0)
                del df4
                gc.collect()

                # Positive share = positive rows / all rows for the key.
                df3 = df3.withColumn(ratio_col, fn.col(pos_count_col) / fn.col(count_col))
                df3 = df3.drop(pos_count_col, count_col)

                print("新的df_train", new_feature, target)
                df_train = df_train.join(df3, new_feature, "left")
                df_train.show(1)
                df_test = df_test.join(df3, new_feature, "left")
                # Keys unseen in training produce nulls; set them to 0.
                print("新的df_test", new_feature, target)
                df_test.show(1)
                df_test = df_test.na.fill(0)
                del df3
                gc.collect()
                ratio_cols.append(ratio_col)

            if new_feature not in ["duration_time", "time_day"]:
                df_train = df_train.drop(new_feature)
                df_test = df_test.drop(new_feature)
            df_train.printSchema()
            df_test.printSchema()

        print('最终表结构,该表结构用于concate的输入')
        df_train.printSchema()
        df_test.printSchema()

        # The original iterated posneg_feats_list here, i.e. lists of column
        # names whose crossed columns were already dropped, which cannot be
        # counted or concatenated with '_missing'. Report on the generated
        # ratio columns instead.
        if ratio_cols:
            print("查看test缺失值")
            df_test.agg(*[(1 - (fn.count(c) / fn.count('*'))).alias(c + '_missing')
                          for c in ratio_cols]).show()
            print("查看train缺失值")
            df_train.agg(*[(1 - (fn.count(c) / fn.count('*'))).alias(c + '_missing')
                           for c in ratio_cols]).show()

        print('-------5.保存数据预处理结果-------')
        test_file_path = self.parser.get(
            "hdfs_path", "hdfs_data_path") + 'actLog_test_step2'
        # saveAsPickleFile refuses to overwrite, so clear the old output first.
        os.system("hadoop fs -rm -r {}".format(test_file_path))
        df_test.rdd.map(tuple).saveAsPickleFile(test_file_path)

        del df_test
        gc.collect()

        train_file_path = self.parser.get(
            "hdfs_path", "hdfs_data_path") + 'actLog_train_step2'
        os.system("hadoop fs -rm -r {}".format(train_file_path))
        df_train.rdd.map(tuple).saveAsPickleFile(train_file_path)
import numpy as np sc = SparkContext() sqlContext = SQLContext(sc) # raw data df = sqlContext.read.load("hdfs:///hndata/parquet_typed", format="parquet") scores = df.where("score IS NOT NULL") \ .where("type='story'") \ .where("title IS NOT NULL") \ .map(lambda row: (row.id, row.score)) # this is a RDD of (id, <numpy array>) docvecs = sc.pickleFile("hdfs:///hndata/docvecs_glove_pickle") def loadVecs(score_pairs): ''' Executes on works, gensim doc2vec model has been rsynced to each node on cluster, so each worker can read its own copy If the model/np-array is larger than my driver memory, cannot use sc.broadcast to sync to each worker ''' import numpy as np docvecs = np.load("/data/_hndata/doc2vec_model/hn.docvecs.doctag_syn0.npy", mmap_mode='r') return [(s, np.array(docvecs[i])) for (s,i) in score_pairs] def mergeByKey(a,b): '''
from pyspark import SparkConf, SparkContext

# Count how many records of the pickled airports RDD carry each 'Tz' value
# and print the resulting (Tz, count) pairs.
spark_conf = SparkConf().setAppName('Airport timezone')
sc = SparkContext(conf=spark_conf)

mainRdd = (
    sc.pickleFile('airports_mod.pickle')
    .map(lambda airport: (airport['Tz'], 1))  # one occurrence per record
    .reduceByKey(lambda total, one: total + one)
)

for item in mainRdd.collect():
    print(item)
# HDFS client, created here for later writes.
hdfs_client = InsecureClient(hdfs_address, user=hdfs_user)

# Outside cluster execution the paths are given an explicit file:// scheme.
if not cluster_execution:
    learning_data_filename_training = 'file://' + learning_data_filename_training
    id_to_dataset_filename_training = 'file://' + id_to_dataset_filename_training
    if learning_data_filename_test:
        learning_data_filename_test = 'file://' + learning_data_filename_test
        id_to_dataset_filename_test = 'file://' + id_to_dataset_filename_test

# Training data: every file under the learning-data directory as text, plus
# the pickled id -> dataset RDD; both are persisted.
learning_data_training = sc.textFile(learning_data_filename_training + '/*')
learning_data_training = learning_data_training.persist(StorageLevel.MEMORY_AND_DISK)
id_to_dataset_training = sc.pickleFile(id_to_dataset_filename_training)
id_to_dataset_training = id_to_dataset_training.persist(StorageLevel.MEMORY_AND_DISK)

# Test data is optional; empty RDDs stand in when no test path was given.
if learning_data_filename_test:
    learning_data_test = sc.textFile(learning_data_filename_test + '/*')
    learning_data_test = learning_data_test.persist(StorageLevel.MEMORY_AND_DISK)
    id_to_dataset_test = sc.pickleFile(id_to_dataset_filename_test)
    id_to_dataset_test = id_to_dataset_test.persist(StorageLevel.MEMORY_AND_DISK)
else:
    learning_data_test = sc.emptyRDD()
    id_to_dataset_test = sc.emptyRDD()

# Decode the first training record and check whether it has a
# 'joined_dataset' entry.
has_joined_data = False
first = json.loads(learning_data_training.first())
has_joined_data = 'joined_dataset' in first
# pyspark统计某个字段的取值个数 # spark: 2.5min, awk: 5min sc.textFile("file_name").map(lambda x:x.split("\x01")).map(lambda x:(x[0], 1)).reduceByKey(lambda a,b:a+b).collect() # RDD集合操作 a.union(b) # 并集 a.intersection(b) # 交集 a.subtract(b) # 差集 # RDD去重 a.distinct() # rdd保存与加载 rdd.saveAsPickleFile(file_name, partition_num) sc.pickleFile(file_name) # spark关键概念 Application: 用户提交的任务 Driver: 任务调度 Job: 每个action算子是一个job Task: RDD的partitions上的执行单元 Stage: 按照宽窄依赖划分 宽窄依赖: "https://github.com/rohgar/scala-spark-4/wiki/Wide-vs-Narrow-Dependencies" # pandas DataFrame to spark DataFrame from pyspark.sql import SparkSession sqlContext = SparkSession\ .builder \ .appName("dataFrame") \ .getOrCreate()
########################functions##################################
def quiet_logs(sc):
    """Set the 'org' and 'akka' log4j loggers of sc's JVM to ERROR level."""
    log4j = sc._jvm.org.apache.log4j
    for logger_name in ("org", "akka"):
        log4j.LogManager.getLogger(logger_name).setLevel(log4j.Level.ERROR)


# Command line: <region> <timeframe>
region = sys.argv[1]
timeframe = sys.argv[2]

# spatial division: cell_id->region of interest
# data loading
# checking file existence
#####
sc = SparkContext()

# Re-key each hourly record by components 0, 1 and 3 of its key (dropping
# component 2) and group the values under the shortened key.
hourly_path = '/peaks/hourly_presence-' + "%s-%s" % (region, timeframe)
chiamate_orarie = sc.pickleFile(hourly_path)
presenze_medie = chiamate_orarie.map(
    lambda record: ((record[0][0], record[0][1], record[0][3]), record[1])
).groupByKey()

# Remove the previous output directory, then write the grouped RDD.
os.system("$HADOOP_HOME/bin/hadoop fs -rm -r /peaks/weekly_presence-%s-%s/" %
          (region, timeframe))
weekly_path = '/peaks/weekly_presence-' + "%s-%s" % (region, timeframe)
presenze_medie.saveAsPickleFile(weekly_path)
##picchi ##
continue wordbag.append((word)) return wordbag ''' def create_matrix(x, terms, matrix): if (x['eval_content']) is None: return matrix for text in twitter.pos(x['eval_content'], stem = True): matrix.index((text) ''' terms = sc.pickleFile('merged_file').flatMap(lambda x : create_dictionary(x)).distinct() #print terms.count() matrix_key = terms.collect() if len(matrix_key) % 2: matrix_key.append("") matrix = dict((k, []) for k in matrix_key) ''' f = open('dictionary_test.txt', 'w') for m in matrix: f.write(m) f.write(' ')
# -*- coding: utf-8 -*-
"""
Print, for every season in the pickled matches RDD, the largest
'win_by_runs' value, ordered by season.
"""
from pyspark import SparkConf, SparkContext

app_conf = SparkConf().setAppName("My App")
sc = SparkContext(conf=app_conf)

mainRdd = sc.pickleFile('matches.pickle')
mainRdd.persist()

# (season, margin) pairs, reduced to the largest margin per season.
season_margins = mainRdd.map(
    lambda match: (match['season'], int(match['win_by_runs'])))
largest_margins = season_margins.reduceByKey(
    lambda best, other: other if not best > other else best)
seasonRunRdd = largest_margins.sortByKey()

print(seasonRunRdd.collect())
# =========== SPARK CONFIG ===========
# Local master using num_nodes worker threads.
set_master_val = "local[{}]".format(str(num_nodes))

from pyspark import SparkContext, SparkConf

conf = SparkConf()
conf = conf.setMaster(set_master_val)
conf = conf.setAppName('TrainWavenet')
conf = conf.set("spark.driver.maxResultSize", "2G")
sc = SparkContext(conf=conf)

# ============ DATA SETUP ===========
# s3_song_directory = "s3://waveform-storage/input_data/song_processed/Pop/part-00000"


def _to_training_pair(chunk):
    """Return (chunk reshaped to (data_size, 1), one-hot encoding of chunk) as arrays."""
    encoded = one_hot_encode_chunk(chunk)
    return (np.array(chunk).reshape(data_size, 1), np.array(encoded))


# Each pickled song is split into chunks; every chunk becomes one
# (input, target) pair of numpy arrays.
song_chunks = sc.pickleFile(song_directory).flatMap(
    lambda song: split_song_to_train(song, data_size, data_collect_stride))
train_rdd = song_chunks.map(_to_training_pair)

print("Num Partitions: ", train_rdd.getNumPartitions())

# ============ MODEL SETUP ===========
from keras.optimizers import SGD

wavenet_model = create_wavenet(stack_layers, n_output_channels, n_filter_list,
                               num_stacks, skip=False)
# NOTE(review): adam_opt is built but compile() below is given SGD() --
# confirm which optimizer is intended.
adam_opt = keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999,
                                 epsilon=1e-07, amsgrad=False)
wavenet_model.compile(optimizer=SGD(), loss='categorical_crossentropy')

print(wavenet_model.summary())

# ============ ELEPHAS TRAIN ===========
# Load the table through JDBC.
df = sqlContext.read.format('jdbc')\
    .options(url=MYSQL_CONNECTION_URL.value,
             dbtable=db + '.' + tableName.value).load()

# CREATE STREAMING CONTEXT
ssc = StreamingContext(sc, int(spark_batch_duration))

# setting checkpoint
# ssc.checkpoint(".")

# Suffix of the saved model directories.
tf_val = 1048576

# LOADING AND COMPUTING TF's TRAINING MODEL
print('Loading TRAINING_TF_MODEL...', end="")
tf_training = sc.pickleFile(os.getcwd() + "/Desktop/MODEL/TF/TF_MODEL_" +
                            str(tf_val))
print('done!')

print('Computing TF-IDF MODEL...', end="")
idf_training = IDF(minDocFreq=5).fit(tf_training)
print('done!')

print('Loading Naive Bayes Model...', end="")
NBM = NaiveBayesModel.load(
    sc,
    os.getcwd() + "/Desktop/MODEL/NBM/NaiveBayesModel_" + str(tf_val))
print('done!')

print('READY TO PROCESS DATA...')

# The key used to be 'metadata.broker.list"' with a stray double quote inside
# the string, so the broker list was never passed under the name Kafka expects.
kafkaParams = {'metadata.broker.list': kafka_brokers}
samples - 1 if samples > 1 else samples) X = vstack(data) pca = TruncatedSVD(n_components) Xtransformed = pca.fit_transform(X) p = pd.DataFrame(Xtransformed, columns=['%i' % i for i in range(n_components)], index=author_index) output_list = [] for k, v in zip(p.index, p.values): output_list.append((k, v)) return (subreddit, output_list) ae_flat_cat = sc.pickleFile( '/user/username/data/output/_jobs/author_entities_cats') # map as ((subcat, author, entity_id), score) and compute median medians = ae_flat_cat.map(lambda x: ((x[5][1], x[0], x[1]), x[3])).groupByKey( ).mapValues(list).map(lambda x: x[0] + (float(np.median(x[1])), )) #group by cat ae_grouped_by_subreddit = medians.groupBy(lambda x: (x[0])).mapValues(list) processed_groups = ae_grouped_by_subreddit.map( lambda x: process_group(x)).filter(lambda x: len(x) > 0) processed_groups.saveAsPickleFile( '/user/username/data/output/_jobs/processed_groups_topcats_svd')
15;visitor;0.0;0.0;0.0; 0.1; 0.1; 0.1;0.0;0.0;0.0;0.0;0.0;0.0;0.0;0.0;0.0;0.0;0.0;0.0;0.0;0.0;0.0;0.0;0.0;0.0""" archetipi = [(y[1], y[2:][:18]) for y in [x.split(';') for x in archetipi.split("\n")[:-1]]] if __name__ == '__main__': region = sys.argv[1] timeframe = sys.argv[2] # import rdd with profiles sc = SparkContext() quiet_logs(sc) sc._conf.set('spark.executor.memory', '32g').set( 'spark.driver.memory', '32g').set('spark.driver.maxResultsSize', '0') r = sc.pickleFile('/profiles-%s-%s' % (region, timeframe)) # clustering! r_carrelli = r.flatMap(lambda x: array_carretto(x[1])) percentage = 0.3 r_carrelli.sample(False, percentage, 0).filter( lambda l: sum(l)) # filtro passing by # sample and filter out passing by data = r_carrelli.sample(False, percentage, 0).filter( lambda l: sum(l)).map(lambda x: np.array(x)) #kmns = KMeans.train(data, 100, initializationMode="random") kmns = KMeans.train(data, 100, initializationMode="k-means||") tipi_centroidi = []
class SparkFEProcess:
    """Feature-engineering job (step 3): conditional-ratio features.

    Reads the pickled step-1 train/test RDDs, adds for several
    (condition column, feature column) pairs the ratio
    count(condition, feature) / count(condition) computed on the training
    data, and writes both frames back as pickle files.
    """

    def __init__(self):
        """Load the configuration, start the SparkContext and quiet the logs."""
        self.parser = self.init_config()
        sparkConf = SparkConf().setAppName("feature engineering on spark of explore_spark_step3") \
            .set("spark.ui.showConsoleProgress", "false")
        self.sc = SparkContext(conf=sparkConf)
        # NOTE(review): the broadcast handle is discarded, so this call has no
        # visible consumer in this class -- confirm it is still needed.
        self.sc.broadcast(self.parser)
        self.init_logger()

    def init_config(self):
        """Return a ConfigParser loaded from ``<workspace>/resource/config.ini``.

        The workspace is the part of this file's directory path that precedes
        'featureEngineering'. A missing file yields an empty parser because
        ``ConfigParser.read`` ignores unreadable paths.
        """
        current_path = os.path.dirname(os.path.realpath(__file__))
        workspace_path = current_path.split('featureEngineering')[0]
        config_file = workspace_path + 'resource/config.ini'
        parser = configparser.ConfigParser()
        parser.read(config_file)
        return parser

    def init_logger(self):
        """Set the 'org', 'akka' and root log4j loggers to ERROR level."""
        logger = self.sc._jvm.org.apache.log4j
        logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
        logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)
        logger.LogManager.getRootLogger().setLevel(logger.Level.ERROR)

    def read_rdd(self, fileName):
        """Return a text RDD for *fileName* under the configured HDFS data path.

        Any exception is printed and swallowed, in which case None is returned.
        """
        try:
            file_path = self.parser.get("hdfs_path", "hdfs_data_path") + fileName
            data_rdd = self.sc.textFile(file_path)
            return data_rdd
        except Exception as e:
            print(e)

    def data_describe(self):
        """Load the step-1 pickled RDDs and return ``(df_train, df_test)``.

        Both frames are built with the same explicit, all-nullable schema.
        """
        sqlContext = SQLContext(self.sc)
        print('starto read data after explore_saprk_step1_cross:')
        rootPath = self.parser.get("hdfs_path", "hdfs_data_path")
        # The original message named the train set although the test set is read.
        print('start to read actLog_test_single_cross')
        test_file_path = rootPath + 'actLog_test_single_cross'
        actLog_test_rdd = self.sc.pickleFile(test_file_path)
        # Column names and types; compare against the data to check they fit.
        labels = [
            ('duration_time', typ.IntegerType()),
            ('device', typ.IntegerType()),
            ('music_id', typ.IntegerType()),
            ('item_city', typ.IntegerType()),
            ('author_id', typ.IntegerType()),
            ('item_id', typ.IntegerType()),
            ('user_city', typ.IntegerType()),
            ('uid', typ.IntegerType()),
            ('channel', typ.IntegerType()),
            ('finish', typ.IntegerType()),
            ('like', typ.IntegerType()),
            ('time_day', typ.IntegerType()),
            ('item_pub_month', typ.IntegerType()),
            ('item_pub_day', typ.LongType()),
            ('item_pub_hour', typ.IntegerType()),
            ('item_pub_minute', typ.IntegerType()),
            ('uid_count_bin', typ.IntegerType()),
            ('user_city_count_bin', typ.IntegerType()),
            ('user_city_count_ratio', typ.DoubleType()),
            ('item_id_count_bin', typ.IntegerType()),
            ('item_id_count_ratio', typ.DoubleType()),
            ('author_id_count_bin', typ.IntegerType()),
            ('author_id_count_ratio', typ.DoubleType()),
            ('item_city_count_bin', typ.IntegerType()),
            ('item_city_count_ratio', typ.DoubleType()),
            ('music_id_count_bin', typ.IntegerType()),
            ('music_id_count_ratio', typ.DoubleType()),
            ('device_count_bin', typ.IntegerType()),
            ('device_count_ratio', typ.DoubleType()),
            ('uid_author_id_count_bin', typ.IntegerType()),
            ('uid_author_id_count_ratio', typ.DoubleType()),
            ('uid_item_city_count_bin', typ.IntegerType()),
            ('uid_item_city_count_ratio', typ.DoubleType()),
            ('uid_channel_count_bin', typ.IntegerType()),
            ('uid_channel_count_ratio', typ.DoubleType()),
            ('uid_music_id_count_bin', typ.IntegerType()),
            ('uid_music_id_count_ratio', typ.DoubleType()),
            ('uid_device_count_bin', typ.IntegerType()),
            ('uid_device_count_ratio', typ.DoubleType()),
            ('author_id_channel_count_bin', typ.IntegerType()),
            ('author_id_channel_count_ratio', typ.DoubleType()),
            ('author_id_user_city_count_bin', typ.IntegerType()),
            ('author_id_user_city_count_ratio', typ.DoubleType()),
            ('author_id_item_city_count_bin', typ.IntegerType()),
            ('author_id_item_city_count_ratio', typ.DoubleType()),
            ('author_id_music_id_count_bin', typ.IntegerType()),
            ('author_id_music_id_count_ratio', typ.DoubleType()),
            ('uid_channel_device_count_bin', typ.IntegerType()),  # changed to uid_channel_device
            ('uid_channel_device_count_ratio', typ.DoubleType()),  # changed to uid_channel_device
            ('author_id_item_city_music_id_count_bin', typ.IntegerType()),
            ('author_id_item_city_music_id_count_ratio', typ.DoubleType()),
        ]
        actionLogSchema = typ.StructType(
            [typ.StructField(name, dtype, True) for name, dtype in labels])

        df_actLog_test = sqlContext.createDataFrame(actLog_test_rdd, actionLogSchema)

        print('start to read actLog_train_single_cross')
        train_file_path = rootPath + 'actLog_train_single_cross'
        actLog_train_rdd = self.sc.pickleFile(train_file_path)
        df_actLog_train = sqlContext.createDataFrame(actLog_train_rdd, actionLogSchema)

        return df_actLog_train, df_actLog_test

    @staticmethod
    def _ratio_column(condition_col, feature_col):
        """Name of the conditional-ratio column for a (condition, feature) pair."""
        return feature_col + '_' + condition_col + "_condition_ratio"

    def _add_condition_ratios(self, df_train, df_test, condition_col,
                              feature_cols, verbose=False):
        """Join count(condition, feature) / count(condition) onto both frames.

        Counts come from *df_train* only. Pairs unseen in training get a
        ratio of 0 on the test frame. With *verbose* the intermediate ratio
        frame and the column lists are printed as well. Returns the updated
        ``(df_train, df_test)``.
        """
        condition = [condition_col]
        count_col = condition_col + '_count'
        # Rows per condition value; cached because every feature reuses it.
        df2 = df_train.select(condition).groupby(condition).count() \
            .withColumnRenamed('count', count_col)
        df2.cache()

        for feature_col in feature_cols:
            feature_group = [condition_col, feature_col]
            ratio_col = self._ratio_column(condition_col, feature_col)
            print(feature_group + [count_col])

            df1 = df_train.select(feature_group).groupby(feature_group).count()
            df1 = df1.join(df2, condition, 'left')
            df1.show(1, truncate=False)
            df1 = df1.withColumn(ratio_col, fn.col('count') / fn.col(count_col))
            df1 = df1.drop('count').drop(count_col)
            if verbose:
                df1.show(1, truncate=False)
                print(df_train.columns)
                print(df1.columns)

            df_train = df_train.join(df1, feature_group, "left")
            df_train.show(1, truncate=False)
            # Fill only the new column so other nulls are left untouched.
            df_test = df_test.join(df1, feature_group, "left").na.fill({ratio_col: 0})
            df_test.show(1, truncate=False)
        return df_train, df_test

    def data_explore(self, df_train, df_test):
        """Bin item_pub_hour, add the conditional-ratio features and save both frames."""
        print("对item_pub_hour进行离散化")

        def hourBin(x):
            # The schema allows nulls; comparing None with an int raises on
            # Python 3, so keep a missing hour missing.
            if x is None:
                return None
            if x >= 23 or x <= 2:
                return 1
            elif 3 <= x < 8:
                return 2
            elif 8 <= x < 12:
                return 3
            else:
                return 4

        converHourBin = udf(lambda x: hourBin(x), typ.IntegerType())
        df_train = df_train.withColumn("item_pub_hour", converHourBin(df_train.item_pub_hour))
        df_test = df_test.withColumn("item_pub_hour", converHourBin(df_test.item_pub_hour))

        print("----1、计算统计特征:用户特征和item特征之间的条件概率---------")
        # Ratios conditioned on the user.
        df_train, df_test = self._add_condition_ratios(
            df_train, df_test, 'uid', ['music_id', 'item_pub_hour'], verbose=True)

        # Ratios conditioned on the item. The original listed 'uid_city',
        # which is not a column of the schema; the user's city is 'user_city'.
        df_train, df_test = self._add_condition_ratios(
            df_train, df_test, 'item_id', ['user_city', 'channel'])

        # The helper count columns are never joined onto df_train, so these
        # drops are no-ops kept as a safeguard.
        df_train = df_train.drop('uid_count').drop('item_id_count')
        df_train.printSchema()
        df_test.printSchema()

        print('-------5.保存数据预处理结果-------')
        test_file_path = self.parser.get("hdfs_path", "hdfs_data_path") + 'actLog_test_step3_try'
        # saveAsPickleFile refuses to overwrite, so clear the old output first.
        os.system("hadoop fs -rm -r {}".format(test_file_path))
        df_test.rdd.map(tuple).saveAsPickleFile(test_file_path)

        del df_test
        gc.collect()

        train_file_path = self.parser.get("hdfs_path", "hdfs_data_path") + 'actLog_train_step3_try'
        os.system("hadoop fs -rm -r {}".format(train_file_path))
        df_train.rdd.map(tuple).saveAsPickleFile(train_file_path)
from pyspark import SparkConf, SparkContext


def _innings_and_runs(delivery):
    # Key every delivery by (match id, innings number); the value is the
    # number of runs recorded for that delivery.
    innings = (int(delivery['match_id']), int(delivery['inning']))
    return innings, int(delivery['total_runs'])


def _add_runs(left, right):
    return left + right


spark_conf = SparkConf().setAppName("Innings Run")
sc = SparkContext(conf=spark_conf)

# Total runs per (match, innings), ordered by that key.
inningData = sc.pickleFile('deliveries.pickle')
inningData = inningData.map(_innings_and_runs)
inningData = inningData.reduceByKey(_add_runs)
inningData = inningData.sortByKey()

for item in inningData.collect():
    print(item)
from pyspark import SparkContext, SparkConf
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel
from pyspark.mllib.regression import LabeledPoint

# Trains word vectors on the driver and then document vectors with
# DistDoc2Vec for the positive/negative movie-review corpora.
# NOTE(review): Word2Vec and TaggedDocument are used below but are not
# imported in the visible part of this script -- presumably gensim; confirm.

conf = SparkConf().set("spark.driver.maxResultSize", "2g")
sc = SparkContext(conf=conf)

# Each review line becomes (is_positive, lower-cased tokens).
pos = sc.textFile("hdfs:///movie_review/positive").map(lambda s: (True, s.lower().split()))
neg = sc.textFile("hdfs:///movie_review/negative").map(lambda s: (False, s.lower().split()))

if False:
    # Disabled branch: reuse previously pickled document tags.  The URI
    # needs three slashes (empty authority) like the two inputs above; with
    # two slashes "movie_review" is parsed as the namenode host.
    docvecs = sc.pickleFile("hdfs:///movie_review/doctags")
else:
    from ddoc2vec import DistDoc2Vec

    # (index, label, tokens) for every review, negatives first.
    data = (neg + pos).zipWithIndex().map(lambda vi: (vi[1], vi[0][0], vi[0][1]))
    sents = data.map(lambda rec: rec[2])

    model = Word2Vec(size=100, hs=0, negative=8)
    dd2v = DistDoc2Vec(model, learn_hidden=False, num_partitions=5, num_iterations=10)
    dd2v.build_vocab_from_rdd(sents, reset_hidden=False)

    # train word2vec in driver
    model.train(sents.collect())
    model.save("/root/doc2vec/word2vec_model/review")
    print("*** done training words ****")
    print("*** len(model.vocab): %d ****" % len(model.vocab))

    # Each review is tagged with its index so it gets its own document vector.
    dd2v.train_sentences_cbow(
        data.map(lambda rec: TaggedDocument(words=rec[2], tags=[rec[0]])))
import math
from itertools import combinations
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import NaiveBayes
from pyspark.mllib.linalg.distributed import RowMatrix
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.ml import Pipeline
from pyspark.ml.regression import *
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import *
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Load the pickled rows (30 partitions requested) as a cached DataFrame.
pickled_rows = sc.pickleFile("rdd1.p", 30)
column_names = ["label", "cat_features", "cont_features"]
df = sqlContext.createDataFrame(pickled_rows, column_names).cache()

# 1-based feature numbers to drop; these correspond to cat2, cat3, cat4 ...
to_delete = [2, 3, 4, 5, 6, 7, 8, 96]

# to_delete is 1-based while feature positions are 0-based, hence the "- 1".
dropped_positions = {number - 1 for number in to_delete}
features_to_keep = [
    position for position in range(116) if position not in dropped_positions
]


class customTransformer:
    """Hold the column names and extra arguments of a custom transformer."""

    def __init__(self, inputCol, outputCol, *others):
        # Store information taken from the dataframe (0 until something is stored).
        self.fitInfo = 0
        self.args = list(others)
        self.outputCol = outputCol
        self.inputCol = inputCol
#carr=np.zeros(24) carr=[0 for x in range(18)] for o in obs: week_idx=week_ordering.index(o[0]) idx=(week_idx-1)*6+o[1]*3+o[2] carr[idx]=o[3] tipo_utente=sorted([(c[0],euclidean(carr,list(c[1]))) for c in profiles],key=lambda x:x[1])[0][0] yield (munic,tipo_utente,id) sc=SparkContext() ##annotazione utenti ##open r=sc.pickleFile('hdfs://hdp1.itc.unipi.it:9000/profiles/centroids%s-%s'%(region,timeframe)) cntr=r.collect() profiles=[(x[0],x[1]) for x in cntr] r=sc.pickleFile('hdfs://hdp1.itc.unipi.it:9000/profiles/'+"%s-%s"%(region,timeframe)) r_auto= r.flatMap(lambda x: annota_utente(x[1],profiles)) \ .map(lambda x: ((x[0],x[1]),1)) \ .reduceByKey(lambda x,y:x+y) # ##ottengo coppie municipio,id_cluster ### risultato finale # lst=r_auto.collect()
num = int(fields[2]) result = [] for i in range(num - 1): result.append([ bid, fields[3 + 4 * i], fields[3 + 4 * i + 1], fields[3 + 4 * i + 2], fields[3 + 4 * i + 3] ]) return result rdd_flat = rdd.flatMap(lambda line: flat_trend(line)) trend_df = sqlContext.createDataFrame( rdd_flat, ['business_id', 'start', 'end', 'rating', 'trend']) geo_rdd = sc.pickleFile( '/Users/zimoli/Downloads/RBDA-MCINTOSH/Project/RBDAProject/phoenix_cate_ts' ).cache() trend_list = sc.broadcast([[ str(row['business_id']), str(row['start']), str(row['end']), str(row['rating']), str(row['trend']) ] for row in trend_df.collect()]) trend_map = {} def get_trend_map(trend_map): for trend in trend_list.value:
(cv_data_rdd, out_cv_data)]: url = sparkutil.util.s3n_url(S3_BUCKET, S3_PATH, name) sparkutil.util.s3n_delete(url) rdd.saveAsPickleFile(url) pickle.dump({'url' : url}, open(name, 'w')) sc.stop() @jobs_limit(1) @transform(spark_run_experiments, suffix('.samples'), '.samples.pickle') def get_samples((exp_samples, exp_cvdata, exp_inits), out_filename): sample_metadata = pickle.load(open(exp_samples, 'r')) sc = SparkContext() results_rdd = sc.pickleFile(sample_metadata['url']) sparkutil.util.save_rdd_elements(results_rdd, out_filename, S3_BUCKET, S3_PATH) sc.stop() @jobs_limit(1) @transform(spark_run_experiments, suffix('.samples'), '.cvdata.pickle') def get_cvdata((exp_samples, exp_cvdata, exp_inits), out_filename): cvdata_metadata = pickle.load(open(exp_cvdata, 'r')) sc = SparkContext() results_rdd = sc.pickleFile(cvdata_metadata['url']) pickle.dump(results_rdd.collect(), open(out_filename, 'w')) sc.stop()
# Cluster the vocabulary of the pet-supplies reviews: clean the review text,
# join every distinct word with its word2vec vector and run k-means on the
# vectors.
df = sqlContext.read.json(path + '/reviews_Pet_Supplies_p2.json')
reviewDF = df.select("reviewText")
sc = SparkContext(appName='Word2Vec')


def removePunctuation(text):
    """Return *text* with every non-ASCII-letter character replaced by a space."""
    return re.sub(r"[^a-zA-Z]", " ", text)


# DataFrame.map() is gone from Spark 2.0 on; going through .rdd works on
# both 1.x and 2.x.
cleanedReviewRDD = reviewDF.rdd.map(
    lambda row: removePunctuation(row.reviewText).lower().split())
cleanedReviewRDD.saveAsPickleFile(path + '/P2CleanedRDD', 10)

# Read back exactly what was written above.  The original read a hard-coded
# developer directory instead of `path`.
reviewRDD = sc.pickleFile(path + '/P2CleanedRDD', 10)

uniqueWordsRDD = (reviewRDD
                  .flatMap(lambda words: words)
                  .distinct()
                  .map(lambda word: (word, 1)))

# word2VecRDD is a DataFrame, not an RDD -- hence the .rdd in the join.
word2VecRDD = sqlContext.read.parquet(path + "/word2vec/data")

# join() yields (word, (1, features)); keep only (word, features).
wordsFeaturesRDD = (uniqueWordsRDD
                    .join(word2VecRDD.rdd)
                    .map(lambda kv: (kv[0], kv[1][1])))

# NOTE(review): `runs` is deprecated in newer Spark releases; it is kept so
# behaviour on the Spark version this script targets does not change.
kMeansclusters = KMeans.train(
    wordsFeaturesRDD.map(lambda kv: kv[1]),
    2000,
    maxIterations=50,
    runs=5,
    initializationMode="random",
    seed=50)

wordsClustersRDD = wordsFeaturesRDD.map(
    lambda kv: (kv[0], kMeansclusters.predict(kv[1])))
wordsClustersRDD.saveAsPickleFile(path + '/WordClustersRDD', 10)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Jul 7 23:50:23 2017

@author: dray
"""
from pyspark import SparkConf, SparkContext
from pprint import pprint


def _sid_and_marks(student):
    # Pair each record's student id with its marks.
    return student['sid'], student['marks']


def _combine_marks(first, second):
    # Marks of records sharing a student id are combined with ``+``.
    return first + second


sc = SparkContext(conf=SparkConf().setAppName("Students read"))

# Combined marks per student id, ordered by student id.
marksRdd = sc.pickleFile("students.pickle")
marksRdd = marksRdd.map(_sid_and_marks)
marksRdd = marksRdd.reduceByKey(_combine_marks)
marksRdd = marksRdd.sortByKey()

pprint(marksRdd.collect())
import findspark findspark.init() try: sc.stop() except: pass from pyspark import SparkContext, SparkConf from pyspark.sql import SparkSession conf = SparkConf().setAppName("finalproject").setMaster("local[*]") sc = SparkContext(conf=conf) spark = SparkSession(sparkContext=sc) import pickle from operator import add allData = sc.pickleFile("./all-data.pkl") #print allData.first() #([["Company", "Class", "Name", "City", "State", "Country", "Date"]]) #only use if there is a problem with using None def companyOrPersonCount(x): if (x[0] is None): return (x[2], 1) else: return (x[0], 1) def companyCount(x):
from konlpy.tag import Twitter
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.feature import IDF
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.feature import Normalizer
from pyspark.mllib.clustering import KMeans, KMeansModel
import pickle
from numpy import array

sc = SparkContext()
sqlContext = SQLContext(sc)

normData = sc.pickleFile('idf_normalized')

from pyspark.mllib.clustering import KMeans, KMeansModel
from math import sqrt


def _idf_norm(record):
    # Only the ``idf_norm`` attribute of each record is fed to k-means.
    return record.idf_norm


data = normData.map(_idf_norm)

# Ten clusters; the remaining settings are passed by keyword as before.
kmeans_settings = {
    'maxIterations': 10,
    'runs': 10,
    'initializationMode': "random",
}
clusters = KMeans.train(data, 10, **kmeans_settings)

# Disabled evaluation helper, kept for reference:
# def error(point):
#     center = clusters.centers[clusters.predict(point)]
#     return sqrt(sum([x**2 for x in (point - center)]))

clusters.save(sc, 'KMeansModel')

#WSSSE = data.map(lambda point: error(point)).reduce(lambda x, y: x + y)
#print("Within Set Sum of Squared Error = " + str(WSSSE))
else: print 'Using Pre-Pickled Files\n' # End Timer for this phase WordFreq_Time = time() - WordFreq_Time print('############ Processing Completed ##############') print('################################################\n') print('################################################') print('############## Word Freq to IDF RDD ############\n') # Start Timer IDF_Time = time() # Ascertain if Section has already been completed if len(getDirectory(directory[3])) < 1: allFolders = getDirectory(directory[2]) # Load in Word Frequency Pickles into one RDD IDF = sc.union([sc.pickleFile(i) for i in allFolders]) # Rearrange RDD into correct the correct format IDF = IDF.flatMap(lambda x_y: [(pair[0], [[x_y[0], str(pair[1])]]) for pair in x_y[1]]) \ .reduceByKey(add) \ .map(lambda x_y1: (x_y1[0], len(x_y1[1]), float(N), x_y1[1])) \ .map(lambda x_y_z_a: (x_y_z_a[0], np.log2(x_y_z_a[2] / x_y_z_a[1]), x_y_z_a[3])) \ .repartition(8) # Save IDF RDD as a Pickle File IDF.saveAsPickleFile(directory[4], 50) else: print 'Using Pre-Pickled Files\n' # End Timer for this phase IDF_Time = time() - IDF_Time print('############ Processing Completed ##############') print('################################################\n')
"""
Peak detection Module

Given an hourly presence dataset (usually regarding a month of activity), and
a typical weekly presence dataset, it computes the relative presences for each
hour of the month, in order to identify possible peaks of presence.

Usage: peak_detection.py <spatial_division> <region> <timeframe>

--region,timeframe: names of the file stored into the hdfs. E.g. Roma 11-2015

example: pyspark peak_detection.py <spatial_division> roma 06-2015

It loads the typical weekly presences from
/peaks/weekly_presence-<region>-<timeframe> and the hourly presences from
/peaks/hourly_presence-<region>-<timeframe>, and stores the results in a
standard csv file: rome_peaks<region>-<timeframe>-<spatial_division>.csv
"""
spatial_division = sys.argv[1]
region = sys.argv[2]
timeframe = sys.argv[3]

sc = SparkContext()

# Typical weekly presences, collected to the driver as a dict.
presenze_medie = sc.pickleFile(
    '/peaks/weekly_presence-' + "%s-%s" % (region, timeframe)).collectAsMap()
# Hourly presences, as ((k0, k1, _, k3, k4), count) records.
chiamate_orarie = sc.pickleFile(
    '/peaks/hourly_presence-' + "%s-%s" % (region, timeframe))

# "." and "/" are stripped so the spatial division can be embedded in a
# file name.
peaks_path = 'rome_peaks%s-%s-%s.csv' % (
    region, timeframe, spatial_division.replace(".", "").replace("/", ""))

# The context manager closes (and flushes) the CSV even if a row fails; the
# original left the handle open for the rest of the process.
with open(peaks_path, 'w') as peaks:
    for record in chiamate_orarie.collect():
        key = record[0]
        # Mean of the typical presences stored under (key[0], key[1], key[3]).
        typical = np.mean(list(presenze_medie[(key[0], key[1], key[3])]))
        peaks.write("%s,%s,%s,%s\n" % (key[0], key[4], key[3], record[1] / typical))
# Fetch recent English tweets matching the command-line arguments, store them
# as a pickled RDD and print a sentiment score for each one.
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)
args = sys.argv
api = tweepy.API(auth, timeout=10)

filename = 'tweets' + str(time.time()) + '.pickle'
folderurl = '/user/bijoyan/tweetstore/'

# NOTE(review): `q` receives the argument *list* (args[1:]), not a string;
# confirm tweepy turns it into the intended search query.
list_tweets = [
    status.text
    for status in tweepy.Cursor(
        api.search, q=args[1:], lang='en', result_type='recent').items(80)
]

mainRdd = sc.parallelize(list_tweets)
mainRdd.saveAsPickleFile(folderurl + filename)


def mymap(line):
    """Return the mean sentence polarity of *line*, multiplied by 100.

    Punctuation is replaced by spaces before the text is split into
    sentences.  Text with no sentences left (empty or punctuation-only)
    scores 0.0 instead of raising ZeroDivisionError and failing the job.
    """
    for char in string.punctuation:
        line = line.replace(char, ' ')
    polarities = [
        sentence.sentiment.polarity for sentence in TextBlob(line).sentences
    ]
    if not polarities:
        return 0.0
    return (sum(polarities) / len(polarities)) * 100


# Re-read the tweets that were just stored and score each one.
mainRdd = sc.pickleFile(folderurl + filename)
mainRdd = mainRdd.map(mymap)
for emotion in mainRdd.collect():
    print(emotion)
z = 2*365  # z in formula is poverty line.
GINI = 0.54115
indicator = "Adjusted net national income per capita (constant 2005 US$)"  # this is yt in formula
year_start = 1971
year_end = 2015
num_of_segments = 5

conf = SparkConf()
conf.setMaster("local[4]")
conf.setAppName("damu1000")
conf.set("spark.executor.memory", "4g")
sc = SparkContext(conf=conf)

# read data
lines = sc.pickleFile(".//result").filter(lambda x: x[0] == country).cache()


def _series_rows(series_name):
    # filter by country, indicator and period. sort by period
    # Rows are indexed by position: 0 = country, 2 = indicator name,
    # 3 = year, 4 = value ('' when missing).
    in_scope = lines.filter(
        lambda x: x[0] == country and x[2] == series_name and x[4] != ''
        and x[3] >= year_start and x[3] <= year_end)
    return in_scope.sortBy(lambda row: row[3], True)


def _value(row):
    return float(row[4])


#-----------------------------------------read 1st to 5th 20% income----------------------------------------------------------
# using inequality data to approximate standard deviation
_lowest_rows = _series_rows("Income share held by lowest 20%")
# years for which income share data is not null. use this later to filter average income
years = _lowest_rows.map(lambda row: float(row[3]))
income_20_1 = _lowest_rows.map(_value)
income_20_2 = _series_rows("Income share held by second 20%").map(_value)
income_20_3 = _series_rows("Income share held by third 20%").map(_value)
income_20_4 = _series_rows("Income share held by fourth 20%").map(_value)
income_20_5 = _series_rows("Income share held by highest 20%").map(_value)

# Full rows (not just values) for the poverty headcount series.
PPP = _series_rows("Poverty headcount ratio at $2 a day (PPP) (% of population)")
def main(args):
    """Group, cluster and query the time series of one CSV dataset with Spark.

    Reads four attributes of *args*: ``input`` (path of the ``.csv``
    dataset), ``cores`` (number of local Spark worker threads), ``st``
    (forwarded to ``cluster`` and used in the name of the results directory)
    and ``full_length`` (group every sub-sequence length instead of the
    fixed 89-90 range).

    When the results directory already exists, the saved group/cluster/dict
    RDDs are only loaded.  Otherwise the dataset is parsed, normalized,
    grouped and clustered, each stage is pickled under the results
    directory, and an example query is run and plotted.  The query lives in
    that branch because it needs the broadcast dictionaries and
    ``grouping_range`` that only exist there.

    Raises:
        Exception: if the hard-coded query range lies outside the grouping
            range.
    """
    file_path = args.input  # './dataset/001-SART-August2017-MB.csv'

    # Per-machine settings: [JAVA_HOME, results root, dataset path].
    Server_path = ['/usr/lib/jvm/java-1.8.0-openjdk-amd64',
                   './res/saved_dataset',
                   file_path]
    Yu_path = ['/Library/Java/JavaVirtualMachines/jdk1.8.0_171.jdk/Contents/Home',
               './res/saved_dataset',
               './dataset/001-SART-August2017-MB-50.csv']
    Leo_path = ['/Library/Java/JavaVirtualMachines/jdk1.8.0_151.jdk/Contents/Home',
                './res/saved_dataset',
                file_path]
    Yuncong_path = ['/Library/Java/JavaVirtualMachines/jdk1.8.0_161.jdk/Contents/Home',
                    './res/saved_dataset',
                    file_path]
    path = Server_path
    os.environ['JAVA_HOME'] = path[0]

    # create a spark job
    cores = args.cores
    st = args.st
    full_length = args.full_length
    sc = SparkContext('local[' + str(cores) + ']', "First App")
    # sc = SparkContext("local[4]", "First App")
    # st = 0.25

    # Results directory: <results root>/<dataset path without .csv>_<st>
    new_path = re.match(r"(.*)\.csv", path[2]).group(1)
    path_save_res = path[1] + '/' + new_path + '_' + str(st)

    # if path exist, the job can't be executed
    if os.path.isdir(path_save_res):
        group_rdd = sc.pickleFile(path_save_res + '/group/')
        cluster_rdd = sc.pickleFile(path_save_res + '/cluster/')
        global_dict_rdd = sc.pickleFile(path_save_res + '/dict/')
        # shutil.rmtree(path_save_res)
    else:
        # TODO
        source_file = path[2]
        # add test for commit
        features_to_append = [0, 1, 2, 3, 4]

        # res_list: list of raw time series data to be distributed
        # time_series_dict: a dictionary version of res_list, used for
        # subsequence look up
        res_list, time_series_dict, global_min, global_max = generate_source(
            source_file, features_to_append)
        print('processing dataset' + path[2])
        print("Global Max is " + str(global_max))
        print("Global Min is " + str(global_min))

        normalized_ts_dict = normalize_ts_with_min_max(time_series_dict, global_min, global_max)

        # TODO
        # add clustering method after grouping

        # this broadcast object can be accessed from all nodes in computer cluster
        # in order to access the value this, just use val = global_dict.value
        # for future reading data
        # NOTE that the data being broadcasted is the minmax-normalized data
        global_dict = sc.broadcast(normalized_ts_dict)
        time_series_dict = sc.broadcast(time_series_dict)

        # max(flows, key=lambda k: len(flows[k]))
        # find the key of largest length of
        # max_len_key = max(global_dict.value, key=lambda k: len(global_dict.value[k]))
        # max_length = len(global_dict.value[max_len_key])
        if full_length:
            grouping_range = (1, max([len(v) for v in global_dict.value.values()]))
        else:
            grouping_range = (89, 90)
        # grouping_range = (1, length)

        # The first element of res_list is skipped before distributing it.
        global_dict_rdd = sc.parallelize(res_list[1:], numSlices=16)
        global_dict_rdd.saveAsPickleFile(path_save_res + '/dict/')
        # global_dict_res = global_dict_rdd.collect()

        # finish grouping here, result in a key, value pair where
        # key is the length of sub-sequence, value is the
        # [id of source time series, start_point, end_point]
        # res_rdd = global_dict_rdd.flatMap(lambda x: get_all_subsquences(x)).collect()
        # In get_subsquences(x, 100, 110): we are grouping subsequences that are of length 90 to 110

        ##### group
        # group_rdd_res: list: items = (length, time series list)
        #   -> time series list: items = (id, start, end)

        # add save option or not
        group_start_time = time.time()
        group_rdd = (global_dict_rdd
                     .flatMap(lambda x: get_subsquences(x, grouping_range[0], grouping_range[1]))
                     .map(lambda x: (x[0], [x[1:]]))
                     .reduceByKey(lambda a, b: a + b))
        group_rdd.saveAsPickleFile(path_save_res + '/group/')
        group_end_time = time.time()
        print('group of timeseries from ' + str(grouping_range[0]) + ' to ' + str(grouping_range[1])
              + ' using ' + str(group_end_time - group_start_time) + ' seconds')
        # group_rdd_res = group_rdd.collect()
        print("grouping done, saved to dataset")

        ##### cluster
        # The following code is for testing clustering operation.
        # Cluster one group without using RDD 4/15/19
        # print("Test clustering")
        # group_res = group_rdd.collect()
        # cluster(group_res[1][1], group_res[1][0], st, global_dict.value)  # testing group with length of 9
        # cluster_two_pass(group_res[1][1], group_res[1][0], st, global_dict.value)  # testing group with length of 9

        print("Working on clustering")
        cluster_start_time = time.time()
        cluster_rdd = group_rdd.map(lambda x: cluster(x[1], x[0], st, global_dict.value))
        cluster_rdd.saveAsPickleFile(path_save_res + '/cluster/')  # save all the cluster to the hard drive

        # Reload the clusters that were just written.  The original pointed
        # pickleFile at path_save_res itself, the parent of dict/, group/ and
        # cluster/, rather than at the pickled cluster files.
        cluster_rdd_reload = sc.pickleFile(path_save_res + '/cluster/').collect()  # here we have all the clusters in memory
        # first_dict = cluster_rdd_reload[0]
        cluster_end_time = time.time()
        print('clustering of timeseries from ' + str(grouping_range[0]) + ' to ' + str(grouping_range[1])
              + ' using ' + str(cluster_end_time - cluster_start_time) + ' seconds')
        print("clustering done, saved to dataset")

        # plot all the clusters
        # plot_cluster(cluster_rdd_reload, 2, time_series_dict, 5)

        ##### query
        # Current implementation: if we want to find k best matches, we give
        # the first k best matches for given sequence length range.
        # The following line is for testing querying on one cluster
        # query_result = query(query_sequence, cluster_rdd_reload[0], k, time_series_dict.value)

        # print("Using Twopass")
        # total_cluster_count = 0
        # for cluster_dic in cluster_rdd.collect():
        #     representative, cluster_subsequences = random.choice(list(cluster_dic.items()))
        #     cluster_length = representative.get_length()
        #     total_cluster_count = total_cluster_count + len(cluster_dic.keys())
        #     print("length " + str(cluster_length) + " has cluster count of " + str(len(cluster_dic.keys())))
        # print("Total cluster count is: " + str(total_cluster_count))

        # '(001-SART-August2017-MB)_(211-Current-Item:-3)_(A-DC1)_(64434.0)_(105950.0)'
        # '(2013e_001)_(100-0-Back)_(B-DC8)_(232665953.1250)'
        query_id = '(001-SART-August2017-MB)_(211-Current-Item:-3)_(A-DC1)_(64434.0)_(105950.0)'
        query_sequence = get_data(query_id, 24, 117, time_series_dict.value)  # get an example query
        filter_rdd = cluster_rdd.filter(lambda x: exclude_same_id(x, query_id))

        # raise exception if the query_range exceeds the grouping range
        querying_range = (90, 91)
        k = 5  # looking for k best matches
        if querying_range[0] < grouping_range[0] or querying_range[1] > grouping_range[1]:
            raise Exception("query_operations: query: Query range does not match group range")

        # NOTE(review): this first result is overwritten below before it is
        # read; kept in case query() is relied on for its side effects.
        query_result = cluster_rdd.filter(lambda x: x).map(
            lambda clusters: query(query_sequence, querying_range, clusters, k,
                                   time_series_dict.value)).collect()

        exclude_overlapping = True
        query_result = filter_rdd.map(
            lambda clusters: query(query_sequence, querying_range, clusters, k,
                                   time_series_dict.value, exclude_overlapping, 0.5)).collect()

        plot_query_result(query_sequence, query_result, time_series_dict.value)

    sc.stop()
def __main__():
    """Train a parallel-boosting ensemble of regression trees with Spark.

    Options are read from ``sys.argv`` as ``--name=value`` pairs:
    ``--input``, ``--fs``, ``--num_learners``, ``--num_parts``,
    ``--max_depth``, ``--max_features``, ``--min_samples_leaf``,
    ``--min_samples_split``, ``--output``, ``--regularizer``, ``--niters``,
    ``--reg_weight``, ``--step_size``, ``--batch_fraction`` and
    ``--save_data``.  Unrecognized arguments are ignored.

    One tree is fitted on the true labels and ``num_learners`` trees on
    randomly sign-flipped labels; a linear model is then fitted on the
    trees' predictions.  The learners and the coefficients are pickled
    under the output path.  Exits with status -1 when ``--input`` is
    missing.
    """
    # Get program options
    input_path = ""
    num_learners = 1
    num_parts = 1
    output_path = '/filer/tmp1/yw298/spark/output/'
    fs = 'file:'
    save_data = 0

    # Parameters for base learner
    max_depth = None
    max_features = None
    min_samples_leaf = 1
    min_samples_split = 2

    # Parameters for coefficient fitting
    regularizer = None
    niters = 100
    reg_weight = 1.0
    step_size = 1.0
    batch_frac = 1

    for option in sys.argv:
        # partition() splits on the first '=' only, so values that contain
        # '=' themselves (e.g. ".../dt=2020/") are kept whole.
        name, _, value = option.partition('=')
        if name == '--input':
            input_path = str(value)
        elif name == '--fs':
            fs = str(value)
        elif name == '--num_learners':
            # One learner is always trained on the true labels (key 0), so
            # only num_learners - 1 corrupted-label learners are requested.
            num_learners = int(value) - 1
        elif name == '--num_parts':
            num_parts = int(value)
        elif name == '--max_depth':
            max_depth = int(value)
        elif name == '--max_features':
            max_features = int(value)
        elif name == '--min_samples_leaf':
            min_samples_leaf = int(value)
        elif name == '--min_samples_split':
            min_samples_split = int(value)
        elif name == '--output':
            output_path = str(value)
        elif name == '--regularizer':
            regularizer = str(value)
        elif name == '--niters':
            niters = int(value)
        elif name == '--reg_weight':
            reg_weight = float(value)
        elif name == '--step_size':
            step_size = float(value)
        elif name == '--batch_fraction':
            batch_frac = float(value)
        elif name == '--save_data':
            save_data = int(value)

    # Echo the effective settings.
    for label, setting in (
            ('input_path', input_path),
            ('num_learners', num_learners),
            ('num_parts', num_parts),
            ('output_path', output_path),
            ('max_depth', max_depth),
            ('max_features', max_features),
            ('min_samples_leaf', min_samples_leaf),
            ('min_samples_split', min_samples_split),
            ('regularizer', regularizer),
            ('niters', niters),
            ('reg_weight', reg_weight),
            ('step_size', step_size),
            ('file_system', fs),
            ('batch_fraction', batch_frac),
            ('save_data', save_data)):
        print('>>> %s = %s' % (label, str(setting)))

    if input_path == "":
        sys.stderr.write("Usage: parallel boosting training <file>\n")
        # sys.exit() is always available; the bare exit() helper is only
        # installed by the site module.
        sys.exit(-1)

    # Initialize Spark
    conf = SparkConf()
    sc = SparkContext(conf=conf)

    # Map function of mapping training data to each learner (1):
    # Randomly partitioning the entire dataset.
    # (Not used by the pipeline below; kept as the alternative strategy.)
    def func_pmap_rndpartition(p_iter):
        rnd.seed()
        for (k, v) in p_iter:
            yval = v[0]
            xvec = v[1]
            kv_pair = (rnd.randint(1, num_learners + 1), (yval, xvec))
            yield kv_pair

    # Map function of mapping training data to each learner (2):
    # Mapping each example to num_learners copies whose labels have their
    # sign flipped with probability 0.5.
    def func_pmap_rndlabeling(p_iter):
        rnd.seed()
        for v in p_iter:
            yval = v[0]
            xvec = v[1]
            # Emitting the training set with true labels
            yield (0, (yval, xvec))
            # Emitting the training sets with corrupted labels
            for tid in range(num_learners):
                coin = rnd.random()
                yval_rnd = -yval if coin < 0.5 else yval
                yield (tid + 1, (yval_rnd, xvec))

    # Def of mapping function for training each learner
    def func_train_learner(key_and_data):
        key, data = key_and_data
        yvec = []
        xmat = []
        for (yval, xvec) in data:
            # Append label and feature values
            xmat.append(xvec)
            yvec.append(yval)
        # Train learner
        learner = tree.DecisionTreeRegressor(
            max_depth=max_depth,
            max_features=max_features,
            min_samples_leaf=min_samples_leaf,
            min_samples_split=min_samples_split)
        learner.fit(xmat, yvec)
        # Emitting the trained learner
        return (key, learner)

    # Hypothesis sampling
    train_data_HS = (sc.pickleFile(fs + input_path)
                     .repartition(num_parts)
                     .persist(StorageLevel.MEMORY_AND_DISK))
    train_data_map_HS = train_data_HS.mapPartitions(func_pmap_rndlabeling).combineByKey(
        createCombiner=lambda v: [v],
        mergeValue=lambda c, v: c + [v],
        mergeCombiners=lambda c1, c2: c1 + c2)
    learner_class = train_data_map_HS.map(func_train_learner).collect()
    learner_class_broadcast = sc.broadcast(learner_class)

    # Map function of generating training data for coefficient fitting:
    # feature i of the resulting point is the prediction of learner i.
    def func_map_ypredmat(v):
        yval = v[0]
        xvec = v[1]
        values = np.zeros(num_learners + 1)
        for l in learner_class_broadcast.value:
            values[l[0]] = l[1].predict(xvec)
        return LabeledPoint(yval, values)

    # Coefficient fitting
    train_data_CF = (train_data_HS
                     .map(func_map_ypredmat)
                     .persist(StorageLevel.MEMORY_AND_DISK))
    coeffs = LinearRegressionWithSGD.train(
        data=train_data_CF,
        iterations=niters,
        step=step_size,
        regType=regularizer,
        regParam=reg_weight,
        miniBatchFraction=batch_frac,
        intercept=False)

    # NOTE(review): coefficients are numbered from 1 while learner keys
    # start at 0 -- confirm consumers pair the two lists by position.
    # enumerate() already yields ascending indices, so no extra sort is needed.
    coeffs_list = list(enumerate(coeffs.weights, start=1))
    learner_class_list = sorted(learner_class, key=lambda kv: kv[0])

    if save_data:
        # Save the raw training data
        train_data_HS.saveAsPickleFile(path=fs + output_path + '/train_data', batchSize=10240)
        # Save the coeff-fit training data
        train_data_CF.saveAsPickleFile(path=fs + output_path + '/coeff_data', batchSize=10240)

    # Save the learner class and fitted coefficients.  Pickle output must go
    # to a binary-mode file, and `with` closes it even if dump() fails.
    with open(output_path + '/learner_class', 'wb') as learner_file:
        pk.dump(learner_class_list, learner_file)
    with open(output_path + '/fitted_coeffs', 'wb') as coeffs_file:
        pk.dump(coeffs_list, coeffs_file)

    sc.stop()