def main(training_file, n):
    epochs = int(n)
    x, y, tags = read_training_data(training_file)
    v = {}
    sc = SparkContext(appName="parameterMixing")
    tags = sc.broadcast(tags)
    time0 = time.time()
    training_data = []
    for i in range(len(x)):
        training_data.append((x[i], y[i]))
    train_data = sc.parallelize(training_data).cache()
    for round in range(0, epochs):
        fv = sc.broadcast(v)
        feat_vec_list = train_data.mapPartitions(lambda t: perc_train(t, tags.value, fv.value))
        feat_vec_list = feat_vec_list.combineByKey((lambda x: (x, 1)),
                                                   (lambda x, y: (x[0] + y, x[1] + 1)),
                                                   (lambda x, y: (x[0] + y[0], x[1] + y[1]))).collect()
        # average each feature weight across partitions (parameter mixing)
        for (feat, (a, b)) in feat_vec_list:
            v[feat] = float(a) / float(b)
    sc.stop()
    # Compute the weight vector using the Perceptron algorithm
    # trainer.perceptron_algorithm(5)
    print "%d iterations in %f seconds" % (epochs, time.time() - time0)
    # Write out the final weight vector
    write_weight_vector(v)
def createContext():
    uBATCH_INTERVAL = 10
    sc = SparkContext(SPARK_MASTER, appName="StreamingKafka")
    sc.broadcast(batchUserPostDict)
    sc.broadcast(batchPostUserDict)
    # sc = SparkContext("local[*]", appName="StreamingKafka")
    # streaming batch interval of 5 sec first, and reduce later to 1 sec or lower
    ssc = StreamingContext(sc, uBATCH_INTERVAL)
    ssc.checkpoint(CHECKPOINT_DIR)  # set checkpoint directory in HDFS
    # ssc.checkpoint(10 * uBATCH_INTERVAL)
    return ssc

ssc = StreamingContext.getOrCreate(CHECKPOINT_DIR, createContext)
def geneSpark(input_filename, output_filename, upstream_bp=2000, downstream_bp=500):
    '''
    Performs geneSpark extensions given an `input_filename` and stores the
    output in `output_filename`

    Parameters
    ----------
    input_filename : string
        path to the GTF file
    output_filename : string
        path to the output extended GTF file
    upstream_bp : int (default=2000)
        extend upstream of the first exon of each gene
    downstream_bp : int (default=500)
        extend downstream of the last exon of each gene
    '''
    # create spark context
    sc = SparkContext(appName="geneSpark")

    # set up broadcast variables
    upstream_bp_var = sc.broadcast(upstream_bp)
    downstream_bp_var = sc.broadcast(downstream_bp)

    # create temporary folder where to store the output chunks
    tempFile = NamedTemporaryFile(delete=True)
    tempFile.close()

    # define the spark pipeline
    (sc.textFile(input_filename)
       .map(lambda x: x.split('\t'))
       .filter(lambda x: x[2] == 'exon')
       .map(parse_line)
       .reduceByKey(min_and_max)
       .sortByKey()
       .map(partial(geneSpark,
                    upstream_bp=upstream_bp_var,
                    downstream_bp=downstream_bp_var))
       .saveAsTextFile(tempFile.name))

    # merge output chunks to single output_filename
    with open(output_filename, 'w') as fw:
        for line in input(sorted(glob(tempFile.name + "/part-000*"))):
            fw.write(line)

    sc.stop()
def main():
    """Process the input file given as a command-line argument."""
    global stop_words, punctuations

    input_file, feature_dimensions, num_clusters, max_iterations, runs = _parse_cmd_line_args()
    sc = SparkContext(conf=_get_conf("CS-838-Assignment3-PartB"))

    # for the _tokenize function to remove stopwords and punctuations
    stop_words = sc.broadcast(set(stopwords.words("english")))
    punctuations = sc.broadcast(set(string.punctuation))

    input_text_rdd, tfidf_vectors_rdd = get_feature_vectors(sc, input_file, feature_dimensions)
    model = build_cluster_model(tfidf_vectors_rdd, num_clusters, max_iterations, runs)
    top_n_in_each_cluster(sc, input_text_rdd, tfidf_vectors_rdd, model, 5)
def main():
    # Ensure a search term was supplied at the command line
    if len(sys.argv) != 2:
        sys.stderr.write("Usage: {} <search_term>".format(sys.argv[0]))
        sys.exit()

    # Create the SparkContext
    sc = SparkContext(appName="SparkWordCount")

    # Broadcast the requested term
    requested_movie = sc.broadcast(sys.argv[1])

    # Load the input file
    source_file = sc.textFile("/user/hduser/input/movies")

    # Get the movie title from the second field
    titles = source_file.map(lambda line: line.split("|")[1])

    # Create a map of the normalized title to the raw title
    normalized_title = titles.map(lambda title: (re.sub(r"\s*\(\d{4}\)", "", title).lower(), title))

    # Find all movies matching the requested_movie
    matches = normalized_title.filter(lambda x: requested_movie.value in x[0])

    # Collect all the matching titles
    matching_titles = matches.map(lambda x: x[1]).distinct().collect()

    # Display the result
    print "{} Matching titles found:".format(len(matching_titles))
    for title in matching_titles:
        print title

    sc.stop()
def main(name, divide):
    """
    old_g = pickle.load(open("/net/data/facebook/facebook-ucsb/Facebook_2008/"+name +"/original_pickles/"+name +".pickle", 'r'))
    new_g = networkx.Graph()
    for node, friends in old_g.adj.iteritems():
        if node not in new_g.nodes():
            new_g.add_node(node)
        for friend in friends.iterkeys():
            new_g.add_node(friend)
            new_g.add_edge(node, friend)
    """
    # Serialize the networkx graph as an edgelist text file for workers to read.
    # networkx.write_edgelist(new_g, "edgelist/"+name, data=False)
    # subprocess.check_call("hdfs dfs -put edgelist/"+name+ " edgelist/", shell=True)
    new_g = networkx.read_adjlist(name + "_list.txt")  # Egypt_list is an edge list
    sc = SparkContext(appName="Sorted_removal")

    dataG = json_graph.node_link_data(new_g)
    stringG = json.dumps(dataG)
    originalG = sc.broadcast(stringG)

    edges = sc.textFile("hdfs://scrapper/user/xiaofeng/edgelist/" + name, 192 * 4 * int(divide))
    costs = edges.map(lambda line: line.split(" ")).map(lambda edge: edge_to_cost(edge, originalG.value))
    costs.saveAsTextFile("hdfs://scrapper/user/xiaofeng/costs_" + name)
    sc.stop()

    subprocess.check_call("hdfs dfs -get costs_" + name + " /home/xiaofeng/facebook/FacebookProject/costs/", shell=True)
    Reformat("/home/xiaofeng/facebook/FacebookProject/costs/costs_" + name + "/", name)
def run(date):
    """
    Load the business-provided rules from HDFS and wrap them as FunnelRule objects,
    e.g. [FunnelRule(funnelId=u'1496', ruleId=u'896', level=u'1', requestRule=u'contains')]
    """
    sc = SparkContext(appName="readHdfsFile", master=conf.sparkURL)
    rulesList = readFile(sc, conf.dim_model_url_new).flatMap(lambda line: line.split('\r\n')).map(buildBean).collect()
    # OrderedDict(
    rules_lookup = sc.broadcast(rulesList)
    """
    step 2: load the clickstream log, compare it against the rule table, drop invalid
            log lines, and build the structure used for later analysis (in 1 -----> out N+)
    step 4: generate new keys
    step 5:
    """
    """
    >>> rdd2 = sc.parallelize([['1\t1', ['1', '1', '2', 'a']], ['1\t1', ['1', '1', '1', 'b']], ['2\t1', ['2', '1', '1', 'b']]])
    >>> rdd2.groupByKey().map(lambda line: list(line[1])).filter(lambda x: x[0][0] == '1').flatMap(lambda x: x).collect()
    [['1', '1', '2', 'a'], ['1', '1', '1', 'b']]
    """
    # conf.click_jr_log_url_dir + "/dt=" + date
    clickLogRDD = readFile(sc, "/funnelNew/input/click_log/000000_0").map(rowSplit)
    clickLogRDD1 = clickLogRDD.flatMap(lambda line: funnelFilter.getList(line[0], rules_lookup)).groupByKey()\
        .map(lambda line: line[1]).filter(reduceFilter).flatMap(lambda x: x).map(countSessionKey).\
        partitionBy(1).reduceByKey(add)
    clickLogRDD1.saveAsTextFile("/funnelNew/output/dt=" + date)
def _train_spark(data, n_components, n_pc, covar_types, verbose, n_jobs, n_iter_search):
    # Spark configuration.
    conf = (SparkConf()
            .setMaster("local[" + str(n_jobs) + "]")
            .setAppName("FDD")
            .set("spark.executor.memory", "512mb")
            .set("spark.cores.max", str(n_jobs)))
    sc = SparkContext(conf=conf)
    # Build hyperparameter vectors.
    parameters = cartesian((n_components, n_pc, covar_types))
    # Distribute the hyperparameters vector.
    parameters_rdd = sc.parallelize(parameters, 96)
    # Broadcast the data to all workers.
    data_broadcast = sc.broadcast(data)
    # Train a model for each hyperparameter set.
    models = parameters_rdd.map(lambda param: train_with_parameters(param, data_broadcast))
    # Persist the models to avoid re-computation.
    models.persist(StorageLevel(True, True, False, True, 1))
    # Sort by BIC.
    sorted_models = models.sortBy(lambda model: model[0])
    # The first is the best model.
    best_model = sorted_models.collect()[0][1]
    sc.stop()
    return best_model
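# train_with_parameters is referenced above but not shown. A minimal, hypothetical
# sketch of what such a function might look like, assuming each hyperparameter triple
# is (n_components, n_pc, covar_type), the broadcast value is a numpy feature matrix,
# and the function returns (BIC, fitted_model) so the driver can sort by model[0]:
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture


def train_with_parameters(param, data_broadcast):
    n_components, n_pc, covar_type = int(param[0]), int(param[1]), str(param[2])
    X = data_broadcast.value
    X_reduced = PCA(n_components=n_pc).fit_transform(X)  # keep n_pc principal components
    gmm = GaussianMixture(n_components=n_components,
                          covariance_type=covar_type).fit(X_reduced)
    return (gmm.bic(X_reduced), gmm)  # lower BIC is better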
def count_triangles(data, master="local[2]"):
    """
    @brief: Count triangles using Spark
    @param data: The data location for the input files
    @param master: The master URL as defined at
        https://spark.apache.org/docs/1.1.0/submitting-applications.html#master-urls
    """
    ################# NO EDITS HERE ###################
    assert not os.path.exists("triangles.out"), "File: triangles.out \
already exists"
    sc = SparkContext(master, "Triangle Count")
    start = time()
    ###############  END NO EDITS HERE  ################
    # TODO: Your code goes here!
    people = sc.textFile(data)
    AdjList = people.map(makepair)
    DriverAdj = dict(AdjList.collect())
    WorkerAdj = sc.broadcast(DriverAdj)
    Edges = AdjList.flatMapValues(lambda x: x)
    TriSet = Edges.map(lambda (k, v): ((k, v), AintersectB(k, v, WorkerAdj.value)))
    Triangle = TriSet.flatMapValues(lambda x: x)\
                     .map(lambda (k, v): tuple(sorted([int(v), int(k[0]), int(k[1])], reverse=True)))
    output = set(Triangle.collect())
    ################# NO EDITS HERE ###################
    print "\n\n*****************************************"
    print "\nTotal algorithm time: %.4f sec \n" % (time() - start)
    print "*****************************************\n\n"
    ###############  END NO EDITS HERE ################
    with open("triangles.out", "wb") as f:
        for friends in output:
            f.write(str(friends[0]) + " " + str(friends[1]) + " " + str(friends[2]) + "\n")
        # TODO: Loop with f to write your result to file serially
        pass
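# makepair and AintersectB are referenced above but not defined in this snippet.
# A minimal sketch under an assumed input format "person: friend1,friend2,..." (the
# names and input format are assumptions, not the assignment's actual helpers):
def makepair(line):
    # "1: 2,3,4" -> ("1", ["2", "3", "4"])
    person, friends = line.split(":")
    return (person.strip(), [f.strip() for f in friends.split(",") if f.strip()])


def AintersectB(a, b, adj):
    # common neighbours of a and b; each one closes a triangle (a, b, x)
    return list(set(adj.get(a, [])) & set(adj.get(b, [])))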
def SparkBroadcastAccumulator(n):
    global broadcast_var
    global accumulator_var
    spcon = SparkContext("local[2]", "SparkBroadcastAccumulator")
    broadcast_var = spcon.broadcast("broadcast_message")
    accumulator_var = spcon.accumulator(0)
    spcon.parallelize(xrange(1, n)).foreach(lambda x: broadcast_accumulator_receiver(accumulator_var.add(x)))
def SLAPmi_initialize_spark(fullpath):
    D = io.loadmat(fullpath, struct_as_record=False, squeeze_me=True)

    obs = D['obs']
    opts = D['opts']

    Y = obs.data_in
    P0 = opts.P.T  # transpose
    Sk = D['Sk']
    Su = D['Su']
    if len(Su.shape) < 2:
        Su = Su[:, None]
    masks = D['masks']

    # S = Sk
    # S = np.concatenate((Sk,Su), axis=1)

    def P(frame):
        return P0

    def solveOneFrame(frameDataIn):
        # frameData has structure [framenumber, y[:, framenumber]]
        Pt = P(frameDataIn[0])
        # PSk = np.zeros((Pt.shape[0], Sk.shape[0]))
        # for Sk_ix in range(len(Sk)):
        #     PSk[:, Sk_ix] = Pt[:, masks[:, Sk_ix].toarray()[:, 0]].dot(Sk[Sk_ix])
        # code.interact(local=locals())
        PSk = Pt.dot(Sk_bc.value).toarray()
        PSu = Pt.dot(Su_bc.value)
        PS = np.concatenate((PSk, PSu), axis=1)
        F = optimize.nnls(PS, frameDataIn[1])
        # code.interact(local=locals())
        return F[0]

    # code.interact(local=locals())

    conf = SparkConf().setAppName('SLAPmi_initialize')
    sc = SparkContext(conf=conf)

    Sk_bc = sc.broadcast(Sk)
    Su_bc = sc.broadcast(Su)

    frameData = [(i, Y[:, i]) for i in range(Y.shape[1])]
    F_solved = np.array(sc.parallelize(frameData, len(frameData)).map(solveOneFrame).collect())

    # print 'F_solved', F_solved.shape
    print 'Sk', Sk.shape
    print 'Su', Su.shape

    Fk = F_solved[:, 0:Sk.shape[1]].T
    Fu = F_solved[:, Sk.shape[1]:(Sk.shape[1] + Su.shape[1])].T

    return Sk, Su, Fk, Fu, obs, opts, masks, D['ground_truth']
class BroadcastTest(unittest.TestCase):

    def tearDown(self):
        if getattr(self, "sc", None) is not None:
            self.sc.stop()
            self.sc = None

    def _test_encryption_helper(self, vs):
        """
        Creates a broadcast variable for each value in vs, and runs a simple job to make
        sure the value is the same when it's read in the executors. Also makes sure there
        are no task failures.
        """
        bs = [self.sc.broadcast(value=v) for v in vs]
        exec_values = self.sc.parallelize(range(2)).map(lambda x: [b.value for b in bs]).collect()
        for ev in exec_values:
            self.assertEqual(ev, vs)
        # make sure there are no task failures
        status = self.sc.statusTracker()
        for jid in status.getJobIdsForGroup():
            for sid in status.getJobInfo(jid).stageIds:
                stage_info = status.getStageInfo(sid)
                self.assertEqual(0, stage_info.numFailedTasks)

    def _test_multiple_broadcasts(self, *extra_confs):
        """
        Test broadcast variables make it OK to the executors. Tests multiple broadcast
        variables, and also multiple jobs.
        """
        conf = SparkConf()
        for key, value in extra_confs:
            conf.set(key, value)
        conf.setMaster("local-cluster[2,1,1024]")
        self.sc = SparkContext(conf=conf)
        self._test_encryption_helper([5])
        self._test_encryption_helper([5, 10, 20])

    def test_broadcast_with_encryption(self):
        self._test_multiple_broadcasts(("spark.io.encryption.enabled", "true"))

    def test_broadcast_no_encryption(self):
        self._test_multiple_broadcasts()

    def _test_broadcast_on_driver(self, *extra_confs):
        conf = SparkConf()
        for key, value in extra_confs:
            conf.set(key, value)
        conf.setMaster("local-cluster[2,1,1024]")
        self.sc = SparkContext(conf=conf)
        bs = self.sc.broadcast(value=5)
        self.assertEqual(5, bs.value)

    def test_broadcast_value_driver_no_encryption(self):
        self._test_broadcast_on_driver()

    def test_broadcast_value_driver_encryption(self):
        self._test_broadcast_on_driver(("spark.io.encryption.enabled", "true"))
def process(master, input_container, output_container):
    sc = SparkContext(master, "CDNBilling")

    # load broadcast variables
    countryMapRDD = sc.textFile(input_container + "/country_map.tsv")
    countryMapList = countryMapRDD.collect()
    sc.broadcast(countryMapList)
    countryMapDict.update(createCountryDict(countryMapList))

    # load domainLogs
    domainsRawRDD = sc.textFile(input_container + "/domains_map.tsv")
    domainsRDD = domainsRawRDD.map(formatDomainsLine)

    # load logs
    logsRDD = sc.textFile(input_container + "/raxcdn_*.gz")

    # drop the header
    actual_log_lines = logsRDD.filter(lambda x: x[0] != '#')

    # filter by date
    filteredRDD = actual_log_lines.filter(filterByDate)

    # format the data
    formattedRDD = filteredRDD.map(formatLogLine, countryMapDict)

    # Zero event domains
    domains_unused = domainsRDD.subtractByKey(formattedRDD)
    domains_unused_formatted = domains_unused.map(formatUnusedDomain)

    # for each domain, calculate bandwidth and request count
    aggregatedLogs = formattedRDD.combineByKey(createCombiner, mergeValue, mergeCombiners)

    # add type of domain, project-ID, service-ID
    joinedWithDomainDetails = aggregatedLogs.join(domainsRDD)

    # join the usage logs with domains map including zero events
    joinedLogs = joinedWithDomainDetails.union(domains_unused_formatted)

    # save the output
    joinedLogs.saveAsTextFile(output_container + "/output-files")

    sc.stop()
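# createCombiner, mergeValue and mergeCombiners are not shown in this snippet. A
# minimal sketch of the usual combineByKey triple, assuming each formatted record's
# value is the number of bytes transferred and the per-domain aggregate is
# (total_bytes, request_count):
def createCombiner(bytes_transferred):
    return (bytes_transferred, 1)


def mergeValue(acc, bytes_transferred):
    return (acc[0] + bytes_transferred, acc[1] + 1)


def mergeCombiners(acc1, acc2):
    return (acc1[0] + acc2[0], acc1[1] + acc2[1])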
def main():
    conf = SparkConf().setAppName("Test2")
    sc = SparkContext(conf=conf)

    # new_dict turns a <tuple, value> pair into a <tuple_1, dict(tuple_2, value)> pair
    def new_dict(line):
        Dict = dict()
        Dict[line[0][1]] = line[1]
        return (line[0][0], Dict)

    # read the raw files as <file, content> pairs
    data_raw = sc.wholeTextFiles("/home/djt/data/proclassified")

    # Doc splits the content of a <file, content> pair by line; each line is one verdict
    def Doc(line):
        s = line[1].split("\n")
        return s[0:len(s) - 1]

    # <file, content> pairs => <verdict path, verdict content> pairs
    data = data_raw.flatMap(Doc)

    # turn a verdict path into an ID
    def DocID(string):
        s = filter(lambda x: x.isdigit(), string)
        return s[1:len(s)]

    # <verdict path, verdict content> => <verdict ID, verdict content>
    data_wordsplit = data.map(lambda line: (DocID(line.split(",<")[0]), line.split(",<")[1].split(" ")))

    # remove the spaces left by word segmentation so the regexes can match later
    def Doc_Integration(line):
        doc = ""
        for k in line[1]:
            doc += k
        return (line[0], doc)

    # <verdict ID, content (with spaces)> => <verdict ID, content>
    data_doc = data_wordsplit.map(Doc_Integration)

    # read the candidate dimensions from keywords_crime.txt and compile them as regexes
    keywords_raw = sc.textFile("/home/djt/data/keywords_crime.txt")
    keywords = keywords_raw.map(lambda line: re.compile(line)).collect()

    # broadcast the <dimension, set(keywords)> pairs
    keywords = sc.broadcast(keywords)

    # match every corruption-related offence (i.e. charge) that appears in each verdict
    def keywords_stats(line):
        doc = line[1]
        # doc is the verdict text; value[0] is the compiled regex
        temp = keywords.value[0].findall(doc)
        crime_set = set(temp)
        crime = ""
        for k in crime_set:
            crime += "\t" + k
        return (line[0], crime)

    # raw: <verdict ID, all charges that appear>
    raw = data_doc.map(keywords_stats)
    after = raw.sortByKey()

    # write the output
    res = after.map(lambda (k, v): k + "\t" + v)
    res.saveAsTextFile("/home/djt/data/out")
def computeMinHashSig(K, N, rdd):
    """
    :param K: number of random hash functions (i.e., the number of rows of the signature matrix)
    :param N: maximum number of elements in any of the considered sets
    :param rdd: RDD where each record contains one set represented as a sorted list of 32-bit
                integers from the range [1, ..., N]
    :return: RDD containing the signature matrix, stored column-wise. That is, one record holds
             the K entries that correspond to the signature of one set
    """
    sc = SparkContext(appName="PythonMinhash")

    # first choose a set of K random hash functions h1, ..., hK (described in lecture 5 on slide 33)
    hashParams = sc.broadcast(generateHashParams(K))

    data = sc.parallelize(rdd)
    sig = data.map(lambda x: computeSig(hashParams.value, N, x))
    return sig.collect()
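# generateHashParams and computeSig are not defined here. A minimal sketch, assuming
# hash functions of the form h(x) = (a*x + b) mod p with a fixed prime p >= N (the
# names and hashing scheme are assumptions, not the original implementation):
import random

PRIME = 2147483647  # Mersenne prime larger than any 31-bit set element


def generateHashParams(K):
    # K (a, b) pairs defining h_i(x) = (a*x + b) % PRIME
    return [(random.randint(1, PRIME - 1), random.randint(0, PRIME - 1)) for _ in range(K)]


def computeSig(hashParams, N, elems):
    # one signature column: for each hash function, the minimum hash value over the set
    return [min(((a * x + b) % PRIME) for x in elems) for (a, b) in hashParams]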
def main():
    parser = argparse.ArgumentParser(
        description='process some log messages, storing them and signaling '
                    'a rest server')
    parser.add_argument('--mongo', help='the mongodb url',
                        required=True)
    parser.add_argument('--rest', help='the rest endpoint to signal',
                        required=True)
    parser.add_argument('--port', help='the port to receive from '
                        '(default: 1984)',
                        default=1984, type=int)
    parser.add_argument('--appname', help='the name of the spark application '
                        '(default: SparkharaLogCounter)',
                        default='SparkharaLogCounter')
    parser.add_argument('--master',
                        help='the master url for the spark cluster')
    parser.add_argument('--socket',
                        help='the socket ip address to attach for streaming '
                        'text data (default: caravan-pathfinder)',
                        default='caravan-pathfinder')
    parser.add_argument('--model',
                        help='the serialized model to use',
                        default='model.json')
    args = parser.parse_args()
    mongo_url = args.mongo
    rest_url = args.rest
    model = args.model

    sconf = SparkConf().setAppName(args.appname)
    if args.master:
        sconf.setMaster(args.master)
    sc = SparkContext(conf=sconf)
    ssc = StreamingContext(sc, 1)

    somv = fromJSON(model)
    som = sc.broadcast(somv)

    log4j = sc._jvm.org.apache.log4j
    log4j.LogManager.getRootLogger().setLevel(log4j.Level.WARN)

    lines = ssc.socketTextStream(args.socket, args.port)
    lines.foreachRDD(lambda rdd: process_generic(rdd, mongo_url, rest_url, som))

    ssc.start()
    ssc.awaitTermination()
def spark_batch(sc: SparkContext, feature_names, question_db: str, guess_db: str,
                granularity='sentence'):
    sql_context = SQLContext(sc)
    question_db = QuestionDatabase(question_db)

    log.info("Loading Questions")
    questions = question_db.guess_questions()

    log.info("Loading Guesses")
    guess_list = GuessList(guess_db)
    guess_lookup = guess_list.all_guesses(allow_train=True)

    log.info("Loading tasks")
    tasks = [Task(q, guess_lookup[q.qnum]) for q in questions]
    shuffle(tasks)
    log.info("Number of tasks: {0}".format(len(tasks)))

    features = {name: instantiate_feature(name, question_db) for name in feature_names}
    b_features = sc.broadcast(features)

    def f_eval(x):
        return evaluate_feature_question(x, b_features, granularity)

    log.info("Beginning feature job")
    feature_rdd = sc.parallelize(tasks)\
        .repartition(150 * len(feature_names))\
        .flatMap(f_eval)

    feature_df = sql_context.createDataFrame(feature_rdd, SCHEMA).cache()
    feature_df.count()
    log.info("Beginning write job")
    for fold in FOLDS:
        feature_df_with_fold = feature_df.filter('fold = "{0}"'.format(fold)).cache()
        for name in feature_names:
            filename = 'output/features/{0}/sentence.{1}.parquet'.format(fold, name)
            os.makedirs(os.path.dirname(filename), exist_ok=True)
            feature_df_with_fold.filter('feature_name = "{0}"'.format(name))\
                .write.save(filename, mode='overwrite')
        feature_df_with_fold.unpersist()
    log.info("Computation Completed, stopping Spark")
    sc.stop()
def main():
    # master = 'local[2]'
    master = 'spark://192.168.9.164:7077'
    app_name = 'test-broadcast'
    # spark_home = '/data01/app/bigdata/spark'  # local
    spark_home = '/home/hadoop/app/spark'  # test

    pyFiles = ['mysql_utils.py']
    spark_conf = SparkConf()
    spark_conf.setMaster(master).setAppName(app_name).setSparkHome(spark_home)
    sc = SparkContext(conf=spark_conf)
    for path in (pyFiles or []):
        sc.addPyFile(path)

    external_cache = get_api_deviceinfo()
    deviceinfo_b = sc.broadcast(external_cache)

    sc.stop()
def feature_to_fdata(file_name):
    from pyspark import SparkContext

    def handle(x):
        line = x.split("\t")
        return line[0], line[1:]

    sc = SparkContext(appName="feature_to_fdata")
    data = sc.textFile(file_name)
    result = data.map(handle).reduceByKey(lambda x, y: list(x) + list(y))

    transform_set = read_transform("/home/wangzhe/ccf/data/feature/transform.txt")
    transform_broadcast = sc.broadcast(transform_set)

    def handle2(x):
        uid, values = x
        label = '1' if uid in transform_broadcast.value else '0'
        value_map = {}
        for item in values:
            key, value = item.split(":")
            value_map[key] = float(value)
        return uid, label, value_map

    return result.map(handle2)
class TFIDF():

    def __init__(self, input_path, output_path):
        self.input = input_path
        self.output = output_path
        self.texts = glob(self.input + '/*.txt')
        self.conf = SparkConf().setAppName('tfidf')\
                               .setMaster('local')\
                               .set('spark.executor.memory', '1g')
        self.sc = SparkContext(conf=self.conf)

    def writeToCSVFile(self, rdd):
        with open(self.output + '/tfidf-scores.csv', 'wb') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(['docID', 'word', 'score'])
            writer.writerows(rdd)

    def run(self):
        # Job 1: Word Frequency in Documents.
        tfilter = TextFilter().filter
        wcRDD = self.sc.emptyRDD()
        for dkey, textfile in enumerate(self.texts):
            tf = self.sc.textFile(textfile)\
                     .filter(lambda line: len(line.strip()) > 0)\
                     .flatMap(lambda line: tfilter(line))\
                     .map(lambda word: ((word, dkey), 1))\
                     .reduceByKey(operator.add)
            N = tf.map(lambda ((w, d), y): y).sum()
            tf = tf.map(lambda ((w, d), y): ((w, d), (y, N)))
            wcRDD = self.sc.union([wcRDD, tf])

        # Job 2: Word Frequency in Corpus & Calculate TF-IDF.
        D = self.sc.broadcast(len(self.texts))
        wcRDD = wcRDD.map(lambda ((w, d), (a, b)): (w, (d, a, b)))
        wfRDD = wcRDD.map(lambda (w, (d, a, b)): (w, 1)).reduceByKey(operator.add)
        tfidf = wcRDD.join(wfRDD).map(lambda (w, ((d, a, b), c)): ((d, -a/b * np.log(D.value/c), w), 1))\
                     .sortByKey(True).map(lambda ((d, z, w), a): (d, w, -z))
        self.writeToCSVFile(tfidf.collect())
def train(self, data, iterations, partitions=12):
    from pyspark import SparkContext
    sc = SparkContext()
    dataRDD = sc.parallelize(data).cache()
    for t in range(iterations):
        sigma = self._decay_func(self.sigma, t, iterations)
        lr = self._decay_func(self.lr, t, iterations)
        codebookBC = sc.broadcast(self.codebook)
        randomizedRDD = dataRDD.repartition(partitions)
        print "iter: %d, sigma: %.2f, lr: %.2f, error: %.4f" % \
            (t, sigma, lr, self.quantization_error(randomizedRDD.collect()))

        def train_partition(partition_data):
            localCodebook = codebookBC.value
            for elem in partition_data:
                (w_h, w_w) = winner(elem, localCodebook, self.w, self.h)
                g = gaussian(self.w, self.h, (w_h, w_w), sigma) * lr
                it = np.nditer(g, flags=['multi_index'])
                while not it.finished:
                    localCodebook[it.multi_index] += g[it.multi_index] * (elem - localCodebook[it.multi_index])
                    it.iternext()
            return [localCodebook]

        resultCodebookRDD = randomizedRDD.mapPartitions(train_partition)
        sumCodebook = resultCodebookRDD.reduce(lambda a, b: a + b)
        newCodebook = sumCodebook / float(partitions)
        self.codebook = newCodebook
nPart = 38 * 14 * 4 * 4
sDir = op.join(sHdfsDir, 'hg38.fa.nb.enc.gzip')
sPtter = op.join(sHdfsDir, 'ptter')
codec = "org.apache.hadoop.io.compress.GzipCodec"

# print default SparkConf
sf = SparkConf()
print sf.toDebugString()
sc = SparkContext(appName=sApp)

rdd = sc.textFile(sDir, use_unicode=False)
rdd = rdd.map(split2KV)
# lPtter = genPtter(rdd, 0.001, nPart)
# sc.parallelize(lPtter).saveAsTextFile(sPtter)
ptter = sc.broadcast(sc.textFile(sPtter, use_unicode=False).collect())

nTime = 4
nOne = nPart / nTime
lIndex = [i * nOne for i in xrange(1, nTime)]
s0 = ptter.value[lIndex[0]]
s1 = ptter.value[lIndex[1]]
s2 = ptter.value[lIndex[2]]
# print ptter.value[lIndex[0]], ptter.value[:lIndex[0]]
# print ptter.value[lIndex[0]], ptter.value[lIndex[1]], ptter.value[lIndex[0]:lIndex[1]]
# print ptter.value[lIndex[1]], ptter.value[lIndex[2]], ptter.value[lIndex[1]:lIndex[2]]
# print ptter.value[lIndex[2]], ptter.value[lIndex[2]:]

for i in xrange(4):
    sp.call('hdfs dfs -rm -r ' + op.join(sHdfsDir, 'nb.' + str(i)),
    except:
        return [(0, "x")]


def artistToAlias(line):
    tokens = line.split('\t')
    try:
        return [(int(tokens[0]), int(tokens[1]))]
    except:
        return [(9999, 0)]


def prepareRawUserArtistData(line, bArtistAlias):
    userID, artistId, count = map(int, line.split(' '))
    finalArtistID = bArtistAlias.value.get(artistId, artistId)
    return mlrecom.Rating(userID, finalArtistID, count)

###############################################################################
########################CODE###################################################

artistByID = rawArtistData.flatMap(lambda line: artistToId(line))

artistAlias = rawArtistAlias.flatMap(
    lambda line: artistToAlias(line)).collectAsMap()

bArtistAlias = sc.broadcast(artistAlias)

trainData = rawUserArtistData.map(
    lambda line: prepareRawUserArtistData(line, bArtistAlias)).cache()

model = mlrecom.ALS.trainImplicit(trainData, 10, 5, 0.01, 1.0)
# To get 1-item frequent pattern
one_item = _trans.flatMap(mineOneItem).reduceByKey(add).filter(
    lambda x: x[1] > SUPPORT_NUM).cache()
result_buffer = one_item.map(
    lambda x: str(x[0]) + ":" + str(float(x[1]) / TRANS_NUM))
if args.verbose:
    print "1-item pattern:"
    print result_buffer.collect()
# result_buffer.saveAsTextFile(args.output+"/1_item.out")

# To get 2-k item frequent pattern
frequent_pattern = one_item
for i in range(2, args.k + 1):
    child_pattern = getChildPattern(
        frequent_pattern.map(lambda x: x[0]).collect(), i)
    # print child_pattern
    if len(child_pattern) == 0:
        break
    broadcast_pattern = sc.broadcast(child_pattern)
    frequent_pattern = _trans.flatMap(mineItem).reduceByKey(add).filter(
        lambda x: x[1] > SUPPORT_NUM).cache()
    result_buffer = frequent_pattern.map(
        lambda x: str(x[0]) + ":" + str(float(x[1]) / TRANS_NUM))
    if args.verbose:
        print str(i) + "-item pattern:"
        print result_buffer.collect()
    # result_buffer.saveAsTextFile(args.output+"/"+str(i)+"_item.out")
    broadcast_pattern.unpersist()

stop = time.time()
if args.verbose:
    print "Complete! Time cost: {}".format(stop - start)
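# getChildPattern (and the mineItem closure that counts candidates against the
# broadcast_pattern) are not shown in this fragment. A rough, Apriori-style sketch of
# the candidate-generation step, assuming an i-item candidate is kept only when every
# (i-1)-item subset was frequent (names and pattern representation are assumptions):
from itertools import combinations


def getChildPattern(frequent_patterns, i):
    # normalize the (i-1)-item patterns to sorted tuples
    prev = set(tuple(sorted(p)) if isinstance(p, (tuple, list)) else (p,)
               for p in frequent_patterns)
    items = sorted(set(x for p in prev for x in p))
    return [c for c in combinations(items, i)
            if all(s in prev for s in combinations(c, i - 1))]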
        question, context_follwers, context_name, topicFollowers, topicNames,
        question_key, ans, anonymous
    ]


with open("answered_data_10k.in") as f:
    data = f.readlines()

N = int(data[0])
data = data[1:]
print N, len(data), data[0]

sc = SparkContext()
sqlContext = SQLContext(sc)
V = sc.broadcast(punctuations)
r = sc.parallelize(data)
r = r.map(lambda s: s.strip()).map(json.loads).map(getData)
r = r.take(10)
df = sqlContext.createDataFrame(r, [
    "question_text", "context_topic_followers", "context_topic_names",
    "topics_followers", "topics_name", "question_key", "__ans__", "anonymous"
])
df.show()
rdd = df.select("question_text").rdd
print rdd.take(2)
row = Row("cleaned_text")
k = rdd.map(lambda d: d["question_text"].lower()).map(lambda word: " ".join(
    [str(w) for w in word.split() if not w in stopword])).map(
        lambda word: ''.join(char for char in word
try:
    spark = SparkSession.builder.config(conf=conf).getOrCreate()
    logger.debug("Initialized spark session successfully")
except:
    logger.error("Fail to start spark session")

# Input the dataset
try:
    logger.debug("Start to read the input dataset")
    posts_df = spark.read.json(posts_file)
    tags_df = spark.read.csv(tags_file, header=True)
    selected_tags = pd.read_csv(selected_tags_file, header=None)
    local_tags_to_catId = dict(zip(selected_tags[0], list(selected_tags.index)))
    local_catId_to_tags = dict(zip(list(selected_tags.index), selected_tags[0]))
    tags_to_catId = sc.broadcast(local_tags_to_catId)
    catId_to_tags = sc.broadcast(local_catId_to_tags)
    tags_set = sc.broadcast(set(selected_tags[0]))
    logger.debug("Read in dataset successfully")
except:
    logger.error("Can't input dataset")

# Join posts_df and tags_df together and prepare training dataset
selected_tags_df = tags_df.filter(tags_df.Tag.isin(tags_set.value)).na.drop(how='any')
tags_questions_df = selected_tags_df.join(posts_df, "Id")
training_df = tags_questions_df.select(['Tag', 'Body', 'Id']).na.drop(how='any')
logger.debug("successfully get training_df")

# tokenize post texts and get term frequency and inverse document frequency
logger.debug("Start to generate TFIDF features")
artistByID = dict(rawArtistData.flatMap(lambda x: pairsplit(x)).collect())


def aliaslookup(alias):
    splitPair = alias.rsplit('\t')
    if len(splitPair) != 2:
        return []
    else:
        try:
            return [(int(splitPair[0]), int(splitPair[1]))]
        except:
            return []


artistAlias = rawArtistAlias.flatMap(lambda x: aliaslookup(x)).collectAsMap()
bArtistAlias = sc.broadcast(artistAlias)


def ratinglookup(x):
    userID, artistID, count = map(lambda line: int(line), x.split())
    finalArtistID = bArtistAlias.value.get(artistID)
    if finalArtistID is None:
        finalArtistID = artistID
    return Rating(userID, finalArtistID, count)


trainData = rawUserArtistData.map(lambda x: ratinglookup(x))
trainData.cache()

'''build model'''
model = ALS.trainImplicit(trainData, 10, 5)

'''test artist'''
workerList = []
scenarioList = []
for i in range(scenarioSize):
    workerList.append(Worker(i))
    scenarioList.append("Scenario " + str(i))

parallelWorkerList = sc.parallelize(zip(workerList, scenarioList)).persist()

for x in range(20):
    var1 = randint(100, 10000)
    var2 = randint(100, 10000)
    print("Initializing iteration " + str(x))
    print("Updating variables: ")
    print("\tvar1: " + str(var1))
    print("\tvar2: " + str(var2))
    broadcast1 = sc.broadcast(var1)
    broadcast2 = sc.broadcast(var2)
    # foreach() returns None, so collect the results with map() instead
    solved_values = parallelWorkerList.map(lambda item: do_iteration(
        item[0], item[1], broadcast1, broadcast2)).collect()
    assert len(solved_values) == scenarioSize

print("Total time: " + str(time.time() - startTime))
import plotly.tools as tls
tls.set_credentials_file(username='******', api_key='njpjllrdy0')

os.environ['SPARK_HOME'] = "/usr/local/spark"
sys.path.append("/usr/local/spark/python")
sys.path.append("/usr/local/spark/python/lib")

from pyspark import SparkConf, SparkContext

conf = SparkConf()
conf.setMaster("local[4]")
conf.setAppName("My app")
conf.set("spark.executor.memory", "8g")
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
sc = SparkContext(conf=conf)

months = {'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04', 'May': '05', 'Jun': '06',
          'Jul': '07', 'Aug': '08', 'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12'}
months = sc.broadcast(months)


class OSDataAnalysis(object):

    @staticmethod
    def urpd(line):
        # line = line.replace("\"", "", 10)
        if "::1" not in line:
            reqTime = re.search(r"\[([A-Za-z0-9_]+)(.+)\]", line).group()[1:-1]
            if re.search(r'\"([A-Za-z0-9_]+)(.+)\"', line) != None:
                request = re.search(r'\"([A-Za-z0-9_]+)(.+)\"', line).group().split('\"')[1]
            else:
                return False
            # request = re.search(r'\"([A-Za-z0-9_]+)(.+)\"', line).group().split('\"')[1]
            date = reqTime[:11]
            date = date[7:] + '-' + months.value[date[3:6]] + '-' + date[0:2]
# use pred_source_all_com_small training tree data
data_pred_source_all = sc.textFile("/data/mllib/pred_source_all_com_small").map(data_p_std)
# data_p_std = data.filter(filter_positive_data).sample(True, 50)  # resampling positive sample

# union data and data_p
data_trans_feature = data_ans.union(data_p_std).union(data_ans_0827)
# data_union = data_ans.sample(False, 0.5).union(data_p_std)  # balance data set by reducing negative data
data_union = data_ans.union(data_p_std.sample(True, 1.5))  # balance data set by reducing negative data

# get the unique features, broadcast the value
# col_na = range(80, 83) + [84] + range(87, 92) + [97]
col_na = range(80, 83) + range(87, 92) + [97]
fe = trans_fun(data_trans_feature, col_na)
class_col = sc.broadcast(col_na)
uni_f = sc.broadcast(fe)  # broadcast uni_feature list

# transform raw data to labeledPoint
parsed_data = data_union.map(feature_char_to_num)
# parsed_data = data_pred_source_all.map(feature_char_to_num)
numFeatures = -1
if numFeatures <= 0:
    parsed_data.cache()
    numFeatures = parsed_data.map(lambda x: -1 if x[1].size == 0 else x[1][-1]).reduce(max) + 1
labeled_data = parsed_data.map(lambda x: LabeledPoint(x[0], Vectors.sparse(numFeatures, x[1], x[2])))
# longitude = data.map(lambda x: float(x[1])).cache()  # extract field 2
latitude = data.map(lambda x: np.array(x[0]).astype(float))   # extract field 1
longitude = data.map(lambda x: np.array(x[1]).astype(float))  # extract field 2

coord1 = latitude.zip(longitude)  # Zip latitude with longitude to coord (spatial information)
# print type(coord1)
coord = coord1.zipWithIndex()  # Index the coordinate data --> 'coord' will be in format (coordinate, index)
coordData = coord.map(lambda (k, v): (v, k))  # make the index the key and the coordinate data the value
count = coordData.count()  # Count the number of points: count is global
countglobal = sc.broadcast(count)
ab = coord1.collect()
# print(ab[:5])
# print type(ab[0][1])
# print(ab[0][1])
tre = spatial.cKDTree(ab)
b1 = coord1.take(2)
aa = sc.broadcast(tre)
graph = coordData.map(lambda (k, v): (k, tre.query(v, 29)))
gra = graph.map(lambda (k, v): (k, v[1][1:]))
cra = gra.cartesian(gra)
# gra = graph.map(lambda (k, v): k)


def intersectionCount(k1, k2, v1, v2):
    countNum = 0
from pyspark import SparkConf, SparkContext


def loadMovieNames():
    movieNames = {}
    with open("ml-100k/u.item") as f:
        for line in f:
            fields = line.split('|')
            movieNames[int(fields[0])] = fields[1]
    return movieNames


conf = SparkConf().setMaster("local").setAppName("PopularMovies")
sc = SparkContext(conf=conf)

nameDict = sc.broadcast(loadMovieNames())

lines = sc.textFile("/Users/bjhav1/Documents/SparkCourse/ml-100k/u.data")
movies = lines.map(lambda x: (int(x.split()[1]), 1))
movieCounts = movies.reduceByKey(lambda x, y: x + y)

flipped = movieCounts.map(lambda (x, y): (y, x))
sortedMovies = flipped.sortByKey()

sortedMoviesWithNames = sortedMovies.map(lambda (count, movie): (nameDict.value[movie], count))

results = sortedMoviesWithNames.collect()

for result in results:
    print result
def parse_names(line):
    movie_names = {}
    fields = line.split("|")
    return (int(fields[0]), fields[1])


conf = SparkConf()  # .setMaster("local").setAppName("PopularMovies")
sc = SparkContext(conf=conf)

id_lines = sc.textFile("hdfs://...ml100k/u.item")
id_lines_rdd = id_lines.map(parse_names)
names_dict = id_lines_rdd.collectAsMap()  # creates key:value dict (id:movie)

# Sends our mapping dictionary we made, one time, to every node in the cluster and keeps it there
# so it's available when needed and all nodes will know it as the object names_dict
nameDict = sc.broadcast(names_dict)

lines = sc.textFile("hdfs://.../ml100k/u.data")
movies = lines.map(lambda x: (int(x.split()[1]), 1))
movieCounts = movies.reduceByKey(lambda x, y: x + y)

flipped = movieCounts.map(lambda x: (x[1], x[0]))
sortedMovies = flipped.sortByKey()

sortedMoviesWithNames = sortedMovies.map(lambda countMovie: (nameDict.value[countMovie[1]], countMovie[0]))

results = sortedMoviesWithNames.collect()

for result in results:
    print(result)
output_file = sys.argv[3]

sc = SparkContext(master, job_name)

# Create an accumulator initialized to 0
# This will be used to count the number of empty lines in the file
blank_lines = sc.accumulator(0)

file = sc.textFile(input_file)
call_signs = file.flatMap(extract_call_signs)
call_signs.count()  # Call an action so blank lines can be displayed
print('Blank lines: {0}'.format(blank_lines.value))

# Load call signs lookup table into a broadcast variable
sign_prefixes = sc.broadcast(load_call_lookup())
print(type(sign_prefixes))

###################################################
#### Numeric Stats. Switching to my NeoWs Data ####
###################################################
with open('../data/near_miss_data.csv') as csv_file:
    near_miss_data = csv.DictReader(csv_file)
    near_miss_distance = {(float(row['miss_distance_astronomical'])) for row in near_miss_data}

nm_data = sc.parallelize(near_miss_distance).persist()
stats = nm_data.stats()
stdev = stats.stdev()
mean = stats.mean()

print('Total number of near misses: {0}'.format(nm_data.count()))
# Helper functions for looking up the call signs
def lookupCountry(sign, prefixes):
    pos = bisect.bisect_left(prefixes, sign)
    return prefixes[pos].split(",")[1]


def loadCallSignTable():
    f = open("./files/callsign_tbl_sorted", "r")
    return f.readlines()


# Look up the locations of the call signs on the
# RDD contactCounts. We load a list of call sign
# prefixes to country code to support this lookup.
signPrefixes = sc.broadcast(loadCallSignTable())


def processSignCount(sign_count, signPrefixes):
    country = lookupCountry(sign_count[0], signPrefixes.value)
    count = sign_count[1]
    return (country, count)


countryContactCounts = (contactCounts
                        .map(lambda signCount: processSignCount(signCount, signPrefixes))
                        .reduceByKey((lambda x, y: x + y)))

countryContactCounts.saveAsTextFile(outputDir + "/countries.txt")

# Query 73s for the call signs CallLogs and parse the response
        L_high = tuple(L_high)
        R_low1 = re.findall(r'\d+', R_LOW_HN)
        R_low = list(map(int, R_low1))
        R_low = tuple(R_low)
        R_high1 = re.findall(r'\d+', R_HIGH_HN)
        R_high = list(map(int, R_high1))
        R_high = tuple(R_high)
        borocode = ('Unknown', 'NY', 'BX', 'K', 'Q', 'R')
        yield (int(PHYSICALID), (L_low, L_high), (R_low, R_high), ST_LABEL,
               borocode[int(BOROCODE)], FULL_STREE)


street_line = streets.mapPartitionsWithIndex(lines)
street_list = sc.broadcast(street_line.collect())


def findid(borough, street, h_num):
    dd = None
    for i in street_list.value:
        if (i[3] == street or i[5] == street) and (i[4] == borough) and (
                (h_num[-1] >= i[2][0][-1] and h_num[-1] <= i[2][1][-1]) or
                (h_num[-1] >= i[1][0][-1] and h_num[-1] <= i[1][1][-1])):
            dd = i[0]
            break
        else:
            dd = None
            break
    return dd


def extractScores(partId, records):
images_buf = images_read_rdd.map(images_to_bytes)
# images_part = images_buf.repartition(3000)

images_features = images_buf.flatMap(extract_opencv_features("sift"))
filtered_features = images_features.filter(lambda x: x[1] != None)
features_with_filenames = filtered_features.map(
    lambda x: (Row(fileName=x[0], features=x[1].tolist())))
features = features_with_filenames.flatMap(lambda x: x['features'])

mod = buildModel()
clusterCenters = mod.clusterCenters
clusterCenters = sc.broadcast(clusterCenters)

features_bow = features_with_filenames.map(
    functools.partial(assign_pooling,
                      clusterCenters=clusterCenters,
                      pooling='max'))

features_bow.coalesce(1, shuffle=True).saveAsTextFile(
    "hdfs://discus-p2irc-master:54310/tmp/output_image/")

processing_end_time = time() - processing_start_time
print "SUCCESS: Images processed in {} seconds".format(
    round(processing_end_time, 3))

sc.stop()
MaxWindowPrecMZ = max(np.array([x[1] for x in res])) + max(
    np.array([x[4] for x in res]))
MaxOffset = max(np.array([x[4] for x in res]))

SpectraLibrary = {
    k: SpectraLibrary[k]
    for k in SpectraLibrary
    if SpectraLibrary[k]['PrecursorMZ'] < MaxWindowPrecMZ
}

conf = (SparkConf().set("spark.driver.maxResultSize", "25g"))

sc = SparkContext(conf=conf, appName="Specter", pyFiles=['sparse_nnls.py'])

# Recast the library as a broadcast variable to improve performance
BroadcastLibrary = sc.broadcast(SpectraLibrary)

res = sc.parallelize(res, numPartitions)

output = res.mapPartitions(
    partial(RegressSpectraOntoLibrary,
            Library=BroadcastLibrary,
            tol=delta * 1e-6,
            maxWindowOffset=MaxOffset)).collect()

output = [[
    output[i][j][0], output[i][j][1], output[i][j][2], output[i][j][3],
    output[i][j][4], output[i][j][5]
] for i in range(len(output)) for j in range(len(output[i]))]

scPath = os.path.join(outputDir, baseName + '_SpecterCoeffs.csv')
radioStation = sys.argv[1]

file = open("output.txt", "w")
print(("Getting audience for themes aired on " + radioStation), file=file)
file.close()

# Obtain the titles emitted by the radio station indicated by argument
titles_radioStation_files = sc.textFile("file_cad*.txt").\
    map(split_file_cad).\
    filter(lambda keyValue: keyValue[1] == radioStation).\
    keys().\
    collect()

# Broadcast the lookup dictionary to the cluster
titles_radioStation_files_lookup = sc.broadcast(titles_radioStation_files)

# Obtain the total number of listeners to the titles emitted by the indicated radio station
titles_numListeners_files = sc.textFile("file_num*.txt").\
    map(split_file_num).\
    filter(lambda keyValue: keyValue[0] in titles_radioStation_files_lookup.value).\
    reduceByKey(add).\
    collect()

# Sort the output by titles
output = sorted(titles_numListeners_files)

# Save the output
file = open("output.txt", "a")
for o in output:
    print("%s: %d" % (o[0], o[1]), file=file)
def pack_by_strata(col_group, partition_iter):
    strata = collections.defaultdict(list)
    perm = range(num_workers)
    for _ in range(col_group):
        perm.insert(0, perm.pop())

    for entry in partition_iter:
        _, (u, m, _, _, _) = entry
        row_group = (u - 1) / blk_row_size
        strata[(perm[row_group], row_group, col_group)].append(entry[1])

    for item in strata.items():
        yield item


# add N_i, N_j for each rating entry
rating_per_user_b = sc.broadcast(rating_per_user)
rating_per_movie_b = sc.broadcast(rating_per_movie)

# map to: (<col-group>, (<u> <m> <r> <N_i> <N_j>))
ratings = ratings.map(lambda r: ((r[1] - 1) / blk_col_size,
                                 # value is a 5-element tuple
                                 (r[0], r[1], r[2],
                                  rating_per_user_b.value[r[0]],
                                  rating_per_movie_b.value[r[1]]))) \
    .partitionBy(num_workers) \
    .mapPartitionsWithIndex(pack_by_strata, preservesPartitioning=True) \
    .cache()


def calculate_loss(pred_rating, true_rating):
    error, n = 0.0, 0
    for _, entries in true_rating:
class SparkFEProcess:

    def __init__(self):
        self.parser = self.init_config()

        sparkConf = SparkConf().setAppName("feature engineering on spark of explore_spark_2") \
            .set("spark.ui.showConsoleProgress", "false")
        self.sc = SparkContext(conf=sparkConf)
        self.sc.broadcast(self.parser)
        self.init_logger()
        # # initialize related parameters
        # # bins_dict stores the binning scheme of the relevant columns, used when processing the test data
        # self.bins_dict = {}

    def init_config(self):
        current_path = os.path.dirname(os.path.realpath(__file__))
        workspace_path = current_path.split('featureEngineering')[0]
        config_file = workspace_path + 'resource/config.ini'
        parser = configparser.ConfigParser()
        parser.read(config_file)
        return parser

    def init_logger(self):
        '''
        Set the log level
        :param sc:
        :return:
        '''
        logger = self.sc._jvm.org.apache.log4j
        logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
        logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)
        logger.LogManager.getRootLogger().setLevel(logger.Level.ERROR)

    def read_rdd(self, fileName):
        try:
            file_path = self.parser.get("hdfs_path", "hdfs_data_path") + fileName
            data_rdd = self.sc.textFile(file_path)
            return data_rdd
        except Exception as e:
            print(e)

    def data_describe(self):
        sqlContext = SQLContext(self.sc)
        print('start to read data after explore_spark_step1_cross:')
        rootPath = self.parser.get("hdfs_path", "hdfs_data_path")
        print('start to read actLog_test_single_cross')
        test_file_path = rootPath + 'actLog_test_single_cross'
        actLog_test_rdd = self.sc.pickleFile(test_file_path)
        # check the labels against the data to make sure the schema fits
        labels = [
            ('duration_time', typ.IntegerType()),
            ('device', typ.IntegerType()),
            ('music_id', typ.IntegerType()),
            ('item_city', typ.IntegerType()),
            ('author_id', typ.IntegerType()),
            ('item_id', typ.IntegerType()),
            ('user_city', typ.IntegerType()),
            ('uid', typ.IntegerType()),
            ('channel', typ.IntegerType()),
            ('finish', typ.IntegerType()),
            ('like', typ.IntegerType()),
            ('time_day', typ.IntegerType()),
            ('item_pub_month', typ.IntegerType()),
            ('item_pub_day', typ.LongType()),
            ('item_pub_hour', typ.IntegerType()),
            ('item_pub_minute', typ.IntegerType()),
            ('uid_count_bin', typ.IntegerType()),
            ('user_city_count_bin', typ.IntegerType()),
            ('user_city_count_ratio', typ.DoubleType()),
            ('item_id_count_bin', typ.IntegerType()),
            ('item_id_count_ratio', typ.DoubleType()),
            ('author_id_count_bin', typ.IntegerType()),
            ('author_id_count_ratio', typ.DoubleType()),
            ('item_city_count_bin', typ.IntegerType()),
            ('item_city_count_ratio', typ.DoubleType()),
            ('music_id_count_bin', typ.IntegerType()),
            ('music_id_count_ratio', typ.DoubleType()),
            ('device_count_bin', typ.IntegerType()),
            ('device_count_ratio', typ.DoubleType()),
            ('uid_author_id_count_bin', typ.IntegerType()),
            ('uid_author_id_count_ratio', typ.DoubleType()),
            ('uid_item_city_count_bin', typ.IntegerType()),
            ('uid_item_city_count_ratio', typ.DoubleType()),
            ('uid_channel_count_bin', typ.IntegerType()),
            ('uid_channel_count_ratio', typ.DoubleType()),
            ('uid_music_id_count_bin', typ.IntegerType()),
            ('uid_music_id_count_ratio', typ.DoubleType()),
            ('uid_device_count_bin', typ.IntegerType()),
            ('uid_device_count_ratio', typ.DoubleType()),
            ('author_id_channel_count_bin', typ.IntegerType()),
            ('author_id_channel_count_ratio', typ.DoubleType()),
            ('author_id_user_city_count_bin', typ.IntegerType()),
            ('author_id_user_city_count_ratio', typ.DoubleType()),
            ('author_id_item_city_count_bin', typ.IntegerType()),
            ('author_id_item_city_count_ratio', typ.DoubleType()),
            ('author_id_music_id_count_bin', typ.IntegerType()),
            ('author_id_music_id_count_ratio', typ.DoubleType()),
            ('uid_channel_device_count_bin', typ.IntegerType()),    # changed to uid_channel_device
            ('uid_channel_device_count_ratio', typ.DoubleType()),   # changed to uid_channel_device
            ('author_id_item_city_music_id_count_bin', typ.IntegerType()),
            ('author_id_item_city_music_id_count_ratio', typ.DoubleType()),
        ]

        actionLogSchema = typ.StructType(
            [typ.StructField(e[0], e[1], True) for e in labels])

        df_actLog_test = sqlContext.createDataFrame(actLog_test_rdd, actionLogSchema)
        df_actLog_test.show(1, truncate=False)

        print('start to read actLog_train_single_cross')
        train_file_path = rootPath + 'actLog_train_single_cross'
        actLog_train_rdd = self.sc.pickleFile(train_file_path)
        df_actLog_train = sqlContext.createDataFrame(actLog_train_rdd, actionLogSchema)
        df_actLog_train.show(1, truncate=False)

        return df_actLog_train, df_actLog_test

    def data_explore(self, df_train, df_test):
        sqlContext = SQLContext(self.sc)

        print("discretize item_pub_hour")

        def hourBin(x):
            if x >= 23 or x <= 2:
                return 1
            elif 3 <= x < 8:
                return 2
            elif 8 <= x < 12:
                return 3
            else:
                return 4

        converHourBin = udf(lambda x: hourBin(x), typ.IntegerType())
        df_train = df_train.withColumn("item_pub_hour", converHourBin(df_train.item_pub_hour))
        df_test = df_test.withColumn("item_pub_hour", converHourBin(df_test.item_pub_hour))

        print("--------1. positive/negative sample count features for combinations of uid, author_id, music_id, etc.--------")
        print("positive/negative sample counts for the cross features")
        posneg_feats_list = []
        # posneg_feats_list.append(["duration_time"])
        # posneg_feats_list.append(["time_day"])
        print('cross count')
        users = ['uid']
        authors = ['author_id', 'item_city', 'channel', 'music_id']  # ,'item_pub_hour'
        posneg_feats_list.extend([[u_col, a_col] for u_col in users for a_col in authors])

        # posneg_feats_list.append(['uid','author_id', 'channel'])
        # posneg_feats_list.append(['uid', 'author_id', 'music_id'])
        # posneg_feats_list.append(['uid','author_id', 'channel','time_day'])
        # posneg_feats_list.append(['uid', 'author_id', 'music_id','time_day'])
        print("compute the positive/negative sample ratio for the following cross features")
        # there are 2-, 3- and 4-way cross features
        print(posneg_feats_list)

        for i in range(len(posneg_feats_list)):
            group_cols = posneg_feats_list[i]
            new_feature = '_'.join(group_cols)
            # Compute the positive/negative sample ratio on df_train; for df_test simply join,
            # filling nulls with 0 or the mean.
            # Fields that decide positive/negative: like, finish.
            # Step 1: concatenate the columns.
            print(new_feature)
            if len(group_cols) == 2:
                print("start processing 2-way cross variables")
                df_train = df_train.withColumn(
                    new_feature,
                    fn.concat_ws(
                        '_', df_train[group_cols[0]].cast(typ.StringType()),
                        df_train[group_cols[1]].cast(typ.StringType())))
                df_test = df_test.withColumn(
                    new_feature,
                    fn.concat_ws(
                        '_', df_test[group_cols[0]].cast(typ.StringType()),
                        df_test[group_cols[1]].cast(typ.StringType())))

            if len(group_cols) == 3:
                print("start processing 3-way cross variables")
                df_train = df_train.withColumn(
                    new_feature,
                    fn.concat_ws(
                        '_', df_train[group_cols[0]].cast(typ.StringType()),
                        df_train[group_cols[1]].cast(typ.StringType()),
                        df_train[group_cols[2]].cast(typ.StringType())))
                df_test = df_test.withColumn(
                    new_feature,
                    fn.concat_ws(
                        '_', df_test[group_cols[0]].cast(typ.StringType()),
                        df_test[group_cols[1]].cast(typ.StringType()),
                        df_test[group_cols[2]].cast(typ.StringType())))

            # if len(group_cols) == 4:
            #     print("start processing 4-way cross variables")
            #     df_train = df_train.withColumn(new_feature, fn.concat_ws('_', df_train[group_cols[0]].cast(typ.StringType()), df_train[group_cols[1]].cast(typ.StringType()),
            #                                    df_train[group_cols[2]].cast(typ.StringType()), df_train[group_cols[3]].cast(typ.StringType())))
            #     df_test = df_test.withColumn(new_feature, fn.concat_ws('_', df_test[group_cols[0]].cast(typ.StringType()), df_test[group_cols[1]].cast(typ.StringType()),
            #                                    df_test[group_cols[2]].cast(typ.StringType()), df_test[group_cols[3]].cast(typ.StringType())))

            for target in ["like", "finish"]:
                df3 = df_train.select(
                    new_feature,
                    target).groupby(new_feature).count().withColumnRenamed(
                        'count', new_feature + '_count')
                df4 = df_train.select(
                    new_feature, target).where(df_train[target] == 1).groupby(
                        new_feature).count().withColumnRenamed(
                            'count', new_feature + "_count_" + target + "_1")
                df3 = df3.join(df4, new_feature, 'left').na.fill(0)
                del df4
                gc.collect()
                # print("divide the two columns to get the positive-sample ratio", target)
                df3 = df3.withColumn(
                    new_feature + "_" + target + "_pos_neg",
                    fn.col(new_feature + "_count_" + target + "_1") /
                    fn.col(new_feature + '_count'))
                df3 = df3.drop(new_feature + "_count_" + target + "_1",
                               new_feature + '_count')
                print("new df_train", new_feature, target)
                df_train = df_train.join(df3, new_feature, "left")
                df_train.show(1)
                df_test = df_test.join(df3, new_feature, "left")
                # nulls can appear; set missing values to 0
                print("new df_test", new_feature, target)
                df_test.show(1)
                df_test = df_test.na.fill(0)
                del df3
                gc.collect()

            if new_feature not in ["duration_time", "time_day"]:
                df_train = df_train.drop(new_feature)
                df_test = df_test.drop(new_feature)

        df_train.printSchema()
        df_test.printSchema()

        print('final schema; this schema is used as input to the concat step')
        # should there also be build_data_for_like / build_data_for_finish?
        df_train.printSchema()
        df_test.printSchema()

        print("check missing values in test")
        df_test.agg(*[(1 - (fn.count(c) / fn.count('*'))).alias(c + '_missing')
                      for c in posneg_feats_list]).show()
        print("check missing values in train")
        df_train.agg(*[(1 - (fn.count(c) / fn.count('*'))).alias(c + '_missing')
                       for c in posneg_feats_list]).show()

        print('-------5. save the preprocessing results-------')
        test_file_path = self.parser.get(
            "hdfs_path", "hdfs_data_path") + 'actLog_test_step2'
        os.system("hadoop fs -rm -r {}".format(test_file_path))
        df_test.rdd.map(tuple).saveAsPickleFile(test_file_path)
        del df_test
        gc.collect()

        train_file_path = self.parser.get(
            "hdfs_path", "hdfs_data_path") + 'actLog_train_step2'
        os.system("hadoop fs -rm -r {}".format(train_file_path))
        # os.system(command): command is the shell command to execute
        df_train.rdd.map(tuple).saveAsPickleFile(train_file_path)
#             mywriter.writerow(row)

# os.environ["SPARK_HOME"] = "/apps/spark/spark-1.4.1-bin-hadoop2.6/"

conf = SparkConf().setAppName("Spark Test").setMaster("spark://spnode01:7077")
sc = SparkContext(conf=conf)

features, labels = loadTrainSet("hdfs://spnode01:9000/kaggle/DigitRecognizer/train.csv", sc)

m = features.count()
k = 5

features = features.collect()
labels = labels.collect()

featuresBC = sc.broadcast(features)
labelsBC = sc.broadcast(labels)

testDatas = loadTestSet("hdfs://spnode01:9000/kaggle/DigitRecognizer/test.csv", sc)
testDatas.cache()

# kNN: for each test sample, compute distances to all broadcast training samples,
# take the k nearest labels and pick the most frequent one
result = testDatas.map(lambda line: ((((np.tile(line, (m, 1)) - featuresBC.value) ** 2).sum(axis=1)) ** 0.5).argsort()) \
    .map(lambda line: [line[i] for i in range(k)]) \
    .map(lambda line: map(lambda x: labelsBC.value[x], line)) \
    .map(lambda line: {key: line.count(key) for key in set(line)}) \
    .map(lambda line: sorted(line.iteritems(), key=operator.itemgetter(1), reverse=True)[0][0])

# result = result.collect()
# generateResultFile('/home/hadoop/workdatas/kaggle/DigitRecognizer/result_spark.csv', result)

result.repartition(1).saveAsTextFile("hdfs://spnode01:9000/kaggle/DigitRecognizer/result.spark")

sc.stop()
def launch_spark_job():
    from pyspark import SparkContext, SparkConf
    from pyspark.sql import SQLContext
    from pyspark.sql.functions import concat, col, lit

    readFile = sys.argv[1]
    k = int(sys.argv[2])
    num_partitions = int(sys.argv[3])

    conf = SparkConf().setAppName("reads Loader" + str(num_partitions))
    sc = SparkContext(conf=conf)
    sc.addPyFile("utils.py")
    sc.setCheckpointDir(
        "hdfs://doop-mng1.haifa.ibm.com:8020/projects/Store_Analytics/SparkCheckPoints"
    )
    import utils
    # from utils import map_read_to_anchors_list, convert_anchors_list_to_seq_edges

    readLines = (
        sc.newAPIHadoopFile(
            readFile,
            'org.apache.hadoop.mapreduce.lib.input.TextInputFormat',
            'org.apache.hadoop.io.LongWritable',
            'org.apache.hadoop.io.Text',
            conf={'textinputformat.record.delimiter': '@'})
        .map(lambda delim_lines_tup: delim_lines_tup[1])  # keeps just the lines and not the @ delimiter
        .filter(lambda x: x.startswith("SRR"))  # gets rid of entries due to '@' appearing in the wrong line
        .map(lambda x: x.split("\n")[:2])  # splits the lines, keeps only the first two
        .filter(lambda x: len(x) == 2)  # gets rid of any cut-off records
        .repartition(num_partitions)
        # .cache()
    )

    print("----------------------there are %i reads" % (readLines.count()))

    # get new RDD including lists of kmers (with no Ns), (k+1)mers
    kmers = (readLines.map(lambda entry: entry[1]).flatMap(
        lambda read: getKmerToNextCharCounts(read, k)))

    print("----------------------there are %i kmers instances" % (kmers.count()))

    kmers_with_exts = (kmers.reduceByKey(func=lambda x, y: x + y))

    print("----------------------there are %i distinct kmers" % (kmers_with_exts.count()))

    junctions = kmers_with_exts.filter(lambda kmer_tup: my_filter(kmer_tup))

    print("----------------------there are %i junctions" % junctions.count())
    # for i in junctions.take(10):
    #     if sum(i[1]) > 1:
    #         print i

    generate_juncs = build_partial_junctions_set()
    junctions_set_rdd = (junctions.mapPartitions(generate_juncs).reduceByKey(
        merge_sets).collect())

    juncs_broadcast = sc.broadcast(junctions_set_rdd[0][1])
    print("----------------------there are %i junctions in broadcast" %
          len(juncs_broadcast.value))

    # build edge set rdd, filter out edges including a junction at some end
    def read_line_map_function(read_line):
        return utils.map_read_to_anchors_list(read_line[1], k - 10, 10,
                                              juncs_broadcast.value)

    edges_rdd = (readLines.map(
        lambda read_line: read_line_map_function(read_line)).flatMap(
            lambda anchors: utils.convert_anchors_list_to_seq_edges(anchors),
            preservesPartitioning=True).filter(
                lambda (a, b, c): a not in juncs_broadcast.value and
                b not in juncs_broadcast.value))

    print("----------------------there are %i total edges" % edges_rdd.count())

    # create SQLContext to be able to create dataFrame from rdd
    sqc = SQLContext(sc)
    edges_df = sqc.createDataFrame(edges_rdd, ["src", "dst", "overlap"])
    vertices_df = edges_df.select(
        concat(col("src"), lit(" "), col("dst")).alias('id')).dropDuplicates()
    g = GraphFrame(vertices_df, edges_df)
    # vertices_df.agg(*[count(c).alias(c) for c in vertices_df.columns]).show()

    print("----------------------there are %i total vertices" % vertices_df.count())

    # get connected components of remaining graph
    result = g.connectedComponents()
    result.select("id", "component").orderBy("component").show()
from pyspark import SparkConf, SparkContext


def loadMovieNames():
    movieNames = {}
    with open("ml-1m/movies.dat") as f:
        for line in f:
            fields = line.split('::')
            movieNames[int(fields[0])] = fields[1]
    return movieNames


conf = SparkConf().setMaster("local").setAppName("PopularMovies")
sc = SparkContext(conf=conf)

nameDict = sc.broadcast(loadMovieNames())

lines = sc.textFile("ml-1m/ratings.dat")
movies = lines.map(lambda x: (int(x.split("::")[1]), 1))
movieCounts = movies.reduceByKey(lambda accum, current: accum + current)

flipped = movieCounts.map(lambda (movieId, count): (count, movieId))
sortedMovies = flipped.sortByKey()

sortedMoviesWithNames = sortedMovies.map(lambda (count, movieId): (nameDict.value[movieId], count))

results = sortedMoviesWithNames.collect()

for result in results:
    print(result)
                           })

# reason: 1497 rows
df2 = sqlContext.read.jdbc(url='jdbc:mysql://cdh5-slave2:3306/laws_doc',
                           table='(select id,name,uid from reason ) tmp2',
                           column='id', lowerBound=1, upperBound=1500, numPartitions=1,
                           properties={
                               "user": "******",
                               "password": "******"
                           })

# acc = sc.accumulator(0)
# print "df.count()======================" + str(df.count())
reason_broadcast = sc.broadcast(df2.map(lambda x: (x[1], x[2])).collect())

uuid_reason = df.map(lambda x: x).map(lambda x: get_reason(x))  # title_trial_process
# (x[1], ("||".join(list(set(name))), reason_uids, casedate, plt_claim, dft_rep, crs_exm))
# print "uuid_reason.count()======================" + str(uuid_reason.count())
# uuid_reason.foreach(p)
# print "uuid_reason==============" + str(uuid_reason.count())

uuid_court = df.map(lambda x: (x[2], x[1]))  # court, uuid
# print "uuid_court==============" + str(uuid_court.count())
court_province_full_uid = df1.map(lambda x: (x[1], (x[2], x[3])))  # court, province, full_uid
# uuid_court may contain courts that are missing from the court table (it is incomplete),
# so the join below can yield fewer uuid records.
uuid_province_full_uid = uuid_court.join(court_province_full_uid).map(lambda x: x[1])
# .map(lambda x: (x[0], x[1][0], x[1][1]))  # uuid, province, full_uid
#     return "\001".join([str(valid_jsontxt(i)) for i in result])


def quchong(x, y):
    # deduplicate: keep the record whose last field is largest
    max = 0
    item_list = y
    for ln in item_list:
        if int(ln[-1]) > max:
            max = int(ln[-1])
            y = ln
    result = y
    lv = []
    for ln in result:
        lv.append(str(valid_jsontxt(ln)))
    return "\001".join(lv)


s1 = "/commit/iteminfo/20161110"
s2 = "/commit/iteminfo/20161111"
s3 = "/commit/iteminfo/20161112"
rdd1 = sc.textFile(s1)
rdd2 = sc.textFile(s2)
rdd3 = sc.textFile(s3)
rdd = rdd1.union(rdd2).union(rdd3)

c_dim = "/hive/warehouse/wlbase_dev.db/t_base_ec_dim/ds=20151023/1073988839"
cate_dict = sc.broadcast(sc.textFile(c_dim).map(lambda x: get_cate_dict(x)).filter(lambda x: x != None).collectAsMap()).value

rdd_c = rdd.map(lambda x: f(x, cate_dict)).filter(lambda x: x != None)
rdd_c.groupByKey().mapValues(list).map(lambda (x, y): quchong(x, y))\
    .saveAsTextFile("/user/wrt/temp/shuang11_iteminfo")

# hfs -rmr /user/wrt/temp/shuang11_iteminfo
# spark-submit --executor-memory 6G --driver-memory 8G --total-executor-cores 80 shuang11_item_info.py
# LOAD DATA INPATH '/user/wrt/temp/shuang11_iteminfo' OVERWRITE INTO TABLE wlservice.t_wrt_tmp_shuang11_iteminfo_new;
def main():
    sc = SparkContext(SPARK_ADDRESS, appName="RedditBatchLayer")
    #sc = SparkContext("local[*]", appName="RedditBatchLayer")
    bcURL = sc.broadcast(urlTitlePool)
    sqlContext = SQLContext(sc)

    conn = S3Connection(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
    #conn = boto.connect_s3(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
    bucket = conn.get_bucket(RAW_JSON_REDDIT_BUCKET)

    def addTitleURL(cmtTuple):
        # 150,000 / 3000 = avg 50 comments/topic
        onePst = bcURL.value[randint(0, 3000)]
        return cmtTuple + (onePst[0], onePst[1])  # adding title and url

    if (smallBatch):
        logFile = 's3a://reddit-comments/2007/RC_2007-10'
        #df = sqlContext.read.json(logFile)
        df = sqlContext.jsonFile(logFile)
        users_rdd = df.filter(df['author'] != '[deleted]')
        year = 2007
        month = 12
        users_row = users_rdd.map(lambda json: (json.author,
                                                '{0}_{1}'.format(year, month),
                                                json.created_utc,
                                                json.subreddit,
                                                json.id,
                                                json.body,
                                                json.score,
                                                json.ups,
                                                json.controversiality))\
                             .map(addTitleURL)
                             #.repartition(REPARTITION_SIZE)
        users_row.foreachPartition(insert_into_cassandra)

        # calculate user relationship graph from (URL, user) tuples:
        #   self-join to find user relationships by post,
        #   remove all self-linked relationships,
        #   put each pair in ascending order by user name,
        #   remove duplicated user pairs (the relationship is mutual),
        #   count the number of common posts per edge,
        #   then flatten, ready to write to the table
        post2user = users_row.map(lambda x: (x[10], x[0]))
        graph = post2user.join(post2user)\
                         .filter(lambda x: x[1][0] != x[1][1])\
                         .map(makeAscOrder)\
                         .distinct()\
                         .map(lambda x: (x[1], 1))\
                         .reduceByKey(lambda x, y: x+y)\
                         .map(lambda x: (x[0][0], x[1], x[0][1]))
        graph.foreachPartition(insert_graph)
    else:
        for key in bucket.list():
            if '-' not in key.name.encode('utf-8'):  # filter out folders and _SUCCESS
                continue
            logFile = 's3a://{0}/{1}'.format(RAW_JSON_REDDIT_BUCKET, key.name.encode('utf-8'))
            year = logFile.split('-')[1][-4:]
            month = logFile.split('-')[2]
            from_year = FROM_YEAR_MONTH.split('_')[0]
            from_month = FROM_YEAR_MONTH.split('_')[1]
            if int(year) < int(from_year) or (int(year) == int(from_year) and int(month) < int(from_month)):
                continue
            #df = sqlContext.read.json(logFile)
            df = sqlContext.jsonFile(logFile)
            users_rdd = df.filter(df['author'] != '[deleted]')
            # fields: 0..8 from the comment JSON, 9 = title, 10 = url (added by addTitleURL)
            users_row = users_rdd.map(lambda json: (json.author,
                                                    '{0}_{1}'.format(year, month),
                                                    json.created_utc,
                                                    json.subreddit,
                                                    json.id,
                                                    json.body,
                                                    json.score,
                                                    json.ups,
                                                    json.controversiality))\
                                 .map(addTitleURL)
                                 #.repartition(REPARTITION_SIZE)
            users_row.foreachPartition(insert_into_cassandra)

            # calculate user relationship graph: same pipeline as the small-batch branch above
            post2user = users_row.map(lambda x: (x[10], x[0]))  # (URL, user) tuple
            graph = post2user.join(post2user)\
                             .filter(lambda x: x[1][0] != x[1][1])\
                             .map(makeAscOrder)\
                             .distinct()\
                             .map(lambda x: (x[1], 1))\
                             .reduceByKey(lambda x, y: x+y)\
                             .map(lambda x: (x[0][0], x[1], x[0][1]))
                             #.repartition(REPARTITION_SIZE)
            graph.foreachPartition(insert_graph)

    sc.stop()
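# Hedged sketch of the self-join pattern described in the comments above:
# build an undirected "co-comment" edge list with the number of shared posts
# per user pair. Toy data; assumes an active SparkContext `sc`.
pairs = sc.parallelize([("post1", "alice"), ("post1", "bob"), ("post2", "alice")])
co_graph = (pairs.join(pairs)                              # (post, (userA, userB))
            .filter(lambda x: x[1][0] != x[1][1])          # drop self pairs
            .map(lambda x: (x[0], tuple(sorted(x[1]))))    # ascending user order
            .distinct()                                    # one record per (post, pair)
            .map(lambda x: (x[1], 1))
            .reduceByKey(lambda a, b: a + b))              # shared-post count per pair
print(co_graph.collect())                                  # [(('alice', 'bob'), 1)]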
import sys
from decimal import Decimal

from pyspark import SparkContext


def matrix_vector_mult(tuple):
    return (tuple[0], round((V.value[tuple[1] - 1] * tuple[2]) * 0.8, 15))


if __name__ == "__main__":
    if len(sys.argv) != 4:
        print("Usage: modified_pagerank.py inputfile outputpath", file=sys.stderr)
        exit(-1)
    sc = SparkContext(appName="Pagerank")
    graph_rdd = sc.textFile(sys.argv[1]).repartition(10).cache()
    outlink_rdd = graph_rdd.map(lambda x: (int(x.split("\t")[0]), [int(x.split("\t")[1])])) \
                           .reduceByKey(lambda x, y: x + y).cache()
    total_nodes = sc.broadcast(outlink_rdd.count())
    M = outlink_rdd.flatMap(weight_matrix).cache()

    # Modified pagerank
    local_v = []
    for x in range(total_nodes.value):
        local_v.append(round(Decimal(1) / Decimal(total_nodes.value), 15))
    V = sc.broadcast(local_v)

    local_e = []
    for x in range(total_nodes.value):
#TODO run this in jupyter notebook
from pyspark import SparkContext

sc = SparkContext('local[*]', 'pyspark')

my_dict = {"item1": 1, "item2": 2, "item3": 3, "item4": 4}
my_list = ["item1", "item2", "item3", "item4"]

my_dict_bc = sc.broadcast(my_dict)


def my_func(letter):
    return my_dict_bc.value[letter]


my_list_rdd = sc.parallelize(my_list)
result = my_list_rdd.map(lambda x: my_func(x)).collect()
print(result)
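# Optional cleanup sketch: once the lookups are done, the broadcast can be
# released from the executors with unpersist(); destroy() additionally drops
# it on the driver, after which it can no longer be used.
my_dict_bc.unpersist()
# my_dict_bc.destroy()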
from pyspark import SparkConf, SparkContext


def loadMovieNames() -> dict:
    movieNames = {}
    # Movie titles include Swedish characters which require ISO-8859-1 encoding
    with open(
            "/home/mmanopoli/Udemy/TamingBigDataWithSparkAndPython/data/ml-100k/u.item",
            encoding='iso-8859-1') as f:
        for line in f:
            fields = line.split('|')
            movieNames[int(fields[0])] = fields[1]
    return movieNames


conf = SparkConf().setMaster("local[4]").setAppName("PopularMovies")
sc = SparkContext(conf=conf)

# Broadcast the Python movieNames object to each executor as nameDict
nameDict = sc.broadcast(loadMovieNames())

lines = sc.textFile(
    "/home/mmanopoli/Udemy/TamingBigDataWithSparkAndPython/data/ml-100k/u.data")
movies = lines.map(lambda x: (int(x.split()[1]), 1))
movieCounts = movies.reduceByKey(lambda x, y: x + y)

# flipped = movieCounts.map(lambda x: (x[1], x[0]))
# sortedMovies = flipped.sortByKey()
sortedMovies = movieCounts.sortBy(lambda x: x[1])

# countMovie[0] is the movie ID because sortBy keeps (movieId, count) pairs - that's what we look up in nameDict
#sortedMoviesWithNames = sortedMovies.map(lambda countMovie : (nameDict.value[countMovie[1]], countMovie[0]))
sortedMoviesWithNames = sortedMovies.map(
    (user1,user2) -> (similarity,co_raters_count)
    '''
    user_sims = pairwise_users.map(
        lambda p: calcSim(p[0], p[1])).map(
        lambda p: keyOnFirstUser(p[0], p[1])).groupByKey().map(
        lambda p: nearestNeighbors(p[0], p[1], 50))

    '''
    Collect each user's rating history in the form:
        user_id -> [(item_id_1, rating_1), (item_id_2, rating_2), ...]
    '''
    user_item_hist = lines.map(parseVectorOnUser).groupByKey().collect()

    ui_dict = {}
    for (user, items) in user_item_hist:
        ui_dict[user] = items

    uib = sc.broadcast(ui_dict)

    '''
    Compute the top-N recommendations for each user:
        user_id -> [item1, item2, item3, ...]
    '''
    user_item_recs = user_sims.map(
        lambda p: topNRecommendations(p[0], p[1], uib.value, 100)).collect()
from __future__ import print_function

from pyspark import SparkConf
from pyspark import SparkContext

sparkconfig = SparkConf()
sparkconfig.setMaster("local[*]")
sparkconfig.setAppName("SparkCSVJOB")


def compute_each_line(eachLine):
    # Fetching the broadcast
    date_code = date_code_broadcast.value
    data_split = eachLine.split(",")
    if data_split[0] in date_code:
        print(eachLine)
    return


sparkcontext = SparkContext(conf=sparkconfig)
date_code_broadcast = sparkcontext.broadcast(["20170104", "20170102"])
textFileRDD = sparkcontext.textFile(
    "/home/dharshekthvel/Downloads/query_result.csv")
textFileRDD.map(compute_each_line).collect()
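# Note (sketch): compute_each_line above prints matches on the executors and
# returns None, so the collect() yields a list of Nones. To bring the matching
# rows back to the driver, the same broadcast can drive a filter instead
# (same variables as the snippet above):
matching_rows = (textFileRDD
                 .filter(lambda line: line.split(",")[0] in date_code_broadcast.value)
                 .collect())
for row in matching_rows:
    print(row)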
    sc.stop()


if __name__ == '__main__':
    main()

import sys
sys.path.append('/usr/local/lib/python2.7/site-packages')
sys.path.append('/home/hadoop/app/spark/python')
sys.path.append('/home/hadoop/app/spark/python/lib/py4j-0.8.2.1-src.zip')

from pyspark import SparkContext, SparkConf
from mysql_utils import MySQLUtils

master = 'local[2]'
app_name = 'test-broadcast'
# spark_home = '/data01/app/bigdata/spark'  # local
spark_home = '/home/hadoop/app/spark'  # test

pyFiles = ['mysql_utils.py']

spark_conf = SparkConf()
spark_conf.setMaster(master).setAppName(app_name).setSparkHome(spark_home)
sc = SparkContext(conf=spark_conf)
for path in (pyFiles or []):
    sc.addPyFile(path)

external_cache = get_api_deviceinfo()
deviceinfo_b = sc.broadcast(external_cache)
        scheduled_departure_time=t[1].scheduled_departure_time,
        actual_departure_time=t[1].actual_departure_time,
        departure_delay_minutes=t[1].departure_delay_minutes,
        scheduled_arrival_time=t[1].scheduled_arrival_time,
        actual_arrival_time=t[1].actual_arrival_time,
        arrival_delay_minutes=t[1].arrival_delay_minutes,
        crs_elapsed_flight_minutes=t[1].crs_elapsed_flight_minutes,
        distance=t[1].distance)


if __name__ == "__main__":
    sc = SparkContext(appName="InsightEdge Python API Demo: prediction job")
    ssc = StreamingContext(sc, 3)
    sqlc = SQLContext(sc)

    zkQuorum = "localhost:2181"
    topic = "flights"

    model = DecisionTreeModel(Utils.load_model_from_grid("DecisionTreeFlightModel", sc))

    carrier_mapping = sc.broadcast(load_mapping("CarrierMap", sqlc))
    origin_mapping = sc.broadcast(load_mapping("OriginMap", sqlc))
    destination_mapping = sc.broadcast(load_mapping("DestinationMap", sqlc))

    kvs = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming-consumer", {topic: 1})
    lines = kvs.map(lambda x: x[1])
    lines.foreachRDD(predict_and_save)

    ssc.start()
    ssc.awaitTermination()
#map data to a binary matrix
#1. get the dictionary of the data
#The dictionary of each document is a list of UNIQUE (set) words
lists = dataRDD.map(lambda x: list(set(x.strip().split(' ')))).collect()
all = []
#combine all dictionaries together (fastest solution for Python)
for l in lists:
    all.extend(l)
dict = set(all)
print len(dict)

#it is faster to know the position of the word if we put it as values in a dictionary
dictionary = {}
for i, word in enumerate(dict):
    dictionary[word] = i
#we need the dictionary to be available AS A WHOLE throughout the cluster
dict_broad = sc.broadcast(dictionary)

#build labelled points from the data
data_class = zip(data, Y)  # if a=[1,2,3] & b=['a','b','c'] then zip(a,b)=[(1,'a'), (2,'b'), (3,'c')]
dcRDD = sc.parallelize(data_class, numSlices=16)
#get the labelled points
#NOTE: passing dict_broad.value here resolves the broadcast on the driver, so the
#dictionary still travels inside every task closure; passing dict_broad and calling
#.value inside createBinaryLabeledPoint would keep the benefit of the broadcast.
labeledRDD = dcRDD.map(partial(createBinaryLabeledPoint, dictionary=dict_broad.value))
#Train NaiveBayes
model = NaiveBayes.train(labeledRDD)
#broadcast the model
mb = sc.broadcast(model)

test, names = lf.loadUknown('./data/test')
name_text = zip(names, test)
#for each doc (name, text):
#  apply the model on the vector representation of the text
#  return the name and the class
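# Hedged sketch of using the broadcast model mb on the executors: turn each test
# document into a binary vector over the broadcast dictionary and classify it.
# `to_binary_vector` is a hypothetical helper standing in for whatever vectorization
# createBinaryLabeledPoint performs above; same variables as the snippet above.
from pyspark.mllib.linalg import SparseVector

def to_binary_vector(text, dictionary):
    # 1.0 for every dictionary word present in the text (illustrative only)
    idx = sorted({dictionary[w] for w in text.strip().split(' ') if w in dictionary})
    return SparseVector(len(dictionary), idx, [1.0] * len(idx))

predictions = (sc.parallelize(name_text, numSlices=16)
               .map(lambda nt: (nt[0],
                                mb.value.predict(to_binary_vector(nt[1], dict_broad.value)))))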
from pyspark import SparkContext, SparkConf

sc = SparkContext(conf=SparkConf().setAppName("Airlines App"))

mainRdd = sc.textFile("airports_mod.dat")

l = ['Airport_Id', 'Name', 'City', 'Country', 'IATA', 'ICAO', 'Latitude',
     'Longitude', 'Altitude', 'Timezone', 'DST', 'Tz']
l = sc.broadcast(l)


def stringtodict(s):
    i = 0
    d = {}
    k = s.split(',')
    for key in l.value:
        d[key] = k[i]
        i += 1
    return d


mainRddDict = mainRdd.map(stringtodict)
mainRddDict.saveAsPickleFile("airports_mod.pickle")
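# Equivalent mapper sketch: the per-row loop above can also be written with zip
# against the broadcast header list (same variables as above; zip simply pairs
# each header with the corresponding field):
def string_to_dict_zip(s):
    return dict(zip(l.value, s.split(',')))

# mainRddDict = mainRdd.map(string_to_dict_zip)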
sc = SparkContext(appName="SparkLda")
text = sc.textFile(sys.argv[1]).repartition(200)
print "caching file ..."
text.cache()
print "counting file ..."
NumDoc = text.count()
likelihood, likelihood_old = 0, 0
print "initializing beta ... "
beta = rand_init_beta(NumTerm, K)
print "initializing beta success! "

# EM iteration for beta
for i in range(20):
    print "starting iteration {0} ...".format(i)
    print sys.getsizeof(beta)
    beta_global = sc.broadcast(beta)
    print "broadcast success"
    # NOTE: beta_global is broadcast but not read by the active lambda below; only the
    # commented-out variant (which passes beta explicitly) would consume the per-iteration beta.
    #new_beta = text.flatMap(lambda line, beta=beta: Expectation(line, beta, Alpha, K)).reduceByKey(add)
    new_beta = text.flatMap(lambda line: Expectation(line, Alpha, K)).reduceByKey(add)
    #output = new_beta.collect()
    output = new_beta.count()  # only a count here; the commented-out collect() is presumably what update_beta needs
    print "output", output
    #print >> open('beta.'+str(i), 'w'), beta
    (beta, likelihood) = update_beta(output, K, NumTerm)
    #print beta
    print "likelihood {0} is {1}".format(i, likelihood)

print >> open('beta.final', 'w'), beta
    for v in vs:
        tn = v[0]
        v3 = v[1][1]
        if tn == 'e1':
            t1.append(v[1][0])
        else:
            for v1 in t1:
                if v1 < v3:
                    if lu.get((v3, v1), False):
                        count += 1
    return count


if __name__ == '__main__':
    fn = sys.argv[1]        # filename of input
    p = int(sys.argv[2])    # parallelism
    sc = SparkContext(master="local[{}]".format(p), appName="Triangle Count")
    text_file = sc.textFile(fn).filter(maxFilter)
    lookup = sc.broadcast(text_file.flatMap(toLU).collectAsMap())
    count = text_file.flatMap(mymap) \
        .groupByKey(p) \
        .mapValues(lambda vs: sorted(vs, key=lambda x: x[0])) \
        .map(lambda x: checkTriangles(lookup, x[1])) \
        .reduce(lambda a, b: a + b)
    print(count)
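# Hedged standalone sketch of the broadcast edge-lookup idea used above:
# collectAsMap() pulls a {(u, v): True} edge map to the driver, sc.broadcast
# ships it once per executor, and tasks then test edge existence locally.
# Toy data; assumes an active SparkContext `sc`.
edge_map = sc.broadcast({(2, 1): True, (3, 1): True, (3, 2): True})

def closes_triangle(u, v, w):
    # the triangle (u, v, w) exists if all three (higher, lower) edges are present
    return all(edge_map.value.get(e, False) for e in [(v, u), (w, u), (w, v)])

print(closes_triangle(1, 2, 3))   # True for the toy edge map above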
starpairs = data.map(extract_user_repo)
starpairs.cache()

users = starpairs.map(lambda t: t[0]).distinct()

# get the top 1% most-starred repos (the sample fraction below is 0.01)
repos = starpairs.map(lambda t: t[1]).distinct()
sample = int(0.01 * repos.count())
top_repos = starpairs\
    .groupBy(lambda t: t[1])\
    .sortBy(lambda t: len(t[1]), False)\
    .map(lambda t: t[0])\
    .take(sample)
top_repos_rdd = sc.parallelize(top_repos)
top_repos_rdd.cache()
top_repos_bc = sc.broadcast(top_repos)
pprint(top_repos[:5])

starpairs_filtered = starpairs.filter(lambda t: t[1] in top_repos_bc.value)
starpairs_filtered.cache()

# train a recommendation model using alternating least squares
stars_with_rating = starpairs_filtered.map(lambda t: array([t[0], t[1], 1]))
model = ALS.trainImplicit(stars_with_rating, rank=1)

# get all user->repo pairs without stars
users_repos = users.cartesian(top_repos_rdd).groupByKey()
stars_grouped = starpairs_filtered.groupByKey()
unstarred = users_repos.join(stars_grouped)\
    .map(lambda i: (i[0], set(i[1][0]) - set(i[1][1])))\
    .flatMap(lambda i: [(i[0], repo) for repo in i[1]])
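# Hedged note: `t[1] in top_repos_bc.value` scans a Python list for every record.
# Broadcasting a set instead makes the membership test O(1) per element while
# leaving the rest of the pipeline unchanged (same variables as above):
top_repos_set_bc = sc.broadcast(set(top_repos))
starpairs_filtered_fast = starpairs.filter(lambda t: t[1] in top_repos_set_bc.value)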
class SparkFEProcess:

    def __init__(self):
        self.parser = self.init_config()

        sparkConf = SparkConf().setAppName("feature engineering on spark of explore_spark") \
            .set("spark.ui.showConsoleProgress", "false")
        self.sc = SparkContext(conf=sparkConf)
        # NOTE: the returned Broadcast handle is not kept, so this broadcast is never read
        self.sc.broadcast(self.parser)
        self.init_logger()
        # # initialize related parameters
        # # bins_dict stores the binning scheme of the relevant columns, reused when processing the test data
        # self.bins_dict = {}

    def init_config(self):
        current_path = os.path.dirname(os.path.realpath(__file__))
        workspace_path = current_path.split('featureEngineering')[0]
        config_file = workspace_path + 'resource/config.ini'
        parser = configparser.ConfigParser()
        parser.read(config_file)
        return parser

    def init_logger(self):
        '''
        Set the log level.
        :param sc:
        :return:
        '''
        logger = self.sc._jvm.org.apache.log4j
        logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
        logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)
        logger.LogManager.getRootLogger().setLevel(logger.Level.ERROR)

    def read_rdd(self, fileName):
        try:
            file_path = self.parser.get("hdfs_path", "hdfs_data_path") + fileName
            data_rdd = self.sc.textFile(file_path)
            return data_rdd
        except Exception as e:
            print(e)

    def data_describe(self):
        print('start to read data for rdd:')
        rawRdd_train = self.read_rdd('final_track2_train.txt').map(lambda line: line.split('\t'))
        rawRdd_test = self.read_rdd('final_track2_test_no_anwser.txt').map(lambda line: line.split('\t'))
        print('finish reading rdd, start to init action log rdd:')
        actionLogRdd_train = rawRdd_train.map(
            lambda x: (int(x[0]), int(x[1]), int(x[2]), int(x[3]), int(x[4]), int(x[5]),
                       int(x[6]), int(x[7]), int(x[8]), int(x[9]), int(x[10]), int(x[11])))
        # total = actionLogRdd_train.count()
        # print('total: ' + str(total))
        actionLogRdd_test = rawRdd_test.map(
            lambda x: (int(x[0]), int(x[1]), int(x[2]), int(x[3]), int(x[4]), int(x[5]),
                       int(x[6]), int(x[7]), int(x[8]), int(x[9]), int(x[10]), int(x[11])))

        # convert to DataFrames
        sqlContext = SQLContext(self.sc)
        labels = [('uid', typ.IntegerType()),
                  ('user_city', typ.IntegerType()),
                  ('item_id', typ.IntegerType()),
                  ('author_id', typ.IntegerType()),
                  ('item_city', typ.IntegerType()),
                  ('channel', typ.IntegerType()),
                  ('finish', typ.IntegerType()),
                  ('like', typ.IntegerType()),
                  ('music_id', typ.IntegerType()),
                  ('device', typ.IntegerType()),
                  ('time', typ.LongType()),
                  ('duration_time', typ.IntegerType())]
        actionLogSchema = typ.StructType([typ.StructField(e[0], e[1], True) for e in labels])

        dfactionLog_train = sqlContext.createDataFrame(actionLogRdd_train, actionLogSchema)
        dfactionLog_test = sqlContext.createDataFrame(actionLogRdd_test, actionLogSchema)

        dfactionLog_train = dfactionLog_train.filter(dfactionLog_train['duration_time'] <= 300)
        dfactionLog_test = dfactionLog_test.filter(dfactionLog_test['duration_time'] <= 300)

        # Merge train and test and remember the train count so they can be split again later;
        # note that union may change the row order.
        # df = dfactionLog_train.union(dfactionLog_test)
        # train_count = dfactionLog_train.count()
        # print("training set size: " + str(train_count))
        # test_count = dfactionLog_test.count()
        # print("test set size: " + str(test_count))

        # print('-------2. number of distinct values of finish/like-------------')
        # df.agg(fn.countDistinct('finish').alias('finish_distinct'),
        #        fn.countDistinct('like').alias('like_distinct')).show()
        # print("max and min of each feature")
        # df.describe().show()

        return dfactionLog_train, dfactionLog_test

    def bining(self, sqlContext, df, col, percent_list):
        '''
        :param sqlContext:
        :param df:
        :param col: the column to bin
        :return:
        '''
        pandas_df = df.toPandas()
        bins = []
        for percent in percent_list:
            # at least `percent`% of the values are smaller than or equal to this cut point
            bins.append(np.percentile(pandas_df.loc[:, col], percent))
        print(col + ' bin edges:')
        print(bins)
        pandas_df.loc[:, col] = np.digitize(pandas_df.loc[:, col], bins, right=True)
        # print(pandas_df)
        # rename the binned pandas column
        pandas_df.rename(columns={col: col + '_bin'}, inplace=True)
        df_spark = sqlContext.createDataFrame(pandas_df)
        # df_spark.show()
        return df_spark

    def city_col_deal(self, df, col):
        df_city_score = df.groupBy(col).avg('finish', 'like') \
            .withColumnRenamed("avg(finish)", "avg_finish") \
            .withColumnRenamed("avg(like)", "avg_like")
        df_city_score = df_city_score.withColumn(col + '_score',
                                                 df_city_score.avg_finish * 0.7 + df_city_score.avg_like * 0.3) \
            .select(col, fn.bround(col + '_score', scale=4).alias(col + '_score'))
        return df_city_score

    def dropUnuseCols(self, df, unuse_col):
        '''
        Drop columns that are no longer needed:
            device|time|author_id|music_id|uid|item_id
        Keep the following columns:
            user_city|item_city|channel|finish|like|duration_time
            device_Cnt_bin|item_pub_hour|authorid_Cnt_bin|musicid_Cnt_bin|uid_playCnt_bin|itemid_playCnt_bin
        '''
        # unuse_col = ['device', 'time', 'author_id', 'music_id', 'uid', 'item_id']
        for col in unuse_col:
            df = df.drop(col)
        return df

    def data_explore(self, df_train, df_test):
        sqlContext = SQLContext(self.sc)
        print("duration_time should be binned according to like vs. not-like")
        print("inspect the distribution of duration_time")
        print()
        print("------------1. Extract year/month/day/hour/minute from the timestamp "
              "(no workday feature; month-day crossing can stand in for holidays; year conversion is problematic)-----------------")
        # item publish time minus the earliest publish time, converted to days
        time_min = df_train.select(fn.min(df_train['time'])).collect()
        df_train = df_train.withColumn('time_day', ((df_train.time - fn.lit(time_min[0][0]))
                                                    / fn.lit(3600 * 24)).cast(typ.IntegerType()))
        # df_train = df_train.withColumn('time_strDate', fn.from_unixtime(df_train.time, "yyyy-MM-dd HH:mm:ss"))
        # convert the unix timestamp to a formatted date and extract month/day/hour/minute
        df_train = df_train.withColumn('item_pub_month', fn.from_unixtime(df_train.time, "M").cast(typ.IntegerType()))
        df_train = df_train.withColumn('item_pub_day', fn.from_unixtime(df_train.time, "d").cast(typ.IntegerType()))
        df_train = df_train.withColumn('item_pub_hour', fn.from_unixtime(df_train.time, "k").cast(typ.IntegerType()))
        df_train = df_train.withColumn('item_pub_minute', fn.from_unixtime(df_train.time, "m").cast(typ.IntegerType()))
        print("check that month, day, hour and minute are extracted correctly")
        df_train.show(truncate=False)
        df_train = df_train.drop('time')
        # Binning the counts of these time fields is not very informative, so treat them as plain
        # categorical variables; additionally add a pos_neg_ratio feature.
        df_test = df_test.withColumn('time_day', ((df_test.time - fn.lit(time_min[0][0]))
                                                  / fn.lit(3600 * 24)).cast(typ.IntegerType()))
        df_test = df_test.withColumn('item_pub_month', fn.from_unixtime(df_test.time, "M").cast(typ.IntegerType()))
        df_test = df_test.withColumn('item_pub_day', fn.from_unixtime(df_test.time, "d").cast(typ.IntegerType()))
        df_test = df_test.withColumn('item_pub_hour', fn.from_unixtime(df_test.time, "k").cast(typ.IntegerType()))
        df_test = df_test.withColumn('item_pub_minute', fn.from_unixtime(df_test.time, "m").cast(typ.IntegerType()))
        df_test = df_test.drop('time')

        print('--------2. Statistical features: count, ratio, nunique and CTR-related features')
        print("compute counts for base and crossed features, plus category-preference ratios")
        count_feats_list = []

        print('single feature count')
        count_feats_list.extend([[c] for c in df_train.columns if c not in
                                 ['time', 'channel', 'like', 'finish', 'duration_time',
                                  "time_day", "item_pub_month", "item_pub_day", "item_pub_hour", "item_pub_minute"]])
        print(count_feats_list)

        print('cross count')
        users = ['uid']
        authors = ['item_id', 'user_city', 'author_id', 'item_city', 'channel', 'music_id', 'device', 'item_pub_hour']
        count_feats_list.extend([[u_col, a_col] for u_col in users for a_col in authors])

        users = ['author_id']
        authors = ['channel', 'user_city', 'item_city', 'music_id', 'item_pub_hour']
        count_feats_list.extend([[u_col, a_col] for u_col in users for a_col in authors])

        count_feats_list.append(['uid', 'user_city', 'channel', 'device'])
        count_feats_list.append(['author_id', 'item_city', 'music_id', 'item_pub_hour'])
        print("count features are computed for the following field groups")
        print(count_feats_list)

        for i in range(len(count_feats_list)):
            group_cols = count_feats_list[i]
            new_feature = '_'.join(group_cols)
            # Depending on the dimensionality of the cross feature, concatenate the columns,
            # count the occurrences of each value, and map the result back onto train/test.
            if len(group_cols) == 1:
                if new_feature in ["music_id"]:
                    df1 = df_train.where(df_train[new_feature] != -1).groupby(new_feature).count() \
                        .withColumnRenamed('count', new_feature + '_count')
                else:
                    df1 = df_train.groupby(new_feature).count() \
                        .withColumnRenamed('count', new_feature + '_count')
                # category-preference ratio
                count_min = df1.select(fn.min(df1[new_feature + '_count'])).collect()[0][0]
                count_max = df1.select(fn.max(df1[new_feature + '_count'])).collect()[0][0]
                # F.bround("Rank", scale=4)
                df1 = df1.withColumn(new_feature + '_count_ratio',
                                     fn.bround(((df1[new_feature + '_count'] - fn.lit(count_min))
                                                / ((fn.lit(count_max) - fn.lit(count_min)).cast(typ.IntegerType()))), scale=3))
                # print("inspect df1_1")
                # df1.show(5, truncate=False)
                if new_feature == "device":
                    # [1.0, 16.0, 46.0, 102.0, 204.0, 410.0, 10389.0]  to be adjusted
                    percent_list = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
                elif new_feature == "author_id":
                    # [1.0, 2.0, 7.0, 32.0, 78.0, 276186.0]
                    percent_list = [0, 50, 75, 90, 95, 100]
                elif new_feature == "music_id":
                    # [1.0, 3.0, 13.0, 73.0, 211.0, 193640.0]
                    percent_list = [0, 50, 75, 90, 95, 100]  # each percent_list is different
                elif new_feature == "uid":
                    # bins [1.0, 104.0, 329.0, 741.0, 1131.0, 10389.0]
                    percent_list = [0, 50, 75, 90, 95, 100]
                elif new_feature == "item_id":
                    # [1.0, 1.0, 2.0, 7.0, 14.0, 6911.0]  binning to be adjusted
                    percent_list = [0, 75, 90, 95, 100]
                elif new_feature == "user_city":
                    # [1.0, 21935.5, 54519.5, 110179.0, 146319.75, 3789087.0]  to be adjusted
                    percent_list = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
                elif new_feature == "item_city":
                    # [1.0, 14725.0, 48576.0, 122887.0, 206845.5, 744265.0]  to be adjusted
                    percent_list = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
                else:
                    percent_list = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
                df1 = self.bining(sqlContext, df1, new_feature + '_count', percent_list)
                # print(df1.show(5, truncate=False))
                df_train = df_train.join(df1, new_feature, 'left')
                # print("train")
                # df_train.show(5, truncate=False)  # the ratio is a continuous variable in [0, 1]
                df_test = df_test.join(df1, new_feature, 'left')
                # print("test")
                # df_test.show(5, truncate=False)  # the ratio is a continuous variable in [0, 1]
                del df1
                gc.collect()
                print("result after processing all one-dimensional features")
                df_train.show(1, truncate=False)
                df_train.printSchema()
                df_test.show(1, truncate=False)
                df_train.printSchema()

            if len(group_cols) == 2:
                print("start processing 2-dimensional cross features")
                df_train = df_train.withColumn(new_feature,
                                               fn.concat_ws('_', df_train[group_cols[0]].cast(typ.StringType()),
                                                            df_train[group_cols[1]].cast(typ.StringType())))
                df_test = df_test.withColumn(new_feature,
                                             fn.concat_ws('_', df_test[group_cols[0]].cast(typ.StringType()),
                                                          df_test[group_cols[1]].cast(typ.StringType())))
                df2 = df_train.groupby(new_feature).count() \
                    .withColumnRenamed('count', new_feature + '_count')
                # category-preference ratio
                count_min = df2.select(fn.min(df2[new_feature + '_count'])).collect()[0][0]
                count_max = df2.select(fn.max(df2[new_feature + '_count'])).collect()[0][0]
                # F.bround("Rank", scale=4)
                df2 = df2.withColumn(new_feature + '_count_ratio',
                                     fn.bround(((df2[new_feature + '_count'] - fn.lit(count_min))
                                                / ((fn.lit(count_max) - fn.lit(count_min)).cast(typ.IntegerType()))), scale=3))
                # print("inspect df2_1")
                # df2.show(5, truncate=False)
                if new_feature == "uid_item_id":
                    percent_list = [0, 20, 35, 50, 65, 85, 100]  # each percent_list is different
                else:
                    percent_list = [0, 50, 75, 90, 95, 100]
                # elif new_feature == "uid_user_city":
                #     percent_list = [0, 50, 75, 90, 95, 100]
                # elif new_feature == "uid_author_id":
                #     percent_list = [0, 50, 75, 90, 95, 100]
                # elif new_feature == "uid_item_city":
                #     percent_list = [0, 50, 75, 90, 95, 100]
                # elif new_feature == "uid_channel":
                #     percent_list = [0, 50, 75, 90, 95, 100]
                # elif new_feature == "uid_music_id":
                #     percent_list = [0, 50, 75, 90, 95, 100]
                # elif new_feature == "uid_device":
                #     percent_list = [0, 50, 75, 90, 95, 100]
                # elif new_feature == "uid_time_pub_hour":
                #     percent_list = [0, 50, 75, 90, 95, 100]
                # 2-dim crosses:
                # ['uid', 'item_id'], ['uid', 'user_city'], ['uid', 'author_id'], ['uid', 'item_city'], ['uid', 'channel'],
                # ['uid', 'music_id'], ['uid', 'device'], ['uid', 'time_pub_hour'],
                # ['author_id', 'channel'], ['author_id', 'user_city'], ['author_id', 'item_city'], ['author_id', 'music_id'],
                # ['author_id', 'time_pub_hour']
                df2 = self.bining(sqlContext, df2, new_feature + '_count', percent_list)
                print("inspect df2_2")
                df2.show(5, truncate=False)
                df_train = df_train.join(df2, new_feature, 'left')
                # print("train")
                # df_train.show(5, truncate=False)  # the ratio is a continuous variable in [0, 1]
                df_test = df_test.join(df2, new_feature, 'left')
                # print("test")
                # df_test.show(5, truncate=False)

            if len(group_cols) == 4:
                print("start processing 4-dimensional cross features")
                df_train = df_train.withColumn(new_feature,
                                               fn.concat_ws('_', df_train[group_cols[0]].cast(typ.StringType()),
                                                            df_train[group_cols[1]].cast(typ.StringType()),
                                                            df_train[group_cols[2]].cast(typ.StringType()),
                                                            df_train[group_cols[3]].cast(typ.StringType())))
                df_test = df_test.withColumn(new_feature,
                                             fn.concat_ws('_', df_test[group_cols[0]].cast(typ.StringType()),
                                                          df_test[group_cols[1]].cast(typ.StringType()),
                                                          df_test[group_cols[2]].cast(typ.StringType()),
                                                          df_test[group_cols[3]].cast(typ.StringType())))
                df3 = df_train.groupby(new_feature).count() \
                    .withColumnRenamed('count', new_feature + '_count')
                # category-preference ratio
                count_min = df3.select(fn.min(df3[new_feature + '_count'])).collect()[0][0]
                count_max = df3.select(fn.max(df3[new_feature + '_count'])).collect()[0][0]
                # F.bround("Rank", scale=4)
                df3 = df3.withColumn(new_feature + '_count_ratio',
                                     fn.bround(((df3[new_feature + '_count'] - fn.lit(count_min))
                                                / ((fn.lit(count_max) - fn.lit(count_min)).cast(typ.IntegerType()))), scale=3))
                # print("inspect df3_1")
                # df3.show(5, truncate=False)
                percent_list = [0, 50, 75, 90, 95, 100]
                df3 = self.bining(sqlContext, df3, new_feature + '_count', percent_list)
                print("inspect df3_2")
                df3.show(5, truncate=False)
                df_train = df_train.join(df3, new_feature, 'left')
                # print("train")
                # df_train.show(5, truncate=False)
                # 4-dim crosses: ['uid', 'user_city', 'channel', 'device'], ['author_id', 'item_city', 'music_id', 'time_pub_hour']
                df_test = df_test.join(df3, new_feature, 'left')
                # print("test")
                # df_test.show(5, truncate=False)

        # df.show(5, truncate=False)
        print("drop columns that are no longer needed")
        # 'uid' and 'item_id' must not be dropped; they are needed later when building the submission
        unuse_col = ['item_city', 'user_city', 'device', 'author_id', 'music_id', ]
        df_train = self.dropUnuseCols(df_train, unuse_col)
        df_test = self.dropUnuseCols(df_test, unuse_col)

        print("the tables contain null fields, mainly produced by the left joins")
        print("skip filling them here; they are filled when the three tables are joined")
        # df_train = df_train.na.fill(-1)
        # df_test = df_test.na.fill(-1)

        print("train summary statistics")
        desc = df_train.describe()
        desc.show()
        print("test summary statistics")
        desc = df_test.describe()
        desc.show()

        print('-------5. Save the preprocessing results-------')
        test_file_path = self.parser.get("hdfs_path", "hdfs_data_path") + 'actLog_test_new'
        os.system("hadoop fs -rm -r {}".format(test_file_path))
        df_test.rdd.map(tuple).saveAsPickleFile(test_file_path)
        del df_test
        gc.collect()

        train_file_path = self.parser.get("hdfs_path", "hdfs_data_path") + 'actLog_train_new'
        # os.system(command) runs the given shell command
        os.system("hadoop fs -rm -r {}".format(train_file_path))
        df_train.rdd.map(tuple).saveAsPickleFile(train_file_path)
        '''
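# Standalone sketch of the binning idea implemented in SparkFEProcess.bining above:
# derive percentile cut points for a count column and map each value to a bin id
# with np.digitize. Toy values; the percentile list mirrors the ones used above.
import numpy as np

counts = np.array([1, 2, 7, 32, 78, 276186])
bins = [np.percentile(counts, p) for p in [0, 50, 75, 90, 95, 100]]
bin_ids = np.digitize(counts, bins, right=True)
print(list(zip(counts.tolist(), bin_ids.tolist())))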