def start():
    sconf = SparkConf()
    sconf.set('spark.cores.max', 2)
    sc = SparkContext(appName='KafkaDirectWordCount', conf=sconf)
    ssc = StreamingContext(sc, 2)

    brokers = "192.192.0.27:9092"
    topics = ['topic7']
    kafkaStreams_lines = KafkaUtils.createDirectStream(ssc, topics, kafkaParams={"metadata.broker.list": brokers})
    lines1 = kafkaStreams_lines.map(lambda x: x[1])  # Note: the second element of the tuple is the received Kafka message
    words = lines1.flatMap(lambda line: line.split(" "))
    pairs = words.map(lambda word: (word, 1))
    wordcounts = pairs.reduceByKey(lambda x, y: x + y)
    wordcounts.saveAsTextFiles("/var/lib/hadoop-hdfs/spark-libin/kafka")
    wordcounts.pprint()  # Show the distribution of the generated counts

    ssc.start()             # Start the computation
    ssc.awaitTermination()  # Wait for the computation to terminate
def main(args):
    if len(args) < 2:
        sys.exit(1)

    # Setting the cluster configuration parameters
    spark_master = args[0]
    spark_data_file_name = args[1]
    file_path = CURR_DIR + "/" + spark_data_file_name

    conf = SparkConf()
    conf.setMaster(spark_master)
    conf.setAppName("Log Scanner")

    # Creating a Spark Context with conf file
    sc = SparkContext(conf=conf)

    txt_logs = sc.textFile(file_path).filter(lambda line: check(line))
    access_logs = txt_logs.map(lambda line: AccessLog(line))

    # Getting response_codes from log objects and caching it
    response_codes = access_logs.map(lambda log: log.get_status()).cache()
    log_count = response_codes.count()
    print("Total Response Codes: " + str(log_count))

    cnt = response_codes.map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y)
    response200 = cnt.filter(lambda x: x[0] == "200").map(lambda x: x[1]).collect()
    print("###########################")
    print("## Success Rate : " + str(int(response200[0]) * 100 / log_count) + " % ##")
    print("###########################")
def main():
    """
    Main entry point of the application
    """
    # Create spark configuration and spark context
    include_path = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'preprocessing.py'))
    conf = SparkConf()
    conf.set('spark.executor.memory', '1500m')
    conf.setAppName("Generating predictions")
    sc = SparkContext(conf=conf, pyFiles=[include_path])

    # Set S3 configuration
    sc._jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", os.environ['AWS_ACCESS_KEY'])
    sc._jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey", os.environ['AWS_SECRET_KEY'])

    # Single-pass predictions
    fast_predict(sc,
                 file_input="s3n://twitter-stream-data/twitter-*",
                 file_output="s3n://twitter-stream-predictions/final",
                 sports_model="PyTwitterNews/models/sports.model",
                 politics_model="PyTwitterNews/models/politics.model",
                 technology_model="PyTwitterNews/models/technology.model")

    # Stop application
    sc.stop()
def get_default_spark_conf():
    conf = SparkConf(). \
        setAppName("pyunit-test"). \
        setMaster("local-cluster[3,1,2048]"). \
        set("spark.ext.h2o.disable.ga", "true"). \
        set("spark.driver.memory", "2g"). \
        set("spark.executor.memory", "2g"). \
        set("spark.ext.h2o.client.log.level", "DEBUG"). \
        set("spark.ext.h2o.repl.enabled", "false"). \
        set("spark.task.maxFailures", "1"). \
        set("spark.rpc.numRetries", "1"). \
        set("spark.deploy.maxExecutorRetries", "1"). \
        set("spark.network.timeout", "360s"). \
        set("spark.worker.timeout", "360"). \
        set("spark.ext.h2o.backend.cluster.mode", ExternalClusterTestHelper.cluster_mode()). \
        set("spark.ext.h2o.cloud.name", ExternalClusterTestHelper.unique_cloud_name("test")). \
        set("spark.ext.h2o.external.start.mode", os.getenv("spark.ext.h2o.external.start.mode", "manual")). \
        set("spark.sql.warehouse.dir", "file:" + os.path.join(os.getcwd(), "spark-warehouse"))

    if ExternalClusterTestHelper.tests_in_external_mode():
        conf.set("spark.ext.h2o.client.ip", ExternalClusterTestHelper.local_ip())
        conf.set("spark.ext.h2o.external.cluster.num.h2o.nodes", "2")

    return conf
def configureSpark(app_name, master):
    # Configure SPARK
    conf = SparkConf().setAppName(app_name)
    conf = conf.setMaster(master)
    spark_context = SparkContext(conf=conf)
    return spark_context
def main():
    conf = SparkConf()
    conf.set("spark.default.parallelism", "24")
    sc = SparkContext(appName="PhoneLab Preprocessing", conf=conf)

    lines = sc.textFile(data_files, use_unicode=False)

    # Create LogLine objects and filter out empty lines
    logs = lines.flatMap(ll_mapper)

    # Save in an intermediate format
    logs.saveAsTextFile(out_dir, compressionCodecClass=codec)
    return

    # NOTE: the code below is unreachable because of the early return above.
    # Gap detection
    keyed = logs.map(ll_gap_map)
    merged = keyed.groupByKey()
    # At this point we have ((boot_id, date), [line_num]) tuples. The last step
    # is to find all the gaps within each key/tuple.
    result = merged.flatMap(find_gaps)
    gaps = result.collect()
    fd = open("/spark/gaps.json", 'w')
    fd.write(json.dumps(gaps, indent=4))
    fd.close()
def setUpClass(cls):
    class_name = cls.__name__
    conf = SparkConf()
    conf.set('spark.app.name', class_name)

    # Read the spark configuration and update the spark conf
    test_spark_config = ConfigParser.ConfigParser()
    test_spark_config.read('test_config.cfg')
    test_spark_config.sections()
    configs = dict(test_spark_config.items('spark_conf_test_generic'))
    for k, v in configs.items():
        conf.set(k, v)
    cls.spark_test_configs = configs

    # Create the spark context
    cls.sc = SparkContext(conf=conf)

    if 'PYSPARK_DRIVER_PYTHON' in configs.keys():
        cls.sc.pythonExec = configs['PYSPARK_DRIVER_PYTHON']
    else:
        cls.sc.pythonExec = 'python2.7'

    logger = cls.sc._jvm.org.apache.log4j
    logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
    logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)

    logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s: %(message)s')
    cls.logger = logging.getLogger(__name__)
    cls.logger.setLevel(logging.DEBUG)
def main():
    parser = argparse.ArgumentParser(
        description='process some log messages, storing them and signaling '
                    'a rest server')
    parser.add_argument('--mongo', help='the mongodb url', required=True)
    parser.add_argument('--rest', help='the rest endpoint to signal', required=True)
    parser.add_argument('--port',
                        help='the port to receive from (default: 1984)',
                        default=1984, type=int)
    parser.add_argument('--appname',
                        help='the name of the spark application '
                             '(default: SparkharaLogCounter)',
                        default='SparkharaLogCounter')
    parser.add_argument('--master', help='the master url for the spark cluster')
    parser.add_argument('--socket',
                        help='the socket to attach for streaming text data '
                             '(default: caravan-pathfinder)',
                        default='caravan-pathfinder')
    args = parser.parse_args()
    mongo_url = args.mongo
    rest_url = args.rest

    sconf = SparkConf().setAppName(args.appname)
    if args.master:
        sconf.setMaster(args.master)
    sc = SparkContext(conf=sconf)
    ssc = StreamingContext(sc, 1)

    lines = ssc.socketTextStream(args.socket, args.port)
    lines.foreachRDD(lambda rdd: process_generic(rdd, mongo_url, rest_url))

    ssc.start()
    ssc.awaitTermination()
def start():
    sconf = SparkConf()
    sconf.set('spark.cores.max', 2)
    sc = SparkContext(appName='KafkaDirectWordCount', conf=sconf)
    ssc = StreamingContext(sc, 2)

    brokers = "localhost:9092"
    topics = ['test']
    kafkaStreams_lines = KafkaUtils.createDirectStream(ssc, topics, kafkaParams={"metadata.broker.list": brokers})
    lines1 = kafkaStreams_lines.map(lambda x: x[1])  # Note: the second element of the tuple is the received Kafka message
    words = lines1.flatMap(lambda line: line.split(" "))
    pairs = words.map(lambda word: (word, 1))
    wordcounts = pairs.reduceByKey(lambda x, y: x + y)
    print(wordcounts)
    kafkaStreams_lines.transform(storeOffsetRanges).foreachRDD(printOffsetRanges)
    wordcounts.pprint()  # Show the distribution of the generated counts

    ssc.start()             # Start the computation
    ssc.awaitTermination()  # Wait for the computation to terminate
def stackexchange_xml_spark_job():
    server = bluebook_conf.HDFS_FQDN
    conf = SparkConf()
    xml_file_address = "hdfs://" + server + "/" +\
        bluebook_conf.STACKEXCHANGE_XML_FOLDER_NAME +\
        bluebook_conf.STACKEXCHANGE_XML_FILE_NAME
    json_ques_folder_address = "hdfs://" + server + "/" +\
        bluebook_conf.STACKEXCHANGE_JSON_QUES_FOLDER_NAME
    json_ans_folder_address = "hdfs://" + server + "/" +\
        bluebook_conf.STACKEXCHANGE_JSON_ANS_FOLDER_NAME

    conf.setAppName('stackexchange_xml_spark_job')
    spark_context = SparkContext(conf=conf)

    file = spark_context.textFile(xml_file_address)

    # Ques and Ans files are stored separately depending on their 'posttypeid'
    # Ques -> posttypeid == 1
    # Ans  -> posttypeid == 2
    ques = file.map(stackexchange_xml_mapper)\
               .filter(lambda dic: 'posttypeid' in dic.keys())\
               .filter(lambda dic: dic['posttypeid'] == '1')\
               .map(lambda d: jsoner(d))
    ans = file.map(stackexchange_xml_mapper)\
              .filter(lambda dic: 'posttypeid' in dic.keys())\
              .filter(lambda dic: dic['posttypeid'] == '2')\
              .map(lambda d: jsoner(d))

    ques.saveAsTextFile(json_ques_folder_address)
    ans.saveAsTextFile(json_ans_folder_address)
def __call__(self):
    log.info("Processing wiki dump: %s ...", self.wk_dump_path)

    c = SparkConf().setAppName("Wikijson")
    log.info("Using spark master: %s", c.get("spark.master"))
    sc = SparkContext(conf=c)

    if os.path.isdir(self.output_path):
        log.warn("Writing over output path: %s", self.output_path)
        shutil.rmtree(self.output_path)

    # rdd of tuples: (title, namespace, id, redirect, content)
    pages = wikispark.get_pages_from_wikidump(sc, self.wk_dump_path)
    pages.cache()

    articles = wikispark.get_articles_from_pages(pages)
    redirects = wikispark.get_redirects_from_pages(pages)

    if self.redirect_links:
        articles = wikispark.redirect_article_links(articles, redirects)

    articles.map(self.article_to_json).map(json.dumps).saveAsTextFile(
        self.output_path, "org.apache.hadoop.io.compress.GzipCodec"
    )

    log.info("Done.")
def setUp(self):
    conf = SparkConf().setAppName('testing').setMaster('local[2]').set('spark.driver.host', 'localhost')
    conf.set('spark.ui.showConsoleProgress', False)
    self.session = SparkSession.builder.config(conf=conf).getOrCreate()

    self.test_data = [
        ('Ricardo', 'engineering', 2),
        ('Tisa', 'sales', 3),
        ('Sheree', 'marketing', 4),
        ('Chantelle', 'engineering', 5),
        ('Kylee', 'finance', 2),
        ('Tamatha', 'marketing', 5),
        ('Trena', 'engineering', 2),
        ('Arica', 'engineering', 1),
        ('Santina', 'finance', 2),
        ('Daria', 'marketing', 1),
        ('Magnolia', 'sales', 2),
        ('Antonina', 'finance', 1),
        ('Sumiko', 'engineering', 1),
        ('Carmen', 'sales', 2),
        ('Delois', 'engineering', 1),
        ('Luetta', 'marketing', 3),
        ('Yessenia', 'sales', 1),
        ('Petra', 'engineering', 3),
        ('Charisse', 'engineering', 4),
        ('Lillian', 'engineering', 3),
        ('Wei', 'engineering', 2),
        ('Lahoma', 'sales', 2),
        ('Lucilla', 'marketing', 1),
        ('Stephaine', 'finance', 2),
    ]
def getSparkContext(self, appName, master):
    print(appName)
    print(master)
    conf = SparkConf().setAppName(appName).setMaster(master)
    conf.set("spark.local.ip", "127.0.0.1")
    conf.set("spark.driver.host", "127.0.0.1")
    return SparkContext(conf=conf)
def main():
    spark_conf = SparkConf().setAppName("Different-Sampling data").setMaster('local[*]')
    spark_conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    sc = SparkContext(conf=spark_conf)
    GA.logInConsole(0, "input file read!")
    rdd = sc.textFile("/home/fatemeh/Data/saveData.txt", minPartitions=500, use_unicode=False)
    rdd.unpersist()
    # print('\nNumber of Partitions for this run: ', rdd.getNumPartitions())
    vectorRDD = rdd.map(lambda line: toVector(line, splitter=' '))
    GA.logInConsole(0, "Data Vectorized!")

    ss = list()
    GA.logInConsole(-1, 'Start the ensemble')
    GA.logInConsole(-10, "GA with range 3")
    ss.append(GA.parallel_GA_main(vectorRDD, sc, 5))
    # GA.logInConsole(-10, "GA with range 4")
    # ss.append(GA.parallel_GA_main(norm, sc, 4))
    # GA.logInConsole(-10, "GA with range 5")
    # ss.append(GA.parallel_GA_main(norm, sc, 5))
    # GA.logInConsole(-10, "GA with range 3 and Sampled data set")
    # sampleRDD = norm.sample(False, 0.6, seed=10)
    # ss.append(GA.parallel_GA_main(sampleRDD, sc, 3))
    print(ss)
    # selectedSS = voted_subsapces(ss)
    # SSD.outlierDetection(vectorRDD, ss)
    GA.logInConsole(100, "\nend of program")
    sc.stop()
def read_conf():
    """
    Setting up spark contexts
    """
    conf = SparkConf()
    conf.setMaster("local[*]")
    conf.setAppName("Testing")
    return conf
def _test_broadcast_on_driver(self, *extra_confs):
    conf = SparkConf()
    for key, value in extra_confs:
        conf.set(key, value)
    conf.setMaster("local-cluster[2,1,1024]")
    self.sc = SparkContext(conf=conf)
    bs = self.sc.broadcast(value=5)
    self.assertEqual(5, bs.value)
def configureSpark():
    # Configure SPARK
    conf = SparkConf().setAppName("a")
    conf = conf.setMaster("local[*]")
    conf = conf.set("spark.executor.memory", "2g") \
               .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
               .set("spark.kryoserializer.buffer", "256") \
               .set("spark.akka.frameSize", "500") \
               .set("spark.rpc.askTimeout", "30") \
               .set('spark.executor.cores', '4') \
               .set('spark.driver.memory', '2g')
    sc = SparkContext(conf=conf)
    return sc
class OWSparkContext(SharedSparkContext, widget.OWWidget):
    priority = 0
    name = "Context"
    description = "Create a shared Spark (sc) and Hive (hc) Contexts"
    icon = "../icons/spark.png"

    want_main_area = False
    resizing_enabled = True
    conf = None

    def __init__(self):
        super().__init__()

        # The main label of the Control's GUI.
        # gui.label(self.controlArea, self, "Spark Context")

        self.conf = SparkConf()
        all_prefedined = dict(self.conf.getAll())

        # Create parameters Box.
        box = gui.widgetBox(self.controlArea, "Spark Application", addSpace=True)

        self.gui_parameters = OrderedDict()

        main_parameters = OrderedDict()
        main_parameters['spark.app.name'] = 'OrangeSpark'
        main_parameters['spark.master'] = 'yarn-client'
        main_parameters["spark.executor.instances"] = "8"
        main_parameters["spark.executor.cores"] = "4"
        main_parameters["spark.executor.memory"] = "8g"
        main_parameters["spark.driver.cores"] = "4"
        main_parameters["spark.driver.memory"] = "2g"
        main_parameters["spark.logConf"] = "false"
        main_parameters["spark.app.id"] = "dummy"

        for k, v in main_parameters.items():
            default_value = all_prefedined.setdefault(k, v)
            self.gui_parameters[k] = GuiParam(parent_widget=box, label=k, default_value=v)
            all_prefedined.pop(k)

        for k, v in all_prefedined.items():
            self.gui_parameters[k] = GuiParam(parent_widget=box, label=k, default_value=str(v))

        action_box = gui.widgetBox(box)
        # Action Button
        self.create_sc_btn = gui.button(action_box, self, label='Submit', callback=self.create_context)

    def onDeleteWidget(self):
        if self.sc:
            self.sc.stop()

    def create_context(self):
        for key, parameter in self.gui_parameters.items():
            self.conf.set(key, parameter.get_value())

        self.sc = SparkContext(conf=self.conf)
        self.hc = HiveContext(self.sc)
def configureSpark(app_name, master):
    # Configure SPARK
    conf = SparkConf().setAppName(app_name)
    conf = conf.setMaster(master)
    # conf.set("fs.s3n.awsAccessKeyId", "")
    # conf.set("fs.s3n.awsSecretAccessKey", "")
    spark_context = SparkContext(conf=conf)
    return spark_context
def main():
    spark_conf = SparkConf().setAppName("Different-Sampling data")
    spark_conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    sc = SparkContext(conf=spark_conf)
    rdd = load_data(sc)
    print(rdd.getNumPartitions())
    parallel_GA_main(sc, rdd, 5)
    sc.stop()
def init_spark_context():
    # load spark context
    conf = SparkConf().setAppName("ctr-server")
    conf.set('spark.kryoserializer.buffer', '512mb')
    conf.set('spark.kryoserializer.buffer.max', '512')
    # IMPORTANT: pass additional Python modules to each worker
    sc = SparkContext(conf=conf, pyFiles=['/home/ec2-user/engine.py', '/home/ec2-user/app.py'])
    return sc
def create_sc(master=None,
              py_files=None,
              spark_home=None,
              sparktk_home=None,
              pyspark_submit_args=None,
              app_name="sparktk",
              extra_conf=None):
    """
    Creates a SparkContext with sparktk defaults

    Many parameters can be overwritten

    :param master: spark master setting
    :param py_files: list of str of paths to python dependencies; Note that the current python package
                     will be freshly zipped up and put in a tmp folder for shipping by spark, and then removed
    :param spark_home: override $SPARK_HOME
    :param sparktk_home: override $SPARKTK_HOME
    :param app_name: name of spark app
    :param extra_conf: dict for any extra spark conf settings,
                       for ex. {"spark.hadoop.fs.default.name": "file:///"}
    :return: pyspark SparkContext
    """
    set_env_for_sparktk(spark_home, sparktk_home, pyspark_submit_args)

    # bug/behavior of PYSPARK_SUBMIT_ARGS requires 'pyspark-shell' on the end --check in future spark versions
    set_env('PYSPARK_SUBMIT_ARGS', ' '.join([os.environ['PYSPARK_SUBMIT_ARGS'], 'pyspark-shell']))

    if not master:
        master = default_spark_master
        logger.info("sparktk.create_sc() master not specified, setting to %s", master)

    conf = SparkConf().setMaster(master).setAppName(app_name)
    if extra_conf:
        for k, v in extra_conf.items():
            conf = conf.set(k, v)

    if not py_files:
        py_files = []

    # zip up the relevant pieces of sparktk and put it in the py_files...
    path = zip_sparktk()
    tmp_dir = os.path.dirname(path)
    logger.info("sparkconf created tmp dir for sparktk.zip %s" % tmp_dir)
    atexit.register(shutil.rmtree, tmp_dir)  # make python delete this folder when it shuts down

    py_files.append(path)

    msg = '\n'.join(["=" * 80,
                     "Creating SparkContext with the following SparkConf",
                     "pyFiles=%s" % str(py_files),
                     conf.toDebugString(),
                     "=" * 80])
    logger.info(msg)

    sc = SparkContext(conf=conf, pyFiles=py_files)
    return sc
def create_spark_context(app_name="Quiz Bowl", lm_memory=False, profile=False):
    spark_conf = SparkConf()
    if lm_memory:
        pass
        # spark_conf = spark_conf.set('spark.max.cores', 30).set('spark.executor.cores', 30)
    if profile:
        spark_conf = spark_conf.set('spark.python.profile', True)
    spark_conf = spark_conf.set('spark.akka.frameSize', 300)
    return SparkContext(appName=app_name, master=QB_SPARK_MASTER, conf=spark_conf)
def main():
    # Spark Configurations
    conf = SparkConf()
    conf.set("spark.master", "local[*]")
    conf = conf.setAppName('Learning PySpark')
    sc = SparkContext(conf=conf)

    df = sc\
        .textFile('IXQ_20170622080001.csv')\
        .map(lambda line: line.split(','))
    print(df.take(5))
def _configure_spark(self):
    logger.info('Configuring Spark')
    sconf = SparkConf()
    for prop, value in self.sm_config['spark'].iteritems():
        if prop.startswith('spark.'):
            sconf.set(prop, value)

    self.sc = SparkContext(master=self.sm_config['spark']['master'], conf=sconf, appName='SM engine')
    if not self.sm_config['spark']['master'].startswith('local'):
        self.sc.addPyFile(join(local_path(proj_root()), 'sm.zip'))
def createSparkConf():
    from pyspark import SparkConf
    test_properties = conftest.test_properties()

    conf = SparkConf()
    conf.set("cloudant.host", test_properties["cloudanthost"])
    conf.set("cloudant.username", test_properties["cloudantusername"])
    conf.set("cloudant.password", test_properties["cloudantpassword"])

    return conf
def main():
    # Setting the cluster configuration parameters
    conf = SparkConf()
    conf.setMaster("spark://localhost:7077")
    conf.setAppName("Tweet App")
    conf.set("spark.executor.memory", "3g")
    conf.set("spark.driver.memory", "4g")

    # Creating a Spark Context with conf file
    sc = SparkContext(conf=conf)

    # Creating an SQL context to perform SQL queries
    sqlContext = SQLContext(sc)

    # Define the data path
    curr_path = os.path.dirname(os.path.abspath(__file__))
    json_name = "out.json"
    json_file_path = os.path.join(curr_path + "/../Spark_Jobs/data/", json_name)

    parquet_file_path = createSQLContext(json_file_path, sqlContext)
    print(parquet_file_path)

    # Read from parquet file
    parquetFile = sqlContext.read.parquet(parquet_file_path)
    parquetFile.registerTempTable("tweets")
    counter = sqlContext.sql("SELECT count(*) as cnt FROM tweets")
    print("============= Count =================")
    print("Count:: " + str(counter.collect()[0].cnt))
def home3(request):
    # spark_home = os.environ['SPARK_HOME'] = '/usr/local/spark-1.5.2-bin-2.7.1/'  # '/usr/local/spark/'
    # sys.path.insert(0, os.path.join(spark_home, 'python'))
    # sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.8.2.1-src.zip'))
    # from pyspark import SparkContext, SparkConf
    # sc = SparkContext()
    # data = [1, 2, 3, 4, 5]
    # distData = sc.parallelize(data)
    # first = distData.take(1)
    # sc.stop()
    prefs = ["worldnews", "politics", "Economics", "Libertarian"]

    scfg = SparkConf()
    scfg.set("spark.cores.max", 64)
    sc = SparkContext(master="spark://final-gateway:7077", appName="reddit-cf", conf=scfg)
    # data = [1, 2, 3, 4, 5]
    # distData = sc.parallelize(data)
    # first = distData.take(1)
    # sc.stop()

    try:
        # prep data
        raw_counts = sc.textFile("hdfs://final-gateway/w251_cf-user-site-total")
        parsed_counts = raw_counts.map(lambda st: eval(st))
        all_ratings = parsed_counts.map(tup_to_rating)
        # assign user-identified preferred subreddits
        raw_prefs = [(999, x, 100) for x in prefs]
        my_prefs = sc.parallelize(raw_prefs).map(tup_to_rating)
        # train model
        model_input = all_ratings.union(my_prefs)
        # model = ALS.trainImplicit(model_input, 10, 10, alpha=.01)
        # candidate prefs for prediction
        # my_prefs_ids = set([javahash(x) for x in prefs])
        # all_subreddit_ids = parsed_counts.map(lambda (a, b, c): (javahash(b), b)).distinct().cache()
        # candidates = all_subreddit_ids.map(lambda (a, b): a).filter(lambda r: r not in my_prefs_ids)
        # predictions = model.predictAll(candidates.map(lambda x: (999, x))).cache()
        # final = predictions.map(lambda (a, b, c): (b, c)).join(all_subreddit_ids).map(lambda (b, (c, d)): (c, d)).sortByKey(False)
        # output = list(final.take(30))
        sc.stop()
        # return output
        recommends = ["asfd"]  # output
    except Exception as e:
        print("App failed. Stopping gracefully")
        sc.stop()
        raise Exception(e)
def __init__(self, master, name):
    self.name = name
    self.master = master
    print("init spark ...")
    os.environ["HADOOP_HOME"] = r"D:\code\wqr\hadoop-common-2.2.0-bin"
    conf = SparkConf()
    conf.setMaster(self.master)
    conf.setAppName(self.name)
    self.sc = SparkContext(conf=conf)
def _test_multiple_broadcasts(self, *extra_confs):
    """
    Test broadcast variables make it OK to the executors.
    Tests multiple broadcast variables, and also multiple jobs.
    """
    conf = SparkConf()
    for key, value in extra_confs:
        conf.set(key, value)
    conf.setMaster("local-cluster[2,1,1024]")
    self.sc = SparkContext(conf=conf)
    self._test_encryption_helper([5])
    self._test_encryption_helper([5, 10, 20])
import sys
import json
import matplotlib.pyplot as plt
import pandas as pd
import gzip
import locale
from pyspark import SparkConf, SparkContext
from pyspark.sql import Row, SparkSession
from pyspark.sql.types import StructField, StructType, StringType

## CONFIG ##
sys.stdout = open(sys.stdout.fileno(), mode='w', encoding='utf8', buffering=1)
locale.getdefaultlocale()
locale.getpreferredencoding()

conf = SparkConf().set('spark.driver.host', '127.0.0.1')
sc = SparkContext(master='local', appName='Video Games rank - Bar graph', conf=conf)
spark = SparkSession.builder.appName(
    "Video Games rank - Bar graph").getOrCreate()

## DATA ##
s = sc.textFile('hdfs://namenode:9000/Video_Games_5.json.gz')
df = spark.read.json(s)

## PROCESS ##
# Printing important information about the data
df.printSchema()
        sum_xx += ratingX * ratingX
        sum_yy += ratingY * ratingY
        sum_xy += ratingX * ratingY
        numPairs += 1

    numerator = sum_xy
    denominator = sqrt(sum_xx) * sqrt(sum_yy)

    score = 0
    if (denominator):
        score = (numerator / (float(denominator)))

    return (score, numPairs)


conf = SparkConf().setMaster("local[*]").setAppName("MovieSimilarities")
sc = SparkContext(conf=conf)

print("\nLoading movie names...")
nameDict = loadMovieNames()

data = sc.textFile(
    "C:/Users/Andy/Dropbox/FactoryFloor/Repositories/Tutorial_Udemy_SparkPython/Course_Resources/ml-100k/u.data"
)

# Map ratings to key / value pairs: user ID => movie ID, rating
ratings = data.map(lambda l: l.split()).map(
    lambda l: (int(l[0]), (int(l[1]), float(l[2]))))

# Emit every movie rated together by the same user.
# Self-join to find every combination.
from pyspark import SparkConf, SparkContext

con = SparkConf()
sc = SparkContext(conf=con)

list3 = [1, 2, 3, 5, 6, 7, 9, 8, 1, 2, 3, 1, 2, 7, 8, 9, 2, 3, 4, 8, 9, 10, 1, 7, 8, 2, 7, 9, 9]
rdd1 = sc.parallelize(list3)
num = rdd1.count()
data = rdd1.countByValue()
print(data)
for k in data:
    print(k, "-", data[k])
# first, import pyspark
from pyspark import SparkConf, SparkContext

# create a conf
conf = SparkConf().setMaster("local").setAppName("FriendsByAge")
sc = SparkContext(conf=conf)


# parse the RDD into key value pairs in tuple which contains integers
def parseLine(line):
    fields = line.split(',')
    age = int(fields[2])
    numFriends = int(fields[3])
    return (age, numFriends)


# read the file and create a new RDD after mapping the parseLine function
lines = sc.textFile("file:///SparkCourse/fakefriends.csv")
rdd = lines.map(parseLine)

# mapValues does not modify the key, but the values, and we use reduceByKey to sum up the values (tuples)
totalsByAge = rdd.mapValues(lambda x: (x, 1)).reduceByKey(
    lambda x, y: (x[0] + y[0], x[1] + y[1]))
averagesByAge = totalsByAge.mapValues(lambda x: round((x[0] / x[1])))
results = averagesByAge.sortByKey().collect()

# in order to print out
for result in results:
    print(result)
# Find popular movies from Ml-100k data
from pyspark import SparkConf, SparkContext


def loadMovieNames():
    movieNames = {}
    with open("ml-100k/u.ITEM") as f:
        for line in f:
            fields = line.split('|')
            movieNames[int(fields[0])] = fields[1]
    return movieNames


conf = SparkConf().setMaster("local").setAppName("PopularMovies")
sc = SparkContext(conf=conf)

nameDict = sc.broadcast(loadMovieNames())

lines = sc.textFile("file:///SparkCourse/ml-100k/u.data")
movies = lines.map(lambda x: (int(x.split()[1]), 1))
movieCounts = movies.reduceByKey(lambda x, y: x + y)

flipped = movieCounts.map(lambda x: (x[1], x[0]))
sortedMovies = flipped.sortByKey()

sortedMoviesWithNames = sortedMovies.map(lambda countMovie: (nameDict.value[countMovie[1]], countMovie[0]))

results = sortedMoviesWithNames.collect()

for result in results:
    print(result)
import sys
from pyspark import SparkConf
from pyspark.sql.types import *
from pyspark import SparkContext
from csv import reader
import string
import io
from pyspark.sql import functions
from pyspark.sql.session import SparkSession

conf = SparkConf().setAppName("311")
sc = SparkContext(conf=conf)

inputfile = sc.textFile(sys.argv[1], 1)
header = inputfile.first()
inputfile = inputfile.filter(lambda x: x != header)
inputfile = inputfile.mapPartitions(lambda x: reader(x))


def extractor(line):
    Date = line[1]
    AgencyName = line[4]
    ComplaintType = line[5]
    Descriptor = line[6]
    if Descriptor.strip() == '':
        Descriptor = 'NotSpecified'
    LocationType = line[7]
    if LocationType.strip() == '':
    if 'microsoft' in tweet.lower():
        return "msft"
    else:
        return "ibm"


if __name__ == '__main__':
    if len(sys.argv) != 4:
        print("Usage: consumer.py <kafka-host> <topic-name> <seconds>")
        exit(-1)

    kafka_host = sys.argv[1]
    topic_name = sys.argv[2]
    seconds = int(sys.argv[3])

    conf = SparkConf() \
        .setAppName("data_challenge")

    from pyspark_cassandra import CassandraSparkContext
    sc = CassandraSparkContext(conf=conf)
    sc.setLogLevel('ERROR')
    ssc = StreamingContext(sc, seconds)
    ssc.checkpoint('./output')

    d = dict()
    d['bootstrap.servers'] = kafka_host
    d['group.id'] = 'test-id'
    d['enable.auto.commit'] = 'false'

    kafka_stream = KafkaUtils.createDirectStream(ssc, [topic_name], d)
from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster('local').setAppName('bfs_degree_of_seperation')
sc = SparkContext(conf=conf)

hit_counter = sc.accumulator(0)


def create_rdd():
    text_file = sc.textFile('./dataset/graph.txt')
    return text_file.map(convert_to_bfs)


def convert_to_bfs(line):
    start_id = 5306
    fields = line.split()
    person_id = int(fields[0])
    connections = []
    for connection in fields[1:]:
        connections.append(int(connection))

    colour = 'WHITE'
    distance = 9999

    if (person_id == start_id):
        colour = 'GRAY'
        distance = 0

    return (person_id, (connections, distance, colour))
#!/usr/bin/env python

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, window, asc, desc, lead, lag, udf, hour, month, dayofmonth
from pyspark.sql.window import Window
from pyspark.sql.types import FloatType, IntegerType, DateType
from pyspark import SparkConf
import yaml
import datetime
import os

conf = SparkConf()
conf.set("spark.jars", os.getenv("HOME") + "/.ivy2/jars/org.postgresql_postgresql-42.1.1.jar")
conf.set("spark.executor.extraJavaOptions", "-Xmx15000m")
conf.set("spark.executor.memory", "15g")
conf.set("spark.driver.memory", "15g")
conf.set("spark.storage.memoryFraction", "0")

spark = SparkSession.builder \
    .config(conf=conf) \
    .master("local") \
    .appName("SAIDI Calculator") \
    .getOrCreate()

config = open('config.yaml')
config = yaml.load(config)

# connect to the database
pw_df = spark.read.jdbc("jdbc:postgresql://timescale.lab11.eecs.umich.edu/powerwatch", "pw_dedupe",
                        properties={"user": config['user'],
                                    "password": config['password'],
                                    "driver": "org.postgresql.Driver"})

# read the data that we care about
    # 10.4
    task10_4 = context.sql(
        "SELECT comments_score, SUM(positive_probability)/COUNT(positive_probability) AS pos, SUM(negative_probability)/ COUNT(negative_probability) AS neg FROM cumResult GROUP BY comments_score"
    )
    task10_5 = context.sql(
        "SELECT submission_score, SUM(positive_probability)/COUNT(positive_probability) AS pos, SUM(negative_probability)/ COUNT(negative_probability) AS neg FROM cumResult GROUP BY submission_score"
    )

    # cumResult.repartition(1).write.format("com.databricks.spark.csv").option("header", "true").save("cumResults.csv")
    task10_1.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("task10_1.csv")
    task10_2.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("task10_2.csv")
    task10_3.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("task10_3.csv")
    task10_4.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("task10_4.csv")
    task10_5.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("task10_5.csv")


if __name__ == "__main__":
    conf = SparkConf().setAppName("CS143 Project 2B")
    conf = conf.setMaster("local[*]")
    sc = SparkContext(conf=conf)
    sc.setLogLevel("ERROR")
    sqlContext = SQLContext(sc)
    sc.addPyFile("cleantext.py")

    comments = sqlContext.read.json("comments-minimal.json.bz2")
    submissions = sqlContext.read.json("submissions.json.bz2")

    main(sqlContext)
from __future__ import division
from pyspark import SparkContext, SparkConf
from pyspark.mllib.stat import Statistics
from operator import add
import happybase
import csv

conf = SparkConf().setAppName('ListenerSummarizer')
sc = SparkContext(conf=conf)

conn = happybase.Connection('localhost')
ctable = conn.table('/user/mapr/cust_table')
ltable = conn.table('/user/mapr/live_table')
atable = conn.table('/user/mapr/agg_table')

trackfile = sc.textFile('tracks.csv')
clicksfile = sc.textFile('clicks.csv')
trainfile = open('features.txt', 'wb')


def make_tracks_kv(str):
    l = str.split(",")
    return [l[1], [[int(l[2]), l[3], int(l[4]), l[5]]]]


def clicks_summary(str):
    l = str.split(",")
    custid = l[1]
    adv = l[2]
    if (adv == "ADV_REDUCED_1DAY"):
        return (custid, 1)


def compute_stats_byuser(tracks):
    mcount = morn = aft = eve = night = 0
    tracklist = []
from pyspark import SparkConf, SparkContext
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.feature import IDF


# Function for printing each element in RDD
def println(x):
    print(x)


# Boilerplate Spark stuff:
conf = SparkConf().setMaster("local[*]").setAppName("SparkTFIDF")
sc = SparkContext(conf=conf)

# Load documents (one per line).
rawData = sc.textFile("doc-sample.csv")
fields = rawData.map(lambda x: x.split(","))
documents = fields.map(lambda x: x[1].split(" "))
documentId = fields.map(lambda x: x[0])

# Creating Hash table and TF table
hashingTF = HashingTF(100000)
tf = hashingTF.transform(documents)

# Creating idf
tf.cache()
idf = IDF(minDocFreq=1).fit(tf)

# Calculate TF/IDF
tfidf = idf.transform(tf)
import sys

from pyspark import SparkConf, SparkContext

if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: spark-submit CountCharacters.py <file>", file=sys.stderr)
        exit(-1)

    spark_conf = SparkConf()
    spark_context = SparkContext(conf=spark_conf)

    logger = spark_context._jvm.org.apache.log4j
    logger.LogManager.getLogger("org").setLevel(logger.Level.WARN)

    lines = spark_context \
        .textFile(sys.argv[1])\
        .persist()

    number_of_as = lines\
        .filter(lambda line: "a" in line)\
        .count()
    number_of_bs = lines\
        .filter(lambda line: "b" in line)\
        .count()
    number_of_cs = lines\
        .filter(lambda line: "c" in line)\
        .saveAsTextFile("Cs.txt")

    print("Number of 'a's: " + str(number_of_as))
    print "No such MODE parameter as {0}, ending.".format(MODE)
    print ""
    exit(-1)

print ""
print "=================================================================="
print "Main : Starting run at : ", datetime.datetime.now()
print "Running on ", MyComputer
print "Getting data from ", data_path
print "Processing {0} files in batches of {1}".format(stop_after, batch_size)
print >> runtimes_file
print >> runtimes_file, "NEW RUN, STARTED AT {0}, STOP_AFTER = {1}".format(datetime.datetime.now(), stop_after)
print ""

# Connect to Spark
config = SparkConf().setMaster('local[2]').set('spark.executor.memory', '4g')
sc = SparkContext(conf=config, appName='SFOX - Coursework INM432')

# ===============================================================================
# Part 1 - Reading and Preparing text files
# ===============================================================================

# Part 1d - create TF
if MODE == 'TF':
    print ""
    print "CREATING TFs"
    print >> runtimes_file, "CREATING TF"
    TF.create_tf(data_path, sc, diag_file, runtimes_file, stop_after, batch_size, report_diagnostics)

# Part 1e - create IDF
if MODE == 'IDF':
from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster("local").setAppName("MaxTemperatures")
sc = SparkContext(conf=conf)


def parseLine(line):
    fields = line.split(',')
    stationID = fields[0]
    entryType = fields[2]
    temperature = float(fields[3]) * 0.1 * (9.0 / 5.0) + 32.0
    print("Temperature is:")
    return (stationID, entryType, temperature)


lines = sc.textFile("file:///SparkCourse/1800.csv")
parsedLines = lines.map(parseLine)

maxTemps = parsedLines.filter(lambda x: "TMAX" in x[1])
stationTemps = maxTemps.map(lambda x: (x[0], x[2]))
maxTemps = stationTemps.reduceByKey(lambda x, y: max(x, y))
results = maxTemps.collect()

for result in results:
    print(result[0] + "\t{:.2f}F".format(result[1]))

# Minimum temperatures (fixed: filter the parsed lines on TMIN before reducing)
minTemps = parsedLines.filter(lambda x: "TMIN" in x[1])
stationTemps = minTemps.map(lambda x: (x[0], x[2]))
minTemps = stationTemps.reduceByKey(lambda x, y: min(x, y))
results = minTemps.collect()
import sys
import time
import itertools
import os

from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark import SparkContext, SparkConf
from operator import add
from graphframes import *

# os.environ["PYSPARK_SUBMIT_ARGS"] = (
#     "--packages graphframes:graphframes:0.6.0-spark2.3-s_2.11")

conf = SparkConf()\
    .setMaster("local[3]")\
    .set("spark.executor.memory", "4g")\
    .set("spark.driver.memory", "4g")
sc = SparkContext(conf=conf)
sqlCtx = SQLContext(sc)
spark = SparkSession(sc)
hasattr(sc, "toDF")

tick = time.time()

# local run
threshold = 7
input_file = "data/HW4/ub_sample_data.csv"
output_file = "data/HW4/task1_output.csv"
# threshold = int(sys.argv[1])
from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext

conf = SparkConf()
conf.setAppName('TestDstream')
conf.setMaster('yarn-cluster')
sc = SparkContext(conf=conf)
ssc = StreamingContext(sc, 3)

lines = ssc.textFileStream('file:///develop/testdata/')
words = lines.flatMap(lambda x: x.split(' '))
wordCounts = words.map(lambda x: (x, 1)).reduceByKey(lambda a, b: a + b)
wordCounts.pprint()

ssc.start()
ssc.awaitTermination()
from pyspark import SparkConf, SparkContext
from pyspark.mllib.clustering import KMeans, KMeansModel
from pyspark.sql import SQLContext
# from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.sql.functions import udf
import operator

conf = SparkConf().setAppName('Anomaly Detection')
sc = SparkContext(conf=conf)
sqlCt = SQLContext(sc)


class AnomalyDetection():

    def readData(self, filename):
        self.rawDF = sqlCt.read.parquet(filename).cache()

    def cat2Num(self, df, indices):
        first_feature = df.select(
            df.rawFeatures[indices[0]]).distinct().collect()
        second_feature = df.select(
            df.rawFeatures[indices[1]]).distinct().collect()

        # Creating 2 lists to extract features
        first_feature_lst = []
        second_feature_lst = []
        for row in first_feature:
            first_feature_lst.append(row[0])
    results_file.write('Frequent Itemsets:\n')
    output = []
    for single_item in sorted(result_frequent_itemsets[0]):
        output.append('(\'' + str(single_item[0]) + '\')')
    results_file.write(','.join(output) + '\n\n')
    for freq_set in result_frequent_itemsets[1:]:
        results_file.write(','.join(map(str, (sorted(freq_set)))) + '\n\n')


if __name__ == '__main__':
    start_time = time.time()

    # initialize spark
    conf = SparkConf()
    conf.set("spark.driver.memory", "4g")
    conf.set("spark.executor.memory", "4g")
    conf.setMaster('local[8]')
    conf.setAppName('Assignment_2')
    sc = SparkContext.getOrCreate(conf)

    # get args
    case = int(sys.argv[1])
    support = int(sys.argv[2])
    input_file = sys.argv[3]
    result_file = sys.argv[4]

    # create baskets rdd
    data = sc.textFile(input_file).map(lambda x: x.split(',')).map(
        lambda x: (x[0], x[1]))
from pyspark import SparkConf, SparkContext
import csv

conf = SparkConf().setMaster("local").setAppName("PFE")
sc = SparkContext(conf=conf)

# generate labeled dataset
lines = sc.textFile("subjectivity.tff")
champs = lines.map(lambda x: x.split(' '))
results = champs.collect()

labels = []
words = []
for result in results:
    champ0 = result[0]
    len1 = len(champ0)
    label = champ0[5:len1]
    labels.append(label)
    champ2 = result[2]
    len2 = len(champ2)
    word = champ2[6:len2]
    words.append(word)

keys = words
values = labels
dictionary = dict(zip(keys, values))


def check_label(word):
    label = ''
    for key, value in dictionary.items():
#coding=UTF-8
from pyspark import SparkContext, SparkConf, SQLContext, Row, HiveContext
from pyspark.sql.types import *
from datetime import date, datetime, timedelta
import sys, re, os

st = datetime.now()
conf = SparkConf().setAppName('PROC_A_SUBJECT_D004031').setMaster(sys.argv[2])
sc = SparkContext(conf=conf)
sc.setLogLevel('WARN')
if len(sys.argv) > 5:
    if sys.argv[5] == "hive":
        sqlContext = HiveContext(sc)
else:
    sqlContext = SQLContext(sc)
hdfs = sys.argv[3]
dbname = sys.argv[4]

# Dates used in processing
etl_date = sys.argv[1]
# ETL date
V_DT = etl_date
# Previous day's date
V_DT_LD = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8])) + timedelta(-1)).strftime("%Y%m%d")
# First day of the current month
V_DT_FMD = date(int(etl_date[0:4]), int(etl_date[4:6]), 1).strftime("%Y%m%d")
# Last day of the previous month
V_DT_LMD = (date(int(etl_date[0:4]), int(etl_date[4:6]), 1) + timedelta(-1)).strftime("%Y%m%d")
# 10-character date
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext

conf = SparkConf()
conf.setAppName("Json Pyspark")
sc = SparkContext(conf=conf)

# Create SQL Context
sqlContext = SQLContext(sc)

# Read Json file using sqlContext
depJSON = sqlContext.jsonFile(
    "/user/cloudera/output/itversity/pig/departments_data.json/part-m-00000")
for x in depJSON.collect():
    print(x)

# Create a structure on top of the raw json data
depJSON.registerTempTable("deps")

# Now query the table
depRDD = sqlContext.sql("Select * from deps")
for x in depRDD.collect():
    print(x)

# Another method to convert the raw file into a structure. This was removed from Spark 1.3.0 onwards, since DataFrames were introduced.
# Also, all the methods on SQLContext/HiveContext used to return SchemaRDD in 1.2.1, but from 1.3.0 onwards these functions return DataFrame objects.
# From 1.3.0 onwards the SchemaRDD class is completely removed from Spark.
# sqlContext.registerRDDAsTable(depJSON, "departments")
def init_spark_context():
    conf = SparkConf().setAppName("yelp data recommender")
    sc = SparkContext(conf=conf, pyFiles=['recommender.py', 'app.py'])
    return sc
# -*- coding: utf-8 -*-
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.sql.types import *
import os
import time
import re

if __name__ == "__main__":
    conf = SparkConf()
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    # PYSPARK_PYTHON = "C:\\Python27\\python.exe"  # With multiple Python versions installed, set this variable to pick the one to use
    # os.environ["PYSPARK_PYTHON"] = PYSPARK_PYTHON

    def valid_casedate(strdate):
        '''Check whether the string is a valid date'''
        try:
            time.strptime(strdate, "%Y-%m-%d")
            return True
        except:
            return False

    def get_date(j, a):
        d = re.split(u"年|月|日", j)
        if len(d) == 4:
            if len(d[0]) == 4:
__author__ = 'yjxiong'

from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.sql.types import *

conf = SparkConf().setAppName('build_parquet').setMaster("local[24]")\
    .set('spark.executor.memory', '4g')
sc = SparkContext(conf=conf)
sqlc = SQLContext(sc)

# yfcc_field_list = open('yfcc_schema.txt').read().split()
yfcc_field_list = ['image_id', 'labels']

fields = [
    StructField(field_name, StringType(), True)
    for field_name in yfcc_field_list
]
yfcc_schema = StructType(fields)

# yfcc_data = sc.textFile('/TMP/yjxiong/YFCC/temp_store/yfcc100m_rich_text_eng_dataset_success_images')\
#     .map(lambda x: x.split('\t')).coalesce(2000)
yfcc_data = sc.textFile('/TMP/yjxiong/YFCC/temp_store/yfcc100m_rich_text_eng_dataset_success_tag_ids')\
    .map(lambda x: x.split('\t')).coalesce(10)

yfcc_schema_df = sqlc.createDataFrame(yfcc_data, yfcc_schema)
        return [(query_dataset,
                 single_column_data.to_csv(index=False),
                 single_column_joined_data.to_csv(index=False),
                 new_record)]


def save_id_to_record(id_, record, key_name):
    record[key_name] = id_
    return record


if __name__ == '__main__':
    start_time = time.time()

    # Spark context
    conf = SparkConf().setAppName("Data Generation for Use Cases")
    sc = SparkContext(conf=conf)

    # parameters
    params = json.load(open(".params.json"))
    training_records_file = params['training_records']
    datasets_dir = params['datasets_directory']
    output_dir = params['new_datasets_directory']
    hdfs_address = params['hdfs_address']
    hdfs_user = params['hdfs_user']

    # HDFS Client
    hdfs_client = InsecureClient(hdfs_address, user=hdfs_user)
    create_dir(output_dir, hdfs_client)
import numpy as np

from pyspark import SparkContext, SparkConf
from pyspark.mllib.recommendation import Rating
from pyspark.mllib.recommendation import ALS

# For reproducibility
np.random.seed(1000)

nb_users = 200
nb_products = 100
ratings = []

if __name__ == '__main__':
    conf = SparkConf().setAppName('ALS').setMaster('local[*]')
    sc = SparkContext(conf=conf)

    for _ in range(10):
        for i in range(nb_users):
            rating = Rating(user=i,
                            product=np.random.randint(1, nb_products),
                            rating=np.random.randint(0, 5))
            ratings.append(rating)

    # Parallelize the ratings
    ratings = sc.parallelize(ratings)

    # Train the model
    model = ALS.train(ratings, rank=5, iterations=10)

    # Test the model
    f[21] = len(filter(lambda x: x[0] <= 1 and x[1] == 4, items))  # number of purchases in the last 1 day
    f[22] = len(filter(lambda x: x[0] <= 3 and x[1] == 4, items))  # number of purchases in the last 3 days
    f[23] = len(filter(lambda x: x[0] <= 7 and x[1] == 4, items))  # number of purchases in the last 7 days
    f[38] = round(1.0 * len(items) / f[32], 4) if f[32] != 0 else 0.0  # average number of user interactions per item
    return "\t".join([str(i) for i in f])


global etime
global subset

if __name__ == "__main__":
    import fileinput
    conf = (SparkConf().setMaster(
        "spark://namenode.omnilab.sjtu.edu.cn:7077").setAppName("Extract").set(
        "spark.cores.max", "32").set("spark.driver.memory",
        "4g").set("spark.executor.memory", "6g"))
    sc = SparkContext(conf=conf)
    lines = sc.textFile(
        'hdfs://namenode.omnilab.sjtu.edu.cn/user/qiangsiwei/competition_tianchi/uid_iid', 1)
    target, etime, subset = "12-19-0", "12-18-23", {}
    # target, etime, subset = "12-18-0", "12-17-23", {}
    # target, etime, subset = "12-17-0", "12-16-23", {}
    # target, etime, subset = "12-16-0", "12-15-23", {}
    # target, etime, subset = "12-15-0", "12-14-23", {}
    # target, etime, subset = "12-14-0", "12-13-23", {}
    # target, etime, subset = "12-13-0", "12-12-23", {}
    # target, etime, subset = "12-12-0", "12-11-23", {}
    # target, etime, subset = "12-11-0", "12-10-23", {}
import sys

from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
import time, copy, re, math
from datetime import datetime, timedelta, date
import json
import logging

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s: %(message)s')
logger = logging.getLogger(__name__)

reload(sys)
sys.setdefaultencoding('utf-8')

# warehouse_location = '/user/hive/warehouse/'
conf = SparkConf().set('spark.driver.maxResultSize', '20g')
conf.set('spark.yarn.am.cores', 5)
conf.set('spark.yarn.executor.memoryOverhead', '10g')
conf.set('spark.shuffle.partitions', 800)
conf.set('spark.shuffle.io.retryWait', '10s')
conf.set('spark.executor.memory', '10g')
conf.set('spark.executor.instances', 100)
conf.set('spark.executor.cores', 4)
conf.set('spark.executor.extraJavaOptions', '-XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+UseG1GC')
# conf.set("spark.sql.warehouse.dir", warehouse_location)

spark = SparkSession \
    .builder \
    .config(conf=conf) \
    .enableHiveSupport() \
#     print(count)


# Main Execution
# Run Configurations
case_num = int(sys.argv[1])
support = int(sys.argv[2])
input_path = sys.argv[3]
output_path = sys.argv[4]

# Level of Parallelism - Recommended by Spark
# http://spark.apache.org/docs/latest/tuning.html#level-of-parallelism
cpu_num = multiprocessing.cpu_count()
task_per_cpu = cpu_num * 3

# Spark Configurations
conf = SparkConf().setAppName('HW2 - Task 1').setMaster('local[*]')
sc = SparkContext(conf=conf)

# Data Input
distFile = sc.textFile(input_path, minPartitions=2).coalesce(task_per_cpu)
rdd = distFile.map(lambda s: s.split(","))

# SON Algorithm
# Divide into market basket model
headers = rdd.first()
data = rdd.filter(lambda s: s != headers)
grouped_data = get_grouped_data(data, case_num)
num_count = grouped_data.count()
num_part = grouped_data.getNumPartitions()