def main(): sc = SparkContext(appName="MyApp") sc.setLogLevel('ERROR') # Parse data train_labels, train_data = load_data('train.csv') dummy_labels, test_data = load_data('test.csv', use_labels=False) # Map each data point's label to its features train_set = reformatData(train_data, train_labels) test_set = reformatData(test_data, dummy_labels) # Parallelize the data parallelized_train_set = sc.parallelize(train_set) parallelized_test_set = sc.parallelize(test_set) # Split the data trainSet, validationSet = parallelized_train_set.randomSplit([1.0, 0.0], seed=42) # Train the models decisionTreeModel = DecisionTree.trainClassifier(trainSet, numClasses=5, categoricalFeaturesInfo={}, impurity='gini', maxBins=55, maxDepth=30, minInstancesPerNode=2) # Test the model testDecisionTree(decisionTreeModel, parallelized_test_set)
def save_data_to_db(): from pyspark import SparkContext, SparkConf from pyspark.streaming import StreamingContext conf = SparkConf().setMaster("localhost") sc = SparkContext("local[*]", "camera_mechine_gen") sc.setLogLevel("WARN") data_used_by_camera_mechine_gen.drop() path = '/3/2014-10-15' for station in stations: station_info = station_destinations_by_directions.find_one({"station_name":station}) if station_info == None: continue destinations_by_directions = station_info['destinations_by_directions'] full_path = data_dir_path+'v0/'+station+path print full_path func = map_anlalyser_gen(station, destinations_by_directions) file_data = sc.textFile(full_path).map(pre_process_1).groupByKey().map(func).collect() for i in sorted(file_data, key=lambda x:x[0]): time = i[0] C1_by_directions = list(i[1].iteritems()) #print station, time, C1_by_directions data_used_by_camera_mechine_gen.insert({'station_name':station, 'time':time, 'C1_by_directions':C1_by_directions})
def main(): sc = SparkContext(appName="MyApp") sc.setLogLevel('ERROR') # Parse data train_labels, train_data = load_data('train.csv') dummy_labels, test_data = load_data('test.csv', use_labels=False) # Truncate the last 2 features of the data for dataPoint in train_data: len = np.size(dataPoint) dataPoint = np.delete(dataPoint, [len - 2, len - 1]) for dataPoint in test_data: len = np.size(dataPoint) dataPoint = np.delete(dataPoint, [len - 2, len - 1]) # Map each data point's label to its features train_set = reformatData(train_data, train_labels) test_set = reformatData(test_data, dummy_labels) # Parallelize the data parallelized_train_set = sc.parallelize(train_set) parallelized_test_set = sc.parallelize(test_set) # Split the data trainSet, validationSet = parallelized_train_set.randomSplit([0.01, 0.99], seed=42) # Train the models randomForestModel = RandomForest.trainClassifier(trainSet, numClasses=4, impurity='gini', categoricalFeaturesInfo={}, numTrees=750, seed=42, maxDepth=30, maxBins=32) # Test the model testRandomForest(randomForestModel, parallelized_test_set)
def create_spark_application(app_name):
    """Creates and returns a Spark & SQL Context."""
    conf = SparkConf().setAppName(app_name)
    spark_context = SparkContext(conf=conf)
    spark_context.setLogLevel('WARN')
    sql_context = SQLContext(spark_context)
    return (spark_context, sql_context)
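# A minimal usage sketch for the helper above. The app name and the JSON path
# are illustrative assumptions, not part of the original code.
spark_context, sql_context = create_spark_application("example-app")
df = sql_context.read.json("data/example.json")  # hypothetical input file
df.printSchema()
spark_context.stop()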
def spark_context(request):
    """
    Pytest fixture for creating a spark context.

    Args:
        :param request: pytest.FixtureRequest object
    """
    conf = (SparkConf().setMaster("local").setAppName("pyspark-local-testing"))
    sc = SparkContext(conf=conf)
    sc.setLogLevel("ERROR")
    request.addfinalizer(lambda: sc.stop())
    return sc
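# A minimal sketch of a test that could consume the fixture above, assuming the
# function is registered as a pytest fixture (e.g. decorated with @pytest.fixture)
# in the real module.
def test_parallelize_count(spark_context):
    rdd = spark_context.parallelize([1, 2, 3, 4])
    assert rdd.count() == 4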
def functionToCreateContext(): sc = SparkContext("local[*]", "streaming_part") sc.setLogLevel("ERROR") ssc = StreamingContext(sc, 5) data_from_ticket_mechine = ssc.socketTextStream("localhost", 9999) data_from_camera_mechine = ssc.socketTextStream("localhost", 9998) #meat data_from_ticket_mechine.map(ticket_mechine_pre_process).updateStateByKey(updateFunction).foreachRDD(ticket_mechine_RDD_handler) data_from_camera_mechine.map(camera_mechine_pre_process).updateStateByKey(updateFunction).foreachRDD(camera_mechine_RDD_handler) ssc.checkpoint(checkpointDirectory) # set checkpoint directory return ssc
def spark_context(request): # If RIAK_HOSTS is not set, use Docker to start a Riak node if not os.environ.has_key('RIAK_HOSTS'): docker_cli = request.getfuncargvalue('docker_cli') host_and_port = get_host_and_port(docker_cli) os.environ['RIAK_HOSTS'] = host_and_port os.environ['USE_DOCKER'] = 'true' # Start new spark context conf = SparkConf().setMaster('local[*]').setAppName('pytest-pyspark-local-testing') conf.set('spark.riak.connection.host', os.environ['RIAK_HOSTS']) conf.set('spark.driver.memory', '4g') conf.set('spark.executor.memory', '4g') spark_context = SparkContext(conf=conf) spark_context.setLogLevel('INFO') pyspark_riak.riak_context(spark_context) request.addfinalizer(lambda: spark_context.stop()) return spark_context
def save_data_to_db(): from pyspark import SparkContext, SparkConf from pyspark.streaming import StreamingContext conf = SparkConf().setMaster("localhost") sc = SparkContext("local[*]", "tikcket_mechine_gen") sc.setLogLevel("WARN") sc.addFile(lib_dir+'/getDistance.py') data_used_by_ticket_mechine_gen.drop() path = '/3/2014-10-15' for s in stations: full_path = data_dir_path+'v0/'+s+path print full_path data_to_save = getDistance.get_one_day_group_by_time(full_path, sc) for item in data_to_save: data_used_by_ticket_mechine_gen.insert({'station_name':s, 'time':item[0], 'data':item[1]})
def test(): sc = SparkContext(master='local[4]', appName='lda') sc.setLogLevel('ERROR') def train(): data = sc.textFile(corpus_filename).map(lambda line: Vectors.dense([float(i) for i in line.strip().split()])) corpus = data.zipWithIndex().map(lambda x: [x[1], x[0]]).cache() # print(corpus.take(5)) lda_model = LDA.train(rdd=corpus, maxIterations=max_iter, seed=seed, checkpointInterval=checkin_point_interval, k=K, optimizer=optimizer, docConcentration=alpha, topicConcentration=beta) if os.path.exists('./ldamodel'): __import__('shutil').rmtree('./ldamodel') lda_model.save(sc, "./ldamodel") # train() lda_model = LDAModel.load(sc, "./ldamodel") # topic-word分布(未归一化的dist,每列代表一个topic) topics = lda_model.topicsMatrix() # for tid in range(3): # print('Topic' + str(tid) + ':') # for wid in range(0, lda_model.vocabSize()): # print(' ' + str(topics[wid, tid] / sum(topics[:, tid]))) # 加一个归一化 # # print(' ' + str(topics[wid, tid])) # topic-word按词序排列分布([词id,按权重从大到小排列], [词在主题上的权重]) topics_dist = lda_model.describeTopics() for tid, topic in enumerate(topics_dist): print('Topic' + str(tid) + ':' + '\n', topic) # 文档的主题分布(mllib不能,ml才可以) # doc_topic = lda_model sc.stop()
from pyspark import SparkConf, SparkContext
import re

conf = SparkConf().setMaster("spark://192.168.56.100:7077").setAppName("My App")
sc = SparkContext(conf=conf)
sc.setLogLevel('WARN')

p = re.compile('^\d+\.\d+\.\d+\.\d+.*$')
input_file = sc.textFile('/etc/hosts')
hosts = input_file.filter(lambda x: p.match(x))
ips = hosts.map(lambda x: x.split('\t')[0])

print "\nnumber of ips: %d" % ips.count()
print '-' * 5
for ip in ips.collect():
    print ip
def main(): "Main function" optmgr = OptionParser() opts = optmgr.parser.parse_args() # setup spark/sql context to be used for communication with HDFS sc = SparkContext(appName="phedex_br") if not opts.yarn: sc.setLogLevel("ERROR") sqlContext = SQLContext(sc) # read given file into RDD rdd = sc.textFile(opts.fname).map(lambda line: line.split(",")) # create a dataframe out of RDD pdf = rdd.toDF(headers()) if opts.verbose: pdf.show() print("pdf data type", type(pdf)) pdf.printSchema() # cast columns to correct data types ndf = pdf.withColumn("block_bytes_tmp", pdf.block_bytes.cast(DoubleType()))\ .drop("block_bytes").withColumnRenamed("block_bytes_tmp", "block_bytes")\ .withColumn("block_files_tmp", pdf.block_files.cast(IntegerType()))\ .drop("block_files").withColumnRenamed("block_files_tmp", "block_files")\ .withColumn("br_src_bytes_tmp", pdf.br_src_bytes.cast(DoubleType()))\ .drop("br_src_bytes").withColumnRenamed("br_src_bytes_tmp", "br_src_bytes")\ .withColumn("br_src_files_tmp", pdf.br_src_files.cast(IntegerType()))\ .drop("br_src_files").withColumnRenamed("br_src_files_tmp", "br_src_files")\ .withColumn("br_dest_bytes_tmp", pdf.br_dest_bytes.cast(DoubleType()))\ .drop("br_dest_bytes").withColumnRenamed("br_dest_bytes_tmp", "br_dest_bytes")\ .withColumn("br_dest_files_tmp", pdf.br_dest_files.cast(IntegerType()))\ .drop("br_dest_files").withColumnRenamed("br_dest_files_tmp", "br_dest_files")\ .withColumn("br_node_bytes_tmp", pdf.br_node_bytes.cast(DoubleType()))\ .drop("br_node_bytes").withColumnRenamed("br_node_bytes_tmp", "br_node_bytes")\ .withColumn("br_node_files_tmp", pdf.br_node_files.cast(IntegerType()))\ .drop("br_node_files").withColumnRenamed("br_node_files_tmp", "br_node_files")\ .withColumn("br_xfer_bytes_tmp", pdf.br_xfer_bytes.cast(DoubleType()))\ .drop("br_xfer_bytes").withColumnRenamed("br_xfer_bytes_tmp", "br_xfer_bytes")\ .withColumn("br_xfer_files_tmp", pdf.br_xfer_files.cast(IntegerType()))\ .drop("br_xfer_files").withColumnRenamed("br_xfer_files_tmp", "br_xfer_files") # example of aggregation # res = ndf.filter("dataset_is_open='y'").groupBy().sum('block_bytes') # print("open dataset size", res.collect()) if opts.order == 'dataset': res = ndf.map(lambda r: ((r.dataset_name, r.node_name), r)).groupByKey().map(lambda g: (g[0], stats(g[1]))) elif opts.order == 'site' or opts.order == 'node': res = ndf.map(lambda r: ((r.node_name, r.dataset_name), r)).groupByKey().map(lambda g: (g[0], stats(g[1]))) else: msg = 'The order key="%s" is not supported' % opts.order raise NotImplementedError(msg) if opts.fout: # lines = res.map(toCSV) lines = res.map(lambda g: (g[0][0],(g[0][1], g[1]))).groupByKey().map(toCSV2) lines.saveAsTextFile(opts.fout) else: count = 0 print("dataset site nfiles bsize status cust group") for item in res.collect(): pair = item[0] dataset = pair[0] site = pair[1] items = item[1] nfiles = items[0] bsize = items[1] dstatus = items[2] cust = items[3] group = items[4] print('%s %s %s %s %s %s %s' % (dataset, site, nfiles, bsize, dstatus, cust, group)) count += 1 if count>10: break
denominator += abs(w) numerator += (user_movie_rating_dict[(user_id_1, movie_id_2)] * w) if denominator == 0: return movie_rating_dict.get(movie_id_1, 3.5) rating = numerator / denominator if rating < 0.0: return 0.0 elif rating > 5.0: return 5.0 return rating spark_context = SparkContext(appName='ItemBasedCF', conf=SparkConf()) spark_context.setLogLevel("WARN") file_path = sys.argv[1] test_file = sys.argv[2] similarity_file = sys.argv[3] test_file = spark_context.textFile(test_file) # test_file = test_file.coalesce(4) test_file_header = test_file.first() testing_data = test_file \ .filter(lambda line: line != test_file_header) \ .map(lambda line: user_movie_map(line))\ .persist() data = spark_context.textFile(file_path) header = data.first()
from pyspark.sql.functions import col from pyspark.ml import PipelineModel from pyspark.sql import SQLContext, SparkSession from pyspark import SparkContext from pyspark.streaming import StreamingContext from collections import namedtuple from functools import reduce # from pyspark.sql.functions import desc sc = SparkContext("local[2]", "Streaming App") pipelineFit = PipelineModel.load("standford_500.logreg.model") sc.setLogLevel("error") ssc = StreamingContext(sc, 10) sqlContext = SQLContext(sc) #ssc.checkpoint( "file:/home/ubuntu/tweets/checkpoint/") socket_stream = ssc.socketTextStream( "127.0.0.1", 5555) # Internal ip of the tweepy streamer lines = socket_stream.window(20) #lines.pprint() fields = ("tweet_text") Tweet = namedtuple('Tweet', fields) def getTotalCount(): if ("totalTweetsCount" not in globals()): globals()["totalTweetsCount"] = 0 return globals()["totalTweetsCount"]
# Inialize parser and parse argument parser = argparse.ArgumentParser() parser.add_argument("-input","--input",help="Complete input file path for Dataset ex. hdfs:/CCF/input/example.csv") parser.add_argument("-output","--output",help="Complete output path for results ex. hdfs:/CCF/output") parser.add_argument("-partition","--partition",type=int,help="Number of partitions for dataset") args = parser.parse_args() partition_number = args.partition input_file_path = args.input output_directory = args.output # Initialize spark-context configuration conf = SparkConf() conf.setAppName('pyspark-shell-CCF-v1') sc = SparkContext(conf=conf) sc.setLogLevel("WARN") # Initialize logger log4jLogger = sc._jvm.org.apache.log4j LOGGER = log4jLogger.LogManager.getLogger(__name__) LOGGER.warn("################################") LOGGER.warn(" Start CCF RDD version 1") LOGGER.warn("--------------------------------") # Import as RDD line_by_line raw_graph = sc.textFile(input_file_path,minPartitions=partition_number) # CSV transformation -> Separator need to be adapted considering the file format r = raw_graph.map(lambda x:x.split('\t')).map(lambda x:(x[0],x[1]))
def __init__(self, arglist, _sc = None, _sqlContext = None): sc = SparkContext() if _sc is None else _sc sqlContext = HiveContext(sc) if _sqlContext is None else _sqlContext sc.setLogLevel("ERROR") self.sqlContext = sqlContext self.sc = sc self._jvm = sc._jvm from py4j.java_gateway import java_import java_import(self._jvm, "org.tresamigos.smv.ColumnHelper") java_import(self._jvm, "org.tresamigos.smv.SmvDFHelper") java_import(self._jvm, "org.tresamigos.smv.dqm.*") java_import(self._jvm, "org.tresamigos.smv.panel.*") java_import(self._jvm, "org.tresamigos.smv.python.SmvPythonHelper") java_import(self._jvm, "org.tresamigos.smv.SmvRunInfoCollector") self.j_smvPyClient = self.create_smv_pyclient(arglist) # shortcut is meant for internal use only self.j_smvApp = self.j_smvPyClient.j_smvApp() self.log = self.j_smvApp.log() # AFTER app is available but BEFORE stages, # use the dynamically configured app dir to set the source path self.prepend_source(self.SRC_PROJECT_PATH) # issue #429 set application name from smv config sc._conf.setAppName(self.appName()) # user may choose a port for the callback server gw = sc._gateway cbsp = self.j_smvPyClient.callbackServerPort() cbs_port = cbsp.get() if cbsp.isDefined() else gw._python_proxy_port # check wither the port is in-use or not. Try 10 times, if all fail, error out check_counter = 0 while(not check_socket(cbs_port) and check_counter < 10): cbs_port += 1 check_counter += 1 if (not check_socket(cbs_port)): raise SmvRuntimeError("Start Python callback server failed. Port {0}-{1} are all in use".format(cbs_port - check_counter, cbs_port)) # this was a workaround for py4j 0.8.2.1, shipped with spark # 1.5.x, to prevent the callback server from hanging the # python, and hence the java, process from pyspark.streaming.context import _daemonize_callback_server _daemonize_callback_server() if "_callback_server" not in gw.__dict__ or gw._callback_server is None: print("SMV starting Py4j callback server on port {0}".format(cbs_port)) gw._shutdown_callback_server() # in case another has already started gw._start_callback_server(cbs_port) gw._python_proxy_port = gw._callback_server.port # get the GatewayServer object in JVM by ID jgws = JavaObject("GATEWAY_SERVER", gw._gateway_client) # update the port of CallbackClient with real port gw.jvm.SmvPythonHelper.updatePythonGatewayPort(jgws, gw._python_proxy_port) self.repoFactory = DataSetRepoFactory(self) self.j_smvPyClient.registerRepoFactory('Python', self.repoFactory) # Initialize DataFrame and Column with helper methods smv.helpers.init_helpers()
def main(argv): assert len( argv ) == 3, "Script takes 3 arguments <train_file><model_file><cf_type>" # Unpack arguments train_file, model_file, cf_type = argv config = SparkConf().setMaster("local[*]") \ .setAppName("Task3train") \ .set("spark.executor.memory", "4g") \ .set("spark.driver.memory", "4g") sc = SparkContext(conf=config).getOrCreate() sc.setLogLevel("ERROR") if cf_type == "item_based": lines = sc.textFile(train_file).map(json.loads).cache() business_tokens = lines.map(lambda x: x["business_id"]).distinct( ).zipWithIndex().collectAsMap() tokens_business = {v: k for k, v in business_tokens.items()} rdd = lines.map(lambda x: (business_tokens[x["business_id"]], (x["user_id"], x["stars"]))) \ .groupByKey().filter(lambda x: len(x[1]) >= 3) \ .mapValues(dict) \ .cache() tokens_rdd = rdd.map(lambda x: x[0]) rdd_dict = rdd.collectAsMap() results = tokens_rdd.cartesian(tokens_rdd) \ .filter(lambda x: x[0] < x[1]) \ .filter(lambda x: get_intersect(rdd_dict[x[0]], rdd_dict[x[1]]) >= 3) \ .map(lambda x: ((x[0], x[1]), pearson_correlation(rdd_dict[x[0]], rdd_dict[x[1]]))) \ .filter(lambda x: x[1] > 0.0).collect() print("Number of candidates --------------> ", len(results)) with open(model_file, "w+") as file: for line in results: file.writelines( json.dumps({ "b1": tokens_business[line[0][0]], "b2": tokens_business[line[0][1]], "sim": line[1] }) + "\n") file.close() else: lines = sc.textFile(train_file).map(json.loads).cache() business_tokens = lines.map(lambda x: x["business_id"]).distinct( ).zipWithIndex().collectAsMap() tokens_business = {v: k for k, v in business_tokens.items()} user_tokens = lines.map( lambda x: x["user_id"]).distinct().zipWithIndex().collectAsMap() tokens_user = {v: k for k, v in user_tokens.items()} business_users = lines.map(lambda x: (business_tokens[x["business_id"]], user_tokens[x["user_id"]])) \ .groupByKey() \ .filter(lambda x: len(x[1]) >= 3) \ .mapValues(list).cache() users_business = lines.map(lambda x: (user_tokens[x["user_id"]], (business_tokens[x["business_id"]], x["stars"]))) \ .groupByKey() \ .filter(lambda x: len(x[1]) >= 3) \ .mapValues(dict) \ .collectAsMap() # MinHash hash_funcs = [min_hash_func(i) for i in range(NUM_BUCKETS)] hash_rdd = business_users.map(lambda x: (x[0], get_hash(x[0], hash_funcs))) joined_hash_rdd = hash_rdd.join(business_users).partitionBy( 7, lambda x: hash(x) % 7) signature_mat = joined_hash_rdd.map(lambda x: get_user_hash(x[1])) \ .flatMap(lambda x: x) \ .reduceByKey(lambda h1, h2: min_hash(h1, h2)) lsh_hash_funcs = [lsh_hash(i) for i in range(BANDS)] candidates = signature_mat.map(lambda x: (x[0], generate_bands(x[1]))) \ .map(group_bands) \ .flatMap(lambda x: x) \ .groupByKey() \ .map(lambda x: lsh(x, lsh_hash_funcs)) \ .flatMap(lambda x: x[1]) \ .filter(lambda x: len(x) > 1) \ .flatMap(lambda pairs: [pair for pair in combinations(pairs, 2)]) \ .distinct() \ .filter(lambda x: users_business.get(x[0]) != None and users_business.get(x[1]) != None) \ .filter(lambda x: get_intersect(users_business[x[0]], users_business[x[1]]) >= 3) \ .filter(lambda x: jaccard(users_business[x[0]], users_business[x[1]]) >= 0.01) \ .map(lambda x: ((x[0], x[1]), pearson_correlation(users_business[x[0]], users_business[x[1]]))) \ .filter(lambda x: x[1] > 0.0).collect() print("Number of candidates -----------------> ", len(candidates)) with open(model_file, "w+") as file: for line in candidates: file.writelines( json.dumps({ "u1": tokens_user[line[0][0]], "u2": tokens_user[line[0][1]], "sim": line[1] }) + "\n") file.close()
parser.add_argument('--N', type=int, default=20, help='Number of partitions') parser.add_argument( '--solver', default='GD', choices=['GD', 'LS'], help= 'GD learns β via gradient descent, LS learns β by solving a linear system of equations' ) args = parser.parse_args() sc = SparkContext(appName='Parallel Ridge Regression') sc.setLogLevel('warn') beta = None if args.traindata is not None: # Train a linear model β from data with regularization parameter λ, and store it in beta print('Reading training data from', args.traindata) data = readData(args.traindata, sc) data = data.repartition(args.N).cache() x, y = data.take(1)[0] beta0 = np.zeros(len(x)) if args.solver == 'GD': start = time() print('Training on data from', args.traindata, 'with λ =',
def main(): global spark conf = (SparkConf().setAppName("Enc SNI classification").set( "spark.dynamicAllocation.enabled", "false").set("spark.task.maxFailures", 128).set( "spark.yarn.max.executor.failures", 128).set("spark.executor.cores", "8").set("spark.executor.memory", "12G").set("spark.executor.instances", "80").set( "spark.network.timeout", "300").set( "spark.executorEnv.PYTHON_EGG_CACHE", "./.python-eggs-cache/"). set("spark.executorEnv.PYTHON_EGG_DIR", "./.python-eggs/").set( "spark.driverEnv.PYTHON_EGG_CACHE", "./.python-eggs-cache/").set( "spark.driverEnv.PYTHON_EGG_DIR", "./.python-eggs/").set( "spark.driver.maxResultSize", "1024G").set( "spark.kryoserializer.buffer.max value", "10240G").set("spark.kryoserializer.buffer.max.mb", "2047")) if not DEBUGGER: sc = SparkContext(conf=conf) sc.setLogLevel("ERROR") spark = SparkSession(sc) # Load ASNs print_box("Computing ASNs") asns = {} print("Target ASNs:") for as_name, as_nb in [ l.split(",") for l in open(ASN_FILE, "r").read().splitlines() ]: this_asns = as_nb.split(':') asns[as_name] = this_asns print(" {}: {}".format(as_name, as_nb)) # Compute entries and ASs training = spark.sparkContext.textFile(LOG_TCP_TRAIN_IN)\ .mapPartitions(lambda p: get_tcp_entries(p, TRAIN_HASHING, asns) ).filter(lambda e: e["c_ip"] not in CLIENT_BLACKLIST) testing = spark.sparkContext.textFile(LOG_TCP_TEST_IN )\ .mapPartitions(lambda p: get_tcp_entries(p, TEST_HASHING, asns) ).filter(lambda e: e["c_ip"] not in CLIENT_BLACKLIST) # Persist and print size training.persist(StorageLevel(True, True, False, False, 1)) testing.persist(StorageLevel(True, True, False, False, 1)) print("Training log entries:", training.count()) print("Testing log entries:", testing.count()) # Start classification print_box("Working on classification") models = {} reports = {} for as_name in asns: print("Working on : {}".format(as_name)) # Filter, persist and print size this_training = training.filter(lambda e: e["s_asn_name"] == as_name) this_testing = testing.filter(lambda e: e["s_asn_name"] == as_name) this_training.cache() this_testing.cache() training_count = this_training.count() testing_count = this_testing.count() print(" Training set:", training_count) print(" Testing set:", testing_count) # Proceed only if having data points if training_count > 0 and testing_count > 0: # Compute target domains if BINARY is None: print(" Computing Occurrences") occurrences = dict( this_training.map(lambda w: (w["domain"], None)).countByKey()) target_domains = [ k for k, v in sorted(occurrences.items(), key=operator.itemgetter(1), reverse=True) if v >= MIN_OCCURENCES ] if MAX_DOMAINS is not None: target_domains = target_domains[:MAX_DOMAINS] target_domains_dict = { d: i for i, d in enumerate(target_domains) } for d in occurrences: if d in target_domains: print(" ", d, ":", occurrences[d]) else: print(" Using binary classification with target:", BINARY) target_domains = [BINARY] target_domains_dict = {BINARY: 0} if len(target_domains) > 0: # Extract features print(" Extracting Features") training_features = this_training\ .map(lambda w: extract_feature(w, target_domains, target_domains_dict)).toDF() testing_features = this_testing\ .map(lambda w: extract_feature(w, target_domains,target_domains_dict)).toDF() # Classify if CLASSIFY_SPARK: print(" Classifying") model, training_report, testing_report = classify_spark( training_features, testing_features, target_domains, target_domains_dict) else: training_local = training_features.toPandas() testing_local = testing_features.toPandas() if 
DATASET_DIR is not None: if not os.path.exists(DATASET_DIR): os.makedirs(DATASET_DIR) training_local.to_csv("{}/{}.training.csv".format( DATASET_DIR, as_name), index=False) testing_local.to_csv("{}/{}.testing.csv".format( DATASET_DIR, as_name), index=False) print(" Classifying") model, training_report, testing_report = classify_local( training_local, testing_local) # Store reports report = { "training": training_report, "testing": testing_report } print(" Macro avg F1:", testing_report["macro avg"]["f1-score"]) print(" Weighted avg F1:", testing_report["weighted avg"]["f1-score"]) reports[as_name] = report if MODEL_OUT is not None: models[as_name] = model else: print(" Skipping as no domain has minimum occurrences") reports[as_name] = { "error": "no domain has minimum occurrences" } models[as_name] = { "error": "no domain has minimum occurrences" } else: print(" Skipping as empty") reports[as_name] = {"error": "empty dataset"} if MODEL_OUT is not None: models[as_name] = {"error": "empty dataset"} gc.collect() # Save results on disk if REPORT_OUT is not None: json.dump(reports, open(REPORT_OUT, "w"), indent=4) if MODEL_OUT is not None and not CLASSIFY_SPARK: pickle.dump(models, open(MODEL_OUT, "wb")) if DEBUGGER: pdb.set_trace()
###### This is a script to use Spark Streaming to consume the sessions data stream from Kafka. #####################################################
from pyspark import SparkContext
from pyspark.streaming import StreamingContext, StreamingListener
from pyspark.streaming.kafka import KafkaUtils, TopicAndPartition
import yaml
import redis
import psycopg2, datetime

with open("config.yml", 'r') as ymlfile:
    config = yaml.load(ymlfile)

# Create Spark Streaming Context
sc = SparkContext(appName="consuming_sessions")
sc.setLogLevel("ERROR")
ssc = StreamingContext(sc, 2)

# Connect to Kafka and split each message to list of strings
topic = "sessions"
sessionStream = KafkaUtils.createDirectStream(
    ssc, [topic],
    {"metadata.broker.list": config['broker_list']})  # add {'auto.offset.reset':'smallest'} to read from beginning
lines = sessionStream.map(lambda x: x[1])
lines_list = lines.map(lambda line: line[:-1].split("\t"))

########### Filtering the sessions stream and calculating the metrics ###############
from pyspark import SparkContext from pyspark.sql import * import pyspark.sql.functions as f sc = SparkContext('local', 'app') sc.setLogLevel('OFF') spark = SparkSession(sc) spark.readStream \ .format('socket') \ .option('host', 'localhost') \ .option('port', 9999) \ .load() \ .agg(f.collect_list(f.col('value')).alias('list')) \ .select(f.explode(f.col('list'))) \ .writeStream \ .format('console') \ .outputMode('complete') \ .option('checkpointLocation', 'data/checkpoint') \ .start() \ .awaitTermination() # $ ncat -lk 9999 # # first # second # third # ------------------------------------------- # Batch: 0 # -------------------------------------------
print("\n Example") print("spark-submit --packages graphframes:graphframes:0.5.0-spark1.6-s_2.10 " "3-calculate-network-metrics.py " "/home/madis/IR/thesis/parquets/bipartite-sku-only-sku-matches " "network-metrics") sys.exit(1) else: bipartite_location = sys.argv[1] output_location = sys.argv[2] s_conf = SparkConf() s_conf.set("spark.executor.instances", "4") s_conf.set("spark.executor.memory", "2g") s_conf.set("spark.driver.memory", "2g") spark = SparkContext(conf=s_conf) spark.setLogLevel("ERROR") # needed here if running locally, in cluster it can be with other imports import graphframes sqlContext = SQLContext(spark) # needed for connected components # an existing directory! spark.setCheckpointDir("tmp/checkpoint") graph = build_graph(bipartite_location) dfs = find_subgraphs(graph) vertices_out = network_algorithms(graph, dfs)
__author__ = "ResearchInMotion" import findspark findspark.init() from pyspark import SparkConf from pyspark import SparkContext sparkconf = SparkConf().setAppName("WordCount").setMaster("local[*]") sparkcont = SparkContext(conf=sparkconf) sparkcont.setLogLevel("ERROR") def columns(lines): field = lines.split(",") country = field[3] name = field[2] return name, country data = sparkcont.textFile( "/Users/sahilnagpal/PycharmProjects/Python/Pyspark/InputData/airports.text" ) columnsdata = data.map(columns).filter( lambda country: country[1] == "\"United States\"").take(5) for name, country in columnsdata: print("{} , {} ".format(name, country))
def main(): conf = SparkConf().setMaster("local[*]").setAppName("compare_engine") sc = SparkContext(conf=conf) sc.setLogLevel("INFO") sc.addFile(primary) # rdd_primary = sc.textFile(primary, minPartitions=4, use_unicode=True).distinct() rdd_primary = sc.textFile(SparkFiles.get(primary), minPartitions=4, use_unicode=True).distinct() rdd_primary.partitionBy(10).cache() os.system("rm -Rf collects_*") os.system("rm -Rf holder.txt") rdd_secondary = sc.textFile(secondary, minPartitions=4, use_unicode=True).distinct() rdd_secondary.partitionBy(10).cache() primary_count = rdd_primary.count() primary_report["count"] = primary_count print(primary_report) secondary_count = rdd_secondary.count() secondary_report["count"] = secondary_count print(secondary_report) # Return each Primary file line/record not contained in Secondary not_in_primary = rdd_primary.subtract(rdd_secondary) primary_diff = not_in_primary.count() primary_report["diff"] = primary_diff os.system("rm -Rf collects_*.csv") primary_dir = "collects_{}_primary".format(run_date) primary_report_name = "collects_{}_primary_report.csv".format(run_date) not_in_primary.coalesce(1, True).saveAsTextFile(primary_dir) # os.system('cat collects_{}_primary/part-0000* >> collects_{}_primary_report.csv'.format(run_date, run_date)) os.system("cat {}/part-0000* >> {}".format(primary_dir, primary_report_name)) os.system("wc -l collects_{}_primary_report.csv".format(run_date)) # Flip Primary Vs Secondary # Return each Secondary file line/record not contained in Primary not_in_secondary = rdd_secondary.subtract(rdd_primary) secondary_diff = not_in_secondary.count() secondary_report["diff"] = secondary_diff not_in_secondary.coalesce(1, True).saveAsTextFile( "collects_{}_secondary".format(run_date)) os.system( "cat collects_{}_secondary/part-0000* >> collects_{}_secondary_report.csv" .format(run_date, run_date)) os.system("wc -l collects_{}_secondary_report.csv".format(run_date)) process_report["primary"] = primary_report process_report["secondary"] = secondary_report print("=" * 100) print("\n") print(process_report) print("\n") print("=" * 100) spark_details(sc) sc.stop()
return predictions def append_key_to_dictionary(dictionary, key, value): dictionary[key] = value return dictionary def insert_into_table(values, table_name, host, port): pass if __name__ == '__main__': sc = SparkContext(appName='PythonSparkStreamingKafka') sc.setLogLevel("WARN") # avoid printing logs # setting up a model lr = StreamingLogisticRegressionWithSGD() parameters = json.load(open('model.json', 'r')) # lr.setInitialWeights(parameters['weights']) lr = create_logistic_regression_skl(parameters['weights'], parameters['intercept']) stop_words = load_stopwords() common_words = load_common_words() reference_table = create_hash_table(common_words=common_words, stop_words=stop_words) ssc = StreamingContext(sparkContext=sc, batchDuration=2) spark_sql = SQLContext(sparkContext=sc) kafkaStream = KafkaUtils.createDirectStream(ssc=ssc, topics=['trump'],
class Reader(): def __init__(self): self.sc = SparkContext('local', 'Stream-SQL') self.ssc = StreamingContext(self.sc, batchDuration=3) self.spark = SparkSession.builder\ .getOrCreate() self.sc.setLogLevel('ERROR') def initStream(self): self.readInput() self.ssc.start() self.ssc.awaitTermination() def inputSQLQuery(self, query): self.modQuery = '' self.dictInnerQuery = {} innerFlag = False innerCol = '' wordList = query.split(' ') wordQuery = '' for i in range(len(wordList)): word = wordList[i] # Detect opening '(' of inner query if word == '(SELECT': innerFlag = True innerCol = wordList[i - 2] if innerFlag: wordQuery += word + ' ' else: self.modQuery += word + ' ' # Detect closing ')' of table) and not AVG(col) if ')' in word and '(' not in word: replaceInner = 'Q' + str(len(self.dictInnerQuery)) self.modQuery += replaceInner + ' ' key = replaceInner value = [wordQuery, innerCol, 0] self.dictInnerQuery[key] = value innerFlag = False wordQuery = '' def readInput(self): lines = self.ssc.textFileStream('Data/Live') self.csvSchema = StructType([ StructField('col1', IntegerType()), StructField('col2', IntegerType()), StructField('col3', IntegerType()) ]) # self.stateDF = self.spark.createDataFrame(self.sc.emptyRDD(), self.csvSchema) # self.stateDF.show() self.globalDF = self.spark.createDataFrame(self.sc.emptyRDD(), self.csvSchema) self.totalTime = 0.0 def row(inpStr): return Row(int(inpStr[0]), int(inpStr[1]), int(inpStr[2])) def iterateRDD(rdd): start = time.clock() data = rdd.map(lambda line: line.split(' ')).map(row) df = data.toDF(self.csvSchema) if df.count(): # curDF = df.union(self.stateDF) # self.queryRDD(curDF) # Append to global DF for batch outputs self.globalDF = df.union(self.globalDF) self.outputQuery(self.globalDF) self.totalTime += time.clock() - start # print(str(round(self.totalTime, 2)) + 's') lines.foreachRDD(iterateRDD) def queryRDD(self, df): df.createOrReplaceTempView('table') for key, value in self.dictInnerQuery.items(): innerQuery = value[0] sqlDF = self.spark.sql(innerQuery) sqlRes = sqlDF.first()[0] self.dictInnerQuery[key][2] = sqlRes b = 5 addToState = [False for i in range(df.count())] for key, value in self.dictInnerQuery.items(): col = value[1] val = value[2] tupleList = [{col: x[col]} for x in df.rdd.collect()] for i in range(len(tupleList)): row = tupleList[i] if row[col] > val - b and row[col] < val + b: addToState[i] = True # print(addToState) itr = 0 newRows = [] newStateDF = self.spark.createDataFrame(self.sc.emptyRDD(), self.csvSchema) for row in df.rdd.collect(): if addToState[itr]: newRows.append(row) itr += 1 # print(newRows) newStateDF = self.spark.createDataFrame(newRows, self.csvSchema) self.stateDF = newStateDF # self.stateDF.show() def outputQuery(self, df): # curQuery = ' '.join(list(map((lambda word: str(round(self.dictInnerQuery[word][2], 2)) if word in self.dictInnerQuery else word), self.modQuery.split()))) # df.createOrReplaceTempView('table') # streamOut = self.spark.sql(curQuery).first()[0] # print(type(streamOut)) # self.globalDF.show() query = 'SELECT AVG(col2) FROM table WHERE col2 > (SELECT AVG(col2) FROM table)' self.globalDF.createOrReplaceTempView('table') globalOut = self.spark.sql(query).first()[0] # print(type(globalOut)) print(globalOut)
# hourly distribution of time gap with resolution 5 seconds # threshold: time interval larger than threshold will be discarded # time interval span two consecutive hours is taken into consideration 'hourly_time_gap_distribution': {'index': 1, 'return_std_range': 3, 'resolution': 5, 'threshold': 3600} } ''' process_properties = {} # ------------------------------------------------------------------------------------------------------------------ # ------------------------Spark Context----------------------------------------------------------------------------- sc = SparkContext(appName='Data_Analysis') sc.setLogLevel('DEBUG') # ------------------------------------------------------------------------------------------------------------------ # ------------------------Prepare Raw RDD--------------------------------------------------------------------------- raw_rdd = load_raw_data(sc, input_path, process_properties['delimiter']) ''' If you want to add more columns or do some manipulation to the raw dataset loaded from hdfs, please put the code here. 1. Make sure that the tansformed rdd is re-assiend to variabel raw_rdd. 2. Make sure the data dictionary and the column number in the configuration section match the new dataset ''' # ------------------------------------------------------------------------------------------------------------------ # ------------------------Main Process------------------------------------------------------------------------------ main_process(raw_rdd, sc, output_path, data_dict, process_properties)
from pyspark import SparkContext, SparkConf, StorageLevel from pyspark.sql import Row, SQLContext import re from operator import add if __name__ == '__main__': conf = SparkConf().setAppName("logReader").setMaster("local[1]") conf.set("mytest.sql.crossJoin.enabled", True) conf.set("mytest.sql.shuffle.partitions", 5) conf.set("mytest.defalut.parallelism", 2) sc = SparkContext(conf=conf) sc.setLogLevel("INFO") input = ["A", "C"] str = "A:B,C,D,E,F;B:A,C,D,E;C:A,B,E;D:A,B,E;" # # rdd = sc.parallelize(re.split(";", str)[0:-1]).map( # lambda kv: (re.split(":", kv)[0], re.split(":", kv)[1])). \ # filter(lambda kv: kv[0] in input).map(lambda kv: kv[1]) keys = sc.parallelize(re.split(";", str)[0:-1]).map( lambda kv: (re.split(":", kv)[0], re.split(":", kv)[1].split(","))). \ filter(lambda kv: kv[0] in input).flatMapValues(lambda x: x).map(lambda x: (x[1], 1)).reduceByKey(add).filter( lambda x: x[1] > 1).keys().collect() print(",".join(keys)) # rdd1=rdd.map(lambda v: re.split(",", v)).collect() # print(rdd1)
if __name__ == '__main__': sc_conf = SparkConf() sc_conf.setAppName('ps_consumer') # pyspark consumer sc_conf.setMaster('local[*]') # sc_conf.set('spark.executor.memory', '2g') # sc_conf.set('spark.executor.cores', '4') # sc_conf.set('spark.cores.max', '40') sc_conf.set('spark.logConf', True) sc_conf.set('spark.io.compression.codec', 'snappy') # sc = SparkContext(master='local[*]', appName='ps_consumer') sc = SparkContext(conf=sc_conf) sc.setLogLevel('INFO') # print(sc) ssc = StreamingContext(sc, 5) # print(ssc) topic = 'firewall' partition = 0 kafka_param = { "metadata.broker.list": 'localhost:9092', "auto.offset.reset": "smallest", "group.id": 'mygroup', } # topicPartion = TopicAndPartition(topic, partition) # fromOffsets = {topicPartion: 500} # stream = KafkaUtils.createDirectStream(
def init_spark():
    sc = SparkContext(appName="videoStreamCollector")
    ssc = StreamingContext(sc, config.Config.BATCH_DURATION)
    sc.setLogLevel("WARN")
    return sc, ssc
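# A minimal usage sketch for the helper above; the socket source on port 9999
# and the count-per-batch step are hypothetical placeholders, not part of the
# original collector code.
sc, ssc = init_spark()
frames = ssc.socketTextStream("localhost", 9999)  # assumed stream source
frames.count().pprint()
ssc.start()
ssc.awaitTermination()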
d = json.loads(d) city = d["city"].encode("utf-8") list_of_cities.append(city) gt = len(set(list_of_cities)) median = get_estimate(list_of_cities) # print(str(time),gt,median) with open(output_file, "a") as fp: writer = csv.writer(fp) writer.writerow([time, gt, median]) return port_no = int(sys.argv[1]) output_file = sys.argv[2] num_hash_functions = 45 sc = SparkContext(appName="task2") sc.setLogLevel("OFF") ssc = StreamingContext(sc, 5) with open(output_file, "w") as fp: writer = csv.writer(fp) writer.writerow(["Time", "Ground Truth", "Estimation"]) hash_list = generate_hash_functions(num_hash_functions) lines = ssc.socketTextStream("localhost", port_no).window(30, 10).foreachRDD(sample) ssc.start() # Start the computation ssc.awaitTermination() # Wait for the computation to terminate
def functionToCreateContext():
    sc = SparkContext('local[2]', 'checkpoint')
    sc.setLogLevel('ERROR')
    ssc = StreamingContext(sc, 5)
    ssc.checkpoint('/tmp/checkpointDirectory')  # set checkpoint directory
    return ssc
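# Sketch of how such a factory is typically consumed: StreamingContext.getOrCreate
# either recovers a context from the checkpoint directory or calls the factory to
# build a fresh one. The directory mirrors the one set inside the factory.
ssc = StreamingContext.getOrCreate('/tmp/checkpointDirectory', functionToCreateContext)
ssc.start()
ssc.awaitTermination()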
#coding=UTF-8
from pyspark import SparkContext, SparkConf, SQLContext, Row, HiveContext
from pyspark.sql.types import *
from datetime import date, datetime, timedelta
import sys, re, os, calendar

st = datetime.now()
conf = SparkConf().setAppName('PROC_A_R_MONTH_CONT').setMaster(sys.argv[2])
sc = SparkContext(conf=conf)
sc.setLogLevel('WARN')
if len(sys.argv) > 5:
    if sys.argv[5] == "hive":
        sqlContext = HiveContext(sc)
    else:
        sqlContext = SQLContext(sc)
hdfs = sys.argv[3]
dbname = sys.argv[4]

# Dates needed for processing
etl_date = sys.argv[1]
# ETL date
V_DT = etl_date
# Previous day's date
V_DT_LD = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8])) + timedelta(-1)).strftime("%Y%m%d")
# First day of the current month
V_DT_FMD = date(int(etl_date[0:4]), int(etl_date[4:6]), 1).strftime("%Y%m%d")
# Last day of the current month (10 characters)
monthRange = calendar.monthrange(int(etl_date[0:4]), int(etl_date[4:6]))  # year needs the first four characters of etl_date, not three
def process(fileName, output): lines = open(file=fileName, mode="r").readlines() data_list = [] for line in lines: if line is None or line.strip().__len__() == 0: continue for data in json.loads(line): title = data['title'].replace('None', '') text = data['text'].replace('None', '') tmp = "" if title is not None or title.strip() != "None": tmp = tmp + title.strip() + '.' if text is None or text.strip() == "None": continue tmp = tmp + text.strip() tmp.replace("\\u00", '').replace("\n", "") data_list.append(tmp) # print("$$$$$$$News is:",tmp) conf = SparkConf().setAppName("Train_News").setMaster("local[1]") sc = SparkContext(conf=conf) sc.setLogLevel("WARN") dataRdd = sc.parallelize(data_list).distinct(numPartitions=1) tokenizedRdd = dataRdd.map(lambda x: [x, preprocess(x)]) taggedRdd = tokenizedRdd.map( lambda x: [x[0], x[1], getNamedEntities(x[0]), extractSignature(x[0])]) prepared_data = taggedRdd.collect() result = [] i = 0 for data1 in prepared_data: print('Processed data:', i) i = i + 1 for data2 in prepared_data: if data1 == data2: continue nn_count = 0 spot_count = 0 nn_score = 0 lav_d_count = 0 lav_d_score = 0 spot_score = 0 if len(data1[2]) != 0: for nn in data1[2]: n = len(nn) if nn in data2[2]: nn_count = nn_count + 1 for tmp in data2[2]: l = len(tmp) if lav_distance(nn, tmp) / (n + l) < 0.25: lav_d_count = lav_d_count + 1 nn_score = nn_count / len(data1[2]) lav_d_score = lav_d_count / len(data1[2]) if len(data1[3]) != 0: for nn in data1[3]: if nn in data2[3]: spot_count = spot_count + 1 spot_score = spot_count / len(data1[3]) result.append([ data1[0], data1[2], data1[3], data2[0], data2[2], data2[3], nn_count, nn_score, lav_d_count, lav_d_score, spot_count, spot_score ]) df = pd.DataFrame( data=result, columns=[ "News1", "Names_Entities_1", "Spot_words_1", "News2", "Names_Entities_2", "Spot_words_2", "Named_Entity_match_count", "Named_Entity_match_score", "Laven_Named_Entity_match_score", "Laven_Named_Entity_match_count", "Spot_Words_Match_score", "Spot_Words_Match_Score" ]) time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S').replace( " ", "_").replace(":", "_") df.query('Named_Entity_match_score > 0 or Spot_Words_Match_Score > 0.25' ).to_csv(output + time + ".csv") print('Report File Saved')
from pyspark import SparkContext

"""
nasa_19950701.tsv contains 10000 log lines from one of NASA's apache servers for 1st July 1995.
nasa_19950801.tsv contains 10000 log lines for 1st Aug 1995.
Create a spark program to generate a new RDD which contains the log lines from both July 1st & Aug 1st,
take a 0.1 sample of those log lines and save it to "out/sample_nasa_logs.tsv".

The files have header lines:
host    logname    time    method    url    response    bytes

Make sure the header lines are removed in the resulting RDD.
"""

sc = SparkContext("local[2]", "NASA log Problem")
sc.setLogLevel("ERROR")

nasa_july = sc.textFile("./in/nasa_19950701.tsv")
nasa_aug = sc.textFile("./in/nasa_19950801.tsv")

nasa_both = nasa_july.union(nasa_aug)
nasa_both_without_header = nasa_both.filter(
    lambda line: not (line.startswith("host") and "bytes" in line))

nasa_both_sample = nasa_both_without_header.sample(withReplacement=True, fraction=0.1)
nasa_both_sample.saveAsTextFile("out/sample_nasa_logs.tsv")
from pyspark.mllib.linalg import Vectors
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.param import Param, Params
from pyspark.sql import SQLContext
from pyspark import SparkContext

sc = SparkContext(appName="ML Example")
sc.setLogLevel("FATAL")
sqlContext = SQLContext(sc)

# Prepare training data from a list of (label, features) tuples.
training = sqlContext.createDataFrame([
    (1.0, Vectors.dense([0.0, 1.1, 0.1])),
    (0.0, Vectors.dense([2.0, 1.0, -1.0])),
    (0.0, Vectors.dense([2.0, 1.3, 1.0])),
    (1.0, Vectors.dense([0.0, 1.2, -0.5]))], ["label", "features"])

# Create a LogisticRegression instance. This instance is an Estimator.
lr = LogisticRegression(maxIter=10, regParam=0.01)
# Print out the parameters, documentation, and any default values.
print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")

# Learn a LogisticRegression model. This uses the parameters stored in lr.
model1 = lr.fit(training)

# Since model1 is a Model (i.e., a transformer produced by an Estimator),
# we can view the parameters it used during fit().
# This prints the parameter (name: value) pairs, where names are unique IDs for this
# LogisticRegression instance.
print("Model 1 was fit using parameters: ")
print(model1.extractParamMap())
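# A short follow-up sketch (not in the original snippet): the fitted model is a
# Transformer, so it can score a DataFrame. Here it is simply applied back to the
# training data for illustration.
prediction = model1.transform(training)
prediction.select("features", "label", "probability", "prediction").show()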
def run(): # Creating the Spark Context sc = SparkContext(master="local[2]", appName="WindowWordCount") sc.setLogLevel("ERROR") # creating the streaming context ssc = StreamingContext(sc, 10) ssc.checkpoint("checkpoint") # creating the SQL context sqlContext = SQLContext(sc) host = "localhost" port = 5599 lines = ssc.socketTextStream(host, port) hashtags = lines.filter(lambda text: len(text) > 0) \ .flatMap(lambda text: text.split(" ")) \ .filter(lambda text: text.lower().startswith('#')) Word = namedtuple('Word', ("word", "count")) Hashtag = namedtuple('Hashtag', ("tag", "count")) Tweet = namedtuple('Tweet', ('text', 'sentiment')) stop_words = set(stopwords.words('english')) list_punct = list(string.punctuation) lemmatizer = WordNetLemmatizer() # processing to obtain data about tweets text and sentiment lines.window(40) \ .map(lambda p: clean_tweet(p)) \ .filter(lambda text: len(text) > 0) \ .map(lambda p: Tweet(p, analyze_sentiment_polarity(p))) \ .foreachRDD(lambda rdd: rdd.toDF().registerTempTable("tweets")) # processing to obtain data about single words in text and their count. NLP tools applied. lines.window(40) \ .map(lambda p: clean_tweet(p)) \ .filter(lambda text: len(text) > 0) \ .flatMap(lambda text: text.split(" ")) \ .map(lambda word: word.lower()) \ .filter(lambda word: word not in stop_words) \ .map(lambda word: ''.join(char for char in word if char not in list_punct)) \ .map(lambda word: lemmatizer.lemmatize(word)) \ .map(lambda word: (word, 1)) \ .reduceByKey(lambda a, b: a + b) \ .map(lambda p: Word(p[0], p[1])) \ .foreachRDD(lambda rdd: rdd.toDF().registerTempTable("words")) # processing to obtain data about hashtags in text and their count. hashtags.window(40) \ .map(lambda word: ''.join(char for char in word if char not in list_punct)) \ .map(lambda word: (word.lower(), 1)) \ .reduceByKey(lambda a, b: a + b) \ .map(lambda p: Hashtag(p[0], p[1])) \ .foreachRDD(lambda rdd: rdd.toDF().registerTempTable("hashtags")) time_to_wait = 80 ssc.start() print("Session Started.....") print("Collecting tweets...waiting for " + str(time_to_wait) + " seconds..") time.sleep( time_to_wait) # waiting in to ensure that some data are yet collected. print("Tweets Collected....") all_hashtags_df = None all_tweets_df = None all_words_df = None count = 1 count_max = 4 while count <= count_max: print('Count: ' + str(count) + "/" + str(count_max)) print("Waiting for 30 Seconds.....") time.sleep(40) words = sqlContext.sql('Select word, count from words') words_df = words.toPandas() print(words_df) if all_words_df is None: all_words_df = words_df else: all_words_df = pd.concat([all_words_df, words_df], join='inner', ignore_index=True) tags = sqlContext.sql('Select tag, count from hashtags') tags_df = tags.toPandas() print(tags_df) if all_hashtags_df is None: all_hashtags_df = tags_df else: all_hashtags_df = pd.concat([all_hashtags_df, tags_df], join='inner', ignore_index=True) tweets = sqlContext.sql('Select text, sentiment from tweets') tweets_df = tweets.toPandas() if all_tweets_df is None: all_tweets_df = tweets_df else: all_tweets_df = pd.concat([all_tweets_df, tweets_df], join='inner', ignore_index=True) count += 1 ssc.stop() # Saving all dataframes as csv. if all_hashtags_df is not None: all_hashtags_df.to_csv('hashtags.csv') if all_words_df is not None: all_words_df.to_csv('words.csv') if all_tweets_df is not None: all_tweets_df.to_csv('tweets.csv')
def main(): conf = (SparkConf() .setMaster("local[*]") .setAppName("compare_engine")) sc = SparkContext(conf = conf) sc.setLogLevel('INFO') sc.addFile(primary) # rdd_primary = sc.textFile(primary, minPartitions=4, use_unicode=True).distinct() rdd_primary = sc.textFile(SparkFiles.get(primary), minPartitions=4, use_unicode=True).distinct() rdd_primary.partitionBy(10).cache() os.system('rm -Rf collects_*') os.system('rm -Rf holder.txt') rdd_secondary = sc.textFile(secondary, minPartitions=4, use_unicode=True).distinct() rdd_secondary.partitionBy(10).cache() primary_count = rdd_primary.count() primary_report['count'] = primary_count print(primary_report) secondary_count = rdd_secondary.count() secondary_report['count'] = secondary_count print(secondary_report) # Return each Primary file line/record not contained in Secondary not_in_primary = rdd_primary.subtract(rdd_secondary) primary_diff = not_in_primary.count() primary_report['diff'] = primary_diff os.system('rm -Rf collects_*.csv') primary_dir = 'collects_{}_primary'.format(run_date) primary_report_name = 'collects_{}_primary_report.csv'.format(run_date) not_in_primary.coalesce(1, True).saveAsTextFile(primary_dir) # os.system('cat collects_{}_primary/part-0000* >> collects_{}_primary_report.csv'.format(run_date, run_date)) os.system('cat {}/part-0000* >> {}'.format(primary_dir, primary_report_name)) os.system('wc -l collects_{}_primary_report.csv'.format(run_date)) # Flip Primary Vs Secondary # Return each Secondary file line/record not contained in Primary not_in_secondary = rdd_secondary.subtract(rdd_primary) secondary_diff = not_in_secondary.count() secondary_report['diff'] = secondary_diff not_in_secondary.coalesce(1,True).saveAsTextFile('collects_{}_secondary'.format(run_date)) os.system('cat collects_{}_secondary/part-0000* >> collects_{}_secondary_report.csv'.format(run_date, run_date)) os.system('wc -l collects_{}_secondary_report.csv'.format(run_date)) process_report['primary'] = primary_report process_report['secondary'] = secondary_report print("=" * 100) print('\n') print(process_report) print('\n') print("=" * 100) spark_details(sc) sc.stop()
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
import json
import re

if __name__ == '__main__':
    sc = SparkContext(appName='PythonSparkStreamingKafka')
    sc.setLogLevel(logLevel='WARN')

    ssc = StreamingContext(sparkContext=sc, batchDuration=1)
    kafkaStream = KafkaUtils.createDirectStream(ssc=ssc, topics=['trump'],
                                                kafkaParams={"metadata.broker.list": 'localhost:9092'})

    regex = re.compile('\\w+')
    lines = kafkaStream.map(lambda line: json.loads(line[1])).\
        filter(lambda d: d.get('lang', '') == 'en').\
        flatMap(lambda d: regex.findall(d['text'].lower())).\
        map(lambda word: (word, 1)).\
        reduceByKey(lambda x, y: x + y)

    lines.pprint()
    lines.saveAsTextFiles('hdfs:///home/hduser/test_1')

    ssc.start()
    ssc.awaitTermination()
STREAM_OUT = 'stream-OUT' # We first delete all files from the STREAM_IN folder # before starting spark streaming. # This way, all files are new print("Deleting existing files in %s ..." % STREAM_IN) p = Path('.') / STREAM_IN for f in p.glob("*.ordtmp"): os.remove(f) print("... done") from pyspark import SparkContext, SparkConf from pyspark.streaming import StreamingContext sc = SparkContext("local[*]", "CountAndVolumePerBatch") sc.setLogLevel( "WARN") #Make sure warnings and errors observed by spark are printed. ssc = StreamingContext(sc, 5) #generate a mini-batch every 5 seconds filestream = ssc.textFileStream( STREAM_IN) #monitor new files in folder stream-IN def parseOrder(line): '''parses a single line in the orders file''' s = line.split(",") try: if s[6] != "B" and s[6] != "S": raise Exception('Wrong format') return [{ "time": datetime.strptime(s[0], "%Y-%m-%d %H:%M:%S"), "orderId": int(s[1]),
parser = argparse.ArgumentParser(description='Prepare data') parser.add_argument('config_file') args = parser.parse_args() # Load config file with open(args.config_file, 'r') as ymlfile: cfg = yaml.load(ymlfile, Loader=yaml.FullLoader) resolve_placeholder(cfg) cfg_log = cfg['log'] cfg_rti = cfg['pipeline']['rti_transform'] sc = SparkContext() hive_context = HiveContext(sc) sc.setLogLevel(cfg_log['level']) default_hour = cfg_rti['default_hour'] default_price_cat = cfg_rti['default_price_cat'] day_step = cfg_rti['day_step'] start_day = cfg_rti['start_day'] end_day = cfg_rti['end_day'] new_bucket_size = cfg_rti['new_bucket_size'] input_table = cfg_rti['input_table'] output_table = cfg['factdata_table_name'] run(hive_context=hive_context, input_table=input_table, output_table=output_table, start_day=start_day, end_day=end_day,
import traceback

# In[2]:

try:
    try:
        timespan = str(sys.argv[1])
    except IndexError:
        print 'please pass timespan in argument'
        sys.exit()

    conf = (SparkConf().setMaster("local").setAppName("hi_report_app").set("spark.executor.memory", "1g"))
    sc = SparkContext(conf=conf)
    sc.setLogLevel("ERROR")
    sqlContext = SQLContext(sc)

    # In[2]:

    config_url = 'https://s3-ap-southeast-1.amazonaws.com/nlplive.humanindex.data/config.json'
    try:
        config_response = requests.get(config_url)
        config = json.loads(config_response.content)
    except:
        print "Cannot fetch Config......"

    # In[23]:

    try:
        fetch_response = requests.get(str(config['baseAPIUrl'])+'/'+str(config['version'])+'/preProcessing/GetPredictionFileJob/'+timespan+'/publisher')
        # check if the api request is successful or not
def error(point, kmeans):
    """
    Compute the within-cluster error for a single point.

    Args:
        point: point to predict in model
        kmeans (KMeansModel object): trained k-means model

    Returns:
        float: distance from the point to its assigned cluster center,
        summed over all points elsewhere to obtain the model's total
        within-cluster squared error
    """
    center = kmeans.centers[kmeans.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))


if __name__ == '__main__':
    # Initialize SparkContext object
    sc = SparkContext(appName="PythonDetectDDOS")
    sc.setLogLevel("ERROR")  # Reduce logging
    sqlContext = sql.SQLContext(sc)

    # Path to log input file
    logFile = "/user/root/src/Project - Developer - apache-access-log (4).txt.gz"

    # Read log text file and parse based on Apache log standard
    parsed_logs, access_logs = parseLogs(sc, logFile)

    # Process data for feature columns to be used in training
    df4 = dataProcessing(access_logs)
    df4.show()

    # Format DataFrame into Dense Vector for mllib K-means clustering
    data7 = df4.rdd.map(lambda row: Vectors.dense(row[2], row[3]))
    data7.cache()
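    # A hedged sketch (not in the original snippet) of how error() is usually
    # aggregated into the within-set sum of squared errors once a model exists;
    # the KMeans.train call and k=2 are illustrative assumptions.
    from pyspark.mllib.clustering import KMeans
    model = KMeans.train(data7, k=2, maxIterations=10)
    WSSSE = data7.map(lambda point: error(point, model)).reduce(lambda a, b: a + b)
    print("Within Set Sum of Squared Error = " + str(WSSSE))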
import sys from pyspark import SparkContext from pyspark.streaming import StreamingContext from pyspark.streaming.kafka import KafkaUtils if __name__ == "__main__": if len(sys.argv) != 3: print("Usage: script.py <zk> <topic>", file=sys.stderr) exit(-1) zkQuorum, topic = sys.argv[1:] sc = SparkContext(appName="KafkaSparkStreaming") sc.setLogLevel("WARN") ssc = StreamingContext(sc, 10) ssc.checkpoint("checkpoint") ks = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming-consumer", {topic: 42}) def processInput(line): fields = line[1].split("\t") return ((str(fields[6]), 1), (str(fields[7]), 1)) def updateFunction(newValues, runningCount): return sum(newValues, runningCount or 0) digest = ks.flatMap(processInput)\ .updateStateByKey(updateFunction)\ .transform(lambda rdd: rdd.sortBy(lambda x: x[1], ascending=False)\
def main(): "Main function" optmgr = OptionParser() opts = optmgr.parser.parse_args() # setup spark/sql context to be used for communication with HDFS sc = SparkContext(appName="phedex_br") if not opts.yarn: sc.setLogLevel("ERROR") sqlContext = HiveContext(sc) schema_def = schema() # read given file(s) into RDD if opts.fname: pdf = sqlContext.read.format('com.databricks.spark.csv')\ .options(treatEmptyValuesAsNulls='true', nullValue='null')\ .load(opts.fname, schema = schema_def) elif opts.basedir: fromdate, todate = defDates(opts.fromdate, opts.todate) files = getFileList(opts.basedir, fromdate, todate) msg = "Between dates %s and %s found %d directories" % (fromdate, todate, len(files)) print msg if not files: return pdf = unionAll([sqlContext.read.format('com.databricks.spark.csv') .options(treatEmptyValuesAsNulls='true', nullValue='null')\ .load(file_path, schema = schema_def) \ for file_path in files]) else: raise ValueError("File or directory not specified. Specify fname or basedir parameters.") # parsing additional data (to given data adding: group name, node kind, acquisition era, data tier, now date) groupdic, nodedic = getJoinDic() acquisition_era_reg = r"^/[^/]*/([^/^-]*)-[^/]*/[^/]*$" data_tier_reg = r"^/[^/]*/[^/^-]*-[^/]*/([^/]*)$" groupf = udf(lambda x: groupdic[x], StringType()) nodef = udf(lambda x: nodedic[x], StringType()) ndf = pdf.withColumn("br_user_group", groupf(pdf.br_user_group_id)) \ .withColumn("node_kind", nodef(pdf.node_id)) \ .withColumn("now", from_unixtime(pdf.now_sec, "YYYY-MM-dd")) \ .withColumn("acquisition_era", when(regexp_extract(pdf.dataset_name, acquisition_era_reg, 1) == "",\ lit("null")).otherwise(regexp_extract(pdf.dataset_name, acquisition_era_reg, 1))) \ .withColumn("data_tier", when(regexp_extract(pdf.dataset_name, data_tier_reg, 1) == "",\ lit("null")).otherwise(regexp_extract(pdf.dataset_name, data_tier_reg, 1))) # print dataframe schema if opts.verbose: ndf.show() print("pdf data type", type(ndf)) ndf.printSchema() # process aggregation parameters keys = [key.lower().strip() for key in opts.keys.split(',')] results = [result.lower().strip() for result in opts.results.split(',')] aggregations = [agg.strip() for agg in opts.aggregations.split(',')] order = [orde.strip() for orde in opts.order.split(',')] if opts.order else [] asc = [asce.strip() for asce in opts.asc.split(',')] if opts.order else [] filtc, filtv = opts.filt.split(":") if opts.filt else (None,None) validateAggregationParams(keys, results, aggregations, order, filtc) if filtc and filtv: ndf = ndf.filter(getattr(ndf, filtc) == filtv) # if delta aggregation is used if DELTA in aggregations: validateDeltaParam(opts.interval, results) result = results[0] #1 for all dates generate interval group dictionary datedic = generateDateDict(fromdate, todate, opts.interval) boundic = generateBoundDict(datedic) max_interval = max(datedic.values()) interval_group = udf(lambda x: datedic[x], IntegerType()) interval_start = udf(lambda x: boundic[x][0], StringType()) interval_end = udf(lambda x: boundic[x][1], StringType()) #2 group data by block, node, interval and last result in the interval ndf = ndf.select(ndf.block_name, ndf.node_name, ndf.now, getattr(ndf, result)) idf = ndf.withColumn("interval_group", interval_group(ndf.now)) win = Window.partitionBy(idf.block_name, idf.node_name, idf.interval_group).orderBy(idf.now.desc()) idf = idf.withColumn("row_number", rowNumber().over(win)) rdf = idf.where((idf.row_number == 1) & (idf.interval_group != 0))\ .withColumn(result, when(idf.now == 
interval_end(idf.interval_group), getattr(idf, result)).otherwise(lit(0))) rdf = rdf.select(rdf.block_name, rdf.node_name, rdf.interval_group, getattr(rdf, result)) rdf.cache() #3 create intervals that not exist but has minus delta win = Window.partitionBy(idf.block_name, idf.node_name).orderBy(idf.interval_group) adf = rdf.withColumn("interval_group_aft", lead(rdf.interval_group, 1, 0).over(win)) hdf = adf.filter(((adf.interval_group + 1) != adf.interval_group_aft) & (adf.interval_group != max_interval))\ .withColumn("interval_group", adf.interval_group + 1)\ .withColumn(result, lit(0))\ .drop(adf.interval_group_aft) #4 join data frames idf = rdf.unionAll(hdf) #3 join every interval with previous interval win = Window.partitionBy(idf.block_name, idf.node_name).orderBy(idf.interval_group) fdf = idf.withColumn("delta", getattr(idf, result) - lag(getattr(idf, result), 1, 0).over(win)) #5 calculate delta_plus and delta_minus columns and aggregate by date and node ddf =fdf.withColumn("delta_plus", when(fdf.delta > 0, fdf.delta).otherwise(0)) \ .withColumn("delta_minus", when(fdf.delta < 0, fdf.delta).otherwise(0)) aggres = ddf.groupBy(ddf.node_name, ddf.interval_group).agg(sum(ddf.delta_plus).alias("delta_plus"),\ sum(ddf.delta_minus).alias("delta_minus")) aggres = aggres.select(aggres.node_name, interval_end(aggres.interval_group).alias("date"), aggres.delta_plus, aggres.delta_minus) else: resAgg_dic = zipResultAgg(results, aggregations) order, asc = formOrdAsc(order, asc, resAgg_dic) # perform aggregation if order: aggres = ndf.groupBy(keys).agg(resAgg_dic).orderBy(order, ascending=asc) else: aggres = ndf.groupBy(keys).agg(resAgg_dic) # output results if opts.fout: fout_header = formFileHeader(opts.fout) if opts.header: aggres.write.format('com.databricks.spark.csv').options(header = 'true').save(fout_header) else: aggres.write.format('com.databricks.spark.csv').save(fout_header) else: aggres.show(50)
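# Not defined in this snippet: the `unionAll` helper used above to merge the per-file
# DataFrames. A minimal sketch of such a helper, assuming the Spark 1.x DataFrame.unionAll
# API used elsewhere in this script; the original implementation may differ.
from functools import reduce

def unionAll(dfs):
    """Fold a list of DataFrames with identical schemas into a single DataFrame."""
    return reduce(lambda left, right: left.unionAll(right), dfs)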
def main():
    root = os.path.dirname(os.path.abspath(__file__))

    print("Digits Handwriting Recognition using Spark")
    print("Root file path is = %s" % root)

    conf = SparkConf().setAppName("OCR")
    sc = SparkContext(conf=conf)
    sc.setLogLevel("WARN")
    sqlContext = SQLContext(sc)

    print("loading dataset")
    trainRDD = MLUtils.loadLibSVMFile(sc, root + "/dataset/svm/mnist")
    testRDD = MLUtils.loadLibSVMFile(sc, root + "/dataset/svm/mnist.t")

    # check if rdd supports toDF
    if not hasattr(trainRDD, "toDF"):
        print("ERROR: RDD does not support toDF")
        sys.exit(1)

    ## convert RDDs to data frames
    trainDF = trainRDD.toDF()
    testDF = testRDD.toDF()

    print("INFO: train dataframe count = %u" % trainDF.count())
    print("INFO: test dataframe count = %u" % testDF.count())

    indexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
    dtc = DecisionTreeClassifier(labelCol="indexedLabel")

    pipeline = Pipeline(stages=[indexer, dtc])
    model = pipeline.fit(trainDF)

    ## train multiple decision tree models of varying depth
    variedMaxDepthModels = []

    print("Create varied depth decision tree models [1..8]")
    for mdepth in xrange(1, 9):
        start = time.time()

        ## maximum depth
        dtc.setMaxDepth(mdepth)

        ## create pipeline
        pipeline = Pipeline(stages=[indexer, dtc])

        ## create the model
        model = pipeline.fit(trainDF)

        ## add to varied container
        variedMaxDepthModels.append(model)

        end = time.time()
        print("trained a decision tree of depth %u, duration = [%.3f] secs" % (mdepth, end - start))

    print("=================================================")

    ## report model accuracies
    evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", metricName="precision")

    ## mdepth
    print("Evaluate all models precision")
    for mdepth in xrange(1, 9):
        model = variedMaxDepthModels[mdepth - 1]

        predictions = model.transform(testDF)

        precision = evaluator.evaluate(predictions)
        print("decision tree depth = %u, precision = %.3f" % (mdepth, precision))

    print("Finished processing %u digits" % testDF.count())
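    # Not part of the original: once every depth has been evaluated, one might keep the
    # best-performing model. A minimal sketch, assuming `variedMaxDepthModels`, `evaluator`
    # and `testDF` as defined above.
    precisions = [evaluator.evaluate(m.transform(testDF)) for m in variedMaxDepthModels]
    bestDepth = precisions.index(max(precisions)) + 1  # depths were trained as 1..8
    print("Best decision tree depth = %u, precision = %.3f" % (bestDepth, max(precisions)))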
from pyspark import SparkContext

sc = SparkContext(appName="streamingkafka")
sc.setLogLevel("WARN")  # reduce the amount of log output printed to the shell

rdd = sc.textFile('daily_IBM.csv')
rdd = rdd.flatMap(lambda x: x.split(','))
print(rdd.collect())
from sklearn.metrics.cluster import normalized_mutual_info_score
from pyspark import SparkContext
import json

sc = SparkContext(appName="INF553HW5", master="local[*]")
sc.setLogLevel("WARN")
sc.setLogLevel("ERROR")

clustering_file_path = "o1"
label_path = "../resource/asnlib/publicdata/cluster2.json"

ground_truth = sc.textFile(label_path).map(lambda line: json.loads(line)). \
    flatMap(lambda line: [(index, label) for index, label in line.items()]). \
    map(lambda pair: (int(pair[0]), pair[1])). \
    collect()
ground_truth.sort()
ground_truth = [cid for _, cid in ground_truth]

ground_truth_cluster_size = sc.textFile(label_path).map(lambda line: json.loads(line)). \
    flatMap(lambda line: [(index, label) for index, label in line.items()]). \
    map(lambda pair: (pair[1], pair[0])). \
    groupByKey(). \
    mapValues(len).collect()
ground_truth_cluster_size.sort(key=lambda pair: pair[1])

prediction_cluster_size = sc.textFile(clustering_file_path).map(lambda line: json.loads(line)). \
    flatMap(lambda line: [(index, label) for index, label in line.items()]). \
    map(lambda pair: (pair[1], pair[0])). \
    groupByKey(). \
    mapValues(len).collect()
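# Not shown in the original excerpt: the script imports normalized_mutual_info_score, so
# presumably the predicted labels are flattened the same way as the ground truth and then
# compared. A minimal sketch under that assumption, reusing `sc`, `clustering_file_path`
# and `ground_truth` from above.
predictions = sc.textFile(clustering_file_path).map(lambda line: json.loads(line)). \
    flatMap(lambda line: [(int(index), label) for index, label in line.items()]). \
    collect()
predictions.sort()
predictions = [cid for _, cid in predictions]

print("NMI:", normalized_mutual_info_score(ground_truth, predictions))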
def aggregate(hdir, cond, precision, min_date, max_date):
    "Collect aggregated statistics from HDFS"
    start_time = time.time()
    print("Aggregating {} FWJR performance data in {} matching {} from {} to {}...".format(
        precision.replace('y', 'i') + 'ly', hdir, cond, min_date, max_date))

    conf = SparkConf().setAppName("wmarchive fwjr aggregator")
    sc = SparkContext(conf=conf)
    sc.setLogLevel("ERROR")
    sqlContext = HiveContext(sc)

    # To test the procedure in an interactive pyspark shell:
    #
    # 1. Open a pyspark shell with appropriate configuration with:
    #
    #    ```
    #    pyspark --packages com.databricks:spark-avro_2.10:2.0.1 --driver-class-path=/usr/lib/hive/lib/* --driver-java-options=-Dspark.executor.extraClassPath=/usr/lib/hive/lib/*
    #    ```
    #
    # 2. Paste this:
    #
    # >>>
    # from pyspark.sql.functions import *
    # from pyspark.sql.types import *
    # hdir = '/cms/wmarchive/avro/2016/06/28*'
    # precision = 'day'
    fwjr_df = sqlContext.read.format("com.databricks.spark.avro").load(hdir)
    # <<<

    # Here we process the filters given by `cond`.
    # TODO: Filter by min_date and max_date and possibly just remove the `hdir` option
    #       and instead process the entire dataset, or make it optional.
    fwjr_df = make_filters(fwjr_df, cond)

    # 3. Paste this:
    #
    # >>>
    # Select the data we are interested in
    jobs = fwjr_df.select(
        fwjr_df['meta_data.ts'].alias('timestamp'),
        fwjr_df['meta_data.jobstate'],
        fwjr_df['meta_data.host'],
        fwjr_df['meta_data.jobtype'],
        fwjr_df['task'],
        fwjr_df['steps.site'].getItem(0).alias('site'),  # TODO: improve
        fwjr_df['steps'],  # TODO: `explode` here, see below
        # TODO: also select `meta_data.fwjr_id`
    )

    # Transform each record to the data we then want to group by:

    # Transform timestamp to start_date and end_date with given precision,
    # thus producing many jobs that have the same start_date and end_date.
    # These will later be grouped by.
    timestamp = jobs['timestamp']
    if precision == "hour":
        start_date = floor(timestamp / 3600) * 3600
        end_date = start_date + 3600
    elif precision == "day":
        start_date = floor(timestamp / 86400) * 86400
        end_date = start_date + 86400
    elif precision == "week":
        end_date = next_day(to_date(from_unixtime(timestamp)), 'Mon')
        start_date = date_sub(end_date, 7)
        start_date = to_utc_timestamp(start_date, 'UTC')
        end_date = to_utc_timestamp(end_date, 'UTC')
    elif precision == "month":
        start_date = trunc(to_date(from_unixtime(timestamp)), 'month')
        end_date = date_add(last_day(start_date), 1)
        start_date = to_utc_timestamp(start_date, 'UTC')
        end_date = to_utc_timestamp(end_date, 'UTC')

    jobs = jobs.withColumn('start_date', start_date)
    jobs = jobs.withColumn('end_date', end_date)
    jobs = jobs.withColumn('timeframe_precision', lit(precision))
    jobs = jobs.drop('timestamp')

    # Transform `task` to task and workflow name
    jobs = jobs.withColumn('taskname_components', split(jobs['task'], '/'))
    jobs = jobs.withColumn('workflow', jobs['taskname_components'].getItem(1))
    jobs = jobs.withColumn('task', jobs['taskname_components'].getItem(size(jobs['taskname_components'])))
    jobs = jobs.drop('taskname_components')

    # Extract exit code and acquisition era
    stepScopeStruct = StructType([
        StructField('exitCode', StringType(), True),
        StructField('exitStep', StringType(), True),
        StructField('acquisitionEra', StringType(), True),
    ])

    def extract_step_scope(step_names, step_errors, step_outputs):
        # TODO: improve this rather crude implementation
        exitCode = None
        exitStep = None
        for (i, errors) in enumerate(step_errors):
            if len(errors) > 0:
                exitCode = errors[0].exitCode
                exitStep = step_names[i]
                break
        acquisitionEra = None
        for outputs in step_outputs:
            if len(outputs) > 0:
                acquisitionEra = outputs[0].acquisitionEra
                break
        return (exitCode, exitStep, acquisitionEra)

    extract_step_scope_udf = udf(extract_step_scope, stepScopeStruct)
    jobs = jobs.withColumn('step_scope', extract_step_scope_udf('steps.name', 'steps.errors', 'steps.output'))
    jobs = jobs.select('*', 'step_scope.exitCode', 'step_scope.exitStep', 'step_scope.acquisitionEra').drop('step_scope')
    # <<<

    # You can check the schema at any time with:
    # ```
    # jobs.printSchema()
    # ```

    # TODO: Phase 1: Aggregation over steps
    #
    # Each job has a list of `steps`, each with a `performance` dictionary.
    # These performance dictionaries must be combined into one by summing their
    # values, or possibly in a different way for each metric.
    # E.g. if a job has 3 steps, where 2 of them have a `performance`
    # dictionary with values such as `performance.cpu.TotalJobTime: 1` and
    # `performance.cpu.TotalJobTime: 2`, then as a result the _job_ should
    # have a `performance` dictionary with `performance.cpu.TotalJobTime: 3`.
    #
    # All keys in the `performance` schema should be aggregated over in this fashion.
    # The performance metrics are documented in https://github.com/knly/WMArchiveAggregation
    # with a reference to `WMArchive/src/maps/metrics.json`.
    #
    # To achieve this aggregation using pyspark-native functions, we should
    # `explode` on the `steps` array and possibly even further down into
    # `output` and/or `errors`, keeping track of the `meta_data.fwjr_id`.
    # Then we can group by the `fwjr_id` and make use of the pyspark aggregation
    # functions such as `pyspark.sql.functions.sum` similar to below.

    # Phase 2: Aggregation over jobs

    # Group jobs by scope
    # TODO: Explore if this is a performance bottleneck since everything
    #       is processed on one node. An approach based on a `reduce` function
    #       may be more feasible. That said, the `groupBy` is exactly
    #       the functionality we want to achieve and is pyspark-native,
    #       so I believe we should test this first and see if it really
    #       leads to any problems.
    scopes = jobs.groupBy([
        'start_date',
        'end_date',
        'timeframe_precision',
        'jobstate',
        'host',
        'jobtype',
        'site',
        'workflow',
        'task',
        'acquisitionEra',
        'exitCode',
        'exitStep',
    ])

    # Perform the aggregation over the grouped jobs
    stats = scopes.agg(*(
        [
            count('jobstate').alias('count')
        ] + [
            # TODO: Specify all aggregation keys here by reading the `performance` schema
            #       to take the average over all jobs.
            # avg(aggregation_key) for aggregation_key in aggregation_keys
        ]
    )).collect()

    # TODO: Reshape, so that the grouped-by keys are shifted into a `scope` dictionary
    #       and the aggregated performance metrics are shifted into a `performance`
    #       dictionary, to finally achieve the data structure detailed in
    #       https://github.com/knly/WMArchiveAggregation
    stats = [row.asDict() for row in stats]

    print("Aggregation finished in {} seconds.".format(time.time() - start_time))
    # print("Result of aggregation: {}".format(stats))
    return stats
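# Not part of the original: a minimal sketch of the "Phase 1" step aggregation described in
# the TODO above, assuming `fwjr_df` as loaded above, that it carries `meta_data.fwjr_id`,
# and that the metric names (only `performance.cpu.TotalJobTime` is shown here) would in
# practice be read from the `performance` schema.
from pyspark.sql.functions import col, explode, sum as sql_sum

step_perf = fwjr_df.select(
    col('meta_data.fwjr_id').alias('fwjr_id'),
    explode('steps').alias('step'),
)
job_perf = step_perf.groupBy('fwjr_id').agg(
    sql_sum('step.performance.cpu.TotalJobTime').alias('TotalJobTime'),
)
# `job_perf` could then be joined back onto `jobs` by `fwjr_id` before the Phase 2 groupBy.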
def filter_retweeted(tweet):
    json_tweet = json.loads(tweet)
    if 'retweet_count' in json_tweet:
        text = json_tweet["text"]
        RT_text = text[:2]
        if json_tweet['retweet_count'] > 0 or RT_text == "RT":
            return True
    return False


def print_result(**kwargs):
    print("---------------------", kwargs["source_type"], "--------------")
    print("All", kwargs["all"].value)
    print("Retweeted", kwargs["retweeted"].value)


conf = SparkConf().setAppName("Twitter tweets listener").setMaster('local[2]')
sparkContext = SparkContext(conf=conf)
sparkContext.setLogLevel("ERROR")
streamingContext = StreamingContext(sparkContext, 1)

android_count = sparkContext.accumulator(0)
iphone_count = sparkContext.accumulator(0)
android_retweeted_count = sparkContext.accumulator(0)
iphone_retweeted_count = sparkContext.accumulator(0)

dstream = streamingContext.socketTextStream(HOST, PORT)
sampled = dstream.transform(samping_function)
json_objects = sampled.filter(lambda input: filter_tweets_source(input))
filtered = json_objects.filter(lambda input: filter_retweeted(input))
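# Not in the original excerpt: the stream still has to be started for any of the above to
# run. A minimal sketch, assuming the per-device counting is attached elsewhere in the full
# script via foreachRDD before print_result is called with the accumulators.
streamingContext.start()
streamingContext.awaitTermination(60)

print_result(source_type="Android", all=android_count, retweeted=android_retweeted_count)
print_result(source_type="iPhone", all=iphone_count, retweeted=iphone_retweeted_count)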
s_logger = logging.getLogger('py4j.java_gateway')
s_logger.setLevel(logging.ERROR)

# pip install graphframes
# os.environ["PYSPARK_SUBMIT_ARGS"] = ("--packages graphframes:graphframes:0.6.0-spark2.3-s_2.11")

scConf = pyspark.SparkConf() \
    .setAppName('hw4') \
    .setMaster('local[3]')
sc = SparkContext(conf=scConf)
# sc = SparkContext('local[*]', 'task1')
# sc = SparkContext.getOrCreate()
# sqlContext = SQLContext(sc)
sc.setLogLevel('ERROR')

N = 7
# N = int(sys.argv[1])
input_file_path = '../../PycharmProjects/553hw4/ub_sample_data.csv'
# input_file_path = sys.argv[2]
textRDD = sc.textFile(input_file_path).persist()
output_file_path = '../../PycharmProjects/553hw4/betweenness.txt'
# output_file_path = sys.argv[3]
output_file_path2 = '../../PycharmProjects/553hw4/community.txt'
# output_file_path2 = sys.argv[4]

user_id_list = textRDD.map(lambda line: line.split(",")).filter(
def __init__(self, input, output):
    conf = SparkConf().setMaster('local').setAppName('URI-MapReduce')
    sc = SparkContext(conf=conf)
    sc.setLogLevel("WARN")
    self.input_rdd = sc.textFile(input)
    self.output = output
# WARNING: This code was developed on a Python 2.7 and spark-1.5.0
# build and may not run as expected on other configurations.
#
######################

import re
import math
from scipy.stats import poisson
import time
import sys, getopt
import os

# Initialize Spark
from pyspark import SparkContext
sc = SparkContext()
sc.setLogLevel('ERROR')

######################
#
# Submission by Gioia Dominedo (Harvard ID: 40966234) for
# CS 205 - Computing Foundations for Computational Science
#
# This is part of a joint project with Kendrick Lo that includes a
# separate component for word-level checking. This script includes
# one of three SPARK implementations for context-level spell-checking
# adapted from third party algorithms (Symspell and Viterbi algorithms).
#
# The following were also used as references:
# Peter Norvig, How to Write a Spelling Corrector
# (http://norvig.com/spell-correct.html)
# Peter Norvig, Natural Language Corpus Data: Beautiful Data
# number of distinct elements
expectedEstimate = 2 ** numHashes

random.seed(SEED)
a = random.choices([x for x in range(1000, 30000) if isPrime(x)], k=numHashes + 1)
# print("a: ", a)
b = random.choices([x for x in range(1000, 30000) if isPrime(x)], k=numHashes + 1)
# print("b: ", b)

# Create a local StreamingContext with two working threads and batch interval of 1 second
# SparkContext.setSystemProperty('spark.executor.memory', '4g')
# SparkContext.setSystemProperty('spark.driver.memory', '4g')
sc = SparkContext("local[*]", "countDistinctCity")
sc.setLogLevel(logLevel="OFF")

# batch interval
ssc = StreamingContext(sc, batch_size)

outputFile = open(output_file_path, "w", encoding="utf-8")
out = "Time,Ground Truth,Estimation" + "\n"
outputFile.write(out)

# Create a Data Stream that connects to localhost:9999
dataRDD = ssc.socketTextStream("localhost", port_number)

# modify to obtain state of incoming business, then apply bloom filter
resultRDD = dataRDD.map(json.loads).map(lambda x: x['city'])\
    .window(windowDuration=window_length, slideDuration=sliding_interval)\
    .foreachRDD(FMAlgo)
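# Neither isPrime nor FMAlgo is defined in this excerpt. Below is a minimal sketch of both,
# assuming a textbook Flajolet-Martin estimator: hash each city with (a*x + b) mod m, track
# the longest run of trailing zero bits per hash function, and estimate 2^r. The grouping of
# hash functions, the modulus, and the CSV line written to `outputFile` are assumptions, not
# the original implementation; `a`, `b`, `numHashes` and `outputFile` come from the code above.
import time
import binascii
import statistics

def isPrime(n):
    if n < 2:
        return False
    return all(n % d for d in range(2, int(n ** 0.5) + 1))

def FMAlgo(rdd):
    cities = [c for c in rdd.distinct().collect() if c]
    estimates = []
    for i in range(numHashes):
        longest_run = 0
        for city in cities:
            x = int(binascii.hexlify(city.encode("utf-8")), 16)
            h = (a[i] * x + b[i]) % (2 ** 16)
            run = 0 if h == 0 else len(bin(h)) - len(bin(h).rstrip("0"))
            longest_run = max(longest_run, run)
        estimates.append(2 ** longest_run)
    estimate = int(statistics.median(estimates))
    outputFile.write("{},{},{}\n".format(time.time(), len(set(cities)), estimate))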
parser.add_argument('-k', dest='kmercount', type=int)
args = parser.parse_args()
k = args.kmercount

def get_samples(filename):
    samples = []
    for sample in open(filename).readlines():
        samples.append(sample.strip()[:-4])
    return samples

samples = get_samples('/root/istc_oceanography/metadata/valid_samples_GA02_filenames.csv')
samples += get_samples('/root/istc_oceanography/metadata/valid_samples_GA03_filenames.csv')

master_url = open("/root/spark-ec2/cluster-url").read().strip()
context = SparkContext(master_url)
context.setLogLevel("WARN")
sqlcontext = SQLContext(context)

def extract_kmers(r):
    for i in range(0, len(r.seq) - k + 1):
        yield r.seq[i:i + k]

for sample_name in samples:
    sample_filename = "s3n://helgag/ocean_metagenome/overlapped/{sample_name}.csv".format(sample_name=sample_name)
    customSchema = StructType([
        StructField("id", StringType(), True),
        StructField("seq", StringType(), True)])
    sample = sqlcontext.read.format('com.databricks.spark.csv').options(header='true').load(sample_filename, schema=customSchema).repartition(80)
    sample = sample.flatMap(extract_kmers).map(Row("kmer")).toDF().groupBy("kmer").agg(count("*"))
    # Toggle comment the following to export the data
    sample.registerTempTable(sample_name + "_count")
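# Not in the original excerpt: once a per-sample table is registered, its k-mer counts can
# be inspected with Spark SQL. A minimal sketch; the table name simply follows the
# `<sample_name>_count` pattern used above.
preview = sqlcontext.sql("SELECT * FROM {table} LIMIT 10".format(table=samples[0] + "_count"))
preview.show()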
import re
from pyspark import SparkContext
from datetime import datetime
import matplotlib
import matplotlib.pyplot as plot
import numpy as np
import zipfile

sparkContext = SparkContext()
sparkContext.setLogLevel('error')

# numbers.txt
startTime = datetime.now()
file = sparkContext.textFile("Numbers.zip/numbers.txt")
numbers = file.flatMap(lambda line: line.split(" "))

print('#####################################################################')
print('############################# OUTPUT ##############################')
print('[NUMBERS.TXT] [MEAN] :' + str(numbers.map(lambda num: float(num)).mean()))
print('[NUMBERS.TXT] [STDEV] :' + str(numbers.map(lambda num: float(num)).stdev()))
print('[NUMBERS.TXT] [VARIANCE] :' + str(numbers.map(lambda num: float(num)).variance()))
endTime = datetime.now()
duration = endTime - startTime

# numbers2.txt
startTime2 = datetime.now()
file2 = sparkContext.textFile("numbers2.txt")
numbers2 = file2.flatMap(lambda line: line.split(" "))

print('#####################################################################')
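# Not in the original excerpt, which is cut off here: presumably the same statistics are
# printed for numbers2.txt, mirroring the block above. A minimal sketch of that continuation:
print('[NUMBERS2.TXT] [MEAN] :' + str(numbers2.map(lambda num: float(num)).mean()))
print('[NUMBERS2.TXT] [STDEV] :' + str(numbers2.map(lambda num: float(num)).stdev()))
print('[NUMBERS2.TXT] [VARIANCE] :' + str(numbers2.map(lambda num: float(num)).variance()))
endTime2 = datetime.now()
duration2 = endTime2 - startTime2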
def main():
    parser = argparse.ArgumentParser(description="sparK-mer")
    parser.add_argument("-N", metavar="INT", help="Number of nodes to use [%(default)s]", default=19, type=int)
    parser.add_argument("-C", metavar="INT", help="Cores per node [%(default)s]", default=24, type=int)
    parser.add_argument("-E", metavar="INT", help="Cores per executor [%(default)s]", default=4, type=int)
    parser.add_argument("-M", metavar="STR", help="Namenode", default="c252-104", type=str)
    parser.add_argument("-L", metavar="STR", help="Log level", default="WARN", type=str)
    parser.add_argument("-K", metavar="INT", help="k-mer size [%(default)s]", default=15, type=int)
    parser.add_argument("-v", action="store_true", help="Verbose output")
    args = parser.parse_args()
    executorInstances = args.N * args.C / args.E

    # Explicitly set the storage level
    #StorageLevel(True, True, False, True, 1)

    # Set up spark configuration
    conf = SparkConf().setMaster("yarn-client").setAppName("sparK-mer")
    #conf = SparkConf().setMaster("local[16]").setAppName("sparK-mer")
    conf.set("yarn.nodemanager.resource.cpu_vcores", args.C)
    # Saturate with executors
    conf.set("spark.executor.instances", executorInstances)
    conf.set("spark.executor.heartbeatInterval", "5s")
    # cores per executor
    conf.set("spark.executor.cores", args.E)
    # set driver cores
    conf.set("spark.driver.cores", 12)
    # Number of akka threads
    conf.set("spark.akka.threads", 256)
    # Aggregation worker memory
    conf.set("spark.python.worker.memory", "5g")
    # Maximum message size in MB
    conf.set("spark.akka.frameSize", "128")
    conf.set("spark.akka.timeout", "200s")
    conf.set("spark.akka.heartbeat.interval", "10s")
    #conf.set("spark.broadcast.blockSize","128m")
    conf.set("spark.driver.maxResultSize", "20g")
    conf.set("spark.reducer.maxSizeInFlight", "5g")
    conf.set("spark.executor.memory", "7g")
    #conf.set("spark.shuffle.memoryFraction",0.4)
    #conf.set("spark.storage.memoryFraction",0.3)
    #conf.set("spark.storage.unrollFraction",0.3)
    #conf.set("spark.storage.memoryMapThreshold","256m")
    #conf.set("spark.kryoserializer.buffer.max","1g")
    #conf.set("spark.kryoserializer.buffer","128m")
    #conf.set("spark.core.connection.ack.wait.timeout","600")
    #conf.set("spark.shuffle.consolidateFiles","true")
    #conf.set("spark.shuffle.file.buffer","32m")
    conf.set("spark.shuffle.manager", "sort")
    conf.set("spark.shuffle.spill", "true")

    # Set up Spark Context
    sc = SparkContext("", "", conf=conf)
    sc.setLogLevel(args.L)

    # Process DB
    #frequencyProfile = generateFP(sc, args.K, "hdfs://c252-104/user/gzynda/random_20", args.v)
    fpStart = time.time()
    frequencyProfile = generateFP(sc, args.K, "/user/gzynda/library", args.v)
    frequencyProfile.cache()
    nGenomes = frequencyProfile.count()
    fpSecs = time.time() - fpStart
    print "############################################"
    print "Counted %i genomes in %.2f seconds" % (nGenomes, fpSecs)
    print "############################################"

    # Parse FQ
    fqStart = time.time()
    fqFrequency = parseFQ(sc, args.K, "/user/gzynda/reads/HiSeq_accuracy.fq", args.v)
    fqFrequency.cache()
    nReads = fqFrequency.count()
    fqSecs = time.time() - fqStart
    print "############################################"
    print "Parsed %i reads in %.2f seconds" % (nReads, fqSecs)
    print "############################################"

    # Classify reads
    classStart = time.time()
    #classify(sc, fqFrequency, frequencyProfile, args.v)
    nReads = setClassify(sc, fqFrequency, frequencyProfile, args.v)
    classSecs = time.time() - classStart
    print "############################################"
    print "Classified %i reads in %.2f seconds" % (nReads, classSecs)
    print "Ran on %i executor instances" % (executorInstances)
    print "K = %i" % (args.K)
    print "############################################"
    sys.exit()