def get_spark(self):
    if self._spark_context is None:
        ans = self.supervisor.request('get_spark')
        if 'error' in ans:
            raise ContextError(ans['error'])
        if self.verbose:
            print("get_spark answer from supervisor:\n" + json.dumps(ans, indent=4))
        # path to spark files
        spark_path = ans['path']
        # the dictionary stored in ans['config'] is given to SparkConf directly,
        # e.g. { "spark.master": "local[5]", "spark.app.name": "testapp" }
        spark_config = ans['config']
        import findspark
        findspark.init(spark_path)
        import pyspark
        import pymongo_spark
        pymongo_spark.activate()
        conf = pyspark.SparkConf()
        conf.setAll(spark_config.items())
        self._spark_context = pyspark.SparkContext(conf=conf)
    return self._spark_context
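# For reference, a minimal sketch of the supervisor answer that get_spark() above
# expects; the field names follow the code, the concrete values are illustrative only.
ans = {
    "path": "/opt/spark",        # handed to findspark.init()
    "config": {                  # handed to SparkConf.setAll()
        "spark.master": "local[5]",
        "spark.app.name": "testapp",
    },
}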
def start_spark(self, spark_conf=None, executor_memory=None, profiling=False,
                graphframes_package='graphframes:graphframes:0.3.0-spark2.0-s_2.11',
                extra_conf=None):
    """Launch a SparkContext

    Parameters
    ----------
    spark_conf : path
        path to a spark configuration directory
    executor_memory : string
        executor memory in java memory string format, e.g. '4G'
        If `None`, `memory_per_executor` is used.
    profiling : boolean
        whether to turn on python profiling or not
    graphframes_package : string
        which graphframes package to load - if it isn't found, spark will
        attempt to download it
    extra_conf : dict
        additional configuration options
    """
    os.environ['PYSPARK_SUBMIT_ARGS'] = "--packages {graphframes_package} pyspark-shell" \
        .format(graphframes_package=graphframes_package)

    if spark_conf is None:
        spark_conf = os.path.join(os.environ['SPARK_HOME'], 'conf')

    os.environ['SPARK_CONF_DIR'] = os.path.realpath(spark_conf)
    os.environ['PYSPARK_PYTHON'] = sys.executable

    try:
        import findspark
        findspark.init()
        from pyspark import SparkContext, SparkConf
    except ImportError:
        raise ImportError("Unable to find pyspark -- are you sure SPARK_HOME is set?")

    conf = SparkConf()
    conf.set('spark.driver.maxResultSize', '0')
    if executor_memory is None:
        executor_memory = '%dM' % self.memory_per_executor
    conf.set('spark.executor.memory', executor_memory)

    if profiling:
        conf.set('spark.python.profile', 'true')
    else:
        conf.set('spark.python.profile', 'false')

    if extra_conf is not None:
        for k, v in extra_conf.items():
            conf.set(k, v)

    sc = SparkContext(master=self.master_url(), conf=conf)
    return sc
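# A minimal usage sketch of start_spark(), assuming it lives on a cluster-helper
# object (here hypothetically named `cluster`) that supplies memory_per_executor
# and master_url(); the extra_conf key is only an example.
sc = cluster.start_spark(
    executor_memory='4G',
    profiling=False,
    extra_conf={'spark.sql.shuffle.partitions': '64'},
)
print(sc.parallelize(range(10)).sum())  # quick smoke test
sc.stop()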
def table_schema_from_spark(hcat_table_name):
    # returns the schema of the table with this database.name in HCatalog
    # (Spark workaround as long as the HCat web API is not available...)

    # initialize spark
    import findspark
    findspark.init()
    import pyspark
    from pyspark.sql import HiveContext

    sc_conf = pyspark.SparkConf()
    # sc_conf.set('spark.executor.extraClassPath', '/opt/cloudera/parcels/CDH/lib/hive/lib/*')
    # sc_conf.set('spark.master', 'yarn-client')
    sc = pyspark.SparkContext(appName='ade_get_table_schema', conf=sc_conf)
    hc = HiveContext(sc)

    hive_schema = hc.table(hcat_table_name).schema.jsonValue()
    print(hive_schema)
    sc.stop()

    table_schema = {'columns': {}}
    col_sequence = 0
    for field in hive_schema['fields']:
        table_schema['columns'][field['name']] = {
            'col_sequence': col_sequence,
            'type': field['type'],
        }
        col_sequence += 1
    return table_schema
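# Illustrative call to table_schema_from_spark(); the table name is an assumption,
# and the returned dictionary has the shape built above.
schema = table_schema_from_spark('default.example_table')
# schema == {'columns': {'id':   {'col_sequence': 0, 'type': 'bigint'},
#                        'name': {'col_sequence': 1, 'type': 'string'}, ...}}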
def main(iso_date, base_path):
    APP_NAME = "load_prediction_results.py"

    # If there is no SparkSession, create the environment
    try:
        sc and spark
    except NameError as e:
        import findspark
        findspark.init()
        import pyspark
        import pyspark.sql

        sc = pyspark.SparkContext()
        spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate()

    # Get today's date as an iso string to scope the query
    today_dt = iso8601.parse_date(iso_date)
    rounded_today = today_dt.date()
    iso_today = rounded_today.isoformat()

    input_path = "{}/data/prediction_results_daily.json/{}".format(
        base_path, iso_today
    )

    # Load and JSONize text
    prediction_results_raw = sc.textFile(input_path)
    prediction_results = prediction_results_raw.map(json_util.loads)

    # Store to MongoDB
    prediction_results.saveToMongoDB(
        "mongodb://localhost:27017/agile_data_science.prediction_results"
    )
def main(iso_date, base_path):
    APP_NAME = "pyspark_task_one.py"

    # If there is no SparkSession, create the environment
    try:
        sc and spark
    except NameError as e:
        import findspark
        findspark.init()
        import pyspark
        import pyspark.sql

        sc = pyspark.SparkContext()
        spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate()

    # Get today's date
    today_dt = iso8601.parse_date(iso_date)
    rounded_today = today_dt.date()

    # Load today's data
    today_input_path = "{}/ch02/data/example_name_titles_daily.json/{}".format(
        base_path, rounded_today.isoformat()
    )

    # Otherwise load the data and proceed...
    people_titles = spark.read.json(today_input_path)
    people_titles.show()

    # Group by name as an RDD
    titles_by_name = people_titles.rdd.groupBy(lambda x: x["name"])

    # Accept the group key/grouped data and concatenate the various titles...
    # into a master title
    def concatenate_titles(people_titles):
        name = people_titles[0]
        title_records = people_titles[1]
        master_title = ""
        for title_record in sorted(title_records):
            title = title_record["title"]
            master_title += "{}, ".format(title)
        master_title = master_title[:-2]
        record = {"name": name, "master_title": master_title}
        return record

    people_with_concatenated_titles = titles_by_name.map(concatenate_titles)
    people_output_json = people_with_concatenated_titles.map(json.dumps)

    # Get today's output path
    today_output_path = "{}/ch02/data/example_master_titles_daily.json/{}".format(
        base_path, rounded_today.isoformat()
    )

    # Write/replace today's output path
    os.system("rm -rf {}".format(today_output_path))
    people_output_json.saveAsTextFile(today_output_path)
def add_pyspark_path_if_needed():
    """Add PySpark to the library path based on the value of SPARK_HOME
    if pyspark is not already in our path."""
    try:
        from pyspark import context
    except ImportError:
        # We need to add PySpark; try findspark if we can, but it has an
        # undeclared IPython dep.
        try:
            import findspark
            findspark.init()
        except ImportError:
            add_pyspark_path()
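# For context, a fallback like add_pyspark_path() typically puts SPARK_HOME's Python
# directories on sys.path by hand. A minimal sketch of that idea (not this project's
# actual implementation):
import glob
import os
import sys


def add_pyspark_path_sketch():
    """Illustrative only: add SPARK_HOME's python/ dir and the py4j zip to sys.path."""
    spark_home = os.environ['SPARK_HOME']
    sys.path.insert(0, os.path.join(spark_home, 'python'))
    py4j_zips = glob.glob(os.path.join(spark_home, 'python', 'lib', 'py4j-*-src.zip'))
    if py4j_zips:
        sys.path.insert(0, py4j_zips[0])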
# Imports assumed by this snippet (GeoBase comes from the GeoBases package).
import findspark
import pandas as pd
import pyspark
from pyspark.sql import SQLContext
from GeoBases import GeoBase


def get_task(n):
    findspark.init()
    sc = pyspark.SparkContext()
    sqlContext = SQLContext(sc)
    raw_bookings = sqlContext.read \
        .format('com.databricks.spark.csv') \
        .options(header='true', delimiter='^', inferSchema='true') \
        .load('bookings.csv')
    arr_port_by_pass = raw_bookings.select(['arr_port', 'pax']) \
        .groupby('arr_port').sum('pax').orderBy('sum(pax)', ascending=0)
    df = pd.DataFrame(data=arr_port_by_pass.collect()[:n],
                      columns=['arr_port', 'num_pass'])
    geo_o = GeoBase(data='ori_por', verbose=False)
    df['arr_port_name'] = df['arr_port'].map(
        lambda x: geo_o.get(str(x).replace(' ', ''), 'name'))
    json_st = df.to_json()
    return json_st
def getCount(logFile):
    import findspark
    findspark.init()
    from pyspark import SparkContext

    # logFile = "declaration.txt"  # Should be some file on your system
    sc = SparkContext("local", "Simple App")
    logData = sc.textFile(logFile).cache()
    numAs = logData.filter(lambda s: 'a' in s).count()
    numBs = logData.filter(lambda s: 'b' in s).count()

    answer = dict()
    answer['aCount'] = numAs
    answer['bCount'] = numBs
    sc.stop()

    import json
    return json.dumps(answer)
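# Illustrative call to getCount(), assuming a small local text file exists at this path.
counts_json = getCount("declaration.txt")
print(counts_json)  # e.g. '{"aCount": 123, "bCount": 45}'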
def get_spark_context(appName='worm'):
    '''finds your local spark distribution and returns the pyspark.SparkContext

    Examples
    --------
    >>> sc = get_spark_context()
    >>> rdd = sc.textFile('README.md')
    >>> rdd.collect()'''
    try:
        import findspark
        findspark.init()
        import pyspark
        sc = pyspark.SparkContext(appName=appName)
    except Exception as e:
        sc = None
        print(e)
    return sc
def main():
    import findspark
    findspark.init()
    import pyspark
    sc = pyspark.SparkContext()

    # Mute spark logging
    apache_logger = sc._jvm.org.apache.log4j
    apache_logger.LogManager.getLogger("org").setLevel(apache_logger.Level.ERROR)
    apache_logger.LogManager.getLogger("akka").setLevel(apache_logger.Level.ERROR)

    # Create RDD
    textFile = sc.textFile("data/train.txt")
    splitRDD = textFile.map(lambda w: w.split('\t'))
    floatRDD = splitRDD.map(lambda w: [float(x) for x in w[0:-1]])

    means = KMeans()
    means.train(floatRDD, k=3, runs=10, max_iterations=10)
def _init_spark(self):
    """Initializes spark so that pyspark is importable. This also sets up
    the required environment variables.
    """
    global _SPARK_INITIALIZED
    spark_home = self.spark_home
    python_path = self._python_path
    if use_findspark:
        if _SPARK_INITIALIZED:
            if spark_home == os.environ["SPARK_HOME"]:
                # matches with already initialized
                pass
            else:
                # findspark adds two paths to the search path.
                sys.path.pop(0)
                sys.path.pop(0)
                findspark.init(spark_home=spark_home, edit_rc=False,
                               edit_profile=False, python_path=python_path)
        else:
            findspark.init(spark_home=spark_home, edit_rc=False,
                           edit_profile=False, python_path=python_path)
        _SPARK_INITIALIZED = True
    self._set_environment_variables()
def main(base_path):
    APP_NAME = "make_predictions_streaming.py"

    # Process data every 10 seconds
    PERIOD = 10
    BROKERS = 'localhost:9092'
    PREDICTION_TOPIC = 'flight_delay_classification_request'

    try:
        sc and ssc
    except NameError as e:
        import findspark

        # Add the streaming package and initialize
        findspark.add_packages(["org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0"])
        findspark.init()

        import pyspark
        import pyspark.sql
        import pyspark.streaming

        conf = SparkConf().set("spark.default.parallelism", 1)
        sc = SparkContext(
            appName="Agile Data Science: PySpark Streaming 'Hello, World!'",
            conf=conf
        )
        ssc = StreamingContext(sc, PERIOD)
        spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate()

    #
    # Load all models to be used in making predictions
    #

    # Load the arrival delay bucketizer
    from pyspark.ml.feature import Bucketizer
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(base_path)
    arrival_bucketizer = Bucketizer.load(arrival_bucketizer_path)

    # Load all the string field vectorizer pipelines into a dict
    from pyspark.ml.feature import StringIndexerModel

    string_indexer_models = {}
    for column in ["Carrier", "Origin", "Dest", "Route"]:
        string_indexer_model_path = "{}/models/string_indexer_model_{}.bin".format(
            base_path, column
        )
        string_indexer_model = StringIndexerModel.load(string_indexer_model_path)
        string_indexer_models[column] = string_indexer_model

    # Load the numeric vector assembler
    from pyspark.ml.feature import VectorAssembler
    vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(base_path)
    vector_assembler = VectorAssembler.load(vector_assembler_path)

    # Load the classifier model
    from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
    random_forest_model_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
        base_path
    )
    rfc = RandomForestClassificationModel.load(
        random_forest_model_path
    )

    #
    # Process Prediction Requests in Streaming
    #
    stream = KafkaUtils.createDirectStream(
        ssc,
        [PREDICTION_TOPIC],
        {
            "metadata.broker.list": BROKERS,
            "group.id": "0",
        }
    )

    object_stream = stream.map(lambda x: json.loads(x[1]))
    object_stream.pprint()

    row_stream = object_stream.map(
        lambda x: Row(
            FlightDate=iso8601.parse_date(x['FlightDate']),
            Origin=x['Origin'],
            Distance=x['Distance'],
            DayOfMonth=x['DayOfMonth'],
            DayOfYear=x['DayOfYear'],
            UUID=x['UUID'],
            DepDelay=x['DepDelay'],
            DayOfWeek=x['DayOfWeek'],
            FlightNum=x['FlightNum'],
            Dest=x['Dest'],
            Timestamp=iso8601.parse_date(x['Timestamp']),
            Carrier=x['Carrier']
        )
    )
    row_stream.pprint()

    #
    # Create a dataframe from the RDD-based object stream
    #
    def classify_prediction_requests(rdd):
        from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType, TimestampType
        from pyspark.sql.types import StructType, StructField

        prediction_request_schema = StructType([
            StructField("Carrier", StringType(), True),
            StructField("DayOfMonth", IntegerType(), True),
            StructField("DayOfWeek", IntegerType(), True),
            StructField("DayOfYear", IntegerType(), True),
            StructField("DepDelay", DoubleType(), True),
            StructField("Dest", StringType(), True),
            StructField("Distance", DoubleType(), True),
            StructField("FlightDate", DateType(), True),
            StructField("FlightNum", StringType(), True),
            StructField("Origin", StringType(), True),
            StructField("Timestamp", TimestampType(), True),
            StructField("UUID", StringType(), True),
        ])

        prediction_requests_df = spark.createDataFrame(rdd, schema=prediction_request_schema)
        prediction_requests_df.show()

        #
        # Add a Route variable to replace FlightNum
        #
        from pyspark.sql.functions import lit, concat

        prediction_requests_with_route = prediction_requests_df.withColumn(
            'Route',
            concat(
                prediction_requests_df.Origin,
                lit('-'),
                prediction_requests_df.Dest
            )
        )
        prediction_requests_with_route.show(6)

        # Vectorize string fields with the corresponding pipeline for that column
        # Turn category fields into categoric feature vectors, then drop intermediate fields
        for column in ["Carrier", "Origin", "Dest", "Route"]:
            string_indexer_model = string_indexer_models[column]
            prediction_requests_with_route = string_indexer_model.transform(prediction_requests_with_route)

        # Vectorize numeric columns: DepDelay, Distance and index columns
        final_vectorized_features = vector_assembler.transform(prediction_requests_with_route)

        # Inspect the vectors
        final_vectorized_features.show()

        # Drop the individual index columns
        index_columns = ["Carrier_index", "Origin_index", "Dest_index", "Route_index"]
        for column in index_columns:
            final_vectorized_features = final_vectorized_features.drop(column)

        # Inspect the finalized features
        final_vectorized_features.show()

        # Make the prediction
        predictions = rfc.transform(final_vectorized_features)

        # Drop the features vector and prediction metadata to give the original fields
        predictions = predictions.drop("Features_vec")
        final_predictions = predictions.drop("indices").drop("values").drop("rawPrediction").drop("probability")

        # Inspect the output
        final_predictions.show()

        # Store to Mongo
        if final_predictions.count() > 0:
            final_predictions.rdd.map(lambda x: x.asDict()).saveToMongoDB(
                "mongodb://localhost:27017/agile_data_science.flight_delay_classification_response"
            )

    # Do the classification and store to Mongo
    row_stream.foreachRDD(classify_prediction_requests)

    ssc.start()
    ssc.awaitTermination()
# Default partitioning: 100 partitions!
import matplotlib.pyplot as plt
import P2
import seaborn as sns
sns.set_context('poster', font_scale=1.25)

import findspark as fs
fs.init()
import pyspark as ps

import multiprocessing as mp
import numpy as np

######## Same as part A, just setting up ################

# Setup cluster, number of threads = 2x cores
config = ps.SparkConf()
config = config.setAppName('P2a')
sc = ps.SparkContext(conf=config)

# Do the computation
num_pixels = 2000

rows = sc.range(num_pixels, numSlices=10)
cols = sc.range(num_pixels, numSlices=10)
indices = rows.cartesian(cols)


def mandelbrot_wrapper(row, col):
from pyspark.sql import Row, SQLContext
from pyspark import SparkConf, SparkContext
from operator import add
import requests
import sys
from pyspark.streaming import StreamingContext
import findspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType
from pyspark.sql.functions import explode, split
from pyspark.sql.functions import col, desc, asc

findspark.init()

ssc = SparkSession \
    .builder \
    .appName("StructuredNetworkWordCount") \
    .getOrCreate()

schema = StructType().add("ID", "string").add("Lang", "string") \
    .add("Date", "string").add("Source", "string").add("Len", "string") \
    .add("Likes", "string").add("RTs", "string").add("Hashtags", "string") \
    .add("UserMentionName", "string").add("UserMentionID", "string").add("name", "string") \
    .add("Place", "string").add("Followers", "float").add("Friends", "float")

lines = ssc \
    .readStream \
    .format("csv") \
    .option("header", True) \
    .schema(schema) \
    .option("sep", ";") \
def main(base_path):
    APP_NAME = "train_spark_mllib_model.py"

    # If there is no SparkSession, create the environment
    try:
        sc and spark
    except NameError as e:
        import findspark
        findspark.init()
        import pyspark
        import pyspark.sql

        sc = pyspark.SparkContext()
        spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate()

    #
    # {
    #   "ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00","CRSDepTime":"2015-12-31T03:05:00.000-08:00",
    #   "Carrier":"WN","DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,"Dest":"SAN","Distance":368.0,
    #   "FlightDate":"2015-12-30T16:00:00.000-08:00","FlightNum":"6109","Origin":"TUS"
    # }
    #
    from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType, DateType, TimestampType
    from pyspark.sql.types import StructType, StructField
    from pyspark.sql.functions import udf

    schema = StructType([
        StructField("ArrDelay", DoubleType(), True),
        StructField("CRSArrTime", TimestampType(), True),
        StructField("CRSDepTime", TimestampType(), True),
        StructField("Carrier", StringType(), True),
        StructField("DayOfMonth", IntegerType(), True),
        StructField("DayOfWeek", IntegerType(), True),
        StructField("DayOfYear", IntegerType(), True),
        StructField("DepDelay", DoubleType(), True),
        StructField("Dest", StringType(), True),
        StructField("Distance", DoubleType(), True),
        StructField("FlightDate", DateType(), True),
        StructField("FlightNum", StringType(), True),
        StructField("Origin", StringType(), True),
        StructField("Route", StringType(), True),
        StructField("TailNum", StringType(), True),
        StructField("EngineManufacturer", StringType(), True),
        StructField("EngineModel", StringType(), True),
        StructField("Manufacturer", StringType(), True),
        StructField("ManufacturerYear", StringType(), True),
        StructField("OwnerState", StringType(), True),
    ])

    input_path = "{}/data/simple_flight_delay_features_airplanes.json".format(
        base_path
    )
    features = spark.read.json(input_path, schema=schema)
    features.first()

    #
    # Add the hour of day of scheduled arrival/departure
    #
    from pyspark.sql.functions import hour
    features_with_hour = features.withColumn(
        "CRSDepHourOfDay",
        hour(features.CRSDepTime)
    )
    features_with_hour = features_with_hour.withColumn(
        "CRSArrHourOfDay",
        hour(features.CRSArrTime)
    )
    features_with_hour.select("CRSDepTime", "CRSDepHourOfDay", "CRSArrTime", "CRSArrHourOfDay").show()

    #
    # Check for nulls in features before using Spark ML
    #
    null_counts = [
        (column, features_with_hour.where(features_with_hour[column].isNull()).count())
        for column in features_with_hour.columns
    ]
    cols_with_nulls = filter(lambda x: x[1] > 0, null_counts)
    print("\nNull Value Report")
    print("-----------------")
    print(tabulate(cols_with_nulls, headers=["Column", "Nulls"]))

    #
    # Use pyspark.ml.feature.Bucketizer to bucketize ArrDelay into on-time, slightly late, very late (0, 1, 2)
    #
    from pyspark.ml.feature import Bucketizer

    # Setup the Bucketizer
    splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
    arrival_bucketizer = Bucketizer(
        splits=splits,
        inputCol="ArrDelay",
        outputCol="ArrDelayBucket"
    )

    # Save the model
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(base_path)
    arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path)

    # Apply the model
    ml_bucketized_features = arrival_bucketizer.transform(features_with_hour)
    ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()

    #
    # Extract features with tools in pyspark.ml.feature
    #
    from pyspark.ml.feature import StringIndexer, VectorAssembler

    # Turn category fields into indexes
    string_columns = ["Carrier", "Origin", "Dest", "Route", "TailNum"]
    for column in string_columns:
        string_indexer = StringIndexer(
            inputCol=column,
            outputCol=column + "_index"
        )
        string_indexer_model = string_indexer.fit(ml_bucketized_features)
        ml_bucketized_features = string_indexer_model.transform(ml_bucketized_features)

        # Save the pipeline model
        string_indexer_output_path = "{}/models/string_indexer_model_4.0.{}.bin".format(
            base_path, column
        )
        string_indexer_model.write().overwrite().save(string_indexer_output_path)

    # Combine continuous, numeric fields with indexes of nominal ones
    # ...into one feature vector
    numeric_columns = [
        "DepDelay", "Distance", "DayOfYear",
        "CRSDepHourOfDay", "CRSArrHourOfDay"
    ]
    index_columns = [column + "_index" for column in string_columns]

    vector_assembler = VectorAssembler(
        inputCols=numeric_columns + index_columns,
        outputCol="Features_vec"
    )
    final_vectorized_features = vector_assembler.transform(ml_bucketized_features)

    # Save the numeric vector assembler
    vector_assembler_path = "{}/models/numeric_vector_assembler_5.0.bin".format(base_path)
    vector_assembler.write().overwrite().save(vector_assembler_path)

    # Drop the index columns
    for column in index_columns:
        final_vectorized_features = final_vectorized_features.drop(column)

    # Inspect the finalized features
    final_vectorized_features.show()

    #
    # Cross validate, train and evaluate classifier: loop over the test/train splits for 4 metrics
    #
    from collections import defaultdict
    scores = defaultdict(list)
    feature_importances = defaultdict(list)
    metric_names = ["accuracy", "weightedPrecision", "weightedRecall", "f1"]
    split_count = 3

    for i in range(1, split_count + 1):
        print("\nRun {} out of {} of test/train splits in cross validation...".format(
            i,
            split_count,
        ))

        # Test/train split
        training_data, test_data = final_vectorized_features.randomSplit([0.8, 0.2])

        # Instantiate and fit random forest classifier on all the data
        from pyspark.ml.classification import RandomForestClassifier
        rfc = RandomForestClassifier(
            featuresCol="Features_vec",
            labelCol="ArrDelayBucket",
            predictionCol="Prediction",
            maxBins=4896,
        )
        model = rfc.fit(training_data)

        # Save the new model over the old one
        model_output_path = "{}/models/spark_random_forest_classifier.flight_delays.baseline.bin".format(
            base_path
        )
        model.write().overwrite().save(model_output_path)

        # Evaluate model using test data
        predictions = model.transform(test_data)

        # Evaluate this split's results for each metric
        from pyspark.ml.evaluation import MulticlassClassificationEvaluator
        for metric_name in metric_names:
            evaluator = MulticlassClassificationEvaluator(
                labelCol="ArrDelayBucket",
                predictionCol="Prediction",
                metricName=metric_name
            )
            score = evaluator.evaluate(predictions)
            scores[metric_name].append(score)
            print("{} = {}".format(metric_name, score))

        #
        # Collect feature importances
        #
        feature_names = vector_assembler.getInputCols()
        feature_importance_list = model.featureImportances
        for feature_name, feature_importance in zip(feature_names, feature_importance_list):
            feature_importances[feature_name].append(feature_importance)

    #
    # Evaluate average and STD of each metric and print a table
    #
    import numpy as np
    score_averages = defaultdict(float)

    # Compute the table data
    average_stds = []  # ha
    for metric_name in metric_names:
        metric_scores = scores[metric_name]
        average_accuracy = sum(metric_scores) / len(metric_scores)
        score_averages[metric_name] = average_accuracy
        std_accuracy = np.std(metric_scores)
        average_stds.append((metric_name, average_accuracy, std_accuracy))

    # Print the table
    print("\nExperiment Log")
    print("--------------")
    print(tabulate(average_stds, headers=["Metric", "Average", "STD"]))

    #
    # Persist the score to a score log that exists between runs
    #
    import pickle

    # Load the score log or initialize an empty one
    try:
        score_log_filename = "{}/models/score_log.pickle".format(base_path)
        score_log = pickle.load(open(score_log_filename, "rb"))
        if not isinstance(score_log, list):
            score_log = []
    except IOError:
        score_log = []

    # Compute the existing score log entry
    score_log_entry = {
        metric_name: score_averages[metric_name] for metric_name in metric_names
    }

    # Compute and display the change in score for each metric
    try:
        last_log = score_log[-1]
    except (IndexError, TypeError, AttributeError):
        last_log = score_log_entry

    experiment_report = []
    for metric_name in metric_names:
        run_delta = score_log_entry[metric_name] - last_log[metric_name]
        experiment_report.append((metric_name, run_delta))

    print("\nExperiment Report")
    print("-----------------")
    print(tabulate(experiment_report, headers=["Metric", "Score"]))

    # Append the existing average scores to the log
    score_log.append(score_log_entry)

    # Persist the log for next run
    pickle.dump(score_log, open(score_log_filename, "wb"))

    #
    # Analyze and report feature importance changes
    #

    # Compute averages for each feature
    feature_importance_entry = defaultdict(float)
    for feature_name, value_list in feature_importances.items():
        average_importance = sum(value_list) / len(value_list)
        feature_importance_entry[feature_name] = average_importance

    # Sort the feature importances in descending order and print
    import operator
    sorted_feature_importances = sorted(
        feature_importance_entry.items(),
        key=operator.itemgetter(1),
        reverse=True
    )

    print("\nFeature Importances")
    print("-------------------")
    print(tabulate(sorted_feature_importances, headers=['Name', 'Importance']))

    #
    # Compare this run's feature importances with the previous run's
    #

    # Load the feature importance log or initialize an empty one
    try:
        feature_log_filename = "{}/models/feature_log.pickle".format(base_path)
        feature_log = pickle.load(open(feature_log_filename, "rb"))
        if not isinstance(feature_log, list):
            feature_log = []
    except IOError:
        feature_log = []

    # Compute and display the change in score for each feature
    try:
        last_feature_log = feature_log[-1]
    except (IndexError, TypeError, AttributeError):
        last_feature_log = defaultdict(float)
        for feature_name, importance in feature_importance_entry.items():
            last_feature_log[feature_name] = importance

    # Compute the deltas
    feature_deltas = {}
    for feature_name in feature_importances.keys():
        run_delta = feature_importance_entry[feature_name] - last_feature_log[feature_name]
        feature_deltas[feature_name] = run_delta

    # Sort feature deltas, biggest change first
    import operator
    sorted_feature_deltas = sorted(
        feature_deltas.items(),
        key=operator.itemgetter(1),
        reverse=True
    )

    # Display sorted feature deltas
    print("\nFeature Importance Delta Report")
    print("-------------------------------")
    print(tabulate(sorted_feature_deltas, headers=["Feature", "Delta"]))

    # Append the existing average deltas to the log
    feature_log.append(feature_importance_entry)

    # Persist the log for next run
    pickle.dump(feature_log, open(feature_log_filename, "wb"))
import subprocess
import sys

import findspark
findspark.init('/usr/hdp/current/spark2-client')

import pyspark
from pyspark.sql.functions import lit, col, instr, expr, pow, round, bround, corr, count, mean, stddev_pop, min, max
from pyspark.sql.functions import monotonically_increasing_id, initcap, lower, upper, ltrim, rtrim, rpad, lpad, trim
from pyspark.sql.functions import regexp_replace, translate, regexp_extract, current_date, current_timestamp, struct
from pyspark.sql.functions import date_add, date_sub, datediff, months_between, to_date, to_timestamp, coalesce, split, size
from pyspark.sql.functions import array_contains, explode, udf
from pyspark.sql import HiveContext
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import col, when
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, DoubleType, FloatType, LongType
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import desc
import pandas as pd
from datetime import date, timedelta, datetime


def get_Spark():
    conf = pyspark.SparkConf().setAll([
        ('spark.submit.deployMode', 'client'),  # deploy in yarn-client or yarn-cluster
        ('spark.executor.memory', '8g'),        # memory allocated for each executor
# Kinesis Data Generator: https://awslabs.github.io/amazon-kinesis-data-generator/
#
# After going through the setup, download this file as a .py file, go into the terminal,
# and run the following command:
# ```
# python3 pyspark-streaming/4_morestream/2_Integration_with_Kinesis_Demo.py
# ```
#
# ### Demo

# In[ ]:

import findspark

# TODO: your path will likely not have 'matthew' in it. Change it to reflect your path.
findspark.init('/home/matthew/spark-2.3.0-bin-hadoop2.7')

import os
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-streaming-kinesis-asl_2.11:2.3.0 pyspark-shell'

import sys
import json
import time

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kinesis import KinesisUtils, InitialPositionInStream

appName = "PythonKinesisApp"
sc = SparkContext(appName=appName)
ssc = StreamingContext(sc, 1)
            time_table[str(time_slot.time())][1] += int(line[2])

    # plotting
    time_sorted = sorted(time_table.items())
    y1 = []
    y2 = []
    x = []
    for i in range(len(time_sorted)):
        y1.append(time_sorted[i][1][0])
        y2.append(time_sorted[i][1][1])
        x.append(time_sorted[i][0])
    x_domain = [i for i in range(len(x))]
    plt.scatter(x_domain, y1, color='red')
    plt.scatter(x_domain, y2, color='blue')
    plt.legend(('west side', 'east side'), loc='best')
    plt.xlabel('Hour of Day')
    plt.ylabel('Amount of Bikes')
    plt.show()


if __name__ == '__main__':
    # setting spark_home
    findspark.init(spark_home)
    conf = SparkConf().setMaster('local').setAppName('Fremont Bridge Bike Analysis')
    sc = SparkContext(conf=conf)
    main(sc)
# ```
#
# Download this notebook as a .py file and run the following:
# ```
# python3 pyspark-streaming/4_morestream/1_Integration_with_Kafka_Demo.py
# ```
# Finally, start the producer:
# ```
# ~/kafka_2.11-0.11.0.0/bin/kafka-console-producer.sh --broker-list localhost:9092 --topic pyspark-kafka-demo
# ```
#
#

import findspark

# TODO: your path will likely not have 'matthew' in it. Change it to reflect your path.
findspark.init('/home/matthew/spark-2.3.0-bin-hadoop2.7')

import os
os.environ[
    'PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.3.0 pyspark-shell'

import sys
import time

from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

n_secs = 1
topic = "pyspark-kafka-demo"

conf = SparkConf().setAppName("KafkaStreamProcessor").setMaster("local[*]")
#!/usr/bin/env python
# coding: utf-8

# # [o3] - Proyecto Ozono - Predictor_v0

# # [0] - Initialization

# In[ ]:

import findspark
findspark.init('/home/rulicering/BigData/spark-2.4.5-bin-hadoop2.7')

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window
import pandas as pd
from pyspark.sql.types import StructField, StringType, IntegerType, StructType, FloatType
import re as reg
import numpy as np
import datetime

# MLlib
# from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator

# Aux
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
# Imports and running findspark
import findspark
import json
from datetime import datetime
from pyspark.sql import functions as F

findspark.init('/home/bigdata/spark-2.4.3-bin-hadoop2.7')

import pyspark
from pyspark import RDD
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark.sql import SparkSession
import json


# Spark context details
def process_stream(record, spark):
    columns = [
        "Station", "Date", "Last update", "Places", "Available_bikes",
        "Capacity", "Status", "Position"
    ]
    if not record.isEmpty():
        df = spark.createDataFrame(record, columns)
        df = df.filter(df["Station"] < 1000)  # drop station 1,033, which is not in Toulouse
        df.show()
        df.write.format('org.elasticsearch.spark.sql').mode(
            'overwrite'  # or .mode('append')
        ).option('es.nodes', 'localhost').option('es.port', 9200).option(
            'es.resource', '%s/%s' % ('velotoulouse-geo', '_doc'),
import findspark
from pyspark import SparkContext, SparkConf
import csv
from common.Utils import Utils

findspark.init(python_path='/Users/khwu/.virtualenvs/spark/bin/python3')


def load_post_code():
    with open('../../in/uk-postcode.csv') as f:
        reader = csv.reader(f)
        return {
            row[0]: row[7]
            for row in reader if not row[0].startswith('Postcode')
        }


def get_post_prefix(line: str):
    splits = Utils.COMMA_DELIMITER.split(line)
    post_code = splits[4]
    return post_code.split(' ')[0]


if __name__ == '__main__':
    conf = SparkConf().setAppName('ukpostcode').setMaster('local[*]')
    sc = SparkContext(conf=conf)
    sc.setLogLevel('ERROR')

    post_code = sc.broadcast(load_post_code())

    HEADER = 'Timestamp,Collected by,Name of makerspace'
import numpy as np
import pandas as pd
import twython
from twython import TwythonStreamer
import re
from requests_oauthlib import OAuth1
import urllib
import sys
import ast
import json

import findspark
findspark.init('/usr/local/Cellar/apache-spark/1.5.1/libexec')

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

import requests
import threading
import Queue
import time

APP_KEY = "bv6mnYBiFeEVKvPEZlg"
APP_SECRET = "nQZk9Ca8qqJxc1Za07WyW0VPZ6gtAUSF3oPD5sun0"
OAUTH_TOKEN = "606525030-ilOtJstbRvFCjUNMtOu8DP2HQKGWpQvmUsF6fblE"
OAUTH_TOKEN_SECRET = "xSVE47qVOFxxZm1oqKwL6zwLVMWpzxCUYGmLJ6CVHR0mZ"
def main():
    import findspark
    findspark.init()
    import pyspark
    sc = pyspark.SparkContext()

    # Mute spark logging
    apache_logger = sc._jvm.org.apache.log4j
    apache_logger.LogManager.getLogger("org").setLevel(apache_logger.Level.ERROR)
    apache_logger.LogManager.getLogger("akka").setLevel(apache_logger.Level.ERROR)

    # Create RDD
    floatRdd = sc.textFile("data/yeast_no_header_data.txt")\
        .map(lambda s: s.split('\t'))\
        .map(lambda s: list(map(float, s[1:])))
    floatRdd.cache()

    # reduce dimensions
    rowSize = len(floatRdd.first())

    def getDimset(max_index):
        workingDims = random.randint(2, max_index)
        indices = set()
        for _ in range(workingDims):
            indices.add(random.randint(0, max_index))
        return indices

    def mutateDimensionSet(rowSize, factor):
        def mutationImpl(dimset):
            currDimAmount = len(dimset)
            # print('currDimAmount:', currDimAmount)
            mutationMaxAmount = math.floor(currDimAmount * factor)
            # print('mutationMaxAmount:', mutationMaxAmount)
            mutation_diff = getDimset(mutationMaxAmount)
            # print('diff_set')
            # pp.pprint(mutation_diff, compact=True)
            mutation_sum = getDimset(mutationMaxAmount)
            # print('sumset')
            # pp.pprint(mutation_sum, compact=True)
            return (dimset - mutation_diff) | mutation_sum
        return mutationImpl

    mutations_amount = 5
    # relative amount of mutations
    factor = 0.2
    # Standard deviation
    standard_dev = 0.3
    # amount of centroids
    centroids_amount = 30
    # amount of centroid recalculations
    n = 2

    # get seed dimension set
    seedDimset = getDimset(rowSize)
    workingDimsAmount = len(seedDimset)
    print('Base dimSet:', seedDimset)
    pp.pprint(seedDimset, compact=True)

    mutator = mutateDimensionSet(rowSize, factor)
    dimSets = [seedDimset]
    for index in range(mutations_amount):
        curr_dimset = dimSets[-1]
        new_dimset = mutator(curr_dimset)
        print('Mutation {} changed: '.format(index))
        print('Added: ', end="")
        pp.pprint(new_dimset - curr_dimset, compact=True)
        print('Deleted: ', end="")
        pp.pprint((curr_dimset - new_dimset), compact=True)
        curr_dimset = new_dimset
        print('new_size:', len(curr_dimset))
        dimSets.append(curr_dimset)

    def workOnDimsetClosure(dimSets, floatRdd, centroids_amount, standard_dev, n):
        def closureImpl(index):
            return (index, workOnDimset(dimSets[index], floatRdd, centroids_amount,
                                        standard_dev, n))
        return closureImpl

    biclusteringWorker = workOnDimsetClosure(dimSets, floatRdd, centroids_amount,
                                             standard_dev, n)
    tpool = ThreadPool(processes=mutations_amount)
    results = tpool.map(biclusteringWorker, range(len(dimSets)))
    results.sort(key=lambda x: x[1][0])

    print('(Mutation index, (quality, rows amount, cols amount, rdd, centroid))')
    for result in results:
        print(result)
def main(base_path):
    # Default to "."
    try:
        base_path
    except NameError:
        base_path = "."
    if not base_path:
        base_path = "."

    APP_NAME = "train_spark_mllib_model.py"

    # If there is no SparkSession, create the environment
    try:
        sc and spark
    except (NameError, UnboundLocalError) as e:
        import findspark
        findspark.init()
        import pyspark
        import pyspark.sql

        sc = pyspark.SparkContext()
        spark = pyspark.sql.SparkSession(sc).builder.appName(
            APP_NAME).getOrCreate()

    #
    # {
    #   "ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00","CRSDepTime":"2015-12-31T03:05:00.000-08:00",
    #   "Carrier":"WN","DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,"Dest":"SAN","Distance":368.0,
    #   "FlightDate":"2015-12-30T16:00:00.000-08:00","FlightNum":"6109","Origin":"TUS"
    # }
    #
    from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType, DateType, TimestampType
    from pyspark.sql.types import StructType, StructField
    from pyspark.sql.functions import udf

    schema = StructType([
        StructField("ArrDelay", DoubleType(), True),       # "ArrDelay":5.0
        StructField("CRSArrTime", TimestampType(), True),  # "CRSArrTime":"2015-12-31T03:20:00.000-08:00"
        StructField("CRSDepTime", TimestampType(), True),  # "CRSDepTime":"2015-12-31T03:05:00.000-08:00"
        StructField("Carrier", StringType(), True),        # "Carrier":"WN"
        StructField("DayOfMonth", IntegerType(), True),    # "DayOfMonth":31
        StructField("DayOfWeek", IntegerType(), True),     # "DayOfWeek":4
        StructField("DayOfYear", IntegerType(), True),     # "DayOfYear":365
        StructField("DepDelay", DoubleType(), True),       # "DepDelay":14.0
        StructField("Dest", StringType(), True),           # "Dest":"SAN"
        StructField("Distance", DoubleType(), True),       # "Distance":368.0
        StructField("FlightDate", DateType(), True),       # "FlightDate":"2015-12-30T16:00:00.000-08:00"
        StructField("FlightNum", StringType(), True),      # "FlightNum":"6109"
        StructField("Origin", StringType(), True),         # "Origin":"TUS"
    ])

    input_path = "{}/data/simple_flight_delay_features.jsonl.bz2".format(
        base_path)
    features = spark.read.json(input_path, schema=schema)
    features.first()

    #
    # Check for nulls in features before using Spark ML
    #
    null_counts = [(column, features.where(features[column].isNull()).count())
                   for column in features.columns]
    cols_with_nulls = filter(lambda x: x[1] > 0, null_counts)
    print(list(cols_with_nulls))

    #
    # Add a Route variable to replace FlightNum
    #
    from pyspark.sql.functions import lit, concat
    features_with_route = features.withColumn(
        'Route',
        concat(features.Origin, lit('-'), features.Dest))
    features_with_route.show(6)

    #
    # Use pyspark.ml.feature.Bucketizer to bucketize ArrDelay into on-time, slightly late, very late (0, 1, 2)
    #
    from pyspark.ml.feature import Bucketizer

    # Setup the Bucketizer
    splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
    arrival_bucketizer = Bucketizer(splits=splits,
                                    inputCol="ArrDelay",
                                    outputCol="ArrDelayBucket")

    # Save the bucketizer
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(
        base_path)
    arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path)

    # Apply the bucketizer
    ml_bucketized_features = arrival_bucketizer.transform(features_with_route)
    ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()

    #
    # Extract features with tools in pyspark.ml.feature
    #
    from pyspark.ml.feature import StringIndexer, VectorAssembler

    # Turn category fields into indexes
    for column in ["Carrier", "Origin", "Dest", "Route"]:
        string_indexer = StringIndexer(inputCol=column,
                                       outputCol=column + "_index")
        string_indexer_model = string_indexer.fit(ml_bucketized_features)
        ml_bucketized_features = string_indexer_model.transform(
            ml_bucketized_features)

        # Drop the original column
        ml_bucketized_features = ml_bucketized_features.drop(column)

        # Save the pipeline model
        string_indexer_output_path = "{}/models/string_indexer_model_{}.bin".format(
            base_path, column)
        string_indexer_model.write().overwrite().save(
            string_indexer_output_path)

    # Combine continuous, numeric fields with indexes of nominal ones
    # ...into one feature vector
    numeric_columns = [
        "DepDelay", "Distance", "DayOfMonth", "DayOfWeek", "DayOfYear"
    ]
    index_columns = [
        "Carrier_index", "Origin_index", "Dest_index", "Route_index"
    ]
    vector_assembler = VectorAssembler(inputCols=numeric_columns + index_columns,
                                       outputCol="Features_vec")
    final_vectorized_features = vector_assembler.transform(
        ml_bucketized_features)

    # Save the numeric vector assembler
    vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(
        base_path)
    vector_assembler.write().overwrite().save(vector_assembler_path)

    # Drop the index columns
    for column in index_columns:
        final_vectorized_features = final_vectorized_features.drop(column)

    # Inspect the finalized features
    final_vectorized_features.show()

    # Instantiate and fit random forest classifier on all the data
    from pyspark.ml.classification import RandomForestClassifier
    rfc = RandomForestClassifier(featuresCol="Features_vec",
                                 labelCol="ArrDelayBucket",
                                 predictionCol="Prediction",
                                 maxBins=4657,
                                 maxMemoryInMB=1024)
    model = rfc.fit(final_vectorized_features)

    # Save the new model over the old one
    model_output_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
        base_path)
    model.write().overwrite().save(model_output_path)

    # Evaluate model using test data
    predictions = model.transform(final_vectorized_features)

    from pyspark.ml.evaluation import MulticlassClassificationEvaluator
    evaluator = MulticlassClassificationEvaluator(predictionCol="Prediction",
                                                  labelCol="ArrDelayBucket",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Accuracy = {}".format(accuracy))

    # Check the distribution of predictions
    predictions.groupBy("Prediction").count().show()

    # Check a sample
    predictions.sample(False, 0.001, 18).orderBy("CRSDepTime").show(6)
import time
import partial
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import json
from networkx.readwrite import json_graph

import findspark
findspark.init("/usr/local/opt/apache-spark/libexec")
import pyspark
sc = pyspark.SparkContext()

with open("graph/nc_mini.json", "r") as graph_data:
    graph_data = json.load(graph_data)
NC_digraph = json_graph.node_link_graph(graph_data)

####################################################################################

nodes_set = NC_digraph.nodes()


def cascade(init_nodes, nodes_set_broadcast):  # , dist_d):
    nodes_set = nodes_set_broadcast.value
    action = {}
    n = len(init_nodes)
    # np.random.seed(random_d)
    # init_nodes = np.random.choice(NC_digraph.nodes(), 1)[0]
    for i in init_nodes:
        action[i] = 1
    # st = set()
def initSpark(nb_backend, app_name='pyspark', spark_instances=2, executor_cores=2,
              max_cores=8, executor_memory='10G'):
    """
    Configure and create SparkContext.

    nb_backend       ... backend for notebook ('local' or 'openstack')
    app_name         ... name of the Spark application
    spark_instances  ... number of executor instances (e.g. number of workers)
    executor_cores   ... the number of cores for each executor
    max_cores        ... max. number of cores that can be used
    executor_memory  ... memory per executor
    """
    # resource settings
    # conf.set("spark.executor.instances", 4)
    # conf.set("spark.cores.max", 16)
    # conf.set("spark.executor.memory", "10G")
    # conf.set("spark.executor.cores", 3)

    import os
    if nb_backend == 'openstack':
        os.environ['SPARK_HOME'] = "/usr/local/spark"
        os.environ['SPARK_DRIVER_MEMORY'] = '10G'

    # setup Spark
    import findspark  # SPARK_HOME needs to be set for import of findspark
    findspark.init()
    import pyspark
    from pyspark import SparkContext, SparkConf

    conf = SparkConf().setAppName(app_name)

    if nb_backend == 'local':
        # options for Spark on local machine
        master = 'local'
    elif nb_backend == 'openstack':
        # options for Spark on OpenStack cluster
        master = 'spark://sparkcluster-controller001:7077'
    else:
        print("Backend " + nb_backend + " not known")
    conf.setMaster(master)

    # configure the number of instances
    # conf.set("spark.executor.instances", spark_instances)
    # configure the number of cores per executor
    conf.set("spark.executor.cores", executor_cores)
    # configure the max. number of cores a user may request
    conf.set("spark.cores.max", max_cores)
    # configure the memory available per Spark executor
    conf.set("spark.executor.memory", executor_memory)
    # configure the memory available for the driver
    conf.set("spark.driver.memory", '2G')

    if nb_backend == 'openstack':
        conf.set("spark.driver.extraClassPath", "/usr/local/hadoop/share/hadoop/tools/lib/*")
        conf.set("spark.executor.extraClassPath", "/usr/local/hadoop/share/hadoop/tools/lib/*")

    try:
        sc = SparkContext(conf=conf)
        return sc
    except ValueError as exception:
        print("Could not create SparkContext. Maybe it exists already?")
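# Illustrative call to initSpark() for a local notebook, with small resource values;
# the arguments mirror the docstring above.
sc = initSpark('local', app_name='demo', executor_cores=2,
               max_cores=4, executor_memory='2G')
if sc is not None:
    print(sc.parallelize(range(100)).count())  # expect 100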
import time
import partial
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import json
from networkx.readwrite import json_graph

import findspark
findspark.init('/usr/local/opt/apache-spark/libexec')
import pyspark
sc = pyspark.SparkContext()

with open("graph/nc_mini.json", "r") as graph_data:
    graph_data = json.load(graph_data)
NC_digraph = json_graph.node_link_graph(graph_data)

######################################################################################
#
# Influence Function Implementation
#
#######################################################################################

nodes_set = NC_digraph.nodes()


def cascade(init_nodes, nodes_set_broadcast):  # , dist_d):
    nodes_set = nodes_set_broadcast.value
    action = {}
    n = len(init_nodes)
    # np.random.seed(random_d)
    # init_nodes = np.random.choice(NC_digraph.nodes(), 1)[0]
# Work in progress - Use spark as ranker
import findspark
import pyspark
import os

findspark.init(os.getenv('HOME') + '/spark-1.6.0-bin-hadoop2.6')
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-csv_2.10:1.3.0 pyspark-shell'

try:
    print(sc)
except NameError:
    sc = pyspark.SparkContext()
    print(sc)
import findspark
findspark.init('/home/oliver/Documents/spark-2.0.0-bin-hadoop2.7')
# findspark.add_packages(['org.apache.spark:spark-streaming-kafka-0-8-assembly_2.11:2.0.0-preview'])

from pyspark import SparkContext
# from pyspark.streaming.kafka import KafkaUtils
# from pyspark.streaming import StreamingContext
from pyspark.sql import SQLContext, Row

num_of_users = 1000

sc = SparkContext()
sqlContext = SQLContext(sc)

# Load a text file and convert each line to a Row.
lines = sc.textFile('user_scan_list_transform_' + str(num_of_users) + '.csv')
for name in lines.collect():
    print(name)
import findspark
findspark.init('/home/zishan/spark-2.2.1-bin-hadoop2.7')

from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Popularity').getOrCreate()

data = spark.read.csv('OnlineNewsPopularity.csv', inferSchema=True, header=True)

from pyspark.ml.feature import VectorIndexer
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=['timedelta', 'n_tokens_title', 'n_tokens_content',
                                       'n_unique_tokens', 'n_non_stop_words',
                                       'n_non_stop_unique_tokens', 'num_hrefs',
                                       'num_self_hrefs', 'num_imgs', 'num_videos',
                                       'average_token_length', 'num_keywords',
                                       'data_channel_is_lifestyle', 'data_channel_is_entertainment',
                                       'data_channel_is_bus', 'data_channel_is_socmed',
                                       'data_channel_is_tech', 'data_channel_is_world',
                                       'self_reference_max_shares', 'self_reference_avg_sharess',
                                       'weekday_is_monday', 'weekday_is_tuesday',
                                       'weekday_is_wednesday', 'weekday_is_thursday',
import seaborn as sns
sns.set_context('poster', font_scale=1.25)

import findspark as fs
fs.init()
import pyspark as ps
import numpy as np

# Assumes local options are already set in conf file...or else this explodes
config = ps.SparkConf()
config = config.setAppName('wiki_solver')
sc = ps.SparkContext(conf=config)

#### Create the network ####
num_partitions = 40

# Forces all links to be symmetric
links_raw_data = sc.textFile('links-simple-sorted.txt', minPartitions=num_partitions)
titles_raw_data = sc.textFile('titles-sorted.txt', minPartitions=num_partitions)

import re


def get_links(x):
    split = re.findall(r"[\w']+", x)
    parent = int(split[0])
    children = [int(z) for z in split[1:]]
    parent_to_children = [(parent, z) for z in children]
    children_to_parent = [(z, parent) for z in children]
    return parent_to_children + children_to_parent


all_links = links_raw_data.flatMap(get_links)
node_then_all_links = all_links.groupByKey()
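# Quick check of get_links() on a single line; the "<parent>: <child> <child> ..."
# format is assumed from the regex parsing above.
print(get_links("1: 2 3"))
# -> [(1, 2), (1, 3), (2, 1), (3, 1)]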
from django.http import HttpResponse
from django.shortcuts import render
import test_regression
import gen_random
import time
import numpy as np
import os
import datetime
import sys
import findspark

# options for server
findspark.init("/home/ubuntu/spark")
home_dir = "/home/ubuntu/CS205_Final_Project/web/mysite/"

# options for local
# home_dir = ""
# findspark.init()

import pyspark
sc = pyspark.SparkContext(appName="final")


# from GitHub
def cholesky_solution_linear_regression(x_t_x, x_t_y):
    L = np.linalg.cholesky(x_t_x)
    z = np.linalg.solve(L, x_t_y)
    theta = np.linalg.solve(np.transpose(L), z)
    return theta
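# Self-contained smoke test for the Cholesky solver above, on synthetic data;
# shapes and coefficients are illustrative.
X = np.random.RandomState(0).rand(100, 2)
y = X.dot(np.array([2.0, 3.0]))
theta = cholesky_solution_linear_regression(X.T.dot(X), X.T.dot(y))
print(theta)  # should be close to [2.0, 3.0]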
def main(base_path):
    APP_NAME = "fetch_prediction_requests.py"

    # If there is no SparkSession, create the environment
    try:
        sc and spark
    except NameError as e:
        import findspark
        findspark.init()
        import pyspark
        import pyspark.sql

        sc = pyspark.SparkContext()
        spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate()

    # Load the on-time parquet file
    on_time_dataframe = spark.read.parquet('{}/data/on_time_performance.parquet'.format(base_path))
    on_time_dataframe.registerTempTable("on_time_performance")

    # Select a few features of interest
    simple_on_time_features = spark.sql("""
    SELECT
      FlightNum,
      FlightDate,
      DayOfWeek,
      DayofMonth AS DayOfMonth,
      CONCAT(Month, '-', DayofMonth) AS DayOfYear,
      Carrier,
      Origin,
      Dest,
      Distance,
      DepDelay,
      ArrDelay,
      CRSDepTime,
      CRSArrTime
    FROM on_time_performance
    """)
    simple_on_time_features.show()

    # Filter nulls, they can't help us
    filled_on_time_features = simple_on_time_features.filter(
        simple_on_time_features.ArrDelay.isNotNull()
        & simple_on_time_features.DepDelay.isNotNull()
    )

    # We need to turn timestamps into timestamps, and not strings or numbers
    def convert_hours(hours_minutes):
        hours = hours_minutes[:-2]
        minutes = hours_minutes[-2:]
        if hours == '24':
            hours = '23'
            minutes = '59'
        time_string = "{}:{}:00Z".format(hours, minutes)
        return time_string

    def compose_datetime(iso_date, time_string):
        return "{} {}".format(iso_date, time_string)

    def create_iso_string(iso_date, hours_minutes):
        time_string = convert_hours(hours_minutes)
        full_datetime = compose_datetime(iso_date, time_string)
        return full_datetime

    def create_datetime(iso_string):
        return iso8601.parse_date(iso_string)

    def convert_datetime(iso_date, hours_minutes):
        iso_string = create_iso_string(iso_date, hours_minutes)
        dt = create_datetime(iso_string)
        return dt

    def day_of_year(iso_date_string):
        dt = iso8601.parse_date(iso_date_string)
        doy = dt.timetuple().tm_yday
        return doy

    def alter_feature_datetimes(row):
        flight_date = iso8601.parse_date(row['FlightDate'])
        scheduled_dep_time = convert_datetime(row['FlightDate'], row['CRSDepTime'])
        scheduled_arr_time = convert_datetime(row['FlightDate'], row['CRSArrTime'])

        # Handle overnight flights
        if scheduled_arr_time < scheduled_dep_time:
            scheduled_arr_time += datetime.timedelta(days=1)

        doy = day_of_year(row['FlightDate'])

        return {
            'FlightNum': row['FlightNum'],
            'FlightDate': flight_date,
            'DayOfWeek': int(row['DayOfWeek']),
            'DayOfMonth': int(row['DayOfMonth']),
            'DayOfYear': doy,
            'Carrier': row['Carrier'],
            'Origin': row['Origin'],
            'Dest': row['Dest'],
            'Distance': row['Distance'],
            'DepDelay': row['DepDelay'],
            'ArrDelay': row['ArrDelay'],
            'CRSDepTime': scheduled_dep_time,
            'CRSArrTime': scheduled_arr_time,
        }

    timestamp_features = filled_on_time_features.rdd.map(alter_feature_datetimes)
    timestamp_df = timestamp_features.toDF()

    # Explicitly sort the data and keep it sorted throughout. Leave nothing to chance.
    sorted_features = timestamp_df.sort(
        timestamp_df.DayOfYear,
        timestamp_df.Carrier,
        timestamp_df.Origin,
        timestamp_df.Dest,
        timestamp_df.FlightNum,
        timestamp_df.CRSDepTime,
        timestamp_df.CRSArrTime,
    )

    # Store as a single json file and bzip2 it
    sorted_features.repartition(1).write.mode("overwrite").json(
        "{}/data/simple_flight_delay_features.json".format(base_path))

    os.system("cp {}/data/simple_flight_delay_features.json/part* {}/data/simple_flight_delay_features.jsonl".format(base_path, base_path))
    os.system("bzip2 --best {}/data/simple_flight_delay_features.jsonl".format(base_path))
    os.system("bzcat {}/data/simple_flight_delay_features.jsonl.bz2 >> {}/data/simple_flight_delay_features.jsonl".format(base_path, base_path))
import findspark
import time
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import json
from networkx.readwrite import json_graph

findspark.init('/Users/zelongqiu/spark')
import pyspark
import influence_function

sc = pyspark.SparkContext()

# read graph
with open("graph/US.json", "r") as graph_data:
    graph_data = json.load(graph_data)
NC_digraph = json_graph.node_link_graph(graph_data)

nc_N_width_nodes = [i for i in NC_digraph.nodes() if len(NC_digraph.succ[i]) >= 300]
nodes_set_broadcast = sc.broadcast(NC_digraph)

######################################################################################
#
# Influence Function Implementation
#
#######################################################################################


def cascade(init_nodes, nodes_set):  # , dist_d):
    # nodes_set = nodes_set_broadcast
    action = {}
    n = len(init_nodes)
    # np.random.seed(random_d)
# Starting spark
# Only use locally
import findspark
findspark.init()
import pyspark
sc = pyspark.SparkContext()

import numpy as np
import time
import csv
import itertools

# Making necessary imports
from decisionTree import *
from infoGain import *
from bootstrap import *
from multiParallelTree import *

# Loading the balance dataset
sample = []
with open('../data/balance-scale.csv', 'rb') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    for row in reader:
        sample.append((str(row[0]), row[1:]))

columns = ['Left-Weight', 'Left-Distance', 'Right-Weight', 'Right-Distance']

# Creating an RDD with bootstrapped data
# n is number of trees in our forest
start_time = time.time()
ntrees = 500
res, train_data = create_Forest(sc, sample, ntrees, columns, 3,
                                discrete_column_ids=[], n_bins=10)
print "Time taken to train forest: ", time.time() - start_time
# # Birds Project

# import package
import findspark
findspark.init('/data/spark-1.6.0-bin-hadoop2.6')

from pyspark import SparkContext, HiveContext
from pyspark.sql import functions as F
from pyspark.sql import Window as w

%matplotlib inline
import seaborn as sns
import datetime as datetime
import pandas as pd
import numpy as np
import math
import time
from pyspark.sql.functions import UserDefinedFunction
from pyspark.sql.types import StringType, DoubleType, IntegerType
from shapely.wkb import loads
from shapely import wkt
from numpy.lib.stride_tricks import as_strided
from pyspark.sql import SQLContext

# load Spark and HiveContext
sc = SparkContext()
hc = HiveContext(sc)

# Set the area of Polderbaan and calculate the max relative distance of the target bird
# in each trajectory; extend the Polderbaan area in terms of the max distance

# the area of Polderbaan
l_lon = 4.706
import findspark
import os

from nltk.sentiment.util import getPositiveWords
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()

os.environ['LC_ALL'] = 'en_US.UTF-8'
os.environ['LANG'] = 'en_US.UTF-8'

import json
import path

findspark.init(spark_home='/Applications/spark-1.6.1')
from pyspark import SparkContext, SparkConf
import datetime


def getNum(rdd, word):
    # return the summed count
    # NOTE: filters on the literal "apple" rather than the `word` argument
    return rdd.filter(lambda v1: "apple" in v1[0]).map(lambda v1: v1[1]).sum()


def containVia(V):
    Iword = ['via', 'Via']
    for iword in Iword:
        if iword in V:
            return 1
    return 0


def containStock(V):
    Iword = '$'
    if Iword in V:
        return 1
import findspark
findspark.init("/opt/spark")

import pyspark
from pyspark.sql.functions import *
from pyspark.sql.types import *

import heapq
import json
import logging
import re

import pandas as pd
import boto3
import spacy  # Used to split the Wikipedia articles into sentences

import text_to_cluster as cluster

# ——————— CONSTANTS —————————

# S3 bucket and region in which wikipedia data resides
S3_BUCKET = "datamuse-misc"
S3_REGION = "us-east-1"

# For each word, output this many example sentences
OUTPUTS_PER_WORD = 3000  # 5

# File which contains a list of wikipedia data files (as S3 keys) to process
CORPUS_FILE = "enwiki.full"