Example #1
    def get_spark(self):
        if self._spark_context is None:
            ans = self.supervisor.request('get_spark')
            if 'error' in ans:
                raise ContextError(ans['error'])

            if self.verbose:
                print("get_spark answer from supervisor:\n"+json.dumps(ans, indent=4))

            # path to spark files
            spark_path = ans['path']

            # the dictionary stored in ans['config'] will be given to SparkConf directly
            # e.g. { "spark.master": "local[5]", "spark.app.name": "testapp" }
            spark_config = ans['config']

            import findspark
            findspark.init(spark_path)

            import pyspark
            import pymongo_spark

            pymongo_spark.activate()

            conf = pyspark.SparkConf()
            conf.setAll(spark_config.items())

            self._spark_context = pyspark.SparkContext(conf=conf)

        return self._spark_context
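A minimal standalone sketch of the same configuration flow, assuming a supervisor answer shaped like the payload documented in the comments above; the path and config values are illustrative.

# Hypothetical supervisor answer mirroring the documented payload.
example_ans = {
    "path": "/opt/spark",
    "config": {"spark.master": "local[5]", "spark.app.name": "testapp"},
}

import findspark
findspark.init(example_ans["path"])

import pyspark

# Every key/value pair is handed to SparkConf before the context is built.
conf = pyspark.SparkConf()
conf.setAll(example_ans["config"].items())
sc = pyspark.SparkContext(conf=conf)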
Example #2
    def start_spark(self,
                    spark_conf=None, 
                    executor_memory=None,
                    profiling=False, 
                    graphframes_package='graphframes:graphframes:0.3.0-spark2.0-s_2.11', 
                    extra_conf = None):
        """Launch a SparkContext 
        
        Parameters
        ----------
        spark_conf: path
            path to a spark configuration directory
        executor_memory: string
            executor memory in java memory string format, e.g. '4G'
            If `None`, `memory_per_executor` is used. 
        profiling: boolean
            whether to turn on python profiling or not
        graphframes_package: string
            which graphframes to load - if it isn't found, spark will attempt to download it
        extra_conf: dict
            additional configuration options
        """

        os.environ['PYSPARK_SUBMIT_ARGS'] = "--packages {graphframes_package} pyspark-shell"\
                                            .format(graphframes_package=graphframes_package)
        
        if spark_conf is None:
            spark_conf = os.path.join(os.environ['SPARK_HOME'], 'conf')

        os.environ['SPARK_CONF_DIR'] = os.path.realpath(spark_conf)

        os.environ['PYSPARK_PYTHON'] = sys.executable

        try: 
            import findspark; findspark.init()
            from pyspark import SparkContext, SparkConf
        except ImportError: 
            raise ImportError("Unable to find pyspark -- are you sure SPARK_HOME is set?")

        conf = SparkConf()

        conf.set('spark.driver.maxResultSize', '0')

        if executor_memory is None: 
            executor_memory = '%dM'%self.memory_per_executor

        conf.set('spark.executor.memory', executor_memory)

        if profiling: 
            conf.set('spark.python.profile', 'true')
        else:
            conf.set('spark.python.profile', 'false')
        
        if extra_conf is not None: 
            for k,v in extra_conf.items(): 
                conf.set(k,v)

        sc = SparkContext(master=self.master_url(), conf=conf)

        return sc    
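A usage sketch, assuming SPARK_HOME is set and that the method above belongs to a launcher-style class exposing a memory_per_executor attribute and a master_url() method; the stand-in class and values below are illustrative, not the original API.

class _FakeLauncher(object):
    """Hypothetical stand-in for the class that owns start_spark above."""
    memory_per_executor = 4096            # MB; illustrative value

    def master_url(self):
        return 'local[4]'                 # stand-in for a real spark:// URL

# Bind the method shown above onto the stand-in and launch a local context.
_FakeLauncher.start_spark = start_spark   # assumes start_spark is in scope here
launcher = _FakeLauncher()
sc = launcher.start_spark(executor_memory='4G',
                          extra_conf={'spark.ui.showConsoleProgress': 'false'})
print(sc.parallelize(range(10)).sum())
sc.stop()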
Example #3
def table_schema_from_spark(hcat_table_name):
    # returns the schema of the table with this database.name in HCatalog
    #   (Spark workaround as long as the HCat web API is not available...)
    # initialize spark
    import findspark
    findspark.init()
     
    import pyspark
    from pyspark.sql import HiveContext
    
    sc_conf = pyspark.SparkConf()
    #sc_conf.set('spark.executor.extraClassPath','/opt/cloudera/parcels/CDH/lib/hive/lib/*')
    #sc_conf.set('spark.master','yarn-client')
    
    sc = pyspark.SparkContext(appName = 'ade_get_table_schema', conf=sc_conf)
    hc = HiveContext(sc)
    
    hive_schema = hc.table(hcat_table_name).schema.jsonValue()
    
    print(hive_schema)
    
    sc.stop()
    
    table_schema = {'columns':{}}
    
    col_sequence = 0
    for field in hive_schema['fields']:
        table_schema['columns'][field['name']] = {'col_sequence': col_sequence, 'type':field['type']}
        col_sequence += 1
    
    return table_schema
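A usage sketch; the table name is a placeholder, and the commented output only mirrors the dict structure built above.

# 'default.customers' stands in for a real database.table name in HCatalog.
schema = table_schema_from_spark('default.customers')

# Expected shape, e.g.:
# {'columns': {'id':   {'col_sequence': 0, 'type': 'bigint'},
#              'name': {'col_sequence': 1, 'type': 'string'}}}
for name, info in sorted(schema['columns'].items(),
                         key=lambda item: item[1]['col_sequence']):
    print(name, info['type'])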
def main(iso_date, base_path):
  
  APP_NAME = "load_prediction_results.py"
  
  # If there is no SparkSession, create the environment
  try:
    sc and spark
  except NameError as e:
    import findspark
    findspark.init()
    import pyspark
    import pyspark.sql
    
    sc = pyspark.SparkContext()
    spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate()
  
  # Get today's date as an ISO string to scope the query
  today_dt = iso8601.parse_date(iso_date)
  rounded_today = today_dt.date()
  iso_today = rounded_today.isoformat()
  
  input_path = "{}/data/prediction_results_daily.json/{}".format(
    base_path,
    iso_today
  )
  
  # Load and JSONize text
  prediction_results_raw = sc.textFile(input_path)
  prediction_results = prediction_results_raw.map(json_util.loads)
  
  # Store to MongoDB
  prediction_results.saveToMongoDB(
    "mongodb://localhost:27017/agile_data_science.prediction_results"
  )
def main(iso_date, base_path):
  APP_NAME = "pyspark_task_one.py"
  
  # If there is no SparkSession, create the environment
  try:
    sc and spark
  except NameError as e:
    import findspark
    findspark.init()
    import pyspark
    import pyspark.sql
    
    sc = pyspark.SparkContext()
    spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate()

  # Get today's date
  today_dt = iso8601.parse_date(iso_date)
  rounded_today = today_dt.date()

  # Load today's data
  today_input_path = "{}/ch02/data/example_name_titles_daily.json/{}".format(
    base_path,
    rounded_today.isoformat()
  )

  # Load the data and proceed...
  people_titles = spark.read.json(today_input_path)
  people_titles.show()
  
  # Group by as an RDD
  titles_by_name = people_titles.rdd.groupBy(lambda x: x["name"])
  
  # Accept the group key/grouped data and concatenate the various titles...
  # into a master title
  def concatenate_titles(people_titles):
    name = people_titles[0]
    title_records = people_titles[1]
    master_title = ""
    for title_record in sorted(title_records):
      title = title_record["title"]
      master_title += "{}, ".format(title)
    master_title = master_title[:-2]
    record = {"name": name, "master_title": master_title}
    return record
  
  people_with_concatenated_titles = titles_by_name.map(concatenate_titles)
  people_output_json = people_with_concatenated_titles.map(json.dumps)
  
  # Get today's output path
  today_output_path = "{}/ch02/data/example_master_titles_daily.json/{}".format(
    base_path,
    rounded_today.isoformat()
  )
  
  # Write/replace today's output path
  os.system("rm -rf {}".format(today_output_path))
  people_output_json.saveAsTextFile(today_output_path)
Example #6
def add_pyspark_path_if_needed():
    """Add PySpark to the library path based on the value of SPARK_HOME if
    pyspark is not already in our path"""
    try:
        from pyspark import context
    except ImportError:
        # We need to add PySpark, try findspark if we can but it has an
        # undeclared IPython dep.
        try:
            import findspark
            findspark.init()
        except ImportError:
            add_pyspark_path()
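The add_pyspark_path() fallback is not shown in this example. A minimal sketch of what such a helper commonly does, assuming SPARK_HOME is set, might look like this; it is an illustration, not the project's actual implementation.

import glob
import os
import sys


def add_pyspark_path():
    """Hypothetical fallback: put SPARK_HOME's python/ dir and py4j zip on sys.path."""
    spark_home = os.environ['SPARK_HOME']
    sys.path.insert(0, os.path.join(spark_home, 'python'))
    # The bundled py4j zip carries a version in its name, so glob for it.
    for py4j_zip in glob.glob(os.path.join(spark_home, 'python', 'lib', 'py4j-*.zip')):
        sys.path.insert(0, py4j_zip)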
def get_task(n):
	findspark.init()
	sc = pyspark.SparkContext()
	sqlContext = SQLContext(sc)
	raw_bookings = sqlContext.read \
		.format('com.databricks.spark.csv') \
		.options(header='true', delimiter='^',inferSchema='true') \
		.load('bookings.csv')
	arr_port_by_pass=raw_bookings.select(['arr_port','pax']).groupby('arr_port').sum('pax').orderBy('sum(pax)',ascending=0)
	df=pd.DataFrame(data=arr_port_by_pass.collect()[:n],columns=['arr_port','num_pass'])
	geo_o = GeoBase(data='ori_por', verbose=False)
	df['arr_port_name']=df['arr_port'].map(lambda x: geo_o.get(str(x).replace(' ',''),'name'))
	json_st=df.to_json()
	return json_st
Example #8
def getCount(logFile):
	import findspark
	findspark.init()

	from pyspark import SparkContext
	#logFile = "declaration.txt"  # Should be some file on your system
	sc = SparkContext("local", "Simple App")
	logData = sc.textFile(logFile).cache()

	numAs = logData.filter(lambda s: 'a' in s).count()
	numBs = logData.filter(lambda s: 'b' in s).count()
	answer = dict()
	answer['aCount'] = numAs
	answer['bCount'] = numBs
	sc.stop()

	import json
	return json.dumps(answer)
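A short usage sketch; 'declaration.txt' echoes the commented-out default above and stands in for any local text file.

# Returns a JSON string such as '{"aCount": ..., "bCount": ...}'.
print(getCount("declaration.txt"))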
Example #9
def get_spark_context(appName = 'worm'):
    '''finds your local spark distribution and returns the pyspark.SparkContext

    Examples
    --------
    >>> sc = get_spark_context()
    >>> rdd = sc.textFile('README.md')
    >>> rdd.collect()'''
    try:
        import findspark
        findspark.init()

        import pyspark
        sc = pyspark.SparkContext(appName=appName)
    except Exception as e:
        sc = None
        print(e)
    return sc
Example #10
def main():
    import findspark
    findspark.init()

    import pyspark
    sc = pyspark.SparkContext()

    # Mute spark logging
    apache_logger = sc._jvm.org.apache.log4j
    apache_logger.LogManager.getLogger("org").setLevel(apache_logger.Level.ERROR)
    apache_logger.LogManager.getLogger("akka").setLevel(apache_logger.Level.ERROR)

    # Create RDD
    textFile = sc.textFile("data/train.txt")
    splitRDD = textFile.map(lambda w: w.split('\t'))
    floatRDD = splitRDD.map(lambda w: [float(x) for x in w[0:-1]])

    means = KMeans()
    means.train(floatRDD, k=3, runs=10, max_iterations=10)
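    # The KMeans class above is not defined in this snippet and appears to be
    # project-local. As a hedged point of reference only, Spark's bundled MLlib
    # k-means could be fit on the same RDD roughly like this (an assumption,
    # not the original implementation):
    from pyspark.mllib.clustering import KMeans as MLlibKMeans
    mllib_model = MLlibKMeans.train(floatRDD, k=3, maxIterations=10)
    print(mllib_model.clusterCenters)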
Example #11
    def _init_spark(self):
        """Initializes spark so that pyspark is importable.  This also sets up the required environment variables
        """
        global _SPARK_INITIALIZED
        spark_home = self.spark_home
        python_path = self._python_path

        if use_findspark:
            if _SPARK_INITIALIZED:
                if spark_home == os.environ["SPARK_HOME"]:
                    # matches with already initialized
                    pass
                else:
                    # findspark adds two paths to the search path.
                    sys.path.pop(0)
                    sys.path.pop(0)
                    findspark.init(spark_home=spark_home, edit_rc=False, edit_profile=False, python_path=python_path)
            else:
                findspark.init(spark_home=spark_home, edit_rc=False, edit_profile=False, python_path=python_path)

        _SPARK_INITIALIZED = True
        self._set_environment_variables()
def main(base_path):

  APP_NAME = "make_predictions_streaming.py"

  # Process data every 10 seconds
  PERIOD = 10
  BROKERS = 'localhost:9092'
  PREDICTION_TOPIC = 'flight_delay_classification_request'
  
  try:
    sc and ssc
  except NameError as e:
    import findspark

    # Add the streaming package and initialize
    findspark.add_packages(["org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0"])
    findspark.init()
    
    import pyspark
    import pyspark.sql
    import pyspark.streaming

    # Classes used below to build the batch and streaming contexts
    from pyspark import SparkConf, SparkContext
    from pyspark.streaming import StreamingContext
  
    conf = SparkConf().set("spark.default.parallelism", 1)
    sc = SparkContext(appName="Agile Data Science: PySpark Streaming 'Hello, World!'", conf=conf)
    ssc = StreamingContext(sc, PERIOD)
    spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate()
  
  #
  # Load all models to be used in making predictions
  #
  
  # Load the arrival delay bucketizer
  from pyspark.ml.feature import Bucketizer
  arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(base_path)
  arrival_bucketizer = Bucketizer.load(arrival_bucketizer_path)
  
  # Load all the string field vectorizer pipelines into a dict
  from pyspark.ml.feature import StringIndexerModel
  
  string_indexer_models = {}
  for column in ["Carrier", "Origin", "Dest", "Route"]:
    string_indexer_model_path = "{}/models/string_indexer_model_{}.bin".format(
      base_path,
      column
    )
    string_indexer_model = StringIndexerModel.load(string_indexer_model_path)
    string_indexer_models[column] = string_indexer_model

  # Load the numeric vector assembler
  from pyspark.ml.feature import VectorAssembler
  vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(base_path)
  vector_assembler = VectorAssembler.load(vector_assembler_path)

  # Load the classifier model
  from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
  random_forest_model_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
    base_path
  )
  rfc = RandomForestClassificationModel.load(
    random_forest_model_path
  )
  
  #
  # Process Prediction Requests in Streaming
  #
  stream = KafkaUtils.createDirectStream(
    ssc,
    [PREDICTION_TOPIC],
    {
      "metadata.broker.list": BROKERS,
      "group.id": "0",
    }
  )

  object_stream = stream.map(lambda x: json.loads(x[1]))
  object_stream.pprint()
  
  row_stream = object_stream.map(
    lambda x: Row(
      FlightDate=iso8601.parse_date(x['FlightDate']),
      Origin=x['Origin'],
      Distance=x['Distance'],
      DayOfMonth=x['DayOfMonth'],
      DayOfYear=x['DayOfYear'],
      UUID=x['UUID'],
      DepDelay=x['DepDelay'],
      DayOfWeek=x['DayOfWeek'],
      FlightNum=x['FlightNum'],
      Dest=x['Dest'],
      Timestamp=iso8601.parse_date(x['Timestamp']),
      Carrier=x['Carrier']
    )
  )
  row_stream.pprint()

  #
  # Create a dataframe from the RDD-based object stream
  #

  def classify_prediction_requests(rdd):
  
    from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType, TimestampType
    from pyspark.sql.types import StructType, StructField
  
    prediction_request_schema = StructType([
      StructField("Carrier", StringType(), True),
      StructField("DayOfMonth", IntegerType(), True),
      StructField("DayOfWeek", IntegerType(), True),
      StructField("DayOfYear", IntegerType(), True),
      StructField("DepDelay", DoubleType(), True),
      StructField("Dest", StringType(), True),
      StructField("Distance", DoubleType(), True),
      StructField("FlightDate", DateType(), True),
      StructField("FlightNum", StringType(), True),
      StructField("Origin", StringType(), True),
      StructField("Timestamp", TimestampType(), True),
      StructField("UUID", StringType(), True),
    ])
    
    prediction_requests_df = spark.createDataFrame(rdd, schema=prediction_request_schema)
    prediction_requests_df.show()

    #
    # Add a Route variable to replace FlightNum
    #

    from pyspark.sql.functions import lit, concat
    prediction_requests_with_route = prediction_requests_df.withColumn(
      'Route',
      concat(
        prediction_requests_df.Origin,
        lit('-'),
        prediction_requests_df.Dest
      )
    )
    prediction_requests_with_route.show(6)
  
    # Vectorize string fields with the corresponding pipeline for that column
    # Turn category fields into categoric feature vectors, then drop intermediate fields
    for column in ["Carrier", "Origin", "Dest", "Route"]:
      string_indexer_model = string_indexer_models[column]
      prediction_requests_with_route = string_indexer_model.transform(prediction_requests_with_route)
  
    # Vectorize numeric columns: DepDelay, Distance and index columns
    final_vectorized_features = vector_assembler.transform(prediction_requests_with_route)
    
    # Inspect the vectors
    final_vectorized_features.show()
  
    # Drop the individual index columns
    index_columns = ["Carrier_index", "Origin_index", "Dest_index", "Route_index"]
    for column in index_columns:
      final_vectorized_features = final_vectorized_features.drop(column)
  
    # Inspect the finalized features
    final_vectorized_features.show()
  
    # Make the prediction
    predictions = rfc.transform(final_vectorized_features)
  
    # Drop the features vector and prediction metadata to give the original fields
    predictions = predictions.drop("Features_vec")
    final_predictions = predictions.drop("indices").drop("values").drop("rawPrediction").drop("probability")
  
    # Inspect the output
    final_predictions.show()
  
    # Store to Mongo
    if final_predictions.count() > 0:
      final_predictions.rdd.map(lambda x: x.asDict()).saveToMongoDB(
        "mongodb://localhost:27017/agile_data_science.flight_delay_classification_response"
      )
  
  # Do the classification and store to Mongo
  row_stream.foreachRDD(classify_prediction_requests)
  
  ssc.start()
  ssc.awaitTermination()
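A runner sketch for this streaming job, assuming it is launched as a script with the project base path as its only argument; the argument handling is illustrative.

if __name__ == "__main__":
    import sys

    main(sys.argv[1])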
# Default partitioning: 100 partitions!

import matplotlib.pyplot as plt

import P2
import seaborn as sns
sns.set_context('poster', font_scale=1.25)
import findspark as fs
fs.init()
import pyspark as ps
import multiprocessing as mp
import numpy as np

######## Same as part A, just setting up ################

# Setup cluster, number of threads = 2x cores
config = ps.SparkConf()
config = config.setAppName('P2a')

sc = ps.SparkContext(conf=config)

# Do the computation

num_pixels = 2000
rows = sc.range(num_pixels, numSlices=10)
cols = sc.range(num_pixels, numSlices=10)

indices = rows.cartesian(cols)


def mandelbrot_wrapper(row, col):
Example #14
from pyspark.sql import Row, SQLContext
from pyspark import SparkConf, SparkContext
from operator import add
import requests
import sys
from pyspark.streaming import StreamingContext
import findspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType
from pyspark.sql.functions import explode, split
from pyspark.sql.functions import col, desc, asc

findspark.init()

ssc = SparkSession \
    .builder \
    .appName("StructuredNetworkWordCount") \
    .getOrCreate()

schema = StructType().add("ID", "string").add("Lang", "string") \
    .add("Date", "string").add("Source", "string").add("Len", "string") \
    .add("Likes", "string").add("RTs", "string").add("Hashtags", "string") \
    .add("UserMentionName", "string").add("UserMentionID", "string").add("name", "string") \
    .add("Place", "string").add("Followers", "float").add("Friends", "float")

lines = ssc \
    .readStream \
    .format("csv") \
    .option("header", True) \
    .schema(schema) \
    .option("sep", ";") \
def main(base_path):
  APP_NAME = "train_spark_mllib_model.py"
  
  # If there is no SparkSession, create the environment
  try:
    sc and spark
  except NameError as e:
    import findspark
    findspark.init()
    import pyspark
    import pyspark.sql
    
    sc = pyspark.SparkContext()
    spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate()
  
  #
  # {
  #   "ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00","CRSDepTime":"2015-12-31T03:05:00.000-08:00",
  #   "Carrier":"WN","DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,"Dest":"SAN","Distance":368.0,
  #   "FlightDate":"2015-12-30T16:00:00.000-08:00","FlightNum":"6109","Origin":"TUS"
  # }
  #
  from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType, DateType, TimestampType
  from pyspark.sql.types import StructType, StructField
  from pyspark.sql.functions import udf
  
  schema = StructType([
    StructField("ArrDelay", DoubleType(), True),
    StructField("CRSArrTime", TimestampType(), True),
    StructField("CRSDepTime", TimestampType(), True),
    StructField("Carrier", StringType(), True),
    StructField("DayOfMonth", IntegerType(), True),
    StructField("DayOfWeek", IntegerType(), True),
    StructField("DayOfYear", IntegerType(), True),
    StructField("DepDelay", DoubleType(), True),
    StructField("Dest", StringType(), True),
    StructField("Distance", DoubleType(), True),
    StructField("FlightDate", DateType(), True),
    StructField("FlightNum", StringType(), True),
    StructField("Origin", StringType(), True),
    StructField("Route", StringType(), True),
    StructField("TailNum", StringType(), True),
    StructField("EngineManufacturer", StringType(), True),
    StructField("EngineModel", StringType(), True),
    StructField("Manufacturer", StringType(), True),
    StructField("ManufacturerYear", StringType(), True),
    StructField("OwnerState", StringType(), True),
  ])
  
  input_path = "{}/data/simple_flight_delay_features_airplanes.json".format(
    base_path
  )
  features = spark.read.json(input_path, schema=schema)
  features.first()
  
  #
  # Add the hour of day of scheduled arrival/departure
  #
  from pyspark.sql.functions import hour
  features_with_hour = features.withColumn(
    "CRSDepHourOfDay",
    hour(features.CRSDepTime)
  )
  features_with_hour = features_with_hour.withColumn(
    "CRSArrHourOfDay",
    hour(features.CRSArrTime)
  )
  features_with_hour.select("CRSDepTime", "CRSDepHourOfDay", "CRSArrTime", "CRSArrHourOfDay").show()
  
  #
  # Check for nulls in features before using Spark ML
  #
  null_counts = [(column, features_with_hour.where(features_with_hour[column].isNull()).count()) for column in features_with_hour.columns]
  cols_with_nulls = filter(lambda x: x[1] > 0, null_counts)
  print("\nNull Value Report")
  print("-----------------")
  print(tabulate(cols_with_nulls, headers=["Column", "Nulls"]))
  
  #
  # Use pyspark.ml.feature.Bucketizer to bucketize ArrDelay into on-time, slightly late, very late (0, 1, 2)
  #
  from pyspark.ml.feature import Bucketizer
  
  # Setup the Bucketizer
  splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
  arrival_bucketizer = Bucketizer(
    splits=splits,
    inputCol="ArrDelay",
    outputCol="ArrDelayBucket"
  )
  
  # Save the model
  arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(base_path)
  arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path)
  
  # Apply the model
  ml_bucketized_features = arrival_bucketizer.transform(features_with_hour)
  ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()
  
  #
  # Extract features with tools in pyspark.ml.feature
  #
  from pyspark.ml.feature import StringIndexer, VectorAssembler
  
  # Turn category fields into indexes
  string_columns = ["Carrier", "Origin", "Dest", "Route",
                    "TailNum"]
  for column in string_columns:
    string_indexer = StringIndexer(
      inputCol=column,
      outputCol=column + "_index"
    )
    
    string_indexer_model = string_indexer.fit(ml_bucketized_features)
    ml_bucketized_features = string_indexer_model.transform(ml_bucketized_features)
    
    # Save the pipeline model
    string_indexer_output_path = "{}/models/string_indexer_model_4.0.{}.bin".format(
      base_path,
      column
    )
    string_indexer_model.write().overwrite().save(string_indexer_output_path)
  
  # Combine continuous, numeric fields with indexes of nominal ones
  # ...into one feature vector
  numeric_columns = [
    "DepDelay", "Distance",
    "DayOfYear",
    "CRSDepHourOfDay",
    "CRSArrHourOfDay"]
  index_columns = [column + "_index" for column in string_columns]
  
  vector_assembler = VectorAssembler(
    inputCols=numeric_columns + index_columns,
    outputCol="Features_vec"
  )
  final_vectorized_features = vector_assembler.transform(ml_bucketized_features)
  
  # Save the numeric vector assembler
  vector_assembler_path = "{}/models/numeric_vector_assembler_5.0.bin".format(base_path)
  vector_assembler.write().overwrite().save(vector_assembler_path)
  
  # Drop the index columns
  for column in index_columns:
    final_vectorized_features = final_vectorized_features.drop(column)
  
  # Inspect the finalized features
  final_vectorized_features.show()
  
  #
  # Cross validate, train and evaluate classifier: loop 5 times for 4 metrics
  #
  
  from collections import defaultdict
  scores = defaultdict(list)
  feature_importances = defaultdict(list)
  metric_names = ["accuracy", "weightedPrecision", "weightedRecall", "f1"]
  split_count = 3
  
  for i in range(1, split_count + 1):
    print("\nRun {} out of {} of test/train splits in cross validation...".format(
      i,
      split_count,
    )
    )
    
    # Test/train split
    training_data, test_data = final_vectorized_features.randomSplit([0.8, 0.2])
    
    # Instantiate and fit random forest classifier on all the data
    from pyspark.ml.classification import RandomForestClassifier
    rfc = RandomForestClassifier(
      featuresCol="Features_vec",
      labelCol="ArrDelayBucket",
      predictionCol="Prediction",
      maxBins=4896,
    )
    model = rfc.fit(training_data)
    
    # Save the new model over the old one
    model_output_path = "{}/models/spark_random_forest_classifier.flight_delays.baseline.bin".format(
      base_path
    )
    model.write().overwrite().save(model_output_path)
    
    # Evaluate model using test data
    predictions = model.transform(test_data)
    
    # Evaluate this split's results for each metric
    from pyspark.ml.evaluation import MulticlassClassificationEvaluator
    for metric_name in metric_names:
      evaluator = MulticlassClassificationEvaluator(
        labelCol="ArrDelayBucket",
        predictionCol="Prediction",
        metricName=metric_name
      )
      score = evaluator.evaluate(predictions)
      
      scores[metric_name].append(score)
      print("{} = {}".format(metric_name, score))
    
    #
    # Collect feature importances
    #
    feature_names = vector_assembler.getInputCols()
    feature_importance_list = model.featureImportances
    for feature_name, feature_importance in zip(feature_names, feature_importance_list):
      feature_importances[feature_name].append(feature_importance)
  
  #
  # Evaluate average and STD of each metric and print a table
  #
  import numpy as np
  score_averages = defaultdict(float)
  
  # Compute the table data
  average_stds = []  # ha
  for metric_name in metric_names:
    metric_scores = scores[metric_name]
    
    average_accuracy = sum(metric_scores) / len(metric_scores)
    score_averages[metric_name] = average_accuracy
    
    std_accuracy = np.std(metric_scores)
    
    average_stds.append((metric_name, average_accuracy, std_accuracy))
  
  # Print the table
  print("\nExperiment Log")
  print("--------------")
  print(tabulate(average_stds, headers=["Metric", "Average", "STD"]))
  
  #
  # Persist the score to a score log that exists between runs
  #
  import pickle
  
  # Load the score log or initialize an empty one
  try:
    score_log_filename = "{}/models/score_log.pickle".format(base_path)
    score_log = pickle.load(open(score_log_filename, "rb"))
    if not isinstance(score_log, list):
      score_log = []
  except IOError:
    score_log = []
  
  # Compute the existing score log entry
  score_log_entry = {
    metric_name: score_averages[metric_name] for metric_name in metric_names
  }
  
  # Compute and display the change in score for each metric
  try:
    last_log = score_log[-1]
  except (IndexError, TypeError, AttributeError):
    last_log = score_log_entry
  
  experiment_report = []
  for metric_name in metric_names:
    run_delta = score_log_entry[metric_name] - last_log[metric_name]
    experiment_report.append((metric_name, run_delta))
  
  print("\nExperiment Report")
  print("-----------------")
  print(tabulate(experiment_report, headers=["Metric", "Score"]))
  
  # Append the existing average scores to the log
  score_log.append(score_log_entry)
  
  # Persist the log for next run
  pickle.dump(score_log, open(score_log_filename, "wb"))
  
  #
  # Analyze and report feature importance changes
  #
  
  # Compute averages for each feature
  feature_importance_entry = defaultdict(float)
  for feature_name, value_list in feature_importances.items():
    average_importance = sum(value_list) / len(value_list)
    feature_importance_entry[feature_name] = average_importance
  
  # Sort the feature importances in descending order and print
  import operator
  sorted_feature_importances = sorted(
    feature_importance_entry.items(),
    key=operator.itemgetter(1),
    reverse=True
  )
  
  print("\nFeature Importances")
  print("-------------------")
  print(tabulate(sorted_feature_importances, headers=['Name', 'Importance']))
  
  #
  # Compare this run's feature importances with the previous run's
  #
  
  # Load the feature importance log or initialize an empty one
  try:
    feature_log_filename = "{}/models/feature_log.pickle".format(base_path)
    feature_log = pickle.load(open(feature_log_filename, "rb"))
    if not isinstance(feature_log, list):
      feature_log = []
  except IOError:
    feature_log = []
  
  # Compute and display the change in score for each feature
  try:
    last_feature_log = feature_log[-1]
  except (IndexError, TypeError, AttributeError):
    last_feature_log = defaultdict(float)
    for feature_name, importance in feature_importance_entry.items():
      last_feature_log[feature_name] = importance
  
  # Compute the deltas
  feature_deltas = {}
  for feature_name in feature_importances.keys():
    run_delta = feature_importance_entry[feature_name] - last_feature_log[feature_name]
    feature_deltas[feature_name] = run_delta
  
  # Sort feature deltas, biggest change first
  import operator
  sorted_feature_deltas = sorted(
    feature_deltas.items(),
    key=operator.itemgetter(1),
    reverse=True
  )
  
  # Display sorted feature deltas
  print("\nFeature Importance Delta Report")
  print("-------------------------------")
  print(tabulate(sorted_feature_deltas, headers=["Feature", "Delta"]))
  
  # Append the existing average deltas to the log
  feature_log.append(feature_importance_entry)
  
  # Persist the log for next run
  pickle.dump(feature_log, open(feature_log_filename, "wb"))
Example #16
import subprocess
import sys
import findspark
findspark.init('/usr/hdp/current/spark2-client')

import pyspark
from pyspark.sql.functions import lit, col, instr, expr, pow, round, bround, corr, count, mean, stddev_pop, min, max
from pyspark.sql.functions import monotonically_increasing_id, initcap, lower, upper, ltrim, rtrim, rpad, lpad, trim
from pyspark.sql.functions import regexp_replace, translate, regexp_extract, current_date, current_timestamp, struct
from pyspark.sql.functions import date_add, date_sub, datediff, months_between, to_date, to_timestamp, coalesce, split, size
from pyspark.sql.functions import array_contains, explode, udf
from pyspark.sql import HiveContext
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import col, when

from pyspark.sql.types import StructField, StructType, StringType, IntegerType, DoubleType, FloatType, LongType

from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import desc
import pandas as pd
from datetime import date, timedelta, datetime


def get_Spark():

    conf = pyspark.SparkConf().setAll([
        ('spark.submit.deployMode',
         'client'),  # deploy in yarn-client or yarn-cluster
        ('spark.executor.memory', '8g'),  # memory allocated for each executor
# Kinesis Data Generator: https://awslabs.github.io/amazon-kinesis-data-generator/
# 
# After going through the setup, download this file as a .py file, go into the terminal, and run the following command:
# ```
# python3 pyspark-streaming/4_morestream/2_Integration_with_Kinesis_Demo.py
# ```
#     

# ### Demo

# In[ ]:


import findspark
# TODO: your path will likely not have 'matthew' in it. Change it to reflect your path.
findspark.init('/home/matthew/spark-2.3.0-bin-hadoop2.7')

import os
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-streaming-kinesis-asl_2.11:2.3.0 pyspark-shell'


import sys
import json
import time
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kinesis import KinesisUtils, InitialPositionInStream

appName="PythonKinesisApp"
sc = SparkContext(appName=appName)
ssc = StreamingContext(sc, 1)
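The snippet ends after the StreamingContext is created. A hedged sketch of the next step with the Kinesis receiver follows; the stream name, endpoint, and region are placeholders, not values from the original demo.

stream = KinesisUtils.createStream(
    ssc,
    appName,
    "my-kinesis-stream",                        # placeholder stream name
    "https://kinesis.us-east-1.amazonaws.com",  # placeholder endpoint URL
    "us-east-1",                                # placeholder region
    InitialPositionInStream.LATEST,
    2,                                          # checkpoint interval in seconds
)
stream.pprint()

ssc.start()
ssc.awaitTermination()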
				time_table[str(time_slot.time())][1]+=int(line[2])

	#plotting
	time_sorted=sorted(time_table.items())
	y1=[]
	y2=[]
	x=[]
	for i in range(len(time_sorted)):

		y1.append(time_sorted[i][1][0])
		y2.append(time_sorted[i][1][1])
		x.append(time_sorted[i][0])
	x_domain=[i for i in range(len(x))]
	plt.scatter(x_domain, y1, color='red')
	plt.scatter(x_domain, y2, color='blue')
	plt.legend(('west side', 'east side'), loc='best')
	plt.xlabel('Hour of Day')
	plt.ylabel('Amount of Bikes')
	plt.show()




if __name__ == '__main__':
	#setting spark_home
	
	findspark.init(spark_home)
	conf=SparkConf().setMaster('local').setAppName('Fremont Bridge Bike Analysis')
	sc=SparkContext(conf=conf)
	main(sc)
Example #19
# ```
#
# Download this notebook as a .py file and run the following:
# ```
# python3 pyspark-streaming/4_morestream/1_Integration_with_Kafka_Demo.py
# ```
# Finally, start the producer:
# ```
# ~/kafka_2.11-0.11.0.0/bin/kafka-console-producer.sh --broker-list localhost:9092 --topic pyspark-kafka-demo
# ```
#
#

import findspark
# TODO: your path will likely not have 'matthew' in it. Change it to reflect your path.
findspark.init('/home/matthew/spark-2.3.0-bin-hadoop2.7')

import os
os.environ[
    'PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.3.0 pyspark-shell'

import sys
import time
from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

n_secs = 1
topic = "pyspark-kafka-demo"

conf = SparkConf().setAppName("KafkaStreamProcessor").setMaster("local[*]")
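The example is cut off after the SparkConf is built. A sketch of how the direct Kafka stream is typically attached follows, reusing n_secs, topic, and the localhost:9092 broker from the producer command above; the processing step is a placeholder.

sc = SparkContext(conf=conf)
sc.setLogLevel("WARN")
ssc = StreamingContext(sc, n_secs)

kafka_stream = KafkaUtils.createDirectStream(
    ssc,
    [topic],
    {"metadata.broker.list": "localhost:9092"},
)
# Placeholder processing: print the message values as they arrive.
kafka_stream.map(lambda kv: kv[1]).pprint()

ssc.start()
ssc.awaitTermination()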
Example #20
#!/usr/bin/env python
# coding: utf-8

# # [o3]- Proyecto Ozono - Predictor_v0

# # [0] - Initialization

# In[ ]:

import findspark
findspark.init('/home/rulicering/BigData/spark-2.4.5-bin-hadoop2.7')
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window
import pandas as pd
from pyspark.sql.types import StructField, StringType, IntegerType, StructType, FloatType
import re as reg
import numpy as np
import datetime

# MLlib
#from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator

#Aux
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
Example #21
# Imports and running findspark
import findspark
import json
from datetime import datetime
from pyspark.sql import functions as F

findspark.init('/home/bigdata/spark-2.4.3-bin-hadoop2.7')
import pyspark
from pyspark import RDD
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark.sql import SparkSession
import json  # Spark context details


def process_stream(record, spark):
    columns = [
        "Station", "Date", "Last update", "Places", "Available_bikes",
        "Capacity", "Status", "Position"
    ]
    if not record.isEmpty():
        df = spark.createDataFrame(record, columns)
        df = df.filter(df["Station"] < 1000)  # exclude station 1033, which is not in Toulouse
        df.show()
        df.write.format('org.elasticsearch.spark.sql').mode(
            'overwrite'  # or .mode('append')
        ).option('es.nodes', 'localhost').option('es.port', 9200).option(
            'es.resource',
            '%s/%s' % ('velotoulouse-geo', '_doc'),
Example #22
import findspark
from pyspark import SparkContext, SparkConf
import csv
from common.Utils import Utils

findspark.init(python_path='/Users/khwu/.virtualenvs/spark/bin/python3')


def load_post_code():
    with open('../../in/uk-postcode.csv') as f:
        reader = csv.reader(f)
        return {
            row[0]: row[7]
            for row in reader if not row[0].startswith('Postcode')
        }


def get_post_prefix(line: str):
    splits = Utils.COMMA_DELIMITER.split(line)
    post_code = splits[4]
    return post_code.split(' ')[0]


if __name__ == '__main__':
    conf = SparkConf().setAppName('ukpostcode').setMaster('local[*]')
    sc = SparkContext(conf=conf)
    sc.setLogLevel('ERROR')

    post_code = sc.broadcast(load_post_code())

    HEADER = 'Timestamp,Collected by,Name of makerspace'

import numpy as np
import pandas as pd
import twython
from twython import TwythonStreamer
import re
from requests_oauthlib import OAuth1
import urllib

import sys
import ast
import json

import findspark
findspark.init('/usr/local/Cellar/apache-spark/1.5.1/libexec')

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
import requests

import threading
import Queue
import time

APP_KEY = "bv6mnYBiFeEVKvPEZlg"
APP_SECRET = "nQZk9Ca8qqJxc1Za07WyW0VPZ6gtAUSF3oPD5sun0"
OAUTH_TOKEN = "606525030-ilOtJstbRvFCjUNMtOu8DP2HQKGWpQvmUsF6fblE"
OAUTH_TOKEN_SECRET = "xSVE47qVOFxxZm1oqKwL6zwLVMWpzxCUYGmLJ6CVHR0mZ"
Example #24
def main():
    import findspark
    findspark.init()

    import pyspark
    sc = pyspark.SparkContext()

    # Mute spark logging
    apache_logger = sc._jvm.org.apache.log4j
    apache_logger.LogManager.getLogger("org").setLevel(apache_logger.Level.ERROR)
    apache_logger.LogManager.getLogger("akka").setLevel(apache_logger.Level.ERROR)

    # Create RDD
    floatRdd = sc.textFile("data/yeast_no_header_data.txt")\
                 .map(lambda s: s.split('\t'))\
                 .map(lambda s: list(map(float, s[1:])))
    floatRdd.cache()

    # reduce dimensions
    rowSize = len(floatRdd.first())

    def getDimset(max_index):
        workingDims = random.randint(2, max_index)
        indices = set()
        for _ in range(workingDims):
            indices.add(random.randint(0, max_index))
        return indices

    def mutateDimensionSet(rowSize, factor):
        def mutationImpl(dimset):
            currDimAmount = len(dimset)
            # print('currDimAmount:', currDimAmount)
            mutationMaxAmount = math.floor(currDimAmount * factor)
            # print('mutationMaxAmount:', mutationMaxAmount)
            mutation_diff = getDimset(mutationMaxAmount)
            # print('diff_set')
            # pp.pprint(mutation_diff, compact=True)
            mutation_sum = getDimset(mutationMaxAmount)
            # print('sumset')
            # pp.pprint(mutation_sum, compact=True)
            return (dimset - mutation_diff) | mutation_sum
        return mutationImpl

    mutations_amount = 5
    # relative amount of mutations
    factor = 0.2
    # Standard deviation
    standard_dev = 0.3
    # amount of centroids
    centroids_amount = 30
    # amount of centroid recalculations
    n = 2

    # get seed dimension set
    seedDimset = getDimset(rowSize)
    workingDimsAmount = len(seedDimset)
    print('Base dimSet:', seedDimset)
    pp.pprint(seedDimset, compact=True)
    mutator = mutateDimensionSet(rowSize, factor)
    dimSets = [seedDimset]
    for index in range(mutations_amount):
        curr_dimset = dimSets[-1]
        new_dimset = mutator(curr_dimset)
        print('Mutation {} changed: '.format(index))
        print('Added: ', end="")
        pp.pprint(new_dimset - curr_dimset, compact=True)
        print('Deleted: ', end="")
        pp.pprint((curr_dimset - new_dimset), compact=True)
        curr_dimset = new_dimset
        print('new_size:', len(curr_dimset))
        dimSets.append(curr_dimset)

    def workOnDimsetClosure(dimSets, floatRdd, centroids_amount, standard_dev, n):
        def closureImpl(index):
            return (index, workOnDimset(dimSets[index], floatRdd, centroids_amount, standard_dev, n))
        return closureImpl

    biclusteringWorker = workOnDimsetClosure(dimSets, floatRdd, centroids_amount, standard_dev, n)

    tpool = ThreadPool(processes=mutations_amount)
    results = tpool.map(biclusteringWorker, range(len(dimSets)))

    results.sort(key=lambda x: x[1][0])
    print('(Mutation index, (quality, rows amount, cols amount, rdd, centroid))')
    for result in results:
        print(result)
def main(base_path):

    # Default to "."
    try:
        base_path
    except NameError:
        base_path = "."
    if not base_path:
        base_path = "."

    APP_NAME = "train_spark_mllib_model.py"

    # If there is no SparkSession, create the environment
    try:
        sc and spark
    except (NameError, UnboundLocalError) as e:
        import findspark
        findspark.init()
        import pyspark
        import pyspark.sql

        sc = pyspark.SparkContext()
        spark = pyspark.sql.SparkSession(sc).builder.appName(
            APP_NAME).getOrCreate()

    #
    # {
    #   "ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00","CRSDepTime":"2015-12-31T03:05:00.000-08:00",
    #   "Carrier":"WN","DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,"Dest":"SAN","Distance":368.0,
    #   "FlightDate":"2015-12-30T16:00:00.000-08:00","FlightNum":"6109","Origin":"TUS"
    # }
    #
    from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType, DateType, TimestampType
    from pyspark.sql.types import StructType, StructField
    from pyspark.sql.functions import udf

    schema = StructType([
        StructField("ArrDelay", DoubleType(), True),  # "ArrDelay":5.0
        StructField("CRSArrTime", TimestampType(),
                    True),  # "CRSArrTime":"2015-12-31T03:20:00.000-08:00"
        StructField("CRSDepTime", TimestampType(),
                    True),  # "CRSDepTime":"2015-12-31T03:05:00.000-08:00"
        StructField("Carrier", StringType(), True),  # "Carrier":"WN"
        StructField("DayOfMonth", IntegerType(), True),  # "DayOfMonth":31
        StructField("DayOfWeek", IntegerType(), True),  # "DayOfWeek":4
        StructField("DayOfYear", IntegerType(), True),  # "DayOfYear":365
        StructField("DepDelay", DoubleType(), True),  # "DepDelay":14.0
        StructField("Dest", StringType(), True),  # "Dest":"SAN"
        StructField("Distance", DoubleType(), True),  # "Distance":368.0
        StructField("FlightDate", DateType(),
                    True),  # "FlightDate":"2015-12-30T16:00:00.000-08:00"
        StructField("FlightNum", StringType(), True),  # "FlightNum":"6109"
        StructField("Origin", StringType(), True),  # "Origin":"TUS"
    ])

    input_path = "{}/data/simple_flight_delay_features.jsonl.bz2".format(
        base_path)
    features = spark.read.json(input_path, schema=schema)
    features.first()

    #
    # Check for nulls in features before using Spark ML
    #
    null_counts = [(column, features.where(features[column].isNull()).count())
                   for column in features.columns]
    cols_with_nulls = filter(lambda x: x[1] > 0, null_counts)
    print(list(cols_with_nulls))

    #
    # Add a Route variable to replace FlightNum
    #
    from pyspark.sql.functions import lit, concat
    features_with_route = features.withColumn(
        'Route', concat(features.Origin, lit('-'), features.Dest))
    features_with_route.show(6)

    #
    # Use pyspark.ml.feature.Bucketizer to bucketize ArrDelay into on-time, slightly late, very late (0, 1, 2)
    #
    from pyspark.ml.feature import Bucketizer

    # Setup the Bucketizer
    splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
    arrival_bucketizer = Bucketizer(splits=splits,
                                    inputCol="ArrDelay",
                                    outputCol="ArrDelayBucket")

    # Save the bucketizer
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(
        base_path)
    arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path)

    # Apply the bucketizer
    ml_bucketized_features = arrival_bucketizer.transform(features_with_route)
    ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()

    #
    # Extract features with tools in pyspark.ml.feature
    #
    from pyspark.ml.feature import StringIndexer, VectorAssembler

    # Turn category fields into indexes
    for column in ["Carrier", "Origin", "Dest", "Route"]:
        string_indexer = StringIndexer(inputCol=column,
                                       outputCol=column + "_index")

        string_indexer_model = string_indexer.fit(ml_bucketized_features)
        ml_bucketized_features = string_indexer_model.transform(
            ml_bucketized_features)

        # Drop the original column
        ml_bucketized_features = ml_bucketized_features.drop(column)

        # Save the pipeline model
        string_indexer_output_path = "{}/models/string_indexer_model_{}.bin".format(
            base_path, column)
        string_indexer_model.write().overwrite().save(
            string_indexer_output_path)

    # Combine continuous, numeric fields with indexes of nominal ones
    # ...into one feature vector
    numeric_columns = [
        "DepDelay", "Distance", "DayOfMonth", "DayOfWeek", "DayOfYear"
    ]
    index_columns = [
        "Carrier_index", "Origin_index", "Dest_index", "Route_index"
    ]
    vector_assembler = VectorAssembler(inputCols=numeric_columns +
                                       index_columns,
                                       outputCol="Features_vec")
    final_vectorized_features = vector_assembler.transform(
        ml_bucketized_features)

    # Save the numeric vector assembler
    vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(
        base_path)
    vector_assembler.write().overwrite().save(vector_assembler_path)

    # Drop the index columns
    for column in index_columns:
        final_vectorized_features = final_vectorized_features.drop(column)

    # Inspect the finalized features
    final_vectorized_features.show()

    # Instantiate and fit random forest classifier on all the data
    from pyspark.ml.classification import RandomForestClassifier
    rfc = RandomForestClassifier(featuresCol="Features_vec",
                                 labelCol="ArrDelayBucket",
                                 predictionCol="Prediction",
                                 maxBins=4657,
                                 maxMemoryInMB=1024)
    model = rfc.fit(final_vectorized_features)

    # Save the new model over the old one
    model_output_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
        base_path)
    model.write().overwrite().save(model_output_path)

    # Evaluate model using test data
    predictions = model.transform(final_vectorized_features)

    from pyspark.ml.evaluation import MulticlassClassificationEvaluator
    evaluator = MulticlassClassificationEvaluator(predictionCol="Prediction",
                                                  labelCol="ArrDelayBucket",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Accuracy = {}".format(accuracy))

    # Check the distribution of predictions
    predictions.groupBy("Prediction").count().show()

    # Check a sample
    predictions.sample(False, 0.001, 18).orderBy("CRSDepTime").show(6)
import time
from functools import partial
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import json
from networkx.readwrite import json_graph
import findspark

findspark.init("/usr/local/opt/apache-spark/libexec")
import pyspark

sc = pyspark.SparkContext()
with open("graph/nc_mini.json", "r") as graph_data:
    graph_data = json.load(graph_data)
    NC_digraph = json_graph.node_link_graph(graph_data)


####################################################################################
nodes_set = NC_digraph.nodes()


def cascade(init_nodes, nodes_set_broadcast):  # , dist_d):
    nodes_set = nodes_set_broadcast.value
    action = {}
    n = len(init_nodes)
    # np.random.seed(random_d)
    # init_nodes = np.random.choice(NC_digraph.nodes(), 1)[0]
    for i in init_nodes:
        action[i] = 1
    # st = set()
Example #27
def initSpark(nb_backend, app_name='pyspark', spark_instances=2, executor_cores= 2, max_cores=8, executor_memory='10G'):
    """
    Configure and create SparkContext.

    nb_backend ... backend for notebook ('local' or 'openstack')
    app_name ... name of the Spark application
    spark_instances ... number of executor instances (e.g. number of workers)
    executor_cores ... the number of cores for each executor
    max_cores ... max. number of cores that can be used
    executor_memory ... memory per executor
    """

    # resource settings
    # conf.set("spark.executor.instances", 4)
    # conf.set("spark.cores.max", 16)
    # conf.set("spark.executor.memory", "10G")
    # conf.set("spark.executor.cores", 3)

    import os
    if nb_backend == 'openstack':
        os.environ['SPARK_HOME'] = "/usr/local/spark"
        os.environ['SPARK_DRIVER_MEMORY'] = '10G'

    # setup Spark
    import findspark # SPARK_HOME needs to be set for import of findspark
    findspark.init()
    import pyspark
    from pyspark import SparkContext, SparkConf

    conf = SparkConf().setAppName(app_name)

    if nb_backend == 'local':
        # options for Spark on local machine
        master = 'local'
    elif nb_backend == 'openstack':
        # options for Spark on OpenStack cluster
        master = 'spark://sparkcluster-controller001:7077'
    else:
        print "Backend " + nb_backend + " not known"

    conf.setMaster(master)

    # configure the number of instances
    # conf.set("spark.executor.instances", spark_instances)

    # configure the number of cores per executor
    conf.set("spark.executor.cores", executor_cores)

    # configure the max. number of cores a user may request
    conf.set("spark.cores.max", max_cores)

    # configure the memory available per Spark executor
    conf.set("spark.executor.memory", executor_memory)

    # configure the memory available for the driver
    conf.set("spark.driver.memory", '2G')

    if nb_backend == 'openstack':
        conf.set("spark.driver.extraClassPath", "/usr/local/hadoop/share/hadoop/tools/lib/*")
        conf.set("spark.executor.extraClassPath", "/usr/local/hadoop/share/hadoop/tools/lib/*")

    try:
        sc = SparkContext(conf=conf)
        return sc
    except ValueError as exception:
        print "Could not create SparkContext. Maybe it exists already?"
import time
from functools import partial
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import json
from networkx.readwrite import json_graph
import findspark
findspark.init('/usr/local/opt/apache-spark/libexec')
import pyspark
sc = pyspark.SparkContext()

with open("graph/nc_mini.json", "r") as graph_data:
    graph_data = json.load(graph_data)
    NC_digraph = json_graph.node_link_graph(graph_data)
    
    
######################################################################################
#
#Influence Function Implementation
#
#######################################################################################

nodes_set = NC_digraph.nodes()

def cascade(init_nodes, nodes_set_broadcast):#, dist_d):
    nodes_set = nodes_set_broadcast.value
    action = {}
    n = len(init_nodes)
    #np.random.seed(random_d)
    #init_nodes = np.random.choice(NC_digraph.nodes(), 1)[0]
# Work in progress - Use spark as ranker 

import findspark
import pyspark
import os

findspark.init(os.getenv('HOME') + '/spark-1.6.0-bin-hadoop2.6')
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-csv_2.10:1.3.0 pyspark-shell'

try:
	print(sc)
except NameError:
	sc = pyspark.SparkContext()
	print(sc)

import findspark
findspark.init('/home/oliver/Documents/spark-2.0.0-bin-hadoop2.7')
# findspark.add_packages(['org.apache.spark:spark-streaming-kafka-0-8-assembly_2.11:2.0.0-preview'])
from pyspark import SparkContext
# from pyspark.streaming.kafka import KafkaUtils
# from pyspark.streaming import StreamingContext

from pyspark.sql import SQLContext, Row


num_of_users = 1000
sc = SparkContext()
sqlContext = SQLContext(sc)

# Load a text file and convert each line to a Row.
lines = sc.textFile('user_scan_list_transform_'+str(num_of_users)+'.csv')
for name in lines.collect():
    print(name)
import findspark
findspark.init('/home/zishan/spark-2.2.1-bin-hadoop2.7')
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Popularity').getOrCreate()
data = spark.read.csv('OnlineNewsPopularity.csv',inferSchema=True,header=True)
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=['timedelta',
 'n_tokens_title',
 'n_tokens_content',
 'n_unique_tokens',
 'n_non_stop_words',
 'n_non_stop_unique_tokens',
 'num_hrefs',
 'num_self_hrefs',
 'num_imgs',
 'num_videos',
 'average_token_length',
 'num_keywords',
 'data_channel_is_lifestyle',
 'data_channel_is_entertainment',
 'data_channel_is_bus',
 'data_channel_is_socmed',
 'data_channel_is_tech',
 'data_channel_is_world',
 'self_reference_max_shares',
 'self_reference_avg_sharess',
 'weekday_is_monday',
 'weekday_is_tuesday',
 'weekday_is_wednesday',
 'weekday_is_thursday',
import seaborn as sns
sns.set_context('poster', font_scale=1.25)
import findspark as fs
fs.init()
import pyspark as ps
import numpy as np

# Assumes local options are already set in conf file...or else this explodes
config = ps.SparkConf()
config = config.setAppName('wiki_solver')
sc = ps.SparkContext(conf=config)

#### Create the network ####
num_partitions = 40

# Forces all links to be symmetric
links_raw_data = sc.textFile('links-simple-sorted.txt', minPartitions=num_partitions)
titles_raw_data = sc.textFile('titles-sorted.txt', minPartitions=num_partitions)

import re

def get_links(x):
    split = re.findall(r"[\w']+", x)
    parent = int(split[0])
    children = [int(z) for z in split[1:]]
    parent_to_children = [(parent, z) for z in children]
    children_to_parent = [(z, parent) for z in children]
    return parent_to_children + children_to_parent

all_links = links_raw_data.flatMap(get_links)
node_then_all_links = all_links.groupByKey()
Example #33
from django.http import HttpResponse
from django.shortcuts import render

import test_regression
import gen_random
import time
import numpy as np
import os
import datetime
import sys

import findspark

#options for server
findspark.init("/home/ubuntu/spark")
home_dir = "/home/ubuntu/CS205_Final_Project/web/mysite/"

#options for local
#home_dir = ""
#findspark.init()

import pyspark
sc = pyspark.SparkContext(appName="final")

# from GitHub
def cholesky_solution_linear_regression(x_t_x,x_t_y):    
    L = np.linalg.cholesky(x_t_x)    
    z = np.linalg.solve(L,x_t_y)    
    theta = np.linalg.solve(np.transpose(L),z)
    return theta
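A small worked example of the helper above, using plain NumPy inputs: it solves the normal equations X^T X * theta = X^T y via the Cholesky factorization and two triangular solves.

# Illustrative data: y = 1 + 2*x, fitted with a bias column and no noise.
X = np.array([[1.0, 0.0],
              [1.0, 1.0],
              [1.0, 2.0],
              [1.0, 3.0]])
y = np.array([1.0, 3.0, 5.0, 7.0])

theta = cholesky_solution_linear_regression(X.T.dot(X), X.T.dot(y))
print(theta)  # approximately [1.0, 2.0]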
def main(base_path):
  APP_NAME = "fetch_prediction_requests.py"
  
  # If there is no SparkSession, create the environment
  try:
    sc and spark
  except NameError as e:
    import findspark
    findspark.init()
    import pyspark
    import pyspark.sql
    
    sc = pyspark.SparkContext()
    spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate()
    
  # Load the on-time parquet file
  on_time_dataframe = spark.read.parquet('{}/data/on_time_performance.parquet'.format(base_path))
  on_time_dataframe.registerTempTable("on_time_performance")
  
  # Select a few features of interest
  simple_on_time_features = spark.sql("""
  SELECT
    FlightNum,
    FlightDate,
    DayOfWeek,
    DayofMonth AS DayOfMonth,
    CONCAT(Month, '-',  DayofMonth) AS DayOfYear,
    Carrier,
    Origin,
    Dest,
    Distance,
    DepDelay,
    ArrDelay,
    CRSDepTime,
    CRSArrTime
  FROM on_time_performance
  """)
  simple_on_time_features.show()
  
  # Filter nulls, they can't help us
  filled_on_time_features = simple_on_time_features.filter(
    simple_on_time_features.ArrDelay.isNotNull()
    &
    simple_on_time_features.DepDelay.isNotNull()
  )
  
  # We need to turn the scheduled times into proper timestamps, not strings or numbers
  def convert_hours(hours_minutes):
    hours = hours_minutes[:-2]
    minutes = hours_minutes[-2:]
    
    if hours == '24':
      hours = '23'
      minutes = '59'
    
    time_string = "{}:{}:00Z".format(hours, minutes)
    return time_string
  
  def compose_datetime(iso_date, time_string):
    return "{} {}".format(iso_date, time_string)
  
  def create_iso_string(iso_date, hours_minutes):
    time_string = convert_hours(hours_minutes)
    full_datetime = compose_datetime(iso_date, time_string)
    return full_datetime
  
  def create_datetime(iso_string):
    return iso8601.parse_date(iso_string)
  
  def convert_datetime(iso_date, hours_minutes):
    iso_string = create_iso_string(iso_date, hours_minutes)
    dt = create_datetime(iso_string)
    return dt
  
  def day_of_year(iso_date_string):
    dt = iso8601.parse_date(iso_date_string)
    doy = dt.timetuple().tm_yday
    return doy
  
  def alter_feature_datetimes(row):
    
    flight_date = iso8601.parse_date(row['FlightDate'])
    scheduled_dep_time = convert_datetime(row['FlightDate'], row['CRSDepTime'])
    scheduled_arr_time = convert_datetime(row['FlightDate'], row['CRSArrTime'])
    
    # Handle overnight flights
    if scheduled_arr_time < scheduled_dep_time:
      scheduled_arr_time += datetime.timedelta(days=1)
    
    doy = day_of_year(row['FlightDate'])
    
    return {
      'FlightNum': row['FlightNum'],
      'FlightDate': flight_date,
      'DayOfWeek': int(row['DayOfWeek']),
      'DayOfMonth': int(row['DayOfMonth']),
      'DayOfYear': doy,
      'Carrier': row['Carrier'],
      'Origin': row['Origin'],
      'Dest': row['Dest'],
      'Distance': row['Distance'],
      'DepDelay': row['DepDelay'],
      'ArrDelay': row['ArrDelay'],
      'CRSDepTime': scheduled_dep_time,
      'CRSArrTime': scheduled_arr_time,
    }
  
  timestamp_features = filled_on_time_features.rdd.map(alter_feature_datetimes)
  timestamp_df = timestamp_features.toDF()
  
  # Explicitly sort the data and keep it sorted throughout. Leave nothing to chance.
  sorted_features = timestamp_df.sort(
    timestamp_df.DayOfYear,
    timestamp_df.Carrier,
    timestamp_df.Origin,
    timestamp_df.Dest,
    timestamp_df.FlightNum,
    timestamp_df.CRSDepTime,
    timestamp_df.CRSArrTime,
  )
  
  # Store as a single json file and bzip2 it
  sorted_features.repartition(1).write.mode("overwrite").json("{}/data/simple_flight_delay_features.json".format(base_path))
  os.system("cp {}/data/simple_flight_delay_features.json/part* {}/data/simple_flight_delay_features.jsonl".format(base_path, base_path))
  os.system("bzip2 --best {}/data/simple_flight_delay_features.jsonl".format(base_path))
  os.system("bzcat {}/data/simple_flight_delay_features.jsonl.bz2 >> {}/data/simple_flight_delay_features.jsonl".format(base_path, base_path))
import findspark
import time
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import json
from networkx.readwrite import json_graph
findspark.init('/Users/zelongqiu/spark')
import pyspark
import influence_function
sc = pyspark.SparkContext()

# read graph
with open("graph/US.json", "r") as graph_data:
    graph_data = json.load(graph_data)
    NC_digraph = json_graph.node_link_graph(graph_data)

nc_N_width_nodes = [i for i in NC_digraph.nodes() if len(NC_digraph.succ[i]) >= 300]
nodes_set_broadcast = sc.broadcast(NC_digraph)
######################################################################################
#
#Influence Function Implementation
#
#######################################################################################


def cascade(init_nodes, nodes_set):#, dist_d):
    # nodes_set = nodes_set_broadcast
    action = {}
    n = len(init_nodes)
    #np.random.seed(random_d)
Example #36
# Starting spark
#Only use locally
import findspark
findspark.init()
import pyspark
sc = pyspark.SparkContext()
import numpy as np
import time
import csv
import itertools

#Making necessary imports
from decisionTree import *
from infoGain import *
from bootstrap import *
from multiParallelTree import *

#Loading the balance dataset
sample = []
with open('../data/balance-scale.csv', 'rb') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    for row in reader:
        sample.append((str(row[0]),row[1:]))

columns = ['Left-Weight','Left-Distance','Right-Weight','Right-Distance']
#Creating an RDD with bootstrapped data
# n is number of trees in our forest
start_time = time.time()
ntrees = 500
res,train_data = create_Forest(sc, sample, ntrees, columns, 3, discrete_column_ids=[], n_bins=10)
print "Time taken to train forest: ", time.time() - start_time
Example #37
# Birds Project

#import package
import findspark
findspark.init('/data/spark-1.6.0-bin-hadoop2.6')
from pyspark import SparkContext, HiveContext
from pyspark.sql import functions as F
from pyspark.sql import Window as w
%matplotlib inline
import seaborn as sns
import datetime as datetime
import pandas as pd
import numpy as np
import math
import time
from pyspark.sql.functions import UserDefinedFunction
from pyspark.sql.types import StringType, DoubleType, IntegerType
from shapely.wkb import loads
from shapely import wkt
from numpy.lib.stride_tricks import as_strided
from pyspark.sql import SQLContext

# load Spark and HiveContext
sc = SparkContext()
hc = HiveContext(sc)

# Set the area of Polderbaan and calculate the max relative distance of the target bird in each trajectory
# Extend the Polderbaan area in terms of the max distance

# the area of Polderbaan
l_lon = 4.706
Example #38
import findspark
import os
from nltk.sentiment.util import getPositiveWords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()
os.environ['LC_ALL'] = 'en_US.UTF-8'


os.environ['LANG'] = 'en_US.UTF-8'
import json
import path
findspark.init(spark_home='/Applications/spark-1.6.1')

from pyspark import SparkContext, SparkConf

import datetime


def getNum(rdd, word):  # return the total count for tweets containing the word
    return rdd.filter(lambda v1: word in v1[0]).map(lambda v1: v1[1]).sum()
def containVia(V):
    Iword=['via','Via']
    for iword in Iword:
        if iword in V:
            return 1
    return 0
def containStock(V):
    Iword='$'

    if Iword in V:
        return 1
Example #39
import findspark
findspark.init("/opt/spark")

import pyspark
from pyspark.sql.functions import *
from pyspark.sql.types import *

import heapq
import json
import logging
import re

import pandas as pd

import boto3
import spacy  # Used to split the Wikipedia articles into sentences

import text_to_cluster as cluster

# ——————— CONSTANTS —————————

# S3 bucket and region in which wikipedia data resides
S3_BUCKET = "datamuse-misc"
S3_REGION = "us-east-1"

# For each word, output this many example sentences
OUTPUTS_PER_WORD = 3000  #5

# File which contains a list of wikipedia data files (as S3 keys) to process
CORPUS_FILE = "enwiki.full"