Example No. 1
    def get_spark(self):
        if self._spark_context is None:
            ans = self.supervisor.request('get_spark')
            if 'error' in ans:
                raise ContextError(ans['error'])

            if self.verbose:
                print("get_spark answer from supervisor:\n"+json.dumps(ans, indent=4))

            # path to spark files
            spark_path = ans['path']

            # the dictionary stored in ans['config'] will be given to SparkConf directly
            # e.g. { "spark.master": "local[5]", "spark.app.name": "testapp" }
            spark_config = ans['config']

            import findspark
            findspark.init(spark_path)

            import pyspark
            import pymongo_spark

            pymongo_spark.activate()

            conf = pyspark.SparkConf()
            conf.setAll(spark_config.items())

            self._spark_context = pyspark.SparkContext(conf=conf)

        return self._spark_context
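
For context, a hedged sketch of the answer shape this getter expects from the supervisor; the concrete values below are assumptions taken from the comments above, not part of the original example.

# Purely illustrative: a hypothetical answer to self.supervisor.request('get_spark').
# The getter only relies on the 'path' and 'config' keys.
ans = {
    "path": "/opt/spark",           # handed to findspark.init()
    "config": {                     # handed to SparkConf().setAll()
        "spark.master": "local[5]",
        "spark.app.name": "testapp",
    },
}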
Example No. 2
def store_datasets(datasets):
    pymongo_spark.activate()
    mongo_url = config.MONGO_URI
    for (json_files, entities, vertical) in datasets:
        json_files.saveToMongoDB(mongo_url + "ground_truth.json_files_" +
                                 vertical)
        entities.saveToMongoDB(mongo_url + "ground_truth.entities_" + vertical)
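
A minimal usage sketch for store_datasets, assuming a running SparkContext; the input paths and vertical names are made up, and config.MONGO_URI is expected to be defined elsewhere.

# Hypothetical setup, for illustration only.
import json
from pyspark import SparkContext

sc = SparkContext(appName="store_ground_truth")

datasets = []
for vertical in ["auto", "book"]:   # assumed vertical names
    # assumed inputs: one JSON document per line
    json_files = sc.textFile("data/{}/pages.json".format(vertical)).map(json.loads)
    entities = sc.textFile("data/{}/entities.json".format(vertical)).map(json.loads)
    datasets.append((json_files, entities, vertical))

store_datasets(datasets)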
Example No. 3
def store_datasets(datasets):
    pymongo_spark.activate()
    mongo_url = "mongodb://localhost:27017/"
    for (json_files, entities, vertical) in datasets:
        json_files.saveToMongoDB(mongo_url + "ground_truth.json_files_" +
                                 vertical)
        entities.saveToMongoDB(mongo_url + "ground_truth.entities_" + vertical)
Example No. 4
def main():

    pymongo_spark.activate()
    conf = SparkConf().setMaster("local[2]").setAppName("Streamer")
    sc = SparkContext.getOrCreate(conf=conf)

    # Creating a streaming context with batch interval of 10 sec
    sc.setLogLevel("WARN")
    ssc = StreamingContext(sc, 10)
    ssc.checkpoint("checkpoint")
    stream(ssc, 300)
Example No. 5
def main(sc, db_dot_collection):
    mfrom = '*****@*****.**'
    #textRDD = sc.textFile(filename)
    #words = textRDD.flatMap(lambda x: x.split(' ')).map(lambda to: send_email(mfrom,to,msg)
    #wordcount = words.reduceByKey(add).collect()
    #for wc in wordcount:
    #    print wc[0],wc[1]
    msg = "hello"
    pymongo_spark.activate()
    rdd = (sc.mongoRDD('{0}{1}'.format(mongo_url, db_dot_collection)).map(
        lambda doc: send_email(mfrom, doc.get('email'), msg)))
    rdd.collect()
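
The snippet above relies on a module-level mongo_url and a send_email helper that are not shown; a hypothetical minimal version, assuming a local SMTP relay, could look like this.

# Illustrative stand-ins only; the real definitions are not part of this example.
import smtplib
from email.mime.text import MIMEText

mongo_url = "mongodb://localhost:27017/"        # assumed connection string

def send_email(mfrom, mto, msg):
    mime = MIMEText(msg)
    mime["Subject"] = "notification"            # assumed subject line
    mime["From"] = mfrom
    mime["To"] = mto
    with smtplib.SMTP("localhost") as server:   # assumed local SMTP relay
        server.send_message(mime)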
Example No. 6
    def export_to_mongo(self):
        """ Export data to MongoDB. """
        self.code = self.input

        # Remove cards for a specific set
        if self.code in self.mongodb_clean:
            from pymongo import MongoClient

            client = MongoClient("mongodb://localhost:27017/")
            db = client["mtggg"]
            col = db["cards"]

            col.delete_many({"keyruneCode": self.code})

        df = self.load_parquet_for_keyrune(self.code)

        import pymongo_spark

        pymongo_spark.activate()
        as_dict = df.rdd.map(lambda row: row.asDict())
        as_dict.saveToMongoDB(f"mongodb://localhost:27017/mtggg.cards")

        self.next(self.export_to_es)
Example No. 7
def main(iso_date, base_path):
    APP_NAME = "pyspark_task_two.py"

    # Create the Spark environment if a SparkSession does not exist yet
    try:
        sc and spark
    except NameError as e:
        import findspark
        findspark.init()
        import pyspark
        import pyspark.sql

        sc = pyspark.SparkContext()
        spark = pyspark.sql.SparkSession(sc).builder.appName(
            APP_NAME).getOrCreate()

    import pymongo
    import pymongo_spark
    # Important: activate pymongo_spark
    pymongo_spark.activate()

    # Get today's date
    today_dt = iso8601.parse_date(iso_date)
    rounded_today = today_dt.date()

    # Input path for today's data
    today_input_path = "{}/ch02/data/example_master_titles_daily.json/{}".format(
        base_path, rounded_today.isoformat())

    # Load the data and continue
    people_master_titles_raw = sc.textFile(today_input_path)
    people_master_titles = people_master_titles_raw.map(json.loads)
    print(people_master_titles.first())

    people_master_titles.saveToMongoDB(
        'mongodb://localhost:27017/agile_data_science.people_master_titles')
Example No. 8
# Load the parquet file
on_time_dataframe = spark.read.parquet('data/on_time_performance.parquet')
on_time_dataframe.registerTempTable("on_time_performance")

# Filter down to the fields we need to identify and link to a flight
flights = on_time_dataframe.rdd.map(lambda x: 
  (x.Carrier, x.FlightDate, x.FlightNum, x.Origin, x.Dest, x.TailNum)
  )

# Group flights by tail number, sorted by date, then flight number, then origin/dest
flights_per_airplane = flights\
  .map(lambda nameTuple: (nameTuple[5], [nameTuple[0:5]]))\
  .reduceByKey(lambda a, b: a + b)\
  .map(lambda tuple:
      {
        'TailNum': tuple[0], 
        'Flights': sorted(tuple[1], key=lambda x: (x[1], x[2], x[3], x[4]))
      }
    )
flights_per_airplane.first()

# Save to Mongo
import pymongo_spark
pymongo_spark.activate()
flights_per_airplane.saveToMongoDB('mongodb://localhost:27017/agile_data_science.flights_per_airplane')
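
An optional check, assuming a local MongoDB instance and the pymongo driver, to confirm that the save above produced documents with the expected fields.

# Read back one of the documents written by saveToMongoDB.
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017/")
doc = client.agile_data_science.flights_per_airplane.find_one()
print(doc["TailNum"], len(doc["Flights"]))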

Example No. 9
from pyspark import SparkConf
from pyspark import SparkContext

import json
from math import radians, sin, asin, cos, sqrt
import geopy
from geopy.distance import VincentyDistance

from bs4 import BeautifulSoup
import lxml
import requests
from lxml.html.clean import Cleaner
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words

import pymongo_spark
pymongo_spark.activate()


def fun(x, y):
    return x, y + 1


def getConf():
    with open('conf.json') as data_file:
        data = json.load(data_file)
    return data


def f_parse(args):
    def isAlphabet(word):
Example No. 10
    def setUpClass(cls):
        pymongo_spark.activate()
        cls.sc = pyspark.SparkContext(appName='test_pickle', master='local')
        cls.client = pymongo.MongoClient(MONGO_HOST, MONGO_PORT)
        cls.coll = cls.client.mongo_hadoop.test
        cls.output_coll = cls.client.mongo_hadoop.test.output
Example No. 11
# run the program with: spark-submit "program name"
# import the following modules
import pymongo  # Python MongoDB driver
import pymongo_spark  # Spark support for pymongo
import json  # JSON encoding/decoding
from pymongo import MongoClient  # MongoClient connects to the database
from pyspark import SparkContext, SparkConf  # Spark contexts
from pyspark.sql import SQLContext  # SQL context for Spark
from pyspark.sql import SparkSession  # Spark session support

pymongo_spark.activate()  # activate the pymongo-spark connector
conf = SparkConf()  # define spark configuration
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
client = MongoClient()  # connect to the default local MongoDB server
db = client.test  # use the 'test' database
print('database  ', db)  # print the database currently in use
path = "hdfs://localhost:9000/home/hadoop/h_data/sales_ord_univ.csv"  # define the HDFS file path
df = sqlContext.read.format("com.databricks.spark.csv").option(
    "header", "true"
).option("inferSchema", "true").load(
    path
)  # read the CSV at 'path' into a Spark SQL DataFrame 'df' with an inferred schema
df_pandas = df.toPandas()  # convert the Spark DataFrame to a pandas DataFrame
records = json.loads(
    df_pandas.T.to_json()).values()  # turn the pandas DataFrame into JSON documents
db.test76.insert_many(
    list(records)
)  # insert the JSON documents into the 'test76' collection of the current database 'db'
Example No. 12
def activate():
    pymongo_spark.activate()
    print('activated')
Example No. 13
    def setUpClass(cls):
        pymongo_spark.activate()
        cls.sc = pyspark.SparkContext(appName='test_pickle', master='local')
        cls.client = pymongo.MongoClient(MONGO_HOST, MONGO_PORT)
        cls.coll = cls.client.mongo_hadoop.test
        cls.output_coll = cls.client.mongo_hadoop.test.output
Example No. 14
def main(base_path):

    APP_NAME = "make_predictions_streaming.py"

    # Process data every 10 seconds
    PERIOD = 10
    BROKERS = 'localhost:9092'
    PREDICTION_TOPIC = 'flight_delay_classification_request'

    try:
        sc and ssc
    except NameError as e:
        #import findspark

        # Add the streaming package and initialize
        #findspark.add_packages(["org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0"])
        #findspark.init()

        import pyspark
        import pyspark.sql
        import pyspark.streaming

        from pyspark import SparkContext, SparkConf
        from pyspark.sql import SparkSession, Row
        from pyspark.streaming import StreamingContext
        from pyspark.streaming.kafka import KafkaUtils

        import pymongo_spark
        pymongo_spark.activate()

        conf = SparkConf().set("spark.default.parallelism", 1)
        sc = SparkContext(
            appName="Agile Data Science: PySpark Streaming 'Hello, World!'",
            conf=conf)
        ssc = StreamingContext(sc, PERIOD)
        spark = pyspark.sql.SparkSession(sc).builder.appName(
            APP_NAME).getOrCreate()

    #
    # Load all models to be used in making predictions
    #

    # Load the arrival delay bucketizer
    from pyspark.ml.feature import Bucketizer
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(
        base_path)
    arrival_bucketizer = Bucketizer.load(arrival_bucketizer_path)

    # Load all the string field vectorizer pipelines into a dict
    from pyspark.ml.feature import StringIndexerModel

    string_indexer_models = {}
    for column in ["Carrier", "Origin", "Dest", "Route"]:
        string_indexer_model_path = "{}/models/string_indexer_model_{}.bin".format(
            base_path, column)
        string_indexer_model = StringIndexerModel.load(
            string_indexer_model_path)
        string_indexer_models[column] = string_indexer_model

    # Load the numeric vector assembler
    from pyspark.ml.feature import VectorAssembler
    vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(
        base_path)
    vector_assembler = VectorAssembler.load(vector_assembler_path)

    # Load the classifier model
    from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
    random_forest_model_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
        base_path)
    rfc = RandomForestClassificationModel.load(random_forest_model_path)

    #
    # Process Prediction Requests in Streaming
    #
    stream = KafkaUtils.createDirectStream(ssc, [PREDICTION_TOPIC], {
        "metadata.broker.list": BROKERS,
        "group.id": "0",
    })

    object_stream = stream.map(lambda x: json.loads(x[1]))
    object_stream.pprint()

    row_stream = object_stream.map(
        lambda x: Row(FlightDate=iso8601.parse_date(x['FlightDate']),
                      Origin=x['Origin'],
                      Distance=x['Distance'],
                      DayOfMonth=x['DayOfMonth'],
                      DayOfYear=x['DayOfYear'],
                      UUID=x['UUID'],
                      DepDelay=x['DepDelay'],
                      DayOfWeek=x['DayOfWeek'],
                      FlightNum=x['FlightNum'],
                      Dest=x['Dest'],
                      Timestamp=iso8601.parse_date(x['Timestamp']),
                      Carrier=x['Carrier']))
    row_stream.pprint()

    #
    # Create a dataframe from the RDD-based object stream
    #

    def classify_prediction_requests(rdd):

        from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType, TimestampType
        from pyspark.sql.types import StructType, StructField

        prediction_request_schema = StructType([
            StructField("Carrier", StringType(), True),
            StructField("DayOfMonth", IntegerType(), True),
            StructField("DayOfWeek", IntegerType(), True),
            StructField("DayOfYear", IntegerType(), True),
            StructField("DepDelay", DoubleType(), True),
            StructField("Dest", StringType(), True),
            StructField("Distance", DoubleType(), True),
            StructField("FlightDate", DateType(), True),
            StructField("FlightNum", StringType(), True),
            StructField("Origin", StringType(), True),
            StructField("Timestamp", TimestampType(), True),
            StructField("UUID", StringType(), True),
        ])

        prediction_requests_df = spark.createDataFrame(
            rdd, schema=prediction_request_schema)
        prediction_requests_df.show()

        #
        # Add a Route variable to replace FlightNum
        #

        from pyspark.sql.functions import lit, concat
        prediction_requests_with_route = prediction_requests_df.withColumn(
            'Route',
            concat(prediction_requests_df.Origin, lit('-'),
                   prediction_requests_df.Dest))
        prediction_requests_with_route.show(6)

        # Vectorize string fields with the corresponding pipeline for that column
        # Turn category fields into categoric feature vectors, then drop intermediate fields
        for column in ["Carrier", "Origin", "Dest", "Route"]:
            string_indexer_model = string_indexer_models[column]
            prediction_requests_with_route = string_indexer_model.transform(
                prediction_requests_with_route)

        # Vectorize numeric columns: DepDelay, Distance and index columns
        final_vectorized_features = vector_assembler.transform(
            prediction_requests_with_route)

        # Inspect the vectors
        final_vectorized_features.show()

        # Drop the individual index columns
        index_columns = [
            "Carrier_index", "Origin_index", "Dest_index", "Route_index"
        ]
        for column in index_columns:
            final_vectorized_features = final_vectorized_features.drop(column)

        # Inspect the finalized features
        final_vectorized_features.show()

        # Make the prediction
        predictions = rfc.transform(final_vectorized_features)

        # Drop the features vector and prediction metadata to give the original fields
        predictions = predictions.drop("Features_vec")
        final_predictions = predictions.drop("indices").drop("values").drop(
            "rawPrediction").drop("probability")

        # Inspect the output
        final_predictions.show()

        # Store to Mongo
        if final_predictions.count() > 0:
            final_predictions.rdd.map(lambda x: x.asDict()).saveToMongoDB(
                "mongodb://localhost:27017/agile_data_science.flight_delay_classification_response"
            )

    # Do the classification and store to Mongo
    row_stream.foreachRDD(classify_prediction_requests)

    ssc.start()
    ssc.awaitTermination()
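
For reference, a hedged sketch of a client that could feed this stream. It assumes the kafka-python package; the topic name, broker address, and field names mirror the code above, while every field value is made up.

import datetime
import json
import uuid

from kafka import KafkaProducer

# Serialize each request dict to the JSON the stream expects.
producer = KafkaProducer(
    bootstrap_servers="localhost:9092",
    value_serializer=lambda d: json.dumps(d).encode("utf-8"),
)

# Illustrative prediction request; all values are assumptions.
request = {
    "Carrier": "AA", "DayOfMonth": 1, "DayOfWeek": 5, "DayOfYear": 1,
    "DepDelay": 5.0, "Dest": "SFO", "Distance": 2475.0,
    "FlightDate": "2016-01-01", "FlightNum": "1", "Origin": "JFK",
    "Timestamp": datetime.datetime.utcnow().isoformat(),
    "UUID": str(uuid.uuid4()),
}
producer.send("flight_delay_classification_request", request)
producer.flush()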