def get_spark(self):
    if self._spark_context is None:
        ans = self.supervisor.request('get_spark')
        if 'error' in ans:
            raise ContextError(ans['error'])
        if self.verbose:
            print("get_spark answer from supervisor:\n" + json.dumps(ans, indent=4))
        # path to the Spark installation
        spark_path = ans['path']
        # the dictionary stored in ans['config'] is handed to SparkConf directly,
        # e.g. { "spark.master": "local[5]", "spark.app.name": "testapp" }
        spark_config = ans['config']
        import findspark
        findspark.init(spark_path)
        import pyspark
        import pymongo_spark
        pymongo_spark.activate()
        conf = pyspark.SparkConf()
        conf.setAll(spark_config.items())
        self._spark_context = pyspark.SparkContext(conf=conf)
    return self._spark_context
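# A minimal usage sketch (not from the original source): `ctx` stands for a
# hypothetical instance of the class above and "mydb.docs" for a hypothetical
# MongoDB collection. Because get_spark() already calls pymongo_spark.activate(),
# the returned SparkContext carries the patched mongoRDD()/saveToMongoDB() helpers.
sc = ctx.get_spark()
docs = sc.mongoRDD('mongodb://localhost:27017/mydb.docs')
print(docs.count())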
def store_datasets(datasets):
    pymongo_spark.activate()
    mongo_url = config.MONGO_URI
    for (json_files, entities, vertical) in datasets:
        json_files.saveToMongoDB(mongo_url + "ground_truth.json_files_" + vertical)
        entities.saveToMongoDB(mongo_url + "ground_truth.entities_" + vertical)
def store_datasets(datasets):
    pymongo_spark.activate()
    mongo_url = "mongodb://localhost:27017/"
    for (json_files, entities, vertical) in datasets:
        json_files.saveToMongoDB(mongo_url + "ground_truth.json_files_" + vertical)
        entities.saveToMongoDB(mongo_url + "ground_truth.entities_" + vertical)
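# A hedged sketch of how store_datasets() might be invoked; the sample documents
# and the "shop" vertical below are made up, and a local MongoDB instance is
# assumed to be running.
import pyspark
import pymongo_spark

pymongo_spark.activate()
sc = pyspark.SparkContext(master="local[2]", appName="store_datasets_example")

json_files = sc.parallelize([{"_id": "page-1", "html": "<html>...</html>"}])
entities = sc.parallelize([{"_id": "entity-1", "name": "example entity"}])
store_datasets([(json_files, entities, "shop")])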
def main():
    pymongo_spark.activate()
    conf = SparkConf().setMaster("local[2]").setAppName("Streamer")
    sc = SparkContext.getOrCreate(conf=conf)
    sc.setLogLevel("WARN")
    # Create a streaming context with a batch interval of 10 seconds
    ssc = StreamingContext(sc, 10)
    ssc.checkpoint("checkpoint")
    stream(ssc, 300)
def main(sc, db_dot_collection):
    mfrom = '*****@*****.**'
    #textRDD = sc.textFile(filename)
    #words = textRDD.flatMap(lambda x: x.split(' ')).map(lambda to: send_email(mfrom, to, msg))
    #wordcount = words.reduceByKey(add).collect()
    #for wc in wordcount:
    #    print wc[0], wc[1]
    msg = "hello"
    pymongo_spark.activate()
    rdd = (sc.mongoRDD('{0}{1}'.format(mongo_url, db_dot_collection))
             .map(lambda doc: send_email(mfrom, doc.get('email'), msg)))
    rdd.collect()
def export_to_mongo(self):
    """
    Export data to MongoDB.
    """
    self.code = self.input

    # Remove cards for a specific set
    if self.code in self.mongodb_clean:
        from pymongo import MongoClient
        client = MongoClient("mongodb://localhost:27017/")
        db = client["mtggg"]
        col = db["cards"]
        col.delete_many({"keyruneCode": self.code})

    df = self.load_parquet_for_keyrune(self.code)

    import pymongo_spark
    pymongo_spark.activate()

    as_dict = df.rdd.map(lambda row: row.asDict())
    as_dict.saveToMongoDB("mongodb://localhost:27017/mtggg.cards")

    self.next(self.export_to_es)
def main(iso_date, base_path):
    APP_NAME = "pyspark_task_two.py"

    # Create the environment if there is no SparkSession yet
    try:
        sc and spark
    except NameError as e:
        import findspark
        findspark.init()
        import pyspark
        import pyspark.sql
        sc = pyspark.SparkContext()
        spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate()

    import pymongo
    import pymongo_spark
    # Important: activate pymongo_spark
    pymongo_spark.activate()

    # Get today's date
    today_dt = iso8601.parse_date(iso_date)
    rounded_today = today_dt.date()

    # Input path for today's date
    today_input_path = "{}/ch02/data/example_master_titles_daily.json/{}".format(
        base_path, rounded_today.isoformat())

    # Load the data and continue
    people_master_titles_raw = sc.textFile(today_input_path)
    people_master_titles = people_master_titles_raw.map(json.loads)
    print(people_master_titles.first())

    people_master_titles.saveToMongoDB(
        'mongodb://localhost:27017/agile_data_science.people_master_titles')
# Load the parquet file
on_time_dataframe = spark.read.parquet('data/on_time_performance.parquet')
on_time_dataframe.registerTempTable("on_time_performance")

# Filter down to the fields we need to identify and link to a flight
flights = on_time_dataframe.rdd.map(
    lambda x: (x.Carrier, x.FlightDate, x.FlightNum, x.Origin, x.Dest, x.TailNum)
)

# Group flights by tail number, sorted by date, then flight number, then origin/dest
flights_per_airplane = flights\
    .map(lambda nameTuple: (nameTuple[5], [nameTuple[0:5]]))\
    .reduceByKey(lambda a, b: a + b)\
    .map(lambda tuple: {
        'TailNum': tuple[0],
        'Flights': sorted(tuple[1], key=lambda x: (x[1], x[2], x[3], x[4]))
    })
flights_per_airplane.first()

# Save to Mongo
import pymongo_spark
pymongo_spark.activate()
flights_per_airplane.saveToMongoDB(
    'mongodb://localhost:27017/agile_data_science.flights_per_airplane')
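# A hedged follow-up (not in the original source): read the saved collection back
# through the mongoRDD() helper that pymongo_spark patches onto SparkContext,
# assuming the same local MongoDB instance.
airplanes = spark.sparkContext.mongoRDD(
    'mongodb://localhost:27017/agile_data_science.flights_per_airplane')
print(airplanes.first())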
from pyspark import SparkConf
from pyspark import SparkContext
from math import radians, sin, asin, cos, sqrt
import json
import geopy
from geopy.distance import VincentyDistance
from bs4 import BeautifulSoup
import lxml
import requests
from lxml.html.clean import Cleaner
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
import pymongo_spark

pymongo_spark.activate()


def fun(x, y):
    return x, y + 1


def getConf():
    with open('conf.json') as data_file:
        data = json.load(data_file)
    return data


def f_parse(args):
    def isAlphabet(word):
def setUpClass(cls):
    pymongo_spark.activate()
    cls.sc = pyspark.SparkContext(appName='test_pickle', master='local')
    cls.client = pymongo.MongoClient(MONGO_HOST, MONGO_PORT)
    cls.coll = cls.client.mongo_hadoop.test
    cls.output_coll = cls.client.mongo_hadoop.test.output
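# A companion sketch (an assumption, not shown in the original source): a
# tearDownClass that releases what setUpClass opened, mirroring the attribute
# names above.
@classmethod
def tearDownClass(cls):
    cls.coll.drop()
    cls.output_coll.drop()
    cls.client.close()
    cls.sc.stop()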
# Run the program using: spark-submit "program name"

# Import the following modules
import pymongo                                # Python MongoDB connector
import pymongo_spark                          # Spark module for pymongo
import json                                   # JSON format support
from pymongo import MongoClient               # MongoClient, to connect to the database
from pyspark import SparkContext, SparkConf   # Spark contexts
from pyspark.sql import SQLContext            # SQL context for Spark
from pyspark.sql import SparkSession          # Spark session

pymongo_spark.activate()  # activate the pymongo-spark connector

conf = SparkConf()        # define the Spark configuration
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

client = MongoClient()    # client connected to the MongoClient() server
db = client.test          # use the 'test' database on the MongoClient() server
print('database ', db)    # print the database currently in use

# Define the HDFS file path
path = "hdfs://localhost:9000/home/hadoop/h_data/sales_ord_univ.csv"

# Use the path to create a Spark SQL DataFrame 'df' with an inferred schema
df = sqlContext.read.format("com.databricks.spark.csv").option(
    "header", "true").option("inferSchema", "true").load(path)

df_pandas = df.toPandas()  # create a pandas DataFrame from the Spark DataFrame
records = json.loads(df_pandas.T.to_json()).values()  # convert the DataFrame to JSON records

# Insert the JSON "records" into collection "test76" of the current database "db"
db.test76.insert(records)
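# A hedged sanity check (not in the original): fetch one inserted record back from
# the "test76" collection with the same pymongo client to confirm the write.
print(db.test76.find_one())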
def activate():
    pymongo_spark.activate()
    print('activated')
def main(base_path):
    APP_NAME = "make_predictions_streaming.py"

    # Process data every 10 seconds
    PERIOD = 10
    BROKERS = 'localhost:9092'
    PREDICTION_TOPIC = 'flight_delay_classification_request'

    try:
        sc and ssc
    except NameError as e:
        #import findspark
        # Add the streaming package and initialize
        #findspark.add_packages(["org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0"])
        #findspark.init()

        import pyspark
        import pyspark.sql
        import pyspark.streaming
        from pyspark import SparkContext, SparkConf
        from pyspark.sql import SparkSession, Row
        from pyspark.streaming import StreamingContext
        from pyspark.streaming.kafka import KafkaUtils

        import pymongo_spark
        pymongo_spark.activate()

        conf = SparkConf().set("spark.default.parallelism", 1)
        sc = SparkContext(
            appName="Agile Data Science: PySpark Streaming 'Hello, World!'",
            conf=conf)
        ssc = StreamingContext(sc, PERIOD)
        spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate()

    #
    # Load all models to be used in making predictions
    #

    # Load the arrival delay bucketizer
    from pyspark.ml.feature import Bucketizer
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(base_path)
    arrival_bucketizer = Bucketizer.load(arrival_bucketizer_path)

    # Load all the string field vectorizer pipelines into a dict
    from pyspark.ml.feature import StringIndexerModel
    string_indexer_models = {}
    for column in ["Carrier", "Origin", "Dest", "Route"]:
        string_indexer_model_path = "{}/models/string_indexer_model_{}.bin".format(
            base_path, column)
        string_indexer_model = StringIndexerModel.load(string_indexer_model_path)
        string_indexer_models[column] = string_indexer_model

    # Load the numeric vector assembler
    from pyspark.ml.feature import VectorAssembler
    vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(base_path)
    vector_assembler = VectorAssembler.load(vector_assembler_path)

    # Load the classifier model
    from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
    random_forest_model_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
        base_path)
    rfc = RandomForestClassificationModel.load(random_forest_model_path)

    #
    # Process Prediction Requests in Streaming
    #
    stream = KafkaUtils.createDirectStream(ssc, [PREDICTION_TOPIC], {
        "metadata.broker.list": BROKERS,
        "group.id": "0",
    })

    object_stream = stream.map(lambda x: json.loads(x[1]))
    object_stream.pprint()

    row_stream = object_stream.map(
        lambda x: Row(FlightDate=iso8601.parse_date(x['FlightDate']),
                      Origin=x['Origin'],
                      Distance=x['Distance'],
                      DayOfMonth=x['DayOfMonth'],
                      DayOfYear=x['DayOfYear'],
                      UUID=x['UUID'],
                      DepDelay=x['DepDelay'],
                      DayOfWeek=x['DayOfWeek'],
                      FlightNum=x['FlightNum'],
                      Dest=x['Dest'],
                      Timestamp=iso8601.parse_date(x['Timestamp']),
                      Carrier=x['Carrier']))
    row_stream.pprint()

    #
    # Create a dataframe from the RDD-based object stream
    #
    def classify_prediction_requests(rdd):
        from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType, TimestampType
        from pyspark.sql.types import StructType, StructField

        prediction_request_schema = StructType([
            StructField("Carrier", StringType(), True),
            StructField("DayOfMonth", IntegerType(), True),
            StructField("DayOfWeek", IntegerType(), True),
            StructField("DayOfYear", IntegerType(), True),
            StructField("DepDelay", DoubleType(), True),
            StructField("Dest", StringType(), True),
            StructField("Distance", DoubleType(), True),
            StructField("FlightDate", DateType(), True),
            StructField("FlightNum", StringType(), True),
            StructField("Origin", StringType(), True),
            StructField("Timestamp", TimestampType(), True),
            StructField("UUID", StringType(), True),
        ])

        prediction_requests_df = spark.createDataFrame(
            rdd, schema=prediction_request_schema)
        prediction_requests_df.show()

        #
        # Add a Route variable to replace FlightNum
        #
        from pyspark.sql.functions import lit, concat
        prediction_requests_with_route = prediction_requests_df.withColumn(
            'Route',
            concat(prediction_requests_df.Origin, lit('-'), prediction_requests_df.Dest))
        prediction_requests_with_route.show(6)

        # Vectorize string fields with the corresponding pipeline for that column
        # Turn category fields into categoric feature vectors, then drop intermediate fields
        for column in ["Carrier", "Origin", "Dest", "Route"]:
            string_indexer_model = string_indexer_models[column]
            prediction_requests_with_route = string_indexer_model.transform(
                prediction_requests_with_route)

        # Vectorize numeric columns: DepDelay, Distance and index columns
        final_vectorized_features = vector_assembler.transform(
            prediction_requests_with_route)

        # Inspect the vectors
        final_vectorized_features.show()

        # Drop the individual index columns
        index_columns = ["Carrier_index", "Origin_index", "Dest_index", "Route_index"]
        for column in index_columns:
            final_vectorized_features = final_vectorized_features.drop(column)

        # Inspect the finalized features
        final_vectorized_features.show()

        # Make the prediction
        predictions = rfc.transform(final_vectorized_features)

        # Drop the features vector and prediction metadata to give the original fields
        predictions = predictions.drop("Features_vec")
        final_predictions = predictions.drop("indices").drop("values").drop(
            "rawPrediction").drop("probability")

        # Inspect the output
        final_predictions.show()

        # Store to Mongo
        if final_predictions.count() > 0:
            final_predictions.rdd.map(lambda x: x.asDict()).saveToMongoDB(
                "mongodb://localhost:27017/agile_data_science.flight_delay_classification_response")

    # Do the classification and store to Mongo
    row_stream.foreachRDD(classify_prediction_requests)

    ssc.start()
    ssc.awaitTermination()