def update_models():
    # Reload tokenizer, hashing_tf, idf_model, nb_model and the tag_catId map.
    # The global declaration is required: without it the assignments below
    # would only bind local names and the running job would keep the old models.
    global tokenizer, hashing_tf, idf_model, nb_model
    logger.debug('===================================================Starting load models===================================================')
    try:
        logger.debug('Loading tokenizer model')
        new_tokenizer = Tokenizer.load(tokenizer_file)
        logger.debug('Loaded tokenizer model successfully')
    except Exception as e:
        logger.debug('Failed to load tokenizer: %s' % e)
        return
    try:
        logger.debug('Loading hashing_tf model')
        new_hashing_tf = HashingTF.load(hashing_tf_file)
        logger.debug('Loaded hashing_tf model successfully')
    except Exception as e:
        logger.debug('Failed to load hashing_tf: %s' % e)
        return
    try:
        logger.debug('Loading idf_model')
        new_idf_model = IDFModel.load(idf_model_file)
        logger.debug('Loaded IDFModel successfully')
    except Exception as e:
        logger.debug('Failed to load IDFModel: %s' % e)
        return
    try:
        logger.debug('Loading nb_model')
        new_nb_model = NaiveBayesModel.load(nb_model_file)
        logger.debug('Loaded NaiveBayesModel successfully')
    except Exception as e:
        logger.debug('Failed to load NaiveBayesModel: %s' % e)
        return
    # Swap in all four models only after every load succeeded, so a failed
    # load can never leave a partially updated set of models.
    tokenizer = new_tokenizer
    hashing_tf = new_hashing_tf
    idf_model = new_idf_model
    nb_model = new_nb_model
    logger.debug('Updated models successfully')
    logger.debug('===================================================Stopped load models===================================================')
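# update_models only becomes useful if something re-runs it; a minimal
# scheduling sketch on a background timer, assuming a reload interval
# (MODEL_RELOAD_INTERVAL_SEC is a hypothetical name, not from the original):

import threading

MODEL_RELOAD_INTERVAL_SEC = 600  # assumed interval: reload every 10 minutes

def schedule_model_updates():
    # Run update_models now, then re-arm a daemon timer so reloads keep
    # happening in the background without blocking the streaming job.
    update_models()
    timer = threading.Timer(MODEL_RELOAD_INTERVAL_SEC, schedule_model_updates)
    timer.daemon = True
    timer.start()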
except KafkaError as ke:
    logger.debug('Failed to start kafka producer, caused by %s' % str(ke))

try:
    # Create a direct DStream from the kafka topic; createDirectStream takes
    # a list of topics and a dict of kafka parameters.
    directKafkaStream = KafkaUtils.createDirectStream(
        ssc, [kafka_topic], {'metadata.broker.list': broker_ip})
    logger.debug('Created direct dstream from kafka successfully')
except Exception as e:
    logger.debug('Unable to create dstream from kafka: %s' % e)

atexit.register(shutdown_hook, kafka_producer, spark)

# Load in tokenizer, hashing_tf, idf_model, nb_model and the tag <-> catId maps.
try:
    logger.debug('Loading models')
    tokenizer = Tokenizer.load(tokenizer_file)
    hashing_tf = HashingTF.load(hashing_tf_file)
    idf_model = IDFModel.load(idf_model_file)
    nb_model = NaiveBayesModel.load(nb_model_file)
    # Build both directions of the tag <-> category-id mapping from the csv
    # of selected tags, then broadcast them so executors can use them in udfs.
    selected_tags = pd.read_csv(selected_tags_file, header=None)
    local_catId_to_tags = dict(zip(list(selected_tags.index), selected_tags[0]))
    local_tags_to_catId = dict(zip(selected_tags[0], list(selected_tags.index)))
    catId_to_tags = sc.broadcast(local_catId_to_tags)
    tags_to_catId = sc.broadcast(local_tags_to_catId)
    tags_to_catId_transform = udf(lambda tag: float(tags_to_catId.value[tag]), FloatType())
    catId_to_tags_transform = udf(lambda catId: catId_to_tags.value[catId], StringType())
    logger.debug('Loaded models successfully')
except Exception as e:
    logger.debug('Failed to load models: %s' % e)
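# shutdown_hook is registered with atexit above but not shown in this section;
# a minimal sketch of what it might do, assuming the kafka-python producer API
# (flush/close) and a SparkSession passed in as spark:

def shutdown_hook(producer, spark_session):
    # Flush any buffered messages, close the producer, then stop Spark,
    # so an interrupted run does not silently drop queued kafka records.
    try:
        logger.debug('Flushing pending kafka messages')
        producer.flush(10)
    except KafkaError as ke:
        logger.debug('Failed to flush kafka messages: %s' % str(ke))
    finally:
        try:
            producer.close(10)
        except Exception as e:
            logger.debug('Failed to close kafka producer: %s' % e)
        spark_session.stop()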
        queries = idf_model.transform(queries)
        queries = scalerModel.transform(queries)
        preds = model.transform(queries)
        preds.select('payload', 'prediction').show()
    except Exception:
        print('No data')


APP_NAME = "BigData"
conf = pyspark.SparkConf().setAll([
    ('spark.app.name', APP_NAME),
    ('spark.executor.memory', '8g'),
    ('spark.cores.max', '2'),
    ('spark.driver.memory', '8g')])
sc = SparkContext(conf=conf)
sqlc = SQLContext(sc)
ngrams = udf(to_ngram, StringType())

# Load the fitted pipeline stages saved during training.
tokenizer = Tokenizer.load('models/Tokenizer')
vectorizer = CountVectorizerModel.load('models/Vectorizer')
idf_model = IDFModel.load('models/idf')
scalerModel = StandardScalerModel.load('models/scalerModel')
model = LogisticRegressionModel.load('models/Logistic_Regression_Model')

# Score the socket stream in 3-second micro-batches.
ssc = StreamingContext(sc, batchDuration=3)
lines = ssc.socketTextStream("localhost", 9999)
lines.foreachRDD(get_prediction)
ssc.start()
ssc.awaitTermination()
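# socketTextStream connects to localhost:9999 as a client, so something must
# already be listening there when the job starts, e.g. `nc -lk 9999`. A
# minimal test harness in Python, assuming it runs in a separate process
# (send_test_lines is an illustration, not part of the original):

import socket
import time

def send_test_lines(lines, host='localhost', port=9999):
    # Listen on the port socketTextStream connects to and write one
    # document per line; each 3-second micro-batch is then scored by
    # get_prediction and shown via preds.select(...).show().
    srv = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    srv.bind((host, port))
    srv.listen(1)
    conn, _ = srv.accept()
    try:
        for line in lines:
            conn.sendall((line + '\n').encode('utf-8'))
            time.sleep(1)
    finally:
        conn.close()
        srv.close()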