Example #1
from pyspark.ml.feature import HashingTF, IDFModel, Tokenizer
from pyspark.ml.classification import NaiveBayesModel


def update_models():
    # Reload the tokenizer, hashing_tf, idf_model and nb_model from disk
    logger.debug(
        '===================================================Starting load models==================================================='
    )
    # Rebind the module-level models so callers see the refreshed versions
    global tokenizer, hashing_tf, idf_model, nb_model

    try:
        logger.debug('Loading tokenizer model')
        new_tokenizer = Tokenizer.load(tokenizer_file)
        logger.debug('Loaded tokenizer model successfully')
    except Exception:
        logger.debug('Failed to load tokenizer')

    try:
        logger.debug('Loading hashing_tf model')
        new_hashing_tf = HashingTF.load(hashing_tf_file)
        logger.debug('Loaded hashing_tf model successfully')
    except Exception:
        logger.debug('Failed to load hashing_tf')

    try:
        logger.debug('Loading idf_model')
        new_idf_model = IDFModel.load(idf_model_file)
        logger.debug('Loaded IDFModel successfully')
    except Exception:
        logger.debug('Failed to load IDFModel')

    try:
        logger.debug('Loading nb_model')
        new_nb_model = NaiveBayesModel.load(nb_model_file)
        logger.debug('Loaded NaiveBayesModel successfully')
    except Exception:
        logger.debug('Failed to load NaiveBayesModel')

    try:
        # If any load above failed, the corresponding new_* name is unbound,
        # the NameError below is caught, and the old models stay in place.
        logger.debug('Updating models')
        tokenizer = new_tokenizer
        hashing_tf = new_hashing_tf
        idf_model = new_idf_model
        nb_model = new_nb_model
        logger.debug('Updated models successfully')
    except Exception:
        logger.debug('Failed to update models')
    logger.debug(
        '===================================================Finished loading models==================================================='
    )
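
Since update_models rebinds module-level globals, it can run on a background timer while the stream keeps serving predictions. A minimal sketch of such a refresh loop, assuming this same module; the 3600-second interval is an arbitrary choice:

import threading

MODEL_REFRESH_INTERVAL = 3600  # seconds; arbitrary value for this sketch

def schedule_model_updates():
    # Reload the models, then re-arm the timer so the refresh repeats.
    update_models()
    timer = threading.Timer(MODEL_REFRESH_INTERVAL, schedule_model_updates)
    timer.daemon = True
    timer.start()

schedule_model_updates()
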
Example #2
	except KafkaError as ke:
		logger.debug('Failed to start kafka producer, caused by %s', ke)

	try:
		# Create dstream from kafka topic (kafka_topic is expected to be a
		# list of topic names)
		directKafkaStream = KafkaUtils.createDirectStream(ssc, kafka_topic, {'metadata.broker.list': broker_ip})
		logger.debug('Created direct dstream from kafka successfully')
	except Exception:
		logger.debug('Unable to create dstream from kafka')

	atexit.register(shutdown_hook, kafka_producer, spark)
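
The shutdown_hook registered above is not shown in this excerpt. A minimal sketch of what it might look like, assuming kafka-python's KafkaProducer (flush and close are its real methods) and that spark exposes stop(), as both SparkContext and StreamingContext do:

	def shutdown_hook(producer, spark_ctx):
		# Flush buffered messages and close the producer before exit,
		# then stop Spark so the job shuts down cleanly.
		try:
			producer.flush(10)
			producer.close(10)
		except KafkaError as ke:
			logger.debug('Failed to close kafka producer, caused by %s', ke)
		finally:
			spark_ctx.stop()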

	# Load the tokenizer, hashing_tf, idf_model, nb_model and the tag<->catId maps
	try:
		logger.debug('Loading models')
		tokenizer = Tokenizer.load(tokenizer_file)
		hashing_tf = HashingTF.load(hashing_tf_file)
		idf_model = IDFModel.load(idf_model_file)
		nb_model = NaiveBayesModel.load(nb_model_file)
		selected_tags = pd.read_csv(selected_tags_file, header=None)
		local_catId_to_tags = dict(zip(list(selected_tags.index), selected_tags[0]))
		local_tags_to_catId = dict(zip(selected_tags[0], list(selected_tags.index)))
		# Broadcast the lookup tables so executors can use them inside UDFs
		catId_to_tags = sc.broadcast(local_catId_to_tags)
		tags_to_catId = sc.broadcast(local_tags_to_catId)
		tags_to_catId_transform = udf(lambda tag: float(tags_to_catId.value[tag]), FloatType())
		catId_to_tags_transform = udf(lambda catId: catId_to_tags.value[catId], StringType())
		logger.debug('Loaded models successfully')
	except Exception:
		logger.debug('Failed to load models')
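
For context, a hedged sketch of how the broadcast lookups and UDFs above would typically be applied; the df DataFrame and its 'tag' column are assumptions, not part of the original:

# Hypothetical usage: turn a string tag into the float category id the
# NaiveBayes model expects, and map predictions back to tag strings.
df = df.withColumn('label', tags_to_catId_transform(df['tag']))
preds = preds.withColumn('tag', catId_to_tags_transform(preds['prediction']))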

Example #3
def get_prediction(time, rdd):
    # Signature reconstructed for readability: foreachRDD passes (time, rdd).
    try:
        # (earlier steps elided in the original snippet: build a `queries`
        # DataFrame from the incoming RDD and apply the tokenizer/ngrams/
        # vectorizer transforms before the lines below)
        queries = idf_model.transform(queries)
        queries = scalerModel.transform(queries)
        preds = model.transform(queries)
        preds.select('payload', 'prediction').show()
    except Exception:
        print('No data')


# Imports needed by this snippet (Spark ML / streaming)
import pyspark
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from pyspark.streaming import StreamingContext
from pyspark.ml.feature import CountVectorizerModel, IDFModel, StandardScalerModel, Tokenizer
from pyspark.ml.classification import LogisticRegressionModel

APP_NAME = "BigData"
conf = pyspark.SparkConf().setAll([('spark.app.name', APP_NAME),
                                   ('spark.executor.memory', '8g'),
                                   ('spark.cores.max', '2'),
                                   ('spark.driver.memory', '8g')])
sc = SparkContext(conf=conf)
sqlc = SQLContext(sc)

# to_ngram is defined elsewhere in the original source
ngrams = udf(to_ngram, StringType())
tokenizer = Tokenizer.load('models/Tokenizer')
vectorizer = CountVectorizerModel.load('models/Vectorizer')
idf_model = IDFModel.load('models/idf')
scalerModel = StandardScalerModel.load('models/scalerModel')
model = LogisticRegressionModel.load('models/Logistic_Regression_Model')
ssc = StreamingContext(sc, batchDuration=3)
lines = ssc.socketTextStream("localhost", 9999)

lines.foreachRDD(get_prediction)

ssc.start()
ssc.awaitTermination()
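
To exercise this stream locally, something must be listening on port 9999 before the job starts, since socketTextStream connects as a client; nc -lk 9999 works, or a small Python stand-in like this sketch (the sample payload is made up):

import socket
import time

# Minimal stand-in for `nc -lk 9999`: accept one connection and emit a
# line every second for socketTextStream to consume.
server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
server.bind(('localhost', 9999))
server.listen(1)
conn, _ = server.accept()
while True:
    conn.sendall(b'example query text\n')
    time.sleep(1)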