Example 1
    def _load_models(self):
        """Load the persisted HashingTF, IDF model and random forest model.

        Requires: from pyspark.ml.feature import HashingTF, IDFModel
                  from pyspark.ml.classification import RandomForestClassificationModel
        """
        hf_path = self.params_path.format('hf')
        idf_path = self.params_path.format('idfmodel')
        rf_path = self.params_path.format('rf')

        self.hashingTF = HashingTF.load(hf_path)
        self.idfmodel = IDFModel.load(idf_path)
        self.rf = RandomForestClassificationModel.load(rf_path)
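
For context, here is a minimal sketch of how the three artifacts loaded above could have been produced and saved. The training DataFrame `train_df` and its column names ('filtered', 'label') are assumptions, not part of the original.

# Sketch only; `train_df` and its column names are assumed.
from pyspark.ml.feature import HashingTF, IDF
from pyspark.ml.classification import RandomForestClassifier

params_path = '../tmp/{}'  # same template as self.params_path

# HashingTF needs no fitting: it hashes token lists into term-frequency vectors.
hashingTF = HashingTF(inputCol='filtered', outputCol='rawFeatures')
featurized = hashingTF.transform(train_df)
hashingTF.save(params_path.format('hf'))

# Fit IDF weights on the term frequencies and persist the fitted model.
idfmodel = IDF(inputCol='rawFeatures', outputCol='features').fit(featurized)
idfmodel.save(params_path.format('idfmodel'))
rescaled = idfmodel.transform(featurized)

# Train and persist the random forest classifier.
rf_model = RandomForestClassifier(featuresCol='features', labelCol='label').fit(rescaled)
rf_model.save(params_path.format('rf'))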
Example 2
def update_models():
    # Reload the tokenizer, hashing_tf, idf_model and nb_model from disk,
    # then swap them into the module-level references.
    global tokenizer, hashing_tf, idf_model, nb_model
    logger.debug('=== Starting model reload ===')

    try:
        logger.debug('Loading tokenizer model')
        new_tokenizer = Tokenizer.load(tokenizer_file)
        logger.debug('Loaded tokenizer model successfully')
    except Exception:
        logger.exception('Failed to load tokenizer')
        return

    try:
        logger.debug('Loading hashing_tf model')
        new_hashing_tf = HashingTF.load(hashing_tf_file)
        logger.debug('Loaded hashing_tf model successfully')
    except Exception:
        logger.exception('Failed to load hashing_tf')
        return

    try:
        logger.debug('Loading idf_model')
        new_idf_model = IDFModel.load(idf_model_file)
        logger.debug('Loaded IDFModel successfully')
    except Exception:
        logger.exception('Failed to load IDFModel')
        return

    try:
        logger.debug('Loading nb_model')
        new_nb_model = NaiveBayesModel.load(nb_model_file)
        logger.debug('Loaded NaiveBayesModel successfully')
    except Exception:
        logger.exception('Failed to load NaiveBayesModel')
        return

    # Swap in the new models only after every load succeeded, so a failed
    # reload never leaves a partially updated set.
    tokenizer = new_tokenizer
    hashing_tf = new_hashing_tf
    idf_model = new_idf_model
    nb_model = new_nb_model
    logger.debug('Updated models successfully')
    logger.debug('=== Finished model reload ===')
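
One way to drive this refresh periodically is a self-rescheduling daemon timer; the helper below is a sketch, and the 300-second interval is an assumption, not part of the original.

import threading

def schedule_model_refresh(interval_sec=300):
    # Hypothetical helper: re-run update_models() every interval_sec seconds
    # on a daemon timer so pending reloads never block interpreter shutdown.
    update_models()
    timer = threading.Timer(interval_sec, schedule_model_refresh, [interval_sec])
    timer.daemon = True
    timer.start()

schedule_model_refresh()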
Example 3
    ads_filter = udf(filter_ads, BooleanType())
    ads_free = df.filter(ads_filter(df.Text))

    # Remove punctuation and tokenize
    pp_udf = udf(preprocess, ArrayType(StringType()))
    words = ads_free.withColumn('Words', pp_udf(ads_free.Text))

    # Remove stop words
    remover = StopWordsRemover(inputCol="Words", outputCol="filtered")
    removed = remover.transform(words)

    params_path = '../tmp/{}'

    # Load the trained hashing TF model and transform
    hf_path = params_path.format('hf')
    hashingTF = HashingTF.load(hf_path)
    featurized = hashingTF.transform(removed)

    # Load the trained IDF model and transform
    idf_path = params_path.format('idfmodel')
    idfmodel = IDFModel.load(idf_path)
    result = idfmodel.transform(featurized)

    # Load the random forest model and predict
    rf_path = params_path.format('rf')
    rf = RandomForestClassificationModel.load(rf_path)
    prediction = rf.transform(result)

    path_to_save = '../tmp/twitterstream_test_prediction.json'
    prediction.write.json(path_to_save)
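
The snippet relies on `filter_ads` and `preprocess` helpers that are not shown. A minimal sketch of what they might look like follows; both bodies are hypothetical and only illustrate the expected signatures (str -> bool and str -> list of tokens).

import string

def filter_ads(text):
    # Hypothetical predicate: keep a tweet only if it doesn't look promotional.
    if text is None:
        return False
    lowered = text.lower()
    return not any(kw in lowered for kw in ('http://', 'https://', 'promo', 'sale'))

def preprocess(text):
    # Hypothetical tokenizer: strip punctuation, lowercase, split on whitespace.
    cleaned = text.translate(str.maketrans('', '', string.punctuation))
    return [w for w in cleaned.lower().split() if w]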
Example 4
		logger.debug('Failed to start kafka producer, caused by %s' % str(ke))

	try:
		# Create a direct DStream from the kafka topic
		directKafkaStream = KafkaUtils.createDirectStream(ssc, [kafka_topic], {'metadata.broker.list': broker_ip})
		logger.debug('Created direct dstream from kafka successfully')
	except Exception:
		logger.debug('Unable to create dstream from kafka')

	atexit.register(shutdown_hook, kafka_producer, spark)

	# Load the tokenizer, hashing_tf, idf_model, nb_model and the tag/catId maps
	try:
		logger.debug('Loading models')
		tokenizer = Tokenizer.load(tokenizer_file)
		hashing_tf = HashingTF.load(hashing_tf_file)
		idf_model = IDFModel.load(idf_model_file)
		nb_model = NaiveBayesModel.load(nb_model_file)
		selected_tags = pd.read_csv(selected_tags_file, header=None)
		local_catId_to_tags = dict(zip(list(selected_tags.index), selected_tags[0]))
		local_tags_to_catId = dict(zip(selected_tags[0], list(selected_tags.index)))
		catId_to_tags = sc.broadcast(local_catId_to_tags)
		tags_to_catId = sc.broadcast(local_tags_to_catId)
		tags_to_catId_transform = udf(lambda tag: float(tags_to_catId.value[tag]), FloatType())
		catId_to_tags_transform = udf(lambda catId: catId_to_tags.value[catId], StringType())
		logger.debug('Loaded models successfully')
	except Exception:
		logger.debug('Failed to load models')


	logger.debug('Start to process data')
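
A sketch of how the loaded models might score each micro-batch of the DStream. It assumes Kafka records arrive as (key, text) pairs, the Tokenizer was fitted with inputCol='text', and an active SparkSession backs toDF(); none of this is shown in the original.

def process_batch(rdd):
    # Hypothetical per-batch scoring: tokenize, hash, reweight, classify.
    if rdd.isEmpty():
        return
    df = rdd.map(lambda text: (text,)).toDF(['text'])
    tokens = tokenizer.transform(df)
    tf = hashing_tf.transform(tokens)
    tfidf = idf_model.transform(tf)
    prediction = nb_model.transform(tfidf)
    prediction.select('text', 'prediction').show(5, truncate=False)

directKafkaStream.map(lambda kv: kv[1]).foreachRDD(process_batch)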