def load_NaiveB_Model(dataset):
    print ("Accuracy of best NB Model with CrossValidation:")
    evaluator = BinaryClassificationEvaluator()
    best_NBModel = NaiveBayesModel.load("model/NB1/")
    predictions = best_NBModel.transform(dataset)
    accuracy = evaluator.evaluate(predictions)
    print "The  accuracy = %g" % accuracy
 def getOrCreateNB (self):
     try:
         if self.nbModel is None:
             self.nbModel = NaiveBayesModel.load(CONST_NB_FILE)
     except:
         print ("Creating NB Model")
         self.nbModel = self.createNB()
     
     return self.nbModel
Example n. 3
def predictTweetCategNB(testtf, sc):
    modelTweetCategoryNB = NaiveBayesModel.load("NaiveBayes_model/")
    # select example rows to display.
    tt = sc.parallelize(testtf).map(lambda x: Row(features=x)).toDF()
    tt.show()
    predictions = modelTweetCategoryNB.transform(tt)
    #predictions.show()
    labels = predictions.select("prediction").rdd.map(
        lambda x: category[int(x.prediction)]).collect()
    return labels
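A hypothetical call illustrating the globals this function relies on; the category mapping and the feature vectors below are illustrative assumptions, not values from the original code.

from pyspark.ml.linalg import Vectors

# Hypothetical globals: category maps prediction indices to names,
# testtf is a list of feature vectors built upstream.
category = {0: "sports", 1: "politics", 2: "technology"}
testtf = [Vectors.dense([0.0, 1.0, 3.0]), Vectors.dense([2.0, 0.0, 1.0])]
labels = predictTweetCategNB(testtf, sc)  # sc: an active SparkContext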
def classify_tweets(inbound_dataset):
    # Run the cleansing UDF for tweet column
    udf_cleansing = functions.udf(cleansing)
    inbound_dataset = inbound_dataset.withColumn(
        "tweet_cleansed", udf_cleansing(functions.col("tweet")))

    # Tokenizing
    from pyspark.ml.feature import Tokenizer
    tokenizer = Tokenizer(inputCol="tweet_cleansed", outputCol="words")
    inbound_dataset = tokenizer.transform(inbound_dataset)

    # Generating features
    from pyspark.ml.feature import HashingTF
    features_generator = HashingTF(inputCol="words", outputCol="features")
    inbound_dataset = features_generator.transform(inbound_dataset)

    model_folder = os.path.join(os.getcwd(), "saved_models")
    model_full_path = os.path.join(model_folder, "twitter_sentiment_spark")
    if not os.path.exists(model_folder):
        print("model does not exists")

    from pyspark.ml.classification import NaiveBayesModel
    loaded_model = NaiveBayesModel.load(model_full_path)

    # Classifying using saved model
    classified = loaded_model.transform(inbound_dataset)

    spark = getSparkSessionInstance(inbound_dataset.rdd.context.getConf())
    if files_source == "hdfs":
        labels = spark.read.load(os.path.join("file://" + model_folder,
                                              "labels.csv"),
                                 format="csv",
                                 header=True)
    else:
        labels = spark.read.load(os.path.join(model_folder, "labels.csv"),
                                 format="csv",
                                 header=True)

    classified = classified.join(labels,
                                 classified["NB_pred"] == labels["label_id"])

    udf_get_probability = functions.udf(get_probability)
    classified = classified.withColumn(
        "probability",
        udf_get_probability(functions.col("NB_prob"),
                            functions.col("NB_pred")))

    classified = classified.withColumn(
        "label_predicted",
        functions.when(classified.probability < probability_threshold,
                       "2").otherwise(classified.label_predicted))

    return classified
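The get_probability UDF used above is defined elsewhere; a minimal sketch of what it might look like, assuming NB_prob holds the model's probability vector and NB_pred the predicted class index.

def get_probability(probability_vector, prediction):
    # Sketch only: return the probability assigned to the predicted class
    return float(probability_vector[int(prediction)])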
Example n. 5
def index():
    # Return the template with the teams list passed in
    prediction = ''
    if request.method == 'POST':
        headline = request.form.get('headline')

        #Use headline to make a pysparkdf
        df = spark.createDataFrame([(0, headline)], ['ID', 'text'])

        # Create a length column to be used as a future feature
        df = df.withColumn('length', length(df['text']))
        df.show()

        # Create all the features to the data set
        tokenizer = Tokenizer(inputCol="text", outputCol="token_text")
        stopremove = StopWordsRemover(inputCol='token_text',
                                      outputCol='stop_tokens')
        hashingTF = HashingTF(inputCol="stop_tokens", outputCol='hash_token')
        idf = IDF(inputCol='hash_token', outputCol='idf_token')

        # Create feature vectors
        clean_up = VectorAssembler(inputCols=['idf_token', 'length'],
                                   outputCol='features')

        # Create and run a data processing Pipeline
        data_prep_pipeline = Pipeline(
            stages=[tokenizer, stopremove, hashingTF, idf, clean_up])

        # Fit and transform the pipeline
        cleaner = data_prep_pipeline.fit(df)
        cleaned = cleaner.transform(df)

        #load model and make prediction
        model = NaiveBayesModel.load('Trade_Predictor_Model')
        prediction = model.transform(cleaned).select('prediction').toPandas()
        prediction = prediction['prediction'].values[0]
        print(prediction)

        #transform 0, 1, 2 to hold/sell/buy
        if prediction == 0:
            prediction = 'Hold'
        elif prediction == 1:
            prediction = 'Sell'
        else:
            prediction = 'Buy'

        print(prediction)

    return render_template('index.html', action=prediction)  #, teams=teams)
def predict():
    blob_account_name = os.environ.get('ds_blob_account')
    blob_account_key = os.environ.get('ds_blob_key')
    mycontainer = os.environ.get('ds_container')
    filename = os.environ.get('ds_model_filename')
    dirname = os.getcwd()
    dirname1 = "model/data"
    dirname2 = "model/metadata"
    filename1 = "part-00000-a1f9ca3a-3bec-4451-849f-546af11b14ab.snappy.parquet"
    filename2 = "part-00000"
    blobfilename = "HdiSamples/HdiSamples/sentimentfinal/stages/3_NaiveBayes_471fad31e436e6de3ade"

    blob_service = BlockBlobService(account_name=blob_account_name,
                                    account_key=blob_account_key,
                                    endpoint_suffix='core.usgovcloudapi.net')
    generator = blob_service.list_blobs(mycontainer)
    dirname = os.getcwd()
    if not os.path.exists(dirname + "/" + dirname1):
        os.makedirs(dirname + "/" + dirname1)
    if not os.path.exists(dirname + "/" + dirname2):
        os.makedirs(dirname + "/" + dirname2)

    blob_service.get_blob_to_path(mycontainer,
                                  blobfilename + "/data/" + filename1,
                                  dirname + "/" + dirname1 + "/" + filename1)
    blob_service.get_blob_to_path(mycontainer,
                                  blobfilename + "/metadata/" + filename2,
                                  dirname + "/" + dirname2 + "/" + filename2)
    localmodel = os.path.join(dirname, "model")
    model = NaiveBayesModel.load(localmodel)
    payload = request.get_data().decode("utf-8")
    inputformat = ast.literal_eval(payload)
    testrdd = sc.parallelize(inputformat)
    temp = testrdd.map(lambda x: Row(text=x))
    tempdf = spark.createDataFrame(temp)

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    stopremover = StopWordsRemover().setInputCol("words").setOutputCol(
        "removed").setCaseSensitive(False)
    newhashingTF = HashingTF(inputCol="removed",
                             outputCol="features",
                             numFeatures=2000)
    nb_pipeline = Pipeline(stages=[tokenizer, stopremover, newhashingTF])

    temp1df = nb_pipeline.fit(tempdf).transform(tempdf)

    testpred = model.transform(temp1df)
    return str(testpred.take(1))
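A hypothetical client call for this handler, assuming it is exposed as a Flask route such as /predict (the URL is an assumption); the body is a Python-literal list of texts, since the handler parses it with ast.literal_eval.

import requests

# Hypothetical endpoint URL; the payload is a Python-literal list of strings
payload = '["the service was great", "the flight was delayed again"]'
response = requests.post("http://localhost:5000/predict", data=payload)
print(response.text)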
Example n. 7
def naive_bayes():
    conf = SparkConf().setAppName('RF')
    sc = SparkContext(conf=conf)
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    df = spark.createDataFrame([
        Row(label=0.0, weight=0.1, features=Vectors.dense([0.0, 0.0])),
        Row(label=0.0, weight=0.5, features=Vectors.dense([0.0, 1.0])),
        Row(label=1.0, weight=1.0, features=Vectors.dense([1.0, 0.0]))
    ])

    nb = NaiveBayes(smoothing=1.0, modelType="multinomial", weightCol="weight")
    model = nb.fit(df)
    # model.pi
    # # DenseVector([-0.81..., -0.58...])
    # model.theta
    # # DenseMatrix(2, 2, [-0.91..., -0.51..., -0.40..., -1.09...], 1)
    test0 = sc.parallelize([Row(features=Vectors.dense([1.0, 0.0]))]).toDF()
    result = model.transform(test0).head()
    # result.prediction
    # # 1.0
    # result.probability
    # # DenseVector([0.32..., 0.67...])
    # result.rawPrediction
    # # DenseVector([-1.72..., -0.99...])
    test1 = sc.parallelize([Row(features=Vectors.sparse(2, [0], [1.0]))
                            ]).toDF()
    # model.transform(test1).head().prediction
    # # 1.0
    temp_path = "."
    nb_path = temp_path + "/nb"
    nb.save(nb_path)
    nb2 = NaiveBayes.load(nb_path)
    # nb2.getSmoothing()
    # # 1.0
    model_path = temp_path + "/nb_model"
    model.save(model_path)
    model2 = NaiveBayesModel.load(model_path)
    # model.pi == model2.pi
    # # True
    # model.theta == model2.theta
    # # True
    nb = nb.setThresholds([0.01, 10.00])
    model3 = nb.fit(df)
    result = model3.transform(test0).head()
    def test(self, sentence, vocabularys):
        # sentence = ' '.join(jieba.cut(sentence))
        sentence = sentence.split(" ")
        print('Result after sentence abstraction: {}'.format(sentence))
        vector = [0 for x in range(len(vocabularys))]
        for word in sentence:
            if word in vocabularys:
                index = vocabularys.index(word)
                vector[index] = 1

        model = NaiveBayesModel.load(self.model_path)
        # model = DecisionTreeClassificationModel.load(self.model_path)
        test0 = self.spark.createDataFrame(
            [Row(features=Vectors.dense(vector))])
        result = model.transform(test0).head()
        print('The model index is: {}'.format(result.prediction))
        return int(result.prediction)
Example n. 9
def update_models():
    # Load in idf_model, nb_model, hashing_tf, idf_model and tag_catId map
    logger.debug(
        '===================================================Starting load models==================================================='
    )
    try:
        logger.debug('Loading tokenizer model')
        new_tokenizer = Tokenizer.load(tokenizer_file)
        logger.debug('Load tokenizer model successfully')
    except:
        logger.debug('Fail to load tokenizer')

    try:
        logger.debug('Loading hashing_tf model')
        new_hashing_tf = HashingTF.load(hashing_tf_file)
        logger.debug('Load hashing_tf model successfully')
    except:
        logger.debug('Fail to load hashing_tf')

    try:
        logger.debug('Loading idf_model')
        new_idf_model = IDFModel.load(idf_model_file)
        logger.debug('Load IDFModel successfully')
    except:
        logger.debug('Fail to load IDFModel')

    try:
        logger.debug('Loading nb_model')
        new_nb_model = NaiveBayesModel.load(nb_model_file)
        logger.debug('Load NaiveBayesModel successfully')
    except:
        logger.debug('Fail to load NaiveBayesModel')

    try:
        logger.debug('Updating models')
        tokenizer = new_tokenizer
        hashing_tf = new_hashing_tf
        idf_model = new_idf_model
        nb_model = new_nb_model
        logger.debug('update model successfully')
    except:
        logger.debug('Fail to update models')
    logger.debug(
        '===================================================Stopped load models==================================================='
    )
def sendRecord(df):
    
    hashingTF = HashingTF(inputCol="filteredWords", outputCol="features")
    rescaledData = hashingTF.transform(df)
    
    from pyspark.ml.classification import NaiveBayesModel
    sameModel = NaiveBayesModel.load("TwitterSentimentNB.model") 
    
    from pyspark.ml.evaluation import MulticlassClassificationEvaluator
    predictions = sameModel.transform(rescaledData)
    
    pr=predictions.select("prediction")
    
    pr=pr.rdd
    
    if pr.collect() == [Row(prediction=1.0)]:
        print("Positive tweet")
    else:
        print("Negative tweet")
def naive_bayes_evaluator(test_data, deal_id):
    ####In:
    #A testing data set, as generated by data_prep()
    #The deal_id you want to test a model for
    #NB: The model has to be already saved to the cloud

    ####Out
    #An update message is outputted
    #an evaluator

    model = NaiveBayesModel.load(f"/mnt/lotte/naive_bayes/{deal_id}/")
    predictions = model.transform(test_data.withColumnRenamed(
        deal_id, 'label'))
    # compute accuracy on the test set
    evaluator = MulticlassClassificationEvaluator(
        labelCol="label", predictionCol="prediction", metricName="accuracy"
    )  # alternatively, use BinaryClassificationEvaluator with metricName="areaUnderPR" for the precision-recall curve instead of accuracy
    accuracy = evaluator.evaluate(predictions)
    print("Naive Bayes test accuracy for " + deal_id + " = " + str(accuracy))

    return evaluator
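A sketch of the alternative mentioned in the comment above: area under the precision-recall curve is a metric of BinaryClassificationEvaluator (binary labels assumed), applied to the predictions produced inside the function.

from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Assumes binary labels; NaiveBayesModel.transform provides rawPrediction
pr_evaluator = BinaryClassificationEvaluator(labelCol="label",
                                             rawPredictionCol="rawPrediction",
                                             metricName="areaUnderPR")
area_under_pr = pr_evaluator.evaluate(predictions)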
Example n. 12
def main():
    logging.config.fileConfig('%s/../logging.conf' %
                              os.path.dirname(os.path.abspath(__file__)))
    logger = logging.getLogger(name="simpleExample")

    parser = arg_parser()
    args = parser.parse_args()

    categories = [
        'alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med'
    ]
    fe_pipeline_save_path = args.fe_pipeline_save_path
    classifier_save_path = args.classifier_save_path

    logger.info("Load Data")
    newsgroup_data_loader = DataLoader(categories=categories)
    _, twenty_test = newsgroup_data_loader.load_data()

    logger.info("Transform raw data into Spark DataFrame")
    spark_df_converter = SparkDataFrameConverter()
    twenty_test_df = spark_df_converter.convert(twenty_test.data,
                                                twenty_test.target)

    logger.info(
        "Load Feature Engineering Pipeline and apply transformations on train set"
    )
    fe_pipeline_model = PipelineModel.load(fe_pipeline_save_path)
    twenty_test_counts_df = fe_pipeline_model.transform(twenty_test_df)

    logger.info("Load classifier and apply predictions")
    nb_model = NaiveBayesModel.load(classifier_save_path)
    predicted = nb_model.transform(twenty_test_counts_df)

    logger.info("Evaluate Results")
    evaluator = MulticlassClassificationEvaluator(labelCol="label_indexed",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")

    logger.info("Accuracy on test set : {}".format(
        evaluator.evaluate(predicted)))
Example n. 13
 def loadModelFromDisk(self):
     self.logger.log("info", "Loading pretrained model from disk")
     self.__model = NaiveBayesModel.load(self.__modelPath)
     self.logger.log("info", "Complate..")
Example n. 14
def apply_naive_bayes_classifier(tweets):
    model_path = 'hdfs://spark01.ctweb.inweb.org.br:9000/limonero/models/' \
                 'Sentiment_Analysis_-_Naive_Bayes.0000'
    model = NaiveBayesModel.load(model_path)
    return model.transform(tweets)
        '', words_with_url)
    return url_less_words


extract_words_udf = udf(extract_words, StringType())
data_filtered = data_filtered.withColumn('text_words',
                                         extract_words_udf('body'))
# data_filtered.show()

regexTokenizer = RegexTokenizer(inputCol="text_words",
                                outputCol="words",
                                pattern="\\W")

## stop words
f = open("./stopwords_twitter.txt", "r")
model = NaiveBayesModel.load('./NB_model_without_pipeline')

add_stopwords = []
for l in f.readlines():
    add_stopwords.append(l.strip())
# print(add_stopwords[:5])

stopwordsRemover = StopWordsRemover(
    inputCol="words", outputCol="filtered").setStopWords(add_stopwords)

## bag of words count
countVectors = CountVectorizer(inputCol="filtered",
                               outputCol="features",
                               binary=True,
                               vocabSize=10000,
                               minDF=1)
sc   = SparkContext(conf=conf)
sqlContext = SQLContext(sc)


schema = StructType([StructField('label', FloatType(), True), \
					 StructField('sentences', StringType(), True)])

tokenizer = Tokenizer(inputCol="sentences", outputCol="words")
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=500)
idf = IDF(inputCol="rawFeatures",outputCol="features")



################Unknown data###########################
model1 = LogisticRegressionModel.load('./model/logisticRegModel5')
nb_model = NaiveBayesModel.load('./model/naiveBayesModel5')

documents1 = sc.wholeTextFiles('../lab3_data/unknown_data/*')
documents1 = \
	documents1.map(
		lambda doc: (
			doc[0].encode('ascii', 'ignore').decode('ascii'),
			doc[1].encode('ascii', 'ignore').decode('ascii')
		)
	)

documents1 = documents1.map(lambda doc: (getCategory(doc[0]), cleanSentences(doc[1])))

documentsDF = sqlContext.createDataFrame(documents1, schema)

wordsData = tokenizer.transform(documentsDF)
Example n. 17
	try:
		# Create dstream from kafka topic
		directKafkaStream = KafkaUtils.createDirectStream(ssc, kafka_topic, {'metadata.broker.list': broker_ip})
		logger.debug('Create direct dstream from kafka successfully')
	except:
		logger.debug('Unable to create dstream from kafka')

	atexit.register(shutdown_hook, kafka_producer, spark)

	# Load in idf_model, nb_model, hashing_tf, idf_model and tag_catId map
	try:
		logger.debug('Loading models')
		tokenizer = Tokenizer.load(tokenizer_file)
		hashing_tf = HashingTF.load(hashing_tf_file)
		idf_model = IDFModel.load(idf_model_file)
		nb_model = NaiveBayesModel.load(nb_model_file)
		selected_tags = pd.read_csv(selected_tags_file, header=None)
		local_catId_to_tags = dict(zip(list(selected_tags.index), selected_tags[0]))
		local_tags_to_catId=dict(zip(selected_tags[0], list(selected_tags.index)))
		catId_to_tags = sc.broadcast(local_catId_to_tags)
		tags_to_catId = sc.broadcast(local_tags_to_catId)
		tags_to_catId_transform = udf(lambda tag: float(tags_to_catId.value[tag]), FloatType())
		catId_to_tags_transform = udf(lambda catId: catId_to_tags.value[catId], StringType())
		logger.debug('loaded models successfully')
	except:
		logger.debug('Fail to load models')


	logger.debug('Start to process data')
	process_data(directKafkaStream, kafka_producer)
	ssc.start()
Example n. 18
    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    filteredDataFrame = remover.transform(tokenized).select(
        "label", "filtered", "time_stamp_ms")

    ngram = NGram(n=1, inputCol="filtered", outputCol="ngrams")
    ngramDataFrame = ngram.transform(filteredDataFrame)
    ngramDataFrame.show()

    ngramData = ngramDataFrame.select("label", "ngrams", "time_stamp_ms")
    hashingTF = HashingTF(inputCol="ngrams",
                          outputCol="rawFeatures",
                          numFeatures=3000)
    featurizedData = hashingTF.transform(ngramData)
    # alternatively, CountVectorizer can also be used to get term frequency vectors
    featurizedData.show()

    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)
    rescaledData.show()

    model = NaiveBayesModel.load("hdfs://localhost:9000/model_naive_bayes")
    predictions = model.transform(rescaledData)
    predictions.show()
    print(type(predictions))

    res = predictions.toPandas()
    res.to_pickle("prediction_panda_df.pck")
    print(predictions.count())
Example n. 19
	# Create Ngrams
	ngram = NGram(n=2, inputCol="words", outputCol="bigrams")
	wordsData = ngram.transform(wordsData)

	#changed inputCol = "words"
	hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=3600)
	featurizedData = hashingTF.transform(wordsData)
	
	# Obtain the TF-IDF score
	idf = IDF(inputCol="rawFeatures", outputCol="features")
	idfModel = idf.fit(featurizedData)
	rescaledData = idfModel.transform(featurizedData)
		
	# Load Naive Bayes model.
	print("Loading Naive Bayes Model...")
	model = NaiveBayesModel.load('/models/tmp/myNaiveBayesModel')

	# Make prediction and test accuracy.
	print("Naive Bayes Model loaded. Begin testing...")
	predictions = model.transform(rescaledData)
	print("Testing completed.")

	# Compute metrics
	print("Computing accuracy...")
	accuracyEval = MulticlassClassificationEvaluator(labelCol = "label", predictionCol = "prediction", metricName = "accuracy")
	accuracy = accuracyEval.evaluate(predictions)
	print("Accuracy = " + str(accuracy))
	
	# Format output data
	#resultsDF = predictions.drop("predictions", "rawFeatures", "features", "rawPrediction", "probability", "words", "bigrams", "label")
	#resultsDF.show(2)
Example n. 20
                        regParam=6.969697,
                        labelCol='score',
                        featuresCol='X')
LR_model = LR.fit(X_train_large)
LR_model.save(LR_model_path)

# Random Forest
RF = RandomForestClassifier(numTrees=100,
                            maxDepth=15,
                            labelCol="score",
                            featuresCol="X")
RF_model = RF.fit(X_train_large)
RF_model.save(RF_model_path)

# Loading all trained models
NB_Model = NaiveBayesModel.load(NB_model_path)
LR_Model = LogisticRegressionModel.load(LR_model_path)
RF_Model = RandomForestClassificationModel.load(RF_model_path)

voteClassifier = VoteClassifier(NB_Model, LR_Model, RF_Model)
evaluate(voteClassifier.transform_vote(X_test_large),
         confusion=False,
         predictionCol='prediction_vote')
evaluate(voteClassifier.transform_vote(X_test_imbd),
         confusion=False,
         predictionCol='prediction_vote')
voteClassifier.transform_vote(X_test_imbd).show()

# Accuracy: (TP+TN)/N
# Positive Predictive Value: TP/(TP+FP)
# Negative Predictive Value: TN/(TN+FN)
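A minimal sketch of the metrics listed in the comments above, assuming a predictions DataFrame preds with binary labels in "score" and ensemble votes in "prediction_vote" (the DataFrame name is an assumption; the column names come from this snippet).

tp = preds.filter((preds.score == 1) & (preds.prediction_vote == 1)).count()
tn = preds.filter((preds.score == 0) & (preds.prediction_vote == 0)).count()
fp = preds.filter((preds.score == 0) & (preds.prediction_vote == 1)).count()
fn = preds.filter((preds.score == 1) & (preds.prediction_vote == 0)).count()

accuracy = (tp + tn) / float(tp + tn + fp + fn)  # (TP+TN)/N
ppv = tp / float(tp + fp)                        # positive predictive value
npv = tn / float(tn + fn)                        # negative predictive value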
idfModel = idf.fit(test_data)
test_data = idfModel.transform(test_data)
test_data.show(5)

from pyspark.ml.feature import StringIndexer
stringIndexer = StringIndexer(inputCol="label", outputCol="labelIndex")
model = stringIndexer.fit(test_data)
test_data = model.transform(test_data)
test_data.show(5)

predicted = test_data.select("tfidf", "labelIndex")
predicted.show(5)

model_folder = os.path.join(os.getcwd(), 'saved_models')
model_full_path = os.path.join(model_folder, "twitter_sentiment_spark")
if not os.path.exists(model_folder):
    print("model does not exists")

from pyspark.ml.classification import NaiveBayes, NaiveBayesModel
loadModel = NaiveBayesModel.load(model_full_path)
predicted = loadModel.transform(predicted)
predicted.show()

total = predicted.count()
correct = predicted.where(predicted['labelIndex'] == predicted['NB_pred']).count()
accuracy = correct/total

print(
    "\nTotal:", total, 
    "\nCorrect:", correct, 
    "\nAccuracy:", accuracy)
def login():
    message = ''
    e_result = ''
    s_result = ''
    t_result = ''
    j_result = ''

    if request.method == 'POST':
        post = request.form.get('text')  # access the data inside

        if len(post) >= 100:

            test = pd.DataFrame([post], columns=['post'])

            newrows = []

            def filter_text(post):
                """Decide whether or not we want to use the post."""
                # should remove link only posts here
                return len(post) > 0

            reg_punc = re.compile('[%s]' % re.escape(string.punctuation))

            def preprocess_text(post):
                """Remove any junk we don't want to use in the post."""

                # Remove links
                post = re.sub(r'http\S+', '', post, flags=re.MULTILINE)

                # All lowercase
                post = post.lower()

                # Remove punctuation
                post = reg_punc.sub('', post)

                return post

            def create_new_rows(row):
                posts = row['post']
                rows = []

                # for p in posts:
                p = preprocess_text(posts)
                rows.append({'post': p})
                return rows

            for index, row in test.iterrows():
                newrows += create_new_rows(row)

            test = pd.DataFrame(newrows)

            df = spark.createDataFrame(test)

            # Create a length column to be used as a future feature
            df = df.withColumn('length', length(df['post']))

            types = [
                'INFJ', 'ENTP', 'INTP', 'INTJ', 'ENTJ', 'ENFJ', 'INFP', 'ENFP',
                'ISFP', 'ISTP', 'ISFJ', 'ISTJ', 'ESTP', 'ESFP', 'ESTJ', 'ESFJ'
            ]
            types = [x.lower() for x in types]

            tokenizer = Tokenizer(inputCol="post", outputCol="words")
            tokenized = tokenizer.transform(df)

            # Remove stop words
            stopwordList = types
            stopwordList.extend(StopWordsRemover().getStopWords())
            stopwordList = list(set(stopwordList))  # optional
            remover = StopWordsRemover(inputCol="words",
                                       outputCol="filtered",
                                       stopWords=stopwordList)
            newFrame = remover.transform(tokenized)

            # Run the hashing term frequency
            hashing = HashingTF(inputCol="filtered", outputCol="hashedValues")
            # Transform into a DF
            hashed_df = hashing.transform(newFrame)

            # Fit the IDF on the data set
            idf = IDF(inputCol="hashedValues", outputCol="idf_token")
            idfModel = idf.fit(hashed_df)
            rescaledData = idfModel.transform(hashed_df)

            # Create feature vectors
            #idf = IDF(inputCol='hash_token', outputCol='idf_token')
            clean_up = VectorAssembler(inputCols=['idf_token', 'length'],
                                       outputCol='features')
            output = clean_up.transform(rescaledData)

            ei_model = NaiveBayesModel.load("static/models/EI_Predictor.h5")
            sn_model = NaiveBayesModel.load("static/models/SN_Predictor.h5")
            tf_model = NaiveBayesModel.load("static/models/TF_Predictor.h5")
            jp_model = NaiveBayesModel.load("static/models/JP_Predictor.h5")

            test_e = ei_model.transform(output)
            e = test_e.toPandas()["prediction"].values[0]
            if e == 0:
                e_result = "I"
            else:
                e_result = "E"
            test_s = sn_model.transform(output)
            s = test_s.toPandas()["prediction"].values[0]
            if s == 0:
                s_result = "N"
            else:
                s_result = "S"
            test_t = tf_model.transform(output)
            t = test_t.toPandas()["prediction"].values[0]
            if t == 0:
                t_result = "F"
            else:
                t_result = "T"
            test_j = jp_model.transform(output)
            j = test_j.toPandas()["prediction"].values[0]
            if j == 0:
                j_result = "P"
            else:
                j_result = "J"

        else:
            message = "Please tell us more about yourself!"

    return render_template('index.html',
                           message=message,
                           test_e=e_result,
                           test_s=s_result,
                           test_t=t_result,
                           test_j=j_result)
Example n. 23
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.sql import functions as F
import time

## Kafka VM IP
kafka_topic = 'from-pubsub'
zk = '10.182.0.2:2181'
app_name = "from-pubsub"
sc = SparkContext(appName="KafkaPubsub")
ssc = StreamingContext(sc, 0.1)
kafkaStream = KafkaUtils.createStream(ssc, zk, app_name, {kafka_topic: 1})
### Load Saved Model
pipelineFit = PipelineModel.load('gs://wcs_word/NB_pipeline')
print("1")
model = NaiveBayesModel.load('gs://wcs_word/NB_FullTrainedModel')
print("2")
spark = SparkSession(sc)

## Global Variable to calculate Latency
init_time = None
count = 0


## Parsing Function
def row_generate(r):
    return Row(star=float(r[0]) - 1,
               useful=float(r[1]),
               funny=float(r[2]),
               cool=float(r[3]),
               text=str(r[4]))
import pandas as pd
from kafka import KafkaConsumer
import sys
from pyspark.ml import PipelineModel
from pyspark.ml.classification import LogisticRegressionModel, NaiveBayesModel
from sklearn.metrics import accuracy_score, recall_score, precision_score

sc = SparkContext()
sqlContext = SQLContext(sc)

spark = SparkSession.builder.appName('consumer').getOrCreate()
brokers, topic = sys.argv[1:]
consumer = KafkaConsumer(topic, bootstrap_servers=['localhost:9092'])

pip = PipelineModel.load('/Users/aditya/PycharmProjects/BigDataHW3/pipeline')
model_nb = NaiveBayesModel.load(
    '/Users/aditya/PycharmProjects/BigDataHW3/nbModel')
model_lr = LogisticRegressionModel.load(
    '/Users/aditya/PycharmProjects/BigDataHW3/lrModel')

columns = ['actual', 'predicted']
result_df_lr = pd.DataFrame(columns=columns)
result_df_nb = pd.DataFrame(columns=columns)
feed = 0

for msg in consumer:
    article = msg.value
    data = article.split("||")
    label = data[0]
    text = data[1]
    df = sc.parallelize([{"label": label, "text": text}]).toDF()
    df = pip.transform(df)
Example n. 25
        label = json_data["response"]['results'][ind]['sectionName']
        temp = list()
        temp.append(label)
        temp.append(headline)
        d.append(temp)

    return d


if __name__ == "__main__":

    sc = SparkContext()
    sqlContext = SQLContext(sc)

    lr_model = LogisticRegressionModel.load("lrm.model")
    model = NaiveBayesModel.load("model.model")

    key = "00254a08-1426-4547-b54f-bc0137d9d547"
    from_date = "2018-02-01"
    to_date = "2018-02-12"

    url = 'http://content.guardianapis.com/search?from-date=' + from_date + '&to-date=' + to_date + \
          '&order-by=newest&show-fields=all&page-size=200&%20num_per_section=10000&api-key=' + key

    data = get_data(url)
    df = sqlContext.createDataFrame(data, schema=["category", "text"])
    pipeline_fit = PipelineModel.load("pipelining")
    dataset = pipeline_fit.transform(df)

    predictions = lr_model.transform(dataset)
    predictions1 = model.transform(dataset)
Example n. 26
from flask import Flask, jsonify, render_template, request
from flask_sqlalchemy import SQLAlchemy
from pyspark.sql import SparkSession
from pyspark import SparkFiles
spark = SparkSession.builder.appName('prez').getOrCreate()
from pyspark.ml.classification import NaiveBayesModel
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer
from pyspark.sql.functions import length
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

app = Flask(__name__)
app.config['UPLOAD_FOLDER'] = 'uploads'
app.config['DEBUG'] = True

model = NaiveBayesModel.load('models/naivebayes.h5')


def pipeline(df):
    print(df.head())
    df = df.withColumn("length", length(df['Speech']))
    # Create the data processing pipeline functions here (note: StringIndexer will be used to encode
    # your target variable column. This column should be named 'label' so our model will recognize it later)
    review_data = Tokenizer(inputCol="Speech", outputCol="Words")
    reviewed = review_data.transform(df)
    #reviewed.show()
    remover = StopWordsRemover(inputCol="Words", outputCol="filtered")
    newFrame = remover.transform(reviewed)
    #newFrame.show()
    hashing = HashingTF(inputCol="filtered",
                        outputCol="hashedValues",
## FIlter for those rows where topic is Null
data_filter_with_null_topic = data_modified_tweet.where(
    col("topic").isNull()).select('trend', 'creation_time', 'twid',
                                  'text_words')
data_filter_with_null_topic.show(5)
print(data_filter_with_null_topic.count())

df_for_topic = data_filter_with_null_topic
## Fit pipeline to filtered dataframe
pipelineFit = pipeline.fit(df_for_topic)
dataset_for_topic = pipelineFit.transform(df_for_topic)
dataset_for_topic.show(5)

## Load saved model for topic classification
model_for_topic_classification = NaiveBayesModel.load(
    '/Users/saumya/Desktop/Big_data_project/NB_model_without_pipeline')
print(model_for_topic_classification)

## Predict topics for unlabelled tweets
predictions = model_for_topic_classification.transform(dataset_for_topic)

## convert the labels to text labels
labeler = IndexToString(inputCol="prediction",
                        outputCol="predictedLabel",
                        labels=[
                            'event', 'sports', 'politics', 'news',
                            'technology', 'business', 'entertainment', 'health'
                        ])
# print(predictions)
prediciton_with_label = labeler.transform(predictions)
prediciton_with_label.show(5)
    total_count_naive_bayes_classification = 0

    sc = SparkContext(appName="PythonStreamingKafkaWordCount")
    sc.setLogLevel("ERROR")
    spark = SparkSession.builder.getOrCreate()
    ssc = StreamingContext(sc, 1)
    # Setting model path
    save_pipeline_path = output_folder_path + "pipeline"
    saved_logistic_model_path = output_folder_path + "LogisticClassificationModel"
    saved_naive_bayes_model_path = output_folder_path + "NaiveBayesClassificationModel"

    # Loading Pipeline and Model
    loded_pipeline = PipelineModel.load(save_pipeline_path)
    saved_logistic_model = LogisticRegressionModel.load(
        saved_logistic_model_path)
    saved_naive_bayes_model = NaiveBayesModel.load(
        saved_naive_bayes_model_path)

    # Creating Kafka Stream
    kvs = KafkaUtils.createDirectStream(
        ssc,
        topics=['guardian2stream'],
        kafkaParams={"metadata.broker.list": 'localhost:9092'})
    document_tuple = kvs.map(lambda line: (int(line[1].split("||")[0].strip(
    ).encode("ascii", "ignore")), line[1].split("||")[1].encode(
        "ascii", "ignore")))
    document_tuple.pprint()
    document_tuple.foreachRDD(process)

    ssc.start()
    ssc.awaitTermination()
    ssc.stop(stopGraceFully=True)