def load_parameters(self):
        self.amount_prediction_method = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL,
                                                                 file_name='amount_method')
        self.trend_prediction_method = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL,
                                                                file_name='trend_method')
        self.data_features = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL, file_name='features')
        self.stock_symbol = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL, file_name='symbol')
        self.data_parser = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL, file_name='data_parser')
        amount_model_path = os.path.join(os.path.abspath(self.model_path), 'amount_model')
        trend_model_path = os.path.join(os.path.abspath(self.model_path), 'trend_model')

        if self.amount_prediction_method == self.RANDOM_FOREST:
            amount_model = RandomForestModel.load(sc=self.sc, path=amount_model_path)
        elif self.amount_prediction_method == self.LINEAR_REGRESSION:
            amount_model = LinearRegressionModel.load(sc=self.sc, path=amount_model_path)
        else:
            amount_model = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL, file_name='amount_model')

        if self.trend_prediction_method == self.RANDOM_FOREST:
            trend_model = RandomForestModel.load(sc=self.sc, path=trend_model_path)
        elif self.trend_prediction_method == self.LOGISTIC_REGRESSION:
            trend_model = LogisticRegressionModel.load(sc=self.sc, path=trend_model_path)
        elif self.trend_prediction_method == self.NAIVE_BAYES:
            trend_model = NaiveBayesModel.load(sc=self.sc, path=trend_model_path)
        elif self.trend_prediction_method == self.SVM:
            trend_model = SVMModel.load(sc=self.sc, path=trend_model_path)
        else:
            trend_model = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL, file_name='trend_model')

        return trend_model, amount_model
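For reference, a rough sketch of the save side that this loader expects; save_data_to_file is a hypothetical counterpart of load_data_from_file, and the method constants are taken from the branches above:

def save_parameters(self, amount_model, trend_model):
    # persist the bookkeeping values that load_parameters reads back
    self.save_data_to_file(self.amount_prediction_method, data_type=self.SAVE_TYPE_MODEL,
                           file_name='amount_method')
    self.save_data_to_file(self.trend_prediction_method, data_type=self.SAVE_TYPE_MODEL,
                           file_name='trend_method')

    amount_model_path = os.path.join(os.path.abspath(self.model_path), 'amount_model')
    trend_model_path = os.path.join(os.path.abspath(self.model_path), 'trend_model')

    # MLlib models persist themselves via model.save(sc, path); anything else
    # falls back to the generic file helper
    if self.amount_prediction_method in (self.RANDOM_FOREST, self.LINEAR_REGRESSION):
        amount_model.save(self.sc, amount_model_path)
    else:
        self.save_data_to_file(amount_model, data_type=self.SAVE_TYPE_MODEL, file_name='amount_model')

    if self.trend_prediction_method in (self.RANDOM_FOREST, self.LOGISTIC_REGRESSION,
                                        self.NAIVE_BAYES, self.SVM):
        trend_model.save(self.sc, trend_model_path)
    else:
        self.save_data_to_file(trend_model, data_type=self.SAVE_TYPE_MODEL, file_name='trend_model')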
Example #2
def predict():
    # Make prediction and test accuracy.
    sc = SparkContext(appName='nb_test')
    sameModel = NaiveBayesModel.load(sc, "../../target/myNaiveBayesModel")
    data = sc.textFile('../../data/mllib/sample_naive_bayes_data.txt').map(parseLine)
    
    # Split data into training (10%) and test (90%)
    training, test = data.randomSplit([0.1, 0.9], seed=0)
    print test.collect()
    predictionAndLabel = test.map(lambda p: (sameModel.predict(p.features), p.label))
    print predictionAndLabel.collect()
    accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count()
    print accuracy
def main():
    sc = SparkContext(appName="BayesClassifer")
    htf = HashingTF(50000)
    data = sc.textFile('/home/varshav/work/PycharmProjects/Sentiment/cleaned_bayes_labels.csv')
    data_cleaned = data.map(lambda line : line.split(","))
    # Create an RDD of LabeledPoints using category labels as labels and tokenized, hashed text as feature vectors
    data_hashed = data_cleaned.map(lambda (label, text): LabeledPoint(label, htf.transform(text)))
    data_hashed.persist()
    # data = sc.textFile('/home/admin/work/spark-1.4.1-bin-hadoop2.4/data/mllib/sample_naive_bayes_data.txt').map(parseLine)
    #print data
    # Split data approximately into training (70%) and test (30%)
    training, test = data_hashed.randomSplit([0.70, 0.30], seed=0)

    sameModel = NaiveBayesModel.load(sc, "/home/varshav/work/PycharmProjects/StockAnalysis/myModel")

    print "----------"
    print sameModel.predict(htf.transform("posts jump in net profit"))

    predictionAndLabel = test.map(lambda p: (sameModel.predict(p.features), p.label))
    predictionAndLabel1 = training.map(lambda p: (sameModel.predict(p.features), p.label))
    prediction = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count()
    prediction1 = 1.0 * predictionAndLabel1.filter(lambda (x, v): x == v).count() / training.count()
    buy_buy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == 1 and v ==1).count()


    # Instantiate metrics object
    metrics = MulticlassMetrics(predictionAndLabel)

    # Overall statistics
    precision = metrics.precision()
    precision = normalize(precision)
    recall = metrics.recall()
    recall = normalize(recall)
    f1Score = metrics.fMeasure()
    f1Score = normalize(f1Score)
    print("Summary Stats")
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)

    '''
    # Statistics by class
    labels = data_hashed.map(lambda lp: lp.label).distinct().collect()

    for label in sorted(labels):
        print("Class %s precision = %s" % (label, metrics.precision(label)))
        print("Class %s recall = %s" % (label, metrics.recall(label)))
        print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0)))
    '''
Example #4
def predict_row(sentence, spark_context, model_folder):
    """
    
    :param sentence: a sentence to be analysed 
    :type sentence: basestring
    :param spark_context: the current spark context 
    :type spark_context: SparkContext
    :param model_folder: path to the folder containing the saved NaiveBayesModel
    :type model_folder: basestring
    :return: 0.0 if the sentence is negative, 1.0 if it is neutral and 2.0 if it is positive
    :rtype: float
    """
    htf = HashingTF(50000)
    sentence_features = htf.transform(tokenize(sentence))
    model = NaiveBayesModel.load(spark_context, model_folder)
    prediction = model.predict(sentence_features)
    print 'prediction :', prediction
    return prediction
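A minimal usage sketch for predict_row, assuming a local SparkContext and a model folder previously written by get_naive_bayes_model below; the path and sample sentence are placeholders:

if __name__ == '__main__':
    sc = SparkContext(appName='sentiment_predict')
    score = predict_row('the service was great', sc, '/tmp/nb_sentiment_model')
    print('sentiment class: %s' % score)  # 0.0 negative, 1.0 neutral, 2.0 positive
    sc.stop()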
Example #5
def get_naive_bayes_model(spark_context, train_hashed, model_folder):
    """

    :param spark_context: the current spark context
    :type spark_context: SparkContext
    :param train_hashed: training data as an RDD of LabeledPoint with hashed text features
    :type train_hashed: RDD
    :param model_folder: path where the trained model is saved to, or loaded from if it already exists
    :type model_folder: basestring
    :return: a trained Naive Bayes model
    :rtype: NaiveBayesModel
    """
    if not path.exists(model_folder):

        # Train a Naive Bayes model on the training data
        model = NaiveBayes.train(train_hashed)

        # Ask Spark to save the model so it won't have to be re-trained later
        model.save(spark_context, model_folder)
    else:
        model = NaiveBayesModel.load(spark_context, model_folder)
    return model
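A sketch of how train_hashed might be prepared before calling get_naive_bayes_model, following the HashingTF / LabeledPoint pattern used elsewhere on this page; the input file layout (one "label,text" pair per line) and the paths are assumptions:

htf = HashingTF(50000)
raw = spark_context.textFile('/tmp/labeled_sentences.csv')
train_hashed = (raw.map(lambda line: line.split(',', 1))
                   .map(lambda lt: LabeledPoint(float(lt[0]), htf.transform(tokenize(lt[1])))))
model = get_naive_bayes_model(spark_context, train_hashed, '/tmp/nb_sentiment_model')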
def main():
    sc = SparkContext(appName="BayesClassifer")
    htf = HashingTF(50000)
    data = sc.textFile('/home/varshav/work/PycharmProjects/Sentiment/1.csv')
    data_cleaned = data.map(lambda line: line.split(","))
    # Create an RDD of LabeledPoints using category labels as labels and tokenized, hashed text as feature vectors
    data_hashed = data_cleaned.map(
        lambda (label, text): LabeledPoint(label, htf.transform(text)))
    data_hashed.persist()
    # data = sc.textFile('/home/admin/work/spark-1.4.1-bin-hadoop2.4/data/mllib/sample_naive_bayes_data.txt').map(parseLine)
    #print data
    # Split data approximately into training (70%) and test (30%)
    training, test = data_hashed.randomSplit([0.70, 0.30], seed=0)

    # Train a naive Bayes model.
    model = NaiveBayes.train(training, 1.0)

    # Save and load model
    model.save(sc, "/home/varshav/Desktop/Bangalore")
    sameModel = NaiveBayesModel.load(sc, "/home/varshav/Desktop/Bangalore")

    print "----------"
    print model.predict(htf.transform("posts jump in net profit"))
    # Make prediction and test accuracy.
    predictionAndLabel = test.map(lambda p:
                                  (sameModel.predict(p.features), p.label))
    predictionAndLabel1 = training.map(
        lambda p: (sameModel.predict(p.features), p.label))
    prediction = 1.0 * predictionAndLabel.filter(
        lambda (x, v): x == v).count() / test.count()
    #buy_buy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == 1 and v == 1 ).count()
    #    print buy_buy
    prediction1 = 1.0 * predictionAndLabel1.filter(
        lambda (x, v): x == v).count() / training.count()

    print prediction
    print prediction1
    sc.stop()
Example #7
def test_Model():
    model = NaiveBayesModel.load(sc, "finalproject/model/NaiveBayesModel")
    testFile = sc.textFile("testdata.manual.csv")
    testData = testFile.map(transformData)
    testData = testData.filter(lambda x: x[0] != '\"2\"')
    features = testData.map(lambda x: x[1])
    #print(testData.map(lambda x:x[0]).collect())
    label = testData.map(
        lambda x: x[0]).distinct().zipWithIndex().collectAsMap()
    print(label)
    preprocessedData = preProcess(features)
    featuresData = getFeatures(preprocessedData)
    print("train", featuresData.take(5))
    testingLabeled = createLabeledData(featuresData, testData)
    print("train", testingLabeled.take(5))
    labeledTestingData = testingLabeled.map(
        lambda record: LabeledPoint(label[record[0][0]], record[1]))

    predictionAndLabel = labeledTestingData.map(
        lambda p: (model.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(
        lambda (x, v): x == v).count() / labeledTestingData.count()
    print("Accuracy: ", accuracy)

try:
    from pyspark import SparkContext
    from pyspark.mllib.feature import HashingTF
    from pyspark.mllib.regression import LabeledPoint
    from pyspark.mllib.classification import NaiveBayes
    from pyspark import SparkConf
    from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel
    from pyspark.mllib.linalg import Vectors

    print("Successfully imported Spark Modules")

except ImportError as e:
    print("Can not import Spark Modules", e)
    sys.exit(1)

sc = SparkContext(appName="Test")
sameModel = NaiveBayesModel.load(
    sc, "/home/sparkCluster/work/PycharmProjects/StockAnalysis/myModel")
htf = HashingTF(50000)

pg_no = 1
company_name = []
company_list = []
companies = []
codes_list = []
comp = []
experts = []
stopwords = []
exclude = []


def getCompanyName():
    global comp
Example #9
# **************************************************************************************************************
# Clothing category prediction
# **************************************************************************************************************

from pyspark.mllib.feature import HashingTF, IDF
from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel
from pyspark.mllib.linalg import Vectors

pdf = pd.read_csv('file:///home/ec2-user/data/parseData.csv',encoding='utf-8')
df = sqlContext.createDataFrame(pdf)

#df.show()

htf = HashingTF(10000)
categoryModel = NaiveBayesModel.load(sc, "target/tmp/parseModel")

# **************************************************************************************************************
# Classification
# **************************************************************************************************************

# labelDf.show()

def getGenderLabelCode(rdd, label):
    GenderRdd = rdd.map(lambda row: row.gender).distinct()

    def getGenderCode(rdd):
        dic = {'etc': 'e'}
        for feature in rdd.collect():
            uniToStr = str(feature)
            dic[uniToStr] = uniToStr[0]
Example #10
def standard_score(score):
    average = numpy.average(score)
    deviation = numpy.std(score)
    return 50 + 10 * ((score - average) / deviation)
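The likelihood helper that is mixed into NaiveBayesModel below is not included in this snippet; a minimal sketch based on the model's public pi (log class priors) and theta (log conditional probabilities) attributes might look like this:

def likelihood(self, vector):
    # per-class log-likelihood scores: log prior + theta . feature counts (a sketch, not the original helper)
    x = numpy.array(vector.toArray())
    return numpy.array(self.pi) + numpy.dot(numpy.array(self.theta), x)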

# mix-in
NaiveBayesModel.likelihood = likelihood

conf = SparkConf().setAppName("sample").setMaster("local")
sc = SparkContext(conf=conf)

path = os.path.abspath(os.path.dirname(__file__))
texts = pickle.load(open("%s/model/texts.pick" % path))
labels = pickle.load(open("%s/model/labels.pick" % path))

texts = sc.parallelize(texts)
htf = HashingTF(1000)  # Warning!! default value is 2^20
htf.transform(texts)

words = sys.argv[1].split()
test_tf = htf.transform(words)

model = NaiveBayesModel.load(sc, "%s/model" % path)
test = model.predict(test_tf)

likelihoods = model.likelihood(test_tf)
print "likelihoods: %s" % likelihoods
print "standard scores: %s" % standard_score(likelihoods)
print "label: %s" % labels[int(test)].encode('utf-8')
# json_data = {"likelihood": likelihoods[int(test)], "label": labels[int(test)].encode('utf-8')}
# print json.dumps(json_data)
 def load_model(cls, path="$SPARK_HOME/NaiveBayes"):
     """
     """
     return NaiveBayesModel.load(sc, path)
    no_stopwords = [w for w in no_punctuation if not w in STOPWORDS]
    stemmed = [STEMMER.stem(w) for w in no_stopwords]
    result = [w for w in stemmed if w]
    if not result:
        return [""]
    return result

folderpath='hdfs://ec2-54-213-170-202.us-west-2.compute.amazonaws.com:9000/user/root/crawled_data'

sc = SparkContext()
data_raw = sc.wholeTextFiles(folderpath)
data_cleaned = data_raw.map(lambda (filename, text): (filename, tokenize(text)))
htf = HashingTF(50000)
data_hashed = data_cleaned.map(lambda (filename, text): (filename, htf.transform(text)))
data_hashed.persist()
sameModel = NaiveBayesModel.load(sc, 'hdfs://ec2-54-213-170-202.us-west-2.compute.amazonaws.com:9000/user/root/bbcmodel')
predictedLabel = data_hashed.map(lambda (filename, text): (filename.split("/")[-1][:-4],sameModel.predict(text)))
preds = predictedLabel.collect()

conn = psycopg2.connect(database="NewsSource", user="******", password="******", host="newdb.cnceaogjppz8.us-west-2.rds.amazonaws.com", port="5432")
cur = conn.cursor()
update = 'update articlestable set classifiedcategory=%s where id=%s'

newscategory = {hash('entertainment'):'entertainment',hash('sports'):'sports',hash('politics'):'politics',hash('technology'):'technology',hash('business'):'business'}

for pred in preds:       
      
    if(hash('entertainment')== pred[1]):
     category = 'entertainment'
    elif(hash('sports')== pred[1]):
     category = 'sports'
    sorted_dict = sorted(dictionary_RDD_IDFs_Weights.items(),
                         key=operator.itemgetter(1))

    # Set to max of N words for corresponding number of features for which the model is trained
    Dictionary = []
    for key, value in sorted_dict:
        Dictionary.append(key)

    print(len(Dictionary))

    # Create a broadcast variable for the Dictionary
    Dictionary_BV = sc.broadcast(sorted(Dictionary))

    # Load Naive Bayes Model
    model_path = "/Users/path/to/twitter_analytics/NB_model"
    sameModel = NaiveBayesModel.load(sc, model_path)

    # Start intro Video -  make sure to first run "chmod a+x play.sh" otherwise --> permission denied exception
    video = "Users:path:to:vids:intro.mp4"
    video_1 = subprocess.Popen("osascript runner.scpt " + "'" + video + "'",
                               shell=True)

    # Get user twitter-handle
    x = int(
        input(
            "Do you have a twitter account? \n(1) Yes \n(2) No \nYour choice: "
        ))
    if x == 1:
        user_handle = input("Please provide user twitter handle: ")
        friends = get_friends(user_handle, api)
Example #14
    terms = tags.split()

    # filter words that not exist in the vocabulary
    terms = [x for x in list(set(terms)) if x in list(set(vocabulary))]

    indices = list(map(lambda x: vocabulary.index(x), list(set(terms))))
    indices.sort()
    occurrences = list(
        map(lambda x: float(terms.count(vocabulary[x])), indices))

    return [len(vocabulary), indices, occurrences]


conf = SparkConf()
conf.setAppName("NaiveBaye")
conf.set('spark.driver.memory', '6g')
conf.set('spark.executor.memory', '6g')
conf.set('spark.cores.max', 156)

#load tags passed as parameter
tags = sys.argv[1]
bow = bow(tags)  #bag of words of that tags

sc = SparkContext(conf=conf)  # SparkContext

model = NaiveBayesModel.load(sc, "model")

result = model.predict(SparseVector(bow[0], bow[1], bow[2]))

print str(classValues[result])
Example #15
 def load_model(self, context, path):
     return NaiveBayesModel.load(context, path)
def save_model(model, model_name):
    output_dir = model_name
    shutil.rmtree(output_dir, ignore_errors=True)
    model.save(sc, output_dir)


print('*' * 50, 'MODELS_TRAIN', '*' * 50)
iris = datasets.load_iris()
data_set = iris.data
Y = iris.target
data_set = pd.DataFrame(data_set)
data_set['labels'] = Y
print(data_set.head(5))
print(data_set.shape)

s_df = sqlContext.createDataFrame(data_set)
train_dataset = s_df.rdd.map(lambda x: LabeledPoint(x[-1], x[:4]))
training, test = train_dataset.randomSplit([0.6, 0.4])

model = NaiveBayes.train(training, 0.7)
predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label))
accuracy(predictionAndLabel)

################################################SAVE_LOAD###############################################################
print('*' * 50, 'SAVE_LOAD', '*' * 50)
save_model(model, 'myNaiveBayesModel')
sameModel = NaiveBayesModel.load(sc, 'myNaiveBayesModel')
predictionAndLabel_1 = test.map(lambda p: (sameModel.predict(p.features), p.label))
accuracy(predictionAndLabel_1)
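The accuracy helper called above is not shown in this snippet; a minimal sketch consistent with the accuracy computations used elsewhere on this page:

def accuracy(prediction_and_label):
    # fraction of (prediction, label) pairs that agree
    correct = prediction_and_label.filter(lambda pl: pl[0] == pl[1]).count()
    print('accuracy: {}'.format(1.0 * correct / prediction_and_label.count()))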
Example #17
__author__ = 'ruben'

from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint

def parseLine(line):
    parts = line.split(',')
    label = float(parts[0])
    features = Vectors.dense([float(x) for x in parts[1].split(' ')])
    return LabeledPoint(label, features)

data = sc.textFile('data/mllib/sample_naive_bayes_data.txt').map(parseLine)

# Split data approximately into training (60%) and test (40%)
training, test = data.randomSplit([0.6, 0.4], seed = 0)

# Train a naive Bayes model.
model = NaiveBayes.train(training, 1.0)

# Make prediction and test accuracy.
predictionAndLabel = test.map(lambda p : (model.predict(p.features), p.label))
accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count()

# Save and load model
model.save(sc, "myModelPath")
sameModel = NaiveBayesModel.load(sc, "myModelPath")
Example #18
 def loadModelFromDisk(self, sc):
     print("Loading pretrained model from disk \n")
     model = NaiveBayesModel.load(
         sc, "hdfs://192.168.1.33:9000//NaiveBayes.model")
     print("Complate \n")
     return model
if ascontext:
    if ascontext.isComputeDataModelOnly():
        ascontext.setSparkOutputSchema(output_schema)
        sys.exit(0)
    else:
        modelpath = ascontext.getModelContentToPath("model")
        model_metadata = json.loads(ascontext.getModelContentToString("model.metadata"))

# create a DataModelTools to handle data model and data conversions
datamodel = model_metadata["datamodel"]
dmt = DataModelTools(datamodel)
predictors = model_metadata["predictors"]
DataModelTools.checkPredictors(datamodel,predictors,df)

from pyspark.mllib.classification import NaiveBayesModel
model = NaiveBayesModel.load(sc, modelpath)

# to score the model, we need an RDD of DenseVector (the numeric encoded values of the predictors), use DataModelTools to do this
dv = dmt.extractDenseVector(df,predictors).map(lambda x:x[1])

# scoring generates an RDD of predictions (but not the original features)
predictions = model.predict(dv)

# now we need to zip together the original rows from the DataFrame and the RDD of predictions
# we end up with an RDD containing the list of values from the original dataframe plus the predicted class, converted from the encoded number to the original string
def rowToList(row):
    result = []
    for idx in range(0, len(row)):
        result.append(row[idx])
    return result
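The snippet ends before the zip step that the comment above describes; a rough continuation, assuming the model metadata provides a lookup from the encoded class number back to the original label string (the "target_values" key is an assumption):

target_values = model_metadata.get("target_values", {})
scored = df.rdd.map(rowToList).zip(predictions) \
    .map(lambda row_pred: row_pred[0] + [target_values.get(row_pred[1], row_pred[1])])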
Example #20
    # $example on$
    # Load and parse the data file.
    data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")

    # Split data approximately into training (60%) and test (40%)
    training, test = data.randomSplit([0.6, 0.4])

    # Train a naive Bayes model.
    model = NaiveBayes.train(training, 1.0)

    # Make prediction and test accuracy.
    predictionAndLabel = test.map(lambda p:
                                  (model.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(
        lambda pl: pl[0] == pl[1]).count() / test.count()
    print('model accuracy {}'.format(accuracy))

    # Save and load model
    output_dir = 'target/tmp/myNaiveBayesModel'
    shutil.rmtree(output_dir, ignore_errors=True)
    model.save(sc, output_dir)
    sameModel = NaiveBayesModel.load(sc, output_dir)
    predictionAndLabel = test.map(lambda p:
                                  (sameModel.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(
        lambda pl: pl[0] == pl[1]).count() / test.count()
    print('sameModel accuracy {}'.format(accuracy))

    # $example off$
Example #21
 def getModel(self, path):
     if self.type == 'NaiveBayes':
         return NaiveBayesModel.load(self.sc, path)
     elif self.type == 'DecisionTree':
         return DecisionTreeModel.load(self.sc, path)
def main():
    sc = SparkContext(appName="BayesClassifer")
    htf = HashingTF(50000)
    data = sc.textFile(
        '/home/varshav/work/PycharmProjects/Sentiment/cleaned_bayes_labels1.csv'
    )
    data_cleaned = data.map(lambda line: line.split(","))
    # Create an RDD of LabeledPoints using category labels as labels and tokenized, hashed text as feature vectors
    data_hashed = data_cleaned.map(
        lambda (label, text): LabeledPoint(label, htf.transform(text)))
    data_hashed.persist()
    # data = sc.textFile('/home/admin/work/spark-1.4.1-bin-hadoop2.4/data/mllib/sample_naive_bayes_data.txt').map(parseLine)
    #print data
    # Split data approximately into training (70%) and test (30%)
    training, test = data_hashed.randomSplit([0.70, 0.30], seed=0)

    sameModel = NaiveBayesModel.load(
        sc, "/home/varshav/work/PycharmProjects/StockAnalysis/myModel")

    print "----------"
    print sameModel.predict(htf.transform("posts jump in net profit"))

    predictionAndLabel = test.map(lambda p:
                                  (sameModel.predict(p.features), p.label))
    predictionAndLabel1 = training.map(
        lambda p: (sameModel.predict(p.features), p.label))
    prediction = 1.0 * predictionAndLabel.filter(
        lambda (x, v): x == v).count() / test.count()
    prediction1 = 1.0 * predictionAndLabel1.filter(
        lambda (x, v): x == v).count() / training.count()

    buy_buy = 1.0 * predictionAndLabel.filter(
        lambda (x, v): x == 1 and v == 1).count()
    sell_sell = 1.0 * predictionAndLabel.filter(
        lambda (x, v): x == 2 and v == 2).count()
    hold_hold = 1.0 * predictionAndLabel.filter(
        lambda (x, v): x == 3 and v == 3).count()

    print buy_buy
    print sell_sell
    print hold_hold

    # Statistics by class
    labels = data_hashed.map(lambda lp: lp.label).distinct().collect()
    print labels
    print type(labels[0])
    '''
    for label in sorted(labels):
        print("Class %s precision = %s" % (label, metrics.precision(label)))
        print("Class %s recall = %s" % (label, metrics.recall(label)))
        print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0)))
    '''
    '''
    print("Class %s precision = %s" % (1, metrics.precision(1)))
    print("Class %s recall = %s" % (1, metrics.recall(1)))
    print("Class %s F1 Measure = %s" % (1, metrics.fMeasure()))

    print("Class %s precision = %s" % (2, metrics.precision(2)))
    print("Class %s recall = %s" % (2, metrics.recall(2)))
    print("Class %s F1 Measure = %s" % (2, metrics.fMeasure(2)))
    '''
    # Weighted stats
    '''
    print("Weighted recall = %s" % metrics.weightedRecall)
    print("Weighted precision = %s" % metrics.weightedPrecision)
    print("Weighted F(1) Score = %s" % metrics.weightedFMeasure())
    print("Weighted F(0.5) Score = %s" % metrics.weightedFMeasure(beta=0.5))
    print("Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate)
    '''

    sc.stop()
Example #23
kafka_configuration_params = {
    "topic": ["BigData"],
    "connectionstring": "localhost:9092"
}

from pyspark.streaming.kafka import KafkaUtils
directKafkaStream = KafkaUtils.createDirectStream(
    ssc, kafka_configuration_params["topic"],
    {"metadata.broker.list": kafka_configuration_params["connectionstring"]})

from pyspark.mllib.classification import SVMModel, LogisticRegressionModel, NaiveBayesModel

LR_model = LogisticRegressionModel.load(sc, "../../notebooks/LR_model")
SVM_model = SVMModel.load(sc, "../../notebooks/SVM_model")
NB_model = NaiveBayesModel.load(sc, "../../notebooks/NB_model")

import nltk
import random
from nltk.tokenize import word_tokenize

allowed_word_types = ["JJ"]

rdd_all_words = sc.textFile("../../notebooks/all_words/part-00000")
rdd_broadcast_all_words = sc.broadcast(rdd_all_words.collect())


def convert_tweet_to_instance(tweets):

    rdd_tweets = tweets.map( \
    lambda tweet: [word[0] for word in nltk.pos_tag(word_tokenize(tweet)) if word[1] in allowed_word_types])
Example #24
 def init_spark_components(self):
     print("Loading Model")
     self.model = NaiveBayesModel.load(sc,
                                       path.join(self.base_path, 'model'))
     self.tf = HashingTF()
Example #25
 def load_model(cls, path="$SPARK_HOME/NaiveBayes"):
     """
     """
     return NaiveBayesModel.load(sc, path)
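Note that the literal "$SPARK_HOME" in the default path above is not expanded automatically by Python or Spark; a hedged variant that expands environment variables before loading:

import os

def load_model(cls, path="$SPARK_HOME/NaiveBayes"):
    """Load a saved NaiveBayesModel, expanding environment variables in the path."""
    return NaiveBayesModel.load(sc, os.path.expandvars(path))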
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.mllib.classification import NaiveBayesModel
import time as tm
from threading import Thread
import numpy as np

conf = SparkConf().setAppName("appName").setMaster("local")
conf.set("spark.executor.memory", "2g")
sc = SparkContext(conf=conf)
spark = SparkSession(sc)

#Load pretrained models
output_dir1 = '/home/emmittxu/Desktop/Stock-Sentiment-alalysis/Models/myNaiveBayesModel'
output_dir2 = '/home/emmittxu/Desktop/Stock-Sentiment-alalysis/Models/sent_stockModel'
print("Loading model.......")
model1 = NaiveBayesModel.load(sc, output_dir1)
model2 = NaiveBayesModel.load(sc, output_dir2)
print("Models successfully loaded......")

#Global variables to record the number of positive and negative sentiments
negative = 0.0
neutral = 0.0
positive = 0.0


#Do feature extraction using TF-IDF and feed feature vectors to the sentiment classifier
def vectorize_feature(training):
    try:
        global positive
        global negative
        positive = 0
Example #27
 def loadModel(self, sc):
     global model
     model = NaiveBayesModel.load(
         sc, "hdfs://192.168.1.33:9000/NaiveBayes.model")
	def getModel(self, path):
		if self.type == 'NaiveBayes':
			return NaiveBayesModel.load(self.sc, path)
		elif self.type == 'DecisionTree':
			return DecisionTreeModel.load(self.sc, path)
Example #29
    for word in lowercased:
        punct_removed = ''.join([letter for letter in word if not letter in PUNCTUATION])
        no_punctuation.append(punct_removed)
    no_stopwords = [w for w in no_punctuation if not w in STOPWORDS]
    stemmed = [STEMMER.stem(w) for w in no_stopwords]
    return [w for w in stemmed if w]


def parseLine(line):
    parts = line.split(',')
    label = float(parts[0])
    features = Vectors.dense([float(x) for x in parts[1].split(' ')])
    return LabeledPoint(label, features)

data = sc.textFile(r'C:\Users\SigurdLap\PycharmProjects\sparkTwitter\naiveBayes.txt').map(parseLine)

# Split data approximately into training (60%) and test (40%)
training, test = data.randomSplit([0.6, 0.4], seed=0)
# Try splitting into 5 parts, cross validation

# Train a naive Bayes model.
model = NaiveBayes.train(training, 1.0)

# Make prediction and test accuracy.
predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label))
accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count()

# Save and load model
model.save(sc, "target/tmp/myNaiveBayesModel")
sameModel = NaiveBayesModel.load(sc, "target/tmp/myNaiveBayesModel")
Example #30
        analysis_type = 'hashtag_analysis'
        send_df_to_dashboard(hashtag_counts_df, analysis_type)
    except:
        e = sys.exc_info()[0]
        print("There is an error: %s" % e)


conf = SparkConf()
conf.setAppName("TwitterStreamApp")
sc = SparkContext(conf=conf)
sc.setLogLevel("ERROR")
ssc = StreamingContext(sc, 3)
ssc.checkpoint("checkpoint_models")
htf = HashingTF(50000)
NB_output_dir = '/spark/NaiveBayes'
NB_load_model = NaiveBayesModel.load(sc, NB_output_dir)

# Sentiment Analysis #

## 01 read tweets from stream ##
dataStream = ssc.socketTextStream("localhost", 9009)
## 02 split the text into words #
words = dataStream.map(lambda x: x.split(" "))
## 03 transformed the words into features ##
features = words.map(lambda x: htf.transform(x))
## 04 predict the sentiment ##
prediction = features.map(lambda x: classify(x))
## 05 label the sentiments ##
label_sentiments = prediction.map(lambda x: ('positive', 1)
                                  if x == 1 else ('negative', 1))
## 06 aggregate the results using sentiment as key ##
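The classify helper used in step 04 above is not shown in this snippet; a minimal sketch that simply wraps the loaded Naive Bayes model would be:

def classify(features):
    # returns 1.0 for positive sentiment and 0.0 for negative, per the labels used in step 05
    return NB_load_model.predict(features)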
Example #31
    tf_val = 1048576

    # LOADING AND COMPUTING TF's TRAINING MODEL
    print('Loading TRAINING_TF_MODEL...', end="")
    tf_training = sc.pickleFile(os.getcwd() + "/Desktop/MODEL/TF/TF_MODEL_" +
                                str(tf_val))
    print('done!')

    print('Computing TF-IDF MODEL...', end="")
    idf_training = IDF(minDocFreq=5).fit(tf_training)
    print('done!')

    print('Loading Naive Bayes Model...', end="")
    NBM = NaiveBayesModel.load(
        sc,
        os.getcwd() + "/Desktop/MODEL/NBM/NaiveBayesModel_" + str(tf_val))
    print('done!')

    print('READY TO PROCESS DATA...')

    kafkaParams = {"metadata.broker.list": kafka_brokers}

    # CREATE DIRECT KAFKA STREAM WITH BROKERS AND TOPICS
    streamData = KafkaUtils.createDirectStream(ssc, [kafka_topic], kafkaParams)

    ######### FROM NOW ON, EACH ACTION OR TRANSFORMATION IS DONE ON A SINGLE INCOMING BATCH OF TWEETS #########

    # PRE-PROCESSING TWEETS DATA (TESTING)
    obj1 = TweetPreProcessing()
Example #32

def parseLine(line):
    parts = line.split(',')
    label = float(parts[0])
    features = Vectors.dense([float(x) for x in parts[1].split(' ')])
    return LabeledPoint(label, features)
# $example off$

if __name__ == "__main__":

    sc = SparkContext(appName="Jay")

    # $example on$
    data = sc.textFile('data/mllib/naive_bayes_data.txt').map(parseLine)

    # Split data approximately into training (60%) and test (40%)
    training, test = data.randomSplit([0.6, 0.4], seed=0)

    # Train a naive Bayes model.
    model = NaiveBayes.train(training, 1.0)

    # Make prediction and test accuracy.
    predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count()

    # Save and load model
    model.save(sc, "jay/myNaiveBayesModel")
    sameModel = NaiveBayesModel.load(sc, "jay/myNaiveBayesModel")
    # $example off$
Example #33
# $example off$

if __name__ == "__main__":

    sc = SparkContext(appName="PythonNaiveBayesExample")

    # $example on$
    data = sc.textFile("data/mllib/sample_naive_bayes_data.txt").map(parseLine)

    # Split data approximately into training (60%) and test (40%)
    training, test = data.randomSplit([0.6, 0.4], seed=0)

    # Train a naive Bayes model.
    model = NaiveBayes.train(training, 1.0)

    # Make prediction and test accuracy.
    predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count()
    print("model accuracy {}".format(accuracy))

    # Save and load model
    output_dir = "target/tmp/myNaiveBayesModel"
    shutil.rmtree(output_dir, ignore_errors=True)
    model.save(sc, output_dir)
    sameModel = NaiveBayesModel.load(sc, output_dir)
    predictionAndLabel = test.map(lambda p: (sameModel.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count()
    print("sameModel accuracy {}".format(accuracy))

    # $example off$
    def classify(self, transformer):
        votes = []
        for c in self._classifiers:
            v = c.predict(transformer)
            votes.append(v)
        return mode(votes)


conf = SparkConf()
conf.setAppName("TA")
sc = SparkContext(conf=conf)
tre = StreamingContext(sc, 10)
htf = HashingTF(50000)

NB_directory = 'hdfs://master:9000/user/hadoop/NaiveBayes'
NB_model = NaiveBayesModel.load(sc, NB_directory)

LR_directory = 'hdfs://master:9000/user/hadoop/LogisticRegression'
LR_model = LogisticRegressionModel.load(sc, LR_directory)

DT_output_dir = 'hdfs://master:9000/user/hadoop/DT'
DT_model = DecisionTreeModel.load(sc, DT_output_dir)

voted_classifier = VoteClassifier(NB_model, LR_model, DT_model)


def sentiment(test_sample):
    sample_data_test = test_sample.split(" ")
    cli = htf.transform(sample_data_test)
    return voted_classifier.classify(cli)
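A quick usage sketch for the voting classifier above; the sample sentence is illustrative:

print(sentiment("posts jump in net profit"))  # majority vote of the NB, LR and DT models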
Example #35
def salary_pre(request):
    sc=SparkContext('local','test')
    spark = SparkSession.builder.getOrCreate()
    hive_con=HiveContext(sc)
    nd_idf=IDFModel.load('hdfs://localhost:9000/ndidf')
    agg_idf=IDFModel.load('hdfs://localhost:9000/aggidf')
    model=NaiveBayesModel.load(sc,'hdfs://localhost:9000/nymodel')
    # hive_con.sql('use zp')
    # testdata=hive_con.sql('select education,mon_wa,name,work_area,work_desp,work_exp,work_lable from `qtzp` where id=789')
    # testdataRDD = testdata.rdd.map(lambda i: Row(**{
    #     'education': new_edu_trans(i.education),
    #     'salary': mon_wa_trans(i.mon_wa),
    #     'name': i.name,
    #     'city': i.work_area,
    #     'work_desp': i.work_desp,
    #     'work_exp': i.work_exp,
    #     'work_lable': i.work_lable
    # }))
    # dataDF=testdataRDD.map(lambda i:Row(**{
    #     'salary': int(i.salary),
    #     'agg': [i.education] + [i.city] + [i.work_lable] + [i.work_exp],
    #     'name_and_desp': desp_text_division(i.name + ',' + i.work_desp)
    # })).toDF()
    # dataDF.show()
    city=request.POST.get('city')
    edu=request.POST.get('education')
    introduce=request.POST.get('introduce')
    position=request.POST.get('job')
    exp=request.POST.get('exp')
    dataRDD=sc.parallelize([[edu,city,position,exp,introduce]])
    # schema=StructType([StructField('education',StringType(),True),StructField('work_area',StringType(),True),StructField('work_lable',
    #         StringType(),True),StructField('work_exp',StringType(),True),StructField('work_desp',StringType(),True)])
    # rowRDD=dataRDD.map(lambda i:Row(i[0],i[1],i[2],i[3],i[4]))
    # dataDF=spark.createDataFrame(rowRDD,schema)
    # dataDF.show()
    dataDF=dataRDD.map(lambda i:Row(**{
        'education':i[0],
        'work_area':i[1],
        'work_lable':i[2],
        'work_exp':i[3],
        'work_desp':i[4]
    })).map(lambda i:Row(**{
        'education':str(new_edu_trans(i.education)),
        'city':[i.work_area],
        'work_desp':i.work_desp,
        'work_lable':[i.work_lable],
        'work_exp':[i.work_exp]
    })).map(lambda i:Row(**{
        'agg':[i.education] + i.city + i.work_lable + i.work_exp,
        'name_and_desp':desp_text_division(i.work_desp)
    })).toDF()
    dataDF.show()

    ndtf = HashingTF(inputCol='name_and_desp', outputCol='ndFeatures', numFeatures=10240)
    aggtf = HashingTF(inputCol='agg', outputCol='Features_agg', numFeatures=256)
    data = ndtf.transform(dataDF)
    data = aggtf.transform(data)
    idfdata = nd_idf.transform(data)
    idfdata = agg_idf.transform(idfdata)
    RDD = idfdata.rdd
    # featuresRDD = RDD.map(lambda i: (i.salary, i.ndfeatures.toArray().tolist() + i.features_agg.toArray().tolist()))  #test
    featuresRDD = RDD.map(lambda i: i.ndfeatures.toArray().tolist() + i.features_agg.toArray().tolist())      # production
    # featuresRDD = featuresRDD.map(lambda i: features_trans(i))      #test
    featuresRDD=featuresRDD.map(lambda i:DenseVector(i))       # production
    # result=featuresRDD.map(lambda i: model.predict(i.features)).collect()       #test
    result=featuresRDD.map(lambda i:model.predict(i)).collect()
    # result=result[0]
    spark.stop()
    sc.stop()
    city_mean=models.CSR.objects.using('db2').filter(city__contains=city)
    city_mean=city_mean[0].salary
    salary=result_trans(result[0])
    pos_mean=models.ITS.objects.using('db2').get(name=position)
    pos_mean=pos_mean.salary
    data_lst=[city_mean,pos_mean,salary]
    data_lst=json.dumps(data_lst)
    return render(request,'salary.html',{'result':result,'position':position,'city':city,'edu':edu,'exp':exp,'data':data_lst})
Example #36
    def calculateSentiment(self,sc,query):
        model = NaiveBayesModel.load(sc,"finalproject/model/NaiveBayesModel")
        query = query
        print (query)
        twitDG = TwitterDataGenerator()
        twitDG.getData(query)
        inputFile = sc.textFile("finalproject/tweets.csv").distinct()
        input_id = inputFile.zipWithIndex().map(lambda l:(l[1],l[0]))
        preprocessedData = self.preProcess(inputFile)
        inputFileProcessed = self.processInputFile(inputFile)
        print("#################################################################################################")
        print(preprocessedData.take(5))
        print("--------------------------------------------------------------------------------------------------")
        print(inputFileProcessed.take(5))
        print("input file processed ",inputFileProcessed.count())
        print("preprocessed count",preprocessedData.count())
        hashingTF = HashingTF()
        tfData = preprocessedData.map(lambda tup: hashingTF.transform(tup))
        idfData = IDF().fit(tfData)
        tfidfData = idfData.transform(tfData)
        output = tfidfData.map(lambda rec: model.predict(rec))
        i_I=inputFileProcessed.map(lambda l: l[0]).zipWithIndex().map(lambda l:(l[1],l[0]))
        print("input file count",inputFile.count())
        print ("output file count",output.count())
        o_I=output.zipWithIndex().map(lambda l:(l[1],l[0]))
        i_o =i_I.join(o_I).map(lambda l:l[1])
        print(i_o.take(i_o.count()))
        print(i_o.count())
        outputJson = {}
        tweetList = []
        tweet = {}
        positiveCount =0
        negativeCount =0
        for i in i_o.take(i_o.count()):
            print(i)
                #print data,data1
            if i[1] == 0.0:
                negativeCount = negativeCount+1
                text = "This is a negative Tweet"
            elif i[1] == 1.0:
                positiveCount = positiveCount + 1
                text = "This is a positive Tweet"
                    #data = text
            #replace(u"\u2022", "*").encode("utf-8")
            if len(i[0]) > 4:
                tweet = {}
                tweet['value'] = i[0].encode("ascii","ignore")
                tweet['sentiment'] = text
                tweetList.append(tweet)
                print i[0].encode("ascii","ignore")
                print text
                print "-------------------------------------"

                #print unicode(str(data),"utf-8")
        print (positiveCount)
        print (negativeCount)
        outputJson["tweets"] = json.dumps(tweetList)
        outputJson["positiveTweetCount"] = positiveCount
        outputJson["negativeTweetCount"] = negativeCount
        wordflatMap = preprocessedData.flatMap(lambda xs: [x for x in xs]).map(lambda x:x.encode("ascii","ignore")).map(lambda x: (x, 1)).reduceByKey(add)
        wordFlatMap_reversed = wordflatMap.map(lambda l:(l[1],l[0])).filter(lambda l: (l[1]!="rt" and l[1]!=query))
        wordFlatMap_sorted = wordFlatMap_reversed.sortByKey(False)
        print (wordFlatMap_sorted.take(10))
        outputFrequencyList = {}
        mostFrequentWordList = []
        wordCount = {}
        words =[]
        counts = []
        for i in wordFlatMap_sorted.take(10):
            wordCount = {}
            wordCount['word'] = i[1]
            wordCount['count'] = i[0]
            mostFrequentWordList.append(wordCount)
        outputJson["frequency"] = json.dumps(mostFrequentWordList)
        return outputJson
    tokenhtml = tokenize(a)
    print(tokenhtml)
    body = ''
    for i in range(0, len(tokenhtml)):
        body += tokenhtml[i] + ' '
    html_dict.append({"label": "0", "text": body})

    sc = SparkContext()
    htmldata = sc.parallelize(html_dict)
    labels = htmldata.map(lambda doc: doc["label"], preservesPartitioning=True)

    tf = HashingTF().transform(
        htmldata.map(lambda doc: doc["text"], preservesPartitioning=True))
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf)
    end_tfidf = datetime.now()
    tfidf_time = format(end_tfidf - start_tfidf)

    dataset = labels.zip(tfidf).map(lambda x: LabeledPoint(x[0], x[1]))
    sameModel = NaiveBayesModel.load(
        sc, "/Users/apple/Dropbox/2016Spring/COSC526/MacHW1/mymodel")
    start_predict = datetime.now()
    predictionAndLabel = dataset.map(lambda p: (sameModel.predict(p.features), p.label))
    end_predict = datetime.now()

    predict_time = format(end_predict - start_predict)
    accuracy = 1.0 * predictionAndLabel.filter(
        lambda (x, v): x == v).count() / dataset.count()

    print(tfidf_time)

    print(accuracy)
import sys

os.environ['SPARK_HOME'] = 'spark/spark'
sys.path.append('spark/spark/python/')


try:
   from pyspark import SparkContext
   from pyspark import SparkConf
   print("Successfully imported Spark Modules")

except ImportError as e:
   print("Can not import Spark Modules", e)
   sys.exit(1)


config = SparkConf().setMaster('local[*]').setAppName('SparkService')
sc = SparkContext(conf=config)
sc.setLogLevel("ERROR")


from pyspark.mllib.feature import HashingTF
from pyspark.mllib.classification import NaiveBayesModel

hashingTF = HashingTF()

sameModel = NaiveBayesModel.load(sc, "spark/nbm")

print(sameModel.predict(hashingTF.transform("This is good place".split(" "))))

Example #39
def main(sc, sqlContext):

    #start = timer()

    #print '---Fetching user, posts, tokens and categories from MongoDB---'
    #start_i = timer()
    user = findUserById(iduser)
    posts = findPosts(user)

    tokens, category, categoryAndSubcategory = getTokensAndCategories()
    postsRDD = (sc.parallelize(posts).map(lambda s: (s[
        0], word_tokenize(s[1].lower()), s[2], s[3])).map(lambda p: (p[
            0], [x for x in p[1] if x in tokens], p[2], p[3])).cache())

    #print '####took %d seconds' % (timer() - start_i)

    #print '---Fetching products from MongoDB---'
    #start_i = timer()

    #print '####took %d seconds' % (timer() - start_i)

    #print '---Creating corpusRDD---'
    #start_i = timer()
    stpwrds = stopwords.words('portuguese')
    corpusRDD = (postsRDD.map(lambda s: (s[0], [
        PorterStemmer().stem(x) for x in s[1] if x not in stpwrds
    ], s[2], s[3])).filter(lambda x: len(x[1]) >= 20 or
                           (x[2] == u'Post' and len(x[1]) > 0)).cache())
    #print '####took %d seconds' % (timer() - start_i)

    #print '---Computing TF-IDF---'
    #start_i = timer()
    wordsData = corpusRDD.map(
        lambda s: Row(label=int(s[0]), words=s[1], type=s[2]))
    wordsDataDF = sqlContext.createDataFrame(wordsData).unionAll(
        sqlContext.read.parquet(
            "/home/ubuntu/recsys-tcc-ml/parquet/wordsDataDF.parquet"))

    numTokens = len(tokens)
    hashingTF = HashingTF(inputCol="words",
                          outputCol="rawFeatures",
                          numFeatures=numTokens)
    idf = IDF(inputCol="rawFeatures", outputCol="features")

    featurizedData = hashingTF.transform(wordsDataDF)

    idfModel = idf.fit(featurizedData)
    tfIDF = idfModel.transform(featurizedData).cache()

    postTFIDF = (
        tfIDF.filter(tfIDF.type == u'Post')
        #.map(lambda s: Row(label=s[0], type=s[1], words=s[2], rawFeatures=s[3], features=s[4], sentiment=SVM.predict(s[4])))
        .cache())

    #postTFIDF = postTFIDF.filter(lambda p: p.sentiment == 1)
    #print '####took %d seconds' % (timer() - start_i)

    #print '---Loading model---'
    #start_i = timer()
    NB = NaiveBayesModel.load(
        sc, '/home/ubuntu/recsys-tcc-ml/models/naivebayes/modelo_categoria')
    SVM = SVMModel.load(sc, "/home/ubuntu/recsys-tcc-ml/models/svm")
    #print '####took %d seconds' % (timer() - start_i)

    #print '---Using the model---'
    #start_i = timer()
    predictions = (postTFIDF.map(lambda p: (NB.predict(p.features), p[
        0], SVM.predict(p.features))).filter(lambda p: p[2] == 1).map(
            lambda p: (p[0], p[1])).groupByKey().mapValues(list).collect())

    #print '####took %d seconds' % (timer() - start_i)
    #print '---Computing similarities---'
    #start_i = timer()
    suggestions = []

    for prediction in predictions:
        category_to_use = category[int(prediction[0])]
        #print ' Computing similarities for category: {}'.format(category_to_use)
        tf = tfIDF.filter(tfIDF.type == category_to_use).cache()
        for post in prediction[1]:
            postVector = postTFIDF.filter(
                postTFIDF.label == post).map(lambda x: x.features).collect()[0]
            sim = (tf.map(lambda x: (
                post, x.label, cossine(x.features, postVector))).filter(
                    lambda x: x[2] >= threshold).collect())
            if len(sim) > 0:
                suggestions.append(sim)

    #print '####took %d seconds' % (timer() - start_i)

    if len(suggestions) > 0:
        #print '---Inserting recommendations into MongoDB---'
        #start_i = timer()
        insertSuggestions(suggestions, iduser, posts)
Example #40
    label = float(parts[0])
    features = Vectors.dense([float(x) for x in parts[1].split(' ')])
    return LabeledPoint(label, features)


# $example off$

if __name__ == "__main__":

    sc = SparkContext(appName="PythonNaiveBayesExample")

    # $example on$
    data = sc.textFile('data/mllib/sample_naive_bayes_data.txt').map(parseLine)

    # Split data approximately into training (60%) and test (40%)
    training, test = data.randomSplit([0.6, 0.4], seed=0)

    # Train a naive Bayes model.
    model = NaiveBayes.train(training, 1.0)

    # Make prediction and test accuracy.
    predictionAndLabel = test.map(lambda p:
                                  (model.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(
        lambda (x, v): x == v).count() / test.count()

    # Save and load model
    model.save(sc, "target/tmp/myNaiveBayesModel")
    sameModel = NaiveBayesModel.load(sc, "target/tmp/myNaiveBayesModel")
    # $example off$
def main(sc, sqlContext):

    #start = timer()

    #print '---Fetching user, posts, tokens and categories from MongoDB---'
    #start_i = timer()
    user = findUserById(iduser)
    posts = findPosts(user) 
    
    tokens, category, categoryAndSubcategory = getTokensAndCategories()
    postsRDD = (sc.parallelize(posts).map(lambda s: (s[0], word_tokenize(s[1].lower()), s[2], s[3]))
                    .map(lambda p: (p[0], [x for x in p[1] if x in tokens] ,p[2], p[3]))
                    .cache())

    

    #print '####took %d seconds' % (timer() - start_i)

    #print '---Fetching products from MongoDB---'
    #start_i = timer()

    #print '####took %d seconds' % (timer() - start_i)

    #print '---Creating corpusRDD---'
    #start_i = timer()
    stpwrds = stopwords.words('portuguese')
    corpusRDD = (postsRDD.map(lambda s: (s[0], [PorterStemmer().stem(x) for x in s[1] if x not in stpwrds], s[2], s[3]))
                         .filter(lambda x: len(x[1]) >= 20 or (x[2] == u'Post' and len(x[1])>0))
                         .cache())
    #print '####took %d seconds' % (timer() - start_i)

    #print '---Computing TF-IDF---'
    #start_i = timer()
    wordsData = corpusRDD.map(lambda s: Row(label=int(s[0]), words=s[1], type=s[2]))
    wordsDataDF = sqlContext.createDataFrame(wordsData).unionAll(sqlContext.read.parquet("/home/ubuntu/recsys-tcc-ml/parquet/wordsDataDF.parquet"))


    numTokens = len(tokens)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=numTokens)
    idf = IDF(inputCol="rawFeatures", outputCol="features")

    featurizedData = hashingTF.transform(wordsDataDF)

    idfModel = idf.fit(featurizedData)
    tfIDF = idfModel.transform(featurizedData).cache()

    postTFIDF = (tfIDF
                    .filter(tfIDF.type==u'Post')
                    #.map(lambda s: Row(label=s[0], type=s[1], words=s[2], rawFeatures=s[3], features=s[4], sentiment=SVM.predict(s[4])))
                    .cache())

    #postTFIDF = postTFIDF.filter(lambda p: p.sentiment == 1)
    #print '####took %d seconds' % (timer() - start_i)

    #print '---Loading model---'
    #start_i = timer()
    NB = NaiveBayesModel.load(sc, '/home/ubuntu/recsys-tcc-ml/models/naivebayes/modelo_categoria')
    SVM = SVMModel.load(sc, "/home/ubuntu/recsys-tcc-ml/models/svm")
    #print '####took %d seconds' % (timer() - start_i)

    #print '---Using the model---'
    #start_i = timer()
    predictions = (postTFIDF
                        .map(lambda p: (NB.predict(p.features), p[0], SVM.predict(p.features)))
                        .filter(lambda p: p[2]==1)
                        .map(lambda p: (p[0], p[1]))
                        .groupByKey()
                        .mapValues(list)
                        .collect())

    #print '####took %d seconds' % (timer() - start_i)
    #print '---Computing similarities---'
    #start_i = timer()
    suggestions = []

    for prediction in predictions:
        category_to_use = category[int(prediction[0])]
        #print ' Computing similarities for category: {}'.format(category_to_use)
        tf = tfIDF.filter(tfIDF.type==category_to_use).cache()
        for post in prediction[1]:
            postVector = postTFIDF.filter(postTFIDF.label == post).map(lambda x: x.features).collect()[0]
            sim = (tf
                    .map(lambda x: (post, x.label, cossine(x.features, postVector)))
                    .filter(lambda x: x[2]>=threshold)
                    .collect())
            if len(sim) > 0:
                suggestions.append(sim)

    #print '####took %d seconds' % (timer() - start_i)

    if len(suggestions) > 0:
        #print '---Inserting recommendations into MongoDB---'
        #start_i = timer()
        insertSuggestions(suggestions, iduser, posts)
Example #42
from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint


def parseLine(line):
    parts = line.split(',')
    label = float(parts[0])
    features = Vectors.dense([float(x) for x in parts[1].split(' ')])
    return LabeledPoint(label, features)


data = sc.textFile('data/mllib/sample_naive_bayes_data.txt').map(parseLine)

# Split data approximately into training (60%) and test (40%)
training, test = data.randomSplit([0.6, 0.4], seed=0)

# Train a naive Bayes model.
model = NaiveBayes.train(training, 1.0)

# Make prediction and test accuracy.
predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label))
accuracy = 1.0 * predictionAndLabel.filter(
    lambda (x, v): x == v).count() / test.count()

# Save and load model
model.save(sc, "myModelPath")
sameModel = NaiveBayesModel.load(sc, "myModelPath")