Example No. 1
def main():
    # Initialize the SparkContext
    sc = spark_context(spark_master)

    # Read the input file
    data = sc.textFile(hdfs_path)

    # Tokenize
    documents = data.map(tokenize)
    documents.cache()

    # TF
    hashingTF = HashingTF()
    tf = hashingTF.transform(documents)

    # IDF
    idf = IDF(minDocFreq=2).fit(tf)
    
    # TFIDF
    tfidf = idf.transform(tf)

    # Connect to MongoDB
    from pymongo import MongoClient
    mongo_client = MongoClient(mongo_host)
    mongo_client.admin.authenticate(mongo_user, mongo_pass, mechanism='SCRAM-SHA-1')
    clear_mongodb(mongo_client)

    # Zip each document's tokens with its TF-IDF vector and aggregate term weights
    term_tfidf = documents.zip(tfidf).map(doc_tfidf)
    articles = term_tfidf.flatMap(lambda i: i).reduceByKey(lambda x, y: x + y)
    for article in articles.collect():
        item = {}
        item['text'] = article[0].encode('utf-8')
        item['size'] = int(article[1] * 10)
        send_mongodb(mongo_client, item)
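The helpers used above (clear_mongodb and send_mongodb) are not shown in this listing. A minimal sketch of what they might look like with pymongo follows; the database and collection names are assumptions, not taken from the original project.

# Hypothetical sketch of the MongoDB helpers referenced above; names are assumed.
def clear_mongodb(mongo_client, db_name='tfidf', coll_name='terms'):
    # Drop any previous results so each run starts from an empty collection.
    mongo_client[db_name][coll_name].drop()

def send_mongodb(mongo_client, item, db_name='tfidf', coll_name='terms'):
    # Store one {'text': ..., 'size': ...} document per aggregated term.
    mongo_client[db_name][coll_name].insert_one(item)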
def get_feature_vectors(sc, input_file, feature_dimensions):
    """Get feature vector from the lines in input_file_obj using
    TF/IDF.

    Returns:
        vectors RDD

    """

    # Load documents (one per line).
    tweet_file = sc.textFile(input_file)
    input_text_rdd = tweet_file.map(lambda line: _tokenize(line))
    input_text_rdd.cache()

    # The default feature dimension is 2^20; for a corpus with million
    # tweets recommended dimensions are 50000 or 100000. Use higher
    # dimensions for larger corpus of tweets.
    hashing_tf = HashingTF(feature_dimensions)
    tf = hashing_tf.transform(input_text_rdd)
    tf.cache()
    idf = IDF(minDocFreq=2).fit(tf)
    tfidf = idf.transform(tf)
    tfidf.cache()

    return input_text_rdd, tfidf
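A minimal usage sketch for get_feature_vectors follows; the master URL, input path and dimension are made-up values, and _tokenize is assumed to be defined elsewhere in the same module.

from pyspark import SparkContext

sc = SparkContext('local[2]', 'tfidf-demo')           # assumed local master
tokens, vectors = get_feature_vectors(sc, 'hdfs:///tmp/tweets.txt', 50000)
print(vectors.take(1))                                # one SparseVector per input line
sc.stop()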
Example No. 3
File: ml.py Project: aditcoding/zfs
def main(sc):

    stopset = set(stopwords.words('english'))

    tweets = sc.textFile('hdfs:/adi/sample.txt')
    words = tweets.map(lambda tweet: tweet.split(" "))
    wordArr = []
    for wArr in words.collect():
        tempArr = []
        for w in wArr:
            if w not in stopset:
                tempArr.append(w)
        wordArr.append(tempArr)
    # print(wordArr)
    # tokens = sc.textFile("hdfs:/adi/tokens1.txt")

    # Load documents (one per line).
    documents = sc.textFile("hdfs:/adi/tokens1.txt").map(lambda line: line.split(" "))
    numDims = 100000
    hashingTF = HashingTF(numDims)
    tf = hashingTF.transform(documents)
    tf.cache()
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf)
    tfidf.count()
    model = KMeans.train(tfidf, 5)
    model.save(sc,"tweetModel1")
    print("Final centers: " + str(model.clusterCenters))
#    print("Total Cost: " + str(model.computeCost(data)))
    sc.stop()
Example No. 4
    def tfidf(self):
        self._create_rdd()
        hashingTF = HashingTF()
        tf = hashingTF.transform(self.token_rdd)
        idf = IDF(minDocFreq=2).fit(tf)
        tfidf = idf.transform(tf)
        return tfidf
def get_tfidf_features(txt):
    hashingTF = HashingTF()
    tf = hashingTF.transform(txt)
    tf.cache()
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf)

    return tfidf
def tfidf(rdd_doc):
    hashingTF = HashingTF()
    trainTf = hashingTF.transform(rdd_doc)
    trainTf.cache()
    idf = IDF().fit(trainTf)
    trainTfidf = idf.transform(trainTf)
    trainTfidf.cache()
    return trainTfidf, lambda x: hashingTF.indexOf(x)
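The second return value above wraps hashingTF.indexOf, so a term's hash bucket (and hence its weight in any of the returned vectors) can be looked up later. A hedged usage sketch, with made-up documents:

# Illustrative only: the documents and the probed term are invented here.
docs = sc.parallelize([["spark", "tfidf", "demo"], ["spark", "hashing"]])
weights, index_of = tfidf(docs)
bucket = index_of("spark")                  # hash bucket for the term
print(weights.first()[bucket])              # weight of that bucket in the first document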
Example No. 7
    def tfidf(self, tokenizer):
        """
        Get TFIDF matrix rdd with spark tfidf functions
        """
        self._create_rdd(tokenizer)
        hashingTF = HashingTF()
        tf = hashingTF.transform(self.token_rdd)
        idf = IDF(minDocFreq=2).fit(tf)
        tfidf = idf.transform(tf)
        return self.rdd, idf, tfidf
Example No. 8
def tf_idf(sc,title_token):
    hashingTF = HashingTF(100)
    title_token = sc.parallelize(title_token)
    tf = hashingTF.transform(title_token)
    print tf, ' tf'
   
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf)
   
    return tfidf
Example No. 9
def tf_idf_cal(words_rdd):
	hashingTF = HashingTF()
	tf = hashingTF.transform(words_rdd)

	idf = IDF().fit(tf)
	
	tfidf = idf.transform(tf).cache()

	tfidf_str = tfidf.map(lambda line: str(line)).cache()

	return tfidf_str
Example No. 10
def use_naive_nayes():
    """
    Running the Naive Bayes from Spark's Mlib library
    """
    from pyspark.mllib.classification import NaiveBayes
    from pyspark.mllib.feature import HashingTF, IDF
    from pyspark.mllib.linalg import SparseVector, Vectors
    from pyspark.mllib.regression import LabeledPoint
    #loading the files
    path = "/Users/abhisheksingh29895/Desktop/courses/CURRENT/Advance_Machine_Learning/HW2/aclImdb/"
    train_pos = sc.textFile(path + "train/pos/*txt").map(lambda line: line.encode('utf8')).map(lambda line: line.split())
    train_neg = sc.textFile(path + "train/neg/*txt").map(lambda line: line.encode('utf8')).map(lambda line: line.split())
    test_pos = sc.textFile(path + "test/pos/*txt").map(lambda line: line.encode('utf8')).map(lambda line: line.split())
    test_neg = sc.textFile(path + "test/neg/*txt").map(lambda line: line.encode('utf8')).map(lambda line: line.split())
    #TF-IDF
    tr_pos = HashingTF().transform(train_pos)  ;  tr_pos_idf = IDF().fit(tr_pos)
    tr_neg = HashingTF().transform(train_neg)  ;  tr_neg_idf = IDF().fit(tr_neg)
    te_pos = HashingTF().transform(test_pos)  ;  te_pos_idf = IDF().fit(te_pos)
    te_neg = HashingTF().transform(test_neg)  ;  te_neg_idf = IDF().fit(te_neg)
    #IDF step
    tr_pos_tfidf = tr_pos_idf.transform(tr_pos)  ;  tr_neg_tfidf = tr_neg_idf.transform(tr_neg)
    te_pos_tfidf = te_pos_idf.transform(te_pos)  ;  te_neg_tfidf = te_neg_idf.transform(te_neg)
    #Creating labels
    pos_label = [1] * 12500  ;  pos_label = sc.parallelize(pos_label)
    neg_label = [0] * 12500  ;  neg_label = sc.parallelize(neg_label)
    # Combine using zip
    train_pos_file = pos_label.zip(tr_pos_tfidf).map(lambda x: LabeledPoint(x[0], x[1]))
    train_neg_file = neg_label.zip(tr_neg_tfidf).map(lambda x: LabeledPoint(x[0], x[1]))
    test_pos_file = pos_label.zip(te_pos_tfidf).map(lambda x: LabeledPoint(x[0], x[1]))
    test_neg_file = neg_label.zip(te_neg_tfidf).map(lambda x: LabeledPoint(x[0], x[1]))
    #Joining 2 RDDS to form the final training set
    train_file = train_pos_file.union(train_neg_file)
    test_file = test_pos_file.union(test_neg_file)
    # Fitting a Naive bayes model
    model = NaiveBayes.train(train_file)
    # Make prediction and test accuracy
    predictionAndLabel = test_file.map(lambda p: (model.predict(p[1]), p[0]))
    accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test_file.count()
    print ""
    print "Test accuracy is {}".format(round(accuracy,4))
Example No. 11
def mySpark(minFreq, keyWord):

    # text cleaning function
    def removePunctuation(text):
        res=text.lower().strip()
        res=re.sub("[^0-9a-zA-Z ]", "", res)
        return res.split(" ")

    # Function for printing each element in RDD
    def println(x):
        for i in x:
            print i

    # Boilerplate Spark stuff:
    conf = SparkConf().setMaster("local").setAppName("SparkTFIDF")
    sc = SparkContext(conf = conf)

    # Load documents content (one per line) + cleaning.
    rawData = sc.textFile("list_berita-30.tsv")
    fields = rawData.map(lambda x: x.split("\t"))
    documents = fields.map(lambda x: removePunctuation(x[3]))

    # Get documents content without word mapping
    documentNames = fields.map(lambda x: x[3])

    # TF processing
    hashingTF = HashingTF(100000)  #100K hash buckets just to save some memory
    tf = hashingTF.transform(documents)

    # IDF & TF-IDF processing
    tf.cache()
    idf = IDF(minDocFreq=int(minFreq)).fit(tf)
    tfidf = idf.transform(tf)

    # Get keyword relevance with content and zip it
    keywordTF = hashingTF.transform(removePunctuation(keyWord))
    keywordHashValue = int(keywordTF.indices[0])
    keywordRelevance = tfidf.map(lambda x: x[keywordHashValue])
    zippedResults = keywordRelevance.zip(documentNames)

    # print result
    print "Best document for keywords is:"
    print zippedResults.max()
Example No. 12
def run_tf_idf_spark_mllib(df, numFeatures=1 << 20):
    tokenizer = Tokenizer(inputCol="body", outputCol="words")
    wordsData = tokenizer.transform(df)

    words = wordsData.select("words").rdd.map(lambda x: x.words)

    hashingTF = MllibHashingTF(numFeatures)
    tf = hashingTF.transform(words)
    tf.cache()

    idf = MllibIDF().fit(tf)
    tfidf = idf.transform(tf)

    # @TODO make this nicer
    tmp = sqlContext.createDataFrame(wordsData.rdd.zip(tfidf), ["data", "features"])
    tmp.registerTempTable("tmp")
    old_columns = ', '.join(map(lambda x: 'data.%s' % x, wordsData.columns))
    with_features = sqlContext.sql("SELECT %s, features FROM tmp" % old_columns)
    tmp = sqlContext.createDataFrame(with_features.rdd.zip(tf), ["data", "rawFeatures"])
    tmp.registerTempTable("tmp")
    old_columns = ', '.join(map(lambda x: 'data.%s' % x, with_features.columns))
    return sqlContext.sql("SELECT %s, rawFeatures FROM tmp" % old_columns)
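The zip/temp-table round trip flagged by the @TODO can be avoided with the DataFrame-based pyspark.ml transformers. A hedged alternative sketch (not the original author's code), reusing the same column names:

from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, HashingTF, IDF

def run_tf_idf_spark_ml(df, numFeatures=1 << 20):
    # Tokenize "body", hash the tokens, then rescale by IDF, all as DataFrame columns.
    pipeline = Pipeline(stages=[
        Tokenizer(inputCol="body", outputCol="words"),
        HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=numFeatures),
        IDF(inputCol="rawFeatures", outputCol="features"),
    ])
    return pipeline.fit(df).transform(df)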
Example No. 13
def main(sc):

    data = sc.textFile('data/train.txt').map(parseLine)
    #print(data.take(10))

    # Train/Test split
    training, test = data.randomSplit([0.7, 0.3], seed=0)

    # TF-IDF
    # TF
    # Features will be hashed to indexes
    # And the feature(term) frequencies will be calculated
    hashingTF = HashingTF()
    # For each training example
    tf_training = training.map(lambda tup: hashingTF.transform(tup[1]))
    # IDF
    # Compute the IDF vector
    idf_training = IDF().fit(tf_training)
    # Scale the TF by IDF
    tfidf_training = idf_training.transform(tf_training)

    # (SparseVector(1048576, {110670: 1.5533, ...), 0)
    tfidf_idx = tfidf_training.zipWithIndex()
    # (['The', 'Da', 'Vinci', 'Code', 'book', 'is', 'just', 'awesome.'], 0)
    training_idx = training.zipWithIndex()

    # Reverse the index and the SparseVector
    idx_training = training_idx.map(lambda line: (line[1], line[0]))
    idx_tfidf = tfidf_idx.map(lambda l: (l[1], l[0]))
    #print(idx_training.take(10))

    # rdd.join: (K,V).join(K,W) -> (K, (V,W))
    # idx_tfidf has no info about labels (0/1)
    # but idx_training has
    joined_tfidf_training = idx_training.join(idx_tfidf)
    training_labeled = joined_tfidf_training.map(lambda tup: tup[1])
    training_labeled = training_labeled.map(
        lambda x: LabeledPoint(x[0][0], x[1]))
    #print(training_labeled.take(10))

    # Train a naive Bayes model
    model = NaiveBayes.train(training_labeled, 1.0)

    # Test the model
    tf_test = test.map(lambda tup: hashingTF.transform(tup[1]))
    idf_test = IDF().fit(tf_test)

    tfidf_test = idf_test.transform(tf_test)
    tfidf_idx = tfidf_test.zipWithIndex()
    test_idx = test.zipWithIndex()
    idx_test = test_idx.map(lambda line: (line[1], line[0]))
    idx_tfidf = tfidf_idx.map(lambda l: (l[1], l[0]))
    joined_tfidf_test = idx_test.join(idx_tfidf)

    test_labeled = joined_tfidf_test.map(lambda tup: tup[1])
    labeled_test_data = test_labeled.map(lambda k: LabeledPoint(k[0][0], k[1]))
    #print(labeled_test_data.take(2))
    # Apply the trained model on Test data
    predictionAndLabel = labeled_test_data.map(
        lambda p: (model.predict(p.features), p.label))
    #print(predictionAndLabel.take(10))

    # Calculate the accuracy
    accuracy = 1.0 * predictionAndLabel.filter(
        lambda x: x[0] == x[1]).count() / labeled_test_data.count()

    print('>>> Accuracy')
    print(accuracy)

    #model.save(sc, '/model')
    output = open('src/model/model.ml', 'wb')
    pickle.dump(model, output)
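Note that the test split above gets its own IDF model (IDF().fit(tf_test)). A common variant, shown here as a hedged sketch reusing the names already defined in this example, reuses the training IDF so test vectors are weighted on the same scale:

# Sketch of an alternative to the IDF().fit(tf_test) step above.
tf_test = test.map(lambda tup: hashingTF.transform(tup[1]))
tfidf_test = idf_training.transform(tf_test)   # reuse the training-set IDF weights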
# In[106]:

documents1 = sc.textFile(fileName).map(lambda line: line.split(" "))
hashingTF = HashingTF(50000)
tf = hashingTF.transform(documents1)

# In[107]:

tf.cache()
if feature == "W":
    print(feature)
    tfidf = tf.map(lambda x: x)
else:
    idf = IDF(minDocFreq=1).fit(tf)
    tfidf = idf.transform(tf)

# In[136]:

# a = tfidf.collect()
# b = tfidf
# b.collect()
# PythonRDD[12] at RDD at PythonRDD.scala:52

# In[109]:

documentModel = documents1.zip(tfidf)
random.seed(20181031)

# In[110]:
Example No. 15
dataRDD = sc.parallelize(data, numSlices=16)
lists = dataRDD.map(doc2words)
#lists=dataRDD.map(doc2words).collect()

# build the vocabulary
all_words = []
for l in lists.collect():
    all_words.extend(l)
vocab = set(all_words)

# TF-IDF
hashingTF = HashingTF(numFeatures=len(vocab))
tf = hashingTF.transform(lists)
tf.cache()
idf = IDF().fit(tf)
tfidf = idf.transform(tf).collect()
#data2 =tfidf.collect()

# a = lists.collect()
# b = tfidf.collect()
# print "type tfidf {} len {}".format(type(b),len(b))
# c=b[0]
# print "type c {} len {}".format(type(c),len(c))

## cross-validaton/Grid-search:
cv = ShuffleSplit(len(tfidf), n_iter=3, test_size=0.3, random_state=42)

nb = NaiveBayes()
lr = LogisticRegressionWithLBFGS()
svm = SVMWithSGD()
models = [lr, nb, svm]
Example No. 16
            if(flag=="n"):
                words.append(word)
        data.append(list(words))

    data.remove("")
    documents = sc.parallelize(data)
    def hashing(x):
        return hashingTF.transform([x]).indices[0]
    hashed = documents.flatMap(lambda line: line).map(lambda word:(hashing(word), word)).distinct()
    hashed_word = pd.DataFrame(hashed.collect(), columns=['hash','word']).set_index('hash')
    # hashingTF = HashingTF()
    # Generate TF-IDF
    tf = hashingTF.transform(documents)
    tf.cache()
    idf = IDF().fit(tf)
    tf_idf_data = idf.transform(tf)
    print dt.now().strftime('%Y/%m/%d %H:%M:%S')
    K = 5


    # Index documents with unique IDs
    corpus_data = tf_idf_data.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()
    print corpus_data
    # Cluster the documents into K topics using LDA
    ldaModel = LDA.train(corpus_data, k=K)

    # Output topics. Each is a distribution over words (matching word count vectors)
    print "Learned topics (as distributions over vocab of " + str(ldaModel.vocabSize()) + " words):"
    topics = ldaModel.topicsMatrix()
    print dt.now().strftime('%Y/%m/%d %H:%M:%S')
    def idx_to_word(idx):
Example No. 17
data_key = data.map(
    lambda x: (x.item_id, x.label,
               creat_vector(x.keywords.split('|'), keywords_dict, num_words)))

#2)item words tf
data = hiveContext.sql('select * from wl_service.t_lt_train_item_words_v2')
#words to vector
tf = HashingTF()
data_tf = data.map(lambda x:
                   (x.item_id, x.label, tf.transform(x.words.split('_'))))

#3)item words tf_idf
features = data_tf.map(lambda x: x[-1])
idf = IDF().fit(features)
idf_FT = idf.transform(features)
#item_id,label,feature
data_tfidf = data_tf.map(lambda x: (x[0], x[1])).zip(idf_FT).map(
    lambda x: (x[0][0], x[0][1], x[1]))

#4) item info (price and favor features need to be standardized)
data_item = hiveContext.sql('select * from wl_service.t_lt_trian_item_info_v3')
data_item = data_item.na.fill(0)
data_item_ft = data_item.map(lambda x: (x.item_id, x.label, x[1:11]))

#5) shop info (price/favor/credit features need to be standardized)
data_shop = hiveContext.sql(
    'select * from wl_service.t_lt_trian_item_shop_info_v4')
data_shop = data_shop.na.fill(0)
data_shop_ft = data_shop.map(lambda x: (x.item_id, x.label, x[1:23]))
Example No. 18
nonStopWords2t=nonStopWordst.map(lambda t:' '.join([stemmer.stem(word) for word in t.split(" ") if len(word)>=2 ]))
tokenCountst=nonStopWords2t.map(lambda t:list(set((word,t.count(word)) for word in t.split(" "))))
manyTokenst = tokenCountst.map(lambda l: [mytuple for mytuple in l if mytuple[1] >= 2])
rareTokenst= manyTokenst.flatMap(lambda l: l)
rareTokens1t=rareTokenst.reduceByKey(lambda a,b:a+b).filter(lambda t:t[1]<=1).map(lambda t:t[0])
raresett= set(rareTokens1t.collect())
manytokens_finalt = manyTokenst.map(lambda l: [mytuple for mytuple in l if mytuple[0] not in raresett])


dim = int(math.pow(2, 16))
hashingTF = HashingTF(dim)
tokenst = manytokens_finalt.map(lambda l:[k for (k,v) in l])
tft = hashingTF.transform(tokenst)
tft.cache()

#print(tf.take(1))

#idft = IDF().fit(tft)  -- above all, do NOT fit a new idft here!
tfidft_supervised = idf.transform(tft) #use the training set IDF to transform the test data, as this creates a more realistic estimation of model performance on new data
#p272
#print(tfidft.take(1))
idft = IDF().fit(tft)
tfidft_unsupervised=idft.transform(tft)



Example No. 19
labels = training_raw.map(
    lambda doc: doc["label"],  # Standard Python dict access
    preservesPartitioning=True # This is obsolete.
)


# While applying HashingTF only needs a single pass to the data, applying IDF needs two passes:
# First to compute the IDF vector and second to scale the term frequencies by IDF.
tf = HashingTF(numFeatures=numfeatures).transform( ## Use much larger number in practice
    training_raw.map(lambda doc: doc["text"].split(),
    preservesPartitioning=True))

tf.cache()
idf = IDF().fit(tf)
tfidf = idf.transform(tf)

# Combine using zip
training = labels.zip(tfidf).map(lambda x: LabeledPoint(x[0], x[1]))

# TEST DATA
testlabel = testlabels.map(lambda line: float(line))
t = reviewdata1.collect()
l = testlabel.collect()
testdata = [{"text":t[i],"label":l[i]} for i in range(len(l))]

test_raw = sc.parallelize(testdata)

testlabels = test_raw.map(
    lambda doc: doc["label"],  # Standard Python dict access
    preservesPartitioning=True # This is obsolete.
)
sc = SparkContext('local', conf=conf)

tweetData = sc.textFile("data/tweets_formatted_data.csv")
tweetData.take(2)
fields = tweetData.map(lambda x: x.split(","))
fields.take(1)
documents = fields.map(lambda x: x[1].lower().split(" "))

documents.take(1)
documentNames = fields.map(lambda x: x[0])
hashingTF = HashingTF(100000)
article_hash_value = hashingTF.transform(documents)
article_hash_value.cache()

idf = IDF().fit(article_hash_value)
tfidf = idf.transform(article_hash_value)

xformedData = tweetData.zip(tfidf)
xformedData.cache()
xformedData.collect()[0]

from pyspark.mllib.regression import LabeledPoint


def convertToLabeledPoint(inVal):
    origAttr = inVal[0].split(",")
    sentiment = 0.0 if origAttr[0] == "feedback" else 1.0
    return LabeledPoint(sentiment, inVal[1])


tweetLp = xformedData.map(convertToLabeledPoint)
Example No. 21
	def tfIdf_cluster(self, content, title, date, tfidf):
		tfidf_list = content
		inputRDD = sc.parallelize(tfidf_list)
		hashingTF = HashingTF(2 ** 20)
		trainTf = hashingTF.transform(inputRDD)
		idf = IDF().fit(trainTf)
		trainTfidf = idf.transform(trainTf)
		km = KMeans.train(trainTfidf, 2, maxIterations=100, runs=10)  # train a new model

		result = km.predict(trainTfidf)
		k_data = array(result.collect())
		
		grp1_news = []
		grp2_news = []
		
		# Store the fetched news as a list of key/value dicts [{}, {}] so the front end can use them easily
		# i = 0 
		for idx, grp in enumerate(k_data):
		
			if grp == 0:
				news =  {
					'title':title[idx],
					'date':date[idx],
					'content':''.join(content[idx].split()),
					'tfidf':tfidf[idx],
				}
				grp1_news.append(news)

			
			if grp == 1:
				news =  {
					'title':title[idx],
					'date':date[idx],
					'content':''.join(content[idx].split()),
					'tfidf':tfidf[idx],
				}
				grp2_news.append(news)
				
		# Start collecting TF-IDF term counts for each news cluster ------------------------------------
		tfidf_word_grp1 = []         # holds TF-IDF terms and their counts
		all_tfidf_grp1 = []          # holds all TF-IDF terms
		for post in grp1_news:
			tfidf = post['tfidf']
			for i in tfidf:
				all_tfidf_grp1.append(i)
		tfidf_dic1 = {}
		for ele in all_tfidf_grp1: # n
			if not ele in tfidf_dic1:
				tfidf_dic1[ele] = 1
			else:
				tfidf_dic1[ele] = tfidf_dic1[ele] + 1
		for i in range(0,len(tfidf_dic1)):
			data = {
				"text":tfidf_dic1.keys()[i],
				"size":(tfidf_dic1.values()[i])*1.5,
			}
			tfidf_word_grp1.append(data)
		
		tfidf_word_grp1.sort(key=lambda d:d['size'],reverse=True)   # sort the terms by size
		
		tfidf_word_grp1 = tfidf_word_grp1[0:50]
		tfidf_word_grp1 = json.dumps(tfidf_word_grp1)
		
		#---------------------------------------------------------------------------------------------
		tfidf_word_grp2 = []         # holds TF-IDF terms and their counts
		all_tfidf_grp2 = []          # holds all TF-IDF terms
		for post in grp2_news:
			tfidf = post['tfidf']
			for i in tfidf:
				all_tfidf_grp2.append(i)
		tfidf_dic2 = {}
		for ele in all_tfidf_grp2: # n
			if not ele in tfidf_dic2:
				tfidf_dic2[ele] = 1
			else:
				tfidf_dic2[ele] = tfidf_dic2[ele] + 1
		for i in range(0,len(tfidf_dic2)):
			data = {
				"text":tfidf_dic2.keys()[i],
				"size":(tfidf_dic2.values()[i])*1.5,
			}
			tfidf_word_grp2.append(data)
		tfidf_word_grp2.sort(key=lambda d:d['size'],reverse=True)   # sort the terms by size
		tfidf_word_grp2 = tfidf_word_grp2[0:50]
		tfidf_word_grp2 = json.dumps(tfidf_word_grp2)	
		
		# Finished collecting TF-IDF term counts for each news cluster ------------------------------------
			
		return grp1_news,grp2_news,tfidf_word_grp1,tfidf_word_grp2
    negRateDocument.repartition(1)
    posRateDocument = sc.parallelize(fiveRateDocument.take(negRateDocument.count())).repartition(1)
    allRateDocument = negRateDocument.union(posRateDocument)
    allRateDocument.repartition(1)
    rate = allRateDocument.map(lambda s: s[0])
    document = allRateDocument.map(lambda s: s[1])

    # Tokenize (Chinese word segmentation with jieba)
    words = document.map(lambda w: "/".join(jieba.cut_for_search(w))).map(lambda line: line.split("/"))

    # Build the term-frequency matrix
    hashingTF = HashingTF()
    tf = hashingTF.transform(words)
    tf.cache()

    # Compute the TF-IDF matrix
    idfModel = IDF().fit(tf)
    tfidf = idfModel.transform(tf)

    # Create the training and test sets
    zipped = rate.zip(tfidf)
    data = zipped.map(lambda line: LabeledPoint(line[0], line[1]))
    training, test = data.randomSplit([0.6, 0.4], seed=0)

    # Train a Naive Bayes classifier
    NBmodel = NaiveBayes.train(training, 1.0)
    predictionAndLabel = test.map(lambda p: (NBmodel.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(lambda x: 1.0 if x[0] == x[1] else 0.0).count() / test.count()

    print accuracy
    # when set to '1', 'pca_mode' enables projecting the data onto principal components
    pca_mode = sc.broadcast(0)
    low_dim = 2
    feature_dim = 4096  # 1048576
    k = feature_dim

    # LOADING AND COMPUTING TF's TRAINING MODEL
    print('Loading TRAINING_TF_MODEL...')
    tf_training = sc.pickleFile(os.getcwd() + '/model/TF/TF_MODEL_' +
                                str(feature_dim))
    print('done!')

    print('Computing TF-IDF MODEL...')
    idf_training = IDF(minDocFreq=5).fit(tf_training)
    tfidf_training = idf_training.transform(tf_training)
    print('done!')

    # APPLYING PCA ON TRAINING DATA
    if pca_mode.value == 1:
        print('Applying PCA on training data...')
        PCA_model = PCA(low_dim).fit(tfidf_training)
        tfidf_training = PCA_model.transform(tfidf_training)
        k = low_dim

    # pcArray = model.transform(tfidf_training.first()).toArray()

    #setting checkpoint
    # ssc.checkpoint("/Users/davidenardone/Desktop/checkpoint")

    # CREATING DStream FROM TRAINING'S RDD
Example No. 24
tweets = sc.textFile("/Users/anshulrastogi/Downloads/nlp/twits.txt")
tweets = tweets.map(lambda x: re.sub(r"(@|#)(\w+)", '', x))
tweets = tweets.map(lambda x: x.split(','))
plain_txt = tweets.map(lambda x: (x[0], x[1].encode('utf-8').translate(
    string.maketrans("", ""), string.punctuation)))
plain = plain_txt.map(lambda x: (x[0], x[1].translate(string.maketrans("", ""),
                                                      '0123456789').lower()))
labels = plain.map(lambda x: float(x[0])).collect()
tokens = plain.map(lambda x: x[1].split())

hashingTF = HashingTF()
tf = hashingTF.transform(tokens)
tf.cache()
idf = IDF().fit(tf)
tfidf = idf.transform(tf)
labeledData = tfidf.zipWithIndex().map(lambda
                                       (x, y): LabeledPoint(labels[y], x))

model = NaiveBayes.train(labeledData)

for tweet in feed:
    tweet_text = tweet['new_val']['text']
    message = re.sub(r"(@|#)(\w+)", '', tweet_text)
    message = message.encode('utf-8').translate(string.maketrans("", ""),
                                                string.punctuation)
    message = message.translate(string.maketrans("", ""), '0123456789').lower()

    tf_new = hashingTF.transform(message.split(" "))
    tweet['new_val']['sentiment'] = model.predict(idf.transform(tf_new))
    rdb.db("sentiment").table("classified_messages").insert(tweet).run()
Example No. 25
    time = parts[1]
    features = parts[3].split(' ')
    return (time, features)

#read the testing dataset
data2 = sc.textFile("/Users/macho/Desktop/data2.txt").map(parseLineTest)
time = data2.map(lambda doc: doc[0], preservesPartitioning=True)
tw = data2.map(lambda doc: doc[1],preservesPartitioning=True)
#read the training dataset
data1 = sc.textFile('/Users/macho/Desktop/data1.txt').map(parseLine)
#split training dataset into labels and text
labels = data1.map(lambda doc: doc[0], preservesPartitioning=True)
text = data1.map(lambda doc: doc[1], preservesPartitioning=True)
#combine training and testing dataset together
alltext = text.union(tw)
#calculate TF-IDF
Hash = HashingTF()
tf = Hash.transform(alltext)
tf1 = Hash.transform(text)
tf2 = Hash.transform(tw)
idf = IDF().fit(tf)
tfidf1 = idf.transform(tf1)
tfidf2 = idf.transform(tf2)
#Use Naive Bayes to classify training dataset
training = labels.zip(tfidf1).map(lambda x: LabeledPoint(x[0], x[1]))
model = NaiveBayes.train(training)
#Predict the labels of testing dataset
pred = time.zip(model.predict(tfidf2)).map(lambda x: ('',x[0],x[1],''))
#save the result
pred.saveAsTextFile("/Users/macho/Desktop/out")
data = sc.textFile("training_test_data.txt").map(parseLine)
'''
Split data into labels and features, transform
preservesPartitioning is not really required
since map without a partitioner shouldn't trigger repartitioning
'''
# Extract all the "labels"
labels = data.map(lambda doc: doc[0]["label"], preservesPartitioning=True)

for x in labels.take(3):
    print x
# Perform TF-IDF
tf = HashingTF().transform(
    data.map(lambda doc: doc[0]["text"], preservesPartitioning=True))
idf = IDF().fit(tf)
tfidf = idf.transform(tf)

# Combine labels and tfidf and create LabeledPoint data
dataset = labels.zip(tfidf).map(lambda x: LabeledPoint(x[0], x[1]))

for x in dataset.take(3):
    print(x)
result = []
'''
Random split dataset - 60% as training data and 40% as testing.
Train and test the model 10 times. Then put the accuracy into result[]
'''
for i in range(0, 10):
    training, test = dataset.randomSplit([0.6, 0.4], seed=i)
    model = NaiveBayes.train(training, 1.0)
    predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label))
Example No. 27
def distributed_ops( corpus, sanit=False, recall=False, corpred=False, \
                     streams=False, segred=False, tfidf=False, lda=False, \
                     word2vec=False, fin=None, segclust=None):

    # Return item for end results
    return_list = []

    ##########################################

    # Default actions:
    if (segred):
        zipped_corpus = zip(segclust,corpus)
        #print zipped_corpus
    corpus = sc.parallelize(corpus).cache()

    if (sanit or recall):
        corpus = corpus.map(lambda doc: preprocess(doc))
        # Here we "recover all" text, after having removed multi-ws & ws-pad punctuation
        # & replace \n by NL etc... (see function "preprocess" above)
        # We use the same regex sub/filtration rules as in the implementation found
        # @ https://github.com/alexalemi/segmentation (from which we got files in
        # directory: representation.py, tools.py and splitters.py, and which
        # segmentSETxRes.py is based on)
        if (recall):
            return_list.append(recover_encoding(corpus.collect()))

    # Here we return only potentially "meaningful words" - see function "return_words" above
    # Keeps alpha-numeric (removes numeric and non-alphabetical/alphanumeric)
    corpus_distrib = corpus.map(lambda doc: return_words(doc))
    print 'Original number of docs in corpus {filtering *docs* for alpha(+alphanumeric)-only words}: %i'%corpus_distrib.count()
    
    # merge corpus docs into one continuous split text
    corpus_merge = []
    corpus_collect = corpus_distrib.collect() # rdd2list
    for list_of_words in corpus_collect:
        corpus_merge.extend(list_of_words) # list-of-wordslist2{single-wordslist}
    
    # use numpy functions to sort dict words based on term-frequency
    corpus_merge_array = np.array(corpus_merge)
    corpus_merge_sorted = np.sort(corpus_merge_array)
    corpus_merge_unique, counts = np.unique(corpus_merge_sorted,return_counts=True)
    sort_ixs = np.argsort(counts)[::-1]
    counts = counts[sort_ixs]
    corpus_merge_unique = corpus_merge_unique[sort_ixs]
    return_list.append(corpus_merge_unique)
    return_list.append(counts)
    print
    for i,w in enumerate(corpus_merge_unique):
        print ('Counted word "%s" _%i_ many times.'%(w,counts[i]))
    print

    #########################################################################################
    # Next we split the text based on "verbosity/density/sparsity" as would
    # befit an articulate document (i.e. articles/papers/journal entries)
    # or more conversational/blog-entry-like/Q&A style/headings-only-
    # -retrieved website results.
    def error(point):
        center = clusters.centers[clusters.predict(point)]
        return sqrt(sum([x**2 for x in (point-center)]))

    # The following will further sanitize text.
    if (corpred):
        # Use pretrained term frequencies:
        # Experimentally, the following clustering has helped us get rid of
        # irrelevant search engine text results.
        corpus2vec = corpus.map(lambda doc: genre_score(doc,type2=False))
        corpus2vec = corpus2vec.map(lambda doc: process_doc2vec_word_counts(doc)).cache()
        # print 'Corpus vectorized'
        # collected = corpus2vec.collect()
        tempor = corpus.collect()
        print
        print
        for i,vec in enumerate(corpus2vec.collect()):
            print 'Got vecs:'
            print vec
            print 'Of text:'
            print tempor[i].split()
            print
        print

        # choose 5 clusters
        clusters = KMeans.train(corpus2vec, 5, maxIterations=90, runs=10, initializationMode="k-means||")
        WSSE = corpus2vec.map(lambda point: error(point)).reduce(lambda x,y: x+y) # cumsum
        print
        print 'Within Set Sum of Squared Error = ' + str(WSSE)
        print 'The cluster centers:'
        print clusters.centers
        print
        print
        return_list.append(corpus2vec.map(lambda pt: clusters.predict(pt)).collect())

    # The following will cluster for article length + content
    if (streams):
        corpus2vec = corpus.map(lambda doc: genre_score(doc,type2=True))
        temple = corpus.collect()
        print
        print
        for i,vec in enumerate(corpus2vec.collect()):
            print 'Got vecs:'
            print vec
            print 'Of text:'
            print temple[i].split()
            print
        print
        sumall = corpus2vec.reduce(lambda vecx,vecy: np.array([vecx[0]+vecy[0]]))
        corpus2vec = corpus2vec.map(lambda doc: process_doc2vec_word_counts(doc,normalizer=sumall)).cache()
        #
        clusters = KMeans.train(corpus2vec, 5, maxIterations=90, runs=10, initializationMode="k-means||")
        WSSE = corpus2vec.map(lambda point: error(point)).reduce(lambda x,y: x+y) # cumsum
        print
        print 'Within Set Sum of Squared Error = ' + str(WSSE)
        print 'The cluster centers:'
        print clusters.centers
        print
        print
        return_list.append(corpus2vec.map(lambda pt: clusters.predict(pt)).collect())

    #########################################################################################

    # Here we want to remove documents from the corpus which do not contain
    # 'english' dictionary words at all, or words that can be word2vec transformed
    # and "synonimized".
    if (segred):
        corpus_english_prose = sc.parallelize(zipped_corpus).filter(lambda doc: check(doc))
        zipped_corpus = zip(*corpus_english_prose.collect())
        red_clusts = list(zipped_corpus[0])
        red_text = recover_encoding(list(zipped_corpus[1]))
        return_list.append(red_clusts)
        return_list.append(red_text)
        print 'Number of docs in corpus {filtering *corpus* for alpha(+alphanumeric)-only words}: %i'%corpus_english_prose.count()

        f1 = open(''.join([filename,'-document_clusters.txt']),'w')
        f1.write('\n'.join(map(str,red_clusts)))
        f1.close()
        f2 = open(''.join([filename,'-documents_sanitized.txt']),'w')
        f2.write('\n'.join(red_text))
        f2.close()
        f3 = open(''.join([filename,'-documents_dict.txt']),'w')
        f3.write('\n'.join(corpus_merge_unique))
        f3.close()

    #########################################################################################

    if (tfidf):
        # generate document term frequences
        htf = HashingTF()
        tf = htf.transform(corpus_distrib)
        # generate idf = log{ frac{#docs}{#docs w. term} }
        idf = IDF().fit(tf)
        # scale tf * idf
        tfidf = idf.transform(tf)
        # collect tfidf for future use
        doc_tfidf = tfidf.collect()
        # generate unique word : HashingTF hash dict
        corpus_dict_tfidf_t = {}
        # uniquifie merged corpus into terms
        #corpus_merge_unique = sorted(set(corpus_merge))
        # fill in unique word : HashingTF hash dict
        for word in corpus_merge_unique:
            idx = htf.indexOf(word)
            corpus_dict_tfidf_t[word] = idx
            # index not necessarily found in doc_tfidf.

        # no return item

    #########################################################################################

    if (lda):
        corpus_dict = {}
        for c,word in enumerate(corpus_merge_unique):
            corpus_dict[word]=counts[c]
        def return_freq_words(doc,corpus_dict):
            return [word for word in doc if word in corpus_dict if corpus_dict[word]>2]
        corpus_distrib_red = corpus_distrib.map(lambda doc: return_freq_words(doc,corpus_dict)).cache()
        gensim_corpora_id2word = corpora.Dictionary(corpus_distrib_red.collect())
        gensim_doc2bow_doctf = corpus_distrib_red.map(lambda doc: gensim_corpora_id2word.doc2bow(doc)).collect()
        f1 = open(''.join([filename,'-gensim_corpora_id2word.pkl']),'w')
        pickle.dump(gensim_corpora_id2word,f1)
        f1.close()
        f2 = open(''.join([filename,'-gensim_doc2bow_doctf.pkl']),'w')
        pickle.dump(gensim_doc2bow_doctf,f2)
        f2.close()
        f3 = open(''.join([filename,'-corpus.pkl']),'w')
        pickle.dump(corpus_distrib.collect(),f3)
        f3.close()

    if (word2vec):
        #
        def increase_tf(doc): # only words with freq >= 5 are vectorized
            ret_doc = []
            for i in xrange(5):  # <<<
                ret_doc.extend(doc)  # <<<
            return ret_doc
        #
        corpus_distrib_ext = corpus_distrib.map(lambda doc: increase_tf(doc))
        word_mbd = Word2Vec().setVectorSize(50).setSeed(42L).fit(corpus_distrib_ext)
        word2vec_dict = {}
        for i,w in enumerate(corpus_merge_unique):
            #print ('Counted word "%s" _%i_ many times.'%(w,counts[i]))
            word2vec_dict[w] = word_mbd.transform(w)
            try:
                print ('Top 5 embedding cosine similarity synonyms of word "%s":'%w)
                proximal_synonyms = word_mbd.findSynonyms(w,5)
                for s,cs in proximal_synonyms:
                    print ('  "%s" with score _%f_'%(s,cs))
            except:
                print 'No synonyms found (word not in dict).'
        print
        print 'Processing + Spark MLLib has given us %i word2vec vectors.'%len(word2vec_dict)
        return_list.append(word2vec_dict)
        f4 = open(''.join([filename,'-word2vec_dict.pkl']),'w')
        pickle.dump(word2vec_dict,f4)
        f4.close()

    if len(return_list)==1:
        return_list = return_list[0]
    return return_list
Example No. 28
def calculate_tfidf(documents):
    hashingTF = HashingTF()
    tf = hashingTF.transform(documents.map(lambda x: x[1]))
    tf.cache()
    idf = IDF().fit(tf)
    return idf.transform(tf)
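calculate_tfidf maps over x[1], so it expects pair records whose second field is the token list. A small hedged usage sketch with invented data:

docs = sc.parallelize([
    ("doc-1", ["spark", "tf", "idf"]),
    ("doc-2", ["hashing", "tf", "spark"]),
])
vectors = calculate_tfidf(docs)      # RDD of SparseVectors, one per document
print(vectors.take(2))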
Example No. 29
from pyspark import SparkContext
"""
The following implementation is based on a sample provided by the Apache Spark repository.
It uses the built-in method to calculate TF-IDF
https://github.com/apache/spark/blob/master/examples/src/main/python/mllib/tf_idf_example.py
"""
sc = SparkContext("local", "TF-IDF")

# Load documents (one per line).
documents = sc.textFile("hdfs://$NAME_NODE_IP:9000/input/reviews.txt").map(
    lambda line: line.split(" "))

hashingTF = HashingTF()
tf = hashingTF.transform(documents)

# While applying HashingTF only needs a single pass to the data, applying IDF needs two passes:
# First to compute the IDF vector and second to scale the term frequencies by IDF.
tf.cache()
idf = IDF().fit(tf)
tfidf = idf.transform(tf)

# spark.mllib's IDF implementation provides an option for ignoring terms
# which occur in less than a minimum number of documents.
# In such cases, the IDF for these terms is set to 0.
# This feature can be used by passing the minDocFreq value to the IDF constructor.
idfIgnore = IDF(minDocFreq=2).fit(tf)
tfidfIgnore = idfIgnore.transform(tf)

# save tf-idf
tfidfIgnore.saveAsTextFile('hdfs://$NAME_NODE_IP:9000/output/tfidf')
Example No. 30
    def process(reviews):
        if (reviews.isEmpty()):
            pass
        else:
            model_name = "svm"
            updated_model = "svm0"
            model_path, data_path, metadata_path = '', '', ''

            #loop to find the most recent available model classifier
            for i in range(60, -1, -1):
                model_path = "hdfs://VM10-1-0-14:9000/classifier/" + model_name + str(
                    i)
                updated_model = model_name + str(i)
                data_path = model_path + "/data/part-r*"
                metadata_path = model_path + "/metadata/part-00000"
                if (patherror(data_path) == False
                        and patherror(metadata_path) == False):
                    break

            #load model classifier
            model = SVMModel.load(sc, model_path)

            start = time.time()
            reviews_label = reviews.map(lambda x: 0.0 if x[0] > 3.0 else 1.0)

            Words = Row('label', 'words')
            words = reviews.map(lambda r: Words(*r))
            words_df = spark.createDataFrame(words)

            #review tokenization
            token = RegexTokenizer(minTokenLength=2,
                                   pattern="[^A-Za-z]+",
                                   inputCol="words",
                                   outputCol="token",
                                   toLowercase=True)
            token_filtered = token.transform(words_df)

            #stopwords elimination
            remover = StopWordsRemover(inputCol="token",
                                       outputCol="stopwords",
                                       caseSensitive=False)
            stopwords_filtered = remover.transform(token_filtered)

            prep_filtered = (
                stopwords_filtered.select('stopwords').rdd).map(lambda x: x[0])

            #tf-idf calculation
            tf = HashingTF(numFeatures=numFeatures).transform(
                prep_filtered.map(porter_stem, preservesPartitioning=True))
            idf = IDF().fit(tf)
            tfidf = idf.transform(tf)

            prediction = model.predict(tfidf)

            labeled_prediction = reviews_label.zip(prediction).map(
                lambda x: (float(x[1]), x[0]))

            metrics = MulticlassMetrics(labeled_prediction)

            output = reviews.zip(prediction)

            filename = "hdfs://VM10-1-0-14:9000/output/" + re.sub(
                '[^0-9]', '', str(datetime.now())) + ".out"
            output.saveAsTextFile(filename)

            end = time.time()
            print(updated_model, ';', reviews.count(), ';',
                  metrics.accuracy, ';', metrics.precision(0.0), ';',
                  metrics.precision(1.0), ';', metrics.recall(0.0), ';',
                  metrics.recall(1.0), ';', metrics.fMeasure(0.0), ';',
                  metrics.fMeasure(1.0), ';', (end - start))
Example No. 31
sc = SparkContext()
sc._jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", AWS_ACCESS_KEY_ID)
sc._jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey", 
	AWS_SECRET_ACCESS_KEY)

text_negative = sc.textFile("s3n://sent/train_neg.txt")
text_positive = sc.textFile("s3n://sent/train_pos.txt")

train_text = text_negative.union(text_positive)
train_labels = text_negative.map(lambda x: 0.0).union(
	text_positive.map(lambda x: 1.0))

tf = HashingTF().transform(train_text.map(parseline, 
	preservesPartitioning=True))
idf = IDF().fit(tf)
train_tfidf = idf.transform(tf)

training = train_labels.zip(train_tfidf).map(lambda x: LabeledPoint(x[0], 
	x[1]))

model = NaiveBayes.train(training)

# TESTING SET =================================================================

text_negative = sc.textFile("s3n://sent/test_neg.txt")
text_positive = sc.textFile("s3n://sent/test_pos.txt")

test_text = text_negative.union(text_positive)
test_tlabels = text_negative.map(lambda x: 0.0).union(
	text_positive.map(lambda x: 1.0))
Example No. 32
>>>corpus = parts.map(lambda row: Row(id=row[0], comment=row[1], label=row[2])) # 'class' is a reserved keyword in Python, so the field is named 'label'
#The parts is a list of fields as we have each field in the line delimited on “\t”.
#Let's break the corpus that has [ID, comment, label (0,1)] into different RDD objects:
>>>comment = corpus.map(lambda row: " " + row.comment)
>>>class_var = corpus.map(lambda row: row.label)
#Once we have the comments, we need to do a process very similar to what we did in Chapter 6, Text Classification, where we used scikit-learn for tokenization, hashing vectorization, and calculating TF, IDF, and TF-IDF.
#The following is the snippet of how to create tokenization, term frequency, and inverse document frequency:
>>>from pyspark.mllib.feature import HashingTF
>>>from pyspark.mllib.feature import IDF
# https://spark.apache.org/docs/1.3.0/mllib-feature-extraction.html 
>>>comment_tokenized = comment.map(lambda line: line.strip().split(" "))
>>>hashingTF = HashingTF(1000) # to select only 1000 features 
>>>comment_tf = hashingTF.transform(comment_tokenized)

>>>comment_idf = IDF().fit(comment_tf)
>>>comment_tfidf = comment_idf.transform(comment_tf)
#We will merge the labels with the TF-IDF RDD like this:
>>>finaldata = class_var.zip(comment_tfidf)
#We will do a typical train/test split
>>>train, test = finaldata.randomSplit([0.8, 0.2], seed=0)
#Let's perform the main classification commands, which are quite similar to scikit-learn. Here we use Naive Bayes, a widely used classifier. The pyspark.mllib module provides a variety of algorithms.
#For more information on pyspark.mllib visit https://spark.apache.org/docs/latest/api/python/pyspark.mllib.html

#The following is an example of a Naive Bayes classifier:
>>>from pyspark.mllib.regression import LabeledPoint
>>>from pyspark.mllib.classification import NaiveBayes
>>>train_rdd = train.map(lambda t: LabeledPoint(t[0], t[1]))
>>>test_rdd = test.map(lambda t: LabeledPoint(t[0], t[1]))
>>>nb = NaiveBayes.train(train_rdd, lambda_=1.0)
>>>nb_output = test_rdd.map(lambda point: (nb.predict(point.features), point.label))
>>>print nb_output
Example No. 33
from pyspark.mllib.util import MLUtils
#>>> examples = [LabeledPoint(1.1, Vectors.sparse(3, [(0, 1.23), (2, 4.56)])),                         LabeledPoint(0.0, Vectors.dense([1.01, 2.02, 3.03]))]
#>>> tempFile = NamedTemporaryFile(delete=True)
#>>> tempFile.close()
#>>> MLUtils.saveAsLibSVMFile(sc.parallelize(examples), tempFile.name)



from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.feature import HashingTF, IDF
from pyspark.mllib.tree import RandomForest
from pyspark import SparkContext

sc=SparkContext("local","dd")
train = sc.parallelize(open("/home/madhura/ML_Spring16/MLProject/data/OriginalTraining.txt").read().splitlines()).map(lambda x: x.split(","))
trainlabels = train.map(lambda(a,b): int(b))
traintf = HashingTF().transform(train.map(lambda(a,b): a.split()))
trainidf = IDF().fit(traintf)
traintfidf = trainidf.transform(traintf)
#densetrain = traintfidf.map(lambda x: pyspark.mllib.linalg.DenseVector(x.toArray()))
#zippeddata = trainlabels.zip(densetrain)
#new = zippeddata.map(lambda (a,vec) : (a,vec.toArray()))
training = trainlabels.zip(traintfidf).map(lambda x : LabeledPoint(x[0], x[1]))
MLUtils.saveAsLibSVMFile(training.coalesce(1),"/home/madhura/ML_Spring16/MLProject/data/libsvmfile")
data = MLUtils.loadLibSVMFile(sc, "/home/madhura/ML_Spring16/MLProject/data/libsvmfile/part-00000")
(trainingData, testData) = data.randomSplit([0.7, 0.3])
model = RandomForest.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
                                     numTrees=3, featureSubsetStrategy="auto",
                                     impurity='gini', maxDepth=4, maxBins=32)
model.save(sc, "/home/madhura/ML_Spring16/MLProject/SentimentAnalysis_NLTK_NB/src/myRandomForestClassificationModel")
# COMMAND ----------

#tags count
read_tags_1m_data.count()

# COMMAND ----------

# TFIDF of Documents
from pyspark.mllib.feature import HashingTF, IDF
hashingTF = HashingTF(features_length)
tf = hashingTF.transform(documents)

tf.cache()
idf = IDF().fit(tf)
tfidf = idf.transform(tf)


# COMMAND ----------

tfidf

# COMMAND ----------

#Documents after TFIDF
tfidf
tfidf.take(3)
mtif = tfidf.map(lambda x: [x])
Example No. 35
def main():
	"""
	Driver program for a spam filter using Spark and MLLib
	"""

	# Consolidate the individual email files into a single spam file
	# and a single ham file
	makeDataFileFromEmails( "data/spam_2/", "data/spam.txt")
	makeDataFileFromEmails( "data/easy_ham_2/", "data/ham.txt" )

	# Create the Spark Context for parallel processing
	sc = SparkContext( appName="Spam Filter")

	# Load the spam and ham data files into RDDs
	spam = sc.textFile( "data/spam.txt" )
	ham = sc.textFile( "data/ham.txt" )

	# Create a HashingTF instance to map email text to vectors of 10,000 features.
	tf = HashingTF(numFeatures = 10000)

	# Each email is split into words, and each word is mapped to one feature.
	spamtf = spam.map(lambda email: tf.transform(email.split(" ")))
	hamtf = ham.map(lambda email: tf.transform(email.split(" ")))

	spamtf.cache()
	hamtf.cache()

	spamidf = IDF().fit(spamtf)
	hamidf = IDF().fit(hamtf)
	
	spamFeatures = spamidf.transform(spamtf)
	hamFeatures = hamidf.transform(hamtf)
	
	# Create LabeledPoint datasets for positive (spam) and negative (ham) data points.
	positiveExamples = spamFeatures.map(lambda features: LabeledPoint(1, features))
	negativeExamples = hamFeatures.map(lambda features: LabeledPoint(0, features))

	# Combine positive and negative datasets into one
	data = positiveExamples.union(negativeExamples)

	# Split the data into 70% for training and 30% test data sets 
	( trainingData, testData ) = data.randomSplit( [0.7, 0.3] )

	# Cache the training data to optimize training
	trainingData.cache() 

	# Train the model with an SVM using the SGD algorithm.
	model = SVMWithSGD.train(trainingData, iterations=100)

	# Create tuples of actual and predicted values
	labels_and_predictions = testData.map( lambda email: (email.label, model.predict( email.features) ) )

	# Calculate the error rate as number wrong / total number
	error_rate = labels_and_predictions.filter( lambda (val, pred): val != pred ).count() / float(testData.count() )
	print( "*********** SPAM FILTER RESULTS **********" )
	print( "\n" )
	print( "Error Rate: " + str( error_rate ) )
	print( "\n" )

	# Serialize the model for persistence
	pickle.dump( model, open( "SpamSvm.pkl", "wb" ) )

	sc.stop()
                         vocabSize=vocabSize,
                         minDF=mindocFrequencies)

    cvmodel = cv.fit(wordsDataFrame)
    result = cvmodel.transform(wordsDataFrame).select("label", "features")
    vocablist = (cvmodel.vocabulary)
    #print (vocablist)
    #result.select("features").show(truncate=False)

    countVectors = result.select("features")

    frequencyVectors = countVectors.rdd.map(
        lambda vector: DenseVector(vector[0].toArray()))

    idf = IDF().fit(frequencyVectors)
    tfidf = idf.transform(frequencyVectors)

    frequencyVectors = frequencyVectors.zipWithIndex()

    #resultMod = result.rdd.map(lambda vector: DenseVector(vector[1].toArray()))

    # resultMod = resultMod.toDF()

    resultMod = sqlContext.createDataFrame(frequencyVectors,
                                           ["features", "documentId"])

    # prepare corpus for LDA
    corpus = tfidf.map(lambda x: [1, x]).cache()

    # train LDA
    # optimizer parameter "em" or "online"
Example No. 37
    def calculateSentiment(self,sc,query):
        model = NaiveBayesModel.load(sc,"finalproject/model/NaiveBayesModel")
        query = query
        print (query)
        twitDG = TwitterDataGenerator()
        twitDG.getData(query)
        inputFile = sc.textFile("finalproject/tweets.csv").distinct()
        input_id = inputFile.zipWithIndex().map(lambda l:(l[1],l[0]))
        preprocessedData = self.preProcess(inputFile)
        inputFileProcessed = self.processInputFile(inputFile)
        print("#################################################################################################")
        print(preprocessedData.take(5))
        print("--------------------------------------------------------------------------------------------------")
        print(inputFileProcessed.take(5))
        print("input file processed ",inputFileProcessed.count())
        print("preprocessed count",preprocessedData.count())
        hashingTF = HashingTF()
        tfData = preprocessedData.map(lambda tup: hashingTF.transform(tup))
        idfData = IDF().fit(tfData)
        tfidfData = idfData.transform(tfData)
        output = tfidfData.map(lambda rec: model.predict(rec))
        i_I=inputFileProcessed.map(lambda l: l[0]).zipWithIndex().map(lambda l:(l[1],l[0]))
        print("input file count",inputFile.count())
        print ("output file count",output.count())
        o_I=output.zipWithIndex().map(lambda l:(l[1],l[0]))
        i_o =i_I.join(o_I).map(lambda l:l[1])
        print(i_o.take(i_o.count()))
        print(i_o.count())
        outputJson = {}
        tweetList = []
        tweet = {}
        positiveCount =0
        negativeCount =0
        for i in i_o.take(i_o.count()):
            print(i)
                #print data,data1
            if i[1] == 0.0:
                negativeCount = negativeCount+1
                text = "This is a negative Tweet"
            elif i[1] == 1.0:
                positiveCount = positiveCount + 1
                text = "This is a positive Tweet"
                    #data = text
            #replace(u"\u2022", "*").encode("utf-8")
            if len(i[0]) > 4:
                tweet = {}
                tweet['value'] = i[0].encode("ascii","ignore")
                tweet['sentiment'] = text
                tweetList.append(tweet)
                print i[0].encode("ascii","ignore")
                print text
                print "-------------------------------------"

                #print unicode(str(data),"utf-8")
        print (positiveCount)
        print (negativeCount)
        outputJson["tweets"] = json.dumps(tweetList)
        outputJson["positiveTweetCount"] = positiveCount
        outputJson["negativeTweetCount"] = negativeCount
        wordflatMap = preprocessedData.flatMap(lambda xs: [x for x in xs]).map(lambda x:x.encode("ascii","ignore")).map(lambda x: (x, 1)).reduceByKey(add)
        wordFlatMap_reversed = wordflatMap.map(lambda l:(l[1],l[0])).filter(lambda l: (l[1]!="rt" and l[1]!=query))
        wordFlatMap_sorted = wordFlatMap_reversed.sortByKey(False)
        print (wordFlatMap_sorted.take(10))
        outputFrequencyList = {}
        mostFrequentWordList = []
        wordCount = {}
        words =[]
        counts = []
        for i in wordFlatMap_sorted.take(10):
            wordCount = {}
            wordCount['word'] = i[1]
            wordCount['count'] = i[0]
            mostFrequentWordList.append(wordCount)
        outputJson["frequency"] = json.dumps(mostFrequentWordList)
        return outputJson
Example No. 38
# Load documents (one per line).
documents = sc.textFile(sys.argv[1]).map(parseLine)  #rdd

label = documents.map(lambda x: x[1])
features = documents.map(lambda x: x[2])

labelSet = list(set(
    label.collect()))  # change RDD to set (only unique categories)
print "Category-Label mapping:", labelSet

hashingTF = HashingTF(5000)
tf = hashingTF.transform(features)

tf.cache()
idf = IDF(minDocFreq=5).fit(tf)
tfidf = idf.transform(tf).cache()

data = label.zip(tfidf).map(
    lambda x: LabeledPoint(labelSet.index(x[0]), x[1])).cache()
training = data.sample(False, .90)
test = data.sample(False, .10)
print "Num Points:", data.count()
# Build the model
model = LogisticRegressionWithLBFGS.train(training, numClasses=len(labelSet))

# test a few items
labelsAndPreds = test.map(
    lambda p: (labelSet[int(p.label)], p.label, model.predict(p.features)))
temp = labelsAndPreds.take(50)

for index in range(len(temp)):
Example No. 39
def tfidf(data):
    hashing = HashingTF()
    tf = hashing.transform(data)
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf)
    return tfidf
Example No. 40
def produce_tfidf(x):
    tf = HashingTF().transform(x)
    idf = IDF(minDocFreq=5).fit(tf)
    tfidf = idf.transform(tf)
    return tfidf
Example No. 41
def vectorize_feature(training):
    hashingTF = HashingTF()
    tf_training = training.map(lambda tup: hashingTF.transform(tup[1]))
    idf_training = IDF().fit(tf_training)
    tfidf_training = idf_training.transform(tf_training)
    return tfidf_training
Example No. 42
# Databricks notebook source exported at Thu, 23 Jun 2016 07:23:39 UTC
from pyspark import SparkConf,SparkContext
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.feature import IDF
rawData = sc.textFile("/FileStore/tables/dp736dao1466664806758/subset_small-50f68.tsv")
fields = rawData.map(lambda x:x.split("\t"))
documents = fields.map(lambda x:x[3].split(" "))

#Document names
documentNames = fields.map(lambda x:x[1])

#hash the word in document to their term frequencies
hashingtf = HashingTF(100000) #to save memory
tf = hashingtf.transform(documents) # each value ->term frequency of unique hash value

#calculating tf*idf score
idf = IDF(minDocFreq = 2).fit(tf)
tfidf = idf.transform(tf) # each value ->tf*idf of unique hash value of each document

#Test
gettysBurgTF = hashingtf.transform(["Gettysburg"])
gettysburgHashValue = int(gettysBurgTF.indices[0])

gettysburgRelevance = tfidf.map(lambda x: x[gettysburgHashValue])
zippedResults = gettysburgRelevance.zip(documentNames)

#print best result
print zippedResults.max()

Example No. 43
def returnTFIDF(tokens, hashingTF):
	tf = hashingTF.transform(tokens)
	idf = IDF(minDocFreq=25).fit(tf)
	tfidf = idf.transform(tf)
	return tfidf
Example No. 44
def classify_tweet(tf):
    idf = IDF().fit(tf)
    tf_idf = idf.transform(tf)

    return tf_idf
Example No. 45
allLableDocument = negLableDocument.union(posLableDocument)
allLableDocument.repartition(1)
lable = allLableDocument.map(lambda s: s[0])
document = allLableDocument.map(lambda s: s[1])

import jieba
words = document.map(lambda w: "/".join(jieba.cut_for_search(w))).map(
    lambda line: line.split("/"))

from pyspark.mllib.feature import HashingTF, IDF
hashingTF = HashingTF()
tf = hashingTF.transform(words)
tf.cache()

idfModel = IDF().fit(tf)
tfidf = idfModel.transform(tf)

from pyspark.mllib.regression import LabeledPoint
zipped = lable.zip(tfidf)
data = zipped.map(lambda line: LabeledPoint(line[0], line[1]))
training, test = data.randomSplit([0.6, 0.4], seed=0)

from pyspark.mllib.classification import NaiveBayes
NBmodel = NaiveBayes.train(training, 1.0)
predictionAndLabel = test.map(lambda p: (NBmodel.predict(p.features), p.label))
accuracy = 1.0 * predictionAndLabel.filter(
    lambda x: 1.0 if x[0] == x[1] else 0.0).count() / test.count()
# 0.6707555665973106

# yourDocument=input("输入待分类的评论:")
yourDocument = """那道黄金饺主食太肯爹了,每个饺子比小馄炖还小,炸过的,吃起来软塔塔的,里面就点萝卜丝,小小的12个,58元,大家千万别上当啊,菜谱里没有的,点菜时服务员竭力推荐的,千万别上当!??"""
    docData = docData.split()
    docData = [x for x in docData if x not in stopWordList]
    docData = [porter.stem(word) for word in docData]
    return (docID, docData)


data = rawData.map(lambda x: (x['Doc_ID'], x['Columns'])).map(parse)

titles = data.map(lambda x: x[0])
documents = data.map(lambda x: x[1])
hashingTF = HashingTF()
tf = hashingTF.transform(documents)
tf.cache()
idf = IDF().fit(tf)
normalizer = Normalizer()
tfidf = normalizer.transform(idf.transform(tf))
tfidfData = titles.zip(tfidf).toDF(["label", "features"])
#idf.rdd.saveAsTextFile("idf_model")
#sc.parallelize(idf.idf()).coalesce(1).saveAsTextFile("idf")
#MLUtils.saveAsLibSVMFile(tfidfData, "tfidf_column.out")

query = parse((
    0,
    "location_id organization_id name latitude longitude bbl bin cd council nta tract"
))[1]
queryTF = hashingTF.transform(query)
queryTFIDF = normalizer.transform(idf.transform(queryTF))
queryRelevance = tfidfData.rdd.map(lambda x: (x[0], float(x[1].dot(
    queryTFIDF)))).sortBy(lambda x: -x[1]).filter(lambda x: x[1] > 0)
if (queryRelevance.isEmpty()):
    print("nothing matched")
Ejemplo n.º 47
0
    twoRateDocument = rateDocument.filter(lambda line: int(float(line[0])) == 2).map(lambda line: (0, line[1]))
    oneRateDocument = rateDocument.filter(lambda line: int(float(line[0])) == 1).map(lambda line: (0, line[1]))
    allRateDocument = oneRateDocument.union(twoRateDocument).union(threeRateDocument).union(fourRateDocument).union(fiveRateDocument)

    # Generate training data
    rate = allRateDocument.map(lambda s: s[0])
    document = allRateDocument.map(lambda s: s[1].split(" "))
    tipsDocument = tipsDocument.map(lambda s: s[1])
    document_t = tipsDocument.map(lambda s: s.split(" "))

    hashingTF = HashingTF()
    tf=hashingTF.transform(document)
    tf.cache()

    idfModel = IDF().fit(tf)
    tfidf = idfModel.transform(tf)

    tf_t=hashingTF.transform(document_t)
    tf_t.cache()
    idfModel_t = IDF().fit(tf_t)
    tfidf_t = idfModel_t.transform(tf_t)
    training_t = tfidf_t

    zipped = rate.zip(tfidf)
    data = zipped.map(lambda line: LabeledPoint(line[0], line[1]))
    training, test = data.randomSplit([0.6, 0.4], seed=0)
#    LRmodel = LogisticRegressionWithSGD.train(training, iterations = 50)
#    NBmodel = NaiveBayes.train(training, 1.0)
    SVMmodel = SVMWithSGD.train(training, iterations=100)
    prediction = training_t.map(lambda p: (SVMmodel.predict(p)))
    predictionAndLabel = test.map(lambda p: (SVMmodel.predict(p.features), p.label))
Ejemplo n.º 48
0
 
  hashingTF = HashingTF(tf_val)
 
  print('Computing TF model...')
 
  tf_training = training.map(lambda tup: hashingTF.transform(tup[1]))
 
  print('Saving TF_MODEL...')
 
  tf_training.saveAsPickleFile("/model/TF_MODEL_"+str(tf_val))
 
  idf_training = IDF().fit(tf_training)
 
  print('Computing TF-IDF...')
 
  tfidf_training = idf_training.transform(tf_training)
 
  tfidf_idx = tfidf_training.zipWithIndex()
 
  training_idx = training.zipWithIndex()
 
  idx_training = training_idx.map(lambda line: (line[1], line[0]))
 
  idx_tfidf = tfidf_idx.map(lambda l: (l[1], l[0]))
 
  joined_tfidf_training = idx_training.join(idx_tfidf)
 
  training_labeled = joined_tfidf_training.map(lambda tup: tup[1])
 
  labeled_training_data = training_labeled.map(lambda k: LabeledPoint(k[0][0], k[1]))
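  # Possible next step (not part of the original snippet): train a classifier
  # on the labeled TF-IDF vectors, e.g. multinomial Naive Bayes.
  from pyspark.mllib.classification import NaiveBayes
  model = NaiveBayes.train(labeled_training_data, 1.0)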
 
Ejemplo n.º 49
0
    data.remove("")
    documents = sc.parallelize(data)

    def hashing(x):
        return hashingTF.transform([x]).indices[0]

    hashed = documents.flatMap(lambda line: line).map(
        lambda word: (hashing(word), word)).distinct()
    hashed_word = pd.DataFrame(hashed.collect(),
                               columns=['hash', 'word']).set_index('hash')
    # hashingTF = HashingTF()
    # Generate TF-IDF
    tf = hashingTF.transform(documents)
    tf.cache()
    idf = IDF().fit(tf)
    tf_idf_data = idf.transform(tf)
    print dt.now().strftime('%Y/%m/%d %H:%M:%S')
    K = 5

    # Index documents with unique IDs
    corpus_data = tf_idf_data.zipWithIndex().map(
        lambda x: [x[1], x[0]]).cache()
    print corpus_data
    # Cluster the documents into three topics using LDA
    ldaModel = LDA.train(corpus_data, k=K)

    # Output topics. Each is a distribution over words (matching word count vectors)
    print "Learned topics (as distributions over vocab of " + str(
        ldaModel.vocabSize()) + " words):"
    topics = ldaModel.topicsMatrix()
    print dt.now().strftime('%Y/%m/%d %H:%M:%S')
Ejemplo n.º 50
0
stop = sc.broadcast(stop)

sentiments = {'1.0': "Positive", '0.0': "Negative"}

tweets = sc.textFile("/Users/anshulrastogi/Downloads/nlp/twits.txt")
tweets = tweets.map(lambda x: re.sub(r"(@|#)(\w+)", '', x))
tweets = tweets.map(lambda x: x.split(','))
plain_txt = tweets.map(lambda x: (x[0], x[1].encode('utf-8').translate(string.maketrans("", ""), string.punctuation)))
plain = plain_txt.map(lambda x: (x[0], x[1].translate(string.maketrans("", ""), '0123456789').lower()))
labels = plain.map(lambda x: float(x[0])).collect()
tokens = plain.map(lambda x: x[1].split())

hashingTF = HashingTF()
tf = hashingTF.transform(tokens)
tf.cache()
idf = IDF().fit(tf)
tfidf = idf.transform(tf)
labeledData = tfidf.zipWithIndex().map(lambda (x, y): LabeledPoint(labels[y], x))

model = NaiveBayes.train(labeledData)

for tweet in feed:
    tweet_text = tweet['new_val']['text']
    message = re.sub(r"(@|#)(\w+)", '', tweet_text)
    message = message.encode('utf-8').translate(string.maketrans("", ""), string.punctuation)
    message = message.translate(string.maketrans("", ""), '0123456789').lower()

    tf_new = hashingTF.transform(message.split(" "))
    tweet['new_val']['sentiment'] = model.predict(idf.transform(tf_new))
    rdb.db("sentiment").table("classified_messages").insert(tweet).run()
    print sentiments[str(model.predict(idf.transform(tf_new)))] + " - " + tweet_text
Ejemplo n.º 51
0
def filterStopWords(x):
	filtered_x = []
	for word in x:
		if word not in stopwordsList and len(word)>1:
			filtered_x.append(word)
	return filtered_x

documents = documents.map(lambda x: filterStopWords(x)).filter(lambda x: len(x)>0)


## Step 3: Extract TF-IDF features
hashingTF = HashingTF(nFeature)   # default is 2^20
tf = hashingTF.transform(documents)
tf.cache()
idf = IDF(minDocFreq=5).fit(tf)
tfidf = idf.transform(tf).repartition(nPartition)
tf.unpersist()
del idf
tfidf.cache()

## Step 4: Clustering with k-mean algorithm

pool = [10, 100, 1000]
for nCluster in pool:
	# Build the model (cluster the data)
	kmeans_model = KMeans.train(tfidf, nCluster, maxIterations=10, runs=1, initializationMode="random")

	# Evaluate clustering by computing Within Set Sum of Squared Errors
	'''
	def error(point):
	    center = kmeans_model.centers[kmeans_model.predict(point)]
Ejemplo n.º 52
0
print(stops[1590:])

# Remove punctuation, stop words, invalid characters, etc.
comments_clean = comments_tokenized.map(
    lambda ele: [e for e in ele if e not in stops])
comments_clean.take(2)
'''
3.4.TF-IDF
'''
# Define the number of features
hashingTF = HashingTF(5000)

# tf-idf
comments_tf = hashingTF.transform(comments_clean)
comments_idf = IDF().fit(comments_tf)
comments_tfidf = comments_idf.transform(comments_tf)
'''
3.5. Naive Bayes
'''
# Combine the RDDs: labels and the reviews' TF-IDF vectors.
final_data = labels.zip(comments_tfidf)

# Split into training and test sets
train_set, test_set = final_data.randomSplit([0.8, 0.2], seed=20182019)
train_rdd = train_set.map(
    lambda ele: LabeledPoint(ele[0], ele[1]))  # convert to LabeledPoint
test_rdd = test_set.map(lambda ele: LabeledPoint(ele[0], ele[1]))

# Train
clf_nb = NaiveBayes.train(train_rdd)
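# Evaluation sketch (not in the original snippet): score the held-out split
# the same way the other examples in this collection do.
pred_and_label = test_rdd.map(lambda p: (clf_nb.predict(p.features), p.label))
accuracy = pred_and_label.filter(lambda x: x[0] == x[1]).count() / float(test_rdd.count())
print(accuracy)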
Ejemplo n.º 53
0
if __name__ == "__main__":
    sc = SparkContext(appName="TFIDFExample")  # SparkContext

    # $example on$
    # Load documents (one per line).
    documents = sc.textFile("data/mllib/kmeans_data.txt").map(lambda line: line.split(" "))

    hashingTF = HashingTF()
    tf = hashingTF.transform(documents)

    # While applying HashingTF only needs a single pass to the data, applying IDF needs two passes:
    # First to compute the IDF vector and second to scale the term frequencies by IDF.
    tf.cache()
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf)

    # spark.mllib's IDF implementation provides an option for ignoring terms
    # which occur in less than a minimum number of documents.
    # In such cases, the IDF for these terms is set to 0.
    # This feature can be used by passing the minDocFreq value to the IDF constructor.
    idfIgnore = IDF(minDocFreq=2).fit(tf)
    tfidfIgnore = idfIgnore.transform(tf)
    # $example off$

    print("tfidf:")
    for each in tfidf.collect():
        print(each)

    print("tfidfIgnore:")
    for each in tfidfIgnore.collect():
        print(each)
Ejemplo n.º 54
0
    doc_wo_counters = documents.mapPartitionsWithIndex(lambda i, iter: islice(iter, 3, None) if i == 0 else iter)

    final_doc = doc_wo_counters.map(lambda x: (int(x[0]), doc_to_words(int(x[1]), int(x[2])).encode("utf8"))).reduceByKey(lambda x, y: x + " " + y)

    vect_rep = final_doc.map(lambda x: x[1])

    raw_document = sc.textFile("test.txt")
    vect_rep = raw_document.map(lambda line: line.encode("utf8").split(" "))

    
    # TfIDF
    hashingTF = HashingTF()
    tf = hashingTF.transform(vect_rep)
    tf.cache()
    idf = IDF().fit(tf)
    tfidf_vectors = idf.transform(tf)
    
    #Build the model (cluster the data)
    clusters = KMeans.train(tfidf_vectors, 10, maxIterations=100)
    
    # Evaluate clustering by computing Within Set Sum of Squared Errors
    def error(point):
        center = clusters.centers[clusters.predict(point)]
        return sqrt(sum([x**2 for x in (point.toArray() - center)]))

    WSSSE = tfidf_vectors.map(lambda point: error(point)).reduce(lambda x, y: x + y)
    print("Within Set Sum of Squared Error = " + str(WSSSE))

    # Save and load model
    clusters.save(sc, "myModelPath")
    sameModel = KMeansModel.load(sc, "myModelPath")
Ejemplo n.º 55
0
    .map(lambda line: line.split(" "))\
    .map(lambda x: filter_word(x))\
    .map(lambda x: (0.0, x))


documents_train = documents.union(documents_neg)

labels = documents_train.map(lambda x: x[0])
train_set = documents_train.map(lambda x: x[1])

hashingTF = HashingTF()
tf = hashingTF.transform(train_set)

tf.cache()
idf = IDF(minDocFreq=2).fit(tf)
tfidf = idf.transform(tf)

# Create a labeled point with a positive label and a dense feature vector
training = labels.zip(tfidf).map(lambda x: LabeledPoint(x[0], x[1]))

model = NaiveBayes.train(training)

######### Calculate TFIDF with test data ########

### test_pos data ###
documents_t_RDD = sc.textFile("/Users/tracy/msan-ml/hw2/aclImdb/test_pos.txt")
# This command is for running on EMR connecting to S3
# documents_RDD = sc.textFile("s3n://aml-aml/test_pos.txt")

documents_t = documents_t_RDD.map(lambda x: x.replace(',',' ').replace('.',' ').replace('-',' ').lower())\
    .map(lambda line: line.split(" "))\
Ejemplo n.º 56
0
def init_tranining_set(sc):
    """
    合并积极/消极的词性
    param: sc spark对象的context
    """

    # 获取积极文本构造rdd
    positive_file1 = os.path.join(settings.DATA_DIR, '分类词库/positive.txt')
    positive_data1 = sc.textFile(positive_file1)
    # 数据去重
    positive_data1 = positive_data1.distinct()
    positive_data1 = positive_data1.map(lambda line: line.split('###')).filter(
        lambda line: len(line) == 2)

    #positive_file2 = os.path.join(settings.DATA_DIR, 'new_post.txt')
    #positive_data2 = sc.textFile(positive_file2)
    ## De-duplicate
    #positive_data2 = positive_data2.distinct()
    #positive_data2 = positive_data2.map(lambda line : line.split('###')).filter(lambda line : len(line)==2)

    #positive_data = positive_data1.union(positive_data2)
    #positive_data.repartition(1)

    # Load the negative texts and build an RDD
    negative_file1 = os.path.join(settings.DATA_DIR, '分类词库/negative.txt')
    negative_data1 = sc.textFile(negative_file1)
    negative_data1 = negative_data1.distinct()
    # Keep the two columns
    negative_data1 = negative_data1.map(lambda line: line.split('###')).filter(
        lambda line: len(line) == 2)
    #negative_file2 = os.path.join(settings.DATA_DIR, 'new_negi.txt')
    #negative_data2 = sc.textFile(negative_file2)
    #negative_data2 = negative_data2.distinct()
    ## Keep the two columns
    #negative_data2 = negative_data2.map(lambda line : line.split('###')).filter(lambda line : len(line)==2)

    #negative_data = negative_data1.union(negative_data2)
    #negative_data.repartition(1)

    positive_data = positive_data1
    negative_data = negative_data1
    print negative_data.count()
    print positive_data.count()

    # Merge into the full training set
    all_data = negative_data.union(positive_data)
    all_data.repartition(1)
    # The ratings were pre-processed so that only -1 and 1 remain
    rate = all_data.map(lambda s: s[0])
    document = all_data.map(lambda s: s[1])

    words = document.map(lambda w:"/".\
            join(jieba.cut_for_search(w))).\
            map(lambda line: line.split("/"))

    # Build the term-frequency matrix
    hashingTF = HashingTF()
    tf = hashingTF.transform(words)

    # Compute the TF-IDF matrix
    idfModel = IDF().fit(tf)
    tfidf = idfModel.transform(tf)
    tf.cache()

    # Generate the training and test sets
    zipped = rate.zip(tfidf)
    data = zipped.map(lambda line: LabeledPoint(line[0], line[1]))
    training, test = data.randomSplit([0.6, 0.4], seed=0)

    # Train the Naive Bayes classifier
    NBmodel = NaiveBayes.train(training, 1.0)
    predictionAndLabel = test.map(lambda p:
                                  (NBmodel.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(
        lambda x: x[0] == x[1]).count() / test.count()

    # Save the tokenized text in CSV format
    words.repartition(1).saveAsTextFile("traning_words")
    # Persist the Naive Bayes model with pickle
    with open('NBmodel.pkl', 'w') as f:
        pickle.dump(NBmodel, f)
Ejemplo n.º 57
0
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.feature import IDF
doc = sc.textFile("target url").map(lambda line: line.split(' '))
hashingTF = HashingTF()
hashingTF.indexOf("COMPANY NAME")
tf = hashingTF.transform(doc)
idf = IDF().fit(tf)
tfidf = idf.transform(tf).collect()
print(tfidf)
Ejemplo n.º 58
0
spark = SparkSession(sc)
script_dir = os.path.dirname(__file__)
#training = "training.1600000.processed.noemoticon.csv"
#testing="testdata.manual.2009.06.14.csv"
filename="train.csv"
#testing="test.csv"
abs_file_path = os.path.join(script_dir, filename)
#abs_file_path_testing= os.path.join(script_dir, testing)
raw_data = sc.textFile(abs_file_path)
#rawTestingData=sc.textFile(abs_file_path_testing)
label_text=raw_data.map(lambda x:(float(x.split(",")[0][1]), x.split(",")[5].encode('ascii','ignore')))
hashingTF = HashingTF()
# test=rawTestingData.map(lambda x:(float(x.split(",")[0][1]), x.split(",")[5].encode('ascii','ignore')))
feature_htf = label_text.map(lambda tup: hashingTF.transform(tup[1]))
feature_idf= IDF().fit(feature_htf)
featured= feature_idf.transform(feature_htf)

label=label_text.map(lambda tup: tup[0])
featured_idx=featured.zipWithIndex()
label_idx=label.zipWithIndex()
idx_featured=featured_idx.map(lambda x:(x[1],x[0]))
idx_label=label_idx.map(lambda x:(x[1],x[0]))


label_feature=idx_label.join(idx_featured).map(lambda x:x[1])


label_feature_LabeledPoint = label_feature.map(
    lambda x: LabeledPoint(x[0], x[1]))  # x is a (label, tfidf vector) pair after the join
for i in label_feature_LabeledPoint.collect():
    print i
Ejemplo n.º 59
0
    print x

# Boilerplate Spark stuff:
conf = SparkConf().setMaster("local").setAppName("SparkTFIDF")
sc = SparkContext(conf = conf)

# Load documents (one per line).
rawData = sc.textFile("subset-small.tsv")
fields = rawData.map(lambda x: x.split("\t"))
documents = fields.map(lambda x: x[3].split(" "))

documentNames = fields.map(lambda x: x[1])


hashingTF = HashingTF(100000)  #100K hash buckets just to save some memory
tf = hashingTF.transform(documents)

tf.cache()
idf = IDF(minDocFreq=2).fit(tf)
tfidf = idf.transform(tf)

keywordTF = hashingTF.transform(["Apollo"])
keywordHashValue = int(keywordTF.indices[0])

keywordRelevance = tfidf.map(lambda x: x[keywordHashValue])

zippedResults = keywordRelevance.zip(documentNames)

print "Best document for keywords is:"
print zippedResults.max()
Ejemplo n.º 60
0
hashingTF = HashingTF(dim)

tf=hashingTF.transform(tokens)

tf.cache()

v=tf.first()

print(v.size)
print(v.values)
print(v.indices)

idf = IDF().fit(tf)

tfidf=idf.transform(tf)

v2=tfidf.first()

print(v2.size)
print(v2.values)
print(v2.indices)

minMaxVals = tfidf.map(lambda v: (min(v.values),max(v.values)))
globalMin=minMaxVals.reduce(min)
globalMax=minMaxVals.reduce(max)
globalMinMax=(globalMin[0],globalMax[1])
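# Illustrative follow-up (assumption, not from the original snippet): rescale
# every TF-IDF weight into [0, 1] using the global min/max computed above.
from pyspark.mllib.linalg import Vectors

gmin, gmax = globalMinMax
scaled = tfidf.map(lambda v: Vectors.sparse(
    v.size, list(v.indices), [(x - gmin) / (gmax - gmin) for x in v.values]))
print(scaled.first())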

###Using a TF-IDF model

hockeyText= rdd.filter(lambda (file,text): file.find("hockey")!= -1)