def main():
    # Initialize the SparkContext
    sc = spark_context(spark_master)
    # Read the input file
    data = sc.textFile(hdfs_path)
    # Tokenize
    documents = data.map(tokenize)
    documents.cache()
    # TF
    hashingTF = HashingTF()
    tf = hashingTF.transform(documents)
    # IDF
    idf = IDF(minDocFreq=2).fit(tf)
    # TF-IDF
    tfidf = idf.transform(tf)
    # Connect to MongoDB
    from pymongo import MongoClient
    mongo_client = MongoClient(mongo_host)
    mongo_client.admin.authenticate(mongo_user, mongo_pass, mechanism='SCRAM-SHA-1')
    clear_mongodb(mongo_client)
    # Zip the tokenized documents with their TF-IDF vectors
    term_tfidf = documents.zip(tfidf).map(doc_tfidf)
    articles = term_tfidf.flatMap(lambda i: i).reduceByKey(lambda x, y: x + y)
    for article in articles.collect():
        item = {}
        item['text'] = article[0].encode('utf-8')
        item['size'] = int(article[1] * 10)
        send_mongodb(mongo_client, item)
def get_feature_vectors(sc, input_file, feature_dimensions):
    """Get feature vectors from the lines in input_file using TF-IDF.

    Returns:
        (tokenized text RDD, TF-IDF vectors RDD)
    """
    # Load documents (one per line).
    tweet_file = sc.textFile(input_file)
    input_text_rdd = tweet_file.map(lambda line: _tokenize(line))
    input_text_rdd.cache()

    # The default feature dimension is 2^20; for a corpus of a million
    # tweets, 50000 or 100000 dimensions are recommended. Use higher
    # dimensions for larger corpora.
    hashing_tf = HashingTF(feature_dimensions)
    tf = hashing_tf.transform(input_text_rdd)
    tf.cache()
    idf = IDF(minDocFreq=2).fit(tf)
    tfidf = idf.transform(tf)
    tfidf.cache()

    return input_text_rdd, tfidf
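# A minimal usage sketch for get_feature_vectors (not part of the original
# code): assumes a local Spark install, a hypothetical "tweets.txt" input
# file, and the module-level _tokenize helper referenced above.
from pyspark import SparkContext

sc = SparkContext("local", "tfidf-usage")
tokens_rdd, tfidf_rdd = get_feature_vectors(sc, "tweets.txt", 50000)
print(tfidf_rdd.first())  # a SparseVector of hashed term weights
sc.stop()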
def main(sc):
    stopset = set(stopwords.words('english'))
    tweets = sc.textFile('hdfs:/adi/sample.txt')
    words = tweets.map(lambda word: word.split(" "))

    # Filter stop words out of each tweet
    wordArr = []
    for wArr in words.collect():
        tempArr = [w for w in wArr if w not in stopset]
        wordArr.append(tempArr)

    # Load documents (one per line).
    documents = sc.textFile("hdfs:/adi/tokens1.txt").map(lambda line: line.split(" "))

    numDims = 100000
    hashingTF = HashingTF(numDims)
    tf = hashingTF.transform(documents)
    tf.cache()
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf)
    tfidf.count()

    # Cluster the TF-IDF vectors into 5 groups
    model = KMeans.train(tfidf, 5)
    model.save(sc, "tweetModel1")
    print("Final centers: " + str(model.clusterCenters))
    sc.stop()
def tfidf(self):
    self._create_rdd()
    hashingTF = HashingTF()
    tf = hashingTF.transform(self.token_rdd)
    idf = IDF(minDocFreq=2).fit(tf)
    tfidf = idf.transform(tf)
    return tfidf
def get_tfidf_features(txt):
    hashingTF = HashingTF()
    tf = hashingTF.transform(txt)
    tf.cache()
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf)
    return tfidf
def tfidf(rdd_doc):
    hashingTF = HashingTF()
    trainTf = hashingTF.transform(rdd_doc)
    trainTf.cache()
    idf = IDF().fit(trainTf)
    trainTfidf = idf.transform(trainTf)
    trainTfidf.cache()
    # Also return a lookup from a raw term to its hashed feature index
    return trainTfidf, lambda x: hashingTF.indexOf(x)
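# Hedged usage sketch (assumed names): the second value returned by tfidf()
# maps a raw term to its hashed feature index, so you can read one term's
# weight out of any of the returned SparseVectors.
train_tfidf, index_of = tfidf(docs_rdd)  # docs_rdd: an assumed RDD of token lists
spark_idx = index_of("spark")
print(train_tfidf.first()[spark_idx])  # TF-IDF weight of "spark" in the first doc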
def tfidf(self, tokenizer):
    """Get a TF-IDF matrix RDD using Spark's TF-IDF functions."""
    self._create_rdd(tokenizer)
    hashingTF = HashingTF()
    tf = hashingTF.transform(self.token_rdd)
    idf = IDF(minDocFreq=2).fit(tf)
    tfidf = idf.transform(tf)
    return self.rdd, idf, tfidf
def tf_idf(sc, title_token):
    hashingTF = HashingTF(100)
    title_token = sc.parallelize(title_token)
    tf = hashingTF.transform(title_token)
    print(tf, ' tf')
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf)
    return tfidf
def tf_idf_cal(words_rdd):
    hashingTF = HashingTF()
    tf = hashingTF.transform(words_rdd)
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf).cache()
    tfidf_str = tfidf.map(lambda line: str(line)).cache()
    return tfidf_str
def use_naive_bayes():
    """ Run Naive Bayes from Spark's MLlib library """
    from pyspark.mllib.classification import NaiveBayes
    from pyspark.mllib.feature import HashingTF, IDF
    from pyspark.mllib.linalg import SparseVector, Vectors
    from pyspark.mllib.regression import LabeledPoint
    # Loading the files
    path = "/Users/abhisheksingh29895/Desktop/courses/CURRENT/Advance_Machine_Learning/HW2/aclImdb/"
    train_pos = sc.textFile(path + "train/pos/*txt").map(lambda line: line.encode('utf8')).map(lambda line: line.split())
    train_neg = sc.textFile(path + "train/neg/*txt").map(lambda line: line.encode('utf8')).map(lambda line: line.split())
    test_pos = sc.textFile(path + "test/pos/*txt").map(lambda line: line.encode('utf8')).map(lambda line: line.split())
    test_neg = sc.textFile(path + "test/neg/*txt").map(lambda line: line.encode('utf8')).map(lambda line: line.split())
    # TF step
    tr_pos = HashingTF().transform(train_pos) ; tr_pos_idf = IDF().fit(tr_pos)
    tr_neg = HashingTF().transform(train_neg) ; tr_neg_idf = IDF().fit(tr_neg)
    te_pos = HashingTF().transform(test_pos) ; te_pos_idf = IDF().fit(te_pos)
    te_neg = HashingTF().transform(test_neg) ; te_neg_idf = IDF().fit(te_neg)
    # IDF step
    tr_pos_tfidf = tr_pos_idf.transform(tr_pos) ; tr_neg_tfidf = tr_neg_idf.transform(tr_neg)
    te_pos_tfidf = te_pos_idf.transform(te_pos) ; te_neg_tfidf = te_neg_idf.transform(te_neg)
    # Creating labels (1 = positive, 0 = negative)
    pos_label = [1] * 12500 ; pos_label = sc.parallelize(pos_label)
    neg_label = [0] * 12500 ; neg_label = sc.parallelize(neg_label)
    # Combine using zip
    train_pos_file = pos_label.zip(tr_pos_tfidf).map(lambda x: LabeledPoint(x[0], x[1]))
    train_neg_file = neg_label.zip(tr_neg_tfidf).map(lambda x: LabeledPoint(x[0], x[1]))
    test_pos_file = pos_label.zip(te_pos_tfidf).map(lambda x: LabeledPoint(x[0], x[1]))
    test_neg_file = neg_label.zip(te_neg_tfidf).map(lambda x: LabeledPoint(x[0], x[1]))
    # Joining the two RDDs to form the final training and test sets
    train_file = train_pos_file.union(train_neg_file)
    test_file = test_pos_file.union(test_neg_file)
    # Fitting a Naive Bayes model
    model = NaiveBayes.train(train_file)
    # Make predictions and test accuracy
    predictionAndLabel = test_file.map(lambda p: (model.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test_file.count()
    print ""
    print "Test accuracy is {}".format(round(accuracy, 4))
def mySpark(minFreq, keyWord):
    # Text-cleaning function
    def removePunctuation(text):
        res = text.lower().strip()
        res = re.sub("[^0-9a-zA-Z ]", "", res)
        return res.split(" ")

    # Print each element in an RDD partition
    def println(x):
        for i in x:
            print i

    # Boilerplate Spark stuff:
    conf = SparkConf().setMaster("local").setAppName("SparkTFIDF")
    sc = SparkContext(conf=conf)

    # Load document contents (one per line) and clean them.
    rawData = sc.textFile("list_berita-30.tsv")
    fields = rawData.map(lambda x: x.split("\t"))
    documents = fields.map(lambda x: removePunctuation(x[3]))

    # Document contents without word mapping
    documentNames = fields.map(lambda x: x[3])

    # TF processing
    hashingTF = HashingTF(100000)  # 100K hash buckets just to save some memory
    tf = hashingTF.transform(documents)

    # IDF and TF-IDF processing
    tf.cache()
    idf = IDF(minDocFreq=int(minFreq)).fit(tf)
    tfidf = idf.transform(tf)

    # Score the keyword's relevance against each document and zip with names
    keywordTF = hashingTF.transform(removePunctuation(keyWord))
    keywordHashValue = int(keywordTF.indices[0])
    keywordRelevance = tfidf.map(lambda x: x[keywordHashValue])
    zippedResults = keywordRelevance.zip(documentNames)

    # Print result
    print "Best document for keywords is:"
    print zippedResults.max()
def run_tf_idf_spark_mllib(df, numFeatures=1 << 20):
    tokenizer = Tokenizer(inputCol="body", outputCol="words")
    wordsData = tokenizer.transform(df)
    words = wordsData.select("words").rdd.map(lambda x: x.words)

    hashingTF = MllibHashingTF(numFeatures)
    tf = hashingTF.transform(words)
    tf.cache()
    idf = MllibIDF().fit(tf)
    tfidf = idf.transform(tf)

    # @TODO make this nicer
    tmp = sqlContext.createDataFrame(wordsData.rdd.zip(tfidf), ["data", "features"])
    tmp.registerTempTable("tmp")
    old_columns = ', '.join(map(lambda x: 'data.%s' % x, wordsData.columns))
    with_features = sqlContext.sql("SELECT %s, features FROM tmp" % old_columns)
    tmp = sqlContext.createDataFrame(with_features.rdd.zip(tf), ["data", "rawFeatures"])
    tmp.registerTempTable("tmp")
    old_columns = ', '.join(map(lambda x: 'data.%s' % x, with_features.columns))
    return sqlContext.sql("SELECT %s, rawFeatures FROM tmp" % old_columns)
def main(sc):
    data = sc.textFile('data/train.txt').map(parseLine)

    # Train/test split
    training, test = data.randomSplit([0.7, 0.3], seed=0)

    # TF: terms are hashed to indexes and term frequencies are computed
    hashingTF = HashingTF()
    tf_training = training.map(lambda tup: hashingTF.transform(tup[1]))

    # IDF: compute the IDF vector, then scale the TF by IDF
    idf_training = IDF().fit(tf_training)
    tfidf_training = idf_training.transform(tf_training)

    # (SparseVector(1048576, {110670: 1.5533, ...), 0)
    tfidf_idx = tfidf_training.zipWithIndex()
    # (['The', 'Da', 'Vinci', 'Code', 'book', 'is', 'just', 'awesome.'], 0)
    training_idx = training.zipWithIndex()

    # Swap so the index becomes the key
    idx_training = training_idx.map(lambda line: (line[1], line[0]))
    idx_tfidf = tfidf_idx.map(lambda l: (l[1], l[0]))

    # rdd.join: (K,V).join(K,W) -> (K, (V,W))
    # idx_tfidf has no label info (0/1), but idx_training has
    joined_tfidf_training = idx_training.join(idx_tfidf)
    training_labeled = joined_tfidf_training.map(lambda tup: tup[1])
    training_labeled = training_labeled.map(
        lambda x: LabeledPoint(x[0][0], x[1]))

    # Train a Naive Bayes model
    model = NaiveBayes.train(training_labeled, 1.0)

    # Test the model (note: this refits a second IDF on the test split)
    tf_test = test.map(lambda tup: hashingTF.transform(tup[1]))
    idf_test = IDF().fit(tf_test)
    tfidf_test = idf_test.transform(tf_test)
    tfidf_idx = tfidf_test.zipWithIndex()
    test_idx = test.zipWithIndex()
    idx_test = test_idx.map(lambda line: (line[1], line[0]))
    idx_tfidf = tfidf_idx.map(lambda l: (l[1], l[0]))
    joined_tfidf_test = idx_test.join(idx_tfidf)
    test_labeled = joined_tfidf_test.map(lambda tup: tup[1])
    labeled_test_data = test_labeled.map(lambda k: LabeledPoint(k[0][0], k[1]))

    # Apply the trained model to the test data
    predictionAndLabel = labeled_test_data.map(
        lambda p: (model.predict(p.features), p.label))

    # Calculate the accuracy
    accuracy = 1.0 * predictionAndLabel.filter(
        lambda x: x[0] == x[1]).count() / labeled_test_data.count()
    print('>>> Accuracy')
    print(accuracy)

    output = open('src/model/model.ml', 'wb')
    pickle.dump(model, output)
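# A hedged alternative to the test-side IDF().fit(tf_test) above: reuse the
# IDF model fitted on the training split so both splits share one scaling.
# Sketch only, with assumed arguments; hashing_tf and idf_model are the
# objects produced during training.
def transform_with_training_idf(test_rdd, hashing_tf, idf_model):
    # test_rdd holds (label, tokens) tuples, as produced by parseLine above
    tf_test = test_rdd.map(lambda tup: hashing_tf.transform(tup[1]))
    return idf_model.transform(tf_test)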
# In[106]:
documents1 = sc.textFile(fileName).map(lambda line: line.split(" "))
hashingTF = HashingTF(50000)
tf = hashingTF.transform(documents1)

# In[107]:
tf.cache()
if feature == "W":
    print(feature)
    tfidf = tf.map(lambda x: x)
else:
    idf = IDF(minDocFreq=1).fit(tf)
    tfidf = idf.transform(tf)

# In[109]:
documentModel = documents1.zip(tfidf)
random.seed(20181031)

# In[110]:
dataRDD = sc.parallelize(data, numSlices=16)
lists = dataRDD.map(doc2words)

# Build the vocabulary
all_words = []
for l in lists.collect():
    all_words.extend(l)
vocab = set(all_words)

# TF-IDF
hashingTF = HashingTF(numFeatures=len(vocab))
tf = hashingTF.transform(lists)
tf.cache()
idf = IDF().fit(tf)
tfidf = idf.transform(tf).collect()

## Cross-validation / grid search:
cv = ShuffleSplit(len(tfidf), n_iter=3, test_size=0.3, random_state=42)
nb = NaiveBayes()
lr = LogisticRegressionWithLBFGS()
svm = SVMWithSGD()
models = [lr, nb, svm]
if(flag=="n"): words.append(word) data.append(list(words)) data.remove("") documents = sc.parallelize(data) def hashing(x): return hashingTF.transform([x]).indices[0] hashed = documents.flatMap(lambda line: line).map(lambda word:(hashing(word), word)).distinct() hashed_word = pd.DataFrame(hashed.collect(), columns=['hash','word']).set_index('hash') # hashingTF = HashingTF() # Tf-Idfの生成 tf = hashingTF.transform(documents) tf.cache() idf = IDF().fit(tf) tf_idf_data = idf.transform(tf) print dt.now().strftime('%Y/%m/%d %H:%M:%S') K = 5 # Index documents with unique IDs corpus_data = tf_idf_data.zipWithIndex().map(lambda x: [x[1], x[0]]).cache() print corpus_data # Cluster the documents into three topics using LDA ldaModel = LDA.train(corpus_data, k=K) # Output topics. Each is a distribution over words (matching word count vectors) print "Learned topics (as distributions over vocab of " + str(ldaModel.vocabSize()) + " words):" topics = ldaModel.topicsMatrix() print dt.now().strftime('%Y/%m/%d %H:%M:%S') def idx_to_word(idx):
data_key = data.map(
    lambda x: (x.item_id, x.label, creat_vector(x.keywords.split('|'), keywords_dict, num_words)))

# 2) item words TF
data = hiveContext.sql('select * from wl_service.t_lt_train_item_words_v2')
# words to vector
tf = HashingTF()
data_tf = data.map(lambda x: (x.item_id, x.label, tf.transform(x.words.split('_'))))

# 3) item words TF-IDF
features = data_tf.map(lambda x: x[-1])
idf = IDF().fit(features)
idf_FT = idf.transform(features)
# item_id, label, feature
data_tfidf = data_tf.map(lambda x: (x[0], x[1])).zip(idf_FT).map(
    lambda x: (x[0][0], x[0][1], x[1]))

# 4) item info (price and favor features need normalization)
data_item = hiveContext.sql('select * from wl_service.t_lt_trian_item_info_v3')
data_item = data_item.na.fill(0)
data_item_ft = data_item.map(lambda x: (x.item_id, x.label, x[1:11]))

# 5) shop info (price/favor/credit features need normalization)
data_shop = hiveContext.sql(
    'select * from wl_service.t_lt_trian_item_shop_info_v4')
data_shop = data_shop.na.fill(0)
data_shop_ft = data_shop.map(lambda x: (x.item_id, x.label, x[1:23]))
nonStopWords2t = nonStopWordst.map(lambda t: ' '.join([stemmer.stem(word) for word in t.split(" ") if len(word) >= 2]))
tokenCountst = nonStopWords2t.map(lambda t: list(set((word, t.count(word)) for word in t.split(" "))))
manyTokenst = tokenCountst.map(lambda l: [mytuple for mytuple in l if mytuple[1] >= 2])
rareTokenst = manyTokenst.flatMap(lambda l: l)
rareTokens1t = rareTokenst.reduceByKey(lambda a, b: a + b).filter(lambda t: t[1] <= 1).map(lambda t: t[0])
raresett = set(rareTokens1t.collect())
manytokens_finalt = manyTokenst.map(lambda l: [mytuple for mytuple in l if mytuple[0] not in raresett])

dim = 2 ** 16
hashingTF = HashingTF(dim)
tokenst = manytokens_finalt.map(lambda l: [k for (k, v) in l])
tft = hashingTF.transform(tokenst)
tft.cache()

# idft = IDF().fit(tft)  -- above all, do not fit a new IDF here!
# Use the training-set IDF to transform the test data, as this gives a more
# realistic estimate of model performance on new data (p. 272).
tfidft_supervised = idf.transform(tft)

idft = IDF().fit(tft)
tfidft_unsupervised = idft.transform(tft)
labels = training_raw.map(
    lambda doc: doc["label"],  # Standard Python dict access
    preservesPartitioning=True  # Not strictly needed here.
)

# While applying HashingTF only needs a single pass over the data, applying IDF needs two:
# first to compute the IDF vector and second to scale the term frequencies by IDF.
tf = HashingTF(numFeatures=numfeatures).transform(  ## Use a much larger number in practice
    training_raw.map(lambda doc: doc["text"].split(), preservesPartitioning=True))
tf.cache()
idf = IDF().fit(tf)
tfidf = idf.transform(tf)

# Combine using zip (note: this zips the raw TF vectors, not the tfidf computed above)
training = labels.zip(tf).map(lambda x: LabeledPoint(x[0], x[1]))

# TEST DATA
testlabel = testlabels.map(lambda line: float(line))
t = reviewdata1.collect()
l = testlabel.collect()
testdata = [{"text": t[i], "label": l[i]} for i in range(len(l))]
test_raw = sc.parallelize(testdata)
testlabels = test_raw.map(
    lambda doc: doc["label"],  # Standard Python dict access
    preservesPartitioning=True  # Not strictly needed here.
sc = SparkContext('local', conf=conf)
tweetData = sc.textFile("data/tweets_formatted_data.csv")
tweetData.take(2)
fields = tweetData.map(lambda x: x.split(","))
fields.take(1)
documents = fields.map(lambda x: x[1].lower().split(" "))
documents.take(1)
documentNames = fields.map(lambda x: x[0])

hashingTF = HashingTF(100000)
article_hash_value = hashingTF.transform(documents)
article_hash_value.cache()
idf = IDF().fit(article_hash_value)
tfidf = idf.transform(article_hash_value)

xformedData = tweetData.zip(tfidf)
xformedData.cache()
xformedData.collect()[0]

from pyspark.mllib.regression import LabeledPoint

def convertToLabeledPoint(inVal):
    origAttr = inVal[0].split(",")
    sentiment = 0.0 if origAttr[0] == "feedback" else 1.0
    return LabeledPoint(sentiment, inVal[1])

tweetLp = xformedData.map(convertToLabeledPoint)
def tfIdf_cluster(self, content, title, date, tfidf):
    tfidf_list = content
    inputRDD = sc.parallelize(tfidf_list)
    hashingTF = HashingTF(2 ** 20)
    trainTf = hashingTF.transform(inputRDD)
    idf = IDF().fit(trainTf)
    trainTfidf = idf.transform(trainTf)
    km = KMeans.train(trainTfidf, 2, maxIterations=100, runs=10)  # training a new model
    result = km.predict(trainTfidf)
    k_data = array(result.collect())
    grp1_news = []
    grp2_news = []
    # Store the fetched news as [{}, {}] key/value dicts so the front end can use them
    for idx, grp in enumerate(k_data):
        if grp == 0:
            news = {
                'title': title[idx],
                'date': date[idx],
                'content': ''.join(content[idx].split()),
                'tfidf': tfidf[idx],
            }
            grp1_news.append(news)
        if grp == 1:
            news = {
                'title': title[idx],
                'date': date[idx],
                'content': ''.join(content[idx].split()),
                'tfidf': tfidf[idx],
            }
            grp2_news.append(news)
    # Begin storing the TF-IDF term counts per news cluster ------------------------------------
    tfidf_word_grp1 = []  # holds TF-IDF terms and their counts
    all_tfidf_grp1 = []   # holds all TF-IDF terms
    for post in grp1_news:
        tfidf = post['tfidf']
        for i in tfidf:
            all_tfidf_grp1.append(i)
    tfidf_dic1 = {}
    for ele in all_tfidf_grp1:
        if not ele in tfidf_dic1:
            tfidf_dic1[ele] = 1
        else:
            tfidf_dic1[ele] = tfidf_dic1[ele] + 1
    for i in range(0, len(tfidf_dic1)):
        data = {
            "text": tfidf_dic1.keys()[i],
            "size": (tfidf_dic1.values()[i]) * 1.5,
        }
        tfidf_word_grp1.append(data)
    tfidf_word_grp1.sort(key=lambda d: d['size'], reverse=True)  # sort the terms by size
    tfidf_word_grp1 = tfidf_word_grp1[0:50]
    tfidf_word_grp1 = json.dumps(tfidf_word_grp1)
    # ---------------------------------------------------------------------------------------------
    tfidf_word_grp2 = []  # holds TF-IDF terms and their counts
    all_tfidf_grp2 = []   # holds all TF-IDF terms
    for post in grp2_news:
        tfidf = post['tfidf']
        for i in tfidf:
            all_tfidf_grp2.append(i)
    tfidf_dic2 = {}
    for ele in all_tfidf_grp2:
        if not ele in tfidf_dic2:
            tfidf_dic2[ele] = 1
        else:
            tfidf_dic2[ele] = tfidf_dic2[ele] + 1
    for i in range(0, len(tfidf_dic2)):
        data = {
            "text": tfidf_dic2.keys()[i],
            "size": (tfidf_dic2.values()[i]) * 1.5,
        }
        tfidf_word_grp2.append(data)
    tfidf_word_grp2.sort(key=lambda d: d['size'], reverse=True)  # sort the terms by size
    tfidf_word_grp2 = tfidf_word_grp2[0:50]
    tfidf_word_grp2 = json.dumps(tfidf_word_grp2)
    # End storing the TF-IDF term counts per news cluster ------------------------------------
    return grp1_news, grp2_news, tfidf_word_grp1, tfidf_word_grp2
negRateDocument.repartition(1)
posRateDocument = sc.parallelize(fiveRateDocument.take(negRateDocument.count())).repartition(1)
allRateDocument = negRateDocument.union(posRateDocument)
allRateDocument.repartition(1)
rate = allRateDocument.map(lambda s: s[0])
document = allRateDocument.map(lambda s: s[1])

# Tokenize
words = document.map(lambda w: "/".join(jieba.cut_for_search(w))).map(lambda line: line.split("/"))

# Build the term-frequency matrix
hashingTF = HashingTF()
tf = hashingTF.transform(words)
tf.cache()

# Compute the TF-IDF matrix
idfModel = IDF().fit(tf)
tfidf = idfModel.transform(tf)

# Generate training and test sets
zipped = rate.zip(tfidf)
data = zipped.map(lambda line: LabeledPoint(line[0], line[1]))
training, test = data.randomSplit([0.6, 0.4], seed=0)

# Train a Naive Bayes classifier
NBmodel = NaiveBayes.train(training, 1.0)
predictionAndLabel = test.map(lambda p: (NBmodel.predict(p.features), p.label))
accuracy = 1.0 * predictionAndLabel.filter(lambda x: x[0] == x[1]).count() / test.count()
print accuracy
# When set to 1, 'pca_mode' projects the data onto its principal components
pca_mode = sc.broadcast(0)
low_dim = 2
feature_dim = 4096  # 1048576
k = feature_dim

# LOADING AND COMPUTING TF's TRAINING MODEL
print('Loading TRAINING_TF_MODEL...')
tf_training = sc.pickleFile(os.getcwd() + '/model/TF/TF_MODEL_' + str(feature_dim))
print('done!')

print('Computing TF-IDF MODEL...')
idf_training = IDF(minDocFreq=5).fit(tf_training)
tfidf_training = idf_training.transform(tf_training)
print('done!')

# APPLYING PCA ON TRAINING DATA
if pca_mode.value == 1:
    print('Applying PCA on training data...')
    PCA_model = PCA(low_dim).fit(tfidf_training)
    tfidf_training = PCA_model.transform(tfidf_training)
    k = low_dim
    # pcArray = model.transform(tfidf_training.first()).toArray()

# Setting checkpoint
# ssc.checkpoint("/Users/davidenardone/Desktop/checkpoint")

# CREATING DStream FROM TRAINING'S RDD
tweets = sc.textFile("/Users/anshulrastogi/Downloads/nlp/twits.txt") tweets = tweets.map(lambda x: re.sub(r"(@|#)(\w+)", '', x)) tweets = tweets.map(lambda x: x.split(',')) plain_txt = tweets.map(lambda x: (x[0], x[1].encode('utf-8').translate( string.maketrans("", ""), string.punctuation))) plain = plain_txt.map(lambda x: (x[0], x[1].translate(string.maketrans("", ""), '0123456789').lower())) labels = plain.map(lambda x: float(x[0])).collect() tokens = plain.map(lambda x: x[1].split()) hashingTF = HashingTF() tf = hashingTF.transform(tokens) tf.cache() idf = IDF().fit(tf) tfidf = idf.transform(tf) labeledData = tfidf.zipWithIndex().map(lambda (x, y): LabeledPoint(labels[y], x)) model = NaiveBayes.train(labeledData) for tweet in feed: tweet_text = tweet['new_val']['text'] message = re.sub(r"(@|#)(\w+)", '', tweet_text) message = message.encode('utf-8').translate(string.maketrans("", ""), string.punctuation) message = message.translate(string.maketrans("", ""), '0123456789').lower() tf_new = hashingTF.transform(message.split(" ")) tweet['new_val']['sentiment'] = model.predict(idf.transform(tf_new)) rdb.db("sentiment").table("classified_messages").insert(tweet).run()
    time = parts[1]
    features = parts[3].split(' ')
    return (time, features)

# Read the testing dataset
data2 = sc.textFile("/Users/macho/Desktop/data2.txt").map(parseLineTest)
time = data2.map(lambda doc: doc[0], preservesPartitioning=True)
tw = data2.map(lambda doc: doc[1], preservesPartitioning=True)

# Read the training dataset
data1 = sc.textFile('/Users/macho/Desktop/data1.txt').map(parseLine)

# Split the training dataset into labels and text
labels = data1.map(lambda doc: doc[0], preservesPartitioning=True)
text = data1.map(lambda doc: doc[1], preservesPartitioning=True)

# Combine the training and testing datasets
alltext = text.union(tw)

# Calculate TF-IDF (the IDF is fitted on the combined corpus)
Hash = HashingTF()
tf = Hash.transform(alltext)
tf1 = Hash.transform(text)
tf2 = Hash.transform(tw)
idf = IDF().fit(tf)
tfidf1 = idf.transform(tf1)
tfidf2 = idf.transform(tf2)

# Use Naive Bayes to classify the training dataset
training = labels.zip(tfidf1).map(lambda x: LabeledPoint(x[0], x[1]))
model = NaiveBayes.train(training)

# Predict the labels of the testing dataset
pred = time.zip(model.predict(tfidf2)).map(lambda x: ('', x[0], x[1], ''))

# Save the result
pred.saveAsTextFile("/Users/macho/Desktop/out")
data = sc.textFile("training_test_data.txt").map(parseLine) ''' Split data into labels and features, transform preservesPartitioning is not really required since map without partitioner shouldn't trigger repartitiong ''' # Extract all the "labels" labels = data.map(lambda doc: doc[0]["label"], preservesPartitioning=True) for x in labels.take(3): print x # Perform TF-IDF tf = HashingTF().transform( data.map(lambda doc: doc[0]["text"], preservesPartitioning=True)) idf = IDF().fit(tf) tfidf = idf.transform(tf) # Combine lables and tfidf and create LabeledPoint data dataset = labels.zip(tfidf).map(lambda x: LabeledPoint(x[0], x[1])) for x in dataset.take(3): print(x) result = [] ''' Random split dataset - 60% as training data and 40% as testing. Train and test the model 10 times. Then put the accuracy into result[] ''' for i in range(0, 10): training, test = dataset.randomSplit([0.6, 0.4], seed=i) model = NaiveBayes.train(training, 1.0) predictionAndLabel = test.map(lambda p:
def distributed_ops(corpus, sanit=False, recall=False, corpred=False,
                    streams=False, segred=False, tfidf=False, lda=False,
                    word2vec=False, fin=None, segclust=None):
    # Return item for end results
    return_list = []

    ##########################################
    # Default actions:
    if (segred):
        zipped_corpus = zip(segclust, corpus)

    corpus = sc.parallelize(corpus).cache()
    if (sanit or recall):
        corpus = corpus.map(lambda doc: preprocess(doc))
        # Here we "recover all" text, after having removed multi-ws & ws-pad punctuation
        # & replaced \n by NL etc. (see function "preprocess" above).
        # We use the same regex sub/filtration rules as in the implementation found
        # @ https://github.com/alexalemi/segmentation (from which we got the files
        # representation.py, tools.py and splitters.py, and on which
        # segmentSETxRes.py is based)
        if (recall):
            return_list.append(recover_encoding(corpus.collect()))

    # Here we return only potentially "meaningful words" - see function "return_words" above.
    # Keeps alphanumeric tokens (removes purely numeric and non-alphabetical ones).
    corpus_distrib = corpus.map(lambda doc: return_words(doc))
    print 'Original number of docs in corpus {filtering *docs* for alpha(+alphanumeric)-only words}: %i' % corpus_distrib.count()

    # Merge corpus docs into one continuous split text
    corpus_merge = []
    corpus_collect = corpus_distrib.collect()  # rdd -> list
    for list_of_words in corpus_collect:
        corpus_merge.extend(list_of_words)  # list of word-lists -> single word list

    # Use numpy functions to sort dict words by term frequency
    corpus_merge_array = np.array(corpus_merge)
    corpus_merge_sorted = np.sort(corpus_merge_array)
    corpus_merge_unique, counts = np.unique(corpus_merge_sorted, return_counts=True)
    sort_ixs = np.argsort(counts)[::-1]
    counts = counts[sort_ixs]
    corpus_merge_unique = corpus_merge_unique[sort_ixs]
    return_list.append(corpus_merge_unique)
    return_list.append(counts)
    print
    for i, w in enumerate(corpus_merge_unique):
        print ('Counted word "%s" _%i_ many times.' % (w, counts[i]))
    print

    #########################################################################################
    # Next we split the text based on "verbosity/density/sparsity" as would
    # befit an articulate document (i.e. articles/papers/journal entries)
    # or more conversational/blog-entry-like/Q&A-style/headings-only
    # retrieved website results.
    def error(point):
        center = clusters.centers[clusters.predict(point)]
        return sqrt(sum([x**2 for x in (point - center)]))

    # The following will further sanitize text.
    if (corpred):
        # Use pretrained term frequencies:
        # Experimentally, the following clustering has helped us get rid of
        # irrelevant search engine text results.
        corpus2vec = corpus.map(lambda doc: genre_score(doc, type2=False))
        corpus2vec = corpus2vec.map(lambda doc: process_doc2vec_word_counts(doc)).cache()
        # print 'Corpus vectorized'
        tempor = corpus.collect()
        print
        print
        for i, vec in enumerate(corpus2vec.collect()):
            print 'Got vecs:'
            print vec
            print 'Of text:'
            print tempor[i].split()
        print
        print
        # Choose 5 clusters
        clusters = KMeans.train(corpus2vec, 5, maxIterations=90, runs=10, initializationMode="k-means||")
        WSSE = corpus2vec.map(lambda point: error(point)).reduce(lambda x, y: x + y)  # cumulative sum
        print
        print 'Within Set Sum of Squared Error = ' + str(WSSE)
        print 'The cluster centers:'
        print clusters.centers
        print
        print
        return_list.append(corpus2vec.map(lambda pt: clusters.predict(pt)).collect())

    # The following will cluster by article length + content
    if (streams):
        corpus2vec = corpus.map(lambda doc: genre_score(doc, type2=True))
        temple = corpus.collect()
        print
        print
        for i, vec in enumerate(corpus2vec.collect()):
            print 'Got vecs:'
            print vec
            print 'Of text:'
            print temple[i].split()
        print
        print
        sumall = corpus2vec.reduce(lambda vecx, vecy: np.array([vecx[0] + vecy[0]]))
        corpus2vec = corpus2vec.map(lambda doc: process_doc2vec_word_counts(doc, normalizer=sumall)).cache()
        clusters = KMeans.train(corpus2vec, 5, maxIterations=90, runs=10, initializationMode="k-means||")
        WSSE = corpus2vec.map(lambda point: error(point)).reduce(lambda x, y: x + y)  # cumulative sum
        print
        print 'Within Set Sum of Squared Error = ' + str(WSSE)
        print 'The cluster centers:'
        print clusters.centers
        print
        print
        return_list.append(corpus2vec.map(lambda pt: clusters.predict(pt)).collect())

    #########################################################################################
    # Here we want to remove documents from the corpus which do not contain
    # 'english' dictionary words at all, or words that can be word2vec transformed
    # and "synonymized".
    if (segred):
        corpus_english_prose = sc.parallelize(zipped_corpus).filter(lambda doc: check(doc))
        zipped_corpus = zip(*corpus_english_prose.collect())
        red_clusts = list(zipped_corpus[0])
        red_text = recover_encoding(list(zipped_corpus[1]))
        return_list.append(red_clusts)
        return_list.append(red_text)
        print 'Number of docs in corpus {filtering *corpus* for alpha(+alphanumeric)-only words}: %i' % corpus_english_prose.count()
        f1 = open(''.join([filename, '-document_clusters.txt']), 'w')
        f1.write('\n'.join(map(str, red_clusts)))
        f1.close()
        f2 = open(''.join([filename, '-documents_sanitized.txt']), 'w')
        f2.write('\n'.join(red_text))
        f2.close()
        f3 = open(''.join([filename, '-documents_dict.txt']), 'w')
        f3.write('\n'.join(corpus_merge_unique))
        f3.close()

    #########################################################################################
    if (tfidf):
        # Generate document term frequencies
        htf = HashingTF()
        tf = htf.transform(corpus_distrib)
        # Generate idf = log(#docs / #docs containing term)
        idf = IDF().fit(tf)
        # Scale tf * idf
        tfidf = idf.transform(tf)
        # Collect tfidf for future use
        doc_tfidf = tfidf.collect()
        # Generate a unique-word : HashingTF-hash dict
        corpus_dict_tfidf_t = {}
        # Fill in the unique-word : HashingTF-hash dict
        for word in corpus_merge_unique:
            idx = htf.indexOf(word)
            corpus_dict_tfidf_t[word] = idx  # index not necessarily found in doc_tfidf
    # no return item

    #########################################################################################
    if (lda):
        corpus_dict = {}
        for c, word in enumerate(corpus_merge_unique):
            corpus_dict[word] = counts[c]

        def return_freq_words(doc, corpus_dict):
            return [word for word in doc if word in corpus_dict if corpus_dict[word] > 2]

        corpus_distrib_red = corpus_distrib.map(lambda doc: return_freq_words(doc, corpus_dict)).cache()
        gensim_corpora_id2word = corpora.Dictionary(corpus_distrib_red.collect())
        gensim_doc2bow_doctf = corpus_distrib_red.map(lambda doc: gensim_corpora_id2word.doc2bow(doc)).collect()
        f1 = open(''.join([filename, '-gensim_corpora_id2word.pkl']), 'w')
        pickle.dump(gensim_corpora_id2word, f1)
        f1.close()
        f2 = open(''.join([filename, '-gensim_doc2bow_doctf.pkl']), 'w')
        pickle.dump(gensim_doc2bow_doctf, f2)
        f2.close()
        f3 = open(''.join([filename, '-corpus.pkl']), 'w')
        pickle.dump(corpus_distrib.collect(), f3)
        f3.close()

    if (word2vec):
        def increase_tf(doc):
            # Only words with freq >= 5 are vectorized, so repeat each doc 5 times
            ret_doc = []
            for i in xrange(5):
                ret_doc.extend(doc)
            return ret_doc

        corpus_distrib_ext = corpus_distrib.map(lambda doc: increase_tf(doc))
        word_mbd = Word2Vec().setVectorSize(50).setSeed(42L).fit(corpus_distrib_ext)
        word2vec_dict = {}
        for i, w in enumerate(corpus_merge_unique):
            word2vec_dict[w] = word_mbd.transform(w)
            try:
                print ('Top 5 embedding cosine similarity synonyms of word "%s":' % w)
                proximal_synonyms = word_mbd.findSynonyms(w, 5)
                for s, cs in proximal_synonyms:
                    print ('  "%s" with score _%f_' % (s, cs))
            except:
                print 'No synonyms found (word not in dict).'
        print
        print 'Processing + Spark MLlib has given us %i word2vec vectors.' % len(word2vec_dict)
        return_list.append(word2vec_dict)
        f4 = open(''.join([filename, '-word2vec_dict.pkl']), 'w')
        pickle.dump(word2vec_dict, f4)
        f4.close()

    if len(return_list) == 1:
        return_list = return_list[0]
    return return_list
def calculate_tfidf(documents):
    hashingTF = HashingTF()
    tf = hashingTF.transform(documents.map(lambda x: x[1]))
    tf.cache()
    idf = IDF().fit(tf)
    return idf.transform(tf)
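# Minimal usage sketch for calculate_tfidf (illustrative names only):
# documents is assumed to be an RDD of (doc_id, token_list) pairs, since the
# function transforms x[1]; sc is an assumed SparkContext.
docs = sc.parallelize([
    ("d1", ["spark", "tfidf", "example"]),
    ("d2", ["another", "spark", "document"]),
])
vectors = calculate_tfidf(docs)
print(vectors.first())  # SparseVector of TF-IDF weights for the first document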
from pyspark import SparkContext
from pyspark.mllib.feature import HashingTF, IDF

"""
The following implementation is based on the sample provided by the Apache Spark
repository. It uses the built-in methods to calculate TF-IDF:
https://github.com/apache/spark/blob/master/examples/src/main/python/mllib/tf_idf_example.py
"""

sc = SparkContext("local", "TF-IDF")

# Load documents (one per line).
documents = sc.textFile("hdfs://$NAME_NODE_IP:9000/input/reviews.txt").map(
    lambda line: line.split(" "))

hashingTF = HashingTF()
tf = hashingTF.transform(documents)

# While applying HashingTF only needs a single pass over the data, applying IDF needs two:
# first to compute the IDF vector and second to scale the term frequencies by IDF.
tf.cache()
idf = IDF().fit(tf)
tfidf = idf.transform(tf)

# spark.mllib's IDF implementation provides an option for ignoring terms
# which occur in less than a minimum number of documents.
# In such cases, the IDF for these terms is set to 0.
# This feature can be used by passing the minDocFreq value to the IDF constructor.
idfIgnore = IDF(minDocFreq=2).fit(tf)
tfidfIgnore = idfIgnore.transform(tf)

# Save the TF-IDF vectors
tfidfIgnore.saveAsTextFile('hdfs://$NAME_NODE_IP:9000/output/tfidf')
def process(reviews):
    if (reviews.isEmpty()):
        pass
    else:
        model_name = "svm"
        updated_model = "svm0"
        model_path, data_path, metadata_path = '', '', ''

        # Loop to find the most recent model classifier available
        for i in range(60, -1, -1):
            model_path = "hdfs://VM10-1-0-14:9000/classifier/" + model_name + str(i)
            updated_model = model_name + str(i)
            data_path = model_path + "/data/part-r*"
            metadata_path = model_path + "/metadata/part-00000"
            if (patherror(data_path) == False and patherror(metadata_path) == False):
                break

        # Load the model classifier
        model = SVMModel.load(sc, model_path)

        start = time.time()
        reviews_label = reviews.map(lambda x: 0.0 if x[0] > 3.0 else 1.0)

        Words = Row('label', 'words')
        words = reviews.map(lambda r: Words(*r))
        words_df = spark.createDataFrame(words)

        # Review tokenization
        token = RegexTokenizer(minTokenLength=2, pattern="[^A-Za-z]+", inputCol="words",
                               outputCol="token", toLowercase=True)
        token_filtered = token.transform(words_df)

        # Stopword elimination
        remover = StopWordsRemover(inputCol="token", outputCol="stopwords", caseSensitive=False)
        stopwords_filtered = remover.transform(token_filtered)

        prep_filtered = (stopwords_filtered.select('stopwords').rdd).map(lambda x: x[0])

        # TF-IDF calculation
        tf = HashingTF(numFeatures=numFeatures).transform(
            prep_filtered.map(porter_stem, preservesPartitioning=True))
        idf = IDF().fit(tf)
        tfidf = idf.transform(tf)

        prediction = model.predict(tfidf)
        labeled_prediction = reviews_label.zip(prediction).map(lambda x: (float(x[1]), x[0]))
        metrics = MulticlassMetrics(labeled_prediction)

        output = reviews.zip(prediction)
        filename = "hdfs://VM10-1-0-14:9000/output/" + re.sub('[^0-9]', '', str(datetime.now())) + ".out"
        output.saveAsTextFile(filename)

        end = time.time()
        print(updated_model, ';', reviews.count(), ';', metrics.accuracy, ';',
              metrics.precision(0.0), ';', metrics.precision(1.0), ';',
              metrics.recall(0.0), ';', metrics.recall(1.0), ';',
              metrics.fMeasure(0.0), ';', metrics.fMeasure(1.0), ';', (end - start))
sc = SparkContext()
sc._jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", AWS_ACCESS_KEY_ID)
sc._jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey", AWS_SECRET_ACCESS_KEY)

text_negative = sc.textFile("s3n://sent/train_neg.txt")
text_positive = sc.textFile("s3n://sent/train_pos.txt")

train_text = text_negative.union(text_positive)
train_labels = text_negative.map(lambda x: 0.0).union(
    text_positive.map(lambda x: 1.0))

tf = HashingTF().transform(train_text.map(parseline, preservesPartitioning=True))
idf = IDF().fit(tf)
train_tfidf = idf.transform(tf)

training = train_labels.zip(train_tfidf).map(lambda x: LabeledPoint(x[0], x[1]))
model = NaiveBayes.train(training)

# TESTING SET =================================================================
text_negative = sc.textFile("s3n://sent/test_neg.txt")
text_positive = sc.textFile("s3n://sent/test_pos.txt")

test_text = text_negative.union(text_positive)
test_tlabels = text_negative.map(lambda x: 0.0).union(
    text_positive.map(lambda x: 1.0))
>>>corpus = parts.map(lambda row: Row(id=row[0], comment=row[1], label=row[2]))

#The parts is a list of fields, as each field in the line is delimited by "\t". Note that "class" is a reserved word in Python, so the class field is named label here.

#Let's break the corpus that has [ID, comment, label (0,1)] into different RDD objects:

>>>comment = corpus.map(lambda row: " " + row.comment)
>>>class_var = corpus.map(lambda row: row.label)

#Once we have the comments, we need to do a process very similar to what we did in Chapter 6, Text Classification, where we used scikit to do tokenization, hash vectorizer and calculate TF, IDF, and tf-idf using a vectorizer.

#The following is the snippet of how to create tokenization, term frequency, and inverse document frequency:

>>>from pyspark.mllib.feature import HashingTF
>>>from pyspark.mllib.feature import IDF

# https://spark.apache.org/docs/1.3.0/mllib-feature-extraction.html
>>>comment_tokenized = comment.map(lambda line: line.strip().split(" "))
>>>hashingTF = HashingTF(1000)  # to select only 1000 features
>>>comment_tf = hashingTF.transform(comment_tokenized)
>>>comment_idf = IDF().fit(comment_tf)
>>>comment_tfidf = comment_idf.transform(comment_tf)

#We will merge the class with the tfidf RDD like this:

>>>finaldata = class_var.zip(comment_tfidf)

#We will do a typical test and train sampling:

>>>train, test = finaldata.randomSplit([0.8, 0.2], seed=0)

#Let's perform the main classification commands, which are quite similar to scikit. We use Naive Bayes here; a logistic regression variant, another widely used classifier, is sketched after this snippet. The pyspark.mllib provides you a variety of algorithms.

#For more information on pyspark.mllib visit https://spark.apache.org/docs/latest/api/python/pyspark.mllib.html

#The following is an example of a Naive Bayes classifier:

>>>from pyspark.mllib.regression import LabeledPoint
>>>from pyspark.mllib.classification import NaiveBayes
>>>train_rdd = train.map(lambda t: LabeledPoint(t[0], t[1]))
>>>test_rdd = test.map(lambda t: LabeledPoint(t[0], t[1]))
>>>nb = NaiveBayes.train(train_rdd, 1.0)
>>>nb_output = test_rdd.map(lambda point: (nb.predict(point.features), point.label))
>>>print nb_output
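#Since the text above mentions logistic regression, here is a hedged sketch of that variant on the same LabeledPoint RDDs (train_rdd/test_rdd as built above):

>>>from pyspark.mllib.classification import LogisticRegressionWithLBFGS
>>>lr = LogisticRegressionWithLBFGS.train(train_rdd)
>>>lr_output = test_rdd.map(lambda point: (lr.predict(point.features), point.label))
>>>print lr_output.take(5)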
from pyspark.mllib.util import MLUtils
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.feature import HashingTF, IDF
from pyspark.mllib.tree import RandomForest
from pyspark import SparkContext

sc = SparkContext("local", "dd")

train = sc.parallelize(open("/home/madhura/ML_Spring16/MLProject/data/OriginalTraining.txt").read().splitlines()).map(lambda x: x.split(","))
trainlabels = train.map(lambda (a, b): int(b))
traintf = HashingTF().transform(train.map(lambda (a, b): a.split()))
trainidf = IDF().fit(traintf)
traintfidf = trainidf.transform(traintf)

training = trainlabels.zip(traintfidf).map(lambda x: LabeledPoint(x[0], x[1]))
MLUtils.saveAsLibSVMFile(training.coalesce(1), "/home/madhura/ML_Spring16/MLProject/data/libsvmfile")

data = MLUtils.loadLibSVMFile(sc, "/home/madhura/ML_Spring16/MLProject/data/libsvmfile/part-00000")
(trainingData, testData) = data.randomSplit([0.7, 0.3])
# Train on the training split only
model = RandomForest.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
                                     numTrees=3, featureSubsetStrategy="auto",
                                     impurity='gini', maxDepth=4, maxBins=32)
model.save(sc, "/home/madhura/ML_Spring16/MLProject/SentimentAnalysis_NLTK_NB/src/myRandomForestClassificationModel")
# COMMAND ----------

# Tags count
read_tags_1m_data.count()

# COMMAND ----------

# TF-IDF of documents
from pyspark.mllib.feature import HashingTF, IDF

hashingTF = HashingTF(features_length)
tf = hashingTF.transform(documents)
tf.cache()
idf = IDF().fit(tf)
tfidf = idf.transform(tf)

# COMMAND ----------

tfidf

# COMMAND ----------

# Documents after TF-IDF
tfidf.take(3)
mtif = tfidf.map(lambda x: [x])
def main():
    """ Driver program for a spam filter using Spark and MLlib """

    # Consolidate the individual email files into a single spam file
    # and a single ham file
    makeDataFileFromEmails("data/spam_2/", "data/spam.txt")
    makeDataFileFromEmails("data/easy_ham_2/", "data/ham.txt")

    # Create the Spark Context for parallel processing
    sc = SparkContext(appName="Spam Filter")

    # Load the spam and ham data files into RDDs
    spam = sc.textFile("data/spam.txt")
    ham = sc.textFile("data/ham.txt")

    # Create a HashingTF instance to map email text to vectors of 10,000 features.
    tf = HashingTF(numFeatures=10000)

    # Each email is split into words, and each word is mapped to one feature.
    spamtf = spam.map(lambda email: tf.transform(email.split(" ")))
    hamtf = ham.map(lambda email: tf.transform(email.split(" ")))
    spamtf.cache()
    hamtf.cache()

    # Note: a separate IDF is fitted per class here; see the shared-IDF
    # sketch after this function for a consistent alternative.
    spamidf = IDF().fit(spamtf)
    hamidf = IDF().fit(hamtf)

    spamFeatures = spamidf.transform(spamtf)
    hamFeatures = hamidf.transform(hamtf)

    # Create LabeledPoint datasets for positive (spam) and negative (ham) data points.
    positiveExamples = spamFeatures.map(lambda features: LabeledPoint(1, features))
    negativeExamples = hamFeatures.map(lambda features: LabeledPoint(0, features))

    # Combine positive and negative datasets into one
    data = positiveExamples.union(negativeExamples)

    # Split the data into 70% for training and 30% test data sets
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Cache the training data to optimize the training step
    trainingData.cache()

    # Train the model with an SVM using the SGD algorithm.
    model = SVMWithSGD.train(trainingData, iterations=100)

    # Create tuples of actual and predicted values
    labels_and_predictions = testData.map(lambda email: (email.label, model.predict(email.features)))

    # Calculate the error rate as number wrong / total number
    error_rate = labels_and_predictions.filter(lambda (val, pred): val != pred).count() / float(testData.count())
    print("*********** SPAM FILTER RESULTS **********")
    print("\n")
    print("Error Rate: " + str(error_rate))
    print("\n")

    # Serialize the model for persistence
    pickle.dump(model, open("SpamSvm.pkl", "wb"))

    sc.stop()
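# A hedged alternative to the per-class IDFs above (sketch, assumed names):
# fit a single IDF over the union of the spam and ham TF vectors so both
# classes are scaled by the same document frequencies.
def shared_idf_features(spamtf, hamtf):
    idf = IDF().fit(spamtf.union(hamtf))
    return idf.transform(spamtf), idf.transform(hamtf)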
                     vocabSize=vocabSize, minDF=mindocFrequencies)
cvmodel = cv.fit(wordsDataFrame)
result = cvmodel.transform(wordsDataFrame).select("label", "features")
vocablist = (cvmodel.vocabulary)

countVectors = result.select("features")
frequencyVectors = countVectors.rdd.map(
    lambda vector: DenseVector(vector[0].toArray()))

idf = IDF().fit(frequencyVectors)
tfidf = idf.transform(frequencyVectors)

frequencyVectors = frequencyVectors.zipWithIndex()
resultMod = sqlContext.createDataFrame(frequencyVectors, ["features", "documentId"])

# Prepare the corpus for LDA
corpus = tfidf.map(lambda x: [1, x]).cache()

# Train LDA
# optimizer parameter: "em" or "online"
def calculateSentiment(self, sc, query):
    model = NaiveBayesModel.load(sc, "finalproject/model/NaiveBayesModel")
    print(query)
    twitDG = TwitterDataGenerator()
    twitDG.getData(query)
    inputFile = sc.textFile("finalproject/tweets.csv").distinct()
    input_id = inputFile.zipWithIndex().map(lambda l: (l[1], l[0]))
    preprocessedData = self.preProcess(inputFile)
    inputFileProcessed = self.processInputFile(inputFile)
    print(preprocessedData.take(5))
    print(inputFileProcessed.take(5))
    print("input file processed ", inputFileProcessed.count())
    print("preprocessed count", preprocessedData.count())

    # TF-IDF features, then classify each tweet with the loaded model
    hashingTF = HashingTF()
    tfData = preprocessedData.map(lambda tup: hashingTF.transform(tup))
    idfData = IDF().fit(tfData)
    tfidfData = idfData.transform(tfData)
    output = tfidfData.map(lambda rec: model.predict(rec))

    i_I = inputFileProcessed.map(lambda l: l[0]).zipWithIndex().map(lambda l: (l[1], l[0]))
    print("input file count", inputFile.count())
    print("output file count", output.count())
    o_I = output.zipWithIndex().map(lambda l: (l[1], l[0]))
    i_o = i_I.join(o_I).map(lambda l: l[1])
    print(i_o.take(i_o.count()))
    print(i_o.count())

    outputJson = {}
    tweetList = []
    tweet = {}
    positiveCount = 0
    negativeCount = 0
    for i in i_o.take(i_o.count()):
        print(i)
        if i[1] == 0.0:
            negativeCount = negativeCount + 1
            text = "This is a negative Tweet"
        elif i[1] == 1.0:
            positiveCount = positiveCount + 1
            text = "This is a positive Tweet"
        if len(i[0]) > 4:
            tweet = {}
            tweet['value'] = i[0].encode("ascii", "ignore")
            tweet['sentiment'] = text
            tweetList.append(tweet)
            print i[0].encode("ascii", "ignore")
            print text
            print "-------------------------------------"
    print(positiveCount)
    print(negativeCount)
    outputJson["tweets"] = json.dumps(tweetList)
    outputJson["positiveTweetCount"] = positiveCount
    outputJson["negativeTweetCount"] = negativeCount

    # Most frequent words, excluding "rt" and the query term itself
    wordflatMap = preprocessedData.flatMap(lambda xs: [x for x in xs]).map(lambda x: x.encode("ascii", "ignore")).map(lambda x: (x, 1)).reduceByKey(add)
    wordFlatMap_reversed = wordflatMap.map(lambda l: (l[1], l[0])).filter(lambda l: (l[1] != "rt" and l[1] != query))
    wordFlatMap_sorted = wordFlatMap_reversed.sortByKey(False)
    print(wordFlatMap_sorted.take(10))
    outputFrequencyList = {}
    mostFrequentWordList = []
    wordCount = {}
    for i in wordFlatMap_sorted.take(10):
        wordCount = {}
        wordCount['word'] = i[1]
        wordCount['count'] = i[0]
        mostFrequentWordList.append(wordCount)
    outputJson["frequency"] = json.dumps(mostFrequentWordList)
    return outputJson
# Load documents (one per line).
documents = sc.textFile(sys.argv[1]).map(parseLine)  # RDD

label = documents.map(lambda x: x[1])
features = documents.map(lambda x: x[2])

labelSet = list(set(label.collect()))  # keep only the unique categories
print "Category-Label mapping:", labelSet

hashingTF = HashingTF(5000)
tf = hashingTF.transform(features)
tf.cache()
idf = IDF(minDocFreq=5).fit(tf)
tfidf = idf.transform(tf).cache()

data = label.zip(tfidf).map(
    lambda x: LabeledPoint(labelSet.index(x[0]), x[1])).cache()

# Note: two independent samples may overlap; randomSplit would give a disjoint split
training = data.sample(False, .90)
test = data.sample(False, .10)
print "Num Points:", data.count()

# Build the model
model = LogisticRegressionWithLBFGS.train(training, numClasses=len(labelSet))

# Test a few items
labelsAndPreds = test.map(
    lambda p: (labelSet[int(p.label)], p.label, model.predict(p.features)))
temp = labelsAndPreds.take(50)
for index in range(len(temp)):
def tfidf(data):
    hashing = HashingTF()
    tf = hashing.transform(data)
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf)
    return tfidf
def produce_tfidf(x):
    tf = HashingTF().transform(x)
    idf = IDF(minDocFreq=5).fit(tf)
    tfidf = idf.transform(tf)
    return tfidf
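# Usage sketch for produce_tfidf (illustrative only): x is an RDD of token
# lists; with minDocFreq=5, terms seen in fewer than 5 documents get IDF 0.
# sc is an assumed SparkContext.
docs = sc.parallelize([["hello", "spark"], ["hello", "tfidf"]])
weights = produce_tfidf(docs)
print(weights.collect())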
def vectorize_feature(training):
    hashingTF = HashingTF()
    tf_training = training.map(lambda tup: hashingTF.transform(tup[1]))
    idf_training = IDF().fit(tf_training)
    tfidf_training = idf_training.transform(tf_training)
    return tfidf_training
# Databricks notebook source exported at Thu, 23 Jun 2016 07:23:39 UTC
from pyspark import SparkConf, SparkContext
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.feature import IDF

rawData = sc.textFile("/FileStore/tables/dp736dao1466664806758/subset_small-50f68.tsv")
fields = rawData.map(lambda x: x.split("\t"))
documents = fields.map(lambda x: x[3].split(" "))

# Document names
documentNames = fields.map(lambda x: x[1])

# Hash the words in each document to their term frequencies
hashingtf = HashingTF(100000)  # to save memory
tf = hashingtf.transform(documents)  # each value -> term frequency of the unique hash value

# Calculate the tf*idf score
idf = IDF(minDocFreq=2).fit(tf)
tfidf = idf.transform(tf)  # each value -> tf*idf of the unique hash value in each document

# Test: the query is passed as a one-element list, since transform expects a
# sequence of terms (a bare string would be hashed character by character)
gettysBurgTF = hashingtf.transform(["Gettysburg"])
gettysburgHashValue = int(gettysBurgTF.indices[0])
gettysburgRelevance = tfidf.map(lambda x: x[gettysburgHashValue])
zippedResults = gettysburgRelevance.zip(documentNames)

# Print the best result
print zippedResults.max()
def returnTFIDF(tokens, hashingTF):
    tf = hashingTF.transform(tokens)
    idf = IDF(minDocFreq=25).fit(tf)
    tfidf = idf.transform(tf)
    return tfidf
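# Design note, sketched with assumed names: passing one shared HashingTF into
# returnTFIDF keeps train- and test-time terms hashed to the same feature
# indices, which a model trained on one set needs to score the other.
shared_tf = HashingTF(1 << 18)
train_vecs = returnTFIDF(train_tokens, shared_tf)  # train_tokens: assumed RDD of token lists
test_vecs = returnTFIDF(test_tokens, shared_tf)    # test_tokens: assumed RDD of token lists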
def classify_tweet(tf):
    idf = IDF().fit(tf)
    tf_idf = idf.transform(tf)
    return tf_idf
allLableDocument = negLableDocument.union(posLableDocument)
allLableDocument.repartition(1)
lable = allLableDocument.map(lambda s: s[0])
document = allLableDocument.map(lambda s: s[1])

import jieba
words = document.map(lambda w: "/".join(jieba.cut_for_search(w))).map(
    lambda line: line.split("/"))

from pyspark.mllib.feature import HashingTF, IDF
hashingTF = HashingTF()
tf = hashingTF.transform(words)
tf.cache()
idfModel = IDF().fit(tf)
tfidf = idfModel.transform(tf)

from pyspark.mllib.regression import LabeledPoint
zipped = lable.zip(tfidf)
data = zipped.map(lambda line: LabeledPoint(line[0], line[1]))
training, test = data.randomSplit([0.6, 0.4], seed=0)

from pyspark.mllib.classification import NaiveBayes
NBmodel = NaiveBayes.train(training, 1.0)
predictionAndLabel = test.map(lambda p: (NBmodel.predict(p.features), p.label))
accuracy = 1.0 * predictionAndLabel.filter(
    lambda x: x[0] == x[1]).count() / test.count()
# 0.6707555665973106

# yourDocument = input("Enter the review to classify:")
yourDocument = """那道黄金饺主食太肯爹了,每个饺子比小馄炖还小,炸过的,吃起来软塔塔的,里面就点萝卜丝,小小的12个,58元,大家千万别上当啊,菜谱里没有的,点菜时服务员竭力推荐的,千万别上当!??"""
    docData = docData.split()
    docData = [x for x in docData if x not in stopWordList]
    docData = [porter.stem(word) for word in docData]
    return (docID, docData)

data = rawData.map(lambda x: (x['Doc_ID'], x['Columns'])).map(parse)
titles = data.map(lambda x: x[0])
documents = data.map(lambda x: x[1])

hashingTF = HashingTF()
tf = hashingTF.transform(documents)
tf.cache()
idf = IDF().fit(tf)
normalizer = Normalizer()
tfidf = normalizer.transform(idf.transform(tf))
tfidfData = titles.zip(tfidf).toDF(["label", "features"])
# idf.rdd.saveAsTextFile("idf_model")
# sc.parallelize(idf.idf()).coalesce(1).saveAsTextFile("idf")
# MLUtils.saveAsLibSVMFile(tfidfData, "tfidf_column.out")

query = parse((
    0,
    "location_id organization_id name latitude longitude bbl bin cd council nta tract"
))[1]
queryTF = hashingTF.transform(query)
queryTFIDF = normalizer.transform(idf.transform(queryTF))

queryRelevance = tfidfData.rdd.map(lambda x: (x[0], float(x[1].dot(
    queryTFIDF)))).sortBy(lambda x: -x[1]).filter(lambda x: x[1] > 0)
if (queryRelevance.isEmpty()):
    print("nothing matched")
twoRateDocument = rateDocument.filter(lambda line: int(float(line[0])) == 2).map(lambda line: (0, line[1]))
oneRateDocument = rateDocument.filter(lambda line: int(float(line[0])) == 1).map(lambda line: (0, line[1]))
allRateDocument = oneRateDocument.union(twoRateDocument).union(threeRateDocument).union(fourRateDocument).union(fiveRateDocument)

# Generate training data
rate = allRateDocument.map(lambda s: s[0])
document = allRateDocument.map(lambda s: s[1].split(" "))

tipsDocument = tipsDocument.map(lambda s: s[1])
document_t = tipsDocument.map(lambda s: s.split(" "))

hashingTF = HashingTF()
tf = hashingTF.transform(document)
tf.cache()
idfModel = IDF().fit(tf)
tfidf = idfModel.transform(tf)

tf_t = hashingTF.transform(document_t)
tf_t.cache()
idfModel_t = IDF().fit(tf_t)
tfidf_t = idfModel_t.transform(tf_t)
training_t = tfidf_t

zipped = rate.zip(tfidf)
data = zipped.map(lambda line: LabeledPoint(line[0], line[1]))
training, test = data.randomSplit([0.6, 0.4], seed=0)

# LRmodel = LogisticRegressionWithSGD.train(training, iterations=50)
# NBmodel = NaiveBayes.train(training, 1.0)
SVMmodel = SVMWithSGD.train(training, iterations=100)

prediction = training_t.map(lambda p: (SVMmodel.predict(p)))
predictionAndLabel = test.map(lambda p: (SVMmodel.predict(p.features), p.label))
hashingTF = HashingTF(tf_val)
print('Computing TF model...')
tf_training = training.map(lambda tup: hashingTF.transform(tup[1]))
print('Saving TF_MODEL...')
tf_training.saveAsPickleFile("/model/TF_MODEL_" + str(tf_val))

idf_training = IDF().fit(tf_training)
print('Computing TF-IDF...')
tfidf_training = idf_training.transform(tf_training)

tfidf_idx = tfidf_training.zipWithIndex()
training_idx = training.zipWithIndex()
idx_training = training_idx.map(lambda line: (line[1], line[0]))
idx_tfidf = tfidf_idx.map(lambda l: (l[1], l[0]))
joined_tfidf_training = idx_training.join(idx_tfidf)
training_labeled = joined_tfidf_training.map(lambda tup: tup[1])
labeled_training_data = training_labeled.map(lambda k: LabeledPoint(k[0][0], k[1]))
data.remove("") documents = sc.parallelize(data) def hashing(x): return hashingTF.transform([x]).indices[0] hashed = documents.flatMap(lambda line: line).map( lambda word: (hashing(word), word)).distinct() hashed_word = pd.DataFrame(hashed.collect(), columns=['hash', 'word']).set_index('hash') # hashingTF = HashingTF() # Tf-Idfの生成 tf = hashingTF.transform(documents) tf.cache() idf = IDF().fit(tf) tf_idf_data = idf.transform(tf) print dt.now().strftime('%Y/%m/%d %H:%M:%S') K = 5 # Index documents with unique IDs corpus_data = tf_idf_data.zipWithIndex().map( lambda x: [x[1], x[0]]).cache() print corpus_data # Cluster the documents into three topics using LDA ldaModel = LDA.train(corpus_data, k=K) # Output topics. Each is a distribution over words (matching word count vectors) print "Learned topics (as distributions over vocab of " + str( ldaModel.vocabSize()) + " words):" topics = ldaModel.topicsMatrix() print dt.now().strftime('%Y/%m/%d %H:%M:%S')
stop = sc.broadcast(stop)
sentiments = {'1.0': "Positive", '0.0': "Negative"}

tweets = sc.textFile("/Users/anshulrastogi/Downloads/nlp/twits.txt")
tweets = tweets.map(lambda x: re.sub(r"(@|#)(\w+)", '', x))
tweets = tweets.map(lambda x: x.split(','))
plain_txt = tweets.map(lambda x: (x[0], x[1].encode('utf-8').translate(string.maketrans("", ""), string.punctuation)))
plain = plain_txt.map(lambda x: (x[0], x[1].translate(string.maketrans("", ""), '0123456789').lower()))
labels = plain.map(lambda x: float(x[0])).collect()
tokens = plain.map(lambda x: x[1].split())

hashingTF = HashingTF()
tf = hashingTF.transform(tokens)
tf.cache()
idf = IDF().fit(tf)
tfidf = idf.transform(tf)

labeledData = tfidf.zipWithIndex().map(lambda (x, y): LabeledPoint(labels[y], x))
model = NaiveBayes.train(labeledData)

for tweet in feed:
    tweet_text = tweet['new_val']['text']
    message = re.sub(r"(@|#)(\w+)", '', tweet_text)
    message = message.encode('utf-8').translate(string.maketrans("", ""), string.punctuation)
    message = message.translate(string.maketrans("", ""), '0123456789').lower()
    tf_new = hashingTF.transform(message.split(" "))
    tweet['new_val']['sentiment'] = model.predict(idf.transform(tf_new))
    rdb.db("sentiment").table("classified_messages").insert(tweet).run()
    print sentiments[str(model.predict(idf.transform(tf_new)))] + " - " + tweet_text
def filterStopWords(x):
    filtered_x = []
    for word in x:
        if word not in stopwordsList and len(word) > 1:
            filtered_x.append(word)
    return filtered_x

documents = documents.map(lambda x: filterStopWords(x)).filter(lambda x: len(x) > 0)

## Step 3: Extract TF-IDF features
hashingTF = HashingTF(nFeature)  # default is 2^20
tf = hashingTF.transform(documents)
tf.cache()
idf = IDF(minDocFreq=5).fit(tf)
tfidf = idf.transform(tf).repartition(nPartition)
tf.unpersist()
del idf
tfidf.cache()

## Step 4: Clustering with the k-means algorithm
pool = [10, 100, 1000]
for nCluster in pool:
    # Build the model (cluster the data)
    kmeans_model = KMeans.train(tfidf, nCluster, maxIterations=10, runs=1, initializationMode="random")

    # Evaluate clustering by computing Within Set Sum of Squared Errors
    '''
    def error(point):
        center = kmeans_model.centers[kmeans_model.predict(point)]
print(stops[1590:])

# Clean punctuation, stop words, illegal characters, etc.
comments_clean = comments_tokenized.map(
    lambda ele: [e for e in ele if e not in stops])
comments_clean.take(2)

'''
3.4. TF-IDF
'''
# Define the number of features
hashingTF = HashingTF(5000)

# TF-IDF
comments_tf = hashingTF.transform(comments_clean)
comments_idf = IDF().fit(comments_tf)
comments_tfidf = comments_idf.transform(comments_tf)

'''
3.5. Naive Bayes
'''
# Merge the RDDs: labels and comment contents.
final_data = labels.zip(comments_tfidf)

# Split into training and test sets
train_set, test_set = final_data.randomSplit([0.8, 0.2], seed=20182019)
train_rdd = train_set.map(
    lambda ele: LabeledPoint(ele[0], ele[1]))  # convert to the labeled data type
test_rdd = test_set.map(lambda ele: LabeledPoint(ele[0], ele[1]))

# Train
clf_nb = NaiveBayes.train(train_rdd)
if __name__ == "__main__": sc = SparkContext(appName="TFIDFExample") # SparkContext # $example on$ # Load documents (one per line). documents = sc.textFile("data/mllib/kmeans_data.txt").map(lambda line: line.split(" ")) hashingTF = HashingTF() tf = hashingTF.transform(documents) # While applying HashingTF only needs a single pass to the data, applying IDF needs two passes: # First to compute the IDF vector and second to scale the term frequencies by IDF. tf.cache() idf = IDF().fit(tf) tfidf = idf.transform(tf) # spark.mllib's IDF implementation provides an option for ignoring terms # which occur in less than a minimum number of documents. # In such cases, the IDF for these terms is set to 0. # This feature can be used by passing the minDocFreq value to the IDF constructor. idfIgnore = IDF(minDocFreq=2).fit(tf) tfidfIgnore = idfIgnore.transform(tf) # $example off$ print("tfidf:") for each in tfidf.collect(): print(each) print("tfidfIgnore:") for each in tfidfIgnore.collect():
# Drop the three header lines from the first partition, then rebuild each
# document as (doc_id, "word word ...").
doc_wo_counters = documents.mapPartitionsWithIndex(
    lambda i, iter: islice(iter, 3, None) if i == 0 else iter)
final_doc = doc_wo_counters.map(
    lambda x: (int(x[0]), doc_to_words(int(x[1]), int(x[2])).encode("utf8"))
).reduceByKey(lambda x, y: x + " " + y)
vect_rep = final_doc.map(lambda x: x[1])

# NOTE: the two lines below overwrite vect_rep, so the pipeline above is
# effectively unused; the tokens actually come from test.txt.
raw_document = sc.textFile("test.txt")
vect_rep = raw_document.map(lambda line: line.encode("utf8").split(" "))

# TF-IDF
hashingTF = HashingTF()
tf = hashingTF.transform(vect_rep)
tf.cache()
idf = IDF().fit(tf)
tfidf_vectors = idf.transform(tf)

# Build the model (cluster the data)
clusters = KMeans.train(tfidf_vectors, 10, maxIterations=100)

# Evaluate clustering by computing the Within Set Sum of Squared Errors
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point.toArray() - center)]))

WSSSE = tfidf_vectors.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error = " + str(WSSSE))

# Save and load the model
clusters.save(sc, "myModelPath")
sameModel = KMeansModel.load(sc, "myModelPath")
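Once the model is reloaded, assigning a cluster to an unseen document reuses the same HashingTF and the fitted IDF model. A minimal sketch (new_doc is hypothetical):

# Sketch: cluster assignment for a new document.
new_doc = "some unseen text to place into a cluster"
new_tfidf = idf.transform(hashingTF.transform(new_doc.split(" ")))
print("Assigned cluster: " + str(sameModel.predict(new_tfidf)))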
    .map(lambda line: line.split(" "))\
    .map(lambda x: filter_word(x))\
    .map(lambda x: (0.0, x))
documents_train = documents.union(documents_neg)
labels = documents_train.map(lambda x: x[0])
train_set = documents_train.map(lambda x: x[1])

hashingTF = HashingTF()
tf = hashingTF.transform(train_set)
tf.cache()
idf = IDF(minDocFreq=2).fit(tf)
tfidf = idf.transform(tf)

# Pair each label with its TF-IDF vector as a LabeledPoint
training = labels.zip(tfidf).map(lambda x: LabeledPoint(x[0], x[1]))
model = NaiveBayes.train(training)

######### Calculate TF-IDF on the test data ########
### test_pos data ###
documents_t_RDD = sc.textFile("/Users/tracy/msan-ml/hw2/aclImdb/test_pos.txt")
# This command is for running on EMR connecting to S3
# documents_RDD = sc.textFile("s3n://aml-aml/test_pos.txt")
documents_t = documents_t_RDD.map(lambda x: x.replace(',', ' ').replace('.', ' ').replace('-', ' ').lower())\
    .map(lambda line: line.split(" "))\
def init_tranining_set(sc):
    """
    Merge the positive/negative corpora.
    param: sc  SparkContext
    """
    # Build an RDD from the positive-sentiment text
    positive_file1 = os.path.join(settings.DATA_DIR, '分类词库/positive.txt')
    positive_data1 = sc.textFile(positive_file1)
    # De-duplicate
    positive_data1 = positive_data1.distinct()
    positive_data1 = positive_data1.map(lambda line: line.split('###')).filter(
        lambda line: len(line) == 2)

    #positive_file2 = os.path.join(settings.DATA_DIR, 'new_post.txt')
    #positive_data2 = sc.textFile(positive_file2)
    ## De-duplicate
    #positive_data2 = positive_data2.distinct()
    #positive_data2 = positive_data2.map(lambda line : line.split('###')).filter(lambda line : len(line)==2)
    #positive_data = positive_data1.union(positive_data2)
    #positive_data.repartition(1)

    # Build an RDD from the negative-sentiment text
    negative_file1 = os.path.join(settings.DATA_DIR, '分类词库/negative.txt')
    negative_data1 = sc.textFile(negative_file1)
    negative_data1 = negative_data1.distinct()
    # Keep only well-formed "label###text" rows
    negative_data1 = negative_data1.map(lambda line: line.split('###')).filter(
        lambda line: len(line) == 2)

    #negative_file2 = os.path.join(settings.DATA_DIR, 'new_negi.txt')
    #negative_data2 = sc.textFile(negative_file2)
    #negative_data2 = negative_data2.distinct()
    ## Keep only well-formed rows
    #negative_data2 = negative_data2.map(lambda line : line.split('###')).filter(lambda line : len(line)==2)
    #negative_data = negative_data1.union(negative_data2)
    #negative_data.repartition(1)

    positive_data = positive_data1
    negative_data = negative_data1
    print negative_data.count()
    print positive_data.count()

    # Merge into one training corpus
    all_data = negative_data.union(positive_data)
    all_data.repartition(1)
    # Ratings were normalized upstream to just -1 and 1
    rate = all_data.map(lambda s: s[0])
    document = all_data.map(lambda s: s[1])

    # Tokenize with jieba in search-engine mode
    words = document.map(
        lambda w: "/".join(jieba.cut_for_search(w))
    ).map(lambda line: line.split("/"))

    # Term-frequency matrix (cache before the two IDF passes)
    hashingTF = HashingTF()
    tf = hashingTF.transform(words)
    tf.cache()
    # TF-IDF matrix
    idfModel = IDF().fit(tf)
    tfidf = idfModel.transform(tf)

    # Build the training and test sets
    zipped = rate.zip(tfidf)
    data = zipped.map(lambda line: LabeledPoint(line[0], line[1]))
    training, test = data.randomSplit([0.6, 0.4], seed=0)

    # Train the Naive Bayes model and measure its accuracy
    NBmodel = NaiveBayes.train(training, 1.0)
    predictionAndLabel = test.map(lambda p: (NBmodel.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(
        lambda x: x[0] == x[1]).count() / test.count()

    # Save the tokenized corpus as text
    words.repartition(1).saveAsTextFile("traning_words")
    # Save the Naive Bayes model with pickle
    with open('NBmodel.pkl', 'w') as f:
        pickle.dump(NBmodel, f)
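Pickling only captures the Python wrapper object, so it may not round-trip reliably across Spark versions; MLlib's own persistence is the safer option. A minimal sketch (path hypothetical):

# Sketch: native MLlib save/load instead of pickle.
from pyspark.mllib.classification import NaiveBayesModel

NBmodel.save(sc, "nb_model_path")
restored = NaiveBayesModel.load(sc, "nb_model_path")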
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.feature import IDF

# Load documents (one per line) and tokenize on spaces.
doc = sc.textFile("target url").map(lambda line: line.split(' '))

hashingTF = HashingTF()
# indexOf returns the hash bucket a term is mapped to.
company_idx = hashingTF.indexOf("COMPANY NAME")

tf = hashingTF.transform(doc)
idf = IDF().fit(tf)
tfidf = idf.transform(tf).collect()
print(tfidf)
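indexOf only yields the hash bucket; to read that term's weight per document, index each transformed vector with it. A short sketch reusing the names above:

# Sketch: per-document TF-IDF weight of the chosen term.
company_weights = idf.transform(tf).map(lambda v: v[company_idx])
print(company_weights.collect())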
spark = SparkSession(sc)
script_dir = os.path.dirname(__file__)

#training = "training.1600000.processed.noemoticon.csv"
#testing = "testdata.manual.2009.06.14.csv"
filename = "train.csv"
#testing = "test.csv"
abs_file_path = os.path.join(script_dir, filename)
#abs_file_path_testing = os.path.join(script_dir, testing)
raw_data = sc.textFile(abs_file_path)
#rawTestingData = sc.textFile(abs_file_path_testing)

# The first CSV field is the quoted label; the sixth is the tweet text.
label_text = raw_data.map(lambda x: (float(x.split(",")[0][1]),
                                     x.split(",")[5].encode('ascii', 'ignore')))

hashingTF = HashingTF()
# test = rawTestingData.map(lambda x: (float(x.split(",")[0][1]), x.split(",")[5].encode('ascii', 'ignore')))

# Tokenize before hashing: HashingTF expects a sequence of terms, and a raw
# string would be hashed character by character.
feature_htf = label_text.map(lambda tup: hashingTF.transform(tup[1].split()))
feature_idf = IDF().fit(feature_htf)
featured = feature_idf.transform(feature_htf)
label = label_text.map(lambda tup: tup[0])

# Key both RDDs by row index and join them back into (label, features) pairs.
featured_idx = featured.zipWithIndex()
label_idx = label.zipWithIndex()
idx_featured = featured_idx.map(lambda x: (x[1], x[0]))
idx_label = label_idx.map(lambda x: (x[1], x[0]))
label_feature = idx_label.join(idx_featured).map(lambda x: x[1])
label_feature_LabeledPoint = label_feature.map(lambda x: LabeledPoint(x[0], x[1]))

for i in label_feature_LabeledPoint.collect():
    print i
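The zipWithIndex/join dance pairs labels with feature vectors by row number. When both RDDs derive from the same parent without reordering, as here, a direct zip is the usual shortcut; a sketch under that assumption:

# Sketch: direct zip of labels and TF-IDF features.
label_feature = label.zip(featured).map(lambda lf: LabeledPoint(lf[0], lf[1]))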
# Boilerplate Spark stuff:
conf = SparkConf().setMaster("local").setAppName("SparkTFIDF")
sc = SparkContext(conf=conf)

# Load documents (one per line), tab-separated: column 3 is the body,
# column 1 the document name.
rawData = sc.textFile("subset-small.tsv")
fields = rawData.map(lambda x: x.split("\t"))
documents = fields.map(lambda x: x[3].split(" "))
documentNames = fields.map(lambda x: x[1])

# 100K hash buckets just to save some memory
hashingTF = HashingTF(100000)
tf = hashingTF.transform(documents)
tf.cache()
idf = IDF(minDocFreq=2).fit(tf)
tfidf = idf.transform(tf)

# Hash the keyword into its bucket, then read that bucket's TF-IDF weight
# out of every document vector.
keywordTF = hashingTF.transform(["Apollo"])
keywordHashValue = int(keywordTF.indices[0])
keywordRelevance = tfidf.map(lambda x: x[keywordHashValue])

# Pair each relevance score with its document name; max() picks the highest
# score because tuples compare on their first element.
zippedResults = keywordRelevance.zip(documentNames)
print "Best document for keywords is:"
print zippedResults.max()
hashingTF = HashingTF(dim)
tf = hashingTF.transform(tokens)
tf.cache()

# Inspect the first raw term-frequency vector
v = tf.first()
print(v.size)
print(v.values)
print(v.indices)

idf = IDF().fit(tf)
tfidf = idf.transform(tf)

# Inspect the first TF-IDF vector
v2 = tfidf.first()
print(v2.size)
print(v2.values)
print(v2.indices)

# Global minimum and maximum of the non-zero TF-IDF weights.
# (Reducing the (min, max) pairs directly with min/max would compare the
# tuples lexicographically and can return the wrong maximum.)
minMaxVals = tfidf.map(lambda v: (min(v.values), max(v.values)))
globalMin = minMaxVals.map(lambda p: p[0]).reduce(min)
globalMax = minMaxVals.map(lambda p: p[1]).reduce(max)
globalMinMax = (globalMin, globalMax)

### Using a TF-IDF model
hockeyText = rdd.filter(lambda (file, text): file.find("hockey") != -1)
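Presumably globalMinMax is computed so the TF-IDF weights can be rescaled; that use is an assumption rather than part of the original, but a minimal min-max scaling sketch would look like this:

# Sketch (assumed intent): min-max scale the non-zero TF-IDF values to [0, 1].
from pyspark.mllib.linalg import SparseVector

lo, hi = globalMinMax
span = (hi - lo) or 1.0  # guard against a zero range
scaled = tfidf.map(lambda v: SparseVector(
    v.size, v.indices, [(x - lo) / span for x in v.values]))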