def column_search(words, row_filter):
    if row_filter == 'n' or row_filter == 'N':
        min_row = 0
    else:
        min_row = row_filter
    rawData = table_cols.join(master_index, master_index["Table_Name"] == table_cols["Name"]).rdd
    data = rawData.map(lambda x: (x['Doc_ID'], x['Columns'])).map(parse)
    titles = data.map(lambda x: x[0])
    documents = data.map(lambda x: x[1])
    hashingTF = HashingTF()
    tf = hashingTF.transform(documents)
    tf.cache()
    idf = IDF().fit(tf)
    normalizer = Normalizer()
    tfidf = normalizer.transform(idf.transform(tf))
    tfidfData = titles.zip(tfidf).toDF(["label", "features"])
    query = parse((0, words))[1]
    queryTF = hashingTF.transform(query)
    queryTFIDF = normalizer.transform(idf.transform(queryTF))
    queryRelevance = tfidfData.rdd.map(lambda x: (x[0], float(x[1].dot(queryTFIDF)))).sortBy(lambda x: -x[1]).filter(lambda x: x[1] > 0)
    queryRelevance = queryRelevance.toDF(["Doc_ID", "scores"])
    queryRelevance = queryRelevance.join(table_desc, queryRelevance.Doc_ID == table_desc.Doc_ID).select(table_desc.Doc_ID, queryRelevance.scores, table_desc.Columns)
    queryRelevance = queryRelevance.join(master_index, master_index.Doc_ID == queryRelevance.Doc_ID).select(master_index.Table_Name, master_index.Table_Length, queryRelevance.Columns, queryRelevance.scores)
    queryRelevance = queryRelevance.rdd.filter(lambda x: int(x['Table_Length']) >= int(min_row))
    if queryRelevance.isEmpty():
        print("Sorry, nothing matched in column search, please try a different keyword")
    else:
        print("Here is your column search result")
        queryRelevance.toDF().show()
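# Note: `parse` above is external to this snippet. A minimal sketch of what it
# is assumed to do -- lowercase and tokenize the text half of an (id, text)
# pair, keeping the id -- is shown below; the real helper may differ.
import re

def parse(pair):
    doc_id, text = pair
    # keep only lowercase alphanumeric tokens
    return (doc_id, re.findall(r"[a-z0-9]+", str(text).lower()))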
def run_tf_idf_spark_mllib(df, numFeatures=1 << 20):
    tokenizer = Tokenizer(inputCol="body", outputCol="words")
    wordsData = tokenizer.transform(df)
    words = wordsData.select("words").rdd.map(lambda x: x.words)
    hashingTF = MllibHashingTF(numFeatures)
    tf = hashingTF.transform(words)
    tf.cache()
    idf = MllibIDF().fit(tf)
    tfidf = idf.transform(tf)
    # @TODO make this nicer
    tmp = sqlContext.createDataFrame(wordsData.rdd.zip(tfidf), ["data", "features"])
    tmp.registerTempTable("tmp")
    old_columns = ', '.join(map(lambda x: 'data.%s' % x, wordsData.columns))
    with_features = sqlContext.sql("SELECT %s, features FROM tmp" % old_columns)
    tmp = sqlContext.createDataFrame(with_features.rdd.zip(tf), ["data", "rawFeatures"])
    tmp.registerTempTable("tmp")
    old_columns = ', '.join(map(lambda x: 'data.%s' % x, with_features.columns))
    return sqlContext.sql("SELECT %s, rawFeatures FROM tmp" % old_columns)
def init_tranining_set(sc):
    """
    Merge the positive/negative sentiment vocabularies.

    param: sc -- the SparkContext
    """
    words = sc.textFile('traning_words.csv')
    # Build the term-frequency matrix
    hashingTF = HashingTF()
    tf = hashingTF.transform(words)
    # Compute the TF-IDF matrix
    idfModel = IDF().fit(tf)
    tfidf = idfModel.transform(tf)
    tf.cache()
    with open('NBmodel.pkl', 'r') as f:
        NBmodel = pickle.load(f)
    session = get_session(settings.DB_URL)
    for r in session.execute('select * from traning_collection').fetchall():
        yourDocument = r[3]
        print r[3]
        yourwords = "/".join(jieba.cut_for_search(yourDocument)).split("/")
        yourtf = hashingTF.transform(yourwords)
        yourtfidf = idfModel.transform(yourtf)
        print('NaiveBayes Model Predict:', NBmodel.predict(yourtfidf))
def main(sc):
    stopset = set(stopwords.words('english'))
    tweets = sc.textFile('hdfs:/adi/sample.txt')
    words = tweets.map(lambda word: word.split(" "))
    wordArr = []
    for wArr in words.collect():
        tempArr = []
        for w in wArr:
            if not w in stopset:
                tempArr.append(w)
        wordArr.append(tempArr)
    # print wordArr
    # tokens = sc.textFile("hdfs:/adi/tokens1.txt")
    # Load documents (one per line).
    documents = sc.textFile("hdfs:/adi/tokens1.txt").map(lambda line: line.split(" "))
    numDims = 100000
    hashingTF = HashingTF(numDims)
    tf = hashingTF.transform(documents)
    tf.cache()
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf)
    tfidf.count()
    model = KMeans.train(tfidf, 5)
    model.save(sc, "tweetModel1")
    print("Final centers: " + str(model.clusterCenters))
    # print("Total Cost: " + str(model.computeCost(data)))
    sc.stop()
def analyse_data(self, data):
    """
    Run the appropriate analysis on the input data.

    param data: file, unicode, or str
    """
    words = self.sc.textFile(self.training_words_dir)
    # Build the term-frequency matrix
    hashingTF = HashingTF()
    tf = hashingTF.transform(words)
    # Compute the TF-IDF matrix
    idfModel = IDF().fit(tf)
    tfidf = idfModel.transform(tf)
    tf.cache()
    with open(self.NBmodel, 'r') as f:
        NBmodel = pickle.load(f)
    # Tokenize first, then analyse
    yourwords = set("/".join(jieba.cut_for_search(data)).split("/"))
    print 'Tokenization result: {}'.format(yourwords)
    yourtf = hashingTF.transform(yourwords)
    yourtfidf = idfModel.transform(yourtf)
    return NBmodel.predict(yourtfidf), data
def get_feature_vectors(sc, input_file, feature_dimensions):
    """Get feature vectors from the lines in input_file_obj using TF/IDF.

    Returns:
        vectors RDD
    """
    # Load documents (one per line).
    tweet_file = sc.textFile(input_file)
    input_text_rdd = tweet_file.map(lambda line: _tokenize(line))
    input_text_rdd.cache()
    # The default feature dimension is 2^20; for a corpus with a million
    # tweets the recommended dimensions are 50000 or 100000. Use higher
    # dimensions for a larger corpus of tweets.
    hashing_tf = HashingTF(feature_dimensions)
    tf = hashing_tf.transform(input_text_rdd)
    tf.cache()
    idf = IDF(minDocFreq=2).fit(tf)
    tfidf = idf.transform(tf)
    tfidf.cache()
    return input_text_rdd, tfidf
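# Note: `_tokenize` is not defined in this snippet. A plausible minimal
# version, assuming simple lowercase word tokenization of a tweet:
import re

def _tokenize(line):
    # lowercase and keep word-like tokens (hashtags/mentions included)
    return re.findall(r"[a-z0-9#@]+", line.lower())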
def main():
    # Initialize the SparkContext
    sc = spark_context(spark_master)
    # Read the input file
    data = sc.textFile(hdfs_path)
    # Tokenize
    documents = data.map(tokenize)
    documents.cache()
    # TF
    hashingTF = HashingTF()
    tf = hashingTF.transform(documents)
    # IDF
    idf = IDF(minDocFreq=2).fit(tf)
    # TF-IDF
    tfidf = idf.transform(tf)
    # Connect to MongoDB
    from pymongo import MongoClient
    mongo_client = MongoClient(mongo_host)
    mongo_client.admin.authenticate(mongo_user, mongo_pass, mechanism='SCRAM-SHA-1')
    clear_mongodb(mongo_client)
    # zip
    term_tfidf = documents.zip(tfidf).map(doc_tfidf)
    articles = term_tfidf.flatMap(lambda i: i).reduceByKey(lambda x, y: x + y)
    for article in articles.collect():
        item = {}
        item['text'] = article[0].encode('utf-8')
        item['size'] = int(article[1] * 10)
        send_mongodb(mongo_client, item)
def main():
    # Read the json file
    reviews_data = sqlContext.read.json(input)
    reviews = reviews_data.select('reviewText')
    reviews_rdd = reviews.rdd.cache()
    rdd_data = reviews_rdd.map(lambda line: str(line.reviewText))
    transformed_data = rdd_data.map(transform_data)
    # Find the TF-IDF representation
    hashingTF = HashingTF()
    tf = hashingTF.transform(transformed_data)
    tf.cache()
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf).collect()
    # Normalization
    # tfidf = idf.transform(tf)
    # normalizer1 = Normalizer()
    # normalized_vector = normalizer1.transform(tfidf).collect()
    score_rdd = reviews_data.rdd.map(lambda line: str(line.overall)).cache().collect()
    dates_rdd = reviews_data.rdd.map(lambda line: str(line.reviewTime)).map(lambda line: line.split(", ")).map(lambda (a, b): b).cache().collect()
    combinedList = zip(tfidf, score_rdd, dates_rdd)
    combinedRDD = sc.parallelize(combinedList).cache()
    TrainRDD = combinedRDD.filter(lambda (x, y, z): z != '2014').map(lambda (x, y, z): (x, y))
    TestRDD = combinedRDD.filter(lambda (x, y, z): z == '2014').map(lambda (x, y, z): (x, y))
    # Save the test and training data
    TrainRDD.saveAsPickleFile(output + '/Train_data_unnormalized.pickle')
    TestRDD.saveAsPickleFile(output + '/Test_data_unnormalized.pickle')
def tfidf(self):
    self._create_rdd()
    hashingTF = HashingTF()
    tf = hashingTF.transform(self.token_rdd)
    idf = IDF(minDocFreq=2).fit(tf)
    tfidf = idf.transform(tf)
    return tfidf
def process_data(data):
    print("Processing data ...")
    if not data.isEmpty():
        nbModel = bc_model.value
        hashingTF = HashingTF(100000)
        tf = hashingTF.transform(data.map(lambda x: x[0].encode('utf-8', 'ignore')))
        tf.cache()
        idf = IDF(minDocFreq=2).fit(tf)
        tfidf = idf.transform(tf)
        tfidf.cache()
        prediction = nbModel.predict(tfidf)
        temp = []
        i = 0
        for p, q, r in data.collect():
            temp.append([])
            temp[i].append(p.encode('utf-8', 'ignore'))
            temp[i].append(q)
            temp[i].append(r)
            i += 1
        i = 0
        for p in prediction.collect():
            temp[i].append(p)
            i += 1
        print(temp)
        for i in temp:
            insert_tweet(str(i[0]), str(i[1]), "0", int(i[3]), int(i[2]))
    else:
        print("Empty RDD !!!")
        pass
def TFIDF(source, destination):
    if destination[-1] != '/':
        destination = destination + '/'
    ## typically define the source message
    rdd = sc.wholeTextFiles(source).map(lambda (name, text): text.split())
    tf = HashingTF()
    tfVectors = tf.transform(rdd).cache()
    a = tfVectors.collect()
    # Store the TF values above in individual files, one per link
    ind = 0
    for vector in a:
        dest_path = destination + "TF_%d" % ind + ".txt"
        ind = ind + 1
        file = open(dest_path, 'w')
        file.write(str(vector))
        file.close()
    # Calculate IDF values for each case.
    idf = IDF()
    idfModel = idf.fit(tfVectors)
    tfIdfVectors = idfModel.transform(tfVectors)
    # Write TF-IDF values to a single file.
    file = open(destination + "TF-IDF.txt", 'w')
    file.write(str(tfIdfVectors.collect()))
    try:
        for i in range(0, 100):
            print ""  # Testing Printing
    except KeyboardInterrupt:
        pass
def _compute_tfid(texts: RDD) -> RDD:
    tf = HashingTF().transform(texts.map(lambda t: t.words))
    tf.cache()
    idf = IDF().fit(tf)
    tfidfs = idf.transform(tf)
    text_tfs = texts.zip(tfidfs)
    # returns the texts with their tfidf vectors attached, not the IDF model
    return text_tfs.map(lambda t: t[0].set_tfidf(t[1]))
def training_set(sc, numFeatures,
                 pos_file="data/training_positif_clean.csv",
                 neg_file="data/training_negatif_clean.csv"):
    """
    Input : number of retained features in the tweet-term structure
    Output : normalized tweet-term format training set
             IDF model (that will be used in the test phase)
    """
    text_negative = sc.textFile(neg_file)
    text_positive = sc.textFile(pos_file)
    train_text = text_negative.union(text_positive)
    train_labels = text_negative.map(lambda x: 0.0).union(text_positive.map(lambda x: 1.0))
    tf = HashingTF(numFeatures=numFeatures).transform(train_text.map(lambda x: x))
    idf = IDF().fit(tf)
    train_tfidf = idf.transform(tf)
    training = train_labels.zip(train_tfidf).map(lambda x: LabeledPoint(x[0], x[1]))
    return (training, idf)
def hashing(self, size):
    self.hashing_TF = HashingTF(size)  # 100K hash buckets just to save some memory
    tf = self.hashing_TF.transform(self.documents)
    tf.cache()
    idf = IDF(minDocFreq=2).fit(tf)
    self.tfidf = idf.transform(tf)
def get_tfidf_features(txt_rdd):
    hashingTF = HashingTF()
    tf = hashingTF.transform(txt_rdd)
    tf.cache()
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf)
    return tfidf
def tfidf(rdd_doc):
    hashingTF = HashingTF()
    trainTf = hashingTF.transform(rdd_doc)
    trainTf.cache()
    idf = IDF().fit(trainTf)
    trainTfidf = idf.transform(trainTf)
    trainTfidf.cache()
    return trainTfidf, lambda x: hashingTF.indexOf(x)
def tfidf(self):
    tf = HashingTF().transform(self._sents)
    self._tf = tf
    tf.cache()
    idf = IDF().fit(tf)
    self.idf = idf
    tfidf = idf.transform(tf)
    self._tfidf = dict(enumerate(tfidf.collect()))
def create_bayes(self):
    """Build and train the Naive Bayes classification model."""
    if self._check_traning_exists():
        return
    # Build an RDD from the positive texts
    positive_file = os.path.join(settings.DATA_DIR, '分类词库/positive.txt')
    positive_data = self.sc.textFile(positive_file)
    # De-duplicate the data
    positive_data = positive_data.distinct()
    positive_data = positive_data.map(
        lambda line: line.split('###')).filter(lambda line: len(line) == 2)
    # Build an RDD from the negative texts
    negative_file = os.path.join(settings.DATA_DIR, '分类词库/negative.txt')
    negative_data = self.sc.textFile(negative_file)
    negative_data = negative_data.distinct()
    negative_data = negative_data.map(
        lambda line: line.split('###')).filter(lambda line: len(line) == 2)
    # Merge into a single training set
    all_data = negative_data.union(positive_data)
    all_data.repartition(1)
    # Ratings were preprocessed beforehand to be only -1 or 1
    rate = all_data.map(lambda s: s[0])
    document = all_data.map(lambda s: s[1])
    words = document.map(lambda w: "/".join(jieba.cut_for_search(w))) \
                    .map(lambda line: line.split("/"))
    # Build the term-frequency matrix
    hashingTF = HashingTF()
    tf = hashingTF.transform(words)
    # Compute the TF-IDF matrix
    idfModel = IDF().fit(tf)
    tfidf = idfModel.transform(tf)
    tf.cache()
    # Generate the training and test sets
    zipped = rate.zip(tfidf)
    data = zipped.map(lambda line: LabeledPoint(line[0], line[1]))
    training, test = data.randomSplit([0.6, 0.4], seed=0)
    # Train the Naive Bayes classification model
    NBmodel = NaiveBayes.train(training, 1.0)
    predictionAndLabel = test.map(lambda p: (NBmodel.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(lambda x: 1.0
        if x[0] == x[1] else 0.0).count() / test.count()
    # Persist the words RDD
    words.repartition(1).saveAsTextFile(self.training_words_dir)
    # Persist the Naive Bayes model with pickle
    with open(self.NBmodel, 'w') as f:
        pickle.dump(NBmodel, f)
def process(reviews):
    if reviews.isEmpty():
        pass
    else:
        start = time.time()
        # get reviews with overall rating > 3 and overall rating < 3
        pos_reviews = reviews.filter(lambda x: x[0] > 3.0)
        neg_reviews = reviews.filter(lambda x: x[0] < 3.0)
        # set the label for each class: 0.0 is positive, 1.0 is negative
        review_labels = reviews.map(lambda x: 0.0 if x[0] > 3.0 else 1.0)
        Words = Row('label', 'words')
        words = reviews.map(lambda r: Words(*r))
        words_df = spark.createDataFrame(words)
        # reviews tokenization
        token = RegexTokenizer(minTokenLength=2, pattern="[^A-Za-z]+",
                               inputCol="words", outputCol="token", toLowercase=True)
        token_filtered = token.transform(words_df)
        # stopwords elimination
        remover = StopWordsRemover(inputCol="token", outputCol="stopwords", caseSensitive=False)
        stopwords_filtered = remover.transform(token_filtered)
        prep_filtered = (stopwords_filtered.select('stopwords').rdd).map(lambda x: x[0])
        # tf-idf calculation
        tf = HashingTF(numFeatures=numFeatures).transform(
            prep_filtered.map(porter_stem, preservesPartitioning=True))
        idf = IDF().fit(tf)
        train_tfidf = idf.transform(tf)
        # set up the training dataset with labels
        training = review_labels.zip(train_tfidf).map(lambda x: LabeledPoint(x[0], x[1]))
        # train the model classifier
        model = SVMWithSGD.train(training, iterations=100)
        model_name = "svm" + str(counter_model)
        # save the model classifier to HDFS
        output_dir = "hdfs://VM10-1-0-14:9000/classifier/" + model_name
        model.save(sc, output_dir)
        counter_model.add(1)
        end = time.time()
        print("Model Name : ", model_name, ", Total Reviews : ", reviews.count(),
              "Processing Time : ", (end - start))
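# Note: `porter_stem` is assumed by `process` above but not shown. A sketch
# using NLTK's PorterStemmer, mapping one token list per review:
from nltk.stem.porter import PorterStemmer

_stemmer = PorterStemmer()

def porter_stem(words):
    # stem each token of a single review
    return [_stemmer.stem(w) for w in words]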
def tfidf(self, tokenizer):
    """
    Get the TF-IDF matrix RDD with Spark's TF-IDF functions.
    """
    self._create_rdd(tokenizer)
    hashingTF = HashingTF()
    tf = hashingTF.transform(self.token_rdd)
    idf = IDF(minDocFreq=2).fit(tf)
    tfidf = idf.transform(tf)
    return self.rdd, idf, tfidf
def tf_idf(sc, title_token):
    hashingTF = HashingTF(100)
    title_token = sc.parallelize(title_token)
    tf = hashingTF.transform(title_token)
    print tf, ' tf'
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf)
    return tfidf
def test_idf_model(self):
    data = [
        Vectors.dense([1, 2, 6, 0, 2, 3, 1, 1, 0, 0, 3]),
        Vectors.dense([1, 3, 0, 1, 3, 0, 0, 2, 0, 0, 1]),
        Vectors.dense([1, 4, 1, 0, 0, 4, 9, 0, 1, 2, 0]),
        Vectors.dense([2, 1, 0, 3, 0, 0, 5, 0, 2, 3, 9])
    ]
    model = IDF().fit(self.sc.parallelize(data, 2))
    idf = model.idf()
    self.assertEqual(len(idf), 11)
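# For reference, spark.mllib's IDF uses idf(t) = log((m + 1) / (d(t) + 1)),
# where m is the number of documents and d(t) the number of documents
# containing term t. A minimal standalone check, assuming an active
# SparkContext `sc`:
import math
from pyspark.mllib.feature import IDF
from pyspark.mllib.linalg import Vectors

data = [Vectors.dense([1.0, 0.0]), Vectors.dense([1.0, 1.0])]
idf_vec = IDF().fit(sc.parallelize(data, 2)).idf()
assert abs(idf_vec[0] - math.log(3.0 / 3.0)) < 1e-9  # term in both docs -> 0
assert abs(idf_vec[1] - math.log(3.0 / 2.0)) < 1e-9  # term in one doc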
def process(reviews):
    if reviews.isEmpty():
        pass
    else:
        model_name = "dt"
        updated_model = "dt0"
        model_path, data_path, metadata_path = '', '', ''
        # loop to check the availability of a newer model classifier
        for i in range(25, -1, -1):
            model_path = "hdfs://VM10-1-0-14:9000/classifier/" + model_name + str(i)
            updated_model = model_name + str(i)
            data_path = model_path + "/data/part-r*"
            metadata_path = model_path + "/metadata/part-00000"
            if patherror(data_path) == False and patherror(metadata_path) == False:
                break
        # load the model classifier
        model = DecisionTreeModel.load(sc, model_path)
        start = time.time()
        reviews_label = reviews.map(lambda x: 0.0 if x[0] > 3.0 else 1.0)
        Words = Row('label', 'words')
        words = reviews.map(lambda r: Words(*r))
        words_df = spark.createDataFrame(words)
        # review tokenization
        token = RegexTokenizer(minTokenLength=2, pattern="[^A-Za-z]+",
                               inputCol="words", outputCol="token", toLowercase=True)
        token_filtered = token.transform(words_df)
        # stopwords elimination
        remover = StopWordsRemover(inputCol="token", outputCol="stopwords", caseSensitive=False)
        stopwords_filtered = remover.transform(token_filtered)
        prep_filtered = (stopwords_filtered.select('stopwords').rdd).map(lambda x: x[0])
        # tf-idf calculation
        tf = HashingTF(numFeatures=numFeatures).transform(
            prep_filtered.map(porter_stem, preservesPartitioning=True))
        idf = IDF().fit(tf)
        tfidf = idf.transform(tf)
        prediction = model.predict(tfidf)
        labeled_prediction = reviews_label.zip(prediction).map(lambda x: (float(x[1]), x[0]))
        metrics = MulticlassMetrics(labeled_prediction)
        output = reviews.zip(prediction)
        filename = "hdfs://VM10-1-0-14:9000/output/" + re.sub('[^0-9]', '', str(datetime.now())) + ".out"
        output.saveAsTextFile(filename)
        end = time.time()
        print(updated_model, ';', reviews.count(), ';', metrics.accuracy, ';',
              metrics.precision(0.0), ';', metrics.precision(1.0), ';',
              metrics.recall(0.0), ';', metrics.recall(1.0), ';',
              metrics.fMeasure(0.0), ';', metrics.fMeasure(1.0), ';', (end - start))
def generate_tf_idf(twProfilesRdd, numFe):
    """
    Generate a TF-IDF tuple (gender, sparse vector) from an rdd containing
    tuples of the form: (gender, (clean words tuple))
    """
    gtlp = generate_gender_tf(twProfilesRdd, numFe)
    idf = IDF()
    tfVectorsRDD = gtlp.map(lambda tp: tp[1])
    idfModel = idf.fit(tfVectorsRDD)
    idfRdd = idfModel.transform(tfVectorsRDD)
    return (idfRdd.zip(gtlp).map(lambda tp: (tp[1][0], tp[0])), idfModel)
def extractKeywords_Train(self):
    documents = self.sc.textFile(self.trainingfile).map(lambda line: line.split(" ")[1:])
    hashingTF = HashingTF()
    tf = hashingTF.transform(documents)
    tf.cache()
    idfIgnore = IDF(minDocFreq=2).fit(tf)
    tfidfIgnore = idfIgnore.transform(tf)
    tfidfIgnore.saveAsTextFile("AAA")
def tf_idf_cal(words_rdd):
    hashingTF = HashingTF()
    tf = hashingTF.transform(words_rdd)
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf).cache()
    tfidf_str = tfidf.map(lambda line: str(line)).cache()
    return tfidf_str
def vectorize(training):
    hashingTF = HashingTF()
    tf_training = training.map(lambda tup: hashingTF.transform(tup[1]))
    idf_training = IDF().fit(tf_training)
    tfidf_training = idf_training.transform(tf_training)
    tfidf_idx = tfidf_training.zipWithIndex()
    training_idx = training.zipWithIndex()
    idx_training = training_idx.map(lambda line: (line[1], line[0]))
    idx_tfidf = tfidf_idx.map(lambda l: (l[1], l[0]))
    joined_tfidf_training = idx_training.join(idx_tfidf)
    training_labeled = joined_tfidf_training.map(lambda tup: tup[1])
    labeled_training_data = training_labeled.map(lambda k: LabeledPoint(k[0][0], k[1]))
    return labeled_training_data
def predictSentiment(tweetText):
    nbModel = bc_model.value
    hashingTF = HashingTF()
    tf = hashingTF.transform(tweetText)
    tf.cache()
    idf = IDF(minDocFreq=2).fit(tf)
    tfidf = idf.transform(tf)
    tfidf.cache()
    prediction = nbModel.predict(tfidf)
    print "Predictions for this window :"
    for i in range(0, prediction.count()):
        print prediction.collect()[i], tweetText.collect()[i]
def calcTfidf(doc, source):
    """
    This method computes TF-IDF scores for the given document.
    While applying HashingTF only needs a single pass over the data, applying
    IDF needs two passes: first to compute the IDF vector and second to scale
    the term frequencies by IDF.
    """
    hashingTF = HashingTF(200000)
    tf = hashingTF.transform(doc)
    print "TF calculated for " + source.split('/')[-1]
    tf.cache()
    idf = IDF().fit(tf)
    ## idf = IDF(minDocFreq=2).fit(tf)
    tfidf = idf.transform(tf)
    print "TF-IDF calculated for " + source.split('/')[-1]
    return hashingTF, tfidf
def tf_idf(data):
    # This hashes each of the words / features using MurmurHash 3 and generates
    # an index for each, then calculates TF for each index
    # TODO : Need to check best numFeatures
    tf = HashingTF(numFeatures=10000).transform(data.map(lambda x: x.split()))
    # TF-IDF is calculated to understand how important a word is to a particular
    # document
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf)
    return tfidf
def training_set(pos_file, neg_file):
    text_negative = sc.textFile(neg_file)
    text_positive = sc.textFile(pos_file)
    train_text = text_negative.union(text_positive)
    train_labels = text_negative.map(lambda x: 0.0).union(
        text_positive.map(lambda x: 1.0))
    tf = HashingTF(numFeatures=10000).transform(train_text.map(lambda x: x))
    idf = IDF().fit(tf)
    train_tfidf = idf.transform(tf)
    training = train_labels.zip(train_tfidf).map(
        lambda x: LabeledPoint(x[0], x[1]))
    return (training, idf)
def create_tfidf(sc):
    # start = time.time()
    docs = sc.textFile(FILE0, 4).map(split_docs).cache()
    tags = docs.map(lambda doc: doc[1].split()).cache()
    tag = tags.map(lambda tags: tags[0])
    words = docs.map(lambda doc: doc[0].split())
    words = words.map(preProcess).cache()
    # id_tag = tag.zipWithIndex().map(swapOder)
    hashingTF = HashingTF()
    tf = hashingTF.transform(words)
    tf.cache()
    idf = IDF(minDocFreq=2).fit(tf)
    tfidf = idf.transform(tf).cache()
    # tfidf = tfidf.collect()
    return tfidf, tags
def normTFIDF(fn_tokens_RDD, vecDim, caching=True):
    keysRDD = fn_tokens_RDD.keys()
    tokensRDD = fn_tokens_RDD.values()
    # passing the vecDim value. TIP: you need a lambda.
    tfVecRDD = tokensRDD.map(lambda tokens: hashing_vectorize(tokens, vecDim))
    if caching:
        # since we will read more than once, caching in Memory will make things quicker.
        tfVecRDD.persist(StorageLevel.MEMORY_ONLY)
    idf = IDF()  # create IDF object
    idfModel = idf.fit(tfVecRDD)  # calculate IDF values
    tfIdfRDD = idfModel.transform(tfVecRDD)  # 2nd pass needed (see lecture slides), transforms RDD
    norm = Normalizer()  # create a Normalizer object like in the example linked above
    normTfIdfRDD = norm.transform(tfIdfRDD)  # and apply it to the tfIdfRDD
    zippedRDD = keysRDD.zip(normTfIdfRDD)  # zip the keys and values together
    return zippedRDD
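# Note: `hashing_vectorize` is referenced above but not defined here. A sketch
# that matches its use (token list + dimension -> term-frequency vector):
from pyspark.mllib.feature import HashingTF

def hashing_vectorize(tokens, vecDim):
    # HashingTF.transform on a plain list returns a single sparse vector
    return HashingTF(vecDim).transform(tokens)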
def parseTextRDDToIndex(self, data, label=True):
    if label:
        labels = data.map(lambda line: float(line.split(" ", 1)[0]))
        documents = data.map(lambda line: line.split(" ", 1)[1].split(" "))
    else:
        documents = data.map(lambda line: line.split(" "))
    tf = HashingTF().transform(documents)
    tf.cache()
    idfIgnore = IDF(minDocFreq=2).fit(tf)
    index = idfIgnore.transform(tf)
    if label:
        return labels.zip(index).map(
            lambda line: LabeledPoint(line[0], line[1]))
    else:
        return index
def mySpark(minFreq, keyWord):
    # text cleaning function
    def removePunctuation(text):
        res = text.lower().strip()
        res = re.sub("[^0-9a-zA-Z ]", "", res)
        return res.split(" ")

    # Function for printing each element in an RDD
    def println(x):
        for i in x:
            print i

    # Boilerplate Spark stuff:
    conf = SparkConf().setMaster("local").setAppName("SparkTFIDF")
    sc = SparkContext(conf=conf)
    # Load document contents (one per line) + cleaning.
    rawData = sc.textFile("list_berita-30.tsv")
    fields = rawData.map(lambda x: x.split("\t"))
    documents = fields.map(lambda x: removePunctuation(x[3]))
    # Get document contents without word mapping
    documentNames = fields.map(lambda x: x[3])
    # TF processing
    hashingTF = HashingTF(100000)  # 100K hash buckets just to save some memory
    tf = hashingTF.transform(documents)
    # IDF & TF-IDF processing
    tf.cache()
    idf = IDF(minDocFreq=int(minFreq)).fit(tf)
    tfidf = idf.transform(tf)
    # Get keyword relevance with content and zip it
    keywordTF = hashingTF.transform(removePunctuation(keyWord))
    keywordHashValue = int(keywordTF.indices[0])
    keywordRelevance = tfidf.map(lambda x: x[keywordHashValue])
    zippedResults = keywordRelevance.zip(documentNames)
    # print result
    print "Best document for keywords is:"
    print zippedResults.max()
def extract_features(self, feat='tfidf', **kwargs):
    """
    Converts each subtitle into its TF/TFIDF representation.
    Normalizes if necessary.

    Parameters
    --------
    feat: 'tf' or 'tfidf'.
    kwargs: num_features, minDocFreq, or other arguments to be passed
            to the MLlib objects.

    Returns
    --------
    RDD of features with key.
    """
    # transform BOW into TF vectors
    num_features = kwargs.get('num_features', 10000)
    htf = HashingTF(num_features)
    feat_rdd = self.RDD.mapValues(htf.transform).cache()
    # transform TF vectors into IDF vectors
    if feat == 'tfidf':
        keys, tf_vecs = feat_rdd.keys(), feat_rdd.values()
        minDocFreq = kwargs.get('minDocFreq', 2)
        idf = IDF(minDocFreq=minDocFreq)
        idf_model = idf.fit(tf_vecs)
        idf_rdd = idf_model.transform(tf_vecs.map(lambda vec: vec.toArray()))
        feat_rdd = keys.zip(idf_rdd)
    if self.model_type == 'log_reg':
        normalizer = StandardScaler(withMean=True, withStd=True)
        keys, vecs = feat_rdd.keys(), feat_rdd.values()
        norm_model = normalizer.fit(vecs)
        norm_rdd = norm_model.transform(vecs.map(lambda vec: vec.toArray()))
        feat_rdd = keys.zip(norm_rdd)
    return feat_rdd
def use_naive_bayes():
    """
    Running Naive Bayes from Spark's MLlib library
    """
    from pyspark.mllib.classification import NaiveBayes
    from pyspark.mllib.feature import HashingTF, IDF
    from pyspark.mllib.linalg import SparseVector, Vectors
    from pyspark.mllib.regression import LabeledPoint
    # loading the files
    path = "/Users/abhisheksingh29895/Desktop/courses/CURRENT/Advance_Machine_Learning/HW2/aclImdb/"
    train_pos = sc.textFile(path + "train/pos/*txt").map(lambda line: line.encode('utf8')).map(lambda line: line.split())
    train_neg = sc.textFile(path + "train/neg/*txt").map(lambda line: line.encode('utf8')).map(lambda line: line.split())
    test_pos = sc.textFile(path + "test/pos/*txt").map(lambda line: line.encode('utf8')).map(lambda line: line.split())
    test_neg = sc.textFile(path + "test/neg/*txt").map(lambda line: line.encode('utf8')).map(lambda line: line.split())
    # TF step
    tr_pos = HashingTF().transform(train_pos)
    tr_pos_idf = IDF().fit(tr_pos)
    tr_neg = HashingTF().transform(train_neg)
    tr_neg_idf = IDF().fit(tr_neg)
    te_pos = HashingTF().transform(test_pos)
    te_pos_idf = IDF().fit(te_pos)
    te_neg = HashingTF().transform(test_neg)
    te_neg_idf = IDF().fit(te_neg)
    # IDF step
    tr_pos_tfidf = tr_pos_idf.transform(tr_pos)
    tr_neg_tfidf = tr_neg_idf.transform(tr_neg)
    te_pos_tfidf = te_pos_idf.transform(te_pos)
    te_neg_tfidf = te_neg_idf.transform(te_neg)
    # Creating labels (1 = positive, 0 = negative)
    pos_label = [1] * 12500
    pos_label = sc.parallelize(pos_label)
    neg_label = [0] * 12500
    neg_label = sc.parallelize(neg_label)
    # Combine using zip
    train_pos_file = pos_label.zip(tr_pos_tfidf).map(lambda x: LabeledPoint(x[0], x[1]))
    train_neg_file = neg_label.zip(tr_neg_tfidf).map(lambda x: LabeledPoint(x[0], x[1]))
    test_pos_file = pos_label.zip(te_pos_tfidf).map(lambda x: LabeledPoint(x[0], x[1]))
    test_neg_file = neg_label.zip(te_neg_tfidf).map(lambda x: LabeledPoint(x[0], x[1]))
    # Join the two RDDs to form the final training and test sets
    train_file = train_pos_file.union(train_neg_file)
    test_file = test_pos_file.union(test_neg_file)
    # Fitting a Naive Bayes model
    model = NaiveBayes.train(train_file)
    # Make predictions and test accuracy
    predictionAndLabel = test_file.map(lambda p: (model.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test_file.count()
    print ""
    print "Test accuracy is {}".format(round(accuracy, 4))
sc = SparkContext()
rdd = sc.wholeTextFiles("/usr/local/Cellar/BigDataAdvanced/Assignment1/TwitterStuff/TweetData").map(lambda (name, text): text.split())
tf = HashingTF()
tfVectors = tf.transform(rdd).cache()
a = tfVectors.collect()
count = 0
for vec in a:
    print vec
    count = count + 1
    with open("TF_Tweet" + str(count) + ".txt", "w") as f:
        f.write(str(vec))
    f.close()
idf = IDF()
idfModel = idf.fit(tfVectors)
tfIdfVectors = idfModel.transform(tfVectors)
file = open("TF-IDF_tweet.txt", 'w')
file.write(str(tfIdfVectors.collect()))
# count = 0
# output = tfIdfVectors.collect()
# for vec in output:
#     print vec
#     count = count + 1
#     with open("TF_Wiki" + str(count) + ".txt", "w") as f:
#         f.write(str(vec))
#         f.close()
# remove the top 3 lines from the document
doc_wo_counters = documents.mapPartitionsWithIndex(
    lambda i, iter: islice(iter, 3, None) if i == 0 else iter)
final_doc = doc_wo_counters.map(lambda x: (int(x[0]), doc_to_words(int(x[1]), int(x[2])).encode("utf8"))).reduceByKey(lambda x, y: x + " " + y)
vect_rep = final_doc.map(lambda x: x[1])
raw_document = sc.textFile("test.txt")
vect_rep = raw_document.map(lambda line: line.encode("utf8").split(" "))
# TF-IDF
hashingTF = HashingTF()
tf = hashingTF.transform(vect_rep)
tf.cache()
idf = IDF().fit(tf)
tfidf_vectors = idf.transform(tf)
# Build the model (cluster the data)
clusters = KMeans.train(tfidf_vectors, 10, maxIterations=100)

# Evaluate clustering by computing the Within Set Sum of Squared Errors
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point.toArray() - center)]))

WSSSE = tfidf_vectors.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error = " + str(WSSSE))
# Save and load the model
clusters.save(sc, "myModelPath")
def distributed_ops(corpus, sanit=False, recall=False, corpred=False,
                    streams=False, segred=False, tfidf=False, lda=False,
                    word2vec=False, fin=None, segclust=None):
    # Return item for end results
    return_list = []
    ##########################################
    # Default actions:
    if segred:
        zipped_corpus = zip(segclust, corpus)
        # print zipped_corpus
    corpus = sc.parallelize(corpus).cache()
    if sanit or recall:
        corpus = corpus.map(lambda doc: preprocess(doc))
    # Here we "recover all" text, after having removed multi-ws & ws-pad punctuation
    # & replaced \n by NL etc... (see function "preprocess" above)
    # We use the same regex sub/filtration rules as in the implementation found
    # @ https://github.com/alexalemi/segmentation (from which we got the files in
    # directory: representation.py, tools.py and splitters.py, and on which
    # segmentSETxRes.py is based)
    if recall:
        return_list.append(recover_encoding(corpus.collect()))
    # Here we return only potentially "meaningful words" - see function "return_words" above
    # Keeps alpha-numeric (removes numeric and non-alphabetical/alphanumeric)
    corpus_distrib = corpus.map(lambda doc: return_words(doc))
    print 'Original number of docs in corpus {filtering *docs* for alpha(+alphanumeric)-only words}: %i' % corpus_distrib.count()
    # merge corpus docs into one continuous split text
    corpus_merge = []
    corpus_collect = corpus_distrib.collect()  # rdd2list
    for list_of_words in corpus_collect:
        corpus_merge.extend(list_of_words)  # list-of-wordslist2{single-wordslist}
    # use numpy functions to sort dict words based on term frequency
    corpus_merge_array = np.array(corpus_merge)
    corpus_merge_sorted = np.sort(corpus_merge_array)
    corpus_merge_unique, counts = np.unique(corpus_merge_sorted, return_counts=True)
    sort_ixs = np.argsort(counts)[::-1]
    counts = counts[sort_ixs]
    corpus_merge_unique = corpus_merge_unique[sort_ixs]
    return_list.append(corpus_merge_unique)
    return_list.append(counts)
    print
    for i, w in enumerate(corpus_merge_unique):
        print ('Counted word "%s" _%i_ many times.' % (w, counts[i]))
    print
    #########################################################################################
    # Next we split the text based on "verbosity/density/sparsity" as would
    # befit an articulate document (i.e. articles/papers/journal entries)
    # or more conversational/blog-entry-like/Q&A style/headings-only-
    # -retrieved website results.
    def error(point):
        center = clusters.centers[clusters.predict(point)]
        return sqrt(sum([x**2 for x in (point - center)]))

    # The following will further sanitize text.
    if corpred:
        # Use pretrained term frequencies:
        # Experimentally, the following clustering has helped us get rid of
        # irrelevant search engine text results.
        corpus2vec = corpus.map(lambda doc: genre_score(doc, type2=False))
        corpus2vec = corpus2vec.map(lambda doc: process_doc2vec_word_counts(doc)).cache()
        # print 'Corpus vectorized'
        # collected = corpus2vec.collect()
        tempor = corpus.collect()
        print
        print
        for i, vec in enumerate(corpus2vec.collect()):
            print 'Got vecs:'
            print vec
            print 'Of text:'
            print tempor[i].split()
        print
        print
        # choose 5 clusters
        clusters = KMeans.train(corpus2vec, 5, maxIterations=90, runs=10,
                                initializationMode="k-means||")
        WSSE = corpus2vec.map(lambda point: error(point)).reduce(lambda x, y: x + y)  # cumsum
        print
        print 'Within Set Sum of Squared Error = ' + str(WSSE)
        print 'The cluster centers:'
        print clusters.centers
        print
        print
        return_list.append(corpus2vec.map(lambda pt: clusters.predict(pt)).collect())
    # The following will cluster for article length + content
    if streams:
        corpus2vec = corpus.map(lambda doc: genre_score(doc, type2=True))
        temple = corpus.collect()
        print
        print
        for i, vec in enumerate(corpus2vec.collect()):
            print 'Got vecs:'
            print vec
            print 'Of text:'
            print temple[i].split()
        print
        print
        sumall = corpus2vec.reduce(lambda vecx, vecy: np.array([vecx[0] + vecy[0]]))
        corpus2vec = corpus2vec.map(lambda doc: process_doc2vec_word_counts(doc, normalizer=sumall)).cache()
        clusters = KMeans.train(corpus2vec, 5, maxIterations=90, runs=10,
                                initializationMode="k-means||")
        WSSE = corpus2vec.map(lambda point: error(point)).reduce(lambda x, y: x + y)  # cumsum
        print
        print 'Within Set Sum of Squared Error = ' + str(WSSE)
        print 'The cluster centers:'
        print clusters.centers
        print
        print
        return_list.append(corpus2vec.map(lambda pt: clusters.predict(pt)).collect())
    #########################################################################################
    # Here we want to remove documents from the corpus which do not contain
    # 'english' dictionary words at all, or words that can be word2vec-transformed
    # and "synonymized".
    if segred:
        corpus_english_prose = sc.parallelize(zipped_corpus).filter(lambda doc: check(doc))
        zipped_corpus = zip(*corpus_english_prose.collect())
        red_clusts = list(zipped_corpus[0])
        red_text = recover_encoding(list(zipped_corpus[1]))
        return_list.append(red_clusts)
        return_list.append(red_text)
        print 'Number of docs in corpus {filtering *corpus* for alpha(+alphanumeric)-only words}: %i' % corpus_english_prose.count()
        f1 = open(''.join([filename, '-document_clusters.txt']), 'w')
        f1.write('\n'.join(map(str, red_clusts)))
        f1.close()
        f2 = open(''.join([filename, '-documents_sanitized.txt']), 'w')
        f2.write('\n'.join(red_text))
        f2.close()
        f3 = open(''.join([filename, '-documents_dict.txt']), 'w')
        f3.write('\n'.join(corpus_merge_unique))
        f3.close()
    #########################################################################################
    if tfidf:
        # generate document term frequencies
        htf = HashingTF()
        tf = htf.transform(corpus_distrib)
        # generate idf = log{ frac{#docs}{#docs w. term} }
        idf = IDF().fit(tf)
        # scale tf * idf
        tfidf = idf.transform(tf)
        # collect tfidf for future use
        doc_tfidf = tfidf.collect()
        # generate unique word : HashingTF hash dict
        corpus_dict_tfidf_t = {}
        # uniquify merged corpus into terms
        # corpus_merge_unique = sorted(set(corpus_merge))
        # fill in the unique word : HashingTF hash dict
        for word in corpus_merge_unique:
            idx = htf.indexOf(word)
            corpus_dict_tfidf_t[word] = idx  # index not necessarily found in doc_tfidf.
        # no return item
    #########################################################################################
    if lda:
        corpus_dict = {}
        for c, word in enumerate(corpus_merge_unique):
            corpus_dict[word] = counts[c]

        def return_freq_words(doc, corpus_dict):
            return [word for word in doc if word in corpus_dict if corpus_dict[word] > 2]

        corpus_distrib_red = corpus_distrib.map(lambda doc: return_freq_words(doc, corpus_dict)).cache()
        gensim_corpora_id2word = corpora.Dictionary(corpus_distrib_red.collect())
        gensim_doc2bow_doctf = corpus_distrib_red.map(lambda doc: gensim_corpora_id2word.doc2bow(doc)).collect()
        f1 = open(''.join([filename, '-gensim_corpora_id2word.pkl']), 'w')
        pickle.dump(gensim_corpora_id2word, f1)
        f1.close()
        f2 = open(''.join([filename, '-gensim_doc2bow_doctf.pkl']), 'w')
        pickle.dump(gensim_doc2bow_doctf, f2)
        f2.close()
        f3 = open(''.join([filename, '-corpus.pkl']), 'w')
        pickle.dump(corpus_distrib.collect(), f3)
        f3.close()
    if word2vec:
        def increase_tf(doc):
            # only words with freq >= 5 are vectorized
            ret_doc = []
            for i in xrange(5):  # <<<
                ret_doc.extend(doc)  # <<<
            return ret_doc

        corpus_distrib_ext = corpus_distrib.map(lambda doc: increase_tf(doc))
        word_mbd = Word2Vec().setVectorSize(50).setSeed(42L).fit(corpus_distrib_ext)
        word2vec_dict = {}
        for i, w in enumerate(corpus_merge_unique):
            # print ('Counted word "%s" _%i_ many times.' % (w, counts[i]))
            word2vec_dict[w] = word_mbd.transform(w)
            try:
                print ('Top 5 embedding cosine similarity synonyms of word "%s":' % w)
                proximal_synonyms = word_mbd.findSynonyms(w, 5)
                for s, cs in proximal_synonyms:
                    print ('  "%s" with score _%f_' % (s, cs))
            except:
                print 'No synonyms found (word not in dict).'
        print
        print 'Processing + Spark MLlib has given us %i word2vec vectors.' % len(word2vec_dict)
        return_list.append(word2vec_dict)
        f4 = open(''.join([filename, '-word2vec_dict.pkl']), 'w')
        pickle.dump(word2vec_dict, f4)
        f4.close()
    if len(return_list) == 1:
        return_list = return_list[0]
    return return_list
documents_neg = documents_neg_RDD.map(lambda x: x.replace(',', ' ').replace('.', ' ').replace('-', ' ').lower()) \
                                 .map(lambda line: line.split(" ")) \
                                 .map(lambda x: filter_word(x)) \
                                 .map(lambda x: (0.0, x))
documents_train = documents.union(documents_neg)
labels = documents_train.map(lambda x: x[0])
train_set = documents_train.map(lambda x: x[1])
hashingTF = HashingTF()
tf = hashingTF.transform(train_set)
tf.cache()
idf = IDF(minDocFreq=2).fit(tf)
tfidf = idf.transform(tf)
# Create a labeled point with a positive label and a dense feature vector
training = labels.zip(tfidf).map(lambda x: LabeledPoint(x[0], x[1]))
model = NaiveBayes.train(training)

######### Calculate TF-IDF with the test data ########
### test_pos data ###
documents_t_RDD = sc.textFile("/Users/tracy/msan-ml/hw2/aclImdb/test_pos.txt")
# This command is for running on EMR connecting to S3
# documents_RDD = sc.textFile("s3n://aml-aml/test_pos.txt")
documents_t = documents_t_RDD.map(lambda x: x.replace(',', ' ').replace('.', ' ').replace('-', ' ').lower()) \
def tfIdf_cluster(self, content, title, date, tfidf):
    tfidf_list = content
    inputRDD = sc.parallelize(tfidf_list)
    hashingTF = HashingTF(2 ** 20)
    trainTf = hashingTF.transform(inputRDD)
    idf = IDF().fit(trainTf)
    trainTfidf = idf.transform(trainTf)
    km = KMeans.train(trainTfidf, 2, maxIterations=100, runs=10)  # training new model
    result = km.predict(trainTfidf)
    k_data = array(result.collect())
    grp1_news = []
    grp2_news = []
    # Store the crawled news as [{}, {}] key/value dicts so the front end can use them easily
    for idx, grp in enumerate(k_data):
        if grp == 0:
            news = {
                'title': title[idx],
                'date': date[idx],
                'content': ''.join(content[idx].split()),
                'tfidf': tfidf[idx],
            }
            grp1_news.append(news)
        if grp == 1:
            news = {
                'title': title[idx],
                'date': date[idx],
                'content': ''.join(content[idx].split()),
                'tfidf': tfidf[idx],
            }
            grp2_news.append(news)
    # Begin storing TF-IDF term counts per news cluster ------------------------------------
    tfidf_word_grp1 = []  # holds the TF-IDF terms and their counts
    all_tfidf_grp1 = []   # holds all TF-IDF terms
    for post in grp1_news:
        tfidf = post['tfidf']
        for i in tfidf:
            all_tfidf_grp1.append(i)
    tfidf_dic1 = {}
    for ele in all_tfidf_grp1:
        if not ele in tfidf_dic1:
            tfidf_dic1[ele] = 1
        else:
            tfidf_dic1[ele] = tfidf_dic1[ele] + 1
    for i in range(0, len(tfidf_dic1)):
        data = {
            "text": tfidf_dic1.keys()[i],
            "size": (tfidf_dic1.values()[i]) * 1.5,
        }
        tfidf_word_grp1.append(data)
    tfidf_word_grp1.sort(key=lambda d: d['size'], reverse=True)  # sort the sentiment terms
    tfidf_word_grp1 = tfidf_word_grp1[0:50]
    tfidf_word_grp1 = json.dumps(tfidf_word_grp1)
    # ---------------------------------------------------------------------------------------------
    tfidf_word_grp2 = []  # holds the TF-IDF terms and their counts
    all_tfidf_grp2 = []   # holds all TF-IDF terms
    for post in grp2_news:
        tfidf = post['tfidf']
        for i in tfidf:
            all_tfidf_grp2.append(i)
    tfidf_dic2 = {}
    for ele in all_tfidf_grp2:
        if not ele in tfidf_dic2:
            tfidf_dic2[ele] = 1
        else:
            tfidf_dic2[ele] = tfidf_dic2[ele] + 1
    for i in range(0, len(tfidf_dic2)):
        data = {
            "text": tfidf_dic2.keys()[i],
            "size": (tfidf_dic2.values()[i]) * 1.5,
        }
        tfidf_word_grp2.append(data)
    tfidf_word_grp2.sort(key=lambda d: d['size'], reverse=True)  # sort the sentiment terms
    tfidf_word_grp2 = tfidf_word_grp2[0:50]
    tfidf_word_grp2 = json.dumps(tfidf_word_grp2)
    # End storing TF-IDF term counts per news cluster ------------------------------------
    return grp1_news, grp2_news, tfidf_word_grp1, tfidf_word_grp2
# Boilerplate Spark stuff:
conf = SparkConf().setMaster("local").setAppName("SparkTFIDF")
sc = SparkContext(conf=conf)
# Load documents (one per line).
rawData = sc.textFile("subset-small.tsv")
fields = rawData.map(lambda x: x.split("\t"))
documents = fields.map(lambda x: x[3].split(" "))
documentNames = fields.map(lambda x: x[1])
hashingTF = HashingTF(100000)  # 100K hash buckets just to save some memory
tf = hashingTF.transform(documents)
tf.cache()
idf = IDF(minDocFreq=2).fit(tf)
tfidf = idf.transform(tf)
keywordTF = hashingTF.transform(["Apollo"])
keywordHashValue = int(keywordTF.indices[0])
keywordRelevance = tfidf.map(lambda x: x[keywordHashValue])
zippedResults = keywordRelevance.zip(documentNames)
print "Best document for keywords is:"
print zippedResults.max()
def returnTFIDF(tokens, hashingTF):
    tf = hashingTF.transform(tokens)
    idf = IDF(minDocFreq=25).fit(tf)
    tfidf = idf.transform(tf)
    return tfidf
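# A usage sketch for returnTFIDF, assuming an active SparkContext `sc` and an
# externally built HashingTF; with minDocFreq=25, terms appearing in fewer
# than 25 documents get an IDF of 0:
tokens = sc.parallelize([["spark", "tf", "idf"], ["hashing", "tf"]])
weighted = returnTFIDF(tokens, HashingTF(1 << 18))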
dim = pow(2, 18)
hashingTF = HashingTF(dim)
tf = hashingTF.transform(tokens)
tf.cache()
v = tf.first()
print(v.size)
print(v.values)
print(v.indices)
idf = IDF().fit(tf)
tfidf = idf.transform(tf)
v2 = tfidf.first()
print(v2.size)
print(v2.values)
print(v2.indices)
minMaxVals = tfidf.map(lambda v: (min(v.values), max(v.values)))
globalMin = minMaxVals.reduce(min)
globalMax = minMaxVals.reduce(max)
globalMinMax = (globalMin[0], globalMax[1])
### Using a TF-IDF model
from pyspark.mllib.util import MLUtils
# >>> examples = [LabeledPoint(1.1, Vectors.sparse(3, [(0, 1.23), (2, 4.56)])), LabeledPoint(0.0, Vectors.dense([1.01, 2.02, 3.03]))]
# >>> tempFile = NamedTemporaryFile(delete=True)
# >>> tempFile.close()
# >>> MLUtils.saveAsLibSVMFile(sc.parallelize(examples), tempFile.name)
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.feature import HashingTF, IDF
from pyspark import SparkContext

sc = SparkContext("local", "dd")
train = sc.parallelize(open("/home/madhura/ML_Spring16/MLProject/data/OriginalTraining.txt").read().splitlines()).map(lambda x: x.split(","))
trainlabels = train.map(lambda (a, b): int(b))
traintf = HashingTF().transform(train.map(lambda (a, b): a.split()))
trainidf = IDF().fit(traintf)
traintfidf = trainidf.transform(traintf)
# densetrain = traintfidf.map(lambda x: pyspark.mllib.linalg.DenseVector(x.toArray()))
# zippeddata = trainlabels.zip(densetrain)
# new = zippeddata.map(lambda (a, vec): (a, vec.toArray()))
training = trainlabels.zip(traintfidf).map(lambda x: LabeledPoint(x[0], x[1]))
MLUtils.saveAsLibSVMFile(training.coalesce(1), "/home/madhura/ML_Spring16/MLProject/data/libsvmfile")
data = MLUtils.loadLibSVMFile(sc, "/home/madhura/ML_Spring16/MLProject/data/libsvmfile/part-00000")
(trainingData, testData) = data.randomSplit([0.7, 0.3])
# train on the 70% training split, not the full dataset
model = RandomForest.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
                                     numTrees=3, featureSubsetStrategy="auto",
                                     impurity='gini', maxDepth=4, maxBins=32)
model.save(sc, "/home/madhura/ML_Spring16/MLProject/SentimentAnalysis_NLTK_NB/src/myRandomForestClassificationModel")
def calculate_tfidf(documents):
    hashingTF = HashingTF()
    tf = hashingTF.transform(documents.map(lambda x: x[1]))
    tf.cache()
    idf = IDF().fit(tf)
    return idf.transform(tf)
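# A usage sketch: `documents` is assumed to be an RDD of (id, token_list)
# pairs, since calculate_tfidf reads x[1]:
docs = sc.parallelize([("d1", ["spark", "tf", "idf"]), ("d2", ["hashing", "tf"])])
tfidf_vectors = calculate_tfidf(docs)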
def main(): """ Driver program for a spam filter using Spark and MLLib """ # Consolidate the individual email files into a single spam file # and a single ham file makeDataFileFromEmails( "data/spam_2/", "data/spam.txt") makeDataFileFromEmails( "data/easy_ham_2/", "data/ham.txt" ) # Create the Spark Context for parallel processing sc = SparkContext( appName="Spam Filter") # Load the spam and ham data files into RDDs spam = sc.textFile( "data/spam.txt" ) ham = sc.textFile( "data/ham.txt" ) # Create a HashingTF instance to map email text to vectors of 10,000 features. tf = HashingTF(numFeatures = 10000) # Each email is split into words, and each word is mapped to one feature. spamtf = spam.map(lambda email: tf.transform(email.split(" "))) hamtf = ham.map(lambda email: tf.transform(email.split(" "))) spamtf.cache() hamtf.cache() spamidf = IDF().fit(spamtf) hamidf = IDF().fit(hamtf) spamFeatures = spamidf.transform(spamtf) hamFeatures = hamidf.transform(hamtf) # Create LabeledPoint datasets for positive (spam) and negative (ham) data points. positiveExamples = spamFeatures.map(lambda features: LabeledPoint(1, features)) negativeExamples = hamFeatures.map(lambda features: LabeledPoint(0, features)) # Combine positive and negative datasets into one data = positiveExamples.union(negativeExamples) # Split the data into 70% for training and 30% test data sets ( trainingData, testData ) = data.randomSplit( [0.7, 0.3] ) # Cache the training data to optmize the Logistic Regression trainingData.cache() # Train the model with Logistic Regression using the SGD algorithm. model = SVMWithSGD.train(trainingData, iterations=100) # Create tuples of actual and predicted values labels_and_predictions = testData.map( lambda email: (email.label, model.predict( email.features) ) ) # Calculate the error rate as number wrong / total number error_rate = labels_and_predictions.filter( lambda (val, pred): val != pred ).count() / float(testData.count() ) print( "*********** SPAM FILTER RESULTS **********" ) print( "\n" ) print( "Error Rate: " + str( error_rate ) ) print( "\n" ) # Serialize the model for presistance pickle.dump( model, open( "SpamSvm.pkl", "wb" ) ) sc.stop()
# $example off$
if __name__ == "__main__":
    sc = SparkContext(appName="TFIDFExample")  # SparkContext
    # $example on$
    # Load documents (one per line).
    documents = sc.textFile("data/mllib/kmeans_data.txt").map(lambda line: line.split(" "))
    hashingTF = HashingTF()
    tf = hashingTF.transform(documents)
    # While applying HashingTF only needs a single pass to the data, applying IDF needs two passes:
    # First to compute the IDF vector and second to scale the term frequencies by IDF.
    tf.cache()
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf)
    # spark.mllib's IDF implementation provides an option for ignoring terms
    # which occur in less than a minimum number of documents.
    # In such cases, the IDF for these terms is set to 0.
    # This feature can be used by passing the minDocFreq value to the IDF constructor.
    idfIgnore = IDF(minDocFreq=2).fit(tf)
    tfidfIgnore = idfIgnore.transform(tf)
    # $example off$
    print("tfidf:")
    for each in tfidf.collect():
        print(each)
    print("tfidfIgnore:")
for word, flag in psegCut:
    if flag == "n":
        words.append(word)
data.append(list(words))
data.remove("")
documents = sc.parallelize(data)

def hashing(x):
    return hashingTF.transform([x]).indices[0]

hashed = documents.flatMap(lambda line: line).map(lambda word: (hashing(word), word)).distinct()
hashed_word = pd.DataFrame(hashed.collect(), columns=['hash', 'word']).set_index('hash')
# hashingTF = HashingTF()
# Build TF-IDF
tf = hashingTF.transform(documents)
tf.cache()
idf = IDF().fit(tf)
tf_idf_data = idf.transform(tf)
print dt.now().strftime('%Y/%m/%d %H:%M:%S')
K = 5
# Index documents with unique IDs
corpus_data = tf_idf_data.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()
print corpus_data
# Cluster the documents into K topics using LDA
ldaModel = LDA.train(corpus_data, k=K)
# Output topics. Each is a distribution over words (matching word count vectors)
print "Learned topics (as distributions over vocab of " + str(ldaModel.vocabSize()) + " words):"
topics = ldaModel.topicsMatrix()
print dt.now().strftime('%Y/%m/%d %H:%M:%S')
def filterStopWords(x):
    filtered_x = []
    for word in x:
        if word not in stopwordsList and len(word) > 1:
            filtered_x.append(word)
    return filtered_x

documents = documents.map(lambda x: filterStopWords(x)).filter(lambda x: len(x) > 0)

## Step 3: Extract TF-IDF features
hashingTF = HashingTF(nFeature)  # default is 2^20
tf = hashingTF.transform(documents)
tf.cache()
idf = IDF(minDocFreq=5).fit(tf)
tfidf = idf.transform(tf).repartition(nPartition)
tf.unpersist()
del idf
tfidf.cache()

## Step 4: Clustering with the k-means algorithm
pool = [10, 100, 1000]
for nCluster in pool:
    # Build the model (cluster the data)
    kmeans_model = KMeans.train(tfidf, nCluster, maxIterations=10, runs=1,
                                initializationMode="random")
    # Evaluate clustering by computing Within Set Sum of Squared Errors
    '''
    def error(point):