def process_data(data):
    print("Processing data ...")
    if not data.isEmpty():
        nbModel = bc_model.value
        hashingTF = HashingTF(100000)
        tf = hashingTF.transform(
            data.map(lambda x: x[0].encode('utf-8', 'ignore')))
        tf.cache()
        idf = IDF(minDocFreq=2).fit(tf)
        tfidf = idf.transform(tf)
        tfidf.cache()
        prediction = nbModel.predict(tfidf)

        temp = []
        i = 0
        for p, q, r in data.collect():
            temp.append([])
            temp[i].append(p.encode('utf-8', 'ignore'))
            temp[i].append(q)
            temp[i].append(r)
            i += 1
        i = 0
        for p in prediction.collect():
            temp[i].append(p)
            i += 1
        print(temp)

        for row in temp:
            insert_tweet(str(row[0]), str(row[1]), "0", int(row[3]), int(row[2]))
    else:
        print("Empty RDD !!!")

def analyse_data(self, data):
    """
    Run the appropriate analysis on the incoming data.
    param data: file, unicode, str
    """
    words = self.sc.textFile(self.training_words_dir)
    # Build the term-frequency matrix for the training words
    hashingTF = HashingTF()
    tf = hashingTF.transform(words)
    # Compute the TF-IDF matrix
    idfModel = IDF().fit(tf)
    tfidf = idfModel.transform(tf)
    tf.cache()
    with open(self.NBmodel, 'r') as f:
        NBmodel = pickle.load(f)
    # Segment the text first, then analyse it
    yourwords = set("/".join(jieba.cut_for_search(data)).split("/"))
    print 'Word segmentation result: {}'.format(yourwords)
    yourtf = hashingTF.transform(yourwords)
    yourtfidf = idfModel.transform(yourtf)
    return NBmodel.predict(yourtfidf), data

def init_tranining_set(sc):
    """
    Merge the positive / negative vocabulary.

    param: sc  SparkContext object
    """
    words = sc.textFile('traning_words.csv')
    # Build the term-frequency matrix for the training words
    hashingTF = HashingTF()
    tf = hashingTF.transform(words)
    # Compute the TF-IDF matrix
    idfModel = IDF().fit(tf)
    tfidf = idfModel.transform(tf)
    tf.cache()
    with open('NBmodel.pkl', 'r') as f:
        NBmodel = pickle.load(f)
    session = get_session(settings.DB_URL)
    for r in session.execute('select * from traning_collection').fetchall():
        yourDocument = r[3]
        print r[3]
        yourwords = "/".join(jieba.cut_for_search(yourDocument)).split("/")
        yourtf = hashingTF.transform(yourwords)
        yourtfidf = idfModel.transform(yourtf)
        print('NaiveBayes Model Predict:', NBmodel.predict(yourtfidf))

def main(sc):
    stopset = set(stopwords.words('english'))
    tweets = sc.textFile('hdfs:/adi/sample.txt')
    words = tweets.map(lambda word: word.split(" "))
    wordArr = []
    for wArr in words.collect():
        tempArr = []
        for w in wArr:
            if w not in stopset:
                tempArr.append(w)
        wordArr.append(tempArr)
    # print wordArr
    # tokens = sc.textFile("hdfs:/adi/tokens1.txt")

    # Load documents (one per line).
    documents = sc.textFile("hdfs:/adi/tokens1.txt").map(
        lambda line: line.split(" "))
    numDims = 100000
    hashingTF = HashingTF(numDims)
    tf = hashingTF.transform(documents)
    tf.cache()
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf)
    tfidf.count()
    model = KMeans.train(tfidf, 5)
    model.save(sc, "tweetModel1")
    print("Final centers: " + str(model.clusterCenters))
    # print("Total Cost: " + str(model.computeCost(data)))
    sc.stop()

def tfidf(self):
    self._create_rdd()
    hashingTF = HashingTF()
    tf = hashingTF.transform(self.token_rdd)
    idf = IDF(minDocFreq=2).fit(tf)
    tfidf = idf.transform(tf)
    return tfidf

def get_feature_vectors(sc, input_file, feature_dimensions):
    """Get feature vectors from the lines in input_file using TF-IDF.

    Returns:
        vectors RDD
    """
    # Load documents (one per line).
    tweet_file = sc.textFile(input_file)
    input_text_rdd = tweet_file.map(lambda line: _tokenize(line))
    input_text_rdd.cache()

    # The default feature dimension is 2^20; for a corpus of a million
    # tweets the recommended dimensions are 50000 or 100000. Use higher
    # dimensions for a larger corpus of tweets.
    hashing_tf = HashingTF(feature_dimensions)
    tf = hashing_tf.transform(input_text_rdd)
    tf.cache()
    idf = IDF(minDocFreq=2).fit(tf)
    tfidf = idf.transform(tf)
    tfidf.cache()
    return input_text_rdd, tfidf

def column_search(words, row_filter):
    if row_filter == 'n' or row_filter == 'N':
        min_row = 0
    else:
        min_row = row_filter

    rawData = table_cols.join(
        master_index, master_index["Table_Name"] == table_cols["Name"]).rdd
    data = rawData.map(lambda x: (x['Doc_ID'], x['Columns'])).map(parse)
    titles = data.map(lambda x: x[0])
    documents = data.map(lambda x: x[1])

    hashingTF = HashingTF()
    tf = hashingTF.transform(documents)
    tf.cache()
    idf = IDF().fit(tf)
    normalizer = Normalizer()
    tfidf = normalizer.transform(idf.transform(tf))
    tfidfData = titles.zip(tfidf).toDF(["label", "features"])

    query = parse((0, words))[1]
    queryTF = hashingTF.transform(query)
    queryTFIDF = normalizer.transform(idf.transform(queryTF))

    queryRelevance = tfidfData.rdd.map(
        lambda x: (x[0], float(x[1].dot(queryTFIDF)))).sortBy(
            lambda x: -x[1]).filter(lambda x: x[1] > 0)
    queryRelevance = queryRelevance.toDF(["Doc_ID", "scores"])
    queryRelevance = queryRelevance.join(
        table_desc, queryRelevance.Doc_ID == table_desc.Doc_ID).select(
            table_desc.Doc_ID, queryRelevance.scores, table_desc.Columns)
    queryRelevance = queryRelevance.join(
        master_index, master_index.Doc_ID == queryRelevance.Doc_ID).select(
            master_index.Table_Name, master_index.Table_Length,
            queryRelevance.Columns, queryRelevance.scores)
    queryRelevance = queryRelevance.rdd.filter(
        lambda x: int(x['Table_Length']) >= int(min_row))

    if queryRelevance.isEmpty():
        print("Sorry, nothing matched in column search, please try a different keyword")
    else:
        print("Here is your column search result")
        queryRelevance.toDF().show()

def training_set(sc, numFeatures,
                 pos_file="data/training_positif_clean.csv",
                 neg_file="data/training_negatif_clean.csv"):
    """
    Input  : number of retained features in the tweet-term structure
    Output : normalized tweet-term format training set
             IDF model (that will be used in the test phase)
    """
    text_negative = sc.textFile(neg_file)
    text_positive = sc.textFile(pos_file)

    train_text = text_negative.union(text_positive)
    train_labels = text_negative.map(lambda x: 0.0).union(
        text_positive.map(lambda x: 1.0))

    tf = HashingTF(numFeatures=numFeatures).transform(
        train_text.map(lambda x: x))
    idf = IDF().fit(tf)
    train_tfidf = idf.transform(tf)

    training = train_labels.zip(train_tfidf).map(
        lambda x: LabeledPoint(x[0], x[1]))
    return (training, idf)

def main():
    # Read the json file
    reviews_data = sqlContext.read.json(input)
    reviews = reviews_data.select('reviewText')
    reviews_rdd = reviews.rdd.cache()
    rdd_data = reviews_rdd.map(lambda line: str(line.reviewText))
    transformed_data = rdd_data.map(transform_data)

    # Compute the TF-IDF representation
    hashingTF = HashingTF()
    tf = hashingTF.transform(transformed_data)
    tf.cache()
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf).collect()

    # Normalization
    # tfidf = idf.transform(tf)
    # normalizer1 = Normalizer()
    # normalized_vector = normalizer1.transform(tfidf).collect()

    score_rdd = reviews_data.rdd.map(lambda line: str(line.overall)).cache().collect()
    dates_rdd = reviews_data.rdd.map(lambda line: str(line.reviewTime)) \
        .map(lambda line: line.split(", ")) \
        .map(lambda (a, b): b).cache().collect()

    combinedList = zip(tfidf, score_rdd, dates_rdd)
    combinedRDD = sc.parallelize(combinedList).cache()
    TrainRDD = combinedRDD.filter(lambda (x, y, z): z != '2014').map(lambda (x, y, z): (x, y))
    TestRDD = combinedRDD.filter(lambda (x, y, z): z == '2014').map(lambda (x, y, z): (x, y))

    # Save the test and training data
    TrainRDD.saveAsPickleFile(output + '/Train_data_unnormalized.pickle')
    TestRDD.saveAsPickleFile(output + '/Test_data_unnormalized.pickle')

def TFIDF(source, destination):
    if destination[-1] != '/':
        destination = destination + '/'

    # Read the source documents; one word list per file
    rdd = sc.wholeTextFiles(source).map(lambda (name, text): text.split())
    tf = HashingTF()
    tfVectors = tf.transform(rdd).cache()
    a = tfVectors.collect()

    # Store the TF values above in individual files, one per link
    ind = 0
    for vector in a:
        dest_path = destination + "TF_%d" % ind + ".txt"
        ind = ind + 1
        file = open(dest_path, 'w')
        file.write(str(vector))
        file.close()

    # Calculate the IDF values for each case
    idf = IDF()
    idfModel = idf.fit(tfVectors)
    tfIdfVectors = idfModel.transform(tfVectors)

    # Write the TF-IDF values to a single file
    file = open(destination + "TF-IDF.txt", 'w')
    file.write(str(tfIdfVectors.collect()))
    try:
        for i in range(0, 100):
            print ""  # Testing printing
    except KeyboardInterrupt:
        pass

def _compute_tfid(texts: RDD) -> RDD:
    tf = HashingTF().transform(texts.map(lambda t: t.words))
    tf.cache()
    idf = IDF().fit(tf)
    tfidfs = idf.transform(tf)
    text_tfs = texts.zip(tfidfs)
    return text_tfs.map(lambda t: t[0].set_tfidf(t[1]))

def get_tfidf(rdd):
    # While applying HashingTF only needs a single pass over the data, applying IDF
    # needs two passes: first to compute the IDF vector and second to scale the
    # term frequencies by IDF.
    hashingTF = HashingTF()
    tf = hashingTF.transform(rdd)
    tf.cache()
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf)

    # spark.mllib's IDF implementation provides an option for ignoring terms
    # which occur in less than a minimum number of documents.
    # In such cases, the IDF for these terms is set to 0.
    # This feature can be used by passing the minDocFreq value to the IDF constructor.
    idfIgnore = IDF(minDocFreq=1).fit(tf)
    tfidf_rdd = idfIgnore.transform(tf)

    # RDD of SparseVectors [(doc_id_i: {word_id_j: tfidfscore_j, ...}), ...],
    # i.e. m docs x n counts
    return tfidf_rdd

def get_tfidf_features(txt_rdd):
    hashingTF = HashingTF()
    tf = hashingTF.transform(txt_rdd)
    tf.cache()
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf)
    return tfidf

def hashing(self, size):
    # e.g. 100K hash buckets just to save some memory
    self.hashing_TF = HashingTF(size)
    tf = self.hashing_TF.transform(self.documents)
    tf.cache()
    idf = IDF(minDocFreq=2).fit(tf)
    self.tfidf = idf.transform(tf)

def tfidf(self):
    tf = HashingTF().transform(self._sents)
    self._tf = tf
    tf.cache()
    idf = IDF().fit(tf)
    self.idf = idf
    tfidf = idf.transform(tf)
    self._tfidf = dict(enumerate(tfidf.collect()))

def create_bayes(self):
    """Create and train the Naive Bayes model."""
    if self._check_traning_exists():
        return

    # Build an RDD from the positive text
    positive_file = os.path.join(settings.DATA_DIR, '分类词库/positive.txt')
    positive_data = self.sc.textFile(positive_file)
    # De-duplicate the data
    positive_data = positive_data.distinct()
    positive_data = positive_data.map(
        lambda line: line.split('###')).filter(lambda line: len(line) == 2)

    # Build an RDD from the negative text
    negative_file = os.path.join(settings.DATA_DIR, '分类词库/negative.txt')
    negative_data = self.sc.textFile(negative_file)
    negative_data = negative_data.distinct()
    negative_data = negative_data.map(
        lambda line: line.split('###')).filter(lambda line: len(line) == 2)

    # Merge the training sets
    all_data = negative_data.union(positive_data)
    all_data.repartition(1)
    # Ratings have been pre-processed and only contain -1 and 1
    rate = all_data.map(lambda s: s[0])
    document = all_data.map(lambda s: s[1])

    words = document.map(lambda w: "/".join(jieba.cut_for_search(w))) \
                    .map(lambda line: line.split("/"))

    # Build the term-frequency matrix for the training words
    hashingTF = HashingTF()
    tf = hashingTF.transform(words)

    # Compute the TF-IDF matrix
    idfModel = IDF().fit(tf)
    tfidf = idfModel.transform(tf)
    tf.cache()

    # Generate the training and test sets
    zipped = rate.zip(tfidf)
    data = zipped.map(lambda line: LabeledPoint(line[0], line[1]))
    training, test = data.randomSplit([0.6, 0.4], seed=0)

    # Train the Naive Bayes classification model
    NBmodel = NaiveBayes.train(training, 1.0)
    predictionAndLabel = test.map(lambda p: (NBmodel.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(lambda x: 1.0 \
        if x[0] == x[1] else 0.0).count() / test.count()

    # Save the word RDD
    words.repartition(1).saveAsTextFile(self.training_words_dir)
    # Persist the Naive Bayes model with pickle
    with open(self.NBmodel, 'w') as f:
        pickle.dump(NBmodel, f)

def process(reviews):
    if reviews.isEmpty():
        pass
    else:
        start = time.time()

        # Get reviews with overall rating > 3 and overall rating < 3
        pos_reviews = reviews.filter(lambda x: x[0] > 3.0)
        neg_reviews = reviews.filter(lambda x: x[0] < 3.0)

        # Set the label for each class: 0.0 is positive, 1.0 is negative
        review_labels = reviews.map(lambda x: 0.0 if x[0] > 3.0 else 1.0)

        Words = Row('label', 'words')
        words = reviews.map(lambda r: Words(*r))
        words_df = spark.createDataFrame(words)

        # Review tokenization
        token = RegexTokenizer(minTokenLength=2, pattern="[^A-Za-z]+",
                               inputCol="words", outputCol="token",
                               toLowercase=True)
        token_filtered = token.transform(words_df)

        # Stopword elimination
        remover = StopWordsRemover(inputCol="token", outputCol="stopwords",
                                   caseSensitive=False)
        stopwords_filtered = remover.transform(token_filtered)
        prep_filtered = stopwords_filtered.select('stopwords').rdd.map(lambda x: x[0])

        # TF-IDF calculation
        tf = HashingTF(numFeatures=numFeatures).transform(
            prep_filtered.map(porter_stem, preservesPartitioning=True))
        idf = IDF().fit(tf)
        train_tfidf = idf.transform(tf)

        # Build the training dataset with labels
        training = review_labels.zip(train_tfidf).map(
            lambda x: LabeledPoint(x[0], x[1]))

        # Train the classifier model
        model = SVMWithSGD.train(training, iterations=100)
        model_name = "svm" + str(counter_model)

        # Save the classifier model to HDFS
        output_dir = "hdfs://VM10-1-0-14:9000/classifier/" + model_name
        model.save(sc, output_dir)
        counter_model.add(1)

        end = time.time()
        print("Model Name : ", model_name, ", Total Reviews : ",
              reviews.count(), "Processing Time : ", (end - start))

def test_idf_model(self):
    data = [
        Vectors.dense([1, 2, 6, 0, 2, 3, 1, 1, 0, 0, 3]),
        Vectors.dense([1, 3, 0, 1, 3, 0, 0, 2, 0, 0, 1]),
        Vectors.dense([1, 4, 1, 0, 0, 4, 9, 0, 1, 2, 0]),
        Vectors.dense([2, 1, 0, 3, 0, 0, 5, 0, 2, 3, 9])
    ]
    model = IDF().fit(self.sc.parallelize(data, 2))
    idf = model.idf()
    self.assertEqual(len(idf), 11)

def process(reviews):
    if reviews.isEmpty():
        pass
    else:
        model_name = "dt"
        updated_model = "dt0"
        model_path, data_path, metadata_path = '', '', ''

        # Loop to find the most recent available model classifier
        for i in range(25, -1, -1):
            model_path = "hdfs://VM10-1-0-14:9000/classifier/" + model_name + str(i)
            updated_model = model_name + str(i)
            data_path = model_path + "/data/part-r*"
            metadata_path = model_path + "/metadata/part-00000"
            if not patherror(data_path) and not patherror(metadata_path):
                break

        # Load the model classifier
        model = DecisionTreeModel.load(sc, model_path)

        start = time.time()
        reviews_label = reviews.map(lambda x: 0.0 if x[0] > 3.0 else 1.0)

        Words = Row('label', 'words')
        words = reviews.map(lambda r: Words(*r))
        words_df = spark.createDataFrame(words)

        # Review tokenization
        token = RegexTokenizer(minTokenLength=2, pattern="[^A-Za-z]+",
                               inputCol="words", outputCol="token",
                               toLowercase=True)
        token_filtered = token.transform(words_df)

        # Stopword elimination
        remover = StopWordsRemover(inputCol="token", outputCol="stopwords",
                                   caseSensitive=False)
        stopwords_filtered = remover.transform(token_filtered)
        prep_filtered = stopwords_filtered.select('stopwords').rdd.map(lambda x: x[0])

        # TF-IDF calculation
        tf = HashingTF(numFeatures=numFeatures).transform(
            prep_filtered.map(porter_stem, preservesPartitioning=True))
        idf = IDF().fit(tf)
        tfidf = idf.transform(tf)

        prediction = model.predict(tfidf)
        labeled_prediction = reviews_label.zip(prediction).map(
            lambda x: (float(x[1]), x[0]))
        metrics = MulticlassMetrics(labeled_prediction)

        output = reviews.zip(prediction)
        filename = "hdfs://VM10-1-0-14:9000/output/" + \
            re.sub('[^0-9]', '', str(datetime.now())) + ".out"
        output.saveAsTextFile(filename)

        end = time.time()
        print(updated_model, ';', reviews.count(), ';', metrics.accuracy, ';',
              metrics.precision(0.0), ';', metrics.precision(1.0), ';',
              metrics.recall(0.0), ';', metrics.recall(1.0), ';',
              metrics.fMeasure(0.0), ';', metrics.fMeasure(1.0), ';', (end - start))

def extractKeywords_Train(self):
    documents = self.sc.textFile(self.trainingfile).map(
        lambda line: line.split(" ")[1:])

    hashingTF = HashingTF()
    tf = hashingTF.transform(documents)
    tf.cache()

    idfIgnore = IDF(minDocFreq=2).fit(tf)
    tfidfIgnore = idfIgnore.transform(tf)

    tfidfIgnore.saveAsTextFile("AAA")

def generate_tf_idf(twProfilesRdd, numFe):
    """
    Generate a TF-IDF tuple (gender, sparse vector) from an RDD containing
    tuples of the form (gender, (clean words tuple)).
    """
    gtlp = generate_gender_tf(twProfilesRdd, numFe)
    idf = IDF()
    tfVectorsRDD = gtlp.map(lambda tp: tp[1])
    idfModel = idf.fit(tfVectorsRDD)
    idfRdd = idfModel.transform(tfVectorsRDD)
    return (idfRdd.zip(gtlp).map(lambda tp: (tp[1][0], tp[0])), idfModel)

def use_naive_nayes():
    """
    Running Naive Bayes from Spark's MLlib library
    """
    from pyspark.mllib.classification import NaiveBayes
    from pyspark.mllib.feature import HashingTF, IDF
    from pyspark.mllib.linalg import SparseVector, Vectors
    from pyspark.mllib.regression import LabeledPoint

    # Loading the files
    path = "/Users/abhisheksingh29895/Desktop/courses/CURRENT/Advance_Machine_Learning/HW2/aclImdb/"
    train_pos = sc.textFile(path + "train/pos/*txt").map(lambda line: line.encode('utf8')).map(lambda line: line.split())
    train_neg = sc.textFile(path + "train/neg/*txt").map(lambda line: line.encode('utf8')).map(lambda line: line.split())
    test_pos = sc.textFile(path + "test/pos/*txt").map(lambda line: line.encode('utf8')).map(lambda line: line.split())
    test_neg = sc.textFile(path + "test/neg/*txt").map(lambda line: line.encode('utf8')).map(lambda line: line.split())

    # TF step
    tr_pos = HashingTF().transform(train_pos) ; tr_pos_idf = IDF().fit(tr_pos)
    tr_neg = HashingTF().transform(train_neg) ; tr_neg_idf = IDF().fit(tr_neg)
    te_pos = HashingTF().transform(test_pos) ; te_pos_idf = IDF().fit(te_pos)
    te_neg = HashingTF().transform(test_neg) ; te_neg_idf = IDF().fit(te_neg)

    # IDF step
    tr_pos_tfidf = tr_pos_idf.transform(tr_pos) ; tr_neg_tfidf = tr_neg_idf.transform(tr_neg)
    te_pos_tfidf = te_pos_idf.transform(te_pos) ; te_neg_tfidf = te_neg_idf.transform(te_neg)

    # Creating labels (1 for positive reviews, 0 for negative reviews)
    pos_label = [1] * 12500 ; pos_label = sc.parallelize(pos_label)
    neg_label = [0] * 12500 ; neg_label = sc.parallelize(neg_label)

    # Combine using zip
    train_pos_file = pos_label.zip(tr_pos_tfidf).map(lambda x: LabeledPoint(x[0], x[1]))
    train_neg_file = neg_label.zip(tr_neg_tfidf).map(lambda x: LabeledPoint(x[0], x[1]))
    test_pos_file = pos_label.zip(te_pos_tfidf).map(lambda x: LabeledPoint(x[0], x[1]))
    test_neg_file = neg_label.zip(te_neg_tfidf).map(lambda x: LabeledPoint(x[0], x[1]))

    # Joining the 2 RDDs to form the final training and test sets
    train_file = train_pos_file.union(train_neg_file)
    test_file = test_pos_file.union(test_neg_file)

    # Fitting a Naive Bayes model
    model = NaiveBayes.train(train_file)

    # Make predictions and test accuracy
    predictionAndLabel = test_file.map(lambda p: (model.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test_file.count()
    print ""
    print "Test accuracy is {}".format(round(accuracy, 4))

def predictSentiment(tweetText):
    nbModel = bc_model.value
    hashingTF = HashingTF()
    tf = hashingTF.transform(tweetText)
    tf.cache()
    idf = IDF(minDocFreq=2).fit(tf)
    tfidf = idf.transform(tf)
    tfidf.cache()
    prediction = nbModel.predict(tfidf)
    print "Predictions for this window :"
    for i in range(0, prediction.count()):
        print prediction.collect()[i], tweetText.collect()[i]

def vectorize(training):
    hashingTF = HashingTF()
    tf_training = training.map(lambda tup: hashingTF.transform(tup[1]))
    idf_training = IDF().fit(tf_training)
    tfidf_training = idf_training.transform(tf_training)
    tfidf_idx = tfidf_training.zipWithIndex()
    training_idx = training.zipWithIndex()
    idx_training = training_idx.map(lambda line: (line[1], line[0]))
    idx_tfidf = tfidf_idx.map(lambda l: (l[1], l[0]))
    joined_tfidf_training = idx_training.join(idx_tfidf)
    training_labeled = joined_tfidf_training.map(lambda tup: tup[1])
    labeled_training_data = training_labeled.map(
        lambda k: LabeledPoint(k[0][0], k[1]))
    return labeled_training_data

def calcTfidf(doc, source):
    """
    This method computes TF-IDF scores for the given document.
    While applying HashingTF only needs a single pass over the data, applying IDF
    needs two passes: first to compute the IDF vector and second to scale the
    term frequencies by IDF.
    """
    hashingTF = HashingTF(200000)
    tf = hashingTF.transform(doc)
    print "TF calculated for " + source.split('/')[-1]
    tf.cache()
    idf = IDF().fit(tf)
    ## idf = IDF(minDocFreq=2).fit(tf)
    tfidf = idf.transform(tf)
    print "TF-IDF calculated for " + source.split('/')[-1]
    return hashingTF, tfidf

def tf_idf(data):
    # This hashes each word / feature using MurmurHash 3, generates an index
    # for each, and then calculates TF for each index.
    # TODO: Need to check the best numFeatures
    tf = HashingTF(numFeatures=10000).transform(data.map(lambda x: x.split()))

    # TF-IDF is calculated to understand how important a word is to a
    # particular document.
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf)
    return tfidf

def training_set(pos_file, neg_file):
    text_negative = sc.textFile(neg_file)
    text_positive = sc.textFile(pos_file)

    train_text = text_negative.union(text_positive)
    train_labels = text_negative.map(lambda x: 0.0).union(
        text_positive.map(lambda x: 1.0))

    tf = HashingTF(numFeatures=10000).transform(train_text.map(lambda x: x))
    idf = IDF().fit(tf)
    train_tfidf = idf.transform(tf)

    training = train_labels.zip(train_tfidf).map(
        lambda x: LabeledPoint(x[0], x[1]))
    return (training, idf)

def get_sentiment_analysis(self, lines):
    hashingTF = HashingTF()
    iDF = IDF()
    model = pickle.load(open('main/model.ml', 'rb'))

    def classify_tweet(tf):
        return iDF.fit(tf).transform(tf)

    analysis = lines.map(lambda line: line.split('@')) \
        .map(lambda x: hashingTF.transform(x)) \
        .transform(classify_tweet) \
        .map(lambda x: LabeledPoint(1, x)) \
        .map(lambda x: model.predict(x.features))

    analysis.foreachRDD(lambda rdd: self.post_sentiment_analysis(rdd))

def create_tfidf(sc):
    # start = time.time()
    docs = sc.textFile(FILE0, 4).map(split_docs).cache()
    tags = docs.map(lambda doc: doc[1].split()).cache()
    tag = tags.map(lambda tags: tags[0])
    words = docs.map(lambda doc: doc[0].split())
    words = words.map(preProcess).cache()
    # id_tag = tag.zipWithIndex().map(swapOder)

    hashingTF = HashingTF()
    tf = hashingTF.transform(words)
    tf.cache()
    idf = IDF(minDocFreq=2).fit(tf)
    tfidf = idf.transform(tf).cache()
    # tfidf = tfidf.collect()
    return tfidf, tags

def normTFIDF(fn_tokens_RDD, vecDim, caching=True):
    keysRDD = fn_tokens_RDD.keys()
    tokensRDD = fn_tokens_RDD.values()
    # Pass the vecDim value through to the hashing vectorizer
    tfVecRDD = tokensRDD.map(lambda tokens: hashing_vectorize(tokens, vecDim))
    if caching:
        # This RDD is read more than once, so caching it in memory speeds things up
        tfVecRDD.persist(StorageLevel.MEMORY_ONLY)
    idf = IDF()                              # create the IDF object
    idfModel = idf.fit(tfVecRDD)             # first pass: calculate the IDF values
    tfIdfRDD = idfModel.transform(tfVecRDD)  # second pass: scale term frequencies by IDF
    norm = Normalizer()                      # create a Normalizer object
    normTfIdfRDD = norm.transform(tfIdfRDD)  # and apply it to the TF-IDF RDD
    zippedRDD = keysRDD.zip(normTfIdfRDD)    # zip the keys and values back together
    return zippedRDD