def TFIDF(source, destination):
    if destination[-1] != '/':
        destination = destination + '/'
    # Read every file under the source directory; one document per file.
    rdd = sc.wholeTextFiles(source).map(lambda (name, text): text.split())
    tf = HashingTF()
    tfVectors = tf.transform(rdd).cache()
    a = tfVectors.collect()
    # Store the TF values above in individual files, one per link.
    ind = 0
    for vector in a:
        dest_path = destination + "TF_%d" % ind + ".txt"
        ind = ind + 1
        file = open(dest_path, 'w')
        file.write(str(vector))
        file.close()
    # Calculate IDF values for each case.
    idf = IDF()
    idfModel = idf.fit(tfVectors)
    tfIdfVectors = idfModel.transform(tfVectors)
    # Write TF-IDF values to a single file.
    file = open(destination + "TF-IDF.txt", 'w')
    file.write(str(tfIdfVectors.collect()))
    file.close()
    try:
        for i in range(0, 100):
            print ""  # Testing printing
    except KeyboardInterrupt:
        pass
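# Not part of the original snippet: a hypothetical driver for TFIDF above,
# assuming a local Spark setup; the paths are placeholders.
from pyspark import SparkContext
from pyspark.mllib.feature import HashingTF, IDF

sc = SparkContext(appName="TFIDFExample")
TFIDF("/data/pages", "/data/tfidf_output/")  # placeholder paths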
def generate_tf_idf(twProfilesRdd, numFe):
    """Generate TF-IDF tuples (gender, sparse vector) from an RDD of
    (gender, (clean words tuple)) tuples."""
    gtlp = generate_gender_tf(twProfilesRdd, numFe)
    idf = IDF()
    tfVectorsRDD = gtlp.map(lambda tp: tp[1])
    idfModel = idf.fit(tfVectorsRDD)
    idfRdd = idfModel.transform(tfVectorsRDD)
    return (idfRdd.zip(gtlp).map(lambda tp: (tp[1][0], tp[0])), idfModel)
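# generate_gender_tf is not shown in the source. A minimal sketch, assuming it
# hashes each profile's word tuple into a numFe-dimensional TF vector:
from pyspark.mllib.feature import HashingTF

def generate_gender_tf(twProfilesRdd, numFe):
    # Assumed behavior: (gender, words) -> (gender, sparse TF vector).
    tf = HashingTF(numFe)
    return twProfilesRdd.map(lambda tp: (tp[0], tf.transform(tp[1])))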
def normTFIDF(fn_tokens_RDD, vecDim, caching=True):
    keysRDD = fn_tokens_RDD.keys()
    tokensRDD = fn_tokens_RDD.values()
    # Pass the vecDim value through to the hashing helper; a lambda is needed.
    tfVecRDD = tokensRDD.map(lambda tokens: hashing_vectorize(tokens, vecDim))
    if caching:
        # The RDD is read more than once, so caching in memory speeds things up.
        tfVecRDD.persist(StorageLevel.MEMORY_ONLY)
    idf = IDF()                              # create IDF object
    idfModel = idf.fit(tfVecRDD)             # calculate IDF values
    tfIdfRDD = idfModel.transform(tfVecRDD)  # 2nd pass needed (see lecture slides)
    norm = Normalizer()                      # create a Normalizer object
    normTfIdfRDD = norm.transform(tfIdfRDD)  # and apply it to the tfIdfRDD
    zippedRDD = keysRDD.zip(normTfIdfRDD)    # zip the keys and values together
    return zippedRDD
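# hashing_vectorize is not defined in the source. A plausible one-liner,
# assuming it simply wraps MLlib's HashingTF at the requested dimensionality:
from pyspark.mllib.feature import HashingTF

def hashing_vectorize(tokens, vecDim):
    # Assumed helper: hash a token list into a vecDim-dimensional TF vector.
    return HashingTF(vecDim).transform(tokens)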
def extract_features(self, feat='tfidf', **kwargs):
    """Convert each subtitle into its TF/TF-IDF representation,
    normalizing if necessary.

    Parameters
    ----------
    feat : 'tf' or 'tfidf'.
    kwargs : num_features, minDocFreq, or other arguments to be passed to
        the MLlib objects.

    Returns
    -------
    RDD of features with key.
    """
    # Transform bags of words into TF vectors.
    num_features = kwargs.get('num_features', 10000)
    htf = HashingTF(num_features)
    feat_rdd = self.RDD.mapValues(htf.transform).cache()
    # Transform TF vectors into IDF vectors.
    if feat == 'tfidf':
        keys, tf_vecs = feat_rdd.keys(), feat_rdd.values()
        minDocFreq = kwargs.get('minDocFreq', 2)
        idf = IDF(minDocFreq=minDocFreq)
        idf_model = idf.fit(tf_vecs)
        idf_rdd = idf_model.transform(tf_vecs.map(lambda vec: vec.toArray()))
        feat_rdd = keys.zip(idf_rdd)
    if self.model_type == 'log_reg':
        # Logistic regression benefits from standardized features.
        normalizer = StandardScaler(withMean=True, withStd=True)
        keys, vecs = feat_rdd.keys(), feat_rdd.values()
        norm_model = normalizer.fit(vecs)
        norm_rdd = norm_model.transform(vecs.map(lambda vec: vec.toArray()))
        feat_rdd = keys.zip(norm_rdd)
    return feat_rdd
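# A hypothetical call site (the `extractor` instance is invented; the snippet
# only tells us it carries self.RDD of (key, bag-of-words) pairs and a
# model_type attribute):
features = extractor.extract_features(feat='tfidf', num_features=5000, minDocFreq=3)
print(features.first())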
#####################################################
################# Step 2 TF-IDF #####################
#####################################################
from pyspark.mllib.feature import HashingTF, IDF

# TF step: hash into 50,000 dimensions.
tf = HashingTF(50000)
# Each element looks like [index, title, tf].
vectors = data.map(lambda line: (line[0], line[2], tf.transform(line[1])))

# IDF step: keep only the tf vectors.
vec = vectors.map(lambda line: line[2])
idf = IDF()
idfmodel = idf.fit(vec)
# Obtain the TF-IDF result, completing the TF-IDF step.
tfIdfVectors = idfmodel.transform(vec)

#####################################################
################## Step 3 SVD #######################
#####################################################
# Compute the SVD.
from pyspark.mllib.linalg.distributed import RowMatrix
import numpy as np

# Convert the computed TF-IDF vectors into a row matrix.
tfIdf_matrix = RowMatrix(tfIdfVectors)
# Compute the SVD, keeping only the 100 largest singular values.
svd = tfIdf_matrix.computeSVD(100, True)
u = svd.U
s = svd.s
v = svd.V
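# Not in the source: one common use of the factors, projecting each document
# into the 100-dimensional latent space. RowMatrix.multiply takes a local
# dense matrix (Spark >= 2.2) and svd.V is local, so this is only a sketch.
projected = tfIdf_matrix.multiply(svd.V)  # RowMatrix of shape (numDocs, 100)
print(projected.numRows(), projected.numCols())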
from pyspark import SparkContext
from pyspark.mllib.feature import HashingTF, IDF

sc = SparkContext()
rdd = sc.wholeTextFiles("/usr/local/Cellar/BigDataAdvanced/Assignment1/TwitterStuff/TweetData") \
        .map(lambda (name, text): text.split())
tf = HashingTF()
tfVectors = tf.transform(rdd).cache()
a = tfVectors.collect()
count = 0
for vec in a:
    print vec
    count = count + 1
    # Write each TF vector to its own file.
    with open("TF_Tweet" + str(count) + ".txt", "w") as f:
        f.write(str(vec))
idf = IDF()
idfModel = idf.fit(tfVectors)
tfIdfVectors = idfModel.transform(tfVectors)
with open("TF-IDF_tweet.txt", 'w') as file:
    file.write(str(tfIdfVectors.collect()))
#count = 0
#output = tfIdfVectors.collect()
#for vec in output:
#    print vec
#    count = count + 1
#    with open("TF_Wiki" + str(count) + ".txt", "w") as f:
#        f.write(str(vec))
import time

from pyspark.mllib.feature import HashingTF, IDF
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import SVMWithSGD

data = df.rdd.map(list)
print(data.first())
# Binary label: 1.0 when the score column is the digit 1, else 0.0.
score = data.map(lambda s: 1.0 if s[1].isdigit() and float(s[1]) == 1.0 else 0.0)
comment = data.map(lambda s: s[3])
print(score.count())
print(comment.count())
tf = HashingTF()
tfVectors = tf.transform(comment).cache()
idf = IDF()
idfModel = idf.fit(tfVectors)
tfIdfVectors = idfModel.transform(tfVectors)
#print(tfIdfVectors.take(3))
# Use the RDD zip operator to join labels and vectors, then convert each pair
# to the LabeledPoint type expected by the classification model.
zip_score_comment = score.zip(tfIdfVectors)
final_data = zip_score_comment.map(lambda line: LabeledPoint(line[0], line[1]))
train_data, test_data = final_data.randomSplit([0.8, 0.2], seed=0)
print(train_data.take(1))
time_start = time.time()
#SVMModel = SVMWithSGD.train(train_data, iterations=100)
SVMModel = SVMWithSGD.train(train_data, iterations=1000)
time_end = time.time()
cost_time = time_end - time_start
print("spark_svm_en cost_time:", cost_time)
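# The snippet times training but never evaluates. A standard follow-up in the
# MLlib style, computing accuracy on the held-out split (not in the source):
labels_and_preds = test_data.map(lambda p: (p.label, SVMModel.predict(p.features)))
accuracy = labels_and_preds.filter(lambda lp: lp[0] == lp[1]).count() / float(test_data.count())
print("test accuracy:", accuracy)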
pattern="\\W") stopwordsRemover = StopWordsRemover( inputCol="words", outputCol="filtered").setStopWords(nltkstop) regexer = regexTokenizer.transform(TschemaPeople) stop = stopwordsRemover.transform(regexer) #tokenizer = Tokenizer(inputCol="filtered", outputCol="words") #wordsData = tokenizer.transform(stop) hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures") #hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20) featurizedData = hashingTF.transform(stop) idf = IDF(inputCol="rawFeatures", outputCol="features") idfModel = idf.fit(featurizedData) seenData = idfModel.transform(featurizedData) (trainingData1, testData1) = seenData.randomSplit([0.6, 0.4], seed=100) lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0) lrModel = lr.fit(trainingData1) predictions1 = lrModel.transform(testData1) predictions1.select("FileContent","label","prediction") \ .orderBy("probability", ascending=False) \ .show( truncate = 30) evaluator = MulticlassClassificationEvaluator(predictionCol="prediction") print("the accuracy is:", evaluator.evaluate(predictions1))
folders = os.listdir(root)
for f in range(0, len(folders)):
    print(f)
    print(root + folders[f])
    data = sc.wholeTextFiles(root + folders[f])
    data.cache()
    documents = data.map(lambda s: tokenize(s[1])) \
                    .map(lambda s: remove_stopwords(s, stopwords))
    files = data.map(lambda s: s[0]).collect()
    documents.cache()
    hashingTF = HashingTF()
    featurizedData = hashingTF.transform(documents)
    idf = IDF()
    idfModel = idf.fit(featurizedData)
    featurizedData.cache()
    tfidfs = idfModel.transform(featurizedData)
    tfidfs.cache()
    final_rdd = tfidfs.zipWithIndex().map(lambda s: IndexedRow(s[1], s[0]))
    final_rdd.cache()
    # Transpose so columns are documents, then compute pairwise similarities.
    sims = IndexedRowMatrix(final_rdd).toCoordinateMatrix().transpose() \
        .toIndexedRowMatrix().columnSimilarities()
    pairs = sims.entries.map(lambda m: [m.i, m.j, m.value]).collect()
    # columnSimilarities returns only the upper triangle; mirror each pair.
    for p in range(0, len(pairs)):
        pairs.append([pairs[p][1], pairs[p][0], pairs[p][2]])
    results = []
    for p in range(0, len(files)):
        results.append([p, 0, 0.0])
    for p in range(0, len(pairs)):
from pyspark import SparkContext
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.feature import IDF
from pyspark.mllib.clustering import LDA

sc = SparkContext(appName="LatentDirichletAllocationExample")

#--Load and parse the data--#
#datalist = PreProcessUtils.dataPrepare()
#data = sc.parallelize(datalist)
data = sc.textFile('../../data/lda_data.txt')

#--Transform data to tf vectors--#
htf = HashingTF()
# Split each line into tokens before hashing; transforming raw strings
# would hash individual characters instead of words.
tfData = htf.transform(data.map(lambda line: line.strip().split(' ')))

# Transform to tf-idf vectors.
idf = IDF()
idfData = idf.fit(tfData)
tfidf = idfData.transform(tfData)
#parsedData = data.map(lambda line: Vectors.dense([float(x) for x in line.strip().split(' ')]))
print(tfidf.collect())

# Index documents with unique IDs.
corpus = tfidf.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()
print(corpus.collect())

# Cluster the documents into three topics using LDA.
ldaModel = LDA.train(corpus, k=3)

# Output topics. Each is a distribution over words (matching word count vectors).
print("Learned topics (as distributions over vocab of " +
      str(ldaModel.vocabSize()) + " words):")
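# The final print suggests the snippet was cut off. The standard continuation
# from Spark's LDA example, assuming the three topics trained above:
topics = ldaModel.topicsMatrix()
for topic in range(3):
    print("Topic " + str(topic) + ":")
    for word in range(0, ldaModel.vocabSize()):
        print(" " + str(topics[word][topic]))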
def get_tfidf(documents):
    documents_sparse_vectors = get_sparse_vectors(documents)
    idf = IDF()
    model = idf.fit(documents_sparse_vectors)
    tfidf = model.transform(documents_sparse_vectors)
    return tfidf
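# get_sparse_vectors is not shown in the source. A plausible definition,
# assuming it is the usual HashingTF term-frequency step:
from pyspark.mllib.feature import HashingTF, IDF

def get_sparse_vectors(documents):
    # Assumed helper: hash each document's token list into a sparse TF vector.
    return HashingTF().transform(documents)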
    return wordbag

documents = sqlContext.createDataFrame(
    sc.pickleFile('merged_file/part-00000').map(
        lambda x: [x['eval_id'], x['no'], create_wordbag(x), x['professor'],
                   x['lec_code'][:4], x['lec_code'][5], x['eval_total'],
                   x['eval_id']]),
    ['eval_id', 'no', 'words', 'prof_name', 'department', 'grade',
     'eval_total', 'eval_id'])
#users = sqlContext.createDataFrame(sc.pickleFile('merged_file').map(lambda x: (x['mb_no'], x['lec_code'][:4])), ['user', 'department']).orderBy('department')
#for u in users.select('department', 'user').take(10000):
#    print u
'''
professors = documents.select('prof_name').distinct()
department = documents.select('department').distinct()
#grade 1/2/3/4
eval_total = documents.select('eval_total').distinct()  # 1/2/3/4/5
for e in eval_total.collect():
    print e
'''
htf = HashingTF(inputCol='words', outputCol='rawFeatures')
featured = htf.transform(documents)
idf = IDF(inputCol='rawFeatures', outputCol='idf')
idfModel = idf.fit(featured)
tf_idf = idfModel.transform(featured)
normalizer = Normalizer(inputCol='idf', outputCol='idf_norm', p=2.0)
normData = normalizer.transform(tf_idf)
normData.rdd.saveAsPickleFile('idf_normalized')
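# Not in the source: a quick read-back check that the pickled rows round-trip
# (assumes the same sqlContext and sc are still live).
restored = sqlContext.createDataFrame(sc.pickleFile('idf_normalized'))
restored.select('idf_norm').show(5)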