def TFIDF(source, destination):
    if destination[-1] != '/':
        destination = destination + '/'
    # Read every document under the source directory into an RDD of token lists
    rdd = sc.wholeTextFiles(source).map(lambda name_text: name_text[1].split())
    tf = HashingTF()
    tfVectors = tf.transform(rdd).cache()
    a = tfVectors.collect()
    # Store the TF vectors in individual files, one per document
    ind = 0
    for vector in a:
        dest_path = destination + "TF_%d.txt" % ind
        ind = ind + 1
        with open(dest_path, 'w') as f:
            f.write(str(vector))
    # Compute IDF weights and transform the TF vectors into TF-IDF vectors
    idf = IDF()
    idfModel = idf.fit(tfVectors)
    tfIdfVectors = idfModel.transform(tfVectors)
    # Write all TF-IDF vectors to a single file
    with open(destination + "TF-IDF.txt", 'w') as f:
        f.write(str(tfIdfVectors.collect()))
    try:
        for i in range(0, 100):
            print("# Testing printing")
    except KeyboardInterrupt:
        pass
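# The function above assumes a live SparkContext named `sc` and the MLlib
# feature imports. A minimal, hypothetical driver sketch (the paths are
# placeholders, not from the original example):
from pyspark import SparkContext
from pyspark.mllib.feature import HashingTF, IDF

sc = SparkContext(appName="TFIDFExample")
TFIDF("/data/documents", "/data/tfidf_output/")
sc.stop()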
def generate_tf_idf(twProfilesRdd, numFe):
    """
    Generate (gender, TF-IDF sparse vector) tuples from an RDD of
    (gender, (clean words tuple)) records.
    """
    gtlp = generate_gender_tf(twProfilesRdd, numFe)
    idf = IDF()
    tfVectorsRDD = gtlp.map(lambda tp: tp[1])
    idfModel = idf.fit(tfVectorsRDD)
    idfRdd = idfModel.transform(tfVectorsRDD)
    return (idfRdd.zip(gtlp).map(lambda tp: (tp[1][0], tp[0])), idfModel)
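# generate_gender_tf() is not shown in this example. A minimal sketch of what it
# might look like, assuming pyspark.mllib.feature.HashingTF and input records of
# the form (gender, (clean words tuple)):
def generate_gender_tf(twProfilesRdd, numFe):
    tf = HashingTF(numFeatures=numFe)
    # keep the gender label next to the hashed term-frequency vector
    return twProfilesRdd.map(lambda gw: (gw[0], tf.transform(list(gw[1]))))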
def normTFIDF(fn_tokens_RDD, vecDim, caching=True):
    keysRDD = fn_tokens_RDD.keys()
    tokensRDD = fn_tokens_RDD.values()
    # hash each token list into a term-frequency vector of dimension vecDim
    tfVecRDD = tokensRDD.map(lambda tokens: hashing_vectorize(tokens, vecDim))
    if caching:
        # the RDD is read more than once, so keep it in memory
        tfVecRDD.persist(StorageLevel.MEMORY_ONLY)
    idf = IDF()  # create the IDF estimator
    idfModel = idf.fit(tfVecRDD)  # first pass: compute IDF weights
    tfIdfRDD = idfModel.transform(tfVecRDD)  # second pass: apply the weights
    norm = Normalizer()  # L2 normalizer
    normTfIdfRDD = norm.transform(tfIdfRDD)  # normalize each TF-IDF vector
    zippedRDD = keysRDD.zip(normTfIdfRDD)  # re-attach the keys to the vectors
    return zippedRDD
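# hashing_vectorize() is assumed above but not defined here. A minimal sketch
# using MLlib's HashingTF with the requested dimensionality:
def hashing_vectorize(tokens, vecDim):
    return HashingTF(numFeatures=vecDim).transform(tokens)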
Example #6
    def extract_features(self, feat='tfidf', **kwargs):
        """
        Converts each subtitle into its TF/TFIDF representation.
        Normalizes if necessary.

        Parameters
        --------
        feat : 'tf' or 'tfidf'.
        kwargs: num_features, minDocFreq, or other arguments to be passed
        to the MLLib objects.

        Returns
        --------
        RDD of features with key.
        """

        # transform BOW into TF vectors
        num_features = kwargs.get('num_features', 10000)
        htf = HashingTF(num_features)
        feat_rdd = self.RDD.mapValues(htf.transform).cache()

        # transform TF vectors into IDF vectors
        if feat == 'tfidf':
            keys, tf_vecs = feat_rdd.keys(), feat_rdd.values()
            minDocFreq = kwargs.get('minDocFreq', 2)
            idf = IDF(minDocFreq=minDocFreq)
            idf_model = idf.fit(tf_vecs)
            idf_rdd = idf_model.transform(tf_vecs.map(lambda vec: vec.toArray()))
            feat_rdd = keys.zip(idf_rdd)

        if self.model_type == 'log_reg':
            normalizer = StandardScaler(withMean=True, withStd=True)
            keys, vecs = feat_rdd.keys(), feat_rdd.values()
            norm_model = normalizer.fit(vecs)
            norm_rdd = norm_model.transform(vecs.map(lambda vec: vec.toArray()))
            feat_rdd = keys.zip(norm_rdd)

        return feat_rdd
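# Hypothetical call, assuming `model` is an instance of the (unnamed) class this
# method belongs to and self.RDD holds (key, bag-of-words) pairs:
# feats = model.extract_features(feat='tfidf', num_features=10000, minDocFreq=2)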
Example #7
#####################################################
################# Step 2 TF-IDF #####################
#####################################################

from pyspark.mllib.feature import HashingTF, IDF

# TF step
tf = HashingTF(50000)  # use 50,000 hashed dimensions
vectors = data.map(
    lambda line: (line[0], line[2], tf.transform(line[1])))  # tuples of the form (index, title, tf)

# IDF step
vec = vectors.map(lambda line: line[2])  # keep only the TF vectors
idf = IDF()
idfmodel = idf.fit(vec)
tfIdfVectors = idfmodel.transform(vec)  # TF-IDF result; this completes the TF-IDF step

#####################################################
################## Step 3 SVD #######################
#####################################################

# Compute the SVD
from pyspark.mllib.linalg.distributed import RowMatrix
import numpy as np

tfIdf_matrix = RowMatrix(tfIdfVectors)  # wrap the TF-IDF vectors in a distributed matrix
svd = tfIdf_matrix.computeSVD(100, True)  # compute the SVD, keeping only the 100 largest singular values
u = svd.U
s = svd.s
v = svd.V
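# The objects above follow the pyspark.mllib RowMatrix API: svd.U is a RowMatrix
# whose rows are the documents projected onto the 100 latent dimensions, svd.s
# holds the 100 largest singular values, and svd.V is a local 50000 x 100 matrix
# of term loadings. A small sketch of inspecting them:
print(s)                       # singular values, largest first
print(v.numRows, v.numCols)    # (hashed vocabulary size, number of concepts)
print(u.rows.first())          # first document in the latent space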
Example #8
from pyspark import SparkContext
from pyspark.mllib.feature import HashingTF, IDF

sc = SparkContext()

# Read each tweet file and split it into words
rdd = sc.wholeTextFiles("/usr/local/Cellar/BigDataAdvanced/Assignment1/TwitterStuff/TweetData").map(lambda name_text: name_text[1].split())
tf = HashingTF()
tfVectors = tf.transform(rdd).cache()
a = tfVectors.collect()
count = 0
for vec in a:
    print(vec)
    count = count + 1
    with open("TF_Tweet" + str(count) + ".txt", "w") as f:
        f.write(str(vec))

idf = IDF()
idfModel = idf.fit(tfVectors)
tfIdfVectors = idfModel.transform(tfVectors)
with open("TF-IDF_tweet.txt", "w") as f:
    f.write(str(tfIdfVectors.collect()))

#count = 0
#output = tfIdfVectors.collect()
#for vec in output:
#    print(vec)
#    count = count + 1
#    with open("TF_Wiki" + str(count) + ".txt", "w") as f:
#        f.write(str(vec))
Example #9
data = df.rdd.map(list)
print(data.first())

score = data.map(lambda s: 1.0 if s[1].isdigit() and float(s[1]) == 1.0 else 0.0)
comment = data.map(lambda s: s[3])

print(score.count())
print(comment.count())

tf = HashingTF()
tfVectors = tf.transform(comment).cache()

idf = IDF()
idfModel = idf.fit(tfVectors)
tfIdfVectors = idfModel.transform(tfVectors)

#print(tfIdfVectors.take(3))
# Use the RDD zip operator to join the labels and TF-IDF vectors, then convert them to the LabeledPoint type required by the classification model
zip_score_comment = score.zip(tfIdfVectors)
final_data = zip_score_comment.map(lambda line: LabeledPoint(line[0], line[1]))
train_data, test_data = final_data.randomSplit([0.8, 0.2], seed=0)
print(train_data.take(1))

time_start = time.time()
#SVMModel = SVMWithSGD.train(train_data,iterations=100)
SVMModel = SVMWithSGD.train(train_data, iterations=1000)
time_end = time.time()
cost_time = time_end - time_start
print("spark_svm_en cost_time:", cost_time)
Example #10
                                pattern="\\W")
stopwordsRemover = StopWordsRemover(
    inputCol="words", outputCol="filtered").setStopWords(nltkstop)

regexer = regexTokenizer.transform(TschemaPeople)
stop = stopwordsRemover.transform(regexer)

#tokenizer = Tokenizer(inputCol="filtered", outputCol="words")
#wordsData = tokenizer.transform(stop)
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures")
#hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)

featurizedData = hashingTF.transform(stop)

idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
seenData = idfModel.transform(featurizedData)

(trainingData1, testData1) = seenData.randomSplit([0.6, 0.4], seed=100)

lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData1)
predictions1 = lrModel.transform(testData1)

predictions1.select("FileContent", "label", "probability", "prediction") \
    .orderBy("probability", ascending=False) \
    .show(truncate=30)

evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", metricName="accuracy")

print("the accuracy is:", evaluator.evaluate(predictions1))
folders = os.listdir(root)
folders
for f in range(0, len(folders)):
    print(f)
    print(root + folders[f])
    data = sc.wholeTextFiles(root + folders[f])
    data.cache()
    documents = data.map(lambda s: tokenize(s[1])).map(
        lambda s: remove_stopwords(s, stopwords))
    files = data.map(lambda s: s[0]).collect()
    documents.cache()
    hashingTF = HashingTF()
    featurizedData = hashingTF.transform(documents)
    idf = IDF()
    idfModel = idf.fit(featurizedData)
    featurizedData.cache()
    tfidfs = idfModel.transform(featurizedData)
    tfidfs.cache()
    final_rdd = tfidfs.zipWithIndex().map(lambda s: IndexedRow(s[1], s[0]))
    final_rdd.cache()
    sims = IndexedRowMatrix(final_rdd).toCoordinateMatrix().transpose() \
        .toIndexedRowMatrix().columnSimilarities()
    pairs = sims.entries.map(lambda m: [m.i, m.j, m.value]).collect()
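    # columnSimilarities() returns only the upper triangle (i < j), so the loop
    # below mirrors each pair to make the similarity list symmetric.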
    for p in range(0, len(pairs)):
        pairs.append([pairs[p][1], pairs[p][0], pairs[p][2]])
    results = []
    for p in range(0, len(files)):
        results.append([p, 0, 0.0])

    for p in range(0, len(pairs)):
Example #12
from pyspark import SparkContext
from pyspark.mllib.clustering import LDA
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.feature import IDF

sc = SparkContext(appName="LatentDirichletAllocationExample")  # SparkContext
#--Load and parse the data--#
#datalist=PreProcessUtils.dataPrepare()
#data = sc.parallelize(datalist)
data = sc.textFile('../../data/lda_data.txt')

#--transform data to tf vectors --#
htf = HashingTF()
tfData = htf.transform(data)
#transform to tf-idf vectors
idf = IDF()
idfData = idf.fit(tfData)
tfidf = idfData.transform(tfData)

#parsedData = data.map(lambda line: Vectors.dense([float(x) for x in line.strip().split(' ')]))

print(tfidf.collect())

# Index documents with unique IDs
corpus = tfidf.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()
print(corpus.collect())
# Cluster the documents into three topics using LDA
ldaModel = LDA.train(corpus, k=3)

# Output topics. Each is a distribution over words (matching word count vectors)
print("Learned topics (as distributions over vocab of " +
      str(ldaModel.vocabSize()) + " words):")
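# A hedged follow-up sketch: describeTopics() lists, for each topic, the hashed
# term indices with the largest weights (HashingTF indices do not map back to
# words without an external vocabulary).
for topic_id, (terms, weights) in enumerate(ldaModel.describeTopics(maxTermsPerTopic=10)):
    print("Topic", topic_id, list(zip(terms, weights)))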
def get_tfidf(documents):
    documents_sparse_vectors = get_sparse_vectors(documents)
    idf = IDF()
    model = idf.fit(documents_sparse_vectors)
    tfidf = model.transform(documents_sparse_vectors)
    return tfidf
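# get_sparse_vectors() is not defined in this snippet. A minimal sketch,
# assuming `documents` is an RDD of token lists and MLlib's HashingTF is used:
def get_sparse_vectors(documents):
    return HashingTF().transform(documents)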
Example #14
	return wordbag
	

documents = sqlContext.createDataFrame(
    sc.pickleFile('merged_file/part-00000').map(
        lambda x: [x['eval_id'], x['no'], create_wordbag(x), x['professor'],
                   x['lec_code'][:4], x['lec_code'][5], x['eval_total'], x['eval_id']]),
    ['eval_id', 'no', 'words', 'prof_name', 'department', 'grade', 'eval_total', 'eval_id'])

#users = sqlContext.createDataFrame(sc.pickleFile('merged_file').map(lambda x : (x['mb_no'],x['lec_code'][:4])),['user','department']).orderBy('department')
#for u in users.select('department','user').take(10000):
#	print u
'''
professors = documents.select('prof_name').distinct()
department = documents.select('department').distinct()
#grade	1/2/3/4
eval_total = documents.select('eval_total').distinct() # 1/2/3/4/5

for e in eval_total.collect():
	print e
'''



htf = HashingTF(inputCol='words', outputCol='rawFeatures')
featured = htf.transform(documents)
idf = IDF(inputCol='rawFeatures', outputCol='idf')
idfModel = idf.fit(featured)
tf_idf = idfModel.transform(featured)
normalizer = Normalizer(inputCol='idf', outputCol='idf_norm', p=2.0)
normData = normalizer.transform(tf_idf)

normData.rdd.saveAsPickleFile('idf_normalized')
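# A hedged sketch of loading the result back later, assuming the same
# sqlContext: saveAsPickleFile() stored Row objects, so the DataFrame can be
# rebuilt directly from the pickled RDD.
restored = sqlContext.createDataFrame(sc.pickleFile('idf_normalized'))
restored.select('eval_id', 'idf_norm').show(5)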