Example #1
0
 def _compute_tfid(texts: RDD) -> RDD:
     """Attach a TF-IDF vector to each element of *texts*.

     Hashes each element's ``words`` into term-frequency vectors, fits an
     IDF model over them, and zips the IDF-weighted vectors back onto the
     original elements via ``set_tfidf``.

     NOTE(review): the original return annotation was ``IDFModel``, but the
     function returns the zipped/mapped RDD — annotation corrected. Assumes
     ``set_tfidf`` returns the updated element (fluent style) — TODO confirm.
     """
     tf = HashingTF().transform(texts.map(lambda t: t.words))
     tf.cache()  # reused twice: by IDF().fit and by idf.transform
     idf = IDF().fit(tf)
     tfidfs = idf.transform(tf)
     # zip pairs each original text with its TF-IDF vector, in order
     text_tfs = texts.zip(tfidfs)
     return text_tfs.map(lambda t: t[0].set_tfidf(t[1]))
Example #2
0
 def tfidf(self):
     """Build TF-IDF weights for ``self._sents``.

     Stores the raw term frequencies on ``self._tf``, the fitted IDF model
     on ``self.idf``, and a ``{sentence_index: tfidf_vector}`` dict on
     ``self._tfidf``.
     """
     term_freq = HashingTF().transform(self._sents)
     self._tf = term_freq
     term_freq.cache()  # consumed by both fit() and transform()
     fitted_idf = IDF().fit(term_freq)
     self.idf = fitted_idf
     weighted = fitted_idf.transform(term_freq)
     self._tfidf = dict(enumerate(weighted.collect()))
Example #3
0
    def parseTextRDDToIndex(self, data, label=True):
        """Turn an RDD of space-separated text lines into TF-IDF vectors.

        When *label* is True each line is ``"<label> <word> <word> ..."``;
        the result is an RDD of ``LabeledPoint``s. Otherwise every token is
        a word and the raw TF-IDF vector RDD is returned. Terms appearing in
        fewer than 2 documents are ignored (``minDocFreq=2``).
        """
        if label:
            labels = data.map(lambda line: float(line.split(" ", 1)[0]))
            documents = data.map(lambda line: line.split(" ", 1)[1].split(" "))
        else:
            documents = data.map(lambda line: line.split(" "))

        term_freqs = HashingTF().transform(documents)
        term_freqs.cache()  # read twice: fit() then transform()

        index = IDF(minDocFreq=2).fit(term_freqs).transform(term_freqs)

        if not label:
            return index
        return labels.zip(index).map(
            lambda pair: LabeledPoint(pair[0], pair[1]))
Example #4
0
 def _compute_idf(texts: RDD) -> IDFModel:
     """Fit and return an IDF model over the hashed term frequencies of *texts*."""
     hashed = HashingTF().transform(texts)
     hashed.cache()  # fit() iterates the RDD; caching avoids recomputation
     return IDF().fit(hashed)
training_raw = sc.parallelize(traindata)


# Extract the labels; preservesPartitioning keeps the existing partitioner
# since this map does not change keys.
labels = training_raw.map(
    lambda doc: doc["label"],  # Standard Python dict access
    preservesPartitioning=True
)


# While applying HashingTF only needs a single pass to the data, applying IDF needs two passes:
# First to compute the IDF vector and second to scale the term frequencies by IDF.
tf = HashingTF(numFeatures=numfeatures).transform( ## Use much larger number in practice
    training_raw.map(lambda doc: doc["text"].split(),
    preservesPartitioning=True))

tf.cache()
idf = IDF().fit(tf)
tfidf = idf.transform(tf)

# Combine using zip.
# BUG FIX: zip against the IDF-weighted vectors (tfidf), not the raw term
# frequencies (tf) — the original zipped `tf`, which silently discarded the
# `idf.transform(tf)` result and trained on unweighted counts.
training = labels.zip(tfidf).map(lambda x: LabeledPoint(x[0], x[1]))

# TEST DATA
# Parse the raw label lines into floats (assumes `testlabels` here is an RDD
# of numeric strings defined earlier — TODO confirm; it is reassigned below).
testlabel = testlabels.map(lambda line: float(line))
# Collect texts and labels to the driver and pair them up positionally.
# NOTE(review): indexes t[i] over range(len(l)) — raises IndexError if there
# are fewer texts than labels; verify the two RDDs are the same length.
t = reviewdata1.collect()
l = testlabel.collect()
testdata = [{"text":t[i],"label":l[i]} for i in range(len(l))]

# Re-distribute the paired records as an RDD of dicts, mirroring traindata.
test_raw = sc.parallelize(testdata)
testlabels = test_raw.map(