Example #1
0
 def _compute_tfid(texts: RDD) -> RDD:
     """Attach a TF-IDF vector to each element of *texts*.

     Hashes each element's ``words`` into term-frequency vectors, fits an
     IDF model over them, and zips the IDF-weighted vectors back onto the
     original elements via ``set_tfidf``.

     NOTE(review): the original return annotation was ``IDFModel``, but the
     function returns the zipped/mapped RDD — annotation corrected. Assumes
     ``set_tfidf`` returns the updated element (fluent style) — TODO confirm.
     """
     tf = HashingTF().transform(texts.map(lambda t: t.words))
     tf.cache()  # reused twice: by IDF().fit and by idf.transform
     idf = IDF().fit(tf)
     tfidfs = idf.transform(tf)
     # zip pairs each original text with its TF-IDF vector, in order
     text_tfs = texts.zip(tfidfs)
     return text_tfs.map(lambda t: t[0].set_tfidf(t[1]))
Example #2
0
 def tfidf(self):
     """Build TF-IDF weights for ``self._sents``.

     Stores the raw term frequencies on ``self._tf``, the fitted IDF model
     on ``self.idf``, and a ``{sentence_index: tfidf_vector}`` dict on
     ``self._tfidf``.
     """
     term_freq = HashingTF().transform(self._sents)
     self._tf = term_freq
     term_freq.cache()  # consumed by both fit() and transform()
     fitted_idf = IDF().fit(term_freq)
     self.idf = fitted_idf
     weighted = fitted_idf.transform(term_freq)
     self._tfidf = dict(enumerate(weighted.collect()))
Example #3
0
    def parseTextRDDToIndex(self, data, label=True):
        """Turn an RDD of space-separated text lines into TF-IDF vectors.

        When *label* is True each line is ``"<label> <word> <word> ..."``;
        the result is an RDD of ``LabeledPoint``s. Otherwise every token is
        a word and the raw TF-IDF vector RDD is returned. Terms appearing in
        fewer than 2 documents are ignored (``minDocFreq=2``).
        """
        if label:
            labels = data.map(lambda line: float(line.split(" ", 1)[0]))
            documents = data.map(lambda line: line.split(" ", 1)[1].split(" "))
        else:
            documents = data.map(lambda line: line.split(" "))

        term_freqs = HashingTF().transform(documents)
        term_freqs.cache()  # read twice: fit() then transform()

        index = IDF(minDocFreq=2).fit(term_freqs).transform(term_freqs)

        if not label:
            return index
        return labels.zip(index).map(
            lambda pair: LabeledPoint(pair[0], pair[1]))
Example #4
0
 def _compute_idf(texts: RDD) -> IDFModel:
     """Fit and return an IDF model over the hashed term frequencies of *texts*."""
     hashed = HashingTF().transform(texts)
     hashed.cache()  # fit() iterates the RDD; caching avoids recomputation
     return IDF().fit(hashed)
training_raw = sc.parallelize(traindata)


# Extract the labels; preservesPartitioning keeps the existing partitioner
# since this map does not change keys.
labels = training_raw.map(
    lambda doc: doc["label"],  # Standard Python dict access
    preservesPartitioning=True
)


# While applying HashingTF only needs a single pass to the data, applying IDF needs two passes:
# First to compute the IDF vector and second to scale the term frequencies by IDF.
tf = HashingTF(numFeatures=numfeatures).transform( ## Use much larger number in practice
    training_raw.map(lambda doc: doc["text"].split(),
    preservesPartitioning=True))

tf.cache()
idf = IDF().fit(tf)
tfidf = idf.transform(tf)

# Combine using zip.
# BUG FIX: zip against the IDF-weighted vectors (tfidf), not the raw term
# frequencies (tf) — the original zipped `tf`, which silently discarded the
# `idf.transform(tf)` result and trained on unweighted counts.
training = labels.zip(tfidf).map(lambda x: LabeledPoint(x[0], x[1]))

# TEST DATA
# Parse the raw label lines into floats (assumes `testlabels` here is an RDD
# of numeric strings defined earlier — TODO confirm; it is reassigned below).
testlabel = testlabels.map(lambda line: float(line))
# Collect texts and labels to the driver and pair them up positionally.
# NOTE(review): indexes t[i] over range(len(l)) — raises IndexError if there
# are fewer texts than labels; verify the two RDDs are the same length.
t = reviewdata1.collect()
l = testlabel.collect()
testdata = [{"text":t[i],"label":l[i]} for i in range(len(l))]

# Re-distribute the paired records as an RDD of dicts, mirroring traindata.
test_raw = sc.parallelize(testdata)
testlabels = test_raw.map(