def apply(self):
    """Build a bag-of-words table from ``self.data`` and send it on.

    Runs ``orngText.bagOfWords`` on ``self.data`` using
    ``self.textAttribute``, then optionally normalises the result (when
    ``self.norm`` is truthy; its value is passed on to the normaliser) and
    optionally applies TF-IDF weighting (when ``self.TFIDF`` is truthy).
    The resulting table is sent on the "Bag-of-Words" channel and the
    number of ``orngText.TEXTMETAID`` meta attributes in its domain is
    stored in ``self.nWords``.

    The progress bar is always finished, even if one of the stages raises;
    the exception itself still propagates to the caller.
    """
    # The progress bar is sized as one pass over the data per enabled stage.
    passes = 1
    if self.norm:
        passes += 1
    if self.TFIDF:
        passes += 1
    pb = OWGUI.ProgressBar(self, iterations=passes * len(self.data))
    try:
        newdata = orngText.bagOfWords(
            self.data, textAttribute=self.textAttribute, callback=pb.advance)
        if self.norm:
            newdata = orngText.Preprocessor_norm()(
                newdata, self.norm, callback=pb.advance)
        if self.TFIDF:
            # NOTE(review): no callback is passed to this stage, so the bar
            # is not advanced here although the stage is counted above.
            newdata = orngText.PreprocessorConstructor_tfidf()(newdata)(newdata)
        self.send("Bag-of-Words", newdata)
        self.nWords = len(newdata.domain.getmetas(orngText.TEXTMETAID))
    finally:
        # Previously this was only reached on success, leaving the progress
        # bar unfinished whenever a stage raised.
        pb.finish()
# Example no. 2
# 0
import orange
import orngText
import orngMDS

# Load the corpus. `datain` keeps a reference to the table object as loaded,
# because `data` is rebound by every step below.
data = orange.ExampleTable("bookexcerpts")
datain = data

# Clean the text with an English preprocessor: stop-word removal, then
# lemmatization. The second argument (0) is presumably the index of the
# text attribute -- TODO confirm.
p = orngText.Preprocess(language="en")
data = p.removeStopwordsFromExampleTable(data, 0)
data = p.lemmatizeExampleTable(data, 0)

# Bag-of-words representation; `bof` keeps a reference to it before the
# TF-IDF and normalisation steps rebind `data`.
data = orngText.bagOfWords(data)
bof = data

# TF-IDF: the constructor is called on the data to obtain a preprocessor,
# which is then applied to that same data.
_tfidf = orngText.PreprocessorConstructor_tfidf()(data)
data = _tfidf(data)
# The 2 presumably selects the L2 norm -- TODO confirm.
data = orngText.Preprocessor_norm()(data, 2)

# compute distance as 1/cos(fi)
distance = orngText.cos(data, distance=1)

# use MDS for visualization; 100 is presumably the number of optimisation
# steps -- TODO confirm.
print("running MDS")
mds = orngMDS.MDS(distance)
mds.run(100)

from pylab import *
colors = ["red", "yellow", "blue"]

# One (x, y, class) triple per example: the first two MDS coordinates of the
# i-th point paired with the class value of the i-th example.
points = [
    (mds.points[i][0], mds.points[i][1], d.getclass())
    for i, d in enumerate(data)
]
for c in range(len(data.domain.classVar.values)):