def apply(self):
    """Build a bag-of-words table from ``self.data`` and send it on.

    Runs ``orngText.bagOfWords`` over ``self.data`` using
    ``self.textAttribute``.  When ``self.norm`` is truthy the result is
    passed through ``orngText.Preprocessor_norm`` with ``self.norm`` as its
    argument; when ``self.TFIDF`` is truthy it is passed through the
    preprocessor built by ``orngText.PreprocessorConstructor_tfidf``.  The
    final table is sent on the "Bag-of-Words" channel and ``self.nWords`` is
    set to ``len(newdata.domain.getmetas(orngText.TEXTMETAID))``.

    The progress bar is always finished, even when a processing step
    raises; the exception itself still propagates to the caller.
    """
    # Reserve one pass over the data for each enabled processing stage.
    n_rows = len(self.data)
    total_steps = n_rows
    if self.norm:
        total_steps += n_rows
    if self.TFIDF:
        total_steps += n_rows

    pb = OWGUI.ProgressBar(self, iterations=total_steps)
    try:
        newdata = orngText.bagOfWords(
            self.data,
            textAttribute=self.textAttribute,
            callback=pb.advance)
        if self.norm:
            newdata = orngText.Preprocessor_norm()(
                newdata, self.norm, callback=pb.advance)
        if self.TFIDF:
            # NOTE(review): steps are reserved for TF-IDF above, but no
            # callback is passed here, so the bar is not advanced during
            # this stage -- confirm whether the tfidf preprocessor takes one.
            newdata = orngText.PreprocessorConstructor_tfidf()(newdata)(newdata)
        self.send("Bag-of-Words", newdata)
        self.nWords = len(newdata.domain.getmetas(orngText.TEXTMETAID))
    finally:
        # Without this, a failure in any step above would leave the
        # progress bar unfinished.
        pb.finish()
# Example script: load the "bookexcerpts" table, turn its text into
# normalized TF-IDF bag-of-words vectors, compute pairwise cosine-based
# distances, and run MDS on them to obtain 2-D points for plotting.
import orange
import orngText
import orngMDS

data = orange.ExampleTable("bookexcerpts")
datain = data  # keep a reference to the table as loaded

# process text, obtain TFIDF vectors
p = orngText.Preprocess(language="en")
# NOTE(review): the second argument (0) is presumably the index of the text
# attribute -- confirm against the orngText API.
data = p.removeStopwordsFromExampleTable(data, 0)
data = p.lemmatizeExampleTable(data, 0)
data = orngText.bagOfWords(data)
bof = data  # keep a reference to the bag-of-words table before TF-IDF
data = orngText.PreprocessorConstructor_tfidf()(data)(data)
# NOTE(review): 2 presumably selects the L2 norm -- confirm.
data = orngText.Preprocessor_norm()(data, 2)

# compute distance as 1/cos(fi)
# NOTE(review): the formula above is the original author's description; the
# exact effect of distance=1 is defined by orngText.cos -- confirm.
distance = orngText.cos(data, distance = 1)

# use MDS for visualization
print "running MDS"
mds=orngMDS.MDS(distance)
mds.run(100)  # presumably 100 optimization iterations -- confirm

from pylab import *
colors = ["red", "yellow", "blue"]
# Collect one (x, y, class) triple per example, taking the coordinates from
# the first two components of the corresponding MDS point.
points = []
for (i,d) in enumerate(data):
    points.append((mds.points[i][0], mds.points[i][1], d.getclass()))
# Iterate over the indices of the class variable's values; the loop body
# continues past this point in the file.
for c in range(len(data.domain.classVar.values)):