class Feature1:
    """Feature set 1: words taken from a band of the highest-valued matrix entries.

    Every (word, value) entry of every row in ``f.matrix`` is ranked by value,
    a number of top-ranked entries are skipped, and the words of the entries
    that follow become the feature list.

    NOTE(review): the values are presumably tf-idf weights (the original loop
    variable was named ``tfidf``) -- confirm against the code that builds
    ``f.matrix``.
    """

    def __init__(self, f):
        """Keep references to the data held by ``f`` and create the output writer.

        Args:
            f: source object exposing ``tfidf``, ``matrix``, ``dfs`` and
                ``docs`` attributes.
        """
        self.f = f
        self.tfidf = f.tfidf
        self.matrix = f.matrix
        self.dfs = f.dfs
        self.out = Output("output1")
        # Built-in set rather than the deprecated ``sets.Set``; replaced by a
        # sorted list once _select() has run.
        self.features = set()

    def _select(self):
        """Rebuild ``self.features`` as a sorted list of the selected words.

        Entries from all rows are pooled, so one word can occur several times
        in the ranking; duplicates collapse, which means the result may hold
        fewer words than the number of entries in the selected band.

        Safe to call repeatedly: the words are gathered in a local set, so the
        method no longer depends on ``self.features`` still being a set.
        """
        entries = []
        for _doc_id, row in self.matrix.iteritems():
            entries.extend((value, word) for word, value in row.iteritems())

        total = len(entries)
        # Skip the very top of the ranking: 1/20 of the entries, capped at 100.
        # ``//`` keeps the slice bound an int even under true division.
        start = min(total // 20, 100)
        # Take at most the next 1000 entries.
        end = min(start + 1000, total)

        ranked = sorted(entries, key=operator.itemgetter(0), reverse=True)
        chosen = set(word for _value, word in ranked[start:end])
        self.features = sorted(chosen)

    def write(self):
        """Select the features and hand them to the output writer.

        Passes ``f.docs``, the sorted feature list and the matrix to
        ``self.out.write_data``.
        """
        # Single-argument parenthesised form prints the same text whether
        # ``print`` is a statement or a function.
        print("Building and selecting Feature Set 1")
        self._select()
        self.out.write_data(self.f.docs, self.features, self.matrix)
class Feature2(Feature):
    """Feature set 2: words taken from a band of the highest-valued ``dfs`` entries.

    The (word, value) pairs of ``f.dfs`` are ranked by value, a number of
    top-ranked words are skipped, and the words that follow become the
    feature list.

    NOTE(review): ``dfs`` presumably maps each word to its document
    frequency -- confirm against the code that builds it.
    """

    def __init__(self, f):
        """Keep references to the data held by ``f`` and create the output writer.

        Args:
            f: source object exposing ``tfidf``, ``matrix``, ``dfs`` and
                ``docs`` attributes.
        """
        self.f = f
        self.tfidf = f.tfidf
        self.matrix = f.matrix
        self.dfs = f.dfs
        self.out = Output("output2")
        # Built-in set rather than the deprecated ``sets.Set``; replaced by a
        # sorted list once _select() has run.
        self.features = set()

    def _select(self):
        """Rebuild ``self.features`` as a sorted list of the selected words.

        Safe to call repeatedly: the words are gathered in a local set, so the
        method no longer depends on ``self.features`` still being a set.
        """
        total = len(self.dfs)
        # Skip the very top of the ranking: 1/20 of the words, capped at 100.
        # ``//`` keeps the slice bound an int even under true division.
        start = min(total // 20, 100)
        # Take at most the next 1000 words.
        end = min(start + 1000, total)

        ranked = sorted(
            self.dfs.iteritems(), key=operator.itemgetter(1), reverse=True
        )
        chosen = set(word for word, _value in ranked[start:end])
        self.features = sorted(chosen)

    def write(self):
        """Select the features and hand them to the output writer.

        Passes ``f.docs``, the sorted feature list and the matrix to
        ``self.out.write_data``.
        """
        # Single-argument parenthesised form prints the same text whether
        # ``print`` is a statement or a function.
        print("Building and selecting Feature Set 2")
        self._select()
        self.out.write_data(self.f.docs, self.features, self.matrix)