Ejemplo n.º 1
0
class Feature1:
    """Build feature set 1 from the ranked values stored in ``f.matrix``.

    Every ``(word, value)`` entry of every row in the matrix is ranked by
    value in descending order.  A leading slice of that ranking is skipped
    and the words in the following window become the feature set.
    """

    # Ranking window: skip min(len // _SKIP_DIVISOR, _MAX_SKIP) leading
    # entries, then keep at most _WINDOW_SIZE entries.
    _SKIP_DIVISOR = 20
    _MAX_SKIP = 100
    _WINDOW_SIZE = 1000

    def __init__(self, f):
        """Keep references to the data held by ``f`` and create the output.

        Args:
            f: object exposing ``tfidf``, ``matrix`` and ``dfs`` attributes,
                plus ``docs`` (read later by ``write``).  ``matrix`` is
                iterated as a mapping of rows, each row being a mapping of
                word to value.
        """
        self.f = f
        self.tfidf = f.tfidf
        self.matrix = f.matrix
        self.dfs = f.dfs
        self.out = Output("output1")
        # Builtin ``set`` instead of the deprecated ``sets.Set``.  After
        # ``_select`` runs this attribute holds a sorted list instead.
        self.features = set()

    def _select(self):
        """Fill ``self.features`` with the sorted list of selected words.

        Words already present in ``self.features`` are kept, so calling this
        method again is safe even though the attribute has become a list.
        """
        values = []
        for row in self.matrix.values():
            # Value first, word second, mirroring the sort key below.
            values.extend((v, k) for (k, v) in row.items())

        length = len(values)
        # Floor division keeps the slice bounds integral on Python 2 and 3.
        start = min(length // self._SKIP_DIVISOR, self._MAX_SKIP)
        end = min(start + self._WINDOW_SIZE, length)
        ranked = sorted(values, key=operator.itemgetter(0), reverse=True)

        # Work on a fresh set: ``self.features`` may already be the list
        # produced by an earlier call, which has no ``add`` method.
        chosen = set(self.features)
        chosen.update(word for _, word in ranked[start:end])
        self.features = sorted(chosen)

    def write(self):
        """Select the features, then hand docs, features and matrix to the output."""
        # Parenthesised single argument prints identically on Python 2 and 3.
        print("Building and selecting Feature Set 1")
        self._select()
        self.out.write_data(self.f.docs, self.features, self.matrix)
Ejemplo n.º 2
0
class Feature2(Feature):
    """Build feature set 2 from the ranked values stored in ``f.dfs``.

    The ``(word, value)`` entries of ``dfs`` are ranked by value in
    descending order.  A leading slice of that ranking is skipped and the
    words in the following window become the feature set.
    """

    # Ranking window: skip min(len // _SKIP_DIVISOR, _MAX_SKIP) leading
    # entries, then keep at most _WINDOW_SIZE entries.
    _SKIP_DIVISOR = 20
    _MAX_SKIP = 100
    _WINDOW_SIZE = 1000

    def __init__(self, f):
        """Keep references to the data held by ``f`` and create the output.

        Args:
            f: object exposing ``tfidf``, ``matrix`` and ``dfs`` attributes,
                plus ``docs`` (read later by ``write``).  ``dfs`` is iterated
                as a mapping of word to value.
        """
        # NOTE(review): the ``Feature`` base initializer is not invoked here;
        # confirm that is intentional.
        self.f = f
        self.tfidf = f.tfidf
        self.matrix = f.matrix
        self.dfs = f.dfs
        self.out = Output("output2")
        # Builtin ``set`` instead of the deprecated ``sets.Set``.  After
        # ``_select`` runs this attribute holds a sorted list instead.
        self.features = set()

    def _select(self):
        """Fill ``self.features`` with the sorted list of selected words.

        Words already present in ``self.features`` are kept, so calling this
        method again is safe even though the attribute has become a list.
        """
        length = len(self.dfs)
        # Floor division keeps the slice bounds integral on Python 2 and 3.
        start = min(length // self._SKIP_DIVISOR, self._MAX_SKIP)
        end = min(start + self._WINDOW_SIZE, length)
        ranked = sorted(self.dfs.items(), key=operator.itemgetter(1), reverse=True)

        # Work on a fresh set: ``self.features`` may already be the list
        # produced by an earlier call, which has no ``add`` method.
        chosen = set(self.features)
        chosen.update(word for word, _ in ranked[start:end])
        self.features = sorted(chosen)

    def write(self):
        """Select the features, then hand docs, features and matrix to the output."""
        # Parenthesised single argument prints identically on Python 2 and 3.
        print("Building and selecting Feature Set 2")
        self._select()
        self.out.write_data(self.f.docs, self.features, self.matrix)