Example #1
0
class Clustering(object):
    """Group terms with KMeans using naive Bayes posterior probabilities.

    Every distinct value returned by ``term_category`` becomes one sample;
    its features are the ``posterior_prob`` values of that term for each
    domain of the loaded model.
    """

    def __init__(self, naive_model_path):
        """Load the pickled model and wrap it in a ``NaiveDecoder``.

        :param naive_model_path: path of the pickled naive Bayes model.
        """
        _logger.info("loading naive bayes model from %s" % naive_model_path)
        # SECURITY: unpickling can execute arbitrary code; only load model
        # files that come from a trusted source.
        # NOTE(review): the file is opened in the default (text) mode, as
        # before; pickles written with a binary protocol need "rb" -- confirm
        # how the model file is produced.
        # The context manager closes the handle instead of leaking it.
        with open(naive_model_path) as model_file:
            model = pickle.load(model_file)
        self.naive = NaiveDecoder(model)
        # Placeholder until train() runs; train() rebinds this to a list.
        self.words = dict()

    def train(self):
        """Cluster the distinct terms and write the result to OUTFILE_PATH.

        Iterates ``g_term_count``, maps each entry through ``term_category``
        and keeps the first occurrence of every mapped term.  One line of the
        form ``term<TAB>cluster_id`` is written per kept term, in the order
        the terms were first seen.
        """
        _logger.info("reading posterior probabilities from naive bayes model")
        domains = self.naive.model.domains
        self.words = list()
        self.words_seen = set()
        # Collect feature rows in a plain list and build the array once;
        # np.append copies the whole array on every call.
        rows = list()
        for term in g_term_count:
            term = term_category(term)
            if term in self.words_seen:
                continue
            self.words_seen.add(term)
            self.words.append(term)
            rows.append([self.naive.posterior_prob(term, domain)
                         for domain in domains])
        _logger.info("%d terms need to be clustered" % len(self.words))

        # dtype=float matches the float array the np.append version produced.
        X = np.array(rows, dtype=float).reshape(len(self.words), len(domains))
        # Roughly ten terms per cluster.  Floor division keeps the count an
        # integer, and max() requests at least one cluster when there are
        # fewer than ten terms.
        n_clusters = max(1, len(self.words) // 10)
        kmeans = KMeans(n_clusters=n_clusters)
        y = kmeans.fit_predict(X)

        with open(OUTFILE_PATH, "w") as outfile:
            for word, label in zip(self.words, y):
                outfile.write("%s\t%d\n" % (word.encode('utf-8'), label))
        _logger.info("clustering result wrote to %s" % OUTFILE_PATH)
Example #2
0
 def __init__(self, naive_model_path):
     """Load the pickled model and wrap it in a ``NaiveDecoder``.

     :param naive_model_path: path of the pickled naive Bayes model.
     """
     _logger.info("loading naive bayes model from %s" % naive_model_path)
     # SECURITY: unpickling can execute arbitrary code; only load model
     # files that come from a trusted source.
     # NOTE(review): the file is opened in the default (text) mode, as
     # before; pickles written with a binary protocol need "rb" -- confirm
     # how the model file is produced.
     # The context manager closes the handle instead of leaking it.
     with open(naive_model_path) as model_file:
         model = pickle.load(model_file)
     self.naive = NaiveDecoder(model)
     # Starts out as an empty dict, exactly as before.
     self.words = dict()