Beispiel #1
0
    def train(self):
        """Cluster terms by their posterior-probability profile and dump the result.

        Each key of ``g_term_count`` is mapped through ``term_category``; every
        distinct category becomes one sample whose features are
        ``self.naive.posterior_prob(term, domain)`` for each domain in
        ``self.naive.model.domains``. The samples are clustered with ``KMeans``
        and one tab-separated ``term, cluster id`` line per sample is written
        to ``OUTFILE_PATH``.

        Side effects: resets ``self.words`` and ``self.words_seen`` and
        overwrites ``OUTFILE_PATH``.
        """
        _logger.info("reading posterior probabilities from naive bayes model")
        self.words = list()
        self.words_seen = set()
        domains = self.naive.model.domains
        # Collect feature rows in a plain list: np.append copies the whole
        # array on every call, which made the original loop quadratic.
        rows = []
        for term in g_term_count:
            term = term_category(term)
            if term in self.words_seen:
                continue
            self.words_seen.add(term)
            self.words.append(term)
            rows.append([self.naive.posterior_prob(term, domain)
                         for domain in domains])
        _logger.info("%d terms need to be clustered" % len(self.words))

        # dtype=float matches what appending to the float64 np.array([]) gave;
        # the explicit reshape keeps the (0, n_domains) shape for empty input.
        X = np.array(rows, dtype=float).reshape(len(self.words), len(domains))
        # Floor division keeps n_clusters an int on Python 3 as well, and the
        # lower bound avoids asking KMeans for zero clusters when there are
        # fewer than ten terms.
        n_clusters = max(1, len(self.words) // 10)
        kmeans = KMeans(n_clusters=n_clusters)
        y = kmeans.fit_predict(X)

        with open(OUTFILE_PATH, "w") as outfile:
            for word, label in zip(self.words, y):
                outfile.write("%s\t%d\n" % (word.encode('utf-8'), label))
        _logger.info("clustering result wrote to %s" % OUTFILE_PATH)
Beispiel #2
0
def slim(sentence, clf):
    """Reduce a sentence to its (at most) seven highest-scoring distinct terms.

    The ``'select'`` and ``'vert'`` entries of ``clf.named_steps`` are looked
    up; each distinct whitespace-separated term is scored with
    ``select.scores_[get_vert_idx(vert, term_category(term))]``. The terms are
    returned joined by single spaces, highest score first.
    """
    selector = clf.named_steps['select']
    vectorizer = clf.named_steps['vert']

    scored = []
    for word in list(set(sentence.split())):
        index = get_vert_idx(vectorizer, term_category(word))
        scored.append((word, selector.scores_[index]))

    # Negated score => descending order; the sort is stable for ties.
    scored.sort(key=lambda pair: -pair[1])
    return ' '.join([word for word, _score in scored[:7]])
Beispiel #3
0
 def __call__(self, sentence):
     """Return the n-gram features of ``sentence`` plus term categories.

     Starts from ``self.ngram(sentence)`` and appends, for every token from
     ``self.tokens(sentence)``, its ``term_category`` whenever that category
     differs from the token itself.
     """
     features = self.ngram(sentence)
     for token in self.tokens(sentence):
         category = term_category(token)
         # Only add the category when it carries information beyond the token.
         if token != category:
             features.append(category)
     return features
Beispiel #4
0
 def train(self):
     """Count, per (term category, domain) pair, how many records contain it.

     Reads ``self.train_path`` line by line. Blank lines are skipped; every
     other line must be ``<space-separated terms>\\t<domain>``. Each distinct
     ``term_category`` within a record increments
     ``self.count[(category, domain)]`` once. Progress is logged every
     10000 processed records.
     """
     self.count = defaultdict(int)
     with open(self.train_path) as infile:
         stripped = (raw.strip() for raw in infile)
         records = (record for record in stripped if record)
         # Numbering starts at 1 so it equals the count of records handled.
         for processed, record in enumerate(records, 1):
             terms, domain = record.split('\t')
             seen = set()
             for raw_term in terms.split(' '):
                 category = term_category(raw_term)
                 if category in seen:
                     continue
                 seen.add(category)
                 self.count[(category, domain)] += 1
             if processed % 10000 == 0:
                 _logger.debug("%d records processed" % processed)
 def get_category(self, term):
     """Return the cluster of ``term`` after mapping it via ``term_category``."""
     return word_clustering.get_cluster(term_category(term))
Beispiel #6
0
def parse(sentence):
    """Map each whitespace-separated term of ``sentence`` to its category.

    Returns the list of ``term_category`` results, or ``["__empty__"]`` when
    the sentence contains no terms.
    """
    categories = [term_category(token) for token in sentence.split()]
    # An empty list is falsy, so the placeholder is used only for no terms.
    return categories or ["__empty__"]
Beispiel #7
0
def extract(sent):
    """Keep the (at most) five distinct terms of ``sent`` with the largest gini.

    Terms are obtained by splitting on single spaces, de-duplicated, ranked by
    ``get_gini(term_category(term))`` in descending order, and joined back
    with single spaces.
    """
    def negated_gini(term):
        # Negating the score makes the ascending sort yield descending gini.
        return -get_gini(term_category(term))

    unique_terms = list(set(sent.split(' ')))
    unique_terms.sort(key=negated_gini)
    return ' '.join(unique_terms[:5])
Beispiel #8
0
def parse(sentence):
    """Lazily yield the ``term_category`` of each whitespace-separated term."""
    tokens = sentence.split()
    for token in tokens:
        category = term_category(token)
        yield category
Beispiel #9
0
 def get_category(self, term):
     """Return the category of ``term`` as given by ``term_category``."""
     category = term_category(term)
     return category
Beispiel #10
0
 def __call__(self, sentence):
     """Return the ``term_category`` of every term in ``sentence`` as a list.

     The sentence is stripped of surrounding whitespace and split on single
     spaces before mapping.
     """
     tokens = sentence.strip().split(' ')
     return [term_category(token) for token in tokens]