def train(self):
    """Cluster term categories by their naive-Bayes posterior profiles.

    Each term in ``g_term_count`` is mapped through ``term_category``; every
    distinct category becomes one row whose columns are
    ``self.naive.posterior_prob(category, domain)`` for each domain in
    ``self.naive.model.domains``.  The rows are clustered with ``KMeans`` and
    one tab-separated "category, cluster id" line per category is written to
    ``OUTFILE_PATH``.

    Side effects: rebinds ``self.words`` (categories in first-seen order) and
    ``self.words_seen`` (the same categories as a set), and overwrites the
    file at ``OUTFILE_PATH``.
    """
    _logger.info("reading posterior probabilities from naive bayes model")
    self.words = list()
    self.words_seen = set()
    domains = self.naive.model.domains
    # Collect rows in a plain list: np.append copies the whole array on every
    # call, which made the original loop quadratic in the number of terms.
    rows = []
    for term in g_term_count:
        category = term_category(term)
        if category in self.words_seen:
            continue
        self.words_seen.add(category)
        self.words.append(category)
        rows.append([self.naive.posterior_prob(category, domain)
                     for domain in domains])
    _logger.info("%d terms need to be clustered" % len(self.words))
    # The explicit reshape keeps the (n_terms, n_domains) shape even when
    # there are no rows or no domains.
    X = np.array(rows, dtype=float).reshape(len(self.words), len(domains))
    # Floor division keeps n_clusters an integer under true division, and the
    # lower bound of 1 avoids asking KMeans for zero clusters when fewer than
    # ten categories were collected.
    n_clusters = max(1, len(self.words) // 10)
    kmeans = KMeans(n_clusters=n_clusters)
    y = kmeans.fit_predict(X)
    with open(OUTFILE_PATH, "w") as outfile:
        for word, label in zip(self.words, y):
            outfile.write("%s\t%d\n" % (word.encode('utf-8'), label))
    _logger.info("clustering result wrote to %s" % OUTFILE_PATH)
def slim(sentence, clf):
    """Reduce a sentence to its (at most) seven highest-scoring distinct terms.

    ``clf`` must expose a ``named_steps`` mapping with a ``'select'`` step
    (whose ``scores_`` is indexable) and a ``'vert'`` step -- presumably a
    scikit-learn Pipeline; confirm against the caller.  Each distinct
    whitespace-separated term is scored by
    ``scores_[get_vert_idx(vert, term_category(term))]``.

    Returns the kept terms (original spelling, not their categories) joined
    by single spaces, highest score first.
    """
    selector = clf.named_steps['select']
    vert_step = clf.named_steps['vert']

    scored = []
    for token in set(sentence.split()):
        scores = selector.scores_
        column = get_vert_idx(vert_step, term_category(token))
        scored.append((token, scores[column]))

    # Ascending sort on the negated score == descending by score.
    scored.sort(key=lambda pair: -pair[1])
    return ' '.join([token for token, _score in scored[:7]])
def __call__(self, sentence):
    """Return the n-gram features of ``sentence`` plus its term categories.

    Starts from the list returned by ``self.ngram(sentence)`` and appends, for
    every token from ``self.tokens(sentence)``, the token's ``term_category``
    whenever that category differs from the token itself.  The list returned
    by ``self.ngram`` is extended in place and returned.
    """
    features = self.ngram(sentence)
    for token in self.tokens(sentence):
        category = term_category(token)
        # A category equal to the token itself is not appended.
        if token != category:
            features.append(category)
    return features
def train(self):
    """Count, per (category, domain) pair, how many records contain it.

    Reads ``self.train_path`` line by line.  Blank lines are skipped; every
    other line must split on a tab into exactly two fields, ``terms`` and
    ``domain`` (anything else raises ``ValueError`` from the unpacking).
    ``terms`` is split on single spaces, each piece is mapped through
    ``term_category``, and each distinct category in the record bumps
    ``self.count[(category, domain)]`` by one.

    Side effects: rebinds ``self.count`` to a fresh ``defaultdict(int)`` and
    logs progress at debug level every 10000 processed records.
    """
    self.count = defaultdict(int)
    with open(self.train_path) as infile:
        stripped = (raw.strip() for raw in infile)
        non_blank = (record for record in stripped if record)
        for processed, record in enumerate(non_blank, 1):
            terms, domain = record.split('\t')
            seen = set()
            for token in terms.split(' '):
                category = term_category(token)
                # Count each category at most once per record.
                if category in seen:
                    continue
                seen.add(category)
                self.count[(category, domain)] += 1
            if processed % 10000 == 0:
                _logger.debug("%d records processed" % processed)
def get_category(self, term):
    """Return ``word_clustering.get_cluster`` of the term's ``term_category``.

    ``self`` is not used.
    """
    return word_clustering.get_cluster(term_category(term))
def parse(sentence):
    """Map each whitespace-separated term of ``sentence`` to its category.

    Returns the list of ``term_category`` results, or the single-element list
    ``["__empty__"]`` when the sentence contains no terms.
    """
    categories = [term_category(token) for token in sentence.split()]
    # An empty list is falsy, so the placeholder is used only when no term
    # was found.
    return categories or ["__empty__"]
def extract(sent):
    """Keep the (at most) five distinct terms of ``sent`` with the highest gini.

    ``sent`` is split on single spaces; duplicates are dropped, the remaining
    terms are ordered by descending ``get_gini(term_category(term))`` and the
    first five are joined back with single spaces.
    """
    def negated_gini(term):
        # Ascending sort on the negated value == descending by gini.
        return -get_gini(term_category(term))

    ranked = sorted(set(sent.split(' ')), key=negated_gini)
    return ' '.join(ranked[:5])
def parse(sentence):
    """Lazily yield the ``term_category`` of each whitespace-separated term.

    This is a generator function: nothing (not even the split) runs until the
    result is iterated.
    """
    tokens = sentence.split()
    for token in tokens:
        yield term_category(token)
def get_category(self, term):
    """Return ``term_category(term)``; ``self`` is not used."""
    category = term_category(term)
    return category
def __call__(self, sentence):
    """Return the ``term_category`` of every space-separated piece of ``sentence``.

    The sentence is stripped of surrounding whitespace and split on single
    spaces, so consecutive inner spaces produce empty-string pieces that are
    passed to ``term_category`` as well.  A new list is returned.
    """
    categories = []
    for token in sentence.strip().split(' '):
        categories.append(term_category(token))
    return categories