def _predict(self, txt, train_w2c, train_t2c, debug=False, emoticon=True):
    """Predict the sentiment tag ("P"/"N"/"O") for txt with naive Bayes.

    Args:
        txt: raw text to classify.
        train_w2c: mapping tag -> {gram: occurrence count} from training.
        train_t2c: mapping tag -> training sample count for that tag.
        debug: when True, log per-gram likelihood details.
        emoticon: forwarded to feature extraction as gram_icon_mixed.

    Returns:
        The tag whose accumulated likelihood score is highest.
    """
    grams = ST.retrieve_feature(txt,
                                feature_extract_config=self._ngrams_config,
                                gram_icon_mixed=emoticon)
    if debug:
        linfo('begin debug case: %s' % txt)
    tag2score = {"P": 0, "N": 0, "O": 0}
    for w in grams:
        for tag in tag2score:
            # Skip tags with no training samples so _cal_likelihood never
            # sees a zero total count.
            if not train_t2c[tag]:
                continue
            score = self._cal_likelihood(train_w2c[tag].get(w, 0),
                                         train_t2c[tag])
            tag2score[tag] += score
            if debug:
                linfo(
                    'DEBUG probability for gram %s when given tag %s is: %.4f. gram cnt: %s.tag cnt: %s'
                    % (w, tag, score, train_w2c[tag].get(w, 0),
                       train_t2c[tag]))
    # max() with a key is O(n) and clearer than sorting just to take the
    # first element; ties resolve to the first-seen tag in both forms.
    pred_tag = max(tag2score, key=tag2score.get)
    if debug:
        linfo('predict tag2score: %s' % tag2score)
    return pred_tag
def get_feature(self, txt, cache=False):
    """Return the feature bag for txt, consulting the instance cache first.

    When cache is True, a freshly extracted bag is stored in txt2bags so
    later calls for the same text skip re-extraction.
    """
    if txt in self.txt2bags:
        return self.txt2bags[txt]
    bags = ST.retrieve_feature(
        txt, feature_extract_config=self._feature_extract_config)
    if cache:
        self.txt2bags[txt] = bags
    return bags
def _feature_encoding(self, txt):
    """Encode txt as a sparse {gram_id: 1} presence dict.

    Only grams known to the trained vocabulary (present in gram2gid)
    contribute an entry; unknown grams are silently dropped.
    """
    bags = ST.retrieve_feature(
        txt, feature_extract_config=self._feature_extract_config)
    # Dict comprehension replaces the manual membership-test loop.
    return {self.gram2gid[gram]: 1 for gram in bags if gram in self.gram2gid}
def _discretize_gram2gid(self):
    """Build a gram -> integer id mapping over all training texts.

    Ids are assigned 0..N-1 in first-seen order so they can index directly
    into length-N structures (the presence vector in discret_txt and the
    len(gram2gid)-column sparse matrix in build_sparse_X).
    """
    w2id = {}
    for txt in self._train_xs:
        bags = ST.retrieve_feature(
            txt, feature_extract_config=self._feature_extract_config)
        for w in bags:
            if w not in w2id:
                # BUG FIX: was len(w2id) + 1 (1-based), which made the
                # largest id equal len(w2id) and overflow the 0-based
                # vectors/matrix columns sized len(gram2gid) elsewhere.
                w2id[w] = len(w2id)
    linfo('grams cnt: %s' % len(w2id))
    return w2id
def get_feature(self, txt, cache=False):
    """Fetch (and optionally memoize) the feature bag extracted from txt."""
    try:
        # Fast path: text was cached by an earlier call with cache=True.
        return self.txt2bags[txt]
    except KeyError:
        pass
    bags = ST.retrieve_feature(
        txt, feature_extract_config=self._feature_extract_config)
    if cache:
        self.txt2bags[txt] = bags
    return bags
def discret_txt(self, txt):
    """Turn txt into a dense 0/1 presence vector indexed by gram id."""
    vector = [0] * len(self.gram2gid)
    grams = ST.retrieve_feature(
        txt, feature_extract_config=self._feature_extract_config)
    for gram in grams:
        gid = self.gram2gid.get(gram)
        # Unknown grams (not in the trained vocabulary) are ignored.
        if gid is not None:
            vector[gid] = 1
    return vector
def _cal_shard2info(self, shard_indexs):
    """Count tag frequencies and per-tag gram occurrences over one shard.

    Returns:
        (tag2cnt, word2presence): tag2cnt maps each sentiment tag to its
        sample count in the shard; word2presence maps tag -> {gram: count}.
    """
    word2presence = BayesClassifier.Word2Cnt()
    tag2cnt = {"P": 0, "N": 0, "O": 0}
    for idx in shard_indexs:
        text = self._train_xs[idx]
        label = self._train_ys[idx]
        tag2cnt[label] += 1
        grams = ST.retrieve_feature(
            text, feature_extract_config=self._ngrams_config)
        counter = word2presence[label]
        for gram in grams:
            counter[gram] = counter.setdefault(gram, 0) + 1
    return tag2cnt, word2presence
def _cal_shard2info(self, shard_indexs):
    """Aggregate per-tag sample counts and gram presence counts for a shard.

    Returns a (tag2cnt, word2presence) pair: total samples per tag, and a
    per-tag mapping of gram -> number of samples-worth of occurrences.
    """
    word2presence = BayesClassifier.Word2Cnt()
    tag2cnt = dict.fromkeys(("P", "N", "O"), 0)
    for i in shard_indexs:
        tag = self._train_ys[i]
        tag2cnt[tag] += 1
        for w in ST.retrieve_feature(
                self._train_xs[i],
                feature_extract_config=self._ngrams_config):
            tag_counts = word2presence[tag]
            tag_counts.setdefault(w, 0)
            tag_counts[w] += 1
    return tag2cnt, word2presence
def build_sparse_X(self, _xs):
    """Build a scipy CSR presence matrix of shape (len(_xs), len(gram2gid)).

    Row i marks which known grams appear in _xs[i]. Duplicate (row, col)
    pairs are summed by csr_matrix (documented scipy behavior), so a gram
    repeated in one text yields a stored value > 1.
    """
    row_num = len(_xs)
    col_num = len(self.gram2gid)
    rows, cols = [], []
    for i, txt in enumerate(_xs):
        bags = ST.retrieve_feature(
            txt, feature_extract_config=self._feature_extract_config)
        for w in bags:
            gid = self.gram2gid.get(w)
            if gid is not None:
                rows.append(i)
                cols.append(gid)
    # len(rows) replaces the hand-maintained total_cnt counter.
    total_cnt = len(rows)
    linfo('build scipy sparse matrice. total_valid_cnt: %s' % (total_cnt))
    row = np.array(rows)
    col = np.array(cols)
    # np.ones avoids materializing a throwaway Python list of ints.
    data = np.ones(total_cnt, dtype=int)
    # NOTE(review): assumes all gram ids are < len(gram2gid) (0-based);
    # a 1-based id map would overflow the column dimension here — verify
    # against _discretize_gram2gid.
    mtx = sparse.csr_matrix((data, (row, col)), shape=(row_num, col_num))
    return mtx
def _predict(self, txt, train_w2c, train_t2c, debug=False, emoticon=True):
    """Score txt against each sentiment tag and return the best-scoring one.

    train_w2c maps tag -> {gram: count}; train_t2c maps tag -> sample
    count. Tags with no training samples are skipped entirely.
    """
    grams = ST.retrieve_feature(txt,
                                feature_extract_config=self._ngrams_config,
                                gram_icon_mixed=emoticon)
    if debug:
        linfo('begin debug case: %s' % txt)
    tag2score = dict.fromkeys(("P", "N", "O"), 0)
    for gram in grams:
        for tag in tag2score:
            tag_total = train_t2c[tag]
            if not tag_total:
                continue
            gram_cnt = train_w2c[tag].get(gram, 0)
            likelihood = self._cal_likelihood(gram_cnt, tag_total)
            tag2score[tag] += likelihood
            if debug:
                linfo('DEBUG probability for gram %s when given tag %s is: %.4f. gram cnt: %s.tag cnt: %s' % (gram, tag, likelihood, gram_cnt, tag_total))
    # Highest score first; stable sort keeps P/N/O order on ties.
    ranked = sorted(tag2score, key=lambda t: tag2score[t], reverse=True)
    pred_tag = ranked[0]
    if debug:
        linfo('predict tag2score: %s' % tag2score)
    return pred_tag