def real_test(self):
    """Score the fitted model on the data stored at ``self.test_path``.

    The loaded texts and tags are kept on ``self._test_xs`` and
    ``self._test_ys``. The texts are passed through ``ST.replace_url``
    (``fill='H'``) and ``ST.replace_target`` (``fill='T'``), converted with
    ``self.build_sparse_X`` and handed, together with the tags, to
    ``self.accuracy``.
    """
    texts, tags = ST.load_data(self.test_path)
    self._test_xs, self._test_ys = texts, tags

    # The helpers' return values are ignored, so they presumably rewrite the
    # list in place -- TODO confirm against ST.
    ST.replace_url(texts, fill='H')
    ST.replace_target(texts, fill='T')

    features = self.build_sparse_X(texts)
    self.accuracy(features, tags)
def format_test(self, emoticon=True, parenthesis=True):
    """Preprocess this classifier's test data and write it out via ``format_sparse``.

    The raw texts returned by ``ST.preprocess`` are kept on
    ``self._raw_test_xs``; the processed texts and the index-encoded tags are
    kept on ``self._test_xs`` and ``self._test_ys``.

    Args:
        emoticon: Only selects the output file-name suffix: ``'icon'`` when
            true, ``'no_icon'`` otherwise.
        parenthesis: Accepted for interface compatibility; not used here.

    Raises:
        KeyError: If a loaded tag is missing from ``self.tag2index``
            (assuming ``tag2index`` is a plain dict).
    """
    # NOTE(review): this input path is relative to the working directory,
    # while the output path below is rooted at project_dir -- confirm intended.
    test_path = '../test_data/%s_test_data' % self.classifier_type
    self._test_xs, self._test_ys = ST.load_data(test_path)

    linfo('begin preprocess test data, then sparse')
    self._raw_test_xs, self._test_xs = ST.preprocess(self._test_xs)

    # A list comprehension (rather than map + lambda) guarantees a real,
    # re-iterable list on Python 3 as well as Python 2.
    self._test_ys = [self.tag2index[tag] for tag in self._test_ys]

    suffix = 'icon' if emoticon else 'no_icon'
    output_path = '%s/test_data/%s%s_sparse_test_data_%s' % (
        project_dir, self.flag_prefix, self.classifier_type, suffix)
    self.format_sparse(self._test_xs, self._test_ys, output_path)
def train(self, cross_validation=False, fold_sz=10, test_path='../../test_data/tri_test_data'):
    """Train on the data at ``self._path``, then cross-validate or score on a test file.

    Args:
        cross_validation: When true, delegate to ``self._cross_train`` and
            skip the separate test-file evaluation.
        fold_sz: Passed to ``self._cross_train``; only used when
            ``cross_validation`` is true.
        test_path: File loaded for evaluation when ``cross_validation`` is
            false.
    """
    self._train_xs, self._train_ys = ST.load_data(self._path)
    if not self._emoticon:
        ST.remove_emoticon(self._train_xs)
    self.gram2gid = self._discretize_gram2gid()

    if cross_validation:
        linfo('begin to cross train')
        self._cross_train(fold_sz)
        return

    model = self._train(self._train_xs, self._train_ys)

    self._test_xs, self._test_ys = ST.load_data(test_path)
    ST.replace_url(self._test_xs, fill='H')
    ST.replace_target(self._test_xs, fill='T')

    # Pair every encoded test text with its tag for classify.accuracy.
    labelled = []
    for text, label in zip(self._test_xs, self._test_ys):
        labelled.append((self._feature_encoding(text), label))

    linfo('maxent classifier precision: %.4f' % classify.accuracy(model, labelled))
def train(self):
    """Fit ``self.clf`` on the data at ``self._path`` and then run ``real_test``.

    The loaded texts and tags are kept on ``self._train_xs`` and
    ``self._train_ys``. Emoticons are removed first when ``self._emoticon`` is
    falsy, and ``self.gram2gid`` is rebuilt before the texts are converted
    with ``self.build_sparse_X``.
    """
    texts, tags = ST.load_data(self._path)
    self._train_xs, self._train_ys = texts, tags

    if not self._emoticon:
        ST.remove_emoticon(texts)

    # gram2gid is assigned before build_sparse_X runs; presumably the latter
    # reads it -- TODO confirm.
    self.gram2gid = self._discretize_gram2gid()

    train_features = self.build_sparse_X(texts)
    self.clf.fit(train_features, tags)
    self.real_test()
def train(self, icon=True, cross=False):
    """Load the training data, normalise it and delegate to ``self._train``.

    Args:
        icon: When false, emoticons are removed from the training texts via
            ``ST.remove_emoticon``.
        cross: Forwarded to ``self._train`` as ``cross_validation``.
    """
    texts, tags = ST.load_data(self._train_path)
    self._train_xs, self._train_ys = texts, tags

    # NOTE(review): other call sites pass fill='H' to replace_url; here the
    # value is the boolean True -- confirm which form ST.replace_url expects.
    ST.replace_url(texts, fill=True)

    if not icon:
        ST.remove_emoticon(texts)

    self._train(cross_validation=cross)
def format_test(self, emoticon=True, parenthesis=True):
    """Preprocess this classifier's test data and write it out via ``format_sparse``.

    The raw texts returned by ``ST.preprocess`` are kept on
    ``self._raw_test_xs``; the processed texts and the index-encoded tags are
    kept on ``self._test_xs`` and ``self._test_ys``.

    Args:
        emoticon: Only selects the output file-name suffix: ``'icon'`` when
            true, ``'no_icon'`` otherwise.
        parenthesis: Accepted for interface compatibility; not used here.

    Raises:
        KeyError: If a loaded tag is missing from ``self.tag2index``
            (assuming ``tag2index`` is a plain dict).
    """
    # NOTE(review): this input path is relative to the working directory,
    # while the output path below is rooted at project_dir -- confirm intended.
    test_path = '../test_data/%s_test_data' % self.classifier_type
    self._test_xs, self._test_ys = ST.load_data(test_path)

    linfo('begin preprocess test data, then sparse')
    self._raw_test_xs, self._test_xs = ST.preprocess(self._test_xs)

    # A list comprehension (rather than map + lambda) guarantees a real,
    # re-iterable list on Python 3 as well as Python 2.
    self._test_ys = [self.tag2index[tag] for tag in self._test_ys]

    suffix = 'icon' if emoticon else 'no_icon'
    output_path = '%s/test_data/%s%s_sparse_test_data_%s' % (
        project_dir, self.flag_prefix, self.classifier_type, suffix)
    self.format_sparse(self._test_xs, self._test_ys, output_path)
def _all_train(self, total_word2cnt, total_tag2cnt):
    """Count the tags in the test file and run ``_batch_predict`` on it.

    Args:
        total_word2cnt: Forwarded unchanged to ``self._batch_predict``.
        total_tag2cnt: Forwarded unchanged to ``self._batch_predict``.

    Returns:
        Whatever ``self._batch_predict`` returns, or ``None`` when
        ``self._test_path`` does not exist.

    Raises:
        Exception: If a loaded tag is not one of ``"P"``, ``"N"`` or ``"O"``.
    """
    if not os.path.exists(self._test_path):
        return None

    test_xs, test_ys = ST.load_data(self._test_path)

    # Every known tag starts at zero so tags absent from the file still appear.
    tag_counts = dict.fromkeys(("P", "N", "O"), 0)
    for tag in test_ys:
        if tag not in tag_counts:
            raise Exception('Key Error in tag2cnt. unknown key: %s' % tag)
        tag_counts[tag] += 1

    return self._batch_predict(test_xs, test_ys, total_word2cnt, total_tag2cnt, tag_counts)
def _all_train(self, total_word2cnt, total_tag2cnt):
    """Count the tags in the test file and run ``_batch_predict`` on it.

    Args:
        total_word2cnt: Forwarded unchanged to ``self._batch_predict``.
        total_tag2cnt: Forwarded unchanged to ``self._batch_predict``.

    Returns:
        Whatever ``self._batch_predict`` returns, or ``None`` when
        ``self._test_path`` does not exist.

    Raises:
        Exception: If a loaded tag is not one of ``"P"``, ``"N"`` or ``"O"``.
    """
    if not os.path.exists(self._test_path):
        return None

    test_xs, test_ys = ST.load_data(self._test_path)

    # Every known tag starts at zero so tags absent from the file still appear.
    tag_counts = dict.fromkeys(("P", "N", "O"), 0)
    for tag in test_ys:
        if tag not in tag_counts:
            raise Exception('Key Error in tag2cnt. unknown key: %s' % tag)
        tag_counts[tag] += 1

    return self._batch_predict(test_xs, test_ys, total_word2cnt, total_tag2cnt, tag_counts)
def __init__(self, ct='tri', prefix=''):
    """Validate the configuration and load the index-encoded training data.

    Args:
        ct: Classifier type; must be ``'bi'`` or ``'tri'``.
        prefix: Data-file prefix; must be empty or ``'Dg_'``.

    Raises:
        Exception: If ``prefix`` or ``ct`` is not one of the accepted values.
        KeyError: If a loaded tag is missing from ``TAG2INDEX`` (assuming it
            is a plain dict).
    """
    # Reject bad arguments before any attribute is set or any file is read.
    if prefix and prefix != 'Dg_':
        raise Exception('INVALID PREFIX GIVEN!!!')
    if ct not in ('bi', 'tri'):
        raise Exception('INVALID Classifier Type')

    self.flag_prefix = prefix
    self.classifier_type = ct
    self.train_data_path = '%s/train_data/%s%s_train_data' % (project_dir, prefix, ct)
    self.tag2index = TAG2INDEX

    self._train_xs, self._train_ys = ST.load_data(self.train_data_path)
    # A list comprehension (rather than map + lambda) guarantees a real,
    # re-iterable list on Python 3 as well as Python 2.
    self._train_ys = [self.tag2index[tag] for tag in self._train_ys]

    # feature_config is a module-level name defined outside this block.
    self._feature_extract_config = feature_config

    linfo('feature extract config: %s' % self._feature_extract_config)
    linfo('classifier type %s' % ct)
    linfo('init %s success' % self)