Beispiel #1
0
 def real_test(self):
     """Evaluate the fitted model on the data stored at ``self.test_path``.

     The loaded texts are passed through ``ST.replace_url`` (fill 'H') and
     ``ST.replace_target`` (fill 'T'); the return values of both calls are
     discarded.  The texts are then converted with ``build_sparse_X`` and
     handed, together with the labels, to ``self.accuracy``.
     """
     loaded = ST.load_data(self.test_path)
     self._test_xs, self._test_ys = loaded

     # Normalise the raw texts before feature extraction.
     ST.replace_url(self._test_xs, fill='H')
     ST.replace_target(self._test_xs, fill='T')

     sparse_features = self.build_sparse_X(self._test_xs)
     self.accuracy(sparse_features, self._test_ys)
 def format_test(self, emoticon=True, parenthesis=True):
     """Preprocess the test set and write it out through ``format_sparse``.

     The test file is chosen by ``self.classifier_type``.  The result of
     ``ST.preprocess`` is unpacked into ``self._raw_test_xs`` and
     ``self._test_xs``, and the labels are translated with ``self.tag2index``.

     Args:
         emoticon: selects the output file suffix ('icon' or 'no_icon').
         parenthesis: accepted for interface compatibility; not used here.

     Raises:
         KeyError: if a loaded label is missing from ``self.tag2index``.
     """
     # NOTE(review): this input path is relative to the working directory,
     # while the output path below is rooted at project_dir -- confirm intended.
     test_path = '../test_data/%s_test_data' % self.classifier_type
     self._test_xs, self._test_ys = ST.load_data(test_path)
     linfo('begin preprocess test data, then sparse')
     self._raw_test_xs, self._test_xs = ST.preprocess(self._test_xs)
     # Build a real list: on Python 3 ``map`` yields a one-shot iterator,
     # which would be empty after its first traversal and has no len().
     self._test_ys = [self.tag2index[tag] for tag in self._test_ys]
     suffix = 'icon' if emoticon else 'no_icon'
     out_path = '%s/test_data/%s%s_sparse_test_data_%s' % (
         project_dir, self.flag_prefix, self.classifier_type, suffix)
     self.format_sparse(self._test_xs, self._test_ys, out_path)
    def train(self, cross_validation=False, fold_sz=10,
              test_path='../../test_data/tri_test_data'):
        """Train the classifier, either cross-validated or against a test file.

        Args:
            cross_validation: when true, delegate to ``_cross_train`` and stop.
            fold_sz: forwarded to ``_cross_train``.
            test_path: file loaded for evaluation in the non-cross-validated
                branch.
        """
        self._train_xs, self._train_ys = ST.load_data(self._path)
        if not self._emoticon:
            ST.remove_emoticon(self._train_xs)
        self.gram2gid = self._discretize_gram2gid()

        if cross_validation:
            linfo('begin to cross train')
            self._cross_train(fold_sz)
            return

        model = self._train(self._train_xs, self._train_ys)

        # Load the evaluation data and apply the URL / target replacements.
        self._test_xs, self._test_ys = ST.load_data(test_path)
        ST.replace_url(self._test_xs, fill='H')
        ST.replace_target(self._test_xs, fill='T')

        # Pair every encoded text with its label for classify.accuracy.
        labelled = []
        for text, label in zip(self._test_xs, self._test_ys):
            labelled.append((self._feature_encoding(text), label))

        score = classify.accuracy(model, labelled)
        linfo('maxent classifier precision: %.4f' % score)
Beispiel #4
0
    def train(self):
        """Fit ``self.clf`` on the training data, then run ``real_test``.

        The data is loaded from ``self._path``.  ``ST.remove_emoticon`` is
        applied to the texts when ``self._emoticon`` is false, and
        ``self.gram2gid`` is rebuilt before the sparse matrix is constructed.
        """
        loaded = ST.load_data(self._path)
        self._train_xs, self._train_ys = loaded

        strip_emoticons = not self._emoticon
        if strip_emoticons:
            ST.remove_emoticon(self._train_xs)

        # gram2gid is refreshed before build_sparse_X runs.
        self.gram2gid = self._discretize_gram2gid()
        train_matrix = self.build_sparse_X(self._train_xs)
        self.clf.fit(train_matrix, self._train_ys)

        self.real_test()
    def train(self, icon=True, cross=False):
        """Load and normalise the training data, then delegate to ``_train``.

        Args:
            icon: when false, ``ST.remove_emoticon`` is applied to the texts.
            cross: forwarded to ``_train`` as ``cross_validation``.
        """
        corpus = ST.load_data(self._train_path)
        self._train_xs, self._train_ys = corpus
        ST.replace_url(self._train_xs, fill=True)

        drop_icons = not icon
        if drop_icons:
            ST.remove_emoticon(self._train_xs)

        self._train(cross_validation=cross)
 def train(self, icon=True, cross=False):
     """Prepare the training set and run ``_train`` on it.

     The data at ``self._train_path`` is loaded and passed through
     ``ST.replace_url`` with ``fill=True``.

     Args:
         icon: if false, the texts also go through ``ST.remove_emoticon``.
         cross: passed to ``_train`` as its ``cross_validation`` argument.
     """
     loaded = ST.load_data(self._train_path)
     self._train_xs, self._train_ys = loaded
     ST.replace_url(self._train_xs, fill=True)

     remove_icons = not icon
     if remove_icons:
         ST.remove_emoticon(self._train_xs)

     self._train(cross_validation=cross)
 def format_test(self, emoticon=True, parenthesis=True):
     """Preprocess the test data and emit it via ``format_sparse``.

     The input file name is derived from ``self.classifier_type``.  The
     two values returned by ``ST.preprocess`` are stored as
     ``self._raw_test_xs`` and ``self._test_xs``; labels are mapped to
     indices through ``self.tag2index``.

     Args:
         emoticon: picks the output file suffix, 'icon' or 'no_icon'.
         parenthesis: kept for interface compatibility; unused in this method.

     Raises:
         KeyError: if a loaded label has no entry in ``self.tag2index``.
     """
     # NOTE(review): the input path is relative to the working directory,
     # whereas the output path is built from project_dir -- confirm intended.
     test_path = '../test_data/%s_test_data' % self.classifier_type
     self._test_xs, self._test_ys = ST.load_data(test_path)
     linfo('begin preprocess test data, then sparse')
     self._raw_test_xs, self._test_xs = ST.preprocess(self._test_xs)
     # A list comprehension instead of map(): Python 3's map returns a lazy
     # iterator that can be consumed only once; Python 2 behaviour is unchanged.
     self._test_ys = [self.tag2index[label] for label in self._test_ys]
     icon_suffix = 'icon' if emoticon else 'no_icon'
     sparse_path = '%s/test_data/%s%s_sparse_test_data_%s' % (
         project_dir, self.flag_prefix, self.classifier_type, icon_suffix)
     self.format_sparse(self._test_xs, self._test_ys, sparse_path)
 def _all_train(self, total_word2cnt, total_tag2cnt):
     """Run ``_batch_predict`` on the data at ``self._test_path``.

     Args:
         total_word2cnt: forwarded unchanged to ``_batch_predict``.
         total_tag2cnt: forwarded unchanged to ``_batch_predict``.

     Returns:
         None when ``self._test_path`` does not exist, otherwise whatever
         ``_batch_predict`` returns.

     Raises:
         Exception: if a loaded label is not one of "P", "N" or "O".
     """
     if not os.path.exists(self._test_path):
         return

     texts, labels = ST.load_data(self._test_path)

     # Per-label counts; any label outside the three known tags is an error.
     label_counts = {"P": 0, "N": 0, "O": 0}
     for label in labels:
         if label not in label_counts:
             raise Exception('Key Error in tag2cnt. unknown key: %s' % label)
         label_counts[label] += 1

     return self._batch_predict(texts, labels, total_word2cnt,
                                total_tag2cnt, label_counts)
 def _all_train(self, total_word2cnt, total_tag2cnt):
     """Evaluate against the file at ``self._test_path`` when it exists.

     Labels are tallied per tag and passed, together with the loaded data
     and both count arguments, to ``_batch_predict``.

     Returns:
         The result of ``_batch_predict``, or None if the test file is
         missing.

     Raises:
         Exception: when a label other than "P", "N" or "O" is encountered.
     """
     test_file_present = os.path.exists(self._test_path)
     if not test_file_present:
         return

     samples, tags = ST.load_data(self._test_path)

     tag_totals = {"P": 0, "N": 0, "O": 0}
     for tag in tags:
         if tag not in tag_totals:
             raise Exception('Key Error in tag2cnt. unknown key: %s' % tag)
         tag_totals[tag] += 1

     prediction = self._batch_predict(samples, tags, total_word2cnt,
                                      total_tag2cnt, tag_totals)
     return prediction
    def __init__(self, ct='tri', prefix=''):
        """Validate the configuration and load the training data.

        Args:
            ct: classifier type, either 'bi' or 'tri'.
            prefix: data-file prefix, either empty or 'Dg_'.

        Raises:
            Exception: if ``prefix`` is non-empty and not 'Dg_', or if ``ct``
                is neither 'bi' nor 'tri'.
            KeyError: if a loaded label is missing from ``TAG2INDEX``.
        """
        if prefix and prefix != 'Dg_':
            raise Exception('INVALID PREFIX GIVEN!!!')
        self.flag_prefix = prefix
        self.train_data_path = '%s/train_data/%s%s_train_data' % (
            project_dir, prefix, ct)
        if ct not in ('bi', 'tri'):
            raise Exception('INVALID Classifier Type')
        self.classifier_type = ct
        self.tag2index = TAG2INDEX
        self._train_xs, self._train_ys = ST.load_data(self.train_data_path)
        # Materialise the indices as a list: Python 3's map() is a one-shot
        # iterator, so a second pass over the labels would see nothing.
        self._train_ys = [self.tag2index[tag] for tag in self._train_ys]

        self._feature_extract_config = feature_config
        linfo('feature extract config: %s' % self._feature_extract_config)
        linfo('classifier type %s' % ct)
        linfo('init %s success' % self)
    def __init__(self, ct='tri', prefix=''):
        """Check the arguments, then load the training set and index its labels.

        Args:
            ct: classifier type; must be 'bi' or 'tri'.
            prefix: data-file prefix; must be '' or 'Dg_'.

        Raises:
            Exception: when ``prefix`` is a non-empty value other than 'Dg_',
                or when ``ct`` is not 'bi' or 'tri'.
            KeyError: when a loaded label has no entry in ``TAG2INDEX``.
        """
        if prefix and prefix != 'Dg_':
            raise Exception('INVALID PREFIX GIVEN!!!')
        self.flag_prefix = prefix
        self.train_data_path = '%s/train_data/%s%s_train_data' % (
            project_dir, prefix, ct)
        if ct not in ('bi', 'tri'):
            raise Exception('INVALID Classifier Type')
        self.classifier_type = ct
        self.tag2index = TAG2INDEX
        self._train_xs, self._train_ys = ST.load_data(self.train_data_path)
        # Use a list rather than map(): on Python 3 map() returns a lazy
        # iterator that is exhausted after one traversal and has no len().
        self._train_ys = [self.tag2index[label] for label in self._train_ys]

        self._feature_extract_config = feature_config
        linfo('feature extract config: %s' % self._feature_extract_config)
        linfo('classifier type %s' % ct)
        linfo('init %s success' % self)