    def _cross_train(self, fold_sz):
        # k-fold cross validation: for each fold, train on the remaining
        # folds and evaluate on the held-out fold
        rid2shard = ST.random_shardlize(fold_sz, len(self._train_xs), load=True)
        precision = 0
        for fid, sd in rid2shard.items():
            held_out = set(sd)
            tmp_train_xs = [x for i, x in enumerate(self._train_xs) if i not in held_out]
            tmp_train_ys = [y for i, y in enumerate(self._train_ys) if i not in held_out]
            test_set = [(self._feature_encoding(self._train_xs[i]), self._train_ys[i]) for i in sd]
            classifier = self._train(tmp_train_xs, tmp_train_ys)
            p = classify.accuracy(classifier, test_set)
            linfo('maxent classifier precision: %.4f' % p)
            precision += p
        linfo('average maxent classifier precision: %.4f' % (precision / fold_sz))
    def _train(self, shard_sz=10, cross_validation=True):
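        """Shard the training data, accumulate per-tag word-presence counts,
        prune the aggregated features, then evaluate either via k-fold cross
        validation or against the manually tagged data set."""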
        #print self._ngrams_config
        linfo('begin training classifier')
        st = time.time()
        rid2shard = ST.random_shardlize(shard_sz,
                                        len(self._train_xs),
                                        load=True,
                                        path=self.rand_path)

        #rid2word_info = {}
        #total_word2cnt = BayesClassifier.Word2Cnt()
        rid2tag_cnt, rid2word_presence = {}, {}
        total_word2presence = BayesClassifier.Word2Cnt()
        total_tag2cnt = {"P": 0, "N": 0, "O": 0}
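        # accumulate each shard's tag counts and word-presence counts and
        # merge them into the corpus-wide totals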
        for rid in range(1, shard_sz + 1):
            shard = rid2shard[rid]
            #rid2word_info[rid]
            rid2tag_cnt[rid], rid2word_presence[rid] = self._cal_shard2info(
                shard)
            #for tag, w2c in rid2word_info[rid].items():
            #    for w, c in w2c.items():
            #        total_word2cnt[tag].setdefault(w, 0)
            #        total_word2cnt[tag][w] += c
            for tag, w2p in rid2word_presence[rid].items():
                for w, c in w2p.items():
                    total_word2presence[tag].setdefault(w, 0)
                    total_word2presence[tag][w] += c
            for tag, cnt in rid2tag_cnt[rid].items():
                total_tag2cnt[tag] += cnt
        #self._debug_bigram(total_word2presence)
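        # prune the aggregated presence counts (presumably in place, since the
        # result is not reassigned) before they are stored for classification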
        self._prune(total_word2presence, rid2word_presence, total_tag2cnt)
        self.total_w2c, self.total_t2c = total_word2presence, total_tag2cnt
        linfo(self.total_t2c)
        # evaluate either by k-fold cross validation over the shards or by a
        # single train/test pass against the manually tagged data set
        if cross_validation:
            linfo('begin cross validation')
            p, r, f = self._cross_train(total_word2presence, rid2word_presence,
                                        total_tag2cnt, rid2tag_cnt, shard_sz,
                                        rid2shard)
            linfo(
                'Classifier METRIC trained precision: %.4f, recall: %.4f, f-value: %.4f, train cost: %.2fs'
                % (p, r, f, time.time() - st))
        else:
            linfo('begin train and test with manually tagged data set')
            p, r, f = self._all_train(total_word2presence, total_tag2cnt)
            linfo(
                'Manually Tagged Data Classifier METRIC trained precision: %.4f, recall: %.4f, f-value: %.4f, train cost: %.2fs'
                % (p, r, f, time.time() - st))