def _cross_train(self, fold_sz):
    """Estimate classifier precision over `fold_sz` random shards and log the average.

    Partitions the training data indices into `fold_sz` random shards, then for
    each shard trains a classifier and measures its accuracy, logging each
    per-shard precision and finally the average across shards.

    Args:
        fold_sz: number of random shards to evaluate over.
    """
    rid2shard = ST.random_shardlize(fold_sz, len(self._train_xs), load=True)
    precision = 0
    for fid, sd in rid2shard.items():
        tmp_train_xs = [self._train_xs[i] for i in sd]
        tmp_train_ys = [self._train_ys[i] for i in sd]
        test_set = [(self._feature_encoding(self._train_xs[i]), self._train_ys[i])
                    for i in sd]
        # NOTE(review): the classifier is trained and evaluated on the SAME
        # shard `sd`, so this measures training accuracy rather than held-out
        # cross-validation accuracy — confirm this is intended.
        classifier = self._train(tmp_train_xs, tmp_train_ys)
        p = classify.accuracy(classifier, test_set)
        linfo('maxent classifier precision: %.4f' % p)
        precision += p
    # BUG FIX: '%' binds tighter than '/', so the original
    # `'%.4f' % precision/fold_sz` formatted first and then tried to divide a
    # str by an int, raising TypeError. Average must be computed before
    # formatting.
    linfo('average maxent classifier precision: %.4f' % (precision / fold_sz))
def _train(self, shard_sz=10, cross_validation=True):
    """Train the Bayes classifier from shard-wise word-presence statistics.

    Splits the training data into `shard_sz` random shards, accumulates
    per-shard tag counts and per-tag word-presence counts into file-wide
    totals, prunes the vocabulary, and stores the totals on the instance
    (`self.total_w2c`, `self.total_t2c`). Finally either runs shard-wise
    cross validation or trains/evaluates against the manually tagged set,
    logging the resulting precision/recall/F metrics.

    Args:
        shard_sz: number of random shards to partition the data into.
        cross_validation: if True, evaluate via shard cross validation;
            otherwise train on everything and test on the manual tag set.
    """
    linfo('begin train classifier')
    st = time.time()
    rid2shard = ST.random_shardlize(shard_sz, len(self._train_xs), load=True,
                                    path=self.rand_path)
    rid2tag_cnt, rid2word_presence = {}, {}
    total_word2presence = BayesClassifier.Word2Cnt()
    total_tag2cnt = {"P": 0, "N": 0, "O": 0}
    for rid in range(1, shard_sz + 1):
        shard = rid2shard[rid]
        rid2tag_cnt[rid], rid2word_presence[rid] = self._cal_shard2info(shard)
        # Fold this shard's per-tag word-presence counts into the totals.
        for tag, w2p in rid2word_presence[rid].items():
            for w, c in w2p.items():
                total_word2presence[tag].setdefault(w, 0)
                total_word2presence[tag][w] += c
        for tag, cnt in rid2tag_cnt[rid].items():
            total_tag2cnt[tag] += cnt
    self._prune(total_word2presence, rid2word_presence, total_tag2cnt)
    self.total_w2c, self.total_t2c = total_word2presence, total_tag2cnt
    linfo(self.total_t2c)
    if cross_validation:
        # FIX: corrected 'beign' -> 'begin' in the log message.
        linfo('begin cross validation')
        p, r, f = self._cross_train(total_word2presence, rid2word_presence,
                                    total_tag2cnt, rid2tag_cnt, shard_sz,
                                    rid2shard)
        linfo('Classifier METRIC trained-precision: %.4f. recall: %.4f.f-value:%.4f. train cost used: %.2f'
              % (p, r, f, time.time() - st))
    else:
        # FIX: corrected 'beign' -> 'begin' in the log message.
        linfo('begin train and test with manually tagged data set')
        p, r, f = self._all_train(total_word2presence, total_tag2cnt)
        linfo('Manually Tag Data Classifier METRIC trained-precision: %.4f. recall: %.4f.f-value:%.4f. train cost used: %.2f'
              % (p, r, f, time.time() - st))
def _train(self, shard_sz=10, cross_validation=True):
    """Accumulate shard statistics, prune the vocabulary, and evaluate.

    Partitions training data into `shard_sz` random shards, sums each shard's
    tag counts and per-tag word-presence counts into totals, prunes them, and
    stores the totals on the instance. Then either cross-validates across the
    shards or trains/tests on the manually tagged data, logging the metrics.

    NOTE(review): this method appears to be an exact duplicate of the
    `_train` defined immediately above it in this file; the later definition
    shadows the earlier one — confirm which copy is intended to survive.
    """
    linfo('begin train classifier')
    start = time.time()
    shard_map = ST.random_shardlize(shard_sz, len(self._train_xs),
                                    load=True, path=self.rand_path)
    tag_counts_by_shard = {}
    presence_by_shard = {}
    agg_presence = BayesClassifier.Word2Cnt()
    agg_tags = dict.fromkeys(("P", "N", "O"), 0)
    for shard_id in range(1, shard_sz + 1):
        tag_cnt, word_presence = self._cal_shard2info(shard_map[shard_id])
        tag_counts_by_shard[shard_id] = tag_cnt
        presence_by_shard[shard_id] = word_presence
        # Merge this shard's per-tag word-presence counts into the totals.
        for label, presence in word_presence.items():
            bucket = agg_presence[label]
            for token, n in presence.items():
                bucket[token] = bucket.get(token, 0) + n
        for label, n in tag_cnt.items():
            agg_tags[label] += n
    self._prune(agg_presence, presence_by_shard, agg_tags)
    self.total_w2c, self.total_t2c = agg_presence, agg_tags
    linfo(self.total_t2c)
    if cross_validation:
        linfo('beign cross validation')
        p, r, f = self._cross_train(agg_presence, presence_by_shard, agg_tags,
                                    tag_counts_by_shard, shard_sz, shard_map)
        linfo('Classifier METRIC trained-precision: %.4f. recall: %.4f.f-value:%.4f. train cost used: %.2f'
              % (p, r, f, time.time() - start))
    else:
        linfo('beign train and test with manually tagged data set')
        p, r, f = self._all_train(agg_presence, agg_tags)
        linfo('Manually Tag Data Classifier METRIC trained-precision: %.4f. recall: %.4f.f-value:%.4f. train cost used: %.2f'
              % (p, r, f, time.time() - start))