def build_corpus_features():
    """Populate a PostagCorpus, build extended features, and save both.

    Sequence lists are added to the corpus in this order:

    1. ``train-02-21.conll`` (located through ``data.find``), read with
       ``max_sent_len=MAX_SENT_SIZE`` and ``max_nr_sent=MAX_NR_SENTENCES``;
    2. ``dev-22.conll`` (located through ``data.find``), read with the
       reader's default limits;
    3. one sequence list per entry of ``brown_categories``, each read with
       ``read_sequence_list_brown``.

    ``ExtendedFeatures`` are then built over the corpus.  The corpus is
    saved to ``MODEL_DIR`` and the features to ``MODEL_DIR + "features.txt"``.

    Returns:
        tuple: ``(corpus, features)``.
    """
    brown_categories = (
        'adventure',
        'belles_lettres',
        'editorial',
        'fiction',
        'government',
        'hobbies',
        'humor',
        'learned',
        'lore',
        'mystery',
        'news',
        'religion',
        'reviews',
        'romance',
    )

    tag_corpus = pcc.PostagCorpus()

    # CoNLL training split, capped by the module-level size limits.
    train_path = data.find('train-02-21.conll')
    tag_corpus.add_sequence_list(
        tag_corpus.read_sequence_list_conll(
            train_path,
            max_sent_len=MAX_SENT_SIZE,
            max_nr_sent=MAX_NR_SENTENCES,
        )
    )

    # CoNLL development split, read without explicit limits.
    dev_path = data.find('dev-22.conll')
    tag_corpus.add_sequence_list(tag_corpus.read_sequence_list_conll(dev_path))

    # Each category is passed on its own as a single string.
    for category in brown_categories:
        tag_corpus.add_sequence_list(
            tag_corpus.read_sequence_list_brown(categories=category)
        )

    feature_set = exfc.ExtendedFeatures(tag_corpus)
    feature_set.build_features()

    # MODEL_DIR is concatenated directly with the file name, so it has to
    # end with a path separator already.
    tag_corpus.save_corpus(MODEL_DIR)
    feature_set.save_features(MODEL_DIR + "features.txt")

    return tag_corpus, feature_set
def load_model():
    """Reload the corpus, extended features, and perceptron saved in MODEL_DIR.

    The corpus is loaded first because both the feature object and the
    model are constructed from it.  Features are read from
    ``MODEL_DIR + "features.txt"``; the model is loaded from ``MODEL_DIR``.

    Returns:
        tuple: ``(corpus, features, model)``.
    """
    tag_corpus = pcc.PostagCorpus()
    tag_corpus.load_corpus(MODEL_DIR)

    # MODEL_DIR is concatenated directly with the file name, so it has to
    # end with a path separator already.
    features_path = MODEL_DIR + "features.txt"
    feature_set = exfc.ExtendedFeatures(tag_corpus)
    feature_set.load_features(features_path, tag_corpus)

    perceptron = spc.StructuredPercetron(tag_corpus, feature_set)
    perceptron.load_model(MODEL_DIR)

    return tag_corpus, feature_set, perceptron
def build_corpus_features():
    """Build a corpus from the CoNLL training file and save it with its features.

    Only ``../data/train-02-21.conll`` is read (relative to the current
    working directory), with ``max_sent_len=MAX_SENT_SIZE`` and
    ``max_nr_sent=MAX_NR_SENTENCES``.  ``ExtendedFeatures`` are built over
    the resulting corpus; the corpus is saved to ``MODEL_DIR`` and the
    features to ``MODEL_DIR + "features.txt"``.

    Returns:
        tuple: ``(corpus, features)``.
    """
    tag_corpus = pcc.PostagCorpus()

    tag_corpus.add_sequence_list(
        tag_corpus.read_sequence_list_conll(
            "../data/train-02-21.conll",
            max_sent_len=MAX_SENT_SIZE,
            max_nr_sent=MAX_NR_SENTENCES,
        )
    )

    feature_set = exfc.ExtendedFeatures(tag_corpus)
    feature_set.build_features()

    # MODEL_DIR is concatenated directly with the file name, so it has to
    # end with a path separator already.
    tag_corpus.save_corpus(MODEL_DIR)
    feature_set.save_features(MODEL_DIR + "features.txt")

    return tag_corpus, feature_set
# ---------------------------------------------------------------------------
# Feature construction.  `id_f`, `corpus`, `exfc` (and `spc`, `train_seq`,
# `dev_seq`, `test_seq` used by the disabled code) are expected to be defined
# before this point; their definitions are not visible in this chunk.
# ---------------------------------------------------------------------------

# Build the features of the previously created `id_f` object.
id_f.build_features()

# Disabled experiment: structured perceptron trained on `id_f`
# (20 rounds), decoded and evaluated on train/dev/test.
#sp = spc.StructuredPercetron(corpus,id_f)
#sp.nr_rounds = 20
#sp.train_supervised(train_seq.seq_list)
#
#pred_train = sp.viterbi_decode_corpus(train_seq.seq_list)
#pred_dev = sp.viterbi_decode_corpus(dev_seq.seq_list)
#pred_test = sp.viterbi_decode_corpus(test_seq.seq_list)
#
#eval_train = sp.evaluate_corpus(train_seq.seq_list,pred_train)
#eval_dev = sp.evaluate_corpus(dev_seq.seq_list,pred_dev)
#eval_test = sp.evaluate_corpus(test_seq.seq_list,pred_test)
#
#print "Structured Percetron - ID Features Accuracy Train: %.3f Dev: %.3f Test: %.3f"%(eval_train,eval_dev,eval_test)

# Build the extended feature set over the same corpus.
ex_f = exfc.ExtendedFeatures(corpus)
ex_f.build_features()

# Disabled experiment: the same perceptron run, this time on `ex_f`.
#sp = spc.StructuredPercetron(corpus,ex_f)
#sp.nr_rounds = 20
#sp.train_supervised(train_seq.seq_list)
#
#pred_train = sp.viterbi_decode_corpus(train_seq.seq_list)
#pred_dev = sp.viterbi_decode_corpus(dev_seq.seq_list)
#pred_test = sp.viterbi_decode_corpus(test_seq.seq_list)
#
#eval_train = sp.evaluate_corpus(train_seq.seq_list,pred_train)
#eval_dev = sp.evaluate_corpus(dev_seq.seq_list,pred_dev)
#eval_test = sp.evaluate_corpus(test_seq.seq_list,pred_test)
#
#print "Structured Percetron - Extended Features Accuracy Train: %.3f Dev: %.3f Test: %.3f"%(eval_train,eval_dev,eval_test)
# ---------------------------------------------------------------------------
# Online CRF experiments.  `crfo`, `exfc`, `corpus`, `train_seq`, `dev_seq`,
# `test_seq`, and the initial `feature_mapper` are expected to be defined
# before this point; their definitions are not visible in this chunk.
# ---------------------------------------------------------------------------

# Run 1: CRF using the `feature_mapper` supplied by the preceding code
# (reported below as "ID Features").
crf_online = crfo.CRFOnline(corpus.word_dict, corpus.tag_dict, feature_mapper)
crf_online.num_epochs = 20
crf_online.train_supervised(train_seq)

# Decode, then score, the three splits in train/dev/test order.
pred_train, pred_dev, pred_test = map(
    crf_online.viterbi_decode_corpus,
    (train_seq, dev_seq, test_seq))
eval_train, eval_dev, eval_test = map(
    crf_online.evaluate_corpus,
    (train_seq, dev_seq, test_seq),
    (pred_train, pred_dev, pred_test))

# A single parenthesised expression prints the same text as the bare
# print statement under Python 2.
print("CRF - ID Features Accuracy Train: %.3f Dev: %.3f Test: %.3f"
      % (eval_train, eval_dev, eval_test))

# Run 2: rebuild `feature_mapper` as ExtendedFeatures over the training
# sequences and repeat the same train / decode / evaluate cycle.
feature_mapper = exfc.ExtendedFeatures(train_seq)
feature_mapper.build_features()

crf_online = crfo.CRFOnline(corpus.word_dict, corpus.tag_dict, feature_mapper)
crf_online.num_epochs = 20
crf_online.train_supervised(train_seq)

pred_train, pred_dev, pred_test = map(
    crf_online.viterbi_decode_corpus,
    (train_seq, dev_seq, test_seq))
eval_train, eval_dev, eval_test = map(
    crf_online.evaluate_corpus,
    (train_seq, dev_seq, test_seq),
    (pred_train, pred_dev, pred_test))

print("CRF - Extended Features Accuracy Train: %.3f Dev: %.3f Test: %.3f"
      % (eval_train, eval_dev, eval_test))