def build_corpus_features():
    """Assemble the tagging corpus, build its features, and save both.

    Sequence lists are added to a fresh ``pcc.PostagCorpus`` in this order:
    the CoNLL training file, the CoNLL development file (both located via
    ``data.find``), then one Brown sequence list per category listed below.
    ``exfc.ExtendedFeatures`` is then built over the corpus; the corpus is
    saved to ``MODEL_DIR`` and the features to ``MODEL_DIR + "features.txt"``.

    Returns:
        A ``(corpus, features)`` tuple.
    """
    brown_categories = (
        'adventure', 'belles_lettres', 'editorial', 'fiction',
        'government', 'hobbies', 'humor', 'learned', 'lore',
        'mystery', 'news', 'religion', 'reviews', 'romance',
    )

    corpus = pcc.PostagCorpus()

    # Training split: MAX_SENT_SIZE / MAX_NR_SENTENCES are forwarded as the
    # reader's max_sent_len / max_nr_sent arguments.
    corpus.add_sequence_list(
        corpus.read_sequence_list_conll(
            data.find('train-02-21.conll'),
            max_sent_len=MAX_SENT_SIZE,
            max_nr_sent=MAX_NR_SENTENCES))

    # Development split: read with the reader's default limits.
    corpus.add_sequence_list(
        corpus.read_sequence_list_conll(data.find('dev-22.conll')))

    # Brown data is read and added one category at a time.
    for category in brown_categories:
        corpus.add_sequence_list(
            corpus.read_sequence_list_brown(categories=category))

    features = exfc.ExtendedFeatures(corpus)
    features.build_features()

    # Persist the corpus first, then the feature file alongside it.
    corpus.save_corpus(MODEL_DIR)
    features.save_features(MODEL_DIR+"features.txt")
    return corpus, features
def load_model():
    """Reload the saved corpus, feature set, and structured perceptron.

    The corpus is loaded from ``MODEL_DIR``, the features from
    ``MODEL_DIR + "features.txt"``, and the model from ``MODEL_DIR``, in
    that order, since each later object is constructed from the earlier ones.

    Returns:
        A ``(corpus, features, model)`` tuple.
    """
    restored_corpus = pcc.PostagCorpus()
    restored_corpus.load_corpus(MODEL_DIR)

    restored_features = exfc.ExtendedFeatures(restored_corpus)
    feature_path = MODEL_DIR + "features.txt"
    restored_features.load_features(feature_path, restored_corpus)

    # Class name is spelled as the spc module exposes it.
    perceptron = spc.StructuredPercetron(restored_corpus, restored_features)
    perceptron.load_model(MODEL_DIR)

    return restored_corpus, restored_features, perceptron
def build_corpus_features():
    """Build and save a corpus and feature set from the CoNLL training file.

    Only ``../data/train-02-21.conll`` is read (relative to the working
    directory), with ``MAX_SENT_SIZE`` / ``MAX_NR_SENTENCES`` forwarded as
    ``max_sent_len`` / ``max_nr_sent``. The corpus is saved to ``MODEL_DIR``
    and the features to ``MODEL_DIR + "features.txt"``.

    NOTE(review): another ``build_corpus_features`` appears earlier in this
    source; if both live in the same module, this later definition is the
    one that stays bound -- confirm which is intended.

    Returns:
        A ``(corpus, features)`` tuple.
    """
    tagging_corpus = pcc.PostagCorpus()

    training_sequences = tagging_corpus.read_sequence_list_conll(
        "../data/train-02-21.conll",
        max_sent_len=MAX_SENT_SIZE,
        max_nr_sent=MAX_NR_SENTENCES)
    tagging_corpus.add_sequence_list(training_sequences)

    feature_set = exfc.ExtendedFeatures(tagging_corpus)
    feature_set.build_features()

    # Persist the corpus first, then the feature file alongside it.
    tagging_corpus.save_corpus(MODEL_DIR)
    feature_file = MODEL_DIR + "features.txt"
    feature_set.save_features(feature_file)

    return tagging_corpus, feature_set
# Tail of the preceding exercise: print the decoded sequence with its score,
# then the reference sequence to compare against. `y_pred`, `score` and
# `simple` are defined before the visible part of this script.
print y_pred, score
print "Truth test 1"
print simple.test.seq_list[1]
#pdb.set_trace()

# NOTE: Right now, whoever completed Viterbi knows only that their predicted
# sequences match the ones in the guide, but this does not mean that there
# is no bug somewhere.
# Suggestion: make them output the probability of the best sequence. For
# this, they need to divide the best score (returned by Viterbi) by the
# likelihood (obtained by running the forward algorithm).

# Exercise 2.7: train a supervised HMM on CoNLL data and decode the training
# sequences with both Viterbi and posterior decoding.
print "Exercise 2.7"
corpus = pcc.PostagCorpus()
# All three splits are read with the same max_sent_len=15 / max_nr_sent=1000
# arguments (presumably caps on sentence length and sentence count -- confirm
# against the reader). test_seq and dev_seq are not used in the visible part.
train_seq = corpus.read_sequence_list_conll("../data/train-02-21.conll", max_sent_len=15, max_nr_sent=1000)
test_seq = corpus.read_sequence_list_conll("../data/test-23.conll", max_sent_len=15, max_nr_sent=1000)
dev_seq = corpus.read_sequence_list_conll("../data/dev-22.conll", max_sent_len=15, max_nr_sent=1000)
# Disabled: the training sequences are not added to the corpus object here.
#corpus.add_sequence_list(train_seq)
# The HMM is constructed from the corpus's word and tag dictionaries.
hmm = hmmc.HMM(corpus.word_dict, corpus.tag_dict)
hmm.train_supervised(train_seq)
# Decode the same sequences the model was trained on, once per decoder.
viterbi_pred_train = hmm.viterbi_decode_corpus(train_seq)
posterior_pred_train = hmm.posterior_decode_corpus(train_seq)