Ejemplo n.º 1
0
def build_corpus_features():
    """Build the POS-tagging corpus and its extended features, then save both.

    Sequence lists are added to the corpus in this order: the CoNLL
    training file, the CoNLL development file, and then one Brown sequence
    list per category.  The corpus is saved under ``MODEL_DIR`` and the
    features are written to ``MODEL_DIR + "features.txt"``.

    Returns:
        A ``(corpus, features)`` tuple.
    """
    brown_categories = (
        'adventure',
        'belles_lettres',
        'editorial',
        'fiction',
        'government',
        'hobbies',
        'humor',
        'learned',
        'lore',
        'mystery',
        'news',
        'religion',
        'reviews',
        'romance',
    )

    corpus = pcc.PostagCorpus()

    # Explicit sentence-length / sentence-count limits are passed only for
    # the training file; the development file is read with the defaults.
    training = corpus.read_sequence_list_conll(
        data.find('train-02-21.conll'),
        max_sent_len=MAX_SENT_SIZE,
        max_nr_sent=MAX_NR_SENTENCES)
    corpus.add_sequence_list(training)

    corpus.add_sequence_list(
        corpus.read_sequence_list_conll(data.find('dev-22.conll')))

    # Each Brown category is read and added on its own, in list order.
    for category in brown_categories:
        corpus.add_sequence_list(
            corpus.read_sequence_list_brown(categories=category))

    features = exfc.ExtendedFeatures(corpus)
    features.build_features()

    # Persist the corpus first, then the features file next to it.
    corpus.save_corpus(MODEL_DIR)
    features.save_features(MODEL_DIR + "features.txt")
    return corpus, features
Ejemplo n.º 2
0
def load_model():
    """Restore the saved corpus, features and perceptron model from ``MODEL_DIR``.

    The corpus is loaded first because both the feature set and the model
    are constructed from it.  Features are read from
    ``MODEL_DIR + "features.txt"``.

    Returns:
        A ``(corpus, features, model)`` tuple.
    """
    loaded_corpus = pcc.PostagCorpus()
    loaded_corpus.load_corpus(MODEL_DIR)

    loaded_features = exfc.ExtendedFeatures(loaded_corpus)
    features_file = MODEL_DIR + "features.txt"
    loaded_features.load_features(features_file, loaded_corpus)

    perceptron = spc.StructuredPercetron(loaded_corpus, loaded_features)
    perceptron.load_model(MODEL_DIR)

    return loaded_corpus, loaded_features, perceptron
Ejemplo n.º 3
0
def build_corpus_features():
    """Build a corpus from the CoNLL training file and save it with its features.

    The training file is read with the ``MAX_SENT_SIZE`` and
    ``MAX_NR_SENTENCES`` limits.  The corpus is saved under ``MODEL_DIR``
    and the features are written to ``MODEL_DIR + "features.txt"``.

    Returns:
        A ``(corpus, features)`` tuple.
    """
    corpus = pcc.PostagCorpus()

    corpus.add_sequence_list(
        corpus.read_sequence_list_conll(
            "../data/train-02-21.conll",
            max_sent_len=MAX_SENT_SIZE,
            max_nr_sent=MAX_NR_SENTENCES))

    features = exfc.ExtendedFeatures(corpus)
    features.build_features()

    # Persist the corpus first, then the features file next to it.
    corpus.save_corpus(MODEL_DIR)
    features_file = MODEL_DIR + "features.txt"
    features.save_features(features_file)

    return corpus, features
Ejemplo n.º 4
0
# Print the prediction and its score, followed by the reference sequence at
# index 1 of the test set.  `y_pred`, `score` and `simple` are defined
# earlier in the script, outside this excerpt.
print y_pred, score
print "Truth test 1"
print simple.test.seq_list[1]

# Debugger breakpoint, left disabled.
#pdb.set_trace()

# NOTE: Right now, whoever completed Viterbi only knows that their predicted
# sequences match the ones in the guide.  That does not mean there is no bug
# somewhere.
# Suggestion: make them output the probability of the best sequence.
# For this, they need to divide the best score (returned by Viterbi)
# by the likelihood (obtained by running the forward algorithm).

# Exercise 2.7
print "Exercise 2.7"
corpus = pcc.PostagCorpus()
# Read the train, test and dev CoNLL files through the same corpus object,
# each with the same limits (max_sent_len=15, max_nr_sent=1000).
train_seq = corpus.read_sequence_list_conll("../data/train-02-21.conll",
                                            max_sent_len=15,
                                            max_nr_sent=1000)
test_seq = corpus.read_sequence_list_conll("../data/test-23.conll",
                                           max_sent_len=15,
                                           max_nr_sent=1000)
dev_seq = corpus.read_sequence_list_conll("../data/dev-22.conll",
                                          max_sent_len=15,
                                          max_nr_sent=1000)
# Adding the training sequences to the corpus is left disabled here.
#corpus.add_sequence_list(train_seq)
# Build an HMM from the corpus word and tag dictionaries and train it
# on the training sequences.
hmm = hmmc.HMM(corpus.word_dict, corpus.tag_dict)
hmm.train_supervised(train_seq)

# Decode the training sequences with both decoders the HMM exposes.
viterbi_pred_train = hmm.viterbi_decode_corpus(train_seq)
posterior_pred_train = hmm.posterior_decode_corpus(train_seq)