def build_corpus_features():
    corpus = pcc.PostagCorpus()
    train_seq = corpus.read_sequence_list_conll(data.find('train-02-21.conll'),
                                                max_sent_len=MAX_SENT_SIZE,
                                                max_nr_sent=MAX_NR_SENTENCES)
    corpus.add_sequence_list(train_seq)
    dev_seq = corpus.read_sequence_list_conll(data.find('dev-22.conll'))
    corpus.add_sequence_list(dev_seq)
    # Extend the corpus with sentences from the Brown corpus categories
    categories = ['adventure', 'belles_lettres', 'editorial', 'fiction',
                  'government', 'hobbies', 'humor', 'learned', 'lore',
                  'mystery', 'news', 'religion', 'reviews', 'romance']
    for cat in categories:
        brown_seq = corpus.read_sequence_list_brown(categories=cat)
        corpus.add_sequence_list(brown_seq)
    features = exfc.ExtendedFeatures(corpus)
    features.build_features()
    corpus.save_corpus(MODEL_DIR)
    features.save_features(MODEL_DIR + "features.txt")
    return corpus, features
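
# Illustrative usage sketch (not part of the original script). It assumes this
# module's imports (pcc, exfc, data) and supplies hypothetical values for the
# module-level constants referenced above:
MODEL_DIR = 'models/'        # hypothetical output directory
MAX_SENT_SIZE = 15           # hypothetical cap on sentence length
MAX_NR_SENTENCES = 1000      # hypothetical cap on number of sentences

if __name__ == '__main__':
    corpus, features = build_corpus_features()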
def eval_model(corpus, features, model):
    # Decode and score the WSJ development set
    dev_seq = corpus.read_sequence_list_conll(data.find('dev-22.conll'))
    pred_dev = model.viterbi_decode_corpus_log(dev_seq.seq_list)
    eval_dev = model.evaluate_corpus(dev_seq.seq_list, pred_dev)
    print("Accuracy on WSJ development %f" % eval_dev)
    # Decode and score the WSJ test set
    test_seq = corpus.read_sequence_list_conll(data.find('test-23.conll'))
    pred_test = model.viterbi_decode_corpus_log(test_seq.seq_list)
    eval_test = model.evaluate_corpus(test_seq.seq_list, pred_test)
    print("Accuracy on WSJ test %f" % eval_test)
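
# Illustrative stub (hypothetical, not a real tagger): eval_model() only needs
# an object exposing viterbi_decode_corpus_log() and evaluate_corpus(), so any
# model with this interface can be plugged in.
class IdentityTagger(object):
    def viterbi_decode_corpus_log(self, seq_list):
        # Placeholder "decoder": return the input sequences unchanged.
        return list(seq_list)

    def evaluate_corpus(self, seq_list, predictions):
        # Token-level accuracy between gold and predicted tag sequences.
        total, correct = 0, 0
        for gold, pred in zip(seq_list, predictions):
            for y_gold, y_pred in zip(gold.y, pred.y):
                total += 1
                correct += int(y_gold == y_pred)
        return correct / float(total)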
def corpus_and_sequences():
    corpus = pcc.PostagCorpus()
    train_seq = corpus.read_sequence_list_conll(data.find('train-02-21.conll'),
                                                max_sent_len=10, max_nr_sent=1000)
    dev_seq = corpus.read_sequence_list_conll(data.find('dev-22.conll'),
                                              max_sent_len=10, max_nr_sent=1000)
    test_seq = corpus.read_sequence_list_conll(data.find('test-23.conll'),
                                               max_sent_len=10, max_nr_sent=1000)
    return corpus, train_seq, dev_seq, test_seq
def build_corpus_features():
    corpus = pcc.PostagCorpus()
    train_seq = corpus.read_sequence_list_conll(data.find('train-02-21.conll'),
                                                max_sent_len=MAX_SENT_SIZE,
                                                max_nr_sent=MAX_NR_SENTENCES)
    corpus.add_sequence_list(train_seq)
    features = exfc.ExtendedFeatures(corpus)
    features.build_features()
    corpus.save_corpus(MODEL_DIR)
    features.save_features(MODEL_DIR + "features.txt")
    return corpus, features
def __init__(self, **config):
    corpus = PostagCorpus()
    train_seq = corpus.read_sequence_list_conll(data.find('train-02-21.conll'),
                                                max_sent_len=15,
                                                max_nr_sent=1000)
    dev_seq = corpus.read_sequence_list_conll(data.find('dev-22.conll'),
                                              max_sent_len=15,
                                              max_nr_sent=1000)
    test_seq = corpus.read_sequence_list_conll(data.find('test-23.conll'),
                                               max_sent_len=15,
                                               max_nr_sent=1000)
    # Redo indices so that they are consecutive. Also cast all data to numpy
    # arrays of int32 for compatibility with GPUs and theano, and add a
    # reverse index.
    train_seq, test_seq, dev_seq = compacify(train_seq, test_seq, dev_seq,
                                             theano=True)
    # Get number of words and tags in the corpus
    self.input_size = len(train_seq.x_dict)
    self.output_size = len(train_seq.y_dict)
    # Data-sets
    self.datasets = {
        'train': {
            'input': [np.array(seq.x) for seq in train_seq],
            'output': [np.array(seq.y) for seq in train_seq]
        },
        'dev': {
            'input': [np.array(seq.x) for seq in dev_seq],
            'output': [np.array(seq.y) for seq in dev_seq]
        },
        'test': {
            'input': [np.array(seq.x) for seq in test_seq],
            'output': [np.array(seq.y) for seq in test_seq]
        }
    }
    # Config
    self.config = config
    # Number of samples
    self.nr_samples = {
        sset: len(content['output'])
        for sset, content in self.datasets.items()
    }
    # Length of the longest sentence in any data-set (chain is itertools.chain)
    self.maxL = max(chain(*[[len(seq) for seq in content['input']]
                            for content in self.datasets.values()]))
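
# Self-contained sketch (illustrative only) of the index-compaction idea
# behind the compacify() call above, using plain dicts instead of the
# toolkit's LabelDictionary:
def _compact_index_map(used_indices):
    """Map a sparse set of indices onto consecutive ones, preserving order."""
    return {old: new for new, old in enumerate(sorted(used_indices))}

# Example: used indices {0, 4, 7} map to {0: 0, 4: 1, 7: 2}, so a downstream
# embedding matrix needs only 3 rows instead of 8.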
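# Hypothetical helper (not part of the toolkit): summarizes the structure
# built by __init__ above, using only the attributes it defines.
def describe_datasets(reader):
    for sset in ('train', 'dev', 'test'):
        inputs = reader.datasets[sset]['input']
        print("%s: %d sentences, longest sentence %d tokens" %
              (sset, reader.nr_samples[sset], max(len(x) for x in inputs)))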
import os
import urllib.request

from lxmls import data


def download_embeddings(embedding_name, target_file):
    '''
    Downloads a file over HTTP with a progress report.
    Adapted from Stack Overflow:
    http://stackoverflow.com/questions/22676/how-do-i-download-a-file-over-http-using-python
    '''
    # Embedding download URLs
    if embedding_name == 'senna_50':
        # senna_50 embeddings
        source_url = 'http://lxmls.it.pt/2015/wp-content/uploads/2015/senna_50'
    else:
        raise ValueError("I do not have embeddings %s for download" % embedding_name)
    target_file_name = os.path.basename(data.find('senna_50'))
    u = urllib.request.urlopen(source_url)
    with open(target_file, 'wb') as f:
        meta = u.info()
        # In Python 3, u.info() returns an http.client.HTTPMessage, which
        # provides get() rather than the Python 2 getheaders().
        file_size = int(meta.get("Content-Length"))
        file_size_dl = 0
        block_sz = 8192
        print("Downloading: %s Bytes: %s" % (target_file_name, file_size))
        while True:
            text_buffer = u.read(block_sz)
            if not text_buffer:
                break
            file_size_dl += len(text_buffer)
            f.write(text_buffer)
            status = "%10d  [%3.2f%%]" % (file_size_dl,
                                          file_size_dl * 100. / file_size)
            # Overwrite the previous status line; the Python 2 backspace
            # trick (chr(8)) does not work with the print() function.
            print(status, end='\r')
    print("")
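# Minimal usage sketch (hypothetical target path; 'senna_50' is the only
# embedding name the downloader above knows about):
if __name__ == '__main__':
    download_embeddings('senna_50', '/tmp/senna_50')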
import codecs
import gzip
from os.path import dirname

import numpy as np

# This is also needed for theano=True
# from nltk.corpus import brown

from lxmls.sequences.label_dictionary import *
from lxmls.sequences.sequence import *
from lxmls.sequences.sequence_list import *
from lxmls import data

# Train and test files for the English WSJ part of the Penn Treebank
data.find('train-02-21.conll')
data.find('dev-22.conll')
data.find('test-23.conll')

# Train and test files for the Portuguese Floresta Sintatica
data.find('pt_train.txt')
pt_dev = ""
data.find('pt_test.txt')


def compacify(train_seq, test_seq, dev_seq, theano=False):
    """
    Create a map for indices that is compact (i.e., has no unused indices).
    """
    # REDO DICTS
    new_x_dict = LabelDictionary()
    new_y_dict = LabelDictionary(['noun'])
    for corpus_seq in [train_seq, test_seq, dev_seq]: