def read_sequence_list_conll(self, train_file, max_sent_len=100000, max_nr_sent=100000):
    """Read a conll2002 or conll2003 file into a sequence list."""
    instance_list = self.read_conll_instances(train_file, max_sent_len, max_nr_sent)
    seq_list = SequenceList(self.word_dict)  # for indices
    for sent_x in instance_list:
        seq_list.add_sequence(sent_x)
    return seq_list
def prepare_chains(self):
    # training sequences
    self.train = SequenceList(self.x_dict)
    print("Creating training from corpus.")
    with open(self.corpus_file) as IN:
        for c, l in enumerate(IN, 1):
            if c > self.howbig:
                break
            self.train.add_sequence([w for w in l.strip().split(" ")])
def __init__(self):
    # observation vocabulary
    self.x_dict = LabelDictionary(["walk", "shop", "clean", "tennis"])
    # training sequences
    train_seqs = SequenceList(self.x_dict)
    train_seqs.add_sequence(["walk", "walk", "shop", "clean"])
    train_seqs.add_sequence(["walk", "walk", "shop", "clean"])
    train_seqs.add_sequence(["walk", "shop", "shop", "clean"])
    self.train = train_seqs
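# A minimal usage sketch of the same pattern outside a class. The imports are
# assumptions: only the sequences.sequence_list path appears in the snippets
# here, and the LabelDictionary module path is a guess.
from sequences.label_dictionary import LabelDictionary
from sequences.sequence_list import SequenceList

x_dict = LabelDictionary(["walk", "shop", "clean", "tennis"])
train_seqs = SequenceList(x_dict)
train_seqs.add_sequence(["walk", "walk", "shop", "clean"])
first = train_seqs[0]  # indexing yields a single Sequence, as in load_seq below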
def load_seq(s):
    from sequences.sequence_list import SequenceList
    seq_list = SequenceList(word_dict, tag_dict)
    words = []
    tags = []
    line = s.rstrip()
    pairs = line.split(' ')
    for pair in pairs:
        fields = pair.split('_')
        words.append(fields[0])
        tags.append(fields[1])
    seq_list.add_sequence(words, tags)
    return seq_list[0]
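# A hedged example of calling load_seq, assuming word_dict and tag_dict are
# module-level LabelDictionary instances that already cover the tokens and
# tags used (the values below are hypothetical).
word_dict = LabelDictionary(["the", "dog", "barks"])
tag_dict = LabelDictionary(["DT", "NN", "VBZ"])

seq = load_seq("the_DT dog_NN barks_VBZ")  # space-separated "word_tag" pairs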
class TextCorpus:

    def __init__(self, corpus_file, minfreq=0, howbig=10000):
        """
        :param minfreq: minimum frequency of a word in order to be taken into account
        :param howbig: number of sentences to take into account
        """
        self.corpus_file = corpus_file
        self.vocab_file = "{}.vocab".format(self.corpus_file)  # file of form: w\tf\n
        self.minfreq = minfreq
        self.howbig = howbig

        try:
            self.x_dict = LabelDictionary(read_vocab(self.vocab_file, self.minfreq))
        except IOError:
            self.prepare_vocab_dict()
            self.x_dict = LabelDictionary(read_vocab(self.vocab_file, self.minfreq))
        print("LabelDictionary created.")

    def prepare_chains(self):
        # training sequences
        self.train = SequenceList(self.x_dict)
        print("Creating training from corpus.")
        with open(self.corpus_file) as IN:
            for c, l in enumerate(IN, 1):
                if c > self.howbig:
                    break
                self.train.add_sequence([w for w in l.strip().split(" ")])

    def prepare_vocab_dict(self):
        from collections import defaultdict

        vocab_dict = defaultdict(int)
        with open(self.corpus_file) as IN, open(self.vocab_file, "w") as OUT:
            for c, l in enumerate(IN, 1):
                if c > self.howbig:
                    break
                for w in l.strip().split(" "):
                    vocab_dict[w] += 1
            for w, f in vocab_dict.items():
                OUT.write("{}\t{}\n".format(w, f))
        print("Vocabulary file prepared.")
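# A minimal usage sketch for TextCorpus, assuming "corpus.txt" (hypothetical
# path) holds one whitespace-tokenised sentence per line; on first use the
# missing vocab file triggers prepare_vocab_dict() via the IOError branch.
corpus = TextCorpus("corpus.txt", minfreq=1, howbig=5000)
corpus.prepare_chains()          # populates corpus.train with up to `howbig` sequences
first_sentence = corpus.train[0]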
def load_seq(s):
    from sequences.sequence_list import SequenceList
    seq_list = SequenceList(word_dict, tag_dict)
    ex_x = []
    ex_y = []
    # 'string-escape' decoding is a Python 2 idiom; Python 3 strings have no decode()
    contents = s.decode('string-escape').split('\n')
    for line in contents:
        toks = line.split()
        if len(toks) < 2:
            continue
        # CoNLL-style columns: word form in column 1, fine-grained POS in column 4
        pos = toks[4]
        word = toks[1]
        pos = pos.lower()
        assert pos in mapping
        assert word in word_dict
        pos = mapping[pos]
        assert pos in tag_dict
        ex_x.append(word)
        ex_y.append(pos)
    seq_list.add_sequence(ex_x, ex_y)
    return seq_list[0]
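# A hedged sketch of calling this CoNLL-style variant under Python 2 (the
# 'string-escape' decode requires it); mapping, word_dict and tag_dict are
# hypothetical module-level objects with matching contents.
mapping = {"nn": "noun", "vbz": "verb"}
word_dict = LabelDictionary(["dog", "barks"])
tag_dict = LabelDictionary(["noun", "verb"])

conll = "1\tdog\t_\t_\tNN\n2\tbarks\t_\t_\tVBZ"
seq = load_seq(conll)  # column 1 = word form, column 4 = fine-grained POS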
def __init__(self):
    self.word_dict = LabelDictionary()
    self.sequence_list = SequenceList(self.word_dict)