def load_examples(self, data_name='Not', save_data=False, n_examples=None,
                  vocab_file='plus5_v2i.pkl', inv_vocab_file='plus5_i2v.pkl'):
    """Load and integer-encode examples from ``self.filename``.

    Rebuilds ``self.vocab_to_ints`` / ``self.ints_to_vocab`` as a side
    effect, then returns the encoded examples.

    Args:
        data_name: Unused here; kept for backward compatibility with
            existing callers. (NOTE(review): consider removing once no
            caller passes it.)
        save_data: If True, pickle the two vocab mappings to
            ``vocab_file`` and ``inv_vocab_file``.
        n_examples: Set to a positive integer to load (up to) that
            number of examples; also passed as ``limit`` to
            ``deps_from_tsv``.
        vocab_file: Output path for the token->int mapping. Default
            preserves the previously hard-coded filename.
        inv_vocab_file: Output path for the int->token mapping. Default
            preserves the previously hard-coded filename.

    Returns:
        List of ``(label_code, int_token_list, dep)`` tuples.

    Raises:
        ValueError: If ``self.filename`` is None.
    """
    self.log('Loading examples')
    if self.filename is None:
        raise ValueError('Filename argument to constructor can\'t be None')
    self.vocab_to_ints = {}
    self.ints_to_vocab = {}
    examples = []
    n = 0
    deps = deps_from_tsv(self.filename, limit=n_examples)
    for dep in deps:
        tokens = dep['sentence'].split()
        # Skip sentences that are too long or rejected by the filter.
        if len(tokens) > self.maxlen or not self.criterion(dep):
            continue
        tokens = self.process_single_dependency(dep)
        ints = []
        for token in tokens:
            if token not in self.vocab_to_ints:
                # Indices start at 1 because 0 is reserved for padding.
                x = self.vocab_to_ints[token] = len(self.vocab_to_ints) + 1
                self.ints_to_vocab[x] = token
            ints.append(self.vocab_to_ints[token])
        examples.append((self.class_to_code[dep['label']], ints, dep))
        n += 1
        if n_examples is not None and n >= n_examples:
            break
    if save_data:
        # Persist both directions of the vocabulary mapping.
        with open(vocab_file, 'wb') as f:
            pickle.dump(self.vocab_to_ints, f)
        with open(inv_vocab_file, 'wb') as f:
            pickle.dump(self.ints_to_vocab, f)
    return examples
def load_examples(self, n_examples=None):
    """Load and integer-encode examples from ``self.filename``.

    Set n_examples to some positive integer to only load (up to) that
    number of examples. Rebuilds ``self.vocab_to_ints`` and
    ``self.ints_to_vocab`` as a side effect and returns a list of
    ``(label_code, int_token_list, dep)`` tuples.
    """
    self.log('Loading examples')
    if self.filename is None:
        raise ValueError('Filename argument to constructor can\'t be None')
    self.vocab_to_ints = {}
    self.ints_to_vocab = {}
    examples = []
    count = 0
    for dep in deps_from_tsv(self.filename, limit=n_examples):
        # Guard clauses: too-long sentences first, then the filter.
        if len(dep['sentence'].split()) > self.maxlen:
            continue
        if not self.criterion(dep):
            continue
        encoded = []
        for tok in self.process_single_dependency(dep):
            idx = self.vocab_to_ints.get(tok)
            if idx is None:
                # zero is for pad, so fresh indices start at 1
                idx = len(self.vocab_to_ints) + 1
                self.vocab_to_ints[tok] = idx
                self.ints_to_vocab[idx] = tok
            encoded.append(idx)
        examples.append((self.class_to_code[dep['label']], encoded, dep))
        count += 1
        if n_examples is not None and count >= n_examples:
            break
    return examples
import utils
import pickle as pkl
import constants

# Corpus of dependency examples to build the vocabulary from.
infile = 'data/agr_50_mostcommon_10K.tsv'

# Seed the vocabulary with the special symbols at their fixed indices.
worddict = {
    constants.pad: constants.pad_idx,
    constants.unk: constants.unk_idx,  # probably we won't need this
    constants.bos: constants.bos_idx,
    constants.eos: constants.eos_idx,
}

# Assign each unseen word the next free index, in order of first appearance.
for dep in utils.deps_from_tsv(infile):
    for w in dep['sentence'].split():
        if w not in worddict:
            worddict[w] = len(worddict)

with open('data/vocab.pkl', 'wb') as f:
    pkl.dump(worddict, f)

print('| vocabulary size %d' % len(worddict))
print('| done!')