Ejemplo n.º 1
0
    def load_examples(self, data_name='Not', save_data=False, n_examples=None):
        '''
        Read dependencies from self.filename and encode them as examples.

        self.vocab_to_ints and self.ints_to_vocab are reset and rebuilt on
        every call. A dependency is skipped when its whitespace-split
        sentence has more than self.maxlen tokens or when
        self.criterion(dep) is falsy.

        data_name:  accepted but not used anywhere in this method.
        save_data:  when truthy, pickle the two vocabulary dicts to
                    'plus5_v2i.pkl' and 'plus5_i2v.pkl' (relative paths).
        n_examples: set to some positive integer to only load (up to) that
                    number of examples; it is also forwarded to
                    deps_from_tsv as `limit`.

        Returns a list of (class code, token ids, dep) tuples.
        Raises ValueError if self.filename is None.
        '''
        self.log('Loading examples')
        if self.filename is None:
            raise ValueError('Filename argument to constructor can\'t be None')

        self.vocab_to_ints = {}
        self.ints_to_vocab = {}
        examples = []

        for dep in deps_from_tsv(self.filename, limit=n_examples):
            # Length is checked on the raw sentence, before any processing.
            if len(dep['sentence'].split()) > self.maxlen:
                continue
            if not self.criterion(dep):
                continue

            token_ids = []
            for word in self.process_single_dependency(dep):
                word_id = self.vocab_to_ints.get(word)
                if word_id is None:
                    # Ids start at 1; zero is reserved for padding.
                    word_id = len(self.vocab_to_ints) + 1
                    self.vocab_to_ints[word] = word_id
                    self.ints_to_vocab[word_id] = word
                token_ids.append(word_id)

            examples.append((self.class_to_code[dep['label']], token_ids, dep))
            if n_examples is not None and len(examples) >= n_examples:
                break

        if save_data:
            # Persist both directions of the vocabulary mapping.
            dumps = (
                ('plus5_v2i.pkl', self.vocab_to_ints),
                ('plus5_i2v.pkl', self.ints_to_vocab),
            )
            for path, mapping in dumps:
                with open(path, 'wb') as f:
                    pickle.dump(mapping, f)

        return examples
Ejemplo n.º 2
0
    def load_examples(self, n_examples=None):
        '''
        Read dependencies from self.filename and encode them as examples.

        self.vocab_to_ints and self.ints_to_vocab are reset and rebuilt on
        every call. A dependency is skipped when its whitespace-split
        sentence has more than self.maxlen tokens or when
        self.criterion(dep) is falsy.

        n_examples: set to some positive integer to only load (up to) that
                    number of examples; it is also forwarded to
                    deps_from_tsv as `limit`.

        Returns a list of (class code, token ids, dep) tuples.
        Raises ValueError if self.filename is None.
        '''
        self.log('Loading examples')
        if self.filename is None:
            raise ValueError('Filename argument to constructor can\'t be None')

        self.vocab_to_ints = {}
        self.ints_to_vocab = {}
        examples = []

        for dep in deps_from_tsv(self.filename, limit=n_examples):
            # Length is checked on the raw sentence, before any processing.
            if len(dep['sentence'].split()) > self.maxlen:
                continue
            if not self.criterion(dep):
                continue

            token_ids = []
            for word in self.process_single_dependency(dep):
                word_id = self.vocab_to_ints.get(word)
                if word_id is None:
                    # Ids start at 1; zero is reserved for padding.
                    word_id = len(self.vocab_to_ints) + 1
                    self.vocab_to_ints[word] = word_id
                    self.ints_to_vocab[word_id] = word
                token_ids.append(word_id)

            examples.append((self.class_to_code[dep['label']], token_ids, dep))
            if n_examples is not None and len(examples) >= n_examples:
                break

        return examples
Ejemplo n.º 3
0
import utils
import pickle as pkl
import constants

# Build a word -> index vocabulary from the sentences in `infile` and pickle
# it to 'data/vocab.pkl'.
infile = 'data/agr_50_mostcommon_10K.tsv'

# Seed the vocabulary with the special symbols declared in `constants`.
worddict = {
    constants.pad: constants.pad_idx,
    constants.unk: constants.unk_idx,  # probably we won't need this
    constants.bos: constants.bos_idx,
    constants.eos: constants.eos_idx,
}

# Each previously unseen whitespace-separated word is assigned the current
# size of the dict as its index; words already present keep their index.
for dep in utils.deps_from_tsv(infile):
    for w in dep['sentence'].split():
        worddict.setdefault(w, len(worddict))

with open('data/vocab.pkl', 'wb') as f:
    pkl.dump(worddict, f)

print('| vocabulary size %d' % len(worddict))
print('| done!')