Example #1
    def reinit(self, vocabs, parse_files):
        """ """

        self.preopen_parse_file = parse_files

        self._vocabs = vocabs
        self._multibuckets = [
            Multibucket.from_configurable(vocab,
                                          name='%s-%s' %
                                          (self.name, vocab.name))
            for vocab in self.vocabs
        ]
        self._metadata = []

        with Bucketer.from_configurable(self,
                                        self.n_buckets,
                                        name='bucketer-%s' %
                                        self.name) as bucketer:
            splits = bucketer.compute_splits(
                len(sent) for sent, metadata in self.iterfiles())
            # Pad each bucket length by one to make room for the ROOT index
            # prepended to every sentence below.
            for i in range(len(splits)):
                splits[i] += 1
        for multibucket, vocab in self.iteritems():
            multibucket.open(splits, depth=vocab.depth)
        for sent, metadata in self.iterfiles():
            self._metadata.append(metadata)
            for multibucket, vocab in self.iteritems():
                tokens = [line[vocab.conll_idx] for line in sent]
                idxs = [vocab.ROOT] + [vocab.index(token) for token in tokens]
                multibucket.add(idxs, tokens)
        for multibucket in self:
            multibucket.close()
        self._multibucket = Multibucket.from_dataset(self)

        return
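Example #1 buckets sentences by length before indexing them: Bucketer.compute_splits picks bucket boundaries from the lengths of all sentences, each boundary is padded by one for the prepended ROOT index, and every sentence then goes into the smallest bucket that fits. A minimal sketch of that idea, assuming quantile-based boundaries; compute_splits and assign_bucket here are hypothetical stand-ins, not the library's actual Bucketer/Multibucket API.

import numpy as np

def compute_splits(lengths, n_buckets):
    # Hypothetical: bucket boundaries at evenly spaced quantiles of the lengths.
    quantiles = np.linspace(0, 100, n_buckets + 1)[1:]
    return sorted(set(int(np.percentile(lengths, q)) for q in quantiles))

def assign_bucket(length, splits):
    # Place a sentence in the smallest bucket whose maximum length can hold it.
    for i, split in enumerate(splits):
        if length <= split:
            return i
    return len(splits) - 1

lengths = [3, 7, 7, 12, 15, 21, 40]
splits = [s + 1 for s in compute_splits(lengths, n_buckets=3)]  # +1 for ROOT
print(splits)                                           # bucket boundaries
print([assign_bucket(l + 1, splits) for l in lengths])  # bucket index per sentence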
Example #2
    def __init__(self, vocabs, *args, **kwargs):
        """ """

        nlp_model = kwargs.pop('nlp_model', None)
        if "parse_files" in kwargs and isinstance(
                kwargs["parse_files"],
                io.StringIO):  ### SPECIAL CASE - PARSING StringIO
            self.preopen_parse_file = kwargs.pop(
                "parse_files"
            )  # This doesn't really play well with the configparser thing
        else:
            self.preopen_parse_file = None
        super(Dataset, self).__init__(*args, **kwargs)

        self._vocabs = vocabs
        self._multibuckets = [
            Multibucket.from_configurable(vocab,
                                          name='%s-%s' %
                                          (self.name, vocab.name))
            for vocab in self.vocabs
        ]
        self._metadata = []

        if nlp_model is not None:
            self._nlp_model = nlp_model.from_configurable(self, name=self.name)
        else:
            self._nlp_model = None

        with Bucketer.from_configurable(self,
                                        self.n_buckets,
                                        name='bucketer-%s' %
                                        self.name) as bucketer:
            splits = bucketer.compute_splits(
                len(sent) for sent, metadata in self.iterfiles())
            # Pad each bucket length by one to make room for the ROOT index
            # prepended to every sentence below.
            for i in range(len(splits)):
                splits[i] += 1
        for multibucket, vocab in self.iteritems():
            multibucket.open(splits, depth=vocab.depth)
        for sent, metadata in self.iterfiles():
            self._metadata.append(metadata)
            for multibucket, vocab in self.iteritems():
                tokens = [line[vocab.conll_idx] for line in sent]
                idxs = [vocab.ROOT] + [vocab.index(token) for token in tokens]
                multibucket.add(idxs, tokens)
        for multibucket in self:
            multibucket.close()
        self._multibucket = Multibucket.from_dataset(self)

        return
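Example #2 pops nlp_model and a possible StringIO parse_files out of kwargs before handing the rest to the parent constructor, so the config-driven base class never sees arguments it cannot handle. A minimal sketch of that kwargs-popping pattern with a plain base class; Base and Child are hypothetical and only the pattern matches the snippet above.

import io

class Base(object):
    def __init__(self, **kwargs):
        # Stands in for the config-driven parent: it only accepts plain options.
        self.options = kwargs

class Child(Base):
    def __init__(self, *args, **kwargs):
        # Consume the extras first so they never reach Base.__init__.
        nlp_model = kwargs.pop('nlp_model', None)
        parse_files = kwargs.pop('parse_files', None)
        # Keep an already-open StringIO around instead of a filename.
        self.preopen_parse_file = parse_files if isinstance(parse_files, io.StringIO) else None
        super(Child, self).__init__(*args, **kwargs)
        self._nlp_model = nlp_model

c = Child(parse_files=io.StringIO(u"1\tword\n"), n_buckets=2)
print(c.options)  # {'n_buckets': 2}: the popped keys were filtered out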
Example #3
 def __init__(self, token_vocab, *args, **kwargs):
   """ """
   
   recount = kwargs.pop('recount', False)
   initialize_zero = kwargs.pop('initialize_zero', False)
   super(TokenVocab, self).__init__(*args, **kwargs)
   
   self._token_vocab = token_vocab
   self._token_counts = Counter()
   self._multibucket = Multibucket.from_configurable(self, embed_model=self.embed_model, name=self.name)
   self._tok2idx = {}
   
   if recount:
     self.count()
   else:
     if os.path.isfile(self.filename):
       self.load()
     else:
       self.count()
       self.dump()
   self.index_vocab()
   
   embed_dims = [len(self), self.embed_size]
   if initialize_zero:
     self._embeddings_array = np.zeros(embed_dims)
   else:
     self._embeddings_array = np.random.randn(*embed_dims)
   return
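The TokenVocab constructor above either recounts token frequencies, loads a cached count file, or counts and then dumps one, before indexing the vocabulary. A minimal sketch of that count / dump / load caching with a Counter; TinyVocab, its tab-separated cache format, and the corpus iterable are assumptions, not the original classes.

import os
from collections import Counter

class TinyVocab(object):
    def __init__(self, corpus, cache_file, recount=False):
        self.counts = Counter()
        if recount or not os.path.isfile(cache_file):
            self.count(corpus)     # (re)build the counts from the corpus
            self.dump(cache_file)  # and cache them for the next run
        else:
            self.load(cache_file)  # reuse the cached counts
        # Index tokens by frequency, most frequent first.
        self.tok2idx = {tok: i for i, (tok, _) in enumerate(self.counts.most_common())}

    def count(self, corpus):
        for line in corpus:
            self.counts.update(line.split())

    def dump(self, cache_file):
        with open(cache_file, 'w') as f:
            for tok, n in self.counts.most_common():
                f.write('%s\t%d\n' % (tok, n))

    def load(self, cache_file):
        with open(cache_file) as f:
            for line in f:
                tok, n = line.rstrip('\n').split('\t')
                self.counts[tok] = int(n)

vocab = TinyVocab(['the cat sat', 'the dog sat'], 'counts.tsv')
print(vocab.tok2idx)  # e.g. {'the': 0, 'sat': 1, 'cat': 2, 'dog': 3}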
 def __init__(self, token_vocab, *args, **kwargs):
   """ """
   
   super(BaseVocab, self).__init__(*args, **kwargs)
   self._cased = super(BaseVocab, self).cased
   
   SubtokenVocab.__setattr__(self, '_token_vocab', token_vocab)
   self._multibucket = Multibucket.from_configurable(self, embed_model=self.embed_model, name=self.name)
   self._vocabs = [NgramVocab.from_vocab(self.token_vocab, i+1, cased=self.cased) for i in range(self.max_n)]
   self._special_tokens = super(BaseVocab, self).special_tokens
   self._special_tokens_set = set(self._special_tokens)
   SubtokenVocab._set_special_tokens(self)
   self._tok2idx = {}
   
   for vocab in self:
     assert vocab.token_vocab is self.token_vocab
   return
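The second constructor builds one NgramVocab per order from 1 to max_n, all sharing the same underlying token vocab. A minimal sketch of per-order character n-gram extraction, assuming each sub-vocabulary simply collects the n-grams of every token; char_ngrams and the set-based sub-vocabularies are illustrations, not the NgramVocab API.

def char_ngrams(token, n):
    # All contiguous character substrings of length n (the whole token if shorter).
    if len(token) < n:
        return [token]
    return [token[i:i + n] for i in range(len(token) - n + 1)]

max_n = 3
tokens = ['parser', 'vocab']
subvocabs = [set() for _ in range(max_n)]  # one sub-vocabulary per n-gram order
for token in tokens:
    for i in range(max_n):
        subvocabs[i].update(char_ngrams(token, i + 1))
print(sorted(subvocabs[1]))  # bigrams: ['ab', 'ar', 'ca', 'er', 'oc', 'pa', 'rs', 'se', 'vo']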