Example 1
 def build_vocabs(self, dataset, logger=None, transformer=None):
     self.vocabs['rel_2nd'] = rel_2nd = Vocab(pad_token=self.config.pad_rel,
                                              unk_token=self.config.pad_rel)
     if self.config.joint:
         self.vocabs['rel'] = rel_2nd
     super().build_vocabs(dataset, logger, transformer)
     self.config.n_rels_2nd = len(rel_2nd)
Example 2
    @staticmethod
    def _load(path, vocab, normalize=False) -> Tuple[Vocab, Union[np.ndarray, None]]:
        if not vocab:
            vocab = Vocab()
        if not path:
            return vocab, None
        assert vocab.unk_idx is not None

        word2vec, dim = load_word2vec(path)
        for word in word2vec:
            vocab.get_idx(word)

        pret_embs = np.zeros(shape=(len(vocab), dim), dtype=np.float32)
        # Seed a private RNG stream for reproducibility; the caller's state is restored below.
        state = np.random.get_state()
        np.random.seed(0)
        bias = np.random.uniform(low=-0.001, high=0.001, size=dim).astype(dtype=np.float32)
        scale = np.sqrt(3.0 / dim)
        for word, idx in vocab.token_to_idx.items():
            vec = word2vec.get(word, None)
            if vec is None:
                vec = word2vec.get(word.lower(), None)
                # if vec is not None:
                #     vec += bias
            if vec is None:
                # vec = np.random.uniform(-scale, scale, [dim])
                vec = np.zeros([dim], dtype=np.float32)
            pret_embs[idx] = vec
        # noinspection PyTypeChecker
        np.random.set_state(state)
        return vocab, pret_embs
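
Note on the pattern above: saving the global NumPy RNG state, seeding a fixed stream, and restoring the state afterwards keeps the embedding setup deterministic without disturbing any randomness the caller may already rely on. A minimal, self-contained sketch of that pattern (plain NumPy, nothing HanLP-specific assumed):

    import numpy as np

    def deterministic_noise(dim, seed=0):
        # Save the caller's RNG state, draw from a fixed stream, then restore.
        state = np.random.get_state()
        np.random.seed(seed)
        noise = np.random.uniform(low=-0.001, high=0.001, size=dim)
        np.random.set_state(state)
        return noise

    np.random.seed(42)                 # the caller's own seeding
    a = np.random.rand()
    _ = deterministic_noise(8)         # does not consume the caller's stream
    b = np.random.rand()
    np.random.seed(42)
    assert (a, b) == (np.random.rand(), np.random.rand())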
Example 3
    def build_vocabs(self, dataset, logger=None, transformer=False):
        rel_vocab = self.vocabs.get('rel', None)
        if rel_vocab is None:
            rel_vocab = Vocab(unk_token=None,
                              pad_token=self.config.get('pad_rel', None))
            self.vocabs.put(rel=rel_vocab)

        timer = CountdownTimer(len(dataset))
        if transformer:
            token_vocab = None
        else:
            self.vocabs.token = token_vocab = VocabCounter(
                unk_token=self.config.get('unk', UNK))
        # Iterating the dataset applies its transforms, which populate the vocabs.
        for sample in dataset:
            timer.log('Building vocab [blink][yellow]...[/yellow][/blink]',
                      ratio_percentage=True)
        min_freq = self.config.get('min_freq', None)
        if min_freq and token_vocab:  # trim only when a token vocab was built
            token_vocab.trim(min_freq)
        rel_vocab.set_unk_as_safe_unk()  # some relations in the dev set are OOV
        self.vocabs.lock()
        self.vocabs.summary(logger=logger)
        if token_vocab:
            self.config.n_words = len(self.vocabs['token'])
        self.config.n_rels = len(self.vocabs['rel'])
        if token_vocab:
            self.config.pad_index = self.vocabs['token'].pad_idx
            self.config.unk_index = self.vocabs['token'].unk_idx
Example 4
 def build_vocabs(self, trn, logger, **kwargs):
     self.vocabs.pos = Vocab(unk_token=None, pad_token=None)
     self.vocabs.rel = Vocab(unk_token=None, pad_token=None)
     self.vocabs.lemma = Vocab(unk_token=None, pad_token=None)
     self.vocabs.feat = Vocab(unk_token=None, pad_token=None)
     timer = CountdownTimer(len(trn))
     max_seq_len = 0
     for each in trn:
         max_seq_len = max(max_seq_len, len(each['token']))
         timer.log(
             f'Building vocab [blink][yellow]...[/yellow][/blink] (longest sequence: {max_seq_len})'
         )
     for v in self.vocabs.values():
         v.set_unk_as_safe_unk()
     self.vocabs.lock()
     self.vocabs.summary(logger)
Example 5
def load_vocabs(transform, save_dir, filename='vocabs.json'):
    vocabs = SerializableDict()
    vocabs.load_json(os.path.join(save_dir, filename))
    for key, value in vocabs.items():
        vocab = Vocab()
        vocab.copy_from(value)
        setattr(transform, key, vocab)
Example 6
 def fit(self, trn_path: str, **kwargs) -> int:
     self.tag_vocab = Vocab(unk_token=None)
     num_samples = 0
     for words, tags in self.file_to_inputs(trn_path, gold=True):
         num_samples += 1
         self.tag_vocab.update(tags)
     return num_samples
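
A minimal usage sketch of the fit pattern above, assuming the class lives at hanlp.common.vocab.Vocab (as the docstring in Example 17 states) and with a toy tag sequence standing in for file_to_inputs:

    from hanlp.common.vocab import Vocab

    tag_vocab = Vocab(unk_token=None)            # tag sets are closed, so no unk entry
    for tags in [['B', 'I', 'O'], ['O', 'B']]:   # stand-in for file_to_inputs()
        tag_vocab.update(tags)
    tag_vocab.lock()                             # freeze before training
    print(len(tag_vocab), tag_vocab.token_to_idx)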
Example 7
 def __init__(self,
              filepath: str = None,
              vocab: Vocab = None,
              expand_vocab=True,
              lowercase=False,
              input_dim=None,
              output_dim=None,
              unk=None,
              normalize=False,
              embeddings_initializer='VarianceScaling',
              embeddings_regularizer=None,
              activity_regularizer=None,
              embeddings_constraint=None,
              mask_zero=True,
              input_length=None,
              name=None,
              **kwargs):
     if vocab is None:
         vocab = Vocab()
     self.vocab = vocab
     super().__init__(filepath, vocab, expand_vocab, lowercase, input_dim,
                      output_dim, unk, normalize, embeddings_initializer,
                      embeddings_regularizer, activity_regularizer,
                      embeddings_constraint, mask_zero, input_length, name,
                      **kwargs)
Example 8
 def fit(self, trn_path: str, **kwargs) -> int:
     self.word_vocab = Vocab()
     self.tag_vocab = Vocab(pad_token=None, unk_token=None)
     num_samples = 0
     for words, tags in self.file_to_inputs(trn_path, True):
         self.word_vocab.update(words)
         self.tag_vocab.update(tags)
         num_samples += 1
     if self.char_vocab:
         self.char_vocab = Vocab()
         for word in self.word_vocab.token_to_idx.keys():
             if word in (self.word_vocab.pad_token,
                         self.word_vocab.unk_token):
                 continue
             self.char_vocab.update(list(word))
     return num_samples
Example 9
 def fit(self, trn_path: str, **kwargs) -> int:
     self.vocab = Vocab()
     num_samples = 0
     for x, y in self.file_to_inputs(trn_path):
         self.vocab.update(x)
         num_samples += 1
     return num_samples
Example 10
 def transform(self,
               vocabs: VocabDict = None,
               **kwargs) -> Optional[Callable]:
     assert vocabs is not None
     if self.field not in vocabs:
         vocabs[self.field] = Vocab(pad_token=self.pad, unk_token=self.unk)
     return super().transform(**kwargs)
Example 11
 def build_vocabs(self, dataset, logger, **kwargs):
     self.vocabs.rel = Vocab(pad_token=None, unk_token=None)
     self.vocabs.pos = Vocab(pad_token=None, unk_token=None)
     self.vocabs.label = label_vocab = Vocab(pad_token='', unk_token=None)
     label_vocab.add(trees.Sub_Head)
     for each in dataset:
         tree = each['hpsg']
         nodes = [tree]
         while nodes:
             node = nodes.pop()
             if isinstance(node, trees.InternalParseNode):
                 label_vocab.add('\t'.join(node.label))
                 nodes.extend(reversed(node.children))
     self.vocabs['rel'].set_unk_as_safe_unk()
     label_vocab.set_unk_as_safe_unk()
     self.vocabs.lock()
     self.vocabs.summary(logger)
Example 12
 def fit(self, trn_path: str, **kwargs):
     word_vocab, ngram_vocab, tag_vocab = Vocab(), Vocab(), Vocab(
         pad_token=None, unk_token=None)
     num_samples = 0
     for X, Y in self.file_to_samples(trn_path, gold=True):
         num_samples += 1
         word_vocab.update(X[0])
         for ngram in X[1:]:
             ngram_vocab.update(filter(lambda x: x, ngram))
         tag_vocab.update(Y)
     if self.config.window_size:
         vocabs = word_vocab, ngram_vocab, tag_vocab
     else:
         vocabs = word_vocab, None, tag_vocab  # n-gram features are unused without a window
     self.word_vocab, self.ngram_vocab, self.tag_vocab = vocabs
     return num_samples
Example 13
 def transform(self, vocabs: VocabDict, **kwargs) -> Optional[Callable]:
     if isinstance(self.embed, Embedding):
         self.embed.transform(vocabs=vocabs)
     vocab_name = self.vocab_name
     if vocab_name not in vocabs:
         vocabs[vocab_name] = Vocab()
     return ToChar(self.field, vocab_name, min_word_length=self.min_word_length,
                   pad=vocabs[vocab_name].safe_pad_token)
Example 14
 def fit(self, trn_path: str, **kwargs) -> int:
     self.word_vocab = Vocab()
     self.tag_vocab = Vocab(pad_token=None, unk_token=None)
     num_samples = 0
     for words, tags in generator_words_tags(trn_path,
                                             gold=True,
                                             lower=self.config.get(
                                                 'lower', False)):
         self.word_vocab.update(words)
         self.tag_vocab.update(tags)
         num_samples += 1
     if self.char_vocab:
         self.char_vocab = Vocab()
         for word in self.word_vocab.token_to_idx.keys():
             if word in (self.word_vocab.pad_token,
                         self.word_vocab.unk_token):
                 continue
             self.char_vocab.update(list(word))
     return num_samples
Example 15
    def fit(self, trn_path: str, **kwargs) -> int:
        self.form_vocab = Vocab()
        self.form_vocab.add(ROOT)  # make ROOT the 2nd element; 0th is pad, 1st is unk
        self.cpos_vocab = Vocab(pad_token=None, unk_token=None)
        self.rel_vocab = Vocab(pad_token=None, unk_token=None)
        num_samples = 0
        counter = Counter()
        for sent in self.file_to_samples(trn_path, gold=True):
            num_samples += 1
            for idx, (form, cpos, head, deprel) in enumerate(sent):
                if idx == 0:
                    root = form
                else:
                    counter[form] += 1
                self.cpos_vocab.add(cpos)
                self.rel_vocab.update(deprel)

        for token in [token for token, freq in counter.items() if freq >= self.config.min_freq]:
            self.form_vocab.add(token)
        return num_samples
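
The Counter-then-threshold idiom above keeps only forms seen at least min_freq times; rarer forms fall back to the unk entry at lookup time. The same filtering in isolation, with toy data and a hypothetical threshold:

    from collections import Counter

    min_freq = 2  # hypothetical threshold
    counter = Counter(['the', 'cat', 'the', 'dog', 'the', 'cat'])
    frequent = [tok for tok, freq in counter.items() if freq >= min_freq]
    print(frequent)  # ['the', 'cat']; 'dog' stays out and maps to unk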
Example 16
 def build_vocabs(self, trn, logger, **kwargs):
     self.vocabs.tag = Vocab(pad_token=None, unk_token=None)
     timer = CountdownTimer(len(trn))
     max_seq_len = 0
     token_key = self.config.token_key
     for each in trn:
         max_seq_len = max(max_seq_len, len(each[token_key]))
         timer.log(f'Building vocab [blink][yellow]...[/yellow][/blink] (longest sequence: {max_seq_len})')
     self.vocabs.tag.set_unk_as_safe_unk()
     self.vocabs.lock()
     self.vocabs.summary(logger)
Example 17
    def __init__(self, *args, **kwargs) -> None:
        """A dict holding :class:`hanlp.common.vocab.Vocab` instances. When used as a transform, it transforms the field
        corresponding to each :class:`hanlp.common.vocab.Vocab` into indices.

        Args:
            *args: A list of vocab names.
            **kwargs: Names and corresponding :class:`hanlp.common.vocab.Vocab` instances.
        """
        vocabs = dict(kwargs)
        for each in args:
            vocabs[each] = Vocab()
        super().__init__(vocabs)
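
Per the docstring, positional names become fresh mutable Vocab instances while keyword arguments pass pre-built ones through. A minimal sketch under that contract:

    # 'token' gets a fresh mutable Vocab(); 'rel' reuses a preconfigured one.
    vocabs = VocabDict('token', rel=Vocab(pad_token=None, unk_token=None))
    vocabs['token'].update(['the', 'cat'])
    vocabs['rel'].add('nsubj')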
Example 18
def vocab_from_tsv(tsv_file_path, lower=False, lock_word_vocab=False, lock_char_vocab=True, lock_tag_vocab=True) \
        -> Tuple[Vocab, Vocab, Vocab]:
    word_vocab = Vocab()
    char_vocab = Vocab()
    tag_vocab = Vocab(unk_token=None)
    with open(tsv_file_path, encoding='utf-8') as tsv_file:
        for line in tsv_file:
            cells = line.strip().split()
            if cells:
                word, tag = cells
                if lower:
                    word_vocab.add(word.lower())
                else:
                    word_vocab.add(word)
                char_vocab.update(list(word))
                tag_vocab.add(tag)
    if lock_word_vocab:
        word_vocab.lock()
    if lock_char_vocab:
        char_vocab.lock()
    if lock_tag_vocab:
        tag_vocab.lock()
    return word_vocab, char_vocab, tag_vocab
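
A quick round trip through vocab_from_tsv, writing a two-column toy file first (the path and contents are illustrative):

    import os
    import tempfile

    tsv = os.path.join(tempfile.mkdtemp(), 'toy.tsv')
    with open(tsv, 'w', encoding='utf-8') as f:
        f.write('The\tDT\ncat\tNN\nsleeps\tVBZ\n')
    word_vocab, char_vocab, tag_vocab = vocab_from_tsv(tsv, lower=True)
    print(len(word_vocab), len(char_vocab), len(tag_vocab))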
Example 19
 def build_vocabs(self, dataset, logger, **kwargs):
     self.vocabs.srl_label = Vocab(pad_token=None, unk_token=None)
     # Use null to indicate no relationship
     self.vocabs.srl_label.add('<null>')
     timer = CountdownTimer(len(dataset))
     max_seq_len = 0
     # Iterating the dataset applies its transforms, which populate the vocab.
     for each in dataset:
         max_seq_len = max(max_seq_len, len(each['token_input_ids']))
         timer.log(f'Building vocabs (max sequence length {max_seq_len}) [blink][yellow]...[/yellow][/blink]')
     timer.stop()
     timer.erase()
     self.vocabs['srl_label'].set_unk_as_safe_unk()
     self.vocabs.lock()
     self.vocabs.summary(logger)
Example 20
 def build_vocabs(self, dataset, logger, **kwargs):
     self.vocabs.srl = Vocab(pad_token=None, unk_token=None)
     timer = CountdownTimer(len(dataset))
     max_seq_len = 0
     for sample in dataset:
         max_seq_len = max(max_seq_len, len(sample['token_input_ids']))
         timer.log(f'Building vocab [blink][yellow]...[/yellow][/blink] (longest sequence: {max_seq_len})')
     self.vocabs['srl'].set_unk_as_safe_unk()  # C-ARGM-FRQ appears only in test set
     self.vocabs.lock()
     self.vocabs.summary(logger)
     if self.config.get('delimiter') is None:
         tokens = dataset[0]['token']
         self.config.delimiter = guess_delimiter(tokens)
         logger.info(f'Guess the delimiter between tokens could be [blue]"{self.config.delimiter}"[/blue]. '
                     f'If not, specify `delimiter` in `fit()`')
Example 21
 def build_vocabs(self, dataset: SentenceBoundaryDetectionDataset, logger,
                  **kwargs):
     char_min_freq = self.config.char_min_freq
     if char_min_freq:
         has_cache = dataset.cache is not None
         char_counter = Counter()
         for each in dataset:
             for c in each['char']:
                 char_counter[c] += 1
         self.vocabs.char = vocab = Vocab()
         for c, f in char_counter.items():
             if f >= char_min_freq:
                 vocab.add(c)
         if has_cache:
             # Rebuild the cache so it reflects the trimmed char vocab.
             dataset.purge_cache()
             for each in dataset:
                 pass
     else:
         self.vocabs.char = Vocab()
         # Iterating the dataset applies its transforms, which populate the vocab.
         for each in dataset:
             pass
     self.config.eos_chars = dataset.eos_chars
     self.vocabs.lock()
     self.vocabs.summary(logger)
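
The bare `for each in dataset: pass` loops above look like no-ops, but iterating a HanLP dataset runs its attached transforms, and those transforms are what feed the vocab (and rebuild the cache). A toy sketch of the idiom with a hypothetical LazyDataset stand-in:

    class LazyDataset:
        # Hypothetical stand-in: transforms fire lazily, during iteration.
        def __init__(self, samples, transform):
            self.samples, self.transform = samples, transform

        def __iter__(self):
            return (self.transform(s) for s in self.samples)

    seen = set()
    dataset = LazyDataset([{'char': list('ab')}, {'char': list('bc')}],
                          lambda s: seen.update(s['char']) or s)
    for each in dataset:
        pass  # one full pass is enough to populate `seen`
    print(sorted(seen))  # ['a', 'b', 'c']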
Example 22
 def __init__(self, filepath: str = None, vocab: Vocab = None, expand_vocab=True, lowercase=True,
              input_dim=None, output_dim=None, unk=None, normalize=False,
              embeddings_initializer='VarianceScaling',
              embeddings_regularizer=None,
              activity_regularizer=None, embeddings_constraint=None, mask_zero=True, input_length=None,
              name=None, **kwargs):
     filepath = get_resource(filepath)
     word2vec, _output_dim = load_word2vec(filepath)
     if output_dim:
         assert output_dim == _output_dim, f'output_dim = {output_dim} does not match {filepath}'
     output_dim = _output_dim
     if vocab is None:
         vocab = Vocab()
         vocab.update(word2vec.keys())
     # if the `unk` token exists in the pretrained embeddings,
     # rename it to the vocab's own unk token (usually the one from the word vocab)
     if unk and unk in word2vec:
         word2vec[vocab.safe_unk_token] = word2vec.pop(unk)
     if expand_vocab and vocab.mutable:
         for word in word2vec:
             vocab.get_idx(word.lower() if lowercase else word)
     if input_dim:
         assert input_dim == len(vocab), f'input_dim = {input_dim} does not match {filepath}'
     input_dim = len(vocab)
     # init matrix
     self._embeddings_initializer = embeddings_initializer
     embeddings_initializer = tf.keras.initializers.get(embeddings_initializer)
     with tf.device('cpu:0'):
         pret_embs = embeddings_initializer(shape=[input_dim, output_dim]).numpy()
     # insert to pret_embs
     for word, idx in vocab.token_to_idx.items():
         vec = word2vec.get(word, None)
         # Retry lower case
         if vec is None and lowercase:
             vec = word2vec.get(word.lower(), None)
         if vec is not None:
             pret_embs[idx] = vec
     if normalize:
         pret_embs /= np.std(pret_embs)
     if not name:
         name = os.path.splitext(os.path.basename(filepath))[0]
     super().__init__(input_dim, output_dim, tf.keras.initializers.Constant(pret_embs), embeddings_regularizer,
                      activity_regularizer, embeddings_constraint, mask_zero, input_length, name=name, **kwargs)
     self.filepath = filepath
     self.expand_vocab = expand_vocab
     self.lowercase = lowercase
Example 23
 def build_vocabs(self,
                  dataset,
                  logger,
                  vocabs,
                  lock=True,
                  label_vocab_name='label',
                  **kwargs):
     vocabs[label_vocab_name] = label_vocab = Vocab(pad_token=None,
                                                    unk_token=None)
     # Use null to indicate no relationship
     label_vocab.add('<null>')
     timer = CountdownTimer(len(dataset))
     for each in dataset:
         timer.log('Building NER vocab [blink][yellow]...[/yellow][/blink]')
     label_vocab.set_unk_as_safe_unk()
     if lock:
         vocabs.lock()
         vocabs.summary(logger)
Example 24
    def __init__(self,
                 data: str,
                 batch_size,
                 seq_len,
                 tokenizer='char',
                 eos='\n',
                 strip=True,
                 vocab=None,
                 cache=False,
                 transform: Union[Callable, List] = None) -> None:
        self.cache = cache
        self.eos = eos
        self.strip = strip
        super().__init__(transform)
        if isinstance(tokenizer, str):
            available_tokenizers = {
                'char': ToChar('text', 'token'),
                'whitespace': WhitespaceTokenizer('text', 'token')
            }
            assert tokenizer in available_tokenizers, \
                f'{tokenizer} not supported, available options: {sorted(available_tokenizers)}'
            self.append_transform(available_tokenizers[tokenizer])

        if vocab is None:
            vocab = Vocab()
            self.training = True
        else:
            self.training = vocab.mutable
        self.append_transform(AppendEOS('token', eos=eos))
        self.append_transform(FieldToIndex('token', vocab))
        self.batch_size = batch_size
        data = get_resource(data)
        self.data = data
        self.num_tokens = None
        self.load_file(data)
        self._fp = None
        if isinstance(seq_len, int):
            self.seq_len = lambda: seq_len
        else:
            self.seq_len = seq_len
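
Wrapping the int in a lambda at the end of __init__ lets the rest of the class treat seq_len uniformly as a zero-argument callable, so variable-length schedules (e.g., randomized BPTT lengths) plug in without extra branching. The normalization in isolation:

    import random

    def normalize_seq_len(seq_len):
        # Accept either a fixed int or a zero-argument callable.
        return (lambda: seq_len) if isinstance(seq_len, int) else seq_len

    fixed = normalize_seq_len(70)
    jittered = normalize_seq_len(lambda: random.randint(60, 80))
    print(fixed(), jittered())  # both call sites look identical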
Example 25
 def build_vocabs(self, trn, logger, **kwargs):
     self.vocabs.label = Vocab(pad_token=None, unk_token=None)
     # Iterating trn applies its transforms, which populate the label vocab.
     for each in trn:
         pass
     self.vocabs.lock()
     self.vocabs.summary(logger)
Example 26
 def __init__(self, vocab: Vocab = None) -> None:
     super().__init__()
     if vocab is None:
         vocab = Vocab()
     self.vocab = vocab
Example 27
 def transform(self, **kwargs) -> Callable:
     vocab = Vocab()
     vocab.load(os.path.join(get_resource(self.path), 'vocab.json'))
     return TransformList(ContextualStringEmbeddingTransform(self.field),
                          FieldToIndex(f'{self.field}_f_char', vocab),
                          FieldToIndex(f'{self.field}_b_char', vocab))
Example 28
 def build_transform(self, embeddings, **kwargs):
     if embeddings_require_string_input(embeddings):
         self.transform.map_x = False
         if embeddings_require_char_input(embeddings):
             self.transform.char_vocab = Vocab()
     return super().build_transform(**kwargs)
Example 29
 def build_vocabs(self, trn: torch.utils.data.Dataset, logger: logging.Logger):
     additional_tokens = set()
     self.collect_additional_tokens(additional_tokens, trn)
     additional_tokens = sorted(additional_tokens)
     self.build_tokenizer(additional_tokens)
     self.vocabs['additional_tokens'] = Vocab(idx_to_token=list(additional_tokens))
Example 30
 def load_vocabs(self, save_dir, filename='vocabs.json'):
     self.vocabs['token'] = Vocab()