class CoNLL_SDP_Transform(CoNLLTransform):

    def __init__(self, config: SerializableDict = None, map_x=True, map_y=True, lower=True, n_buckets=32,
                 n_tokens_per_batch=5000, min_freq=2, **kwargs) -> None:
        super().__init__(config, map_x, map_y, lower, n_buckets, n_tokens_per_batch, min_freq, **kwargs)
        self.orphan_relation = ROOT

    def lock_vocabs(self):
        super().lock_vocabs()
        # heuristic to find the orphan relation
        for rel in self.rel_vocab.idx_to_token:
            if 'root' in rel.lower():
                self.orphan_relation = rel
                break

    def file_to_inputs(self, filepath: str, gold=True):
        assert gold, 'Only gold files are supported for now'
        for i, sent in enumerate(read_conll(filepath)):
            prev_cells = None
            parsed_sent = []
            heads = []
            rels = []
            for j, cell in enumerate(sent):
                ID = cell[0]
                form = cell[1]
                cpos = cell[3]
                head = cell[6]
                deprel = cell[7]
                if prev_cells and ID != prev_cells[0]:  # found end of token
                    parsed_sent.append([prev_cells[1], prev_cells[2], heads, rels])
                    heads = []
                    rels = []
                heads.append(head)
                rels.append(deprel)
                prev_cells = [ID, form, cpos, head, deprel]
            parsed_sent.append([prev_cells[1], prev_cells[2], heads, rels])
            yield parsed_sent

    def fit(self, trn_path: str, **kwargs) -> int:
        self.form_vocab = Vocab()
        self.form_vocab.add(ROOT)  # make root the 2nd element while 0th is pad, 1st is unk
        self.cpos_vocab = Vocab(pad_token=None, unk_token=None)
        self.rel_vocab = Vocab(pad_token=None, unk_token=None)
        num_samples = 0
        counter = Counter()
        for sent in self.file_to_samples(trn_path, gold=True):
            num_samples += 1
            for idx, (form, cpos, head, deprel) in enumerate(sent):
                if idx == 0:
                    root = form
                else:
                    counter[form] += 1
                self.cpos_vocab.add(cpos)
                self.rel_vocab.update(deprel)

        for token in [token for token, freq in counter.items() if freq >= self.config.min_freq]:
            self.form_vocab.add(token)
        return num_samples

    def inputs_to_samples(self, inputs, gold=False):
        for sent in inputs:
            sample = []
            if self.config['lower']:
                for i, cell in enumerate(sent):
                    cell = list(sent[i])
                    cell[0] = cell[0].lower()
                    if not gold:
                        cell += [[0], [self.rel_vocab.safe_pad_token]]
                    sample.append(cell)
            # insert a root word with arbitrary fields; it will be masked out anyway
            form, cpos, head, deprel = sample[0]
            sample.insert(0, [self.bos, self.bos, [0], deprel])
            yield sample

    def samples_to_dataset(self, samples: Generator, map_x=None, map_y=None, batch_size=5000, shuffle=None,
                           repeat=None, drop_remainder=False, prefetch=1, cache=True) -> tf.data.Dataset:
        def generator():
            # custom bucketing: load the corpus into memory
            corpus = list(x for x in (samples() if callable(samples) else samples))
            lengths = [1 + len(i) for i in corpus]
            if len(corpus) < 32:
                n_buckets = 1
            else:
                n_buckets = min(self.config.n_buckets, len(corpus))
            buckets = dict(zip(*kmeans(lengths, n_buckets)))
            sizes, buckets = zip(*[
                (size, bucket) for size, bucket in buckets.items()
            ])
            # the number of chunks in each bucket, clipped to the range [1, len(bucket)]
            chunks = [min(len(bucket), max(round(size * len(bucket) / batch_size), 1))
                      for size, bucket in zip(sizes, buckets)]
            range_fn = randperm if shuffle else arange
            for i in tolist(range_fn(len(buckets))):
                split_sizes = [(len(buckets[i]) - j - 1) // chunks[i] + 1
                               for j in range(chunks[i])]
                for batch_indices in tf.split(range_fn(len(buckets[i])), split_sizes):
                    indices = [buckets[i][j] for j in tolist(batch_indices)]
                    raw_batch = [[], [], [], []]
                    max_len = len(max([corpus[i] for i in indices], key=len))
                    for idx in indices:
                        arc = np.zeros((max_len, max_len), dtype=bool)
                        rel = np.zeros((max_len, max_len), dtype=np.int64)
                        for b in raw_batch[:2]:
                            b.append([])
                        for m, cells in enumerate(corpus[idx]):
                            for b, c, v in zip(raw_batch, cells, [self.form_vocab, self.cpos_vocab]):
                                b[-1].append(v.get_idx_without_add(c))
                            for n, r in zip(cells[2], cells[3]):
                                arc[m, n] = True
                                rid = self.rel_vocab.get_idx_without_add(r)
                                if rid is None:
                                    logger.warning(f'Relation OOV: {r} does not exist in training data')
                                    continue
                                rel[m, n] = rid
                        raw_batch[-2].append(arc)
                        raw_batch[-1].append(rel)
                    batch = []
                    for b, v in zip(raw_batch, [self.form_vocab, self.cpos_vocab]):
                        b = tf.keras.preprocessing.sequence.pad_sequences(b, padding='post',
                                                                          value=v.safe_pad_token_idx,
                                                                          dtype='int64')
                        batch.append(b)
                    batch += raw_batch[2:]
                    assert len(batch) == 4
                    yield (batch[0], batch[1]), (batch[2], batch[3])

        return super().samples_to_dataset(generator, False, False, 0, False, repeat, drop_remainder, prefetch, cache)

    def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
        types = (tf.int64, tf.int64), (tf.bool, tf.int64)
        shapes = ([None, None], [None, None]), ([None, None, None], [None, None, None])
        values = (self.form_vocab.safe_pad_token_idx, self.cpos_vocab.safe_pad_token_idx), (
            False, self.rel_vocab.safe_pad_token_idx)
        return types, shapes, values

    def Y_to_outputs(self, Y: Union[tf.Tensor, Tuple[tf.Tensor]], gold=False, inputs=None, X=None) -> Iterable:
        arc_preds, rel_preds, mask = Y
        sents = []

        for arc_sent, rel_sent, length in zip(arc_preds, rel_preds,
                                              tf.math.count_nonzero(mask, axis=-1)):
            sent = []
            for arc, rel in zip(tolist(arc_sent[1:, 1:]), tolist(rel_sent[1:, 1:])):
                ar = []
                for idx, (a, r) in enumerate(zip(arc, rel)):
                    if a:
                        ar.append((idx + 1, self.rel_vocab.idx_to_token[r]))
                if not ar:
                    # orphan token: attach it to root with the orphan relation
                    ar.append((0, self.orphan_relation))
                sent.append(ar)
            sents.append(sent)

        return sents

    def XY_to_inputs_outputs(self, X: Union[tf.Tensor, Tuple[tf.Tensor]], Y: Union[tf.Tensor, Tuple[tf.Tensor]],
                             gold=False, inputs=None, conll=True) -> Iterable:
        (words, feats, mask), (arc_preds, rel_preds) = X, Y
        xs = inputs
        ys = self.Y_to_outputs((arc_preds, rel_preds, mask))
        sents = []
        for x, y in zip(xs, ys):
            sent = CoNLLSentence()
            for idx, ((form, cpos), pred) in enumerate(zip(x, y)):
                head = [p[0] for p in pred]
                deprel = [p[1] for p in pred]
                if conll:
                    sent.append(CoNLLWord(id=idx + 1, form=form, cpos=cpos, head=head, deprel=deprel))
                else:
                    sent.append([head, deprel])
            sents.append(sent)
        return sents
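
# Usage sketch (an assumption, not library code): the hypothetical file path and the call
# sequence below illustrate how the transform above is typically driven. `fit`, `lock_vocabs`,
# `file_to_samples` and `samples_to_dataset` are defined above or inherited from CoNLLTransform.
if __name__ == '__main__':
    transform = CoNLL_SDP_Transform()
    transform.fit('data/sdp/train.conllu')  # hypothetical path; builds form/cpos/rel vocabs
    transform.lock_vocabs()                 # also resolves the orphan relation heuristically
    dataset = transform.samples_to_dataset(
        lambda: transform.file_to_samples('data/sdp/train.conllu', gold=True),
        batch_size=5000, shuffle=True)
    (forms, cpos), (arcs, rels) = next(iter(dataset))
    print(forms.shape, arcs.shape)          # [batch, len] and [batch, len, len]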
class TransformerTransform(TsvTaggingFormat, Transform):

    def __init__(self,
                 tokenizer=None,
                 config: SerializableDict = None,
                 map_x=False, map_y=False, **kwargs) -> None:
        super().__init__(config, map_x, map_y, **kwargs)
        self._tokenizer = tokenizer
        self.tag_vocab: Vocab = None
        self.special_token_ids = None

    @property
    def tokenizer(self):
        return self._tokenizer

    @tokenizer.setter
    def tokenizer(self, tokenizer):
        self._tokenizer = tokenizer
        self.special_token_ids = tf.constant(
            [tokenizer.vocab[token] for token in ['[PAD]', '[CLS]', '[SEP]']],
            dtype=tf.int32)

    def fit(self, trn_path: str, **kwargs) -> int:
        self.tag_vocab = Vocab(unk_token=None)
        num_samples = 0
        for words, tags in self.file_to_inputs(trn_path, gold=True):
            num_samples += 1
            self.tag_vocab.update(tags)
        return num_samples

    def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
        max_seq_length = self.config.get('max_seq_length', 128)
        # (input_ids, input_mask, segment_ids), label_ids
        types = (tf.int32, tf.int32, tf.int32), tf.int32
        shapes = ([max_seq_length], [max_seq_length], [max_seq_length]), [None]
        values = (0, 0, 0), self.tag_vocab.pad_idx
        return types, shapes, values

    def lock_vocabs(self):
        super().lock_vocabs()

    def inputs_to_samples(self, inputs, gold=False):
        max_seq_length = self.config.get('max_seq_length', 128)
        tokenizer = self._tokenizer
        xlnet = False
        roberta = False
        pad_token = '[PAD]'
        cls_token = '[CLS]'
        sep_token = '[SEP]'
        unk_token = '[UNK]'
        pad_label_idx = self.tag_vocab.pad_idx
        pad_token = tokenizer.convert_tokens_to_ids([pad_token])[0]
        for sample in inputs:
            if gold:
                words, tags = sample
            else:
                words, tags = sample, [self.tag_vocab.pad_token] * len(sample)
            input_ids, input_mask, segment_ids, label_ids = convert_examples_to_features(
                words,
                tags,
                self.tag_vocab.token_to_idx,
                max_seq_length,
                tokenizer,
                cls_token_at_end=xlnet,  # xlnet has a cls token at the end
                cls_token=cls_token,
                cls_token_segment_id=2 if xlnet else 0,
                sep_token=sep_token,
                # roberta uses an extra separator b/w pairs of sentences,
                # cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
                sep_token_extra=roberta,
                pad_on_left=xlnet,  # pad on the left for xlnet
                pad_token=pad_token,
                pad_token_segment_id=4 if xlnet else 0,
                pad_token_label_id=pad_label_idx,
                unk_token=unk_token)
            if None in input_ids:
                print(input_ids)
            if None in input_mask:
                print(input_mask)
            if None in segment_ids:
                print(segment_ids)
            yield (input_ids, input_mask, segment_ids), label_ids

    def x_to_idx(self, x) -> Union[tf.Tensor, Tuple]:
        raise NotImplementedError(
            'The transformer tokenizer handles x itself; no need to convert x to idx')

    def y_to_idx(self, y) -> tf.Tensor:
        raise NotImplementedError(
            'The transformer tokenizer handles y itself; no need to convert y to idx')

    def input_is_single_sample(self, input: Union[List[str], List[List[str]]]) -> bool:
        return isinstance(input[0], str)

    def Y_to_outputs(self, Y: Union[tf.Tensor, Tuple[tf.Tensor]], gold=False, X=None, inputs=None,
                     **kwargs) -> Iterable:
        assert X is not None, 'Need X to know the actual length of Y'
        input_ids, input_mask, segment_ids = X
        mask = tf.reduce_all(tf.not_equal(tf.expand_dims(input_ids, axis=-1), self.special_token_ids), axis=-1)
        Y = tf.argmax(Y, axis=-1)
        Y = Y[mask]
        tags = [self.tag_vocab.idx_to_token[tid] for tid in Y]
        offset = 0
        for words in inputs:
            yield tags[offset:offset + len(words)]
            offset += len(words)
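
# Usage sketch (an assumption, not library code): the setter above expects a wordpiece
# tokenizer exposing `.vocab` and `.convert_tokens_to_ids`, e.g. `BertTokenizer` from the
# `transformers` package. Assigning it caches the ids of [PAD], [CLS] and [SEP] so that
# Y_to_outputs can mask them out later.
if __name__ == '__main__':
    from transformers import BertTokenizer

    transform = TransformerTransform()
    transform.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    print(transform.special_token_ids.numpy())  # ids of [PAD], [CLS], [SEP]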
def __init__(self, filepath: str = None, vocab: Vocab = None, expand_vocab=True, lowercase=True,
             input_dim=None, output_dim=None, unk=None, normalize=False,
             embeddings_initializer='VarianceScaling',
             embeddings_regularizer=None,
             activity_regularizer=None, embeddings_constraint=None, mask_zero=True, input_length=None,
             name=None, **kwargs):
    filepath = get_resource(filepath)
    word2vec, _output_dim = load_word2vec(filepath)
    if output_dim:
        assert output_dim == _output_dim, f'output_dim = {output_dim} does not match {filepath}'
    output_dim = _output_dim
    # if the `unk` token exists in the pretrained embeddings, replace it with a
    # self-defined one, usually the one in the word vocab
    if unk and unk in word2vec:
        word2vec[vocab.safe_unk_token] = word2vec.pop(unk)
    if vocab is None:
        vocab = Vocab()
        vocab.update(word2vec.keys())
    if expand_vocab and vocab.mutable:
        for word in word2vec:
            vocab.get_idx(word.lower() if lowercase else word)
    if input_dim:
        assert input_dim == len(vocab), f'input_dim = {input_dim} does not match {filepath}'
    input_dim = len(vocab)
    # init matrix
    self._embeddings_initializer = embeddings_initializer
    embeddings_initializer = tf.keras.initializers.get(embeddings_initializer)
    with tf.device('cpu:0'):
        pret_embs = embeddings_initializer(shape=[input_dim, output_dim]).numpy()
    # copy pretrained vectors into pret_embs
    for word, idx in vocab.token_to_idx.items():
        vec = word2vec.get(word, None)
        # retry lower case
        if vec is None and lowercase:
            vec = word2vec.get(word.lower(), None)
        if vec is not None:
            pret_embs[idx] = vec
    if normalize:
        pret_embs /= np.std(pret_embs)
    if not name:
        name = os.path.splitext(os.path.basename(filepath))[0]
    super().__init__(input_dim, output_dim, tf.keras.initializers.Constant(pret_embs),
                     embeddings_regularizer,
                     activity_regularizer, embeddings_constraint, mask_zero, input_length,
                     name=name,
                     **kwargs)
    self.filepath = filepath
    self.expand_vocab = expand_vocab
    self.lowercase = lowercase
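
# Standalone sketch (illustrative only, not part of the layer above) of the matrix-building
# step: start from a randomly initialised [len(vocab), dim] matrix, overwrite the rows that
# have a pretrained vector (falling back to the lowercased form), then optionally scale by
# the standard deviation. The toy `word2vec` dict and `token_to_idx` mapping are hypothetical.
def _pretrained_matrix_sketch(normalize=True):
    import numpy as np
    import tensorflow as tf

    word2vec = {'apple': np.array([1., 0.]), 'banana': np.array([0., 1.])}  # toy pretrained vectors
    token_to_idx = {'<pad>': 0, '<unk>': 1, 'Apple': 2, 'banana': 3}        # toy vocab
    init = tf.keras.initializers.get('VarianceScaling')
    pret_embs = init(shape=[len(token_to_idx), 2]).numpy()
    for word, idx in token_to_idx.items():
        vec = word2vec.get(word)
        if vec is None:
            vec = word2vec.get(word.lower())  # retry lower case, mirroring the loop above
        if vec is not None:
            pret_embs[idx] = vec
    if normalize:
        pret_embs /= np.std(pret_embs)        # the `normalize=True` branch
    return pret_embs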
class TSVTaggingTransform(TsvTaggingFormat, Transform):

    def __init__(self, config: SerializableDict = None, map_x=True, map_y=True, use_char=False,
                 **kwargs) -> None:
        super().__init__(**merge_locals_kwargs(locals(), kwargs))
        self.word_vocab: Optional[Vocab] = None
        self.tag_vocab: Optional[Vocab] = None
        self.char_vocab: Optional[Vocab] = None

    def fit(self, trn_path: str, **kwargs) -> int:
        self.word_vocab = Vocab()
        self.tag_vocab = Vocab(pad_token=None, unk_token=None)
        num_samples = 0
        for words, tags in self.file_to_inputs(trn_path, True):
            self.word_vocab.update(words)
            self.tag_vocab.update(tags)
            num_samples += 1
        if self.char_vocab:
            self.char_vocab = Vocab()
            for word in self.word_vocab.token_to_idx.keys():
                if word in (self.word_vocab.pad_token, self.word_vocab.unk_token):
                    continue
                self.char_vocab.update(list(word))
        return num_samples

    def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
        types = tf.string, tf.string
        shapes = [None], [None]
        values = self.word_vocab.pad_token, self.tag_vocab.first_token
        return types, shapes, values

    def inputs_to_samples(self, inputs, gold=False):
        lower = self.config.get('lower', False)
        if gold:
            if lower:
                for x, y in inputs:
                    yield x.lower(), y
            else:
                yield from inputs
        else:
            for x in inputs:
                yield x.lower() if lower else x, [self.padding_values[-1]] * len(x)

    def x_to_idx(self, x) -> Union[tf.Tensor, Tuple]:
        return self.word_vocab.lookup(x)

    def y_to_idx(self, y) -> tf.Tensor:
        return self.tag_vocab.lookup(y)

    def X_to_inputs(self, X: Union[tf.Tensor, Tuple[tf.Tensor]]) -> Iterable:
        for xs in X:
            words = []
            for x in xs:
                words.append(str_tensor_to_str(x) if self.char_vocab else self.word_vocab.idx_to_token[int(x)])
            yield words

    def Y_to_outputs(self, Y: Union[tf.Tensor, Tuple[tf.Tensor]], gold=False, inputs=None, X=None,
                     **kwargs) -> Iterable:
        if not gold:
            Y = tf.argmax(Y, axis=2)
        for ys, xs in zip(Y, inputs):
            tags = []
            for y, x in zip(ys, xs):
                tags.append(self.tag_vocab.idx_to_token[int(y)])
            yield tags

    def input_is_single_sample(self, input: Union[List[str], List[List[str]]]) -> bool:
        return isinstance(input[0], str)

    def input_truth_output_to_str(self, input: List[str], truth: List[str], output: List[str]):
        text = ''
        for word, gold_tag, pred_tag in zip(input, truth, output):
            text += ' '.join([word, gold_tag, pred_tag]) + '\n'
        text += '\n'
        return text
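
# Usage sketch (an assumption, not library code): TsvTaggingFormat is assumed here to read
# one "token<TAB>tag" pair per line with blank lines separating sentences, so a tiny
# temporary file is enough to fit the vocabularies.
if __name__ == '__main__':
    import os
    import tempfile

    sents = [[('I', 'PRP'), ('love', 'VBP'), ('parsing', 'NN')],
             [('It', 'PRP'), ('works', 'VBZ')]]
    with tempfile.NamedTemporaryFile('w', suffix='.tsv', delete=False, encoding='utf-8') as f:
        for sent in sents:
            for word, tag in sent:
                f.write(f'{word}\t{tag}\n')
            f.write('\n')
    transform = TSVTaggingTransform()
    num_samples = transform.fit(f.name)
    print(num_samples, len(transform.word_vocab), len(transform.tag_vocab))
    os.unlink(f.name)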
class TextTransform(Transform):

    def __init__(self,
                 forward=True,
                 seq_len=10,
                 tokenizer='char',
                 config: SerializableDict = None, map_x=True, map_y=True, **kwargs) -> None:
        super().__init__(config, map_x, map_y, seq_len=seq_len, tokenizer=tokenizer, forward=forward, **kwargs)
        self.vocab: Vocab = None

    def tokenize_func(self):
        if self.config.tokenizer == 'char':
            return list
        elif self.config.tokenizer == 'whitespace':
            return lambda x: x.split()
        else:
            return lambda x: x.split(self.config.tokenizer)

    def fit(self, trn_path: str, **kwargs) -> int:
        self.vocab = Vocab()
        num_samples = 0
        for x, y in self.file_to_inputs(trn_path):
            self.vocab.update(x)
            num_samples += 1
        return num_samples

    def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
        types = tf.string, tf.string
        shapes = [None], [None]
        defaults = self.vocab.pad_token, self.vocab.pad_token
        return types, shapes, defaults

    def file_to_inputs(self, filepath: str, gold=True):
        forward = self.config.forward
        seq_len = self.config.seq_len
        buffer = []
        tokenizer = self.tokenize_func()
        with open(filepath, encoding='utf-8') if forward else FileReadBackwards(filepath, encoding='utf-8') as src:
            for line in src:
                tokens = tokenizer(line)
                buffer += tokens
                while len(buffer) > seq_len:
                    yield buffer[:seq_len], buffer[1:1 + seq_len]
                    buffer.pop(0)

    def inputs_to_samples(self, inputs, gold=False):
        forward = self.config.forward
        for t in inputs:
            if gold:
                x, y = t
            else:
                x, y = t, t
            if not forward:
                x = list(reversed(x))
                y = list(reversed(y))
            yield x, y

    def x_to_idx(self, x) -> Union[tf.Tensor, Tuple]:
        return self.vocab.lookup(x)

    def y_to_idx(self, y) -> tf.Tensor:
        return self.x_to_idx(y)

    def Y_to_outputs(self, Y: Union[tf.Tensor, Tuple[tf.Tensor]], gold=False, inputs=None, **kwargs) -> Iterable:
        pred = tf.argmax(Y, axis=-1)
        for ys, ms in zip(pred, inputs):
            ret = []
            for y in ys:
                ret.append(self.vocab.idx_to_token[int(y)])
            yield ret

    def input_is_single_sample(self, input: Any) -> bool:
        return isinstance(input[0], str)
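
# Minimal standalone sketch (illustrative only) of the sliding-window pairing performed by
# TextTransform.file_to_inputs above: every window of `seq_len` tokens becomes x, and the
# same window shifted right by one token becomes y, which is the usual language-model setup.
def _sliding_lm_pairs(tokens, seq_len=10):
    buffer = []
    for token in tokens:
        buffer.append(token)
        while len(buffer) > seq_len:
            yield buffer[:seq_len], buffer[1:1 + seq_len]
            buffer.pop(0)

# e.g. next(_sliding_lm_pairs(list('hello world'), seq_len=4))
# -> (['h', 'e', 'l', 'l'], ['e', 'l', 'l', 'o'])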