Ejemplo n.º 1
0
    def build(self, dataset, min_freq=1):
        r"""
        Builds the vocabulary of this field from the charts stored in the dataset.

        Every chart found under the dataset attribute named ``self.name`` is passed
        through ``self.preprocess``. All entries of the resulting rows that are not
        ``None`` are counted, and the counts are handed to ``Vocab`` to create ``self.vocab``.

        Args:
            dataset:
                An object with an attribute named after ``self.name`` that iterates over charts.
            min_freq (int):
                Frequency threshold forwarded to ``Vocab``. Default: 1.
        """

        counter = Counter()
        for chart in getattr(dataset, self.name):
            for row in self.preprocess(chart):
                # empty cells are stored as `None` and must not be counted
                counter.update(item for item in row if item is not None)

        self.vocab = Vocab(counter, min_freq, self.specials, self.unk_index)
Ejemplo n.º 2
0
    def build(self, dataset, min_freq=1, embed=None):
        r"""
        Creates the :class:`Vocab` of this field from the dataset, optionally extended by pretrained embeddings.
        Nothing happens if the field already has a ``vocab`` attribute.

        Args:
            dataset (Dataset):
                A :class:`Dataset` object holding an attribute named after the name of this field.
            min_freq (int):
                The minimum frequency forwarded to :class:`Vocab`. Default: 1.
            embed (Embedding):
                An Embedding object. If given, its tokens are added to the vocabulary and ``self.embed``
                is filled with its vectors and divided by its standard deviation;
                otherwise ``self.embed`` is set to ``None``. Default: ``None``.
        """

        # an existing vocabulary is never rebuilt
        if hasattr(self, 'vocab'):
            return

        counter = Counter()
        for seq in getattr(dataset, self.name):
            for token in self.preprocess(seq):
                counter[token] += 1
        self.vocab = Vocab(counter, min_freq, self.specials, self.unk_index)

        # without pretrained embeddings there is nothing more to do
        if not embed:
            self.embed = None
            return

        pretrained = self.preprocess(embed.tokens)
        # the `unk` token of the pretrained embeddings, if any, is replaced by the one of this field
        # NOTE(review): if `preprocess` returns its input unchanged, this assignment
        # also modifies `embed.tokens` in place - confirm that this is intended
        if embed.unk:
            pretrained[embed.unk_index] = self.unk

        self.vocab.extend(pretrained)
        # rows of tokens without a pretrained vector stay zero
        self.embed = torch.zeros(len(self.vocab), embed.dim)
        self.embed[self.vocab[pretrained]] = embed.vectors
        # rescale the whole matrix by its standard deviation
        self.embed /= torch.std(self.embed)
Ejemplo n.º 3
0
    def build(self, dataset):
        r"""
        Constructs a :class:`~supar.utils.vocab.Vocab` object for this field from the dataset.

        Each element of a sequence that is not ``'_'`` is split at ``'|'`` into items;
        for every item, the part following the first ``':'`` is counted.
        The vocabulary is created from these counts with ``self.pad`` as the only special token.

        NOTE(review): the vocabulary is rebuilt on every call; there is no check
        for an already existing ``vocab`` attribute.

        Args:
            dataset (Dataset):
                A :class:`~supar.utils.data.Dataset` object.
                One of the attributes should be named after the name of this field.
        """

        counter = Counter()
        for seq in getattr(dataset, self.name):
            for rels in seq:
                # `_` marks an element without any relation
                if rels != '_':
                    for rel in rels.split('|'):
                        # keep only what follows the first `:`
                        counter[rel.split(':', 1)[1]] += 1
        self.vocab = Vocab(counter, specials=[self.pad])
Ejemplo n.º 4
0
    def build(self, dataset, min_freq=1, embed=None):
        r"""
        Creates the vocabulary of this field from the pieces of all tokens in the dataset,
        optionally extended by pretrained embeddings.
        Nothing happens if the field already has a ``vocab`` attribute.

        Args:
            dataset:
                An object with an attribute named after ``self.name`` that iterates over sequences of tokens.
            min_freq (int):
                Frequency threshold forwarded to ``Vocab``. Default: 1.
            embed:
                If given, its tokens are added to the vocabulary and ``self.embed`` is filled
                with its vectors; otherwise ``self.embed`` is set to ``None``. Default: ``None``.
        """

        # an existing vocabulary is never rebuilt
        if hasattr(self, 'vocab'):
            return

        counter = Counter()
        for seq in getattr(dataset, self.name):
            for token in seq:
                # every token is preprocessed on its own and contributes all of its pieces
                for piece in self.preprocess(token):
                    counter[piece] += 1
        self.vocab = Vocab(counter, min_freq, self.specials, self.unk_index)

        # without pretrained embeddings there is nothing more to do
        if not embed:
            self.embed = None
            return

        pretrained = self.preprocess(embed.tokens)
        # the `unk` token of the pretrained embeddings, if any, is replaced by the one of this field
        # NOTE(review): if `preprocess` returns its input unchanged, this assignment
        # also modifies `embed.tokens` in place - confirm that this is intended
        if embed.unk:
            pretrained[embed.unk_index] = self.unk

        self.vocab.extend(pretrained)
        # rows of tokens without a pretrained vector stay zero
        self.embed = torch.zeros(len(self.vocab), embed.dim)
        self.embed[self.vocab[pretrained]] = embed.vectors
Ejemplo n.º 5
0
    def build(self, dataset, min_freq=1, embed=None):
        r"""
        Creates the :class:`Vocab` of this field from the tokens of the dataset.
        Nothing happens if the field already has a ``vocab`` attribute.

        Args:
            dataset (Dataset):
                A :class:`Dataset` object holding an attribute named after the name of this field.
            min_freq (int):
                The minimum frequency forwarded to :class:`Vocab`. Default: 1.
            embed (Embedding):
                Not used by this method. Default: ``None``.
                NOTE(review): presumably kept so that the signature matches other fields - confirm.
        """

        # an existing vocabulary is never rebuilt
        if hasattr(self, 'vocab'):
            return

        counter = Counter()
        for seq in getattr(dataset, self.name):
            for token in self.preprocess(seq):
                counter[token] += 1
        self.vocab = Vocab(counter, min_freq, self.specials, self.unk_index)
Ejemplo n.º 6
0
    def build(self, dataset, min_freq=1):
        r"""
        Builds the vocabulary of this field from the labels found in the dataset.

        Every sequence stored under the dataset attribute named ``self.name`` is passed
        through ``self.preprocess``, which must yield triples. The third element of each
        triple is counted as a label, and the counts are handed to ``Vocab`` to create ``self.vocab``.

        Args:
            dataset:
                An object with an attribute named after ``self.name`` that iterates over sequences.
            min_freq (int):
                Frequency threshold forwarded to ``Vocab``. Default: 1.
        """

        counter = Counter()
        for seq in getattr(dataset, self.name):
            # only the label of each triple matters here; the first two elements are ignored
            counter.update(label for _, _, label in self.preprocess(seq))

        self.vocab = Vocab(counter, min_freq, self.specials, self.unk_index)