Example #1
import torch

# Embedding, RegionEmbeddingLayer, EmbeddingType, EmbeddingProcessType,
# ActivationType and cDataset (the classification dataset class) come from the
# surrounding project's modules; only the torch import is shown here.

class Classifier(torch.nn.Module):
    def __init__(self, dataset, config):
        super(Classifier, self).__init__()
        self.config = config
        assert len(self.config.feature.feature_names) == 1
        assert self.config.feature.feature_names[0] == "token" or \
               self.config.feature.feature_names[0] == "char"
        if config.embedding.type == EmbeddingType.EMBEDDING:
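            # Plain lookup-table embeddings for the token and char vocabularies.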
            self.token_embedding = \
                Embedding(dataset.token_map, config.embedding.dimension,
                          cDataset.DOC_TOKEN, config, dataset.VOCAB_PADDING,
                          pretrained_embedding_file=
                          config.feature.token_pretrained_file,
                          mode=EmbeddingProcessType.FLAT,
                          dropout=self.config.embedding.dropout,
                          init_type=self.config.embedding.initializer,
                          low=-self.config.embedding.uniform_bound,
                          high=self.config.embedding.uniform_bound,
                          std=self.config.embedding.random_stddev,
                          fan_mode=self.config.embedding.fan_mode,
                          activation_type=ActivationType.NONE)
            self.char_embedding = \
                Embedding(dataset.char_map, config.embedding.dimension,
                          cDataset.DOC_CHAR, config, dataset.VOCAB_PADDING,
                          mode=EmbeddingProcessType.FLAT,
                          dropout=self.config.embedding.dropout,
                          init_type=self.config.embedding.initializer,
                          low=-self.config.embedding.uniform_bound,
                          high=self.config.embedding.uniform_bound,
                          std=self.config.embedding.random_stddev,
                          fan_mode=self.config.embedding.fan_mode,
                          activation_type=ActivationType.NONE)
        elif config.embedding.type == EmbeddingType.REGION_EMBEDDING:
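            # Region embeddings: each position is embedded together with a
            # local context window of size config.embedding.region_size.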
            self.token_embedding = RegionEmbeddingLayer(
                dataset.token_map, config.embedding.dimension,
                config.embedding.region_size, cDataset.DOC_TOKEN, config,
                padding=dataset.VOCAB_PADDING,
                pretrained_embedding_file=
                config.feature.token_pretrained_file,
                dropout=self.config.embedding.dropout,
                init_type=self.config.embedding.initializer,
                low=-self.config.embedding.uniform_bound,
                high=self.config.embedding.uniform_bound,
                std=self.config.embedding.random_stddev,
                fan_mode=self.config.embedding.fan_mode,
                region_embedding_type=config.embedding.region_embedding_type)

            self.char_embedding = RegionEmbeddingLayer(
                dataset.char_map, config.embedding.dimension,
                config.embedding.region_size, cDataset.DOC_CHAR, config,
                padding=dataset.VOCAB_PADDING,
                dropout=self.config.embedding.dropout,
                init_type=self.config.embedding.initializer,
                low=-self.config.embedding.uniform_bound,
                high=self.config.embedding.uniform_bound,
                std=self.config.embedding.random_stddev,
                fan_mode=self.config.embedding.fan_mode,
                region_embedding_type=config.embedding.region_embedding_type)
        else:
            raise TypeError(
                "Unsupported embedding type: %s. " % config.embedding.type)
        self.dropout = torch.nn.Dropout(p=config.train.hidden_layer_dropout)

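    # Select the ids of the single configured feature ("token" or "char"),
    # optionally pad them, and return (embedding, length, mask) for the batch.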
    def get_embedding(self, batch, pad_shape=None, pad_value=0):
        if self.config.feature.feature_names[0] == "token":
            token_id = batch[cDataset.DOC_TOKEN].to(self.config.device)
            if pad_shape is not None:
                token_id = torch.nn.functional.pad(
                    token_id, pad_shape, mode='constant', value=pad_value)
            embedding = self.token_embedding(token_id)
            length = batch[cDataset.DOC_TOKEN_LEN].to(self.config.device)
            mask = batch[cDataset.DOC_TOKEN_MASK].to(self.config.device)
        else:
            char_id = batch[cDataset.DOC_CHAR].to(self.config.device)
            if pad_shape is not None:
                char_id = torch.nn.functional.pad(
                    char_id, pad_shape, mode='constant', value=pad_value)
            embedding = self.char_embedding(char_id)
            length = batch[cDataset.DOC_CHAR_LEN].to(self.config.device)
            mask = batch[cDataset.DOC_CHAR_MASK].to(self.config.device)
        return embedding, length, mask

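    # Put the embedding parameters first, in their own optimizer groups and
    # flagged with 'is_embedding'; update_lr below relies on them being the
    # first two param groups.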
    def get_parameter_optimizer_dict(self):
        params = list()
        params.append(
            {'params': self.token_embedding.parameters(), 'is_embedding': True})
        params.append(
            {'params': self.char_embedding.parameters(), 'is_embedding': True})
        return params

    def update_lr(self, optimizer, epoch):
        """Update lr
        """
        if epoch > self.config.train.num_epochs_static_embedding:
            for param_group in optimizer.param_groups[:2]:
                param_group["lr"] = self.config.optimizer.learning_rate
        else:
            for param_group in optimizer.param_groups[:2]:
                param_group["lr"] = 0

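    # Concrete classifiers are expected to subclass Classifier and override
    # forward; the base class only provides the shared embedding plumbing.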
    def forward(self, batch):
        raise NotImplementedError
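
A minimal usage sketch (not part of the original example): a concrete model subclasses Classifier, reuses get_embedding and the base dropout layer, and implements forward. The MeanPoolingClassifier name, the linear projection, and the use of dataset.label_map for the output size are illustrative assumptions only.

class MeanPoolingClassifier(Classifier):
    def __init__(self, dataset, config):
        super(MeanPoolingClassifier, self).__init__(dataset, config)
        # Hypothetical projection from the embedding dimension to the labels;
        # assumes dataset.label_map maps label names to ids.
        self.linear = torch.nn.Linear(config.embedding.dimension,
                                      len(dataset.label_map))

    def get_parameter_optimizer_dict(self):
        # Keep the embedding groups first, then append this model's own params.
        params = super(MeanPoolingClassifier,
                       self).get_parameter_optimizer_dict()
        params.append({'params': self.linear.parameters()})
        return params

    def forward(self, batch):
        # embedding: (batch, seq_len, dim); mask: (batch, seq_len), 0 at padding.
        embedding, length, mask = self.get_embedding(batch)
        embedding = embedding * mask.unsqueeze(-1)
        # Mean over the non-padded positions, then dropout and the projection.
        pooled = embedding.sum(dim=1) / length.unsqueeze(-1).float()
        return self.linear(self.dropout(pooled))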