import torch

# Project-local imports; the exact module paths below are assumed from the
# usual layout of this code base (embedding layers and type enums in the
# model/embedding module, cDataset providing the DOC_* field constants).
from dataset.classification_dataset import ClassificationDataset as cDataset
from model.embedding import Embedding
from model.embedding import EmbeddingProcessType
from model.embedding import EmbeddingType
from model.embedding import RegionEmbeddingLayer
from model.model_util import ActivationType


class Classifier(torch.nn.Module):
    """Base class for classifiers: builds token/char embeddings and provides
    shared helpers for sub-classes (embedding lookup, optimizer parameter
    groups, and embedding learning-rate scheduling)."""

    def __init__(self, dataset, config):
        super(Classifier, self).__init__()
        self.config = config
        # Exactly one feature is supported, and it must be token or char.
        assert len(self.config.feature.feature_names) == 1
        assert self.config.feature.feature_names[0] == "token" or \
            self.config.feature.feature_names[0] == "char"
        if config.embedding.type == EmbeddingType.EMBEDDING:
            self.token_embedding = \
                Embedding(dataset.token_map, config.embedding.dimension,
                          cDataset.DOC_TOKEN, config, dataset.VOCAB_PADDING,
                          pretrained_embedding_file=
                          config.feature.token_pretrained_file,
                          mode=EmbeddingProcessType.FLAT,
                          dropout=self.config.embedding.dropout,
                          init_type=self.config.embedding.initializer,
                          low=-self.config.embedding.uniform_bound,
                          high=self.config.embedding.uniform_bound,
                          std=self.config.embedding.random_stddev,
                          fan_mode=self.config.embedding.fan_mode,
                          activation_type=ActivationType.NONE)
            self.char_embedding = \
                Embedding(dataset.char_map, config.embedding.dimension,
                          cDataset.DOC_CHAR, config, dataset.VOCAB_PADDING,
                          mode=EmbeddingProcessType.FLAT,
                          dropout=self.config.embedding.dropout,
                          init_type=self.config.embedding.initializer,
                          low=-self.config.embedding.uniform_bound,
                          high=self.config.embedding.uniform_bound,
                          std=self.config.embedding.random_stddev,
                          fan_mode=self.config.embedding.fan_mode,
                          activation_type=ActivationType.NONE)
        elif config.embedding.type == EmbeddingType.REGION_EMBEDDING:
            self.token_embedding = RegionEmbeddingLayer(
                dataset.token_map, config.embedding.dimension,
                config.embedding.region_size, cDataset.DOC_TOKEN, config,
                padding=dataset.VOCAB_PADDING,
                pretrained_embedding_file=
                config.feature.token_pretrained_file,
                dropout=self.config.embedding.dropout,
                init_type=self.config.embedding.initializer,
                low=-self.config.embedding.uniform_bound,
                high=self.config.embedding.uniform_bound,
                std=self.config.embedding.random_stddev,
                fan_mode=self.config.embedding.fan_mode,
                region_embedding_type=config.embedding.region_embedding_type)
            self.char_embedding = RegionEmbeddingLayer(
                dataset.char_map, config.embedding.dimension,
                config.embedding.region_size, cDataset.DOC_CHAR, config,
                padding=dataset.VOCAB_PADDING,
                dropout=self.config.embedding.dropout,
                init_type=self.config.embedding.initializer,
                low=-self.config.embedding.uniform_bound,
                high=self.config.embedding.uniform_bound,
                std=self.config.embedding.random_stddev,
                fan_mode=self.config.embedding.fan_mode,
                region_embedding_type=config.embedding.region_embedding_type)
        else:
            raise TypeError(
                "Unsupported embedding type: %s." % config.embedding.type)
        self.dropout = torch.nn.Dropout(p=config.train.hidden_layer_dropout)

    def get_embedding(self, batch, pad_shape=None, pad_value=0):
        """Look up the embedding of the configured feature and return
        (embedding, sequence length, padding mask)."""
        if self.config.feature.feature_names[0] == "token":
            token_id = batch[cDataset.DOC_TOKEN].to(self.config.device)
            if pad_shape is not None:
                token_id = torch.nn.functional.pad(
                    token_id, pad_shape, mode='constant', value=pad_value)
            embedding = self.token_embedding(token_id)
            length = batch[cDataset.DOC_TOKEN_LEN].to(self.config.device)
            mask = batch[cDataset.DOC_TOKEN_MASK].to(self.config.device)
        else:
            # Char feature: read char ids and embed them with char_embedding.
            char_id = batch[cDataset.DOC_CHAR].to(self.config.device)
            if pad_shape is not None:
                char_id = torch.nn.functional.pad(
                    char_id, pad_shape, mode='constant', value=pad_value)
            embedding = self.char_embedding(char_id)
            length = batch[cDataset.DOC_CHAR_LEN].to(self.config.device)
            mask = batch[cDataset.DOC_CHAR_MASK].to(self.config.device)
        return embedding, length, mask

    def get_parameter_optimizer_dict(self):
        """Return optimizer parameter groups; the embedding groups come first
        and are flagged so their learning rate can be scheduled separately."""
        params = list()
        params.append(
            {'params': self.token_embedding.parameters(),
             'is_embedding': True})
        params.append(
            {'params': self.char_embedding.parameters(),
             'is_embedding': True})
        return params

    def update_lr(self, optimizer, epoch):
        """Keep the embedding learning rate at 0 for the first
        num_epochs_static_embedding epochs, then switch it to the
        configured learning rate."""
        if epoch > self.config.train.num_epochs_static_embedding:
            for param_group in optimizer.param_groups[:2]:
                param_group["lr"] = self.config.optimizer.learning_rate
        else:
            for param_group in optimizer.param_groups[:2]:
                param_group["lr"] = 0

    def forward(self, batch):
        raise NotImplementedError