def compose_embedding(cls, sub_embs, metadata):
    """Compose the embedding list for ContextualIntentSlot model training.

    Builds two embeddings. The first is the word embedding of the last
    utterance concatenated with the word-level dict, char, and contextual
    token features. The second is the word embedding of the sequence of
    utterances (which includes the last utterance). The two embeddings are
    NOT concatenated with each other; they are passed to the model
    individually (outer list uses concat=False).

    Args:
        sub_embs: mapping from ModelInput name to its sub-embedding module;
            missing keys yield None entries via .get().
        metadata: unused here; kept for interface compatibility with other
            compose_embedding implementations.

    Returns:
        EmbeddingList: outer non-concatenated list holding (1) the
        concatenated last-utterance embedding and (2) the
        utterance-sequence embedding.
    """
    return EmbeddingList(
        embeddings=[
            EmbeddingList(
                embeddings=[
                    sub_embs.get(ModelInput.TEXT),
                    sub_embs.get(ModelInput.DICT),
                    sub_embs.get(ModelInput.CHAR),
                    sub_embs.get(ModelInput.CONTEXTUAL_TOKEN_EMBEDDING),
                ],
                concat=True,
            ),
            sub_embs.get(ModelInput.SEQ),
        ],
        concat=False,
    )
def create_embedding(cls, config, tensorizers):
    """Create word and sequence embeddings, grouped without concatenation.

    The word embedding is wrapped in its own concat=True EmbeddingList so the
    outer structure matches what the model's forward pass expects.
    """
    token_embedding = create_module(
        config.word_embedding,
        tensorizer=tensorizers["tokens"],
        init_from_saved_state=config.init_from_saved_state,
    )
    sequence_embedding = create_module(
        config.seq_embedding, tensorizer=tensorizers["seq_tokens"]
    )
    word_part = EmbeddingList([token_embedding], concat=True)
    return EmbeddingList([word_part, sequence_embedding], concat=False)
def compose_embedding(
    cls, sub_emb_module_dict: Dict[str, EmbeddingBase]
) -> EmbeddingList:
    """Keep only the first sub-embedding; do not concatenate input/output sequences."""
    # Star-unpack (rather than next(iter(...))) so an empty dict still raises
    # ValueError, matching the original unpacking behavior.
    head, *_rest = sub_emb_module_dict.values()
    return EmbeddingList((head,), concat=False)
def create_embedding(cls, config, tensorizers: Dict[str, Tensorizer]):
    """Create a concatenated word + byte embedding over the token column."""
    tokens_cfg = config.inputs.tokens
    bytes_cfg = config.inputs.token_bytes
    # Both tensorizers must read from the same raw text column.
    assert tokens_cfg.column == bytes_cfg.column
    token_emb = create_module(
        config.embedding,
        tensorizer=tensorizers["tokens"],
        init_from_saved_state=config.init_from_saved_state,
    )
    byte_emb = create_module(
        config.byte_embedding, vocab_size=ByteTokenTensorizer.NUM_BYTES
    )
    return EmbeddingList([token_emb, byte_emb], concat=True)
def from_config(
    cls,
    model_config,
    feature_config=None,
    metadata: CommonMetadata = None,
    tensorizers: Dict[str, Tensorizer] = None,
):
    """Build the parser from config.

    Supports two construction paths: the tensorizer-based path (new-style)
    and the metadata-based path (legacy), selected by whether `tensorizers`
    is provided.
    """
    comp_type = model_config.compositional_type
    lstm_dim = model_config.lstm.lstm_dim

    # Choose the compositional function for reduced subtrees.
    if comp_type == RNNGParser.Config.CompositionalType.SUM:
        p_compositional = CompositionalSummationNN(lstm_dim=lstm_dim)
    elif comp_type == RNNGParser.Config.CompositionalType.BLSTM:
        p_compositional = CompositionalNN(lstm_dim=lstm_dim)
    else:
        raise ValueError(
            "Cannot understand compositional flag {}".format(comp_type)
        )

    if tensorizers is None:
        # Legacy path: embeddings and action vocab come from metadata.
        embedding = Model.create_embedding(feature_config, metadata=metadata)
        actions_params = metadata
        actions_vocab = metadata.actions_vocab
    else:
        word_module = create_module(
            model_config.embedding, tensorizer=tensorizers["tokens"]
        )
        embedding = EmbeddingList([word_module], concat=True)
        actions_params = tensorizers["actions"]
        actions_vocab = actions_params.vocab

    return cls(
        ablation=model_config.ablation,
        constraints=model_config.constraints,
        lstm_num_layers=model_config.lstm.num_layers,
        lstm_dim=lstm_dim,
        max_open_NT=model_config.max_open_NT,
        dropout=model_config.dropout,
        actions_vocab=actions_vocab,
        shift_idx=actions_params.shift_idx,
        reduce_idx=actions_params.reduce_idx,
        ignore_subNTs_roots=actions_params.ignore_subNTs_roots,
        valid_NT_idxs=actions_params.valid_NT_idxs,
        valid_IN_idxs=actions_params.valid_IN_idxs,
        valid_SL_idxs=actions_params.valid_SL_idxs,
        embedding=embedding,
        p_compositional=p_compositional,
    )
def setUp(self):
    """Build a small RNNGParser in training mode for the tests."""
    action_names = [
        "IN:A",
        "IN:B",
        "IN:UNSUPPORTED",
        "REDUCE",
        "SHIFT",
        "SL:C",
        "SL:D",
    ]
    # Counter over the list gives each action a count of exactly 1.
    actions_vocab = Vocab(Counter(action_names), specials=[])
    word_emb = WordEmbedding(
        num_embeddings=5,
        embedding_dim=20,
        embeddings_weight=None,
        init_range=[-1, 1],
        unk_token_idx=4,
        mlp_layer_dims=[],
    )
    dict_emb = DictEmbedding(
        num_embeddings=4, embed_dim=10, pooling_type=PoolingType.MEAN
    )
    self.parser = RNNGParser(
        ablation=RNNGParser.Config.AblationParams(),
        constraints=RNNGParser.Config.RNNGConstraints(),
        lstm_num_layers=2,
        lstm_dim=20,
        max_open_NT=10,
        dropout=0.2,
        beam_size=3,
        top_k=3,
        actions_vocab=actions_vocab,
        shift_idx=4,
        reduce_idx=3,
        ignore_subNTs_roots=[2],
        valid_NT_idxs=[0, 1, 2, 5, 6],
        valid_IN_idxs=[0, 1, 2],
        valid_SL_idxs=[5, 6],
        embedding=EmbeddingList(embeddings=[word_emb, dict_emb], concat=True),
        p_compositional=CompositionalNN(lstm_dim=20),
    )
    self.parser.train()
def create_embedding(cls, config, tensorizers: Dict[str, Tensorizer]):
    """Create a concatenated word + byte-CNN embedding for the token column."""
    tokens_cfg = config.inputs.tokens
    bytes_cfg = config.inputs.token_bytes
    # Word and byte tensorizers must consume the same raw text column.
    assert tokens_cfg.column == bytes_cfg.column
    word_emb = create_module(config.embedding, tensorizer=tensorizers["tokens"])
    byte_cfg = config.byte_embedding
    byte_emb = CharacterEmbedding(
        ByteTokenTensorizer.NUM_BYTES,
        byte_cfg.embed_dim,
        byte_cfg.cnn.kernel_num,
        byte_cfg.cnn.kernel_sizes,
        byte_cfg.highway_layers,
        byte_cfg.projection_dim,
    )
    return EmbeddingList([word_emb, byte_emb], concat=True)
def _create_embeddings(
    cls, config: Config, tensorizers: Dict[str, Tensorizer]
) -> nn.ModuleList:
    """Create one embedding module per configured input pair.

    For each entry in INPUTS_PAIR, builds the sub-embeddings whose config
    attribute and tensorizer are both present. A lone sub-embedding is used
    directly; multiple are wrapped in a concatenating EmbeddingList.
    """
    modules = []
    for input_names in cls.INPUTS_PAIR:
        built = [
            cls._create_embedding(getattr(config, emb_name), tensorizers[name])
            for emb_name, name in zip(cls.EMBEDDINGS, input_names)
            if hasattr(config, emb_name) and name in tensorizers
        ]
        if len(built) == 1:
            modules.append(built[0])
        else:
            modules.append(EmbeddingList(embeddings=built, concat=True))
    return nn.ModuleList(modules)
def setUp(self):
    """Build a training RNNG model and its traced inference counterpart."""
    contextual_emb_dim = 1
    emb_module = EmbeddingList(
        embeddings=[
            WordEmbedding(num_embeddings=103, embedding_dim=100),
            DictEmbedding(
                num_embeddings=59, embed_dim=10, pooling_type=PoolingType.MEAN
            ),
            ContextualTokenEmbedding(contextual_emb_dim),
        ],
        concat=True,
    )
    # Action indices 0/1 are SHIFT/REDUCE; 2..19 are the nonterminals.
    nt_idxs = list(range(2, 20))
    self.training_model = RNNGModel(
        input_for_trace=RNNGModel.get_input_for_trace(contextual_emb_dim),
        embedding=emb_module,
        ablation=RNNGParser.Config.AblationParams(),
        constraints=RNNGParser.Config.RNNGConstraints(),
        lstm_num_layers=2,
        lstm_dim=32,
        max_open_NT=10,
        dropout=0.4,
        num_actions=20,
        shift_idx=0,
        reduce_idx=1,
        ignore_subNTs_roots=[8, 15],
        valid_NT_idxs=nt_idxs,
        valid_IN_idxs=[2, 4, 7, 8, 10, 12, 13, 14, 15],
        valid_SL_idxs=[3, 5, 6, 9, 11, 16, 17, 18, 19],
        embedding_dim=emb_module.embedding_dim,
        p_compositional=CompositionalNN(lstm_dim=32, device="cpu"),
        device="cpu",
    )
    self.training_model.train()
    self.inference_model = RNNGInference(
        self.training_model.trace_embedding(),
        self.training_model.jit_model,
        MockVocab(["<unk>", "foo", "bar"]),
        MockVocab(["<unk>", "a", "b"]),
        MockVocab(["SHIFT", "REDUCE", "IN:END_CALL", "SL:METHOD_CALL"]),
    )
    self.inference_model.eval()
def compose_embedding(cls, sub_embs, metadata):
    """Group all sub-embeddings into one EmbeddingList without concatenating them.

    Args:
        sub_embs: mapping from input name to sub-embedding module.
        metadata: unused; kept for interface compatibility.
    """
    modules = sub_embs.values()
    return EmbeddingList(modules, concat=False)