    def compose_embedding(cls, sub_embs, metadata):
        """Compose the embedding list for ContextualIntentSlot model training.

        The first embedding is the word embedding of the last utterance,
        concatenated with the word-level dictionary feature (plus character
        and contextual token embeddings when present). The second is the word
        embedding of the sequence of utterances (which includes the last
        utterance). The two embeddings are not concatenated; they are passed
        to the model individually.

        Args:
            sub_embs (dict): mapping from ModelInput names to sub-embedding
                modules.
            metadata: metadata for embedding composition (unused here).

        Returns:
            EmbeddingList: contains the embedding of the last utterance with
                dictionary feature and the embedding of the sequence of
                utterances.

        """
        return EmbeddingList(
            embeddings=[
                EmbeddingList(
                    embeddings=[
                        sub_embs.get(ModelInput.TEXT),
                        sub_embs.get(ModelInput.DICT),
                        sub_embs.get(ModelInput.CHAR),
                        sub_embs.get(ModelInput.CONTEXTUAL_TOKEN_EMBEDDING),
                    ],
                    concat=True,
                ),
                sub_embs.get(ModelInput.SEQ),
            ],
            concat=False,
        )
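For intuition, here is a minimal sketch of the concat semantics assumed above, using plain torch tensors rather than PyText's EmbeddingList: concat=True joins the sub-embedding outputs along the last dimension, while concat=False leaves them as separate tensors for the model to consume individually.

import torch

word_emb = torch.randn(2, 7, 100)  # (batch, tokens, word_dim) for the last utterance
dict_emb = torch.randn(2, 7, 10)   # (batch, tokens, dict_feature_dim)
seq_emb = torch.randn(2, 5, 64)    # (batch, utterances, seq_dim)

last_utterance = torch.cat([word_emb, dict_emb], dim=-1)  # inner list, concat=True
model_inputs = [last_utterance, seq_emb]                  # outer list, concat=False
assert last_utterance.shape == (2, 7, 110)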
Example #2
    def create_embedding(cls, config, tensorizers):
        word_emb = create_module(
            config.word_embedding,
            tensorizer=tensorizers["tokens"],
            init_from_saved_state=config.init_from_saved_state,
        )

        seq_emb_tensorizer = tensorizers["seq_tokens"]
        seq_emb = create_module(config.seq_embedding,
                                tensorizer=seq_emb_tensorizer)
        return EmbeddingList([EmbeddingList([word_emb], concat=True), seq_emb],
                             concat=False)
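This builds the same two-level structure that Example #1's compose_embedding returns: an inner concat=True list carrying the token-level features, and an outer concat=False list that keeps the sequence embedding separate.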
Example #3
 def compose_embedding(
         cls, sub_emb_module_dict: Dict[str,
                                        EmbeddingBase]) -> EmbeddingList:
     """Disable concatenation of input and output sequences."""
     # The previous behavior passed every sub-embedding through:
     # return EmbeddingList(sub_emb_module_dict.values(), concat=False)
     # Keep only the first sub-embedding (the sequence embedding).
     seq, *_ = sub_emb_module_dict.values()
     return EmbeddingList((seq,), concat=False)
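One subtlety worth flagging: the starred unpacking keeps whichever sub-embedding was registered first, so this relies on dict insertion order (guaranteed since Python 3.7). A stand-in illustration, with strings in place of embedding modules:

sub_emb_module_dict = {"seq_tokens": "seq_emb", "tokens": "word_emb"}
seq, *rest = sub_emb_module_dict.values()
assert seq == "seq_emb"  # the first-inserted entry wins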
Example #4
    def create_embedding(cls, config, tensorizers: Dict[str, Tensorizer]):
        word_tensorizer = config.inputs.tokens
        byte_tensorizer = config.inputs.token_bytes
        assert word_tensorizer.column == byte_tensorizer.column

        word_embedding = create_module(
            config.embedding,
            tensorizer=tensorizers["tokens"],
            init_from_saved_state=config.init_from_saved_state,
        )
        byte_embedding = create_module(
            config.byte_embedding, vocab_size=ByteTokenTensorizer.NUM_BYTES)
        return EmbeddingList([word_embedding, byte_embedding], concat=True)
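A note on the fixed vocabulary size: a byte embedding never needs more than 256 rows, one per possible byte value, which is presumably what ByteTokenTensorizer.NUM_BYTES encodes (256 is an assumption here; check the tensorizer's definition):

NUM_BYTES = 256  # assumption: one embedding row per possible byte value
assert all(0 <= b < NUM_BYTES for b in "héllo".encode("utf-8"))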
Example #5
    def from_config(
        cls,
        model_config,
        feature_config=None,
        metadata: CommonMetadata = None,
        tensorizers: Dict[str, Tensorizer] = None,
    ):
        if model_config.compositional_type == RNNGParser.Config.CompositionalType.SUM:
            p_compositional = CompositionalSummationNN(
                lstm_dim=model_config.lstm.lstm_dim)
        elif (model_config.compositional_type ==
              RNNGParser.Config.CompositionalType.BLSTM):
            p_compositional = CompositionalNN(
                lstm_dim=model_config.lstm.lstm_dim)
        else:
            raise ValueError("Cannot understand compositional flag {}".format(
                model_config.compositional_type))

        if tensorizers is not None:
            embedding = EmbeddingList(
                [
                    create_module(model_config.embedding,
                                  tensorizer=tensorizers["tokens"])
                ],
                concat=True,
            )
            actions_params = tensorizers["actions"]
            actions_vocab = actions_params.vocab
        else:
            embedding = Model.create_embedding(feature_config,
                                               metadata=metadata)
            actions_params = metadata
            actions_vocab = metadata.actions_vocab

        return cls(
            ablation=model_config.ablation,
            constraints=model_config.constraints,
            lstm_num_layers=model_config.lstm.num_layers,
            lstm_dim=model_config.lstm.lstm_dim,
            max_open_NT=model_config.max_open_NT,
            dropout=model_config.dropout,
            actions_vocab=actions_vocab,
            shift_idx=actions_params.shift_idx,
            reduce_idx=actions_params.reduce_idx,
            ignore_subNTs_roots=actions_params.ignore_subNTs_roots,
            valid_NT_idxs=actions_params.valid_NT_idxs,
            valid_IN_idxs=actions_params.valid_IN_idxs,
            valid_SL_idxs=actions_params.valid_SL_idxs,
            embedding=embedding,
            p_compositional=p_compositional,
        )
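An aside on the if/elif dispatch above: the same selection can be written as a lookup table that fails loudly on unknown flags. A generic sketch of the pattern, with plain stand-ins rather than PyText's CompositionalSummationNN/CompositionalNN classes:

from enum import Enum

class CompositionalType(Enum):
    SUM = "sum"
    BLSTM = "blstm"

# Stand-in constructors for the two compositional modules.
def make_sum_nn(lstm_dim):
    return ("CompositionalSummationNN", lstm_dim)

def make_blstm_nn(lstm_dim):
    return ("CompositionalNN", lstm_dim)

DISPATCH = {
    CompositionalType.SUM: make_sum_nn,
    CompositionalType.BLSTM: make_blstm_nn,
}

flag = CompositionalType.SUM
try:
    p_compositional = DISPATCH[flag](lstm_dim=20)
except KeyError:
    raise ValueError("Cannot understand compositional flag {}".format(flag))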
Example #6
    def setUp(self):
        actions_counter = Counter()
        for action in [
            "IN:A",
            "IN:B",
            "IN:UNSUPPORTED",
            "REDUCE",
            "SHIFT",
            "SL:C",
            "SL:D",
        ]:
            actions_counter[action] += 1
        actions_vocab = Vocab(actions_counter, specials=[])

        self.parser = RNNGParser(
            ablation=RNNGParser.Config.AblationParams(),
            constraints=RNNGParser.Config.RNNGConstraints(),
            lstm_num_layers=2,
            lstm_dim=20,
            max_open_NT=10,
            dropout=0.2,
            beam_size=3,
            top_k=3,
            actions_vocab=actions_vocab,
            shift_idx=4,
            reduce_idx=3,
            ignore_subNTs_roots=[2],
            valid_NT_idxs=[0, 1, 2, 5, 6],
            valid_IN_idxs=[0, 1, 2],
            valid_SL_idxs=[5, 6],
            embedding=EmbeddingList(
                embeddings=[
                    WordEmbedding(
                        num_embeddings=5,
                        embedding_dim=20,
                        embeddings_weight=None,
                        init_range=[-1, 1],
                        unk_token_idx=4,
                        mlp_layer_dims=[],
                    ),
                    DictEmbedding(
                        num_embeddings=4, embed_dim=10, pooling_type=PoolingType.MEAN
                    ),
                ],
                concat=True,
            ),
            p_compositional=CompositionalNN(lstm_dim=20),
        )
        self.parser.train()
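A quick sanity check on the hard-coded indices: with specials=[] and every count equal to 1, torchtext's Vocab falls back to alphabetical ordering, which is exactly what shift_idx, reduce_idx, and the valid-index lists above assume (my reading; worth verifying against your torchtext version):

itos = ["IN:A", "IN:B", "IN:UNSUPPORTED", "REDUCE", "SHIFT", "SL:C", "SL:D"]
assert itos.index("SHIFT") == 4 and itos.index("REDUCE") == 3
assert [i for i, a in enumerate(itos) if a.startswith("IN:")] == [0, 1, 2]
assert [i for i, a in enumerate(itos) if a.startswith("SL:")] == [5, 6]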
Example #7
    def create_embedding(cls, config, tensorizers: Dict[str, Tensorizer]):
        word_tensorizer = config.inputs.tokens
        byte_tensorizer = config.inputs.token_bytes
        assert word_tensorizer.column == byte_tensorizer.column

        word_embedding = create_module(config.embedding,
                                       tensorizer=tensorizers["tokens"])
        byte_embedding = CharacterEmbedding(
            ByteTokenTensorizer.NUM_BYTES,        # vocab size: one entry per byte value
            config.byte_embedding.embed_dim,      # per-byte embedding dimension
            config.byte_embedding.cnn.kernel_num,  # number of CNN filters per kernel size
            config.byte_embedding.cnn.kernel_sizes,  # CNN kernel widths
            config.byte_embedding.highway_layers,  # highway layers on top of the CNN
            config.byte_embedding.projection_dim,  # optional final projection size
        )
        return EmbeddingList([word_embedding, byte_embedding], concat=True)
Example #8
    def _create_embeddings(
        cls, config: Config, tensorizers: Dict[str, Tensorizer]
    ) -> nn.ModuleList:
        embeddings = []
        for inputs in cls.INPUTS_PAIR:
            embedding_list = []
            for emb, input in zip(cls.EMBEDDINGS, inputs):
                if hasattr(config, emb) and input in tensorizers:
                    embedding_list.append(
                        cls._create_embedding(getattr(config, emb), tensorizers[input])
                    )

            if len(embedding_list) == 1:
                embeddings.append(embedding_list[0])
            else:
                embeddings.append(EmbeddingList(embeddings=embedding_list, concat=True))
        return nn.ModuleList(embeddings)
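To make the pairing concrete, here are hypothetical values for the class attributes this loop consumes (assumed for illustration, e.g. a two-sentence pairwise model; not verified against PyText's actual constants):

# One config attribute name per embedding kind, and one tensorizer-name
# list per input side; zip() pairs them up within each side.
EMBEDDINGS = ["embedding"]
INPUTS_PAIR = [["tokens1"], ["tokens2"]]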
Example #9
 def setUp(self):
     contextual_emb_dim = 1
     emb_module = EmbeddingList(
         embeddings=[
             WordEmbedding(num_embeddings=103, embedding_dim=100),
             DictEmbedding(
                 num_embeddings=59, embed_dim=10, pooling_type=PoolingType.MEAN
             ),
             ContextualTokenEmbedding(contextual_emb_dim),
         ],
         concat=True,
     )
     self.training_model = RNNGModel(
         input_for_trace=RNNGModel.get_input_for_trace(contextual_emb_dim),
         embedding=emb_module,
         ablation=RNNGParser.Config.AblationParams(),
         constraints=RNNGParser.Config.RNNGConstraints(),
         lstm_num_layers=2,
         lstm_dim=32,
         max_open_NT=10,
         dropout=0.4,
         num_actions=20,
         shift_idx=0,
         reduce_idx=1,
         ignore_subNTs_roots=[8, 15],
         valid_NT_idxs=[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
         valid_IN_idxs=[2, 4, 7, 8, 10, 12, 13, 14, 15],
         valid_SL_idxs=[3, 5, 6, 9, 11, 16, 17, 18, 19],
         embedding_dim=emb_module.embedding_dim,
         p_compositional=CompositionalNN(lstm_dim=32, device="cpu"),
         device="cpu",
     )
     self.training_model.train()
     self.inference_model = RNNGInference(
         self.training_model.trace_embedding(),
         self.training_model.jit_model,
         MockVocab(["<unk>", "foo", "bar"]),  # word vocab
         MockVocab(["<unk>", "a", "b"]),  # dict feature vocab
         MockVocab(["SHIFT", "REDUCE", "IN:END_CALL", "SL:METHOD_CALL"]),  # actions vocab
     )
     self.inference_model.eval()
Example #10
 def compose_embedding(cls, sub_embs, metadata):
     """Default composition: pass every sub-embedding through unconcatenated."""
     return EmbeddingList(sub_embs.values(), concat=False)