Example #1
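    # BLSTM encoder: POS-tag or GloVe token embeddings plus a binary feature embedding,
    # fed through forward and backward AugmentedLstm layers and a final linear scorer.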
    def __init__(self, vocab, use_postags_only=True, embed_dim=100, hidden_size=200, recurrent_dropout_probability=0.3,
                 use_highway=False,
                 maxpool=True):
        super(BLSTMModel, self).__init__()

        self.embeds = Embedding.from_params(
            vocab,
            Params({'vocab_namespace': 'pos' if use_postags_only else 'tokens',
                    'embedding_dim': embed_dim,
                    'trainable': True,
                    'padding_index': 0,
                    'pretrained_file': None if use_postags_only else 'https://s3-us-west-2.amazonaws.com/allennlp/datasets/glove/glove.6B.100d.txt.gz',
                    }))
        self.binary_feature_embedding = Embedding(2, embed_dim)

        self.fwd_lstm = PytorchSeq2SeqWrapper(AugmentedLstm(
            input_size=embed_dim * 2, hidden_size=hidden_size, go_forward=True,
            recurrent_dropout_probability=recurrent_dropout_probability,
            use_input_projection_bias=False, use_highway=use_highway), stateful=False)

        self.bwd_lstm = PytorchSeq2SeqWrapper(AugmentedLstm(
            input_size=embed_dim * 2, hidden_size=hidden_size, go_forward=False,
            recurrent_dropout_probability=recurrent_dropout_probability,
            use_input_projection_bias=False, use_highway=use_highway), stateful=False)

        self.maxpool = maxpool
        self.fc = nn.Linear(hidden_size * 2, 1, bias=False)
Example #2
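    # Simple bidirectional language model: two stateful stacked LSTMs (forward and reverse)
    # feed a shared linear decoder over the token vocabulary; EOS and invalid token ids are
    # cached as buffers.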
    def __init__(self,
                 vocab: Vocabulary,
                 recurrent_dropout_probability: float = 0.0,
                 embedding_dropout_probability: float = 0.0,
                 input_size=512,
                 hidden_size=512) -> None:
        """
        :param options_file: for initializing elmo BiLM
        :param weight_file: for initializing elmo BiLM
        :param requires_grad: Whether or not to finetune the LSTM layers
        :param recurrent_dropout_probability: recurrent dropout to add to LSTM layers
        """
        super(SimpleBiLM, self).__init__()

        self.forward_lm = PytorchSeq2SeqWrapper(StackedLstm(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=2,
            go_forward=True,
            recurrent_dropout_probability=recurrent_dropout_probability,
            use_input_projection_bias=False,
            use_highway=True),
                                                stateful=True)
        self.reverse_lm = PytorchSeq2SeqWrapper(StackedLstm(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=2,
            go_forward=False,
            recurrent_dropout_probability=recurrent_dropout_probability,
            use_input_projection_bias=False,
            use_highway=True),
                                                stateful=True)

        # This will also be the encoder
        self.decoder = torch.nn.Linear(
            hidden_size, vocab.get_vocab_size(namespace='tokens'))

        self.vocab = vocab
        self.register_buffer(
            'eos_tokens',
            torch.LongTensor([
                vocab.get_token_index(tok) for tok in [
                    '.', '!', '?', '@@UNKNOWN@@', '@@PADDING@@', '@@bos@@',
                    '@@eos@@'
                ]
            ]))
        self.register_buffer(
            'invalid_tokens',
            torch.LongTensor([
                vocab.get_token_index(tok) for tok in [
                    '@@UNKNOWN@@', '@@PADDING@@', '@@bos@@', '@@eos@@',
                    '@@NEWLINE@@'
                ]
            ]))
        self.embedding_dropout_probability = embedding_dropout_probability
Example #3
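    # Test setup: builds a vocabulary from a toy reversal dataset, wraps a 2-layer bidirectional
    # nn.LSTM as the encoder, and assembles a SimpleSeq2Seq model with dot-product attention on GPU 0.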
    def setUp(self):
        self.reader = ToyReader()
        self.train_instances = self.reader.read("/home/IAIS/nchakrabor/nmt_data/toy_reverse/train/toy_train.txt")
        self.dev_instances = self.reader.read("/home/IAIS/nchakrabor/nmt_data/toy_reverse/dev/toy_dev.txt")
        self.vocab = Vocabulary.from_instances(self.train_instances + self.dev_instances)

        token_embedding = Embedding(num_embeddings=self.vocab.get_vocab_size('tokens') + 2,
                                    embedding_dim=256, padding_index=0)

        word_embeddings: TextFieldEmbedder = BasicTextFieldEmbedder({"tokens": token_embedding})

        encoder = PytorchSeq2SeqWrapper(nn.LSTM(input_size=word_embeddings.get_output_dim(),
                                                num_layers=2,
                                                hidden_size=256,
                                                bidirectional=True,
                                                dropout=0.4,
                                                batch_first=True))

        # self.set_up_model(model_params_file_path, dataset_sample_file_path)
        self.model = SimpleSeq2Seq(vocab=self.vocab,
                                   source_embedder=word_embeddings,
                                   encoder=encoder,
                                   target_embedding_dim=256,
                                   target_namespace='target_tokens',
                                   attention=DotProductAttention(),
                                   max_decoding_steps=25,
                                   beam_size=5,
                                   use_bleu=True
                                   )

        self.model.cuda(0)
Example #4
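    # CopyNet initialization: loads a serialized vocabulary, then builds the source embedder,
    # an LSTM encoder wrapped in PytorchSeq2SeqWrapper, bilinear attention, and the CopyNetSeq2Seq parser.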
    def __init__(self):
        # CopyNet model initialization parameters
        self.vocabulary = Vocabulary.from_files(
            "C:/Users/Selma/PycharmProjects/ROS2SemanticParser/"
            "CN_model_weights/no_embedds/model.tar.gz")
        self.source_embedder = BasicTextFieldEmbedder(
            token_embedders={
                'tokens':
                Embedding(num_embeddings=self.vocabulary.get_vocab_size(
                    'source_tokens'),
                          embedding_dim=310)
            })
        self.dataset_reader = CopyNetDatasetReader(
            target_namespace="target_tokens")
        self.encoder = PytorchSeq2SeqWrapper(
            torch.nn.LSTM(input_size=310,
                          hidden_size=128,
                          num_layers=1,
                          batch_first=True))
        self.attention = BilinearAttention(vector_dim=128, matrix_dim=128)
        self.beam_size = 5
        self.max_decoding_steps = 200
        self.target_embedding_dim = 150

        self.semantic_parser = CopyNetSeq2Seq(
            vocab=self.vocabulary,
            source_embedder=self.source_embedder,
            encoder=self.encoder,
            attention=self.attention,
            beam_size=self.beam_size,
            max_decoding_steps=self.max_decoding_steps,
            target_embedding_dim=self.target_embedding_dim)
Example #5
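    # Minimal from_params: enforce batch_first, instantiate the wrapped PyTorch module,
    # and return a non-stateful PytorchSeq2SeqWrapper.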
    def from_params(self, params: Params) -> PytorchSeq2SeqWrapper:
        if not params.pop_bool('batch_first', True):
            raise ConfigurationError("Our encoder semantics assumes batch is always first!")
        if self._module_class in self.PYTORCH_MODELS:
            params['batch_first'] = True
        module = self._module_class(**params.as_dict())
        return PytorchSeq2SeqWrapper(module)
Example #6
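    # from_params variant that builds a custom recurrent module from an explicit Cell
    # configuration before wrapping it.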
    @classmethod
    def from_params(cls, params: Params) -> 'PytorchSeq2SeqWrapper':
        input_size = params.pop("input_size")
        hidden_size = params.pop("hidden_size")
        cell_params = params.pop("cell")
        cell = Cell.from_params(cell_params)
        return PytorchSeq2SeqWrapper(
            cls(input_size=input_size, hidden_size=hidden_size, cell=cell))
Example #7
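    # from_params variant supporting a 'stateful' flag and optional (variational) weight drop
    # applied to all recurrent weight_hh parameters of the wrapped module.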
    def from_params(self, params: Params, **extras) -> PytorchSeq2SeqWrapper:
        if not params.pop_bool('batch_first', True):
            raise ConfigurationError(
                "Our encoder semantics assumes batch is always first!")
        if self._module_class in self.PYTORCH_MODELS:
            params['batch_first'] = True
        stateful = params.pop_bool('stateful', False)
        weight_dropout = params.pop_float('weight_dropout', 0.0)
        variational = params.pop_bool('variational', True)
        num_layers = params.get('num_layers', 1)
        bidirectional = params.get('bidirectional', False)
        all_recurrent_weights = [
            f"weight_hh_l{layer}{suffix}"
            for layer, suffix in product(range(num_layers), [""] +
                                         ["_reverse"] *
                                         (1 if bidirectional else 0))
        ]

        if weight_dropout > 0.0:
            module = weight_drop_factory(self._module_class)(
                module_args=params.as_dict(infer_type_and_cast=True),
                weights=all_recurrent_weights,
                wdrop=weight_dropout,
                variational=variational,
            )
        else:
            module = self._module_class(**params.as_dict(
                infer_type_and_cast=True))

        return PytorchSeq2SeqWrapper(module, stateful=stateful)
Example #8
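# Builds a StackedEncoder of LSTM, bidirectional LSTM, or Transformer layers
# according to the settings dictionary st_ds_conf.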
def get_encoder(st_ds_conf: dict):
    emb_sz = st_ds_conf['emb_sz']
    if st_ds_conf['encoder'] == 'lstm':
        encoder = StackedEncoder(
            [
                PytorchSeq2SeqWrapper(
                    torch.nn.LSTM(emb_sz, emb_sz, batch_first=True))
                for _ in range(st_ds_conf['num_enc_layers'])
            ],
            emb_sz,
            emb_sz,
            input_dropout=st_ds_conf['intermediate_dropout'])
    elif st_ds_conf['encoder'] == 'bilstm':
        encoder = StackedEncoder(
            [
                PytorchSeq2SeqWrapper(
                    torch.nn.LSTM(
                        emb_sz, emb_sz, batch_first=True, bidirectional=True))
            ] + [
                PytorchSeq2SeqWrapper(
                    torch.nn.LSTM(emb_sz * 2,
                                  emb_sz,
                                  batch_first=True,
                                  bidirectional=True))
                for _ in range(st_ds_conf['num_enc_layers'] - 1)
            ],
            emb_sz,
            emb_sz * 2,
            input_dropout=st_ds_conf['intermediate_dropout'])
    elif st_ds_conf['encoder'] == 'transformer':
        encoder = StackedEncoder([
            TransformerEncoder(
                input_dim=emb_sz,
                num_layers=st_ds_conf['num_enc_layers'],
                num_heads=st_ds_conf['num_heads'],
                feedforward_hidden_dim=emb_sz,
                feedforward_dropout=st_ds_conf['feedforward_dropout'],
                residual_dropout=st_ds_conf['residual_dropout'],
                attention_dropout=st_ds_conf['attention_dropout'],
            ) for _ in range(st_ds_conf['num_enc_layers'])
        ],
                                 emb_sz,
                                 emb_sz,
                                 input_dropout=0.)
    else:
        raise ValueError('unknown encoder type: {}'.format(st_ds_conf['encoder']))
    return encoder
Example #9
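    # from_params variant that also reads a 'stateful' flag and casts the remaining
    # params before constructing the wrapped module.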
    def from_params(self, params: Params) -> PytorchSeq2SeqWrapper:
        if not params.pop_bool("batch_first", True):
            raise ConfigurationError("Our encoder semantics assumes batch is always first!")
        if self._module_class in self.PYTORCH_MODELS:
            params["batch_first"] = True
        stateful = params.pop_bool("stateful", False)
        module = self._module_class(**params.as_dict(infer_type_and_cast=True))
        return PytorchSeq2SeqWrapper(module, stateful=stateful)
Example #10
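    # Span module: a single-layer bidirectional LSTM over the input followed by a
    # BidirectionalEndpointSpanExtractor using the "y" combination for both endpoints.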
    def __init__(self,
                 input_dim: int,
                 combination: str = "x,y",
                 num_width_embeddings: int = None,
                 span_width_embedding_dim: int = None,
                 bucket_widths: bool = False,
                 use_exclusive_start_indices: bool = False) -> None:
        super().__init__()

        self._input_dim = input_dim
        self._combination = combination

        self._encoder = PytorchSeq2SeqWrapper(
            StackedBidirectionalLstm(self._input_dim,
                                     int(floor(self._input_dim / 2)), 1))
        self._span_extractor = BidirectionalEndpointSpanExtractor(
            self._input_dim, "y", "y", num_width_embeddings,
            span_width_embedding_dim, bucket_widths)
Example #11
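    # Vision-and-language model: SimpleDetector features, optional variational input dropout,
    # a TimeDistributed bidirectional LSTM encoder, and a configurable backbone and head.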
    def __init__(self, input_dropout, pretrained, average_pool, semantic,
                 final_dim, backbone, head):
        super().__init__()

        self.detector = SimpleDetector(pretrained=pretrained,
                                       average_pool=average_pool,
                                       semantic=semantic,
                                       final_dim=final_dim)
        self.rnn_input_dropout = TimeDistributed(
            InputVariationalDropout(
                input_dropout)) if input_dropout > 0 else None
        self.encoder_model = TimeDistributed(
            PytorchSeq2SeqWrapper(
                torch.nn.LSTM(1280, 256, batch_first=True,
                              bidirectional=True)))
        self.backbone = build_backbone(backbone)
        # self.combine_model = build_combine_layer(combine_model)  ###combine text and image
        self.head = build_head(head)
Example #12
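    # Tree span-refinement module: a sigmoid feed-forward gate plus one of several child
    # aggregation strategies (attention, pooling, convolution, or a bidirectional LSTM).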
    def __init__(self,
                 vocab: Vocabulary,
                 span_emb_dim: int,
                 tree_prop: int = 1,
                 tree_dropout: float = 0.0,
                 tree_children: str = 'attention',
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super(Tree, self).__init__(vocab, regularizer)

        self._span_emb_dim = span_emb_dim
        assert span_emb_dim % 2 == 0

        self._f_network = FeedForward(input_dim=2 * span_emb_dim,
                                      num_layers=1,
                                      hidden_dims=span_emb_dim,
                                      activations=torch.nn.Sigmoid(),
                                      dropout=0)

        self._tree_prop = tree_prop

        self._tree_children = tree_children
        if self._tree_children == 'attention':
            self._global_attention = TimeDistributed(
                torch.nn.Linear(span_emb_dim, 1))
        elif self._tree_children == 'pooling':
            pass
        elif self._tree_children == 'conv':
            self._conv = torch.nn.Conv1d(span_emb_dim,
                                         span_emb_dim,
                                         kernel_size=3,
                                         padding=1)
        elif self._tree_children == 'rnn':
            self._encoder = PytorchSeq2SeqWrapper(
                StackedBidirectionalLstm(span_emb_dim,
                                         int(floor(span_emb_dim / 2)), 1))
        else:
            raise RuntimeError('invalid tree_children option: {}'.format(
                self._tree_children))

        self._dropout = torch.nn.Dropout(p=tree_dropout)

        initializer(self)
Example #13
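    # Reasoning module: a TimeDistributed 2-layer bidirectional LSTM over concatenated
    # features plus bilinear attention between spans and between spans and objects.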
    def __init__(self,
                 pretrained=True,
                 average_pool=True,
                 semantic=True,
                 final_dim=512,
                 input_dropout=0.3,
                 reasoning_use_obj=True,
                 reasoning_use_answer=True,
                 reasoning_use_question=True,
                 pool_reasoning=True,
                 pool_answer=True,
                 pool_question=True):
        super().__init__()

        # self.detector = SimpleDetector(pretrained=pretrained,
        #   average_pool=average_pool, semantic=semantic, final_dim=final_dim)
        self.reasoning_encoder = TimeDistributed(
            PytorchSeq2SeqWrapper(
                torch.nn.LSTM(1536,
                              256,
                              num_layers=2,
                              batch_first=True,
                              bidirectional=True)))
        self.rnn_input_dropout = TimeDistributed(
            InputVariationalDropout(
                input_dropout)) if input_dropout > 0 else None
        self.span_attention = BilinearMatrixAttention(
            matrix_1_dim=final_dim,
            matrix_2_dim=final_dim,
        )

        self.obj_attention = BilinearMatrixAttention(
            matrix_1_dim=final_dim,
            matrix_2_dim=final_dim,
        )
        self.reasoning_use_obj = reasoning_use_obj
        self.reasoning_use_answer = reasoning_use_answer
        self.reasoning_use_question = reasoning_use_question
        self.pool_reasoning = pool_reasoning
        self.pool_answer = pool_answer
        self.pool_question = pool_question

        InitializerApplicator()(self)
Example #14
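# Benchmark: stacks stateful AugmentedLstm wrappers, runs repeated forward passes over
# randomly masked batches, and dumps the measured durations to a JSON file.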
def allennlp_seq2seq(c, num_layers, input, hidden, cell, batch, timestep,
                     repeat, cuda, output):
    num_layers = int(num_layers)
    input = int(input)
    hidden = int(hidden)
    cell = int(cell)
    batch = int(batch)
    timestep = int(timestep)
    repeat = int(repeat)

    lstms = []
    lstm_input = input
    for _ in range(num_layers):
        lstms.append(
            PytorchSeq2SeqWrapper(AugmentedLstm(
                input_size=lstm_input,
                hidden_size=hidden,
                use_highway=False,
                use_input_projection_bias=False,
            ),
                                  stateful=True))
        lstm_input = hidden

    input_tensor = torch.rand(batch, timestep, input)
    if cuda == 'cuda':
        input_tensor = input_tensor.cuda()
        lstms = [l.cuda() for l in lstms]

    durations = []
    for idx in range(repeat):
        batch_lengths = [timestep]
        batch_lengths.extend(
            [random.randrange(timestep + 1) for _ in range(batch - 1)])
        batch_lengths = sorted(batch_lengths, reverse=True)

        mask = torch.zeros(batch, timestep, dtype=torch.long)
        for mask_idx, length in enumerate(batch_lengths):
            mask[mask_idx, :length] = 1
        if cuda == 'cuda':
            mask = mask.cuda()

        with torch.no_grad():
            time_start = time.time()
            lstm_input = input_tensor
            for lstm in lstms:
                lstm_input = lstm(
                    lstm_input,
                    mask,
                )
            durations.append((idx, time.time() - time_start))

    with open(output, 'w') as fout:
        json.dump(
            {
                'type': 'allennlp_seq2seq',
                'cuda': cuda,
                'durations': durations
            },
            fout,
            ensure_ascii=False,
            indent=2,
        )
Example #15
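    # Multi-hop reading-comprehension model: per-hop interactive/self aligners with SFU fusion,
    # bidirectional LSTM aggregation, and a memory-based answer pointer, plus SQuAD metrics.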
    def __init__(
            self,
            vocab: Vocabulary,
            text_field_embedder: TextFieldEmbedder,
            char_field_embedder: TextFieldEmbedder,
            # num_highway_layers: int,
            phrase_layer: Seq2SeqEncoder,
            char_rnn: Seq2SeqEncoder,
            hops: int,
            hidden_dim: int,
            dropout: float = 0.2,
            mask_lstms: bool = True,
            initializer: InitializerApplicator = InitializerApplicator(),
            regularizer: Optional[RegularizerApplicator] = None) -> None:
        super(BidirectionalAttentionFlow, self).__init__(vocab, regularizer)

        self._text_field_embedder = text_field_embedder
        self._char_field_embedder = char_field_embedder
        self._features_embedder = nn.Embedding(2, 5)
        # self._highway_layer = TimeDistributed(Highway(text_field_embedder.get_output_dim() + 5 * 3,
        #                                               num_highway_layers))
        self._phrase_layer = phrase_layer
        self._encoding_dim = phrase_layer.get_output_dim()
        # self._stacked_brnn = PytorchSeq2SeqWrapper(
        #     StackedBidirectionalLstm(input_size=self._encoding_dim, hidden_size=hidden_dim,
        #                              num_layers=3, recurrent_dropout_probability=0.2))
        self._char_rnn = char_rnn

        self.hops = hops

        self.interactive_aligners = nn.ModuleList()
        self.interactive_SFUs = nn.ModuleList()
        self.self_aligners = nn.ModuleList()
        self.self_SFUs = nn.ModuleList()
        self.aggregate_rnns = nn.ModuleList()
        for i in range(hops):
            # interactive aligner
            self.interactive_aligners.append(
                layers.SeqAttnMatch(self._encoding_dim))
            self.interactive_SFUs.append(
                layers.SFU(self._encoding_dim, 3 * self._encoding_dim))
            # self aligner
            self.self_aligners.append(layers.SelfAttnMatch(self._encoding_dim))
            self.self_SFUs.append(
                layers.SFU(self._encoding_dim, 3 * self._encoding_dim))
            # aggregating
            self.aggregate_rnns.append(
                PytorchSeq2SeqWrapper(
                    nn.LSTM(input_size=self._encoding_dim,
                            hidden_size=hidden_dim,
                            num_layers=1,
                            dropout=0.2,
                            bidirectional=True,
                            batch_first=True)))

        # Memory-based Answer Pointer
        self.mem_ans_ptr = layers.MemoryAnsPointer(x_size=self._encoding_dim,
                                                   y_size=self._encoding_dim,
                                                   hidden_size=hidden_dim,
                                                   hop=hops,
                                                   dropout_rate=0.2,
                                                   normalize=True)

        self._span_start_accuracy = CategoricalAccuracy()
        self._span_end_accuracy = CategoricalAccuracy()
        self._span_yesno_accuracy = CategoricalAccuracy()
        self._span_accuracy = BooleanAccuracy()
        self._squad_metrics = SquadEmAndF1()
        if dropout > 0:
            self._dropout = torch.nn.Dropout(p=dropout)
        else:
            self._dropout = lambda x: x
        self._mask_lstms = mask_lstms

        initializer(self)
Example #16
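    # Test setup: reads LC-QuAD instances, builds a vocabulary and token embeddings, wraps a
    # 2-layer bidirectional nn.LSTM as the encoder, and assembles a SimpleSeq2Seq parser on GPU 0.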
    def setUp(self):
        self.sample_only = False
        # self.setupstubexecutor()

        model_params_file_path = self.TEST_DATA_ROOT / "experiment.json"
        self.dataset_sample_file_path = self.TEST_DATA_ROOT / "lcquad.annotated.lisp.v3.deurified.simple.sample.json"
        self.dataset_train_file_path = self.TEST_DATA_ROOT / "lcquad.annotated.lisp.v3.train.json"
        self.dataset_test_file_path = self.TEST_DATA_ROOT / "lcquad.annotated.lisp.v3.test.json"
        predicates_file_path = self.TEST_DATA_ROOT / "properties.txt"
        with codecs.open(predicates_file_path) as fp:
            self.predicates = [i.strip() for i in fp]

        dbo_classes = set([
            dbo for dbo in self.predicates if dbo.split("/")[-1][0].isupper()
        ])
        binary_predicates = set(self.predicates) - dbo_classes

        if self.sample_only:
            self.sample_reader = LCQuADReaderSimple(
                predicates=binary_predicates, ontology_types=dbo_classes)
        else:
            self.train_reader = LCQuADReaderSimple(
                predicates=binary_predicates, ontology_types=dbo_classes)
            # self.test_reader = LCQuADReaderSimple(predicates=binary_predicates, ontology_types=dbo_classes)

        # sample_reader.cache_data("sample_dataset")
        # train_reader.cache_data("train_dataset")
        # test_reader.cache_data("test_dataset")

        if self.sample_only:
            self.sample_instances = list(
                self.sample_reader.read(str(self.dataset_sample_file_path)))
        else:
            self.train_instances = list(
                self.train_reader.read(str(self.dataset_train_file_path)))
            self.test_instances = list(
                self.train_reader.read(str(self.dataset_test_file_path)))

        if self.sample_only:
            self.vocab = Vocabulary.from_instances(self.sample_instances)
        else:
            self.vocab = Vocabulary.from_instances(self.train_instances +
                                                   self.test_instances,
                                                   min_count={
                                                       'tokens': 3,
                                                       'target_tokens': 3
                                                   })
            #min_count={'tokens': 3, 'target_tokens': 3})

        #self.vocab = Vocabulary()

        token_embedding = Embedding(
            num_embeddings=self.vocab.get_vocab_size('tokens') + 2,
            embedding_dim=512,
            padding_index=0)

        #options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json'
        #weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5'

        # the embedder maps the input tokens to the appropriate embedding matrix
        #elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)
        #word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})

        word_embeddings: TextFieldEmbedder = BasicTextFieldEmbedder(
            {"tokens": token_embedding})

        encoder = PytorchSeq2SeqWrapper(
            nn.LSTM(input_size=word_embeddings.get_output_dim(),
                    num_layers=2,
                    hidden_size=256,
                    bidirectional=True,
                    dropout=0.5,
                    batch_first=True))

        val_outputs = self.TEST_DATA_ROOT / "val_outputs.seq2seq.json"

        self.val_outputs_fp = codecs.open(val_outputs, 'w')

        # self.set_up_model(model_params_file_path, dataset_sample_file_path)
        self.model = SimpleSeq2Seq(vocab=self.vocab,
                                   source_embedder=word_embeddings,
                                   encoder=encoder,
                                   target_embedding_dim=128,
                                   target_namespace='target_tokens',
                                   attention=DotProductAttention(),
                                   max_decoding_steps=25,
                                   beam_size=5,
                                   use_bleu=True,
                                   scheduled_sampling_ratio=0.3)

        self.model.cuda(0)
Example #17
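    # Sequence labeling model: configurable input layers (embeddings, char CNN/LSTM, ELMo),
    # input encoders, a stacked bidirectional LSTM / projected / dummy encoder, and a CRF or
    # softmax classification layer.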
    def __init__(self, conf: Dict,
                 input_batchers: Dict[str, Union[WordBatch, CharacterBatch]],
                 n_class: int, use_cuda: bool):
        super(SeqLabelModel, self).__init__()
        self.n_class = n_class
        self.use_cuda = use_cuda
        self.input_dropout = torch.nn.Dropout2d(p=conf["dropout"])
        self.dropout = InputVariationalDropout(p=conf['dropout'])

        input_layers = {}
        for i, c in enumerate(conf['input']):
            if c['type'] == 'embeddings':
                if 'pretrained' in c:
                    embs = load_embedding_txt(c['pretrained'], c['has_header'])
                    logger.info('loaded {0} embedding entries.'.format(
                        len(embs[0])))
                else:
                    embs = None
                name = c['name']
                mapping = input_batchers[name].mapping
                layer = Embeddings(c['dim'],
                                   mapping,
                                   fix_emb=c['fixed'],
                                   embs=embs,
                                   normalize=c.get('normalize', False),
                                   input_field_name=name)
                logger.info('embedding for field {0} '
                            'created with {1} x {2}.'.format(
                                c['field'], layer.n_V, layer.n_d))
                input_layers[name] = layer

            elif c['type'] == 'cnn_encoder' or c['type'] == 'lstm_encoder':
                name = c['name']
                mapping = input_batchers[name].mapping
                embeddings = Embeddings(
                    c['dim'],
                    mapping,
                    fix_emb=False,
                    embs=None,
                    normalize=False,
                    input_field_name='{0}_ch_emb'.format(name))
                logger.info('character embedding for field {0} '
                            'created with {1} x {2}.'.format(
                                c['field'], embeddings.n_V, embeddings.n_d))
                if c['type'] == 'lstm_encoder':
                    layer = LstmTokenEmbedder(c['dim'],
                                              embeddings,
                                              conf['dropout'],
                                              use_cuda,
                                              input_field_name=name)
                elif c['type'] == 'cnn_encoder':
                    layer = ConvTokenEmbedder(c['dim'],
                                              embeddings,
                                              c['filters'],
                                              c.get('n_highway', 1),
                                              c.get('activation', 'relu'),
                                              use_cuda,
                                              input_field_name=name)
                else:
                    raise ValueError('Unknown type: {}'.format(c['type']))
                input_layers[name] = layer

            elif c['type'] == 'elmo':
                name = c['name']
                layer = ContextualizedWordEmbeddings(name, c['path'], use_cuda)
                input_layers[name] = layer

            else:
                raise ValueError('{} unknown input layer'.format(c['type']))

        self.input_layers = torch.nn.ModuleDict(input_layers)
        input_encoders = []
        input_dim = 0
        for i, c in enumerate(conf['input_encoder']):
            input_info = {
                name: self.input_layers[name].get_output_dim()
                for name in c['input']
            }

            if c['type'] == 'affine':
                input_encoder = AffineTransformInputEncoder(
                    input_info, c['dim'], use_cuda)
            elif c['type'] == 'sum':
                input_encoder = SummationInputEncoder(input_info, use_cuda)
            elif c['type'] == 'concat':
                input_encoder = ConcatenateInputEncoder(input_info, use_cuda)
            else:
                raise ValueError('{} unknown input encoder'.format(c['type']))

            input_dim += input_encoder.get_output_dim()
            input_encoders.append(input_encoder)

        self.input_encoders = torch.nn.ModuleList(input_encoders)

        encoder_name = conf['encoder']['type'].lower()
        if encoder_name == 'stacked_bidirectional_lstm':
            lstm = StackedBidirectionalLstm(
                input_size=input_dim,
                hidden_size=conf['encoder']['hidden_dim'],
                num_layers=conf['encoder']['n_layers'],
                recurrent_dropout_probability=conf['dropout'],
                layer_dropout_probability=conf['dropout'],
                use_highway=conf['encoder'].get('use_highway', True))
            self.encoder = PytorchSeq2SeqWrapper(lstm, stateful=False)
            encoded_input_dim = self.encoder.get_output_dim()
        elif encoder_name == 'project':
            self.encoder = ProjectedEncoder(input_dim,
                                            conf['encoder']['hidden_dim'],
                                            dropout=conf['dropout'])
            encoded_input_dim = self.encoder.get_output_dim()
        elif encoder_name == 'dummy':
            self.encoder = DummyEncoder()
            encoded_input_dim = input_dim
        else:
            raise ValueError('Unknown input encoder: {}'.format(encoder_name))

        if conf["classifier"]["type"].lower() == 'crf':
            self.classify_layer = CRFLayer(encoded_input_dim, n_class,
                                           use_cuda)
        else:
            self.classify_layer = ClassifyLayer(encoded_input_dim, n_class,
                                                use_cuda)

        self.encode_time = 0
        self.emb_time = 0
        self.classify_time = 0
Example #18
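    # Test setup: reads LC-QuAD train/test data, builds a vocabulary and embeddings, wraps a
    # bidirectional nn.LSTM as the encoder, and assembles an LCQuADMmlSemanticParser on GPU 0.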
    def setUp(self):
        self.sample_only = False
        self.setUpExecutor()
        # self.setupstubexecutor()

        model_params_file_path = self.TEST_DATA_ROOT / "experiment.json"
        self.dataset_sample_file_path = self.TEST_DATA_ROOT / "lcquad.annotated.lisp.v2.deurified.sample.json"
        self.dataset_train_file_path = self.TEST_DATA_ROOT / "lcquad.annotated.lisp.v2.deurified.train.json"
        self.dataset_test_file_path = self.TEST_DATA_ROOT / "lcquad.annotated.lisp.v2.deurified.test.json"
        predicates_file_path = self.TEST_DATA_ROOT / "properties.txt"
        with codecs.open(predicates_file_path) as fp:
            self.predicates = [i.strip() for i in fp]

        dbo_classes = set([
            dbo for dbo in self.predicates if dbo.split("/")[-1][0].isupper()
        ])
        binary_predicates = set(self.predicates) - dbo_classes

        token_indexer = None  #{'tokens': ELMoTokenCharactersIndexer()}

        if self.sample_only:
            sample_reader = LCQuADReader(executor=self.executor,
                                         predicates=binary_predicates,
                                         token_indexers=token_indexer,
                                         ontology_types=dbo_classes)
        else:
            train_reader = LCQuADReader(executor=self.executor,
                                        predicates=binary_predicates,
                                        token_indexers=token_indexer,
                                        ontology_types=dbo_classes)
            test_reader = LCQuADReader(executor=self.executor,
                                       predicates=binary_predicates,
                                       token_indexers=token_indexer,
                                       ontology_types=dbo_classes)

        # sample_reader.cache_data("sample_dataset")
        # train_reader.cache_data("train_dataset")
        # test_reader.cache_data("test_dataset")

        if self.sample_only:
            self.sample_instances = list(
                sample_reader.read(str(self.dataset_sample_file_path)))
        else:
            self.train_instances = list(
                train_reader.read(str(self.dataset_train_file_path)))
            self.test_instances = list(
                test_reader.read(str(self.dataset_test_file_path)))

        if self.sample_only:
            self.vocab = Vocabulary.from_instances(self.sample_instances)
        else:
            self.vocab = Vocabulary.from_instances(self.train_instances +
                                                   self.test_instances)

        #self.vocab = Vocabulary()

        token_embedding = Embedding(
            num_embeddings=self.vocab.get_vocab_size() + 2,
            embedding_dim=256,
            padding_index=0)

        #options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json'
        #weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5'

        # the embedder maps the input tokens to the appropriate embedding matrix
        #elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)
        #word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})

        word_embeddings: TextFieldEmbedder = BasicTextFieldEmbedder(
            {"tokens": token_embedding})

        encoder = PytorchSeq2SeqWrapper(
            nn.LSTM(
                input_size=word_embeddings.get_output_dim(),
                num_layers=1,
                hidden_size=128,
                bidirectional=True,
                # dropout=0.4,
                batch_first=True))

        val_outputs = self.TEST_DATA_ROOT / "val_outputs.json"

        self.val_outputs_fp = codecs.open(val_outputs, 'w')

        # self.set_up_model(model_params_file_path, dataset_sample_file_path)
        self.model = LCQuADMmlSemanticParser(
            vocab=self.vocab,
            sentence_embedder=word_embeddings,
            action_embedding_dim=256,
            encoder=encoder,
            attention=DotProductAttention(),
            decoder_beam_search=BeamSearch(beam_size=1),
            max_decoding_steps=50,
            dropout=0.5,
            val_outputs=self.val_outputs_fp)
        self.model.cuda(0)
Example #19
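# Builds a BaseSeq2Seq model: a configurable encoder (LSTM, BiLSTM, or Transformer),
# optional encoder/decoder-history attention, an RNN decoder cell, and a word projection layer.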
def get_model(vocab, st_ds_conf):
    emb_sz = st_ds_conf['emb_sz']

    source_embedding = allennlp.modules.Embedding(
        num_embeddings=vocab.get_vocab_size('nltokens'), embedding_dim=emb_sz)
    target_embedding = allennlp.modules.Embedding(
        num_embeddings=vocab.get_vocab_size('lftokens'), embedding_dim=emb_sz)

    if st_ds_conf['encoder'] == 'lstm':
        encoder = StackedEncoder(
            [
                PytorchSeq2SeqWrapper(
                    torch.nn.LSTM(emb_sz, emb_sz, batch_first=True))
                for _ in range(st_ds_conf['num_enc_layers'])
            ],
            emb_sz,
            emb_sz,
            input_dropout=st_ds_conf['intermediate_dropout'])
    elif st_ds_conf['encoder'] == 'bilstm':
        encoder = StackedEncoder(
            [
                PytorchSeq2SeqWrapper(
                    torch.nn.LSTM(
                        emb_sz, emb_sz, batch_first=True, bidirectional=True))
                for _ in range(st_ds_conf['num_enc_layers'])
            ],
            emb_sz,
            emb_sz,
            input_dropout=st_ds_conf['intermediate_dropout'])
    elif st_ds_conf['encoder'] == 'transformer':
        encoder = StackedEncoder(
            [
                TransformerEncoder(
                    input_dim=emb_sz,
                    num_layers=st_ds_conf['num_enc_layers'],
                    num_heads=st_ds_conf['num_heads'],
                    feedforward_hidden_dim=emb_sz,
                    feedforward_dropout=st_ds_conf['feedforward_dropout'],
                    residual_dropout=st_ds_conf['residual_dropout'],
                    attention_dropout=st_ds_conf['attention_dropout'],
                ) for _ in range(st_ds_conf['num_enc_layers'])
            ],
            emb_sz,
            emb_sz,
            input_dropout=st_ds_conf['intermediate_dropout'])
    else:
        raise ValueError('unknown encoder type: {}'.format(st_ds_conf['encoder']))

    enc_out_dim = encoder.get_output_dim()
    dec_out_dim = emb_sz

    dec_hist_attn = get_attention(st_ds_conf, st_ds_conf['dec_hist_attn'])
    enc_attn = get_attention(st_ds_conf, st_ds_conf['enc_attn'])
    if st_ds_conf['enc_attn'] == 'dot_product':
        assert enc_out_dim == dec_out_dim, "encoder hidden states must be able to multiply with decoder output"

    def sum_attn_dims(attns, dims):
        return sum(dim for attn, dim in zip(attns, dims) if attn is not None)

    if st_ds_conf['concat_attn_to_dec_input']:
        dec_in_dim = dec_out_dim + sum_attn_dims([enc_attn, dec_hist_attn],
                                                 [enc_out_dim, dec_out_dim])
    else:
        dec_in_dim = dec_out_dim
    rnn_cell = get_rnn_cell(st_ds_conf, dec_in_dim, dec_out_dim)

    if st_ds_conf['concat_attn_to_dec_input']:
        proj_in_dim = dec_out_dim + sum_attn_dims([enc_attn, dec_hist_attn],
                                                  [enc_out_dim, dec_out_dim])
    else:
        proj_in_dim = dec_out_dim

    word_proj = torch.nn.Linear(proj_in_dim, vocab.get_vocab_size('lftokens'))

    model = BaseSeq2Seq(
        vocab=vocab,
        encoder=encoder,
        decoder=rnn_cell,
        word_projection=word_proj,
        source_embedding=source_embedding,
        target_embedding=target_embedding,
        target_namespace='lftokens',
        start_symbol=START_SYMBOL,
        eos_symbol=END_SYMBOL,
        max_decoding_step=st_ds_conf['max_decoding_len'],
        enc_attention=enc_attn,
        dec_hist_attn=dec_hist_attn,
        intermediate_dropout=st_ds_conf['intermediate_dropout'],
        concat_attn_to_dec_input=st_ds_conf['concat_attn_to_dec_input'],
    )
    return model
Example #20
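    # Biaffine dependency parser: configurable input layers and encoders, a stacked
    # bidirectional LSTM context encoder (Dozat, Ma, or standard cells), ELU feed-forward
    # projections for arcs and tags, bilinear arc attention, and a bilinear relation scorer.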
    def __init__(self, n_relations: int, conf: Dict,
                 input_batchers: Dict[str, InputBatch], use_cuda: bool):
        super(BiaffineParser, self).__init__()
        self.n_relations = n_relations
        self.conf = conf
        self.use_cuda = use_cuda
        self.use_mst_decoding_for_validation = conf[
            'use_mst_decoding_for_validation']

        input_layers = {}
        for i, c in enumerate(conf['input']):
            if c['type'] == 'embeddings':
                if 'pretrained' in c:
                    embs = load_embedding_txt(c['pretrained'], c['has_header'])
                    logger.info('loaded {0} embedding entries.'.format(
                        len(embs[0])))
                else:
                    embs = None
                name = c['name']
                mapping = input_batchers[name].mapping
                layer = Embeddings(name,
                                   c['dim'],
                                   mapping,
                                   fix_emb=c['fixed'],
                                   embs=embs,
                                   normalize=c.get('normalize', False))
                logger.info('embedding for field {0} '
                            'created with {1} x {2}.'.format(
                                c['field'], layer.n_V, layer.n_d))
                input_layers[name] = layer

            elif c['type'] == 'cnn_encoder' or c['type'] == 'lstm_encoder':
                name = c['name']
                mapping = input_batchers[name].mapping
                embeddings = Embeddings('{0}_ch_emb'.format(name),
                                        c['dim'],
                                        mapping,
                                        fix_emb=False,
                                        embs=None,
                                        normalize=False)
                logger.info('character embedding for field {0} '
                            'created with {1} x {2}.'.format(
                                c['field'], embeddings.n_V, embeddings.n_d))
                if c['type'] == 'lstm_encoder':
                    layer = LstmTokenEmbedder(name, c['dim'], embeddings,
                                              conf['dropout'], use_cuda)
                elif c['type'] == 'cnn_encoder':
                    layer = ConvTokenEmbedder(name, c['dim'], embeddings,
                                              c['filters'],
                                              c.get('n_highway', 1),
                                              c.get('activation',
                                                    'relu'), use_cuda)
                else:
                    raise ValueError('Unknown type: {}'.format(c['type']))
                input_layers[name] = layer

            elif c['type'] == 'elmo':
                name = c['name']
                layer = ContextualizedWordEmbeddings(name, c['path'], use_cuda)
                input_layers[name] = layer

            else:
                raise ValueError('{} unknown input layer'.format(c['type']))

        self.input_layers = torch.nn.ModuleDict(input_layers)

        input_encoders = []
        input_dim = 0
        for i, c in enumerate(conf['input_encoder']):
            input_info = {
                name: [
                    entry['dim'] for entry in conf['input']
                    if entry['name'] == name
                ][0]
                for name in c['input']
            }

            if c['type'] == 'affine':
                input_encoder = AffineTransformInputEncoder(
                    input_info, c['dim'], use_cuda)
            elif c['type'] == 'sum':
                input_encoder = SummationInputEncoder(input_info, use_cuda)
            elif c['type'] == 'concat':
                input_encoder = ConcatenateInputEncoder(input_info, use_cuda)
            else:
                raise ValueError('{} unknown input encoder'.format(c['type']))

            input_dim += input_encoder.get_output_dim()
            input_encoders.append(input_encoder)

        self.input_encoders = torch.nn.ModuleList(input_encoders)

        c = conf['context_encoder']
        if c['type'] == 'stacked_bidirectional_lstm_dozat':
            self.encoder = PytorchSeq2SeqWrapper(
                InputDropoutedStackedBidirectionalLstm(
                    DozatLstmCell,
                    num_layers=c['num_layers'],
                    input_size=input_dim,
                    hidden_size=c['hidden_dim'],
                    recurrent_dropout_probability=c[
                        'recurrent_dropout_probability'],
                    layer_dropout_probability=c['layer_dropout_probability'],
                    activation=Activation.by_name("leaky_relu")()),
                stateful=False)
        elif c['type'] == 'stacked_bidirectional_lstm_ma':
            self.encoder = PytorchSeq2SeqWrapper(
                InputDropoutedStackedBidirectionalLstm(
                    MaLstmCell,
                    num_layers=c['num_layers'],
                    input_size=input_dim,
                    hidden_size=c['hidden_dim'],
                    recurrent_dropout_probability=c[
                        'recurrent_dropout_probability'],
                    layer_dropout_probability=c['layer_dropout_probability'],
                    activation=Activation.by_name("tanh")()),
                stateful=False)
        elif c['type'] == 'stacked_bidirectional_lstm':
            self.encoder = PytorchSeq2SeqWrapper(StackedBidirectionalLstm(
                num_layers=c['num_layers'],
                input_size=input_dim,
                hidden_size=c['hidden_dim'],
                recurrent_dropout_probability=c[
                    'recurrent_dropout_probability'],
                layer_dropout_probability=c['layer_dropout_probability']),
                                                 stateful=False)
        else:
            self.encoder = DummyContextEncoder()

        encoder_dim = self.encoder.get_output_dim()
        c = conf['biaffine_parser']
        self.arc_representation_dim = arc_representation_dim = c[
            'arc_representation_dim']
        self.tag_representation_dim = tag_representation_dim = c[
            'tag_representation_dim']

        self.head_sentinel_ = torch.nn.Parameter(
            torch.randn([1, 1, encoder_dim]))

        self.head_arc_feedforward = FeedForward(encoder_dim, 1,
                                                arc_representation_dim,
                                                Activation.by_name("elu")())
        self.child_arc_feedforward = FeedForward(encoder_dim, 1,
                                                 arc_representation_dim,
                                                 Activation.by_name("elu")())

        self.head_tag_feedforward = FeedForward(encoder_dim, 1,
                                                tag_representation_dim,
                                                Activation.by_name("elu")())
        self.child_tag_feedforward = FeedForward(encoder_dim, 1,
                                                 tag_representation_dim,
                                                 Activation.by_name("elu")())

        arc_attention_version = c.get('arc_attention_version', 'v1')
        if arc_attention_version == 'v2':
            self.arc_attention = BilinearMatrixAttentionV2(
                arc_representation_dim,
                arc_representation_dim,
                use_input_biases=True)
        else:
            self.arc_attention = BilinearMatrixAttention(
                arc_representation_dim,
                arc_representation_dim,
                use_input_biases=True)

        self.tag_bilinear = BilinearWithBias(tag_representation_dim,
                                             tag_representation_dim,
                                             n_relations)

        self.input_dropout_ = torch.nn.Dropout2d(p=conf['dropout'])
        self.dropout_ = InputVariationalDropout(p=conf['dropout'])

        self.input_encoding_timer = TimeRecoder()
        self.context_encoding_timer = TimeRecoder()
        self.classification_timer = TimeRecoder()