def setUp(self):
    super(TestTokenCharactersEncoder, self).setUp()
    self.vocab = Vocabulary()
    self.vocab.add_token_to_namespace("1", "token_characters")
    self.vocab.add_token_to_namespace("2", "token_characters")
    self.vocab.add_token_to_namespace("3", "token_characters")
    self.vocab.add_token_to_namespace("4", "token_characters")
    params = Params({
        "embedding": {
            "embedding_dim": 2,
            "vocab_namespace": "token_characters"
        },
        "encoder": {
            "type": "cnn",
            "embedding_dim": 2,
            "num_filters": 4,
            "ngram_filter_sizes": [1, 2],
            "output_dim": 3
        }
    })
    self.encoder = TokenCharactersEncoder.from_params(vocab=self.vocab, params=deepcopy(params))
    self.embedding = Embedding.from_params(vocab=self.vocab, params=params["embedding"])
    self.inner_encoder = Seq2VecEncoder.from_params(params["encoder"])
    constant_init = Initializer.from_params(Params({"type": "constant", "val": 1.}))
    initializer = InitializerApplicator([(".*", constant_init)])
    initializer(self.encoder)
    initializer(self.embedding)
    initializer(self.inner_encoder)
    @classmethod
    def from_params(cls, vocab: Vocabulary, params: Params) -> 'SpanConstituencyParser':
        embedder_params = params.pop("text_field_embedder")
        text_field_embedder = TextFieldEmbedder.from_params(vocab, embedder_params)
        span_extractor = SpanExtractor.from_params(params.pop("span_extractor"))
        encoder = Seq2SeqEncoder.from_params(params.pop("encoder"))

        feed_forward_params = params.pop("feedforward", None)
        if feed_forward_params is not None:
            feedforward_layer = FeedForward.from_params(feed_forward_params)
        else:
            feedforward_layer = None
        pos_tag_embedding_params = params.pop("pos_tag_embedding", None)
        if pos_tag_embedding_params is not None:
            pos_tag_embedding = Embedding.from_params(vocab, pos_tag_embedding_params)
        else:
            pos_tag_embedding = None
        initializer = InitializerApplicator.from_params(params.pop('initializer', []))
        regularizer = RegularizerApplicator.from_params(params.pop('regularizer', []))
        evalb_directory_path = params.pop("evalb_directory_path", None)
        params.assert_empty(cls.__name__)

        return cls(vocab=vocab,
                   text_field_embedder=text_field_embedder,
                   span_extractor=span_extractor,
                   encoder=encoder,
                   feedforward_layer=feedforward_layer,
                   pos_tag_embedding=pos_tag_embedding,
                   initializer=initializer,
                   regularizer=regularizer,
                   evalb_directory_path=evalb_directory_path)
    def __init__(self,
                 device,
                 inp_dim,
                 hid_dim,
                 compression,
                 vocab,
                 dropout: float = 0.4,
                 dropout_emb: float = 0.2,
                 pretrain_embedding_file=None):
        super().__init__()
        self.compression = compression
        self.hid_dim = hid_dim
        self.sent_enc = EncSent(device=device,
                                inp_dim=inp_dim,
                                hid_dim=hid_dim,
                                compression=compression)
        token_embedding = Embedding(
            num_embeddings=vocab.get_vocab_size('tokens'),
            embedding_dim=inp_dim)

        if dropout_emb > 0:
            self._lexical_dropout = torch.nn.Dropout(p=dropout_emb)
        else:
            self._lexical_dropout = lambda x: x

        if pretrain_embedding_file is not None:
            logger = logging.getLogger()
            logger.info(
                "Loading word embedding: {}".format(pretrain_embedding_file))
            # Embedding.from_params returns a new Embedding rather than modifying this
            # one in place, so assign the result to actually use the pretrained weights.
            token_embedding = Embedding.from_params(
                vocab=vocab,
                params=Params({
                    "pretrained_file": pretrain_embedding_file,
                    "embedding_dim": inp_dim
                }))
        self._text_field_embedder = BasicTextFieldEmbedder(
            {"tokens": token_embedding})

        self.sent2doc = EncWord2Sent(device=device,
                                     inp_dim=self.sent_enc.get_output_dim(),
                                     hidden_dim=hid_dim,
                                     nenc_lay=2,
                                     dropout=dropout)
Example #4
def __init__(self, vocab: Vocabulary, dense_dim=75, l2=1e-5, l1=1e-7, drop=0.1) -> None:
    super(Text_Embedding, self).__init__()

    self.dense_dim = dense_dim
    self.dropout_p = drop
    self.l1_lambda = l1
    self.l2_lambda = l2
    self.final_l2_norm = True

    self.embed_direction = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                     embedding_dim=self.dense_dim, norm_type=2,
                                     max_norm=self.l2_lambda)
    self.embed_magnitude = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                     embedding_dim=1,
                                     norm_type=1,
                                     max_norm=self.l1_lambda)

    # PyTorch hasn't implemented spatial dropout for 1D
    self.dropout = Dropout(p=self.dropout_p)
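# A plain-PyTorch aside (not part of Text_Embedding above): torch.nn.Embedding's
# max_norm/norm_type arguments renormalize embedding rows whenever they are looked up,
# which is the mechanism the direction/magnitude embeddings above rely on. Sizes below
# are illustrative, and the elementwise product is only an assumed way the two
# embeddings might be combined in the model's forward pass.
import torch

direction = torch.nn.Embedding(100, 75, norm_type=2, max_norm=1.0)   # L2-clipped direction vectors
magnitude = torch.nn.Embedding(100, 1, norm_type=1, max_norm=1.0)    # L1-clipped per-token scale

ids = torch.randint(0, 100, (4, 10))                                 # (batch, tokens)
vectors = direction(ids) * magnitude(ids)                            # (4, 10, 75)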
Example #5
    def __init__(self,
                 vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 context_layer: Seq2SeqEncoder,
                 mention_feedforward: FeedForward,
                 antecedent_feedforward: FeedForward,
                 feature_size: int,
                 max_span_width: int,
                 spans_per_word: float,
                 max_antecedents: int,
                 lexical_dropout: float = 0.2,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super(CoreferenceResolver, self).__init__(vocab, regularizer)

        self._text_field_embedder = text_field_embedder
        self._context_layer = context_layer
        self._mention_feedforward = TimeDistributed(mention_feedforward)
        self._antecedent_feedforward = TimeDistributed(antecedent_feedforward)
        self._mention_scorer = TimeDistributed(
            torch.nn.Linear(mention_feedforward.get_output_dim(), 1))
        self._antecedent_scorer = TimeDistributed(
            torch.nn.Linear(antecedent_feedforward.get_output_dim(), 1))
        self._head_scorer = TimeDistributed(
            torch.nn.Linear(context_layer.get_output_dim(), 1))

        # 10 possible distance buckets.
        self._num_distance_buckets = 10
        self._distance_embedding = Embedding(self._num_distance_buckets,
                                             feature_size)
        self._span_width_embedding = Embedding(max_span_width, feature_size)

        self._max_span_width = max_span_width
        self._spans_per_word = spans_per_word
        self._max_antecedents = max_antecedents

        self._mention_recall = MentionRecall()
        self._conll_coref_scores = ConllCorefScores()
        if lexical_dropout > 0:
            self._lexical_dropout = torch.nn.Dropout(p=lexical_dropout)
        else:
            self._lexical_dropout = lambda x: x
        initializer(self)
def __init__(self, name: str, event2mind: Event2Mind, num_classes: int,
             input_dim: int, output_dim: int) -> None:
    self.embedder = Embedding(num_classes, input_dim)
    event2mind.add_module(f"{name}_embedder", self.embedder)
    self.decoder_cell = GRUCell(input_dim, output_dim)
    event2mind.add_module(f"{name}_decoder_cell", self.decoder_cell)
    self.output_projection_layer = Linear(output_dim, num_classes)
    event2mind.add_module(f"{name}_output_project_layer",
                          self.output_projection_layer)
    self.recall = UnigramRecall()
Example #7
    def __init__(self,
                 vocab: Vocabulary,
                 source_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 max_decoding_steps: int,
                 spans_per_word: float,
                 target_namespace: str = "tokens",
                 target_embedding_dim: int = None,
                 attention_function: SimilarityFunction = None,
                 scheduled_sampling_ratio: float = 0.0,
                 spans_extractor: SpanExtractor = None,
                 spans_scorer_feedforward: FeedForward = None) -> None:
        super(SpanAe, self).__init__(vocab)
        self._source_embedder = source_embedder
        self._encoder = encoder
        self._max_decoding_steps = max_decoding_steps
        self._target_namespace = target_namespace
        self._attention_function = attention_function
        self._scheduled_sampling_ratio = scheduled_sampling_ratio
        # We need the start symbol to provide as the input at the first timestep of decoding, and
        # end symbol as a way to indicate the end of the decoded sequence.
        self._start_index = self.vocab.get_token_index(START_SYMBOL,
                                                       self._target_namespace)
        self._end_index = self.vocab.get_token_index(END_SYMBOL,
                                                     self._target_namespace)
        num_classes = self.vocab.get_vocab_size(self._target_namespace)
        # Decoder output dim needs to be the same as the encoder output dim since we initialize the
        # hidden state of the decoder with that of the final hidden states of the encoder. Also, if
        # we're using attention with ``DotProductSimilarity``, this is needed.
        self._decoder_output_dim = self._encoder.get_output_dim() + 1
        target_embedding_dim = target_embedding_dim or self._source_embedder.get_output_dim()
        self._target_embedder = Embedding(num_classes, target_embedding_dim)
        if self._attention_function:
            self._decoder_attention = Attention(self._attention_function)
            # The output of attention, a weighted average over encoder outputs, will be
            # concatenated to the input vector of the decoder at each time step.
            self._decoder_input_dim = self._encoder.get_output_dim() + target_embedding_dim
        else:
            self._decoder_input_dim = target_embedding_dim
        self._decoder_cell = LSTMCell(self._decoder_input_dim + 1,
                                      self._decoder_output_dim)
        self._output_projection_layer = Linear(self._decoder_output_dim,
                                               num_classes)

        self._span_extractor = spans_extractor

        feedforward_scorer = torch.nn.Sequential(
            TimeDistributed(spans_scorer_feedforward),
            TimeDistributed(
                torch.nn.Linear(spans_scorer_feedforward.get_output_dim(), 1)))
        self._span_pruner = SpanPruner(feedforward_scorer)

        self._spans_per_word = spans_per_word
Example #8
    def __init__(
        self,
        vocab: Vocabulary,
        text_field_embedder: TextFieldEmbedder,
        context_layer: Seq2SeqEncoder,
        mention_feedforward: FeedForward,
        antecedent_feedforward: FeedForward,
        feature_size: int,
        max_span_width: int,
        spans_per_word: float,
        max_antecedents: int,
        lexical_dropout: float = 0.2,
        initializer: InitializerApplicator = InitializerApplicator(),
        regularizer: Optional[RegularizerApplicator] = None,
    ) -> None:
        super().__init__(vocab, regularizer)

        self._text_field_embedder = text_field_embedder
        self._context_layer = context_layer
        self._antecedent_feedforward = TimeDistributed(antecedent_feedforward)
        feedforward_scorer = torch.nn.Sequential(
            TimeDistributed(mention_feedforward),
            TimeDistributed(torch.nn.Linear(mention_feedforward.get_output_dim(), 1)),
        )
        self._mention_pruner = Pruner(feedforward_scorer)
        self._antecedent_scorer = TimeDistributed(
            torch.nn.Linear(antecedent_feedforward.get_output_dim(), 1)
        )

        self._endpoint_span_extractor = EndpointSpanExtractor(
            context_layer.get_output_dim(),
            combination="x,y",
            num_width_embeddings=max_span_width,
            span_width_embedding_dim=feature_size,
            bucket_widths=False,
        )
        self._attentive_span_extractor = SelfAttentiveSpanExtractor(
            input_dim=text_field_embedder.get_output_dim()
        )

        # 10 possible distance buckets.
        self._num_distance_buckets = 10
        self._distance_embedding = Embedding(self._num_distance_buckets, feature_size)

        self._max_span_width = max_span_width
        self._spans_per_word = spans_per_word
        self._max_antecedents = max_antecedents

        self._mention_recall = MentionRecall()
        self._conll_coref_scores = ConllCorefScores()
        if lexical_dropout > 0:
            self._lexical_dropout = torch.nn.Dropout(p=lexical_dropout)
        else:
            self._lexical_dropout = lambda x: x
        initializer(self)
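# A small, framework-free sketch of the lexical-dropout idiom used above (a Dropout
# module when the rate is positive, otherwise an identity function); this only
# illustrates the pattern and is not code from the model.
import torch

def make_lexical_dropout(p: float):
    # Dropout for p > 0, otherwise a no-op so callers can apply it unconditionally.
    return torch.nn.Dropout(p=p) if p > 0 else (lambda x: x)

embedded = torch.randn(2, 5, 16)              # (batch, tokens, embedding_dim)
assert make_lexical_dropout(0.2)(embedded).shape == embedded.shape
assert torch.equal(make_lexical_dropout(0.0)(embedded), embedded)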
Example #9
    def __init__(self, vocab: Vocabulary, cuda_device=-1) -> None:
        super().__init__(vocab)
        self.cuda_device = cuda_device

        token_embedding = Embedding(
            num_embeddings=vocab.get_vocab_size('tokens'),
            embedding_dim=EMBEDDING_SIZE)
        if cuda_device > -1:
            token_embedding = token_embedding.to(cuda_device)
        self.embedder = BasicTextFieldEmbedder({"tokens": token_embedding})

        self.rnn = PytorchSeq2SeqWrapper(
            torch.nn.LSTM(EMBEDDING_SIZE, HIDDEN_SIZE, batch_first=True))

        self.hidden2out = torch.nn.Linear(
            in_features=self.rnn.get_output_dim(),
            out_features=vocab.get_vocab_size('tokens'))
        if cuda_device > -1:
            self.hidden2out = self.hidden2out.to(cuda_device)
            self.rnn = self.rnn.to(cuda_device)
Example #10
    def __init__(
        self,
        vocab: Vocabulary,
        source_embedder: TextFieldEmbedder,
        encoder: Seq2VecEncoder,
        context_encoder: Seq2VecEncoder,
        max_decoding_steps: int = 32,
        attention: Attention = None,
        target_namespace: str = "tokens",
        scheduled_sampling_ratio: float = 0.0,
    ) -> None:
        super().__init__(vocab)
        self._target_namespace = target_namespace
        self._scheduled_sampling_ratio = scheduled_sampling_ratio  # Maybe we can try
        self._start_index = self.vocab.get_token_index(START_SYMBOL,
                                                       self._target_namespace)
        self._end_index = self.vocab.get_token_index(END_SYMBOL,
                                                     self._target_namespace)
        self.pad_index = self.vocab.get_token_index(self.vocab._padding_token,
                                                    self._target_namespace)
        # self.outfeature = 600
        self._max_decoding_steps = max_decoding_steps
        self.kd_metric = KD_Metric()
        self.bleu_aver = NLTK_BLEU(ngram_weights=(0.25, 0.25, 0.25, 0.25))
        self.bleu1 = NLTK_BLEU(ngram_weights=(1, 0, 0, 0))
        self.bleu2 = NLTK_BLEU(ngram_weights=(0, 1, 0, 0))
        self.bleu4 = NLTK_BLEU(ngram_weights=(0, 0, 0, 1))
        self.dink1 = Distinct1()
        self.dink2 = Distinct2()
        self.topic_acc = Average()
        # anything about module
        self._source_embedder = source_embedder
        num_classes = self.vocab.get_vocab_size(self._target_namespace)
        target_embedding_dim = source_embedder.get_output_dim()
        self._target_embedder = Embedding(num_classes, target_embedding_dim)

        self._encoder = encoder
        self.context_encoder = context_encoder
        self._encoder_output_dim = self._encoder.get_output_dim()  # 512; maybe just replace the first two with outfeature
        self._decoder_output_dim = self._encoder_output_dim
        self._decoder_input_dim = target_embedding_dim + self._decoder_output_dim
        self._attention = None
        # if attention:
        #     self._attention = attention
        #     self._decoder_input_dim = self._decoder_output_dim + target_embedding_dim

        # Try fusing that embedding in here?
        self._decoder_cell = LSTMCell(self._decoder_input_dim,
                                      self._decoder_output_dim)

        self._output_projection_layer = Linear(self._encoder_output_dim,
                                               num_classes)
        self.clac_num = 0
Example #11
def create_model(vocab):
    # prepare model
    EMBEDDING_DIM = 100
    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    HIDDEN_DIM = 100
    lstm = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))
    model = LstmTagger(word_embeddings, lstm, vocab)
    return model
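# Hedged, plain-PyTorch sketch of what create_model() wires together: an embedding
# lookup feeding a batch-first LSTM over token ids. All sizes are illustrative and
# LstmTagger itself is defined elsewhere.
import torch

EMB_DIM, HID_DIM, VOCAB_SIZE = 100, 100, 1000
embedding = torch.nn.Embedding(VOCAB_SIZE, EMB_DIM)
lstm = torch.nn.LSTM(EMB_DIM, HID_DIM, batch_first=True)

token_ids = torch.randint(0, VOCAB_SIZE, (4, 12))     # (batch, sequence_length)
encoded, _ = lstm(embedding(token_ids))               # (4, 12, HID_DIM)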
Example #12
def main():
    reader = Seq2SeqDatasetReader(
        source_tokenizer=WordTokenizer(),
        target_tokenizer=CharacterTokenizer(),
        source_token_indexers={'tokens': SingleIdTokenIndexer()},
        target_token_indexers={'tokens': SingleIdTokenIndexer(namespace='target_tokens')})
    train_dataset = reader.read('data/mt/tatoeba.eng_cmn.train.tsv')
    validation_dataset = reader.read('data/mt/tatoeba.eng_cmn.dev.tsv')

    vocab = Vocabulary.from_instances(train_dataset + validation_dataset,
                                      min_count={'tokens': 3, 'target_tokens': 3})

    en_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                             embedding_dim=EN_EMBEDDING_DIM)
    # encoder = PytorchSeq2SeqWrapper(
    #     torch.nn.LSTM(EN_EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))
    encoder = StackedSelfAttentionEncoder(input_dim=EN_EMBEDDING_DIM, hidden_dim=HIDDEN_DIM, projection_dim=128, feedforward_hidden_dim=128, num_layers=1, num_attention_heads=8)

    source_embedder = BasicTextFieldEmbedder({"tokens": en_embedding})

    # attention = LinearAttention(HIDDEN_DIM, HIDDEN_DIM, activation=Activation.by_name('tanh')())
    # attention = BilinearAttention(HIDDEN_DIM, HIDDEN_DIM)
    attention = DotProductAttention()

    max_decoding_steps = 20   # TODO: make this variable
    model = SimpleSeq2Seq(vocab, source_embedder, encoder, max_decoding_steps,
                          target_embedding_dim=ZH_EMBEDDING_DIM,
                          target_namespace='target_tokens',
                          attention=attention,
                          beam_size=8,
                          use_bleu=True)
    optimizer = optim.Adam(model.parameters())
    iterator = BucketIterator(batch_size=32, sorting_keys=[("source_tokens", "num_tokens")])

    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      num_epochs=1,
                      cuda_device=CUDA_DEVICE)

    for i in range(50):
        print('Epoch: {}'.format(i))
        trainer.train()

        predictor = SimpleSeq2SeqPredictor(model, reader)

        for instance in itertools.islice(validation_dataset, 10):
            print('SOURCE:', instance.fields['source_tokens'].tokens)
            print('GOLD:', instance.fields['target_tokens'].tokens)
            print('PRED:', predictor.predict_instance(instance)['predicted_tokens'])
Example #13
def glove_embeddings(vocab: Vocabulary, file_path: Path, dimension: int,
                     training: bool = True, namespace: str = 'tokens'
                     ) -> BasicTextFieldEmbedder:
    "Pre-trained embeddings using GloVe"
    token_embedding = Embedding.from_params(vocab, Params({
        "embedding_dim": dimension,
        "vocab_namespace": 'tokens',
        "pretrained_file": str(file_path),
        "trainable": training,
    }))
    word_embeddings = BasicTextFieldEmbedder({namespace: token_embedding})
    return word_embeddings
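# A hedged usage sketch for glove_embeddings() above. It writes a tiny fake GloVe-style
# file so the call is self-contained; in real use file_path would point at an actual
# GloVe download, and the vocabulary would come from your dataset.
import tempfile
from pathlib import Path
from allennlp.data import Vocabulary

vocab = Vocabulary()
vocab.add_token_to_namespace("movie", "tokens")

fake_glove = Path(tempfile.mkdtemp()) / "mini.glove.txt"
fake_glove.write_text("movie " + " ".join(["0.1"] * 50) + "\n")

embedder = glove_embeddings(vocab, fake_glove, dimension=50, training=False)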
Example #14
    def __init__(self,
                 vocab: Vocabulary,
                 source_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 max_decoding_steps: int,
                 target_namespace: str = "tokens",
                 attention_function: SimilarityFunction = None,
                 scheduled_sampling_ratio: float = 0.0,
                 label_smoothing: float = None,
                 target_embedding_dim: int = None,
                 target_tokens_embedder: TokenEmbedder = None) -> None:
        super(PretrSeq2Seq, self).__init__(vocab)
        self._label_smoothing = label_smoothing
        self._source_embedder = source_embedder
        self._encoder = encoder
        self._max_decoding_steps = max_decoding_steps
        self._target_namespace = target_namespace
        self._attention_function = attention_function
        self._scheduled_sampling_ratio = scheduled_sampling_ratio
        # We need the start symbol to provide as the input at the first timestep of decoding, and
        # end symbol as a way to indicate the end of the decoded sequence.
        self._start_index = self.vocab.get_token_index(START_SYMBOL,
                                                       self._target_namespace)
        self._end_index = self.vocab.get_token_index(END_SYMBOL,
                                                     self._target_namespace)
        num_classes = self.vocab.get_vocab_size(self._target_namespace)
        # Decoder output dim needs to be the same as the encoder output dim since we initialize the
        # hidden state of the decoder with that of the final hidden states of the encoder. Also, if
        # we're using attention with ``DotProductSimilarity``, this is needed.
        self._decoder_output_dim = self._encoder.get_output_dim()

        target_embedding_dim = target_embedding_dim or self._source_embedder.get_output_dim()
        self._target_embedder = Embedding(num_classes, target_embedding_dim)

        # PRETRAINED PART
        if target_tokens_embedder:
            target_embedding_dim = target_tokens_embedder.get_output_dim()
            self._target_embedder = target_tokens_embedder

        if self._attention_function:
            self._decoder_attention = LegacyAttention(self._attention_function)
            # The output of attention, a weighted average over encoder outputs, will be
            # concatenated to the input vector of the decoder at each time step.
            self._decoder_input_dim = self._encoder.get_output_dim() + target_embedding_dim
        else:
            self._decoder_input_dim = target_embedding_dim
        # TODO (pradeep): Do not hardcode decoder cell type.
        self._decoder_cell = LSTMCell(self._decoder_input_dim,
                                      self._decoder_output_dim)
        self._output_projection_layer = Linear(self._decoder_output_dim,
                                               num_classes)
    def __init__(self,
                 vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 binary_feature_dim: int,
                 embedding_dropout: float = 0.0,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None,
                 label_smoothing: float = None,
                 label_namespace: str = "labels",
                 ignore_span_metric: bool = False,
                 label_encoding: Optional[str] = 'BIO',
                 include_start_end_transitions: bool = True,
                 constrain_crf_decoding: bool = True) -> None:
        super(OieLabelerCRF, self).__init__(vocab, regularizer)

        self.text_field_embedder = text_field_embedder
        self.num_classes = self.vocab.get_vocab_size("labels")

        # For the span based evaluation, we don't want to consider labels
        # for verb, because the verb index is provided to the model.
        self.span_metric = SpanBasedF1Measure(vocab,
                                              tag_namespace="labels",
                                              ignore_classes=["V"])
        self.label_namespace = label_namespace
        self.encoder = encoder
        # There are exactly 2 binary features for the verb predicate embedding.
        self.binary_feature_embedding = Embedding(2, binary_feature_dim)
        self.tag_projection_layer = TimeDistributed(
            Linear(self.encoder.get_output_dim(), self.num_classes))
        self.embedding_dropout = Dropout(p=embedding_dropout)
        self._label_smoothing = label_smoothing
        self.ignore_span_metric = ignore_span_metric
        self.include_start_end_transitions = include_start_end_transitions
        if constrain_crf_decoding is None:
            constrain_crf_decoding = label_encoding is not None
        if constrain_crf_decoding:
            labels = self.vocab.get_index_to_token_vocabulary(label_namespace)
            print(labels)
            constraints = allowed_transitions(label_encoding, labels)
        else:
            constraints = None
        self.crf = ConditionalRandomField(
            self.num_classes,
            constraints,
            include_start_end_transitions=include_start_end_transitions)

        check_dimensions_match(
            text_field_embedder.get_output_dim() + binary_feature_dim,
            encoder.get_input_dim(),
            "text embedding dim + verb indicator embedding dim",
            "encoder input dim")
        initializer(self)
    def __init__(
        self,
        vocab: Vocabulary,
        source_embedder: TextFieldEmbedder,
        encoder: Seq2SeqEncoder,
        attention: Attention,
        beam_size: int,
        max_decoding_steps: int,
        dropout: float = 0.0,
        target_embedding_dim: int = 30,
        copy_token: str = "@COPY@",
        source_namespace: str = "source_tokens",
        target_namespace: str = "target_tokens",
        language_id_namespace: str = "language_labels",
        tensor_based_metric: Metric = None,
        token_based_metric: Metric = None,
        initializer: InitializerApplicator = InitializerApplicator()
    ) -> None:
        if source_namespace == target_namespace:
            target_embedding_dim = source_embedder._token_embedders[
                source_namespace].get_output_dim()

        super().__init__(vocab,
                         source_embedder,
                         encoder,
                         attention,
                         beam_size,
                         max_decoding_steps,
                         target_embedding_dim=target_embedding_dim,
                         copy_token=copy_token,
                         source_namespace=source_namespace,
                         target_namespace=target_namespace,
                         tensor_based_metric=tensor_based_metric,
                         token_based_metric=token_based_metric)
        self._language_id_namespace = language_id_namespace

        self.lang_vocab_size = self.vocab.get_vocab_size(
            self._language_id_namespace)
        self._lang_embedder = Embedding(self.lang_vocab_size,
                                        self.decoder_output_dim)

        self._inp_dropout = Dropout(p=dropout)

        if source_namespace == target_namespace:
            # replace independent target embeddings by source embeddings
            self._target_embedder = self._source_embedder._token_embedders[
                source_namespace]

        # self._bt_loss = Average()
        # self._lm_loss = Average()

        initializer(self)
    def __init__(self,
                 vocab: Vocabulary,
                 attention: CaptioningAttention,
                 embedding_dim: int = 256,
                 decoder_dim: int = 256):
        super(MultiscaleDecoder, self).__init__(vocab=vocab)

        self._vocab_size = self.vocab.get_vocab_size()
        self._embedding_dim = embedding_dim
        self._decoder_dim = decoder_dim

        self._embedding = Embedding(self._vocab_size, self._embedding_dim)
        self._dropout = nn.Dropout(0.1)
        # Output size of state cell must be decoder dim since state is transformed by the state cell
        self._state_cell = nn.GRUCell(self._embedding.get_output_dim(),
                                      self._decoder_dim)

        self._attention = attention
        self._decoder_cell = nn.GRUCell(self._attention.get_output_dim(),
                                        self._decoder_dim)

        self._linear = nn.Linear(self._decoder_dim, self._vocab_size)
Example #18
    def _find_model_function(self):
        embedding_dim = self.configuration['embed_size']
        embedding_matrix_filepath = self.base_data_dir + 'embedding_matrix'
        if os.path.exists(embedding_matrix_filepath):
            embedding_matrix = super()._load_object(embedding_matrix_filepath)
        else:
            embedding_filepath = self.configuration['embedding_filepath']
            embedding_matrix = embedding._read_embeddings_from_text_file(embedding_filepath, embedding_dim,
                                                                         self.vocab, namespace='tokens')
            super()._save_object(embedding_matrix_filepath, embedding_matrix)
        token_embedding = Embedding(num_embeddings=self.vocab.get_vocab_size(namespace='tokens'),
                                    embedding_dim=embedding_dim, padding_index=0, vocab_namespace='tokens',
                                    trainable=False, weight=embedding_matrix)
        # the embedder maps the input tokens to the appropriate embedding matrix
        word_embedder: TextFieldEmbedder = BasicTextFieldEmbedder({"tokens": token_embedding})

        aspect_embedding_matrix = None
        if self._init_aspect_embeddings_from_word_embeddings():
            embedding_filepath = self.configuration['embedding_filepath']
            aspect_embedding_matrix = embedding._read_embeddings_from_text_file(embedding_filepath, embedding_dim,
                                                                                self.vocab, namespace='aspect')
        aspect_embedding = Embedding(num_embeddings=self.vocab.get_vocab_size(namespace='aspect'),
                                     embedding_dim=self._get_aspect_embeddings_dim(), padding_index=0,
                                     trainable=True, weight=aspect_embedding_matrix)
        aspect_embedder: TextFieldEmbedder = BasicTextFieldEmbedder({"aspect": aspect_embedding},
                                                                    # we'll be ignoring masks so we'll need to set this to True
                                                                    allow_unmatched_keys=True)
        model_function = self._find_model_function_pure()
        model = model_function(
            word_embedder,
            aspect_embedder,
            self.distinct_categories,
            self.distinct_polarities,
            self.vocab,
            self.configuration
        )
        self._print_args(model)
        model = model.to(self.configuration['device'])
        return model
Example #19
    def _construct_embedding_matrix(self) -> Embedding:
        """
        For HotFlip, we need a word embedding matrix to search over. The below is necessary for
        models such as ELMo, character-level models, or for models that use a projection layer
        after their word embeddings.

        We run all of the tokens from the vocabulary through the TextFieldEmbedder, and save the
        final output embedding. We then group all of those output embeddings into an "embedding
        matrix".
        """
        # Gets all tokens in the vocab and their corresponding IDs
        all_tokens = self.vocab._token_to_index[self.namespace]
        all_indices = list(self.vocab._index_to_token[self.namespace].keys())
        all_inputs = {"tokens": torch.LongTensor(all_indices).unsqueeze(0)}

        # A bit of a hack; this will only work with some dataset readers, but it'll do for now.
        indexers = self.predictor._dataset_reader._token_indexers  # type: ignore
        for token_indexer in indexers.values():
            # handle when a model uses character-level inputs, e.g., a CharCNN
            if isinstance(token_indexer, TokenCharactersIndexer):
                tokens = [Token(x) for x in all_tokens]
                max_token_length = max(len(x) for x in all_tokens)
                indexed_tokens = token_indexer.tokens_to_indices(
                    tokens, self.vocab, "token_characters")
                padded_tokens = token_indexer.as_padded_tensor(
                    indexed_tokens, {"token_characters": len(tokens)},
                    {"num_token_characters": max_token_length})
                all_inputs['token_characters'] = torch.LongTensor(
                    padded_tokens['token_characters']).unsqueeze(0)
            # for ELMo models
            if isinstance(token_indexer, ELMoTokenCharactersIndexer):
                elmo_tokens = []
                for token in all_tokens:
                    elmo_indexed_token = token_indexer.tokens_to_indices(
                        [Token(text=token)], self.vocab,
                        "sentence")["sentence"]
                    elmo_tokens.append(elmo_indexed_token[0])
                all_inputs["elmo"] = torch.LongTensor(elmo_tokens).unsqueeze(0)

        embedding_layer = util.find_embedding_layer(self.predictor._model)
        if isinstance(embedding_layer, torch.nn.modules.sparse.Embedding):
            embedding_matrix = embedding_layer.weight
        else:
            # pass all tokens through the fake matrix and create an embedding out of it.
            embedding_matrix = embedding_layer(all_inputs).squeeze()

        return Embedding(num_embeddings=self.vocab.get_vocab_size(self.namespace),
                         embedding_dim=embedding_matrix.shape[1],
                         weight=embedding_matrix,
                         trainable=False)
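# Sketch of the final step above in plain PyTorch: wrapping a precomputed weight matrix
# in a frozen embedding layer. The dimensions are illustrative.
import torch

weights = torch.randn(1000, 300)                                   # (vocab_size, embedding_dim)
frozen = torch.nn.Embedding.from_pretrained(weights, freeze=True)
assert torch.equal(frozen(torch.tensor([3])), weights[3].unsqueeze(0))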
def main():
    reader = StanfordSentimentTreeBankDatasetReader()

    train_dataset = reader.read(
        'data/stanfordSentimentTreebank/trees/train.txt')
    dev_dataset = reader.read('data/stanfordSentimentTreebank/trees/dev.txt')

    # You can optionally specify the minimum count of tokens/labels.
    # `min_count={'tokens':3}` here means that any tokens that appear less than three times
    # will be ignored and not included in the vocabulary.
    vocab = Vocabulary.from_instances(train_dataset + dev_dataset,
                                      min_count={'tokens': 3})

    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)

    # BasicTextFieldEmbedder takes a dict - we need an embedding just for tokens,
    # not for labels, which are used as-is as the "answer" of the sentence classification
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

    # Seq2VecEncoder is a neural network abstraction that takes a sequence of something
    # (usually a sequence of embedded word vectors), processes it, and returns a single
    # vector. Oftentimes this is an RNN-based architecture (e.g., LSTM or GRU), but
    # AllenNLP also supports CNNs and other simple architectures (for example,
    # just averaging over the input vectors).
    encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

    model = LstmClassifier(word_embeddings, encoder, vocab)
    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

    iterator = BucketIterator(batch_size=32,
                              sorting_keys=[("tokens", "num_tokens")])

    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      patience=10,
                      num_epochs=20)

    trainer.train()

    predictor = SentenceClassifierPredictor(model, dataset_reader=reader)
    logits = predictor.predict('This is the best movie ever!')['logits']
    label_id = np.argmax(logits)

    print(model.vocab.get_token_from_index(label_id, 'labels'))
Example #21
    def get_embedder(self, vocab, Word_embedding_dim, char_embeddedng_dim,
                     CNN_num_filters, CNN_encoder_dim):
        # The word embedding will transform every word into a "Word_embedding_dim" real-valued vector
        # Having a tensor (batch_size, max_sentence_length, Word_embedding_dim)

        indexers_dict = dict()
        if (Word_embedding_dim > 0):
            word_embedding = Embedding(
                num_embeddings=vocab.get_vocab_size("token_ids"),
                embedding_dim=Word_embedding_dim)

            word_embedding = word_embedding.to(device=self.cf_a.device,
                                               dtype=self.cf_a.dtype)
            indexers_dict["tokens"] = word_embedding
        if (CNN_encoder_dim > 0):
            # The char embedding will transform every character into a "char_embeddedng_dim" real-valued vector
            # Having a tensor (batch_size, max_sentence_length, max_word_length, char_embeddedng_dim)
            char_embedding = Embedding(
                num_embeddings=vocab.get_vocab_size("token_chars"),
                embedding_dim=char_embeddedng_dim)
            # The encoder will apply the CNN over the max_word_length dimension
            # Having a tensor (batch_size, max_sentence_length, num_filters * ngram_filter_sizes)
            character_cnn = CnnEncoder(ngram_filter_sizes=(1, 1),
                                       embedding_dim=char_embeddedng_dim,
                                       num_filters=CNN_num_filters,
                                       output_dim=CNN_encoder_dim)

            # We combine the char embedding and the CNN encoder
            token_character_encoder = TokenCharactersEncoder(
                embedding=char_embedding, encoder=character_cnn)

            token_character_encoder = token_character_encoder.to(
                device=self.cf_a.device, dtype=self.cf_a.dtype)
            indexers_dict["chars"] = token_character_encoder
        # Now we finally create the full text field embedder, indicating which token ids it embeds
        text_field_embedder = BasicTextFieldEmbedder(indexers_dict)

        return text_field_embedder
Example #22
def chive_emb_returner(vocab: Vocabulary) -> BasicTextFieldEmbedder:
    # embed_matrix = _read_embeddings_from_text_file(
    #     file_uri="./resources/chive-1.1-mc30.txt",
    #     embedding_dim=300,
    #     vocab=vocab
    # )

    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=300,
                                pretrained_file=str(CACHE_ROOT) +
                                "/resources/chive-1.1-mc30.txt",
                                vocab=vocab)

    return BasicTextFieldEmbedder({'tokens': token_embedding})
Example #23
    def __init__(self,
                 vocab: Vocabulary,
                 input_size: int,
                 hidden_size: int,
                 loss_ratio: float = 1.0,
                 recurrency: nn.LSTM = None,
                 num_layers: int = None,
                 remove_sos: bool = True,
                 remove_eos: bool = False,
                 target_embedder: Embedding = None,
                 target_embedding_dim: int = None,
                 target_namespace: str = "tokens",
                 slow_decode: bool = False,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super(RNNTLayer, self).__init__(vocab, regularizer)
        import warprnnt_pytorch
        self.loss_ratio = loss_ratio
        self._remove_sos = remove_sos
        self._remove_eos = remove_eos
        self._slow_decode = slow_decode
        self._target_namespace = target_namespace
        self._num_classes = self.vocab.get_vocab_size(target_namespace)
        self._pad_index = self.vocab.get_token_index(DEFAULT_PADDING_TOKEN,
                                                     self._target_namespace)
        self._start_index = self.vocab.get_token_index(START_SYMBOL,
                                                       self._target_namespace)
        self._end_index = self.vocab.get_token_index(END_SYMBOL,
                                                     self._target_namespace)

        self._loss = warprnnt_pytorch.RNNTLoss(blank=self._pad_index,
                                               reduction='mean')
        self._recurrency = recurrency or \
            nn.LSTM(input_size=target_embedding_dim,
                    hidden_size=hidden_size,
                    num_layers=num_layers,
                    batch_first=True)

        self._target_embedder = target_embedder or Embedding(
            self._num_classes, target_embedding_dim)
        self.w_enc = nn.Linear(input_size, hidden_size, bias=True)
        self.w_dec = nn.Linear(input_size, hidden_size, bias=False)
        self._proj = nn.Linear(hidden_size, self._num_classes)

        exclude_indices = {self._pad_index, self._end_index, self._start_index}
        self._wer: Metric = WER(exclude_indices=exclude_indices)
        self._bleu: Metric = BLEU(exclude_indices=exclude_indices)
        self._dal = Average()

        initializer(self)
    def __init__(self,
                 vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 span_extractor: SpanExtractor,
                 encoder: Seq2SeqEncoder,
                 feedforward: FeedForward = None,
                 pos_tag_embedding: Embedding = None,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None,
                 evalb_directory_path: str = DEFAULT_EVALB_DIR) -> None:
        super().__init__(vocab, regularizer)

        self.text_field_embedder = text_field_embedder
        self.span_extractor = span_extractor
        self.num_classes = self.vocab.get_vocab_size("labels")
        self.encoder = encoder
        self.feedforward_layer = TimeDistributed(feedforward) if feedforward else None
        self.pos_tag_embedding = pos_tag_embedding or None
        if feedforward is not None:
            output_dim = feedforward.get_output_dim()
        else:
            output_dim = span_extractor.get_output_dim()

        self.tag_projection_layer = TimeDistributed(
            Linear(output_dim, self.num_classes))

        representation_dim = text_field_embedder.get_output_dim()
        if pos_tag_embedding is not None:
            representation_dim += pos_tag_embedding.get_output_dim()
        check_dimensions_match(
            representation_dim, encoder.get_input_dim(),
            "representation dim (tokens + optional POS tags)",
            "encoder input dim")
        check_dimensions_match(encoder.get_output_dim(),
                               span_extractor.get_input_dim(),
                               "encoder input dim", "span extractor input dim")
        if feedforward is not None:
            check_dimensions_match(span_extractor.get_output_dim(),
                                   feedforward.get_input_dim(),
                                   "span extractor output dim",
                                   "feedforward input dim")

        self.tag_accuracy = CategoricalAccuracy()

        if evalb_directory_path is not None:
            self._evalb_score = EvalbBracketingScorer(evalb_directory_path)
        else:
            self._evalb_score = None
        initializer(self)
class ImageCaptioningDecoder(CaptioningDecoder):
    def __init__(self,
                 vocab: Vocabulary,
                 attention: CaptioningAttention,
                 embedding_dim: int = 256,
                 decoder_dim: int = 256):
        super(ImageCaptioningDecoder, self).__init__(vocab=vocab)

        self._vocab_size = self.vocab.get_vocab_size()
        self._embedding_dim = embedding_dim
        self._decoder_dim = decoder_dim

        self._embedding = Embedding(self._vocab_size, self._embedding_dim)
        self._attention = attention
        self._decoder_cell = nn.LSTMCell(
            self._embedding.get_output_dim() +
            self._attention.get_output_dim(), self._decoder_dim)
        self._linear = nn.Linear(self._decoder_dim, self._vocab_size)

    @overrides
    def forward(
        self, x: torch.Tensor, h: torch.Tensor, c: torch.Tensor,
        predicted_indices: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        # Shape: (batch_size, embedding_dim)
        embedding = self._embedding(predicted_indices).float().view(
            -1, self._embedding_dim)

        # Shape: (batch_size, encoder_dim) (batch_size, h * w, 1)
        attention, attention_weights = self._attention(x, h)

        ## Change to not use teacher forcing all the time
        # Shape: (batch_size, decoder_dim) (batch_size, decoder_dim)
        h, c = self._decoder_cell(torch.cat([attention, embedding], dim=1),
                                  (h, c))

        # Get output predictions (one per character in vocab)
        # Shape: (batch_size, vocab_size)
        preds = self._linear(h)

        return h, c, preds, attention_weights

    @overrides
    def get_output_dim(self) -> int:
        return self._vocab_size

    @overrides
    def get_input_dim(self) -> int:
        return self._decoder_dim
Example #26
    def __init__(self,
                 encoder_output_dim: int,
                 action_embedding_dim: int,
                 attention_function: SimilarityFunction,
                 num_start_types: int,
                 num_entity_types: int,
                 mixture_feedforward: FeedForward = None,
                 dropout: float = 0.0) -> None:
        super(WikiTablesDecoderStep, self).__init__()
        self._mixture_feedforward = mixture_feedforward
        self._entity_type_embedding = Embedding(num_entity_types,
                                                action_embedding_dim)
        self._input_attention = Attention(attention_function)

        self._num_start_types = num_start_types
        self._start_type_predictor = Linear(encoder_output_dim,
                                            num_start_types)

        # Decoder output dim needs to be the same as the encoder output dim since we initialize the
        # hidden state of the decoder with the final hidden state of the encoder.
        output_dim = encoder_output_dim
        input_dim = output_dim
        # Our decoder input will be the concatenation of the decoder hidden state and the previous
        # action embedding, and we'll project that down to the decoder's `input_dim`, which we
        # arbitrarily set to be the same as `output_dim`.
        self._input_projection_layer = Linear(
            output_dim + action_embedding_dim, input_dim)
        # Before making a prediction, we'll compute an attention over the input given our updated
        # hidden state.  Then we concatenate that with the decoder state and project to
        # `action_embedding_dim` to make a prediction.
        self._output_projection_layer = Linear(output_dim + encoder_output_dim,
                                               action_embedding_dim)

        # TODO(pradeep): Do not hardcode decoder cell type.
        self._decoder_cell = LSTMCell(input_dim, output_dim)

        if mixture_feedforward is not None:
            check_dimensions_match(output_dim,
                                   mixture_feedforward.get_input_dim(),
                                   "hidden state embedding dim",
                                   "mixture feedforward input dim")
            check_dimensions_match(mixture_feedforward.get_output_dim(), 1,
                                   "mixture feedforward output dim",
                                   "dimension for scalar value")

        if dropout > 0:
            self._dropout = torch.nn.Dropout(p=dropout)
        else:
            self._dropout = lambda x: x
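# Plain-PyTorch sketch of the decoder-input construction above: concatenate the previous
# hidden state with the previous action embedding and project back down to the decoder's
# input_dim. Dimensions are illustrative.
import torch

output_dim, action_embedding_dim = 64, 32
hidden_state = torch.randn(5, output_dim)                      # (group_size, output_dim)
prev_action = torch.randn(5, action_embedding_dim)
input_projection = torch.nn.Linear(output_dim + action_embedding_dim, output_dim)
decoder_input = input_projection(torch.cat([hidden_state, prev_action], dim=-1))   # (5, 64)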
    def __init__(self,
                 vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 span_extractor: SpanExtractor,
                 encoder: Seq2SeqEncoder,
                 feedforward_layer: FeedForward = None,
                 pos_tag_embedding: Embedding = None,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None,
                 evalb_directory_path: str = None) -> None:
        super(SpanConstituencyParser, self).__init__(vocab, regularizer)

        self.text_field_embedder = text_field_embedder
        self.span_extractor = span_extractor
        self.num_classes = self.vocab.get_vocab_size("labels")
        self.encoder = encoder
        self.feedforward_layer = TimeDistributed(feedforward_layer) if feedforward_layer else None
        self.pos_tag_embedding = pos_tag_embedding or None
        if feedforward_layer is not None:
            output_dim = feedforward_layer.get_output_dim()
        else:
            output_dim = span_extractor.get_output_dim()

        self.tag_projection_layer = TimeDistributed(Linear(output_dim, self.num_classes))

        representation_dim = text_field_embedder.get_output_dim()
        if pos_tag_embedding is not None:
            representation_dim += pos_tag_embedding.get_output_dim()
        check_dimensions_match(representation_dim,
                               encoder.get_input_dim(),
                               "representation dim (tokens + optional POS tags)",
                               "encoder input dim")
        check_dimensions_match(encoder.get_output_dim(),
                               span_extractor.get_input_dim(),
                               "encoder input dim",
                               "span extractor input dim")
        if feedforward_layer is not None:
            check_dimensions_match(span_extractor.get_output_dim(),
                                   feedforward_layer.get_input_dim(),
                                   "span extractor output dim",
                                   "feedforward input dim")

        self.tag_accuracy = CategoricalAccuracy()

        if evalb_directory_path is not None:
            self._evalb_score = EvalbBracketingScorer(evalb_directory_path)
        else:
            self._evalb_score = None
        initializer(self)
Example #28
            def __init__(self, vocab, embedding_dim, hidden_size,
                         intermediate_size):
                super().__init__()
                self.embeddings = Embedding(
                    pretrained_file=embedding_file,
                    embedding_dim=embedding_dim,
                    projection_dim=hidden_size,
                    vocab=vocab,
                )

                self.transformer = TransformerStack(
                    num_hidden_layers=4,
                    hidden_size=hidden_size,
                    intermediate_size=intermediate_size,
                )
def build_simple_lstm_model(vocab: Vocabulary,
                            emb_size: int = 256,
                            hidden_size: int = 256,
                            num_layers: int = 2,
                            bidirectional: bool = True) -> Model:
    print("Building the model")
    vocab_size = vocab.get_vocab_size("tokens")
    embedder = BasicTextFieldEmbedder(
        {"bert_tokens": Embedding(embedding_dim=emb_size, num_embeddings=vocab_size)}
    )
    encoder = LstmSeq2VecEncoder(
        input_size=emb_size, hidden_size=hidden_size, 
        num_layers=num_layers, bidirectional=bidirectional
    )
    return SimpleClassifier(vocab, embedder, encoder)
def build_simple_cnn_model(vocab: Vocabulary,
                           emb_size: int = 256,
                           output_dim: int = 256,
                           num_filters: int = 16,
                           ngram_filter_sizes: Tuple[int, ...] = (2, 3, 4, 5, 6)) -> Model:
    print("Building the model")
    vocab_size = vocab.get_vocab_size("tokens")
    embedder = BasicTextFieldEmbedder(
        {"bert_tokens": Embedding(embedding_dim=emb_size, num_embeddings=vocab_size)}
    )
    encoder = CnnEncoder(
        embedding_dim=emb_size, ngram_filter_sizes=ngram_filter_sizes, output_dim=output_dim, 
        num_filters=num_filters,
    )
    return SimpleClassifier(vocab, embedder, encoder)
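# A rough, plain-PyTorch sketch of the CNN-over-embeddings idea behind CnnEncoder
# (convolutions of several n-gram widths followed by max-pooling over time), shown only
# to clarify the shapes; it is not the AllenNLP implementation and omits the final
# projection to output_dim.
import torch

emb_size, num_filters = 256, 16
tokens = torch.randn(8, 20, emb_size)            # (batch, tokens, embedding_dim)
inputs = tokens.transpose(1, 2)                  # Conv1d expects (batch, channels, length)

pooled = []
for width in (2, 3, 4, 5, 6):                    # one convolution per n-gram width
    conv = torch.nn.Conv1d(emb_size, num_filters, kernel_size=width)
    pooled.append(torch.relu(conv(inputs)).max(dim=2).values)

encoding = torch.cat(pooled, dim=1)              # (8, num_filters * 5) = (8, 80)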
Example #31
def construct_model(vocab, args):
    # token embedding

    word_embedding = Embedding.from_params(vocab=vocab, params=Params({
        "pretrained_file": "glove\\glove.vocab.100d.txt",
        "embedding_dim": 100,
        "trainable": True,
        "padding_index": 0
    }))

    word_embedding = BasicTextFieldEmbedder({
        "token_words": word_embedding
    })

    char_embedding = BasicTextFieldEmbedder({
        "token_characters": TokenCharactersEncoder(embedding=Embedding(embedding_dim=20,
                                                                       num_embeddings=262),
                                                   encoder=CnnEncoder(embedding_dim=20,
                                                                      ngram_filter_sizes=[5],
                                                                      num_filters=50)),
    })

    lstm = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(input_size=100,
                      num_layers=1,
                      hidden_size=100,
                      bidirectional=True,
                      batch_first=True))

    model = FollowUpSnippetModel(vocab=vocab,
                                 word_embedder=word_embedding,
                                 char_embedder=char_embedding,
                                 tokens_encoder=lstm,
                                 model_args=args)

    return model
Example #32
def __init__(self,
             num_embeddings: int,
             embedding_dim: int,
             dropout: float = None,
             projection_dim: int = None,
             weight: torch.FloatTensor = None,
             padding_index: int = None,
             trainable: bool = True,
             max_norm: float = None,
             norm_type: float = 2.,
             scale_grad_by_freq: bool = False,
             sparse: bool = False) -> None:
    Embedding.__init__(self,
                       num_embeddings=num_embeddings,
                       embedding_dim=embedding_dim,
                       projection_dim=projection_dim,
                       weight=weight,
                       padding_index=padding_index,
                       trainable=trainable,
                       max_norm=max_norm,
                       norm_type=norm_type,
                       scale_grad_by_freq=scale_grad_by_freq,
                       sparse=sparse)
    self.dropout = dropout
def main():
    reader = PosDatasetReader()
    train_dataset = reader.read(
        cached_path(
            "https://raw.githubusercontent.com/allenai/allennlp/master/tutorials/tagger/training.txt"
        ))
    validation_dataset = reader.read(
        cached_path(
            "https://raw.githubusercontent.com/allenai/allennlp/master/tutorials/tagger/validation.txt"
        ))

    vocab = Vocabulary.from_instances(train_dataset + validation_dataset)

    EMBEDDING_DIM = 6
    HIDDEN_DIM = 6

    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size("tokens"),
                                embedding_dim=EMBEDDING_DIM)
    word_embedding = BasicTextFieldEmbedder({"tokens": token_embedding})
    lstm = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

    model = LstmTagger(word_embedding, lstm, vocab)

    optimizer = optim.SGD(model.parameters(), lr=0.1)

    iterator = BucketIterator(batch_size=2,
                              sorting_keys=[('sentence', 'num_tokens')])
    iterator.index_with(vocab)

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        iterator=iterator,
        train_dataset=train_dataset,
        validation_dataset=validation_dataset,
        patience=10,
        num_epochs=1000,
    )

    trainer.train()

    predictor = SentenceTaggerPredictor(model, dataset_reader=reader)

    tag_logits = predictor.predict("The dog ate the apple")["tag_logits"]
    tag_ids = np.argmax(tag_logits, axis=-1)

    print([model.vocab.get_token_from_index(i, "labels") for i in tag_ids])