def setUp(self):
    super(TestTokenCharactersEncoder, self).setUp()
    self.vocab = Vocabulary()
    self.vocab.add_token_to_namespace("1", "token_characters")
    self.vocab.add_token_to_namespace("2", "token_characters")
    self.vocab.add_token_to_namespace("3", "token_characters")
    self.vocab.add_token_to_namespace("4", "token_characters")
    params = Params({
        "embedding": {
            "embedding_dim": 2,
            "vocab_namespace": "token_characters"
        },
        "encoder": {
            "type": "cnn",
            "embedding_dim": 2,
            "num_filters": 4,
            "ngram_filter_sizes": [1, 2],
            "output_dim": 3
        }
    })
    self.encoder = TokenCharactersEncoder.from_params(vocab=self.vocab, params=deepcopy(params))
    self.embedding = Embedding.from_params(vocab=self.vocab, params=params["embedding"])
    self.inner_encoder = Seq2VecEncoder.from_params(params["encoder"])
    constant_init = Initializer.from_params(Params({"type": "constant", "val": 1.}))
    initializer = InitializerApplicator([(".*", constant_init)])
    initializer(self.encoder)
    initializer(self.embedding)
    initializer(self.inner_encoder)
def from_params(cls, vocab: Vocabulary, params: Params) -> 'SpanConstituencyParser':
    embedder_params = params.pop("text_field_embedder")
    text_field_embedder = TextFieldEmbedder.from_params(vocab, embedder_params)
    span_extractor = SpanExtractor.from_params(params.pop("span_extractor"))
    encoder = Seq2SeqEncoder.from_params(params.pop("encoder"))

    feed_forward_params = params.pop("feedforward", None)
    if feed_forward_params is not None:
        feedforward_layer = FeedForward.from_params(feed_forward_params)
    else:
        feedforward_layer = None

    pos_tag_embedding_params = params.pop("pos_tag_embedding", None)
    if pos_tag_embedding_params is not None:
        pos_tag_embedding = Embedding.from_params(vocab, pos_tag_embedding_params)
    else:
        pos_tag_embedding = None

    initializer = InitializerApplicator.from_params(params.pop('initializer', []))
    regularizer = RegularizerApplicator.from_params(params.pop('regularizer', []))
    evalb_directory_path = params.pop("evalb_directory_path", None)
    params.assert_empty(cls.__name__)

    return cls(vocab=vocab,
               text_field_embedder=text_field_embedder,
               span_extractor=span_extractor,
               encoder=encoder,
               feedforward_layer=feedforward_layer,
               pos_tag_embedding=pos_tag_embedding,
               initializer=initializer,
               regularizer=regularizer,
               evalb_directory_path=evalb_directory_path)
def __init__(self, device, inp_dim, hid_dim, compression, vocab,
             dropout: float = 0.4, dropout_emb: float = 0.2,
             pretrain_embedding_file=None):
    super().__init__()
    self.compression = compression
    self.hid_dim = hid_dim
    self.sent_enc = EncSent(device=device, inp_dim=inp_dim,
                            hid_dim=hid_dim, compression=compression)
    token_embedding = Embedding(
        num_embeddings=vocab.get_vocab_size('tokens'),
        embedding_dim=inp_dim)
    if dropout_emb > 0:
        self._lexical_dropout = torch.nn.Dropout(p=dropout_emb)
    else:
        self._lexical_dropout = lambda x: x
    if pretrain_embedding_file is not None:
        logger = logging.getLogger()
        logger.info(
            "Loading word embedding: {}".format(pretrain_embedding_file))
        # Replace the randomly initialized embedding with one loaded from the
        # pretrained file (``from_params`` returns a new Embedding, so it must be assigned).
        token_embedding = Embedding.from_params(
            vocab=vocab,
            params=Params({
                "pretrained_file": pretrain_embedding_file,
                "embedding_dim": inp_dim
            }))
    self._text_field_embedder = BasicTextFieldEmbedder(
        {"tokens": token_embedding})
    self.sent2doc = EncWord2Sent(device=device,
                                 inp_dim=self.sent_enc.get_output_dim(),
                                 hidden_dim=hid_dim,
                                 nenc_lay=2,
                                 dropout=dropout)
def __init__(self, vocab: Vocabulary, dense_dim=75, l2=1e-5, l1=1e-7, drop=0.1) -> None:
    super(Text_Embedding, self).__init__()
    self.dense_dim = dense_dim
    self.dropout_p = drop
    self.l1_lambda = l1
    self.l2_lambda = l2
    self.final_l2_norm = True
    self.embed_direction = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                     embedding_dim=self.dense_dim,
                                     norm_type=2,
                                     max_norm=self.l2_lambda)
    self.embed_magnitude = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                     embedding_dim=1,
                                     norm_type=1,
                                     max_norm=self.l1_lambda)
    # PyTorch hasn't implemented spatial dropout for 1d
    self.dropout = Dropout(p=self.dropout_p)
def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             context_layer: Seq2SeqEncoder,
             mention_feedforward: FeedForward,
             antecedent_feedforward: FeedForward,
             feature_size: int,
             max_span_width: int,
             spans_per_word: float,
             max_antecedents: int,
             lexical_dropout: float = 0.2,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None) -> None:
    super(CoreferenceResolver, self).__init__(vocab, regularizer)
    self._text_field_embedder = text_field_embedder
    self._context_layer = context_layer
    self._mention_feedforward = TimeDistributed(mention_feedforward)
    self._antecedent_feedforward = TimeDistributed(antecedent_feedforward)
    self._mention_scorer = TimeDistributed(
        torch.nn.Linear(mention_feedforward.get_output_dim(), 1))
    self._antecedent_scorer = TimeDistributed(
        torch.nn.Linear(antecedent_feedforward.get_output_dim(), 1))
    self._head_scorer = TimeDistributed(
        torch.nn.Linear(context_layer.get_output_dim(), 1))
    # 10 possible distance buckets.
    self._num_distance_buckets = 10
    self._distance_embedding = Embedding(self._num_distance_buckets, feature_size)
    self._span_width_embedding = Embedding(max_span_width, feature_size)
    self._max_span_width = max_span_width
    self._spans_per_word = spans_per_word
    self._max_antecedents = max_antecedents
    self._mention_recall = MentionRecall()
    self._conll_coref_scores = ConllCorefScores()
    if lexical_dropout > 0:
        self._lexical_dropout = torch.nn.Dropout(p=lexical_dropout)
    else:
        self._lexical_dropout = lambda x: x
    initializer(self)
def __init__(self,
             name: str,
             event2mind: Event2Mind,
             num_classes: int,
             input_dim: int,
             output_dim: int) -> None:
    self.embedder = Embedding(num_classes, input_dim)
    event2mind.add_module(f"{name}_embedder", self.embedder)
    self.decoder_cell = GRUCell(input_dim, output_dim)
    event2mind.add_module(f"{name}_decoder_cell", self.decoder_cell)
    self.output_projection_layer = Linear(output_dim, num_classes)
    event2mind.add_module(f"{name}_output_project_layer", self.output_projection_layer)
    self.recall = UnigramRecall()
def __init__(self,
             vocab: Vocabulary,
             source_embedder: TextFieldEmbedder,
             encoder: Seq2SeqEncoder,
             max_decoding_steps: int,
             spans_per_word: float,
             target_namespace: str = "tokens",
             target_embedding_dim: int = None,
             attention_function: SimilarityFunction = None,
             scheduled_sampling_ratio: float = 0.0,
             spans_extractor: SpanExtractor = None,
             spans_scorer_feedforward: FeedForward = None) -> None:
    super(SpanAe, self).__init__(vocab)
    self._source_embedder = source_embedder
    self._encoder = encoder
    self._max_decoding_steps = max_decoding_steps
    self._target_namespace = target_namespace
    self._attention_function = attention_function
    self._scheduled_sampling_ratio = scheduled_sampling_ratio
    # We need the start symbol to provide as the input at the first timestep of decoding, and
    # end symbol as a way to indicate the end of the decoded sequence.
    self._start_index = self.vocab.get_token_index(START_SYMBOL, self._target_namespace)
    self._end_index = self.vocab.get_token_index(END_SYMBOL, self._target_namespace)
    num_classes = self.vocab.get_vocab_size(self._target_namespace)
    # Decoder output dim needs to be the same as the encoder output dim since we initialize the
    # hidden state of the decoder with that of the final hidden states of the encoder. Also, if
    # we're using attention with ``DotProductSimilarity``, this is needed.
    self._decoder_output_dim = self._encoder.get_output_dim() + 1
    target_embedding_dim = target_embedding_dim or self._source_embedder.get_output_dim()
    self._target_embedder = Embedding(num_classes, target_embedding_dim)
    if self._attention_function:
        self._decoder_attention = Attention(self._attention_function)
        # The output of attention, a weighted average over encoder outputs, will be
        # concatenated to the input vector of the decoder at each time step.
        self._decoder_input_dim = self._encoder.get_output_dim() + target_embedding_dim
    else:
        self._decoder_input_dim = target_embedding_dim
    self._decoder_cell = LSTMCell(self._decoder_input_dim + 1, self._decoder_output_dim)
    self._output_projection_layer = Linear(self._decoder_output_dim, num_classes)
    self._span_extractor = spans_extractor
    feedforward_scorer = torch.nn.Sequential(
        TimeDistributed(spans_scorer_feedforward),
        TimeDistributed(
            torch.nn.Linear(spans_scorer_feedforward.get_output_dim(), 1)))
    self._span_pruner = SpanPruner(feedforward_scorer)
    self._spans_per_word = spans_per_word
def __init__(
    self,
    vocab: Vocabulary,
    text_field_embedder: TextFieldEmbedder,
    context_layer: Seq2SeqEncoder,
    mention_feedforward: FeedForward,
    antecedent_feedforward: FeedForward,
    feature_size: int,
    max_span_width: int,
    spans_per_word: float,
    max_antecedents: int,
    lexical_dropout: float = 0.2,
    initializer: InitializerApplicator = InitializerApplicator(),
    regularizer: Optional[RegularizerApplicator] = None,
) -> None:
    super().__init__(vocab, regularizer)
    self._text_field_embedder = text_field_embedder
    self._context_layer = context_layer
    self._antecedent_feedforward = TimeDistributed(antecedent_feedforward)
    feedforward_scorer = torch.nn.Sequential(
        TimeDistributed(mention_feedforward),
        TimeDistributed(torch.nn.Linear(mention_feedforward.get_output_dim(), 1)),
    )
    self._mention_pruner = Pruner(feedforward_scorer)
    self._antecedent_scorer = TimeDistributed(
        torch.nn.Linear(antecedent_feedforward.get_output_dim(), 1)
    )
    self._endpoint_span_extractor = EndpointSpanExtractor(
        context_layer.get_output_dim(),
        combination="x,y",
        num_width_embeddings=max_span_width,
        span_width_embedding_dim=feature_size,
        bucket_widths=False,
    )
    self._attentive_span_extractor = SelfAttentiveSpanExtractor(
        input_dim=text_field_embedder.get_output_dim()
    )
    # 10 possible distance buckets.
    self._num_distance_buckets = 10
    self._distance_embedding = Embedding(self._num_distance_buckets, feature_size)
    self._max_span_width = max_span_width
    self._spans_per_word = spans_per_word
    self._max_antecedents = max_antecedents
    self._mention_recall = MentionRecall()
    self._conll_coref_scores = ConllCorefScores()
    if lexical_dropout > 0:
        self._lexical_dropout = torch.nn.Dropout(p=lexical_dropout)
    else:
        self._lexical_dropout = lambda x: x
    initializer(self)
def __init__(self, vocab: Vocabulary, cuda_device=-1) -> None:
    super().__init__(vocab)
    self.cuda_device = cuda_device
    token_embedding = Embedding(
        num_embeddings=vocab.get_vocab_size('tokens'),
        embedding_dim=EMBEDDING_SIZE)
    if cuda_device > -1:
        token_embedding = token_embedding.to(cuda_device)
    self.embedder = BasicTextFieldEmbedder({"tokens": token_embedding})
    self.rnn = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(EMBEDDING_SIZE, HIDDEN_SIZE, batch_first=True))
    self.hidden2out = torch.nn.Linear(
        in_features=self.rnn.get_output_dim(),
        out_features=vocab.get_vocab_size('tokens'))
    if cuda_device > -1:
        self.hidden2out = self.hidden2out.to(cuda_device)
        self.rnn = self.rnn.to(cuda_device)
def __init__(
    self,
    vocab: Vocabulary,
    source_embedder: TextFieldEmbedder,
    encoder: Seq2VecEncoder,
    context_encoder: Seq2VecEncoder,
    max_decoding_steps: int = 32,
    attention: Attention = None,
    target_namespace: str = "tokens",
    scheduled_sampling_ratio: float = 0.0,
) -> None:
    super().__init__(vocab)
    self._target_namespace = target_namespace
    self._scheduled_sampling_ratio = scheduled_sampling_ratio
    # Maybe we can try
    self._start_index = self.vocab.get_token_index(START_SYMBOL, self._target_namespace)
    self._end_index = self.vocab.get_token_index(END_SYMBOL, self._target_namespace)
    self.pad_index = self.vocab.get_token_index(self.vocab._padding_token,
                                                self._target_namespace)
    # self.outfeature = 600
    self._max_decoding_steps = max_decoding_steps

    # Metrics
    self.kd_metric = KD_Metric()
    self.bleu_aver = NLTK_BLEU(ngram_weights=(0.25, 0.25, 0.25, 0.25))
    self.bleu1 = NLTK_BLEU(ngram_weights=(1, 0, 0, 0))
    self.bleu2 = NLTK_BLEU(ngram_weights=(0, 1, 0, 0))
    self.bleu4 = NLTK_BLEU(ngram_weights=(0, 0, 0, 1))
    self.dink1 = Distinct1()
    self.dink2 = Distinct2()
    self.topic_acc = Average()

    # Module setup
    self._source_embedder = source_embedder
    num_classes = self.vocab.get_vocab_size(self._target_namespace)
    target_embedding_dim = source_embedder.get_output_dim()
    self._target_embedder = Embedding(num_classes, target_embedding_dim)
    self._encoder = encoder
    self.context_encoder = context_encoder
    # 512; maybe just replace the first two with outfeature
    self._encoder_output_dim = self._encoder.get_output_dim()
    self._decoder_output_dim = self._encoder_output_dim
    self._decoder_input_dim = target_embedding_dim + self._decoder_output_dim
    self._attention = None
    # if attention:
    #     self._attention = attention
    #     self._decoder_input_dim = self._decoder_output_dim + target_embedding_dim
    # Maybe try fusing that embedding in here?
    self._decoder_cell = LSTMCell(self._decoder_input_dim, self._decoder_output_dim)
    self._output_projection_layer = Linear(self._encoder_output_dim, num_classes)
    self.clac_num = 0
def create_model(vocab):
    # prepare model
    EMBEDDING_DIM = 100
    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    HIDDEN_DIM = 100
    lstm = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))
    model = LstmTagger(word_embeddings, lstm, vocab)
    return model
def main():
    reader = Seq2SeqDatasetReader(
        source_tokenizer=WordTokenizer(),
        target_tokenizer=CharacterTokenizer(),
        source_token_indexers={'tokens': SingleIdTokenIndexer()},
        target_token_indexers={'tokens': SingleIdTokenIndexer(namespace='target_tokens')})
    train_dataset = reader.read('data/mt/tatoeba.eng_cmn.train.tsv')
    validation_dataset = reader.read('data/mt/tatoeba.eng_cmn.dev.tsv')

    vocab = Vocabulary.from_instances(train_dataset + validation_dataset,
                                      min_count={'tokens': 3, 'target_tokens': 3})

    en_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                             embedding_dim=EN_EMBEDDING_DIM)
    # encoder = PytorchSeq2SeqWrapper(
    #     torch.nn.LSTM(EN_EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))
    encoder = StackedSelfAttentionEncoder(input_dim=EN_EMBEDDING_DIM,
                                          hidden_dim=HIDDEN_DIM,
                                          projection_dim=128,
                                          feedforward_hidden_dim=128,
                                          num_layers=1,
                                          num_attention_heads=8)
    source_embedder = BasicTextFieldEmbedder({"tokens": en_embedding})

    # attention = LinearAttention(HIDDEN_DIM, HIDDEN_DIM, activation=Activation.by_name('tanh')())
    # attention = BilinearAttention(HIDDEN_DIM, HIDDEN_DIM)
    attention = DotProductAttention()

    max_decoding_steps = 20  # TODO: make this variable
    model = SimpleSeq2Seq(vocab, source_embedder, encoder, max_decoding_steps,
                          target_embedding_dim=ZH_EMBEDDING_DIM,
                          target_namespace='target_tokens',
                          attention=attention,
                          beam_size=8,
                          use_bleu=True)
    optimizer = optim.Adam(model.parameters())
    iterator = BucketIterator(batch_size=32, sorting_keys=[("source_tokens", "num_tokens")])
    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      num_epochs=1,
                      cuda_device=CUDA_DEVICE)

    for i in range(50):
        print('Epoch: {}'.format(i))
        trainer.train()

        predictor = SimpleSeq2SeqPredictor(model, reader)

        for instance in itertools.islice(validation_dataset, 10):
            print('SOURCE:', instance.fields['source_tokens'].tokens)
            print('GOLD:', instance.fields['target_tokens'].tokens)
            print('PRED:', predictor.predict_instance(instance)['predicted_tokens'])
def glove_embeddings(vocab: Vocabulary,
                     file_path: Path,
                     dimension: int,
                     training: bool = True,
                     namespace: str = 'tokens') -> BasicTextFieldEmbedder:
    """Pre-trained embeddings using GloVe."""
    token_embedding = Embedding.from_params(vocab, Params({
        "embedding_dim": dimension,
        "vocab_namespace": 'tokens',
        "pretrained_file": str(file_path),
        "trainable": training,
    }))
    word_embeddings = BasicTextFieldEmbedder({namespace: token_embedding})
    return word_embeddings
def __init__(self,
             vocab: Vocabulary,
             source_embedder: TextFieldEmbedder,
             encoder: Seq2SeqEncoder,
             max_decoding_steps: int,
             target_namespace: str = "tokens",
             attention_function: SimilarityFunction = None,
             scheduled_sampling_ratio: float = 0.0,
             label_smoothing: float = None,
             target_embedding_dim: int = None,
             target_tokens_embedder: TokenEmbedder = None) -> None:
    super(PretrSeq2Seq, self).__init__(vocab)
    self._label_smoothing = label_smoothing
    self._source_embedder = source_embedder
    self._encoder = encoder
    self._max_decoding_steps = max_decoding_steps
    self._target_namespace = target_namespace
    self._attention_function = attention_function
    self._scheduled_sampling_ratio = scheduled_sampling_ratio
    # We need the start symbol to provide as the input at the first timestep of decoding, and
    # end symbol as a way to indicate the end of the decoded sequence.
    self._start_index = self.vocab.get_token_index(START_SYMBOL, self._target_namespace)
    self._end_index = self.vocab.get_token_index(END_SYMBOL, self._target_namespace)
    num_classes = self.vocab.get_vocab_size(self._target_namespace)
    # Decoder output dim needs to be the same as the encoder output dim since we initialize the
    # hidden state of the decoder with that of the final hidden states of the encoder. Also, if
    # we're using attention with ``DotProductSimilarity``, this is needed.
    self._decoder_output_dim = self._encoder.get_output_dim()
    target_embedding_dim = target_embedding_dim or self._source_embedder.get_output_dim()
    self._target_embedder = Embedding(num_classes, target_embedding_dim)
    # PRETRAINED PART
    if target_tokens_embedder:
        target_embedding_dim = target_tokens_embedder.get_output_dim()
        self._target_embedder = target_tokens_embedder
    if self._attention_function:
        self._decoder_attention = LegacyAttention(self._attention_function)
        # The output of attention, a weighted average over encoder outputs, will be
        # concatenated to the input vector of the decoder at each time step.
        self._decoder_input_dim = self._encoder.get_output_dim() + target_embedding_dim
    else:
        self._decoder_input_dim = target_embedding_dim
    # TODO (pradeep): Do not hardcode decoder cell type.
    self._decoder_cell = LSTMCell(self._decoder_input_dim, self._decoder_output_dim)
    self._output_projection_layer = Linear(self._decoder_output_dim, num_classes)
def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             encoder: Seq2SeqEncoder,
             binary_feature_dim: int,
             embedding_dropout: float = 0.0,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None,
             label_smoothing: float = None,
             label_namespace: str = "labels",
             ignore_span_metric: bool = False,
             label_encoding: Optional[str] = 'BIO',
             include_start_end_transitions: bool = True,
             constrain_crf_decoding: bool = True) -> None:
    super(OieLabelerCRF, self).__init__(vocab, regularizer)
    self.text_field_embedder = text_field_embedder
    self.num_classes = self.vocab.get_vocab_size("labels")
    # For the span based evaluation, we don't want to consider labels
    # for verb, because the verb index is provided to the model.
    self.span_metric = SpanBasedF1Measure(vocab,
                                          tag_namespace="labels",
                                          ignore_classes=["V"])
    self.label_namespace = label_namespace
    self.encoder = encoder
    # There are exactly 2 binary features for the verb predicate embedding.
    self.binary_feature_embedding = Embedding(2, binary_feature_dim)
    self.tag_projection_layer = TimeDistributed(
        Linear(self.encoder.get_output_dim(), self.num_classes))
    self.embedding_dropout = Dropout(p=embedding_dropout)
    self._label_smoothing = label_smoothing
    self.ignore_span_metric = ignore_span_metric
    self.include_start_end_transitions = include_start_end_transitions

    if constrain_crf_decoding is None:
        constrain_crf_decoding = label_encoding is not None
    if constrain_crf_decoding:
        labels = self.vocab.get_index_to_token_vocabulary(label_namespace)
        print(labels)
        constraints = allowed_transitions(label_encoding, labels)
    else:
        constraints = None

    self.crf = ConditionalRandomField(
        self.num_classes, constraints,
        include_start_end_transitions=include_start_end_transitions)
    check_dimensions_match(
        text_field_embedder.get_output_dim() + binary_feature_dim,
        encoder.get_input_dim(),
        "text embedding dim + verb indicator embedding dim",
        "encoder input dim")
    initializer(self)
def __init__(
    self,
    vocab: Vocabulary,
    source_embedder: TextFieldEmbedder,
    encoder: Seq2SeqEncoder,
    attention: Attention,
    beam_size: int,
    max_decoding_steps: int,
    dropout: float = 0.0,
    target_embedding_dim: int = 30,
    copy_token: str = "@COPY@",
    source_namespace: str = "source_tokens",
    target_namespace: str = "target_tokens",
    language_id_namespace: str = "language_labels",
    tensor_based_metric: Metric = None,
    token_based_metric: Metric = None,
    initializer: InitializerApplicator = InitializerApplicator()
) -> None:
    if source_namespace == target_namespace:
        target_embedding_dim = source_embedder._token_embedders[
            source_namespace].get_output_dim()
    super().__init__(vocab,
                     source_embedder,
                     encoder,
                     attention,
                     beam_size,
                     max_decoding_steps,
                     target_embedding_dim=target_embedding_dim,
                     copy_token=copy_token,
                     source_namespace=source_namespace,
                     target_namespace=target_namespace,
                     tensor_based_metric=tensor_based_metric,
                     token_based_metric=token_based_metric)
    self._language_id_namespace = language_id_namespace
    self.lang_vocab_size = self.vocab.get_vocab_size(self._language_id_namespace)
    self._lang_embedder = Embedding(self.lang_vocab_size, self.decoder_output_dim)
    self._inp_dropout = Dropout(p=dropout)
    if source_namespace == target_namespace:
        # replace independent target embeddings by source embeddings
        self._target_embedder = self._source_embedder._token_embedders[source_namespace]
    # self._bt_loss = Average()
    # self._lm_loss = Average()
    initializer(self)
def __init__(self,
             vocab: Vocabulary,
             attention: CaptioningAttention,
             embedding_dim: int = 256,
             decoder_dim: int = 256):
    super(MultiscaleDecoder, self).__init__(vocab=vocab)
    self._vocab_size = self.vocab.get_vocab_size()
    self._embedding_dim = embedding_dim
    self._decoder_dim = decoder_dim
    self._embedding = Embedding(self._vocab_size, self._embedding_dim)
    self._dropout = nn.Dropout(0.1)
    # Output size of the state cell must be decoder dim since the state is transformed by the state cell
    self._state_cell = nn.GRUCell(self._embedding.get_output_dim(), self._decoder_dim)
    self._attention = attention
    self._decoder_cell = nn.GRUCell(self._attention.get_output_dim(), self._decoder_dim)
    self._linear = nn.Linear(self._decoder_dim, self._vocab_size)
def _find_model_function(self):
    embedding_dim = self.configuration['embed_size']
    embedding_matrix_filepath = self.base_data_dir + 'embedding_matrix'
    if os.path.exists(embedding_matrix_filepath):
        embedding_matrix = super()._load_object(embedding_matrix_filepath)
    else:
        embedding_filepath = self.configuration['embedding_filepath']
        embedding_matrix = embedding._read_embeddings_from_text_file(
            embedding_filepath, embedding_dim, self.vocab, namespace='tokens')
        super()._save_object(embedding_matrix_filepath, embedding_matrix)
    token_embedding = Embedding(num_embeddings=self.vocab.get_vocab_size(namespace='tokens'),
                                embedding_dim=embedding_dim,
                                padding_index=0,
                                vocab_namespace='tokens',
                                trainable=False,
                                weight=embedding_matrix)
    # the embedder maps the input tokens to the appropriate embedding matrix
    word_embedder: TextFieldEmbedder = BasicTextFieldEmbedder({"tokens": token_embedding})

    aspect_embedding_matrix = None
    if self._init_aspect_embeddings_from_word_embeddings():
        embedding_filepath = self.configuration['embedding_filepath']
        aspect_embedding_matrix = embedding._read_embeddings_from_text_file(
            embedding_filepath, embedding_dim, self.vocab, namespace='aspect')
    aspect_embedding = Embedding(num_embeddings=self.vocab.get_vocab_size(namespace='aspect'),
                                 embedding_dim=self._get_aspect_embeddings_dim(),
                                 padding_index=0,
                                 trainable=True,
                                 weight=aspect_embedding_matrix)
    aspect_embedder: TextFieldEmbedder = BasicTextFieldEmbedder(
        {"aspect": aspect_embedding},
        # we'll be ignoring masks so we'll need to set this to True
        allow_unmatched_keys=True)

    model_function = self._find_model_function_pure()
    model = model_function(
        word_embedder,
        aspect_embedder,
        self.distinct_categories,
        self.distinct_polarities,
        self.vocab,
        self.configuration
    )
    self._print_args(model)
    model = model.to(self.configuration['device'])
    return model
def _construct_embedding_matrix(self) -> Embedding:
    """
    For HotFlip, we need a word embedding matrix to search over. The below is necessary for
    models such as ELMo, character-level models, or for models that use a projection layer
    after their word embeddings.

    We run all of the tokens from the vocabulary through the TextFieldEmbedder, and save the
    final output embedding. We then group all of those output embeddings into an
    "embedding matrix".
    """
    # Gets all tokens in the vocab and their corresponding IDs
    all_tokens = self.vocab._token_to_index[self.namespace]
    all_indices = list(self.vocab._index_to_token[self.namespace].keys())
    all_inputs = {"tokens": torch.LongTensor(all_indices).unsqueeze(0)}

    # A bit of a hack; this will only work with some dataset readers, but it'll do for now.
    indexers = self.predictor._dataset_reader._token_indexers  # type: ignore
    for token_indexer in indexers.values():
        # handle when a model uses character-level inputs, e.g., a CharCNN
        if isinstance(token_indexer, TokenCharactersIndexer):
            tokens = [Token(x) for x in all_tokens]
            max_token_length = max(len(x) for x in all_tokens)
            indexed_tokens = token_indexer.tokens_to_indices(
                tokens, self.vocab, "token_characters")
            padded_tokens = token_indexer.as_padded_tensor(
                indexed_tokens,
                {"token_characters": len(tokens)},
                {"num_token_characters": max_token_length})
            all_inputs['token_characters'] = torch.LongTensor(
                padded_tokens['token_characters']).unsqueeze(0)
        # for ELMo models
        if isinstance(token_indexer, ELMoTokenCharactersIndexer):
            elmo_tokens = []
            for token in all_tokens:
                elmo_indexed_token = token_indexer.tokens_to_indices(
                    [Token(text=token)], self.vocab, "sentence")["sentence"]
                elmo_tokens.append(elmo_indexed_token[0])
            all_inputs["elmo"] = torch.LongTensor(elmo_tokens).unsqueeze(0)

    embedding_layer = util.find_embedding_layer(self.predictor._model)
    if isinstance(embedding_layer, torch.nn.modules.sparse.Embedding):
        embedding_matrix = embedding_layer.weight
    else:
        # pass all tokens through the embedding layer and use its output as the matrix.
        embedding_matrix = embedding_layer(all_inputs).squeeze()

    return Embedding(num_embeddings=self.vocab.get_vocab_size(self.namespace),
                     embedding_dim=embedding_matrix.shape[1],
                     weight=embedding_matrix,
                     trainable=False)
def main():
    reader = StanfordSentimentTreeBankDatasetReader()

    train_dataset = reader.read('data/stanfordSentimentTreebank/trees/train.txt')
    dev_dataset = reader.read('data/stanfordSentimentTreebank/trees/dev.txt')

    # You can optionally specify the minimum count of tokens/labels.
    # `min_count={'tokens':3}` here means that any tokens that appear less than three times
    # will be ignored and not included in the vocabulary.
    vocab = Vocabulary.from_instances(train_dataset + dev_dataset,
                                      min_count={'tokens': 3})

    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)

    # BasicTextFieldEmbedder takes a dict - we need an embedding just for tokens,
    # not for labels, which are used as-is as the "answer" of the sentence classification
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

    # Seq2VecEncoder is a neural network abstraction that takes a sequence of something
    # (usually a sequence of embedded word vectors), processes it, and returns a single
    # vector. Oftentimes this is an RNN-based architecture (e.g., LSTM or GRU), but
    # AllenNLP also supports CNNs and other simple architectures (for example,
    # just averaging over the input vectors).
    encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

    model = LstmClassifier(word_embeddings, encoder, vocab)
    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

    iterator = BucketIterator(batch_size=32, sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      patience=10,
                      num_epochs=20)
    trainer.train()

    predictor = SentenceClassifierPredictor(model, dataset_reader=reader)
    logits = predictor.predict('This is the best movie ever!')['logits']
    label_id = np.argmax(logits)

    print(model.vocab.get_token_from_index(label_id, 'labels'))
def get_embedder(self, vocab, Word_embedding_dim, char_embeddedng_dim, CNN_num_filters, CNN_encoder_dim):
    # The word embedding will transform every word to a "Word_embedding_dim" real valued vector,
    # giving a tensor (batch_size, max_sentence_length, Word_embedding_dim)
    indexers_dict = dict()
    if Word_embedding_dim > 0:
        word_embedding = Embedding(
            num_embeddings=vocab.get_vocab_size("token_ids"),
            embedding_dim=Word_embedding_dim)
        word_embedding = word_embedding.to(device=self.cf_a.device, dtype=self.cf_a.dtype)
        indexers_dict["tokens"] = word_embedding
    if CNN_encoder_dim > 0:
        # The char embedding will transform every character into a "char_embeddedng_dim" real valued vector,
        # giving a tensor (batch_size, max_sentence_length, max_word_length, char_embeddedng_dim)
        char_embedding = Embedding(
            num_embeddings=vocab.get_vocab_size("token_chars"),
            embedding_dim=char_embeddedng_dim)
        # The encoder applies the CNN over the max_word_length dimension,
        # giving a tensor (batch_size, max_sentence_length, num_filters * ngram_filter_sizes)
        character_cnn = CnnEncoder(ngram_filter_sizes=(1, 1),
                                   embedding_dim=char_embeddedng_dim,
                                   num_filters=CNN_num_filters,
                                   output_dim=CNN_encoder_dim)
        # We combine the char embedding and the encoder
        token_character_encoder = TokenCharactersEncoder(embedding=char_embedding,
                                                         encoder=character_cnn)
        token_character_encoder = token_character_encoder.to(
            device=self.cf_a.device, dtype=self.cf_a.dtype)
        indexers_dict["chars"] = token_character_encoder

    ### Finally create the embedder, indicating which token ids it embeds
    text_field_embedder = BasicTextFieldEmbedder(indexers_dict)
    return text_field_embedder
def chive_emb_returner(vocab: Vocabulary) -> BasicTextFieldEmbedder:
    # embed_matrix = _read_embeddings_from_text_file(
    #     file_uri="./resources/chive-1.1-mc30.txt",
    #     embedding_dim=300,
    #     vocab=vocab
    # )
    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=300,
                                pretrained_file=str(CACHE_ROOT) + "/resources/chive-1.1-mc30.txt",
                                vocab=vocab)
    return BasicTextFieldEmbedder({'tokens': token_embedding})
def __init__(self,
             vocab: Vocabulary,
             input_size: int,
             hidden_size: int,
             loss_ratio: float = 1.0,
             recurrency: nn.LSTM = None,
             num_layers: int = None,
             remove_sos: bool = True,
             remove_eos: bool = False,
             target_embedder: Embedding = None,
             target_embedding_dim: int = None,
             target_namespace: str = "tokens",
             slow_decode: bool = False,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None) -> None:
    super(RNNTLayer, self).__init__(vocab, regularizer)
    import warprnnt_pytorch
    self.loss_ratio = loss_ratio
    self._remove_sos = remove_sos
    self._remove_eos = remove_eos
    self._slow_decode = slow_decode
    self._target_namespace = target_namespace
    self._num_classes = self.vocab.get_vocab_size(target_namespace)
    self._pad_index = self.vocab.get_token_index(DEFAULT_PADDING_TOKEN, self._target_namespace)
    self._start_index = self.vocab.get_token_index(START_SYMBOL, self._target_namespace)
    self._end_index = self.vocab.get_token_index(END_SYMBOL, self._target_namespace)
    self._loss = warprnnt_pytorch.RNNTLoss(blank=self._pad_index, reduction='mean')
    self._recurrency = recurrency or \
        nn.LSTM(input_size=target_embedding_dim,
                hidden_size=hidden_size,
                num_layers=num_layers,
                batch_first=True)
    self._target_embedder = target_embedder or Embedding(self._num_classes, target_embedding_dim)
    self.w_enc = nn.Linear(input_size, hidden_size, bias=True)
    self.w_dec = nn.Linear(input_size, hidden_size, bias=False)
    self._proj = nn.Linear(hidden_size, self._num_classes)

    exclude_indices = {self._pad_index, self._end_index, self._start_index}
    self._wer: Metric = WER(exclude_indices=exclude_indices)
    self._bleu: Metric = BLEU(exclude_indices=exclude_indices)
    self._dal = Average()
    initializer(self)
def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             span_extractor: SpanExtractor,
             encoder: Seq2SeqEncoder,
             feedforward: FeedForward = None,
             pos_tag_embedding: Embedding = None,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None,
             evalb_directory_path: str = DEFAULT_EVALB_DIR) -> None:
    super().__init__(vocab, regularizer)
    self.text_field_embedder = text_field_embedder
    self.span_extractor = span_extractor
    self.num_classes = self.vocab.get_vocab_size("labels")
    self.encoder = encoder
    self.feedforward_layer = TimeDistributed(feedforward) if feedforward else None
    self.pos_tag_embedding = pos_tag_embedding or None
    if feedforward is not None:
        output_dim = feedforward.get_output_dim()
    else:
        output_dim = span_extractor.get_output_dim()
    self.tag_projection_layer = TimeDistributed(Linear(output_dim, self.num_classes))

    representation_dim = text_field_embedder.get_output_dim()
    if pos_tag_embedding is not None:
        representation_dim += pos_tag_embedding.get_output_dim()
    check_dimensions_match(representation_dim,
                           encoder.get_input_dim(),
                           "representation dim (tokens + optional POS tags)",
                           "encoder input dim")
    check_dimensions_match(encoder.get_output_dim(),
                           span_extractor.get_input_dim(),
                           "encoder input dim",
                           "span extractor input dim")
    if feedforward is not None:
        check_dimensions_match(span_extractor.get_output_dim(),
                               feedforward.get_input_dim(),
                               "span extractor output dim",
                               "feedforward input dim")
    self.tag_accuracy = CategoricalAccuracy()
    if evalb_directory_path is not None:
        self._evalb_score = EvalbBracketingScorer(evalb_directory_path)
    else:
        self._evalb_score = None
    initializer(self)
class ImageCaptioningDecoder(CaptioningDecoder):
    def __init__(self,
                 vocab: Vocabulary,
                 attention: CaptioningAttention,
                 embedding_dim: int = 256,
                 decoder_dim: int = 256):
        super(ImageCaptioningDecoder, self).__init__(vocab=vocab)
        self._vocab_size = self.vocab.get_vocab_size()
        self._embedding_dim = embedding_dim
        self._decoder_dim = decoder_dim
        self._embedding = Embedding(self._vocab_size, self._embedding_dim)
        self._attention = attention
        self._decoder_cell = nn.LSTMCell(
            self._embedding.get_output_dim() + self._attention.get_output_dim(),
            self._decoder_dim)
        self._linear = nn.Linear(self._decoder_dim, self._vocab_size)

    @overrides
    def forward(self,
                x: torch.Tensor,
                h: torch.Tensor,
                c: torch.Tensor,
                predicted_indices: torch.Tensor
                ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        # Shape: (batch_size, embedding_dim)
        embedding = self._embedding(predicted_indices).float().view(-1, self._embedding_dim)

        # Shape: (batch_size, encoder_dim) (batch_size, h * w, 1)
        attention, attention_weights = self._attention(x, h)

        ## Change to not use teacher forcing all the time
        # Shape: (batch_size, decoder_dim) (batch_size, decoder_dim)
        h, c = self._decoder_cell(torch.cat([attention, embedding], dim=1), (h, c))

        # Get output predictions (one per character in vocab)
        # Shape: (batch_size, vocab_size)
        preds = self._linear(h)

        return h, c, preds, attention_weights

    @overrides
    def get_output_dim(self) -> int:
        return self._vocab_size

    @overrides
    def get_input_dim(self) -> int:
        return self._decoder_dim
def __init__(self,
             encoder_output_dim: int,
             action_embedding_dim: int,
             attention_function: SimilarityFunction,
             num_start_types: int,
             num_entity_types: int,
             mixture_feedforward: FeedForward = None,
             dropout: float = 0.0) -> None:
    super(WikiTablesDecoderStep, self).__init__()
    self._mixture_feedforward = mixture_feedforward
    self._entity_type_embedding = Embedding(num_entity_types, action_embedding_dim)
    self._input_attention = Attention(attention_function)

    self._num_start_types = num_start_types
    self._start_type_predictor = Linear(encoder_output_dim, num_start_types)

    # Decoder output dim needs to be the same as the encoder output dim since we initialize the
    # hidden state of the decoder with the final hidden state of the encoder.
    output_dim = encoder_output_dim
    input_dim = output_dim
    # Our decoder input will be the concatenation of the decoder hidden state and the previous
    # action embedding, and we'll project that down to the decoder's `input_dim`, which we
    # arbitrarily set to be the same as `output_dim`.
    self._input_projection_layer = Linear(output_dim + action_embedding_dim, input_dim)
    # Before making a prediction, we'll compute an attention over the input given our updated
    # hidden state. Then we concatenate that with the decoder state and project to
    # `action_embedding_dim` to make a prediction.
    self._output_projection_layer = Linear(output_dim + encoder_output_dim,
                                           action_embedding_dim)

    # TODO(pradeep): Do not hardcode decoder cell type.
    self._decoder_cell = LSTMCell(input_dim, output_dim)

    if mixture_feedforward is not None:
        check_dimensions_match(output_dim, mixture_feedforward.get_input_dim(),
                               "hidden state embedding dim", "mixture feedforward input dim")
        check_dimensions_match(mixture_feedforward.get_output_dim(), 1,
                               "mixture feedforward output dim", "dimension for scalar value")

    if dropout > 0:
        self._dropout = torch.nn.Dropout(p=dropout)
    else:
        self._dropout = lambda x: x
def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             span_extractor: SpanExtractor,
             encoder: Seq2SeqEncoder,
             feedforward_layer: FeedForward = None,
             pos_tag_embedding: Embedding = None,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None,
             evalb_directory_path: str = None) -> None:
    super(SpanConstituencyParser, self).__init__(vocab, regularizer)
    self.text_field_embedder = text_field_embedder
    self.span_extractor = span_extractor
    self.num_classes = self.vocab.get_vocab_size("labels")
    self.encoder = encoder
    self.feedforward_layer = TimeDistributed(feedforward_layer) if feedforward_layer else None
    self.pos_tag_embedding = pos_tag_embedding or None
    if feedforward_layer is not None:
        output_dim = feedforward_layer.get_output_dim()
    else:
        output_dim = span_extractor.get_output_dim()
    self.tag_projection_layer = TimeDistributed(Linear(output_dim, self.num_classes))

    representation_dim = text_field_embedder.get_output_dim()
    if pos_tag_embedding is not None:
        representation_dim += pos_tag_embedding.get_output_dim()
    check_dimensions_match(representation_dim,
                           encoder.get_input_dim(),
                           "representation dim (tokens + optional POS tags)",
                           "encoder input dim")
    check_dimensions_match(encoder.get_output_dim(),
                           span_extractor.get_input_dim(),
                           "encoder input dim",
                           "span extractor input dim")
    if feedforward_layer is not None:
        check_dimensions_match(span_extractor.get_output_dim(),
                               feedforward_layer.get_input_dim(),
                               "span extractor output dim",
                               "feedforward input dim")
    self.tag_accuracy = CategoricalAccuracy()
    if evalb_directory_path is not None:
        self._evalb_score = EvalbBracketingScorer(evalb_directory_path)
    else:
        self._evalb_score = None
    initializer(self)
def __init__(self, vocab, embedding_dim, hidden_size, intermediate_size):
    super().__init__()
    # ``embedding_file`` is assumed to be a module-level path to a pretrained embedding file.
    self.embeddings = Embedding(
        pretrained_file=embedding_file,
        embedding_dim=embedding_dim,
        projection_dim=hidden_size,
        vocab=vocab,
    )
    self.transformer = TransformerStack(
        num_hidden_layers=4,
        hidden_size=hidden_size,
        intermediate_size=intermediate_size,
    )
def build_simple_lstm_model(vocab: Vocabulary,
                            emb_size: int = 256,
                            hidden_size: int = 256,
                            num_layers: int = 2,
                            bidirectional: bool = True) -> Model:
    print("Building the model")
    vocab_size = vocab.get_vocab_size("tokens")
    embedder = BasicTextFieldEmbedder(
        {"bert_tokens": Embedding(embedding_dim=emb_size, num_embeddings=vocab_size)}
    )
    encoder = LstmSeq2VecEncoder(
        input_size=emb_size,
        hidden_size=hidden_size,
        num_layers=num_layers,
        bidirectional=bidirectional
    )
    return SimpleClassifier(vocab, embedder, encoder)
def build_simple_cnn_model(vocab: Vocabulary,
                           emb_size: int = 256,
                           output_dim: int = 256,
                           num_filters: int = 16,
                           ngram_filter_sizes: Tuple[int, ...] = (2, 3, 4, 5, 6)) -> Model:
    print("Building the model")
    vocab_size = vocab.get_vocab_size("tokens")
    embedder = BasicTextFieldEmbedder(
        {"bert_tokens": Embedding(embedding_dim=emb_size, num_embeddings=vocab_size)}
    )
    encoder = CnnEncoder(
        embedding_dim=emb_size,
        ngram_filter_sizes=ngram_filter_sizes,
        output_dim=output_dim,
        num_filters=num_filters,
    )
    return SimpleClassifier(vocab, embedder, encoder)
def construct_model(vocab, args):
    # token embedding
    word_embedding = Embedding.from_params(
        vocab=vocab,
        params=Params({
            "pretrained_file": "glove\\glove.vocab.100d.txt",
            "embedding_dim": 100,
            "trainable": True,
            "padding_index": 0
        }))
    word_embedding = BasicTextFieldEmbedder({
        "token_words": word_embedding
    })
    char_embedding = BasicTextFieldEmbedder({
        "token_characters": TokenCharactersEncoder(
            embedding=Embedding(embedding_dim=20, num_embeddings=262),
            encoder=CnnEncoder(embedding_dim=20, ngram_filter_sizes=[5], num_filters=50)),
    })

    lstm = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(input_size=100, num_layers=1, hidden_size=100,
                      bidirectional=True, batch_first=True))

    model = FollowUpSnippetModel(vocab=vocab,
                                 word_embedder=word_embedding,
                                 char_embedder=char_embedding,
                                 tokens_encoder=lstm,
                                 model_args=args)
    return model
def __init__(self,
             num_embeddings: int,
             embedding_dim: int,
             dropout: float = None,
             projection_dim: int = None,
             weight: torch.FloatTensor = None,
             padding_index: int = None,
             trainable: bool = True,
             max_norm: float = None,
             norm_type: float = 2.,
             scale_grad_by_freq: bool = False,
             sparse: bool = False) -> None:
    Embedding.__init__(self,
                       num_embeddings=num_embeddings,
                       embedding_dim=embedding_dim,
                       projection_dim=projection_dim,
                       weight=weight,
                       padding_index=padding_index,
                       trainable=trainable,
                       max_norm=max_norm,
                       norm_type=norm_type,
                       scale_grad_by_freq=scale_grad_by_freq,
                       sparse=sparse)
    self.dropout = dropout
def main():
    reader = PosDatasetReader()
    train_dataset = reader.read(
        cached_path(
            "https://raw.githubusercontent.com/allenai/allennlp/master/tutorials/tagger/training.txt"
        ))
    validation_dataset = reader.read(
        cached_path(
            "https://raw.githubusercontent.com/allenai/allennlp/master/tutorials/tagger/validation.txt"
        ))
    vocab = Vocabulary.from_instances(train_dataset + validation_dataset)

    EMBEDDING_DIM = 6
    HIDDEN_DIM = 6
    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size("tokens"),
                                embedding_dim=EMBEDDING_DIM)
    word_embedding = BasicTextFieldEmbedder({"tokens": token_embedding})
    lstm = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))
    model = LstmTagger(word_embedding, lstm, vocab)

    optimizer = optim.SGD(model.parameters(), lr=0.1)
    iterator = BucketIterator(batch_size=2, sorting_keys=[('sentence', 'num_tokens')])
    iterator.index_with(vocab)

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        iterator=iterator,
        train_dataset=train_dataset,
        validation_dataset=validation_dataset,
        patience=10,
        num_epochs=1000,
    )
    trainer.train()

    predictor = SentenceTaggerPredictor(model, dataset_reader=reader)
    tag_logits = predictor.predict("The dog ate the apple")["tag_logits"]
    tag_ids = np.argmax(tag_logits, axis=-1)
    print([model.vocab.get_token_from_index(i, "labels") for i in tag_ids])