def __init__(self,
             vocab: Vocabulary,
             sentence_encoder: SentenceEncoder,
             clause_embedding_dim: int,
             slot_embedding_dim: int,
             span_selector: SpanSelector,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None):
    super(ClauseAnsweringModel, self).__init__(vocab, regularizer)
    self._sentence_encoder = sentence_encoder
    self._clause_embedding_dim = clause_embedding_dim
    self._slot_embedding_dim = slot_embedding_dim
    self._span_selector = span_selector
    self._question_embedding_dim = span_selector.get_extra_input_dim()
    self._clause_embedding = Embedding(
        vocab.get_vocab_size("clause-template-labels"), clause_embedding_dim)
    self._slot_embedding = Embedding(
        vocab.get_vocab_size("answer-slot-labels"), slot_embedding_dim)
    self._combined_embedding_dim = self._sentence_encoder.get_output_dim() + \
        self._clause_embedding_dim + \
        self._slot_embedding_dim
    self._question_projection = Linear(self._combined_embedding_dim,
                                       self._question_embedding_dim)
    if self._question_embedding_dim == 0:
        raise ConfigurationError(
            "Question embedding dim (span selector extra input dim) cannot be 0")

def build_model(vocab: Vocabulary, bert_model: str = None) -> Model:
    if bert_model:
        embedder = BasicTextFieldEmbedder(
            {"bert": PretrainedTransformerEmbedder(model_name=bert_model,
                                                   train_parameters=True)})
        encoder = BertPooler(pretrained_model=bert_model, requires_grad=True)
    else:
        # (3) How to get vectors for each Token ID:
        # (3.1) embed each token
        token_embedding = Embedding(
            embedding_dim=10,
            num_embeddings=vocab.get_vocab_size("token_vocab"))
        # pretrained_file='https://allennlp.s3.amazonaws.com/datasets/glove/glove.6B.50d.txt.gz'
        # (3.2) embed each character in each token
        character_embedding = Embedding(
            embedding_dim=3,
            num_embeddings=vocab.get_vocab_size("character_vocab"))
        cnn_encoder = CnnEncoder(embedding_dim=3, num_filters=4,
                                 ngram_filter_sizes=[3])
        token_encoder = TokenCharactersEncoder(character_embedding, cnn_encoder)
        # (3.3) embed the POS of each token
        pos_tag_embedding = Embedding(
            embedding_dim=10,
            num_embeddings=vocab.get_vocab_size("pos_tag_vocab"))
        # Each TokenEmbedder embeds its input, and the results are concatenated
        # in an arbitrary (but consistent) order.
        # cf. https://docs.allennlp.org/master/api/modules/text_field_embedders/basic_text_field_embedder/
        embedder = BasicTextFieldEmbedder(
            token_embedders={"tokens": token_embedding,
                             "token_characters": token_encoder,
                             "pos_tags": pos_tag_embedding})
        # emb_dim = 10 + 4 + 10 = 24
        # Average the embeddings across time rather than simply summing them
        # (i.e. divide the summed embeddings by the length of the sentence).
        encoder = BagOfEmbeddingsEncoder(embedding_dim=24, averaged=True)
    return SimpleClassifier(vocab, embedder, encoder)

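# A minimal usage sketch for build_model above. The vocabulary contents below are
# illustrative assumptions, and SimpleClassifier is assumed to follow the AllenNLP
# guide's implementation, which also requires a "labels" namespace.
vocab = Vocabulary()
vocab.add_tokens_to_namespace(["the", "cat", "sat"], namespace="token_vocab")
vocab.add_tokens_to_namespace(list("thecatsat"), namespace="character_vocab")
vocab.add_tokens_to_namespace(["DT", "NN", "VBD"], namespace="pos_tag_vocab")
vocab.add_tokens_to_namespace(["pos", "neg"], namespace="labels")
model = build_model(vocab)  # char-CNN + bag-of-embeddings variant
# bert_variant = build_model(vocab, bert_model="bert-base-uncased")
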
def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             context_layer: Seq2SeqEncoder,
             mention_feedforward: FeedForward,
             antecedent_feedforward: FeedForward,
             feature_size: int,
             max_span_width: int,
             spans_per_word: float,
             max_antecedents: int,
             lexical_dropout: float = 0.2,
             context_layer_back: Seq2SeqEncoder = None,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None) -> None:
    super(CoreferenceResolver, self).__init__(vocab, regularizer)
    self._text_field_embedder = text_field_embedder
    self._context_layer = context_layer
    self._context_layer_back = context_layer_back
    self._antecedent_feedforward = TimeDistributed(antecedent_feedforward)
    feedforward_scorer = torch.nn.Sequential(
        TimeDistributed(mention_feedforward),
        TimeDistributed(torch.nn.Linear(mention_feedforward.get_output_dim(), 1)))
    self._mention_pruner = SpanPruner(feedforward_scorer)
    self._antecedent_scorer = TimeDistributed(
        torch.nn.Linear(antecedent_feedforward.get_output_dim(), 1))
    # TODO: check the output dim when two context layers are passed through.
    self._endpoint_span_extractor = EndpointSpanExtractor(
        context_layer.get_output_dim(),
        combination="x,y",
        num_width_embeddings=max_span_width,
        span_width_embedding_dim=feature_size,
        bucket_widths=False)
    self._attentive_span_extractor = SelfAttentiveSpanExtractor(
        input_dim=text_field_embedder.get_output_dim())
    # 10 possible distance buckets.
    self._num_distance_buckets = 10
    self._distance_embedding = Embedding(self._num_distance_buckets, feature_size)
    self._speaker_embedding = Embedding(2, feature_size)
    self.genres = {
        g: i for i, g in enumerate(['bc', 'bn', 'mz', 'nw', 'pt', 'tc', 'wb'])
    }
    self._genre_embedding = Embedding(len(self.genres), feature_size)
    self._max_span_width = max_span_width
    self._spans_per_word = spans_per_word
    self._max_antecedents = max_antecedents
    self._mention_recall = MentionRecall()
    self._conll_coref_scores = ConllCorefScores()
    if lexical_dropout > 0:
        self._lexical_dropout = torch.nn.Dropout(p=lexical_dropout)
    else:
        self._lexical_dropout = lambda x: x
    self._feature_dropout = torch.nn.Dropout(0.2)
    initializer(self)

def get_masked_copynet_with_attention(vocab: Vocabulary,
                                      max_decoding_steps: int = 20,
                                      beam_size: int = 1) -> MaskedCopyNet:
    word_embeddings = Embedding(
        num_embeddings=vocab.get_vocab_size("tokens"),
        embedding_dim=EMB_DIM)
    word_embeddings = BasicTextFieldEmbedder({"tokens": word_embeddings})
    masker_embeddings = Embedding(
        num_embeddings=vocab.get_vocab_size("mask_tokens"),
        embedding_dim=MASK_EMB_DIM)
    masker_embeddings = BasicTextFieldEmbedder({"tokens": masker_embeddings})
    attention = AdditiveAttention(vector_dim=HID_DIM * 2, matrix_dim=HID_DIM * 2)
    mask_attention = AdditiveAttention(vector_dim=HID_DIM * 2, matrix_dim=MASK_EMB_DIM)
    lstm = PytorchSeq2SeqWrapper(
        nn.LSTM(EMB_DIM, HID_DIM, batch_first=True, bidirectional=True))
    return MaskedCopyNet(
        vocab=vocab,
        embedder=word_embeddings,
        encoder=lstm,
        max_decoding_steps=max_decoding_steps,
        attention=attention,
        mask_embedder=masker_embeddings,
        mask_attention=mask_attention,
        beam_size=beam_size)

def build_model(vocab: Vocabulary) -> Model: print("Building the model") vocab_size_s = vocab.get_vocab_size("source_tokens") vocab_size_t = vocab.get_vocab_size("target_tokens") bleu = BLEU(exclude_indices = {0,2,3}) source_text_embedder = BasicTextFieldEmbedder({"source_tokens": Embedding(embedding_dim=embedding_dim, num_embeddings=vocab_size_s)}) encoder = PytorchTransformer(input_dim=embedding_dim, num_layers=num_layers ,positional_encoding="sinusoidal", feedforward_hidden_dim=dff, num_attention_heads=num_head, positional_embedding_size = embedding_dim, dropout_prob = dropout) # target_text_embedder = BasicTextFieldEmbedder({"target_tokens":Embedding(embedding_dim=embedding_dim, num_embeddings=vocab_size_t)}) target_text_embedder = Embedding(embedding_dim=embedding_dim, num_embeddings=vocab_size_t) decoder_net = StackedSelfAttentionDecoderNet(decoding_dim=embedding_dim, target_embedding_dim=embedding_dim, feedforward_hidden_dim=dff, num_layers=num_layers, num_attention_heads=num_head, dropout_prob = dropout) decoder_net.decodes_parallel=True decoder = AutoRegressiveSeqDecoder( vocab, decoder_net, max_len, target_text_embedder, target_namespace="target_tokens", tensor_based_metric=bleu, scheduled_sampling_ratio=0.0) if args.pseudo: decoder = PseudoAutoRegressiveSeqDecoder(vocab, decoder_net, max_len, target_text_embedder, target_namespace="target_tokens", tensor_based_metric=bleu, scheduled_sampling_ratio=0.0, decoder_lin_emb = args.dec) return PseudoComposedSeq2Seq(vocab, source_text_embedder, encoder, decoder, num_virtual_models = num_virtual_models) else: decoder = AutoRegressiveSeqDecoder(vocab, decoder_net, max_len, target_text_embedder, target_namespace="target_tokens", tensor_based_metric=bleu, scheduled_sampling_ratio=0.0) return ComposedSeq2Seq(vocab, source_text_embedder, encoder, decoder)
def load_embedding(args, vocab):
    if args.embedding_type == "None":
        # Randomly initialized vectors.
        token_embedding = Embedding(
            num_embeddings=vocab.get_vocab_size('tokens'),
            embedding_dim=args.embedding_dim)
    elif args.embedding_type == "w2v":
        # Load word2vec vectors, caching the weight tensor on disk.
        embedding_path = args.embedding_path
        save_weight_file = './{}_embedding_weight.pt'.format(args.dataset)
        if os.path.exists(save_weight_file):
            weight = torch.load(save_weight_file)
        else:
            weight = _read_pretrained_embeddings_file(
                embedding_path,
                embedding_dim=args.embedding_dim,
                vocab=vocab,
                namespace="tokens")
            torch.save(weight, save_weight_file)
        token_embedding = Embedding(
            num_embeddings=vocab.get_vocab_size('tokens'),
            embedding_dim=args.embedding_dim,
            weight=weight,
            trainable=True)
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    return word_embeddings

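# A hypothetical invocation of load_embedding; the Namespace fields mirror the
# attributes the function reads and are assumptions about the real CLI, and
# `vocab` is assumed to be an AllenNLP Vocabulary built elsewhere.
from argparse import Namespace
args = Namespace(embedding_type="None", embedding_dim=100,
                 embedding_path=None, dataset="sst")
word_embeddings = load_embedding(args, vocab)  # randomly initialized vectors
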
def _find_model_function(self):
    embedding_dim = self.configuration['embed_size']
    embedding_matrix_filepath = self.base_data_dir + 'embedding_matrix'
    if os.path.exists(embedding_matrix_filepath):
        embedding_matrix = super()._load_object(embedding_matrix_filepath)
    else:
        embedding_filepath = self.configuration['embedding_filepath']
        embedding_matrix = embedding._read_embeddings_from_text_file(
            embedding_filepath, embedding_dim, self.vocab, namespace='tokens')
        super()._save_object(embedding_matrix_filepath, embedding_matrix)
    embedding_matrix = embedding_matrix.to(self.configuration['device'])
    token_embedding = Embedding(
        num_embeddings=self.vocab.get_vocab_size(namespace='tokens'),
        embedding_dim=embedding_dim,
        padding_index=0,
        vocab_namespace='tokens',
        trainable=self._is_train_token_embeddings(),
        weight=embedding_matrix)
    # The embedder maps the input tokens to the appropriate embedding matrix.
    word_embedder: TextFieldEmbedder = BasicTextFieldEmbedder({"tokens": token_embedding})
    position_embedding = Embedding(
        num_embeddings=self.vocab.get_vocab_size(namespace='position'),
        embedding_dim=self._get_position_embeddings_dim(),
        padding_index=0)
    position_embedder: TextFieldEmbedder = BasicTextFieldEmbedder(
        {"position": position_embedding},
        # We'll be ignoring masks, so we need to set this to True.
        allow_unmatched_keys=True)
    model_function = self._find_model_function_pure()
    model = model_function(
        word_embedder,
        position_embedder,
        self.distinct_polarities,
        self.vocab,
        self.configuration,
    )
    self._print_args(model)
    model = model.to(self.configuration['device'])
    return model

def get_embedder(type_, vocab, e_dim, rq_grad=False):
    if type_ == 'elmo':
        opt_file = "data/elmo_2x1024_128_2048cnn_1xhighway_options.json"
        wt_file = "data/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5"
        elmo_embedder = ElmoTokenEmbedder(opt_file, wt_file, requires_grad=rq_grad)
        word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})
        return word_embeddings
    if type_ == 'glove':
        wt_file = "data/glove.6B.300d.txt"
        glove_embedder = Embedding(400000, 300,
                                   pretrained_file=wt_file,
                                   trainable=rq_grad)
        word_embeddings = BasicTextFieldEmbedder({"tokens": glove_embedder})
        return word_embeddings
    elif type_ == 'bert':
        bert_embedder = PretrainedBertEmbedder(
            pretrained_model="bert-base-uncased",
            top_layer_only=True,
            requires_grad=rq_grad)
        word_embeddings = BasicTextFieldEmbedder({"tokens": bert_embedder},
                                                 allow_unmatched_keys=True)
        return word_embeddings
    else:
        token_embeddings = Embedding(
            num_embeddings=vocab.get_vocab_size('tokens'),
            embedding_dim=e_dim)
        word_embeddings = BasicTextFieldEmbedder({"tokens": token_embeddings})
        return word_embeddings

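# Example calls for get_embedder above. The data/ paths are the ones hard-coded
# in the function and must exist locally for the 'elmo' and 'glove' branches;
# `vocab` is assumed to be an AllenNLP Vocabulary built elsewhere.
glove_embeddings = get_embedder('glove', vocab, e_dim=300, rq_grad=False)  # frozen GloVe
random_embeddings = get_embedder('random', vocab, e_dim=128)               # fallback branch
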
def init_gru(vocab, d_embedding, hidden_rnn_sz, rnn_num_layers, rnn_dropout,
             all_code_types, feedforward_num_layers, feedforward_hidden_dims,
             feedforward_activations, feedforward_dropout, leadlag, add_time,
             t_max, t_scale, use_timestamps, split_paths):
    """Construct a GRU-based classifier."""
    # Init feedforward params.
    feedforward_hidden_dims = [feedforward_hidden_dims] * feedforward_num_layers
    feedforward_activations = [Activation.by_name(feedforward_activations)()] * feedforward_num_layers
    feedforward_dropout = [feedforward_dropout] * feedforward_num_layers
    # Needed for the final layer.
    feedforward_num_layers += 1
    feedforward_hidden_dims.append(1)
    feedforward_activations.append(Activation.by_name('linear')())
    feedforward_dropout.append(0)
    # Handle augmentations.
    augmentations = []
    if add_time:
        augmentations.append('add_time')
    if leadlag:
        augmentations.append('leadlag')
    d_embedding_updated = update_dims(augmentations, d_embedding)
    i_augmentations = init_augmentations(augmentations,
                                         use_timestamps=use_timestamps,
                                         t_max=t_max,
                                         t_scale=t_scale)
    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size(),
                                embedding_dim=d_embedding)
    # The embedder maps the input tokens to the appropriate embedding matrix.
    word_embeddings: TextFieldEmbedder = BasicTextFieldEmbedder({"tokens": token_embedding})
    # The encoder takes a path of shape (N, L, C) and encodes it into a state vector.
    # encoder = BagOfEmbeddingsEncoder(embedding_dim=d_embedding)
    encoder: Seq2VecEncoder = PytorchSeq2VecWrapper(
        nn.GRU(d_embedding_updated, hidden_rnn_sz,
               num_layers=rnn_num_layers,
               batch_first=True,
               dropout=rnn_dropout))
    classifier_feedforward: FeedForward = FeedForward(
        input_dim=encoder.get_output_dim() * 3
        if (all_code_types and split_paths) else encoder.get_output_dim(),
        num_layers=feedforward_num_layers,
        hidden_dims=feedforward_hidden_dims,
        activations=feedforward_activations,
        dropout=feedforward_dropout)
    model = BaseModel(
        vocab,
        word_embeddings,
        encoder,
        classifier_feedforward,
        augmentations=i_augmentations)
    return model

def main(): # "http://mattmahoney.net/dc/text8.zip" download first data_dir = 'data/word2vec/text8/text8' # 1. build vocab from file vocab = build_vocab(data_dir) # 2. build reader reader = SimpleSkipGramReader( window_size=WIN_SIZE) # or SkipGramReader(vocab=vocab) text8 = reader.read(data_dir) embedding_in = Embedding( num_embeddings=vocab.get_vocab_size('token_target'), embedding_dim=EMBEDDING_DIM) embedding_out = Embedding( num_embeddings=vocab.get_vocab_size('token_context'), embedding_dim=EMBEDDING_DIM) if CUDA_DEVICE > -1: embedding_in = embedding_in.to(CUDA_DEVICE) embedding_out = embedding_out.to(CUDA_DEVICE) iterator = BasicIterator(batch_size=BATCH_SIZE) iterator.index_with(vocab) # important, transform token to index model = SkipGramNegativeSamplingModel(vocab, embedding_in, embedding_out, neg_samples=10, cuda_device=CUDA_DEVICE) # # model = SkipGramModel(vocab=vocab, # embedding_in=embedding_in, # cuda_device=CUDA_DEVICE) optimizer = optim.Adam(model.parameters()) trainer = Trainer(model=model, optimizer=optimizer, iterator=iterator, train_dataset=text8, num_epochs=5, cuda_device=CUDA_DEVICE) trainer.train() # write_embeddings(embedding_in, 'data/text8/embeddings.txt', vocab) print(get_synonyms('one', embedding_in, vocab)) print(get_synonyms('december', embedding_in, vocab)) print(get_synonyms('flower', embedding_in, vocab)) print(get_synonyms('design', embedding_in, vocab)) print(get_synonyms('snow', embedding_in, vocab)) rho = evaluate_embeddings(embedding_in, vocab) print('simlex999 speareman correlation: {}'.format(rho))
def _find_model_function(self):
    embedding_dim = self.configuration['embed_size']
    embedding_matrix_filepath = self.base_data_dir + 'embedding_matrix'
    if os.path.exists(embedding_matrix_filepath):
        embedding_matrix = super()._load_object(embedding_matrix_filepath)
    else:
        embedding_filepath = self.configuration['embedding_filepath']
        embedding_matrix = embedding._read_embeddings_from_text_file(
            embedding_filepath, embedding_dim, self.vocab, namespace='tokens')
        super()._save_object(embedding_matrix_filepath, embedding_matrix)
    token_embedding = Embedding(
        num_embeddings=self.vocab.get_vocab_size(namespace='tokens'),
        embedding_dim=embedding_dim,
        padding_index=0,
        vocab_namespace='tokens',
        trainable=False,
        weight=embedding_matrix)
    # The embedder maps the input tokens to the appropriate embedding matrix.
    word_embedder: TextFieldEmbedder = BasicTextFieldEmbedder(
        {"tokens": token_embedding})
    position_embedding = Embedding(
        num_embeddings=self.vocab.get_vocab_size(namespace='position'),
        embedding_dim=25,
        padding_index=0)
    position_embedder: TextFieldEmbedder = BasicTextFieldEmbedder(
        {"position": position_embedding},
        # We'll be ignoring masks, so we need to set this to True.
        allow_unmatched_keys=True)
    bert_word_embedder = self._get_bert_word_embedder()
    model = pytorch_models.AsMilSimultaneouslyBert(
        word_embedder,
        position_embedder,
        self.distinct_categories,
        self.distinct_polarities,
        self.vocab,
        self.configuration,
        bert_word_embedder=bert_word_embedder)
    self._print_args(model)
    model = model.to(self.configuration['device'])
    return model

def main():
    reader = SkipGramReader()
    text8 = reader.read('data/text8/text8')
    vocab = Vocabulary.from_instances(
        text8, min_count={'token_in': 5, 'token_out': 5})
    reader = SkipGramReader(vocab=vocab)
    text8 = reader.read('data/text8/text8')
    embedding_in = Embedding(num_embeddings=vocab.get_vocab_size('token_in'),
                             embedding_dim=EMBEDDING_DIM)
    embedding_out = Embedding(num_embeddings=vocab.get_vocab_size('token_out'),
                              embedding_dim=EMBEDDING_DIM)
    if CUDA_DEVICE > -1:
        embedding_in = embedding_in.to(CUDA_DEVICE)
        embedding_out = embedding_out.to(CUDA_DEVICE)
    iterator = BasicIterator(batch_size=BATCH_SIZE)
    iterator.index_with(vocab)
    # model = SkipGramNegativeSamplingModel(
    #     vocab=vocab,
    #     embedding_in=embedding_in,
    #     embedding_out=embedding_out,
    #     neg_samples=10,
    #     cuda_device=CUDA_DEVICE)
    model = SkipGramModel(vocab=vocab,
                          embedding_in=embedding_in,
                          cuda_device=CUDA_DEVICE)
    optimizer = optim.Adam(model.parameters())
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=text8,
                      num_epochs=5,
                      cuda_device=CUDA_DEVICE)
    trainer.train()
    # write_embeddings(embedding_in, 'data/text8/embeddings.txt', vocab)
    print(get_synonyms('one', embedding_in, vocab))
    print(get_synonyms('december', embedding_in, vocab))
    print(get_synonyms('flower', embedding_in, vocab))
    print(get_synonyms('design', embedding_in, vocab))
    print(get_synonyms('snow', embedding_in, vocab))
    rho = evaluate_embeddings(embedding_in, vocab)
    print('simlex999 spearman correlation: {}'.format(rho))

def build_model(vocab: Vocabulary) -> Model: print("Building the model") EMBEDDING_DIM = 6 HIDDEN_DIM = 6 token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'), embedding_dim=EMBEDDING_DIM) criterion_embedding = Embedding( num_embeddings=7, embedding_dim=EMBEDDING_DIM) # FIXME: num embeddings word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding}) lstm = PytorchSeq2SeqWrapper( torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True)) return MultiCriterionTokenizer(word_embeddings, criterion_embedding, lstm, vocab)
def main():
    reader = SkipGramReader()
    dataset = reader.read("data/cv/0/train.txt")
    vocab = Vocabulary.from_files("data/vocabulary")
    params = Params(params={})
    vocab.extend_from_instances(params, dataset)
    reader = SkipGramReader(vocab=vocab)
    dataset = reader.read("data/cv/0/train.txt")
    embedding_in = Embedding(num_embeddings=vocab.get_vocab_size('token_in'),
                             embedding_dim=EMBEDDING_DIM)
    embedding_out = Embedding(num_embeddings=vocab.get_vocab_size('token_out'),
                              embedding_dim=EMBEDDING_DIM)
    if CUDA_DEVICE > -1:
        embedding_in = embedding_in.to(CUDA_DEVICE)
        embedding_out = embedding_out.to(CUDA_DEVICE)
    iterator = BasicIterator(batch_size=BATCH_SIZE)
    iterator.index_with(vocab)
    model = SkipGramModel(vocab=vocab,
                          embedding_in=embedding_in,
                          cuda_device=CUDA_DEVICE)
    # model = SkipGramNegativeSamplingModel(
    #     vocab=vocab,
    #     embedding_in=embedding_in,
    #     embedding_out=embedding_out,
    #     neg_samples=10,
    #     cuda_device=CUDA_DEVICE)
    optimizer = optim.Adam(model.parameters())
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=dataset,
                      num_epochs=20,
                      cuda_device=CUDA_DEVICE)
    trainer.train()
    torch.save(embedding_in.state_dict(), "saved_models/word2vec.th")
    print(get_synonyms('C', embedding_in, vocab))
    print(get_synonyms('G7', embedding_in, vocab))
    print(get_synonyms('G', embedding_in, vocab))
    print(get_synonyms('F', embedding_in, vocab))
    print(get_synonyms('C7', embedding_in, vocab))

def prepare1():
    """First part of preparing data for training.

    :return: biLSTM model object, biLSTM vocabulary, data for training,
             data for validation, cuda device id, biLSTM reader object
    """
    reader = PosDatasetReader()
    train_dataset = reader.read(train_path)
    validation_dataset = reader.read(validation_path)
    vocab = Vocabulary.from_instances(train_dataset + validation_dataset)
    EMBEDDING_DIM = 200
    HIDDEN_DIM = 200
    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    lstm = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM,
                      batch_first=True, bidirectional=True))
    model = LstmTagger(word_embeddings, lstm, vocab)
    if torch.cuda.is_available():
        cuda_device = 0
        model = model.cuda(cuda_device)
    else:
        cuda_device = -1
    return model, vocab, train_dataset, validation_dataset, cuda_device, reader

def build_model(vocab: Vocabulary) -> Model: print("Building the model") vocab_size = vocab.get_vocab_size("tokens") embedder = BasicTextFieldEmbedder( {"tokens": Embedding(embedding_dim=10, num_embeddings=vocab_size)}) encoder = BagOfEmbeddingsEncoder(embedding_dim=10) return SimpleClassifier(vocab, embedder, encoder)
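# Typical wiring for build_model above, following the AllenNLP guide's pattern.
# The reader class and file path are hypothetical placeholders, not from the source.
reader = ClassificationTsvReader()               # hypothetical DatasetReader
instances = list(reader.read("data/train.tsv"))  # hypothetical path
vocab = Vocabulary.from_instances(instances)
model = build_model(vocab)
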
def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             question_generator: QuestionGenerator,
             stacked_encoder: Seq2SeqEncoder = None,
             predicate_feature_dim: int = 100,
             dim_hidden: int = 100,
             embedding_dropout: float = 0.0,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None):
    super(QuestionPredictor, self).__init__(vocab, regularizer)
    self.dim_hidden = dim_hidden
    self.text_field_embedder = text_field_embedder
    self.predicate_feature_embedding = Embedding(2, predicate_feature_dim)
    self.embedding_dropout = Dropout(p=embedding_dropout)
    self.stacked_encoder = stacked_encoder
    self.span_extractor = EndpointSpanExtractor(
        self.stacked_encoder.get_output_dim(), combination="x,y")
    self.question_generator = question_generator
    self.slot_labels = question_generator.get_slot_labels()
    self.question_metric = QuestionPredictionMetric(
        vocab, question_generator.get_slot_labels())

def build_model(vocab: Vocabulary, use_reg: bool = True) -> Model:
    print("Building the model")
    vocab_size = vocab.get_vocab_size("tokens")
    EMBED_DIMS = 300
    # Turn the tokens into 300-dim embeddings, then turn the embeddings into encodings.
    embedder = BasicTextFieldEmbedder({
        "tokens": Embedding(embedding_dim=EMBED_DIMS, num_embeddings=vocab_size)
    })
    # num_filters is a tad bit dangerous: we get this many filters for EACH ngram
    # filter size, so the output dim is num_filters * len(ngram_filter_sizes).
    encoder = CnnEncoder(
        embedding_dim=EMBED_DIMS,
        ngram_filter_sizes=(2, 3, 4, 5),
        num_filters=5)
    # encoder = BertPooler("bert-base-cased")
    # Construct the regularizer applicator.
    regularizer_applicator = None
    if use_reg:
        l2_reg = L2Regularizer()
        regexes = [("embedder", l2_reg),
                   ("encoder", l2_reg),
                   ("classifier", l2_reg)]
        regularizer_applicator = RegularizerApplicator(regexes)
    return DecompensationClassifier(vocab, embedder, encoder, regularizer_applicator)

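# Quick sanity check of the encoder configuration above: with no output projection,
# AllenNLP's CnnEncoder outputs num_filters * len(ngram_filter_sizes) features.
assert CnnEncoder(embedding_dim=300,
                  ngram_filter_sizes=(2, 3, 4, 5),
                  num_filters=5).get_output_dim() == 20  # 5 filters * 4 ngram sizes
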
def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             stacked_encoder: Seq2SeqEncoder = None,
             predicate_feature_dim: int = 0,
             embedding_dropout: float = 0.0,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None):
    super(SentenceEncoder, self).__init__()
    self._text_field_embedder = text_field_embedder
    self._stacked_encoder = stacked_encoder
    self._predicate_feature_dim = predicate_feature_dim
    self._embedding_dropout = Dropout(p=embedding_dropout)
    if self._predicate_feature_dim > 0:
        self._predicate_feature_embedding = Embedding(2, predicate_feature_dim)
    if self._stacked_encoder is not None:
        embedding_dim_with_predicate_feature = (
            self._text_field_embedder.get_output_dim() + self._predicate_feature_dim)
        if embedding_dim_with_predicate_feature != self._stacked_encoder.get_input_dim():
            raise ConfigurationError(
                "Input dimension of sentence encoder (%s) must be "
                "the sum of predicate feature dim and text embedding dim (%s)."
                % (self._stacked_encoder.get_input_dim(),
                   embedding_dim_with_predicate_feature))
    self._metric = BinaryF1()

def __init__(self,
             vocab: Vocabulary,
             sentence_encoder: SentenceEncoder,
             qarg_ffnn: FeedForward,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None):
    super(ClauseAndSpanToAnswerSlotModel, self).__init__(vocab, regularizer)
    self._sentence_encoder = sentence_encoder
    self._qarg_ffnn = qarg_ffnn
    self._clause_embedding = Embedding(
        vocab.get_vocab_size("abst-clause-labels"),
        self._qarg_ffnn.get_input_dim())
    self._span_extractor = EndpointSpanExtractor(
        input_dim=self._sentence_encoder.get_output_dim(),
        combination="x,y")
    self._span_hidden = TimeDistributed(
        Linear(2 * self._sentence_encoder.get_output_dim(),
               self._qarg_ffnn.get_input_dim()))
    self._predicate_hidden = Linear(
        self._sentence_encoder.get_output_dim(),
        self._qarg_ffnn.get_input_dim())
    self._qarg_predictor = Linear(self._qarg_ffnn.get_output_dim(),
                                  self.vocab.get_vocab_size("qarg-labels"))
    self._metric = BinaryF1()

def build_model(vocab: Vocabulary,
                embedding_dim: int,
                pretrained_file: str = None,
                initializer: InitializerApplicator = None,
                regularizer: RegularizerApplicator = None) -> Model:
    print("Building the model")
    vocab_size = vocab.get_vocab_size("tokens")
    word_vec = Embedding(embedding_dim=embedding_dim,
                         num_embeddings=vocab_size,
                         pretrained_file=pretrained_file,
                         vocab=vocab)
    embedding = BasicTextFieldEmbedder({"tokens": word_vec})
    # Use ELMo
    # options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json'
    # weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5'
    # elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)
    # embedding = BasicTextFieldEmbedder({"tokens": elmo_embedder})
    # Use BERT
    # bert_embedder = PretrainedTransformerEmbedder(
    #     model_name='bert-base-uncased',
    #     max_length=512,
    #     train_parameters=False)
    # embedding = BasicTextFieldEmbedder({"tokens": bert_embedder})
    encoder = BagOfEmbeddingsEncoder(embedding_dim=embedding_dim)
    return SimpleClassifier(vocab, embedding, encoder, initializer,
                            regularizer=regularizer)

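# Example call for build_model above, assuming a local GloVe file; the path and
# dimension are illustrative, and `vocab` is assumed to be built elsewhere.
model = build_model(vocab, embedding_dim=300,
                    pretrained_file="data/glove.6B.300d.txt")
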
def running_whole_model():
    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    iterator = BucketIterator(batch_size=BATCH_SIZE,
                              sorting_keys=[("sentence", "num_tokens"),
                                            ("structures1", "num_tokens"),
                                            ("structures2", "num_tokens"),
                                            ("structures3", "num_tokens")])
    iterator.index_with(vocab)
    model = All_generating(embed_size=EMBEDDING_DIM,
                           word_embeddings=word_embeddings,
                           vocab=vocab,
                           num_of_candidates=7)
    # optimizer = adabound.AdaBound(model.parameters(), lr=lr, final_lr=0.1)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=whole_train_dataset,
                      validation_dataset=whole_validation_dataset,
                      patience=5,
                      num_epochs=30)
    trainer.train()

def __init__(self,
             vocab: Vocabulary,
             source_embedder: TextFieldEmbedder,
             encoder: Seq2SeqEncoder,
             max_decoding_steps: int,
             target_namespace: str = "tokens",
             target_embedding_dim: int = None,
             scheduled_sampling_ratio: float = 0.0) -> None:
    super(dTPRxNet, self).__init__(vocab)
    self._source_embedder = source_embedder
    self._encoder = encoder
    self._max_decoding_steps = max_decoding_steps
    self._target_namespace = target_namespace
    self._scheduled_sampling_ratio = scheduled_sampling_ratio
    self._start_index = self.vocab.get_token_index(START_SYMBOL, self._target_namespace)
    self._end_index = self.vocab.get_token_index(END_SYMBOL, self._target_namespace)
    num_classes = self.vocab.get_vocab_size(self._target_namespace)
    # The decoder output dim needs to be the same as the encoder output dim, since we
    # initialize the decoder's hidden state with the encoder's final hidden state.
    # This is also needed if we're using attention with ``DotProductSimilarity``.
    self._decoder_output_dim = self._encoder.get_output_dim()
    target_embedding_dim = target_embedding_dim or self._source_embedder.get_output_dim()
    self._target_embedder = Embedding(num_classes, target_embedding_dim)
    self._decoder_input_dim = target_embedding_dim
    # TODO (pradeep): Do not hardcode the decoder cell type.
    self._decoder_cell = LSTMCell(self._decoder_input_dim, self._decoder_output_dim)
    self._output_projection_layer = Linear(self._decoder_output_dim, num_classes)

def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             stacked_encoder: Seq2SeqEncoder,
             binary_feature_dim: int,
             initializer: InitializerApplicator,
             embedding_dropout: float = 0.0) -> None:
    super(SemanticRoleLabeler, self).__init__(vocab)
    self.text_field_embedder = text_field_embedder
    self.num_classes = self.vocab.get_vocab_size("labels")
    # For the span-based evaluation, we don't want to consider labels
    # for the verb, because the verb index is provided to the model.
    self.span_metric = SpanBasedF1Measure(vocab,
                                          tag_namespace="labels",
                                          ignore_classes=["V"])
    self.stacked_encoder = stacked_encoder
    # There are exactly 2 binary features for the verb predicate embedding.
    self.binary_feature_embedding = Embedding(2, binary_feature_dim)
    self.tag_projection_layer = TimeDistributed(
        Linear(self.stacked_encoder.get_output_dim(), self.num_classes))
    self.embedding_dropout = Dropout(p=embedding_dropout)
    initializer(self)
    if (text_field_embedder.get_output_dim() + binary_feature_dim
            != stacked_encoder.get_input_dim()):
        raise ConfigurationError(
            "The SRL Model uses a binary verb indicator feature, meaning "
            "the input dimension of the stacked_encoder must be equal to "
            "the output dimension of the text_field_embedder plus the "
            "binary feature embedding dim.")

def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             encoder: Seq2SeqEncoder,
             binary_feature_dim: int,
             embedding_dropout: float = 0.0,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None,
             label_smoothing: float = None,
             ignore_span_metric: bool = False) -> None:
    super(SemanticRoleLabeler, self).__init__(vocab, regularizer)
    self.text_field_embedder = text_field_embedder
    self.num_classes = self.vocab.get_vocab_size("labels")
    # For the span-based evaluation, we don't want to consider labels
    # for the verb, because the verb index is provided to the model.
    self.span_metric = SpanBasedF1Measure(vocab,
                                          tag_namespace="labels",
                                          ignore_classes=["V"])
    self.encoder = encoder
    # There are exactly 2 binary features for the verb predicate embedding.
    self.binary_feature_embedding = Embedding(2, binary_feature_dim)
    self.tag_projection_layer = TimeDistributed(
        Linear(self.encoder.get_output_dim(), self.num_classes))
    self.embedding_dropout = Dropout(p=embedding_dropout)
    self._label_smoothing = label_smoothing
    self.ignore_span_metric = ignore_span_metric
    check_dimensions_match(text_field_embedder.get_output_dim() + binary_feature_dim,
                           encoder.get_input_dim(),
                           "text embedding dim + verb indicator embedding dim",
                           "encoder input dim")
    initializer(self)

def running_NER():
    reader = PosDatasetReader()
    train_dataset = reader.read('../data/700_multi_data/600_ner_train.txt')
    validation_dataset = reader.read('../data/700_multi_data/66_ner_test.txt')
    vocab = Vocabulary.from_files("../model_store/vocabulary")
    # '''vocab part'''
    # train_1 = reader.read('../data/train/train.json')
    # train_2 = reader.read('../data/train/dev.json')
    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    lstm = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))
    model = LstmTagger(word_embeddings, lstm, vocab)
    optimizer = optim.SGD(model.parameters(), lr=0.1)
    iterator = BucketIterator(batch_size=2,
                              sorting_keys=[("sentence", "num_tokens")])
    iterator.index_with(vocab)
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      patience=10,
                      num_epochs=1000)
    trainer.train()

def build_model(vocab: Vocabulary) -> Model:
    # "tokens" is the namespace used by the dataset reader's token indexers.
    vocab_size = vocab.get_vocab_size("tokens")
    embedder = BasicTextFieldEmbedder(
        {"tokens": Embedding(embedding_dim=10, num_embeddings=vocab_size)})
    encoder = BagOfEmbeddingsEncoder(embedding_dim=10)
    return SimpleClassifier(vocab, embedder, encoder)

def generate_res_file():
    reader = PosDatasetReader()
    vocab = Vocabulary.from_files("../model_store/vocabulary")
    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    lstm = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))
    model2 = LstmTagger(word_embeddings, lstm, vocab)
    with open("../model_store/model.th", 'rb') as f:
        model2.load_state_dict(torch.load(f))
    predictor2 = SentenceTaggerPredictor(model2, dataset_reader=reader)
    train_read_file = open('../data/only_sentence/raw_test.json', 'r')
    train_write_file = open('../data/only_sentence/ner_test.json', 'w')
    for line in train_read_file:
        tag_logits2 = predictor2.predict(
            line.replace('.', '').replace(',', '').replace('\n', ''))['tag_logits']
        tag_ids = np.argmax(tag_logits2, axis=-1)
        res = [model2.vocab.get_token_from_index(i, 'labels') for i in tag_ids]
        for i in range(len(res)):
            train_write_file.write(res[i] + ' ')
        # train_write_file.write(str(tag_logits2))
        train_write_file.write('\n')
        train_write_file.flush()
    train_read_file.close()
    train_write_file.close()
    print('finish')

# generate_res_file()

def __init__(self, num_classes: int, input_dim: int, output_dim: int) -> None:
    super().__init__()
    self.embedder = Embedding(num_classes, input_dim)
    self.decoder_cell = GRUCell(input_dim, output_dim)
    self.output_projection_layer = Linear(output_dim, num_classes)
    self.recall = UnigramRecall()

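# A single decoding step with the module above, as a minimal sketch. The class
# name `Decoder` and all shapes are assumptions for illustration only; the
# __init__ is assumed to belong to a torch.nn.Module subclass.
import torch
decoder = Decoder(num_classes=50, input_dim=32, output_dim=64)  # hypothetical class
last_predictions = torch.tensor([3, 7])          # (batch,) previous token ids
hidden = torch.zeros(2, 64)                      # (batch, output_dim) initial state
embedded = decoder.embedder(last_predictions)    # (batch, input_dim)
hidden = decoder.decoder_cell(embedded, hidden)  # (batch, output_dim)
logits = decoder.output_projection_layer(hidden) # (batch, num_classes)
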
def __init__(self,
             vocab: Vocabulary,
             number_of_branch_map: Dict[str, int],
             child_state_generator: ChildStateGenerator,
             target_namespace: str = "equation_vocab",
             embedding_size: int = 128,
             hidden_size: int = 512,
             beam_size: int = 5,
             initializer: InitializerApplicator = InitializerApplicator()) -> None:
    super().__init__(vocab, target_namespace)
    # Since nodes can have an arbitrary number of children, we need a map to look it up.
    self.number_of_branch_map = number_of_branch_map
    # GTS modules.
    self._source_vocab_size = self.vocab.get_vocab_size("tokens")
    self.encoder = EncoderSeq(input_size=self._source_vocab_size,
                              embedding_size=embedding_size,
                              hidden_size=hidden_size,
                              n_layers=2)
    self.predict = Prediction(hidden_size=hidden_size,
                              op_nums=self.num_operations,
                              input_size=self.num_constants)
    self._target_embedder = Embedding(num_embeddings=self.num_operations,
                                      embedding_dim=embedding_size)
    self.merge = Merge(hidden_size=hidden_size, embedding_size=embedding_size)
    # The generator produces an arbitrary number of child states.
    self.generator = child_state_generator
    # At prediction time, we'll use beam search to find the best target sequence.
    self._beam_size = beam_size
    initializer(self)