def __init__(self,
             vocab: Vocabulary,
             embedder: TextFieldEmbedder,
             encoder: Seq2VecEncoder,
             posclass_weight: Optional[float] = 1,
             use_power: Optional[bool] = False,
             dropout: Optional[float] = 0) -> None:
    super().__init__(vocab)
    self.embedder = embedder
    self.encoder = encoder
    if use_power:
        self.classifier = torch.nn.Linear(
            in_features=encoder.get_output_dim() + 1,
            out_features=vocab.get_vocab_size('labels'))
    else:
        self.classifier = torch.nn.Linear(
            in_features=encoder.get_output_dim(),
            out_features=vocab.get_vocab_size('labels'))
    self.use_power = use_power
    self.f1_lie = F1Measure(vocab.get_token_index('False', 'labels'))
    self.f1_truth = F1Measure(vocab.get_token_index('True', 'labels'))
    self.micro_f1 = FBetaMeasure(average='micro')
    self.macro_f1 = FBetaMeasure(average='macro')
    weights = [1, 1]
    weights[vocab.get_token_index('False', 'labels')] = posclass_weight
    self.loss = torch.nn.CrossEntropyLoss(weight=torch.Tensor(weights))
    self.dropout = torch.nn.Dropout(dropout)
def get_masked_copynet_with_attention(vocab: Vocabulary,
                                      max_decoding_steps: int = 20,
                                      beam_size: int = 1) -> MaskedCopyNet:
    word_embeddings = Embedding(
        num_embeddings=vocab.get_vocab_size("tokens"),
        embedding_dim=EMB_DIM)
    word_embeddings = BasicTextFieldEmbedder({"tokens": word_embeddings})

    masker_embeddings = Embedding(
        num_embeddings=vocab.get_vocab_size("mask_tokens"),
        embedding_dim=MASK_EMB_DIM)
    masker_embeddings = BasicTextFieldEmbedder({"tokens": masker_embeddings})

    attention = AdditiveAttention(vector_dim=HID_DIM * 2, matrix_dim=HID_DIM * 2)
    mask_attention = AdditiveAttention(vector_dim=HID_DIM * 2, matrix_dim=MASK_EMB_DIM)
    lstm = PytorchSeq2SeqWrapper(nn.LSTM(EMB_DIM, HID_DIM, batch_first=True, bidirectional=True))

    return MaskedCopyNet(
        vocab=vocab,
        embedder=word_embeddings,
        encoder=lstm,
        max_decoding_steps=max_decoding_steps,
        attention=attention,
        mask_embedder=masker_embeddings,
        mask_attention=mask_attention,
        beam_size=beam_size)
def __init__(self,
             vocab: Vocabulary,
             embedder: TokenEmbedder,
             sim_file_path: str,
             window_size: int = 4,
             num_neg_samples: int = 5,
             neg_exponent: float = 0.75,
             cuda_device: int = -1) -> None:
    super().__init__(vocab)
    self._device = f'cuda:{cuda_device}' if torch.cuda.is_available() and cuda_device >= 0 else 'cpu'
    self._ws353 = WS353(sim_file_path)
    self.embedder = embedder
    self._window_size = window_size
    self._num_neg_samples = num_neg_samples
    self.output_layer = nn.Linear(embedder.get_output_dim(), vocab.get_vocab_size('words'))

    # negative sampling with word frequency distribution
    self._word_dist = torch.zeros(vocab.get_vocab_size('words'))
    if vocab._retained_counter:
        for word, count in vocab._retained_counter['words'].items():
            word_idx = vocab.get_token_index(token=word, namespace='words')
            self._word_dist[word_idx] = count
    # prevent the sampling process from choosing pad and unk tokens
    self._word_dist[vocab.get_token_index(token=vocab._padding_token, namespace='words')] = 0
    self._word_dist[vocab.get_token_index(token=vocab._oov_token, namespace='words')] = 0
    # prevent frequent words from being sampled too often
    self._word_dist = torch.pow(self._word_dist, neg_exponent)
def __init__(self,
             vocab: Vocabulary,
             embedder: TextFieldEmbedder,
             encoder: Seq2SeqEncoder,
             dropout: float = 0.1,
             ff_dim: int = 100):
    super().__init__(vocab)
    self.embedder = embedder
    self.encoder = encoder
    assert self.embedder.get_output_dim() == self.encoder.get_input_dim()
    self.feedforward = FeedForward(
        encoder.get_output_dim(), 1,
        hidden_dims=ff_dim,
        activations=Activation.by_name('relu')(),
        dropout=dropout)
    self.out = torch.nn.Linear(
        in_features=self.feedforward.get_output_dim(),
        out_features=vocab.get_vocab_size('labels'))
    self.crf = ConditionalRandomField(vocab.get_vocab_size('labels'))
    self.f1 = FBetaMeasure(average='micro')
    self.accuracy = CategoricalAccuracy()
    self.idx_to_label = vocab.get_index_to_token_vocabulary('labels')
def test_namespaces(self):
    vocab = Vocabulary()
    initial_vocab_size = vocab.get_vocab_size()
    word_index = vocab.add_token_to_namespace(u"word", namespace=u'1')
    assert u"word" in list(vocab.get_index_to_token_vocabulary(namespace=u'1').values())
    assert vocab.get_token_index(u"word", namespace=u'1') == word_index
    assert vocab.get_token_from_index(word_index, namespace=u'1') == u"word"
    assert vocab.get_vocab_size(namespace=u'1') == initial_vocab_size + 1

    # Now add it again, in a different namespace and a different word, and make sure it's like
    # new.
    word2_index = vocab.add_token_to_namespace(u"word2", namespace=u'2')
    word_index = vocab.add_token_to_namespace(u"word", namespace=u'2')
    assert u"word" in list(vocab.get_index_to_token_vocabulary(namespace=u'2').values())
    assert u"word2" in list(vocab.get_index_to_token_vocabulary(namespace=u'2').values())
    assert vocab.get_token_index(u"word", namespace=u'2') == word_index
    assert vocab.get_token_index(u"word2", namespace=u'2') == word2_index
    assert vocab.get_token_from_index(word_index, namespace=u'2') == u"word"
    assert vocab.get_token_from_index(word2_index, namespace=u'2') == u"word2"
    assert vocab.get_vocab_size(namespace=u'2') == initial_vocab_size + 2
def __init__(self,
             vocab: Vocabulary,
             bert_embedder: Optional[PretrainedBertEmbedder] = None,
             encoder: Optional[Seq2SeqEncoder] = None,
             dropout: Optional[float] = None,
             use_crf: bool = True) -> None:
    super().__init__(vocab)
    if bert_embedder:
        self.use_bert = True
        self.bert_embedder = bert_embedder
    else:
        self.use_bert = False
        self.basic_embedder = BasicTextFieldEmbedder({
            "tokens": Embedding(vocab.get_vocab_size(namespace="tokens"), 1024)
        })
        self.rnn = Seq2SeqEncoder.from_params(Params({
            "type": "lstm",
            "input_size": 1024,
            "hidden_size": 512,
            "bidirectional": True,
            "batch_first": True
        }))

    self.encoder = encoder
    if encoder:
        hidden2tag_in_dim = encoder.get_output_dim()
    else:
        hidden2tag_in_dim = bert_embedder.get_output_dim()
    self.hidden2tag = TimeDistributed(torch.nn.Linear(
        in_features=hidden2tag_in_dim,
        out_features=vocab.get_vocab_size("labels")))

    if dropout:
        self.dropout = torch.nn.Dropout(dropout)
    else:
        self.dropout = None

    self.use_crf = use_crf
    if use_crf:
        crf_constraints = allowed_transitions(
            constraint_type="BIO",
            labels=vocab.get_index_to_token_vocabulary("labels"))
        self.crf = ConditionalRandomField(
            num_tags=vocab.get_vocab_size("labels"),
            constraints=crf_constraints,
            include_start_end_transitions=True)
    self.f1 = SpanBasedF1Measure(vocab,
                                 tag_namespace="labels",
                                 ignore_classes=["news/type", "negation",
                                                 "demonstrative_reference",
                                                 "timer/noun", "timer/attributes"],
                                 label_encoding="BIO")
def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             contextualizer: Seq2SeqEncoder,
             dropout: float = None,
             num_samples: int = None,
             sparse_embeddings: bool = False,
             bidirectional: bool = False,
             initializer: InitializerApplicator = None,
             regularizer: Optional[RegularizerApplicator] = None) -> None:
    super().__init__(vocab, regularizer)
    self._text_field_embedder = text_field_embedder

    if contextualizer.is_bidirectional() is not bidirectional:
        raise ConfigurationError(
            "Bidirectionality of contextualizer must match bidirectionality of "
            "language model. "
            f"Contextualizer bidirectional: {contextualizer.is_bidirectional()}, "
            f"language model bidirectional: {bidirectional}")

    self._contextualizer = contextualizer
    self._bidirectional = bidirectional

    # The dimension for making predictions just in the forward
    # (or backward) direction.
    if self._bidirectional:
        self._forward_dim = contextualizer.get_output_dim() // 2
    else:
        self._forward_dim = contextualizer.get_output_dim()

    # TODO(joelgrus): more sampled softmax configuration options, as needed.
    if num_samples is not None:
        self._softmax_loss = SampledSoftmaxLoss(
            num_words=vocab.get_vocab_size(),
            embedding_dim=self._forward_dim,
            num_samples=num_samples,
            sparse=sparse_embeddings)
    else:
        self._softmax_loss = _SoftmaxLoss(num_words=vocab.get_vocab_size(),
                                          embedding_dim=self._forward_dim)

    # This buffer is now unused and exists only for backwards compatibility reasons.
    self.register_buffer("_last_average_loss", torch.zeros(1))

    self._perplexity = Perplexity()

    if dropout:
        self._dropout = torch.nn.Dropout(dropout)
    else:
        self._dropout = lambda x: x

    if initializer is not None:
        initializer(self)
def __init__(self, vocab: Vocabulary) -> None:
    super().__init__(vocab)
    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_SIZE)
    self.embedder = BasicTextFieldEmbedder({"tokens": token_embedding})
    self.rnn = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(EMBEDDING_SIZE, HIDDEN_SIZE, batch_first=True))
    self.hidden2out = torch.nn.Linear(in_features=self.rnn.get_output_dim(),
                                      out_features=vocab.get_vocab_size('tokens'))
def __init__(self,
             cf_a,  # Configuration object
             vocab: Vocabulary) -> None:
    # We pass the vocabulary to the base model; AllenNLP requires it.
    super().__init__(vocab)
    self.cf_a = cf_a
    self.loss_func = cf_a.loss_func
    self.prior = cf_a.LSTM_prior

    """ Token embedding """
    self.word_embeddings = self.get_embedder(vocab,
                                             cf_a.Word_embedding_dim,
                                             cf_a.char_embeddedng_dim,
                                             cf_a.CNN_num_filters,
                                             cf_a.CNN_encoder_dim)
    self.encoder = self.get_sec2vec_encoder(
        cf_a.CNN_encoder_dim + cf_a.Word_embedding_dim, cf_a.LSTM_H)

    if cf_a.Bayesian_Linear:
        self.hidden2tag = LinearVB(
            in_features=self.cf_a.LSTM_H,
            out_features=vocab.get_vocab_size('tags_country'),
            bias=True,
            prior=cf_a.Linear_output_prior)
    else:
        self.hidden2tag = torch.nn.Linear(
            in_features=self.cf_a.LSTM_H,
            out_features=vocab.get_vocab_size('tags_country'))

    self.accuracy = CategoricalAccuracy()

    """
    Lists of Bayesian and standard linear models. Using these lists we can easily
    apply the special requirements of VB models and analyze the weights in the network.
    """
    self.VBmodels = []
    self.LinearModels = []
    if cf_a.Bayesian_Linear:
        self.VBmodels.append(self.hidden2tag)
    else:
        self.LinearModels.append(self.hidden2tag)

    if cf_a.Bayesian_LSTM:
        self.VBmodels.extend(self.encoder.get_LSTMCells())
    else:
        self.LinearModels.extend(self.encoder.get_LSTMCells())
def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             contextualizer: Seq2SeqEncoder,
             dropout: float = None,
             loss_scale: Union[float, str] = 1.0,
             num_samples: int = None,
             sparse_embeddings: bool = False,
             bidirectional: bool = False,
             initializer: InitializerApplicator = None) -> None:
    super().__init__(vocab)
    self._text_field_embedder = text_field_embedder

    if contextualizer.is_bidirectional() is not bidirectional:
        raise ConfigurationError(
            "Bidirectionality of contextualizer must match bidirectionality of "
            "language model. "
            f"Contextualizer bidirectional: {contextualizer.is_bidirectional()}, "
            f"language model bidirectional: {bidirectional}")

    self._contextualizer = contextualizer
    self._bidirectional = bidirectional

    # The dimension for making predictions just in the forward
    # (or backward) direction.
    if self._bidirectional:
        self._forward_dim = contextualizer.get_output_dim() // 2
    else:
        self._forward_dim = contextualizer.get_output_dim()

    # TODO(joelgrus): more sampled softmax configuration options, as needed.
    if num_samples is not None:
        self._softmax_loss = SampledSoftmaxLoss(
            num_words=vocab.get_vocab_size(),
            embedding_dim=self._forward_dim,
            num_samples=num_samples,
            sparse=sparse_embeddings)
    else:
        self._softmax_loss = _SoftmaxLoss(num_words=vocab.get_vocab_size(),
                                          embedding_dim=self._forward_dim)

    # TODO(brendanr): Output perplexity here. e^loss
    self.register_buffer('_last_average_loss', torch.zeros(1))

    if dropout:
        self._dropout = torch.nn.Dropout(dropout)
    else:
        self._dropout = lambda x: x

    self._loss_scale = loss_scale
    if initializer is not None:
        initializer(self)
def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             contextualizer: Seq2SeqEncoder,
             dropout: float = None,
             num_samples: int = None,
             sparse_embeddings: bool = False,
             bidirectional: bool = False,
             initializer: InitializerApplicator = None) -> None:
    super().__init__(vocab)
    self._text_field_embedder = text_field_embedder

    if contextualizer.is_bidirectional() is not bidirectional:
        raise ConfigurationError(
            "Bidirectionality of contextualizer must match bidirectionality of "
            "language model. "
            f"Contextualizer bidirectional: {contextualizer.is_bidirectional()}, "
            f"language model bidirectional: {bidirectional}")

    self._contextualizer = contextualizer
    self._bidirectional = bidirectional

    # The dimension for making predictions just in the forward
    # (or backward) direction.
    if self._bidirectional:
        self._forward_dim = contextualizer.get_output_dim() // 2
    else:
        self._forward_dim = contextualizer.get_output_dim()

    # TODO(joelgrus): more sampled softmax configuration options, as needed.
    if num_samples is not None:
        self._softmax_loss = SampledSoftmaxLoss(num_words=vocab.get_vocab_size(),
                                                embedding_dim=self._forward_dim,
                                                num_samples=num_samples,
                                                sparse=sparse_embeddings)
    else:
        self._softmax_loss = _SoftmaxLoss(num_words=vocab.get_vocab_size(),
                                          embedding_dim=self._forward_dim)

    # TODO(brendanr): Output perplexity here. e^loss
    self.register_buffer('_last_average_loss', torch.zeros(1))

    if dropout:
        self._dropout = torch.nn.Dropout(dropout)
    else:
        self._dropout = lambda x: x

    if initializer is not None:
        initializer(self)
def main():
    reader = SkipGramReader()
    dataset = reader.read("data/cv/0/train.txt")
    vocab = Vocabulary().from_files("data/vocabulary")
    params = Params(params={})
    vocab.extend_from_instances(params, dataset)
    reader = SkipGramReader(vocab=vocab)
    dataset = reader.read("data/cv/0/train.txt")

    embedding_in = Embedding(num_embeddings=vocab.get_vocab_size('token_in'),
                             embedding_dim=EMBEDDING_DIM)
    embedding_out = Embedding(num_embeddings=vocab.get_vocab_size('token_out'),
                              embedding_dim=EMBEDDING_DIM)
    if CUDA_DEVICE > -1:
        embedding_in = embedding_in.to(CUDA_DEVICE)
        embedding_out = embedding_out.to(CUDA_DEVICE)

    iterator = BasicIterator(batch_size=BATCH_SIZE)
    iterator.index_with(vocab)

    model = SkipGramModel(vocab=vocab,
                          embedding_in=embedding_in,
                          cuda_device=CUDA_DEVICE)
    # model = SkipGramNegativeSamplingModel(
    #     vocab=vocab,
    #     embedding_in=embedding_in,
    #     embedding_out=embedding_out,
    #     neg_samples=10,
    #     cuda_device=CUDA_DEVICE)

    optimizer = optim.Adam(model.parameters())

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=dataset,
                      num_epochs=20,
                      cuda_device=CUDA_DEVICE)
    trainer.train()

    torch.save(embedding_in.state_dict(), "saved_models/word2vec.th")

    print(get_synonyms('C', embedding_in, vocab))
    print(get_synonyms('G7', embedding_in, vocab))
    print(get_synonyms('G', embedding_in, vocab))
    print(get_synonyms('F', embedding_in, vocab))
    print(get_synonyms('C7', embedding_in, vocab))
def test_add_word_to_index_gives_consistent_results(self):
    vocab = Vocabulary()
    initial_vocab_size = vocab.get_vocab_size()
    word_index = vocab.add_token_to_namespace("word")
    assert "word" in vocab.get_index_to_token_vocabulary().values()
    assert vocab.get_token_index("word") == word_index
    assert vocab.get_token_from_index(word_index) == "word"
    assert vocab.get_vocab_size() == initial_vocab_size + 1

    # Now add it again, and make sure nothing changes.
    vocab.add_token_to_namespace("word")
    assert "word" in vocab.get_index_to_token_vocabulary().values()
    assert vocab.get_token_index("word") == word_index
    assert vocab.get_token_from_index(word_index) == "word"
    assert vocab.get_vocab_size() == initial_vocab_size + 1
def __init__(self,
             word_embeddings: TextFieldEmbedder,
             encoder: Seq2VecEncoder,
             vocab: Vocabulary) -> None:
    super().__init__(vocab)
    self.word_embeddings = word_embeddings
    self.encoder = encoder
    self.hidden2decision = torch.nn.Linear(
        in_features=encoder.get_output_dim(),
        out_features=vocab.get_vocab_size("grammaticality_labels"))
    self.loss_function = nn.CrossEntropyLoss()
    self.accuracy = CategoricalAccuracy()
    self.vocab = vocab
    self.specificAccuracies = {}
    for ind in range(vocab.get_vocab_size(namespace="ugtype_labels")):
        self.specificAccuracies[ind] = CategoricalAccuracy()
def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             contextualizer: Seq2SeqEncoder,
             forward_segmental_contextualizer: Seq2SeqEncoder,
             backward_segmental_contextualizer: Seq2SeqEncoder,
             label_feature_dim: int,
             softmax_projection_dim: int,
             label_namespace: str = "labels",
             dropout: float = None,
             num_samples: int = None,
             sparse_embeddings: bool = False,
             bidirectional: bool = True,
             initializer: InitializerApplicator = None) -> None:
    super().__init__(vocab=vocab,
                     text_field_embedder=text_field_embedder,
                     contextualizer=contextualizer,
                     dropout=dropout,
                     num_samples=num_samples,
                     sparse_embeddings=sparse_embeddings,
                     bidirectional=bidirectional,
                     initializer=initializer)
    self._forward_segmental_contextualizer = forward_segmental_contextualizer
    self._backward_segmental_contextualizer = backward_segmental_contextualizer

    if num_samples is not None:
        self._softmax_loss = SampledSoftmaxLoss(
            num_words=vocab.get_vocab_size(),
            embedding_dim=softmax_projection_dim,
            num_samples=num_samples,
            sparse=sparse_embeddings)
    else:
        self._softmax_loss = _SoftmaxLoss(
            num_words=vocab.get_vocab_size(),
            embedding_dim=softmax_projection_dim)

    self.num_classes = self.vocab.get_vocab_size(label_namespace)
    self.label_feature_embedding = Embedding(self.num_classes, label_feature_dim)

    base_dim = contextualizer.get_output_dim() // 2
    seg_dim = base_dim + label_feature_dim
    self._forward_dim = softmax_projection_dim

    self.pre_segmental_layer = TimeDistributed(Linear(seg_dim, softmax_projection_dim))
    self.projection_layer = TimeDistributed(Linear(base_dim * 2, softmax_projection_dim))
def generate_distance_target(index, eps=1):
    vocab = Vocabulary().from_files("data/vocabulary")
    vocab_size = vocab.get_vocab_size()
    weight = np.zeros((vocab_size, vocab_size))

    if index == 0:
        dist_func = distance_0
    if index == 1:
        dist_func = distance_1
    if index == 2:
        dist_func = distance_2

    for i in range(vocab_size):
        chord_i = vocab.get_token_from_index(i)
        for j in range(vocab_size):
            chord_j = vocab.get_token_from_index(j)
            if "@" in chord_i or "@" in chord_j:
                M = 1 - distance_0(chord_i, chord_j)
            else:
                dist = dist_func(chord_i, chord_j)
                M = 1 / (dist + eps)
            weight[i][j] = M

    max_value = np.max(weight)
    weight /= max_value
    weight = torch.from_numpy(weight).float()

    if not os.path.isdir("data/targets/"):
        os.makedirs("data/targets/")
    torch.save(weight, "data/targets/target_distance_{}.th".format(index))
def __init__(self,
             vocab: Vocabulary,
             source_text_embedder: TextFieldEmbedder,
             encoder: Seq2VecEncoder,
             tied_source_embedder_key: Optional[str] = None,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None,
             positive_label: str = "algebra",
             target_namespace: str = "tokens") -> None:
    super(TextClassifier, self).__init__(vocab, regularizer)
    self._source_text_embedder = source_text_embedder
    self._target_namespace = target_namespace
    self._encoder = encoder
    self._linear = torch.nn.Linear(in_features=encoder.get_output_dim(),
                                   out_features=vocab.get_vocab_size('labels'))
    self._start_index = self.vocab.get_token_index(START_SYMBOL, self._target_namespace)
    self._end_index = self.vocab.get_token_index(END_SYMBOL, self._target_namespace)

    self.accuracy = CategoricalAccuracy()
    positive_label = vocab.get_token_index(positive_label, namespace='labels')
    # For computing precision, recall, and F1.
    self.f1_measure = F1Measure(positive_label)

    # The loss function combines log-softmax and NLL loss, so its input is logits.
    self.loss_function = torch.nn.CrossEntropyLoss()

    initializer(self)
def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             contextualizer: Seq2SeqEncoder,
             layer_norm: Optional[MaskedLayerNorm] = None,
             dropout: float = None,
             loss_scale: Union[float, str] = 1.0,
             remove_bos_eos: bool = True) -> None:
    super().__init__(vocab)
    self._text_field_embedder = text_field_embedder
    self._layer_norm = layer_norm or (lambda x: x)

    if not contextualizer.is_bidirectional():
        raise ConfigurationError("contextualizer must be bidirectional")

    self._contextualizer = contextualizer
    # The dimension for making predictions just in the forward
    # (or backward) direction.
    self._forward_dim = contextualizer.get_output_dim() // 2

    # TODO(joelgrus): Allow SampledSoftmaxLoss here by configuration
    self._softmax_loss = _SoftmaxLoss(num_words=vocab.get_vocab_size(),
                                      embedding_dim=self._forward_dim)

    self.register_buffer('_last_average_loss', torch.zeros(1))

    if dropout:
        self._dropout = torch.nn.Dropout(dropout)
    else:
        self._dropout = lambda x: x

    self._loss_scale = loss_scale
    self._remove_bos_eos = remove_bos_eos
def __init__(self,
             vocab: Vocabulary,
             embedding_target: TokenEmbedder,
             embedding_context: TokenEmbedder,
             neg_samples=10,
             cuda_device=-1):
    super().__init__(vocab)
    self.embedding_target = embedding_target
    self.embedding_context = embedding_context
    self.neg_samples = neg_samples
    self.cuda_device = cuda_device

    # Pre-compute probability for negative sampling
    if vocab is not None and 'token_target' in vocab._retained_counter:
        token_to_probs = {}
        token_counts = vocab._retained_counter['token_target']  # HACK
        total_counts = sum(token_counts.values())
        total_probs = 0.
        for token, counts in token_counts.items():
            unigram_freq = counts / total_counts
            unigram_freq = math.pow(unigram_freq, 0.75)
            token_to_probs[token] = unigram_freq
            total_probs += unigram_freq

        self.neg_sample_probs = np.ndarray((vocab.get_vocab_size('token_target'),))
        for token_id, token in vocab.get_index_to_token_vocabulary('token_target').items():
            self.neg_sample_probs[token_id] = token_to_probs.get(token, 0) / total_probs
    else:
        print('You need to construct vocab from instances to record the token count statistics')
def build_model(vocab: Vocabulary,
                embedding_dim: int,
                pretrained_file: str = None,
                initializer: InitializerApplicator = None,
                regularizer: RegularizerApplicator = None) -> Model:
    print("Building the model")
    vocab_size = vocab.get_vocab_size("tokens")
    word_vec = Embedding(embedding_dim=embedding_dim,
                         num_embeddings=vocab_size,
                         pretrained_file=pretrained_file,
                         vocab=vocab)
    embedding = BasicTextFieldEmbedder({"tokens": word_vec})

    # Use ELMo
    # options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json'
    # weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5'
    # elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)
    # embedding = BasicTextFieldEmbedder({"tokens": elmo_embedder})

    # Use BERT
    # bert_embedder = PretrainedTransformerEmbedder(
    #     model_name='bert-base-uncased',
    #     max_length=512,
    #     train_parameters=False
    # )
    # embedding = BasicTextFieldEmbedder({"tokens": bert_embedder})

    encoder = BagOfEmbeddingsEncoder(embedding_dim=embedding_dim)
    return SimpleClassifier(vocab, embedding, encoder, initializer, regularizer=regularizer)
def __init__(self,
             text_field_embedder: TextFieldEmbedder,
             type_field_embedder: TextFieldEmbedder,
             vocab: Vocabulary) -> None:
    super().__init__(vocab)
    self.text_field_embedder = text_field_embedder
    # self.hidden2tag = torch.nn.Linear(in_features=self.text_field_embedder.get_output_dim(),
    #                                   out_features=vocab.get_vocab_size('labels'))
    self.type_field_embedder = type_field_embedder
    self.hidden2medium = torch.nn.Linear(
        in_features=self.text_field_embedder.get_output_dim() + 20,
        out_features=int((self.text_field_embedder.get_output_dim() + 20) / 2))
    self.dropout = torch.nn.Dropout(0.2)
    self.medium2tag = torch.nn.Linear(
        in_features=int((self.text_field_embedder.get_output_dim() + 20) / 2),
        out_features=vocab.get_vocab_size("labels"))
    self.loss = FocalLoss()
    self.metrics = {
        # "accuracy": CategoricalAccuracy(),
        "f1_measure": SpanBasedF1Measure(vocabulary=vocab,
                                         tag_namespace="labels",
                                         ignore_classes=[""])
    }
def add_task(self, task_tag: str, vocab: Vocabulary):
    self.classification_layers.append(
        torch.nn.Linear(in_features=self.hidden_dim,
                        out_features=vocab.get_vocab_size('labels')))
    self.num_task = self.num_task + 1
    self.task2id[task_tag] = self.num_task
    self.tasks_vocabulary[task_tag] = vocab
def __init__(self,
             vocab: Vocabulary,
             bert_model: Union[str, BertModel],
             dropout: float = 0.0,
             num_labels: int = None,
             index: str = "bert",
             label_namespace: str = "labels",
             trainable: bool = True,
             initializer: InitializerApplicator = InitializerApplicator()) -> None:
    super().__init__(vocab)

    if isinstance(bert_model, str):
        self.bert_model = PretrainedBertModel.load(bert_model)
    else:
        self.bert_model = bert_model
    self.bert_model.requires_grad = trainable

    in_features = self.bert_model.config.hidden_size
    if num_labels:
        out_features = num_labels
    else:
        out_features = vocab.get_vocab_size(label_namespace)

    self._dropout = torch.nn.Dropout(p=dropout)
    self._tagger_layer = torch.nn.Linear(in_features, out_features)
    self._span_f1 = SpanBasedF1Measure(vocab, label_namespace, label_encoding='BIO')
    self._loss = torch.nn.CrossEntropyLoss()
    self._index = index
    initializer(self._tagger_layer)
def __init__(self,
             vocab: Vocabulary,
             embedder: TextFieldEmbedder,
             encoder: Seq2SeqEncoder = None,
             dropout: float = 0.5,
             label_namespace: str = "entity_tags") -> None:
    super().__init__(vocab)
    self.vocab = vocab
    self.embedder = embedder
    self.encoder = encoder
    self.dropout = Dropout(dropout)
    self.label_namespace = label_namespace
    self.labels = vocab.get_index_to_token_vocabulary(label_namespace)
    num_labels = vocab.get_vocab_size(label_namespace)
    self.label_projection_layer = TimeDistributed(
        torch.nn.Linear(
            embedder.get_output_dim() if encoder is None else encoder.get_output_dim(),
            num_labels))
    self.crf = ConditionalRandomField(num_labels, include_start_end_transitions=True)
    self.metrics = {
        "span_f1": SpanBasedF1Measure(vocab,
                                      tag_namespace=label_namespace,
                                      label_encoding="BIO"),
        "accuracy": CategoricalAccuracy(),
    }
def __init__(self,
             vocab: Vocabulary,
             action_embedding_dim: int,
             text_field_embedder: TextFieldEmbedder = None,
             dropout: float = 0.0,
             rule_namespace: str = 'rule_labels',
             debug: bool = False,
             regularizer: Optional[RegularizerApplicator] = None) -> None:
    super(DROPParserBase, self).__init__(vocab=vocab, regularizer=regularizer)

    self._denotation_accuracy = Average()
    self._consistency = Average()

    if dropout > 0:
        self._dropout = torch.nn.Dropout(p=dropout)
    else:
        self._dropout = lambda x: x

    self._rule_namespace = rule_namespace

    # This flag turns on the debugging mode which prints a bunch of stuff in self.decode
    # (inside functions as well).
    self._debug = debug

    self._action_embedder = Embedding(num_embeddings=vocab.get_vocab_size(self._rule_namespace),
                                      embedding_dim=action_embedding_dim,
                                      vocab_namespace=self._rule_namespace)
    self._action_embedding_dim = action_embedding_dim
    # This is what we pass as input in the first step of decoding, when we don't have a
    # previous action.
    self._first_action_embedding = torch.nn.Parameter(torch.FloatTensor(action_embedding_dim))
    torch.nn.init.normal_(self._first_action_embedding, mean=0.0, std=0.001)
def build_seq2seq_model(flags,
                        data_reader,
                        vocab: Vocabulary,
                        source_namespace: str = 'source_tokens',
                        target_namespace: str = 'target_tokens') -> Model:
    source_embedding = Embedding(
        vocab.get_vocab_size(namespace=source_namespace),
        embedding_dim=flags.source_embedding_dim)
    source_embedder = BasicTextFieldEmbedder({'tokens': source_embedding})
    lstm_encoder = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(flags.source_embedding_dim,
                      flags.encoder_hidden_dim,
                      batch_first=True,
                      bidirectional=flags.encoder_bidirectional))
    attention = DotProductAttention()
    model = SimpleSeq2Seq(vocab,
                          source_embedder,
                          lstm_encoder,
                          flags.max_decode_length,
                          target_embedding_dim=flags.decoder_hidden_dim,
                          target_namespace=target_namespace,
                          attention=attention,
                          beam_size=flags.beam_size,
                          use_bleu=True)
    return model
def __init__(self,
             #### The embedding layer is specified as an AllenNLP <code>TextFieldEmbedder</code>
             #### which represents a general way of turning tokens into tensors.
             #### (Here we know that we want to represent each unique word with a learned tensor,
             #### but using the general class allows us to easily experiment with different types
             #### of embeddings, for example <a href = "https://allennlp.org/elmo">ELMo</a>.)
             word_embeddings: TextFieldEmbedder,
             #### Similarly, the encoder is specified as a general <code>Seq2SeqEncoder</code>
             #### even though we know we want to use an LSTM. Again, this makes it easy to
             #### experiment with other sequence encoders, for example a Transformer.
             encoder: Seq2SeqEncoder,
             #### Every AllenNLP model also expects a <code>Vocabulary</code>,
             #### which contains the namespaced mappings of tokens to indices and labels to indices.
             vocab: Vocabulary) -> None:
    #### Notice that we have to pass the vocab to the base class constructor.
    super().__init__(vocab)
    self.word_embeddings = word_embeddings
    self.encoder = encoder
    #### The feed forward layer is not passed in as a parameter, but is constructed by us.
    #### Notice that it looks at the encoder to find the correct input dimension and looks
    #### at the vocabulary (and, in particular, at the label -> index mapping) to find the correct output dimension.
    self.hidden2tag = torch.nn.Linear(
        in_features=encoder.get_output_dim(),
        out_features=vocab.get_vocab_size('labels'))
    #### The last thing to notice is that we also instantiate a
    #### <code>CategoricalAccuracy</code> metric, which we'll use to track accuracy
    #### during each training and validation epoch.
    self.accuracy = CategoricalAccuracy()
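The constructor above sizes its output layer from the vocabulary, so the linear layer always matches the number of labels. Below is a minimal, self-contained sketch of the pieces such a constructor would receive and of how get_vocab_size('labels') determines the output dimension; the toy vocabulary entries and the 50/100 dimensions are illustrative assumptions, not part of the example above.

import torch
from allennlp.data.vocabulary import Vocabulary
from allennlp.modules.seq2seq_encoders import PytorchSeq2SeqWrapper
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding

# Toy vocabulary; in practice it would be built with Vocabulary.from_instances(...).
vocab = Vocabulary()
vocab.add_token_to_namespace("the", namespace="tokens")
vocab.add_token_to_namespace("NN", namespace="labels")
vocab.add_token_to_namespace("VB", namespace="labels")

# The arguments the constructor expects: an embedder, an encoder, and the vocabulary.
token_embedding = Embedding(num_embeddings=vocab.get_vocab_size("tokens"), embedding_dim=50)
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
encoder = PytorchSeq2SeqWrapper(torch.nn.LSTM(50, 100, batch_first=True))

# This mirrors the hidden2tag layer built inside the constructor:
# input size comes from the encoder, output size from the label vocabulary.
hidden2tag = torch.nn.Linear(in_features=encoder.get_output_dim(),
                             out_features=vocab.get_vocab_size("labels"))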
def __init__(self,
             vocab: Vocabulary,
             embedding_dim: int,
             use_crf: bool = False,
             label_namespace: str = "xpos_tags"):
    super().__init__(vocab)
    self.label_namespace = label_namespace
    self.labels = vocab.get_index_to_token_vocabulary(label_namespace)
    num_labels = vocab.get_vocab_size(label_namespace)
    if use_crf:
        self.crf = ConditionalRandomField(num_labels, include_start_end_transitions=True)
        self.label_projection_layer = TimeDistributed(
            torch.nn.Linear(embedding_dim, num_labels))
        self.decoder = None
    else:
        self.crf = None
        self.decoder = GruSeq2SeqEncoder(input_size=embedding_dim,
                                         hidden_size=embedding_dim,
                                         num_layers=1,
                                         bidirectional=True)
        self.label_projection_layer = TimeDistributed(
            torch.nn.Linear(self.decoder.get_output_dim(), num_labels))
    from allennlp.training.metrics import CategoricalAccuracy
    self.metrics = {"accuracy": CategoricalAccuracy()}
def __init__(self,
             vocab: Vocabulary,
             sentence_embedder: TextFieldEmbedder,
             action_embedding_dim: int,
             encoder: Seq2SeqEncoder,
             dropout: float = 0.0,
             rule_namespace: str = 'rule_labels') -> None:
    super(NlvrSemanticParser, self).__init__(vocab=vocab)

    self._sentence_embedder = sentence_embedder
    self._denotation_accuracy = Average()
    self._consistency = Average()
    self._encoder = encoder
    if dropout > 0:
        self._dropout = torch.nn.Dropout(p=dropout)
    else:
        self._dropout = lambda x: x
    self._rule_namespace = rule_namespace

    self._action_embedder = Embedding(num_embeddings=vocab.get_vocab_size(self._rule_namespace),
                                      embedding_dim=action_embedding_dim)
    # This is what we pass as input in the first step of decoding, when we don't have a
    # previous action.
    self._first_action_embedding = torch.nn.Parameter(torch.FloatTensor(action_embedding_dim))
    torch.nn.init.normal_(self._first_action_embedding)
def __init__(self,
             word_embeddings: TextFieldEmbedder,
             encoder: Seq2VecEncoder,
             vocab: Vocabulary,
             positive_label: int = 4) -> None:
    super().__init__(vocab)
    # We need the embeddings to convert word IDs to their vector representations
    self.word_embeddings = word_embeddings
    self.encoder = encoder

    # After converting a sequence of vectors to a single vector, we feed it into
    # a fully-connected linear layer to reduce the dimension to the total number of labels.
    self.linear = torch.nn.Linear(
        in_features=encoder.get_output_dim(),
        out_features=vocab.get_vocab_size('labels'))

    # Monitor the metrics - we use accuracy, as well as prec, rec, f1 for 4 (very positive)
    self.accuracy = CategoricalAccuracy()
    self.f1_measure = F1Measure(positive_label)

    # We use the cross entropy loss because this is a classification task.
    # Note that PyTorch's CrossEntropyLoss combines softmax and log likelihood loss,
    # which makes it unnecessary to add a separate softmax layer.
    self.loss_function = torch.nn.CrossEntropyLoss()
def __init__(self,
             vocab: Vocabulary,
             sentence_embedder: TextFieldEmbedder,
             action_embedding_dim: int,
             encoder: Seq2SeqEncoder,
             dropout: float = 0.0,
             rule_namespace: str = 'rule_labels') -> None:
    super(NlvrSemanticParser, self).__init__(vocab=vocab)

    self._sentence_embedder = sentence_embedder
    self._denotation_accuracy = Average()
    self._consistency = Average()
    self._encoder = encoder
    if dropout > 0:
        self._dropout = torch.nn.Dropout(p=dropout)
    else:
        self._dropout = lambda x: x
    self._rule_namespace = rule_namespace

    self._action_embedder = Embedding(num_embeddings=vocab.get_vocab_size(self._rule_namespace),
                                      embedding_dim=action_embedding_dim)
    # This is what we pass as input in the first step of decoding, when we don't have a
    # previous action.
    self._first_action_embedding = torch.nn.Parameter(torch.FloatTensor(action_embedding_dim))
    torch.nn.init.normal_(self._first_action_embedding)
def __init__(self,
             vocab: Vocabulary,
             embedding_in: Embedding,
             embedding_out: Embedding,
             neg_samples=10,
             cuda_device=-1):
    super().__init__(vocab)
    self.embedding_in = embedding_in
    self.embedding_out = embedding_out
    self.neg_samples = neg_samples
    self.cuda_device = cuda_device

    check_if_counter = getattr(vocab, '_retained_counter', None)
    if not check_if_counter:
        return

    # pre-compute probability for negative sampling
    token_to_probs = {}
    token_counts = vocab._retained_counter['tags_in']
    assert len(token_counts) > 2
    total_counts = sum(token_counts.values())
    total_probs = 0.
    for token, counts in token_counts.items():
        unigram_freq = counts / total_counts
        unigram_freq = math.pow(unigram_freq, 3 / 4)
        token_to_probs[token] = unigram_freq
        total_probs += unigram_freq

    self.neg_sample_probs = np.ndarray((vocab.get_vocab_size('tags_in'),))
    for token_id, token in vocab.get_index_to_token_vocabulary('tags_in').items():
        self.neg_sample_probs[token_id] = token_to_probs.get(token, 0) / total_probs
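A brief sketch of how a neg_sample_probs table like the one computed above is typically consumed during training; drawing negative ids with numpy.random.choice is an assumption about the surrounding training loop, not something shown in the constructor itself, and the toy distribution stands in for the real one.

import numpy as np

# `probs` stands in for self.neg_sample_probs: one entry per id in the 'tags_in'
# namespace, summing to 1. Here a toy distribution with ids 0/1 (padding/unk) zeroed out.
probs = np.array([0.0, 0.0, 0.5, 0.3, 0.2])
batch_size, neg_samples = 4, 10

# Draw `neg_samples` negative token ids per example, weighted by the unigram^0.75 distribution.
negative_ids = np.random.choice(len(probs), size=(batch_size, neg_samples), p=probs)

# The sampled ids would then be looked up in embedding_out and scored against the target
# embeddings as the negative term of the skip-gram loss.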
def __init__(self,
             word_embeddings: TextFieldEmbedder,
             encoder: Seq2SeqEncoder,
             vocab: Vocabulary) -> None:
    super().__init__(vocab)
    self.word_embeddings = word_embeddings
    self.encoder = encoder
    self.hidden2tag = torch.nn.Linear(in_features=encoder.get_output_dim(),
                                      out_features=vocab.get_vocab_size('labels'))
    self.accuracy = CategoricalAccuracy()
def test_namespaces(self):
    vocab = Vocabulary()
    initial_vocab_size = vocab.get_vocab_size()
    word_index = vocab.add_token_to_namespace("word", namespace='1')
    assert "word" in vocab.get_index_to_token_vocabulary(namespace='1').values()
    assert vocab.get_token_index("word", namespace='1') == word_index
    assert vocab.get_token_from_index(word_index, namespace='1') == "word"
    assert vocab.get_vocab_size(namespace='1') == initial_vocab_size + 1

    # Now add it again, in a different namespace and a different word, and make sure it's like
    # new.
    word2_index = vocab.add_token_to_namespace("word2", namespace='2')
    word_index = vocab.add_token_to_namespace("word", namespace='2')
    assert "word" in vocab.get_index_to_token_vocabulary(namespace='2').values()
    assert "word2" in vocab.get_index_to_token_vocabulary(namespace='2').values()
    assert vocab.get_token_index("word", namespace='2') == word_index
    assert vocab.get_token_index("word2", namespace='2') == word2_index
    assert vocab.get_token_from_index(word_index, namespace='2') == "word"
    assert vocab.get_token_from_index(word2_index, namespace='2') == "word2"
    assert vocab.get_vocab_size(namespace='2') == initial_vocab_size + 2
def __init__(self,
             #### The embedding layer is specified as an AllenNLP <code>TextFieldEmbedder</code>,
             #### which represents a general way of turning tokens into tensors. (Here we know that
             #### we want to represent each unique word with a learned tensor, but using the general
             #### class allows us to easily experiment with different types of embeddings, for
             #### example <a href = "https://allennlp.org/elmo">ELMo</a>.)
             word_embeddings: TextFieldEmbedder,
             #### Similarly, the encoder is specified as a general <code>Seq2SeqEncoder</code> even
             #### though we know we want to use an LSTM. Again, this makes it easy to experiment
             #### with other sequence encoders, for example a Transformer.
             encoder: Seq2SeqEncoder,
             #### Every AllenNLP model also expects a <code>Vocabulary</code>, which contains the
             #### namespaced mappings of tokens to indices and labels to indices.
             vocab: Vocabulary) -> None:
    #### Notice that we have to pass the vocab to the base class constructor.
    super().__init__(vocab)
    self.word_embeddings = word_embeddings
    self.encoder = encoder
    #### The feed forward layer is not passed in as a parameter, but is constructed by us. Notice
    #### that it looks at the encoder to find the correct input dimension and looks at the
    #### vocabulary (and, in particular, at the label -> index mapping) to find the correct output
    #### dimension.
    self.hidden2tag = torch.nn.Linear(in_features=encoder.get_output_dim(),
                                      out_features=vocab.get_vocab_size('labels'))
    #### The last thing to notice is that we also instantiate a <code>CategoricalAccuracy</code>
    #### metric, which we'll use to track accuracy during each training and validation epoch.
    self.accuracy = CategoricalAccuracy()
def index(self, vocab: Vocabulary):
    if self._label_ids is None:
        self._label_ids = [vocab.get_token_index(label, self._label_namespace)  # type: ignore
                           for label in self.labels]
    if not self._num_labels:
        self._num_labels = vocab.get_vocab_size(self._label_namespace)
def test_from_params_valid_vocab_extension_thoroughly(self):
    '''
    Tests valid vocab extension thoroughly: vocab extension is valid when overlapping
    namespaces have the same padding behaviour (padded/non-padded).

    Summary of namespace paddings in this test:
        original_vocab namespaces
            tokens0     padded
            tokens1     non-padded
            tokens2     padded
            tokens3     non-padded
        instances namespaces
            tokens0     padded
            tokens1     non-padded
            tokens4     padded
            tokens5     non-padded

    Typical extension example (tokens1 namespace):
        -> original_vocab index2token
            #0->apple
            #1->bat
            #2->cat
        -> tokens to be extended with: cat, an, apple, banana, atom, bat
        -> extended_vocab index2token
            #0->apple
            #1->bat
            #2->cat
            #3->an
            #4->atom
            #5->banana
    '''
    vocab_dir = self.TEST_DIR / 'vocab_save'

    original_vocab = Vocabulary(non_padded_namespaces=["tokens1", "tokens3"])
    original_vocab.add_token_to_namespace("apple", namespace="tokens0")  # index:2
    original_vocab.add_token_to_namespace("bat", namespace="tokens0")    # index:3
    original_vocab.add_token_to_namespace("cat", namespace="tokens0")    # index:4

    original_vocab.add_token_to_namespace("apple", namespace="tokens1")  # index:0
    original_vocab.add_token_to_namespace("bat", namespace="tokens1")    # index:1
    original_vocab.add_token_to_namespace("cat", namespace="tokens1")    # index:2

    original_vocab.add_token_to_namespace("a", namespace="tokens2")  # index:0
    original_vocab.add_token_to_namespace("b", namespace="tokens2")  # index:1
    original_vocab.add_token_to_namespace("c", namespace="tokens2")  # index:2

    original_vocab.add_token_to_namespace("p", namespace="tokens3")  # index:0
    original_vocab.add_token_to_namespace("q", namespace="tokens3")  # index:1

    original_vocab.save_to_files(vocab_dir)

    text_field0 = TextField([Token(t) for t in ["cat", "an", "apple", "banana", "atom", "bat"]],
                            {"tokens0": SingleIdTokenIndexer("tokens0")})
    text_field1 = TextField([Token(t) for t in ["cat", "an", "apple", "banana", "atom", "bat"]],
                            {"tokens1": SingleIdTokenIndexer("tokens1")})
    text_field4 = TextField([Token(t) for t in ["l", "m", "n", "o"]],
                            {"tokens4": SingleIdTokenIndexer("tokens4")})
    text_field5 = TextField([Token(t) for t in ["x", "y", "z"]],
                            {"tokens5": SingleIdTokenIndexer("tokens5")})
    instances = Batch([Instance({"text0": text_field0, "text1": text_field1,
                                 "text4": text_field4, "text5": text_field5})])

    params = Params({"directory_path": vocab_dir,
                     "extend": True,
                     "non_padded_namespaces": ["tokens1", "tokens5"]})
    extended_vocab = Vocabulary.from_params(params, instances)

    # Namespaces tokens0 and tokens1 are common; tokens2 and tokens3 appear only in the
    # original vocab, tokens4 and tokens5 only in the instances.
    extended_namespaces = {*extended_vocab._token_to_index}
    assert extended_namespaces == {"tokens{}".format(i) for i in range(6)}

    # Check that _non_padded_namespaces is consistent after extension.
    assert extended_vocab._non_padded_namespaces == {"tokens1", "tokens3", "tokens5"}

    # original_vocab["tokens1"] has 3 tokens, the "tokens1" namespace of instances has 5 tokens,
    # 2 of which overlap.
    assert extended_vocab.get_vocab_size("tokens1") == 6
    assert extended_vocab.get_vocab_size("tokens0") == 8  # 2 more than tokens1 because padded

    # Namespaces tokens2 and tokens3 were only in original_vocab,
    # so their token counts should be unchanged in extended_vocab.
    assert extended_vocab.get_vocab_size("tokens2") == original_vocab.get_vocab_size("tokens2")
    assert extended_vocab.get_vocab_size("tokens3") == original_vocab.get_vocab_size("tokens3")

    # Namespaces tokens4 and tokens5 were only in the instances.
    assert extended_vocab.get_vocab_size("tokens4") == 6  # l,m,n,o + oov + padding
    assert extended_vocab.get_vocab_size("tokens5") == 3  # x,y,z

    # The word-to-index mapping of all words in all namespaces of original_vocab
    # should be preserved in extended_vocab.
    for namespace, token2index in original_vocab._token_to_index.items():
        for token, _ in token2index.items():
            vocab_index = original_vocab.get_token_index(token, namespace)
            extended_vocab_index = extended_vocab.get_token_index(token, namespace)
            assert vocab_index == extended_vocab_index
    # And the same for the index-to-word mapping.
    for namespace, index2token in original_vocab._index_to_token.items():
        for index, _ in index2token.items():
            vocab_token = original_vocab.get_token_from_index(index, namespace)
            extended_vocab_token = extended_vocab.get_token_from_index(index, namespace)
            assert vocab_token == extended_vocab_token