def test_from_instances_exclusive_embeddings_file_inside_archive(self): """ Just for ensuring there are no problems when reading pretrained tokens from an archive """ # Read embeddings file from archive archive_path = str(self.TEST_DIR / "embeddings-archive.zip") with zipfile.ZipFile(archive_path, 'w') as archive: file_path = 'embedding.3d.vec' with archive.open(file_path, 'w') as embeddings_file: embeddings_file.write("a 1.0 2.3 -1.0\n".encode('utf-8')) embeddings_file.write("b 0.1 0.4 -4.0\n".encode('utf-8')) with archive.open('dummy.vec', 'w') as dummy_file: dummy_file.write("c 1.0 2.3 -1.0 3.0\n".encode('utf-8')) embeddings_file_uri = format_embeddings_file_uri(archive_path, file_path) vocab = Vocabulary.from_instances(self.dataset, min_count={'tokens': 4}, pretrained_files={'tokens': embeddings_file_uri}, only_include_pretrained_words=True) words = set(vocab.get_index_to_token_vocabulary().values()) assert 'a' in words assert 'b' not in words assert 'c' not in words vocab = Vocabulary.from_instances(self.dataset, pretrained_files={'tokens': embeddings_file_uri}, only_include_pretrained_words=True) words = set(vocab.get_index_to_token_vocabulary().values()) assert 'a' in words assert 'b' in words assert 'c' not in words
def test_multilabel_field_empty_field_works(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("label1", namespace="test_empty_labels")
    vocab.add_token_to_namespace("label2", namespace="test_empty_labels")
    f = MultiLabelField([], label_namespace="test_empty_labels")
    f.index(vocab)
    tensor = f.as_tensor(f.get_padding_lengths()).detach().cpu().numpy()
    numpy.testing.assert_array_almost_equal(tensor, numpy.array([0, 0]))
def test_from_dataset_respects_max_vocab_size_single_int(self): max_vocab_size = 1 vocab = Vocabulary.from_instances(self.dataset, max_vocab_size=max_vocab_size) words = vocab.get_index_to_token_vocabulary().values() # Additional 2 tokens are '@@PADDING@@' and '@@UNKNOWN@@' by default assert len(words) == max_vocab_size + 2 vocab = Vocabulary.from_instances(self.dataset, min_count=None) words = vocab.get_index_to_token_vocabulary().values() assert len(words) == 5
def test_multilabel_field_can_index_with_vocab(self): vocab = Vocabulary() vocab.add_token_to_namespace("rel0", namespace="rel_labels") vocab.add_token_to_namespace("rel1", namespace="rel_labels") vocab.add_token_to_namespace("rel2", namespace="rel_labels") f = MultiLabelField(["rel1", "rel0"], label_namespace="rel_labels") f.index(vocab) tensor = f.as_tensor(f.get_padding_lengths()).detach().cpu().numpy() numpy.testing.assert_array_almost_equal(tensor, numpy.array([1, 1, 0]))
def test_unknown_token(self): # pylint: disable=protected-access # We're putting this behavior in a test so that the behavior is documented. There is # solver code that depends in a small way on how we treat the unknown token, so any # breaking change to this behavior should break a test, so you know you've done something # that needs more consideration. vocab = Vocabulary() oov_token = vocab._oov_token oov_index = vocab.get_token_index(oov_token) assert oov_index == 1 assert vocab.get_token_index("unseen word") == oov_index
def test_from_dataset_respects_min_count(self): vocab = Vocabulary.from_instances(self.dataset, min_count={'tokens': 4}) words = vocab.get_index_to_token_vocabulary().values() assert 'a' in words assert 'b' not in words assert 'c' not in words vocab = Vocabulary.from_instances(self.dataset, min_count=None) words = vocab.get_index_to_token_vocabulary().values() assert 'a' in words assert 'b' in words assert 'c' in words
def _get_vocab_index_mapping(self, archived_vocab: Vocabulary) -> List[Tuple[int, int]]:
    vocab_index_mapping: List[Tuple[int, int]] = []
    for index in range(self.vocab.get_vocab_size(namespace='tokens')):
        token = self.vocab.get_token_from_index(index=index, namespace='tokens')
        archived_token_index = archived_vocab.get_token_index(token, namespace='tokens')
        # Checking if we got the UNK token index, because we don't want all new token
        # representations initialized to UNK token's representation. We do that by checking if
        # the two tokens are the same. They will not be if the token at the archived index is
        # UNK.
        if archived_vocab.get_token_from_index(archived_token_index, namespace="tokens") == token:
            vocab_index_mapping.append((index, archived_token_index))
    return vocab_index_mapping
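# A brief, hedged sketch of how such an index mapping might be consumed: copying the
# overlapping rows of an archived embedding matrix into a freshly initialized one.
# The `copy_overlapping_rows`, `new_embedding`, and `archived_embedding` names are
# hypothetical and do not appear in the snippet above.
from typing import List, Tuple

import torch


def copy_overlapping_rows(new_embedding: torch.nn.Embedding,
                          archived_embedding: torch.nn.Embedding,
                          vocab_index_mapping: List[Tuple[int, int]]) -> None:
    # For every (new_index, archived_index) pair, reuse the archived representation
    # instead of keeping the fresh random initialization.
    with torch.no_grad():
        for new_index, archived_index in vocab_index_mapping:
            new_embedding.weight[new_index] = archived_embedding.weight[archived_index]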
def __init__(self, vocab: Vocabulary, text_field_embedder: TextFieldEmbedder, contextualizer: Seq2SeqEncoder, dropout: float = None, num_samples: int = None, sparse_embeddings: bool = False, bidirectional: bool = False, initializer: InitializerApplicator = None) -> None: super().__init__(vocab) self._text_field_embedder = text_field_embedder if contextualizer.is_bidirectional() is not bidirectional: raise ConfigurationError( "Bidirectionality of contextualizer must match bidirectionality of " "language model. " f"Contextualizer bidirectional: {contextualizer.is_bidirectional()}, " f"language model bidirectional: {bidirectional}") self._contextualizer = contextualizer self._bidirectional = bidirectional # The dimension for making predictions just in the forward # (or backward) direction. if self._bidirectional: self._forward_dim = contextualizer.get_output_dim() // 2 else: self._forward_dim = contextualizer.get_output_dim() # TODO(joelgrus): more sampled softmax configuration options, as needed. if num_samples is not None: self._softmax_loss = SampledSoftmaxLoss(num_words=vocab.get_vocab_size(), embedding_dim=self._forward_dim, num_samples=num_samples, sparse=sparse_embeddings) else: self._softmax_loss = _SoftmaxLoss(num_words=vocab.get_vocab_size(), embedding_dim=self._forward_dim) # TODO(brendanr): Output perplexity here. e^loss self.register_buffer('_last_average_loss', torch.zeros(1)) if dropout: self._dropout = torch.nn.Dropout(dropout) else: self._dropout = lambda x: x if initializer is not None: initializer(self)
def __init__(self, vocabulary: Vocabulary, tag_namespace: str = "tags", ignore_classes: List[str] = None) -> None: """ Parameters ---------- vocabulary : ``Vocabulary``, required. A vocabulary containing the tag namespace. tag_namespace : str, required. This metric assumes that a BIO format is used in which the labels are of the format: ["B-LABEL", "I-LABEL"]. ignore_classes : List[str], optional. Span labels which will be ignored when computing span metrics. A "span label" is the part that comes after the BIO label, so it would be "ARG1" for the tag "B-ARG1". For example by passing: ``ignore_classes=["V"]`` the following sequence would not consider the "V" span at index (2, 3) when computing the precision, recall and F1 metrics. ["O", "O", "B-V", "I-V", "B-ARG1", "I-ARG1"] This is helpful for instance, to avoid computing metrics for "V" spans in a BIO tagging scheme which are typically not included. """ self._label_vocabulary = vocabulary.get_index_to_token_vocabulary(tag_namespace) self._ignore_classes: List[str] = ignore_classes or [] # These will hold per label span counts. self._true_positives: Dict[str, int] = defaultdict(int) self._false_positives: Dict[str, int] = defaultdict(int) self._false_negatives: Dict[str, int] = defaultdict(int)
def tokens_to_indices(self,
                      tokens: List[Token],
                      vocabulary: Vocabulary,
                      index_name: str) -> Dict[str, List[int]]:
    dep_labels = [token.dep_ or 'NONE' for token in tokens]
    return {index_name: [vocabulary.get_token_index(dep_label, self.namespace)
                         for dep_label in dep_labels]}
def __init__(self, vocab: Vocabulary, sentence_embedder: TextFieldEmbedder, action_embedding_dim: int, encoder: Seq2SeqEncoder, dropout: float = 0.0, rule_namespace: str = 'rule_labels') -> None: super(NlvrSemanticParser, self).__init__(vocab=vocab) self._sentence_embedder = sentence_embedder self._denotation_accuracy = Average() self._consistency = Average() self._encoder = encoder if dropout > 0: self._dropout = torch.nn.Dropout(p=dropout) else: self._dropout = lambda x: x self._rule_namespace = rule_namespace self._action_embedder = Embedding(num_embeddings=vocab.get_vocab_size(self._rule_namespace), embedding_dim=action_embedding_dim) # This is what we pass as input in the first step of decoding, when we don't have a # previous action. self._first_action_embedding = torch.nn.Parameter(torch.FloatTensor(action_embedding_dim)) torch.nn.init.normal_(self._first_action_embedding)
def setUp(self): super(TestCopyNetReader, self).setUp() params = Params.from_file(self.FIXTURES_ROOT / "encoder_decoder" / "copynet_seq2seq" / "experiment.json") self.reader = DatasetReader.from_params(params["dataset_reader"]) instances = self.reader.read(self.FIXTURES_ROOT / "data" / "copynet" / "copyover.tsv") self.instances = ensure_list(instances) self.vocab = Vocabulary.from_params(params=params["vocabulary"], instances=instances)
def tokens_to_indices(self,
                      tokens: List[Token],
                      vocabulary: Vocabulary,
                      index_name: str) -> Dict[str, List[int]]:
    tags = ['NONE' if not token.ent_type_ else token.ent_type_ for token in tokens]
    return {index_name: [vocabulary.get_token_index(tag, self._namespace) for tag in tags]}
def token_to_indices(self, token: Token, vocabulary: Vocabulary) -> int:
    if self._coarse_tags:
        tag = token.pos_
    else:
        tag = token.tag_
    if tag is None:
        tag = 'NONE'
    return vocabulary.get_token_index(tag, self._namespace)
def test_vocab_can_print(self): vocab = Vocabulary(non_padded_namespaces=["a", "c"]) vocab.add_token_to_namespace("a0", namespace="a") vocab.add_token_to_namespace("a1", namespace="a") vocab.add_token_to_namespace("a2", namespace="a") vocab.add_token_to_namespace("b2", namespace="b") vocab.add_token_to_namespace("b3", namespace="b") print(vocab)
def __init__(self, vocabulary: Vocabulary, tag_namespace: str = "tags", ignore_classes: List[str] = None, label_encoding: Optional[str] = "BIO", tags_to_spans_function: Optional[TAGS_TO_SPANS_FUNCTION_TYPE] = None) -> None: """ Parameters ---------- vocabulary : ``Vocabulary``, required. A vocabulary containing the tag namespace. tag_namespace : str, required. This metric assumes that a BIO format is used in which the labels are of the format: ["B-LABEL", "I-LABEL"]. ignore_classes : List[str], optional. Span labels which will be ignored when computing span metrics. A "span label" is the part that comes after the BIO label, so it would be "ARG1" for the tag "B-ARG1". For example by passing: ``ignore_classes=["V"]`` the following sequence would not consider the "V" span at index (2, 3) when computing the precision, recall and F1 metrics. ["O", "O", "B-V", "I-V", "B-ARG1", "I-ARG1"] This is helpful for instance, to avoid computing metrics for "V" spans in a BIO tagging scheme which are typically not included. label_encoding : ``str``, optional (default = "BIO") The encoding used to specify label span endpoints in the sequence. Valid options are "BIO", "IOB1", "BIOUL" or "BMES". tags_to_spans_function: ``Callable``, optional (default = ``None``) If ``label_encoding`` is ``None``, ``tags_to_spans_function`` will be used to generate spans. """ if label_encoding and tags_to_spans_function: raise ConfigurationError( 'Both label_encoding and tags_to_spans_function are provided. ' 'Set "label_encoding=None" explicitly to enable tags_to_spans_function.' ) if label_encoding: if label_encoding not in ["BIO", "IOB1", "BIOUL", "BMES"]: raise ConfigurationError("Unknown label encoding - expected 'BIO', 'IOB1', 'BIOUL', 'BMES'.") elif tags_to_spans_function is None: raise ConfigurationError( 'At least one of the (label_encoding, tags_to_spans_function) should be provided.' ) self._label_encoding = label_encoding self._tags_to_spans_function = tags_to_spans_function self._label_vocabulary = vocabulary.get_index_to_token_vocabulary(tag_namespace) self._ignore_classes: List[str] = ignore_classes or [] # These will hold per label span counts. self._true_positives: Dict[str, int] = defaultdict(int) self._false_positives: Dict[str, int] = defaultdict(int) self._false_negatives: Dict[str, int] = defaultdict(int)
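# A minimal, hedged usage sketch for the constructor above. It assumes this __init__
# belongs to AllenNLP's SpanBasedF1Measure (the class name does not appear in the snippet
# itself); the tag set below is purely illustrative.
from allennlp.training.metrics import SpanBasedF1Measure

vocab = Vocabulary()
for tag in ["O", "B-ARG1", "I-ARG1", "B-V", "I-V"]:
    vocab.add_token_to_namespace(tag, namespace="tags")

# Ignore "V" spans when computing precision, recall and F1, as described in the docstring.
metric = SpanBasedF1Measure(vocab, tag_namespace="tags", ignore_classes=["V"])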
def __init__(self, word_embeddings: TextFieldEmbedder, encoder: Seq2SeqEncoder, vocab: Vocabulary) -> None: super().__init__(vocab) self.word_embeddings = word_embeddings self.encoder = encoder self.hidden2tag = torch.nn.Linear(in_features=encoder.get_output_dim(), out_features=vocab.get_vocab_size('labels')) self.accuracy = CategoricalAccuracy()
def test_vocab_from_instances_namespaces(self): reader = CcgBankDatasetReader(feature_labels=['modified_pos', 'original_pos', 'predicate_arg']) instances = ensure_list(reader.read(self.FIXTURES_ROOT / 'data' / 'ccgbank.txt')) # check that we didn't clobber the labels namespace vocab = Vocabulary.from_instances(instances) self.assertSetEqual( set(vocab._token_to_index.keys()), # pylint: disable=protected-access {'tokens', 'labels', 'modified_pos_tags', 'original_pos_tags', 'predicate_arg_tags'} )
def test_min_pretrained_embeddings(self): params = Params({ "pretrained_files": { "tokens": str(self.FIXTURES_ROOT / "embeddings/glove.6B.100d.sample.txt.gz") }, "min_pretrained_embeddings": {"tokens": 50}, }) vocab = Vocabulary.from_params(params=params, instances=self.dataset) assert vocab.get_vocab_size() >= 50 assert vocab.get_token_index("his") > 1 # not @@UNKNOWN@@
def token_to_indices(self, token: Token, vocabulary: Vocabulary) -> int:
    if getattr(token, 'text_id', None) is not None:
        # `text_id` being set on the token means that we aren't using the vocab, we just use
        # this id instead.
        index = token.text_id
    else:
        text = token.text
        if self.lowercase_tokens:
            text = text.lower()
        index = vocabulary.get_token_index(text, self.namespace)
    return index
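# A hedged illustration of the `text_id` shortcut above (assuming `Token` from
# allennlp.data.tokenizers): a token that already carries a precomputed id bypasses the
# Vocabulary lookup entirely. The id value here is arbitrary.
token_with_id = Token(text="[CLS]", text_id=101)
token_without_id = Token(text="hello")
# token_with_id would be indexed as 101 regardless of what the Vocabulary contains;
# token_without_id falls back to vocabulary.get_token_index("hello", namespace).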
def test_saving_and_loading_works_with_byte_encoding(self): # We're going to set a vocabulary from a TextField using byte encoding, index it, save the # vocab, load the vocab, then index the text field again, and make sure we get the same # result. tokenizer = CharacterTokenizer(byte_encoding='utf-8') token_indexer = TokenCharactersIndexer(character_tokenizer=tokenizer) tokens = [Token(t) for t in ["Øyvind", "für", "汉字"]] text_field = TextField(tokens, {"characters": token_indexer}) dataset = Batch([Instance({"sentence": text_field})]) vocab = Vocabulary.from_instances(dataset) text_field.index(vocab) indexed_tokens = deepcopy(text_field._indexed_tokens) # pylint: disable=protected-access vocab_dir = self.TEST_DIR / 'vocab_save' vocab.save_to_files(vocab_dir) vocab2 = Vocabulary.from_files(vocab_dir) text_field2 = TextField(tokens, {"characters": token_indexer}) text_field2.index(vocab2) indexed_tokens2 = deepcopy(text_field2._indexed_tokens) # pylint: disable=protected-access assert indexed_tokens == indexed_tokens2
def test_label_field_can_index_with_vocab(self): vocab = Vocabulary() vocab.add_token_to_namespace("entailment", namespace="labels") vocab.add_token_to_namespace("contradiction", namespace="labels") vocab.add_token_to_namespace("neutral", namespace="labels") label = LabelField("entailment") label.index(vocab) tensor = label.as_tensor(label.get_padding_lengths()).data.cpu().numpy() numpy.testing.assert_array_almost_equal(tensor, numpy.array([0]))
def test_max_vocab_size_dict(self): params = Params({ "max_vocab_size": { "tokens": 1, "characters": 20 } }) vocab = Vocabulary.from_params(params=params, instances=self.dataset) words = vocab.get_index_to_token_vocabulary().values() # Additional 2 tokens are '@@PADDING@@' and '@@UNKNOWN@@' by default assert len(words) == 3
def test_from_dataset_respects_inclusive_embedding_file(self): embeddings_filename = str(self.TEST_DIR / "embeddings.gz") with gzip.open(embeddings_filename, 'wb') as embeddings_file: embeddings_file.write("a 1.0 2.3 -1.0\n".encode('utf-8')) embeddings_file.write("b 0.1 0.4 -4.0\n".encode('utf-8')) vocab = Vocabulary.from_instances(self.dataset, min_count={'tokens': 4}, pretrained_files={'tokens': embeddings_filename}, only_include_pretrained_words=False) words = vocab.get_index_to_token_vocabulary().values() assert 'a' in words assert 'b' in words assert 'c' not in words vocab = Vocabulary.from_instances(self.dataset, pretrained_files={'tokens': embeddings_filename}, only_include_pretrained_words=False) words = vocab.get_index_to_token_vocabulary().values() assert 'a' in words assert 'b' in words assert 'c' in words
def token_to_indices(self, token: Token, vocabulary: Vocabulary) -> List[int]:
    indices = []
    if token.text is None:
        raise ConfigurationError('TokenCharactersIndexer needs a tokenizer that retains text')
    for character in self._character_tokenizer.tokenize(token.text):
        if getattr(character, 'text_id', None) is not None:
            # `text_id` being set on the token means that we aren't using the vocab, we just
            # use this id instead.
            index = character.text_id
        else:
            index = vocabulary.get_token_index(character.text, self._namespace)
        indices.append(index)
    return indices
def test_max_vocab_size_partial_dict(self): indexers = {"tokens": SingleIdTokenIndexer(), "token_characters": TokenCharactersIndexer()} instance = Instance({ 'text': TextField([Token(w) for w in 'Abc def ghi jkl mno pqr stu vwx yz'.split(' ')], indexers) }) dataset = Batch([instance]) params = Params({ "max_vocab_size": { "tokens": 1 } }) vocab = Vocabulary.from_params(params=params, instances=dataset) assert len(vocab.get_index_to_token_vocabulary("tokens").values()) == 3 # 1 + 2 assert len(vocab.get_index_to_token_vocabulary("token_characters").values()) == 28 # 26 + 2
def __init__(self,
             #### The embedding layer is specified as an AllenNLP <code>TextFieldEmbedder</code> which
             #### represents a general way of turning tokens into tensors. (Here we know that we want
             #### to represent each unique word with a learned tensor, but using the general class
             #### allows us to easily experiment with different types of embeddings, for example
             #### <a href = "https://allennlp.org/elmo">ELMo</a>.)
             word_embeddings: TextFieldEmbedder,
             #### Similarly, the encoder is specified as a general <code>Seq2SeqEncoder</code> even
             #### though we know we want to use an LSTM. Again, this makes it easy to experiment with
             #### other sequence encoders, for example a Transformer.
             encoder: Seq2SeqEncoder,
             #### Every AllenNLP model also expects a <code>Vocabulary</code>, which contains the
             #### namespaced mappings of tokens to indices and labels to indices.
             vocab: Vocabulary) -> None:
    #### Notice that we have to pass the vocab to the base class constructor.
    super().__init__(vocab)
    self.word_embeddings = word_embeddings
    self.encoder = encoder
    #### The feed forward layer is not passed in as a parameter, but is constructed by us. Notice
    #### that it looks at the encoder to find the correct input dimension and looks at the
    #### vocabulary (and, in particular, at the label -> index mapping) to find the correct output
    #### dimension.
    self.hidden2tag = torch.nn.Linear(in_features=encoder.get_output_dim(),
                                      out_features=vocab.get_vocab_size('labels'))
    #### The last thing to notice is that we also instantiate a <code>CategoricalAccuracy</code>
    #### metric, which we'll use to track accuracy during each training and validation epoch.
    self.accuracy = CategoricalAccuracy()
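# A short, hedged instantiation sketch for the tagger described above. The embedding and
# hidden dimensions are arbitrary, the tiny vocabulary is illustrative, and `LstmTagger`
# is an assumed name for the class this constructor belongs to.
import torch
from allennlp.data.vocabulary import Vocabulary
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.modules.seq2seq_encoders import PytorchSeq2SeqWrapper

vocab = Vocabulary()
vocab.add_token_to_namespace("The", namespace="tokens")
vocab.add_token_to_namespace("DET", namespace="labels")

token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'), embedding_dim=100)
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
encoder = PytorchSeq2SeqWrapper(torch.nn.LSTM(input_size=100, hidden_size=128, batch_first=True))
model = LstmTagger(word_embeddings, encoder, vocab)  # hypothetical class name for the constructor above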
def test_set_from_file_reads_non_padded_files(self): # pylint: disable=protected-access vocab_filename = self.TEST_DIR / 'vocab_file' with codecs.open(vocab_filename, 'w', 'utf-8') as vocab_file: vocab_file.write('B-PERS\n') vocab_file.write('I-PERS\n') vocab_file.write('O\n') vocab_file.write('B-ORG\n') vocab_file.write('I-ORG\n') vocab = Vocabulary() vocab.set_from_file(vocab_filename, is_padded=False, namespace='tags') assert vocab.get_token_index("B-PERS", namespace='tags') == 0 assert vocab.get_token_index("I-PERS", namespace='tags') == 1 assert vocab.get_token_index("O", namespace='tags') == 2 assert vocab.get_token_index("B-ORG", namespace='tags') == 3 assert vocab.get_token_index("I-ORG", namespace='tags') == 4 assert vocab.get_token_from_index(0, namespace='tags') == "B-PERS" assert vocab.get_token_from_index(1, namespace='tags') == "I-PERS" assert vocab.get_token_from_index(2, namespace='tags') == "O" assert vocab.get_token_from_index(3, namespace='tags') == "B-ORG" assert vocab.get_token_from_index(4, namespace='tags') == "I-ORG"
def test_registrability(self): @Vocabulary.register('my-vocabulary') class MyVocabulary: @classmethod def from_params(cls, params, instances=None): # pylint: disable=unused-argument return MyVocabulary() params = Params({'type': 'my-vocabulary'}) instance = Instance(fields={}) vocab = Vocabulary.from_params(params=params, instances=[instance]) assert isinstance(vocab, MyVocabulary)
def setUp(self): super().setUp() self.base_reader = SequenceTaggingDatasetReader(lazy=True) base_file_path = AllenNlpTestCase.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv' # Make 100 copies of the data raw_data = open(base_file_path).read() for i in range(100): file_path = self.TEST_DIR / f'sequence_tagging_{i}.tsv' with open(file_path, 'w') as f: f.write(raw_data) self.glob = str(self.TEST_DIR / 'sequence_tagging_*.tsv') # For some of the tests we need a vocab, we'll just use the base_reader for that. self.vocab = Vocabulary.from_instances(self.base_reader.read(str(base_file_path)))
def write_embeddings(embedding: Embedding, file_path, vocab: Vocabulary):
    with open(file_path, mode='w') as f:
        for index, token in vocab.get_index_to_token_vocabulary('token_in').items():
            values = ['{:.5f}'.format(val) for val in embedding.weight[index]]
            f.write(' '.join([token] + values))
            f.write('\n')
def train(train_dataset, val_dataset, cfg):
    # Build the Vocabulary
    VOCAB_SIZE = cfg.w2v.vocab_size
    vocab = Vocabulary.from_instances(train_dataset + val_dataset, max_vocab_size=VOCAB_SIZE)

    BATCH_SIZE = cfg.training.batch_size
    # Iterator that produces padded mini-batches
    iterator = BucketIterator(batch_size=BATCH_SIZE, sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)

    # Use the pretrained Japanese Wikipedia entity vectors provided by Tohoku University
    # http://www.cl.ecei.tohoku.ac.jp/~m-suzuki/jawiki_vector/
    model_name = cfg.w2v.model_name
    norm = cfg.w2v.norm
    cwd = hydra.utils.get_original_cwd()
    params = Params({
        'embedding_dim': 200,
        'padding_index': 0,
        'pretrained_file': os.path.join(cwd, f'embs/jawiki.{model_name}_vectors.200d.txt'),
        'norm_type': norm
    })
    token_embedding = Embedding.from_params(vocab=vocab, params=params)

    HIDDEN_SIZE = cfg.model.hidden_size
    dropout = cfg.model.dropout
    word_embeddings: TextFieldEmbedder = BasicTextFieldEmbedder({"tokens": token_embedding})
    encoder: Seq2SeqEncoder = PytorchSeq2SeqWrapper(
        nn.LSTM(word_embeddings.get_output_dim(), HIDDEN_SIZE, bidirectional=True, batch_first=True))
    model = ClassifierWithAttn(word_embeddings, encoder, vocab, dropout)
    model.train()

    USE_GPU = True
    if USE_GPU and torch.cuda.is_available():
        model = model.cuda(0)

    LR = cfg.training.learning_rate
    EPOCHS = cfg.training.epoch
    patience = cfg.training.patience if cfg.training.patience > 0 else None
    optimizer = optim.Adam(model.parameters(), lr=LR)
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=val_dataset,
                      patience=patience,
                      cuda_device=0 if USE_GPU else -1,
                      num_epochs=EPOCHS)
    metrics = trainer.train()
    logger.info(metrics)
    return model, metrics
def test_namespaces(self): vocab = Vocabulary() initial_vocab_size = vocab.get_vocab_size() word_index = vocab.add_token_to_namespace("word", namespace='1') assert "word" in vocab.get_index_to_token_vocabulary(namespace='1').values() assert vocab.get_token_index("word", namespace='1') == word_index assert vocab.get_token_from_index(word_index, namespace='1') == "word" assert vocab.get_vocab_size(namespace='1') == initial_vocab_size + 1 # Now add it again, in a different namespace and a different word, and make sure it's like # new. word2_index = vocab.add_token_to_namespace("word2", namespace='2') word_index = vocab.add_token_to_namespace("word", namespace='2') assert "word" in vocab.get_index_to_token_vocabulary(namespace='2').values() assert "word2" in vocab.get_index_to_token_vocabulary(namespace='2').values() assert vocab.get_token_index("word", namespace='2') == word_index assert vocab.get_token_index("word2", namespace='2') == word2_index assert vocab.get_token_from_index(word_index, namespace='2') == "word" assert vocab.get_token_from_index(word2_index, namespace='2') == "word2" assert vocab.get_vocab_size(namespace='2') == initial_vocab_size + 2
def test_set_from_file_reads_padded_files(self): # pylint: disable=protected-access vocab_filename = self.TEST_DIR / 'vocab_file' with codecs.open(vocab_filename, 'w', 'utf-8') as vocab_file: vocab_file.write('<S>\n') vocab_file.write('</S>\n') vocab_file.write('<UNK>\n') vocab_file.write('a\n') vocab_file.write('tricky\x0bchar\n') vocab_file.write('word\n') vocab_file.write('another\n') vocab = Vocabulary() vocab.set_from_file(vocab_filename, is_padded=True, oov_token="<UNK>") assert vocab._oov_token == DEFAULT_OOV_TOKEN assert vocab.get_token_index("random string") == 3 assert vocab.get_token_index("<S>") == 1 assert vocab.get_token_index("</S>") == 2 assert vocab.get_token_index(DEFAULT_OOV_TOKEN) == 3 assert vocab.get_token_index("a") == 4 assert vocab.get_token_index("tricky\x0bchar") == 5 assert vocab.get_token_index("word") == 6 assert vocab.get_token_index("another") == 7 assert vocab.get_token_from_index(0) == vocab._padding_token assert vocab.get_token_from_index(1) == "<S>" assert vocab.get_token_from_index(2) == "</S>" assert vocab.get_token_from_index(3) == DEFAULT_OOV_TOKEN assert vocab.get_token_from_index(4) == "a" assert vocab.get_token_from_index(5) == "tricky\x0bchar" assert vocab.get_token_from_index(6) == "word" assert vocab.get_token_from_index(7) == "another"
def main():
    parser = argparse.ArgumentParser(description='Evidence Inference experiments')
    parser.add_argument('--cuda_device', type=int, default=0,
                        help='GPU number (default: 0)')
    parser.add_argument('--epochs', type=int, default=2,
                        help='upper epoch limit (default: 2)')
    parser.add_argument('--patience', type=int, default=1,
                        help='trainer patience (default: 1)')
    parser.add_argument('--batch_size', type=int, default=32,
                        help='batch size (default: 32)')
    parser.add_argument('--dropout', type=float, default=0.2,
                        help='dropout for the model (default: 0.2)')
    parser.add_argument('--emb_size', type=int, default=256,
                        help='elmo embeddings size (default: 256)')
    parser.add_argument('--model_name', type=str, default='baseline',
                        help='model name (default: baseline)')
    parser.add_argument('--tunable', action='store_true',
                        help='tune the underlying embedding model (default: False)')
    args = parser.parse_args()

    annotations = pd.read_csv('data/data/annotations_merged.csv')
    prompts = pd.read_csv('data/data/prompts_merged.csv')

    feature_dictionary = {}
    prompts_dictionary = {}
    for index, row in prompts.iterrows():
        prompts_dictionary[row['PromptID']] = [row['Outcome'], row['Intervention'], row['Comparator']]
    for index, row in annotations.iterrows():
        if row['PMCID'] not in feature_dictionary:
            feature_dictionary[row['PMCID']] = []
        feature_dictionary[row['PMCID']].append([row['Annotations'], row['Label']]
                                                + prompts_dictionary[row['PromptID']])

    train = []
    valid = []
    test = []
    with open('data/splits/train_article_ids.txt') as train_file:
        for line in train_file:
            train.append(int(line.strip()))
    with open('data/splits/validation_article_ids.txt') as valid_file:
        for line in valid_file:
            valid.append(int(line.strip()))
    with open('data/splits/test_article_ids.txt') as test_file:
        for line in test_file:
            test.append(int(line.strip()))

    elmo_token_indexer = {'elmo': ELMoTokenCharactersIndexer(), 'tokens': SingleIdTokenIndexer()}
    reader = EIDatasetReader(elmo_token_indexer, feature_dictionary)
    train_data = reader.read(train)
    valid_data = reader.read(valid)
    test_data = reader.read(test)

    vocab = Vocabulary.from_instances(train_data + valid_data + test_data)

    urls = [
        'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_'
        '2xhighway_options.json',
        'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_'
        '2xhighway_weights.hdf5'
    ]

    elmo_token_embedding = ElmoTokenEmbedder(urls[0], urls[1], dropout=args.dropout,
                                             requires_grad=args.tunable,
                                             projection_dim=args.emb_size)
    word_embeddings = BasicTextFieldEmbedder({'elmo': elmo_token_embedding}, allow_unmatched_keys=True)
    model = Baseline(word_embeddings, vocab)

    cuda_device = args.cuda_device
    if torch.cuda.is_available():
        model = model.cuda(cuda_device)
    else:
        cuda_device = -1

    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    iterator = BucketIterator(batch_size=args.batch_size,
                              sorting_keys=[('article', 'num_tokens')],
                              padding_noise=0.1)
    iterator.index_with(vocab)

    serialization_dir = 'model_checkpoints/' + args.model_name
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_data,
                      validation_dataset=test_data,
                      patience=args.patience,
                      validation_metric='+accuracy',
                      num_epochs=args.epochs,
                      cuda_device=cuda_device,
                      serialization_dir=serialization_dir)

    result = trainer.train()
    for key in result:
        print(str(key) + ': ' + str(result[key]))

    test_metrics = evaluate(trainer.model, test_data, iterator,
                            cuda_device=cuda_device,
                            batch_weight_key="")

    print('Test Data statistics:')
    for key, value in test_metrics.items():
        print(str(key) + ': ' + str(value))
def test_invalid_vocab_extension(self):
    vocab_dir = self.TEST_DIR / 'vocab_save'
    original_vocab = Vocabulary(non_padded_namespaces=["tokens1"])
    original_vocab.add_token_to_namespace("a", namespace="tokens1")
    original_vocab.add_token_to_namespace("b", namespace="tokens1")
    original_vocab.add_token_to_namespace("p", namespace="tokens2")
    original_vocab.save_to_files(vocab_dir)
    text_field1 = TextField([Token(t) for t in ["a", "c"]],
                            {"tokens1": SingleIdTokenIndexer("tokens1")})
    text_field2 = TextField([Token(t) for t in ["p", "q", "r"]],
                            {"tokens2": SingleIdTokenIndexer("tokens2")})
    instances = Batch([Instance({"text1": text_field1, "text2": text_field2})])

    # The following should raise an error: tokens1 is non-padded in original_vocab
    # but padded in the extension.
    params = Params({"directory_path": vocab_dir, "extend": True, "non_padded_namespaces": []})
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(params, instances)
    with pytest.raises(ConfigurationError):
        extended_vocab = copy.copy(original_vocab)
        params = Params({"non_padded_namespaces": []})
        extended_vocab.extend_from_instances(params, instances)
    with pytest.raises(ConfigurationError):
        extended_vocab = copy.copy(original_vocab)
        extended_vocab._extend(non_padded_namespaces=[],
                               tokens_to_add={"tokens1": ["a"], "tokens2": ["p"]})

    # The following should not raise an error: overlapping namespaces have the same padding setting.
    params = Params({"directory_path": vocab_dir, "extend": True, "non_padded_namespaces": ["tokens1"]})
    Vocabulary.from_params(params, instances)
    extended_vocab = copy.copy(original_vocab)
    params = Params({"non_padded_namespaces": ["tokens1"]})
    extended_vocab.extend_from_instances(params, instances)
    extended_vocab = copy.copy(original_vocab)
    extended_vocab._extend(non_padded_namespaces=["tokens1"],
                           tokens_to_add={"tokens1": ["a"], "tokens2": ["p"]})

    # The following should raise an error: tokens2 is padded in original_vocab
    # but non-padded in the extension.
    params = Params({"directory_path": vocab_dir, "extend": True,
                     "non_padded_namespaces": ["tokens1", "tokens2"]})
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(params, instances)
    with pytest.raises(ConfigurationError):
        extended_vocab = copy.copy(original_vocab)
        params = Params({"non_padded_namespaces": ["tokens1", "tokens2"]})
        extended_vocab.extend_from_instances(params, instances)
    with pytest.raises(ConfigurationError):
        extended_vocab = copy.copy(original_vocab)
        extended_vocab._extend(non_padded_namespaces=["tokens1", "tokens2"],
                               tokens_to_add={"tokens1": ["a"], "tokens2": ["p"]})
def test_from_params_valid_vocab_extension_thoroughly(self):
    '''
    Tests valid vocab extension thoroughly: vocab extension is valid when overlapping
    namespaces have the same padding behaviour (padded/non-padded).

    Summary of namespace paddings in this test:
    original_vocab namespaces
        tokens0     padded
        tokens1     non-padded
        tokens2     padded
        tokens3     non-padded
    instances namespaces
        tokens0     padded
        tokens1     non-padded
        tokens4     padded
        tokens5     non-padded

    Typical extension example (of the tokens1 namespace):
    -> original_vocab index2token
        apple   #0->apple
        bat     #1->bat
        cat     #2->cat
    -> tokens to be extended with: cat, an, apple, banana, atom, bat
    -> extended_vocab index2token
        apple   #0->apple
        bat     #1->bat
        cat     #2->cat
        an      #3->an
        atom    #4->atom
        banana  #5->banana
    '''
    vocab_dir = self.TEST_DIR / 'vocab_save'
    original_vocab = Vocabulary(non_padded_namespaces=["tokens1", "tokens3"])
    original_vocab.add_token_to_namespace("apple", namespace="tokens0")  # index:2
    original_vocab.add_token_to_namespace("bat", namespace="tokens0")    # index:3
    original_vocab.add_token_to_namespace("cat", namespace="tokens0")    # index:4
    original_vocab.add_token_to_namespace("apple", namespace="tokens1")  # index:0
    original_vocab.add_token_to_namespace("bat", namespace="tokens1")    # index:1
    original_vocab.add_token_to_namespace("cat", namespace="tokens1")    # index:2
    original_vocab.add_token_to_namespace("a", namespace="tokens2")      # index:2 (padded namespace)
    original_vocab.add_token_to_namespace("b", namespace="tokens2")      # index:3
    original_vocab.add_token_to_namespace("c", namespace="tokens2")      # index:4
    original_vocab.add_token_to_namespace("p", namespace="tokens3")      # index:0
    original_vocab.add_token_to_namespace("q", namespace="tokens3")      # index:1
    original_vocab.save_to_files(vocab_dir)

    text_field0 = TextField([Token(t) for t in ["cat", "an", "apple", "banana", "atom", "bat"]],
                            {"tokens0": SingleIdTokenIndexer("tokens0")})
    text_field1 = TextField([Token(t) for t in ["cat", "an", "apple", "banana", "atom", "bat"]],
                            {"tokens1": SingleIdTokenIndexer("tokens1")})
    text_field4 = TextField([Token(t) for t in ["l", "m", "n", "o"]],
                            {"tokens4": SingleIdTokenIndexer("tokens4")})
    text_field5 = TextField([Token(t) for t in ["x", "y", "z"]],
                            {"tokens5": SingleIdTokenIndexer("tokens5")})
    instances = Batch([Instance({"text0": text_field0, "text1": text_field1,
                                 "text4": text_field4, "text5": text_field5})])

    params = Params({"directory_path": vocab_dir,
                     "extend": True,
                     "non_padded_namespaces": ["tokens1", "tokens5"]})
    extended_vocab = Vocabulary.from_params(params, instances)

    # namespaces: tokens0 and tokens1 are common;
    # tokens2, tokens3 are only in the original vocab; tokens4, tokens5 only in the instances.
    extended_namespaces = {*extended_vocab._token_to_index}
    assert extended_namespaces == {"tokens{}".format(i) for i in range(6)}

    # Check that the _non_padded_namespaces set is consistent after extension.
    assert extended_vocab._non_padded_namespaces == {"tokens1", "tokens3", "tokens5"}

    # original_vocab["tokens1"] has 3 tokens; the instances add 6 tokens, 3 of which overlap,
    # so the extended namespace has 6 tokens in total.
    assert extended_vocab.get_vocab_size("tokens1") == 6
    assert extended_vocab.get_vocab_size("tokens0") == 8  # the same 6 words plus padding and OOV, because tokens0 is padded

    # namespaces tokens2 and tokens3 were only in original_vocab,
    # so their token counts should be unchanged in extended_vocab.
    assert extended_vocab.get_vocab_size("tokens2") == original_vocab.get_vocab_size("tokens2")
    assert extended_vocab.get_vocab_size("tokens3") == original_vocab.get_vocab_size("tokens3")

    # namespaces tokens4 and tokens5 were only in the instances.
    assert extended_vocab.get_vocab_size("tokens4") == 6  # l, m, n, o + oov + padding
    assert extended_vocab.get_vocab_size("tokens5") == 3  # x, y, z

    # The word-to-index mapping of all words in all namespaces of original_vocab
    # should be maintained in extended_vocab.
    for namespace, token2index in original_vocab._token_to_index.items():
        for token, _ in token2index.items():
            vocab_index = original_vocab.get_token_index(token, namespace)
            extended_vocab_index = extended_vocab.get_token_index(token, namespace)
            assert vocab_index == extended_vocab_index

    # And the same for the index-to-word mapping.
    for namespace, index2token in original_vocab._index_to_token.items():
        for index, _ in index2token.items():
            vocab_token = original_vocab.get_token_from_index(index, namespace)
            extended_vocab_token = extended_vocab.get_token_from_index(index, namespace)
            assert vocab_token == extended_vocab_token
import lineflow.datasets as lfds SOURCE_FIELD_NAME = 'source_tokens' TARGET_FIELD_NAME = 'target_tokens' if __name__ == '__main__': print('Reading...') train = lfds.SmallParallelEnJa('train') \ .to_allennlp(source_field_name=SOURCE_FIELD_NAME, target_field_name=TARGET_FIELD_NAME).all() validation = lfds.SmallParallelEnJa('dev') \ .to_allennlp(source_field_name=SOURCE_FIELD_NAME, target_field_name=TARGET_FIELD_NAME).all() if not osp.exists('./enja_vocab'): print('Building vocabulary...') vocab = Vocabulary.from_instances(train + validation, max_vocab_size=50000) print(f'Vocab Size: {vocab.get_vocab_size()}') print('Saving...') vocab.save_to_files('./enja_vocab') else: print('Loading vocabulary...') vocab = Vocabulary.from_files('./enja_vocab') iterator = BucketIterator(sorting_keys=[(SOURCE_FIELD_NAME, 'num_tokens')], batch_size=32) iterator.index_with(vocab) num_batches = iterator.get_num_batches(train) for batch in Tqdm.tqdm(iterator(train, num_epochs=1), total=num_batches):
def token_to_indices(self, token: Token, vocabulary: Vocabulary) -> int:
    tag = token.ent_type_
    if tag is None:
        tag = 'NONE'
    return vocabulary.get_token_index(tag, self._namespace)
def from_params( cls, params: Params, serialization_dir: str, recover: bool = False, cache_directory: str = None, cache_prefix: str = None, ) -> "TrainerPieces": all_datasets = training_util.datasets_from_params( params, cache_directory, cache_prefix) datasets_for_vocab_creation = set( params.pop("datasets_for_vocab_creation", all_datasets)) for dataset in datasets_for_vocab_creation: if dataset not in all_datasets: raise ConfigurationError( f"invalid 'dataset_for_vocab_creation' {dataset}") logger.info( "From dataset instances, %s will be considered for vocabulary creation.", ", ".join(datasets_for_vocab_creation), ) if recover and os.path.exists( os.path.join(serialization_dir, "vocabulary")): vocab_params = params.pop("vocabulary", {}) vocab = Vocabulary.from_files( os.path.join(serialization_dir, "vocabulary"), vocab_params.get("padding_token", None), vocab_params.get("oov_token", None), ) else: vocab = Vocabulary.from_params( params.pop("vocabulary", {}), # Using a generator comprehension here is important # because, being lazy, it allows us to not iterate over the # dataset when directory_path is specified. (instance for key, dataset in all_datasets.items() if key in datasets_for_vocab_creation for instance in dataset), ) model = Model.from_params(vocab=vocab, params=params.pop("model")) # If vocab extension is ON for training, embedding extension should also be # done. If vocab and embeddings are already in sync, it would be a no-op. model.extend_embedder_vocab() # Initializing the model can have side effect of expanding the vocabulary vocab.save_to_files(os.path.join(serialization_dir, "vocabulary")) iterator = DataIterator.from_params(params.pop("iterator")) iterator.index_with(model.vocab) validation_iterator_params = params.pop("validation_iterator", None) if validation_iterator_params: validation_iterator = DataIterator.from_params( validation_iterator_params) validation_iterator.index_with(model.vocab) else: validation_iterator = None train_data = all_datasets["train"] validation_data = all_datasets.get("validation") test_data = all_datasets.get("test") trainer_params = params.pop("trainer") no_grad_regexes = trainer_params.pop("no_grad", ()) for name, parameter in model.named_parameters(): if any(re.search(regex, name) for regex in no_grad_regexes): parameter.requires_grad_(False) frozen_parameter_names, tunable_parameter_names = get_frozen_and_tunable_parameter_names( model) logger.info("Following parameters are Frozen (without gradient):") for name in frozen_parameter_names: logger.info(name) logger.info("Following parameters are Tunable (with gradient):") for name in tunable_parameter_names: logger.info(name) return cls( model, iterator, train_data, validation_data, test_data, validation_iterator, trainer_params, )
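# A hedged sketch of the slice of a training config that the method above consumes for
# vocabulary creation. "datasets_for_vocab_creation" and "vocabulary" are the keys popped
# in the code; the nested values mirror Vocabulary params used elsewhere in this file and
# are purely illustrative. The "model", "iterator", and "trainer" sections are omitted here.
vocab_related_params = Params({
    "datasets_for_vocab_creation": ["train"],
    "vocabulary": {
        "max_vocab_size": {"tokens": 50000},
        "non_padded_namespaces": ["labels"],
    },
})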
do_lowercase=True, ) # %% reader = ClaimAnnotationReaderJSON( token_indexers={"tokens": token_indexer} ) train_dataset = reader.read(TRAIN_PATH) validation_dataset = reader.read(VALIDATION_PATH) test_dataset = reader.read(TEST_PATH) # %% vocab = Vocabulary() vocab._token_to_index['labels'] = {'0': 0, '1': 1} # %% """Prepare iterator""" from allennlp.data.iterators import BasicIterator iterator = BasicIterator(batch_size=64) iterator.index_with(vocab) # %% def multiple_target_CrossEntropyLoss(logits, labels):
def main():
    # load the binary SST dataset.
    single_id_indexer = SingleIdTokenIndexer(lowercase_tokens=True)  # word tokenizer
    # use_subtrees gives us a bit of extra data by breaking down each example into sub sentences.
    reader = StanfordSentimentTreeBankDatasetReader(granularity="2-class",
                                                    token_indexers={"tokens": single_id_indexer},
                                                    use_subtrees=True)
    train_data = reader.read('https://s3-us-west-2.amazonaws.com/allennlp/datasets/sst/train.txt')
    reader = StanfordSentimentTreeBankDatasetReader(granularity="2-class",
                                                    token_indexers={"tokens": single_id_indexer})
    dev_data = reader.read('https://s3-us-west-2.amazonaws.com/allennlp/datasets/sst/dev.txt')
    # test_dataset = reader.read('data/sst/test.txt')

    vocab = Vocabulary.from_instances(train_data)

    # Randomly initialize vectors
    if EMBEDDING_TYPE == "None":
        token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                    embedding_dim=300)
        word_embedding_dim = 300
    # Load word2vec vectors
    elif EMBEDDING_TYPE == "w2v":
        embedding_path = "https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip"
        weight = _read_pretrained_embeddings_file(embedding_path,
                                                  embedding_dim=300,
                                                  vocab=vocab,
                                                  namespace="tokens")
        token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                    embedding_dim=300,
                                    weight=weight,
                                    trainable=False)
        word_embedding_dim = 300

    # Initialize model, cuda(), and optimizer
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    encoder = PytorchSeq2VecWrapper(torch.nn.LSTM(word_embedding_dim,
                                                  hidden_size=512,
                                                  num_layers=2,
                                                  batch_first=True))
    model = LstmClassifier(word_embeddings, encoder, vocab)
    model.cuda()

    # where to save the model
    model_path = "/tmp/" + EMBEDDING_TYPE + "_" + "model.th"
    vocab_path = "/tmp/" + EMBEDDING_TYPE + "_" + "vocab"
    # if the model already exists (it's been trained), load the pre-trained weights and vocabulary
    if os.path.isfile(model_path):
        vocab = Vocabulary.from_files(vocab_path)
        model = LstmClassifier(word_embeddings, encoder, vocab)
        with open(model_path, 'rb') as f:
            model.load_state_dict(torch.load(f))
    # otherwise train model from scratch and save its weights
    else:
        iterator = BucketIterator(batch_size=32, sorting_keys=[("tokens", "num_tokens")])
        iterator.index_with(vocab)
        optimizer = optim.Adam(model.parameters())
        trainer = Trainer(model=model,
                          optimizer=optimizer,
                          iterator=iterator,
                          train_dataset=train_data,
                          validation_dataset=dev_data,
                          num_epochs=5,
                          patience=1,
                          cuda_device=0)
        trainer.train()
        with open(model_path, 'wb') as f:
            torch.save(model.state_dict(), f)
        vocab.save_to_files(vocab_path)
    model.train().cuda()  # rnn cannot do backwards in train mode

    # Register a gradient hook on the embeddings. This saves the gradient w.r.t. the word embeddings.
    # We use the gradient later in the attack.
    utils.add_hooks(model)
    embedding_weight = utils.get_embedding_weight(model)  # also save the word embedding matrix

    # Use batches of size universal_perturb_batch_size for the attacks.
    universal_perturb_batch_size = 128
    iterator = BasicIterator(batch_size=universal_perturb_batch_size)
    iterator.index_with(vocab)

    # Build k-d Tree if you are using gradient + nearest neighbor attack
    # tree = KDTree(embedding_weight.numpy())

    # filter the dataset to only positive or negative examples
    # (the trigger will cause the opposite prediction)
    dataset_label_filter = "0"
    targeted_dev_data = []
    for instance in dev_data:
        if instance['label'].label == dataset_label_filter:
            targeted_dev_data.append(instance)

    # get accuracy before adding triggers
    utils.get_accuracy(model, targeted_dev_data, vocab, trigger_token_ids=None)
    model.train()  # rnn cannot do backwards in train mode

    # initialize triggers which are concatenated to the input
    num_trigger_tokens = 3
    trigger_token_ids = [vocab.get_token_index("the")] * num_trigger_tokens

    # sample batches, update the triggers, and repeat
    for batch in lazy_groups_of(iterator(targeted_dev_data, num_epochs=5, shuffle=True), group_size=1):
        # get accuracy with current triggers
        utils.get_accuracy(model, targeted_dev_data, vocab, trigger_token_ids)
        model.train()  # rnn cannot do backwards in train mode

        # get gradient w.r.t. trigger embeddings for current batch
        averaged_grad = utils.get_average_grad(model, batch, trigger_token_ids)
        # pass the gradients to a particular attack to generate token candidates for each token.
        cand_trigger_token_ids = attacks.hotflip_attack(averaged_grad,
                                                        embedding_weight,
                                                        trigger_token_ids,
                                                        num_candidates=40,
                                                        increase_loss=True)
        # cand_trigger_token_ids = attacks.random_attack(embedding_weight,
        #                                                trigger_token_ids,
        #                                                num_candidates=40)
        # cand_trigger_token_ids = attacks.nearest_neighbor_grad(averaged_grad,
        #                                                        embedding_weight,
        #                                                        trigger_token_ids,
        #                                                        tree,
        #                                                        100,
        #                                                        num_candidates=40,
        #                                                        increase_loss=True)
        # Tries all of the candidates and returns the trigger sequence with highest loss.
        trigger_token_ids = utils.get_best_candidates(model,
                                                      batch,
                                                      trigger_token_ids,
                                                      cand_trigger_token_ids)
    # print accuracy after adding triggers
    utils.get_accuracy(model, targeted_dev_data, vocab, trigger_token_ids)
def add_task(self, task_tag: str, vocab: Vocabulary):
    self.classification_layers.append(
        torch.nn.Linear(in_features=self.hidden_dim,
                        out_features=vocab.get_vocab_size('labels')))
    self.num_task = self.num_task + 1
    self.task2id[task_tag] = self.num_task
    self.tasks_vocabulary[task_tag] = vocab
def index(self, vocab: Vocabulary):
    if self._indexed_labels is None and self.labels is not None:
        self._indexed_labels = [vocab.get_token_index(label, self._label_namespace)
                                for label in self.labels]
def main():
    trainFile = "../srcData/trainData.csv"
    validFile = "../srcData/devData.csv"
    testFile = "../srcData/testData.csv"
    trainSeq2SeqFile = data.dataPreparation(trainFile)
    validSeq2SeqFile = data.dataPreparation(validFile)
    testSeq2SeqFile = data.dataPreparation(testFile)
    print(testSeq2SeqFile)

    # A TokenIndexer determines how string tokens get represented as arrays of indices in a model.
    # SingleIdTokenIndexer = each token is a single integer
    # TokenCharactersIndexer = each token is a list of character integers
    # Read a tsv file with paired instances (source, target)
    reader = Seq2SeqDatasetReader(
        source_tokenizer=WordTokenizer(),
        target_tokenizer=WordTokenizer(),  # Defaults to source_tokenizer
        source_token_indexers={'tokens': SingleIdTokenIndexer()},
        target_token_indexers={'tokens': SingleIdTokenIndexer()}  # Defaults to source_token_indexers
    )

    # Each dataset is a list of instances with (source_tokens, target_tokens) fields
    train_dataset = reader.read(trainSeq2SeqFile)
    validation_dataset = reader.read(validSeq2SeqFile)
    test_dataset = reader.read(testSeq2SeqFile)

    # Finding extra fact2 vocab
    trainExtraVocab = findExtraVocab(train_dataset)
    validExtraVocab = findExtraVocab(validation_dataset)
    testExtraVocab = findExtraVocab(test_dataset)
    finalExtraVocab = list(set(trainExtraVocab + validExtraVocab + testExtraVocab))
    print("length:", len(finalExtraVocab))
    # input()

    # vocab = Vocabulary.from_instances(train_dataset + validation_dataset,
    #                                   min_count={'tokens': 3, 'target_tokens': 3})
    vocab = Vocabulary.from_instances(train_dataset + validation_dataset + test_dataset)
    # Train + Valid = 9703
    # Train + Valid + Test = 10099
    print("Vocab Size:", vocab.get_vocab_size('tokens'))

    encEmbedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                             embedding_dim=ENC_EMBEDDING_DIM)
    # Embedding for tokens, since the 'tokens' namespace was used at dataset creation time
    source_embedder = BasicTextFieldEmbedder({"tokens": encEmbedding})
    encoder = PytorchSeq2SeqWrapper(torch.nn.LSTM(ENC_EMBEDDING_DIM, HIDDEN_DIM,
                                                  batch_first=True, dropout=0.2))
    attention = DotProductAttention()

    max_decoding_steps = 4  # TODO: make this variable
    model = SimpleSeq2Seq(vocab, source_embedder, encoder, max_decoding_steps,
                          target_embedding_dim=TGT_EMBEDDING_DIM,
                          # target_namespace='target_tokens',
                          attention=attention,
                          beam_size=beamSize,
                          use_bleu=True,
                          extra_vocab=finalExtraVocab)
    # Can also specify lr=0.001
    optimizer = optim.Adam(model.parameters())

    # Data iterator that specifies how to batch our dataset:
    # takes the data, shuffles it, and creates fixed-sized batches
    # iterator = BasicIterator(batch_size=2)
    # iterator.index_with(vocab)
    # Pads batches w.r.t. the max input length per batch, and sorts the dataset by the
    # field names and padding keys provided, for efficient computation
    iterator = BucketIterator(batch_size=50, sorting_keys=[("source_tokens", "num_tokens")])
    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      # patience=3,
                      num_epochs=numEpochs,
                      cuda_device=CUDA_DEVICE)
    trainer.train()

    predictor = SimpleSeq2SeqPredictor(model, reader)

    '''
    for i in range(2):
        print("Epoch: {}".format(i))
        trainer.train()
        predictor = SimpleSeq2SeqPredictor(model, reader)
        for instance in itertools.islice(validation_dataset, 10):
            print('SOURCE:', instance.fields['source_tokens'].tokens)
            print('GOLD:', instance.fields['target_tokens'].tokens)
            print('PRED:', predictor.predict_instance(instance)['predicted_tokens'])
            """
            {'predictions': [[1, 4, 5, 92, 8, 6, 1, 8, 6, 26, 3]],
             'loss': 5.9835076332092285,
             'class_log_probabilities': [-20.10894012451172],
             'predicted_tokens': ['@@UNKNOWN@@', 'is', 'a', 'type', 'of', 'the', '@@UNKNOWN@@', 'of', 'the', 'sun']}
            """
            print(predictor.predict_instance(instance))
    '''

    outFile = open("output_" + str(HIDDEN_DIM) + "_" + str(numEpochs) + "_" + str(beamSize) + ".csv", "w")
    writer = csv.writer(outFile, delimiter="\t")
    for instance in itertools.islice(test_dataset, 500):
        src = instance.fields['source_tokens'].tokens
        gold = instance.fields['target_tokens'].tokens
        pred = predictor.predict_instance(instance)['predicted_tokens']
        writer.writerow([src, gold, pred])
    outFile.close()
def __init__(
    self,
    vocab: Vocabulary,
    source_embedder: TextFieldEmbedder,
    encoder: Seq2SeqEncoder,
    vecoder: Seq2VecEncoder,
    sen_encoder: Seq2VecEncoder,
    max_decoding_steps: int = 32,
    attention: Attention = None,
    beam_size: int = None,
    target_namespace: str = "tokens",
    scheduled_sampling_ratio: float = 0.5,
) -> None:
    super().__init__(vocab)
    self._target_namespace = target_namespace
    self._scheduled_sampling_ratio = scheduled_sampling_ratio  # Maybe we can try
    self._start_index = self.vocab.get_token_index(START_SYMBOL, self._target_namespace)
    self._end_index = self.vocab.get_token_index(END_SYMBOL, self._target_namespace)
    self.pad_index = self.vocab.get_token_index(self.vocab._padding_token, self._target_namespace)
    self._max_decoding_steps = max_decoding_steps
    self.vocab = vocab

    # anything about dims
    self.sen_num = 10

    # with open('../data/0510/cy/kg_and_train.pk', 'rb') as f:
    with open('cy/openkg.pk', 'rb') as f:
        self.kg_mat = torch.tensor(pickle.load(f)).float()
    self.symp_mat = torch.nn.Parameter(self.kg_mat).cuda()
    self.evovl_mat = torch.zeros(len(self.kg_mat), len(self.kg_mat)).cuda()

    # with open('../data/0510/cy/comp_topic2num.pk', 'rb') as f:
    with open('cy/comp_topic2num.pk', 'rb') as f:
        self.word_idx = pickle.load(f)
    self.idx_word = {v: k for k, v in self.word_idx.items()}
    self.vocab_to_idx = {}
    self.idx_to_vocab_list = []
    self.vocab_list = []
    for word, k in self.word_idx.items():
        self.vocab_to_idx[vocab.get_token_index(word.strip())] = k
        self.idx_to_vocab_list.append(vocab.get_token_index(word.strip()))
    self.symp_size = len(self.symp_mat) + self.sen_num
    self.topic = len(self.symp_mat)
    self._encoder = encoder
    self._vecoder = vecoder
    self._sen_encoder = sen_encoder
    self.outfeature = self._sen_encoder.get_output_dim()

    # anything about graph
    self.symp_state = torch.nn.Parameter(torch.Tensor(self.symp_size, self.outfeature))
    torch.nn.init.xavier_uniform_(self.symp_state, gain=1.414)
    self.predict_layer = torch.nn.Parameter(torch.Tensor(self.symp_size, self.outfeature))
    self.predict_bias = torch.nn.Parameter(torch.Tensor(self.symp_size))
    torch.nn.init.kaiming_uniform_(self.predict_layer)
    torch.nn.init.uniform_(self.predict_bias, -1 / self.symp_size**0.5, 1 / self.symp_size**0.5)
    self.attn_one = GATAttention(self.outfeature, self.outfeature, 1)
    self.attn_two = GATAttention(self.outfeature, self.outfeature, 1)
    self.attn_three = GATAttention(self.outfeature, self.outfeature, 1)

    # Metric
    self.kd_metric = KD_Metric()
    self.bleu_aver = NLTK_BLEU(ngram_weights=(0.25, 0.25, 0.25, 0.25))
    self.bleu1 = NLTK_BLEU(ngram_weights=(1, 0, 0, 0))
    self.bleu2 = NLTK_BLEU(ngram_weights=(0, 1, 0, 0))
    self.bleu4 = NLTK_BLEU(ngram_weights=(0, 0, 0, 1))
    self.topic_acc = Average()

    # anything about module
    self._source_embedder = source_embedder
    num_classes = self.vocab.get_vocab_size(self._target_namespace)
    target_embedding_dim = source_embedder.get_output_dim()
    self._target_embedder = Embedding(num_classes, target_embedding_dim)
    self._encoder_output_dim = self._encoder.get_output_dim()  # 600; maybe just change the first two to outfeature as well
    self._decoder_output_dim = self._encoder_output_dim * 2
    self._decoder_input_dim = target_embedding_dim
    self._attention = None
    if attention:
        self._attention = attention
        self._decoder_input_dim = self._decoder_output_dim + target_embedding_dim  # Try fusing that embedding in here?
    self.before_linear = Linear(2 * self.outfeature, self.outfeature)
    self._decoder_cell = LSTMCell(self._decoder_input_dim, self._decoder_output_dim)
    self._output_projection_layer = Linear(self.outfeature * 2, num_classes)
    self.linear_all = Linear(self.outfeature * 3 + self._decoder_input_dim, 1)
    self.attention_linear = Linear(self.outfeature, self.outfeature)
    self.decoder_linear = Linear(self.outfeature * 2, self.outfeature)
    self.get_attn = Linear(self.outfeature, 1, bias=False)
    self.topic_acc = MyAverage()
    self.topic_rec = MyAverage()
    self.topic_f1 = F1()
    self.dink1 = Distinct1()
    self.dink2 = Distinct2()
    self.last_sen = 2
    self.clac_num = 0
def test_valid_vocab_extension(self): vocab_dir = self.TEST_DIR / 'vocab_save' extension_ways = ["from_params", "extend_from_instances"] # Test: padded/non-padded common namespaces are extending appropriately non_padded_namespaces_list = [[], ["tokens"]] for non_padded_namespaces in non_padded_namespaces_list: original_vocab = Vocabulary(non_padded_namespaces=non_padded_namespaces) original_vocab.add_token_to_namespace("d", namespace="tokens") original_vocab.add_token_to_namespace("a", namespace="tokens") original_vocab.add_token_to_namespace("b", namespace="tokens") text_field = TextField([Token(t) for t in ["a", "d", "c", "e"]], {"tokens": SingleIdTokenIndexer("tokens")}) instances = Batch([Instance({"text": text_field})]) for way in extension_ways: if way == "extend_from_instances": extended_vocab = copy.copy(original_vocab) params = Params({"non_padded_namespaces": non_padded_namespaces}) extended_vocab.extend_from_instances(params, instances) else: shutil.rmtree(vocab_dir, ignore_errors=True) original_vocab.save_to_files(vocab_dir) params = Params({"directory_path": vocab_dir, "extend": True, "non_padded_namespaces": non_padded_namespaces}) extended_vocab = Vocabulary.from_params(params, instances) extra_count = 2 if extended_vocab.is_padded("tokens") else 0 assert extended_vocab.get_token_index("d", "tokens") == 0 + extra_count assert extended_vocab.get_token_index("a", "tokens") == 1 + extra_count assert extended_vocab.get_token_index("b", "tokens") == 2 + extra_count assert extended_vocab.get_token_index("c", "tokens") # should be present assert extended_vocab.get_token_index("e", "tokens") # should be present assert extended_vocab.get_vocab_size("tokens") == 5 + extra_count # Test: padded/non-padded non-common namespaces are extending appropriately non_padded_namespaces_list = [[], ["tokens1"], ["tokens1", "tokens2"]] for non_padded_namespaces in non_padded_namespaces_list: original_vocab = Vocabulary(non_padded_namespaces=non_padded_namespaces) original_vocab.add_token_to_namespace("a", namespace="tokens1") # index2 text_field = TextField([Token(t) for t in ["b"]], {"tokens2": SingleIdTokenIndexer("tokens2")}) instances = Batch([Instance({"text": text_field})]) for way in extension_ways: if way == "extend_from_instances": extended_vocab = copy.copy(original_vocab) params = Params({"non_padded_namespaces": non_padded_namespaces}) extended_vocab.extend_from_instances(params, instances) else: shutil.rmtree(vocab_dir, ignore_errors=True) original_vocab.save_to_files(vocab_dir) params = Params({"directory_path": vocab_dir, "extend": True, "non_padded_namespaces": non_padded_namespaces}) extended_vocab = Vocabulary.from_params(params, instances) # Should have two namespaces assert len(extended_vocab._token_to_index) == 2 extra_count = 2 if extended_vocab.is_padded("tokens1") else 0 assert extended_vocab.get_vocab_size("tokens1") == 1 + extra_count extra_count = 2 if extended_vocab.is_padded("tokens2") else 0 assert extended_vocab.get_vocab_size("tokens2") == 1 + extra_count
def test_from_params(self): # Save a vocab to check we can load it from_params. vocab_dir = self.TEST_DIR / 'vocab_save' vocab = Vocabulary(non_padded_namespaces=["a", "c"]) vocab.add_token_to_namespace("a0", namespace="a") # non-padded, should start at 0 vocab.add_token_to_namespace("a1", namespace="a") vocab.add_token_to_namespace("a2", namespace="a") vocab.add_token_to_namespace("b2", namespace="b") # padded, should start at 2 vocab.add_token_to_namespace("b3", namespace="b") vocab.save_to_files(vocab_dir) params = Params({"directory_path": vocab_dir}) vocab2 = Vocabulary.from_params(params) assert vocab.get_index_to_token_vocabulary("a") == vocab2.get_index_to_token_vocabulary("a") assert vocab.get_index_to_token_vocabulary("b") == vocab2.get_index_to_token_vocabulary("b") # Test case where we build a vocab from a dataset. vocab2 = Vocabulary.from_params(Params({}), self.dataset) assert vocab2.get_index_to_token_vocabulary("tokens") == {0: '@@PADDING@@', 1: '@@UNKNOWN@@', 2: 'a', 3: 'c', 4: 'b'} # Test from_params raises when we have neither a dataset nor a vocab_directory. with pytest.raises(ConfigurationError): _ = Vocabulary.from_params(Params({})) # Test from_params raises when there are any other dict keys # present apart from 'directory_path' and we aren't calling from_dataset. with pytest.raises(ConfigurationError): _ = Vocabulary.from_params(Params({"directory_path": vocab_dir, "min_count": {'tokens': 2}}))
WORD2VEC = load_w2v(args.word2vec, VOCAB) print( f'Loaded {len(WORD2VEC)} words. Coverage: { len(WORD2VEC) / len(VOCAB)*100:.2f}%' ) elif args.rank_func == 'bert': import torch from torch.nn import functional as F from pytorch_pretrained_bert.modeling import BertForNextSentencePrediction from allennlp.data import Instance from allennlp.data.dataset import Batch from allennlp.data.fields import TextField from allennlp.data.tokenizers import WordTokenizer, Token from allennlp.data.tokenizers.word_splitter import BertBasicWordSplitter from allennlp.data.token_indexers.wordpiece_indexer import PretrainedBertIndexer from allennlp.data.vocabulary import Vocabulary print('Initialize BERT model...') TOKENIZER = WordTokenizer(word_splitter=BertBasicWordSplitter()) WORD_INDEXER = PretrainedBertIndexer(pretrained_model=args.bert_vocab) VOCAB = Vocabulary() GPU_ID = args.gpu_id BERT_NEXT_SENTENCE = BertForNextSentencePrediction.from_pretrained( args.bert_model).to(torch.device(f"cuda:{GPU_ID}")) BERT_NEXT_SENTENCE.eval() main() if args.rank_func == 'sentenc': SESSION.close()
vocab = Vocabulary() for ns in ["tokens", "token_in", "token_out"]: for chord in itertools.product(note_list, accidental_list, chord_type_list): vocab.add_token_to_namespace("".join(chord), namespace=ns) vocab.add_token_to_namespace(START_SYMBOL, namespace=ns) vocab.add_token_to_namespace(END_SYMBOL, namespace=ns) key_list = [ "".join(x) for x in itertools.product(note_list, accidental_list) ] form_list = ["m", "+", "o", "M", "%", "It", "Ger", "Fr"] figbass_list = ["7", "6"] for char in (key_list + form_list + figbass_list): vocab.add_token_to_namespace(char, namespace="token_characters") note_number_list = [str(x) for x in range(12)] for note_number in note_number_list: vocab.add_token_to_namespace(note_number, namespace="notes") vocab.save_to_files("data/vocabulary") generate_vocab() vocab = Vocabulary.from_files("data/vocabulary") print(vocab.get_token_to_index_vocabulary())
def test_from_params_adds_tokens_to_vocab(self): vocab = Vocabulary.from_params(Params({'tokens_to_add': {'tokens': ['q', 'x', 'z']}}), self.dataset) assert vocab.get_index_to_token_vocabulary("tokens") == {0: '@@PADDING@@', 1: '@@UNKNOWN@@', 2: 'a', 3: 'c', 4: 'b', 5: 'q', 6: 'x', 7: 'z'}
def __init__(self, word_embeddings: TextFieldEmbedder, encoder: Seq2VecEncoder, vocab: Vocabulary) -> None: super().__init__(vocab) self.word_embedding = word_embeddings self.encoder = encoder self.hidden2out = torch.nn.Linear(in_features=encoder.get_output_dim(), out_features=vocab.get_vocab_size("labels")) self.accuracy = MicroMetrics(vocab) self.lstm = nn.LSTM(input_size=word_embeddings.get_output_dim(), hidden_size=128, num_layers=1, batch_first=True) self.label_index_to_label = self.vocab.get_index_to_token_vocabulary('labels')
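# A minimal forward() sketch that could pair with the __init__ above. This is an assumption,
# not the author's actual implementation: the field names "tokens"/"label", the use of
# self.encoder for pooling, and the cross-entropy loss are all assumed; self.lstm and
# self.accuracy are left out for brevity.
from typing import Dict
import torch
from allennlp.nn.util import get_text_field_mask

def forward(self, tokens: Dict[str, torch.Tensor], label: torch.Tensor = None) -> Dict[str, torch.Tensor]:
    mask = get_text_field_mask(tokens)        # (batch, num_tokens)
    embedded = self.word_embedding(tokens)    # (batch, num_tokens, embedding_dim)
    encoded = self.encoder(embedded, mask)    # (batch, encoder_output_dim)
    logits = self.hidden2out(encoded)         # (batch, num_labels)
    output = {"logits": logits}
    if label is not None:
        output["loss"] = torch.nn.functional.cross_entropy(logits, label)
    return output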
def test_saving_and_loading(self): # pylint: disable=protected-access vocab_dir = self.TEST_DIR / 'vocab_save' vocab = Vocabulary(non_padded_namespaces=["a", "c"]) vocab.add_token_to_namespace("a0", namespace="a") # non-padded, should start at 0 vocab.add_token_to_namespace("a1", namespace="a") vocab.add_token_to_namespace("a2", namespace="a") vocab.add_token_to_namespace("b2", namespace="b") # padded, should start at 2 vocab.add_token_to_namespace("b3", namespace="b") vocab.save_to_files(vocab_dir) vocab2 = Vocabulary.from_files(vocab_dir) assert vocab2._non_padded_namespaces == {"a", "c"} # Check namespace a. assert vocab2.get_vocab_size(namespace='a') == 3 assert vocab2.get_token_from_index(0, namespace='a') == 'a0' assert vocab2.get_token_from_index(1, namespace='a') == 'a1' assert vocab2.get_token_from_index(2, namespace='a') == 'a2' assert vocab2.get_token_index('a0', namespace='a') == 0 assert vocab2.get_token_index('a1', namespace='a') == 1 assert vocab2.get_token_index('a2', namespace='a') == 2 # Check namespace b. assert vocab2.get_vocab_size(namespace='b') == 4 # (unk + padding + two tokens) assert vocab2.get_token_from_index(0, namespace='b') == vocab._padding_token assert vocab2.get_token_from_index(1, namespace='b') == vocab._oov_token assert vocab2.get_token_from_index(2, namespace='b') == 'b2' assert vocab2.get_token_from_index(3, namespace='b') == 'b3' assert vocab2.get_token_index(vocab._padding_token, namespace='b') == 0 assert vocab2.get_token_index(vocab._oov_token, namespace='b') == 1 assert vocab2.get_token_index('b2', namespace='b') == 2 assert vocab2.get_token_index('b3', namespace='b') == 3 # Check the dictionaries containing the reverse mapping are identical. assert vocab.get_index_to_token_vocabulary("a") == vocab2.get_index_to_token_vocabulary("a") assert vocab.get_index_to_token_vocabulary("b") == vocab2.get_index_to_token_vocabulary("b")
def main(): # Initializing the embeddings (BERT) bert_token_indexer = PretrainedBertIndexer( pretrained_model="./biobert_pubmed/vocab.txt", max_pieces=config.max_seq_len, do_lowercase=True, ) reader = BertAnalogyDatasetReader( tokenizer=bert_tokenizer, token_indexers={'tokens': bert_token_indexer}) train_dataset, test_dataset, dev_dataset = ( reader.read(DATA_ROOT + "/" + fname) for fname in ["train_all.txt", "test_all.txt", "val_all.txt"]) vocab = Vocabulary.from_instances(train_dataset + test_dataset + dev_dataset) bert_embedder = PretrainedBertEmbedder( pretrained_model='biobert_pubmed', top_layer_only=True, # conserve memory ) word_embeddings: TextFieldEmbedder = BasicTextFieldEmbedder( {"tokens": bert_embedder}, # we'll be ignoring masks so we'll need to set this to True allow_unmatched_keys=True) BERT_DIM = word_embeddings.get_output_dim() class BertSentencePooler(Seq2VecEncoder): def forward(self, embs: torch.Tensor, mask: torch.Tensor = None) -> torch.Tensor: # extract first token tensor return embs[:, 0] @overrides def get_output_dim(self) -> int: return BERT_DIM # Initializing the model # takes the hidden state at the last time step of the LSTM for every layer as one single output bert_encoder = BertSentencePooler(vocab) model = LstmModel(word_embeddings, bert_encoder, vocab) if USE_GPU: model.cuda() # Training the model optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5) iterator = BucketIterator(batch_size=32, sorting_keys=[("tokens", "num_tokens")]) iterator.index_with(vocab) trainer = Trainer(model=model, optimizer=optimizer, iterator=iterator, train_dataset=train_dataset, validation_dataset=dev_dataset, patience=10, cuda_device=0 if USE_GPU else -1, num_epochs=20) trainer.train() # Saving the model with open("biobert/model.th", 'wb') as f: torch.save(model.state_dict(), f) vocab.save_to_files("biobert/vocabulary") return vocab
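# Hedged follow-up sketch (not part of the original main()): reloading the artifacts that
# main() saves above. It assumes word_embeddings, bert_encoder, and LstmModel are constructed
# exactly as in main(); only the paths "biobert/model.th" and "biobert/vocabulary" come from the source.
import torch
from allennlp.data.vocabulary import Vocabulary

vocab = Vocabulary.from_files("biobert/vocabulary")
model = LstmModel(word_embeddings, bert_encoder, vocab)
with open("biobert/model.th", 'rb') as f:
    model.load_state_dict(torch.load(f, map_location='cpu'))
model.eval()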
def test_add_word_to_index_gives_consistent_results(self): vocab = Vocabulary() initial_vocab_size = vocab.get_vocab_size() word_index = vocab.add_token_to_namespace("word") assert "word" in vocab.get_index_to_token_vocabulary().values() assert vocab.get_token_index("word") == word_index assert vocab.get_token_from_index(word_index) == "word" assert vocab.get_vocab_size() == initial_vocab_size + 1 # Now add it again, and make sure nothing changes. vocab.add_token_to_namespace("word") assert "word" in vocab.get_index_to_token_vocabulary().values() assert vocab.get_token_index("word") == word_index assert vocab.get_token_from_index(word_index) == "word" assert vocab.get_vocab_size() == initial_vocab_size + 1
def index(self, vocab: Vocabulary): self._mapping_array = [ vocab.get_token_index(x.text, self._target_namespace) for x in self._source_tokens ]
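# Hypothetical usage sketch for the index() method above. The field class name
# SourceTokensField and its constructor arguments are assumptions (the real class is not
# shown in this fragment); Vocabulary, Token, and get_token_index are the actual AllenNLP calls.
from allennlp.data.tokenizers import Token
from allennlp.data.vocabulary import Vocabulary

vocab = Vocabulary()
vocab.add_token_to_namespace("copy", namespace="target_tokens")
field = SourceTokensField([Token("copy"), Token("never seen")], target_namespace="target_tokens")
field.index(vocab)
# field._mapping_array now holds target-namespace ids; tokens missing from the namespace
# map to the @@UNKNOWN@@ index, since namespaces are padded by default.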
def main(): logging.basicConfig( level=logging.INFO, format='[%(asctime)s] [%(name)s] [%(levelname)s] %(message)s') parser = argparse.ArgumentParser() parser.add_argument( '--model-name', help='Model\'s name (the name of directory with the trained model)') parser.add_argument( '--pretrained-models-dir', default=None, help='Path to directory with pretrained models (e.g., RuBERT)') parser.add_argument('--models-dir', default='../models', help='Path to directory where the models are stored') parser.add_argument( '--data-dir', default='../data/test_private_data', help='Path to directory with files to apply the model to') parser.add_argument('--predictions-dir', default='../predictions/private', help='Path to directory to store the predictions') parser.add_argument('--batch-size', default=128, type=int) parser.add_argument('--checkpoint-name', default='best.th', help='Name of the checkpoint to use') parser.add_argument('--cuda-unit', default='0', help='CUDA device number') args = parser.parse_args() model_dir = os.path.join(args.models_dir, args.model_name) result_data_dir = args.predictions_dir #result_data_dir = os.path.join(args.predictions_dir, args.model_name) if not os.path.isdir(result_data_dir): os.makedirs(result_data_dir) config = Config.load(os.path.join(model_dir, 'config.json')) if args.models_dir: config.data.models_dir = args.models_dir if args.pretrained_models_dir: config.data.pretrained_models_dir = args.pretrained_models_dir logger.info('Config: %s', config) cuda_dev_name = 'cuda:' + args.cuda_unit device = torch.device( cuda_dev_name if torch.cuda.is_available() else 'cpu:0') #device = torch.device('cpu') vocab = Vocabulary.from_files(os.path.join(model_dir, 'vocab')) lemmatize_helper = LemmatizeHelper.load(model_dir) morpho_vectorizer = MorphoVectorizer( ) if config.embedder.use_pymorphy else None model = _build_model(config, vocab, lemmatize_helper, morpho_vectorizer, bert_max_length=BERT_MAX_LENGTH) model.HeuristicMode = True model.to(device) model.load_state_dict( torch.load(os.path.join(model_dir, args.checkpoint_name), map_location=device)) model.eval() reader = _get_reader(config, skip_labels=True, bert_max_length=BERT_MAX_LENGTH, reader_max_length=None) for root, dirs, files in os.walk(args.data_dir): reroot = root[len(args.data_dir) + 1:] for name in dirs: os.makedirs(os.path.join(result_data_dir, reroot, name)) for name in files: path = os.path.join(root, name) result_path = os.path.join(result_data_dir, reroot, name) if not path.endswith('.conllu'): continue print("PROCESSING: " + path) data = reader.read(path) if morpho_vectorizer is not None: morpho_vectorizer.apply_to_instances(data) with open(result_path, 'w') as f_out: for begin_index in tqdm(range(0, len(data), args.batch_size)): end_index = min(len(data), begin_index + args.batch_size) predictions_list = model.forward_on_instances( data[begin_index:end_index]) for predictions in predictions_list: for token_index in range(len(predictions['words'])): #word = predictions['words'][token_index] word = predictions['original_words'][token_index] lemma = predictions['predicted_lemmas'][ token_index] upos, feats = predictions['predicted_gram_vals'][ token_index].split('|', 1) feats = reorder_grammemes(feats) head_tag = predictions['predicted_dependencies'][ token_index] head_index = predictions['predicted_heads'][ token_index] #print(token_index + 1, word, lemma, upos, '_', feats, head_index, head_tag, '_', '_', sep='\t', file=f_out) tn = predictions['token_nos'][token_index] hn = predictions['token_nos'][ 
head_index - 1] if head_index > 0 else 0 print(tn, word, lemma, upos, '_', feats, hn, head_tag, '_', '_', sep='\t', file=f_out) print(file=f_out)
from bella_allen_nlp.allen_models.target_lstm import TargetLSTMClassifier #token_indexers = {'tokens': SingleIdTokenIndexer(namespace='tokens_id'), # 'chars': TokenCharactersIndexer(namespace='char_id')} token_indexers = {'tokens': SingleIdTokenIndexer(namespace='tokens_id', lowercase_tokens=True)} reader = TargetDatasetReader(token_indexers=token_indexers) train_dataset = reader.read(cached_path( '/home/andrew/.Bella/Datasets/restaurants train')) validation_dataset = reader.read(cached_path( '/home/andrew/.Bella/Datasets/restaurants dev')) target = train_dataset[0].fields['target'] text = train_dataset[0].fields['text'] label = train_dataset[0].fields['label'] vocab = Vocabulary.from_instances(train_dataset + validation_dataset) WORD_EMBEDDING_DIM = 50 CHAR_EMBEDDING_DIM = 5 CHAR_WORD_DIM = 30 HIDDEN_DIM = 50 #char_embedding = Embedding(num_embeddings=vocab.get_vocab_size("char_id"), # embedding_dim=CHAR_EMBEDDING_DIM) #character_cnn = CnnEncoder(embedding_dim=CHAR_EMBEDDING_DIM, num_filters=2, # output_dim=CHAR_WORD_DIM) #token_character_encoder = TokenCharactersEncoder(embedding=char_embedding, # encoder=character_cnn) #word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding, # "chars": token_character_encoder})
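# Hedged completion sketch: the commented-out BasicTextFieldEmbedder above refers to a
# token_embedding that is never defined in this fragment. A minimal word-level version,
# assuming the "tokens_id" namespace and WORD_EMBEDDING_DIM from the surrounding code, could be:
from allennlp.modules.token_embedders import Embedding
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder

token_embedding = Embedding(num_embeddings=vocab.get_vocab_size("tokens_id"),
                            embedding_dim=WORD_EMBEDDING_DIM)
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})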
def generate_vocab(): note_list = ["A", "B", "C", "D", "E", "F", "G"] accidental_list = ["", "b", "#"] chord_type_list = [ "", "m", "+", "o", "7", "m7", "M7", "o7", "%7", "+7", "It6", "Ger6", "Fr6" ] vocab = Vocabulary() for ns in ["tokens", "token_in", "token_out"]: for chord in itertools.product(note_list, accidental_list, chord_type_list): vocab.add_token_to_namespace("".join(chord), namespace=ns) vocab.add_token_to_namespace(START_SYMBOL, namespace=ns) vocab.add_token_to_namespace(END_SYMBOL, namespace=ns) key_list = [ "".join(x) for x in itertools.product(note_list, accidental_list) ] form_list = ["m", "+", "o", "M", "%", "It", "Ger", "Fr"] figbass_list = ["7", "6"] for char in (key_list + form_list + figbass_list): vocab.add_token_to_namespace(char, namespace="token_characters") note_number_list = [str(x) for x in range(12)] for note_number in note_number_list: vocab.add_token_to_namespace(note_number, namespace="notes") vocab.save_to_files("data/vocabulary")
def run(args): ALL_DATASET_PATHS = get_all_dataset_paths(args.dataset_paths_file, args.dataset_path_prefix) SELECTED_TASK_NAMES = args.task PROJECTION_DIM = args.proj_dim HIDDEN_DIM = args.hidden_dim # BIDIRECTIONAL=True # INTERMEDIATE_INPUT=2*HIDDEN_DIM if BIDIRECTIONAL else HIDDEN_DIM DROPOUT = args.dropout LR = args.lr WEIGHT_DECAY = args.weight_decay BATCH_SIZE = args.batch_size NUM_EPOCHS = args.epochs PATIENCE = args.patience SERIALIZATION_DIR = args.model_dir CLEAN_MODEL_DIR = args.clean_model_dir CUDA_DEVICE = cuda_device(args.cuda) TEST_MODE = args.test_mode # device = torch.device(f"cuda:{CUDA_DEVICE}" if torch.cuda.is_available() and args.cuda else "cpu") TASKS = [TASK_CONFIGS[task_name] for task_name in SELECTED_TASK_NAMES] dataset_paths = { task_name: ALL_DATASET_PATHS[task_name] for task_name in SELECTED_TASK_NAMES } tag_namespace_hashing_fn = { tag_namespace: i for i, tag_namespace in enumerate(TASK_CONFIGS.keys()) }.get elmo_token_indexer = ELMoTokenCharactersIndexer() token_indexers = {"tokens": elmo_token_indexer} readers = { task.tag_namespace: JSONDatasetReader( task.tag_namespace, token_indexers=token_indexers, tag_namespace_hashing_fn=tag_namespace_hashing_fn, ) for task in TASKS } elmo_embedder = ElmoTokenEmbedder( options_file, weight_file, requires_grad=False, dropout=DROPOUT, projection_dim=PROJECTION_DIM, ) # elmo_embedder = Elmo(options_file, weight_file, num_output_representations=3) # Pass in the ElmoTokenEmbedder instance instead word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder}) ELMO_EMBEDDING_DIM = elmo_embedder.get_output_dim() # POS -> CHUNK -> NER task_suffixes = set( [task_name.rsplit("_", 1)[-1] for task_name in SELECTED_TASK_NAMES]) encoders = get_task_encoder_dict(args, task_suffixes, ELMO_EMBEDDING_DIM) if not TEST_MODE: train_dataset = read_datasets(dataset_paths, readers, data_split="train") validation_dataset = read_datasets(dataset_paths, readers, data_split="dev") vocab = create_classification_vocab( [train_dataset, validation_dataset]) else: vocab = Vocabulary.from_files( os.path.join(SERIALIZATION_DIR, "vocabulary")) # encoder = PassThroughEncoder(ELMO_EMBEDDING_DIM) model = MultiTaskClassifier(word_embeddings, encoders, vocab, TASKS) model = model.cuda(device=CUDA_DEVICE) if not TEST_MODE: iterator = CustomHomogeneousBatchIterator(partition_key="dataset", batch_size=BATCH_SIZE, cache_instances=True) iterator.index_with(vocab) if CLEAN_MODEL_DIR: if os.path.exists(SERIALIZATION_DIR): logger.info(f"Deleting {SERIALIZATION_DIR}") shutil.rmtree(SERIALIZATION_DIR) logger.info(f"Creating {SERIALIZATION_DIR}") os.makedirs(SERIALIZATION_DIR) logger.info( f"Writing arguments to arguments.json in {SERIALIZATION_DIR}") with open(os.path.join(SERIALIZATION_DIR, "arguments.json"), "w+") as fp: json.dump(vars(args), fp, indent=2) logger.info(f"Writing vocabulary in {SERIALIZATION_DIR}") vocab.save_to_files(os.path.join(SERIALIZATION_DIR, "vocabulary")) # Use list to ensure each epoch is a full pass through the data combined_training_dataset = list( roundrobin_iterator(*train_dataset.values())) combined_validation_dataset = list( roundrobin_iterator(*validation_dataset.values())) # optimizer = optim.ASGD(model.parameters(), lr=0.01, t0=100, weight_decay=0.1) optimizer = optim.Adam(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY) training_stats = [] trainer = Trainer( model=model, optimizer=optimizer, iterator=iterator, train_dataset=combined_training_dataset, validation_dataset=combined_validation_dataset, patience=PATIENCE,
num_epochs=NUM_EPOCHS, cuda_device=CUDA_DEVICE, serialization_dir=SERIALIZATION_DIR, # model_save_interval=600 ) stats = trainer.train() training_stats.append(stats) with open(os.path.join(SERIALIZATION_DIR, "training_stats.json"), "w+") as fp: json.dump(training_stats, fp, indent=2) else: model.load_state_dict( torch.load(os.path.join(SERIALIZATION_DIR, "best.th"))) model = model.cuda(device=CUDA_DEVICE) # Empty cache to ensure larger batch can be loaded for testing torch.cuda.empty_cache() test_filepaths = { task.tag_namespace: dataset_paths[task.tag_namespace]["test"] for task in TASKS } logger.info("Evaluating on test data") test_iterator = CustomHomogeneousBatchIterator(partition_key="dataset", batch_size=BATCH_SIZE * 2) test_iterator.index_with(vocab) model = model.eval() test_stats = evaluate_multiple_data(model, readers, test_iterator, test_filepaths, cuda_device=CUDA_DEVICE) with open(os.path.join(SERIALIZATION_DIR, "test_stats.json"), "w+") as fp: json.dump(test_stats, fp, indent=2)