Example 1
    def test_from_instances_exclusive_embeddings_file_inside_archive(self):
        """ Just for ensuring there are no problems when reading pretrained tokens from an archive """
        # Read embeddings file from archive
        archive_path = str(self.TEST_DIR / "embeddings-archive.zip")

        with zipfile.ZipFile(archive_path, 'w') as archive:
            file_path = 'embedding.3d.vec'
            with archive.open(file_path, 'w') as embeddings_file:
                embeddings_file.write("a 1.0 2.3 -1.0\n".encode('utf-8'))
                embeddings_file.write("b 0.1 0.4 -4.0\n".encode('utf-8'))

            with archive.open('dummy.vec', 'w') as dummy_file:
                dummy_file.write("c 1.0 2.3 -1.0 3.0\n".encode('utf-8'))

        embeddings_file_uri = format_embeddings_file_uri(archive_path, file_path)
        vocab = Vocabulary.from_instances(self.dataset,
                                          min_count={'tokens': 4},
                                          pretrained_files={'tokens': embeddings_file_uri},
                                          only_include_pretrained_words=True)

        words = set(vocab.get_index_to_token_vocabulary().values())
        assert 'a' in words
        assert 'b' not in words
        assert 'c' not in words

        vocab = Vocabulary.from_instances(self.dataset,
                                          pretrained_files={'tokens': embeddings_file_uri},
                                          only_include_pretrained_words=True)
        words = set(vocab.get_index_to_token_vocabulary().values())
        assert 'a' in words
        assert 'b' in words
        assert 'c' not in words
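A note on the URI used above: format_embeddings_file_uri comes from AllenNLP's embedding module and combines the archive path with the path of a file inside the archive. A minimal sketch (assuming AllenNLP 0.x; the paths are placeholders):

from allennlp.modules.token_embedders.embedding import format_embeddings_file_uri

# Build a URI pointing at one file inside an archive; it can then be passed as
# pretrained_files={'tokens': uri}, exactly as in the test above.
uri = format_embeddings_file_uri("/path/to/embeddings-archive.zip", "embedding.3d.vec")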
Example 2
    def test_multilabel_field_empty_field_works(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("label1", namespace="test_empty_labels")
        vocab.add_token_to_namespace("label2", namespace="test_empty_labels")

        f = MultiLabelField([], label_namespace="test_empty_labels")
        f.index(vocab)
        tensor = f.as_tensor(f.get_padding_lengths()).detach().cpu().numpy()
        numpy.testing.assert_array_almost_equal(tensor, numpy.array([0, 0]))
Example 3
    def test_from_dataset_respects_max_vocab_size_single_int(self):
        max_vocab_size = 1
        vocab = Vocabulary.from_instances(self.dataset, max_vocab_size=max_vocab_size)
        words = vocab.get_index_to_token_vocabulary().values()
        # Additional 2 tokens are '@@PADDING@@' and '@@UNKNOWN@@' by default
        assert len(words) == max_vocab_size + 2

        vocab = Vocabulary.from_instances(self.dataset, min_count=None)
        words = vocab.get_index_to_token_vocabulary().values()
        assert len(words) == 5
Example 4
    def test_multilabel_field_can_index_with_vocab(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("rel0", namespace="rel_labels")
        vocab.add_token_to_namespace("rel1", namespace="rel_labels")
        vocab.add_token_to_namespace("rel2", namespace="rel_labels")

        f = MultiLabelField(["rel1", "rel0"], label_namespace="rel_labels")
        f.index(vocab)
        tensor = f.as_tensor(f.get_padding_lengths()).detach().cpu().numpy()
        numpy.testing.assert_array_almost_equal(tensor, numpy.array([1, 1, 0]))
Example 5
 def test_unknown_token(self):
     # pylint: disable=protected-access
     # We're putting this behavior in a test so that the behavior is documented.  There is
     # solver code that depends in a small way on how we treat the unknown token, so any
     # breaking change to this behavior should break a test, so you know you've done something
     # that needs more consideration.
     vocab = Vocabulary()
     oov_token = vocab._oov_token
     oov_index = vocab.get_token_index(oov_token)
     assert oov_index == 1
     assert vocab.get_token_index("unseen word") == oov_index
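The same default layout can be checked directly on a fresh vocabulary. A minimal sketch (assuming the default @@PADDING@@ and @@UNKNOWN@@ tokens of AllenNLP 0.x):

from allennlp.data import Vocabulary

vocab = Vocabulary()
# In padded namespaces the padding token takes index 0 and the OOV token index 1.
assert vocab.get_token_from_index(0) == "@@PADDING@@"
assert vocab.get_token_from_index(1) == "@@UNKNOWN@@"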
Example 6
    def test_from_dataset_respects_min_count(self):
        vocab = Vocabulary.from_instances(self.dataset, min_count={'tokens': 4})
        words = vocab.get_index_to_token_vocabulary().values()
        assert 'a' in words
        assert 'b' not in words
        assert 'c' not in words

        vocab = Vocabulary.from_instances(self.dataset, min_count=None)
        words = vocab.get_index_to_token_vocabulary().values()
        assert 'a' in words
        assert 'b' in words
        assert 'c' in words
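Example 7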
 def _get_vocab_index_mapping(self, archived_vocab: Vocabulary) -> List[Tuple[int, int]]:
     vocab_index_mapping: List[Tuple[int, int]] = []
     for index in range(self.vocab.get_vocab_size(namespace='tokens')):
         token = self.vocab.get_token_from_index(index=index, namespace='tokens')
         archived_token_index = archived_vocab.get_token_index(token, namespace='tokens')
         # Checking if we got the UNK token index, because we don't want all new token
         # representations initialized to UNK token's representation. We do that by checking if
         # the two tokens are the same. They will not be if the token at the archived index is
         # UNK.
         if archived_vocab.get_token_from_index(archived_token_index, namespace="tokens") == token:
             vocab_index_mapping.append((index, archived_token_index))
     return vocab_index_mapping
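A hypothetical use of the returned mapping (not part of the original snippet): copying embedding rows from an archived model so that only tokens genuinely present in the archived vocabulary reuse the archived representations.

import torch
from typing import List, Tuple

def copy_archived_rows(new_weights: torch.Tensor,
                       archived_weights: torch.Tensor,
                       mapping: List[Tuple[int, int]]) -> None:
    # Each (new_index, archived_index) pair comes from _get_vocab_index_mapping;
    # tokens that resolved to UNK in the archived vocab were already filtered out there.
    for new_index, archived_index in mapping:
        new_weights[new_index] = archived_weights[archived_index]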
Example 8
    def __init__(self,
                 vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 contextualizer: Seq2SeqEncoder,
                 dropout: float = None,
                 num_samples: int = None,
                 sparse_embeddings: bool = False,
                 bidirectional: bool = False,
                 initializer: InitializerApplicator = None) -> None:
        super().__init__(vocab)
        self._text_field_embedder = text_field_embedder

        if contextualizer.is_bidirectional() is not bidirectional:
            raise ConfigurationError(
                    "Bidirectionality of contextualizer must match bidirectionality of "
                    "language model. "
                    f"Contextualizer bidirectional: {contextualizer.is_bidirectional()}, "
                    f"language model bidirectional: {bidirectional}")

        self._contextualizer = contextualizer
        self._bidirectional = bidirectional

        # The dimension for making predictions just in the forward
        # (or backward) direction.
        if self._bidirectional:
            self._forward_dim = contextualizer.get_output_dim() // 2
        else:
            self._forward_dim = contextualizer.get_output_dim()

        # TODO(joelgrus): more sampled softmax configuration options, as needed.
        if num_samples is not None:
            self._softmax_loss = SampledSoftmaxLoss(num_words=vocab.get_vocab_size(),
                                                    embedding_dim=self._forward_dim,
                                                    num_samples=num_samples,
                                                    sparse=sparse_embeddings)
        else:
            self._softmax_loss = _SoftmaxLoss(num_words=vocab.get_vocab_size(),
                                              embedding_dim=self._forward_dim)

        # TODO(brendanr): Output perplexity here. e^loss
        self.register_buffer('_last_average_loss', torch.zeros(1))

        if dropout:
            self._dropout = torch.nn.Dropout(dropout)
        else:
            self._dropout = lambda x: x

        if initializer is not None:
            initializer(self)
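Example 9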
    def __init__(self,
                 vocabulary: Vocabulary,
                 tag_namespace: str = "tags",
                 ignore_classes: List[str] = None) -> None:
        """
        Parameters
        ----------
        vocabulary : ``Vocabulary``, required.
            A vocabulary containing the tag namespace.
        tag_namespace : str, required.
            This metric assumes that a BIO format is used in which the
            labels are of the format: ["B-LABEL", "I-LABEL"].
        ignore_classes : List[str], optional.
            Span labels which will be ignored when computing span metrics.
            A "span label" is the part that comes after the BIO label, so it
            would be "ARG1" for the tag "B-ARG1". For example by passing:

             ``ignore_classes=["V"]``
            the following sequence would not consider the "V" span at index (2, 3)
            when computing the precision, recall and F1 metrics.

            ["O", "O", "B-V", "I-V", "B-ARG1", "I-ARG1"]

            This is helpful, for instance, to avoid computing metrics for "V"
            spans in a BIO tagging scheme, which are typically not included.
        """
        self._label_vocabulary = vocabulary.get_index_to_token_vocabulary(tag_namespace)
        self._ignore_classes: List[str] = ignore_classes or []

        # These will hold per label span counts.
        self._true_positives: Dict[str, int] = defaultdict(int)
        self._false_positives: Dict[str, int] = defaultdict(int)
        self._false_negatives: Dict[str, int] = defaultdict(int)
Example 10
    def tokens_to_indices(self,
                          tokens: List[Token],
                          vocabulary: Vocabulary,
                          index_name: str) -> Dict[str, List[int]]:
        dep_labels = [token.dep_ or 'NONE' for token in tokens]

        return {index_name: [vocabulary.get_token_index(dep_label, self.namespace) for dep_label in dep_labels]}
Example 11
    def __init__(self,
                 vocab: Vocabulary,
                 sentence_embedder: TextFieldEmbedder,
                 action_embedding_dim: int,
                 encoder: Seq2SeqEncoder,
                 dropout: float = 0.0,
                 rule_namespace: str = 'rule_labels') -> None:
        super(NlvrSemanticParser, self).__init__(vocab=vocab)

        self._sentence_embedder = sentence_embedder
        self._denotation_accuracy = Average()
        self._consistency = Average()
        self._encoder = encoder
        if dropout > 0:
            self._dropout = torch.nn.Dropout(p=dropout)
        else:
            self._dropout = lambda x: x
        self._rule_namespace = rule_namespace

        self._action_embedder = Embedding(num_embeddings=vocab.get_vocab_size(self._rule_namespace),
                                          embedding_dim=action_embedding_dim)

        # This is what we pass as input in the first step of decoding, when we don't have a
        # previous action.
        self._first_action_embedding = torch.nn.Parameter(torch.FloatTensor(action_embedding_dim))
        torch.nn.init.normal_(self._first_action_embedding)
Example 12
 def setUp(self):
     super(TestCopyNetReader, self).setUp()
     params = Params.from_file(self.FIXTURES_ROOT / "encoder_decoder" / "copynet_seq2seq" / "experiment.json")
     self.reader = DatasetReader.from_params(params["dataset_reader"])
     instances = self.reader.read(self.FIXTURES_ROOT / "data" / "copynet" / "copyover.tsv")
     self.instances = ensure_list(instances)
     self.vocab = Vocabulary.from_params(params=params["vocabulary"], instances=instances)
Example 13
    def tokens_to_indices(self,
                          tokens: List[Token],
                          vocabulary: Vocabulary,
                          index_name: str) -> Dict[str, List[int]]:
        tags = ['NONE' if not token.ent_type_ else token.ent_type_ for token in tokens]

        return {index_name: [vocabulary.get_token_index(tag, self._namespace) for tag in tags]}
Example 14
 def token_to_indices(self, token: Token, vocabulary: Vocabulary) -> int:
     if self._coarse_tags:
         tag = token.pos_
     else:
         tag = token.tag_
     if tag is None:
         tag = 'NONE'
     return vocabulary.get_token_index(tag, self._namespace)
Example 15
 def test_vocab_can_print(self):
     vocab = Vocabulary(non_padded_namespaces=["a", "c"])
     vocab.add_token_to_namespace("a0", namespace="a")
     vocab.add_token_to_namespace("a1", namespace="a")
     vocab.add_token_to_namespace("a2", namespace="a")
     vocab.add_token_to_namespace("b2", namespace="b")
     vocab.add_token_to_namespace("b3", namespace="b")
     print(vocab)
Example 16
    def __init__(self,
                 vocabulary: Vocabulary,
                 tag_namespace: str = "tags",
                 ignore_classes: List[str] = None,
                 label_encoding: Optional[str] = "BIO",
                 tags_to_spans_function: Optional[TAGS_TO_SPANS_FUNCTION_TYPE] = None) -> None:
        """
        Parameters
        ----------
        vocabulary : ``Vocabulary``, required.
            A vocabulary containing the tag namespace.
        tag_namespace : str, required.
            This metric assumes that a BIO format is used in which the
            labels are of the format: ["B-LABEL", "I-LABEL"].
        ignore_classes : List[str], optional.
            Span labels which will be ignored when computing span metrics.
            A "span label" is the part that comes after the BIO label, so it
            would be "ARG1" for the tag "B-ARG1". For example by passing:

             ``ignore_classes=["V"]``
            the following sequence would not consider the "V" span at index (2, 3)
            when computing the precision, recall and F1 metrics.

            ["O", "O", "B-V", "I-V", "B-ARG1", "I-ARG1"]

            This is helpful, for instance, to avoid computing metrics for "V"
            spans in a BIO tagging scheme, which are typically not included.
        label_encoding : ``str``, optional (default = "BIO")
            The encoding used to specify label span endpoints in the sequence.
            Valid options are "BIO", "IOB1", "BIOUL" or "BMES".
        tags_to_spans_function: ``Callable``, optional (default = ``None``)
            If ``label_encoding`` is ``None``, ``tags_to_spans_function`` will be
            used to generate spans.
        """
        if label_encoding and tags_to_spans_function:
            raise ConfigurationError(
                    'Both label_encoding and tags_to_spans_function are provided. '
                    'Set "label_encoding=None" explicitly to enable tags_to_spans_function.'
                    )
        if label_encoding:
            if label_encoding not in ["BIO", "IOB1", "BIOUL", "BMES"]:
                raise ConfigurationError("Unknown label encoding - expected 'BIO', 'IOB1', 'BIOUL', 'BMES'.")
        elif tags_to_spans_function is None:
            raise ConfigurationError(
                    'At least one of the (label_encoding, tags_to_spans_function) should be provided.'
                    )

        self._label_encoding = label_encoding
        self._tags_to_spans_function = tags_to_spans_function
        self._label_vocabulary = vocabulary.get_index_to_token_vocabulary(tag_namespace)
        self._ignore_classes: List[str] = ignore_classes or []

        # These will hold per label span counts.
        self._true_positives: Dict[str, int] = defaultdict(int)
        self._false_positives: Dict[str, int] = defaultdict(int)
        self._false_negatives: Dict[str, int] = defaultdict(int)
Example 17
 def __init__(self,
              word_embeddings: TextFieldEmbedder,
              encoder: Seq2SeqEncoder,
              vocab: Vocabulary) -> None:
     super().__init__(vocab)
     self.word_embeddings = word_embeddings
     self.encoder = encoder
     self.hidden2tag = torch.nn.Linear(in_features=encoder.get_output_dim(),
                                       out_features=vocab.get_vocab_size('labels'))
     self.accuracy = CategoricalAccuracy()
Example 18
 def test_vocab_from_instances_namespaces(self):
     reader = CcgBankDatasetReader(feature_labels=['modified_pos', 'original_pos', 'predicate_arg'])
     instances = ensure_list(reader.read(self.FIXTURES_ROOT / 'data' / 'ccgbank.txt'))
     # check that we didn't clobber the labels namespace
     vocab = Vocabulary.from_instances(instances)
     self.assertSetEqual(
             set(vocab._token_to_index.keys()), # pylint: disable=protected-access
             {'tokens', 'labels', 'modified_pos_tags', 'original_pos_tags',
              'predicate_arg_tags'}
     )
Example 19
    def test_min_pretrained_embeddings(self):
        params = Params({
                "pretrained_files": {
                        "tokens": str(self.FIXTURES_ROOT / "embeddings/glove.6B.100d.sample.txt.gz")
                },
                "min_pretrained_embeddings": {"tokens": 50},
        })

        vocab = Vocabulary.from_params(params=params, instances=self.dataset)
        assert vocab.get_vocab_size() >= 50
        assert vocab.get_token_index("his") > 1  # not @@UNKNOWN@@
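Example 20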
 def token_to_indices(self, token: Token, vocabulary: Vocabulary) -> int:
     if getattr(token, 'text_id', None) is not None:
         # `text_id` being set on the token means that we aren't using the vocab, we just use
         # this id instead.
         index = token.text_id
     else:
         text = token.text
         if self.lowercase_tokens:
             text = text.lower()
         index = vocabulary.get_token_index(text, self.namespace)
     return index
Example 21
    def test_saving_and_loading_works_with_byte_encoding(self):
        # We're going to set a vocabulary from a TextField using byte encoding, index it, save the
        # vocab, load the vocab, then index the text field again, and make sure we get the same
        # result.
        tokenizer = CharacterTokenizer(byte_encoding='utf-8')
        token_indexer = TokenCharactersIndexer(character_tokenizer=tokenizer)
        tokens = [Token(t) for t in ["Øyvind", "für", "汉字"]]
        text_field = TextField(tokens, {"characters": token_indexer})
        dataset = Batch([Instance({"sentence": text_field})])
        vocab = Vocabulary.from_instances(dataset)
        text_field.index(vocab)
        indexed_tokens = deepcopy(text_field._indexed_tokens)  # pylint: disable=protected-access

        vocab_dir = self.TEST_DIR / 'vocab_save'
        vocab.save_to_files(vocab_dir)
        vocab2 = Vocabulary.from_files(vocab_dir)
        text_field2 = TextField(tokens, {"characters": token_indexer})
        text_field2.index(vocab2)
        indexed_tokens2 = deepcopy(text_field2._indexed_tokens)  # pylint: disable=protected-access
        assert indexed_tokens == indexed_tokens2
Example 22
    def test_label_field_can_index_with_vocab(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("entailment", namespace="labels")
        vocab.add_token_to_namespace("contradiction", namespace="labels")
        vocab.add_token_to_namespace("neutral", namespace="labels")

        label = LabelField("entailment")
        label.index(vocab)
        tensor = label.as_tensor(label.get_padding_lengths()).data.cpu().numpy()
        numpy.testing.assert_array_almost_equal(tensor, numpy.array([0]))
Example 23
    def test_max_vocab_size_dict(self):
        params = Params({
                "max_vocab_size": {
                        "tokens": 1,
                        "characters": 20
                }
        })

        vocab = Vocabulary.from_params(params=params, instances=self.dataset)
        words = vocab.get_index_to_token_vocabulary().values()
        # Additional 2 tokens are '@@PADDING@@' and '@@UNKNOWN@@' by default
        assert len(words) == 3
Example 24
    def test_from_dataset_respects_inclusive_embedding_file(self):
        embeddings_filename = str(self.TEST_DIR / "embeddings.gz")
        with gzip.open(embeddings_filename, 'wb') as embeddings_file:
            embeddings_file.write("a 1.0 2.3 -1.0\n".encode('utf-8'))
            embeddings_file.write("b 0.1 0.4 -4.0\n".encode('utf-8'))

        vocab = Vocabulary.from_instances(self.dataset,
                                          min_count={'tokens': 4},
                                          pretrained_files={'tokens': embeddings_filename},
                                          only_include_pretrained_words=False)
        words = vocab.get_index_to_token_vocabulary().values()
        assert 'a' in words
        assert 'b' in words
        assert 'c' not in words

        vocab = Vocabulary.from_instances(self.dataset,
                                          pretrained_files={'tokens': embeddings_filename},
                                          only_include_pretrained_words=False)
        words = vocab.get_index_to_token_vocabulary().values()
        assert 'a' in words
        assert 'b' in words
        assert 'c' in words
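Example 25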
 def token_to_indices(self, token: Token, vocabulary: Vocabulary) -> List[int]:
     indices = []
     if token.text is None:
         raise ConfigurationError('TokenCharactersIndexer needs a tokenizer that retains text')
     for character in self._character_tokenizer.tokenize(token.text):
         if getattr(character, 'text_id', None) is not None:
             # `text_id` being set on the token means that we aren't using the vocab, we just
             # use this id instead.
             index = character.text_id
         else:
             index = vocabulary.get_token_index(character.text, self._namespace)
         indices.append(index)
     return indices
Example 26
    def test_max_vocab_size_partial_dict(self):
        indexers = {"tokens": SingleIdTokenIndexer(), "token_characters": TokenCharactersIndexer()}
        instance = Instance({
                'text': TextField([Token(w) for w in 'Abc def ghi jkl mno pqr stu vwx yz'.split(' ')], indexers)
        })
        dataset = Batch([instance])
        params = Params({
                "max_vocab_size": {
                        "tokens": 1
                }
        })

        vocab = Vocabulary.from_params(params=params, instances=dataset)
        assert len(vocab.get_index_to_token_vocabulary("tokens").values()) == 3 # 1 + 2
        assert len(vocab.get_index_to_token_vocabulary("token_characters").values()) == 28 # 26 + 2
Example 27
 def __init__(self,
              #### The embedding layer is specified as an AllenNLP <code>TextFieldEmbedder</code> which represents a general way of turning tokens into tensors. (Here we know that we want to represent each unique word with a learned tensor, but using the general class allows us to easily experiment with different types of embeddings, for example <a href = "https://allennlp.org/elmo">ELMo</a>.)
              word_embeddings: TextFieldEmbedder,
              #### Similarly, the encoder is specified as a general <code>Seq2SeqEncoder</code> even though we know we want to use an LSTM. Again, this makes it easy to experiment with other sequence encoders, for example a Transformer.
              encoder: Seq2SeqEncoder,
              #### Every AllenNLP model also expects a <code>Vocabulary</code>, which contains the namespaced mappings of tokens to indices and labels to indices.
              vocab: Vocabulary) -> None:
     #### Notice that we have to pass the vocab to the base class constructor.
     super().__init__(vocab)
     self.word_embeddings = word_embeddings
     self.encoder = encoder
     #### The feed forward layer is not passed in as a parameter, but is constructed by us. Notice that it looks at the encoder to find the correct input dimension and looks at the vocabulary (and, in particular, at the label -> index mapping) to find the correct output dimension.
     self.hidden2tag = torch.nn.Linear(in_features=encoder.get_output_dim(),
                                       out_features=vocab.get_vocab_size('labels'))
     #### The last thing to notice is that we also instantiate a <code>CategoricalAccuracy</code> metric, which we'll use to track accuracy during each training and validation epoch.
     self.accuracy = CategoricalAccuracy()
Example 28
    def test_set_from_file_reads_non_padded_files(self):
        # pylint: disable=protected-access
        vocab_filename = self.TEST_DIR / 'vocab_file'
        with codecs.open(vocab_filename, 'w', 'utf-8') as vocab_file:
            vocab_file.write('B-PERS\n')
            vocab_file.write('I-PERS\n')
            vocab_file.write('O\n')
            vocab_file.write('B-ORG\n')
            vocab_file.write('I-ORG\n')

        vocab = Vocabulary()
        vocab.set_from_file(vocab_filename, is_padded=False, namespace='tags')
        assert vocab.get_token_index("B-PERS", namespace='tags') == 0
        assert vocab.get_token_index("I-PERS", namespace='tags') == 1
        assert vocab.get_token_index("O", namespace='tags') == 2
        assert vocab.get_token_index("B-ORG", namespace='tags') == 3
        assert vocab.get_token_index("I-ORG", namespace='tags') == 4
        assert vocab.get_token_from_index(0, namespace='tags') == "B-PERS"
        assert vocab.get_token_from_index(1, namespace='tags') == "I-PERS"
        assert vocab.get_token_from_index(2, namespace='tags') == "O"
        assert vocab.get_token_from_index(3, namespace='tags') == "B-ORG"
        assert vocab.get_token_from_index(4, namespace='tags') == "I-ORG"
Example 29
    def test_registrability(self):

        @Vocabulary.register('my-vocabulary')
        class MyVocabulary:
            @classmethod
            def from_params(cls, params, instances=None):
                # pylint: disable=unused-argument
                return MyVocabulary()


        params = Params({'type': 'my-vocabulary'})

        instance = Instance(fields={})

        vocab = Vocabulary.from_params(params=params, instances=[instance])

        assert isinstance(vocab, MyVocabulary)
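Example 30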
    def setUp(self):
        super().setUp()

        self.base_reader = SequenceTaggingDatasetReader(lazy=True)
        base_file_path = AllenNlpTestCase.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv'

        # Make 100 copies of the data
        raw_data = open(base_file_path).read()
        for i in range(100):
            file_path = self.TEST_DIR / f'sequence_tagging_{i}.tsv'
            with open(file_path, 'w') as f:
                f.write(raw_data)

        self.glob = str(self.TEST_DIR / 'sequence_tagging_*.tsv')

        # For some of the tests we need a vocab, we'll just use the base_reader for that.
        self.vocab = Vocabulary.from_instances(self.base_reader.read(str(base_file_path)))
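Example 31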
def write_embeddings(embedding: Embedding, file_path, vocab: Vocabulary):
    with open(file_path, mode='w') as f:
        for index, token in vocab.get_index_to_token_vocabulary('token_in').items():
            values = ['{:.5f}'.format(val) for val in embedding.weight[index]]
            f.write(' '.join([token] + values))
            f.write('\n')
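The file written above uses the plain "token value value ..." text layout that AllenNLP's pretrained-embedding loader expects, so it can be read back into a weight matrix. A hedged sketch (the helper name read_back, the 200-dimension default, and reusing the token_in namespace are assumptions tied to the snippet above):

import torch
from allennlp.data import Vocabulary
from allennlp.modules.token_embedders.embedding import _read_pretrained_embeddings_file

def read_back(file_path: str, vocab: Vocabulary, dim: int = 200) -> torch.Tensor:
    # Reads the file produced by write_embeddings back into a weight matrix aligned
    # with vocab; dim must match the dimensionality of the exported embedding.
    return _read_pretrained_embeddings_file(file_path, embedding_dim=dim,
                                            vocab=vocab, namespace='token_in')
Example 32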
def train(train_dataset, val_dataset, cfg):
    # Build the Vocabulary
    VOCAB_SIZE = cfg.w2v.vocab_size
    vocab = Vocabulary.from_instances(train_dataset + val_dataset,
                                      max_vocab_size=VOCAB_SIZE)

    BATCH_SIZE = cfg.training.batch_size

    # Iterator that generates padded mini-batches
    iterator = BucketIterator(batch_size=BATCH_SIZE,
                              sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)

    # Use the pretrained Japanese Wikipedia entity vectors provided by Tohoku University
    # http://www.cl.ecei.tohoku.ac.jp/~m-suzuki/jawiki_vector/
    model_name = cfg.w2v.model_name
    norm = cfg.w2v.norm
    cwd = hydra.utils.get_original_cwd()
    params = Params({
        'embedding_dim': 200,
        'padding_index': 0,
        'pretrained_file': os.path.join(cwd, f'embs/jawiki.{model_name}_vectors.200d.txt'),
        'norm_type': norm
    })

    token_embedding = Embedding.from_params(vocab=vocab, params=params)
    HIDDEN_SIZE = cfg.model.hidden_size
    dropout = cfg.model.dropout

    word_embeddings: TextFieldEmbedder = BasicTextFieldEmbedder(
        {"tokens": token_embedding})
    encoder: Seq2SeqEncoder = PytorchSeq2SeqWrapper(
        nn.LSTM(word_embeddings.get_output_dim(),
                HIDDEN_SIZE,
                bidirectional=True,
                batch_first=True))
    model = ClassifierWithAttn(word_embeddings, encoder, vocab, dropout)
    model.train()

    USE_GPU = True

    if USE_GPU and torch.cuda.is_available():
        model = model.cuda(0)

    LR = cfg.training.learning_rate
    EPOCHS = cfg.training.epoch
    patience = cfg.training.patience if cfg.training.patience > 0 else None

    optimizer = optim.Adam(model.parameters(), lr=LR)
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=val_dataset,
                      patience=patience,
                      cuda_device=0 if USE_GPU else -1,
                      num_epochs=EPOCHS)
    metrics = trainer.train()
    logger.info(metrics)

    return model, metrics
Example 33
    def test_namespaces(self):
        vocab = Vocabulary()
        initial_vocab_size = vocab.get_vocab_size()
        word_index = vocab.add_token_to_namespace("word", namespace='1')
        assert "word" in vocab.get_index_to_token_vocabulary(namespace='1').values()
        assert vocab.get_token_index("word", namespace='1') == word_index
        assert vocab.get_token_from_index(word_index, namespace='1') == "word"
        assert vocab.get_vocab_size(namespace='1') == initial_vocab_size + 1

        # Now add it again, in a different namespace and a different word, and make sure it's like
        # new.
        word2_index = vocab.add_token_to_namespace("word2", namespace='2')
        word_index = vocab.add_token_to_namespace("word", namespace='2')
        assert "word" in vocab.get_index_to_token_vocabulary(namespace='2').values()
        assert "word2" in vocab.get_index_to_token_vocabulary(namespace='2').values()
        assert vocab.get_token_index("word", namespace='2') == word_index
        assert vocab.get_token_index("word2", namespace='2') == word2_index
        assert vocab.get_token_from_index(word_index, namespace='2') == "word"
        assert vocab.get_token_from_index(word2_index, namespace='2') == "word2"
        assert vocab.get_vocab_size(namespace='2') == initial_vocab_size + 2
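A related minimal sketch (assuming AllenNLP 0.x defaults) showing why padded namespaces start at size 2 while non-padded ones start empty:

from allennlp.data import Vocabulary

vocab = Vocabulary(non_padded_namespaces=["labels"])
assert vocab.get_vocab_size("labels") == 0   # no @@PADDING@@/@@UNKNOWN@@ added here
assert vocab.get_vocab_size("tokens") == 2   # padding and OOV are added by default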
Example 34
    def test_set_from_file_reads_padded_files(self):
        # pylint: disable=protected-access
        vocab_filename = self.TEST_DIR / 'vocab_file'
        with codecs.open(vocab_filename, 'w', 'utf-8') as vocab_file:
            vocab_file.write('<S>\n')
            vocab_file.write('</S>\n')
            vocab_file.write('<UNK>\n')
            vocab_file.write('a\n')
            vocab_file.write('tricky\x0bchar\n')
            vocab_file.write('word\n')
            vocab_file.write('another\n')

        vocab = Vocabulary()
        vocab.set_from_file(vocab_filename, is_padded=True, oov_token="<UNK>")

        assert vocab._oov_token == DEFAULT_OOV_TOKEN
        assert vocab.get_token_index("random string") == 3
        assert vocab.get_token_index("<S>") == 1
        assert vocab.get_token_index("</S>") == 2
        assert vocab.get_token_index(DEFAULT_OOV_TOKEN) == 3
        assert vocab.get_token_index("a") == 4
        assert vocab.get_token_index("tricky\x0bchar") == 5
        assert vocab.get_token_index("word") == 6
        assert vocab.get_token_index("another") == 7
        assert vocab.get_token_from_index(0) == vocab._padding_token
        assert vocab.get_token_from_index(1) == "<S>"
        assert vocab.get_token_from_index(2) == "</S>"
        assert vocab.get_token_from_index(3) == DEFAULT_OOV_TOKEN
        assert vocab.get_token_from_index(4) == "a"
        assert vocab.get_token_from_index(5) == "tricky\x0bchar"
        assert vocab.get_token_from_index(6) == "word"
        assert vocab.get_token_from_index(7) == "another"
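Example 35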
def main():
    parser = argparse.ArgumentParser(description='Evidence Inference experiments')
    parser.add_argument('--cuda_device', type=int, default=0,
                        help='GPU number (default: 0)')
    parser.add_argument('--epochs', type=int, default=2,
                        help='upper epoch limit (default: 2)')
    parser.add_argument('--patience', type=int, default=1,
                        help='trainer patience  (default: 1)')
    parser.add_argument('--batch_size', type=int, default=32,
                        help='batch size (default: 32)')
    parser.add_argument('--dropout', type=float, default=0.2,
                        help='dropout for the model (default: 0.2)')
    parser.add_argument('--emb_size', type=int, default=256,
                        help='elmo embeddings size (default: 256)')
    parser.add_argument('--model_name', type=str, default='baseline',
                        help='model name (default: baseline)')
    parser.add_argument('--tunable', action='store_true',
                        help='tune the underlying embedding model (default: False)')
    args = parser.parse_args()

    annotations = pd.read_csv('data/data/annotations_merged.csv')
    prompts = pd.read_csv('data/data/prompts_merged.csv')

    feature_dictionary = {}
    prompts_dictionary = {}

    for index, row in prompts.iterrows():
        prompts_dictionary[row['PromptID']] = [row['Outcome'], row['Intervention'], row['Comparator']]

    for index, row in annotations.iterrows():
        if row['PMCID'] not in feature_dictionary:
            feature_dictionary[row['PMCID']] = []
        feature_dictionary[row['PMCID']].append([row['Annotations'], row['Label']]
                                                + prompts_dictionary[row['PromptID']])

    train = []
    valid = []
    test = []

    with open('data/splits/train_article_ids.txt') as train_file:
        for line in train_file:
            train.append(int(line.strip()))

    with open('data/splits/validation_article_ids.txt') as valid_file:
        for line in valid_file:
            valid.append(int(line.strip()))

    with open('data/splits/test_article_ids.txt') as test_file:
        for line in test_file:
            test.append(int(line.strip()))

    elmo_token_indexer = {'elmo': ELMoTokenCharactersIndexer(), 'tokens': SingleIdTokenIndexer()}

    reader = EIDatasetReader(elmo_token_indexer, feature_dictionary)
    train_data = reader.read(train)
    valid_data = reader.read(valid)
    test_data = reader.read(test)

    vocab = Vocabulary.from_instances(train_data + valid_data + test_data)

    urls = [
        'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_'
        '2xhighway_options.json',
        'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_'
        '2xhighway_weights.hdf5'
    ]

    elmo_token_embedding = ElmoTokenEmbedder(urls[0], urls[1], dropout=args.dropout, requires_grad=args.tunable,
                                             projection_dim=args.emb_size)

    word_embeddings = BasicTextFieldEmbedder({'elmo': elmo_token_embedding}, allow_unmatched_keys=True)

    model = Baseline(word_embeddings, vocab)

    cuda_device = args.cuda_device

    if torch.cuda.is_available():
        model = model.cuda(cuda_device)
    else:
        cuda_device = -1

    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    iterator = BucketIterator(batch_size=args.batch_size,
                              sorting_keys=[('article', 'num_tokens')],
                              padding_noise=0.1)
    iterator.index_with(vocab)

    serialization_dir = 'model_checkpoints/' + args.model_name

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_data,
                      validation_dataset=test_data,
                      patience=args.patience,
                      validation_metric='+accuracy',
                      num_epochs=args.epochs,
                      cuda_device=cuda_device,
                      serialization_dir=serialization_dir)

    result = trainer.train()
    for key in result:
        print(str(key) + ': ' + str(result[key]))

    test_metrics = evaluate(trainer.model, test_data, iterator,
                            cuda_device=cuda_device,
                            batch_weight_key="")

    print('Test Data statistics:')
    for key, value in test_metrics.items():
        print(str(key) + ': ' + str(value))
Example 36
    def test_invalid_vocab_extension(self):
        vocab_dir = self.TEST_DIR / 'vocab_save'
        original_vocab = Vocabulary(non_padded_namespaces=["tokens1"])
        original_vocab.add_token_to_namespace("a", namespace="tokens1")
        original_vocab.add_token_to_namespace("b", namespace="tokens1")
        original_vocab.add_token_to_namespace("p", namespace="tokens2")
        original_vocab.save_to_files(vocab_dir)
        text_field1 = TextField([Token(t) for t in ["a", "c"]],
                                {"tokens1": SingleIdTokenIndexer("tokens1")})
        text_field2 = TextField([Token(t) for t in ["p", "q", "r"]],
                                {"tokens2": SingleIdTokenIndexer("tokens2")})
        instances = Batch([Instance({"text1": text_field1, "text2": text_field2})])

        # The following three should raise an error: tokens1 is non-padded in original_vocab but padded in the extension
        params = Params({"directory_path": vocab_dir, "extend": True,
                         "non_padded_namespaces": []})
        with pytest.raises(ConfigurationError):
            _ = Vocabulary.from_params(params, instances)
        with pytest.raises(ConfigurationError):
            extended_vocab = copy.copy(original_vocab)
            params = Params({"non_padded_namespaces": []})
            extended_vocab.extend_from_instances(params, instances)
        with pytest.raises(ConfigurationError):
            extended_vocab = copy.copy(original_vocab)
            extended_vocab._extend(non_padded_namespaces=[],
                                   tokens_to_add={"tokens1": ["a"], "tokens2": ["p"]})

        # The following three should not raise an error: overlapping namespaces have the same padding setting
        params = Params({"directory_path": vocab_dir, "extend": True,
                         "non_padded_namespaces": ["tokens1"]})
        Vocabulary.from_params(params, instances)
        extended_vocab = copy.copy(original_vocab)
        params = Params({"non_padded_namespaces": ["tokens1"]})
        extended_vocab.extend_from_instances(params, instances)
        extended_vocab = copy.copy(original_vocab)
        extended_vocab._extend(non_padded_namespaces=["tokens1"],
                               tokens_to_add={"tokens1": ["a"], "tokens2": ["p"]})

        # The following three should raise an error: tokens2 is padded in original_vocab but non-padded in the extension
        params = Params({"directory_path": vocab_dir, "extend": True,
                         "non_padded_namespaces": ["tokens1", "tokens2"]})
        with pytest.raises(ConfigurationError):
            _ = Vocabulary.from_params(params, instances)
        with pytest.raises(ConfigurationError):
            extended_vocab = copy.copy(original_vocab)
            params = Params({"non_padded_namespaces": ["tokens1", "tokens2"]})
            extended_vocab.extend_from_instances(params, instances)
        with pytest.raises(ConfigurationError):
            extended_vocab = copy.copy(original_vocab)
            extended_vocab._extend(non_padded_namespaces=["tokens1", "tokens2"],
                                   tokens_to_add={"tokens1": ["a"], "tokens2": ["p"]})
Example 37
    def test_from_params_valid_vocab_extension_thoroughly(self):
        '''
        Tests valid vocab extension thoroughly: vocab extension is valid
        when overlapping namespaces have the same padding behaviour (padded/non-padded)
        Summary of namespace paddings in this test:
        original_vocab namespaces
            tokens0     padded
            tokens1     non-padded
            tokens2     padded
            tokens3     non-padded
        instances namespaces
            tokens0     padded
            tokens1     non-padded
            tokens4     padded
            tokens5     non-padded
        Typical extension example (for the tokens1 namespace):
        -> original_vocab index2token
           apple          #0->apple
           bat            #1->bat
           cat            #2->cat
        -> Tokens to extend with: cat, an, apple, banana, atom, bat
        -> extended_vocab: index2token
           apple           #0->apple
           bat             #1->bat
           cat             #2->cat
           an              #3->an
           atom            #4->atom
           banana          #5->banana
        '''

        vocab_dir = self.TEST_DIR / 'vocab_save'
        original_vocab = Vocabulary(non_padded_namespaces=["tokens1", "tokens3"])
        original_vocab.add_token_to_namespace("apple", namespace="tokens0") # index:2
        original_vocab.add_token_to_namespace("bat", namespace="tokens0")   # index:3
        original_vocab.add_token_to_namespace("cat", namespace="tokens0")   # index:4

        original_vocab.add_token_to_namespace("apple", namespace="tokens1") # index:0
        original_vocab.add_token_to_namespace("bat", namespace="tokens1")   # index:1
        original_vocab.add_token_to_namespace("cat", namespace="tokens1")   # index:2

        original_vocab.add_token_to_namespace("a", namespace="tokens2") # index:0
        original_vocab.add_token_to_namespace("b", namespace="tokens2") # index:1
        original_vocab.add_token_to_namespace("c", namespace="tokens2") # index:2

        original_vocab.add_token_to_namespace("p", namespace="tokens3") # index:0
        original_vocab.add_token_to_namespace("q", namespace="tokens3") # index:1

        original_vocab.save_to_files(vocab_dir)

        text_field0 = TextField([Token(t) for t in ["cat", "an", "apple", "banana", "atom", "bat"]],
                                {"tokens0": SingleIdTokenIndexer("tokens0")})
        text_field1 = TextField([Token(t) for t in ["cat", "an", "apple", "banana", "atom", "bat"]],
                                {"tokens1": SingleIdTokenIndexer("tokens1")})
        text_field4 = TextField([Token(t) for t in ["l", "m", "n", "o"]],
                                {"tokens4": SingleIdTokenIndexer("tokens4")})
        text_field5 = TextField([Token(t) for t in ["x", "y", "z"]],
                                {"tokens5": SingleIdTokenIndexer("tokens5")})
        instances = Batch([Instance({"text0": text_field0, "text1": text_field1,
                                     "text4": text_field4, "text5": text_field5})])

        params = Params({"directory_path": vocab_dir,
                         "extend": True,
                         "non_padded_namespaces": ["tokens1", "tokens5"]})
        extended_vocab = Vocabulary.from_params(params, instances)

        # Namespaces tokens0 and tokens1 are common; tokens2 and tokens3 exist only in
        # the original vocab, tokens4 and tokens5 only in the instances.
        extended_namespaces = {*extended_vocab._token_to_index}
        assert extended_namespaces == {"tokens{}".format(i) for i in range(6)}

        # Check that _non_padded_namespaces list is consistent after extension
        assert extended_vocab._non_padded_namespaces == {"tokens1", "tokens3", "tokens5"}

        # original_vocab["tokens1"] has 3 tokens; the "tokens1" namespace in instances has 6 tokens, 3 of them overlapping
        assert extended_vocab.get_vocab_size("tokens1") == 6
        assert extended_vocab.get_vocab_size("tokens0") == 8 # 2 more than tokens1: the padded namespace also holds padding and OOV

        # Namespaces tokens2 and tokens3 were only in original_vocab,
        # so their token counts should be unchanged in extended_vocab
        assert extended_vocab.get_vocab_size("tokens2") == original_vocab.get_vocab_size("tokens2")
        assert extended_vocab.get_vocab_size("tokens3") == original_vocab.get_vocab_size("tokens3")

        # Namespaces tokens4 and tokens5 were only in the instances,
        # so their token counts come from the instances alone
        assert extended_vocab.get_vocab_size("tokens4") == 6 # l,m,n,o + oov + padding
        assert extended_vocab.get_vocab_size("tokens5") == 3 # x,y,z

        # Word2index mapping of all words in all namespaces of original_vocab
        # should be maintained in extended_vocab
        for namespace, token2index in original_vocab._token_to_index.items():
            for token, _ in token2index.items():
                vocab_index = original_vocab.get_token_index(token, namespace)
                extended_vocab_index = extended_vocab.get_token_index(token, namespace)
                assert vocab_index == extended_vocab_index
        # And same for Index2Word mapping
        for namespace, index2token in original_vocab._index_to_token.items():
            for index, _ in index2token.items():
                vocab_token = original_vocab.get_token_from_index(index, namespace)
                extended_vocab_token = extended_vocab.get_token_from_index(index, namespace)
                assert vocab_token == extended_vocab_token
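Example 38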
import os.path as osp

import lineflow.datasets as lfds
from allennlp.common.tqdm import Tqdm
from allennlp.data import Vocabulary
from allennlp.data.iterators import BucketIterator

SOURCE_FIELD_NAME = 'source_tokens'
TARGET_FIELD_NAME = 'target_tokens'

if __name__ == '__main__':
    print('Reading...')
    train = lfds.SmallParallelEnJa('train') \
        .to_allennlp(source_field_name=SOURCE_FIELD_NAME, target_field_name=TARGET_FIELD_NAME).all()
    validation = lfds.SmallParallelEnJa('dev') \
        .to_allennlp(source_field_name=SOURCE_FIELD_NAME, target_field_name=TARGET_FIELD_NAME).all()

    if not osp.exists('./enja_vocab'):
        print('Building vocabulary...')
        vocab = Vocabulary.from_instances(train + validation,
                                          max_vocab_size=50000)
        print(f'Vocab Size: {vocab.get_vocab_size()}')

        print('Saving...')
        vocab.save_to_files('./enja_vocab')
    else:
        print('Loading vocabulary...')
        vocab = Vocabulary.from_files('./enja_vocab')

    iterator = BucketIterator(sorting_keys=[(SOURCE_FIELD_NAME, 'num_tokens')],
                              batch_size=32)
    iterator.index_with(vocab)

    num_batches = iterator.get_num_batches(train)

    for batch in Tqdm.tqdm(iterator(train, num_epochs=1), total=num_batches):
        pass  # loop body omitted in this excerpt
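Example 39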
 def token_to_indices(self, token: Token, vocabulary: Vocabulary) -> int:
     tag = token.ent_type_
     if tag is None:
         tag = 'NONE'
     return vocabulary.get_token_index(tag, self._namespace)
Example 40
    def from_params(
        cls,
        params: Params,
        serialization_dir: str,
        recover: bool = False,
        cache_directory: str = None,
        cache_prefix: str = None,
    ) -> "TrainerPieces":
        all_datasets = training_util.datasets_from_params(
            params, cache_directory, cache_prefix)
        datasets_for_vocab_creation = set(
            params.pop("datasets_for_vocab_creation", all_datasets))

        for dataset in datasets_for_vocab_creation:
            if dataset not in all_datasets:
                raise ConfigurationError(
                    f"invalid 'dataset_for_vocab_creation' {dataset}")

        logger.info(
            "From dataset instances, %s will be considered for vocabulary creation.",
            ", ".join(datasets_for_vocab_creation),
        )

        if recover and os.path.exists(
                os.path.join(serialization_dir, "vocabulary")):
            vocab_params = params.pop("vocabulary", {})
            vocab = Vocabulary.from_files(
                os.path.join(serialization_dir, "vocabulary"),
                vocab_params.get("padding_token", None),
                vocab_params.get("oov_token", None),
            )
        else:
            vocab = Vocabulary.from_params(
                params.pop("vocabulary", {}),
                # Using a generator comprehension here is important
                # because, being lazy, it allows us to not iterate over the
                # dataset when directory_path is specified.
                (instance for key, dataset in all_datasets.items()
                 if key in datasets_for_vocab_creation
                 for instance in dataset),
            )

        model = Model.from_params(vocab=vocab, params=params.pop("model"))

        # If vocab extension is ON for training, embedding extension should also be
        # done. If vocab and embeddings are already in sync, it would be a no-op.
        model.extend_embedder_vocab()

        # Initializing the model can have the side effect of expanding the vocabulary
        vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

        iterator = DataIterator.from_params(params.pop("iterator"))
        iterator.index_with(model.vocab)
        validation_iterator_params = params.pop("validation_iterator", None)
        if validation_iterator_params:
            validation_iterator = DataIterator.from_params(
                validation_iterator_params)
            validation_iterator.index_with(model.vocab)
        else:
            validation_iterator = None

        train_data = all_datasets["train"]
        validation_data = all_datasets.get("validation")
        test_data = all_datasets.get("test")

        trainer_params = params.pop("trainer")
        no_grad_regexes = trainer_params.pop("no_grad", ())
        for name, parameter in model.named_parameters():
            if any(re.search(regex, name) for regex in no_grad_regexes):
                parameter.requires_grad_(False)

        frozen_parameter_names, tunable_parameter_names = get_frozen_and_tunable_parameter_names(
            model)
        logger.info("Following parameters are Frozen  (without gradient):")
        for name in frozen_parameter_names:
            logger.info(name)
        logger.info("Following parameters are Tunable (with gradient):")
        for name in tunable_parameter_names:
            logger.info(name)

        return cls(
            model,
            iterator,
            train_data,
            validation_data,
            test_data,
            validation_iterator,
            trainer_params,
        )
Example 41
    do_lowercase=True,
 )


# %%
reader = ClaimAnnotationReaderJSON(
    token_indexers={"tokens": token_indexer}
)

train_dataset = reader.read(TRAIN_PATH)
validation_dataset = reader.read(VALIDATION_PATH)
test_dataset = reader.read(TEST_PATH)


# %%
vocab = Vocabulary()

vocab._token_to_index['labels'] = {'0': 0, '1': 1}


# %%
"""Prepare iterator"""
from allennlp.data.iterators import BasicIterator

iterator = BasicIterator(batch_size=64)

iterator.index_with(vocab)


# %%
def multiple_target_CrossEntropyLoss(logits, labels):
Example 42
def main():
    # load the binary SST dataset.
    single_id_indexer = SingleIdTokenIndexer(lowercase_tokens=True) # word tokenizer
    # use_subtrees gives us a bit of extra data by breaking down each example into sub sentences.
    reader = StanfordSentimentTreeBankDatasetReader(granularity="2-class",
                                                    token_indexers={"tokens": single_id_indexer},
                                                    use_subtrees=True)
    train_data = reader.read('https://s3-us-west-2.amazonaws.com/allennlp/datasets/sst/train.txt')
    reader = StanfordSentimentTreeBankDatasetReader(granularity="2-class",
                                                    token_indexers={"tokens": single_id_indexer})
    dev_data = reader.read('https://s3-us-west-2.amazonaws.com/allennlp/datasets/sst/dev.txt')
    # test_dataset = reader.read('data/sst/test.txt')

    vocab = Vocabulary.from_instances(train_data)

    # Randomly initialize vectors
    if EMBEDDING_TYPE == "None":
        token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'), embedding_dim=300)
        word_embedding_dim = 300

    # Load word2vec vectors
    elif EMBEDDING_TYPE == "w2v":
        embedding_path = "https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip"
        weight = _read_pretrained_embeddings_file(embedding_path,
                                                  embedding_dim=300,
                                                  vocab=vocab,
                                                  namespace="tokens")
        token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                    embedding_dim=300,
                                    weight=weight,
                                    trainable=False)
        word_embedding_dim = 300

    # Initialize model, cuda(), and optimizer
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    encoder = PytorchSeq2VecWrapper(torch.nn.LSTM(word_embedding_dim,
                                                  hidden_size=512,
                                                  num_layers=2,
                                                  batch_first=True))
    model = LstmClassifier(word_embeddings, encoder, vocab)
    model.cuda()

    # where to save the model
    model_path = "/tmp/" + EMBEDDING_TYPE + "_" + "model.th"
    vocab_path = "/tmp/" + EMBEDDING_TYPE + "_" + "vocab"
    # if the model already exists (its been trained), load the pre-trained weights and vocabulary
    if os.path.isfile(model_path):
        vocab = Vocabulary.from_files(vocab_path)
        model = LstmClassifier(word_embeddings, encoder, vocab)
        with open(model_path, 'rb') as f:
            model.load_state_dict(torch.load(f))
    # otherwise train model from scratch and save its weights
    else:
        iterator = BucketIterator(batch_size=32, sorting_keys=[("tokens", "num_tokens")])
        iterator.index_with(vocab)
        optimizer = optim.Adam(model.parameters())
        trainer = Trainer(model=model,
                          optimizer=optimizer,
                          iterator=iterator,
                          train_dataset=train_data,
                          validation_dataset=dev_data,
                          num_epochs=5,
                          patience=1,
                          cuda_device=0)
        trainer.train()
        with open(model_path, 'wb') as f:
            torch.save(model.state_dict(), f)
        vocab.save_to_files(vocab_path)
    model.train().cuda() # rnn cannot do backwards in eval mode

    # Register a gradient hook on the embeddings. This saves the gradient w.r.t. the word embeddings.
    # We use the gradient later in the attack.
    utils.add_hooks(model)
    embedding_weight = utils.get_embedding_weight(model) # also save the word embedding matrix

    # Use batches of size universal_perturb_batch_size for the attacks.
    universal_perturb_batch_size = 128
    iterator = BasicIterator(batch_size=universal_perturb_batch_size)
    iterator.index_with(vocab)

    # Build k-d Tree if you are using gradient + nearest neighbor attack
    # tree = KDTree(embedding_weight.numpy())

    # filter the dataset to only positive or negative examples
    # (the trigger will cause the opposite prediction)
    dataset_label_filter = "0"
    targeted_dev_data = []
    for instance in dev_data:
        if instance['label'].label == dataset_label_filter:
            targeted_dev_data.append(instance)

    # get accuracy before adding triggers
    utils.get_accuracy(model, targeted_dev_data, vocab, trigger_token_ids=None)
    model.train() # rnn cannot do backwards in eval mode

    # initialize triggers, which are concatenated to the input
    num_trigger_tokens = 3
    trigger_token_ids = [vocab.get_token_index("the")] * num_trigger_tokens

    # sample batches, update the triggers, and repeat
    for batch in lazy_groups_of(iterator(targeted_dev_data, num_epochs=5, shuffle=True), group_size=1):
        # get accuracy with current triggers
        utils.get_accuracy(model, targeted_dev_data, vocab, trigger_token_ids)
        model.train() # rnn cannot do backwards in eval mode

        # get gradient w.r.t. trigger embeddings for current batch
        averaged_grad = utils.get_average_grad(model, batch, trigger_token_ids)

        # pass the gradients to a particular attack to generate token candidates for each token.
        cand_trigger_token_ids = attacks.hotflip_attack(averaged_grad,
                                                        embedding_weight,
                                                        trigger_token_ids,
                                                        num_candidates=40,
                                                        increase_loss=True)
        # cand_trigger_token_ids = attacks.random_attack(embedding_weight,
        #                                                trigger_token_ids,
        #                                                num_candidates=40)
        # cand_trigger_token_ids = attacks.nearest_neighbor_grad(averaged_grad,
        #                                                        embedding_weight,
        #                                                        trigger_token_ids,
        #                                                        tree,
        #                                                        100,
        #                                                        num_candidates=40,
        #                                                        increase_loss=True)

        # Tries all of the candidates and returns the trigger sequence with highest loss.
        trigger_token_ids = utils.get_best_candidates(model,
                                                      batch,
                                                      trigger_token_ids,
                                                      cand_trigger_token_ids)

    # print accuracy after adding triggers
    utils.get_accuracy(model, targeted_dev_data, vocab, trigger_token_ids)
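
The hotflip_attack call above comes from the repository's attacks module; as a rough illustration of the first-order scoring that a HotFlip-style candidate search is based on, here is a minimal sketch in plain PyTorch (names and shapes are assumptions, not the repository's actual code):

import torch

def hotflip_candidates_sketch(averaged_grad, embedding_weight,
                              num_candidates=40, increase_loss=True):
    # averaged_grad:    (num_trigger_tokens, embedding_dim) loss gradient w.r.t.
    #                   the trigger embeddings, averaged over the batch
    # embedding_weight: (vocab_size, embedding_dim) word embedding matrix
    # Returns (num_trigger_tokens, num_candidates) candidate token ids, ranked by
    # a first-order approximation of how much each replacement changes the loss.
    scores = torch.einsum("td,vd->tv", averaged_grad, embedding_weight)
    if not increase_loss:
        scores = -scores  # flip the sign if we want to decrease the loss instead
    _, candidate_ids = scores.topk(num_candidates, dim=1)
    return candidate_ids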
Example n. 43
0
    def add_task(self, task_tag: str, vocab: Vocabulary):
        self.classification_layers.append(
            torch.nn.Linear(in_features=self.hidden_dim,
                            out_features=vocab.get_vocab_size('labels')))
        self.num_task = self.num_task + 1
        self.task2id[task_tag] = self.num_task
        self.tasks_vocabulary[task_tag] = vocab
    def index(self, vocab: Vocabulary):
        if self._indexed_labels is None and self.labels is not None:
            self._indexed_labels = [vocab.get_token_index(label, self._label_namespace)
                                    for label in self.labels]
Example n. 45
0
def main():

    trainFile = "../srcData/trainData.csv"
    validFile = "../srcData/devData.csv"
    testFile = "../srcData/testData.csv"
    trainSeq2SeqFile = data.dataPreparation(trainFile)
    validSeq2SeqFile = data.dataPreparation(validFile)
    testSeq2SeqFile = data.dataPreparation(testFile)
    print(testSeq2SeqFile)
    # TokenIndexer determines how string tokens get represented as arrays of indices in a model
    # SingleIdTokenIndexer: tokens are single integers
    # TokenCharactersIndexer: tokens are lists of character integers
    # Read a TSV file with paired instances (source, target)
    reader = Seq2SeqDatasetReader(
        source_tokenizer = WordTokenizer(),
        target_tokenizer = WordTokenizer(), # Defaults to source_tokenizer
        source_token_indexers={'tokens': SingleIdTokenIndexer()},
        target_token_indexers={'tokens': SingleIdTokenIndexer()} # Defaults to source_token_indexers
    )

    # Each dataset is a list of instances with (source_tokens, target_tokens) fields
    train_dataset = reader.read(trainSeq2SeqFile)
    validation_dataset = reader.read(validSeq2SeqFile)
    test_dataset = reader.read(testSeq2SeqFile)

    # Finding extra fact2 vocab
    trainExtraVocab = findExtraVocab(train_dataset)
    validExtraVocab = findExtraVocab(validation_dataset)
    testExtraVocab = findExtraVocab(test_dataset)
    finalExtraVocab = list(set(trainExtraVocab+validExtraVocab+testExtraVocab))
    print("length:",len(finalExtraVocab))
    #input()

    #vocab = Vocabulary.from_instances(train_dataset + validation_dataset, min_count={'tokens': 3, 'target_tokens': 3})
    vocab = Vocabulary.from_instances(train_dataset + validation_dataset + test_dataset)
    # Train + Valid = 9703
    # Train + Valid + Test = 10099


    print("Vocab size:", vocab.get_vocab_size('tokens'))

    encEmbedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                             embedding_dim=ENC_EMBEDDING_DIM)

    # Embedding for the 'tokens' namespace, matching the indexer name used when the dataset was created
    source_embedder = BasicTextFieldEmbedder({"tokens": encEmbedding})

    encoder = PytorchSeq2SeqWrapper(torch.nn.LSTM(ENC_EMBEDDING_DIM,HIDDEN_DIM,batch_first=True,dropout=0.2))


    attention = DotProductAttention()

    max_decoding_steps = 4  # TODO: make this variable
    model = SimpleSeq2Seq(vocab, source_embedder, encoder, max_decoding_steps,
                          target_embedding_dim = TGT_EMBEDDING_DIM,
                          #target_namespace = 'target_tokens',
                          attention = attention,
                          beam_size = beamSize,
                          use_bleu = True,
                          extra_vocab = finalExtraVocab)
    #Can also specify lr=0.001
    optimizer = optim.Adam(model.parameters())

    # Data iterator that specifies how to batch our dataset.
    # Takes the data, shuffles it, and creates fixed-size batches.
    #iterator = BasicIterator(batch_size=2)
    #iterator.index_with(vocab)
    # Pads each batch to its max input length and sorts the dataset by the given field names and padding keys for efficient computation.
    iterator = BucketIterator(batch_size=50, sorting_keys=[("source_tokens", "num_tokens")])
    iterator.index_with(vocab)

    trainer = Trainer(model = model,
                      optimizer = optimizer,
                      iterator = iterator,
                      train_dataset = train_dataset,
                      validation_dataset = validation_dataset,
                      #patience = 3,
                      num_epochs = numEpochs,
                      cuda_device = CUDA_DEVICE)

    trainer.train()
    predictor = SimpleSeq2SeqPredictor(model, reader)

    '''for i in range(2):
        print ("Epoch: {}".format(i))
        trainer.train()

        predictor = SimpleSeq2SeqPredictor(model, reader)


        for instance in itertools.islice(validation_dataset, 10):
            print('SOURCE:', instance.fields['source_tokens'].tokens)
            print('GOLD:', instance.fields['target_tokens'].tokens)
            print('PRED:', predictor.predict_instance(instance)['predicted_tokens'])
            """'{'predictions': [[1, 4, 5, 92, 8, 6, 1, 8, 6, 26, 3]], 
             'loss': 5.9835076332092285,
             'class_log_probabilities': [-20.10894012451172],
             'predicted_tokens': ['@@UNKNOWN@@', 'is', 'a', 'type', 'of', 'the', '@@UNKNOWN@@', 'of', 'the', 'sun']}
             """
            print (predictor.predict_instance(instance))
    '''

    outFile = open("output_"+str(HIDDEN_DIM)+"_"+str(numEpochs)+"_"+str(beamSize)+".csv","w")
    writer = csv.writer(outFile,delimiter="\t")
    for instance in itertools.islice(test_dataset,500):
        src = instance.fields['source_tokens'].tokens
        gold = instance.fields['target_tokens'].tokens
        pred = predictor.predict_instance(instance)['predicted_tokens']
        writer.writerow([src,gold,pred])


    outFile.close()
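
Note that writer.writerow([src, gold, pred]) above writes the Python list representations of the token lists into the TSV. If plain-text columns are preferred, a small variation (a sketch reusing the same instance, predictor, and writer objects from this snippet) would join the tokens first:

    src_text = " ".join(str(tok) for tok in instance.fields['source_tokens'].tokens)
    gold_text = " ".join(str(tok) for tok in instance.fields['target_tokens'].tokens)
    pred_text = " ".join(predictor.predict_instance(instance)['predicted_tokens'])
    writer.writerow([src_text, gold_text, pred_text])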
Example n. 46
0
    def __init__(
        self,
        vocab: Vocabulary,
        source_embedder: TextFieldEmbedder,
        encoder: Seq2SeqEncoder,
        vecoder: Seq2VecEncoder,
        sen_encoder: Seq2VecEncoder,
        max_decoding_steps: int = 32,
        attention: Attention = None,
        beam_size: int = None,
        target_namespace: str = "tokens",
        scheduled_sampling_ratio: float = 0.5,
    ) -> None:
        super().__init__(vocab)
        self._target_namespace = target_namespace
        self._scheduled_sampling_ratio = scheduled_sampling_ratio  # Maybe we can try
        self._start_index = self.vocab.get_token_index(START_SYMBOL,
                                                       self._target_namespace)
        self._end_index = self.vocab.get_token_index(END_SYMBOL,
                                                     self._target_namespace)
        self.pad_index = self.vocab.get_token_index(self.vocab._padding_token,
                                                    self._target_namespace)
        self._max_decoding_steps = max_decoding_steps
        self.vocab = vocab
        # everything related to dimensions
        self.sen_num = 10
        # with open('../data/0510/cy/kg_and_train.pk', 'rb') as f:
        with open('cy/openkg.pk', 'rb') as f:
            self.kg_mat = torch.tensor(pickle.load(f)).float()
        self.symp_mat = torch.nn.Parameter(self.kg_mat).cuda()
        self.evovl_mat = torch.zeros(len(self.kg_mat), len(self.kg_mat)).cuda()

        # with open('../data/0510/cy/comp_topic2num.pk', 'rb') as f:
        with open('cy/comp_topic2num.pk', 'rb') as f:
            self.word_idx = pickle.load(f)
        self.idx_word = {v: k for k, v in self.word_idx.items()}
        self.vocab_to_idx = {}
        self.idx_to_vocab_list = []
        self.vocab_list = []
        for word, k in self.word_idx.items():
            self.vocab_to_idx[vocab.get_token_index(word.strip())] = k
            self.idx_to_vocab_list.append(vocab.get_token_index(word.strip()))

        self.symp_size = len(self.symp_mat) + self.sen_num
        self.topic = len(self.symp_mat)
        self._encoder = encoder
        self._vecoder = vecoder
        self._sen_encoder = sen_encoder

        self.outfeature = self._sen_encoder.get_output_dim()
        # everything related to the graph
        self.symp_state = torch.nn.Parameter(
            torch.Tensor(self.symp_size, self.outfeature))
        torch.nn.init.xavier_uniform_(self.symp_state, gain=1.414)
        self.predict_layer = torch.nn.Parameter(
            torch.Tensor(self.symp_size, self.outfeature))
        self.predict_bias = torch.nn.Parameter(torch.Tensor(self.symp_size))
        torch.nn.init.kaiming_uniform_(self.predict_layer)
        torch.nn.init.uniform_(self.predict_bias, -1 / self.symp_size**0.5,
                               1 / self.symp_size**0.5)

        self.attn_one = GATAttention(self.outfeature, self.outfeature, 1)
        self.attn_two = GATAttention(self.outfeature, self.outfeature, 1)
        self.attn_three = GATAttention(self.outfeature, self.outfeature, 1)

        # Metric
        self.kd_metric = KD_Metric()
        self.bleu_aver = NLTK_BLEU(ngram_weights=(0.25, 0.25, 0.25, 0.25))
        self.bleu1 = NLTK_BLEU(ngram_weights=(1, 0, 0, 0))
        self.bleu2 = NLTK_BLEU(ngram_weights=(0, 1, 0, 0))
        self.bleu4 = NLTK_BLEU(ngram_weights=(0, 0, 0, 1))
        self.topic_acc = Average()
        # everything related to modules
        self._source_embedder = source_embedder
        num_classes = self.vocab.get_vocab_size(self._target_namespace)

        target_embedding_dim = source_embedder.get_output_dim()
        self._target_embedder = Embedding(num_classes, target_embedding_dim)
        self._encoder_output_dim = self._encoder.get_output_dim()  # 600; maybe just replace the first two with outfeature
        self._decoder_output_dim = self._encoder_output_dim * 2
        self._decoder_input_dim = target_embedding_dim
        self._attention = None
        if attention:
            self._attention = attention
            self._decoder_input_dim = self._decoder_output_dim + target_embedding_dim

        # Try fusing that embedding in here?
        self.before_linear = Linear(2 * self.outfeature, self.outfeature)
        self._decoder_cell = LSTMCell(self._decoder_input_dim,
                                      self._decoder_output_dim)

        self._output_projection_layer = Linear(self.outfeature * 2,
                                               num_classes)

        self.linear_all = Linear(self.outfeature * 3 + self._decoder_input_dim,
                                 1)
        self.attention_linear = Linear(self.outfeature, self.outfeature)
        self.decoder_linear = Linear(self.outfeature * 2, self.outfeature)

        self.get_attn = Linear(self.outfeature, 1, bias=False)
        self.topic_acc = MyAverage()
        self.topic_rec = MyAverage()
        self.topic_f1 = F1()
        self.dink1 = Distinct1()
        self.dink2 = Distinct2()
        self.last_sen = 2
        self.clac_num = 0
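
The scheduled_sampling_ratio stored in this constructor is only assigned here; for reference, a minimal sketch of how scheduled sampling is typically applied inside a decoding loop (an illustration under assumed names, not this model's actual forward pass):

import torch

def choose_decoder_input(last_predictions, gold_targets, timestep, training,
                         scheduled_sampling_ratio):
    # During training, with probability scheduled_sampling_ratio feed the model's
    # own previous prediction instead of the gold token (teacher forcing otherwise).
    if training and torch.rand(1).item() < scheduled_sampling_ratio:
        return last_predictions
    if gold_targets is not None:
        return gold_targets[:, timestep]
    return last_predictions  # inference: no gold tokens available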
Example n. 47
0
    def test_valid_vocab_extension(self):
        vocab_dir = self.TEST_DIR / 'vocab_save'
        extension_ways = ["from_params", "extend_from_instances"]
        # Test: padded/non-padded common namespaces are extended appropriately
        non_padded_namespaces_list = [[], ["tokens"]]
        for non_padded_namespaces in non_padded_namespaces_list:
            original_vocab = Vocabulary(non_padded_namespaces=non_padded_namespaces)
            original_vocab.add_token_to_namespace("d", namespace="tokens")
            original_vocab.add_token_to_namespace("a", namespace="tokens")
            original_vocab.add_token_to_namespace("b", namespace="tokens")
            text_field = TextField([Token(t) for t in ["a", "d", "c", "e"]],
                                   {"tokens": SingleIdTokenIndexer("tokens")})
            instances = Batch([Instance({"text": text_field})])
            for way in extension_ways:
                if way == "extend_from_instances":
                    extended_vocab = copy.copy(original_vocab)
                    params = Params({"non_padded_namespaces": non_padded_namespaces})
                    extended_vocab.extend_from_instances(params, instances)
                else:
                    shutil.rmtree(vocab_dir, ignore_errors=True)
                    original_vocab.save_to_files(vocab_dir)
                    params = Params({"directory_path": vocab_dir, "extend": True,
                                     "non_padded_namespaces": non_padded_namespaces})
                    extended_vocab = Vocabulary.from_params(params, instances)

                extra_count = 2 if extended_vocab.is_padded("tokens") else 0
                assert extended_vocab.get_token_index("d", "tokens") == 0 + extra_count
                assert extended_vocab.get_token_index("a", "tokens") == 1 + extra_count
                assert extended_vocab.get_token_index("b", "tokens") == 2 + extra_count

                assert extended_vocab.get_token_index("c", "tokens") # should be present
                assert extended_vocab.get_token_index("e", "tokens") # should be present

                assert extended_vocab.get_vocab_size("tokens") == 5 + extra_count

        # Test: padded/non-padded non-common namespaces are extended appropriately
        non_padded_namespaces_list = [[],
                                      ["tokens1"],
                                      ["tokens1", "tokens2"]]
        for non_padded_namespaces in non_padded_namespaces_list:
            original_vocab = Vocabulary(non_padded_namespaces=non_padded_namespaces)
            original_vocab.add_token_to_namespace("a", namespace="tokens1") # index2
            text_field = TextField([Token(t) for t in ["b"]],
                                   {"tokens2": SingleIdTokenIndexer("tokens2")})
            instances = Batch([Instance({"text": text_field})])

            for way in extension_ways:
                if way == "extend_from_instances":
                    extended_vocab = copy.copy(original_vocab)
                    params = Params({"non_padded_namespaces": non_padded_namespaces})
                    extended_vocab.extend_from_instances(params, instances)
                else:
                    shutil.rmtree(vocab_dir, ignore_errors=True)
                    original_vocab.save_to_files(vocab_dir)
                    params = Params({"directory_path": vocab_dir, "extend": True,
                                     "non_padded_namespaces": non_padded_namespaces})
                    extended_vocab = Vocabulary.from_params(params, instances)

                # Should have two namespaces
                assert len(extended_vocab._token_to_index) == 2

                extra_count = 2 if extended_vocab.is_padded("tokens1") else 0
                assert extended_vocab.get_vocab_size("tokens1") == 1 + extra_count

                extra_count = 2 if extended_vocab.is_padded("tokens2") else 0
                assert extended_vocab.get_vocab_size("tokens2") == 1 + extra_count
Example n. 48
0
    def test_from_params(self):
        # Save a vocab to check we can load it from_params.
        vocab_dir = self.TEST_DIR / 'vocab_save'
        vocab = Vocabulary(non_padded_namespaces=["a", "c"])
        vocab.add_token_to_namespace("a0", namespace="a")  # non-padded, should start at 0
        vocab.add_token_to_namespace("a1", namespace="a")
        vocab.add_token_to_namespace("a2", namespace="a")
        vocab.add_token_to_namespace("b2", namespace="b")  # padded, should start at 2
        vocab.add_token_to_namespace("b3", namespace="b")
        vocab.save_to_files(vocab_dir)

        params = Params({"directory_path": vocab_dir})
        vocab2 = Vocabulary.from_params(params)
        assert vocab.get_index_to_token_vocabulary("a") == vocab2.get_index_to_token_vocabulary("a")
        assert vocab.get_index_to_token_vocabulary("b") == vocab2.get_index_to_token_vocabulary("b")

        # Test case where we build a vocab from a dataset.
        vocab2 = Vocabulary.from_params(Params({}), self.dataset)
        assert vocab2.get_index_to_token_vocabulary("tokens") == {0: '@@PADDING@@',
                                                                  1: '@@UNKNOWN@@',
                                                                  2: 'a', 3: 'c', 4: 'b'}
        # Test from_params raises when we have neither a dataset nor a vocab_directory.
        with pytest.raises(ConfigurationError):
            _ = Vocabulary.from_params(Params({}))

        # Test from_params raises when there are any other dict keys
        # present apart from 'directory_path' and we aren't calling from_dataset.
        with pytest.raises(ConfigurationError):
            _ = Vocabulary.from_params(Params({"directory_path": vocab_dir, "min_count": {'tokens': 2}}))
Example n. 49
0
        WORD2VEC = load_w2v(args.word2vec, VOCAB)
        print(
            f'Loaded {len(WORD2VEC)} words. Coverage: { len(WORD2VEC) / len(VOCAB)*100:.2f}%'
        )
    elif args.rank_func == 'bert':
        import torch
        from torch.nn import functional as F

        from pytorch_pretrained_bert.modeling import BertForNextSentencePrediction
        from allennlp.data import Instance
        from allennlp.data.dataset import Batch
        from allennlp.data.fields import TextField
        from allennlp.data.tokenizers import WordTokenizer, Token
        from allennlp.data.tokenizers.word_splitter import BertBasicWordSplitter
        from allennlp.data.token_indexers.wordpiece_indexer import PretrainedBertIndexer
        from allennlp.data.vocabulary import Vocabulary

        print('Initialize BERT model...')
        TOKENIZER = WordTokenizer(word_splitter=BertBasicWordSplitter())
        WORD_INDEXER = PretrainedBertIndexer(pretrained_model=args.bert_vocab)
        VOCAB = Vocabulary()
        GPU_ID = args.gpu_id
        BERT_NEXT_SENTENCE = BertForNextSentencePrediction.from_pretrained(
            args.bert_model).to(torch.device(f"cuda:{GPU_ID}"))
        BERT_NEXT_SENTENCE.eval()

    main()

    if args.rank_func == 'sentenc':
        SESSION.close()
    vocab = Vocabulary()
    for ns in ["tokens", "token_in", "token_out"]:
        for chord in itertools.product(note_list, accidental_list,
                                       chord_type_list):
            vocab.add_token_to_namespace("".join(chord), namespace=ns)

        vocab.add_token_to_namespace(START_SYMBOL, namespace=ns)
        vocab.add_token_to_namespace(END_SYMBOL, namespace=ns)

    key_list = [
        "".join(x) for x in itertools.product(note_list, accidental_list)
    ]
    form_list = ["m", "+", "o", "M", "%", "It", "Ger", "Fr"]
    figbass_list = ["7", "6"]
    for char in (key_list + form_list + figbass_list):
        vocab.add_token_to_namespace(char, namespace="token_characters")

    note_number_list = [str(x) for x in range(12)]
    for note_number in note_number_list:
        vocab.add_token_to_namespace(note_number, namespace="notes")

    vocab.save_to_files("data/vocabulary")


generate_vocab()

vocab = Vocabulary.from_files("data/vocabulary")

print(vocab.get_token_to_index_vocabulary())
Example n. 51
0
    def test_from_params_adds_tokens_to_vocab(self):
        vocab = Vocabulary.from_params(Params({'tokens_to_add': {'tokens': ['q', 'x', 'z']}}),
                                       self.dataset)
        assert vocab.get_index_to_token_vocabulary("tokens") == {0: '@@PADDING@@',
                                                                 1: '@@UNKNOWN@@',
                                                                 2: 'a', 3: 'c', 4: 'b',
                                                                 5: 'q', 6: 'x', 7: 'z'}
Example n. 52
0
    def __init__(self, word_embeddings: TextFieldEmbedder, encoder: Seq2VecEncoder,
                 vocab: Vocabulary) -> None:
        super().__init__(vocab)
        self.word_embedding = word_embeddings
        self.encoder = encoder
        self.hidden2out = torch.nn.Linear(in_features=encoder.get_output_dim(),
                                          out_features=vocab.get_vocab_size("labels"))
        self.accuracy = MicroMetrics(vocab)
        self.lstm = nn.LSTM(input_size=word_embeddings.get_output_dim(), hidden_size=128,
                            num_layers=1, batch_first=True)
        self.label_index_to_label = self.vocab.get_index_to_token_vocabulary('labels')
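
Only the constructor is shown in this example; a minimal sketch (an assumption, not part of the original snippet) of the forward() that such an __init__ usually pairs with in AllenNLP:

import torch
from allennlp.nn.util import get_text_field_mask

def forward(self, tokens, label=None):
    # meant as a method of the classifier above
    mask = get_text_field_mask(tokens)
    embeddings = self.word_embedding(tokens)
    encoding = self.encoder(embeddings, mask)   # Seq2VecEncoder -> (batch, encoder_dim)
    logits = self.hidden2out(encoding)          # (batch, num_labels)
    output = {"logits": logits}
    if label is not None:
        output["loss"] = torch.nn.functional.cross_entropy(logits, label)
    return output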
Example n. 53
0
    def test_saving_and_loading(self):
        # pylint: disable=protected-access
        vocab_dir = self.TEST_DIR / 'vocab_save'

        vocab = Vocabulary(non_padded_namespaces=["a", "c"])
        vocab.add_token_to_namespace("a0", namespace="a")  # non-padded, should start at 0
        vocab.add_token_to_namespace("a1", namespace="a")
        vocab.add_token_to_namespace("a2", namespace="a")
        vocab.add_token_to_namespace("b2", namespace="b")  # padded, should start at 2
        vocab.add_token_to_namespace("b3", namespace="b")

        vocab.save_to_files(vocab_dir)
        vocab2 = Vocabulary.from_files(vocab_dir)

        assert vocab2._non_padded_namespaces == {"a", "c"}

        # Check namespace a.
        assert vocab2.get_vocab_size(namespace='a') == 3
        assert vocab2.get_token_from_index(0, namespace='a') == 'a0'
        assert vocab2.get_token_from_index(1, namespace='a') == 'a1'
        assert vocab2.get_token_from_index(2, namespace='a') == 'a2'
        assert vocab2.get_token_index('a0', namespace='a') == 0
        assert vocab2.get_token_index('a1', namespace='a') == 1
        assert vocab2.get_token_index('a2', namespace='a') == 2

        # Check namespace b.
        assert vocab2.get_vocab_size(namespace='b') == 4  # (unk + padding + two tokens)
        assert vocab2.get_token_from_index(0, namespace='b') == vocab._padding_token
        assert vocab2.get_token_from_index(1, namespace='b') == vocab._oov_token
        assert vocab2.get_token_from_index(2, namespace='b') == 'b2'
        assert vocab2.get_token_from_index(3, namespace='b') == 'b3'
        assert vocab2.get_token_index(vocab._padding_token, namespace='b') == 0
        assert vocab2.get_token_index(vocab._oov_token, namespace='b') == 1
        assert vocab2.get_token_index('b2', namespace='b') == 2
        assert vocab2.get_token_index('b3', namespace='b') == 3

        # Check the dictionaries containing the reverse mapping are identical.
        assert vocab.get_index_to_token_vocabulary("a") == vocab2.get_index_to_token_vocabulary("a")
        assert vocab.get_index_to_token_vocabulary("b") == vocab2.get_index_to_token_vocabulary("b")
def main():
    # Initializing the embeddings (BERT)
    bert_token_indexer = PretrainedBertIndexer(
        pretrained_model="./biobert_pubmed/vocab.txt",
        max_pieces=config.max_seq_len,
        do_lowercase=True,
    )
    reader = BertAnalogyDatasetReader(
        tokenizer=bert_tokenizer,
        token_indexers={'tokens': bert_token_indexer})

    train_dataset, test_dataset, dev_dataset = (
        reader.read(DATA_ROOT + "/" + fname)
        for fname in ["train_all.txt", "test_all.txt", "val_all.txt"])

    vocab = Vocabulary.from_instances(train_dataset + test_dataset +
                                      dev_dataset)

    bert_embedder = PretrainedBertEmbedder(
        pretrained_model='biobert_pubmed',
        top_layer_only=True,  # conserve memory
    )
    word_embeddings: TextFieldEmbedder = BasicTextFieldEmbedder(
        {"tokens": bert_embedder},
        # we'll be ignoring masks so we'll need to set this to True
        allow_unmatched_keys=True)

    BERT_DIM = word_embeddings.get_output_dim()

    class BertSentencePooler(Seq2VecEncoder):
        def forward(self,
                    embs: torch.Tensor,
                    mask: torch.Tensor = None) -> torch.Tensor:
            # extract the first wordpiece ([CLS]) embedding for each sequence
            return embs[:, 0]

        @overrides
        def get_output_dim(self) -> int:
            return BERT_DIM

    # Initializing the model: the pooler takes the first wordpiece ([CLS]) embedding
    # as a single fixed-size sentence representation
    bert_encoder = BertSentencePooler(vocab)

    model = LstmModel(word_embeddings, bert_encoder, vocab)
    if USE_GPU:
        model.cuda()

    # Training the model
    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)
    iterator = BucketIterator(batch_size=32,
                              sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      patience=10,
                      cuda_device=0 if USE_GPU else -1,
                      num_epochs=20)

    trainer.train()

    #Saving the model
    with open("biobert/model.th", 'wb') as f:
        torch.save(model.state_dict(), f)

    vocab.save_to_files("biobert/vocabulary")
    return vocab
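
To reuse the artifacts saved above, the same load pattern used earlier in this document applies; a sketch (paths taken from this snippet, assuming the same LstmModel class and construction arguments):

    vocab = Vocabulary.from_files("biobert/vocabulary")
    model = LstmModel(word_embeddings, bert_encoder, vocab)
    with open("biobert/model.th", 'rb') as f:
        model.load_state_dict(torch.load(f))
    model.eval()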
Example n. 55
0
    def test_add_word_to_index_gives_consistent_results(self):
        vocab = Vocabulary()
        initial_vocab_size = vocab.get_vocab_size()
        word_index = vocab.add_token_to_namespace("word")
        assert "word" in vocab.get_index_to_token_vocabulary().values()
        assert vocab.get_token_index("word") == word_index
        assert vocab.get_token_from_index(word_index) == "word"
        assert vocab.get_vocab_size() == initial_vocab_size + 1

        # Now add it again, and make sure nothing changes.
        vocab.add_token_to_namespace("word")
        assert "word" in vocab.get_index_to_token_vocabulary().values()
        assert vocab.get_token_index("word") == word_index
        assert vocab.get_token_from_index(word_index) == "word"
        assert vocab.get_vocab_size() == initial_vocab_size + 1
Example n. 56
0
    def index(self, vocab: Vocabulary):
        self._mapping_array = [
            vocab.get_token_index(x.text, self._target_namespace)
            for x in self._source_tokens
        ]
Example n. 57
0
def main():
    logging.basicConfig(
        level=logging.INFO,
        format='[%(asctime)s] [%(name)s] [%(levelname)s] %(message)s')

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--model-name',
        help='Model\'s name (the name of directory with the trained model)')
    parser.add_argument(
        '--pretrained-models-dir',
        default=None,
        help='Path to directory with pretrained models (e.g., RuBERT)')
    parser.add_argument('--models-dir',
                        default='../models',
                        help='Path to directory where the models are stored')
    parser.add_argument(
        '--data-dir',
        default='../data/test_private_data',
        help='Path to directory with files to apply the model to')
    parser.add_argument('--predictions-dir',
                        default='../predictions/private',
                        help='Path to directory to store the predictions')
    parser.add_argument('--batch-size', default=128, type=int)
    parser.add_argument('--checkpoint-name',
                        default='best.th',
                        help='Name of the checkpoint to use')
    parser.add_argument('--cuda-unit', default='0', help='CUDA device number')
    args = parser.parse_args()

    model_dir = os.path.join(args.models_dir, args.model_name)
    result_data_dir = args.predictions_dir
    #result_data_dir = os.path.join(args.predictions_dir, args.model_name)

    if not os.path.isdir(result_data_dir):
        os.makedirs(result_data_dir)

    config = Config.load(os.path.join(model_dir, 'config.json'))

    if args.models_dir:
        config.data.models_dir = args.models_dir
    if args.pretrained_models_dir:
        config.data.pretrained_models_dir = args.pretrained_models_dir

    logger.info('Config: %s', config)

    cuda_dev_name = 'cuda:' + args.cuda_unit
    device = torch.device(
        cuda_dev_name if torch.cuda.is_available() else 'cpu:0')
    #device = torch.device('cpu')

    vocab = Vocabulary.from_files(os.path.join(model_dir, 'vocab'))
    lemmatize_helper = LemmatizeHelper.load(model_dir)
    morpho_vectorizer = MorphoVectorizer() if config.embedder.use_pymorphy else None

    model = _build_model(config,
                         vocab,
                         lemmatize_helper,
                         morpho_vectorizer,
                         bert_max_length=BERT_MAX_LENGTH)
    model.HeuristicMode = True
    model.to(device)

    model.load_state_dict(
        torch.load(os.path.join(model_dir, args.checkpoint_name),
                   map_location=device))
    model.eval()

    reader = _get_reader(config,
                         skip_labels=True,
                         bert_max_length=BERT_MAX_LENGTH,
                         reader_max_length=None)

    for root, dirs, files in os.walk(args.data_dir):
        reroot = root[len(args.data_dir) + 1:]

        for name in dirs:
            os.makedirs(os.path.join(result_data_dir, reroot, name))

        for name in files:
            path = os.path.join(root, name)
            result_path = os.path.join(result_data_dir, reroot, name)

            if not path.endswith('.conllu'):
                continue

            print("PROCESSING: " + path)
            data = reader.read(path)

            if morpho_vectorizer is not None:
                morpho_vectorizer.apply_to_instances(data)

            with open(result_path, 'w') as f_out:
                for begin_index in tqdm(range(0, len(data), args.batch_size)):
                    end_index = min(len(data), begin_index + args.batch_size)
                    predictions_list = model.forward_on_instances(
                        data[begin_index:end_index])
                    for predictions in predictions_list:
                        for token_index in range(len(predictions['words'])):
                            #word = predictions['words'][token_index]
                            word = predictions['original_words'][token_index]
                            lemma = predictions['predicted_lemmas'][
                                token_index]
                            upos, feats = predictions['predicted_gram_vals'][
                                token_index].split('|', 1)
                            feats = reorder_grammemes(feats)
                            head_tag = predictions['predicted_dependencies'][
                                token_index]
                            head_index = predictions['predicted_heads'][
                                token_index]

                            #print(token_index + 1, word, lemma, upos, '_', feats, head_index, head_tag, '_', '_', sep='\t', file=f_out)
                            tn = predictions['token_nos'][token_index]
                            hn = predictions['token_nos'][
                                head_index - 1] if head_index > 0 else 0
                            print(tn,
                                  word,
                                  lemma,
                                  upos,
                                  '_',
                                  feats,
                                  hn,
                                  head_tag,
                                  '_',
                                  '_',
                                  sep='\t',
                                  file=f_out)
                        print(file=f_out)
from bella_allen_nlp.allen_models.target_lstm import TargetLSTMClassifier

#token_indexers = {'tokens': SingleIdTokenIndexer(namespace='tokens_id'),
#                  'chars': TokenCharactersIndexer(namespace='char_id')}
token_indexers = {'tokens': SingleIdTokenIndexer(namespace='tokens_id', 
                                                 lowercase_tokens=True)}
reader = TargetDatasetReader(token_indexers=token_indexers)
train_dataset = reader.read(cached_path(
    '/home/andrew/.Bella/Datasets/restaurants train'))
validation_dataset = reader.read(cached_path(
    '/home/andrew/.Bella/Datasets/restaurants dev'))
target = train_dataset[0].fields['target']
text = train_dataset[0].fields['text']
label = train_dataset[0].fields['label']

vocab = Vocabulary.from_instances(train_dataset + validation_dataset)
WORD_EMBEDDING_DIM = 50
CHAR_EMBEDDING_DIM = 5
CHAR_WORD_DIM = 30
HIDDEN_DIM = 50


#char_embedding = Embedding(num_embeddings=vocab.get_vocab_size("char_id"), 
#                           embedding_dim=CHAR_EMBEDDING_DIM)
#character_cnn = CnnEncoder(embedding_dim=CHAR_EMBEDDING_DIM, num_filters=2, 
#                           output_dim=CHAR_WORD_DIM)
#token_character_encoder = TokenCharactersEncoder(embedding=char_embedding, 
#                                                 encoder=character_cnn)

#word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding,
#                                          "chars": token_character_encoder})
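
The commented-out lines above sketch a character-level channel; assembled into runnable form (a sketch, with token_embedding as a hypothetical word-level Embedding that the snippet does not define), the combined embedder would look roughly like this:

from allennlp.modules.token_embedders import Embedding, TokenCharactersEncoder
from allennlp.modules.seq2vec_encoders import CnnEncoder
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder

# hypothetical word-level embedding over the 'tokens_id' namespace
token_embedding = Embedding(num_embeddings=vocab.get_vocab_size("tokens_id"),
                            embedding_dim=WORD_EMBEDDING_DIM)

char_embedding = Embedding(num_embeddings=vocab.get_vocab_size("char_id"),
                           embedding_dim=CHAR_EMBEDDING_DIM)
character_cnn = CnnEncoder(embedding_dim=CHAR_EMBEDDING_DIM, num_filters=2,
                           output_dim=CHAR_WORD_DIM)
token_character_encoder = TokenCharactersEncoder(embedding=char_embedding,
                                                 encoder=character_cnn)

word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding,
                                          "chars": token_character_encoder})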
def generate_vocab():
    note_list = ["A", "B", "C", "D", "E", "F", "G"]
    accidental_list = ["", "b", "#"]
    chord_type_list = [
        "", "m", "+", "o", "7", "m7", "M7", "o7", "%7", "+7", "It6", "Ger6",
        "Fr6"
    ]

    vocab = Vocabulary()
    for ns in ["tokens", "token_in", "token_out"]:
        for chord in itertools.product(note_list, accidental_list,
                                       chord_type_list):
            vocab.add_token_to_namespace("".join(chord), namespace=ns)

        vocab.add_token_to_namespace(START_SYMBOL, namespace=ns)
        vocab.add_token_to_namespace(END_SYMBOL, namespace=ns)

    key_list = [
        "".join(x) for x in itertools.product(note_list, accidental_list)
    ]
    form_list = ["m", "+", "o", "M", "%", "It", "Ger", "Fr"]
    figbass_list = ["7", "6"]
    for char in (key_list + form_list + figbass_list):
        vocab.add_token_to_namespace(char, namespace="token_characters")

    note_number_list = [str(x) for x in range(12)]
    for note_number in note_number_list:
        vocab.add_token_to_namespace(note_number, namespace="notes")

    vocab.save_to_files("data/vocabulary")
def run(args):
    ALL_DATASET_PATHS = get_all_dataset_paths(args.dataset_paths_file,
                                              args.dataset_path_prefix)
    SELECTED_TASK_NAMES = args.task
    PROJECTION_DIM = args.proj_dim
    HIDDEN_DIM = args.hidden_dim
    # BIDIRECTIONAL=True
    # INTERMEDIATE_INPUT=2*HIDDEN_DIM if BIDIRECTIONAL else HIDDEN_DIM
    DROPOUT = args.dropout
    LR = args.lr
    WEIGHT_DECAY = args.weight_decay
    BATCH_SIZE = args.batch_size
    NUM_EPOCHS = args.epochs
    PATIENCE = args.patience
    SERIALIZATION_DIR = args.model_dir
    CLEAN_MODEL_DIR = args.clean_model_dir
    CUDA_DEVICE = cuda_device(args.cuda)
    TEST_MODE = args.test_mode
    # device = torch.device(f"cuda:{CUDA_DEVICE}" if torch.cuda.is_available() and args.cuda else "cpu")

    TASKS = [TASK_CONFIGS[task_name] for task_name in SELECTED_TASK_NAMES]
    dataset_paths = {
        task_name: ALL_DATASET_PATHS[task_name]
        for task_name in SELECTED_TASK_NAMES
    }

    tag_namespace_hashing_fn = {
        tag_namespace: i
        for i, tag_namespace in enumerate(TASK_CONFIGS.keys())
    }.get

    elmo_token_indexer = ELMoTokenCharactersIndexer()
    token_indexers = {"tokens": elmo_token_indexer}
    readers = {
        task.tag_namespace: JSONDatasetReader(
            task.tag_namespace,
            token_indexers=token_indexers,
            tag_namespace_hashing_fn=tag_namespace_hashing_fn,
        )
        for task in TASKS
    }

    elmo_embedder = ElmoTokenEmbedder(
        options_file,
        weight_file,
        requires_grad=False,
        dropout=DROPOUT,
        projection_dim=PROJECTION_DIM,
    )
    # elmo_embedder = Elmo(options_file, weight_file, num_output_representations=3)

    # Pass in the ElmoTokenEmbedder instance instead
    word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})
    ELMO_EMBEDDING_DIM = elmo_embedder.get_output_dim()

    # POS -> CHUNK -> NER
    task_suffixes = set(
        [task_name.rsplit("_", 1)[-1] for task_name in SELECTED_TASK_NAMES])
    encoders = get_task_encoder_dict(args, task_suffixes, ELMO_EMBEDDING_DIM)

    if not TEST_MODE:
        train_dataset = read_datasets(dataset_paths,
                                      readers,
                                      data_split="train")
        validation_dataset = read_datasets(dataset_paths,
                                           readers,
                                           data_split="dev")
        vocab = create_classification_vocab(
            [train_dataset, validation_dataset])
    else:
        vocab = Vocabulary.from_files(
            os.path.join(SERIALIZATION_DIR, "vocabulary"))

    # encoder = PassThroughEncoder(ELMO_EMBEDDING_DIM)
    model = MultiTaskClassifier(word_embeddings, encoders, vocab, TASKS)
    model = model.cuda(device=CUDA_DEVICE)

    if not TEST_MODE:
        iterator = CustomHomogeneousBatchIterator(partition_key="dataset",
                                                  batch_size=BATCH_SIZE,
                                                  cache_instances=True)
        iterator.index_with(vocab)

        if CLEAN_MODEL_DIR:
            if os.path.exists(SERIALIZATION_DIR):
                logger.info(f"Deleting {SERIALIZATION_DIR}")
                shutil.rmtree(SERIALIZATION_DIR)
            logger.info(f"Creating {SERIALIZATION_DIR}")
            os.makedirs(SERIALIZATION_DIR)

        logger.info(
            f"Writing arguments to arguments.json in {SERIALIZATION_DIR}")
        with open(os.path.join(SERIALIZATION_DIR, "arguments.json"),
                  "w+") as fp:
            json.dump(vars(args), fp, indent=2)

        logger.info(f"Writing vocabulary in {SERIALIZATION_DIR}")
        vocab.save_to_files(os.path.join(SERIALIZATION_DIR, "vocabulary"))
        # Use list to ensure each epoch is a full pass through the data
        combined_training_dataset = list(
            roundrobin_iterator(*train_dataset.values()))
        combined_validation_dataset = list(
            roundrobin_iterator(*validation_dataset.values()))

        # optimizer = optim.ASGD(model.parameters(), lr=0.01, t0=100, weight_decay=0.1)
        optimizer = optim.Adam(model.parameters(),
                               lr=LR,
                               weight_decay=WEIGHT_DECAY)

        training_stats = []
        trainer = Trainer(
            model=model,
            optimizer=optimizer,
            iterator=iterator,
            train_dataset=combined_training_dataset,
            validation_dataset=combined_validation_dataset,
            patience=PATIENCE,
            num_epochs=NUM_EPOCHS,
            cuda_device=CUDA_DEVICE,
            serialization_dir=SERIALIZATION_DIR,
            # model_save_interval=600
        )
        stats = trainer.train()
        training_stats.append(stats)

        with open(os.path.join(SERIALIZATION_DIR, "training_stats.json"),
                  "w+") as fp:
            json.dump(training_stats, fp, indent=2)
    else:
        model.load_state_dict(
            torch.load(os.path.join(SERIALIZATION_DIR, "best.th")))
        model = model.cuda(device=CUDA_DEVICE)

    # Empty cache to ensure larger batch can be loaded for testing
    torch.cuda.empty_cache()

    test_filepaths = {
        task.tag_namespace: dataset_paths[task.tag_namespace]["test"]
        for task in TASKS
    }

    logger.info("Evaluating on test data")

    test_iterator = CustomHomogeneousBatchIterator(partition_key="dataset",
                                                   batch_size=BATCH_SIZE * 2)
    test_iterator.index_with(vocab)
    model = model.eval()
    test_stats = evaluate_multiple_data(model,
                                        readers,
                                        test_iterator,
                                        test_filepaths,
                                        cuda_device=CUDA_DEVICE)
    with open(os.path.join(SERIALIZATION_DIR, "test_stats.json"), "w+") as fp:
        json.dump(test_stats, fp, indent=2)
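
The roundrobin_iterator used above when building combined_training_dataset is project-specific; presumably it interleaves instances from the per-task datasets so that one epoch covers all tasks. A minimal sketch of such round-robin interleaving, based on the standard itertools recipe (an assumption about what the helper does, not its actual code):

from itertools import cycle, islice

def roundrobin_iterator_sketch(*iterables):
    # roundrobin_iterator_sketch('ABC', 'D', 'EF') yields A, D, E, B, F, C
    num_active = len(iterables)
    nexts = cycle(iter(it).__next__ for it in iterables)
    while num_active:
        try:
            for nxt in nexts:
                yield nxt()
        except StopIteration:
            # one iterable is exhausted; drop it and keep cycling over the rest
            num_active -= 1
            nexts = cycle(islice(nexts, num_active))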