Code example #1
File: vocabulary_test.py  Project: apmoore1/allennlp
    def test_add_word_to_index_gives_consistent_results(self):
        vocab = Vocabulary()
        initial_vocab_size = vocab.get_vocab_size()
        word_index = vocab.add_token_to_namespace("word")
        assert "word" in vocab.get_index_to_token_vocabulary().values()
        assert vocab.get_token_index("word") == word_index
        assert vocab.get_token_from_index(word_index) == "word"
        assert vocab.get_vocab_size() == initial_vocab_size + 1

        # Now add it again, and make sure nothing changes.
        vocab.add_token_to_namespace("word")
        assert "word" in vocab.get_index_to_token_vocabulary().values()
        assert vocab.get_token_index("word") == word_index
        assert vocab.get_token_from_index(word_index) == "word"
        assert vocab.get_vocab_size() == initial_vocab_size + 1
Code example #2
File: vocabulary_test.py  Project: apmoore1/allennlp
    def test_from_params(self):
        # Save a vocab to check we can load it from_params.
        vocab_dir = self.TEST_DIR / 'vocab_save'
        vocab = Vocabulary(non_padded_namespaces=["a", "c"])
        vocab.add_token_to_namespace("a0", namespace="a")  # non-padded, should start at 0
        vocab.add_token_to_namespace("a1", namespace="a")
        vocab.add_token_to_namespace("a2", namespace="a")
        vocab.add_token_to_namespace("b2", namespace="b")  # padded, should start at 2
        vocab.add_token_to_namespace("b3", namespace="b")
        vocab.save_to_files(vocab_dir)

        params = Params({"directory_path": vocab_dir})
        vocab2 = Vocabulary.from_params(params)
        assert vocab.get_index_to_token_vocabulary("a") == vocab2.get_index_to_token_vocabulary("a")
        assert vocab.get_index_to_token_vocabulary("b") == vocab2.get_index_to_token_vocabulary("b")

        # Test case where we build a vocab from a dataset.
        vocab2 = Vocabulary.from_params(Params({}), self.dataset)
        assert vocab2.get_index_to_token_vocabulary("tokens") == {0: '@@PADDING@@',
                                                                  1: '@@UNKNOWN@@',
                                                                  2: 'a', 3: 'c', 4: 'b'}
        # Test from_params raises when we have neither a dataset nor a vocab_directory.
        with pytest.raises(ConfigurationError):
            _ = Vocabulary.from_params(Params({}))

        # Test from_params raises when there are any other dict keys
        # present apart from 'directory_path' and we aren't calling from_dataset.
        with pytest.raises(ConfigurationError):
            _ = Vocabulary.from_params(Params({"directory_path": vocab_dir, "min_count": {'tokens': 2}}))
Code example #3
    def __init__(self,
                 vocabulary: Vocabulary,
                 tag_namespace: str = "tags",
                 ignore_classes: List[str] = None) -> None:
        """
        Parameters
        ----------
        vocabulary : ``Vocabulary``, required.
            A vocabulary containing the tag namespace.
        tag_namespace : str, required.
            This metric assumes that a BIO format is used in which the
            labels are of the format: ["B-LABEL", "I-LABEL"].
        ignore_classes : List[str], optional.
            Span labels which will be ignored when computing span metrics.
            A "span label" is the part that comes after the BIO label, so it
            would be "ARG1" for the tag "B-ARG1". For example by passing:

             ``ignore_classes=["V"]``
            the following sequence would not consider the "V" span at index (2, 3)
            when computing the precision, recall and F1 metrics.

            ["O", "O", "B-V", "I-V", "B-ARG1", "I-ARG1"]

            This is helpful for instance, to avoid computing metrics for "V"
            spans in a BIO tagging scheme which are typically not included.
        """
        self._label_vocabulary = vocabulary.get_index_to_token_vocabulary(tag_namespace)
        self._ignore_classes: List[str] = ignore_classes or []

        # These will hold per label span counts.
        self._true_positives: Dict[str, int] = defaultdict(int)
        self._false_positives: Dict[str, int] = defaultdict(int)
        self._false_negatives: Dict[str, int] = defaultdict(int)
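
The ``ignore_classes`` docstring above is easiest to see on the tag sequence it mentions. The helper below is a hypothetical, self-contained sketch of that filtering, not the metric's internal implementation: it extracts BIO spans and drops the span labels being ignored.

from typing import List, Set, Tuple

def bio_spans(tags: List[str], ignore_classes: Set[str]) -> List[Tuple[str, Tuple[int, int]]]:
    """Extract (label, (start, end)) spans from a BIO sequence, skipping ignored span labels."""
    spans, start, label = [], None, None
    for i, tag in enumerate(tags + ["O"]):  # sentinel "O" closes any span still open at the end
        if tag.startswith("B-") or tag == "O" or (tag.startswith("I-") and tag[2:] != label):
            if label is not None and label not in ignore_classes:
                spans.append((label, (start, i - 1)))
            start, label = (i, tag[2:]) if tag.startswith("B-") else (None, None)
    return spans

print(bio_spans(["O", "O", "B-V", "I-V", "B-ARG1", "I-ARG1"], ignore_classes={"V"}))
# [('ARG1', (4, 5))] -- the "V" span at (2, 3) is ignored, exactly as the docstring describes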
Code example #4
    def __init__(self,
                 vocabulary: Vocabulary,
                 tag_namespace: str = "tags",
                 ignore_classes: List[str] = None,
                 label_encoding: Optional[str] = "BIO",
                 tags_to_spans_function: Optional[TAGS_TO_SPANS_FUNCTION_TYPE] = None) -> None:
        """
        Parameters
        ----------
        vocabulary : ``Vocabulary``, required.
            A vocabulary containing the tag namespace.
        tag_namespace : str, required.
            This metric assumes that a BIO format is used in which the
            labels are of the format: ["B-LABEL", "I-LABEL"].
        ignore_classes : List[str], optional.
            Span labels which will be ignored when computing span metrics.
            A "span label" is the part that comes after the BIO label, so it
            would be "ARG1" for the tag "B-ARG1". For example by passing:

             ``ignore_classes=["V"]``
            the following sequence would not consider the "V" span at index (2, 3)
            when computing the precision, recall and F1 metrics.

            ["O", "O", "B-V", "I-V", "B-ARG1", "I-ARG1"]

            This is helpful for instance, to avoid computing metrics for "V"
            spans in a BIO tagging scheme which are typically not included.
        label_encoding : ``str``, optional (default = "BIO")
            The encoding used to specify label span endpoints in the sequence.
            Valid options are "BIO", "IOB1", "BIOUL" or "BMES".
        tags_to_spans_function: ``Callable``, optional (default = ``None``)
            If ``label_encoding`` is ``None``, ``tags_to_spans_function`` will be
            used to generate spans.
        """
        if label_encoding and tags_to_spans_function:
            raise ConfigurationError(
                    'Both label_encoding and tags_to_spans_function are provided. '
                    'Set "label_encoding=None" explicitly to enable tags_to_spans_function.'
                    )
        if label_encoding:
            if label_encoding not in ["BIO", "IOB1", "BIOUL", "BMES"]:
                raise ConfigurationError("Unknown label encoding - expected 'BIO', 'IOB1', 'BIOUL', 'BMES'.")
        elif tags_to_spans_function is None:
            raise ConfigurationError(
                    'At least one of the (label_encoding, tags_to_spans_function) should be provided.'
                    )

        self._label_encoding = label_encoding
        self._tags_to_spans_function = tags_to_spans_function
        self._label_vocabulary = vocabulary.get_index_to_token_vocabulary(tag_namespace)
        self._ignore_classes: List[str] = ignore_classes or []

        # These will hold per label span counts.
        self._true_positives: Dict[str, int] = defaultdict(int)
        self._false_positives: Dict[str, int] = defaultdict(int)
        self._false_negatives: Dict[str, int] = defaultdict(int)
Code example #5
File: vocabulary_test.py  Project: apmoore1/allennlp
    def test_namespaces(self):
        vocab = Vocabulary()
        initial_vocab_size = vocab.get_vocab_size()
        word_index = vocab.add_token_to_namespace("word", namespace='1')
        assert "word" in vocab.get_index_to_token_vocabulary(namespace='1').values()
        assert vocab.get_token_index("word", namespace='1') == word_index
        assert vocab.get_token_from_index(word_index, namespace='1') == "word"
        assert vocab.get_vocab_size(namespace='1') == initial_vocab_size + 1

        # Now add it again in a different namespace, along with a different word, and make sure
        # it's like new.
        word2_index = vocab.add_token_to_namespace("word2", namespace='2')
        word_index = vocab.add_token_to_namespace("word", namespace='2')
        assert "word" in vocab.get_index_to_token_vocabulary(namespace='2').values()
        assert "word2" in vocab.get_index_to_token_vocabulary(namespace='2').values()
        assert vocab.get_token_index("word", namespace='2') == word_index
        assert vocab.get_token_index("word2", namespace='2') == word2_index
        assert vocab.get_token_from_index(word_index, namespace='2') == "word"
        assert vocab.get_token_from_index(word2_index, namespace='2') == "word2"
        assert vocab.get_vocab_size(namespace='2') == initial_vocab_size + 2
Code example #6
    def test_from_params(self):
        # Save a vocab to check we can load it from_params.
        vocab_dir = self.TEST_DIR / "vocab_save"
        vocab = Vocabulary(non_padded_namespaces=["a", "c"])
        vocab.add_tokens_to_namespace(
            ["a0", "a1", "a2"], namespace="a")  # non-padded, should start at 0
        vocab.add_tokens_to_namespace(
            ["b2", "b3"], namespace="b")  # padded, should start at 2
        vocab.save_to_files(vocab_dir)

        params = Params({"type": "from_files", "directory": vocab_dir})
        vocab2 = Vocabulary.from_params(params)
        assert vocab.get_index_to_token_vocabulary(
            "a") == vocab2.get_index_to_token_vocabulary("a")
        assert vocab.get_index_to_token_vocabulary(
            "b") == vocab2.get_index_to_token_vocabulary("b")

        # Test case where we build a vocab from a dataset.
        vocab2 = Vocabulary.from_params(Params({}), instances=self.dataset)
        assert vocab2.get_index_to_token_vocabulary("tokens") == {
            0: "@@PADDING@@",
            1: "@@UNKNOWN@@",
            2: "a",
            3: "c",
            4: "b",
        }
        # Test from_params raises when we have neither a dataset nor a vocab_directory.
        with pytest.raises(ConfigurationError):
            _ = Vocabulary.from_params(Params({}))

        # Test from_params raises when there are any other dict keys
        # present apart from 'directory' and we aren't calling from_dataset.
        with pytest.raises(ConfigurationError):
            _ = Vocabulary.from_params(
                Params({
                    "type": "from_files",
                    "directory": vocab_dir,
                    "min_count": {
                        "tokens": 2
                    }
                }))
Code example #7
    def test_from_params(self):
        # Save a vocab to check we can load it from_params.
        vocab_dir = os.path.join(self.TEST_DIR, 'vocab_save')
        vocab = Vocabulary(non_padded_namespaces=["a", "c"])
        vocab.add_token_to_namespace(
            "a0", namespace="a")  # non-padded, should start at 0
        vocab.add_token_to_namespace("a1", namespace="a")
        vocab.add_token_to_namespace("a2", namespace="a")
        vocab.add_token_to_namespace(
            "b2", namespace="b")  # padded, should start at 2
        vocab.add_token_to_namespace("b3", namespace="b")
        vocab.save_to_files(vocab_dir)

        params = Params({"directory_path": vocab_dir})
        vocab2 = Vocabulary.from_params(params)
        assert vocab.get_index_to_token_vocabulary(
            "a") == vocab2.get_index_to_token_vocabulary("a")
        assert vocab.get_index_to_token_vocabulary(
            "b") == vocab2.get_index_to_token_vocabulary("b")

        # Test case where we build a vocab from a dataset.
        vocab2 = Vocabulary.from_params(Params({}), self.dataset)
        assert vocab2.get_index_to_token_vocabulary("tokens") == {
            0: '@@PADDING@@',
            1: '@@UNKNOWN@@',
            2: 'a',
            3: 'c',
            4: 'b'
        }
        # Test from_params raises when we have neither a dataset nor a vocab_directory.
        with pytest.raises(ConfigurationError):
            _ = Vocabulary.from_params(Params({}))

        # Test from_params raises when there are any other dict keys
        # present apart from 'directory_path' and we aren't calling from_dataset.
        with pytest.raises(ConfigurationError):
            _ = Vocabulary.from_params(
                Params({
                    "directory_path": vocab_dir,
                    "min_count": 2
                }))
Code example #8
def get_synonyms(token: str, embedding: Model, vocab: Vocabulary, num_synonyms: int = 10):
    """Given a token, return a list of top N most similar words to the token."""
    token_id = vocab.get_token_index(token, 'token_in')
    token_vec = embedding.weight[token_id]
    cosine = CosineSimilarity(dim=0)
    sims = Counter()

    for index, token in vocab.get_index_to_token_vocabulary('token_in').items():
        sim = cosine(token_vec, embedding.weight[index]).item()
        sims[token] = sim

    return sims.most_common(num_synonyms)
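
A minimal usage sketch for ``get_synonyms``. The names ``model.embedding_in`` (the trained input-side embedding layer) and ``vocab`` (the Vocabulary it was built with) are assumptions for illustration and do not come from the snippet above.

# Hypothetical wiring: `model` and `vocab` come from your own skip-gram training run.
embedding = model.embedding_in            # any module exposing a `.weight` tensor works here
for token, similarity in get_synonyms("december", embedding, vocab, num_synonyms=10):
    print(token, round(similarity, 3))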
Code example #9
def inflate_stress_vocabulary(vocabulary: Vocabulary,
                              stress_predictor: StressPredictor):
    vocab = StressVocabulary()
    for index, word in vocabulary.get_index_to_token_vocabulary(
            "tokens").items():
        stresses = [
            Stress(pos, Stress.Type.PRIMARY)
            for pos in stress_predictor.predict(word)
        ]
        word = StressedWord(word, set(stresses))
        vocab.add_word(word, index)
    return vocab
Code example #10
def write_embeddings(embedding: Embedding, file_path, vocab: Vocabulary):
    with open(file_path, mode='w') as f:
        words = vocab.get_index_to_token_vocabulary('token_in').items()
        print(len(words))
        f.write('{} {}\n'.format(
            len(words),
            EMBEDDING_DIM))  # write the number of words and the embedding dimension
        for index, token in words:  # iterate over (index, token) pairs from items()
            values = [
                '{:.10f}'.format(val) for val in embedding.weight[index]
            ]  # format each value with 10 decimal places
            f.write(' '.join([token] + values))
            f.write('\n')
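
This variant writes the plain-text word2vec format: a "<vocab size> <dimension>" header line, then one token and its vector per line. Assuming gensim is installed (an assumption, not part of the snippet) and ``model``/``vocab`` exist as in the project, the file can be read back for a quick sanity check:

write_embeddings(model.embedding_in, "embeddings.txt", vocab)

from gensim.models import KeyedVectors
vectors = KeyedVectors.load_word2vec_format("embeddings.txt", binary=False)
print(vectors.most_similar("december", topn=5))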
Code example #11
 def __init__(self,
              vocabulary: Vocabulary,
              average: str = "macro",
              label_namespace: str = "labels",
              ignore_label: str = None) -> None:
     self._label_vocabulary = vocabulary.get_index_to_token_vocabulary(
         label_namespace)
     self._average = average
     self._ignore_label = ignore_label
     self._true_positives: Dict[str, int] = defaultdict(int)
     self._true_negatives: Dict[str, int] = defaultdict(int)
     self._false_positives: Dict[str, int] = defaultdict(int)
     self._false_negatives: Dict[str, int] = defaultdict(int)
Code example #12
    def __init__(self,
                 vocabulary: Vocabulary,
                 slot_labels: List[str],
                 count_span: bool = False,
                 fine_grained: bool = False):
        self._vocabulary = vocabulary
        self._bio_vocabulary = vocabulary.get_index_to_token_vocabulary(
            "bio_labels")
        self._slot_labels = slot_labels
        self._count_span = count_span
        self._fine_grained = fine_grained

        self.reset()
Code example #13
 def __init__(self, model_path, vocab: Vocabulary):
     super().__init__(vocab)
     self.pretrained_tokenizer = BertForPreTraining.from_pretrained(
         model_path)
     config = BertConfig.from_pretrained(model_path)
     bert_model = BertForPreTraining(config)
     self.bert = bert_model.bert
     tags = vocab.get_index_to_token_vocabulary("tags")
     num_tags = len(tags)
     constraints = allowed_transitions(constraint_type="BMES", labels=tags)
     self.projection = torch.nn.Linear(768, num_tags)
     self.crf = ConditionalRandomField(num_tags=num_tags,
                                       constraints=constraints,
                                       include_start_end_transitions=False)
Code example #14
    def test_namespaces(self):
        vocab = Vocabulary()
        initial_vocab_size = vocab.get_vocab_size()
        word_index = vocab.add_token_to_namespace("word", namespace="1")
        assert "word" in vocab.get_index_to_token_vocabulary(
            namespace="1").values()
        assert vocab.get_token_index("word", namespace="1") == word_index
        assert vocab.get_token_from_index(word_index, namespace="1") == "word"
        assert vocab.get_vocab_size(namespace="1") == initial_vocab_size + 1

        # Now add it again in a different namespace, along with a different word, and make sure
        # it's like new.
        word2_index = vocab.add_token_to_namespace("word2", namespace="2")
        word_index = vocab.add_token_to_namespace("word", namespace="2")
        assert "word" in vocab.get_index_to_token_vocabulary(
            namespace="2").values()
        assert "word2" in vocab.get_index_to_token_vocabulary(
            namespace="2").values()
        assert vocab.get_token_index("word", namespace="2") == word_index
        assert vocab.get_token_index("word2", namespace="2") == word2_index
        assert vocab.get_token_from_index(word_index, namespace="2") == "word"
        assert vocab.get_token_from_index(word2_index,
                                          namespace="2") == "word2"
        assert vocab.get_vocab_size(namespace="2") == initial_vocab_size + 2
Code example #15
File: model.py  Project: hawkeoni/Semeval2020_task11
 def __init__(
     self,
     vocab: Vocabulary,
     embedder: TextFieldEmbedder,
     feature_encoder: SpanClassifier,
     num_classes: int = 14,
 ):
     super().__init__(vocab)
     self.embedder = embedder
     self.feature_encoder = feature_encoder
     self.hidden2tag = torch.nn.Linear(feature_encoder.get_output_dim(),
                                       num_classes)
     self.criterion = torch.nn.BCEWithLogitsLoss()
     self.acc = Accuracy()
     self.f1 = MultilabelMicroF1()
     self.idx2label = vocab.get_index_to_token_vocabulary("labels")
Code example #16
    def __init__(self, word_embeddings: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder, vocab: Vocabulary) -> None:

        super().__init__(vocab)
        self.word_embeddings = word_embeddings
        self.encoder = encoder
        self.vocab = vocab
        self.label_vocab = vocab.get_index_to_token_vocabulary(
            namespace='labels')

        inf_vec = torch.Tensor([float('-inf')] * encoder.get_input_dim())
        self.class_avgs = [
            inf_vec.clone() for i in range(len(self.label_vocab))
        ]

        self.accuracy = CategoricalAccuracy()
        self.f_beta = FBetaMeasure(1.0, None, [0, 1, 2])
Code example #17
    def __init__(self,
                 vocabulary: Vocabulary,
                 tag_namespace: str = "tags",
                 ignore_classes: List[str] = None,
                 label_encoding: str = "BIO") -> None:
        """
        Parameters
        ----------
        vocabulary : ``Vocabulary``, required.
            A vocabulary containing the tag namespace.
        tag_namespace : str, required.
            This metric assumes that a BIO format is used in which the
            labels are of the format: ["B-LABEL", "I-LABEL"].
        ignore_classes : List[str], optional.
            Span labels which will be ignored when computing span metrics.
            A "span label" is the part that comes after the BIO label, so it
            would be "ARG1" for the tag "B-ARG1". For example by passing:

             ``ignore_classes=["V"]``
            the following sequence would not consider the "V" span at index (2, 3)
            when computing the precision, recall and F1 metrics.

            ["O", "O", "B-V", "I-V", "B-ARG1", "I-ARG1"]

            This is helpful for instance, to avoid computing metrics for "V"
            spans in a BIO tagging scheme which are typically not included.
        label_encoding : ``str``, optional (default = "BIO")
            The encoding used to specify label span endpoints in the sequence.
            Valid options are "BIO", "IOB1", or "BIOUL".
        """
        if label_encoding not in ["BIO", "IOB1", "BIOUL"]:
            raise ConfigurationError(
                "Unknown label encoding - expected 'BIO', 'IOB1', 'BIOUL'.")

        self._label_encoding = label_encoding
        self._label_vocabulary = vocabulary.get_index_to_token_vocabulary(
            tag_namespace)
        self._ignore_classes: List[str] = ignore_classes or []

        # These will hold per label span counts.
        self._true_positives: Dict[str, int] = defaultdict(int)
        self._false_positives: Dict[str, int] = defaultdict(int)
        self._false_negatives: Dict[str, int] = defaultdict(int)
Code example #18
def get_related(token: str,
                embedding: Model,
                vocab: Vocabulary,
                num_related: int = 20):
    """Given a token, return a list of top 20 most similar words to the token."""
    token_id = vocab.get_token_index(token, 'token_in')
    token_vec = embedding.weight[
        token_id]  # the row of the embedding weight matrix for this token
    cosine = CosineSimilarity(
        dim=0
    )  # lets us compute plain cosine similarity between two 1-D vectors
    sims = Counter()

    for index, token in vocab.get_index_to_token_vocabulary(
            'token_in').items():
        # Cosine similarity of our token vector with every other word vector in the vocabulary
        sim = cosine(token_vec, embedding.weight[index]).item()
        sims[token] = sim  # store the cosine similarity for this token

    return sims.most_common(num_related)
Code example #19
def create_target_weight():
    vocab = Vocabulary().from_files("data/vocabulary")

    token_weight_list = []
    for index, token in vocab.get_index_to_token_vocabulary().items():
        token_weight = get_target_distribution(token, vocab)
        token_weight_list.append(token_weight)

    weight = torch.stack(token_weight_list)
    s = Score.score
    torch.save(
        weight,
        "data/targets/target_{}{}{}{}{}{}.th".format(
            s["token_name"],
            s["key_name"],
            s["key_number"],
            s["triad_form"],
            s["figbass"],
            s["note_pair"],
        ),
    )
Code example #20
File: vocabulary_test.py  Project: apmoore1/allennlp
    def test_saving_and_loading(self):
        # pylint: disable=protected-access
        vocab_dir = self.TEST_DIR / 'vocab_save'

        vocab = Vocabulary(non_padded_namespaces=["a", "c"])
        vocab.add_token_to_namespace("a0", namespace="a")  # non-padded, should start at 0
        vocab.add_token_to_namespace("a1", namespace="a")
        vocab.add_token_to_namespace("a2", namespace="a")
        vocab.add_token_to_namespace("b2", namespace="b")  # padded, should start at 2
        vocab.add_token_to_namespace("b3", namespace="b")

        vocab.save_to_files(vocab_dir)
        vocab2 = Vocabulary.from_files(vocab_dir)

        assert vocab2._non_padded_namespaces == {"a", "c"}

        # Check namespace a.
        assert vocab2.get_vocab_size(namespace='a') == 3
        assert vocab2.get_token_from_index(0, namespace='a') == 'a0'
        assert vocab2.get_token_from_index(1, namespace='a') == 'a1'
        assert vocab2.get_token_from_index(2, namespace='a') == 'a2'
        assert vocab2.get_token_index('a0', namespace='a') == 0
        assert vocab2.get_token_index('a1', namespace='a') == 1
        assert vocab2.get_token_index('a2', namespace='a') == 2

        # Check namespace b.
        assert vocab2.get_vocab_size(namespace='b') == 4  # (unk + padding + two tokens)
        assert vocab2.get_token_from_index(0, namespace='b') == vocab._padding_token
        assert vocab2.get_token_from_index(1, namespace='b') == vocab._oov_token
        assert vocab2.get_token_from_index(2, namespace='b') == 'b2'
        assert vocab2.get_token_from_index(3, namespace='b') == 'b3'
        assert vocab2.get_token_index(vocab._padding_token, namespace='b') == 0
        assert vocab2.get_token_index(vocab._oov_token, namespace='b') == 1
        assert vocab2.get_token_index('b2', namespace='b') == 2
        assert vocab2.get_token_index('b3', namespace='b') == 3

        # Check the dictionaries containing the reverse mapping are identical.
        assert vocab.get_index_to_token_vocabulary("a") == vocab2.get_index_to_token_vocabulary("a")
        assert vocab.get_index_to_token_vocabulary("b") == vocab2.get_index_to_token_vocabulary("b")
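
For orientation, ``save_to_files`` writes one plain-text token file per namespace plus a ``non_padded_namespaces.txt`` marker, and ``from_files`` reads exactly those files back. A quick way to confirm this, using the same ``vocab_dir`` as in the test (shown only as a sketch):

import os
print(sorted(os.listdir(vocab_dir)))   # expected something like ['a.txt', 'b.txt', 'non_padded_namespaces.txt']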
Code example #21
File: vocabulary_test.py  Project: Taekyoon/allennlp
    def test_saving_and_loading(self):
        # pylint: disable=protected-access
        vocab_dir = os.path.join(self.TEST_DIR, 'vocab_save')

        vocab = Vocabulary(non_padded_namespaces=["a", "c"])
        vocab.add_token_to_namespace("a0", namespace="a")  # non-padded, should start at 0
        vocab.add_token_to_namespace("a1", namespace="a")
        vocab.add_token_to_namespace("a2", namespace="a")
        vocab.add_token_to_namespace("b2", namespace="b")  # padded, should start at 2
        vocab.add_token_to_namespace("b3", namespace="b")

        vocab.save_to_files(vocab_dir)
        vocab2 = Vocabulary.from_files(vocab_dir)

        assert vocab2._non_padded_namespaces == ["a", "c"]

        # Check namespace a.
        assert vocab2.get_vocab_size(namespace='a') == 3
        assert vocab2.get_token_from_index(0, namespace='a') == 'a0'
        assert vocab2.get_token_from_index(1, namespace='a') == 'a1'
        assert vocab2.get_token_from_index(2, namespace='a') == 'a2'
        assert vocab2.get_token_index('a0', namespace='a') == 0
        assert vocab2.get_token_index('a1', namespace='a') == 1
        assert vocab2.get_token_index('a2', namespace='a') == 2

        # Check namespace b.
        assert vocab2.get_vocab_size(namespace='b') == 4  # (unk + padding + two tokens)
        assert vocab2.get_token_from_index(0, namespace='b') == vocab._padding_token
        assert vocab2.get_token_from_index(1, namespace='b') == vocab._oov_token
        assert vocab2.get_token_from_index(2, namespace='b') == 'b2'
        assert vocab2.get_token_from_index(3, namespace='b') == 'b3'
        assert vocab2.get_token_index(vocab._padding_token, namespace='b') == 0
        assert vocab2.get_token_index(vocab._oov_token, namespace='b') == 1
        assert vocab2.get_token_index('b2', namespace='b') == 2
        assert vocab2.get_token_index('b3', namespace='b') == 3

        # Check the dictionaries containing the reverse mapping are identical.
        assert vocab.get_index_to_token_vocabulary("a") == vocab2.get_index_to_token_vocabulary("a")
        assert vocab.get_index_to_token_vocabulary("b") == vocab2.get_index_to_token_vocabulary("b")
Code example #22
    def test_saving_and_loading(self):

        vocab_dir = self.TEST_DIR / "vocab_save"

        vocab = Vocabulary(non_padded_namespaces=["a", "c"])
        vocab.add_tokens_to_namespace(
            ["a0", "a1", "a2"], namespace="a"
        )  # non-padded, should start at 0
        vocab.add_tokens_to_namespace(["b2", "b3"], namespace="b")  # padded, should start at 2

        vocab.save_to_files(vocab_dir)
        vocab2 = Vocabulary.from_files(vocab_dir)

        assert vocab2._non_padded_namespaces == {"a", "c"}

        # Check namespace a.
        assert vocab2.get_vocab_size(namespace="a") == 3
        assert vocab2.get_token_from_index(0, namespace="a") == "a0"
        assert vocab2.get_token_from_index(1, namespace="a") == "a1"
        assert vocab2.get_token_from_index(2, namespace="a") == "a2"
        assert vocab2.get_token_index("a0", namespace="a") == 0
        assert vocab2.get_token_index("a1", namespace="a") == 1
        assert vocab2.get_token_index("a2", namespace="a") == 2

        # Check namespace b.
        assert vocab2.get_vocab_size(namespace="b") == 4  # (unk + padding + two tokens)
        assert vocab2.get_token_from_index(0, namespace="b") == vocab._padding_token
        assert vocab2.get_token_from_index(1, namespace="b") == vocab._oov_token
        assert vocab2.get_token_from_index(2, namespace="b") == "b2"
        assert vocab2.get_token_from_index(3, namespace="b") == "b3"
        assert vocab2.get_token_index(vocab._padding_token, namespace="b") == 0
        assert vocab2.get_token_index(vocab._oov_token, namespace="b") == 1
        assert vocab2.get_token_index("b2", namespace="b") == 2
        assert vocab2.get_token_index("b3", namespace="b") == 3

        # Check the dictionaries containing the reverse mapping are identical.
        assert vocab.get_index_to_token_vocabulary("a") == vocab2.get_index_to_token_vocabulary("a")
        assert vocab.get_index_to_token_vocabulary("b") == vocab2.get_index_to_token_vocabulary("b")
Code example #23
    def __init__(self,
                 vocab: Vocabulary,
                 embedder: TextFieldEmbedder,
                 message_encoder: Seq2VecEncoder,
                 conversation_encoder: Seq2SeqEncoder,
                 dropout: float = 0.5,
                 pos_weight: float = None,
                 use_game_scores: bool = False) -> None:
        super().__init__(vocab)

        self._embedder = embedder
        self._message_encoder = message_encoder
        self._conversation_encoder = conversation_encoder
        self._use_game_scores = use_game_scores

        output_dim = conversation_encoder.get_output_dim() + int(self._use_game_scores)

        self._classifier = nn.Linear(in_features=output_dim,
                                     out_features=vocab.get_vocab_size('labels'))
        self._dropout = nn.Dropout(dropout)

        self._label_index_to_token = vocab.get_index_to_token_vocabulary(namespace="labels")
        self._num_labels = len(self._label_index_to_token)
        print(self._label_index_to_token)
        index_list = list(range(self._num_labels))
        print(index_list)
        self._f1 = FBetaMeasure(average=None, labels=index_list)
        self._f1_micro = FBetaMeasure(average='micro')
        self._f1_macro = FBetaMeasure(average='macro')

        if pos_weight is None or pos_weight <= 0:
            labels_counter = self.vocab._retained_counter['labels']
            self._pos_weight = 1. * labels_counter['True'] / labels_counter['False']
            # self._pos_weight = 15.886736214605067
            print('Computing Pos weight from labels:', self._pos_weight)
        else:
            self._pos_weight = float(pos_weight)
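
The constructor stores ``self._pos_weight`` but the snippet ends before it is used; presumably it weights the positive class in a binary cross-entropy loss. A standalone sketch of that assumption (not the project's code):

import torch

pos_weight = torch.tensor([15.9])                      # stands in for the computed self._pos_weight
loss_fn = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight)
logits = torch.randn(4)                                # dummy per-example scores
labels = torch.tensor([1., 0., 0., 1.])
loss = loss_fn(logits, labels)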
Code example #24
    def __init__(self,
                 vocabulary: Vocabulary,
                 namespace: str = "intent_labels",
                 ignore_classes: List[str] = None,
                 coarse: bool = True) -> None:
        """
        Parameters
        ----------
        vocabulary : ``Vocabulary``, required.
            A vocabulary containing the label namespace.
        namespace : str, required.
            The vocabulary namespace for labels.
        ignore_classes : List[str], optional.
            Labels which will be ignored when computing metrics.
        """
        self._label_vocabulary = vocabulary.get_index_to_token_vocabulary(
            namespace)
        self._ignore_classes: List[str] = ignore_classes or []
        self._coarse = coarse

        # These will hold per label span counts.
        self._true_positives: Dict[str, int] = defaultdict(int)
        self._false_positives: Dict[str, int] = defaultdict(int)
        self._false_negatives: Dict[str, int] = defaultdict(int)
Code example #25
 def __init__(self, window_size=5, lazy=False, vocab: Vocabulary = None):
     """A DatasetReader for reading a plain text corpus and producing instances
     for the SkipGram model.
     When vocab is not None, this runs sub-sampling of frequent words as described
     in (Mikolov et al. 2013).
     """
     super().__init__(lazy=lazy)
     self.window_size = window_size
     self.reject_probs = None
     if vocab:
         self.reject_probs = {}
         threshold = 1.e-3
         token_counts = vocab._retained_counter['token_in']  # HACK
         total_counts = sum(token_counts.values())
         for _, token in vocab.get_index_to_token_vocabulary(
                 'token_in').items():
             counts = token_counts[token]
             if counts > 0:
                 normalized_counts = counts / total_counts
                 reject_prob = 1. - math.sqrt(threshold / normalized_counts)
                 reject_prob = max(0., reject_prob)
             else:
                 reject_prob = 0.
             self.reject_probs[token] = reject_prob
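
The constructor above only precomputes the rejection probabilities; a reader would then drop a token with probability ``reject_probs[token]`` while producing instances, which is the sub-sampling step from Mikolov et al. (2013). A hedged sketch of that consumption step, not taken from the snippet:

import random

def keep_token(token: str, reject_probs: dict) -> bool:
    """Return True if the token survives frequency-based sub-sampling."""
    if not reject_probs:                      # vocab was None: keep everything
        return True
    return random.random() >= reject_probs.get(token, 0.0)

tokens = ["the", "the", "quick", "brown", "fox"]
kept = [t for t in tokens if keep_token(t, {"the": 0.8})]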
Code example #26
def write_embeddings(embedding: Embedding, file_path, vocab: Vocabulary):
    with open(file_path, mode='w') as f:
        for index, token in vocab.get_index_to_token_vocabulary('token_in').items():
            values = ['{:.5f}'.format(val) for val in embedding.weight[index]]
            f.write(' '.join([token] + values))
            f.write('\n')
Code example #27
    def __init__(self,
                 vocab: Vocabulary,
                 bert_embedder: Optional[PretrainedBertEmbedder] = None,
                 encoder: Optional[Seq2SeqEncoder] = None,
                 dropout: Optional[float] = None,
                 use_crf: bool = True,
                 add_random_noise: bool = False,
                 add_attack_noise: bool = False,
                 do_noise_normalization: bool = True,
                 noise_norm: Optional[float] = None,
                 noise_loss_prob: Optional[float] = None,
                 add_noise_for: str = "ov",
                 rnn_after_embeddings: bool = False,
                 open_vocabulary_slots: Optional[List[str]] = None,
                 metrics_for_each_slot_type: bool = False) -> None:
        """
        Params
        ------
        vocab: the allennlp Vocabulary object, will be automatically passed
        bert_embedder: the pretrained BERT embedder. If it is not None, the pretrained BERT
                embedding (with fixed parameters) will be used as the embedding layer. Otherwise, a look-up
                embedding matrix will be initialized with embedding size 1024. The default is None.
        encoder: the contextual encoder used after the embedding layer. If set to None, no contextual
                encoder will be used.
        dropout: the dropout rate; not set in all of our experiments.
        use_crf: if set to True, a CRF will be used at the end of the model (as the output layer). Otherwise,
                a softmax layer (with cross-entropy loss) will be used.
        add_random_noise: whether to add random noise to slots. Cannot be set simultaneously
                with add_attack_noise. This setting is used as a baseline in our experiments.
        add_attack_noise: whether to add adversarial attack noise to slots. Cannot be set simultaneously
                with add_random_noise.
        do_noise_normalization: if set to True, the normalization will be applied to gradients w.r.t. 
                token embeddings. Otherwise, the gradients won't be normalized.
        noise_norm: the normalization norm (L2) applied to gradients.
        noise_loss_prob: the alpha hyperparameter that balances the losses from the normal and adversarial
                forward passes. See the paper for more details. Should be between 0 and 1.
        add_noise_for: if set to ov, the noise will only be applied to open-vocabulary slots. Otherwise,
                the noise will be applied to all slots (both open-vocabulary and normal slots).
        rnn_after_embeddings: if set to True, an additional BiLSTM layer will be applied after the embedding
                layer. Default is False.
        open_vocabulary_slots: the list of open-vocabulary slots. If not set, will be set to open-vocabulary
                slots of Snips dataset by default.
        metrics_for_each_slot_type: whether to log metrics for each slot type. Default is False.
        """
        super().__init__(vocab)

        if bert_embedder:
            self.use_bert = True
            self.bert_embedder = bert_embedder
        else:
            self.use_bert = False
            self.basic_embedder = BasicTextFieldEmbedder({
                "tokens":
                Embedding(vocab.get_vocab_size(namespace="tokens"), 1024)
            })
            self.rnn_after_embeddings = rnn_after_embeddings
            if rnn_after_embeddings:
                self.rnn = Seq2SeqEncoder.from_params(
                    Params({
                        "type": "lstm",
                        "input_size": 1024,
                        "hidden_size": 512,
                        "bidirectional": True,
                        "batch_first": True
                    }))

        self.encoder = encoder

        if encoder:
            hidden2tag_in_dim = encoder.get_output_dim()
        else:
            hidden2tag_in_dim = bert_embedder.get_output_dim()
        self.hidden2tag = TimeDistributed(
            torch.nn.Linear(in_features=hidden2tag_in_dim,
                            out_features=vocab.get_vocab_size("labels")))

        if dropout:
            self.dropout = torch.nn.Dropout(dropout)
        else:
            self.dropout = None

        self.use_crf = use_crf
        if use_crf:
            crf_constraints = allowed_transitions(
                constraint_type="BIO",
                labels=vocab.get_index_to_token_vocabulary("labels"))
            self.crf = ConditionalRandomField(
                num_tags=vocab.get_vocab_size("labels"),
                constraints=crf_constraints,
                include_start_end_transitions=True)

        # default open_vocabulary slots: for SNIPS dataset
        open_vocabulary_slots = open_vocabulary_slots or [
            "playlist", "entity_name", "poi", "restaurant_name",
            "geographic_poi", "album", "track", "object_name", "movie_name"
        ]
        self.f1 = OVSpecSpanBasedF1Measure(
            vocab,
            tag_namespace="labels",
            ignore_classes=[],
            label_encoding="BIO",
            open_vocabulary_slots=open_vocabulary_slots)

        self.add_random_noise = add_random_noise
        self.add_attack_noise = add_attack_noise
        assert not (add_random_noise and
                    add_attack_noise), "both random and attack noise applied"
        if add_random_noise or add_attack_noise:
            self.do_noise_normalization = do_noise_normalization
            assert noise_norm is not None
            assert noise_loss_prob is not None and 0. <= noise_loss_prob <= 1.
            self.noise_norm = noise_norm
            self.noise_loss_prob = noise_loss_prob
            assert add_noise_for in ["ov", "all"]
            self.ov_noise_only = (add_noise_for == "ov")

        self.metrics_for_each_slot_type = metrics_for_each_slot_type
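
The docstring describes ``noise_loss_prob`` as the alpha that balances the normal and adversarial forward passes, but the snippet ends before ``forward()``. One plausible reading, written purely as an assumption and not taken from the project, is a convex combination of the two losses:

# Hypothetical illustration of the alpha-style mixing described in the docstring.
noise_loss_prob = 0.5
normal_loss, adversarial_loss = 0.42, 0.57             # dummy scalar losses
total_loss = (1 - noise_loss_prob) * normal_loss + noise_loss_prob * adversarial_loss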
Code example #28
File: embedding.py  Project: ydwisroad/competitions
def _read_embeddings_from_text_file(
        file_uri: str,
        embedding_dim: int,
        vocab: Vocabulary,
        namespace: str = "tokens") -> torch.FloatTensor:
    """
    Read pre-trained word vectors from an optionally compressed text file, possibly contained
    inside an archive with multiple files. The text file is assumed to be utf-8 encoded with
    space-separated fields: [word] [dim 1] [dim 2] ...

    Lines that contain more numerical tokens than `embedding_dim` raise a warning and are skipped.

    The remainder of the docstring is identical to `_read_pretrained_embeddings_file`.
    """
    tokens_to_keep = set(
        vocab.get_index_to_token_vocabulary(namespace).values())
    vocab_size = vocab.get_vocab_size(namespace)
    embeddings = {}

    # First we read the embeddings from the file, only keeping vectors for the words we need.
    logger.info("Reading pretrained embeddings from file")

    with EmbeddingsTextFile(file_uri) as embeddings_file:
        for line in Tqdm.tqdm(embeddings_file):
            token = line.split(" ", 1)[0]
            if token in tokens_to_keep:
                fields = line.rstrip().split(" ")
                if len(fields) - 1 != embedding_dim:
                    # Sometimes there are funny unicode parsing problems that lead to different
                    # fields lengths (e.g., a word with a unicode space character that splits
                    # into more than one column).  We skip those lines.  Note that if you have
                    # some kind of long header, this could result in all of your lines getting
                    # skipped.  It's hard to check for that here; you just have to look in the
                    # embedding_misses_file and at the model summary to make sure things look
                    # like they are supposed to.
                    logger.warning(
                        "Found line with wrong number of dimensions (expected: %d; actual: %d): %s",
                        embedding_dim,
                        len(fields) - 1,
                        line,
                    )
                    continue

                vector = numpy.asarray(fields[1:], dtype="float32")
                embeddings[token] = vector

    if not embeddings:
        raise ConfigurationError(
            "No embeddings of correct dimension found; you probably "
            "misspecified your embedding_dim parameter, or didn't "
            "pre-populate your Vocabulary")

    all_embeddings = numpy.asarray(list(embeddings.values()))
    embeddings_mean = float(numpy.mean(all_embeddings))
    embeddings_std = float(numpy.std(all_embeddings))
    # Now we initialize the weight matrix for an embedding layer, starting with random vectors,
    # then filling in the word vectors we just read.
    logger.info("Initializing pre-trained embedding layer")
    embedding_matrix = torch.FloatTensor(vocab_size, embedding_dim).normal_(
        embeddings_mean, embeddings_std)
    num_tokens_found = 0
    index_to_token = vocab.get_index_to_token_vocabulary(namespace)
    for i in range(vocab_size):
        token = index_to_token[i]

        # If we don't have a pre-trained vector for this word, we'll just leave this row alone,
        # so the word has a random initialization.
        if token in embeddings:
            embedding_matrix[i] = torch.FloatTensor(embeddings[token])
            num_tokens_found += 1
        else:
            logger.debug(
                "Token %s was not found in the embedding file. Initialising randomly.",
                token)

    logger.info("Pretrained embeddings were found for %d out of %d tokens",
                num_tokens_found, vocab_size)

    return embedding_matrix
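
A minimal usage sketch for the reader above: the returned matrix can seed a standard embedding layer. The file path, dimension, and ``vocab`` are placeholders, and ``torch.nn.Embedding.from_pretrained`` is used here instead of AllenNLP's own ``Embedding`` wrapper purely for illustration.

import torch

weight = _read_embeddings_from_text_file("glove.6B.100d.txt", embedding_dim=100, vocab=vocab)
embedding_layer = torch.nn.Embedding.from_pretrained(weight, freeze=False, padding_idx=0)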
Code example #29
    def __init__(
        self,
        vocabulary: Vocabulary,
        tag_namespace: str = "tags",
        ignore_classes: List[str] = None,
        label_encoding: Optional[str] = "BMESO",
        tags_to_spans_function: Optional[TAGS_TO_SPANS_FUNCTION_TYPE] = None
    ) -> None:
        """
        Parameters
        ----------
        vocabulary : ``Vocabulary``, required.
            A vocabulary containing the tag namespace.
        tag_namespace : str, required.
            This metric assumes that a BIO format is used in which the
            labels are of the format: ["B-LABEL", "I-LABEL"].
        ignore_classes : List[str], optional.
            Span labels which will be ignored when computing span metrics.
            A "span label" is the part that comes after the BIO label, so it
            would be "ARG1" for the tag "B-ARG1". For example by passing:

             ``ignore_classes=["V"]``
            the following sequence would not consider the "V" span at index (2, 3)
            when computing the precision, recall and F1 metrics.

            ["O", "O", "B-V", "I-V", "B-ARG1", "I-ARG1"]

            This is helpful for instance, to avoid computing metrics for "V"
            spans in a BIO tagging scheme which are typically not included.
        label_encoding : ``str``, optional (default = "BMESO")
            The encoding used to specify label span endpoints in the sequence.
            Valid options are "BIO", "IOB1", "BIOUL", "BMES" or "BMESO".
        tags_to_spans_function: ``Callable``, optional (default = ``None``)
            If ``label_encoding`` is ``None``, ``tags_to_spans_function`` will be
            used to generate spans.
        """
        if label_encoding and tags_to_spans_function:
            raise ConfigurationError(
                'Both label_encoding and tags_to_spans_function are provided. '
                'Set "label_encoding=None" explicitly to enable tags_to_spans_function.'
            )
        if label_encoding:
            if label_encoding not in ["BIO", "IOB1", "BIOUL", "BMES", "BMESO"]:
                raise ConfigurationError(
                    "Unknown label encoding - expected 'BIO', 'IOB1', 'BIOUL', 'BMES', 'BMESO'."
                )
        elif tags_to_spans_function is None:
            raise ConfigurationError(
                'At least one of the (label_encoding, tags_to_spans_function) should be provided.'
            )

        self._label_encoding = label_encoding
        self._tags_to_spans_function = tags_to_spans_function
        self._label_vocabulary = vocabulary.get_index_to_token_vocabulary(
            tag_namespace)
        self._ignore_classes: List[str] = ignore_classes or []

        # These will hold per label span counts.
        self._true_positives: Dict[str, int] = defaultdict(int)
        self._false_positives: Dict[str, int] = defaultdict(int)
        self._false_negatives: Dict[str, int] = defaultdict(int)
Code example #30
    def __init__(self,
                 vocabulary: Vocabulary,
                 tag_namespace: str = "tags",
                 ignore_classes: List[str] = None,
                 ontology_path: str = None) -> None:
        """
        Parameters
        ----------
        vocabulary : ``Vocabulary``, required.
            A vocabulary containing the tag namespace.
        tag_namespace : str, required.
            This metric assumes that a BIO format is used in which the
            labels are of the format: ["B-LABEL", "I-LABEL"].
        ignore_classes : List[str], optional.
            Span labels which will be ignored when computing span metrics.
            A "span label" is the part that comes after the BIO label, so it
            would be "ARG1" for the tag "B-ARG1". For example by passing:

             ``ignore_classes=["V"]``
            the following sequence would not consider the "V" span at index (2, 3)
            when computing the precision, recall and F1 metrics.

            ["O", "O", "B-V", "I-V", "B-ARG1", "I-ARG1"]

            This is helpful for instance, to avoid computing metrics for "V"
            spans in a BIO tagging scheme which are typically not included.
        """
        self._label_vocabulary = vocabulary.get_index_to_token_vocabulary(
            tag_namespace)
        self._ignore_classes = ignore_classes or []
        self.num_classes = vocabulary.get_vocab_size(tag_namespace)

        if ontology_path is not None:
            self._ontology = FrameOntology(ontology_path)

        # These will hold per label span counts.
        self._true_positives: Dict[str, int] = defaultdict(int)
        self._false_positives: Dict[str, int] = defaultdict(int)
        self._false_negatives: Dict[str, int] = defaultdict(int)

        # These will hold unlabeled span counts.
        self._unlabeled_true_positives: int = 0
        self._unlabeled_false_positives: int = 0
        self._unlabeled_false_negatives: int = 0

        # These will hold partial match counts.
        self._partial_true_positives: int = 0
        self._partial_false_positives: int = 0
        self._partial_false_negatives: int = 0

        # These will hold width-wise span counts.
        self._width_tp: Dict[int, int] = defaultdict(int)
        self._width_fp: Dict[int, int] = defaultdict(int)
        self._width_fn: Dict[int, int] = defaultdict(int)

        # These will hold width-wise span counts.
        self._dist_tp: Dict[int, int] = defaultdict(int)
        self._dist_fp: Dict[int, int] = defaultdict(int)
        self._dist_fn: Dict[int, int] = defaultdict(int)

        self._gold_spans: List[Set[Tuple[int, int, str]]] = []
        self._predicted_spans: List[Set[Tuple[int, int, str]]] = []