def test_add_word_to_index_gives_consistent_results(self):
    vocab = Vocabulary()
    initial_vocab_size = vocab.get_vocab_size()
    word_index = vocab.add_token_to_namespace("word")
    assert "word" in vocab.get_index_to_token_vocabulary().values()
    assert vocab.get_token_index("word") == word_index
    assert vocab.get_token_from_index(word_index) == "word"
    assert vocab.get_vocab_size() == initial_vocab_size + 1

    # Now add it again, and make sure nothing changes.
    vocab.add_token_to_namespace("word")
    assert "word" in vocab.get_index_to_token_vocabulary().values()
    assert vocab.get_token_index("word") == word_index
    assert vocab.get_token_from_index(word_index) == "word"
    assert vocab.get_vocab_size() == initial_vocab_size + 1
def test_from_params(self):
    # Save a vocab to check we can load it from_params.
    vocab_dir = self.TEST_DIR / 'vocab_save'
    vocab = Vocabulary(non_padded_namespaces=["a", "c"])
    vocab.add_token_to_namespace("a0", namespace="a")  # non-padded, should start at 0
    vocab.add_token_to_namespace("a1", namespace="a")
    vocab.add_token_to_namespace("a2", namespace="a")
    vocab.add_token_to_namespace("b2", namespace="b")  # padded, should start at 2
    vocab.add_token_to_namespace("b3", namespace="b")
    vocab.save_to_files(vocab_dir)

    params = Params({"directory_path": vocab_dir})
    vocab2 = Vocabulary.from_params(params)
    assert vocab.get_index_to_token_vocabulary("a") == vocab2.get_index_to_token_vocabulary("a")
    assert vocab.get_index_to_token_vocabulary("b") == vocab2.get_index_to_token_vocabulary("b")

    # Test case where we build a vocab from a dataset.
    vocab2 = Vocabulary.from_params(Params({}), self.dataset)
    assert vocab2.get_index_to_token_vocabulary("tokens") == {0: '@@PADDING@@',
                                                              1: '@@UNKNOWN@@',
                                                              2: 'a', 3: 'c', 4: 'b'}

    # Test from_params raises when we have neither a dataset nor a vocab directory.
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(Params({}))

    # Test from_params raises when there are any other dict keys
    # present apart from 'directory_path' and we aren't calling from_dataset.
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(Params({"directory_path": vocab_dir,
                                           "min_count": {'tokens': 2}}))
def __init__(self,
             vocabulary: Vocabulary,
             tag_namespace: str = "tags",
             ignore_classes: List[str] = None) -> None:
    """
    Parameters
    ----------
    vocabulary : ``Vocabulary``, required.
        A vocabulary containing the tag namespace.
    tag_namespace : ``str``, optional (default = "tags")
        This metric assumes that a BIO format is used in which the
        labels are of the format: ["B-LABEL", "I-LABEL"].
    ignore_classes : ``List[str]``, optional.
        Span labels which will be ignored when computing span metrics.
        A "span label" is the part that comes after the BIO label, so it
        would be "ARG1" for the tag "B-ARG1". For example, by passing
        ``ignore_classes=["V"]``, the following sequence would not consider
        the "V" span at index (2, 3) when computing the precision, recall
        and F1 metrics:

        ["O", "O", "B-V", "I-V", "B-ARG1", "I-ARG1"]

        This is helpful, for instance, to avoid computing metrics for "V"
        spans in a BIO tagging scheme, which are typically not included.
    """
    self._label_vocabulary = vocabulary.get_index_to_token_vocabulary(tag_namespace)
    self._ignore_classes: List[str] = ignore_classes or []

    # These will hold per label span counts.
    self._true_positives: Dict[str, int] = defaultdict(int)
    self._false_positives: Dict[str, int] = defaultdict(int)
    self._false_negatives: Dict[str, int] = defaultdict(int)
def __init__(self,
             vocabulary: Vocabulary,
             tag_namespace: str = "tags",
             ignore_classes: List[str] = None,
             label_encoding: Optional[str] = "BIO",
             tags_to_spans_function: Optional[TAGS_TO_SPANS_FUNCTION_TYPE] = None) -> None:
    """
    Parameters
    ----------
    vocabulary : ``Vocabulary``, required.
        A vocabulary containing the tag namespace.
    tag_namespace : ``str``, optional (default = "tags")
        This metric assumes that a BIO format is used in which the
        labels are of the format: ["B-LABEL", "I-LABEL"].
    ignore_classes : ``List[str]``, optional.
        Span labels which will be ignored when computing span metrics.
        A "span label" is the part that comes after the BIO label, so it
        would be "ARG1" for the tag "B-ARG1". For example, by passing
        ``ignore_classes=["V"]``, the following sequence would not consider
        the "V" span at index (2, 3) when computing the precision, recall
        and F1 metrics:

        ["O", "O", "B-V", "I-V", "B-ARG1", "I-ARG1"]

        This is helpful, for instance, to avoid computing metrics for "V"
        spans in a BIO tagging scheme, which are typically not included.
    label_encoding : ``str``, optional (default = "BIO")
        The encoding used to specify label span endpoints in the sequence.
        Valid options are "BIO", "IOB1", "BIOUL" or "BMES".
    tags_to_spans_function : ``Callable``, optional (default = ``None``)
        If ``label_encoding`` is ``None``, ``tags_to_spans_function`` will be
        used to generate spans.
    """
    if label_encoding and tags_to_spans_function:
        raise ConfigurationError(
            'Both label_encoding and tags_to_spans_function are provided. '
            'Set "label_encoding=None" explicitly to enable tags_to_spans_function.'
        )
    if label_encoding:
        if label_encoding not in ["BIO", "IOB1", "BIOUL", "BMES"]:
            raise ConfigurationError("Unknown label encoding - expected 'BIO', 'IOB1', 'BIOUL', 'BMES'.")
    elif tags_to_spans_function is None:
        raise ConfigurationError(
            'At least one of the (label_encoding, tags_to_spans_function) should be provided.'
        )

    self._label_encoding = label_encoding
    self._tags_to_spans_function = tags_to_spans_function
    self._label_vocabulary = vocabulary.get_index_to_token_vocabulary(tag_namespace)
    self._ignore_classes: List[str] = ignore_classes or []

    # These will hold per label span counts.
    self._true_positives: Dict[str, int] = defaultdict(int)
    self._false_positives: Dict[str, int] = defaultdict(int)
    self._false_negatives: Dict[str, int] = defaultdict(int)
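When label_encoding is None, the caller must supply its own span extractor. A minimal sketch of such a function follows, assuming the AllenNLP convention that a typed span is (label, (start, end)) with inclusive token indices; naive_bio_tags_to_spans and the SpanBasedF1Measure name in the usage comment are illustrative assumptions, since the snippet above only shows the constructor.

from typing import List, Optional, Tuple

def naive_bio_tags_to_spans(tag_sequence: List[str],
                            classes_to_ignore: Optional[List[str]] = None
                            ) -> List[Tuple[str, Tuple[int, int]]]:
    """Naive BIO span extraction; a production version should handle malformed
    sequences (e.g. an "I-" tag with no preceding "B-") explicitly."""
    classes_to_ignore = classes_to_ignore or []
    spans: List[Tuple[str, Tuple[int, int]]] = []
    start, label = None, None
    # Append an "O" sentinel so the final span is flushed.
    for i, tag in enumerate(tag_sequence + ["O"]):
        if tag == "O" or tag.startswith("B-"):
            if label is not None and label not in classes_to_ignore:
                spans.append((label, (start, i - 1)))
            if tag.startswith("B-"):
                start, label = i, tag[2:]
            else:
                start, label = None, None
    return spans

# Hypothetical usage (assuming the constructor above belongs to AllenNLP's
# SpanBasedF1Measure or a subclass):
# metric = SpanBasedF1Measure(vocab, tag_namespace="tags",
#                             label_encoding=None,
#                             tags_to_spans_function=naive_bio_tags_to_spans)

On the docstring's example input ["O", "O", "B-V", "I-V", "B-ARG1", "I-ARG1"], this returns [("V", (2, 3)), ("ARG1", (4, 5))], and passing classes_to_ignore=["V"] drops the first span.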
def test_namespaces(self):
    vocab = Vocabulary()
    initial_vocab_size = vocab.get_vocab_size()
    word_index = vocab.add_token_to_namespace("word", namespace='1')
    assert "word" in vocab.get_index_to_token_vocabulary(namespace='1').values()
    assert vocab.get_token_index("word", namespace='1') == word_index
    assert vocab.get_token_from_index(word_index, namespace='1') == "word"
    assert vocab.get_vocab_size(namespace='1') == initial_vocab_size + 1

    # Now add it again, in a different namespace and a different word, and make sure it's like
    # new.
    word2_index = vocab.add_token_to_namespace("word2", namespace='2')
    word_index = vocab.add_token_to_namespace("word", namespace='2')
    assert "word" in vocab.get_index_to_token_vocabulary(namespace='2').values()
    assert "word2" in vocab.get_index_to_token_vocabulary(namespace='2').values()
    assert vocab.get_token_index("word", namespace='2') == word_index
    assert vocab.get_token_index("word2", namespace='2') == word2_index
    assert vocab.get_token_from_index(word_index, namespace='2') == "word"
    assert vocab.get_token_from_index(word2_index, namespace='2') == "word2"
    assert vocab.get_vocab_size(namespace='2') == initial_vocab_size + 2
def test_from_params(self):
    # Save a vocab to check we can load it from_params.
    vocab_dir = self.TEST_DIR / "vocab_save"
    vocab = Vocabulary(non_padded_namespaces=["a", "c"])
    vocab.add_tokens_to_namespace(["a0", "a1", "a2"], namespace="a")  # non-padded, should start at 0
    vocab.add_tokens_to_namespace(["b2", "b3"], namespace="b")  # padded, should start at 2
    vocab.save_to_files(vocab_dir)

    params = Params({"type": "from_files", "directory": vocab_dir})
    vocab2 = Vocabulary.from_params(params)
    assert vocab.get_index_to_token_vocabulary("a") == vocab2.get_index_to_token_vocabulary("a")
    assert vocab.get_index_to_token_vocabulary("b") == vocab2.get_index_to_token_vocabulary("b")

    # Test case where we build a vocab from a dataset.
    vocab2 = Vocabulary.from_params(Params({}), instances=self.dataset)
    assert vocab2.get_index_to_token_vocabulary("tokens") == {
        0: "@@PADDING@@",
        1: "@@UNKNOWN@@",
        2: "a",
        3: "c",
        4: "b",
    }

    # Test from_params raises when we have neither a dataset nor a vocab directory.
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(Params({}))

    # Test from_params raises when there are any other dict keys
    # present apart from 'directory' and we aren't calling from_dataset.
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(
            Params({"type": "from_files", "directory": vocab_dir, "min_count": {"tokens": 2}})
        )
def test_from_params(self):
    # Save a vocab to check we can load it from_params.
    vocab_dir = os.path.join(self.TEST_DIR, 'vocab_save')
    vocab = Vocabulary(non_padded_namespaces=["a", "c"])
    vocab.add_token_to_namespace("a0", namespace="a")  # non-padded, should start at 0
    vocab.add_token_to_namespace("a1", namespace="a")
    vocab.add_token_to_namespace("a2", namespace="a")
    vocab.add_token_to_namespace("b2", namespace="b")  # padded, should start at 2
    vocab.add_token_to_namespace("b3", namespace="b")
    vocab.save_to_files(vocab_dir)

    params = Params({"directory_path": vocab_dir})
    vocab2 = Vocabulary.from_params(params)
    assert vocab.get_index_to_token_vocabulary("a") == vocab2.get_index_to_token_vocabulary("a")
    assert vocab.get_index_to_token_vocabulary("b") == vocab2.get_index_to_token_vocabulary("b")

    # Test case where we build a vocab from a dataset.
    vocab2 = Vocabulary.from_params(Params({}), self.dataset)
    assert vocab2.get_index_to_token_vocabulary("tokens") == {
        0: '@@PADDING@@',
        1: '@@UNKNOWN@@',
        2: 'a',
        3: 'c',
        4: 'b'
    }

    # Test from_params raises when we have neither a dataset nor a vocab directory.
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(Params({}))

    # Test from_params raises when there are any other dict keys
    # present apart from 'directory_path' and we aren't calling from_dataset.
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(Params({"directory_path": vocab_dir, "min_count": 2}))
def get_synonyms(token: str, embedding: Model, vocab: Vocabulary, num_synonyms: int = 10):
    """Given a token, return a list of the top N most similar words to the token."""
    token_id = vocab.get_token_index(token, 'token_in')
    token_vec = embedding.weight[token_id]
    cosine = CosineSimilarity(dim=0)
    sims = Counter()

    # Iterate under a different name so we don't shadow the `token` argument.
    for index, other_token in vocab.get_index_to_token_vocabulary('token_in').items():
        sim = cosine(token_vec, embedding.weight[index]).item()
        sims[other_token] = sim

    return sims.most_common(num_synonyms)
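A hedged usage sketch for the function above; `model` and its `embedding_in` attribute are hypothetical stand-ins for a trained SkipGram model and its input-side Embedding layer, so adapt the names to your own model.

# Hypothetical names: `model.embedding_in` holds the trained Embedding.
for synonym, similarity in get_synonyms("december", model.embedding_in, vocab):
    print(f"{synonym}\t{similarity:.3f}")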
def inflate_stress_vocabulary(vocabulary: Vocabulary, stress_predictor: StressPredictor):
    vocab = StressVocabulary()
    for index, word in vocabulary.get_index_to_token_vocabulary("tokens").items():
        stresses = [Stress(pos, Stress.Type.PRIMARY)
                    for pos in stress_predictor.predict(word)]
        word = StressedWord(word, set(stresses))
        vocab.add_word(word, index)
    return vocab
def write_embeddings(embedding: Embedding, file_path, vocab: Vocabulary):
    with open(file_path, mode='w') as f:
        words = vocab.get_index_to_token_vocabulary('token_in').items()
        print(len(words))
        # Write the number of words and the embedding dimension (the word2vec
        # text-format header line).
        f.write('{} {}\n'.format(len(words), EMBEDDING_DIM))
        # items() lets us loop over both indices and tokens.
        for index, token in words:
            # Write each value as a number with 10 decimals.
            values = ['{:.10f}'.format(val) for val in embedding.weight[index]]
            f.write(' '.join([token] + values))
            f.write('\n')
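Because of the header line, the output follows the word2vec text format. A minimal usage sketch, where `model.embedding_in` is a hypothetical attribute holding the trained Embedding:

# Hypothetical attribute name; the resulting file can then be loaded with,
# e.g., gensim: KeyedVectors.load_word2vec_format('embeddings.txt').
write_embeddings(model.embedding_in, 'embeddings.txt', vocab)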
def __init__(self,
             vocabulary: Vocabulary,
             average: str = "macro",
             label_namespace: str = "labels",
             ignore_label: str = None) -> None:
    self._label_vocabulary = vocabulary.get_index_to_token_vocabulary(label_namespace)
    self._average = average
    self._ignore_label = ignore_label
    self._true_positives: Dict[str, int] = defaultdict(int)
    self._true_negatives: Dict[str, int] = defaultdict(int)
    self._false_positives: Dict[str, int] = defaultdict(int)
    self._false_negatives: Dict[str, int] = defaultdict(int)
def __init__(self,
             vocabulary: Vocabulary,
             slot_labels: List[str],
             count_span: bool = False,
             fine_grained: bool = False):
    self._vocabulary = vocabulary
    self._bio_vocabulary = vocabulary.get_index_to_token_vocabulary("bio_labels")
    self._slot_labels = slot_labels
    self._count_span = count_span
    self._fine_grained = fine_grained
    self.reset()
def __init__(self, model_path, vocab: Vocabulary):
    super().__init__(vocab)
    self.pretrained_tokenizer = BertForPreTraining.from_pretrained(model_path)
    config = BertConfig.from_pretrained(model_path)
    bert_model = BertForPreTraining(config)
    self.bert = bert_model.bert
    tags = vocab.get_index_to_token_vocabulary("tags")
    num_tags = len(tags)
    constraints = allowed_transitions(constraint_type="BMES", labels=tags)
    self.projection = torch.nn.Linear(768, num_tags)
    self.crf = ConditionalRandomField(num_tags=num_tags,
                                      constraints=constraints,
                                      include_start_end_transitions=False)
def test_namespaces(self):
    vocab = Vocabulary()
    initial_vocab_size = vocab.get_vocab_size()
    word_index = vocab.add_token_to_namespace("word", namespace="1")
    assert "word" in vocab.get_index_to_token_vocabulary(namespace="1").values()
    assert vocab.get_token_index("word", namespace="1") == word_index
    assert vocab.get_token_from_index(word_index, namespace="1") == "word"
    assert vocab.get_vocab_size(namespace="1") == initial_vocab_size + 1

    # Now add it again, in a different namespace and a different word, and make sure it's like
    # new.
    word2_index = vocab.add_token_to_namespace("word2", namespace="2")
    word_index = vocab.add_token_to_namespace("word", namespace="2")
    assert "word" in vocab.get_index_to_token_vocabulary(namespace="2").values()
    assert "word2" in vocab.get_index_to_token_vocabulary(namespace="2").values()
    assert vocab.get_token_index("word", namespace="2") == word_index
    assert vocab.get_token_index("word2", namespace="2") == word2_index
    assert vocab.get_token_from_index(word_index, namespace="2") == "word"
    assert vocab.get_token_from_index(word2_index, namespace="2") == "word2"
    assert vocab.get_vocab_size(namespace="2") == initial_vocab_size + 2
def __init__(
    self,
    vocab: Vocabulary,
    embedder: TextFieldEmbedder,
    feature_encoder: SpanClassifier,
    num_classes: int = 14,
):
    super().__init__(vocab)
    self.embedder = embedder
    self.feature_encoder = feature_encoder
    self.hidden2tag = torch.nn.Linear(feature_encoder.get_output_dim(), num_classes)
    self.criterion = torch.nn.BCEWithLogitsLoss()
    self.acc = Accuracy()
    self.f1 = MultilabelMicroF1()
    self.idx2label = vocab.get_index_to_token_vocabulary("labels")
def __init__(self,
             word_embeddings: TextFieldEmbedder,
             encoder: Seq2SeqEncoder,
             vocab: Vocabulary) -> None:
    super().__init__(vocab)
    self.word_embeddings = word_embeddings
    self.encoder = encoder
    self.vocab = vocab
    self.label_vocab = vocab.get_index_to_token_vocabulary(namespace='labels')
    inf_vec = torch.Tensor([float('-inf')] * encoder.get_input_dim())
    self.class_avgs = [inf_vec.clone() for _ in range(len(self.label_vocab))]
    self.accuracy = CategoricalAccuracy()
    self.f_beta = FBetaMeasure(1.0, None, [0, 1, 2])
def __init__(self,
             vocabulary: Vocabulary,
             tag_namespace: str = "tags",
             ignore_classes: List[str] = None,
             label_encoding: str = "BIO") -> None:
    """
    Parameters
    ----------
    vocabulary : ``Vocabulary``, required.
        A vocabulary containing the tag namespace.
    tag_namespace : ``str``, optional (default = "tags")
        This metric assumes that a BIO format is used in which the
        labels are of the format: ["B-LABEL", "I-LABEL"].
    ignore_classes : ``List[str]``, optional.
        Span labels which will be ignored when computing span metrics.
        A "span label" is the part that comes after the BIO label, so it
        would be "ARG1" for the tag "B-ARG1". For example, by passing
        ``ignore_classes=["V"]``, the following sequence would not consider
        the "V" span at index (2, 3) when computing the precision, recall
        and F1 metrics:

        ["O", "O", "B-V", "I-V", "B-ARG1", "I-ARG1"]

        This is helpful, for instance, to avoid computing metrics for "V"
        spans in a BIO tagging scheme, which are typically not included.
    label_encoding : ``str``, optional (default = "BIO")
        The encoding used to specify label span endpoints in the sequence.
        Valid options are "BIO", "IOB1", or "BIOUL".
    """
    if label_encoding not in ["BIO", "IOB1", "BIOUL"]:
        raise ConfigurationError("Unknown label encoding - expected 'BIO', 'IOB1', 'BIOUL'.")

    self._label_encoding = label_encoding
    self._label_vocabulary = vocabulary.get_index_to_token_vocabulary(tag_namespace)
    self._ignore_classes: List[str] = ignore_classes or []

    # These will hold per label span counts.
    self._true_positives: Dict[str, int] = defaultdict(int)
    self._false_positives: Dict[str, int] = defaultdict(int)
    self._false_negatives: Dict[str, int] = defaultdict(int)
def get_related(token: str, embedding: Model, vocab: Vocabulary, num_related: int = 20):
    """Given a token, return a list of the top N most similar words to the token."""
    token_id = vocab.get_token_index(token, 'token_in')
    # The embedding weight matrix holds one (possibly pretrained) vector per token.
    token_vec = embedding.weight[token_id]
    # CosineSimilarity with dim=0 computes plain cosine similarity between two vectors.
    cosine = CosineSimilarity(dim=0)
    sims = Counter()

    # Cosine similarity of our token vector with every other word vector in the
    # vocabulary; iterate under a different name so we don't shadow `token`.
    for index, other_token in vocab.get_index_to_token_vocabulary('token_in').items():
        sim = cosine(token_vec, embedding.weight[index]).item()
        sims[other_token] = sim  # save the cosine similarity value

    return sims.most_common(num_related)
def create_target_weight():
    # from_files is a classmethod, so there is no need to build a throwaway instance.
    vocab = Vocabulary.from_files("data/vocabulary")
    token_weight_list = []
    for index, token in vocab.get_index_to_token_vocabulary().items():
        token_weight = get_target_distribution(token, vocab)
        token_weight_list.append(token_weight)
    weight = torch.stack(token_weight_list)

    s = Score.score
    torch.save(
        weight,
        "data/targets/target_{}{}{}{}{}{}.th".format(
            s["token_name"],
            s["key_name"],
            s["key_number"],
            s["triad_form"],
            s["figbass"],
            s["note_pair"],
        ),
    )
def test_saving_and_loading(self):
    # pylint: disable=protected-access
    vocab_dir = self.TEST_DIR / 'vocab_save'

    vocab = Vocabulary(non_padded_namespaces=["a", "c"])
    vocab.add_token_to_namespace("a0", namespace="a")  # non-padded, should start at 0
    vocab.add_token_to_namespace("a1", namespace="a")
    vocab.add_token_to_namespace("a2", namespace="a")
    vocab.add_token_to_namespace("b2", namespace="b")  # padded, should start at 2
    vocab.add_token_to_namespace("b3", namespace="b")

    vocab.save_to_files(vocab_dir)
    vocab2 = Vocabulary.from_files(vocab_dir)

    assert vocab2._non_padded_namespaces == {"a", "c"}

    # Check namespace a.
    assert vocab2.get_vocab_size(namespace='a') == 3
    assert vocab2.get_token_from_index(0, namespace='a') == 'a0'
    assert vocab2.get_token_from_index(1, namespace='a') == 'a1'
    assert vocab2.get_token_from_index(2, namespace='a') == 'a2'
    assert vocab2.get_token_index('a0', namespace='a') == 0
    assert vocab2.get_token_index('a1', namespace='a') == 1
    assert vocab2.get_token_index('a2', namespace='a') == 2

    # Check namespace b.
    assert vocab2.get_vocab_size(namespace='b') == 4  # (unk + padding + two tokens)
    assert vocab2.get_token_from_index(0, namespace='b') == vocab._padding_token
    assert vocab2.get_token_from_index(1, namespace='b') == vocab._oov_token
    assert vocab2.get_token_from_index(2, namespace='b') == 'b2'
    assert vocab2.get_token_from_index(3, namespace='b') == 'b3'
    assert vocab2.get_token_index(vocab._padding_token, namespace='b') == 0
    assert vocab2.get_token_index(vocab._oov_token, namespace='b') == 1
    assert vocab2.get_token_index('b2', namespace='b') == 2
    assert vocab2.get_token_index('b3', namespace='b') == 3

    # Check the dictionaries containing the reverse mapping are identical.
    assert vocab.get_index_to_token_vocabulary("a") == vocab2.get_index_to_token_vocabulary("a")
    assert vocab.get_index_to_token_vocabulary("b") == vocab2.get_index_to_token_vocabulary("b")
def test_saving_and_loading(self):
    # pylint: disable=protected-access
    vocab_dir = os.path.join(self.TEST_DIR, 'vocab_save')

    vocab = Vocabulary(non_padded_namespaces=["a", "c"])
    vocab.add_token_to_namespace("a0", namespace="a")  # non-padded, should start at 0
    vocab.add_token_to_namespace("a1", namespace="a")
    vocab.add_token_to_namespace("a2", namespace="a")
    vocab.add_token_to_namespace("b2", namespace="b")  # padded, should start at 2
    vocab.add_token_to_namespace("b3", namespace="b")

    vocab.save_to_files(vocab_dir)
    vocab2 = Vocabulary.from_files(vocab_dir)

    assert vocab2._non_padded_namespaces == ["a", "c"]

    # Check namespace a.
    assert vocab2.get_vocab_size(namespace='a') == 3
    assert vocab2.get_token_from_index(0, namespace='a') == 'a0'
    assert vocab2.get_token_from_index(1, namespace='a') == 'a1'
    assert vocab2.get_token_from_index(2, namespace='a') == 'a2'
    assert vocab2.get_token_index('a0', namespace='a') == 0
    assert vocab2.get_token_index('a1', namespace='a') == 1
    assert vocab2.get_token_index('a2', namespace='a') == 2

    # Check namespace b.
    assert vocab2.get_vocab_size(namespace='b') == 4  # (unk + padding + two tokens)
    assert vocab2.get_token_from_index(0, namespace='b') == vocab._padding_token
    assert vocab2.get_token_from_index(1, namespace='b') == vocab._oov_token
    assert vocab2.get_token_from_index(2, namespace='b') == 'b2'
    assert vocab2.get_token_from_index(3, namespace='b') == 'b3'
    assert vocab2.get_token_index(vocab._padding_token, namespace='b') == 0
    assert vocab2.get_token_index(vocab._oov_token, namespace='b') == 1
    assert vocab2.get_token_index('b2', namespace='b') == 2
    assert vocab2.get_token_index('b3', namespace='b') == 3

    # Check the dictionaries containing the reverse mapping are identical.
    assert vocab.get_index_to_token_vocabulary("a") == vocab2.get_index_to_token_vocabulary("a")
    assert vocab.get_index_to_token_vocabulary("b") == vocab2.get_index_to_token_vocabulary("b")
def test_saving_and_loading(self):
    vocab_dir = self.TEST_DIR / "vocab_save"

    vocab = Vocabulary(non_padded_namespaces=["a", "c"])
    vocab.add_tokens_to_namespace(["a0", "a1", "a2"], namespace="a")  # non-padded, should start at 0
    vocab.add_tokens_to_namespace(["b2", "b3"], namespace="b")  # padded, should start at 2

    vocab.save_to_files(vocab_dir)
    vocab2 = Vocabulary.from_files(vocab_dir)

    assert vocab2._non_padded_namespaces == {"a", "c"}

    # Check namespace a.
    assert vocab2.get_vocab_size(namespace="a") == 3
    assert vocab2.get_token_from_index(0, namespace="a") == "a0"
    assert vocab2.get_token_from_index(1, namespace="a") == "a1"
    assert vocab2.get_token_from_index(2, namespace="a") == "a2"
    assert vocab2.get_token_index("a0", namespace="a") == 0
    assert vocab2.get_token_index("a1", namespace="a") == 1
    assert vocab2.get_token_index("a2", namespace="a") == 2

    # Check namespace b.
    assert vocab2.get_vocab_size(namespace="b") == 4  # (unk + padding + two tokens)
    assert vocab2.get_token_from_index(0, namespace="b") == vocab._padding_token
    assert vocab2.get_token_from_index(1, namespace="b") == vocab._oov_token
    assert vocab2.get_token_from_index(2, namespace="b") == "b2"
    assert vocab2.get_token_from_index(3, namespace="b") == "b3"
    assert vocab2.get_token_index(vocab._padding_token, namespace="b") == 0
    assert vocab2.get_token_index(vocab._oov_token, namespace="b") == 1
    assert vocab2.get_token_index("b2", namespace="b") == 2
    assert vocab2.get_token_index("b3", namespace="b") == 3

    # Check the dictionaries containing the reverse mapping are identical.
    assert vocab.get_index_to_token_vocabulary("a") == vocab2.get_index_to_token_vocabulary("a")
    assert vocab.get_index_to_token_vocabulary("b") == vocab2.get_index_to_token_vocabulary("b")
def __init__(self,
             vocab: Vocabulary,
             embedder: TextFieldEmbedder,
             message_encoder: Seq2VecEncoder,
             conversation_encoder: Seq2SeqEncoder,
             dropout: float = 0.5,
             pos_weight: float = None,
             use_game_scores: bool = False) -> None:
    super().__init__(vocab)
    self._embedder = embedder
    self._message_encoder = message_encoder
    self._conversation_encoder = conversation_encoder
    self._use_game_scores = use_game_scores

    output_dim = conversation_encoder.get_output_dim() + int(self._use_game_scores)
    self._classifier = nn.Linear(in_features=output_dim,
                                 out_features=vocab.get_vocab_size('labels'))
    self._dropout = nn.Dropout(dropout)

    self._label_index_to_token = vocab.get_index_to_token_vocabulary(namespace="labels")
    self._num_labels = len(self._label_index_to_token)
    print(self._label_index_to_token)
    index_list = list(range(self._num_labels))
    print(index_list)

    self._f1 = FBetaMeasure(average=None, labels=index_list)
    self._f1_micro = FBetaMeasure(average='micro')
    self._f1_macro = FBetaMeasure(average='macro')

    if pos_weight is None or pos_weight <= 0:
        labels_counter = self.vocab._retained_counter['labels']
        self._pos_weight = 1. * labels_counter['True'] / labels_counter['False']
        # self._pos_weight = 15.886736214605067
        print('Computing Pos weight from labels:', self._pos_weight)
    else:
        self._pos_weight = float(pos_weight)
def __init__(self,
             vocabulary: Vocabulary,
             namespace: str = "intent_labels",
             ignore_classes: List[str] = None,
             coarse: bool = True) -> None:
    """
    Parameters
    ----------
    vocabulary : ``Vocabulary``, required.
        A vocabulary containing the label namespace.
    namespace : ``str``, optional (default = "intent_labels")
        The vocabulary namespace for labels.
    ignore_classes : ``List[str]``, optional.
        Labels which will be ignored when computing metrics.
    """
    self._label_vocabulary = vocabulary.get_index_to_token_vocabulary(namespace)
    self._ignore_classes: List[str] = ignore_classes or []
    self._coarse = coarse

    # These will hold per label span counts.
    self._true_positives: Dict[str, int] = defaultdict(int)
    self._false_positives: Dict[str, int] = defaultdict(int)
    self._false_negatives: Dict[str, int] = defaultdict(int)
def __init__(self, window_size=5, lazy=False, vocab: Vocabulary = None):
    """A DatasetReader for reading a plain text corpus and producing instances
    for the SkipGram model. When vocab is not None, this runs sub-sampling of
    frequent words as described in (Mikolov et al. 2013)."""
    super().__init__(lazy=lazy)
    self.window_size = window_size
    self.reject_probs = None
    if vocab:
        self.reject_probs = {}
        threshold = 1.e-3
        token_counts = vocab._retained_counter['token_in']  # HACK
        total_counts = sum(token_counts.values())
        for _, token in vocab.get_index_to_token_vocabulary('token_in').items():
            counts = token_counts[token]
            if counts > 0:
                normalized_counts = counts / total_counts
                reject_prob = 1. - math.sqrt(threshold / normalized_counts)
                reject_prob = max(0., reject_prob)
            else:
                reject_prob = 0.
            self.reject_probs[token] = reject_prob
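The loop implements the word2vec sub-sampling rule reject_prob = 1 - sqrt(threshold / frequency). A standalone worked example of the same arithmetic:

import math

threshold = 1.e-3
normalized_counts = 0.01  # a token accounting for 1% of the corpus
reject_prob = max(0., 1. - math.sqrt(threshold / normalized_counts))
print(round(reject_prob, 4))  # 0.6838 -> this frequent token is dropped ~68% of the time
# A token at or below the threshold frequency (0.1%) is never rejected.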
def write_embeddings(embedding: Embedding, file_path, vocab: Vocabulary):
    with open(file_path, mode='w') as f:
        for index, token in vocab.get_index_to_token_vocabulary('token_in').items():
            values = ['{:.5f}'.format(val) for val in embedding.weight[index]]
            f.write(' '.join([token] + values))
            f.write('\n')
def __init__(self,
             vocab: Vocabulary,
             bert_embedder: Optional[PretrainedBertEmbedder] = None,
             encoder: Optional[Seq2SeqEncoder] = None,
             dropout: Optional[float] = None,
             use_crf: bool = True,
             add_random_noise: bool = False,
             add_attack_noise: bool = False,
             do_noise_normalization: bool = True,
             noise_norm: Optional[float] = None,
             noise_loss_prob: Optional[float] = None,
             add_noise_for: str = "ov",
             rnn_after_embeddings: bool = False,
             open_vocabulary_slots: Optional[List[str]] = None,
             metrics_for_each_slot_type: bool = False) -> None:
    """
    Params
    ------
    vocab: the AllenNLP Vocabulary object; passed automatically.
    bert_embedder: the pretrained BERT embedder. If it is not None, the pretrained BERT
        embedding (parameters fixed) will be used as the embedding layer. Otherwise, a
        look-up embedding matrix will be initialized with embedding size 1024. Default: None.
    encoder: the contextual encoder used after the embedding layer. If set to None, no
        contextual encoder will be used.
    dropout: the dropout rate (not set in our experiments).
    use_crf: if set to True, a CRF will be used as the output layer of the model.
        Otherwise, a softmax layer (with cross-entropy loss) will be used.
    add_random_noise: whether to add random noise to slots. Cannot be set simultaneously
        with add_attack_noise. This setting is used as a baseline in our experiments.
    add_attack_noise: whether to add adversarial attack noise to slots. Cannot be set
        simultaneously with add_random_noise.
    do_noise_normalization: if set to True, normalization will be applied to the gradients
        w.r.t. token embeddings. Otherwise, the gradients won't be normalized.
    noise_norm: the normalization norm (L2) applied to the gradients.
    noise_loss_prob: the alpha hyperparameter balancing the loss of the normal forward
        pass and the adversarial forward pass. See the paper for more details. Should be
        set between 0 and 1.
    add_noise_for: if set to "ov", the noise will only be applied to open-vocabulary slots.
        Otherwise, the noise will be applied to all slots (both open-vocabulary and normal).
    rnn_after_embeddings: if set to True, an additional BiLSTM layer will be applied after
        the embedding layer. Default: False.
    open_vocabulary_slots: the list of open-vocabulary slots. If not set, defaults to the
        open-vocabulary slots of the Snips dataset.
    metrics_for_each_slot_type: whether to log metrics for each slot type. Default: False.
    """
    super().__init__(vocab)
    if bert_embedder:
        self.use_bert = True
        self.bert_embedder = bert_embedder
    else:
        self.use_bert = False
        self.basic_embedder = BasicTextFieldEmbedder({
            "tokens": Embedding(vocab.get_vocab_size(namespace="tokens"), 1024)
        })
        self.rnn_after_embeddings = rnn_after_embeddings
        if rnn_after_embeddings:
            self.rnn = Seq2SeqEncoder.from_params(
                Params({
                    "type": "lstm",
                    "input_size": 1024,
                    "hidden_size": 512,
                    "bidirectional": True,
                    "batch_first": True
                }))

    self.encoder = encoder
    if encoder:
        hidden2tag_in_dim = encoder.get_output_dim()
    else:
        hidden2tag_in_dim = bert_embedder.get_output_dim()
    self.hidden2tag = TimeDistributed(
        torch.nn.Linear(in_features=hidden2tag_in_dim,
                        out_features=vocab.get_vocab_size("labels")))

    if dropout:
        self.dropout = torch.nn.Dropout(dropout)
    else:
        self.dropout = None

    self.use_crf = use_crf
    if use_crf:
        crf_constraints = allowed_transitions(
            constraint_type="BIO",
            labels=vocab.get_index_to_token_vocabulary("labels"))
        self.crf = ConditionalRandomField(
            num_tags=vocab.get_vocab_size("labels"),
            constraints=crf_constraints,
            include_start_end_transitions=True)

    # Default open-vocabulary slots: for the SNIPS dataset.
    open_vocabulary_slots = open_vocabulary_slots or [
        "playlist", "entity_name", "poi", "restaurant_name",
        "geographic_poi", "album", "track", "object_name", "movie_name"
    ]
    self.f1 = OVSpecSpanBasedF1Measure(
        vocab,
        tag_namespace="labels",
        ignore_classes=[],
        label_encoding="BIO",
        open_vocabulary_slots=open_vocabulary_slots)

    self.add_random_noise = add_random_noise
    self.add_attack_noise = add_attack_noise
    assert not (add_random_noise and add_attack_noise), "both random and attack noise applied"
    if add_random_noise or add_attack_noise:
        self.do_noise_normalization = do_noise_normalization
        assert noise_norm is not None
        assert noise_loss_prob is not None and 0. <= noise_loss_prob <= 1.
        self.noise_norm = noise_norm
        self.noise_loss_prob = noise_loss_prob
        assert add_noise_for in ["ov", "all"]
        self.ov_noise_only = (add_noise_for == "ov")

    self.metrics_for_each_slot_type = metrics_for_each_slot_type
def _read_embeddings_from_text_file(file_uri: str,
                                    embedding_dim: int,
                                    vocab: Vocabulary,
                                    namespace: str = "tokens") -> torch.FloatTensor:
    """
    Read pre-trained word vectors from a possibly compressed text file, which may itself
    be contained inside an archive with multiple files. The text file is assumed to be
    utf-8 encoded with space-separated fields: [word] [dim 1] [dim 2] ...

    Lines that contain a different number of values than `embedding_dim` raise a warning
    and are skipped. The remainder of the docstring is identical to
    `_read_pretrained_embeddings_file`.
    """
    tokens_to_keep = set(vocab.get_index_to_token_vocabulary(namespace).values())
    vocab_size = vocab.get_vocab_size(namespace)
    embeddings = {}

    # First we read the embeddings from the file, only keeping vectors for the words we need.
    logger.info("Reading pretrained embeddings from file")
    with EmbeddingsTextFile(file_uri) as embeddings_file:
        for line in Tqdm.tqdm(embeddings_file):
            token = line.split(" ", 1)[0]
            if token in tokens_to_keep:
                fields = line.rstrip().split(" ")
                if len(fields) - 1 != embedding_dim:
                    # Sometimes there are funny unicode parsing problems that lead to
                    # different fields lengths (e.g., a word with a unicode space character
                    # that splits into more than one column). We skip those lines. Note
                    # that if you have some kind of long header, this could result in all
                    # of your lines getting skipped. It's hard to check for that here; you
                    # just have to look in the embedding_misses_file and at the model
                    # summary to make sure things look like they are supposed to.
                    logger.warning(
                        "Found line with wrong number of dimensions (expected: %d; actual: %d): %s",
                        embedding_dim,
                        len(fields) - 1,
                        line,
                    )
                    continue

                vector = numpy.asarray(fields[1:], dtype="float32")
                embeddings[token] = vector

    if not embeddings:
        raise ConfigurationError(
            "No embeddings of correct dimension found; you probably "
            "misspecified your embedding_dim parameter, or didn't "
            "pre-populate your Vocabulary"
        )

    all_embeddings = numpy.asarray(list(embeddings.values()))
    embeddings_mean = float(numpy.mean(all_embeddings))
    embeddings_std = float(numpy.std(all_embeddings))

    # Now we initialize the weight matrix for an embedding layer, starting with random
    # vectors, then filling in the word vectors we just read.
    logger.info("Initializing pre-trained embedding layer")
    embedding_matrix = torch.FloatTensor(vocab_size, embedding_dim).normal_(
        embeddings_mean, embeddings_std
    )
    num_tokens_found = 0
    index_to_token = vocab.get_index_to_token_vocabulary(namespace)
    for i in range(vocab_size):
        token = index_to_token[i]

        # If we don't have a pre-trained vector for this word, we'll just leave this row
        # alone, so the word has a random initialization.
        if token in embeddings:
            embedding_matrix[i] = torch.FloatTensor(embeddings[token])
            num_tokens_found += 1
        else:
            logger.debug(
                "Token %s was not found in the embedding file. Initialising randomly.", token
            )

    logger.info("Pretrained embeddings were found for %d out of %d tokens",
                num_tokens_found, vocab_size)

    return embedding_matrix
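A minimal round-trip sketch for the helper above, under the assumption that it is importable alongside Vocabulary (in AllenNLP it is a private function in the embedding module); toy_vectors.txt is a hypothetical file created just for the demonstration.

# Assumed setup: a tiny vocabulary and a toy embeddings file.
vocab = Vocabulary()
vocab.add_token_to_namespace("hello")
with open("toy_vectors.txt", "w") as f:  # hypothetical file name
    f.write("hello 0.1 0.2 0.3\n")

weight = _read_embeddings_from_text_file("toy_vectors.txt", embedding_dim=3, vocab=vocab)
# weight has shape (vocab_size, 3); the row for "hello" is [0.1, 0.2, 0.3],
# while the padding/OOV rows keep their random normal initialization.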
def __init__(self,
             vocabulary: Vocabulary,
             tag_namespace: str = "tags",
             ignore_classes: List[str] = None,
             label_encoding: Optional[str] = "BMESO",
             tags_to_spans_function: Optional[TAGS_TO_SPANS_FUNCTION_TYPE] = None) -> None:
    """
    Parameters
    ----------
    vocabulary : ``Vocabulary``, required.
        A vocabulary containing the tag namespace.
    tag_namespace : ``str``, optional (default = "tags")
        This metric assumes that a BIO format is used in which the
        labels are of the format: ["B-LABEL", "I-LABEL"].
    ignore_classes : ``List[str]``, optional.
        Span labels which will be ignored when computing span metrics.
        A "span label" is the part that comes after the BIO label, so it
        would be "ARG1" for the tag "B-ARG1". For example, by passing
        ``ignore_classes=["V"]``, the following sequence would not consider
        the "V" span at index (2, 3) when computing the precision, recall
        and F1 metrics:

        ["O", "O", "B-V", "I-V", "B-ARG1", "I-ARG1"]

        This is helpful, for instance, to avoid computing metrics for "V"
        spans in a BIO tagging scheme, which are typically not included.
    label_encoding : ``str``, optional (default = "BMESO")
        The encoding used to specify label span endpoints in the sequence.
        Valid options are "BIO", "IOB1", "BIOUL", "BMES" or "BMESO".
    tags_to_spans_function : ``Callable``, optional (default = ``None``)
        If ``label_encoding`` is ``None``, ``tags_to_spans_function`` will be
        used to generate spans.
    """
    if label_encoding and tags_to_spans_function:
        raise ConfigurationError(
            'Both label_encoding and tags_to_spans_function are provided. '
            'Set "label_encoding=None" explicitly to enable tags_to_spans_function.'
        )
    if label_encoding:
        if label_encoding not in ["BIO", "IOB1", "BIOUL", "BMES", "BMESO"]:
            raise ConfigurationError(
                "Unknown label encoding - expected 'BIO', 'IOB1', 'BIOUL', 'BMES', 'BMESO'."
            )
    elif tags_to_spans_function is None:
        raise ConfigurationError(
            'At least one of the (label_encoding, tags_to_spans_function) should be provided.'
        )

    self._label_encoding = label_encoding
    self._tags_to_spans_function = tags_to_spans_function
    self._label_vocabulary = vocabulary.get_index_to_token_vocabulary(tag_namespace)
    self._ignore_classes: List[str] = ignore_classes or []

    # These will hold per label span counts.
    self._true_positives: Dict[str, int] = defaultdict(int)
    self._false_positives: Dict[str, int] = defaultdict(int)
    self._false_negatives: Dict[str, int] = defaultdict(int)
def __init__(self,
             vocabulary: Vocabulary,
             tag_namespace: str = "tags",
             ignore_classes: List[str] = None,
             ontology_path: str = None) -> None:
    """
    Parameters
    ----------
    vocabulary : ``Vocabulary``, required.
        A vocabulary containing the tag namespace.
    tag_namespace : ``str``, optional (default = "tags")
        This metric assumes that a BIO format is used in which the
        labels are of the format: ["B-LABEL", "I-LABEL"].
    ignore_classes : ``List[str]``, optional.
        Span labels which will be ignored when computing span metrics.
        A "span label" is the part that comes after the BIO label, so it
        would be "ARG1" for the tag "B-ARG1". For example, by passing
        ``ignore_classes=["V"]``, the following sequence would not consider
        the "V" span at index (2, 3) when computing the precision, recall
        and F1 metrics:

        ["O", "O", "B-V", "I-V", "B-ARG1", "I-ARG1"]

        This is helpful, for instance, to avoid computing metrics for "V"
        spans in a BIO tagging scheme, which are typically not included.
    """
    self._label_vocabulary = vocabulary.get_index_to_token_vocabulary(tag_namespace)
    self._ignore_classes = ignore_classes or []
    self.num_classes = vocabulary.get_vocab_size(tag_namespace)
    if ontology_path is not None:
        self._ontology = FrameOntology(ontology_path)

    # These will hold per label span counts.
    self._true_positives: Dict[str, int] = defaultdict(int)
    self._false_positives: Dict[str, int] = defaultdict(int)
    self._false_negatives: Dict[str, int] = defaultdict(int)

    # These will hold unlabeled span counts.
    self._unlabeled_true_positives: int = 0
    self._unlabeled_false_positives: int = 0
    self._unlabeled_false_negatives: int = 0

    # These will hold partial match counts.
    self._partial_true_positives: int = 0
    self._partial_false_positives: int = 0
    self._partial_false_negatives: int = 0

    # These will hold width-wise span counts.
    self._width_tp: Dict[int, int] = defaultdict(int)
    self._width_fp: Dict[int, int] = defaultdict(int)
    self._width_fn: Dict[int, int] = defaultdict(int)

    # These will hold distance-wise span counts.
    self._dist_tp: Dict[int, int] = defaultdict(int)
    self._dist_fp: Dict[int, int] = defaultdict(int)
    self._dist_fn: Dict[int, int] = defaultdict(int)

    self._gold_spans: List[Set[Tuple[int, int, str]]] = []
    self._predicted_spans: List[Set[Tuple[int, int, str]]] = []