Example #1
def test_metrics():
    metrics = Metrics(
        accuracy={"type": "categorical_accuracy"},
        f1={
            "type": "span_f1",
            "vocabulary": Vocabulary.empty(),
        },
    )

    # Check that training and validation metrics are different instances
    assert (metrics.get_dict()["accuracy"]
            is not metrics.get_dict(is_train=False)["accuracy"])
    # Check that the train and validation metrics share the same vocabulary
    assert (metrics.get_dict()["f1"]._label_vocabulary is
            metrics.get_dict(is_train=False)["f1"]._label_vocabulary)
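
A minimal usage sketch of the pattern this test checks, assuming the metrics follow the standard AllenNLP `Metric` interface (`__call__` to accumulate a batch, `get_metric(reset=...)` to read the value); `predictions` and `gold_labels` here are placeholder tensors:

train_metrics = metrics.get_dict()              # metrics used during training
val_metrics = metrics.get_dict(is_train=False)  # separate instances for validation

train_metrics["accuracy"](predictions, gold_labels)
val_metrics["accuracy"](val_predictions, val_gold_labels)

# Because the two dicts hold different metric instances, resetting one
# does not clear the other.
print(train_metrics["accuracy"].get_metric(reset=True))
print(val_metrics["accuracy"].get_metric(reset=True))
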
Example #2
    def run(  # type: ignore
        self,
        tokenizer_name: str,
        input: DatasetDict,
        fields_to_tokenize: Optional[List[str]] = None,
        add_special_tokens: bool = True,
        max_length: Optional[int] = 512,
        special_tokens_mask: bool = False,
        offset_mapping: bool = False,
    ) -> DatasetDict:
        """
        Reads a dataset and converts all strings in it into `TransformerTextField`.

        * `tokenizer_name` is the name of the tokenizer to use. For example, `"roberta-large"`.
        * `input` is the dataset to transform in this way.
        * By default, this step tokenizes all strings it finds, but if you specify
          `fields_to_tokenize`, it will only tokenize the named fields.
        * `add_special_tokens` specifies whether or not to add special tokens to the tokenized strings.
        * `max_length` is the maximum length the resulting `TransformerTextField` will have.
          If there is too much text, it will be truncated.
        * `special_tokens_mask` specifies whether to add the special token mask as one of the
          tensors in `TransformerTextField`.
        * `offset_mapping` specifies whether to add a mapping from tokens to original string
          offsets to the tensors in `TransformerTextField`.

        This function returns a new dataset with new `TransformerTextField`s.
        """

        tokenizer = cached_transformers.get_tokenizer(tokenizer_name)
        assert tokenizer.pad_token_type_id == 0

        field_names_used = set()

        # find all the strings
        if fields_to_tokenize is None:

            def should_tokenize_field(fname: str) -> bool:
                return True

        else:
            regexes_to_tokenize = [re.compile(r) for r in fields_to_tokenize]

            def should_tokenize_field(fname: str) -> bool:
                for r in regexes_to_tokenize:
                    if r.fullmatch(fname):
                        return True
                return False

        def find_string_objects(o: Any,
                                prefix: str = "") -> Iterable[Tuple[str, str]]:
            prefix = prefix.lstrip(".")
            if isinstance(o, str):
                if should_tokenize_field(prefix):
                    yield prefix, o
            elif isinstance(o, List):
                for i, item in enumerate(o):
                    yield from find_string_objects(item, f"{prefix}.{i}")
            elif isinstance(o, Dict):
                for name, item in o.items():
                    yield from find_string_objects(item, f"{prefix}.{name}")
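        # `find_string_objects` yields (field_path, string) pairs. The path is a
        # dotted name: a top-level dict key stays as-is, nested keys become e.g.
        # "metadata.title", and list elements get their index, e.g. "answers.0".
        # `should_tokenize_field` matches these paths against the
        # `fields_to_tokenize` regexes (when they are given).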

        strings = []
        for split_name, instances in input.splits.items():
            for instance in instances:
                for name, string in find_string_objects(instance):
                    field_names_used.add(name)
                    strings.append(string)

        for field_name in sorted(field_names_used):
            self.logger.info("Tokenizing field %s", field_name)

        # We collect all the strings up front so that we can make a single call to
        # `batch_encode_plus`, which is much faster than encoding each string on
        # its own.
        encoded = tokenizer.batch_encode_plus(
            strings,
            add_special_tokens=add_special_tokens,
            truncation=max_length is not None,
            max_length=max_length,
            return_token_type_ids=True,
            return_attention_mask=False,
            return_special_tokens_mask=special_tokens_mask,
            return_offsets_mapping=offset_mapping,
        )

        # make fields
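        # The mapping is keyed by the string itself, so identical strings that
        # appear in several instances or fields share a single TransformerTextField.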
        string_to_field = {
            s: TransformerTextField(
                torch.tensor(encoded["input_ids"][i], dtype=torch.int32),
                torch.tensor(encoded["token_type_ids"][i], dtype=torch.int32),
                torch.tensor(encoded["attention_mask"][i], dtype=torch.bool)
                if "attention_mask" in encoded else None,
                torch.tensor(encoded["special_tokens_mask"][i],
                             dtype=torch.bool)
                if "special_tokens_mask" in encoded else None,
                torch.tensor(encoded["offset_mapping"][i], dtype=torch.int32)
                if "offset_mapping" in encoded else None,
                tokenizer.pad_token_id,
            )
            for i, s in enumerate(strings)
        }

        def replace_string_objects(o: Any) -> Any:
            if isinstance(o, str):
                try:
                    return string_to_field[o]
                except KeyError:
                    return o
            elif isinstance(o, List) or isinstance(o, Dataset):
                return [replace_string_objects(i) for i in o]
            elif isinstance(o, Dict):
                return {
                    key: replace_string_objects(value)
                    for key, value in o.items()
                }
            else:
                return o

        new_splits = {
            split_name: replace_string_objects(split_data)
            for split_name, split_data in input.splits.items()
        }

        # make vocab
        if input.vocab is not None:
            vocab = copy.deepcopy(input.vocab)
        else:
            vocab = Vocabulary.empty()

        for name in field_names_used:
            vocab.add_transformer_vocab(tokenizer, name)

        return DatasetDict(new_splits, vocab)
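
A usage sketch for this step. The enclosing class is not shown in the snippet above, so `TokenizeTextStep` is a hypothetical stand-in and `raw_data` is a placeholder `DatasetDict`; the keyword arguments mirror the documented parameters of `run()`.

step = TokenizeTextStep()  # hypothetical name for the class that defines run()
tokenized = step.run(
    tokenizer_name="roberta-large",
    input=raw_data,                # a DatasetDict with one or more splits
    fields_to_tokenize=["text"],   # regexes; only matching field paths are tokenized
    max_length=512,
)
# Every matched string is now a TransformerTextField, and the returned
# DatasetDict's vocabulary has been extended with the tokenizer's vocab.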