import torch
from torch.utils.data import Dataset

# NOTE: Vocab is assumed to be the project's vocabulary class, providing
# build_vocab, load_from_file, save, convert_tokens_to_ids and len().


def build_vocab(files, vocabulary=None, mtl=False, name="src", save_dir="/"):
    '''
    Build or load vocabularies
    files: list of token files used to build the vocabularies
    vocabulary: optional list of saved vocabulary files to load instead of building
    mtl: if True (multi-task learning), build one vocabulary per file
    name: prefix used when saving vocabulary files
    save_dir: directory where vocabulary files are saved
    '''
    vocabs = []
    if vocabulary is not None:
        # Load pre-built vocabularies from disk.
        for v in vocabulary:
            print(f'Loading from {v}')
            vocab = Vocab()
            vocab.load_from_file(v)
            vocabs.append(vocab)
    else:
        if mtl:
            # Multi-task learning: one vocabulary per input file.
            for index, f in enumerate(files):
                vocab = Vocab()
                vocab.build_vocab([f])
                vocab.save(save_dir + name + ".vocab." + str(index) + ".json")
                vocabs.append(vocab)
        else:
            # Single task: one shared vocabulary over all files.
            vocab = Vocab()
            vocab.build_vocab(files)
            vocab.save(save_dir + name + ".vocab.json")
            vocabs.append(vocab)
    for index, vocab in enumerate(vocabs):
        print(f'vocabulary size {index+1:d}: {vocab.len():d}')
    return vocabs
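
# Usage sketch (not from the original source): the file names below are
# hypothetical. With mtl=False each call builds one shared vocabulary over
# its files and saves it as e.g. ./src.vocab.json.
#
#   src_vocabs = build_vocab(["train.src"], name="src", save_dir="./")
#   tgt_vocabs = build_vocab(["train.tgt"], name="tgt", save_dir="./")
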
class ParallelDataset(Dataset):
    '''
    This class builds a dataset from source/target files according to a max_length
    '''

    def __init__(self, source_name, target_name, max_length=300,
                 source_vocab=None, target_vocab=None):
        self.data_source = self.read_file(source_name)
        self.data_target = self.read_file(target_name)
        self.max_length = max_length
        # Build vocabularies from the data files when none are provided.
        self.source_vocab = source_vocab
        if source_vocab is None:
            self.source_vocab = Vocab()
            self.source_vocab.build_vocab([source_name])
        self.target_vocab = target_vocab
        if target_vocab is None:
            self.target_vocab = Vocab()
            self.target_vocab.build_vocab([target_name])

    def __len__(self):
        '''
        Return the number of sentence pairs in the dataset
        '''
        return len(self.data_source)

    def __getitem__(self, index):
        '''
        Return the (source, target) pair at index as padded tensors of token ids
        '''
        src_tokens = self.padding_sentence(self.data_source[index])
        tgt_tokens = self.padding_sentence(self.data_target[index])
        src_tokens_ids = self.source_vocab.convert_tokens_to_ids(src_tokens)
        src_tokens_ids_tensor = torch.tensor(src_tokens_ids)
        tgt_tokens_ids = self.target_vocab.convert_tokens_to_ids(tgt_tokens)
        tgt_tokens_ids_tensor = torch.tensor(tgt_tokens_ids)
        return src_tokens_ids_tensor, tgt_tokens_ids_tensor

    def read_file(self, filename):
        '''
        Read a file, one whitespace-tokenized sentence per line
        filename: filename or path of the source/target file
        '''
        data = []
        with open(filename, "r") as f:
            for line in f:
                data.append(line.strip().split())
        return data

    def padding_sentence(self, tokens):
        '''
        Pad the sentence: add <sos> and <eos> tokens and fix the length to max_length
        tokens: list of tokens of a sentence
        '''
        tokens = ['<sos>'] + tokens + ['<eos>']
        if len(tokens) < self.max_length:
            tokens = tokens + ['<pad>' for _ in range(self.max_length - len(tokens))]
        else:
            # Truncate overly long sentences, keeping a final <eos> marker.
            tokens = tokens[:self.max_length - 1] + ['<eos>']
        return tokens

    def vocabs(self):
        '''
        Return the source and target vocabularies
        '''
        return self.source_vocab, self.target_vocab
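
# Minimal usage sketch (not from the original source): "train.src" and
# "train.tgt" are hypothetical whitespace-tokenized parallel files, and Vocab
# is assumed to map the <sos>/<eos>/<pad> specials to ids. Since every example
# is padded to max_length, the default collate function can stack batches.
if __name__ == "__main__":
    from torch.utils.data import DataLoader

    dataset = ParallelDataset("train.src", "train.tgt", max_length=128)
    source_vocab, target_vocab = dataset.vocabs()
    loader = DataLoader(dataset, batch_size=32, shuffle=True)
    for src_batch, tgt_batch in loader:
        # Each batch has shape (batch_size, max_length) of token ids.
        print(src_batch.shape, tgt_batch.shape)
        break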