def get_vocabulary_table(workdir, words):
    vocab_table = VocabularyTable()
    for word in words:
        vocab_table.add_label(word['word'])
    return vocab_table

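# Usage sketch for get_vocabulary_table(): `words` is assumed to be an
# iterable of dicts carrying the surface form under the 'word' key, as the
# loop above implies; note that the workdir argument is unused by the
# function body itself.
words = [{'word': '私'}, {'word': 'あなた'}, {'word': 'あなた'}]
vocab_table = get_vocabulary_table('work', words)
assert vocab_table.get_label_id('あなた') == 5  # ids 0-3 are reserved specials
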
parser.add_argument('--lr', type=float, default=0.01,
                    help='Learning rate')
parser.add_argument('--epochs', type=int, default=5,
                    help='Number of epochs for training')
parser.add_argument('--batch-size', type=int, default=32,
                    help='Batch size')
parser.add_argument('--device', type=str, default='cpu',
                    help='Device string')
parser.add_argument('--resume', action='store_true',
                    help="Training resumes from a model file "
                         "specified by the '--model-file' option")
args = parser.parse_args()

print('Loading vocabulary table ...')
vocab_path = os.path.join(args.workdir, args.vocabulary_table_file)
vocab_table = VocabularyTable.load(vocab_path,
                                   min_freq=args.min_word_frequency)

print('Loading training data ...')
training_data_dirpath = os.path.join(args.workdir,
                                     args.training_data_dirname)
repository_tr = TrainingDatasetRepository(training_data_dirpath)
dataset_tr = IterableAudioDataset(repository_tr, vocab_table)
dataloader_tr = DataLoader(dataset_tr, batch_size=args.batch_size,
                           collate_fn=ListenAttendSpell.collate)

print('Loading development data ...')
development_data_dirpath = os.path.join(args.workdir,
                                        args.development_data_dirname)
repository_dev = DevelopmentDatasetRepository(development_data_dirpath)
dataloaders_dev = []
for dataset_dev in AudioDataset.load_all(repository_dev, vocab_table):
    # The tail of this loop was truncated in the source; it is assumed to
    # mirror the training DataLoader above and collect the dev loaders.
    dataloader_dev = DataLoader(dataset_dev, batch_size=args.batch_size,
                                collate_fn=ListenAttendSpell.collate)
    dataloaders_dev.append(dataloader_dev)

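# For reference: a collate_fn such as ListenAttendSpell.collate typically pads
# variable-length utterances so they can be stacked into one batch tensor.
# The sketch below is an assumption about its behaviour, not the repository's
# actual implementation; `batch` is assumed to be a list of
# (features, label_ids) tensor pairs, and padding_value=0 matches the
# '<pad>' id pinned down by the tests below.
import torch
from torch.nn.utils.rnn import pad_sequence

def collate_sketch(batch):
    features, labels = zip(*batch)
    feature_lengths = torch.tensor([f.size(0) for f in features])
    padded_features = pad_sequence(list(features), batch_first=True)
    padded_labels = pad_sequence(list(labels), batch_first=True,
                                 padding_value=0)
    return padded_features, feature_lengths, padded_labels
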
def test_vocabulary_table_get_unk_id():
    vocab_table = VocabularyTable()
    assert vocab_table.get_unk_id() == 1
    assert vocab_table.get_label_id('<unk>') == 1
    assert vocab_table.get_label(1) == '<unk>'

def test_vocabulary_table_get_bos_id():
    vocab_table = VocabularyTable()
    assert vocab_table.get_bos_id() == 2
    assert vocab_table.get_label_id('<bos>') == 2
    assert vocab_table.get_label(2) == '<bos>'

def test_vocabulary_table_get_pad_id():
    vocab_table = VocabularyTable()
    assert vocab_table.get_pad_id() == 0
    assert vocab_table.get_label_id('<pad>') == 0
    assert vocab_table.get_label(0) == '<pad>'

def test_vocabulary_table_save_and_load(workdir):
    vocab_path = os.path.join(workdir, 'vocab.txt')
    vocab_table = VocabularyTable()
    vocab_table.add_label('私')
    vocab_table.add_label('あなた')
    vocab_table.add_label('あなた')
    vocab_table.save(vocab_path)

    vocab_loaded = VocabularyTable.load(vocab_path)
    assert vocab_loaded.get_label_id('<pad>') == 0
    assert vocab_loaded.get_label(0) == '<pad>'
    assert vocab_loaded.get_label_id('<unk>') == 1
    assert vocab_loaded.get_label(1) == '<unk>'
    assert vocab_loaded.get_label_id('<bos>') == 2
    assert vocab_loaded.get_label(2) == '<bos>'
    assert vocab_loaded.get_label_id('<eos>') == 3
    assert vocab_loaded.get_label(3) == '<eos>'
    assert vocab_loaded.get_label_id('私') == 4
    assert vocab_loaded.get_label(4) == '私'
    assert vocab_loaded.get_label_id('あなた') == 5
    assert vocab_loaded.get_label(5) == 'あなた'

    vocab_loaded = VocabularyTable.load(vocab_path, min_freq=2)
    assert vocab_loaded.get_label_id('あなた') == 4
    assert vocab_loaded.get_label(4) == 'あなた'
    assert vocab_loaded.get_label_id('私') == vocab_loaded.get_unk_id()

    vocab_loaded = VocabularyTable.load(vocab_path, min_freq=3)
    assert vocab_loaded.get_label_id('あなた') == vocab_loaded.get_unk_id()
    assert vocab_loaded.get_label_id('私') == vocab_loaded.get_unk_id()

def test_vocabulary_table_add_label():
    # when min_freq is 1
    vocab_table = VocabularyTable()
    vocab_table.add_label('私')
    assert vocab_table.num_labels() == 5
    assert vocab_table.get_label_id('私') == 4
    assert vocab_table.get_label(4) == '私'

    # when min_freq is 2 or more
    vocab_table = VocabularyTable(min_freq=2)
    vocab_table.add_label('私')
    assert vocab_table.num_labels() == 4
    assert vocab_table.get_label_id('私') == vocab_table.get_unk_id()
    vocab_table.add_label('私')
    assert vocab_table.num_labels() == 5
    assert vocab_table.get_label_id('私') == 4

def test_vocabulary_table_get_eos_id():
    vocab_table = VocabularyTable()
    assert vocab_table.get_eos_id() == 3
    assert vocab_table.get_label_id('<eos>') == 3
    assert vocab_table.get_label(3) == '<eos>'

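# The tests above pin down the VocabularyTable contract: ids 0-3 are reserved
# for '<pad>', '<unk>', '<bos>' and '<eos>'; add_label() counts occurrences
# and assigns the next id only once a label's frequency reaches min_freq;
# unknown labels map to the '<unk>' id; and load() re-applies min_freq to the
# saved counts. The minimal sketch below satisfies these tests, but the
# on-disk format ("label<TAB>count" per line) and all internals are
# assumptions, not the repository's actual implementation.
from collections import OrderedDict


class VocabularyTable:
    SPECIALS = ('<pad>', '<unk>', '<bos>', '<eos>')

    def __init__(self, min_freq=1):
        self.min_freq = min_freq
        self._freqs = OrderedDict()  # label -> observed count
        self._label_to_id = {s: i for i, s in enumerate(self.SPECIALS)}
        self._id_to_label = list(self.SPECIALS)

    def add_label(self, label):
        # Count every occurrence; promote the label to a real id once its
        # frequency reaches min_freq.
        self._freqs[label] = self._freqs.get(label, 0) + 1
        if (self._freqs[label] >= self.min_freq
                and label not in self._label_to_id):
            self._label_to_id[label] = len(self._id_to_label)
            self._id_to_label.append(label)

    def num_labels(self):
        return len(self._id_to_label)

    def get_label_id(self, label):
        return self._label_to_id.get(label, self.get_unk_id())

    def get_label(self, label_id):
        return self._id_to_label[label_id]

    def get_pad_id(self):
        return 0

    def get_unk_id(self):
        return 1

    def get_bos_id(self):
        return 2

    def get_eos_id(self):
        return 3

    def save(self, path):
        # Persist only non-special labels with their raw counts, so that
        # load() can re-apply an arbitrary min_freq threshold.
        with open(path, 'w', encoding='utf-8') as f:
            for label, freq in self._freqs.items():
                f.write('{}\t{}\n'.format(label, freq))

    @classmethod
    def load(cls, path, min_freq=1):
        table = cls(min_freq=min_freq)
        with open(path, encoding='utf-8') as f:
            for line in f:
                label, freq = line.rstrip('\n').split('\t')
                for _ in range(int(freq)):
                    table.add_label(label)
        return table
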
parser.add_argument('--vocabulary-table-file',
                    type=str, default='vocab.txt',
                    help='Vocabulary table file name')
parser.add_argument('--dataset-type', type=str, default='csj',
                    choices=['csj'], help='Dataset type to use')
parser.add_argument('--use-subset', action='store_true',
                    help='Use only a subset of the dataset')
args = parser.parse_args()

print('Creating vocabulary table and corpus ...')
corpus = Corpus()
vocabulary_table = VocabularyTable()
if args.dataset_type == 'csj':
    csj_parser = CSJParser(args.dataset_path)
    tr_talk_sets, _ = csj_parser.get_talks(1)
    for tr_talks in tr_talk_sets:
        csj_parser.add_vocabulary(tr_talks, vocabulary_table, corpus,
                                  only_core=args.use_subset)
else:
    raise ValueError('dataset_type: {} is not supported'.format(
        args.dataset_type))

print('Saving corpus ...')
corpus_path = os.path.join(args.workdir, args.corpus_file)
corpus.save(corpus_path)

print('Saving vocabulary table ...')
# Assumed completion: save to the same path the training script loads from.
vocabulary_table.save(os.path.join(args.workdir, args.vocabulary_table_file))