Esempio n. 1
0
def get_vocabulary_table(workdir, words):
    """Build a VocabularyTable from word entries.

    Each item of *words* is a mapping with a 'word' key whose value is
    registered as a label via ``add_label``.

    NOTE(review): *workdir* is accepted but never used in this body —
    confirm at the call sites whether it can eventually be dropped.
    """
    table = VocabularyTable()
    for entry in words:
        table.add_label(entry['word'])
    return table
Esempio n. 2
0
# ---- Command-line options (the ArgumentParser is created above this chunk) ----
parser.add_argument('--lr', type=float, default=0.01, help='Learning rate')
parser.add_argument('--epochs',
                    type=int,
                    default=5,
                    help='Number of epochs for training')
parser.add_argument('--batch-size', type=int, default=32, help='Batch size')
parser.add_argument('--device', type=str, default='cpu', help='Device string')
parser.add_argument('--resume',
                    action='store_true',
                    help="Training resumes from a model file "
                    "specified by '--model-file' option")
args = parser.parse_args()

# Load the label<->id vocabulary; words rarer than --min-word-frequency are
# mapped to <unk> at load time (see VocabularyTable.load(min_freq=...)).
print('Loading vocabulary table ...')
vocab_path = os.path.join(args.workdir, args.vocabulary_table_file)
vocab_table = VocabularyTable.load(vocab_path,
                                   min_freq=args.min_word_frequency)
# Training data: batched with the model-specific collate function so that
# variable-length audio/label sequences can be stacked.
print('Loading training data ...')
training_data_dirpath = os.path.join(args.workdir, args.training_data_dirname)
repository_tr = TrainingDatasetRepository(training_data_dirpath)
dataset_tr = IterableAudioDataset(repository_tr, vocab_table)
dataloader_tr = DataLoader(dataset_tr,
                           batch_size=args.batch_size,
                           collate_fn=ListenAttendSpell.collate)
# Development (validation) data: one DataLoader per dev dataset is collected
# into dataloaders_dev by the loop that follows.
print('Loading development data ...')
development_data_dirpath = os.path.join(args.workdir,
                                        args.development_data_dirname)
repository_dev = DevelopmentDatasetRepository(development_data_dirpath)
dataloaders_dev = []
for dataset_dev in AudioDataset.load_all(repository_dev, vocab_table):
    dataloader_dev = DataLoader(dataset_dev,
                                batch_size=args.batch_size,
def test_vocabulary_table_get_unk_id():
    """A fresh table reserves id 1 for the <unk> token."""
    table = VocabularyTable()
    unk_id = table.get_unk_id()
    assert unk_id == 1
    assert table.get_label_id('<unk>') == 1
    assert table.get_label(1) == '<unk>'
def test_vocabulary_table_get_bos_id():
    """A fresh table reserves id 2 for the <bos> (begin-of-sequence) token."""
    table = VocabularyTable()
    bos_id = table.get_bos_id()
    assert bos_id == 2
    assert table.get_label_id('<bos>') == 2
    assert table.get_label(2) == '<bos>'
def test_vocabulary_table_get_pad_id():
    """A fresh table reserves id 0 for the <pad> token."""
    table = VocabularyTable()
    pad_id = table.get_pad_id()
    assert pad_id == 0
    assert table.get_label_id('<pad>') == 0
    assert table.get_label(0) == '<pad>'
def test_vocabulary_table_save_and_load(workdir):
    """Round-trip a table through save/load and check min_freq filtering."""
    path = os.path.join(workdir, 'vocab.txt')
    table = VocabularyTable()
    table.add_label('私')        # seen once
    table.add_label('あなた')    # seen twice
    table.add_label('あなた')
    table.save(path)

    # Default load: every word survives, ids assigned after the four
    # reserved special tokens in registration order.
    reloaded = VocabularyTable.load(path)
    expected = ['<pad>', '<unk>', '<bos>', '<eos>', '私', 'あなた']
    for label_id, label in enumerate(expected):
        assert reloaded.get_label_id(label) == label_id
        assert reloaded.get_label(label_id) == label

    # min_freq=2 drops 私 (frequency 1); あなた takes the first free id.
    reloaded = VocabularyTable.load(path, min_freq=2)
    assert reloaded.get_label_id('あなた') == 4
    assert reloaded.get_label(4) == 'あなた'
    assert reloaded.get_label_id('私') == reloaded.get_unk_id()

    # min_freq=3 filters out every non-special word.
    reloaded = VocabularyTable.load(path, min_freq=3)
    assert reloaded.get_label_id('あなた') == reloaded.get_unk_id()
    assert reloaded.get_label_id('私') == reloaded.get_unk_id()
def test_vocabulary_table_add_label():
    """add_label assigns an id only once a word's count reaches min_freq."""
    # Default min_freq (1): a single occurrence registers immediately,
    # taking the first id after the four reserved special tokens.
    table = VocabularyTable()
    table.add_label('私')
    assert table.num_labels() == 5
    assert table.get_label_id('私') == 4
    assert table.get_label(4) == '私'

    # min_freq=2: the first occurrence still maps to <unk>; the second
    # occurrence promotes the word to a real id.
    table = VocabularyTable(min_freq=2)
    table.add_label('私')
    assert table.num_labels() == 4
    assert table.get_label_id('私') == table.get_unk_id()
    table.add_label('私')
    assert table.num_labels() == 5
    assert table.get_label_id('私') == 4
def test_vocabulary_table_get_eos_id():
    """A fresh table reserves id 3 for the <eos> (end-of-sequence) token."""
    table = VocabularyTable()
    eos_id = table.get_eos_id()
    assert eos_id == 3
    assert table.get_label_id('<eos>') == 3
    assert table.get_label(3) == '<eos>'
                    type=str,
                    default='vocab.txt',
                    help='Vocabulary table file name')
parser.add_argument('--dataset-type',
                    type=str,
                    default='csj',
                    choices=['csj'],
                    help='Dataset type to use')
parser.add_argument('--use-subset',
                    action='store_true',
                    help='A flag whether to use a subset of the dataset')
args = parser.parse_args()

# Build the vocabulary table and text corpus from the raw dataset.
print('Creating vocabulary table and corpus ...')
corpus = Corpus()
vocabulary_table = VocabularyTable()
if args.dataset_type == 'csj':
    # 'csj' is currently the only dataset type accepted by the parser
    # (presumably the Corpus of Spontaneous Japanese — confirm).
    csj_parser = CSJParser(args.dataset_path)
    # NOTE(review): magic argument 1 to get_talks — meaning unclear from
    # here; verify against the CSJParser API.
    tr_talk_sets, _ = csj_parser.get_talks(1)
    for tr_talks in tr_talk_sets:
        csj_parser.add_vocabulary(tr_talks,
                                  vocabulary_table,
                                  corpus,
                                  only_core=args.use_subset)
else:
    # Unreachable while argparse `choices` restricts the value; kept as a
    # guard in case the check above is loosened.
    raise ValueError('dataset_type: {} is not supported'.format(
        args.dataset_type))
print('Saving corpus ...')
corpus_path = os.path.join(args.workdir, args.corpus_file)
corpus.save(corpus_path)
print('Saving vocabulary table ...')