Esempio n. 1
0
def main():
    dataset_train = ASR_align_DataSet(file=[args.dirs.train.data],
                                      args=args,
                                      _shuffle=False,
                                      transform=True)
    dataset_dev = ASR_align_DataSet(file=[args.dirs.dev.data],
                                    args=args,
                                    _shuffle=False,
                                    transform=True)
    tfdata_train = TFData(dataset=dataset_train,
                          dataAttr=['feature', 'label', 'align'],
                          dir_save=args.dirs.train.tfdata,
                          args=args)
    tfdata_dev = TFData(dataset=dataset_dev,
                        dataAttr=['feature', 'label', 'align'],
                        dir_save=args.dirs.dev.tfdata,
                        args=args)
    tfdata_train.save('0')
    tfdata_dev.save('0')
    # tfdata_train.get_bucket_size(100, True)
    # split_save()
    # for sample in tfdata_dev.read():
    # # for sample in dataset_train:
    #     # print(sample['feature'].shape)
    #     # print(sample['label'])
    #     print(sample[0].shape)
    #     import pdb; pdb.set_trace()
    dataset_train.get_dataset_ngram(n=args.data.ngram,
                                    k=10000,
                                    savefile=args.dirs.ngram)
Esempio n. 2
0
def split_save(capacity=10000):
    fw = open(args.dirs.train.tfdata / '0.csv', 'w')
    with open(args.dirs.train.data) as f:
        for num, line in enumerate(f):
            if num % capacity == 0:
                idx_file = num // capacity
                print('package file ', idx_file)
                try:
                    fw.close()
                    fw = open(
                        args.dirs.train.tfdata / (str(idx_file) + '.csv'), 'w')
                except:
                    pass
            fw.write(line)
    print('processed {} utts.'.format(num + 1))
    fw.close()

    for i in Path(args.dirs.train.tfdata).glob('*.csv'):
        print('converting {}.csv to record'.format(i.name))
        dataset_train = ASR_align_DataSet(file=[i],
                                          args=args,
                                          _shuffle=False,
                                          transform=True)
        tfdata_train = TFData(dataset=dataset_train,
                              dataAttr=['feature', 'label', 'align'],
                              dir_save=args.dirs.train.tfdata,
                              args=args)

        tfdata_train.save(i.name.split('.')[0])