import json
from collections import Counter


def main():
    print_arguments(args)
    print('Generating the data manifest...')
    create_manifest(annotation_path=args.annotation_path,
                    manifest_path_prefix=args.manifest_prefix)
    print('Building the vocabulary...')
    counter = Counter()
    count_manifest(counter, args.manifest_path)
    # Sort characters by frequency, highest first
    count_sorted = sorted(counter.items(), key=lambda x: x[1], reverse=True)
    with open(args.vocab_path, 'w', encoding='utf-8') as fout:
        labels = ['?']  # index 0 is reserved for the unknown/blank label
        for char, count in count_sorted:
            # Counts are in descending order, so everything after the first
            # character below the threshold can be dropped as well
            if count < args.count_threshold:
                break
            labels.append(char)
        # Write the vocabulary as a proper JSON array (escapes quotes safely,
        # unlike the str().replace() round-trip)
        json.dump(labels, fout, ensure_ascii=False)
    print('Vocabulary built!')
    print('Sampling %s utterances to compute the mean and standard deviation...' % args.num_samples)
    compute_mean_std(args.manifest_path, args.num_samples, args.output_path)
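# --- Example (illustrative, not part of the original script) ---
# The vocabulary file written above is a plain JSON array of characters,
# with index 0 reserved for the '?' placeholder. A downstream consumer
# might read it back like this; 'dataset/zh_vocab.json' matches the
# default path used by the other scripts, and the lookup table below is
# a hypothetical addition.
import json

with open('dataset/zh_vocab.json', encoding='utf-8') as f:
    labels = json.load(f)                            # e.g. ['?', '的', '一', ...]
char2idx = {c: i for i, c in enumerate(labels)}      # char -> CTC label index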
import json
import os

import torch


def main():
    print_arguments(args)
    # Create the directory for saving model checkpoints
    if not os.path.exists(args.save_model_path):
        os.makedirs(args.save_model_path)
    # Load the vocabulary (a JSON list of characters; json.load is the safe
    # replacement for eval() on the file contents)
    with open(args.vocab_path, 'r', encoding='utf-8') as f:
        vocabulary = json.load(f)
    vocabulary = ''.join(vocabulary)
    # Build the model
    model = GatedConv(vocabulary)
    # Optionally restore a pretrained model
    if args.restore_model:
        model = torch.load(args.restore_model)
    model = model.cuda()
    train(model=model,
          train_manifest_path=args.train_manifest_path,
          dev_manifest_path=args.dev_manifest_path,
          vocab_path=args.vocab_path,
          epochs=args.epochs,
          batch_size=args.batch_size,
          learning_rate=args.learning_rate)
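# --- Sketch of one CTC training step (hypothetical; the real train() is
# defined elsewhere in the repo). It assumes the model returns per-timestep
# log probabilities shaped (time, batch, vocab), which is what nn.CTCLoss
# expects; the tensor names and shapes are illustrative only. ---
import torch.nn as nn

ctc_loss = nn.CTCLoss(blank=0)  # index 0 is the blank/'?' label

def train_step(model, optimizer, inputs, targets, input_lengths, target_lengths):
    optimizer.zero_grad()
    log_probs = model(inputs)                       # assumed (time, batch, vocab)
    loss = ctc_loss(log_probs, targets, input_lengths, target_lengths)
    loss.backward()
    optimizer.step()
    return loss.item()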
type=str, help="language model path. (default: %(default)s)") parser.add_argument("--dev_manifest_path", default="dataset/manifest.dev", type=str, help="train manifest file path. (default: %(default)s)") parser.add_argument("--vocab_path", default="dataset/zh_vocab.json", type=str, help="vocab file path. (default: %(default)s)") parser.add_argument("--batch_size", default=64, type=int, help="number for batch size. (default: %(default)s)") args = parser.parse_args() print_arguments(args) alpha = 0.8 beta = 0.3 cutoff_top_n = 40 cutoff_prob = 1.0 beam_width = 32 num_processes = 4 blank_index = 0 model = torch.load(args.model_path) model = model.cuda() model.eval() # 创建解码器 decoder = CTCBeamDecoder(model.vocabulary, args.lm_path, alpha, beta,