'--export-dir', dest='export_dir', help= 'Export dir for nonsilence_phones.txt, silence_phones.txt and extra_questions.txt', type=str, default='data/local/dict/') args = parser.parse_args() if (args.singlefile != ''): ids = [args.singlefile] else: if args.filelist == '': print('No files specified for processing!') sys.exit() ids = common_utils.loadIdFile(args.filelist) combinedDict = {} for myid in ids: print("I'm now opening ", myid) importer = guessImportFunc(myid) d = importer(myid) combinedDict = merge_dicts(combinedDict, d) variants = 0 for key in sorted(combinedDict.keys()): #print 'Word:',key,combinedDict[key] variants += len(combinedDict[key]) print('Dictionary size is ', len(combinedDict),
# Register the utterance postfix option. `parser` is created earlier in the
# script, outside this fragment.
parser.add_argument('-p', '--utterance-postfix-name', dest='postfix',
                    help='--utterance-postfix-name', type=str, default='')
args = parser.parse_args()

# Each message is handed to print as ONE pre-formatted string in parentheses.
# That form is a valid print statement on Python 2 and a valid function call on
# Python 3, and it yields the same text on both. The doubled spaces reproduce
# what the former comma-separated print statements emitted.
if args.filelist == '':
    print('Corpus filelist is empty. Use -f to supply a filelist!')
else:
    print('Load  %s , ommit  %s' % (args.filelist, args.remove_extension))
    ids_raw = common_utils.loadIdFile(
        args.filelist, remove_extension=args.remove_extension)
    print('I have %d files. Some may have their audio missing, '
          'I\'ll check that for you...' % (len(ids_raw),))

    ids = []

    # Check for missing wav files.
    # NOTE(review): `omitted` is initialised here but never incremented in the
    # part of the script visible in this fragment - confirm against the rest
    # of the file.
    omitted = 0
    for myid in ids_raw:
        # Preferred recording: <id><postfix><audio extension>.
        check = myid + args.postfix + args.wav_extension
        # Fallback recording: the '_Kinect-Beam' variant of the same id.
        # `or` short-circuits, so the fallback is only probed when the
        # preferred file is missing, exactly like the former if/elif chain.
        fallback = myid + '_Kinect-Beam' + args.wav_extension
        if os.path.isfile(check) or os.path.isfile(fallback):
            ids.append(myid)
        else:
            print('Warning, omitting %s because I can\'t find %s'
                  % (myid, check))
if __name__ == '__main__':
    # Command-line entry point: parse the options, load the corpus file ids
    # and keep only the ids whose audio file exists on disk.
    parser = argparse.ArgumentParser(
        description='Prepares the files from the TUDA corpus (XML) into text transcriptions for KALDI')
    parser.add_argument('-f', '--filelist', dest='filelist',
                        help='process this file list', type=str, default='')
    parser.add_argument('-r', '--remove_extension', dest='remove_extension',
                        help='remove this extension, to get plain file id',
                        type=str, default='.xml')
    parser.add_argument('-w', '--audio-file-extension', dest='wav_extension',
                        help='extension for audio files', type=str,
                        default='.wav')
    parser.add_argument('-p', '--utterance-postfix-name', dest='postfix',
                        help='--utterance-postfix-name', type=str,
                        default='_Kinect-Beam')

    args = parser.parse_args()

    # Each message is handed to print as ONE pre-formatted string in
    # parentheses. That form is a valid print statement on Python 2 and a
    # valid function call on Python 3, and it yields the same text on both.
    # The doubled spaces reproduce what the former comma-separated print
    # statements emitted.
    if args.filelist == '':
        print('Corpus filelist is empty. Use -f to supply a filelist!')
    else:
        print('Load  %s , ommit  %s' % (args.filelist, args.remove_extension))
        ids_raw = common_utils.loadIdFile(
            args.filelist, remove_extension=args.remove_extension)
        print('I have %d files. Some may have their audio missing, '
              'I\'ll check that for you...' % (len(ids_raw),))

        ids = []

        # Check for missing wav files: an id is kept only when
        # <id><postfix><audio extension> is an existing file.
        omitted = 0
        for myid in ids_raw:
            check = myid + args.postfix + args.wav_extension
            if os.path.isfile(check):
                ids.append(myid)
            else:
                print('Warning, omitting %s because I can\'t find %s'
                      % (myid, check))
                omitted += 1

        print('Found %d  wav files.' % (len(ids),))
        print('Omitted  %d  xml transcription files (Some missing files is '
              'normal for the TUDA Kaldi corpus).' % (omitted,))
# Command-line interface of the lexicon builder.
parser = argparse.ArgumentParser(
    description='Prepares various sources of pronounciations and builds a lexicon that can be exported to KALDI')
parser.add_argument('-f', '--filelist', dest='filelist',
                    help='Process this file list of lexicons', type=str,
                    default='')
parser.add_argument('-s', '--single-file', dest='singlefile',
                    help='Process this single lexicon file', type=str,
                    default='')
parser.add_argument('-e', '--export-pickle', dest='export',
                    help='Export pickle file of combined phoneme dictionary',
                    type=str, default='')
parser.add_argument('-d', '--export-dir', dest='export_dir',
                    help='Export dir for nonsilence_phones.txt, silence_phones.txt and extra_questions.txt',
                    type=str, default='data/local/dict/')

args = parser.parse_args()

# A single lexicon given with -s takes precedence over the -f file list.
if args.singlefile != '':
    ids = [args.singlefile]
else:
    if args.filelist == '':
        # Nothing to do without an input. Each message is printed as one
        # pre-formatted string in parentheses, which is valid and produces
        # the same text on both Python 2 and Python 3.
        print('No files specified for processing!')
        sys.exit()
    ids = common_utils.loadIdFile(args.filelist)

# Import every lexicon with the function chosen for it by guessImportFunc and
# fold the result into one dictionary via merge_dicts.
combinedDict = {}
for myid in ids:
    # The doubled space reproduces the former comma-separated print output.
    print("I'm now opening  %s" % (myid,))
    importer = guessImportFunc(myid)
    d = importer(myid)
    combinedDict = merge_dicts(combinedDict, d)

# Sum the lengths of all entries (presumably one item per pronunciation
# variant of a word - verify against merge_dicts). Iterating the dict directly
# replaces dict.iterkeys(), which no longer exists on Python 3.
variants = 0
for key in sorted(combinedDict):
    variants += len(combinedDict[key])

print('Dictionary size is  %d  pronounciation variants  %d'
      % (len(combinedDict), variants))