",": "", ":": "", ";": "", "?": "", "\\": " ", "\t": " " } from_chars = ''.join(normdict.keys()) to_chars = ''.join(normdict.values()) #t_table = maketrans(from_chars, to_chars) ## Main numtable = writenumbers.loadNumTable(sys.argv[1]) transcript = codecs.open(sys.argv[2], "r", "utf8") outtext = codecs.open(sys.argv[3], "w", "utf8") for line in transcript: normtext1 = re.sub(r'[\.,:;\?]', '', line) normtext2 = re.sub(r'[\t\\]', ' ', normtext1) normtext3 = re.sub(r' +', ' ', normtext2.strip()) normtext4 = writenumbers.normNumber(normtext3, numtable) outtext.write(normtext4) transcript.close() outtext.close()
} t_table = str.maketrans(normdict) ## Utility function def getuttid_text(line): return line.split(" ", 1) ## Main numtable = writenumbers.loadNumTable(sys.argv[1]) textin = codecs.open(sys.argv[2], "r", "utf8") fid = codecs.open(sys.argv[3], "w", "utf8") outtext = codecs.open(sys.argv[4], "w", "utf8") for line in textin: utt_id, text = getuttid_text(line) normtext1 = text.translate(t_table) normtext2 = re.sub(r' +', ' ', normtext1.strip()) normtext3 = writenumbers.normNumber(normtext2, numtable) fid.write(utt_id + "\n") outtext.write(normtext3) textin.close() outtext.close() fid.close()
import codecs
import sys
import re
import writenumbers

# string.maketrans exists only on Python 2; on Python 3 the import used to
# abort the script before any work was done.  The name is unused below but is
# kept available on both versions.
try:
    from string import maketrans
except ImportError:
    maketrans = str.maketrans

## Global vars
# Characters to normalise: punctuation is deleted, backslash and tab become a
# space.
normdict = {".": "", ",": "", ":": "", ";": "", "?": "", "\\": " ", "\t": " "}
# Source/target character strings derived from normdict.  They were the inputs
# of a maketrans() table that is no longer built; the names are kept so the
# module's globals stay unchanged.
from_chars = ''.join(list(normdict.keys()))
to_chars = ''.join(list(normdict.values()))

# Patterns are loop-invariant, so they are compiled once up front.
_PUNCT_RE = re.compile(r'[\.,:;\?]')
_BLANK_RE = re.compile(r'[\t\\]')
_SPACES_RE = re.compile(r' +')

## Main
# Command line: argv[1] = number table, argv[2] = input transcript (UTF-8),
# argv[3] = normalised output text (UTF-8).
numtable = writenumbers.loadNumTable(sys.argv[1])

# The with-statement closes both files even when normalisation or I/O raises,
# so buffered output is flushed and no handle is leaked.
with codecs.open(sys.argv[2], "r", "utf8") as transcript, \
        codecs.open(sys.argv[3], "w", "utf8") as outtext:
    for line in transcript:
        # Delete the punctuation marks . , : ; ?
        normtext1 = _PUNCT_RE.sub('', line)
        # Tabs and backslashes each become one space.
        normtext2 = _BLANK_RE.sub(' ', normtext1)
        # strip() also removes the trailing newline; runs of spaces collapse.
        normtext3 = _SPACES_RE.sub(' ', normtext2.strip())
        # NOTE(review): nothing here re-adds the newline removed by strip();
        # presumably normNumber() terminates its result - confirm.
        normtext4 = writenumbers.normNumber(normtext3, numtable)
        outtext.write(normtext4)