",": "",
            ":": "",
            ";": "",
            "?": "",
            "\\": " ",
            "\t": " "
            }

# Flatten normdict into two parallel strings: the characters to look for
# (its keys) and their replacements (its values), both in dict order.
from_chars = ''.join(normdict)
to_chars = ''.join(normdict.values())

# NOTE(review): normdict maps several characters to the empty string, so
# to_chars ends up shorter than from_chars; a two-string maketrans() needs
# equal lengths, which is presumably why the call below is disabled.
#t_table = maketrans(from_chars, to_chars)


## Main

# Positional command-line arguments, as read from sys.argv below:
#   argv[1]  number table, handed to writenumbers.loadNumTable
#   argv[2]  input transcript, read as UTF-8
#   argv[3]  output file for the normalized text, written as UTF-8

# Patterns are compiled once, outside the per-line loop.
punct_re = re.compile(r'[\.,:;\?]')   # characters that are deleted
blank_re = re.compile(r'[\t\\]')      # tab / backslash -> single space
spaces_re = re.compile(r'  +')        # runs of two or more spaces

numtable = writenumbers.loadNumTable(sys.argv[1])

# The context managers close both files even when normalization raises
# part-way through, or when opening the second file fails.
with codecs.open(sys.argv[2], "r", "utf8") as transcript, \
        codecs.open(sys.argv[3], "w", "utf8") as outtext:
    for line in transcript:
        normtext1 = punct_re.sub('', line)
        normtext2 = blank_re.sub(' ', normtext1)
        # strip() also removes the line terminator.
        normtext3 = spaces_re.sub(' ', normtext2.strip())
        normtext4 = writenumbers.normNumber(normtext3, numtable)
        # NOTE(review): no newline is appended here; presumably normNumber
        # returns text that already ends with one -- confirm.
        outtext.write(normtext4)
            }

# Translation table for str.translate(), built in one step from the
# normdict mapping; the main loop below applies it to each line's text.
t_table = str.maketrans(normdict)


## Utility function

def getuttid_text(line):
    """Split *line* at its first space into an id field and the rest.

    Returns the list ``[id, rest]`` when *line* contains a space. When it
    does not, a one-element list holding the whole line is returned.
    """
    head, space, rest = line.partition(" ")
    if not space:
        return [head]
    return [head, rest]


## Main

# Positional command-line arguments, as read from sys.argv below:
#   argv[1]  number table, handed to writenumbers.loadNumTable
#   argv[2]  input file, read as UTF-8; each line is "<id> <text>"
#   argv[3]  output file receiving one id per line
#   argv[4]  output file receiving the normalized text

# Compiled once, outside the per-line loop.
spaces_re = re.compile(r'  +')   # runs of two or more spaces

numtable = writenumbers.loadNumTable(sys.argv[1])

# The context managers close all three files even when a line fails to
# normalize, or when opening one of the later files fails.
with codecs.open(sys.argv[2], "r", "utf8") as textin, \
        codecs.open(sys.argv[3], "w", "utf8") as fid, \
        codecs.open(sys.argv[4], "w", "utf8") as outtext:
    for line in textin:
        # NOTE(review): a line without any space yields a single field and
        # this unpacking raises ValueError -- confirm inputs always have one.
        utt_id, text = getuttid_text(line)
        normtext1 = text.translate(t_table)
        # strip() also removes the line terminator.
        normtext2 = spaces_re.sub(' ', normtext1.strip())
        normtext3 = writenumbers.normNumber(normtext2, numtable)

        fid.write(utt_id + "\n")
        # NOTE(review): unlike the id, the text is written without an added
        # newline; presumably normNumber's result already ends with one.
        outtext.write(normtext3)
Exemple #3
0
}

# Translation table for str.translate(), built in one step from the
# normdict mapping; the main loop below applies it to each line's text.
t_table = str.maketrans(normdict)

## Utility function


def getuttid_text(line):
    """Split *line* at its first space into an id field and the rest.

    Returns the list ``[id, rest]`` when *line* contains a space. When it
    does not, a one-element list holding the whole line is returned.
    """
    head, space, rest = line.partition(" ")
    return [head, rest] if space else [head]


## Main

# Positional command-line arguments, as read from sys.argv below:
#   argv[1]  number table, handed to writenumbers.loadNumTable
#   argv[2]  input file, read as UTF-8; each line is "<id> <text>"
#   argv[3]  output file receiving one id per line
#   argv[4]  output file receiving the normalized text

# Compiled once, outside the per-line loop.
spaces_re = re.compile(r'  +')   # runs of two or more spaces

numtable = writenumbers.loadNumTable(sys.argv[1])

# The context managers close all three files even when a line fails to
# normalize, or when opening one of the later files fails.
with codecs.open(sys.argv[2], "r", "utf8") as textin, \
        codecs.open(sys.argv[3], "w", "utf8") as fid, \
        codecs.open(sys.argv[4], "w", "utf8") as outtext:
    for line in textin:
        # NOTE(review): a line without any space yields a single field and
        # this unpacking raises ValueError -- confirm inputs always have one.
        utt_id, text = getuttid_text(line)
        normtext1 = text.translate(t_table)
        # strip() also removes the line terminator.
        normtext2 = spaces_re.sub(' ', normtext1.strip())
        normtext3 = writenumbers.normNumber(normtext2, numtable)

        fid.write(utt_id + "\n")
        # NOTE(review): unlike the id, the text is written without an added
        # newline; presumably normNumber's result already ends with one.
        outtext.write(normtext3)
Exemple #4
0
import codecs
import sys
import re
import writenumbers
from string import maketrans

## Global vars

# Per-character normalization map: the punctuation marks map to the empty
# string, backslash and tab map to a single space.
normdict = {
    ".": "",
    ",": "",
    ":": "",
    ";": "",
    "?": "",
    "\\": " ",
    "\t": " ",
}

# Keys and values flattened into strings, both in dict order.
from_chars = ''.join(normdict)
to_chars = ''.join(normdict.values())

# NOTE(review): to_chars (2 characters) is shorter than from_chars
# (7 characters); a two-string maketrans() needs equal lengths, which is
# presumably why the call below is disabled.
#t_table = maketrans(from_chars, to_chars)

## Main

# Positional command-line arguments, as read from sys.argv below:
#   argv[1]  number table, handed to writenumbers.loadNumTable
#   argv[2]  input transcript, read as UTF-8
#   argv[3]  output file for the normalized text, written as UTF-8

# Patterns are compiled once, outside the per-line loop.
punct_re = re.compile(r'[\.,:;\?]')   # characters that are deleted
blank_re = re.compile(r'[\t\\]')      # tab / backslash -> single space
spaces_re = re.compile(r'  +')        # runs of two or more spaces

numtable = writenumbers.loadNumTable(sys.argv[1])

# The context managers close both files even when normalization raises
# part-way through, or when opening the second file fails.
with codecs.open(sys.argv[2], "r", "utf8") as transcript, \
        codecs.open(sys.argv[3], "w", "utf8") as outtext:
    for line in transcript:
        normtext1 = punct_re.sub('', line)
        normtext2 = blank_re.sub(' ', normtext1)
        # strip() also removes the line terminator.
        normtext3 = spaces_re.sub(' ', normtext2.strip())
        normtext4 = writenumbers.normNumber(normtext3, numtable)
        # NOTE(review): no newline is appended here; presumably normNumber
        # returns text that already ends with one -- confirm.
        outtext.write(normtext4)