# Example 1
# Derive the base filename from the system-output path and build the two
# result paths (universal CSV and finalised transcript text).
# NOTE(review): `infile` and `re` are only defined/imported further down this
# file (L34-L40) — the script appears to be concatenated out of order; confirm.
filename_prep = re.search(r"(?<=system-output\/)(.*?)(?=\.txt)", infile).group(0)
outfile = "./results/google/universal/google-" + filename_prep + ".csv"
trans_file = "./results/google/system-trans-text/google-" + filename_prep + "-trans.txt"

# Setting an initial sentinel utterance, as jiwer can't handle empty strings.
# "tsotf" = "the start of the file" (the variable spelling is the real one;
# the earlier comment's "tsoft" was a typo).
prev = "tsotf"
utt = ""

# Google specific processing.
# Extracts each new hypothesis with its time and processes it; simultaneously,
# finalised hypotheses are stored for the final WER calculations.
#
# Guard defaults: previously, a "Transcript:" line appearing before any
# "Finished:"/"Time:" line raised a NameError on `fin`/`time`.
fin = "False"
time = ""
with open(infile, 'r') as f:
    for line in f:
        if line.startswith("Finished"):
            fin = re.search(r"(?<=Finished: )(.*)(?=\n)", line).group(0)
        if line.startswith("Time"):
            time = re.search(r"(?<=Time: )(.*)(?=\n)", line).group(0)
        if line.startswith("Transcript"):
            utt = re.search(r"(?<=Transcript: )(.*)(?=\n)", line).group(0)
            utt = utt.replace(".", "")
            # Every hypothesis (interim or final) is processed; a final one is
            # additionally stored for WER and resets the sentinel context.
            process(outfile, time, prev, utt)
            if fin == "False":
                prev = utt
            else:
                add_trans_chunk(trans_file, utt.lower())
                prev = "tsotf"

# Universal output finalised: tidy up the per-hypothesis CSV written above.
# NOTE(review): `clean_csv` (like `process`) is not defined or imported in this
# file — presumably provided by the `universal` module; confirm.
clean_csv(outfile)
from universal import add_trans_chunk
import sys
import re

# Given a system output and which channel (audio side) it is from, the final
# transcript is stored. This is used to calculate the WER.
infile = sys.argv[1]
side = sys.argv[2]
if side == "left":
    filename_prep = re.search(r"(?<=left\/)(.*?)(?=\.txt)", infile).group(0)
elif side == "right":
    filename_prep = re.search(r"(?<=right\/)(.*?)(?=\.txt)", infile).group(0)
else:
    # Previously this only printed and then crashed with a NameError on
    # `filename_prep` below; exit explicitly with the message instead.
    sys.exit("Which side?")
trans_file = "./results/msoft/split-system-trans-text/" + side + "-" + filename_prep + "-trans.txt"

# Pull every finalised "DisplayText" field out of the JSON result lines and
# append it, lower-cased, to the transcript file.
display_text = re.compile(r"(?<=DisplayText\":\")(.*?)(?=\")")
with open(infile, 'r') as f:
    for line in f:
        if not line.startswith("JSON"):
            continue
        found = display_text.search(line)
        if found is not None:
            add_trans_chunk(trans_file, found.group(0).lower())

# IBM specific processing.
# Extracts each new hypothesis with its time and processes it; simultaneously,
# finalised hypotheses are stored for the final WER calculations.
# NOTE(review): `utt`, `prev`, `outfile` and `process` are defined elsewhere
# (this file looks concatenated out of order); confirm before running standalone.
#
# Last numeric token seen on any line; None until one appears. Previously an
# unguarded use of `number` raised a NameError when the very first lines
# contained no numeric token.
number = None
with open(infile, 'r') as f:
    for line in f:
        check = line.replace(" ", "").replace("%HESITATION", "")
        # Keep the previous number when the current line has none (this
        # preserves the original try/except-AttributeError behaviour).
        num_match = re.search(r"[+-]?([0-9]*[.])?[0-9]+", line)
        if num_match:
            number = num_match.group(0)
        if check.startswith("\"transcript"):
            utt = re.search(r"(?<=transcript\"\: \")(.*?)(?=\")",
                            line).group(0)
            utt = utt.lower().replace("%hesitation", "")
        # A bare numeric line (no comma, so not a JSON field) is a timestamp.
        if number is not None and check.startswith(number) and "," not in line:
            time = check.replace("\n", "")
        if check.startswith("\"final"):
            if not utt.isspace():
                # Every hypothesis is processed; a final one is additionally
                # stored for WER and resets the sentinel context.
                process(outfile, time, prev, utt)
                if "false" in line:
                    prev = utt
                else:
                    add_trans_chunk(trans_file, utt)
                    prev = "tsotf"

# Universal output finalised: tidy up the per-hypothesis CSV written above.
clean_csv(outfile)