def main():
    """Print each hypothesis utterance id that is absent from the reference.

    Opens the files named by ``hyp_file`` and ``ref_file`` (both defined
    outside this function) as UTF-8, iterates them with ``SnorIter``, and
    prints every hypothesis id not found among the reference ids, one per
    line, in the order the hypothesis file yields them.
    """
    with open(hyp_file, 'r', encoding='utf-8') as hyp_fh, \
            open(ref_file, 'r', encoding='utf-8') as ref_fh:
        # Collect all reference ids up front so the check below is a set probe.
        known_ids = {ref_id for _, ref_id in SnorIter(ref_fh)}

        for _, hyp_id in SnorIter(hyp_fh):
            if hyp_id in known_ids:
                continue
            print(hyp_id)
def main():
    """Rewrite a transcript, mapping selected Unicode punctuation to ASCII.

    Reads ``input_file`` and writes ``output_file`` (both defined outside
    this function) as UTF-8. For every ``(utt, uttid)`` pair produced by
    ``SnorIter`` the utterance is written with these substitutions, followed
    by `` (uttid)`` and a newline:

    * U+25CF, U+2022, U+2219 (dots / filled circles) -> ``.``
    * U+2010 .. U+2015 (Unicode dashes) -> ``-``
    * U+FF09 / U+FF08 (full-width parentheses) -> ``)`` / ``(``

    Every other character is copied through unchanged.
    """
    period = "\u002e"
    hyphen = "\u002d"
    replacements = {
        # Dots / filled circles become a regular period.
        "\u25cf": period,
        "\u2022": period,
        "\u2219": period,
        # Various Unicode dashes become a regular hyphen.
        "\u2010": hyphen,
        "\u2011": hyphen,
        "\u2012": hyphen,
        "\u2013": hyphen,
        "\u2014": hyphen,
        "\u2015": hyphen,
        # Full-width parentheses become regular parentheses.
        "\uff09": "\u0029",
        "\uff08": "\u0028",
    }

    with open(input_file, 'r', encoding='utf-8') as fh, \
            open(output_file, 'w', encoding='utf-8') as fh_out:
        for utt, uttid in SnorIter(fh):
            # Characters without an entry in the table are kept as they are.
            fh_out.write("".join(replacements.get(ch, ch) for ch in utt))
            # Finish the line with the utterance id.
            fh_out.write(" (%s)\n" % uttid)
def main():
    """Rewrite a transcript with whitespace runs collapsed to a single space.

    Reads ``input_file`` and writes ``output_file`` (both defined outside
    this function) as UTF-8. For every ``(utt, uttid)`` pair produced by
    ``SnorIter``:

    * leading and trailing whitespace is removed from the utterance;
    * each run of space, tab, or no-break space (U+00A0) inside the
      utterance is written as one ASCII space;
    * the line is finished with `` (uttid)`` and a newline.

    The same three characters are used both for trimming and for
    collapsing. Trimming only the ASCII space would let a leading tab or
    no-break space survive as a stray leading space, and a trailing one
    would produce a double space in front of the utterance id.
    """
    # Characters treated as whitespace; loop-invariant, so built only once.
    space_chars = " \t\u00a0"

    with open(input_file, 'r', encoding='utf-8') as fh, \
            open(output_file, 'w', encoding='utf-8') as fh_out:
        for utt, uttid in SnorIter(fh):
            # Strip every kind of whitespace we collapse, not just U+0020.
            utt = utt.strip(space_chars)

            # Only output one space at a time.
            last_char_was_space = False
            for char in utt:
                if char in space_chars:
                    if not last_char_was_space:
                        fh_out.write(" ")
                    last_char_was_space = True
                else:
                    fh_out.write(char)
                    last_char_was_space = False

            # Finally, write the utterance id and a newline.
            fh_out.write(" (%s)\n" % uttid)
def main():
    """Rewrite a transcript with every word spelled as ``_``-joined char codes.

    Reads ``input_file`` and writes ``output_file`` (both defined outside
    this function) as UTF-8. Each utterance yielded by ``SnorIter`` is split
    on whitespace; every character of a word is passed through
    ``utf8_char_to_uxxxx`` and the results for one word are joined with
    ``_``. Each word is followed by a single space, and the line ends with
    ``(uttid)`` and a newline.
    """
    with open(input_file, 'r', encoding='utf-8') as fh, \
            open(output_file, 'w', encoding='utf-8') as fh_out:
        for utt, uttid in SnorIter(fh):
            for token in utt.split():
                # str.split() never yields empty tokens, so every token
                # contributes at least one code.
                encoded = "_".join(utf8_char_to_uxxxx(ch) for ch in token)
                fh_out.write(encoded + " ")
            # Close the line with the utterance id.
            fh_out.write("(%s)\n" % uttid)
def main():
    """Rewrite a transcript as space-separated characters.

    Reads ``input_file`` and writes ``output_file`` (both defined outside
    this function) as UTF-8. Every character of each utterance yielded by
    ``SnorIter`` is written followed by one space; an ASCII space in the
    utterance is written as the token ``<sp>`` instead. The line ends with
    ``(uttid)`` and a newline.
    """
    with open(input_file, 'r', encoding='utf-8') as fh, \
            open(output_file, 'w', encoding='utf-8') as fh_out:
        for utt, uttid in SnorIter(fh):
            # One output token (with its trailing space) per input character.
            pieces = ["<sp> " if ch == " " else ch + " " for ch in utt]
            fh_out.write("".join(pieces))
            # Close the line with the utterance id.
            fh_out.write("(%s)\n" % uttid)
def main():
    """Rewrite a space-separated character transcript as uXXXX codes.

    Reads ``input_file`` and writes ``output_file`` (both defined outside
    this function) as UTF-8. Each utterance yielded by ``SnorIter`` is split
    on whitespace. The token ``<sp>`` is written as ``u0020``; any other
    token is passed through ``utf8_char_to_uxxxx``. Every output token is
    followed by one space, and the line ends with ``(uttid)`` and a newline.
    """
    with open(input_file, 'r', encoding='utf-8') as fh, \
            open(output_file, 'w', encoding='utf-8') as fh_out:
        for utt, uttid in SnorIter(fh):
            encoded = [
                "u0020 " if token == "<sp>" else utf8_char_to_uxxxx(token) + " "
                for token in utt.split()
            ]
            fh_out.write("".join(encoded))
            # Close the line with the utterance id.
            fh_out.write("(%s)\n" % uttid)
def main():
    """Copy a transcript, leaving out utterances whose id is listed in a file.

    ``input_ids_file``, ``input_trans`` and ``output_trans`` are defined
    outside this function. The ids file is read line by line, each line
    stripped of surrounding whitespace. The transcript is read and written
    as UTF-8; every ``(utt, uttid)`` pair from ``SnorIter`` whose id is not
    in the loaded set is written as ``utt (uttid)`` plus a newline.
    """
    # Load the ids to drop; one id per line.
    with open(input_ids_file, 'r') as ids_fh:
        excluded = {raw_line.strip() for raw_line in ids_fh}

    # Stream the transcript and keep only the utterances not excluded.
    with open(input_trans, 'r', encoding='utf-8') as fh, \
            open(output_trans, 'w', encoding='utf-8') as fh_out:
        for utt, uttid in SnorIter(fh):
            if uttid not in excluded:
                fh_out.write("%s (%s)\n" % (utt, uttid))
def main():
    """Rewrite a transcript with selected characters padded by spaces.

    Reads ``input_file`` and writes ``output_file`` (both defined outside
    this function) as UTF-8. For each utterance yielded by ``SnorIter`` a
    character is written with a space on both sides when

    * ``split_punc`` is set and the character is in ``punc`` or in
      ``currency_symbols``, or
    * ``split_digits`` is set and the character is in ``digits``;

    all other characters are copied unchanged. The line ends with
    `` (uttid)`` and a newline. The flags and character collections are
    names defined outside this function.
    """
    with open(input_file, 'r', encoding='utf-8') as fh, \
            open(output_file, 'w', encoding='utf-8') as fh_out:
        for utt, uttid in SnorIter(fh):
            pieces = []
            for ch in utt:
                if (split_punc and (ch in punc or ch in currency_symbols)) or (
                        split_digits and ch in digits):
                    # Isolate the character as its own token.
                    pieces.append(" " + ch + " ")
                else:
                    pieces.append(ch)
            fh_out.write("".join(pieces))
            # Close the line with the utterance id.
            fh_out.write(" (%s)\n" % uttid)
def main():
    """Rewrite a transcript with Arabic-script characters normalized.

    Reads ``input_file`` and writes ``output_file`` (both defined outside
    this function) as UTF-8. For every character of each utterance yielded
    by ``SnorIter``:

    1. if it is a key of ``PRESENTATION_TO_BASE`` it is replaced by the
       mapped value (presentation form -> base form);
    2. tatweel (U+0640), the marks U+064B .. U+0655, and the RTL/LTR marks
       (U+200F, U+200E) are dropped;
    3. keheh, Farsi yeh, the rial ligature, the extended Arabic-Indic
       digits and the Arabic comma are replaced as listed in the table
       below;
    4. anything else is written unchanged.

    The line is finished with `` (uttid)`` and a newline.
    """
    # Characters that are removed from the output entirely.
    dropped = frozenset(
        "\u0640"  # tatweel
        # vowels and hamza
        "\u064b\u064c\u064d\u064e\u064f\u0650\u0651\u0652\u0653\u0654\u0655"
        "\u200f\u200e"  # RTL and LTR marks
    )

    # Character-level replacements.
    substitutions = {
        "\u06a9": "\u0643",  # Arabic keheh -> Arabic kaf
        "\u06cc": "\u064a",  # Farsi yeh -> Arabic yeh
        # Ligature for the rial sign -> sequence of characters.
        # NOTE(review): the expansion contains U+06CC, and because it is
        # written out directly it is not rewritten to U+064A like other
        # Farsi yehs -- confirm this is intended.
        "\ufdfc": "\u0631\u06cc\u0627\u0644",
        # Extended (Farsi) Arabic-Indic digits -> regular Arabic-Indic digits.
        "\u06f0": "\u0660",
        "\u06f1": "\u0661",
        "\u06f2": "\u0662",
        "\u06f3": "\u0663",
        "\u06f4": "\u0664",
        "\u06f5": "\u0665",
        "\u06f6": "\u0666",
        "\u06f7": "\u0667",
        "\u06f8": "\u0668",
        "\u06f9": "\u0669",
        "\u060c": "\u002c",  # Arabic comma -> regular comma
    }

    with open(input_file, 'r', encoding='utf-8') as fh, \
            open(output_file, 'w', encoding='utf-8') as fh_out:
        for utt, uttid in SnorIter(fh):
            for symbol in utt:
                # First, convert from presentation form to base form.
                if symbol in PRESENTATION_TO_BASE:
                    symbol = PRESENTATION_TO_BASE[symbol]

                if symbol in dropped:
                    continue

                # Unlisted characters are written without modification.
                fh_out.write(substitutions.get(symbol, symbol))

            # Finally, write the utterance id and a newline.
            fh_out.write(" (%s)\n" % uttid)