Ejemplo n.º 1
0
def main():
    """Print every hypothesis utterance id that is absent from the reference.

    Reads both ``hyp_file`` and ``ref_file`` as UTF-8 and iterates them with
    ``SnorIter``, which is unpacked here as ``(utt, uttid)`` pairs.  Neither
    the file-name variables nor ``SnorIter`` are defined in this block; they
    are expected to come from the surrounding scope.
    """
    with open(hyp_file, 'r', encoding='utf-8') as hyp_fh, \
            open(ref_file, 'r', encoding='utf-8') as ref_fh:
        # Gather all reference ids first so the lookups below are O(1).
        known_ids = {ref_id for _, ref_id in SnorIter(ref_fh)}

        for _, hyp_id in SnorIter(hyp_fh):
            if hyp_id in known_ids:
                continue
            print(hyp_id)
Ejemplo n.º 2
0
def main():
    """Normalize dot, dash and full-width parenthesis characters.

    Reads ``input_file`` (UTF-8) through ``SnorIter``, which is unpacked here
    as ``(utt, uttid)`` pairs, and writes each utterance to ``output_file``
    (UTF-8) with selected characters replaced, followed by `` (uttid)`` and a
    newline.  ``input_file``, ``output_file`` and ``SnorIter`` are not defined
    in this block; they are expected to come from the surrounding scope.
    """
    substitutions = {}

    # "Dots" / "filled circles" become a regular period.
    for dot in ("\u25cf", "\u2022", "\u2219"):
        substitutions[dot] = "\u002e"

    # The various Unicode dashes become a regular hyphen.
    for dash in ("\u2010", "\u2011", "\u2012", "\u2013", "\u2014", "\u2015"):
        substitutions[dash] = "\u002d"

    # Full-width parentheses become regular parentheses.
    substitutions["\uff09"] = "\u0029"
    substitutions["\uff08"] = "\u0028"

    with open(input_file, 'r', encoding='utf-8') as fh, \
            open(output_file, 'w', encoding='utf-8') as fh_out:
        for utt, uttid in SnorIter(fh):
            # Characters without an entry are written through unchanged.
            fh_out.writelines(substitutions.get(symbol, symbol)
                              for symbol in utt)

            # Finally, write out the utterance id and a newline.
            fh_out.write(" (%s)\n" % uttid)
def main():
    """Collapse runs of space-like characters to a single space.

    Reads ``input_file`` (UTF-8) through ``SnorIter``, which is unpacked here
    as ``(utt, uttid)`` pairs, and writes each utterance to ``output_file``
    (UTF-8) with:

    * leading and trailing space, tab and no-break-space characters removed;
    * every interior run of those characters replaced by one ASCII space;
    * `` (uttid)`` and a newline appended.

    ``input_file``, ``output_file`` and ``SnorIter`` are not defined in this
    block; they are expected to come from the surrounding scope.
    """
    # Characters treated as whitespace.  Built once rather than per utterance.
    strip_chars = " \t\u00a0"
    space_chars = frozenset(strip_chars)

    with open(input_file, 'r', encoding='utf-8') as fh, \
            open(output_file, 'w', encoding='utf-8') as fh_out:
        for utt, uttid in SnorIter(fh):
            last_char_was_space = False

            # Strip *all* space-like characters at both ends.  Stripping only
            # ' ' left a leading/trailing tab or NBSP in place, which was then
            # emitted as a stray leading space or a double space before the id.
            utt = utt.strip(strip_chars)

            for char in utt:
                if char in space_chars:
                    # Only output one space per run of whitespace.
                    if not last_char_was_space:
                        fh_out.write(" ")
                    last_char_was_space = True
                else:
                    fh_out.write(char)
                    last_char_was_space = False

            # Finally, write out the utterance id and a newline.
            fh_out.write(" (%s)\n" % uttid)
Ejemplo n.º 4
0
def main():
    """Rewrite each word as its per-character codes joined by underscores.

    Reads ``input_file`` (UTF-8) through ``SnorIter``, which is unpacked here
    as ``(utt, uttid)`` pairs.  Each whitespace-separated word of an utterance
    is written to ``output_file`` (UTF-8) as the ``utf8_char_to_uxxxx`` result
    of every character joined with ``_``, followed by a space; the line ends
    with ``(uttid)`` and a newline.  ``input_file``, ``output_file``,
    ``SnorIter`` and ``utf8_char_to_uxxxx`` are not defined in this block.
    """
    with open(input_file, 'r', encoding='utf-8') as fh, \
            open(output_file, 'w', encoding='utf-8') as fh_out:
        for utt, uttid in SnorIter(fh):
            for word in utt.split():
                # split() never yields empty words, so each join has >= 1 item.
                codes = "_".join(utf8_char_to_uxxxx(symbol) for symbol in word)
                fh_out.write(codes)
                fh_out.write(" ")
            # Finally write out the utterance id and a newline.
            fh_out.write("(%s)\n" % uttid)
Ejemplo n.º 5
0
def main():
    """Split each utterance into space-separated characters.

    Reads ``input_file`` (UTF-8) through ``SnorIter``, which is unpacked here
    as ``(utt, uttid)`` pairs.  Every character of an utterance is written to
    ``output_file`` (UTF-8) followed by a space, except that a literal space
    is written as the token ``<sp>``; the line ends with ``(uttid)`` and a
    newline.  ``input_file``, ``output_file`` and ``SnorIter`` are not defined
    in this block.
    """
    with open(input_file, 'r', encoding='utf-8') as fh, \
            open(output_file, 'w', encoding='utf-8') as fh_out:
        for utt, uttid in SnorIter(fh):
            fh_out.writelines(
                "<sp> " if symbol == " " else symbol + " "
                for symbol in utt
            )
            # Finally write out the utterance id and a newline.
            fh_out.write("(%s)\n" % uttid)
Ejemplo n.º 6
0
def main():
    """Convert whitespace-separated tokens to their ``uxxxx`` codes.

    Reads ``input_file`` (UTF-8) through ``SnorIter``, which is unpacked here
    as ``(utt, uttid)`` pairs.  Each token of an utterance is written to
    ``output_file`` (UTF-8) as ``utf8_char_to_uxxxx(token)`` followed by a
    space, except the token ``<sp>`` which is written as ``u0020``; the line
    ends with ``(uttid)`` and a newline.  ``input_file``, ``output_file``,
    ``SnorIter`` and ``utf8_char_to_uxxxx`` are not defined in this block.
    """
    with open(input_file, 'r', encoding='utf-8') as fh, \
            open(output_file, 'w', encoding='utf-8') as fh_out:
        for utt, uttid in SnorIter(fh):
            for token in utt.split():
                if token != "<sp>":
                    fh_out.write(utf8_char_to_uxxxx(token))
                    fh_out.write(" ")
                    continue
                # The explicit space marker maps to the code for U+0020.
                fh_out.write("u0020 ")

            # Finally write out the utterance id and a newline.
            fh_out.write("(%s)\n" % uttid)
Ejemplo n.º 7
0
def main():
    """Copy a transcript, dropping utterances whose id is in a filter list.

    ``input_ids_file`` is read one id per line (surrounding whitespace
    stripped).  ``input_trans`` is then read through ``SnorIter``, which is
    unpacked here as ``(utt, uttid)`` pairs, and every utterance whose id is
    not in the filter list is written to ``output_trans`` as
    ``utt (uttid)`` plus a newline.  ``input_ids_file``, ``input_trans``,
    ``output_trans`` and ``SnorIter`` are not defined in this block.
    """
    # First load the ids to filter out of the transcript.  Decode as UTF-8,
    # like the transcript itself, so that non-ASCII ids compare equal no
    # matter what the platform's default encoding is.
    with open(input_ids_file, 'r', encoding='utf-8') as fh:
        ids_to_filter = {line.strip() for line in fh}

    # Now load the input transcript and filter out the ids.
    with open(input_trans, 'r', encoding='utf-8') as fh, \
            open(output_trans, 'w', encoding='utf-8') as fh_out:
        for utt, uttid in SnorIter(fh):
            if uttid in ids_to_filter:
                continue

            fh_out.write("%s (%s)\n" % (utt, uttid))
Ejemplo n.º 8
0
def main():
    """Surround selected characters with spaces.

    Reads ``input_file`` (UTF-8) through ``SnorIter``, which is unpacked here
    as ``(utt, uttid)`` pairs, and writes each utterance to ``output_file``
    (UTF-8).  A character is padded with a space on each side when
    ``split_punc`` is set and it is in ``punc`` or ``currency_symbols``, or
    when ``split_digits`` is set and it is in ``digits``; all other characters
    are copied unchanged.  The line ends with `` (uttid)`` and a newline.
    None of the referenced flags, character collections, file names or
    ``SnorIter`` are defined in this block.
    """
    with open(input_file, 'r', encoding='utf-8') as fh, \
            open(output_file, 'w', encoding='utf-8') as fh_out:
        for utt, uttid in SnorIter(fh):
            for symbol in utt:
                # Currency symbols are governed by the punctuation flag.
                isolate = (
                    split_punc
                    and (symbol in punc or symbol in currency_symbols)
                ) or (split_digits and symbol in digits)

                if not isolate:
                    fh_out.write(symbol)
                    continue

                fh_out.writelines((" ", symbol, " "))

            # Finally write out the utterance id and a newline.
            fh_out.write(" (%s)\n" % uttid)
Ejemplo n.º 9
0
def main():
    """Normalize Arabic-script characters in each utterance.

    Reads ``input_file`` (UTF-8) through ``SnorIter``, which is unpacked here
    as ``(utt, uttid)`` pairs.  For every character of an utterance:

    1. if it is a key of ``PRESENTATION_TO_BASE`` it is replaced by the
       mapped value;
    2. if the result is one of the dropped code points it is skipped;
    3. otherwise its replacement from the rewrite table (or the character
       itself when there is no entry) is written to ``output_file`` (UTF-8).

    Each line ends with `` (uttid)`` and a newline.  ``input_file``,
    ``output_file``, ``SnorIter`` and ``PRESENTATION_TO_BASE`` are not defined
    in this block.
    """
    # Code points removed from the output entirely.
    dropped = frozenset((
        "\u0640",  # tatweel
        # vowels and hamza
        "\u064b", "\u064c", "\u064d", "\u064e", "\u064f", "\u0650",
        "\u0651", "\u0652", "\u0653", "\u0654", "\u0655",
        # RTL and LTR marks
        "\u200f", "\u200e",
    ))

    # Code points rewritten to a different character (or character sequence).
    rewrites = {
        "\u06a9": "\u0643",  # arabic keheh -> arabic kaf
        "\u06cc": "\u064a",  # Farsi yeh -> Arabic yeh
        # Ligature for the rial sign -> sequence of characters.
        # NOTE(review): the expansion contains U+06CC, which is otherwise
        # mapped to U+064A above -- confirm this is intended.
        "\ufdfc": "\u0631\u06cc\u0627\u0644",
        # Extended (Farsi) arabic-indic digits -> regular arabic-indic digits.
        "\u06f0": "\u0660",
        "\u06f1": "\u0661",
        "\u06f2": "\u0662",
        "\u06f3": "\u0663",
        "\u06f4": "\u0664",
        "\u06f5": "\u0665",
        "\u06f6": "\u0666",
        "\u06f7": "\u0667",
        "\u06f8": "\u0668",
        "\u06f9": "\u0669",
        "\u060c": "\u002c",  # Arabic comma -> regular comma
    }

    with open(input_file, 'r', encoding='utf-8') as fh, \
            open(output_file, 'w', encoding='utf-8') as fh_out:
        for utt, uttid in SnorIter(fh):
            for symbol in utt:
                # First, convert from presentation form to base form.
                if symbol in PRESENTATION_TO_BASE:
                    symbol = PRESENTATION_TO_BASE[symbol]

                # Next, apply the character-level transformations.
                if symbol in dropped:
                    continue
                fh_out.write(rewrites.get(symbol, symbol))

            # Finally, write out the utterance id and a newline.
            fh_out.write(" (%s)\n" % uttid)