Ejemplo n.º 1
0
def wer(ref, hyp, remove_nsns=False):
    """
    Calculate word error rate between two string or time_aligned_text objects

    The rate is the word-level edit distance between the cleaned reference
    and the cleaned hypothesis, expressed as a percentage of the number of
    reference words.

    :param ref: reference transcript (str or time_aligned_text)
    :param hyp: hypothesis transcript (str or time_aligned_text)
    :param remove_nsns: if True, also strip non-silence noises from both
    :return: word error rate as a percentage

    >>> wer("this is a cat", "this is a dog")
    25.0
    """

    # accept time_aligned_text objects (and subclasses of it) too
    if isinstance(ref, time_aligned_text):
        ref = ref.text()

    if isinstance(hyp, time_aligned_text):
        hyp = hyp.text()

    # remove tagged noises and other nonspeech events
    ref = re.sub(re_tagged_nonspeech, " ", ref)
    hyp = re.sub(re_tagged_nonspeech, " ", hyp)

    # optionally, remove non silence noises
    if remove_nsns:
        ref = remove_nonsilence_noises(ref)
        hyp = remove_nonsilence_noises(hyp)

    # clean punctuation, etc., then tokenize each side exactly once
    ref_words = clean_up(ref).split(" ")
    hyp_words = clean_up(hyp).split(" ")

    # calculate WER; max(1, ...) is a defensive guard on the denominator
    return (100 * editdistance.eval(ref_words, hyp_words) /
            max(1, len(ref_words)))
Ejemplo n.º 2
0
def standardize_transcript(input_transcript, remove_nsns=False):
    """
    Normalize a transcript (str or time_aligned_text object) by
    stripping tagged non-speech events,
    [optionally] stripping non-silence noises,
    and running the result through clean_up

    >>> standardize_transcript("this is a test")
    'this is a test'
    >>> standardize_transcript("this is um a test")
    'this is um a test'
    >>> standardize_transcript("this is um a test", remove_nsns=True)
    'this is a test'
    """

    text = input_transcript

    # time_aligned_text objects are reduced to their output text
    if isinstance(text, time_aligned_text):
        text = text.text()

    # tagged noises and other non-speech events become a single space
    text = re.sub(re_tagged_nonspeech, " ", text)

    if remove_nsns:
        text = remove_nonsilence_noises(text)

    # punctuation etc. is handled by clean_up
    return clean_up(text)
Ejemplo n.º 3
0
def format_segment(seg):
  """
  Build the text for a particular STM line (see segment __str__ method)

  :param seg: segment object exposing filename, channel, speaker, start, stop,
    label and text attributes
  :return str: the six metadata fields (converted with str) followed by the
    cleaned text, all joined by single spaces
  """
  field_names = ('filename', 'channel', 'speaker', 'start', 'stop', 'label')
  parts = [str(getattr(seg, name)) for name in field_names]
  # clean_up used to unformat stm file text
  parts.append(clean_up(seg.text))
  return " ".join(parts)
Ejemplo n.º 4
0
def cer(ref, hyp, remove_nsns=False):
    """
    Calculate character error rate between two strings or time_aligned_text objects

    The rate is the character-level edit distance between the cleaned
    reference and the cleaned hypothesis, expressed as a percentage of the
    number of reference characters.

    :param ref: reference transcript (str or time_aligned_text)
    :param hyp: hypothesis transcript (str or time_aligned_text)
    :param remove_nsns: if True, also strip non-silence noises from both
    :return: character error rate as a percentage

    >>> cer("this cat", "this bad")
    25.0
    """
    # NOTE(review): unlike wer, no re_tagged_nonspeech substitution is applied
    # here -- confirm whether that difference is intentional.

    # accept time_aligned_text objects (and subclasses of it) too
    if isinstance(ref, time_aligned_text):
        ref = ref.text()

    if isinstance(hyp, time_aligned_text):
        hyp = hyp.text()

    # optionally, remove non silence noises
    if remove_nsns:
        ref = remove_nonsilence_noises(ref)
        hyp = remove_nonsilence_noises(hyp)

    # clean punctuation, etc.
    ref = clean_up(ref)
    hyp = clean_up(hyp)

    # calculate per line CER; max(1, ...) avoids dividing by zero when the
    # cleaned reference is empty
    return 100 * editdistance.eval(ref, hyp) / max(1, len(ref))
Ejemplo n.º 5
0
def preprocess_transcript(input_file):
    """
    Given a str file path to gk json output
    Return a list of tokenized word objects

    The JSON document is read as {"segments": [...]}. From each segment the
    keys "startTimeSec", "words" and "transcript" are used; from each entry
    of "words" the keys "word", "start" and "length" are used.

    :param input_file: path to the JSON file (opened read-only as UTF-8)
    :return: list of dicts, one per whitespace-separated transcript token
        whose cleaned form is non-empty, with keys gk_token, gk_token_idx,
        seg_id, start, end and duration
    :raises AssertionError: if a retained transcript token differs from the
        "word" stored at the same position of the segment's "words" list
    """
    # read-only access is enough; the context manager closes the handle
    with open(input_file, "r", encoding="utf-8") as json_file:
        data = json.load(json_file)

    clean_lattice = []
    for seg_id, segment in enumerate(data["segments"]):
        segment_start = segment["startTimeSec"]
        seg_lattice = segment["words"]

        for i, token in enumerate(segment["transcript"].split()):
            clean_token = clean_up(token).strip()
            if not clean_token:
                # tokens that clean up to nothing are dropped and do not
                # consume a gk_token_idx
                continue

            word_dict = seg_lattice[i]
            assert word_dict["word"] == token

            # times (audio-aligned): the word's start is added to the
            # segment's start time
            start = segment_start + word_dict["start"]
            duration = word_dict["length"]

            # generate token-level dict for each cleaned token retained;
            # the running index equals the number of tokens kept so far
            clean_lattice.append(
                dict(
                    gk_token=clean_token,
                    gk_token_idx=len(clean_lattice),
                    seg_id=seg_id,
                    start=start,
                    end=start + duration,
                    duration=duration,
                ))
    return clean_lattice
def parse_transcript(transcript, speaker="unknown", gender="male", token_idx_offset=0):
    """
    Given reference transcript (and ideally speaker and gender),
    Return a list of tokenized word objects

    :param transcript: either a path to a transcript file or the raw
        transcript text itself
    :param speaker: value stored under "speaker" in every token dict
    :param gender: value stored under "gender" in every token dict
    :param token_idx_offset: added to each token's position to form
        "token_idx"
    :return: list of dicts with keys token, token_idx, speaker and gender,
        one per whitespace-separated token of the cleaned transcript
    """

    # A directory whose name happens to equal the raw text must not be
    # opened as a transcript file, so directories fall through to raw text.
    if os.path.exists(transcript) and not os.path.isdir(transcript):
        # the context manager closes the handle once the text is read
        with open(transcript) as transcript_file:
            transcript = transcript_file.read()
    else:
        LOGGER.info(
            "Transcript is not a file location; assuming it is raw text instead."
        )

    return [
        dict(token=token, token_idx=token_idx_offset + i, speaker=speaker, gender=gender)
        for i, token in enumerate(clean_up(transcript).split())
    ]
def submit_transcript(transcript):
    """
    Post a transcript to the Discovery /process endpoint

    Returns the decoded JSON body when the response status is 200;
    otherwise logs the status code and returns an empty dict.
    """
    payload = {"word_confusions": [{word: "1.0"} for word in transcript.split(" ")]}

    # any character outside UNFORMATTED_CHARS means the text is sent both
    # as-is (punctuated) and in cleaned form
    has_formatting = any(char not in UNFORMATTED_CHARS for char in transcript)
    if has_formatting:
        payload['punctuated_transcript'] = transcript
        payload['transcript'] = clean_up(transcript)
    else:
        payload['transcript'] = transcript

    url = "http://{}:{}/process".format(DISCOVERY_HOST, DISCOVERY_PORT)
    response = requests.post(url, json=payload)

    if response.status_code != 200:
        logger.error(
            "Request was not successful. Response Status Code: {}".format(
                response.status_code))
        return {}

    return response.json()
Ejemplo n.º 8
0
def test_clean_up():
    """Check clean_up against a table of (input, expected output) pairs."""

    cases = [
        ("1.05", "one point zero five"),
        ("105.", "one hundred and five"),
        ("105.", "one hundred and five"),
        ("They dollars and three cents.", "they dollars and three cents"),
        ("The machine is in the garden of Mr. MacGregor.",
         "the machine is in the garden of mr macgregor"),
        ("This may be a problem.", "this may be a problem"),
        ("Yeah th those are my finest percentages.",
         "yeah th those are my finest percentages"),
        ("Please press five after the tone.",
         "please press five after the tone"),
        ("Six distinct.", "six distinct"),
        ("ABC trades at -3.2%.",
         "a b c trades at negative three point two percent"),
        ("My 2017 report shows the 5th best earnings.",
         "my two thousand and seventeen report shows the fifth best earnings"),
        ("This is the 9th of November.", "this is the ninth of november"),
        ("No work has been completed since Tuesday the 16th of September.",
         "no work has been completed since tuesday the sixteenth of september"),
        ("That is what Leah Bradley said last week about the Indians on TV.",
         "that is what leah bradley said last week about the indians on t v"),
        ("I've never done anything that I'd regret.",
         "i 've never done anything that i 'd regret"),
        ("He needs 1.375%.",
         "he needs one point three hundred and seventy five percent"),
        ("I heard Mr. McDonald has $6.23",
         "i heard mr mcdonald has six dollars and twenty three cents"),
        ("Yes this is Mr. MacAllen.", "yes this is mr macallen"),
        ("Don't break dollars.", "don 't break dollars"),
        ("This is the best one.", "this is the best one"),
        ("London has five theatres.", "london has five theatres"),
        ("Hundreds of cats.", "hundreds of cats"),
        ("Thousands of cats.", "thousands of cats"),
        ("Millions and billions of cats.", "millions and billions of cats"),
        ("Good evening Larry.", "good evening larry"),
        ("You have two choices Neo.", "you have two choices neo"),
        ("This one or that one.", "this one or that one"),
        ("7 8 9 ...", "seven eight nine"),
        ("Two partridges in one pear tree.",
         "two partridges in one pear tree"),
        ("2s 3s 4s.", "twos threes fours"),
        ("I am -5 on the bonds.", "i am negative five on the bonds"),
        ("This is your bus terminus seven.",
         "this is your bus terminus seven"),
        ("5th of March.", "fifth of march"),
        ("Hundreds.", "hundreds"),
        ("ABC.", "a b c"),
        ("A.B.C.", "a b c"),
        ("ABC", "a b c"),
        ("A one sauce.", "a one sauce"),
        ("This is Prof. Charles Xavier.", "this is prof charles xavier"),
        ("Welcome to the island of Dr. Moreau.",
         "welcome to the island of dr moreau"),
        ("7th Sept against the third of May.",
         "seventh sept against the third of may"),
        ("Who is on first and what is on second.",
         "who is on first and what is on second"),
        ("2001 a space Odyssey.", "two thousand and one a space odyssey"),
        ("I'm selling my car for one trillion.",
         "i 'm selling my car for one trillion"),
        ("I'm selling my car for one trillion.",
         "i 'm selling my car for one trillion"),
        ("I would recommend selling 2s 10s here.",
         "i would recommend selling twos tens here"),
        ("129.6%.", "one hundred and twenty nine point six percent"),
        ("5.3% and then 129.6%.",
         "five point three percent and then one hundred and twenty nine point six percent"),
        ("70.3% and coming around 129.6%.",
         "seventy point three percent and coming around one hundred and twenty nine point six percent"),
        ("Dr. Joseph owes $12.5 billion.",
         "dr joseph owes twelve point five billion dollars"),
        ("Replacements for things like Dr. for drive should only happen where necessary.",
         "replacements for things like dr for drive should only happen where necessary"),
        ("100.", "one hundred"),
        ("115.", "one hundred and fifteen"),
        ("125.", "one hundred and twenty five"),
        ("140.", "one hundred and forty"),
        ("1000.", "one thousand"),
        (u"1 2 3 4 5 6 7 8 9 10.",
         u"one two three four five six seven eight nine ten"),
        ("his license plate is a. c, f seven...five ! zero",
         "his license plate is a c f seven five zero"),
        ("Q2", "q two"),
        ("from our website at www.take2games.com.",
         "from our website at www take two games dot com"),
        ("NBA 2K18", "n b a two k eighteen"),
        ("launched WWE 2K 18", "launched w w e two k eighteen"),
        ("released L.A. Noire, the The VR Case Files for the HTC VIVE system",
         "released l a noire the the v r case files for the h t c v i v e system"),
        ("Total net bookings were $654 million,",
         "total net bookings were six hundred and fifty four million dollars"),
        ("net booking which grew 6% to $380 million.",
         "net booking which grew six percent to three hundred and eighty million dollars"),
        ("to $25 dollars or $0.21 per share price.",
         "to twenty five dollars dollars or zero dollars and twenty one cents per share price"),
        ("year-over-year", "year over year"),
        ("HTC VIVE", "h t c v i v e"),
    ]

    # each pair is (raw input, expected cleaned text)
    for raw_text, expected in cases:
        cleaned = clean_up(raw_text)
        assert cleaned == expected
def clean_line(line):
    """Join the values of a line with spaces and clean up the result.

    Values for which pd.isnull is true contribute an empty string; every
    other value is converted with str before joining.
    """
    fields = ('' if pd.isnull(val) else str(val) for val in line)
    return clean_up(" ".join(fields))
Ejemplo n.º 10
0
def make_name():
    """Return a (cleaned name, original name) pair for a name drawn from FAKER."""
    raw_name = FAKER.name()
    cleaned_name = clean_up(raw_name)
    return cleaned_name, raw_name
Ejemplo n.º 11
0
def test_clean_up():
  """Check clean_up against a table of (input, expected output) pairs."""

  cases = [
    ('1.05', 'one point zero five'),
    ('105.', 'one hundred and five'),
    ('105.', 'one hundred and five'),
    ('They dollars and three cents.', 'they dollars and three cents'),
    ('The machine is in the garden of Mr. MacGregor.',
     'the machine is in the garden of mr macgregor'),
    ('This may be a problem.', 'this may be a problem'),
    ('Yeah th those are my finest percentages.',
     'yeah th those are my finest percentages'),
    ('Please press five after the tone.',
     'please press five after the tone'),
    ('Six distinct.', 'six distinct'),
    ('ABC trades at -3.2%.',
     'a b c trades at negative three point two percent'),
    ('My 2017 report shows the 5th best earnings.',
     'my two thousand and seventeen report shows the fifth best earnings'),
    ('This is the 9th of November.', 'this is the ninth of november'),
    ('No work has been completed since Tuesday the 16th of September.',
     'no work has been completed since tuesday the sixteenth of september'),
    ('That is what Leah Bradley said last week about the Indians on TV.',
     'that is what leah bradley said last week about the indians on t v'),
    ("I've never done anything that I'd regret.",
     "i 've never done anything that i 'd regret"),
    ('He needs 1.375%.',
     'he needs one point three hundred and seventy five percent'),
    ('I heard Mr. McDonald has $6.23',
     'i heard mr mcdonald has six dollars and twenty three cents'),
    ('Yes this is Mr. MacAllen.', 'yes this is mr macallen'),
    ("Don't break dollars.", "don 't break dollars"),
    ('This is the best one.', 'this is the best one'),
    ('London has five theatres.', 'london has five theatres'),
    ('Hundreds of cats.', 'hundreds of cats'),
    ('Thousands of cats.', 'thousands of cats'),
    ('Millions and billions of cats.', 'millions and billions of cats'),
    ('Good evening Larry.', 'good evening larry'),
    ('You have two choices Neo.', 'you have two choices neo'),
    ('This one or that one.', 'this one or that one'),
    ('7 8 9 ...', 'seven eight nine'),
    ('Two partridges in one pear tree.',
     'two partridges in one pear tree'),
    ('2s 3s 4s.', 'twos threes fours'),
    ('I am -5 on the bonds.', 'i am negative five on the bonds'),
    ('This is your bus terminus seven.',
     'this is your bus terminus seven'),
    ('5th of March.', 'fifth of march'),
    ('Hundreds.', 'hundreds'),
    ('ABC.', 'a b c'),
    ('A.B.C.', 'a b c'),
    ('ABC', 'a b c'),
    ('A one sauce.', 'a one sauce'),
    ('This is Prof. Charles Xavier.', 'this is prof charles xavier'),
    ('Welcome to the island of Dr. Moreau.',
     'welcome to the island of dr moreau'),
    ('7th Sept against the third of May.',
     'seventh sept against the third of may'),
    ('Who is on first and what is on second.',
     'who is on first and what is on second'),
    ('2001 a space Odyssey.', 'two thousand and one a space odyssey'),
    ("I'm selling my car for one trillion.",
     "i 'm selling my car for one trillion"),
    ("I'm selling my car for one trillion.",
     "i 'm selling my car for one trillion"),
    ('I would recommend selling 2s 10s here.',
     'i would recommend selling twos tens here'),
    ('129.6%.', 'one hundred and twenty nine point six percent'),
    ('5.3% and then 129.6%.',
     'five point three percent and then one hundred and twenty nine point six percent'),
    ('70.3% and coming around 129.6%.',
     'seventy point three percent and coming around one hundred and twenty nine point six percent'),
    ('Dr. Joseph owes $12.5 billion.',
     'dr joseph owes twelve point five billion dollars'),
    ('Replacements for things like Dr. for drive should only happen where necessary.',
     'replacements for things like dr for drive should only happen where necessary'),
    ('100.', 'one hundred'),
    ('115.', 'one hundred and fifteen'),
    ('125.', 'one hundred and twenty five'),
    ('140.', 'one hundred and forty'),
    ('1000.', 'one thousand'),
    (u'1 2 3 4 5 6 7 8 9 10.',
     u'one two three four five six seven eight nine ten'),
    ("his license plate is a. c, f seven...five ! zero",
     'his license plate is a c f seven five zero'),
    ("Q2", 'q two'),
    ("from our website at www.take2games.com.",
     'from our website at www take two games dot com'),
    ("NBA 2K18", 'n b a two k eighteen'),
    ("launched WWE 2K 18", 'launched w w e two k eighteen'),
    ("released L.A. Noire, the The VR Case Files for the HTC VIVE system",
     'released l a noire the the v r case files for the h t c v i v e system'),
    ("Total net bookings were $654 million,",
     'total net bookings were six hundred and fifty four million dollars'),
    ("net booking which grew 6% to $380 million.",
     'net booking which grew six percent to three hundred and eighty million dollars'),
    ("to $25 dollars or $0.21 per share price.",
     'to twenty five dollars dollars or zero dollars and twenty one cents per share price'),
    ("year-over-year", 'year over year'),
    ("HTC VIVE", 'h t c v i v e'),
  ]

  # each pair is (raw input, expected cleaned text)
  for raw_text, expected in cases:
    cleaned = clean_up(raw_text)
    assert cleaned == expected