def decode_helper(probs, uttid, uxxxx):
    """Decode one utterance with the lattice decoder and format the result.

    The decoder output is handled as space-separated words whose characters
    are joined by underscores.

    NOTE(review): ``self`` is not a parameter here; presumably it is resolved
    from an enclosing scope -- confirm against the surrounding code.

    Args:
        probs: passed unchanged to ``lattice_decoder.Decode``.
        uttid: passed unchanged to ``lattice_decoder.Decode``.
        uxxxx: when equal to ``False`` every character token is converted with
            ``uxxxx_to_utf8`` and the pieces are concatenated without
            separators; otherwise the raw character tokens are returned,
            separated by single spaces.

    Returns:
        The formatted hypothesis string.
    """
    decoded = self.lattice_decoder.Decode(probs, uttid)

    # Flatten "word word" / "char_char" into one list of character tokens.
    tokens = [
        token
        for word in decoded.split(' ')
        for token in word.split('_')
    ]

    # Kept as an equality test so non-bool flags behave exactly as before.
    if uxxxx == False:
        return ''.join([uxxxx_to_utf8(token) for token in tokens])

    return ' '.join(tokens).strip()
def decode_without_lm(self, model_output, batch_actual_timesteps, uxxxx=False):
    """Greedy CTC decoding of a batch without a language model.

    For every batch entry the highest-scoring class of each valid timestep is
    taken. Class index 0 is treated as the CTC blank, consecutive repeats of
    the same character are collapsed, and the remaining characters are joined
    with single spaces.

    Args:
        model_output: tensor indexed as [timestep, batch entry, class].
        batch_actual_timesteps: for each batch entry, the number of leading
            timesteps to decode; later timesteps are ignored.
        uxxxx: when equal to ``False`` each decoded string is passed through
            ``uxxxx_to_utf8``; otherwise the space-separated character tokens
            are returned as they are.

    Returns:
        A list with one decoded string per batch entry.
    """
    # Heuristic: a frame whose best score is below this value is handled the
    # same way as a CTC blank.
    min_prob_thresh = 3 * 1 / len(self.alphabet)

    T = model_output.size()[0]
    B = model_output.size()[1]

    # Copy to host memory once and take the arg-max over the class axis for
    # all timesteps at once, instead of one transfer per timestep.
    scores = model_output.data.cpu().numpy()
    best_scores = scores.max(2)
    best_idxs = scores.argmax(2)

    result = []
    for b in range(B):
        tokens = []
        prev_char = ''
        for t in range(T):
            # Only look at valid model output for this batch entry; t only
            # grows, so nothing after the first invalid step is valid either.
            if t >= batch_actual_timesteps[b]:
                break

            idx = best_idxs[t, b]
            if idx == 0 or best_scores[t, b] < min_prob_thresh:
                # Blank (or blank-like) frame: the next character counts as
                # new even if it equals the one before the blank.
                prev_char = ''
                continue

            char = self.alphabet.idx_to_char[idx]
            if char != prev_char:
                tokens.append(char)
                prev_char = char

        # Joining yields the same string as appending "char + ' '" and then
        # trimming the final separator.
        result.append(' '.join(tokens))

    # Kept as an equality test so non-bool flags behave exactly as before.
    if uxxxx == False:
        result = [uxxxx_to_utf8(r) for r in result]

    return result
def decode_with_lm(self, model_output, batch_actual_timesteps, uxxxx=False, pmod=False):
    """Decode a batch with the lattice (language-model) decoder.

    The model output is converted to log-probabilities, scaled, re-ordered
    from the model alphabet into the LM unit order and handed to
    ``self.lattice_decoder.Decode`` one batch entry at a time.

    Args:
        model_output: unnormalised tensor indexed as
            [timestep, batch entry, class].
        batch_actual_timesteps: for each batch entry, the number of leading
            timesteps that are passed to the decoder.
        uxxxx: when equal to ``False`` the decoded characters are converted
            with ``uxxxx_to_utf8`` and concatenated; otherwise the raw
            character tokens are returned separated by single spaces.
        pmod: when true, each frame is scaled by 0.6 if its most likely class
            has probability below 0.9 and by 1.1 otherwise; when false every
            frame is scaled by ``self.acoustic_weight``.

    Returns:
        A list with one hypothesis string per batch entry.

    Raises:
        Exception: if ``self.lattice_decoder`` is ``None``.
    """
    if self.lattice_decoder is None:
        raise Exception(
            "Must initialize lattice decoder prior to LM decoding")

    B = model_output.size()[1]

    # Actual model output is not a probability vector yet. Normalising over
    # the class axis directly gives the same values as flattening to 2-D
    # first, without requiring a view-compatible memory layout.
    log_probs = torch.nn.functional.log_softmax(
        model_output, dim=2).data.cpu().numpy()

    if pmod:
        # The scores are log-probabilities (never above 0), so the 0.9
        # confidence threshold has to be applied in the log domain; comparing
        # against 0.9 itself would mark every frame as low confidence.
        frame_best = log_probs.max(axis=2, keepdims=True)
        scale = np.where(frame_best < np.log(0.9), 0.6, 1.1)
        log_probs = log_probs * scale.astype(log_probs.dtype)
    else:
        log_probs = log_probs * self.acoustic_weight

    floor = np.log(1e-10)  # score given to LM units the model cannot emit
    n_units = len(self.lmidx_to_char)

    hyp_results = []
    for b in range(B):
        n_steps = batch_actual_timesteps[b]

        # Re-order model columns into LM unit order; unknown units keep the
        # floor value.
        activations_remapped = np.full((n_steps, n_units), floor)
        for c in range(n_units):
            char = self.lmidx_to_char[c]
            if char in self.alphabet.char_to_idx:
                mapped_c = self.alphabet.char_to_idx[char]
                activations_remapped[:, c] = log_probs[:n_steps, b, mapped_c]

        # Frames whose total mass over the LM units is below 1e-2 get their
        # column-0 (ctc-blank) log-probability forced to 0.
        frame_mass = np.logaddexp(
            floor, np.logaddexp.reduce(activations_remapped, axis=1))
        activations_remapped[frame_mass < np.log(1e-2), 0] = 0

        res = self.lattice_decoder.Decode(activations_remapped)

        # Decoder output: words separated by spaces, characters by "_".
        tokens = [
            token
            for word in res.split(' ')
            for token in word.split('_')
        ]
        # Kept as an equality test so non-bool flags behave as before.
        if uxxxx == False:
            res = ''.join([uxxxx_to_utf8(token) for token in tokens])
        else:
            res = ' '.join(tokens).strip()

        hyp_results.append(res)

    return hyp_results
def decode_with_lm_mt(self, model_output, batch_actual_timesteps, uxxxx=False, n_workers=10):
    """Decode a batch with the lattice decoder using a pool of worker threads.

    One ``self.lattice_decoder.Decode`` task is submitted per batch entry and
    the hypotheses are returned in batch order.

    NOTE(review): no acoustic-weight scaling is applied to the
    log-probabilities in this method -- confirm the decoder accounts for it.

    Args:
        model_output: unnormalised tensor indexed as
            [timestep, batch entry, class].
        batch_actual_timesteps: for each batch entry, the number of leading
            timesteps that are passed to the decoder.
        uxxxx: when equal to ``False`` the decoded characters are converted
            with ``uxxxx_to_utf8`` and concatenated; otherwise the raw
            character tokens are returned separated by single spaces.
        n_workers: maximum number of worker threads.

    Returns:
        A list with one hypothesis string per batch entry.

    Raises:
        Exception: if ``self.lattice_decoder`` is ``None``.
    """
    # Fail before any worker threads are created.
    if self.lattice_decoder is None:
        raise Exception(
            "Must initialize lattice decoder prior to LM decoding")

    B = model_output.size()[1]

    # Actual model output is not a probability vector yet; normalise over the
    # class axis and make sure we're on CPU.
    probs = torch.nn.functional.log_softmax(model_output, dim=2).data.cpu()

    floor = np.log(1e-10)  # score for LM units with no model counterpart
    n_units = len(self.lmidx_to_char)

    decoder_futures = []
    with ThreadPoolExecutor(max_workers=n_workers) as executor:
        for b in range(B):
            n_steps = batch_actual_timesteps[b]

            # Copy the model columns into their LM-unit positions.
            probs_remapped = np.full((n_steps, n_units), floor)
            probs_remapped[:, self.lm_swap_idxs_lmidx] = probs[
                :n_steps, b, self.lm_swap_idxs_modelidx].numpy()

            decoder_futures.append(
                executor.submit(self.lattice_decoder.Decode, probs_remapped))

    # Leaving the ``with`` block waits for every submitted task, so all
    # futures are complete here.
    hyp_results = []
    for future in decoder_futures:
        res = future.result()

        # Decoder output: words separated by spaces, characters by "_".
        tokens = [
            token
            for word in res.split(' ')
            for token in word.split('_')
        ]
        # Kept as an equality test so non-bool flags behave as before.
        if uxxxx == False:
            res = ''.join([uxxxx_to_utf8(token) for token in tokens])
        else:
            res = ' '.join(tokens).strip()

        hyp_results.append(res)

    return hyp_results
import sys
import textutils

# Reads the file named by the first command-line argument and prints one
# converted line per input line. Each input line is expected to look like
# "<uxxxx tokens> (<utterance id>)".
input_file = sys.argv[1]

# Tokens that are emitted literally instead of being converted.
# NOTE(review): u0009 is emitted as a single space, the same as u0020 --
# confirm that this is intended.
literal_tokens = {"u0020": " ", "u0009": " "}

with open(input_file, 'r') as fh:
    for line in fh:
        open_paren = line.rfind('(')
        close_paren = line.rfind(')')

        # Everything before the last "(" is the transcript in uxxxx form.
        pieces = []
        for word in line[:open_paren].split(" "):
            if word in literal_tokens:
                pieces.append(literal_tokens[word])
            else:
                # A word is a run of uxxxx characters joined by "_".
                pieces.append("".join(
                    [textutils.uxxxx_to_utf8(char) for char in word.split("_")]))
        utt_utf8 = "".join(pieces)

        # The id sits between the last "(" and the last ")"; everything from
        # its last "_" onward is dropped.
        uttid = line[open_paren + 1:close_paren]
        uttid = uttid[:uttid.rfind('_')]

        print("%s (%s)" % (utt_utf8, uttid))
def undo_bidi(uxxxx_str, base_level=1):
    """Run the Unicode Bidirectional Algorithm steps over a uxxxx string.

    The input is split on whitespace into character tokens. Each token gets a
    bidi type (via ``bidirectional(uxxxx_to_utf8(token))``) and an embedding
    level, then the rule groups named in the comments below (X1-X9, W1-W7,
    N1-N2, I1-I2, L1-L2 of http://unicode.org/reports/tr9/) are applied in
    that order. Finally the reordered tokens are emitted in reversed order.

    NOTE(review): judging by the name this is meant to turn visually ordered
    text back into logical order -- confirm with the callers.

    Args:
        uxxxx_str: whitespace-separated character tokens in uxxxx notation.
        base_level: paragraph embedding level given to every character
            initially; the default of 1 is an odd (right-to-left) level.

    Returns:
        The reordered tokens joined by single spaces, or ``''`` when no
        characters remain after the X9 removal step.
    """
    # Step 0: attach the unicode bidi type to each char
    augmented_char_array = []
    for char in uxxxx_str.split():
        bidi_type = bidirectional(uxxxx_to_utf8(char))
        # The base level is not derived from the text: callers pass it in,
        # and it defaults to RTL because this is for an Arabic corpus.
        # Revisit this later!
        augmented_char_array.append({
            'char': char,
            'bidi-type': bidi_type,
            'bidi-orig-type': bidi_type,
            'level': base_level
        })

    # Step 1: Resolve Explicit embed and overrides
    # See: http://unicode.org/reports/tr9/#Explicit_Levels_and_Directions
    overflow_counter = almost_overflow_counter = 0
    directional_override = 'N'
    # Stack of (embedding level, override) pairs saved when entering an
    # embedding/override and restored on PDF.
    levels = deque()

    # X1
    embedding_level = base_level

    for _ch in augmented_char_array:
        bidi_type = _ch['bidi-type']

        level_func, override = X2_X5_MAPPINGS.get(bidi_type, (None, None))

        if level_func:
            # So this is X2 to X5
            # If we are already past EXPLICIT_LEVEL_LIMIT, note it and do
            # nothing else.
            if overflow_counter != 0:
                overflow_counter += 1
                continue

            new_level = level_func(embedding_level)
            if new_level < EXPLICIT_LEVEL_LIMIT:
                levels.append((embedding_level, directional_override))
                embedding_level, directional_override = new_level, override

            elif embedding_level == EXPLICIT_LEVEL_LIMIT - 2:
                # The new level is invalid, but a valid level can still be
                # achieved if this level is 60 and we encounter an RLE or
                # RLO further on. So record that we 'almost' overflowed.
                almost_overflow_counter += 1

            else:
                overflow_counter += 1
        else:
            # X6
            if bidi_type not in X6_IGNORED:
                _ch['level'] = embedding_level
                if directional_override != 'N':
                    _ch['bidi-type'] = directional_override

            # X7
            elif bidi_type == 'PDF':
                if overflow_counter:
                    overflow_counter -= 1
                elif almost_overflow_counter and \
                        embedding_level != EXPLICIT_LEVEL_LIMIT - 1:
                    almost_overflow_counter -= 1
                elif levels:
                    embedding_level, directional_override = levels.pop()

            # X8
            elif bidi_type == 'B':
                levels.clear()
                overflow_counter = almost_overflow_counter = 0
                embedding_level = _ch['level'] = base_level
                directional_override = 'N'

    # Applies X9 (see http://unicode.org/reports/tr9/#X9): drop every
    # character whose current bidi type is listed in X9_REMOVED (the explicit
    # embedding/override codes RLE, LRE, RLO, LRO, PDF, and BN).
    augmented_char_array = [
        _ch for _ch in augmented_char_array
        if _ch['bidi-type'] not in X9_REMOVED
    ]

    # Step 2: determine LTR / RTL runs
    # See: http://unicode.org/reports/tr9/#X10

    # First define a utility function: basically, RTL takes preference over
    # LTR. The direction is taken from the higher of the two boundary levels
    # (odd -> 'R', even -> 'L').
    def calc_level_run(b_l, b_r):
        return ['L', 'R'][max(b_l, b_r) % 2]

    runs = []

    # After removing RLO/LRO/etc, check length again
    if len(augmented_char_array) == 0:
        return ''

    first_char = augmented_char_array[0]

    run_start_level = calc_level_run(first_char['level'], base_level)
    run_end_level = None
    run_start = run_length = 0

    prev_level, prev_type = first_char['level'], first_char['bidi-type']

    for char in augmented_char_array:
        curr_level, curr_type = char['level'], char['bidi-type']

        if curr_level == prev_level:
            run_length += 1
        else:
            # Level changed: close the current run and start a new one whose
            # sor equals the eor of the run just closed.
            run_end_level = calc_level_run(prev_level, curr_level)
            runs.append({
                'sor': run_start_level,
                'eor': run_end_level,
                'start': run_start,
                'type': prev_type,
                'length': run_length
            })
            run_start_level = run_end_level
            run_start += run_length
            run_length = 1

        prev_level, prev_type = curr_level, curr_type

    # for the last char/runlevel
    run_end_level = calc_level_run(curr_level, base_level)
    runs.append({
        'sor': run_start_level,
        'eor': run_end_level,
        'start': run_start,
        'type': curr_type,
        'length': run_length
    })

    # Step 3: Resolve weak LTR/RTL types
    # See: http://unicode.org/reports/tr9/#Resolving_Weak_Types
    for run in runs:
        prev_strong = prev_type = run['sor']
        start, length = run['start'], run['length']
        # The slice is a new list, but it holds the same dict objects, so the
        # edits below update augmented_char_array too.
        chars = augmented_char_array[start:start + length]

        for char in chars:
            # W1. Examine each nonspacing mark (NSM) in the level run, and
            # change the type of the NSM to the type of the previous character.
            # If the NSM is at the start of the level run, it will get the type
            # of sor.
            bidi_type = char['bidi-type']

            if bidi_type == 'NSM':
                char['bidi-type'] = bidi_type = prev_type

            # W2. Search backward from each instance of a European number until
            # the first strong type (R, L, AL, or sor) is found. If an AL is
            # found, change the type of the European number to Arabic number.
            if bidi_type == 'EN' and prev_strong == 'AL':
                char['bidi-type'] = 'AN'

            # update prev_strong if needed
            if bidi_type in ('R', 'L', 'AL'):
                prev_strong = bidi_type

            prev_type = char['bidi-type']

        # W3. Change all ALs to R
        for char in chars:
            if char['bidi-type'] == 'AL':
                char['bidi-type'] = 'R'

        # W4. A single European separator between two European numbers changes
        # to a European number. A single common separator between two numbers of
        # the same type changes to that type.
        for idx in range(1, len(chars) - 1):
            bidi_type = chars[idx]['bidi-type']
            prev_type = chars[idx - 1]['bidi-type']
            next_type = chars[idx + 1]['bidi-type']

            if bidi_type == 'ES' and (prev_type == next_type == 'EN'):
                chars[idx]['bidi-type'] = 'EN'

            if bidi_type == 'CS' and prev_type == next_type and \
                    prev_type in ('AN', 'EN'):
                chars[idx]['bidi-type'] = prev_type

        # W5. A sequence of European terminators adjacent to European numbers
        # changes to all European numbers.
        for idx in range(len(chars)):
            if chars[idx]['bidi-type'] == 'EN':
                # Walk left, then right, converting ET until a non-ET is hit.
                for et_idx in range(idx - 1, -1, -1):
                    if chars[et_idx]['bidi-type'] == 'ET':
                        chars[et_idx]['bidi-type'] = 'EN'
                    else:
                        break
                for et_idx in range(idx + 1, len(chars)):
                    if chars[et_idx]['bidi-type'] == 'ET':
                        chars[et_idx]['bidi-type'] = 'EN'
                    else:
                        break

        # W6. Otherwise, separators and terminators change to Other Neutral.
        for char in chars:
            if char['bidi-type'] in ('ET', 'ES', 'CS'):
                char['bidi-type'] = 'ON'

        # W7. Search backward from each instance of a European number until the
        # first strong type (R, L, or sor) is found. If an L is found, then
        # change the type of the European number to L.
        prev_strong = run['sor']
        for char in chars:
            if char['bidi-type'] == 'EN' and prev_strong == 'L':
                char['bidi-type'] = 'L'

            if char['bidi-type'] in ('L', 'R'):
                prev_strong = char['bidi-type']

    # Step 4: Resolve Neutral Types
    # See: http://unicode.org/reports/tr9/#Resolving_Neutral_Types
    for run in runs:
        start, length = run['start'], run['length']
        # Pad the run with placeholder entries carrying the sor and eor types
        # so both run boundaries can be examined like ordinary neighbours.
        chars = [{
            'bidi-type': run['sor']
        }] + augmented_char_array[start:start + length] + [{
            'bidi-type': run['eor']
        }]
        total_chars = len(chars)

        # Index of the first neutral in the sequence currently being scanned.
        seq_start = None
        for idx in range(total_chars):
            _ch = chars[idx]
            if _ch['bidi-type'] in ('B', 'S', 'WS', 'ON'):
                # N1. A sequence of neutrals takes the direction of the
                # surrounding strong text if the text on both sides has the same
                # direction. European and Arabic numbers act as if they were R
                # in terms of their influence on neutrals. Start-of-level-run
                # (sor) and end-of-level-run (eor) are used at level run
                # boundaries.
                if seq_start is None:
                    seq_start = idx
                    prev_bidi_type = chars[idx - 1]['bidi-type']
            else:
                if seq_start is not None:
                    next_bidi_type = chars[idx]['bidi-type']

                    if prev_bidi_type in ('AN', 'EN'):
                        prev_bidi_type = 'R'

                    if next_bidi_type in ('AN', 'EN'):
                        next_bidi_type = 'R'

                    for seq_idx in range(seq_start, idx):
                        if prev_bidi_type == next_bidi_type:
                            chars[seq_idx]['bidi-type'] = prev_bidi_type
                        else:
                            # N2. Any remaining neutrals take the embedding
                            # direction. The embedding direction for the given
                            # neutral character is derived from its embedding
                            # level: L if the character is set to an even level,
                            # and R if the level is odd.
                            if chars[seq_idx]['level'] % 2 == 0:
                                chars[seq_idx]['bidi-type'] = 'L'
                            else:
                                chars[seq_idx]['bidi-type'] = 'R'

                    seq_start = None

    # Step 5: Resolve Implicit Levels
    # See: http://unicode.org/reports/tr9/#Resolving_Implicit_Levels

    # Even level -> 'L', odd level -> 'R'.
    def _embedding_direction(x):
        return ('L', 'R')[x % 2]

    for run in runs:
        start, length = run['start'], run['length']
        chars = augmented_char_array[start:start + length]

        for _ch in chars:
            # only those types are allowed at this stage
            assert _ch['bidi-type'] in ('L', 'R', 'EN', 'AN'), \
                '[%s] not allowed here. Original string was: [%s]; Cur run = [%s] and cur char = [%s].' % (_ch['bidi-type'], uxxxx_str, str(chars), _ch)

            if _embedding_direction(_ch['level']) == 'L':
                # I1. For all characters with an even (left-to-right) embedding
                # direction, those of type R go up one level and those of type
                # AN or EN go up two levels.
                if _ch['bidi-type'] == 'R':
                    _ch['level'] += 1
                elif _ch['bidi-type'] != 'L':
                    _ch['level'] += 2
            else:
                # I2. For all characters with an odd (right-to-left) embedding
                # direction, those of type L, EN or AN go up one level.
                if _ch['bidi-type'] != 'R':
                    _ch['level'] += 1

    # Step 6: Reorder Resolved Levels
    # See: http://unicode.org/reports/tr9/#I2

    # Applies L1.
    should_reset = True
    chars = augmented_char_array

    # Scan from the end so trailing whitespace is seen before the text it
    # follows.
    for _ch in chars[::-1]:
        # L1. On each line, reset the embedding level of the following
        # characters to the paragraph embedding level:
        if _ch['bidi-orig-type'] in ('B', 'S'):
            # 1. Segment separators,
            # 2. Paragraph separators,
            _ch['level'] = base_level
            should_reset = True
        elif should_reset and _ch['bidi-orig-type'] in ('BN', 'WS'):
            # 3. Any sequence of whitespace characters preceding a segment
            # separator or paragraph separator
            # 4. Any sequence of white space characters at the end of the
            # line.
            _ch['level'] = base_level
        else:
            should_reset = False

    max_len = len(chars)

    # L2 should be per line
    # Calculates highest level and lowest odd level on the fly.

    line_start = line_end = 0
    highest_level = 0
    lowest_odd_level = EXPLICIT_LEVEL_LIMIT

    for idx in range(max_len):
        _ch = chars[idx]

        # calc the levels
        char_level = _ch['level']
        if char_level > highest_level:
            highest_level = char_level

        if char_level % 2 and char_level < lowest_odd_level:
            lowest_odd_level = char_level

        # A line ends at a paragraph separator or at the last character.
        if _ch['bidi-orig-type'] == 'B' or idx == max_len - 1:
            line_end = idx
            # omit line breaks
            if _ch['bidi-orig-type'] == 'B':
                line_end -= 1

            _reverse_contiguous_sequence(chars, line_start, line_end, highest_level, lowest_odd_level)

            # reset for next line run
            line_start = idx + 1
            highest_level = 0
            lowest_odd_level = EXPLICIT_LEVEL_LIMIT

    # Finally, reverse entire string
    return ' '.join([char['char'] for char in reversed(chars)])