# Shared imports for the helpers below.
import itertools
import random
import string
from copy import deepcopy

from tqdm import tqdm


def make_conversation_ngram_neihbors(word_comb: list, CLIENT_WORDS, n_top=5, BADLY_RECOGNIZED_WORDS=()):
    """Collect phoneme neighbors of `word_comb` among n-grams seen earlier in the conversation.

    `BADLY_RECOGNIZED_WORDS` is expected to be a word -> score mapping (e.g. a defaultdict);
    n-grams containing a word with score >= 1 are skipped.
    """
    n = len(word_comb)
    combs = []
    # Consider conversation n-grams whose length is close to the length of the query span.
    for comb_ln in range(max(0, n - 1), n + 2):
        if comb_ln not in CLIENT_WORDS['conversation_ngrams']:
            continue
        for cmb in CLIENT_WORDS['conversation_ngrams'][comb_ln]:
            if ' '.join(word_comb) in cmb or max(BADLY_RECOGNIZED_WORDS[w] for w in cmb.split()) >= 1:
                continue
            combs.append(cmb)
    combs = list(set(combs))
    if len(combs) > 100000:
        return []
    if not combs:
        return []
    nbrs = get_top_phoneme_neighbors(' '.join(word_comb), combs)
    return nbrs[:n_top]
def make_pure_client_vocab_neighbors(word_comb: list, CLIENT_WORDS, n_top=5, BADLY_RECOGNIZED_WORDS=()):
    """Collect phoneme neighbors of `word_comb` among combinations of client-specific vocabulary."""
    n = len(word_comb)
    combs = []
    # Bail out if the cartesian product of the special vocabulary would be too large.
    if len(CLIENT_WORDS['special_vocab']) ** n > 50000:
        return []
    for comb_ln in range(max(1, n - 1), n + 1):
        vocabs = [CLIENT_WORDS['special_vocab'] for _ in range(comb_ln)]
        for cmb in itertools.product(*vocabs):
            # Skip combinations built from words that are themselves usually misrecognized.
            if max(BADLY_RECOGNIZED_WORDS[w] for w in cmb) >= 0.75:
                continue
            combs.append(cmb)
    combs = list(set(combs))
    nbrs = get_top_phoneme_neighbors(' '.join(word_comb),
                                     [' '.join(cmb) for cmb in combs],
                                     n_top=20)
    nbrs = [nb for nb in nbrs if _is_good_neighbor(word_comb, nb)]
    return nbrs[:n_top]
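# A minimal usage sketch of the two neighbor generators above. The CLIENT_WORDS layout is
# inferred from the lookups they perform; the vocabulary below is made up for illustration,
# and BADLY_RECOGNIZED_WORDS is assumed to be a word -> score mapping with a default of 0.0.
# Illustrative only, never called from this module.
def _demo_make_neighbors():
    from collections import defaultdict

    client_words = {
        'special_vocab': ['anson', 'temple', 'bot'],
        'conversation_ngrams': {1: ['anson', 'bot'], 2: ['anson bot']},
        'total_vocabulary': ['anson', 'temple', 'bot'],
    }
    badly_recognized = defaultdict(float)
    ngram_nbrs = make_conversation_ngram_neihbors(
        ['handsome'], client_words, n_top=5, BADLY_RECOGNIZED_WORDS=badly_recognized)
    vocab_nbrs = make_pure_client_vocab_neighbors(
        ['handsome'], client_words, n_top=5, BADLY_RECOGNIZED_WORDS=badly_recognized)
    return ngram_nbrs, vocab_nbrs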
def make_highlighted_string(tokens, indicators, CLIENT_WORDS=None, apply_replacements=False, HELP_VOCAB=None):
    THRESCHOLD = 0.5
    from stop_words import get_stop_words
    STOP_WORDS = get_stop_words('en')
    l, r = 0, 0
    N = len(tokens)
    new_tokens = []
    original_phrases = []
    while l < N:
        if indicators[l] <= THRESCHOLD:
            new_tokens.append(tokens[l])
            l += 1
        else:
            # Grow the span of badly recognized tokens; an apostrophe token keeps the span open.
            r = l
            while r < N and (indicators[r] > THRESCHOLD or tokens[max(0, r - 1)] == '\''):
                r += 1
            is_all_stopwords, pref, med, suf = _clip_stopwords(tokens[l:r], STOP_WORDS)
            brightness = max(indicators[l:r] + [0])
            brightness = int(brightness * 10) * 10
            if apply_replacements:
                if brightness >= THRESCHOLD and CLIENT_WORDS is not None:
                    candidates = CLIENT_WORDS['total_vocabulary'] + [med]
                    # get_top_phoneme_neighbors(med, CLIENT_WORDS, return_scores=True)
                    sample = make_test_sample_2(med, candidates, HELP_VOCAB)
                    result = evaluate_sample(sample, med)
                    tooltip = 'data-tooltip="{}"'.format('; '.join([x[0] for x in result[1:20]]))
                else:
                    tooltip = ''
            else:
                if brightness >= THRESCHOLD and CLIENT_WORDS is not None:
                    candidates = get_top_phoneme_neighbors(med, CLIENT_WORDS['total_vocabulary'])
                    tooltip = 'data-tooltip="{}"'.format('; '.join(candidates))
                else:
                    tooltip = ''
            if not is_all_stopwords:
                new_tokens.append('{} <strong class="brightness-{}" {}>{}</strong> {}'.format(
                    pref, 100, tooltip, med, suf))
            else:
                new_tokens.append('{} {} {}'.format(pref, med, suf))
            original_phrases.append(tokens[l:r])
            l = r
    text = ' '.join(new_tokens)
    return text
def prepare_samples_for_cb(samples, hard_coded_candidates=None):
    """Build per-candidate training samples for the candidate ranking model.

    For every badly recognized span, candidate replacements ("podskazki", i.e. suggestions)
    are labeled 0 and the ground-truth replacement is labeled 1.
    """
    cb_samples = []
    for index, s in enumerate(tqdm(samples)):
        source = deepcopy(s['tokens'])
        target = deepcopy(s['gt_tokens'])
        asr_ranges = get_ranges_from_array(s['verbose_labels'],
                                           lambda x: x != 0,
                                           lambda x: any(y != 2 for y in x))
        gt_ranges = get_ranges_from_array(s['gt_tokens_labels'],
                                          lambda x: x != 0,
                                          lambda x: True)
        assert len(asr_ranges) == len(gt_ranges)
        for rg_s, rg_t in zip(asr_ranges, gt_ranges):
            l_s, r_s = rg_s
            l_t, r_t = rg_t
            badly_recognized_text = ' '.join(source[l_s:r_s])
            target_text = ' '.join(target[l_t:r_t])
            if hard_coded_candidates is None:
                # Negative candidates: either phoneme neighbors of the bad span or a random
                # subset of the client vocabulary, never including the ground truth.
                if random.random() < 0.5:
                    podskazki = get_top_phoneme_neighbors(
                        badly_recognized_text,
                        set(list(s['help_vocab'].keys())) - {target_text},
                        n_top=10)
                else:
                    podskazki = set(random.sample(
                        list(s['help_vocab'].keys()),
                        min(10, len(s['help_vocab'])))) - {target_text}
                podskazki = list(podskazki)
            else:
                podskazki = hard_coded_candidates
            for p in podskazki:
                sample_features = make_features_for_candidate(
                    orig_text=source[l_s:r_s],
                    candidate_text=p.split(),
                    l_context_index=l_s,
                    r_context_index=r_s,
                    client_vocab=s['help_vocab'])
                cb_samples.append({
                    'features': sample_features,
                    'target': 0,
                    'badly_recognized_text': badly_recognized_text,
                    'candidate_text': p,
                    'context': source,
                    'context_with_substituted_candidate': source[:l_s] + p.split() + source[r_s:],
                    'group_index': index,
                    # 'target_text': target_text
                })
            true_target_text_features = make_features_for_candidate(
                orig_text=source[l_s:r_s],
                candidate_text=target[l_t:r_t],
                l_context_index=l_s,
                r_context_index=r_s,
                client_vocab=s['help_vocab'])
            cb_samples.append({
                'features': true_target_text_features,
                'target': 1,
                'badly_recognized_text': badly_recognized_text,
                'candidate_text': target_text,
                'context': source,
                'context_with_substituted_candidate': source[:l_s] + target_text.split() + source[r_s:],
                'group_index': index,
                # 'target_text': target_text
            })

    # Optional (currently disabled) step: score the original and candidate-substituted
    # sentences with a sentence-level model and attach the scores as extra features.
    # group_index_2_orig_sentence = {}
    # for sample in cb_samples:
    #     group_index_2_orig_sentence[sample['group_index']] = sample['context']
    # group_indexes, orig_sentences = list(zip(*list(sorted(group_index_2_orig_sentence.items()))))
    # orig_sentence_predictions = gather_predictions_for_sentences([' '.join(s) for s in orig_sentences])
    # group_index_2_orig_sentence_score = dict(zip(group_indexes, orig_sentence_predictions))
    # candidate_substituted_sentences = []
    # for sample in cb_samples:
    #     candidate_substituted_sentences.append(' '.join(sample['context_with_substituted_candidate']))
    # candidate_substituted_sentences_predictions = gather_predictions_for_sentences(
    #     candidate_substituted_sentences)
    # for i, sample in enumerate(cb_samples):
    #     sample['orig_sentence_score'] = group_index_2_orig_sentence_score[sample['group_index']]
    #     sample['candidate_substituted_sentence_score'] = \
    #         candidate_substituted_sentences_predictions[i]
    # samples = make_context_features_cb_samples(cb_samples)
    return cb_samples
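# A minimal sketch of turning the samples above into a grouped training set. It assumes
# that "cb" stands for CatBoost and that make_features_for_candidate returns a flat numeric
# feature vector; both are assumptions, not facts from this module. Illustrative only.
def _demo_fit_cb_ranker(cb_samples):
    from catboost import CatBoostClassifier, Pool

    pool = Pool(
        data=[s['features'] for s in cb_samples],
        label=[s['target'] for s in cb_samples],
        group_id=[s['group_index'] for s in cb_samples],  # one group per source utterance
    )
    model = CatBoostClassifier(iterations=200, verbose=False)
    model.fit(pool)
    return model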
def make_sample(sent, labels, conversation_specific_tokens=None):
    """Turn a labeled sentence into a seq2seq sample: badly recognized spans are replaced by
    their phoneme spelling (between '#' markers) and client-specific candidates are appended
    after a '$%' separator."""

    def normal_word_to_replace(w):
        if any(x not in string.ascii_letters + ' ' for x in w):
            return False
        if w in STOPWORDS:
            return False
        return True

    # Collect the indexes of labeled tokens and merge consecutive indexes into ranges.
    indexes = [i for i in range(len(sent)) if labels[i] != 0]
    ranges = []
    idx = 0
    k = len(indexes)
    while idx < k:
        l = idx
        r = idx + 1
        while r < k and indexes[r] == indexes[l] + (r - l):
            r += 1
        idx = r
        ranges.append((indexes[l], indexes[r - 1] + 1))

    sample = {'target': ' '.join(sent), 'source': deepcopy(sent)}
    specific_words = config['client_specific_words'] + list(conversation_specific_tokens or [])
    client_specific_candidates = []
    # Process ranges right-to-left so earlier indexes stay valid after slice replacement.
    for rg in reversed(ranges):
        l, r = rg
        spelling = []
        if any(not normal_word_to_replace(w) for w in sample['source'][l:r]):
            continue
        for w in sample['source'][l:r]:
            w_joined = ''.join([x.lower() for x in w.split() if x])
            w_candidates = get_top_phoneme_neighbors(w_joined, specific_words, n_top=10)
            local_candidates = w_candidates + [w.lower()]
            local_candidates = list(
                sorted(set(local_candidates), key=lambda x: get_pronounce_dist(w, x)))
            client_specific_candidates.extend(local_candidates)
            spell = wordbreak(w_joined)[0]
            spelling.extend(spell)
        sample['source'][l:r] = ['#'] + spelling + ['#']
    # client_specific_candidates = list(set(client_specific_candidates))
    # random.shuffle(client_specific_candidates)
    suffix = ' $% ' + ' ; '.join(client_specific_candidates)
    bad_punct = ''.join(x for x in string.punctuation if x not in ['\'', '#', '$', '%'])
    sample['source'] = [t for t in sample['source'] if t not in bad_punct]
    sample['source'] = ' '.join(sample['source']) + suffix
    return sample
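# Illustrative call of make_sample (made-up tokens and labels; assumes config['client_specific_words'],
# STOPWORDS, wordbreak and get_pronounce_dist are available in this module). The exact phoneme
# spelling and candidate order depend on those helpers; the comment below only sketches the shape.
def _demo_make_sample():
    sent = ['please', 'call', 'ansel', 'tomorrow']
    labels = [0, 0, 1, 0]  # flag "ansel" as badly recognized
    sample = make_sample(sent, labels, conversation_specific_tokens=['anson'])
    # sample['source'] looks roughly like:
    #   "please call # AE N S AH L # tomorrow $% anson ; ansel"
    # i.e. the flagged span is replaced by its phoneme spelling between '#' markers, and the
    # client-specific candidates are appended after the '$%' separator.
    return sample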
def make_highlighted_string_2(tokens, indicators, BADLY_RECOGNIZED_WORDS, ALL_TOKENS,
                              CLIENT_WORDS=None, pure_phonemes=False, HELP_VOCAB=None,
                              prev_texts=None, post_texts=None, THRESCHOLD=0.5):
    from stop_words import get_stop_words
    STOP_WORDS = get_stop_words('en')
    l, r = 0, 0
    N = len(tokens)
    new_tokens = []
    bad_ranges = []
    # First pass: copy tokens and remember the ranges of badly recognized spans.
    while l < N:
        if indicators[l] <= THRESCHOLD:
            new_tokens.append(tokens[l])
            l += 1
        else:
            # Grow the span; an apostrophe token keeps the span open.
            r = l
            while r < N and (indicators[r] > THRESCHOLD or tokens[max(0, r - 1)] == '\''):
                r += 1
            is_all_stopwords, pref, med, suf = _clip_stopwords(tokens[l:r], STOP_WORDS)
            pref_toks = pref.split()
            bad_toks = med.split()
            suf_toks = suf.split()
            if is_all_stopwords:
                for t in pref_toks + bad_toks + suf_toks:
                    if not t:
                        continue
                    new_tokens.append(t)
            else:
                brightness = max(indicators[l:r] + [0])
                brightness = int(brightness * 10) * 10
                for t in pref_toks:
                    if not t:
                        continue
                    new_tokens.append(t)
                bad_ranges.append((len(new_tokens), len(new_tokens) + len(bad_toks), brightness))
                for t in bad_toks:
                    if not t:
                        continue
                    new_tokens.append(t)
                for t in suf_toks:
                    if not t:
                        continue
                    new_tokens.append(t)
            l = r

    new_tokens_copy = deepcopy(new_tokens)
    # Second pass: go right-to-left so indexes stay valid after slice replacement.
    bad_ranges.sort(reverse=True)
    for l, r, brightness in bad_ranges:
        med = ' '.join(new_tokens[l:r])
        if brightness >= THRESCHOLD and CLIENT_WORDS is not None:
            if pure_phonemes:
                candidates = get_top_phoneme_neighbors(med, CLIENT_WORDS['total_vocabulary'])
            else:
                candidates = make_neighbors_flow(
                    med, CLIENT_WORDS, n_top=10,
                    BADLY_RECOGNIZED_WORDS=BADLY_RECOGNIZED_WORDS)
            # Rerank candidates with a language model over the surrounding context:
            # the bad span is replaced by a '{}' placeholder in the template.
            central_tokens = new_tokens_copy[:l] + ['{}'] + new_tokens_copy[r:]
            template = prev_texts + ' ' + ' '.join(central_tokens) + ' ' + post_texts
            candidates = rerank_by_model(template, candidates)
            if candidates:
                pure_cand_words, nll_cands_scores = zip(*candidates)
                candidates = rank_candidates(new_tokens[l:r], pure_cand_words,
                                             nll_cands_scores, BADLY_RECOGNIZED_WORDS)
            # tooltip = 'data-tooltip="{}"'.format('; '.join(candidates))
            tooltip = 'data-tooltip="{}"'.format('; '.join([f'{x}' for (x, y) in candidates]))
        else:
            tooltip = ''
        brightness = 100
        new_tokens[l:r] = [
            '<strong class="brightness-{}" {}>{}</strong>'.format(brightness, tooltip, med)
        ]
    text = ' '.join(new_tokens)
    return text
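# A minimal usage sketch of the highlighter above (made-up tokens and indicator scores;
# with CLIENT_WORDS=None no candidate lookup happens, so only the <strong> markup is shown).
def _demo_make_highlighted_string_2():
    tokens = ['we', 'spoke', 'to', 'ann', 'son', 'yesterday']
    indicators = [0.1, 0.1, 0.1, 0.9, 0.8, 0.1]  # "ann son" flagged as badly recognized
    html = make_highlighted_string_2(tokens, indicators,
                                     BADLY_RECOGNIZED_WORDS={}, ALL_TOKENS=tokens,
                                     CLIENT_WORDS=None)
    # html looks roughly like:
    #   'we spoke to <strong class="brightness-100" >ann son</strong> yesterday'
    return html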