import itertools
import random
import string
from copy import deepcopy

from tqdm import tqdm

# Project-specific helpers referenced below (get_top_phoneme_neighbors,
# _is_good_neighbor, _clip_stopwords, make_test_sample_2, evaluate_sample,
# get_ranges_from_array, make_features_for_candidate, make_neighbors_flow,
# rerank_by_model, rank_candidates, get_pronounce_dist, wordbreak, STOPWORDS
# and config) are assumed to be defined elsewhere in the surrounding module.


def make_conversation_ngram_neihbors(word_comb: list,
                                     CLIENT_WORDS,
                                     n_top=5,
                                     BADLY_RECOGNIZED_WORDS=()):
    n = len(word_comb)
    combs = []
    # print('make ngram nbrs')
    # Consider client n-grams whose length is within one of the query length.
    for comb_ln in range(max(0, n - 1), n + 2):
        if comb_ln not in CLIENT_WORDS['conversation_ngrams']:
            continue
        for cmb in CLIENT_WORDS['conversation_ngrams'][comb_ln]:
            # Skip candidates that already contain the query phrase or that
            # include a word flagged as badly recognized.
            if ' '.join(word_comb) in cmb or max(BADLY_RECOGNIZED_WORDS[w]
                                                 for w in cmb.split()) >= 1:
                continue
            combs.append(cmb)
    combs = list(set(combs))

    # Bail out if there are too many candidate n-grams to score.
    if len(combs) > 100000:
        return []

    if not combs:
        return []
    # print('CMBS', len(combs))

    nbrs = get_top_phoneme_neighbors(' '.join(word_comb), combs)
    # print('finish ngram')
    return nbrs[:n_top]
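
# Hedged usage sketch (illustrative, not part of the original module): the
# function above seems to expect CLIENT_WORDS['conversation_ngrams'] to map an
# n-gram length to an iterable of space-joined n-grams, and
# BADLY_RECOGNIZED_WORDS to map every word to a mis-recognition score in [0, 1].
#
#   from collections import defaultdict
#   client_words = {'conversation_ngrams': {2: ['account balance', 'order status']}}
#   badly_recognized = defaultdict(float)
#   nbrs = make_conversation_ngram_neihbors(['order', 'stratus'], client_words,
#                                           BADLY_RECOGNIZED_WORDS=badly_recognized)
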
def make_pure_client_vocab_neighbors(word_comb: list,
                                     CLIENT_WORDS,
                                     n_top=5,
                                     BADLY_RECOGNIZED_WORDS=()):
    n = len(word_comb)
    combs = []
    # print(word_comb)

    # Give up early if the cartesian product over the special vocabulary would
    # be too large to enumerate.
    if len(CLIENT_WORDS['special_vocab'])**n > 50000:
        return []

    # Enumerate combinations of special-vocabulary words of length n-1 .. n,
    # skipping combinations that contain a badly recognized word.
    for comb_ln in range(max(1, n - 1), n + 1):
        vocabs = [CLIENT_WORDS['special_vocab'] for _ in range(comb_ln)]
        for cmb in itertools.product(*vocabs):
            if max(BADLY_RECOGNIZED_WORDS[w] for w in cmb) >= 0.75:
                continue
            combs.append(cmb)
    combs = list(set(combs))
    # print('get_phoneme_nbrs', len(combs))

    nbrs = get_top_phoneme_neighbors(' '.join(word_comb),
                                     [' '.join(cmb) for cmb in combs],
                                     n_top=20)

    nbrs = [nb for nb in nbrs if _is_good_neighbor(word_comb, nb)]

    return nbrs[:n_top]
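
# get_top_phoneme_neighbors is used throughout this file but is not shown. A
# minimal stand-in, assuming it ranks candidate phrases by pronunciation
# similarity to the query and returns the closest n_top, could look like the
# sketch below (a plain character-level edit distance is used here in place of
# a real phoneme-level distance):
def _edit_distance_sketch(a: str, b: str) -> int:
    # Dynamic-programming Levenshtein distance between two strings.
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        cur = [i]
        for j, cb in enumerate(b, 1):
            cur.append(min(prev[j] + 1, cur[j - 1] + 1,
                           prev[j - 1] + (ca != cb)))
        prev = cur
    return prev[-1]


def get_top_phoneme_neighbors_sketch(query: str, candidates, n_top=5):
    # Keep the n_top candidates closest to the query under the distance above.
    return sorted(candidates, key=lambda c: _edit_distance_sketch(query, c))[:n_top]
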
def make_highlighted_string(tokens,
                            indicators,
                            CLIENT_WORDS=None,
                            apply_replacements=False,
                            HELP_VOCAB=None):
    THRESHOLD = 0.5

    from stop_words import get_stop_words
    STOP_WORDS = get_stop_words('en')
    l, r = 0, 0
    N = len(tokens)
    new_tokens = []
    original_phrases = []
    while l < N:
        if indicators[l] <= THRESHOLD:
            new_tokens.append(tokens[l])
            l += 1
        else:
            # Extend the span while the indicator stays above the threshold or
            # the previous token is an apostrophe (contraction continuation).
            r = l
            while r < N and (indicators[r] > THRESHOLD
                             or tokens[max(0, r - 1)] == '\''):
                r += 1
            is_all_stopwords, pref, med, suf = _clip_stopwords(
                tokens[l:r], STOP_WORDS)
            brightness = max(indicators[l:r] + [0])
            brightness = int(brightness * 10) * 10
            # print(brightness)

            if apply_replacements:
                if brightness >= THRESHOLD and CLIENT_WORDS is not None:
                    candidates = CLIENT_WORDS['total_vocabulary'] + [
                        med
                    ]  # get_top_phoneme_neighbors(med, CLIENT_WORDS, return_scores=True)
                    badly_recognized_text = med.split()
                    sample = make_test_sample_2(med, candidates, HELP_VOCAB)
                    # print(sample)
                    # print('-----------------')
                    result = evaluate_sample(sample, med)
                    tooltip = 'data-tooltip="{}"'.format('; '.join(
                        [x[0] for x in result[1:20]]))
                else:
                    tooltip = ''
            else:
                if brightness >= THRESHOLD and CLIENT_WORDS is not None:
                    candidates = get_top_phoneme_neighbors(
                        med, CLIENT_WORDS['total_vocabulary'])
                    tooltip = 'data-tooltip="{}"'.format('; '.join(candidates))
                else:
                    tooltip = ''

            if not is_all_stopwords:
                new_tokens.append(
                    '{} <strong class="brightness-{}" {}>{}</strong> {}'.
                    format(pref, 100, tooltip, med, suf))
            else:
                new_tokens.append('{} {} {}'.format(pref, med, suf))
            original_phrases.append(tokens[l:r])
            l = r
    text = ' '.join(new_tokens)
    return text
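
# Hedged usage sketch (illustrative only): tokens and indicators are parallel
# lists; spans whose indicator exceeds the 0.5 threshold are wrapped in
# <strong> tags, with a data-tooltip of candidate replacements when
# CLIENT_WORDS (a dict holding a 'total_vocabulary' list) is provided.
#
#   tokens = ['please', 'call', 'ansen', 'tomorrow']
#   indicators = [0.1, 0.2, 0.9, 0.1]
#   html = make_highlighted_string(tokens, indicators, CLIENT_WORDS=None)
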
def prepare_samples_for_cb(samples, hard_coded_candidates=None):
    cb_samples = []

    for index, s in enumerate(tqdm(samples)):
        source = deepcopy(s['tokens'])
        target = deepcopy(s['gt_tokens'])
        # Spans of mis-recognized tokens in the ASR output and the matching
        # spans in the ground-truth transcript; they must align one-to-one.
        asr_ranges = get_ranges_from_array(s['verbose_labels'],
                                           lambda x: x != 0,
                                           lambda x: any(y != 2 for y in x))
        gt_ranges = get_ranges_from_array(s['gt_tokens_labels'],
                                          lambda x: x != 0, lambda x: True)
        assert len(asr_ranges) == len(gt_ranges)
        for rg_s, rg_t in zip(asr_ranges, gt_ranges):
            l_s, r_s = rg_s
            l_t, r_t = rg_t
            badly_recognized_text = ' '.join(source[l_s:r_s])
            target_text = ' '.join(target[l_t:r_t])
            #             print(badly_recognized_text, '->', target_text)
            # "podskazki" (Russian for "hints"): the candidate replacement
            # phrases offered for the badly recognized span.
            if hard_coded_candidates is None:
                if random.random() < 0.5:
                    podskazki = get_top_phoneme_neighbors(
                        badly_recognized_text,
                        set(list(s['help_vocab'].keys())) - {target_text},
                        n_top=10)
                else:
                    podskazki = set(
                        random.sample(list(s['help_vocab'].keys()),
                                      min(10, len(
                                          s['help_vocab'])))) - {target_text}
                    podskazki = list(podskazki)
            else:
                podskazki = hard_coded_candidates
            for p in podskazki:
                sample_features = make_features_for_candidate(
                    orig_text=source[l_s:r_s],
                    candidate_text=p.split(),
                    l_context_index=l_s,
                    r_context_index=r_s,
                    client_vocab=s['help_vocab'])
                cb_samples.append({
                    'features': sample_features,
                    'target': 0,
                    'badly_recognized_text': badly_recognized_text,
                    'candidate_text': p,
                    'context': source,
                    'context_with_substituted_candidate':
                        source[:l_s] + p.split() + source[r_s:],
                    'group_index': index,
                    # 'target_text': target_text
                })
            true_target_text_features = make_features_for_candidate(
                orig_text=source[l_s:r_s],
                candidate_text=target[l_t:r_t],
                l_context_index=l_s,
                r_context_index=r_s,
                client_vocab=s['help_vocab'])
            cb_samples.append({
                'features': true_target_text_features,
                'target': 1,
                'badly_recognized_text': badly_recognized_text,
                'candidate_text': target_text,
                'context': source,
                'context_with_substituted_candidate':
                    source[:l_s] + target_text.split() + source[r_s:],
                'group_index': index,
                # 'target_text': target_text
            })

    # group_index_2_orig_sentence = {}
    # for sample in cb_samples:
    #     group_index_2_orig_sentence[sample['group_index']] = sample['context']
    # group_indexes, orig_sentences = list(zip(*list(sorted(group_index_2_orig_sentence.items()))))
    # orig_sentence_predictions = gather_predictions_for_sentences([' '.join(s) for s in orig_sentences])
    # group_index_2_orig_sentence_score = dict(zip(group_indexes, orig_sentence_predictions))

    # candidate_substituted_sentences = []
    # for sample in cb_samples:
    #     candidate_substituted_sentences.append(' '.join(sample['context_with_substituted_candidate']))

    # candidate_substituted_sentences_predictions = gather_predictions_for_sentences(
    #                 candidate_substituted_sentences)

    # for i, sample in enumerate(cb_samples):
    #     sample['orig_sentence_score'] = group_index_2_orig_sentence_score[sample['group_index']]
    #     sample['candidate_substituted_sentence_score'] = \
    #             candidate_substituted_sentences_predictions[i]

    # samples = make_context_features_cb_samples(cb_samples)

    return cb_samples
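
# Hedged sketch of the input shape expected above (reconstructed from the field
# accesses; values are illustrative only): each sample provides the ASR tokens,
# the ground-truth tokens, per-token labels marking the mis-recognized spans in
# both, and a per-conversation help vocabulary.
#
#   samples = [{
#       'tokens': [...],            # ASR output tokens
#       'gt_tokens': [...],         # ground-truth tokens
#       'verbose_labels': [...],    # non-zero entries mark badly recognized ASR tokens
#       'gt_tokens_labels': [...],  # non-zero entries mark the matching ground-truth span
#       'help_vocab': {...},        # client-specific vocabulary
#   }]
#   cb_samples = prepare_samples_for_cb(samples)
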
def make_sample(sent, labels, conversation_specific_tokens=None):
    def normal_word_to_replace(w):
        if any(x not in string.ascii_letters + ' ' for x in w):
            return False
        if w in STOPWORDS:
            return False
        return True

    indexes = [i for i in range(len(sent)) if labels[i] != 0]
    # Group consecutive labeled positions into contiguous (start, end) ranges.
    ranges = []
    idx = 0
    k = len(indexes)
    while idx < k:
        l = idx
        r = idx + 1
        while r < k and indexes[r] == indexes[l] + (r - l):
            r += 1
        idx = r
        ranges.append((indexes[l], indexes[r - 1] + 1))

    sample = {'target': ' '.join(sent), 'source': deepcopy(sent)}

    specific_words = config['client_specific_words'] + list(
        conversation_specific_tokens or [])

    client_specific_candidates = []

    # Walk the ranges right-to-left so the slice replacement below does not
    # shift the indices of ranges that are still to be processed.
    for rg in reversed(ranges):
        l, r = rg
        spelling = []
        if any(not normal_word_to_replace(w) for w in sample['source'][l:r]):
            continue
        for w in sample['source'][l:r]:
            print(w)
            w_joined = ''.join([x.lower() for x in w.split() if x])
            w_candidates = get_top_phoneme_neighbors(w_joined,
                                                     specific_words,
                                                     n_top=10)

            local_candidates = w_candidates + [w.lower()]

            local_candidates = list(
                sorted(set(local_candidates),
                       key=lambda x: get_pronounce_dist(w, x)))

            client_specific_candidates.extend(local_candidates)
            spell = wordbreak(w_joined)[0]
            spelling.extend(spell)
        sample['source'][l:r] = ['#'] + spelling + ['#']

    # client_specific_candidates = list(set(client_specific_candidates))
    # random.shuffle(client_specific_candidates)

    suffix = ' $% ' + ' ; '.join(client_specific_candidates)

    bad_punct = ''.join(x for x in string.punctuation
                        if x not in ['\'', '#', '$', '%'])
    sample['source'] = [t for t in sample['source'] if t not in bad_punct]

    sample['source'] = ' '.join(sample['source']) + suffix
    return sample
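
# Hedged usage sketch (illustrative only): `sent` is a token list and `labels`
# marks the positions to be respelled; each marked span is replaced by its
# phoneme spelling wrapped in '#' markers, and client-specific candidate words
# (from the global config plus conversation_specific_tokens) are appended after
# a ' $% ' separator in sample['source'].
#
#   sent = ['please', 'call', 'ansen']
#   labels = [0, 0, 1]
#   sample = make_sample(sent, labels, conversation_specific_tokens=['anson'])
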
def make_highlighted_string_2(tokens,
                              indicators,
                              BADLY_RECOGNIZED_WORDS,
                              ALL_TOKENS,
                              CLIENT_WORDS=None,
                              pure_phonemes=False,
                              HELP_VOCAB=None,
                              prev_texts=None,
                              post_texts=None,
                              THRESCHOLD=0.5):

    from stop_words import get_stop_words
    STOP_WORDS = get_stop_words('en')
    l, r = 0, 0
    N = len(tokens)
    new_tokens = []
    original_phrases = []

    bad_ranges = []

    while l < N:
        if indicators[l] <= THRESCHOLD:
            new_tokens.append(tokens[l])
            l += 1
        else:
            # Extend the span while the indicator stays above the threshold or
            # the previous token is an apostrophe (contraction continuation).
            r = l
            while r < N and (indicators[r] > THRESCHOLD
                             or tokens[max(0, r - 1)] == '\''):
                r += 1
            is_all_stopwords, pref, med, suf = _clip_stopwords(
                tokens[l:r], STOP_WORDS)
            pref_toks = pref.split()
            bad_toks = med.split()
            suf_toks = suf.split()
            if is_all_stopwords:
                for t in pref_toks + bad_toks + suf_toks:
                    if not t:
                        continue
                    new_tokens.append(t)
            else:
                brightness = max(indicators[l:r] + [0])
                brightness = int(brightness * 10) * 10
                for t in pref_toks:
                    if not t:
                        continue
                    new_tokens.append(t)

                bad_ranges.append(
                    (len(new_tokens), len(new_tokens) + len(bad_toks),
                     brightness))

                for t in bad_toks:
                    if not t:
                        continue
                    new_tokens.append(t)

                for t in suf_toks:
                    if not t:
                        continue
                    new_tokens.append(t)
            l = r

    # Debug output inspecting a handful of specific words.
    print('BADLY_RECOGNIZED_WORDS[anson]', BADLY_RECOGNIZED_WORDS['anson'])
    print('BADLY_RECOGNIZED_WORDS[temple]', BADLY_RECOGNIZED_WORDS['temple'])
    print('BADLY_RECOGNIZED_WORDS[template]',
          BADLY_RECOGNIZED_WORDS['template'])
    print('BADLY_RECOGNIZED_WORDS[bot]', BADLY_RECOGNIZED_WORDS['bot'])

    new_tokens_copy = deepcopy(new_tokens)
    # Substitute ranges right-to-left so earlier indices remain valid after
    # each in-place replacement below.
    bad_ranges.sort(reverse=True)
    for l, r, brightness in bad_ranges:
        # print("lr", l, r)
        med = ' '.join(new_tokens[l:r])
        if brightness >= THRESCHOLD and CLIENT_WORDS is not None:
            if pure_phonemes:
                candidates = get_top_phoneme_neighbors(
                    med, CLIENT_WORDS['total_vocabulary'])
            else:
                candidates = make_neighbors_flow(
                    med,
                    CLIENT_WORDS,
                    n_top=10,
                    BADLY_RECOGNIZED_WORDS=BADLY_RECOGNIZED_WORDS)

            # Build a context template with a placeholder for the bad span and
            # rerank the candidates against it.
            central_tokens = new_tokens_copy[:l] + ['{}'] + new_tokens_copy[r:]
            template = prev_texts + ' ' + ' '.join(
                central_tokens) + ' ' + post_texts
            candidates = rerank_by_model(template, candidates)

            if candidates:
                pure_cand_words, nll_cands_scores = zip(*candidates)

                candidates = rank_candidates(new_tokens[l:r], pure_cand_words,
                                             nll_cands_scores,
                                             BADLY_RECOGNIZED_WORDS)

            # tooltip = 'data-tooltip="{}"'.format('; '.join(candidates))
            tooltip = 'data-tooltip="{}"'.format('; '.join(
                [f'{x}' for (x, y) in candidates]))
        else:
            tooltip = ''

        brightness = 100
        new_tokens[l:r] = [
            '<strong class="brightness-{}" {}>{}</strong>'.format(
                brightness, tooltip, med)
        ]
    # print(new_tokens)
    text = ' '.join(new_tokens)
    return text
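
# Hedged usage sketch (illustrative only): compared with make_highlighted_string,
# this variant first collects every low-confidence range, then fills each
# tooltip either with pure phoneme neighbors or with make_neighbors_flow
# candidates reranked by rerank_by_model over a template built from the
# surrounding prev_texts / post_texts (both must be strings when CLIENT_WORDS
# is given).
#
#   html = make_highlighted_string_2(tokens, indicators,
#                                    BADLY_RECOGNIZED_WORDS=badly_recognized,
#                                    ALL_TOKENS=tokens,
#                                    CLIENT_WORDS=client_words,
#                                    prev_texts='', post_texts='',
#                                    THRESCHOLD=0.5)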