Example #1
0
def test_tokenization():
    """Check the boundary tokens emitted by ``tokenize_sofar``.

    A mid-sentence prefix should get document-start markers only; a
    prefix ending in a completed sentence should additionally close the
    sentence and open a new one.
    """
    mid_sentence = ['<s>', '<D>', 'this', 'place']
    assert suggestion_generator.tokenize_sofar('this place ') == mid_sentence

    after_period = ['<s>', '<D>', 'this', 'place', '.', '</S>', '<S>']
    assert suggestion_generator.tokenize_sofar('this place. ') == after_period
Example #2
0
def get_content_stats_single_suggestion(sugg, word_freq_analyzer):
    """Compute per-slot content statistics for one logged suggestion event.

    For each suggestion slot this yields the contextual log-likelihood of
    the suggested phrase under the domain language model, its mean unigram
    log-frequency, and a scalar sentiment score.

    Args:
        sugg: dict with keys 'flags', 'cur_word', 'sofar', 'recs', and
            'request_id' -- assumed logged-suggestion schema; TODO confirm
            against the logging code.
        word_freq_analyzer: object exposing `word2idx` (token -> index, or
            None for OOV) and `log_freqs` (indexable log-frequencies).

    Returns:
        A list of dicts (one per suggestion slot), or None when the event
        is skipped: non-yelp domain, mid-word suggestion, or tokenization
        failure.
    """
    from suggestion import suggestion_generator
    sugg = sugg.copy()
    meta = sugg.pop('flags')

    if not meta['domain'].startswith('yelp'):
        return

    if sugg['cur_word']:
        # Skip partial words.
        return

    model = suggestion_generator.Model.get_or_load_model(meta['domain'])
    try:
        toks = suggestion_generator.tokenize_sofar(sugg['sofar'])
    except Exception:
        # Tokenization failed; skip this event rather than abort the whole
        # analysis. (Was a bare `except:`, which would also have swallowed
        # KeyboardInterrupt/SystemExit.)
        return
    # Optimization: trim context to the n-gram level, plus some padding.
    toks = toks[-10:]
    state = model.get_state(toks)[0]
    clf_startstate = suggestion_generator.sentiment_classifier.get_state(toks)
    res = []
    for sugg_slot, rec in enumerate(sugg['recs']['predictions']):
        phrase = rec['words']
        if phrase:
            sentiment_posteriors = suggestion_generator.sentiment_classifier.classify_seq_by_tok(clf_startstate, phrase)
            # Mean posterior over tokens, projected onto scalar sentiment weights.
            sentiment = np.mean(sentiment_posteriors, axis=0) @ suggestion_generator.sentiment_classifier.sentiment_weights
        else:
            # Empty phrase: no sentiment to compute.
            sentiment = None
        # word2idx.get returns None for out-of-vocabulary tokens; those
        # become NaN below and are ignored by nanmean.
        analyzer_indices = [word_freq_analyzer.word2idx.get(tok) for tok in phrase]
        res.append(dict(
            request_id=sugg['request_id'],
            sugg_slot=sugg_slot,
            sugg_contextual_llk=model.score_seq(state, phrase)[0],
            sugg_unigram_llk=np.nanmean(np.array([word_freq_analyzer.log_freqs[idx] if idx is not None else np.nan for idx in analyzer_indices])),
            sugg_sentiment=sentiment))
    return res
Example #3
0
    # Now just beam-search forward from each.
    beam_search_results = [
        suggestion_generator.beam_search_phrases_loop(
            model, [ent],
            start_idx=1,
            beam_width=50,
            length_after_first=length_after_first,
            **beam_search_kwargs)
        for model, ent in zip(lang_models, first_ents)
    ]
    phrases = [(ent.words, dict(score=ent.score, type='bos'))
               for ents in beam_search_results for ent in ents[:1]]
    return phrases, sug_state


# Demo loop: repeatedly extend the review text and show the three
# sentiment-diverse beginning-of-sentence suggestions with their sentiment
# scores. NOTE(review): relies on `suggestion_generator`, `clf`, and
# `get_sentiment_diverse_bos` from earlier notebook cells.
sug_state = {}
sofar = ''
for i in range(10):
    toks = suggestion_generator.tokenize_sofar(sofar)
    clf_state = clf.get_state(toks)
    # sug_state threads suggestion history through successive calls.
    phrases, sug_state = get_sentiment_diverse_bos(sofar, toks, sug_state)
    # One column per suggestion slot: the phrase, then its sentiment score.
    print('{:30s} {:30s} {:30s}'.format(
        *[' '.join(phr) for phr, meta in phrases]))
    print('{:<30.2f} {:<30.2f} {:<30.2f}'.format(
        *[clf.sentiment(clf_state, phr) for phr, meta in phrases]))
    print()
    # Grow the context by one fixed sentence each iteration.
    sofar += ' i used to come here every week. '
#%%
# One more call on the fully-grown context (result inspected interactively).
toks = suggestion_generator.tokenize_sofar(sofar)
phrases, sug_state = get_sentiment_diverse_bos(sofar, toks, sug_state)
Example #4
0
        #        assert word == model.id2str[picked_idx]
        phrase.append(word)
        generated_logprobs[i] = np.log(probs[picked_subidx])
    return phrase, generated_logprobs

for context in [
        '',
        'my',
        'the lunch menu',
        'we',
        'i could not imagine',
        'absolutely',
        "i love",
        "my first",
]:
    context_toks = suggestion_generator.tokenize_sofar(context + ' ')
    print('\n\n', context)
    print("- Exploratory")
    for i in range(5):
        print(
            '', ' '.join(
                generate_phrase_from_sufarr(model,
                                            sufarr,
                                            context_toks,
                                            6,
                                            temperature=1)[0]))
    print("- Max likelihood")
    print('\n'.join([
        ' ' + ' '.join(x['words'])
        for x in suggestion_generator.beam_search_phrases(
            model, context_toks, 100, 30)[:5]
Example #5
0
def test_odd_tokenization():
    """Regression check: tokenizing a lone '. ' must not raise."""
    suggestion_generator.tokenize_sofar('. ')
#%%
# bin 6 seems high, and bin 1. Why?
# Inspect the first 20 vocabulary words assigned to word-frequency bin 6
# (bare expression: its value is displayed by the notebook cell).
[model.id2str[idx] for idx in np.flatnonzero(wf_bins == 6)[:20]]
# Normalized frequency ranks (0..1) of the first 20 words in bin 0.
[
    wf_bins_rank[idx] / wf_bins_rank.max()
    for idx in np.flatnonzero(wf_bins == 0)[:20]
]
#model.unigram_probs[]
#%%
# Candidate next words: every vocabulary index that is not a special token.
next_words = np.flatnonzero(~model.is_special)
for sofar in [
        '', 'best', 'i', 'i really', 'i love their',
        'i love their vegan huevos', 'i love their turkey', 'this'
]:
    sofar = sofar + ' '
    toks = suggestion_generator.tokenize_sofar(
        sofar)  #'<D> best breakfast menu of all places such as'.split()

    state = model.get_state(toks, bos=True)[0]
    #    next_words, logprobs = model.next_word_logprobs_raw(state, toks[-1])
    logprobs = model.eval_logprobs_for_words(state, next_words)

    #logprobs[logprobs < np.percentile(logprobs, 75)] = -np.
    bins_for_next_words = wf_bins[next_words]
    logprobs += 100
    bin_probs = logprobs @ np.eye(10)[bins_for_next_words]
    # ^^ this line is suspect.
    logprobs -= 100
    #    bin_probs -= mean_probs
    bin_probs -= logsumexp(bin_probs)
    chosen_bin = np.argmax(bin_probs)
    indices_in_bin = np.flatnonzero(bins_for_next_words == chosen_bin)