# Mean unigram log-frequency per sentence; NaN when the lookup yields no
# indices for a sentence.
# Tokens whose first character is one of these are dropped before the lookup
# (presumably punctuation and '<...>' markup tokens -- confirm against the
# tokenizer that produced sent_text).
SKIPPED_FIRST_CHARS = '.,?!:-<'

word_freqs = []
for sent in tqdm.tqdm(sent_text, desc="Compute mean word llks"):
    toks = [
        tok for tok in sent.lower().split()
        if tok[0] not in SKIPPED_FIRST_CHARS
    ]
    indices = wordfreq_analyzer.lookup_indices(toks)
    # len() rather than truthiness: indices is used as an index into
    # log_freqs below and may be an array, whose truth value is ambiguous.
    if len(indices):
        wf_mean = np.mean(wordfreq_analyzer.log_freqs[indices])
    else:
        wf_mean = np.nan
    word_freqs.append(wf_mean)
word_freqs = np.array(word_freqs)
#%%
# Per-sentence mean of the per-word scores returned by the
# 'yelp_train-balanced' model.
model = suggestion_generator.get_model('yelp_train-balanced')
# Two candidate starting states: slot 0 follows ['<s>', '<D>'], slot 1
# follows ['<s>', '<S>'].
start_states = [
    model.get_state(context)[0]
    for context in (['<s>', '<D>'], ["<s>", "<S>"])
]
contextual_llks = []
sentence_bar = tqdm.tqdm(sent_text, desc="Compute mean contextual llks")
for i, sentence in enumerate(sentence_bar):
    words = sentence.lower().split()
    # Values of sent_sent_idx above 1 are clamped to 1, so they share slot 1.
    state_slot = min(sent_sent_idx[i], 1)
    word_scores = model.score_seq_by_word(start_states[state_slot], words)
    contextual_llks.append(np.mean(word_scores))
contextual_llks = np.array(contextual_llks)
#%%
# True wherever a sentence has a non-NaN mean word frequency.
valid_wordfreq = np.logical_not(np.isnan(word_freqs))
# Whitespace-token count of every sentence, and its 10th / 90th percentiles.
token_lengths = np.array([len(sentence.split()) for sentence in sent_text])
min_length, max_length = np.percentile(token_lengths, q=[10, 90])
Beispiel #2
0
# -*- coding: utf-8 -*-
"""
Created on Thu Jun  8 09:54:58 2017

@author: kcarnold
"""
#%%
import numpy as np
#%%
from suggestion import suggestion_generator
from scipy.special import expit
# Module-level alias for the sentiment classifier exposed by
# suggestion_generator.
clf = suggestion_generator.sentiment_classifier
#%%
# One language model per star rating, in the order 5-star, 3-star, 1-star.
lang_models = [
    suggestion_generator.get_model(f'yelp_train-{rating}star')
    for rating in (5, 3, 1)
]


#%%
def get_sentiment_diverse_bos(sofar,
                              toks,
                              sug_state,
                              *,
                              domain='yelp_train',
                              length_after_first=17):
    """
    Get beginning-of-sentence suggestions that are diverse in sentiment and not too repetitive.

    Approach: generate from 5-star, 3-star, and 1-star LMs, but ensure diversity of first word
    with respect to (1) the other slots and (2) the prior words used by the same LM.
Beispiel #3
0
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 16 13:43:39 2017

@author: kcarnold
"""
import itertools

import datrie
import numpy as np
import tqdm

from suggestion import suggestion_generator

# Load the model only after all imports have run, so a missing dependency
# fails before this (possibly slow -- confirm) load is attempted.
model = suggestion_generator.get_model('yelp_train')
#%%
sa = suggestion_generator.sufarr
# Bounds of the suffix-array positions matching ('<D>', '').
# NOTE(review): presumably '<D>' marks a document start -- confirm.
a, b = sa.search_range(('<D>', ''))
# Trie alphabet: every distinct item yielded by flattening the values of
# model._bigrams[0], in sorted order, followed by a space.
alphabet = set(itertools.chain.from_iterable(model._bigrams[0].values()))
chars = sorted(alphabet) + [' ']
sent_starts = datrie.Trie(''.join(chars))
for i in tqdm.tqdm(range(a, b)):
    doc_toks = sa.docs[sa.doc_idx[i]]
    # Up to five tokens following position tok_idx[i] in its document.
    start_toks = doc_toks[sa.tok_idx[i] + 1:][:5]
    sent_starts[' '.join(start_toks)] = 1
#%%
# Each trie key split back into its whitespace-separated tokens.
starts_keys = [key.split() for key in sent_starts.keys()]
#%%
# Keep only starts with exactly five tokens that contain neither a '.'
# token nor a '</S>' token.
starts_keys = [
    toks for toks in starts_keys
    if not (len(toks) != 5 or '.' in toks or '</S>' in toks)
]
#%%
# Space-joined form of every remaining start, and its length in characters.
starts_keys_join = [' '.join(toks) for toks in starts_keys]
starts_char_lens = np.array(list(map(len, starts_keys_join)))
Beispiel #4
0
                groups.append((meta, group))
                group = []
            continue
        if line[0] in string.digits:
            continue
        group.append(line)
    if group:
        groups.append((meta, group))
#%%
from suggestion import suggestion_generator
from scipy.special import expit
# Use the 'positive' entry of suggestion_generator.CLASSIFIERS.
# NOTE(review): `clf` is rebound to an LMClassifier a few lines further down.
clf = suggestion_generator.CLASSIFIERS['positive']
#%%
from suggestion.lang_model import LMClassifier
# Classifier built from the 1-, 2-, 4- and 5-star models, paired
# position-by-position with the second list (presumably per-model weights --
# confirm against LMClassifier).
clf = LMClassifier(
    [
        suggestion_generator.get_model(f'yelp_train-{rating}star')
        for rating in (1, 2, 4, 5)
    ],
    [-.5, -.5, .5, .5],
)
# Bare expression: the classification result is evaluated but not stored.
clf.classify_seq(
    clf.get_state([]),
    "i wouldn't recommend this place".split(),
)
#%%
done_indices = []
for i in range(10):
    while True:
        group_idx = np.random.choice(len(groups))
        if group_idx in done_indices:
            continue
        meta, group = groups[group_idx]
        if i == 0 and meta not in ['START', 'EARLY']:
            continue
        if meta == "EARLY" and i > 1:
            continue
        if i == 9 and meta != "END":
            continue