Example #1
    @classmethod
    def build(cls, n_clusters=10):
        params = {}
        params['n_clusters'] = n_clusters

        logger.info("Loading sentences")
        sents = filter_reasonable_length_sents(get_all_sents())

        logger.info("Vectorizing")
        params['vectorizer'], raw_vecs = get_vectorizer(sents)
        params['projection_mat'] = get_projection_mat(params['vectorizer'])
        vecs = raw_vecs.dot(params['projection_mat'])
        sents, projected_vecs = filter_by_norm(vecs, sents)

        logger.info("Clustering")
        params['clusterer'], cluster_dists = get_clusterer_and_dists(
            projected_vecs, n_clusters=n_clusters, random_state=0)

        logger.info("Training sub-models")
        train_models_per_cluster(params['clusterer'],
                                 vecs=projected_vecs,
                                 texts=sents)

        models = [
            lang_model.Model.from_basename(
                paths.model_basename('cluster_{}'.format(cluster_idx)))
            for cluster_idx in range(n_clusters)
        ]

        #%% Score the first 5 words of every sentence.
        params['unique_starts'] = [
            x.split()
            for x in sorted({' '.join(sent.split()[:5])
                             for sent in sents})
        ]
        # Omit any start containing a token that is out-of-vocabulary
        # (vocab index 0) under any cluster model.
        has_unks = np.array([[
            any(model.model.vocab_index(tok) == 0 for tok in toks)
            for model in models
        ] for toks in params['unique_starts']])
        params['omit_unks'] = np.flatnonzero(np.any(has_unks, axis=1))
        params['scores_by_cluster'] = np.array(
            [[model.score_seq(model.bos_state, k)[0] for model in models]
             for k in tqdm.tqdm(params['unique_starts'], desc="Score starts")])
        return cls(**params)
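
    # Hypothetical usage, assuming this build() is a classmethod on the
    # cluster-based model class used elsewhere (cf. suggestion_generator.clizer):
    #   clizer = Clizer.build(n_clusters=10)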
Example #2
import numpy as np
from scipy.special import expit, logsumexp
from suggestion.paths import paths
from suggestion import lang_model
Model = lang_model.Model
#%%
PRELOAD_MODELS = '''
yelp_train
yelp_train-1star
yelp_train-2star
yelp_train-3star
yelp_train-4star
yelp_train-5star'''.split()
models = {name: Model.from_basename(name, paths.model_basename(name))
          for name in PRELOAD_MODELS}
#%%
import json
prior_counts = np.array(json.load(open(paths.models / 'star_counts.json')))

#%%
class LMClassifier:
    def __init__(self, models, prior_counts,
                 sentiment_weights=(-1, -1, 0, 1, 1)):
        # Alternative weighting considered: (-1, -.5, 0, .5, 1).
        self.models = models
        self.prior_logprobs = np.log(prior_counts / prior_counts.sum())
        # Rescale the sentiment weights to span [0, 1].
        self.sentiment_weights = np.array(sentiment_weights, dtype=float)
        self.sentiment_weights -= np.min(self.sentiment_weights)
        self.sentiment_weights /= np.max(self.sentiment_weights)

    def get_state(self, toks, bos=False):
        models = self.models
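
    # get_state is truncated above. Below is a minimal, hypothetical
    # sketch (not the original code) of how these pieces could combine:
    # per-model log-likelihoods plus the class log-prior give
    # unnormalized log-posteriors, normalized with logsumexp. Assumes
    # self.models is an ordered list of the five per-star models.
    def classify_sketch(self, toks):
        log_post = np.array([
            model.score_seq(model.bos_state, toks)[0]
            for model in self.models
        ]) + self.prior_logprobs
        return np.exp(log_post - logsumexp(log_post))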
Example #3
@classmethod
def get_or_load_model(cls, name: str) -> 'Model':
    from suggestion.paths import paths
    if name not in cls.preloaded:
        cls.preload_model(name, paths.model_basename(name))
    return cls.get_model(name)
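
# Hypothetical usage: loads the model from paths.model_basename(name) on
# first access, then serves it from the cache.
#   model = Model.get_or_load_model('yelp_train')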
Example #4
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from suggestion.util import dump_kenlm

vectorizer = TfidfVectorizer(min_df=5, max_df=.5, stop_words='english')
all_vecs = vectorizer.fit_transform(sents)
#%%
vocab_indices = [vectorizer.vocabulary_[w] for w in LM_seeds]
#%%
sents_by_cluster = [all_vecs[:, idx].nonzero()[0] for idx in vocab_indices]
#%%
for word, sent_indices in zip(LM_seeds, sents_by_cluster):
    print(word)
    dump_kenlm(f'tmp_{word}_0', [sents[idx] for idx in sent_indices])
#%%
from suggestion import lang_model
from suggestion.paths import paths
#%%
models = [
    lang_model.Model.from_basename(paths.model_basename(f'tmp_{word}_0'))
    for word in LM_seeds
]
#%%
import tqdm
scores_by_cluster = np.array(
    [[model.score_seq(model.bos_state, k)[0] for model in models]
     for k in tqdm.tqdm(sents, desc="Score sents")])
#%%
sbc_lmnorm = scores_by_cluster - np.mean(scores_by_cluster, axis=0)
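# (Centering on the per-model mean, axis=0, removes each cluster LM's
# overall score level, so values reflect relative fit across models.)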
#%%

from scipy.special import logsumexp
sbc_lse = logsumexp(scores_by_cluster, axis=1, keepdims=True)
#%%
sbc = scores_by_cluster - 1 * sbc_lse
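# Sanity check: subtracting the per-row logsumexp makes each row of
# exp(sbc) a normalized posterior over the cluster models (uniform prior).
assert np.allclose(np.exp(sbc).sum(axis=1), 1.0)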
Example #5
# -*- coding: utf-8 -*-
"""
Created on Fri Jun 23 14:13:45 2017

@author: kcarnold
"""

import numpy as np
from suggestion.lang_model import Model
from suggestion.paths import paths
#%%
lowercase = Model.from_basename('yelp_train',
                                paths.model_basename('yelp_train'))
truecase = Model.from_basename('yelp_train_truecase',
                               paths.model_basename('yelp_train_truecase'))
#%%
import cytoolz
#%%
case_options = cytoolz.groupby(lambda x: x.lower(), truecase.id2str)


#%%
def infer_true_case(sent_toks):
    state = truecase.get_state(["<S>"], bos=True)[0]
    result = []
    for tok in sent_toks:
        options = case_options.get(tok, [tok])
        if len(options) == 1:
            result.append(options[0])
            continue
        vocab_indices = [truecase.model.vocab_index(opt) for opt in options]
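        # The function is truncated here in the original. A hypothetical
        # continuation, assuming score_seq(state, toks) as used in the
        # other examples (the vocab_indices above would feed a
        # lower-level scoring API): pick the most likely casing and
        # refresh the LM state.
        scores = [truecase.score_seq(state, [opt])[0] for opt in options]
        result.append(options[int(np.argmax(scores))])
        state = truecase.get_state(["<S>"] + result, bos=True)[0]
    return result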
Example #6
    # Prefix each sentence with the topic tags of its own cluster and up
    # to three preceding sentences' clusters, then the sentence tokens.
    for i, sent in enumerate(sents):
        res.append([topic_tags[c] for c in clusters_for_sents[:i + 1][-4:]] +
                   sent.lower().split())
    return res


import tqdm
from suggestion import util
util.dump_kenlm('yelp_topic_tagged', [
    ' '.join(s) for tokenized in tqdm.tqdm(reviews.tokenized)
    for s in review_to_tagged_sents(tokenized.split('\n'))
])
#%%
from suggestion import lang_model
topic2sentence_lm = lang_model.Model.from_basename(
    paths.model_basename('yelp_topic_tagged'))
#%%
import itertools
topic_transitions_indices = list(itertools.product(range(10), range(10)))
rev_topic_transitions_indices = [10 * i + i for i in range(10)]
#%%
transition_log_likelihoods = np.array([[
    topic2sentence_lm.score_seq(
        topic2sentence_lm.get_state([topic_tags[c1], topic_tags[c2]],
                                    bos=True)[0], k)[0]
    for c1, c2 in itertools.product(range(10), range(10))
] for k in tqdm.tqdm(clizer.unique_starts, desc="Score starts")])
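# rev_topic_transitions_indices (flat index 11 * i) picks out the (i, i)
# self-transitions from the flattened 10x10 product, e.g. to compare
# staying on topic against switching:
self_transition_scores = transition_log_likelihoods[:, rev_topic_transitions_indices]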
#%%
#scores_by_cluster = scores_by_cluster_raw.copy()
#likelihood_bias = logsumexp(scores_by_cluster, axis=1, keepdims=True)
#%%
Example #7
# clusters_to_use starts as a boolean mask over clusters (its definition
# is truncated above); clear the omitted ones and convert to indices.
clusters_to_use.fill(True)
clusters_to_use[omit_clusters] = False
clusters_to_use = np.flatnonzero(clusters_to_use)
#%%
unique_starts = [
    x.split() for x in sorted({' '.join(sent.split()[:5])
                               for sent in sents_2})
]
#%%
import numpy as np
import tqdm
from suggestion import lang_model
from suggestion.paths import paths

scores_by_cluster = []
for cluster_idx in tqdm.tqdm(clusters_to_use):
    model = lang_model.Model.from_basename(
        paths.model_basename(f'yelp_bigclust_{cluster_idx}'))
    scores_by_cluster.append(
        [model.score_seq(model.bos_state, k)[0] for k in unique_starts])
#%%
sbc = np.array(scores_by_cluster).T
#%%
from scipy.special import logsumexp

likelihood_bias = logsumexp(sbc, axis=1, keepdims=True)
sbc2 = sbc - .85 * likelihood_bias
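# A coefficient of 1.0 would fully normalize each row into a posterior
# over clusters; 0.85 keeps part of the overall-likelihood signal so
# starts that are plausible under every cluster still rank well.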
#%%
sbc_argsort = np.argsort(sbc2, axis=0)
#%%
import contextlib
with open('cluster_starts.txt', 'w') as f, contextlib.redirect_stdout(f):
    for cluster_idx in range(len(clusters_to_use)):
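        # Truncated in the original; a hypothetical completion that
        # prints each cluster's top-ranked starts (argsort is ascending,
        # so take the tail in reverse):
        print('Cluster', clusters_to_use[cluster_idx])
        for idx in sbc_argsort[-10:, cluster_idx][::-1]:
            print(' ', ' '.join(unique_starts[idx]))
        print()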
Example #8
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 31 12:36:56 2017

@author: kcarnold
"""

from suggestion import suggestion_generator, lang_model
from suggestion.paths import paths
import numpy as np
from scipy.special import logsumexp

#%%
clizer = suggestion_generator.clizer
n_clusters = clizer.n_clusters
models = [
    lang_model.Model.from_basename(
        paths.model_basename('cluster_{}'.format(cluster_idx)))
    for cluster_idx in range(n_clusters)
]
#%%
has_unks = np.array([[
    any(model.model.vocab_index(tok) == 0 for tok in toks)
    for model in models
] for toks in clizer.unique_starts])
#%%
omit2 = np.flatnonzero(np.sum(has_unks, axis=1))
#%%
import re
has_review = np.array([
    bool(re.search(r'\breview(er|ed)?s?\b|\bstars?\b', ' '.join(toks)))
    for toks in clizer.unique_starts
])
#%%
scores_by_cluster = clizer.scores_by_cluster.copy()
likelihood_bias = logsumexp(scores_by_cluster, axis=1, keepdims=True)
scores_by_cluster -= likelihood_bias
#scores_by_cluster[suggested_already] = -np.inf
scores_by_cluster[omit2] = -np.inf
#scores_by_cluster[has_review] = -np.inf
scores_by_cluster[clizer.omit] = -np.inf
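#%%
# A plausible next step (not shown in the original): with unk-containing
# and omitted rows masked to -inf, rank the surviving starts per cluster.
top_starts_per_cluster = np.argsort(scores_by_cluster, axis=0)[::-1]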