def build(cls, n_clusters=10):
    params = {}
    params['n_clusters'] = n_clusters
    logger.info("Loading sentences")
    sents = filter_reasonable_length_sents(get_all_sents())
    logger.info("Vectorizing")
    params['vectorizer'], raw_vecs = get_vectorizer(sents)
    params['projection_mat'] = get_projection_mat(params['vectorizer'])
    vecs = raw_vecs.dot(params['projection_mat'])
    sents, projected_vecs = filter_by_norm(vecs, sents)
    logger.info("Clustering")
    params['clusterer'], cluster_dists = get_clusterer_and_dists(
        projected_vecs, n_clusters=n_clusters, random_state=0)
    logger.info("Training sub-models")
    train_models_per_cluster(
        params['clusterer'], vecs=projected_vecs, texts=sents)
    models = [
        lang_model.Model.from_basename(
            paths.model_basename('cluster_{}'.format(cluster_idx)))
        for cluster_idx in range(n_clusters)
    ]
    # Score the first 5 words of every sentence.
    params['unique_starts'] = [
        x.split()
        for x in sorted({' '.join(sent.split()[:5]) for sent in sents})
    ]
    # Flag starts containing a token that any sub-model maps to the
    # unknown-word id (0); those can't be scored fairly across clusters.
    has_unks = np.array([
        [any(model.model.vocab_index(tok) == 0 for tok in toks)
         for model in models]
        for toks in params['unique_starts']
    ])
    params['omit_unks'] = np.flatnonzero(has_unks.any(axis=1))
    params['scores_by_cluster'] = np.array(
        [[model.score_seq(model.bos_state, k)[0] for model in models]
         for k in tqdm.tqdm(params['unique_starts'], desc="Score starts")])
    return cls(**params)
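# Hypothetical usage sketch (illustrative; the owning class is not shown in
# this excerpt -- other scripts in this repo reach the built instance as
# suggestion_generator.clizer, so `Clizer` below is an assumed name):
#
#     clizer = Clizer.build(n_clusters=10)
#     clizer.scores_by_cluster.shape   # (n_unique_starts, n_clusters)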
import numpy as np
from scipy.special import expit, logsumexp

from suggestion.paths import paths
from suggestion import lang_model

Model = lang_model.Model
#%%
PRELOAD_MODELS = '''
yelp_train
yelp_train-1star
yelp_train-2star
yelp_train-3star
yelp_train-4star
yelp_train-5star'''.split()
models = {name: Model.from_basename(name, paths.model_basename(name))
          for name in PRELOAD_MODELS}
#%%
import json
prior_counts = np.array(json.load(open(paths.models / 'star_counts.json')))
#%%
class LMClassifier:
    def __init__(self, models, prior_counts,
                 sentiment_weights=[-1, -1, 0, 1, 1.]):  # alternative: [-1, -.5, 0, .5, 1]
        self.models = models
        self.prior_logprobs = np.log(prior_counts / prior_counts.sum())
        # Rescale the per-star sentiment weights so they span [0, 1].
        self.sentiment_weights = np.array(sentiment_weights, dtype=float)
        self.sentiment_weights -= np.min(self.sentiment_weights)
        self.sentiment_weights /= np.max(self.sentiment_weights)

    def get_state(self, toks, bos=False):
        models = self.models
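#%%
# Minimal sketch of how the pieces above combine (illustrative; this function
# is not in the original file, and the star-model naming below just follows
# PRELOAD_MODELS): score the tokens under each per-star LM, add the log
# prior, normalize with logsumexp, then take the expected sentiment weight
# under the resulting posterior.
def classify_seq_sketch(clf, toks):
    star_models = [clf.models['yelp_train-{}star'.format(i)]
                   for i in range(1, 6)]
    logprobs = np.array([model.score_seq(model.bos_state, toks)[0]
                         for model in star_models])
    posterior = logprobs + clf.prior_logprobs
    posterior = np.exp(posterior - logsumexp(posterior))
    return posterior.dot(clf.sentiment_weights)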
def get_or_load_model(cls, name: str) -> 'Model':
    from suggestion.paths import paths
    if name not in cls.preloaded:
        cls.preload_model(name, paths.model_basename(name))
    return cls.get_model(name)
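# Example usage (illustrative): the first call loads the KenLM model from
# disk; later calls for the same name hit the preloaded cache.
#
#     model = Model.get_or_load_model('yelp_train')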
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(min_df=5, max_df=.5, stop_words='english')
all_vecs = vectorizer.fit_transform(sents)
#%%
vocab_indices = [vectorizer.vocabulary_[w] for w in LM_seeds]
#%%
# For each seed word, collect the indices of sentences that contain it.
sents_by_cluster = [all_vecs[:, idx].nonzero()[0] for idx in vocab_indices]
#%%
from suggestion.util import dump_kenlm

for word, sent_indices in zip(LM_seeds, sents_by_cluster):
    print(word)
    dump_kenlm(f'tmp_{word}_0', [sents[idx] for idx in sent_indices])
#%%
from suggestion import lang_model
from suggestion.paths import paths
#%%
models = [
    lang_model.Model.from_basename(paths.model_basename(f'tmp_{word}_0'))
    for word in LM_seeds
]
#%%
import tqdm
scores_by_cluster = np.array(
    [[model.score_seq(model.bos_state, k.split())[0] for model in models]
     for k in tqdm.tqdm(sents, desc="Score sents")])
#%%
sbc_lmnorm = scores_by_cluster - np.mean(scores_by_cluster, axis=0)
#%%
from scipy.special import logsumexp
sbc_lse = logsumexp(scores_by_cluster, axis=1, keepdims=True)
#%%
sbc = scores_by_cluster - 1 * sbc_lse
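#%%
# Sanity check (illustrative, not in the original file): with the full
# logsumexp subtracted, each row of sbc is a log-posterior over the seed
# clusters under a uniform prior, so each exponentiated row sums to 1.
assert np.allclose(np.exp(sbc).sum(axis=1), 1.0)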
# -*- coding: utf-8 -*-
"""
Created on Fri Jun 23 14:13:45 2017

@author: kcarnold
"""
import numpy as np
from suggestion.lang_model import Model
from suggestion.paths import paths
#%%
lowercase = Model.from_basename('yelp_train', paths.model_basename('yelp_train'))
truecase = Model.from_basename('yelp_train_truecase',
                               paths.model_basename('yelp_train_truecase'))
#%%
import cytoolz
#%%
# Group the true-cased vocabulary by its lowercase form,
# e.g. 'new' -> ['new', 'New', 'NEW'].
case_options = cytoolz.groupby(lambda x: x.lower(), truecase.id2str)
#%%
def infer_true_case(sent_toks):
    state = truecase.get_state(["<S>"], bos=True)[0]
    result = []
    for tok in sent_toks:
        options = case_options.get(tok, [tok])
        if len(options) == 1:
            result.append(options[0])
            continue
        vocab_indices = [truecase.model.vocab_index(opt) for opt in options]
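#%%
# Hypothetical sketch of the elided remainder of infer_true_case: one way to
# choose among casings is to score each candidate token from the current LM
# state (score_seq is used the same way elsewhere in this repo) and keep the
# argmax. Advancing `state` past the chosen token is omitted here.
def pick_casing_sketch(state, options):
    scores = [truecase.score_seq(state, [opt])[0] for opt in options]
    return options[int(np.argmax(scores))]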
    for i, sent in enumerate(sents):
        res.append([topic_tags[c] for c in clusters_for_sents[:i + 1][-4:]]
                   + sent.lower().split())
    return res

import tqdm
from suggestion import util
util.dump_kenlm('yelp_topic_tagged', [
    ' '.join(s)
    for tokenized in tqdm.tqdm(reviews.tokenized)
    for s in review_to_tagged_sents(tokenized.split('\n'))
])
#%%
from suggestion import lang_model
topic2sentence_lm = lang_model.Model.from_basename(
    paths.model_basename('yelp_topic_tagged'))
#%%
import itertools
topic_transitions_indices = list(itertools.product(range(10), range(10)))
# Indices of the self-transitions (c1 == c2) in the row-major product order.
rev_topic_transitions_indices = [10 * i + i for i in range(10)]
#%%
transition_log_likelihoods = np.array([[
    topic2sentence_lm.score_seq(
        topic2sentence_lm.get_state([topic_tags[c1], topic_tags[c2]],
                                    bos=True)[0],
        k)[0]
    for c1, c2 in itertools.product(range(10), range(10))
] for k in tqdm.tqdm(clizer.unique_starts, desc="Score starts")])
#%%
#scores_by_cluster = scores_by_cluster_raw.copy()
#likelihood_bias = logsumexp(scores_by_cluster, axis=1, keepdims=True)
#%%
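#%%
# Sanity check (illustrative): rev_topic_transitions_indices picks out the
# self-transitions (c1 == c2) within topic_transitions_indices.
assert all(topic_transitions_indices[rev_topic_transitions_indices[i]] == (i, i)
           for i in range(10))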
clusters_to_use.fill(True)
clusters_to_use[omit_clusters] = False
clusters_to_use = np.flatnonzero(clusters_to_use)
#%%
unique_starts = [
    x.split()
    for x in sorted({' '.join(sent.split()[:5]) for sent in sents_2})
]
#%%
from suggestion import lang_model
from suggestion.paths import paths

scores_by_cluster = []
for cluster_idx in tqdm.tqdm(clusters_to_use):
    model = lang_model.Model.from_basename(
        paths.model_basename(f'yelp_bigclust_{cluster_idx}'))
    scores_by_cluster.append(
        [model.score_seq(model.bos_state, k)[0] for k in unique_starts])
#%%
# Transpose to shape (n_unique_starts, n_clusters).
sbc = np.array(scores_by_cluster).T
#%%
from scipy.special import logsumexp
# Subtract most (85%) of each start's overall likelihood so that starts that
# are frequent everywhere don't dominate every cluster's ranking.
likelihood_bias = logsumexp(sbc, axis=1, keepdims=True)
sbc2 = sbc - .85 * likelihood_bias
#%%
sbc_argsort = np.argsort(sbc2, axis=0)
#%%
import contextlib
with open('cluster_starts.txt', 'w') as f, contextlib.redirect_stdout(f):
    for cluster_idx in range(len(clusters_to_use)):
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 31 12:36:56 2017

@author: kcarnold
"""
from suggestion import suggestion_generator, lang_model
from suggestion.paths import paths
import numpy as np
from scipy.special import logsumexp
#%%
clizer = suggestion_generator.clizer
n_clusters = clizer.n_clusters
models = [
    lang_model.Model.from_basename(
        paths.model_basename('cluster_{}'.format(cluster_idx)))
    for cluster_idx in range(n_clusters)
]
#%%
# has_unks[i, j] is True iff start i contains a token that model j maps to
# the unknown-word id (0).
has_unks = np.array([[any(model.model.vocab_index(tok) == 0 for tok in toks)
                      for model in models]
                     for toks in clizer.unique_starts])
#%%
omit2 = np.flatnonzero(np.sum(has_unks, axis=1))
#%%
import re
has_review = np.array([
    bool(re.search(r'\breview(er|ed)?s?\b|\bstars?\b', ' '.join(toks)))
    for toks in clizer.unique_starts
])
#%%
scores_by_cluster = clizer.scores_by_cluster.copy()
likelihood_bias = logsumexp(scores_by_cluster, axis=1, keepdims=True)
scores_by_cluster -= likelihood_bias
#scores_by_cluster[suggested_already] = -np.inf
scores_by_cluster[omit2] = -np.inf
#scores_by_cluster[has_review] = -np.inf
scores_by_cluster[clizer.omit] = -np.inf
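#%%
# Illustrative next step (not in the original file): with unusable starts
# masked to -inf, the best remaining start for each cluster is an argmax
# down the rows of the (n_starts, n_clusters) score matrix.
best_start_per_cluster = np.argmax(scores_by_cluster, axis=0)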