def train_models_per_cluster(mbk, vecs, texts):
    # Group each sentence under the cluster the fitted clusterer predicts for it,
    # then train one KenLM model per cluster on the lowercased sentences.
    sentences_in_cluster = [[] for i in range(mbk.n_clusters)]
    for i, c in enumerate(mbk.predict(vecs)):
        sentences_in_cluster[c].append(texts[i])
    for cluster_idx, cluster in enumerate(sentences_in_cluster):
        print(cluster_idx)
        dump_kenlm('cluster_{}'.format(cluster_idx), [s.lower() for s in cluster])
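#%%
# Hedged usage sketch (assumption, not part of the original pipeline): the function above
# expects a fitted clusterer exposing .n_clusters and .predict (e.g. sklearn's
# MiniBatchKMeans) plus the sentence vectors and corresponding raw texts. The toy data
# here is purely illustrative; dump_kenlm still needs the KenLM binaries to be installed.
from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_extraction.text import TfidfVectorizer
toy_texts = ["Great food.", "Slow service.", "Nice location.", "Friendly staff."]
toy_vecs = TfidfVectorizer().fit_transform(toy_texts)
toy_mbk = MiniBatchKMeans(n_clusters=2, random_state=0).fit(toy_vecs)
train_models_per_cluster(toy_mbk, toy_vecs, toy_texts)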
#%%
sents = clustering.filter_reasonable_length_sents(clustering.get_all_sents())
#%%
LM_seeds = 'food service location ambiance value'.split()
#%%
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df=5, max_df=.5, stop_words='english')
all_vecs = vectorizer.fit_transform(sents)
#%%
vocab_indices = [vectorizer.vocabulary_[w] for w in LM_seeds]
#%%
# For each seed word, collect the indices of sentences that contain it.
sents_by_cluster = [all_vecs[:, idx].nonzero()[0] for idx in vocab_indices]
#%%
for word, sent_indices in zip(LM_seeds, sents_by_cluster):
    print(word)
    dump_kenlm(f'tmp_{word}_0', [sents[idx] for idx in sent_indices])
#%%
from suggestion import lang_model
from suggestion.paths import paths
#%%
models = [
    lang_model.Model.from_basename(paths.model_basename(f'tmp_{word}_0'))
    for word in LM_seeds]
#%%
import numpy as np
import tqdm
scores_by_cluster = np.array(
    [[model.score_seq(model.bos_state, k)[0] for model in models]
     for k in tqdm.tqdm(sents, desc="Score sents")])
#%%
# Center each seed LM's scores so per-model offsets don't dominate comparisons.
sbc_lmnorm = scores_by_cluster - np.mean(scores_by_cluster, axis=0)
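#%%
# Hedged follow-up sketch (assumption, not in the original cell): with the per-model
# scores centered, the argmax over seed-word LMs gives a rough topic label per sentence;
# the bincount just shows how many sentences land on each seed.
best_cluster_per_sent = np.argmax(sbc_lmnorm, axis=1)
for word, count in zip(LM_seeds, np.bincount(best_cluster_per_sent, minlength=len(LM_seeds))):
    print(word, count)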
def dump_indices(name, indices):
    # Prefix document-initial sentences with "<D> " and all others with "<S> ", close each
    # with " </S>", deduplicate, and train a KenLM model on the result.
    dump_kenlm(
        name,
        sorted({("<D> " if sent_sent_idx_2[i] == 0 else "<S> ") + sent_text_2[i].lower() + " </S>"
                for i in tqdm.tqdm(indices, desc=name)}))
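#%%
# Tiny illustration of the marker scheme above (hypothetical sentences, not repo data):
# a sentence index of 0 appears to mean "first sentence of its document", so it gets the
# "<D> " prefix, while later sentences get "<S> ".
demo_sent_idx = [0, 1, 0]
demo_sent_text = ["Great food.", "We'll be back.", "Slow service."]
for i in range(len(demo_sent_text)):
    print(("<D> " if demo_sent_idx[i] == 0 else "<S> ") + demo_sent_text[i].lower() + " </S>")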
recent_texts = list(
    not_rts[not_rts.created_at > datetime.date(2016, 1, 1)].text)
# Oversample recent tweets 5x relative to the full (non-retweet) history.
tweets_duplicating_recent = list(not_rts.text) + recent_texts * 5
#%%
def preprocess(tweet_text):
    return URL_RE.sub(' ', remove_handles(_replace_html_entities(tweet_text)))

from suggestion import train_ngram

def tokenize(text):
    text = ' '.join(text.split())
    # Collapse runs of repeated ? or ! into a single mark.
    text = re.sub(r'([?!])\1+', r'\1', text)
    # text = URL_RE.sub(' ', text)
    sents = nltk.sent_tokenize(text)
    # Use our simple word tokenizer, since spacy breaks apart contractions.
    token_spaced_sents = (
        ' '.join(sent[a:b] for a, b in token_spans(sent))
        for sent in sents)
    return '\n'.join(token_spaced_sents)
#%%
import tqdm
from suggestion.util import dump_kenlm
dump_kenlm(
    'tweeterinchief',
    (' '.join(train_ngram.convert_tokenization(tokenize(preprocess(text))))
     for text in tqdm.tqdm(tweets_duplicating_recent)))
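#%%
# Quick sanity check of the punctuation-collapsing step in tokenize() above
# (illustrative only, not part of the original script):
import re
print(re.sub(r'([?!])\1+', r'\1', "Really??? So unfair!!!"))  # -> "Really? So unfair!"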
def review_to_tagged_sents(sents):
    # Assign each sentence to its nearest topic cluster, then prepend the topic tags of
    # the current and up to 3 preceding sentences as pseudo-words before the sentence tokens.
    cluster_distances = cytoolz.thread_first(
        sents,
        clizer.vectorize_sents,
        clustering.normalize_vecs,
        clizer.clusterer.transform)
    clusters_for_sents = np.argmin(cluster_distances, axis=1)
    res = []
    for i, sent in enumerate(sents):
        res.append([topic_tags[c] for c in clusters_for_sents[:i + 1][-4:]] + sent.lower().split())
    return res

import tqdm
from suggestion import util
util.dump_kenlm('yelp_topic_tagged', [
    ' '.join(s)
    for tokenized in tqdm.tqdm(reviews.tokenized)
    for s in review_to_tagged_sents(tokenized.split('\n'))])
#%%
from suggestion import lang_model
topic2sentence_lm = lang_model.Model.from_basename(
    paths.model_basename('yelp_topic_tagged'))
#%%
import itertools
topic_transitions_indices = list(itertools.product(range(10), range(10)))
# Positions of self-transitions (same topic twice) within the product list above.
rev_topic_transitions_indices = [10 * i + i for i in range(10)]
#%%
transition_log_likelihoods = np.array([[
    topic2sentence_lm.score_seq(
        topic2sentence_lm.get_state([topic_tags[c1], topic_tags[c2]], bos=True)[0],
        k)[0]
    for c1, c2 in itertools.product(range(10), range(10))
def preprocess_csv(input_filename, model_name, lowercase=True):
    import pandas as pd
    data = pd.read_csv(input_filename)
    dump_kenlm(
        model_name,
        (' '.join(convert_tokenization(tokenize(text), lowercase=lowercase))
         for text in tqdm.tqdm(data.Text)))
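# Hedged usage sketch (the file and model names below are hypothetical placeholders):
# preprocess_csv expects a CSV with a `Text` column and dumps a KenLM model under model_name.
# Example: preprocess_csv('my_corpus.csv', 'my_corpus_lm', lowercase=True)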
print("Loading...", flush=True) data = pd.read_pickle(args.input) reviews = data['data'] tokenized_reviews = [convert_tokenization(tokenized, lowercase=not args.no_lower) for tokenized in tqdm.tqdm(reviews.tokenized, desc="Converting format")] if args.split_stars: counts = [] for stars in [1, 2, 3, 4, 5]: star_indices = np.flatnonzero(reviews.stars_review == stars) counts.append(len(star_indices)) # star_indices = np.random.choice(star_indices, size=args.subsample_stars, replace=False) dump_kenlm( f"{args.model_name}-{stars}star", (' '.join(tokenized_reviews[idx]) for idx in tqdm.tqdm(star_indices, desc=f"Writing {stars}-star")), order=args.star_ngram_order) json.dump(counts, open('models/star_counts.json', 'w')) bucket_size = min(counts) dump_kenlm( f"{args.model_name}-balanced", (' '.join(tokenized_reviews[idx]) for stars in tqdm.trange(1, 6, desc="Writing balanced") for idx in np.random.choice(np.flatnonzero(reviews.stars_review == stars), bucket_size, replace=False)), order=args.order) for stars_group in ['12', '45']: indices = [] for stars in stars_group: stars = int(stars)
np.argsort(np.bincount(closest))
#%%
dist_to_closest_cluster = np.min(dists_to_centers, axis=1)
is_close = dist_to_closest_cluster < np.median(dist_to_closest_cluster)
#[sents_2[idx] for idx in np.argsort(dist_to_closest_cluster)[-50:]]
#%%
omit_clusters = []
# Train LMs on each cluster
from suggestion.util import dump_kenlm
for cluster_idx in range(n_clusters):
    sents_in_cluster = np.flatnonzero((closest == cluster_idx) & is_close)
    if len(sents_in_cluster) < 50:
        # Too few close sentences; mark this cluster to be excluded below.
        omit_clusters.append(cluster_idx)
    print(cluster_idx)
    dump_kenlm(f'yelp_bigclust_{cluster_idx}', [sents_2[idx] for idx in sents_in_cluster])
#%%
clusters_to_use = np.zeros(n_clusters, dtype=bool)
clusters_to_use.fill(True)
clusters_to_use[omit_clusters] = False
clusters_to_use = np.flatnonzero(clusters_to_use)
#%%
# Deduplicated 5-word sentence openings, as token lists.
unique_starts = [
    x.split()
    for x in sorted({' '.join(sent.split()[:5]) for sent in sents_2})]
#%%
from suggestion import lang_model
from suggestion.paths import paths
scores_by_cluster = []