def print_top_n_surps(model, acm):
    # Parallelised version -- takes forever, not sure why, leaving this for now
    #top_surps = Parallel(n_jobs=cores)(delayed(top_n_surps_from_doc)(doc,
    #                                                                 model,
    #                                                                 acm.cooccurrence,
    #                                                                 acm.word_occurrence,
    #                                                                 acm.dictionary,
    #                                                                 acm.total_docs)
    #                                   for doc in acm.documents)
    #top_surps = sorted(itertools.chain.from_iterable(top_surps))
    #top_surps = top_surps[:10]

    # Collect the ten most surprising pairs from each document, dedupe, then
    # keep the overall top ten.
    top_surps = []
    for doc in acm.documents:
        if len(doc):
            top_surps += evaluate.estimate_document_surprise_pairs(doc, model, acm)[:10]
    top_surps = list(set(top_surps))
    top_surps.sort(key=lambda x: x[2], reverse=False)
    top_surps = top_surps[:10]

    print "Top 10 surprising combos"
    w1s = []
    w2s = []
    w1_occs = []
    w2_occs = []
    est_surps = []
    est_coocs = []
    obs_coocs = []
    obs_surps = []
    for surp in top_surps:
        w1s.append(surp[0])
        w2s.append(surp[1])
        w1_occ = acm.word_occurrence[surp[0]]
        w2_occ = acm.word_occurrence[surp[1]]
        w1_occs.append(w1_occ)
        w2_occs.append(w2_occ)
        est_surps.append(surp[2])
        wk1 = acm.dictionary.token2id[surp[0]]
        wk2 = acm.dictionary.token2id[surp[1]]
        est_coocs.append(
            evaluate.estimate_word_pair_cooccurrence(wk1, wk2, model, acm.cooccurrence))
        w1_w2_cooccurrence = acm.cooccurrence[wk1][wk2]
        obs_coocs.append(w1_w2_cooccurrence)
        # Observed surprise from raw counts, for comparison with the model's estimate.
        obs_surp = evaluate.word_pair_surprise(w1_w2_cooccurrence, w1_occ, w2_occ,
                                               len(acm.documents))
        obs_surps.append(obs_surp)

    tab = PrettyTable()
    tab.add_column("Word 1", w1s)
    tab.add_column("Word 2", w2s)
    tab.add_column("W1 occs", w1_occs)
    tab.add_column("W2 occs", w2_occs)
    tab.add_column("Obs. cooc", obs_coocs)
    tab.add_column("Obs. surp", obs_surps)
    tab.add_column("Est. cooc", est_coocs)
    tab.add_column("Est. surp", est_surps)
    tab.float_format = ".4"
    print tab
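# As consumed above, each element returned by estimate_document_surprise_pairs
# is a (word1, word2, estimated_surprise) tuple, e.g. a hypothetical
# ("piano", "volcano", 0.91) -- the third field is what the sort keys on.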
def document_cooccurrence_to_surprise(surps, fc, doc, cooc_mat, word_occurrence,
                                      dictionary, key_map, n_docs,
                                      ignore_order=True):
    # NOTE: ignore_order is currently unused; walking the upper triangle of
    # the co-occurrence matrix already treats each pair as unordered.
    surp_list = []
    for i1, i2 in zip(*np.triu_indices(cooc_mat.shape[0], k=1, m=cooc_mat.shape[1])):
        w1 = doc[i1][0]
        w2 = doc[i2][0]
        # If the words aren't in this famcat's model, don't make any
        # predictions on them.
        if w1 != w2 and w1 in key_map and w2 in key_map:
            s = evaluate.word_pair_surprise(
                cooc_mat[i1, i2],
                word_occurrence[dictionary[key_map[w1]]],
                word_occurrence[dictionary[key_map[w2]]],
                n_docs)
            # Accumulate (famcat, surprise) under the pair's global keys.
            surps.setdefault(key_map[w1], {}).setdefault(key_map[w2], []).append((fc, s))
            surp_list.append((dictionary[key_map[w1]], dictionary[key_map[w2]], s))
    return surp_list
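# A minimal, self-contained sketch of the accumulation pattern used above:
# hypothetical global word keys 3 and 7, two famcats, illustrative surprise
# values (not from a real model). `surps` ends up mapping
# word key -> word key -> list of (famcat, surprise) pairs.
surps = {}
for fc, s in [("news", 0.37), ("fiction", 0.82)]:
    surps.setdefault(3, {}).setdefault(7, []).append((fc, s))
# surps is now {3: {7: [("news", 0.37), ("fiction", 0.82)]}}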
def combine_surprise_across_famcats_for_user_normalised(w1_occs_by_famcat,
                                                        w2_occs_by_famcat,
                                                        coocs_by_famcat,
                                                        n_docs, user):
    # Multiply word occurrences in each category by the user's familiarity
    # with that category, sum them all up and calculate surprise.
    cooc = sum(c * fc for c, fc in zip(coocs_by_famcat, user))
    w1_occs = sum(c * fc for c, fc in zip(w1_occs_by_famcat, user))
    w2_occs = sum(c * fc for c, fc in zip(w2_occs_by_famcat, user))
    return evaluate.word_pair_surprise(cooc, w1_occs, w2_occs, n_docs)
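# A worked sketch of the familiarity weighting with hypothetical counts: two
# famcats, and a user who is 0.8 familiar with the first and 0.2 with the
# second. The weighted counts then feed evaluate.word_pair_surprise unchanged.
coocs_by_famcat = [10, 2]
w1_occs_by_famcat = [120, 30]
w2_occs_by_famcat = [90, 15]
user = [0.8, 0.2]
cooc = sum(c * fc for c, fc in zip(coocs_by_famcat, user))       # 8.4
w1_occs = sum(c * fc for c, fc in zip(w1_occs_by_famcat, user))  # 102.0
w2_occs = sum(c * fc for c, fc in zip(w2_occs_by_famcat, user))  # 75.0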
def print_top_n_surps(model, acm, top_n):
    # Collect the ten most surprising pairs from each document, dedupe, then
    # keep the overall top_n.
    top_surps = []
    for doc in acm.documents:
        if len(doc):
            top_surps += evaluate.estimate_document_surprise_pairs(doc, model, acm)[:10]
    top_surps = list(set(top_surps))
    top_surps.sort(key=lambda x: x[2], reverse=False)
    top_surps = top_surps[:top_n]

    print "Top %d surprising combos" % top_n
    w1s = []
    w2s = []
    w1_occs = []
    w2_occs = []
    est_surps = []
    est_coocs = []
    obs_coocs = []
    obs_surps = []
    for surp in top_surps:
        w1s.append(surp[0])
        w2s.append(surp[1])
        w1_occ = acm.word_occurrence[surp[0]]
        w2_occ = acm.word_occurrence[surp[1]]
        w1_occs.append(w1_occ)
        w2_occs.append(w2_occ)
        est_surps.append(surp[2])
        wk1 = acm.dictionary.token2id[surp[0]]
        wk2 = acm.dictionary.token2id[surp[1]]
        est_coocs.append(
            evaluate.estimate_word_pair_cooccurrence(wk1, wk2, model, acm.cooccurrence))
        w1_w2_cooccurrence = acm.cooccurrence[wk1][wk2]
        obs_coocs.append(w1_w2_cooccurrence)
        obs_surp = evaluate.word_pair_surprise(w1_w2_cooccurrence, w1_occ, w2_occ,
                                               len(acm.documents))
        obs_surps.append(obs_surp)

    tab = PrettyTable()
    tab.add_column("Word 1", w1s)
    tab.add_column("Word 2", w2s)
    tab.add_column("W1 occs", w1_occs)
    tab.add_column("W2 occs", w2_occs)
    tab.add_column("Obs. cooc", obs_coocs)
    tab.add_column("Obs. surp", obs_surps)
    tab.add_column("Est. cooc", est_coocs)
    tab.add_column("Est. surp", est_surps)
    tab.float_format = ".4"
    print tab
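# Hypothetical usage, assuming a trained `model` and a corpus object `acm`
# built earlier in the pipeline:
# print_top_n_surps(model, acm, top_n=20)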
def export_observed_surprise(self):
    if self.use_famcats:
        raise NotImplementedError
    with open(self.filepath + self.argstring + "_observed_surprise.csv", "wb") as ef:
        writer = csv.writer(ef)
        vocab_size = len(self.dictionary.id2token)
        surpmat = np.zeros((vocab_size, vocab_size))
        for wk1, w1 in self.dictionary.id2token.iteritems():
            for wk2, w2 in self.dictionary.id2token.iteritems():
                if wk1 != wk2:
                    # Pairs that never co-occur are absent from the sparse
                    # co-occurrence dict, so treat a KeyError as zero.
                    try:
                        cooc = self.cooccurrence[wk1][wk2]
                    except KeyError:
                        cooc = 0
                    surpmat[wk1, wk2] = evaluate.word_pair_surprise(
                        cooc, self.word_occurrence[w1],
                        self.word_occurrence[w2], len(self.doc_ids))
        writer.writerows(list(surpmat))
    logger.info(" **** Observed surprise exported.")
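# A sketch of reading the exported matrix back for inspection; the filename is
# illustrative and stands in for whatever filepath + argstring produced above.
import numpy as np
surpmat = np.loadtxt("run_observed_surprise.csv", delimiter=",")
print surpmat.shape  # (vocab_size, vocab_size)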
def combine_surprise_normalised(surps, reader, user):
    for w1, w1_pairs in surps.iteritems():
        for w2 in w1_pairs:
            w1_str = reader.dictionary[w1]
            w2_str = reader.dictionary[w2]
            # Per-famcat occurrence counts, defaulting to zero where a word
            # never appears in that category.
            w1_occs_by_famcat = [
                reader.word_occurrence[fc][w1_str]
                if w1_str in reader.word_occurrence[fc] else 0
                for fc in reader.famcats
            ]
            w2_occs_by_famcat = [
                reader.word_occurrence[fc][w2_str]
                if w2_str in reader.word_occurrence[fc] else 0
                for fc in reader.famcats
            ]
            coocs_by_famcat = []
            for fc in reader.famcats:
                try:
                    cooc = reader.cooccurrence[fc][
                        reader.all_keys_to_per_fc_keys[fc][w1]][
                        reader.all_keys_to_per_fc_keys[fc][w2]]
                except KeyError:
                    cooc = 0
                coocs_by_famcat.append(cooc)
            # Multiply word occurrences in each category by the user's
            # familiarity with that category, sum them all up and calculate
            # surprise.
            cooc = sum(c * user[fc]
                       for c, fc in zip(coocs_by_famcat, reader.famcats))
            w1_occs = sum(c * user[fc]
                          for c, fc in zip(w1_occs_by_famcat, reader.famcats))
            w2_occs = sum(c * user[fc]
                          for c, fc in zip(w2_occs_by_famcat, reader.famcats))
            surps[w1][w2].append(
                ("combined",
                 evaluate.word_pair_surprise(cooc, w1_occs, w2_occs,
                                             reader.total_docs)))
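# Unlike the positional list taken by
# combine_surprise_across_famcats_for_user_normalised, here `user` is indexed
# by famcat name. A hypothetical user weighting for illustration:
# user = {"news": 0.8, "fiction": 0.2}
# combine_surprise_normalised(surps, reader, user)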
def export_estimated_surprise(self, model, use_sglove=False):
    if self.use_famcats:
        raise NotImplementedError
    with open(self.filepath + self.argstring + "_estimated_surprise.csv", "wb") as ef:
        writer = csv.writer(ef)
        vocab_size = len(self.dictionary.id2token)
        surpmat = np.zeros((vocab_size, vocab_size))
        for wk1, w1 in self.dictionary.id2token.iteritems():
            for wk2, w2 in self.dictionary.id2token.iteritems():
                if wk1 != wk2:
                    # Surprise from the model's estimated co-occurrence
                    # rather than the observed counts.
                    surpmat[wk1, wk2] = evaluate.word_pair_surprise(
                        evaluate.estimate_word_pair_cooccurrence(
                            wk1, wk2, model, self.cooccurrence,
                            use_sglove=use_sglove),
                        self.word_occurrence[w1], self.word_occurrence[w2],
                        len(self.doc_ids))
        writer.writerows(list(surpmat))
    logger.info(" **** Estimated surprise exported.")
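# A sketch comparing the two exports once both have run; filenames are
# illustrative, and both matrices share the same vocabulary-indexed shape.
import numpy as np
obs = np.loadtxt("run_observed_surprise.csv", delimiter=",")
est = np.loadtxt("run_estimated_surprise.csv", delimiter=",")
print np.abs(obs - est).mean()  # mean absolute gap between observed and estimated surprise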
def print_top_n_surps(model, reader, top_n, famcat=None):
    top_surps = []
    if famcat is None:
        # Global version: rank pairs across the whole corpus.
        for doc in reader.documents:
            if len(doc):
                top_surps += evaluate.estimate_document_surprise_pairs(
                    doc, model, reader.cooccurrence, reader.word_occurrence,
                    reader.dictionary, reader.documents,
                    use_sglove=reader.use_sglove)[:10]
        top_surps = list(set(top_surps))
        top_surps.sort(key=lambda x: x[2], reverse=True)
        top_surps = top_surps[:top_n]
    else:
        # Personalised version: only documents tagged with this famcat.
        for doc, fcs in zip(reader.documents, reader.doc_famcats):
            if len(doc) and famcat in fcs:
                top_surps += evaluate_personalised.estimate_personalised_document_surprise_pairs_one_fc(
                    doc, model, famcat, reader)[:10]
        top_surps = list(set(top_surps))
        top_surps.sort(key=lambda x: x[2], reverse=True)
        top_surps = top_surps[:top_n]

    print "Top %d surprising combos" % top_n
    w1s = []
    w2s = []
    w1_occs = []
    w2_occs = []
    est_surps = []
    est_coocs = []
    obs_coocs = []
    obs_surps = []
    for surp in top_surps:
        if famcat is None:
            w1s.append(surp[0])
            w2s.append(surp[1])
            w1_occ = reader.word_occurrence[surp[0]]
            w2_occ = reader.word_occurrence[surp[1]]
            w1_occs.append(w1_occ)
            w2_occs.append(w2_occ)
            est_surps.append(surp[2])
            wk1 = reader.dictionary.token2id[surp[0]]
            wk2 = reader.dictionary.token2id[surp[1]]
            est_coocs.append(
                evaluate.estimate_word_pair_cooccurrence(
                    wk1, wk2, model, reader.cooccurrence))
            w1_w2_cooccurrence = reader.cooccurrence[wk1][wk2]
            obs_coocs.append(w1_w2_cooccurrence)
            obs_surp = evaluate.word_pair_surprise(w1_w2_cooccurrence, w1_occ,
                                                   w2_occ, len(reader.documents))
            obs_surps.append(obs_surp)
        else:
            w1s.append(surp[0])
            w2s.append(surp[1])
            w1_occ = reader.word_occurrence[famcat][surp[0]]
            w2_occ = reader.word_occurrence[famcat][surp[1]]
            w1_occs.append(w1_occ)
            w2_occs.append(w2_occ)
            est_surps.append(surp[2])
            # Map global dictionary keys to this famcat's local keys.
            fc_wk1 = reader.all_keys_to_per_fc_keys[famcat][
                reader.dictionary.token2id[surp[0]]]
            fc_wk2 = reader.all_keys_to_per_fc_keys[famcat][
                reader.dictionary.token2id[surp[1]]]
            est_coocs.append(
                evaluate_personalised.estimate_word_pair_cooccurrence(
                    fc_wk1, fc_wk2, model, reader.cooccurrence[famcat]))
            # Pairs never observed together in this famcat are absent from
            # the sparse co-occurrence dict, so treat a KeyError as zero.
            try:
                w1_w2_cooccurrence = reader.cooccurrence[famcat][fc_wk1][fc_wk2]
            except KeyError:
                w1_w2_cooccurrence = 0
            obs_coocs.append(w1_w2_cooccurrence)
            obs_surp = evaluate.word_pair_surprise(w1_w2_cooccurrence, w1_occ,
                                                   w2_occ,
                                                   reader.docs_per_fc[famcat])
            obs_surps.append(obs_surp)

    tab = PrettyTable()
    tab.add_column("Word 1", w1s)
    tab.add_column("Word 2", w2s)
    tab.add_column("W1 occs", w1_occs)
    tab.add_column("W2 occs", w2_occs)
    tab.add_column("Obs. cooc", obs_coocs)
    tab.add_column("Obs. surp", obs_surps)
    tab.add_column("Est. cooc", est_coocs)
    tab.add_column("Est. surp", est_surps)
    tab.float_format = ".4"
    print tab
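# Hypothetical usage: global top-20, then top-20 within a single familiarity
# category (the category name is illustrative):
# print_top_n_surps(model, reader, 20)
# print_top_n_surps(model, reader, 20, famcat="music")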