Example #1
0
def print_top_n_surps(model, acm):
    """Print a table of the 10 most surprising word pairs across the corpus.

    For every non-empty document the 10 best estimated surprise pairs are
    collected; the running list is deduplicated, sorted by surprise score,
    and truncated to 10. Each surviving (word1, word2, score) triple is then
    re-scored against the observed cooccurrence counts so estimated and
    observed values can be compared side by side in one table.

    Args:
        model: model forwarded to the ``evaluate`` helpers.
        acm: corpus object exposing ``documents``, ``word_occurrence``,
            ``dictionary``, and ``cooccurrence``.

    Side effects: prints a PrettyTable to stdout. Returns None.
    """
    top_surps = []
    for doc in acm.documents:
        if len(doc):
            # Each document contributes at most its 10 best pairs.
            top_surps += evaluate.estimate_document_surprise_pairs(
                doc, model, acm)[:10]
            top_surps = list(set(top_surps))
            # NOTE(review): reverse=False keeps the LOWEST scores; the
            # famcat-aware variant of this function sorts with reverse=True.
            # Confirm which ordering "most surprising" corresponds to.
            top_surps.sort(key=lambda x: x[2], reverse=False)
            top_surps = top_surps[:10]

    print("Top 10 surprising combos")
    w1s = []
    w2s = []
    w1_occs = []
    w2_occs = []
    est_surps = []
    est_coocs = []
    obs_coocs = []
    obs_surps = []
    for surp in top_surps:
        # surp is (word1, word2, estimated_surprise).
        w1s.append(surp[0])
        w2s.append(surp[1])
        w1_occ = acm.word_occurrence[surp[0]]
        w2_occ = acm.word_occurrence[surp[1]]
        w1_occs.append(w1_occ)
        w2_occs.append(w2_occ)
        est_surps.append(surp[2])
        wk1 = acm.dictionary.token2id[surp[0]]
        wk2 = acm.dictionary.token2id[surp[1]]
        est_coocs.append(
            evaluate.estimate_word_pair_cooccurrence(wk1, wk2, model,
                                                     acm.cooccurrence))
        w1_w2_cooccurrence = acm.cooccurrence[wk1][wk2]
        obs_coocs.append(w1_w2_cooccurrence)
        obs_surp = evaluate.word_pair_surprise(w1_w2_cooccurrence, w1_occ,
                                               w2_occ, len(acm.documents))
        obs_surps.append(obs_surp)

    tab = PrettyTable()
    tab.add_column("Word 1", w1s)
    tab.add_column("Word 2", w2s)
    tab.add_column("W1 occs", w1_occs)
    tab.add_column("W2 occs", w2_occs)
    tab.add_column("Obs. cooc", obs_coocs)
    tab.add_column("Obs. surp", obs_surps)
    tab.add_column("Est. cooc", est_coocs)
    tab.add_column("Est. surp", est_surps)
    tab.float_format = ".4"
    print(tab)
def document_cooccurrence_to_surprise(surps,
                                      fc,
                                      doc,
                                      cooc_mat,
                                      word_occurrence,
                                      dictionary,
                                      key_map,
                                      n_docs,
                                      ignore_order=True):
    """Score every upper-triangular word pair of ``doc`` for surprise.

    Walks the strict upper triangle of ``cooc_mat`` (so each unordered pair
    is visited once), computes ``evaluate.word_pair_surprise`` for pairs of
    distinct words known to this famcat's model, and records the score both
    in the shared ``surps`` nested dict (mutated in place, keyed by mapped
    word keys, values are lists of ``(fc, score)``) and in the returned
    flat list of ``(word1, word2, score)`` triples.

    Args:
        surps: nested dict accumulator, mutated in place.
        fc: famcat identifier tagged onto each recorded score.
        doc: sequence whose items expose the word at index 0.
        cooc_mat: 2-D cooccurrence matrix aligned with ``doc`` positions.
        word_occurrence: per-word occurrence counts keyed by word string.
        dictionary: maps mapped keys back to word strings.
        key_map: word -> model key; words absent here are skipped.
        n_docs: total document count for the surprise formula.
        ignore_order: kept for interface compatibility (unused here).

    Returns:
        list of (word1_str, word2_str, surprise) triples.
    """
    surp_list = []
    for i1, i2 in zip(
            *np.triu_indices(cooc_mat.shape[0], k=1, m=cooc_mat.shape[1])):
        w1 = doc[i1][0]
        w2 = doc[i2][0]
        # Skip identical words and words this famcat's model doesn't know.
        # Membership is tested on the dict itself: under Python 2,
        # ``key_map.keys()`` builds a fresh list, making each test O(n).
        if w1 == w2 or w1 not in key_map or w2 not in key_map:
            continue
        s = evaluate.word_pair_surprise(
            cooc_mat[i1, i2], word_occurrence[dictionary[key_map[w1]]],
            word_occurrence[dictionary[key_map[w2]]], n_docs)
        # setdefault collapses the three-way branch over missing levels.
        surps.setdefault(key_map[w1], {}).setdefault(key_map[w2],
                                                     []).append((fc, s))
        surp_list.append(
            (dictionary[key_map[w1]], dictionary[key_map[w2]], s))
    return surp_list
def combine_surprise_across_famcats_for_user_normalised(
        w1_occs_by_famcat, w2_occs_by_famcat, coocs_by_famcat, n_docs, user):
    """Blend per-famcat counts by user familiarity and score the pair.

    Each per-category count is weighted by the corresponding familiarity
    value in ``user`` (a parallel sequence), the weighted counts are summed,
    and the blended totals are fed to ``evaluate.word_pair_surprise``.

    Returns:
        The combined surprise score for the word pair.
    """
    # Generator expressions avoid building throwaway lists inside sum().
    cooc = sum(c * fc for c, fc in zip(coocs_by_famcat, user))
    w1_occs = sum(c * fc for c, fc in zip(w1_occs_by_famcat, user))
    w2_occs = sum(c * fc for c, fc in zip(w2_occs_by_famcat, user))
    return evaluate.word_pair_surprise(cooc, w1_occs, w2_occs, n_docs)
Example #4
0
def print_top_n_surps(model, acm, top_n):
    """Print a table of the ``top_n`` most surprising word pairs.

    Like the fixed-size variant but the final table size is controlled by
    ``top_n``; each document still contributes at most its 10 best pairs
    per iteration. Estimated scores are complemented with observed
    cooccurrence-based scores for side-by-side comparison.

    Args:
        model: model forwarded to the ``evaluate`` helpers.
        acm: corpus object exposing ``documents``, ``word_occurrence``,
            ``dictionary``, and ``cooccurrence``.
        top_n: number of rows to keep in the final table.

    Side effects: prints a PrettyTable to stdout. Returns None.
    """
    top_surps = []
    for doc in acm.documents:
        if len(doc):
            top_surps += evaluate.estimate_document_surprise_pairs(
                doc, model, acm)[:10]
            top_surps = list(set(top_surps))
            # NOTE(review): ascending sort keeps the LOWEST scores; the
            # famcat-aware variant uses reverse=True -- confirm intent.
            top_surps.sort(key=lambda x: x[2], reverse=False)
            top_surps = top_surps[:top_n]

    print("top_n surprising combos")
    w1s = []
    w2s = []
    w1_occs = []
    w2_occs = []
    est_surps = []
    est_coocs = []
    obs_coocs = []
    obs_surps = []
    for surp in top_surps:
        # surp is (word1, word2, estimated_surprise).
        w1s.append(surp[0])
        w2s.append(surp[1])
        w1_occ = acm.word_occurrence[surp[0]]
        w2_occ = acm.word_occurrence[surp[1]]
        w1_occs.append(w1_occ)
        w2_occs.append(w2_occ)
        est_surps.append(surp[2])
        wk1 = acm.dictionary.token2id[surp[0]]
        wk2 = acm.dictionary.token2id[surp[1]]
        est_coocs.append(
            evaluate.estimate_word_pair_cooccurrence(wk1, wk2, model,
                                                     acm.cooccurrence))
        w1_w2_cooccurrence = acm.cooccurrence[wk1][wk2]
        obs_coocs.append(w1_w2_cooccurrence)
        obs_surp = evaluate.word_pair_surprise(w1_w2_cooccurrence, w1_occ,
                                               w2_occ, len(acm.documents))
        obs_surps.append(obs_surp)

    tab = PrettyTable()
    tab.add_column("Word 1", w1s)
    tab.add_column("Word 2", w2s)
    tab.add_column("W1 occs", w1_occs)
    tab.add_column("W2 occs", w2_occs)
    tab.add_column("Obs. cooc", obs_coocs)
    tab.add_column("Obs. surp", obs_surps)
    tab.add_column("Est. cooc", est_coocs)
    tab.add_column("Est. surp", est_surps)
    tab.float_format = ".4"
    print(tab)
Example #5
0
 def export_observed_surprise(self):
     """Write the observed pairwise surprise matrix to a CSV file.

     Builds a dense (vocab x vocab) matrix where cell [wk1, wk2] holds
     ``evaluate.word_pair_surprise`` of the observed cooccurrence count
     (0 when the pair never cooccurs), then writes its rows to
     ``<filepath><argstring>_observed_surprise.csv``. The diagonal is
     left at 0.

     Raises:
         NotImplementedError: when ``self.use_famcats`` is set.
     """
     if self.use_famcats:
         raise NotImplementedError
     # NOTE(review): "wb" open mode plus dict.iteritems() are Python 2
     # idioms; under Python 3 csv output needs open(..., "w", newline="").
     with open(self.filepath + self.argstring + "_observed_surprise.csv",
               "wb") as ef:
         writer = csv.writer(ef)
         surpmat = np.zeros((len(self.dictionary.id2token.keys()),
                             len(self.dictionary.id2token.keys())))
         for wk1, w1 in self.dictionary.id2token.iteritems():
             for wk2, w2 in self.dictionary.id2token.iteritems():
                 if wk1 != wk2:
                     try:
                         cooc = self.cooccurrence[wk1][wk2]
                     except KeyError:
                         # Pair was never observed together.
                         cooc = 0
                     surpmat[wk1, wk2] = evaluate.word_pair_surprise(
                         cooc, self.word_occurrence[w1],
                         self.word_occurrence[w2], len(self.doc_ids))
         writer.writerows(list(surpmat))
     logger.info("   **** Observed surprise exported.")
def combine_surprise_normalised(surps, reader, user):
    """Append a familiarity-weighted "combined" score to every word pair.

    For each (w1, w2) pair already present in ``surps``, gathers per-famcat
    word occurrence and cooccurrence counts, weights each count by the
    user's familiarity ``user[fc]``, and appends ``("combined", score)`` to
    ``surps[w1][w2]``.

    Args:
        surps: nested dict of per-pair score lists; mutated in place.
        reader: corpus object exposing ``dictionary``, ``word_occurrence``,
            ``famcats``, ``cooccurrence``, ``all_keys_to_per_fc_keys``,
            and ``total_docs``.
        user: mapping famcat -> familiarity weight.

    Returns None.
    """
    # items() (rather than Python-2-only iteritems()) runs on both 2 and 3.
    for w1, w1_pairs in surps.items():
        for w2 in w1_pairs:
            w1_str = reader.dictionary[w1]
            w2_str = reader.dictionary[w2]
            # Per-famcat occurrence counts; 0 when the famcat doesn't know
            # the word.
            w1_occs_by_famcat = [
                reader.word_occurrence[fc].get(w1_str, 0)
                for fc in reader.famcats
            ]
            w2_occs_by_famcat = [
                reader.word_occurrence[fc].get(w2_str, 0)
                for fc in reader.famcats
            ]
            coocs_by_famcat = []
            for fc in reader.famcats:
                # EAFP: a KeyError at either the key-map or cooccurrence
                # level means the pair was never observed in this famcat.
                try:
                    cooc = reader.cooccurrence[fc][
                        reader.all_keys_to_per_fc_keys[fc][w1]][
                            reader.all_keys_to_per_fc_keys[fc][w2]]
                except KeyError:
                    cooc = 0
                coocs_by_famcat.append(cooc)
            # Weight each count by the user's familiarity with the famcat,
            # sum, and score the blended counts.
            cooc = sum(
                c * user[fc]
                for c, fc in zip(coocs_by_famcat, reader.famcats))
            w1_occs = sum(
                c * user[fc]
                for c, fc in zip(w1_occs_by_famcat, reader.famcats))
            w2_occs = sum(
                c * user[fc]
                for c, fc in zip(w2_occs_by_famcat, reader.famcats))

            surps[w1][w2].append(
                ("combined",
                 evaluate.word_pair_surprise(cooc, w1_occs, w2_occs,
                                             reader.total_docs)))
Example #7
0
 def export_estimated_surprise(self, model, use_sglove=False):
     """Write the model-estimated pairwise surprise matrix to a CSV file.

     Builds a dense (vocab x vocab) matrix where cell [wk1, wk2] holds
     ``evaluate.word_pair_surprise`` of the model's *estimated*
     cooccurrence for the pair, and writes its rows to
     ``<filepath><argstring>_estimated_surprise.csv``. The diagonal is
     left at 0.

     Args:
         model: model forwarded to
             ``evaluate.estimate_word_pair_cooccurrence``.
         use_sglove: forwarded flag selecting the s-GloVe estimator.

     Raises:
         NotImplementedError: when ``self.use_famcats`` is set.
     """
     if self.use_famcats:
         raise NotImplementedError
     # NOTE(review): "wb" open mode plus dict.iteritems() are Python 2
     # idioms; under Python 3 csv output needs open(..., "w", newline="").
     with open(self.filepath + self.argstring + "_estimated_surprise.csv",
               "wb") as ef:
         writer = csv.writer(ef)
         surpmat = np.zeros((len(self.dictionary.id2token.keys()),
                             len(self.dictionary.id2token.keys())))
         for wk1, w1 in self.dictionary.id2token.iteritems():
             for wk2, w2 in self.dictionary.id2token.iteritems():
                 if wk1 != wk2:
                     surpmat[wk1, wk2] = evaluate.word_pair_surprise(
                         evaluate.estimate_word_pair_cooccurrence(
                             wk1,
                             wk2,
                             model,
                             self.cooccurrence,
                             use_sglove=use_sglove),
                         self.word_occurrence[w1], self.word_occurrence[w2],
                         len(self.doc_ids))
         writer.writerows(list(surpmat))
     logger.info("   **** Estimated surprise exported.")
Example #8
0
def print_top_n_surps(model, reader, top_n, famcat=None):
    """Print a table of the ``top_n`` most surprising word pairs.

    Two modes:
      * ``famcat is None`` -- pairs are gathered from every document with
        the corpus-wide estimator and scored against the global
        cooccurrence counts.
      * ``famcat`` given -- only documents tagged with that famcat are
        considered, and the personalised per-famcat estimators, key maps,
        and document counts are used instead.

    Each document contributes at most its 10 best pairs; the running list
    is deduplicated, sorted by score descending, and truncated to
    ``top_n``. Estimated scores are complemented with observed
    cooccurrence-based scores for side-by-side comparison.

    Side effects: prints a PrettyTable to stdout. Returns None.
    """
    top_surps = []
    if famcat is None:
        for doc in reader.documents:
            if len(doc):
                top_surps += evaluate.estimate_document_surprise_pairs(
                    doc,
                    model,
                    reader.cooccurrence,
                    reader.word_occurrence,
                    reader.dictionary,
                    reader.documents,
                    use_sglove=reader.use_sglove)[:10]
                top_surps = list(set(top_surps))
                top_surps.sort(key=lambda x: x[2], reverse=True)
                top_surps = top_surps[:top_n]
    else:
        for doc, fcs in zip(reader.documents, reader.doc_famcats):
            # Only documents belonging to the requested famcat count.
            if len(doc) and famcat in fcs:
                top_surps += evaluate_personalised.estimate_personalised_document_surprise_pairs_one_fc(
                    doc, model, famcat, reader)[:10]
                top_surps = list(set(top_surps))
                top_surps.sort(key=lambda x: x[2], reverse=True)
                top_surps = top_surps[:top_n]

    print("top_n surprising combos")
    w1s = []
    w2s = []
    w1_occs = []
    w2_occs = []
    est_surps = []
    est_coocs = []
    obs_coocs = []
    obs_surps = []
    for surp in top_surps:
        # surp is (word1, word2, estimated_surprise).
        if famcat is None:
            w1s.append(surp[0])
            w2s.append(surp[1])
            w1_occ = reader.word_occurrence[surp[0]]
            w2_occ = reader.word_occurrence[surp[1]]
            w1_occs.append(w1_occ)
            w2_occs.append(w2_occ)
            est_surps.append(surp[2])
            wk1 = reader.dictionary.token2id[surp[0]]
            wk2 = reader.dictionary.token2id[surp[1]]
            est_coocs.append(
                evaluate.estimate_word_pair_cooccurrence(
                    wk1, wk2, model, reader.cooccurrence))
            w1_w2_cooccurrence = reader.cooccurrence[wk1][wk2]
            obs_coocs.append(w1_w2_cooccurrence)
            obs_surp = evaluate.word_pair_surprise(w1_w2_cooccurrence,
                                                   w1_occ, w2_occ,
                                                   len(reader.documents))
            obs_surps.append(obs_surp)
        else:
            w1s.append(surp[0])
            w2s.append(surp[1])
            # Per-famcat occurrence counts and key space.
            w1_occ = reader.word_occurrence[famcat][surp[0]]
            w2_occ = reader.word_occurrence[famcat][surp[1]]
            w1_occs.append(w1_occ)
            w2_occs.append(w2_occ)
            est_surps.append(surp[2])
            fc_wk1 = reader.all_keys_to_per_fc_keys[famcat][
                reader.dictionary.token2id[surp[0]]]
            fc_wk2 = reader.all_keys_to_per_fc_keys[famcat][
                reader.dictionary.token2id[surp[1]]]
            est_coocs.append(
                evaluate_personalised.estimate_word_pair_cooccurrence(
                    fc_wk1, fc_wk2, model, reader.cooccurrence[famcat]))
            try:
                w1_w2_cooccurrence = reader.cooccurrence[famcat][fc_wk1][
                    fc_wk2]
            except KeyError:
                # Pair never observed together in this famcat.
                w1_w2_cooccurrence = 0

            obs_coocs.append(w1_w2_cooccurrence)
            obs_surp = evaluate.word_pair_surprise(w1_w2_cooccurrence, w1_occ,
                                                   w2_occ,
                                                   reader.docs_per_fc[famcat])
            obs_surps.append(obs_surp)

    tab = PrettyTable()
    tab.add_column("Word 1", w1s)
    tab.add_column("Word 2", w2s)
    tab.add_column("W1 occs", w1_occs)
    tab.add_column("W2 occs", w2_occs)
    tab.add_column("Obs. cooc", obs_coocs)
    tab.add_column("Obs. surp", obs_surps)
    tab.add_column("Est. cooc", est_coocs)
    tab.add_column("Est. surp", est_surps)
    tab.float_format = ".4"
    print(tab)