# Relies on DefaultDict (AIMA), izip (itertools), a log2 function (e.g. numpy's),
# and the threads()/threadSizes() helpers.
def mi(chat1, chat2):
    """Mutual information between the line-by-line thread assignments of chat1 and chat2."""
    sizes1 = threadSizes(chat1)
    sizes2 = threadSizes(chat2)
    threads1 = threads(chat1)
    threads2 = threads(chat2)
    assert (len(threads1) == len(threads2))
    nlines = (len(threads1) + 0.0)
    # joint counts: lines assigned to thread t1 in chat1 and thread t2 in chat2
    intersects = DefaultDict(DefaultDict(0))
    for t1, t2 in izip(threads1, threads2):
        intersects[t1][t2] += 1
    res = 0.0
    for k1 in sizes1:
        for k2 in sizes2:
            jointp = intersects[k1][k2] / nlines
            pk1 = sizes1[k1] / nlines
            pk2 = sizes2[k2] / nlines
            if jointp:
                term = jointp * log2(jointp / (pk1 * pk2))
            else:
                term = 0
            res += term
    return res
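Every snippet here uses DefaultDict from the AIMA utility module. If that module is not at hand, a minimal stand-in with the behavior the code above depends on (an unknown key receives a fresh deep copy of the default, so DefaultDict(DefaultDict(0)) and DefaultDict([]) both act as per-key containers) could look like this:

import copy

class DefaultDict(dict):
    """Minimal stand-in (assumption) for AIMA's DefaultDict."""
    def __init__(self, default):
        self.default = default
    def __getitem__(self, key):
        if key in self:
            return dict.__getitem__(self, key)
        # unseen key: store and return a fresh copy of the default value
        return self.setdefault(key, copy.deepcopy(self.default))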
def conditional_entropy_Y_Given_X(confusion_dict):
    """H(Y|X) = H(Y) - I(Y;X)
              = H(Y) - [H(X)+H(Y)-H(X,Y)]
              = H(X,Y) - H(X)"""
    from AIMA import DefaultDict
    count_x = DefaultDict(0)
    count_y = DefaultDict(0)
    for (x_key, y_key), count in confusion_dict.items():
        count_x[x_key] += count
        count_y[y_key] += count
    joint = entropy_of_multinomial(confusion_dict.values())
    return joint - entropy_of_multinomial(count_x.values())
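This function, and the mutual information and variation-of-information functions nearby, all call entropy_of_multinomial, which is not among these snippets. Judging only from how it is called (a list of counts in, an entropy out), a plausible sketch is the following; the base of the logarithm is an assumption:

from math import log

def entropy_of_multinomial(counts):
    # Sketch: entropy (base 2 assumed) of the distribution proportional to `counts`.
    total = float(sum(counts))
    h = 0.0
    for count in counts:
        if count:
            p = count / total
            h -= p * log(p, 2)
    return h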
def mutual_information(confusion_dict):
    """I(X; Y) = H(X) + H(Y) - H(X, Y)"""
    # TODO document
    from AIMA import DefaultDict
    count_x = DefaultDict(0)
    count_y = DefaultDict(0)
    for (x_key, y_key), count in confusion_dict.items():
        count_x[x_key] += count
        count_y[y_key] += count
    return entropy_of_multinomial(count_x.values()) + \
           entropy_of_multinomial(count_y.values()) - \
           entropy_of_multinomial(confusion_dict.values())
def fget(self):
    # Lazily build and cache the per-gold-label totals, summed over all test labels.
    if self._gold_sizes is None:
        self._gold_sizes = DefaultDict(0)
        for gold_dict in self.by_test.values():
            for gold, size in gold_dict.items():
                self._gold_sizes[gold] += size
    return self._gold_sizes
def variation_of_information(confusion_dict):
    """VI(X, Y) = H(X | Y) + H(Y | X)
                = H(X) - I(X; Y) + H(Y) - I(X; Y)
                = H(X) + H(Y) - 2I(X; Y)
                = H(X) + H(Y) - 2[H(X) + H(Y) - H(X, Y)]
                = 2H(X, Y) - H(X) - H(Y)"""
    # TODO document better
    from AIMA import DefaultDict
    count_x = DefaultDict(0)
    count_y = DefaultDict(0)
    for (x_key, y_key), count in confusion_dict.items():
        count_x[x_key] += count
        count_y[y_key] += count
    return (2 * entropy_of_multinomial(confusion_dict.values())) - \
           entropy_of_multinomial(count_x.values()) - \
           entropy_of_multinomial(count_y.values())
def threadSizes(chat):
    """Count how many lines of the chat fall in each thread."""
    ids = threads(chat)
    counts = DefaultDict(0)
    for t in ids:
        counts[t] += 1
    return counts
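threadSizes() and mi() above both call threads(), which is not included in these snippets. A plausible one-liner, consistent with the .thread attribute that chat lines expose in the script further down, would be:

def threads(chat):
    # Assumed helper: the thread id of each comment, in line order.
    return [comment.thread for comment in chat]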
def conditional_entropy_X_Given_Y(confusion_dict):
    """H(X|Y) = H(X) - I(X;Y)
              = H(X) - [H(Y)+H(X)-H(Y,X)]
              = H(Y,X) - H(Y)"""
    from AIMA import DefaultDict
    count_y = DefaultDict(0)
    for (x_key, y_key), count in confusion_dict.items():
        count_y[y_key] += count
    return (entropy_of_multinomial(confusion_dict.values())) - \
           entropy_of_multinomial(count_y.values())
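The conditional entropy, mutual information, and variation-of-information functions all take the same input: a dict keyed by (x, y) label pairs whose values are joint counts. A toy call with made-up counts, just to show the convention:

# Hypothetical joint counts over labels X in {a, b} and Y in {u, v}.
confusion = {("a", "u"): 4, ("a", "v"): 1, ("b", "v"): 5}
print mutual_information(confusion)
print conditional_entropy_X_Given_Y(confusion)
print variation_of_information(confusion)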
def __init__(self):
    """Creates an empty confusion matrix. You'll need to call the
    add() method to populate it."""
    # test : { gold : count }
    self.by_test = DefaultDict(DefaultDict(0))
    self._all_gold = None
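The add() method the docstring mentions is not among these snippets; given the { test : { gold : count } } layout, it presumably does something along these lines (the argument order and cache handling are guesses):

def add(self, gold, test, count=1):
    # Hypothetical sketch: record `count` items with true label `gold`
    # and predicted label `test`, and drop any cached totals.
    self.by_test[test][gold] += count
    self._all_gold = None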
def speakerLines(chat):
    spkToLines = DefaultDict([])
    for comment in chat:
        spkToLines[comment.name].append(comment)
    return spkToLines
def makeFeatDict(self, p1, p2, verbose=False):
    """Pairwise features for posts p1 and p2: per-df-bucket proportions of shared
    vocabulary (separately for SUBJ features), plus a cosine-similarity feature."""
    shared = DefaultDict([])
    unshared = DefaultDict([])
    sharedSubj = DefaultDict([])
    unsharedSubj = DefaultDict([])
    #df-buckets
    for feat in set(p1.feats.keys() + p2.feats.keys()):
        if feat.startswith("LSA"):
            continue
        if feat.startswith("SUBJ"):
            fnum = int(feat.lstrip("SUBJ"))
            bucket = int(log(self.dfs[fnum], 1.8))
            val1 = p1.feats.get(feat, 0)
            val2 = p2.feats.get(feat, 0)
            if verbose and val1 and val2:
                print >> sys.stderr, "SUBJ", self.revVoc[fnum], bucket,
            if (val1 or val2) and not (val1 and val2):
                unsharedSubj[bucket].append(max(val1, val2) // 2)
            else:
                sharedSubj[bucket].append(min(val1, val2) // 2)
            continue

        fnum = int(feat)
        if fnum not in self.dfs:
            continue
        if self.wfr[fnum] < 3:
            continue
        bucket = int(log(self.dfs[fnum], 1.8))
        val1 = p1.feats.get(feat, 0)
        val2 = p2.feats.get(feat, 0)
        if verbose and val1 and val2:
            print >> sys.stderr, self.revVoc[fnum], bucket,
        if (val1 or val2) and not (val1 and val2):
            unshared[bucket].append(max(val1, val2) // 2)
        else:
            shared[bucket].append(min(val1, val2) // 2)

    if 0: #tf bucketing
        fdict = DefaultDict(0)
        for bucket, ct in shared.items():
            for freq in ct:
                if freq > 10:
                    freq = 10
                fdict["SHARE_%g_%g" % (bucket, freq)] += 1
        for bucket, ct in unshared.items():
            for freq in ct:
                if freq > 10:
                    freq = 10
                fdict["UNIQUE_%g_%g" % (bucket, freq)] += 1
    elif 0: #only df bucketing
        fdict = DefaultDict(0)
        for bucket, ct in shared.items():
            items = len(ct)
            fdict["SHARE_%g" % bucket] += items
        for bucket, ct in unshared.items():
            items = len(ct)
            fdict["UNIQUE_%g" % bucket] += items
    else: #proportion features
        fdict = DefaultDict(0)
        for bucket, ct in shared.items():
            items = len(ct)
            nUnshared = len(unshared[bucket])
            # float division so the proportion isn't truncated to 0/1
            frac = float(items) / (nUnshared + items)
            fdict["PROP_%g" % bucket] = frac
            if nUnshared + items == 0:
                fdict["NO_%g" % bucket] = 1
        for bucket, ct in sharedSubj.items():
            items = len(ct)
            nUnshared = len(unsharedSubj[bucket])
            frac = float(items) / (nUnshared + items)
            fdict["PROP_SUBJ_%g" % bucket] = frac
            if nUnshared + items == 0:
                fdict["NO_SUBJ_%g" % bucket] = 1

    #LSA
    cos = self.cosine(p1, p2)
    if verbose:
        print >> sys.stderr, "cosine", cos
    fdict["COS"] = cos
    if verbose and 0:
        print >> sys.stderr
        print >> sys.stderr, "Feats", fdict
    if verbose:
        print >> sys.stderr
    return fdict
def __init__(self, group, vocab):
    self.counts = DefaultDict(0)
    self.subjCounts = DefaultDict(0)
    self.vocab = vocab
    self.group = group
        # Tail of readNewsgroups(): each file in the group becomes a NewsgroupPost.
        for ff in group.files():
            np = NewsgroupPost(gName, vocab)
            np.read(file(ff))
            groups[gName].append(np)
    return vocab, groups

if __name__ == "__main__":
    vocab, groups = readNewsgroups(path("DIRECTORY WITH MINI_NEWSGROUPS"))
    print >> sys.stderr, "counting term/doc frequencies"
    wfreqs = DefaultDict(0)
    dfs = DefaultDict(0)
    for group, posts in groups.items():
        for post in posts:
            for word, ct in post.counts.items():
                wfreqs[word] += ct
                dfs[word] += 1  # each post contributes once per word, so this is a document frequency
    print >> sys.stderr, "dumping"
    output = path("FILE WHERE WE DUMP THE WORD FREQS")
    fh = file(output, 'w')
    pickle.dump(vocab, fh)
    pickle.dump(groups, fh)
    pickle.dump(wfreqs, fh)
from DistributedExperiment import Experiment, Evaluation

def getCorpus(obj):
    return int(re.search("\d+", obj.exp.mfile.basename()).group(0))

def solverObj(solvers):
    if "log" in solvers:
        return "logObjective"
    return "objective"

if __name__ == "__main__":
    sessionName = sys.argv[1]
    ssn = Session(sessionName, read_only=True)

    evals = DefaultDict(DefaultDict([]))
    for job in ssn:
        #print job
        if job.status != "finished":
            continue
        obj = job.args[0]
        if isinstance(obj, Evaluation):
            corpus = getCorpus(obj)
            solvers = tuple(obj.exp.solvers)
            evals[corpus][solvers].append(job.results)

    avgBest = DefaultDict([])
    avgAvg = DefaultDict([])
    for corpus, values in evals.items():
from AIMA import DefaultDict
import pylab
from chatStats import *
# argv and Set below presumably come in via chatStats' star import
# (otherwise: from sys import argv; from sets import Set).

chat = nonSys(markSys(readChat(argv[1])))
print "Length is", len(chat)
spkLines = speakerLines(chat)
print "Speakers", len(spkLines)
print "Average utterances", avgUtterances(chat)
print "Average conversations", avgConversations(chat)

spkThreads = DefaultDict([])
for spk, lines in spkLines.items():
    spkThreads[spk] = [x.thread for x in lines]

#print spkThreads

utts = []
unique = []
for threads in spkThreads.values():
    utts.append(len(threads))
    unique.append(len(Set(threads)))

#print " ".join([str(x) for x in utts])
#print
#print " ".join([str(x) for x in unique])
    return color+sym+line

if __name__ == "__main__":
    workdir = path(sys.argv[1])
    print "Working directory", workdir
    ssnName = "hog" + workdir.dirname().basename()
    print "Session", ssnName
    session = Session(ssnName, read_only=True)
    evalSsnName = "hogeval" + workdir.dirname().basename()
    print "Eval session", evalSsnName
    evalSsn = Session(evalSsnName, read_only=True)

    plotStat = "time"

    modelSeqs = DefaultDict([])
    for ct, job in enumerate(evalSsn):
        if ct % 20 == 0:
            print ct, "..."
        if job.status == "finished":
            res = job.results
            try:
                time = res.stats["time"]
            except KeyError:
                time = res.time
            if time == None:
                continue