def mi(chat1, chat2):
    """Mutual information (in bits) between two thread annotations of the
    same chat, treating each line's thread id as one sample."""
    sizes1 = threadSizes(chat1)
    sizes2 = threadSizes(chat2)

    threads1 = threads(chat1)
    threads2 = threads(chat2)
    assert len(threads1) == len(threads2)
    nlines = float(len(threads1))
    # joint counts: intersects[thread id in chat1][thread id in chat2]
    intersects = DefaultDict(DefaultDict(0))

    for t1, t2 in izip(threads1, threads2):
        intersects[t1][t2] += 1

    res = 0.0

    for k1 in sizes1:
        for k2 in sizes2:
            jointp = intersects[k1][k2] / nlines
            pk1 = sizes1[k1] / nlines
            pk2 = sizes2[k2] / nlines

            if jointp:
                term = jointp * log2(jointp / (pk1 * pk2))
            else:
                term = 0

            res += term

    return res
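# For reference, a minimal, self-contained sketch of the same computation on two
# plain label sequences (hypothetical toy data; uses only the standard library
# rather than the AIMA helpers and chat accessors assumed above):
from collections import defaultdict
from math import log

def mi_labels(labels1, labels2):
    assert len(labels1) == len(labels2)
    n = float(len(labels1))
    joint = defaultdict(float)
    marg1 = defaultdict(float)
    marg2 = defaultdict(float)
    for a, b in zip(labels1, labels2):
        joint[(a, b)] += 1 / n
        marg1[a] += 1 / n
        marg2[b] += 1 / n
    return sum(p * log(p / (marg1[a] * marg2[b]), 2)
               for (a, b), p in joint.items())

# mi_labels([0, 0, 1, 1], ["a", "a", "b", "b"])  -> 1.0 (one bit: the labelings agree exactly)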
Example #2
def conditional_entropy_Y_Given_X(confusion_dict):
    """H(Y|X) = H(Y) - I(Y;X)
              = H(Y) - [H(X)+H(Y)-H(X,Y)]
              = H(X,Y) - H(X)"""
    from AIMA import DefaultDict
    # only the X marginal is needed for H(Y|X) = H(X,Y) - H(X)
    count_x = DefaultDict(0)
    for (x_key, y_key), count in confusion_dict.items():
        count_x[x_key] += count
    joint = entropy_of_multinomial(confusion_dict.values())
    return joint - entropy_of_multinomial(count_x.values())
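# entropy_of_multinomial is called throughout these examples but is not shown
# on this page; a minimal sketch of such a helper (Shannon entropy in bits over
# a list of raw counts) might look like this:
from math import log

def entropy_of_multinomial(counts):
    counts = [c for c in counts if c > 0]   # ignore zero cells
    total = float(sum(counts))
    return -sum((c / total) * log(c / total, 2) for c in counts)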
Example #3
def mutual_information(confusion_dict):
    """I(X; Y) = H(X) + H(Y) - H(X, Y)

    confusion_dict maps (x, y) pairs to co-occurrence counts."""
    from AIMA import DefaultDict
    count_x = DefaultDict(0)
    count_y = DefaultDict(0)
    for (x_key, y_key), count in confusion_dict.items():
        count_x[x_key] += count
        count_y[y_key] += count
    return entropy_of_multinomial(count_x.values()) + \
           entropy_of_multinomial(count_y.values()) - \
           entropy_of_multinomial(confusion_dict.values())
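# Hypothetical usage on a toy confusion dict keyed by (x, y) pairs:
#   confusion = {("a", 0): 5, ("b", 1): 5}
#   mutual_information(confusion)  # -> 1.0: X determines Y exactly, so I(X;Y) = H(X) = 1 bit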
Example #4
    def fget(self):
        """Lazily sum gold-cluster sizes across all test clusters."""
        if self._gold_sizes is None:
            self._gold_sizes = DefaultDict(0)
            for gold_dict in self.by_test.values():
                for gold, size in gold_dict.items():
                    self._gold_sizes[gold] += size
        return self._gold_sizes
Example #5
def variation_of_information(confusion_dict):
    """VI(X, Y) = H(X | Y) + H(Y | X)
                = H(X) - I(X; Y) + H(Y) - I(X; Y)
                = H(X) + H(Y) - 2I(X; Y)
                = H(X) + H(Y) - 2[H(X) + H(Y) - H(X, Y)]
                = 2H(X, Y) - H(X) - H(Y)

    confusion_dict maps (x, y) pairs to co-occurrence counts."""
    from AIMA import DefaultDict
    count_x = DefaultDict(0)
    count_y = DefaultDict(0)
    for (x_key, y_key), count in confusion_dict.items():
        count_x[x_key] += count
        count_y[y_key] += count

    return (2 * entropy_of_multinomial(confusion_dict.values())) - \
           entropy_of_multinomial(count_x.values()) - \
           entropy_of_multinomial(count_y.values())
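# On the same toy confusion dict as above, {("a", 0): 5, ("b", 1): 5}, the two
# labelings carry identical information, so VI = 2*1 - 1 - 1 = 0 bits; VI grows
# toward H(X) + H(Y) as the labelings become independent.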
def threadSizes(chat):
    """Count how many lines of the chat fall into each thread id."""
    ids = threads(chat)
    counts = DefaultDict(0)

    for t in ids:
        counts[t] += 1

    return counts
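# e.g. if threads(chat) returned [1, 1, 2] (hypothetical thread ids), the
# result would be {1: 2, 2: 1}.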
Example #7
def conditional_entropy_X_Given_Y(confusion_dict):
    """H(X|Y) = H(X) - I(X;Y)
              = H(X) - [H(Y)+H(X)-H(Y,X)]
              = H(Y,X) - H(Y)"""
    from AIMA import DefaultDict
    # only the Y marginal is needed for H(X|Y) = H(X,Y) - H(Y)
    count_y = DefaultDict(0)
    for (x_key, y_key), count in confusion_dict.items():
        count_y[y_key] += count

    return entropy_of_multinomial(confusion_dict.values()) - \
           entropy_of_multinomial(count_y.values())
Example #8
    def __init__(self):
        """Creates an empty confusion matrix.  You'll need to call the add()
        method to populate it."""
        # test : { gold : count }
        self.by_test = DefaultDict(DefaultDict(0))
        self._all_gold = None
def speakerLines(chat):
    """Group the chat's comments by speaker name."""
    spkToLines = DefaultDict([])
    for comment in chat:
        spkToLines[comment.name].append(comment)
    return spkToLines
Example #10
    def makeFeatDict(self, p1, p2, verbose=False):
        """Build the feature dictionary for a pair of posts p1, p2: the
        proportion of shared vs. unshared vocabulary within each
        document-frequency bucket (subject-line features kept separate),
        plus an LSA cosine feature."""
        shared = DefaultDict([])
        unshared = DefaultDict([])
        sharedSubj = DefaultDict([])
        unsharedSubj = DefaultDict([])

        # document-frequency buckets
        for feat in set(p1.feats.keys() + p2.feats.keys()):
            if feat.startswith("LSA"):
                continue

            if feat.startswith("SUBJ"):
                fnum = int(feat.lstrip("SUBJ"))
                bucket = int(log(self.dfs[fnum], 1.8))
                val1 = p1.feats.get(feat, 0)
                val2 = p2.feats.get(feat, 0)

                if verbose and val1 and val2:
                    print >> sys.stderr, "SUBJ", self.revVoc[fnum], bucket,

                if (val1 or val2) and not (val1 and val2):
                    unsharedSubj[bucket].append(max(val1, val2) // 2)
                else:
                    sharedSubj[bucket].append(min(val1, val2) // 2)

                continue

            fnum = int(feat)

            if fnum not in self.dfs:
                continue

            if self.wfr[fnum] < 3:
                continue

            bucket = int(log(self.dfs[fnum], 1.8))

            val1 = p1.feats.get(feat, 0)
            val2 = p2.feats.get(feat, 0)

            if verbose and val1 and val2:
                print >> sys.stderr, self.revVoc[fnum], bucket,

            if (val1 or val2) and not (val1 and val2):
                unshared[bucket].append(max(val1, val2) // 2)
            else:
                shared[bucket].append(min(val1, val2) // 2)

        if 0:
            #tf bucketing
            fdict = DefaultDict(0)
            for bucket, ct in shared.items():
                for freq in ct:
                    if freq > 10:
                        freq = 10
                    fdict["SHARE_%g_%g" % (bucket, freq)] += 1
            for bucket, ct in unshared.items():
                for freq in ct:
                    if freq > 10:
                        freq = 10
                    fdict["UNIQUE_%g_%g" % (bucket, freq)] += 1
        elif 0:
            #only df bucketing
            fdict = DefaultDict(0)
            for bucket, ct in shared.items():
                items = len(ct)
                fdict["SHARE_%g" % bucket] += items
            for bucket, ct in unshared.items():
                items = len(ct)
                fdict["UNIQUE_%g" % bucket] += items
        else:
            #proportion features
            fdict = DefaultDict(0)
            for bucket, ct in shared.items():
                items = len(ct)
                nUnshared = len(unshared[bucket])
                # explicit float() so Python 2 integer division can't truncate the proportion
                frac = items / float(nUnshared + items)

                fdict["PROP_%g" % bucket] = frac
                if nUnshared + items == 0:
                    fdict["NO_%g" % bucket] = 1
            for bucket, ct in sharedSubj.items():
                items = len(ct)
                nUnshared = len(unsharedSubj[bucket])
                # float() again to avoid integer truncation
                frac = items / float(nUnshared + items)

                fdict["PROP_SUBJ_%g" % bucket] = frac
                if nUnshared + items == 0:
                    fdict["NO_SUBJ_%g" % bucket] = 1

        #LSA
        cos = self.cosine(p1, p2)
        if verbose:
            print >> sys.stderr, "cosine", cos
        fdict["COS"] = cos

        if verbose and 0:
            print >> sys.stderr
            print >> sys.stderr, "Feats", fdict

        if verbose:
            print >> sys.stderr

        return fdict
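# For reference, the document-frequency bucketing above, int(log(df, 1.8)),
# maps raw df counts onto a coarse log scale; e.g. (hypothetical values):
#   int(log(5, 1.8))    -> 2
#   int(log(100, 1.8))  -> 7
# so words seen in only a handful of posts land in low buckets and very common
# words in high ones.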
Example #11
    def __init__(self, group, vocab):
        self.counts = DefaultDict(0)
        self.subjCounts = DefaultDict(0)
        self.vocab = vocab
        self.group = group
Example #12
        for ff in group.files():
            np = NewsgroupPost(gName, vocab)
            np.read(file(ff))

            groups[gName].append(np)

    return vocab, groups


if __name__ == "__main__":
    vocab, groups = readNewsgroups(path("DIRECTORY WITH MINI_NEWSGROUPS"))

    print >> sys.stderr, "counting term/doc frequencies"

    wfreqs = DefaultDict(0)
    dfs = DefaultDict(0)

    for group, posts in groups.items():
        for post in posts:
            for word, ct in post.counts.items():
                wfreqs[word] += ct
                dfs[word] += 1

    print >> sys.stderr, "dumping"

    output = path("FILE WHERE WE DUMP THE WORD FREQS")
    fh = file(output, 'w')
    pickle.dump(vocab, fh)
    pickle.dump(groups, fh)
    pickle.dump(wfreqs, fh)
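# A hypothetical sketch of reading these pickles back, in the same order they
# were dumped:
#   fh = file(output)
#   vocab = pickle.load(fh)
#   groups = pickle.load(fh)
#   wfreqs = pickle.load(fh)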
Example #13
from DistributedExperiment import Experiment, Evaluation

def getCorpus(obj):
    return int(re.search("\d+", obj.exp.mfile.basename()).group(0))

def solverObj(solvers):
    if "log" in solvers:
        return "logObjective"
    return "objective"

if __name__ == "__main__":
    sessionName = sys.argv[1]

    ssn = Session(sessionName, read_only=True)

    evals = DefaultDict(DefaultDict([]))

    for job in ssn:
        #print job
        if job.status != "finished":
            continue
        obj = job.args[0]
        if isinstance(obj, Evaluation):
            corpus = getCorpus(obj)
            solvers = tuple(obj.exp.solvers)
            evals[corpus][solvers].append(job.results)

    avgBest = DefaultDict([])
    avgAvg = DefaultDict([])

    for corpus,values in evals.items():
Example #14
from sys import argv
from sets import Set  # Python 2 Set; argv/Set may also arrive via the chatStats star import

from AIMA import DefaultDict

import pylab

from chatStats import *

chat = nonSys(markSys(readChat(argv[1])))

print "Length is", len(chat)
spkLines = speakerLines(chat)
print "Speakers", len(spkLines)
print "Average utterances", avgUtterances(chat)
print "Average conversations", avgConversations(chat)

spkThreads = DefaultDict([])
for spk, lines in spkLines.items():
    spkThreads[spk] = [x.thread for x in lines]

#print spkThreads

utts = []
unique = []
for threads in spkThreads.values():
    utts.append(len(threads))
    unique.append(len(Set(threads)))

#print " ".join([str(x) for x in utts])
#print
#print " ".join([str(x) for x in unique])
Example #15
    return color+sym+line

if __name__ == "__main__":
    workdir = path(sys.argv[1])
    print "Working directory", workdir
    ssnName = "hog" + workdir.dirname().basename()
    print "Session", ssnName
    session = Session(ssnName, read_only=True)
    evalSsnName = "hogeval" + workdir.dirname().basename()
    print "Eval session", evalSsnName
    evalSsn = Session(evalSsnName, read_only=True)

    plotStat = "time"

    modelSeqs = DefaultDict([])

    for ct,job in enumerate(evalSsn):
        if ct % 20 == 0:
            print ct, "..."
        
        if job.status == "finished":
            res = job.results

            try:
                time = res.stats["time"]
            except KeyError:
                time = res.time
            if time is None:
                continue