def construct(filename):
    """Read joint count data from *filename*, normalize it into
    distributions, build the QP via constructModel, optimize it with
    Gurobi, and print KL-style divergences of the solution against
    the old and new joint distributions.

    Each input line is expected to look like:
        F_?_E  oldCount  newCount
    i.e. an underscore-separated key (field 0 = f, field 2 = e)
    followed by two integer columns.

    NOTE(review): only the first 300 input lines are consumed (the
    `c > 300` break below) -- looks like a debugging cap; confirm
    before running on real data.
    """
    oldj = {}   # (e, f) -> old joint count (later normalized in place)
    oj = {}     # raw key string -> old joint count
    newj = {}   # raw key string -> new joint count
    newe = {}   # e -> summed new counts (E marginal)
    newf = {}   # f -> summed new counts (F marginal)

    with open(filename, "r") as fh:
        c = 0
        for line in fh:
            c += 1
            if c > 300:   # debug cap: stop after 300 lines
                break
            pcs = line.split()
            px = pcs[0].split("_")
            f = px[0]   # F-side token: first underscore field
            e = px[2]   # E-side token: third underscore field
            old = int(pcs[1])
            new = int(pcs[2])
            oldj[(e, f)] = old
            oj[pcs[0]] = old
            newj[pcs[0]] = new
            newe[e] = newe.get(e, 0) + new
            newf[f] = newf.get(f, 0) + new

    # turn raw counts into distributions (normalize is defined elsewhere
    # in this project; presumably it scales values to sum to 1 -- confirm)
    normalize(oldj)
    normalize(oj)
    normalize(newj)
    normalize(newe)
    normalize(newf)
    # allow every (e, f) pair from the cross product of the marginals
    nonzeroentries = {}
    for e in newe:
        for f in newf:
            nonzeroentries[(e, f)] = 1
    model = constructModel(oldj, nonzeroentries, newe, newf)

    # verbose output
    with timer("opt") as tim:
        model.params.outputflag = 1
        model.optimize()
        model.printStats()
        model.printQuality()

    # show the final output
    print
    print "==== final variable values ===="
    # accumulate KL-like divergence of the solution vs. the old (osum)
    # and new (nsum) joint distributions; 1e-9 floors avoid log(0)/div0
    osum = 0
    nsum = 0
    for var in model.getVars():
        name = var.getAttr(grb.GRB.attr.VarName)
        val = var.getAttr(grb.GRB.attr.X)
        if not name.startswith("b"):   # skip the auxiliary |v-old| vars
            o = oj.get(name, 1e-9)
            n = newj.get(name, 1e-9)
            val = max(val, 1e-9)
            # if val > 1e-6: print name, "\t", o, '\t', n, '\t', val
            osum += o * log(o / val)
            nsum += n * log(n / val)
    print osum
    print nsum
# Beispiel #2
# 0
def cosSim(m1, m2, keep=10):
    with timer('computing cosine sim') as tim:
        E = dict()
        for r in m1:
            for (k,v) in r.iteritems():
                E[k] = 1
        print 'source term count : ', len(E)
        t = dict()
        for e in E:
            if len(t) % 100 == 0: print len(t)
            s = 0
            rr = dict()
            for d in range(len(m1)):
                r1 = m1[d]
                r2 = m2[d]
                if e not in r1: continue
                se = r1[e]
                for (f, sf) in r2.iteritems():
                    rr[f] = rr.get(f, 0) + se * sf
            rr2 = dict()
            for (k, v) in sorted(rr.iteritems(), reverse=True, key=lambda p: p[1]):
                rr2[k] = v
                if len(rr2) >= keep: break
            t[e] = rr2
        return t
 def __init__(self, lines):
     """Load phrase pairs from the *lines* iterable, grouping them in
     self.d by their source-side phrase string.

     NOTE(review): self.d is read via .get() before ever being
     assigned here, so it must already exist (class attribute or
     set up elsewhere) -- confirm against the enclosing class.
     NOTE(review): loading stops after 100000 phrases -- looks like
     a debugging cap; confirm before production use.
     """
     with timer('loading phrases') as tim:
         count = 0
         for line in lines:
             phrase = PhrasePair(line)
             # group key: the source-side tokens joined with spaces
             key = ' '.join(phrase.source)
             l = self.d.get(key, [])
             l.append(phrase)
             self.d[key] = l
             count += 1
             # progress report every 10k phrases (carriage return overwrites the line)
             if 0 == count % 10000: print "{0} phrases  \r".format(count), ; stdout.flush()
             if count == 100000: break
         self.count = count
def pruneRelativeEntropy(filename, outfile):
    """Prune a phrase table using relative entropy.

    Loads the phrase table from *filename*, then streams the same
    file again in chunks, maps computeRelEnt over each chunk on a
    thread pool (threaded_map), and writes the resulting lines to
    *outfile* (possibly gzip-compressed -- see openMaybeGz).
    """
    pt = PhraseTable(fileLines(filename))
    # each work item handed to the pool is one chunk of input lines
    mapFn = lambda line: computeRelEnt(pt, line)
    with timer('pruning') as tim:
        with openMaybeGz(outfile, 'w') as o:
            count = 0
            chunksize = 100
            for line in threaded_map(mapFn, fileChunks(filename, chunksize), threadCount = 6, maxInputQ = 1024):
                o.write(line)
                count += chunksize
                # print an ETA every 500 processed lines (count advances
                # in steps of chunksize, so this fires every 5 chunks)
                if 0 == count % 500:
                    (elapsed, remaining, totalTime) = tim.predict(count, pt.count)
                    print "{0:.3f} elapsed; {1:.3f} remaining; {2:.3f} total; count = {3}  \r".format(elapsed, remaining, totalTime, count), ; stdout.flush()
# Beispiel #5
# 0
def normalizeMatrix(matrix):
    """Column-normalize a sparse matrix.

    matrix : list of sparse rows (dicts key -> value).

    Divides every entry by the L2 norm of its key's "column" (the
    values for that key gathered across all rows), so each key's
    values form a unit vector.  Returns a new list of dicts; the
    input is not modified.
    """
    with timer('normalizing matrix') as tim:
        # accumulate the sum of squares for every key across all rows
        norms = dict()
        for row in matrix:
            for (key, val) in row.iteritems():
                norms[key] = norms.get(key, 0) + val * val
        # convert sums of squares into actual L2 norms
        for key in norms:
            norms[key] = sqrt(norms[key])
        # rebuild each row with entries scaled by their key's norm
        result = []
        for row in matrix:
            scaled = dict((key, val / norms[key]) for (key, val) in row.iteritems())
            result.append(scaled)
        return result
# Beispiel #6
# 0
def countMatrixToBm25(matrix):
    """Convert a term-count matrix to BM25 weights.

    matrix : list of sparse rows (dicts term -> raw count), one row
             per document.

    Returns a new list of rows where every raw count is replaced by
    its BM25 score (computed by the external bm25 helper using the
    term's document frequency, the document length, and the average
    document length).  The input matrix is not modified.
    """
    with timer('counts->bm25') as tim:
        docFreqs = docFreqFromCountMatrix(matrix)   # term -> doc frequency
        numDocs = float(len(matrix))
        avgDocLen = sum(map(len, matrix)) / numDocs
        result = []
        for row in matrix:
            docLen = len(row)
            scored = dict()
            for (term, termFreq) in row.iteritems():
                scored[term] = bm25(termFreq, docFreqs[term], docLen, avgDocLen, numDocs)
            result.append(scored)
        return result
def constructModel(oldJointData, nonzeroEntries, newEMarginal, newFMarginal, epsilon=1e-6):
    """Build (but do not optimize) a Gurobi QP over new joint
    probabilities newJoint[e, f].

    oldJointData   : dict (e, f) -> old joint probability; pairs seen
                     here are pulled toward their old value by a
                     quadratic + absolute-deviation penalty.
    nonzeroEntries : dict whose keys (e, f) enumerate the variables
                     to create.
    newEMarginal   : dict e -> target E marginal.
    newFMarginal   : dict f -> target F marginal.
    epsilon        : slack allowed on every marginal constraint.

    Returns the constructed grb.Model.
    """
    with timer("constr") as tim:
        model = grb.Model("model")

        # construct the variables and the objective
        e2f = {}            # e -> {f: var}
        f2e = {}            # f -> {e: var}
        newJointVars = {}   # (e, f) -> var
        obj = grb.QuadExpr()
        bb = {}             # var -> (abs-deviation var b, old value)
        for e_f in nonzeroEntries.iterkeys():
            (e, f) = e_f
            v = model.addVar(0.0, 1.0, 0.0, grb.GRB.CONTINUOUS, f + "__" + e)
            if e not in e2f:        # (was has_key -- same behavior)
                e2f[e] = {}
            if f not in f2e:
                f2e[f] = {}
            e2f[e][f] = v
            f2e[f][e] = v
            newJointVars[e_f] = v
            if e_f in oldJointData:
                # objective should contain (v - OLD)^2, plus an L1 term
                # b >= |v - old| enforced by the vb* constraints below
                old = oldJointData[e_f]
                b = model.addVar(0.0, 1.0, 0.0, grb.GRB.CONTINUOUS, "b" + str(e_f))
                obj += (v - old) * (v - old)
                obj += b
                bb[v] = (b, old)
            else:
                # objective should contain v^2, plus a linear penalty
                # discouraging mass on previously-unseen pairs
                obj += 1.1 * v
                obj += v * v
        model.update()  # add the variables before setting the objective
        model.setObjective(obj)

        # now create the constraints -- there are E and F marginal
        # constraints of the form:
        #    sum_f newJoint[e,f] = newEMarginal[e]  for all e
        #    sum_e newJoint[e,f] = newFMarginal[f]  for all f
        # this is too restrictive (maybe impossible), so we give
        # a little slack -- maybe this should be penalized in the
        # objective.  anyway, we write:
        #    sum_f newJoint[e,f] - newEMarginal[e] <  epsilon    for all e
        #    sum_f newJoint[e,f] - newEMarginal[e] > -epsilon    for all e
        # and similarly for newFMarginal

        for e, marg in newEMarginal.iteritems():
            lhs = grb.LinExpr(-marg)
            for f, var in e2f[e].iteritems():
                lhs += var
            model.addConstr(lhs <= epsilon, "ce+" + e)
            model.addConstr(lhs >= -epsilon, "ce-" + e)

        for f, marg in newFMarginal.iteritems():
            lhs = grb.LinExpr(-marg)
            for e, var in f2e[f].iteritems():
                lhs += var
            # BUG FIX: these constraint names previously used the stale
            # inner-loop variable `e`, producing wrong (and potentially
            # duplicate) names; they are the F-marginal constraints and
            # must be named after `f`.
            model.addConstr(lhs <= epsilon, "cf+" + f)
            model.addConstr(lhs >= -epsilon, "cf-" + f)

        # enforce b >= |v - old| so each b is the absolute deviation
        counter = 0
        for v, (b, old) in bb.iteritems():
            model.addConstr(b >= v - old, "vb+" + str(counter))
            model.addConstr(b >= -v + old, "vb-" + str(counter))
            model.addConstr(b >= 0, "vb0" + str(counter))
            counter += 1

        # finalize the model
        model.update()
        return model
# Beispiel #8
# 0
def docTermCountMatrix(fileList, mincount = 0):
    """Build a document/term count matrix: one sparse row per file,
    produced by getUnigrams(filename, mincount).

    mincount is forwarded to getUnigrams (presumably a minimum-count
    filter -- confirm against its definition).
    """
    with timer('building termcount matrix') as tim:
        return [getUnigrams(name, mincount) for name in fileList]