def main(): # Takes in the file and parses into datum's. D = cData(sys.argv[1]) # Starting points from the pickle or not. EmAlg = parseCommandLine(D, sys.argv) prevCons = 0 totalcons = 0 nmiResult = evaluateEM_NMI(D, EmAlg) print "Initial nmi: ",nmiResult consobj = Cons.cCons(D) for numCons in range(1,len(D.data)/4,1): consobj.constype = Cons.cCons.eConsType.TripCenterChunk consobj.centerChunkSize = 0.2 cons = consobj.tripCons(EmAlg.mGammas,numCons-prevCons) prevCons = numCons totalcons += len(cons) for i in cons: EmAlg.mCij[i[0]][i[1]] = i[2] EmAlg.mCij[i[1]][i[0]] = i[2] EmAlg.EM(len(D.classlist)) nmiresult = evaluateEM_NMI(D, EmAlg) print numCons, ",", nmiresult, ",", totalcons if(nmiresult > 0.999 or len(D.data)==numCons): break
def scenario1(D, sFileStub):
    """Baseline scenario: repeated unconstrained EM restarts.

    For each outer trial, runs numInnerTrials random EM restarts and keeps
    the run with the highest likelihood.  Writes one "likelihood,NMI" line
    per outer trial to <sFileStub>.scen1.results.

    Returns the list of best EM runs, one per outer trial.

    Fix: the NMI was previously evaluated on EMAlg (the *last* restart)
    instead of bestEM (the best-likelihood restart that is kept and
    returned), so the logged NMI did not match the reported likelihood.
    """
    f = open(sFileStub + ".scen1.results", "w")
    EMResults = []
    for iOutertrial in range(numOuterTrials):
        f.write("outertrial: %d\n" % iOutertrial)
        f.write("likelihood,NMI\n")
        bestEM = None
        bestLikelihood = 0
        for iRestart in range(numInnerTrials):
            EMAlg = EM.cEM(D)
            EMAlg.bPPC = False
            EMAlg.EM(len(D.classlist))
            # Keep the restart with the highest likelihood; the first
            # restart always wins so bestEM is never left as None.
            if iRestart == 0 or EMAlg.dEMLikelihood > bestLikelihood:
                bestLikelihood = EMAlg.dEMLikelihood
                bestEM = EMAlg
        EMResults.append(bestEM)
        # Evaluate NMI on the best restart, not the last one.
        f.write("%f,%f\n" % (bestLikelihood, utils.evaluateEM_NMI(D, bestEM)))
        f.flush()
    f.close()
    return EMResults
def TripConsTest(D, sNum, EMStarts, fp): fp.write("trips,queries,cons,likelihood,NMI\n") for option in TripConsOptions.lOptions: optname = TripConsOptions.lOptionNames[option] fp.write(optname + "\n") for iOutertrial in range(numOuterTrials): print "scenario ", sNum, " options ", optname, " outertrial ", iOutertrial fp.write("outertrial: %d\n" % iOutertrial) em = copy.deepcopy(EMStarts[iOutertrial]) em.bPPC = True prevTrips = 0 totalCons = 0 nmiResult = utils.evaluateEM_NMI(D, em) fp.write("Initial nmi: %f\n" % nmiResult) consobj = Cons.cCons(D) for numTrips in range(1,len(D.data)/4,1): if option == TripConsOptions.CenterChunkCons: consobj.constype = Cons.cCons.eConsType.TripCenterChunk elif option == TripConsOptions.MidCons: consobj.constype = Cons.cCons.eConsType.TripMids print em.mLikelihood_il cons = consobj.tripCons(em.mGammas,numTrips-prevTrips) prevTrips = numTrips totalCons += len(cons) for i in cons: em.mCij[i[0]][i[1]] = i[2] em.mCij[i[1]][i[0]] = i[2] em.EM(len(D.classlist)) nmiresult = utils.evaluateEM_NMI(D, em) fp.write("%d,%d,%d,%f,%f\n" % (numTrips, numTrips*14, totalCons, em.dEMLikelihood, nmiresult) ) fp.flush() if (nmiresult > 0.999 or len(D.data)==numTrips): break
def goodInitial (self,D,em,emclusters,RepPts,fp):
    """Refine EM cluster centers using simulated user feedback.

    A cluster is "consistent" when all of its midpoints share the real
    class of its center.  Inconsistent clusters get their centers reset
    and pairwise constraints generated from the (simulated) user answers,
    then EM is re-run; this repeats for at most 5 iterations or until
    every cluster is consistent.  Writes "queries,cons,likelihood,NMI"
    via maybeWrite and returns the refined EM object.

    NOTE(review): `em`, `emclusters` and `RepPts` are project types whose
    exact contracts are not visible here -- behavior described from this
    body only.
    """
    # Consistent means all the midpoints are same with the center.
    constraints = []
    iters = 0
    # Indices of clusters still awaiting a consistency verdict.
    indEMClusters = range(len(emclusters))
    lResetExclusions = []
    numUserQueries = 0
    # Debug dump: real classes of each cluster's midpoints vs its center.
    for cl in emclusters:
        print ([D.data[i.index].cl for i in cl.midpoints],D.data[cl.center.index].cl)," ",cl.center.index
    while len(indEMClusters) != 0 and iters < 5:
        resetCenters = []
        # Iterate over a copy so we can remove consistent clusters in place.
        for ind in indEMClusters[:]:
            cl = emclusters[ind]
            # Too few midpoints to judge consistency: just reset the center.
            if(len(cl.midpoints) <= 1):
                resetCenters.append(ind)
                continue
            # simulate feedback from real classes
            realpoints = [D.data[i.index] for i in cl.midpoints]
            realcenter = D.data[cl.center.index]
            # One query per midpoint plus one for the center.
            numUserQueries += len(realpoints) + 1
            # points in realpoints s.t. their real class is same as center
            rightclass = filter(lambda x: x.cl==realcenter.cl,realpoints)
            rightclass.append(realcenter)
            # All the leftovers...
            wrongclass = filter(lambda x: x.cl!=realcenter.cl,realpoints)
            if len(wrongclass) == 0:
                # Cluster is consistent: stop tracking it and protect its
                # members from future center resets.
                indEMClusters.remove(ind)
                lResetExclusions.extend( [x.index for x in rightclass] )
            else:
                resetCenters.append(ind)
            # Cross constraints between right and wrong classes.
            # -2 = cannot-link, 2 = must-link (applied symmetrically below).
            for i in rightclass:
                for j in realpoints:
                    if j in wrongclass:
                        constraints.append([i.index,j.index,-2])
                    elif j!= i:
                        constraints.append([i.index,j.index,2])
        # Re-apply the full accumulated constraint set each iteration.
        for i in constraints:
            em.mCij[i[0]][i[1]] = i[2]
            em.mCij[i[1]][i[0]] = i[2]
        # If all classes are not right, restart.
        em.resetSomeCenters(em.lInitialCenters,resetCenters,lResetExclusions)
        em.EM(len(emclusters))
        emclusters = RepPts.createClusters(em)
        RepPts.repPoints(em, emclusters)
        print "goodInitial iter nmi: ", evaluateEM_NMI(D,em)," ",iters
        iters += 1
    # queries,cons,likelihood,NMI
    maybeWrite(fp, "%d,%d,%f,%f\n" % (numUserQueries, len(constraints), em.dEMLikelihood, evaluateEM_NMI(D,em) ) )
    print indEMClusters
    for cl in emclusters:
        print ([D.data[i.index].cl for i in cl.midpoints],D.data[cl.center.index].cl)," ",cl.center.index
    return em
import numpy import EM import cData import utils # run EM several times and get the likelihood for iRestart in range(20): D = cData.cData("data/winenorm3_pyre.csv") # D = cData.cData("data/normvert.csv") M = EM.cEM(D) M.bPPC = False M.EM(3) print M.dEMLikelihood, print " nmi: ", print utils.evaluateEM_NMI(D, M)