def impdata(self): self.basedata = [] f = open(self.filebase, 'r') for line in f.readlines(): data = line.split(' ') #print data d = cData.cData(data[0], data[1:],data[6]) self.basedata.append(d) f.close() print "base data len : ", len(self.basedata) self.realdata = [] f = open(self.filereal, 'r') for line in f.readlines(): data = line.split(' ') #print data d = cData.cData(data[0], data[1:],data[6]) self.realdata.append(d) f.close() print "real data len : ", len(self.realdata)
def gencode(self):
    """Generate cData codes for every sequence produced by self.getnext.

    Starts from the seed [1, 2, ..., self.count] and repeatedly feeds the
    current sequence to self.getnext. Each non-None result is wrapped as
    cData.cData(0, sequence, 0), counted in self.total and yielded. The seed
    itself is never yielded. When self.getnext returns None, a final None is
    yielded as an end marker.
    """
    current = self.getnext(list(range(1, self.count + 1)))
    while current is not None:
        code = cData.cData(0, current, 0)
        self.total += 1
        yield code
        current = self.getnext(current)
    yield None
# After several rounds of having the user provide # feedback on starting points, we proceed with # random pairwise constraints and watch to see our # NMI increase import pickle import EM import cData import sys if len(sys.argv) < 2: print "provide filename and optionally a filename for the pickle of centers" exit(1) # use the same code for getting initial points as baturay did D = cData.cData(sys.argv[1]) D.setType("2", "random") EmAlg = EM.cEM(D) EmAlg.EM(len(D.classlist)) EmAlg.bPPC = True #Creates clusters depending on what EM guessed. D.createClusters(EmAlg) #Finds the outerpoints and the midpoints and assigns them in emclusters. D.repPoints(EmAlg) #This makes the algorithm start with good initial points. EmAlg = D.goodInitial(EmAlg) print "pickling starting position to: ", picklefname = "pickles/"+sys.argv[1].split('/')[-1]+".pickle" if len(sys.argv) > 2: picklefname = sys.argv[2]
def run():
    """Load data/winenorm3_pyre.csv and run EM.cEM on it via EM(3).

    Returns None; the fitted model is discarded.
    """
    dataset = cData.cData("data/winenorm3_pyre.csv")
    EM.cEM(dataset).EM(3)
import EM
import cData

# Dataset this experiment runs on.
D = cData.cData("data/DATASET3_trim.csv")
# load full constraint set into cData
# run PPC with same initial clusters with increasing number constraints
# NOTE(review): `import EM` binds the name EM to the module, so calling it
# directly looks like it would raise TypeError. Presumably a class inside the
# module (e.g. EM.cEM or EM.EM) was intended -- confirm against the EM module.
M = EM(D)
numCent = 6
# NOTE(review): the body of this loop lies outside the visible source.
for numCons in [500]:
bNewCenters = False if len(sys.argv) > 2: # check for 'new centers' bypass if sys.argv[2] == "__NEW__": bNewCenters = True else: lICenters = pickle.load(open(sys.argv[2])) else: # check for default initial centers pickle sfDefltPkl = sfData + ".icenters.pickle" if os.access(sfDefltPkl, os.R_OK): lICenters = pickle.load(open(sys.argv[2])) else: bNewCenters = True D = cData.cData(sfData) PPC = EM.EM(D) if bNewCenters: #PPC = D.emRestarts(10) #PPC.EM(len(D.data)) PPC.EM(3) print PPC.lInitialCenters lICenters = PPC.lInitialCenters[:] D.poscons = [(i,j) for i in range(len(D.data)) for j in range(len(D.data))] cons = D.pairCons(500) for i in cons: PPC.mCij[i[0]][i[1]] = i[2] PPC.mCij[i[1]][i[0]] = i[2]
import numpy import EM import cData import utils # run EM several times and get the likelihood for iRestart in range(20): D = cData.cData("data/winenorm3_pyre.csv") # D = cData.cData("data/normvert.csv") M = EM.cEM(D) M.bPPC = False M.EM(3) print M.dEMLikelihood, print " nmi: ", print utils.evaluateEM_NMI(D, M)
sCommands = "s1_s2_s3_s4" argc = len(sys.argv) if argc >= 2: sNickname = sys.argv[1] if argc >= 3: # includes filename stub of input lDataFiles = [ sys.argv[2] ] if argc == 4: sCommands = sys.argv[3] if argc < 2 or argc > 4: errmsg() exit(1) lCommands = sCommands.split("_") for dfname in lDataFiles: D = cData.cData("data/" + dfname + ".csv") # scenario 1 if "s1" in lCommands: print "running scenario 1" EMStartsS1 = scenario1(D, "results/" + sNickname + "." + dfname) print "pickling results of scenario 1" pf = open("results/" + sNickname + "." + dfname + ".scen1.pickle", "w") pickle.dump(EMStartsS1, pf) pf.close() # load results from scenario 1 pickle if "p1" in lCommands: print "loading scenario 1 results from pickle" pf = open("results/" + sNickname + "." + dfname + ".scen1.pickle") EMStartsS1 = pickle.load(pf)
def EMLikelihood(self): # formula is sum over n,l of gamma(i,l) * z(i,l) where # z = 1 if i is in cluster k, else 0 nData = len(self.mData.data) LL = 0 membership = np.ravel(self.mGammas.argmax(1).T) for i in range(nData): for k in range(len(self.lCenters)): if membership[i] == k: LL += self.mLikelihood_il[i,k] self.dEMLikelihood = LL return LL # give the class labels of each data item def Membership(self): return np.ravel(self.mGammas.argmax(1).T) def printDims(v, textv): print "dims ", textv, np.size(v,0), np.size(v,1) def printDim(v, textv): print "dim ", textv, np.size(v,0) if __name__ == "__main__": import cData D = cData.cData("data/winenorm3_pyre.txt") M = cEM(D) M.EM(3)