def __init__(self):
    """Create empty data/feature containers and the helper objects.

    The image size is fixed at 27 x 27 and is passed to the feature
    extractor at construction time.
    """
    # Containers start out empty.
    self.data, self.features = [], []

    # Image dimensions handed to the feature extractor below.
    self.width = self.height = 27

    # Helpers are built in the same order as before, in case their
    # constructors have side effects.
    self.featureExtractor = AutoEncoderFeatureExtractor(self.width, self.height)
    self.processor = PreProcessor()
    self.driver = ProbDriver()
def __init__(self):
    """Create empty data/feature containers and the helper objects.

    The image size is fixed at 27 x 27 and is passed to the feature
    extractor at construction time.
    """
    # Containers start out empty.
    self.data, self.features = [], []

    # Image dimensions handed to the feature extractor below.
    self.width = self.height = 27

    # Helpers are built in the same order as before, in case their
    # constructors have side effects.
    self.featureExtractor = AutoEncoderFeatureExtractor(self.width, self.height)
    self.processor = PreProcessor()
    self.driver = ProbDriver()
class AutoEncoderDriver(AutoEncoderFeatureDriver):
    """Driver that turns raw (X, Y) samples into autoencoder features and
    scores them per user against a threshold.

    The visible workflow is: ``load_data`` -> ``size_normalization`` ->
    ``imagize`` -> ``generate_features`` (which trains first) ->
    ``dump_feature``; ``score`` evaluates previously stored features.
    ``get_data_from_file`` and ``load_feature`` are not defined in this
    class and are presumably inherited from ``AutoEncoderFeatureDriver``
    (TODO confirm).
    """

    # Dataset layout used by every method except ``load_data``'s user loop,
    # which reads ``settings.USER_COUNT`` instead.
    # NOTE(review): confirm settings.USER_COUNT is meant to equal 40.
    _USER_COUNT = 40
    _SIGNATURES_PER_USER = 40
    # Samples [0, 20) of a user are used as train/positive samples, samples
    # [20, 40) as that user's own negatives (see ``train_test_set``).
    _POSITIVE_COUNT = 20
    # Kept immutable; ``train`` copies it into a fresh list per call.
    _DEFAULT_LAYER_SIZES = (500, 300, 100, 50)

    def __init__(self):
        """Create empty containers, the 27 x 27 image size and helpers."""
        self.data = []
        self.features = []
        self.width = 27
        self.height = 27
        self.featureExtractor = AutoEncoderFeatureExtractor(self.width, self.height)
        self.processor = PreProcessor()
        self.driver = ProbDriver()

    def load_data(self):
        """Read every ``U<uid>S<sid>.TXT`` file into ``self.data``.

        ``self.data[u][s]`` becomes the ``(X, Y)`` pair returned by
        ``get_data_from_file``; the ``T`` and ``P`` values are discarded.

        The method changes into ``../data/Task2`` and afterwards into
        ``../..``.  Note that this lands one level *above* the directory the
        call started in, not in the starting directory itself; that final
        location is preserved from the original code.  The final ``chdir``
        now also runs when reading fails, so an error no longer leaves the
        process inside the data directory.
        """
        os.chdir("../data/Task2")
        try:
            self.data = []
            for uid in range(1, settings.USER_COUNT + 1):
                uidData = []
                for sid in range(1, self._SIGNATURES_PER_USER + 1):
                    fileName = "U%dS%d.TXT" % (uid, sid)
                    X, Y, T, P = self.get_data_from_file(fileName)
                    uidData.append((X, Y))
                self.data.append(uidData)
        finally:
            os.chdir("../..")

    def size_normalization(self):
        """Replace each ``(X, Y)`` in ``self.data`` by its size-normalized
        version, as computed by ``self.processor.size_normalization`` for
        the configured width and height.
        """
        normalized = []
        for uid in range(self._USER_COUNT):
            uidData = []
            for sid in range(self._SIGNATURES_PER_USER):
                xs, ys = self.data[uid][sid][0], self.data[uid][sid][1]
                X, Y = self.processor.size_normalization(
                    xs, ys, self.width, self.height
                )
                uidData.append((X, Y))
            normalized.append(uidData)
        self.data = normalized

    def imagize(self):
        """Replace each ``(X, Y)`` in ``self.data`` by the image returned
        from ``self.featureExtractor.imagize(X, Y)``.
        """
        images = []
        for uid in range(self._USER_COUNT):
            images.append([
                self.featureExtractor.imagize(
                    self.data[uid][sid][0], self.data[uid][sid][1]
                )
                for sid in range(self._SIGNATURES_PER_USER)
            ])
        self.data = images

    def train(self, layer_sizes=None, epoch=1000):
        """Train the feature extractor via ``train_with_mnist``.

        Args:
            layer_sizes: hidden layer sizes; ``None`` (the default) means
                ``[500, 300, 100, 50]``.  A fresh list is created per call so
                no list object is shared between calls.
            epoch: forwarded as ``training_epochs``.

        Raises:
            ValueError: if ``self.data`` does not form a 3-D array.
        """
        if layer_sizes is None:
            layer_sizes = list(self._DEFAULT_LAYER_SIZES)
        if not self.data:
            self.imagize()

        # The original flattened self.data into a training matrix for an
        # in-house training call that was already disabled.  Only the shape
        # unpacking is kept, so malformed data still fails here.
        (uCnt, sCnt, pCnt) = numpy.asarray(self.data).shape

        self.featureExtractor.train_with_mnist(
            pretraining_epochs=15,
            training_epochs=epoch,
            hidden_layers_sizes=layer_sizes,
        )

    def generate_features(self):
        """Train the extractor, then fill ``self.features`` so that
        ``self.features[u][s]`` holds the result of
        ``self.featureExtractor.generate_features(self.data[u][s])``.
        """
        # train data first
        self.train()

        print("generating features...")
        self.features = []
        for uid in range(self._USER_COUNT):
            uidFeatures = [
                self.featureExtractor.generate_features(self.data[uid][sid])
                for sid in range(self._SIGNATURES_PER_USER)
            ]
            self.features.append(uidFeatures)

    def dump_feature(self):
        """Write every feature to ``./data/auto_features/u<uid>s<sid>.txt``.

        The output directory is created if missing (``./data`` itself must
        already exist).  Files are addressed by path instead of changing the
        working directory, so a failure part-way through cannot leave the
        process in a different directory.
        """
        print("... dumpint features")
        autoFeatureDir = os.path.join("./data", "auto_features")
        if not os.path.exists(autoFeatureDir):
            os.mkdir(autoFeatureDir)
        for uid in range(self._USER_COUNT):
            for sid in range(self._SIGNATURES_PER_USER):
                fileName = "u%ds%d.txt" % (uid, sid)
                numpy.savetxt(
                    os.path.join(autoFeatureDir, fileName),
                    self.features[uid][sid],
                    fmt="%10.5f",
                )

    def train_test_set(self, uid, cnt):
        """Split the stored features into four lists for user ``uid``.

        Args:
            uid: zero-based user index into ``self.features``.
            cnt: number of leading samples of that user used for training.

        Returns:
            ``(train_set_x, pos_set_x, neg_set_x_ori, neg_set_x_oth)``:
            samples ``[0, cnt)``, ``[cnt, 20)`` and ``[20, 40)`` of ``uid``,
            followed by every sample of every other user, each converted
            with ``.tolist()``.
        """
        uidFeatures = self.features[uid]
        train_set_x = [uidFeatures[sid].tolist() for sid in range(cnt)]
        pos_set_x = [
            uidFeatures[sid].tolist()
            for sid in range(cnt, self._POSITIVE_COUNT)
        ]
        neg_set_x_ori = [
            uidFeatures[sid].tolist()
            for sid in range(self._POSITIVE_COUNT, self._SIGNATURES_PER_USER)
        ]

        neg_set_x_oth = []
        for other in range(self._USER_COUNT):
            if other == uid:
                continue
            for sid in range(self._SIGNATURES_PER_USER):
                neg_set_x_oth.append(self.features[other][sid].tolist())

        return train_set_x, pos_set_x, neg_set_x_ori, neg_set_x_oth

    def score_of_uid(self, uid, cnt):
        """Score one user with a fresh ``ProbDriver``.

        ``driver.ps_temp`` is called with the training samples (presumably
        this fits the model - TODO confirm) and the threshold is the minimum
        ``driver.PS`` value over those same samples.

        Returns:
            ``(scoreOfPos, scoreOfNegOri, scoreOfNegOth)``: the fraction of
            positive samples with ``PS >= threshold`` and the fractions of
            own/other negative samples with ``PS <= threshold``.

        Raises:
            ValueError: if ``cnt`` is 0 (no training sample for ``min``).
            ZeroDivisionError: if one of the evaluated sets is empty, e.g.
                the positive set when ``cnt >= 20``.
        """
        train_set_x, pos_set_x, neg_set_x_ori, neg_set_x_oth = self.train_test_set(
            uid, cnt
        )

        driver = ProbDriver()
        driver.ps_temp(train_set_x)
        threshold = min([driver.PS(X) for X in train_set_x])

        def _score_of_set(set_x, pos=True):
            """Fraction of ``set_x`` on the expected side of the threshold."""
            setPS = [driver.PS(X) for X in set_x]
            if pos:
                correctSize = sum(1 for ps in setPS if ps >= threshold)
            else:
                correctSize = sum(1 for ps in setPS if ps <= threshold)
            return correctSize / float(len(set_x))

        scoreOfPos = _score_of_set(pos_set_x, pos=True)
        scoreOfNegOri = _score_of_set(neg_set_x_ori, pos=False)
        scoreOfNegOth = _score_of_set(neg_set_x_oth, pos=False)
        return scoreOfPos, scoreOfNegOri, scoreOfNegOth

    def score(self):
        """Load stored features and print the mean of the three scores over
        every user and every training size in ``[3, 5, 7, 10, 15]``.
        """
        self.load_feature()
        scoreOfPos = []
        scoreOfNegOri = []
        scoreOfNegOth = []
        for cnt in [3, 5, 7, 10, 15]:
            for uid in range(self._USER_COUNT):
                pos, negOri, negOth = self.score_of_uid(uid, cnt)
                scoreOfPos.append(pos)
                scoreOfNegOri.append(negOri)
                scoreOfNegOth.append(negOth)
        # NOTE(review): the original indentation was lost.  The accumulators
        # are created once before the loops, so the summary is printed once
        # after them; confirm a per-``cnt`` report was not intended.
        # "%s" formatting gives the same space-separated output under both
        # the Python 2 print statement and the Python 3 print function.
        print("%s %s %s" % (
            numpy.mean(scoreOfPos),
            numpy.mean(scoreOfNegOri),
            numpy.mean(scoreOfNegOth),
        ))
class AutoEncoderDriver(AutoEncoderFeatureDriver):
    """Driver that turns raw (X, Y) samples into autoencoder features and
    scores them per user against a threshold.

    The visible workflow is: ``load_data`` -> ``size_normalization`` ->
    ``imagize`` -> ``generate_features`` (which trains first) ->
    ``dump_feature``; ``score`` evaluates previously stored features.
    ``get_data_from_file`` and ``load_feature`` are not defined in this
    class and are presumably inherited from ``AutoEncoderFeatureDriver``
    (TODO confirm).
    """

    # Dataset layout used by every method except ``load_data``'s user loop,
    # which reads ``settings.USER_COUNT`` instead.
    # NOTE(review): confirm settings.USER_COUNT is meant to equal 40.
    _USER_COUNT = 40
    _SIGNATURES_PER_USER = 40
    # Samples [0, 20) of a user are used as train/positive samples, samples
    # [20, 40) as that user's own negatives (see ``train_test_set``).
    _POSITIVE_COUNT = 20
    # Kept immutable; ``train`` copies it into a fresh list per call.
    _DEFAULT_LAYER_SIZES = (500, 300, 100, 50)

    def __init__(self):
        """Create empty containers, the 27 x 27 image size and helpers."""
        self.data = []
        self.features = []
        self.width = 27
        self.height = 27
        self.featureExtractor = AutoEncoderFeatureExtractor(self.width, self.height)
        self.processor = PreProcessor()
        self.driver = ProbDriver()

    def load_data(self):
        """Read every ``U<uid>S<sid>.TXT`` file into ``self.data``.

        ``self.data[u][s]`` becomes the ``(X, Y)`` pair returned by
        ``get_data_from_file``; the ``T`` and ``P`` values are discarded.

        The method changes into ``../data/Task2`` and afterwards into
        ``../..``.  Note that this lands one level *above* the directory the
        call started in, not in the starting directory itself; that final
        location is preserved from the original code.  The final ``chdir``
        now also runs when reading fails, so an error no longer leaves the
        process inside the data directory.
        """
        os.chdir("../data/Task2")
        try:
            self.data = []
            for uid in range(1, settings.USER_COUNT + 1):
                uidData = []
                for sid in range(1, self._SIGNATURES_PER_USER + 1):
                    fileName = "U%dS%d.TXT" % (uid, sid)
                    X, Y, T, P = self.get_data_from_file(fileName)
                    uidData.append((X, Y))
                self.data.append(uidData)
        finally:
            os.chdir("../..")

    def size_normalization(self):
        """Replace each ``(X, Y)`` in ``self.data`` by its size-normalized
        version, as computed by ``self.processor.size_normalization`` for
        the configured width and height.
        """
        normalized = []
        for uid in range(self._USER_COUNT):
            uidData = []
            for sid in range(self._SIGNATURES_PER_USER):
                xs, ys = self.data[uid][sid][0], self.data[uid][sid][1]
                X, Y = self.processor.size_normalization(
                    xs, ys, self.width, self.height
                )
                uidData.append((X, Y))
            normalized.append(uidData)
        self.data = normalized

    def imagize(self):
        """Replace each ``(X, Y)`` in ``self.data`` by the image returned
        from ``self.featureExtractor.imagize(X, Y)``.
        """
        images = []
        for uid in range(self._USER_COUNT):
            images.append([
                self.featureExtractor.imagize(
                    self.data[uid][sid][0], self.data[uid][sid][1]
                )
                for sid in range(self._SIGNATURES_PER_USER)
            ])
        self.data = images

    def train(self, layer_sizes=None, epoch=1000):
        """Train the feature extractor via ``train_with_mnist``.

        Args:
            layer_sizes: hidden layer sizes; ``None`` (the default) means
                ``[500, 300, 100, 50]``.  A fresh list is created per call so
                no list object is shared between calls.
            epoch: forwarded as ``training_epochs``.

        Raises:
            ValueError: if ``self.data`` does not form a 3-D array.
        """
        if layer_sizes is None:
            layer_sizes = list(self._DEFAULT_LAYER_SIZES)
        if not self.data:
            self.imagize()

        # The original flattened self.data into a training matrix for an
        # in-house training call that was already disabled.  Only the shape
        # unpacking is kept, so malformed data still fails here.
        (uCnt, sCnt, pCnt) = numpy.asarray(self.data).shape

        self.featureExtractor.train_with_mnist(
            pretraining_epochs=15,
            training_epochs=epoch,
            hidden_layers_sizes=layer_sizes,
        )

    def generate_features(self):
        """Train the extractor, then fill ``self.features`` so that
        ``self.features[u][s]`` holds the result of
        ``self.featureExtractor.generate_features(self.data[u][s])``.
        """
        # train data first
        self.train()

        print("generating features...")
        self.features = []
        for uid in range(self._USER_COUNT):
            uidFeatures = [
                self.featureExtractor.generate_features(self.data[uid][sid])
                for sid in range(self._SIGNATURES_PER_USER)
            ]
            self.features.append(uidFeatures)

    def dump_feature(self):
        """Write every feature to ``./data/auto_features/u<uid>s<sid>.txt``.

        The output directory is created if missing (``./data`` itself must
        already exist).  Files are addressed by path instead of changing the
        working directory, so a failure part-way through cannot leave the
        process in a different directory.
        """
        print("... dumpint features")
        autoFeatureDir = os.path.join("./data", "auto_features")
        if not os.path.exists(autoFeatureDir):
            os.mkdir(autoFeatureDir)
        for uid in range(self._USER_COUNT):
            for sid in range(self._SIGNATURES_PER_USER):
                fileName = "u%ds%d.txt" % (uid, sid)
                numpy.savetxt(
                    os.path.join(autoFeatureDir, fileName),
                    self.features[uid][sid],
                    fmt="%10.5f",
                )

    def train_test_set(self, uid, cnt):
        """Split the stored features into four lists for user ``uid``.

        Args:
            uid: zero-based user index into ``self.features``.
            cnt: number of leading samples of that user used for training.

        Returns:
            ``(train_set_x, pos_set_x, neg_set_x_ori, neg_set_x_oth)``:
            samples ``[0, cnt)``, ``[cnt, 20)`` and ``[20, 40)`` of ``uid``,
            followed by every sample of every other user, each converted
            with ``.tolist()``.
        """
        uidFeatures = self.features[uid]
        train_set_x = [uidFeatures[sid].tolist() for sid in range(cnt)]
        pos_set_x = [
            uidFeatures[sid].tolist()
            for sid in range(cnt, self._POSITIVE_COUNT)
        ]
        neg_set_x_ori = [
            uidFeatures[sid].tolist()
            for sid in range(self._POSITIVE_COUNT, self._SIGNATURES_PER_USER)
        ]

        neg_set_x_oth = []
        for other in range(self._USER_COUNT):
            if other == uid:
                continue
            for sid in range(self._SIGNATURES_PER_USER):
                neg_set_x_oth.append(self.features[other][sid].tolist())

        return train_set_x, pos_set_x, neg_set_x_ori, neg_set_x_oth

    def score_of_uid(self, uid, cnt):
        """Score one user with a fresh ``ProbDriver``.

        ``driver.ps_temp`` is called with the training samples (presumably
        this fits the model - TODO confirm) and the threshold is the minimum
        ``driver.PS`` value over those same samples.

        Returns:
            ``(scoreOfPos, scoreOfNegOri, scoreOfNegOth)``: the fraction of
            positive samples with ``PS >= threshold`` and the fractions of
            own/other negative samples with ``PS <= threshold``.

        Raises:
            ValueError: if ``cnt`` is 0 (no training sample for ``min``).
            ZeroDivisionError: if one of the evaluated sets is empty, e.g.
                the positive set when ``cnt >= 20``.
        """
        train_set_x, pos_set_x, neg_set_x_ori, neg_set_x_oth = self.train_test_set(
            uid, cnt
        )

        driver = ProbDriver()
        driver.ps_temp(train_set_x)
        threshold = min([driver.PS(X) for X in train_set_x])

        def _score_of_set(set_x, pos=True):
            """Fraction of ``set_x`` on the expected side of the threshold."""
            setPS = [driver.PS(X) for X in set_x]
            if pos:
                correctSize = sum(1 for ps in setPS if ps >= threshold)
            else:
                correctSize = sum(1 for ps in setPS if ps <= threshold)
            return correctSize / float(len(set_x))

        scoreOfPos = _score_of_set(pos_set_x, pos=True)
        scoreOfNegOri = _score_of_set(neg_set_x_ori, pos=False)
        scoreOfNegOth = _score_of_set(neg_set_x_oth, pos=False)
        return scoreOfPos, scoreOfNegOri, scoreOfNegOth

    def score(self):
        """Load stored features and print the mean of the three scores over
        every user and every training size in ``[3, 5, 7, 10, 15]``.
        """
        self.load_feature()
        scoreOfPos = []
        scoreOfNegOri = []
        scoreOfNegOth = []
        for cnt in [3, 5, 7, 10, 15]:
            for uid in range(self._USER_COUNT):
                pos, negOri, negOth = self.score_of_uid(uid, cnt)
                scoreOfPos.append(pos)
                scoreOfNegOri.append(negOri)
                scoreOfNegOth.append(negOth)
        # NOTE(review): the original indentation was lost.  The accumulators
        # are created once before the loops, so the summary is printed once
        # after them; confirm a per-``cnt`` report was not intended.
        # "%s" formatting gives the same space-separated output under both
        # the Python 2 print statement and the Python 3 print function.
        print("%s %s %s" % (
            numpy.mean(scoreOfPos),
            numpy.mean(scoreOfNegOri),
            numpy.mean(scoreOfNegOth),
        ))