def run(self): print "Master pid is :%d\n" % os.getpid() self.predictor.start() testenv = Environment(226) while True: self.global_t += 1 if self.global_t % LOG_FREQ == 0: t_diff = time.time() - self.start_time print("{} steps in {} seconds, {} steps/h".format( LOG_FREQ, t_diff, 3600 * LOG_FREQ / t_diff)) self.start_time = time.time() if self.global_t % SAVE_FREQ == 0: self.network.save(self.global_t) identity, frame, reward, isover, frag_cnt, kdr = load( self.c2s_socket.recv(copy=False).bytes) if len(self.client[identity]) > 0: self.client[identity][-1].reward = reward #print 'frame received from {}'.format(identity) self._on_state(frame, identity) if isover: self._parse_memory(identity, 0, True) else: if len(self.client[identity]) == LOCAL_T_MAX + 1: self._parse_memory(identity, self.client[identity][-1].value, False) if isover: self.network.log_eval(frag_cnt, kdr)
def feature_trans_pca(src_pack_file, dst_pack_file):
    all_data = []
    person_feature_dic = msgpack_numpy.load(open(src_pack_file, 'rb'))
    for person_index, person in enumerate(person_feature_dic):
        feature_list = person_feature_dic.get(person)
        for index in range(len(feature_list)):
            try:
                if feature_list[index][1] is None:
                    continue
                this_feature = np.array(feature_list[index][1][0])
                all_data.append(this_feature)
            except:
                traceback.print_exc()
    all_data = np.asarray(all_data)
    pca = PCA(n_components=128)
    pca.fit(all_data)
    for person_index, person in enumerate(person_feature_dic):
        feature_list = person_feature_dic.get(person)
        for index in range(len(feature_list)):
            try:
                if feature_list[index][1] is None:
                    continue
                this_feature = np.array(feature_list[index][1][0])
                this_feature = np.reshape(this_feature, (1, this_feature.size))
                this_feature = pca.transform(this_feature)[0]
                feature_list[index][1][0] = this_feature
            except:
                traceback.print_exc()
    msgpack_numpy.dump(person_feature_dic, open(dst_pack_file, 'wb'))
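# Optional follow-up (a sketch, not part of the original): persist the fitted PCA so the
# same 128-d projection can be applied to new features later. The model path is illustrative.
def save_pca_model(pca, model_path='pca_128.model'):
    import cPickle
    cPickle.dump(pca, open(model_path, 'wb'))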
def extract_triplet_feature():
    lfw_feature_dic = msgpack_numpy.load(open(feature_pack_file, 'rb'))
    new_lfw_feature_dic = {}
    model_file = '/data/liubo/face/vgg_face_model/annotate_siamese_graph.model'
    weight_file = '/data/liubo/face/vgg_face_model/annotate_siamese_graph.weight'
    model = model_from_json(open(model_file, 'r').read())
    opt = Adam()
    model.compile(optimizer=opt, loss=['categorical_crossentropy'])
    model.load_weights(weight_file)
    # pdb.set_trace()
    get_Conv_FeatureMap = K.function(
        [model.layers[2].layers[0].get_input_at(False), K.learning_phase()],
        [model.layers[2].layers[-1].get_output_at(False)])
    for person in lfw_feature_dic:
        # print person
        this_person_feature_list = lfw_feature_dic.get(person)
        this_person_triplet_feature_list = []
        for feature, path in this_person_feature_list:
            feature = np.reshape(feature, (1, feature.size))
            new_feature = get_Conv_FeatureMap([feature, 0])[0].copy()
            this_person_triplet_feature_list.append((new_feature, path))
        new_lfw_feature_dic[person] = this_person_triplet_feature_list
    msgpack_numpy.dump(new_lfw_feature_dic, open(triplet_feature_pack_file, 'wb'))
def split_train_valid(pack_file, train_pic_num, feature_dim):
    start = time()
    person_feature_dic = msgpack_numpy.load(open(pack_file, 'rb'))
    all_train_data = []
    all_train_label = []
    all_valid_data = []
    all_valid_label = []
    for person_index, person in enumerate(person_feature_dic):
        feature_list = person_feature_dic.get(person)
        np.random.shuffle(feature_list)
        if len(feature_list) < train_pic_num:
            continue
        else:
            for index in range(train_pic_num):
                pic_name, feature = feature_list[index]
                feature = np.asarray(feature)
                if feature.shape != (1, feature_dim):
                    continue
                all_train_data.append(feature)
                all_train_label.append(person)
            for index in range(train_pic_num, len(feature_list)):
                pic_name, feature = feature_list[index]
                feature = np.asarray(feature)
                if feature.shape != (1, feature_dim):
                    continue
                all_valid_data.append(feature)
                all_valid_label.append(person)
    all_train_data = np.asarray(all_train_data)
    all_train_label = np.asarray(all_train_label)
    all_valid_data = np.asarray(all_valid_data)
    all_valid_label = np.asarray(all_valid_label)
    return all_train_data, all_train_label, all_valid_data, all_valid_label
def main_distance():
    all_data = []
    all_label = []
    all_pic_path_list = []
    count = 0
    verif_path_feature_dic = msgpack_numpy.load(open(feature_pack_file, 'rb'))
    for line in open(pair_file):
        if count % 100 == 0:
            print count
        count += 1
        tmp = line.rstrip().split()
        if len(tmp) == 3:
            path1 = tmp[0]
            path2 = tmp[1]
            label = int(tmp[2])
            feature1 = verif_path_feature_dic.get(path1)
            feature2 = verif_path_feature_dic.get(path2)
            # pdb.set_trace()
            # predicts = pw.cosine_similarity(feature1, feature2)
            predicts = np.fabs(feature1 - feature2)
            all_data.append(predicts)
            all_label.append(label)
            all_pic_path_list.append((path1, path2))
    data = np.asarray(all_data)
    # print data.shape
    # data = np.reshape(data, newshape=(data.shape[0], 1))
    data = np.reshape(data, newshape=(data.shape[0], data.shape[2]))
    label = np.asarray(all_label)
    print data.shape, label.shape
    msgpack_numpy.dump((data, label, all_pic_path_list),
                       open('orl_verif_fc7_finetune_fc8.p', 'wb'))
def word_length_stat():
    # every query word is at most 30 characters long
    word_dic = msgpack_numpy.load(
        open('/data/liubo/hotspot/all_query_dic.p', 'rb'))
    word_length_dic = {}
    for word in word_dic:
        word_length = len(word)
        word_length_dic[word_length] = word_length_dic.get(word_length, 0) + 1
    print word_length_dic
def run(self):
    while True:
        for _ in tqdm(range(config.save_freq)):
            client_id, observations = load(
                self.c2s_socket.recv(copy=False).bytes)
            self._on_state(client_id, observations)
        self.network.save()
        config.update()
def load_data():
    # the data, shuffled and split between train and test sets
    (data, label) = msgpack_numpy.load(
        open('/data/pictures_annotate_feature/annotate_data.p', 'rb'))
    digit_indices = [np.where(label == i)[0] for i in range(nb_class)]
    pairs_x, pairs_y = create_pairs(data, digit_indices)
    pairs_x = pairs_x[:10000]
    pairs_y = pairs_y[:10000]
    tr_pairs, te_pairs, tr_y, te_y = train_test_split(pairs_x, pairs_y, test_size=0.1)
    print tr_pairs.shape, te_pairs.shape, tr_y.shape, te_y.shape
    return tr_pairs, te_pairs, tr_y, te_y
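# create_pairs is called above but is not defined in this snippet. A minimal sketch,
# assuming the standard Keras Siamese-pairs recipe (one positive pair of consecutive
# samples of the same class, one negative pair with a randomly chosen other class);
# it assumes `import random` and the module-level `nb_class` used above.
def create_pairs(data, digit_indices):
    pairs, labels = [], []
    n = min(len(digit_indices[d]) for d in range(nb_class)) - 1
    for d in range(nb_class):
        for i in range(n):
            # positive pair: two samples of the same class
            z1, z2 = digit_indices[d][i], digit_indices[d][i + 1]
            pairs.append([data[z1], data[z2]])
            # negative pair: one sample of class d, one of a random other class
            inc = random.randrange(1, nb_class)
            dn = (d + inc) % nb_class
            z1, z2 = digit_indices[d][i], digit_indices[dn][i]
            pairs.append([data[z1], data[z2]])
            labels += [1, 0]
    return np.array(pairs), np.array(labels)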
def create_train_valid_data(folder='/data/liubo/face/research_feature_self'):
    # train a face verification model from the features that already exist on disk
    person_list = os.listdir(folder)
    path_feature_dic = {}
    for person in person_list:
        person_path = os.path.join(folder, person)
        pic_feature_list = os.listdir(person_path)
        for pic_feature_path in pic_feature_list:
            pic_feature_path = os.path.join(person_path, pic_feature_path)
            pic_feature = msgpack_numpy.load(open(pic_feature_path, 'rb'))
            path_feature_dic[pic_feature_path] = pic_feature
    # use msgpack_numpy (not plain msgpack) so the numpy feature arrays serialize correctly
    msgpack_numpy.dump(path_feature_dic, open('research_feature.p', 'wb'))
def feature_fusion():
    # kf.split(...) below is the sklearn.model_selection API (sklearn >= 0.18),
    # so the fold count is passed as n_splits; the original n_folds keyword
    # belongs to the older cross_validation API.
    kf = KFold(n_splits=10)
    all_acc = []
    (data, label, all_pic_path_list) = msgpack_numpy.load(
        open('original_verif_fc7_finetune_fc8.p', 'rb'))
    error_file = 'error_pair.txt'
    f = open(error_file, 'w')
    all_pic_path_list = np.asarray(all_pic_path_list)
    for k, (train, valid) in enumerate(kf.split(data, label)):
        train_data = data[train]
        valid_data = data[valid]
        train_label = label[train]
        valid_label = label[valid]
        valid_path = all_pic_path_list[valid]
        clf = LinearSVC()
        clf.fit(train_data, train_label)
        acc = accuracy_score(valid_label, clf.predict(valid_data))
        roc_auc = roc_auc_score(valid_label, clf.predict(valid_data))
        # for index in range(len(valid_data)):
        #     if clf.predict(valid_data[index:index+1]) != valid_label[index]:
        #         f.write(valid_path[index][0]+'\t'+valid_path[index][1]+'\n')
        # rf_clf = RandomForestClassifier(n_estimators=500, n_jobs=15)
        # rf_clf.fit(train_data, train_label)
        # rf_predict_train_label_prob = rf_clf.predict_proba(train_data)
        # rf_predict_valid_label_prob = rf_clf.predict_proba(valid_data)
        #
        # gb_clf = GradientBoostingClassifier(learning_rate=0.05, n_estimators=500)
        # gb_clf.fit(train_data, train_label)
        # gb_predict_train_label_prob = gb_clf.predict_proba(train_data)
        # gb_predict_valid_label_prob = gb_clf.predict_proba(valid_data)
        # mf_clf = RandomForestClassifier()
        # mf_train_data = np.column_stack((rf_predict_train_label_prob, gb_predict_train_label_prob))
        # mf_valid_data = np.column_stack((rf_predict_valid_label_prob, gb_predict_valid_label_prob))
        # mf_clf.fit(mf_train_data, train_label)
        # acc = accuracy_score(valid_label, mf_clf.predict(mf_valid_data))
        # roc_auc = roc_auc_score(valid_label, mf_clf.predict(mf_valid_data))
        # acc = accuracy_score(valid_label, rf_clf.predict(valid_data))
        # roc_auc = roc_auc_score(valid_label, rf_clf.predict(valid_data))
        # acc = accuracy_score(valid_label, gb_clf.predict(valid_data))
        # roc_auc = roc_auc_score(valid_label, gb_clf.predict(valid_data))
        all_acc.append(acc)
        print acc, roc_auc
        # roc_auc = roc_auc_score(valid_label, clf.predict(valid_data))
        # print acc, roc_auc
    # cPickle.dump(clf, open('/data/liubo/face/vgg_face_dataset/model/lfw_verification_model', 'wb'))
    print 'mean :', np.mean(all_acc)
    f.close()
def train_valid_verif_model():
    all_data = []
    all_label = []
    all_pic_path_list = []
    count = 0
    for line in open(pair_file):
        if count % 100 == 0:
            print count
        count += 1
        tmp = line.rstrip().split()
        if len(tmp) == 3:
            path1 = tmp[0]
            path2 = tmp[1]
            if (os.path.exists(path1)) and (os.path.exists(path2)):
                feature1 = extract_feature_from_file(path1)
                feature2 = extract_feature_from_file(path2)
                predicts = pw.cosine_similarity(feature1, feature2)
                all_data.append(predicts)
                all_label.append(int(tmp[2]))
    msgpack_numpy.dump((all_data, all_label, all_pic_path_list),
                       open(feature_pack_file, 'wb'))
    (all_data, all_label, all_pic_path_list) = msgpack_numpy.load(
        open(feature_pack_file, 'rb'))
    all_data = np.asarray(all_data)
    data = np.reshape(all_data, newshape=(all_data.shape[0], all_data.shape[2]))
    label = np.asarray(all_label)
    print data.shape, label.shape
    kf = KFold(len(label), n_folds=10)
    all_acc = []
    for (train, valid) in kf:
        train_data = data[train]
        valid_data = data[valid]
        train_label = label[train]
        valid_label = label[valid]
        clf = LinearSVC()
        clf.fit(train_data, train_label)
        acc = accuracy_score(valid_label, clf.predict(valid_data))
        roc_auc = roc_auc_score(valid_label, clf.predict(valid_data))
        all_acc.append(acc)
        print acc, roc_auc
    print np.mean(all_acc)
    clf = LinearSVC()
    clf.fit(data, label)
    pdb.set_trace()
    cPickle.dump(clf, open(verification_model_file, 'wb'))
def run(self):
    self.player = Environment(self.index * 113)
    context = zmq.Context()
    self.c2s_socket = context.socket(zmq.PUSH)
    self.c2s_socket.setsockopt(zmq.IDENTITY, self.identity)
    self.c2s_socket.connect(self.c2s)
    self.s2c_socket = context.socket(zmq.DEALER)
    self.s2c_socket.setsockopt(zmq.IDENTITY, self.identity)
    self.s2c_socket.connect(self.s2c)
    while True:
        obs = self.player.current_state()
        self.c2s_socket.send(dump((self.index, obs)), copy=False)
        if obs is not None:
            action = load(self.s2c_socket.recv(copy=False).bytes)
            self.player.action(action)
def feature_fusion():
    # kf.split(...) below is the sklearn.model_selection API (sklearn >= 0.18),
    # so the fold count is passed as n_splits; the original n_folds keyword
    # belongs to the older cross_validation API.
    kf = KFold(n_splits=10)
    all_acc = []
    (data, label, all_pic_path_list) = msgpack_numpy.load(
        open('orl_verif_fc7_finetune_fc8.p', 'rb'))
    error_file = 'error_pair.txt'
    f = open(error_file, 'w')
    all_pic_path_list = np.asarray(all_pic_path_list)
    for k, (train, valid) in enumerate(kf.split(data, label)):
        train_data = data[train]
        valid_data = data[valid]
        train_label = label[train]
        valid_label = label[valid]
        valid_path = all_pic_path_list[valid]
        # clf = LinearSVC()
        # clf.fit(train_data, train_label)
        # acc = accuracy_score(valid_label, clf.predict(valid_data))
        # roc_auc = roc_auc_score(valid_label, clf.predict(valid_data))
        # for index in range(len(valid_data)):
        #     if clf.predict(valid_data[index:index+1]) != valid_label[index]:
        #         f.write(valid_path[index][0]+'\t'+valid_path[index][1]+'\n')
        rf_clf = RandomForestClassifier(n_estimators=500, n_jobs=15)
        rf_clf.fit(train_data, train_label)
        rf_predict_train_label_prob = rf_clf.predict_proba(train_data)
        rf_predict_valid_label_prob = rf_clf.predict_proba(valid_data)
        gb_clf = GradientBoostingClassifier(learning_rate=0.05, n_estimators=500)
        gb_clf.fit(train_data, train_label)
        gb_predict_train_label_prob = gb_clf.predict_proba(train_data)
        gb_predict_valid_label_prob = gb_clf.predict_proba(valid_data)
        mf_clf = RandomForestClassifier()
        mf_train_data = np.column_stack((rf_predict_train_label_prob, gb_predict_train_label_prob))
        mf_valid_data = np.column_stack((rf_predict_valid_label_prob, gb_predict_valid_label_prob))
        mf_clf.fit(mf_train_data, train_label)
        acc = accuracy_score(valid_label, mf_clf.predict(mf_valid_data))
        roc_auc = roc_auc_score(valid_label, mf_clf.predict(mf_valid_data))
        all_acc.append(acc)
        print acc, roc_auc
        # roc_auc = roc_auc_score(valid_label, clf.predict(valid_data))
        # print acc, roc_auc
    # cPickle.dump(clf, open('/data/liubo/face/vgg_face_dataset/model/lfw_verification_model', 'wb'))
    print 'mean :', np.mean(all_acc)
    f.close()
def load_train_data(self, data_folder):
    # read the image features directly and return all features with their labels
    all_pic_feature = []
    all_label = []
    person_list = os.listdir(data_folder)
    for person in person_list:
        if person == self.unknown or self.must_same_str in person or self.maybe_same_str in person:
            continue
        person_path = os.path.join(data_folder, person)
        pic_feature_list = os.listdir(person_path)
        for pic_feature_path in pic_feature_list:
            pic_feature = msgpack_numpy.load(
                open(os.path.join(person_path, pic_feature_path), 'rb'))
            all_pic_feature.append(pic_feature)
            all_label.append(person)
    all_pic_feature = np.asarray(all_pic_feature)
    all_label = np.asarray(all_label)
    return all_pic_feature, all_label
def feature_trans_autoencoder(src_pack_file, dst_pack_file):
    weight_file = '/data/liubo/face/annotate_face_model/skyeye_face_autoencoder.weight'
    model_file = '/data/liubo/face/annotate_face_model/skyeye_face_autoencoder.model'
    autoencoder = model_from_json(open(model_file, 'r').read())
    autoencoder.compile(optimizer='adadelta', loss='binary_crossentropy')
    autoencoder.load_weights(weight_file)
    get_Conv_FeatureMap = K.function(
        [autoencoder.layers[0].get_input_at(False), K.learning_phase()],
        [autoencoder.layers[-2].get_output_at(False)])
    person_feature_dic = msgpack_numpy.load(open(src_pack_file, 'rb'))
    for person_index, person in enumerate(person_feature_dic):
        feature_list = person_feature_dic.get(person)
        for index in range(len(feature_list)):
            try:
                if feature_list[index][1] is None:
                    continue
                this_feature = np.array(feature_list[index][1][0])
                this_feature = np.reshape(this_feature, (1, this_feature.size))
                this_feature = get_Conv_FeatureMap([this_feature, 0])[0][0]
                feature_list[index][1][0] = this_feature
            except:
                traceback.print_exc()
    msgpack_numpy.dump(person_feature_dic, open(dst_pack_file, 'wb'))
def run(self): print "My pid is :%d\n" % os.getpid() self.player = Environment(self.index * 113) context = zmq.Context() self.c2s_socket = context.socket(zmq.PUSH) self.c2s_socket.setsockopt(zmq.IDENTITY, self.identity) self.c2s_socket.connect(self.c2s) self.s2c_socket = context.socket(zmq.DEALER) self.s2c_socket.setsockopt(zmq.IDENTITY, self.identity) self.s2c_socket.connect(self.s2c) rew, isover, frag, kdr = None, False, 0, 0 while True: frame = self.player.current_state() self.c2s_socket.send(dump( (self.identity, [frame], rew, isover, frag, kdr)), copy=False) #rew is last action's reward action = load(self.s2c_socket.recv(copy=False).bytes) rew, isover, frag, kdr = self.player.action(action) if isover: self.player.reset_stat()
def import_data(self, debug=False):
    ''' method for importing and processing input data '''
    # Importing pickled wordvectors, dictionary, inputs and labels
    with open(self.base_path + '/wordvectors', 'rb') as vectors_file:
        print("Importing wordvectors...", end=' ', flush=True)
        word_vectors = msgpack_numpy.load(vectors_file)
        print("Done")
    with open(self.base_path + '/dictionary', 'rb') as dict_file:
        print("Importing dictionary...", end=' ', flush=True)
        dictionary = msgpack.load(dict_file, raw=False)
        print("Done")
    with open('inputs_slot_filling', 'rb') as data_inputs_file:
        print("Importing inputs...", end=' ', flush=True)
        sentences = msgpack.load(data_inputs_file, raw=False)
        print("Done")
    with open('outputs_slot_filling', 'rb') as data_outputs_file:
        print("Importing labels...", end=' ', flush=True)
        outputs = msgpack.load(data_outputs_file, raw=False)
        print("Done")

    ####################################################################
    # Processing inputs
    ####################################################################
    print("Modifying input sentences...")
    # initiating progress bar
    bar = progressbar.ProgressBar(max_value=len(sentences), redirect_stdout=True, end=' ')
    # preassigning the inputs variable for faster processing
    data_inputs = np.zeros((len(sentences), self.n_steps), dtype=np.int32)
    # initiating all the inputs to the index of the zero vector (zerowordvec_idx = dictionary['zerowordvec'])
    zerowordvec_idx = dictionary['zerowordvec']
    data_inputs[:, :] = zerowordvec_idx
    # Processing inputs
    lengths = np.zeros(len(sentences), dtype=np.int32)
    i = 0
    no_words_not_found = 0
    for line in sentences:
        # Initializing an empty list of indexes
        h = []
        # Iterating each word in the line over the dictionary and appending the indexes to a list
        for k in range(len(line)):
            try:
                idx = dictionary[line[k]]
            except:
                idx = zerowordvec_idx
                no_words_not_found += 1
                with open('words_not_found_in_dic', 'a') as f:
                    f.write(line[k] + '\n')
            # Appending the index (idx) of each word to the list h.
            h.append(idx)
        # appending the length of each line to the list lengths
        lengths[i] = len(line)
        # modify contents of the array
        data_inputs[i, :len(h)] = h
        # bar update
        bar.update(i)
        i = i + 1
    # bar finish
    bar.finish()
    # if words are not found in the dictionary
    if no_words_not_found != 0:
        print('# of words not found in the dictionary (incl. repetitions) = {}'.format(no_words_not_found),
              end=' ', flush=True)
    # if debug, print an input sample to check that the input pipeline is correct
    if debug:
        print('Sample input data')
        print('=========================================================')
        print('input sentences are {}'.format(sentences[0:2]))
        print('[Vector]input sentences are {}'.format(data_inputs[0:2]))
        print('=========================================================')

    ####################################################################
    # Processing labels
    ####################################################################
    print("Modifying outputs...")
    # Pre-assigning the data_outputs array
    data_outputs = np.zeros((self.n_examples, self.n_steps, len(self.available_slots)), dtype=np.int32)
    # Initiating all the one-hot vectors to the default vector corresponding to the 'Outside' slot.
    # The string 'O' follows the (Begin, Inside, Outside) naming convention used to classify
    # words (Object, Source etc.) in a sentence.
    # Ref: $ROS_WORKSPACE/mbot_natural_language_processing/mbot_nlu/ros/doc/pedro_thesis.pdf
    idx_outside = self.available_slots.index('O')
    data_outputs[:, :, idx_outside] = 1
    # Initiating progress bar
    bar = progressbar.ProgressBar(max_value=len(outputs), redirect_stdout=True, end=' ')
    # Index for line-wise iteration
    v = 0
    # Process outputs
    for line in outputs:
        # Index for word-wise iteration
        w = 0
        for output in line:
            # find slot if it exists in the available slots list
            try:
                idx_found = self.available_slots.index(output)
                # print('index found is ' + str(idx_found))
            except ValueError:
                raise Exception('Could not find this output = {} in this sentence = {} in the available list of slots'
                                .format(output, sentences[outputs.index(line)]))
            # modify array
            data_outputs[v][w][idx_outside] = 0
            data_outputs[v][w][idx_found] = 1
            w = w + 1
        # Incrementing line index
        v = v + 1
        # Progress bar update
        bar.update(v)
    # Progress bar finished
    bar.finish()
    # debug printing
    if debug:
        print('Sample output data')
        print('=========================================================')
        print('output labels are {}'.format(outputs[0:2]))
        print('[Vector]output labels are {}'.format(data_outputs[0:2]))
        print('=========================================================')
    return word_vectors, data_inputs, data_outputs, lengths
def train_valid_verif_model():
    all_data = []
    all_label = []
    all_pic_path_list = []
    count = 0
    path_feature_dic = msgpack_numpy.load(open(feature_pack_file, 'rb'))
    not_in = 0
    not_in_pair = {}
    for line in open(pair_file):
        if count % 100 == 0:
            print count
        count += 1
        tmp = line.rstrip().split()
        if len(tmp) == 3:
            path1 = tmp[0]
            path2 = tmp[1]
            label = int(tmp[2])
            if path1 in path_feature_dic and path2 in path_feature_dic:
                try:
                    feature1 = np.asarray(path_feature_dic.get(path1))
                    feature2 = np.asarray(path_feature_dic.get(path2))
                    predicts = pw.cosine_similarity(feature1, feature2)
                    all_data.append(predicts)
                    all_label.append(label)
                    all_pic_path_list.append((path1, path2))
                except:
                    traceback.print_exc()
            else:
                traceback.print_exc()
    msgpack_numpy.dump((all_data, all_label, all_pic_path_list),
                       open(feature_pack_file, 'wb'))
    (all_data, all_label, all_pic_path_list) = msgpack_numpy.load(
        open(feature_pack_file, 'rb'))
    pdb.set_trace()
    all_data = np.asarray(all_data)
    all_data = np.reshape(all_data, newshape=(all_data.shape[0], all_data.shape[2]))
    all_label = np.asarray(all_label)
    all_pic_path_list = np.asarray(all_pic_path_list)
    print all_data.shape, all_label.shape
    # kf.split(...) below is the sklearn.model_selection API (sklearn >= 0.18),
    # so the fold count is passed as n_splits here.
    kf = KFold(n_splits=10)
    all_acc = []
    f = open('research_verif_result.txt', 'w')
    for k, (train, valid) in enumerate(kf.split(all_data, all_label, all_pic_path_list)):
        train_data = all_data[train]
        valid_data = all_data[valid]
        train_label = all_label[train]
        valid_label = all_label[valid]
        train_path_list = all_pic_path_list[train]
        valid_path_list = all_pic_path_list[valid]
        clf = LinearSVC()
        clf.fit(train_data, train_label)
        acc = accuracy_score(valid_label, clf.predict(valid_data))
        for k in range(len(valid_path_list)):
            f.write(os.path.split(valid_path_list[k][0])[1] + '\t' +
                    os.path.split(valid_path_list[k][1])[1] + '\t' +
                    str(valid_data[k][0]) + '\t' + str(valid_label[k]) + '\n')
        all_acc.append(acc)
        print acc
    print 'mean_acc :', np.mean(all_acc)
    f.close()
    clf = LinearSVC()
    clf.fit(all_data, all_label)
    pdb.set_trace()
    cPickle.dump(clf, open(verification_model_file, 'wb'))
parser = OptionParser()
parser.add_option("-n", "--num_class", dest="num_class", help="classify label num")
parser.add_option("-m", "--model_file", dest="model_file", help="model file")
parser.add_option("-w", "--weight_file", dest="weight_file", help="weight file")
parser.add_option("-a", "--need_augment", dest="need_augment", help="need_augment")
parser.add_option("-l", "--train_valid_sample_list_file", dest="train_valid_sample_list_file",
                  help="train_valid_sample_list_file")
(options, args) = parser.parse_args()

if options.need_augment.rstrip() == 'True':
    need_augment = True
else:
    need_augment = False
print 'need_augment :', need_augment

model_file = options.model_file
weight_file = options.weight_file
nb_classes = int(options.num_class)
train_valid_sample_list_file = options.train_valid_sample_list_file
if K.image_dim_ordering() == 'th':
    pic_shape = (96, 96, 3)  # shape used by the center-loss model
else:
    # pic_shape = (3, 96, 96)
    pic_shape = (96, 96, 3)

(train_sample_list, valid_sample_list) = msgpack_numpy.load(
    open(train_valid_sample_list_file, 'rb'))
train_valid_model(train_sample_list, valid_sample_list, pic_shape, nb_classes,
                  model_file, weight_file)
model.compile(optimizer=opt, loss=[contrastive_loss])
print 'load weights'
model.load_weights(weight_file)
# train_model(person_feature_list_dic, model)
# last_acc = valid_model(valid_person_feature_list_dic, model)
# print 'first_acc :', last_acc
for epoch_index in range(nb_epoch):
    print('-' * 40)
    print('Training ', 'current epoch :', epoch_index, 'all epochs :', nb_epoch)
    train_model(train_person_feature_list_dic, model)
    # this_acc = valid_model(valid_path_list, model, pic_shape)
    # print 'this_acc :', this_acc, 'last_acc :', last_acc
    # if this_acc > last_acc:
    #     model.save_weights(weight_file, overwrite=True)
    #     print ('save_model')
    #     last_acc = this_acc


if __name__ == '__main__':
    model_file = '/data/liubo/face/vgg_face_dataset/model/facenet.model'
    weight_file = '/data/liubo/face/vgg_face_dataset/model/facenet.weight'
    model = build_model(feature_dim=4096)
    print model.summary()
    print model.layers[2].summary()
    model.save_weights(weight_file, overwrite=True)
    open(model_file, 'w').write(model.to_json())
    person_feature_list_dic = msgpack_numpy.load(
        open('/data/pictures_annotate_feature/person_feature_list_dic.p', 'rb'))
    train_valid_model(person_feature_list_dic, person_feature_list_dic, model_file, weight_file)
def train_valid_verif_model():
    all_data = []
    all_label = []
    all_pic_path_list = []
    count = 0
    path_feature_dic = msgpack.load(open('research_feature.p', 'rb'))
    not_in = 0
    not_in_pair = {}
    for line in open(pair_file):
        if count % 100 == 0:
            print count
        count += 1
        tmp = line.rstrip().split()
        if len(tmp) == 3:
            path1 = tmp[0]
            path2 = tmp[1]
            label = int(tmp[2])
            if path1 in path_feature_dic and path2 in path_feature_dic:
                try:
                    feature1 = np.asarray(path_feature_dic.get(path1))
                    feature2 = np.asarray(path_feature_dic.get(path2))
                    if len(feature1) < 100 or len(feature2) < 100:
                        print path1, path2
                        not_in += 1
                        not_in_pair[(path1, path2)] = 1
                        continue
                    feature1 = np.reshape(feature1, newshape=(1, feature1.shape[0]))
                    feature2 = np.reshape(feature2, newshape=(1, feature2.shape[0]))
                    predicts = pw.cosine_similarity(feature1, feature2)
                    all_data.append(predicts)
                    all_label.append(label)
                    all_pic_path_list.append((path1, path2))
                except:
                    traceback.print_exc()
                    # pdb.set_trace()
            else:
                traceback.print_exc()
                # pdb.set_trace()
    msgpack_numpy.dump((all_data, all_label, all_pic_path_list),
                       open(feature_pack_file, 'wb'))
    (all_data, all_label, all_pic_path_list) = msgpack_numpy.load(
        open(feature_pack_file, 'rb'))
    all_data = np.asarray(all_data)
    all_data = np.reshape(all_data, newshape=(all_data.shape[0], all_data.shape[2]))
    all_label = np.asarray(all_label)
    print all_data.shape, all_label.shape
    kf = KFold(len(all_label), n_folds=10)
    all_acc = []
    for (train, valid) in kf:
        train_data = all_data[train]
        valid_data = all_data[valid]
        train_label = all_label[train]
        valid_label = all_label[valid]
        clf = LinearSVC()
        clf.fit(train_data, train_label)
        acc = accuracy_score(valid_label, clf.predict(valid_data))
        roc_auc = roc_auc_score(valid_label, clf.predict(valid_data))
        all_acc.append(acc)
        print acc, roc_auc
    print 'mean_acc :', np.mean(all_acc)
    clf = LinearSVC()
    clf.fit(all_data, all_label)
    cPickle.dump(clf, open(verification_model_file, 'wb'))
def main(args):
    network = importlib.import_module(args.model_def, 'inference')
    subdir = datetime.strftime(datetime.now(), '%Y%m%d-%H%M%S')
    log_dir = os.path.join(os.path.expanduser(args.logs_base_dir), subdir)
    if not os.path.isdir(log_dir):  # Create the log directory if it doesn't exist
        os.makedirs(log_dir)
    model_dir = os.path.join(os.path.expanduser(args.models_base_dir), subdir)
    if not os.path.isdir(model_dir):  # Create the model directory if it doesn't exist
        os.makedirs(model_dir)

    # Store some git revision info in a text file in the log directory
    src_path, _ = os.path.split(os.path.realpath(__file__))
    facenet.store_revision_info(src_path, log_dir, ' '.join(sys.argv))

    np.random.seed(seed=args.seed)
    train_set = facenet.get_dataset(args.data_dir)
    nrof_classes = len(train_set)
    print('Model directory: %s' % model_dir)
    print('Log directory: %s' % log_dir)
    pretrained_model = None
    if args.pretrained_model:
        pretrained_model = os.path.expanduser(args.pretrained_model)
        print('Pre-trained model: %s' % pretrained_model)
    if args.lfw_dir:
        print('LFW directory: %s' % args.lfw_dir)
        # Read the file containing the pairs used for testing
        pairs = lfw.read_pairs(os.path.expanduser(args.lfw_pairs))
        # Get the paths for the corresponding images
        lfw_paths, actual_issame = lfw.get_paths(os.path.expanduser(args.lfw_dir), pairs, args.lfw_file_ext)
    if args.baihe_pack_file:
        print('load baihe dataset')
        lfw_paths, actual_issame = msgpack_numpy.load(open(args.baihe_pack_file))

    with tf.Graph().as_default():
        tf.set_random_seed(args.seed)
        # global training step; different stages of training can use different learning rates
        global_step = tf.Variable(0, trainable=False)
        # Get a list of image paths and their labels
        image_list, label_list = facenet.get_image_paths_and_labels(train_set)
        # Read data and apply label preserving distortions
        image_batch, label_batch = facenet.read_and_augment_data(
            image_list, label_list, args.image_size, args.batch_size, args.max_nrof_epochs,
            args.random_crop, args.random_flip, args.random_rotate, args.nrof_preprocess_threads)
        print('Total number of classes: %d' % nrof_classes)
        print('Total number of examples: %d' % len(image_list))
        print('Building training graph')

        # Placeholder for the learning rate
        learning_rate_placeholder = tf.placeholder(tf.float32, name='learning_rate')
        # Build the inference graph (returns the network structure)
        prelogits, _ = network.inference(image_batch, args.keep_probability,
                                         phase_train=True, weight_decay=args.weight_decay)
        # Weights are initialized with truncated normal noise, stddev 0.1:
        # tf.truncated_normal_initializer(stddev=0.1)
        logits = slim.fully_connected(
            prelogits, len(train_set), activation_fn=None,
            weights_initializer=tf.truncated_normal_initializer(stddev=0.1),
            weights_regularizer=slim.l2_regularizer(args.weight_decay),
            scope='Logits', reuse=False)

        # Add DeCov regularization loss
        if args.decov_loss_factor > 0.0:
            logits_decov_loss = facenet.decov_loss(logits) * args.decov_loss_factor
            # add decov_loss to the tf.GraphKeys.REGULARIZATION_LOSSES collection
            tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, logits_decov_loss)
        # Add center loss (the center loss is added to the collections as a regularizer)
        if args.center_loss_factor > 0.0:
            prelogits_center_loss, _ = facenet.center_loss(prelogits, label_batch,
                                                           args.center_loss_alfa, nrof_classes)
            # add the center loss to the tf.GraphKeys.REGULARIZATION_LOSSES collection
            tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES,
                                 prelogits_center_loss * args.center_loss_factor)
        # Exponentially decay the learning rate
        learning_rate = tf.train.exponential_decay(
            learning_rate_placeholder, global_step,
            args.learning_rate_decay_epochs * args.epoch_size,
            args.learning_rate_decay_factor, staircase=True)
        tf.scalar_summary('learning_rate', learning_rate)

        # Calculate the average cross entropy loss across the batch.
        # Softmax and cross entropy are computed together for efficiency.
        cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits, label_batch, name='cross_entropy_per_example')
        cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy')
        tf.add_to_collection('losses', cross_entropy_mean)

        # Calculate the total losses (collect the regularization losses)
        regularization_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        total_loss = tf.add_n([cross_entropy_mean] + regularization_losses, name='total_loss')

        # Build a Graph that trains the model with one batch of examples and updates the model parameters
        train_op = facenet.train(total_loss, global_step, args.optimizer, learning_rate,
                                 args.moving_average_decay, tf.all_variables(), args.log_histograms)

        # Evaluation
        print('Building evaluation graph')
        lfw_label_list = range(0, len(lfw_paths))
        assert (len(lfw_paths) % args.lfw_batch_size == 0), \
            "The number of images in the LFW test set need to be divisible by the lfw_batch_size"
        eval_image_batch, eval_label_batch = facenet.read_and_augment_data(
            lfw_paths, lfw_label_list, args.image_size, args.lfw_batch_size, None,
            False, False, False, args.nrof_preprocess_threads, shuffle=False)
        # Node for input images
        eval_image_batch.set_shape((None, args.image_size, args.image_size, 3))
        eval_image_batch = tf.identity(eval_image_batch, name='input')
        eval_prelogits, _ = network.inference(eval_image_batch, 1.0,
                                              phase_train=False, weight_decay=0.0, reuse=True)
        eval_embeddings = tf.nn.l2_normalize(eval_prelogits, 1, 1e-10, name='embeddings')

        # Create a saver
        saver = tf.train.Saver(tf.all_variables(), max_to_keep=10)
        # saver = tf.train.Saver(tf.global_variables(), max_to_keep=10)

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.merge_all_summaries()

        # Start running operations on the Graph.
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=args.gpu_memory_fraction)
        sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options, log_device_placement=False))
        # sess.run(tf.global_variables_initializer())
        # sess.run(tf.local_variables_initializer())
        sess.run(tf.initialize_all_variables())
        sess.run(tf.initialize_local_variables())
        summary_writer = tf.train.SummaryWriter(log_dir, sess.graph)
        # Start the queue runners; the input queues begin running and the started threads are returned.
        # Note that the queue enqueues before it dequeues: since the enqueue inputs are placeholders,
        # the dequeue threads block until sess.run(enqueue_op) inside train() feeds values into the queue;
        # only then do the downstream dequeues have anything to work on.
        tf.train.start_queue_runners(sess=sess)

        with sess.as_default():
            if pretrained_model:
                print('Restoring pretrained model: %s' % pretrained_model)
                saver.restore(sess, pretrained_model)
            # Training and validation loop
            print('Running training')
            epoch = 0
            while epoch < args.max_nrof_epochs:
                try:
                    step = sess.run(global_step, feed_dict=None)
                    epoch = step // args.epoch_size
                    # Train for one epoch
                    train(args, sess, epoch, learning_rate_placeholder, global_step, total_loss,
                          train_op, summary_op, summary_writer, regularization_losses,
                          args.learning_rate_schedule_file)
                    # Save variables and the metagraph if it doesn't exist already
                    save_variables_and_metagraph(sess, saver, summary_writer, model_dir, subdir, step)
                    # Evaluate on LFW
                    if args.lfw_dir:
                        evaluate(sess, eval_embeddings, eval_label_batch, actual_issame,
                                 args.lfw_batch_size, args.seed, args.lfw_nrof_folds,
                                 log_dir, step, summary_writer)
                    # Evaluate on baihe data
                    if args.baihe_pack_file:
                        evaluate(sess, eval_embeddings, eval_label_batch, actual_issame,
                                 args.lfw_batch_size, args.seed, args.lfw_nrof_folds,
                                 log_dir, step, summary_writer)
                except:
                    traceback.print_exc()
                    continue
    return model_dir
def find_max_min():
    # within the same person find the least similar pairs; across different people find the most similar pairs
    lfw_feature_dic = msgpack_numpy.load(open(feature_pack_file, 'rb'))
    person_list = lfw_feature_dic.keys()
    same_person_score = []
    same_person_score_pair_dic = {}  # {score: [(path1, path2), ..., (path1, path2)]}
    no_same_person_score = []
    no_same_person_score_pair_dic = {}  # {score: [(path1, path2), ..., (path1, path2)]}
    heapq.heapify(same_person_score)
    pair_threshold = 3000
    for person_index, person in enumerate(person_list):
        start = time()
        path_feature_list = lfw_feature_dic.get(person)
        # Enumerate all pairs within this person --- the smaller the score the better
        # (the least similar photos of the same person). Each round the largest score is
        # evicted and a smaller one inserted, so scores are negated on insertion and the
        # heap top holds what was originally the largest score.
        length = len(path_feature_list)
        for index_i in range(length):
            for index_j in range(index_i, length):
                feature1, path1 = path_feature_list[index_i]
                feature2, path2 = path_feature_list[index_j]
                feature1 = np.reshape(feature1, newshape=(1, feature1.shape[0]))
                feature2 = np.reshape(feature2, newshape=(1, feature2.shape[0]))
                this_score = 0 - pw.cosine_similarity(feature1, feature2)[0][0]
                if len(same_person_score) > pair_threshold:
                    top_item = same_person_score[0]
                    if this_score < top_item:  # even less similar, keep it
                        heapq.heappop(same_person_score)
                        heapq.heappush(same_person_score, this_score)
                        # drop the old pair and add the current pair
                        # (one score can map to several pairs)
                        if top_item in same_person_score_pair_dic:
                            same_person_score_pair_dic.pop(top_item)
                        pair_list = same_person_score_pair_dic.get(this_score, [])
                        pair_list.append((path1, path2))
                        same_person_score_pair_dic[this_score] = pair_list
                else:
                    heapq.heappush(same_person_score, this_score)
                    pair_list = same_person_score_pair_dic.get(this_score, [])
                    pair_list.append((path1, path2))
                    same_person_score_pair_dic[this_score] = pair_list
        # enumerate all candidate cross-person (dissimilar) pairs
        for other_person_index, other_person in enumerate(
                person_list[person_index + 1:], start=person_index + 1):
            other_path_feature_list = lfw_feature_dic.get(other_person)
            if other_person == person:
                continue
            other_length = len(other_path_feature_list)
            for index_i in range(length):
                for index_j in range(other_length):
                    feature1, path1 = path_feature_list[index_i]
                    feature2, path2 = other_path_feature_list[index_j]
                    feature1 = np.reshape(feature1, newshape=(1, feature1.shape[0]))
                    feature2 = np.reshape(feature2, newshape=(1, feature2.shape[0]))
                    this_score = pw.cosine_similarity(feature1, feature2)[0][0]
                    if len(no_same_person_score) > pair_threshold:
                        top_item = no_same_person_score[0]
                        if this_score < top_item:  # even more similar, keep it
                            heapq.heappop(no_same_person_score)
                            heapq.heappush(no_same_person_score, this_score)
                            # drop the old pair and add the current pair
                            # (one score can map to several pairs)
                            if top_item in no_same_person_score_pair_dic:
                                no_same_person_score_pair_dic.pop(top_item)
                            pair_list = no_same_person_score_pair_dic.get(this_score, [])
                            pair_list.append((path1, path2))
                            no_same_person_score_pair_dic[this_score] = pair_list
                    else:
                        heapq.heappush(no_same_person_score, this_score)
                        pair_list = no_same_person_score_pair_dic.get(this_score, [])
                        pair_list.append((path1, path2))
                        no_same_person_score_pair_dic[this_score] = pair_list
        end = time()
        print person_index, person, (end - start), length
    msgpack_numpy.dump((same_person_score_pair_dic, same_person_score,
                        no_same_person_score_pair_dic, no_same_person_score),
                       open(new_pair_pack_file, 'wb'))
def main_distance():
    # lfw_feature_dic = msgpack_numpy.load(open(feature_pack_file, 'rb'))
    lfw_feature_dic = msgpack_numpy.load(open(triplet_feature_pack_file, 'rb'))
    data = []
    label = []
    pic_path_list = []
    for line in open(pair_file):
        tmp = line.rstrip().split()
        if len(tmp) == 3:
            person = tmp[0]
            # same person: take two of this person's feature vectors
            this_person_feature_list = lfw_feature_dic.get(person, [])
            index_list = range(len(this_person_feature_list))
            np.random.shuffle(index_list)
            filter_path(this_person_feature_list, index_list)
            if len(index_list) < 2:
                continue
            feature1, path1 = this_person_feature_list[index_list[0]]
            feature2, path2 = this_person_feature_list[index_list[1]]
            feature1 = np.reshape(feature1, newshape=(1, feature1.size))
            feature2 = np.reshape(feature2, newshape=(1, feature2.size))
            predicts = pw.cosine_similarity(feature1, feature2)
            label.append(0)
            data.append(predicts)
            pic_path_list.append('\t'.join([path1, path2]))
        elif len(tmp) == 4:
            person1 = tmp[0]
            person2 = tmp[2]
            # different people: take one feature vector from each person
            this_person_feature_list1 = lfw_feature_dic.get(person1, [])
            this_person_feature_list2 = lfw_feature_dic.get(person2, [])
            index_list1 = range(len(this_person_feature_list1))
            index_list2 = range(len(this_person_feature_list2))
            np.random.shuffle(index_list1)
            np.random.shuffle(index_list2)
            filter_path(this_person_feature_list1, index_list1)
            filter_path(this_person_feature_list2, index_list2)
            if len(index_list1) < 1 or len(index_list2) < 1:
                continue
            index_list1 = np.arange(len(this_person_feature_list1))
            index_list2 = np.arange(len(this_person_feature_list2))
            np.random.shuffle(index_list1)
            np.random.shuffle(index_list2)
            feature1, path1 = this_person_feature_list1[index_list1[0]]
            feature2, path2 = this_person_feature_list2[index_list2[0]]
            feature1 = np.reshape(feature1, newshape=(1, feature1.size))
            feature2 = np.reshape(feature2, newshape=(1, feature2.size))
            predicts = pw.cosine_similarity(feature1, feature2)
            label.append(1)
            data.append(predicts)
            pic_path_list.append('\t'.join([path1, path2]))
    data = np.asarray(data)
    print data.shape
    data = np.reshape(data, newshape=(len(data), 1))
    label = np.asarray(label)
    print data.shape, label.shape
    # kf.split(...) below is the sklearn.model_selection API, so n_splits is used here.
    kf = KFold(n_splits=10)
    all_acc = []
    for k, (train, valid) in enumerate(kf.split(data, label)):
        train_data = data[train]
        valid_data = data[valid]
        train_label = label[train]
        valid_label = label[valid]
        clf = LinearSVC()
        clf.fit(train_data, train_label)
        acc = accuracy_score(valid_label, clf.predict(valid_data))
        all_acc.append(acc)
        print 'acc :', acc
    print 'mean acc :', np.mean(all_acc)
this_patience = 1


def extract_feature(model_file, weight_file):
    print 'model_file :', model_file
    print 'weight_file :', weight_file
    model = model_from_json(open(model_file, 'r').read())
    model.load_weights(weight_file)
    get_Conv_FeatureMap = K.function(
        [model.layers[0].get_input_at(False), K.learning_phase()],
        [model.layers[-2].get_output_at(False)])
    return model, get_Conv_FeatureMap


if __name__ == '__main__':
    model_file = '/data/liubo/face/vgg_face_dataset/model/originalimages.model'
    weight_file = '/data/liubo/face/vgg_face_dataset/model/originalimages.weight'
    # extract_feature(model_file, weight_file)
    # model = deep_net(pic_shape=(3, 128, 128), nb_classes=NB_CLASS)
    # model.compile('rmsprop', 'categorical_crossentropy')
    model_data, model_label = msgpack_numpy.load(
        open('/data/liubo/face/originalimages/originalimages_model.p', 'rb'))
    model_data = np.transpose(model_data, (0, 3, 1, 2))
    X_train, X_test, y_train, y_test = train_test_split(model_data, model_label, test_size=0.1)
    train_valid_model(X_train, y_train, X_test, y_test, NB_CLASS, model_file, weight_file)
def msgpack_load_text(stream):
    return msgpack.load(stream, encoding='utf-8')
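# A minimal usage sketch for msgpack_load_text; the file name below is illustrative only.
def load_text_dictionary(path='dictionary'):
    with open(path, 'rb') as stream:
        return msgpack_load_text(stream)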
def import_data(self, debug=False):
    ''' method for importing and processing input data '''
    # Importing pickled wordvectors, dictionary, inputs and labels
    with open(self.base_path + '/wordvectors', 'rb') as vectors_file:
        print("Importing wordvectors...", end=' ', flush=True)
        word_vectors = msgpack_numpy.load(vectors_file)
        print("Done")
    with open(self.base_path + '/dictionary', 'rb') as dict_file:
        print("Importing dictionary...", end=' ', flush=True)
        dictionary = msgpack.load(dict_file, raw=False)
        print("Done")
    with open('inputs', 'rb') as data_inputs_file:
        print("Importing inputs...", end=' ', flush=True)
        sentences = msgpack.load(data_inputs_file, raw=False)
        print("Done")
    with open('outputs', 'rb') as data_outputs_file:
        print("Importing labels...", end=' ', flush=True)
        outputs = msgpack.load(data_outputs_file, raw=False)
        print("Done")

    ####################################################################
    # Processing inputs
    ####################################################################
    print('Modifying input sentences...')
    # initiating progress bar
    bar = progressbar.ProgressBar(max_value=len(sentences), redirect_stdout=True, end=' ')
    # preassigning the inputs variable for faster processing
    data_inputs = np.zeros((len(sentences), self.n_steps), dtype=np.int32)
    # initiating all the inputs to the index of the zero vector (zerowordvec_idx = dictionary['zerowordvec'])
    zerowordvec_idx = dictionary['zerowordvec']
    data_inputs[:, :] = zerowordvec_idx
    # Modifying the input sentences for training.
    lengths = []
    i = 0
    no_words_not_found = 0
    for line in sentences:
        line = line.lower()           # all the sentences to lower case
        line = line.strip('\n')       # strip the trailing '\n'
        line = line.replace(',', '')  # remove ","
        line = line.rsplit(' ', -1)   # split the sentence into a list of words
        # Initializing an empty list
        h = []
        # Iterating each word in the line over the dictionary and appending the indexes to a list
        for k in range(len(line)):
            # searching the index of each word in the dictionary and saving it in "idx"
            try:
                idx = dictionary[line[k]]
            except:
                # Exporting the words not found in the dictionary (for reference)
                idx = zerowordvec_idx
                with open('words_not_found_in_dic', 'a') as f:
                    f.write(line[k] + '\n')
                no_words_not_found += 1
            # Appending the index (idx) of each word to the list h.
            h.append(idx)
        # appending the length of each line to the list lengths
        lengths.append(len(line))
        # modifying the array
        data_inputs[i, :len(h)] = h
        # bar update
        bar.update(i)
        i = i + 1
    # bar finish
    bar.finish()
    # if words are not found in the dictionary
    if no_words_not_found != 0:
        print('\nNo. of words not found in the dict = {}, pls. check words_not_found_in_dic file\n'
              .format(no_words_not_found), end='', flush=True)
    # if debug, print an input sample to check that the input pipeline is correct
    if debug:
        print('Sample input data')
        print('=========================================================')
        print('input sentences are {}'.format(sentences[0:2]))
        print('input lengths are {}'.format(lengths[0:2]))
        print('[Vector]input sentences are {}'.format(data_inputs[0:2]))
        print('=========================================================')

    ####################################################################
    # Processing labels
    ####################################################################
    print("Modifying labels...")
    # initiating progress bar
    bar = progressbar.ProgressBar(max_value=len(outputs), redirect_stdout=True, end=' ')
    # preassigning the outputs variable for faster processing
    data_outputs = np.zeros((len(outputs), len(self.available_intents)), dtype=np.int32)
    # Iterating over the outputs list and setting the corresponding one-hot vector for each output
    v = 0
    for output in outputs:
        # find intent if it exists in the available intents list
        try:
            idx_found = self.available_intents.index(output)
        except ValueError:
            raise Exception('Could not find this output = {} in the available list of intents'
                            .format(output))
        # modifying the output array
        data_outputs[v, idx_found] = 1
        # bar update
        bar.update(v)
        v = v + 1
    # bar finish
    bar.finish()
    # debug printing
    if debug:
        print('Sample output data')
        print('=========================================================')
        print('output labels are {}'.format(outputs[0:2]))
        print('[Vector]output labels are {}'.format(data_outputs[0:2]))
        print('=========================================================')
    return word_vectors, data_inputs, data_outputs, lengths
import sys
import msgpack_numpy
from sklearn.svm import LinearSVC
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sklearn.cross_validation import KFold
from sklearn.metrics import accuracy_score, roc_auc_score
import pdb

reload(sys)
sys.setdefaultencoding("utf-8")
# fileConfig('logger_config.ini')
# logger_error = logging.getLogger('errorhandler')


if __name__ == '__main__':
    (paths, emb_array, actual_issame) = msgpack_numpy.load(open('lfw_feature.p', 'rb'))
    data = []
    pair_paths = []
    for index in range(len(actual_issame)):
        data.append(cosine_similarity(emb_array[2*index:2*index+1],
                                      emb_array[2*index+1:2*index+2])[0][0])
        pair_paths.append(str(paths[2*index]) + '\t' + str(paths[2*index+1]))
    data = np.reshape(np.array(data), (len(data), 1))
    label = np.reshape(np.array(actual_issame), (len(actual_issame), 1))
    pair_paths = np.array(pair_paths)
    kf = KFold(len(label), n_folds=10)
    all_acc = []
    f = open('error.txt', 'w')
    for (train, valid) in kf:
        train_data = data[train]
        valid_data = data[valid]
# read as txt file
with open(common_words, 'r') as common_words_file:
    content = common_words_file.readlines()
common_words_l = [x.strip() for x in content]
print("read {} words from common words file".format(len(common_words_l)))

print("loading serialized big_dictionary")
with open(sys.argv[1] + '/big_dictionary', 'rb') as big_dictionary_file:
    big_dictionary = msgpack.load(big_dictionary_file, raw=False)

print("loading serialized big_wordvectors")
with open(sys.argv[1] + '/big_wordvectors', 'rb') as big_wordvectors_file:
    big_wordvectors = msgpack_numpy.load(big_wordvectors_file)

print("Finding most common words in big_dictionary and generating reduced size dictionary")
print("--------")
i = 0
dictionary = {}
wordvectors = []
number_of_loops = len(common_words_l)
for common_word in common_words_l:
    # get index
    try:
        index = big_dictionary[common_word]
        # from the index we get the corresponding vector
            break
    print('Testing...')
    Y_predict_batch = model.predict(valid_data, batch_size=batch_size, verbose=1)
    test_acc = accuracy_score(np.argmax(valid_label, axis=1), np.argmax(Y_predict_batch, axis=1))
    Y_train_predict_batch = model.predict(train_data, batch_size=batch_size, verbose=1)
    train_acc = accuracy_score(np.argmax(train_label, axis=1), np.argmax(Y_train_predict_batch, axis=1))
    print ('train_acc :', train_acc, 'test acc', test_acc)
    if last_crps < test_acc:
        this_patience = 0
        model.save_weights(weight_file, overwrite=True)
        print ('save_model')
        last_crps = test_acc
    else:
        if this_patience >= patience:
            break
        else:
            this_patience = 1


if __name__ == '__main__':
    model = deepface(pic_shape=(512, 7, 7), class_num=168)
    model_file = '/data/liubo/face/vgg_face_dataset/model/deepface_test.model'
    weight_file = '/data/liubo/face/vgg_face_dataset/model/deepface_test.weight'
    data, label = msgpack_numpy.load(
        open('/home/liubo-it/FaceRecognization/FineTune/v2/hanlin.p', 'rb'))
    data = np.asarray(data)
    label = np.asarray(label)
    train_data, valid_data, train_label, valid_label = train_test_split(data, label, test_size=0.2)
    print train_data.shape, valid_data.shape
    train_valid(train_data, valid_data, train_label, valid_label, model_file, weight_file)
    train_x, valid_x, train_y, valid_y = train_test_split(x, y, test_size=0.1)
    clf = LinearSVC()
    print len(x), len(y)
    clf.fit(train_x, train_y)
    acc = accuracy_score(valid_y, clf.predict(valid_x))
    print acc
    clf = DecisionTreeClassifier()
    clf.fit(train_x, train_y)
    acc = accuracy_score(valid_y, clf.predict(valid_x))
    print acc


if __name__ == '__main__':
    # main_feature()
    # main()
    # cal_acc('dist.p')
    # main_max_min()
    # cal_acc('dist_max_min.p')
    (data, label) = msgpack_numpy.load(open('lfw_data_label.p', 'rb'))
    data = np.asarray(data)
    label = np.asarray(label)
    train_x, valid_x, train_label, valid_label = train_test_split(data, label, test_size=0.1)
    print train_x.shape, valid_x.shape, train_label.shape, valid_label.shape
    clf = RandomForestClassifier(n_estimators=1000, n_jobs=15)
    clf.fit(train_x, train_label)
    acc = accuracy_score(valid_label, clf.predict(valid_x))
    train_acc = accuracy_score(train_label, clf.predict(train_x))
    print acc, train_acc
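# A single train/valid split gives a noisy accuracy estimate; a minimal k-fold sketch
# for the same data and classifier (illustrative only, using the same old
# sklearn.cross_validation module imported elsewhere in this repo):

import numpy as np
from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import RandomForestClassifier


def cv_accuracy(data, label, n_folds=5):
    clf = RandomForestClassifier(n_estimators=1000, n_jobs=15)
    scores = cross_val_score(clf, data, label, cv=n_folds)
    # mean and spread over the folds give a more stable picture than one split
    return np.mean(scores), np.std(scores)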
def main_distance():
    lfw_feature_dic = msgpack_numpy.load(open(feature_pack_file, 'rb'))
    data = []
    label = []
    pic_path_list = []
    for line in open(pair_file):
        tmp = line.rstrip().split()
        if len(tmp) == 3:
            person = tmp[0]
            # take the two feature vectors of this person
            index1 = int(tmp[1])
            index2 = int(tmp[2])
            this_person_feature_dic = lfw_feature_dic.get(person, {})
            if index1 in this_person_feature_dic and index2 in this_person_feature_dic:
                feature1, path1 = this_person_feature_dic[index1]
                feature2, path2 = this_person_feature_dic[index2]
                predicts = pw.cosine_similarity(feature1, feature2)
                label.append(0)
                data.append(predicts)
                pic_path_list.append('\t'.join([path1, path2]))
        elif len(tmp) == 4:
            person1 = tmp[0]
            index1 = int(tmp[1])
            person2 = tmp[2]
            index2 = int(tmp[3])
            # take one feature vector from each of the two people
            this_person_feature_dic1 = lfw_feature_dic.get(person1, {})
            this_person_feature_dic2 = lfw_feature_dic.get(person2, {})
            if index1 in this_person_feature_dic1 and index2 in this_person_feature_dic2:
                feature1, path1 = this_person_feature_dic1[index1]
                feature2, path2 = this_person_feature_dic2[index2]
                predicts = pw.cosine_similarity(feature1, feature2)
                label.append(1)
                data.append(predicts)
                pic_path_list.append('\t'.join([path1, path2]))
    data = np.asarray(data)
    # data = np.reshape(data, newshape=(data.shape[0], data.shape[-1]))
    data = np.reshape(data, newshape=(data.shape[0], 1))
    label = np.asarray(label)
    pic_path_list = np.asarray(pic_path_list)
    kf = KFold(len(label), n_folds=10)
    all_acc = []
    f = open('error.txt', 'w')
    for (train, valid) in kf:
        train_data = data[train]
        valid_data = data[valid]
        train_label = label[train]
        valid_label = label[valid]
        train_path = pic_path_list[train]
        valid_path = pic_path_list[valid]
        clf = LinearSVC()
        clf.fit(train_data, train_label)
        acc = accuracy_score(valid_label, clf.predict(valid_data))
        roc_auc = roc_auc_score(valid_label, clf.predict(valid_data))
        for index in range(len(valid_data)):
            if valid_label[index] != clf.predict(np.reshape(valid_data[index], (1, 1))):
                f.write(str(index) + '\t' + valid_path[index] + '\n')
        all_acc.append(acc)
        print acc, roc_auc
    f.close()
    all_acc.sort(reverse=True)
    print 'mean_acc :', np.mean(all_acc[:])
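# With a single cosine-similarity feature, the LinearSVC above effectively learns a
# similarity threshold: it predicts clf.classes_[1] whenever w * sim + b > 0. A minimal
# sketch for reading that threshold back out of a fitted classifier (illustrative helper,
# not part of the original code):


def svc_threshold(clf):
    # decision function is clf.coef_[0][0] * sim + clf.intercept_[0]; the boundary is where it equals 0
    w = clf.coef_[0][0]
    b = clf.intercept_[0]
    return -b / w

# Usage: after clf.fit(train_data, train_label), svc_threshold(clf) is the cosine
# similarity at which the predicted pair label flips.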
if __name__ == '__main__':
    # folder = '/data/liubo/face/self'
    # person_list = os.listdir(folder)
    # all_pic_path = []
    # all_person = []
    # for person in person_list:
    #     if person == 'unknown' or person.startswith('new_person'):
    #         continue
    #     person_path = os.path.join(folder, person)
    #     pic_list = os.listdir(person_path)
    #     for pic in pic_list:
    #         pic_path = os.path.join(person_path, pic)
    #         all_pic_path.append(pic_path)
    #         all_person.append(person)
    # all_score, all_label = cal_pic_distance(all_pic_path, all_person)
    # msgpack_numpy.dump((all_score, all_label), open('all_score_label.p', 'wb'))

    # load the cached pairwise scores and labels produced by cal_pic_distance
    all_score, all_label = msgpack_numpy.load(open('all_score_label.p', 'rb'))
    count = Counter(all_label)
    print count
    all_score = np.reshape(np.asarray(all_score), (len(all_score), 1))
    all_label = np.asarray(all_label)
    gnb = GaussianNB()
    train_data, test_data, train_label, test_label = train_test_split(all_score, all_label)
    gnb.fit(train_data, train_label)
    gnb.predict_proba(test_data)
    print accuracy_score(test_label, gnb.predict(test_data))
    cPickle.dump(gnb, open('/data/liubo/face/vgg_face_dataset/model/dist_prob.p', 'wb'))
    pdb.set_trace()
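# A minimal sketch of using the serialized GaussianNB model later to turn a raw pair
# score into class probabilities (the score argument is a hypothetical example; the
# model path is the one written above):

import cPickle
import numpy as np


def score_to_prob(score, model_path='/data/liubo/face/vgg_face_dataset/model/dist_prob.p'):
    gnb = cPickle.load(open(model_path, 'rb'))
    # predict_proba expects a 2-D array (one row, one feature) and returns one probability per class in gnb.classes_
    return gnb.predict_proba(np.reshape(np.asarray([score], dtype=float), (1, 1)))[0]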
        this_acc = valid_model(valid_path_list, model, nb_classes, pic_shape)
        train_acc = valid_model(train_path_list, model, nb_classes, pic_shape)
        print 'this_acc :', this_acc, 'last_acc :', last_acc, 'train_acc :', train_acc
        if this_acc > last_acc:
            model.save_weights(weight_file, overwrite=True)
            print('save_model')
            last_acc = this_acc


if __name__ == '__main__':
    parser = OptionParser()
    parser.add_option("-n", "--num_class", dest="num_class", help="classify label num")
    parser.add_option("-m", "--model_file", dest="model_file", help="model file")
    parser.add_option("-w", "--weight_file", dest="weight_file", help="weight file")
    parser.add_option("-l", "--train_valid_sample_list_file", dest="train_valid_sample_list_file",
                      help="train_valid_sample_list_file")
    (options, args) = parser.parse_args()
    model_file = options.model_file
    weight_file = options.weight_file
    nb_classes = int(options.num_class)
    train_valid_sample_list_file = options.train_valid_sample_list_file
    pic_shape = (96, 96, 3)  # input shape for inception_v4
    (train_sample_list, valid_sample_list) = msgpack_numpy.load(open(train_valid_sample_list_file, 'rb'))
    print 'len(train_sample_list) :', len(train_sample_list), 'len(valid_sample_list) :', len(valid_sample_list)
    train_valid_model(train_sample_list, valid_sample_list, pic_shape, nb_classes, model_file, weight_file)
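# Example invocation of the options defined above (the script name and file paths are
# placeholders; only the flags come from the parser):
#
#   python train_inception.py -n 168 \
#       -m /data/liubo/face/vgg_face_dataset/model/inception_v4.model \
#       -w /data/liubo/face/vgg_face_dataset/model/inception_v4.weight \
#       -l train_valid_sample_list.p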
            last_crps = test_acc
        else:
            if this_patience >= patience:
                break
            else:
                this_patience += 1  # one more epoch without improvement


def extract_feature(model_file, weight_file):
    print 'model_file :', model_file
    print 'weight_file :', weight_file
    model = model_from_json(open(model_file, 'r').read())
    model.load_weights(weight_file)
    # function that maps an input batch to the penultimate layer's activations
    get_Conv_FeatureMap = K.function([model.layers[0].get_input_at(False), K.learning_phase()],
                                     [model.layers[-2].get_output_at(False)])
    return model, get_Conv_FeatureMap


if __name__ == '__main__':
    model_file = '/data/liubo/face/vgg_face_dataset/model/originalimages.model'
    weight_file = '/data/liubo/face/vgg_face_dataset/model/originalimages.weight'
    # extract_feature(model_file, weight_file)
    # model = deep_net(pic_shape=(3, 128, 128), nb_classes=NB_CLASS)
    # model.compile('rmsprop', 'categorical_crossentropy')
    model_data, model_label = msgpack_numpy.load(open('/data/liubo/face/originalimages/originalimages_model.p', 'rb'))
    model_data = np.transpose(model_data, (0, 3, 1, 2))  # NHWC -> NCHW
    X_train, X_test, y_train, y_test = train_test_split(model_data, model_label, test_size=0.1)
    train_valid_model(X_train, y_train, X_test, y_test, NB_CLASS, model_file, weight_file)
import sys
import pdb
import numpy as np
import cPickle
import msgpack_numpy
import scipy.cluster.hierarchy as sch

reload(sys)
sys.setdefaultencoding("utf-8")
# fileConfig('logger_config.ini')
# logger_error = logging.getLogger('errorhandler')


if __name__ == '__main__':
    threshold = float(sys.argv[1])
    method = sys.argv[2]
    day = sys.argv[3]
    (query_list, all_dist) = msgpack_numpy.load(
        open('/data/liubo/hotspot/query_search/all_query_dist_beijing_{}.p'.format(day), 'rb'))
    query_dist_dic = cPickle.load(
        open('/data/liubo/hotspot/query_search/beijing_query_dist_dic_{}.p'.format(day), 'rb'))
    # hierarchical clustering on the pairwise query distances
    linkage = sch.linkage(all_dist, method=method)
    cluster_result = sch.fcluster(linkage, t=threshold)
    cluster_result_dic = {}
    f_result = open(
        '/data/liubo/hotspot/query_search/beijing_{}_cluster_result_{}_{}.txt'.format(day, threshold, method), 'w')
    for index in range(len(cluster_result)):
        this_cluster_id = cluster_result[index]
        this_cluster_query_list = cluster_result_dic.get(this_cluster_id, [])
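# `sch` above is scipy.cluster.hierarchy. A minimal self-contained sketch of the same
# linkage/fcluster pattern on a toy condensed distance matrix (illustrative; the
# criterion argument is an assumption, since fcluster defaults to 'inconsistent'):
#
#   import numpy as np
#   import scipy.cluster.hierarchy as sch
#   from scipy.spatial.distance import pdist
#
#   points = np.random.rand(10, 2)
#   condensed = pdist(points)                       # condensed distances, the form sch.linkage expects
#   linkage = sch.linkage(condensed, method='average')
#   clusters = sch.fcluster(linkage, t=0.5, criterion='distance')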
def load_data(data_path):
    feature, label = msgpack_numpy.load(open(data_path, 'rb'))
    return feature, label
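# A minimal usage sketch for load_data (the .p path below is a hypothetical example,
# assuming the pack stores (feature, label) the way the other scripts in this repo do):

import numpy as np
from sklearn.cross_validation import train_test_split

feature, label = load_data('/data/liubo/face/feature_label.p')  # hypothetical path
feature = np.asarray(feature)
label = np.asarray(label)
train_x, valid_x, train_y, valid_y = train_test_split(feature, label, test_size=0.1)
print train_x.shape, valid_x.shape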