dataset = dataset.prefetch(cf.batch_size * 8) iterator = dataset.make_one_shot_iterator() images, labels = iterator.get_next() else: num_examples = None global_step = None images_ph = tf.placeholder(tf.float32, [ None, cf.train_image_size, cf.train_image_size, cf.train_image_channel ], name="inputs") labels_ph = tf.placeholder(tf.int32, [None], name="labels") if cf.mode == 'train': if cf.use_pair_sampling: loss_op, end_points, train_op, embeddings_op = model_fn.build_model( images_ph, labels_ph, cf, True, num_examples, global_step) else: loss_op, end_points, train_op, embeddings_op = model_fn.build_model( images, labels, cf, True, num_examples, global_step) else: embeddings_op = model_fn.build_model(images_ph, labels_ph, cf, is_training=False) if cf.mode == 'train': if cf.fine_tuning and cf.model_name in pretrained_map: import urllib.request import tarfile pretrained_url = pretrained_map[cf.model_name]
# Fail fast when the requested image is missing. The message previously
# formatted `args.iamge` (typo), which raised AttributeError instead of the
# intended AssertionError whenever the file was absent.
assert os.path.isfile(args.image), "Image {} not found".format(args.image)

# Build the input pipeline for one image: read -> decode -> float -> resize.
image_string = tf.read_file(args.image)
image_decoded = tf.image.decode_jpeg(image_string, channels=3)
image = tf.image.convert_image_dtype(image_decoded, tf.float32)
resized_image = tf.image.resize_images(image, [params.image_size, params.image_size])
# NOTE(review): the upper clip bound is 0.1, which looks like a typo for 1.0
# (convert_image_dtype is documented to scale integer images into [0, 1]).
# Left unchanged because the matching training-time preprocessing is not
# visible here -- confirm before changing.
image = tf.clip_by_value(resized_image, 0.0, 0.1)
# Add a leading axis so a single image is fed as a batch of one.
image = tf.expand_dims(image, 0)
inputs = {"images": image}
print(image.get_shape().as_list())

# Building model
with tf.variable_scope('model'):
    logits = build_model(False, inputs, params)  # logits shape: (1, 6)
    # Index of the largest logit along axis 1.
    predictions = tf.argmax(logits, 1)
    probs = tf.nn.softmax(logits=logits)

# list all the variables of graphs
# for var in tf.all_variables():
#     print(var)
# from tensorflow.contrib.framework.python.framework import checkpoint_utils
# var_list = checkpoint_utils.list_variables(os.path.join(args.model_dir, args.restore_from))
# for var in var_list:
#     print(var)

# Initialize tf.Saver
def infer(queries, db):
    """Rank every reference image in `db` for each query image.

    Both path lists are decoded and preprocessed into in-memory batches, then
    one model per entry of `cf.nsml_eval_checkpoints` embeds them. The
    similarity matrices (dot products of the `l2_normalize`d embeddings) are
    averaged over the checkpoints and used to sort the references per query.

    Relies on names from the enclosing scope: `cf`, `sess`, `nsml`,
    `model_fn`, `preprocessing_factory` and `l2_normalize`.

    Args:
        queries: sequence of query image paths.
        db: sequence of reference image paths.

    Returns:
        A list of `(index, (query_name, ranked_reference_names))` tuples,
        where names are the path basenames cut at their first dot.
    """

    def _stem(path):
        # Basename up to (not including) its first '.'.
        return path.split('/')[-1].split('.')[0]

    def _parse_function(filename):
        # Decode one JPEG and bring it to the configured input size.
        decoded = tf.image.decode_jpeg(tf.read_file(filename), channels=3)
        eval_image_size = cf.train_image_size
        if cf.preprocessing_name is not None:
            preprocess = preprocessing_factory.get_preprocessing(
                cf.preprocessing_name, is_training=False)
            return preprocess(decoded, eval_image_size, eval_image_size)
        # Fallback: pad-resize, then map pixel values to (x / 255 - 0.5) * 2.
        as_batch = tf.expand_dims(tf.cast(decoded, tf.float32), 0)
        padded = tf.image.resize_image_with_pad(
            as_batch, cf.train_image_size, cf.train_image_size)
        scaled = tf.divide(tf.squeeze(padded, [0]), 255.0)
        return tf.multiply(tf.subtract(scaled, 0.5), 2.0)

    def _load_all(paths):
        # Materialise every image of `paths` as one batch via `sess`.
        pipeline = tf.data.Dataset.from_tensor_slices(paths)
        pipeline = pipeline.map(_parse_function)
        pipeline = pipeline.batch(len(paths))
        return sess.run(pipeline.make_one_shot_iterator().get_next())

    query_imgs = _load_all(queries)
    db_imgs = _load_all(db)

    # Comma-separated, index-aligned per-model settings.
    checkpoints = cf.nsml_eval_checkpoints.split(",")
    model_names = cf.nsml_eval_models.split(",")
    eval_sessions = cf.nsml_eval_sessions.split(",")
    embedding_nums = [int(v) for v in cf.nsml_eval_embeddings.split(",")]

    sim_matrix = None
    for idx, cp in enumerate(checkpoints):
        # Each ensemble member is built in a fresh default graph.
        tf.reset_default_graph()
        images_ph = tf.placeholder(
            tf.float32,
            [None, cf.train_image_size, cf.train_image_size, cf.train_image_channel],
            name="inputs")
        model_cf = {"model_name": model_names[idx],
                    "embedding_size": embedding_nums[idx]}
        embeddings_op = model_fn.build_model(images_ph, None, model_cf, is_training=False)

        tf_config = tf.ConfigProto()
        tf_config.gpu_options.allow_growth = True
        global_sess = tf.Session(config=tf_config)
        global_sess.run(tf.global_variables_initializer())
        # NOTE(review): presumably the loader bound to nsml restores weights
        # into this session -- that binding is not visible here; confirm.
        nsml.load(checkpoint=cp, session=eval_sessions[idx])

        query_vecs = global_sess.run(embeddings_op, feed_dict={images_ph: query_imgs})
        reference_vecs = global_sess.run(embeddings_op, feed_dict={images_ph: db_imgs})
        print('test data load queries {} query_img {} references {} reference_img {}'.
              format(len(queries), len(query_imgs), len(db), len(db_imgs)))
        print('inference start')

        # Dot product of the normalised embeddings, accumulated across models.
        scores = np.dot(l2_normalize(query_vecs), l2_normalize(reference_vecs).T)
        if sim_matrix is None:
            sim_matrix = scores
        else:
            sim_matrix += scores

    sim_matrix /= len(checkpoints)

    retrieval_results = {}
    for row, query_path in enumerate(queries):
        scored = zip(db, sim_matrix[row].tolist())
        best_first = sorted(scored, key=lambda pair: pair[1], reverse=True)
        retrieval_results[_stem(query_path)] = [_stem(path) for path, _ in best_first]

    print('done')
    return list(enumerate(retrieval_results.items()))
def _select_pair_indices(labels, batch_size):
    """Choose up to `batch_size` buffer positions, preferring repeated labels.

    Positions whose label occurs more than once in `labels` are collected
    first. If there are more than `batch_size` of them the list is truncated;
    if there are fewer, it is topped up with positions of labels that occur
    exactly once.

    Args:
        labels: iterable of hashable label values, one per buffered example.
        batch_size: maximum number of positions to return.

    Returns:
        A list of integer positions into `labels` (at most `batch_size`).
    """
    pair_indices = set()
    single_index_map = {}   # label -> position, only while the label is unique
    label_buffer = {}       # label -> position of its first occurrence
    for i, label in enumerate(labels):
        if label in label_buffer:
            pair_indices.add(i)
            pair_indices.add(label_buffer[label])
            if label in single_index_map:
                del single_index_map[label]
        else:
            label_buffer[label] = i
            single_index_map[label] = i

    pair_indices = list(pair_indices)
    if len(pair_indices) > batch_size:
        pair_indices = pair_indices[:batch_size]
    elif len(pair_indices) < batch_size:
        pair_indices += list(
            single_index_map.values())[:batch_size - len(pair_indices)]
    return pair_indices


def main(cf, hyper_param_txt, hostname):
    """Train the model with pair-biased batch sampling, then notify/evaluate.

    The function writes the hyper-parameters to a timestamped text file in
    `cf.save_dir`, builds the input pipelines and the model graph, optionally
    restores a checkpoint, and runs the training loop until the configured
    step or epoch limit is reached. Afterwards it saves a final checkpoint,
    optionally sends a Slack message, and either launches the evaluation
    script or (if configured) shuts the machine down.

    Args:
        cf: configuration object; this function reads many attributes from it
            and also mutates `cf.embedding_size` and
            `cf.max_number_of_epochs` in some configurations.
        hyper_param_txt: text written to the parameters file and appended to
            the Slack notification.
        hostname: label placed in the header of the Slack notification.

    Raises:
        AssertionError: if no `*_train*tfrecord` file is found in
            `cf.data_dir`.
    """
    tf.logging.set_verbosity(tf.logging.INFO)
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    # NOTE(review): this reads the module-level `F` while the rest of the
    # function (including the eval command) uses `cf.gpu_no` -- confirm the
    # two are the same object.
    os.environ["CUDA_VISIBLE_DEVICES"] = F.gpu_no
    print("CUDA Visible device", device_lib.list_local_devices())

    start_time = datetime.now().strftime('%Y%m%d%H%M%S')
    start_time_str = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    if not os.path.isdir(cf.save_dir):
        os.makedirs(cf.save_dir)
    # Written and closed immediately so the handle cannot leak if graph
    # construction below raises; the restore line is appended later.
    params_path = os.path.join(cf.save_dir,
                               "train_parameters_%s.txt" % start_time)
    with open(params_path, mode="w") as f:
        f.write(hyper_param_txt)

    # NOTE(review): the order of graph-building calls below is kept as in the
    # original; with a graph-level seed, per-op seeds may depend on creation
    # order -- verify before reordering.
    tf.set_random_seed(123)
    files = glob.glob(os.path.join(cf.data_dir, "*_train*tfrecord"))
    files.sort()
    assert len(files) > 0, "no *_train*tfrecord files in %s" % cf.data_dir
    num_examples = util.count_records(files)
    global_step = tf.Variable(0, trainable=False)

    image_preprocessing_fn = None
    if cf.preprocessing_name:
        image_preprocessing_fn = preprocessing_factory.get_preprocessing(
            cf.preprocessing_name, is_training=True)

    def sampling_pre_process(example_proto):
        # Parse one serialized example but keep the image as an encoded
        # string; decoding is deferred to `train_pre_process`.
        features = {
            "image/encoded": tf.FixedLenFeature((), tf.string, default_value=""),
            "image/class/label": tf.FixedLenFeature((), tf.int64, default_value=0),
            'image/height': tf.FixedLenFeature((), tf.int64, default_value=0),
            'image/width': tf.FixedLenFeature((), tf.int64, default_value=0)
        }
        if cf.use_attr:
            features["image/attr"] = tf.VarLenFeature(dtype=tf.int64)
        parsed_features = tf.parse_single_example(example_proto, features)
        image = parsed_features["image/encoded"]
        label = parsed_features["image/class/label"]
        if cf.use_attr:
            return image, label, parsed_features["image/attr"]
        return image, label

    def train_pre_process(img_string):
        # Decode one encoded image and bring it to the training input size.
        image = tf.image.decode_jpeg(img_string, cf.train_image_channel)
        if image_preprocessing_fn is not None:
            image = image_preprocessing_fn(image, cf.train_image_size,
                                           cf.train_image_size)
        else:
            # Fallback: pad-resize, then map pixels to (x / 255 - 0.5) * 2.
            image = tf.cast(image, tf.float32)
            image = tf.expand_dims(image, 0)
            image = tf.image.resize_image_with_pad(image, cf.train_image_size,
                                                   cf.train_image_size)
            image = tf.squeeze(image, [0])
            image = tf.divide(image, 255.0)
            image = tf.subtract(image, 0.5)
            image = tf.multiply(image, 2.0)
        return image

    # Second-stage pipeline: decodes only the encoded images that were
    # selected for the current batch (fed through `string_img_pl`).
    string_img_pl = tf.placeholder(tf.string, (None))
    pair_dataset = tf.data.Dataset.from_tensor_slices(string_img_pl)
    pair_dataset = pair_dataset.map(
        train_pre_process, num_parallel_calls=cf.num_preprocessing_threads)
    pair_dataset = pair_dataset.batch(cf.batch_size)
    pair_dataset = pair_dataset.prefetch(cf.batch_size)
    pair_iterator = pair_dataset.make_initializable_iterator()
    pair_images = pair_iterator.get_next()

    steps_each_epoch = int(num_examples / cf.batch_size)
    if num_examples % cf.batch_size > 0:
        steps_each_epoch += 1

    # First-stage pipeline: yields buffers of `cf.sampling_buffer_size`
    # (encoded image, label[, attr]) records from which batches are sampled.
    dataset = tf.data.TFRecordDataset(files)
    dataset = dataset.map(sampling_pre_process,
                          num_parallel_calls=cf.num_preprocessing_threads)
    dataset = dataset.shuffle(cf.shuffle_buffer_size)
    dataset = dataset.repeat()
    dataset = dataset.batch(cf.sampling_buffer_size)
    dataset = dataset.prefetch(cf.sampling_buffer_size)
    iterator = dataset.make_one_shot_iterator()
    if cf.use_attr:
        images, labels, attrs = iterator.get_next()
    else:
        images, labels = iterator.get_next()

    images_ph = tf.placeholder(tf.float32, [
        cf.batch_size, cf.train_image_size, cf.train_image_size,
        cf.train_image_channel
    ], name="inputs")
    labels_ph = tf.placeholder(tf.int32, [cf.batch_size], name="labels")
    if cf.use_attr:
        attrs_ph = tf.placeholder(tf.float32, [cf.batch_size, cf.attr_dim],
                                  name="attrs")
        if not cf.use_attr_net:
            cf.embedding_size = cf.attr_dim
    else:
        attrs_ph = None

    loss_op, end_points, train_op = model_fn.build_model(
        images_ph, labels_ph, cf, attrs_ph, True, cf.use_attr_net,
        cf.num_hidden_attr_net, num_examples, global_step,
        use_old_model=cf.use_old_model)

    summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))
    # Add summaries for end_points.
    for end_point in end_points:
        x = end_points[end_point]
        summaries.add(tf.summary.histogram('activations/' + end_point, x))
        summaries.add(
            tf.summary.scalar('sparsity/' + end_point, tf.nn.zero_fraction(x)))
    # Add summaries for losses.
    for loss_tensor in tf.get_collection(tf.GraphKeys.LOSSES):
        summaries.add(
            tf.summary.scalar('losses/%s' % loss_tensor.op.name, loss_tensor))
    # Add summaries for variables.
    for variable in slim.get_model_variables():
        summaries.add(tf.summary.histogram(variable.op.name, variable))
    summary_op = tf.summary.merge(list(summaries), name='summary_op')

    if cf.quantize_delay >= 0:
        tf.contrib.quantize.create_training_graph(
            quant_delay=cf.quantize_delay)

    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    sess = tf.Session(config=tf_config)
    sess.run(tf.global_variables_initializer())
    summary_writer = tf.summary.FileWriter(cf.save_dir, sess.graph)

    epoch = 1
    steps = 1
    latest_epoch = 0

    # Resolve the checkpoint to restore once: the newest checkpoint when
    # `cf.checkpoint_path` is a directory, the path itself when it is a file.
    restore_path = None
    restored_from_dir = False
    if cf.checkpoint_path is not None:
        if os.path.isdir(cf.checkpoint_path):
            restore_path = tf.train.latest_checkpoint(cf.checkpoint_path)
            restored_from_dir = restore_path is not None
        elif os.path.isfile(cf.checkpoint_path):
            restore_path = cf.checkpoint_path

    if restore_path is not None:
        exclusions = []
        if cf.checkpoint_exclude_scopes:
            exclusions = [
                scope.strip()
                for scope in cf.checkpoint_exclude_scopes.split(',')
            ]
        # Restore every model variable whose name matches no excluded prefix.
        variables_to_restore = [
            var for var in slim.get_model_variables()
            if not any(var.op.name.startswith(exclusion)
                       for exclusion in exclusions)
        ]
        saver_for_restore = tf.train.Saver(var_list=variables_to_restore,
                                           max_to_keep=cf.keep_checkpoint_max)
        saver_for_restore.restore(sess, restore_path)
        if restored_from_dir:
            # Continue counting from the number embedded in the checkpoint
            # file name ("<prefix>-<number>").
            latest_epoch = int(os.path.basename(restore_path).split("-")[1])
            epoch = latest_epoch + 1
            cf.max_number_of_epochs += latest_epoch
        # Record the path that was actually restored (previously this logged
        # None when an explicit checkpoint file was given).
        with open(params_path, mode="a") as f:
            f.write("%s:%s\n" % ("restore_checkpoint", restore_path))

    saver = tf.train.Saver(tf.global_variables(),
                           max_to_keep=cf.keep_checkpoint_max)

    num_trained_images = 0
    last_saved_epoch = None
    last_saved_step = None
    start_avg_loss_steps = 10
    start_total_loss = 0.
    start_loss_count = 0  # how many losses were actually accumulated
    # Pre-bound so the code after the loop is safe even if no step completes.
    loss = None
    feed_dict = None
    while True:
        try:
            start = time.time()
            if cf.use_attr:
                tmp_images, tmp_labels, tmp_attrs = sess.run(
                    [images, labels, attrs])
                tmp_attrs = np.reshape(tmp_attrs.values,
                                       [cf.sampling_buffer_size, cf.attr_dim])
                tmp_attrs = tmp_attrs.astype(np.float64)
            else:
                tmp_images, tmp_labels = sess.run([images, labels])

            pair_indices = _select_pair_indices(tmp_labels, cf.batch_size)

            # Decode/preprocess only the selected encoded images.
            batch_images = tmp_images[pair_indices]
            sess.run(pair_iterator.initializer,
                     feed_dict={string_img_pl: batch_images})
            batch_images = sess.run(pair_images)
            batch_labels = tmp_labels[pair_indices]
            if cf.use_attr:
                batch_attrs = tmp_attrs[pair_indices]
            sampling_time = time.time() - start
            # Drop references to the sampling buffer before the train step.
            tmp_images = None
            tmp_labels = None

            start = time.time()
            feed_dict = {images_ph: batch_images, labels_ph: batch_labels}
            if cf.use_attr:
                feed_dict[attrs_ph] = batch_attrs
            if steps % cf.save_summaries_steps == 0:
                loss, _, summary = sess.run([loss_op, train_op, summary_op],
                                            feed_dict=feed_dict)
                summary_writer.add_summary(summary, steps)
            else:
                loss, _ = sess.run([loss_op, train_op], feed_dict=feed_dict)
            if steps <= start_avg_loss_steps:
                start_total_loss += loss
                start_loss_count += 1
            train_time = time.time() - start

            if steps % cf.log_every_n_steps == 0:
                now = datetime.now().strftime('%Y/%m/%d %H:%M:%S')
                print(
                    "[%s: %d epoch(%d/%d), %d steps] sampling time: %f, train time: %f, loss: %f"
                    % (now, epoch, steps % steps_each_epoch, steps_each_epoch,
                       steps, sampling_time, train_time, loss))
            num_trained_images += cf.batch_size

            if cf.use_save_steps:
                if steps % cf.save_interval_steps == 0:
                    saver.save(sess, cf.save_dir + "/model.ckpt", steps)
                    last_saved_step = steps
            if cf.max_number_of_steps is not None and steps >= cf.max_number_of_steps:
                break
            steps += 1

            # An "epoch" ends once at least `num_examples` images were fed.
            if num_trained_images >= num_examples:
                if not cf.use_save_steps and cf.save_interval_epochs >= 1 and (
                        epoch - latest_epoch) % cf.save_interval_epochs == 0:
                    saver.save(sess, cf.save_dir + "/model.ckpt", epoch)
                    last_saved_epoch = epoch
                if epoch >= cf.max_number_of_epochs:
                    break
                epoch += 1
                num_trained_images = 0
        except tf.errors.OutOfRangeError:
            break

    # Final checkpoint, unless the current step/epoch was already saved.
    if cf.use_save_steps:
        if last_saved_step is None or last_saved_step < steps:
            saver.save(sess, cf.save_dir + "/model.ckpt", steps)
    else:
        if last_saved_epoch is None or last_saved_epoch < epoch:
            saver.save(sess, cf.save_dir + "/model.ckpt", epoch)

    if feed_dict is not None:
        summary_writer.add_summary(
            sess.run(summary_op, feed_dict=feed_dict), steps)
    # Close the writer so pending events are written out before teardown.
    summary_writer.close()
    sess.close()
    tf.reset_default_graph()

    if cf.notify_after_training:
        txt = "%s[%s]\n\n" % (hostname,
                              socket.gethostbyname(socket.gethostname()))
        # Each loss line now ends with a newline, and the average is taken
        # over the steps that actually contributed.
        if start_loss_count > 0:
            txt += "start avg loss : %f\n" % (start_total_loss / start_loss_count)
        else:
            txt += "start avg loss : n/a\n"
        if loss is not None:
            txt += "last loss : %f\n" % loss
        else:
            txt += "last loss : n/a\n"
        txt += "start time: %s\n" % start_time_str
        txt += "end time: %s\n" % datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        if cf.eval_after_training:
            txt += "going to evaluate"
        else:
            txt += "not going to evaluate"
        txt += "\n[params]\n"
        txt += hyper_param_txt
        util.send_msg_to_slack(
            "\n\n==================================\nTraining is Done\n" + txt)

    if cf.eval_after_training:
        cuda.select_device(0)
        cuda.close()
        # NOTE(review): config values are interpolated into a shell command
        # without escaping; acceptable only while `cf` is trusted input.
        eval_cmd = (
            'python -u multiple_search_models.py --model_dir="%s" '
            '--embedding_size=%d --data_dir="%s" --model_name=%s '
            '--max_top_k=%d --shutdown_after_train=%d --gpu_no=%s '
            '--step_type=%s --image_size=%s --eval_batch_size=%d '
            '--preprocessing_name=%s --notify_after_training=%d '
            '--use_old_model=%d --save_static_data=%d'
        ) % (
            cf.save_dir, cf.embedding_size, cf.data_dir, cf.model_name,
            cf.eval_max_top_k, 1 if cf.shutdown_after_train else 0, cf.gpu_no,
            "step" if cf.use_save_steps else "epoch", cf.train_image_size,
            cf.eval_batch_size, cf.preprocessing_name,
            1 if cf.notify_after_training else 0,
            1 if cf.use_old_model else 0,
            1 if cf.save_static_data else 0)
        print(eval_cmd)
        os.system(eval_cmd)
    else:
        if cf.shutdown_after_train:
            os.system("sudo shutdown now")