def set_lock(check_lock_file):
    """Acquire an OS-appropriate inter-process lock on *check_lock_file*.

    POSIX: uses an NFS-safe ``Lock`` with a one-hour lifetime (blocks until
    acquired). Windows: uses ``filelock.FileLock`` with a 100-second timeout.
    Any other OS: no lock is taken and ``None`` is returned.

    Returns the acquired lock object (or ``None``); the caller is
    responsible for releasing it.
    """
    if os.name == "posix":
        # NFS-safe lock; lifetime bounds how long a stale lock can linger.
        lock = Lock(check_lock_file)
        lock.lifetime = timedelta(hours=1)
        frame = getframeinfo(currentframe())
        print("-- {}/{}: waiting to obtain lock --".format(
            frame.filename, frame.lineno))
        lock.lock()
        print(">> obtained lock for posix system<<")
        return lock

    if os.name == "nt":
        import filelock
        lock = filelock.FileLock(check_lock_file)
        lock.timeout = 100  # 100s
        lock.acquire()
        if lock.is_locked:
            print(">> obtained lock for windows system <<")
        return lock

    print("Unknown operating system, lock unavailable")
    return None
def load_data_for_set(self, pathconf, param, mode):
    """Load (and, if necessary, build) the patch dataset for one split.

    Ensures a chunked HDF5 dump (``<mode>-data-chunked.h5``) exists in the
    patch-dump directory — generating it from per-image dumps, or converting
    an older un-chunked file — then syncs it to a local (volatile) copy and
    reads it back in batches.

    Parameters
    ----------
    pathconf : object with ``dataset``, ``train_dump``, ``patch_dump`` and
        ``volatile_patch_dump`` path attributes.
    param : configuration object; this code reads ``param.dataset.*``
        (``fMinKpSize``, ``nTrainPercent``, ``nValidPercent``,
        ``nTestPercent``).
    mode : str, one of ``"train"``, ``"valid"``, ``"test"``.
        NOTE(review): any other value leaves ``id_key`` unbound and would
        raise ``NameError`` later — confirm callers only pass these three.

    Returns
    -------
    tuple ``(x, y, ID, pos, angle, coords)`` where ``x`` is a live h5py
    dataset (the file is deliberately kept open) and the rest are numpy
    arrays copied out of the local dump.
    """
    # ----------------------------------------------------------------------
    # Train, Validation, and Test
    # mlab = matlab.engine.start_matlab()

    # Read from pathconf
    # Original implementation
    train_data_dir = os.path.normpath(pathconf.dataset)
    dump_data_dir = os.path.normpath(pathconf.train_dump)
    dump_patch_dir = os.path.normpath(pathconf.patch_dump)
    # local (or volatile) copy of the dump data
    tmp_patch_dir = os.path.normpath(pathconf.volatile_patch_dump)
    # NOTE(review): normpath strips any trailing separator, yet these dirs
    # are later concatenated with plain `+` (no os.sep) — e.g.
    # `train_data_dir + "split-..."`. Verify the path attributes embed a
    # trailing separator by convention, or paths will be malformed.

    # print("train_data_dir = {}".format(train_data_dir))
    # print("dump_data_dir = {}".format(dump_data_dir))
    # print("dump_patch_dir = {}".format(dump_patch_dir))
    # print("tmp_patch_dir = {}".format(tmp_patch_dir))

    # Make sure all dump directories exist before any file is written.
    if not os.path.exists(dump_data_dir):
        os.makedirs(dump_data_dir)
    if not os.path.exists(dump_patch_dir):
        os.makedirs(dump_patch_dir)
    if not os.path.exists(tmp_patch_dir):
        os.makedirs(tmp_patch_dir)

    # Check if we have the big h5 file ready
    big_file_name = dump_patch_dir + mode + "-data-chunked.h5"
    # if os.getenv("MLTEST_DEBUG", default=""):
    #     import pdb
    #     pdb.set_trace()

    # Mutex lock
    #
    # We will create an nfs-safe lock file in a temporary directory to
    # prevent our script from using corrupted, or data that is still being
    # generated. This allows us to launch multiple instances at the same
    # time, and allow only a single instance to generate the big_file.
    if not os.path.exists(".locks"):
        os.makedirs(".locks")
    # Lock file name is the md5 of the target file so concurrent runs on
    # different splits/datasets use distinct locks.
    check_lock_file = ".locks/" + \
        hashlib.md5(big_file_name.encode()).hexdigest()
    if os.name == "posix":
        check_lock = Lock(check_lock_file)
        # Two-day lifetime: generation of the big dump can take a long time.
        check_lock.lifetime = timedelta(days=2)
        frameinfo = getframeinfo(currentframe())
        print("-- {}/{}: waiting to obtain lock --".format(
            frameinfo.filename, frameinfo.lineno))
        check_lock.lock()
        print(">> obtained lock for posix system<<")
    elif os.name == "nt":
        import filelock
        check_lock = filelock.FileLock(check_lock_file)
        check_lock.timeout = 2000
        check_lock.acquire()
        if check_lock.is_locked:
            print(">> obtained lock for windows system <<")
    else:
        print("Unknown operating system, lock unavailable")

    # if the large training data file does not exist
    if not os.path.exists(big_file_name):
        print("big data file does not exist...")
        # if the patch-mode-data file does not exist
        if not os.path.exists(
                os.path.join(dump_patch_dir, mode + "-data.h5")):
            print("{0} does not exist...".format(
                os.path.join(dump_patch_dir, mode + "-data.h5")))

            # Read scale histogram (used to sample patch scales when
            # creating the per-image dumps).
            hist_file_path = train_data_dir + "scales-histogram-minsc-" + str(
                param.dataset.fMinKpSize) + ".h5"
            if not os.path.exists(hist_file_path):
                print("Hist file does not exist, creating...")
                get_scale_hist(train_data_dir, param)
            # print("Loading hist file...")
            # NOTE(review): hist_file is never closed in this function.
            hist_file = h5py.File(hist_file_path, "r")
            scale_hist = np.asarray(hist_file["histogram_bins"],
                                    dtype=float).flatten()
            # print(scale_hist)
            # Normalize bin counts into a probability distribution.
            scale_hist /= np.sum(scale_hist)
            scale_hist_c = np.asarray(
                hist_file["histogram_centers"]).flatten()

            # Read list of images from split files.  Split file name
            # encodes the train/valid/test percentages, e.g.
            # "split-60-20-20-train.txt".
            split_name = ""
            split_name += str(param.dataset.nTrainPercent) + "-"
            split_name += str(param.dataset.nValidPercent) + "-"
            split_name += str(param.dataset.nTestPercent) + "-"
            if mode == "train":
                # split_name += "train-"
                split_name += "train"
            elif mode == "valid":
                # split_name += "val-"
                split_name += "val"
            elif mode == "test":
                # split_name += "test-"
                split_name += "test"

            print("split_name: {}".format(split_name))
            # split_file_name = train_data_dir + "split-" \
            #     + split_name + "minsc-" \
            #     + str(param.dataset.fMinKpSize) + ".h.txt"
            # split_file_name = "split-" + split_name + "minsc-" + str(param.dataset.fMinKpSize) + ".h.txt"
            split_file_name = "split-" + split_name + ".txt"
            split_file_name = train_data_dir + split_file_name
            # split_file_name = os.path.join(train_data_dir, split_file_name)
            print("split_file_name: {}".format(split_file_name))
            if not os.path.exists(split_file_name):
                print("split_file_name does not exist...")
                # Fall back to scanning the dataset directory.
                list_jpg_file = get_list_of_img(
                    train_data_dir, dump_data_dir, param, mode)
            else:
                print("split_file_name exists...")
                # Each line names a keypoint dump; map it back to the
                # corresponding .jpg file name.
                list_jpg_file = []
                for file_name in list(
                        np.loadtxt(split_file_name, dtype=bytes)):
                    list_jpg_file += [
                        file_name.decode("utf-8").replace(
                            "-kp-minsc-" + str(param.dataset.fMinKpSize),
                            ".jpg")
                    ]

            # -------------------------------------------------
            # Create dumps in parallel
            # I am lazy so create arguments in loop lol
            pool_arg = [None] * len(list_jpg_file)
            for idx_jpg in six.moves.xrange(len(list_jpg_file)):
                pool_arg[idx_jpg] = (idx_jpg,
                                     list_jpg_file[idx_jpg],
                                     train_data_dir,
                                     dump_data_dir,
                                     tmp_patch_dir,
                                     scale_hist,
                                     scale_hist_c,
                                     self.out_dim,
                                     param)

            # # if true, use multi thread, otherwise use only single thread
            prod = True
            if prod:
                number_of_process = int(ratio_CPU * mp.cpu_count())
                pool = mp.Pool(processes=number_of_process)
                # Shared queue is only used to report per-image progress
                # from the workers back to this process.
                manager = mp.Manager()
                queue = manager.Queue()
                for idx_jpg in six.moves.xrange(len(list_jpg_file)):
                    pool_arg[idx_jpg] = pool_arg[idx_jpg] + (queue, )
                # map async
                pool_res = pool.map_async(createDump, pool_arg)
                # pool_res = pool.map_async(createDump, pool_arg, chunksize = int(len(list_jpg_file)/(number_of_process* mp.cpu_count())))
                # monitor loop: poll once a second until the pool finishes,
                # using the queue size as a completed-image counter.
                while True:
                    if pool_res.ready():
                        print("Pool_res ready?")
                        break
                    else:
                        size = queue.qsize()
                        print("\r -- " + mode + ": Processing image "
                              "{}/{}".format(size, len(list_jpg_file)),
                              end="")
                        # print(list_jpg_file[size])
                        sys.stdout.flush()
                        time.sleep(1)
                pool.close()
                pool.join()
                print("\r -- " + mode + ": Finished Processing Images!")
            # for debugging, if multi thread is used, then it is difficult
            # to debug
            else:
                for idx_jpg in six.moves.xrange(len(list_jpg_file)):
                    pool_arg[idx_jpg] = pool_arg[idx_jpg] + (None, )
                for idx_jpg in six.moves.xrange(len(list_jpg_file)):
                    createDump(pool_arg[idx_jpg])
                    print("\r -- " + mode + ": Processing image "
                          "{}/{}".format(idx_jpg + 1, len(list_jpg_file)),
                          end="")
                    sys.stdout.flush()
                print("\r -- " + mode + ": Finished Processing Images!")
            # -------------------------------------------------
            #
            # --------------------
            # use single thread for simplify debugging
            # for idx_jpg in six.moves.xrange(len(list_jpg_file)):
            #     pool_arg[idx_jpg] = pool_arg[idx_jpg] + (None,)
            # for idx_jpg in six.moves.xrange(len(list_jpg_file)):
            #     createDump(pool_arg[idx_jpg])
            #     print("\r -- " + mode + ": Processing image "
            #           "{}/{}".format(idx_jpg + 1, len(list_jpg_file)),
            #           end="")
            #     sys.stdout.flush()
            # print("\r -- " + mode + ": Finished Processing Images!")

            # ------------------------------------------------------------------
            # Use only valid indices to ascertain mutual exclusiveness
            id_file_name = train_data_dir + "split-"
            id_file_name += str(param.dataset.nTrainPercent) + "-"
            id_file_name += str(param.dataset.nValidPercent) + "-"
            id_file_name += str(param.dataset.nTestPercent) + "-"
            id_file_name += ("minsc-" + str(param.dataset.fMinKpSize) +
                             ".h5")

            if mode == "train":
                id_key = "indices_train"
            elif mode == "valid":
                id_key = "indices_val"
            elif mode == "test":
                id_key = "indices_test"
            # print(id_file_name)
            try:
                with h5py.File(id_file_name, "r") as id_file:
                    id_2_keep = np.asarray(id_file[id_key])
            except OSError as err:
                # Index file missing/unreadable: (re)create it and retry.
                print(err)
                print("Creating idx file...")
                # if "unable to open file" in err:
                createsplitindexh5file(id_file_name, train_data_dir, param)
                with h5py.File(id_file_name, "r") as id_file:
                    id_2_keep = np.asarray(id_file[id_key])
            # print(id_2_keep)
            print("{0} has {1} sfmid points to keep...".format(
                id_key, len(id_2_keep)))
            # exit()

            # ind_2_keep = np.in1d(dataset[2], id_2_keep)
            # ind_2_keep += dataset[2] < 0

            # loop through files to figure out how many valid items we have
            # pdb.set_trace() # for tracking of the dataset
            num_valid = 0
            # print(len(list_jpg_file))
            # exit()
            for idx_jpg in six.moves.xrange(len(list_jpg_file)):
                jpg_file = list_jpg_file[idx_jpg]
                print("\r -- " + mode + ": "
                      "Reading dumps to figure out number of valid "
                      "{}/{}".format(idx_jpg + 1, len(list_jpg_file)),
                      end="")
                sys.stdout.flush()

                # Load created dump
                # final_dump_file_name = tmp_patch_dir + jpg_file.replace(".jpg", ".h5")
                # print(tmp_patch_dir)
                # print(jpg_file)
                # NOTE(review): "\\" is a hard-coded Windows separator even
                # though the posix lock branch above suggests this also runs
                # on Linux — confirm os.path.join isn't needed here.
                final_dump_file_name = tmp_patch_dir + "\\" + os.path.basename(
                    jpg_file)[:-4] + ".h5"
                # print(final_dump_file_name)
                # Use loadh5 and turn it back to original cur_data_set
                try:
                    with h5py.File(final_dump_file_name, "r") as dump_file:
                        # print(list(dump_file.keys()))
                        # Key "2" holds the per-keypoint IDs.
                        # NOTE(review): Dataset.value was removed in
                        # h5py >= 3.0 (use [()]) — confirm pinned version.
                        cur_ids = dump_file["2"].value
                        # kps = dump_file["valid_keypoints"][()]
                        # cur_ids = np.asarray(kps[:, 4])
                        # print(cur_ids)
                except OSError as err:
                    # Dump missing/corrupt: skip this image entirely.
                    # print(err)
                    continue

                # Find cur valid by looking at id_2_keep
                cur_valid = np.in1d(cur_ids, id_2_keep)
                # print(cur_valid)
                # Add all negative labels as valid (neg data)
                cur_valid += cur_ids < 0
                # Sum it up
                num_valid += np.sum(cur_valid)
                # print(num_valid)

            print("\n -- " + mode + ": "
                  "Found {} valid data points from {} files"
                  "".format(num_valid, len(list_jpg_file)))

            # Get the first data to simply check the shape
            tmp_dump_file_name = tmp_patch_dir + "\\" + os.path.basename(
                list_jpg_file[-1])[:-4] + ".h5"
            with h5py.File(tmp_dump_file_name, "r") as dump_file:
                dataset_shape = []
                dataset_type = []
                for _idx in six.moves.xrange(len(dump_file.keys())):
                    dataset_shape += [dump_file[str(_idx)].shape]
                    dataset_type += [dump_file[str(_idx)].dtype]

            # create and save the large dataset chunk.  "w-" fails if the
            # file already exists (the lock should make this impossible).
            with h5py.File(big_file_name, "w-") as big_file:
                big_file["time_stamp"] = np.asarray(time.localtime())
                # Datasets are keyed "0".."5" in the per-image dumps and
                # renamed here, in order:
                name_list = ["x", "y", "ID", "pos", "angle", "coords"]
                # create the dataset storage chunk
                for __i in six.moves.xrange(len(dataset_shape)):
                    big_file.create_dataset(
                        name_list[__i],
                        (num_valid, ) + dataset_shape[__i][1:],
                        chunks=(1, ) + dataset_shape[__i][1:],
                        maxshape=((num_valid, ) + dataset_shape[__i][1:]),
                        dtype=dataset_type[__i])

                # loop through the file to save to a big chunk
                save_base = 0
                for idx_jpg in six.moves.xrange(len(list_jpg_file)):
                    jpg_file = list_jpg_file[idx_jpg]
                    print("\r -- " + mode + ": "
                          "Saving the data to the big dump "
                          "{}/{}".format(idx_jpg + 1, len(list_jpg_file)),
                          end="")
                    sys.stdout.flush()

                    # Load created dump
                    # final_dump_file_name = tmp_patch_dir + jpg_file.replace(".jpg", ".h5")
                    final_dump_file_name = tmp_patch_dir + "\\" + os.path.basename(
                        jpg_file)[:-4] + ".h5"
                    # print(final_dump_file_name)
                    # Use loadh5 and turn it back to original cur_data_set
                    try:
                        tmpdict = loadh5(final_dump_file_name)
                        cur_data_set = tuple([
                            tmpdict[str(_idx)]
                            for _idx in range(len(tmpdict.keys()))
                        ])
                        # Find cur valid by looking at id_2_keep
                        cur_valid = np.in1d(cur_data_set[2], id_2_keep)
                        # Add all negative labels as valid (neg data)
                        cur_valid += cur_data_set[2] < 0
                        # Copy the valid rows into the next free slice of
                        # each big dataset.
                        for __i in six.moves.xrange(len(dataset_shape)):
                            big_file[name_list[__i]][
                                save_base:save_base + np.sum(cur_valid)
                            ] = cur_data_set[__i][cur_valid]
                        # Move base to the next chunk
                        save_base += np.sum(cur_valid)
                    except OSError as err:
                        # Same skip policy as the counting pass above, so
                        # counts stay consistent.
                        # print(err)
                        # print("{0} skipped due to invalidity...".format(final_dump_file_name))
                        # sys.stdout.flush()
                        continue

                # Assert that we saved all
                assert save_base == num_valid

            print("\n -- " + mode + ": "
                  "Done saving {} valid data points from {} files"
                  "".format(num_valid, len(list_jpg_file)))

            # --------------------------------------------------
            # Cleanup dump
            for idx_jpg in six.moves.xrange(len(list_jpg_file)):
                jpg_file = list_jpg_file[idx_jpg]
                print("\r -- " + mode + ": "
                      "Removing dump "
                      "{}/{}".format(idx_jpg + 1, len(list_jpg_file)),
                      end="")
                sys.stdout.flush()

                # Delete dump
                # final_dump_file_name = tmp_patch_dir + jpg_file.replace(".jpg", ".h5")
                final_dump_file_name = tmp_patch_dir + "\\" + os.path.basename(
                    jpg_file)[:-4] + ".h5"
                try:
                    os.remove(final_dump_file_name)
                except FileNotFoundError as err:
                    # Already gone (e.g. skipped above) — nothing to do.
                    pass

            print("\r -- " + mode + ": "
                  "Cleaned up dumps! "
                  "Local dump is now clean!")

        else:
            # Legacy path: an un-chunked <mode>-data.h5 exists; convert it
            # to the chunked layout and delete the old file.
            print(" -- Found old file without chunks. "
                  "Copying to new file with chunks...")
            old_big_file_name = dump_patch_dir + mode + "-data.h5"
            with h5py.File(old_big_file_name, "r") as old_big_file, \
                    h5py.File(big_file_name, "w-") as big_file:
                dataset = []

                # load old train into array
                name_list = ["x", "y", "ID", "pos", "angle", "coords"]
                for __i in six.moves.xrange(len(name_list)):
                    dataset += [np.asarray(old_big_file[name_list[__i]])]

                # save train
                big_file["time_stamp"] = np.asarray(time.localtime())

                # allocate and write.  Only "x" (the patches) is chunked
                # one-sample-at-a-time; the small arrays use default layout.
                for __i in six.moves.xrange(len(name_list)):
                    if name_list[__i] == "x":
                        chunk_shape = (1, ) + dataset[__i].shape[1:]
                    else:
                        chunk_shape = None
                    big_file.create_dataset(
                        name_list[__i],
                        dataset[__i].shape,
                        data=dataset[__i],
                        chunks=chunk_shape,
                        maxshape=dataset[__i].shape,
                    )

            print(" -- Finished creating chunked file, removing old...")
            os.remove(old_big_file_name)

    # ----------------------------------------------------------------------
    # Copy to local tmp if necessary
    if not os.path.exists(tmp_patch_dir + mode + "-data-chunked.h5"):
        print(" -- " + mode + ": "
              "Local dump does not exist! "
              "Copying big dump to local drive... ")
        shutil.copy(dump_patch_dir + mode + "-data-chunked.h5",
                    tmp_patch_dir + mode + "-data-chunked.h5")
    else:
        print(" -- " + mode + ": "
              "Local dump exists. Checking timestamp...")
        # get timestamp from nfs
        with h5py.File(dump_patch_dir + mode + "-data-chunked.h5", "r") \
                as nfs_file:
            nfs_time = np.asarray(nfs_file["time_stamp"])
        # get timestamp from local
        with h5py.File(tmp_patch_dir + mode + "-data-chunked.h5", "r") \
                as local_file:
            local_time = np.asarray(local_file["time_stamp"])
        # if the two files have different time stamps
        if any(nfs_time != local_time):
            print(" -- " + mode + ": "
                  "Time stamps are different! "
                  "Copying big dump to local drive... ")
            shutil.copy(dump_patch_dir + mode + "-data-chunked.h5",
                        tmp_patch_dir + mode + "-data-chunked.h5")
        else:
            print(" -- " + mode + ": "
                  "Time stamps are identical! Re-using local dump")

    # Free lock
    # NOTE(review): the lock is not released on the exception paths above
    # (no try/finally), so a failure leaves it held until its lifetime
    # expires — confirm this is the intended recovery behavior.
    if os.name == "posix":
        check_lock.unlock()
        print("-- free lock --")
    elif os.name == "nt":
        check_lock.release()
        print("-- free lock --")
    else:
        pass

    # ----------------------------------------------------------------------
    # Use local copy for faster speed
    print(" -- " + mode + ": Loading from local drive... ")
    big_file_name = tmp_patch_dir + mode + "-data-chunked.h5"

    # open big_file and don"t close: "x" is returned as a live h5py dataset
    # and must remain readable after this function returns.
    big_file = h5py.File(big_file_name, "r")

    x = big_file["x"]
    # work arround for h5py loading all things to memory
    read_batch_size = 10000
    read_batch_num = int(
        np.ceil(float(big_file["x"].shape[0]) / float(read_batch_size)))

    # Manual, since I don't want to bother debugging the below
    # fields = ["y", "ID", "pos", "angle", "coords"]
    # for var_name in fields:
    #     # allocate data
    #     exec("{0} = np.zeros(big_file['{0}'].shape, "
    #          "dtype=big_file['{0}'].dtype)".format(var_name))
    #     # copy data in batches
    #     for idx_batch in six.moves.xrange(read_batch_num):
    #         idx_s = idx_batch * read_batch_size
    #         idx_e = (idx_batch + 1) * read_batch_size
    #         idx_e = np.minimum(idx_e, big_file["x"].shape[0])
    #         exec("{0}[idx_s:idx_e] = np.asarray(big_file['{0}'][idx_s:idx_e])"
    #              "".format(var_name))

    # Allocate
    y = np.zeros(big_file["y"].shape, dtype=big_file["y"].dtype)
    ID = np.zeros(big_file["ID"].shape, dtype=big_file["ID"].dtype)
    pos = np.zeros(big_file["pos"].shape, dtype=big_file["pos"].dtype)
    angle = np.zeros(big_file["angle"].shape, dtype=big_file["angle"].dtype)
    coords = np.zeros(big_file["coords"].shape, dtype=big_file["coords"].dtype)

    # Copy data in batches (avoids pulling each dataset into memory in one
    # giant h5py read).
    for idx_batch in six.moves.xrange(read_batch_num):
        idx_s = idx_batch * read_batch_size
        idx_e = (idx_batch + 1) * read_batch_size
        idx_e = np.minimum(idx_e, big_file["x"].shape[0])
        y[idx_s:idx_e] = np.asarray(big_file['y'][idx_s:idx_e])
        ID[idx_s:idx_e] = np.asarray(big_file['ID'][idx_s:idx_e])
        pos[idx_s:idx_e] = np.asarray(big_file['pos'][idx_s:idx_e])
        angle[idx_s:idx_e] = np.asarray(big_file['angle'][idx_s:idx_e])
        coords[idx_s:idx_e] = np.asarray(big_file['coords'][idx_s:idx_e])

    # import pdb
    # pdb.set_trace()

    # # Make sure data is contiguos
    # y = np.ascontiguousarray(y)
    # ID = np.ascontiguousarray(ID)
    # pos = np.ascontiguousarray(pos)
    # angle = np.ascontiguousarray(angle)
    # coords = np.ascontiguousarray(coords)

    print(" -- " + mode + ": Done... ")

    return x, y, ID, pos, angle, coords