def batch_one_image_dataset(global_X_path, global_Y_path, img_window_path, img_vlad_path, img_metadata_path, target, overlap_threshold=0.5):
    """Build the (X, Y) data-set rows for one image and append them to the
    global data-set files.

    global_X_path / global_Y_path -- destination files the rows are appended to
    img_window_path               -- serialized windows of this image
    img_vlad_path                 -- matrix file holding one VLAD vector per window
    img_metadata_path             -- XML annotation file for this image
    target                        -- target class label used to derive Y
    overlap_threshold             -- window/ground-truth overlap cut-off for a
                                     positive label (default 0.5)
    """
    metadata = parse_image_metadata(file_path=img_metadata_path, parseObject=True)
    windows = de_serialize_window(input_path=img_window_path)
    vlad_vectors = load_matrix(input_path=img_vlad_path).tolist()
    # get_data_set_X_Y returns a pair: serialized X rows and serialized Y rows.
    xy_pair = get_data_set_X_Y(windows, vlad_vectors, metadata, target, overlap_threshold)
    append_file(dest_file=global_X_path, strInput=xy_pair[0])
    append_file(dest_file=global_Y_path, strInput=xy_pair[1])
def batch_all_images(input_image_path, annotation_path, output_parent_path, unit_ratio_list, overlap_ratio, target, target_pos_path, vladVector, target_count=20, pca=True, k=30, max_iter=30, preVLAD=False, voca_path=None, dataset_mode=False, overlap_threshold=0.5):
    """Full pipeline over all images of one target class: (optionally) extract
    per-window SIFT features, learn or load a VLAD vocabulary, compute VLAD
    vectors per image, and (optionally) emit the global (X, Y) data set.

    input_image_path   -- directory holding the source images
    annotation_path    -- directory holding per-image XML annotations
    output_parent_path -- root output directory (windows/, global files)
    vladVector         -- when true, compute VLAD vectors per image
    preVLAD            -- when true, run window/SIFT extraction first
    voca_path          -- vocabulary file; learned and saved when missing
    dataset_mode       -- when true, also build global_X.txt / global_Y.txt
    """
    global_sift_path = '%sglobal_sift.txt' % (output_parent_path)
    image_name_list = get_target_pos_names(input_path=target_pos_path, target=target, target_count=target_count)
    sift_path_L = []
    if preVLAD:
        delete_file(file_path=global_sift_path)
        for image_name in image_name_list:
            metadata_path = '%s%s.xml' % (annotation_path, image_name)
            win_sift_path = batch_one_image_pre_VLAD(input_image_path, image_name, metadata_path, output_parent_path, unit_ratio_list, overlap_ratio)
            sift_path_L.append(win_sift_path)
            # Accumulate every image's window SIFT vectors into one global file
            # so the vocabulary can be learned from all of them at once.
            append_file(dest_file=global_sift_path, input_path=win_sift_path)
        print("----------pre-VLAD Done")
    else:
        # Re-use SIFT directories from a previous pre-VLAD run.
        all_dir = list_all_files(input_path=output_parent_path + 'windows/', onlyDir=True)
        for d in all_dir:
            sift_path_L.append("%s/windows/%s/temp_sift/" % (output_parent_path, d))
        print("----------pre-VLAD is enabled")
    if voca_path is None or not isfile(voca_path):
        print("~~~~~~~Learning vocabulary by the sift vectors of all windows of all images")
        vector_matrix = None
        if pca:
            vector_matrix = pca_dataset(input_path=global_sift_path)
        vocabulary = learn_vocabulary(input_path=global_sift_path, k=k, max_iter=max_iter, single_file=True, vector_matrix=vector_matrix)
        # NOTE(review): when voca_path is None the vocabulary is saved to a
        # None destination -- confirm save_matrix tolerates that.
        save_matrix(v=vocabulary, output_path=voca_path)
        print("~~~~~~~Learning vocabulary done")
    elif vladVector:
        print("~~~~~~~Loading existing vocabulary")
        vocabulary = load_matrix(input_path=voca_path)
    if vladVector:
        # enumerate replaces the Python-2-only xrange index loop.
        for i, image_name in enumerate(image_name_list):
            output_path = "%swindows/%s/%s" % (output_parent_path, image_name, image_name)
            print("\t======Creating VLAD vectors")
            vlad_vector_batch(input_path=sift_path_L[i], output_path=output_path, vocabulary=vocabulary)
            print("\t======VLAD Done for %s" % image_name)
    else:
        # Reconstructed from a line-wrapped literal in the original source.
        print("##########No VLAD vector generated....")
    if dataset_mode:
        print("^^^^^^^^^^Generate data set for global windows and VLAD")
        global_X_path = output_parent_path + "global_X.txt"
        global_Y_path = output_parent_path + "global_Y.txt"
        # Start the global data set from scratch on every run.
        delete_file(global_X_path)
        delete_file(global_Y_path)
        for img_name in image_name_list:
            img_window_path = "%s/windows/%s/%s_windows.txt" % (output_parent_path, img_name, img_name)
            img_vlad_path = "%s/windows/%s/%s_vlad.txt" % (output_parent_path, img_name, img_name)
            metadata_path = '%s%s.xml' % (annotation_path, img_name)
            batch_one_image_dataset(global_X_path, global_Y_path, img_window_path, img_vlad_path, metadata_path, target, overlap_threshold=overlap_threshold)
            print("\tData set done for %s" % img_name)
    print("....................All done")
def multi_thread_continous_download(url, file_name=None, overwrite=False, thread_num=4):
    """Download *url* to *file_name* with resumable (Range) requests split
    across ``thread_num`` worker threads, then stitch the chunks together.

    url        -- source URL
    file_name  -- destination name; derived from the URL when None
    overwrite  -- remove an existing destination file and re-download
    thread_num -- 1 falls back to the single-threaded path; < 1 does nothing
    """
    if thread_num == 1:
        single_thread_continous_download(url, file_name, overwrite)
    elif thread_num > 1:
        # Derive the file name from the last URL path segment when absent.
        if file_name is None:
            file_name = url.rpartition('/')[-1]
        target_size = get_file_size(url)
        if target_size < 0:
            print("multi_thread_continous_download(): get_file_size() error!\n")
            return
        if os.path.exists(file_name):
            if overwrite:
                os.remove(file_name)
            else:
                # BUGFIX: the size check must not run after os.remove() --
                # getsize() on the removed file would raise OSError.
                current_size = os.path.getsize(file_name)
                # A stricter check would compare MD5 digests, but that requires
                # downloading the whole target first and can waste bandwidth
                # (especially for large files), so only sizes are compared.
                if current_size == target_size:
                    print("multi_thread_continous_download(): file %s already downloaded complete!" % (file_name))
                    return
                else:
                    # Existing file's size differs from the target's: rename it
                    # aside and download the target from scratch.
                    print("multi_thread_continous_download(): file %s size exception, current_size != target_size" % (file_name))
                    new_file_name = file_name + '_' + get_current_timestamp()
                    os.rename(file_name, new_file_name)
                    print("multi_thread_continous_download(): %s RENAMED TO %s" % (file_name, new_file_name))
        ranges = split_file_size(target_size, thread_num)
        thread_group = []
        for i in range(thread_num):
            t = threading.Thread(target=sub_thread_continous_download,
                                 name="thread%d" % i,
                                 args=(url, get_file_name_split(file_name, i), ranges[i][0], ranges[i][1], ranges[i][2]))
            t.start()
            thread_group.append(t)
        for t in thread_group:
            t.join()
        # Verify every chunk file before concatenating them.
        if check_file_integrity(file_name, target_size, thread_num):
            append_file(file_name, thread_num)
def multi_thread_download(url, file_name=None, overwrite=False, thread_num=4):
    """Download *url* to *file_name* by splitting the byte range across
    ``thread_num`` worker threads and concatenating the chunk files.

    url        -- source URL
    file_name  -- destination name; derived from the URL when None
    overwrite  -- when False, an existing destination file is left alone
    thread_num -- 1 falls back to the single-threaded path; < 1 does nothing
    """
    if thread_num == 1:
        single_thread_download(url, file_name, overwrite)
        return
    if thread_num < 1:
        # Non-positive thread counts are silently ignored, as before.
        return
    # Fall back to the last URL path segment when no name was supplied.
    if file_name is None:
        file_name = url.rpartition('/')[-1]
    # Potential pitfall (kept from the original): an existing but incomplete
    # file is left untouched when overwrite is disabled.
    if os.path.exists(file_name) and not overwrite:
        return
    total_bytes = get_file_size(url)
    if total_bytes < 0:
        print("multi_thread_download(): get_file_size() error!\n")
        return
    segments = split_file_size(total_bytes, thread_num)
    workers = []
    for idx in range(thread_num):
        worker = threading.Thread(target=sub_thread_download,
                                  name="thread%d" % idx,
                                  args=(url, split_file_name(file_name, idx), segments[idx][0], segments[idx][1]))
        worker.start()
        workers.append(worker)
    for worker in workers:
        worker.join()
    # Stitch the per-thread chunk files into the final destination file.
    append_file(file_name, thread_num, False)
# NOTE(review): this is a second, byte-identical definition of
# batch_all_images -- it shadows the earlier one; consider removing one copy.
def batch_all_images(input_image_path, annotation_path, output_parent_path, unit_ratio_list, overlap_ratio, target, target_pos_path, vladVector, target_count=20, pca=True, k=30, max_iter=30, preVLAD=False, voca_path=None, dataset_mode=False, overlap_threshold=0.5):
    """Full pipeline over all images of one target class: (optionally) extract
    per-window SIFT features, learn or load a VLAD vocabulary, compute VLAD
    vectors per image, and (optionally) emit the global (X, Y) data set.

    input_image_path   -- directory holding the source images
    annotation_path    -- directory holding per-image XML annotations
    output_parent_path -- root output directory (windows/, global files)
    vladVector         -- when true, compute VLAD vectors per image
    preVLAD            -- when true, run window/SIFT extraction first
    voca_path          -- vocabulary file; learned and saved when missing
    dataset_mode       -- when true, also build global_X.txt / global_Y.txt
    """
    global_sift_path = '%sglobal_sift.txt' % (output_parent_path)
    image_name_list = get_target_pos_names(input_path=target_pos_path, target=target, target_count=target_count)
    sift_path_L = []
    if preVLAD:
        delete_file(file_path=global_sift_path)
        for image_name in image_name_list:
            metadata_path = '%s%s.xml' % (annotation_path, image_name)
            win_sift_path = batch_one_image_pre_VLAD(input_image_path, image_name, metadata_path, output_parent_path, unit_ratio_list, overlap_ratio)
            sift_path_L.append(win_sift_path)
            # Accumulate every image's window SIFT vectors into one global file
            # so the vocabulary can be learned from all of them at once.
            append_file(dest_file=global_sift_path, input_path=win_sift_path)
        print("----------pre-VLAD Done")
    else:
        # Re-use SIFT directories from a previous pre-VLAD run.
        all_dir = list_all_files(input_path=output_parent_path + 'windows/', onlyDir=True)
        for d in all_dir:
            sift_path_L.append("%s/windows/%s/temp_sift/" % (output_parent_path, d))
        print("----------pre-VLAD is enabled")
    if voca_path is None or not isfile(voca_path):
        print("~~~~~~~Learning vocabulary by the sift vectors of all windows of all images")
        vector_matrix = None
        if pca:
            vector_matrix = pca_dataset(input_path=global_sift_path)
        vocabulary = learn_vocabulary(input_path=global_sift_path, k=k, max_iter=max_iter, single_file=True, vector_matrix=vector_matrix)
        # NOTE(review): when voca_path is None the vocabulary is saved to a
        # None destination -- confirm save_matrix tolerates that.
        save_matrix(v=vocabulary, output_path=voca_path)
        print("~~~~~~~Learning vocabulary done")
    elif vladVector:
        print("~~~~~~~Loading existing vocabulary")
        vocabulary = load_matrix(input_path=voca_path)
    if vladVector:
        # enumerate replaces the Python-2-only xrange index loop.
        for i, image_name in enumerate(image_name_list):
            output_path = "%swindows/%s/%s" % (output_parent_path, image_name, image_name)
            print("\t======Creating VLAD vectors")
            vlad_vector_batch(input_path=sift_path_L[i], output_path=output_path, vocabulary=vocabulary)
            print("\t======VLAD Done for %s" % image_name)
    else:
        # Reconstructed from a line-wrapped literal in the original source.
        print("##########No VLAD vector generated....")
    if dataset_mode:
        print("^^^^^^^^^^Generate data set for global windows and VLAD")
        global_X_path = output_parent_path + "global_X.txt"
        global_Y_path = output_parent_path + "global_Y.txt"
        # Start the global data set from scratch on every run.
        delete_file(global_X_path)
        delete_file(global_Y_path)
        for img_name in image_name_list:
            img_window_path = "%s/windows/%s/%s_windows.txt" % (output_parent_path, img_name, img_name)
            img_vlad_path = "%s/windows/%s/%s_vlad.txt" % (output_parent_path, img_name, img_name)
            metadata_path = '%s%s.xml' % (annotation_path, img_name)
            batch_one_image_dataset(global_X_path, global_Y_path, img_window_path, img_vlad_path, metadata_path, target, overlap_threshold=overlap_threshold)
            print("\tData set done for %s" % img_name)
    print("....................All done")