def multithread_create_tf_record_by_files(files, output_dir, name="train", shuffling=True, fidx=0, label_text_to_id=None, pool_size=13):
    """Convert `files` into TFRecord shards under `output_dir` with a process pool.

    Args:
        files: list of input items; chunked into groups of SAMPLES_PER_FILES.
        output_dir: destination directory; deleted and recreated if it exists.
        name: base name for the generated record files (passed to make_tfrecord).
        shuffling: when True, shuffle `files` (seeded with the current time) first.
        fidx: starting shard-index offset, added to each chunk's index (useful
            when resuming / appending to an earlier run).
        label_text_to_id: optional mapping forwarded to make_tfrecord.
        pool_size: number of worker processes (was previously hard-coded to 13).
    """
    print(f"samples per files {SAMPLES_PER_FILES}.")
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
        print("删除文件夹%s" % (output_dir))
    if not tf.gfile.Exists(output_dir):
        tf.gfile.MakeDirs(output_dir)
    if shuffling:
        random.seed(time.time())
        random.shuffle(files)
    print(f"Total {len(files)} files.")
    files = wmlu.list_to_2dlist(files, SAMPLES_PER_FILES)
    # Pair each chunk with its shard index, shifted by fidx (fidx + 0 == 0 keeps
    # the original behavior when no offset is requested).
    files_data = [(fid + fidx, file_d) for fid, file_d in enumerate(files)]
    sys.stdout.flush()
    # Context manager guarantees the pool is torn down even if map() raises;
    # map() blocks until all chunks are converted.
    with Pool(pool_size) as pool:
        pool.map(functools.partial(make_tfrecord, output_dir=output_dir, name=name, label_text_to_id=label_text_to_id), files_data)
    print('\nFinished converting the dataset total %d examples.!' % (len(files)))
def save2dirs(files, save_dir, files_nr, dir_name):
    """Distribute `files` into numbered subdirectories of `save_dir`.

    The list is split into chunks of `files_nr`; chunk i is linked into
    `<save_dir>/<dir_name>_<i>` (the directory is created if missing but
    never wiped).
    """
    chunks = wmlu.list_to_2dlist(files, files_nr)
    for idx, chunk in enumerate(chunks):
        target_dir = osp.join(save_dir, dir_name + f"_{idx}")
        wmlu.create_empty_dir(target_dir, remove_if_exists=False)
        for src in chunk:
            wmlu.try_link(src, target_dir)
def test_list_to_2dlist(self):
    """list_to_2dlist should chunk a 9-element list into pairs, with the
    odd trailing element left alone in the final chunk."""
    source = [1, 2, 3, 4, 5, 6, 7, 8, 9]
    expected = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 0]]
    chunked = wmlu.list_to_2dlist(source, 2)
    print(chunked)
    # Pad the short last chunk so it lines up with `expected`.
    chunked[-1].append(0)
    self.assertAllEqual(chunked, expected)
def multi_thread_to_tfrecords_by_files(self, files, output_dir, shuffling=False, fidx=0, pool_size=13):
    """Convert `files` into TFRecord shards under `output_dir` with a process pool.

    Args:
        files: list of input items; chunked into groups of SAMPLES_PER_FILES.
        output_dir: destination directory; recreated empty before conversion.
        shuffling: when True, shuffle `files` (seeded with the current time) first.
        fidx: starting shard-index offset, added to each chunk's index.
        pool_size: number of worker processes (was previously hard-coded to 13).
    """
    wmlu.create_empty_dir(output_dir, remove_if_exists=True, yes_to_all=True)
    if shuffling:
        random.seed(time.time())
        random.shuffle(files)
    wmlu.show_list(files[:100])
    if len(files) > 100:
        print("...")
    print(f"Total {len(files)} files.")
    sys.stdout.flush()
    files = wmlu.list_to_2dlist(files, SAMPLES_PER_FILES)
    # Pair each chunk with its shard index, shifted by fidx (fidx + 0 == 0
    # keeps the original behavior when no offset is requested).
    files_data = [(fid + fidx, file_d) for fid, file_d in enumerate(files)]
    sys.stdout.flush()
    # Context manager guarantees the pool is torn down even if map() raises;
    # map() blocks until all chunks are converted.
    with Pool(pool_size) as pool:
        pool.map(functools.partial(self.make_tfrecord, output_dir=output_dir), files_data)
    print('\nFinished converting the dataset total %d examples.!' % (len(files)))
def make_data_unit(datas, total_nr=None, nr_per_unit=None):
    """Wrap `datas` into a list of DataUnit chunks.

    Exactly one of `total_nr` / `nr_per_unit` must be given:
      * total_nr: desired number of units; if `datas` already has that many
        elements or fewer, it is returned unchanged.
      * nr_per_unit: elements per unit; values <= 1 return `datas` unchanged.

    Returns:
        Either `datas` untouched (degenerate cases above) or a list of
        DataUnit objects, one per chunk.

    Raises:
        ValueError: if both or neither of total_nr / nr_per_unit are given.
        (The old code used `assert` — stripped under -O — and crashed with a
        TypeError on `None <= 1` when neither argument was supplied.)
    """
    if total_nr is not None and nr_per_unit is not None:
        raise ValueError("Error arguments")
    if total_nr is not None:
        if total_nr >= len(datas):
            return datas
        datas = wmlu.list_to_2dlistv2(datas, total_nr)
    elif nr_per_unit is not None:
        if nr_per_unit <= 1:
            return datas
        datas = wmlu.list_to_2dlist(datas, nr_per_unit)
    else:
        # Fail fast with a clear message instead of the old TypeError crash.
        raise ValueError("Either total_nr or nr_per_unit must be provided")
    datas = [DataUnit(x) for x in datas]
    return datas