def multithread_create_tf_record_by_files(files, output_dir, name="train", shuffling=True, fidx=0,
                                          label_text_to_id=None):
    """Split ``files`` into chunks and convert each chunk with ``make_tfrecord`` in a worker pool.

    The output directory is deleted (if present) and recreated, ``files`` is
    optionally shuffled, then split into chunks of ``SAMPLES_PER_FILES``
    entries. Each ``(chunk_index, chunk)`` pair is passed to ``make_tfrecord``
    through a pool of 13 workers.

    Args:
        files: List of input items. It is shuffled IN PLACE when ``shuffling``
            is true, so the caller's list order changes.
        output_dir: Destination directory. Any existing directory at this
            path is removed before conversion starts.
        name: Forwarded to ``make_tfrecord`` as ``name``.
        shuffling: Whether to shuffle ``files`` before chunking.
        fidx: Offset added to every chunk index (useful when appending to an
            existing numbering scheme — presumably the index ends up in the
            output file name; confirm against ``make_tfrecord``).
        label_text_to_id: Forwarded unchanged to ``make_tfrecord``.
    """
    print(f"samples per files {SAMPLES_PER_FILES}.")
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
        # Runtime message (Chinese): "deleted directory <output_dir>".
        print("删除文件夹%s" % (output_dir))
    if not tf.gfile.Exists(output_dir):
        tf.gfile.MakeDirs(output_dir)

    if shuffling:
        random.seed(time.time())
        random.shuffle(files)

    # Remember the real input count now: after chunking, len() would give the
    # number of chunks, which is not what the final summary should report.
    total_files = len(files)
    print(f"Total {total_files} files.")

    chunks = wmlu.list_to_2dlist(files, SAMPLES_PER_FILES)
    files_data = list(enumerate(chunks))
    if fidx != 0:
        # Shift the chunk indices; items stay tuples so workers always receive
        # the same shape of work item regardless of fidx.
        files_data = [(chunk_id + fidx, chunk) for chunk_id, chunk in files_data]

    # Flush before the pool is created so buffered output is not duplicated
    # by the workers.
    sys.stdout.flush()
    worker = functools.partial(make_tfrecord, output_dir=output_dir, name=name,
                               label_text_to_id=label_text_to_id)
    pool = Pool(13)
    try:
        pool.map(worker, files_data)
    finally:
        # Always release the workers, even when a task raised.
        pool.close()
        pool.join()

    print('\nFinished converting the dataset total %d examples.!' % (total_files))
Exemple #2
0
def save2dirs(files, save_dir, files_nr, dir_name):
    """Distribute ``files`` over numbered sub-directories of ``save_dir``.

    ``files`` is split with ``wmlu.list_to_2dlist(files, files_nr)``; group
    ``i`` goes to ``<save_dir>/<dir_name>_<i>``. Each target directory is
    prepared with ``wmlu.create_empty_dir(..., remove_if_exists=False)`` and
    every file of the group is passed to ``wmlu.try_link`` (presumably this
    creates a link inside the target directory — confirm in ``wmlu``).

    Args:
        files: Items (paths) to distribute.
        save_dir: Parent directory of the generated sub-directories.
        files_nr: Group size handed to ``wmlu.list_to_2dlist``.
        dir_name: Prefix of each generated sub-directory name.
    """
    groups = wmlu.list_to_2dlist(files, files_nr)
    for group_idx, group in enumerate(groups):
        target_dir = osp.join(save_dir, dir_name + "_" + str(group_idx))
        wmlu.create_empty_dir(target_dir, remove_if_exists=False)
        for path in group:
            wmlu.try_link(path, target_dir)
Exemple #3
0
 def test_list_to_2dlist(self):
     """Check that ``wmlu.list_to_2dlist`` splits nine items into pairs.

     The last chunk is padded with a 0 before comparing against the
     expected value (presumably because ``assertAllEqual`` needs equally
     sized rows — confirm against the test base class).
     """
     source = list(range(1, 10))
     expected = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 0]]
     chunks = wmlu.list_to_2dlist(source, 2)
     print(chunks)
     # Pad the trailing chunk so it has the same length as the others.
     chunks[-1].append(0)
     self.assertAllEqual(chunks, expected)
    def multi_thread_to_tfrecords_by_files(self, files, output_dir, shuffling=False, fidx=0):
        """Split ``files`` into chunks and convert each chunk with ``self.make_tfrecord`` in a worker pool.

        ``output_dir`` is reset through ``wmlu.create_empty_dir`` (with
        ``remove_if_exists=True``), ``files`` is optionally shuffled, then
        split into chunks of ``SAMPLES_PER_FILES`` entries. Each
        ``(chunk_index, chunk)`` pair is passed to ``self.make_tfrecord``
        through a pool of 13 workers.

        Args:
            files: List of input items. It is shuffled IN PLACE when
                ``shuffling`` is true.
            output_dir: Destination directory; forwarded to
                ``self.make_tfrecord`` as ``output_dir``.
            shuffling: Whether to shuffle ``files`` before chunking.
            fidx: Offset added to every chunk index.
        """
        wmlu.create_empty_dir(output_dir, remove_if_exists=True, yes_to_all=True)
        if shuffling:
            random.seed(time.time())
            random.shuffle(files)

        # Remember the real input count now: after chunking, len() would give
        # the number of chunks, which is not what the summary should report.
        total_files = len(files)

        # Show at most the first 100 entries to keep the log readable.
        wmlu.show_list(files[:100])
        if total_files > 100:
            print("...")
        print(f"Total {total_files} files.")

        chunks = wmlu.list_to_2dlist(files, SAMPLES_PER_FILES)
        files_data = list(enumerate(chunks))
        if fidx != 0:
            # Shift the chunk indices; items stay tuples so workers always
            # receive the same shape of work item regardless of fidx.
            files_data = [(chunk_id + fidx, chunk) for chunk_id, chunk in files_data]

        # Flush before the pool is created so buffered output is not
        # duplicated by the workers.
        sys.stdout.flush()
        worker = functools.partial(self.make_tfrecord, output_dir=output_dir)
        # NOTE: for single-process debugging, replace the pool with
        # ``list(map(worker, files_data))``.
        pool = Pool(13)
        try:
            pool.map(worker, files_data)
        finally:
            # Always release the workers, even when a task raised.
            pool.close()
            pool.join()

        print('\nFinished converting the dataset total %d examples.!' % (total_files))
Exemple #5
0
def make_data_unit(datas, total_nr=None, nr_per_unit=None):
    """Group ``datas`` into ``DataUnit`` objects, by ``total_nr`` or by ``nr_per_unit``.

    At most one of ``total_nr`` and ``nr_per_unit`` may be given.

    Args:
        datas: Sequence of items to group.
        total_nr: Passed to ``wmlu.list_to_2dlistv2`` as its second argument
            (presumably the number of groups to produce — confirm in
            ``wmlu``). When it is ``>= len(datas)`` no grouping happens.
        nr_per_unit: Group size passed to ``wmlu.list_to_2dlist``. When it is
            ``<= 1`` no grouping happens.

    Returns:
        ``datas`` itself when no grouping is performed (including when both
        ``total_nr`` and ``nr_per_unit`` are left as ``None``); otherwise a
        list with one ``DataUnit`` per group.

    Raises:
        AssertionError: If both ``total_nr`` and ``nr_per_unit`` are given.
    """
    assert total_nr is None or nr_per_unit is None, "Error arguments"
    if total_nr is not None:
        if total_nr >= len(datas):
            return datas
        groups = wmlu.list_to_2dlistv2(datas, total_nr)
    elif nr_per_unit is None or nr_per_unit <= 1:
        # Nothing to group. The ``None`` check makes the default call
        # ``make_data_unit(datas)`` a no-op instead of failing on ``None <= 1``.
        return datas
    else:
        groups = wmlu.list_to_2dlist(datas, nr_per_unit)

    return [DataUnit(group) for group in groups]