Ejemplo n.º 1
0
def validate_downloaded_data(data_path):
    """Check the per-target folders in *data_path* against an optional count table.

    Every sub-folder reported by ``funcH.getFolderList`` is treated as one
    target and the number of directory entries inside it is counted.  If
    ``funcH.getFileList`` reports a ``cnt_table*.csv`` file in ``data_path``,
    the first one is read: its first column is compared with the folder names
    and its second column with the folder counts.  The last CSV row is skipped
    (presumably a "total" row - TODO confirm against the writer of the table).

    Args:
        data_path: Directory expected to contain one sub-folder per target.

    Returns:
        Tuple ``(img_counts_matches, targets, folder_counts)``.
        ``img_counts_matches`` is ``False`` when ``data_path`` is not a
        directory, when the table lists a different number of targets than
        there are folders, or when any folder count differs from the table.
        It is ``True`` when no table is present.  ``targets`` is the list of
        folder names and ``folder_counts`` an integer array of entry counts;
        both are empty lists when ``data_path`` is not a directory.

    Raises:
        AssertionError: If a target name in the table differs from the folder
            name at the same position.
    """
    if not os.path.isdir(data_path):
        print("data_path({:s}) doesnt exist".format(data_path))
        return False, [], []

    targets = funcH.getFolderList(dir2Search=data_path, sortList=True).tolist()
    csv_file = funcH.getFileList(dir2Search=data_path,
                                 startString="cnt_table",
                                 endString=".csv")

    img_counts_matches = True
    file_targets, file_counts = [], []
    # len() is valid for both lists and arrays; comparing an array with ``[]``
    # would not yield a usable truth value.
    if len(csv_file) > 0:
        cnt_pd = pd.read_csv(filepath_or_buffer=os.path.join(
            data_path, csv_file[0]),
                             delimiter=',')
        file_targets = cnt_pd[cnt_pd.columns[0]].values[:-1]
        file_counts = cnt_pd[cnt_pd.columns[1]].values[:-1]
        # A table that lists more or fewer targets than exist on disk means
        # the data is incomplete (or has extras); report it as a mismatch
        # instead of indexing past the end or silently ignoring extra rows.
        if len(file_targets) != len(targets):
            img_counts_matches = False

    folder_counts = np.zeros((len(targets), ), dtype=int)
    for i, t in enumerate(targets):
        folder_counts[i] = len(os.listdir(os.path.join(data_path, t)))
        if i >= len(file_targets):
            # No table, or the table has no row for this folder.
            continue
        if file_targets[i] != t:
            # Raised explicitly so the check survives ``python -O``.
            raise AssertionError("{}!={}".format(file_targets[i], t))
        if folder_counts[i] != file_counts[i]:
            img_counts_matches = False

    return img_counts_matches, targets, folder_counts
Ejemplo n.º 2
0
def get_last_epoch_completed(out_folder):
    """Return the number of ``output_te*.png`` files found in *out_folder*.

    The count is the length of whatever ``funcH.getFileList`` returns for the
    start string ``"output_te"`` and end string ``".png"`` (unsorted).  The
    function name suggests one such image is written per finished epoch -
    TODO confirm against the training loop.
    """
    return len(
        funcH.getFileList(out_folder,
                          startString="output_te",
                          endString=".png",
                          sortList=False))
Ejemplo n.º 3
0
def count_data_in_folder(data_path):
    """Count the ``.png`` files in every target sub-folder of *data_path*.

    Args:
        data_path: Directory expected to contain one sub-folder per target.

    Returns:
        Tuple ``(targets, img_cnt)``: the sorted folder list as returned by
        ``getFolderList`` and a float array of the same shape holding, per
        folder, the number of files ``getFileList`` reports with end string
        ``".png"``.  Two empty lists are returned (after printing a message)
        when ``data_path`` is not a directory.
    """
    if os.path.isdir(data_path):
        targets = getFolderList(dir2Search=data_path, sortList=True)
        img_cnt = np.zeros(np.shape(targets))
        for idx, target in enumerate(targets):
            png_files = getFileList(dir2Search=os.path.join(data_path, target),
                                    endString=".png")
            img_cnt[idx] = len(png_files)
        return targets, img_cnt

    print("data_path({:s}) doesnt exist".format(data_path))
    return [], []
Ejemplo n.º 4
0
def create_data_folder(userIDTest,
                       userIDValid,
                       nos,
                       to_folder,
                       base_dir="/home/doga/DataFolder"):
    """Copy the source images into train/validation/test folders split by user.

    Images are read from
    ``<base_dir>/neuralNetHandImages_nos<nos>_rs224/imgs/<target>/*.png`` and
    copied to ``<to_folder>/conv_data_<ident>/data_{tr,va,te}/<target>/``,
    where ``<ident>`` is ``te<userIDTest>_va<userIDValid>_nos<nos>``.  Any
    existing split folders for the same identifier are removed first.  A
    per-target count table is written next to the split folders as
    ``cnt_table_<ident>.csv`` together with a ``*_sl.txt`` companion file.

    The user id of a sample is the 4th character of the second
    underscore-separated field of its file name (``<3 signID><1 userID><2
    repID>``).  Samples of ``userIDTest`` go to the test split, samples of
    ``userIDValid`` to the validation split, everything else to train.

    Args:
        userIDTest: Integer user id whose samples form the test split.
        userIDValid: Integer user id whose samples form the validation split.
        nos: Integer used in the source folder name and the identifier.
        to_folder: Directory under which the split folders are created.
        base_dir: Root directory that holds the source data folder.

    Returns:
        The identifier string ``te<userIDTest>_va<userIDValid>_nos<nos>``.
    """
    data_path_base = "neuralNetHandImages_nos" + str(nos) + "_rs224"
    # original path of data to load
    data_path = os.path.join(base_dir, data_path_base, "imgs")
    data_ident = "te{:d}_va{:d}_nos{:d}".format(userIDTest, userIDValid, nos)
    out_root = os.path.join(to_folder, "conv_data_" + data_ident)
    # Keyed by count-table column name; insertion order is train, validation,
    # test, which is also the order the folders are created in.
    split_paths = {
        "train": os.path.join(out_root, 'data_tr'),
        "validation": os.path.join(out_root, 'data_va'),
        "test": os.path.join(out_root, 'data_te'),
    }
    for split_path in split_paths.values():
        createDirIfNotExist(split_path)

    cnt_table_fileName = os.path.join(out_root,
                                      "cnt_table_" + data_ident + ".csv")
    targets = getFolderList(dir2Search=data_path, sortList=True).tolist()
    # Zero-initialised at construction and updated through ``.loc`` below:
    # writing through ``df[col].values`` or ``df[col][row]`` depends on pandas
    # handing out views and stops working under copy-on-write.
    cnt_table = pd.DataFrame(0,
                             index=targets + ["total"],
                             columns=list(split_paths) + ["total"])

    # Start from empty split folders so stale files never leak into a split.
    if all(os.path.isdir(p) for p in split_paths.values()):
        for split_path in split_paths.values():
            rmtree(split_path, ignore_errors=True)
    for split_path in split_paths.values():
        create_sub_folders(targets, split_path)

    # No cross-validation ranges are produced by this function, so this list
    # stays empty; it is kept so the ``*_sl.txt`` file is still written.
    spaces_list = []
    for t in targets:
        print(f"Start copying target {t} -->")
        source_path = os.path.join(data_path, t)
        samples = getFileList(dir2Search=source_path, endString=".png")
        split_counts = dict.fromkeys(split_paths, 0)
        for s in samples:
            # <3 signID><1 userID><2 repID>
            user_id_int = int(s.split(sep="_")[1][3])
            if user_id_int == userIDTest:
                split = "test"
            elif user_id_int == userIDValid:
                split = "validation"
            else:
                split = "train"
            copyfile(os.path.join(source_path, s),
                     os.path.join(split_paths[split], t, s))
            split_counts[split] += 1

        cnt_table.loc[t, "total"] = len(samples)
        cnt_table.loc["total", "total"] += len(samples)
        for split, count in split_counts.items():
            cnt_table.loc[t, split] = count
            cnt_table.loc["total", split] += count
        print(
            f"Copied {t} --> train({split_counts['train']}),valid({split_counts['validation']}),test({split_counts['test']})"
        )

    cnt_table.to_csv(path_or_buf=cnt_table_fileName)
    print('\n'.join(map(str, spaces_list)))
    samples_list_filename = cnt_table_fileName.replace(".csv", "_sl.txt")
    with open(samples_list_filename, 'w') as f:
        for i, item in enumerate(spaces_list):
            f.write("%s - %s\n" % (str(targets[i]), str(item)))

    return data_ident
Ejemplo n.º 5
0
def create_dataset(path_dict, user_id_dict, params_dict):
    """Build (or reuse) the train/validation/test split and return its count table.

    The source data in ``path_dict["data_base"]`` is first checked with
    ``validate_downloaded_data``; the process exits with status 21 if that
    check fails.  If all three split folders already exist and their
    per-target ``.png`` counts add up exactly to the source counts, the
    existing split is reused and its count table returned.  Otherwise the
    split folders are removed and rebuilt by copying every source image.

    The user id of a sample is the 4th character of the second
    underscore-separated field of its file name.  Samples of
    ``user_id_dict["test"]`` always go to the test split.  For the rest:

    * ``cross_valid_id`` set and ``valid`` is ``None``: samples are copied to
      train, then one of five equally sized slices of a random permutation
      (slice number ``cross_valid_id``, 1-based) is moved to validation.
    * ``cross_valid_id`` is ``None``: samples of user ``valid`` go to
      validation, everything else to train.

    Args:
        path_dict: Mapping with keys ``"data_base"``, ``"train"``,
            ``"valid"`` and ``"test"`` (directory paths).
        user_id_dict: Mapping with keys ``"test"``, ``"valid"`` and
            ``"cross_valid_id"``.
        params_dict: Mapping with keys ``"exp_ident"`` (suffix of the count
            table file name) and ``"randomSeed"`` (seed for numpy and torch).

    Returns:
        ``pandas.DataFrame`` indexed by target name plus a ``"total"`` row,
        with columns ``train``, ``validation``, ``test`` and ``total``.  When
        the split is rebuilt the table is also written to
        ``cnt_table<exp_ident>.csv`` in the parent directory of the train
        folder, together with a ``*_sl.txt`` file listing the validation
        slices.
    """
    data_path = path_dict["data_base"]  # original path of data to load
    train_path = path_dict["train"]  # train data to create
    valid_path = path_dict["valid"]  # valid data to create
    test_path = path_dict["test"]  # test data to create
    cnt_table_fileName = os.path.join(
        os.path.abspath(os.path.join(path_dict["train"], os.pardir)),
        "cnt_table" + params_dict["exp_ident"] + ".csv")

    img_cnt_ok_all, targets, cnt_vec_all = validate_downloaded_data(data_path)
    if not img_cnt_ok_all:
        print("download the data again!!")
        sys.exit(21)

    table_rows = targets.copy()
    table_rows.append("total")
    cnt_table = pd.DataFrame(index=table_rows,
                             columns=["train", "validation", "test", "total"])
    # NOTE(review): the table is filled through ``.values`` and chained
    # ``cnt_table[col][row]`` writes throughout this function.  That relies on
    # pandas returning views and will not work with copy-on-write enabled.
    # Left unchanged here because the returned table's dtype is visible to
    # callers - migrate to ``.loc`` once callers are checked.
    for col in cnt_table.columns:
        cnt_table[col].values[:] = 0

    if os.path.isdir(train_path) and os.path.isdir(
            valid_path) and os.path.isdir(test_path):
        try:
            _, img_cnt_tr = count_data_in_folder(train_path)
            cnt_table["train"].values[:-1] = img_cnt_tr
            _, img_cnt_va = count_data_in_folder(valid_path)
            cnt_table["validation"].values[:-1] = img_cnt_va
            _, img_cnt_te = count_data_in_folder(test_path)
            cnt_table["test"].values[:-1] = img_cnt_te
            img_cnt_split = img_cnt_tr + img_cnt_va + img_cnt_te
            cnt_table["total"].values[:-1] = img_cnt_split
            cnt_table[-1:].values[:] = np.sum(cnt_table[:-1].values[:], axis=0)
            # Compare per target: summing the differences would let a surplus
            # in one target hide a shortfall in another.
            if np.array_equal(cnt_vec_all, img_cnt_split):
                return cnt_table
        except Exception as err:
            # An unreadable or inconsistent existing split is simply rebuilt;
            # KeyboardInterrupt/SystemExit are deliberately not caught.
            print("existing split could not be verified ({}), rebuilding".
                  format(err))
        rmtree(train_path, ignore_errors=True)
        rmtree(valid_path, ignore_errors=True)
        rmtree(test_path, ignore_errors=True)

    create_sub_folders(targets, train_path)
    create_sub_folders(targets, valid_path)
    create_sub_folders(targets, test_path)
    # Discard whatever the reuse check above wrote into the table.
    for col in cnt_table.columns:
        cnt_table[col].values[:] = 0

    np.random.seed(seed=params_dict["randomSeed"])
    torch.random.manual_seed(params_dict["randomSeed"])

    test_user = user_id_dict["test"]
    valid_user = user_id_dict["valid"]
    cross_valid_id = user_id_dict["cross_valid_id"]
    use_cross_valid = cross_valid_id is not None and valid_user is None

    spaces_list = []
    for t in targets:
        print(f"Start copying target {t} -->")
        source_path = os.path.join(data_path, t)
        samples = funcH.getFileList(dir2Search=source_path, endString=".png")
        cnt_table["total"][t] = len(samples)
        cnt_table["total"]["total"] += len(samples)
        train_samples = []
        for s in samples:
            sample_dict = s.split(sep="_")
            # <3 signID><1 userID><2 repID>
            user_id_int = int(sample_dict[1][3])
            if test_user == user_id_int:
                copyfile(os.path.join(source_path, s),
                         os.path.join(test_path, t, s))
                cnt_table["test"][t] += 1
            elif use_cross_valid:
                # Copied to train first; the validation slice is moved out
                # after all samples of this target are in place.
                copyfile(os.path.join(source_path, s),
                         os.path.join(train_path, t, s))
                train_samples.append(os.path.join(train_path, t, s))
                cnt_table["train"][t] += 1
            elif cross_valid_id is None and valid_user == user_id_int:
                copyfile(os.path.join(source_path, s),
                         os.path.join(valid_path, t, s))
                cnt_table["validation"][t] += 1
            elif cross_valid_id is None:
                copyfile(os.path.join(source_path, s),
                         os.path.join(train_path, t, s))
                cnt_table["train"][t] += 1
            # NOTE(review): when both "cross_valid_id" and "valid" are set, a
            # non-test sample matches no branch and is not copied anywhere -
            # confirm whether that combination is meant to be supported.

        # deal with validation samples
        if use_cross_valid:
            num_of_train_samples = len(train_samples)
            perm_list = np.random.permutation(num_of_train_samples)
            # Six evenly spaced boundaries -> five slices of the permutation.
            spaces = np.array(np.floor(
                np.linspace(0.0, num_of_train_samples, num=6)),
                              dtype=int)
            fr, to = spaces[cross_valid_id - 1], spaces[cross_valid_id]
            spaces_list.append(
                list(np.array([fr, to])) + list([-1]) + list(perm_list[fr:to]))
            for i in range(fr, to):
                sample_to_move = train_samples[perm_list[i]]
                sample_new_name = sample_to_move.replace(
                    train_path, valid_path)
                os.rename(sample_to_move, sample_new_name)
                cnt_table["train"][t] -= 1
                cnt_table["validation"][t] += 1

        cnt_table["train"]["total"] += cnt_table["train"][t]
        cnt_table["validation"]["total"] += cnt_table["validation"][t]
        cnt_table["test"]["total"] += cnt_table["test"][t]
        print(
            f"Copied {t} --> train({cnt_table['train'][t]}),valid({cnt_table['validation'][t]}),test({cnt_table['test'][t]})"
        )

    pd.DataFrame.to_csv(cnt_table, path_or_buf=cnt_table_fileName)
    print('\n'.join(map(str, spaces_list)))
    samples_list_filename = cnt_table_fileName.replace(".csv", "_sl.txt")
    with open(samples_list_filename, 'w') as f:
        for i, item in enumerate(spaces_list):
            f.write("%s - %s\n" % (str(targets[i]), str(item)))

    return cnt_table