def main():
    parser = argparse.ArgumentParser(description='Select the type of reduced dataset.')
    parser.add_argument(
        "-f",
        "--filename",
        type=str,
        required=True,
        help=
        'Path to the file that contains the dictionary with the info of the reduced dataset.'
    )
    parser.add_argument("-d",
                        "--database",
                        type=str,
                        required=True,
                        help='Database; it can be cnrpark or pklot')
    args = vars(parser.parse_args())
    info_filename = args["filename"]
    database = args["database"]

    data_paths, subsets_info = getSubsets(info_filename,
                                          init_por_training_data,
                                          init_por_test_data, database)
    train_set = data_paths['train']
    test_set = data_paths['test']
    empty_count, occupied_count = count_quantity_of_classes(
        data_paths['train'])

    random.shuffle(train_set)
    random.shuffle(test_set)

    total_size_training_data_empty = int(total_size_training_data / 2)
    total_size_training_data_occupied = int(total_size_training_data / 2)
    missing_training_data_empty = total_size_training_data_empty - empty_count
    missing_training_data_occupied = total_size_training_data_occupied - occupied_count

    filename = ntpath.basename(info_filename).split('.')[0]
    subset_dir = filename + '-{}v-{}nv_dataaug-{}v-{}nv'.format(
        occupied_count, empty_count, missing_training_data_occupied,
        missing_training_data_empty)
    create_subset_directorie(subset_dir)

    print('{} {}'.format(missing_training_data_occupied,
                         missing_training_data_empty))
    datagen = ImageDataGenerator(width_shift_range=0.1,
                                 horizontal_flip=True,
                                 rotation_range=90)

    dataug_dir = os.path.join(subset_dir, 'dataaug')
    dataug_dir, made = create_subset_directorie(dataug_dir)

    total_dataaug_occupied = 0
    total_dataaug_empty = 0
    for train_data in tqdm(train_set):
        img_path = train_data['path']
        y = train_data['y']
        if y == '0':
            if missing_training_data_empty <= 0:
                continue
            missing_training_data_empty -= 1
            total_dataaug_empty += 1
        else:
            if missing_training_data_occupied <= 0:
                continue
            missing_training_data_occupied -= 1
            total_dataaug_occupied += 1
        img = load_img(img_path)  # PIL image
        x = img_to_array(img)  # NumPy array with shape (height, width, channels)
        x = x.reshape((1, ) + x.shape)  # add a batch dimension: (1, height, width, channels)

        for batch in datagen.flow(x,
                                  batch_size=1,
                                  save_to_dir=dataug_dir,
                                  save_prefix=y,
                                  save_format='jpeg'):
            dataaug_subset, total_dataaug = getPathsAndLabelDataaug(
                dataug_dir, fileext='.jpeg')
            if total_dataaug < (total_dataaug_occupied + total_dataaug_empty):
                print('{} {} {}'.format(total_dataaug, total_dataaug_occupied,
                                        total_dataaug_empty))
            else:
                break

    dataaug_subset, total_dataaug = getPathsAndLabelDataaug(dataug_dir,
                                                            fileext='.jpeg')
    print(total_dataaug)
    train_set.extend(dataaug_subset)
    random.shuffle(train_set)

    saveSubsetsInfo(
        subsets=subsets_info,
        subsets_dir=subset_dir,
        database=database,
        extra_info=
        'Training subset complemented with {}: {} vehicles and {} nonvehicles'.
        format('data augmentation', total_dataaug_occupied,
               total_dataaug_empty))

    save_subset(subset=train_set, subsets_dir=subset_dir, type='train')
    save_subset(subset=test_set, subsets_dir=subset_dir, type='test')
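The helper count_quantity_of_classes is not shown in these snippets. Judging only from how it is called (it receives a subset as a list of {'path', 'y'} records and returns the empty and occupied counts), a minimal sketch could look like this; the actual implementation may differ.

def count_quantity_of_classes(data):
    # Count empty ('0') versus occupied samples in a list of {'path', 'y'} records.
    empty_count = 0
    occupied_count = 0
    for item in data:
        if item['y'] == '0':
            empty_count += 1
        else:
            occupied_count += 1
    return empty_count, occupied_count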
def main():
    parser = argparse.ArgumentParser(description='Select the type of reduced dataset.')
    parser.add_argument(
        "-f",
        "--filename",
        type=str,
        required=True,
        help=
        'Path to the file that contains the dictionary with the info of the reduced dataset.'
    )
    parser.add_argument(
        "-ext",
        "--extra-dataset",
        type=str,
        required=True,
        help=
        'Complementary dataset chosen for balancing; it can be carnd or mio-tcd'
    )
    parser.add_argument("-d",
                        "--database",
                        type=str,
                        required=True,
                        help='Database; it can be cnrpark or pklot')
    args = vars(parser.parse_args())

    info_filename = args["filename"]
    database = args["database"]
    complementary_dataset = args["extra_dataset"]

    data_paths, subsets_info = getSubsets(info_filename,
                                          init_por_training_data,
                                          init_por_test_data, database)
    train_set = data_paths['train']
    test_set = data_paths['test']
    empty_count, occupied_count = count_quantity_of_classes(
        data_paths['train'])

    if complementary_dataset == 'carnd':
        complementary_vehicles_path = path_carnd_vehicles
        complementary_nonvehicles_path = path_carnd_nonvehicles
        ext = 'png'
    else:
        complementary_vehicles_path = path_miotcd_vehicles
        complementary_nonvehicles_path = path_miotcd_nonvehicles
        ext = 'jpg'
    complementary_vehicles_data, total_complementary_vehicles = get_paths_and_label(
        complementary_vehicles_path, '1', ext)
    complementary_nonvehicles_data, total_complementary_nonvehicles = get_paths_and_label(
        complementary_nonvehicles_path, '0', ext)

    total_size_training_data_empty = int(total_size_training_data / 2)
    total_size_training_data_occupied = int(total_size_training_data / 2)

    missing_training_data_empty = total_size_training_data_empty - empty_count
    missing_training_data_occupied = total_size_training_data_occupied - occupied_count

    random.shuffle(complementary_nonvehicles_data)
    for i in range(missing_training_data_empty):
        train_set.append(complementary_nonvehicles_data[i])
    random.shuffle(complementary_vehicles_data)
    for i in range(missing_training_data_occupied):
        train_set.append(complementary_vehicles_data[i])

    random.shuffle(train_set)
    random.shuffle(test_set)

    filename = ntpath.basename(info_filename).split('.')[0]
    subset_dir = filename + '-{}v-{}nv_complementary-{}-{}v-{}nv'.format(
        occupied_count, empty_count, complementary_dataset,
        missing_training_data_occupied, missing_training_data_empty)
    create_subset_directorie(subset_dir)

    saveSubsetsInfo(
        subsets=subsets_info,
        subsets_dir=subset_dir,
        database=database,
        extra_info=
        'Training subset complemented with {}: {} vehicles and {} nonvehicles'.
        format(complementary_dataset, missing_training_data_occupied,
               missing_training_data_empty))

    save_subset(subset=train_set, subsets_dir=subset_dir, type='train')
    save_subset(subset=test_set, subsets_dir=subset_dir, type='test')
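get_paths_and_label is defined elsewhere. Based on its call sites (a directory, a class label, and a file extension, returning the list of {'path', 'y'} records and their count), a plausible sketch is the following; the glob pattern is an assumption.

import glob
import os


def get_paths_and_label(directory, label, fileext='png'):
    # Collect every image with the given extension and pair it with the class label.
    # '*' + fileext matches both 'png' and '.jpg' style extension arguments.
    paths = glob.glob(os.path.join(directory, '*' + fileext))
    data = [{'path': path, 'y': label} for path in paths]
    return data, len(data)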
Example 3
def main():
    parser = argparse.ArgumentParser(description='Select the type of reduced dataset.')
    parser.add_argument(
        "-f",
        "--filename",
        type=str,
        required=True,
        help=
        'Path to the file that contains the dictionary with the info of the reduced dataset.'
    )

    args = vars(parser.parse_args())

    info_filename = args["filename"]

    # The dataset filename is the same as the labels filename, with 'labels' replaced by 'dataset'.
    dataset_filename = info_filename.replace("labels", "dataset")
    dataset_directory = ntpath.basename(dataset_filename).split(
        '.')[0] + "-balance" + "_csv"
    subsets_path = "C:\\Eduardo\\ProyectoFinal\\Proyecto\\ProyectoFinal\\Train\\subsets"
    subsets_path = os.path.join(subsets_path, dataset_directory)

    if os.path.isdir(subsets_path):
        print('This subset was already made')
        #return
    else:
        print("Make dir: {}".format(subsets_path))
        os.mkdir(subsets_path)

    if not os.path.isfile(info_filename):
        print('Insert a valid file')
        return

    with open(info_filename, "rb") as fp:  # Unpickling
        images_info_reduced = pickle.load(fp)

    spaces = extractUniqueItemsByKey(images_info_reduced, 'space')

    spaces_distribution = asignNumberSpacesForSubsets(len(spaces))

    # For each day, select random spaces for each subset (train, test).
    images_info_reduced.sort(key=lambda x: x['date'])
    subsets = {'train': [], 'test': []}
    current_date = images_info_reduced[0]['date']
    spaces_subsets = getSpacesSubsets(spaces, spaces_distribution)

    data_paths = {'train': [], 'test': []}

    for image_info in tqdm(images_info_reduced):
        if current_date != image_info['date']:
            current_date = image_info['date']
            spaces_subsets = getSpacesSubsets(spaces, spaces_distribution)
        # Decide whether this space goes to the train or test subset.
        subset = spaces_subsets[image_info['space']]
        image_path = os.path.join(path_patches, image_info['filePath'])
        data_path_state = {'path': image_path, 'y': image_info['state']}
        data_paths[subset].append(data_path_state)
        subsets[subset].append(image_info)

    empty_count = 0
    ocuppied_count = 0
    for train_image in data_paths['train']:
        if train_image['y'] == '0':
            empty_count += 1
        else:
            ocuppied_count += 1
    print("Train images empty state: {} occupied state: {}".format(
        empty_count, ocuppied_count))
    # If the training images are imbalanced, i.e. they contain more empty or more
    # occupied samples, they are balanced as follows:
    # - if the empty-space images are in excess, the excess is moved to the test set;
    # - if the occupied-space images are in excess, empty-space images are moved
    #   from the test set into the training set.
    if empty_count > ocuppied_count:
        train_paths = data_paths['train']
        i = 0
        while i < (empty_count - ocuppied_count):
            random_index = random.randrange(len(train_paths))
            if train_paths[random_index]['y'] == '0':
                data_path_state = train_paths.pop(random_index)
                data_paths['test'].append(data_path_state)
                image_info = subsets['train'].pop(random_index)
                subsets['test'].append(image_info)
                i += 1
        data_paths['train'] = train_paths
    elif ocuppied_count > empty_count:
        test_paths = data_paths['test']
        i = 0
        while i < (ocuppied_count - empty_count):
            random_index = random.randrange(len(test_paths))
            if test_paths[random_index]['y'] == '0':
                data_path_state = test_paths.pop(random_index)
                data_paths['train'].append(data_path_state)
                image_info = subsets['test'].pop(random_index)
                subsets['train'].append(image_info)
                i += 1
        data_paths['test'] = test_paths

    for key, value in subsets.items():
        spacesIn = extractUniqueItemsByKey(value, 'space')
        empty_count = 0
        ocuppied_count = 0
        overcast_count = 0
        sunny_count = 0
        rainy_count = 0
        for v in value:
            if v['state'] == '0':
                empty_count += 1
            else:
                ocuppied_count += 1
            if v['weather'] == 'OVERCAST':
                overcast_count += 1
            elif v['weather'] == 'SUNNY':
                sunny_count += 1
            elif v['weather'] == 'RAINY':
                rainy_count += 1
        info = "Subset {} size: {} from {} spaces - empty: {} occupied: {} - overcast: {} sunny: {} rainy: {}".format(
            key, len(value), len(spacesIn), empty_count, ocuppied_count,
            overcast_count, sunny_count, rainy_count)
        print(info)
        with open(os.path.join(subsets_path, 'data_info.txt'), "a") as finfo:
            finfo.write(info + '\n')

    #with open(os.path.join(subsets_path, 'data_paths.txt'), "wb") as fp:  # Pickling
    #		pickle.dump(data_paths, fp)
    with open(os.path.join(subsets_path, 'data_paths.txt'), 'w') as outfile:
        json.dump(data_paths, outfile)

    save_subset(data_paths['train'], dataset_directory, 'train')
    save_subset(data_paths['test'], dataset_directory, 'test')
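asignNumberSpacesForSubsets and getSpacesSubsets are also external helpers. A sketch consistent with their use above (the second one maps every parking-space id to 'train' or 'test' and is re-drawn for each day) might be the following; the 80/20 split ratio is an assumption.

import random


def asignNumberSpacesForSubsets(total_spaces, train_ratio=0.8):
    # Decide how many spaces go to each subset; the ratio is an assumption.
    train_count = int(total_spaces * train_ratio)
    return {'train': train_count, 'test': total_spaces - train_count}


def getSpacesSubsets(spaces, spaces_distribution):
    # Randomly assign each space id to 'train' or 'test' according to the distribution.
    shuffled = list(spaces)
    random.shuffle(shuffled)
    return {
        space: 'train' if i < spaces_distribution['train'] else 'test'
        for i, space in enumerate(shuffled)
    }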
def main():
	train_set = []
	test_set = []

	carnd_vehicles_data, total_carnd_vehicles = get_paths_and_label(path_carnd_vehicles, '1')
	train_set.extend(carnd_vehicles_data)
	carnd_nonvehicles_data, total_carnd_nonvehicles = get_paths_and_label(path_carnd_nonvehicles, '0')
	train_set.extend(carnd_nonvehicles_data)

	left_training_vehicles = int(abs((size_training_data / 2) - total_carnd_vehicles))
	left_training_nonvehicles = int(abs((size_training_data / 2) - total_carnd_nonvehicles))

	miotcd_vehicles_data, t = get_paths_and_label(path_miotcd_vehicles, '1', fileext='.jpg')
	random.shuffle(miotcd_vehicles_data)
	for i in range(left_training_vehicles):
		train_set.append(miotcd_vehicles_data[i])

	miotcd_nonvehicles_data, t = get_paths_and_label(path_miotcd_nonvehicles, '0', fileext='.jpg')
	random.shuffle(miotcd_nonvehicles_data)
	for i in range(left_training_nonvehicles):
		train_set.append(miotcd_nonvehicles_data[i])

	random.shuffle(train_set)

	print(len(train_set))
	for i in range(10):
		print(random.choice(train_set))

	# test set
	with open(info_filename, "rb") as fp:  # Unpickling
		images_info_reduced = pickle.load(fp)

	random.shuffle(images_info_reduced)
	empty_count = 0
	ocuppied_count = 0
	subsets_info = {'test':[]}

	for image_info in images_info_reduced:
		state = image_info['state']
		if state == '0':
			if empty_count < int(size_testing_data / 2):
				empty_count += 1
			else:
				continue
		elif state == '1':
			if ocuppied_count < int(size_testing_data / 2):
				ocuppied_count += 1
			else:
				continue
		image_path = os.path.join(path_patches, image_info['filePath'])
		test_set.append({'path': image_path, 'y': image_info['state']})
		subsets_info['test'].append(image_info)

	test_file = ntpath.basename(info_filename).split('.')[0]
	subset_dir = 'train_CarND-{}v-{}nv_MIO-TCD-{}v-{}nv_test_{}-{}'.format(total_carnd_vehicles, total_carnd_nonvehicles, left_training_vehicles, left_training_nonvehicles, test_file, len(test_set))
	create_subset_directorie(subset_dir)

	save_subsets_info(subsets=subsets_info, subsets_dir=subset_dir, extra_info='Special training subset composed of:\n vehicles: {} from CarND-Project5 and {} from MIO-TCD\n nonvehicles: {} from CarND-Project5 and {} from MIO-TCD'.format(total_carnd_vehicles, left_training_vehicles, total_carnd_nonvehicles, left_training_nonvehicles))

	save_subset(subset=train_set, subsets_dir=subset_dir, type='train')
	save_subset(subset=test_set, subsets_dir=subset_dir, type='test')

	print(len(test_set))
	for i in range(10):
		print(random.choice(test_set))
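save_subset is not included in these snippets either. It is called both with keywords (subset=, subsets_dir=, type=) and positionally; a minimal sketch that writes one row per {'path', 'y'} record is shown below, with the CSV output format being an assumption.

import csv
import os


def save_subset(subset, subsets_dir, type):
    # Write the subset as '<type>.csv' inside subsets_dir, one 'path,label' row per sample.
    out_path = os.path.join(subsets_dir, type + '.csv')
    with open(out_path, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        for item in subset:
            writer.writerow([item['path'], item['y']])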
Example 5
def main():
    parser = argparse.ArgumentParser(description='Select the type of reduced dataset.')
    parser.add_argument(
        "-f",
        "--filename",
        type=str,
        required=True,
        help=
        'Path to the file that contains the dictionary with the info of the reduced dataset.'
    )
    parser.add_argument("-d",
                        "--database",
                        type=str,
                        required=True,
                        help='Database; it can be cnrpark or pklot')
    args = vars(parser.parse_args())
    info_filename = args["filename"]
    database = args["database"]

    data_paths, subsets_info = getSubsets(info_filename,
                                          init_por_training_data,
                                          init_por_test_data, database)
    train_set = data_paths['train']
    test_set = data_paths['test']
    empty_count, occupied_count = count_quantity_of_classes(
        data_paths['train'])

    random.shuffle(train_set)
    random.shuffle(test_set)

    total_size_training_data_empty = int(total_size_training_data / 2)
    total_size_training_data_occupied = int(total_size_training_data / 2)
    missing_training_data_empty = total_size_training_data_empty - empty_count
    missing_training_data_occupied = total_size_training_data_occupied - occupied_count

    filename = ntpath.basename(info_filename).split('.')[0]
    subset_dir = filename + '-{}v-{}nv_short'.format(
        total_size_training_data_occupied, total_size_training_data_empty)
    create_subset_directorie(subset_dir)

    print('{} {}'.format(missing_training_data_occupied,
                         missing_training_data_empty))

    total_data_occupied = 0
    total_data_empty = 0

    train_set_used = []
    train_set_not_used = []
    empty_data_removed = 0
    occupied_data_removed = 0
    for train_data in tqdm(train_set):
        y = train_data['y']
        if y == '1':
            if total_data_occupied >= total_size_training_data_occupied:
                train_set_not_used.append(train_data)
                occupied_data_removed += 1
                continue
            total_data_occupied += 1
        else:
            if total_data_empty >= total_size_training_data_empty:
                train_set_not_used.append(train_data)
                empty_data_removed += 1
                continue
            total_data_empty += 1
        train_set_used.append(train_data)

    print(train_set_used[0])
    print(train_set_not_used[0])

    random.shuffle(train_set_used)

    saveSubsetsInfo(
        subsets=subsets_info,
        subsets_dir=subset_dir,
        database=database,
        extra_info=
        'Short training subset; this info is not fully accurate because the training set was modified.\nThe size of the training set is {}. Removed data: {} empty, {} occupied'
        .format(total_size_training_data, empty_data_removed,
                occupied_data_removed))

    save_subset(subset=train_set_used, subsets_dir=subset_dir, type='train')
    save_subset(subset=train_set_not_used,
                subsets_dir=subset_dir,
                type='train_notused')
    save_subset(subset=test_set, subsets_dir=subset_dir, type='test')
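Each of these scripts would normally end with the usual entry-point guard; the script name and argument values in the comment below are only illustrative.

if __name__ == '__main__':
    # Example invocation (filename and paths are hypothetical):
    #   python make_subset.py -f cnrpark_labels_reduced.txt -d cnrpark
    main()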