Example #1
def main():

    #==============================================================CHECKS==========================================================================
    #Check if there is a tfrecord_filename entered
    if not FLAGS.tfrecord_filename:
        raise ValueError(
            'tfrecord_filename is empty. Please state a tfrecord_filename argument.'
        )

    #Check if there is a dataset directory entered
    if not FLAGS.dataset_dir:
        raise ValueError(
            'dataset_dir is empty. Please state a dataset_dir argument.')

    #If the TFRecord files already exist in the directory, then exit without creating the files again
    if _dataset_exists(dataset_dir=FLAGS.dataset_dir,
                       _NUM_SHARDS=FLAGS.num_shards,
                       output_filename=FLAGS.tfrecord_filename):
        print('Dataset files already exist. Exiting without re-creating them.')
        return None
    #==============================================================END OF CHECKS===================================================================

    #Get the photo filenames, the sorted class names, and a held-out test set from parsing the subdirectories.
    photo_filenames, class_names, test_set = permutate(FLAGS.dataset_dir,
                                                       10000, 400)

    #Map each class name to a unique integer id for predictions later
    class_names_to_ids = dict(zip(class_names, range(len(class_names))))

    #Find the number of validation examples we need
    #num_validation = int(FLAGS.validation_size * len(photo_filenames))

    #Divide the training datasets into train and test:
    random.seed(FLAGS.random_seed)
    random.shuffle(photo_filenames)
    random.shuffle(test_set)
    #training_filenames = photo_filenames[num_validation:]
    #validation_filenames = photo_filenames[:num_validation]
    training_filenames = photo_filenames
    validation_filenames = test_set

    # First, convert the training and validation sets.
    _convert_dataset('train',
                     training_filenames,
                     class_names_to_ids,
                     dataset_dir=FLAGS.dataset_dir,
                     tfrecord_filename=FLAGS.tfrecord_filename,
                     _NUM_SHARDS=FLAGS.num_shards)
    _convert_dataset('validation',
                     validation_filenames,
                     class_names_to_ids,
                     dataset_dir=FLAGS.dataset_dir,
                     tfrecord_filename=FLAGS.tfrecord_filename,
                     _NUM_SHARDS=FLAGS.num_shards)

    # Finally, write the labels file:
    labels_to_class_names = dict(zip(range(len(class_names)), class_names))
    write_label_file(labels_to_class_names, FLAGS.dataset_dir)

    print('\nFinished converting the %s dataset!' % (FLAGS.tfrecord_filename))
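
All of these converters lean on helpers that the snippets never show. For orientation, here is a minimal, hypothetical sketch of what a _get_filenames_and_classes helper usually looks like for a directory-per-class layout; the exact signature and return values vary between the examples on this page, so treat it as an assumption rather than the real implementation.

import os

def _get_filenames_and_classes(dataset_dir):
    """Sketch: walk one level of class subdirectories under dataset_dir
    (dataset_dir/<class_name>/<image>.jpg) and collect every file path
    plus the sorted list of class names."""
    class_names = []
    photo_filenames = []
    for entry in sorted(os.listdir(dataset_dir)):
        class_dir = os.path.join(dataset_dir, entry)
        if not os.path.isdir(class_dir):
            continue
        class_names.append(entry)
        for filename in os.listdir(class_dir):
            photo_filenames.append(os.path.join(class_dir, filename))
    return photo_filenames, class_names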
Example #2
def main():

    #==============================================================CHECKS==========================================================================
    # Check if there is a tfrecord_filename entered
    if not FLAGS.tfrecord_filename:
        raise ValueError('tfrecord_filename is empty. Please state a tfrecord_filename argument.')

    # Check if there is a dataset directory entered
    if not FLAGS.dataset_dir:
        raise ValueError('dataset_dir is empty. Please state a dataset_dir argument.')

    # Check if there is a tfrecord directory entered
    if not FLAGS.tfrecord_dir:
        raise ValueError('tfrecord_dir is empty. Please state a tfrecord_dir argument.')

    if not os.path.exists(FLAGS.tfrecord_dir):
        os.makedirs(FLAGS.tfrecord_dir)

    # If the TFRecord files already exist in the directory, then exit without creating the files again
    if _dataset_exists(dataset_dir=FLAGS.tfrecord_dir,
                       _NUM_SHARDS=FLAGS.num_shards,
                       output_filename=FLAGS.tfrecord_filename):
        print('Dataset files already exist. Exiting without re-creating them.')
        return None
    #==============================================================END OF CHECKS===================================================================

    # Get a list of photo_filenames and a list of sorted class names from parsing the subdirectories.
    photo_filenames, class_ids = _get_filenames_and_classes(FLAGS.dataset_dir)

    # Map each class id to a serial integer for predictions later
    class_ids_to_serial = dict(zip(class_ids, range(len(class_ids))))

    # Write the labels file:
    serial_to_class_ids = dict(zip(range(len(class_ids)), class_ids))
    write_label_file(serial_to_class_ids, FLAGS.dataset_dir)

    # Find the number of validation examples we need
    num_validation = int(float(FLAGS.validation_size) * len(photo_filenames))

    # Divide the training datasets into train and test:
    random.seed(FLAGS.random_seed)
    random.shuffle(photo_filenames)
    training_filenames = photo_filenames[num_validation:]
    validation_filenames = photo_filenames[:num_validation]

    # Convert the training and validation sets.
    _convert_dataset('train', training_filenames, class_ids_to_serial,
                     dataset_dir=FLAGS.tfrecord_dir,
                     tfrecord_filename=FLAGS.tfrecord_filename,
                     _NUM_SHARDS=FLAGS.num_shards, simulate=FLAGS.simulate)
    _convert_dataset('validation', validation_filenames, class_ids_to_serial,
                     dataset_dir=FLAGS.tfrecord_dir,
                     tfrecord_filename=FLAGS.tfrecord_filename,
                     _NUM_SHARDS=FLAGS.num_shards, simulate=FLAGS.simulate)

    print('\nFinished converting the %s dataset!' % (FLAGS.tfrecord_filename))
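
The _dataset_exists guard is likewise assumed rather than shown. In the TF-Slim-style conversion scripts these examples resemble, it simply probes for every expected shard file; a rough sketch under that assumption follows (the shard naming pattern is illustrative and may not match a given repo):

import os

def _dataset_exists(dataset_dir, _NUM_SHARDS, output_filename):
    """Sketch: return True only if every expected TFRecord shard for
    both splits already exists in dataset_dir."""
    for split_name in ['train', 'validation']:
        for shard_id in range(_NUM_SHARDS):
            tfrecord_path = os.path.join(
                dataset_dir, '%s_%s_%05d-of-%05d.tfrecord' %
                (output_filename, split_name, shard_id, _NUM_SHARDS))
            if not os.path.exists(tfrecord_path):
                return False
    return True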
Example #3
def main():

    #==============================================================CHECKS==========================================================================
    #Check if there is a tfrecord_filename entered
    if not FLAGS.tfrecord_filename:
        raise ValueError(
            'tfrecord_filename is empty. Please state a tfrecord_filename argument.'
        )

    #Check if there is a dataset directory entered
    if not FLAGS.dataset_dir:
        raise ValueError(
            'dataset_dir is empty. Please state a dataset_dir argument.')

    #If the TFRecord files already exist in the directory, then exit without creating the files again
    if _dataset_exists(dataset_dir=FLAGS.dataset_dir,
                       _NUM_SHARDS=FLAGS.num_shards,
                       output_filename=FLAGS.tfrecord_filename):
        print('Dataset files already exist. Exiting without re-creating them.')
        return None
    #==============================================================END OF CHECKS===================================================================

    #Get the training and validation photo filenames, plus the sorted class names, from parsing the subdirectories.
    photo__train_filenames, photo__val_filenames, class_names = _get_filenames_and_classes(
        FLAGS.dataset_dir)

    #Map each class name to a unique integer id for predictions later
    class_names_to_ids = dict(zip(class_names, range(len(class_names))))

    training_filenames = photo__train_filenames
    validation_filenames = photo__val_filenames

    # First, convert the training and validation sets.
    _convert_dataset('train',
                     training_filenames,
                     class_names_to_ids,
                     dataset_dir=FLAGS.dataset_dir,
                     tfrecord_filename=FLAGS.tfrecord_filename,
                     _NUM_SHARDS=FLAGS.num_shards)
    _convert_dataset('validation',
                     validation_filenames,
                     class_names_to_ids,
                     dataset_dir=FLAGS.dataset_dir,
                     tfrecord_filename=FLAGS.tfrecord_filename,
                     _NUM_SHARDS=FLAGS.num_shards)

    # Finally, write the labels file:
    labels_to_class_names = dict(zip(range(len(class_names)), class_names))
    write_label_file(labels_to_class_names, FLAGS.dataset_dir)

    print('\nFinished converting the %s dataset!' % (FLAGS.tfrecord_filename))
Example #4
def main(argv):

    if not FLAGS.tfrecord_filename:
        raise ValueError(
            "tfrecord_filename is empty. Please state a tfrecord_filename argument."
        )

    if not FLAGS.dataset_dir:
        raise ValueError(
            "dataset_dir is empty. Please state a dataset_dir argument.")

    if _dataset_exists(
            dataset_dir=FLAGS.dataset_dir,
            _NUM_SHARDS=FLAGS.num_shards,
            output_filename=FLAGS.tfrecord_filename,
    ):
        return None
    photo_filenames, class_names = _get_filenames_and_classes(
        FLAGS.dataset_dir)

    class_names_to_ids = dict(zip(class_names, range(len(class_names))))

    num_validation = int(FLAGS.validation_size * len(photo_filenames))
    random.seed(FLAGS.random_seed)
    random.shuffle(photo_filenames)
    training_filenames = photo_filenames[num_validation:]
    validation_filenames = photo_filenames[:num_validation]

    _convert_dataset(
        "train",
        training_filenames,
        class_names_to_ids,
        dataset_dir=FLAGS.dataset_dir,
        tfrecord_filename=FLAGS.tfrecord_filename,
        _NUM_SHARDS=FLAGS.num_shards,
    )
    _convert_dataset(
        "validation",
        validation_filenames,
        class_names_to_ids,
        dataset_dir=FLAGS.dataset_dir,
        tfrecord_filename=FLAGS.tfrecord_filename,
        _NUM_SHARDS=FLAGS.num_shards,
    )

    # Finally, write the labels file:
    labels_to_class_names = dict(zip(range(len(class_names)), class_names))
    write_label_file(labels_to_class_names, FLAGS.dataset_dir)
Example #5
def main():
    if not FLAGS.tfrecord_filename:
        raise ValueError(
            'tfrecord_filename is empty. Please state a tfrecord_filename argument.'
        )

    #Check if there is a dataset directory entered
    if not FLAGS.dataset_dir:
        raise ValueError(
            'dataset_dir is empty. Please state a dataset_dir argument.')

    #If the TFRecord files already exist in the directory, then exit without creating the files again
    if _dataset_exists(dataset_dir=FLAGS.dataset_dir,
                       _NUM_SHARDS=FLAGS.num_shards,
                       output_filename=FLAGS.tfrecord_filename):
        print('Dataset files already exist. Exiting without re-creating them.')
        return None

    photo_filenames, class_names = _get_filenames_and_classes(
        FLAGS.dataset_dir)

    class_names_to_ids = dict(zip(class_names, range(len(class_names))))

    num_validation = int(FLAGS.validation_size * len(photo_filenames))

    random.seed(FLAGS.random_seed)
    random.shuffle(photo_filenames)
    training_filenames = photo_filenames[num_validation:]
    validation_filenames = photo_filenames[:num_validation]

    _convert_dataset('train',
                     training_filenames,
                     class_names_to_ids,
                     dataset_dir=FLAGS.dataset_dir,
                     tfrecord_filename=FLAGS.tfrecord_filename,
                     _NUM_SHARDS=FLAGS.num_shards)
    _convert_dataset('validation',
                     validation_filenames,
                     class_names_to_ids,
                     dataset_dir=FLAGS.dataset_dir,
                     tfrecord_filename=FLAGS.tfrecord_filename,
                     _NUM_SHARDS=FLAGS.num_shards)
    # Finally, write the labels file:
    labels_to_class_names = dict(zip(range(len(class_names)), class_names))
    write_label_file(labels_to_class_names, FLAGS.dataset_dir)
    write_data_summary(num_validation, len(photo_filenames), FLAGS.dataset_dir)
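
Example #5 is the only variant that also calls write_data_summary. Its definition is not shown; judging from the call site, a plausible minimal version just records the split sizes (the output filename and format here are assumptions):

import os

def write_data_summary(num_validation, num_total, dataset_dir,
                       filename='data_summary.txt'):
    """Sketch: persist the train/validation split sizes next to the data."""
    with open(os.path.join(dataset_dir, filename), 'w') as f:
        f.write('total: %d\n' % num_total)
        f.write('train: %d\n' % (num_total - num_validation))
        f.write('validation: %d\n' % num_validation)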
Example #6
def main():

    #=============CHECKS==============
    #Check if there is a tfrecord_filename entered
    if not FLAGS.tfrecord_filename:
        raise ValueError(
            'tfrecord_filename is empty. Please state a tfrecord_filename argument.'
        )

    #If the TFRecord files already exist in the directory, then exit without creating the files again
    if _dataset_exists(dataset_file=FLAGS.dataset_file,
                       _NUM_SHARDS=FLAGS.num_shards,
                       output_filename=FLAGS.tfrecord_filename):
        print('Dataset files already exist. Exiting without re-creating them.')
        return None
    #==========END OF CHECKS============

    #Get the list of photo_filenames and the per-file class labels parsed from the dataset file.
    photo_filenames, class_names = _get_filenames_and_classes(
        FLAGS.dataset_file)

    #Map each label to a binary class id (1 for "normal", 0 otherwise)
    class_ids = [1 if label == "normal" else 0 for label in class_names]

    #Find the number of validation examples we need
    num_validation = int(FLAGS.validation_size * len(photo_filenames))

    training_filenames = photo_filenames[num_validation:]
    validation_filenames = photo_filenames[:num_validation]
    training_labels = class_ids[num_validation:]
    validation_labels = class_ids[:num_validation]
    # First, convert the training and validation sets.
    _convert_dataset('train',
                     training_filenames,
                     training_labels,
                     dataset_file=FLAGS.dataset_file,
                     tfrecord_filename=FLAGS.tfrecord_filename,
                     _NUM_SHARDS=FLAGS.num_shards)
    _convert_dataset('validation',
                     validation_filenames,
                     validation_labels,
                     dataset_file=FLAGS.dataset_file,
                     tfrecord_filename=FLAGS.tfrecord_filename,
                     _NUM_SHARDS=FLAGS.num_shards)

    print('\nFinished converting the %s dataset!' % (FLAGS.tfrecord_filename))
Example #7
def main():

    #==============================================================CHECKS==========================================================================
    #Check if there is a tfrecord_filename entered
    if not FLAGS.tfrecord_filename:
        raise ValueError(
            'tfrecord_filename is empty. Please state a tfrecord_filename argument.'
        )

    #Check if there is a dataset directory entered
    if not FLAGS.dataset_dir:
        raise ValueError(
            'dataset_dir is empty. Please state a dataset_dir argument.')

    #If the TFRecord files already exist in the directory, then exit without creating the files again
    if _dataset_exists(dataset_dir=FLAGS.dataset_dir,
                       _NUM_SHARDS=FLAGS.num_shards,
                       output_filename=FLAGS.tfrecord_filename):
        print('Dataset files already exist. Exiting without re-creating them.')
        return None
    #==============================================================END OF CHECKS===================================================================

    #Get a list of photo_filenames like ['123.jpg', '456.jpg'...] and a list of sorted class names from parsing the subdirectories.
    photo_filenames, class_names = _get_filenames_and_classes(
        FLAGS.dataset_dir)

    print('{}; {}'.format(len(photo_filenames), len(class_names)))
    #Map each class name to a unique integer id for predictions later
    class_names_to_ids = dict(zip(class_names, range(len(class_names))))

    #Find the number of validation examples we need
    num_validation = int(FLAGS.validation_size * len(photo_filenames))

    # Divide the training datasets into train and test:
    random.seed(FLAGS.random_seed)
    random.shuffle(photo_filenames)
    training_filenames = photo_filenames[num_validation:]
    validation_filenames = photo_filenames[:num_validation]

    # Save the validation image list to a CSV file
    with open("./data/validate.csv", 'w') as f:
        f.write('IMAGE_NAME,CLASS_NAME\n')
        for filepath in validation_filenames:
            head, filename = os.path.split(filepath)
            class_name = os.path.basename(os.path.dirname(filepath))
            f.write(str(filename) + ',' + str(class_name) + '\n')

    # First, convert the training and validation sets.
    _convert_dataset('train',
                     training_filenames,
                     class_names_to_ids,
                     dataset_dir=FLAGS.dataset_dir,
                     tfrecord_filename=FLAGS.tfrecord_filename,
                     _NUM_SHARDS=FLAGS.num_shards)
    _convert_dataset('validation',
                     validation_filenames,
                     class_names_to_ids,
                     dataset_dir=FLAGS.dataset_dir,
                     tfrecord_filename=FLAGS.tfrecord_filename,
                     _NUM_SHARDS=FLAGS.num_shards)

    # Finally, write the labels file:
    labels_to_class_names = dict(zip(range(len(class_names)), class_names))
    write_label_file(labels_to_class_names, FLAGS.dataset_dir)

    print('\nFinished converting the %s dataset!' % (FLAGS.tfrecord_filename))
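
Because Example #7 persists a validation manifest, a later evaluation step can recover the (filename, class_name) pairs. A small, hypothetical read-back helper using only the standard library (the ./data/validate.csv path is the one hardcoded above):

import csv

def load_validation_manifest(path='./data/validate.csv'):
    """Sketch: read back the (filename, class_name) pairs written above."""
    with open(path, newline='') as f:
        reader = csv.reader(f)
        next(reader)  # skip the IMAGE_NAME,CLASS_NAME header row
        return [(row[0], row[1]) for row in reader]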
Example #8
def main():
    #=============CHECKS==============
    #Check if there is a tfrecord_filename entered
    if not FLAGS.tfrecord_filename:
        raise ValueError(
            'tfrecord_filename is empty. Please state a tfrecord_filename argument.'
        )

    #Check if there is a dataset directory entered
    if not FLAGS.dataset_dir:
        raise ValueError(
            'dataset_dir is empty. Please state a dataset_dir argument.')

    #If the TFRecord files already exist in the directory, then exit without creating the files again
    if _dataset_exists(dataset_dir=FLAGS.dataset_dir,
                       _NUM_SHARDS=FLAGS.num_shards,
                       output_filename=FLAGS.tfrecord_filename):
        print('Dataset files already exist. Exiting without re-creating them.')
        return None
    #==========END OF CHECKS============

    #Get a list of photo_filenames like ['123.jpg', '456.jpg'...] and a list of sorted class names from parsing the subdirectories.
    photo_filenames, class_names = _get_filenames_and_classes(
        FLAGS.dataset_dir, output_filename=FLAGS.tfrecord_filename)

    # Some datasets, such as UECFOOD256, ship a "categories" file with the
    # human-readable names that these photo_names correspond to. Map it out here.
    # if os.path.exists(os.path.join(FLAGS.dataset_dir, 'category.txt')):
    #     with open(os.path.join(FLAGS.dataset_dir, 'category.txt')) as cat_file:
    #         replacement_dict = [cat_name.split('\t') for cat_name in cat_file]
    #         class_names = [replacement_dict[int(class_name)][1].replace('\n','') for class_name in class_names]

    #Map each class name to a unique integer id for predictions later
    class_names_to_ids = dict(zip(class_names, range(len(class_names))))

    #Find the number of validation examples we need
    num_validation = int(FLAGS.validation_size * len(photo_filenames))

    # Divide the training datasets into train and test:
    random.seed(FLAGS.random_seed)
    random.shuffle(photo_filenames)
    training_filenames = photo_filenames[num_validation:]
    validation_filenames = photo_filenames[:num_validation]

    # First, convert the training and validation sets.
    _convert_dataset('train',
                     training_filenames,
                     class_names_to_ids,
                     dataset_dir=FLAGS.dataset_dir,
                     tfrecord_filename=FLAGS.tfrecord_filename,
                     _NUM_SHARDS=FLAGS.num_shards)
    _convert_dataset('validation',
                     validation_filenames,
                     class_names_to_ids,
                     dataset_dir=FLAGS.dataset_dir,
                     tfrecord_filename=FLAGS.tfrecord_filename,
                     _NUM_SHARDS=FLAGS.num_shards)

    # Finally, write the labels file:
    labels_to_class_names = dict(zip(range(len(class_names)), class_names))
    write_label_file(labels_to_class_names, FLAGS.dataset_dir)

    print('\nFinished converting the %s dataset!' % (FLAGS.tfrecord_filename))
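
The commented-out block in Example #8 hints at remapping numeric class directories to human-readable names via a tab-separated category file, as shipped with UECFOOD256. A cleaned-up sketch of that idea, minus the debugger breakpoint (the one-entry-per-line, id<TAB>name format is inferred from the commented code and is an assumption):

import os

def remap_class_names(dataset_dir, class_names):
    """Sketch: replace numeric class names with the human-readable names
    listed in dataset_dir/category.txt, one 'id<TAB>name' entry per line."""
    cat_path = os.path.join(dataset_dir, 'category.txt')
    if not os.path.exists(cat_path):
        return class_names
    with open(cat_path) as cat_file:
        replacement = dict(
            line.rstrip('\n').split('\t', 1) for line in cat_file)
    return [replacement.get(name, name) for name in class_names]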
Example #9
def main():

    #==============================================================CHECKS==========================================================================
    #Check if there is a tfrecord_filename entered
    #if not FLAGS.tfrecord_filename:
    #raise ValueError('tfrecord_filename is empty. Please state a tfrecord_filename argument.')

    #Check if there is a dataset directory entered
    if not FLAGS.dataset_dir:
        raise ValueError(
            'dataset_dir is empty. Please state a dataset_dir argument.')

    #If the TFRecord files already exist in the directory, then exit without creating the files again
    if _dataset_exists(dataset_dir=FLAGS.dataset_dir,
                       _NUM_SHARDS=FLAGS.num_shards,
                       output_filename=FLAGS.tfrecord_filename):
        print('Dataset files already exist. Exiting without re-creating them.')
        return None
    #==============================================================END OF CHECKS===================================================================

    #Get a list of photo_filenames like ['123.jpg', '456.jpg'...] and a list of sorted class names from parsing the subdirectories.
    photo_filenames, class_names = _get_filenames_and_classes(
        FLAGS.dataset_dir)

    #Map each class name to a unique integer id for predictions later
    class_names_to_ids = dict(zip(class_names, range(len(class_names))))

    #Find the number of validation examples we need
    #(note: unused below; the per-class split recomputes a count per class)
    num_validation = int(FLAGS.validation_size * len(photo_filenames))

    training_filenames = []
    validation_filenames = []
    random.seed(FLAGS.random_seed)
    random.shuffle(photo_filenames)
    # Divide dataset into training and validation.
    for class_ in class_names:
        # NOTE: startswith() only matches when the filenames are relative
        # paths that begin with the class directory name.
        files_matching_class = [
            fil for fil in photo_filenames if fil.startswith(class_)
        ]
        val_samples_cnt = int(
            len(files_matching_class) * FLAGS.validation_size)
        training_filenames.extend(files_matching_class[val_samples_cnt:])
        validation_filenames.extend(files_matching_class[:val_samples_cnt])
    print("Training files size", len(training_filenames))
    print("Validation files size", len(validation_filenames))
    '''
    # Divide the training datasets into train and test:
    random.seed(FLAGS.random_seed)
    random.shuffle(photo_filenames)
    training_filenames = photo_filenames[num_validation:]
    validation_filenames = photo_filenames[:num_validation]
    '''
    # First, convert the training and validation sets.
    _convert_dataset('train',
                     training_filenames,
                     class_names_to_ids,
                     dataset_dir=FLAGS.dataset_dir,
                     tfrecord_filename=FLAGS.tfrecord_filename,
                     _NUM_SHARDS=FLAGS.num_shards,
                     output_dir=FLAGS.output_dir)
    _convert_dataset('validation',
                     validation_filenames,
                     class_names_to_ids,
                     dataset_dir=FLAGS.dataset_dir,
                     tfrecord_filename=FLAGS.tfrecord_filename,
                     _NUM_SHARDS=FLAGS.num_shards,
                     output_dir=FLAGS.output_dir)

    # Finally, write the labels file:
    labels_to_class_names = dict(zip(range(len(class_names)), class_names))
    write_label_file(labels_to_class_names, FLAGS.dataset_dir)

    print('\nFinished converting the %s dataset!' % (FLAGS.tfrecord_filename))
Example #10
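# NOTE: this example is a fragment, not a full main(); it assumes that
# photo_filenames and class_names were already produced upstream, e.g. by a
# _get_filenames_and_classes(FLAGS.dataset_dir) call as in the other examples.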
#Map each class name to a unique integer id for predictions later
class_names_to_ids = dict(zip(class_names, range(len(class_names))))

#Find the number of validation examples we need
num_validation = int(FLAGS.validation_size * len(photo_filenames))

# Divide the training datasets into train and test:
random.seed(FLAGS.random_seed)
random.shuffle(photo_filenames)
training_filenames = photo_filenames[num_validation:]
validation_filenames = photo_filenames[:num_validation]

# First, convert the training and validation sets.
_convert_dataset('train',
                 training_filenames,
                 class_names_to_ids,
                 dataset_dir=FLAGS.dataset_dir,
                 tfrecord_filename=FLAGS.tfrecord_filename,
                 _NUM_SHARDS=FLAGS.num_shards)

_convert_dataset('validation',
                 validation_filenames,
                 class_names_to_ids,
                 dataset_dir=FLAGS.dataset_dir,
                 tfrecord_filename=FLAGS.tfrecord_filename,
                 _NUM_SHARDS=FLAGS.num_shards)

# Finally, write the labels file:
labels_to_class_names = dict(zip(range(len(class_names)), class_names))
write_label_file(labels_to_class_names, FLAGS.dataset_dir)

print('\nFinished converting the %s dataset!' % (FLAGS.tfrecord_filename))
Example #11
def main():

    #==============================================================CHECKS==========================================================================
    #Check if there is a dataset directory entered
    if not FLAGS.dataset_dir:
        raise ValueError(
            'dataset_dir is empty. Please state a dataset_dir argument.')

    #If the TFRecord files already exist in the directory, then exit without creating the files again
    if _dataset_exists(dataset_dir=FLAGS.dataset_dir,
                       _NUM_SHARDS=FLAGS.num_shards):
        print('Dataset files already exist. Exiting without re-creating them.')
        return None
    #==============================================================END OF CHECKS===================================================================

    #Get a list of photo_filenames like ['123.jpg', '456.jpg'...] and a list of sorted class names from parsing the subdirectories.
    photo_filenames, class_names = _get_filenames_and_classes(
        FLAGS.dataset_dir)
    # Generate file ids for a <fileid, filename> .csv mapping file stored with
    # the TFRecords. The ids are assigned before the shuffle below, so the
    # pairing is arbitrary but stays consistent between _convert_dataset and
    # the mapping files.
    photo_fileids = list(range(1, len(photo_filenames) + 1))

    #Map each class name to a unique integer id for predictions later
    class_names_to_ids = dict(zip(class_names, range(len(class_names))))

    #Find the number of validation examples we need
    num_validation = int(FLAGS.validation_size * len(photo_filenames))

    print(f'\nrandom seed partition = {FLAGS.random_seed}')
    # Divide the training datasets into train and test:
    random.seed(FLAGS.random_seed)
    random.shuffle(photo_filenames)
    training_filenames = photo_filenames[num_validation:]
    validation_filenames = photo_filenames[:num_validation]
    training_fileids = photo_fileids[num_validation:]
    validation_fileids = photo_fileids[:num_validation]

    # First, convert the training and validation sets.
    _convert_dataset('train',
                     training_filenames,
                     training_fileids,
                     class_names_to_ids,
                     dataset_dir=FLAGS.dataset_dir,
                     _NUM_SHARDS=FLAGS.num_shards)
    write_image_ids_file(training_filenames, training_fileids,
                         FLAGS.dataset_dir, IMAGE_IDS_FILENAME)

    if num_validation > 0:
        _convert_dataset('validation',
                         validation_filenames,
                         validation_fileids,
                         class_names_to_ids,
                         dataset_dir=FLAGS.dataset_dir,
                         _NUM_SHARDS=FLAGS.num_shards)
        write_image_ids_file(validation_filenames, validation_fileids,
                             FLAGS.dataset_dir,
                             IMAGE_IDS_VALIDATION_TMP_FILENAME)

    # Finally, write the labels file:
    labels_to_class_names = dict(zip(range(len(class_names)), class_names))
    write_label_file(labels_to_class_names, FLAGS.dataset_dir)

    print('\nFinished converting the dataset!')
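
Example #11's write_image_ids_file helper is also left undefined. From its call sites and the <fileid, filename> comment above, a plausible minimal version writes a two-column CSV (the header and column order are assumptions):

import csv
import os

def write_image_ids_file(filenames, fileids, dataset_dir, out_filename):
    """Sketch: persist the <fileid, filename> mapping as a CSV file."""
    with open(os.path.join(dataset_dir, out_filename), 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['FILE_ID', 'FILENAME'])
        for fid, fname in zip(fileids, filenames):
            writer.writerow([fid, os.path.basename(fname)])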
Example #12
def main():

    #==============================================================CHECKS==========================================================================
    #Check if there is a tfrecord_filename entered
    if not FLAGS.tfrecord_filename:
        raise ValueError('tfrecord_filename is empty. Please state a tfrecord_filename argument.')

    #Check if there is a dataset directory entered
    if not FLAGS.dataset_dir:
        raise ValueError('dataset_dir is empty. Please state a dataset_dir argument.')

    #If the TFRecord files already exist in the directory, then exit without creating the files again
    if _dataset_exists(dataset_dir=FLAGS.dataset_dir,
                       _NUM_SHARDS=FLAGS.num_shards,
                       output_filename=FLAGS.tfrecord_filename):
        print('Dataset files already exist. Exiting without re-creating them.')
        return None
    #==============================================================END OF CHECKS===================================================================

    #Get a list of photo_filenames like ['123.jpg', '456.jpg'...] and a list of sorted class names from parsing the subdirectories.
    photo_filenames, class_names = _get_filenames_and_classes(FLAGS.dataset_dir)

    #Map the two class names to fixed integer ids for predictions later
    class_names_to_ids = {'pos': 1, 'neg': 0}

    training_filenames = []
    validation_filenames = []
    pos_dir = root_dir + "training_clips/split/photos/pos/"
    neg_dir = root_dir + "training_clips/split/photos/neg/"
    with open(root_dir + "train_llbl.txt", 'r') as trainf:
        for line in trainf:
            name = line.split(" ")[0]
            classid = int(line.split(" ")[1])
            if classid == 1:
                full_name = pos_dir + name
            elif classid == 0:
                full_name = neg_dir + name
            training_filenames.append(full_name)

    with open(root_dir + "test_llbl.txt", 'r') as testf:
        for line in testf:
            name = line.split(" ")[0]
            classid = int(line.split(" ")[1])
            if classid == 1:
                full_name = pos_dir + name
            elif classid == 0:
                full_name = neg_dir + name
            validation_filenames.append(full_name)

    random.seed(FLAGS.random_seed)
    random.shuffle(training_filenames)
    random.shuffle(validation_filenames)

    # First, convert the training and validation sets.
    _convert_dataset('train', training_filenames, class_names_to_ids,
                     dataset_dir=FLAGS.dataset_dir,
                     tfrecord_filename=FLAGS.tfrecord_filename,
                     _NUM_SHARDS=FLAGS.num_shards)
    _convert_dataset('validation', validation_filenames, class_names_to_ids,
                     dataset_dir=FLAGS.dataset_dir,
                     tfrecord_filename=FLAGS.tfrecord_filename,
                     _NUM_SHARDS=FLAGS.num_shards)

    print('\nFinished converting the %s dataset!' % (FLAGS.tfrecord_filename))

Example #13
def master():

    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = '/home/agravat/key.json'
    #Get a list of photo_filenames like ['123.jpg', '456.jpg'...] and a list of sorted class names from parsing the subdirectories.
    photo_filenames, class_names = _get_filenames_and_classes(
        FLAGS.dataset_dir)
    """
    cloud_filenames, cloud_class_names = get_cloud_filenames_and_classes("gs://agravat-demo/images")
    
    for f in cloud_filenames:
        print(f)
    """

    #Map each class name to a unique integer id for predictions later
    class_names_to_ids = dict(zip(class_names, range(len(class_names))))

    #Find the number of validation examples we need
    num_validation = int(FLAGS.validation_size * len(photo_filenames))

    # Divide the training datasets into train and test:
    random.seed(FLAGS.random_seed)
    random.shuffle(photo_filenames)
    training_filenames = photo_filenames[num_validation:]
    validation_filenames = photo_filenames[:num_validation]

    # First, convert the training and validation sets.
    train_file_mappings = _convert_dataset(
        'train',
        training_filenames,
        class_names_to_ids,
        dataset_dir=FLAGS.dataset_dir,
        tfrecord_filename=FLAGS.tfrecord_filename,
        _NUM_SHARDS=FLAGS.num_shards)
    val_file_mappings = _convert_dataset(
        'validation',
        validation_filenames,
        class_names_to_ids,
        dataset_dir=FLAGS.dataset_dir,
        tfrecord_filename=FLAGS.tfrecord_filename,
        _NUM_SHARDS=FLAGS.num_shards)

    file_mappings = train_file_mappings
    file_mappings.update(val_file_mappings)
    rank_files = []
    outfile = "out.tfrecord"
    """
    for rank,o,file in train_file_mappings:
        if comm.rank == rank:
            rank_files.append(file)
            outfile = o
            #print("rank %d len files = %d, outfile = %s\n" % (comm.rank, len(rank_files), outfile))

    """
    #print("rank: %d, %s, outfile = %s" % (comm.rank, gethostname(), outfile))

    # Finally, write the labels file:
    labels_to_class_names = dict(zip(range(len(class_names)), class_names))
    write_label_file(labels_to_class_names, FLAGS.dataset_dir)

    #print('Finished converting the %s dataset for %s rank %d' % (FLAGS.tfrecord_filename, gethostname(), comm.rank))
    all_data = []
    size = MPI.COMM_WORLD.Get_size()
    current_work = Work(train_file_mappings)
    comm = MPI.COMM_WORLD
    status = MPI.Status()

    # this is the loop where the master distributes all the work based on the number of workers
    # that are available
    for i in range(1, size):
        # the master gets the next element in the list
        anext = current_work.get_next_item()
        if not anext: break
        # master sends the element to a worker
        comm.send(obj=anext, dest=i, tag=WORKTAG)

    # this is a fallback if there are more work items than workers
    while True:
        # get the next work item; break when there are none left
        anext = current_work.get_next_item()
        if not anext: break

        # get the result from any worker
        data = comm.recv(source=MPI.ANY_SOURCE, tag=MPI.ANY_TAG, status=status)
        #print("more work rank %d, host %s data %s" % (comm.rank, gethostname(), data))
        # add the processed result to the list of results
        all_data.append(data)
        # send another work item to the worker who completed the last task
        print("spillover %d %s" % (comm.rank, gethostname()))
        comm.send(obj=anext, dest=status.Get_source(), tag=WORKTAG)

    # get the results back from the workers
    for i in range(1, size):
        data = comm.recv(source=MPI.ANY_SOURCE, tag=MPI.ANY_TAG)
        print("recieved from %d" % (i))
        all_data.append(data)

    # end the tasks
    for i in range(1, size):
        comm.send(obj=None, dest=i, tag=DIETAG)

    return all_data
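
master() above only shows the distributing half of the MPI master/worker pattern; each worker process needs a matching receive loop that exits on DIETAG. A minimal sketch of that counterpart, assuming the same WORKTAG/DIETAG constants and taking the per-item work as a caller-supplied function (nothing below is from the original source):

from mpi4py import MPI

WORKTAG, DIETAG = 1, 2  # assumed module-level constants, as used by master()

def worker(process_item):
    """Sketch: receive work items from rank 0 until a DIETAG arrives,
    sending each processed result back to the master."""
    comm = MPI.COMM_WORLD
    status = MPI.Status()
    while True:
        item = comm.recv(source=0, tag=MPI.ANY_TAG, status=status)
        if status.Get_tag() == DIETAG:
            break
        result = process_item(item)  # caller supplies the per-item work
        comm.send(obj=result, dest=0, tag=0)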
Example #14
def main():

    #==============================================================CHECKS==========================================================================
    #Check if there is a tfrecord_filename entered
    if not FLAGS.tfrecord_filename:
        raise ValueError(
            'tfrecord_filename is empty. Please state a tfrecord_filename argument.'
        )

    #Check if there is a dataset directory entered
    if not FLAGS.dataset_dir:
        raise ValueError(
            'dataset_dir is empty. Please state a dataset_dir argument.')

    #If the TFRecord files already exist in the directory, then exit without creating the files again
    if _dataset_exists(dataset_dir=FLAGS.dataset_dir,
                       _NUM_SHARDS=FLAGS.num_shards,
                       output_filename=FLAGS.tfrecord_filename):
        print('Dataset files already exist. Exiting without re-creating them.')
        return None
    #==============================================================END OF CHECKS===================================================================

    #Get a list of photo_filenames like ['123.jpg', '456.jpg'...] and a list of sorted class names from parsing the subdirectories.
    # photo_filenames, labels = _get_image_label(read_label_file(FLAGS.dataset_dir, 'data/list/binary_effusion.txt'))
    train_image, train_label = _get_image_label(
        read_label_file(FLAGS.dataset_dir, FLAGS.train_list))
    logging.debug("train_image: %s, train_label: %s", train_image[:10],
                  train_label[:10])
    val_image, val_label = _get_image_label(
        read_label_file(FLAGS.dataset_dir, FLAGS.val_list))
    logging.debug("val_image: %s, val_label: %s", val_image[:10],
                  val_label[:10])

    #Refer each of the class name to a specific integer number for predictions later
    # class_names_to_ids = dict(zip(class_names, range(len(class_names))))

    #Find the number of validation examples we need
    # num_validation = int(FLAGS.validation_size * len(photo_filenames))

    # Divide the training datasets into train and test:
    random.seed(FLAGS.random_seed)
    # random.shuffle(photo_filenames)
    # training_filenames = photo_filenames[num_validation:]
    # train_label = labels[num_validation:]
    # validation_filenames = photo_filenames[:num_validation]
    # val_label = labels[:num_validation]

    # First, convert the training and validation sets.
    _convert_dataset('train',
                     train_image,
                     train_label,
                     dataset_dir=FLAGS.dataset_dir,
                     write_dir=FLAGS.write_dir,
                     tfrecord_filename=FLAGS.tfrecord_filename,
                     _NUM_SHARDS=FLAGS.num_shards)
    _convert_dataset('validation',
                     val_image,
                     val_label,
                     dataset_dir=FLAGS.dataset_dir,
                     write_dir=FLAGS.write_dir,
                     tfrecord_filename=FLAGS.tfrecord_filename,
                     _NUM_SHARDS=FLAGS.num_shards)

    # Finally, write the labels file:
    # labels_to_class_names = dict(zip(range(len(class_names)), class_names))
    # write_label_file(labels_to_class_names, FLAGS.dataset_dir)

    print('\nFinished converting the %s dataset!' % (FLAGS.tfrecord_filename))

Example #15
def main():

    #==============================================================CHECKS==========================================================================
    #Check if there is a tfrecord_filename entered
    if not FLAGS.tfrecord_filename:
        raise ValueError(
            'tfrecord_filename is empty. Please state a tfrecord_filename argument.'
        )

    #Check if there is a dataset directory entered
    if not FLAGS.dataset_dir:
        raise ValueError(
            'dataset_dir is empty. Please state a dataset_dir argument.')

    #If the TFRecord files already exist in the directory, then exit without creating the files again
    if _dataset_exists(dataset_dir=FLAGS.dataset_dir,
                       _NUM_SHARDS=FLAGS.num_shards,
                       output_filename=FLAGS.tfrecord_filename):
        print('Dataset files already exist. Exiting without re-creating them.')
        return None
    #==============================================================END OF CHECKS===================================================================

    #Get a list of photo_filenames like ['123.jpg', '456.jpg'...] and a list of sorted class names from parsing the subdirectories.
    photo_filenames, class_names = _get_filenames_and_classes(
        FLAGS.dataset_dir)

    #Map each class name to a unique integer id for predictions later
    class_names_to_ids = dict(zip(class_names, range(len(class_names))))

    #Find the number of validation examples we need
    num_validation = int(FLAGS.validation_size * len(photo_filenames))

    # Divide the training datasets into train and test:
    random.seed(FLAGS.random_seed)
    random.shuffle(photo_filenames)
    training_filenames = photo_filenames[num_validation:]
    validation_filenames = photo_filenames[:num_validation]

    if comm.rank == 0:
        # First, convert the training and validation sets.
        train_file_mappings = _convert_dataset(
            'train',
            training_filenames,
            class_names_to_ids,
            dataset_dir=FLAGS.dataset_dir,
            tfrecord_filename=FLAGS.tfrecord_filename,
            _NUM_SHARDS=FLAGS.num_shards)
        val_file_mappings = _convert_dataset(
            'validation',
            validation_filenames,
            class_names_to_ids,
            dataset_dir=FLAGS.dataset_dir,
            tfrecord_filename=FLAGS.tfrecord_filename,
            _NUM_SHARDS=FLAGS.num_shards)

        #file_mappings.extend(val_file_mappings)
    else:
        train_file_mappings = None
        val_file_mappings = None

    train_file_mappings = comm.bcast(train_file_mappings, root=0)
    rank_files = []
    outfile = None
    for rank, o, file in train_file_mappings:
        if comm.rank == rank:
            rank_files.append(file)
            outfile = o
            #print("rank %d len files = %d, outfile = %s\n" % (comm.rank, len(rank_files), outfile))

    print("rank: %d, %s, outfile = %s" % (comm.rank, gethostname(), outfile))

    # write training files
    if outfile is not None:
        _write_dataset(rank_files, outfile, class_names_to_ids)

    val_file_mappings = comm.bcast(val_file_mappings, root=0)
    rank_files = []
    outfile = None
    for rank, o, file in val_file_mappings:
        #if comm.rank == 29:
        #print("rank %d, shards = %d, comm rank = %d, mod = %d" % (rank, FLAGS.num_shards, comm.rank, rank % FLAGS.num_shards))
        if rank % FLAGS.num_shards == comm.rank:
            rank_files.append(file)
            outfile = o
            #print("rannk: %d, file = %s" % (
        #pprint("rank: %s, %d, %s, %s" % (comm.rank, a,b,c))

    print("val rank: %d, %s, outfile = %s" %
          (comm.rank, gethostname(), outfile))
    # write validation files
    if outfile is not None:
        _write_dataset(rank_files, outfile, class_names_to_ids)

    # Finally, write the labels file:
    labels_to_class_names = dict(zip(range(len(class_names)), class_names))
    write_label_file(labels_to_class_names, FLAGS.dataset_dir)

    print('Finished converting the %s dataset for %s rank %d' %
          (FLAGS.tfrecord_filename, gethostname(), comm.rank))