def main():
    """Use local data to train the neural net, probably made by bin/create_training_data.py.

    Trains on all cached NAIP tiles, then optionally prints/render findings
    (false positives/negatives) per NAIP, and optionally renders predictions
    for the held-out test images.
    """
    parser = create_parser()
    args = parser.parse_args()
    # pickles are binary data; open in 'rb' so this works on all platforms
    with open(CACHE_PATH + 'raster_data_paths.pickle', 'rb') as infile:
        raster_data_paths = pickle.load(infile)

    test_images, model = train_on_cached_data(raster_data_paths, args.neural_net, args.bands,
                                              args.tile_size, args.number_of_epochs)

    if not args.omit_findings:
        for path in raster_data_paths:
            print(path)
            labels, images = load_training_tiles(path)
            if len(labels) == 0 or len(images) == 0:
                print("WARNING, there is a borked naip image file")
                continue
            false_positives, false_negatives, fp_images, fn_images = list_findings(
                labels, images, model)
            # last path component is the NAIP filename
            filename = path.split('/')[-1]
            print("FINDINGS: {} false pos and {} false neg, of {} tiles, from {}".format(
                len(false_positives), len(false_negatives), len(images), filename))
            render_results_for_analysis([path], false_positives, fp_images, args.bands,
                                        args.tile_size)

    if args.render_results:
        predictions = predictions_for_tiles(test_images, model)
        render_results_for_analysis(raster_data_paths, predictions, test_images, args.bands,
                                    args.tile_size)
def train_on_cached_data(raster_data_paths, neural_net_type, bands, tile_size, number_of_epochs):
    """Load tiled/cached data, which was prepared for the NAIPs listed in raster_data_paths.

    Read in each NAIP's images/labels, add to train/test data, run some epochs as each is added.
    Keep the train and test sets to a max of 10K images by throwing out random data sometimes.

    Returns a (test_images, model) tuple.
    """
    training_images = []
    onehot_training_labels = []
    test_images = []
    onehot_test_labels = []
    model = None
    for path in raster_data_paths:
        # read in another NAIP worth of data
        labels, images = load_training_tiles(path)
        if len(labels) == 0 or len(images) == 0:
            continue
        equal_count_way_list, equal_count_tile_list = equalize_data(labels, images, False)
        new_test_labels, training_labels, new_test_images, new_training_images = \
            split_train_test(equal_count_tile_list, equal_count_way_list, .9)

        if len(training_labels) == 0:
            print("WARNING: a naip image didn't have any road labels?")
            continue
        if len(new_test_labels) == 0:
            print("WARNING: a naip image didn't have any road images?")
            continue

        # add it to the training and test lists
        training_images.extend(new_training_images)
        test_images.extend(new_test_images)
        onehot_training_labels.extend(format_as_onehot_arrays(training_labels))
        onehot_test_labels.extend(format_as_onehot_arrays(new_test_labels))

        # once we have 100 training images, maybe from more than one NAIP, train on a mini batch
        if len(training_images) >= 100:
            # continue training the model with the new data set
            model = train_with_data(onehot_training_labels, onehot_test_labels, test_images,
                                    training_images, neural_net_type, bands, tile_size,
                                    number_of_epochs, model)
            training_images = []
            onehot_training_labels = []

        # keep test list to 10000 images, in case the machine doesn't have much memory
        if len(test_images) > 10000:
            # shuffle so when we chop off data, it's from many NAIPs, not just the last one
            shuffle_in_unison(test_images, onehot_test_labels)
            test_images = test_images[:9000]
            onehot_test_labels = onehot_test_labels[:9000]
    return test_images, model
def post_findings_to_s3(raster_data_paths, model, training_info, render_results):
    """Aggregate findings from all NAIPs into a pickled list, post to S3.

    For each NAIP, run the model to get false positives, optionally render
    JPEGs of them, tag them with locations, and upload one combined pickle to
    FINDINGS_S3_BUCKET under <naip_state>/findings.pickle.
    """
    findings = []
    for path in raster_data_paths:
        labels, images = load_training_tiles(path)
        if len(labels) == 0 or len(images) == 0:
            print("WARNING, there is a borked naip image file")
            continue
        false_positives, fp_images = list_findings(labels, images, model)
        # last path component is the NAIP filename
        filename = path.split('/')[-1]
        print("FINDINGS: {} false pos of {} tiles, from {}".format(
            len(false_positives), len(images), filename))
        if render_results:
            # render JPEGs showing findings
            render_results_for_analysis([path], false_positives, fp_images,
                                        training_info['bands'], training_info['tile_size'])
        # combine findings for all NAIP images analyzed for the region
        findings.extend(
            tag_with_locations(fp_images, false_positives, training_info['tile_size']))

    # dump combined findings to disk as a pickle
    try:
        os.mkdir(CACHE_PATH + training_info['naip_state'])
    except OSError:
        # directory most likely exists already; a real failure will surface on open() below
        pass
    naip_path_in_cache_dir = training_info['naip_state'] + '/' + 'findings.pickle'
    local_path = CACHE_PATH + naip_path_in_cache_dir
    # pickles are binary data; open in 'wb' so this works on all platforms
    with open(local_path, 'wb') as outfile:
        pickle.dump(findings, outfile)

    # push pickle to S3
    s3_client = boto3.client('s3')
    s3_client.upload_file(local_path, FINDINGS_S3_BUCKET, naip_path_in_cache_dir)
def train_on_cached_data(raster_data_paths, neural_net_type, bands, tile_size):
    """Load tiled/cached data, which was prepared for the NAIPs listed in raster_data_paths.

    Read in each NAIP's images/labels, add to train/test data, run some epochs as each is added.
    Keep the train and test sets to a max of 10K images by throwing out random data sometimes.

    Returns a (test_images, model) tuple.
    """
    training_images = []
    onehot_training_labels = []
    test_images = []
    onehot_test_labels = []
    model = None
    epoch = 0
    for path in raster_data_paths:
        # keep test list to 10000 images, in case the machine doesn't have much memory
        if len(test_images) > 10000:
            test_images = test_images[:9000]
            onehot_test_labels = onehot_test_labels[:9000]
        # keep train list to 10000 images
        if len(training_images) > 10000:
            training_images = training_images[:9000]
            onehot_training_labels = onehot_training_labels[:9000]

        # read in another NAIP worth of data
        labels, images = load_training_tiles(path)
        if len(labels) == 0 or len(images) == 0:
            continue
        equal_count_way_list, equal_count_tile_list = equalize_data(labels, images, False)
        new_test_labels, training_labels, new_test_images, new_training_images = \
            split_train_test(equal_count_tile_list, equal_count_way_list, .9)

        if len(training_labels) == 0:
            print("WARNING: a naip image didn't have any road labels?")
            continue
        if len(new_test_labels) == 0:
            print("WARNING: a naip image didn't have any road images?")
            continue

        # add it to the training and test lists
        training_images.extend(new_training_images)
        test_images.extend(new_test_images)
        onehot_training_labels.extend(format_as_onehot_arrays(training_labels))
        onehot_test_labels.extend(format_as_onehot_arrays(new_test_labels))

        # shuffle it so when we chop off data it's from many NAIPs, not just the last one
        shuffle_in_unison(training_images, onehot_training_labels)
        shuffle_in_unison(test_images, onehot_test_labels)

        # continue training the model with the new data set
        model = train_with_data(onehot_training_labels, onehot_test_labels, test_images,
                                training_images, neural_net_type, bands, tile_size, epoch, model)
        epoch += 1

    return test_images, model
def render_errors(raster_data_paths, model, training_info, render_results):
    """Render JPEGs showing findings.

    For each NAIP, run the model over its tiles and render the false
    positives for visual analysis. NAIPs with no cached tiles are skipped
    with a warning.
    """
    for path in raster_data_paths:
        labels, images = load_training_tiles(path)
        if len(labels) == 0 or len(images) == 0:
            print("WARNING, there is a borked naip image file")
            continue
        false_positives, fp_images = list_findings(labels, images, model)
        # last path component is the NAIP filename
        filename = path.split('/')[-1]
        print("FINDINGS: {} false pos of {} tiles, from {}".format(
            len(false_positives), len(images), filename))
        render_results_for_analysis([path], false_positives, fp_images,
                                    training_info['bands'], training_info['tile_size'])
def main():
    """Use local data to train the neural net, probably made by bin/create_training_data.py.

    Trains on all cached NAIP tiles, then optionally collects findings
    (false positives/negatives) per NAIP, pickles them, and uploads the
    combined findings to the 'deeposm' S3 bucket; optionally renders
    predictions for the held-out test images.
    """
    parser = create_parser()
    args = parser.parse_args()
    # pickles are binary data; open in 'rb' so this works on all platforms
    with open(CACHE_PATH + 'raster_data_paths.pickle', 'rb') as infile:
        raster_data_paths = pickle.load(infile)

    test_images, model = train_on_cached_data(raster_data_paths, args.neural_net, args.bands,
                                              args.tile_size, args.number_of_epochs)

    if not args.omit_findings:
        findings = []
        for path in raster_data_paths:
            print(path)
            labels, images = load_training_tiles(path)
            if len(labels) == 0 or len(images) == 0:
                print("WARNING, there is a borked naip image file")
                continue
            false_positives, false_negatives, fp_images, fn_images = list_findings(
                labels, images, model)
            # last path component is the NAIP filename
            filename = path.split('/')[-1]
            print("FINDINGS: {} false pos and {} false neg, of {} tiles, from {}".format(
                len(false_positives), len(false_negatives), len(images), filename))
            # render JPEGs showing findings
            render_results_for_analysis([path], false_positives, fp_images, args.bands,
                                        args.tile_size)
            # combine findings for all NAIP images analyzed
            findings.extend(tag_with_locations(fp_images, false_positives, args.tile_size))
        # dump combined findings to disk as a pickle ('wb': pickles are binary data)
        with open(CACHE_PATH + 'findings.pickle', 'wb') as outfile:
            pickle.dump(findings, outfile)
        # push pickle to S3
        s3_client = boto3.client('s3')
        s3_client.upload_file(CACHE_PATH + 'findings.pickle', 'deeposm', 'findings.pickle')

    if args.render_results:
        predictions = predictions_for_tiles(test_images, model)
        render_results_for_analysis(raster_data_paths, predictions, test_images, args.bands,
                                    args.tile_size)
def main():
    """Use local data to train the neural net, probably made by bin/create_training_data.py.

    Trains on all cached NAIP tiles, then optionally collects findings
    (false positives/negatives) per NAIP, pickles them, and uploads the
    combined findings to the 'deeposm' S3 bucket; optionally renders
    predictions for the held-out test images.
    """
    parser = create_parser()
    args = parser.parse_args()
    # pickles are binary data; open in 'rb' so this works on all platforms
    with open(CACHE_PATH + 'raster_data_paths.pickle', 'rb') as infile:
        raster_data_paths = pickle.load(infile)

    test_images, model = train_on_cached_data(raster_data_paths, args.neural_net, args.bands,
                                              args.tile_size, args.number_of_epochs)

    if not args.omit_findings:
        findings = []
        for path in raster_data_paths:
            print(path)
            labels, images = load_training_tiles(path)
            if len(labels) == 0 or len(images) == 0:
                print("WARNING, there is a borked naip image file")
                continue
            false_positives, false_negatives, fp_images, fn_images = list_findings(
                labels, images, model)
            # last path component is the NAIP filename
            filename = path.split('/')[-1]
            print("FINDINGS: {} false pos and {} false neg, of {} tiles, from {}".format(
                len(false_positives), len(false_negatives), len(images), filename))
            # render JPEGs showing findings
            render_results_for_analysis([path], false_positives, fp_images, args.bands,
                                        args.tile_size)
            # combine findings for all NAIP images analyzed
            findings.extend(tag_with_locations(fp_images, false_positives, args.tile_size))
        # dump combined findings to disk as a pickle ('wb': pickles are binary data)
        with open(CACHE_PATH + 'findings.pickle', 'wb') as outfile:
            pickle.dump(findings, outfile)
        # push pickle to S3
        s3_client = boto3.client('s3')
        s3_client.upload_file(CACHE_PATH + 'findings.pickle', 'deeposm', 'findings.pickle')

    if args.render_results:
        predictions = predictions_for_tiles(test_images, model)
        render_results_for_analysis(raster_data_paths, predictions, test_images, args.bands,
                                    args.tile_size)
def train_on_cached_data(neural_net_type, number_of_epochs):
    """Load tiled/cached training data in batches, and train the neural net.

    Reads band/tile-size settings from the cached metadata pickle, pulls
    NUMBER_OF_BATCHES batches of labels from disk, equalizes them
    (half road / half not), trains in mini-batches of >= 100 images, then
    saves and returns the model.
    """
    # pickles are binary data; open in 'rb' so this works on all platforms
    with open(CACHE_PATH + METADATA_PATH, 'rb') as infile:
        training_info = pickle.load(infile)
    bands = training_info['bands']
    tile_size = training_info['tile_size']

    training_images = []
    onehot_training_labels = []
    model = None

    # there are usually 100+ images with road through the middle, out of every 10,000
    # because we want half on, half off, and discard most images
    EQUALIZATION_BATCH_SIZE = 10000
    # the number of times to pull EQUALIZATION_BATCH_SIZE images from disk
    NUMBER_OF_BATCHES = 50

    for x in range(0, NUMBER_OF_BATCHES):
        print("BATCH: {} of {}".format(str(x + 1), str(NUMBER_OF_BATCHES)))
        new_label_paths = load_training_tiles(EQUALIZATION_BATCH_SIZE)
        print("Got batch of {} labels".format(len(new_label_paths)))
        new_training_images, new_onehot_training_labels = format_as_onehot_arrays(new_label_paths)
        equal_count_way_list, equal_count_tile_list = equalize_data(
            new_onehot_training_labels, new_training_images, False)
        training_images.extend(equal_count_tile_list)
        onehot_training_labels.extend(equal_count_way_list)

        # once we have 100 training images, train on a mini batch
        if len(training_images) >= 100:
            # continue training the model with the new data set
            model = train_with_data(onehot_training_labels, training_images, neural_net_type,
                                    bands, tile_size, number_of_epochs, model)
            training_images = []
            onehot_training_labels = []

    save_model(model, neural_net_type, bands, tile_size)
    return model
def post_findings_to_s3(raster_data_paths, model, training_info, render_results):
    """Aggregate findings from all NAIPs into a pickled list, post to S3.

    For each NAIP, run the model to get false positives, optionally render
    JPEGs of them, tag them with locations, and upload one combined pickle to
    FINDINGS_S3_BUCKET under <naip_state>/findings.pickle.
    """
    findings = []
    for path in raster_data_paths:
        labels, images = load_training_tiles(path)
        if len(labels) == 0 or len(images) == 0:
            print("WARNING, there is a borked naip image file")
            continue
        false_positives, fp_images = list_findings(labels, images, model)
        # last path component is the NAIP filename
        filename = path.split('/')[-1]
        print("FINDINGS: {} false pos of {} tiles, from {}".format(
            len(false_positives), len(images), filename))
        if render_results:
            # render JPEGs showing findings
            render_results_for_analysis([path], false_positives, fp_images,
                                        training_info['bands'], training_info['tile_size'])
        # combine findings for all NAIP images analyzed for the region
        findings.extend(
            tag_with_locations(fp_images, false_positives, training_info['tile_size']))

    # dump combined findings to disk as a pickle
    try:
        os.mkdir(CACHE_PATH + training_info['naip_state'])
    except OSError:
        # directory most likely exists already; a real failure will surface on open() below
        pass
    naip_path_in_cache_dir = training_info['naip_state'] + '/' + 'findings.pickle'
    local_path = CACHE_PATH + naip_path_in_cache_dir
    # pickles are binary data; open in 'wb' so this works on all platforms
    with open(local_path, 'wb') as outfile:
        pickle.dump(findings, outfile)

    # push pickle to S3
    s3_client = boto3.client('s3')
    s3_client.upload_file(local_path, FINDINGS_S3_BUCKET, naip_path_in_cache_dir)
def train_on_cached_data(neural_net_type, number_of_epochs):
    """Load tiled/cached training data in batches, and train the neural net.

    Reads band/tile-size settings from the cached metadata pickle, pulls
    NUMBER_OF_BATCHES batches of labels from disk, equalizes them
    (half road / half not), trains in mini-batches of >= 100 images, then
    saves and returns the model.
    """
    # pickles are binary data; open in 'rb' so this works on all platforms
    with open(CACHE_PATH + METADATA_PATH, "rb") as infile:
        training_info = pickle.load(infile)
    bands = training_info["bands"]
    tile_size = training_info["tile_size"]

    training_images = []
    onehot_training_labels = []
    model = None

    # there are usually 100+ images with road through the middle, out of every 10,000
    # because we want half on, half off, and discard most images
    EQUALIZATION_BATCH_SIZE = 10000
    # the number of times to pull EQUALIZATION_BATCH_SIZE images from disk
    NUMBER_OF_BATCHES = 10

    for x in range(0, NUMBER_OF_BATCHES):
        new_label_paths = load_training_tiles(EQUALIZATION_BATCH_SIZE)
        print("Got batch of {} labels".format(len(new_label_paths)))
        new_training_images, new_onehot_training_labels = format_as_onehot_arrays(new_label_paths)
        equal_count_way_list, equal_count_tile_list = equalize_data(
            new_onehot_training_labels, new_training_images, False
        )
        training_images.extend(equal_count_tile_list)
        onehot_training_labels.extend(equal_count_way_list)

        # once we have 100 training images, train on a mini batch
        if len(training_images) >= 100:
            # continue training the model with the new data set
            model = train_with_data(
                onehot_training_labels, training_images, neural_net_type, bands, tile_size,
                number_of_epochs, model
            )
            training_images = []
            onehot_training_labels = []

    save_model(model, neural_net_type, bands, tile_size)
    return model