def main(args): """ Classify dataset """ # Parse config and file data dataset_config, yatsm_config = parse_config_file(args['config_file']) # Get some attributes about the dataset dates, sensors, images = csvfile_to_dataset( dataset_config['input_file'], date_format=dataset_config['date_format'] ) nrow, _, _, _ = get_image_attribute(images[0]) # Read in the saved classification result try: _ = open(args['algo']) except: logger.error('Could not open pickled classifier') sys.exit(1) classifier = joblib.load(args['algo']) # Split into lines and classify job_lines = calculate_lines(args['job_number'] - 1, args['total_jobs'], nrow) logger.debug('Responsible for lines: {l}'.format(l=job_lines)) start_time = time.time() logger.info('Starting to run lines') for job_line in job_lines: filename = get_output_name(dataset_config, job_line) if not os.path.exists(filename): logger.warning('No model result found for line {l} ' '(file {f})'.format(l=job_line, f=filename)) pass if args['resume'] and try_resume(filename): logger.debug('Already processed line {l}'.format(l=job_line)) continue logger.debug('Classifying line {l}'.format(l=job_line)) classify_line(filename, classifier) logger.debug('Completed {n} lines in {m} minutes'.format( n=len(job_lines), m=round((time.time() - start_time) / 60.0, 2)) )
def main(dataset_config, yatsm_config, check=False, resume=False,
         do_not_run=False, read_cache=False, write_cache=False,
         validate_cache=False):
    """ Read in dataset and run YATSM for a complete line

    Args:
      dataset_config (dict): dict of dataset configuration options
      yatsm_config (dict): dict of YATSM algorithm options
      check (bool, optional): check to make sure images are readable
      resume (bool, optional): do not overwrite existing results, instead
        continue from first non-existing result file
      do_not_run (bool, optional): don't run YATSM
      read_cache (bool, optional): try to read from cache directory
        (default: False)
      write_cache (bool, optional): try to write to cache directory
        (default: False)
      validate_cache (bool, optional): ensure data from cache file come from
        images specified in configuration (default: False)

    """
    # Read in dataset
    dates, sensors, images = csvfile_to_dataset(
        dataset_config['input_file'],
        date_format=dataset_config['date_format']
    )
    image_IDs = get_image_IDs(images)

    # Check for existence of files and remove missing
    if check:
        to_delete = []
        for i, img in enumerate(images):
            if not os.path.isfile(img):
                logger.warning('Could not find file {f} -- removing'.format(
                    f=img))
                to_delete.append(i)
        if len(to_delete) == 0:
            logger.debug('Checked and found all input images')
        else:
            logger.warning('Removing {n} images'.format(n=len(to_delete)))
            dates = np.delete(dates, np.array(to_delete))
            images = np.delete(images, np.array(to_delete))

    # Get attributes of one of the images
    nrow, ncol, nband, dtype = get_image_attribute(images[0])

    # Calculate the lines this job ID works on
    # NOTE: `job_number` and `total_jobs` are not parameters of this function;
    #       they are expected to be defined at module scope (e.g., parsed from
    #       the command line)
    job_lines = calculate_lines(job_number, total_jobs, nrow)
    logger.debug('Responsible for lines: {l}'.format(l=job_lines))

    # Calculate X feature input
    X = patsy.dmatrix(yatsm_config['design_matrix'],
                      {'x': dates, 'sensor': sensors})

    # Start running YATSM
    start_time_all = time.time()
    logger.info('Starting to run lines')
    for job_line in job_lines:
        if resume:
            try:
                z = np.load(get_output_name(dataset_config, job_line))
            except Exception:
                # No usable result file -- (re)run this line
                pass
            else:
                del z
                logger.debug('Already processed line {l}'.format(l=job_line))
                continue

        logger.debug('Running line {l}'.format(l=job_line))
        start_time = time.time()

        try:
            run_line(job_line, X, images, image_IDs,
                     dataset_config, yatsm_config,
                     nrow, ncol, nband, dtype,
                     do_not_run=do_not_run,
                     read_cache=read_cache, write_cache=write_cache,
                     validate_cache=validate_cache)
        except Exception as e:
            logger.error('Could not process line {l}'.format(l=job_line))
            logger.error(type(e))
            logger.error(str(e))

        logger.debug('Took {s}s to run'.format(
            s=round(time.time() - start_time, 2)))

    logger.info('Completed {n} lines in {m} minutes'.format(
        n=len(job_lines),
        m=round((time.time() - start_time_all) / 60.0, 2)
    ))
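# Self-contained sketch of the ``patsy.dmatrix`` call used above to build the
# X feature matrix. The design formula and data are illustrative stand-ins for
# ``yatsm_config['design_matrix']``, ``dates``, and ``sensors``.
import numpy as np
import patsy

example_dates = np.array([730120.0, 730136.0, 730152.0])  # ordinal dates
example_sensors = np.array(['LT5', 'LE7', 'LT5'])          # sensor labels

# A simple design: intercept, a linear term in the date, and dummy-coded
# sensor categories
example_X = patsy.dmatrix('1 + x + C(sensor)',
                          {'x': example_dates, 'sensor': example_sensors})
print(np.asarray(example_X))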
def main(args):
    # Parse and validate configuration file
    dataset_config, yatsm_config = config_parser.parse_config_file(
        args['config_file'])

    if not os.path.isdir(dataset_config['cache_line_dir']):
        os.makedirs(dataset_config['cache_line_dir'])

    dates, images = utils.csvfile_to_dataset(
        dataset_config['input_file'],
        date_format=dataset_config['date_format']
    )
    image_IDs = utils.get_image_IDs(images)

    nrow, ncol, nband, dtype = reader.get_image_attribute(images[0])

    # Determine lines to work on
    job_lines = utils.calculate_lines(args['job_number'], args['total_jobs'],
                                      nrow, interlaced=args['interlace'])
    logger.debug('Responsible for lines: {l}'.format(l=job_lines))

    # Determine file reader
    if dataset_config['use_bip_reader']:
        logger.debug('Reading in data from disk using BIP reader')
        image_reader = reader.read_row_BIP
        image_reader_kwargs = {'size': (ncol, nband), 'dtype': dtype}
    else:
        logger.debug('Reading in data from disk using GDAL')
        image_reader = reader.read_row_GDAL
        image_reader_kwargs = {}

    # Attempt to update cache files
    previous_cache = None
    if args['update_pattern']:
        previous_cache = fnmatch.filter(
            os.listdir(dataset_config['cache_line_dir']),
            args['update_pattern'])

        if not previous_cache:
            logger.warning('Could not find cache files to update with '
                           'pattern {p}'.format(p=args['update_pattern']))
        else:
            logger.debug('Found {n} previously cached files to update'.format(
                n=len(previous_cache)))

    for job_line in job_lines:
        cache_filename = cache.get_line_cache_name(
            dataset_config, len(images), job_line, nband)
        logger.debug('Caching line {l} to {f}'.format(
            l=job_line, f=cache_filename))
        start_time = time.time()

        # Find matching cache file
        update = False
        if previous_cache:
            pattern = cache.get_line_cache_pattern(
                job_line, nband, regex=False)

            potential = fnmatch.filter(previous_cache, pattern)

            if not potential:
                logger.info('Found zero previous cache files for '
                            'line {l}'.format(l=job_line))
            elif len(potential) > 1:
                logger.info('Found more than one previous cache file for '
                            'line {l}. Keeping first'.format(l=job_line))
                update = os.path.join(dataset_config['cache_line_dir'],
                                      potential[0])
            else:
                update = os.path.join(dataset_config['cache_line_dir'],
                                      potential[0])

            logger.info('Updating from cache file {f}'.format(f=update))

        if update:
            cache.update_cache_file(images, image_IDs,
                                    update, cache_filename,
                                    job_line, image_reader,
                                    image_reader_kwargs)
        else:
            if dataset_config['use_bip_reader']:
                # Use BIP reader
                logger.debug('Reading in data from disk using BIP reader')
                Y = reader.read_row_BIP(images, job_line, (ncol, nband),
                                        dtype)
            else:
                # Read in data just using GDAL
                logger.debug('Reading in data from disk using GDAL')
                Y = reader.read_row_GDAL(images, job_line)
            cache.write_cache_file(cache_filename, Y, image_IDs)

        logger.debug('Took {s}s to cache the data'.format(
            s=round(time.time() - start_time, 2)))
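# Minimal usage sketch for the cache-updating entry point above. The keys
# match those the function reads from ``args``; the values are hypothetical.
cache_args = {
    'config_file': 'params.yaml',   # hypothetical config file
    'job_number': 0,                # passed straight through to calculate_lines
    'total_jobs': 1,
    'interlace': False,             # forwarded as calculate_lines(interlaced=...)
    'update_pattern': None,         # or an fnmatch glob matching existing cache files
}
main(cache_args)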