def __init__(self, config, boundary, dataset, data, extract_type,
             temporal_type, version):
    """Store the parameters of a single extract and validate its type.

    Args:
        config: object exposing ``client`` (database client) and
            ``branch_dir`` (root directory of the current branch).
        boundary: boundary the extract is run against.
        dataset: dataset the extract belongs to.
        data: data item being extracted.
        extract_type: extract method; must be one of the options
            reported by ``extract_utility.ExtractObject``.
        temporal_type: temporal type of the data.
        version: extract version string; dots are replaced with
            underscores when building the output directory name.

    Raises:
        Exception: if ``extract_type`` is not a known extract option.
    """
    self.config = config

    # database handles: extract queue and msr collections
    self.client = config.client
    self.c_extracts = self.client.asdf.extracts
    self.c_msr = self.client.asdf.msr

    # <branch_dir>/outputs/extracts/<version with "." replaced by "_">
    version_dir = version.replace('.', '_')
    self.base = os.path.join(
        config.branch_dir, "outputs/extracts", version_dir)

    self.boundary = boundary
    self.dataset = dataset
    self.data = data
    self.extract_type = extract_type
    self.temporal_type = temporal_type
    self.version = version

    # valid extract types come from the extract utility itself
    self.extract_options = extract_utility.ExtractObject()._extract_options

    is_known_type = self.extract_type in self.extract_options
    if not is_known_type:
        raise Exception('invalid extract type')

    # no extract path has been set at construction time
    self.extract_path = None
def __init__(self, client=None):
    """Build the tables of accepted field values and bind the database.

    Args:
        client: database client exposing ``asdf.data``. Although the
            parameter defaults to ``None``, the client is dereferenced
            immediately, so a real client must be supplied.
    """
    # extract types are defined by the extract utility
    extract_options = extract_utility.ExtractObject()._extract_options

    # acceptable inputs for various fields (dataset types, vector
    # formats, raster formats, etc.); the polydata, point, multipoint
    # and document dataset types are currently disabled
    dataset_types = {
        'raster': 'raster',
        'boundary': 'vector',
        'release': 'vector'
    }
    extensions_by_format = {
        "vector": ['geojson', 'shp'],
        "raster": ['tif', 'asc']
    }

    self.types = {
        "data": dataset_types,
        "file_extensions": extensions_by_format,
        "extracts": extract_options,
        "group_class": ['actual', 'sub']
    }

    # init mongo
    self.client = client
    self.c_asdf = self.client.asdf.data
if any([i not in dataset_options for i in missing_defaults]): sys.exit("builder.py has terminated : required option(s) missing " + "from both dataset default options.") # gather all relevant options for k in required_options: if k in dataset_options: tmp_config[k] = dataset_options[k] else: tmp_config[k] = job_json['defaults'][k] # ================================================== # init / setup extract and generate qlist exo = extract_utility.ExtractObject(builder=True) exo.set_vector_path(tmp_config['bnd_absolute']) if tmp_config['file_mask'] == "None" and not os.path.isfile( tmp_config['data_base']): for root, dirs, files in os.walk(tmp_config['data_base']): for file in files: tmp_config['data_base'] = os.path.join(root, file) break break exo.set_base_path(tmp_config['data_base']) exo.set_years(tmp_config['years'])
def tmp_worker_job(self, task_index, task_data):
    """Run one extract task and write its results to a CSV file.

    Args:
        task_index: index of the task, used only in log messages.
        task_data: sequence whose first item is the dataset index and
            whose second item is the qlist index, both looked up in the
            module-level ``input_json['job']['datasets']``.

    Returns:
        1 if the extract ran to completion, -2 on ``MemoryError`` and
        -1 on any other exception raised while consuming the results.
    """
    worker_tagline = 'Worker %s | Task %s - ' % (self.rank, task_index)

    dataset_index, qlist_index = task_data[0], task_data[1]
    dataset = input_json['job']['datasets'][dataset_index]

    # dataset name, its settings and the qlist entry to process
    data_name = dataset['name']
    settings = dataset['settings']
    item = dataset['qlist'][qlist_index]

    # ==================================================
    # inputs (see jobscript_template comments for detailed descriptions
    # of inputs); * = managed by ExtractObject

    bnd_name = settings['bnd_name']            # boundary name
    bnd_absolute = settings['bnd_absolute']    # boundary file path *
    data_base = settings['data_base']          # data folder or file *
    year_string = settings['years']            # year information *
    file_mask = settings['file_mask']          # dataset file mask *
    extract_type = settings['extract_type']    # extract type *
    output_base = settings['output_base']      # output folder

    # temporal label of this item; 'na' when the item has none
    temporal = ''.join(str(part) for part in item[0])
    if temporal:
        raster_name = data_name + "_" + temporal
    else:
        raster_name = data_name
        temporal = 'na'

    # ==================================================

    exo = extract_utility.ExtractObject()
    exo.set_vector_path(bnd_absolute)
    exo.set_base_path(data_base)
    exo.set_years(year_string)
    exo.set_file_mask(file_mask)

    # categorical extracts additionally need the category definitions
    extract_args = [extract_type]
    if extract_type == "categorical":
        extract_args.append(settings['categories'])
    exo.set_extract_type(*extract_args)

    # ==================================================

    output_dir = os.path.join(output_base, bnd_name, "cache", data_name)

    # create the output directory, tolerating one that already exists
    try:
        os.makedirs(output_dir)
    except OSError as mkdir_error:
        if mkdir_error.errno != errno.EEXIST:
            raise

    # ==================================================

    # generate raster path: run option "1" uses the qlist entry as-is,
    # any other option prefixes it with the extract object's base path
    raster = item[1]
    if exo._run_option != "1":
        raster = exo._base_path + "/" + raster

    details = '\n\tvector: (%s) %s\n\traster: (%s) %s\n\tmethod: %s '

    def report(headline, *leading):
        # every status message shares the same vector/raster/method tail
        values = leading + (bnd_name, bnd_absolute, raster_name, raster,
                            extract_type)
        print((worker_tagline + headline + details) % values)

    # run extract
    report('running extract: ')
    run_data = exo.run_extract(raster)

    # generate output path
    file_name = '.'.join([data_name, temporal, exo._extract_type]) + ".csv"
    output = os.path.join(output_dir, file_name)

    run_data = exo.export_to_csv(run_data, output)

    try:
        started = int(time.time())
        # drain the result iterable so that every item is processed
        for _ in run_data:
            pass
        elapsed = int(time.time() - started)

        extract_status = 1
        report('completed extract in %s seconds', elapsed)

    except MemoryError:
        extract_status = -2
        report('memory error (%s)', extract_status)

    except Exception:
        extract_status = -1
        report('unknown error (%s)', extract_status)
        traceback.print_exc(limit=2, file=sys.stdout)

    return extract_status
def tmp_worker_job(self, task_index, task_data):
    """Run one queued extract, write it to CSV and record its status.

    Args:
        task_index: index of the task, used only in log messages.
        task_data: dict describing the task. Keys read here:
            ``bnd_absolute``, ``data_path``, ``extract_type``,
            ``bnd_name``, ``dataset_name``, ``data_name``,
            ``output_base``, ``_id`` and, for "categorical" or
            "encoded" extracts, ``category_map``.

    Returns:
        1 if the extract ran to completion, -2 on ``MemoryError`` and
        -1 on any other exception raised while consuming the results.
        The same value is stored as ``status`` on the matching
        document of the module-level ``c_extracts`` collection.
    """
    worker_tagline = "Worker {0} | Task {1} - ".format(self.rank, task_index)

    # =================================
    # inputs (see jobscript_template comments for detailed
    # descriptions of inputs); * = managed by ExtractObject

    # absolute path of boundary file *
    bnd_absolute = task_data['bnd_absolute']

    # raster file or dataset directory *
    data_path = task_data['data_path']

    # extract type *
    extract_type = task_data['extract_type']

    # boundary, dataset and raster names
    bnd_name = task_data['bnd_name']
    dataset_name = task_data['dataset_name']
    data_name = task_data['data_name']

    # output directory
    output_base = task_data['output_base']

    # =================================

    exo = extract_utility.ExtractObject()
    exo.set_vector_path(bnd_absolute)
    exo.set_base_path(data_path)

    # only categorical / encoded extracts carry a category map
    category_map = None
    if extract_type in ["categorical", "encoded"]:
        category_map = task_data['category_map']

    exo.set_extract_type(extract_type, category_map=category_map)

    # =================================

    output_dir = os.path.join(output_base, bnd_name, "cache", dataset_name)

    # create the output directory, tolerating one that already exists
    try:
        os.makedirs(output_dir)
    except OSError as exception:
        if exception.errno != errno.EEXIST:
            raise

    # =================================

    # generate raster path
    raster = data_path

    detail_template = ("\n\tvector: ({2}) {3}"
                       "\n\traster: ({4}) {5}"
                       "\n\tmethod: {6}")

    def report(headline, value):
        # Format first, then print. The previous
        # `print ("...").format(...)` form only works as a Python 2
        # print statement; under Python 3 it prints the raw template
        # and fails on `None.format`.
        print((headline + detail_template).format(
            worker_tagline, value, bnd_name, bnd_absolute,
            data_name, raster, extract_type))

    # run extract
    report("{0} running extract: ", None)

    run_data = exo.run_feature_extract(raster, pixel_limit=job.pixel_limit)

    # generate output path; the temporal label is whatever follows the
    # last underscore of data_name, with 'na' when that suffix is empty.
    # NOTE(review): a data_name without any underscore used to raise an
    # uncaught ValueError; it is now treated as 'na' as well - confirm.
    _, separator, suffix = data_name.rpartition('_')
    temporal = suffix if separator and suffix else 'na'

    file_name = '.'.join([dataset_name, temporal, exo._extract_type]) + ".csv"
    output = os.path.join(output_dir, file_name)

    run_data = exo.export_to_csv(run_data, output)
    # NOTE: the alternative exo.export_to_db(...) export is disabled.

    try:
        Te_start = int(time.time())
        # drain the result iterable so that every item is processed
        for _ in run_data:
            pass
        Te_run = int(time.time() - Te_start)

        extract_status = 1
        report("{0} completed extract in {1} seconds", Te_run)

    except MemoryError:
        extract_status = -2
        report("{0} memory error ({1})", extract_status)

    except Exception:
        extract_status = -1
        report("{0} unknown error ({1})", extract_status)

        exc_type, exc_value, exc_traceback = sys.exc_info()
        traceback.print_exception(exc_type, exc_value, exc_traceback,
                                  limit=2, file=sys.stdout)

    # update status of item in extract queue; one timestamp is shared by
    # both time fields
    finished_at = int(time.time())
    c_extracts.update_one({
        '_id': task_data['_id']
    }, {
        '$set': {
            'status': extract_status,
            'update_time': finished_at,
            'complete_time': finished_at
        }
    }, upsert=False)

    return extract_status