def run(path=None, client=None, version=None, config=None, generator="auto", update=False, dry_run=False): print '\n---------------------------------------' script = os.path.basename(__file__) def quit(reason): """quit script cleanly to do: - do error log stuff - output error logs somewhere - if auto, move job file to error location """ raise Exception("{0}: terminating script - {1}\n".format( script, reason)) if config is not None: client = config.client elif client is not None: config = client.info.config.findOne() else: quit('Neither config nor client provided.') version = config.versions["release-ingest"] # update mongo class instance dbu = MongoUpdate(client) # ------------------------------------- # check path if path is not None: if not os.path.exists(path): quit("Invalid path provided.") else: quit("No path provided") # optional arg - mainly for user to specify manual run if generator not in ['auto', 'manual']: quit("Invalid generator input") if client is None: quit("No mongodb client connection provided") if config is None: quit("No config object provided") raw_update = update if update in ["partial", "meta"]: update = "partial" elif update in ["update", True, 1, "True", "full", "all"]: update = "full" else: update = False print "running update status `{0}` (input: `{1}`)".format( update, raw_update) if dry_run in ["false", "False", "0", "None", "none", "no"]: dry_run = False dry_run = bool(dry_run) if dry_run: print "running dry run" existing_original = None if update: if not "data" in client.asdf.collection_names(): update = False msg = "Update specified but no data collection exists." if generator == "manual": raise Exception(msg) else: warn(msg) else: base_original = client.asdf.data.find_one({'base': path}) if base_original is None: update = False msg = "Update specified but no existing dataset found." 
if generator == "manual": raise Exception(msg) else: warn(msg) # init document doc = {} doc["asdf"] = {} doc["asdf"]["script"] = script doc["asdf"]["version"] = version doc["asdf"]["generator"] = generator doc["asdf"]["date_updated"] = str(datetime.date.today()) if not update: doc["asdf"]["date_added"] = str(datetime.date.today()) # ------------------------------------- if os.path.isdir(path): # remove trailing slash from path if path.endswith("/"): path = path[:-1] else: quit("Invalid base directory provided.") # ------------------------------------- doc['base'] = path doc["type"] = "release" doc["file_format"] = "release" doc["file_extension"] = "" doc["file_mask"] = "None" # ------------------------------------- # get release datapackage release_path = doc["base"] + '/datapackage.json' release_package = json.load(open(release_path, 'r')) core_fields = ['name', 'title', 'description', 'version'] doc["extras"] = {} for f in release_package.keys(): if f in core_fields: rkey = f.replace(" ", "_").lower() doc[f] = release_package[f] elif f == 'extras': for g in release_package['extras']: rkey = g['key'].replace(" ", "_").lower() doc['extras'][rkey] = g['value'] # updating these fields because # - current name is broken (not proper version) # - current title and description are not well suited for # general consumption via DET doc["extras"]["original_name"] = doc["name"] doc["extras"]["original_title"] = doc["title"] doc["extras"]["original_description"] = doc["description"] doc["name"] = "{0}_{1}_{2}_v{3}".format( doc["extras"]["data_set_preamble"].lower(), doc["extras"]["data_type"].lower(), doc["extras"]["processing_level"].lower(), str(doc["version"]).replace(".", "_")) preamble_word_list = re.findall('[A-Z](?:[A-Z]*(?![a-z])|[a-z]*)', doc["extras"]["data_set_preamble"]) clean_preamble_word_list = [ i for i in preamble_word_list if i not in ["AIMS"] ] clean_preamble = ' '.join(clean_preamble_word_list) doc["title"] = "{0} Geocoded Aid Data v{1}".format(clean_preamble, doc["version"]) doc["description"] = ( "Aid data from {0} {1}, geocoded and published by AidData. " "Covers projects from {2} to {3}. Version {4}.").format( clean_preamble, doc["extras"]["source_type"], doc["extras"]["temporal_start"], doc["extras"]["temporal_end"], doc["version"]) doc["extras"]["tags"] = [ "aiddata", "geocoded", "release", "aid", "economics", "welfare" ] is_active = doc["extras"]["data_set_preamble"] in config.release_iso3 doc["active"] = int(is_active) if update: name_original = client.asdf.data.find_one({'name': doc["name"]}) if name_original is None and base_original is None: update = False warn(("Update specified but no dataset with matching " "base ({0}) or name ({1}) was found").format( doc["base"], doc["name"])) # in case we ended up not finding a match for name doc["asdf"]["date_added"] = str(datetime.date.today()) elif name_original is not None and base_original is not None: if str(name_original['_id']) != str(base_original['_id']): quit("Update option specified but identifying fields (base " "and name) belong to different existing datasets." 
"\n\tBase: {0}\n\tName: {1}".format( doc["base"], doc["name"])) else: existing_original = name_original elif name_original is not None: existing_original = name_original elif base_original is not None: existing_original = base_original doc["asdf"]["date_added"] = existing_original["asdf"]["date_added"] # doc["active"] = existing_original["active"] # ------------------------------------- if update == "partial": print "\nProcessed document:" pprint(doc) print "\nUpdating database (dry run = {0})...".format(dry_run) if not dry_run: dbu.update(doc, update, existing_original) print "\n{0}: Done ({1} update).\n".format(script, update) return 0 # ------------------------------------- print "\nProcessing temporal..." # set temporal using release datapackage doc["temporal"] = {} doc["temporal"]["name"] = doc['extras']['temporal_name'] doc["temporal"]["format"] = "%Y" doc["temporal"]["type"] = "year" doc["temporal"]["start"] = doc['extras']['temporal_start'] doc["temporal"]["end"] = doc['extras']['temporal_end'] # ------------------------------------- print "\nProcessing spatial..." # get extemt loc_table_path = doc['base'] + "/data/locations.csv" env = ru.release_envelope(loc_table_path) env = ru.trim_envelope(env) print "Dataset bounding box: ", env doc["scale"] = ru.envelope_to_scale(env) # set spatial doc["spatial"] = ru.envelope_to_geom(env) # ------------------------------------- print '\nProcessing resources...' resource_tmp = { "name": doc['name'], "bytes": 0, "path": "", "start": doc["temporal"]['start'], "end": doc["temporal"]['end'] } # resource_order = ["name", "path", "bytes", "start", "end"] # resource_tmp = OrderedDict((k, resource_tmp[k]) for k in resource_order) resource_list = [resource_tmp] doc["resources"] = resource_list # ------------------------------------- # database updates print "\nProcessed document..." pprint(doc) print "\nUpdating database (dry run = {0})...".format(dry_run) if not dry_run: dbu.update(doc, update, existing_original) if update: print "\n{0}: Done ({1} update).\n".format(script, update) else: print "\n{0}: Done.\n".format(script) print '\n---------------------------------------\n' return 0
def run(path=None, client=None, version=None, config=None, generator="auto", update=False, dry_run=False): print '\n---------------------------------------' script = os.path.basename(__file__) def quit(reason): """quit script cleanly to do: - do error log stuff - output error logs somewhere - if auto, move job file to error location """ raise Exception("{0}: terminating script - {1}\n".format( script, reason)) if config is not None: client = config.client elif client is not None: config = client.info.config.findOne() else: quit('Neither config nor client provided.') version = config.versions["gadm-ingest"] # update mongo class instance dbu = MongoUpdate(client) # ------------------------------------- # check path if path is not None: if not os.path.exists(path): quit("Invalid path provided.") else: quit("No path provided") # optional arg - mainly for user to specify manual run if generator not in ['auto', 'manual']: quit("Invalid generator input") if client is None: quit("No mongodb client connection provided") if config is None: quit("No config object provided") raw_update = update if update in ["partial", "meta"]: update = "partial" elif update in ["update", True, 1, "True", "full", "all"]: update = "full" elif update in ["missing"]: update = "missing" else: update = False print "running update status `{0}` (input: `{1}`)".format( update, raw_update) if dry_run in ["false", "False", "0", "None", "none", "no"]: dry_run = False dry_run = bool(dry_run) if dry_run: print "running dry run" base_original = client.asdf.data.find_one({'base': path}) existing_original = None if update: if not "data" in client.asdf.collection_names(): update = False msg = "Update specified but no data collection exists." if generator == "manual": raise Exception(msg) else: warn(msg) else: if base_original is None and update != "missing": update = False msg = "Update specified but no existing dataset found." 
if generator == "manual": raise Exception(msg) else: warn(msg) # init document doc = {} doc["asdf"] = {} doc["asdf"]["script"] = script doc["asdf"]["version"] = version doc["asdf"]["generator"] = generator doc["asdf"]["date_updated"] = str(datetime.date.today()) if not update or update == "missing": doc["asdf"]["date_added"] = str(datetime.date.today()) # ------------------------------------- if os.path.isdir(path): # remove trailing slash from path if path.endswith("/"): path = path[:-1] else: quit("Invalid base directory provided.") # ------------------------------------- doc['base'] = path doc["type"] = "boundary" doc["file_format"] = "vector" doc["file_extension"] = "geojson" doc["file_mask"] = "None" # ------------------------------------- gadm_name = os.path.basename(doc["base"]) gadm_version = os.path.basename(os.path.dirname(path))[4:] gadm_iso3 = gadm_name[:3] gadm_adm = gadm_name[4:] parent = os.path.dirname(os.path.abspath(__file__)) gadm_lookup_path = parent + '/gadm_iso3.json' gadm_lookup = json.load(open(gadm_lookup_path, 'r')) gadm_country = unidecode(gadm_lookup[gadm_iso3]) doc["name"] = (gadm_iso3.lower() + "_" + gadm_adm.lower() + "_gadm" + gadm_version.replace('.', '')) inactive_bnds_list = config.inactive_bnds is_active = doc["name"] not in inactive_bnds_list doc["active"] = int(is_active) name_original = client.asdf.data.find_one({'name': doc["name"]}) if not update and base_original is not None: msg = "No update specified but dataset exists (base: {0})".format(base_original['base']) raise Exception(msg) elif not update and name_original is not None: msg = "No update specified but dataset exists (name: {0})".format(name_original['name']) raise Exception(msg) if update: if update == "missing" and name_original is not None and base_original is not None: warn("Dataset exists (running in 'missing' update mode). Running partial update and setting to active (if possible).") update = "partial" if update != "missing": if name_original is None and base_original is None: update = False warn(("Update specified but no dataset with matching " "base ({0}) or name ({1}) was found").format(doc["base"], doc["name"])) # in case we ended up not finding a match for name doc["asdf"]["date_added"] = str(datetime.date.today()) elif name_original is not None and base_original is not None: if str(name_original['_id']) != str(base_original['_id']): quit("Update option specified but identifying fields (base " "and name) belong to different existing datasets." 
"\n\tBase: {0}\n\tName: {1}".format(doc["base"], doc["name"])) else: existing_original = name_original elif name_original is not None: existing_original = name_original elif base_original is not None: existing_original = base_original doc["asdf"]["date_added"] = existing_original["asdf"]["date_added"] if existing_original["active"] == -1: doc["active"] = -1 doc["title"] = " ".join([gadm_country, gadm_adm.upper(), "Boundary - GADM", gadm_version]) doc["description"] = "PLACEHOLDER" doc["version"] = gadm_version doc["options"] = {} doc["options"]["group"] = (gadm_iso3.lower() + "_gadm" + gadm_version.replace('.', '')) doc["extras"] = {} doc["extras"]["citation"] = ("Global Administrative Areas " "(GADM) http://www.gadm.org.") doc["extras"]["sources_web"] = "http://www.gadm.org" doc["extras"]["sources_name"] = "Global Administrative Areas (GADM)" doc["extras"]["gadm_country"] = gadm_country doc["extras"]["gadm_iso3"] = gadm_iso3 doc["extras"]["gadm_adm"] = int(gadm_adm[-1:]) doc["extras"]["gadm_unit"] = "PLACEHOLDER" doc["extras"]["tags"] = ["gadm", gadm_adm, gadm_country] doc["options"]["group_title"] = "{0} GADM {1}".format(gadm_country, gadm_version) # boundary group if "adm0" in gadm_name.lower(): doc["options"]["group_class"] = "actual" doc["active"] = -1 else: doc["options"]["group_class"] = "sub" # ------------------------------------- # resource scan # find all files with file_extension in path file_list = [] for root, dirs, files in os.walk(doc["base"]): for fname in files: fname = os.path.join(root, fname) file_check = fname.endswith('.' + doc["file_extension"]) if file_check == True and not fname.endswith('simplified.geojson'): file_list.append(fname) if len(file_list) == 0: quit("No vector file found in " + doc["base"]) elif len(file_list) > 1: quit("Boundaries must be submitted individually.") f = file_list[0] print f # get adm unit name for country and add to gadm info and description if gadm_adm.lower() == "adm0": gadm_unit = "Country" else: with fiona.open(f, 'r') as tmp_feature_src: tmp_feature = tmp_feature_src[0] gadm_unit = tmp_feature['properties']['ENGTYPE_'+ gadm_adm[-1:]] doc["extras"]["gadm_unit"] = gadm_unit if gadm_unit not in [None, "Unknown"]: doc["extras"]["tags"].append(gadm_unit) doc["description"] = "GADM Boundary File for {0} ({1}) in {2}.".format( gadm_adm.upper(), gadm_unit, gadm_country) # ------------------------------------- if update == "partial": print "\nProcessed document:" pprint(doc) print "\nUpdating database (dry run = {0})...".format(dry_run) if not dry_run: dbu.update(doc, update, existing_original) print "\n{0}: Done ({1} update).\n".format(script, update) return 0 # ------------------------------------- print "\nProcessing temporal..." # temporally invariant dataset doc["temporal"] = {} doc["temporal"]["name"] = "Temporally Invariant" doc["temporal"]["format"] = "None" doc["temporal"]["type"] = "None" doc["temporal"]["start"] = 10000101 doc["temporal"]["end"] = 99991231 # ------------------------------------- print "\nProcessing spatial..." if not dry_run: convert_status = ru.add_asdf_id(f) if convert_status == 1: quit("Error adding ad_id to boundary file & outputting geojson.") env = ru.vector_envelope(f) env = ru.trim_envelope(env) print "Dataset bounding box: ", env doc["scale"] = ru.envelope_to_scale(env) # set spatial doc["spatial"] = ru.envelope_to_geom(env) # ------------------------------------- print '\nProcessing resources...' 
# resources # individual resource info resource_tmp = {} # path relative to base resource_tmp["path"] = f[f.index(doc["base"]) + len(doc["base"]) + 1:] resource_tmp["name"] = doc["name"] resource_tmp["bytes"] = os.path.getsize(f) resource_tmp["start"] = 10000101 resource_tmp["end"] = 99991231 # reorder resource fields # resource_order = ["name", "path", "bytes", "start", "end"] # resource_tmp = OrderedDict((k, resource_tmp[k]) for k in resource_order) # update main list resource_list = [resource_tmp] doc["resources"] = resource_list # ------------------------------------- # database updates print "\nProcessed document:" pprint(doc) print "\nUpdating database (dry run = {0})...".format(dry_run) if not dry_run: dbu.update(doc, update, existing_original) try: dbu.features_to_mongo(doc['name']) except: # could remove data entry if it cannot be added # to mongo. or, at least make sure the data entry is # set to inactive raise if update: print "\n{0}: Done ({1} update).\n".format(script, update) else: print "\n{0}: Done.\n".format(script) print '\n---------------------------------------\n' return 0
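
# ---------------------------------------------------------------------------
# Illustrative sketch (documentation only): how this script derives its
# identifiers from the GADM directory layout. The parent directory is
# expected to be named "gadm<version>" and the dataset directory
# "<ISO3>_ADM<level>". The path components below are hypothetical.

def _example_parse_gadm_path(base):
    """Mirror of the gadm_* parsing above, for a hypothetical base path."""
    gadm_name = os.path.basename(base)                          # "KEN_ADM1"
    gadm_version = os.path.basename(os.path.dirname(base))[4:]  # "2.8"
    gadm_iso3 = gadm_name[:3]                                   # "KEN"
    gadm_adm = gadm_name[4:]                                    # "ADM1"
    dataset_name = (gadm_iso3.lower() + "_" + gadm_adm.lower() +
                    "_gadm" + gadm_version.replace('.', ''))
    return gadm_iso3, gadm_adm, gadm_version, dataset_name

# _example_parse_gadm_path("/path/to/gadm2.8/KEN_ADM1")
# -> ('KEN', 'ADM1', '2.8', 'ken_adm1_gadm28')
# ---------------------------------------------------------------------------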
def run(path=None, client=None, version=None, config=None, generator="auto", update=False, dry_run=False): print '\n---------------------------------------' script = os.path.basename(__file__) def quit(reason): """quit script cleanly to do: - do error log stuff - output error logs somewhere - if auto, move job file to error location """ raise Exception("{0}: terminating script - {1}\n".format( script, reason)) if config is not None: client = config.client elif client is not None: config = client.info.config.findOne() else: quit('Neither config nor client provided.') version = config.versions["release-ingest"] # update mongo class instance dbu = MongoUpdate(client) # ------------------------------------- # check path if path is not None: if not os.path.exists(path): quit("Invalid path provided.") else: quit("No path provided") # optional arg - mainly for user to specify manual run if generator not in ['auto', 'manual']: quit("Invalid generator input") if client is None: quit("No mongodb client connection provided") if config is None: quit("No config object provided") raw_update = update if update in ["partial", "meta"]: update = "partial" elif update in ["update", True, 1, "True", "full", "all"]: update = "full" else: update = False print "running update status `{0}` (input: `{1}`)".format( update, raw_update) if dry_run in ["false", "False", "0", "None", "none", "no"]: dry_run = False dry_run = bool(dry_run) if dry_run: print "running dry run" existing_original = None if update: if not "data" in client.asdf.collection_names(): update = False msg = "Update specified but no data collection exists." if generator == "manual": raise Exception(msg) else: warn(msg) else: base_original = client.asdf.data.find_one({'base': path}) if base_original is None: update = False msg = "Update specified but no existing dataset found." 
if generator == "manual": raise Exception(msg) else: warn(msg) # init document doc = {} doc["asdf"] = {} doc["asdf"]["script"] = script doc["asdf"]["version"] = version doc["asdf"]["generator"] = generator doc["asdf"]["date_updated"] = str(datetime.date.today()) if not update: doc["asdf"]["date_added"] = str(datetime.date.today()) # ------------------------------------- if os.path.isdir(path): # remove trailing slash from path if path.endswith("/"): path = path[:-1] else: quit("Invalid base directory provided.") # ------------------------------------- doc['base'] = path doc["type"] = "release" doc["file_format"] = "release" doc["file_extension"] = "" doc["file_mask"] = "None" # ------------------------------------- # get release datapackage release_path = doc["base"] + '/datapackage.json' release_package = json.load(open(release_path, 'r')) core_fields = ['name', 'title', 'description', 'version'] doc["extras"] = {} for f in release_package.keys(): if f in core_fields: rkey = f.replace(" ", "_").lower() doc[f] = release_package[f] elif f == 'extras': for g in release_package['extras']: rkey = g['key'].replace(" ", "_").lower() doc['extras'][rkey] = g['value'] # updating these fields because # - current name is broken (not proper version) # - current title and description are not well suited for # general consumption via DET doc["extras"]["original_name"] = doc["name"] doc["extras"]["original_title"] = doc["title"] doc["extras"]["original_description"] = doc["description"] doc["name"] = "{0}_{1}_{2}_v{3}".format( doc["extras"]["data_set_preamble"].lower(), doc["extras"]["data_type"].lower(), doc["extras"]["processing_level"].lower(), str(doc["version"]).replace(".", "_")) preamble_word_list = re.findall( '[A-Z](?:[A-Z]*(?![a-z])|[a-z]*)', doc["extras"]["data_set_preamble"]) clean_preamble_word_list = [i for i in preamble_word_list if i not in ["AIMS"]] clean_preamble = ' '.join(clean_preamble_word_list) if doc["title"] == "{} {}".format(doc["extras"]["data_set_preamble"], doc["extras"]["data_type"]): doc["title"] = "{0} Geocoded Aid Data v{1}".format(clean_preamble, doc["version"]) auto_description = ( "Aid data from {0} {1}, geocoded and published by AidData. " "Covers projects from {2} to {3}. Version {4}.").format( clean_preamble, doc["extras"]["source_type"], doc["extras"]["temporal_start"], doc["extras"]["temporal_end"], doc["version"]) if doc["description"] == "This is a sample description": doc["description"] = auto_description doc["extras"]["tags"] = ["aiddata", "geocoded", "release", "aid", "economics", "welfare"] is_active = doc["extras"]["data_set_preamble"] in config.release_iso3 doc["active"] = int(is_active) if update: name_original = client.asdf.data.find_one({'name': doc["name"]}) if name_original is None and base_original is None: update = False warn(("Update specified but no dataset with matching " "base ({0}) or name ({1}) was found").format(doc["base"], doc["name"])) # in case we ended up not finding a match for name doc["asdf"]["date_added"] = str(datetime.date.today()) elif name_original is not None and base_original is not None: if str(name_original['_id']) != str(base_original['_id']): quit("Update option specified but identifying fields (base " "and name) belong to different existing datasets." 
"\n\tBase: {0}\n\tName: {1}".format(doc["base"], doc["name"])) else: existing_original = name_original elif name_original is not None: existing_original = name_original elif base_original is not None: existing_original = base_original doc["asdf"]["date_added"] = existing_original["asdf"]["date_added"] # doc["active"] = existing_original["active"] # ------------------------------------- if update == "partial": print "\nProcessed document:" pprint(doc) print "\nUpdating database (dry run = {0})...".format(dry_run) if not dry_run: dbu.update(doc, update, existing_original) print "\n{0}: Done ({1} update).\n".format(script, update) return 0 # ------------------------------------- print "\nProcessing temporal..." # set temporal using release datapackage doc["temporal"] = {} doc["temporal"]["name"] = doc['extras']['temporal_name'] doc["temporal"]["format"] = "%Y" doc["temporal"]["type"] = "year" doc["temporal"]["start"] = doc['extras']['temporal_start'] doc["temporal"]["end"] = doc['extras']['temporal_end'] # ------------------------------------- print "\nProcessing spatial..." # get extemt loc_table_path = doc['base'] + "/data/locations.csv" env = ru.release_envelope(loc_table_path) env = ru.trim_envelope(env) print "Dataset bounding box: ", env doc["scale"] = ru.envelope_to_scale(env) # set spatial doc["spatial"] = ru.envelope_to_geom(env) # ------------------------------------- print '\nProcessing resources...' resource_tmp = { "name": doc['name'], "bytes": 0, "path": "", "start": doc["temporal"]['start'], "end": doc["temporal"]['end'] } # resource_order = ["name", "path", "bytes", "start", "end"] # resource_tmp = OrderedDict((k, resource_tmp[k]) for k in resource_order) resource_list = [resource_tmp] doc["resources"] = resource_list # ------------------------------------- # database updates print "\nProcessed document..." pprint(doc) print "\nUpdating database (dry run = {0})...".format(dry_run) if not dry_run: dbu.update(doc, update, existing_original) if update: print "\n{0}: Done ({1} update).\n".format(script, update) else: print "\n{0}: Done.\n".format(script) print '\n---------------------------------------\n' return 0
def run(path=None, client=None, version=None, config=None, generator="auto", update=False, dry_run=False): print '\n---------------------------------------' script = os.path.basename(__file__) def quit(reason): """quit script cleanly to do: - do error log stuff - output error logs somewhere - if auto, move job file to error location """ raise Exception("{0}: terminating script - {1}\n".format( script, reason)) if config is not None: client = config.client elif client is not None: config = client.info.config.findOne() else: quit('Neither config nor client provided.') version = config.versions["boundary-ingest"] # update mongo class instance dbu = MongoUpdate(client) # ------------------------------------- # check path if path is not None: if not os.path.exists(path): quit("Invalid path provided.") else: quit("No path provided") # optional arg - mainly for user to specify manual run if generator not in ['auto', 'manual']: quit("Invalid generator input") if client is None: quit("No mongodb client connection provided") if config is None: quit("No config object provided") raw_update = update if update in ["partial", "meta"]: update = "partial" elif update in ["update", True, 1, "True", "full", "all"]: update = "full" else: update = False print "running update status `{0}` (input: `{1}`)".format( update, raw_update) if dry_run in ["false", "False", "0", "None", "none", "no"]: dry_run = False dry_run = bool(dry_run) if dry_run: print "running dry run" # init document doc = {} doc["asdf"] = {} doc["asdf"]["script"] = script doc["asdf"]["version"] = version doc["asdf"]["generator"] = generator doc["asdf"]["date_updated"] = str(datetime.date.today()) if not update: doc["asdf"]["date_added"] = str(datetime.date.today()) # ------------------------------------- # get inputs if os.path.isfile(path): data = json.load(open(path, 'r')) else: quit("invalid input file path") required_core_fields = [ "base", "type", "file_extension", "file_mask", "name", "title", "description", "version", "active" ] missing_core_fields = [i for i in required_core_fields if i not in data] if len(missing_core_fields) > 0: quit("Missing core fields ({0})".format(missing_core_fields)) existing_original = None if update: if not "data" in client.asdf.collection_names(): update = False msg = "Update specified but no data collection exists." if generator == "manual": raise Exception(msg) else: warn(msg) else: base_original = client.asdf.data.find_one({'base': data["base"]}) if base_original is None: update = False msg = "Update specified but no existing dataset found." if generator == "manual": raise Exception(msg) else: warn(msg) # ------------------------------------- # validate class instance v = ValidationTools(client) # validate base path valid_base = v.base(data["base"], update) if not valid_base.isvalid: quit(valid_base.error) doc["base"] = valid_base.value base_exists = valid_base.data['exists'] # validate name valid_name = v.name(data["name"], update) if not valid_name.isvalid: quit(valid_name.error) doc["name"] = valid_name.value name_exists = valid_name.data['exists'] if update: if not base_exists and not name_exists: warn(("Update specified but no dataset with matching " "base ({0}) or name ({1}) was found").format( doc["base"], doc["name"])) elif base_exists and name_exists: base_id = str(valid_base.data['search']['_id']) name_id = str(valid_name.data['search']['_id']) if base_id != name_id: quit("Update option specified but identifying fields (base " "and name) belong to different existing datasets." 
"\n\tBase: {0}\n\tName: {1}".format( doc["base"], doc["name"])) else: existing_original = valid_name.data['search'] elif name_exists: existing_original = valid_name.data['search'] elif base_exists: existing_original = valid_base.data['search'] doc["asdf"]["date_added"] = existing_original["asdf"]["date_added"] # doc["active"] = existing_original["active"] # validate type and set file_format valid_type = v.data_type(data["type"]) if not valid_type.isvalid: quit(valid_type.error) doc["type"] = valid_type.value doc["file_format"] = valid_type.data["file_format"] if doc["type"] != "boundary": quit("Invalid type ({0}), must be boundary.".format(doc["type"])) # validate file extension (validation depends on file format) valid_extension = v.file_extension(data["file_extension"], doc["file_format"]) if not valid_extension.isvalid: quit(valid_extension.error) doc["file_extension"] = valid_extension.value # validate title, description and version doc["title"] = str(data["title"]) doc["description"] = str(data["description"]) doc["version"] = str(data["version"]) doc["active"] = int(data["active"]) # validate options for raster if not "options" in data: quit("Missing options lookup") required_options = ["group", "group_title", "group_class"] missing_options = [i for i in required_options if i not in data["options"]] if len(missing_options) > 0: quit( "Missing fields from options lookup ({0})".format(missing_options)) doc["options"] = {} ### warn("Current group checks for boundary do not cover all potential cases " "(e.g., geometry changes to group actual, various conflicts based " "group_class, existing groups, etc.).") # validate group info valid_group = v.group(data["options"]["group"], data["options"]["group_class"]) if not valid_group.isvalid: quit(valid_group.error) doc["options"]["group"] = valid_group.value doc["options"]["group_class"] = data["options"]["group_class"] doc["options"]["group_title"] = data["options"]["group_title"] ### # extras if not "extras" in data: print( "Although fields in extras are not required, it may contain " "commonly used field which should be added whenever possible " "(example: sources_web field)") doc["extras"] = {} elif not isinstance(data["extras"], dict): quit("Invalid instance of extras ({0}) of type: {1}".format( data["extras"], type(data["extras"]))) else: doc["extras"] = data["extras"] if not "tags" in doc["extras"]: doc["extras"]["tags"] = [] if not "boundary" in doc["extras"]["tags"]: doc["extras"]["tags"].append("boundary") # ------------------------------------- # resource scan # find all files with file_extension in path file_list = [] for root, dirs, files in os.walk(doc["base"]): for fname in files: fname = os.path.join(root, fname) file_check = fname.endswith('.' + doc["file_extension"]) if file_check == True and not fname.endswith('simplified.geojson'): file_list.append(fname) if len(file_list) == 0: quit("No vector file found in " + doc["base"]) elif len(file_list) > 1: quit("Boundaries must be submitted individually.") f = file_list[0] print f # ------------------------------------- if update == "partial": print "\nProcessed document:" pprint(doc) print "\nUpdating database (dry run = {0})...".format(dry_run) if not dry_run: dbu.update(doc, update, existing_original) print "\n{0}: Done ({1} update).\n".format(script, update) return 0 # ------------------------------------- print "\nProcessing temporal..." 
# temporally invariant dataset doc["temporal"] = {} doc["temporal"]["name"] = "Temporally Invariant" doc["temporal"]["format"] = "None" doc["temporal"]["type"] = "None" doc["temporal"]["start"] = 10000101 doc["temporal"]["end"] = 99991231 # ------------------------------------- print "\nProcessing spatial..." if not dry_run: convert_status = ru.add_asdf_id(f) if convert_status == 1: quit("Error adding ad_id to boundary file & outputting geojson.") env = ru.vector_envelope(f) env = ru.trim_envelope(env) print "Dataset bounding box: ", env doc["scale"] = ru.envelope_to_scale(env) # set spatial doc["spatial"] = ru.envelope_to_geom(env) # ------------------------------------- print '\nProcessing resources...' # resources # individual resource info resource_tmp = {} # path relative to base resource_tmp["path"] = f[f.index(doc["base"]) + len(doc["base"]) + 1:] resource_tmp["name"] = doc["name"] resource_tmp["bytes"] = os.path.getsize(f) resource_tmp["start"] = 10000101 resource_tmp["end"] = 99991231 # reorder resource fields # resource_order = ["name", "path", "bytes", "start", "end"] # resource_tmp = OrderedDict((k, resource_tmp[k]) for k in resource_order) # update main list resource_list = [resource_tmp] doc["resources"] = resource_list # ------------------------------------- # database updates print "\nProcessed document..." pprint(doc) print "\nUpdating database (dry run = {0})...".format(dry_run) if not dry_run: dbu.update(doc, update, existing_original) try: dbu.features_to_mongo(doc['name']) except: # could remove data entry if it cannot be added # to mongo. or, at least make sure the data entry is # set to inactive raise if update: print "\n{0}: Done ({1} update).\n".format(script, update) else: print "\n{0}: Done.\n".format(script) print '\n---------------------------------------\n' return 0
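
# ---------------------------------------------------------------------------
# Illustrative sketch (documentation only): shape of the ingest json consumed
# by this manual boundary script. All values are hypothetical; the required
# keys mirror `required_core_fields` and `required_options` above.

_example_boundary_ingest = {
    "base": "/path/to/boundaries/example_adm1",
    "type": "boundary",
    "file_extension": "geojson",
    "file_mask": "None",
    "name": "example_adm1",
    "title": "Example ADM1 Boundary",
    "description": "Example first-level administrative boundaries.",
    "version": "1.0",
    "active": 1,
    "options": {
        "group": "example_adm",
        "group_title": "Example Boundaries",
        "group_class": "sub"            # "actual" for the canonical boundary
    },
    "extras": {
        "sources_web": "http://example.org",
        "tags": ["example"]             # "boundary" is appended automatically
    }
}
# ---------------------------------------------------------------------------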
def run(path=None, client=None, version=None, config=None, generator="auto", update=False, dry_run=False): print '\n---------------------------------------' script = os.path.basename(__file__) def quit(reason): """quit script cleanly to do: - do error log stuff - output error logs somewhere - if auto, move job file to error location """ raise Exception("{0}: terminating script - {1}\n".format( script, reason)) if config is not None: client = config.client elif client is not None: config = client.info.config.findOne() else: quit('Neither config nor client provided.') version = config.versions["boundary-ingest"] # update mongo class instance dbu = MongoUpdate(client) # ------------------------------------- # check path if path is not None: if not os.path.exists(path): quit("Invalid path provided.") else: quit("No path provided") # optional arg - mainly for user to specify manual run if generator not in ['auto', 'manual']: quit("Invalid generator input") if client is None: quit("No mongodb client connection provided") if config is None: quit("No config object provided") raw_update = update if update in ["partial", "meta"]: update = "partial" elif update in ["update", True, 1, "True", "full", "all"]: update = "full" else: update = False print "running update status `{0}` (input: `{1}`)".format( update, raw_update) if dry_run in ["false", "False", "0", "None", "none", "no"]: dry_run = False dry_run = bool(dry_run) if dry_run: print "running dry run" # init document doc = {} doc["asdf"] = {} doc["asdf"]["script"] = script doc["asdf"]["version"] = version doc["asdf"]["generator"] = generator doc["asdf"]["date_updated"] = str(datetime.date.today()) if not update: doc["asdf"]["date_added"] = str(datetime.date.today()) # ------------------------------------- # get inputs if os.path.isfile(path): data = json.load(open(path, 'r')) else: quit("invalid input file path") required_core_fields = [ "base", "type", "file_extension", "file_mask", "name", "title", "description", "version", "active" ] missing_core_fields = [i for i in required_core_fields if i not in data] if len(missing_core_fields) > 0: quit("Missing core fields ({0})".format(missing_core_fields)) existing_original = None if update: if not "data" in client.asdf.collection_names(): update = False msg = "Update specified but no data collection exists." if generator == "manual": raise Exception(msg) else: warn(msg) else: base_original = client.asdf.data.find_one({'base': data["base"]}) if base_original is None: update = False msg = "Update specified but no existing dataset found." if generator == "manual": raise Exception(msg) else: warn(msg) # ------------------------------------- # validate class instance v = ValidationTools(client) # validate base path valid_base = v.base(data["base"], update) if not valid_base.isvalid: quit(valid_base.error) doc["base"] = valid_base.value base_exists = valid_base.data['exists'] # validate name valid_name = v.name(data["name"], update) if not valid_name.isvalid: quit(valid_name.error) doc["name"] = valid_name.value name_exists = valid_name.data['exists'] if update: if not base_exists and not name_exists: warn(("Update specified but no dataset with matching " "base ({0}) or name ({1}) was found").format(doc["base"], doc["name"])) elif base_exists and name_exists: base_id = str(valid_base.data['search']['_id']) name_id = str(valid_name.data['search']['_id']) if base_id != name_id: quit("Update option specified but identifying fields (base " "and name) belong to different existing datasets." 
"\n\tBase: {0}\n\tName: {1}".format(doc["base"], doc["name"])) else: existing_original = valid_name.data['search'] elif name_exists: existing_original = valid_name.data['search'] elif base_exists: existing_original = valid_base.data['search'] doc["asdf"]["date_added"] = existing_original["asdf"]["date_added"] # doc["active"] = existing_original["active"] # validate type and set file_format valid_type = v.data_type(data["type"]) if not valid_type.isvalid: quit(valid_type.error) doc["type"] = valid_type.value doc["file_format"] = valid_type.data["file_format"] if doc["type"] != "boundary": quit("Invalid type ({0}), must be boundary.".format(doc["type"])) # validate file extension (validation depends on file format) valid_extension = v.file_extension(data["file_extension"], doc["file_format"]) if not valid_extension.isvalid: quit(valid_extension.error) doc["file_extension"] = valid_extension.value # validate title, description and version doc["title"] = str(data["title"]) doc["description"] = str(data["description"]) doc["version"] = str(data["version"]) doc["active"] = int(data["active"]) # validate options for raster if not "options" in data: quit("Missing options lookup") required_options = ["group", "group_title", "group_class"] missing_options = [i for i in required_options if i not in data["options"]] if len(missing_options) > 0: quit("Missing fields from options lookup ({0})".format( missing_options)) doc["options"] = {} ### warn("Current group checks for boundary do not cover all potential cases " "(e.g., geometry changes to group actual, various conflicts based " "group_class, existing groups, etc.).") # validate group info valid_group = v.group(data["options"]["group"], data["options"]["group_class"]) if not valid_group.isvalid: quit(valid_group.error) doc["options"]["group"] = valid_group.value doc["options"]["group_class"] = data["options"]["group_class"] doc["options"]["group_title"] = data["options"]["group_title"] ### # extras if not "extras" in data: print("Although fields in extras are not required, it may contain " "commonly used field which should be added whenever possible " "(example: sources_web field)") doc["extras"] = {} elif not isinstance(data["extras"], dict): quit("Invalid instance of extras ({0}) of type: {1}".format( data["extras"], type(data["extras"]))) else: doc["extras"] = data["extras"] if not "tags" in doc["extras"]: doc["extras"]["tags"] = [] if not "boundary" in doc["extras"]["tags"]: doc["extras"]["tags"].append("boundary") # ------------------------------------- # resource scan # find all files with file_extension in path file_list = [] for root, dirs, files in os.walk(doc["base"]): for fname in files: fname = os.path.join(root, fname) file_check = fname.endswith('.' + doc["file_extension"]) if file_check == True and not fname.endswith('simplified.geojson'): file_list.append(fname) if len(file_list) == 0: quit("No vector file found in " + doc["base"]) elif len(file_list) > 1: quit("Boundaries must be submitted individually.") f = file_list[0] print f # ------------------------------------- if update == "partial": print "\nProcessed document:" pprint(doc) print "\nUpdating database (dry run = {0})...".format(dry_run) if not dry_run: dbu.update(doc, update, existing_original) print "\n{0}: Done ({1} update).\n".format(script, update) return 0 # ------------------------------------- print "\nProcessing temporal..." 
# temporally invariant dataset doc["temporal"] = {} doc["temporal"]["name"] = "Temporally Invariant" doc["temporal"]["format"] = "None" doc["temporal"]["type"] = "None" doc["temporal"]["start"] = 10000101 doc["temporal"]["end"] = 99991231 # ------------------------------------- print "\nProcessing spatial..." if not dry_run: convert_status = ru.add_asdf_id(f) if convert_status == 1: quit("Error adding ad_id to boundary file & outputting geojson.") env = ru.vector_envelope(f) env = ru.trim_envelope(env) print "Dataset bounding box: ", env doc["scale"] = ru.envelope_to_scale(env) # set spatial doc["spatial"] = ru.envelope_to_geom(env) # ------------------------------------- print '\nProcessing resources...' # resources # individual resource info resource_tmp = {} # path relative to base resource_tmp["path"] = f[f.index(doc["base"]) + len(doc["base"]) + 1:] resource_tmp["name"] = doc["name"] resource_tmp["bytes"] = os.path.getsize(f) resource_tmp["start"] = 10000101 resource_tmp["end"] = 99991231 # reorder resource fields # resource_order = ["name", "path", "bytes", "start", "end"] # resource_tmp = OrderedDict((k, resource_tmp[k]) for k in resource_order) # update main list resource_list = [resource_tmp] doc["resources"] = resource_list # ------------------------------------- # database updates print "\nProcessed document..." pprint(doc) print "\nUpdating database (dry run = {0})...".format(dry_run) if not dry_run: dbu.update(doc, update, existing_original) # try: # dbu.features_to_mongo(doc['name']) # except: # # could remove data entry if it cannot be added # # to mongo. or, at least make sure the data entry is # # set to inactive # raise if update: print "\n{0}: Done ({1} update).\n".format(script, update) else: print "\n{0}: Done.\n".format(script) print '\n---------------------------------------\n' return 0
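
# ---------------------------------------------------------------------------
# Illustrative sketch (documentation only): the resource scan above expects
# exactly one "*.geojson" file under base (anything ending in
# "simplified.geojson" is ignored) and records its path relative to base.
# The helper below mirrors that logic for a hypothetical base directory.

def _example_scan_single_vector(base, file_extension="geojson"):
    matches = []
    for root, dirs, files in os.walk(base):
        for fname in files:
            full = os.path.join(root, fname)
            if (full.endswith('.' + file_extension) and
                    not full.endswith('simplified.geojson')):
                matches.append(full)
    if len(matches) != 1:
        raise Exception("Expected exactly one vector file, found {0}".format(
            len(matches)))
    f = matches[0]
    relative_path = f[f.index(base) + len(base) + 1:]
    return f, relative_path
# ---------------------------------------------------------------------------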
def run(path=None, client=None, version=None, config=None, generator="auto", update=False, dry_run=False): print '\n---------------------------------------' script = os.path.basename(__file__) def quit(reason): """quit script cleanly to do: - do error log stuff - output error logs somewhere - if auto, move job file to error location """ raise Exception("{0}: terminating script - {1}\n".format( script, reason)) if config is not None: client = config.client elif client is not None: config = client.info.config.findOne() else: quit('Neither config nor client provided.') # update mongo class instance dbu = MongoUpdate(client) # ------------------------------------- # check path if path is not None: if not os.path.exists(path): quit("Invalid path provided.") else: quit("No path provided") # optional arg - mainly for user to specify manual run if generator not in ['auto', 'manual']: quit("Invalid generator input") if client is None: quit("No mongodb client connection provided") if config is None: quit("No config object provided") raw_update = update if update in ["partial", "meta"]: update = "partial" elif update in ["update", True, 1, "True", "full", "all"]: update = "full" elif update in ["missing"]: update = "missing" else: update = False print "running update status `{0}` (input: `{1}`)".format( update, raw_update) if dry_run in ["false", "False", "0", "None", "none", "no"]: dry_run = False dry_run = bool(dry_run) if dry_run: print "running dry run" base_original = client.asdf.data.find_one({'base': path}) existing_original = None if update: if not "data" in client.asdf.collection_names(): update = False msg = "Update specified but no data collection exists." if generator == "manual": raise Exception(msg) else: warn(msg) else: if base_original is None and update != "missing": update = False msg = "Update specified but no existing dataset found." 
if generator == "manual": raise Exception(msg) else: warn(msg) # init document doc = {} doc["asdf"] = {} doc["asdf"]["script"] = script doc["asdf"]["version"] = version doc["asdf"]["generator"] = generator doc["asdf"]["date_updated"] = str(datetime.date.today()) if not update or update == "missing": doc["asdf"]["date_added"] = str(datetime.date.today()) # ------------------------------------- if os.path.isdir(path): # remove trailing slash from path if path.endswith("/"): path = path[:-1] else: quit("Invalid base directory provided.") # ------------------------------------- doc['base'] = path doc["type"] = "boundary" doc["file_format"] = "vector" doc["file_extension"] = "geojson" doc["file_mask"] = "None" # ------------------------------------- name = os.path.basename(doc["base"]) iso3 = name[:3] adm = name[4:] metadata_path = os.path.join(path, 'metadata.json') metadata = json.load(open(metadata_path, 'r')) country = unidecode(metadata["country"]) doc["name"] = iso3.lower() + "_" + adm.lower() + "_gb_" + version inactive_bnds_list = config.inactive_bnds is_active = doc["name"] not in inactive_bnds_list doc["active"] = int(is_active) name_original = client.asdf.data.find_one({'name': doc["name"]}) if not update and base_original is not None: msg = "No update specified but dataset exists (base: {0})".format( base_original['base']) raise Exception(msg) elif not update and name_original is not None: msg = "No update specified but dataset exists (name: {0})".format( name_original['name']) raise Exception(msg) if update: if update == "missing" and name_original is not None and base_original is not None: warn( "Dataset exists (running in 'missing' update mode). Running partial update and setting to active (if possible)." ) update = "partial" if update != "missing": if name_original is None and base_original is None: update = False warn(("Update specified but no dataset with matching " "base ({0}) or name ({1}) was found").format( doc["base"], doc["name"])) # in case we ended up not finding a match for name doc["asdf"]["date_added"] = str(datetime.date.today()) elif name_original is not None and base_original is not None: if str(name_original['_id']) != str(base_original['_id']): quit( "Update option specified but identifying fields (base " "and name) belong to different existing datasets." "\n\tBase: {0}\n\tName: {1}".format( doc["base"], doc["name"])) else: existing_original = name_original elif name_original is not None: existing_original = name_original elif base_original is not None: existing_original = base_original doc["asdf"]["date_added"] = existing_original["asdf"]["date_added"] if existing_original["active"] == -1: doc["active"] = -1 doc["title"] = "{} {} - GeoBoundaries v{}".format( country, adm.upper(), version.replace("_", ".")) doc["description"] = "PLACEHOLDER" doc["version"] = version doc["options"] = {} doc["options"]["group"] = iso3.lower() + "_gb_" + version doc["options"]["group_title"] = "{} - GeoBoundaries v{}".format( country, version.replace("_", ".")) doc["extras"] = {} doc["extras"]["citation"] = ( 'Seitz, L., Lv, Z., Goodman, S., Runfola, D. ' '"Chapter 3: GeoBoundaries - A Global, Redistributable Map of Administrative Zones." ' 'GeoQuery User\'s Guide. Ed. Dan Runfola Ariel BenYishay, Seth Goodman. 
' 'Williamsburg, Va: AidData, 2018.') doc["extras"]["sources_web"] = "http://www.geoboundaries.org" doc["extras"]["sources_name"] = "AidData GeoBoundaries" doc["extras"]["country"] = country doc["extras"]["iso3"] = iso3 doc["extras"]["adm"] = int(adm[-1:]) doc["extras"]["tags"] = ["geoboundaries", adm, country, iso3] # boundary group if "adm0" in name.lower(): doc["options"]["group_class"] = "actual" doc["active"] = -1 else: doc["options"]["group_class"] = "sub" # ------------------------------------- # resource scan # find all files with file_extension in path file_list = [] for root, dirs, files in os.walk(doc["base"]): for fname in files: fname = os.path.join(root, fname) file_check = fname.endswith('.' + doc["file_extension"]) if file_check == True and not fname.endswith('simplified.geojson'): file_list.append(fname) if len(file_list) == 0: quit("No vector file found in " + doc["base"]) elif len(file_list) > 1: quit("Boundaries must be submitted individually.") f = file_list[0] print f doc["description"] = "GeoBoundaries boundary file for {} in {}.".format( adm.upper(), country) # ------------------------------------- if update == "partial": print "\nProcessed document:" pprint(doc) print "\nUpdating database (dry run = {0})...".format(dry_run) if not dry_run: dbu.update(doc, update, existing_original) print "\n{0}: Done ({1} update).\n".format(script, update) return 0 # ------------------------------------- print "\nProcessing temporal..." # temporally invariant dataset doc["temporal"] = {} doc["temporal"]["name"] = "Temporally Invariant" doc["temporal"]["format"] = "None" doc["temporal"]["type"] = "None" doc["temporal"]["start"] = 10000101 doc["temporal"]["end"] = 99991231 # ------------------------------------- print "\nProcessing spatial..." if not dry_run: convert_status = ru.add_asdf_id(f) if convert_status == 1: quit("Error adding ad_id to boundary file & outputting geojson.") env = ru.vector_envelope(f) env = ru.trim_envelope(env) print "Dataset bounding box: ", env doc["scale"] = ru.envelope_to_scale(env) # set spatial doc["spatial"] = ru.envelope_to_geom(env) # ------------------------------------- print '\nProcessing resources...' # resources # individual resource info resource_tmp = {} # path relative to base resource_tmp["path"] = f[f.index(doc["base"]) + len(doc["base"]) + 1:] resource_tmp["name"] = doc["name"] resource_tmp["bytes"] = os.path.getsize(f) resource_tmp["start"] = 10000101 resource_tmp["end"] = 99991231 # reorder resource fields # resource_order = ["name", "path", "bytes", "start", "end"] # resource_tmp = OrderedDict((k, resource_tmp[k]) for k in resource_order) # update main list resource_list = [resource_tmp] doc["resources"] = resource_list # ------------------------------------- # database updates print "\nProcessed document:" pprint(doc) print "\nUpdating database (dry run = {0})...".format(dry_run) if not dry_run: dbu.update(doc, update, existing_original) # try: # dbu.features_to_mongo(doc['name']) # except: # # could remove data entry if it cannot be added # # to mongo. or, at least make sure the data entry is # # set to inactive # raise if update: print "\n{0}: Done ({1} update).\n".format(script, update) else: print "\n{0}: Done.\n".format(script) print '\n---------------------------------------\n' return 0
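
# ---------------------------------------------------------------------------
# Illustrative sketch (documentation only): the directory and metadata
# conventions this geoBoundaries ingest expects. The base directory is named
# "<ISO3>_ADM<level>" and contains a metadata.json with at least a "country"
# field; `version` is passed to run() and appears to use underscore form
# (e.g. "1_3_3"), since the title converts it with replace("_", "."). The
# values below are hypothetical.

_example_gb_metadata = {"country": "Kenya"}


def _example_gb_name(base_dirname, version):
    """Mirror of the `doc["name"]` construction above."""
    iso3 = base_dirname[:3]        # "KEN"
    adm = base_dirname[4:]         # "ADM1"
    return iso3.lower() + "_" + adm.lower() + "_gb_" + version

# _example_gb_name("KEN_ADM1", "1_3_3") -> 'ken_adm1_gb_1_3_3'
# ---------------------------------------------------------------------------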
def run(path=None, client=None, version=None, config=None, generator="auto", update=False, dry_run=False): print '\n---------------------------------------' script = os.path.basename(__file__) def quit(reason): """quit script cleanly to do: - do error log stuff - output error logs somewhere - if auto, move job file to error location """ raise Exception("{0}: terminating script - {1}\n".format( script, reason)) if config is not None: client = config.client elif client is not None: config = client.info.config.findOne() else: quit('Neither config nor client provided.') version = config.versions["raster-ingest"] # update mongo class instance dbu = MongoUpdate(client) # ------------------------------------- # check path if path is not None: if not os.path.exists(path): quit("Invalid path provided.") else: quit("No path provided") # optional arg - mainly for user to specify manual run if generator not in ['auto', 'manual']: quit("Invalid generator input") if client is None: quit("No mongodb client connection provided") if config is None: quit("No config object provided") raw_update = update if update in ["meta", "data"]: update = update elif update in ["full", "all"]: update = "full" elif update in [False, "false", "False", None, "none", "no", 0, "0"]: update = False else: raise ValueError("Invalid `update` value provided ({})".format(update)) print "running update status `{0}` (input: `{1}`)".format( update, raw_update) if dry_run in ["false", "False", "0", "None", "none", "no"]: dry_run = False dry_run = bool(dry_run) if dry_run: print "running dry run" # init document doc = {} doc["asdf"] = {} doc["asdf"]["script"] = script doc["asdf"]["version"] = version doc["asdf"]["generator"] = generator doc["asdf"]["date_updated"] = str(datetime.date.today()) if not update: doc["asdf"]["date_added"] = str(datetime.date.today()) # ------------------------------------- # get inputs if os.path.isfile(path): data = json.load(open(path, 'r')) else: quit("invalid input file path") required_core_fields = [ "base", "type", "file_extension", "file_mask", "name", "title", "description", "version", "active" ] missing_core_fields = [i for i in required_core_fields if i not in data] if len(missing_core_fields) > 0: quit("Missing core fields ({0})".format(missing_core_fields)) existing_original = None if update: if not "data" in client.asdf.collection_names(): update = False msg = "Update specified but no data collection exists." if generator == "manual": raise Exception(msg) else: warn(msg) else: base_original = client.asdf.data.find_one({'base': data["base"]}) if base_original is None: update = False msg = "Update specified but no existing dataset found." 
if generator == "manual": raise Exception(msg) else: warn(msg) # ------------------------------------- # validate class instance v = ValidationTools(client) # validate base path valid_base = v.base(data["base"], update) if not valid_base.isvalid: quit(valid_base.error) doc["base"] = valid_base.value base_exists = valid_base.data['exists'] # validate name valid_name = v.name(data["name"], update) if not valid_name.isvalid: quit(valid_name.error) doc["name"] = valid_name.value name_exists = valid_name.data['exists'] if update: if not base_exists and not name_exists: warn(("Update specified but no dataset with matching " "base ({0}) or name ({1}) was found").format( doc["base"], doc["name"])) elif base_exists and name_exists: base_id = str(valid_base.data['search']['_id']) name_id = str(valid_name.data['search']['_id']) if base_id != name_id: quit("Update option specified but identifying fields (base " "and name) belong to different existing datasets." "\n\tBase: {0}\n\tName: {1}".format( doc["base"], doc["name"])) else: existing_original = valid_name.data['search'] elif name_exists: existing_original = valid_name.data['search'] elif base_exists: existing_original = valid_base.data['search'] doc["asdf"]["date_added"] = existing_original["asdf"]["date_added"] # doc["active"] = existing_original["active"] # validate type and set file_format valid_type = v.data_type(data["type"]) if not valid_type.isvalid: quit(valid_type.error) doc["type"] = valid_type.value doc["file_format"] = valid_type.data["file_format"] if doc["type"] != "raster": quit("Invalid type ({0}), must be raster.".format(doc["type"])) # validate file extension (validation depends on file format) valid_extension = v.file_extension(data["file_extension"], doc["file_format"]) if not valid_extension.isvalid: quit(valid_extension.error) doc["file_extension"] = valid_extension.value # validate title, description and version doc["title"] = str(data["title"]) doc["description"] = str(data["description"]) doc["details"] = "" if "details" in data: doc["details"] = str(data["details"]) doc["version"] = str(data["version"]) doc["active"] = int(data["active"]) # validate options for raster if not "options" in data: quit("Missing options lookup") required_options = [ "resolution", "extract_types", "factor", "variable_description" ] missing_options = [i for i in required_options if i not in data["options"]] if len(missing_options) > 0: quit( "Missing fields from options lookup ({0})".format(missing_options)) doc["options"] = {} # resolution (in decimal degrees) valid_resolution = v.factor(data["options"]["resolution"]) if not valid_resolution.isvalid: quit(valid_resolution.error) doc["options"]["resolution"] = valid_resolution.value # multiplication factor (if needed, defaults to 1 if blank) valid_factor = v.factor(data["options"]["factor"]) if not valid_factor.isvalid: quit(valid_factor.error) doc["options"]["factor"] = valid_factor.value # *** # if factor changes, any extracts adjust with # old factor need to be removed # *** # extract_types (multiple, separate your input with commas) valid_extract_types = v.extract_types(data["options"]["extract_types"]) if not valid_extract_types.isvalid: quit(valid_extract_types.error) doc["options"]["extract_types"] = valid_extract_types.value valid_extract_types_info = v.extract_types( data["options"]["extract_types_info"]) if not valid_extract_types_info.isvalid: quit(valid_extract_types_info.error) doc["options"]["extract_types_info"] = valid_extract_types_info.value for i in 
doc["options"]["extract_types"]: if i not in doc["options"]["extract_types_info"]: raise Exception( "Value from `extract_type` missing from `extract_types_info` ({0})" .format(i)) # Description of the variable (units, range, etc.) doc["options"]["variable_description"] = str( data["options"]["variable_description"]) if "pixel_check" in data["options"]: doc["options"]["pixel_check"] = data["options"]["pixel_check"] # extras if not "extras" in data: print( "Although fields in extras are not required, it may contain " "commonly used field which should be added whenever possible " "(example: sources_web field)") doc["extras"] = {} elif not isinstance(data["extras"], dict): quit("Invalid instance of extras ({0}) of type: {1}".format( data["extras"], type(data["extras"]))) else: doc["extras"] = data["extras"] if not "tags" in doc["extras"]: doc["extras"]["tags"] = [] if not "raster" in doc["extras"]["tags"]: doc["extras"]["tags"].append("raster") if "categorical" in doc["options"]["extract_types"] or "encoded" in doc[ "options"]["extract_types"]: if not "category_map" in doc["extras"]: quit( "'categorical' or 'encoded' included as extract type but no 'category_map' dict provided in 'extras'." ) elif not isinstance(doc["extras"]["category_map"], dict): quit( "The 'category_map' field must be provided as a dict. Invalid type ({0}) given." .format(type(doc["extras"]["category_map"]))) else: # make sure category names and values are in proper key:val format # and types # {"field_name": pixel_value} # NOTE: rasterstats requires input cmap as {pixel_value: "field_name"} # this gets switched in extract utility. This was done since using integers # or floats as key values is not valid json and would break ingest jsons # (could put int/float as str maybe? then could keep as key) # pixel value may be int, float # field name may be str, int, float (but only using string for ingest rasters) cat_map = doc["extras"]["category_map"] invalid_cat_vals = [ i for i in cat_map.values() if not isinstance(i, (int, float)) ] invalid_cat_keys = [ i for i in cat_map.keys() if not isinstance(i, basestring) ] # make sure keys are str if invalid_cat_keys: print "Invalid `category_map` keys: ({0})".format( invalid_cat_keys) # make sure vals or int/float if invalid_cat_vals: print "Invalid `category_map` values: ({0})".format( invalid_cat_vals) if invalid_cat_keys or invalid_cat_vals: raise Exception("Invalid `category_map` provided.") cat_map = dict( zip([ re.sub('[^0-9a-z]', '_', i.lower()) for i in cat_map.keys() ], cat_map.values())) # ------------------------------------- if update == "meta": print "\nProcessed document:" pprint(doc) print "\nUpdating database (dry run = {0})...".format(dry_run) if not dry_run: dbu.update(doc, update, existing_original) print "\n{0}: Done ({1} update).\n".format(script, update) return 0 # ------------------------------------- # resource scan if data["file_mask"] == "None" and os.path.isfile(doc["base"]): file_list = [doc["base"]] else: # find all files with file_extension in path file_list = [] for root, dirs, files in os.walk(doc["base"]): for file in files: file = os.path.join(root, file) file_check = file.endswith('.' 
+ doc["file_extension"]) if file_check == True: file_list.append(file) if data["file_mask"] == "None": break if data["file_mask"] == "None" and len(file_list) != 1: quit("Multiple files found when `file_mask = None`") # ------------------------------------- # check file mask def validate_file_mask(vmask): """Validate a file mask""" # designates temporally invariant dataset if vmask == "None": return True, None # test file_mask for first file in file_list test_date_str = ru.run_file_mask(vmask, file_list[0], doc["base"]) valid_date = ru.validate_date(test_date_str) if valid_date[0] == False: return False, valid_date[1] return True, None # file mask identifying temporal attributes in path/file names valid_file_mask = validate_file_mask(data["file_mask"]) if valid_file_mask[0]: doc["file_mask"] = data["file_mask"] else: quit(valid_file_mask[1]) # ------------------------------------- print "\nProcessing temporal..." doc["temporal"] = {} if doc["file_mask"] == "None": # temporally invariant dataset doc["temporal"]["name"] = "Temporally Invariant" doc["temporal"]["format"] = "None" doc["temporal"]["type"] = "None" doc["temporal"]["start"] = 10000101 doc["temporal"]["end"] = 99991231 elif len(file_list) > 0: # name for temporal data format doc["temporal"]["name"] = "Date Range" doc["temporal"]["format"] = "%Y%m%d" doc["temporal"]["type"] = ru.get_date_range( ru.run_file_mask(doc["file_mask"], file_list[0], doc["base"]))[2] doc["temporal"]["start"] = None doc["temporal"]["end"] = None # day range for each file (eg: MODIS 8 day composites) # if "day_range" in v.data: # "day_range", "File day range? (Must be integer)", v.day_range else: quit("Warning: file mask given but no resources were found") # doc["temporal"]["name"] = "Unknown" # doc["temporal"]["format"] = "Unknown" # doc["temporal"]["type"] = "Unknown" # doc["temporal"]["start"] = "Unknown" # doc["temporal"]["end"] = "Unknown" # ------------------------------------- print "\nProcessing spatial..." # iterate over files to get bbox and do basic spatial validation # (mainly make sure rasters are all same size) f_count = 0 for f in file_list: # get basic geo info from each file env = ru.raster_envelope(f) # get full geo info from first file if f_count == 0: base_geo = env f_count += 1 # exit if basic geo does not match if base_geo != env: print f print base_geo print env warn("Raster bounding box does not match") # quit("Raster bounding box does not match") env = ru.trim_envelope(env) print "Dataset bounding box: ", env doc["scale"] = ru.envelope_to_scale(env) if doc["scale"] == "global": print( "This dataset has a bounding box larger than a hemisphere " "and will be treated as a global dataset. If this is not a " "global (or near global) dataset you may want to turn it into " "multiple smaller datasets and ingest them individually.") # set spatial doc["spatial"] = ru.envelope_to_geom(env) # ------------------------------------- print '\nProcessing resources...' 
    resource_list = []

    for f in file_list:
        print f

        # resources
        # individual resource info
        resource_tmp = {}

        # path relative to base
        resource_tmp["path"] = f[f.index(doc["base"]) + len(doc["base"]) + 1:]

        # file size
        resource_tmp["bytes"] = os.path.getsize(f)

        if doc["file_mask"] != "None":
            # temporal
            # get unique time range based on dir path / file names

            # get data from mask
            date_str = ru.run_file_mask(doc["file_mask"], resource_tmp["path"])

            validate_date_str = ru.validate_date(date_str)
            if not validate_date_str[0]:
                quit(validate_date_str[1])

            if "day_range" in doc:
                range_start, range_end, range_type = ru.get_date_range(
                    date_str, doc["day_range"])
            else:
                range_start, range_end, range_type = ru.get_date_range(
                    date_str)

            # name (unique among this dataset's resources,
            # not same name as dataset name)
            resource_tmp["name"] = (doc["name"] + "_" + date_str["year"] +
                                    date_str["month"] + date_str["day"])

        else:
            range_start = 10000101
            range_end = 99991231

            resource_tmp["name"] = doc["name"] + "_none"

        # file date range
        resource_tmp["start"] = range_start
        resource_tmp["end"] = range_end

        # # reorder resource fields
        # resource_order = ["name", "path", "bytes", "start", "end"]
        # resource_tmp = OrderedDict((k, resource_tmp[k])
        #                            for k in resource_order)

        # update main list
        resource_list.append(resource_tmp)

        # update dataset temporal info
        # (check start and end independently so both bounds can be updated
        # by the same resource)
        if (doc["temporal"]["start"] is None
                or range_start < doc["temporal"]["start"]):
            doc["temporal"]["start"] = range_start
        if (doc["temporal"]["end"] is None
                or range_end > doc["temporal"]["end"]):
            doc["temporal"]["end"] = range_end

    doc["resources"] = resource_list

    # -------------------------------------
    # database updates

    print "\nProcessed document..."
    pprint(doc)

    print "\nUpdating database (dry run = {0})...".format(dry_run)
    if not dry_run:
        dbu.update(doc, update, existing_original)

    if update:
        print "\n{0}: Done ({1} update).\n".format(script, update)
    else:
        print "\n{0}: Done.\n".format(script)

    print '\n---------------------------------------\n'

    return 0
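# -------------------------------------
# raster ingest script
#
# run() below loads a raster ingest json (the `path` argument), validates the
# core fields and raster options with ValidationTools, scans the base
# directory for resource files, and writes the resulting document via
# MongoUpdate.
# -------------------------------------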
def run(path=None, client=None, version=None, config=None, generator="auto", update=False, dry_run=False): print '\n---------------------------------------' script = os.path.basename(__file__) def quit(reason): """quit script cleanly to do: - do error log stuff - output error logs somewhere - if auto, move job file to error location """ raise Exception("{0}: terminating script - {1}\n".format( script, reason)) if config is not None: client = config.client elif client is not None: config = client.info.config.findOne() else: quit('Neither config nor client provided.') version = config.versions["raster-ingest"] # update mongo class instance dbu = MongoUpdate(client) # ------------------------------------- # check path if path is not None: if not os.path.exists(path): quit("Invalid path provided.") else: quit("No path provided") # optional arg - mainly for user to specify manual run if generator not in ['auto', 'manual']: quit("Invalid generator input") if client is None: quit("No mongodb client connection provided") if config is None: quit("No config object provided") raw_update = update if update in ["partial", "meta"]: update = "partial" elif update in ["update", True, 1, "True", "full", "all"]: update = "full" else: update = False print "running update status `{0}` (input: `{1}`)".format( update, raw_update) if dry_run in ["false", "False", "0", "None", "none", "no"]: dry_run = False dry_run = bool(dry_run) if dry_run: print "running dry run" # init document doc = {} doc["asdf"] = {} doc["asdf"]["script"] = script doc["asdf"]["version"] = version doc["asdf"]["generator"] = generator doc["asdf"]["date_updated"] = str(datetime.date.today()) if not update: doc["asdf"]["date_added"] = str(datetime.date.today()) # ------------------------------------- # get inputs if os.path.isfile(path): data = json.load(open(path, 'r')) else: quit("invalid input file path") required_core_fields = [ "base", "type", "file_extension", "file_mask", "name", "title", "description", "version", "active" ] missing_core_fields = [i for i in required_core_fields if i not in data] if len(missing_core_fields) > 0: quit("Missing core fields ({0})".format(missing_core_fields)) existing_original = None if update: if not "data" in client.asdf.collection_names(): update = False msg = "Update specified but no data collection exists." if generator == "manual": raise Exception(msg) else: warn(msg) else: base_original = client.asdf.data.find_one({'base': data["base"]}) if base_original is None: update = False msg = "Update specified but no existing dataset found." if generator == "manual": raise Exception(msg) else: warn(msg) # ------------------------------------- # validate class instance v = ValidationTools(client) # validate base path valid_base = v.base(data["base"], update) if not valid_base.isvalid: quit(valid_base.error) doc["base"] = valid_base.value base_exists = valid_base.data['exists'] # validate name valid_name = v.name(data["name"], update) if not valid_name.isvalid: quit(valid_name.error) doc["name"] = valid_name.value name_exists = valid_name.data['exists'] if update: if not base_exists and not name_exists: warn(("Update specified but no dataset with matching " "base ({0}) or name ({1}) was found").format(doc["base"], doc["name"])) elif base_exists and name_exists: base_id = str(valid_base.data['search']['_id']) name_id = str(valid_name.data['search']['_id']) if base_id != name_id: quit("Update option specified but identifying fields (base " "and name) belong to different existing datasets." 
"\n\tBase: {0}\n\tName: {1}".format(doc["base"], doc["name"])) else: existing_original = valid_name.data['search'] elif name_exists: existing_original = valid_name.data['search'] elif base_exists: existing_original = valid_base.data['search'] doc["asdf"]["date_added"] = existing_original["asdf"]["date_added"] # doc["active"] = existing_original["active"] # validate type and set file_format valid_type = v.data_type(data["type"]) if not valid_type.isvalid: quit(valid_type.error) doc["type"] = valid_type.value doc["file_format"] = valid_type.data["file_format"] if doc["type"] != "raster": quit("Invalid type ({0}), must be raster.".format(doc["type"])) # validate file extension (validation depends on file format) valid_extension = v.file_extension(data["file_extension"], doc["file_format"]) if not valid_extension.isvalid: quit(valid_extension.error) doc["file_extension"] = valid_extension.value # validate title, description and version doc["title"] = str(data["title"]) doc["description"] = str(data["description"]) doc["details"] = "" if "details" in data: doc["details"] = str(data["details"]) doc["version"] = str(data["version"]) doc["active"] = int(data["active"]) # validate options for raster if not "options" in data: quit("Missing options lookup") required_options = ["resolution", "extract_types", "factor", "variable_description"] missing_options = [i for i in required_options if i not in data["options"]] if len(missing_options) > 0: quit("Missing fields from options lookup ({0})".format( missing_options)) doc["options"] = {} # resolution (in decimal degrees) valid_resolution = v.factor(data["options"]["resolution"]) if not valid_resolution.isvalid: quit(valid_resolution.error) doc["options"]["resolution"] = valid_resolution.value # multiplication factor (if needed, defaults to 1 if blank) valid_factor = v.factor(data["options"]["factor"]) if not valid_factor.isvalid: quit(valid_factor.error) doc["options"]["factor"] = valid_factor.value # *** # if factor changes, any extracts adjust with # old factor need to be removed # *** # extract_types (multiple, separate your input with commas) valid_extract_types = v.extract_types(data["options"]["extract_types"]) if not valid_extract_types.isvalid: quit(valid_extract_types.error) doc["options"]["extract_types"] = valid_extract_types.value valid_extract_types_info = v.extract_types(data["options"]["extract_types_info"]) if not valid_extract_types_info.isvalid: quit(valid_extract_types_info.error) doc["options"]["extract_types_info"] = valid_extract_types_info.value for i in doc["options"]["extract_types"]: if i not in doc["options"]["extract_types_info"]: raise Exception("Value from `extract_type` missing from `extract_types_info` ({0})".format(i)) # Description of the variable (units, range, etc.) 
doc["options"]["variable_description"] = str( data["options"]["variable_description"]) # extras if not "extras" in data: print("Although fields in extras are not required, it may contain " "commonly used field which should be added whenever possible " "(example: sources_web field)") doc["extras"] = {} elif not isinstance(data["extras"], dict): quit("Invalid instance of extras ({0}) of type: {1}".format( data["extras"], type(data["extras"]))) else: doc["extras"] = data["extras"] if not "tags" in doc["extras"]: doc["extras"]["tags"] = [] if not "raster" in doc["extras"]["tags"]: doc["extras"]["tags"].append("raster") if "categorical" in doc["options"]["extract_types"] or "encoded" in doc["options"]["extract_types"]: if not "category_map" in doc["extras"]: quit("'categorical' or 'encoded' included as extract type but no 'category_map' dict provided in 'extras'.") elif not isinstance(doc["extras"]["category_map"], dict): quit("The 'category_map' field must be provided as a dict. Invalid type ({0}) given.".format( type(doc["extras"]["category_map"]))) else: # make sure category names and values are in proper key:val format # and types # {"field_name": pixel_value} # NOTE: rasterstats requires input cmap as {pixel_value: "field_name"} # this gets switched in extract utility. This was done since using integers # or floats as key values is not valid json and would break ingest jsons # (could put int/float as str maybe? then could keep as key) # pixel value may be int, float # field name may be str, int, float (but only using string for ingest rasters) cat_map = doc["extras"]["category_map"] invalid_cat_vals = [i for i in cat_map.values() if not isinstance(i, (int, float))] invalid_cat_keys = [i for i in cat_map.keys() if not isinstance(i, basestring)] # make sure keys are str if invalid_cat_keys: print "Invalid `category_map` keys: ({0})".format(invalid_cat_keys) # make sure vals or int/float if invalid_cat_vals: print "Invalid `category_map` values: ({0})".format(invalid_cat_vals) if invalid_cat_keys or invalid_cat_vals: raise Exception("Invalid `category_map` provided.") cat_map = dict(zip( [re.sub('[^0-9a-z]', '_', i.lower()) for i in cat_map.keys()], cat_map.values() )) # ------------------------------------- if update == "partial": print "\nProcessed document:" pprint(doc) print "\nUpdating database (dry run = {0})...".format(dry_run) if not dry_run: dbu.update(doc, update, existing_original) print "\n{0}: Done ({1} update).\n".format(script, update) return 0 # ------------------------------------- # resource scan if data["file_mask"] == "None" and os.path.isfile(doc["base"]): file_list = [doc["base"]] else: # find all files with file_extension in path file_list = [] for root, dirs, files in os.walk(doc["base"]): for file in files: file = os.path.join(root, file) file_check = file.endswith('.' 
+ doc["file_extension"]) if file_check == True: file_list.append(file) if data["file_mask"] == "None": break if data["file_mask"] == "None" and len(file_list) != 1: quit("Multiple files found when `file_mask = None`") # ------------------------------------- # check file mask def validate_file_mask(vmask): """Validate a file mask""" # designates temporally invariant dataset if vmask == "None": return True, None # test file_mask for first file in file_list test_date_str = ru.run_file_mask(vmask, file_list[0], doc["base"]) valid_date = ru.validate_date(test_date_str) if valid_date[0] == False: return False, valid_date[1] return True, None # file mask identifying temporal attributes in path/file names valid_file_mask = validate_file_mask(data["file_mask"]) if valid_file_mask[0]: doc["file_mask"] = data["file_mask"] else: quit(valid_file_mask[1]) # ------------------------------------- print "\nProcessing temporal..." doc["temporal"] = {} if doc["file_mask"] == "None": # temporally invariant dataset doc["temporal"]["name"] = "Temporally Invariant" doc["temporal"]["format"] = "None" doc["temporal"]["type"] = "None" doc["temporal"]["start"] = 10000101 doc["temporal"]["end"] = 99991231 elif len(file_list) > 0: # name for temporal data format doc["temporal"]["name"] = "Date Range" doc["temporal"]["format"] = "%Y%m%d" doc["temporal"]["type"] = ru.get_date_range(ru.run_file_mask( doc["file_mask"], file_list[0], doc["base"]))[2] doc["temporal"]["start"] = None doc["temporal"]["end"] = None # day range for each file (eg: MODIS 8 day composites) # if "day_range" in v.data: # "day_range", "File day range? (Must be integer)", v.day_range else: quit("Warning: file mask given but no resources were found") # doc["temporal"]["name"] = "Unknown" # doc["temporal"]["format"] = "Unknown" # doc["temporal"]["type"] = "Unknown" # doc["temporal"]["start"] = "Unknown" # doc["temporal"]["end"] = "Unknown" # ------------------------------------- print "\nProcessing spatial..." # iterate over files to get bbox and do basic spatial validation # (mainly make sure rasters are all same size) f_count = 0 for f in file_list: # get basic geo info from each file env = ru.raster_envelope(f) # get full geo info from first file if f_count == 0: base_geo = env f_count += 1 # exit if basic geo does not match if base_geo != env: print f print base_geo print env warn("Raster bounding box does not match") # quit("Raster bounding box does not match") env = ru.trim_envelope(env) print "Dataset bounding box: ", env doc["scale"] = ru.envelope_to_scale(env) if doc["scale"] == "global": print ("This dataset has a bounding box larger than a hemisphere " "and will be treated as a global dataset. If this is not a " "global (or near global) dataset you may want to turn it into " "multiple smaller datasets and ingest them individually.") # set spatial doc["spatial"] = ru.envelope_to_geom(env) # ------------------------------------- print '\nProcessing resources...' 
    resource_list = []

    for f in file_list:
        print f

        # resources
        # individual resource info
        resource_tmp = {}

        # path relative to base
        resource_tmp["path"] = f[f.index(doc["base"]) + len(doc["base"]) + 1:]

        # file size
        resource_tmp["bytes"] = os.path.getsize(f)

        if doc["file_mask"] != "None":
            # temporal
            # get unique time range based on dir path / file names

            # get data from mask
            date_str = ru.run_file_mask(doc["file_mask"], resource_tmp["path"])

            validate_date_str = ru.validate_date(date_str)
            if not validate_date_str[0]:
                quit(validate_date_str[1])

            if "day_range" in doc:
                range_start, range_end, range_type = ru.get_date_range(
                    date_str, doc["day_range"])
            else:
                range_start, range_end, range_type = ru.get_date_range(
                    date_str)

            # name (unique among this dataset's resources,
            # not same name as dataset name)
            resource_tmp["name"] = (doc["name"] + "_" + date_str["year"] +
                                    date_str["month"] + date_str["day"])

        else:
            range_start = 10000101
            range_end = 99991231

            resource_tmp["name"] = doc["name"] + "_none"

        # file date range
        resource_tmp["start"] = range_start
        resource_tmp["end"] = range_end

        # # reorder resource fields
        # resource_order = ["name", "path", "bytes", "start", "end"]
        # resource_tmp = OrderedDict((k, resource_tmp[k])
        #                            for k in resource_order)

        # update main list
        resource_list.append(resource_tmp)

        # update dataset temporal info
        # (check start and end independently so both bounds can be updated
        # by the same resource)
        if (doc["temporal"]["start"] is None
                or range_start < doc["temporal"]["start"]):
            doc["temporal"]["start"] = range_start
        if (doc["temporal"]["end"] is None
                or range_end > doc["temporal"]["end"]):
            doc["temporal"]["end"] = range_end

    doc["resources"] = resource_list

    # -------------------------------------
    # database updates

    print "\nProcessed document..."
    pprint(doc)

    print "\nUpdating database (dry run = {0})...".format(dry_run)
    if not dry_run:
        dbu.update(doc, update, existing_original)

    if update:
        print "\n{0}: Done ({1} update).\n".format(script, update)
    else:
        print "\n{0}: Done.\n".format(script)

    print '\n---------------------------------------\n'

    return 0
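# -------------------------------------
# GeoBoundaries boundary ingest script
#
# run() below builds a boundary dataset document directly from a GeoBoundaries
# release directory (metadata.json plus a single geojson file) rather than
# from an ingest json, and writes it via MongoUpdate.
# -------------------------------------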
def run(path=None, client=None, version=None, config=None, generator="auto", update=False, dry_run=False): print '\n---------------------------------------' script = os.path.basename(__file__) def quit(reason): """quit script cleanly to do: - do error log stuff - output error logs somewhere - if auto, move job file to error location """ raise Exception("{0}: terminating script - {1}\n".format( script, reason)) if config is not None: client = config.client elif client is not None: config = client.info.config.findOne() else: quit('Neither config nor client provided.') # update mongo class instance dbu = MongoUpdate(client) # ------------------------------------- # check path if path is not None: if not os.path.exists(path): quit("Invalid path provided.") else: quit("No path provided") # optional arg - mainly for user to specify manual run if generator not in ['auto', 'manual']: quit("Invalid generator input") if client is None: quit("No mongodb client connection provided") if config is None: quit("No config object provided") raw_update = update if update in ["partial", "meta"]: update = "partial" elif update in ["update", True, 1, "True", "full", "all"]: update = "full" elif update in ["missing"]: update = "missing" else: update = False print "running update status `{0}` (input: `{1}`)".format( update, raw_update) if dry_run in ["false", "False", "0", "None", "none", "no"]: dry_run = False dry_run = bool(dry_run) if dry_run: print "running dry run" base_original = client.asdf.data.find_one({'base': path}) existing_original = None if update: if not "data" in client.asdf.collection_names(): update = False msg = "Update specified but no data collection exists." if generator == "manual": raise Exception(msg) else: warn(msg) else: if base_original is None and update != "missing": update = False msg = "Update specified but no existing dataset found." 
if generator == "manual": raise Exception(msg) else: warn(msg) # init document doc = {} doc["asdf"] = {} doc["asdf"]["script"] = script doc["asdf"]["version"] = version doc["asdf"]["generator"] = generator doc["asdf"]["date_updated"] = str(datetime.date.today()) if not update or update == "missing": doc["asdf"]["date_added"] = str(datetime.date.today()) # ------------------------------------- if os.path.isdir(path): # remove trailing slash from path if path.endswith("/"): path = path[:-1] else: quit("Invalid base directory provided.") # ------------------------------------- doc['base'] = path doc["type"] = "boundary" doc["file_format"] = "vector" doc["file_extension"] = "geojson" doc["file_mask"] = "None" # ------------------------------------- name = os.path.basename(doc["base"]) iso3 = name[:3] adm = name[4:] metadata_path = os.path.join(path, 'metadata.json') metadata = json.load(open(metadata_path, 'r')) country = unidecode(metadata["country"]) doc["name"] = iso3.lower() + "_" + adm.lower() + "_gb_" + version inactive_bnds_list = config.inactive_bnds is_active = doc["name"] not in inactive_bnds_list doc["active"] = int(is_active) name_original = client.asdf.data.find_one({'name': doc["name"]}) if not update and base_original is not None: msg = "No update specified but dataset exists (base: {0})".format(base_original['base']) raise Exception(msg) elif not update and name_original is not None: msg = "No update specified but dataset exists (name: {0})".format(name_original['name']) raise Exception(msg) if update: if update == "missing" and name_original is not None and base_original is not None: warn("Dataset exists (running in 'missing' update mode). Running partial update and setting to active (if possible).") update = "partial" if update != "missing": if name_original is None and base_original is None: update = False warn(("Update specified but no dataset with matching " "base ({0}) or name ({1}) was found").format(doc["base"], doc["name"])) # in case we ended up not finding a match for name doc["asdf"]["date_added"] = str(datetime.date.today()) elif name_original is not None and base_original is not None: if str(name_original['_id']) != str(base_original['_id']): quit("Update option specified but identifying fields (base " "and name) belong to different existing datasets." "\n\tBase: {0}\n\tName: {1}".format(doc["base"], doc["name"])) else: existing_original = name_original elif name_original is not None: existing_original = name_original elif base_original is not None: existing_original = base_original doc["asdf"]["date_added"] = existing_original["asdf"]["date_added"] if existing_original["active"] == -1: doc["active"] = -1 doc["title"] = "{} {} - GeoBoundaries v{}".format(country, adm.upper(), version.replace("_", ".")) doc["description"] = "PLACEHOLDER" doc["version"] = version doc["options"] = {} doc["options"]["group"] = iso3.lower() + "_gb_" + version doc["options"]["group_title"] = "{} - GeoBoundaries v{}".format(country, version.replace("_", ".")) doc["extras"] = {} doc["extras"]["citation"] = ('Seitz, L., Lv, Z., Goodman, S., Runfola, D. ' '"Chapter 3: GeoBoundaries - A Global, Redistributable Map of Administrative Zones." ' 'GeoQuery User\'s Guide. Ed. Dan Runfola Ariel BenYishay, Seth Goodman. 
' 'Williamsburg, Va: AidData, 2018.') doc["extras"]["sources_web"] = "http://www.geoboundaries.org" doc["extras"]["sources_name"] = "AidData GeoBoundaries" doc["extras"]["country"] = country doc["extras"]["iso3"] = iso3 doc["extras"]["adm"] = int(adm[-1:]) doc["extras"]["tags"] = ["geoboundaries", adm, country, iso3] # boundary group if "adm0" in name.lower(): doc["options"]["group_class"] = "actual" doc["active"] = -1 else: doc["options"]["group_class"] = "sub" # ------------------------------------- # resource scan # find all files with file_extension in path file_list = [] for root, dirs, files in os.walk(doc["base"]): for fname in files: fname = os.path.join(root, fname) file_check = fname.endswith('.' + doc["file_extension"]) if file_check == True and not fname.endswith('simplified.geojson'): file_list.append(fname) if len(file_list) == 0: quit("No vector file found in " + doc["base"]) elif len(file_list) > 1: quit("Boundaries must be submitted individually.") f = file_list[0] print f doc["description"] = "GeoBoundaries boundary file for {} in {}.".format( adm.upper(), country) # ------------------------------------- if update == "partial": print "\nProcessed document:" pprint(doc) print "\nUpdating database (dry run = {0})...".format(dry_run) if not dry_run: dbu.update(doc, update, existing_original) print "\n{0}: Done ({1} update).\n".format(script, update) return 0 # ------------------------------------- print "\nProcessing temporal..." # temporally invariant dataset doc["temporal"] = {} doc["temporal"]["name"] = "Temporally Invariant" doc["temporal"]["format"] = "None" doc["temporal"]["type"] = "None" doc["temporal"]["start"] = 10000101 doc["temporal"]["end"] = 99991231 # ------------------------------------- print "\nProcessing spatial..." if not dry_run: convert_status = ru.add_asdf_id(f) if convert_status == 1: quit("Error adding ad_id to boundary file & outputting geojson.") env = ru.vector_envelope(f) env = ru.trim_envelope(env) print "Dataset bounding box: ", env doc["scale"] = ru.envelope_to_scale(env) # set spatial doc["spatial"] = ru.envelope_to_geom(env) # ------------------------------------- print '\nProcessing resources...' # resources # individual resource info resource_tmp = {} # path relative to base resource_tmp["path"] = f[f.index(doc["base"]) + len(doc["base"]) + 1:] resource_tmp["name"] = doc["name"] resource_tmp["bytes"] = os.path.getsize(f) resource_tmp["start"] = 10000101 resource_tmp["end"] = 99991231 # reorder resource fields # resource_order = ["name", "path", "bytes", "start", "end"] # resource_tmp = OrderedDict((k, resource_tmp[k]) for k in resource_order) # update main list resource_list = [resource_tmp] doc["resources"] = resource_list # ------------------------------------- # database updates print "\nProcessed document:" pprint(doc) print "\nUpdating database (dry run = {0})...".format(dry_run) if not dry_run: dbu.update(doc, update, existing_original) # try: # dbu.features_to_mongo(doc['name']) # except: # # could remove data entry if it cannot be added # # to mongo. or, at least make sure the data entry is # # set to inactive # raise if update: print "\n{0}: Done ({1} update).\n".format(script, update) else: print "\n{0}: Done.\n".format(script) print '\n---------------------------------------\n' return 0
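# -------------------------------------
# GADM boundary ingest script
#
# run() below mirrors the GeoBoundaries ingest but derives the dataset name,
# version, and admin level from the GADM directory layout and the
# gadm_iso3.json lookup, and also loads boundary features into mongo via
# dbu.features_to_mongo().
# -------------------------------------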
def run(path=None, client=None, version=None, config=None, generator="auto", update=False, dry_run=False): print '\n---------------------------------------' script = os.path.basename(__file__) def quit(reason): """quit script cleanly to do: - do error log stuff - output error logs somewhere - if auto, move job file to error location """ raise Exception("{0}: terminating script - {1}\n".format( script, reason)) if config is not None: client = config.client elif client is not None: config = client.info.config.findOne() else: quit('Neither config nor client provided.') version = config.versions["gadm-ingest"] # update mongo class instance dbu = MongoUpdate(client) # ------------------------------------- # check path if path is not None: if not os.path.exists(path): quit("Invalid path provided.") else: quit("No path provided") # optional arg - mainly for user to specify manual run if generator not in ['auto', 'manual']: quit("Invalid generator input") if client is None: quit("No mongodb client connection provided") if config is None: quit("No config object provided") raw_update = update if update in ["partial", "meta"]: update = "partial" elif update in ["update", True, 1, "True", "full", "all"]: update = "full" elif update in ["missing"]: update = "missing" else: update = False print "running update status `{0}` (input: `{1}`)".format( update, raw_update) if dry_run in ["false", "False", "0", "None", "none", "no"]: dry_run = False dry_run = bool(dry_run) if dry_run: print "running dry run" base_original = client.asdf.data.find_one({'base': path}) existing_original = None if update: if not "data" in client.asdf.collection_names(): update = False msg = "Update specified but no data collection exists." if generator == "manual": raise Exception(msg) else: warn(msg) else: if base_original is None and update != "missing": update = False msg = "Update specified but no existing dataset found." 
if generator == "manual": raise Exception(msg) else: warn(msg) # init document doc = {} doc["asdf"] = {} doc["asdf"]["script"] = script doc["asdf"]["version"] = version doc["asdf"]["generator"] = generator doc["asdf"]["date_updated"] = str(datetime.date.today()) if not update or update == "missing": doc["asdf"]["date_added"] = str(datetime.date.today()) # ------------------------------------- if os.path.isdir(path): # remove trailing slash from path if path.endswith("/"): path = path[:-1] else: quit("Invalid base directory provided.") # ------------------------------------- doc['base'] = path doc["type"] = "boundary" doc["file_format"] = "vector" doc["file_extension"] = "geojson" doc["file_mask"] = "None" # ------------------------------------- gadm_name = os.path.basename(doc["base"]) gadm_version = os.path.basename(os.path.dirname(path))[4:] gadm_iso3 = gadm_name[:3] gadm_adm = gadm_name[4:] parent = os.path.dirname(os.path.abspath(__file__)) gadm_lookup_path = parent + '/gadm_iso3.json' gadm_lookup = json.load(open(gadm_lookup_path, 'r')) gadm_country = unidecode(gadm_lookup[gadm_iso3]) doc["name"] = (gadm_iso3.lower() + "_" + gadm_adm.lower() + "_gadm" + gadm_version.replace('.', '')) inactive_bnds_list = config.inactive_bnds is_active = doc["name"] not in inactive_bnds_list doc["active"] = int(is_active) name_original = client.asdf.data.find_one({'name': doc["name"]}) if not update and base_original is not None: msg = "No update specified but dataset exists (base: {0})".format( base_original['base']) raise Exception(msg) elif not update and name_original is not None: msg = "No update specified but dataset exists (name: {0})".format( name_original['name']) raise Exception(msg) if update: if update == "missing" and name_original is not None and base_original is not None: warn( "Dataset exists (running in 'missing' update mode). Running partial update and setting to active (if possible)." ) update = "partial" if update != "missing": if name_original is None and base_original is None: update = False warn(("Update specified but no dataset with matching " "base ({0}) or name ({1}) was found").format( doc["base"], doc["name"])) # in case we ended up not finding a match for name doc["asdf"]["date_added"] = str(datetime.date.today()) elif name_original is not None and base_original is not None: if str(name_original['_id']) != str(base_original['_id']): quit( "Update option specified but identifying fields (base " "and name) belong to different existing datasets." 
"\n\tBase: {0}\n\tName: {1}".format( doc["base"], doc["name"])) else: existing_original = name_original elif name_original is not None: existing_original = name_original elif base_original is not None: existing_original = base_original doc["asdf"]["date_added"] = existing_original["asdf"]["date_added"] if existing_original["active"] == -1: doc["active"] = -1 doc["title"] = " ".join( [gadm_country, gadm_adm.upper(), "Boundary - GADM", gadm_version]) doc["description"] = "PLACEHOLDER" doc["version"] = gadm_version doc["options"] = {} doc["options"]["group"] = (gadm_iso3.lower() + "_gadm" + gadm_version.replace('.', '')) doc["extras"] = {} doc["extras"]["citation"] = ("Global Administrative Areas " "(GADM) http://www.gadm.org.") doc["extras"]["sources_web"] = "http://www.gadm.org" doc["extras"]["sources_name"] = "Global Administrative Areas (GADM)" doc["extras"]["gadm_country"] = gadm_country doc["extras"]["gadm_iso3"] = gadm_iso3 doc["extras"]["gadm_adm"] = int(gadm_adm[-1:]) doc["extras"]["gadm_unit"] = "PLACEHOLDER" doc["extras"]["tags"] = ["gadm", gadm_adm, gadm_country] doc["options"]["group_title"] = "{0} GADM {1}".format( gadm_country, gadm_version) # boundary group if "adm0" in gadm_name.lower(): doc["options"]["group_class"] = "actual" doc["active"] = -1 else: doc["options"]["group_class"] = "sub" # ------------------------------------- # resource scan # find all files with file_extension in path file_list = [] for root, dirs, files in os.walk(doc["base"]): for fname in files: fname = os.path.join(root, fname) file_check = fname.endswith('.' + doc["file_extension"]) if file_check == True and not fname.endswith('simplified.geojson'): file_list.append(fname) if len(file_list) == 0: quit("No vector file found in " + doc["base"]) elif len(file_list) > 1: quit("Boundaries must be submitted individually.") f = file_list[0] print f # get adm unit name for country and add to gadm info and description if gadm_adm.lower() == "adm0": gadm_unit = "Country" else: with fiona.open(f, 'r') as tmp_feature_src: tmp_feature = tmp_feature_src[0] gadm_unit = tmp_feature['properties']['ENGTYPE_' + gadm_adm[-1:]] doc["extras"]["gadm_unit"] = gadm_unit if gadm_unit not in [None, "Unknown"]: doc["extras"]["tags"].append(gadm_unit) doc["description"] = "GADM Boundary File for {0} ({1}) in {2}.".format( gadm_adm.upper(), gadm_unit, gadm_country) # ------------------------------------- if update == "partial": print "\nProcessed document:" pprint(doc) print "\nUpdating database (dry run = {0})...".format(dry_run) if not dry_run: dbu.update(doc, update, existing_original) print "\n{0}: Done ({1} update).\n".format(script, update) return 0 # ------------------------------------- print "\nProcessing temporal..." # temporally invariant dataset doc["temporal"] = {} doc["temporal"]["name"] = "Temporally Invariant" doc["temporal"]["format"] = "None" doc["temporal"]["type"] = "None" doc["temporal"]["start"] = 10000101 doc["temporal"]["end"] = 99991231 # ------------------------------------- print "\nProcessing spatial..." if not dry_run: convert_status = ru.add_asdf_id(f) if convert_status == 1: quit("Error adding ad_id to boundary file & outputting geojson.") env = ru.vector_envelope(f) env = ru.trim_envelope(env) print "Dataset bounding box: ", env doc["scale"] = ru.envelope_to_scale(env) # set spatial doc["spatial"] = ru.envelope_to_geom(env) # ------------------------------------- print '\nProcessing resources...' 
    # resources
    # individual resource info
    resource_tmp = {}

    # path relative to base
    resource_tmp["path"] = f[f.index(doc["base"]) + len(doc["base"]) + 1:]

    resource_tmp["name"] = doc["name"]
    resource_tmp["bytes"] = os.path.getsize(f)
    resource_tmp["start"] = 10000101
    resource_tmp["end"] = 99991231

    # reorder resource fields
    # resource_order = ["name", "path", "bytes", "start", "end"]
    # resource_tmp = OrderedDict((k, resource_tmp[k]) for k in resource_order)

    # update main list
    resource_list = [resource_tmp]

    doc["resources"] = resource_list

    # -------------------------------------
    # database updates

    print "\nProcessed document:"
    pprint(doc)

    print "\nUpdating database (dry run = {0})...".format(dry_run)
    if not dry_run:
        dbu.update(doc, update, existing_original)

        try:
            dbu.features_to_mongo(doc['name'])
        except:
            # could remove data entry if it cannot be added
            # to mongo. or, at least make sure the data entry is
            # set to inactive
            raise

    if update:
        print "\n{0}: Done ({1} update).\n".format(script, update)
    else:
        print "\n{0}: Done.\n".format(script)

    print '\n---------------------------------------\n'

    return 0
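# -------------------------------------
# The update / dry_run normalization block is repeated near-verbatim at the
# top of every run() above. The standalone helpers below are a minimal sketch
# of that logic for reference only; the function names are assumptions and do
# not exist in the original scripts.
# -------------------------------------

def normalize_update_flag(update, allow_missing=False):
    """Map the inputs accepted by run(update=...) onto 'partial', 'full',
    'missing' (boundary scripts only, via allow_missing=True) or False."""
    if update in ["partial", "meta"]:
        return "partial"
    if update in ["update", True, 1, "True", "full", "all"]:
        return "full"
    if allow_missing and update == "missing":
        return "missing"
    return False


def normalize_dry_run_flag(dry_run):
    """Treat common 'false'-like strings as False; anything else is passed
    through bool(), matching the inline handling in each run()."""
    if dry_run in ["false", "False", "0", "None", "none", "no"]:
        return False
    return bool(dry_run)


# example: normalize_update_flag("meta") returns "partial";
#          normalize_dry_run_flag("no") returns False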