Example 1
def run(path=None,
        client=None,
        version=None,
        config=None,
        generator="auto",
        update=False,
        dry_run=False):

    print '\n---------------------------------------'

    script = os.path.basename(__file__)

    def quit(reason):
        """quit script cleanly

        to do:
        - do error log stuff
        - output error logs somewhere
        - if auto, move job file to error location
        """
        raise Exception("{0}: terminating script - {1}\n".format(
            script, reason))

    if config is not None:
        client = config.client
    elif client is not None:
        config = client.info.config.findOne()
    else:
        quit('Neither config nor client provided.')

    version = config.versions["release-ingest"]

    # update mongo class instance
    dbu = MongoUpdate(client)

    # -------------------------------------

    # check path
    if path is not None:
        if not os.path.exists(path):
            quit("Invalid path provided.")
    else:
        quit("No path provided")

    # optional arg - mainly for user to specify manual run
    if generator not in ['auto', 'manual']:
        quit("Invalid generator input")

    if client is None:
        quit("No mongodb client connection provided")

    if config is None:
        quit("No config object provided")
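
    # normalize the accepted update inputs to one of:
    # "partial", "full", or False (no update)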

    raw_update = update
    if update in ["partial", "meta"]:
        update = "partial"
    elif update in ["update", True, 1, "True", "full", "all"]:
        update = "full"
    else:
        update = False

    print "running update status `{0}` (input: `{1}`)".format(
        update, raw_update)

    if dry_run in ["false", "False", "0", "None", "none", "no"]:
        dry_run = False

    dry_run = bool(dry_run)
    if dry_run:
        print "running dry run"

    existing_original = None
    if update:
        if not "data" in client.asdf.collection_names():
            update = False
            msg = "Update specified but no data collection exists."
            if generator == "manual":
                raise Exception(msg)
            else:
                warn(msg)
        else:
            base_original = client.asdf.data.find_one({'base': path})
            if base_original is None:
                update = False
                msg = "Update specified but no existing dataset found."
                if generator == "manual":
                    raise Exception(msg)
                else:
                    warn(msg)

    # init document
    doc = {}

    doc["asdf"] = {}
    doc["asdf"]["script"] = script
    doc["asdf"]["version"] = version
    doc["asdf"]["generator"] = generator
    doc["asdf"]["date_updated"] = str(datetime.date.today())
    if not update:
        doc["asdf"]["date_added"] = str(datetime.date.today())

    # -------------------------------------

    if os.path.isdir(path):
        # remove trailing slash from path
        if path.endswith("/"):
            path = path[:-1]
    else:
        quit("Invalid base directory provided.")

    # -------------------------------------

    doc['base'] = path

    doc["type"] = "release"
    doc["file_format"] = "release"
    doc["file_extension"] = ""
    doc["file_mask"] = "None"

    # -------------------------------------

    # get release datapackage
    release_path = doc["base"] + '/datapackage.json'
    release_package = json.load(open(release_path, 'r'))

    core_fields = ['name', 'title', 'description', 'version']

    doc["extras"] = {}

    for f in release_package.keys():

        if f in core_fields:
            rkey = f.replace(" ", "_").lower()
            doc[f] = release_package[f]

        elif f == 'extras':

            for g in release_package['extras']:
                rkey = g['key'].replace(" ", "_").lower()
                doc['extras'][rkey] = g['value']

    # updating these fields because
    # - current name is broken (not proper version)
    # - current title and description are not well suited for
    #   general consumption via DET
    doc["extras"]["original_name"] = doc["name"]
    doc["extras"]["original_title"] = doc["title"]
    doc["extras"]["original_description"] = doc["description"]

    doc["name"] = "{0}_{1}_{2}_v{3}".format(
        doc["extras"]["data_set_preamble"].lower(),
        doc["extras"]["data_type"].lower(),
        doc["extras"]["processing_level"].lower(),
        str(doc["version"]).replace(".", "_"))

    preamble_word_list = re.findall('[A-Z](?:[A-Z]*(?![a-z])|[a-z]*)',
                                    doc["extras"]["data_set_preamble"])
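    # the regex splits the CamelCase preamble into words while keeping acronym
    # runs intact (e.g. "IraqAIMS" -> ["Iraq", "AIMS"])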

    clean_preamble_word_list = [
        i for i in preamble_word_list if i not in ["AIMS"]
    ]

    clean_preamble = ' '.join(clean_preamble_word_list)

    doc["title"] = "{0} Geocoded Aid Data v{1}".format(clean_preamble,
                                                       doc["version"])

    doc["description"] = (
        "Aid data from {0} {1}, geocoded and published by AidData. "
        "Covers projects from {2} to {3}. Version {4}.").format(
            clean_preamble, doc["extras"]["source_type"],
            doc["extras"]["temporal_start"], doc["extras"]["temporal_end"],
            doc["version"])

    doc["extras"]["tags"] = [
        "aiddata", "geocoded", "release", "aid", "economics", "welfare"
    ]

    is_active = doc["extras"]["data_set_preamble"] in config.release_iso3
    doc["active"] = int(is_active)

    if update:
        name_original = client.asdf.data.find_one({'name': doc["name"]})

        if name_original is None and base_original is None:
            update = False
            warn(("Update specified but no dataset with matching "
                  "base ({0}) or name ({1}) was found").format(
                      doc["base"], doc["name"]))

            # in case we ended up not finding a match for name
            doc["asdf"]["date_added"] = str(datetime.date.today())

        elif name_original is not None and base_original is not None:

            if str(name_original['_id']) != str(base_original['_id']):
                quit("Update option specified but identifying fields (base "
                     "and name) belong to different existing datasets."
                     "\n\tBase: {0}\n\tName: {1}".format(
                         doc["base"], doc["name"]))
            else:
                existing_original = name_original

        elif name_original is not None:
            existing_original = name_original

        elif base_original is not None:
            existing_original = base_original

        doc["asdf"]["date_added"] = existing_original["asdf"]["date_added"]
        # doc["active"] = existing_original["active"]

    # -------------------------------------

    if update == "partial":
        print "\nProcessed document:"
        pprint(doc)

        print "\nUpdating database (dry run = {0})...".format(dry_run)
        if not dry_run:
            dbu.update(doc, update, existing_original)

        print "\n{0}: Done ({1} update).\n".format(script, update)
        return 0

    # -------------------------------------
    print "\nProcessing temporal..."

    # set temporal using release datapackage
    doc["temporal"] = {}
    doc["temporal"]["name"] = doc['extras']['temporal_name']
    doc["temporal"]["format"] = "%Y"
    doc["temporal"]["type"] = "year"
    doc["temporal"]["start"] = doc['extras']['temporal_start']
    doc["temporal"]["end"] = doc['extras']['temporal_end']

    # -------------------------------------
    print "\nProcessing spatial..."

    # get extent
    loc_table_path = doc['base'] + "/data/locations.csv"

    env = ru.release_envelope(loc_table_path)
    env = ru.trim_envelope(env)
    print "Dataset bounding box: ", env

    doc["scale"] = ru.envelope_to_scale(env)

    # set spatial
    doc["spatial"] = ru.envelope_to_geom(env)

    # -------------------------------------
    print '\nProcessing resources...'

    resource_tmp = {
        "name": doc['name'],
        "bytes": 0,
        "path": "",
        "start": doc["temporal"]['start'],
        "end": doc["temporal"]['end']
    }

    # resource_order = ["name", "path", "bytes", "start", "end"]
    # resource_tmp = OrderedDict((k, resource_tmp[k]) for k in resource_order)
    resource_list = [resource_tmp]

    doc["resources"] = resource_list

    # -------------------------------------
    # database updates

    print "\nProcessed document..."
    pprint(doc)

    print "\nUpdating database (dry run = {0})...".format(dry_run)
    if not dry_run:
        dbu.update(doc, update, existing_original)

    if update:
        print "\n{0}: Done ({1} update).\n".format(script, update)
    else:
        print "\n{0}: Done.\n".format(script)

    print '\n---------------------------------------\n'

    return 0
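
A minimal, hypothetical driver for the release-ingest run() above might look
like the sketch below. The MongoDB connection details, the release path, and
the Config stand-in are illustrative assumptions (the real project supplies
its own config object and MongoUpdate class), not part of the original script.

from pymongo import MongoClient


class Config(object):
    """Stand-in for the project config object that run() expects."""
    def __init__(self, client):
        self.client = client
        self.versions = {"release-ingest": "0.1"}
        self.release_iso3 = []


mongo_client = MongoClient("localhost", 27017)
run(path="/data/releases/example_release",
    config=Config(mongo_client),
    generator="manual",
    update="partial",
    dry_run=True)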
Example 2
def run(path=None, client=None, version=None, config=None,
        generator="auto", update=False, dry_run=False):

    print '\n---------------------------------------'

    script = os.path.basename(__file__)

    def quit(reason):
        """quit script cleanly

        to do:
        - do error log stuff
        - output error logs somewhere
        - if auto, move job file to error location
        """
        raise Exception("{0}: terminating script - {1}\n".format(
            script, reason))


    if config is not None:
        client = config.client
    elif client is not None:
        config = client.info.config.findOne()
    else:
        quit('Neither config nor client provided.')

    version = config.versions["gadm-ingest"]

    # update mongo class instance
    dbu = MongoUpdate(client)

    # -------------------------------------


    # check path
    if path is not None:
        if not os.path.exists(path):
            quit("Invalid path provided.")
    else:
        quit("No path provided")

    # optional arg - mainly for user to specify manual run
    if generator not in ['auto', 'manual']:
        quit("Invalid generator input")

    if client is None:
        quit("No mongodb client connection provided")

    if config is None:
        quit("No config object provided")


    raw_update = update
    if update in ["partial", "meta"]:
        update = "partial"
    elif update in ["update", True, 1, "True", "full", "all"]:
        update = "full"
    elif update in ["missing"]:
        update = "missing"
    else:
        update = False

    print "running update status `{0}` (input: `{1}`)".format(
        update, raw_update)

    if dry_run in ["false", "False", "0", "None", "none", "no"]:
        dry_run = False

    dry_run = bool(dry_run)
    if dry_run:
        print "running dry run"

    base_original = client.asdf.data.find_one({'base': path})

    existing_original = None
    if update:
        if not "data" in client.asdf.collection_names():
            update = False
            msg = "Update specified but no data collection exists."
            if generator == "manual":
                raise Exception(msg)
            else:
                warn(msg)
        else:
            if base_original is None and update != "missing":
                update = False
                msg = "Update specified but no existing dataset found."
                if generator == "manual":
                    raise Exception(msg)
                else:
                    warn(msg)


    # init document
    doc = {}

    doc["asdf"] = {}
    doc["asdf"]["script"] = script
    doc["asdf"]["version"] = version
    doc["asdf"]["generator"] = generator
    doc["asdf"]["date_updated"] = str(datetime.date.today())
    if not update or update == "missing":
        doc["asdf"]["date_added"] = str(datetime.date.today())

    # -------------------------------------

    if os.path.isdir(path):
        # remove trailing slash from path
        if path.endswith("/"):
            path = path[:-1]
    else:
        quit("Invalid base directory provided.")

    # -------------------------------------

    doc['base'] = path

    doc["type"] = "boundary"
    doc["file_format"] = "vector"
    doc["file_extension"] = "geojson"
    doc["file_mask"] = "None"


    # -------------------------------------

    gadm_name = os.path.basename(doc["base"])

    gadm_version = os.path.basename(os.path.dirname(path))[4:]

    gadm_iso3 = gadm_name[:3]
    gadm_adm = gadm_name[4:]
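    # the directory layout is assumed to be .../gadm<version>/<ISO3>_adm<level>,
    # e.g. .../gadm2.8/AFG_adm1 -> version "2.8", iso3 "AFG", adm "adm1"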


    parent = os.path.dirname(os.path.abspath(__file__))
    gadm_lookup_path = parent + '/gadm_iso3.json'
    gadm_lookup = json.load(open(gadm_lookup_path, 'r'))

    gadm_country = unidecode(gadm_lookup[gadm_iso3])

    doc["name"] = (gadm_iso3.lower() + "_" + gadm_adm.lower() + "_gadm" +
                   gadm_version.replace('.', ''))


    inactive_bnds_list = config.inactive_bnds
    is_active = doc["name"] not in inactive_bnds_list


    doc["active"] = int(is_active)


    name_original = client.asdf.data.find_one({'name': doc["name"]})

    if not update and base_original is not None:
        msg = "No update specified but dataset exists (base: {0})".format(base_original['base'])
        raise Exception(msg)
    elif not update and name_original is not None:
        msg = "No update specified but dataset exists (name: {0})".format(name_original['name'])
        raise Exception(msg)


    if update:

        if update == "missing" and name_original is not None and base_original is not None:
            warn("Dataset exists (running in 'missing' update mode). Running partial update and setting to active (if possible).")
            update = "partial"

        if update != "missing":
            if name_original is None and base_original is None:
                update = False
                warn(("Update specified but no dataset with matching "
                      "base ({0}) or name ({1}) was found").format(doc["base"],
                                                                   doc["name"]))

                # in case we ended up not finding a match for name
                doc["asdf"]["date_added"] = str(datetime.date.today())

            elif name_original is not None and base_original is not None:

                if str(name_original['_id']) != str(base_original['_id']):
                    quit("Update option specified but identifying fields (base "
                         "and name) belong to different existing datasets."
                         "\n\tBase: {0}\n\tName: {1}".format(doc["base"],
                                                             doc["name"]))
                else:
                    existing_original = name_original

            elif name_original is not None:
                existing_original = name_original

            elif base_original is not None:
                existing_original = base_original


            doc["asdf"]["date_added"] = existing_original["asdf"]["date_added"]

            if existing_original["active"] == -1:
                doc["active"] = -1


    doc["title"] = " ".join([gadm_country, gadm_adm.upper(), "Boundary - GADM", gadm_version])

    doc["description"] = "PLACEHOLDER"

    doc["version"] = gadm_version


    doc["options"] = {}
    doc["options"]["group"] = (gadm_iso3.lower() + "_gadm" +
                               gadm_version.replace('.', ''))

    doc["extras"] = {}

    doc["extras"]["citation"] = ("Global Administrative Areas "
                                 "(GADM) http://www.gadm.org.")
    doc["extras"]["sources_web"] = "http://www.gadm.org"
    doc["extras"]["sources_name"] = "Global Administrative Areas (GADM)"

    doc["extras"]["gadm_country"] = gadm_country
    doc["extras"]["gadm_iso3"] = gadm_iso3
    doc["extras"]["gadm_adm"] = int(gadm_adm[-1:])
    doc["extras"]["gadm_unit"] = "PLACEHOLDER"
    doc["extras"]["tags"] = ["gadm", gadm_adm, gadm_country]

    doc["options"]["group_title"] = "{0} GADM {1}".format(gadm_country,
                                                          gadm_version)

    # boundary group
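    # adm0 serves as the group's "actual" (national) boundary and starts with
    # active = -1; all other admin levels are grouped as "sub" boundaries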
    if "adm0" in gadm_name.lower():
        doc["options"]["group_class"] = "actual"
        doc["active"] = -1
    else:
        doc["options"]["group_class"] = "sub"

    # -------------------------------------
    # resource scan

    # find all files with file_extension in path
    file_list = []
    for root, dirs, files in os.walk(doc["base"]):
        for fname in files:

            fname = os.path.join(root, fname)
            file_check = fname.endswith('.' + doc["file_extension"])

            if file_check and not fname.endswith('simplified.geojson'):
                file_list.append(fname)


    if len(file_list) == 0:
        quit("No vector file found in " + doc["base"])

    elif len(file_list) > 1:
        quit("Boundaries must be submitted individually.")


    f = file_list[0]
    print f

    # get adm unit name for country and add to gadm info and description
    if gadm_adm.lower() == "adm0":
        gadm_unit = "Country"
    else:
        with fiona.open(f, 'r') as tmp_feature_src:
            tmp_feature = tmp_feature_src[0]
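            # GADM's ENGTYPE_<level> attribute holds the English name of the
            # admin unit type for that level (e.g. "Province")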
            gadm_unit = tmp_feature['properties']['ENGTYPE_'+ gadm_adm[-1:]]

    doc["extras"]["gadm_unit"] = gadm_unit
    if gadm_unit not in [None, "Unknown"]:
        doc["extras"]["tags"].append(gadm_unit)
    doc["description"] = "GADM Boundary File for {0} ({1}) in {2}.".format(
        gadm_adm.upper(), gadm_unit, gadm_country)

    # -------------------------------------

    if update == "partial":
        print "\nProcessed document:"
        pprint(doc)

        print "\nUpdating database (dry run = {0})...".format(dry_run)
        if not dry_run:
            dbu.update(doc, update, existing_original)

        print "\n{0}: Done ({1} update).\n".format(script, update)
        return 0

    # -------------------------------------
    print "\nProcessing temporal..."

    # temporally invariant dataset
    doc["temporal"] = {}
    doc["temporal"]["name"] = "Temporally Invariant"
    doc["temporal"]["format"] = "None"
    doc["temporal"]["type"] = "None"
    doc["temporal"]["start"] = 10000101
    doc["temporal"]["end"] = 99991231

    # -------------------------------------
    print "\nProcessing spatial..."

    if not dry_run:
        convert_status = ru.add_asdf_id(f)
        if convert_status == 1:
            quit("Error adding ad_id to boundary file & outputting geojson.")


    env = ru.vector_envelope(f)
    env = ru.trim_envelope(env)
    print "Dataset bounding box: ", env

    doc["scale"] = ru.envelope_to_scale(env)

    # set spatial
    doc["spatial"] = ru.envelope_to_geom(env)

    # -------------------------------------
    print '\nProcessing resources...'

    # resources
    # individual resource info
    resource_tmp = {}

    # path relative to base
    resource_tmp["path"] = f[f.index(doc["base"]) + len(doc["base"]) + 1:]

    resource_tmp["name"] = doc["name"]
    resource_tmp["bytes"] = os.path.getsize(f)
    resource_tmp["start"] = 10000101
    resource_tmp["end"] = 99991231

    # reorder resource fields
    # resource_order = ["name", "path", "bytes", "start", "end"]
    # resource_tmp = OrderedDict((k, resource_tmp[k]) for k in resource_order)

    # update main list
    resource_list = [resource_tmp]

    doc["resources"] = resource_list

    # -------------------------------------
    # database updates

    print "\nProcessed document:"
    pprint(doc)

    print "\nUpdating database (dry run = {0})...".format(dry_run)
    if not dry_run:
        dbu.update(doc, update, existing_original)
        try:
            dbu.features_to_mongo(doc['name'])
        except:
            # could remove data entry if it cannot be added
            # to mongo. or, at least make sure the data entry is
            # set to inactive
            raise

    if update:
        print "\n{0}: Done ({1} update).\n".format(script, update)
    else:
        print "\n{0}: Done.\n".format(script)

    print '\n---------------------------------------\n'

    return 0
Example 3
def run(path=None, client=None, version=None, config=None,
        generator="auto", update=False, dry_run=False):

    print '\n---------------------------------------'

    script = os.path.basename(__file__)

    def quit(reason):
        """quit script cleanly

        to do:
        - do error log stuff
        - output error logs somewhere
        - if auto, move job file to error location
        """
        raise Exception("{0}: terminating script - {1}\n".format(
            script, reason))


    if config is not None:
        client = config.client
    elif client is not None:
        config = client.info.config.findOne()
    else:
        quit('Neither config nor client provided.')

    version = config.versions["release-ingest"]

    # update mongo class instance
    dbu = MongoUpdate(client)

    # -------------------------------------


    # check path
    if path is not None:
        if not os.path.exists(path):
            quit("Invalid path provided.")
    else:
        quit("No path provided")


    # optional arg - mainly for user to specify manual run
    if generator not in ['auto', 'manual']:
        quit("Invalid generator input")


    if client is None:
        quit("No mongodb client connection provided")

    if config is None:
        quit("No config object provided")


    raw_update = update
    if update in ["partial", "meta"]:
        update = "partial"
    elif update in ["update", True, 1, "True", "full", "all"]:
        update = "full"
    else:
        update = False

    print "running update status `{0}` (input: `{1}`)".format(
        update, raw_update)

    if dry_run in ["false", "False", "0", "None", "none", "no"]:
        dry_run = False

    dry_run = bool(dry_run)
    if dry_run:
        print "running dry run"

    existing_original = None
    if update:
        if not "data" in client.asdf.collection_names():
            update = False
            msg = "Update specified but no data collection exists."
            if generator == "manual":
                raise Exception(msg)
            else:
                warn(msg)
        else:
            base_original = client.asdf.data.find_one({'base': path})
            if base_original is None:
                update = False
                msg = "Update specified but no existing dataset found."
                if generator == "manual":
                    raise Exception(msg)
                else:
                    warn(msg)

    # init document
    doc = {}

    doc["asdf"] = {}
    doc["asdf"]["script"] = script
    doc["asdf"]["version"] = version
    doc["asdf"]["generator"] = generator
    doc["asdf"]["date_updated"] = str(datetime.date.today())
    if not update:
        doc["asdf"]["date_added"] = str(datetime.date.today())

    # -------------------------------------

    if os.path.isdir(path):
        # remove trailing slash from path
        if path.endswith("/"):
            path = path[:-1]
    else:
        quit("Invalid base directory provided.")

    # -------------------------------------

    doc['base'] = path

    doc["type"] = "release"
    doc["file_format"] = "release"
    doc["file_extension"] = ""
    doc["file_mask"] = "None"

    # -------------------------------------

    # get release datapackage
    release_path = doc["base"] + '/datapackage.json'
    release_package = json.load(open(release_path, 'r'))

    core_fields = ['name', 'title', 'description', 'version']

    doc["extras"] = {}

    for f in release_package.keys():

        if f in core_fields:
            rkey = f.replace(" ", "_").lower()
            doc[f] = release_package[f]

        elif f == 'extras':

            for g in release_package['extras']:
                rkey = g['key'].replace(" ", "_").lower()
                doc['extras'][rkey] = g['value']


    # updating these fields because
    # - current name is broken (not proper version)
    # - current title and description are not well suited for
    #   general consumption via DET
    doc["extras"]["original_name"] = doc["name"]
    doc["extras"]["original_title"] = doc["title"]
    doc["extras"]["original_description"] = doc["description"]

    doc["name"] = "{0}_{1}_{2}_v{3}".format(
        doc["extras"]["data_set_preamble"].lower(),
        doc["extras"]["data_type"].lower(),
        doc["extras"]["processing_level"].lower(),
        str(doc["version"]).replace(".", "_"))


    preamble_word_list = re.findall(
        '[A-Z](?:[A-Z]*(?![a-z])|[a-z]*)',
        doc["extras"]["data_set_preamble"])

    clean_preamble_word_list = [i for i in preamble_word_list
                                if i not in ["AIMS"]]

    clean_preamble = ' '.join(clean_preamble_word_list)



    # only replace the datapackage title/description below when they still
    # carry their auto-generated/sample defaults
    if doc["title"] == "{} {}".format(doc["extras"]["data_set_preamble"], doc["extras"]["data_type"]):
        doc["title"] = "{0} Geocoded Aid Data v{1}".format(clean_preamble, doc["version"])


    auto_description = (
        "Aid data from {0} {1}, geocoded and published by AidData. "
        "Covers projects from {2} to {3}. Version {4}.").format(
            clean_preamble,
            doc["extras"]["source_type"],
            doc["extras"]["temporal_start"],
            doc["extras"]["temporal_end"],
            doc["version"])

    if doc["description"] == "This is a sample description":
        doc["description"] = auto_description


    doc["extras"]["tags"] = ["aiddata", "geocoded", "release", "aid", "economics", "welfare"]

    is_active = doc["extras"]["data_set_preamble"] in config.release_iso3
    doc["active"] = int(is_active)

    if update:
        name_original = client.asdf.data.find_one({'name': doc["name"]})

        if name_original is None and base_original is None:
            update = False
            warn(("Update specified but no dataset with matching "
                  "base ({0}) or name ({1}) was found").format(doc["base"],
                                                               doc["name"]))

            # in case we ended up not finding a match for name
            doc["asdf"]["date_added"] = str(datetime.date.today())

        elif name_original is not None and base_original is not None:

            if str(name_original['_id']) != str(base_original['_id']):
                quit("Update option specified but identifying fields (base "
                     "and name) belong to different existing datasets."
                     "\n\tBase: {0}\n\tName: {1}".format(doc["base"],
                                                         doc["name"]))
            else:
                existing_original = name_original

        elif name_original is not None:
            existing_original = name_original

        elif base_original is not None:
            existing_original = base_original

        doc["asdf"]["date_added"] = existing_original["asdf"]["date_added"]
        # doc["active"] = existing_original["active"]


    # -------------------------------------

    if update == "partial":
        print "\nProcessed document:"
        pprint(doc)

        print "\nUpdating database (dry run = {0})...".format(dry_run)
        if not dry_run:
            dbu.update(doc, update, existing_original)

        print "\n{0}: Done ({1} update).\n".format(script, update)
        return 0

    # -------------------------------------
    print "\nProcessing temporal..."

    # set temporal using release datapackage
    doc["temporal"] = {}
    doc["temporal"]["name"] = doc['extras']['temporal_name']
    doc["temporal"]["format"] = "%Y"
    doc["temporal"]["type"] = "year"
    doc["temporal"]["start"] = doc['extras']['temporal_start']
    doc["temporal"]["end"] = doc['extras']['temporal_end']


    # -------------------------------------
    print "\nProcessing spatial..."

    # get extent
    loc_table_path = doc['base'] + "/data/locations.csv"

    env = ru.release_envelope(loc_table_path)
    env = ru.trim_envelope(env)
    print "Dataset bounding box: ", env

    doc["scale"] = ru.envelope_to_scale(env)

    # set spatial
    doc["spatial"] = ru.envelope_to_geom(env)

    # -------------------------------------
    print '\nProcessing resources...'

    resource_tmp = {
        "name": doc['name'],
        "bytes": 0,
        "path": "",
        "start": doc["temporal"]['start'],
        "end": doc["temporal"]['end']
    }

    # resource_order = ["name", "path", "bytes", "start", "end"]
    # resource_tmp = OrderedDict((k, resource_tmp[k]) for k in resource_order)
    resource_list = [resource_tmp]

    doc["resources"] = resource_list

    # -------------------------------------
    # database updates

    print "\nProcessed document..."
    pprint(doc)


    print "\nUpdating database (dry run = {0})...".format(dry_run)
    if not dry_run:
        dbu.update(doc, update, existing_original)

    if update:
        print "\n{0}: Done ({1} update).\n".format(script, update)
    else:
        print "\n{0}: Done.\n".format(script)

    print '\n---------------------------------------\n'

    return 0
Example 4
def run(path=None,
        client=None,
        version=None,
        config=None,
        generator="auto",
        update=False,
        dry_run=False):

    print '\n---------------------------------------'

    script = os.path.basename(__file__)

    def quit(reason):
        """quit script cleanly

        to do:
        - do error log stuff
        - output error logs somewhere
        - if auto, move job file to error location
        """
        raise Exception("{0}: terminating script - {1}\n".format(
            script, reason))

    if config is not None:
        client = config.client
    elif client is not None:
        config = client.info.config.findOne()
    else:
        quit('Neither config nor client provided.')

    version = config.versions["boundary-ingest"]

    # update mongo class instance
    dbu = MongoUpdate(client)

    # -------------------------------------

    # check path
    if path is not None:
        if not os.path.exists(path):
            quit("Invalid path provided.")
    else:
        quit("No path provided")

    # optional arg - mainly for user to specify manual run
    if generator not in ['auto', 'manual']:
        quit("Invalid generator input")

    if client is None:
        quit("No mongodb client connection provided")

    if config is None:
        quit("No config object provided")

    raw_update = update
    if update in ["partial", "meta"]:
        update = "partial"
    elif update in ["update", True, 1, "True", "full", "all"]:
        update = "full"
    else:
        update = False

    print "running update status `{0}` (input: `{1}`)".format(
        update, raw_update)

    if dry_run in ["false", "False", "0", "None", "none", "no"]:
        dry_run = False

    dry_run = bool(dry_run)
    if dry_run:
        print "running dry run"

    # init document
    doc = {}

    doc["asdf"] = {}
    doc["asdf"]["script"] = script
    doc["asdf"]["version"] = version
    doc["asdf"]["generator"] = generator
    doc["asdf"]["date_updated"] = str(datetime.date.today())
    if not update:
        doc["asdf"]["date_added"] = str(datetime.date.today())

    # -------------------------------------

    # get inputs
    if os.path.isfile(path):
        data = json.load(open(path, 'r'))
    else:
        quit("invalid input file path")

    required_core_fields = [
        "base", "type", "file_extension", "file_mask", "name", "title",
        "description", "version", "active"
    ]

    missing_core_fields = [i for i in required_core_fields if i not in data]

    if len(missing_core_fields) > 0:
        quit("Missing core fields ({0})".format(missing_core_fields))
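
    # for reference, a hypothetical job file consumed by this script could
    # look like the following (all values are illustrative only):
    # {
    #     "base": "/data/boundaries/example_adm1",
    #     "type": "boundary",
    #     "file_extension": "geojson",
    #     "file_mask": "None",
    #     "name": "example_adm1",
    #     "title": "Example ADM1 Boundary",
    #     "description": "Example boundary dataset",
    #     "version": "1.0",
    #     "active": 1,
    #     "options": {
    #         "group": "example_group",
    #         "group_title": "Example Group",
    #         "group_class": "actual"
    #     },
    #     "extras": {"tags": ["boundary"]}
    # }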

    existing_original = None
    if update:
        if not "data" in client.asdf.collection_names():
            update = False
            msg = "Update specified but no data collection exists."
            if generator == "manual":
                raise Exception(msg)
            else:
                warn(msg)
        else:
            base_original = client.asdf.data.find_one({'base': data["base"]})
            if base_original is None:
                update = False
                msg = "Update specified but no existing dataset found."
                if generator == "manual":
                    raise Exception(msg)
                else:
                    warn(msg)

    # -------------------------------------

    # validate class instance
    v = ValidationTools(client)
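    # each validation call below is assumed to return a result object exposing
    # .isvalid, .error, .value and (where relevant) .data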

    # validate base path
    valid_base = v.base(data["base"], update)

    if not valid_base.isvalid:
        quit(valid_base.error)

    doc["base"] = valid_base.value
    base_exists = valid_base.data['exists']

    # validate name
    valid_name = v.name(data["name"], update)

    if not valid_name.isvalid:
        quit(valid_name.error)

    doc["name"] = valid_name.value
    name_exists = valid_name.data['exists']

    if update:
        if not base_exists and not name_exists:
            warn(("Update specified but no dataset with matching "
                  "base ({0}) or name ({1}) was found").format(
                      doc["base"], doc["name"]))

        elif base_exists and name_exists:

            base_id = str(valid_base.data['search']['_id'])
            name_id = str(valid_name.data['search']['_id'])

            if base_id != name_id:
                quit("Update option specified but identifying fields (base "
                     "and name) belong to different existing datasets."
                     "\n\tBase: {0}\n\tName: {1}".format(
                         doc["base"], doc["name"]))
            else:
                existing_original = valid_name.data['search']

        elif name_exists:
            existing_original = valid_name.data['search']

        elif base_exists:
            existing_original = valid_base.data['search']

        doc["asdf"]["date_added"] = existing_original["asdf"]["date_added"]
        # doc["active"] = existing_original["active"]

    # validate type and set file_format
    valid_type = v.data_type(data["type"])

    if not valid_type.isvalid:
        quit(valid_type.error)

    doc["type"] = valid_type.value
    doc["file_format"] = valid_type.data["file_format"]

    if doc["type"] != "boundary":
        quit("Invalid type ({0}), must be boundary.".format(doc["type"]))

    # validate file extension (validation depends on file format)
    valid_extension = v.file_extension(data["file_extension"],
                                       doc["file_format"])

    if not valid_extension.isvalid:
        quit(valid_extension.error)

    doc["file_extension"] = valid_extension.value

    # validate title, description and version
    doc["title"] = str(data["title"])
    doc["description"] = str(data["description"])
    doc["version"] = str(data["version"])

    doc["active"] = int(data["active"])

    # validate options for raster

    if not "options" in data:
        quit("Missing options lookup")

    required_options = ["group", "group_title", "group_class"]

    missing_options = [i for i in required_options if i not in data["options"]]

    if len(missing_options) > 0:
        quit(
            "Missing fields from options lookup ({0})".format(missing_options))

    doc["options"] = {}

    ###

    warn("Current group checks for boundary do not cover all potential cases "
         "(e.g., geometry changes to group actual, various conflicts based "
         "on group_class, existing groups, etc.).")

    # validate group info
    valid_group = v.group(data["options"]["group"],
                          data["options"]["group_class"])

    if not valid_group.isvalid:
        quit(valid_group.error)

    doc["options"]["group"] = valid_group.value

    doc["options"]["group_class"] = data["options"]["group_class"]
    doc["options"]["group_title"] = data["options"]["group_title"]

    ###

    # extras
    if not "extras" in data:
        print(
            "Although fields in extras are not required, extras may contain "
            "commonly used fields which should be added whenever possible "
            "(example: the sources_web field)")
        doc["extras"] = {}

    elif not isinstance(data["extras"], dict):
        quit("Invalid instance of extras ({0}) of type: {1}".format(
            data["extras"], type(data["extras"])))
    else:
        doc["extras"] = data["extras"]

    if not "tags" in doc["extras"]:
        doc["extras"]["tags"] = []

    if not "boundary" in doc["extras"]["tags"]:
        doc["extras"]["tags"].append("boundary")

    # -------------------------------------
    # resource scan

    # find all files with file_extension in path
    file_list = []
    for root, dirs, files in os.walk(doc["base"]):
        for fname in files:

            fname = os.path.join(root, fname)
            file_check = fname.endswith('.' + doc["file_extension"])

            if file_check and not fname.endswith('simplified.geojson'):
                file_list.append(fname)

    if len(file_list) == 0:
        quit("No vector file found in " + doc["base"])

    elif len(file_list) > 1:
        quit("Boundaries must be submitted individually.")

    f = file_list[0]
    print f

    # -------------------------------------

    if update == "partial":
        print "\nProcessed document:"
        pprint(doc)

        print "\nUpdating database (dry run = {0})...".format(dry_run)
        if not dry_run:
            dbu.update(doc, update, existing_original)

        print "\n{0}: Done ({1} update).\n".format(script, update)
        return 0

    # -------------------------------------
    print "\nProcessing temporal..."

    # temporally invariant dataset
    doc["temporal"] = {}
    doc["temporal"]["name"] = "Temporally Invariant"
    doc["temporal"]["format"] = "None"
    doc["temporal"]["type"] = "None"
    doc["temporal"]["start"] = 10000101
    doc["temporal"]["end"] = 99991231

    # -------------------------------------
    print "\nProcessing spatial..."

    if not dry_run:
        convert_status = ru.add_asdf_id(f)
        if convert_status == 1:
            quit("Error adding ad_id to boundary file & outputting geojson.")

    env = ru.vector_envelope(f)
    env = ru.trim_envelope(env)
    print "Dataset bounding box: ", env

    doc["scale"] = ru.envelope_to_scale(env)

    # set spatial
    doc["spatial"] = ru.envelope_to_geom(env)

    # -------------------------------------
    print '\nProcessing resources...'

    # resources
    # individual resource info
    resource_tmp = {}

    # path relative to base
    resource_tmp["path"] = f[f.index(doc["base"]) + len(doc["base"]) + 1:]

    resource_tmp["name"] = doc["name"]
    resource_tmp["bytes"] = os.path.getsize(f)
    resource_tmp["start"] = 10000101
    resource_tmp["end"] = 99991231

    # reorder resource fields
    # resource_order = ["name", "path", "bytes", "start", "end"]
    # resource_tmp = OrderedDict((k, resource_tmp[k]) for k in resource_order)

    # update main list
    resource_list = [resource_tmp]

    doc["resources"] = resource_list

    # -------------------------------------
    # database updates

    print "\nProcessed document..."
    pprint(doc)

    print "\nUpdating database (dry run = {0})...".format(dry_run)
    if not dry_run:
        dbu.update(doc, update, existing_original)
        try:
            dbu.features_to_mongo(doc['name'])
        except:
            # could remove data entry if it cannot be added
            # to mongo. or, at least make sure the data entry is
            # set to inactive
            raise

    if update:
        print "\n{0}: Done ({1} update).\n".format(script, update)
    else:
        print "\n{0}: Done.\n".format(script)

    print '\n---------------------------------------\n'

    return 0
Example 5
def run(path=None, client=None, version=None, config=None,
        generator="auto", update=False, dry_run=False):

    print '\n---------------------------------------'

    script = os.path.basename(__file__)

    def quit(reason):
        """quit script cleanly

        to do:
        - do error log stuff
        - output error logs somewhere
        - if auto, move job file to error location
        """
        raise Exception("{0}: terminating script - {1}\n".format(
            script, reason))


    if config is not None:
        client = config.client
    elif client is not None:
        config = client.info.config.findOne()
    else:
        quit('Neither config nor client provided.')

    version = config.versions["boundary-ingest"]

    # update mongo class instance
    dbu = MongoUpdate(client)

    # -------------------------------------


    # check path
    if path is not None:
        if not os.path.exists(path):
            quit("Invalid path provided.")
    else:
        quit("No path provided")


    # optional arg - mainly for user to specify manual run
    if generator not in ['auto', 'manual']:
        quit("Invalid generator input")


    if client is None:
        quit("No mongodb client connection provided")

    if config is None:
        quit("No config object provided")


    raw_update = update
    if update in ["partial", "meta"]:
        update = "partial"
    elif update in ["update", True, 1, "True", "full", "all"]:
        update = "full"
    else:
        update = False

    print "running update status `{0}` (input: `{1}`)".format(
        update, raw_update)

    if dry_run in ["false", "False", "0", "None", "none", "no"]:
        dry_run = False

    dry_run = bool(dry_run)
    if dry_run:
        print "running dry run"


    # init document
    doc = {}

    doc["asdf"] = {}
    doc["asdf"]["script"] = script
    doc["asdf"]["version"] = version
    doc["asdf"]["generator"] = generator
    doc["asdf"]["date_updated"] = str(datetime.date.today())
    if not update:
        doc["asdf"]["date_added"] = str(datetime.date.today())

    # -------------------------------------

    # get inputs
    if os.path.isfile(path):
        data = json.load(open(path, 'r'))
    else:
        quit("invalid input file path")


    required_core_fields = [
        "base", "type", "file_extension", "file_mask",
        "name", "title", "description", "version", "active"
    ]

    missing_core_fields = [i for i in required_core_fields
                           if i not in data]

    if len(missing_core_fields) > 0:
        quit("Missing core fields ({0})".format(missing_core_fields))


    existing_original = None
    if update:
        if not "data" in client.asdf.collection_names():
            update = False
            msg = "Update specified but no data collection exists."
            if generator == "manual":
                raise Exception(msg)
            else:
                warn(msg)
        else:
            base_original = client.asdf.data.find_one({'base': data["base"]})
            if base_original is None:
                update = False
                msg = "Update specified but no existing dataset found."
                if generator == "manual":
                    raise Exception(msg)
                else:
                    warn(msg)

    # -------------------------------------

    # validate class instance
    v = ValidationTools(client)


    # validate base path
    valid_base = v.base(data["base"], update)

    if not valid_base.isvalid:
        quit(valid_base.error)

    doc["base"] = valid_base.value
    base_exists = valid_base.data['exists']


    # validate name
    valid_name = v.name(data["name"], update)

    if not valid_name.isvalid:
        quit(valid_name.error)

    doc["name"] = valid_name.value
    name_exists = valid_name.data['exists']


    if update:
        if not base_exists and not name_exists:
            warn(("Update specified but no dataset with matching "
                  "base ({0}) or name ({1}) was found").format(doc["base"],
                                                               doc["name"]))

        elif base_exists and name_exists:

            base_id = str(valid_base.data['search']['_id'])
            name_id = str(valid_name.data['search']['_id'])

            if base_id != name_id:
                quit("Update option specified but identifying fields (base "
                     "and name) belong to different existing datasets."
                     "\n\tBase: {0}\n\tName: {1}".format(doc["base"],
                                                         doc["name"]))
            else:
                existing_original = valid_name.data['search']

        elif name_exists:
            existing_original = valid_name.data['search']

        elif base_exists:
            existing_original = valid_base.data['search']

        doc["asdf"]["date_added"] = existing_original["asdf"]["date_added"]
        # doc["active"] = existing_original["active"]


    # validate type and set file_format
    valid_type = v.data_type(data["type"])

    if not valid_type.isvalid:
        quit(valid_type.error)

    doc["type"] = valid_type.value
    doc["file_format"] = valid_type.data["file_format"]

    if doc["type"] != "boundary":
        quit("Invalid type ({0}), must be boundary.".format(doc["type"]))


    # validate file extension (validation depends on file format)
    valid_extension = v.file_extension(data["file_extension"],
                                       doc["file_format"])

    if not valid_extension.isvalid:
        quit(valid_extension.error)

    doc["file_extension"] = valid_extension.value


    # validate title, description and version
    doc["title"] = str(data["title"])
    doc["description"] = str(data["description"])
    doc["version"] = str(data["version"])

    doc["active"] = int(data["active"])


    # validate options for raster

    if not "options" in data:
        quit("Missing options lookup")


    required_options = ["group", "group_title", "group_class"]

    missing_options = [i for i in required_options
                       if i not in data["options"]]

    if len(missing_options) > 0:
        quit("Missing fields from options lookup ({0})".format(
            missing_options))


    doc["options"] = {}


    ###

    warn("Current group checks for boundary do not cover all potential cases "
         "(e.g., geometry changes to group actual, various conflicts based "
         "on group_class, existing groups, etc.).")

    # validate group info
    valid_group = v.group(data["options"]["group"], data["options"]["group_class"])

    if not valid_group.isvalid:
        quit(valid_group.error)

    doc["options"]["group"] = valid_group.value

    doc["options"]["group_class"] = data["options"]["group_class"]
    doc["options"]["group_title"] = data["options"]["group_title"]


    ###


    # extras
    if not "extras" in data:
        print("Although fields in extras are not required, extras may contain "
              "commonly used fields which should be added whenever possible "
              "(example: the sources_web field)")
        doc["extras"] = {}

    elif not isinstance(data["extras"], dict):
        quit("Invalid instance of extras ({0}) of type: {1}".format(
            data["extras"], type(data["extras"])))
    else:
        doc["extras"] = data["extras"]

    if not "tags" in doc["extras"]:
        doc["extras"]["tags"] = []

    if not "boundary" in doc["extras"]["tags"]:
        doc["extras"]["tags"].append("boundary")


    # -------------------------------------
    # resource scan

    # find all files with file_extension in path
    file_list = []
    for root, dirs, files in os.walk(doc["base"]):
        for fname in files:

            fname = os.path.join(root, fname)
            file_check = fname.endswith('.' + doc["file_extension"])

            if file_check and not fname.endswith('simplified.geojson'):
                file_list.append(fname)


    if len(file_list) == 0:
        quit("No vector file found in " + doc["base"])

    elif len(file_list) > 1:
        quit("Boundaries must be submitted individually.")


    f = file_list[0]
    print f

    # -------------------------------------

    if update == "partial":
        print "\nProcessed document:"
        pprint(doc)

        print "\nUpdating database (dry run = {0})...".format(dry_run)
        if not dry_run:
            dbu.update(doc, update, existing_original)

        print "\n{0}: Done ({1} update).\n".format(script, update)
        return 0

    # -------------------------------------
    print "\nProcessing temporal..."

    # temporally invariant dataset
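    # (start/end use sentinel YYYYMMDD values covering year 1000 through 9999)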
    doc["temporal"] = {}
    doc["temporal"]["name"] = "Temporally Invariant"
    doc["temporal"]["format"] = "None"
    doc["temporal"]["type"] = "None"
    doc["temporal"]["start"] = 10000101
    doc["temporal"]["end"] = 99991231

    # -------------------------------------
    print "\nProcessing spatial..."

    if not dry_run:
        convert_status = ru.add_asdf_id(f)
        if convert_status == 1:
            quit("Error adding ad_id to boundary file & outputting geojson.")


    env = ru.vector_envelope(f)
    env = ru.trim_envelope(env)
    print "Dataset bounding box: ", env

    doc["scale"] = ru.envelope_to_scale(env)

    # set spatial
    doc["spatial"] = ru.envelope_to_geom(env)

    # -------------------------------------
    print '\nProcessing resources...'

    # resources
    # individual resource info
    resource_tmp = {}

    # path relative to base
    resource_tmp["path"] = f[f.index(doc["base"]) + len(doc["base"]) + 1:]

    resource_tmp["name"] = doc["name"]
    resource_tmp["bytes"] = os.path.getsize(f)
    resource_tmp["start"] = 10000101
    resource_tmp["end"] = 99991231

    # reorder resource fields
    # resource_order = ["name", "path", "bytes", "start", "end"]
    # resource_tmp = OrderedDict((k, resource_tmp[k]) for k in resource_order)

    # update main list
    resource_list = [resource_tmp]

    doc["resources"] = resource_list

    # -------------------------------------
    # database updates

    print "\nProcessed document..."
    pprint(doc)

    print "\nUpdating database (dry run = {0})...".format(dry_run)
    if not dry_run:
        dbu.update(doc, update, existing_original)
        # try:
        #     dbu.features_to_mongo(doc['name'])
        # except:
        #     # could remove data entry if it cannot be added
        #     # to mongo. or, at least make sure the data entry is
        #     # set to inactive
        #     raise

    if update:
        print "\n{0}: Done ({1} update).\n".format(script, update)
    else:
        print "\n{0}: Done.\n".format(script)

    print '\n---------------------------------------\n'

    return 0
Example 6
def run(path=None,
        client=None,
        version=None,
        config=None,
        generator="auto",
        update=False,
        dry_run=False):

    print '\n---------------------------------------'

    script = os.path.basename(__file__)

    def quit(reason):
        """quit script cleanly

        to do:
        - do error log stuff
        - output error logs somewhere
        - if auto, move job file to error location
        """
        raise Exception("{0}: terminating script - {1}\n".format(
            script, reason))

    if config is not None:
        client = config.client
    elif client is not None:
        config = client.info.config.findOne()
    else:
        quit('Neither config nor client provided.')

    # update mongo class instance
    dbu = MongoUpdate(client)

    # -------------------------------------

    # check path
    if path is not None:
        if not os.path.exists(path):
            quit("Invalid path provided.")
    else:
        quit("No path provided")

    # optional arg - mainly for user to specify manual run
    if generator not in ['auto', 'manual']:
        quit("Invalid generator input")

    if client is None:
        quit("No mongodb client connection provided")

    if config is None:
        quit("No config object provided")

    raw_update = update
    if update in ["partial", "meta"]:
        update = "partial"
    elif update in ["update", True, 1, "True", "full", "all"]:
        update = "full"
    elif update in ["missing"]:
        update = "missing"
    else:
        update = False

    print "running update status `{0}` (input: `{1}`)".format(
        update, raw_update)

    if dry_run in ["false", "False", "0", "None", "none", "no"]:
        dry_run = False

    dry_run = bool(dry_run)
    if dry_run:
        print "running dry run"

    base_original = client.asdf.data.find_one({'base': path})

    existing_original = None
    if update:
        if not "data" in client.asdf.collection_names():
            update = False
            msg = "Update specified but no data collection exists."
            if generator == "manual":
                raise Exception(msg)
            else:
                warn(msg)
        else:
            if base_original is None and update != "missing":
                update = False
                msg = "Update specified but no existing dataset found."
                if generator == "manual":
                    raise Exception(msg)
                else:
                    warn(msg)

    # init document
    doc = {}

    doc["asdf"] = {}
    doc["asdf"]["script"] = script
    doc["asdf"]["version"] = version
    doc["asdf"]["generator"] = generator
    doc["asdf"]["date_updated"] = str(datetime.date.today())
    if not update or update == "missing":
        doc["asdf"]["date_added"] = str(datetime.date.today())

    # -------------------------------------

    if os.path.isdir(path):
        # remove trailing slash from path
        if path.endswith("/"):
            path = path[:-1]
    else:
        quit("Invalid base directory provided.")

    # -------------------------------------

    doc['base'] = path

    doc["type"] = "boundary"
    doc["file_format"] = "vector"
    doc["file_extension"] = "geojson"
    doc["file_mask"] = "None"

    # -------------------------------------

    name = os.path.basename(doc["base"])

    iso3 = name[:3]
    adm = name[4:]

    metadata_path = os.path.join(path, 'metadata.json')
    metadata = json.load(open(metadata_path, 'r'))
    country = unidecode(metadata["country"])
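    # metadata.json is expected to provide at least a "country" field; the
    # dataset directory name is assumed to follow <ISO3>_adm<level>, and
    # version is taken from the run() argument as a string (e.g. "1_3_3")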

    doc["name"] = iso3.lower() + "_" + adm.lower() + "_gb_" + version

    inactive_bnds_list = config.inactive_bnds
    is_active = doc["name"] not in inactive_bnds_list

    doc["active"] = int(is_active)

    name_original = client.asdf.data.find_one({'name': doc["name"]})

    if not update and base_original is not None:
        msg = "No update specified but dataset exists (base: {0})".format(
            base_original['base'])
        raise Exception(msg)
    elif not update and name_original is not None:
        msg = "No update specified but dataset exists (name: {0})".format(
            name_original['name'])
        raise Exception(msg)

    if update:

        if update == "missing" and name_original is not None and base_original is not None:
            warn(
                "Dataset exists (running in 'missing' update mode). Running partial update and setting to active (if possible)."
            )
            update = "partial"

        if update != "missing":
            if name_original is None and base_original is None:
                update = False
                warn(("Update specified but no dataset with matching "
                      "base ({0}) or name ({1}) was found").format(
                          doc["base"], doc["name"]))

                # in case we ended up not finding a match for name
                doc["asdf"]["date_added"] = str(datetime.date.today())

            elif name_original is not None and base_original is not None:

                if str(name_original['_id']) != str(base_original['_id']):
                    quit(
                        "Update option specified but identifying fields (base "
                        "and name) belong to different existing datasets."
                        "\n\tBase: {0}\n\tName: {1}".format(
                            doc["base"], doc["name"]))
                else:
                    existing_original = name_original

            elif name_original is not None:
                existing_original = name_original

            elif base_original is not None:
                existing_original = base_original

            doc["asdf"]["date_added"] = existing_original["asdf"]["date_added"]

            if existing_original["active"] == -1:
                doc["active"] = -1

    doc["title"] = "{} {} - GeoBoundaries v{}".format(
        country, adm.upper(), version.replace("_", "."))

    doc["description"] = "PLACEHOLDER"

    doc["version"] = version

    doc["options"] = {}
    doc["options"]["group"] = iso3.lower() + "_gb_" + version
    doc["options"]["group_title"] = "{} - GeoBoundaries v{}".format(
        country, version.replace("_", "."))

    doc["extras"] = {}

    doc["extras"]["citation"] = (
        'Seitz, L., Lv, Z., Goodman, S., Runfola, D. '
        '"Chapter 3: GeoBoundaries - A Global, Redistributable Map of Administrative Zones." '
        'GeoQuery User\'s Guide. Ed. Dan Runfola Ariel BenYishay, Seth Goodman. '
        'Williamsburg, Va: AidData, 2018.')

    doc["extras"]["sources_web"] = "http://www.geoboundaries.org"
    doc["extras"]["sources_name"] = "AidData GeoBoundaries"

    doc["extras"]["country"] = country
    doc["extras"]["iso3"] = iso3
    doc["extras"]["adm"] = int(adm[-1:])

    doc["extras"]["tags"] = ["geoboundaries", adm, country, iso3]

    # boundary group
    if "adm0" in name.lower():
        doc["options"]["group_class"] = "actual"
        doc["active"] = -1
    else:
        doc["options"]["group_class"] = "sub"

    # -------------------------------------
    # resource scan

    # find all files with file_extension in path
    file_list = []
    for root, dirs, files in os.walk(doc["base"]):
        for fname in files:

            fname = os.path.join(root, fname)
            file_check = fname.endswith('.' + doc["file_extension"])

            if file_check == True and not fname.endswith('simplified.geojson'):
                file_list.append(fname)

    if len(file_list) == 0:
        quit("No vector file found in " + doc["base"])

    elif len(file_list) > 1:
        quit("Boundaries must be submitted individually.")

    f = file_list[0]
    print f

    doc["description"] = "GeoBoundaries boundary file for {} in {}.".format(
        adm.upper(), country)

    # -------------------------------------

    if update == "partial":
        print "\nProcessed document:"
        pprint(doc)

        print "\nUpdating database (dry run = {0})...".format(dry_run)
        if not dry_run:
            dbu.update(doc, update, existing_original)

        print "\n{0}: Done ({1} update).\n".format(script, update)
        return 0

    # -------------------------------------
    print "\nProcessing temporal..."

    # temporally invariant dataset
    doc["temporal"] = {}
    doc["temporal"]["name"] = "Temporally Invariant"
    doc["temporal"]["format"] = "None"
    doc["temporal"]["type"] = "None"
    doc["temporal"]["start"] = 10000101
    doc["temporal"]["end"] = 99991231

    # -------------------------------------
    print "\nProcessing spatial..."

    if not dry_run:
        convert_status = ru.add_asdf_id(f)
        if convert_status == 1:
            quit("Error adding ad_id to boundary file & outputting geojson.")

    env = ru.vector_envelope(f)
    env = ru.trim_envelope(env)
    print "Dataset bounding box: ", env

    doc["scale"] = ru.envelope_to_scale(env)

    # set spatial
    doc["spatial"] = ru.envelope_to_geom(env)

    # -------------------------------------
    print '\nProcessing resources...'

    # resources
    # individual resource info
    resource_tmp = {}

    # path relative to base
    resource_tmp["path"] = f[f.index(doc["base"]) + len(doc["base"]) + 1:]

    resource_tmp["name"] = doc["name"]
    resource_tmp["bytes"] = os.path.getsize(f)
    resource_tmp["start"] = 10000101
    resource_tmp["end"] = 99991231

    # reorder resource fields
    # resource_order = ["name", "path", "bytes", "start", "end"]
    # resource_tmp = OrderedDict((k, resource_tmp[k]) for k in resource_order)

    # update main list
    resource_list = [resource_tmp]

    doc["resources"] = resource_list

    # -------------------------------------
    # database updates

    print "\nProcessed document:"
    pprint(doc)

    print "\nUpdating database (dry run = {0})...".format(dry_run)
    if not dry_run:
        dbu.update(doc, update, existing_original)
        # try:
        #     dbu.features_to_mongo(doc['name'])
        # except:
        #     # could remove data entry if it cannot be added
        #     # to mongo. or, at least make sure the data entry is
        #     # set to inactive
        #     raise

    if update:
        print "\n{0}: Done ({1} update).\n".format(script, update)
    else:
        print "\n{0}: Done.\n".format(script)

    print '\n---------------------------------------\n'

    return 0
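
For reference, a minimal sketch of a manual, dry-run invocation of the boundary ingest above. The module path and `get_config` helper are hypothetical placeholders; the script expects the project's own client/config objects (it calls `client.info.config.findOne()` and reads `config.inactive_bnds`), so a bare pymongo client is not sufficient, and the path and version string are placeholders as well.

# hypothetical manual dry run of the boundary ingest defined above
from ingest_resources import run, get_config   # hypothetical module and helper

config = get_config()   # assumed to expose .client, .versions and .inactive_bnds

status = run(
    path="/data/geoboundaries/ken_adm1",   # placeholder "<iso3>_<adm>" directory
    version="3_0_0",                       # placeholder version string
    config=config,                         # client is taken from config.client
    generator="manual",
    update=False,
    dry_run=True)                          # print the processed doc, skip db writes

print status   # run() returns 0 on success
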
Example n. 7
0
def run(path=None,
        client=None,
        version=None,
        config=None,
        generator="auto",
        update=False,
        dry_run=False):

    print '\n---------------------------------------'

    script = os.path.basename(__file__)

    def quit(reason):
        """quit script cleanly

        to do:
        - do error log stuff
        - output error logs somewhere
        - if auto, move job file to error location
        """
        raise Exception("{0}: terminating script - {1}\n".format(
            script, reason))

    if config is not None:
        client = config.client
    elif client is not None:
        config = client.info.config.findOne()
    else:
        quit('Neither config nor client provided.')

    version = config.versions["raster-ingest"]

    # update mongo class instance
    dbu = MongoUpdate(client)

    # -------------------------------------

    # check path
    if path is not None:
        if not os.path.exists(path):
            quit("Invalid path provided.")
    else:
        quit("No path provided")

    # optional arg - mainly for user to specify manual run
    if generator not in ['auto', 'manual']:
        quit("Invalid generator input")

    if client is None:
        quit("No mongodb client connection provided")

    if config is None:
        quit("No config object provided")

    raw_update = update
    if update in ["meta", "data"]:
        update = update
    elif update in ["full", "all"]:
        update = "full"
    elif update in [False, "false", "False", None, "none", "no", 0, "0"]:
        update = False
    else:
        raise ValueError("Invalid `update` value provided ({})".format(update))

    print "running update status `{0}` (input: `{1}`)".format(
        update, raw_update)

    if dry_run in ["false", "False", "0", "None", "none", "no"]:
        dry_run = False

    dry_run = bool(dry_run)
    if dry_run:
        print "running dry run"

    # init document
    doc = {}

    doc["asdf"] = {}
    doc["asdf"]["script"] = script
    doc["asdf"]["version"] = version
    doc["asdf"]["generator"] = generator
    doc["asdf"]["date_updated"] = str(datetime.date.today())
    if not update:
        doc["asdf"]["date_added"] = str(datetime.date.today())

    # -------------------------------------

    # get inputs
    if os.path.isfile(path):
        data = json.load(open(path, 'r'))
    else:
        quit("invalid input file path")

    required_core_fields = [
        "base", "type", "file_extension", "file_mask", "name", "title",
        "description", "version", "active"
    ]

    missing_core_fields = [i for i in required_core_fields if i not in data]

    if len(missing_core_fields) > 0:
        quit("Missing core fields ({0})".format(missing_core_fields))

    existing_original = None
    if update:
        if not "data" in client.asdf.collection_names():
            update = False
            msg = "Update specified but no data collection exists."
            if generator == "manual":
                raise Exception(msg)
            else:
                warn(msg)
        else:
            base_original = client.asdf.data.find_one({'base': data["base"]})
            if base_original is None:
                update = False
                msg = "Update specified but no existing dataset found."
                if generator == "manual":
                    raise Exception(msg)
                else:
                    warn(msg)

    # -------------------------------------

    # validate class instance
    v = ValidationTools(client)

    # validate base path
    valid_base = v.base(data["base"], update)

    if not valid_base.isvalid:
        quit(valid_base.error)

    doc["base"] = valid_base.value
    base_exists = valid_base.data['exists']

    # validate name
    valid_name = v.name(data["name"], update)

    if not valid_name.isvalid:
        quit(valid_name.error)

    doc["name"] = valid_name.value
    name_exists = valid_name.data['exists']

    if update:
        if not base_exists and not name_exists:
            warn(("Update specified but no dataset with matching "
                  "base ({0}) or name ({1}) was found").format(
                      doc["base"], doc["name"]))

        elif base_exists and name_exists:

            base_id = str(valid_base.data['search']['_id'])
            name_id = str(valid_name.data['search']['_id'])

            if base_id != name_id:
                quit("Update option specified but identifying fields (base "
                     "and name) belong to different existing datasets."
                     "\n\tBase: {0}\n\tName: {1}".format(
                         doc["base"], doc["name"]))
            else:
                existing_original = valid_name.data['search']

        elif name_exists:
            existing_original = valid_name.data['search']

        elif base_exists:
            existing_original = valid_base.data['search']

        doc["asdf"]["date_added"] = existing_original["asdf"]["date_added"]
        # doc["active"] = existing_original["active"]

    # validate type and set file_format
    valid_type = v.data_type(data["type"])

    if not valid_type.isvalid:
        quit(valid_type.error)

    doc["type"] = valid_type.value
    doc["file_format"] = valid_type.data["file_format"]

    if doc["type"] != "raster":
        quit("Invalid type ({0}), must be raster.".format(doc["type"]))

    # validate file extension (validation depends on file format)
    valid_extension = v.file_extension(data["file_extension"],
                                       doc["file_format"])

    if not valid_extension.isvalid:
        quit(valid_extension.error)

    doc["file_extension"] = valid_extension.value

    # validate title, description and version
    doc["title"] = str(data["title"])
    doc["description"] = str(data["description"])

    doc["details"] = ""
    if "details" in data:
        doc["details"] = str(data["details"])

    doc["version"] = str(data["version"])

    doc["active"] = int(data["active"])

    # validate options for raster

    if not "options" in data:
        quit("Missing options lookup")

    required_options = [
        "resolution", "extract_types", "extract_types_info", "factor",
        "variable_description"
    ]

    missing_options = [i for i in required_options if i not in data["options"]]

    if len(missing_options) > 0:
        quit(
            "Missing fields from options lookup ({0})".format(missing_options))

    doc["options"] = {}

    # resolution (in decimal degrees)
    valid_resolution = v.factor(data["options"]["resolution"])

    if not valid_resolution.isvalid:
        quit(valid_resolution.error)

    doc["options"]["resolution"] = valid_resolution.value

    # multiplication factor (if needed, defaults to 1 if blank)
    valid_factor = v.factor(data["options"]["factor"])

    if not valid_factor.isvalid:
        quit(valid_factor.error)

    doc["options"]["factor"] = valid_factor.value

    # ***
    # if factor changes, any extracts adjust with
    # old factor need to be removed
    # ***

    # extract_types (multiple, separate your input with commas)
    valid_extract_types = v.extract_types(data["options"]["extract_types"])

    if not valid_extract_types.isvalid:
        quit(valid_extract_types.error)

    doc["options"]["extract_types"] = valid_extract_types.value

    valid_extract_types_info = v.extract_types(
        data["options"]["extract_types_info"])

    if not valid_extract_types_info.isvalid:
        quit(valid_extract_types_info.error)

    doc["options"]["extract_types_info"] = valid_extract_types_info.value

    for i in doc["options"]["extract_types"]:
        if i not in doc["options"]["extract_types_info"]:
            raise Exception(
                "Value from `extract_type` missing from `extract_types_info` ({0})"
                .format(i))

    # Description of the variable (units, range, etc.)
    doc["options"]["variable_description"] = str(
        data["options"]["variable_description"])

    if "pixel_check" in data["options"]:
        doc["options"]["pixel_check"] = data["options"]["pixel_check"]

    # extras
    if not "extras" in data:
        print(
            "Although fields in extras are not required, it may contain "
            "commonly used field which should be added whenever possible "
            "(example: sources_web field)")
        doc["extras"] = {}

    elif not isinstance(data["extras"], dict):
        quit("Invalid instance of extras ({0}) of type: {1}".format(
            data["extras"], type(data["extras"])))
    else:
        doc["extras"] = data["extras"]

    if not "tags" in doc["extras"]:
        doc["extras"]["tags"] = []

    if not "raster" in doc["extras"]["tags"]:
        doc["extras"]["tags"].append("raster")

    if "categorical" in doc["options"]["extract_types"] or "encoded" in doc[
            "options"]["extract_types"]:
        if not "category_map" in doc["extras"]:
            quit(
                "'categorical' or 'encoded' included as extract type but no 'category_map' dict provided in 'extras'."
            )
        elif not isinstance(doc["extras"]["category_map"], dict):
            quit(
                "The 'category_map' field must be provided as a dict. Invalid type ({0}) given."
                .format(type(doc["extras"]["category_map"])))
        else:
            # make sure category names and values are in proper key:val format
            # and types
            # {"field_name": pixel_value}

            # NOTE: rasterstats requires input cmap as {pixel_value: "field_name"}
            #       this gets switched in extract utility. This was done since using integers
            #       or floats as key values is not valid json and would break ingest jsons
            # (could put int/float as str maybe? then could keep as key)

            # pixel value may be int, float
            # field name may be str, int, float (but only using string for ingest rasters)
            cat_map = doc["extras"]["category_map"]
            invalid_cat_vals = [
                i for i in cat_map.values() if not isinstance(i, (int, float))
            ]
            invalid_cat_keys = [
                i for i in cat_map.keys() if not isinstance(i, basestring)
            ]

            # make sure keys are str
            if invalid_cat_keys:
                print "Invalid `category_map` keys: ({0})".format(
                    invalid_cat_keys)

            # make sure vals are int/float
            if invalid_cat_vals:
                print "Invalid `category_map` values: ({0})".format(
                    invalid_cat_vals)

            if invalid_cat_keys or invalid_cat_vals:
                raise Exception("Invalid `category_map` provided.")

            cat_map = dict(
                zip([
                    re.sub('[^0-9a-z]', '_', i.lower())
                    for i in cat_map.keys()
                ], cat_map.values()))
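            # note: this normalized cat_map is a local copy only; it is not
            # written back into doc["extras"]["category_map"] here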

    # -------------------------------------

    if update == "meta":
        print "\nProcessed document:"
        pprint(doc)

        print "\nUpdating database (dry run = {0})...".format(dry_run)
        if not dry_run:
            dbu.update(doc, update, existing_original)

        print "\n{0}: Done ({1} update).\n".format(script, update)
        return 0

    # -------------------------------------
    # resource scan

    if data["file_mask"] == "None" and os.path.isfile(doc["base"]):
        file_list = [doc["base"]]

    else:
        # find all files with file_extension in path
        file_list = []
        for root, dirs, files in os.walk(doc["base"]):
            for file in files:

                file = os.path.join(root, file)
                file_check = file.endswith('.' + doc["file_extension"])

                if file_check == True:
                    file_list.append(file)

            if data["file_mask"] == "None":
                break

    if data["file_mask"] == "None" and len(file_list) != 1:
        quit("Multiple files found when `file_mask = None`")

    # -------------------------------------
    # check file mask

    def validate_file_mask(vmask):
        """Validate a file mask"""

        # designates temporally invariant dataset
        if vmask == "None":
            return True, None

        # test file_mask for first file in file_list
        test_date_str = ru.run_file_mask(vmask, file_list[0], doc["base"])
        valid_date = ru.validate_date(test_date_str)

        if valid_date[0] == False:
            return False, valid_date[1]

        return True, None

    # file mask identifying temporal attributes in path/file names
    valid_file_mask = validate_file_mask(data["file_mask"])

    if valid_file_mask[0]:
        doc["file_mask"] = data["file_mask"]
    else:
        quit(valid_file_mask[1])

    # -------------------------------------
    print "\nProcessing temporal..."

    doc["temporal"] = {}

    if doc["file_mask"] == "None":

        # temporally invariant dataset
        doc["temporal"]["name"] = "Temporally Invariant"
        doc["temporal"]["format"] = "None"
        doc["temporal"]["type"] = "None"
        doc["temporal"]["start"] = 10000101
        doc["temporal"]["end"] = 99991231

    elif len(file_list) > 0:

        # name for temporal data format
        doc["temporal"]["name"] = "Date Range"
        doc["temporal"]["format"] = "%Y%m%d"
        doc["temporal"]["type"] = ru.get_date_range(
            ru.run_file_mask(doc["file_mask"], file_list[0], doc["base"]))[2]
        doc["temporal"]["start"] = None
        doc["temporal"]["end"] = None
        # day range for each file (eg: MODIS 8 day composites)
        # if "day_range" in v.data:
        # "day_range", "File day range? (Must be integer)", v.day_range

    else:
        quit("Warning: file mask given but no resources were found")
        # doc["temporal"]["name"] = "Unknown"
        # doc["temporal"]["format"] = "Unknown"
        # doc["temporal"]["type"] = "Unknown"
        # doc["temporal"]["start"] = "Unknown"
        # doc["temporal"]["end"] = "Unknown"

    # -------------------------------------
    print "\nProcessing spatial..."

    # iterate over files to get bbox and do basic spatial validation
    # (mainly make sure rasters are all same size)
    f_count = 0
    for f in file_list:

        # get basic geo info from each file
        env = ru.raster_envelope(f)
        # get full geo info from first file
        if f_count == 0:
            base_geo = env

            f_count += 1

        # exit if basic geo does not match
        if base_geo != env:
            print f
            print base_geo
            print env
            warn("Raster bounding box does not match")
            # quit("Raster bounding box does not match")

    env = ru.trim_envelope(env)
    print "Dataset bounding box: ", env

    doc["scale"] = ru.envelope_to_scale(env)

    if doc["scale"] == "global":
        print(
            "This dataset has a bounding box larger than a hemisphere "
            "and will be treated as a global dataset. If this is not a "
            "global (or near global) dataset you may want to turn it into "
            "multiple smaller datasets and ingest them individually.")

    # set spatial
    doc["spatial"] = ru.envelope_to_geom(env)

    # -------------------------------------
    print '\nProcessing resources...'

    resource_list = []

    for f in file_list:
        print f

        # resources
        # individual resource info
        resource_tmp = {}

        # path relative to base
        resource_tmp["path"] = f[f.index(doc["base"]) + len(doc["base"]) + 1:]

        # file size
        resource_tmp["bytes"] = os.path.getsize(f)

        if doc["file_mask"] != "None":
            # temporal
            # get unique time range based on dir path / file names

            # get data from mask
            date_str = ru.run_file_mask(doc["file_mask"], resource_tmp["path"])

            validate_date_str = ru.validate_date(date_str)

            if not validate_date_str[0]:
                quit(validate_date_str[1])

            if "day_range" in doc:
                range_start, range_end, range_type = ru.get_date_range(
                    date_str, doc["day_range"])
            else:
                range_start, range_end, range_type = ru.get_date_range(
                    date_str)

            # name (unique among this dataset's resources,
            # not same name as dataset name)
            resource_tmp["name"] = (doc["name"] + "_" + date_str["year"] +
                                    date_str["month"] + date_str["day"])

        else:
            range_start = 10000101
            range_end = 99991231

            resource_tmp["name"] = doc["name"] + "_none"

        # file date range
        resource_tmp["start"] = range_start
        resource_tmp["end"] = range_end

        # # reorder resource fields
        # resource_order = ["name", "path", "bytes", "start", "end"]
        # resource_tmp = OrderedDict((k, resource_tmp[k])
        #                            for k in resource_order)

        # update main list
        resource_list.append(resource_tmp)

        # update dataset temporal info
        if (doc["temporal"]["start"] is None
                or range_start < doc["temporal"]["start"]):
            doc["temporal"]["start"] = range_start
        elif (doc["temporal"]["end"] is None
              or range_end > doc["temporal"]["end"]):
            doc["temporal"]["end"] = range_end

    doc["resources"] = resource_list

    # -------------------------------------
    # database updates

    print "\nProcessed document..."
    pprint(doc)

    print "\nUpdating database (dry run = {0})...".format(dry_run)
    if not dry_run:
        dbu.update(doc, update, existing_original)

    if update:
        print "\n{0}: Done ({1} update).\n".format(script, update)
    else:
        print "\n{0}: Done.\n".format(script)

    print '\n---------------------------------------\n'

    return 0
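
For reference, a sketch of the kind of ingest job file the raster script above reads (it calls `json.load` on `path`). The field names come from `required_core_fields` and `required_options` above; every value, the output filename, and the `extract_types` entries are placeholders, and the dict shape of `extract_types_info` is an assumption inferred from the membership check the script performs.

# Sketch of a raster ingest job file, assembled from the fields the script
# above validates. Every value is a placeholder.
import json

ingest_job = {
    "base": "/data/rasters/example_product",  # directory (or single file) of rasters
    "type": "raster",
    "file_extension": "tif",
    "file_mask": "None",                       # "None" marks a temporally invariant dataset
    "name": "example_product_v1",
    "title": "Example Raster Product",
    "description": "Placeholder description.",
    "details": "",
    "version": "1.0",
    "active": 1,
    "options": {
        "resolution": 0.01,                    # decimal degrees
        "factor": 1,                           # multiplication factor (1 = none)
        "extract_types": ["mean", "max"],      # placeholders; accepted form depends on ValidationTools.extract_types
        "extract_types_info": {                # shape assumed from the membership check above
            "mean": "average pixel value per unit of analysis",
            "max": "maximum pixel value per unit of analysis"
        },
        "variable_description": "placeholder units / valid range"
    },
    "extras": {
        "sources_web": "http://example.org",
        "tags": []
    }
}

with open("example_product_ingest.json", "w") as dst:   # placeholder filename
    json.dump(ingest_job, dst, indent=4)
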
Example n. 8
0
def run(path=None, client=None, version=None, config=None,
        generator="auto", update=False, dry_run=False):

    print '\n---------------------------------------'

    script = os.path.basename(__file__)

    def quit(reason):
        """quit script cleanly

        to do:
        - do error log stuff
        - output error logs somewhere
        - if auto, move job file to error location
        """
        raise Exception("{0}: terminating script - {1}\n".format(
            script, reason))


    if config is not None:
        client = config.client
    elif client is not None:
        config = client.info.config.findOne()
    else:
        quit('Neither config nor client provided.')

    version = config.versions["raster-ingest"]

    # update mongo class instance
    dbu = MongoUpdate(client)

    # -------------------------------------


    # check path
    if path is not None:
        if not os.path.exists(path):
            quit("Invalid path provided.")
    else:
        quit("No path provided")


    # optional arg - mainly for user to specify manual run
    if generator not in ['auto', 'manual']:
        quit("Invalid generator input")


    if client is None:
        quit("No mongodb client connection provided")

    if config is None:
        quit("No config object provided")


    raw_update = update
    if update in ["partial", "meta"]:
        update = "partial"
    elif update in ["update", True, 1, "True", "full", "all"]:
        update = "full"
    else:
        update = False

    print "running update status `{0}` (input: `{1}`)".format(
        update, raw_update)

    if dry_run in ["false", "False", "0", "None", "none", "no"]:
        dry_run = False

    dry_run = bool(dry_run)
    if dry_run:
        print "running dry run"


    # init document
    doc = {}

    doc["asdf"] = {}
    doc["asdf"]["script"] = script
    doc["asdf"]["version"] = version
    doc["asdf"]["generator"] = generator
    doc["asdf"]["date_updated"] = str(datetime.date.today())
    if not update:
        doc["asdf"]["date_added"] = str(datetime.date.today())

    # -------------------------------------

    # get inputs
    if os.path.isfile(path):
        data = json.load(open(path, 'r'))
    else:
        quit("invalid input file path")


    required_core_fields = [
        "base", "type", "file_extension", "file_mask",
        "name", "title", "description", "version", "active"
    ]

    missing_core_fields = [i for i in required_core_fields
                           if i not in data]

    if len(missing_core_fields) > 0:
        quit("Missing core fields ({0})".format(missing_core_fields))


    existing_original = None
    if update:
        if not "data" in client.asdf.collection_names():
            update = False
            msg = "Update specified but no data collection exists."
            if generator == "manual":
                raise Exception(msg)
            else:
                warn(msg)
        else:
            base_original = client.asdf.data.find_one({'base': data["base"]})
            if base_original is None:
                update = False
                msg = "Update specified but no existing dataset found."
                if generator == "manual":
                    raise Exception(msg)
                else:
                    warn(msg)

    # -------------------------------------

    # validate class instance
    v = ValidationTools(client)


    # validate base path
    valid_base = v.base(data["base"], update)

    if not valid_base.isvalid:
        quit(valid_base.error)

    doc["base"] = valid_base.value
    base_exists = valid_base.data['exists']


    # validate name
    valid_name = v.name(data["name"], update)

    if not valid_name.isvalid:
        quit(valid_name.error)

    doc["name"] = valid_name.value
    name_exists = valid_name.data['exists']


    if update:
        if not base_exists and not name_exists:
            warn(("Update specified but no dataset with matching "
                  "base ({0}) or name ({1}) was found").format(doc["base"],
                                                               doc["name"]))

        elif base_exists and name_exists:

            base_id = str(valid_base.data['search']['_id'])
            name_id = str(valid_name.data['search']['_id'])

            if base_id != name_id:
                quit("Update option specified but identifying fields (base "
                     "and name) belong to different existing datasets."
                     "\n\tBase: {0}\n\tName: {1}".format(doc["base"],
                                                         doc["name"]))
            else:
                existing_original = valid_name.data['search']

        elif name_exists:
            existing_original = valid_name.data['search']

        elif base_exists:
            existing_original = valid_base.data['search']

        doc["asdf"]["date_added"] = existing_original["asdf"]["date_added"]
        # doc["active"] = existing_original["active"]


    # validate type and set file_format
    valid_type = v.data_type(data["type"])

    if not valid_type.isvalid:
        quit(valid_type.error)

    doc["type"] = valid_type.value
    doc["file_format"] = valid_type.data["file_format"]

    if doc["type"] != "raster":
        quit("Invalid type ({0}), must be raster.".format(doc["type"]))


    # validate file extension (validation depends on file format)
    valid_extension = v.file_extension(data["file_extension"],
                                       doc["file_format"])

    if not valid_extension.isvalid:
        quit(valid_extension.error)

    doc["file_extension"] = valid_extension.value


    # validate title, description and version
    doc["title"] = str(data["title"])
    doc["description"] = str(data["description"])

    doc["details"] = ""
    if "details" in data:
        doc["details"] = str(data["details"])

    doc["version"] = str(data["version"])

    doc["active"] = int(data["active"])


    # validate options for raster

    if not "options" in data:
        quit("Missing options lookup")


    required_options = ["resolution", "extract_types", "factor",
                       "variable_description"]

    missing_options = [i for i in required_options
                       if i not in data["options"]]

    if len(missing_options) > 0:
        quit("Missing fields from options lookup ({0})".format(
            missing_options))


    doc["options"] = {}

    # resolution (in decimal degrees)
    valid_resolution = v.factor(data["options"]["resolution"])

    if not valid_resolution.isvalid:
        quit(valid_resolution.error)

    doc["options"]["resolution"] = valid_resolution.value


    # multiplication factor (if needed, defaults to 1 if blank)
    valid_factor = v.factor(data["options"]["factor"])

    if not valid_factor.isvalid:
        quit(valid_factor.error)

    doc["options"]["factor"] = valid_factor.value

    # ***
    # if factor changes, any extracts adjust with
    # old factor need to be removed
    # ***

    # extract_types (multiple, separate your input with commas)
    valid_extract_types = v.extract_types(data["options"]["extract_types"])

    if not valid_extract_types.isvalid:
        quit(valid_extract_types.error)

    doc["options"]["extract_types"] = valid_extract_types.value


    valid_extract_types_info = v.extract_types(data["options"]["extract_types_info"])

    if not valid_extract_types_info.isvalid:
        quit(valid_extract_types_info.error)

    doc["options"]["extract_types_info"] = valid_extract_types_info.value


    for i in doc["options"]["extract_types"]:
        if i not in doc["options"]["extract_types_info"]:
            raise Exception("Value from `extract_types` missing from `extract_types_info` ({0})".format(i))


    # Description of the variable (units, range, etc.)
    doc["options"]["variable_description"] = str(
        data["options"]["variable_description"])


    # extras
    if not "extras" in data:
        print("Although fields in extras are not required, it may contain "
              "commonly used field which should be added whenever possible "
              "(example: sources_web field)")
        doc["extras"] = {}

    elif not isinstance(data["extras"], dict):
        quit("Invalid instance of extras ({0}) of type: {1}".format(
            data["extras"], type(data["extras"])))
    else:
        doc["extras"] = data["extras"]

    if not "tags" in doc["extras"]:
        doc["extras"]["tags"] = []

    if not "raster" in doc["extras"]["tags"]:
        doc["extras"]["tags"].append("raster")


    if "categorical" in doc["options"]["extract_types"] or "encoded" in doc["options"]["extract_types"]:
        if not "category_map" in doc["extras"]:
            quit("'categorical' or 'encoded' included as extract type but no 'category_map' dict provided in 'extras'.")
        elif not isinstance(doc["extras"]["category_map"], dict):
            quit("The 'category_map' field must be provided as a dict. Invalid type ({0}) given.".format(
                type(doc["extras"]["category_map"])))
        else:
            # make sure category names and values are in proper key:val format
            # and types
            # {"field_name": pixel_value}

            # NOTE: rasterstats requires input cmap as {pixel_value: "field_name"}
            #       this gets switched in extract utility. This was done since using integers
            #       or floats as key values is not valid json and would break ingest jsons
            # (could put int/float as str maybe? then could keep as key)

            # pixel value may be int, float
            # field name may be str, int, float (but only using string for ingest rasters)
            cat_map = doc["extras"]["category_map"]
            invalid_cat_vals = [i for i in cat_map.values()
                                if not isinstance(i, (int, float))]
            invalid_cat_keys = [i for i in cat_map.keys()
                                if not isinstance(i, basestring)]

            # make sure keys are str
            if invalid_cat_keys:
                print "Invalid `category_map` keys: ({0})".format(invalid_cat_keys)

            # make sure vals are int/float
            if invalid_cat_vals:
                print "Invalid `category_map` values: ({0})".format(invalid_cat_vals)

            if invalid_cat_keys or invalid_cat_vals:
                raise Exception("Invalid `category_map` provided.")

            cat_map = dict(zip(
                [re.sub('[^0-9a-z]', '_', i.lower()) for i in cat_map.keys()],
                cat_map.values()
            ))
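            # note: this normalized cat_map is a local copy only; it is not
            # written back into doc["extras"]["category_map"] here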


    # -------------------------------------

    if update == "partial":
        print "\nProcessed document:"
        pprint(doc)

        print "\nUpdating database (dry run = {0})...".format(dry_run)
        if not dry_run:
            dbu.update(doc, update, existing_original)

        print "\n{0}: Done ({1} update).\n".format(script, update)
        return 0



    # -------------------------------------
    # resource scan

    if data["file_mask"] == "None" and os.path.isfile(doc["base"]):
        file_list = [doc["base"]]

    else:
        # find all files with file_extension in path
        file_list = []
        for root, dirs, files in os.walk(doc["base"]):
            for file in files:

                file = os.path.join(root, file)
                file_check = file.endswith('.' + doc["file_extension"])

                if file_check == True:
                    file_list.append(file)

            if data["file_mask"] == "None":
                break


    if data["file_mask"] == "None" and len(file_list) != 1:
        quit("Multiple files found when `file_mask = None`")


    # -------------------------------------
    # check file mask

    def validate_file_mask(vmask):
        """Validate a file mask"""

        # designates temporally invariant dataset
        if vmask == "None":
            return True, None

        # test file_mask for first file in file_list
        test_date_str = ru.run_file_mask(vmask, file_list[0], doc["base"])
        valid_date = ru.validate_date(test_date_str)

        if valid_date[0] == False:
            return False, valid_date[1]

        return True, None


    # file mask identifying temporal attributes in path/file names
    valid_file_mask = validate_file_mask(data["file_mask"])

    if valid_file_mask[0]:
        doc["file_mask"] = data["file_mask"]
    else:
        quit(valid_file_mask[1])

    # -------------------------------------
    print "\nProcessing temporal..."

    doc["temporal"] = {}

    if doc["file_mask"] == "None":

        # temporally invariant dataset
        doc["temporal"]["name"] = "Temporally Invariant"
        doc["temporal"]["format"] = "None"
        doc["temporal"]["type"] = "None"
        doc["temporal"]["start"] = 10000101
        doc["temporal"]["end"] = 99991231

    elif len(file_list) > 0:

        # name for temporal data format
        doc["temporal"]["name"] = "Date Range"
        doc["temporal"]["format"] = "%Y%m%d"
        doc["temporal"]["type"] = ru.get_date_range(ru.run_file_mask(
            doc["file_mask"], file_list[0], doc["base"]))[2]
        doc["temporal"]["start"] = None
        doc["temporal"]["end"] = None
        # day range for each file (eg: MODIS 8 day composites)
        # if "day_range" in v.data:
            # "day_range", "File day range? (Must be integer)", v.day_range

    else:
        quit("Warning: file mask given but no resources were found")
        # doc["temporal"]["name"] = "Unknown"
        # doc["temporal"]["format"] = "Unknown"
        # doc["temporal"]["type"] = "Unknown"
        # doc["temporal"]["start"] = "Unknown"
        # doc["temporal"]["end"] = "Unknown"

    # -------------------------------------
    print "\nProcessing spatial..."

    # iterate over files to get bbox and do basic spatial validation
    # (mainly make sure rasters are all same size)
    f_count = 0
    for f in file_list:

        # get basic geo info from each file
        env = ru.raster_envelope(f)
        # get full geo info from first file
        if f_count == 0:
            base_geo = env

            f_count += 1

        # exit if basic geo does not match
        if base_geo != env:
            print f
            print base_geo
            print env
            warn("Raster bounding box does not match")
            # quit("Raster bounding box does not match")


    env = ru.trim_envelope(env)
    print "Dataset bounding box: ", env

    doc["scale"] = ru.envelope_to_scale(env)

    if doc["scale"] == "global":
        print ("This dataset has a bounding box larger than a hemisphere "
               "and will be treated as a global dataset. If this is not a "
               "global (or near global) dataset you may want to turn it into "
               "multiple smaller datasets and ingest them individually.")


    # set spatial
    doc["spatial"] = ru.envelope_to_geom(env)

    # -------------------------------------
    print '\nProcessing resources...'

    resource_list = []

    for f in file_list:
        print f

        # resources
        # individual resource info
        resource_tmp = {}

        # path relative to base
        resource_tmp["path"] = f[f.index(doc["base"]) + len(doc["base"]) + 1:]


        # file size
        resource_tmp["bytes"] = os.path.getsize(f)

        if doc["file_mask"] != "None":
            # temporal
            # get unique time range based on dir path / file names

            # get data from mask
            date_str = ru.run_file_mask(doc["file_mask"], resource_tmp["path"])

            validate_date_str = ru.validate_date(date_str)

            if not validate_date_str[0]:
                quit(validate_date_str[1])


            if "day_range" in doc:
                range_start, range_end, range_type = ru.get_date_range(
                    date_str, doc["day_range"])
            else:
                range_start, range_end, range_type = ru.get_date_range(
                    date_str)

            # name (unique among this dataset's resources,
            # not same name as dataset name)
            resource_tmp["name"] = (doc["name"] +"_"+
                                    date_str["year"] +
                                    date_str["month"] +
                                    date_str["day"])

        else:
            range_start = 10000101
            range_end = 99991231

            resource_tmp["name"] = doc["name"] + "_none"


        # file date range
        resource_tmp["start"] = range_start
        resource_tmp["end"] = range_end

        # # reorder resource fields
        # resource_order = ["name", "path", "bytes", "start", "end"]
        # resource_tmp = OrderedDict((k, resource_tmp[k])
        #                            for k in resource_order)

        # update main list
        resource_list.append(resource_tmp)


        # update dataset temporal info
        if (doc["temporal"]["start"] is None or
                range_start < doc["temporal"]["start"]):
            doc["temporal"]["start"] = range_start
        elif (doc["temporal"]["end"] is None or
                range_end > doc["temporal"]["end"]):
            doc["temporal"]["end"] = range_end


    doc["resources"] = resource_list

    # -------------------------------------
    # database updates

    print "\nProcessed document..."
    pprint(doc)

    print "\nUpdating database (dry run = {0})...".format(dry_run)
    if not dry_run:
        dbu.update(doc, update, existing_original)

    if update:
        print "\n{0}: Done ({1} update).\n".format(script, update)
    else:
        print "\n{0}: Done.\n".format(script)

    print '\n---------------------------------------\n'

    return 0
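
For reference, a standalone sketch of the `category_map` key normalization used in the two raster scripts above: field names are lowercased and any character outside `[0-9a-z]` is replaced with an underscore. The category names and pixel values below are placeholders.

# Standalone sketch of the category_map key normalization used above.
import re

category_map = {"Closed Forest": 1, "Open Water": 2, "Urban/Built-up": 3}

normalized = dict(zip(
    [re.sub('[^0-9a-z]', '_', k.lower()) for k in category_map.keys()],
    category_map.values()
))

print normalized
# {'closed_forest': 1, 'open_water': 2, 'urban_built_up': 3} (key order may vary)
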
Example n. 9
0
def run(path=None, client=None, version=None, config=None,
        generator="auto", update=False, dry_run=False):

    print '\n---------------------------------------'

    script = os.path.basename(__file__)

    def quit(reason):
        """quit script cleanly

        to do:
        - do error log stuff
        - output error logs somewhere
        - if auto, move job file to error location
        """
        raise Exception("{0}: terminating script - {1}\n".format(
            script, reason))


    if config is not None:
        client = config.client
    elif client is not None:
        config = client.info.config.findOne()
    else:
        quit('Neither config nor client provided.')


    # update mongo class instance
    dbu = MongoUpdate(client)

    # -------------------------------------


    # check path
    if path is not None:
        if not os.path.exists(path):
            quit("Invalid path provided.")
    else:
        quit("No path provided")

    # optional arg - mainly for user to specify manual run
    if generator not in ['auto', 'manual']:
        quit("Invalid generator input")

    if client is None:
        quit("No mongodb client connection provided")

    if config is None:
        quit("No config object provided")


    raw_update = update
    if update in ["partial", "meta"]:
        update = "partial"
    elif update in ["update", True, 1, "True", "full", "all"]:
        update = "full"
    elif update in ["missing"]:
        update = "missing"
    else:
        update = False

    print "running update status `{0}` (input: `{1}`)".format(
        update, raw_update)

    if dry_run in ["false", "False", "0", "None", "none", "no"]:
        dry_run = False

    dry_run = bool(dry_run)
    if dry_run:
        print "running dry run"

    base_original = client.asdf.data.find_one({'base': path})

    existing_original = None
    if update:
        if not "data" in client.asdf.collection_names():
            update = False
            msg = "Update specified but no data collection exists."
            if generator == "manual":
                raise Exception(msg)
            else:
                warn(msg)
        else:
            if base_original is None and update != "missing":
                update = False
                msg = "Update specified but no existing dataset found."
                if generator == "manual":
                    raise Exception(msg)
                else:
                    warn(msg)


    # init document
    doc = {}

    doc["asdf"] = {}
    doc["asdf"]["script"] = script
    doc["asdf"]["version"] = version
    doc["asdf"]["generator"] = generator
    doc["asdf"]["date_updated"] = str(datetime.date.today())
    if not update or update == "missing":
        doc["asdf"]["date_added"] = str(datetime.date.today())

    # -------------------------------------

    if os.path.isdir(path):
        # remove trailing slash from path
        if path.endswith("/"):
            path = path[:-1]
    else:
        quit("Invalid base directory provided.")

    # -------------------------------------

    doc['base'] = path

    doc["type"] = "boundary"
    doc["file_format"] = "vector"
    doc["file_extension"] = "geojson"
    doc["file_mask"] = "None"


    # -------------------------------------

    name = os.path.basename(doc["base"])

    iso3 = name[:3]
    adm = name[4:]

    metadata_path = os.path.join(path, 'metadata.json')
    metadata = json.load(open(metadata_path, 'r'))
    country = unidecode(metadata["country"])


    doc["name"] = iso3.lower() + "_" + adm.lower() + "_gb_" + version


    inactive_bnds_list = config.inactive_bnds
    is_active = doc["name"] not in inactive_bnds_list

    doc["active"] = int(is_active)


    name_original = client.asdf.data.find_one({'name': doc["name"]})

    if not update and base_original is not None:
        msg = "No update specified but dataset exists (base: {0})".format(base_original['base'])
        raise Exception(msg)
    elif not update and name_original is not None:
        msg = "No update specified but dataset exists (name: {0})".format(name_original['name'])
        raise Exception(msg)


    if update:

        if update == "missing" and name_original is not None and base_original is not None:
            warn("Dataset exists (running in 'missing' update mode). Running partial update and setting to active (if possible).")
            update = "partial"

        if update != "missing":
            if name_original is None and base_original is None:
                update = False
                warn(("Update specified but no dataset with matching "
                      "base ({0}) or name ({1}) was found").format(doc["base"],
                                                                   doc["name"]))

                # no match found for base or name, so set date_added as for a new dataset
                doc["asdf"]["date_added"] = str(datetime.date.today())

            elif name_original is not None and base_original is not None:

                if str(name_original['_id']) != str(base_original['_id']):
                    quit("Update option specified but identifying fields (base "
                         "and name) belong to different existing datasets."
                         "\n\tBase: {0}\n\tName: {1}".format(doc["base"],
                                                             doc["name"]))
                else:
                    existing_original = name_original

            elif name_original is not None:
                existing_original = name_original

            elif base_original is not None:
                existing_original = base_original


            doc["asdf"]["date_added"] = existing_original["asdf"]["date_added"]

            if existing_original["active"] == -1:
                doc["active"] = -1


    doc["title"] = "{} {} - GeoBoundaries v{}".format(country, adm.upper(), version.replace("_", "."))

    doc["description"] = "PLACEHOLDER"

    doc["version"] = version


    doc["options"] = {}
    doc["options"]["group"] = iso3.lower() + "_gb_" + version
    doc["options"]["group_title"] = "{} - GeoBoundaries v{}".format(country, version.replace("_", "."))


    doc["extras"] = {}

    doc["extras"]["citation"] = ('Seitz, L., Lv, Z., Goodman, S., Runfola, D. '
                                 '"Chapter 3: GeoBoundaries - A Global, Redistributable Map of Administrative Zones." '
                                 'GeoQuery User\'s Guide. Ed. Dan Runfola Ariel BenYishay, Seth Goodman. '
                                 'Williamsburg, Va: AidData, 2018.')


    doc["extras"]["sources_web"] = "http://www.geoboundaries.org"
    doc["extras"]["sources_name"] = "AidData GeoBoundaries"

    doc["extras"]["country"] = country
    doc["extras"]["iso3"] = iso3
    doc["extras"]["adm"] = int(adm[-1:])

    doc["extras"]["tags"] = ["geoboundaries", adm, country, iso3]


    # boundary group
    if "adm0" in name.lower():
        doc["options"]["group_class"] = "actual"
        doc["active"] = -1
    else:
        doc["options"]["group_class"] = "sub"

    # -------------------------------------
    # resource scan

    # find all files with file_extension in path
    file_list = []
    for root, dirs, files in os.walk(doc["base"]):
        for fname in files:

            fname = os.path.join(root, fname)
            file_check = fname.endswith('.' + doc["file_extension"])

            if file_check == True and not fname.endswith('simplified.geojson'):
                file_list.append(fname)


    if len(file_list) == 0:
        quit("No vector file found in " + doc["base"])

    elif len(file_list) > 1:
        quit("Boundaries must be submitted individually.")


    f = file_list[0]
    print f


    doc["description"] = "GeoBoundaries boundary file for {} in {}.".format(
        adm.upper(), country)


    # -------------------------------------

    if update == "partial":
        print "\nProcessed document:"
        pprint(doc)

        print "\nUpdating database (dry run = {0})...".format(dry_run)
        if not dry_run:
            dbu.update(doc, update, existing_original)

        print "\n{0}: Done ({1} update).\n".format(script, update)
        return 0

    # -------------------------------------
    print "\nProcessing temporal..."

    # temporally invariant dataset
    doc["temporal"] = {}
    doc["temporal"]["name"] = "Temporally Invariant"
    doc["temporal"]["format"] = "None"
    doc["temporal"]["type"] = "None"
    doc["temporal"]["start"] = 10000101
    doc["temporal"]["end"] = 99991231

    # -------------------------------------
    print "\nProcessing spatial..."

    if not dry_run:
        convert_status = ru.add_asdf_id(f)
        if convert_status == 1:
             quit("Error adding ad_id to boundary file & outputting geojson.")


    env = ru.vector_envelope(f)
    env = ru.trim_envelope(env)
    print "Dataset bounding box: ", env

    doc["scale"] = ru.envelope_to_scale(env)

    # set spatial
    doc["spatial"] = ru.envelope_to_geom(env)

    # -------------------------------------
    print '\nProcessing resources...'

    # resources
    # individual resource info
    resource_tmp = {}

    # path relative to base
    resource_tmp["path"] = f[f.index(doc["base"]) + len(doc["base"]) + 1:]

    resource_tmp["name"] = doc["name"]
    resource_tmp["bytes"] = os.path.getsize(f)
    resource_tmp["start"] = 10000101
    resource_tmp["end"] = 99991231

    # reorder resource fields
    # resource_order = ["name", "path", "bytes", "start", "end"]
    # resource_tmp = OrderedDict((k, resource_tmp[k]) for k in resource_order)

    # update main list
    resource_list = [resource_tmp]

    doc["resources"] = resource_list

    # -------------------------------------
    # database updates

    print "\nProcessed document:"
    pprint(doc)

    print "\nUpdating database (dry run = {0})...".format(dry_run)
    if not dry_run:
        dbu.update(doc, update, existing_original)
        # try:
        #     dbu.features_to_mongo(doc['name'])
        # except:
        #     # could remove data entry if it cannot be added
        #     # to mongo. or, at least make sure the data entry is
        #     # set to inactive
        #     raise

    if update:
        print "\n{0}: Done ({1} update).\n".format(script, update)
    else:
        print "\n{0}: Done.\n".format(script)

    print '\n---------------------------------------\n'

    return 0
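
For reference, a standalone sketch of how the boundary scripts above derive the dataset name from the base directory. The directory name is assumed to follow an "<iso3>_<adm>" pattern, and the path and version string are placeholders.

# Sketch of the dataset-name derivation used by the boundary scripts above.
import os

base = "/data/geoboundaries/ken_adm1"   # placeholder "<iso3>_<adm>" directory
version = "3_0_0"                       # placeholder version string

name = os.path.basename(base)           # "ken_adm1"
iso3 = name[:3]                         # "ken"
adm = name[4:]                          # "adm1"

dataset_name = iso3.lower() + "_" + adm.lower() + "_gb_" + version
print dataset_name                      # -> ken_adm1_gb_3_0_0
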
Example n. 10
0
def run(path=None,
        client=None,
        version=None,
        config=None,
        generator="auto",
        update=False,
        dry_run=False):

    print '\n---------------------------------------'

    script = os.path.basename(__file__)

    def quit(reason):
        """quit script cleanly

        to do:
        - do error log stuff
        - output error logs somewhere
        - if auto, move job file to error location
        """
        raise Exception("{0}: terminating script - {1}\n".format(
            script, reason))

    if config is not None:
        client = config.client
    elif client is not None:
        config = client.info.config.findOne()
    else:
        quit('Neither config nor client provided.')

    version = config.versions["gadm-ingest"]

    # update mongo class instance
    dbu = MongoUpdate(client)

    # -------------------------------------

    # check path
    if path is not None:
        if not os.path.exists(path):
            quit("Invalid path provided.")
    else:
        quit("No path provided")

    # optional arg - mainly for user to specify manual run
    if generator not in ['auto', 'manual']:
        quit("Invalid generator input")

    if client is None:
        quit("No mongodb client connection provided")

    if config is None:
        quit("No config object provided")

    raw_update = update
    if update in ["partial", "meta"]:
        update = "partial"
    elif update in ["update", True, 1, "True", "full", "all"]:
        update = "full"
    elif update in ["missing"]:
        update = "missing"
    else:
        update = False

    print "running update status `{0}` (input: `{1}`)".format(
        update, raw_update)

    if dry_run in ["false", "False", "0", "None", "none", "no"]:
        dry_run = False

    dry_run = bool(dry_run)
    if dry_run:
        print "running dry run"

    base_original = client.asdf.data.find_one({'base': path})

    existing_original = None
    if update:
        if not "data" in client.asdf.collection_names():
            update = False
            msg = "Update specified but no data collection exists."
            if generator == "manual":
                raise Exception(msg)
            else:
                warn(msg)
        else:
            if base_original is None and update != "missing":
                update = False
                msg = "Update specified but no existing dataset found."
                if generator == "manual":
                    raise Exception(msg)
                else:
                    warn(msg)

    # init document
    doc = {}

    doc["asdf"] = {}
    doc["asdf"]["script"] = script
    doc["asdf"]["version"] = version
    doc["asdf"]["generator"] = generator
    doc["asdf"]["date_updated"] = str(datetime.date.today())
    if not update or update == "missing":
        doc["asdf"]["date_added"] = str(datetime.date.today())

    # -------------------------------------

    if os.path.isdir(path):
        # remove trailing slash from path
        if path.endswith("/"):
            path = path[:-1]
    else:
        quit("Invalid base directory provided.")

    # -------------------------------------

    doc['base'] = path

    doc["type"] = "boundary"
    doc["file_format"] = "vector"
    doc["file_extension"] = "geojson"
    doc["file_mask"] = "None"

    # -------------------------------------

    gadm_name = os.path.basename(doc["base"])
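    # gadm_name is assumed to follow an "<iso3>_<adm>" pattern, and the parent
    # directory a "gadm<version>" pattern (e.g. "gadm2.8"), which the slices
    # below rely on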

    gadm_version = os.path.basename(os.path.dirname(path))[4:]

    gadm_iso3 = gadm_name[:3]
    gadm_adm = gadm_name[4:]

    parent = os.path.dirname(os.path.abspath(__file__))
    gadm_lookup_path = parent + '/gadm_iso3.json'
    gadm_lookup = json.load(open(gadm_lookup_path, 'r'))

    gadm_country = unidecode(gadm_lookup[gadm_iso3])

    doc["name"] = (gadm_iso3.lower() + "_" + gadm_adm.lower() + "_gadm" +
                   gadm_version.replace('.', ''))

    inactive_bnds_list = config.inactive_bnds
    is_active = doc["name"] not in inactive_bnds_list

    doc["active"] = int(is_active)

    name_original = client.asdf.data.find_one({'name': doc["name"]})

    if not update and base_original is not None:
        msg = "No update specified but dataset exists (base: {0})".format(
            base_original['base'])
        raise Exception(msg)
    elif not update and name_original is not None:
        msg = "No update specified but dataset exists (name: {0})".format(
            name_original['name'])
        raise Exception(msg)

    if update:

        if update == "missing" and name_original is not None and base_original is not None:
            warn(
                "Dataset exists (running in 'missing' update mode). Running partial update and setting to active (if possible)."
            )
            update = "partial"

        if update != "missing":
            if name_original is None and base_original is None:
                update = False
                warn(("Update specified but no dataset with matching "
                      "base ({0}) or name ({1}) was found").format(
                          doc["base"], doc["name"]))

                # no match found for base or name, so set date_added as for a new dataset
                doc["asdf"]["date_added"] = str(datetime.date.today())

            elif name_original is not None and base_original is not None:

                if str(name_original['_id']) != str(base_original['_id']):
                    quit(
                        "Update option specified but identifying fields (base "
                        "and name) belong to different existing datasets."
                        "\n\tBase: {0}\n\tName: {1}".format(
                            doc["base"], doc["name"]))
                else:
                    existing_original = name_original

            elif name_original is not None:
                existing_original = name_original

            elif base_original is not None:
                existing_original = base_original

            doc["asdf"]["date_added"] = existing_original["asdf"]["date_added"]

            if existing_original["active"] == -1:
                doc["active"] = -1

    doc["title"] = " ".join(
        [gadm_country,
         gadm_adm.upper(), "Boundary - GADM", gadm_version])

    doc["description"] = "PLACEHOLDER"

    doc["version"] = gadm_version

    doc["options"] = {}
    doc["options"]["group"] = (gadm_iso3.lower() + "_gadm" +
                               gadm_version.replace('.', ''))

    doc["extras"] = {}

    doc["extras"]["citation"] = ("Global Administrative Areas "
                                 "(GADM) http://www.gadm.org.")
    doc["extras"]["sources_web"] = "http://www.gadm.org"
    doc["extras"]["sources_name"] = "Global Administrative Areas (GADM)"

    doc["extras"]["gadm_country"] = gadm_country
    doc["extras"]["gadm_iso3"] = gadm_iso3
    doc["extras"]["gadm_adm"] = int(gadm_adm[-1:])
    doc["extras"]["gadm_unit"] = "PLACEHOLDER"
    doc["extras"]["tags"] = ["gadm", gadm_adm, gadm_country]

    doc["options"]["group_title"] = "{0} GADM {1}".format(
        gadm_country, gadm_version)

    # boundary group
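    # adm0 (national) boundaries define the group's "actual" boundary and
    # start out inactive; lower admin levels are grouped as "sub" boundaries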
    if "adm0" in gadm_name.lower():
        doc["options"]["group_class"] = "actual"
        doc["active"] = -1
    else:
        doc["options"]["group_class"] = "sub"

    # -------------------------------------
    # resource scan

    # find all files with file_extension in path
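    # (any 'simplified.geojson' outputs are excluded)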
    file_list = []
    for root, dirs, files in os.walk(doc["base"]):
        for fname in files:

            fname = os.path.join(root, fname)
            file_check = fname.endswith('.' + doc["file_extension"])

            if file_check and not fname.endswith('simplified.geojson'):
                file_list.append(fname)

    if len(file_list) == 0:
        quit("No vector file found in " + doc["base"])

    elif len(file_list) > 1:
        quit("Boundaries must be submitted individually.")

    f = file_list[0]
    print f

    # get adm unit name for country and add to gadm info and description
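    # the GADM attribute table carries an ENGTYPE_<level> field holding the
    # English name of the admin unit type (e.g. "Province"); adm0 is treated
    # as "Country" without reading the file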
    if gadm_adm.lower() == "adm0":
        gadm_unit = "Country"
    else:
        with fiona.open(f, 'r') as tmp_feature_src:
            tmp_feature = tmp_feature_src[0]
            gadm_unit = tmp_feature['properties']['ENGTYPE_' + gadm_adm[-1:]]

    doc["extras"]["gadm_unit"] = gadm_unit
    if gadm_unit not in [None, "Unknown"]:
        doc["extras"]["tags"].append(gadm_unit)
    doc["description"] = "GADM Boundary File for {0} ({1}) in {2}.".format(
        gadm_adm.upper(), gadm_unit, gadm_country)

    # -------------------------------------

    if update == "partial":
        print "\nProcessed document:"
        pprint(doc)

        print "\nUpdating database (dry run = {0})...".format(dry_run)
        if not dry_run:
            dbu.update(doc, update, existing_original)

        print "\n{0}: Done ({1} update).\n".format(script, update)
        return 0

    # -------------------------------------
    print "\nProcessing temporal..."

    # temporally invariant dataset
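    # (start/end are sentinel YYYYMMDD values spanning all representable dates)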
    doc["temporal"] = {}
    doc["temporal"]["name"] = "Temporally Invariant"
    doc["temporal"]["format"] = "None"
    doc["temporal"]["type"] = "None"
    doc["temporal"]["start"] = 10000101
    doc["temporal"]["end"] = 99991231

    # -------------------------------------
    print "\nProcessing spatial..."

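    # add unique asdf ids to each boundary feature and output the processed
    # geojson (skipped on dry runs)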
    if not dry_run:
        convert_status = ru.add_asdf_id(f)
        if convert_status == 1:
            quit("Error adding ad_id to boundary file & outputting geojson.")

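    # compute the dataset's bounding envelope and use it to set both the
    # scale classification and the spatial extent geometry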
    env = ru.vector_envelope(f)
    env = ru.trim_envelope(env)
    print "Dataset bounding box: ", env

    doc["scale"] = ru.envelope_to_scale(env)

    # set spatial
    doc["spatial"] = ru.envelope_to_geom(env)

    # -------------------------------------
    print '\nProcessing resources...'

    # resources
    # individual resource info
    resource_tmp = {}

    # path relative to base
    resource_tmp["path"] = f[f.index(doc["base"]) + len(doc["base"]) + 1:]

    resource_tmp["name"] = doc["name"]
    resource_tmp["bytes"] = os.path.getsize(f)
    resource_tmp["start"] = 10000101
    resource_tmp["end"] = 99991231

    # reorder resource fields
    # resource_order = ["name", "path", "bytes", "start", "end"]
    # resource_tmp = OrderedDict((k, resource_tmp[k]) for k in resource_order)

    # update main list
    resource_list = [resource_tmp]

    doc["resources"] = resource_list

    # -------------------------------------
    # database updates

    print "\nProcessed document:"
    pprint(doc)

    print "\nUpdating database (dry run = {0})...".format(dry_run)
    if not dry_run:
        dbu.update(doc, update, existing_original)
        try:
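            # load the individual boundary features into mongo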
            dbu.features_to_mongo(doc['name'])
        except Exception:
            # to do: if the features cannot be added to mongo, remove the
            # data entry (or at least make sure it is set to inactive)
            # before re-raising
            raise

    if update:
        print "\n{0}: Done ({1} update).\n".format(script, update)
    else:
        print "\n{0}: Done.\n".format(script)

    print '\n---------------------------------------\n'

    return 0