コード例 #1
0
ファイル: __init__.py プロジェクト: CSTARS/ckanext-ecosis
def prepareFile(package_id, resource_id, sheet_id=None, options=None):
    """Prepare a single resource file (or one sheet of it) for import.

    Looks up the sheet's workspace document, falls back to CKAN's Postgres
    store for the resource name/path when the workspace has none, then either
    expands a zip archive or hands the file to the importer.

    Args:
        package_id: CKAN package (dataset) id.
        resource_id: CKAN resource id.
        sheet_id: optional sheet id within the resource (e.g. an Excel tab).
        options: optional dict of parse options forwarded to the importer.
    """
    # None sentinel instead of a mutable `{}` default: a dict default is
    # created once and shared across calls (and this dict is passed onward,
    # so a callee mutation would leak into later calls).
    if options is None:
        options = {}

    sheetInfo = collections.get("resource").find_one({
        "resourceId" : resource_id,
        "sheetId" : sheet_id
    })

    if sheetInfo is None:
        sheetInfo = {}

    # get the name of the resource from the workspace doc if present
    if 'name' in sheetInfo:
        resource = sheetInfo
    else: # fallback on querying PG for the name
        resource = ckanResourceQuery.get(resource_id)

    # see if we have the path, otherwise look it up
    if 'file' in sheetInfo:
        filepath = sheetInfo.get('file')
    else:
        filepath = resourceUtil.get_path(resource_id)

    ext = _getFileExtension(resource.get('name'))

    # much like in the prepare() method's resource loop: zips are expanded
    # into their member files, everything else is parsed directly
    if ext == "zip":
        extractZip(package_id, resource.get('id'), filepath, resource.get('name'), options=options)
    else:
        importer.processFile(filepath, package_id, resource_id, sheetId=sheet_id, options=options, resource=resource)
コード例 #2
0
ファイル: __init__.py プロジェクト: CSTARS/ckanext-ecosis
def addEcosisNamespace(spectra, package, main, sheetInfo, processInfo=None):
    """Attach the 'ecosis' metadata namespace to a spectra record.

    Builds a dict of package/resource/sheet provenance (ids, filename,
    layout, dataset links) and stores it under ``spectra['ecosis']``.

    Args:
        spectra: the spectra dict to annotate (mutated in place).
        package: CKAN package dict (title, organization, ...).
        main: the spectra's workspace document (resourceId, sheetId, index).
        sheetInfo: the sheet's workspace document.
        processInfo: optional dict describing the last parse of the sheet.
    """
    name = sheetInfo.get('name')

    # fall back to postgres if we don't have a name (but not for files
    # extracted from a zip — those never exist as CKAN resources)
    if name is None and sheetInfo.get('fromZip') != True:
        resource = ckanResourceQuery.get(sheetInfo.get('resourceId'))
        name = resource.get('name')

    # append sheet and package information
    ecosis = {
        'package_id': sheetInfo.get("packageId"),
        'package_title': package.get('title'),
        'resource_id' : main.get('resourceId'),
        'filename': name,
        'sheet_id': main.get('sheetId'),
        'layout' : sheetInfo.get('layout'),
        'index' : main.get('index'),
        'dataset_link' : '%s#result/%s' % (host, sheetInfo.get('packageId')),
        'dataset_api_link' : '%spackage/get?id=%s' % (host, sheetInfo.get('packageId')),
    }

    # append zip package information if from zip file
    if 'zip' in sheetInfo:
        ecosis['zip_package'] = {
            "id" : sheetInfo.get('zip').get('resourceId'),
            "name" : sheetInfo.get('zip').get('name')
        }

    # append the latest processing information (when the sheet was last parsed)
    if processInfo is not None:
        ecosis['processInfo'] = processInfo

    # append the organization title when the package has one
    # (identity check: `is not None` is the idiomatic None test)
    if package.get('organization') is not None:
        ecosis['organization'] = package['organization']['title']

    spectra['ecosis'] = ecosis
コード例 #3
0
ファイル: __init__.py プロジェクト: CSTARS/ckanext-ecosis
def getMetadataChunk(packageId, resourceId=None, sheetId=None, index=0):
    """Return one metadata chunk plus information about the spectra it joins to.

    Args:
        packageId: CKAN package id (required).
        resourceId: optional resource id to narrow the query.
        sheetId: optional sheet id to narrow the query.
        index: which metadata chunk to return (0-based, ordered by 'index').

    Returns:
        dict with 'metadata' (the chunk's spectra dict), 'joinedResources'
        (sheets containing spectra joined via the sheet's joinOn key), and
        'joinKey' (the joinOn attribute name, or None).

    Raises:
        Exception: if no metadata chunk matches the given ids.
    """
    query = {
        "type" : "metadata",
        "packageId" : packageId
    }

    # add additional query parameters
    if resourceId is not None:
        query['resourceId'] = resourceId
    if sheetId is not None:
        query['sheetId'] = sheetId

    # grab metadata chunk at given index
    chunk = collections.get('spectra').find_one(query, skip=index, sort=[("index", pymongo.ASCENDING)])
    if chunk is None:
        raise Exception('Invalid resource ids given')

    # grab the sheet information
    del query['type']
    sheetInfo = collections.get('resource').find_one(query)

    # BUGFIX: guard the None case before dereferencing — the original called
    # sheetInfo.get("joinOn") before its `sheetInfo is not None` check and
    # would raise AttributeError when no sheet document exists.
    joinOn = sheetInfo.get("joinOn") if sheetInfo is not None else None

    # now look up information about what spectra we are joining to
    joinedNames = []

    if joinOn is not None and joinOn != "" and chunk.get('spectra') is not None:
        # now make join query
        joinQuery = {
            "type" : "data",
            "packageId" : packageId
        }

        # we are going to find all spectra that have the 'joinOn' attribute
        # equal to this metadata chunk's value.
        joinQuery['spectra.%s' % joinOn] = chunk.get('spectra')[joinOn]

        # run query
        joined = collections.get('spectra').find(joinQuery)

        # for all results, append sheet information to the 'joinedNames' resources array.
        for r in joined:
            # TODO: is there a better way to get the actual 'name' of a resource?
            joinedInfo = collections.get('resource').find_one(
                {
                    'resourceId': r.get('resourceId'),
                    'sheetId': r.get('sheetId')
                },
                {"layout": 1,"name": 1})

            if joinedInfo is None: # Badness
                joinedName = {}
                joinedInfo = {}
            elif 'name' in joinedInfo:
                joinedName = joinedInfo
            else: # if no name is provided in workspace, fallback to postgres
                try:
                    joinedName = ckanResourceQuery.get(r.get('resourceId'))
                except Exception:
                    # best-effort name lookup; narrow from a bare `except:`
                    # so KeyboardInterrupt/SystemExit still propagate
                    joinedName = {}

            # add information about which spectra this chunk joins to
            if joinedName is not None:
                joinedNames.append({
                    "resourceId" : r.get('resourceId'),
                    "sheetId" : r.get('sheetId'),
                    "name" : joinedName.get('name'),
                    "layout" : joinedInfo.get('layout'),
                    "index" : r.get("index")
                })

    # set photo
    setPhoto(packageId, chunk.get('spectra'))

    # return metadata and join information
    return {
        "metadata" : chunk.get('spectra'),
        "joinedResources" : joinedNames,
        "joinKey" : joinOn
    }
コード例 #4
0
ファイル: process.py プロジェクト: CSTARS/ckanext-ecosis
def processFile(file="", packageId="", resourceId="", sheetId=None, options=None, resource=None):
    """Parse a resource file and (re)load its spectra into the workspace.

    Skips work when the file's md5 is unchanged and no new options were
    given; honors per-sheet ignore flags and rejects unsupported file types.

    Args:
        file: path to the file on disk.
        packageId: CKAN package id.
        resourceId: CKAN resource id.
        sheetId: optional sheet id (Excel tab) within the resource.
        options: optional dict of parse options merged into the sheet config.
        resource: optional pre-fetched resource dict (avoids a PG query).

    Returns:
        A response dict (or list of dicts for Excel, one per sheet) with
        parse results, or an 'ignored' response when nothing was done.
    """
    # None sentinel instead of a shared mutable `{}` default
    if options is None:
        options = {}

    # get config for sheet if one exists
    sheetConfig = collections.get('resource').find_one({
        "resourceId" : resourceId,
        "packageId" : packageId,
        "sheetId" : sheetId
    })

    # BUGFIX: normalize before any membership test — the original checked
    # `'name' in sheetConfig` before the None guard, raising TypeError when
    # no workspace doc exists and no resource was passed in.
    if sheetConfig is None:
        sheetConfig = {}

    # if we weren't handed a resource, use the sheet config's name or
    # query CKAN PG for details
    if resource is None:
        if 'name' in sheetConfig:
            resource = sheetConfig
        else:
            resource = ckanResourceQuery.get(resourceId)

    # grab the file extension
    ext = utils.getFileExtension(resource.get('name'))

    # check for ignore conditions
    ignore = False

    # update with passed options; keyCount tells us whether anything new
    # was supplied (len() instead of a hand-rolled counter)
    keyCount = len(options)
    for key in options:
        sheetConfig[key] = options[key]

    # now get md5 of current file (renamed so we don't shadow builtin hash())
    fileHash = hashfile(file)

    # if the hash is unchanged compared to the value stored in the workspace
    # collection and no new options were passed, no changes have been made to
    # this file and we do not need to reparse.  The response is flagged 'ignored'.
    if sheetConfig.get('hash') == fileHash and keyCount == 0:
        # no changes to file or config, just exit
        return {
            "ignored" : True,
            # BUGFIX: message said "new config given"; this branch fires when
            # NO new config was given
            "message" : "nothing to do. hash is equal and no new config given",
            "resourceId" : resourceId,
            "fromZip" : sheetConfig.get("fromZip"),
            "name" : resource.get("name")
        }

    # make sure defaults are set
    sheetConfig['file'] = file
    sheetConfig['packageId'] = packageId
    sheetConfig['resourceId'] = resourceId
    sheetConfig['sheetId'] = sheetId

    # clear spectra collection for this sheet
    removeQuery = {
        "resourceId" : resourceId,
        "packageId" : packageId
    }
    if sheetId is not None:
        removeQuery["sheetId"] = sheetId
    collections.get('spectra').remove(removeQuery)

    # has this sheet been marked by user to ignore?
    if sheetConfig.get('ignore') == True:
        ignore = True
    # is this sheet not of a valid extension?
    # TODO: make global array
    elif ext not in ("csv", "tsv", "spectra", "xlsx", "xls"):
        ignore = True
        sheetConfig['ignore'] = True
        sheetConfig['invalidFileType'] = True
        if 'layout' in sheetConfig:
            del sheetConfig['layout']

    # if we should ignore sheet for reasons above, just save and return
    if ignore == True:
        collections.get('resource').update({
            "resourceId" : sheetConfig.get('resourceId'),
            "packageId" : sheetConfig.get('packageId'),
            "sheetId" : sheetConfig.get('sheetId')
        }, sheetConfig, upsert=True)

        return {
            "ignored" : True,
            "message" : "ignore flag set or invalid file type",
            "resourceId" : resourceId,
            "fromZip" : resource.get("fromZip"),
            "name" : resource.get("name")
        }

    # set our last processed timestamp
    sheetConfig['processed'] = datetime.datetime.utcnow()

    response = None
    # parse as comma separated
    if ext == "csv" or ext == "spectra":
        sheetConfig['hash'] = fileHash
        response = _processCsv(sheetConfig)
        response['name'] = resource.get('name')

    # parse as tab separated
    elif ext == "tsv":
        sheetConfig['hash'] = fileHash
        response = _processTsv(sheetConfig)
        response['name'] = resource.get('name')

    # parse as excel
    elif ext == "xlsx" or ext == "xls":
        # an excel file is going to actually expand to several files
        # so pass the files array so the placeholder can be removed
        # and the new 'sheet' files can be inserted
        sheets = excel.process(collections.get("resource"), sheetConfig, fileHash)
        response = []
        for sheet in sheets:
            t = _processSheetArray(sheet.get('data'), sheet.get('config'))
            t['sheetId'] = sheet.get('config').get('sheetId')
            t['name'] = resource.get('name')
            response.append(t)
    # badness
    else:
        response = {
            "message" : "not parsed, invalid file type"
        }

    return response