def prepareFile(package_id, resource_id, sheet_id=None, options=None):
    """Prepare a single resource file for import.

    Looks up cached sheet info for (resource_id, sheet_id) in the workspace
    'resource' collection, falling back to CKAN/Postgres for the resource
    name and to resourceUtil for the file path, then dispatches to either
    the zip extractor or the regular file importer.

    Args:
        package_id: CKAN package (dataset) id.
        resource_id: CKAN resource id.
        sheet_id: optional sheet id for multi-sheet (Excel) resources.
        options: optional dict of processing options passed through to the
            importer. Defaults to an empty dict.
    """
    # BUG FIX: the default was a shared mutable dict (options={}); use the
    # None-sentinel idiom so callers never share state across calls.
    if options is None:
        options = {}

    sheetInfo = collections.get("resource").find_one({
        "resourceId": resource_id,
        "sheetId": sheet_id
    })
    if sheetInfo is None:
        sheetInfo = {}

    # get the name of the resource; fall back on querying PG for the name
    if 'name' in sheetInfo:
        resource = sheetInfo
    else:
        resource = ckanResourceQuery.get(resource_id)

    # see if we have the path, otherwise look it up
    if 'file' in sheetInfo:
        filepath = sheetInfo.get('file')
    else:
        filepath = resourceUtil.get_path(resource_id)

    ext = _getFileExtension(resource.get('name'))

    # much like in the prepare() method's resource loop: zips are expanded,
    # everything else goes straight to the importer
    if ext == "zip":
        extractZip(package_id, resource.get('id'), filepath, resource.get('name'), options=options)
    else:
        importer.processFile(filepath, package_id, resource_id, sheetId=sheet_id, options=options, resource=resource)
def addEcosisNamespace(spectra, package, main, sheetInfo, processInfo=None):
    """Attach the 'ecosis' metadata namespace to a spectra record in place.

    Builds a dict of package/resource/sheet provenance (ids, filename,
    layout, dataset links) and stores it under spectra['ecosis'].

    Args:
        spectra: mutable dict the 'ecosis' key is written into.
        package: CKAN package dict (provides 'title' and 'organization').
        main: dict with 'resourceId', 'sheetId' and 'index' for this chunk.
        sheetInfo: workspace sheet-config dict for the source sheet.
        processInfo: optional dict describing when the sheet was last parsed.
    """
    name = sheetInfo.get('name')
    # fall back to postgres if we don't have a name
    # (kept as `!= True` on purpose: mirrors the stored boolean flag)
    if name is None and sheetInfo.get('fromZip') != True:
        resource = ckanResourceQuery.get(sheetInfo.get('resourceId'))
        name = resource.get('name')

    # hoist the repeated lookup used three times below
    packageId = sheetInfo.get("packageId")

    # append sheet and package information
    ecosis = {
        'package_id': packageId,
        'package_title': package.get('title'),
        'resource_id': main.get('resourceId'),
        'filename': name,
        'sheet_id': main.get('sheetId'),
        'layout': sheetInfo.get('layout'),
        'index': main.get('index'),
        'dataset_link': '%s#result/%s' % (host, packageId),
        'dataset_api_link': '%spackage/get?id=%s' % (host, packageId),
    }

    # append zip package information if from zip file
    if 'zip' in sheetInfo:
        zipInfo = sheetInfo.get('zip')
        ecosis['zip_package'] = {
            "id": zipInfo.get('resourceId'),
            "name": zipInfo.get('name')
        }

    # append the latest processing information (when the sheet was last parsed)
    if processInfo is not None:
        ecosis['processInfo'] = processInfo

    # append the organization information
    # BUG FIX: idiomatic identity check instead of `!= None`
    organization = package.get('organization')
    if organization is not None:
        ecosis['organization'] = organization['title']

    spectra['ecosis'] = ecosis
def getMetadataChunk(packageId, resourceId=None, sheetId=None, index=0):
    """Return the metadata chunk at *index* plus its join information.

    Finds the metadata spectra chunk (ordered by 'index') for the given
    package/resource/sheet, then — if the sheet defines a 'joinOn' key —
    looks up all data spectra joined to this chunk and summarizes them.

    Args:
        packageId: CKAN package id.
        resourceId: optional CKAN resource id to narrow the query.
        sheetId: optional sheet id to narrow the query.
        index: zero-based chunk index (used as a Mongo skip).

    Returns:
        dict with 'metadata' (the chunk's spectra), 'joinedResources'
        (list of joined-sheet summaries) and 'joinKey'.

    Raises:
        Exception: if no metadata chunk matches the given ids.
    """
    query = {
        "type": "metadata",
        "packageId": packageId
    }
    # add additional query parameters
    if resourceId is not None:
        query['resourceId'] = resourceId
    if sheetId is not None:
        query['sheetId'] = sheetId

    # grab metadata chunk at given index
    chunk = collections.get('spectra').find_one(query, skip=index, sort=[("index", pymongo.ASCENDING)])
    if chunk is None:
        raise Exception('Invalid resource ids given')

    # grab the sheet information
    del query['type']
    sheetInfo = collections.get('resource').find_one(query)

    # now look up information about what spectra we are joining to
    joinedNames = []
    # BUG FIX: the original dereferenced sheetInfo.get("joinOn") *before*
    # checking `sheetInfo is not None`, raising AttributeError whenever no
    # sheet info exists (also in the return statement below).
    joinOn = sheetInfo.get("joinOn") if sheetInfo is not None else None

    if joinOn is not None and joinOn != "" and chunk.get('spectra') is not None:
        # now make join query
        joinQuery = {
            "type": "data",
            "packageId": packageId
        }
        # we are going to find all spectra that have the 'joinOn' attribute
        # equal to this metadata chunk's value.
        joinQuery['spectra.%s' % joinOn] = chunk.get('spectra')[joinOn]

        # run query
        joined = collections.get('spectra').find(joinQuery)

        # for all results, append sheet information to the 'joinedNames' array.
        for r in joined:
            # TODO: is there a better way to get the actual 'name' of a resource?
            joinedInfo = collections.get('resource').find_one(
                {'resourceId': r.get('resourceId'), 'sheetId': r.get('sheetId')},
                {"layout": 1, "name": 1}
            )

            if joinedInfo is None:
                # Badness: no workspace record for the joined sheet
                joinedName = {}
                joinedInfo = {}
            elif 'name' in joinedInfo:
                joinedName = joinedInfo
            else:
                # if no name is provided in workspace, fall back to postgres;
                # BUG FIX: narrowed the bare `except:` (it also swallowed
                # KeyboardInterrupt/SystemExit)
                try:
                    joinedName = ckanResourceQuery.get(r.get('resourceId'))
                except Exception:
                    joinedName = {}

            # add information about which spectra this chunk joins to
            if joinedName is not None:
                joinedNames.append({
                    "resourceId": r.get('resourceId'),
                    "sheetId": r.get('sheetId'),
                    "name": joinedName.get('name'),
                    "layout": joinedInfo.get('layout'),
                    "index": r.get("index")
                })

    # set photo
    setPhoto(packageId, chunk.get('spectra'))

    # return metadata and join information
    return {
        "metadata": chunk.get('spectra'),
        "joinedResources": joinedNames,
        "joinKey": joinOn
    }
def processFile(file="", packageId="", resourceId="", sheetId=None, options=None, resource=None):
    """Parse a resource file and (re)insert its spectra into the workspace.

    Skips work when the file hash is unchanged and no new options were
    passed; honors user/automatic ignore flags; otherwise dispatches on
    file extension to the csv/tsv/excel parsers.

    Args:
        file: path to the file on disk.
        packageId: CKAN package id.
        resourceId: CKAN resource id.
        sheetId: optional sheet id (Excel sheets).
        options: optional dict of config overrides merged into the sheet
            config. Defaults to an empty dict.
        resource: optional resource dict (with at least 'name'); looked up
            from the workspace or CKAN PG when omitted.

    Returns:
        A response dict (or list of dicts for Excel files) describing the
        parse result, or an 'ignored' response when nothing was done.
    """
    # BUG FIX: mutable default argument (options={}) replaced with the
    # None-sentinel idiom.
    if options is None:
        options = {}

    # get config for sheet if one exists
    sheetConfig = collections.get('resource').find_one({
        "resourceId": resourceId,
        "packageId": packageId,
        "sheetId": sheetId
    })
    # BUG FIX: normalize to a dict *before* it is inspected; the original
    # evaluated `'name' in sheetConfig` while sheetConfig could still be
    # None, raising TypeError for any un-cached resource.
    if sheetConfig is None:
        sheetConfig = {}

    # if we don't have a resource, prefer the cached name, else query CKAN PG
    if resource is None:
        if 'name' in sheetConfig:
            resource = sheetConfig
        else:
            resource = ckanResourceQuery.get(resourceId)

    # grab the file extension
    ext = utils.getFileExtension(resource.get('name'))

    # check for ignore conditions
    ignore = False

    # update with passed options (keyCount distinguishes "no new config")
    keyCount = len(options)
    for key, value in options.items():
        sheetConfig[key] = value

    # md5 of the current file contents (renamed from `hash`, which
    # shadowed the builtin)
    fileHash = hashfile(file)

    # if the hash is UNCHANGED compared to the value stored in the workspace
    # collection and no new options were given, no changes have been made to
    # this file and we do not need to reparse. The response is flagged 'ignored'.
    if sheetConfig.get('hash') == fileHash and keyCount == 0:
        # no changes to file or config, just exit
        return {
            "ignored": True,
            "message": "nothing todo. hash is equal and new config given",
            "resourceId": resourceId,
            "fromZip": sheetConfig.get("fromZip"),
            "name": resource.get("name")
        }

    # make sure defaults are set
    sheetConfig['file'] = file
    sheetConfig['packageId'] = packageId
    sheetConfig['resourceId'] = resourceId
    sheetConfig['sheetId'] = sheetId

    # clear spectra collection for this sheet
    removeQuery = {
        "resourceId": resourceId,
        "packageId": packageId
    }
    if sheetId is not None:
        removeQuery["sheetId"] = sheetId
    collections.get('spectra').remove(removeQuery)

    # has this sheet been marked by user to ignore?
    if sheetConfig.get('ignore') == True:
        ignore = True
    # is this sheet not of a valid extension?
    elif ext not in ("csv", "tsv", "spectra", "xlsx", "xls"):
        ignore = True
        sheetConfig['ignore'] = True
        sheetConfig['invalidFileType'] = True
        if 'layout' in sheetConfig:
            del sheetConfig['layout']

    # if we should ignore sheet for reasons above, just save and return
    if ignore:
        collections.get('resource').update({
            "resourceId": sheetConfig.get('resourceId'),
            "packageId": sheetConfig.get('packageId'),
            "sheetId": sheetConfig.get('sheetId')
        }, sheetConfig, upsert=True)
        return {
            "ignored": True,
            "message": "ignore flag set or invalid file type",
            "resourceId": resourceId,
            "fromZip": resource.get("fromZip"),
            "name": resource.get("name")
        }

    # set our last processed timestamp
    sheetConfig['processed'] = datetime.datetime.utcnow()

    response = None

    # parse as comma separated
    if ext == "csv" or ext == "spectra":
        sheetConfig['hash'] = fileHash
        response = _processCsv(sheetConfig)
        response['name'] = resource.get('name')
    # parse as tab separated
    elif ext == "tsv":
        sheetConfig['hash'] = fileHash
        response = _processTsv(sheetConfig)
        response['name'] = resource.get('name')
    # parse as excel
    elif ext == "xlsx" or ext == "xls":
        # an excel file expands to several files, so pass the resource
        # collection so the placeholder can be removed and the new 'sheet'
        # files can be inserted
        sheets = excel.process(collections.get("resource"), sheetConfig, fileHash)
        response = []
        for sheet in sheets:
            t = _processSheetArray(sheet.get('data'), sheet.get('config'))
            t['sheetId'] = sheet.get('config').get('sheetId')
            t['name'] = resource.get('name')
            response.append(t)
    # badness
    else:
        response = {"message": "not parsed, invalid file type"}

    return response