Ejemplo n.º 1
0
def prepareFile(package_id, resource_id, sheet_id=None, options=None):
    """Prepare (import) a single resource file, or one sheet of it.

    Looks up the stored workspace record for the resource/sheet pair and uses
    its ``name`` and ``file`` values when present.  Otherwise it falls back to
    ``ckanResourceQuery.get`` for the resource and ``resourceUtil.get_path``
    for the on-disk path.  Zip resources are handed to ``extractZip``; every
    other file goes to ``importer.processFile``.

    Args:
        package_id: Id of the package that owns the resource.
        resource_id: Id of the resource to prepare.
        sheet_id: Optional sheet id; used in the workspace lookup and
            forwarded to ``importer.processFile``.
        options: Optional dict forwarded to the zip extractor or the importer.
            A fresh empty dict is used when omitted.

    Returns:
        None.
    """
    # A literal ``{}`` default would be a single dict shared by every call.
    if options is None:
        options = {}

    sheetInfo = collections.get("resource").find_one({
        "resourceId" : resource_id,
        "sheetId" : sheet_id
    })

    if sheetInfo is None:
        sheetInfo = {}

    # get the name of the resource
    if 'name' in sheetInfo:
        resource = sheetInfo
    else: # fallback on querying PG for the name
        resource = ckanResourceQuery.get(resource_id)

    # see if we have the path, otherwise look it up
    if 'file' in sheetInfo:
        filepath = sheetInfo.get('file')
    else:
        filepath = resourceUtil.get_path(resource_id)

    ext = _getFileExtension(resource.get('name'))

    # mirrors the zip / non-zip split used by prepare()
    if ext == "zip":
        # Pass the caller's resource_id rather than resource.get('id'): when
        # `resource` is the workspace record it is keyed by 'resourceId', so
        # the 'id' lookup would hand None to extractZip.
        extractZip(package_id, resource_id, filepath, resource.get('name'), options=options)
    else:
        importer.processFile(filepath, package_id, resource_id, sheetId=sheet_id, options=options, resource=resource)
Ejemplo n.º 2
0
def prepare(package_id, force=False):
    """Prepare every active resource of a package for the workspace.

    Skips the work when the stored package record is already flagged as
    prepared, unless ``force`` is set.  Otherwise each active resource is
    either unzipped via ``extractZip`` or imported via
    ``importer.processFile``, and the package record is upserted with the
    collected results.

    Args:
        package_id: Id of the package to prepare.
        force: When true, prepare even if the record says it is prepared.

    Returns:
        A small success/message dict when nothing was done, otherwise the
        updated package record (including ``runInfo``, ``lastTouched`` and
        ``prepared``).

    Raises:
        Exception: If the CKAN package state is ``'deleted'``.
    """
    packageInfo = collections.get("package").find_one(
        {"packageId" : package_id},
        {"_id" : 0},
    )

    # start a fresh workspace record when none is stored yet
    if packageInfo is None:
        packageInfo = {"packageId" : package_id}

    # nothing to do unless the caller forces a re-run
    already_prepared = packageInfo.get("prepared") == True
    if not force and already_prepared:
        return {
            "success" : True,
            "message" : "already prepared, use force flag to force prepare"
        }

    # refuse to prepare a package that has been deleted
    if ckanPackageQuery.get(package_id).get('state') == 'deleted':
        raise Exception('Package has been deleted')

    run_info = []
    for resource in ckanResourceQuery.active(package_id):
        res_id = resource.get('id')
        res_name = resource.get('name')

        # path on disk for the file, then branch on its extension
        path_on_disk = resourceUtil.get_path(res_id)

        if _getFileExtension(res_name) == "zip":
            # TODO: we should be checking a zip hash before we go unzipping every time
            run_info.extend(extractZip(package_id, res_id, path_on_disk, res_name))
        else:
            # 'normal' (non-zip) file
            run_info.append(
                importer.processFile(path_on_disk, package_id, res_id, resource=resource)
            )

    # record what we did (or did not) do
    packageInfo.update({
        "runInfo" : run_info,
        "lastTouched" : datetime.utcnow(),
        "prepared" : True,
    })

    collections.get("package").update({"packageId":package_id}, packageInfo, upsert=True)
    return packageInfo
Ejemplo n.º 3
0
def extractZip(package_id, resource_id, zipPath, zipName, options=None):
    """Extract the data files of a zip resource and import each of them.

    The archive's hash (``importer.hashfile``) is compared with the one stored
    in the workspace ``resource`` collection.  When they match nothing is
    extracted.  Otherwise the stored record is refreshed, any previous
    extraction directory is removed, and every entry accepted by
    ``_isDataFile`` (dot files excluded) is extracted, registered as its own
    workspace resource and passed to ``importer.processFile``.  Workspace
    ``resource`` and ``spectra`` documents that belong to this zip but were not
    seen in this pass are removed afterwards.

    Args:
        package_id: Id of the package that owns the zip resource.
        resource_id: Id of the zip resource itself.
        zipPath: Path of the zip archive on disk.
        zipName: Display name of the zip resource.
        options: Optional dict forwarded to ``importer.processFile``.  A fresh
            empty dict is used when omitted.

    Returns:
        A list whose first entry describes the zip (``unzipped`` True/False),
        followed by one ``importer.processFile`` result per extracted file.
    """
    # A literal ``{}`` default would be a single dict shared by every call.
    if options is None:
        options = {}

    status = []

    # check to see if there are any changes
    zipFileInfo = collections.get("resource").find_one({
        "packageId" : package_id,
        "resourceId" : resource_id
    })
    if zipFileInfo is None:
        zipFileInfo = {}
    zipHash = importer.hashfile(zipPath)

    # if hashes are equal, nothing has changed
    if zipFileInfo.get("hash") == zipHash:
        status.append({
            "resourceId" : resource_id,
            "name" : zipName,
            "unzipped" : False,
            "message" : "nothing todo, hash is equal"
        })
        return status

    # Send info back about what was processed
    # NOTE(review): the hash is stored before extraction runs, so a failure
    # further down presumably leaves the archive marked as unchanged on the
    # next call -- confirm whether that is intended.
    zipFileInfo['hash'] = zipHash
    zipFileInfo['resourceId'] = resource_id
    zipFileInfo['packageId'] = package_id
    zipFileInfo['file'] = zipPath
    zipFileInfo['isZip'] = True

    # update resource collection
    collections.get("resource").update({
        "packageId" : package_id,
        "resourceId" : resource_id
    }, zipFileInfo, upsert=True)

    status.append({
        "resourceId" : resource_id,
        "name" : zipName,
        "unzipped" : True
    })

    # get the workspace path on disk
    workspacePath = os.path.join(workspaceDir, package_id, resource_id)

    # clean out any existing extraction
    if os.path.exists(workspacePath):
        shutil.rmtree(workspacePath)

    zipPackageIds = []

    # the context manager closes the archive even if an import fails midway
    with zipfile.ZipFile(zipPath, "r") as z:
        for info in z.infolist():
            if not _isDataFile(info.filename):
                # TODO: implement .ecosis file
                continue

            # strip any directory part to get the bare file name
            name = re.sub(r".*/", "", info.filename)

            if re.match(r"^\..*", name): # ignore .dot files
                continue

            # create id for individual file
            fileResourceId = _getZipResourceId(resource_id, info.filename)

            # extract individual file
            z.extract(info, workspacePath)

            # check for existing config
            resource = collections.get("resource").find_one({
                "packageId" : package_id,
                "resourceId" : fileResourceId
            })

            # create new config if one doesn't exist
            if resource is None:
                resource = {
                    "packageId" : package_id,
                    "resourceId" : fileResourceId,
                    "name" : name,
                    "file" : os.path.join(workspacePath, info.filename),
                    "zip" : {
                        "name" : zipName,
                        "resourceId" : resource_id
                    },
                    "fromZip" : True
                }

                collections.get("resource").update({
                    "packageId" : package_id,
                    "resourceId" : fileResourceId
                }, resource, upsert=True)

            zipPackageIds.append(fileResourceId)

            # now we pass with new resource id, but path to file
            result = importer.processFile(resource.get('file'), package_id, fileResourceId, resource=resource, options=options)
            status.append(result)

    # cleanup: drop workspace resources of this zip that are no longer in it
    collections.get("resource").remove({
        "packageId" : package_id,
        "zip.resourceId" : resource_id,
        "resourceId" : {
            "$nin" : zipPackageIds
        }
    })

    # more cleanup: same filter applied to the spectra collection
    collections.get("spectra").remove({
        "packageId" : package_id,
        "zip.resourceId" : resource_id,
        "resourceId" : {
            "$nin" : zipPackageIds
        }
    })

    return status