def update_bag(self):
        """
        Bring the metadata files and the bag up to date, synchronously.

        The Django signal pre_check_bag_flag is sent first so that receivers can
        prepare collections. The AVUs 'metadata_dirty' and 'bag_modified' are then
        both read, and for each one that is truthy the matching regeneration step
        is run and the AVU is reset to False.

        The call does not return until every required step has finished.
        """
        from hs_core.tasks import create_bag_by_irods
        from hs_core.hydroshare.resource import check_resource_type
        from hs_core.hydroshare.hs_bagit import create_bag_files

        # notify listeners before any flag is inspected
        sender_cls = check_resource_type(self.resource_type)
        pre_check_bag_flag.send(sender=sender_cls, resource=self)

        # both flags are sampled up front, before anything gets regenerated
        pending = (
            ('metadata_dirty', self.getAVU('metadata_dirty'),
             lambda: create_bag_files(self)),
            # the ticket system does synchronous bag creation;
            # async bag creation isn't supported.
            ('bag_modified', self.getAVU('bag_modified'),
             lambda: create_bag_by_irods(self.short_id)),
        )

        for avu_name, is_set, regenerate in pending:
            if is_set:  # the AVU value is used for its truthiness
                regenerate()
                self.setAVU(avu_name, False)
Exemple #2
0
    def update_bag(self):
        """
        Regenerate the metadata files and/or the bag when their AVUs require it.

        Sends the Django signal pre_check_bag_flag (so collections can be prepared),
        reads the AVUs 'metadata_dirty' and 'bag_modified', and then runs the
        regeneration that belongs to each flag that is set, clearing the flag
        afterwards.

        This runs synchronously: the call returns only once the update is done.
        """
        from hs_core.tasks import create_bag_by_irods
        from hs_core.hydroshare.resource import check_resource_type
        from hs_core.hydroshare.hs_bagit import create_bag_files

        def _refresh(avu_name, flag_value, rebuild):
            # run one regeneration step and clear its AVU, but only when the flag is set
            if not flag_value:
                return
            rebuild()
            self.setAVU(avu_name, False)

        # send signal for pre_check_bag_flag
        signal_sender = check_resource_type(self.resource_type)
        pre_check_bag_flag.send(sender=signal_sender, resource=self)

        # read both AVUs before doing any work
        dirty_flag = self.getAVU('metadata_dirty')
        modified_flag = self.getAVU('bag_modified')

        _refresh('metadata_dirty', dirty_flag, lambda: create_bag_files(self))
        # the ticket system does synchronous bag creation.
        # async bag creation isn't supported.
        _refresh('bag_modified', modified_flag,
                 lambda: create_bag_by_irods(self.short_id))
 def test_create_bag_by_irods(self):
     """Check that create_bag_by_irods() runs on the test resource without raising.

     Any exception raised by the call is turned into a test failure whose
     message includes the text of that exception.
     """
     try:
         # this is the API call we are testing
         create_bag_by_irods(self.test_res.short_id)
     except Exception as ex:
         # Format the exception object itself: Python 3 exceptions have no
         # `message` attribute, so reading it here would raise AttributeError
         # and hide the original error.
         self.fail("create_bag_by_irods() raised exception.{}".format(ex))
 def handle(self, *args, **options):
     """Create bags for published resources.

     When options['resource_ids'] is non-empty, each listed short id is looked
     up and its bag is created only if the resource is published; an
     unpublished resource is reported on stdout and skipped. When the list is
     empty, a bag is created for every published BaseResource.
     """
     requested_ids = options['resource_ids']  # an array of resource short_id to check.

     if len(requested_ids) == 0:
         # no explicit ids given: process every published resource
         published_resources = BaseResource.objects.filter(raccess__published=True)
         for published in published_resources:
             create_bag_by_irods(published.short_id)
         return

     for short_id in requested_ids:
         if get_resource_by_shortkey(short_id).raccess.published:
             create_bag_by_irods(short_id)
             continue
         print("Resource {} is not published, hence ignored.".format(short_id))
Exemple #5
0
def replicate_resource_bag_to_user_zone(user, res_id):
    """
    Copy a resource's bag into the requesting user's iRODS user zone.

    The bag is (re)created on demand first, unless the 'bag_modified' AVU reads
    "false" and the bag file is already present in storage.

    Args:
        user: the requesting user
        res_id: the resource id with its bag to be replicated to iRODS user zone

    Returns:
        None, but exceptions will be raised if there is an issue with iRODS operation

    Raises:
        ValidationError: if the resource collection does not exist in iRODS
        SessionException: if the bag has to be created and creation reports failure
    """
    res = utils.get_resource_by_shortkey(res_id)
    res_coll = res.root_path
    istorage = res.get_irods_storage()

    # The collection has to be checked before any AVU is read from it, to accommodate
    # the case where the very same resource gets deleted by another request while it
    # is getting downloaded.
    if not istorage.exists(res_coll):
        raise ValidationError("Resource {} does not exist in iRODS".format(res.short_id))

    # On-demand bag creation: the bag is rebuilt by default and is reused only when
    # the AVU explicitly reads "false" AND the bag file really exists.
    needs_bag = True
    avu_value = istorage.getAVU(res_coll, 'bag_modified')
    if avu_value and avu_value.lower() == "false":
        zip_name = res_id + '.zip'
        fed_path = res.resource_federation_path
        if fed_path:
            zip_path = os.path.join(fed_path, 'bags', zip_name)
        else:
            zip_path = os.path.join('bags', zip_name)
        needs_bag = not istorage.exists(zip_path)

    if needs_bag:
        # import here to avoid circular import issue
        from hs_core.tasks import create_bag_by_irods
        if not create_bag_by_irods(res_id):
            # bag fails to be created successfully
            raise SessionException(-1, '', 'The resource bag fails to be created '
                                           'before bag replication')

    # replicate the resource bag to the iRODS user zone
    if not res.resource_federation_path:
        istorage.set_fed_zone_session()
    bag_source = res.bag_path
    bag_target = '/{userzone}/home/{username}/{resid}.zip'.format(
        userzone=settings.HS_USER_IRODS_ZONE, username=user.username, resid=res_id)
    # the quota is validated against the bag size before anything is copied
    utils.validate_user_quota(user, istorage.size(bag_source))
    istorage.copyFiles(bag_source, bag_target)
    update_quota_usage(user=user)
Exemple #6
0
    def test_bag_creation_and_deletion(self):
        """Exercise bag creation (unpublished, then published) and bag deletion."""
        # bag creation for the resource as it is set up
        self.assertTrue(create_bag_by_irods(self.test_res.short_id))

        # test checksum will be computed for published resource
        self.test_res.raccess.published = True
        self.test_res.raccess.save()
        self.assertTrue(create_bag_by_irods(self.test_res.short_id))
        refreshed = get_resource_by_shortkey(self.test_res.short_id)
        self.assertNotEqual(refreshed.bag_checksum, '',
                            msg='bag_checksum property is empty')

        # unpublish again, then remove the files and the bag
        self.test_res.raccess.published = False
        self.test_res.raccess.save()
        hs_bagit.delete_files_and_bag(self.test_res)

        # resource should not have any bags
        storage = self.test_res.get_irods_storage()
        self.assertFalse(storage.exists(self.test_res.bag_path))
Exemple #7
0
def replicate_resource_bag_to_user_zone(user, res_id):
    """
    Replicate resource bag to iRODS user zone

    The bag is regenerated first when the 'bag_modified' AVU on the resource
    collection reads "true" (compared case-insensitively) or is not set at all.

    Args:
        user: the requesting user
        res_id: the resource id with its bag to be replicated to iRODS user zone

    Returns:
    None, but exceptions will be raised if there is an issue with iRODS operation

    Raises:
        ValidationError: if the resource collection does not exist in iRODS
    """
    # do on-demand bag creation
    res = get_resource_by_shortkey(res_id)
    res_coll = res.root_path
    istorage = res.get_irods_storage()
    # needs to check whether res_id collection exists before getting/setting AVU on it to
    # accommodate the case where the very same resource gets deleted by another request when
    # it is getting downloaded
    # TODO: why would we want to do anything at all if the resource does not exist???
    if istorage.exists(res_coll):
        bag_modified = istorage.getAVU(res_coll, 'bag_modified')
        # A missing AVU must not crash on .lower(); it is treated as "the bag has
        # to be (re)created", which is how the other bag code paths handle an
        # unset flag.
        if bag_modified is None or str(bag_modified).lower() == "true":
            # import here to avoid circular import issue
            from hs_core.tasks import create_bag_by_irods
            create_bag_by_irods(res_id)

        # do replication of the resource bag to irods user zone
        if not res.resource_federation_path:
            istorage.set_fed_zone_session()
        src_file = res.bag_path
        # TODO: allow setting destination path
        tgt_file = '/{userzone}/home/{username}/{resid}.zip'.format(
            userzone=settings.HS_USER_IRODS_ZONE,
            username=user.username,
            resid=res_id)
        # the quota is validated against the bag size before anything is copied
        fsize = istorage.size(src_file)
        validate_user_quota(user, fsize)
        istorage.copyFiles(src_file, tgt_file)
    else:
        raise ValidationError("Resource {} does not exist in iRODS".format(
            res.short_id))
Exemple #8
0
def replicate_resource_bag_to_user_zone(user, res_id):
    """
    Copy a resource's bag into the requesting user's iRODS user zone.

    The bag is regenerated first when the 'bag_modified' AVU on the resource
    collection is exactly "true".

    Args:
        user: the requesting user
        res_id: the resource id with its bag to be replicated to iRODS user zone

    Returns:
    None, but exceptions will be raised if there is an issue with iRODS operation
    """
    res = get_resource_by_shortkey(res_id)
    res_coll = res.root_path
    istorage = res.get_irods_storage()

    # The AVU is read only when the res_id collection exists, to accommodate the case
    # where the very same resource gets deleted by another request while it is getting
    # downloaded; otherwise the flag is taken to be "false".
    if istorage.exists(res_coll):
        avu_value = istorage.getAVU(res_coll, 'bag_modified')
    else:
        avu_value = "false"

    # do on-demand bag creation
    if avu_value == "true":
        # import here to avoid circular import issue
        from hs_core.tasks import create_bag_by_irods
        create_bag_by_irods(res_id)

    # do replication of the resource bag to irods user zone
    bag_rel_path = 'bags/{resid}.zip'.format(resid=res_id)
    fed_path = res.resource_federation_path
    if fed_path:
        bag_source = os.path.join(fed_path, bag_rel_path)
    else:
        istorage.set_fed_zone_session()
        bag_source = bag_rel_path

    bag_target = '/{userzone}/home/{username}/{resid}.zip'.format(
        userzone=settings.HS_USER_IRODS_ZONE,
        username=user.username,
        resid=res_id)
    istorage.copyFiles(bag_source, bag_target)
Exemple #9
0
def download(request, path, rest_call=False, use_async=True, use_reverse_proxy=True,
             *args, **kwargs):
    """ perform a download request, either asynchronously or synchronously

    :param request: the request object.
    :param path: the path of the thing to be downloaded.
    :param rest_call: True if calling from REST API
    :param use_async: True means to utilize asynchronous creation of objects to download.
    :param use_reverse_proxy: True means to utilize NGINX reverse proxy for streaming.
    :param kwargs: an optional 'environment' entry (primary key of a RodsEnvironment) is
        used to build a dedicated iRODS session for non-federated resources.
    :return: an HttpResponse, HttpResponseRedirect or FileResponse, depending on the case.
    :raises PermissionDenied: if the request is not authorized and rest_call is True.
    :raises KeyError: if no iRODS session can be determined for a non-federated resource.

    The following variables are computed:

    * `path` is the public path of the thing to be downloaded.
    * `irods_path` is the location of `path` in irods.
    * `output_path` is the output path to be reported in the response object.
    * `irods_output_path` is the location of `output_path` in irods

    and there are six cases:

    Zipped query param signal the download should be zipped
        - folders are always zipped regardless of this parameter
        - single file aggregations are zipped with the aggregation metadata files

    A path may point to:
    1. a single file
    2. a single-file-aggregation object in a composite resource.
    3. a folder
    4. a metadata object that may need updating.
    5. a bag that needs to be updated and then returned.
    6. a previously zipped file that was zipped asynchronously.

    """
    if __debug__:
        logger.debug("request path is {}".format(path))

    # drop empty trailing components so that the rebuilt path has no trailing slash
    # NOTE(review): a path that is empty or consists only of slashes empties the list,
    # and the next `[-1]` lookup then raises IndexError.
    split_path_strs = path.split('/')
    while split_path_strs[-1] == '':
        split_path_strs.pop()
    path = u'/'.join(split_path_strs)  # no trailing slash

    # initialize case variables
    is_bag_download = False
    is_zip_download = False
    is_zip_request = request.GET.get('zipped', "False").lower() == "true"
    is_sf_agg_file = False
    is_sf_request = False

    # the first path component selects the kind of request and where the resource id is
    # NOTE(review): the fixed indexes below assume the path has enough components
    # (at least 2 for bags, at least 4 for zips); shorter paths raise IndexError.
    if split_path_strs[0] == 'bags':
        is_bag_download = True
        # format is bags/{rid}.zip
        res_id = os.path.splitext(split_path_strs[1])[0]
    elif split_path_strs[0] == 'zips':
        is_zip_download = True
        # zips prefix means that we are following up on an asynchronous download request
        # format is zips/{date}/{zip-uuid}/{public-path}.zip where {public-path} contains the rid
        res_id = split_path_strs[3]
    else:  # regular download request
        res_id = split_path_strs[0]

    if __debug__:
        logger.debug("resource id is {}".format(res_id))

    # now we have the resource Id and can authorize the request
    # if the resource does not exist in django, authorized will be false
    res, authorized, _ = authorize(request, res_id,
                                   needed_permission=ACTION_TO_AUTHORIZE.VIEW_RESOURCE,
                                   raises_exception=False)
    if not authorized:
        # REST callers get an exception; UI callers get a 401 response with an HTML body
        response = HttpResponse(status=401)
        content_msg = "You do not have permission to download this resource!"
        if rest_call:
            raise PermissionDenied(content_msg)
        else:
            response.content = "<h1>" + content_msg + "</h1>"
            return response

    # default values are changed later as needed

    istorage = res.get_irods_storage()
    if res.is_federated:
        irods_path = os.path.join(res.resource_federation_path, path)
    else:
        irods_path = path
    # in many cases, path and output_path are the same.
    output_path = path

    irods_output_path = irods_path
    # folder requests are automatically zipped
    if not is_bag_download and not is_zip_download:  # path points into resource: should I zip it?
        store_path = u'/'.join(split_path_strs[1:])  # data/contents/{path-to-something}
        if res.is_folder(store_path):  # automatically zip folders
            is_zip_request = True
            # the zip goes to a fresh location: zips/{date}/{random hex}/{path}.zip
            daily_date = datetime.datetime.today().strftime('%Y-%m-%d')
            output_path = "zips/{}/{}/{}.zip".format(daily_date, uuid4().hex, path)
            if res.is_federated:
                irods_output_path = os.path.join(res.resource_federation_path, output_path)
            else:
                irods_output_path = output_path
            if __debug__:
                logger.debug("automatically zipping folder {} to {}".format(path, output_path))
        elif istorage.exists(irods_path):
            if __debug__:
                logger.debug("request for single file {}".format(path))
            is_sf_request = True

            # check for single file aggregations
            if "data/contents/" in path:  # not a metadata file
                # linear scan over the resource's files for the one stored at `path`
                for f in ResourceFile.objects.filter(object_id=res.id):
                    if path == f.storage_path:
                        is_sf_agg_file = True
                        if not is_zip_request and f.has_logical_file and \
                                f.logical_file.is_single_file_aggregation:
                            # unless url_download is requested, redirect to the referenced url
                            download_url = request.GET.get('url_download', 'false').lower()
                            if download_url == 'false':
                                # redirect to referenced url in the url file instead
                                redirect_url = f.logical_file.redirect_url
                                if redirect_url:
                                    return HttpResponseRedirect(redirect_url)
                        if __debug__:
                            logger.debug(
                                "request for single file aggregation {}".format(path))
                        break

            if is_zip_request:
                # explicit zip request for a single file: same zips/ layout as for folders
                daily_date = datetime.datetime.today().strftime('%Y-%m-%d')
                output_path = "zips/{}/{}/{}.zip".format(daily_date, uuid4().hex, path)
                if res.is_federated:
                    irods_output_path = os.path.join(res.resource_federation_path, output_path)
                else:
                    irods_output_path = output_path

    # After this point, we have valid path, irods_path, output_path, and irods_output_path
    # * is_zip_request: signals download should be zipped, folders are always zipped
    # * is_sf_agg_file: path is a single-file aggregation in Composite Resource
    # * is_sf_request: path is a single-file
    # flags for download:
    # * is_bag_download: download a bag in format bags/{rid}.zip
    # * is_zip_download: download a zipfile in format zips/{date}/{random guid}/{path}.zip
    # if none of these are set, it's a normal download

    # determine active session
    if res.is_federated:
        # the resource is stored in federated zone
        session = icommands.ACTIVE_SESSION
    else:
        # TODO: From Alva: I do not understand the use case for changing the environment.
        # TODO: This seems an enormous potential vulnerability, as arguments are
        # TODO: passed from the URI directly to IRODS without verification.
        if 'environment' in kwargs:
            logger.warn("setting iRODS from environment")
            environment = int(kwargs['environment'])
            environment = m.RodsEnvironment.objects.get(pk=environment)
            session = Session("/tmp/django_irods", settings.IRODS_ICOMMANDS_PATH,
                              session_id=uuid4())
            session.create_environment(environment)
            session.run('iinit', None, environment.auth)
        elif getattr(settings, 'IRODS_GLOBAL_SESSION', False):
            if __debug__:
                logger.debug("using GLOBAL_SESSION")
            session = GLOBAL_SESSION
        elif icommands.ACTIVE_SESSION:
            if __debug__:
                logger.debug("using ACTIVE_SESSION")
            session = icommands.ACTIVE_SESSION
        else:
            raise KeyError('settings must have IRODS_GLOBAL_SESSION set '
                           'if there is no environment object')

    resource_cls = check_resource_type(res.resource_type)

    if is_zip_request:

        if use_async:
            # queue the zip creation and its clean-up, then report the task to the caller
            task = create_temp_zip.apply_async((res_id, irods_path, irods_output_path,
                                                is_sf_agg_file, is_sf_request))
            delete_zip.apply_async((irods_output_path, ),
                                   countdown=(60 * 60 * 24))  # delete after 24 hours

            if rest_call:
                return HttpResponse(
                    json.dumps({
                        'zip_status': 'Not ready',
                        'task_id': task.task_id,
                        'download_path': '/django_irods/rest_download/' + output_path}),
                    content_type="application/json")
            else:
                # return status to the UI
                request.session['task_id'] = task.task_id
                # TODO: this is mistaken for a bag download in the UI!
                # TODO: multiple asynchronous downloads don't stack!
                request.session['download_path'] = '/django_irods/download/' + output_path
                # redirect to resource landing page, which interprets session variables.
                return HttpResponseRedirect(res.get_absolute_url())

        else:  # synchronous creation of download
            ret_status = create_temp_zip(res_id, irods_path, irods_output_path,
                                         is_sf_agg_file, is_sf_request)
            delete_zip.apply_async((irods_output_path, ),
                                   countdown=(60 * 60 * 24))  # delete after 24 hours
            if not ret_status:
                # NOTE(review): no status code is set on this error response, so it
                # carries HttpResponse's default status.
                content_msg = "Zip could not be created."
                response = HttpResponse()
                if rest_call:
                    response.content = content_msg
                else:
                    response.content = "<h1>" + content_msg + "</h1>"
                return response
            # At this point, output_path presumably exists and contains a zipfile
            # to be streamed below

    elif is_bag_download:
        # Shorten request if it contains extra junk at the end
        bag_file_name = res_id + '.zip'
        output_path = os.path.join('bags', bag_file_name)
        if not res.is_federated:
            irods_output_path = output_path
        else:
            irods_output_path = os.path.join(res.resource_federation_path, output_path)

        bag_modified = res.getAVU('bag_modified')
        # recreate the bag if it doesn't exist even if bag_modified is "false".
        if __debug__:
            logger.debug(u"irods_output_path is {}".format(irods_output_path))
        if bag_modified is None or not bag_modified:
            if not istorage.exists(irods_output_path):
                bag_modified = True

        # send signal for pre_check_bag_flag
        # this generates metadata other than that generated by create_bag_files.
        pre_check_bag_flag.send(sender=resource_cls, resource=res)

        # regenerated metadata files also force the bag to be rebuilt
        metadata_dirty = res.getAVU('metadata_dirty')
        if metadata_dirty is None or metadata_dirty:
            create_bag_files(res)  # sets metadata_dirty to False
            bag_modified = "True"

        if bag_modified is None or bag_modified:
            if use_async:
                # task parameter has to be passed in as a tuple or list, hence (res_id,) is needed
                # Note that since we are using JSON for task parameter serialization, no complex
                # object can be passed as parameters to a celery task
                task = create_bag_by_irods.apply_async((res_id,), countdown=3)
                if rest_call:
                    return HttpResponse(json.dumps({'bag_status': 'Not ready',
                                                    'task_id': task.task_id}),
                                        content_type="application/json")

                request.session['task_id'] = task.task_id
                request.session['download_path'] = request.path
                return HttpResponseRedirect(res.get_absolute_url())
            else:
                ret_status = create_bag_by_irods(res_id)
                if not ret_status:
                    # NOTE(review): as with the zip failure above, no status code is
                    # set on this error response.
                    content_msg = "Bag cannot be created successfully. Check log for details."
                    response = HttpResponse()
                    if rest_call:
                        response.content = content_msg
                    else:
                        response.content = "<h1>" + content_msg + "</h1>"
                    return response

    else:  # regular file download
        # if fetching main metadata files, then these need to be refreshed.
        if path.endswith("resourcemap.xml") or path.endswith('resourcemetadata.xml'):
            metadata_dirty = res.getAVU("metadata_dirty")
            if metadata_dirty is None or metadata_dirty:
                create_bag_files(res)  # sets metadata_dirty to False

        # send signal for pre download file
        # TODO: does not contain subdirectory information: duplicate refreshes possible
        download_file_name = split_path_strs[-1]  # end of path
        # this logs the download request in the tracking system
        pre_download_file.send(sender=resource_cls, resource=res,
                               download_file_name=download_file_name,
                               request=request)

    # If we get this far,
    # * path and irods_path point to true input
    # * output_path and irods_output_path point to true output.
    # Try to stream the file back to the requester.

    # obtain mime_type to set content_type
    # NOTE(review): the fallback below is spelled 'application-x/octet-stream'; the
    # registered generic type is 'application/octet-stream' - confirm this is intended.
    mtype = 'application-x/octet-stream'
    mime_type = mimetypes.guess_type(output_path)
    if mime_type[0] is not None:
        mtype = mime_type[0]
    # retrieve file size to set up Content-Length header
    # TODO: standardize this to make it less brittle
    # the fourth whitespace-separated token of the `ils -l` output is taken as the size
    # NOTE(review): this runs before the existence check further down, so a missing
    # object presumably fails here (IndexError/ValueError) rather than yielding the 404.
    stdout = session.run("ils", None, "-l", irods_output_path)[0].split()
    flen = int(stdout[3])

    # Allow reverse proxy if request was forwarded by nginx (HTTP_X_DJANGO_REVERSE_PROXY='true')
    # and reverse proxy is possible according to configuration (SENDFILE_ON=True)
    # and reverse proxy isn't overridden by user (use_reverse_proxy=True).

    if use_reverse_proxy and getattr(settings, 'SENDFILE_ON', False) and \
       'HTTP_X_DJANGO_REVERSE_PROXY' in request.META:

        # The NGINX sendfile abstraction is invoked as follows:
        # 1. The request to download a file enters this routine via the /rest_download or /download
        #    url in ./urls.py. It is redirected here from Django. The URI contains either the
        #    unqualified resource path or the federated resource path, depending upon whether
        #    the request is local or federated.
        # 2. This deals with unfederated resources by redirecting them to the uri
        #    /irods-data/{resource-id}/... on nginx. This URI is configured to read the file
        #    directly from the iRODS vault via NFS, and does not work for direct access to the
        #    vault due to the 'internal;' declaration in NGINX.
        # 3. This deals with federated resources by reading their path, matching local vaults, and
        #    redirecting to URIs that are in turn mapped to read from appropriate iRODS vaults. At
        #    present, the only one of these is /irods-user, which handles files whose federation
        #    path is stored in the variable 'userpath'.
        # 4. If there is no vault available for the resource, the file is transferred without
        #    NGINX, exactly as it was transferred previously.

        # If this path is resource_federation_path, then the file is a local user file
        userpath = '/' + os.path.join(
            getattr(settings, 'HS_USER_IRODS_ZONE', 'hydroshareuserZone'),
            'home',
            getattr(settings, 'HS_LOCAL_PROXY_USER_IN_FED_ZONE', 'localHydroProxy'))

        # stop NGINX targets that are non-existent from hanging forever.
        if not istorage.exists(irods_output_path):
            content_msg = "file path {} does not exist in iRODS".format(output_path)
            response = HttpResponse(status=404)
            if rest_call:
                response.content = content_msg
            else:
                response.content = "<h1>" + content_msg + "</h1>"
            return response

        if not res.is_federated:
            # track download count
            res.update_download_count()

            # invoke X-Accel-Redirect on physical vault file in nginx
            response = HttpResponse(content_type=mtype)
            response['Content-Disposition'] = 'attachment; filename="{name}"'.format(
                name=output_path.split('/')[-1])
            response['Content-Length'] = flen
            response['X-Accel-Redirect'] = '/'.join([
                getattr(settings, 'IRODS_DATA_URI', '/irods-data'), output_path])
            if __debug__:
                logger.debug("Reverse proxying local {}".format(response['X-Accel-Redirect']))
            return response

        elif res.resource_federation_path == userpath:  # this guarantees a "user" resource
            # track download count
            res.update_download_count()

            # invoke X-Accel-Redirect on physical vault file in nginx
            response = HttpResponse(content_type=mtype)
            response['Content-Disposition'] = 'attachment; filename="{name}"'.format(
                name=output_path.split('/')[-1])
            response['Content-Length'] = flen
            response['X-Accel-Redirect'] = os.path.join(
                getattr(settings, 'IRODS_USER_URI', '/irods-user'), output_path)
            if __debug__:
                logger.debug("Reverse proxying user {}".format(response['X-Accel-Redirect']))
            return response

    # if we get here, none of the above conditions are true
    # if reverse proxy is enabled, then this is because the resource is remote and federated
    # OR the user specifically requested a non-proxied download.

    options = ('-',)  # we're redirecting to stdout.
    # this unusual way of calling works for streaming federated or local resources
    if __debug__:
        logger.debug("Locally streaming {}".format(output_path))
    # track download count
    res.update_download_count()
    # the stdout of the `iget` process is handed to FileResponse as the response body
    proc = session.run_safe('iget', None, irods_output_path, *options)
    response = FileResponse(proc.stdout, content_type=mtype)
    response['Content-Disposition'] = 'attachment; filename="{name}"'.format(
        name=output_path.split('/')[-1])
    response['Content-Length'] = flen
    return response
Exemple #10
0
 def test_create_bag_by_irods(self):
     """Check that create_bag_by_irods() runs on the test resource without raising.

     Any exception raised by the call is turned into a test failure whose
     message includes the text of that exception.
     """
     try:
         # this is the API call we are testing
         create_bag_by_irods(self.test_res.short_id)
     except Exception as ex:
         # Format the exception object itself: Python 3 exceptions have no
         # `message` attribute, so reading it here would raise AttributeError
         # and hide the original error.
         self.fail("create_bag_by_irods() raised exception.{}".format(ex))
Exemple #11
0
def download(request, path, rest_call=False, use_async=True, use_reverse_proxy=True,
             *args, **kwargs):
    """Perform a download request, either asynchronously or synchronously.

    :param request: the request object.
    :param path: the path of the thing to be downloaded.
    :param rest_call: True if calling from REST API
    :param use_async: True means to utilize asynchronous creation of objects to download.
    :param use_reverse_proxy: True means to utilize NGINX reverse proxy for streaming.
    :param kwargs: may contain 'environment', the primary key of a RodsEnvironment used to
        build a dedicated iRODS session for a non-federated resource.
    :return: an HttpResponse (error page, JSON task status or X-Accel-Redirect headers),
        an HttpResponseRedirect, or a FileResponse streaming the requested object.
    :raises PermissionDenied: if `rest_call` is True and the request is not authorized.
    :raises KeyError: if no iRODS session is available for a non-federated resource.

    The following variables are computed:

    * `path` is the public path of the thing to be downloaded.
    * `irods_path` is the location of `path` in irods.
    * `output_path` is the output path to be reported in the response object.
    * `irods_output_path` is the location of `output_path` in irods

    and there are six cases:

    Zipped query param signals the download should be zipped
        - folders are always zipped regardless of this parameter
        - single file aggregations are zipped with the aggregation metadata files

    A path may point to:
    1. a single file
    2. a single-file-aggregation object in a composite resource.
    3. a folder
    4. a metadata object that may need updating.
    5. a bag that needs to be updated and then returned.
    6. a previously zipped file that was zipped asynchronously.

    """
    if __debug__:
        logger.debug("request path is {}".format(path))

    split_path_strs = path.split('/')
    # the emptiness check keeps an empty or all-slash path from indexing an empty list
    while split_path_strs and split_path_strs[-1] == '':
        split_path_strs.pop()
    path = u'/'.join(split_path_strs)  # no trailing slash

    # The resource id sits at a different depth for each kind of path. Reject paths
    # that are too short to contain it rather than failing later with an IndexError.
    path_prefix = split_path_strs[0] if split_path_strs else None
    if path_prefix == 'bags':
        min_parts = 2  # bags/{rid}.zip
    elif path_prefix == 'zips':
        min_parts = 4  # zips/{date}/{zip-uuid}/{rid}/...
    else:
        min_parts = 1  # {rid}/...
    if len(split_path_strs) < min_parts:
        content_msg = "Invalid download path."
        response = HttpResponse(status=404)
        if rest_call:
            response.content = content_msg
        else:
            response.content = "<h1>" + content_msg + "</h1>"
        return response

    # initialize case variables
    is_bag_download = False
    is_zip_download = False
    is_zip_request = request.GET.get('zipped', "False").lower() == "true"
    is_sf_agg_file = False
    is_sf_request = False

    if path_prefix == 'bags':
        is_bag_download = True
        # format is bags/{rid}.zip
        res_id = os.path.splitext(split_path_strs[1])[0]
    elif path_prefix == 'zips':
        is_zip_download = True
        # zips prefix means that we are following up on an asynchronous download request
        # format is zips/{date}/{zip-uuid}/{public-path}.zip where {public-path} contains the rid
        res_id = split_path_strs[3]
    else:  # regular download request
        res_id = split_path_strs[0]

    if __debug__:
        logger.debug("resource id is {}".format(res_id))

    # now we have the resource Id and can authorize the request
    # if the resource does not exist in django, authorized will be false
    res, authorized, _ = authorize(request, res_id,
                                   needed_permission=ACTION_TO_AUTHORIZE.VIEW_RESOURCE,
                                   raises_exception=False)
    if not authorized:
        content_msg = "You do not have permission to download this resource!"
        if rest_call:
            raise PermissionDenied(content_msg)
        response = HttpResponse(status=401)
        response.content = "<h1>" + content_msg + "</h1>"
        return response

    # default values are changed later as needed

    istorage = res.get_irods_storage()
    if res.is_federated:
        irods_path = os.path.join(res.resource_federation_path, path)
    else:
        irods_path = path
    # in many cases, path and output_path are the same.
    output_path = path

    irods_output_path = irods_path
    # folder requests are automatically zipped
    if not is_bag_download and not is_zip_download:  # path points into resource: should I zip it?
        store_path = u'/'.join(split_path_strs[1:])  # data/contents/{path-to-something}
        if res.is_folder(store_path):  # automatically zip folders
            is_zip_request = True
            daily_date = datetime.datetime.today().strftime('%Y-%m-%d')
            output_path = "zips/{}/{}/{}.zip".format(daily_date, uuid4().hex, path)
            if res.is_federated:
                irods_output_path = os.path.join(res.resource_federation_path, output_path)
            else:
                irods_output_path = output_path
            if __debug__:
                logger.debug("automatically zipping folder {} to {}".format(path, output_path))
        elif istorage.exists(irods_path):
            if __debug__:
                logger.debug("request for single file {}".format(path))
            is_sf_request = True

            # check for single file aggregations
            if "data/contents/" in path:  # not a metadata file
                for f in ResourceFile.objects.filter(object_id=res.id):
                    if path == f.storage_path:
                        is_sf_agg_file = True
                        if not is_zip_request and f.has_logical_file and \
                                f.logical_file.is_single_file_aggregation:
                            download_url = request.GET.get('url_download', 'false').lower()
                            if download_url == 'false':
                                # redirect to referenced url in the url file instead
                                redirect_url = f.logical_file.redirect_url
                                if redirect_url:
                                    return HttpResponseRedirect(redirect_url)
                        if __debug__:
                            logger.debug(
                                "request for single file aggregation {}".format(path))
                        break

            if is_zip_request:
                daily_date = datetime.datetime.today().strftime('%Y-%m-%d')
                output_path = "zips/{}/{}/{}.zip".format(daily_date, uuid4().hex, path)
                if res.is_federated:
                    irods_output_path = os.path.join(res.resource_federation_path, output_path)
                else:
                    irods_output_path = output_path

    # After this point, we have valid path, irods_path, output_path, and irods_output_path
    # * is_zip_request: signals download should be zipped, folders are always zipped
    # * is_sf_agg_file: path is a single-file aggregation in Composite Resource
    # * is_sf_request: path is a single-file
    # flags for download:
    # * is_bag_download: download a bag in format bags/{rid}.zip
    # * is_zip_download: download a zipfile in format zips/{date}/{random guid}/{path}.zip
    # if none of these are set, it's a normal download

    # determine active session
    if res.is_federated:
        # the resource is stored in federated zone
        session = icommands.ACTIVE_SESSION
    else:
        # TODO: From Alva: I do not understand the use case for changing the environment.
        # TODO: This seems an enormous potential vulnerability, as arguments are
        # TODO: passed from the URI directly to IRODS without verification.
        if 'environment' in kwargs:
            # Logger.warn is a deprecated alias of Logger.warning
            logger.warning("setting iRODS from environment")
            environment = int(kwargs['environment'])
            environment = m.RodsEnvironment.objects.get(pk=environment)
            session = Session("/tmp/django_irods", settings.IRODS_ICOMMANDS_PATH,
                              session_id=uuid4())
            session.create_environment(environment)
            session.run('iinit', None, environment.auth)
        elif getattr(settings, 'IRODS_GLOBAL_SESSION', False):
            if __debug__:
                logger.debug("using GLOBAL_SESSION")
            session = GLOBAL_SESSION
        elif icommands.ACTIVE_SESSION:
            if __debug__:
                logger.debug("using ACTIVE_SESSION")
            session = icommands.ACTIVE_SESSION
        else:
            raise KeyError('settings must have IRODS_GLOBAL_SESSION set '
                           'if there is no environment object')

    resource_cls = check_resource_type(res.resource_type)

    if is_zip_request:

        if use_async:
            task = create_temp_zip.apply_async((res_id, irods_path, irods_output_path,
                                                is_sf_agg_file, is_sf_request))
            delete_zip.apply_async((irods_output_path, ),
                                   countdown=(60 * 60 * 24))  # delete after 24 hours

            if rest_call:
                return HttpResponse(
                    json.dumps({
                        'zip_status': 'Not ready',
                        'task_id': task.task_id,
                        'download_path': '/django_irods/rest_download/' + output_path}),
                    content_type="application/json")
            else:
                # return status to the UI
                request.session['task_id'] = task.task_id
                # TODO: this is mistaken for a bag download in the UI!
                # TODO: multiple asynchronous downloads don't stack!
                request.session['download_path'] = '/django_irods/download/' + output_path
                # redirect to resource landing page, which interprets session variables.
                return HttpResponseRedirect(res.get_absolute_url())

        else:  # synchronous creation of download
            ret_status = create_temp_zip(res_id, irods_path, irods_output_path,
                                         is_sf_agg_file, is_sf_request)
            delete_zip.apply_async((irods_output_path, ),
                                   countdown=(60 * 60 * 24))  # delete after 24 hours
            if not ret_status:
                content_msg = "Zip could not be created."
                response = HttpResponse()
                if rest_call:
                    response.content = content_msg
                else:
                    response.content = "<h1>" + content_msg + "</h1>"
                return response
            # At this point, output_path presumably exists and contains a zipfile
            # to be streamed below

    elif is_bag_download:
        # Shorten request if it contains extra junk at the end
        bag_file_name = res_id + '.zip'
        output_path = os.path.join('bags', bag_file_name)
        if not res.is_federated:
            irods_output_path = output_path
        else:
            irods_output_path = os.path.join(res.resource_federation_path, output_path)

        bag_modified = res.getAVU('bag_modified')
        # recreate the bag if it doesn't exist even if bag_modified is "false".
        if __debug__:
            logger.debug(u"irods_output_path is {}".format(irods_output_path))
        if bag_modified is None or not bag_modified:
            if not istorage.exists(irods_output_path):
                bag_modified = True

        # send signal for pre_check_bag_flag
        # this generates metadata other than that generated by create_bag_files.
        pre_check_bag_flag.send(sender=resource_cls, resource=res)

        metadata_dirty = res.getAVU('metadata_dirty')
        if metadata_dirty is None or metadata_dirty:
            create_bag_files(res)  # sets metadata_dirty to False
            # regenerated metadata means the bag itself must be rebuilt
            bag_modified = True

        if bag_modified is None or bag_modified:
            if use_async:
                # task parameter has to be passed in as a tuple or list, hence (res_id,) is needed
                # Note that since we are using JSON for task parameter serialization, no complex
                # object can be passed as parameters to a celery task
                task = create_bag_by_irods.apply_async((res_id,), countdown=3)
                if rest_call:
                    return HttpResponse(json.dumps({'bag_status': 'Not ready',
                                                    'task_id': task.task_id}),
                                        content_type="application/json")

                request.session['task_id'] = task.task_id
                request.session['download_path'] = request.path
                return HttpResponseRedirect(res.get_absolute_url())
            else:
                ret_status = create_bag_by_irods(res_id)
                if not ret_status:
                    content_msg = "Bag cannot be created successfully. Check log for details."
                    response = HttpResponse()
                    if rest_call:
                        response.content = content_msg
                    else:
                        response.content = "<h1>" + content_msg + "</h1>"
                    return response

    else:  # regular file download
        # if fetching main metadata files, then these need to be refreshed.
        if path.endswith("resourcemap.xml") or path.endswith('resourcemetadata.xml'):
            metadata_dirty = res.getAVU("metadata_dirty")
            if metadata_dirty is None or metadata_dirty:
                create_bag_files(res)  # sets metadata_dirty to False

        # send signal for pre download file
        # TODO: does not contain subdirectory information: duplicate refreshes possible
        download_file_name = split_path_strs[-1]  # end of path
        # this logs the download request in the tracking system
        pre_download_file.send(sender=resource_cls, resource=res,
                               download_file_name=download_file_name,
                               request=request)

    # If we get this far,
    # * path and irods_path point to true input
    # * output_path and irods_output_path point to true output.
    # Try to stream the file back to the requester.

    # obtain mime_type to set content_type
    mtype = 'application-x/octet-stream'
    mime_type = mimetypes.guess_type(output_path)
    if mime_type[0] is not None:
        mtype = mime_type[0]
    # retrieve file size to set up Content-Length header
    # TODO: standardize this to make it less brittle
    stdout = session.run("ils", None, "-l", irods_output_path)[0].split()
    flen = int(stdout[3])

    # Allow reverse proxy if request was forwarded by nginx (HTTP_X_DJANGO_REVERSE_PROXY='true')
    # and reverse proxy is possible according to configuration (SENDFILE_ON=True)
    # and reverse proxy isn't overridden by user (use_reverse_proxy=True).

    if use_reverse_proxy and getattr(settings, 'SENDFILE_ON', False) and \
       'HTTP_X_DJANGO_REVERSE_PROXY' in request.META:

        # The NGINX sendfile abstraction is invoked as follows:
        # 1. The request to download a file enters this routine via the /rest_download or /download
        #    url in ./urls.py. It is redirected here from Django. The URI contains either the
        #    unqualified resource path or the federated resource path, depending upon whether
        #    the request is local or federated.
        # 2. This deals with unfederated resources by redirecting them to the uri
        #    /irods-data/{resource-id}/... on nginx. This URI is configured to read the file
        #    directly from the iRODS vault via NFS, and does not work for direct access to the
        #    vault due to the 'internal;' declaration in NGINX.
        # 3. This deals with federated resources by reading their path, matching local vaults, and
        #    redirecting to URIs that are in turn mapped to read from appropriate iRODS vaults. At
        #    present, the only one of these is /irods-user, which handles files whose federation
        #    path is stored in the variable 'userpath'.
        # 4. If there is no vault available for the resource, the file is transferred without
        #    NGINX, exactly as it was transferred previously.

        # If this path is resource_federation_path, then the file is a local user file
        userpath = '/' + os.path.join(
            getattr(settings, 'HS_USER_IRODS_ZONE', 'hydroshareuserZone'),
            'home',
            getattr(settings, 'HS_IRODS_PROXY_USER_IN_USER_ZONE', 'localHydroProxy'))

        # stop NGINX targets that are non-existent from hanging forever.
        if not istorage.exists(irods_output_path):
            content_msg = "file path {} does not exist in iRODS".format(output_path)
            response = HttpResponse(status=404)
            if rest_call:
                response.content = content_msg
            else:
                response.content = "<h1>" + content_msg + "</h1>"
            return response

        if not res.is_federated:
            # track download count
            res.update_download_count()

            # invoke X-Accel-Redirect on physical vault file in nginx
            response = HttpResponse(content_type=mtype)
            response['Content-Disposition'] = 'attachment; filename="{name}"'.format(
                name=output_path.split('/')[-1])
            response['Content-Length'] = flen
            response['X-Accel-Redirect'] = '/'.join([
                getattr(settings, 'IRODS_DATA_URI', '/irods-data'), output_path])
            if __debug__:
                logger.debug("Reverse proxying local {}".format(response['X-Accel-Redirect']))
            return response

        elif res.resource_federation_path == userpath:  # this guarantees a "user" resource
            # track download count
            res.update_download_count()

            # invoke X-Accel-Redirect on physical vault file in nginx
            response = HttpResponse(content_type=mtype)
            response['Content-Disposition'] = 'attachment; filename="{name}"'.format(
                name=output_path.split('/')[-1])
            response['Content-Length'] = flen
            response['X-Accel-Redirect'] = os.path.join(
                getattr(settings, 'IRODS_USER_URI', '/irods-user'), output_path)
            if __debug__:
                logger.debug("Reverse proxying user {}".format(response['X-Accel-Redirect']))
            return response

    # if we get here, none of the above conditions are true
    # if reverse proxy is enabled, then this is because the resource is remote and federated
    # OR the user specifically requested a non-proxied download.

    options = ('-',)  # we're redirecting to stdout.
    # this unusual way of calling works for streaming federated or local resources
    if __debug__:
        logger.debug("Locally streaming {}".format(output_path))
    # track download count
    res.update_download_count()
    proc = session.run_safe('iget', None, irods_output_path, *options)
    response = FileResponse(proc.stdout, content_type=mtype)
    response['Content-Disposition'] = 'attachment; filename="{name}"'.format(
        name=output_path.split('/')[-1])
    response['Content-Length'] = flen
    return response
Exemple #12
0
    def handle(self, *args, **options):
        """Report on, and optionally regenerate or reset, bag artifacts of resources.

        For each short id in ``options['resource_ids']`` this prints whether the
        science metadata file, the resource map and the bag exist in iRODS, followed
        by the values of the 'metadata_dirty' and 'bag_modified' AVUs. The first truthy
        option among 'generate', 'generate_metadata', 'generate_bag', 'reset',
        'reset_metadata' and 'reset_bag' then selects the single action to perform.
        Ids that do not match a BaseResource are reported and skipped.
        """

        def show_presence(storage, target, found_fmt, missing_fmt):
            # print one of two messages depending on whether target exists in storage
            if storage.exists(target):
                print(found_fmt.format(target))
            else:
                print(missing_fmt.format(target))

        def rebuild_metadata(res, res_id):
            # regenerate metadata files and clear the dirty flag
            create_bag_files(res)
            print("metadata generated for {} from Django".format(res_id))
            res.setAVU('metadata_dirty', 'false')
            print("metadata_dirty set to false for {}".format(res_id))

        def rebuild_bag(res, res_id):
            # regenerate the bag and clear the modified flag
            create_bag_by_irods(res_id)
            print("bag generated for {} from iRODs".format(res_id))
            res.setAVU('bag_modified', 'false')
            print("bag_modified set to false for {}".format(res_id))

        def remove(storage, res, path_attr, success_fmt):
            # best-effort delete of the path stored in attribute path_attr of res;
            # a failure is reported rather than propagated
            try:
                storage.delete(getattr(res, path_attr))
                print(success_fmt.format(getattr(res, path_attr)))
            except SessionException as err:
                print("delete of {} failed: {}"
                      .format(getattr(res, path_attr),
                              err.stderr))

        def invalidate_metadata(storage, res, res_id):
            # flag metadata as dirty, then delete both metadata files
            res.setAVU('metadata_dirty', 'true')
            print("metadata_dirty set to true for {}".format(res_id))
            remove(storage, res, 'scimeta_path', "metadata {} deleted")
            remove(storage, res, 'resmap_path', "map {} deleted")

        def invalidate_bag(storage, res, res_id):
            # flag the bag as modified, then delete the bag file
            res.setAVU('bag_modified', 'true')
            print("bag_modified set to true for {}".format(res_id))
            remove(storage, res, 'bag_path', "bag {} deleted")

        # nothing to do without an array of resource short_id to check
        if len(options['resource_ids']) == 0:
            return

        for rid in options['resource_ids']:
            try:
                resource = BaseResource.objects.get(short_id=rid)
                istorage = resource.get_irods_storage()

                data_dir = (resource.root_path, 'data')
                show_presence(istorage,
                              os.path.join(*(data_dir + ('resourcemetadata.xml',))),
                              "found {}", "{} NOT FOUND")
                show_presence(istorage,
                              os.path.join(*(data_dir + ('resourcemap.xml',))),
                              "found {}", "{} NOT FOUND")
                show_presence(istorage, resource.bag_path,
                              "found bag {}", "bag {} NOT FOUND")

                dirty = resource.getAVU('metadata_dirty')
                print("metadata_dirty is {}".format(str(dirty)))

                modified = resource.getAVU('bag_modified')
                print("bag_modified is {}".format(str(modified)))

                if options['generate']:  # generate usable bag
                    rebuild_metadata(resource, rid)
                    rebuild_bag(resource, rid)
                elif options['generate_metadata']:
                    rebuild_metadata(resource, rid)
                elif options['generate_bag']:
                    rebuild_bag(resource, rid)
                elif options['reset']:  # reset all data to pristine
                    invalidate_metadata(istorage, resource, rid)
                    invalidate_bag(istorage, resource, rid)
                elif options['reset_metadata']:
                    invalidate_metadata(istorage, resource, rid)
                elif options['reset_bag']:
                    invalidate_bag(istorage, resource, rid)

            except BaseResource.DoesNotExist:
                print("Resource with id {} NOT FOUND in Django".format(rid))
Exemple #13
0
def download(request,
             path,
             use_async=True,
             use_reverse_proxy=True,
             *args,
             **kwargs):
    """ perform a download request, either asynchronously or synchronously

    :param request: the request object.
    :param path: the path of the thing to be downloaded.
    :param use_async: True means to utilize asynchronous creation of objects to download.
    :param use_reverse_proxy: True means to utilize NGINX reverse proxy for streaming.

    The following variables are computed:

    * `path` is the public path of the thing to be downloaded.
    * `irods_path` is the location of `path` in irods.
    * `output_path` is the output path to be reported in the response object.
    * `irods_output_path` is the location of `output_path` in irods

    and there are six cases:

    Zipped query param signal the download should be zipped
        - folders are always zipped regardless of this paramter
        - single file aggregations are zipped with the aggregation metadata files

    A path may point to:
    1. a single file
    2. a single-file-aggregation object in a composite resource.
    3. a folder
    3. a metadata object that may need updating.
    4. a bag that needs to be updated and then returned.
    6. a previously zipped file that was zipped asynchronously.

    """
    if __debug__:
        logger.debug("request path is {}".format(path))

    split_path_strs = path.split('/')
    while split_path_strs[-1] == '':
        split_path_strs.pop()
    path = '/'.join(split_path_strs)  # no trailing slash

    # initialize case variables
    is_bag_download = False
    is_zip_download = False
    is_zip_request = request.GET.get('zipped', "False").lower() == "true"
    is_aggregation_request = request.GET.get('aggregation',
                                             "False").lower() == "true"
    api_request = request.META.get('CSRF_COOKIE', None) is None
    aggregation_name = None
    is_sf_request = False

    if split_path_strs[0] == 'bags':
        is_bag_download = True
        # format is bags/{rid}.zip
        res_id = os.path.splitext(split_path_strs[1])[0]
    elif split_path_strs[0] == 'zips':
        is_zip_download = True
        # zips prefix means that we are following up on an asynchronous download request
        # format is zips/{date}/{zip-uuid}/{public-path}.zip where {public-path} contains the rid
        res_id = split_path_strs[3]
    else:  # regular download request
        res_id = split_path_strs[0]

    if __debug__:
        logger.debug("resource id is {}".format(res_id))

    # now we have the resource Id and can authorize the request
    # if the resource does not exist in django, authorized will be false
    res, authorized, _ = authorize(
        request,
        res_id,
        needed_permission=ACTION_TO_AUTHORIZE.VIEW_RESOURCE,
        raises_exception=False)
    if not authorized:
        response = HttpResponse(status=401)
        content_msg = "You do not have permission to download this resource!"
        response.content = content_msg
        return response

    istorage = res.get_irods_storage()

    irods_path = res.get_irods_path(path, prepend_short_id=False)

    # in many cases, path and output_path are the same.
    output_path = path
    irods_output_path = irods_path
    # folder requests are automatically zipped
    if not is_bag_download and not is_zip_download:  # path points into resource: should I zip it?
        # check for aggregations
        if is_aggregation_request and res.resource_type == "CompositeResource":
            prefix = res.file_path
            if path.startswith(prefix):
                # +1 to remove trailing slash
                aggregation_name = path[len(prefix) + 1:]
            aggregation = res.get_aggregation_by_aggregation_name(
                aggregation_name)
            if not is_zip_request:
                download_url = request.GET.get('url_download', 'false').lower()
                if download_url == 'false':
                    # redirect to referenced url in the url file instead
                    if hasattr(aggregation, 'redirect_url'):
                        return HttpResponseRedirect(aggregation.redirect_url)
            # point to the main file path
            path = aggregation.get_main_file.url[len("/resource/"):]
            is_zip_request = True
            daily_date = datetime.datetime.today().strftime('%Y-%m-%d')
            output_path = "zips/{}/{}/{}.zip".format(daily_date,
                                                     uuid4().hex, path)

            irods_path = res.get_irods_path(path, prepend_short_id=False)
            irods_output_path = res.get_irods_path(output_path,
                                                   prepend_short_id=False)

        store_path = '/'.join(
            split_path_strs[1:])  # data/contents/{path-to-something}
        if res.is_folder(store_path):  # automatically zip folders
            is_zip_request = True
            daily_date = datetime.datetime.today().strftime('%Y-%m-%d')
            output_path = "zips/{}/{}/{}.zip".format(daily_date,
                                                     uuid4().hex, path)
            irods_output_path = res.get_irods_path(output_path,
                                                   prepend_short_id=False)

            if __debug__:
                logger.debug("automatically zipping folder {} to {}".format(
                    path, output_path))
        elif istorage.exists(irods_path):
            if __debug__:
                logger.debug("request for single file {}".format(path))
            is_sf_request = True

            if is_zip_request:
                daily_date = datetime.datetime.today().strftime('%Y-%m-%d')
                output_path = "zips/{}/{}/{}.zip".format(
                    daily_date,
                    uuid4().hex, path)
                irods_output_path = res.get_irods_path(output_path,
                                                       prepend_short_id=False)

    # After this point, we have valid path, irods_path, output_path, and irods_output_path
    # * is_zip_request: signals download should be zipped, folders are always zipped
    # * aggregation: aggregation object if the path matches an aggregation
    # * is_sf_request: path is a single-file
    # flags for download:
    # * is_bag_download: download a bag in format bags/{rid}.zip
    # * is_zip_download: download a zipfile in format zips/{date}/{random guid}/{path}.zip
    # if none of these are set, it's a normal download

    # determine active session
    if icommands.ACTIVE_SESSION:
        if __debug__:
            logger.debug("using ACTIVE_SESSION")
        session = icommands.ACTIVE_SESSION
    else:
        raise KeyError('settings must have IRODS_GLOBAL_SESSION set ')

    resource_cls = check_resource_type(res.resource_type)

    if is_zip_request:
        download_path = '/django_irods/rest_download/' + output_path
        if use_async:
            user_id = get_task_user_id(request)
            task = create_temp_zip.apply_async(
                (res_id, irods_path, irods_output_path, aggregation_name,
                 is_sf_request, download_path, user_id))
            task_id = task.task_id
            delete_zip.apply_async(
                (irods_output_path, ),
                countdown=(60 * 60 * 24))  # delete after 24 hours
            if api_request:
                return JsonResponse({
                    'zip_status':
                    'Not ready',
                    'task_id':
                    task.task_id,
                    'download_path':
                    '/django_irods/rest_download/' + output_path
                })
            else:
                # return status to the task notification App AJAX call
                task_dict = get_or_create_task_notification(
                    task_id,
                    name='zip download',
                    payload=download_path,
                    username=user_id)
                return JsonResponse(task_dict)

        else:  # synchronous creation of download
            ret_status = create_temp_zip(res_id,
                                         irods_path,
                                         irods_output_path,
                                         aggregation_name=aggregation_name,
                                         sf_zip=is_sf_request,
                                         download_path=download_path)
            delete_zip.apply_async(
                (irods_output_path, ),
                countdown=(60 * 60 * 24))  # delete after 24 hours
            if not ret_status:
                content_msg = "Zip could not be created."
                response = HttpResponse()
                response.content = content_msg
                return response
            # At this point, output_path presumably exists and contains a zipfile
            # to be streamed below

    elif is_bag_download:
        # Shorten request if it contains extra junk at the end
        bag_file_name = res_id + '.zip'
        output_path = os.path.join('bags', bag_file_name)
        irods_output_path = res.bag_path

        bag_modified = res.getAVU('bag_modified')
        # recreate the bag if it doesn't exist even if bag_modified is "false".
        if __debug__:
            logger.debug("irods_output_path is {}".format(irods_output_path))
        if bag_modified is None or not bag_modified:
            if not istorage.exists(irods_output_path):
                bag_modified = True

        # send signal for pre_check_bag_flag
        # this generates metadata other than that generated by create_bag_files.
        pre_check_bag_flag.send(sender=resource_cls, resource=res)

        if bag_modified is None or bag_modified:
            if use_async:
                # task parameter has to be passed in as a tuple or list, hence (res_id,) is needed
                # Note that since we are using JSON for task parameter serialization, no complex
                # object can be passed as parameters to a celery task

                task_id = get_resource_bag_task(res_id)
                user_id = get_task_user_id(request)
                if not task_id:
                    # create the bag
                    task = create_bag_by_irods.apply_async((res_id, ))
                    task_id = task.task_id
                    if api_request:
                        return JsonResponse({
                            'bag_status': 'Not ready',
                            'task_id': task_id,
                            'download_path': res.bag_url,
                            # status and id are checked by hs_core.tests.api.rest.test_create_resource.py
                            'status': 'Not ready',
                            'id': task_id
                        })
                    else:
                        task_dict = get_or_create_task_notification(
                            task_id,
                            name='bag download',
                            payload=res.bag_url,
                            username=user_id)
                        return JsonResponse(task_dict)
                else:
                    # bag creation has already started
                    if api_request:
                        return JsonResponse({
                            'bag_status': 'Not ready',
                            'task_id': task_id,
                            'download_path': res.bag_url
                        })
                    else:
                        task_dict = get_or_create_task_notification(
                            task_id,
                            name='bag download',
                            payload=res.bag_url,
                            username=user_id)
                        return JsonResponse(task_dict)
            else:
                ret_status = create_bag_by_irods(res_id)
                if not ret_status:
                    content_msg = "Bag cannot be created successfully. Check log for details."
                    response = HttpResponse()
                    response.content = content_msg
                    return response
        elif request.is_ajax():
            task_dict = {
                'id': datetime.datetime.today().isoformat(),
                'name': "bag download",
                'status': "completed",
                'payload': res.bag_url
            }
            return JsonResponse(task_dict)
    else:  # regular file download
        # if fetching main metadata files, then these need to be refreshed.

        if path in [
                f"{res_id}/data/resourcemap.xml",
                f"{res_id}/data/resourcemetadata.xml",
                f"{res_id}/manifest-md5.txt", f"{res_id}/tagmanifest-md5.txt",
                f"{res_id}/readme.txt", f"{res_id}/bagit.txt"
        ]:

            bag_modified = res.getAVU("bag_modified")
            if bag_modified is None or bag_modified or not istorage.exists(
                    irods_output_path):
                res.setAVU(
                    "bag_modified", True
                )  # ensure bag_modified is set when irods_output_path does not exist
                create_bag_by_irods(res_id, False)

        # send signal for pre download file
        # TODO: does not contain subdirectory information: duplicate refreshes possible
        download_file_name = split_path_strs[-1]  # end of path
        # this logs the download request in the tracking system
        pre_download_file.send(sender=resource_cls,
                               resource=res,
                               download_file_name=download_file_name,
                               request=request)

    # If we get this far,
    # * path and irods_path point to true input
    # * output_path and irods_output_path point to true output.
    # Try to stream the file back to the requester.

    # obtain mime_type to set content_type
    mtype = 'application-x/octet-stream'
    mime_type = mimetypes.guess_type(output_path)
    if mime_type[0] is not None:
        mtype = mime_type[0]
    # retrieve file size to set up Content-Length header
    # TODO: standardize this to make it less brittle
    stdout = session.run("ils", None, "-l", irods_output_path)[0].split()
    flen = int(stdout[3])

    # Allow reverse proxy if request was forwarded by nginx (HTTP_X_DJANGO_REVERSE_PROXY='true')
    # and reverse proxy is possible according to configuration (SENDFILE_ON=True)
    # and reverse proxy isn't overridden by user (use_reverse_proxy=True).

    if use_reverse_proxy and getattr(settings, 'SENDFILE_ON', False) and \
       'HTTP_X_DJANGO_REVERSE_PROXY' in request.META:

        # The NGINX sendfile abstraction is invoked as follows:
        # 1. The request to download a file enters this routine via the /rest_download or /download
        #    url in ./urls.py. It is redirected here from Django. The URI contains either the
        #    unqualified resource path or the federated resource path, depending upon whether
        #    the request is local or federated.
        # 2. This deals with unfederated resources by redirecting them to the uri
        #    /irods-data/{resource-id}/... on nginx. This URI is configured to read the file
        #    directly from the iRODS vault via NFS, and does not work for direct access to the
        #    vault due to the 'internal;' declaration in NGINX.
        # 3. If there is no vault available for the resource, the file is transferred without
        #    NGINX, exactly as it was transferred previously.

        # stop NGINX targets that are non-existent from hanging forever.
        if not istorage.exists(irods_output_path):
            content_msg = "file path {} does not exist in iRODS".format(
                output_path)
            response = HttpResponse(status=404)
            response.content = content_msg
            return response

        # track download count
        res.update_download_count()
        # invoke X-Accel-Redirect on physical vault file in nginx
        response = HttpResponse(content_type=mtype)
        response[
            'Content-Disposition'] = 'attachment; filename="{name}"'.format(
                name=output_path.split('/')[-1])
        response['Content-Length'] = flen
        response['X-Accel-Redirect'] = '/'.join(
            [getattr(settings, 'IRODS_DATA_URI', '/irods-data'), output_path])
        if __debug__:
            logger.debug("Reverse proxying local {}".format(
                response['X-Accel-Redirect']))
        return response

    # if we get here, none of the above conditions are true
    # if reverse proxy is enabled, then this is because the resource is remote and federated
    # OR the user specifically requested a non-proxied download.

    options = ('-', )  # we're redirecting to stdout.
    # this unusual way of calling works for streaming federated or local resources
    if __debug__:
        logger.debug("Locally streaming {}".format(output_path))
    # track download count
    res.update_download_count()
    proc = session.run_safe('iget', None, irods_output_path, *options)
    response = FileResponse(proc.stdout, content_type=mtype)
    response['Content-Disposition'] = 'attachment; filename="{name}"'.format(
        name=output_path.split('/')[-1])
    response['Content-Length'] = flen
    return response
Exemple #14
0
def _report_exists(istorage, label, path):
    """Print whether *path* exists in iRODS (prefixed by *label*) and return the result."""
    exists = istorage.exists(path)
    if exists:
        print("{} {} found".format(label, path))
    else:
        print("{} {} NOT FOUND".format(label, path))
    return exists


def _delete_and_report(istorage, path, failure_format="{} delete failed: {}"):
    """Delete *path* from iRODS, printing the outcome; a SessionException is reported, not raised."""
    try:
        istorage.delete(path)
        print("{} deleted".format(path))
    except SessionException as ex:
        print(failure_format.format(path, ex.stderr))


def _reset_metadata_files(resource, istorage, rid,
                          scimeta_failure_format="{} delete failed: {}"):
    """Flag the metadata as dirty and delete the science-metadata and resource-map files."""
    resource.setAVU('metadata_dirty', 'true')
    print("{}.metadata_dirty set to true".format(rid))
    _delete_and_report(istorage, resource.scimeta_path, scimeta_failure_format)
    _delete_and_report(istorage, resource.resmap_path)


def _reset_bag_file(resource, istorage, rid):
    """Flag the bag as modified and delete the bag file."""
    resource.setAVU('bag_modified', 'true')
    print("{}.bag_modified set to true".format(rid))
    _delete_and_report(istorage, resource.bag_path)


def _fetch_bag(rid, options, first_block_only=False):
    """Request the resource bag over HTTP and write it (or only its first block) to disk."""
    server = getattr(settings, 'FQDN_OR_IP', 'www.hydroshare.org')
    uri = "https://{}/hsapi/resource/{}/".format(server, rid)
    print("download uri is {}".format(uri))
    # NOTE(review): verify=False disables TLS certificate verification; this looks
    # intentional for a diagnostic command run against internal hosts -- confirm.
    r = hs_requests.get(uri, verify=False, stream=True,
                        auth=requests.auth.HTTPBasicAuth(options['login'],
                                                         options['password']))
    try:
        print("download return status is {}".format(str(r.status_code)))
        print("redirects:")
        for thing in r.history:
            print("...url: {}".format(thing.url))
        # relative path: resolved against the current working directory
        filename = 'tmp/check_bag_block'
        with open(filename, 'wb') as fd:
            for chunk in r.iter_content(chunk_size=128):
                fd.write(chunk)
                if first_block_only:
                    break
    finally:
        # A streamed response keeps its connection checked out until it is fully
        # consumed or closed; close explicitly because the body may be abandoned
        # after the first block.
        r.close()


def check_bag(rid, options):
    """
    Print the metadata/bag status of one resource and optionally repair it.

    The status report covers the existence of the science metadata file, the
    resource map and the bag in iRODS, plus the 'metadata_dirty' and
    'bag_modified' AVUs. The actions requested in *options* are then executed
    in this fixed order: reset, reset_metadata, reset_bag, generate,
    generate_metadata, generate_bag, download_bag, open_bag.

    :param rid: short id of the resource to check.
    :param options: mapping of command options. Keys read here: 'reset',
        'reset_metadata', 'reset_bag', 'generate', 'generate_metadata',
        'generate_bag', 'if_needed', 'download_bag', 'open_bag', 'login',
        'password'.
    :return: None. All results are reported on stdout.
    """
    requests.packages.urllib3.disable_warnings()
    try:
        resource = BaseResource.objects.get(short_id=rid)
        istorage = resource.get_irods_storage()

        if not istorage.exists(resource.root_path):
            print("Resource with id {} does not exist in iRODS".format(rid))
            return

        # print status of metadata/bag system
        scimeta_path = os.path.join(resource.root_path, 'data',
                                    'resourcemetadata.xml')
        scimeta_exists = _report_exists(istorage, "resource metadata", scimeta_path)

        resmap_path = os.path.join(resource.root_path, 'data', 'resourcemap.xml')
        resmap_exists = _report_exists(istorage, "resource map", resmap_path)

        bag_exists = _report_exists(istorage, "bag", resource.bag_path)

        dirty = resource.getAVU('metadata_dirty')
        print("{}.metadata_dirty is {}".format(rid, str(dirty)))

        modified = resource.getAVU('bag_modified')
        print("{}.bag_modified is {}".format(rid, str(modified)))

        # NOTE(review): the flags and existence results above are sampled once,
        # before any reset below; the 'if_needed' checks in the generate steps
        # therefore see the pre-reset state -- confirm this is intended.

        if options['reset']:  # reset all data to pristine
            _reset_metadata_files(resource, istorage, rid)
            _reset_bag_file(resource, istorage, rid)

        if options['reset_metadata']:
            _reset_metadata_files(resource, istorage, rid,
                                  scimeta_failure_format="delete of {} failed: {}")

        if options['reset_bag']:
            _reset_bag_file(resource, istorage, rid)

        if options['generate']:  # generate usable bag
            if not options['if_needed'] or dirty or not scimeta_exists or not resmap_exists:
                try:
                    create_bag_files(resource)
                except ValueError as e:
                    # format the exception itself: Python 3 exceptions have no .message
                    print("{}: value error encountered: {}".format(rid, e))
                    return

                print("{} metadata generated from Django".format(rid))
                resource.setAVU('metadata_dirty', 'false')
                resource.setAVU('bag_modified', 'true')
                print("{}.metadata_dirty set to false".format(rid))

            if not options['if_needed'] or modified or not bag_exists:
                create_bag_by_irods(rid)
                print("{} bag generated from iRODs".format(rid))
                resource.setAVU('bag_modified', 'false')
                print("{}.bag_modified set to false".format(rid))

        if options['generate_metadata']:
            if not options['if_needed'] or dirty or not scimeta_exists or not resmap_exists:
                try:
                    create_bag_files(resource)
                except ValueError as e:
                    # format the exception itself: Python 3 exceptions have no .message
                    print("{}: value error encountered: {}".format(rid, e))
                    return
                print("{}: metadata generated from Django".format(rid))
                resource.setAVU('metadata_dirty', 'false')
                print("{}.metadata_dirty set to false".format(rid))
                # fresh metadata files make the existing bag stale
                resource.setAVU('bag_modified', 'true')
                print("{}.bag_modified set to true".format(rid))

        if options['generate_bag']:
            if not options['if_needed'] or modified or not bag_exists:
                create_bag_by_irods(rid)
                print("{}: bag generated from iRODs".format(rid))
                resource.setAVU('bag_modified', 'false')
                print("{}.bag_modified set to false".format(rid))

        if options['download_bag']:
            if options['password']:
                _fetch_bag(rid, options)
            else:
                print("cannot download bag without username and password.")

        if options['open_bag']:
            if options['password']:
                # only the first block is read: enough to prove the bag can be opened
                _fetch_bag(rid, options, first_block_only=True)
            else:
                print("cannot open bag without username and password.")
    except BaseResource.DoesNotExist:
        print("Resource with id {} NOT FOUND in Django".format(rid))
Exemple #15
0
def download(request, path, rest_call=False, use_async=True, use_reverse_proxy=True,
             *args, **kwargs):
    """
    Serve a resource file, an on-demand zip, or a resource bag from iRODS.

    The first path segment selects the kind of download: 'bags/...' is a bag
    download, 'zips/...' is a zip download, and anything else is treated as
    '<res_id>/...', a path inside the resource.

    :param request: the incoming Django request.
    :param path: the requested path, taken from the URL.
    :param rest_call: when True, error bodies are plain text instead of being
        wrapped in <h1>, an unauthorized request raises PermissionDenied, and
        asynchronous work is answered with a JSON status document.
    :param use_async: when True, zip and bag creation are queued as tasks and
        the function returns before the file is available.
    :param use_reverse_proxy: when True, the download may be handed to NGINX via
        X-Accel-Redirect, provided SENDFILE_ON is set and the request carries
        HTTP_X_DJANGO_REVERSE_PROXY.
    :param kwargs: an optional 'environment' entry (a RodsEnvironment primary
        key) selects a dedicated iRODS session for non-federated resources.
    :return: an HttpResponse, HttpResponseRedirect or FileResponse.
    :raises PermissionDenied: if rest_call is True and the user may not view
        the resource.
    :raises KeyError: if no iRODS session can be determined.
    """
    split_path_strs = path.split('/')
    is_bag_download = False
    is_zip_download = False
    is_sf_agg_file = False
    if split_path_strs[0] == 'bags':
        res_id = os.path.splitext(split_path_strs[1])[0]
        is_bag_download = True
    elif split_path_strs[0] == 'zips':
        if path.endswith('.zip'):
            res_id = os.path.splitext(split_path_strs[2])[0]
        else:
            res_id = os.path.splitext(split_path_strs[1])[0]
        is_zip_download = True
    else:
        res_id = split_path_strs[0]

    # if the resource does not exist in django, authorized will be false
    res, authorized, _ = authorize(request, res_id,
                                   needed_permission=ACTION_TO_AUTHORIZE.VIEW_RESOURCE,
                                   raises_exception=False)
    if not authorized:
        response = HttpResponse(status=401)
        content_msg = "You do not have permission to download this resource!"
        if rest_call:
            raise PermissionDenied(content_msg)
        else:
            response.content = "<h1>" + content_msg + "</h1>"
            return response

    if res.resource_type == "CompositeResource" and not path.endswith(".zip"):
        # a file that is a single-file aggregation is delivered as a zip;
        # any() stops scanning the resource's files as soon as one qualifies
        is_sf_agg_file = any(
            path == f.storage_path and f.has_logical_file
            and f.logical_file.is_single_file_aggregation
            for f in ResourceFile.objects.filter(object_id=res.id))

    if res.resource_federation_path:
        # the resource is stored in federated zone
        istorage = IrodsStorage('federated')
        federated_path = res.resource_federation_path
        path = os.path.join(federated_path, path)
        session = icommands.ACTIVE_SESSION
    else:
        # TODO: From Alva: I do not understand the use case for changing the environment.
        # TODO: This seems an enormous potential vulnerability, as arguments are
        # TODO: passed from the URI directly to IRODS without verification.
        istorage = IrodsStorage()
        federated_path = ''
        if 'environment' in kwargs:
            environment = int(kwargs['environment'])
            environment = m.RodsEnvironment.objects.get(pk=environment)
            session = Session("/tmp/django_irods", settings.IRODS_ICOMMANDS_PATH,
                              session_id=uuid4())
            session.create_environment(environment)
            session.run('iinit', None, environment.auth)
        elif getattr(settings, 'IRODS_GLOBAL_SESSION', False):
            session = GLOBAL_SESSION
        elif icommands.ACTIVE_SESSION:
            session = icommands.ACTIVE_SESSION
        else:
            raise KeyError('settings must have IRODS_GLOBAL_SESSION set '
                           'if there is no environment object')

    resource_cls = check_resource_type(res.resource_type)

    if federated_path:
        res_root = os.path.join(federated_path, res_id)
    else:
        res_root = res_id

    if is_zip_download or is_sf_agg_file:
        if not path.endswith(".zip"):  # requesting folder that needs to be zipped
            input_path = path.split(res_id)[1]
            random_hash = random.getrandbits(32)
            daily_date = datetime.datetime.today().strftime('%Y-%m-%d')
            random_hash_path = 'zips/{daily_date}/{res_id}/{rand_folder}'.format(
                daily_date=daily_date, res_id=res_id,
                rand_folder=random_hash)
            output_path = '{random_hash_path}{path}.zip'.format(random_hash_path=random_hash_path,
                                                                path=input_path)

            if res.resource_type == "CompositeResource":
                aggregation_name = input_path[len('/data/contents/'):]
                res.create_aggregation_xml_documents(aggregation_name=aggregation_name)

            if use_async:
                task = create_temp_zip.apply_async((res_id, input_path, output_path,
                                                    is_sf_agg_file), countdown=3)
                delete_zip.apply_async((random_hash_path, ),
                                       countdown=(20 * 60))  # delete after 20 minutes
                if is_sf_agg_file:
                    download_path = request.path.split(res_id)[0] + output_path
                else:
                    download_path = request.path.split("zips")[0] + output_path
                if rest_call:
                    return HttpResponse(json.dumps({'zip_status': 'Not ready',
                                                    'task_id': task.task_id,
                                                    'download_path': download_path}),
                                        content_type="application/json")
                # the task id and download path are stored in the session and the
                # user is redirected to the resource page
                request.session['task_id'] = task.task_id
                request.session['download_path'] = download_path
                return HttpResponseRedirect(res.get_absolute_url())

            ret_status = create_temp_zip(res_id, input_path, output_path, is_sf_agg_file)
            delete_zip.apply_async((random_hash_path, ),
                                   countdown=(20 * 60))  # delete after 20 minutes
            if not ret_status:
                content_msg = "Zip cannot be created successfully. Check log for details."
                response = HttpResponse()
                if rest_call:
                    response.content = content_msg
                else:
                    response.content = "<h1>" + content_msg + "</h1>"
                return response

            # from here on, the freshly created zip is the file to be delivered
            path = output_path

    bag_modified = istorage.getAVU(res_root, 'bag_modified')
    # make sure if bag_modified is not set to true, we still recreate the bag if the
    # bag file does not exist for some reason to resolve the error to download a nonexistent
    # bag when bag_modified is false due to the flag being out-of-sync with the real bag status
    if bag_modified is None or bag_modified.lower() == "false":
        # check whether the bag file exists
        bag_file_name = res_id + '.zip'
        if res_root.startswith(res_id):
            bag_full_path = os.path.join('bags', bag_file_name)
        else:
            bag_full_path = os.path.join(federated_path, 'bags', bag_file_name)
        # set bag_modified to 'true' if the bag does not exist so that it can be recreated
        # and the bag_modified AVU will be set correctly as well subsequently
        if not istorage.exists(bag_full_path):
            bag_modified = 'true'

    metadata_dirty = istorage.getAVU(res_root, 'metadata_dirty')
    # do on-demand bag creation
    # needs to check whether res_id collection exists before getting/setting AVU on it
    # to accommodate the case where the very same resource gets deleted by another request
    # when it is getting downloaded

    if is_bag_download:
        # send signal for pre_check_bag_flag
        pre_check_bag_flag.send(sender=resource_cls, resource=res)
        if bag_modified is None or bag_modified.lower() == "true":
            if metadata_dirty is None or metadata_dirty.lower() == 'true':
                create_bag_files(res)
            if use_async:
                # task parameter has to be passed in as a tuple or list, hence (res_id,) is needed
                # Note that since we are using JSON for task parameter serialization, no complex
                # object can be passed as parameters to a celery task
                task = create_bag_by_irods.apply_async((res_id,), countdown=3)
                if rest_call:
                    return HttpResponse(json.dumps({'bag_status': 'Not ready',
                                                    'task_id': task.task_id}),
                                        content_type="application/json")

                request.session['task_id'] = task.task_id
                request.session['download_path'] = request.path
                return HttpResponseRedirect(res.get_absolute_url())
            else:
                ret_status = create_bag_by_irods(res_id)
                if not ret_status:
                    content_msg = "Bag cannot be created successfully. Check log for details."
                    response = HttpResponse()
                    if rest_call:
                        response.content = content_msg
                    else:
                        response.content = "<h1>" + content_msg + "</h1>"
                    return response

    elif metadata_dirty is None or metadata_dirty.lower() == 'true':
        if path.endswith("resourcemap.xml") or path.endswith('resourcemetadata.xml'):
            # we need to regenerate the metadata xml files
            create_bag_files(res)

    # send signal for pre download file
    download_file_name = split_path_strs[-1]
    pre_download_file.send(sender=resource_cls, resource=res,
                           download_file_name=download_file_name,
                           request=request)

    # obtain mime_type to set content_type
    mtype = 'application-x/octet-stream'
    mime_type = mimetypes.guess_type(path)
    if mime_type[0] is not None:
        mtype = mime_type[0]
    # retrieve file size to set up Content-Length header
    # (the size is read from the fourth whitespace-separated token of `ils -l` output)
    stdout = session.run("ils", None, "-l", path)[0].split()
    flen = int(stdout[3])

    # If this path is resource_federation_path, then the file is a local user file
    userpath = '/' + os.path.join(
        getattr(settings, 'HS_USER_IRODS_ZONE', 'hydroshareuserZone'),
        'home',
        getattr(settings, 'HS_LOCAL_PROXY_USER_IN_FED_ZONE', 'localHydroProxy'))

    # Allow reverse proxy if request was forwarded by nginx
    # (HTTP_X_DJANGO_REVERSE_PROXY is 'true')
    # and reverse proxy is possible according to configuration.

    if use_reverse_proxy and getattr(settings, 'SENDFILE_ON', False) and \
       'HTTP_X_DJANGO_REVERSE_PROXY' in request.META:

        # The NGINX sendfile abstraction is invoked as follows:
        # 1. The request to download a file enters this routine via the /rest_download or /download
        #    url in ./urls.py. It is redirected here from Django. The URI contains either the
        #    unqualified resource path or the federated resource path, depending upon whether
        #    the request is local or federated.
        # 2. This deals with unfederated resources by redirecting them to the uri
        #    /irods-data/{resource-id}/... on nginx. This URI is configured to read the file
        #    directly from the iRODS vault via NFS, and does not work for direct access to the
        #    vault due to the 'internal;' declaration in NGINX.
        # 3. This deals with federated resources by reading their path, matching local vaults, and
        #    redirecting to URIs that are in turn mapped to read from appropriate iRODS vaults. At
        #    present, the only one of these is /irods-user, which handles files whose federation
        #    path is stored in the variable 'userpath'.
        # 4. If there is no vault available for the resource, the file is transferred without
        #    NGINX, exactly as it was transferred previously.

        # stop NGINX targets that are non-existent from hanging forever.
        if not istorage.exists(path):
            # path originates from the request URL and the response is served with
            # the default HTML content type, so it must be escaped before being echoed
            from django.utils.html import escape
            content_msg = "file path {} does not exist in iRODS".format(escape(path))
            response = HttpResponse(status=404)
            if rest_call:
                response.content = content_msg
            else:
                response.content = "<h1>" + content_msg + "</h1>"
            return response

        if not res.is_federated:
            # invoke X-Accel-Redirect on physical vault file in nginx
            response = HttpResponse(content_type=mtype)
            response['Content-Disposition'] = 'attachment; filename="{name}"'.format(
                name=path.split('/')[-1])
            response['Content-Length'] = flen
            response['X-Accel-Redirect'] = '/'.join([
                getattr(settings, 'IRODS_DATA_URI', '/irods-data'), path])
            return response

        elif res.resource_federation_path == userpath:  # this guarantees a "user" resource
            # invoke X-Accel-Redirect on physical vault file in nginx
            # if path is full user path; strip federation prefix
            if path.startswith(userpath):
                path = path[len(userpath)+1:]
            # invoke X-Accel-Redirect on physical vault file in nginx
            response = HttpResponse(content_type=mtype)
            response['Content-Disposition'] = 'attachment; filename="{name}"'.format(
                name=path.split('/')[-1])
            response['Content-Length'] = flen
            response['X-Accel-Redirect'] = os.path.join(
                getattr(settings, 'IRODS_USER_URI', '/irods-user'), path)
            return response

    # if we get here, none of the above conditions are true
    if flen <= FILE_SIZE_LIMIT:
        options = ('-',)  # we're redirecting to stdout.
        # this unusual way of calling works for federated or local resources
        proc = session.run_safe('iget', None, path, *options)
        response = FileResponse(proc.stdout, content_type=mtype)
        response['Content-Disposition'] = 'attachment; filename="{name}"'.format(
            name=path.split('/')[-1])
        response['Content-Length'] = flen
        return response

    else:
        content_msg = "File larger than 1GB cannot be downloaded directly via HTTP. " \
                      "Please download the large file via iRODS clients."
        response = HttpResponse(status=403)
        if rest_call:
            response.content = content_msg
        else:
            response.content = "<h1>" + content_msg + "</h1>"
        return response
Exemple #16
0
def download(request,
             path,
             rest_call=False,
             use_async=True,
             use_reverse_proxy=True,
             *args,
             **kwargs):
    """
    Serve a resource file, a resource bag, or an on-demand zip out of iRODS.

    The resource id is parsed out of ``path`` (extension stripped where noted):

    * ``bags/<res_id>.zip``: bag download; the id is the second path component.
    * ``zips/...``: zip download; the id is the third component when ``path`` ends
      with ``.zip`` and the second component otherwise.
    * anything else: the id is the first path component.

    After the authorization check the function may create a zip (for a zip request
    or a single-file aggregation of a CompositeResource), create or refresh the bag
    and/or the metadata xml files, and finally returns the content either through an
    nginx ``X-Accel-Redirect`` header or by streaming the stdout of ``iget``.

    :param request: the Django request; ``request.session``, ``request.path`` and
        ``request.META`` are used.
    :param path: download path relative to the iRODS home (see the forms above).
    :param rest_call: when True, message bodies are plain text instead of being
        wrapped in ``<h1>``, an authorization failure raises instead of returning a
        response, and asynchronous zip/bag creation answers with a JSON document.
    :param use_async: when True, zip and bag creation are queued with
        ``apply_async`` and the function returns before the file exists; when False
        the creation functions are called inline and the file is served afterwards.
    :param use_reverse_proxy: when True (and ``settings.SENDFILE_ON`` is truthy and
        the request carries ``HTTP_X_DJANGO_REVERSE_PROXY``), hand the transfer to
        nginx via ``X-Accel-Redirect`` where a matching vault URI exists.
    :param kwargs: an optional ``environment`` entry is read as the primary key of a
        ``RodsEnvironment`` used to build a dedicated iRODS session; it is only
        consulted when the resource has no federation path.
    :return: an ``HttpResponse``, ``HttpResponseRedirect`` or ``FileResponse``.
        Explicit error statuses set here are 401 (not authorized, non-REST),
        404 (path missing, reverse-proxy branch only) and 403 (file larger than
        ``FILE_SIZE_LIMIT``).
    :raises PermissionDenied: if ``rest_call`` is True and the user is not authorized.
    :raises KeyError: if the resource is not federated and no iRODS session can be
        selected from kwargs, settings or ``icommands.ACTIVE_SESSION``.
    """
    split_path_strs = path.split('/')
    is_bag_download = False
    is_zip_download = False
    is_sf_agg_file = False
    # classify the request and extract the resource id from the path
    if split_path_strs[0] == 'bags':
        res_id = os.path.splitext(split_path_strs[1])[0]
        is_bag_download = True
    elif split_path_strs[0] == 'zips':
        if path.endswith('.zip'):
            # an already-built zip lives under zips/<date>/<res_id>/... (see output_path below)
            res_id = os.path.splitext(split_path_strs[2])[0]
        else:
            # a request to zip a folder: zips/<res_id>/...
            res_id = os.path.splitext(split_path_strs[1])[0]
        is_zip_download = True
    else:
        res_id = split_path_strs[0]

    # if the resource does not exist in django, authorized will be false
    res, authorized, _ = authorize(
        request,
        res_id,
        needed_permission=ACTION_TO_AUTHORIZE.VIEW_RESOURCE,
        raises_exception=False)
    if not authorized:
        response = HttpResponse(status=401)
        content_msg = "You do not have permission to download this resource!"
        if rest_call:
            # REST callers get an exception rather than the 401 response built above
            raise PermissionDenied(content_msg)
        else:
            response.content = "<h1>" + content_msg + "</h1>"
            return response

    # A file of a composite resource that is a single-file aggregation is routed
    # through the zip branch below instead of being served directly.
    # The comparison uses the path as requested, before any federation prefix is added.
    if res.resource_type == "CompositeResource" and not path.endswith(".zip"):
        for f in ResourceFile.objects.filter(object_id=res.id):
            if path == f.storage_path:
                if f.has_logical_file and f.logical_file.is_single_file_aggregation:
                    is_sf_agg_file = True

    # choose the storage object and the icommands session; prefix path when federated
    if res.resource_federation_path:
        # the resource is stored in federated zone
        istorage = IrodsStorage('federated')
        federated_path = res.resource_federation_path
        path = os.path.join(federated_path, path)
        session = icommands.ACTIVE_SESSION
    else:
        # TODO: From Alva: I do not understand the use case for changing the environment.
        # TODO: This seems an enormous potential vulnerability, as arguments are
        # TODO: passed from the URI directly to IRODS without verification.
        istorage = IrodsStorage()
        federated_path = ''
        if 'environment' in kwargs:
            # build and initialize a one-off session for the requested environment
            environment = int(kwargs['environment'])
            environment = m.RodsEnvironment.objects.get(pk=environment)
            session = Session("/tmp/django_irods",
                              settings.IRODS_ICOMMANDS_PATH,
                              session_id=uuid4())
            session.create_environment(environment)
            session.run('iinit', None, environment.auth)
        elif getattr(settings, 'IRODS_GLOBAL_SESSION', False):
            session = GLOBAL_SESSION
        elif icommands.ACTIVE_SESSION:
            session = icommands.ACTIVE_SESSION
        else:
            raise KeyError('settings must have IRODS_GLOBAL_SESSION set '
                           'if there is no environment object')

    # resource class is used as the sender of the signals emitted below
    resource_cls = check_resource_type(res.resource_type)

    # res_root is the resource collection that the AVUs are read from
    if federated_path:
        res_root = os.path.join(federated_path, res_id)
    else:
        res_root = res_id

    if is_zip_download or is_sf_agg_file:
        if not path.endswith(
                ".zip"):  # requesting folder that needs to be zipped
            # the part of the path that follows the resource id
            input_path = path.split(res_id)[1]
            # the zip is written below a per-day, per-resource folder with a random name
            random_hash = random.getrandbits(32)
            daily_date = datetime.datetime.today().strftime('%Y-%m-%d')
            random_hash_path = 'zips/{daily_date}/{res_id}/{rand_folder}'.format(
                daily_date=daily_date, res_id=res_id, rand_folder=random_hash)
            output_path = '{random_hash_path}{path}.zip'.format(
                random_hash_path=random_hash_path, path=input_path)

            if res.resource_type == "CompositeResource":
                # drops the first len('/data/contents/') characters;
                # assumes input_path starts with '/data/contents/' - TODO confirm
                aggregation_name = input_path[len('/data/contents/'):]
                res.create_aggregation_xml_documents(
                    aggregation_name=aggregation_name)

            if use_async:
                # queue zip creation, queue its cleanup, and return without waiting
                task = create_temp_zip.apply_async(
                    (res_id, input_path, output_path, is_sf_agg_file),
                    countdown=3)
                delete_zip.apply_async(
                    (random_hash_path, ),
                    countdown=(20 * 60))  # delete after 20 minutes
                # rebuild the URL the client should fetch once the zip is ready
                if is_sf_agg_file:
                    download_path = request.path.split(res_id)[0] + output_path
                else:
                    download_path = request.path.split("zips")[0] + output_path
                if rest_call:
                    return HttpResponse(json.dumps({
                        'zip_status':
                        'Not ready',
                        'task_id':
                        task.task_id,
                        'download_path':
                        download_path
                    }),
                                        content_type="application/json")
                # non-REST: stash the task info in the session and go to the resource page
                request.session['task_id'] = task.task_id
                request.session['download_path'] = download_path
                return HttpResponseRedirect(res.get_absolute_url())

            # synchronous zip creation; cleanup is still scheduled through celery
            ret_status = create_temp_zip(res_id, input_path, output_path,
                                         is_sf_agg_file)
            delete_zip.apply_async(
                (random_hash_path, ),
                countdown=(20 * 60))  # delete after 20 minutes
            if not ret_status:
                content_msg = "Zip cannot be created successfully. Check log for details."
                # NOTE(review): no status is set on this failure response, so it carries
                # HttpResponse's default status - confirm whether an error status is intended
                response = HttpResponse()
                if rest_call:
                    response.content = content_msg
                else:
                    response.content = "<h1>" + content_msg + "</h1>"
                return response

            # from here on serve the freshly created zip
            # NOTE(review): output_path has no federation prefix, unlike the path it
            # replaces - confirm this is right for federated resources
            path = output_path

    bag_modified = istorage.getAVU(res_root, 'bag_modified')
    # Even when bag_modified is not set to true, recreate the bag if the bag file does
    # not exist, so that a download does not fail on a nonexistent bag when the flag is
    # out of sync with the real bag status.
    if bag_modified is None or bag_modified.lower() == "false":
        # check whether the bag file exists
        bag_file_name = res_id + '.zip'
        if res_root.startswith(res_id):
            # res_root is the bare resource id, i.e. no federation prefix
            bag_full_path = os.path.join('bags', bag_file_name)
        else:
            bag_full_path = os.path.join(federated_path, 'bags', bag_file_name)
        # set bag_modified to 'true' if the bag does not exist so that it can be recreated
        # and the bag_modified AVU will be set correctly as well subsequently
        if not istorage.exists(bag_full_path):
            bag_modified = 'true'

    metadata_dirty = istorage.getAVU(res_root, 'metadata_dirty')
    # do on-demand bag creation
    # needs to check whether res_id collection exists before getting/setting AVU on it
    # to accommodate the case where the very same resource gets deleted by another request
    # when it is getting downloaded
    # NOTE(review): no such existence check is performed in this function; the AVUs above
    # are read unconditionally - confirm whether the check is done inside getAVU

    if is_bag_download:
        # send signal for pre_check_bag_flag
        pre_check_bag_flag.send(sender=resource_cls, resource=res)
        # NOTE(review): bag_modified and metadata_dirty were read before this signal was
        # sent, so changes made by signal receivers are not seen here - confirm intended
        if bag_modified is None or bag_modified.lower() == "true":
            # a missing AVU is treated the same as 'true'
            if metadata_dirty is None or metadata_dirty.lower() == 'true':
                create_bag_files(res)
            if use_async:
                # task parameter has to be passed in as a tuple or list, hence (res_id,) is needed
                # Note that since we are using JSON for task parameter serialization, no complex
                # object can be passed as parameters to a celery task
                task = create_bag_by_irods.apply_async((res_id, ), countdown=3)
                if rest_call:
                    return HttpResponse(json.dumps({
                        'bag_status': 'Not ready',
                        'task_id': task.task_id
                    }),
                                        content_type="application/json")

                # non-REST: stash the task info in the session and go to the resource page
                request.session['task_id'] = task.task_id
                request.session['download_path'] = request.path
                return HttpResponseRedirect(res.get_absolute_url())
            else:
                # synchronous bag creation; on success fall through and serve the bag
                ret_status = create_bag_by_irods(res_id)
                if not ret_status:
                    content_msg = "Bag cannot be created successfully. Check log for details."
                    # NOTE(review): as with the zip failure above, no error status is set
                    response = HttpResponse()
                    if rest_call:
                        response.content = content_msg
                    else:
                        response.content = "<h1>" + content_msg + "</h1>"
                    return response

    elif metadata_dirty is None or metadata_dirty.lower() == 'true':
        # not a bag download: only the two metadata documents need refreshing
        if path.endswith("resourcemap.xml") or path.endswith(
                'resourcemetadata.xml'):
            # we need to regenerate the metadata xml files
            create_bag_files(res)

    # send signal for pre download file
    # (the file name is the last component of the path as originally requested)
    download_file_name = split_path_strs[-1]
    pre_download_file.send(sender=resource_cls,
                           resource=res,
                           download_file_name=download_file_name,
                           request=request)

    # obtain mime_type to set content_type
    mtype = 'application-x/octet-stream'
    mime_type = mimetypes.guess_type(path)
    if mime_type[0] is not None:
        mtype = mime_type[0]
    # retrieve file size to set up Content-Length header:
    # the fourth whitespace-separated token of the `ils -l` output is taken as the size
    # NOTE(review): this runs before the existence check in the reverse-proxy branch below,
    # so what happens for a missing path depends on session.run - confirm
    stdout = session.run("ils", None, "-l", path)[0].split()
    flen = int(stdout[3])

    # If this path is resource_federation_path, then the file is a local user file
    userpath = '/' + os.path.join(
        getattr(settings, 'HS_USER_IRODS_ZONE', 'hydroshareuserZone'), 'home',
        getattr(settings, 'HS_LOCAL_PROXY_USER_IN_FED_ZONE',
                'localHydroProxy'))

    # Allow reverse proxy if request was forwarded by nginx
    # (HTTP_X_DJANGO_REVERSE_PROXY is 'true')
    # and reverse proxy is possible according to configuration.
    # Only the presence of the header is tested here, not its value.

    if use_reverse_proxy and getattr(settings, 'SENDFILE_ON', False) and \
       'HTTP_X_DJANGO_REVERSE_PROXY' in request.META:

        # The NGINX sendfile abstraction is invoked as follows:
        # 1. The request to download a file enters this routine via the /rest_download or /download
        #    url in ./urls.py. It is redirected here from Django. The URI contains either the
        #    unqualified resource path or the federated resource path, depending upon whether
        #    the request is local or federated.
        # 2. This deals with unfederated resources by redirecting them to the uri
        #    /irods-data/{resource-id}/... on nginx. This URI is configured to read the file
        #    directly from the iRODS vault via NFS, and does not work for direct access to the
        #    vault due to the 'internal;' declaration in NGINX.
        # 3. This deals with federated resources by reading their path, matching local vaults, and
        #    redirecting to URIs that are in turn mapped to read from appropriate iRODS vaults. At
        #    present, the only one of these is /irods-user, which handles files whose federation
        #    path is stored in the variable 'userpath'.
        # 4. If there is no vault available for the resource, the file is transferred without
        #    NGINX, exactly as it was transferred previously.

        # stop NGINX targets that are non-existent from hanging forever.
        if not istorage.exists(path):
            content_msg = "file path {} does not exist in iRODS".format(path)
            response = HttpResponse(status=404)
            if rest_call:
                response.content = content_msg
            else:
                response.content = "<h1>" + content_msg + "</h1>"
            return response

        if not res.is_federated:
            # invoke X-Accel-Redirect on physical vault file in nginx
            # (the response body is left empty; only headers are set here)
            response = HttpResponse(content_type=mtype)
            response[
                'Content-Disposition'] = 'attachment; filename="{name}"'.format(
                    name=path.split('/')[-1])
            response['Content-Length'] = flen
            response['X-Accel-Redirect'] = '/'.join(
                [getattr(settings, 'IRODS_DATA_URI', '/irods-data'), path])
            return response

        elif res.resource_federation_path == userpath:  # this guarantees a "user" resource
            # if path is full user path; strip federation prefix
            # (the +1 also drops the '/' that follows the prefix)
            if path.startswith(userpath):
                path = path[len(userpath) + 1:]
            # invoke X-Accel-Redirect on physical vault file in nginx
            response = HttpResponse(content_type=mtype)
            response[
                'Content-Disposition'] = 'attachment; filename="{name}"'.format(
                    name=path.split('/')[-1])
            response['Content-Length'] = flen
            response['X-Accel-Redirect'] = os.path.join(
                getattr(settings, 'IRODS_USER_URI', '/irods-user'), path)
            return response

        # any other federated resource falls through to the direct transfer below

    # if we get here, none of the above conditions are true
    if flen <= FILE_SIZE_LIMIT:
        options = ('-', )  # we're redirecting to stdout.
        # this unusual way of calling works for federated or local resources
        proc = session.run_safe('iget', None, path, *options)
        # stream the stdout of the iget process back as the response body
        response = FileResponse(proc.stdout, content_type=mtype)
        response[
            'Content-Disposition'] = 'attachment; filename="{name}"'.format(
                name=path.split('/')[-1])
        response['Content-Length'] = flen
        return response

    else:
        # the message states 1GB; presumably FILE_SIZE_LIMIT has that value - TODO confirm
        content_msg = "File larger than 1GB cannot be downloaded directly via HTTP. " \
                      "Please download the large file via iRODS clients."
        response = HttpResponse(status=403)
        if rest_call:
            response.content = content_msg
        else:
            response.content = "<h1>" + content_msg + "</h1>"
        return response