# NOTE(review): this module defines `download` more than once; a later identical
# definition shadows this one at import time. Confirm which copy is intended and
# delete the rest.
def download(request, path, rest_call=False, use_async=True, use_reverse_proxy=True,
             *args, **kwargs):
    """Perform a download request, either asynchronously or synchronously.

    :param request: the request object.
    :param path: the public path of the thing to be downloaded.
    :param rest_call: True if calling from REST API.
    :param use_async: True means to utilize asynchronous creation of objects to download.
    :param use_reverse_proxy: True means to utilize NGINX reverse proxy for streaming.

    The following variables are computed:

    * `path` is the public path of the thing to be downloaded.
    * `irods_path` is the location of `path` in irods.
    * `output_path` is the output path to be reported in the response object.
    * `irods_output_path` is the location of `output_path` in irods

    The `zipped` query param signals the download should be zipped
    - folders are always zipped regardless of this parameter
    - single file aggregations are zipped with the aggregation metadata files

    A path may point to one of six cases:
    1. a single file
    2. a single-file-aggregation object in a composite resource.
    3. a folder
    4. a metadata object that may need updating.
    5. a bag that needs to be updated and then returned.
    6. a previously zipped file that was zipped asynchronously.
    """
    if __debug__:
        logger.debug("request path is {}".format(path))

    # normalize the path: strip any trailing slash(es)
    split_path_strs = path.split('/')
    while split_path_strs[-1] == '':
        split_path_strs.pop()
    path = u'/'.join(split_path_strs)  # no trailing slash

    # initialize case variables
    is_bag_download = False
    is_zip_download = False
    is_zip_request = request.GET.get('zipped', "False").lower() == "true"
    is_sf_agg_file = False
    is_sf_request = False

    # classify the request and extract the resource id from the path
    if split_path_strs[0] == 'bags':
        is_bag_download = True
        # format is bags/{rid}.zip
        res_id = os.path.splitext(split_path_strs[1])[0]
    elif split_path_strs[0] == 'zips':
        is_zip_download = True
        # zips prefix means that we are following up on an asynchronous download request
        # format is zips/{date}/{zip-uuid}/{public-path}.zip where {public-path} contains the rid
        res_id = split_path_strs[3]
    else:  # regular download request
        res_id = split_path_strs[0]

    if __debug__:
        logger.debug("resource id is {}".format(res_id))

    # now we have the resource Id and can authorize the request
    # if the resource does not exist in django, authorized will be false
    res, authorized, _ = authorize(request, res_id,
                                   needed_permission=ACTION_TO_AUTHORIZE.VIEW_RESOURCE,
                                   raises_exception=False)
    if not authorized:
        response = HttpResponse(status=401)
        content_msg = "You do not have permission to download this resource!"
        if rest_call:
            raise PermissionDenied(content_msg)
        else:
            response.content = "<h1>" + content_msg + "</h1>"
            return response

    # default values are changed later as needed
    istorage = res.get_irods_storage()
    if res.is_federated:
        irods_path = os.path.join(res.resource_federation_path, path)
    else:
        irods_path = path
    # in many cases, path and output_path are the same.
    output_path = path
    irods_output_path = irods_path

    # folder requests are automatically zipped
    if not is_bag_download and not is_zip_download:  # path points into resource: should I zip it?
        store_path = u'/'.join(split_path_strs[1:])  # data/contents/{path-to-something}

        if res.is_folder(store_path):  # automatically zip folders
            is_zip_request = True
            daily_date = datetime.datetime.today().strftime('%Y-%m-%d')
            output_path = "zips/{}/{}/{}.zip".format(daily_date, uuid4().hex, path)
            if res.is_federated:
                irods_output_path = os.path.join(res.resource_federation_path, output_path)
            else:
                irods_output_path = output_path
            if __debug__:
                logger.debug("automatically zipping folder {} to {}".format(path, output_path))
        elif istorage.exists(irods_path):
            if __debug__:
                logger.debug("request for single file {}".format(path))
            is_sf_request = True

            # check for single file aggregations
            if "data/contents/" in path:  # not a metadata file
                for f in ResourceFile.objects.filter(object_id=res.id):
                    if path == f.storage_path:
                        is_sf_agg_file = True
                        if not is_zip_request and f.has_logical_file and \
                                f.logical_file.is_single_file_aggregation:
                            download_url = request.GET.get('url_download', 'false').lower()
                            if download_url == 'false':
                                # redirect to referenced url in the url file instead
                                redirect_url = f.logical_file.redirect_url
                                if redirect_url:
                                    return HttpResponseRedirect(redirect_url)
                        if __debug__:
                            logger.debug(
                                "request for single file aggregation {}".format(path))
                        break

            if is_zip_request:
                daily_date = datetime.datetime.today().strftime('%Y-%m-%d')
                output_path = "zips/{}/{}/{}.zip".format(daily_date, uuid4().hex, path)
                if res.is_federated:
                    irods_output_path = os.path.join(res.resource_federation_path, output_path)
                else:
                    irods_output_path = output_path

    # After this point, we have valid path, irods_path, output_path, and irods_output_path
    # * is_zip_request: signals download should be zipped, folders are always zipped
    # * is_sf_agg_file: path is a single-file aggregation in Composite Resource
    # * is_sf_request: path is a single-file
    # flags for download:
    # * is_bag_download: download a bag in format bags/{rid}.zip
    # * is_zip_download: download a zipfile in format zips/{date}/{random guid}/{path}.zip
    # if none of these are set, it's a normal download

    # determine active session
    if res.is_federated:
        # the resource is stored in federated zone
        session = icommands.ACTIVE_SESSION
    else:
        # TODO: From Alva: I do not understand the use case for changing the environment.
        # TODO: This seems an enormous potential vulnerability, as arguments are
        # TODO: passed from the URI directly to IRODS without verification.
        if 'environment' in kwargs:
            logger.warn("setting iRODS from environment")
            environment = int(kwargs['environment'])
            environment = m.RodsEnvironment.objects.get(pk=environment)
            session = Session("/tmp/django_irods", settings.IRODS_ICOMMANDS_PATH,
                              session_id=uuid4())
            session.create_environment(environment)
            session.run('iinit', None, environment.auth)
        elif getattr(settings, 'IRODS_GLOBAL_SESSION', False):
            if __debug__:
                logger.debug("using GLOBAL_SESSION")
            session = GLOBAL_SESSION
        elif icommands.ACTIVE_SESSION:
            if __debug__:
                logger.debug("using ACTIVE_SESSION")
            session = icommands.ACTIVE_SESSION
        else:
            raise KeyError('settings must have IRODS_GLOBAL_SESSION set '
                           'if there is no environment object')

    resource_cls = check_resource_type(res.resource_type)

    if is_zip_request:
        if use_async:
            task = create_temp_zip.apply_async((res_id, irods_path, irods_output_path,
                                                is_sf_agg_file, is_sf_request))
            delete_zip.apply_async((irods_output_path, ),
                                   countdown=(60 * 60 * 24))  # delete after 24 hours

            if rest_call:
                return HttpResponse(
                    json.dumps({
                        'zip_status': 'Not ready',
                        'task_id': task.task_id,
                        'download_path': '/django_irods/rest_download/' + output_path}),
                    content_type="application/json")
            else:
                # return status to the UI
                request.session['task_id'] = task.task_id
                # TODO: this is mistaken for a bag download in the UI!
                # TODO: multiple asynchronous downloads don't stack!
                request.session['download_path'] = '/django_irods/download/' + output_path
                # redirect to resource landing page, which interprets session variables.
                return HttpResponseRedirect(res.get_absolute_url())

        else:  # synchronous creation of download
            ret_status = create_temp_zip(res_id, irods_path, irods_output_path,
                                         is_sf_agg_file, is_sf_request)
            delete_zip.apply_async((irods_output_path, ),
                                   countdown=(60 * 60 * 24))  # delete after 24 hours
            if not ret_status:
                content_msg = "Zip could not be created."
                response = HttpResponse()
                if rest_call:
                    response.content = content_msg
                else:
                    response.content = "<h1>" + content_msg + "</h1>"
                return response
            # At this point, output_path presumably exists and contains a zipfile
            # to be streamed below

    elif is_bag_download:
        # Shorten request if it contains extra junk at the end
        bag_file_name = res_id + '.zip'
        output_path = os.path.join('bags', bag_file_name)
        if not res.is_federated:
            irods_output_path = output_path
        else:
            irods_output_path = os.path.join(res.resource_federation_path, output_path)

        bag_modified = res.getAVU('bag_modified')
        # recreate the bag if it doesn't exist even if bag_modified is "false".
        if __debug__:
            logger.debug(u"irods_output_path is {}".format(irods_output_path))
        if bag_modified is None or not bag_modified:
            if not istorage.exists(irods_output_path):
                bag_modified = True

        # send signal for pre_check_bag_flag
        # this generates metadata other than that generated by create_bag_files.
        pre_check_bag_flag.send(sender=resource_cls, resource=res)

        metadata_dirty = res.getAVU('metadata_dirty')
        if metadata_dirty is None or metadata_dirty:
            create_bag_files(res)  # sets metadata_dirty to False
            # NOTE(review): string "True", not boolean — presumably both are truthy
            # for the check below; confirm against getAVU's return conventions.
            bag_modified = "True"

        if bag_modified is None or bag_modified:
            if use_async:
                # task parameter has to be passed in as a tuple or list, hence (res_id,) is needed
                # Note that since we are using JSON for task parameter serialization, no complex
                # object can be passed as parameters to a celery task
                task = create_bag_by_irods.apply_async((res_id,), countdown=3)
                if rest_call:
                    return HttpResponse(json.dumps({'bag_status': 'Not ready',
                                                    'task_id': task.task_id}),
                                        content_type="application/json")

                request.session['task_id'] = task.task_id
                request.session['download_path'] = request.path
                return HttpResponseRedirect(res.get_absolute_url())
            else:
                ret_status = create_bag_by_irods(res_id)
                if not ret_status:
                    content_msg = "Bag cannot be created successfully. Check log for details."
                    response = HttpResponse()
                    if rest_call:
                        response.content = content_msg
                    else:
                        response.content = "<h1>" + content_msg + "</h1>"
                    return response

    else:  # regular file download
        # if fetching main metadata files, then these need to be refreshed.
        if path.endswith("resourcemap.xml") or path.endswith('resourcemetadata.xml'):
            metadata_dirty = res.getAVU("metadata_dirty")
            if metadata_dirty is None or metadata_dirty:
                create_bag_files(res)  # sets metadata_dirty to False

        # send signal for pre download file
        # TODO: does not contain subdirectory information: duplicate refreshes possible
        download_file_name = split_path_strs[-1]  # end of path
        # this logs the download request in the tracking system
        pre_download_file.send(sender=resource_cls, resource=res,
                               download_file_name=download_file_name,
                               request=request)

    # If we get this far,
    # * path and irods_path point to true input
    # * output_path and irods_output_path point to true output.
    # Try to stream the file back to the requester.

    # obtain mime_type to set content_type
    mtype = 'application-x/octet-stream'
    mime_type = mimetypes.guess_type(output_path)
    if mime_type[0] is not None:
        mtype = mime_type[0]
    # retrieve file size to set up Content-Length header
    # TODO: standardize this to make it less brittle
    stdout = session.run("ils", None, "-l", irods_output_path)[0].split()
    flen = int(stdout[3])

    # Allow reverse proxy if request was forwarded by nginx (HTTP_X_DJANGO_REVERSE_PROXY='true')
    # and reverse proxy is possible according to configuration (SENDFILE_ON=True)
    # and reverse proxy isn't overridden by user (use_reverse_proxy=True).
    if use_reverse_proxy and getattr(settings, 'SENDFILE_ON', False) and \
            'HTTP_X_DJANGO_REVERSE_PROXY' in request.META:

        # The NGINX sendfile abstraction is invoked as follows:
        # 1. The request to download a file enters this routine via the /rest_download or /download
        #    url in ./urls.py. It is redirected here from Django. The URI contains either the
        #    unqualified resource path or the federated resource path, depending upon whether
        #    the request is local or federated.
        # 2. This deals with unfederated resources by redirecting them to the uri
        #    /irods-data/{resource-id}/... on nginx. This URI is configured to read the file
        #    directly from the iRODS vault via NFS, and does not work for direct access to the
        #    vault due to the 'internal;' declaration in NGINX.
        # 3. This deals with federated resources by reading their path, matching local vaults, and
        #    redirecting to URIs that are in turn mapped to read from appropriate iRODS vaults. At
        #    present, the only one of these is /irods-user, which handles files whose federation
        #    path is stored in the variable 'userpath'.
        # 4. If there is no vault available for the resource, the file is transferred without
        #    NGINX, exactly as it was transferred previously.

        # If this path is resource_federation_path, then the file is a local user file
        userpath = '/' + os.path.join(
            getattr(settings, 'HS_USER_IRODS_ZONE', 'hydroshareuserZone'),
            'home',
            getattr(settings, 'HS_LOCAL_PROXY_USER_IN_FED_ZONE', 'localHydroProxy'))

        # stop NGINX targets that are non-existent from hanging forever.
        if not istorage.exists(irods_output_path):
            content_msg = "file path {} does not exist in iRODS".format(output_path)
            response = HttpResponse(status=404)
            if rest_call:
                response.content = content_msg
            else:
                response.content = "<h1>" + content_msg + "</h1>"
            return response

        if not res.is_federated:
            # track download count
            res.update_download_count()
            # invoke X-Accel-Redirect on physical vault file in nginx
            response = HttpResponse(content_type=mtype)
            response['Content-Disposition'] = 'attachment; filename="{name}"'.format(
                name=output_path.split('/')[-1])
            response['Content-Length'] = flen
            response['X-Accel-Redirect'] = '/'.join([
                getattr(settings, 'IRODS_DATA_URI', '/irods-data'), output_path])
            if __debug__:
                logger.debug("Reverse proxying local {}".format(response['X-Accel-Redirect']))
            return response

        elif res.resource_federation_path == userpath:  # this guarantees a "user" resource
            # track download count
            res.update_download_count()
            # invoke X-Accel-Redirect on physical vault file in nginx
            response = HttpResponse(content_type=mtype)
            response['Content-Disposition'] = 'attachment; filename="{name}"'.format(
                name=output_path.split('/')[-1])
            response['Content-Length'] = flen
            response['X-Accel-Redirect'] = os.path.join(
                getattr(settings, 'IRODS_USER_URI', '/irods-user'), output_path)
            if __debug__:
                logger.debug("Reverse proxying user {}".format(response['X-Accel-Redirect']))
            return response

    # if we get here, none of the above conditions are true
    # if reverse proxy is enabled, then this is because the resource is remote and federated
    # OR the user specifically requested a non-proxied download.
    options = ('-',)  # we're redirecting to stdout.
    # this unusual way of calling works for streaming federated or local resources
    if __debug__:
        logger.debug("Locally streaming {}".format(output_path))
    # track download count
    res.update_download_count()
    proc = session.run_safe('iget', None, irods_output_path, *options)
    response = FileResponse(proc.stdout, content_type=mtype)
    response['Content-Disposition'] = 'attachment; filename="{name}"'.format(
        name=output_path.split('/')[-1])
    response['Content-Length'] = flen
    return response
# NOTE(review): duplicate definition of `download` — apart from the
# HS_IRODS_PROXY_USER_IN_USER_ZONE settings key used for `userpath` below, this
# is identical to the earlier copy, which it shadows at import time. Confirm
# which settings key is correct and remove the redundant copy.
def download(request, path, rest_call=False, use_async=True, use_reverse_proxy=True,
             *args, **kwargs):
    """Perform a download request, either asynchronously or synchronously.

    :param request: the request object.
    :param path: the public path of the thing to be downloaded.
    :param rest_call: True if calling from REST API.
    :param use_async: True means to utilize asynchronous creation of objects to download.
    :param use_reverse_proxy: True means to utilize NGINX reverse proxy for streaming.

    The following variables are computed:

    * `path` is the public path of the thing to be downloaded.
    * `irods_path` is the location of `path` in irods.
    * `output_path` is the output path to be reported in the response object.
    * `irods_output_path` is the location of `output_path` in irods

    The `zipped` query param signals the download should be zipped
    - folders are always zipped regardless of this parameter
    - single file aggregations are zipped with the aggregation metadata files

    A path may point to one of six cases:
    1. a single file
    2. a single-file-aggregation object in a composite resource.
    3. a folder
    4. a metadata object that may need updating.
    5. a bag that needs to be updated and then returned.
    6. a previously zipped file that was zipped asynchronously.
    """
    if __debug__:
        logger.debug("request path is {}".format(path))

    # normalize the path: strip any trailing slash(es)
    split_path_strs = path.split('/')
    while split_path_strs[-1] == '':
        split_path_strs.pop()
    path = u'/'.join(split_path_strs)  # no trailing slash

    # initialize case variables
    is_bag_download = False
    is_zip_download = False
    is_zip_request = request.GET.get('zipped', "False").lower() == "true"
    is_sf_agg_file = False
    is_sf_request = False

    # classify the request and extract the resource id from the path
    if split_path_strs[0] == 'bags':
        is_bag_download = True
        # format is bags/{rid}.zip
        res_id = os.path.splitext(split_path_strs[1])[0]
    elif split_path_strs[0] == 'zips':
        is_zip_download = True
        # zips prefix means that we are following up on an asynchronous download request
        # format is zips/{date}/{zip-uuid}/{public-path}.zip where {public-path} contains the rid
        res_id = split_path_strs[3]
    else:  # regular download request
        res_id = split_path_strs[0]

    if __debug__:
        logger.debug("resource id is {}".format(res_id))

    # now we have the resource Id and can authorize the request
    # if the resource does not exist in django, authorized will be false
    res, authorized, _ = authorize(request, res_id,
                                   needed_permission=ACTION_TO_AUTHORIZE.VIEW_RESOURCE,
                                   raises_exception=False)
    if not authorized:
        response = HttpResponse(status=401)
        content_msg = "You do not have permission to download this resource!"
        if rest_call:
            raise PermissionDenied(content_msg)
        else:
            response.content = "<h1>" + content_msg + "</h1>"
            return response

    # default values are changed later as needed
    istorage = res.get_irods_storage()
    if res.is_federated:
        irods_path = os.path.join(res.resource_federation_path, path)
    else:
        irods_path = path
    # in many cases, path and output_path are the same.
    output_path = path
    irods_output_path = irods_path

    # folder requests are automatically zipped
    if not is_bag_download and not is_zip_download:  # path points into resource: should I zip it?
        store_path = u'/'.join(split_path_strs[1:])  # data/contents/{path-to-something}

        if res.is_folder(store_path):  # automatically zip folders
            is_zip_request = True
            daily_date = datetime.datetime.today().strftime('%Y-%m-%d')
            output_path = "zips/{}/{}/{}.zip".format(daily_date, uuid4().hex, path)
            if res.is_federated:
                irods_output_path = os.path.join(res.resource_federation_path, output_path)
            else:
                irods_output_path = output_path
            if __debug__:
                logger.debug("automatically zipping folder {} to {}".format(path, output_path))
        elif istorage.exists(irods_path):
            if __debug__:
                logger.debug("request for single file {}".format(path))
            is_sf_request = True

            # check for single file aggregations
            if "data/contents/" in path:  # not a metadata file
                for f in ResourceFile.objects.filter(object_id=res.id):
                    if path == f.storage_path:
                        is_sf_agg_file = True
                        if not is_zip_request and f.has_logical_file and \
                                f.logical_file.is_single_file_aggregation:
                            download_url = request.GET.get('url_download', 'false').lower()
                            if download_url == 'false':
                                # redirect to referenced url in the url file instead
                                redirect_url = f.logical_file.redirect_url
                                if redirect_url:
                                    return HttpResponseRedirect(redirect_url)
                        if __debug__:
                            logger.debug(
                                "request for single file aggregation {}".format(path))
                        break

            if is_zip_request:
                daily_date = datetime.datetime.today().strftime('%Y-%m-%d')
                output_path = "zips/{}/{}/{}.zip".format(daily_date, uuid4().hex, path)
                if res.is_federated:
                    irods_output_path = os.path.join(res.resource_federation_path, output_path)
                else:
                    irods_output_path = output_path

    # After this point, we have valid path, irods_path, output_path, and irods_output_path
    # * is_zip_request: signals download should be zipped, folders are always zipped
    # * is_sf_agg_file: path is a single-file aggregation in Composite Resource
    # * is_sf_request: path is a single-file
    # flags for download:
    # * is_bag_download: download a bag in format bags/{rid}.zip
    # * is_zip_download: download a zipfile in format zips/{date}/{random guid}/{path}.zip
    # if none of these are set, it's a normal download

    # determine active session
    if res.is_federated:
        # the resource is stored in federated zone
        session = icommands.ACTIVE_SESSION
    else:
        # TODO: From Alva: I do not understand the use case for changing the environment.
        # TODO: This seems an enormous potential vulnerability, as arguments are
        # TODO: passed from the URI directly to IRODS without verification.
        if 'environment' in kwargs:
            logger.warn("setting iRODS from environment")
            environment = int(kwargs['environment'])
            environment = m.RodsEnvironment.objects.get(pk=environment)
            session = Session("/tmp/django_irods", settings.IRODS_ICOMMANDS_PATH,
                              session_id=uuid4())
            session.create_environment(environment)
            session.run('iinit', None, environment.auth)
        elif getattr(settings, 'IRODS_GLOBAL_SESSION', False):
            if __debug__:
                logger.debug("using GLOBAL_SESSION")
            session = GLOBAL_SESSION
        elif icommands.ACTIVE_SESSION:
            if __debug__:
                logger.debug("using ACTIVE_SESSION")
            session = icommands.ACTIVE_SESSION
        else:
            raise KeyError('settings must have IRODS_GLOBAL_SESSION set '
                           'if there is no environment object')

    resource_cls = check_resource_type(res.resource_type)

    if is_zip_request:
        if use_async:
            task = create_temp_zip.apply_async((res_id, irods_path, irods_output_path,
                                                is_sf_agg_file, is_sf_request))
            delete_zip.apply_async((irods_output_path, ),
                                   countdown=(60 * 60 * 24))  # delete after 24 hours

            if rest_call:
                return HttpResponse(
                    json.dumps({
                        'zip_status': 'Not ready',
                        'task_id': task.task_id,
                        'download_path': '/django_irods/rest_download/' + output_path}),
                    content_type="application/json")
            else:
                # return status to the UI
                request.session['task_id'] = task.task_id
                # TODO: this is mistaken for a bag download in the UI!
                # TODO: multiple asynchronous downloads don't stack!
                request.session['download_path'] = '/django_irods/download/' + output_path
                # redirect to resource landing page, which interprets session variables.
                return HttpResponseRedirect(res.get_absolute_url())

        else:  # synchronous creation of download
            ret_status = create_temp_zip(res_id, irods_path, irods_output_path,
                                         is_sf_agg_file, is_sf_request)
            delete_zip.apply_async((irods_output_path, ),
                                   countdown=(60 * 60 * 24))  # delete after 24 hours
            if not ret_status:
                content_msg = "Zip could not be created."
                response = HttpResponse()
                if rest_call:
                    response.content = content_msg
                else:
                    response.content = "<h1>" + content_msg + "</h1>"
                return response
            # At this point, output_path presumably exists and contains a zipfile
            # to be streamed below

    elif is_bag_download:
        # Shorten request if it contains extra junk at the end
        bag_file_name = res_id + '.zip'
        output_path = os.path.join('bags', bag_file_name)
        if not res.is_federated:
            irods_output_path = output_path
        else:
            irods_output_path = os.path.join(res.resource_federation_path, output_path)

        bag_modified = res.getAVU('bag_modified')
        # recreate the bag if it doesn't exist even if bag_modified is "false".
        if __debug__:
            logger.debug(u"irods_output_path is {}".format(irods_output_path))
        if bag_modified is None or not bag_modified:
            if not istorage.exists(irods_output_path):
                bag_modified = True

        # send signal for pre_check_bag_flag
        # this generates metadata other than that generated by create_bag_files.
        pre_check_bag_flag.send(sender=resource_cls, resource=res)

        metadata_dirty = res.getAVU('metadata_dirty')
        if metadata_dirty is None or metadata_dirty:
            create_bag_files(res)  # sets metadata_dirty to False
            # NOTE(review): string "True", not boolean — presumably both are truthy
            # for the check below; confirm against getAVU's return conventions.
            bag_modified = "True"

        if bag_modified is None or bag_modified:
            if use_async:
                # task parameter has to be passed in as a tuple or list, hence (res_id,) is needed
                # Note that since we are using JSON for task parameter serialization, no complex
                # object can be passed as parameters to a celery task
                task = create_bag_by_irods.apply_async((res_id,), countdown=3)
                if rest_call:
                    return HttpResponse(json.dumps({'bag_status': 'Not ready',
                                                    'task_id': task.task_id}),
                                        content_type="application/json")

                request.session['task_id'] = task.task_id
                request.session['download_path'] = request.path
                return HttpResponseRedirect(res.get_absolute_url())
            else:
                ret_status = create_bag_by_irods(res_id)
                if not ret_status:
                    content_msg = "Bag cannot be created successfully. Check log for details."
                    response = HttpResponse()
                    if rest_call:
                        response.content = content_msg
                    else:
                        response.content = "<h1>" + content_msg + "</h1>"
                    return response

    else:  # regular file download
        # if fetching main metadata files, then these need to be refreshed.
        if path.endswith("resourcemap.xml") or path.endswith('resourcemetadata.xml'):
            metadata_dirty = res.getAVU("metadata_dirty")
            if metadata_dirty is None or metadata_dirty:
                create_bag_files(res)  # sets metadata_dirty to False

        # send signal for pre download file
        # TODO: does not contain subdirectory information: duplicate refreshes possible
        download_file_name = split_path_strs[-1]  # end of path
        # this logs the download request in the tracking system
        pre_download_file.send(sender=resource_cls, resource=res,
                               download_file_name=download_file_name,
                               request=request)

    # If we get this far,
    # * path and irods_path point to true input
    # * output_path and irods_output_path point to true output.
    # Try to stream the file back to the requester.

    # obtain mime_type to set content_type
    mtype = 'application-x/octet-stream'
    mime_type = mimetypes.guess_type(output_path)
    if mime_type[0] is not None:
        mtype = mime_type[0]
    # retrieve file size to set up Content-Length header
    # TODO: standardize this to make it less brittle
    stdout = session.run("ils", None, "-l", irods_output_path)[0].split()
    flen = int(stdout[3])

    # Allow reverse proxy if request was forwarded by nginx (HTTP_X_DJANGO_REVERSE_PROXY='true')
    # and reverse proxy is possible according to configuration (SENDFILE_ON=True)
    # and reverse proxy isn't overridden by user (use_reverse_proxy=True).
    if use_reverse_proxy and getattr(settings, 'SENDFILE_ON', False) and \
            'HTTP_X_DJANGO_REVERSE_PROXY' in request.META:

        # The NGINX sendfile abstraction is invoked as follows:
        # 1. The request to download a file enters this routine via the /rest_download or /download
        #    url in ./urls.py. It is redirected here from Django. The URI contains either the
        #    unqualified resource path or the federated resource path, depending upon whether
        #    the request is local or federated.
        # 2. This deals with unfederated resources by redirecting them to the uri
        #    /irods-data/{resource-id}/... on nginx. This URI is configured to read the file
        #    directly from the iRODS vault via NFS, and does not work for direct access to the
        #    vault due to the 'internal;' declaration in NGINX.
        # 3. This deals with federated resources by reading their path, matching local vaults, and
        #    redirecting to URIs that are in turn mapped to read from appropriate iRODS vaults. At
        #    present, the only one of these is /irods-user, which handles files whose federation
        #    path is stored in the variable 'userpath'.
        # 4. If there is no vault available for the resource, the file is transferred without
        #    NGINX, exactly as it was transferred previously.

        # If this path is resource_federation_path, then the file is a local user file
        userpath = '/' + os.path.join(
            getattr(settings, 'HS_USER_IRODS_ZONE', 'hydroshareuserZone'),
            'home',
            getattr(settings, 'HS_IRODS_PROXY_USER_IN_USER_ZONE', 'localHydroProxy'))

        # stop NGINX targets that are non-existent from hanging forever.
        if not istorage.exists(irods_output_path):
            content_msg = "file path {} does not exist in iRODS".format(output_path)
            response = HttpResponse(status=404)
            if rest_call:
                response.content = content_msg
            else:
                response.content = "<h1>" + content_msg + "</h1>"
            return response

        if not res.is_federated:
            # track download count
            res.update_download_count()
            # invoke X-Accel-Redirect on physical vault file in nginx
            response = HttpResponse(content_type=mtype)
            response['Content-Disposition'] = 'attachment; filename="{name}"'.format(
                name=output_path.split('/')[-1])
            response['Content-Length'] = flen
            response['X-Accel-Redirect'] = '/'.join([
                getattr(settings, 'IRODS_DATA_URI', '/irods-data'), output_path])
            if __debug__:
                logger.debug("Reverse proxying local {}".format(response['X-Accel-Redirect']))
            return response

        elif res.resource_federation_path == userpath:  # this guarantees a "user" resource
            # track download count
            res.update_download_count()
            # invoke X-Accel-Redirect on physical vault file in nginx
            response = HttpResponse(content_type=mtype)
            response['Content-Disposition'] = 'attachment; filename="{name}"'.format(
                name=output_path.split('/')[-1])
            response['Content-Length'] = flen
            response['X-Accel-Redirect'] = os.path.join(
                getattr(settings, 'IRODS_USER_URI', '/irods-user'), output_path)
            if __debug__:
                logger.debug("Reverse proxying user {}".format(response['X-Accel-Redirect']))
            return response

    # if we get here, none of the above conditions are true
    # if reverse proxy is enabled, then this is because the resource is remote and federated
    # OR the user specifically requested a non-proxied download.
    options = ('-',)  # we're redirecting to stdout.
    # this unusual way of calling works for streaming federated or local resources
    if __debug__:
        logger.debug("Locally streaming {}".format(output_path))
    # track download count
    res.update_download_count()
    proc = session.run_safe('iget', None, irods_output_path, *options)
    response = FileResponse(proc.stdout, content_type=mtype)
    response['Content-Disposition'] = 'attachment; filename="{name}"'.format(
        name=output_path.split('/')[-1])
    response['Content-Length'] = flen
    return response
def download(request, path, use_async=True, use_reverse_proxy=True,
             *args, **kwargs):
    """Perform a download request, either asynchronously or synchronously.

    :param request: the request object.
    :param path: the public path of the thing to be downloaded.
    :param use_async: True means to utilize asynchronous creation of objects to download.
    :param use_reverse_proxy: True means to utilize NGINX reverse proxy for streaming.

    The following variables are computed:

    * `path` is the public path of the thing to be downloaded.
    * `irods_path` is the location of `path` in irods.
    * `output_path` is the output path to be reported in the response object.
    * `irods_output_path` is the location of `output_path` in irods

    and there are six cases:

    The `zipped` query param signals that the download should be zipped
    - folders are always zipped regardless of this parameter
    - single file aggregations are zipped with the aggregation metadata files

    A path may point to:
    1. a single file
    2. a single-file-aggregation object in a composite resource.
    3. a folder
    4. a metadata object that may need updating.
    5. a bag that needs to be updated and then returned.
    6. a previously zipped file that was zipped asynchronously.
    """
    if __debug__:
        logger.debug("request path is {}".format(path))

    # normalize the path: drop trailing empty components so path has no trailing slash
    split_path_strs = path.split('/')
    while split_path_strs[-1] == '':
        split_path_strs.pop()
    path = '/'.join(split_path_strs)  # no trailing slash

    # initialize case variables
    is_bag_download = False
    is_zip_download = False
    is_zip_request = request.GET.get('zipped', "False").lower() == "true"
    is_aggregation_request = request.GET.get('aggregation', "False").lower() == "true"
    # NOTE(review): absence of a CSRF cookie is used as a heuristic for "this is an
    # API (non-browser) request" — confirm this assumption holds for all API clients.
    api_request = request.META.get('CSRF_COOKIE', None) is None
    aggregation_name = None
    is_sf_request = False

    if split_path_strs[0] == 'bags':
        is_bag_download = True
        # format is bags/{rid}.zip
        res_id = os.path.splitext(split_path_strs[1])[0]
    elif split_path_strs[0] == 'zips':
        is_zip_download = True
        # zips prefix means that we are following up on an asynchronous download request
        # format is zips/{date}/{zip-uuid}/{public-path}.zip where {public-path} contains the rid
        res_id = split_path_strs[3]
    else:  # regular download request
        res_id = split_path_strs[0]

    if __debug__:
        logger.debug("resource id is {}".format(res_id))

    # now we have the resource Id and can authorize the request
    # if the resource does not exist in django, authorized will be false
    res, authorized, _ = authorize(
        request, res_id,
        needed_permission=ACTION_TO_AUTHORIZE.VIEW_RESOURCE,
        raises_exception=False)
    if not authorized:
        response = HttpResponse(status=401)
        content_msg = "You do not have permission to download this resource!"
        response.content = content_msg
        return response

    istorage = res.get_irods_storage()
    irods_path = res.get_irods_path(path, prepend_short_id=False)
    # in many cases, path and output_path are the same.
    output_path = path
    irods_output_path = irods_path

    # folder requests are automatically zipped
    if not is_bag_download and not is_zip_download:  # path points into resource: should I zip it?
        # check for aggregations
        if is_aggregation_request and res.resource_type == "CompositeResource":
            prefix = res.file_path
            if path.startswith(prefix):
                # +1 to remove trailing slash
                aggregation_name = path[len(prefix) + 1:]
            aggregation = res.get_aggregation_by_aggregation_name(aggregation_name)

            if not is_zip_request:
                download_url = request.GET.get('url_download', 'false').lower()
                if download_url == 'false':
                    # redirect to referenced url in the url file instead
                    if hasattr(aggregation, 'redirect_url'):
                        return HttpResponseRedirect(aggregation.redirect_url)

            # point to the main file path; aggregations download as zips below
            path = aggregation.get_main_file.url[len("/resource/"):]
            is_zip_request = True
            daily_date = datetime.datetime.today().strftime('%Y-%m-%d')
            output_path = "zips/{}/{}/{}.zip".format(daily_date, uuid4().hex, path)
            # recompute irods paths because `path` was rebound above
            irods_path = res.get_irods_path(path, prepend_short_id=False)
            irods_output_path = res.get_irods_path(output_path, prepend_short_id=False)

        store_path = '/'.join(split_path_strs[1:])  # data/contents/{path-to-something}

        if res.is_folder(store_path):  # automatically zip folders
            is_zip_request = True
            daily_date = datetime.datetime.today().strftime('%Y-%m-%d')
            output_path = "zips/{}/{}/{}.zip".format(daily_date, uuid4().hex, path)
            irods_output_path = res.get_irods_path(output_path, prepend_short_id=False)
            if __debug__:
                logger.debug("automatically zipping folder {} to {}".format(path, output_path))
        elif istorage.exists(irods_path):
            if __debug__:
                logger.debug("request for single file {}".format(path))
            is_sf_request = True
            if is_zip_request:
                daily_date = datetime.datetime.today().strftime('%Y-%m-%d')
                output_path = "zips/{}/{}/{}.zip".format(daily_date, uuid4().hex, path)
                irods_output_path = res.get_irods_path(output_path, prepend_short_id=False)

    # After this point, we have valid path, irods_path, output_path, and irods_output_path
    # * is_zip_request: signals download should be zipped, folders are always zipped
    # * aggregation: aggregation object if the path matches an aggregation
    # * is_sf_request: path is a single-file

    # flags for download:
    # * is_bag_download: download a bag in format bags/{rid}.zip
    # * is_zip_download: download a zipfile in format zips/{date}/{random guid}/{path}.zip
    # if none of these are set, it's a normal download

    # determine active session
    if icommands.ACTIVE_SESSION:
        if __debug__:
            logger.debug("using ACTIVE_SESSION")
        session = icommands.ACTIVE_SESSION
    else:
        raise KeyError('settings must have IRODS_GLOBAL_SESSION set ')

    resource_cls = check_resource_type(res.resource_type)

    if is_zip_request:
        download_path = '/django_irods/rest_download/' + output_path
        if use_async:
            user_id = get_task_user_id(request)
            task = create_temp_zip.apply_async((res_id, irods_path, irods_output_path,
                                                aggregation_name, is_sf_request, download_path,
                                                user_id))
            task_id = task.task_id
            delete_zip.apply_async((irods_output_path, ),
                                   countdown=(60 * 60 * 24))  # delete after 24 hours

            if api_request:
                return JsonResponse({
                    'zip_status': 'Not ready',
                    'task_id': task.task_id,
                    'download_path': '/django_irods/rest_download/' + output_path})
            else:
                # return status to the task notification App AJAX call
                task_dict = get_or_create_task_notification(task_id, name='zip download',
                                                            payload=download_path,
                                                            username=user_id)
                return JsonResponse(task_dict)

        else:  # synchronous creation of download
            ret_status = create_temp_zip(res_id, irods_path, irods_output_path,
                                         aggregation_name=aggregation_name,
                                         sf_zip=is_sf_request, download_path=download_path)
            delete_zip.apply_async((irods_output_path, ),
                                   countdown=(60 * 60 * 24))  # delete after 24 hours
            if not ret_status:
                content_msg = "Zip could not be created."
                response = HttpResponse()
                response.content = content_msg
                return response
            # At this point, output_path presumably exists and contains a zipfile
            # to be streamed below

    elif is_bag_download:
        # Shorten request if it contains extra junk at the end
        bag_file_name = res_id + '.zip'
        output_path = os.path.join('bags', bag_file_name)
        irods_output_path = res.bag_path

        bag_modified = res.getAVU('bag_modified')
        # recreate the bag if it doesn't exist even if bag_modified is "false".
        if __debug__:
            logger.debug("irods_output_path is {}".format(irods_output_path))
        if bag_modified is None or not bag_modified:
            if not istorage.exists(irods_output_path):
                bag_modified = True

        # send signal for pre_check_bag_flag
        # this generates metadata other than that generated by create_bag_files.
        pre_check_bag_flag.send(sender=resource_cls, resource=res)

        if bag_modified is None or bag_modified:
            if use_async:
                # task parameter has to be passed in as a tuple or list, hence (res_id,) is needed
                # Note that since we are using JSON for task parameter serialization, no complex
                # object can be passed as parameters to a celery task
                task_id = get_resource_bag_task(res_id)
                user_id = get_task_user_id(request)
                if not task_id:
                    # create the bag
                    task = create_bag_by_irods.apply_async((res_id,))
                    task_id = task.task_id
                    if api_request:
                        return JsonResponse({
                            'bag_status': 'Not ready',
                            'task_id': task_id,
                            'download_path': res.bag_url,
                            # status and id are checked by hs_core.tests.api.rest.test_create_resource.py
                            'status': 'Not ready',
                            'id': task_id})
                    else:
                        task_dict = get_or_create_task_notification(task_id, name='bag download',
                                                                    payload=res.bag_url,
                                                                    username=user_id)
                        return JsonResponse(task_dict)
                else:
                    # bag creation has already started
                    if api_request:
                        return JsonResponse({
                            'bag_status': 'Not ready',
                            'task_id': task_id,
                            'download_path': res.bag_url})
                    else:
                        task_dict = get_or_create_task_notification(task_id, name='bag download',
                                                                    payload=res.bag_url,
                                                                    username=user_id)
                        return JsonResponse(task_dict)
            else:  # synchronous bag creation
                ret_status = create_bag_by_irods(res_id)
                if not ret_status:
                    content_msg = "Bag cannot be created successfully. Check log for details."
                    response = HttpResponse()
                    response.content = content_msg
                    return response
        elif request.is_ajax():
            # NOTE(review): request.is_ajax() is deprecated in Django 3.1+ — verify Django version.
            # bag is up to date; report completion directly to the task notification App
            task_dict = {
                'id': datetime.datetime.today().isoformat(),
                'name': "bag download",
                'status': "completed",
                'payload': res.bag_url
            }
            return JsonResponse(task_dict)

    else:  # regular file download
        # if fetching main metadata files, then these need to be refreshed.
        if path in [f"{res_id}/data/resourcemap.xml",
                    f"{res_id}/data/resourcemetadata.xml",
                    f"{res_id}/manifest-md5.txt",
                    f"{res_id}/tagmanifest-md5.txt",
                    f"{res_id}/readme.txt",
                    f"{res_id}/bagit.txt"]:
            bag_modified = res.getAVU("bag_modified")
            if bag_modified is None or bag_modified or not istorage.exists(irods_output_path):
                # ensure bag_modified is set when irods_output_path does not exist
                res.setAVU("bag_modified", True)
                create_bag_by_irods(res_id, False)

        # send signal for pre download file
        # TODO: does not contain subdirectory information: duplicate refreshes possible
        download_file_name = split_path_strs[-1]  # end of path
        # this logs the download request in the tracking system
        pre_download_file.send(sender=resource_cls, resource=res,
                               download_file_name=download_file_name,
                               request=request)

    # If we get this far,
    # * path and irods_path point to true input
    # * output_path and irods_output_path point to true output.
    # Try to stream the file back to the requester.

    # obtain mime_type to set content_type
    mtype = 'application-x/octet-stream'
    mime_type = mimetypes.guess_type(output_path)
    if mime_type[0] is not None:
        mtype = mime_type[0]

    # retrieve file size to set up Content-Length header
    # TODO: standardize this to make it less brittle
    # NOTE(review): parses column 4 of `ils -l` output — fragile against iRODS formatting changes.
    stdout = session.run("ils", None, "-l", irods_output_path)[0].split()
    flen = int(stdout[3])

    # Allow reverse proxy if request was forwarded by nginx (HTTP_X_DJANGO_REVERSE_PROXY='true')
    # and reverse proxy is possible according to configuration (SENDFILE_ON=True)
    # and reverse proxy isn't overridden by user (use_reverse_proxy=True).
    if use_reverse_proxy and getattr(settings, 'SENDFILE_ON', False) and \
            'HTTP_X_DJANGO_REVERSE_PROXY' in request.META:
        # The NGINX sendfile abstraction is invoked as follows:
        # 1. The request to download a file enters this routine via the /rest_download or /download
        #    url in ./urls.py. It is redirected here from Django. The URI contains either the
        #    unqualified resource path or the federated resource path, depending upon whether
        #    the request is local or federated.
        # 2. This deals with unfederated resources by redirecting them to the uri
        #    /irods-data/{resource-id}/... on nginx. This URI is configured to read the file
        #    directly from the iRODS vault via NFS, and does not work for direct access to the
        #    vault due to the 'internal;' declaration in NGINX.
        # 3. If there is no vault available for the resource, the file is transferred without
        #    NGINX, exactly as it was transferred previously.

        # stop NGINX targets that are non-existent from hanging forever.
        if not istorage.exists(irods_output_path):
            content_msg = "file path {} does not exist in iRODS".format(output_path)
            response = HttpResponse(status=404)
            response.content = content_msg
            return response

        # track download count
        res.update_download_count()
        # invoke X-Accel-Redirect on physical vault file in nginx
        response = HttpResponse(content_type=mtype)
        response['Content-Disposition'] = 'attachment; filename="{name}"'.format(
            name=output_path.split('/')[-1])
        response['Content-Length'] = flen
        response['X-Accel-Redirect'] = '/'.join([
            getattr(settings, 'IRODS_DATA_URI', '/irods-data'), output_path])
        if __debug__:
            logger.debug("Reverse proxying local {}".format(response['X-Accel-Redirect']))
        return response

    # if we get here, none of the above conditions are true
    # if reverse proxy is enabled, then this is because the resource is remote and federated
    # OR the user specifically requested a non-proxied download.

    options = ('-',)  # we're redirecting to stdout.
    # this unusual way of calling works for streaming federated or local resources
    if __debug__:
        logger.debug("Locally streaming {}".format(output_path))
    # track download count
    res.update_download_count()
    proc = session.run_safe('iget', None, irods_output_path, *options)
    response = FileResponse(proc.stdout, content_type=mtype)
    response['Content-Disposition'] = 'attachment; filename="{name}"'.format(
        name=output_path.split('/')[-1])
    response['Content-Length'] = flen
    return response
def download(request, path, rest_call=False, use_async=True, *args, **kwargs):
    """Download a file or bag from iRODS and stream it to the requester.

    :param request: the Django request object.
    :param path: public path of the thing to download; either
        bags/{rid}.zip for a bag, or a path starting with the resource id.
    :param rest_call: True if calling from the REST API (errors are raised
        rather than rendered as HTML).
    :param use_async: accepted for interface compatibility; not consulted in
        this implementation.
    :raises PermissionDenied: when rest_call is True and the requester lacks
        VIEW permission on the resource.
    :raises KeyError: when no iRODS session can be determined from settings.
    :return: a FileResponse streaming the file, or an HttpResponse on error.
    """
    split_path_strs = path.split('/')
    is_bag_download = False
    if split_path_strs[0] == 'bags':
        # format is bags/{rid}.zip
        res_id = os.path.splitext(split_path_strs[1])[0]
        is_bag_download = True
    else:
        res_id = split_path_strs[0]

    # authorize the request; authorized is False if the resource does not exist
    res, authorized, _ = authorize(request, res_id,
                                   needed_permission=ACTION_TO_AUTHORIZE.VIEW_RESOURCE,
                                   raises_exception=False)
    if not authorized:
        response = HttpResponse(status=401)
        content_msg = "You do not have permission to download this resource!"
        if rest_call:
            raise PermissionDenied(content_msg)
        else:
            # browser request: render the message with a sign-in link
            signin_html = '</h1><div class="col-xs-12"><h2 class="page-title">' \
                          '<a href="/oauth_request/"><span class ="glyphicon glyphicon-log-in"></span>' \
                          'Sign In</a></h2>'
            response.content = '<h1>' + content_msg + signin_html
            return response

    # strip the leading path component for non-bag paths outside /data
    # NOTE(review): presumably normalizes legacy URL forms — confirm against callers.
    if not is_bag_download and "/data" not in path:
        idx_sep = path.find('/')
        path = path[idx_sep:]

    istorage = IrodsStorage()

    # determine the iRODS session to use
    # NOTE(review): 'environment' comes from URL kwargs and is passed to iRODS —
    # potential injection surface; verify upstream validation.
    if 'environment' in kwargs:
        environment = int(kwargs['environment'])
        environment = m.RodsEnvironment.objects.get(pk=environment)
        session = Session("/tmp/django_irods", settings.IRODS_ICOMMANDS_PATH,
                          session_id=uuid4())
        session.create_environment(environment)
        session.run('iinit', None, environment.auth)
    elif getattr(settings, 'IRODS_GLOBAL_SESSION', False):
        session = GLOBAL_SESSION
    elif icommands.ACTIVE_SESSION:
        session = icommands.ACTIVE_SESSION
    else:
        raise KeyError('settings must have IRODS_GLOBAL_SESSION set '
                       'if there is no environment object')

    if istorage.exists(res_id) and is_bag_download:
        bag_modified = istorage.getAVU(res_id, 'bag_modified')
        # make sure if bag_modified is not set to true, we still recreate the bag if the
        # bag file does not exist for some reason to resolve the error to download a nonexistent
        # bag when bag_modified is false due to the flag being out-of-sync with the real bag status
        if bag_modified is None or bag_modified.lower() == "false":
            # check whether the bag file exists
            bag_file_name = res_id + '.zip'
            bag_full_path = os.path.join('bags', bag_file_name)
            if not istorage.exists(bag_full_path):
                bag_modified = 'true'
        if bag_modified is None or bag_modified.lower() == "true":
            # on-demand (synchronous) bag regeneration
            create_bag(res)

    resource_cls = check_resource_type(res.resource_type)

    # send signal for pre download file; this logs the download in tracking
    download_file_name = split_path_strs[-1]
    pre_download_file.send(sender=resource_cls, resource=res,
                           download_file_name=download_file_name,
                           request=request)

    # obtain mime_type to set content_type
    mtype = 'application-x/octet-stream'
    mime_type = mimetypes.guess_type(path)
    if mime_type[0] is not None:
        mtype = mime_type[0]

    # retrieve file size to set up Content-Length header
    # NOTE(review): parses column 4 of `ils -l` output — brittle against iRODS changes.
    stdout = session.run("ils", None, "-l", path)[0].split()
    flen = int(stdout[3])

    options = ('-',)  # we're redirecting to stdout.
    proc = session.run_safe('iget', None, path, *options)
    response = FileResponse(proc.stdout, content_type=mtype)
    response['Content-Disposition'] = 'attachment; filename="{name}"'.format(
        name=path.split('/')[-1])
    response['Content-Length'] = flen
    return response
def download(request, path, rest_call=False, use_async=True, use_reverse_proxy=True,
             *args, **kwargs):
    """Download a file, folder (zipped), bag, or single-file aggregation.

    :param request: the Django request object.
    :param path: public path of the thing to download; bags/{rid}.zip for
        bags, zips/... for follow-ups on asynchronous zip requests, otherwise
        a path starting with the resource id.
    :param rest_call: True if calling from the REST API (errors are raised or
        returned as plain text/JSON rather than HTML).
    :param use_async: True means zips and bags are created via celery tasks.
    :param use_reverse_proxy: True means to utilize NGINX reverse proxy for
        streaming when the deployment supports it.
    :raises PermissionDenied: when rest_call is True and the requester lacks
        VIEW permission on the resource.
    :raises KeyError: when no iRODS session can be determined from settings.
    :return: an HttpResponse/FileResponse/JsonResponse depending on the case.
    """
    split_path_strs = path.split('/')
    is_bag_download = False
    is_zip_download = False
    is_sf_agg_file = False
    if split_path_strs[0] == 'bags':
        # format is bags/{rid}.zip
        res_id = os.path.splitext(split_path_strs[1])[0]
        is_bag_download = True
    elif split_path_strs[0] == 'zips':
        # follow-up on an asynchronous zip request
        if path.endswith('.zip'):
            res_id = os.path.splitext(split_path_strs[2])[0]
        else:
            res_id = os.path.splitext(split_path_strs[1])[0]
        is_zip_download = True
    else:
        res_id = split_path_strs[0]

    # if the resource does not exist in django, authorized will be false
    res, authorized, _ = authorize(request, res_id,
                                   needed_permission=ACTION_TO_AUTHORIZE.VIEW_RESOURCE,
                                   raises_exception=False)
    if not authorized:
        response = HttpResponse(status=401)
        content_msg = "You do not have permission to download this resource!"
        if rest_call:
            raise PermissionDenied(content_msg)
        else:
            response.content = "<h1>" + content_msg + "</h1>"
            return response

    # detect whether the path names a single-file aggregation in a composite resource
    # NOTE(review): linear scan over all files of the resource per request.
    if res.resource_type == "CompositeResource" and not path.endswith(".zip"):
        for f in ResourceFile.objects.filter(object_id=res.id):
            if path == f.storage_path:
                if f.has_logical_file and f.logical_file.is_single_file_aggregation:
                    is_sf_agg_file = True

    if res.resource_federation_path:  # the resource is stored in federated zone
        istorage = IrodsStorage('federated')
        federated_path = res.resource_federation_path
        path = os.path.join(federated_path, path)
        session = icommands.ACTIVE_SESSION
    else:
        # TODO: From Alva: I do not understand the use case for changing the environment.
        # TODO: This seems an enormous potential vulnerability, as arguments are
        # TODO: passed from the URI directly to IRODS without verification.
        istorage = IrodsStorage()
        federated_path = ''
        if 'environment' in kwargs:
            environment = int(kwargs['environment'])
            environment = m.RodsEnvironment.objects.get(pk=environment)
            session = Session("/tmp/django_irods", settings.IRODS_ICOMMANDS_PATH,
                              session_id=uuid4())
            session.create_environment(environment)
            session.run('iinit', None, environment.auth)
        elif getattr(settings, 'IRODS_GLOBAL_SESSION', False):
            session = GLOBAL_SESSION
        elif icommands.ACTIVE_SESSION:
            session = icommands.ACTIVE_SESSION
        else:
            raise KeyError('settings must have IRODS_GLOBAL_SESSION set '
                           'if there is no environment object')

    resource_cls = check_resource_type(res.resource_type)

    # res_root is the iRODS collection root of the resource
    if federated_path:
        res_root = os.path.join(federated_path, res_id)
    else:
        res_root = res_id

    if is_zip_download or is_sf_agg_file:
        if not path.endswith(".zip"):  # requesting folder that needs to be zipped
            input_path = path.split(res_id)[1]
            # NOTE(review): random.getrandbits is not cryptographically secure;
            # presumably collision avoidance only — confirm that is the intent.
            random_hash = random.getrandbits(32)
            daily_date = datetime.datetime.today().strftime('%Y-%m-%d')
            random_hash_path = 'zips/{daily_date}/{res_id}/{rand_folder}'.format(
                daily_date=daily_date, res_id=res_id, rand_folder=random_hash)
            output_path = '{random_hash_path}{path}.zip'.format(random_hash_path=random_hash_path,
                                                                path=input_path)

            if res.resource_type == "CompositeResource":
                aggregation_name = input_path[len('/data/contents/'):]
                res.create_aggregation_xml_documents(aggregation_name=aggregation_name)

            if use_async:
                task = create_temp_zip.apply_async((res_id, input_path, output_path,
                                                    is_sf_agg_file), countdown=3)
                delete_zip.apply_async((random_hash_path, ),
                                       countdown=(20 * 60))  # delete after 20 minutes

                if is_sf_agg_file:
                    download_path = request.path.split(res_id)[0] + output_path
                else:
                    download_path = request.path.split("zips")[0] + output_path
                if rest_call:
                    return HttpResponse(json.dumps({'zip_status': 'Not ready',
                                                    'task_id': task.task_id,
                                                    'download_path': download_path}),
                                        content_type="application/json")
                # browser request: stash task info in the session and redirect
                request.session['task_id'] = task.task_id
                request.session['download_path'] = download_path
                return HttpResponseRedirect(res.get_absolute_url())

            # synchronous creation of the zip (all async paths returned above)
            ret_status = create_temp_zip(res_id, input_path, output_path, is_sf_agg_file)
            delete_zip.apply_async((random_hash_path, ),
                                   countdown=(20 * 60))  # delete after 20 minutes
            if not ret_status:
                content_msg = "Zip cannot be created successfully. Check log for details."
                response = HttpResponse()
                if rest_call:
                    response.content = content_msg
                else:
                    response.content = "<h1>" + content_msg + "</h1>"
                return response

            # stream the freshly created zip below
            path = output_path

    bag_modified = istorage.getAVU(res_root, 'bag_modified')
    # make sure if bag_modified is not set to true, we still recreate the bag if the
    # bag file does not exist for some reason to resolve the error to download a nonexistent
    # bag when bag_modified is false due to the flag being out-of-sync with the real bag status
    if bag_modified is None or bag_modified.lower() == "false":
        # check whether the bag file exists
        bag_file_name = res_id + '.zip'
        if res_root.startswith(res_id):
            bag_full_path = os.path.join('bags', bag_file_name)
        else:
            bag_full_path = os.path.join(federated_path, 'bags', bag_file_name)
        # set bag_modified to 'true' if the bag does not exist so that it can be recreated
        # and the bag_modified AVU will be set correctly as well subsequently
        if not istorage.exists(bag_full_path):
            bag_modified = 'true'

    metadata_dirty = istorage.getAVU(res_root, 'metadata_dirty')

    # do on-demand bag creation
    # needs to check whether res_id collection exists before getting/setting AVU on it
    # to accommodate the case where the very same resource gets deleted by another request
    # when it is getting downloaded
    if is_bag_download:
        # send signal for pre_check_bag_flag
        pre_check_bag_flag.send(sender=resource_cls, resource=res)
        if bag_modified is None or bag_modified.lower() == "true":
            if metadata_dirty is None or metadata_dirty.lower() == 'true':
                create_bag_files(res)
            if use_async:
                # task parameter has to be passed in as a tuple or list, hence (res_id,) is needed
                # Note that since we are using JSON for task parameter serialization, no complex
                # object can be passed as parameters to a celery task
                task = create_bag_by_irods.apply_async((res_id,), countdown=3)
                if rest_call:
                    return HttpResponse(json.dumps({'bag_status': 'Not ready',
                                                    'task_id': task.task_id}),
                                        content_type="application/json")
                request.session['task_id'] = task.task_id
                request.session['download_path'] = request.path
                return HttpResponseRedirect(res.get_absolute_url())
            else:
                ret_status = create_bag_by_irods(res_id)
                if not ret_status:
                    content_msg = "Bag cannot be created successfully. Check log for details."
                    response = HttpResponse()
                    if rest_call:
                        response.content = content_msg
                    else:
                        response.content = "<h1>" + content_msg + "</h1>"
                    return response
    elif metadata_dirty is None or metadata_dirty.lower() == 'true':
        if path.endswith("resourcemap.xml") or path.endswith('resourcemetadata.xml'):
            # we need to regenerate the metadata xml files
            create_bag_files(res)

    # send signal for pre download file; this logs the download in tracking
    download_file_name = split_path_strs[-1]
    pre_download_file.send(sender=resource_cls, resource=res,
                           download_file_name=download_file_name,
                           request=request)

    # obtain mime_type to set content_type
    mtype = 'application-x/octet-stream'
    mime_type = mimetypes.guess_type(path)
    if mime_type[0] is not None:
        mtype = mime_type[0]

    # retrieve file size to set up Content-Length header
    # NOTE(review): parses column 4 of `ils -l` output — brittle against iRODS changes.
    stdout = session.run("ils", None, "-l", path)[0].split()
    flen = int(stdout[3])

    # If this path is resource_federation_path, then the file is a local user file
    userpath = '/' + os.path.join(
        getattr(settings, 'HS_USER_IRODS_ZONE', 'hydroshareuserZone'),
        'home',
        getattr(settings, 'HS_LOCAL_PROXY_USER_IN_FED_ZONE', 'localHydroProxy'))

    # Allow reverse proxy if request was forwarded by nginx
    # (HTTP_X_DJANGO_REVERSE_PROXY is 'true')
    # and reverse proxy is possible according to configuration.
    if use_reverse_proxy and getattr(settings, 'SENDFILE_ON', False) and \
            'HTTP_X_DJANGO_REVERSE_PROXY' in request.META:
        # The NGINX sendfile abstraction is invoked as follows:
        # 1. The request to download a file enters this routine via the /rest_download or /download
        #    url in ./urls.py. It is redirected here from Django. The URI contains either the
        #    unqualified resource path or the federated resource path, depending upon whether
        #    the request is local or federated.
        # 2. This deals with unfederated resources by redirecting them to the uri
        #    /irods-data/{resource-id}/... on nginx. This URI is configured to read the file
        #    directly from the iRODS vault via NFS, and does not work for direct access to the
        #    vault due to the 'internal;' declaration in NGINX.
        # 3. This deals with federated resources by reading their path, matching local vaults, and
        #    redirecting to URIs that are in turn mapped to read from appropriate iRODS vaults. At
        #    present, the only one of these is /irods-user, which handles files whose federation
        #    path is stored in the variable 'userpath'.
        # 4. If there is no vault available for the resource, the file is transferred without
        #    NGINX, exactly as it was transferred previously.

        # stop NGINX targets that are non-existent from hanging forever.
        if not istorage.exists(path):
            content_msg = "file path {} does not exist in iRODS".format(path)
            response = HttpResponse(status=404)
            if rest_call:
                response.content = content_msg
            else:
                response.content = "<h1>" + content_msg + "</h1>"
            return response

        if not res.is_federated:
            # invoke X-Accel-Redirect on physical vault file in nginx
            response = HttpResponse(content_type=mtype)
            response['Content-Disposition'] = 'attachment; filename="{name}"'.format(
                name=path.split('/')[-1])
            response['Content-Length'] = flen
            response['X-Accel-Redirect'] = '/'.join([
                getattr(settings, 'IRODS_DATA_URI', '/irods-data'), path])
            return response
        elif res.resource_federation_path == userpath:  # this guarantees a "user" resource
            # invoke X-Accel-Redirect on physical vault file in nginx
            # if path is full user path; strip federation prefix
            if path.startswith(userpath):
                path = path[len(userpath)+1:]
            # invoke X-Accel-Redirect on physical vault file in nginx
            response = HttpResponse(content_type=mtype)
            response['Content-Disposition'] = 'attachment; filename="{name}"'.format(
                name=path.split('/')[-1])
            response['Content-Length'] = flen
            response['X-Accel-Redirect'] = os.path.join(
                getattr(settings, 'IRODS_USER_URI', '/irods-user'), path)
            return response

    # if we get here, none of the above conditions are true
    if flen <= FILE_SIZE_LIMIT:
        options = ('-',)  # we're redirecting to stdout.
        # this unusual way of calling works for federated or local resources
        proc = session.run_safe('iget', None, path, *options)
        response = FileResponse(proc.stdout, content_type=mtype)
        response['Content-Disposition'] = 'attachment; filename="{name}"'.format(
            name=path.split('/')[-1])
        response['Content-Length'] = flen
        return response
    else:
        content_msg = "File larger than 1GB cannot be downloaded directly via HTTP. " \
                      "Please download the large file via iRODS clients."
        response = HttpResponse(status=403)
        if rest_call:
            response.content = content_msg
        else:
            response.content = "<h1>" + content_msg + "</h1>"
        return response
def download(request, path, rest_call=False, use_async=True, use_reverse_proxy=True,
             *args, **kwargs):
    """Serve a download request for a file, folder zip, or resource bag.

    :param request: Django request object; GET params and session are used.
    :param path: public path of the thing to download. Recognized prefixes:
        'bags/{rid}.zip' (resource bag) and 'zips/...' (previously requested
        zip); anything else is treated as a path rooted at a resource id.
    :param rest_call: True when invoked from the REST API; errors are then
        returned as plain content / raised exceptions instead of HTML.
    :param use_async: True to build zips/bags via asynchronous Celery tasks.
    :param use_reverse_proxy: True to allow streaming via the NGINX
        X-Accel-Redirect mechanism when the deployment supports it.
    :return: an HttpResponse / FileResponse / HttpResponseRedirect, depending
        on the case; may also raise PermissionDenied for REST calls.
    """
    split_path_strs = path.split('/')

    # classify the request from the path prefix
    is_bag_download = False
    is_zip_download = False
    is_sf_agg_file = False
    if split_path_strs[0] == 'bags':
        # format is bags/{rid}.zip
        res_id = os.path.splitext(split_path_strs[1])[0]
        is_bag_download = True
    elif split_path_strs[0] == 'zips':
        # follow-up on an asynchronous zip request; the resource id is embedded
        # in the zip path (zips/{date}/{rid}/... — see random_hash_path below)
        if path.endswith('.zip'):
            res_id = os.path.splitext(split_path_strs[2])[0]
        else:
            res_id = os.path.splitext(split_path_strs[1])[0]
        is_zip_download = True
    else:
        # regular download request: path starts with the resource id
        res_id = split_path_strs[0]

    # if the resource does not exist in django, authorized will be false
    res, authorized, _ = authorize(
        request, res_id,
        needed_permission=ACTION_TO_AUTHORIZE.VIEW_RESOURCE,
        raises_exception=False)
    if not authorized:
        response = HttpResponse(status=401)
        content_msg = "You do not have permission to download this resource!"
        if rest_call:
            raise PermissionDenied(content_msg)
        else:
            response.content = "<h1>" + content_msg + "</h1>"
            return response

    # detect whether the path is a single-file aggregation inside a
    # composite resource; such files are zipped together with their metadata
    if res.resource_type == "CompositeResource" and not path.endswith(".zip"):
        for f in ResourceFile.objects.filter(object_id=res.id):
            if path == f.storage_path:
                if f.has_logical_file and f.logical_file.is_single_file_aggregation:
                    is_sf_agg_file = True

    # pick storage backend and iRODS session according to federation status
    if res.resource_federation_path:
        # the resource is stored in federated zone
        istorage = IrodsStorage('federated')
        federated_path = res.resource_federation_path
        path = os.path.join(federated_path, path)
        session = icommands.ACTIVE_SESSION
    else:
        # TODO: From Alva: I do not understand the use case for changing the environment.
        # TODO: This seems an enormous potential vulnerability, as arguments are
        # TODO: passed from the URI directly to IRODS without verification.
        istorage = IrodsStorage()
        federated_path = ''
        if 'environment' in kwargs:
            # build an ad-hoc iRODS session from a stored RodsEnvironment row
            environment = int(kwargs['environment'])
            environment = m.RodsEnvironment.objects.get(pk=environment)
            session = Session("/tmp/django_irods", settings.IRODS_ICOMMANDS_PATH,
                              session_id=uuid4())
            session.create_environment(environment)
            session.run('iinit', None, environment.auth)
        elif getattr(settings, 'IRODS_GLOBAL_SESSION', False):
            session = GLOBAL_SESSION
        elif icommands.ACTIVE_SESSION:
            session = icommands.ACTIVE_SESSION
        else:
            raise KeyError('settings must have IRODS_GLOBAL_SESSION set '
                           'if there is no environment object')

    resource_cls = check_resource_type(res.resource_type)

    # res_root is the iRODS collection of the resource (federation-qualified)
    if federated_path:
        res_root = os.path.join(federated_path, res_id)
    else:
        res_root = res_id

    if is_zip_download or is_sf_agg_file:
        if not path.endswith(".zip"):  # requesting folder that needs to be zipped
            input_path = path.split(res_id)[1]
            # a random component keeps concurrent zip requests from colliding
            random_hash = random.getrandbits(32)
            daily_date = datetime.datetime.today().strftime('%Y-%m-%d')
            random_hash_path = 'zips/{daily_date}/{res_id}/{rand_folder}'.format(
                daily_date=daily_date, res_id=res_id, rand_folder=random_hash)
            output_path = '{random_hash_path}{path}.zip'.format(
                random_hash_path=random_hash_path, path=input_path)

            if res.resource_type == "CompositeResource":
                # regenerate aggregation metadata so the zip includes fresh xml
                aggregation_name = input_path[len('/data/contents/'):]
                res.create_aggregation_xml_documents(aggregation_name=aggregation_name)

            if use_async:
                task = create_temp_zip.apply_async(
                    (res_id, input_path, output_path, is_sf_agg_file), countdown=3)
                delete_zip.apply_async((random_hash_path, ),
                                       countdown=(20 * 60))  # delete after 20 minutes
                # compute the public URL the client should poll/fetch
                if is_sf_agg_file:
                    download_path = request.path.split(res_id)[0] + output_path
                else:
                    download_path = request.path.split("zips")[0] + output_path
                if rest_call:
                    return HttpResponse(json.dumps({
                        'zip_status': 'Not ready',
                        'task_id': task.task_id,
                        'download_path': download_path}),
                        content_type="application/json")
                request.session['task_id'] = task.task_id
                request.session['download_path'] = download_path
                return HttpResponseRedirect(res.get_absolute_url())

            # synchronous path: build the zip inline, then fall through to
            # stream it below (path is rebound to the zip's location)
            ret_status = create_temp_zip(res_id, input_path, output_path,
                                         is_sf_agg_file)
            delete_zip.apply_async((random_hash_path, ),
                                   countdown=(20 * 60))  # delete after 20 minutes
            if not ret_status:
                content_msg = "Zip cannot be created successfully. Check log for details."
                response = HttpResponse()
                if rest_call:
                    response.content = content_msg
                else:
                    response.content = "<h1>" + content_msg + "</h1>"
                return response

            path = output_path

    bag_modified = istorage.getAVU(res_root, 'bag_modified')
    # make sure if bag_modified is not set to true, we still recreate the bag if the
    # bag file does not exist for some reason to resolve the error to download a nonexistent
    # bag when bag_modified is false due to the flag being out-of-sync with the real bag status
    if bag_modified is None or bag_modified.lower() == "false":
        # check whether the bag file exists
        bag_file_name = res_id + '.zip'
        if res_root.startswith(res_id):
            bag_full_path = os.path.join('bags', bag_file_name)
        else:
            bag_full_path = os.path.join(federated_path, 'bags', bag_file_name)
        # set bag_modified to 'true' if the bag does not exist so that it can be recreated
        # and the bag_modified AVU will be set correctly as well subsequently
        if not istorage.exists(bag_full_path):
            bag_modified = 'true'

    metadata_dirty = istorage.getAVU(res_root, 'metadata_dirty')
    # do on-demand bag creation
    # needs to check whether res_id collection exists before getting/setting AVU on it
    # to accommodate the case where the very same resource gets deleted by another request
    # when it is getting downloaded

    if is_bag_download:
        # send signal for pre_check_bag_flag
        pre_check_bag_flag.send(sender=resource_cls, resource=res)
        if bag_modified is None or bag_modified.lower() == "true":
            if metadata_dirty is None or metadata_dirty.lower() == 'true':
                create_bag_files(res)
            if use_async:
                # task parameter has to be passed in as a tuple or list, hence (res_id,)
                # is needed. Note that since we are using JSON for task parameter
                # serialization, no complex object can be passed as parameters to
                # a celery task
                task = create_bag_by_irods.apply_async((res_id, ), countdown=3)
                if rest_call:
                    return HttpResponse(json.dumps({'bag_status': 'Not ready',
                                                    'task_id': task.task_id}),
                                        content_type="application/json")
                request.session['task_id'] = task.task_id
                request.session['download_path'] = request.path
                return HttpResponseRedirect(res.get_absolute_url())
            else:
                ret_status = create_bag_by_irods(res_id)
                if not ret_status:
                    content_msg = "Bag cannot be created successfully. Check log for details."
                    response = HttpResponse()
                    if rest_call:
                        response.content = content_msg
                    else:
                        response.content = "<h1>" + content_msg + "</h1>"
                    return response
    elif metadata_dirty is None or metadata_dirty.lower() == 'true':
        if path.endswith("resourcemap.xml") or path.endswith('resourcemetadata.xml'):
            # we need to regenerate the metadata xml files
            create_bag_files(res)

    # send signal for pre download file
    download_file_name = split_path_strs[-1]
    pre_download_file.send(sender=resource_cls, resource=res,
                           download_file_name=download_file_name,
                           request=request)

    # obtain mime_type to set content_type
    mtype = 'application-x/octet-stream'
    mime_type = mimetypes.guess_type(path)
    if mime_type[0] is not None:
        mtype = mime_type[0]
    # retrieve file size to set up Content-Length header
    # NOTE(review): relies on `ils -l` column layout — size is field 4; verify
    # against the deployed iRODS version's output format
    stdout = session.run("ils", None, "-l", path)[0].split()
    flen = int(stdout[3])

    # If this path is resource_federation_path, then the file is a local user file
    userpath = '/' + os.path.join(
        getattr(settings, 'HS_USER_IRODS_ZONE', 'hydroshareuserZone'),
        'home',
        getattr(settings, 'HS_LOCAL_PROXY_USER_IN_FED_ZONE', 'localHydroProxy'))

    # Allow reverse proxy if request was forwarded by nginx
    # (HTTP_X_DJANGO_REVERSE_PROXY is 'true')
    # and reverse proxy is possible according to configuration.
    if use_reverse_proxy and getattr(settings, 'SENDFILE_ON', False) and \
            'HTTP_X_DJANGO_REVERSE_PROXY' in request.META:

        # The NGINX sendfile abstraction is invoked as follows:
        # 1. The request to download a file enters this routine via the /rest_download or
        #    /download url in ./urls.py. It is redirected here from Django. The URI contains
        #    either the unqualified resource path or the federated resource path, depending
        #    upon whether the request is local or federated.
        # 2. This deals with unfederated resources by redirecting them to the uri
        #    /irods-data/{resource-id}/... on nginx. This URI is configured to read the file
        #    directly from the iRODS vault via NFS, and does not work for direct access to the
        #    vault due to the 'internal;' declaration in NGINX.
        # 3. This deals with federated resources by reading their path, matching local vaults,
        #    and redirecting to URIs that are in turn mapped to read from appropriate iRODS
        #    vaults. At present, the only one of these is /irods-user, which handles files
        #    whose federation path is stored in the variable 'userpath'.
        # 4. If there is no vault available for the resource, the file is transferred without
        #    NGINX, exactly as it was transferred previously.

        # stop NGINX targets that are non-existent from hanging forever.
        if not istorage.exists(path):
            content_msg = "file path {} does not exist in iRODS".format(path)
            response = HttpResponse(status=404)
            if rest_call:
                response.content = content_msg
            else:
                response.content = "<h1>" + content_msg + "</h1>"
            return response

        if not res.is_federated:
            # invoke X-Accel-Redirect on physical vault file in nginx
            response = HttpResponse(content_type=mtype)
            response['Content-Disposition'] = 'attachment; filename="{name}"'.format(
                name=path.split('/')[-1])
            response['Content-Length'] = flen
            response['X-Accel-Redirect'] = '/'.join(
                [getattr(settings, 'IRODS_DATA_URI', '/irods-data'), path])
            return response

        elif res.resource_federation_path == userpath:  # this guarantees a "user" resource
            # invoke X-Accel-Redirect on physical vault file in nginx
            # if path is full user path; strip federation prefix
            if path.startswith(userpath):
                path = path[len(userpath) + 1:]
            # invoke X-Accel-Redirect on physical vault file in nginx
            response = HttpResponse(content_type=mtype)
            response['Content-Disposition'] = 'attachment; filename="{name}"'.format(
                name=path.split('/')[-1])
            response['Content-Length'] = flen
            response['X-Accel-Redirect'] = os.path.join(
                getattr(settings, 'IRODS_USER_URI', '/irods-user'), path)
            return response

    # if we get here, none of the above conditions are true
    # fall back to streaming the file through Django via iget
    if flen <= FILE_SIZE_LIMIT:
        options = ('-', )  # we're redirecting to stdout.
        # this unusual way of calling works for federated or local resources
        proc = session.run_safe('iget', None, path, *options)
        response = FileResponse(proc.stdout, content_type=mtype)
        response['Content-Disposition'] = 'attachment; filename="{name}"'.format(
            name=path.split('/')[-1])
        response['Content-Length'] = flen
        return response
    else:
        content_msg = "File larger than 1GB cannot be downloaded directly via HTTP. " \
                      "Please download the large file via iRODS clients."
        response = HttpResponse(status=403)
        if rest_call:
            response.content = content_msg
        else:
            response.content = "<h1>" + content_msg + "</h1>"
        return response