def check_if_ready(self):
    '''
    Makes sure all the proper environment variables, etc. are present
    to use this job runner. Should be invoked at startup of the Django app.
    '''
    # check that we can reach the Cromwell server
    url = self.CROMWELL_URL + self.VERSION_ENDPOINT
    try:
        response = get_with_retry(url)
    except Exception as ex:
        logger.info('An exception was raised when checking if the remote'
            ' Cromwell runner was ready.'
            ' The exception reads: {ex}'.format(ex=ex))
        raise ImproperlyConfigured(
            'Failed to check the remote Cromwell runner. See logs.')

    if response.status_code != 200:
        logger.info('The Cromwell server located at: {url}'
            ' was not ready.'.format(url=url))
        raise ImproperlyConfigured('Failed to reach Cromwell server.')

    bucket_region = get_storage_backend().get_bucket_region(
        self.CROMWELL_BUCKET)
    instance_region = get_instance_region()
    if bucket_region != instance_region:
        raise ImproperlyConfigured(
            'The application is running on a'
            ' machine in the following region: {instance_region}. The'
            ' Cromwell bucket was found in {bucket_region}. They should'
            ' be located in the same region.'.format(
                bucket_region=bucket_region,
                instance_region=instance_region))
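# For context, a minimal sketch of what the `get_with_retry` helper used
# above might look like, assuming it wraps `requests` with a simple linear
# backoff. This is an illustration inferred from the call site, not the
# actual implementation.
import time
import requests

def get_with_retry(url, max_retries=3, backoff_seconds=2, timeout=5):
    '''
    Issue a GET request, retrying on connection-level failures.
    Returns the `requests.Response` from the first successful attempt.
    '''
    for attempt in range(1, max_retries + 1):
        try:
            return requests.get(url, timeout=timeout)
        except requests.exceptions.RequestException:
            if attempt == max_retries:
                # out of retries; let the caller handle the failure
                raise
            time.sleep(backoff_seconds * attempt)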
def get(self, request, *args, **kwargs):
    user = request.user
    resource_pk = kwargs['pk']
    try:
        r = check_resource_request_validity(user, resource_pk)
    except NoResourceFoundException:
        return Response(status=status.HTTP_404_NOT_FOUND)
    except InactiveResourceException:
        return Response({'error': 'The resource is inactive.'},
                        status=status.HTTP_400_BAD_REQUEST)
    except OwnershipException:
        return Response(status=status.HTTP_403_FORBIDDEN)

    # Get the storage backend since we need to know whether it's local or not
    storage_backend = get_storage_backend()
    url = storage_backend.get_download_url(r)
    if not url:
        logger.error('Encountered a problem when preparing download for resource'
            ' with pk={u}'.format(u=resource_pk))
        return Response(status=status.HTTP_500_INTERNAL_SERVER_ERROR)

    if storage_backend.is_local_storage:
        download_url = request.build_absolute_uri(
            reverse('download-resource', kwargs={'pk': resource_pk}))
        download_type = 'local'
    else:
        download_url = url
        download_type = 'remote'

    # The 'download_type' is a flag for how the frontend should interpret the
    # response. For example, if it's a direct download from the local server,
    # then it may choose to call the download url expecting a blob.
    return Response({'url': download_url, 'download_type': download_type})
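# For illustration, the payload returned by this view takes one of two
# shapes depending on the storage backend (the URLs below are hypothetical):
#
#   local storage:  {'url': 'https://<host>/api/resources/<pk>/download/',
#                    'download_type': 'local'}
#   remote storage: {'url': 'https://storage.googleapis.com/<bucket>/<object>?<signature>',
#                    'download_type': 'remote'}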
def get_local_path_from_uuid(self, resource_uuid):
    '''
    Given a UUID for a Resource, return a path to that file
    on the local filesystem.
    '''
    r = self.get_resource(resource_uuid)
    local_path = get_storage_backend().get_local_resource_path(r)
    return local_path
def post(self, request, *args, **kwargs):
    logger.info('POSTing to create a new resource from bucket-based data')
    try:
        resource_url = request.data[self.BUCKET_PATH]
    except KeyError:
        return Response(
            {self.BUCKET_PATH: 'You must supply this required key.'},
            status=status.HTTP_400_BAD_REQUEST)

    try:
        resource_type = request.data[self.RESOURCE_TYPE]
    except KeyError:
        resource_type = None

    # We require the ability to interact with our storage backend.
    storage_backend = get_storage_backend()

    # If the storage backend happens to be local storage, we immediately fail
    # the request. This could change, however, if a different decision is made.
    if storage_backend.is_local_storage:
        return Response(
            {self.BUCKET_PATH: 'The storage system does not support this endpoint.'},
            status=status.HTTP_400_BAD_REQUEST)

    # If here, we are using a non-local storage backend (which, for us, means
    # bucket-based). We still need to ensure the path given was real and accessible.
    if storage_backend.resource_exists(resource_url):
        basename = os.path.basename(resource_url)
        # create a Resource instance
        r = Resource.objects.create(
            path=resource_url,
            owner=request.user,
            name=basename)
        if resource_type:
            async_validate_resource_and_store.delay(r.pk, resource_type)
        else:
            # no resource type was requested, so we just directly store it.
            final_path = storage_backend.store(r)
            r.path = final_path
            r.save()
        resource_serializer = ResourceSerializer(r, context={'request': request})
        return Response(resource_serializer.data,
                        status=status.HTTP_201_CREATED)
    else:
        msg = ('The file located at {p} could not be accessed. If the path is indeed'
               ' correct, then ensure that it is publicly accessible.').format(
                   p=resource_url)
        return Response({self.BUCKET_PATH: msg},
                        status=status.HTTP_400_BAD_REQUEST)
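# A hedged usage example for the endpoint above. The route and the literal
# key names ('bucket_path', 'resource_type') are assumptions standing in for
# the URL configuration and the view's BUCKET_PATH/RESOURCE_TYPE class
# attributes; 'MTX' is a placeholder type identifier.
import requests

response = requests.post(
    'https://<host>/api/resources/from-bucket/',
    headers={'Authorization': 'Bearer <token>'},
    json={
        'bucket_path': 'gs://my-bucket/data/counts.tsv',
        'resource_type': 'MTX'
    })
# Expect 201 with the serialized Resource on success, or 400 with a keyed
# error message otherwise.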
def move_resource_to_final_location(resource_instance):
    '''
    `resource_instance` is the database object.
    '''
    try:
        return get_storage_backend().store(resource_instance)
    except Exception:
        # alert_admins('A backend storage failure occurred for resource'
        #     ' with pk={x}'.format(x=resource_instance.pk)
        # )
        # Since this was an unexpected issue with storing the item, we
        # effectively disable the resource. Otherwise, unexpected things
        # can happen downstream.
        resource_instance.status = Resource.UNEXPECTED_STORAGE_ERROR
        resource_instance.save()
        raise
def get_resource_view(resource_instance, query_params=None):
    '''
    Returns a "view" of the resource_instance in JSON format. Only
    valid for certain resource types and assumes that the resource
    is active.
    '''
    logger.info('Retrieving data view for resource: {resource}.'.format(
        resource=resource_instance))

    if not resource_instance.resource_type:
        logger.info('No resource type was known for resource: {resource}.'.format(
            resource=resource_instance))
        return

    if query_params is None:
        query_params = {}

    if RESOURCE_MAPPING[resource_instance.resource_type] in RESOURCE_TYPES_WITHOUT_CONTENTS_VIEW:
        # prevents us from pulling remote resources if we can't view the contents anyway
        return None
    else:
        local_path = get_storage_backend().get_local_resource_path(resource_instance)
        return get_contents(local_path,
                            resource_instance.resource_type,
                            resource_instance.file_extension,
                            query_params)
def check_that_resource_exists(path):
    '''
    Given a path, return a boolean indicating whether the file
    at the specified path exists.
    '''
    return get_storage_backend().resource_exists(path)
def validate_resource(resource_instance, requested_resource_type):
    '''
    This function performs validation against the requested resource
    type. If that fails, it reverts to the original type (or remains
    None if the resource has never been successfully validated).
    '''
    if requested_resource_type is not None:
        logger.info('Validate resource {x} against requested type of: {t}'.format(
            x=str(resource_instance.id), t=requested_resource_type))

        resource_instance.status = Resource.VALIDATING

        # check that the file extension is consistent with the requested type:
        try:
            type_is_consistent = check_extension(resource_instance, requested_resource_type)
            if not type_is_consistent:
                logger.info('The requested type was not consistent with the'
                    ' file extension. Skipping validation.')
                resource_instance.status = Resource.ERROR_WITH_REASON.format(
                    ex='Requested resource type'
                    ' was not consistent with the file extension')
                return
        except Exception:
            raise Exception('There was an unexpected problem that occurred when parsing the file'
                ' extension. Please check that the file extension (csv, tsv, etc.) does not'
                ' contain unexpected content. It may be easiest to rename the file and upload again.')

        # The `requested_resource_type` is the shorthand identifier.
        # This returns an actual resource class implementation.
        try:
            resource_class_instance = get_resource_type_instance(requested_resource_type)
        except KeyError:
            raise Exception('The key {k} was not a known resource type.'.format(
                k=requested_resource_type))
        except Exception:
            raise Exception('There was an unexpected error when retrieving the validator'
                ' for the requested resource type.')

        if resource_class_instance.performs_validation():
            logger.info('Since the resource class permits validation, go and'
                ' validate this resource.')

            # Regardless of whether we are validating a new upload or changing the type
            # of an existing file, the file is already located at its "final" location,
            # which is dependent on the storage backend. Now, if the storage backend
            # is remote (e.g. bucket storage), we need to pull the file locally to
            # perform validation.
            # Note that failures to pull the file locally will raise an exception, which we
            # catch and respond to.
            try:
                local_path = get_storage_backend().get_local_resource_path(resource_instance)
            except Exception:
                # We know something went wrong, but here we modify the error message
                # to be more user friendly for display purposes.
                raise Exception('Failed during validation. An unexpected issue occurred when'
                    ' moving the file for inspection. An administrator has been notified. You may'
                    ' attempt to validate again.')

            try:
                is_valid, message = resource_class_instance.validate_type(
                    local_path, resource_instance.file_extension)
            except Exception:
                # It's expected that files can be invalid. What is NOT expected, however,
                # are general Exceptions that can be raised due to unforeseen issues
                # that could occur during the validation. Catch those:
                logger.info('An exception was raised when attempting to validate'
                    ' the Resource {pk} located at {local_path}'.format(
                        pk=str(resource_instance.pk), local_path=local_path))
                raise Exception(Resource.UNEXPECTED_VALIDATION_ERROR)
        else:
            # resource type does not include validation
            is_valid = True

        if is_valid:
            handle_valid_resource(resource_instance,
                                  resource_class_instance,
                                  requested_resource_type)
        else:
            if message and len(message) > 0:
                handle_invalid_resource(resource_instance,
                                        requested_resource_type,
                                        message)
            else:
                handle_invalid_resource(resource_instance,
                                        requested_resource_type)
    else:
        # requested_resource_type was None
        resource_instance.resource_type = None
        resource_instance.status = Resource.READY
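# A minimal sketch of the interface that `validate_resource` assumes on the
# resource class returned by `get_resource_type_instance`. Method names and
# signatures mirror the call sites above; the class name and bodies are
# illustrative only.
class ExampleTableResource:

    STANDARD_FORMAT = 'tsv'

    def performs_validation(self):
        # whether this resource type supports content validation at all
        return True

    def validate_type(self, local_path, file_extension):
        # returns an (is_valid, message) tuple; `message` explains a failure
        return True, None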
def handle_valid_resource(resource, resource_class_instance, requested_resource_type):
    '''
    Once a Resource has been successfully validated, this function performs
    some final operations such as moving the file and extracting metadata.

    `resource` is the database object
    `resource_class_instance` is one of the DataResource subclasses
    '''
    # If the resource class is capable of performing validation, we enter here.
    # This does NOT mean that any of the standardization, etc. steps occur, but
    # this admits that possibility.
    # If the resource type is such that it does not support validation, then we
    # skip this part, as we have no need to pull the file locally (if the storage
    # backend is remote).
    if resource_class_instance.performs_validation():
        # Actions below require local access to the file:
        try:
            local_path = get_storage_backend().get_local_resource_path(resource)
        except Exception:
            # We know something went wrong, but here we modify the error message
            # to be more user friendly for display purposes.
            raise Exception('Failed following successful validation. An unexpected issue occurred when'
                ' moving the file. An administrator has been notified. This may be a temporary error,'
                ' so you may try again to validate.')

        logger.info('The local path prior to standardization is: {p}'.format(
            p=local_path))

        # the resource was valid, so first save it in our standardized format
        new_path, new_name = resource_class_instance.save_in_standardized_format(
            local_path, resource.name, resource.file_extension)

        # need to know the "updated" file extension
        file_extension = resource_class_instance.STANDARD_FORMAT

        # delete the "original" resource if the standardization ended up
        # making a different file
        if new_path != local_path:
            logger.info('The standardization changed the path.'
                ' Go delete the non-standardized file: {p}'.format(
                    p=resource.path))
            get_storage_backend().delete(resource.path)

            # Temporarily change this so it doesn't point at the original path
            # in the non-standardized format. This way the standardized file will be
            # sent to the final storage location. Once the file is in the 'final'
            # storage location, the path member will be edited to reflect that.
            resource.path = new_path
        else:
            logger.info('Standardization did not change the path...')

        if new_name != resource.name:
            # Change the name of the resource.
            # Recall that upon saving, this will also change the file_extension field.
            resource.name = new_name
    else:
        # Since we did not have to perform any standardization, etc., simply
        # set the necessary variables without change.
        new_path = resource.path
        new_name = resource.name
        file_extension = resource.file_extension

    # Since the resource was valid, we can also fill in the metadata.
    # Note that the metadata extraction could fail for type issues, and we have
    # to plan for failures there. For instance, a table can be compliant, but the
    # resulting metadata could violate a type constraint (e.g. if a string-based
    # attribute does not match our regex, is too long, etc.)
    try:
        metadata = resource_class_instance.extract_metadata(new_path, file_extension)
    except ValidationError as ex:
        logger.info('Caught a ValidationError when extracting metadata from'
            ' resource at path: {p}'.format(p=new_path))
        err_list = []
        for k, v in ex.get_full_details().items():
            # v is a nested dict
            msg = v['message']
            err_str = '{k}:{s}'.format(k=k, s=str(msg))
            err_list.append(err_str)
        # We don't want to set the resource type since the metadata failed, as that
        # sets up inconsistencies between the validation of a format and its
        # "usability" in WebMeV. For instance, a compliant matrix could have
        # excessively long sample names, and we don't want to permit that.
        resource.resource_type = None
        raise Exception(Resource.ERROR_WITH_REASON.format(ex=','.join(err_list)))
    except Exception as ex:
        logger.info('Encountered an exception when extracting metadata: {ex}'.format(
            ex=ex))
        resource.resource_type = None
        raise Exception('Encountered an unexpected issue when extracting metadata.'
            ' An administrator has been notified.')

    try:
        add_metadata_to_resource(resource, metadata)
        resource.status = Resource.READY
    except Exception:
        resource.resource_type = None
        raise Exception('Encountered an unexpected issue when adding metadata to the resource.'
            ' An administrator has been notified.')

    try:
        # We have to send the file to the final storage. If we are using local storage
        # this is trivial. However, if we are using remote storage, the data saved
        # in the standardized format needs to be pushed there also.
        final_path = move_resource_to_final_location(resource)

        # Only at this point (when we have successfully validated, moved, extracted
        # metadata, etc.) do we set the new path and resource type on the database object.
        resource.path = final_path
        resource.resource_type = requested_resource_type
    except Exception as ex:
        logger.info('Exception when moving valid final resource after extracting/appending metadata.'
            ' Exception was {ex}'.format(ex=str(ex)))
        resource.resource_type = None
        raise Exception('Encountered an unexpected issue when moving your validated resource.'
            ' An administrator has been notified. You may also attempt to validate again.')
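# Continuing the hypothetical ExampleTableResource sketch above, these are
# the standardization and metadata hooks that `handle_valid_resource`
# invokes. Again, the signatures mirror the call sites; the bodies are
# placeholders, not the actual implementation.
class ExampleTableResource:

    STANDARD_FORMAT = 'tsv'

    def save_in_standardized_format(self, local_path, name, file_extension):
        # Rewrite the file in the standard format if needed; return the
        # (possibly unchanged) local path and display name.
        return local_path, name

    def extract_metadata(self, path, file_extension):
        # Parse the file and return a metadata mapping. May raise a
        # ValidationError if the contents violate a type constraint.
        return {}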
def get_resource_size(resource_instance):
    '''
    Return the size (in bytes) of the file backing the given Resource.
    '''
    return get_storage_backend().get_filesize(resource_instance.path)
def get(self, request, *args, **kwargs):
    user = request.user
    resource_pk = kwargs['pk']
    try:
        r = check_resource_request_validity(user, resource_pk)
    except NoResourceFoundException:
        return Response(status=status.HTTP_404_NOT_FOUND)
    except InactiveResourceException:
        return Response({'error': 'The resource is inactive.'},
                        status=status.HTTP_400_BAD_REQUEST)
    except OwnershipException:
        return Response(status=status.HTTP_403_FORBIDDEN)

    # Get the storage backend since we need to know whether it's local or not
    storage_backend = get_storage_backend()

    # if we don't have local storage, block this style of direct download
    if not storage_backend.is_local_storage:
        return Response(
            {'error': 'The server configuration prevents direct server downloads.'},
            status=status.HTTP_400_BAD_REQUEST)

    # The requester can access the resource and it is active. OK so far.
    # Check the file size: we don't want large files tying up the server.
    # Those downloads should be performed by something else, like via Dropbox.
    # HOWEVER, this is only an issue if the storage backend is local.
    # Redirects for bucket storage can obviously be handled since they download
    # directly from the bucket, which will not tie up our server.
    size_in_bytes = r.size
    if size_in_bytes > settings.MAX_DOWNLOAD_SIZE_BYTES:
        msg = ('The resource size exceeds our limits for a direct'
               ' download. Please use one of the alternative download methods'
               ' more suited for larger files.')
        return Response({'size': msg}, status=status.HTTP_400_BAD_REQUEST)

    # The size is acceptable (if downloading from the server) or moot (if
    # downloading directly from a bucket). Now, depending on the storage
    # backend, we return different things. If we have local storage, we just
    # return the file contents. If we have remote storage, we return a signed
    # url and issue a 302 to redirect them. Hence, `url` can be a local path
    # or a remote url depending on the storage backend we are using.
    url = storage_backend.get_download_url(r)

    if not url:
        logger.error('Encountered a problem when preparing download for resource'
            ' with pk={u}'.format(u=resource_pk))
        return Response(status=status.HTTP_500_INTERNAL_SERVER_ERROR)

    if os.path.exists(url):
        contents = open(url, 'rb')
        mime_type, _ = mimetypes.guess_type(url)
        if mime_type is None:
            # fall back to a generic binary type if the type can't be guessed
            mime_type = 'application/octet-stream'
        response = HttpResponse(content=contents)
        response['Content-Type'] = mime_type
        response['Content-Disposition'] = 'attachment; filename="%s"' % os.path.basename(url)
        return response
    else:
        logger.error('Local storage was specified, but the resource at path {p}'
            ' was not found.'.format(p=url))
        return Response(status=status.HTTP_500_INTERNAL_SERVER_ERROR)
def delete_file(path):
    '''
    Deletes a file. Can be a local or remote resource.
    '''
    logger.info('Requesting deletion of {path}'.format(path=path))
    get_storage_backend().delete(path)
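# Taken together, the helpers in this module assume a storage backend that
# exposes roughly the following interface. This is a sketch inferred from
# the call sites above, not the actual base class.
class StorageBackendInterface:

    # True for local filesystem storage; False for bucket-based storage.
    is_local_storage = False

    def store(self, resource_instance):
        '''Move the resource's file to its final location; return the new path.'''

    def delete(self, path):
        '''Remove the file at the given local or remote path.'''

    def resource_exists(self, path):
        '''Return True if a file exists at the given path.'''

    def get_local_resource_path(self, resource_instance):
        '''Return a local filesystem path, pulling from remote storage if needed.'''

    def get_download_url(self, resource_instance):
        '''Return a local path or a signed URL suitable for download.'''

    def get_filesize(self, path):
        '''Return the size (in bytes) of the file at the given path.'''

    def get_bucket_region(self, bucket_name):
        '''Return the region of the named bucket (remote backends only).'''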