def check_if_ready(self):
    '''
    Verify that the environment is configured so this job runner can
    be used (Cromwell server reachable, bucket in the right region).
    Should be invoked at startup of the Django app.

    Raises ImproperlyConfigured on any failed check.
    '''
    # First, confirm the Cromwell server responds at its version endpoint.
    version_url = self.CROMWELL_URL + self.VERSION_ENDPOINT
    try:
        response = get_with_retry(version_url)
    except Exception as ex:
        logger.info(
            'An exception was raised when checking if the remote Cromwell runner was ready.'
            ' The exception reads: {ex}'.format(ex=ex))
        raise ImproperlyConfigured(
            'Failed to check the remote Cromwell runner. See logs.')
    if response.status_code != 200:
        logger.info('The Cromwell server located at: {url}'
                    ' was not ready.'.format(url=version_url))
        raise ImproperlyConfigured('Failed to reach Cromwell server.')

    # Second, the Cromwell bucket must be co-located with this instance.
    instance_region = get_instance_region()
    bucket_region = get_storage_backend().get_bucket_region(
        self.CROMWELL_BUCKET)
    if bucket_region != instance_region:
        raise ImproperlyConfigured(
            'The application is running on a'
            ' machine in the following region: {instance_region}. The'
            ' Cromwell bucket was found in {bucket_region}. They should'
            ' be located in the same region.'.format(
                bucket_region=bucket_region,
                instance_region=instance_region))
Exemple #2
0
    def get(self, request, *args, **kwargs):
        '''
        Returns a payload containing a download URL for the requested
        resource plus a 'download_type' flag telling the frontend
        whether the URL points at this server ('local') or at remote
        storage ('remote').
        '''
        resource_pk = kwargs['pk']
        try:
            resource = check_resource_request_validity(
                request.user, resource_pk)
        except NoResourceFoundException:
            return Response(status=status.HTTP_404_NOT_FOUND)
        except InactiveResourceException:
            return Response({'error': 'The resource is inactive.'},
                            status=status.HTTP_400_BAD_REQUEST)
        except OwnershipException:
            return Response(status=status.HTTP_403_FORBIDDEN)

        # The storage backend determines whether this is a local or
        # remote download.
        backend = get_storage_backend()
        backend_url = backend.get_download_url(resource)
        if not backend_url:
            logger.error(
                'Encountered a problem when preparing download for resource'
                ' with pk={u}'.format(u=resource_pk))
            return Response(status=status.HTTP_500_INTERNAL_SERVER_ERROR)

        if backend.is_local_storage:
            # Point the client back at our own direct-download endpoint.
            download_type = 'local'
            download_url = request.build_absolute_uri(
                reverse('download-resource', kwargs={'pk': resource_pk}))
        else:
            download_type = 'remote'
            download_url = backend_url

        # the 'download_type' is a flag for how the frontend should interpret the response.
        # For example, if it's a direct download from the local server,
        # then it may choose to call the download url expecting a blob.
        return Response({'url': download_url, 'download_type': download_type})
Exemple #3
0
 def get_local_path_from_uuid(self, resource_uuid):
     '''
     Look up the Resource identified by `resource_uuid` and return
     the path to its file on the local filesystem.
     '''
     resource = self.get_resource(resource_uuid)
     return get_storage_backend().get_local_resource_path(resource)
    def post(self, request, *args, **kwargs):
        '''
        Creates a new Resource from a file already residing in a
        storage bucket.

        Requires `self.BUCKET_PATH` in the request payload. An optional
        `self.RESOURCE_TYPE` key triggers asynchronous validation;
        without it the file is stored directly. Rejected outright when
        the storage backend is local.
        '''
        logger.info('POSTing to create a new resource from bucket-based data')
        try:
            resource_url = request.data[self.BUCKET_PATH]
        except KeyError:
            return Response(
                {self.BUCKET_PATH: 'You must supply this required key.'},
                status=status.HTTP_400_BAD_REQUEST)

        # The resource type is optional; None means "store without validation".
        resource_type = request.data.get(self.RESOURCE_TYPE)

        # We require the ability to interact with our storage backend.
        storage_backend = get_storage_backend()

        # If the storage backend happens to be local storage, we immediately fail
        # the request. This could change, however, if a different decision is made.
        if storage_backend.is_local_storage:
            return Response(
                {
                    self.BUCKET_PATH:
                    'The storage system does not support this endpoint.'
                },
                status=status.HTTP_400_BAD_REQUEST)

        # If here, we are using a non-local storage
        # backend (which, for us, means bucket-based).
        # We still need to ensure the path given was real and accessible
        if not storage_backend.resource_exists(resource_url):
            # Bug fix: the original returned the raw template (with a
            # literal '{p}') because .format() was never applied.
            msg = (
                'The file located at {p} could not be accessed. If the path is indeed'
                ' correct, then ensure that it is publicly accessible.').format(
                    p=resource_url)
            return Response({self.BUCKET_PATH: msg},
                            status=status.HTTP_400_BAD_REQUEST)

        basename = os.path.basename(resource_url)

        # create a Resource instance
        r = Resource.objects.create(path=resource_url,
                                    owner=request.user,
                                    name=basename)
        if resource_type:
            async_validate_resource_and_store.delay(r.pk, resource_type)
        else:
            # no resource type was requested, so we just directly store it.
            final_path = storage_backend.store(r)
            r.path = final_path
            r.save()
        resource_serializer = ResourceSerializer(
            r, context={'request': request})
        return Response(resource_serializer.data,
                        status=status.HTTP_201_CREATED)
Exemple #5
0
def move_resource_to_final_location(resource_instance):
    '''
    Sends the file backing `resource_instance` (the database object) to
    its final storage location via the storage backend.

    Returns the final path on success. On failure, marks the resource
    with an error status, saves it, and re-raises the original
    exception.
    '''
    try:
        return get_storage_backend().store(resource_instance)
    except Exception:
        # TODO: notify admins of the storage failure (an alert_admins
        # call was previously stubbed out here).
        resource_instance.status = Resource.UNEXPECTED_STORAGE_ERROR
        # Since this was an unexpected issue with storing the item, we
        # effectively disable the resource. Otherwise, unexpected things
        # can happen downstream
        resource_instance.save()
        # Bare `raise` preserves the original traceback, unlike `raise ex`.
        raise
Exemple #6
0
def get_resource_view(resource_instance, query_params=None):
    '''
    Returns a "view" of the resource_instance in JSON-format.

    Only valid for certain resource types and assumes
    that the resource is active.

    `query_params` is an optional dict forwarded to the content
    retrieval. The default is None (not `{}`) to avoid the shared
    mutable-default-argument pitfall; callers passing a dict see no
    behavioral change. Returns None when no view can be produced.
    '''
    if query_params is None:
        query_params = {}

    logger.info('Retrieving data view for resource: {resource}.'.format(
        resource=resource_instance))

    if not resource_instance.resource_type:
        logger.info(
            'No resource type was known for resource: {resource}.'.format(
                resource=resource_instance))
        return None

    resource_class = RESOURCE_MAPPING[resource_instance.resource_type]
    if resource_class in RESOURCE_TYPES_WITHOUT_CONTENTS_VIEW:
        # prevents us from pulling remote resources if we can't view the contents anyway
        return None

    local_path = get_storage_backend().get_local_resource_path(
        resource_instance)
    return get_contents(local_path, resource_instance.resource_type,
                        resource_instance.file_extension, query_params)
Exemple #7
0
def check_that_resource_exists(path):
    '''
    Returns a boolean indicating whether a file exists at `path`
    (which may be local or remote, per the storage backend).
    '''
    backend = get_storage_backend()
    return backend.resource_exists(path)
Exemple #8
0
def validate_resource(resource_instance, requested_resource_type):
    '''
    This function performs validation against the requested resource
    type.  If that fails, reverts to the original type (or remains None
    if the resource has never been successfully validated).

    `resource_instance` is the database object (mutated in place:
    status and resource_type fields). `requested_resource_type` is the
    shorthand type identifier, or None.

    NOTE(review): this function never calls resource_instance.save() --
    presumably the caller persists the changes. Confirm against callers.
    '''
    if requested_resource_type is not None:

        logger.info(
            'Validate resource {x} against requested type of: {t}'.format(
                x=str(resource_instance.id), t=requested_resource_type))
        resource_instance.status = Resource.VALIDATING

        # check the file extension is consistent with the requested type:
        try:
            type_is_consistent = check_extension(resource_instance,
                                                 requested_resource_type)
            if not type_is_consistent:
                logger.info(
                    'The requested type was not consistent with the file extension. Skipping validation.'
                )
                # Record the reason in the status and stop -- no exception
                # is raised for this expected, user-correctable failure.
                resource_instance.status = Resource.ERROR_WITH_REASON.format(
                    ex='Requested resource type'
                    ' was not consistent with the file extension')
                return
        except Exception as ex:
            # Re-raise with a user-friendly message; the original
            # exception details are intentionally dropped here.
            raise Exception(
                'There as an unexpected problem that occurred when parsing the file'
                ' extension. Please check that the file extension (csv, tsv, etc.) does not'
                ' contain unexpected content. It may be easiest to rename the file and upload again.'
            )

        # The `requested_resource_type` is the shorthand identifier.
        # This returns an actual resource class implementation
        try:
            resource_class_instance = get_resource_type_instance(
                requested_resource_type)
        except KeyError as ex:
            raise Exception(
                'The key {k} was not a known resource type.'.format(
                    k=requested_resource_type))
        except Exception as ex:
            raise Exception(
                'There was an unexpected error when retrieving the validator'
                ' for the requested resource type.')
        if resource_class_instance.performs_validation():

            logger.info('Since the resource class permits validation, go and'
                        ' validate this resource.')

            # Regardless of whether we are validating a new upload or changing the type
            # of an existing file, the file is already located at its "final" location
            # which is dependent on the storage backend.  Now, if the storage backend
            # is remote (e.g. bucket storage), we need to pull the file locally to
            # perform validation.
            # Note that failures to pull the file locally will raise an exception, which we
            # catch and respond to
            try:
                local_path = get_storage_backend().get_local_resource_path(
                    resource_instance)
            except Exception as ex:
                # We know something went wrong, but here modify the error message to be more user
                # friendly for display purposes.
                raise Exception(
                    'Failed during validation. An unexpected issue occurred when'
                    ' moving the file for inspection. An administrator has been notified. You may'
                    ' attempt to validate again.')

            try:
                # `message` is only meaningful when is_valid is False; it
                # carries the human-readable reason for the failure.
                is_valid, message = resource_class_instance.validate_type(
                    local_path, resource_instance.file_extension)
            except Exception as ex:
                # It's expected that files can be invalid. What is NOT expected, however,
                # are general Exceptions that can be raised due to unforeseen issues
                # that could occur duing the validation. Catch those
                logger.info(
                    'An exception was raised when attempting to validate'
                    ' the Resource {pk} located at {local_path}'.format(
                        pk=str(resource_instance.pk), local_path=local_path))
                raise Exception(Resource.UNEXPECTED_VALIDATION_ERROR)

        else:  # resource type does not include validation
            # No validation to perform, so treat as valid. Note that
            # `message` is not bound on this path; it is only read below
            # when is_valid is False, which cannot happen here.
            is_valid = True

        if is_valid:
            handle_valid_resource(resource_instance, resource_class_instance,
                                  requested_resource_type)
        else:
            if message and len(message) > 0:
                handle_invalid_resource(resource_instance,
                                        requested_resource_type, message)
            else:
                handle_invalid_resource(resource_instance,
                                        requested_resource_type)

    else:  # requested_resource_type was None
        # No type requested: clear the type and mark the resource ready.
        resource_instance.resource_type = None
        resource_instance.status = Resource.READY
Exemple #9
0
def handle_valid_resource(resource, resource_class_instance,
                          requested_resource_type):
    '''
    Once a Resource has been successfully validated, this function does some
    final operations such as moving the file and extracting metadata.

    `resource` is the database object
    `resource_class_instance` is one of the DataResource subclasses
    `requested_resource_type` is the shorthand type identifier that will
    be set on the resource once everything succeeds.

    Raises Exception with a user-facing message on any failure; on
    metadata failures the resource_type is reset to None first.
    '''
    # if the resource class is capable of performing validation, we enter here.
    # This does NOT mean that any of the standardization, etc. steps occur, but
    # this admits that possibility.
    # If the resource type is such that is does not support validation, then we
    # skip this part as we have no need to pull the file locally (if the storage
    # backend is remote)
    if resource_class_instance.performs_validation():

        # Actions below require local access to the file:
        try:
            local_path = get_storage_backend().get_local_resource_path(
                resource)
        except Exception as ex:
            # We know something went wrong, but here modify the error message to be more user
            # friendly for display purposes.
            raise Exception(
                'Failed following successful validation. An unexpected issue occurred when'
                ' moving the file. An administrator has been notified. This may be a temporary error,'
                ' so you may try again to validate.')

        logger.info('The local path prior to standardization is: {p}'.format(
            p=local_path))

        # the resource was valid, so first save it in our standardized format
        new_path, new_name = resource_class_instance.save_in_standardized_format(local_path, \
            resource.name, resource.file_extension)
        # (leftover debug print statements were removed here)

        # need to know the "updated" file extension
        file_extension = resource_class_instance.STANDARD_FORMAT

        # delete the "original" resource, if the standardization ended up making
        # a different file
        if new_path != local_path:
            logger.info('The standardization changed the path. '
                        'Go delete the non-standardized file: {p}'.format(
                            p=resource.path))
            get_storage_backend().delete(resource.path)

            # temporarily change this so it doesn't point at the original path
            # in the non-standardized format. This way the standardized file will be
            # sent to the final storage location. Once the file is in the 'final'
            # storage location, the path member will be edited to reflect that
            resource.path = new_path

        else:
            logger.info('Standardization did not change the path...')

        if new_name != resource.name:
            # change the name of the resource
            # Recall that upon saving, this will also change the file_extension field.
            resource.name = new_name

    else:
        # since we did not have to perform any standardization, etc. simply
        # set the necessary variables without change.
        new_path = resource.path
        new_name = resource.name
        file_extension = resource.file_extension

    # since the resource was valid, we can also fill-in the metadata
    # Note that the metadata could fail for type issues and we have to plan
    # for failures there. For instance, a table can be compliant, but the
    # resulting metadata could violate a type constraint (e.g. if a string-based
    # attribute does not match our regex, is too long, etc.)
    try:
        metadata = resource_class_instance.extract_metadata(
            new_path, file_extension)
    except ValidationError as ex:
        logger.info('Caught a ValidationError when extracting metadata from'
                    ' resource at path: {p}'.format(p=new_path))
        err_list = []
        for k, v in ex.get_full_details().items():
            # v is a nested dict
            msg = v['message']
            err_str = '{k}:{s}'.format(k=k, s=str(msg))
            err_list.append(err_str)
        # don't want to set the resource type since the metadata failed as it sets up
        # inconsistencies between the validation of a format and its "usability" in WebMeV
        # For instance, a compliant matrix could have excessively long sample names and we don't
        # want to permit that.
        resource.resource_type = None
        raise Exception(
            Resource.ERROR_WITH_REASON.format(ex=','.join(err_list)))
    except Exception as ex:
        logger.info(
            'Encountered an exception when extracting metadata: {ex}'.format(
                ex=ex))
        resource.resource_type = None
        raise Exception(
            'Encountered an unexpected issue when extracting metadata.'
            ' An administrator has been notified.')

    try:
        add_metadata_to_resource(resource, metadata)
        resource.status = Resource.READY
    except Exception as ex:
        resource.resource_type = None
        raise Exception(
            'Encountered an unexpected issue when adding metadata to the resource.'
            ' An administrator has been notified.')

    try:
        # have to send the file to the final storage. If we are using local storage
        # this is trivial. However, if we are using remote storage, the data saved
        # in the standardized format needs to be pushed there also.
        final_path = move_resource_to_final_location(resource)

        # Only at this point (when we have successfully validated, moved, extracted metadata, etc.)
        # do we set the new path and resource type on the database object.
        resource.path = final_path
        resource.resource_type = requested_resource_type
    except Exception as ex:
        logger.info(
            'Exception when moving valid final resource after extracting/appending metadata.'
            ' Exception was {ex}'.format(ex=str(ex)))
        resource.resource_type = None
        raise Exception(
            'Encountered an unexpected issue when moving your validated resource.'
            ' An administrator has been notified. You may also attempt to validate again.'
        )
Exemple #10
0
def get_resource_size(resource_instance):
    '''
    Returns the size of the file backing `resource_instance`, as
    reported by the storage backend.
    '''
    backend = get_storage_backend()
    return backend.get_filesize(resource_instance.path)
Exemple #11
0
    def get(self, request, *args, **kwargs):
        '''
        Serves a direct download of the requested resource from this
        server.

        Only permitted when the storage backend is local and the file is
        under the configured size limit; larger files must use an
        alternative download method.
        '''
        user = request.user
        resource_pk = kwargs['pk']
        try:
            r = check_resource_request_validity(user, resource_pk)
        except NoResourceFoundException:
            return Response(status=status.HTTP_404_NOT_FOUND)
        except InactiveResourceException:
            return Response({'error': 'The resource is inactive.'},
                            status=status.HTTP_400_BAD_REQUEST)
        except OwnershipException:
            return Response(status=status.HTTP_403_FORBIDDEN)

        # Get the storage backend since we need to know whether it's local or not
        storage_backend = get_storage_backend()

        # if we don't have local storage, block this style of direct download
        if not storage_backend.is_local_storage:
            return Response(
                {
                    'error':
                    'The server configuration prevents direct server downloads.'
                },
                status=status.HTTP_400_BAD_REQUEST)

        # requester can access, resource is active. OK so far.
        # Check the file size. We don't want large files tying up the server
        # Those should be performed by something else, like via Dropbox.
        # HOWEVER, this is only an issue if the storage backend is local.
        # Redirects for bucket storage can obviously be handled since they are
        # downloading directly from the bucket and this will not tie up our server.
        size_in_bytes = r.size
        if size_in_bytes > settings.MAX_DOWNLOAD_SIZE_BYTES:
            msg = (
                'The resource size exceeds our limits for a direct'
                ' download. Please use one of the alternative download methods'
                ' more suited for larger files.')
            return Response({'size': msg}, status=status.HTTP_400_BAD_REQUEST)

        # size is acceptable (if downloading from server)
        # or moot (if downloading directly from a bucket).
        # Now, depending on the storage backend, we return different things.
        # If we have local storage, we just return the file contents.
        # If we have remote storage, we return a signed url and issue a 302 to redirect them
        # Hence, url can be a local path or a remote url depending on the storage backend we are using
        url = storage_backend.get_download_url(r)

        if not url:
            logger.error(
                'Encountered a problem when preparing download for resource'
                ' with pk={u}'.format(u=resource_pk))
            return Response(status=status.HTTP_500_INTERNAL_SERVER_ERROR)

        if os.path.exists(url):
            # Read inside a context manager so the handle is closed
            # (the original leaked the file object returned by open()).
            # Size is bounded by MAX_DOWNLOAD_SIZE_BYTES above, so an
            # in-memory read is acceptable here.
            with open(url, 'rb') as fh:
                contents = fh.read()
            mime_type, _ = mimetypes.guess_type(url)
            if mime_type is None:
                # guess_type can return None; avoid a None Content-Type header
                mime_type = 'application/octet-stream'
            response = HttpResponse(content=contents)
            response['Content-Type'] = mime_type
            response[
                'Content-Disposition'] = 'attachment; filename="%s"' % os.path.basename(
                    url)
            return response
        else:
            logger.error(
                'Local storage was specified, but the resource at path {p}'
                ' was not found.'.format(p=url))
            return Response(status=status.HTTP_500_INTERNAL_SERVER_ERROR)
def delete_file(path):
    '''
    Deletes the file at `path`, which may be a local or remote resource.
    '''
    logger.info('Requesting deletion of {path}'.format(path=path))
    backend = get_storage_backend()
    backend.delete(path)