Ejemplo n.º 1
0
    def get_file_hash(self, uuid, path_hint):
        """Compute the formatted hash of a stored file.

        Args:
            uuid: File identifier used to derive the storage path when no
                explicit path is given.
            path_hint: Optional explicit path to the file; takes precedence
                over the uuid-derived path.

        Returns:
            The hash formatted via ``format_hash`` (algorithm-prefixed digest).
        """
        hash_alg = self._default_hash_alg
        hasher = hashlib.new(hash_alg)

        # Prefer the caller-supplied path; fall back to the uuid-derived one.
        if path_hint:
            filepath = path_hint
        else:
            filepath = path_from_uuid(uuid)

        # Bug fix: bare `unicode` is a NameError on Python 3. The file already
        # uses six, so use six.text_type (unicode on py2, str on py3).
        if not isinstance(filepath, six.text_type):
            filepath = six.u(filepath)

        # Stream in buffered chunks so large files are never fully loaded.
        with self._fs.open(filepath, 'rb') as f:
            while True:
                data = f.read(self._buffer_size)
                if not data:
                    break
                hasher.update(data)

        return format_hash(hash_alg, hasher.hexdigest())
Ejemplo n.º 2
0
 def hash(self):
     """Formatted hash of the file: algorithm-prefixed hex digest string."""
     digest = self.hasher.hexdigest()
     return storage.format_hash(self.hash_alg, digest)
def test_py_fs_storage():
    """Exercise pyfs-backed storage: create, overwrite, nested-directory
    writes, file info lookup, and content hashing."""
    pyfs = storage.create_flywheel_fs(type_='osfs', config={'path': '/tmp'})
    assert pyfs._fs is not None
    assert pyfs.is_signed_url() is False

    def roundtrip(path, content):
        # Write `content` to `path`, read it back, and verify it matches.
        f = pyfs.open(None, path, 'w')
        assert f is not None
        f.write(content)
        f.close()
        f = pyfs.open(None, path, 'r')
        try:
            assert f.read() == content
        finally:
            f.close()

    roundtrip(u'test.txt', u'This is a test')
    roundtrip(u'test.txt', u'Overwrite an existing file')
    roundtrip(u'newdir/test.txt', u'Test in a new directory')
    roundtrip(u'newdir/test2.txt', u'Test in an existing directory')
    roundtrip(u'newdir/nested/test.txt', u'Test in a new nested directory')
    roundtrip(u'new_nested/nested/test.txt',
              u'Test in a new deeply nested directory')

    # Test filesize
    data = pyfs.get_file_info(None, u'test.txt')
    assert 'filesize' in data

    # Test hashing of uploaded files.
    # Bug fix: hashlib's update() requires bytes on Python 3 — encode the
    # text first (identical digest on Python 2 for this ASCII content).
    hash_alg = pyfs._default_hash_alg
    hasher = hashlib.new(hash_alg)
    hasher.update(u'Test in a new deeply nested directory'.encode('utf-8'))
    hash_val = storage.format_hash(hash_alg, hasher.hexdigest())

    assert hash_val == pyfs.get_file_hash(None, u'new_nested/nested/test.txt')
Ejemplo n.º 4
0
def process_upload(request,
                   strategy,
                   access_logger,
                   container_type=None,
                   id_=None,
                   origin=None,
                   context=None,
                   response=None,
                   metadata=None,
                   file_fields=None,
                   tempdir=None):
    """
    Universal file upload entrypoint.

    Format:
        Multipart form upload with N file fields, each with their desired filename.
        For technical reasons, no form field names can be repeated. Instead, use (file1, file2) and so forth.

        Depending on the type of upload, a non-file form field called "metadata" may/must also be sent.
        If present, it is expected to be a JSON string matching the schema for the upload strategy.

        Currently, the JSON returned may vary by strategy.

        Some examples:
        curl -F [email protected]   -F [email protected] url
        curl -F metadata=<stuff.json -F [email protected]   url
        http --form POST url [email protected] [email protected]

    Features:
                                               | targeted |  reaper   | engine | packfile
        Must specify a target container        |     X    |           |    X   |
        May create hierarchy on demand         |          |     X     |        |     X

        May  send metadata about the files     |     X    |     X     |    X   |     X
        MUST send metadata about the files     |          |     X     |        |     X

        Creates a packfile from uploaded files |          |           |        |     X
    """
    log = request.logger

    if not isinstance(strategy, Strategy):
        raise Exception('Unknown upload strategy')

    # Bug fix: identity comparison with None (was `container_type == None`).
    if id_ is not None and container_type is None:
        raise Exception('Unspecified container type')

    allowed_container_types = ('project', 'subject', 'session', 'acquisition',
                               'gear', 'analysis', 'collection')
    if container_type is not None and container_type not in allowed_container_types:
        raise Exception('Unknown container type')

    # Single timestamp so every file in this upload shares a modified time.
    timestamp = datetime.datetime.utcnow()

    container = None
    if container_type and id_:
        container = hierarchy.get_container(container_type, id_)

    # Check if filename should be basename or full path
    filename_path = request.GET.get('filename_path',
                                    '').lower() in ('1', 'true')
    if filename_path:
        name_fn = util.sanitize_path
    else:
        name_fn = os.path.basename

    # The vast majority of this function's wall-clock time is spent here.
    file_processor = files.FileProcessor(config.primary_storage)

    if not file_fields:

        # The only time we need the tempdir_name is when we use token and packfile.
        form = file_processor.process_form(request,
                                           use_filepath=filename_path,
                                           tempdir_name=tempdir)

        # Non-file form fields may have an empty string as filename, check for 'falsy' values
        file_fields = extract_file_fields(form)

        if 'metadata' in form:
            try:
                metadata = json.loads(form['metadata'].value)
            except Exception:
                raise FileFormException('wrong format for field "metadata"')
            # Normalize every metadata filename the same way as the uploads.
            if isinstance(metadata, dict):
                for f in metadata.get(container_type, {}).get('files', []):
                    f['name'] = name_fn(f['name'])
            elif isinstance(metadata, list):
                for f in metadata:
                    f['name'] = name_fn(f['name'])

    placer_class = strategy.value
    placer = placer_class(container_type,
                          container,
                          id_,
                          metadata,
                          timestamp,
                          origin,
                          context,
                          access_logger,
                          logger=log)

    placer.check()

    # Browsers, when sending a multipart upload, will send files with field name "file" (if singular)
    # or "file1", "file2", etc (if multiple). Following this convention is probably a good idea.
    # Here, we accept any

    # TODO: Change schemas to enabled targeted uploads of more than one file.
    # Ref docs from placer.TargetedPlacer for details.
    if strategy == Strategy.targeted and len(file_fields) > 1:
        raise FileFormException("Targeted uploads can only send one file")

    for field in file_fields:
        if hasattr(field, 'file'):
            field.file.close()
            field.hash = storage.format_hash(files.DEFAULT_HASH_ALG,
                                             field.hasher.hexdigest())

        if not hasattr(field, 'hash'):
            field.hash = ''
        # Augment the cgi.FieldStorage with a variety of custom fields.
        # Not the best practice. Open to improvements.
        # These are presumed to be required by every function later called with field as a parameter.

        # We can trust the filepath on upload is accurate after form processing
        if hasattr(field, 'filepath'):
            # Some placers need this value. Consistent object would be nice
            field.path = field.filepath

        # Size lookup depends on where the form processor stored the file.
        if tempdir:
            field.size = (config.local_fs.get_file_info(
                None, field.filepath))['filesize']
        else:
            field.size = (config.primary_storage.get_file_info(
                field.uuid, util.path_from_uuid(field.uuid)))['filesize']

        field.mimetype = util.guess_mimetype(
            field.filename)  # TODO: does not honor metadata's mime type if any
        field.modified = timestamp

        # create a file-attribute map commonly used elsewhere in the codebase.
        # Stands in for a dedicated object... for now.
        file_attrs = make_file_attrs(field, origin)

        placer.process_file_field(file_attrs)

    # Respond either with Server-Sent Events or a standard json map
    if placer.sse and not response:
        raise Exception("Programmer error: response required")
    elif placer.sse:

        # Returning a callable will bypass webapp2 processing and allow
        # full control over the response.
        def sse_handler(environ, start_response):  # pylint: disable=unused-argument
            write = start_response(
                '200 OK',
                [('Content-Type', 'text/event-stream; charset=utf-8'),
                 ('Connection', 'keep-alive')])

            # Instead of handing the iterator off to response.app_iter, send it ourselves.
            # This prevents disconnections from leaving the API in a partially-complete state.
            #
            # Timing out between events or throwing an exception will result in undefined behaviour.
            # Right now, in our environment:
            # - Timeouts may result in nginx-created 500 Bad Gateway HTML being added to the response.
            # - Exceptions add some error json to the response, which is not SSE-sanitized.

            for item in placer.finalize():
                try:
                    write(item)
                except Exception:  # pylint: disable=broad-except
                    log.info('SSE upload progress failed to send; continuing')

            return ''

        return sse_handler
    else:
        return placer.finalize()