Example 1
File: core.py Project: larsoner/api
    def put(self):
        """Receive a sortable reaper or user upload."""
        #if not self.uid and not self.drone_request:
        #    self.abort(402, 'uploads must be from an authorized user or drone')
        if 'Content-MD5' not in self.request.headers:
            self.abort(400, 'Request must contain a valid "Content-MD5" header.')
        filename = self.request.headers.get('Content-Disposition', '').partition('filename=')[2].strip('"')
        if not filename:
            self.abort(400, 'Request must contain a valid "Content-Disposition" header.')
        with tempfile.TemporaryDirectory(prefix='.tmp', dir=self.app.config['upload_path']) as tempdir_path:
            filepath = os.path.join(tempdir_path, filename)
            success, digest, filesize, duration = util.receive_stream_and_validate(self.request.body_file, filepath, self.request.headers['Content-MD5'])
            if not success:
                self.abort(400, 'Content-MD5 mismatch.')
            if not tarfile.is_tarfile(filepath):
                self.abort(415, 'Only tar files are accepted.')
            log.info('Received    %s [%s] from %s' % (filename, util.hrsize(self.request.content_length), self.request.user_agent))
            datainfo = util.parse_file(filepath, digest)
            if datainfo is None:
                util.quarantine_file(filepath, self.app.config['quarantine_path'])
                self.abort(202, 'Quarantining %s (unparsable)' % filename)
            util.commit_file(self.app.db.acquisitions, None, datainfo, filepath, self.app.config['data_path'])
            util.create_job(self.app.db.acquisitions, datainfo) # FIXME we should only mark files as new and let engine take it from there
            throughput = filesize / duration.total_seconds()
            log.info('Received    %s [%s, %s/s] from %s' % (filename, util.hrsize(filesize), util.hrsize(throughput), self.request.client_addr))
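For reference, here is a minimal client sketch that would satisfy this endpoint's checks: a Content-MD5 header, a filename in Content-Disposition, and a tar payload. The URL and the digest encoding are assumptions; whether the server expects a hex or base64 digest depends on util.receive_stream_and_validate.

import hashlib
import os
import requests

def upload_tar(url, tar_path):
    # Read the whole archive so the digest can be computed up front.
    with open(tar_path, 'rb') as f:
        payload = f.read()
    headers = {
        'Content-MD5': hashlib.md5(payload).hexdigest(),  # hex encoding assumed
        # the handler extracts everything after 'filename=' and strips quotes
        'Content-Disposition': 'attachment; filename="%s"' % os.path.basename(tar_path),
    }
    resp = requests.put(url, data=payload, headers=headers)
    resp.raise_for_status()

upload_tar('https://example.com/upload', 'acquisition.tar')  # hypothetical URL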
Example 2
def sort(args):
    logging.basicConfig(level=logging.WARNING)
    quarantine_path = os.path.join(args.sort_path, 'quarantine')
    if not os.path.exists(args.sort_path):
        os.makedirs(args.sort_path)
    if not os.path.exists(quarantine_path):
        os.makedirs(quarantine_path)
    print('initializing DB')
    kwargs = dict(tz_aware=True)
    db_client = connect_db(args.db_uri, **kwargs)
    db = db_client.get_default_database()
    print('inspecting %s' % args.path)
    files = []
    for dirpath, dirnames, filenames in os.walk(args.path):
        for filepath in [os.path.join(dirpath, fn) for fn in filenames if not fn.startswith('.')]:
            if not os.path.islink(filepath):
                files.append(filepath)
        dirnames[:] = [dn for dn in dirnames if not dn.startswith('.')] # need to use slice assignment to influence walk behavior
    file_cnt = len(files)
    print('found %d files to sort (ignoring symlinks and dotfiles)' % file_cnt)
    for i, filepath in enumerate(files):
        print('sorting     %s [%s] (%d/%d)' % (os.path.basename(filepath), util.hrsize(os.path.getsize(filepath)), i+1, file_cnt))
        hash_ = hashlib.sha1()
        if not args.quick:
            with open(filepath, 'rb') as fd:
                for chunk in iter(lambda: fd.read(2**20), b''): # b'' sentinel: fd is opened in binary mode
                    hash_.update(chunk)
        datainfo = util.parse_file(filepath, hash_.hexdigest())
        if datainfo is None:
            util.quarantine_file(filepath, quarantine_path)
            print('Quarantining %s (unparsable)' % os.path.basename(filepath))
        else:
            util.commit_file(db.acquisitions, None, datainfo, filepath, args.sort_path)
            util.create_job(db.acquisitions, datainfo) # FIXME we should only mark files as new and let engine take it from there
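sort() only touches four attributes of args (path, sort_path, db_uri, quick), so a minimal CLI wiring might look like the sketch below; the flag names and default URI are assumptions, not taken from the project.

import argparse

parser = argparse.ArgumentParser(description='sort a directory tree into a data hierarchy')
parser.add_argument('path', help='directory tree to inspect')
parser.add_argument('sort_path', help='destination root; a quarantine/ subdirectory is created here')
parser.add_argument('--db_uri', default='mongodb://localhost/test', help='database URI (default assumed)')
parser.add_argument('-q', '--quick', action='store_true', help='skip SHA-1 hashing of file contents')

if __name__ == '__main__':
    sort(parser.parse_args())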
Example 3
def sort(args):
    logging.basicConfig(level=logging.WARNING)
    quarantine_path = os.path.join(args.sort_path, 'quarantine')
    if not os.path.exists(args.sort_path):
        os.makedirs(args.sort_path)
    if not os.path.exists(quarantine_path):
        os.makedirs(quarantine_path)
    print('initializing DB')
    kwargs = dict(tz_aware=True)
    db_client = connect_db(args.db_uri, **kwargs)
    db = db_client.get_default_database()
    print('inspecting %s' % args.path)
    files = []
    for dirpath, dirnames, filenames in os.walk(args.path):
        for filepath in [
                os.path.join(dirpath, fn) for fn in filenames
                if not fn.startswith('.')
        ]:
            if not os.path.islink(filepath):
                files.append(filepath)
        dirnames[:] = [
            dn for dn in dirnames if not dn.startswith('.')
        ]  # need to use slice assignment to influence walk behavior
    file_cnt = len(files)
    print('found %d files to sort (ignoring symlinks and dotfiles)' % file_cnt)
    for i, filepath in enumerate(files):
        print('sorting     %s [%s] (%d/%d)' % (os.path.basename(
            filepath), util.hrsize(os.path.getsize(filepath)), i + 1, file_cnt))
        hash_ = hashlib.sha1()
        if not args.quick:
            with open(filepath, 'rb') as fd:
                for chunk in iter(lambda: fd.read(2**20), b''): # b'' sentinel: fd is opened in binary mode
                    hash_.update(chunk)
        datainfo = util.parse_file(filepath, hash_.hexdigest())
        if datainfo is None:
            util.quarantine_file(filepath, quarantine_path)
            print('Quarantining %s (unparsable)' % os.path.basename(filepath))
        else:
            util.commit_file(db.acquisitions, None, datainfo, filepath,
                             args.sort_path)
            util.create_job(
                db.acquisitions, datainfo
            )  # FIXME we should only mark files as new and let engine take it from there
Example 4
File: core.py Project: larsoner/api
    def upload(self):
        """
        Receive a multi-file upload.

        3 phases:
            1 - upload metadata, obtain upload ticket
            2 - upload files, one at a time, but in parallel
            3 - send a 'complete' message
        """

        def store_file(fd, filename, md5, arcpath, arcname):
            with tempfile.TemporaryDirectory(prefix='.tmp', dir=self.app.config['upload_path']) as tempdir_path:
                filepath = os.path.join(tempdir_path, filename)
                success, _, _, _ = util.receive_stream_and_validate(fd, filepath, md5)
                if not success:
                    self.abort(400, 'Content-MD5 mismatch.')
                with lockfile.LockFile(arcpath):
                    with tarfile.open(arcpath, 'a') as archive:
                        archive.add(filepath, os.path.join(arcname, filename))

        if self.public_request:
            self.abort(403, 'must be logged in to upload data')

        filename = self.request.GET.get('filename')
        ticket_id = self.request.GET.get('ticket')

        if not ticket_id:
            if filename != 'METADATA.json':
                self.abort(400, 'first file must be METADATA.json')
            try:
                json_body = self.request.json_body
                jsonschema.validate(json_body, UPLOAD_SCHEMA)
            except (ValueError, jsonschema.ValidationError) as e:
                self.abort(400, str(e))
            filetype = json_body['filetype']
            overwrites = json_body['overwrite']

            query = {'name': overwrites['project_name'], 'group': overwrites['group_name']}
            project = self.app.db.projects.find_one(query) # verify permissions
            if not self.superuser_request:
                user_perm = util.user_perm(project['permissions'], self.uid)
                if not user_perm:
                    self.abort(403, self.uid + ' does not have permissions on this project')
                if users.INTEGER_ROLES[user_perm['access']] < users.INTEGER_ROLES['rw']:
                    self.abort(403, self.uid + ' does not have at least rw permissions on this project')

            acq_no = overwrites.get('acq_no')
            arcname = overwrites['series_uid'] + ('_' + str(acq_no) if acq_no is not None else '') + '_' + filetype
            ticket = util.upload_ticket(arcname=arcname) # store arcname for later reference
            self.app.db.uploads.insert_one(ticket)
            arcpath = os.path.join(self.app.config['upload_path'], ticket['_id'] + '.tar')
            store_file(self.request.body_file, filename, self.request.headers['Content-MD5'], arcpath, arcname)
            return {'ticket': ticket['_id']}

        ticket = self.app.db.uploads.find_one({'_id': ticket_id})
        if not ticket:
            self.abort(404, 'no such ticket')
        arcpath = os.path.join(self.app.config['upload_path'], ticket_id + '.tar')

        if self.request.GET.get('complete', '').lower() not in ('1', 'true'):
            if 'Content-MD5' not in self.request.headers:
                self.app.db.uploads.delete_one({'_id': ticket_id}) # delete ticket
                self.abort(400, 'Request must contain a valid "Content-MD5" header.')
            if not filename:
                self.app.db.uploads.delete_one({'_id': ticket_id}) # delete ticket
                self.abort(400, 'Request must contain a filename query parameter.')
            self.app.db.uploads.update_one({'_id': ticket_id}, {'$set': {'timestamp': datetime.datetime.utcnow()}}) # refresh ticket
            store_file(self.request.body_file, filename, self.request.headers['Content-MD5'], arcpath, ticket['arcname'])
        else: # complete -> zip, hash, commit
            filepath = arcpath[:-2] + 'gz' # foo.tar -> foo.tgz
            with gzip.open(filepath, 'wb', compresslevel=6) as gzfile:
                with open(arcpath, 'rb') as rawfile: # binary mode; gzfile expects bytes
                    gzfile.writelines(rawfile)
            os.remove(arcpath)
            sha1 = hashlib.sha1()
            with open(filepath, 'rb') as fd:
                for chunk in iter(lambda: fd.read(2**20), b''): # b'' sentinel: fd is opened in binary mode
                    sha1.update(chunk)
            datainfo = util.parse_file(filepath, sha1.hexdigest())
            if datainfo is None:
                util.quarantine_file(filepath, self.app.config['quarantine_path'])
                self.abort(202, 'Quarantining %s (unparsable)' % filename)
            util.commit_file(self.app.db.acquisitions, None, datainfo, filepath, self.app.config['data_path'])
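The three phases in the docstring map onto three kinds of client request: send METADATA.json to obtain a ticket, upload each file against that ticket, then signal completion. A rough sketch, assuming the handler is bound to PUT at a hypothetical URL and that Content-MD5 is hex-encoded:

import hashlib
import json
import requests

BASE = 'https://example.com/upload'  # hypothetical endpoint

def md5_hex(data):
    return hashlib.md5(data).hexdigest()  # encoding assumed

# Phase 1: METADATA.json must come first and yields the upload ticket.
metadata = json.dumps({
    'filetype': 'dicom',  # illustrative values; the real contract is UPLOAD_SCHEMA
    'overwrite': {'project_name': 'p', 'group_name': 'g', 'series_uid': 'uid'},
}).encode()
resp = requests.put(BASE, params={'filename': 'METADATA.json'},
                    data=metadata, headers={'Content-MD5': md5_hex(metadata)})
ticket = resp.json()['ticket']

# Phase 2: upload the data files, one request per file, against the ticket.
for name in ('file1.dcm', 'file2.dcm'):
    with open(name, 'rb') as f:
        body = f.read()
    requests.put(BASE, params={'ticket': ticket, 'filename': name},
                 data=body, headers={'Content-MD5': md5_hex(body)})

# Phase 3: completion makes the server gzip, hash, parse, and commit the archive.
requests.put(BASE, params={'ticket': ticket, 'complete': 'true'})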
Example 5
    def _put_file(self, _id, container, filename):
        """Receive a targeted processor or user upload."""
        tags = []
        metadata = {}
        if self.request.content_type == 'multipart/form-data':
            filestream = None
            # use cgi lib to parse multipart data without loading all into memory; use tempfile instead
            # FIXME avoid using tempfile; process incoming stream on the fly
            fs_environ = self.request.environ.copy()
            fs_environ.setdefault('CONTENT_LENGTH', '0')
            fs_environ['QUERY_STRING'] = ''
            form = cgi.FieldStorage(fp=self.request.body_file,
                                    environ=fs_environ,
                                    keep_blank_values=True)
            for fieldname in form:
                field = form[fieldname]
                if fieldname == 'file':
                    filestream = field.file
                    filename = field.filename
                elif fieldname == 'tags':
                    try:
                        tags = json.loads(field.value)
                    except ValueError:
                        self.abort(400, 'non-JSON value in "tags" parameter')
                elif fieldname == 'metadata':
                    try:
                        metadata = json.loads(field.value)
                    except ValueError:
                        self.abort(400,
                                   'non-JSON value in "metadata" parameter')
            if filestream is None:
                self.abort(400,
                           'multipart/form-data must contain a "file" field')
        elif filename is None:
            self.abort(400, 'Request must contain a filename parameter.')
        else:
            if 'Content-MD5' not in self.request.headers:
                self.abort(
                    400, 'Request must contain a valid "Content-MD5" header.')
            try:
                tags = json.loads(self.request.get('tags', '[]'))
            except ValueError:
                self.abort(400, 'invalid "tags" parameter')
            try:
                metadata = json.loads(self.request.get('metadata', '{}'))
            except ValueError:
                self.abort(400, 'invalid "metadata" parameter')
            filestream = self.request.body_file
        flavor = self.request.GET.get('flavor',
                                      'data')  # TODO: flavor should go away
        if flavor not in ['data', 'attachment']:
            self.abort(
                400,
                'Query must contain flavor parameter: "data" or "attachment".')

        with tempfile.TemporaryDirectory(
                prefix='.tmp',
                dir=self.app.config['upload_path']) as tempdir_path:
            filepath = os.path.join(tempdir_path, filename)
            md5 = self.request.headers.get('Content-MD5')
            success, digest, _, duration = util.receive_stream_and_validate(
                filestream, filepath, md5)

            if not success:
                self.abort(400, 'Content-MD5 mismatch.')
            filesize = os.path.getsize(filepath)
            mimetype = util.guess_mimetype(filepath)
            filetype = util.guess_filetype(filepath, mimetype)
            datainfo = {
                'fileinfo': {
                    'filename': filename,
                    'filesize': filesize,
                    'filehash': digest,
                    'filetype': filetype,
                    'flavor': flavor,
                    'mimetype': mimetype,
                    'tags': tags,
                    'metadata': metadata,
                },
            }
            throughput = filesize / duration.total_seconds()
            log.info('Received    %s [%s, %s/s] from %s' %
                     (filename, util.hrsize(filesize), util.hrsize(throughput),
                      self.request.client_addr))
            util.commit_file(self.dbc, _id, datainfo, filepath,
                             self.app.config['data_path'])
Example 6
    def _put_file(self, _id, container, filename):
        """Receive a targeted processor or user upload."""
        tags = []
        metadata = {}
        if self.request.content_type == 'multipart/form-data':
            filestream = None
            # use cgi lib to parse multipart data without loading all into memory; use tempfile instead
            # FIXME avoid using tempfile; process incoming stream on the fly
            fs_environ = self.request.environ.copy()
            fs_environ.setdefault('CONTENT_LENGTH', '0')
            fs_environ['QUERY_STRING'] = ''
            form = cgi.FieldStorage(fp=self.request.body_file, environ=fs_environ, keep_blank_values=True)
            for fieldname in form:
                field = form[fieldname]
                if fieldname == 'file':
                    filestream = field.file
                    filename = field.filename
                elif fieldname == 'tags':
                    try:
                        tags = json.loads(field.value)
                    except ValueError:
                        self.abort(400, 'non-JSON value in "tags" parameter')
                elif fieldname == 'metadata':
                    try:
                        metadata = json.loads(field.value)
                    except ValueError:
                        self.abort(400, 'non-JSON value in "metadata" parameter')
            if filestream is None:
                self.abort(400, 'multipart/form-data must contain a "file" field')
        elif filename is None:
            self.abort(400, 'Request must contain a filename parameter.')
        else:
            if 'Content-MD5' not in self.request.headers:
                self.abort(400, 'Request must contain a valid "Content-MD5" header.')
            try:
                tags = json.loads(self.request.get('tags', '[]'))
            except ValueError:
                self.abort(400, 'invalid "tags" parameter')
            try:
                metadata = json.loads(self.request.get('metadata', '{}'))
            except ValueError:
                self.abort(400, 'invalid "metadata" parameter')
            filestream = self.request.body_file
        flavor = self.request.GET.get('flavor', 'data') # TODO: flavor should go away
        if flavor not in ['data', 'attachment']:
            self.abort(400, 'Query must contain flavor parameter: "data" or "attachment".')

        with tempfile.TemporaryDirectory(prefix='.tmp', dir=self.app.config['upload_path']) as tempdir_path:
            filepath = os.path.join(tempdir_path, filename)
            md5 = self.request.headers.get('Content-MD5')
            success, digest, _, duration = util.receive_stream_and_validate(filestream, filepath, md5)

            if not success:
                self.abort(400, 'Content-MD5 mismatch.')
            filesize = os.path.getsize(filepath)
            mimetype = util.guess_mimetype(filepath)
            filetype = util.guess_filetype(filepath, mimetype)
            datainfo = {
                'fileinfo': {
                    'filename': filename,
                    'filesize': filesize,
                    'filehash': digest,
                    'filetype': filetype,
                    'flavor': flavor,
                    'mimetype': mimetype,
                    'tags': tags,
                    'metadata': metadata,
                },
            }
            throughput = filesize / duration.total_seconds()
            log.info('Received    %s [%s, %s/s] from %s' % (filename, util.hrsize(filesize), util.hrsize(throughput), self.request.client_addr))
            util.commit_file(self.dbc, _id, datainfo, filepath, self.app.config['data_path'])
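The multipart branch of _put_file() reads three form fields, 'file', 'tags', and 'metadata', with the latter two carried as JSON strings. A client sketch exercising it, where the URL and HTTP verb are assumptions and only the field names come from the handler:

import json
import requests

with open('scan.nii.gz', 'rb') as f:
    resp = requests.put(
        'https://example.com/files',  # hypothetical endpoint
        params={'flavor': 'data'},    # must be 'data' or 'attachment'
        files={'file': ('scan.nii.gz', f)},
        data={
            'tags': json.dumps(['incoming']),          # must parse as JSON
            'metadata': json.dumps({'note': 'test'}),  # must parse as JSON
        },
    )
resp.raise_for_status()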