Example #1
    def put(self):
        """Receive a sortable reaper or user upload."""
        #if not self.uid and not self.drone_request:
        #    self.abort(402, 'uploads must be from an authorized user or drone')
        if 'Content-MD5' not in self.request.headers:
            self.abort(400, 'Request must contain a valid "Content-MD5" header.')
        filename = self.request.headers.get('Content-Disposition', '').partition('filename=')[2].strip('"')
        if not filename:
            self.abort(400, 'Request must contain a valid "Content-Disposition" header.')
        with tempfile.TemporaryDirectory(prefix='.tmp', dir=self.app.config['upload_path']) as tempdir_path:
            filepath = os.path.join(tempdir_path, filename)
            success, digest, filesize, duration = util.receive_stream_and_validate(self.request.body_file, filepath, self.request.headers['Content-MD5'])
            if not success:
                self.abort(400, 'Content-MD5 mismatch.')
            if not tarfile.is_tarfile(filepath):
                self.abort(415, 'Only tar files are accepted.')
            log.info('Received    %s [%s] from %s' % (filename, util.hrsize(self.request.content_length), self.request.user_agent))
            datainfo = util.parse_file(filepath, digest)
            if datainfo is None:
                util.quarantine_file(filepath, self.app.config['quarantine_path'])
                self.abort(202, 'Quarantining %s (unparsable)' % filename)
            util.commit_file(self.app.db.acquisitions, None, datainfo, filepath, self.app.config['data_path'])
            util.create_job(self.app.db.acquisitions, datainfo) # FIXME we should only mark files as new and let engine take it from there
            throughput = filesize / duration.total_seconds()
            log.info('Received    %s [%s, %s/s] from %s' % (filename, util.hrsize(filesize), util.hrsize(throughput), self.request.client_addr))
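The put handler above leans on util.receive_stream_and_validate, whose implementation is not part of this listing. Judging only from its call sites — body stream, destination path, and client-supplied MD5 in; a success flag, hex digest, byte count, and timedelta out — a minimal sketch might look like this (hypothetical reconstruction, not the project's actual helper):

import datetime
import hashlib

def receive_stream_and_validate(stream, filepath, md5):
    # Hypothetical reconstruction: stream the body to disk in 1 MiB chunks
    # while hashing, then compare against the client-supplied MD5 (if any).
    hash_ = hashlib.md5()
    filesize = 0
    start = datetime.datetime.utcnow()
    with open(filepath, 'wb') as fd:
        for chunk in iter(lambda: stream.read(2**20), b''):
            hash_.update(chunk)
            fd.write(chunk)
            filesize += len(chunk)
    duration = datetime.datetime.utcnow() - start  # callers use duration.total_seconds()
    digest = hash_.hexdigest()
    success = md5 is None or digest == md5  # _put_file below may pass md5=None for multipart uploads
    return success, digest, filesize, duration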
Example #2
def upload(args):
    import os
    import hashlib
    import util
    import datetime
    import requests
    print('inspecting %s' % args.path)
    files = []
    for dirpath, dirnames, filenames in os.walk(args.path):
        for filepath in [os.path.join(dirpath, fn) for fn in filenames if not fn.startswith('.')]:
            if not os.path.islink(filepath):
                files.append(filepath)
        dirnames[:] = [dn for dn in dirnames if not dn.startswith('.')] # need to use slice assignment to influence walk behavior
    print('found %d files to upload (ignoring symlinks and dotfiles)' % len(files))
    for filepath in files:
        filename = os.path.basename(filepath)
        print('hashing     %s' % filename)
        hash_ = hashlib.md5()
        with open(filepath, 'rb') as fd:
            for chunk in iter(lambda: fd.read(2**20), b''):  # read in 1 MiB chunks; b'' marks EOF
                hash_.update(chunk)
        print('uploading   %s [%s]' % (filename, util.hrsize(os.path.getsize(filepath))))
        with open(filepath, 'rb') as fd:
            headers = {
                'User-Agent': 'bootstrapper',
                'Content-MD5': hash_.hexdigest(),
                'Content-Disposition': 'attachment; filename="%s"' % filename,
            }
            try:
                start = datetime.datetime.now()
                r = requests.put(args.url, data=fd, headers=headers, verify=not args.no_verify)
                upload_duration = (datetime.datetime.now() - start).total_seconds()
            except requests.exceptions.ConnectionError as e:
                print('error       %s: %s' % (filename, e))
            else:
                if r.status_code == 200:
                    print('success     %s [%s/s]' % (filename, util.hrsize(os.path.getsize(filepath) / upload_duration)))
                else:
                    print('failure     %s: %s %s, %s' % (filename, r.status_code, r.reason, r.text))
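upload() reads path, url, and no_verify off an argparse-style namespace; the CLI wiring is not included in the source, but a hypothetical invocation could look like:

import argparse

parser = argparse.ArgumentParser(description='hash and PUT every file under a directory tree')
parser.add_argument('path', help='directory to walk')
parser.add_argument('url', help='upload endpoint accepting PUT')
parser.add_argument('--no-verify', action='store_true', help='disable TLS certificate verification')
upload(parser.parse_args())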
Example #3
def sort(args):
    logging.basicConfig(level=logging.WARNING)
    quarantine_path = os.path.join(args.sort_path, 'quarantine')
    if not os.path.exists(args.sort_path):
        os.makedirs(args.sort_path)
    if not os.path.exists(quarantine_path):
        os.makedirs(quarantine_path)
    print('initializing DB')
    kwargs = dict(tz_aware=True)
    db_client = connect_db(args.db_uri, **kwargs)
    db = db_client.get_default_database()
    print('inspecting %s' % args.path)
    files = []
    for dirpath, dirnames, filenames in os.walk(args.path):
        for filepath in [os.path.join(dirpath, fn) for fn in filenames if not fn.startswith('.')]:
            if not os.path.islink(filepath):
                files.append(filepath)
        dirnames[:] = [dn for dn in dirnames if not dn.startswith('.')] # need to use slice assignment to influence walk behavior
    file_cnt = len(files)
    print('found %d files to sort (ignoring symlinks and dotfiles)' % file_cnt)
    for i, filepath in enumerate(files):
        print('sorting     %s [%s] (%d/%d)' % (os.path.basename(filepath), util.hrsize(os.path.getsize(filepath)), i+1, file_cnt))
        hash_ = hashlib.sha1()
        if not args.quick:  # quick mode skips hashing, so the digest below is that of an empty hash
            with open(filepath, 'rb') as fd:
                for chunk in iter(lambda: fd.read(2**20), b''):  # read in 1 MiB chunks; b'' marks EOF
                    hash_.update(chunk)
        datainfo = util.parse_file(filepath, hash_.hexdigest())
        if datainfo is None:
            util.quarantine_file(filepath, quarantine_path)
            print('quarantining %s (unparsable)' % os.path.basename(filepath))
        else:
            util.commit_file(db.acquisitions, None, datainfo, filepath, args.sort_path)
            util.create_job(db.acquisitions, datainfo) # FIXME we should only mark files as new and let engine take it from there
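connect_db is a project helper that is not shown in this listing. Given that sort() passes tz_aware=True through to it and calls get_default_database() on the result, a minimal pymongo-based sketch would be (an assumption, not the project's actual code):

import pymongo

def connect_db(db_uri, **kwargs):
    # Sketch only: MongoClient accepts tz_aware directly; the URI must name
    # a database for get_default_database() to succeed.
    return pymongo.MongoClient(db_uri, **kwargs)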
Example #4
    def _put_file(self, _id, container, filename):
        """Receive a targeted processor or user upload."""
        tags = []
        metadata = {}
        if self.request.content_type == 'multipart/form-data':
            filestream = None
            # use cgi lib to parse multipart data without loading all into memory; use tempfile instead
            # FIXME avoid using tempfile; process incoming stream on the fly
            fs_environ = self.request.environ.copy()
            fs_environ.setdefault('CONTENT_LENGTH', '0')
            fs_environ['QUERY_STRING'] = ''
            form = cgi.FieldStorage(fp=self.request.body_file, environ=fs_environ, keep_blank_values=True)
            for fieldname in form:
                field = form[fieldname]
                if fieldname == 'file':
                    filestream = field.file
                    filename = field.filename
                elif fieldname == 'tags':
                    try:
                        tags = json.loads(field.value)
                    except ValueError:
                        self.abort(400, 'non-JSON value in "tags" parameter')
                elif fieldname == 'metadata':
                    try:
                        metadata = json.loads(field.value)
                    except ValueError:
                        self.abort(400, 'non-JSON value in "metadata" parameter')
            if filestream is None:
                self.abort(400, 'multipart/form-data must contain a "file" field')
        elif filename is None:
            self.abort(400, 'Request must contain a filename parameter.')
        else:
            if 'Content-MD5' not in self.request.headers:
                self.abort(400, 'Request must contain a valid "Content-MD5" header.')
            try:
                tags = json.loads(self.request.get('tags', '[]'))
            except ValueError:
                self.abort(400, 'invalid "tags" parameter')
            try:
                metadata = json.loads(self.request.get('metadata', '{}'))
            except ValueError:
                self.abort(400, 'invalid "metadata" parameter')
            filestream = self.request.body_file
        flavor = self.request.GET.get('flavor', 'data') # TODO: flavor should go away
        if flavor not in ['data', 'attachment']:
            self.abort(400, 'Query must contain flavor parameter: "data" or "attachment".')

        with tempfile.TemporaryDirectory(prefix='.tmp', dir=self.app.config['upload_path']) as tempdir_path:
            filepath = os.path.join(tempdir_path, filename)
            md5 = self.request.headers.get('Content-MD5')
            success, digest, _, duration = util.receive_stream_and_validate(filestream, filepath, md5)

            if not success:
                self.abort(400, 'Content-MD5 mismatch.')
            filesize = os.path.getsize(filepath)
            mimetype = util.guess_mimetype(filepath)
            filetype = util.guess_filetype(filepath, mimetype)
            datainfo = {
                'fileinfo': {
                    'filename': filename,
                    'filesize': filesize,
                    'filehash': digest,
                    'filetype': filetype,
                    'flavor': flavor,
                    'mimetype': mimetype,
                    'tags': tags,
                    'metadata': metadata,
                },
            }
            throughput = filesize / duration.total_seconds()
            log.info('Received    %s [%s, %s/s] from %s' % (filename, util.hrsize(filesize), util.hrsize(throughput), self.request.client_addr))
            util.commit_file(self.dbc, _id, datainfo, filepath, self.app.config['data_path'])
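For the multipart branch of _put_file, a client request might look like the following sketch; the endpoint URL and filename are hypothetical, and tags/metadata are sent as JSON-encoded form fields exactly as the handler expects:

import json
import requests

url = 'https://example.org/api/acquisitions/12345/file'  # hypothetical endpoint
with open('scan.tgz', 'rb') as fd:
    fields = {
        'file': ('scan.tgz', fd),                          # becomes field.file / field.filename
        'tags': (None, json.dumps(['qa'])),                # plain form field, JSON-encoded
        'metadata': (None, json.dumps({'note': 'test'})),  # plain form field, JSON-encoded
    }
    r = requests.put(url, files=fields, params={'flavor': 'attachment'})
print(r.status_code, r.reason)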