def put(self): """Receive a sortable reaper or user upload.""" #if not self.uid and not self.drone_request: # self.abort(402, 'uploads must be from an authorized user or drone') if 'Content-MD5' not in self.request.headers: self.abort(400, 'Request must contain a valid "Content-MD5" header.') filename = self.request.headers.get('Content-Disposition', '').partition('filename=')[2].strip('"') if not filename: self.abort(400, 'Request must contain a valid "Content-Disposition" header.') with tempfile.TemporaryDirectory(prefix='.tmp', dir=self.app.config['upload_path']) as tempdir_path: filepath = os.path.join(tempdir_path, filename) success, digest, filesize, duration = util.receive_stream_and_validate(self.request.body_file, filepath, self.request.headers['Content-MD5']) if not success: self.abort(400, 'Content-MD5 mismatch.') if not tarfile.is_tarfile(filepath): self.abort(415, 'Only tar files are accepted.') log.info('Received %s [%s] from %s' % (filename, util.hrsize(self.request.content_length), self.request.user_agent)) datainfo = util.parse_file(filepath, digest) if datainfo is None: util.quarantine_file(filepath, self.app.config['quarantine_path']) self.abort(202, 'Quarantining %s (unparsable)' % filename) util.commit_file(self.app.db.acquisitions, None, datainfo, filepath, self.app.config['data_path']) util.create_job(self.app.db.acquisitions, datainfo) # FIXME we should only mark files as new and let engine take it from there throughput = filesize / duration.total_seconds() log.info('Received %s [%s, %s/s] from %s' % (filename, util.hrsize(filesize), util.hrsize(throughput), self.request.client_addr))
def upload(args):
    """PUT every regular file under args.path to args.url.

    Symlinks and dot-prefixed files/directories are skipped.  Each file is
    MD5-hashed first so the server can verify the transfer.
    """
    import util
    import datetime
    import requests
    print('inspecting %s' % args.path)
    files = []
    for dirpath, dirnames, filenames in os.walk(args.path):
        for fn in filenames:
            if fn.startswith('.'):
                continue
            path = os.path.join(dirpath, fn)
            if not os.path.islink(path):
                files.append(path)
        # slice assignment so os.walk does not descend into dot-directories
        dirnames[:] = [dn for dn in dirnames if not dn.startswith('.')]
    print('found %d files to upload (ignoring symlinks and dotfiles)' % len(files))
    for path in files:
        name = os.path.basename(path)
        print('hashing %s' % name)
        digest = hashlib.md5()
        with open(path, 'rb') as fd:
            for chunk in iter(lambda: fd.read(2**20), ''):
                digest.update(chunk)
        print('uploading %s [%s]' % (name, util.hrsize(os.path.getsize(path))))
        with open(path, 'rb') as fd:
            headers = {
                'User-Agent': 'bootstrapper',
                'Content-MD5': digest.hexdigest(),
                'Content-Disposition': 'attachment; filename="%s"' % name,
            }
            try:
                start = datetime.datetime.now()
                r = requests.put(args.url, data=fd, headers=headers, verify=not args.no_verify)
                upload_duration = (datetime.datetime.now() - start).total_seconds()
            except requests.exceptions.ConnectionError as e:
                print('error %s: %s' % (name, e))
            else:
                if r.status_code == 200:
                    print('success %s [%s/s]' % (name, util.hrsize(os.path.getsize(path) / upload_duration)))
                else:
                    print('failure %s: %s %s, %s' % (name, r.status_code, r.reason, r.text))
def sort(args):
    """Sort all files under args.path into the sort area.

    Parsable files are committed to the DB; unparsable files go to a
    quarantine directory.  With args.quick, hashing is skipped.
    """
    logging.basicConfig(level=logging.WARNING)
    quarantine_path = os.path.join(args.sort_path, 'quarantine')
    if not os.path.exists(args.sort_path):
        os.makedirs(args.sort_path)
    if not os.path.exists(quarantine_path):
        os.makedirs(quarantine_path)
    print('initializing DB')
    db_client = connect_db(args.db_uri, tz_aware=True)
    db = db_client.get_default_database()
    print('inspecting %s' % args.path)
    files = []
    for dirpath, dirnames, filenames in os.walk(args.path):
        files.extend(
            os.path.join(dirpath, fn)
            for fn in filenames
            if not fn.startswith('.') and not os.path.islink(os.path.join(dirpath, fn))
        )
        # in-place prune so os.walk skips dot-directories
        dirnames[:] = [dn for dn in dirnames if not dn.startswith('.')]
    file_cnt = len(files)
    print('found %d files to sort (ignoring symlinks and dotfiles)' % file_cnt)
    for i, filepath in enumerate(files):
        print('sorting %s [%s] (%d/%d)' % (os.path.basename(filepath), util.hrsize(os.path.getsize(filepath)), i + 1, file_cnt))
        hash_ = hashlib.sha1()
        if not args.quick:  # quick mode skips hashing; digest is then sha1 of empty input
            with open(filepath, 'rb') as fd:
                for chunk in iter(lambda: fd.read(2**20), ''):
                    hash_.update(chunk)
        datainfo = util.parse_file(filepath, hash_.hexdigest())
        if datainfo is None:
            util.quarantine_file(filepath, quarantine_path)
            print('Quarantining %s (unparsable)' % os.path.basename(filepath))
        else:
            util.commit_file(db.acquisitions, None, datainfo, filepath, args.sort_path)
            util.create_job(db.acquisitions, datainfo)  # FIXME we should only mark files as new and let engine take it from there
def upload(args):
    """Walk args.path and HTTP-PUT each regular file to args.url.

    Hidden files/directories and symlinks are ignored; each payload is
    accompanied by its MD5 so the receiver can validate it.
    """
    import util
    import datetime
    import requests
    print('inspecting %s' % args.path)
    to_upload = []
    for dirpath, dirnames, filenames in os.walk(args.path):
        # prune hidden directories in place so walk never enters them
        dirnames[:] = [d for d in dirnames if not d.startswith('.')]
        candidates = (os.path.join(dirpath, f) for f in filenames if not f.startswith('.'))
        to_upload.extend(p for p in candidates if not os.path.islink(p))
    print('found %d files to upload (ignoring symlinks and dotfiles)' % len(to_upload))
    for filepath in to_upload:
        filename = os.path.basename(filepath)
        print('hashing %s' % filename)
        md5 = hashlib.md5()
        with open(filepath, 'rb') as src:
            for block in iter(lambda: src.read(2**20), ''):
                md5.update(block)
        print('uploading %s [%s]' % (filename, util.hrsize(os.path.getsize(filepath))))
        headers = {
            'User-Agent': 'bootstrapper',
            'Content-MD5': md5.hexdigest(),
            'Content-Disposition': 'attachment; filename="%s"' % filename,
        }
        with open(filepath, 'rb') as src:
            start = datetime.datetime.now()
            try:
                r = requests.put(args.url, data=src, headers=headers, verify=not args.no_verify)
            except requests.exceptions.ConnectionError as e:
                print('error %s: %s' % (filename, e))
                continue
            upload_duration = (datetime.datetime.now() - start).total_seconds()
        if r.status_code == 200:
            print('success %s [%s/s]' % (filename, util.hrsize(os.path.getsize(filepath) / upload_duration)))
        else:
            print('failure %s: %s %s, %s' % (filename, r.status_code, r.reason, r.text))
def sort(args):
    """Sort a tree of files into args.sort_path, recording each in the DB.

    Files the parser cannot handle are moved into a quarantine
    subdirectory.  args.quick bypasses per-file hashing.
    """
    logging.basicConfig(level=logging.WARNING)
    quarantine_path = os.path.join(args.sort_path, 'quarantine')
    if not os.path.exists(args.sort_path):
        os.makedirs(args.sort_path)
    if not os.path.exists(quarantine_path):
        os.makedirs(quarantine_path)
    print('initializing DB')
    db = connect_db(args.db_uri, tz_aware=True).get_default_database()
    print('inspecting %s' % args.path)
    candidates = []
    for root, subdirs, names in os.walk(args.path):
        for name in names:
            if name.startswith('.'):
                continue
            full = os.path.join(root, name)
            if not os.path.islink(full):
                candidates.append(full)
        # slice assignment influences walk: hidden subtrees are skipped
        subdirs[:] = [d for d in subdirs if not d.startswith('.')]
    total = len(candidates)
    print('found %d files to sort (ignoring symlinks and dotfiles)' % total)
    for idx, full in enumerate(candidates, start=1):
        print('sorting %s [%s] (%d/%d)' % (os.path.basename(full), util.hrsize(os.path.getsize(full)), idx, total))
        sha1 = hashlib.sha1()
        if not args.quick:  # with --quick the digest stays that of empty input
            with open(full, 'rb') as src:
                for piece in iter(lambda: src.read(2**20), ''):
                    sha1.update(piece)
        datainfo = util.parse_file(full, sha1.hexdigest())
        if datainfo is None:
            util.quarantine_file(full, quarantine_path)
            print('Quarantining %s (unparsable)' % os.path.basename(full))
        else:
            util.commit_file(db.acquisitions, None, datainfo, full, args.sort_path)
            util.create_job(db.acquisitions, datainfo)  # FIXME we should only mark files as new and let engine take it from there
def _put_file(self, _id, container, filename):
    """Receive a targeted processor or user upload.

    Two request shapes are handled:
    - multipart/form-data: the 'file' field carries the payload (and may
      override *filename*); optional 'tags'/'metadata' fields carry JSON.
    - raw body: the payload is the request body, its MD5 must come in the
      Content-MD5 header, and 'tags'/'metadata' come from request params.
    The validated file is committed to self.dbc under _id.
    """
    tags = []
    metadata = {}
    if self.request.content_type == 'multipart/form-data':
        filestream = None
        # use cgi lib to parse multipart data without loading all into memory; use tempfile instead
        # FIXME avoid using tempfile; processs incoming stream on the fly
        fs_environ = self.request.environ.copy()
        fs_environ.setdefault('CONTENT_LENGTH', '0')
        # blank the query string so FieldStorage only parses the body
        fs_environ['QUERY_STRING'] = ''
        form = cgi.FieldStorage(fp=self.request.body_file, environ=fs_environ, keep_blank_values=True)
        for fieldname in form:
            field = form[fieldname]
            if fieldname == 'file':
                filestream = field.file
                filename = field.filename
            elif fieldname == 'tags':
                try:
                    tags = json.loads(field.value)
                except ValueError:
                    self.abort(400, 'non-JSON value in "tags" parameter')
            elif fieldname == 'metadata':
                try:
                    metadata = json.loads(field.value)
                except ValueError:
                    self.abort(400, 'non-JSON value in "metadata" parameter')
        if filestream is None:
            self.abort(400, 'multipart/form-data must contain a "file" field')
        elif filename is None:
            self.abort(400, 'Request must contain a filename parameter.')
    else:
        if 'Content-MD5' not in self.request.headers:
            self.abort(400, 'Request must contain a valid "Content-MD5" header.')
        try:
            tags = json.loads(self.request.get('tags', '[]'))
        except ValueError:
            self.abort(400, 'invalid "tags" parameter')
        try:
            metadata = json.loads(self.request.get('metadata', '{}'))
        except ValueError:
            self.abort(400, 'invalid "metadata" parameter')
        filestream = self.request.body_file
    flavor = self.request.GET.get('flavor', 'data') # TODO: flavor should go away
    if flavor not in ['data', 'attachment']:
        self.abort(400, 'Query must contain flavor parameter: "data" or "attachment".')
    # NOTE(review): tempfile.TemporaryDirectory is Python 3 stdlib; this file
    # otherwise reads as Python 2 -- presumably a backported helper. Confirm.
    with tempfile.TemporaryDirectory(prefix='.tmp', dir=self.app.config['upload_path']) as tempdir_path:
        filepath = os.path.join(tempdir_path, filename)
        # md5 is None in the multipart branch (no Content-MD5 header required
        # there); presumably the validator skips verification then -- confirm
        # against util.receive_stream_and_validate.
        md5 = self.request.headers.get('Content-MD5')
        success, digest, _, duration = util.receive_stream_and_validate(filestream, filepath, md5)
        if not success:
            self.abort(400, 'Content-MD5 mismatch.')
        filesize = os.path.getsize(filepath)
        mimetype = util.guess_mimetype(filepath)
        filetype = util.guess_filetype(filepath, mimetype)
        datainfo = {
            'fileinfo': {
                'filename': filename,
                'filesize': filesize,
                'filehash': digest,
                'filetype': filetype,
                'flavor': flavor,
                'mimetype': mimetype,
                'tags': tags,
                'metadata': metadata,
            },
        }
        throughput = filesize / duration.total_seconds()
        log.info('Received %s [%s, %s/s] from %s' % (filename, util.hrsize(filesize), util.hrsize(throughput), self.request.client_addr))
        util.commit_file(self.dbc, _id, datainfo, filepath, self.app.config['data_path'])
def _put_file(self, _id, container, filename):
    """Receive a targeted processor or user upload.

    Accepts either a multipart/form-data request (payload in the 'file'
    field, optional JSON 'tags'/'metadata' fields) or a raw-body request
    (payload is the body; Content-MD5 header required; 'tags'/'metadata'
    read from request parameters).  Commits the file to self.dbc.
    """
    tags = []
    metadata = {}
    if self.request.content_type == 'multipart/form-data':
        filestream = None
        # use cgi lib to parse multipart data without loading all into memory; use tempfile instead
        # FIXME avoid using tempfile; processs incoming stream on the fly
        fs_environ = self.request.environ.copy()
        fs_environ.setdefault('CONTENT_LENGTH', '0')
        # cleared so FieldStorage parses the body only, not the URL query
        fs_environ['QUERY_STRING'] = ''
        form = cgi.FieldStorage(fp=self.request.body_file, environ=fs_environ, keep_blank_values=True)
        for fieldname in form:
            field = form[fieldname]
            if fieldname == 'file':
                # the uploaded part overrides the filename argument
                filestream = field.file
                filename = field.filename
            elif fieldname == 'tags':
                try:
                    tags = json.loads(field.value)
                except ValueError:
                    self.abort(400, 'non-JSON value in "tags" parameter')
            elif fieldname == 'metadata':
                try:
                    metadata = json.loads(field.value)
                except ValueError:
                    self.abort(400, 'non-JSON value in "metadata" parameter')
        if filestream is None:
            self.abort(400, 'multipart/form-data must contain a "file" field')
        elif filename is None:
            self.abort(400, 'Request must contain a filename parameter.')
    else:
        if 'Content-MD5' not in self.request.headers:
            self.abort(400, 'Request must contain a valid "Content-MD5" header.')
        try:
            tags = json.loads(self.request.get('tags', '[]'))
        except ValueError:
            self.abort(400, 'invalid "tags" parameter')
        try:
            metadata = json.loads(self.request.get('metadata', '{}'))
        except ValueError:
            self.abort(400, 'invalid "metadata" parameter')
        filestream = self.request.body_file
    flavor = self.request.GET.get('flavor', 'data') # TODO: flavor should go away
    if flavor not in ['data', 'attachment']:
        self.abort(400, 'Query must contain flavor parameter: "data" or "attachment".')
    # NOTE(review): tempfile.TemporaryDirectory is Python 3-only stdlib while
    # this file elsewhere uses Python 2 syntax; likely a backported helper
    # bound to this name -- confirm.
    with tempfile.TemporaryDirectory(prefix='.tmp', dir=self.app.config['upload_path']) as tempdir_path:
        filepath = os.path.join(tempdir_path, filename)
        # In the multipart branch no Content-MD5 was required, so md5 may be
        # None here; verification is then presumably skipped -- TODO confirm.
        md5 = self.request.headers.get('Content-MD5')
        success, digest, _, duration = util.receive_stream_and_validate(filestream, filepath, md5)
        if not success:
            self.abort(400, 'Content-MD5 mismatch.')
        filesize = os.path.getsize(filepath)
        mimetype = util.guess_mimetype(filepath)
        filetype = util.guess_filetype(filepath, mimetype)
        datainfo = {
            'fileinfo': {
                'filename': filename,
                'filesize': filesize,
                'filehash': digest,
                'filetype': filetype,
                'flavor': flavor,
                'mimetype': mimetype,
                'tags': tags,
                'metadata': metadata,
            },
        }
        throughput = filesize / duration.total_seconds()
        log.info('Received %s [%s, %s/s] from %s' % (filename, util.hrsize(filesize), util.hrsize(throughput), self.request.client_addr))
        util.commit_file(self.dbc, _id, datainfo, filepath, self.app.config['data_path'])