def _get_downloaded_file(filename): """ given a test filename, return the file_meta dictionary needed by the workflow. Used for tests. """ if filename: from util import get_test_file csv_file = get_test_file(filename, 'scheduler') magic_type = util.MS.file(csv_file) return { 'out_file': csv_file, 'file_name': filename, 'file_size': os.path.getsize(csv_file), 'content_type': 'text/csv', 'md5sum': md5_for_file(csv_file), 'magic_type': util.magic_to_mime(magic_type), } print print '*' * 80 print "WARNING" print '*' * 80 print "When testing the workflow, you should set what to do with " \ "each source you're dealing with. Check " \ "webui.scheduler.test_helpers.py" print '*' * 80 print return {}
def _get_downloaded_file(filename): """ given a test filename, return the file_meta dictionary needed by the workflow. Used for tests. """ if filename: from util import get_test_file csv_file = get_test_file( filename, 'scheduler' ) magic_type = util.MS.file(csv_file) return { 'out_file': csv_file, 'file_name': filename, 'file_size': os.path.getsize(csv_file), 'content_type': 'text/csv', 'md5sum': md5_for_file(csv_file), 'magic_type': util.magic_to_mime(magic_type), } print print '*' * 80 print "WARNING" print '*' * 80 print "When testing the workflow, you should set what to do with " \ "each source you're dealing with. Check " \ "webui.scheduler.test_helpers.py" print '*' * 80 print return {}
def _download_url(url, save_dir):
    """Download a URL into a file and return its HTTP/file metadata.

    The metadata dictionary is produced by ``__file_meta_from_headers``
    from the response headers (plus ``redirect_headers`` when the response
    object carries them) and is then extended with:

    * ``out_file``   -- path of the written file, ``save_dir/<file_name>``
    * ``magic_type`` -- ``util.magic_to_mime`` of ``util.MS.file(out_file)``
    * ``md5sum``     -- result of ``md5_for_file(out_file)``

    A ``file_size`` of ``-1`` in the metadata is replaced by the size of
    the file on disk.

    :param url: URL to fetch.
    :param save_dir: target directory; passed to ``__mkdir_p`` before the
        file is written.
    :returns: the updated ``file_meta`` dictionary.
    :raises: errors from ``urllib2`` and from file I/O propagate unchanged;
        the HTTP response and the output file are closed in every case.
    """
    loggy = local.logger

    request = urllib2.Request(url)
    opener = urllib2.build_opener(CustomRedirectHandler())
    url_handler = opener.open(request)
    try:
        file_meta = __file_meta_from_headers(
            url,
            url_handler.headers,
            getattr(url_handler, 'redirect_headers', None)
        )
        file_name = file_meta['file_name']

        # do not remove directory here, download may be in parallel with
        # other tasks using the same URL
        __mkdir_p(save_dir)

        # endswith() instead of [-1] so an empty name cannot raise IndexError.
        if file_name.endswith('/'):
            file_name = file_name[:-1]

        # Plain concatenation on purpose: os.path.join would drop save_dir
        # if file_name were absolute.
        # NOTE(review): file_name is derived from response headers; confirm
        # that __file_meta_from_headers strips path components such as '../'.
        out_file = save_dir + '/' + file_name

        file_size = file_meta['file_size']
        loggy.info("Downloading: %s KBytes: %s", file_name, file_size / 1024)

        file_size_dl = 0
        block_sz = 8192
        emit_status_count = 0
        # 'with' guarantees the file is closed even if read/write fails.
        with open(out_file, 'wb') as output:
            while True:
                data_buffer = url_handler.read(block_sz)
                if not data_buffer:
                    break

                file_size_dl += len(data_buffer)
                output.write(data_buffer)

                # Emit a progress line roughly every 100 blocks.
                if emit_status_count >= 100:
                    if file_size > 0:
                        loggy.info("%10d [%3.2f%%]", file_size_dl,
                                   file_size_dl * 100. / file_size)
                    else:
                        # Size unknown (-1) or reported as 0: a percentage
                        # would be meaningless (or divide by zero).
                        loggy.info("%10d", file_size_dl)
                    emit_status_count = 0
                else:
                    emit_status_count += 1
    finally:
        # Release the HTTP connection whether or not the download succeeded.
        url_handler.close()

    # http://www.gavinj.net/2007/05/python-file-magic.html
    magic_type = util.MS.file(out_file)
    file_meta.update({
        'out_file': out_file,
        'magic_type': util.magic_to_mime(magic_type),
        'md5sum': md5_for_file(out_file)
    })

    # -1 means the headers did not provide a size; use the real one.
    if file_meta['file_size'] == -1:
        file_meta['file_size'] = os.path.getsize(out_file)

    return file_meta
def _download_url(url, save_dir):
    """Download a URL into a file and return its HTTP/file metadata.

    The metadata dictionary is produced by ``__file_meta_from_headers``
    from the response headers (plus ``redirect_headers`` when the response
    object carries them) and is then extended with:

    * ``out_file``   -- path of the written file, ``save_dir/<file_name>``
    * ``magic_type`` -- ``util.magic_to_mime`` of ``util.MS.file(out_file)``
    * ``md5sum``     -- result of ``md5_for_file(out_file)``

    A ``file_size`` of ``-1`` in the metadata is replaced by the size of
    the file on disk.

    :param url: URL to fetch.
    :param save_dir: target directory; passed to ``__mkdir_p`` before the
        file is written.
    :returns: the updated ``file_meta`` dictionary.
    :raises: errors from ``urllib2`` and from file I/O propagate unchanged;
        the HTTP response and the output file are closed in every case.
    """
    loggy = local.logger

    request = urllib2.Request(url)
    opener = urllib2.build_opener(CustomRedirectHandler())
    url_handler = opener.open(request)
    try:
        file_meta = __file_meta_from_headers(
            url,
            url_handler.headers,
            getattr(url_handler, 'redirect_headers', None)
        )
        file_name = file_meta['file_name']

        # do not remove directory here, download may be in parallel with
        # other tasks using the same URL
        __mkdir_p(save_dir)

        # endswith() instead of [-1] so an empty name cannot raise IndexError.
        if file_name.endswith('/'):
            file_name = file_name[:-1]

        # Plain concatenation on purpose: os.path.join would drop save_dir
        # if file_name were absolute.
        # NOTE(review): file_name is derived from response headers; confirm
        # that __file_meta_from_headers strips path components such as '../'.
        out_file = save_dir + '/' + file_name

        file_size = file_meta['file_size']
        loggy.info("Downloading: %s KBytes: %s", file_name, file_size / 1024)

        file_size_dl = 0
        block_sz = 8192
        emit_status_count = 0
        # 'with' guarantees the file is closed even if read/write fails.
        with open(out_file, 'wb') as output:
            while True:
                data_buffer = url_handler.read(block_sz)
                if not data_buffer:
                    break

                file_size_dl += len(data_buffer)
                output.write(data_buffer)

                # Emit a progress line roughly every 100 blocks.
                if emit_status_count >= 100:
                    if file_size > 0:
                        loggy.info("%10d [%3.2f%%]", file_size_dl,
                                   file_size_dl * 100. / file_size)
                    else:
                        # Size unknown (-1) or reported as 0: a percentage
                        # would be meaningless (or divide by zero).
                        loggy.info("%10d", file_size_dl)
                    emit_status_count = 0
                else:
                    emit_status_count += 1
    finally:
        # Release the HTTP connection whether or not the download succeeded.
        url_handler.close()

    # http://www.gavinj.net/2007/05/python-file-magic.html
    magic_type = util.MS.file(out_file)
    file_meta.update({
        'out_file': out_file,
        'magic_type': util.magic_to_mime(magic_type),
        'md5sum': md5_for_file(out_file)
    })

    # -1 means the headers did not provide a size; use the real one.
    if file_meta['file_size'] == -1:
        file_meta['file_size'] = os.path.getsize(out_file)

    return file_meta
def test_magic_to_mime(self):
    # A libmagic-style description of CRLF-terminated ASCII text is
    # expected to translate to the text/plain MIME type.
    magic_description = 'ascii text, with crlf line terminators'
    detected_mime = util.magic_to_mime(magic_description)
    self.assertEqual("text/plain", detected_mime)