def test_split_url_to_filename(self):
    """Verify URL-to-path splitting and rejection of dot path segments.

    A normal URL must split into host and sanitised file name; URLs whose
    path contains ``..`` or ``.`` must raise ``ValueError``.
    """
    expected = ['example.com', 'index.php_article=Main_Page']
    actual = util.split_url_to_filename(
        'http://example.com/index.php?article=Main_Page')
    self.assertEqual(expected, actual)

    # Parent-directory segment must be refused.
    with self.assertRaises(ValueError):
        util.split_url_to_filename('http://example.com/../system')

    # Current-directory segment must be refused as well.
    with self.assertRaises(ValueError):
        util.split_url_to_filename('http://example.com/./system')
def test_split_url_to_filename(self):
    """Check the happy path and the rejected inputs of the URL splitter."""
    expected_parts = ['example.com', 'index.php_article=Main_Page']
    actual_parts = util.split_url_to_filename(
        'http://example.com/index.php?article=Main_Page')
    self.assertEqual(expected_parts, actual_parts)

    # Each of these URLs carries a dot segment in its path and must be
    # refused with ValueError; they are checked in the original order.
    rejected_urls = (
        'http://example.com/../system',
        'http://example.com/./system',
    )
    for bad_url in rejected_urls:
        with self.assertRaises(ValueError):
            util.split_url_to_filename(bad_url)
def action(self, record):
    """Extract the HTTP payload of a WARC ``response`` record to a file.

    The record is silently skipped unless it is a ``response`` record whose
    content block is a ``model.BlockWithPayload`` carrying
    ``model.HTTPHeaders`` with status ``200 OK``.  The destination path is
    derived from the record's ``WARC-Target-URI`` header and placed beneath
    ``self.out_dir``.  If the response has a parseable ``Last-Modified``
    header, that time is applied as the file's modification time.

    Args:
        record: The WARC record to examine and, if eligible, extract.

    Returns:
        None.
    """
    # Function-scope import so this block brings its own dependency.
    import calendar

    if record.warc_type != 'response':
        return

    content_block = record.content_block

    if not isinstance(content_block, model.BlockWithPayload):
        return

    if not isinstance(content_block.fields, model.HTTPHeaders):
        return

    if content_block.fields.status_code != http.client.OK:
        return

    url = record.header.fields['WARC-Target-URI']
    binary_block = content_block.binary_block
    file_obj = binary_block.get_file()
    data = file_obj.read(binary_block.length)
    response = util.parse_http_response(data)

    path_list = util.split_url_to_filename(url)
    path_list = util.truncate_filename_parts(path_list)
    path = os.path.join(self.out_dir, *path_list)
    # Computed before the index-filename adjustment below, so this is the
    # parent of the URL-derived path.
    dir_path = os.path.dirname(path)

    # The target name is already taken by a directory: write to an index
    # file name produced by the helper instead.
    if os.path.isdir(path):
        path = util.append_index_filename(path)

    _logger.debug('Extracting %s to %s', record.record_id, path)
    util.rename_filename_dirs(path)
    os.makedirs(dir_path, exist_ok=True)

    try:
        with open(path, 'wb') as f:
            shutil.copyfileobj(response, f)
    except http.client.IncompleteRead as error:
        # Best effort: keep the bytes carried by the exception.  Re-opening
        # with 'wb' truncates anything written before the failure.
        _logger.warning('Malformed HTTP response: %s', error)
        with open(path, 'wb') as f:
            f.write(error.partial)

    last_modified_str = response.getheader('Last-Modified')

    if last_modified_str:
        try:
            last_modified = util.parse_http_date(last_modified_str)
        except ValueError:
            # Unparseable date: deliberately leave the mtime untouched.
            pass
        else:
            # utctimetuple() yields a UTC struct_time.  time.mktime() would
            # interpret it as local time and shift the result by the local
            # UTC offset; calendar.timegm() is the UTC-aware inverse.
            timestamp = calendar.timegm(last_modified.utctimetuple())
            os.utime(path, (time.time(), timestamp))
            _logger.debug('Apply mtime %d to %s', timestamp, path)

    _logger.info('Extracted %s to %s', record.record_id, path)
def test_split_url_to_filename(self):
    """Exercise ``util.split_url_to_filename`` on valid and invalid URLs."""
    split = util.split_url_to_filename

    parts = split("http://example.com/index.php?article=Main_Page")
    self.assertEqual(["example.com", "index.php_article=Main_Page"], parts)

    # Hand the callable and its argument straight to assertRaises rather
    # than wrapping each call in a throwaway closure.
    self.assertRaises(ValueError, split, "http://example.com/../system")
    self.assertRaises(ValueError, split, "http://example.com/./system")
def f2():
    """Split a URL whose path contains a ``.`` segment.

    Zero-argument wrapper so the call can be handed to an
    ``assertRaises``-style check; the result of the split is discarded.
    """
    dot_segment_url = 'http://example.com/./system'
    util.split_url_to_filename(dot_segment_url)
def f1():
    """Split a URL whose path contains a ``..`` segment.

    Zero-argument wrapper so the call can be handed to an
    ``assertRaises``-style check; the result of the split is discarded.
    """
    parent_segment_url = "http://example.com/../system"
    util.split_url_to_filename(parent_segment_url)