def process(self):
    '''Check the current URL item against the fetch rule and fetch it.

    The item is skipped when the fetch rule's FTP check yields a falsy
    verdict. When globbing is enabled and the filename contains glob
    characters, the request is redirected to the containing directory
    and the unquoted filename is stored as the glob pattern.

    Coroutine.
    '''
    url_item = self._url_item
    check_result = self._fetch_rule.check_ftp_request(
        url_item.url_info, url_item.url_record)

    # Only the first element of the check result (the verdict) is used.
    if not check_result[0]:
        url_item.skip()
        return

    request = Request(url_item.url_info.url)  # TODO: dependency inject

    if self._fetch_rule.ftp_login:
        request.username, request.password = self._fetch_rule.ftp_login

    _unused_dir_name, filename = url_item.url_info.split_path()

    glob_requested = (
        self._processor.fetch_params.glob
        and frozenset(filename) & GLOB_CHARS
    )

    if glob_requested:
        # Fetch the parent directory instead of the literal path.
        directory_url = to_dir_path_url(request.url_info)
        directory_request = copy.deepcopy(request)
        directory_request.url = directory_url
        request = directory_request
        is_file = False
        self._glob_pattern = urllib.parse.unquote(filename)
    else:
        is_file = yield From(self._prepare_request_file_vs_dir(request))

        # NOTE(review): nesting was reconstructed from whitespace-collapsed
        # source; confirm this call belongs to the non-glob branch only.
        self._file_writer_session.process_request(request)

    wait_time = yield From(self._fetch(request, is_file))

    if wait_time:
        _logger.debug('Sleeping {0}.'.format(wait_time))
        yield From(trollius.sleep(wait_time))
def test_login_no_password_required(self):
    '''Fetch and read a file using a request that only sets a username.

    The test passes as long as neither the fetch nor the read raises.
    '''
    client = Client()
    content_sink = io.BytesIO()

    with client.session() as session:
        url = self.get_url('/example (copy).txt')
        request = Request(url)
        request.username = '******'

        yield From(session.fetch(request))
        yield From(session.read_content(content_sink))
def test_fetch_file_restart_not_supported(self):
    '''A restart request that is not honoured yields the full file.

    The response must report a falsy restart value and the body must
    contain the complete file content.
    '''
    client = Client()
    content_sink = io.BytesIO()
    full_content = 'The real treasure is in Smaug’s heart 💗.\n'.encode(
        'utf-8')

    with client.session() as session:
        request = Request(self.get_url('/example (copy).txt'))
        request.set_continue(99999)  # Magic value in the test server

        response = yield From(session.fetch(request))
        self.assertFalse(response.restart_value)

        yield From(session.read_content(content_sink))

    self.assertEqual(full_content, response.body.content())
def test_fetch_file_restart(self):
    '''A restart at offset 10 yields the file minus its first 10 characters.

    The response must echo the restart value of 10.
    '''
    client = Client()
    content_sink = io.BytesIO()
    remaining_content = 'reasure is in Smaug’s heart 💗.\n'.encode('utf-8')

    with client.session() as session:
        request = Request(self.get_url('/example (copy).txt'))
        request.set_continue(10)

        response = yield From(session.fetch(request))
        self.assertEqual(10, response.restart_value)

        yield From(session.read_content(content_sink))

    self.assertEqual(remaining_content, response.body.content())
def test_fetch_file_restart_not_supported(self):
    '''Request a restart the server does not apply and expect the whole file.

    After fetching, the response's restart value must be falsy; after
    reading, the body must hold the complete file content.
    '''
    ftp_client = Client()
    output = io.BytesIO()

    with ftp_client.session() as session:
        target_url = self.get_url('/example (copy).txt')
        request = Request(target_url)
        # Magic value in the test server
        request.set_continue(99999)

        response = yield From(session.fetch(request))
        self.assertFalse(response.restart_value)
        yield From(session.read_content(output))

    expected_body = 'The real treasure is in Smaug’s heart 💗.\n'
    self.assertEqual(
        expected_body.encode('utf-8'),
        response.body.content()
    )
def test_fetch_file_restart(self):
    '''Request a restart at offset 10 and expect only the file's tail.

    The response must report a restart value of 10 and the body must
    start after the first 10 characters of the file.
    '''
    ftp_client = Client()
    output = io.BytesIO()

    with ftp_client.session() as session:
        target_url = self.get_url('/example (copy).txt')
        request = Request(target_url)
        request.set_continue(10)

        response = yield From(session.fetch(request))
        self.assertEqual(10, response.restart_value)
        yield From(session.read_content(output))

    expected_body = 'reasure is in Smaug’s heart 💗.\n'
    self.assertEqual(
        expected_body.encode('utf-8'),
        response.body.content()
    )
def test_duration_timeout(self):
    '''Fetching and reading with a 0.1 duration timeout raises DurationTimeout.

    The target is '/hidden/sleep.txt' (presumably served slowly by the
    test server; confirm against the server fixture).
    '''
    client = Client()
    content_sink = io.BytesIO()

    with self.assertRaises(DurationTimeout):
        with client.session() as session:
            request = Request(self.get_url('/hidden/sleep.txt'))

            yield From(session.fetch(request))
            yield From(
                session.read_content(content_sink, duration_timeout=0.1))
def test_to_dict(self):
    '''Request and Response serialise their fields through to_dict().

    Checks the URL and protocol of the request, and the nested request,
    protocol, reply code/text and their top-level aliases on the response.
    '''
    url = 'ftp://foofle.com'
    reply_code = 200
    reply_text = 'Success'

    request = Request(url)
    request_dict = request.to_dict()

    self.assertEqual(url, request_dict['url'])
    self.assertEqual('ftp', request_dict['protocol'])

    response = Response()
    response.request = request
    response.reply = Reply(code=reply_code, text=reply_text)
    response_dict = response.to_dict()

    self.assertEqual(url, response_dict['request']['url'])
    self.assertEqual('ftp', response_dict['protocol'])

    # The reply code is exposed both nested and as 'response_code'.
    self.assertEqual(reply_code, response_dict['reply']['code'])
    self.assertEqual(reply_code, response_dict['response_code'])

    # Likewise the reply text and 'response_message'.
    self.assertEqual(reply_text, response_dict['reply']['text'])
    self.assertEqual(reply_text, response_dict['response_message'])
def test_fetch_no_file(self):
    '''Fetching a missing file raises FTPServerError with reply code 550.

    The error may surface from either the fetch or the content read, so
    both calls are placed inside the assertRaises block.
    '''
    client = Client()
    content_sink = io.BytesIO()

    with client.session() as session:
        # assertRaises reports a descriptive failure if nothing is raised,
        # replacing the bare self.fail() fallback.
        with self.assertRaises(FTPServerError) as raised:
            yield From(session.fetch(Request(self.get_url('/asdf.txt'))))
            yield From(session.read_content(content_sink))

        self.assertEqual(550, raised.exception.reply_code)
def test_fetch_file(self):
    '''Fetch a file and verify the response body holds its full content.'''
    client = Client()
    content_sink = io.BytesIO()
    expected_content = 'The real treasure is in Smaug’s heart 💗.\n'.encode(
        'utf-8')

    with client.session() as session:
        request = Request(self.get_url('/example (copy).txt'))
        response = yield From(session.fetch(request))

        yield From(session.read_content(content_sink))

    self.assertEqual(expected_content, response.body.content())
def test_fetch_listing(self):
    '''Fetch the root directory listing and verify the parsed file names.

    The listing must contain exactly the five expected entries, in order.
    '''
    client = Client()
    listing_sink = io.BytesIO()

    with client.session() as session:
        response = yield From(
            session.fetch_file_listing(Request(self.get_url('/'))))
        yield From(session.read_listing_content(listing_sink))

    # A single list comparison checks count, names and order together and
    # shows the full actual listing when it fails.
    self.assertEqual(
        [
            'junk',
            'example1',
            'example2💎',
            'example (copy).txt',
            'readme.txt',
        ],
        [entry.name for entry in response.files]
    )
def test_fetch_bad_pasv_addr(self):
    '''Fetching after sending EVIL_BAD_PASV_ADDR raises ProtocolError.

    The session's login routine is wrapped so that, once the real login
    finishes, the EVIL_BAD_PASV_ADDR command is written to the control
    stream (presumably switching the test server into returning a bad
    passive-mode address; confirm against the server fixture).
    '''
    client = Client()
    # Kept for parity with the other tests; no content is read here.
    file = io.BytesIO()

    with client.session() as session:
        real_log_in = session._log_in

        @trollius.coroutine
        def log_in_then_send_evil_command():
            yield From(real_log_in())

            evil_command = Command('EVIL_BAD_PASV_ADDR')
            yield From(session._control_stream.write_command(evil_command))
            print('Evil awaits')

        session._log_in = log_in_then_send_evil_command

        with self.assertRaises(ProtocolError):
            request = Request(self.get_url('/example (copy).txt'))
            yield From(session.fetch(request))
def test_warc_recorder_ftp(self):
    '''Record one FTP exchange to an uncompressed WARC file and inspect it.

    The written file is checked for the expected record types, headers,
    software versions, payload and control-conversation log lines, then
    validated. Finally one byte is overwritten to confirm that validation
    rejects a corrupted file.
    '''
    file_prefix = 'asdf'
    warc_filename = 'asdf.warc'
    recorder_params = WARCRecorderParams(compress=False)
    warc_recorder = WARCRecorder(file_prefix, params=recorder_params)

    request = FTPRequest('ftp://example.com/example.txt')
    request.address = ('0.0.0.0', 80)

    response = FTPResponse()
    response.reply = FTPReply(200, 'OK')
    response.body = Body()
    response.data_address = ('0.0.0.0', 12345)

    with wpull.util.reset_file_offset(response.body):
        response.body.write(b'KITTEH DOGE')

    with warc_recorder.session() as session:
        session.begin_control(request)
        session.request_control_data(b'GIMMEH example.txt')
        session.response_control_data(b'200 OK, no need to yell.')
        session.pre_response(response)
        session.response_data(b'KITTEH DOGE')
        session.response(response)
        session.end_control(response)

    warc_recorder.close()

    with open(warc_filename, 'rb') as in_file:
        warc_file_content = in_file.read()

    self.assertTrue(warc_file_content.startswith(b'WARC/1.0'))

    # Markers checked before the payload-digest exclusion.
    for marker in (
            b'WARC-Type: warcinfo\r\n',
            b'Content-Type: application/warc-fields',
            b'WARC-Date: ',
            b'WARC-Record-ID: <urn:uuid:',
            b'WARC-Block-Digest: sha1:',
    ):
        self.assertIn(marker, warc_file_content)

    self.assertNotIn(b'WARC-Payload-Digest: sha1:', warc_file_content)

    wpull_version = 'Wpull/{0}'.format(
        wpull.version.__version__).encode('utf-8')
    python_version = 'Python/{0}'.format(
        wpull.util.python_version()).encode('utf-8')

    # Remaining markers, in the original assertion order.
    for marker in (
            b'WARC-Type: resource\r\n',
            b'WARC-Target-URI: ftp://',
            b'Content-Type: application/octet-stream',
            b'WARC-Type: metadata',
            b'WARC-Concurrent-To: <urn:uuid:',
            b'Content-Type: text/x-ftp-control-conversation',
            wpull_version,
            python_version,
            b'KITTEH DOGE',
            b'* Opening control connection to',
            b'* Kept control connection to',
            b'* Opened data connection to ',
            b'* Closed data connection to ',
            b'> GIMMEH example.txt',
            b'< 200 OK, no need to yell.',
    ):
        self.assertIn(marker, warc_file_content)

    # Ignore Concurrent Record ID not seen yet
    self.validate_warc(warc_filename, ignore_minor_error=True)

    with open(warc_filename, 'r+b') as in_file:
        # Intentionally modify the contents
        in_file.seek(355)
        in_file.write(b'f')

    with self.assertRaises(Exception):
        # Sanity check that it actually raises error on bad digest
        self.validate_warc(warc_filename, ignore_minor_error=True)
def test_warc_recorder_ftp(self):
    '''Write an FTP request/response pair through WARCRecorder and verify it.

    Checks the uncompressed WARC output for record types, headers,
    version strings, the payload and the control-connection log, runs
    the validator, then corrupts one byte and expects validation to fail.
    '''
    file_prefix = 'asdf'
    warc_filename = 'asdf.warc'
    payload = b'KITTEH DOGE'

    warc_recorder = WARCRecorder(
        file_prefix, params=WARCRecorderParams(compress=False)
    )

    request = FTPRequest('ftp://example.com/example.txt')
    request.address = ('0.0.0.0', 80)

    response = FTPResponse()
    response.reply = FTPReply(200, 'OK')
    response.body = Body()
    response.data_address = ('0.0.0.0', 12345)

    with wpull.util.reset_file_offset(response.body):
        response.body.write(payload)

    with warc_recorder.session() as session:
        session.begin_control(request)
        session.request_control_data(b'GIMMEH example.txt')
        session.response_control_data(b'200 OK, no need to yell.')
        session.pre_response(response)
        session.response_data(payload)
        session.response(response)
        session.end_control(response)

    warc_recorder.close()

    with open(warc_filename, 'rb') as in_file:
        warc_file_content = in_file.read()

    def expect_present(marker):
        # Assert that the marker bytes occur somewhere in the WARC file.
        self.assertIn(marker, warc_file_content)

    self.assertTrue(warc_file_content.startswith(b'WARC/1.0'))

    # warcinfo record and common headers
    expect_present(b'WARC-Type: warcinfo\r\n')
    expect_present(b'Content-Type: application/warc-fields')
    expect_present(b'WARC-Date: ')
    expect_present(b'WARC-Record-ID: <urn:uuid:')
    expect_present(b'WARC-Block-Digest: sha1:')
    self.assertNotIn(b'WARC-Payload-Digest: sha1:', warc_file_content)

    # resource and metadata records
    expect_present(b'WARC-Type: resource\r\n')
    expect_present(b'WARC-Target-URI: ftp://')
    expect_present(b'Content-Type: application/octet-stream')
    expect_present(b'WARC-Type: metadata')
    expect_present(b'WARC-Concurrent-To: <urn:uuid:')
    expect_present(b'Content-Type: text/x-ftp-control-conversation')

    # software identification
    expect_present(
        'Wpull/{0}'.format(wpull.version.__version__).encode('utf-8'))
    expect_present(
        'Python/{0}'.format(wpull.util.python_version()).encode('utf-8'))

    # payload and control conversation log
    expect_present(b'KITTEH DOGE')
    expect_present(b'* Opening control connection to')
    expect_present(b'* Kept control connection to')
    expect_present(b'* Opened data connection to ')
    expect_present(b'* Closed data connection to ')
    expect_present(b'> GIMMEH example.txt')
    expect_present(b'< 200 OK, no need to yell.')

    # Ignore Concurrent Record ID not seen yet
    self.validate_warc(warc_filename, ignore_minor_error=True)

    with open(warc_filename, 'r+b') as in_file:
        # Intentionally modify the contents
        in_file.seek(355)
        in_file.write(b'f')

    with self.assertRaises(Exception):
        # Sanity check that it actually raises error on bad digest
        self.validate_warc(warc_filename, ignore_minor_error=True)