Example #1
0
    def test_app_args_warc_dedup(self):
        """Fetch a file with ``--warc-dedup`` and expect a revisit record.

        A small ``dedup.cdx`` index containing one entry for the target URL
        is written first.  After the run, the uncompressed ``test.warc`` must
        contain the digest string and ``<under-the-deer>`` token from that
        entry together with ``Type: revisit``.
        """
        parser = AppArgumentParser()

        # One header line followed by a single entry for the target URL.
        cdx_content = b''.join([
            b' CDX a k u\n',
            self.get_url('/static/my_file.txt').encode('ascii'),
            b' KQ4IUKATKL63FT5GMAE2YDRV3WERNL34',
            b' <under-the-deer>\n',
        ])

        with open('dedup.cdx', 'wb') as cdx_file:
            cdx_file.write(cdx_content)

        cli_args = [
            self.get_url('/static/my_file.txt'),
            '--no-parent',
            '--warc-file', 'test',
            '--no-warc-compression',
            '-4',
            '--no-robots',
            '--warc-dedup', 'dedup.cdx',
        ]
        args = parser.parse_args(cli_args)

        builder = Builder(args, unit_test=True)
        application = builder.build()
        exit_code = yield from application.run()

        with open('test.warc', 'rb') as warc_file:
            warc_bytes = warc_file.read()

            expected_fragments = (
                b'KQ4IUKATKL63FT5GMAE2YDRV3WERNL34',
                b'Type: revisit',
                b'<under-the-deer>',
            )
            for fragment in expected_fragments:
                self.assertIn(fragment, warc_bytes)

        self.assertEqual(0, exit_code)
        file_count = builder.factory['Statistics'].files
        self.assertGreaterEqual(file_count, 1)
Example #2
0
    def test_app_args_warc(self):
        """Recursively fetch the root page into a gzip-compressed WARC.

        The run must produce ``test.warc.gz`` whose decompressed content
        contains ``FINISHED``, exit with code 0 and count at least one file.
        """
        parser = AppArgumentParser()
        cli_args = [
            self.get_url('/'),
            '--no-parent',
            '--recursive',
            '--page-requisites',
            '--warc-file', 'test',
            '-4',
            '--no-robots',
            '--no-warc-digests',
        ]
        builder = Builder(parser.parse_args(cli_args), unit_test=True)

        application = builder.build()
        exit_code = yield from application.run()

        warc_path = 'test.warc.gz'
        self.assertTrue(os.path.exists(warc_path))

        with gzip.GzipFile(warc_path) as warc_file:
            warc_bytes = warc_file.read()
            self.assertIn(b'FINISHED', warc_bytes)

        self.assertEqual(0, exit_code)
        file_count = builder.factory['Statistics'].files
        self.assertGreaterEqual(file_count, 1)
Example #3
0
    def test_ssl_bad_certificate(self):
        """Simulate an SSL verification failure and expect exit code 7.

        The factory's ``WebClient`` class is replaced by a subclass whose
        sessions always raise ``SSLVerificationError`` from ``start``.  The
        run must exit with code 7 and count no files.
        """
        parser = AppArgumentParser()
        cli_args = [
            self.get_url('/'),
            '--no-robots',
            '--no-check-certificate',
            '--tries', '1',
        ]
        builder = Builder(parser.parse_args(cli_args), unit_test=True)

        class MockWebSession(WebSession):
            # Session whose start always fails with a certificate error.
            @asyncio.coroutine
            def start(self):
                raise SSLVerificationError('A very bad certificate!')

        # Subclass whatever WebClient class the factory currently maps to.
        base_client_class = builder.factory.class_map['WebClient']

        class MockWebClient(base_client_class):
            def session(self, request):
                return MockWebSession(
                    request,
                    self._http_client,
                    self._redirect_tracker_factory(),
                    Request,
                )

        builder.factory.class_map['WebClient'] = MockWebClient

        application = builder.build()
        exit_code = yield from application.run()

        self.assertEqual(7, exit_code)
        file_count = builder.factory['Statistics'].files
        self.assertEqual(0, file_count)
Example #4
0
    def test_app_sanity(self):
        """Check that conflicting option pairs are rejected by the parser.

        Each pair is parsed together with a dummy URL.  The parser's ``exit``
        is replaced by a stub that raises ``ValueError`` carrying the exit
        status, so the status can be inspected; status 2 is expected for
        every pair.  A pair that parses without error fails the test.
        """
        arg_items = [
            ('--verbose', '--quiet'),
            ('--timestamp', '--no-clobber'),
            ('--inet4-only', '--inet6-only'),
            ('--warc-file=test', '--no-clobber'),
            ('--warc-file=test', '--timestamping'),
            ('--warc-file=test', '--continue'),
            ('--no-iri', '--local-encoding=shiftjis'),
            ('--no-iri', '--remote-encoding=shiftjis'),
        ]

        # The stubs do not depend on the loop variable, so they are defined
        # once instead of on every iteration.
        def print_(message=None):
            print(message)

        def test_exit(status=0, message=None):
            # Raise instead of terminating the process so the status can be
            # checked by the caller.
            raise ValueError(status, message)

        for arg_item in arg_items:
            arg_parser = AppArgumentParser()
            arg_parser.exit = test_exit
            arg_parser.print_help = print_
            arg_parser.print_usage = print_

            # Show which pair is under test in the captured output.
            print(arg_item)

            try:
                arg_parser.parse_args(['http://example.invalid'] +
                                      list(arg_item))
            except ValueError as error:
                self.assertEqual(2, error.args[0])
            else:
                self.fail(
                    'Conflicting options were accepted: {!r}'.format(
                        arg_item))
Example #5
0
    def test_app_phantomjs_scroll(self):
        """Run PhantomJS with scrolling enabled and check the snapshot.

        The saved ``DEUUEAUGH.html.snapshot.html`` must contain
        ``Count: 10`` and the run must exit with code 0.
        """
        parser = AppArgumentParser()

        # Change localhost into something else to test proxy
        page_url = self.get_url('/static/DEUUEAUGH.html').replace(
            'localhost', 'example.invalid')
        cli_args = [
            page_url,
            '-4',
            '--no-robots',
            '--phantomjs',
            '--phantomjs-wait', '0.4',
            '--phantomjs-scroll', '20',
            '--no-check-certificate',
        ]
        builder = Builder(parser.parse_args(cli_args), unit_test=True)
        builder.factory.class_map['Resolver'] = MockDNSResolver

        application = builder.build()
        exit_code = yield from application.run()

        with open('DEUUEAUGH.html.snapshot.html', 'rb') as snapshot_file:
            snapshot_bytes = snapshot_file.read()
            self.assertIn(b'Count: 10', snapshot_bytes)

        self.assertEqual(0, exit_code)
Example #6
0
    def test_big_payload(self):
        """Download ``/big_payload`` and compare it with a locally built copy.

        The expected content is a chain of 10001 SHA-1 digests: each digest
        is appended to the payload and then fed back into the hash object.
        """
        hasher = hashlib.sha1(b'foxfoxfox')
        chunks = []

        for _ in range(10000):
            chunk = hasher.digest()
            chunks.append(chunk)
            hasher.update(chunk)

        # Final digest after the last update closes the payload.
        chunks.append(hasher.digest())
        expected_payload = b''.join(chunks)

        parser = AppArgumentParser()
        args = parser.parse_args([self.get_url('/big_payload')])
        builder = Builder(args, unit_test=True)

        application = builder.build()
        exit_code = yield from application.run()
        self.assertTrue(os.path.exists('big_payload'))

        with open('big_payload', 'rb') as downloaded_file:
            downloaded_bytes = downloaded_file.read()
            self.assertEqual(expected_payload, downloaded_bytes)

        self.assertEqual(0, exit_code)
        file_count = builder.factory['Statistics'].files
        self.assertEqual(1, file_count)
Example #7
0
    def test_save_cookie(self):
        """Load cookies from a file and check which ones are saved again.

        The input file holds an ``isloggedin`` entry with an expiry value
        and an ``admin`` entry with an empty expiry field.  After the run,
        ``wpull_test_cookies.txt`` must contain the former but not the
        latter.
        """
        parser = AppArgumentParser()

        with tempfile.NamedTemporaryFile() as cookie_file:
            cookie_lines = b''.join([
                b'# Kittens\n',
                b'localhost.local',
                b'\tFALSE\t/\tFALSE\t9999999999\tisloggedin\t1\n',
                b'\tFALSE\t/\tFALSE\t\tadmin\t1\n',
            ])
            cookie_file.write(cookie_lines)
            # Make sure the content is on disk before the app reads it.
            cookie_file.flush()

            cli_args = [
                self.get_url('/some_page/'),
                '--load-cookies', cookie_file.name,
                '--tries', '1',
                '--save-cookies', 'wpull_test_cookies.txt',
            ]
            builder = Builder(parser.parse_args(cli_args), unit_test=True)

            application = builder.build()
            exit_code = yield from application.run()

            self.assertEqual(0, exit_code)
            file_count = builder.factory['Statistics'].files
            self.assertEqual(1, file_count)

            with open('wpull_test_cookies.txt', 'rb') as saved_file:
                saved_bytes = saved_file.read()

            self.assertIn(b'isloggedin\t1', saved_bytes)
            self.assertNotIn(b'admin\t1', saved_bytes)
Example #8
0
    def test_timestamping_hit_orig(self):
        """Run ``--timestamping`` against a pre-existing, back-dated file.

        A local ``lastmod`` file containing ``HI`` is created in the
        temporary directory and its timestamps are set to 631152000
        (1990-01-01 UTC).  After the run the file must still contain ``HI``
        and the exit code must be 0.
        """
        parser = AppArgumentParser()
        cli_args = [self.get_url('/lastmod'), '--timestamping']
        args = parser.parse_args(cli_args)

        # NOTE(review): both paths resolve to the same file, 'lastmod'.  The
        # test name suggests the second one may have been meant to be a
        # separate ".orig" copy -- confirm before changing.
        local_path = os.path.join(self.temp_dir.name, 'lastmod')
        orig_path = os.path.join(self.temp_dir.name, 'lastmod')

        for path in (local_path, orig_path):
            with open(path, 'wb') as out_file:
                out_file.write(b'HI')

        # Same value for access time and modification time.
        old_timestamp = 631152000
        os.utime(orig_path, (old_timestamp, old_timestamp))

        builder = Builder(args, unit_test=True)
        application = builder.build()
        exit_code = yield from application.run()

        self.assertEqual(0, exit_code)

        for path in (local_path, orig_path):
            with open(path, 'rb') as in_file:
                self.assertEqual(b'HI', in_file.read())
Example #9
0
    def test_app_python_plugin_script(self):
        """Run the app with the ``extensive.plugin.py`` sample plugin.

        The test expects exit code 42, a ``PipelineSeries`` concurrency of
        2, three counted files and a total duration below 10 seconds even
        though ``--wait 12`` is passed (presumably the plugin overrides
        these values -- see the plugin script to confirm).
        """
        parser = AppArgumentParser()
        plugin_path = os.path.join(
            os.path.dirname(__file__),
            'sample_user_scripts',
            'extensive.plugin.py',
        )
        cli_args = [
            self.get_url('/'),
            self.get_url('/some_page'),
            self.get_url('/mordor'),
            'localhost:1/wolf',
            '--plugin-script', plugin_path,
            '--page-requisites',
            '--reject-regex', '/post/',
            '--wait', '12',
            '--retry-connrefused',
            '--tries', '1',
        ]
        builder = Builder(parser.parse_args(cli_args), unit_test=True)

        application = builder.build()
        exit_code = yield from application.run()

        # Dump the working directory tree to help diagnose failures.
        directory_tree = list(os.walk('.'))
        print(directory_tree)

        self.assertEqual(42, exit_code)

        pipeline_series = builder.factory['PipelineSeries']
        self.assertEqual(2, pipeline_series.concurrency)

        statistics = builder.factory['Statistics']

        self.assertEqual(3, statistics.files)

        # duration should be virtually 0 but account for slowness on travis ci
        self.assertGreater(10.0, statistics.duration)
Example #10
0
def main(exit=True, install_tornado_bridge=True, use_signals=True):
    """Parse command line arguments, build the application and run it.

    Args:
        exit: When true, terminate the process with ``sys.exit`` using the
            application's exit code; otherwise return that code.
        install_tornado_bridge: When true, install Tornado's
            ``AsyncIOMainLoop`` before anything else is set up.
        use_signals: When true, call the application's
            ``setup_signal_handlers``.

    Returns:
        The exit code from ``application.run_sync()`` when ``exit`` is
        false.
    """
    if install_tornado_bridge:
        tornado.platform.asyncio.AsyncIOMainLoop().install()

    args = AppArgumentParser().parse_args()
    builder = Builder(args)
    application = builder.build()

    if use_signals:
        application.setup_signal_handlers()

    if args.debug_manhole:
        # Imported lazily: only needed when the debug option is given.
        import manhole
        import wpull

        # Expose the builder on the package before installing manhole.
        wpull.wpull_builder = builder
        manhole.install()

    exit_code = application.run_sync()

    if not exit:
        return exit_code

    sys.exit(exit_code)
Example #11
0
    def test_propagate_ipv4_only_and_no_cert_check_to_youtube_dl(self):
        """Check that network options are forwarded to the youtube-dl process.

        The run uses ``--inet4-only`` and ``--no-check-certificate`` with
        debug logging written to ``test.log``.  The log must show a
        ``Starting process ['youtube-dl ...`` line containing
        ``--force-ipv4`` and one containing ``--no-check-certificate``.
        """
        arg_parser = AppArgumentParser()
        args = arg_parser.parse_args([
            'https://www.youtube.com/watch?v=tPEE9ZwTmy0',
            '--warc-file',
            'test',
            '--debug',  # to capture youtube-dl arguments in the log
            '--no-warc-compression',
            '--youtube-dl',
            '--inet4-only',
            '--no-check-certificate',
            '--output-file',
            'test.log'
        ])
        builder = Builder(args, unit_test=True)

        app = builder.build()
        exit_code = yield from app.run()

        self.assertEqual(0, exit_code)

        with open('test.log', 'rb') as test_log:
            data = test_log.read()

            # Raw bytes literals keep ``\[`` as a regex escape; in a normal
            # literal it is an invalid string escape sequence that newer
            # Python versions warn about.
            self.assertTrue(
                re.search(br"Starting process \['youtube-dl.*--force-ipv4",
                          data))
            self.assertTrue(
                re.search(
                    br"Starting process \['youtube-dl.*--no-check-certificate",
                    data))
Example #12
0
    def test_app_phantomjs(self):
        """Run PhantomJS with WARC output, a custom header and a plugin.

        The run must produce ``test.warc`` plus HTML and PDF snapshots of
        ``simple_javascript.html``.  The HTML snapshot must contain
        ``Hello world!`` and the WARC must contain the snapshot URN, the
        expected content types, the ``"set_scroll_top"`` marker and the
        custom ``Accept-Language`` header.
        """
        parser = AppArgumentParser()
        plugin_path = os.path.join(
            os.path.dirname(__file__),
            'sample_user_scripts',
            'boring.plugin.py',
        )

        # Change localhost into something else to test proxy
        page_url = self.get_url('/static/simple_javascript.html').replace(
            'localhost', 'example.invalid')
        cli_args = [
            page_url,
            '--warc-file', 'test',
            '--no-warc-compression',
            '-4',
            '--no-robots',
            '--phantomjs',
            '--phantomjs-exe', 'phantomjs',
            '--phantomjs-wait', '0.1',
            '--phantomjs-scroll', '2',
            '--header', 'accept-language: dragon',
            '--plugin-script', plugin_path,
            '--no-check-certificate',
        ]
        builder = Builder(parser.parse_args(cli_args), unit_test=True)
        builder.factory.class_map['Resolver'] = MockDNSResolver

        application = builder.build()
        exit_code = yield from application.run()

        expected_paths = (
            'test.warc',
            'simple_javascript.html.snapshot.html',
            'simple_javascript.html.snapshot.pdf',
        )
        for expected_path in expected_paths:
            self.assertTrue(os.path.exists(expected_path))

        with open('simple_javascript.html.snapshot.html', 'rb') as html_file:
            snapshot_bytes = html_file.read()
            self.assertIn(b'Hello world!', snapshot_bytes)

        with open('test.warc', 'rb') as warc_file:
            warc_bytes = warc_file.read()

            expected_fragments = (
                b'urn:X-wpull:snapshot?url=',
                b'text/html',
                b'application/pdf',
                b'application/json',
                b'"set_scroll_top"',
            )
            for fragment in expected_fragments:
                self.assertIn(fragment, warc_bytes)

            # Accept either an explicit identity encoding or, failing that,
            # at least the absence of a gzip encoding request.
            try:
                self.assertIn(b'Accept-Encoding: identity', warc_bytes)
            except AssertionError:
                # webkit treats localhost differently
                self.assertNotIn(b'Accept-Encoding: gzip', warc_bytes)

            self.assertIn(b'Accept-Language: dragon', warc_bytes)

        self.assertEqual(0, exit_code)
        file_count = builder.factory['Statistics'].files
        self.assertGreaterEqual(file_count, 1)
Example #13
0
    def test_no_iri(self):
        """Fetch the root page with ``--no-iri`` and expect one counted file."""
        parser = AppArgumentParser()
        cli_args = [self.get_url('/'), '--no-iri', '--no-robots']
        builder = Builder(parser.parse_args(cli_args), unit_test=True)

        application = builder.build()
        exit_code = yield from application.run()

        self.assertEqual(0, exit_code)
        file_count = builder.factory['Statistics'].files
        self.assertEqual(1, file_count)
Example #14
0
 def test_app_args_post_data(self):
     """Send ``text=hi`` via ``--post-data`` to ``/post/``; expect exit 0."""
     parser = AppArgumentParser()
     cli_args = [
         self.get_url('/post/'),
         '--post-data', 'text=hi',
     ]
     builder = Builder(parser.parse_args(cli_args), unit_test=True)
     application = builder.build()
     exit_code = yield from application.run()
     self.assertEqual(0, exit_code)
Example #15
0
    def test_database_uri(self):
        """Run with ``--database-uri sqlite:///test.db``; expect exit 0."""
        parser = AppArgumentParser()

        cli_args = [self.get_url('/'), '--database-uri', 'sqlite:///test.db']
        args = parser.parse_args(cli_args)

        builder = Builder(args, unit_test=True)
        application = builder.build()
        exit_code = yield from application.run()

        self.assertEqual(0, exit_code)
Example #16
0
    def test_check_certificate(self):
        """Fetch without ``--no-check-certificate`` and expect exit code 5.

        Presumably the test server's certificate does not pass verification;
        confirm against the test fixture.
        """
        parser = AppArgumentParser()
        cli_args = [
            self.get_url('/'),
            '--no-robots',
        ]
        builder = Builder(parser.parse_args(cli_args), unit_test=True)

        application = builder.build()
        exit_code = yield from application.run()

        self.assertEqual(5, exit_code)
Example #17
0
    def test_session_timeout(self):
        """Fetch ``/sleep_long`` with a 0.1 session timeout.

        The run must exit with code 4 and count no files.
        """
        parser = AppArgumentParser()
        cli_args = [
            self.get_url('/sleep_long'),
            '--tries=1',
            '--session-timeout=0.1',
        ]
        builder = Builder(parser.parse_args(cli_args), unit_test=True)

        application = builder.build()
        exit_code = yield from application.run()

        self.assertEqual(4, exit_code)
        file_count = builder.factory['Statistics'].files
        self.assertEqual(0, file_count)
Example #18
0
    def test_referer_option(self):
        """Recurse from ``/referrer/`` with ``--referer http://left.shark/``.

        The run must exit with code 0 and count two files.
        """
        parser = AppArgumentParser()
        cli_args = [
            self.get_url('/referrer/'),
            '-r',
            '--referer', 'http://left.shark/',
        ]
        builder = Builder(parser.parse_args(cli_args), unit_test=True)

        application = builder.build()
        exit_code = yield from application.run()

        self.assertEqual(0, exit_code)
        file_count = builder.factory['Statistics'].files
        self.assertEqual(2, file_count)
Example #19
0
    def test_escaped_fragment_recursive(self):
        """Recurse from ``/escape_from_fragments/`` with ``--escaped-fragment``.

        The run must exit with code 0 and count two files.
        """
        parser = AppArgumentParser()
        cli_args = [
            self.get_url('/escape_from_fragments/'),
            '-r',
            '--escaped-fragment',
        ]
        builder = Builder(parser.parse_args(cli_args), unit_test=True)

        application = builder.build()
        exit_code = yield from application.run()

        self.assertEqual(0, exit_code)
        file_count = builder.factory['Statistics'].files
        self.assertEqual(2, file_count)
Example #20
0
    def test_database_path_question_mark(self):
        """Pass a database path containing ``?`` and check the created file.

        With ``--database test?.db`` the run must exit with code 0 and a
        file named ``test_.db`` must exist afterwards.
        """
        parser = AppArgumentParser()

        cli_args = [self.get_url('/'), '--database', 'test?.db']
        args = parser.parse_args(cli_args)

        builder = Builder(args, unit_test=True)
        application = builder.build()
        exit_code = yield from application.run()

        self.assertEqual(0, exit_code)
        database_exists = os.path.exists('test_.db')
        self.assertTrue(database_exists)
Example #21
0
    def test_login_fail(self):
        """Fetch with user ``smaug`` / password ``hunter2``; expect failure.

        The run must exit with code 6 and count no files.
        """
        parser = AppArgumentParser()
        cli_args = [
            self.get_url('/example (copy).txt'),
            '--user', 'smaug',
            '--password', 'hunter2',
            '--tries', '1',
        ]
        builder = Builder(parser.parse_args(cli_args), unit_test=True)

        application = builder.build()
        exit_code = yield from application.run()

        self.assertEqual(6, exit_code)
        file_count = builder.factory['Statistics'].files
        self.assertEqual(0, file_count)
Example #22
0
    def test_globbing(self):
        """Fetch the pattern URL ``/read*.txt`` and expect one counted file."""
        parser = AppArgumentParser()
        cli_args = [
            self.get_url('/read*.txt'),
        ]
        builder = Builder(parser.parse_args(cli_args), unit_test=True)

        application = builder.build()
        exit_code = yield from application.run()

        # Dump the working directory tree to help diagnose failures.
        directory_tree = list(os.walk('.'))
        print(directory_tree)

        self.assertEqual(0, exit_code)
        file_count = builder.factory['Statistics'].files
        self.assertEqual(1, file_count)
Example #23
0
    def test_immediate_robots_forbidden(self):
        """Recurse from ``/forbidden`` and expect nothing to be counted.

        The run must still exit with code 0 while counting zero files.
        """
        parser = AppArgumentParser()
        cli_args = [
            self.get_url('/forbidden'),
            '--recursive',
        ]
        builder = Builder(parser.parse_args(cli_args), unit_test=True)

        application = builder.build()
        exit_code = yield from application.run()

        self.assertEqual(0, exit_code)
        file_count = builder.factory['Statistics'].files
        self.assertEqual(0, file_count)
Example #24
0
    def test_strip_session_id(self):
        """Recurse from ``/forum/`` with ``--strip-session-id``.

        The run must exit with code 0 and count exactly one file.
        """
        parser = AppArgumentParser()
        cli_args = [
            self.get_url('/forum/'),
            '-r',
            '--strip-session-id',
        ]
        builder = Builder(parser.parse_args(cli_args), unit_test=True)

        application = builder.build()
        exit_code = yield from application.run()

        self.assertEqual(0, exit_code)
        file_count = builder.factory['Statistics'].files
        self.assertEqual(1, file_count)
Example #25
0
    def test_file_vs_directory(self):
        """Recurse one level from ``/example2💎`` keeping the listing file.

        The run must exit with code 0 and leave ``example2💎/.listing`` on
        disk.
        """
        parser = AppArgumentParser()
        cli_args = [
            self.get_url('/example2💎'),
            '--no-host-directories',
            '--no-remove-listing',
            '-r',
            '-l=1',
            '--tries=1',
        ]
        builder = Builder(parser.parse_args(cli_args), unit_test=True)

        application = builder.build()
        exit_code = yield from application.run()

        # Dump the working directory tree to help diagnose failures.
        directory_tree = list(os.walk('.'))
        print(directory_tree)

        self.assertEqual(0, exit_code)
        listing_exists = os.path.exists('example2💎/.listing')
        self.assertTrue(listing_exists)
Example #26
0
    def test_app_python_script_stop(self):
        """Run with the ``stopper.plugin.py`` sample plugin; expect exit 1."""
        parser = AppArgumentParser()
        plugin_path = os.path.join(
            os.path.dirname(__file__),
            'sample_user_scripts',
            'stopper.plugin.py',
        )
        cli_args = [
            self.get_url('/'),
            '--plugin-script', plugin_path,
        ]
        builder = Builder(parser.parse_args(cli_args), unit_test=True)
        application = builder.build()
        exit_code = yield from application.run()

        self.assertEqual(1, exit_code)
Example #27
0
    def test_referer_option_negative(self):
        """Recurse from ``/referrer/`` with a different ``--referer`` value.

        With ``http://superinformation.highway/`` as referer the run must
        exit with code 0 but count no files.
        """
        parser = AppArgumentParser()
        cli_args = [
            self.get_url('/referrer/'),
            '-r',
            '--referer', 'http://superinformation.highway/',
            '--tries', '1',
            '--waitretry', '.1',
        ]
        builder = Builder(parser.parse_args(cli_args), unit_test=True)

        application = builder.build()
        exit_code = yield from application.run()

        self.assertEqual(0, exit_code)
        file_count = builder.factory['Statistics'].files
        self.assertEqual(0, file_count)
Example #28
0
    def test_output_document(self):
        """Write the root page to ``blah.dat`` via ``--output-document``.

        The file must exist and be non-empty, and the exit code must be 0.
        """
        parser = AppArgumentParser()

        output_path = 'blah.dat'
        cli_args = [self.get_url('/'), '--output-document', output_path]
        args = parser.parse_args(cli_args)

        builder = Builder(args, unit_test=True)
        application = builder.build()
        exit_code = yield from application.run()

        self.assertTrue(os.path.exists(output_path))
        # A size of zero is falsy, so this fails for an empty file.
        self.assertTrue(os.path.getsize(output_path))

        self.assertEqual(0, exit_code)
Example #29
0
    def test_invalid_char_dir_list(self):
        """Fetch ``/hidden/invalid_chars/`` keeping the listing file.

        The run must exit with code 0 and leave ``.listing`` in the working
        directory.
        """
        parser = AppArgumentParser()
        cli_args = [
            self.get_url('/hidden/invalid_chars/'),
            '--no-host-directories',
            '--no-remove-listing',
        ]
        builder = Builder(parser.parse_args(cli_args), unit_test=True)

        application = builder.build()
        exit_code = yield from application.run()

        # Dump the working directory tree to help diagnose failures.
        directory_tree = list(os.walk('.'))
        print(directory_tree)

        self.assertEqual(0, exit_code)
        listing_exists = os.path.exists('.listing')
        self.assertTrue(listing_exists)
Example #30
0
    def test_no_cache_arg(self):
        """Fetch ``/no-cache`` without and then with ``--no-cache``.

        Without the option the run must exit with code 8 and count no
        files; with it the run must exit with code 0 and count one file.
        """
        # (extra options, expected exit code, expected file count)
        scenarios = (
            ([], 8, 0),
            (['--no-cache'], 0, 1),
        )

        for extra_options, expected_exit, expected_files in scenarios:
            parser = AppArgumentParser()
            cli_args = [self.get_url('/no-cache'), '--tries=1']
            cli_args.extend(extra_options)
            builder = Builder(parser.parse_args(cli_args), unit_test=True)

            application = builder.build()
            exit_code = yield from application.run()

            self.assertEqual(expected_exit, exit_code)
            file_count = builder.factory['Statistics'].files
            self.assertEqual(expected_files, file_count)