コード例 #1
0
ファイル: app_test.py プロジェクト: nwpu063291/wpull
    def test_app_args_warc(self):
        """Crawl recursively with WARC output and inspect the gzipped archive.

        Expects ``test.warc.gz`` to exist and to contain ``FINISHED``, an
        exit code of 0, and at least one file counted in the statistics.
        """
        parser = AppArgumentParser()
        argv = [
            self.get_url('/'),
            '--no-parent',
            '--recursive',
            '--page-requisites',
            '--warc-file', 'test',
            '-4',
            '--no-robots',
            '--no-warc-digests',
        ]
        builder = Builder(parser.parse_args(argv))

        with cd_tempdir():
            application = builder.build()
            exit_code = yield application.run()

            warc_path = 'test.warc.gz'
            self.assertTrue(os.path.exists(warc_path))

            with wpull.backport.gzip.GzipFile(warc_path) as warc_file:
                self.assertIn(b'FINISHED', warc_file.read())

        self.assertEqual(0, exit_code)
        self.assertGreaterEqual(builder.factory['Statistics'].files, 1)
コード例 #2
0
ファイル: __main__.py プロジェクト: DanielOaks/wpull
def main():
    """Parse the command line, build the engine and run it until it exits.

    SIGINT asks the engine to stop gracefully the first time and forces a
    stop on any later delivery; SIGTERM always forces a stop.  The process
    exits with the code produced by running the engine on the current
    Tornado IOLoop.
    """
    args = AppArgumentParser().parse_args()
    io_loop = tornado.ioloop.IOLoop.current()
    engine = Builder(args).build()
    # A dict is used so the nested handlers can mutate the flag.
    state = {'graceful_called': False}

    def forceful_stop_handler(signum, frame):
        _logger.info(_('Forcing immediate stop...'))
        engine.stop(force=True)

    def graceful_stop_handler(signum, frame):
        if not state['graceful_called']:
            state['graceful_called'] = True

            _logger.info(_('Stopping once all requests complete...'))
            _logger.info(_('Interrupt again to force stopping immediately.'))
            engine.stop()
        else:
            # Second interrupt: escalate to a forced stop.
            forceful_stop_handler(signum, frame)

    signal.signal(signal.SIGINT, graceful_stop_handler)
    signal.signal(signal.SIGTERM, forceful_stop_handler)

    sys.exit(io_loop.run_sync(engine))
コード例 #3
0
ファイル: app_test.py プロジェクト: imshashank/data-mining
    def test_app_args_warc_dedup(self):
        """Fetch a URL that is listed in a CDX dedup file.

        The uncompressed ``test.warc`` is expected to contain the digest and
        record ID taken from the CDX file plus a ``Type: revisit`` marker.
        """
        parser = AppArgumentParser()

        with cd_tempdir():
            # One CDX header line followed by one "url digest record-id" line.
            cdx_content = b''.join([
                b' CDX a k u\n',
                self.get_url('/static/my_file.txt').encode('ascii'),
                b' KQ4IUKATKL63FT5GMAE2YDRV3WERNL34',
                b' <under-the-deer>\n',
            ])

            with open('dedup.cdx', 'wb') as cdx_file:
                cdx_file.write(cdx_content)

            argv = [
                self.get_url('/static/my_file.txt'),
                '--no-parent',
                '--warc-file', 'test',
                '--no-warc-compression',
                '-4',
                '--no-robots',
                '--warc-dedup', 'dedup.cdx',
            ]
            builder = Builder(parser.parse_args(argv))
            crawler = builder.build()
            exit_code = yield crawler()

            with open('test.warc', 'rb') as warc_file:
                warc_bytes = warc_file.read()

            expected_fragments = (
                b'KQ4IUKATKL63FT5GMAE2YDRV3WERNL34',
                b'Type: revisit',
                b'<under-the-deer>',
            )
            for fragment in expected_fragments:
                self.assertIn(fragment, warc_bytes)

        self.assertEqual(0, exit_code)
        self.assertGreaterEqual(builder.factory['Statistics'].files, 1)
コード例 #4
0
ファイル: app_test.py プロジェクト: lowks/wpull
    def test_local_encoding(self):
        """Pass UTF-32-LE encoded arguments and a UTF-32-LE input file.

        Expects exit code 0 and exactly 2 files counted (one URL from the
        command line and one from the input file).
        """
        parser = AppArgumentParser()
        encoding = 'utf-32-le'

        with tempfile.NamedTemporaryFile() as url_file:
            url_line = self.get_url('/?qwerty') + '\n'
            url_file.write(url_line.encode(encoding))
            url_file.flush()

            text_opts = [
                self.get_url('/?asdf'),
                '--local-encoding', encoding,
                '--input-file', url_file.name
            ]
            # The parser receives bytes, not text, for every argument.
            byte_opts = [opt.encode(encoding) for opt in text_opts]

            builder = Builder(parser.parse_args(byte_opts))

            with cd_tempdir():
                crawler = builder.build()
                exit_code = yield crawler()

        self.assertEqual(0, exit_code)
        self.assertEqual(2, builder.factory['Statistics'].files)
コード例 #5
0
ファイル: app_test.py プロジェクト: DanielOaks/wpull
    def test_app_phantomjs(self):
        """Run with PhantomJS enabled and check the snapshot outputs.

        Expects the gzipped WARC plus HTML and PDF snapshots to exist, and
        the HTML snapshot to contain ``Hello world!``.
        """
        parser = AppArgumentParser()
        argv = [
            self.get_url('/static/simple_javascript.html'),
            '--warc-file', 'test',
            '-4',
            '--no-robots',
            '--phantomjs',
            '--phantomjs-wait', '0.1',
            '--phantomjs-scroll', '2',
        ]
        builder = Builder(parser.parse_args(argv))

        with cd_tempdir():
            crawler = builder.build()
            exit_code = yield crawler()

            produced_files = (
                'test.warc.gz',
                'simple_javascript.html.snapshot.html',
                'simple_javascript.html.snapshot.pdf',
            )
            for produced in produced_files:
                self.assertTrue(os.path.exists(produced))

            with open('simple_javascript.html.snapshot.html', 'rb') as snap:
                self.assertIn(b'Hello world!', snap.read())

        self.assertEqual(0, exit_code)
        self.assertGreaterEqual(builder.factory['Statistics'].files, 1)
コード例 #6
0
ファイル: app_test.py プロジェクト: lowks/wpull
    def test_app_args_warc_dedup(self):
        """Check WARC deduplication against a hand-written CDX file.

        After the run, ``test.warc`` must contain the CDX digest, the CDX
        record ID and the text ``Type: revisit``.
        """
        parser = AppArgumentParser()

        with cd_tempdir():
            with open('dedup.cdx', 'wb') as cdx_file:
                # Header line, then a single entry for the URL being fetched.
                cdx_pieces = (
                    b' CDX a k u\n',
                    self.get_url('/static/my_file.txt').encode('ascii'),
                    b' KQ4IUKATKL63FT5GMAE2YDRV3WERNL34',
                    b' <under-the-deer>\n',
                )
                for piece in cdx_pieces:
                    cdx_file.write(piece)

            parsed = parser.parse_args([
                self.get_url('/static/my_file.txt'),
                '--no-parent',
                '--warc-file', 'test',
                '--no-warc-compression',
                '-4',
                '--no-robots',
                '--warc-dedup', 'dedup.cdx',
            ])

            builder = Builder(parsed)
            runner = builder.build()
            exit_code = yield runner()

            with open('test.warc', 'rb') as warc_file:
                contents = warc_file.read()

                self.assertIn(b'KQ4IUKATKL63FT5GMAE2YDRV3WERNL34', contents)
                self.assertIn(b'Type: revisit', contents)
                self.assertIn(b'<under-the-deer>', contents)

        self.assertEqual(0, exit_code)
        self.assertGreaterEqual(builder.factory['Statistics'].files, 1)
コード例 #7
0
ファイル: app_test.py プロジェクト: DanielOaks/wpull
    def test_app_phantomjs(self):
        """Fetch a JavaScript page with PhantomJS and verify its snapshots.

        The WARC file and both snapshot files must exist; the HTML snapshot
        must contain ``Hello world!``.
        """
        page_url = self.get_url('/static/simple_javascript.html')
        parser = AppArgumentParser()
        parsed = parser.parse_args([
            page_url,
            '--warc-file', 'test',
            '-4',
            '--no-robots',
            '--phantomjs',
            '--phantomjs-wait', '0.1',
            '--phantomjs-scroll', '2',
        ])
        builder = Builder(parsed)

        with cd_tempdir():
            runner = builder.build()
            exit_code = yield runner()

            html_snapshot = 'simple_javascript.html.snapshot.html'
            pdf_snapshot = 'simple_javascript.html.snapshot.pdf'

            self.assertTrue(os.path.exists('test.warc.gz'))
            self.assertTrue(os.path.exists(html_snapshot))
            self.assertTrue(os.path.exists(pdf_snapshot))

            with open(html_snapshot, 'rb') as snapshot_file:
                snapshot_bytes = snapshot_file.read()

            self.assertIn(b'Hello world!', snapshot_bytes)

        self.assertEqual(0, exit_code)
        self.assertGreaterEqual(builder.factory['Statistics'].files, 1)
コード例 #8
0
ファイル: app_test.py プロジェクト: lowks/wpull
 def test_iri_handling(self):
     """Fetch ``/static/mojibake.html`` and expect a zero exit code."""
     parser = AppArgumentParser()
     argv = [self.get_url('/static/mojibake.html')]
     parsed = parser.parse_args(argv)

     with cd_tempdir():
         crawler = Builder(parsed).build()
         exit_code = yield crawler()

     self.assertEqual(0, exit_code)
コード例 #9
0
ファイル: app_test.py プロジェクト: DanielOaks/wpull
 def test_iri_handling(self):
     """Run against the mojibake page; the run must finish with code 0."""
     page_url = self.get_url('/static/mojibake.html')
     parsed = AppArgumentParser().parse_args([page_url])

     with cd_tempdir():
         runner = Builder(parsed).build()
         exit_code = yield runner()

     self.assertEqual(0, exit_code)
コード例 #10
0
def main(exit=True, install_tornado_bridge=True, prefer_trollius=True):
    """Command-line entry point: build the application and run it.

    Args:
        exit: When true, terminate the process via ``sys.exit`` with the
            application's exit code; otherwise return the exit code.
        install_tornado_bridge: When true, install Tornado's
            ``AsyncIOMainLoop`` before building.
        prefer_trollius: When true and ``asyncio`` is importable, set
            asyncio's event loop policy to the one provided by trollius.

    Returns:
        The exit code from ``application.run_sync()`` when ``exit`` is false.
    """
    if prefer_trollius:
        try:
            import asyncio
        except ImportError:
            # asyncio is unavailable, so there is no policy to override.
            pass
        else:
            trollius_policy = trollius.get_event_loop_policy()
            asyncio.set_event_loop_policy(trollius_policy)

    if install_tornado_bridge:
        bridge = tornado.platform.asyncio.AsyncIOMainLoop()
        bridge.install()

    args = AppArgumentParser().parse_args()

    builder = Builder(args)
    builder.build()

    application = builder.factory['Application']
    application.setup_signal_handlers()

    if args.debug_manhole:
        import manhole
        import wpull
        # Expose the builder on the package before installing manhole.
        wpull.wpull_builder = builder
        manhole.install()

    exit_code = application.run_sync()

    if not exit:
        return exit_code

    sys.exit(exit_code)
コード例 #11
0
ファイル: app_test.py プロジェクト: imshashank/data-mining
    def test_big_payload(self):
        """Download ``/big_payload`` and compare it with a SHA-1 digest chain.

        The expected body is 10001 concatenated digests: each digest is fed
        back into the hash before the next one is taken.
        """
        hasher = hashlib.sha1(b'foxfoxfox')
        chunks = []

        for _unused in range(10000):
            digest = hasher.digest()
            chunks.append(digest)
            hasher.update(digest)

        chunks.append(hasher.digest())
        expected_payload = b''.join(chunks)

        parser = AppArgumentParser()
        builder = Builder(parser.parse_args([self.get_url('/big_payload')]))

        with cd_tempdir():
            crawler = builder.build()
            exit_code = yield crawler()
            self.assertTrue(os.path.exists('big_payload'))

            with open('big_payload', 'rb') as downloaded:
                actual_payload = downloaded.read()
                self.assertEqual(expected_payload, actual_payload)

        self.assertEqual(0, exit_code)
        self.assertEqual(1, builder.factory['Statistics'].files)
コード例 #12
0
ファイル: app_test.py プロジェクト: lowks/wpull
    def test_cookie(self):
        """Load a cookie file, fetch ``/cookie`` and inspect the saved jar.

        The loaded ``test`` cookie starts with the value ``no``.  After the
        run the jar is expected to hold exactly one cookie, ``test=yes``,
        and that pair must also appear in the saved cookie file.
        """
        parser = AppArgumentParser()

        with tempfile.NamedTemporaryFile() as cookie_file:
            cookie_file.writelines([
                b'# Kittens\n',
                b'localhost.local',
                b'\tFALSE\t/\tFALSE\t\ttest\tno\n',
            ])
            cookie_file.flush()

            argv = [
                self.get_url('/cookie'),
                '--load-cookies', cookie_file.name,
                '--tries', '1',
                '--save-cookies', 'wpull_test_cookies.txt',
                '--keep-session-cookies',
            ]
            builder = Builder(parser.parse_args(argv))

            with cd_tempdir():
                crawler = builder.build()
                exit_code = yield crawler()

                self.assertEqual(0, exit_code)
                self.assertEqual(1, builder.factory['Statistics'].files)

                jar_cookies = list(builder.factory['CookieJar'])
                _logger.debug('{0}'.format(jar_cookies))
                self.assertEqual(1, len(jar_cookies))

                only_cookie = jar_cookies[0]
                self.assertEqual('test', only_cookie.name)
                self.assertEqual('yes', only_cookie.value)

                with open('wpull_test_cookies.txt', 'rb') as saved_file:
                    saved_bytes = saved_file.read()

                self.assertIn(b'test\tyes', saved_bytes)
コード例 #13
0
ファイル: app_test.py プロジェクト: lowks/wpull
    def test_big_payload(self):
        """Verify that ``/big_payload`` is saved byte-for-byte.

        The reference body is built locally from a chain of SHA-1 digests
        seeded with ``foxfoxfox`` (10000 chained digests plus a final one).
        """
        sha1 = hashlib.sha1(b'foxfoxfox')
        pieces = []

        rounds = 10000
        while rounds:
            piece = sha1.digest()
            sha1.update(piece)
            pieces.append(piece)
            rounds -= 1

        # Final digest taken after the last update.
        pieces.append(sha1.digest())
        expected_payload = b''.join(pieces)

        parser = AppArgumentParser()
        parsed = parser.parse_args([self.get_url('/big_payload')])
        builder = Builder(parsed)

        with cd_tempdir():
            runner = builder.build()
            exit_code = yield runner()
            self.assertTrue(os.path.exists('big_payload'))

            with open('big_payload', 'rb') as saved:
                self.assertEqual(expected_payload, saved.read())

        self.assertEqual(0, exit_code)
        self.assertEqual(1, builder.factory['Statistics'].files)
コード例 #14
0
ファイル: app_test.py プロジェクト: imshashank/data-mining
    def test_local_encoding(self):
        """Run with every argument and the input file encoded as UTF-32-LE.

        The run must exit with 0 and count exactly 2 files.
        """
        parser = AppArgumentParser()

        def to_local(text):
            # Encode text the same way the --local-encoding option declares.
            return text.encode('utf-32-le')

        with tempfile.NamedTemporaryFile() as listing:
            listing.write(to_local(self.get_url('/?qwerty')))
            listing.write(to_local('\n'))
            listing.flush()

            plain_opts = [
                self.get_url('/?asdf'),
                '--local-encoding', 'utf-32-le',
                '--input-file', listing.name
            ]
            parsed = parser.parse_args([to_local(opt) for opt in plain_opts])
            builder = Builder(parsed)

            with cd_tempdir():
                runner = builder.build()
                exit_code = yield runner()

        self.assertEqual(0, exit_code)
        self.assertEqual(2, builder.factory['Statistics'].files)
コード例 #15
0
ファイル: __main__.py プロジェクト: DanielOaks/wpull
def main():
    """Entry point: run the engine built from the command-line arguments.

    Installs signal handlers first: the first SIGINT requests a graceful
    stop and any further SIGINT, or a SIGTERM, forces the engine to stop.
    Finally exits the process with the engine's exit code.
    """
    parser = AppArgumentParser()
    parsed = parser.parse_args()
    loop = tornado.ioloop.IOLoop.current()
    engine = Builder(parsed).build()
    # Mutable container so the nested handler can record the first SIGINT.
    flags = {'graceful_called': False}

    def forceful_stop_handler(signal_number, stack_frame):
        _logger.info(_('Forcing immediate stop...'))
        engine.stop(force=True)

    def graceful_stop_handler(signal_number, stack_frame):
        already_requested = flags['graceful_called']

        if already_requested:
            forceful_stop_handler(signal_number, stack_frame)
            return

        flags['graceful_called'] = True

        for message in (
                _('Stopping once all requests complete...'),
                _('Interrupt again to force stopping immediately.')):
            _logger.info(message)

        engine.stop()

    handlers = (
        (signal.SIGINT, graceful_stop_handler),
        (signal.SIGTERM, forceful_stop_handler),
    )
    for signum, handler in handlers:
        signal.signal(signum, handler)

    exit_code = loop.run_sync(engine)
    sys.exit(exit_code)
コード例 #16
0
ファイル: writer_test.py プロジェクト: DanielOaks/wpull
    def test_timestamping_hit_orig(self):
        """Run with ``--timestamping`` against a pre-existing local file.

        The local ``lastmod`` file holds ``HI`` with its mtime set to
        631152000; after the run it must still hold ``HI``.
        """
        parser = AppArgumentParser()
        argv = [self.get_url('/lastmod'), '--timestamping']
        parsed = parser.parse_args(argv)

        with cd_tempdir() as temp_dir:
            filename = os.path.join(temp_dir, 'lastmod')
            # NOTE(review): this is the same path as ``filename``; the test
            # name suggests a separate ".orig" file was intended -- confirm.
            filename_orig = os.path.join(temp_dir, 'lastmod')

            for path in (filename, filename_orig):
                with open(path, 'wb') as out_file:
                    out_file.write(b'HI')

            old_timestamp = 631152000
            os.utime(filename_orig, (old_timestamp, old_timestamp))

            builder = Builder(parsed)
            crawler = builder.build()
            exit_code = yield crawler()

            self.assertEqual(0, exit_code)

            for path in (filename, filename_orig):
                with open(path, 'rb') as in_file:
                    self.assertEqual(b'HI', in_file.read())
コード例 #17
0
ファイル: writer_test.py プロジェクト: DanielOaks/wpull
    def test_new_file_and_clobber(self):
        """Download the same URL twice into one directory.

        The first run must create ``my_file.txt`` containing ``END``; the
        second run must create ``my_file.txt.1``.
        """
        parser = AppArgumentParser()
        parsed = parser.parse_args([self.get_url('/static/my_file.txt')])

        with cd_tempdir() as temp_dir:
            # First run: the file does not exist yet.
            first_run = Builder(parsed).build()
            exit_code = yield first_run()
            self.assertEqual(0, exit_code)

            first_path = os.path.join(temp_dir, 'my_file.txt')
            self.assertTrue(os.path.exists(first_path))

            with open(first_path, 'rb') as in_file:
                self.assertIn(b'END', in_file.read())

            # Second run: the name is already taken.
            second_run = Builder(parsed).build()
            exit_code = yield second_run()
            self.assertEqual(0, exit_code)

            second_path = os.path.join(temp_dir, 'my_file.txt.1')
            self.assertTrue(os.path.exists(second_path))
コード例 #18
0
ファイル: app_test.py プロジェクト: imshashank/data-mining
    def test_cookie(self):
        """Round-trip cookies through ``--load-cookies``/``--save-cookies``.

        A ``test=no`` cookie is loaded from a temporary file.  After fetching
        ``/cookie`` the jar must contain a single ``test=yes`` cookie and
        the saved file must contain ``test<TAB>yes``.
        """
        parser = AppArgumentParser()

        with tempfile.NamedTemporaryFile() as cookie_source:
            cookie_lines = (
                b'# Netscape HTTP Cookie File\n',
                b'localhost.local',
                b'\tFALSE\t/\tFALSE\t\ttest\tno\n',
            )
            for raw in cookie_lines:
                cookie_source.write(raw)
            cookie_source.flush()

            parsed = parser.parse_args([
                self.get_url('/cookie'),
                '--load-cookies', cookie_source.name,
                '--tries', '1',
                '--save-cookies', 'wpull_test_cookies.txt',
                '--keep-session-cookies',
            ])
            builder = Builder(parsed)

            with cd_tempdir():
                runner = builder.build()
                exit_code = yield runner()

                self.assertEqual(0, exit_code)
                self.assertEqual(1, builder.factory['Statistics'].files)

                cookies = list(builder.factory['CookieJar'])
                _logger.debug('{0}'.format(cookies))
                self.assertEqual(1, len(cookies))

                cookie = cookies[0]
                self.assertEqual('test', cookie.name)
                self.assertEqual('yes', cookie.value)

                with open('wpull_test_cookies.txt', 'rb') as saved_file:
                    self.assertIn(b'test\tyes', saved_file.read())
コード例 #19
0
    def test_new_file_and_clobber(self):
        """Fetch one URL twice and check the second copy gets a ``.1`` name.

        Both runs must exit with 0.  The first file must contain ``END``.
        """
        file_url = self.get_url('/static/my_file.txt')
        parsed = AppArgumentParser().parse_args([file_url])

        with cd_tempdir() as temp_dir:
            original_path = os.path.join(temp_dir, 'my_file.txt')
            numbered_path = os.path.join(temp_dir, 'my_file.txt.1')

            runner = Builder(parsed).build()
            exit_code = yield runner()

            self.assertEqual(0, exit_code)
            self.assertTrue(os.path.exists(original_path))

            with open(original_path, 'rb') as downloaded:
                body = downloaded.read()

            self.assertIn(b'END', body)

            # Same arguments again, now that my_file.txt already exists.
            runner = Builder(parsed).build()
            exit_code = yield runner()

            self.assertEqual(0, exit_code)
            self.assertTrue(os.path.exists(numbered_path))
コード例 #20
0
    def test_timestamping_hit_orig(self):
        """Check ``--timestamping`` leaves an existing local file untouched.

        ``lastmod`` is pre-created with ``HI`` and an mtime of 631152000; it
        must still read ``HI`` after a run that exits with 0.
        """
        parser = AppArgumentParser()
        parsed = parser.parse_args([
            self.get_url('/lastmod'),
            '--timestamping',
        ])

        with cd_tempdir() as temp_dir:
            filename = os.path.join(temp_dir, 'lastmod')
            # NOTE(review): identical to ``filename``; possibly meant to be
            # a distinct ".orig" path -- verify before changing.
            filename_orig = os.path.join(temp_dir, 'lastmod')

            with open(filename, 'wb') as target:
                target.write(b'HI')

            with open(filename_orig, 'wb') as target:
                target.write(b'HI')

            mtime = atime = 631152000
            os.utime(filename_orig, (atime, mtime))

            builder = Builder(parsed)
            runner = builder.build()
            exit_code = yield runner()

            self.assertEqual(0, exit_code)

            with open(filename, 'rb') as source:
                current = source.read()
            self.assertEqual(b'HI', current)

            with open(filename_orig, 'rb') as source:
                current_orig = source.read()
            self.assertEqual(b'HI', current_orig)
コード例 #21
0
ファイル: app_test.py プロジェクト: DanielOaks/wpull
 def test_redirect_diff_host_recursive(self):
     """Recursive run on ``/redirect?where=diff-host``: code 0, no files."""
     parser = AppArgumentParser()
     argv = [
         self.get_url('/redirect?where=diff-host'),
         '--recursive',
     ]
     builder = Builder(parser.parse_args(argv))

     with cd_tempdir():
         crawler = builder.build()
         exit_code = yield crawler()

     self.assertEqual(0, exit_code)
     self.assertEqual(0, builder.factory['Statistics'].files)
コード例 #22
0
ファイル: app_test.py プロジェクト: imshashank/data-mining
 def test_app_args_post_data(self):
     """Send ``text=hi`` via ``--post-data`` and expect exit code 0."""
     parser = AppArgumentParser()
     argv = [self.get_url('/post/'), '--post-data', 'text=hi']
     parsed = parser.parse_args(argv)

     with cd_tempdir():
         crawler = Builder(parsed).build()
         exit_code = yield crawler()

     self.assertEqual(0, exit_code)
コード例 #23
0
ファイル: app_test.py プロジェクト: lowks/wpull
 def test_app_args_post_data(self):
     """Run against ``/post/`` with ``--post-data text=hi``; expect code 0."""
     post_url = self.get_url('/post/')
     parsed = AppArgumentParser().parse_args([
         post_url,
         '--post-data', 'text=hi',
     ])

     with cd_tempdir():
         runner = Builder(parsed).build()
         exit_code = yield runner()

     self.assertEqual(0, exit_code)
コード例 #24
0
ファイル: app_test.py プロジェクト: DanielOaks/wpull
 def test_redirect_diff_host_recursive(self):
     """Follow a different-host redirect recursively.

     The run must exit with 0 and the statistics must count no files.
     """
     redirect_url = self.get_url('/redirect?where=diff-host')
     parsed = AppArgumentParser().parse_args([redirect_url, '--recursive'])
     builder = Builder(parsed)

     with cd_tempdir():
         runner = builder.build()
         exit_code = yield runner()

     self.assertEqual(0, exit_code)
     self.assertEqual(0, builder.factory['Statistics'].files)
コード例 #25
0
ファイル: app_test.py プロジェクト: imshashank/data-mining
    def test_app_args(self):
        """Exercise a broad mix of options, given as both text and bytes.

        Expects ``index.html`` and ``index.html.orig`` under the
        ``http/localhost+<port>`` directory, exit code 0 and exactly 2 files
        counted.
        """
        parser = AppArgumentParser()
        argv = [
            '/',
            '--base', self.get_url('/').encode('utf-8'),
            '--no-parent',
            '--recursive',
            '--page-requisites',
            '--database', b'test.db',
            '--server-response',
            '--random-wait',
            b'--wait', b'0.1',
            '--protocol-directories',
            '--referer', 'http://test.test',
            '--accept-regex', r'.*',
            '--header', 'Hello: world!',
            '--exclude-domains', 'asdf.invalid',
            '--exclude-hostnames', 'qwerty.invalid,uiop.invalid',
            '--no-clobber',
            '--rotate-dns',
            '-4',
            '--concurrent', '2',
            '--no-check-certificate',
            '--ascii-print',
            '--progress', 'dot',
            '--secure-protocol', 'TLSv1',
            '--convert-links', '--backup-converted',
            '--accept', '*',
            '--no-strong-robots',
            '--restrict-file-names', 'windows,lower',
            '--quota', '10m',
            '--max-filename-length', '100',
            '--user-agent', 'ΑΒΓαβγ',
            '--remote-encoding', 'latin1',
            '--http-compression',
            '--bind-address', '127.0.0.1',
        ]
        parsed = parser.parse_args(argv)

        with cd_tempdir():
            builder = Builder(parsed)
            crawler = builder.build()
            exit_code = yield crawler()

            # Dump the directory tree to help diagnose path failures.
            print(list(os.walk('.')))

            port = self.get_http_port()
            index_path = 'http/localhost+{0}/index.html'.format(port)
            backup_path = 'http/localhost+{0}/index.html.orig'.format(port)

            self.assertTrue(os.path.exists(index_path))
            self.assertTrue(os.path.exists(backup_path))

        self.assertEqual(0, exit_code)
        self.assertEqual(builder.factory['Statistics'].files, 2)
コード例 #26
0
ファイル: app_test.py プロジェクト: imshashank/data-mining
 def test_app_python_script_stop(self):
     """Run with the ``py_hook_script_stop.py`` hook; expect exit code 1."""
     parser = AppArgumentParser()
     here = os.path.dirname(__file__)
     script_path = os.path.join(here, 'testing', 'py_hook_script_stop.py')
     argv = [self.get_url('/'), '--python-script', script_path]
     parsed = parser.parse_args(argv)

     with cd_tempdir():
         crawler = Builder(parsed).build()
         exit_code = yield crawler()

     self.assertEqual(1, exit_code)
コード例 #27
0
ファイル: app_test.py プロジェクト: lowks/wpull
 def test_app_python_script_stop(self):
     """Load the stop hook script via ``--python-script``.

     The run is expected to finish with exit code 1.
     """
     parser = AppArgumentParser()
     hook_parts = (
         os.path.dirname(__file__),
         'testing',
         'py_hook_script_stop.py',
     )
     hook_script = os.path.join(*hook_parts)
     parsed = parser.parse_args([
         self.get_url('/'),
         '--python-script', hook_script,
     ])

     with cd_tempdir():
         runner = Builder(parsed).build()
         exit_code = yield runner()

     self.assertEqual(1, exit_code)
コード例 #28
0
ファイル: app_test.py プロジェクト: nwpu063291/wpull
    def test_app_phantomjs(self):
        """Run PhantomJS with an uncompressed WARC, a header and a hook script.

        Checks that the WARC and both snapshot files exist, that the HTML
        snapshot contains ``Hello world!``, and that the WARC contains the
        snapshot URN, the expected content types, the scroll action and the
        custom ``Accept-Language`` header.
        """
        parser = AppArgumentParser()
        script_filename = os.path.join(
            os.path.dirname(__file__), 'testing', 'boring_script.py')
        argv = [
            self.get_url('/static/simple_javascript.html'),
            '--warc-file', 'test',
            '--no-warc-compression',
            '-4',
            '--no-robots',
            '--phantomjs',
            '--phantomjs-exe', 'phantomjs',
            '--phantomjs-wait', '0.1',
            '--phantomjs-scroll', '2',
            '--header', 'accept-language: dragon',
            '--python-script', script_filename,
        ]
        builder = Builder(parser.parse_args(argv))

        with cd_tempdir():
            application = builder.build()
            exit_code = yield application.run()

            html_snapshot = 'simple_javascript.html.snapshot.html'
            pdf_snapshot = 'simple_javascript.html.snapshot.pdf'

            for produced in ('test.warc', html_snapshot, pdf_snapshot):
                self.assertTrue(os.path.exists(produced))

            with open(html_snapshot, 'rb') as snapshot_file:
                self.assertIn(b'Hello world!', snapshot_file.read())

            with open('test.warc', 'rb') as warc_file:
                warc_bytes = warc_file.read()

            required_fragments = (
                b'urn:X-wpull:snapshot?url=',
                b'text/html',
                b'application/pdf',
                b'application/json',
                b'"set_scroll_top"',
            )
            for fragment in required_fragments:
                self.assertIn(fragment, warc_bytes)

            try:
                self.assertIn(b'Accept-Encoding: identity', warc_bytes)
            except AssertionError:
                # webkit treats localhost differently
                self.assertNotIn(b'Accept-Encoding: gzip', warc_bytes)

            self.assertIn(b'Accept-Language: dragon', warc_bytes)

        self.assertEqual(0, exit_code)
        self.assertGreaterEqual(builder.factory['Statistics'].files, 1)
コード例 #29
0
ファイル: app_test.py プロジェクト: DanielOaks/wpull
    def test_app_input_file_arg(self):
        """Supply URLs only through ``--input-file`` and expect exit code 0.

        The temporary file lists ``/`` and ``/blog/``, one URL per line.
        """
        parser = AppArgumentParser(real_exit=False)

        with tempfile.NamedTemporaryFile() as url_file:
            url_lines = [
                self.get_url('/').encode('utf-8'),
                b'\n',
                self.get_url('/blog/').encode('utf-8'),
            ]
            for line_part in url_lines:
                url_file.write(line_part)
            url_file.flush()

            parsed = parser.parse_args(['--input-file', url_file.name])

            with cd_tempdir():
                crawler = Builder(parsed).build()
                exit_code = yield crawler()

        self.assertEqual(0, exit_code)
コード例 #30
0
ファイル: app_test.py プロジェクト: DanielOaks/wpull
    def test_redirect_diff_host(self):
        """Non-recursive run on a different-host redirect.

        Currently expects exit code 4 and no files counted.
        """
        parser = AppArgumentParser()
        argv = [
            self.get_url('/redirect?where=diff-host'),
            '--waitretry', '0',
        ]
        builder = Builder(parser.parse_args(argv))

        with cd_tempdir():
            crawler = builder.build()
            exit_code = yield crawler()

        # FIXME: for now, we'll assume the DNS failed to resolve because
        # it tried to span hosts
        self.assertEqual(4, exit_code)
        self.assertEqual(0, builder.factory['Statistics'].files)
コード例 #31
0
ファイル: app_test.py プロジェクト: DanielOaks/wpull
    def test_immediate_robots_forbidden(self):
        """Recursively fetch ``/forbidden``; expect code 0 and no files."""
        parser = AppArgumentParser()
        argv = [self.get_url('/forbidden'), '--recursive']
        builder = Builder(parser.parse_args(argv))

        with cd_tempdir():
            crawler = builder.build()
            exit_code = yield crawler()

        self.assertEqual(0, exit_code)
        self.assertEqual(0, builder.factory['Statistics'].files)
コード例 #32
0
ファイル: app_test.py プロジェクト: imshashank/data-mining
    def test_app_phantomjs(self):
        """Snapshot a JavaScript page with PhantomJS into a plain WARC.

        Verifies the output files exist, the HTML snapshot contains
        ``Hello world!`` and the WARC holds the snapshot records, the
        scroll action and the ``Accept-Language: dragon`` header.
        """
        parser = AppArgumentParser()
        testing_dir = os.path.join(os.path.dirname(__file__), 'testing')
        script_filename = os.path.join(testing_dir, 'boring_script.py')
        parsed = parser.parse_args([
            self.get_url('/static/simple_javascript.html'),
            '--warc-file', 'test',
            '--no-warc-compression',
            '-4',
            '--no-robots',
            '--phantomjs',
            '--phantomjs-wait', '0.1',
            '--phantomjs-scroll', '2',
            '--header', 'accept-language: dragon',
            '--python-script', script_filename,
        ])
        builder = Builder(parsed)

        with cd_tempdir():
            runner = builder.build()
            exit_code = yield runner()

            expected_outputs = [
                'test.warc',
                'simple_javascript.html.snapshot.html',
                'simple_javascript.html.snapshot.pdf',
            ]
            for output_name in expected_outputs:
                self.assertTrue(os.path.exists(output_name))

            with open('simple_javascript.html.snapshot.html', 'rb') as snap:
                snapshot_bytes = snap.read()
            self.assertIn(b'Hello world!', snapshot_bytes)

            with open('test.warc', 'rb') as warc_file:
                archive = warc_file.read()

            self.assertIn(b'urn:X-wpull:snapshot?url=', archive)
            self.assertIn(b'text/html', archive)
            self.assertIn(b'application/pdf', archive)
            self.assertIn(b'application/json', archive)
            self.assertIn(b'"set_scroll_top"', archive)

            try:
                self.assertIn(b'Accept-Encoding: identity', archive)
            except AssertionError:
                # webkit treats localhost differently
                self.assertNotIn(b'Accept-Encoding: gzip', archive)

            self.assertIn(b'Accept-Language: dragon', archive)

        self.assertEqual(0, exit_code)
        self.assertGreaterEqual(builder.factory['Statistics'].files, 1)
コード例 #33
0
ファイル: app_test.py プロジェクト: lowks/wpull
    def test_immediate_robots_forbidden(self):
        """Run recursively against ``/forbidden``.

        The run must exit with 0 and the statistics must count no files.
        """
        forbidden_url = self.get_url('/forbidden')
        parsed = AppArgumentParser().parse_args([forbidden_url, '--recursive'])
        builder = Builder(parsed)

        with cd_tempdir():
            runner = builder.build()
            exit_code = yield runner()

        self.assertEqual(0, exit_code)
        self.assertEqual(0, builder.factory['Statistics'].files)
コード例 #34
0
ファイル: app_test.py プロジェクト: DanielOaks/wpull
    def test_app_args(self):
        """Run with many options at once, some passed as bytes.

        Expects ``http/localhost/index.html`` and its ``.orig`` backup to
        exist after the run and the exit code to be 0.
        """
        parser = AppArgumentParser()
        argv = [
            self.get_url('/').encode('utf-8'),
            '--no-parent',
            '--recursive',
            '--page-requisites',
            '--database', b'test.db',
            '--server-response',
            '--random-wait',
            b'--wait', b'0.1',
            '--protocol-directories',
            '--referer', 'http://test.test',
            '--accept-regex', r'.*',
            '--header', 'Hello: world!',
            '--exclude-domains', 'asdf.invalid',
            '--exclude-hostnames', 'qwerty.invalid,uiop.invalid',
            '--no-clobber',
            '--rotate-dns',
            '-4',
            '--concurrent', '2',
            '--no-check-certificate',
            '--ascii-print',
            '--progress', 'dot',
            '--secure-protocol', 'TLSv1',
            '--convert-links',
            '--backup-converted',
            '--accept', '*',
        ]
        parsed = parser.parse_args(argv)

        with cd_tempdir():
            crawler = Builder(parsed).build()
            exit_code = yield crawler()

            # Dump the directory tree to help diagnose path failures.
            print(list(os.walk('.')))

            for expected_path in ('http/localhost/index.html',
                                  'http/localhost/index.html.orig'):
                self.assertTrue(os.path.exists(expected_path))

        self.assertEqual(0, exit_code)
コード例 #35
0
ファイル: __main__.py プロジェクト: mback2k/wpull
def main(exit=True):
    """Parse the command line, build the application and run it.

    When ``exit`` is true the process is terminated through ``sys.exit``
    with the application's exit code; otherwise that exit code is returned
    to the caller.
    """
    builder = Builder(AppArgumentParser().parse_args())
    builder.build()

    app = builder.factory['Application']
    app.setup_signal_handlers()
    status = app.run_sync()

    if not exit:
        return status

    sys.exit(status)
コード例 #36
0
ファイル: app_test.py プロジェクト: imshashank/data-mining
    def test_quota(self):
        # Recursive crawl of /blog/ with '--quota 1'.
        # Expected outcome: exit code 0 and exactly one file in Statistics.
        parser = AppArgumentParser()
        options = parser.parse_args(
            [self.get_url('/blog/'), '--recursive', '--quota', '1'])

        with cd_tempdir():
            builder = Builder(options)
            crawler = builder.build()
            status = yield crawler()

        self.assertEqual(0, status)
        statistics = builder.factory['Statistics']
        self.assertEqual(1, statistics.files)
コード例 #37
0
ファイル: app_test.py プロジェクト: imshashank/data-mining
    def test_non_http_redirect(self):
        # Recursive crawl of /non_http_redirect with robots checks disabled.
        # Expected outcome: exit code 0 and zero files in Statistics.
        parser = AppArgumentParser()
        argv = [
            self.get_url('/non_http_redirect'),
            '--recursive',
            '--no-robots',
        ]
        builder = Builder(parser.parse_args(argv))

        with cd_tempdir():
            crawler = builder.build()
            status = yield crawler()

        self.assertEqual(0, status)
        statistics = builder.factory['Statistics']
        self.assertEqual(0, statistics.files)
コード例 #38
0
ファイル: app_test.py プロジェクト: DanielOaks/wpull
    def test_redirect_diff_host(self):
        # Fetch a URL that redirects to a different host, with
        # '--waitretry 0'.
        parser = AppArgumentParser()
        argv = [
            self.get_url('/redirect?where=diff-host'),
            '--waitretry', '0',
        ]
        builder = Builder(parser.parse_args(argv))

        with cd_tempdir():
            crawler = builder.build()
            status = yield crawler()

        # FIXME: for now, we'll assume the DNS failed to resolve because
        # it tried to span hosts
        self.assertEqual(4, status)
        statistics = builder.factory['Statistics']
        self.assertEqual(0, statistics.files)
コード例 #39
0
ファイル: app_test.py プロジェクト: DanielOaks/wpull
 def test_app_lua_script(self):
     # Run with the Lua hook script found in the 'testing' directory next
     # to this file; the run is expected to finish with exit code 42.
     parser = AppArgumentParser()
     script_path = os.path.join(
         os.path.dirname(__file__), 'testing', 'lua_hook_script.lua')
     argv = [
         self.get_url('/'),
         'localhost:1',
         '--lua-script', script_path,
         '--page-requisites',
         '--reject-regex', '/post/',
     ]
     options = parser.parse_args(argv)

     with cd_tempdir():
         crawler = Builder(options).build()
         status = yield crawler()

     self.assertEqual(42, status)
コード例 #40
0
ファイル: app_test.py プロジェクト: DanielOaks/wpull
    def test_app_input_file_arg(self):
        # Feed the URLs through '--input-file' instead of positional
        # arguments: two URLs are written to a temporary file, separated by
        # a newline, and the run is expected to exit with 0.
        parser = AppArgumentParser(real_exit=False)

        with tempfile.NamedTemporaryFile() as url_file:
            url_file.write(self.get_url('/').encode('utf-8'))
            url_file.write(b'\n')
            url_file.write(self.get_url('/blog/').encode('utf-8'))
            # Flush so the content is on disk before the file is re-read
            # by name.
            url_file.flush()

            options = parser.parse_args(['--input-file', url_file.name])

            with cd_tempdir():
                crawler = Builder(options).build()
                status = yield crawler()

        self.assertEqual(0, status)
コード例 #41
0
ファイル: app_test.py プロジェクト: nwpu063291/wpull
    def test_no_iri(self):
        # Fetch the root page with '--no-iri' and '--no-robots'.
        # Expected outcome: exit code 0 and exactly one file in Statistics.
        parser = AppArgumentParser()
        argv = [self.get_url('/'), '--no-iri', '--no-robots']
        builder = Builder(parser.parse_args(argv))

        with cd_tempdir():
            application = builder.build()
            status = yield application.run()

        self.assertEqual(0, status)
        statistics = builder.factory['Statistics']
        self.assertEqual(1, statistics.files)
コード例 #42
0
ファイル: app_test.py プロジェクト: DanielOaks/wpull
    def test_bad_cookie(self):
        # Fetch /bad_cookie: the run should exit with 0, count one file and
        # leave exactly two cookies in the cookie jar.
        parser = AppArgumentParser()
        builder = Builder(parser.parse_args([self.get_url('/bad_cookie')]))

        with cd_tempdir():
            crawler = builder.build()
            status = yield crawler()

        self.assertEqual(0, status)
        self.assertEqual(1, builder.factory['Statistics'].files)

        stored_cookies = list(builder.factory['CookieJar'])
        _logger.debug('{0}'.format(stored_cookies))
        self.assertEqual(2, len(stored_cookies))
コード例 #43
0
ファイル: app_test.py プロジェクト: lowks/wpull
    def test_non_http_redirect(self):
        # Recursive crawl of /non_http_redirect with '--no-robots'.
        # Expected outcome: exit code 0 and zero files in Statistics.
        parser = AppArgumentParser()
        options = parser.parse_args([
            self.get_url('/non_http_redirect'),
            '--recursive',
            '--no-robots',
        ])
        builder = Builder(options)

        with cd_tempdir():
            status = yield builder.build()()

        self.assertEqual(0, status)
        self.assertEqual(0, builder.factory['Statistics'].files)
コード例 #44
0
ファイル: app_test.py プロジェクト: lowks/wpull
    def test_long_cookie(self):
        # Fetch /long_cookie: the run should exit with 0, count one file and
        # leave the cookie jar empty.
        parser = AppArgumentParser()
        builder = Builder(parser.parse_args([self.get_url('/long_cookie')]))

        with cd_tempdir():
            crawler = builder.build()
            status = yield crawler()

        self.assertEqual(0, status)
        self.assertEqual(1, builder.factory['Statistics'].files)

        stored_cookies = list(builder.factory['CookieJar'])
        _logger.debug('{0}'.format(stored_cookies))
        self.assertEqual(0, len(stored_cookies))
コード例 #45
0
ファイル: app_test.py プロジェクト: imshashank/data-mining
    def test_ignore_length(self):
        # Fetch /underrun with '--ignore-length' and '--no-robots'.
        # Expected outcome: exit code 0 and exactly one file in Statistics.
        parser = AppArgumentParser()
        argv = [self.get_url('/underrun'), '--ignore-length', '--no-robots']
        builder = Builder(parser.parse_args(argv))

        with cd_tempdir():
            crawler = builder.build()
            status = yield crawler()

        self.assertEqual(0, status)
        statistics = builder.factory['Statistics']
        self.assertEqual(1, statistics.files)
コード例 #46
0
ファイル: app_test.py プロジェクト: lowks/wpull
    def test_ignore_length(self):
        # With '--ignore-length' set, fetching /underrun should still finish
        # with exit code 0 and one file counted.
        parser = AppArgumentParser()
        options = parser.parse_args([
            self.get_url('/underrun'),
            '--ignore-length',
            '--no-robots',
        ])
        builder = Builder(options)

        with cd_tempdir():
            status = yield builder.build()()

        self.assertEqual(0, status)
        self.assertEqual(1, builder.factory['Statistics'].files)
コード例 #47
0
ファイル: app_test.py プロジェクト: lowks/wpull
    def test_quota(self):
        # With '--quota 1', a recursive crawl of /blog/ should finish with
        # exit code 0 and exactly one file counted.
        parser = AppArgumentParser()
        argv = [self.get_url('/blog/'), '--recursive', '--quota', '1']
        options = parser.parse_args(argv)

        with cd_tempdir():
            builder = Builder(options)
            status = yield builder.build()()

        self.assertEqual(0, status)
        self.assertEqual(1, builder.factory['Statistics'].files)
コード例 #48
0
ファイル: app_test.py プロジェクト: lowks/wpull
    def test_output_document(self):
        # Fetch the root page with '--output-document blah.dat' and check
        # that 'blah.dat' exists in the temporary working directory.
        parser = AppArgumentParser()

        with cd_tempdir():
            argv = [self.get_url('/'), '--output-document', 'blah.dat']
            builder = Builder(parser.parse_args(argv))
            crawler = builder.build()
            status = yield crawler()

            # Checked inside the 'with' block, where the relative path is
            # resolved.
            self.assertTrue(os.path.exists('blah.dat'))

        self.assertEqual(0, status)
コード例 #49
0
ファイル: app_test.py プロジェクト: imshashank/data-mining
 def test_app_args_warc_with_cdx(self):
     # Run with '--warc-file test' plus '--warc-cdx'.
     # Expected outcome: exit code 0 and at least one file in Statistics.
     parser = AppArgumentParser()
     argv = [
         self.get_url('/'),
         '--no-parent',
         '--warc-file', 'test',
         '-4',
         '--no-robots',
         '--warc-cdx',
     ]
     builder = Builder(parser.parse_args(argv))

     with cd_tempdir():
         crawler = builder.build()
         status = yield crawler()

     self.assertEqual(0, status)
     statistics = builder.factory['Statistics']
     self.assertGreaterEqual(statistics.files, 1)
コード例 #50
0
ファイル: app_test.py プロジェクト: imshashank/data-mining
    def test_output_document(self):
        # '--output-document blah.dat' should leave a 'blah.dat' file in the
        # temporary working directory, and the run should exit with 0.
        parser = AppArgumentParser()

        with cd_tempdir():
            options = parser.parse_args(
                [self.get_url('/'), '--output-document', 'blah.dat'])

            builder = Builder(options)
            status = yield builder.build()()

            output_exists = os.path.exists('blah.dat')
            self.assertTrue(output_exists)

        self.assertEqual(0, status)
コード例 #51
0
ファイル: app_test.py プロジェクト: DanielOaks/wpull
 def test_many_page_with_some_fail(self):
     # Recursive crawl of /blog/ with page requisites.
     # Expected outcome: ExitStatus.server_error, more than one file
     # counted, and a recorded duration greater than 3.
     parser = AppArgumentParser()
     argv = [
         self.get_url('/blog/'),
         '--no-parent',
         '--recursive',
         '--page-requisites',
         '-4',
     ]
     builder = Builder(parser.parse_args(argv))

     with cd_tempdir():
         crawler = builder.build()
         status = yield crawler()

     self.assertEqual(ExitStatus.server_error, status)
     self.assertGreater(builder.factory['Statistics'].files, 1)
     self.assertGreater(builder.factory['Statistics'].duration, 3)
コード例 #52
0
ファイル: app_test.py プロジェクト: lowks/wpull
 def test_app_args_warc_with_cdx(self):
     # WARC output ('--warc-file test') combined with '--warc-cdx': the run
     # should exit with 0 and count at least one file.
     parser = AppArgumentParser()
     options = parser.parse_args([
         self.get_url('/'),
         '--no-parent',
         '--warc-file', 'test',
         '-4',
         '--no-robots',
         '--warc-cdx',
     ])
     builder = Builder(options)

     with cd_tempdir():
         status = yield builder.build()()

     self.assertEqual(0, status)
     self.assertGreaterEqual(builder.factory['Statistics'].files, 1)
コード例 #53
0
ファイル: app_test.py プロジェクト: lowks/wpull
 def test_many_page_with_some_fail(self):
     # A recursive /blog/ crawl with page requisites is expected to end
     # with ExitStatus.server_error while still counting more than one
     # file and recording a duration above 3.
     parser = AppArgumentParser()
     options = parser.parse_args([
         self.get_url('/blog/'),
         '--no-parent',
         '--recursive',
         '--page-requisites',
         '-4',
     ])
     builder = Builder(options)

     with cd_tempdir():
         status = yield builder.build()()

     self.assertEqual(ExitStatus.server_error, status)
     self.assertGreater(builder.factory['Statistics'].files, 1)
     self.assertGreater(builder.factory['Statistics'].duration, 3)
コード例 #54
0
ファイル: app_test.py プロジェクト: nwpu063291/wpull
    def test_bad_redirect(self):
        # Recursive crawl of /bad_redirect with '--no-robots' and
        # '--waitretry 0.1'.
        # Expected outcome: exit code 7 and zero files in Statistics.
        parser = AppArgumentParser()
        argv = [
            self.get_url('/bad_redirect'),
            '--recursive',
            '--no-robots',
            '--waitretry', '0.1',
        ]
        builder = Builder(parser.parse_args(argv))

        with cd_tempdir():
            application = builder.build()
            status = yield application.run()

        self.assertEqual(7, status)
        statistics = builder.factory['Statistics']
        self.assertEqual(0, statistics.files)
コード例 #55
0
ファイル: app_test.py プロジェクト: imshashank/data-mining
    def test_https_only(self):
        # Two URLs are given: '/?1' as returned by get_url, and '/?2' with
        # its 'https://' prefix rewritten to 'http://'. With '--https-only'
        # set, exactly one file is expected and the exit code should be 0.
        parser = AppArgumentParser()
        first_url = self.get_url('/?1')
        second_url = self.get_url('/?2').replace('https://', 'http://')
        argv = [
            first_url,
            second_url,
            '--https-only',
            '--no-robots',
            '--no-check-certificate',
        ]
        builder = Builder(parser.parse_args(argv))

        with cd_tempdir():
            crawler = builder.build()
            status = yield crawler()

        self.assertEqual(0, status)
        statistics = builder.factory['Statistics']
        self.assertEqual(1, statistics.files)
コード例 #56
0
ファイル: app_test.py プロジェクト: DanielOaks/wpull
 def test_app_lua_script(self):
     # The Lua hook script lives in the 'testing' directory beside this
     # file; running with it is expected to produce exit code 42.
     parser = AppArgumentParser()
     here = os.path.dirname(__file__)
     script_path = os.path.join(here, 'testing', 'lua_hook_script.lua')
     options = parser.parse_args([
         self.get_url('/'),
         'localhost:1',
         '--lua-script', script_path,
         '--page-requisites',
         '--reject-regex', '/post/',
     ])

     with cd_tempdir():
         status = yield Builder(options).build()()

     self.assertEqual(42, status)
コード例 #57
0
ファイル: app_test.py プロジェクト: DanielOaks/wpull
    def test_immediate_robots_fail(self):
        # Before the crawl starts, load a robots.txt body containing
        # 'Disallow: *' for the root URL into the robots.txt pool.
        # Expected outcome: exit code 0 and zero files in Statistics.
        parser = AppArgumentParser()
        argv = [self.get_url('/'), '--recursive']
        builder = Builder(parser.parse_args(argv))

        with cd_tempdir():
            crawler = builder.build()
            pool = builder.factory['RobotsTxtPool']
            root_url_info = URLInfo.parse(self.get_url('/'))
            pool.load_robots_txt(root_url_info,
                                 'User-Agent: *\nDisallow: *\n')
            status = yield crawler()

        self.assertEqual(0, status)
        statistics = builder.factory['Statistics']
        self.assertEqual(0, statistics.files)
コード例 #58
0
ファイル: app_test.py プロジェクト: DanielOaks/wpull
    def test_one_page(self):
        # Fetch only the root page. The run should write 'index.html',
        # exit with 0, count one file, and leave a single cookie named
        # 'hi' with value 'hello' in the cookie jar.
        parser = AppArgumentParser()
        builder = Builder(parser.parse_args([self.get_url('/')]))

        with cd_tempdir():
            crawler = builder.build()
            status = yield crawler()
            # Relative path: must be checked while still inside the
            # temporary directory.
            self.assertTrue(os.path.exists('index.html'))

        self.assertEqual(0, status)
        self.assertEqual(1, builder.factory['Statistics'].files)

        stored_cookies = list(builder.factory['CookieJar'])
        _logger.debug('{0}'.format(stored_cookies))
        self.assertEqual(1, len(stored_cookies))

        only_cookie = stored_cookies[0]
        self.assertEqual('hi', only_cookie.name)
        self.assertEqual('hello', only_cookie.value)