Example #1
    def test_spurious_item_events(self):
        class StupidTask(SimpleTask):
            def __init__(self):
                SimpleTask.__init__(self, "StupidTask")

            def process(self, item):
                item.log_output('Failing the item.')
                self.fail_item(item)
                item.log_output('Completing the item.')
                self.complete_item(item)
                item.log_output('Failing the item.')
                self.fail_item(item)

        pipeline = Pipeline(StupidTask())
        pipeline.fail_count_test = 0

        def fail_callback(task, item):
            pipeline.fail_count_test += 1

        pipeline.on_fail_item += fail_callback

        runner = SimpleRunner(pipeline, max_items=1)
        runner.start()

        self.assertEqual(1, pipeline.fail_count_test)
        self.assertIOLoopOK()
Example #3
    def test_no_such_file(self):
        external_process = ExternalProcessUser(
            "Fake", ["kitteh and doge.avi.exe"])
        pipeline = Pipeline(external_process)
        pipeline.has_failed = None

        def fail_callback(task, item):
            pipeline.has_failed = True

        pipeline.on_fail_item += fail_callback

        runner = SimpleRunner(pipeline, max_items=1)
        runner.start()
        self.assertTrue(pipeline.has_failed)
        self.assertIOLoopOK()
Example #5
    def test_proc(self):
        external_process = ExternalProcessUser(
            "Echo", ["python", "-c", "print('hello world!')"], max_tries=4)
        pipeline = Pipeline(external_process)
        pipeline.has_failed = None

        def fail_callback(task, item):
            pipeline.has_failed = True

        pipeline.on_fail_item += fail_callback

        runner = SimpleRunner(pipeline, max_items=1)
        runner.start()

        output = external_process.output_buffer.getvalue()
        self.assertFalse(pipeline.has_failed)
        self.assertTrue('hello world!' in output)
        self.assertIOLoopOK()
Example #7
    def test_proc_utf8(self):
        external_process = ExternalProcessUser(
            "Echo", ["python", "-c", "print(u'hello world!áßðfáßðf')"],
        )

        pipeline = Pipeline(external_process)
        pipeline.has_failed = None

        def fail_callback(task, item):
            pipeline.has_failed = True

        pipeline.on_fail_item += fail_callback

        runner = SimpleRunner(pipeline, max_items=1)
        runner.start()

        self.assertFalse(pipeline.has_failed)
        self.assertIOLoopOK()
Example #11
    def test_proc_stdin_error(self):
        external_process = ExternalProcessUser(
            "Echo", ["python", "-c" "print('hello world!')"], max_tries=4)

        external_process.stdin_data = lambda item: 123456

        pipeline = Pipeline(external_process)
        pipeline.has_failed = None

        def fail_callback(task, item):
            pipeline.has_failed = True

        pipeline.on_fail_item += fail_callback

        runner = SimpleRunner(pipeline, max_items=1)
        runner.start()

        self.assertTrue(pipeline.has_failed)
        self.assertIOLoopOK()
        self.assertEqual(4, external_process.exit_count)
Example #12
    def test_proc_fail(self):
        for max_tries in [1, 2, 20]:
            external_process = ExternalProcessUser(
                "Quitter", ["python", "-c", "import sys;sys.exit(33)"],
                max_tries=max_tries)
            pipeline = Pipeline(external_process)
            pipeline.has_failed = None

            def fail_callback(task, item):
                pipeline.has_failed = True

            pipeline.on_fail_item += fail_callback

            runner = SimpleRunner(pipeline, max_items=1)
            runner.start()

            self.assertTrue(pipeline.has_failed)
            self.assertEqual(33, external_process.return_code)
            self.assertEqual(max_tries, external_process.exit_count)
            self.assertIOLoopOK()
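
The ExternalProcessUser helper used throughout these tests is defined elsewhere in the test suite. Below is a minimal sketch of a plausible implementation, assuming it subclasses seesaw's ExternalProcess; the hook names are assumptions, while output_buffer, exit_count, and return_code are the attributes the assertions above inspect:

import io

from seesaw.externalprocess import ExternalProcess


class ExternalProcessUser(ExternalProcess):
    # Test double recording everything the assertions above inspect.
    def __init__(self, *args, **kwargs):
        ExternalProcess.__init__(self, *args, **kwargs)
        self.output_buffer = io.StringIO()  # captured process output
        self.exit_count = 0                 # number of process exits observed
        self.return_code = None             # last exit code seen

    def on_subprocess_stdout(self, pipe, item, data):
        # Assumption: ExternalProcess exposes this hook and passes bytes.
        self.output_buffer.write(data.decode('utf-8', 'replace'))
        ExternalProcess.on_subprocess_stdout(self, pipe, item, data)

    def on_subprocess_end(self, item, returncode):
        # Assumption: called once per process exit, including retries,
        # which is what makes exit_count equal max_tries on failure.
        self.exit_count += 1
        self.return_code = returncode
        ExternalProcess.on_subprocess_end(self, item, returncode)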
Example #13
    def test_max_items(self):
        pipeline = Pipeline(PrintItem(), PrintItem())
        pipeline.has_failed = None

        def fail_callback(task, item):
            pipeline.has_failed = True

        pipeline.on_fail_item += fail_callback

        runner = SimpleRunner(pipeline, max_items=3)

        def finish_item_callback(runner, pipeline, item):
            if runner.item_count > 10:
                raise Exception('Too many items.')

        runner.on_pipeline_finish_item += finish_item_callback
        runner.start()

        self.assertFalse(pipeline.has_failed)
        self.assertEqual(3, runner.item_count)
        self.assertIOLoopOK()
Example #15
    def test_runner_signals_pipeline_on_stop(self):
        pipeline = Pipeline(PrintItem())
        runner = SimpleRunner(pipeline, max_items=1)

        def stop_requested():
            self.stop_requested_calls += 1

        pipeline.on_stop_requested += stop_requested
        runner.start()
        runner.stop_gracefully()

        self.assertEqual(1, self.stop_requested_calls)
Example #16
    def test_runner_does_pipeline_cleanup_before_shutdown(self):
        pipeline = Pipeline(PrintItem())
        runner = SimpleRunner(pipeline, max_items=1)

        def cleanup():
            self.cleanup_calls += 1

        pipeline.on_cleanup += cleanup
        runner.start()

        self.assertEqual(1, self.cleanup_calls)
        self.assertEqual(1, runner.item_count)
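
PrintItem is another small helper from the same test suite. A minimal sketch, assuming it is a trivial SimpleTask (seesaw's SimpleTask marks the item complete once process() returns):

from seesaw.task import SimpleTask


class PrintItem(SimpleTask):
    def __init__(self):
        SimpleTask.__init__(self, 'PrintItem')

    def process(self, item):
        # Log a line; SimpleTask completes the item after process() returns.
        item.log_output('Printing item.')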
Example #17
pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader,
                       VERSION), PrepareDirectories(warc_prefix="furaffinity"),
    ExternalProcess(
        'Begin',
        [sys.executable, 'helper.py', 'begin'],
        env={
            'user_agent': user_agent,
            'bind_address': globals().get('bind_address', ''),
            'disco_tracker': DISCO_TRACKER_URL,
            "item_dir": ItemValue("item_dir"),
        },
        accept_on_exit_code=[0],
    ),
    LimitConcurrent(
        NumberConfigValue(
            min=1,
            max=6,
            default=globals().get("num_procs", "1"),
            name="shared:fagrab:num_procs",
            title="Number of Processes",
            description="The maximum number of concurrent download processes."
        ),
        WgetDownload(WgetArgs(),
                     max_tries=1,
                     accept_on_exit_code=[0, 4, 7, 8],
                     env={
                         "item_dir": ItemValue("item_dir"),
                         "downloader": downloader,
                         "item_name": ItemValue("item_name"),
                     }),
    ),
    ExternalProcess(
        'End',
        [sys.executable, 'helper.py', 'end'],
        env={
            'user_agent': user_agent,
            'bind_address': globals().get('bind_address', ''),
            'disco_tracker': DISCO_TRACKER_URL,
            "item_dir": ItemValue("item_dir"),
        },
        accept_on_exit_code=[0],
    ),
    PrepareStatsForTracker(
        defaults={
            "downloader": downloader,
            "version": VERSION
        },
        file_groups={
            "data": [
                ItemInterpolation("%(item_dir)s/%(warc_file_base)s.warc.gz"),
            ]
        },
        id_function=stats_id_function,
    ), MoveFiles(),
    LimitConcurrent(
        NumberConfigValue(
            min=1,
            max=4,
            default="1",
            name="shared:rsync_threads",
            title="Rsync threads",
            description="The maximum number of concurrent uploads."),
        UploadWithTracker(
            "http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
            downloader=downloader,
            version=VERSION,
            files=[
                ItemInterpolation("%(data_dir)s/%(warc_file_base)s.warc.gz"),
            ],
            rsync_target_source_path=ItemInterpolation("%(data_dir)s/"),
            rsync_extra_args=[
                "--recursive",
                "--partial",
                "--partial-dir",
                ".rsync-tmp",
            ]),
    ),
    SendDoneToTracker(tracker_url="http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
                      stats=ItemValue("stats")))
Example #18
pipeline = Pipeline(
    GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader,
                       VERSION),
    PrepareDirectories(file_prefix="isohunt"),
    LimitConcurrent(
        NumberConfigValue(
            min=1,
            max=10,
            default="10",
            name="isohunt:download_threads",
            title="Isohunt downloading threads",
            description=
            "How many threads downloading Isohunt torrents and pages can run at once, to avoid throttling."
        ),
        WgetDownloadTorrentRange(
            [
                WGET_LUA,
                "-U",
                USER_AGENT,
                "--no-check-certificate",
                "-e",
                "robots=off",
                "--rotate-dns",
                "--timeout",
                "60",
                "--level=inf",
                "--tries",
                "20",
                "--waitretry",
                "5",
                # "--bind-address", "%BIND_ADDRESS%",
            ],
            max_tries=5,
            accept_on_exit_code=[0]),
    ),
    PrepareStatsForTracker2(
        defaults={
            "downloader": downloader,
            "version": VERSION
        },
        file_groups={
            "data": RangeInterpolation("%(item_dir)s/%(range_filename)s")
        }),  # Used to MoveFiles here, but that's actually kind of stupid.
    LimitConcurrent(
        NumberConfigValue(
            min=1,
            max=4,
            default="1",
            name="shared:rsync_threads",
            title="Rsync threads",
            description="The maximum number of concurrent uploads."),
        UploadWithTracker2(
            "http://tracker.archiveteam.org/%s" % TRACKER_ID,
            downloader=downloader,
            version=VERSION,
            files=RangeInterpolation("%(item_dir)s/%(range_filename)s"),
            rsync_target_source_path=ItemInterpolation("%(data_dir)s/"),
            rsync_extra_args=[
                "--recursive", "--partial", "--partial-dir", ".rsync-tmp"
            ]),
    ),
    CleanUpDirectories(),
    SendDoneToTracker(tracker_url="http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
                      stats=ItemValue("stats")))
Example #19
pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker('http://%s/%s' % (TRACKER_HOST, TRACKER_ID), downloader,
                       VERSION),  # noqa: F821
    PrepareDirectories(warc_prefix='yg-api'),
    YgaDownload(
        YgaArgs(),
        max_tries=0,  # 2,          #changed
        accept_on_exit_code=[0],  # [0, 4, 8],  #changed
        env={
            'item_dir': ItemValue('item_dir'),
            'item_value': ItemValue('item_value'),
            'item_type': ItemValue('item_type'),
            'warc_file_base': ItemValue('warc_file_base'),
        }),
    MoveFiles(),
    PrepareStatsForTracker(
        defaults={
            'downloader': downloader,
            'version': VERSION
        },  # noqa: F821
        file_groups={
            'data': [
                ItemInterpolation(
                    '%(data_dir)s/%(warc_file_base)s.warc.gz')  # TODO ?
            ]
        },
        id_function=stats_id_function,
    ),
    LimitConcurrent(
        NumberConfigValue(
            min=1,
            max=20,
            default='20',
            name='shared:rsync_threads',
            title='Rsync threads',
            description='The maximum number of concurrent uploads.'),
        UploadWithTracker(
            'http://%s/%s' % (TRACKER_HOST, TRACKER_ID),
            downloader=downloader,  # noqa: F821
            version=VERSION,
            files=ItemValue('files'),
            rsync_target_source_path=ItemInterpolation('%(data_dir)s/'),
            rsync_extra_args=[
                '--recursive',
                '--partial',
                '--partial-dir',
                '.rsync-tmp',
            ]),
    ),
    SendDoneToTracker(tracker_url='http://%s/%s' % (TRACKER_HOST, TRACKER_ID),
                      stats=ItemValue('stats')))
Example #20
pipeline = Pipeline(
    GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
                       downloader, VERSION),
    PrepareDirectories(warc_prefix="ybw-username"),
    ExternalProcess(
        'Scraper', [
            "python", "scraper.py",
            ItemInterpolation("%(item_name_punycode)s"),
            ItemInterpolation("%(item_dir)s/%(warc_file_base)s")
        ],
        env={'SCRAPER_BIND_ADDRESS': globals().get('bind_address', '')}),
    PrepareStatsForTracker(
        defaults={
            "downloader": downloader,
            "version": VERSION
        },
        file_groups={
            "data": [
                ItemInterpolation(
                    "%(item_dir)s/%(warc_file_base)s.wretch.txt"),
                ItemInterpolation("%(item_dir)s/%(warc_file_base)s.yahoo.txt"),
            ]
        }), MoveFiles(),
    LimitConcurrent(
        NumberConfigValue(
            min=1,
            max=4,
            default="1",
            name="shared:rsync_threads",
            title="Rsync threads",
            description="The maximum number of concurrent uploads."),
        UploadWithTracker(
            "http://tracker.archiveteam.org/%s" % TRACKER_ID,
            downloader=downloader,
            version=VERSION,
            files=[
                ItemInterpolation(
                    "%(data_dir)s/%(warc_file_base)s.wretch.txt"),
                ItemInterpolation("%(data_dir)s/%(warc_file_base)s.yahoo.txt"),
            ],
            rsync_target_source_path=ItemInterpolation("%(data_dir)s/"),
            rsync_extra_args=[
                "--recursive", "--partial", "--partial-dir", ".rsync-tmp"
            ]),
    ),
    SendDoneToTracker(tracker_url="http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
                      stats=ItemValue("stats")))
Example #21
        IOLoop.instance().add_timeout(datetime.timedelta(seconds=10),
                                      functools.partial(self._finish, item))

    def _finish(self, item):
        item.may_be_canceled = False
        self.complete_item(item)


class IdleTask(Task):
    def __init__(self):
        Task.__init__(self, 'IdleTask')

    def enqueue(self, item):
        self.start_item(item)
        item.may_be_canceled = True
        item.log_output('Pausing for 60 seconds...')

        IOLoop.instance().add_timeout(datetime.timedelta(seconds=60),
                                      functools.partial(self._finish, item))

    def _finish(self, item):
        item.may_be_canceled = False
        self.complete_item(item)


pipeline = Pipeline(
    WarningTask(),
    LimitConcurrent(1, ExternalProcess('Install', ['./install.sh'])),
    IdleTask(),
)
Example #22
pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker(
        'http://{}/{}/multi={}/'.format(TRACKER_HOST, TRACKER_ID,
                                        MULTI_ITEM_SIZE), downloader, VERSION),
    PrepareDirectories(warc_prefix='periscope'),
    WgetDownload(WgetArgs(),
                 max_tries=1,
                 accept_on_exit_code=[0, 4, 8],
                 env={
                     'item_dir': ItemValue('item_dir'),
                     'warc_file_base': ItemValue('warc_file_base'),
                 }), SetBadUrls(),
    PrepareStatsForTracker(
        defaults={
            'downloader': downloader,
            'version': VERSION
        },
        file_groups={
            'data':
            [ItemInterpolation('%(item_dir)s/%(warc_file_base)s.warc.gz')]
        },
        id_function=stats_id_function,
    ), MoveFiles(),
    LimitConcurrent(
        NumberConfigValue(
            min=1,
            max=20,
            default='2',
            name='shared:rsync_threads',
            title='Rsync threads',
            description='The maximum number of concurrent uploads.'),
        ChooseTargetAndUpload(),
    ),
    MaybeSendDoneToTracker(tracker_url='http://%s/%s' %
                           (TRACKER_HOST, TRACKER_ID),
                           stats=ItemValue('stats')))
Example #23
pipeline = Pipeline(
    GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader,
                       VERSION), PrepareDirectories(warc_prefix="hyves"),
    ConditionalTask(
        is_domain_site,
        WgetDownload(
            wget_args,
            max_tries=5,
            accept_on_exit_code=[0, 8],
            env={'hyves_username': ItemInterpolation("%(item_name)s")}),
    ),
    ConditionalTask(
        is_not_domain_site,
        WgetDownload(
            wget_args_not_domain,
            max_tries=5,
            accept_on_exit_code=[0, 8],
            env={'hyves_username': ItemInterpolation("%(item_name)s")}),
    ),
    PrepareStatsForTracker(
        defaults={
            "downloader": downloader,
            "version": VERSION
        },
        file_groups={
            "data":
            [ItemInterpolation("%(item_dir)s/%(warc_file_base)s.warc.gz")]
        }), MoveFiles(),
    LimitConcurrent(
        NumberConfigValue(
            min=1,
            max=4,
            default="1",
            name="shared:rsync_threads",
            title="Rsync threads",
            description="The maximum number of concurrent uploads."),
        UploadWithTracker(
            "http://tracker.archiveteam.org/%s" % TRACKER_ID,
            downloader=downloader,
            version=VERSION,
            files=[
                ItemInterpolation("%(data_dir)s/%(warc_file_base)s.warc.gz")
            ],
            rsync_target_source_path=ItemInterpolation("%(data_dir)s/"),
            rsync_extra_args=[
                "--recursive", "--partial", "--partial-dir", ".rsync-tmp"
            ]),
    ),
    SendDoneToTracker(tracker_url="http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
                      stats=ItemValue("stats")))
Example #24
pipeline = Pipeline(
    GetItemFromTracker("http://tracker.archiveteam.org/%s" % TRACKER_ID,
                       downloader, VERSION),
    PrepareDirectories(warc_prefix="xanga.com"), Login(),
    WgetDownload(
        [
            WGET_LUA, "-U", USER_AGENT, "-nv", "-o",
            ItemInterpolation("%(item_dir)s/wget.log"), "--load-cookies",
            ItemInterpolation("%(cookie_jar)s"), "--lua-script", "xanga.lua",
            "--no-check-certificate", "--output-document",
            ItemInterpolation("%(item_dir)s/wget.tmp"), "--truncate-output",
            "-e", "robots=off", "--rotate-dns", "--recursive", "--level=inf",
            "--page-requisites", "--timeout", "60", "--tries", "20",
            "--waitretry", "5", "--warc-file",
            ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
            "--warc-header", "operator: Archive Team", "--warc-header",
            "xanga-dld-script-version: " + VERSION, "--warc-header",
            ItemInterpolation("xanga-user: %(item_name)s"),
            ItemInterpolation("http://%(item_name)s.xanga.com/")
        ],
        max_tries=2,
        accept_on_exit_code=[0, 4, 6, 8],
    ),
    PrepareStatsForTracker(
        defaults={
            "downloader": downloader,
            "version": VERSION
        },
        file_groups={
            "data":
            [ItemInterpolation("%(item_dir)s/%(warc_file_base)s.warc.gz")]
        }), MoveFiles(),
    LimitConcurrent(
        NumberConfigValue(
            min=1,
            max=4,
            default="1",
            name="shared:rsync_threads",
            title="Rsync threads",
            description="The maximum number of concurrent uploads."),
        UploadWithTracker(
            "http://tracker.archiveteam.org/%s" % TRACKER_ID,
            downloader=downloader,
            version=VERSION,
            files=[
                ItemInterpolation("%(data_dir)s/%(warc_file_base)s.warc.gz")
            ],
            rsync_target_source_path=ItemInterpolation("%(data_dir)s/"),
            rsync_extra_args=[
                "--recursive", "--partial", "--partial-dir", ".rsync-tmp"
            ]),
    ),
    SendDoneToTracker(tracker_url="http://tracker.archiveteam.org/%s" %
                      TRACKER_ID,
                      stats=ItemValue("stats")))
Example #25
pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader,
                       VERSION), PrepareDirectories(warc_prefix="friendfeed"),
    WgetDownload(WgetArgs(),
                 max_tries=2,
                 accept_on_exit_code=[0, 4, 8],
                 env={
                     "item_dir": ItemValue("item_dir"),
                     "item_value": ItemValue("item_value"),
                     "item_type": ItemValue("item_type"),
                 }),
    PrepareStatsForTracker(
        defaults={
            "downloader": downloader,
            "version": VERSION
        },
        file_groups={
            "data":
            [ItemInterpolation("%(item_dir)s/%(warc_file_base)s.warc.gz")]
        },
        id_function=stats_id_function,
    ), MoveFiles(),
    LimitConcurrent(
        NumberConfigValue(
            min=1,
            max=4,
            default="1",
            name="shared:rsync_threads",
            title="Rsync threads",
            description="The maximum number of concurrent uploads."),
        UploadWithTracker(
            "http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
            downloader=downloader,
            version=VERSION,
            files=[
                ItemInterpolation("%(data_dir)s/%(warc_file_base)s.warc.gz")
            ],
            rsync_target_source_path=ItemInterpolation("%(data_dir)s/"),
            rsync_extra_args=[
                "--recursive",
                "--partial",
                "--partial-dir",
                ".rsync-tmp",
            ]),
    ),
    SendDoneToTracker(tracker_url="http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
                      stats=ItemValue("stats")))
Example #26
pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader,
                       VERSION), PrepareDirectories(),
    WgetDownload(WgetArgs(), ), DeduplicateWarcExtProc(DedupeArgs()),
    PrepareStatsForTracker(
        defaults={
            "downloader": downloader,
            "version": VERSION
        },
        file_groups={
            "data": [
                ItemInterpolation(
                    "%(data_dir)s/%(item_name)s.deduplicated.warc.gz")
            ]
        },
        id_function=stats_id_function,
    ), MoveFiles(),
    LimitConcurrent(
        NumberConfigValue(
            min=1,
            max=4,
            default="1",
            name="shared:rsync_threads",
            title="Rsync threads",
            description="The maximum number of concurrent uploads."),
        UploadToIA(UploadToIAArgs()),
    ), DeleteFiles(),
    SendDoneToTracker(tracker_url="http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
                      stats=ItemValue("stats")))
Example #27
pipeline = Pipeline(
	CheckIP(),
	GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader, VERSION),
	ExternalProcess("Size Test",[RSYNC_TEST,"-t",getRsyncURL("foo"),"-m",MAX_RSYNC]),
	LimitConcurrent(1,ExternalProcess("rsync", ["rsync", "-av", getRsyncURL("foo"), cleanItem("%(data_dir)s/%(item_name)s")])),
	ExternalProcess("tar", ["tar", "-czf", cleanItem("%(data_dir)s/%(item_name)s.tar.gz"), "-C", ItemInterpolation("%(data_dir)s/"), "--owner=1999", "--group=2015", "--no-same-permissions", cleanItem("%(item_name)s")]),
	LimitConcurrent(NumberConfigValue(min=1, max=4, default="1",
		name="shared:rsync_threads", title="Rsync threads",
		description="The maximum number of concurrent uploads."),
		UploadWithTracker(
			"http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
			downloader=downloader,
			version=VERSION,
			files=[
				cleanItem("%(data_dir)s/%(item_name)s.tar.gz")
				#ItemInterpolation("foo.tar.gz")
			],
			rsync_target_source_path=ItemInterpolation("%(data_dir)s/"),
			rsync_extra_args=[
				"--recursive",
				"--partial",
				"--partial-dir", ".rsync-tmp",
			]
		),
	),
	PrepareStatsForTracker(
		defaults={"downloader": downloader, "version": VERSION},
		file_groups={
			"data": [
				cleanItem("%(data_dir)s/%(item_name)s.tar.gz")
			]
		},
		id_function=stats_id_function,
	),
	SendDoneToTracker(
		tracker_url="http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
		stats=ItemValue("stats")
	)
)
Example #28
pipeline = Pipeline(
    CheckIP(),
    # GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader,
    #                    VERSION),
    SetItemKey("item_name", "smaug.fart.website:8080"),
    PrepareDirectories(warc_prefix="examplecity"),
    WgetDownload(WgetArgs(),
                 max_tries=2,
                 accept_on_exit_code=[0, 4, 7, 8],
                 env={
                     "item_dir": ItemValue("item_dir"),
                     "downloader": downloader
                 }),
    PrepareStatsForTracker(
        defaults={
            "downloader": downloader,
            "version": VERSION
        },
        file_groups={
            "data": [
                ItemInterpolation("%(item_dir)s/%(warc_file_base)s.warc.gz"),
            ]
        },
        id_function=stats_id_function,
    ),
    MoveFiles(),
    # LimitConcurrent(
    #     NumberConfigValue(min=1, max=4, default="1",
    #                       name="shared:rsync_threads", title="Rsync threads",
    #                       description="The maximum number of concurrent uploads."),
    #     UploadWithTracker(
    #         "http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
    #         downloader=downloader,
    #         version=VERSION,
    #         files=[
    #             ItemInterpolation("%(data_dir)s/%(warc_file_base)s.warc.gz"),
    #         ],
    #         rsync_target_source_path=ItemInterpolation("%(data_dir)s/"),
    #         rsync_extra_args=[
    #             "--recursive",
    #             "--partial",
    #             "--partial-dir", ".rsync-tmp",
    #             ]
    #     ),
    # ),
    # SendDoneToTracker(
    #     tracker_url="http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
    #     stats=ItemValue("stats")
    # )
    ExternalProcess("sleep", ["sleep", "60"]),
)
Example #29
pipeline = Pipeline(
    GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader, VERSION),
    ExtraItemParams(),
    PrepareDirectories(warc_prefix="puush"),
    SpecializedWgetDownloadMany([ WGET_LUA,
          "-U", USER_AGENT,
          "-nv",
          "-o", ItemInterpolation("%(item_dir)s/wget.log"),
          "--lua-script", "puush.lua",
          "--no-check-certificate",
          "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
          "--truncate-output",
          "-e", "robots=off",
          "--rotate-dns",
          "--timeout", "60",
          "--tries", "20",
          "--waitretry", "5",
          "--warc-file", ItemInterpolation("%(item_dir)s/%(current_warc_file_base)s"),
          "--warc-header", "operator: Archive Team",
          "--warc-header", "puush-dld-script-version: " + VERSION,
        ],
        URLsToDownload(),
        max_tries=20,
        accept_on_exit_code=[
            0,
            EXIT_STATUS_PERMISSION_DENIED,
            EXIT_STATUS_NOT_FOUND
        ],  # see the lua script, also MoveFiles
    ),
    MoveFiles(),
    PrepareStatsForTracker2(
        defaults={ "downloader": downloader, "version": VERSION },
        file_groups={
            "data": FilesToUpload(),
        },
        id_function=prepare_stats_id_function,
    ),
    CleanUpItemDir(),
    LimitConcurrent(
        NumberConfigValue(min=1, max=4, default="1",
            name="shared:rsync_threads",
            title="Rsync threads",
            description="The maximum number of concurrent uploads."),
        ConditionalTask(
            files_to_upload,
            UploadWithTracker2(
                "http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
                downloader=downloader,
                version=VERSION,
                files=FilesToUpload(),
                rsync_target_source_path=ItemInterpolation("%(data_dir)s/"),
                rsync_extra_args=[
                "--recursive",
                "--partial",
                "--partial-dir", ".rsync-tmp"
                ]
            )
        )
    ),
    SendDoneToTracker(
        tracker_url="http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
        stats=ItemValue("stats")
    )
)
Example #30
pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker('http://%s/%s' % (TRACKER_HOST, TRACKER_ID), downloader,
                       VERSION),
    PrepareDirectories(warc_prefix='sketch'),
    WgetDownload(WgetArgs(),
                 max_tries=1,
                 accept_on_exit_code=[0, 4, 8],
                 env={
                     'item_dir': ItemValue('item_dir'),
                     'item_value': ItemValue('item_value'),
                     'item_type': ItemValue('item_type'),
                     'warc_file_base': ItemValue('warc_file_base')
                 }),
    PrepareStatsForTracker(
        defaults={
            'downloader': downloader,
            'version': VERSION
        },
        file_groups={
            'data': [
                ItemInterpolation('%(item_dir)s/%(warc_file_base)s.warc.gz')
                #ItemInterpolation('%(item_dir)s/%(warc_file_base)s-deduplicated.warc.gz')
            ]
        },
        id_function=stats_id_function,
    ),
    MoveFiles(),
    LimitConcurrent(
        NumberConfigValue(
            min=1,
            max=20,
            default="20",
            name="shared:rsync_threads",
            title="Rsync threads",
            description="The maximum number of concurrent uploads."),
        UploadWithTracker(
            "http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
            downloader=downloader,
            version=VERSION,
            files=[
                ItemInterpolation("%(data_dir)s/%(warc_file_base)s.warc.gz"),
                ItemInterpolation("%(data_dir)s/%(warc_file_base)s_data.txt")
            ],
            rsync_target_source_path=ItemInterpolation("%(data_dir)s/"),
            rsync_extra_args=[
                "--sockopts=SO_SNDBUF=8388608,SO_RCVBUF=8388608",  # 02:50 <Kenshin> the extra options should improve rsync speeds when the latency is higher
                "--recursive",
                "--partial",
                "--partial-dir",
                ".rsync-tmp",
                "--min-size",
                "1",
                "--no-compress",
                "--compress-level=0"
            ]),
    ),
    SendDoneToTracker(tracker_url='http://%s/%s' % (TRACKER_HOST, TRACKER_ID),
                      stats=ItemValue('stats')))
Example #31
pipeline = Pipeline(
	# request an item from the tracker (using the universal-tracker protocol)
	# the downloader variable will be set by the warrior environment
	#
	# this task will wait for an item and set item["item_name"] to the item name
	# before finishing
	GetItemFromTracker(TRACKER_URL, downloader, VERSION),

	# create the directories and initialize the filenames (see above)
	# warc_prefix is the first part of the warc filename
	#
	# this task will set item["item_dir"] and item["warc_file_base"]
	PrepareDirectories(warc_prefix="greader"),

	# execute Wget+Lua
	#
	# the ItemInterpolation() objects are resolved during runtime
	# (when there is an Item with values that can be added to the strings)
	WgetDownloadWithStdin([
			# TODO: cert-pin
			WGET_LUA,
			"-U", ItemInterpolation("%(user_agent)s"),
			"-nv",
			"-o", ItemInterpolation("%(item_dir)s/wget.log"),
			"--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
			"--truncate-output",
			"-e", "robots=off",
			"--rotate-dns",
			"--timeout", ItemInterpolation("%(wget_timeout)s"),
			"--tries", ItemInterpolation("%(wget_tries)s"),
			"--waitretry", ItemInterpolation("%(wget_waitretry)s"),
			"--lua-script", "greader.lua",
			"--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
			"--warc-header", "operator: Archive Team",
			"--warc-header", "greader-dld-script-version: " + VERSION,
			"--input", "-"
		],
		max_tries=2,
		accept_on_exit_code=[0, 8], # which Wget exit codes count as a success?
		env=dict(SSL_CERT_DIR=SSL_CERT_DIR),
		stdin_data_function=(lambda item: "\n".join(u.encode("utf-8") for u in item["task_urls"]) + "\n"),
	),

	# this will set the item["stats"] string that is sent to the tracker (see below)
	PrepareStatsForTracker(
		# there are a few normal values that need to be sent
		defaults={"downloader": downloader, "version": VERSION},
		# this is used for the size counter on the tracker:
		# the groups should correspond to the groups configured on the tracker
		file_groups={
			# there can be multiple groups with multiple files
			# file sizes are measured per group
			"data": [ItemInterpolation("%(item_dir)s/%(warc_file_base)s.warc.gz")]
		},
		id_function=(lambda item: {"ua": item["user_agent"] })
	),

	# remove the temporary files, move the warc file from
	# item["item_dir"] to item["data_dir"]
	MoveFiles(),

	# there can be multiple items in the pipeline, but this wrapper ensures
	# that there is only one item uploading at a time
	#
	# the NumberConfigValue can be changed in the configuration panel
	LimitConcurrent(
		NumberConfigValue(
			min=1, max=4, default="1", name="shared:rsync_threads", title="Rsync threads",
			description="The maximum number of concurrent uploads."),
		# this upload task asks the tracker for an upload target
		# this can be HTTP or rsync and can be changed in the tracker admin panel
		UploadWithTracker(
			TRACKER_URL,
			downloader=downloader,
			version=VERSION,
			# list the files that should be uploaded.
			# this may include directory names.
			# note: HTTP uploads will only upload the first file on this list
			files=[
				ItemInterpolation("%(data_dir)s/%(warc_file_base)s.warc.gz")
			],
			# the relative path for the rsync command
			# (this defines if the files are uploaded to a subdirectory on the server)
			rsync_target_source_path=ItemInterpolation("%(data_dir)s/"),
			# extra rsync parameters (probably standard)
			rsync_extra_args=[
				"--recursive",
				"--partial",
				"--partial-dir", ".rsync-tmp"
			]
		),
	),

	# if the item passed every task, notify the tracker and report the statistics
	SendDoneToTracker(
		tracker_url=TRACKER_URL,
		stats=ItemValue("stats")
	)
)
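
As the comments in this example note, ItemInterpolation() objects are resolved at runtime against the current item's values. A minimal sketch of that resolution, assuming seesaw's realize() helper and using a plain dict to stand in for an Item:

from seesaw.config import realize
from seesaw.item import ItemInterpolation

path = ItemInterpolation("%(item_dir)s/%(warc_file_base)s.warc.gz")
fake_item = {"item_dir": "/tmp/work", "warc_file_base": "greader-example"}

# realize() resolves the interpolation against the item's values.
print(realize(path, fake_item))  # -> /tmp/work/greader-example.warc.gz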
Example #32
    finished_warcs_dir=os.environ["FINISHED_WARCS_DIR"],
    warc_max_size=WARC_MAX_SIZE,
    monitor_disk=WPULL_MONITOR_DISK,
    monitor_memory=WPULL_MONITOR_MEMORY,
)

pipeline = Pipeline(
    CheckIP(), CheckLocalWebserver(),
    GetItemFromQueue(control,
                     pipeline_id,
                     downloader,
                     ao_only=env.get('AO_ONLY'),
                     large=env.get('LARGE'),
                     version_check=(VERSION, pipeline_version)),
    StartHeartbeat(control), SetFetchDepth(), PreparePaths(), WriteInfo(),
    DownloadUrlFile(control),
    Wpull(wpull_args,
          accept_on_exit_code=AcceptAny(),
          env={
              'ITEM_IDENT': ItemInterpolation('%(ident)s'),
              'LOG_KEY': ItemInterpolation('%(log_key)s'),
              'REDIS_URL': REDIS_URL,
              'PATH': os.environ['PATH']
          }), RelabelIfAborted(control), CompressLogIfFailed(), WriteInfo(),
    MoveFiles(target_directory=os.environ["FINISHED_WARCS_DIR"]),
    StopHeartbeat(), MarkItemAsDone(control, EXPIRE_TIME))


def stop_control():
    #control.flag_logging_thread_for_termination()
    control.unregister_pipeline(pipeline_id)
Example #33
                globals()['downloader'],
                globals().get('bind_address', ''), USER_AGENT
            ],
            env=env)


project = Project(title="URLTeam 2",
                  project_html="""
    <img class="project-logo" alt=""
        src="http://archiveteam.org/images/9/9d/Urlteam-logo-reasonable-size.png"
        height="50"
    title="url shortening was a f*****g awful idea" />
    <h2>URLTeam 2
        <span class="links">
            <a href="http://urlte.am/">Website</a> &middot;
            <a href="http://%s/">Leaderboard</a> &middot;
            <a href="http://archiveteam.org/index.php?title=URLTeam">Wiki</a>
        </span>
    </h2>
    <p>The Terror of Tiny Town</p>
    """ % (TRACKER_HOST))

tasks = [CheckIP(), RunScraper()]

if globals().get('no_submodule'):
    print('Not updating submodule')
else:
    tasks.insert(0, MaybeUpdateSubmodule())

pipeline = Pipeline(*tasks)
Example #34
pipeline = Pipeline(
    CheckIP(),
    GetItemFromQueue(control, pipeline_id, downloader, ao_only=env.get('AO_ONLY')),
    StartHeartbeat(control),
    SetFetchDepth(),
    PreparePaths(),
    WriteInfo(),
    DownloadUrlFile(control),
    WgetDownload(
        wpull_args,
        accept_on_exit_code=AcceptAny(),
        env={
            'ITEM_IDENT': ItemInterpolation('%(ident)s'),
            'LOG_KEY': ItemInterpolation('%(log_key)s'),
            'REDIS_URL': REDIS_URL,
            'PATH': os.environ['PATH']
        }
    ),
    RelabelIfAborted(control),
    WriteInfo(),
    MoveFiles(),
    LimitConcurrent(2,
        RsyncUpload(
            target = RSYNC_URL,
            target_source_path = ItemInterpolation("%(data_dir)s"),
            files=ItemValue("all_target_files"),
            extra_args = [
                '--partial',
                '--partial-dir', '.rsync-tmp'
            ]
        )
    ),
    StopHeartbeat(),
    MarkItemAsDone(control, EXPIRE_TIME)
)
Example #35
                       phantomjs_exe=PHANTOMJS,
                       finished_warcs_dir=os.environ["FINISHED_WARCS_DIR"],
                       warc_max_size=WARC_MAX_SIZE)

pipeline = Pipeline(
    CheckIP(),
    GetItemFromQueue(control,
                     pipeline_id,
                     downloader,
                     ao_only=env.get('AO_ONLY'),
                     large=env.get('LARGE')), StartHeartbeat(control),
    SetFetchDepth(), PreparePaths(), WriteInfo(), DownloadUrlFile(control),
    WgetDownload(wpull_args,
                 accept_on_exit_code=AcceptAny(),
                 env={
                     'ITEM_IDENT': ItemInterpolation('%(ident)s'),
                     'LOG_KEY': ItemInterpolation('%(log_key)s'),
                     'REDIS_URL': REDIS_URL,
                     'PATH': os.environ['PATH']
                 }), RelabelIfAborted(control), WriteInfo(), MoveFiles(),
    LimitConcurrent(
        2,
        RsyncUpload(target=RSYNC_URL,
                    target_source_path=ItemInterpolation("%(data_dir)s"),
                    files=ItemValue("all_target_files"),
                    extra_args=['--partial', '--partial-dir', '.rsync-tmp'])),
    StopHeartbeat(), MarkItemAsDone(control, EXPIRE_TIME))


def stop_control():
    #control.flag_logging_thread_for_termination()
    control.unregister_pipeline(pipeline_id)
Example #36
        IOLoop.instance().add_timeout(datetime.timedelta(seconds=10),
                                      functools.partial(self._finish, item))

    def _finish(self, item):
        item.may_be_canceled = False
        self.complete_item(item)


class IdleTask(Task):
    def __init__(self):
        Task.__init__(self, 'IdleTask')

    def enqueue(self, item):
        self.start_item(item)
        item.may_be_canceled = True
        item.log_output('Pausing for 60 seconds...')

        IOLoop.instance().add_timeout(datetime.timedelta(seconds=60),
                                      functools.partial(self._finish, item))

    def _finish(self, item):
        item.may_be_canceled = False
        self.complete_item(item)


pipeline = Pipeline(
    WarningTask(),
    LimitConcurrent(
        1, ExternalProcess('Install Python 3.5', ['install-python3.5.sh'])),
    IdleTask(),
)
Example #37
pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker('http://%s/%s' % (TRACKER_HOST, TRACKER_ID), downloader,
                       VERSION), PrepareDirectories(warc_prefix='bitbucket'),
    WgetDownload(WgetArgs(),
                 max_tries=2,
                 accept_on_exit_code=[0, 4, 8],
                 env={
                     'item_dir': ItemValue('item_dir'),
                     'item_value': ItemValue('item_value'),
                     'item_type': ItemValue('item_type'),
                     'warc_file_base': ItemValue('warc_file_base'),
                 }),
    PrepareStatsForTracker(
        defaults={
            'downloader': downloader,
            'version': VERSION
        },
        file_groups={
            'data':
            [ItemInterpolation('%(item_dir)s/%(warc_file_base)s.warc.zst')]
        },
        id_function=stats_id_function,
    ), MoveFiles(),
    LimitConcurrent(
        NumberConfigValue(
            min=1,
            max=20,
            default='2',
            name='shared:rsync_threads',
            title='Rsync threads',
            description='The maximum number of concurrent uploads.'),
        UploadWithTracker(
            'http://%s/%s' % (TRACKER_HOST, TRACKER_ID),
            downloader=downloader,
            version=VERSION,
            files=[
                ItemInterpolation(
                    '%(data_dir)s/%(warc_file_base)s.%(dict_project)s.%(dict_id)s.warc.zst'
                ),
                ItemInterpolation('%(data_dir)s/%(warc_file_base)s_data.txt')
            ],
            rsync_target_source_path=ItemInterpolation('%(data_dir)s/'),
            rsync_extra_args=[
                '--recursive', '--partial', '--partial-dir', '.rsync-tmp',
                '--min-size', '1', '--no-compress', '--compress-level', '0'
            ]),
    ),
    SendDoneToTracker(tracker_url='http://%s/%s' % (TRACKER_HOST, TRACKER_ID),
                      stats=ItemValue('stats')))
Example #38
        'python_version': sys.version,
    }
    return d


###########################################################################
# Initialize the project.
#
# This will be shown in the warrior management panel. The logo should not
# be too big. The deadline is optional.
project = Project(title='load',
                  project_html='''
        <img class="project-logo" alt="Project logo" src="https://www.archiveteam.org/images/b/b5/Reddit_logo.png" height="50px" title=""/>
        <h2>reddit.com <span class="links"><a href="https://reddit.com/">Website</a> &middot; <a href="http://tracker.archiveteam.org/reddit/">Leaderboard</a></span></h2>
        <p>Archiving everything from reddit.</p>
    ''')

pipeline = Pipeline(
    GetItemFromTracker('http://%s/%s' % (TRACKER_HOST, TRACKER_ID), downloader,
                       VERSION),
    PrepareStatsForTracker(
        defaults={
            'downloader': downloader,
            'version': VERSION
        },
        file_groups={'data': ['/dev/null']},
        id_function=stats_id_function,
    ),
    SendDoneToTracker(tracker_url='http://%s/%s' % (TRACKER_HOST, TRACKER_ID),
                      stats=ItemValue('stats')))