def test_spurious_item_events(self):
    class StupidTask(SimpleTask):
        def __init__(self):
            SimpleTask.__init__(self, "StupidTask")

        def process(self, item):
            item.log_output('Failing the item.')
            self.fail_item(item)
            item.log_output('Completing the item.')
            self.complete_item(item)
            item.log_output('Failing the item.')
            self.fail_item(item)

    pipeline = Pipeline(StupidTask())
    pipeline.fail_count_test = 0

    def fail_callback(task, item):
        pipeline.fail_count_test += 1

    pipeline.on_fail_item += fail_callback

    runner = SimpleRunner(pipeline, max_items=1)
    runner.start()

    self.assertEqual(1, pipeline.fail_count_test)
    self.assertIOLoopOK()
def test_no_such_file(self):
    external_process = ExternalProcessUser(
        "Fake", ["kitteh and doge.avi.exe"])

    pipeline = Pipeline(external_process)
    pipeline.has_failed = None

    def fail_callback(task, item):
        pipeline.has_failed = True

    pipeline.on_fail_item += fail_callback

    runner = SimpleRunner(pipeline, max_items=1)
    runner.start()

    self.assertTrue(pipeline.has_failed)
    self.assertIOLoopOK()
def test_proc(self):
    external_process = ExternalProcessUser(
        "Echo", ["python", "-c", "print('hello world!')"],
        max_tries=4)

    pipeline = Pipeline(external_process)
    pipeline.has_failed = None

    def fail_callback(task, item):
        pipeline.has_failed = True

    pipeline.on_fail_item += fail_callback

    runner = SimpleRunner(pipeline, max_items=1)
    runner.start()

    output = external_process.output_buffer.getvalue()

    self.assertFalse(pipeline.has_failed)
    self.assertTrue('hello world!' in output)
    self.assertIOLoopOK()
def test_proc_utf8(self):
    external_process = ExternalProcessUser(
        "Echo", ["python", "-c", "print(u'hello world!áßðfáßðf')"],
    )

    pipeline = Pipeline(external_process)
    pipeline.has_failed = None

    def fail_callback(task, item):
        pipeline.has_failed = True

    pipeline.on_fail_item += fail_callback

    runner = SimpleRunner(pipeline, max_items=1)
    runner.start()

    self.assertFalse(pipeline.has_failed)
    self.assertIOLoopOK()
def test_proc_stdin_error(self):
    external_process = ExternalProcessUser(
        "Echo", ["python", "-c", "print('hello world!')"],
        max_tries=4)
    # stdin_data should return bytes; returning an int forces a write error.
    external_process.stdin_data = lambda item: 123456

    pipeline = Pipeline(external_process)
    pipeline.has_failed = None

    def fail_callback(task, item):
        pipeline.has_failed = True

    pipeline.on_fail_item += fail_callback

    runner = SimpleRunner(pipeline, max_items=1)
    runner.start()

    self.assertTrue(pipeline.has_failed)
    self.assertIOLoopOK()
    self.assertEqual(4, external_process.exit_count)
def test_proc_fail(self):
    for max_tries in [1, 2, 20]:
        external_process = ExternalProcessUser(
            "Quitter", ["python", "-c", "import sys;sys.exit(33)"],
            max_tries=max_tries)

        pipeline = Pipeline(external_process)
        pipeline.has_failed = None

        def fail_callback(task, item):
            pipeline.has_failed = True

        pipeline.on_fail_item += fail_callback

        runner = SimpleRunner(pipeline, max_items=1)
        runner.start()

        self.assertTrue(pipeline.has_failed)
        self.assertEqual(33, external_process.return_code)
        self.assertEqual(max_tries, external_process.exit_count)
        self.assertIOLoopOK()
def test_max_items(self):
    pipeline = Pipeline(PrintItem(), PrintItem())
    pipeline.has_failed = None

    def fail_callback(task, item):
        pipeline.has_failed = True

    pipeline.on_fail_item += fail_callback

    runner = SimpleRunner(pipeline, max_items=3)

    def finish_item_callback(runner, pipeline, item):
        if runner.item_count > 10:
            raise Exception('Too many items.')

    runner.on_pipeline_finish_item += finish_item_callback
    runner.start()

    self.assertFalse(pipeline.has_failed)
    self.assertEqual(3, runner.item_count)
    self.assertIOLoopOK()
def test_runner_signals_pipeline_on_stop(self):
    self.stop_requested_calls = 0

    pipeline = Pipeline(PrintItem())
    runner = SimpleRunner(pipeline, max_items=1)

    def stop_requested():
        self.stop_requested_calls += 1

    pipeline.on_stop_requested += stop_requested

    runner.start()
    runner.stop_gracefully()

    self.assertEqual(1, self.stop_requested_calls)
def test_runner_does_pipeline_cleanup_before_shutdown(self):
    self.cleanup_calls = 0

    pipeline = Pipeline(PrintItem())
    runner = SimpleRunner(pipeline, max_items=1)

    def cleanup():
        self.cleanup_calls += 1

    pipeline.on_cleanup += cleanup

    runner.start()

    self.assertEqual(1, self.cleanup_calls)
    self.assertEqual(1, runner.item_count)
pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader,
                       VERSION),
    PrepareDirectories(warc_prefix="furaffinity"),
    ExternalProcess(
        'Begin',
        [sys.executable, 'helper.py', 'begin'],
        env={
            'user_agent': user_agent,
            'bind_address': globals().get('bind_address', ''),
            'disco_tracker': DISCO_TRACKER_URL,
            "item_dir": ItemValue("item_dir"),
        },
        accept_on_exit_code=[0],
    ),
    LimitConcurrent(
        NumberConfigValue(
            min=1, max=6, default=globals().get("num_procs", "1"),
            name="shared:fagrab:num_procs",
            title="Number of Processes",
            description="The maximum number of concurrent download processes."),
        WgetDownload(
            WgetArgs(),
            max_tries=1,
            accept_on_exit_code=[0, 4, 7, 8],
            env={
                "item_dir": ItemValue("item_dir"),
                "downloader": downloader,
                "item_name": ItemValue("item_name"),
            }),
    ),
    ExternalProcess(
        'End',
        [sys.executable, 'helper.py', 'end'],
        env={
            'user_agent': user_agent,
            'bind_address': globals().get('bind_address', ''),
            'disco_tracker': DISCO_TRACKER_URL,
            "item_dir": ItemValue("item_dir"),
        },
        accept_on_exit_code=[0],
    ),
    PrepareStatsForTracker(
        defaults={"downloader": downloader, "version": VERSION},
        file_groups={
            "data": [
                ItemInterpolation("%(item_dir)s/%(warc_file_base)s.warc.gz"),
            ]
        },
        id_function=stats_id_function,
    ),
    MoveFiles(),
    LimitConcurrent(
        NumberConfigValue(
            min=1, max=4, default="1",
            name="shared:rsync_threads", title="Rsync threads",
            description="The maximum number of concurrent uploads."),
        UploadWithTracker(
            "http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
            downloader=downloader,
            version=VERSION,
            files=[
                ItemInterpolation("%(data_dir)s/%(warc_file_base)s.warc.gz"),
            ],
            rsync_target_source_path=ItemInterpolation("%(data_dir)s/"),
            rsync_extra_args=[
                "--recursive",
                "--partial",
                "--partial-dir", ".rsync-tmp",
            ]),
    ),
    SendDoneToTracker(
        tracker_url="http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
        stats=ItemValue("stats"))
)
pipeline = Pipeline(
    GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader,
                       VERSION),
    PrepareDirectories(file_prefix="isohunt"),
    LimitConcurrent(
        NumberConfigValue(
            min=1, max=10, default="10",
            name="isohunt:download_threads",
            title="Isohunt downloading threads",
            description="How many threads downloading Isohunt torrents and pages can run at once, to avoid throttling."),
        WgetDownloadTorrentRange(
            [
                WGET_LUA,
                "-U", USER_AGENT,
                "--no-check-certificate",
                "-e", "robots=off",
                "--rotate-dns",
                "--timeout", "60",
                "--level=inf",
                "--tries", "20",
                "--waitretry", "5",
                # "--bind-address", "%BIND_ADDRESS%",
            ],
            max_tries=5,
            accept_on_exit_code=[0]),
    ),
    PrepareStatsForTracker2(
        defaults={"downloader": downloader, "version": VERSION},
        file_groups={
            "data": RangeInterpolation("%(item_dir)s/%(range_filename)s")
        }),
    # Used to MoveFiles here, but that's actually kind of stupid.
    LimitConcurrent(
        NumberConfigValue(
            min=1, max=4, default="1",
            name="shared:rsync_threads", title="Rsync threads",
            description="The maximum number of concurrent uploads."),
        UploadWithTracker2(
            "http://tracker.archiveteam.org/%s" % TRACKER_ID,
            downloader=downloader,
            version=VERSION,
            files=RangeInterpolation("%(item_dir)s/%(range_filename)s"),
            rsync_target_source_path=ItemInterpolation("%(data_dir)s/"),
            rsync_extra_args=[
                "--recursive",
                "--partial",
                "--partial-dir", ".rsync-tmp"
            ]),
    ),
    CleanUpDirectories(),
    SendDoneToTracker(
        tracker_url="http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
        stats=ItemValue("stats"))
)
pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker('http://%s/%s' % (TRACKER_HOST, TRACKER_ID),
                       downloader, VERSION),  # noqa: F821
    PrepareDirectories(warc_prefix='yg-api'),
    YgaDownload(
        YgaArgs(),
        max_tries=0,  # 2, #changed
        accept_on_exit_code=[0],  # [0, 4, 8], #changed
        env={
            'item_dir': ItemValue('item_dir'),
            'item_value': ItemValue('item_value'),
            'item_type': ItemValue('item_type'),
            'warc_file_base': ItemValue('warc_file_base'),
        }),
    MoveFiles(),
    PrepareStatsForTracker(
        defaults={'downloader': downloader, 'version': VERSION},  # noqa: F821
        file_groups={
            'data': [
                ItemInterpolation('%(data_dir)s/%(warc_file_base)s.warc.gz')  # TODO ?
            ]
        },
        id_function=stats_id_function,
    ),
    LimitConcurrent(
        NumberConfigValue(
            min=1, max=20, default='20',
            name='shared:rsync_threads', title='Rsync threads',
            description='The maximum number of concurrent uploads.'),
        UploadWithTracker(
            'http://%s/%s' % (TRACKER_HOST, TRACKER_ID),
            downloader=downloader,  # noqa: F821
            version=VERSION,
            files=ItemValue('files'),
            rsync_target_source_path=ItemInterpolation('%(data_dir)s/'),
            rsync_extra_args=[
                '--recursive',
                '--partial',
                '--partial-dir', '.rsync-tmp',
            ]),
    ),
    SendDoneToTracker(
        tracker_url='http://%s/%s' % (TRACKER_HOST, TRACKER_ID),
        stats=ItemValue('stats'))
)
pipeline = Pipeline(
    GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader,
                       VERSION),
    PrepareDirectories(warc_prefix="ybw-username"),
    ExternalProcess(
        'Scraper',
        [
            "python", "scraper.py",
            ItemInterpolation("%(item_name_punycode)s"),
            ItemInterpolation("%(item_dir)s/%(warc_file_base)s")
        ],
        env={'SCRAPER_BIND_ADDRESS': globals().get('bind_address', '')}),
    PrepareStatsForTracker(
        defaults={"downloader": downloader, "version": VERSION},
        file_groups={
            "data": [
                ItemInterpolation("%(item_dir)s/%(warc_file_base)s.wretch.txt"),
                ItemInterpolation("%(item_dir)s/%(warc_file_base)s.yahoo.txt"),
            ]
        }),
    MoveFiles(),
    LimitConcurrent(
        NumberConfigValue(
            min=1, max=4, default="1",
            name="shared:rsync_threads", title="Rsync threads",
            description="The maximum number of concurrent uploads."),
        UploadWithTracker(
            "http://tracker.archiveteam.org/%s" % TRACKER_ID,
            downloader=downloader,
            version=VERSION,
            files=[
                ItemInterpolation("%(data_dir)s/%(warc_file_base)s.wretch.txt"),
                ItemInterpolation("%(data_dir)s/%(warc_file_base)s.yahoo.txt"),
            ],
            rsync_target_source_path=ItemInterpolation("%(data_dir)s/"),
            rsync_extra_args=[
                "--recursive",
                "--partial",
                "--partial-dir", ".rsync-tmp"
            ]),
    ),
    SendDoneToTracker(
        tracker_url="http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
        stats=ItemValue("stats"))
)
        IOLoop.instance().add_timeout(datetime.timedelta(seconds=10),
                                      functools.partial(self._finish, item))

    def _finish(self, item):
        item.may_be_canceled = False
        self.complete_item(item)


class IdleTask(Task):
    def __init__(self):
        Task.__init__(self, 'IdleTask')

    def enqueue(self, item):
        self.start_item(item)
        item.may_be_canceled = True
        item.log_output('Pausing for 60 seconds...')
        IOLoop.instance().add_timeout(datetime.timedelta(seconds=60),
                                      functools.partial(self._finish, item))

    def _finish(self, item):
        item.may_be_canceled = False
        self.complete_item(item)


pipeline = Pipeline(
    WarningTask(),
    LimitConcurrent(1, ExternalProcess('Install', ['./install.sh'])),
    IdleTask(),
)
pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker(
        'http://{}/{}/multi={}/'.format(TRACKER_HOST, TRACKER_ID,
                                        MULTI_ITEM_SIZE),
        downloader, VERSION),
    PrepareDirectories(warc_prefix='periscope'),
    WgetDownload(
        WgetArgs(),
        max_tries=1,
        accept_on_exit_code=[0, 4, 8],
        env={
            'item_dir': ItemValue('item_dir'),
            'warc_file_base': ItemValue('warc_file_base'),
        }),
    SetBadUrls(),
    PrepareStatsForTracker(
        defaults={'downloader': downloader, 'version': VERSION},
        file_groups={
            'data': [ItemInterpolation('%(item_dir)s/%(warc_file_base)s.warc.gz')]
        },
        id_function=stats_id_function,
    ),
    MoveFiles(),
    LimitConcurrent(
        NumberConfigValue(
            min=1, max=20, default='2',
            name='shared:rsync_threads', title='Rsync threads',
            description='The maximum number of concurrent uploads.'),
        ChooseTargetAndUpload(),
    ),
    MaybeSendDoneToTracker(
        tracker_url='http://%s/%s' % (TRACKER_HOST, TRACKER_ID),
        stats=ItemValue('stats'))
)
pipeline = Pipeline(
    GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader,
                       VERSION),
    PrepareDirectories(warc_prefix="hyves"),
    ConditionalTask(
        is_domain_site,
        WgetDownload(
            wget_args,
            max_tries=5,
            accept_on_exit_code=[0, 8],
            env={'hyves_username': ItemInterpolation("%(item_name)s")}),
    ),
    ConditionalTask(
        is_not_domain_site,
        WgetDownload(
            wget_args_not_domain,
            max_tries=5,
            accept_on_exit_code=[0, 8],
            env={'hyves_username': ItemInterpolation("%(item_name)s")}),
    ),
    PrepareStatsForTracker(
        defaults={"downloader": downloader, "version": VERSION},
        file_groups={
            "data": [ItemInterpolation("%(item_dir)s/%(warc_file_base)s.warc.gz")]
        }),
    MoveFiles(),
    LimitConcurrent(
        NumberConfigValue(
            min=1, max=4, default="1",
            name="shared:rsync_threads", title="Rsync threads",
            description="The maximum number of concurrent uploads."),
        UploadWithTracker(
            "http://tracker.archiveteam.org/%s" % TRACKER_ID,
            downloader=downloader,
            version=VERSION,
            files=[
                ItemInterpolation("%(data_dir)s/%(warc_file_base)s.warc.gz")
            ],
            rsync_target_source_path=ItemInterpolation("%(data_dir)s/"),
            rsync_extra_args=[
                "--recursive",
                "--partial",
                "--partial-dir", ".rsync-tmp"
            ]),
    ),
    SendDoneToTracker(
        tracker_url="http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
        stats=ItemValue("stats"))
)
pipeline = Pipeline( GetItemFromTracker("http://tracker.archiveteam.org/%s" % TRACKER_ID, downloader, VERSION), PrepareDirectories(warc_prefix="xanga.com"), Login(), WgetDownload( [ WGET_LUA, "-U", USER_AGENT, "-nv", "-o", ItemInterpolation("%(item_dir)s/wget.log"), "--load-cookies", ItemInterpolation("%(cookie_jar)s"), "--lua-script", "xanga.lua", "--no-check-certificate", "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"), "--truncate-output", "-e", "robots=off", "--rotate-dns", "--recursive", "--level=inf", "--page-requisites", "--timeout", "60", "--tries", "20", "--waitretry", "5", "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"), "--warc-header", "operator: Archive Team", "--warc-header", "xanga-dld-script-version: " + VERSION, "--warc-header", ItemInterpolation("xanga-user: %(item_name)s"), ItemInterpolation("http://%(item_name)s.xanga.com/") ], max_tries=2, accept_on_exit_code=[0, 4, 6, 8], ), PrepareStatsForTracker( defaults={ "downloader": downloader, "version": VERSION }, file_groups={ "data": [ItemInterpolation("%(item_dir)s/%(warc_file_base)s.warc.gz")] }), MoveFiles(), LimitConcurrent( NumberConfigValue( min=1, max=4, default="1", name="shared:rsync_threads", title="Rsync threads", description="The maximum number of concurrent uploads."), UploadWithTracker( "http://tracker.archiveteam.org/%s" % TRACKER_ID, downloader=downloader, version=VERSION, files=[ ItemInterpolation("%(data_dir)s/%(warc_file_base)s.warc.gz") ], rsync_target_source_path=ItemInterpolation("%(data_dir)s/"), rsync_extra_args=[ "--recursive", "--partial", "--partial-dir", ".rsync-tmp" ]), ), SendDoneToTracker(tracker_url="http://tracker.archiveteam.org/%s" % TRACKER_ID, stats=ItemValue("stats")))
pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader,
                       VERSION),
    PrepareDirectories(warc_prefix="friendfeed"),
    WgetDownload(
        WgetArgs(),
        max_tries=2,
        accept_on_exit_code=[0, 4, 8],
        env={
            "item_dir": ItemValue("item_dir"),
            "item_value": ItemValue("item_value"),
            "item_type": ItemValue("item_type"),
        }),
    PrepareStatsForTracker(
        defaults={"downloader": downloader, "version": VERSION},
        file_groups={
            "data": [ItemInterpolation("%(item_dir)s/%(warc_file_base)s.warc.gz")]
        },
        id_function=stats_id_function,
    ),
    MoveFiles(),
    LimitConcurrent(
        NumberConfigValue(
            min=1, max=4, default="1",
            name="shared:rsync_threads", title="Rsync threads",
            description="The maximum number of concurrent uploads."),
        UploadWithTracker(
            "http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
            downloader=downloader,
            version=VERSION,
            files=[
                ItemInterpolation("%(data_dir)s/%(warc_file_base)s.warc.gz")
            ],
            rsync_target_source_path=ItemInterpolation("%(data_dir)s/"),
            rsync_extra_args=[
                "--recursive",
                "--partial",
                "--partial-dir", ".rsync-tmp",
            ]),
    ),
    SendDoneToTracker(
        tracker_url="http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
        stats=ItemValue("stats"))
)
pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader,
                       VERSION),
    PrepareDirectories(),
    WgetDownload(WgetArgs()),
    DeduplicateWarcExtProc(DedupeArgs()),
    PrepareStatsForTracker(
        defaults={"downloader": downloader, "version": VERSION},
        file_groups={
            "data": [
                ItemInterpolation("%(data_dir)s/%(item_name)s.deduplicated.warc.gz")
            ]
        },
        id_function=stats_id_function,
    ),
    MoveFiles(),
    LimitConcurrent(
        NumberConfigValue(
            min=1, max=4, default="1",
            name="shared:rsync_threads", title="Rsync threads",
            description="The maximum number of concurrent uploads."),
        UploadToIA(UploadToIAArgs()),
    ),
    DeleteFiles(),
    SendDoneToTracker(
        tracker_url="http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
        stats=ItemValue("stats"))
)
pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader,
                       VERSION),
    ExternalProcess("Size Test",
                    [RSYNC_TEST, "-t", getRsyncURL("foo"), "-m", MAX_RSYNC]),
    LimitConcurrent(
        1,
        ExternalProcess("rsync", [
            "rsync", "-av",
            getRsyncURL("foo"),
            cleanItem("%(data_dir)s/%(item_name)s")
        ])),
    ExternalProcess("tar", [
        "tar", "-czf",
        cleanItem("%(data_dir)s/%(item_name)s.tar.gz"),
        "-C", ItemInterpolation("%(data_dir)s/"),
        "--owner=1999", "--group=2015", "--no-same-permissions",
        cleanItem("%(item_name)s")
    ]),
    LimitConcurrent(
        NumberConfigValue(
            min=1, max=4, default="1",
            name="shared:rsync_threads", title="Rsync threads",
            description="The maximum number of concurrent uploads."),
        UploadWithTracker(
            "http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
            downloader=downloader,
            version=VERSION,
            files=[
                cleanItem("%(data_dir)s/%(item_name)s.tar.gz")
                # ItemInterpolation("foo.tar.gz")
            ],
            rsync_target_source_path=ItemInterpolation("%(data_dir)s/"),
            rsync_extra_args=[
                "--recursive",
                "--partial",
                "--partial-dir", ".rsync-tmp",
            ]
        ),
    ),
    PrepareStatsForTracker(
        defaults={"downloader": downloader, "version": VERSION},
        file_groups={
            "data": [
                cleanItem("%(data_dir)s/%(item_name)s.tar.gz")
            ]
        },
        id_function=stats_id_function,
    ),
    SendDoneToTracker(
        tracker_url="http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
        stats=ItemValue("stats")
    )
)
pipeline = Pipeline(
    CheckIP(),
    # GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader,
    #                    VERSION),
    SetItemKey("item_name", "smaug.fart.website:8080"),
    PrepareDirectories(warc_prefix="examplecity"),
    WgetDownload(
        WgetArgs(),
        max_tries=2,
        accept_on_exit_code=[0, 4, 7, 8],
        env={
            "item_dir": ItemValue("item_dir"),
            "downloader": downloader
        }),
    PrepareStatsForTracker(
        defaults={"downloader": downloader, "version": VERSION},
        file_groups={
            "data": [
                ItemInterpolation("%(item_dir)s/%(warc_file_base)s.warc.gz"),
            ]
        },
        id_function=stats_id_function,
    ),
    MoveFiles(),
    # LimitConcurrent(
    #     NumberConfigValue(min=1, max=4, default="1",
    #         name="shared:rsync_threads", title="Rsync threads",
    #         description="The maximum number of concurrent uploads."),
    #     UploadWithTracker(
    #         "http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
    #         downloader=downloader,
    #         version=VERSION,
    #         files=[
    #             ItemInterpolation("%(data_dir)s/%(warc_file_base)s.warc.gz"),
    #         ],
    #         rsync_target_source_path=ItemInterpolation("%(data_dir)s/"),
    #         rsync_extra_args=[
    #             "--recursive",
    #             "--partial",
    #             "--partial-dir", ".rsync-tmp",
    #         ]
    #     ),
    # ),
    # SendDoneToTracker(
    #     tracker_url="http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
    #     stats=ItemValue("stats")
    # )
    ExternalProcess("sleep", ["sleep", "60"]),
)
pipeline = Pipeline(
    GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader,
                       VERSION),
    ExtraItemParams(),
    PrepareDirectories(warc_prefix="puush"),
    SpecializedWgetDownloadMany(
        [
            WGET_LUA,
            "-U", USER_AGENT,
            "-nv",
            "-o", ItemInterpolation("%(item_dir)s/wget.log"),
            "--lua-script", "puush.lua",
            "--no-check-certificate",
            "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
            "--truncate-output",
            "-e", "robots=off",
            "--rotate-dns",
            "--timeout", "60",
            "--tries", "20",
            "--waitretry", "5",
            "--warc-file",
            ItemInterpolation("%(item_dir)s/%(current_warc_file_base)s"),
            "--warc-header", "operator: Archive Team",
            "--warc-header", "puush-dld-script-version: " + VERSION,
        ],
        URLsToDownload(),
        max_tries=20,
        accept_on_exit_code=[
            0, EXIT_STATUS_PERMISSION_DENIED, EXIT_STATUS_NOT_FOUND
        ],  # see the lua script, also MoveFiles
    ),
    MoveFiles(),
    PrepareStatsForTracker2(
        defaults={"downloader": downloader, "version": VERSION},
        file_groups={
            "data": FilesToUpload(),
        },
        id_function=prepare_stats_id_function,
    ),
    CleanUpItemDir(),
    LimitConcurrent(
        NumberConfigValue(
            min=1, max=4, default="1",
            name="shared:rsync_threads", title="Rsync threads",
            description="The maximum number of concurrent uploads."),
        ConditionalTask(
            files_to_upload,
            UploadWithTracker2(
                "http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
                downloader=downloader,
                version=VERSION,
                files=FilesToUpload(),
                rsync_target_source_path=ItemInterpolation("%(data_dir)s/"),
                rsync_extra_args=[
                    "--recursive",
                    "--partial",
                    "--partial-dir", ".rsync-tmp"
                ]
            )
        )
    ),
    SendDoneToTracker(
        tracker_url="http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
        stats=ItemValue("stats")
    )
)
pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker('http://%s/%s' % (TRACKER_HOST, TRACKER_ID),
                       downloader, VERSION),
    PrepareDirectories(warc_prefix='sketch'),
    WgetDownload(
        WgetArgs(),
        max_tries=1,
        accept_on_exit_code=[0, 4, 8],
        env={
            'item_dir': ItemValue('item_dir'),
            'item_value': ItemValue('item_value'),
            'item_type': ItemValue('item_type'),
            'warc_file_base': ItemValue('warc_file_base')
        }),
    PrepareStatsForTracker(
        defaults={'downloader': downloader, 'version': VERSION},
        file_groups={
            'data': [
                ItemInterpolation('%(item_dir)s/%(warc_file_base)s.warc.gz')
                #ItemInterpolation('%(item_dir)s/%(warc_file_base)s-deduplicated.warc.gz')
            ]
        },
        id_function=stats_id_function,
    ),
    MoveFiles(),
    LimitConcurrent(
        NumberConfigValue(
            min=1, max=20, default="20",
            name="shared:rsync_threads", title="Rsync threads",
            description="The maximum number of concurrent uploads."),
        UploadWithTracker(
            "http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
            downloader=downloader,
            version=VERSION,
            files=[
                ItemInterpolation("%(data_dir)s/%(warc_file_base)s.warc.gz"),
                ItemInterpolation("%(data_dir)s/%(warc_file_base)s_data.txt")
            ],
            rsync_target_source_path=ItemInterpolation("%(data_dir)s/"),
            rsync_extra_args=[
                "--sockopts=SO_SNDBUF=8388608,SO_RCVBUF=8388608",
                # 02:50 <Kenshin> the extra options should improve rsync speeds when the latency is higher
                "--recursive",
                "--partial",
                "--partial-dir", ".rsync-tmp",
                "--min-size", "1",
                "--no-compress",
                "--compress-level=0"
            ]),
    ),
    SendDoneToTracker(
        tracker_url='http://%s/%s' % (TRACKER_HOST, TRACKER_ID),
        stats=ItemValue('stats'))
)
pipeline = Pipeline(
    # request an item from the tracker (using the universal-tracker protocol)
    # the downloader variable will be set by the warrior environment
    #
    # this task will wait for an item and sets item["item_name"] to the item name
    # before finishing
    GetItemFromTracker(TRACKER_URL, downloader, VERSION),

    # create the directories and initialize the filenames (see above)
    # warc_prefix is the first part of the warc filename
    #
    # this task will set item["item_dir"] and item["warc_file_base"]
    PrepareDirectories(warc_prefix="greader"),

    # execute Wget+Lua
    #
    # the ItemInterpolation() objects are resolved during runtime
    # (when there is an Item with values that can be added to the strings)
    WgetDownloadWithStdin(
        [
            # TODO: cert-pin
            WGET_LUA,
            "-U", ItemInterpolation("%(user_agent)s"),
            "-nv",
            "-o", ItemInterpolation("%(item_dir)s/wget.log"),
            "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
            "--truncate-output",
            "-e", "robots=off",
            "--rotate-dns",
            "--timeout", ItemInterpolation("%(wget_timeout)s"),
            "--tries", ItemInterpolation("%(wget_tries)s"),
            "--waitretry", ItemInterpolation("%(wget_waitretry)s"),
            "--lua-script", "greader.lua",
            "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
            "--warc-header", "operator: Archive Team",
            "--warc-header", "greader-dld-script-version: " + VERSION,
            "--input", "-"
        ],
        max_tries=2,
        accept_on_exit_code=[0, 8],  # which Wget exit codes count as a success?
        env=dict(SSL_CERT_DIR=SSL_CERT_DIR),
        stdin_data_function=(
            lambda item: "\n".join(u.encode("utf-8") for u in item["task_urls"]) + "\n"),
    ),

    # this will set the item["stats"] string that is sent to the tracker (see below)
    PrepareStatsForTracker(
        # there are a few normal values that need to be sent
        defaults={"downloader": downloader, "version": VERSION},
        # this is used for the size counter on the tracker:
        # the groups should correspond with the groups configured on the tracker
        file_groups={
            # there can be multiple groups with multiple files
            # file sizes are measured per group
            "data": [ItemInterpolation("%(item_dir)s/%(warc_file_base)s.warc.gz")]
        },
        id_function=(lambda item: {"ua": item["user_agent"]})
    ),

    # remove the temporary files, move the warc file from
    # item["item_dir"] to item["data_dir"]
    MoveFiles(),

    # there can be multiple items in the pipeline, but this wrapper ensures
    # that there is only one item uploading at a time
    #
    # the NumberConfigValue can be changed in the configuration panel
    LimitConcurrent(
        NumberConfigValue(
            min=1, max=4, default="1",
            name="shared:rsync_threads", title="Rsync threads",
            description="The maximum number of concurrent uploads."),
        # this upload task asks the tracker for an upload target
        # this can be HTTP or rsync and can be changed in the tracker admin panel
        UploadWithTracker(
            TRACKER_URL,
            downloader=downloader,
            version=VERSION,
            # list the files that should be uploaded.
            # this may include directory names.
            # note: HTTP uploads will only upload the first file on this list
            files=[
                ItemInterpolation("%(data_dir)s/%(warc_file_base)s.warc.gz")
            ],
            # the relative path for the rsync command
            # (this defines if the files are uploaded to a subdirectory on the server)
            rsync_target_source_path=ItemInterpolation("%(data_dir)s/"),
            # extra rsync parameters (probably standard)
            rsync_extra_args=[
                "--recursive",
                "--partial",
                "--partial-dir", ".rsync-tmp"
            ]
        ),
    ),

    # if the item passed every task, notify the tracker and report the statistics
    SendDoneToTracker(
        tracker_url=TRACKER_URL,
        stats=ItemValue("stats")
    )
)
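
# The comments in the greader pipeline above describe how tasks store values on
# the item (e.g. item["item_dir"], item["warc_file_base"]) that later tasks read
# back through ItemInterpolation("%(key)s") or ItemValue("key"). A minimal
# sketch of that mechanism, assuming the seesaw SimpleTask base class used in
# the tests earlier in this file; the task name "SetExampleKeys" and the key it
# sets are hypothetical and not part of any project above.
class SetExampleKeys(SimpleTask):
    def __init__(self):
        SimpleTask.__init__(self, "SetExampleKeys")

    def process(self, item):
        # Keys assigned here become available to later tasks in the same
        # pipeline, e.g. ItemInterpolation("%(warc_prefix)s-%(item_name)s")
        # or ItemValue("warc_prefix").
        item["warc_prefix"] = "example"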
    finished_warcs_dir=os.environ["FINISHED_WARCS_DIR"],
    warc_max_size=WARC_MAX_SIZE,
    monitor_disk=WPULL_MONITOR_DISK,
    monitor_memory=WPULL_MONITOR_MEMORY,
)

pipeline = Pipeline(
    CheckIP(),
    CheckLocalWebserver(),
    GetItemFromQueue(control, pipeline_id, downloader,
                     ao_only=env.get('AO_ONLY'),
                     large=env.get('LARGE'),
                     version_check=(VERSION, pipeline_version)),
    StartHeartbeat(control),
    SetFetchDepth(),
    PreparePaths(),
    WriteInfo(),
    DownloadUrlFile(control),
    Wpull(wpull_args,
          accept_on_exit_code=AcceptAny(),
          env={
              'ITEM_IDENT': ItemInterpolation('%(ident)s'),
              'LOG_KEY': ItemInterpolation('%(log_key)s'),
              'REDIS_URL': REDIS_URL,
              'PATH': os.environ['PATH']
          }),
    RelabelIfAborted(control),
    CompressLogIfFailed(),
    WriteInfo(),
    MoveFiles(target_directory=os.environ["FINISHED_WARCS_DIR"]),
    StopHeartbeat(),
    MarkItemAsDone(control, EXPIRE_TIME))


def stop_control():
    #control.flag_logging_thread_for_termination()
    control.unregister_pipeline(pipeline_id)
        globals()['downloader'],
        globals().get('bind_address', ''),
        USER_AGENT
    ],
    env=env)

project = Project(
    title="URLTeam 2",
    project_html="""
    <img class="project-logo" alt="" src="http://archiveteam.org/images/9/9d/Urlteam-logo-reasonable-size.png" height="50" title="url shortening was a f*****g awful idea" />
    <h2>URLTeam 2
        <span class="links">
            <a href="http://urlte.am/">Website</a> ·
            <a href="http://%s/">Leaderboard</a> ·
            <a href="http://archiveteam.org/index.php?title=URLTeam">Wiki</a>
        </span>
    </h2>
    <p>The Terror of Tiny Town</p>
    """ % (TRACKER_HOST))

tasks = [CheckIP(), RunScraper()]

if globals().get('no_submodule'):
    print('Not updating submodule')
else:
    tasks.insert(0, MaybeUpdateSubmodule())

pipeline = Pipeline(*tasks)
pipeline = Pipeline(
    CheckIP(),
    GetItemFromQueue(control, pipeline_id, downloader,
                     ao_only=env.get('AO_ONLY')),
    StartHeartbeat(control),
    SetFetchDepth(),
    PreparePaths(),
    WriteInfo(),
    DownloadUrlFile(control),
    WgetDownload(
        wpull_args,
        accept_on_exit_code=AcceptAny(),
        env={
            'ITEM_IDENT': ItemInterpolation('%(ident)s'),
            'LOG_KEY': ItemInterpolation('%(log_key)s'),
            'REDIS_URL': REDIS_URL,
            'PATH': os.environ['PATH']
        }
    ),
    RelabelIfAborted(control),
    WriteInfo(),
    MoveFiles(),
    LimitConcurrent(
        2,
        RsyncUpload(
            target=RSYNC_URL,
            target_source_path=ItemInterpolation("%(data_dir)s"),
            files=ItemValue("all_target_files"),
            extra_args=[
                '--partial',
                '--partial-dir', '.rsync-tmp'
            ]
        )
    ),
    StopHeartbeat(),
    MarkItemAsDone(control, EXPIRE_TIME)
)
    phantomjs_exe=PHANTOMJS,
    finished_warcs_dir=os.environ["FINISHED_WARCS_DIR"],
    warc_max_size=WARC_MAX_SIZE)

pipeline = Pipeline(
    CheckIP(),
    GetItemFromQueue(control, pipeline_id, downloader,
                     ao_only=env.get('AO_ONLY'), large=env.get('LARGE')),
    StartHeartbeat(control),
    SetFetchDepth(),
    PreparePaths(),
    WriteInfo(),
    DownloadUrlFile(control),
    WgetDownload(wpull_args,
                 accept_on_exit_code=AcceptAny(),
                 env={
                     'ITEM_IDENT': ItemInterpolation('%(ident)s'),
                     'LOG_KEY': ItemInterpolation('%(log_key)s'),
                     'REDIS_URL': REDIS_URL,
                     'PATH': os.environ['PATH']
                 }),
    RelabelIfAborted(control),
    WriteInfo(),
    MoveFiles(),
    LimitConcurrent(
        2,
        RsyncUpload(target=RSYNC_URL,
                    target_source_path=ItemInterpolation("%(data_dir)s"),
                    files=ItemValue("all_target_files"),
                    extra_args=['--partial', '--partial-dir', '.rsync-tmp'])),
    StopHeartbeat(),
    MarkItemAsDone(control, EXPIRE_TIME))


def stop_control():
    #control.flag_logging_thread_for_termination()
                                      functools.partial(self._finish, item))

    def _finish(self, item):
        item.may_be_canceled = False
        self.complete_item(item)


class IdleTask(Task):
    def __init__(self):
        Task.__init__(self, 'IdleTask')

    def enqueue(self, item):
        self.start_item(item)
        item.may_be_canceled = True
        item.log_output('Pausing for 60 seconds...')
        IOLoop.instance().add_timeout(datetime.timedelta(seconds=60),
                                      functools.partial(self._finish, item))

    def _finish(self, item):
        item.may_be_canceled = False
        self.complete_item(item)


pipeline = Pipeline(
    WarningTask(),
    LimitConcurrent(
        1, ExternalProcess('Install Python 3.5', ['install-python3.5.sh'])),
    IdleTask(),
)
pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker('http://%s/%s' % (TRACKER_HOST, TRACKER_ID),
                       downloader, VERSION),
    PrepareDirectories(warc_prefix='bitbucket'),
    WgetDownload(
        WgetArgs(),
        max_tries=2,
        accept_on_exit_code=[0, 4, 8],
        env={
            'item_dir': ItemValue('item_dir'),
            'item_value': ItemValue('item_value'),
            'item_type': ItemValue('item_type'),
            'warc_file_base': ItemValue('warc_file_base'),
        }),
    PrepareStatsForTracker(
        defaults={'downloader': downloader, 'version': VERSION},
        file_groups={
            'data': [ItemInterpolation('%(item_dir)s/%(warc_file_base)s.warc.zst')]
        },
        id_function=stats_id_function,
    ),
    MoveFiles(),
    LimitConcurrent(
        NumberConfigValue(
            min=1, max=20, default='2',
            name='shared:rsync_threads', title='Rsync threads',
            description='The maximum number of concurrent uploads.'),
        UploadWithTracker(
            'http://%s/%s' % (TRACKER_HOST, TRACKER_ID),
            downloader=downloader,
            version=VERSION,
            files=[
                ItemInterpolation(
                    '%(data_dir)s/%(warc_file_base)s.%(dict_project)s.%(dict_id)s.warc.zst'),
                ItemInterpolation('%(data_dir)s/%(warc_file_base)s_data.txt')
            ],
            rsync_target_source_path=ItemInterpolation('%(data_dir)s/'),
            rsync_extra_args=[
                '--recursive',
                '--partial',
                '--partial-dir', '.rsync-tmp',
                '--min-size', '1',
                '--no-compress',
                '--compress-level', '0'
            ]),
    ),
    SendDoneToTracker(
        tracker_url='http://%s/%s' % (TRACKER_HOST, TRACKER_ID),
        stats=ItemValue('stats'))
)
        'python_version': sys.version,
    }

    return d

###########################################################################
# Initialize the project.
#
# This will be shown in the warrior management panel. The logo should not
# be too big. The deadline is optional.
project = Project(
    title='load',
    project_html='''
    <img class="project-logo" alt="Project logo" src="https://www.archiveteam.org/images/b/b5/Reddit_logo.png" height="50px" title=""/>
    <h2>reddit.com <span class="links"><a href="https://reddit.com/">Website</a> · <a href="http://tracker.archiveteam.org/reddit/">Leaderboard</a></span></h2>
    <p>Archiving everything from reddit.</p>
    '''
)

pipeline = Pipeline(
    GetItemFromTracker('http://%s/%s' % (TRACKER_HOST, TRACKER_ID),
                       downloader, VERSION),
    PrepareStatsForTracker(
        defaults={'downloader': downloader, 'version': VERSION},
        file_groups={'data': ['/dev/null']},
        id_function=stats_id_function,
    ),
    SendDoneToTracker(
        tracker_url='http://%s/%s' % (TRACKER_HOST, TRACKER_ID),
        stats=ItemValue('stats'))
)