def test_max_items_with_subproc(self):
    """A runner with max_items=3 must stop after exactly 3 items, even
    when the pipeline ends in an external subprocess task."""
    pipeline = Pipeline(PrintItem(), PrintItem(), ExternalProcess("pwd", ["pwd"]))
    # Tri-state flag: stays None unless the fail callback fires.
    pipeline.has_failed = None

    def fail_callback(task, item):
        pipeline.has_failed = True

    pipeline.on_fail_item += fail_callback

    runner = SimpleRunner(pipeline, max_items=3)

    # Safety valve: abort loudly if the runner overshoots its item limit
    # instead of hanging or spinning forever.
    def finish_item_callback(runner, pipeline, item):
        if runner.item_count > 10:
            raise Exception('Too many items.')

    runner.on_pipeline_finish_item += finish_item_callback
    runner.start()

    self.assertFalse(pipeline.has_failed)
    self.assertEqual(3, runner.item_count)
    self.assertIOLoopOK()
<a href="http://tracker.archiveteam.org/quizilladisco/">Leaderboard</a> <a href="http://archiveteam.org/index.php?title=Quizilla">Wiki</a> · </span> </h2> <p>Quizilla shuts down. This is phase 1: content discovery.</p> """, utc_deadline=datetime.datetime(2014, 10, 1, 23, 59, 0)) pipeline = Pipeline( CheckIP(), GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader, VERSION), PrepareDirectories(warc_prefix="quizilladisco"), ExternalProcess('Scraper', CustomProcessArgs(), max_tries=2, accept_on_exit_code=[0], env={"item_dir": ItemValue("item_dir")}), PrepareStatsForTracker( defaults={ "downloader": downloader, "version": VERSION }, file_groups={ "data": [ItemInterpolation("%(item_dir)s/%(warc_file_base)s.txt.gz")] }, id_function=stats_id_function, ), MoveFiles(), LimitConcurrent( NumberConfigValue(
"warc_file_base": ItemValue("warc_file_base"), }), PrepareStatsForTracker( defaults={ "downloader": downloader, "version": VERSION }, file_groups={ "data": [ItemInterpolation("%(item_dir)s/%(warc_file_base)s.warc.gz")] }, id_function=stats_id_function, ), MoveFiles(), ExternalProcess("rsync", [ "rsync", "-avz", "--progress", ItemInterpolation("%(data_dir)s/%(warc_file_base)s_data.txt"), "rsync://storage.harrycross.me/dev/bayimg" ]), LimitConcurrent( NumberConfigValue( min=1, max=4, default="1", name="shared:rsync_threads", title="Rsync threads", description="The maximum number of concurrent uploads."), UploadWithTracker( "http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader=downloader, version=VERSION, files=[
project = Project(title="Yahoo! Blog & Wretch Username", project_html=""" <img class="project-logo" alt="" src="http://archiveteam.org/images/7/76/Archiveteam1.png" height="50" /> <h2>Yahoo! Blog & Wretch <span class="links"><a href="http://blog.yahoo.com/">Yahoo! Blog</a> · <a href="http://www.wretch.cc/">Wretch</a> · <a href="http://%s/%s/">Leaderboard</a></span></h2> <p><b>Yahoo!</b> is a horrible monster.</p> """ % (TRACKER_HOST, TRACKER_ID), utc_deadline=datetime.datetime(2013, 12, 26, 00, 00, 1)) pipeline = Pipeline( GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader, VERSION), PrepareDirectories(warc_prefix="ybw-username"), ExternalProcess( 'Scraper', [ "python", "scraper.py", ItemInterpolation("%(item_name_punycode)s"), ItemInterpolation("%(item_dir)s/%(warc_file_base)s") ], env={'SCRAPER_BIND_ADDRESS': globals().get('bind_address', '')}), PrepareStatsForTracker( defaults={ "downloader": downloader, "version": VERSION }, file_groups={ "data": [ ItemInterpolation( "%(item_dir)s/%(warc_file_base)s.wretch.txt"), ItemInterpolation("%(item_dir)s/%(warc_file_base)s.yahoo.txt"), ] }), MoveFiles(),
# # This will be shown in the warrior management panel. The logo should not # be too big. The deadline is optional. project = Project( title="sourceforgersync", project_html=""" <img class="project-logo" alt="Project logo" src="" height="50px" title=""/> <h2>sourceforge.net <span class="links"><a href="http://sourceforge.net/">Website</a> · <a href="http://tracker.archiveteam.org/sourceforge/">Leaderboard</a></span></h2> <p>Saving all project from SourceForge. rsyncing all of the source code repositories.</p> """ ) pipeline = Pipeline( CheckIP(), GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader, VERSION), ExternalProcess("Size Test",[RSYNC_TEST,"-t",getRsyncURL("foo"),"-m",MAX_RSYNC]), LimitConcurrent(1,ExternalProcess("rsync", ["rsync", "-av", getRsyncURL("foo"), cleanItem("%(data_dir)s/%(item_name)s")])), ExternalProcess("tar", ["tar", "-czf", cleanItem("%(data_dir)s/%(item_name)s.tar.gz"), "-C", ItemInterpolation("%(data_dir)s/"), "--owner=1999", "--group=2015", "--no-same-permissions", cleanItem("%(item_name)s")]), LimitConcurrent(NumberConfigValue(min=1, max=4, default="1", name="shared:rsync_threads", title="Rsync threads", description="The maximum number of concurrent uploads."), UploadWithTracker( "http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader=downloader, version=VERSION, files=[ cleanItem("%(data_dir)s/%(item_name)s.tar.gz") #ItemInterpolation("foo.tar.gz") ], rsync_target_source_path=ItemInterpolation("%(data_dir)s/"), rsync_extra_args=[
IOLoop.instance().add_timeout(datetime.timedelta(seconds=10), functools.partial(self._finish, item)) def _finish(self, item): item.may_be_canceled = False self.complete_item(item) class IdleTask(Task): def __init__(self): Task.__init__(self, 'IdleTask') def enqueue(self, item): self.start_item(item) item.may_be_canceled = True item.log_output('Pausing for 60 seconds...') IOLoop.instance().add_timeout(datetime.timedelta(seconds=60), functools.partial(self._finish, item)) def _finish(self, item): item.may_be_canceled = False self.complete_item(item) pipeline = Pipeline( WarningTask(), LimitConcurrent(1, ExternalProcess('Install', ['./install.sh'])), IdleTask(), )
functools.partial(self._finish, item)) def _finish(self, item): item.may_be_canceled = False self.complete_item(item) class IdleTask(Task): def __init__(self): Task.__init__(self, 'IdleTask') def enqueue(self, item): self.start_item(item) item.may_be_canceled = True item.log_output('Pausing for 60 seconds...') IOLoop.instance().add_timeout(datetime.timedelta(seconds=60), functools.partial(self._finish, item)) def _finish(self, item): item.may_be_canceled = False self.complete_item(item) pipeline = Pipeline( WarningTask(), LimitConcurrent( 1, ExternalProcess('Install Python 3.5', ['install-python3.5.sh'])), IdleTask(), )
] }, id_function=stats_id_function, ), MoveFiles(), # LimitConcurrent( # NumberConfigValue(min=1, max=4, default="1", # name="shared:rsync_threads", title="Rsync threads", # description="The maximum number of concurrent uploads."), # UploadWithTracker( # "http://%s/%s" % (TRACKER_HOST, TRACKER_ID), # downloader=downloader, # version=VERSION, # files=[ # ItemInterpolation("%(data_dir)s/%(warc_file_base)s.warc.gz"), # ], # rsync_target_source_path=ItemInterpolation("%(data_dir)s/"), # rsync_extra_args=[ # "--recursive", # "--partial", # "--partial-dir", ".rsync-tmp", # ] # ), # ), # SendDoneToTracker( # tracker_url="http://%s/%s" % (TRACKER_HOST, TRACKER_ID), # stats=ItemValue("stats") # ) ExternalProcess("sleep", ["sleep", "60"]), )
<p>Downloading FurAffinity</p> <!--<p class="projectBroadcastMessage"></p>--> """, # utc_deadline=datetime.datetime(2000, 1, 1, 23, 59, 0) ) pipeline = Pipeline( CheckIP(), GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader, VERSION), PrepareDirectories(warc_prefix="furaffinity"), ExternalProcess( 'Begin', [sys.executable, 'helper.py', 'begin'], env={ 'user_agent': user_agent, 'bind_address': globals().get('bind_address', ''), 'disco_tracker': DISCO_TRACKER_URL, "item_dir": ItemValue("item_dir"), }, accept_on_exit_code=[0], ), LimitConcurrent( NumberConfigValue( min=1, max=6, default=globals().get("num_procs", "1"), name="shared:fagrab:num_procs", title="Number of Processes", description="The maximum number of concurrent download processes." ), WgetDownload(WgetArgs(),