<img class="project-logo" alt="logo" src="https://wiki.archiveteam.org/images/Archiveteamsmall.png?959ea" height="50px"/> <h2>Super Mario Maker Bookmarks <span class="links"><a href="https://supermariomakerbookmark.nintendo.net/">Website</a> · <a href="http://tracker.archiveteam.org/super-mario-maker-bookmarks/">Leaderboard</a></span></h2> ''', utc_deadline=datetime.datetime.fromtimestamp(1617148800)) pipeline = Pipeline( CheckIP(), GetItemFromTracker( 'http://{}/{}/multi={}/'.format(TRACKER_HOST, TRACKER_ID, MULTI_ITEM_SIZE), downloader, VERSION), PrepareDirectories(warc_prefix='super-mario-maker-bookmarks'), WgetDownload(WgetArgs(), max_tries=1, accept_on_exit_code=[0, 4, 8], env={ 'item_dir': ItemValue('item_dir'), 'warc_file_base': ItemValue('warc_file_base'), 'item_name_newline': ItemValue('item_name_newline'), }), PrepareStatsForTracker( defaults={ 'downloader': downloader, 'version': VERSION }, file_groups={ 'data': [ItemInterpolation('%(item_dir)s/%(warc_file_base)s.warc.gz')] }, id_function=stats_id_function, ), MoveFiles(), LimitConcurrent(
        </span>
    </h2>
    <p>Quizilla shuts down. This is phase 1: content discovery.</p>
""", utc_deadline=datetime.datetime(2014, 10, 1, 23, 59, 0))

pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader,
        VERSION),
    PrepareDirectories(warc_prefix="quizilladisco"),
    ExternalProcess('Scraper', CustomProcessArgs(),
        max_tries=2,
        accept_on_exit_code=[0],
        env={"item_dir": ItemValue("item_dir")}),
    PrepareStatsForTracker(
        defaults={"downloader": downloader, "version": VERSION},
        file_groups={
            "data": [ItemInterpolation("%(item_dir)s/%(warc_file_base)s.txt.gz")]
        },
        id_function=stats_id_function,
    ),
    MoveFiles(),
    LimitConcurrent(
        NumberConfigValue(min=1, max=4,
project_html=""" <img class="project-logo" alt="Project logo" src="http://archiveteam.org/images/thumb/b/bd/Great_Seal_of_the_United_States.png/240px-Great_Seal_of_the_United_States.png" height="50px" title=""/> <h2>archives.gov <span class="links"><a href="http://archives.gov/">Website</a> · <a href="http://tracker.archiveteam.org/archives-gov/">Leaderboard</a></span></h2> <p>Archiving data from archives.gov.</p> """) pipeline = Pipeline( CheckIP(), GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader, VERSION), PrepareDirectories(warc_prefix="archives-gov"), WgetDownload(WgetArgs(), max_tries=2, accept_on_exit_code=[0, 4, 8], env={ "item_dir": ItemValue("item_dir"), "item_value": ItemValue("item_value"), "item_type": ItemValue("item_type"), "warc_file_base": ItemValue("warc_file_base"), }), Deduplicate(), PrepareStatsForTracker( defaults={ "downloader": downloader, "version": VERSION }, file_groups={ "data": [ ItemInterpolation( "%(item_dir)s/%(warc_file_base)s-deduplicated.warc.gz") ]
        </span>
    </h2>
    '''
)

pipeline = Pipeline(
    CheckIP(),
    CheckBan(),
    GetItemFromTracker('http://%s/%s' % (TRACKER_HOST, TRACKER_ID), downloader,
        VERSION),
    PrepareDirectories(warc_prefix='yourshot-static'),
    WgetDownload(
        WgetArgs(),
        max_tries=0,  # 2,  # changed
        accept_on_exit_code=[0],  # [0, 4, 8],  # changed
        env={
            'item_dir': ItemValue('item_dir'),
            'item_value': ItemValue('item_value'),
            'item_type': ItemValue('item_type'),
            'warc_file_base': ItemValue('warc_file_base'),
            'todo_url_count': ItemValue('todo_url_count'),
        }
    ),
    PrepareStatsForTracker(
        defaults={'downloader': downloader, 'version': VERSION},
        file_groups={
            'data': [
                ItemInterpolation('%(item_dir)s/%(warc_file_base)s.warc.gz')  # TODO ?
            ]
        },
        id_function=stats_id_function,
    ),
<a href="http://{0}/{1}/">Leaderboard</a> </span> </h2> '''.format(TRACKER_HOST, TRACKER_ID)) pipeline = Pipeline( CheckIP(), GetItemFromTracker('http://%s/%s' % (TRACKER_HOST, TRACKER_ID), downloader, VERSION), PrepareDirectories(warc_prefix='8tracks'), WgetDownload( WgetArgs(), max_tries=1, accept_on_exit_code=[0], #, 4, 8], env={ 'item_dir': ItemValue('item_dir'), 'item_value': ItemValue('item_value'), 'item_type': ItemValue('item_type'), 'warc_file_base': ItemValue('warc_file_base'), 'downloader': downloader, 'url_count_target': ItemValue('url_count_target'), }), PrepareStatsForTracker( defaults={ 'downloader': downloader, 'version': VERSION }, file_groups={ 'data': [ItemInterpolation('%(item_dir)s/%(warc_file_base)s.warc.gz')] },
project = Project(
    title="friendfeed",
    project_html="""
    <img class="project-logo" alt="Project logo" src="http://archiveteam.org/images/8/83/Friendfeed_logo.png" height="50px" title=""/>
    <h2>friendfeed.com <span class="links"><a href="http://friendfeed.com/">Website</a> ·
        <a href="http://tracker.archiveteam.org/friendfeed/">Leaderboard</a></span></h2>
    <p>Grabbing all accounts from friendfeed.com.</p>
""")

pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader,
        VERSION),
    PrepareDirectories(warc_prefix="friendfeed"),
    WgetDownload(
        WgetArgs(),
        max_tries=2,
        accept_on_exit_code=[0, 4, 8],
        env={
            "item_dir": ItemValue("item_dir"),
            "item_value": ItemValue("item_value"),
            "item_type": ItemValue("item_type"),
        }),
    PrepareStatsForTracker(
        defaults={"downloader": downloader, "version": VERSION},
        file_groups={
            "data": [ItemInterpolation("%(item_dir)s/%(warc_file_base)s.warc.gz")]
        },
        id_function=stats_id_function,
    ),
    MoveFiles(),
    LimitConcurrent(
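# MoveFiles, used in most of these pipelines, moves the finished WARC out of
# the per-item scratch directory into data_dir so the upload tasks can find it.
# A minimal sketch of the common pattern; the exact set of files moved varies
# per project.
import os
import shutil

from seesaw.task import SimpleTask


class MoveFiles(SimpleTask):
    def __init__(self):
        SimpleTask.__init__(self, 'MoveFiles')

    def process(self, item):
        # Move the WARC next to the other finished items, then drop the scratch dir.
        os.rename('%(item_dir)s/%(warc_file_base)s.warc.gz' % item,
                  '%(data_dir)s/%(warc_file_base)s.warc.gz' % item)
        shutil.rmtree('%(item_dir)s' % item)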
<a href="http://tracker.archiveteam.org/ftp/">Leaderboard</a></span></h2> <p>Archiving all FTPs!</p> """ ) pipeline = Pipeline( CheckIP(), GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader, VERSION), PrepareDirectories(warc_prefix="ftp"), WgetDownload( WgetArgs(), max_tries=1, accept_on_exit_code=[0, 8], env={ "item_dir": ItemValue("item_dir"), "item_item": ItemValue("item_item"), "downloader": downloader } ), PrepareStatsForTracker( defaults={"downloader": downloader, "version": VERSION}, file_groups={ "data": [ ItemInterpolation("%(item_dir)s/%(warc_file_base)s.warc.gz"), ] }, id_function=stats_id_function, ), MoveFiles(), LimitConcurrent(
    <p>Archiving public Telegram channels.</p>
    '''
)

pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker('http://{}/{}/multi={}/'
        .format(TRACKER_HOST, TRACKER_ID, MULTI_ITEM_SIZE),
        downloader, VERSION),
    PrepareDirectories(warc_prefix=TRACKER_ID),
    WgetDownload(
        WgetArgs(),
        max_tries=2,
        accept_on_exit_code=[0, 4, 8],
        env={
            'item_dir': ItemValue('item_dir'),
            'warc_file_base': ItemValue('warc_file_base')
        }
    ),
    SetBadUrls(),
    PrepareStatsForTracker(
        defaults={'downloader': downloader, 'version': VERSION},
        file_groups={
            'data': [
                ItemInterpolation('%(item_dir)s/%(warc_file_base)s.warc.zst')
            ]
        },
        id_function=stats_id_function,
    ),
    MoveFiles(),
    LimitConcurrent(NumberConfigValue(min=1, max=20, default='20',
    <h2>twitpic.com <span class="links"><a href="http://twitpic.com/">Website</a> ·
        <a href="http://tracker.archiveteam.org/twitpic2/">Leaderboard</a></span></h2>
    <p>Saving TwitPic's smoldering remains.</p>
    <!--<p class="projectBroadcastMessage"></p>-->
    """,
    # utc_deadline=datetime.datetime(2014, 9, 25, 23, 59, 0)
)

pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader,
        VERSION),
    PrepareDirectories(warc_prefix="twitpic2"),
    WgetDownload(
        WgetArgs(),
        max_tries=2,
        accept_on_exit_code=[0, 4, 7, 8],
        env={
            "item_dir": ItemValue("item_dir"),
            "item_value": ItemValue("item_value"),
            "item_type": ItemValue("item_type"),
            "escaped_item_name": ItemValue("escaped_item_name"),
            "downloader": downloader
        }),
    ProcessScrapeFile(),
    PrepareStatsForTracker(
        defaults={"downloader": downloader, "version": VERSION},
        file_groups={
            "data": [
                ItemInterpolation("%(item_dir)s/%(warc_file_base)s.warc.gz"),
                ItemInterpolation(
                    "%(item_dir)s/twitpic2-scrape-%(escaped_item_name)s.txt.gz"
    <p><b>Canv.as</b> is closed.</p>
    """ % (TRACKER_HOST, TRACKER_ID),
    utc_deadline=datetime.datetime(2014, 3, 3, 0, 0, 1)
)

pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader,
        VERSION),
    PrepareDirectories(warc_prefix="canvas"),
    WgetDownload(
        WgetArgs(),
        max_tries=5,
        accept_on_exit_code=[0, 8],
        env={
            'item_type': ItemValue("item_type"),
            'item_data': ItemValue("item_data"),
            'item_dir': ItemValue("item_dir"),
        }
    ),
    PrepareStatsForTracker(
        defaults={"downloader": downloader, "version": VERSION},
        file_groups={
            "data": [
                ItemInterpolation("%(item_dir)s/%(warc_file_base)s.warc.gz")
            ]
        },
        id_function=stats_id_function,
    ),
    MoveFiles(),
    LimitConcurrent(NumberConfigValue(min=1, max=4, default="1",
project_html=""" <img class="project-logo" alt="Project logo" src="http://rapidshare.com/files/251393042" height="50px" title=""/> <h2>www.rapidshare.com <span class="links"><a href="https://www.rapidshare.com/">Website</a> · <a href="http://tracker.archiveteam.org/rapidshare/">Leaderboard</a></span></h2> <p>Grabbing files from RapidShare.</p> """, utc_deadline=datetime.datetime(2015, 3, 31, 23, 59, 0)) pipeline = Pipeline( CheckIP(), GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader, VERSION), PrepareDirectories(warc_prefix="rapidshare"), WgetDownload(WgetArgs(), max_tries=1, accept_on_exit_code=[0, 4, 8], env={ "item_dir": ItemValue("item_dir"), "item_value": ItemValue("item_value"), "item_type": ItemValue("item_type"), "item_id": ItemValue("item_id"), "item_keyword": ItemValue("item_keyword"), }), PrepareStatsForTracker( defaults={ "downloader": downloader, "version": VERSION }, file_groups={ "data": [ItemInterpolation("%(item_dir)s/%(warc_file_base)s.warc.gz")] }, id_function=stats_id_function,
    <p>This is an example wpull project</p>
    <!--<p class="projectBroadcastMessage"></p>-->
    """,
    utc_deadline=datetime.datetime(2000, 1, 1, 23, 59, 0))

pipeline = Pipeline(
    CheckIP(),
    # GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader,
    #     VERSION),
    SetItemKey("item_name", "smaug.fart.website:8080"),
    PrepareDirectories(warc_prefix="examplecity"),
    WgetDownload(
        WgetArgs(),
        max_tries=2,
        accept_on_exit_code=[0, 4, 7, 8],
        env={
            "item_dir": ItemValue("item_dir"),
            "downloader": downloader
        }),
    PrepareStatsForTracker(
        defaults={"downloader": downloader, "version": VERSION},
        file_groups={
            "data": [
                ItemInterpolation("%(item_dir)s/%(warc_file_base)s.warc.gz"),
            ]
        },
        id_function=stats_id_function,
    ),
    MoveFiles(),
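# In this example pipeline the tracker task is commented out and SetItemKey
# hard-codes a single test item instead. SetItemKey is a small local SimpleTask;
# a sketch of the usual definition:
from seesaw.task import SimpleTask


class SetItemKey(SimpleTask):
    def __init__(self, key, value):
        SimpleTask.__init__(self, 'SetItemKey')
        self.key = key
        self.value = value

    def process(self, item):
        # Give the item a fixed value instead of one handed out by the tracker.
        item[self.key] = self.value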
    <h2>Twitch Phase 1: Content Discovery.
        <span class="links"><a href="http://twitch.tv/">Website</a> ·
        <a href="http://tracker.archiveteam.org/twitchdisco/">Leaderboard</a></span></h2>
    <p>Twitch is releasing videos from their PC.
        <a href="https://archive.org/donate/">Donate to IA for disk space!</a></p>
    """,
    utc_deadline=datetime.datetime(2014, 8, 20, 23, 59, 0)
)

pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader,
        VERSION),
    PrepareDirectories(warc_prefix="twitchdisco"),
    ExternalProcess('Scraper', CustomProcessArgs(),
        max_tries=2,
        accept_on_exit_code=[0],
        env={
            "item_dir": ItemValue("item_dir")
        }
    ),
    PrepareStatsForTracker(
        defaults={"downloader": downloader, "version": VERSION},
        file_groups={
            "data": [
                ItemInterpolation("%(item_dir)s/%(warc_file_base)s.txt.gz")
            ]
        },
        id_function=stats_id_function,
    ),
    MoveFiles(),
    LimitConcurrent(
        NumberConfigValue(min=1, max=4, default="1",
            name="shared:rsync_threads", title="Rsync threads",
            description="The maximum number of concurrent uploads."),
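# CustomProcessArgs (used here and in the Quizilla excerpt above) is defined
# elsewhere in each discovery pipeline; ExternalProcess only needs a task name
# and an argument list. A hypothetical sketch of such a helper; scraper.py and
# its behaviour are assumptions, not the real discovery script.
import sys


def CustomProcessArgs():
    # The discovery script reads item_dir from the env= mapping that the
    # ExternalProcess task passes through, so the command line can stay short.
    return [sys.executable, 'scraper.py']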
""", # utc_deadline=datetime.datetime(2000, 1, 1, 23, 59, 0) ) pipeline = Pipeline( CheckIP(), GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader, VERSION), PrepareDirectories(warc_prefix="furaffinity"), ExternalProcess( 'Begin', [sys.executable, 'helper.py', 'begin'], env={ 'user_agent': user_agent, 'bind_address': globals().get('bind_address', ''), 'disco_tracker': DISCO_TRACKER_URL, "item_dir": ItemValue("item_dir"), }, accept_on_exit_code=[0], ), LimitConcurrent( NumberConfigValue( min=1, max=6, default=globals().get("num_procs", "1"), name="shared:fagrab:num_procs", title="Number of Processes", description="The maximum number of concurrent download processes." ), WgetDownload(WgetArgs(), max_tries=1, accept_on_exit_code=[0, 4, 7, 8],
        ao_only=env.get('AO_ONLY'),
        large=env.get('LARGE')),
    StartHeartbeat(control),
    SetFetchDepth(),
    PreparePaths(),
    WriteInfo(),
    DownloadUrlFile(control),
    WgetDownload(wpull_args,
        accept_on_exit_code=AcceptAny(),
        env={
            'ITEM_IDENT': ItemInterpolation('%(ident)s'),
            'LOG_KEY': ItemInterpolation('%(log_key)s'),
            'REDIS_URL': REDIS_URL,
            'PATH': os.environ['PATH']
        }),
    RelabelIfAborted(control),
    WriteInfo(),
    MoveFiles(),
    LimitConcurrent(2,
        RsyncUpload(
            target=RSYNC_URL,
            target_source_path=ItemInterpolation("%(data_dir)s"),
            files=ItemValue("all_target_files"),
            extra_args=['--partial', '--partial-dir', '.rsync-tmp'])),
    StopHeartbeat(),
    MarkItemAsDone(control, EXPIRE_TIME))


def stop_control():
    # control.flag_logging_thread_for_termination()
    control.unregister_pipeline(pipeline_id)

pipeline.on_cleanup += stop_control

pipeline.running_status = "Running"


def status_running():
project_html=""" <img class="project-logo" alt="Project logo" src="http://archiveteam.org/images/thumb/0/03/Flick_logo_black.png/320px-Flick_logo_black.png" height="50px" title=""/> <h2>flickr.com <span class="links"><a href="http://flickr.com/">Website</a> · <a href="http://tracker.archiveteam.org/flickr/">Leaderboard</a></span></h2> <p>Archiving CC photos from flickr.</p> """) pipeline = Pipeline( CheckIP(), GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader, VERSION), PrepareDirectories(warc_prefix="flickr"), WgetDownload(WgetArgs(), max_tries=2, accept_on_exit_code=[0, 4, 8], env={ "item_dir": ItemValue("item_dir"), "item_value": ItemValue("item_value"), "item_type": ItemValue("item_type"), 'warc_file_base': ItemValue('warc_file_base') }), Deduplicate(), PrepareStatsForTracker( defaults={ "downloader": downloader, "version": VERSION }, file_groups={ "data": [ ItemInterpolation( "%(item_dir)s/%(warc_file_base)s-deduplicated.warc.gz") ]
"data": [ItemInterpolation("%(item_dir)s/%(warc_file_base)s.warc.gz")] }, id_function=stats_id_function, ), MoveFiles(), LimitConcurrent( NumberConfigValue( min=1, max=20, default="20", name="shared:rsync_threads", title="Rsync threads", description="The maximum number of concurrent uploads."), UploadWithTracker( "http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader=downloader, version=VERSION, files=[ ItemInterpolation("%(data_dir)s/%(warc_file_base)s.warc.gz") ], rsync_target_source_path=ItemInterpolation("%(data_dir)s/"), rsync_extra_args=[ "--recursive", "--partial", "--partial-dir", ".rsync-tmp", ]), ), SendDoneToTracker(tracker_url="http://%s/%s" % (TRACKER_HOST, TRACKER_ID), stats=ItemValue("stats")))
            [ItemInterpolation('%(item_dir)s/%(warc_file_base)s.warc.gz')]
        },
        id_function=stats_id_function,
    ),
    MoveFiles(),
    LimitConcurrent(
        NumberConfigValue(
            min=1,
            max=20,
            default='2',
            name='shared:rsync_threads',
            title='Rsync threads',
            description='The maximum number of concurrent uploads.'),
        UploadWithTracker(
            'http://%s/%s' % (TRACKER_HOST, TRACKER_ID),
            downloader=downloader,
            version=VERSION,
            files=[
                ItemInterpolation('%(data_dir)s/%(warc_file_base)s.warc.gz'),
                ItemInterpolation(
                    '%(data_dir)s/%(warc_file_base)s-tail.warc.gz'),
                ItemInterpolation('%(data_dir)s/%(warc_file_base)s_data.txt')
            ],
            rsync_target_source_path=ItemInterpolation('%(data_dir)s/'),
            rsync_extra_args=[
                '--recursive',
                '--partial',
                '--partial-dir', '.rsync-tmp',
                '--min-size', '1',
                '--no-compress',
                '--compress-level', '0'
            ]),
    ),
    SendDoneToTracker(
        tracker_url='http://%s/%s' % (TRACKER_HOST, TRACKER_ID),
        stats=ItemValue('stats')))
# be too big. The deadline is optional.
project = Project(
    title='Bitbucket',
    project_html='''
    <img class="project-logo" alt="logo" src="https://www.archiveteam.org/images/7/70/Bitbucket-atlassian-logo.png" height="50px"/>
    <h2>bitbucket.org <span class="links"><a href="https://bitbucket.org/">Website</a> ·
        <a href="http://tracker.archiveteam.org/bitbucket/">Leaderboard</a></span></h2>
''')

pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker('http://%s/%s' % (TRACKER_HOST, TRACKER_ID), downloader,
        VERSION),
    PrepareDirectories(warc_prefix='bitbucket'),
    WgetDownload(
        WgetArgs(),
        max_tries=2,
        accept_on_exit_code=[0, 4, 8],
        env={
            'item_dir': ItemValue('item_dir'),
            'item_value': ItemValue('item_value'),
            'item_type': ItemValue('item_type'),
            'warc_file_base': ItemValue('warc_file_base'),
        }),
    PrepareStatsForTracker(
        defaults={'downloader': downloader, 'version': VERSION},
        file_groups={
            'data': [ItemInterpolation('%(item_dir)s/%(warc_file_base)s.warc.zst')]
        },
        id_function=stats_id_function,
    ),
    MoveFiles(),
max=4, default="2", name="shared:rsync_threads", title="Rsync threads", description="The maximum number of concurrent uploads."), # this upload task asks the tracker for an upload target # this can be HTTP or rsync and can be changed in the tracker admin panel UploadWithTracker( TRACKER_URL, downloader=downloader, version=VERSION, # list the files that should be uploaded. # this may include directory names. # note: HTTP uploads will only upload the first file on this list files=[ ItemInterpolation("%(data_dir)s/%(warc_file_base)s.hrefs.bz2"), ItemInterpolation( "%(data_dir)s/%(warc_file_base)s.cooked.warc.gz") ], # the relative path for the rsync command # (this defines if the files are uploaded to a subdirectory on the server) rsync_target_source_path=ItemInterpolation("%(data_dir)s/"), # extra rsync parameters (probably standard) rsync_extra_args=[ "--recursive", "--partial", "--partial-dir", ".rsync-tmp" ]), ), # if the item passed every task, notify the tracker and report the statistics SendDoneToTracker(tracker_url=TRACKER_URL, stats=ItemValue("stats")))