<img class="project-logo" alt="logo" src="https://archiveteam.org/images/a/af/Kinja-icon.png" height="50px"/> <h2>kinja.com <span class="links"><a href="https://kinja.com/">Website</a> · <a href="http://tracker.archiveteam.org/kinja/">Leaderboard</a></span></h2> ''' ) pipeline = Pipeline( CheckIP(), GetItemFromTracker('https://%s/%s' % (TRACKER_HOST, TRACKER_ID), downloader, VERSION), PrepareDirectories(warc_prefix='kinja'), WgetDownload( WgetArgs(), max_tries=2, accept_on_exit_code=[0, 4, 8], env={ 'item_dir': ItemValue('item_dir'), 'item_value': ItemValue('item_value'), 'item_type': ItemValue('item_type'), 'warc_file_base': ItemValue('warc_file_base'), } ), PrepareStatsForTracker( defaults={'downloader': downloader, 'version': VERSION}, file_groups={ 'data': [ ItemInterpolation('%(item_dir)s/%(warc_file_base)s.warc.gz') ] }, id_function=stats_id_function, ), MoveFiles(),
<img class="project-logo" alt="Project logo" src="http://archiveteam.org/images/6/68/Twitpic-logo.png" height="50px" title=""/> <h2>twitpic.com <span class="links"><a href="http://twitpic.com/">Website</a> · <a href="http://tracker.archiveteam.org/twitpic/">Leaderboard</a></span></h2> <p>Archiving images and webpages from twitpic.com.</p> <p class="projectBroadcastMessage">Please use only concurrency of 1 to avoid overloading Twitpic.</p> """, utc_deadline=datetime.datetime(2014, 9, 25, 23, 59, 0)) pipeline = Pipeline( CheckIP(), GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader, VERSION), PrepareDirectories(warc_prefix="twitpic"), WgetDownload(WgetArgs(), max_tries=2, accept_on_exit_code=[0, 4, 7, 8], env={ "item_dir": ItemValue("item_dir"), "item_value": ItemValue("item_value"), "item_type": ItemValue("item_type"), "downloader": downloader }), PrepareStatsForTracker( defaults={ "downloader": downloader, "version": VERSION }, file_groups={ "data": [ItemInterpolation("%(item_dir)s/%(warc_file_base)s.warc.gz")] }, id_function=stats_id_function, ), MoveFiles(),
project = Project(title="nujij", project_html=""" <img class="project-logo" alt="Project logo" src="http://archiveteam.org/images/f/f4/Nujij-logo.png" height="50px" title=""/> <h2>www.nujij.nl <span class="links"><a href="http://www.nujij.nl/">Website</a> · <a href="http://tracker.archiveteam.org/nujij/">Leaderboard</a></span></h2> <p>Archiving all articles from NUjij.nl.</p> """) pipeline = Pipeline( CheckIP(), GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader, VERSION), PrepareDirectories(warc_prefix="nujij"), WgetDownload(WgetArgs(), max_tries=2, accept_on_exit_code=[0, 4, 8], env={ "item_dir": ItemValue("item_dir"), "item_value": ItemValue("item_value"), "item_type": ItemValue("item_type"), "warc_file_base": ItemValue("warc_file_base"), }), PrepareStatsForTracker( defaults={ "downloader": downloader, "version": VERSION }, file_groups={ "data": [ ItemInterpolation("%(item_dir)s/%(warc_file_base)s.warc.gz"), ItemInterpolation("%(item_dir)s/%(warc_file_base)s_data.txt") ] },
_, _, _, pipeline_id = monitoring.pipeline_id() pipeline = Pipeline( GetItemFromQueue(control, pipeline_id, ao_only=env.get('AO_ONLY')), StartHeartbeat(control), SetFetchDepth(), PreparePaths(), WriteInfo(), DownloadUrlFile(control), WgetDownload( WpullArgs(default_user_agent=DEFAULT_USER_AGENT, wpull_exe=WPULL_EXE, phantomjs_exe=PHANTOMJS), accept_on_exit_code=AcceptAny(), env={ 'ITEM_IDENT': ItemInterpolation('%(ident)s'), 'LOG_KEY': ItemInterpolation('%(log_key)s'), 'REDIS_URL': REDIS_URL, 'PATH': os.environ['PATH'] } ), RelabelIfAborted(control), WriteInfo(), MoveFiles(), SetWarcFileSizeInRedis(control), LimitConcurrent(2, RsyncUpload( target = RSYNC_URL, target_source_path = ItemInterpolation("%(data_dir)s"), files=ItemValue("all_target_files"), extra_args = [