########################################################################### # Initialize the project. # # This will be shown in the warrior management panel. The logo should not # be too big. The deadline is optional. project = Project( title = 'Pastebin', project_html = ''' <img class="project-logo" alt="logo" src="https://www.archiveteam.org/images/3/35/Pastebin.com_logo.png" height="50px"/> <h2>pastebin.com <span class="links"><a href="https://pastebin.com/">Website</a> · <a href="http://tracker.archiveteam.org/pastebin/">Leaderboard</a></span></h2> ''' ) pipeline = Pipeline( CheckIP(), GetItemFromTracker('http://%s/%s' % (TRACKER_HOST, TRACKER_ID), downloader, VERSION), PrepareDirectories(warc_prefix='pastebin'), WgetDownload( WgetArgs(), max_tries=2, accept_on_exit_code=[0, 4, 8], env={ 'item_dir': ItemValue('item_dir'), 'item_value': ItemValue('item_value'), 'item_type': ItemValue('item_type'), 'warc_file_base': ItemValue('warc_file_base') } ), PrepareStatsForTracker( defaults={'downloader': downloader, 'version': VERSION}, file_groups={
########################################################################### # Initialize the project. # # This will be shown in the warrior management panel. The logo should not # be too big. The deadline is optional. project = Project(title='Bitbucket', project_html=''' <img class="project-logo" alt="logo" src="https://www.archiveteam.org/images/7/70/Bitbucket-atlassian-logo.png" height="50px"/> <h2>bitbucket.org <span class="links"><a href="https://bitbucket.org/">Website</a> · <a href="http://tracker.archiveteam.org/bitbucket/">Leaderboard</a></span></h2> ''') pipeline = Pipeline( CheckIP(), GetItemFromTracker('http://%s/%s' % (TRACKER_HOST, TRACKER_ID), downloader, VERSION), PrepareDirectories(warc_prefix='bitbucket'), WgetDownload(WgetArgs(), max_tries=2, accept_on_exit_code=[0, 4, 8], env={ 'item_dir': ItemValue('item_dir'), 'item_value': ItemValue('item_value'), 'item_type': ItemValue('item_type'), 'warc_file_base': ItemValue('warc_file_base'), }), PrepareStatsForTracker( defaults={ 'downloader': downloader, 'version': VERSION }, file_groups={
########################################################################### # Initialize the project. # # This will be shown in the warrior management panel. The logo should not # be too big. The deadline is optional. project = Project(title="friendfeed", project_html=""" <img class="project-logo" alt="Project logo" src="http://archiveteam.org/images/8/83/Friendfeed_logo.png" height="50px" title=""/> <h2>friendfeed.com <span class="links"><a href="http://friendfeed.com/">Website</a> · <a href="http://tracker.archiveteam.org/friendfeed/">Leaderboard</a></span></h2> <p>Grabbing all accounts from friendfeed.com.</p> """) pipeline = Pipeline( CheckIP(), GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader, VERSION), PrepareDirectories(warc_prefix="friendfeed"), WgetDownload(WgetArgs(), max_tries=2, accept_on_exit_code=[0, 4, 8], env={ "item_dir": ItemValue("item_dir"), "item_value": ItemValue("item_value"), "item_type": ItemValue("item_type"), }), PrepareStatsForTracker( defaults={ "downloader": downloader, "version": VERSION }, file_groups={ "data":
########################################################################### # Initialize the project. # # This will be shown in the warrior management panel. The logo should not # be too big. The deadline is optional. project = Project(title='Periscope', project_html=''' <img class="project-logo" alt="logo" src="https://wiki.archiveteam.org/images/1/1c/Periscope_logo.png" height="50px"/> <h2>periscope.tv <span class="links"><a href="https://periscope.tv/">Website</a> · <a href="http://tracker.archiveteam.org/periscope/">Leaderboard</a></span></h2> ''') pipeline = Pipeline( CheckIP(), GetItemFromTracker( 'http://{}/{}/multi={}/'.format(TRACKER_HOST, TRACKER_ID, MULTI_ITEM_SIZE), downloader, VERSION), PrepareDirectories(warc_prefix='periscope'), WgetDownload(WgetArgs(), max_tries=1, accept_on_exit_code=[0, 4, 8], env={ 'item_dir': ItemValue('item_dir'), 'warc_file_base': ItemValue('warc_file_base'), }), SetBadUrls(), PrepareStatsForTracker( defaults={ 'downloader': downloader, 'version': VERSION }, file_groups={
# Project description for the Xanga grab: title plus an HTML snippet holding
# the logo image, links to the website and tracker leaderboard, and a short
# blurb.
# NOTE(review): the logo's alt text says "Weblog.nl logo" although the image
# is the Xanga logo - looks like a copy-paste leftover; confirm and correct.
project = Project(title="Xanga",
                  project_html=""" <img class="project-logo" alt="Weblog.nl logo" src="http://archiveteam.org/images/4/4d/Xanga-logo-main.gif" width="120" /> <h2>Xanga.com <span class="links"><a href="http://www.xanga.com/">Website</a> · <a href="http://tracker.archiveteam.org/xanga/">Leaderboard</a></span></h2> <p><i>Xanga</i> is getting old. Archive Team investigates.</p> """
                  # Deadline is disabled (kept commented out):
                  # , utc_deadline = datetime.datetime(2013,03,01, 23,59,0)
                  )

# Tracker project identifier, interpolated into the tracker URL below.
TRACKER_ID = "xanga"
# Rsync destination template; the "%s" slot is filled from `downloader`.
RSYNC_TARGET = ConfigInterpolation(
    "fos.textfiles.com::alardland/warrior/xanga/%s/", downloader)

# Tasks are listed in the order they are handed to Pipeline.
pipeline = Pipeline(
    GetItemFromTracker("http://tracker.archiveteam.org/%s" % TRACKER_ID, downloader, VERSION),
    PrepareDirectories(warc_prefix="xanga.com"),
    # Login() runs before the download; the wget call below loads cookies
    # from the item's cookie_jar.
    # NOTE(review): presumably Login() writes that cookie jar - confirm.
    Login(),
    WgetDownload(
        [
            WGET_LUA,
            # User agent and logging: non-verbose, log file in the item dir.
            "-U", USER_AGENT,
            "-nv",
            "-o", ItemInterpolation("%(item_dir)s/wget.log"),
            "--load-cookies", ItemInterpolation("%(cookie_jar)s"),
            # Project-specific Lua script given to wget.
            "--lua-script", "xanga.lua",
            "--no-check-certificate",
            # Document output goes to a temporary file in the item dir.
            "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
            "--truncate-output",
            "-e", "robots=off",
            "--rotate-dns",
            # Unlimited-depth recursive crawl including page requisites.
            "--recursive", "--level=inf",
            "--page-requisites",
            # Per-request timeout, retry count and wait between retries.
            "--timeout", "60",
            "--tries", "20",
            "--waitretry", "5",
            # WARC output file and the custom headers recorded in it.
            "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
            "--warc-header", "operator: Archive Team",
            "--warc-header", "xanga-dld-script-version: " + VERSION,
            "--warc-header", ItemInterpolation("xanga-user: %(item_name)s"),
########################################################################### # Initialize the project. # # This will be shown in the warrior management panel. The logo should not # be too big. The deadline is optional. project = Project(title="Newsgrabber-Deduplication", project_html=""" <img class="project-logo" alt="Project logo" src="http://archiveteam.org/images/thumb/f/f3/Archive_team.png/235px-Archive_team.png" height="50px" title=""/> <h2>archiveteam.org <span class="links"><a href="http://archiveteam.org/">Website</a> · <a href="http://tracker.archiveteam.org/newsgrabber/">Leaderboard</a></span></h2> <p>We did a bad - Recovering from it....</p> """) pipeline = Pipeline( CheckIP(), GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader, VERSION), PrepareDirectories(), WgetDownload(WgetArgs(), ), DeduplicateWarcExtProc(DedupeArgs()), PrepareStatsForTracker( defaults={ "downloader": downloader, "version": VERSION }, file_groups={ "data": [ ItemInterpolation( "%(data_dir)s/%(item_name)s.deduplicated.warc.gz") ] }, id_function=stats_id_function, ), MoveFiles(), LimitConcurrent(
# Append wget's --bind-address option using the module-level `bind_address`
# global, then announce the chosen address on stdout.
# NOTE(review): these three statements presumably sit inside a guard that
# checks whether `bind_address` is defined; that guard lies outside this
# excerpt - confirm the enclosing block and indentation before applying.
wget_args.extend(['--bind-address', globals()['bind_address']])
print('')
print('*** Wget will bind address at {0} ***'.format(
    globals()['bind_address']))
print('')

# Project description for the Yahoo Blog grab: title, an HTML snippet holding
# the logo image, website and leaderboard links and a short blurb, plus a
# UTC deadline of 2013-12-26 00:00:01.
project = Project(title="Yahoo Blog",
                  project_html=""" <img class="project-logo" alt="Yahoo logo" src="http://archiveteam.org/images/thumb/a/a2/Yahoo-logo.png/120px-Yahoo-logo.png" /> <h2>Yahoo Blogs <span class="links"><a href="http://blog.yahoo.com/">Website</a> · <a href="http://tracker.archiveteam.org/yahooblog/">Leaderboard</a></span></h2> <p><i>Yahoo!</i> is a horrible monster.</p> """,
                  utc_deadline=datetime.datetime(2013, 12, 26, 0, 0, 1))

# Tasks are listed in the order they are handed to Pipeline. There is no
# CheckIP() step at the head of this pipeline.
pipeline = Pipeline(
    # The tracker URL is assembled from TRACKER_HOST and TRACKER_ID.
    GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader, VERSION),
    PrepareDirectories(warc_prefix='yahooblog'),
    WgetDownload(
        # The wget_args list extended above is passed directly.
        wget_args,
        max_tries=2,
        # NOTE(review): presumably the wget exit statuses that count as
        # success - confirm against seesaw's WgetDownload.
        accept_on_exit_code=[0, 8],
    ),
    PrepareStatsForTracker(
        # Every stats report carries the downloader name and script version.
        defaults={
            "downloader": downloader,
            "version": VERSION
        },
        # The single "data" file is the item's gzipped WARC.
        file_groups={
            "data": [ItemInterpolation("%(item_dir)s/%(warc_file_base)s.warc.gz")]
        }),
    MoveFiles(),
    LimitConcurrent(