###########################################################################
# Project metadata.
#
# The title and HTML snippet given here are what the warrior management
# panel displays for this project. Keep the logo small. A deadline can be
# supplied as well, but it is optional and none is set here.
project = Project(title='Pastebin', project_html='''
    <img class="project-logo" alt="logo" src="https://www.archiveteam.org/images/3/35/Pastebin.com_logo.png" height="50px"/>
    <h2>pastebin.com <span class="links"><a href="https://pastebin.com/">Website</a> &middot; <a href="http://tracker.archiveteam.org/pastebin/">Leaderboard</a></span></h2>
    ''')

pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker('http://%s/%s' % (TRACKER_HOST, TRACKER_ID), downloader,
        VERSION),
    PrepareDirectories(warc_prefix='pastebin'),
    WgetDownload(
        WgetArgs(),
        max_tries=2,
        accept_on_exit_code=[0, 4, 8],
        env={
            'item_dir': ItemValue('item_dir'),
            'item_value': ItemValue('item_value'),
            'item_type': ItemValue('item_type'),
            'warc_file_base': ItemValue('warc_file_base')
        }
    ),
    PrepareStatsForTracker(
        defaults={'downloader': downloader, 'version': VERSION},
        file_groups={
Exemple #2
0

###########################################################################
# Project metadata.
#
# The title and HTML snippet given here are what the warrior management
# panel displays for this project. Keep the logo small. A deadline can be
# supplied as well, but it is optional and none is set here.
project = Project(
    title='Bitbucket',
    project_html='''
    <img class="project-logo" alt="logo" src="https://www.archiveteam.org/images/7/70/Bitbucket-atlassian-logo.png" height="50px"/>
    <h2>bitbucket.org <span class="links"><a href="https://bitbucket.org/">Website</a> &middot; <a href="http://tracker.archiveteam.org/bitbucket/">Leaderboard</a></span></h2>
    ''',
)

pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker('http://%s/%s' % (TRACKER_HOST, TRACKER_ID), downloader,
                       VERSION), PrepareDirectories(warc_prefix='bitbucket'),
    WgetDownload(WgetArgs(),
                 max_tries=2,
                 accept_on_exit_code=[0, 4, 8],
                 env={
                     'item_dir': ItemValue('item_dir'),
                     'item_value': ItemValue('item_value'),
                     'item_type': ItemValue('item_type'),
                     'warc_file_base': ItemValue('warc_file_base'),
                 }),
    PrepareStatsForTracker(
        defaults={
            'downloader': downloader,
            'version': VERSION
        },
        file_groups={
###########################################################################
# Project metadata.
#
# The title and HTML snippet given here are what the warrior management
# panel displays for this project. Keep the logo small. A deadline can be
# supplied as well, but it is optional and none is set here.
project = Project(
    title="friendfeed",
    project_html="""
        <img class="project-logo" alt="Project logo" src="http://archiveteam.org/images/8/83/Friendfeed_logo.png" height="50px" title=""/>
        <h2>friendfeed.com <span class="links"><a href="http://friendfeed.com/">Website</a> &middot; <a href="http://tracker.archiveteam.org/friendfeed/">Leaderboard</a></span></h2>
        <p>Grabbing all accounts from friendfeed.com.</p>
    """,
)

pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader,
                       VERSION), PrepareDirectories(warc_prefix="friendfeed"),
    WgetDownload(WgetArgs(),
                 max_tries=2,
                 accept_on_exit_code=[0, 4, 8],
                 env={
                     "item_dir": ItemValue("item_dir"),
                     "item_value": ItemValue("item_value"),
                     "item_type": ItemValue("item_type"),
                 }),
    PrepareStatsForTracker(
        defaults={
            "downloader": downloader,
            "version": VERSION
        },
        file_groups={
            "data":
Exemple #4
0
###########################################################################
# Project metadata.
#
# The title and HTML snippet given here are what the warrior management
# panel displays for this project. Keep the logo small. A deadline can be
# supplied as well, but it is optional and none is set here.
project = Project(
    title='Periscope',
    project_html='''
    <img class="project-logo" alt="logo" src="https://wiki.archiveteam.org/images/1/1c/Periscope_logo.png" height="50px"/>
    <h2>periscope.tv <span class="links"><a href="https://periscope.tv/">Website</a> &middot; <a href="http://tracker.archiveteam.org/periscope/">Leaderboard</a></span></h2>
    ''',
)

pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker(
        'http://{}/{}/multi={}/'.format(TRACKER_HOST, TRACKER_ID,
                                        MULTI_ITEM_SIZE), downloader, VERSION),
    PrepareDirectories(warc_prefix='periscope'),
    WgetDownload(WgetArgs(),
                 max_tries=1,
                 accept_on_exit_code=[0, 4, 8],
                 env={
                     'item_dir': ItemValue('item_dir'),
                     'warc_file_base': ItemValue('warc_file_base'),
                 }), SetBadUrls(),
    PrepareStatsForTracker(
        defaults={
            'downloader': downloader,
            'version': VERSION
        },
        file_groups={
Exemple #5
0
# Project metadata: the title plus the HTML snippet (logo, links and a short
# blurb) that describes this project.
project = Project(
    title="Xanga",
    project_html="""
    <img class="project-logo" alt="Xanga logo" src="http://archiveteam.org/images/4/4d/Xanga-logo-main.gif" width="120" />
    <h2>Xanga.com <span class="links"><a href="http://www.xanga.com/">Website</a> &middot; <a href="http://tracker.archiveteam.org/xanga/">Leaderboard</a></span></h2>
    <p><i>Xanga</i> is getting old. Archive Team investigates.</p>
  """,
    # No deadline is set. To add one, pass for example:
    #   utc_deadline=datetime.datetime(2013, 3, 1, 23, 59, 0)
    # Write the month and day as plain 3 and 1: zero-padded integer
    # literals such as 03 are a syntax error in Python 3.
)

# Project slug; it is appended to the tracker URL when the pipeline requests
# items.
TRACKER_ID = "xanga"

# Rsync destination template. Presumably ConfigInterpolation substitutes the
# downloader name for the %s placeholder -- confirm against seesaw.
RSYNC_TARGET = ConfigInterpolation(
    "fos.textfiles.com::alardland/warrior/xanga/%s/",
    downloader,
)

pipeline = Pipeline(
    GetItemFromTracker("http://tracker.archiveteam.org/%s" % TRACKER_ID,
                       downloader, VERSION),
    PrepareDirectories(warc_prefix="xanga.com"), Login(),
    WgetDownload(
        [
            WGET_LUA, "-U", USER_AGENT, "-nv", "-o",
            ItemInterpolation("%(item_dir)s/wget.log"), "--load-cookies",
            ItemInterpolation("%(cookie_jar)s"), "--lua-script", "xanga.lua",
            "--no-check-certificate", "--output-document",
            ItemInterpolation("%(item_dir)s/wget.tmp"), "--truncate-output",
            "-e", "robots=off", "--rotate-dns", "--recursive", "--level=inf",
            "--page-requisites", "--timeout", "60", "--tries", "20",
            "--waitretry", "5", "--warc-file",
            ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
            "--warc-header", "operator: Archive Team", "--warc-header",
            "xanga-dld-script-version: " + VERSION, "--warc-header",
            ItemInterpolation("xanga-user: %(item_name)s"),
###########################################################################
# Project metadata.
#
# The title and HTML snippet given here are what the warrior management
# panel displays for this project. Keep the logo small. A deadline can be
# supplied as well, but it is optional and none is set here.
project = Project(
    title="Newsgrabber-Deduplication",
    project_html="""
        <img class="project-logo" alt="Project logo" src="http://archiveteam.org/images/thumb/f/f3/Archive_team.png/235px-Archive_team.png" height="50px" title=""/>
        <h2>archiveteam.org <span class="links"><a href="http://archiveteam.org/">Website</a> &middot; <a href="http://tracker.archiveteam.org/newsgrabber/">Leaderboard</a></span></h2>
        <p>We did a bad - Recovering from it....</p>
    """,
)

pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader,
                       VERSION), PrepareDirectories(),
    WgetDownload(WgetArgs(), ), DeduplicateWarcExtProc(DedupeArgs()),
    PrepareStatsForTracker(
        defaults={
            "downloader": downloader,
            "version": VERSION
        },
        file_groups={
            "data": [
                ItemInterpolation(
                    "%(data_dir)s/%(item_name)s.deduplicated.warc.gz")
            ]
        },
        id_function=stats_id_function,
    ), MoveFiles(),
    LimitConcurrent(
    wget_args.extend(['--bind-address', globals()['bind_address']])
    print('')
    print('*** Wget will bind address at {0} ***'.format(
        globals()['bind_address']))
    print('')

# Project metadata: title, the HTML snippet (logo, links and a short blurb),
# and a deadline passed as utc_deadline (2013-12-26 00:00:01).
project = Project(
    title="Yahoo Blog",
    project_html="""
    <img class="project-logo" alt="Yahoo logo" src="http://archiveteam.org/images/thumb/a/a2/Yahoo-logo.png/120px-Yahoo-logo.png" />
    <h2>Yahoo Blogs <span class="links"><a href="http://blog.yahoo.com/">Website</a> &middot; <a href="http://tracker.archiveteam.org/yahooblog/">Leaderboard</a></span></h2>
    <p><i>Yahoo!</i> is a horrible monster.</p>
      """,
    utc_deadline=datetime.datetime(2013, 12, 26, 0, 0, 1),
)

pipeline = Pipeline(
    GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader,
                       VERSION), PrepareDirectories(warc_prefix='yahooblog'),
    WgetDownload(
        wget_args,
        max_tries=2,
        accept_on_exit_code=[0, 8],
    ),
    PrepareStatsForTracker(
        defaults={
            "downloader": downloader,
            "version": VERSION
        },
        file_groups={
            "data":
            [ItemInterpolation("%(item_dir)s/%(warc_file_base)s.warc.gz")]
        }), MoveFiles(),
    LimitConcurrent(