Exemple #1
0
            print('')
            print('*** Wget will bind address at {0} ***'.format(
                globals()['bind_address']))
            print('')

        return realize(wget_args, item)


###########################################################################
# Initialize the project.
#
# This will be shown in the warrior management panel. The logo should not
# be too big. The deadline is optional.
project = Project(title="friendfeed",
                  project_html="""
        <img class="project-logo" alt="Project logo" src="http://archiveteam.org/images/8/83/Friendfeed_logo.png" height="50px" title=""/>
        <h2>friendfeed.com <span class="links"><a href="http://friendfeed.com/">Website</a> &middot; <a href="http://tracker.archiveteam.org/friendfeed/">Leaderboard</a></span></h2>
        <p>Grabbing all accounts from friendfeed.com.</p>
    """)

pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader,
                       VERSION), PrepareDirectories(warc_prefix="friendfeed"),
    WgetDownload(WgetArgs(),
                 max_tries=2,
                 accept_on_exit_code=[0, 4, 8],
                 env={
                     "item_dir": ItemValue("item_dir"),
                     "item_value": ItemValue("item_value"),
                     "item_type": ItemValue("item_type"),
                 }),
Exemple #2
0
            print('')
            print('*** Wget will bind address at {0} ***'.format(
                globals()['bind_address']))
            print('')

        return realize(wget_args, item)

###########################################################################
# Initialize the project.
#
# This will be shown in the warrior management panel. The logo should not
# be too big. The deadline is optional.
project = Project(
    title = 'tumblr-static',
    project_html = '''
    <img class="project-logo" alt="logo" src="https://archiveteam.org/images/b/ba/Tumblr_on_white.png" height="50px"/>
    <h2>Tumblr <span class="links"><a href="https://www.tumblr.com/">Website</a> &middot; <a href="https://tracker.archiveteam.org/tumblr/">Leaderboard</a></span></h2>
    '''
)

pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker('http://%s/%s' % (TRACKER_HOST, TRACKER_ID), downloader,
        VERSION),
    PrepareDirectories(warc_prefix='tumblr-static'),
    WgetDownload(
        WgetArgs(),
        max_tries=2,
        accept_on_exit_code=[0, 4, 8],
        env={
            'item_dir': ItemValue('item_dir'),
Exemple #3
0
            print('')
            print('*** Wget will bind address at {0} ***'.format(
                globals()['bind_address']))
            print('')

        return realize(wget_args, item)


###########################################################################
# Initialize the project.
#
# This will be shown in the warrior management panel. The logo should not
# be too big. The deadline is optional.
project = Project(title="reddit",
                  project_html="""
        <img class="project-logo" alt="Project logo" src="http://archiveteam.org/images/thumb/b/b5/Reddit_logo.png/320px-Reddit_logo.png" height="50px" title=""/>
        <h2>www.reddit.com <span class="links"><a href="https://www.reddit.com/">Website</a> &middot; <a href="http://tracker.archiveteam.org/reddit/">Leaderboard</a></span></h2>
        <p>Grabbing reddit.</p>
    """)

pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader,
                       VERSION), PrepareDirectories(warc_prefix="reddit"),
    WgetDownload(WgetArgs(),
                 max_tries=2,
                 accept_on_exit_code=[0, 8],
                 env={
                     "item_dir": ItemValue("item_dir"),
                     "item_value": ItemValue("item_value"),
                     "item_type": ItemValue("item_type"),
                 }),
            print('')
            print('*** Wget will bind address at {0} ***'.format(
                globals()['bind_address']))
            print('')

        return realize(wget_args, item)


###########################################################################
# Initialize the project.
#
# This will be shown in the warrior management panel. The logo should not
# be too big. The deadline is optional.
project = Project(title='StoryFire',
                  project_html='''
    <img class="project-logo" alt="logo" src="https://wiki.archiveteam.org/images/e/e9/Storyfire-icon.png" height="50px"/>
    <h2>storyfire.com <span class="links"><a href="https://storyfire.com/">Website</a> &middot; <a href="http://tracker.archiveteam.org/storyfire/">Leaderboard</a></span></h2>
    ''')

pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker(
        'http://{}/{}/multi={}/'.format(TRACKER_HOST, TRACKER_ID,
                                        MULTI_ITEM_SIZE), downloader, VERSION),
    PrepareDirectories(warc_prefix='storyfire'),
    WgetDownload(WgetArgs(),
                 max_tries=1,
                 accept_on_exit_code=[0, 4, 8],
                 env={
                     'item_dir': ItemValue('item_dir'),
                     'warc_file_base': ItemValue('warc_file_base'),
            print('')
            print('*** Wget will bind address at {0} ***'.format(
                globals()['bind_address']))
            print('')

        return realize(wget_args, item)


###########################################################################
# Initialize the project.
#
# This will be shown in the warrior management panel. The logo should not
# be too big. The deadline is optional.
project = Project(title='reddit',
                  project_html='''
        <img class="project-logo" alt="Project logo" src="https://www.archiveteam.org/images/b/b5/Reddit_logo.png" height="50px" title=""/>
        <h2>reddit.com <span class="links"><a href="https://reddit.com/">Website</a> &middot; <a href="http://tracker.archiveteam.org/reddit/">Leaderboard</a></span></h2>
        <p>Archiving everything from reddit.</p>
    ''')

pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker(
        'http://{}/{}/multi={}/'.format(TRACKER_HOST, TRACKER_ID,
                                        MULTI_ITEM_SIZE), downloader, VERSION),
    PrepareDirectories(warc_prefix='reddit'),
    WgetDownload(WgetArgs(),
                 max_tries=2,
                 accept_on_exit_code=[0, 4, 8],
                 env={
                     'item_dir': ItemValue('item_dir'),
                     'item_names': ItemValue('item_name_newline'),
Exemple #6
0
            print('')
            print('*** Wget will bind address at {0} ***'.format(
                globals()['bind_address']))
            print('')

        return realize(wget_args, item)


###########################################################################
# Initialize the project.
#
# This will be shown in the warrior management panel. The logo should not
# be too big. The deadline is optional.
project = Project(title='500px',
                  project_html='''
    <img class="project-logo" alt="logo" src="https://www.archiveteam.org/images/8/83/500px_logo.png" height="50px"/>
    <h2>500px.com <span class="links"><a href="https://500px.com/">Website</a> &middot; <a href="http://tracker.archiveteam.org/500px/">Leaderboard</a></span></h2>
    ''')

pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker('http://%s/%s' % (TRACKER_HOST, TRACKER_ID), downloader,
                       VERSION), PrepareDirectories(warc_prefix='500px'),
    WgetDownload(WgetArgs(),
                 max_tries=2,
                 accept_on_exit_code=[0, 4, 8],
                 env={
                     'item_dir': ItemValue('item_dir'),
                     'item_value': ItemValue('item_value'),
                     'item_type': ItemValue('item_type'),
                     'warc_file_base': ItemValue('warc_file_base'),
            print('')
            print('*** Wpull will bind address at {0} ***'.format(
                globals()['bind_address']))
            print('')

        return realize(wpull_args, item)


###########################################################################
# Initialize the project.
#
# This will be shown in the warrior management panel. The logo should not
# be too big. The deadline is optional.
project = Project(title="newsgrabber",
                  project_html="""
        <img class="project-logo" alt="Project logo" src="http://archiveteam.org/images/thumb/f/f3/Archive_team.png/235px-Archive_team.png" height="50px" title=""/>
        <h2>archiveteam.org <span class="links"><a href="http://archiveteam.org/">Website</a> &middot; <a href="http://tracker.archiveteam.org/newsgrabber/">Leaderboard</a></span></h2>
        <p>Archiving all the news!</p>
    """)

pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader,
                       VERSION), PrepareDirectories(warc_prefix="newsgrabber"),
    WgetDownload(WgetArgs(), max_tries=2, accept_on_exit_code=[0, 4, 8]),
    LimitConcurrent(
        NumberConfigValue(
            min=1,
            max=20,
            default="1",
            name="shared:dedupe_threads",
            title="Deduplicate threads",
Exemple #8
0
            print('*** Wget will bind address at {0} ***'.format(
                globals()['bind_address']))
            print('')

        return realize(wget_args, item)


###########################################################################
# Initialize the project.
#
# This will be shown in the warrior management panel. The logo should not
# be too big. The deadline is optional.
project = Project(
    title='bintray',
    project_html='''
    <img class="project-logo" alt="logo" src="https://wiki.archiveteam.org/images/Archiveteamsmall.png?959ea" height="50px"/>
    <h2>Bintray <span class="links"><a href="https://bintray.com/">Website</a> &middot; <a href="http://tracker.archiveteam.org/bintray/">Leaderboard</a></span></h2>
    ''',
)

pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker(
        'http://{}/{}/multi={}/'.format(TRACKER_HOST, TRACKER_ID,
                                        MULTI_ITEM_SIZE), downloader, VERSION),
    PrepareDirectories(warc_prefix='bintray'),
    WgetDownload(WgetArgs(),
                 max_tries=1,
                 accept_on_exit_code=[0, 4, 8],
                 env={
                     'item_dir': ItemValue('item_dir'),
Exemple #9
0
            print('')
            print('*** Wget will bind address at {0} ***'.format(
                globals()['bind_address']))
            print('')

        return realize(wget_args, item)


###########################################################################
# Initialize the project.
#
# This will be shown in the warrior management panel. The logo should not
# be too big. The deadline is optional.
project = Project(title='googleplus',
                  project_html='''
        <img class="project-logo" alt="Project logo" src="https://www.archiveteam.org/images/9/95/Google%2B_logo.png" height="50px" title=""/>
        <h2>plus.google.com <span class="links"><a href="https://plus.google.com/">Website</a> &middot; <a href="http://tracker.archiveteam.org/googleplus/">Leaderboard</a></span></h2>
        <p>Archiving everything from Google+.</p>
    ''')

pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker('http://%s/%s' % (TRACKER_HOST, TRACKER_ID), downloader,
                       VERSION),
    PrepareDirectories(warc_prefix='googleplus'),
    WgetDownload(WgetArgs(),
                 max_tries=2,
                 accept_on_exit_code=[0, 4, 8],
                 env={
                     'item_dir': ItemValue('item_dir'),
                     'item_value': ItemValue('item_value'),
                     'item_type': ItemValue('item_type'),
Exemple #10
0
            print('*** Wget will bind address at {0} ***'.format(
                globals()['bind_address']))
            print('')

        return realize(wget_args, item)


###########################################################################
# Initialize the project.
#
# This will be shown in the warrior management panel. The logo should not
# be too big. The deadline is optional.
#
# project_html is raw HTML; both header rows carry a "Website" link and a
# "Leaderboard" link, so each href needs a well-formed URL scheme.
project = Project(title="Verizon",
                  project_html="""
        <img class="project-logo" alt="Project logo" src="http://archiveteam.org/images/thumb/b/bc/Verizon_Logo.png/320px-Verizon_Logo.png" height="50px" title=""/>
        <h2>mysite.verizon.net <span class="links"><a href="http://mysite.verizon.net/">Website</a> &middot; <a href="http://tracker.archiveteam.org/verizon/">Leaderboard</a></span></h2>
        <h2>members.bellatlantic.net <span class="links"><a href="http://members.bellatlantic.net/">Website</a> &middot; <a href="http://tracker.archiveteam.org/verizon/">Leaderboard</a></span></h2>
        <p>Archiving websites from mysite.verizon.net and members.bellatlantic.net.</p>
    """,
                  utc_deadline=datetime.datetime(2014, 9, 30, 23, 59, 0))

pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader,
                       VERSION), PrepareDirectories(warc_prefix="verizon"),
    WgetDownload(WgetArgs(),
                 max_tries=2,
                 accept_on_exit_code=[0, 4, 7, 8],
                 env={
                     "item_dir": ItemValue("item_dir"),
                     "item_value": ItemValue("item_value"),
                     "item_type": ItemValue("item_type"),
Exemple #11
0
# CONTROL CONNECTION
# ------------------------------------------------------------------------------

control = control.Control(REDIS_URL, LOG_CHANNEL, PIPELINE_CHANNEL)

# ------------------------------------------------------------------------------
# SEESAW EXTENSIONS
# ------------------------------------------------------------------------------

extensions.install_stdout_extension(control)

# ------------------------------------------------------------------------------
# PIPELINE
# ------------------------------------------------------------------------------

project = Project(title="ArchiveBot request handler")

#FIXME: Same hack as above; seesaw executes pipeline.py with the pipeline dir as the cwd.
# __file__ can't be used because seesaw exec()s the file contents rather than importing the file.
REPO_DIRECTORY = os.path.dirname(os.path.realpath('.'))


def pipeline_version():
    """Return a version string built from the repository's git metadata.

    Runs ``git show`` with REPO_DIRECTORY as the working directory and
    returns its output decoded as UTF-8 with surrounding whitespace
    removed. The format string asks for the commit date (as YYYYMMDD)
    and the abbreviated hash, giving something like ``20190820.5cd1e38``.

    Raises whatever ``subprocess.check_output`` raises, e.g.
    ``subprocess.CalledProcessError`` when git exits with a non-zero
    status.
    """
    git_command = [
        'git', 'show', '-s', '--format=format:%cd.%h', '--date=format:%Y%m%d'
    ]
    raw_version = subprocess.check_output(git_command, cwd=REPO_DIRECTORY)
    return raw_version.decode('utf-8').strip()

Exemple #12
0
        return realize(wget_args, item)


downloader = globals()['downloader']  # quiet the code checker

###########################################################################
# Initialize the project.
#
# This will be shown in the warrior management panel. The logo should not
# be too big. The deadline is optional.
#
# The deadline components are written as plain decimal integers: literals
# with a leading zero (such as 03 or 00) are a syntax error on Python 3,
# while the plain forms mean the same thing on both Python 2 and Python 3.
project = Project(
    title="Canvas",
    project_html="""
    <img class="project-logo" alt="" src="http://archiveteam.org/images/0/0d/Canvas-beta-logo-medium.png" height="50"
    title="" />
    <h2>Canv.as <span class="links">
        <a href="http://canv.as/">Website</a> &middot;
        <a href="http://%s/%s/">Leaderboard</a></span></h2>
    <p><b>Canv.as</b> is closed.</p>
    """ % (TRACKER_HOST, TRACKER_ID),
    utc_deadline=datetime.datetime(2014, 3, 3, 0, 0, 1)
)

pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader,
        VERSION),
    PrepareDirectories(warc_prefix="canvas"),
    WgetDownload(
        WgetArgs(),
        max_tries=5,
Exemple #13
0
            print('')
            print('*** Wget will bind address at {0} ***'.format(
                globals()['bind_address']))
            print('')

        return realize(wget_args, item)


###########################################################################
# Initialize the project.
#
# This will be shown in the warrior management panel. The logo should not
# be too big. The deadline is optional.
project = Project(title="miiverse",
                  project_html="""
    <img class="project-logo" alt="logo" src="http://www.archiveteam.org/images/8/87/Miiverselogo.png" />
    <h2>miiverse.net <span class="links"><a href="https://miiverse.net/">Website</a> &middot; <a href="http://tracker.archiveteam.org/miiverse/">Leaderboard</a></span></h2>
    """,
                  utc_deadline=datetime.datetime(2017, 11, 7, 23, 59, 0))

pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader,
                       VERSION), PrepareDirectories(warc_prefix="miiverse"),
    WgetDownload(WgetArgs(),
                 max_tries=2,
                 accept_on_exit_code=[0, 4, 8],
                 env={
                     "item_dir": ItemValue("item_dir"),
                     "item_value": ItemValue("item_value"),
                     "item_type": ItemValue("item_type"),
                     "warc_file_base": ItemValue("warc_file_base"),
Exemple #14
0
            print('')
            print('*** Wget will bind address at {0} ***'.format(
                globals()['bind_address']))
            print('')

        return realize(wget_args, item)


###########################################################################
# Initialize the project.
#
# This will be shown in the warrior management panel. The logo should not
# be too big. The deadline is optional.
project = Project(title='yourshot',
                  project_html='''
    <img class="project-logo" alt="logo" src="https://www.archiveteam.org/images/7/7a/Yourshot-logo.png" height="50px"/>
    <h2>yourshot.nationalgeographic.com <span class="links"><a href="https://yourshot.nationalgeographic.com/">Website</a> &middot; <a href="http://tracker.archiveteam.org/yourshot/">Leaderboard</a></span></h2>
    ''')

pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker('http://%s/%s' % (TRACKER_HOST, TRACKER_ID), downloader,
                       VERSION), PrepareDirectories(warc_prefix='yourshot'),
    WgetDownload(WgetArgs(),
                 max_tries=2,
                 accept_on_exit_code=[0, 4, 8],
                 env={
                     'item_dir': ItemValue('item_dir'),
                     'item_value': ItemValue('item_value'),
                     'item_type': ItemValue('item_type'),
                     'warc_file_base': ItemValue('warc_file_base'),
Exemple #15
0
    return d


###########################################################################
# Initialize the project.
#
# This will be shown in the warrior management panel. The logo should not
# be too big. The deadline is optional.
#
# The month in the deadline is written as 5, not 05: integer literals with
# a leading zero are a syntax error on Python 3.
project = Project(title="Foodily Discovery",
                  project_html="""
        <img class="project-logo" alt="Project logo" src="http://t.nerds.io/c4d70e9d01baf0931f130f60839d9844.png" height="50px" title=""/>
        <h2>Foodily Phase 1.
        <span class="links">
             <a href="http://www.foodily.com/">Website</a> &middot;
             <a href="http://tracker.archiveteam.org/foodilydisco/">Leaderboard</a>
             <a href="http://archiveteam.org/index.php?title=Foodily">Wiki</a> &middot;
         </span>
        </h2>
        <p>Foodily gets aquired. This is phase 1: content discovery.</p>
    """,
                  utc_deadline=datetime.datetime(2015, 5, 30, 23, 59, 0))

pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
                       downloader, VERSION),
    PrepareDirectories(warc_prefix="foodilydisco"),
    ExternalProcess('Scraper',
                    CustomProcessArgs(),
                    max_tries=2,
            print('')
            print('*** Wget will bind address at {0} ***'.format(
                globals()['bind_address']))
            print('')

        return realize(wget_args, item)


###########################################################################
# Initialize the project.
#
# This will be shown in the warrior management panel. The logo should not
# be too big. The deadline is optional.
project = Project(title='YouTube Playlist Notes',
                  project_html='''
    <img class="project-logo" alt="logo" src="https://archiveteam.org/images/4/4d/YouTube_logo_2017.png" height="50px"/>
    <h2>youtube.com <span class="links"><a href="https://youtube.com/">Website</a> &middot; <a href="http://tracker.archiveteam.org/youtube-playlistnotes/">Leaderboard</a></span></h2>
    ''')

pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker('http://%s/%s' % (TRACKER_HOST, TRACKER_ID), downloader,
                       VERSION),
    PrepareDirectories(warc_prefix='youtube-playlistnotes'),
    WgetDownload(WgetArgs(),
                 max_tries=1,
                 accept_on_exit_code=[0, 4, 8],
                 env={
                     'item_dir': ItemValue('item_dir'),
                     'item_type': ItemValue('item_type'),
                     'item_value': ItemValue('item_value'),
import datetime

import functools

from seesaw.externalprocess import ExternalProcess
from seesaw.pipeline import Pipeline
from seesaw.project import Project
from seesaw.task import Task, LimitConcurrent
from tornado.ioloop import IOLoop

project = Project(title='Software Update',
                  project_html='''
        <h2>Software Update</h2>
        <p>Select this project, when required, to automatically
        download and install software to update components of the Warrior.
        </p>
        <p>Components: Python3.5 </p>
        ''')


class WarningTask(Task):
    def __init__(self):
        Task.__init__(self, 'WarningTask')

    def enqueue(self, item):
        self.start_item(item)
        item.may_be_canceled = True
        item.log_output(
            'Software will be automatically downloaded and installed to update components of the Warrior.'
        )
        item.log_output('Update will continue in 10 seconds...')
Exemple #18
0
            print('')
            print('*** Wget will bind address at {0} ***'.format(
                globals()['bind_address']))
            print('')

        return realize(wget_args, item)


###########################################################################
# Initialize the project.
#
# This will be shown in the warrior management panel. The logo should not
# be too big. The deadline is optional.
project = Project(title='GitHub',
                  project_html='''
    <img class="project-logo" alt="logo" src="https://www.archiveteam.org/images/2/21/Github-icon.png" height="50px"/>
    <h2>github.com <span class="links"><a href="https://github.com/">Website</a> &middot; <a href="http://tracker.archiveteam.org/github/">Leaderboard</a></span></h2>
    ''')

pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker('http://%s/%s' % (TRACKER_HOST, TRACKER_ID), downloader,
                       VERSION), PrepareDirectories(warc_prefix='github'),
    WgetDownload(WgetArgs(),
                 max_tries=2,
                 accept_on_exit_code=[0, 4, 8],
                 env={
                     'item_dir': ItemValue('item_dir'),
                     'item_value': ItemValue('item_value'),
                     'item_type': ItemValue('item_type'),
                     'item_config': ItemValue('item_config'),
Exemple #19
0
            wget_args.extend(['--bind-address', globals()['bind_address']])
            print('')
            print('*** Wget will bind address at {0} ***'.format(
                globals()['bind_address']))
            print('')

        return realize(wget_args, item)

###########################################################################
# Initialize the project.
#
# This will be shown in the warrior management panel. The logo should not
# be too big. The deadline is optional.
project = Project(
    title = 'niconico',
    project_html = '''
    <img class="project-logo" alt="logo" src="https://wiki.archiveteam.org/images/0/02/Niconico_Official_Logo.png" height="50px"/>
    <h2>Niconico <span class="links"><a href="http://www.nicovideo.jp/">Website</a> &middot; <a href="http://tracker.archiveteam.org/niconico/">Leaderboard</a></span></h2>
    ''')

pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker('http://{}/{}/multi={}/'
        .format(TRACKER_HOST, TRACKER_ID, MULTI_ITEM_SIZE),
        downloader, VERSION),
    PrepareDirectories(warc_prefix='niconico'),
    WgetDownload(
        WgetArgs(),
        max_tries=1,
        accept_on_exit_code=[0, 4, 8],
        env={
            'item_dir': ItemValue('item_dir'),
Exemple #20
0
            print('')
            print('*** Wget will bind address at {0} ***'.format(
                globals()['bind_address']))
            print('')

        return realize(wget_args, item)


###########################################################################
# Initialize the project.
#
# This will be shown in the warrior management panel. The logo should not
# be too big. The deadline is optional.
project = Project(title='Google Sites',
                  project_html='''
    <img class="project-logo" alt="logo" src="https://archiveteam.org/images/9/9b/Google-Sites-Icon-2016.png" height="50px"/>
    <h2>sites.google.com <span class="links"><a href="https://sites.google.com/">Website</a> &middot; <a href="http://tracker.archiveteam.org/google-sites/">Leaderboard</a></span></h2>
    ''')

pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker('http://%s/%s' % (TRACKER_HOST, TRACKER_ID),
                       downloader, VERSION),
    PrepareDirectories(warc_prefix='google-sites'),
    WgetDownload(WgetArgs(),
                 max_tries=2,
                 accept_on_exit_code=[0, 4, 8],
                 env={
                     'item_dir': ItemValue('item_dir'),
                     'item_value': ItemValue('item_value'),
                     'item_type': ItemValue('item_type'),
Exemple #21
0
            print('')
            print('*** Wget will bind address at {0} ***'.format(
                globals()['bind_address']))
            print('')

        return realize(wget_args, item)


###########################################################################
# Initialize the project.
#
# This will be shown in the warrior management panel. The logo should not
# be too big. The deadline is optional.
project = Project(title='tinypic',
                  project_html='''
    <img class="project-logo" alt="logo" src="https://www.archiveteam.org/images/7/74/Tinypic-logo.jpg" height="50px"/>
    <h2>tinypic.com <span class="links"><a href="http://www.tinypic.com/">Website</a> &middot; <a href="http://tracker.archiveteam.org/tinypic/">Leaderboard</a></span></h2>
    ''')

pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker('http://%s/%s' % (TRACKER_HOST, TRACKER_ID), downloader,
                       VERSION), PrepareDirectories(warc_prefix='tinypic'),
    WgetDownload(WgetArgs(),
                 max_tries=2,
                 accept_on_exit_code=[0, 4, 8],
                 env={
                     'item_dir': ItemValue('item_dir'),
                     'item_value': ItemValue('item_value'),
                     'item_type': ItemValue('item_type'),
                     'warc_file_base': ItemValue('warc_file_base'),
            print('*** Wget will bind address at {0} ***'.format(
                globals()['bind_address']))
            print('')

        return realize(wget_args, item)


###########################################################################
# Initialize the project.
#
# This will be shown in the warrior management panel. The logo should not
# be too big. The deadline is optional.
project = Project(title="rapidshare",
                  project_html="""
        <img class="project-logo" alt="Project logo" src="http://rapidshare.com/files/251393042" height="50px" title=""/>
        <h2>www.rapidshare.com <span class="links"><a href="https://www.rapidshare.com/">Website</a> &middot; <a href="http://tracker.archiveteam.org/rapidshare/">Leaderboard</a></span></h2>
        <p>Grabbing files from RapidShare.</p>
    """,
                  utc_deadline=datetime.datetime(2015, 3, 31, 23, 59, 0))

pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader,
                       VERSION), PrepareDirectories(warc_prefix="rapidshare"),
    WgetDownload(WgetArgs(),
                 max_tries=1,
                 accept_on_exit_code=[0, 4, 8],
                 env={
                     "item_dir": ItemValue("item_dir"),
                     "item_value": ItemValue("item_value"),
                     "item_type": ItemValue("item_type"),
import datetime

import functools

from seesaw.externalprocess import ExternalProcess
from seesaw.pipeline import Pipeline
from seesaw.project import Project
from seesaw.task import Task, LimitConcurrent
from tornado.ioloop import IOLoop

# Project description for the extras installer. project_html is raw HTML,
# so every element opened inside the <h2> must also be closed inside it.
project = Project(title='Warrior Extras Installer',
                  project_html='''
        <img class="project-logo" alt=""
            src="https://tracker.archiveteam.org/~chfoo/image/200px-Applications-system.svg.png"
            height="50" />
        <h2>Warrior Extras Installer
            <span class="links">
                <a href="https://github.com/ArchiveTeam/warrior-extras-installer">source code</a>
            </span>
        </h2>
        <p>Select this project, when required, to install extra software components required by other projects.</p>
        ''')


class WarningTask(Task):
    def __init__(self):
        Task.__init__(self, 'WarningTask')

    def enqueue(self, item):
        self.start_item(item)
        item.may_be_canceled = True
        item.log_output(
Exemple #24
0
            print('')
            print('*** Wget will bind address at {0} ***'.format(
                globals()['bind_address']))
            print('')

        return realize(wget_args, item)


###########################################################################
# Initialize the project.
#
# This will be shown in the warrior management panel. The logo should not
# be too big. The deadline is optional.
project = Project(
    # Project title, followed by the HTML fragment (logo + heading with
    # website and leaderboard links).
    title="Halo 2",
    project_html="""
    <img class="project-logo" alt="logo" src="https://www.archiveteam.org/images/f/f0/Halo_2_Logo.png" height="50px" />
    <h2>halo.bungie.net <span class="links"><a href="http://halo.bungie.net/">Website</a> &middot; <a href="http://tracker.archiveteam.org/halo2/">Leaderboard</a></span></h2>
    """,
)

pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader,
                       VERSION), PrepareDirectories(warc_prefix="halo2"),
    WgetDownload(WgetArgs(),
                 max_tries=2,
                 accept_on_exit_code=[0, 4, 8],
                 env={}),
    PrepareStatsForTracker(
        defaults={
            "downloader": downloader,
            "version": VERSION
Exemple #25
0
            print('')
            print('*** Wget will bind address at {0} ***'.format(
                globals()['bind_address']))
            print('')

        return realize(wget_args, item)


###########################################################################
# Initialize the project.
#
# This will be shown in the warrior management panel. The logo should not
# be too big. The deadline is optional.
project = Project(
    # Project title, followed by the HTML fragment (logo + heading with
    # website and leaderboard links).
    title='halo',
    project_html='''
    <img class="project-logo" alt="logo" src="https://wiki.archiveteam.org/images/8/80/Bungie_Logo_white_background.png" height="50px"/>
    <h2>halo.bungie.net <span class="links"><a href="https://halo.bungie.net/">Website</a> &middot; <a href="http://tracker.archiveteam.org/halo-new/">Leaderboard</a></span></h2>
    ''',
)

pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker(
        'http://{}/{}/multi={}/'.format(TRACKER_HOST, TRACKER_ID,
                                        MULTI_ITEM_SIZE), downloader, VERSION),
    PrepareDirectories(warc_prefix='halo'),
    WgetDownload(WgetArgs(),
                 max_tries=1,
                 accept_on_exit_code=[0, 4, 8],
                 env={
                     'item_dir': ItemValue('item_dir'),
                     'warc_file_base': ItemValue('warc_file_base'),
Exemple #26
0
            print('')
            print('*** Wget will bind address at {0} ***'.format(
                globals()['bind_address']))
            print('')

        return realize(wget_args, item)


###########################################################################
# Initialize the project.
#
# This will be shown in the warrior management panel. The logo should not
# be too big. The deadline is optional.
# Project descriptor: title plus an HTML fragment whose leaderboard link is
# filled in from TRACKER_HOST / TRACKER_ID via %-formatting.
# NOTE(review): the title is 'yourshot-static' while the HTML describes
# Radio24syv -- confirm which project this descriptor is meant for.
project = Project(
    title='yourshot-static',
    # The links wrapper is a proper <span class="links"> element; the previous
    # markup opened it as <class="links">, which is not a valid tag and left
    # the closing </span> unmatched.
    project_html='''
        <img class="project-logo" alt="Project logo" src="https://www.archiveteam.org/images/2/22/Radio24syv.png" height="50px" title=""/>
        <h2>Radio24syv &middot; <span class="links"><a href="https://www.24syv.dk/">Website</a> &middot; <a href="http://%s/%s/">Leaderboard</a></span></h2>
        <p>Archiving audio from radio24syv archive</p>
    ''' % (TRACKER_HOST, TRACKER_ID),
)

pipeline = Pipeline(
    CheckIP(),
    CheckBan(),
    GetItemFromTracker('http://%s/%s' % (TRACKER_HOST, TRACKER_ID), downloader,
                       VERSION),
    PrepareDirectories(warc_prefix='yourshot-static'),
    WgetDownload(
        WgetArgs(),
        max_tries=0,  # 2,          #changed
        accept_on_exit_code=[0],  # [0, 4, 8],  #changed
        env={
            'item_dir': ItemValue('item_dir'),
Exemple #27
0
            print('')
            print('*** Wget will bind address at {0} ***'.format(
                globals()['bind_address']))
            print('')

        return realize(wget_args, item)

###########################################################################
# Initialize the project.
#
# This will be shown in the warrior management panel. The logo should not
# be too big. The deadline is optional.
# Project descriptor: title plus an HTML fragment with the logo, a heading
# carrying website/leaderboard links, and a one-line description.
project = Project(title='Telegram',
                  project_html='''
        <img class="project-logo" alt="Project logo" src="https://wiki.archiveteam.org/images/thumb/7/7d/Telegram-icon.png/600px-Telegram-icon.png" height="50px" title=""/>
        <h2>telegram.org <span class="links"><a href="https://telegram.org/">Website</a> &middot; <a href="http://tracker.archiveteam.org/telegram/">Leaderboard</a></span></h2>
        <p>Archiving public Telegram channels.</p>
    ''')

pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker('http://{}/{}/multi={}/'
        .format(TRACKER_HOST, TRACKER_ID, MULTI_ITEM_SIZE),
        downloader, VERSION),
    PrepareDirectories(warc_prefix=TRACKER_ID),
    WgetDownload(
        WgetArgs(),
        max_tries=2,
        accept_on_exit_code=[0, 4, 8],
        env={
Exemple #28
0
            print('*** Wget will bind address at {0} ***'.format(
                globals()['bind_address']))
            print('')

        return realize(wget_args, item)


###########################################################################
# Initialize the project.
#
# This will be shown in the warrior management panel. The logo should not
# be too big. The deadline is optional.
project = Project(
    # Title and HTML fragment describing this project.
    title="Justin.tv",
    project_html="""
        <img class="project-logo" alt="Project logo" src="http://archiveteam.org/images/thumb/9/97/Justintv_logo.png/320px-Justintv_logo.png" height="50px" />
        <h2>Justin.tv <span class="links"><a href="http://justin.tv/">Justin.tv</a> &middot; <a href="http://tracker.archiveteam.org/justintv/">Leaderboard</a></span></h2>
        <p>Justin.tv is deleting all archives sometime in the next week.  We DPoS.</p>
    """,
    # Optional deadline, given as a naive datetime (2014-06-08 00:00:00).
    utc_deadline=datetime.datetime(
        year=2014, month=6, day=8, hour=0, minute=0, second=0
    ),
)

pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader,
                       VERSION), PrepareDirectories(warc_prefix="justintv"),
    Bouncer(),
    WgetDownload(WgetArgs(),
                 max_tries=5,
                 accept_on_exit_code=[0, 8],
                 env={"item_dir": ItemValue("item_dir")}),
    PrepareStatsForTracker(
        defaults={
Exemple #29
0
		'pipeline_hash': PIPELINE_SHA1,
		'python_version': sys.version,
	}

	return d


###########################################################################
# Initialize the project.
#
# This will be shown in the warrior management panel. The logo should not
# be too big. The deadline is optional.
# Project descriptor: title plus an HTML fragment (logo placeholder, heading
# with website/leaderboard links, and a short description).
project = Project(title="sourceforgersync", project_html="""
		<img class="project-logo" alt="Project logo" src="" height="50px" title=""/>
		<h2>sourceforge.net <span class="links"><a href="http://sourceforge.net/">Website</a> &middot; <a href="http://tracker.archiveteam.org/sourceforge/">Leaderboard</a></span></h2>
		<p>Saving all project from SourceForge. rsyncing all of the source code repositories.</p>
	""")

pipeline = Pipeline(
	CheckIP(),
	GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader, VERSION),
	ExternalProcess("Size Test",[RSYNC_TEST,"-t",getRsyncURL("foo"),"-m",MAX_RSYNC]),
	LimitConcurrent(1,ExternalProcess("rsync", ["rsync", "-av", getRsyncURL("foo"), cleanItem("%(data_dir)s/%(item_name)s")])),
	ExternalProcess("tar", ["tar", "-czf", cleanItem("%(data_dir)s/%(item_name)s.tar.gz"), "-C", ItemInterpolation("%(data_dir)s/"), "--owner=1999", "--group=2015", "--no-same-permissions", cleanItem("%(item_name)s")]),
	LimitConcurrent(NumberConfigValue(min=1, max=4, default="1",
		name="shared:rsync_threads", title="Rsync threads",
		description="The maximum number of concurrent uploads."),
		UploadWithTracker(
			"http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
Exemple #30
0
            print('')
            print('*** Wget will bind addresss at {0} ***'.format(
                globals()['bind_address']))
            print('')

        return realize(wget_args, item)


###########################################################################
# Initialize the project.
#
# This will be shown in the warrior management panel. The logo should not
# be too big. The deadline is optional.
project = Project(
    # Project title, followed by the HTML fragment (logo + heading with
    # website and leaderboard links).
    title='mediafire',
    project_html='''
    <img class="project-logo" alt="logo" src="https://archiveteam.org/images/thumb/8/8b/Mediafire-icon.png/320px-Mediafire-icon.png" height="50px"/>
    <h2>mediafire.com <span class="links"><a href="https://mediafire.com/">Website</a> &middot; <a href="http://tracker.archiveteam.org/mediafire/">Leaderboard</a></span></h2>
    ''',
)

pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker('http://%s/%s' % (TRACKER_HOST, TRACKER_ID), downloader,
                       VERSION), PrepareDirectories(warc_prefix='mediafire'),
    WgetDownload(WgetArgs(),
                 max_tries=1,
                 accept_on_exit_code=[0, 4, 8],
                 env={
                     'item_dir': ItemValue('item_dir'),
                     'item_type': ItemValue('item_type'),
                     'item_value': ItemValue('item_value'),
                     'warc_file_base': ItemValue('warc_file_base'),