<img class="project-logo" alt="logo" src="https://wiki.archiveteam.org/images/Archiveteamsmall.png?959ea" height="50px"/>
    <h2>Super Mario Maker Bookmarks <span class="links"><a href="https://supermariomakerbookmark.nintendo.net/">Website</a> &middot; <a href="http://tracker.archiveteam.org/super-mario-maker-bookmarks/">Leaderboard</a></span></h2>
    ''',
                  utc_deadline=datetime.datetime.fromtimestamp(1617148800))

pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker(
        'http://{}/{}/multi={}/'.format(TRACKER_HOST, TRACKER_ID,
                                        MULTI_ITEM_SIZE), downloader, VERSION),
    PrepareDirectories(warc_prefix='super-mario-maker-bookmarks'),
    WgetDownload(WgetArgs(),
                 max_tries=1,
                 accept_on_exit_code=[0, 4, 8],
                 env={
                     'item_dir': ItemValue('item_dir'),
                     'warc_file_base': ItemValue('warc_file_base'),
                     'item_name_newline': ItemValue('item_name_newline'),
                 }),
    PrepareStatsForTracker(
        defaults={
            'downloader': downloader,
            'version': VERSION
        },
        file_groups={
            'data':
            [ItemInterpolation('%(item_dir)s/%(warc_file_base)s.warc.gz')]
        },
        id_function=stats_id_function,
    ), MoveFiles(),
    LimitConcurrent(
Ejemplo n.º 2
0
         </span>
        </h2>
        <p>Quizilla shuts down. This is phase 1: content discovery.</p>
    """,
                  utc_deadline=datetime.datetime(2014, 10, 1, 23, 59, 0))

pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
                       downloader, VERSION),
    PrepareDirectories(warc_prefix="quizilladisco"),
    ExternalProcess('Scraper',
                    CustomProcessArgs(),
                    max_tries=2,
                    accept_on_exit_code=[0],
                    env={"item_dir": ItemValue("item_dir")}),
    PrepareStatsForTracker(
        defaults={
            "downloader": downloader,
            "version": VERSION
        },
        file_groups={
            "data":
            [ItemInterpolation("%(item_dir)s/%(warc_file_base)s.txt.gz")]
        },
        id_function=stats_id_function,
    ), MoveFiles(),
    LimitConcurrent(
        NumberConfigValue(
            min=1,
            max=4,
Ejemplo n.º 3
0
                  project_html="""
        <img class="project-logo" alt="Project logo" src="http://archiveteam.org/images/thumb/b/bd/Great_Seal_of_the_United_States.png/240px-Great_Seal_of_the_United_States.png" height="50px" title=""/>
        <h2>archives.gov <span class="links"><a href="http://archives.gov/">Website</a> &middot; <a href="http://tracker.archiveteam.org/archives-gov/">Leaderboard</a></span></h2>
        <p>Archiving data from archives.gov.</p>
    """)

pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader,
                       VERSION),
    PrepareDirectories(warc_prefix="archives-gov"),
    WgetDownload(WgetArgs(),
                 max_tries=2,
                 accept_on_exit_code=[0, 4, 8],
                 env={
                     "item_dir": ItemValue("item_dir"),
                     "item_value": ItemValue("item_value"),
                     "item_type": ItemValue("item_type"),
                     "warc_file_base": ItemValue("warc_file_base"),
                 }),
    Deduplicate(),
    PrepareStatsForTracker(
        defaults={
            "downloader": downloader,
            "version": VERSION
        },
        file_groups={
            "data": [
                ItemInterpolation(
                    "%(item_dir)s/%(warc_file_base)s-deduplicated.warc.gz")
            ]
Ejemplo n.º 4
0
 </span>
</h2>
    '''
)

pipeline = Pipeline(
    CheckIP(),
    CheckBan(),
    GetItemFromTracker('http://%s/%s' % (TRACKER_HOST, TRACKER_ID), downloader, VERSION),
    PrepareDirectories(warc_prefix='yourshot-static'),
    WgetDownload(
        WgetArgs(),
        max_tries=0,              # 2,          #changed
        accept_on_exit_code=[0],  # [0, 4, 8],  #changed
        env={
            'item_dir': ItemValue('item_dir'),
            'item_value': ItemValue('item_value'),
            'item_type': ItemValue('item_type'),
            'warc_file_base': ItemValue('warc_file_base'),
            'todo_url_count': ItemValue('todo_url_count'),
        }
    ),
    PrepareStatsForTracker(
        defaults={'downloader': downloader, 'version': VERSION},
        file_groups={
            'data': [
                ItemInterpolation('%(item_dir)s/%(warc_file_base)s.warc.gz')  #TODO ?
            ]
        },
        id_function=stats_id_function,
    ),
Ejemplo n.º 5
0
  <a href="http://{0}/{1}/">Leaderboard</a>
 </span>
</h2>
    '''.format(TRACKER_HOST, TRACKER_ID))

pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker('http://%s/%s' % (TRACKER_HOST, TRACKER_ID), downloader,
                       VERSION),
    PrepareDirectories(warc_prefix='8tracks'),
    WgetDownload(
        WgetArgs(),
        max_tries=1,
        accept_on_exit_code=[0],  #, 4, 8],
        env={
            'item_dir': ItemValue('item_dir'),
            'item_value': ItemValue('item_value'),
            'item_type': ItemValue('item_type'),
            'warc_file_base': ItemValue('warc_file_base'),
            'downloader': downloader,
            'url_count_target': ItemValue('url_count_target'),
        }),
    PrepareStatsForTracker(
        defaults={
            'downloader': downloader,
            'version': VERSION
        },
        file_groups={
            'data':
            [ItemInterpolation('%(item_dir)s/%(warc_file_base)s.warc.gz')]
        },
Ejemplo n.º 6
0
# Project descriptor for the friendfeed grab: a title plus an HTML blurb
# (logo, website/leaderboard links, one-line description).
# NOTE(review): presumably rendered by the seesaw web UI -- confirm.
project = Project(
    title="friendfeed",
    project_html="""
        <img class="project-logo" alt="Project logo" src="http://archiveteam.org/images/8/83/Friendfeed_logo.png" height="50px" title=""/>
        <h2>friendfeed.com <span class="links"><a href="http://friendfeed.com/">Website</a> &middot; <a href="http://tracker.archiveteam.org/friendfeed/">Leaderboard</a></span></h2>
        <p>Grabbing all accounts from friendfeed.com.</p>
    """,
)

pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader,
                       VERSION), PrepareDirectories(warc_prefix="friendfeed"),
    WgetDownload(WgetArgs(),
                 max_tries=2,
                 accept_on_exit_code=[0, 4, 8],
                 env={
                     "item_dir": ItemValue("item_dir"),
                     "item_value": ItemValue("item_value"),
                     "item_type": ItemValue("item_type"),
                 }),
    PrepareStatsForTracker(
        defaults={
            "downloader": downloader,
            "version": VERSION
        },
        file_groups={
            "data":
            [ItemInterpolation("%(item_dir)s/%(warc_file_base)s.warc.gz")]
        },
        id_function=stats_id_function,
    ), MoveFiles(),
    LimitConcurrent(
Ejemplo n.º 7
0
            <a href="http://tracker.archiveteam.org/ftp/">Leaderboard</a></span></h2>
        <p>Archiving all FTPs!</p>
    """
)

pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader,
                       VERSION),
    PrepareDirectories(warc_prefix="ftp"),
    WgetDownload(
        WgetArgs(),
        max_tries=1,
        accept_on_exit_code=[0, 8],
        env={
            "item_dir": ItemValue("item_dir"),
            "item_item": ItemValue("item_item"),
            "downloader": downloader
        }
    ),
    PrepareStatsForTracker(
        defaults={"downloader": downloader, "version": VERSION},
        file_groups={
            "data": [
                ItemInterpolation("%(item_dir)s/%(warc_file_base)s.warc.gz"),
            ]
        },
        id_function=stats_id_function,
    ),
    MoveFiles(),
    LimitConcurrent(
Ejemplo n.º 8
0
        <p>Archiving public Telegram channels.</p>
    '''
)

pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker('http://{}/{}/multi={}/'
        .format(TRACKER_HOST, TRACKER_ID, MULTI_ITEM_SIZE),
        downloader, VERSION),
    PrepareDirectories(warc_prefix=TRACKER_ID),
    WgetDownload(
        WgetArgs(),
        max_tries=2,
        accept_on_exit_code=[0, 4, 8],
        env={
            'item_dir': ItemValue('item_dir'),
            'warc_file_base': ItemValue('warc_file_base')
        }
    ),
    SetBadUrls(),
    PrepareStatsForTracker(
        defaults={'downloader': downloader, 'version': VERSION},
        file_groups={
            'data': [
                ItemInterpolation('%(item_dir)s/%(warc_file_base)s.warc.zst')
            ]
        },
        id_function=stats_id_function,
    ),
    MoveFiles(),
    LimitConcurrent(NumberConfigValue(min=1, max=20, default='20',
Ejemplo n.º 9
0
        <h2>twitpic.com <span class="links"><a href="http://twitpic.com/">Website</a> &middot; <a href="http://tracker.archiveteam.org/twitpic2/">Leaderboard</a></span></h2>
        <p>Saving TwitPic's smoldering remains.</p>
        <!--<p class="projectBroadcastMessage"></p>-->
    """,
    # utc_deadline=datetime.datetime(2014, 9, 25, 23, 59, 0)
)

pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader,
                       VERSION), PrepareDirectories(warc_prefix="twitpic2"),
    WgetDownload(WgetArgs(),
                 max_tries=2,
                 accept_on_exit_code=[0, 4, 7, 8],
                 env={
                     "item_dir": ItemValue("item_dir"),
                     "item_value": ItemValue("item_value"),
                     "item_type": ItemValue("item_type"),
                     "escaped_item_name": ItemValue("escaped_item_name"),
                     "downloader": downloader
                 }), ProcessScrapeFile(),
    PrepareStatsForTracker(
        defaults={
            "downloader": downloader,
            "version": VERSION
        },
        file_groups={
            "data": [
                ItemInterpolation("%(item_dir)s/%(warc_file_base)s.warc.gz"),
                ItemInterpolation(
                    "%(item_dir)s/twitpic2-scrape-%(escaped_item_name)s.txt.gz"
Ejemplo n.º 10
0
    <p><b>Canv.as</b> is closed.</p>
    """ % (TRACKER_HOST, TRACKER_ID)
    , utc_deadline=datetime.datetime(2014, 03, 03, 00, 00, 1)
)

pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader,
        VERSION),
    PrepareDirectories(warc_prefix="canvas"),
    WgetDownload(
        WgetArgs(),
        max_tries=5,
        accept_on_exit_code=[0, 8],
        env={
            'item_type': ItemValue("item_type"),
            'item_data': ItemValue("item_data"),
            'item_dir': ItemValue("item_dir"),
        }
    ),
    PrepareStatsForTracker(
        defaults={"downloader": downloader, "version": VERSION},
        file_groups={
            "data": [
                ItemInterpolation("%(item_dir)s/%(warc_file_base)s.warc.gz")
            ]
        },
        id_function=stats_id_function,
    ),
    MoveFiles(),
    LimitConcurrent(NumberConfigValue(min=1, max=4, default="1",
Ejemplo n.º 11
0
                  project_html="""
        <img class="project-logo" alt="Project logo" src="http://rapidshare.com/files/251393042" height="50px" title=""/>
        <h2>www.rapidshare.com <span class="links"><a href="https://www.rapidshare.com/">Website</a> &middot; <a href="http://tracker.archiveteam.org/rapidshare/">Leaderboard</a></span></h2>
        <p>Grabbing files from RapidShare.</p>
    """,
                  utc_deadline=datetime.datetime(2015, 3, 31, 23, 59, 0))

pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader,
                       VERSION), PrepareDirectories(warc_prefix="rapidshare"),
    WgetDownload(WgetArgs(),
                 max_tries=1,
                 accept_on_exit_code=[0, 4, 8],
                 env={
                     "item_dir": ItemValue("item_dir"),
                     "item_value": ItemValue("item_value"),
                     "item_type": ItemValue("item_type"),
                     "item_id": ItemValue("item_id"),
                     "item_keyword": ItemValue("item_keyword"),
                 }),
    PrepareStatsForTracker(
        defaults={
            "downloader": downloader,
            "version": VERSION
        },
        file_groups={
            "data":
            [ItemInterpolation("%(item_dir)s/%(warc_file_base)s.warc.gz")]
        },
        id_function=stats_id_function,
Ejemplo n.º 12
0
        <p>This is an example wpull project</p>
        <!--<p class="projectBroadcastMessage"></p>-->
    """,
                  utc_deadline=datetime.datetime(2000, 1, 1, 23, 59, 0))

pipeline = Pipeline(
    CheckIP(),
    # GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader,
    #                    VERSION),
    SetItemKey("item_name", "smaug.fart.website:8080"),
    PrepareDirectories(warc_prefix="examplecity"),
    WgetDownload(WgetArgs(),
                 max_tries=2,
                 accept_on_exit_code=[0, 4, 7, 8],
                 env={
                     "item_dir": ItemValue("item_dir"),
                     "downloader": downloader
                 }),
    PrepareStatsForTracker(
        defaults={
            "downloader": downloader,
            "version": VERSION
        },
        file_groups={
            "data": [
                ItemInterpolation("%(item_dir)s/%(warc_file_base)s.warc.gz"),
            ]
        },
        id_function=stats_id_function,
    ),
    MoveFiles(),
Ejemplo n.º 13
0
        <h2>Twitch Phase 1: Content Discovery. <span class="links"><a href="http://twitch.tv/">Website</a> &middot; <a href="http://tracker.archiveteam.org/twitchdisco/">Leaderboard</a></span></h2>
        <p>Twitch is releasing videos from their PC. <a href="https://archive.org/donate/">Donate to IA for disk space!</a></p>
    """,
    utc_deadline=datetime.datetime(2014, 8, 20, 23, 59, 0)
)

pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader,
        VERSION),
    PrepareDirectories(warc_prefix="twitchdisco"),
    ExternalProcess('Scraper', CustomProcessArgs(),
        max_tries=2,
        accept_on_exit_code=[0],
        env={
            "item_dir": ItemValue("item_dir")
        }
    ),
    PrepareStatsForTracker(
        defaults={"downloader": downloader, "version": VERSION},
        file_groups={
            "data": [
                ItemInterpolation("%(item_dir)s/%(warc_file_base)s.txt.gz")
            ]
        },
        id_function=stats_id_function,
    ),
    MoveFiles(),
    LimitConcurrent(NumberConfigValue(min=1, max=4, default="1",
        name="shared:rsync_threads", title="Rsync threads",
        description="The maximum number of concurrent uploads."),
Ejemplo n.º 14
0
    """,
    # utc_deadline=datetime.datetime(2000, 1, 1, 23, 59, 0)
)

pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader,
                       VERSION), PrepareDirectories(warc_prefix="furaffinity"),
    ExternalProcess(
        'Begin',
        [sys.executable, 'helper.py', 'begin'],
        env={
            'user_agent': user_agent,
            'bind_address': globals().get('bind_address', ''),
            'disco_tracker': DISCO_TRACKER_URL,
            "item_dir": ItemValue("item_dir"),
        },
        accept_on_exit_code=[0],
    ),
    LimitConcurrent(
        NumberConfigValue(
            min=1,
            max=6,
            default=globals().get("num_procs", "1"),
            name="shared:fagrab:num_procs",
            title="Number of Processes",
            description="The maximum number of concurrent download processes."
        ),
        WgetDownload(WgetArgs(),
                     max_tries=1,
                     accept_on_exit_code=[0, 4, 7, 8],
Ejemplo n.º 15
0
                     ao_only=env.get('AO_ONLY'),
                     large=env.get('LARGE')), StartHeartbeat(control),
    SetFetchDepth(), PreparePaths(), WriteInfo(), DownloadUrlFile(control),
    WgetDownload(wpull_args,
                 accept_on_exit_code=AcceptAny(),
                 env={
                     'ITEM_IDENT': ItemInterpolation('%(ident)s'),
                     'LOG_KEY': ItemInterpolation('%(log_key)s'),
                     'REDIS_URL': REDIS_URL,
                     'PATH': os.environ['PATH']
                 }), RelabelIfAborted(control), WriteInfo(), MoveFiles(),
    LimitConcurrent(
        2,
        RsyncUpload(target=RSYNC_URL,
                    target_source_path=ItemInterpolation("%(data_dir)s"),
                    files=ItemValue("all_target_files"),
                    extra_args=['--partial', '--partial-dir', '.rsync-tmp'])),
    StopHeartbeat(), MarkItemAsDone(control, EXPIRE_TIME))


def stop_control():
    """Unregister this pipeline (``pipeline_id``) from the ``control`` object.

    Takes no arguments and returns nothing; it reads the module-level
    ``control`` and ``pipeline_id`` names.
    """
    # NOTE(review): the logging-thread termination call below is disabled;
    # the reason is not visible in this excerpt.
    #control.flag_logging_thread_for_termination()
    control.unregister_pipeline(pipeline_id)


# Add stop_control to the pipeline's on_cleanup handlers.
# NOTE(review): presumably invoked when the pipeline is cleaned up -- confirm
# against the seesaw Pipeline implementation.
pipeline.on_cleanup += stop_control

# Initial value of the pipeline's running_status attribute.
pipeline.running_status = "Running"


def status_running():
Ejemplo n.º 16
0
                  project_html="""
        <img class="project-logo" alt="Project logo" src="http://archiveteam.org/images/thumb/0/03/Flick_logo_black.png/320px-Flick_logo_black.png" height="50px" title=""/>
        <h2>flickr.com <span class="links"><a href="http://flickr.com/">Website</a> &middot; <a href="http://tracker.archiveteam.org/flickr/">Leaderboard</a></span></h2>
        <p>Archiving CC photos from flickr.</p>
    """)

pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader,
                       VERSION),
    PrepareDirectories(warc_prefix="flickr"),
    WgetDownload(WgetArgs(),
                 max_tries=2,
                 accept_on_exit_code=[0, 4, 8],
                 env={
                     "item_dir": ItemValue("item_dir"),
                     "item_value": ItemValue("item_value"),
                     "item_type": ItemValue("item_type"),
                     'warc_file_base': ItemValue('warc_file_base')
                 }),
    Deduplicate(),
    PrepareStatsForTracker(
        defaults={
            "downloader": downloader,
            "version": VERSION
        },
        file_groups={
            "data": [
                ItemInterpolation(
                    "%(item_dir)s/%(warc_file_base)s-deduplicated.warc.gz")
            ]
Ejemplo n.º 17
0
            "data":
            [ItemInterpolation("%(item_dir)s/%(warc_file_base)s.warc.gz")]
        },
        id_function=stats_id_function,
    ), MoveFiles(),
    LimitConcurrent(
        NumberConfigValue(
            min=1,
            max=20,
            default="20",
            name="shared:rsync_threads",
            title="Rsync threads",
            description="The maximum number of concurrent uploads."),
        UploadWithTracker(
            "http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
            downloader=downloader,
            version=VERSION,
            files=[
                ItemInterpolation("%(data_dir)s/%(warc_file_base)s.warc.gz")
            ],
            rsync_target_source_path=ItemInterpolation("%(data_dir)s/"),
            rsync_extra_args=[
                "--recursive",
                "--partial",
                "--partial-dir",
                ".rsync-tmp",
            ]),
    ),
    SendDoneToTracker(tracker_url="http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
                      stats=ItemValue("stats")))
Ejemplo n.º 18
0
            [ItemInterpolation('%(item_dir)s/%(warc_file_base)s.warc.gz')]
        },
        id_function=stats_id_function,
    ), MoveFiles(),
    LimitConcurrent(
        NumberConfigValue(
            min=1,
            max=20,
            default='2',
            name='shared:rsync_threads',
            title='Rsync threads',
            description='The maximum number of concurrent uploads.'),
        UploadWithTracker(
            'http://%s/%s' % (TRACKER_HOST, TRACKER_ID),
            downloader=downloader,
            version=VERSION,
            files=[
                ItemInterpolation('%(data_dir)s/%(warc_file_base)s.warc.gz'),
                ItemInterpolation(
                    '%(data_dir)s/%(warc_file_base)s-tail.warc.gz'),
                ItemInterpolation('%(data_dir)s/%(warc_file_base)s_data.txt')
            ],
            rsync_target_source_path=ItemInterpolation('%(data_dir)s/'),
            rsync_extra_args=[
                '--recursive', '--partial', '--partial-dir', '.rsync-tmp',
                '--min-size', '1', '--no-compress', '--compress-level', '0'
            ]),
    ),
    SendDoneToTracker(tracker_url='http://%s/%s' % (TRACKER_HOST, TRACKER_ID),
                      stats=ItemValue('stats')))
Ejemplo n.º 19
0
# be too big. The deadline is optional.
project = Project(
    # Short project name.
    title='Bitbucket',
    # HTML blurb: logo plus website and leaderboard links. No deadline is set.
    project_html='''
    <img class="project-logo" alt="logo" src="https://www.archiveteam.org/images/7/70/Bitbucket-atlassian-logo.png" height="50px"/>
    <h2>bitbucket.org <span class="links"><a href="https://bitbucket.org/">Website</a> &middot; <a href="http://tracker.archiveteam.org/bitbucket/">Leaderboard</a></span></h2>
    ''',
)

pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker('http://%s/%s' % (TRACKER_HOST, TRACKER_ID), downloader,
                       VERSION), PrepareDirectories(warc_prefix='bitbucket'),
    WgetDownload(WgetArgs(),
                 max_tries=2,
                 accept_on_exit_code=[0, 4, 8],
                 env={
                     'item_dir': ItemValue('item_dir'),
                     'item_value': ItemValue('item_value'),
                     'item_type': ItemValue('item_type'),
                     'warc_file_base': ItemValue('warc_file_base'),
                 }),
    PrepareStatsForTracker(
        defaults={
            'downloader': downloader,
            'version': VERSION
        },
        file_groups={
            'data':
            [ItemInterpolation('%(item_dir)s/%(warc_file_base)s.warc.zst')]
        },
        id_function=stats_id_function,
    ), MoveFiles(),
Ejemplo n.º 20
0
            max=4,
            default="2",
            name="shared:rsync_threads",
            title="Rsync threads",
            description="The maximum number of concurrent uploads."),
        # this upload task asks the tracker for an upload target
        # this can be HTTP or rsync and can be changed in the tracker admin panel
        UploadWithTracker(
            TRACKER_URL,
            downloader=downloader,
            version=VERSION,
            # list the files that should be uploaded.
            # this may include directory names.
            # note: HTTP uploads will only upload the first file on this list
            files=[
                ItemInterpolation("%(data_dir)s/%(warc_file_base)s.hrefs.bz2"),
                ItemInterpolation(
                    "%(data_dir)s/%(warc_file_base)s.cooked.warc.gz")
            ],
            # the relative path for the rsync command
            # (this defines if the files are uploaded to a subdirectory on the server)
            rsync_target_source_path=ItemInterpolation("%(data_dir)s/"),
            # extra rsync parameters (probably standard)
            rsync_extra_args=[
                "--recursive", "--partial", "--partial-dir", ".rsync-tmp"
            ]),
    ),

    # if the item passed every task, notify the tracker and report the statistics
    SendDoneToTracker(tracker_url=TRACKER_URL, stats=ItemValue("stats")))