Esempio n. 1
0
    <img class="project-logo" alt="logo" src="https://archiveteam.org/images/a/af/Kinja-icon.png" height="50px"/>
    <h2>kinja.com <span class="links"><a href="https://kinja.com/">Website</a> &middot; <a href="http://tracker.archiveteam.org/kinja/">Leaderboard</a></span></h2>
    '''
)

pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker('https://%s/%s' % (TRACKER_HOST, TRACKER_ID), downloader,
        VERSION),
    PrepareDirectories(warc_prefix='kinja'),
    WgetDownload(
        WgetArgs(),
        max_tries=2,
        accept_on_exit_code=[0, 4, 8],
        env={
            'item_dir': ItemValue('item_dir'),
            'item_value': ItemValue('item_value'),
            'item_type': ItemValue('item_type'),
            'warc_file_base': ItemValue('warc_file_base'),
        }
    ),
    PrepareStatsForTracker(
        defaults={'downloader': downloader, 'version': VERSION},
        file_groups={
            'data': [
                ItemInterpolation('%(item_dir)s/%(warc_file_base)s.warc.gz')
            ]
        },
        id_function=stats_id_function,
    ),
    MoveFiles(),
Esempio n. 2
0
        <img class="project-logo" alt="Project logo" src="http://archiveteam.org/images/6/68/Twitpic-logo.png" height="50px" title=""/>
        <h2>twitpic.com <span class="links"><a href="http://twitpic.com/">Website</a> &middot; <a href="http://tracker.archiveteam.org/twitpic/">Leaderboard</a></span></h2>
        <p>Archiving images and webpages from twitpic.com.</p>
        <p class="projectBroadcastMessage">Please use only concurrency of 1 to avoid overloading Twitpic.</p>
    """,
                  utc_deadline=datetime.datetime(2014, 9, 25, 23, 59, 0))

pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader,
                       VERSION), PrepareDirectories(warc_prefix="twitpic"),
    WgetDownload(WgetArgs(),
                 max_tries=2,
                 accept_on_exit_code=[0, 4, 7, 8],
                 env={
                     "item_dir": ItemValue("item_dir"),
                     "item_value": ItemValue("item_value"),
                     "item_type": ItemValue("item_type"),
                     "downloader": downloader
                 }),
    PrepareStatsForTracker(
        defaults={
            "downloader": downloader,
            "version": VERSION
        },
        file_groups={
            "data":
            [ItemInterpolation("%(item_dir)s/%(warc_file_base)s.warc.gz")]
        },
        id_function=stats_id_function,
    ), MoveFiles(),
Esempio n. 3
0
# Project descriptor for the NUjij grab: a short title plus an HTML banner
# containing the logo, links to the website and the tracker leaderboard, and
# a one-line summary.  The HTML literal is handed over verbatim, so its
# internal whitespace is intentionally left exactly as written.
# NOTE(review): presumably rendered by the seesaw web interface -- confirm.
project = Project(
    title="nujij",
    project_html="""
        <img class="project-logo" alt="Project logo" src="http://archiveteam.org/images/f/f4/Nujij-logo.png" height="50px" title=""/>
        <h2>www.nujij.nl <span class="links"><a href="http://www.nujij.nl/">Website</a> &middot; <a href="http://tracker.archiveteam.org/nujij/">Leaderboard</a></span></h2>
        <p>Archiving all articles from NUjij.nl.</p>
    """,
)

pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader,
                       VERSION), PrepareDirectories(warc_prefix="nujij"),
    WgetDownload(WgetArgs(),
                 max_tries=2,
                 accept_on_exit_code=[0, 4, 8],
                 env={
                     "item_dir": ItemValue("item_dir"),
                     "item_value": ItemValue("item_value"),
                     "item_type": ItemValue("item_type"),
                     "warc_file_base": ItemValue("warc_file_base"),
                 }),
    PrepareStatsForTracker(
        defaults={
            "downloader": downloader,
            "version": VERSION
        },
        file_groups={
            "data": [
                ItemInterpolation("%(item_dir)s/%(warc_file_base)s.warc.gz"),
                ItemInterpolation("%(item_dir)s/%(warc_file_base)s_data.txt")
            ]
        },
Esempio n. 4
0
# monitoring.pipeline_id() is unpacked as exactly four items; only the last
# one is kept (it is passed to GetItemFromQueue below), the first three are
# discarded into `_`.
_, _, _, pipeline_id = monitoring.pipeline_id()

pipeline = Pipeline(
    GetItemFromQueue(control, pipeline_id, ao_only=env.get('AO_ONLY')),
    StartHeartbeat(control),
    SetFetchDepth(),
    PreparePaths(),
    WriteInfo(),
    DownloadUrlFile(control),
    WgetDownload(
        WpullArgs(default_user_agent=DEFAULT_USER_AGENT, wpull_exe=WPULL_EXE,
                  phantomjs_exe=PHANTOMJS),
        accept_on_exit_code=AcceptAny(),
        env={
            'ITEM_IDENT': ItemInterpolation('%(ident)s'),
            'LOG_KEY': ItemInterpolation('%(log_key)s'),
            'REDIS_URL': REDIS_URL,
            'PATH': os.environ['PATH']
        }
    ),
    RelabelIfAborted(control),
    WriteInfo(),
    MoveFiles(),
    SetWarcFileSizeInRedis(control),
    LimitConcurrent(2,
        RsyncUpload(
            target = RSYNC_URL,
            target_source_path = ItemInterpolation("%(data_dir)s"),
            files=ItemValue("all_target_files"),
            extra_args = [