# NOTE(review): collapsed fragment of a seesaw Pipeline(...) task list — the
# Pipeline opening and closing are outside this view, and the original line
# breaks appear lost in a paste (TODO restore formatting from the source repo).
# Visible stages: tail of GetItemFromTracker -> PrepareDirectories('halo') ->
# WgetDownload (1 try, exit codes 0/4/8 accepted, item values passed via env) ->
# SetBadUrls -> PrepareStatsForTracker (reports the item's .warc.gz as group
# 'data') -> MoveFiles -> LimitConcurrent('shared:rsync_threads', max 20,
# default 2) wrapping the start of an UploadWithTracker call.
MULTI_ITEM_SIZE), downloader, VERSION), PrepareDirectories(warc_prefix='halo'), WgetDownload(WgetArgs(), max_tries=1, accept_on_exit_code=[0, 4, 8], env={ 'item_dir': ItemValue('item_dir'), 'warc_file_base': ItemValue('warc_file_base'), 'item_name_newline': ItemValue('item_name_newline') }), SetBadUrls(), PrepareStatsForTracker( defaults={ 'downloader': downloader, 'version': VERSION }, file_groups={ 'data': [ItemInterpolation('%(item_dir)s/%(warc_file_base)s.warc.gz')] }, id_function=stats_id_function, ), MoveFiles(), LimitConcurrent( NumberConfigValue( min=1, max=20, default='2', name='shared:rsync_threads', title='Rsync threads', description='The maximum number of concurrent uploads.'), UploadWithTracker( 'http://%s/%s' % (TRACKER_HOST, TRACKER_ID),
# NOTE(review): collapsed mid-pipeline fragment (enclosing Pipeline(...) is
# outside this view; line breaks look lost — TODO restore formatting).
# Visible stages: a LimitConcurrent governed by 'shared:dedupe_threads'
# (max 20, default 1) around DeduplicateWarcExtProc, then
# PrepareStatsForTracker reporting the '-deduplicated.warc.gz' as group
# 'data', MoveFiles, and the start of a second LimitConcurrent
# ('shared:rsync_threads', max 4, default 1) around UploadWithTracker.
NumberConfigValue( min=1, max=20, default="1", name="shared:dedupe_threads", title="Deduplicate threads", description="The maximum number of concurrent dedupes."), DeduplicateWarcExtProc(), ), PrepareStatsForTracker( defaults={ "downloader": downloader, "version": VERSION }, file_groups={ "data": [ ItemInterpolation( "%(item_dir)s/%(warc_file_base)s-deduplicated.warc.gz") ] }, id_function=stats_id_function, ), MoveFiles(), LimitConcurrent( NumberConfigValue( min=1, max=4, default="1", name="shared:rsync_threads", title="Rsync threads", description="The maximum number of concurrent uploads."), UploadWithTracker(
# NOTE(review): collapsed mid-pipeline fragment (enclosing call outside this
# view). If this really is one physical line, the first '#' ('# 2, #changed')
# comments out the remainder — the original line breaks were evidently lost;
# TODO restore from the source repo. Visible intent: a download step driven by
# YgaArgs() whose max_tries (1, was 2) and accept_on_exit_code ([0], was
# [0, 4, 8]) were tightened per the '#changed' markers, then MoveFiles,
# PrepareStatsForTracker over '%(data_dir)s/%(warc_file_base)s.warc.gz'
# (flagged '#TODO ?' by the original author), and a rate-limited
# UploadWithTracker rsyncing from '%(data_dir)s/'.
YgaArgs(), max_tries=1, # 2, #changed accept_on_exit_code=[0], # [0, 4, 8], #changed env={ 'item_dir': ItemValue('item_dir'), 'item_value': ItemValue('item_value'), 'item_type': ItemValue('item_type'), 'warc_file_base': ItemValue('warc_file_base'), } ), MoveFiles(), PrepareStatsForTracker( defaults={'downloader': downloader, 'version': VERSION}, # noqa: F821 file_groups={ 'data': [ ItemInterpolation('%(data_dir)s/%(warc_file_base)s.warc.gz') #TODO ? ] }, id_function=stats_id_function, ), LimitConcurrent(NumberConfigValue(min=1, max=20, default='2', name='shared:rsync_threads', title='Rsync threads', description='The maximum number of concurrent uploads.'), UploadWithTracker('http://%s/%s' % (TRACKER_HOST, TRACKER_ID), downloader=downloader, # noqa: F821 version=VERSION, files=ItemValue('files'), rsync_target_source_path=ItemInterpolation('%(data_dir)s/'), rsync_extra_args=[ '--recursive', '--partial',
""", utc_deadline=datetime.datetime(2013, 12, 26, 0, 0, 1)) pipeline = Pipeline( GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader, VERSION), PrepareDirectories(warc_prefix='yahooblog'), WgetDownload( wget_args, max_tries=2, accept_on_exit_code=[0, 8], ), PrepareStatsForTracker( defaults={ "downloader": downloader, "version": VERSION }, file_groups={ "data": [ItemInterpolation("%(item_dir)s/%(warc_file_base)s.warc.gz")] }), MoveFiles(), LimitConcurrent( NumberConfigValue( min=1, max=4, default="1", name="shared:rsync_threads", title="Rsync threads", description="The maximum number of concurrent uploads."), UploadWithTracker( "http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader=downloader,
#
# This will be shown in the warrior management panel. The logo should not
# be too big. The deadline is optional.
project = Project(
    title='load',
    project_html='''
<img class="project-logo" alt="Project logo" src="https://www.archiveteam.org/images/b/b5/Reddit_logo.png" height="50px" title=""/>
<h2>reddit.com <span class="links"><a href="https://reddit.com/">Website</a> · <a href="http://tracker.archiveteam.org/reddit/">Leaderboard</a></span></h2>
<p>Archiving everything from reddit.</p>
''',
)

# Minimal pipeline: claim an item from the tracker, report empty stats
# ('/dev/null' as the sole 'data' file), and mark the item done. No
# download/upload stages are present in this pipeline.
pipeline = Pipeline(
    GetItemFromTracker(
        'http://{0}/{1}'.format(TRACKER_HOST, TRACKER_ID),
        downloader,
        VERSION,
    ),
    PrepareStatsForTracker(
        defaults={'downloader': downloader, 'version': VERSION},
        file_groups={'data': ['/dev/null']},
        id_function=stats_id_function,
    ),
    SendDoneToTracker(
        tracker_url='http://{0}/{1}'.format(TRACKER_HOST, TRACKER_ID),
        stats=ItemValue('stats'),
    ),
)
# NOTE(review): collapsed fragment of a heavily-annotated example pipeline
# (the enclosing Pipeline(...) continues past this view, and the embedded '#'
# comments show the original line breaks were lost — TODO restore formatting).
# Visible stages: CookWARC() (per the original comment, it gunzips HTTP
# responses inside the WARC while the WARC's own .gz stays), then
# PrepareStatsForTracker with two file groups — 'data' (the .cooked.warc.gz)
# and 'hrefs' (the .hrefs.bz2) — and an id_function sending the item's
# user_agent as 'ua'. The trailing comments introduce a LimitConcurrent
# upload wrapper that begins after this fragment.
# gunzipped HTTP responses. Note that the .gz compression on the WARC # itself remains. CookWARC(), # this will set the item["stats"] string that is sent to the tracker (see below) PrepareStatsForTracker( # there are a few normal values that need to be sent defaults={ "downloader": downloader, "version": VERSION }, # this is used for the size counter on the tracker: # the groups should correspond with the groups set configured on the tracker file_groups={ # there can be multiple groups with multiple files # file sizes are measured per group "data": [ ItemInterpolation( "%(data_dir)s/%(warc_file_base)s.cooked.warc.gz") ], "hrefs": [ItemInterpolation("%(data_dir)s/%(warc_file_base)s.hrefs.bz2")] }, id_function=(lambda item: { "ua": item["user_agent"] })), # there can be multiple items in the pipeline, but this wrapper ensures # that there is only one item uploading at a time # # the NumberConfigValue can be changed in the configuration panel