Example #1
            "data":
            [ItemInterpolation("%(item_dir)s/%(warc_file_base)s.warc.gz")]
        },
        id_function=stats_id_function,
    ), MoveFiles(),
    LimitConcurrent(
        NumberConfigValue(
            min=1,
            max=4,
            default="1",
            name="shared:rsync_threads",
            title="Rsync threads",
            description="The maximum number of concurrent uploads."),
        UploadWithTracker(
            "http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
            downloader=downloader,
            version=VERSION,
            files=[
                ItemInterpolation("%(data_dir)s/%(warc_file_base)s.warc.gz")
            ],
            rsync_target_source_path=ItemInterpolation("%(data_dir)s/"),
            rsync_extra_args=[
                "--recursive",
                "--partial",
                "--partial-dir",
                ".rsync-tmp",
            ]),
    ),
    SendDoneToTracker(tracker_url="http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
                      stats=ItemValue("stats")))
Example #2
            [ItemInterpolation('%(item_dir)s/%(warc_file_base)s.warc.zst')]
        },
        id_function=stats_id_function,
    ), MoveFiles(),
    LimitConcurrent(
        NumberConfigValue(
            min=1,
            max=20,
            default='2',
            name='shared:rsync_threads',
            title='Rsync threads',
            description='The maximum number of concurrent uploads.'),
        UploadWithTracker(
            'http://%s/%s' % (TRACKER_HOST, TRACKER_ID),
            downloader=downloader,
            version=VERSION,
            files=[
                ItemInterpolation(
                    '%(data_dir)s/%(warc_file_base)s.%(dict_project)s.%(dict_id)s.warc.zst'
                ),
                ItemInterpolation('%(data_dir)s/%(warc_file_base)s_data.txt')
            ],
            rsync_target_source_path=ItemInterpolation('%(data_dir)s/'),
            rsync_extra_args=[
                '--recursive', '--partial', '--partial-dir', '.rsync-tmp',
                '--min-size', '1', '--no-compress', '--compress-level', '0'
            ]),
    ),
    SendDoneToTracker(tracker_url='http://%s/%s' % (TRACKER_HOST, TRACKER_ID),
                      stats=ItemValue('stats')))
Example #3
                 max_tries=1,
                 accept_on_exit_code=[0, 4, 8],
                 env={
                     'item_dir': ItemValue('item_dir'),
                     'warc_file_base': ItemValue('warc_file_base'),
                 }), SetBadUrls(),
    PrepareStatsForTracker(
        defaults={
            'downloader': downloader,
            'version': VERSION
        },
        file_groups={
            'data':
            [ItemInterpolation('%(item_dir)s/%(warc_file_base)s.warc.gz')]
        },
        id_function=stats_id_function,
    ), MoveFiles(),
    LimitConcurrent(
        NumberConfigValue(
            min=1,
            max=20,
            default='2',
            name='shared:rsync_threads',
            title='Rsync threads',
            description='The maximum number of concurrent uploads.'),
        ChooseTargetAndUpload(),
    ),
    MaybeSendDoneToTracker(tracker_url='http://%s/%s' %
                           (TRACKER_HOST, TRACKER_ID),
                           stats=ItemValue('stats')))
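Tasks such as MoveFiles(), SetBadUrls(), PrepareDirectories(), and ChooseTargetAndUpload() in the examples are not part of seesaw itself; each project defines them in its own pipeline.py, usually as small SimpleTask subclasses that override process(item). A minimal sketch of the common MoveFiles pattern, assuming the item keys interpolated above (item_dir, data_dir, warc_file_base) and a gzip-compressed WARC; any given project's version may differ:

import os

from seesaw.task import SimpleTask


class MoveFiles(SimpleTask):
    # Move the finished WARC out of the working directory so that only
    # completed files end up in data_dir for upload.
    def __init__(self):
        SimpleTask.__init__(self, 'MoveFiles')

    def process(self, item):
        # The item behaves like a dict, so %-interpolation with its keys works.
        os.rename('%(item_dir)s/%(warc_file_base)s.warc.gz' % item,
                  '%(data_dir)s/%(warc_file_base)s.warc.gz' % item)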
Example #4
                     pipeline_id,
                     downloader,
                     ao_only=env.get('AO_ONLY'),
                     large=env.get('LARGE')), StartHeartbeat(control),
    SetFetchDepth(), PreparePaths(), WriteInfo(), DownloadUrlFile(control),
    WgetDownload(wpull_args,
                 accept_on_exit_code=AcceptAny(),
                 env={
                     'ITEM_IDENT': ItemInterpolation('%(ident)s'),
                     'LOG_KEY': ItemInterpolation('%(log_key)s'),
                     'REDIS_URL': REDIS_URL,
                     'PATH': os.environ['PATH']
                 }), RelabelIfAborted(control), WriteInfo(), MoveFiles(),
    LimitConcurrent(
        2,
        RsyncUpload(target=RSYNC_URL,
                    target_source_path=ItemInterpolation("%(data_dir)s"),
                    files=ItemValue("all_target_files"),
                    extra_args=['--partial', '--partial-dir', '.rsync-tmp'])),
    StopHeartbeat(), MarkItemAsDone(control, EXPIRE_TIME))


def stop_control():
    #control.flag_logging_thread_for_termination()
    control.unregister_pipeline(pipeline_id)


pipeline.on_cleanup += stop_control

pipeline.running_status = "Running"

Example #5
            'data': [
                ItemInterpolation(
                    '%(data_dir)s/%(warc_file_base)s.warc.gz')  # TODO ?
            ]
        },
        id_function=stats_id_function,
    ),
    LimitConcurrent(
        NumberConfigValue(
            min=1,
            max=20,
            default='20',
            name='shared:rsync_threads',
            title='Rsync threads',
            description='The maximum number of concurrent uploads.'),
        UploadWithTracker(
            'http://%s/%s' % (TRACKER_HOST, TRACKER_ID),
            downloader=downloader,  # noqa: F821
            version=VERSION,
            files=ItemValue('files'),
            rsync_target_source_path=ItemInterpolation('%(data_dir)s/'),
            rsync_extra_args=[
                '--recursive',
                '--partial',
                '--partial-dir',
                '.rsync-tmp',
            ]),
    ),
    SendDoneToTracker(tracker_url='http://%s/%s' % (TRACKER_HOST, TRACKER_ID),
                      stats=ItemValue('stats')))
Example #6
	#
	# the NumberConfigValue can be changed in the configuration panel
	LimitConcurrent(
		NumberConfigValue(
			min=1, max=4, default="1", name="shared:rsync_threads", title="Rsync threads",
			description="The maximum number of concurrent uploads."),
		# this upload task asks the tracker for an upload target
		# this can be HTTP or rsync and can be changed in the tracker admin panel
		UploadWithTracker(
			TRACKER_URL,
			downloader=downloader,
			version=VERSION,
			# list the files that should be uploaded.
			# this may include directory names.
			# note: HTTP uploads will only upload the first file on this list
			files=[
				ItemInterpolation("%(data_dir)s/%(warc_file_base)s.warc.gz")
			],
			# the relative path for the rsync command
			# (this defines if the files are uploaded to a subdirectory on the server)
			rsync_target_source_path=ItemInterpolation("%(data_dir)s/"),
			# extra rsync parameters (probably standard)
			rsync_extra_args=[
				"--recursive",
				"--partial",
				"--partial-dir", ".rsync-tmp"
			]
		),
	),

	# if the item passed every task, notify the tracker and report the statistics
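The NumberConfigValue above is what makes the thread count editable from the warrior's configuration panel. When that is not needed, LimitConcurrent also accepts a plain integer, and the upload can go to a fixed rsync destination rather than one assigned by the tracker, as Example #4 does. A rough sketch of that variant (RSYNC_URL is a placeholder, and the seesaw.tracker import path for RsyncUpload is an assumption to verify against your seesaw version):

from seesaw.item import ItemInterpolation
from seesaw.task import LimitConcurrent
from seesaw.tracker import RsyncUpload

RSYNC_URL = "rsync://example.org/example-module/"  # placeholder target

upload_step = LimitConcurrent(
    2,  # fixed limit; nothing is exposed in the configuration panel
    RsyncUpload(
        target=RSYNC_URL,
        target_source_path=ItemInterpolation("%(data_dir)s/"),
        files=[ItemInterpolation("%(data_dir)s/%(warc_file_base)s.warc.gz")],
        extra_args=["--partial", "--partial-dir", ".rsync-tmp"],
    ),
)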
Example #7
            "data": FilesToUpload(),
        },
        id_function=prepare_stats_id_function,
    ),
    CleanUpItemDir(),
    LimitConcurrent(
        NumberConfigValue(min=1, max=4, default="1",
            name="shared:rsync_threads",
            title="Rsync threads",
            description="The maximum number of concurrent uploads."),
        ConditionalTask(
            files_to_upload,
            UploadWithTracker2(
                "http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
                downloader=downloader,
                version=VERSION,
                files=FilesToUpload(),
                rsync_target_source_path=ItemInterpolation("%(data_dir)s/"),
                rsync_extra_args=[
                    "--recursive",
                    "--partial",
                    "--partial-dir", ".rsync-tmp"
                ]
            )
        )
    ),
    SendDoneToTracker(
        tracker_url="http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
        stats=ItemValue("stats")
    )
)
Example #8
 MoveFiles(),
 LimitConcurrent(
     NumberConfigValue(
         min=1,
         max=20,
         default="20",
         name="shared:rsync_threads",
         title="Rsync threads",
         description="The maximum number of concurrent uploads."),
     UploadWithTracker(
         "http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
         downloader=downloader,
         version=VERSION,
         files=[
             ItemInterpolation("%(data_dir)s/%(warc_file_base)s.warc.gz"),
             ItemInterpolation("%(data_dir)s/%(warc_file_base)s_data.txt")
         ],
         rsync_target_source_path=ItemInterpolation("%(data_dir)s/"),
         rsync_extra_args=[
             "--sockopts=SO_SNDBUF=8388608,SO_RCVBUF=8388608",  # 02:50 <Kenshin> the extra options should improve rsync speeds when the latency is higher
             "--recursive",
             "--partial",
             "--partial-dir",
             ".rsync-tmp",
             "--min-size",
             "1",
             "--no-compress",
             "--compress-level=0"
         ]),
 ),
 SendDoneToTracker(tracker_url='http://%s/%s' % (TRACKER_HOST, TRACKER_ID),
                   stats=ItemValue('stats')))
Example #9
        IOLoop.instance().add_timeout(datetime.timedelta(seconds=10),
                                      functools.partial(self._finish, item))

    def _finish(self, item):
        item.may_be_canceled = False
        self.complete_item(item)


class IdleTask(Task):
    def __init__(self):
        Task.__init__(self, 'IdleTask')

    def enqueue(self, item):
        self.start_item(item)
        item.may_be_canceled = True
        item.log_output('Pausing for 60 seconds...')

        IOLoop.instance().add_timeout(datetime.timedelta(seconds=60),
                                      functools.partial(self._finish, item))

    def _finish(self, item):
        item.may_be_canceled = False
        self.complete_item(item)


pipeline = Pipeline(
    WarningTask(),
    LimitConcurrent(1, ExternalProcess('Install', ['./install.sh'])),
    IdleTask(),
)
Example #10
# This will be shown in the warrior management panel. The logo should not
# be too big. The deadline is optional.
project = Project(
	title="sourceforgersync",
	project_html="""
		<img class="project-logo" alt="Project logo" src="" height="50px" title=""/>
		<h2>sourceforge.net <span class="links"><a href="http://sourceforge.net/">Website</a> &middot; <a href="http://tracker.archiveteam.org/sourceforge/">Leaderboard</a></span></h2>
		<p>Saving all projects from SourceForge by rsyncing all of the source code repositories.</p>
	"""
)

pipeline = Pipeline(
	CheckIP(),
	GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader, VERSION),
	ExternalProcess("Size Test",[RSYNC_TEST,"-t",getRsyncURL("foo"),"-m",MAX_RSYNC]),
	LimitConcurrent(1,ExternalProcess("rsync", ["rsync", "-av", getRsyncURL("foo"), cleanItem("%(data_dir)s/%(item_name)s")])),
	ExternalProcess("tar", ["tar", "-czf", cleanItem("%(data_dir)s/%(item_name)s.tar.gz"), "-C", ItemInterpolation("%(data_dir)s/"), "--owner=1999", "--group=2015", "--no-same-permissions", cleanItem("%(item_name)s")]),
	LimitConcurrent(NumberConfigValue(min=1, max=4, default="1",
		name="shared:rsync_threads", title="Rsync threads",
		description="The maximum number of concurrent uploads."),
		UploadWithTracker(
			"http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
			downloader=downloader,
			version=VERSION,
			files=[
				cleanItem("%(data_dir)s/%(item_name)s.tar.gz")
				#ItemInterpolation("foo.tar.gz")
			],
			rsync_target_source_path=ItemInterpolation("%(data_dir)s/"),
			rsync_extra_args=[
				"--recursive",
Example #11
                                      functools.partial(self._finish, item))

    def _finish(self, item):
        item.may_be_canceled = False
        self.complete_item(item)


class IdleTask(Task):
    def __init__(self):
        Task.__init__(self, 'IdleTask')

    def enqueue(self, item):
        self.start_item(item)
        item.may_be_canceled = True
        item.log_output('Pausing for 60 seconds...')

        IOLoop.instance().add_timeout(datetime.timedelta(seconds=60),
                                      functools.partial(self._finish, item))

    def _finish(self, item):
        item.may_be_canceled = False
        self.complete_item(item)


pipeline = Pipeline(
    WarningTask(),
    LimitConcurrent(
        1, ExternalProcess('Install Python 3.5', ['install-python3.5.sh'])),
    IdleTask(),
)
Example #12
pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader,
                       VERSION), PrepareDirectories(),
    WgetDownload(WgetArgs()), DeduplicateWarcExtProc(DedupeArgs()),
    PrepareStatsForTracker(
        defaults={
            "downloader": downloader,
            "version": VERSION
        },
        file_groups={
            "data": [
                ItemInterpolation(
                    "%(data_dir)s/%(item_name)s.deduplicated.warc.gz")
            ]
        },
        id_function=stats_id_function,
    ), MoveFiles(),
    LimitConcurrent(
        NumberConfigValue(
            min=1,
            max=4,
            default="1",
            name="shared:rsync_threads",
            title="Rsync threads",
            description="The maximum number of concurrent uploads."),
        UploadToIA(UploadToIAArgs()),
    ), DeleteFiles(),
    SendDoneToTracker(tracker_url="http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
                      stats=ItemValue("stats")))
Example #13
         'user_agent': user_agent,
         'bind_address': globals().get('bind_address', ''),
         'disco_tracker': DISCO_TRACKER_URL,
         "item_dir": ItemValue("item_dir"),
     },
     accept_on_exit_code=[0],
 ),
 LimitConcurrent(
     NumberConfigValue(
         min=1,
         max=6,
         default=globals().get("num_procs", "1"),
         name="shared:fagrab:num_procs",
         title="Number of Processes",
         description="The maximum number of concurrent download processes."
     ),
     WgetDownload(WgetArgs(),
                  max_tries=1,
                  accept_on_exit_code=[0, 4, 7, 8],
                  env={
                      "item_dir": ItemValue("item_dir"),
                      "downloader": downloader,
                      "item_name": ItemValue("item_name"),
                  }),
 ),
 ExternalProcess(
     'End',
     [sys.executable, 'helper.py', 'end'],
     env={
         'user_agent': user_agent,
         'bind_address': globals().get('bind_address', ''),
         'disco_tracker': DISCO_TRACKER_URL,
Example #14
 LimitConcurrent(
     NumberConfigValue(
         min=1,
         max=10,
         default="10",
         name="isohunt:download_threads",
         title="Isohunt downloading threads",
         description=
         "How many threads downloading Isohunt torrents and pages can run at once, to avoid throttling."
     ),
     WgetDownloadTorrentRange(
         [
             WGET_LUA,
             "-U",
             USER_AGENT,
             "--no-check-certificate",
             "-e",
             "robots=off",
             "--rotate-dns",
             "--timeout",
             "60",
             "--level=inf",
             "--tries",
             "20",
             "--waitretry",
             "5",
             # "--bind-address", "%BIND_ADDRESS%",
         ],
         max_tries=5,
         accept_on_exit_code=[0]),
 ),
Example #15
    <h2>freeml.com <span class="links"><a href="http://www.freeml.com/">Website</a> &middot; <a href="http://tracker.archiveteam.org/freeml/">Leaderboard</a></span></h2>
    ''')

pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker('http://%s/%s' % (TRACKER_HOST, TRACKER_ID), downloader,
                       VERSION), PrepareDirectories(warc_prefix='freeml'),
    LimitConcurrent(
        NumberConfigValue(
            min=1,
            max=1,
            default='1',
            name='shared:wget_download',
            title='wget-lua threads',
            description='The maximum number of concurrent downloads.'),
        WgetDownload(WgetArgs(),
                     max_tries=2,
                     accept_on_exit_code=[0, 4, 8],
                     env={
                         'item_dir': ItemValue('item_dir'),
                         'item_value': ItemValue('item_value'),
                         'item_type': ItemValue('item_type'),
                         'warc_file_base': ItemValue('warc_file_base')
                     }),
    ),
    PrepareStatsForTracker(
        defaults={
            'downloader': downloader,
            'version': VERSION
        },
        file_groups={
            'data':