Beispiel #1
0
            [ItemInterpolation("%(item_dir)s/%(warc_file_base)s.warc.gz")]
        },
        id_function=stats_id_function,
    ), MoveFiles(),
    # Upload stage: LimitConcurrent is given a concurrency setting followed by
    # the task it wraps (the tracker-driven upload below).
    LimitConcurrent(
        # Setting "shared:rsync_threads", bounded to 1..4 with default "1".
        # Its own description calls it the maximum number of concurrent uploads.
        # NOTE(review): default is a string while min/max are ints - confirm
        # that NumberConfigValue converts it.
        NumberConfigValue(
            min=1,
            max=4,
            default="1",
            name="shared:rsync_threads",
            title="Rsync threads",
            description="The maximum number of concurrent uploads."),
        # The first positional argument is the tracker URL, assembled from
        # TRACKER_HOST and TRACKER_ID.
        UploadWithTracker(
            "http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
            downloader=downloader,
            version=VERSION,
            # Two files per item, both resolved under data_dir: the .warc.gz
            # file and its companion "_data.txt" file.
            files=[
                ItemInterpolation("%(data_dir)s/%(warc_file_base)s.warc.gz"),
                ItemInterpolation("%(data_dir)s/%(warc_file_base)s_data.txt")
            ],
            # NOTE(review): presumably the base path the uploaded files are
            # taken relative to - confirm against UploadWithTracker.
            rsync_target_source_path=ItemInterpolation("%(data_dir)s/"),
            # Extra rsync options; ".rsync-tmp" is the value of the
            # "--partial-dir" option, passed as its own list element.
            rsync_extra_args=[
                "--recursive",
                "--partial",
                "--partial-dir",
                ".rsync-tmp",
            ]),
    ),
    # Last task in this excerpt: SendDoneToTracker receives the tracker URL
    # (same TRACKER_HOST/TRACKER_ID expression as the upload task) and the
    # item's "stats" value. The final ")" closes a call opened above this
    # excerpt.
    SendDoneToTracker(tracker_url="http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
                      stats=ItemValue("stats")))
Beispiel #2
0
        }
    ),
    MoveFiles(),
    # Stats task: the defaults carry the downloader value and VERSION, and the
    # 'data' file group lists the item's .warc.gz file.
    # NOTE(review): presumably this produces what ItemValue('stats') reads
    # further down - confirm.
    PrepareStatsForTracker(
        defaults={'downloader': downloader, 'version': VERSION},    # noqa: F821
        file_groups={
            'data': [
                # The path uses data_dir; this task is listed after
                # MoveFiles(). The original author left an open TODO here.
                ItemInterpolation('%(data_dir)s/%(warc_file_base)s.warc.gz')  #TODO ?
            ]
        },
        id_function=stats_id_function,
    ),
    # Upload stage: LimitConcurrent is given the 'shared:rsync_threads' setting
    # (bounded to 1..20, default '20', described as the maximum number of
    # concurrent uploads) followed by the upload task it wraps.
    # NOTE(review): default is a string while min/max are ints - confirm that
    # NumberConfigValue converts it.
    LimitConcurrent(NumberConfigValue(min=1, max=20, default='20',
                                      name='shared:rsync_threads', title='Rsync threads',
                                     description='The maximum number of concurrent uploads.'),
                    # The first positional argument is the tracker URL,
                    # assembled from TRACKER_HOST and TRACKER_ID.
                    UploadWithTracker('http://%s/%s' % (TRACKER_HOST, TRACKER_ID),
                                      downloader=downloader,        # noqa: F821
                                      version=VERSION,
                                      # The file list is not spelled out here; it is
                                      # read from the item's 'files' value.
                                      files=ItemValue('files'),
                                      rsync_target_source_path=ItemInterpolation('%(data_dir)s/'),
                                      # Extra rsync options; '.rsync-tmp' is the
                                      # value of '--partial-dir'.
                                      rsync_extra_args=[
                                                         '--recursive',
                                                         '--partial',
                                                         '--partial-dir', '.rsync-tmp',
                                                       ]),),
    # Last task in this excerpt: SendDoneToTracker receives the tracker URL
    # (same TRACKER_HOST/TRACKER_ID expression as the upload task) and the
    # item's 'stats' value.
    SendDoneToTracker(
        tracker_url='http://%s/%s' % (TRACKER_HOST, TRACKER_ID),
        stats=ItemValue('stats')
    )
)
Beispiel #3
0
        },
        id_function=stats_id_function,
    ),
    MoveFiles(),
    # Upload stage: LimitConcurrent is given the 'shared:rsync_threads' setting
    # (bounded to 1..20, default '2', described as the maximum number of
    # concurrent uploads) followed by the upload task it wraps.
    # NOTE(review): default is a string while min/max are ints - confirm that
    # NumberConfigValue converts it.
    LimitConcurrent(NumberConfigValue(min=1, max=20, default='2',
        name='shared:rsync_threads', title='Rsync threads',
        description='The maximum number of concurrent uploads.'),
        # The first positional argument is the tracker URL, assembled from
        # TRACKER_HOST and TRACKER_ID.
        UploadWithTracker(
            'http://%s/%s' % (TRACKER_HOST, TRACKER_ID),
            downloader=downloader,
            version=VERSION,
            # Two files per item under data_dir. The WARC name embeds the
            # dict_project and dict_id item values and ends in ".warc.zst";
            # the second file is the companion "_data.txt".
            files=[
                ItemInterpolation('%(data_dir)s/%(warc_file_base)s.%(dict_project)s.%(dict_id)s.warc.zst'),
                ItemInterpolation('%(data_dir)s/%(warc_file_base)s_data.txt')
            ],
            rsync_target_source_path=ItemInterpolation('%(data_dir)s/'),
            # Extra rsync options; each value follows its flag as a separate
            # list element ('.rsync-tmp', '1', '0').
            # NOTE(review): the compression flags presumably turn rsync
            # compression off because the WARC is already zstd-compressed -
            # confirm.
            rsync_extra_args=[
                '--recursive',
                '--partial',
                '--partial-dir', '.rsync-tmp',
                '--min-size', '1',
                '--no-compress',
                '--compress-level', '0'
            ]
        ),
    ),
    # Last task in this excerpt: SendDoneToTracker receives the tracker URL
    # (same TRACKER_HOST/TRACKER_ID expression as the upload task) and the
    # item's 'stats' value.
    SendDoneToTracker(
        tracker_url='http://%s/%s' % (TRACKER_HOST, TRACKER_ID),
        stats=ItemValue('stats')
    )
)
Beispiel #4
0
            'data':
            [ItemInterpolation('%(item_dir)s/%(warc_file_base)s.warc.gz')]
        },
        id_function=stats_id_function,
    ), MoveFiles(),
    # Upload stage: LimitConcurrent is given a concurrency setting followed by
    # the task it wraps (the tracker-driven upload below).
    LimitConcurrent(
        # Setting 'shared:rsync_threads', bounded to 1..20 with default '20'.
        # Its own description calls it the maximum number of concurrent uploads.
        # NOTE(review): default is a string while min/max are ints - confirm
        # that NumberConfigValue converts it.
        NumberConfigValue(
            min=1,
            max=20,
            default='20',
            name='shared:rsync_threads',
            title='Rsync threads',
            description='The maximum number of concurrent uploads.'),
        # The first positional argument is the tracker URL, assembled from
        # TRACKER_HOST and TRACKER_ID.
        UploadWithTracker(
            'http://%s/%s' % (TRACKER_HOST, TRACKER_ID),
            downloader=downloader,
            version=VERSION,
            # A single file per item: the .warc.gz under data_dir.
            files=[
                ItemInterpolation('%(data_dir)s/%(warc_file_base)s.warc.gz')
            ],
            # NOTE(review): presumably the base path the uploaded files are
            # taken relative to - confirm against UploadWithTracker.
            rsync_target_source_path=ItemInterpolation('%(data_dir)s/'),
            # Extra rsync options; '.rsync-tmp' is the value of the
            # '--partial-dir' option, passed as its own list element.
            rsync_extra_args=[
                '--recursive',
                '--partial',
                '--partial-dir',
                '.rsync-tmp',
            ]),
    ),
    # Last task in this excerpt: SendDoneToTracker receives the tracker URL
    # (same TRACKER_HOST/TRACKER_ID expression as the upload task) and the
    # item's 'stats' value. The final ")" closes a call opened above this
    # excerpt.
    SendDoneToTracker(tracker_url='http://%s/%s' % (TRACKER_HOST, TRACKER_ID),
                      stats=ItemValue('stats')))
Beispiel #5
0
            max=4,
            default="2",
            name="shared:rsync_threads",
            title="Rsync threads",
            description="The maximum number of concurrent uploads."),
        # This upload task asks the tracker for an upload target. The target
        # can be HTTP or rsync and can be changed in the tracker admin panel.
        UploadWithTracker(
            TRACKER_URL,
            downloader=downloader,
            version=VERSION,
            # Files to upload; entries may also be directory names.
            # Note: an HTTP upload only sends the first file on this list
            # (here the ".hrefs.bz2" file).
            files=[
                ItemInterpolation("%(data_dir)s/%(warc_file_base)s.hrefs.bz2"),
                ItemInterpolation(
                    "%(data_dir)s/%(warc_file_base)s.cooked.warc.gz")
            ],
            # The relative path for the rsync command; it determines whether
            # the files are uploaded into a subdirectory on the server.
            rsync_target_source_path=ItemInterpolation("%(data_dir)s/"),
            # Extra rsync parameters (probably standard); ".rsync-tmp" is the
            # value of "--partial-dir".
            rsync_extra_args=[
                "--recursive", "--partial", "--partial-dir", ".rsync-tmp"
            ]),
    ),

    # If the item passed every preceding task, notify the tracker and report
    # the item's "stats" value. The final ")" closes a call opened above this
    # excerpt.
    SendDoneToTracker(tracker_url=TRACKER_URL, stats=ItemValue("stats")))