Example #1
0
    youtube_dl_exe=YOUTUBE_DL,
    finished_warcs_dir=os.environ["FINISHED_WARCS_DIR"],
    warc_max_size=WARC_MAX_SIZE,
    monitor_disk=WPULL_MONITOR_DISK,
    monitor_memory=WPULL_MONITOR_MEMORY,
)

# Ordered task chain for this pipeline. Tasks are passed positionally, so the
# order below is the order in which they are handed to Pipeline.
pipeline = Pipeline(
    CheckIP(),
    CheckLocalWebserver(),
    GetItemFromQueue(
        control,
        pipeline_id,
        downloader,
        ao_only=env.get('AO_ONLY'),
        large=env.get('LARGE'),
        version_check=(VERSION, pipeline_version),
    ),
    StartHeartbeat(control),
    SetFetchDepth(),
    PreparePaths(),
    WriteInfo(),
    DownloadUrlFile(control),
    Wpull(
        wpull_args,
        # NOTE(review): presumably every exit code is accepted here so that
        # the following tasks can deal with aborted/failed runs -- confirm.
        accept_on_exit_code=AcceptAny(),
        # Environment given to the Wpull task; PATH is copied from this
        # process's own environment.
        env={
            'ITEM_IDENT': ItemInterpolation('%(ident)s'),
            'LOG_KEY': ItemInterpolation('%(log_key)s'),
            'REDIS_URL': REDIS_URL,
            'PATH': os.environ['PATH']
        },
    ),
    RelabelIfAborted(control),
    CompressLogIfFailed(),
    # WriteInfo appears a second time, after the download-related tasks.
    WriteInfo(),
    # Raises KeyError at import time if FINISHED_WARCS_DIR is not set.
    MoveFiles(target_directory=os.environ["FINISHED_WARCS_DIR"]),
    StopHeartbeat(),
    MarkItemAsDone(control, EXPIRE_TIME),
)


def stop_control():
    #control.flag_logging_thread_for_termination()
Example #2
0
# monitoring.pipeline_id() is unpacked as exactly four values; only the last
# one is kept, the first three are discarded.
_, _, _, pipeline_id = monitoring.pipeline_id()

# Argument bundle for the downloader, built once at module load.
# os.environ[...] raises KeyError if FINISHED_WARCS_DIR is not set.
wpull_args = WpullArgs(
    default_user_agent=DEFAULT_USER_AGENT,
    wpull_exe=WPULL_EXE,
    youtube_dl_exe=YOUTUBE_DL,
    phantomjs_exe=PHANTOMJS,
    finished_warcs_dir=os.environ["FINISHED_WARCS_DIR"],
    warc_max_size=WARC_MAX_SIZE,
)

pipeline = Pipeline(
    CheckIP(),
    GetItemFromQueue(control,
                     pipeline_id,
                     downloader,
                     ao_only=env.get('AO_ONLY'),
                     large=env.get('LARGE')), StartHeartbeat(control),
    SetFetchDepth(), PreparePaths(), WriteInfo(), DownloadUrlFile(control),
    WgetDownload(wpull_args,
                 accept_on_exit_code=AcceptAny(),
                 env={
                     'ITEM_IDENT': ItemInterpolation('%(ident)s'),
                     'LOG_KEY': ItemInterpolation('%(log_key)s'),
                     'REDIS_URL': REDIS_URL,
                     'PATH': os.environ['PATH']
                 }), RelabelIfAborted(control), WriteInfo(), MoveFiles(),
    LimitConcurrent(
        2,
        RsyncUpload(target=RSYNC_URL,
                    target_source_path=ItemInterpolation("%(data_dir)s"),
                    files=ItemValue("all_target_files"),
                    extra_args=['--partial', '--partial-dir', '.rsync-tmp'])),
Example #3
0
            args.extend(phantomjs_args)

        return args

    @classmethod
    def add_args(cls, args, names, item):
        """Append interpolated, non-empty arguments to *args* in place.

        Each entry of *names* is formatted with ``entry % item``; results that
        are falsy (e.g. an empty string) are skipped, the rest are appended to
        *args* in iteration order. Returns ``None``.
        """
        rendered = (template % item for template in names)
        # filter(None, ...) keeps only truthy results, same as ``if value``.
        args.extend(filter(None, rendered))

# monitoring.pipeline_id() is unpacked as exactly four values; only the last
# one is kept as pipeline_id, the first three are discarded.
_, _, _, pipeline_id = monitoring.pipeline_id()

pipeline = Pipeline(
    GetItemFromQueue(control, pipeline_id, ao_only=env.get('AO_ONLY')),
    StartHeartbeat(control),
    SetFetchDepth(),
    PreparePaths(),
    WriteInfo(),
    DownloadUrlFile(control),
    WgetDownload(WpullArgs(),
    accept_on_exit_code=AcceptAny(),
    env={
        'ITEM_IDENT': ItemInterpolation('%(ident)s'),
        'LOG_KEY': ItemInterpolation('%(log_key)s'),
        'REDIS_URL': REDIS_URL,
        'PATH': os.environ['PATH']
    }),
    RelabelIfAborted(control),
    WriteInfo(),
    MoveFiles(),