# NOTE(review): this span starts inside a call whose opening parenthesis is
# above the visible region; the lines up to the first ")" are its trailing
# keyword arguments.
    youtube_dl_exe=YOUTUBE_DL,
    # os.environ[...] raises KeyError if FINISHED_WARCS_DIR is not set.
    finished_warcs_dir=os.environ["FINISHED_WARCS_DIR"],
    warc_max_size=WARC_MAX_SIZE,
    monitor_disk=WPULL_MONITOR_DISK,
    monitor_memory=WPULL_MONITOR_MEMORY,
)

# Build the pipeline from the task objects below, passed positionally in this
# order.  Presumably Pipeline runs them in the order given -- confirm against
# the Pipeline implementation.
pipeline = Pipeline(
    CheckIP(),
    CheckLocalWebserver(),
    # AO_ONLY and LARGE are read with env.get(), so a missing key yields None
    # (assuming env is a dict-like mapping -- TODO confirm).
    GetItemFromQueue(control, pipeline_id, downloader,
        ao_only=env.get('AO_ONLY'),
        large=env.get('LARGE'),
        version_check=(VERSION, pipeline_version)),
    StartHeartbeat(control),
    SetFetchDepth(),
    PreparePaths(),
    # WriteInfo() appears twice: here, before the download step, and again
    # after CompressLogIfFailed().
    WriteInfo(),
    DownloadUrlFile(control),
    # The download step receives the wpull_args object plus an explicit
    # environment mapping: two values interpolated from the item (ident and
    # log_key), the Redis URL, and the PATH of the current process.
    Wpull(wpull_args,
        accept_on_exit_code=AcceptAny(),
        env={
            'ITEM_IDENT': ItemInterpolation('%(ident)s'),
            'LOG_KEY': ItemInterpolation('%(log_key)s'),
            'REDIS_URL': REDIS_URL,
            'PATH': os.environ['PATH']
        }),
    RelabelIfAborted(control),
    CompressLogIfFailed(),
    WriteInfo(),
    # Same environment variable as finished_warcs_dir above.
    MoveFiles(target_directory=os.environ["FINISHED_WARCS_DIR"]),
    StopHeartbeat(),
    MarkItemAsDone(control, EXPIRE_TIME))


# NOTE(review): the body of stop_control() continues past the visible region;
# only the commented-out call below is shown here.
def stop_control():
    #control.flag_logging_thread_for_termination()
# monitoring.pipeline_id() is unpacked into four values; only the fourth is
# kept, the first three are discarded.
_, _, _, pipeline_id = monitoring.pipeline_id()

# Argument object handed to the download task below.
wpull_args = WpullArgs(
    default_user_agent=DEFAULT_USER_AGENT,
    wpull_exe=WPULL_EXE,
    youtube_dl_exe=YOUTUBE_DL,
    phantomjs_exe=PHANTOMJS,
    # os.environ[...] raises KeyError if FINISHED_WARCS_DIR is not set.
    finished_warcs_dir=os.environ["FINISHED_WARCS_DIR"],
    warc_max_size=WARC_MAX_SIZE)

# Build the pipeline from the task objects below, passed positionally in this
# order.  Presumably Pipeline runs them in the order given -- confirm against
# the Pipeline implementation.
pipeline = Pipeline(
    CheckIP(),
    # AO_ONLY and LARGE are read with env.get(), so a missing key yields None
    # (assuming env is a dict-like mapping -- TODO confirm).
    GetItemFromQueue(control, pipeline_id, downloader,
        ao_only=env.get('AO_ONLY'),
        large=env.get('LARGE')),
    StartHeartbeat(control),
    SetFetchDepth(),
    PreparePaths(),
    # WriteInfo() appears twice: before the download step and after
    # RelabelIfAborted().
    WriteInfo(),
    DownloadUrlFile(control),
    # The download step receives wpull_args plus an explicit environment
    # mapping: two values interpolated from the item (ident and log_key), the
    # Redis URL, and the PATH of the current process.
    WgetDownload(wpull_args,
        accept_on_exit_code=AcceptAny(),
        env={
            'ITEM_IDENT': ItemInterpolation('%(ident)s'),
            'LOG_KEY': ItemInterpolation('%(log_key)s'),
            'REDIS_URL': REDIS_URL,
            'PATH': os.environ['PATH']
        }),
    RelabelIfAborted(control),
    WriteInfo(),
    MoveFiles(),
    # The upload task is wrapped in LimitConcurrent with a first argument of 2
    # (presumably the maximum number of simultaneous uploads -- TODO confirm).
    LimitConcurrent(
        2,
        RsyncUpload(
            target=RSYNC_URL,
            target_source_path=ItemInterpolation("%(data_dir)s"),
            files=ItemValue("all_target_files"),
            # Extra rsync options: --partial with --partial-dir .rsync-tmp.
            extra_args=['--partial', '--partial-dir', '.rsync-tmp'])),
    # NOTE(review): the Pipeline(...) argument list continues past the visible
    # region.
# NOTE(review): this span starts inside a method whose header is above the
# visible region.  The original indentation was lost, so the nesting of the
# args.extend(...) call relative to `return args` cannot be recovered from
# here -- verify against the full file.
        args.extend(phantomjs_args)
        return args

    @classmethod
    def add_args(cls, args, names, item):
        """Append interpolated values to ``args`` in place.

        For each entry in ``names`` the expression ``name % item`` is
        evaluated, and the result is appended to ``args`` only when it is
        truthy.  Nothing is returned.

        NOTE(review): presumably each name is a %-style template and ``item``
        is the mapping it is interpolated with -- confirm against callers.
        """
        for name in names:
            value = name % item
            # Falsy results (e.g. an empty string) are skipped.
            if value:
                args.append(value)


# monitoring.pipeline_id() is unpacked into four values; only the fourth is
# kept, the first three are discarded.
_, _, _, pipeline_id = monitoring.pipeline_id()

# Build the pipeline from the task objects below, passed positionally in this
# order.  Presumably Pipeline runs them in the order given -- confirm against
# the Pipeline implementation.
pipeline = Pipeline(
    # AO_ONLY is read with env.get(), so a missing key yields None (assuming
    # env is a dict-like mapping -- TODO confirm).
    GetItemFromQueue(control, pipeline_id, ao_only=env.get('AO_ONLY')),
    StartHeartbeat(control),
    SetFetchDepth(),
    PreparePaths(),
    # WriteInfo() appears twice: before the download step and after
    # RelabelIfAborted().
    WriteInfo(),
    DownloadUrlFile(control),
    # The download step receives a freshly constructed WpullArgs() plus an
    # explicit environment mapping: two values interpolated from the item
    # (ident and log_key), the Redis URL, and the PATH of the current process.
    WgetDownload(WpullArgs(),
        accept_on_exit_code=AcceptAny(),
        env={
            'ITEM_IDENT': ItemInterpolation('%(ident)s'),
            'LOG_KEY': ItemInterpolation('%(log_key)s'),
            'REDIS_URL': REDIS_URL,
            'PATH': os.environ['PATH']
        }),
    RelabelIfAborted(control),
    WriteInfo(),
    MoveFiles(),
    # NOTE(review): the Pipeline(...) argument list continues past the visible
    # region.