def test_find_executable_bad_version(self):
    # Looking the runner up with a bogus version string must give a
    # falsy result.
    located = find_executable(
        'pipeline runner',
        '123-notrealversion',
        ['./run-pipeline', '../run-pipeline'],
        version_arg='--version'
    )
    self.assertFalse(located)
def test_find_executable(self):
    # Looking the runner up with seesaw's own version string must give
    # a truthy result.
    located = find_executable(
        'pipeline runner',
        seesaw.__version__,
        ['./run-pipeline', '../run-pipeline'],
        version_arg='--version'
    )
    self.assertTrue(located)
def test_find_executable_bad_version(self):
    # Probe the "3"-suffixed runner scripts when seesaw.six reports
    # Python 3, the unsuffixed ones otherwise.
    exes = (
        ['./run-pipeline3', '../run-pipeline3']
        if seesaw.six.PY3
        else ['./run-pipeline', '../run-pipeline']
    )

    # A bogus version string must give a falsy result.
    located = find_executable(
        'pipeline runner',
        '123-notrealversion',
        exes,
        version_arg='--version'
    )
    self.assertFalse(located)
def test_find_executable(self):
    # Probe the "3"-suffixed runner scripts when seesaw.six reports
    # Python 3, the unsuffixed ones otherwise.
    exes = (
        ['./run-pipeline3', '../run-pipeline3']
        if seesaw.six.PY3
        else ['./run-pipeline', '../run-pipeline']
    )

    # seesaw's own version string must give a truthy result.
    located = find_executable(
        'pipeline runner',
        seesaw.__version__,
        exes,
        version_arg='--version'
    )
    self.assertTrue(located)
def test_find_executable_regex_version(self):
    # Probe the "3"-suffixed runner scripts when seesaw.six reports
    # Python 3, the unsuffixed ones otherwise.
    if seesaw.six.PY3:
        exes = ['./run-pipeline3', '../run-pipeline3']
    else:
        exes = ['./run-pipeline', '../run-pipeline']

    # Pass the version as a compiled pattern instead of a plain string.
    # re.escape() neutralises every regex metacharacter in the version,
    # not only the dots, so the pattern always matches it literally.
    version_pattern = re.compile(re.escape(seesaw.__version__))

    self.assertTrue(find_executable(
        'pipeline runner',
        version_pattern,
        exes,
        version_arg='--version')
    )
from archivebot import control from archivebot import shared_config from archivebot.seesaw import extensions from archivebot.seesaw import monitoring from archivebot.seesaw.wpull import WpullArgs from archivebot.seesaw.tasks import GetItemFromQueue, StartHeartbeat, \ SetFetchDepth, PreparePaths, WriteInfo, DownloadUrlFile, \ RelabelIfAborted, MoveFiles, SetWarcFileSizeInRedis, StopHeartbeat, \ MarkItemAsDone VERSION = "20140915.01" PHANTOMJS_VERSION = '1.9.7' EXPIRE_TIME = 60 * 60 * 48 # 48 hours between archive requests WPULL_EXE = find_executable('Wpull', None, [ './wpull' ]) PHANTOMJS = find_executable('PhantomJS', PHANTOMJS_VERSION, ['phantomjs', './phantomjs', '../phantomjs'], '-v') version_integer = (sys.version_info.major * 10) + sys.version_info.minor assert version_integer >= 33, \ "This pipeline requires Python >= 3.3. You are running %s." % \ sys.version assert WPULL_EXE, 'No usable Wpull found.' assert PHANTOMJS, 'PhantomJS %s was not found.' % PHANTOMJS_VERSION assert 'RSYNC_URL' in env, 'RSYNC_URL not set.' assert 'REDIS_URL' in env, 'REDIS_URL not set.' if StrictVersion(seesaw.__version__) < StrictVersion("0.1.8b1"):
import json
from os import environ as env
from urlparse import urlparse

from seesaw.project import *
from seesaw.item import *
from seesaw.task import *
from seesaw.pipeline import *
from seesaw.externalprocess import *
from seesaw.util import find_executable

VERSION = "20131101.01"
USER_AGENT = "ArchiveTeam ArchiveBot/%s" % VERSION
EXPIRE_TIME = 60 * 60 * 48  # 48 hours between archive requests

# Locate the Wget+Lua binary; a falsy result aborts start-up.
WGET_LUA = find_executable(
    "Wget+Lua",
    "GNU Wget 1.14.0-archivebot1",
    ["./wget-lua"]
)
if not WGET_LUA:
    raise Exception("No usable Wget+Lua found.")

# Required environment variables: refuse to start when one is missing.
if "RSYNC_URL" not in env:
    raise Exception("RSYNC_URL not set.")
if "REDIS_URL" not in env:
    raise Exception("REDIS_URL not set.")
if "LOG_CHANNEL" not in env:
    raise Exception("LOG_CHANNEL not set.")

RSYNC_URL = env["RSYNC_URL"]
REDIS_URL = env["REDIS_URL"]
# The working directory goes onto sys.path before the two star imports
# below; keep this order.
sys.path.append(os.getcwd())
from config import *
from depcheck import *

checkDeps()

# Find a useful grabProject executable among the candidate script paths.
GRAB_TEST = find_executable(
    "grabProject",
    ["1"],
    [
        "./grabProject.py",
        "../grabProject.py",
        "../../grabProject.py",
        "/home/warrior/grabProject.py",
        "/usr/bin/grabProject.py",
    ]
)

# The version number of this pipeline definition. Update it each time you
# make a non-cosmetic change; it will be added to the WARC files and
# reported to the tracker.
VERSION = "20151123.05"
USER_AGENT = 'ArchiveTeam'
TRACKER_ID = 'googlecodersync'
raise Exception('This pipeline needs seesaw version 0.8.5 or higher.') ########################################################################### # Find a useful Wget+Lua executable. # # WGET_LUA will be set to the first path that # 1. does not crash with --version, and # 2. prints the required version string WGET_LUA = find_executable( 'Wget+Lua', ['GNU Wget 1.14.lua.20130523-9a5c', 'GNU Wget 1.14.lua.20160530-955376b'], [ './wget-lua', './wget-lua-warrior', './wget-lua-local', '../wget-lua', '../../wget-lua', '/home/warrior/wget-lua', '/usr/bin/wget-lua' ] ) if not WGET_LUA: raise Exception('No usable Wget+Lua found.') ########################################################################### # The version number of this pipeline definition. # # Update this each time you make a non-cosmetic change.
raise Exception("This pipeline needs seesaw version 0.1.5 or higher.") ########################################################################### # Find a useful Wget+Lua executable. # # WGET_LUA will be set to the first path that # 1. does not crash with --version, and # 2. prints the required version string WGET_LUA = find_executable( "Wget+Lua", ["GNU Wget 1.14.lua.20130523-9a5c"], [ "./wget-lua", "./wget-lua-warrior", "./wget-lua-local", "../wget-lua", "../../wget-lua", "/home/warrior/wget-lua", "/usr/bin/wget-lua" ] ) if not WGET_LUA: raise Exception("No usable Wget+Lua found.") ########################################################################### # Determine if FFMPEG is available # Should probably utilize an ffmpeg build (or source) distributed from the # repo to avoid nasty API incompatibilities between FFMPEG versions. # However, if the options used are relatively simple, using distro-provided
import json
from os import environ as env
from urlparse import urlparse

from seesaw.project import *
from seesaw.item import *
from seesaw.task import *
from seesaw.pipeline import *
from seesaw.externalprocess import *
from seesaw.util import find_executable

VERSION = "20131101.01"
USER_AGENT = "ArchiveTeam ArchiveBot/%s" % VERSION
EXPIRE_TIME = 60 * 60 * 48  # 48 hours between archive requests

# Locate the Wget+Lua binary; a falsy result aborts start-up.
WGET_LUA = find_executable(
    'Wget+Lua',
    "GNU Wget 1.14.0-archivebot1",
    ['./wget-lua']
)
if not WGET_LUA:
    raise Exception("No usable Wget+Lua found.")

# Required environment variables: refuse to start when one is missing.
if 'RSYNC_URL' not in env:
    raise Exception('RSYNC_URL not set.')
if 'REDIS_URL' not in env:
    raise Exception('REDIS_URL not set.')
if 'LOG_CHANNEL' not in env:
    raise Exception('LOG_CHANNEL not set.')

RSYNC_URL = env['RSYNC_URL']
REDIS_URL = env['REDIS_URL']
raise Exception('This pipeline needs seesaw version 0.8.5 or higher.') ########################################################################### # Find a useful Wget+Lua executable. # # WGET_AT will be set to the first path that # 1. does not crash with --version, and # 2. prints the required version string WGET_AT = find_executable( 'Wget+AT', [ 'GNU Wget 1.20.3-at.20211001.01', 'GNU Wget 1.21.3-at.20220503.02' ], [ './wget-at', '/home/warrior/data/wget-at-gnutls' ] ) if not WGET_AT: raise Exception('No usable Wget+At found.') ########################################################################### # The version number of this pipeline definition. # # Update this each time you make a non-cosmetic change. # It will be added to the WARC files and reported to the tracker.
from seesaw.pipeline import Pipeline
from seesaw.project import Project
from seesaw.util import find_executable

# Refuse to run on a seesaw release older than 0.8.5.
if StrictVersion(seesaw.__version__) < StrictVersion('0.8.5'):
    raise Exception('This pipeline needs seesaw version 0.8.5 or higher.')

# Find a useful Wget+AT executable.
#
# WGET_AT will be set to the first path that
#   1. does not crash with --version, and
#   2. prints the required version string
WGET_AT = find_executable(
    'Wget+AT',
    ['GNU Wget 1.20.3-at.20210212.02'],
    ['./wget-at', '/home/warrior/data/wget-at']
)
if not WGET_AT:
    raise Exception('No usable Wget+At found.')

# The version number of this pipeline definition. Update it each time you
# make a non-cosmetic change; it will be added to the WARC files and
# reported to the tracker.
VERSION = '20210325.06'
USER_AGENT = 'Archive Team (ircs://irc.hackint.org#nintendone https://webirc.hackint.org/#irc://irc.hackint.org/#nintendone)'
TRACKER_ID = 'super-mario-maker-bookmarks'
TRACKER_HOST = 'legacy-api.arpa.li'
MULTI_ITEM_SIZE = 1
# check the seesaw version
if StrictVersion(seesaw.__version__) < StrictVersion("0.8.5"):
    raise Exception("This pipeline needs seesaw version 0.8.5 or higher.")

# Find a useful Wpull executable.
#
# WPULL_EXE will be set to the first path that
#   1. does not crash with --version, and
#   2. prints the required version string
# The requirement is a regex here: "1.2.3" delimited by word boundaries.
WPULL_EXE = find_executable(
    "Wpull",
    re.compile(r"\b1\.2\.3\b"),
    [
        "./wpull",
        "/usr/local/bin/wpull",
        os.path.expanduser("~/.local/share/wpull-1.2.3/wpull"),
        os.path.expanduser("~/.local/bin/wpull"),
        "/usr/bin/wpull",
        "/usr/local/lib/python3.7/site-packages/wpull",
        "./wpull_bootstrap",
    ]
)

# youtube-dl is looked up the same way, but any version is accepted.
YOUTUBE_DL_EXE = find_executable(
    "youtube-dl",
    None,  # No version requirements
    [
        "./youtube-dl",
        "/usr/local/bin/youtube-dl",
        "/usr/bin/youtube-dl",
        "youtube-dl",
    ],
    '--version',
)
from seesaw.externalprocess import WgetDownload
from seesaw.pipeline import Pipeline
from seesaw.project import Project
from seesaw.util import find_executable

# Refuse to run on a seesaw release older than 0.8.5.
if StrictVersion(seesaw.__version__) < StrictVersion('0.8.5'):
    raise Exception('This pipeline needs seesaw version 0.8.5 or higher.')

# Find a useful Wget+AT executable.
#
# WGET_AT will be set to the first path that
#   1. does not crash with --version, and
#   2. prints the required version string
WGET_AT = find_executable(
    'Wget+AT',
    ['GNU Wget 1.20.3-at.20201030.01'],
    ['./wget-at', '/home/warrior/data/wget-at']
)
if not WGET_AT:
    raise Exception('No usable Wget+At found.')

# The version number of this pipeline definition. Update it each time you
# make a non-cosmetic change; it will be added to the WARC files and
# reported to the tracker.
VERSION = '20210203.05'
USER_AGENT = 'Archive Team'
TRACKER_ID = 'halo-new'
TRACKER_HOST = 'legacy-api.arpa.li'
MULTI_ITEM_SIZE = 5
import shutil
import json

from tornado.httpclient import HTTPClient, HTTPRequest

from seesaw.project import *
from seesaw.item import *
from seesaw.config import *
from seesaw.task import *
from seesaw.pipeline import *
from seesaw.externalprocess import *
from seesaw.tracker import *
from seesaw.util import find_executable

# Both lookups run before either result is checked.
WGET_LUA = find_executable(
    'wget-lua',
    '1.14.lua.20130523-9a5c',
    ['./wget-lua', 'wget-lua']
)
CURL = find_executable('curl', '7.2', ['curl'])

if not WGET_LUA:
    raise Exception("wget-lua cannot be found")
if not CURL:
    raise Exception("curl cannot be found")

# ----

DATA_DIR = "data"
USER_AGENT = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:23.0) Gecko/20100101 Firefox/23.0"
VERSION = "20130910.01"
TRACKER = "http://quilt.at.ninjawedding.org/patchy"
from seesaw.externalprocess import WgetDownload, ExternalProcess
from seesaw.pipeline import Pipeline
from seesaw.project import Project
from seesaw.util import find_executable

# check the seesaw version
if StrictVersion(seesaw.__version__) < StrictVersion("0.8.5"):
    raise Exception("This pipeline needs seesaw version 0.8.5 or higher.")

# Find a useful rsync executable.
# NOTE: a failed lookup does not abort here; the check below is
# commented out.
RSYNC = find_executable(
    "rsync",
    ["2.6.9"],
    ["/usr/bin/rsync"]
)

#if not RSYNC:
#    raise Exception("No usable rsync found.")

# The version number of this pipeline definition. Update it each time you
# make a non-cosmetic change; it will be added to the WARC files and
# reported to the tracker.
VERSION = "20150614.01"
USER_AGENT = 'ArchiveTeam'
TRACKER_ID = 'sourceforge-rsync'
from seesaw.pipeline import Pipeline
from seesaw.project import Project
from seesaw.util import find_executable

# check the seesaw version
if StrictVersion(seesaw.__version__) < StrictVersion("0.1.5"):
    raise Exception("This pipeline needs seesaw version 0.1.5 or higher.")

# Find a useful Wget+Lua executable.
#
# WGET_LUA will be set to the first path that
#   1. does not crash with --version, and
#   2. prints the required version string
WGET_LUA = find_executable(
    "Wget+Lua",
    ["GNU Wget 1.14.lua.20130523-9a5c"],
    [
        "./wget-lua",
        "./wget-lua-warrior",
        "./wget-lua-local",
        "../wget-lua",
        "../../wget-lua",
        "/home/warrior/wget-lua",
        "/usr/bin/wget-lua",
    ]
)
if not WGET_LUA:
    raise Exception("No usable Wget+Lua found.")

# Determine if FFMPEG is available.
#
# An ffmpeg build (or source) distributed from the repo would avoid nasty
# API incompatibilities between FFMPEG versions. However, if the options
# used are relatively simple, distro-provided ffmpeg builds shouldn't be
# too problematic. Just be sure to add a metadata WARCRecord indicating
# the version string of the ffmpeg that is used.
FFMPEG = find_executable(
    "ffmpeg",
    ["ffmpeg version 2"],
    ["/usr/bin/ffmpeg", "/usr/local/bin/ffmpeg", "./ffmpeg"],
    "-version"
)
# nice, though.
sys.path.append(os.getcwd())

from archivebot import control
from archivebot import shared_config
from archivebot.seesaw import extensions
from archivebot.seesaw import monitoring
from archivebot.seesaw.preflight import check_wpull_args
from archivebot.seesaw.wpull import WpullArgs
from archivebot.seesaw.tasks import (
    GetItemFromQueue, StartHeartbeat, SetFetchDepth, PreparePaths, Wpull,
    CompressLogIfFailed, WriteInfo, DownloadUrlFile, RelabelIfAborted,
    MoveFiles, StopHeartbeat, MarkItemAsDone, CheckIP, CheckLocalWebserver,
)

WPULL_VERSION = '2.0.3'
EXPIRE_TIME = 60 * 60 * 48  # 48 hours between archive requests

# External tools: Wpull must report WPULL_VERSION; youtube-dl is looked
# up with None as the version.
WPULL_EXE = find_executable(
    'Wpull',
    WPULL_VERSION,
    ['wpull', './wpull'],
    '--version'
)
YOUTUBE_DL = find_executable(
    'youtube-dl',
    None,
    ['./youtube-dl'],
    '--version'
)

# Fold major and minor into a single integer, e.g. 3.3 -> 33.
version_integer = (sys.version_info.major * 10) + sys.version_info.minor

assert version_integer >= 33, (
    "This pipeline requires Python >= 3.3. You are running %s."
    % sys.version
)

# Python 3.4.0 is rejected unless NO_SEGFAULT_340 is set in the environment.
if not os.environ.get('NO_SEGFAULT_340'):
    assert sys.version_info[:3] != (3, 4, 0), (
        "Python 3.4.0 should not be used. It may segfault. "
        "Set NO_SEGFAULT_340=1 if your Python is patched. "
        "See https://bugs.python.org/issue21435"
    )

assert WPULL_EXE, 'No usable Wpull found.'
# check the seesaw version
if StrictVersion(seesaw.__version__) < StrictVersion("0.1.5"):
    raise Exception("This pipeline needs seesaw version 0.1.5 or higher.")

# Find a useful rsync_size_tester executable.
RSYNC_TEST = find_executable(
    "rsync_size_tester",
    ["1"],
    [
        "./rsync_size_tester.py",
        "../rsync_size_tester.py",
        "../../rsync_size_tester.py",
        "/home/warrior/rsync_size_tester.py",
        "/usr/bin/rsync_size_tester.py",
    ]
)

# Yes, this is hackish, but run-pipeline won't let you add more command
# line args: if the file "LARGE-RSYNC" is in the directory, allow larger
# rsyncs. Using gigabytes, not gibibytes, to be safe.
MAX_RSYNC = "150000000000" if os.path.isfile("LARGE-RSYNC") else "25000000000"
# check the seesaw version
if StrictVersion(seesaw.__version__) < StrictVersion('0.8.5'):
    raise Exception('This pipeline needs seesaw version 0.8.5 or higher.')

###########################################################################
# Find a useful Wget+AT executable.
#
# WGET_AT will be set to the first path that
# 1. does not crash with --version, and
# 2. prints the required version string
WGET_AT = find_executable(
    'Wget+AT',
    [
        'GNU Wget 1.20.3-at.20200401.01',
        'GNU Wget 1.20.3-at.20200804.01',
        'GNU Wget 1.20.3-at.20200902.01'
    ],
    ['./wget-at']
)

if not WGET_AT:
    # Name the tool that was actually searched for; the message used to
    # say Wget+Lua.
    raise Exception('No usable Wget+At found.')

###########################################################################
# The version number of this pipeline definition.
#
# Update this each time you make a non-cosmetic change.
# It will be added to the WARC files and reported to the tracker.
VERSION = '20200902.01'
from seesaw.config import *
from seesaw.item import *
from seesaw.task import *
from seesaw.pipeline import *
from seesaw.externalprocess import *
from seesaw.tracker import *
from seesaw.util import find_executable

# Locate the Wget+Lua binary among the candidate paths; a falsy result
# aborts start-up.
WGET_LUA = find_executable(
    "Wget+Lua",
    "GNU Wget 1.14.lua.20130120-8476",
    [
        "./wget-lua",
        "./wget-lua-warrior",
        "./wget-lua-local",
        "../wget-lua",
        "../../wget-lua",
        "/home/warrior/wget-lua",
        "/usr/bin/wget-lua",
    ]
)
if not WGET_LUA:
    raise Exception("No usable Wget+Lua found.")

USER_AGENT = "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27"
VERSION = "20130129.01"
raise Exception('This pipeline needs seesaw version 0.8.5 or higher.') ########################################################################### # Find a useful Wget+Lua executable. # # WGET_LUA will be set to the first path that # 1. does not crash with --version, and # 2. prints the required version string WGET_LUA = find_executable( 'Wget+Lua', ['GNU Wget 1.14.lua.20130523-9a5c', 'GNU Wget 1.14.lua.20160530-955376b'], [ './wget-lua', './wget-lua-warrior', './wget-lua-local', '../wget-lua', '../../wget-lua', '/home/warrior/wget-lua', '/usr/bin/wget-lua' ] ) if not WGET_LUA: raise Exception('No usable Wget+Lua found.') ########################################################################### # The version number of this pipeline definition. # # Update this each time you make a non-cosmetic change.
from seesaw.tracker import GetItemFromTracker, PrepareStatsForTracker, \ UploadWithTracker, SendDoneToTracker from seesaw.util import find_executable import zstandard if StrictVersion(seesaw.__version__) < StrictVersion('0.8.5'): raise Exception('This pipeline needs seesaw version 0.8.5 or higher.') ########################################################################### # Find a useful Wget+Lua executable. # # WGET_AT will be set to the first path that # 1. does not crash with --version, and # 2. prints the required version string WGET_AT = find_executable('Wget+AT', ['GNU Wget 1.20.3-at.20201030.01'], ['./wget-at']) if not WGET_AT: raise Exception('No usable Wget+At found.') ########################################################################### # The version number of this pipeline definition. # # Update this each time you make a non-cosmetic change. # It will be added to the WARC files and reported to the tracker. VERSION = '20201112.01' USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36' TRACKER_ID = 'urls' TRACKER_HOST = 'trackerproxy.archiveteam.org' MULTI_ITEM_SIZE = 20
sys.path.append(os.getcwd())

from archivebot import control
from archivebot import shared_config
from archivebot.seesaw import extensions
from archivebot.seesaw import monitoring
from archivebot.seesaw.preflight import check_wpull_args
from archivebot.seesaw.wpull import WpullArgs
from archivebot.seesaw.tasks import (
    GetItemFromQueue, StartHeartbeat, SetFetchDepth, PreparePaths,
    WriteInfo, DownloadUrlFile, RelabelIfAborted, MoveFiles,
    StopHeartbeat, MarkItemAsDone, CheckIP,
)

VERSION = "20150715.01"
PHANTOMJS_VERSION = '1.9.8'
EXPIRE_TIME = 60 * 60 * 48  # 48 hours between archive requests

# External tools: only PhantomJS is looked up with a version; Wpull and
# youtube-dl get None.
WPULL_EXE = find_executable('Wpull', None, ['./wpull'])
PHANTOMJS = find_executable(
    'PhantomJS',
    PHANTOMJS_VERSION,
    ['phantomjs', './phantomjs', '../phantomjs'],
    '-v'
)
YOUTUBE_DL = find_executable(
    'youtube-dl',
    None,
    ['./youtube-dl'],
    '--version'
)

# Fold major and minor into a single integer, e.g. 3.3 -> 33.
version_integer = (sys.version_info.major * 10) + sys.version_info.minor

assert version_integer >= 33, (
    "This pipeline requires Python >= 3.3. You are running %s."
    % sys.version
)

# Python 3.4.0 is rejected unless NO_SEGFAULT_340 is set in the environment.
if not os.environ.get('NO_SEGFAULT_340'):
    assert sys.version_info[:3] != (3, 4, 0), (
        "Python 3.4.0 should not be used. It may segfault. "
        "Set NO_SEGFAULT_340=1 if your Python is patched. "
        "See https://bugs.python.org/issue21435"
    )
import json
from os import environ as env
from urlparse import urlparse

from seesaw.project import *
from seesaw.item import *
from seesaw.task import *
from seesaw.pipeline import *
from seesaw.externalprocess import *
from seesaw.util import find_executable

VERSION = "20140119.01"
USER_AGENT = "ArchiveTeam ArchiveBot/%s" % VERSION
EXPIRE_TIME = 60 * 60 * 48  # 48 hours between archive requests

# Locate the Wget+Lua binary; a falsy result aborts start-up.
WGET_LUA = find_executable(
    'Wget+Lua',
    "GNU Wget 1.14.0-archivebot1",
    ['./wget-lua']
)
if not WGET_LUA:
    raise Exception("No usable Wget+Lua found.")

# Required environment variables: refuse to start when one is missing.
if 'RSYNC_URL' not in env:
    raise Exception('RSYNC_URL not set.')
if 'REDIS_URL' not in env:
    raise Exception('REDIS_URL not set.')
if 'LOG_CHANNEL' not in env:
    raise Exception('LOG_CHANNEL not set.')

RSYNC_URL = env['RSYNC_URL']
REDIS_URL = env['REDIS_URL']
from seesaw.pipeline import Pipeline
from seesaw.project import Project
from seesaw.util import find_executable

# check the seesaw version
if StrictVersion(seesaw.__version__) < StrictVersion('0.8.5'):
    raise Exception('This pipeline needs seesaw version 0.8.5 or higher.')

# Find a useful Wget+Lua executable.
#
# WGET_LUA will be set to the first path that
#   1. does not crash with --version, and
#   2. prints the required version string
WGET_LUA = find_executable(
    'Wget+Lua',
    ['GNU Wget 1.20.3-at-lua'],
    [
        './wget-lua',
        './wget-lua-warrior',
        './wget-lua-local',
        '../wget-lua',
        '../../wget-lua',
        '/home/warrior/wget-lua',
        '/usr/bin/wget-lua',
    ]
)
if not WGET_LUA:
    raise Exception('No usable Wget+Lua found.')

# The version number of this pipeline definition. Update it each time you
# make a non-cosmetic change; it will be added to the WARC files and
# reported to the tracker.
VERSION = '20190925.01'
USER_AGENT = 'ArchiveTeam'
TRACKER_ID = 'sketch'
TRACKER_HOST = 'tracker.archiveteam.org'
UploadWithTracker, SendDoneToTracker from seesaw.util import find_executable # check the seesaw version if StrictVersion(seesaw.__version__) < StrictVersion("0.8.5"): raise Exception("This pipeline needs seesaw version 0.8.5 or higher.") ########################################################################### # Find a useful Wpull executable. # # WPULL_EXE will be set to the first path that # 1. does not crash with --version, and # 2. prints the required version string PYTHON35_EXE = find_executable("Python 3.5", re.compile(r"^Python 3\.5"), [ "/usr/local/bin/python3.5", "python3.5", "python3", "python", ]) if not PYTHON35_EXE: raise Exception("No usable python3.5 library found.") if not os.environ.get('s3access'): raise Exception("s3 access key missing") if not os.environ.get('s3secret'): raise Exception("s3 secret key missing") ########################################################################### # The version number of this pipeline definition. # # Update this each time you make a non-cosmetic change. # It will be added to the WARC files and reported to the tracker.
# Refuse to run on a seesaw release older than 0.8.3.
if StrictVersion(seesaw.__version__) < StrictVersion("0.8.3"):
    raise Exception("This pipeline needs seesaw version 0.8.3 or higher.")

# Find a useful Wpull executable.
#
# WPULL_EXE will be set to the first path that
#   1. does not crash with --version, and
#   2. prints the required version string
# The requirement is a regex here: "1.2" delimited by word boundaries.
WPULL_EXE = find_executable(
    "Wpull",
    re.compile(r"\b1\.2\b"),
    [
        "./wpull",
        os.path.expanduser("~/.local/share/wpull-1.2/wpull"),
        os.path.expanduser("~/.local/bin/wpull"),
        "./wpull_bootstrap",
        "wpull",
    ]
)
if not WPULL_EXE:
    raise Exception("No usable Wpull found.")

###########################################################################
# The version number of this pipeline definition.
#
# Update this each time you make a non-cosmetic change.
# It will be added to the WARC files and reported to the tracker.
UploadWithTracker, SendDoneToTracker from tornado.ioloop import IOLoop import zstandard if StrictVersion(seesaw.__version__) < StrictVersion('0.8.5'): raise Exception('This pipeline needs seesaw version 0.8.5 or higher.') ########################################################################### # Find a useful Wget+Lua executable. # # WGET_AT will be set to the first path that # 1. does not crash with --version, and # 2. prints the required version string WGET_AT = find_executable('Wget+AT', [ 'GNU Wget 1.20.3-at.20200902.01', 'GNU Wget 1.20.3-at.20200917.01', 'GNU Wget 1.20.3-at.20200919.01', 'GNU Wget 1.20.3-at.20201030.01' ], ['./wget-at', '/usr/local/bin/wget-at']) if not WGET_AT: raise Exception('No usable Wget+At found.') ########################################################################### # The version number of this pipeline definition. # # Update this each time you make a non-cosmetic change. # It will be added to the WARC files and reported to the tracker. VERSION = '20210114.01' USER_AGENT = 'Archive Team' TRACKER_ID = 'github' TRACKER_HOST = 'trackerproxy.archiveteam.org'
from seesaw.util import find_executable

# check the seesaw version
if StrictVersion(seesaw.__version__) < StrictVersion("0.8.5"):
    raise Exception("This pipeline needs seesaw version 0.8.5 or higher.")

# Find a useful Wget+Lua executable.
#
# WGET_LUA will be set to the first path that
#   1. does not crash with --version, and
#   2. prints the required version string
WGET_LUA = find_executable(
    "Wget+Lua",
    [
        "GNU Wget 1.14.lua.20130523-9a5c",
        "GNU Wget 1.14.lua.20160530-955376b",
    ],
    [
        "./wget-lua",
        "./wget-lua-warrior",
        "./wget-lua-local",
        "../wget-lua",
        "../../wget-lua",
        "/home/warrior/wget-lua",
        "/usr/bin/wget-lua",
    ]
)
if not WGET_LUA:
    raise Exception("No usable Wget+Lua found.")

# The version number of this pipeline definition. Update it each time you
# make a non-cosmetic change; it will be added to the WARC files and
# reported to the tracker.
VERSION = "20161212.01"
USER_AGENT = 'ArchiveTeam'
TRACKER_ID = 'exua'
from seesaw.util import find_executable

# check the seesaw version
if StrictVersion(seesaw.__version__) < StrictVersion("0.8.5"):
    raise Exception("This pipeline needs seesaw version 0.8.5 or higher.")

# Find a useful Wpull executable.
#
# WPULL_EXE will be set to the first path that
#   1. does not crash with --version, and
#   2. prints the required version string
# The requirement is a regex here: "1.2.3" delimited by word boundaries.
WPULL_EXE = find_executable(
    "Wpull",
    re.compile(r"\b1\.2\.3\b"),
    [
        "./wpull",
        os.path.expanduser("~/.local/share/wpull-1.2.3/wpull"),
        os.path.expanduser("~/.local/bin/wpull"),
        "./wpull_bootstrap",
        "wpull",
    ]
)
if not WPULL_EXE:
    raise Exception("No usable Wpull found.")

# The version number of this pipeline definition. Update it each time you
# make a non-cosmetic change; it will be added to the WARC files and
# reported to the tracker.
VERSION = "20170826.01"
TRACKER_ID = 'newsgrabber'
TRACKER_HOST = 'tracker.archiveteam.org'
# import requests

# check the seesaw version
if StrictVersion(seesaw.__version__) < StrictVersion('0.10.3'):
    raise Exception('This pipeline needs seesaw version 0.10.3 or higher.')

# Find a usable Python 3 interpreter.
#
# PYTHON is looked up among the candidate paths below with the accepted
# version strings 'Python 3.8' down to 'Python 3.5'; a falsy result
# aborts start-up.
# TODO
PYTHON = find_executable(
    'Python3',
    ['Python 3.8', 'Python 3.7', 'Python 3.6', 'Python 3.5'],
    [
        '/usr/bin/python3',
        '/usr/local/bin/python3',
        './python3',
    ]
)
if not PYTHON:
    raise Exception('No usable Python 3 found.')

# The version number of this pipeline definition. Update it each time you
# make a non-cosmetic change; it will be added to the WARC files and
# reported to the tracker.
VERSION = '20191202.02'
USER_AGENT = 'ArchiveTeam'
TRACKER_ID = 'yahoo-groups-api'
# TRACKER_HOST = 'tracker.archiveteam.org' #prod-env
return path return None #--------------------------------------- ########################################################################### # Find a useful Wget+Lua executable. # # WGET_LUA will be set to the first path that # 1. does not crash with --version, and # 2. prints the required version string WGET_LUA = find_executable("Wget+Lua", [ "GNU Wget 1.14.lua.20130120-8476", "GNU Wget 1.14.lua.20130407-1f1d", "GNU Wget 1.14.lua.20130427-92d2", "GNU Wget 1.14.lua.20130523-9a5c" ], [ "./wget-lua", "./wget-lua-warrior", "./wget-lua-local", "../wget-lua", "../../wget-lua", "/home/warrior/wget-lua", "/usr/bin/wget-lua" ]) if not WGET_LUA: raise Exception("No usable Wget+Lua found.") ########################################################################### # The user agent for external requests. # # Use this constant in the Wget command line. USER_AGENT = "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.28"
raise Exception("This pipeline needs seesaw version 0.8.5 or higher.") ########################################################################### # Find a useful Wget+Lua executable. # # WGET_LUA will be set to the first path that # 1. does not crash with --version, and # 2. prints the required version string WGET_LUA = find_executable( "Wget+Lua", ["GNU Wget 1.14.lua.20130523-9a5c"], [ "./wget-lua", "./wget-lua-warrior", "./wget-lua-local", "../wget-lua", "../../wget-lua", "/home/warrior/wget-lua", "/usr/bin/wget-lua" ] ) if not WGET_LUA: raise Exception("No usable Wget+Lua found.") ########################################################################### # The version number of this pipeline definition. # # Update this each time you make a non-cosmetic change.
sys.path.append(os.getcwd())

from archivebot import control
from archivebot import shared_config
from archivebot.seesaw import extensions
from archivebot.seesaw import monitoring
from archivebot.seesaw.preflight import check_wpull_args
from archivebot.seesaw.wpull import WpullArgs
from archivebot.seesaw.tasks import (
    GetItemFromQueue, StartHeartbeat, SetFetchDepth, PreparePaths,
    WriteInfo, DownloadUrlFile, RelabelIfAborted, MoveFiles,
    StopHeartbeat, MarkItemAsDone, CheckIP,
)

VERSION = "20150424.01"
PHANTOMJS_VERSION = '1.9.8'
EXPIRE_TIME = 60 * 60 * 48  # 48 hours between archive requests

# External tools: Wpull is looked up with None as the version; PhantomJS
# must report PHANTOMJS_VERSION.
WPULL_EXE = find_executable('Wpull', None, ['./wpull'])
PHANTOMJS = find_executable(
    'PhantomJS',
    PHANTOMJS_VERSION,
    ['phantomjs', './phantomjs', '../phantomjs'],
    '-v'
)

# Fold major and minor into a single integer, e.g. 3.3 -> 33.
version_integer = (sys.version_info.major * 10) + sys.version_info.minor

assert version_integer >= 33, (
    "This pipeline requires Python >= 3.3. You are running %s."
    % sys.version
)

# Python 3.4.0 is rejected unless NO_SEGFAULT_340 is set in the environment.
if not os.environ.get('NO_SEGFAULT_340'):
    assert sys.version_info[:3] != (3, 4, 0), (
        "Python 3.4.0 should not be used. It may segfault. "
        "Set NO_SEGFAULT_340=1 if your Python is patched. "
        "See https://bugs.python.org/issue21435"
    )
# nice, though. sys.path.append(os.getcwd()) from archivebot import control from archivebot import shared_config from archivebot.seesaw import extensions from archivebot.seesaw import monitoring from archivebot.seesaw.tasks import GetItemFromQueue, StartHeartbeat, \ SetFetchDepth, PreparePaths, WriteInfo, DownloadUrlFile, \ RelabelIfAborted, MoveFiles, SetWarcFileSizeInRedis, StopHeartbeat, \ MarkItemAsDone VERSION = "20140819.03" EXPIRE_TIME = 60 * 60 * 48 # 48 hours between archive requests WPULL_EXE = find_executable('Wpull', None, [ './wpull' ]) PHANTOMJS = find_executable('PhantomJS', '1.9.7', ['phantomjs', './phantomjs'], '-v') version_integer = (sys.version_info.major * 10) + sys.version_info.minor assert version_integer >= 33, \ "This pipeline requires Python >= 3.3. You are running %s." % \ sys.version assert WPULL_EXE, 'No usable Wpull found.' assert PHANTOMJS, 'PhantomJS 1.9.0 was not found.' assert 'RSYNC_URL' in env, 'RSYNC_URL not set.' assert 'REDIS_URL' in env, 'REDIS_URL not set.' if StrictVersion(seesaw.__version__) < StrictVersion("0.1.8b1"):
import shutil
import json

from tornado.httpclient import HTTPClient, HTTPRequest

from seesaw.project import *
from seesaw.item import *
from seesaw.config import *
from seesaw.task import *
from seesaw.pipeline import *
from seesaw.externalprocess import *
from seesaw.tracker import *
from seesaw.util import find_executable

# Both lookups run before either result is checked.
WGET_LUA = find_executable(
    'wget-lua',
    '1.14.lua.20130523-9a5c',
    ['./wget-lua', 'wget-lua']
)
CURL = find_executable('curl', '7.2', ['curl'])

if not WGET_LUA:
    raise Exception("wget-lua cannot be found")
if not CURL:
    raise Exception("curl cannot be found")

# ----

DATA_DIR = "data"
USER_AGENT = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:23.0) Gecko/20100101 Firefox/23.0"
VERSION = "20130910.01"
TRACKER = "http://quilt.at.ninjawedding.org/patchy"