Esempio n. 1
0
 def test_find_executable_bad_version(self):
     # The bogus version string is not expected to match any runner, so the
     # lookup result should be falsy.
     candidates = ['./run-pipeline', '../run-pipeline']
     located = find_executable(
         'pipeline runner',
         '123-notrealversion',
         candidates,
         version_arg='--version',
     )
     self.assertFalse(located)
Esempio n. 2
0
 def test_find_executable(self):
     # Looking for the installed seesaw version among the run-pipeline
     # scripts should give a truthy result.
     candidates = ['./run-pipeline', '../run-pipeline']
     located = find_executable(
         'pipeline runner',
         seesaw.__version__,
         candidates,
         version_arg='--version',
     )
     self.assertTrue(located)
Esempio n. 3
0
    def test_find_executable_bad_version(self):
        # The runner script name carries a "3" suffix when seesaw.six.PY3 is
        # true and no suffix otherwise.
        suffix = '3' if seesaw.six.PY3 else ''
        exes = ['./run-pipeline' + suffix, '../run-pipeline' + suffix]

        # The bogus version string should never match, so expect a falsy
        # result.
        located = find_executable(
            'pipeline runner',
            '123-notrealversion',
            exes,
            version_arg='--version',
        )
        self.assertFalse(located)
Esempio n. 4
0
    def test_find_executable(self):
        # The runner script name carries a "3" suffix when seesaw.six.PY3 is
        # true and no suffix otherwise.
        suffix = '3' if seesaw.six.PY3 else ''
        exes = ['./run-pipeline' + suffix, '../run-pipeline' + suffix]

        # Searching for the installed seesaw version should succeed.
        located = find_executable(
            'pipeline runner',
            seesaw.__version__,
            exes,
            version_arg='--version',
        )
        self.assertTrue(located)
Esempio n. 5
0
    def test_find_executable_regex_version(self):
        # The runner script name carries a "3" suffix when seesaw.six.PY3 is
        # true and no suffix otherwise.
        suffix = '3' if seesaw.six.PY3 else ''
        exes = ['./run-pipeline' + suffix, '../run-pipeline' + suffix]

        # Pass the version as a compiled pattern instead of a plain string;
        # the dots are escaped so they only match literal dots.
        version_pattern = re.compile(seesaw.__version__.replace('.', '\\.'))
        located = find_executable(
            'pipeline runner',
            version_pattern,
            exes,
            version_arg='--version',
        )
        self.assertTrue(located)
Esempio n. 6
0
from archivebot import control
from archivebot import shared_config
from archivebot.seesaw import extensions
from archivebot.seesaw import monitoring
from archivebot.seesaw.wpull import WpullArgs
from archivebot.seesaw.tasks import GetItemFromQueue, StartHeartbeat, \
    SetFetchDepth, PreparePaths, WriteInfo, DownloadUrlFile, \
    RelabelIfAborted, MoveFiles, SetWarcFileSizeInRedis, StopHeartbeat, \
    MarkItemAsDone


VERSION = "20140915.01"
PHANTOMJS_VERSION = '1.9.7'
# 48 hours between archive requests
EXPIRE_TIME = 60 * 60 * 48

# External tools. Wpull is looked up with None as the version (presumably
# "any version" -- confirm against find_executable); PhantomJS must report
# PHANTOMJS_VERSION when invoked with '-v'.
WPULL_EXE = find_executable('Wpull', None, ['./wpull'])
PHANTOMJS = find_executable(
    'PhantomJS',
    PHANTOMJS_VERSION,
    ['phantomjs', './phantomjs', '../phantomjs'],
    '-v',
)

# Fold major/minor into one integer so that 3.3 becomes 33.
version_integer = sys.version_info.major * 10 + sys.version_info.minor

# NOTE(review): these checks are assert statements, which Python removes
# when run with -O.
assert version_integer >= 33, (
    "This pipeline requires Python >= 3.3.  You are running %s." % sys.version
)

assert WPULL_EXE, 'No usable Wpull found.'
assert PHANTOMJS, 'PhantomJS %s was not found.' % PHANTOMJS_VERSION
assert 'RSYNC_URL' in env, 'RSYNC_URL not set.'
assert 'REDIS_URL' in env, 'REDIS_URL not set.'

if StrictVersion(seesaw.__version__) < StrictVersion("0.1.8b1"):
Esempio n. 7
0
import json

from os import environ as env
from urlparse import urlparse
from seesaw.project import *
from seesaw.item import *
from seesaw.task import *
from seesaw.pipeline import *
from seesaw.externalprocess import *

from seesaw.util import find_executable

VERSION = "20131101.01"
USER_AGENT = "ArchiveTeam ArchiveBot/%s" % VERSION
# 48 hours between archive requests
EXPIRE_TIME = 60 * 60 * 48

# Only ./wget-lua is considered, and it must report the ArchiveBot build.
WGET_LUA = find_executable(
    "Wget+Lua",
    "GNU Wget 1.14.0-archivebot1",
    ["./wget-lua"],
)

# Refuse to load when the binary is missing.
if not WGET_LUA:
    raise Exception("No usable Wget+Lua found.")

# All three environment variables must be present before continuing.
if "RSYNC_URL" not in env:
    raise Exception("RSYNC_URL not set.")

if "REDIS_URL" not in env:
    raise Exception("REDIS_URL not set.")

if "LOG_CHANNEL" not in env:
    raise Exception("LOG_CHANNEL not set.")

RSYNC_URL = env["RSYNC_URL"]
REDIS_URL = env["REDIS_URL"]
# Put the current working directory on the import path before the star
# imports below (presumably config and depcheck live there -- confirm).
sys.path.append(os.getcwd())
from config import *
from depcheck import *

checkDeps()

###########################################################################
# Locate a grabProject script.
#
# The candidates are tried with the single accepted version string "1".
# NOTE(review): the result is not checked for a falsy value in this fragment.
GRAB_TEST = find_executable(
    "grabProject",
    ["1"],
    [
        "./grabProject.py",
        "../grabProject.py",
        "../../grabProject.py",
        "/home/warrior/grabProject.py",
        "/usr/bin/grabProject.py",
    ],
)


###########################################################################
# Pipeline definition version.
#
# Bump it for every non-cosmetic change; it is added to the WARC files and
# reported to the tracker.
VERSION = "20151123.05"
USER_AGENT = 'ArchiveTeam'
TRACKER_ID = 'googlecodersync'
Esempio n. 9
0
    raise Exception('This pipeline needs seesaw version 0.8.5 or higher.')


###########################################################################
# Locate a Wget+Lua binary.
#
# WGET_LUA becomes the first candidate path that runs its version check
# without crashing and prints one of the required version strings.
WGET_LUA = find_executable(
    'Wget+Lua',
    [
        'GNU Wget 1.14.lua.20130523-9a5c',
        'GNU Wget 1.14.lua.20160530-955376b',
    ],
    [
        './wget-lua', './wget-lua-warrior', './wget-lua-local',
        '../wget-lua', '../../wget-lua',
        '/home/warrior/wget-lua', '/usr/bin/wget-lua',
    ],
)

# Refuse to load when no candidate was accepted.
if not WGET_LUA:
    raise Exception('No usable Wget+Lua found.')


###########################################################################
# The version number of this pipeline definition.
#
# Update this each time you make a non-cosmetic change.
Esempio n. 10
0
    raise Exception("This pipeline needs seesaw version 0.1.5 or higher.")


###########################################################################
# Locate a Wget+Lua binary.
#
# WGET_LUA becomes the first candidate path that runs its version check
# without crashing and prints the required version string.
WGET_LUA = find_executable(
    "Wget+Lua",
    ["GNU Wget 1.14.lua.20130523-9a5c"],
    [
        "./wget-lua", "./wget-lua-warrior", "./wget-lua-local",
        "../wget-lua", "../../wget-lua",
        "/home/warrior/wget-lua", "/usr/bin/wget-lua",
    ],
)

# Refuse to load when no candidate was accepted.
if not WGET_LUA:
    raise Exception("No usable Wget+Lua found.")

###########################################################################
# Determine if FFMPEG is available
# Should probably utilize an ffmpeg build (or source) distributed from the
# repo to avoid nasty API incompatibilities between FFMPEG versions.
# However, if the options used are relatively simple, using distro-provided
Esempio n. 11
0
import json

from os import environ as env
from urlparse import urlparse
from seesaw.project import *
from seesaw.item import *
from seesaw.task import *
from seesaw.pipeline import *
from seesaw.externalprocess import *

from seesaw.util import find_executable

VERSION = "20131101.01"
USER_AGENT = "ArchiveTeam ArchiveBot/%s" % VERSION
# 48 hours between archive requests
EXPIRE_TIME = 60 * 60 * 48

# Only ./wget-lua is considered, and it must report the ArchiveBot build.
WGET_LUA = find_executable(
    'Wget+Lua',
    "GNU Wget 1.14.0-archivebot1",
    ['./wget-lua'],
)

# Refuse to load when the binary is missing.
if not WGET_LUA:
    raise Exception("No usable Wget+Lua found.")

# All three environment variables must be present before continuing.
if 'RSYNC_URL' not in env:
    raise Exception('RSYNC_URL not set.')

if 'REDIS_URL' not in env:
    raise Exception('REDIS_URL not set.')

if 'LOG_CHANNEL' not in env:
    raise Exception('LOG_CHANNEL not set.')

RSYNC_URL = env['RSYNC_URL']
REDIS_URL = env['REDIS_URL']
Esempio n. 12
0
    raise Exception('This pipeline needs seesaw version 0.8.5 or higher.')


###########################################################################
# Locate a Wget+AT binary.
#
# WGET_AT becomes the first candidate path that runs its version check
# without crashing and prints one of the required version strings.
WGET_AT = find_executable(
    'Wget+AT',
    ['GNU Wget 1.20.3-at.20211001.01', 'GNU Wget 1.21.3-at.20220503.02'],
    ['./wget-at', '/home/warrior/data/wget-at-gnutls'],
)

# Refuse to load when no candidate was accepted.
if not WGET_AT:
    raise Exception('No usable Wget+At found.')


###########################################################################
# The version number of this pipeline definition.
#
# Update this each time you make a non-cosmetic change.
# It will be added to the WARC files and reported to the tracker.
from seesaw.pipeline import Pipeline
from seesaw.project import Project
from seesaw.util import find_executable

# Refuse to run on a seesaw older than 0.8.5.
if StrictVersion(seesaw.__version__) < StrictVersion('0.8.5'):
    raise Exception('This pipeline needs seesaw version 0.8.5 or higher.')

###########################################################################
# Locate a Wget+AT binary.
#
# WGET_AT becomes the first candidate path that runs its version check
# without crashing and prints the required version string.
WGET_AT = find_executable(
    'Wget+AT',
    ['GNU Wget 1.20.3-at.20210212.02'],
    ['./wget-at', '/home/warrior/data/wget-at'],
)

# Refuse to load when no candidate was accepted.
if not WGET_AT:
    raise Exception('No usable Wget+At found.')

###########################################################################
# Pipeline definition version.
#
# Bump it for every non-cosmetic change; it is added to the WARC files and
# reported to the tracker.
VERSION = '20210325.06'
USER_AGENT = 'Archive Team (ircs://irc.hackint.org#nintendone https://webirc.hackint.org/#irc://irc.hackint.org/#nintendone)'
TRACKER_ID = 'super-mario-maker-bookmarks'
TRACKER_HOST = 'legacy-api.arpa.li'
MULTI_ITEM_SIZE = 1
Esempio n. 14
0
# Refuse to run on a seesaw older than 0.8.5.
if StrictVersion(seesaw.__version__) < StrictVersion("0.8.5"):
    raise Exception("This pipeline needs seesaw version 0.8.5 or higher.")

###########################################################################
# Locate a Wpull executable.
#
# WPULL_EXE becomes the first candidate path that runs its version check
# without crashing and whose output matches the 1.2.3 version pattern.
WPULL_EXE = find_executable(
    "Wpull",
    re.compile(r"\b1\.2\.3\b"),
    [
        "./wpull",
        "/usr/local/bin/wpull",
        os.path.expanduser("~/.local/share/wpull-1.2.3/wpull"),
        os.path.expanduser("~/.local/bin/wpull"),
        "/usr/bin/wpull",
        "/usr/local/lib/python3.7/site-packages/wpull",
        "./wpull_bootstrap",
    ],
)

# youtube-dl has no version requirement (None) and is invoked with
# '--version'.
YOUTUBE_DL_EXE = find_executable(
    "youtube-dl",
    None,
    ["./youtube-dl", "/usr/local/bin/youtube-dl", "/usr/bin/youtube-dl",
     "youtube-dl"],
    '--version',
)
Esempio n. 15
0
from seesaw.externalprocess import WgetDownload
from seesaw.pipeline import Pipeline
from seesaw.project import Project
from seesaw.util import find_executable

# Refuse to run on a seesaw older than 0.8.5.
if StrictVersion(seesaw.__version__) < StrictVersion('0.8.5'):
    raise Exception('This pipeline needs seesaw version 0.8.5 or higher.')

###########################################################################
# Locate a Wget+AT binary.
#
# WGET_AT becomes the first candidate path that runs its version check
# without crashing and prints the required version string.
WGET_AT = find_executable(
    'Wget+AT',
    ['GNU Wget 1.20.3-at.20201030.01'],
    ['./wget-at', '/home/warrior/data/wget-at'],
)

# Refuse to load when no candidate was accepted.
if not WGET_AT:
    raise Exception('No usable Wget+At found.')

###########################################################################
# Pipeline definition version.
#
# Bump it for every non-cosmetic change; it is added to the WARC files and
# reported to the tracker.
VERSION = '20210203.05'
USER_AGENT = 'Archive Team'
TRACKER_ID = 'halo-new'
TRACKER_HOST = 'legacy-api.arpa.li'
MULTI_ITEM_SIZE = 5
Esempio n. 16
0
import shutil
import json

from tornado.httpclient import HTTPClient, HTTPRequest

from seesaw.project import *
from seesaw.item import *
from seesaw.config import *
from seesaw.task import *
from seesaw.pipeline import *
from seesaw.externalprocess import *
from seesaw.tracker import *

from seesaw.util import find_executable

# External tools this pipeline shells out to. Each lookup passes a version
# string that the tool is required to report.
WGET_LUA = find_executable(
    'wget-lua',
    '1.14.lua.20130523-9a5c',
    ['./wget-lua', 'wget-lua'],
)
CURL = find_executable('curl', '7.2', ['curl'])

# Both tools are mandatory; wget-lua is checked first.
if not WGET_LUA:
    raise Exception("wget-lua cannot be found")

if not CURL:
    raise Exception("curl cannot be found")

# ----

DATA_DIR = "data"
USER_AGENT = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:23.0) Gecko/20100101 Firefox/23.0"
VERSION = "20130910.01"
TRACKER = "http://quilt.at.ninjawedding.org/patchy"
Esempio n. 17
0
from seesaw.externalprocess import WgetDownload, ExternalProcess
from seesaw.pipeline import Pipeline
from seesaw.project import Project
from seesaw.util import find_executable


# Refuse to run on a seesaw older than 0.8.5.
if StrictVersion(seesaw.__version__) < StrictVersion("0.8.5"):
    raise Exception("This pipeline needs seesaw version 0.8.5 or higher.")


###########################################################################
# Locate an rsync executable; only /usr/bin/rsync is tried, with "2.6.9" as
# the accepted version string.
RSYNC = find_executable("rsync", ["2.6.9"], ["/usr/bin/rsync"])

# NOTE(review): the not-found check is commented out, so RSYNC may be falsy
# past this point.
#if not RSYNC:
#    raise Exception("No usable rsync found.")


###########################################################################
# Pipeline definition version.
#
# Bump it for every non-cosmetic change; it is added to the WARC files and
# reported to the tracker.
VERSION = "20150614.01"
USER_AGENT = 'ArchiveTeam'
TRACKER_ID = 'sourceforge-rsync'
Esempio n. 18
0
from seesaw.pipeline import Pipeline
from seesaw.project import Project
from seesaw.util import find_executable

# Refuse to run on a seesaw older than 0.1.5.
if StrictVersion(seesaw.__version__) < StrictVersion("0.1.5"):
    raise Exception("This pipeline needs seesaw version 0.1.5 or higher.")

###########################################################################
# Locate a Wget+Lua binary.
#
# WGET_LUA becomes the first candidate path that runs its version check
# without crashing and prints the required version string.
WGET_LUA = find_executable(
    "Wget+Lua",
    ["GNU Wget 1.14.lua.20130523-9a5c"],
    [
        "./wget-lua",
        "./wget-lua-warrior",
        "./wget-lua-local",
        "../wget-lua",
        "../../wget-lua",
        "/home/warrior/wget-lua",
        "/usr/bin/wget-lua",
    ],
)

# Refuse to load when no candidate was accepted.
if not WGET_LUA:
    raise Exception("No usable Wget+Lua found.")

###########################################################################
# Determine whether FFMPEG is available.
#
# An ffmpeg build (or source) shipped with the repo would avoid API
# incompatibilities between FFMPEG versions. With relatively simple options,
# distro-provided builds should not be too problematic; just be sure to add
# a metadata WARCRecord with the version string of the ffmpeg that is used.
FFMPEG = find_executable(
    "ffmpeg",
    ["ffmpeg version 2"],
    ["/usr/bin/ffmpeg", "/usr/local/bin/ffmpeg", "./ffmpeg"],
    "-version",
)
Esempio n. 19
0
# nice, though.
sys.path.append(os.getcwd())

from archivebot import control
from archivebot import shared_config
from archivebot.seesaw import extensions
from archivebot.seesaw import monitoring
from archivebot.seesaw.preflight import check_wpull_args
from archivebot.seesaw.wpull import WpullArgs
from archivebot.seesaw.tasks import GetItemFromQueue, StartHeartbeat, \
    SetFetchDepth, PreparePaths, Wpull, CompressLogIfFailed, WriteInfo, DownloadUrlFile, \
    RelabelIfAborted, MoveFiles, StopHeartbeat, MarkItemAsDone, CheckIP, CheckLocalWebserver

# Plain string; the parentheses in the earlier spelling did not make a tuple.
WPULL_VERSION = '2.0.3'
# 48 hours between archive requests
EXPIRE_TIME = 60 * 60 * 48

# Wpull must report WPULL_VERSION; youtube-dl has no version requirement
# (None). Both are invoked with '--version'.
WPULL_EXE = find_executable(
    'Wpull',
    WPULL_VERSION,
    ['wpull', './wpull'],
    '--version',
)
YOUTUBE_DL = find_executable('youtube-dl', None, ['./youtube-dl'], '--version')

# Fold major/minor into one integer so that 3.3 becomes 33.
version_integer = sys.version_info.major * 10 + sys.version_info.minor

# NOTE(review): these checks are assert statements, which Python removes
# when run with -O.
assert version_integer >= 33, (
    "This pipeline requires Python >= 3.3.  You are running %s." % sys.version
)

# Python 3.4.0 is rejected unless NO_SEGFAULT_340 is set in the environment.
if not os.environ.get('NO_SEGFAULT_340'):
    assert sys.version_info[:3] != (3, 4, 0), (
        "Python 3.4.0 should not be used. It may segfault. "
        "Set NO_SEGFAULT_340=1 if your Python is patched. "
        "See https://bugs.python.org/issue21435"
    )

assert WPULL_EXE, 'No usable Wpull found.'
Esempio n. 20
0

# Refuse to run on a seesaw older than 0.1.5.
if StrictVersion(seesaw.__version__) < StrictVersion("0.1.5"):
    raise Exception("This pipeline needs seesaw version 0.1.5 or higher.")


###########################################################################
# Locate an rsync_size_tester script.
#
# The candidates are tried with the single accepted version string "1".
RSYNC_TEST = find_executable(
    "rsync_size_tester",
    ["1"],
    [
        "./rsync_size_tester.py",
        "../rsync_size_tester.py",
        "../../rsync_size_tester.py",
        "/home/warrior/rsync_size_tester.py",
        "/usr/bin/rsync_size_tester.py",
    ],
)

# Hackish, but run-pipeline does not accept extra command line arguments:
# a file named "LARGE-RSYNC" in the directory raises the rsync size limit.
# The limits use gigabytes rather than gibibytes to be safe.
MAX_RSYNC = "150000000000" if os.path.isfile("LARGE-RSYNC") else "25000000000"

Esempio n. 21
0
# Refuse to run on a seesaw older than 0.8.5.
if StrictVersion(seesaw.__version__) < StrictVersion('0.8.5'):
    raise Exception('This pipeline needs seesaw version 0.8.5 or higher.')


###########################################################################
# Find a useful Wget+AT executable.
#
# WGET_AT will be set to the first path that
# 1. does not crash with --version, and
# 2. prints one of the required version strings
WGET_AT = find_executable(
    'Wget+AT',
    [
        'GNU Wget 1.20.3-at.20200401.01',
        'GNU Wget 1.20.3-at.20200804.01',
        'GNU Wget 1.20.3-at.20200902.01'
    ],
    ['./wget-at']
)

# The message names the tool that was actually searched for (Wget+AT); it
# previously said Wget+Lua, which did not match the lookup above.
if not WGET_AT:
    raise Exception('No usable Wget+AT found.')


###########################################################################
# The version number of this pipeline definition.
#
# Update this each time you make a non-cosmetic change.
# It will be added to the WARC files and reported to the tracker.
VERSION = '20200902.01'
Esempio n. 22
0
from seesaw.config import *
from seesaw.item import *
from seesaw.task import *
from seesaw.pipeline import *
from seesaw.externalprocess import *
from seesaw.tracker import *
from seesaw.util import find_executable


# Locate a Wget+Lua binary that reports the required build string.
WGET_LUA = find_executable(
    "Wget+Lua",
    "GNU Wget 1.14.lua.20130120-8476",
    [
        "./wget-lua", "./wget-lua-warrior", "./wget-lua-local",
        "../wget-lua", "../../wget-lua",
        "/home/warrior/wget-lua", "/usr/bin/wget-lua",
    ],
)

# Refuse to load when no candidate was accepted.
if not WGET_LUA:
    raise Exception("No usable Wget+Lua found.")


USER_AGENT = "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27"
VERSION = "20130129.01"

Esempio n. 23
0
    raise Exception('This pipeline needs seesaw version 0.8.5 or higher.')


###########################################################################
# Locate a Wget+Lua binary.
#
# WGET_LUA becomes the first candidate path that runs its version check
# without crashing and prints one of the required version strings.
WGET_LUA = find_executable(
    'Wget+Lua',
    [
        'GNU Wget 1.14.lua.20130523-9a5c',
        'GNU Wget 1.14.lua.20160530-955376b',
    ],
    [
        './wget-lua', './wget-lua-warrior', './wget-lua-local',
        '../wget-lua', '../../wget-lua',
        '/home/warrior/wget-lua', '/usr/bin/wget-lua',
    ],
)

# Refuse to load when no candidate was accepted.
if not WGET_LUA:
    raise Exception('No usable Wget+Lua found.')


###########################################################################
# The version number of this pipeline definition.
#
# Update this each time you make a non-cosmetic change.
Esempio n. 24
0
from seesaw.tracker import GetItemFromTracker, PrepareStatsForTracker, \
    UploadWithTracker, SendDoneToTracker
from seesaw.util import find_executable
import zstandard

# Refuse to run on a seesaw older than 0.8.5.
if StrictVersion(seesaw.__version__) < StrictVersion('0.8.5'):
    raise Exception('This pipeline needs seesaw version 0.8.5 or higher.')

###########################################################################
# Locate a Wget+AT binary.
#
# WGET_AT becomes the first candidate path that runs its version check
# without crashing and prints the required version string.
WGET_AT = find_executable(
    'Wget+AT',
    ['GNU Wget 1.20.3-at.20201030.01'],
    ['./wget-at'],
)

# Refuse to load when no candidate was accepted.
if not WGET_AT:
    raise Exception('No usable Wget+At found.')

###########################################################################
# Pipeline definition version.
#
# Bump it for every non-cosmetic change; it is added to the WARC files and
# reported to the tracker.
VERSION = '20201112.01'
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36'
TRACKER_ID = 'urls'
TRACKER_HOST = 'trackerproxy.archiveteam.org'
MULTI_ITEM_SIZE = 20
Esempio n. 25
0
sys.path.append(os.getcwd())

from archivebot import control
from archivebot import shared_config
from archivebot.seesaw import extensions
from archivebot.seesaw import monitoring
from archivebot.seesaw.preflight import check_wpull_args
from archivebot.seesaw.wpull import WpullArgs
from archivebot.seesaw.tasks import GetItemFromQueue, StartHeartbeat, \
    SetFetchDepth, PreparePaths, WriteInfo, DownloadUrlFile, \
    RelabelIfAborted, MoveFiles, StopHeartbeat, MarkItemAsDone, CheckIP

VERSION = "20150715.01"
PHANTOMJS_VERSION = '1.9.8'
# 48 hours between archive requests
EXPIRE_TIME = 60 * 60 * 48

# External tools. Wpull and youtube-dl are looked up with None as the
# version; PhantomJS must report PHANTOMJS_VERSION when invoked with '-v'.
WPULL_EXE = find_executable('Wpull', None, ['./wpull'])
PHANTOMJS = find_executable(
    'PhantomJS',
    PHANTOMJS_VERSION,
    ['phantomjs', './phantomjs', '../phantomjs'],
    '-v',
)
YOUTUBE_DL = find_executable('youtube-dl', None, ['./youtube-dl'], '--version')

# Fold major/minor into one integer so that 3.3 becomes 33.
version_integer = sys.version_info.major * 10 + sys.version_info.minor

# NOTE(review): these checks are assert statements, which Python removes
# when run with -O.
assert version_integer >= 33, (
    "This pipeline requires Python >= 3.3.  You are running %s." % sys.version
)

# Python 3.4.0 is rejected unless NO_SEGFAULT_340 is set in the environment.
if not os.environ.get('NO_SEGFAULT_340'):
    assert sys.version_info[:3] != (3, 4, 0), (
        "Python 3.4.0 should not be used. It may segfault. "
        "Set NO_SEGFAULT_340=1 if your Python is patched. "
        "See https://bugs.python.org/issue21435"
    )
Esempio n. 26
0
import json

from os import environ as env
from urlparse import urlparse
from seesaw.project import *
from seesaw.item import *
from seesaw.task import *
from seesaw.pipeline import *
from seesaw.externalprocess import *

from seesaw.util import find_executable

VERSION = "20140119.01"
USER_AGENT = "ArchiveTeam ArchiveBot/%s" % VERSION
# 48 hours between archive requests
EXPIRE_TIME = 60 * 60 * 48

# Only ./wget-lua is considered, and it must report the ArchiveBot build.
WGET_LUA = find_executable(
    'Wget+Lua',
    "GNU Wget 1.14.0-archivebot1",
    ['./wget-lua'],
)

# Refuse to load when the binary is missing.
if not WGET_LUA:
    raise Exception("No usable Wget+Lua found.")

# All three environment variables must be present before continuing.
if 'RSYNC_URL' not in env:
    raise Exception('RSYNC_URL not set.')

if 'REDIS_URL' not in env:
    raise Exception('REDIS_URL not set.')

if 'LOG_CHANNEL' not in env:
    raise Exception('LOG_CHANNEL not set.')

RSYNC_URL = env['RSYNC_URL']
REDIS_URL = env['REDIS_URL']
Esempio n. 27
0
from seesaw.pipeline import Pipeline
from seesaw.project import Project
from seesaw.util import find_executable

# Refuse to run on a seesaw older than 0.8.5.
if StrictVersion(seesaw.__version__) < StrictVersion('0.8.5'):
    raise Exception('This pipeline needs seesaw version 0.8.5 or higher.')

###########################################################################
# Locate a Wget+Lua binary.
#
# WGET_LUA becomes the first candidate path that runs its version check
# without crashing and prints the required version string.
WGET_LUA = find_executable(
    'Wget+Lua',
    ['GNU Wget 1.20.3-at-lua'],
    [
        './wget-lua',
        './wget-lua-warrior',
        './wget-lua-local',
        '../wget-lua',
        '../../wget-lua',
        '/home/warrior/wget-lua',
        '/usr/bin/wget-lua',
    ],
)

# Refuse to load when no candidate was accepted.
if not WGET_LUA:
    raise Exception('No usable Wget+Lua found.')

###########################################################################
# Pipeline definition version.
#
# Bump it for every non-cosmetic change; it is added to the WARC files and
# reported to the tracker.
VERSION = '20190925.01'
USER_AGENT = 'ArchiveTeam'
TRACKER_ID = 'sketch'
TRACKER_HOST = 'tracker.archiveteam.org'
    UploadWithTracker, SendDoneToTracker
from seesaw.util import find_executable

# Refuse to run on a seesaw older than 0.8.5.
if StrictVersion(seesaw.__version__) < StrictVersion("0.8.5"):
    raise Exception("This pipeline needs seesaw version 0.8.5 or higher.")

###########################################################################
# Locate a Python 3.5 interpreter.
#
# PYTHON35_EXE is taken from the candidates below; the version pattern
# requires output that starts with "Python 3.5".
PYTHON35_EXE = find_executable(
    "Python 3.5",
    re.compile(r"^Python 3\.5"),
    ["/usr/local/bin/python3.5", "python3.5", "python3", "python"],
)

# The interpreter and both S3 credentials are mandatory.
if not PYTHON35_EXE:
    raise Exception("No usable python3.5 library found.")
if not os.environ.get('s3access'):
    raise Exception("s3 access key missing")
if not os.environ.get('s3secret'):
    raise Exception("s3 secret key missing")

###########################################################################
# The version number of this pipeline definition.
#
# Update this each time you make a non-cosmetic change.
# It will be added to the WARC files and reported to the tracker.
Esempio n. 29
0
# Refuse to run on a seesaw older than 0.8.3.
if StrictVersion(seesaw.__version__) < StrictVersion("0.8.3"):
    raise Exception("This pipeline needs seesaw version 0.8.3 or higher.")


###########################################################################
# Locate a Wpull executable.
#
# WPULL_EXE becomes the first candidate path that runs its version check
# without crashing and whose output matches the 1.2 version pattern.
WPULL_EXE = find_executable("Wpull", re.compile(r"\b1\.2\b"), [
    "./wpull",
    os.path.expanduser("~/.local/share/wpull-1.2/wpull"),
    os.path.expanduser("~/.local/bin/wpull"),
    "./wpull_bootstrap",
    "wpull",
])

# Refuse to load when no candidate was accepted.
if not WPULL_EXE:
    raise Exception("No usable Wpull found.")


###########################################################################
# The version number of this pipeline definition.
#
# Update this each time you make a non-cosmetic change.
# It will be added to the WARC files and reported to the tracker.
Esempio n. 30
0
    UploadWithTracker, SendDoneToTracker
from tornado.ioloop import IOLoop
import zstandard

# Refuse to run on a seesaw older than 0.8.5.
if StrictVersion(seesaw.__version__) < StrictVersion('0.8.5'):
    raise Exception('This pipeline needs seesaw version 0.8.5 or higher.')

###########################################################################
# Locate a Wget+AT binary.
#
# WGET_AT becomes the first candidate path that runs its version check
# without crashing and prints one of the required version strings.
WGET_AT = find_executable(
    'Wget+AT',
    [
        'GNU Wget 1.20.3-at.20200902.01',
        'GNU Wget 1.20.3-at.20200917.01',
        'GNU Wget 1.20.3-at.20200919.01',
        'GNU Wget 1.20.3-at.20201030.01',
    ],
    ['./wget-at', '/usr/local/bin/wget-at'],
)

# Refuse to load when no candidate was accepted.
if not WGET_AT:
    raise Exception('No usable Wget+At found.')

###########################################################################
# Pipeline definition version.
#
# Bump it for every non-cosmetic change; it is added to the WARC files and
# reported to the tracker.
VERSION = '20210114.01'
USER_AGENT = 'Archive Team'
TRACKER_ID = 'github'
TRACKER_HOST = 'trackerproxy.archiveteam.org'
Esempio n. 31
0
from seesaw.util import find_executable

# Refuse to run on a seesaw older than 0.8.5.
if StrictVersion(seesaw.__version__) < StrictVersion("0.8.5"):
    raise Exception("This pipeline needs seesaw version 0.8.5 or higher.")

###########################################################################
# Locate a Wget+Lua binary.
#
# WGET_LUA becomes the first candidate path that runs its version check
# without crashing and prints one of the required version strings.
WGET_LUA = find_executable(
    "Wget+Lua",
    [
        "GNU Wget 1.14.lua.20130523-9a5c",
        "GNU Wget 1.14.lua.20160530-955376b",
    ],
    [
        "./wget-lua",
        "./wget-lua-warrior",
        "./wget-lua-local",
        "../wget-lua",
        "../../wget-lua",
        "/home/warrior/wget-lua",
        "/usr/bin/wget-lua",
    ],
)

# Refuse to load when no candidate was accepted.
if not WGET_LUA:
    raise Exception("No usable Wget+Lua found.")

###########################################################################
# Pipeline definition version.
#
# Bump it for every non-cosmetic change; it is added to the WARC files and
# reported to the tracker.
VERSION = "20161212.01"
USER_AGENT = 'ArchiveTeam'
TRACKER_ID = 'exua'
Esempio n. 32
0
from seesaw.util import find_executable

# Refuse to run on a seesaw older than 0.8.5.
if StrictVersion(seesaw.__version__) < StrictVersion("0.8.5"):
    raise Exception("This pipeline needs seesaw version 0.8.5 or higher.")

###########################################################################
# Locate a Wpull executable.
#
# WPULL_EXE becomes the first candidate path that runs its version check
# without crashing and whose output matches the 1.2.3 version pattern.
WPULL_EXE = find_executable(
    "Wpull",
    re.compile(r"\b1\.2\.3\b"),
    [
        "./wpull",
        os.path.expanduser("~/.local/share/wpull-1.2.3/wpull"),
        os.path.expanduser("~/.local/bin/wpull"),
        "./wpull_bootstrap",
        "wpull",
    ],
)

# Refuse to load when no candidate was accepted.
if not WPULL_EXE:
    raise Exception("No usable Wpull found.")

###########################################################################
# Pipeline definition version.
#
# Bump it for every non-cosmetic change; it is added to the WARC files and
# reported to the tracker.
VERSION = "20170826.01"
TRACKER_ID = 'newsgrabber'
TRACKER_HOST = 'tracker.archiveteam.org'
Esempio n. 33
0
# import requests

# Refuse to run on a seesaw older than 0.10.3.
if StrictVersion(seesaw.__version__) < StrictVersion('0.10.3'):
    raise Exception('This pipeline needs seesaw version 0.10.3 or higher.')

###########################################################################
# Locate a Python 3 interpreter.
#
# PYTHON is taken from the candidate paths below; any of the listed
# "Python 3.x" version strings is accepted.
# TODO
PYTHON = find_executable(
    'Python3',
    ['Python 3.8', 'Python 3.7', 'Python 3.6', 'Python 3.5'],
    ['/usr/bin/python3', '/usr/local/bin/python3', './python3'],
)

# Refuse to load when no candidate was accepted.
if not PYTHON:
    raise Exception('No usable Python 3 found.')

###########################################################################
# Pipeline definition version.
#
# Bump it for every non-cosmetic change; it is added to the WARC files and
# reported to the tracker.
VERSION = '20191202.02'
USER_AGENT = 'ArchiveTeam'
TRACKER_ID = 'yahoo-groups-api'
# TRACKER_HOST = 'tracker.archiveteam.org'  #prod-env
Esempio n. 34
0
      return path
  return None
#---------------------------------------

###########################################################################
# Locate a Wget+Lua binary.
#
# WGET_LUA becomes the first candidate path that runs its version check
# without crashing and prints one of the required version strings.
WGET_LUA = find_executable(
    "Wget+Lua",
    [
        "GNU Wget 1.14.lua.20130120-8476",
        "GNU Wget 1.14.lua.20130407-1f1d",
        "GNU Wget 1.14.lua.20130427-92d2",
        "GNU Wget 1.14.lua.20130523-9a5c",
    ],
    [
        "./wget-lua",
        "./wget-lua-warrior",
        "./wget-lua-local",
        "../wget-lua",
        "../../wget-lua",
        "/home/warrior/wget-lua",
        "/usr/bin/wget-lua",
    ],
)

# Refuse to load when no candidate was accepted.
if not WGET_LUA:
    raise Exception("No usable Wget+Lua found.")

###########################################################################
# User agent for external requests; use this constant in the Wget command
# line.
USER_AGENT = "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.28"
Esempio n. 35
0
    raise Exception("This pipeline needs seesaw version 0.8.5 or higher.")


###########################################################################
# Locate a Wget+Lua binary.
#
# WGET_LUA becomes the first candidate path that runs its version check
# without crashing and prints the required version string.
WGET_LUA = find_executable(
    "Wget+Lua",
    ["GNU Wget 1.14.lua.20130523-9a5c"],
    [
        "./wget-lua", "./wget-lua-warrior", "./wget-lua-local",
        "../wget-lua", "../../wget-lua",
        "/home/warrior/wget-lua", "/usr/bin/wget-lua",
    ],
)

# Refuse to load when no candidate was accepted.
if not WGET_LUA:
    raise Exception("No usable Wget+Lua found.")


###########################################################################
# The version number of this pipeline definition.
#
# Update this each time you make a non-cosmetic change.
Esempio n. 36
0
sys.path.append(os.getcwd())

from archivebot import control
from archivebot import shared_config
from archivebot.seesaw import extensions
from archivebot.seesaw import monitoring
from archivebot.seesaw.preflight import check_wpull_args
from archivebot.seesaw.wpull import WpullArgs
from archivebot.seesaw.tasks import GetItemFromQueue, StartHeartbeat, \
    SetFetchDepth, PreparePaths, WriteInfo, DownloadUrlFile, \
    RelabelIfAborted, MoveFiles, StopHeartbeat, MarkItemAsDone, CheckIP

VERSION = "20150424.01"
PHANTOMJS_VERSION = '1.9.8'
# 48 hours between archive requests
EXPIRE_TIME = 60 * 60 * 48

# External tools. Wpull is looked up with None as the version; PhantomJS
# must report PHANTOMJS_VERSION when invoked with '-v'.
WPULL_EXE = find_executable('Wpull', None, ['./wpull'])
PHANTOMJS = find_executable(
    'PhantomJS',
    PHANTOMJS_VERSION,
    ['phantomjs', './phantomjs', '../phantomjs'],
    '-v',
)

# Fold major/minor into one integer so that 3.3 becomes 33.
version_integer = sys.version_info.major * 10 + sys.version_info.minor

# NOTE(review): these checks are assert statements, which Python removes
# when run with -O.
assert version_integer >= 33, (
    "This pipeline requires Python >= 3.3.  You are running %s." % sys.version
)

# Python 3.4.0 is rejected unless NO_SEGFAULT_340 is set in the environment.
if not os.environ.get('NO_SEGFAULT_340'):
    assert sys.version_info[:3] != (3, 4, 0), (
        "Python 3.4.0 should not be used. It may segfault. "
        "Set NO_SEGFAULT_340=1 if your Python is patched. "
        "See https://bugs.python.org/issue21435"
    )
Esempio n. 37
0
# nice, though.
sys.path.append(os.getcwd())

from archivebot import control
from archivebot import shared_config
from archivebot.seesaw import extensions
from archivebot.seesaw import monitoring
from archivebot.seesaw.tasks import GetItemFromQueue, StartHeartbeat, \
    SetFetchDepth, PreparePaths, WriteInfo, DownloadUrlFile, \
    RelabelIfAborted, MoveFiles, SetWarcFileSizeInRedis, StopHeartbeat, \
    MarkItemAsDone


VERSION = "20140819.03"
# Single source of truth for the required PhantomJS version, used by both
# the lookup and the failure message so the two cannot drift apart (the
# message previously reported 1.9.0 while the lookup required 1.9.7).
PHANTOMJS_VERSION = '1.9.7'
EXPIRE_TIME = 60 * 60 * 48  # 48 hours between archive requests

# External tools. Wpull is looked up with None as the version (presumably
# "any version" -- confirm against find_executable); PhantomJS must report
# PHANTOMJS_VERSION when invoked with '-v'.
WPULL_EXE = find_executable('Wpull', None, ['./wpull'])
PHANTOMJS = find_executable('PhantomJS', PHANTOMJS_VERSION,
        ['phantomjs', './phantomjs'], '-v')

# Fold major/minor into one integer so that 3.3 becomes 33.
version_integer = (sys.version_info.major * 10) + sys.version_info.minor

# NOTE(review): these checks are assert statements, which Python removes
# when run with -O.
assert version_integer >= 33, \
        "This pipeline requires Python >= 3.3.  You are running %s." % \
        sys.version

assert WPULL_EXE, 'No usable Wpull found.'
assert PHANTOMJS, 'PhantomJS %s was not found.' % PHANTOMJS_VERSION
assert 'RSYNC_URL' in env, 'RSYNC_URL not set.'
assert 'REDIS_URL' in env, 'REDIS_URL not set.'

if StrictVersion(seesaw.__version__) < StrictVersion("0.1.8b1"):
Esempio n. 38
0
import shutil
import json

from tornado.httpclient import HTTPClient, HTTPRequest

from seesaw.project import *
from seesaw.item import *
from seesaw.config import *
from seesaw.task import *
from seesaw.pipeline import *
from seesaw.externalprocess import *
from seesaw.tracker import *

from seesaw.util import find_executable

# External tools this pipeline shells out to. Each lookup passes a version
# string that the tool is required to report.
WGET_LUA = find_executable(
    'wget-lua',
    '1.14.lua.20130523-9a5c',
    ['./wget-lua', 'wget-lua'],
)
CURL = find_executable('curl', '7.2', ['curl'])

# Both tools are mandatory; wget-lua is checked first.
if not WGET_LUA:
    raise Exception("wget-lua cannot be found")

if not CURL:
    raise Exception("curl cannot be found")

# ----

DATA_DIR = "data"
USER_AGENT = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:23.0) Gecko/20100101 Firefox/23.0"
VERSION = "20130910.01"
TRACKER = "http://quilt.at.ninjawedding.org/patchy"