def get_application(config=None):
    """Build the scrapyd application, honouring Heroku's $PORT.

    When $PORT is present in the environment, the configured http_port is
    replaced by it and the server binds on all interfaces; otherwise the
    values from the config file are kept.
    """
    if config is None:
        config = Config()
    port = os.environ.get('PORT')
    # Note: configparser values must be strings, so $PORT is used as-is
    # (never converted to int).
    config.cp['scrapyd'].update(
        http_port=port if port is not None else config.get('http_port'),
        bind_address='0.0.0.0' if port else config.get('bind_address'),
    )
    app_path = config.get('application', 'scrapyd.app.application')
    return load_object(app_path)(config)
def get_spider_list(project, runner=None, pythonpath=None, version=''):
    """Return the spider list from the given project, using the given runner.

    Results are memoized per (project, version) in ``get_spider_list.cache``.
    Raises RuntimeError with the runner's output when the subprocess exits
    non-zero.
    """
    # Lazily attach the cache to the function object on first call.
    if "cache" not in get_spider_list.__dict__:
        get_spider_list.cache = UtilsCache()
    try:
        return get_spider_list.cache[project][version]
    except KeyError:
        pass
    if runner is None:
        runner = Config().get('runner')
    env = os.environ.copy()
    # Force UTF-8 on the child's stdio so non-ASCII spider names survive.
    env['PYTHONIOENCODING'] = 'UTF-8'
    env['SCRAPY_PROJECT'] = project
    if pythonpath:
        env['PYTHONPATH'] = pythonpath
    if version:
        env['SCRAPY_EGG_VERSION'] = version
    pargs = [sys.executable, '-m', runner, 'list']
    proc = Popen(pargs, stdout=PIPE, stderr=PIPE, env=env)
    out, err = proc.communicate()
    if proc.returncode:
        # BUG FIX: the fallback was '' (str), so msg.decode() raised
        # AttributeError on Python 3 when both streams were empty; use b''.
        msg = (err or out or b'').decode('utf8')
        raise RuntimeError(msg.encode('unicode_escape') if six.PY2 else msg)
    # FIXME: can we reliably decode as UTF-8?
    # scrapy list does `print(list)`
    spiders = out.decode('utf-8').splitlines()
    try:
        project_cache = get_spider_list.cache[project]
    except KeyError:
        project_cache = {}
    project_cache[version] = spiders
    # Always reassign via __setitem__: dict-like stores backed by a database
    # (e.g. JsonSqliteDict) only persist on assignment, so mutating the
    # fetched mapping in place could be lost.
    get_spider_list.cache[project] = project_cache
    return spiders
def setUp(self):
    """Create a throwaway dir serving as both eggs_dir and logs_dir, plus a
    [settings] section mapping the test bot to its settings module."""
    workdir = self.mktemp()
    os.mkdir(workdir)
    config = Config(values={'eggs_dir': workdir, 'logs_dir': workdir})
    config.cp.add_section('settings')
    config.cp.set('settings', 'newbot', 'newbot.settings')
    self.environ = Environment(config, initenv={})
def get_spider_list(project, runner=None, pythonpath=None, version=''):
    """Return the spider list from the given project, using the given runner"""
    # Per-function memo, attached lazily on first call; keyed by project,
    # then by version.
    if "cache" not in get_spider_list.__dict__:
        get_spider_list.cache = UtilsCache()
    try:
        return get_spider_list.cache[project][version]
    except KeyError:
        pass
    if runner is None:
        runner = Config().get('runner')
    # Child inherits our environment plus the scrapyd-specific variables the
    # runner module reads.
    env = os.environ.copy()
    env['SCRAPY_PROJECT'] = project
    if pythonpath:
        env['PYTHONPATH'] = pythonpath
    if version:
        env['SCRAPY_EGG_VERSION'] = version
    # `python -m <runner> list` prints one spider name per line.
    pargs = [sys.executable, '-m', runner, 'list']
    proc = Popen(pargs, stdout=PIPE, stderr=PIPE, env=env)
    out, err = proc.communicate()
    if proc.returncode:
        # Surface only the last line of the child's output as the error.
        # NOTE(review): on Python 3 out/err are bytes while the fallback is
        # str — the mixed types work only as long as one stream is non-empty.
        msg = err or out or 'unknown error'
        raise RuntimeError(msg.splitlines()[-1])
    tmp = out.splitlines()
    # Refresh the cache entry for this project/version before returning.
    try:
        project_cache = get_spider_list.cache[project]
        project_cache[version] = tmp
    except KeyError:
        project_cache = {version: tmp}
    get_spider_list.cache[project] = project_cache
    return tmp
def test_egg_config_application(self):
    """The eggstorage component must be swappable via the [scrapyd] config."""
    config = Config()
    eggstore = 'scrapyd.tests.test_eggstorage.SomeFakeEggStorage'
    config.cp.set('scrapyd', 'eggstorage', eggstore)
    app = application(config)
    app_eggstorage = app.getComponent(IEggStorage)
    assert isinstance(app_eggstorage, SomeFakeEggStorage)
    # BUG FIX: this comparison's result was previously discarded (bare
    # expression, no assert), so the check never ran.
    assert app_eggstorage.list_projects() == ['hello_world']
def setUp(self):
    """Populate a MemoryJobStorage (keep at most 2 finished jobs) with the
    three module-level fixture jobs."""
    dbs_dir = self.mktemp()
    config = Config(values={'dbs_dir': dbs_dir, 'finished_to_keep': '2'})
    self.jobst = MemoryJobStorage(config)
    # Keep references on the instance for the assertions in the tests.
    self.j1, self.j2, self.j3 = j1, j2, j3
    for job in (self.j1, self.j2, self.j3):
        self.jobst.add(job)
def test_get_environment_with_no_items_dir(self):
    """With items_dir and logs_dir blank, neither SCRAPY_FEED_URI nor
    SCRAPY_LOG_FILE may appear in the generated environment."""
    config = Config(values={'items_dir': '', 'logs_dir': ''})
    config.cp.add_section('settings')
    config.cp.set('settings', 'newbot', 'newbot.settings')
    msg = {'_project': 'mybot', '_spider': 'myspider', '_job': 'ID'}
    slot = 3
    environ = Environment(config, initenv={})
    env = environ.get_environment(msg, slot)
    # failUnless() is deprecated; assertNotIn gives a clearer failure message.
    self.assertNotIn('SCRAPY_FEED_URI', env)
    self.assertNotIn('SCRAPY_LOG_FILE', env)
def setUp(self):
    """Create eggs/dbs dirs (plus two project egg dirs) and build the
    poller under test from them."""
    base = self.mktemp()
    eggs_dir = os.path.join(base, 'eggs')
    dbs_dir = os.path.join(base, 'dbs')
    for directory in (eggs_dir,
                      dbs_dir,
                      os.path.join(eggs_dir, 'mybot1'),
                      os.path.join(eggs_dir, 'mybot2')):
        os.makedirs(directory)
    config = Config(values={'eggs_dir': eggs_dir, 'dbs_dir': dbs_dir})
    self.queues = get_spider_queues(config)
    self.poller = QueuePoller(config)
def list(project):
    """Return [{'version': ..., 'checksum': md5-hex}, ...] for every egg of
    *project*, sorted by version (LooseVersion order)."""
    eggdir = path.join(Config().get("eggs_dir"), project)
    versions = {}
    for eggpath in glob("%s/*.egg" % eggdir):
        # BUG FIX: the original `open(x, 'rb').read()` never closed the
        # files; the context manager guarantees each handle is released.
        with open(eggpath, 'rb') as eggfile:
            digest = hashlib.md5(eggfile.read()).hexdigest()
        versions[path.splitext(path.basename(eggpath))[0]] = digest
    return [{"version": version, "checksum": versions[version]}
            for version in sorted(versions.keys(), key=LooseVersion)]
def get_spider_list(project, runner=None):
    """Return the spider list from the given project, using the given runner"""
    if runner is None:
        runner = Config().get('runner')
    # Hand the project name to the runner through the child's environment.
    env = os.environ.copy()
    env['SCRAPY_PROJECT'] = project
    command = [sys.executable, '-m', runner, 'list']
    proc = Popen(command, stdout=PIPE, stderr=PIPE, env=env)
    out, err = proc.communicate()
    if not proc.returncode:
        return out.splitlines()
    # Non-zero exit: surface the last line of whatever the child printed.
    msg = err or out or 'unknown error'
    raise RuntimeError(msg.splitlines()[-1])
def test_get_environment_with_logfile(self):
    """logs_filename template must expand project, spider, job and timestamp
    into SCRAPY_LOG_FILE."""
    config = Config(
        values={
            'items_dir': '',
            'logs_dir': '.',
            'logs_filename': '{project}-{spider}-{job}-{Y}{m}{d}T{H}{M}{S}'
        })
    msg = {'_project': 'mybot', '_spider': 'myspider', '_job': 'ID'}
    slot = 3
    environ = Environment(config, initenv={})
    now = datetime.datetime.now()
    env = environ.get_environment(msg, slot)
    # BUG FIX: the expected name said 'spider' instead of 'myspider' and
    # dropped the job id; worse, self.assert_(a, b) treats b as the failure
    # *message*, so the old test passed whenever env[...] was truthy.
    # assertIn tolerates the logs_dir prefix / file extension around the stem.
    expected_logfilename = now.strftime("mybot-myspider-ID-%Y%m%dT%H%M%S")
    self.assertIn(expected_logfilename, env['SCRAPY_LOG_FILE'])
def main():
    """Entry point: advertise this scrapyd worker in ZooKeeper, then start
    the regular twistd-based scrapyd server."""
    logging.basicConfig(level=logging.INFO)
    config = Config()
    # Register this worker's HTTP endpoint under the cluster path so a
    # scheduler can discover it via ZooKeeper.
    Register(to_bytes('http://%s:%d' % (config.get('bind_address', '127.0.0.1'),
                                        config.getint('http_port', 6800))),
             config.get('register_path', '/scrapyd-cluster/worker'),
             hosts=config.get('zookeeper_hosts', '127.0.0.1:2181'))
    # Splice twistd arguments in front of the user's: -n run in the
    # foreground, -y load the application from scrapyd's txapp.py.
    argv[1:1] = ['-n', '-y', join(dirname(scrapyd.__file__), 'txapp.py')]
    run()
def get_spider_list(project, runner=None, pythonpath=None, version=''):
    """Return the spider list from the given project, using the given runner.

    Results are memoized per (project, version) in ``get_spider_list.cache``.
    Cleaned up: debug ``print`` statements (including a bare ``print``) and
    commented-out experiments removed, and the error handling that had been
    commented out is restored — a non-zero exit from the runner must raise,
    not be silently ignored.
    """
    if "cache" not in get_spider_list.__dict__:
        get_spider_list.cache = UtilsCache()
    try:
        return get_spider_list.cache[project][version]
    except KeyError:
        pass
    if runner is None:
        runner = Config().get('runner')
    env = os.environ.copy()
    # Force UTF-8 on the child's stdio so non-ASCII spider names survive.
    env['PYTHONIOENCODING'] = 'UTF-8'
    # Python 2: *project* arrives as utf-8 bytes — decode so the environment
    # value is consistent text (str() keeps it a native str on py2).
    env['SCRAPY_PROJECT'] = str(project.decode('utf8'))
    if pythonpath:
        env['PYTHONPATH'] = pythonpath
    if version:
        env['SCRAPY_EGG_VERSION'] = version
    pargs = [sys.executable, '-m', runner, 'list']
    proc = Popen(pargs, stdout=PIPE, stderr=PIPE, env=env)
    out, err = proc.communicate()
    if proc.returncode:
        msg = (err or out or b'').decode('utf8')
        raise RuntimeError(msg.encode('unicode_escape') if six.PY2 else msg)
    # scrapy list does `print(list)` — one spider name per line.
    tmp = out.decode('utf-8').splitlines()
    try:
        project_cache = get_spider_list.cache[project]
        project_cache[version] = tmp
    except KeyError:
        project_cache = {version: tmp}
    get_spider_list.cache[project] = project_cache
    return tmp
def parse_spider_log(project, spider, jobid, keyword):
    """Scan <logs_dir>/<project>/<spider>/<jobid>.log for lines containing
    *keyword* and return the trailing ``{...}`` dict parsed from the last
    such line, or an empty dict when nothing matches.
    """
    import re
    import ast
    logdir = Config().get('logs_dir')
    file_name = os.path.join(logdir, project, spider, jobid + '.log')
    logs, res = '', {}
    if os.path.exists(file_name):
        with open(file_name, 'r') as f:
            data = f.readline()
            while data:
                if keyword in data:
                    logs = data
                    data = f.readline()
                else:
                    # NOTE(review): scanning stops at the first line WITHOUT
                    # the keyword — behavior preserved from the original;
                    # confirm this short-circuit is intentional.
                    return res
    if logs:
        match_str = re.search(r'{.*?}\n$', logs)
        if match_str:
            # SECURITY FIX: eval() executed arbitrary code found in the log
            # file; ast.literal_eval accepts only Python literals.
            res = ast.literal_eval(match_str.group().strip())
    return res
def _get_config():
    # Build a scrapyd Config whose data directories live under the Scrapy
    # project data dir, creating each directory if missing.
    datadir = os.path.join(project_data_dir(), 'scrapyd')
    conf = {
        'eggs_dir': os.path.join(datadir, 'eggs'),
        'logs_dir': os.path.join(datadir, 'logs'),
        'items_dir': os.path.join(datadir, 'items'),
        'dbs_dir': os.path.join(datadir, 'dbs'),
    }
    for k in ['eggs_dir', 'logs_dir', 'items_dir', 'dbs_dir']:  # create dirs
        d = conf[k]
        if not os.path.exists(d):
            os.makedirs(d)
    # Feed the generated paths to Config through an in-memory ini source so
    # they override the defaults without touching any file on disk.
    scrapyd_conf = """
[scrapyd]
eggs_dir = %(eggs_dir)s
logs_dir = %(logs_dir)s
items_dir = %(items_dir)s
dbs_dir = %(dbs_dir)s
""" % conf
    return Config(extra_sources=[StringIO(scrapyd_conf)])
def setUp(self):
    """Build the egg storage under test on a throwaway eggs directory."""
    eggs_dir = self.mktemp()
    self.eggst = FilesystemEggStorage(Config(values={'eggs_dir': eggs_dir}))
#Stage 2 Update (Python 3) from future import standard_library standard_library.install_aliases() from builtins import object import datetime, json import urllib.request, urllib.parse, http.client from scrapy.utils.project import get_project_settings settings = get_project_settings() from scrapyd.config import Config scrapyd_config = Config() scrapyd_port = scrapyd_config.getint('http_port', 6800) from dynamic_scraper.models import Scraper class TaskUtils(object): conf = { "MAX_SPIDER_RUNS_PER_TASK": 10, "MAX_CHECKER_RUNS_PER_TASK": 25, } def _run_spider(self, **kwargs): param_dict = { 'project': 'default', 'spider': kwargs['spider'], 'id': kwargs['id'], 'run_type': kwargs['run_type'], 'do_action': kwargs['do_action'] } params = urllib.parse.urlencode(param_dict) headers = {"Content-type": "application/x-www-form-urlencoded", "Accept": "text/plain"}
def get_application(config=None):
    """Instantiate the scrapyd application named by the 'application'
    config option (defaults to scrapyd.app.application)."""
    cfg = Config() if config is None else config
    app_path = cfg.get('application', 'scrapyd.app.application')
    return load_object(app_path)(cfg)
def __init__(self):
    # Cache database location comes from config; by default the cache is
    # kept entirely in memory.
    db_path = Config().get("cache_dbs", default=":memory:")
    self.cache_manager = JsonSqliteDict(database=db_path,
                                        table="utils_cache_manager")
#Stage 2 Update (Python 3) from future import standard_library standard_library.install_aliases() from builtins import object import datetime, json import urllib.request, urllib.parse, http.client import os from scrapy.utils.project import get_project_settings settings = get_project_settings() from scrapyd.config import Config scrapyd_config = Config() scrapyd_port = scrapyd_config.getint('http_port', 6800) from dynamic_scraper.models import Scraper class TaskUtils(object): conf = { "MAX_SPIDER_RUNS_PER_TASK": 10, "MAX_CHECKER_RUNS_PER_TASK": 25, } def _run_spider(self, **kwargs): scrapyd_host = os.environ.get('SCRAPYD_HOST', 'localhost') scrapyd_port = os.environ.get('SCRAPYD_PORT', '6800') param_dict = { 'project': 'default', 'spider': kwargs['spider'], 'id': kwargs['id'], 'run_type': kwargs['run_type'], 'do_action': kwargs['do_action']
# Migration script (Python 2): re-upload every egg found in a local scrapyd
# eggs directory to another scrapyd instance via its addversion.json API.
import os
from scrapyd.eggstorage import FilesystemEggStorage
from scrapyd.config import Config
import urllib2
from poster.encode import multipart_encode
from poster.streaminghttp import register_openers

# Install poster's streaming HTTP handlers so urllib2 can POST multipart data.
register_openers()

source_dir = '/kf/scrapyd'
dest_url = 'http://localhost:6801/addversion.json'
source_eggs_dir = os.path.join(source_dir, 'eggs')
# NOTE(review): elsewhere in this codebase Config is constructed with
# Config(values={...}); here the dict is passed positionally — confirm the
# first positional parameter of Config accepts it.
source_config = Config({'eggs_dir': source_eggs_dir})
source_egg_storage = FilesystemEggStorage(source_config)

# Each subdirectory of the eggs dir is one project.
for dir in os.listdir(source_eggs_dir):
    #print dir
    project = dir
    # get() yields the newest (version, egg-file) pair for the project.
    version, egg = source_egg_storage.get(project)
    print project, version
    post_data = {
        'egg': egg,
        'project': project,
        'version': version,
    }
    datagen, headers = multipart_encode(post_data)
    request = urllib2.Request(url=dest_url, headers=headers, data=datagen)
    try:
        res = urllib2.urlopen(request)
    except urllib2.HTTPError as e:
        # Best-effort: report the failure and continue with the next project.
        print 'HTTPError: %s' % e
# -*- coding: utf-8 -*-
"""WSGI entry point: expose the SpiderKeeper application built from the
default scrapyd configuration."""
from scrapyd.config import Config
from SpiderKeeper.scrapyd.app import create_spiderkeeper_application

_config = Config()
application = create_spiderkeeper_application(_config)