def test_update_settings_check_unicode_in_py2_key_value():
    """Check a non-ASCII setting resolves via the literal and the native key."""
    # A dict entry is duplicated when the unicode key doesn't match its
    # native str form, so both spellings of the key must be readable.
    key = '\xf1e\xf1e\xf1e'
    value = '\xf1e\xf1e'
    settings = EntrypointSettings()
    settings.setdict({key: value}, 10)
    assert settings[key] == value
    assert settings[to_native_str(key)] == to_native_str(value)
def _update_settings(o, d):
    """Convert the entries of ``d`` to native strings and merge them into ``o``.

    Every key of ``d`` is passed through ``to_native_str``; a value is
    converted as well when ``is_string`` reports it is a string, otherwise
    it is kept untouched. ``d`` is modified in place (a key whose native
    form differs is added next to the original entry) and is then handed
    to ``o.update``.
    """
    # We need to convert settings to string since the S3 download handler
    # doesn't work if the AWS keys are passed as unicode. Other code may also
    # depend on settings being str. TODO: we should test this
    #
    # Iterate over a snapshot of the items: inserting a converted key while
    # walking the live ``items()`` view raises RuntimeError on Python 3.
    for k, v in list(d.items()):
        d[to_native_str(k)] = to_native_str(v) if is_string(v) else v
    o.update(d)
def _make_scrapy_args(arg, args_dict):
    """Build a flat ``[arg, "key=value", arg, "key=value", ...]`` list.

    Returns an empty list when ``args_dict`` is falsy. Entries are emitted
    in sorted item order; keys go through ``to_native_str`` and values do
    too when ``is_string`` accepts them.
    """
    if not args_dict:
        return []
    result = []
    for name, value in sorted(dict(args_dict).items()):
        native_name = to_native_str(name)
        if is_string(value):
            value = to_native_str(value)
        result.append(arg)
        result.append("{}={}".format(native_name, value))
    return result
def _job_args_and_env(msg):
    """Return ``(cmd, env)`` taken from the ``job_cmd``/``job_env`` of ``msg``.

    A ``job_env`` that is not a dict is treated as empty. A ``job_cmd`` that
    is not a list is wrapped as a one-element list of its ``str()`` form.
    Environment names are converted with ``to_native_str``; values are
    converted only when ``is_string`` accepts them.
    """
    raw_env = msg.get('job_env')
    if not isinstance(raw_env, dict):
        raw_env = {}
    command = msg.get('job_cmd')
    if not isinstance(command, list):
        command = [str(command)]
    environment = {}
    for name, value in sorted(dict(raw_env).items()):
        native_name = to_native_str(name)
        if is_string(value):
            value = to_native_str(value)
        environment[native_name] = value
    return command, environment
def _make_scrapy_args(arg, args_dict):
    """Turn ``args_dict`` into repeated ``arg, "key=value"`` list entries.

    A falsy ``args_dict`` yields ``[]``. Items are processed in sorted
    order, with keys (and string values, per ``is_string``) passed through
    ``to_native_str`` before formatting.
    """
    if not args_dict:
        return []
    ordered_items = sorted(dict(args_dict).items())
    collected = []
    for key, val in ordered_items:
        rendered = "{}={}".format(
            to_native_str(key),
            to_native_str(val) if is_string(val) else val)
        collected.extend((arg, rendered))
    return collected
def _job_args_and_env(msg):
    """Extract the job command list and native-str environment from ``msg``.

    ``job_env`` is used only when it is a dict (otherwise an empty one is
    substituted); ``job_cmd`` is used as-is when it is a list, otherwise
    it becomes ``[str(job_cmd)]``. Returns a ``(cmd, env)`` tuple.
    """
    env = msg.get('job_env')
    env = env if isinstance(env, dict) else {}
    cmd = msg.get('job_cmd')
    cmd = cmd if isinstance(cmd, list) else [str(cmd)]

    def _native_value(value):
        # Only string values are converted; anything else passes through.
        return to_native_str(value) if is_string(value) else value

    native_env = {}
    for k, v in sorted(dict(env).items()):
        native_env[to_native_str(k)] = _native_value(v)
    return cmd, native_env
def write(self, data):
    """Buffer ``data`` and log each complete line it finishes.

    ``data`` is converted with ``to_native_str`` using ``self.encoding``
    and appended to ``self.buf``. Everything up to the last newline is sent
    line by line to ``self._logprefixed``; the remainder stays in
    ``self.buf``.
    """
    decoded = to_native_str(data, self.encoding)
    pieces = (self.buf + decoded).split('\n')
    # The last piece has no terminating newline yet, so keep it buffered.
    self.buf = pieces.pop()
    for line in pieces:
        self._logprefixed(line)
def test_get_args_and_env_run_script():
    """Check the command and environment built for a ``job_cmd`` message."""
    msg = {'key': '1/2/3', 'job_cmd': ['custom.py', 'arg1'],
           'auth': 'authstring'}
    result = get_args_and_env(msg)
    # Use the canonical 'hex_codec' name: the bare 'hex' alias is not
    # registered on Python 3.2/3.3, and the sibling tests use 'hex_codec'.
    expected_auth = codecs.encode(to_bytes('1/2/3:authstring'), 'hex_codec')
    assert len(result) == 2
    assert result[0] == ['custom.py', 'arg1']
    assert result[1] == {
        'SHUB_JOBAUTH': to_native_str(expected_auth),
        'SHUB_JOBKEY': '1/2/3',
        'SHUB_JOBNAME': 'custom.py',
        'SHUB_JOB_TAGS': ''}
    # Adding tags and an API url must show up in the environment.
    add_fields = {'tags': ['tagA', 'tagB'], 'api_url': 'some-api-url'}
    msg.update(add_fields)
    result1 = get_args_and_env(msg)
    assert len(result1) == 2
    assert result1[1]['SHUB_APIURL'] == 'some-api-url'
    assert result1[1]['SHUB_JOB_TAGS'] == 'tagA,tagB'
def _get_log_item(self, ev):
    """Get HubStorage log item for the given Twisted event, or None if no
    document should be inserted.

    The level is ``ev['logLevel']`` for events whose ``system`` is
    ``'scrapy'``; otherwise it is ERROR or INFO depending on
    ``ev['isError']``. Events below the handler's current level return
    None. Otherwise a dict with ``message`` and ``level`` keys is returned.
    """
    if ev['system'] == 'scrapy':
        level = ev['logLevel']
    elif ev['isError']:
        level = logging.ERROR
    else:
        level = logging.INFO

    # It's important to access level through handler instance,
    # min log level can change at any moment.
    if level < self._hs_loghdlr.level:
        return

    msg = ev.get('message')
    if msg:
        # Only the first element of the message sequence is used.
        msg = to_native_str(msg[0])

    failure = ev.get('failure', None)
    if failure:
        # A failure's traceback replaces the plain message.
        msg = failure.getTraceback()

    why = ev.get('why', None)
    if why:
        msg = "%s\n%s" % (why, msg)

    fmt = ev.get('format')
    if fmt:
        try:
            msg = fmt % ev
        except Exception:
            # Formatting problems are reported as an ERROR item. Only
            # Exception is caught so KeyboardInterrupt/SystemExit still
            # propagate instead of being turned into a log line.
            msg = "UNABLE TO FORMAT LOG MESSAGE: fmt=%r ev=%r" % (fmt, ev)
            level = logging.ERROR

    # to replicate typical scrapy log appearance
    msg = msg.replace('\n', '\n\t')
    return {'message': msg, 'level': level}
def test_get_args_and_env_run_script():
    """Check the command and environment built for a ``job_cmd`` message."""
    message = {
        'key': '1/2/3',
        'job_cmd': ['custom.py', 'arg1'],
        'auth': 'authstring',
    }
    first = get_args_and_env(message)
    hex_auth = codecs.encode(to_bytes('1/2/3:authstring'), 'hex_codec')
    expected_env = {
        'SHUB_JOBAUTH': to_native_str(hex_auth),
        'SHUB_JOBKEY': '1/2/3',
        'SHUB_JOBNAME': 'custom.py',
        'SHUB_JOB_TAGS': '',
    }
    assert len(first) == 2
    assert first[0] == ['custom.py', 'arg1']
    assert first[1] == expected_env

    # Tags and an API url added to the message must reach the environment.
    message.update({'tags': ['tagA', 'tagB'], 'api_url': 'some-api-url'})
    second = get_args_and_env(message)
    assert len(second) == 2
    second_env = second[1]
    assert second_env['SHUB_APIURL'] == 'some-api-url'
    assert second_env['SHUB_JOB_TAGS'] == 'tagA,tagB'
def test_get_args_and_env_run_spider():
    """Check the ``scrapy crawl`` command and environment for a spider message."""
    message = {
        'key': '1/2/3',
        'spider': 'test',
        'spider_type': 'auto',
        'auth': 'auths',
        'spider_args': {'arg1': 'val1', 'arg2': 'val2'},
        'settings': {'SETTING1': 'VAL1', 'SETTING2': 'VAL2'},
    }
    first = get_args_and_env(message)
    hex_auth = codecs.encode(to_bytes('1/2/3:auths'), 'hex_codec')
    expected_cmd = [
        'scrapy', 'crawl', 'test',
        '-a', 'arg1=val1', '-a', 'arg2=val2',
        '-s', 'SETTING1=VAL1', '-s', 'SETTING2=VAL2',
    ]
    expected_env = {
        'SCRAPY_JOB': '1/2/3',
        'SCRAPY_PROJECT_ID': '1',
        'SCRAPY_SPIDER': 'test',
        'SHUB_JOBAUTH': to_native_str(hex_auth),
        'SHUB_JOBKEY': '1/2/3',
        'SHUB_JOBNAME': 'test',
        'SHUB_JOB_TAGS': '',
        'SHUB_SPIDER_TYPE': 'auto',
    }
    assert len(first) == 2
    assert first[0] == expected_cmd
    assert first[1] == expected_env

    # Tags and an API url added to the message must reach the environment.
    message.update({'tags': ['tagA', 'tagB'], 'api_url': 'some-api-url'})
    second = get_args_and_env(message)
    assert len(second) == 2
    second_env = second[1]
    assert second_env['SHUB_APIURL'] == 'some-api-url'
    assert second_env['SHUB_JOB_TAGS'] == 'tagA,tagB'
def test_get_args_and_env_run_spider():
    """Check the ``scrapy crawl`` command and environment for a spider message."""
    msg = {'key': '1/2/3', 'spider': 'test', 'spider_type': 'auto',
           'auth': 'auths', 'spider_args': {'arg1': 'val1', 'arg2': 'val2'},
           'settings': {'SETTING1': 'VAL1', 'SETTING2': 'VAL2'}}
    result = get_args_and_env(msg)
    # Use the canonical 'hex_codec' name: the bare 'hex' alias is not
    # registered on Python 3.2/3.3, and the sibling tests use 'hex_codec'.
    expected_auth = codecs.encode(to_bytes('1/2/3:auths'), 'hex_codec')
    assert len(result) == 2
    assert result[0] == ['scrapy', 'crawl', 'test', '-a', 'arg1=val1',
                         '-a', 'arg2=val2', '-s', 'SETTING1=VAL1', '-s',
                         'SETTING2=VAL2']
    assert result[1] == {'SCRAPY_JOB': '1/2/3',
                         'SCRAPY_PROJECT_ID': '1',
                         'SCRAPY_SPIDER': 'test',
                         'SHUB_JOBAUTH': to_native_str(expected_auth),
                         'SHUB_JOBKEY': '1/2/3',
                         'SHUB_JOBNAME': 'test',
                         'SHUB_JOB_TAGS': '',
                         'SHUB_SPIDER_TYPE': 'auto'}
    # Adding tags and an API url must show up in the environment.
    add_fields = {'tags': ['tagA', 'tagB'], 'api_url': 'some-api-url'}
    msg.update(add_fields)
    result1 = get_args_and_env(msg)
    assert len(result1) == 2
    assert result1[1]['SHUB_APIURL'] == 'some-api-url'
    assert result1[1]['SHUB_JOB_TAGS'] == 'tagA,tagB'
def auth(self):
    """Return the ``SHUB_JOBAUTH`` environment value, hex-decoded, as native str.

    Raises KeyError when ``SHUB_JOBAUTH`` is not set in the environment.
    """
    # NOTE(review): ``decode`` is presumably ``codecs.decode`` -- confirm
    # against the module imports.
    encoded = os.environ['SHUB_JOBAUTH']
    raw = decode(encoded, 'hex_codec')
    return to_native_str(raw)
def test_jobauth():
    """Check ``_jobauth`` returns the hex-encoded ``key:auth`` pair as native str."""
    msg = {'key': '1/2/3', 'auth': 'authstring'}
    # Use the canonical 'hex_codec' name: the bare 'hex' alias is not
    # registered on Python 3.2/3.3, and ``_jobauth`` itself uses 'hex_codec'.
    expected = codecs.encode(to_bytes('1/2/3:authstring'), 'hex_codec')
    assert _jobauth(msg) == to_native_str(expected)
def set(self, name, value, priority='project'):
    """Store a setting with its name (and any string value) as native str.

    The name always goes through ``to_native_str``; the value does only
    when ``is_string`` accepts it. The actual storing is delegated to the
    parent class ``set`` with the given ``priority``.
    """
    native_name = to_native_str(name)
    native_value = to_native_str(value) if is_string(value) else value
    super(EntrypointSettings, self).set(
        native_name, native_value, priority=priority)
def _jobauth(msg):
    """Return the hex-encoded ``<key>:<auth>`` pair of ``msg`` as native str."""
    plain = '{0[key]}:{0[auth]}'.format(msg)
    hexed = codecs.encode(to_bytes(plain), 'hex_codec')
    return to_native_str(hexed)
def test_jobauth():
    """Check ``_jobauth`` returns the hex-encoded ``key:auth`` pair as native str."""
    message = {'key': '1/2/3', 'auth': 'authstring'}
    hex_bytes = codecs.encode(to_bytes('1/2/3:authstring'), 'hex_codec')
    wanted = to_native_str(hex_bytes)
    assert _jobauth(message) == wanted
def writelines(self, lines):
    """Log every item of ``lines`` after converting it to a native str.

    Each item is converted with ``to_native_str`` using ``self.encoding``
    and passed straight to ``self._logprefixed``.
    """
    for raw_line in lines:
        self._logprefixed(to_native_str(raw_line, self.encoding))
# -*- coding: utf-8 -*- import codecs import os import shutil import tempfile import pytest TEMP_DIR = tempfile.mkdtemp() SHUB_FIFO_PATH = os.path.join(TEMP_DIR, 'scrapinghub') os.environ['SHUB_FIFO_PATH'] = SHUB_FIFO_PATH from sh_scrapy.compat import to_native_str, to_bytes TEST_AUTH = to_native_str(codecs.encode(to_bytes('1/2/3:authstr'), 'hex_codec')) @pytest.fixture(scope='session', autouse=True) def clean_shub_fifo_path(): global TEMP_DIR try: yield finally: shutil.rmtree(TEMP_DIR) @pytest.fixture(autouse=True) def set_jobkeyenvironment(monkeypatch): monkeypatch.setenv('SHUB_JOBKEY', '1/2/3') monkeypatch.setenv('SCRAPY_JOB', '1/2/3')
def set(self, name, value, priority='project'):
    """Set ``name`` to ``value`` using native-str forms where applicable.

    ``name`` is always converted with ``to_native_str``; ``value`` is
    converted only if ``is_string`` says it is a string. The parent class
    ``set`` receives the converted pair together with ``priority``.
    """
    key = to_native_str(name)
    if is_string(value):
        stored = to_native_str(value)
    else:
        stored = value
    parent = super(EntrypointSettings, self)
    parent.set(key, stored, priority=priority)
import os import sys import mock import pytest import codecs from sh_scrapy.hsref import _HubstorageRef from sh_scrapy.compat import to_native_str, to_bytes TEST_AUTH = to_native_str(codecs.encode(to_bytes('1/2/3:authstr'), 'hex')) def test_init_disabled(): hsref = _HubstorageRef() assert not hsref._client assert not hsref._project assert not hsref._job assert not hsref.enabled assert not hasattr(hsref, 'jobkey') assert not hsref._projectid assert not hsref._spiderid assert not hsref._jobcounter @pytest.fixture @mock.patch.dict(os.environ, {'SHUB_JOBKEY': '1/2/3'}) def hsref(): return _HubstorageRef() @pytest.fixture def hsc_class(monkeypatch):