def test_merge_request_kwargs(self):
    """Options set via request_opts must be carried into merge_request_kwargs()."""
    inst = Lassie()
    inst.request_opts = {
        'timeout': 3,
    }
    request_kwargs = inst.merge_request_kwargs()
    # assertIn gives a diagnostic failure message, unlike assertTrue(x in y)
    self.assertIn('timeout', request_kwargs)
def test_request_opts_default_user_agent(self):
    """When no headers are supplied, a default User-Agent must be injected."""
    inst = Lassie()
    inst.request_opts = {
        'timeout': 3
    }
    # headers should be set to {} then User-Agent should be added;
    # assertEqual reports both values on failure, unlike assertTrue(a == b)
    self.assertEqual(inst.client.headers['User-Agent'], FAKE_USER_AGENT)
def test_request_opts_no_headers(self):
    """An explicitly empty headers dict must still end up with a User-Agent."""
    inst = Lassie()
    inst.request_opts = {
        'headers': {},
        'timeout': 3
    }
    # headers should be set to {} then User-Agent should be added, so the
    # session headers must not be empty; assertNotEqual > assertTrue(a != b)
    self.assertNotEqual(inst.client.headers, {})
def test_bad_request_opts(self):
    """Unknown options are filtered out of request_opts; valid ones are kept."""
    inst = Lassie()
    inst.request_opts = {
        'bad_key': True,
        'headers': {
            'User-Agent': 'lassie python'
        }
    }
    # assertNotIn/assertIn give clearer failures than assertTrue(... in ...)
    self.assertNotIn('bad_key', inst.request_opts)
    self.assertIn('headers', inst.request_opts)
def test_request_opts(self):
    """request_opts accepts headers/timeout and its nested dicts stay mutable."""
    inst = Lassie()
    inst.request_opts = {
        'headers': {
            'User-Agent': 'lassie python',
        },
        'timeout': 3
    }
    # set <= set via assertLessEqual reports the differing elements on
    # failure, unlike assertTrue(set(...).issubset(...))
    self.assertLessEqual({'headers', 'timeout'}, set(inst.request_opts))

    # If the caller modifies one of the keys' values, make sure it sticks.
    inst.request_opts['headers'].update({'Content-Type': 'application/json'})
    self.assertEqual(len(inst.request_opts['headers']), 2)
    self.assertLessEqual({'User-Agent', 'Content-Type'},
                         set(inst.request_opts['headers']))
def test_request_opts(self):
    """request_opts accepts headers/timeout and reflects in-place mutation."""
    inst = Lassie()
    inst.request_opts = {
        'headers': {
            'User-Agent': 'lassie python',
        },
        'timeout': 3
    }
    # assertLessEqual on sets is the diagnostic form of issubset checks
    self.assertLessEqual({'headers', 'timeout'}, set(inst.request_opts))

    # If the caller modifies one of the keys' values, make sure it sticks.
    inst.request_opts['headers'].update({'Content-Type': 'application/json'})
    self.assertEqual(len(inst.request_opts['headers']), 2)
    self.assertLessEqual(
        {'User-Agent', 'Content-Type'}, set(inst.request_opts['headers']))
def get_page_info(url: str, timeout: int = 4) -> Optional[PageInfo]:
    """Return information about the page at the given address, or None.

    :param url: Address of the page to inspect.
    :param timeout: Connection timeout, in seconds.
    """
    if not url:
        return None

    lassie = Lassie()
    lassie.request_opts = {'timeout': timeout}

    try:
        result = lassie.fetch(
            url,
            touch_icon=False,
            favicon=False,
        )
    except LassieError:
        # LassieError wraps the underlying requests exceptions,
        # including connection errors, timeouts, etc.
        return None

    # .get avoids a KeyError if the status code is missing from the result.
    if result.get('status_code') != 200:
        return None

    # `result.get(key, default)` still yields None when the key is present
    # with a None value, so use `or` fallbacks to guarantee the declared
    # field types.
    return PageInfo(
        title=result.get('title') or '',
        description=result.get('description') or '',
        site_name=result.get('site_name') or '',
        images=result.get('images') or [],
    )
def test_request_opts_default_user_agent(self):
    """With no headers supplied, a default User-Agent must be injected."""
    inst = Lassie()
    inst.request_opts = {'timeout': 3}
    # headers should be set to {} then User-Agent should be added;
    # assertEqual reports both values on failure, unlike assertTrue(a == b)
    self.assertEqual(inst.client.headers['User-Agent'], FAKE_USER_AGENT)
def test_request_opts_no_headers(self):
    """An explicitly empty headers dict must still receive a User-Agent."""
    inst = Lassie()
    inst.request_opts = {'headers': {}, 'timeout': 3}
    # headers should be set to {} then User-Agent should be added, so the
    # session headers must not be empty; assertNotEqual > assertTrue(a != b)
    self.assertNotEqual(inst.client.headers, {})
def lassie():
    """Build and return a Lassie client configured with a 3-second timeout."""
    client = Lassie()
    client.request_opts = {'timeout': 3}
    return client
# Demo: fetch the same video page with default and customized clients.
sample = lassie.fetch('https://www.youtube.com/watch?v=R6IT_f0XPT8', all_images=True)
print(sample)
pprint(sample)
print("*" * 100)

from lassie import Lassie

l = Lassie()
sample = l.fetch('https://www.youtube.com/watch?v=R6IT_f0XPT8')
print(sample)
pprint(sample)
print("*" * 100)

# NOTE(review): each assignment to `request_opts` appears to replace the
# whole options dict (the setter filters/rebuilds it), so setting the
# timeout in a second, separate assignment would silently discard the
# custom User-Agent headers set just before. Set both options at once.
l.request_opts = {
    'headers': {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/605.1.15 (KHTML, like Gecko) '
                      'Version/12.1.1 Safari/605.1.15 '
    },
    'timeout': 0.1,
}
sample = l.fetch('https://www.youtube.com/watch?v=R6IT_f0XPT8')
print(sample)
pprint(sample)
from lassie import Lassie parser = argparse.ArgumentParser(description='Separates URLs with 200 status code from those without.') parser.add_argument('bmfile', help='Bookmarks file in JSON list format.') args = parser.parse_args() not_ok = [] bookmarks = [] requests_cache.configure('../cache/requests') user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/43.0.2357.130 Chrome/43.0.2357.130 Safari/537.36' headers = {'User-Agent': user_agent} l = Lassie() l.request_opts = {'headers': headers} webclient = requests.Session() webclient.headers.update(headers) with open(args.bmfile, 'r') as f: data = json.load(f) for i, b in enumerate(data['bookmarks']): url = b['url'] if not url or not url.startswith(('http', 'https')): continue print('#{}: {}'.format(i, url)) try: resp = webclient.head(url, timeout=10, headers={'User-Agent': user_agent})