Ejemplo n.º 1
0
    def test_merge_request_kwargs(self):
        """Check that a timeout set via request_opts shows up in the merged request kwargs."""
        lassie_client = Lassie()
        lassie_client.request_opts = {
            'timeout': 3,
        }

        request_kwargs = lassie_client.merge_request_kwargs()
        # assertIn reports the container's contents on failure, which
        # assertTrue('timeout' in ...) would hide behind "False is not true".
        self.assertIn('timeout', request_kwargs)
Ejemplo n.º 2
0
    def test_merge_request_kwargs(self):
        """Check that a timeout set via request_opts shows up in the merged request kwargs."""
        lassie_client = Lassie()
        lassie_client.request_opts = {
            'timeout': 3,
        }

        request_kwargs = lassie_client.merge_request_kwargs()
        # assertIn reports the container's contents on failure, which
        # assertTrue('timeout' in ...) would hide behind "False is not true".
        self.assertIn('timeout', request_kwargs)
Ejemplo n.º 3
0
    def test_request_opts_default_user_agent(self):
        """Check the client's User-Agent equals FAKE_USER_AGENT when no headers are supplied."""
        lassie_client = Lassie()
        lassie_client.request_opts = {
            'timeout': 3
        }

        # headers should be set to {} then User-Agent should be added.
        # assertEqual shows both values on failure, unlike assertTrue(a == b).
        self.assertEqual(lassie_client.client.headers['User-Agent'], FAKE_USER_AGENT)
Ejemplo n.º 4
0
    def test_request_opts_no_headers(self):
        """Check the client's headers are not empty after an empty 'headers' dict is supplied."""
        lassie_client = Lassie()
        lassie_client.request_opts = {
            'headers': {},
            'timeout': 3
        }

        # headers should be set to {} then User-Agent should be added.
        # assertNotEqual prints the offending headers value if the check fails.
        self.assertNotEqual(lassie_client.client.headers, {})
Ejemplo n.º 5
0
    def test_bad_request_opts(self):
        """Check that an unrecognised key is absent from request_opts while 'headers' is kept."""
        lassie_client = Lassie()
        lassie_client.request_opts = {
            'bad_key': True,
            'headers': {
                'User-Agent': 'lassie python'
            }
        }

        # Membership-specific assertions report the actual request_opts
        # contents on failure instead of a bare "False is not true".
        self.assertNotIn('bad_key', lassie_client.request_opts)
        self.assertIn('headers', lassie_client.request_opts)
Ejemplo n.º 6
0
    def test_bad_request_opts(self):
        """Check that an unrecognised key is absent from request_opts while 'headers' is kept."""
        lassie_client = Lassie()
        lassie_client.request_opts = {
            'bad_key': True,
            'headers': {
                'User-Agent': 'lassie python'
            }
        }

        # Membership-specific assertions report the actual request_opts
        # contents on failure instead of a bare "False is not true".
        self.assertNotIn('bad_key', lassie_client.request_opts)
        self.assertIn('headers', lassie_client.request_opts)
Ejemplo n.º 7
0
    def test_request_opts(self):
        """Check that request_opts keeps the supplied keys and reflects in-place header updates."""
        lassie_client = Lassie()
        lassie_client.request_opts = {
            'headers': {
                'User-Agent': 'lassie python',
            },
            'timeout': 3
        }

        expected_opts = {'headers', 'timeout'}
        self.assertTrue(expected_opts.issubset(lassie_client.request_opts))

        # Mutate the headers through the property and re-read it each time:
        # the point is that the change is visible on later accesses.
        lassie_client.request_opts['headers'].update({'Content-Type': 'application/json'})
        self.assertEqual(len(lassie_client.request_opts['headers']), 2)

        expected_headers = {'User-Agent', 'Content-Type'}
        self.assertTrue(expected_headers.issubset(lassie_client.request_opts['headers']))
Ejemplo n.º 8
0
    def test_request_opts(self):
        """Check that request_opts keeps the supplied keys and reflects in-place header updates."""
        lassie_client = Lassie()
        lassie_client.request_opts = {
            'headers': {
                'User-Agent': 'lassie python',
            },
            'timeout': 3
        }

        expected_opts = {'headers', 'timeout'}
        self.assertTrue(expected_opts.issubset(lassie_client.request_opts))

        # Mutate the headers through the property and re-read it each time:
        # the point is that the change is visible on later accesses.
        lassie_client.request_opts['headers'].update({'Content-Type': 'application/json'})
        self.assertEqual(len(lassie_client.request_opts['headers']), 2)

        expected_headers = {'User-Agent', 'Content-Type'}
        self.assertTrue(expected_headers.issubset(lassie_client.request_opts['headers']))
Ejemplo n.º 9
0
def get_page_info(url: str, timeout: int = 4) -> Optional[PageInfo]:
    """Return information about the page at the given address, or None.

    None is returned when the URL is empty, when Lassie raises
    LassieError, or when the fetched status code is not 200.

    :param url: Address of the page to inspect.
    :param timeout: Connection timeout, passed to Lassie's request options.

    """
    if not url:
        return None

    fetcher = Lassie()
    fetcher.request_opts = {'timeout': timeout}

    try:
        page = fetcher.fetch(url, touch_icon=False, favicon=False)
    except LassieError:
        # LassieError wraps requests exceptions, including connection
        # errors, timeouts and the like.
        return None

    if page['status_code'] == 200:
        return PageInfo(
            title=page.get('title', ''),
            description=page.get('description', ''),
            site_name=page.get('site_name', ''),
            images=page['images'],
        )

    return None
Ejemplo n.º 10
0
    def test_request_opts_default_user_agent(self):
        """Check the client's User-Agent equals FAKE_USER_AGENT when no headers are supplied."""
        lassie_client = Lassie()
        lassie_client.request_opts = {'timeout': 3}

        # headers should be set to {} then User-Agent should be added.
        # assertEqual shows both values on failure, unlike assertTrue(a == b).
        self.assertEqual(lassie_client.client.headers['User-Agent'], FAKE_USER_AGENT)
Ejemplo n.º 11
0
    def test_request_opts_no_headers(self):
        """Check the client's headers are not empty after an empty 'headers' dict is supplied."""
        lassie_client = Lassie()
        lassie_client.request_opts = {'headers': {}, 'timeout': 3}

        # headers should be set to {} then User-Agent should be added.
        # assertNotEqual prints the offending headers value if the check fails.
        self.assertNotEqual(lassie_client.client.headers, {})
Ejemplo n.º 12
0
def lassie():
    """Build a Lassie instance whose request options carry a timeout of 3."""
    instance = Lassie()
    instance.request_opts = dict(timeout=3)
    return instance
Ejemplo n.º 13
0
# Fetch page metadata through the module-level `lassie.fetch` call, passing
# all_images=True.
# NOTE(review): `lassie` and `pprint` are not imported in the visible lines —
# presumably they are imported earlier in the file; confirm.
sample = lassie.fetch('https://www.youtube.com/watch?v=R6IT_f0XPT8',
                      all_images=True)
print(sample)
pprint(sample)

print("*" * 100)

from lassie import Lassie

# Same URL again, this time through an explicit Lassie instance on which no
# request options have been set yet.
l = Lassie()
sample = l.fetch('https://www.youtube.com/watch?v=R6IT_f0XPT8')

print(sample)
pprint(sample)

print("*" * 100)

# Supply a desktop Safari User-Agent header through the instance's request
# options.
l.request_opts = {
    'headers': {
        'User-Agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/605.1.15 (KHTML, like Gecko) '
        'Version/12.1.1 Safari/605.1.15 '
    }
}

# NOTE(review): whether this second assignment replaces the headers set above
# or is merged with them depends on the Lassie.request_opts setter — confirm
# the custom User-Agent is still in effect. The 0.1 timeout (presumably
# seconds) is very short, so the fetch below may well raise.
l.request_opts = {'timeout': 0.1}

sample = l.fetch('https://www.youtube.com/watch?v=R6IT_f0XPT8')

print(sample)
pprint(sample)
Ejemplo n.º 14
0
from lassie import Lassie


# Command-line interface: a single positional argument, the path to the
# bookmarks JSON file.
parser = argparse.ArgumentParser(description='Separates URLs with 200 status code from those without.')
parser.add_argument('bmfile', help='Bookmarks file in JSON list format.')
args = parser.parse_args()

# Result accumulators; they are not filled in the visible lines.
# presumably `not_ok` collects failing entries and `bookmarks` the good ones —
# verify against the rest of the loop.
not_ok = []
bookmarks = []

# NOTE(review): `configure` looks like the early requests_cache API (later
# releases expose `install_cache`) — confirm against the pinned version.
requests_cache.configure('../cache/requests')
# One browser-like User-Agent is shared by the Lassie instance and the
# requests session created below.
user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/43.0.2357.130 Chrome/43.0.2357.130 Safari/537.36'
headers = {'User-Agent': user_agent}
l = Lassie()
l.request_opts = {'headers': headers}
webclient = requests.Session()
webclient.headers.update(headers)


# Load the bookmarks document; the loop that follows reads its 'bookmarks'
# entry.
with open(args.bmfile, 'r') as f:
    data = json.load(f)

for i, b in enumerate(data['bookmarks']):
    url = b['url']
    if not url or not url.startswith(('http', 'https')):
        continue

    print('#{}: {}'.format(i, url))
    try:
        resp = webclient.head(url, timeout=10, headers={'User-Agent': user_agent})