Beispiel #1
0
def test_fetch():
    '''Check utils.fetch_url and utils.decode_response on a few edge cases.'''
    # a string that is not a URL is expected to give an empty string
    not_a_url = utils.fetch_url('1234')
    assert not_a_url == ''
    # an HTTP 404 answer is expected to give None
    missing_page = utils.fetch_url('https://httpbin.org/status/404')
    assert missing_page is None
    # a payload starting with the bytes 0x1f 0x8b must still decode to something
    decoded = utils.decode_response(b'\x1f\x8babcdef')
    assert decoded is not None
    # with no_ssl=True this host must still return a result
    # (presumably its certificate is expired, judging by the host name)
    insecure_page = utils.fetch_url('https://expired.badssl.com/', no_ssl=True)
    assert insecure_page is not None
Beispiel #2
0
def test_download():
    '''Exercise cli.examine and the multi-threaded download helper.'''
    cli_arguments = ['', '-v']
    with patch.object(sys, 'argv', cli_arguments):
        args = cli.parse_args(cli_arguments)

    # neither a blank string nor a very large string may produce a result
    assert cli.examine(' ', args) is None
    assert cli.examine('0' * int(10e7), args) is None

    # NOTE: the checks against https://httpbin.org/status/404 and
    # https://httpbin.org/status/200 (fetch expected to be None, noted as
    # "too small") are currently disabled.

    # a fetched page for which examine() is expected to return None
    links_url = 'https://httpbin.org/links/2/2'
    links_page = utils.fetch_url(links_url)
    assert links_page is not None
    assert cli.examine(links_page, args, links_url) is None

    # a fetched page for which examine() is expected to return a result
    html_url = 'https://httpbin.org/html'
    html_page = utils.fetch_url(html_url)
    assert html_page is not None
    assert cli.examine(html_page, args, html_url) is not None

    # multi-threaded processing of several status URLs on a single host
    urls_by_domain = {
        'httpbin.org': [
            'https://httpbin.org/status/301',
            'https://httpbin.org/status/304',
            'https://httpbin.org/status/200',
            'https://httpbin.org/status/300',
            'https://httpbin.org/status/400',
            'https://httpbin.org/status/500',
        ],
    }
    outcome = cli_utils.multi_threaded_processing(urls_by_domain, args, 0.25,
                                                  None)
    assert outcome is None
Beispiel #3
0
def test_download():
    '''Exercise cli.examine, threaded downloads and draw_backoff_url.'''
    cli_arguments = ['', '-v']
    with patch.object(sys, 'argv', cli_arguments):
        args = cli.parse_args(cli_arguments)

    # None, a blank string and a very large string may not produce a result
    assert cli.examine(None, args) is None
    assert cli.examine(' ', args) is None
    assert cli.examine('0' * int(10e7), args) is None

    # NOTE: the check against https://httpbin.org/status/200 (fetch expected
    # to be None, noted as "too small") is currently disabled.

    # a fetched page for which examine() is expected to return None
    links_url = 'https://httpbin.org/links/2/2'
    links_page = utils.fetch_url(links_url)
    assert links_page is not None
    assert cli.examine(links_page, args, links_url) is None

    # a fetched page for which examine() is expected to return a result
    html_url = 'https://httpbin.org/html'
    html_page = utils.fetch_url(html_url)
    assert html_page is not None
    assert cli.examine(html_page, args, html_url) is not None

    # multi-threaded processing of several status URLs on a single host
    urls_by_domain = {
        'httpbin.org': [
            'https://httpbin.org/status/301',
            'https://httpbin.org/status/304',
            'https://httpbin.org/status/200',
            'https://httpbin.org/status/300',
            'https://httpbin.org/status/400',
            'https://httpbin.org/status/505',
        ],
    }
    outcome = cli_utils.multi_threaded_processing(urls_by_domain, args, 0.25,
                                                  None)
    assert outcome == (['https://httpbin.org/status/301'], None)

    # backoff algorithm: the same two dictionaries are reused for every call,
    # the URL list is re-filled before each draw
    pending = {}
    backoff = {}
    stamp_2019 = datetime(2019, 5, 18, 15, 17, 8, 132263)
    stamp_2030 = datetime(2030, 5, 18, 15, 17, 8, 132263)
    # (timestamp stored for the host or None, fourth argument, expected last item)
    scenarios = (
        (None, 0, 0),
        (stamp_2019, 0, 0),
        (stamp_2019, 3, 3),
        (stamp_2030, 3, 0),
    )
    for stamp, fourth_arg, expected_last in scenarios:
        pending['test.org'] = ['http://test.org/1']
        if stamp is not None:
            backoff['test.org'] = stamp
        drawn = cli_utils.draw_backoff_url(pending, backoff, 0, fourth_arg)
        assert drawn == ('http://test.org/1', {}, {}, expected_last)
Beispiel #4
0
def test_download():
    '''Fetch a few httpbin.org pages and pass them on to cli.examine.'''
    # an HTTP 404 answer is expected to give None
    assert utils.fetch_url('https://httpbin.org/status/404') is None

    # (URL to fetch, whether examine() is expected to return a result)
    cases = (
        ('https://httpbin.org/status/200', False),
        ('https://httpbin.org/links/2/2', False),
        ('https://httpbin.org/html', True),
    )
    for url, gives_result in cases:
        page = utils.fetch_url(url)
        assert page is not None
        examined = cli.examine(page, url, False, True)
        if gives_result:
            assert examined is not None
        else:
            assert examined is None
Beispiel #5
0
def test_download():
    '''Exercise cli.examine, the download queue and draw_backoff_url.'''
    cli_arguments = ['', '-v']
    with patch.object(sys, 'argv', cli_arguments):
        args = cli.parse_args(cli_arguments)

    # None, a blank string and a very large string may not produce a result
    assert cli.examine(None, args) is None
    assert cli.examine(' ', args) is None
    assert cli.examine('0' * int(10e7), args) is None

    # NOTE: the checks against https://httpbin.org/status/200 (fetch expected
    # to be None, noted as "too small") and https://httpbin.org/links/2/2
    # are currently disabled.

    # a fetched page for which examine() is expected to return a result
    html_url = 'https://httpbin.org/html'
    html_page = utils.fetch_url(html_url)
    assert html_page is not None
    assert cli.examine(html_page, args, html_url) is not None

    # download queue processing: one host with several status paths
    paths_by_host = {
        'https://httpbin.org': [
            '/status/301',
            '/status/304',
            '/status/200',
            '/status/300',
            '/status/400',
            '/status/505',
        ],
    }
    args.archived = True
    args.config_file = os.path.join(RESOURCES_DIR, 'newsettings.cfg')
    config = use_config(filename=args.config_file)
    results = cli_utils.download_queue_processing(paths_by_host, args, None,
                                                  config)
    assert len(results[0]) == 5
    assert results[1] is None

    # backoff algorithm: the same two dictionaries are reused for every call,
    # the path list is re-filled before each draw
    pending = {}
    backoff = {}
    stamp_2019 = datetime(2019, 5, 18, 15, 17, 8, 132263)
    stamp_2030 = datetime(2030, 5, 18, 15, 17, 8, 132263)
    # (timestamp stored for the host or None, fourth argument, expected last item)
    scenarios = (
        (None, 0, 0),
        (stamp_2019, 0, 0),
        (stamp_2019, 3, 3),
        (stamp_2030, 3, 0),
    )
    for stamp, fourth_arg, expected_last in scenarios:
        pending['http://test.org'] = ['/1']
        if stamp is not None:
            backoff['test.org'] = stamp
        drawn = cli_utils.draw_backoff_url(pending, backoff, 0, fourth_arg)
        assert drawn == ('http://test.org/1', {}, {}, expected_last)
Beispiel #6
0
def test_download():
    '''Exercise page download together with cli.examine.'''
    cli_arguments = ['', '-v']
    with patch.object(sys, 'argv', cli_arguments):
        args = cli.parse_args(cli_arguments)

    # neither a blank string nor a very large string may produce a result
    assert cli.examine(' ', args) is None
    assert cli.examine('0' * int(10e7), args) is None

    # an HTTP 404 answer is expected to give None
    assert utils.fetch_url('https://httpbin.org/status/404') is None

    # the fetch is expected to give None here (too small), and so is examine()
    status_url = 'https://httpbin.org/status/200'
    status_page = utils.fetch_url(status_url)
    assert status_page is None
    assert cli.examine(status_page, args, status_url) is None

    # a fetched page for which examine() is expected to return None
    links_url = 'https://httpbin.org/links/2/2'
    links_page = utils.fetch_url(links_url)
    assert links_page is not None
    assert cli.examine(links_page, args, links_url) is None

    # a fetched page for which examine() is expected to return a result
    html_url = 'https://httpbin.org/html'
    html_page = utils.fetch_url(html_url)
    assert html_page is not None
    assert cli.examine(html_page, args, html_url) is not None
Beispiel #7
0
def test_fetch():
    '''Check URL fetching, response handling and user-agent selection.'''
    # a string that is not a URL is expected to give an empty string
    not_a_url = utils.fetch_url('1234')
    assert not_a_url == ''
    # an HTTP 404 answer is expected to give None
    missing_page = utils.fetch_url('https://httpbin.org/status/404')
    assert missing_page is None
    # a payload starting with the bytes 0x1f 0x8b must still decode to something
    decoded = utils.decode_response(b'\x1f\x8babcdef')
    assert decoded is not None
    # with no_ssl=True this host must still return a result
    insecure_page = utils.fetch_url('https://expired.badssl.com/', no_ssl=True)
    assert insecure_page is not None

    # fetching with decode=False is expected to give an empty string here
    undecoded = utils.fetch_url('https://httpbin.org/status/200', decode=False)
    assert undecoded == ''

    # send the request and handle the response in two separate steps
    url = 'https://httpbin.org/encoding/utf8'
    raw_response = utils._send_request(url, False, DEFAULT_CONFIG)
    handled = utils._handle_response(url, raw_response, False, DEFAULT_CONFIG)
    assert handled.data.startswith(b'<h1>Unicode Demo</h1>')

    # the response object can be passed to load_html() as is
    assert utils.load_html(raw_response) is not None

    # with ZERO_CONFIG the extraction is expected to return None
    extracted = extract(raw_response, url=raw_response.geturl(),
                        config=ZERO_CONFIG)
    assert extracted is None

    # user agents: the configured list is parsed, and one entry is used as header
    assert utils._parse_config(UA_CONFIG) == ['Firefox', 'Chrome']
    headers = utils._determine_headers(UA_CONFIG)
    assert headers['User-Agent'] in ('Chrome', 'Firefox')
Beispiel #8
0
def test_fetch():
    '''test URL fetching'''
    assert utils.fetch_url('1234') == ''
    assert utils.fetch_url('https://httpbin.org/status/404') is None