Esempio n. 1
0
def test_parser():
    '''test argument parsing for the command-line interface'''
    # combined short flags (-f, -v) plus long options and a target URL
    testargs = [
        '', '-fv', '--xmltei', '--notables', '-u', 'https://www.example.org'
    ]
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    assert args.fast is True
    assert args.verbose is True
    # --notables stores False: table extraction is switched off
    assert args.notables is False
    assert args.xmltei is True
    assert args.URL == 'https://www.example.org'
    # map_args derives output_format from the individual format flags
    args = cli.map_args(args)
    assert args.output_format == 'xmltei'
    testargs = ['', '-out', 'csv', '-u', 'https://www.example.org']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    assert args.fast is False
    assert args.verbose is False
    assert args.output_format == 'csv'
    # test args mapping
    testargs = ['', '--xml']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    args = cli.map_args(args)
    assert args.output_format == 'xml'
    # toggling format flags and re-mapping updates output_format accordingly
    args.xml = False
    args.csv = True
    args = cli.map_args(args)
    assert args.output_format == 'csv'
    args.csv = False
    args.json = True
    args = cli.map_args(args)
    assert args.output_format == 'json'
Esempio n. 2
0
def test_queue():
    'Test creation, modification and download of URL queues.'
    # test conversion and storage
    # unsupported scheme (ftps) yields an empty dictionary
    inputdict = add_to_compressed_dict(['ftps://www.example.org/'])
    assert inputdict == dict()
    inputdict = add_to_compressed_dict(['https://www.example.org/'])
    # CLI args
    testargs = ['', '--list']
    with patch.object(sys, 'argv', testargs):
        args = parse_args(testargs)
    # --list mode only prints URLs; the pipeline returns nothing
    assert url_processing_pipeline(args, inputdict) is None
    # single/multiprocessing
    testargs = ['', '-v']
    with patch.object(sys, 'argv', testargs):
        args = parse_args(testargs)
    domain_dict = dict()
    domain_dict['https://httpbin.org'] = deque([
        '/status/301', '/status/304', '/status/200', '/status/300',
        '/status/400', '/status/505'
    ])
    args.archived = True
    args.config_file = os.path.join(RESOURCES_DIR, 'newsettings.cfg')
    config = use_config(filename=args.config_file)
    # NOTE(review): requires network access to httpbin.org
    results = download_queue_processing(domain_dict, args, None, config)
    # all six queued URLs are consumed and no error value is returned
    assert len(results[0]) == 6 and results[1] is None
    # test backoff algorithm
    testdict = dict()
    backoffdict = dict()
    testdict['http://test.org'] = deque(['/1'])
    assert draw_backoff_url(testdict, backoffdict, 0,
                            set()) == ('http://test.org/1', dict(), dict(),
                                       'http://test.org')
    testdict['http://test.org'] = deque(['/1'])
    # timestamp in the past: the backoff period has expired, URL is drawn
    backoffdict['http://test.org'] = datetime(2019, 5, 18, 15, 17, 8, 132263)
    assert draw_backoff_url(testdict, backoffdict, 0,
                            set()) == ('http://test.org/1', dict(), dict(),
                                       'http://test.org')
    # code hangs, logical:
    #testdict['http://test.org'] = deque(['/1'])
    #backoffdict['http://test.org'] = datetime(2030, 5, 18, 15, 17, 8, 132263)
    #assert cli_utils.draw_backoff_url(testdict, backoffdict, 0, 3) == ('http://test.org/1', dict(), dict(), 0)
    # download buffer
    domain_dict = {
        'https://test.org': deque(['/1', '/2', '/3']),
        'https://test2.org': deque(['/1', '/2', '/3']),
        'https://test3.org': deque(['/1', '/2', '/3']),
        'https://test4.org': deque(['/1', '/2', '/3']),
        'https://test5.org': deque(['/1', '/2', '/3']),
        'https://test6.org': deque(['/1', '/2', '/3'])
    }
    # six domains yield six buffered entries regardless of thread count
    bufferlist, _, _, _ = load_download_buffer(domain_dict,
                                               dict(),
                                               0,
                                               threads=1)
    assert len(bufferlist) == 6
    bufferlist, _, _, _ = load_download_buffer(domain_dict,
                                               dict(),
                                               0,
                                               threads=2)
    assert len(bufferlist) == 6
Esempio n. 3
0
def test_sysoutput():
    '''test command-line output with respect to CLI arguments'''
    testargs = ['', '--csv', '-o', '/root/forbidden/']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    # a filename is derived even when the directory cannot be written to
    filepath, destdir = cli_utils.determine_output_path(
        args, args.outputdir, '')
    assert len(filepath) >= 10 and filepath.endswith('.csv')
    assert destdir == '/root/forbidden/'
    assert cli_utils.check_outputdir_status(args.outputdir) is False
    testargs = ['', '--xml', '-o', '/tmp/you-touch-my-tralala']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    assert cli_utils.check_outputdir_status(args.outputdir) is True
    # test fileslug for name
    filepath, destdir = cli_utils.determine_output_path(args,
                                                        args.outputdir,
                                                        '',
                                                        new_filename='AAZZ')
    assert filepath.endswith('AAZZ.xml')
    # test json output
    # NOTE(review): args2 is an alias of args, not a copy — the mutations
    # below act on the same namespace object
    args2 = args
    args2.xml, args2.json = False, True
    args2 = cli.map_args(args2)
    filepath2, destdir2 = cli_utils.determine_output_path(args,
                                                          args.outputdir,
                                                          '',
                                                          new_filename='AAZZ')
    assert filepath2.endswith('AAZZ.json')
    # test directory counter
    assert cli_utils.determine_counter_dir('testdir', 0) == 'testdir/1'
    # test file writing
    testargs = ['', '--csv', '-o', '/dev/null/']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    result = 'DADIDA'
    cli_utils.write_result(result, args)
    # process with no counter
    assert cli_utils.process_result('DADIDA', args, None, None,
                                    DEFAULT_CONFIG) is None
    # test keeping dir structure
    testargs = ['', '-i', 'myinputdir/', '-o', 'test/', '--keep-dirs']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    filepath, destdir = cli_utils.determine_output_path(
        args, 'testfile.txt', '')
    assert filepath == 'test/testfile.txt'
    # test hash as output file name
    assert args.hash_as_name is False
    args.hash_as_name = True
    assert args.keep_dirs is True
    args.keep_dirs = False
    # with hash_as_name set, a content hash replaces the original file name
    filepath, destdir = cli_utils.determine_output_path(
        args, 'testfile.txt', '')
    assert filepath == 'test/2jmj7l5rSw0yVb-vlWAYkK-YBwk.txt'
Esempio n. 4
0
def test_parser():
    '''test argument parsing for the command-line interface'''
    # combined short flags (-f, -v) plus long options and a target URL
    testargs = [
        '', '-fv', '--xmltei', '--notables', '-u', 'https://www.example.org'
    ]
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    assert args.fast is True
    assert args.verbose is True
    # --notables stores False: table extraction is switched off
    assert args.notables is False
    assert args.xmltei is True
    assert args.URL == 'https://www.example.org'
    # map_args derives output_format from the individual format flags
    args = cli.map_args(args)
    assert args.output_format == 'xmltei'
    testargs = ['', '-out', 'csv', '-u', 'https://www.example.org']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    assert args.fast is False
    assert args.verbose is False
    assert args.output_format == 'csv'
    # test args mapping
    testargs = ['', '--xml']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    args = cli.map_args(args)
    assert args.output_format == 'xml'
    args.xml, args.csv = False, True
    args = cli.map_args(args)
    assert args.output_format == 'csv'
    args.csv, args.json = False, True
    args = cli.map_args(args)
    assert args.output_format == 'json'
    # process_args
    args.inputdir = '/dev/null'
    args.verbose = True
    args.blacklist = os.path.join(TEST_DIR, 'resources/list-discard.txt')
    cli.process_args(args)
    # process_args replaces the blacklist file path with its loaded contents
    assert len(args.blacklist) == 2
    # filter
    testargs = [
        '', '-i', 'resources/list-discard.txt', '--url-filter', 'test1',
        'test2'
    ]
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    assert args.inputfile == 'resources/list-discard.txt'
    assert args.url_filter == ['test1', 'test2']
    resources_dir = os.path.join(TEST_DIR, 'resources')
    args.inputfile = os.path.join(resources_dir, 'list-discard.txt')
    # FIX: this line previously used '==' (a no-op comparison); the intent
    # is to assign the blacklist path before calling process_args
    args.blacklist = os.path.join(resources_dir, 'list-discard.txt')
    f = io.StringIO()
    with redirect_stdout(f):
        cli.process_args(args)
    # everything on the input list is blacklisted, so nothing is printed
    assert len(f.getvalue()) == 0
Esempio n. 5
0
def test_sysoutput():
    '''test command-line output with respect to CLI arguments'''
    # unwritable target directory: a filename is still derived,
    # but the directory status check fails
    cli_args = ['', '--csv', '-o', '/root/forbidden/']
    with patch.object(sys, 'argv', cli_args):
        parsed = cli.parse_args(cli_args)
    outname = cli_utils.determine_filename(parsed)
    assert outname.endswith('.csv') and len(outname) >= 10
    assert cli_utils.check_outputdir_status(parsed.outputdir) is False
    # writable target directory under /tmp: status check passes
    cli_args = ['', '--xml', '-o', '/tmp/you-touch-my-tralala']
    with patch.object(sys, 'argv', cli_args):
        parsed = cli.parse_args(cli_args)
    assert cli_utils.check_outputdir_status(parsed.outputdir) is True
    assert cli_utils.determine_filename(parsed).endswith('.xml')
Esempio n. 6
0
def test_input_filtering():
    '''test internal functions to filter urls'''
    testargs = ['']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    # load dictionary
    args.inputfile = os.path.join(RESOURCES_DIR, 'list-process.txt')
    inputdict = cli.load_input_dict(args)
    assert inputdict['https://httpbin.org'] == deque(['/status/200', '/status/404'])
    args.inputfile = os.path.join(RESOURCES_DIR, 'list-process.txt')
    # a blacklisted URL is dropped while the input file is loaded
    args.blacklist = {'httpbin.org/status/404'}
    inputdict = cli.load_input_dict(args)
    assert inputdict['https://httpbin.org'] == deque(['/status/200'])
    # deduplication and filtering
    myinput = ['https://example.org/1', 'https://example.org/2', 'https://example.org/2', 'https://example.org/3', 'https://example.org/4', 'https://example.org/5', 'https://example.org/6']
    myblacklist = {'example.org/1', 'example.org/3', 'example.org/5'}
    inputdict = add_to_compressed_dict(myinput, myblacklist)
    # duplicate '/2' is collapsed and blacklisted entries are removed
    assert inputdict['https://example.org'] == deque(['/2', '/4', '/6'])
    # URL in blacklist
    args.inputfile = os.path.join(RESOURCES_DIR, 'list-process.txt')
    my_urls = cli_utils.load_input_urls(args)
    my_blacklist = cli_utils.load_blacklist(os.path.join(RESOURCES_DIR, 'list-discard.txt'))
    # every loaded URL is blacklisted, so the resulting dict is empty
    inputdict = add_to_compressed_dict(my_urls, my_blacklist)
    assert len(inputdict) == 0
    # URL filter
    args.inputfile = os.path.join(RESOURCES_DIR, 'list-process.txt')
    my_urls = cli_utils.load_input_urls(args)
    # only URLs containing at least one filter substring are kept
    assert len(add_to_compressed_dict(my_urls, None, ['status'], None)) == 1
    assert len(add_to_compressed_dict(my_urls, None, ['teststring'], None)) == 0
    assert len(add_to_compressed_dict(my_urls, None, ['status', 'teststring'], None)) == 1
    # malformed URLs
    # non-URL input is discarded; only the valid URL survives
    inputdict = add_to_compressed_dict(['123345', 'https://www.example.org/1'], {}, None, None)
    assert len(inputdict) == 1
Esempio n. 7
0
def test_download():
    '''test page download and command-line interface'''
    testargs = ['', '-v']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    # whitespace-only and oversized inputs are rejected by examine()
    assert cli.examine(' ', args) is None
    assert cli.examine('0' * int(10e7), args) is None
    #assert utils.fetch_url('https://httpbin.org/status/404') is None
    #url = 'https://httpbin.org/status/200'
    #teststring = utils.fetch_url(url)
    #assert teststring is None  # too small
    #assert cli.examine(teststring, args, url) is None
    # NOTE(review): the remainder requires network access to httpbin.org
    url = 'https://httpbin.org/links/2/2'
    teststring = utils.fetch_url(url)
    assert teststring is not None
    # link-only page: no extractable main text
    assert cli.examine(teststring, args, url) is None
    url = 'https://httpbin.org/html'
    teststring = utils.fetch_url(url)
    assert teststring is not None
    # real HTML page yields an extraction result
    assert cli.examine(teststring, args, url) is not None
    # multiprocessing
    domain_dict = dict()
    domain_dict['httpbin.org'] = [
        'https://httpbin.org/status/301', 'https://httpbin.org/status/304',
        'https://httpbin.org/status/200', 'https://httpbin.org/status/300',
        'https://httpbin.org/status/400', 'https://httpbin.org/status/500'
    ]
    assert cli_utils.multi_threaded_processing(domain_dict, args, 0.25,
                                               None) is None
Esempio n. 8
0
def test_cli_pipeline():
    '''test command-line processing pipeline'''
    # test URL listing
    testargs = ['', '--list']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    # listing mode returns nothing for empty and non-empty URL lists alike
    assert cli_utils.url_processing_pipeline(args, [], 0) is None
    assert cli_utils.url_processing_pipeline(
        args, ['https://www.example.org/'], 0) is None
    # test inputlist + blacklist
    resources_dir = os.path.join(TEST_DIR, 'resources')
    testargs = ['', '-i', os.path.join(resources_dir, 'list-process.txt')]
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    my_urls = cli_utils.load_input_urls(args.inputfile)
    assert my_urls is not None and len(my_urls) == 2
    resources_dir = os.path.join(TEST_DIR, 'resources')
    #testargs = ['', '-i', os.path.join(resources_dir, 'list-process.txt'), '--blacklist', os.path.join(resources_dir, 'list-discard.txt')]
    #with patch.object(sys, 'argv', testargs):
    #    args = cli.parse_args(testargs)
    #print(args.blacklist)
    #assert args.blacklist is not None
    # test backoff between domain requests
    # a 2-second sleep between same-domain requests must be observable
    reftime = datetime.now()
    assert cli_utils.url_processing_pipeline(args, my_urls, 2) is None
    delta = (datetime.now() - reftime).total_seconds()
    assert delta > 2
    # test backup
    testargs = ['', '--backup-dir', '/tmp/']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    cli_utils.archive_html('00Test', args)
    cli_utils.url_processing_pipeline(args, my_urls, 2)
    # test date-based exclusion
    # sample page lacks required metadata, so extraction returns nothing
    testargs = ['', '-out', 'xml', '--with-metadata']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    with open(os.path.join(resources_dir, 'httpbin_sample.html'), 'r') as f:
        teststring = f.read()
    assert cli.examine(teststring, args) is None
    # test timeout
    testargs = ['', '-out', 'xml', '--timeout']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    with open(os.path.join(resources_dir, 'httpbin_sample.html'), 'r') as f:
        teststring = f.read()
    assert cli.examine(teststring, args) is not None
    # test JSON output
    testargs = ['', '-out', 'json']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    with open(os.path.join(resources_dir, 'httpbin_sample.html'), 'r') as f:
        teststring = f.read()
    assert cli.examine(teststring, args) is not None
    # file processing pipeline
    # /dev/null as input dir: the pipeline must handle an empty dir cleanly
    testargs = ['', '--parallel', '1', '--inputdir', '/dev/null']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    cli_utils.file_processing_pipeline(args)
Esempio n. 9
0
def test_download():
    '''test page download and command-line interface'''
    testargs = ['', '-v']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    # None, whitespace-only and oversized inputs are all rejected
    assert cli.examine(None, args) is None
    assert cli.examine(' ', args) is None
    assert cli.examine('0' * int(10e7), args) is None
    #url = 'https://httpbin.org/status/200'
    #teststring = utils.fetch_url(url)
    #assert teststring is None  # too small
    #assert cli.examine(teststring, args, url) is None
    #url = 'https://httpbin.org/links/2/2'
    #teststring = utils.fetch_url(url)
    #assert teststring is not None
    #assert cli.examine(teststring, args, url) is None
    # NOTE(review): requires network access to httpbin.org
    url = 'https://httpbin.org/html'
    teststring = utils.fetch_url(url)
    assert teststring is not None
    assert cli.examine(teststring, args, url) is not None
    # single/multiprocessing
    domain_dict = dict()
    domain_dict['https://httpbin.org'] = [
        '/status/301', '/status/304', '/status/200', '/status/300',
        '/status/400', '/status/505'
    ]
    args.archived = True
    args.config_file = os.path.join(RESOURCES_DIR, 'newsettings.cfg')
    config = use_config(filename=args.config_file)
    results = cli_utils.download_queue_processing(domain_dict, args, None,
                                                  config)
    # five results come back and no error value is returned
    assert len(results[0]) == 5 and results[1] is None
    # test backoff algorithm
    testdict = dict()
    backoffdict = dict()
    testdict['http://test.org'] = ['/1']
    assert cli_utils.draw_backoff_url(testdict, backoffdict,
                                      0, 0) == ('http://test.org/1', dict(),
                                                dict(), 0)
    testdict['http://test.org'] = ['/1']
    # timestamp in the past: backoff expired, URL drawn immediately
    backoffdict['test.org'] = datetime(2019, 5, 18, 15, 17, 8, 132263)
    assert cli_utils.draw_backoff_url(testdict, backoffdict,
                                      0, 0) == ('http://test.org/1', dict(),
                                                dict(), 0)
    testdict['http://test.org'] = ['/1']
    backoffdict['test.org'] = datetime(2019, 5, 18, 15, 17, 8, 132263)
    assert cli_utils.draw_backoff_url(testdict, backoffdict,
                                      0, 3) == ('http://test.org/1', dict(),
                                                dict(), 3)
    testdict['http://test.org'] = ['/1']
    # timestamp in the future: the counter is reset to 0
    backoffdict['test.org'] = datetime(2030, 5, 18, 15, 17, 8, 132263)
    assert cli_utils.draw_backoff_url(testdict, backoffdict,
                                      0, 3) == ('http://test.org/1', dict(),
                                                dict(), 0)
Esempio n. 10
0
def test_download():
    '''test page download and command-line interface'''
    testargs = ['', '-v']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    # None, whitespace-only and oversized inputs are all rejected
    assert cli.examine(None, args) is None
    assert cli.examine(' ', args) is None
    assert cli.examine('0' * int(10e7), args) is None
    #url = 'https://httpbin.org/status/200'
    #teststring = utils.fetch_url(url)
    #assert teststring is None  # too small
    #assert cli.examine(teststring, args, url) is None
    # NOTE(review): requires network access to httpbin.org
    url = 'https://httpbin.org/links/2/2'
    teststring = utils.fetch_url(url)
    assert teststring is not None
    # link-only page: no extractable main text
    assert cli.examine(teststring, args, url) is None
    url = 'https://httpbin.org/html'
    teststring = utils.fetch_url(url)
    assert teststring is not None
    assert cli.examine(teststring, args, url) is not None
    # multiprocessing
    domain_dict = dict()
    domain_dict['httpbin.org'] = [
        'https://httpbin.org/status/301', 'https://httpbin.org/status/304',
        'https://httpbin.org/status/200', 'https://httpbin.org/status/300',
        'https://httpbin.org/status/400', 'https://httpbin.org/status/505'
    ]
    # only the 301 redirect survives as downloadable content
    assert cli_utils.multi_threaded_processing(
        domain_dict, args, 0.25,
        None) == (['https://httpbin.org/status/301'], None)
    # test backoff algorithm
    testdict = dict()
    backoffdict = dict()
    testdict['test.org'] = ['http://test.org/1']
    assert cli_utils.draw_backoff_url(testdict, backoffdict,
                                      0, 0) == ('http://test.org/1', dict(),
                                                dict(), 0)
    testdict['test.org'] = ['http://test.org/1']
    # timestamp in the past: backoff expired, URL drawn immediately
    backoffdict['test.org'] = datetime(2019, 5, 18, 15, 17, 8, 132263)
    assert cli_utils.draw_backoff_url(testdict, backoffdict,
                                      0, 0) == ('http://test.org/1', dict(),
                                                dict(), 0)
    testdict['test.org'] = ['http://test.org/1']
    backoffdict['test.org'] = datetime(2019, 5, 18, 15, 17, 8, 132263)
    assert cli_utils.draw_backoff_url(testdict, backoffdict,
                                      0, 3) == ('http://test.org/1', dict(),
                                                dict(), 3)
    testdict['test.org'] = ['http://test.org/1']
    # timestamp in the future: the counter is reset to 0
    backoffdict['test.org'] = datetime(2030, 5, 18, 15, 17, 8, 132263)
    assert cli_utils.draw_backoff_url(testdict, backoffdict,
                                      0, 3) == ('http://test.org/1', dict(),
                                                dict(), 0)
Esempio n. 11
0
def test_input_type():
    '''test input type errors'''
    cli_args = ['', '-v']
    with patch.object(sys, 'argv', cli_args):
        parsed = cli.parse_args(cli_args)
    # binary input: the first kilobyte of a GIF image must be rejected
    with open('docs/trafilatura-demo.gif', 'rb') as infile:
        payload = infile.read(1024)
    assert cli.examine(payload, parsed) is None
    # plain text that is not HTML must be rejected as well
    with open('docs/index.rst', 'r') as infile:
        payload = infile.read()
    assert cli.examine(payload, parsed) is None
Esempio n. 12
0
def test_parser():
    '''test argument parsing for the command-line interface'''
    # combined short flags (-f, -v) plus long options and a target URL
    testargs = [
        '', '-fv', '--xmltei', '--notables', '-u', 'https://www.example.org'
    ]
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    assert args.fast is True
    assert args.verbose is True
    # --notables stores False: table extraction is switched off
    assert args.notables is False
    assert args.xmltei is True
    assert args.URL == 'https://www.example.org'
    # map_args derives output_format from the individual format flags
    args = cli.map_args(args)
    assert args.output_format == 'xmltei'
    testargs = ['', '-out', 'csv', '-u', 'https://www.example.org']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    assert args.fast is False
    assert args.verbose is False
    assert args.output_format == 'csv'
    # test args mapping
    testargs = ['', '--xml']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    args = cli.map_args(args)
    assert args.output_format == 'xml'
    args.xml, args.csv = False, True
    args = cli.map_args(args)
    assert args.output_format == 'csv'
    args.csv, args.json = False, True
    args = cli.map_args(args)
    assert args.output_format == 'json'
    # process_args
    args.inputdir = '/dev/null'
    args.verbose = True
    args.blacklist = os.path.join(TEST_DIR, 'resources/list-discard.txt')
    cli.process_args(args)
    # process_args replaces the blacklist file path with its loaded contents
    assert len(args.blacklist) == 4
Esempio n. 13
0
def test_cli_pipeline():
    '''test command-line processing pipeline'''
    # test URL listing
    testargs = ['', '--list']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    # listing mode returns nothing for empty and non-empty URL lists alike
    assert cli_utils.url_processing_pipeline(args, [], 0) is None
    assert cli_utils.url_processing_pipeline(
        args, ['https://www.example.org/'], 0) is None
    # test inputlist + blacklist
    resources_dir = os.path.join(TEST_DIR, 'resources')
    testargs = ['', '-i', os.path.join(resources_dir, 'list-process.txt')]
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    my_urls = cli_utils.load_input_urls(args.inputfile)
    assert my_urls is not None and len(my_urls) == 2
    # test backoff between domain requests
    # a 2-second sleep between same-domain requests must be observable
    reftime = datetime.now()
    assert cli_utils.url_processing_pipeline(args, my_urls, 2) is None
    delta = (datetime.now() - reftime).total_seconds()
    assert delta > 2
    # URL in blacklist
    testargs = [
        '', '-i',
        os.path.join(resources_dir, 'list-process.txt'), '-b',
        os.path.join(resources_dir, 'list-discard.txt')
    ]
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    # FIX: call the check once and reuse the result — the previous version
    # ran it twice and left a stray debug print() in the test output
    filtered_urls = cli_utils.url_processing_checks(args, my_urls)
    assert len(filtered_urls) == 0
    # test backup
    testargs = ['', '--backup-dir', '/tmp/']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    cli_utils.archive_html('00Test', args)
    cli_utils.url_processing_pipeline(args, my_urls, 2)
Esempio n. 14
0
def test_sysoutput():
    '''test command-line output with respect to CLI arguments'''
    # FIX: removed stray debug print() calls that polluted the test output
    testargs = ['', '--csv', '-o', '/root/forbidden/']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    # a filename is derived even when the directory cannot be written to
    filepath, destdir = cli_utils.determine_output_path(args, args.outputdir)
    assert len(filepath) >= 10 and filepath.endswith('.csv')
    assert destdir == '/root/forbidden/'
    assert cli_utils.check_outputdir_status(args.outputdir) is False
    testargs = ['', '--xml', '-o', '/tmp/you-touch-my-tralala']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    assert cli_utils.check_outputdir_status(args.outputdir) is True
    # test fileslug for name
    filepath, destdir = cli_utils.determine_output_path(args,
                                                        args.outputdir,
                                                        new_filename='AAZZ')
    assert filepath.endswith('AAZZ.xml')
    # test directory counter
    assert cli_utils.determine_counter_dir('testdir', 0) == 'testdir/1'
    # test file writing
    testargs = ['', '--csv', '-o', '/dev/null/']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    result = 'DADIDA'
    cli_utils.write_result(result, args)
    # process with no counter
    assert cli_utils.process_result('DADIDA', args, None, None) is None
    # test keeping dir structure
    testargs = ['', '-i', 'myinputdir/', '-o', 'test/', '--keep-dirs']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    filepath, destdir = cli_utils.determine_output_path(args, 'testfile.txt')
    assert filepath == 'test/testfile.txt'
Esempio n. 15
0
def test_input_type():
    '''test input type errors'''
    testfile = 'docs/trafilatura-demo.gif'
    # an invalid URL makes main() bail out without raising
    testargs = ['', '-u', 'http']
    with patch.object(sys, 'argv', testargs):
        assert cli.main() is None
    testargs = ['', '-v']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    # binary input: the first kilobyte of a GIF image is rejected
    with open(testfile, 'rb') as f:
        teststring = f.read(1024)
    assert cli.examine(teststring, args) is None
    # plain text that is not HTML is rejected as well
    testfile = 'docs/usage.rst'
    with open(testfile, 'r') as f:
        teststring = f.read()
    assert cli.examine(teststring, args) is None
    # test file list
    assert 10 <= len(list(cli_utils.generate_filelist(RESOURCES_DIR))) <= 20
Esempio n. 16
0
def test_input_type():
    '''test input type errors'''
    # an invalid URL makes main() bail out without raising
    cli_args = ['', '-u', 'http']
    with patch.object(sys, 'argv', cli_args):
        assert cli.main() is None
    cli_args = ['', '-v']
    with patch.object(sys, 'argv', cli_args):
        parsed = cli.parse_args(cli_args)
    # binary input: the first kilobyte of a GIF image must be rejected
    with open('docs/trafilatura-demo.gif', 'rb') as infile:
        payload = infile.read(1024)
    assert cli.examine(payload, parsed) is None
    # plain text that is not HTML must be rejected as well
    with open('docs/index.rst', 'r') as infile:
        payload = infile.read()
    assert cli.examine(payload, parsed) is None
    # test file list
    resources_path = os.path.join(TEST_DIR, 'resources')
    assert cli_utils.generate_filelist(resources_path) is not None
Esempio n. 17
0
def test_download():
    '''test page download and command-line interface'''
    cli_args = ['', '-v']
    with patch.object(sys, 'argv', cli_args):
        parsed = cli.parse_args(cli_args)
    # degenerate inputs (None, whitespace, oversized string) are rejected
    for bogus in (None, ' ', '0' * int(10e7)):
        assert cli.examine(bogus, parsed) is None
    # NOTE: requires network access to httpbin.org
    url = 'https://httpbin.org/html'
    downloaded = fetch_url(url)
    assert downloaded is not None
    # a real HTML page yields an extraction result
    assert cli.examine(downloaded, parsed, url) is not None
Esempio n. 18
0
def test_parser():
    '''test argument parsing for the command-line interface'''
    # combined short flags: -f fast, -vv doubles verbosity to level 2
    testargs = [
        '', '-fvv', '--xmltei', '--notables', '-u', 'https://www.example.org'
    ]
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    assert args.fast is True
    assert args.verbose == 2
    # both the deprecated and the current spelling store False
    assert args.notables is False and args.no_tables is False
    assert args.xmltei is True
    assert args.URL == 'https://www.example.org'
    # map_args derives output_format from the individual format flags
    args = cli.map_args(args)
    assert args.output_format == 'xmltei'
    testargs = [
        '', '-out', 'csv', '--no-tables', '-u', 'https://www.example.org'
    ]
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    assert args.fast is False
    assert args.verbose == 0
    assert args.output_format == 'csv'
    assert args.no_tables is False
    # test args mapping
    testargs = ['', '--xml', '--nocomments', '--precision', '--recall']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    args = cli.map_args(args)
    assert args.output_format == 'xml' and args.no_comments is False
    # combination possible (?)
    assert args.precision is True and args.recall is True
    args.xml, args.csv = False, True
    args = cli.map_args(args)
    assert args.output_format == 'csv'
    args.csv, args.json = False, True
    args = cli.map_args(args)
    assert args.output_format == 'json'
    # --with-metadata maps to the only_with_metadata attribute
    testargs = ['', '--with-metadata']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    args = cli.map_args(args)
    assert args.only_with_metadata is True
    # process_args
    args.inputdir = '/dev/null'
    args.verbose = 1
    args.blacklist = os.path.join(RESOURCES_DIR, 'list-discard.txt')
    cli.process_args(args)
    # process_args replaces the blacklist file path with its loaded contents
    assert len(args.blacklist) == 2
    # filter
    testargs = [
        '', '-i', 'resources/list-discard.txt', '--url-filter', 'test1',
        'test2'
    ]
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    assert args.inputfile == 'resources/list-discard.txt'
    assert args.url_filter == ['test1', 'test2']
    args.inputfile = os.path.join(RESOURCES_DIR, 'list-discard.txt')
    args.blacklist = os.path.join(RESOURCES_DIR, 'list-discard.txt')
    f = io.StringIO()
    with redirect_stdout(f):
        cli.process_args(args)
    # everything on the input list is blacklisted, so nothing is printed
    assert len(f.getvalue()) == 0
    # version
    # --version triggers SystemExit with code 0 and prints the banner
    testargs = ['', '--version']
    with pytest.raises(SystemExit) as e, redirect_stdout(f):
        with patch.object(sys, 'argv', testargs):
            args = cli.parse_args(testargs)
    assert e.type == SystemExit
    assert e.value.code == 0
    assert re.match(
        r'Trafilatura [0-9]\.[0-9]\.[0-9] - Python [0-9]\.[0-9]+\.[0-9]',
        f.getvalue())
Esempio n. 19
0
def test_cli_pipeline():
    '''Test the command-line processing pipeline end to end: URL listing,
    input/blacklist files, request backoff, HTML backup, output formats,
    file processing, sitemaps, config files, crawling and exploration.'''
    # straight command-line input
    #testargs = ['', '<html><body>Text</body></html>']
    #with patch.object(sys, 'argv', testargs):
    #    args = cli.parse_args(testargs)
    #f = io.StringIO()
    #with redirect_stdout(f):
    #    cli.process_args(args)
    #assert len(f.getvalue()) == 0
    # test URL listing

    # Force encoding to utf-8 for Windows in future processes spawned by multiprocessing.Pool
    os.environ['PYTHONIOENCODING'] = "utf-8"

    testargs = ['', '--list']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    assert cli_utils.url_processing_pipeline(args, dict()) is None
    # test inputlist + blacklist
    testargs = ['', '-i', os.path.join(RESOURCES_DIR, 'list-process.txt')]
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    my_urls = cli_utils.load_input_urls(args)
    assert my_urls is not None and len(my_urls) == 2
    testargs = [
        '', '-i',
        os.path.join(RESOURCES_DIR, 'list-process.txt'), '--blacklist',
        os.path.join(RESOURCES_DIR, 'list-discard.txt'), '--archived'
    ]
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    assert args.blacklist is not None
    # test backoff between domain requests
    inputdict = add_to_compressed_dict(my_urls, args.blacklist, None, None)
    reftime = datetime.now()
    cli_utils.url_processing_pipeline(args, inputdict)
    delta = (datetime.now() - reftime).total_seconds()
    assert delta > 2
    # test blacklist and empty dict
    args.blacklist = cli_utils.load_blacklist(args.blacklist)
    assert len(args.blacklist) == 2
    inputdict = add_to_compressed_dict(my_urls, args.blacklist, None, None)
    cli_utils.url_processing_pipeline(args, inputdict)
    # test backup
    testargs = ['', '--backup-dir', '/tmp/']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    cli_utils.archive_html('00Test', args)
    # read the HTML sample once; it is reused for several option combinations
    # (explicit encoding to avoid platform-dependent defaults, esp. on Windows)
    with open(os.path.join(RESOURCES_DIR, 'httpbin_sample.html'), 'r', encoding='utf-8') as f:
        teststring = f.read()
    # test date-based exclusion
    testargs = ['', '-out', 'xml', '--with-metadata']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    assert cli.examine(teststring, args) is None
    testargs = ['', '-out', 'xml', '--only-with-metadata', '--precision']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    assert cli.examine(teststring, args) is None
    # test JSON output
    testargs = ['', '-out', 'json', '--recall']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    assert cli.examine(teststring, args) is not None
    # dry-run file processing pipeline
    testargs = ['', '--parallel', '1', '--inputdir', '/dev/null']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    cli_utils.file_processing_pipeline(args)
    # file processing pipeline on resources/
    args.inputdir = RESOURCES_DIR
    cli_utils.file_processing_pipeline(args)
    # sitemaps
    testargs = ['', '--sitemap', 'https://httpbin.org/', '--list']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    f = io.StringIO()
    with redirect_stdout(f):
        cli.process_args(args)
    assert len(f.getvalue()) == 0
    # config file
    testargs = [
        '', '--inputdir', '/dev/null', '--config-file', 'newsettings.cfg'
    ]
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    args.config_file = os.path.join(RESOURCES_DIR, args.config_file)
    # config = use_config(filename=args.config_file)
    assert cli.examine(teststring, args) is None
    # CLI options
    testargs = ['', '--links', '--images']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    #with open(os.path.join(RESOURCES_DIR, 'http_sample.html'), 'r') as f:
    #    teststring = f.read()
    #result = cli.examine(teststring, args)
    #assert '[link](testlink.html)' in result # and 'test.jpg' in result

    # Crawling
    testargs = ['', '--crawl', 'https://httpbin.org/html']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    f = io.StringIO()
    with redirect_stdout(f):
        cli_utils.cli_crawler(args)
    assert len(f.getvalue()) == 0
    # links permitted
    testargs = [
        '', '--crawl', 'https://httpbin.org/links/1/1', '--list', '--parallel',
        '1'
    ]
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    f = io.StringIO()
    with redirect_stdout(f):
        cli_utils.cli_crawler(args)
    assert f.getvalue() == 'https://httpbin.org/links/1/0\n'
    # 0 links permitted
    args.crawl = 'https://httpbin.org/links/4/4'
    f = io.StringIO()
    with redirect_stdout(f):
        cli_utils.cli_crawler(args, n=0)
    assert len(f.getvalue().split('\n')) == 5
    # Exploration (Sitemap + Crawl)
    testargs = ['', '--explore', 'https://httpbin.org/html']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    f = io.StringIO()
    with redirect_stdout(f):
        cli.process_args(args)
    assert len(f.getvalue()) == 0
Esempio n. 20
0
def test_queue():
    'Test creation, modification and download of URL queues.'
    # test conversion and storage: invalid schemes/URLs are filtered out
    inputdict = add_to_compressed_dict(['ftps://www.example.org/', 'http://'])
    assert inputdict == dict()
    inputdict = add_to_compressed_dict(['https://www.example.org/'])
    # CLI args
    testargs = ['', '--list']
    with patch.object(sys, 'argv', testargs):
        args = parse_args(testargs)
    assert url_processing_pipeline(args, inputdict) is None
    # single/multiprocessing
    testargs = ['', '-v']
    with patch.object(sys, 'argv', testargs):
        args = parse_args(testargs)
    domain_dict = {
        'https://httpbin.org': deque(
            [
                '/status/301',
                '/status/304',
                '/status/200',
                '/status/300',
                '/status/400',
                '/status/505',
            ]
        )
    }
    args.archived = True
    args.config_file = os.path.join(RESOURCES_DIR, 'newsettings.cfg')
    config = use_config(filename=args.config_file)
    config['DEFAULT']['SLEEP_TIME'] = '0.2'
    results = download_queue_processing(domain_dict, args, None, config)
    ## fixed: /301 missing, probably for a good reason...
    assert len(results[0]) == 5 and results[1] is None
    # test backoff algorithm
    backoffdict = {}
    testdict = {'http://test.org': deque(['/1'])}
    assert draw_backoff_url(testdict, backoffdict, 0) == ('http://test.org/1', dict(), dict())
    testdict['http://test.org'] = deque(['/1'])
    backoffdict['http://test.org'] = datetime(2019, 5, 18, 15, 17, 8, 132263)
    assert draw_backoff_url(testdict, backoffdict, 0) == ('http://test.org/1', dict(), dict())
    # concurrent domains
    testdict = {
        'http://test.org': deque(['/1']),
        'http://example.org': deque(['/1']),
    }
    backoffdict = {}
    # simulate recent request
    backoffdict['http://test.org'] = datetime.now()
    # must return the other domain
    test = draw_backoff_url(testdict, backoffdict, 5)
    # bug fix: the original `assert test[0], test[1] == (...)` only asserted
    # the truthiness of test[0] — the tuple comparison was parsed as the
    # assert message and never actually checked
    assert (test[0], test[1]) == ('http://example.org/1', {'http://test.org': deque(['/1'])})
    assert test[2] != {}
    # sleeps and returns the rest
    assert draw_backoff_url(testdict, backoffdict, 1) == ('http://test.org/1', {}, {})
    # code hangs, logical:
    #testdict['http://test.org'] = deque(['/1'])
    #backoffdict['http://test.org'] = datetime(2030, 5, 18, 15, 17, 8, 132263)
    #assert draw_backoff_url(testdict, backoffdict, 0) == ('http://test.org/1', dict(), dict())
    # download buffer: six domains with three queued paths each
    domain_dict = {
        f'https://test{suffix}.org': deque(['/1', '/2', '/3'])
        for suffix in ('', '2', '3', '4', '5', '6')
    }
    bufferlist, _, _, _ = load_download_buffer(domain_dict, dict(), sleep_time=5, threads=1)
    assert len(bufferlist) == 6
    bufferlist, _, _, _ = load_download_buffer(domain_dict, dict(), sleep_time=5, threads=2)
    assert len(bufferlist) == 6
Esempio n. 21
0
def test_cli_pipeline():
    '''test command-line processing pipeline'''
    # straight command-line input
    #testargs = ['', '<html><body>Text</body></html>']
    #with patch.object(sys, 'argv', testargs):
    #    args = cli.parse_args(testargs)
    #f = io.StringIO()
    #with redirect_stdout(f):
    #    cli.process_args(args)
    #assert len(f.getvalue()) == 0
    # test URL listing
    testargs = ['', '--list']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    assert cli_utils.url_processing_pipeline(args, dict()) is None
    # test conversion and storage
    inputdict = cli.convert_inputlist(None, ['ftps://www.example.org/'], None,
                                      None)
    assert inputdict == dict()
    inputdict = cli.convert_inputlist(None, ['https://www.example.org/'], None,
                                      None)
    assert cli_utils.url_processing_pipeline(args, inputdict) is None
    # test inputlist + blacklist
    testargs = ['', '-i', os.path.join(RESOURCES_DIR, 'list-process.txt')]
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    my_urls = cli_utils.load_input_urls(args.inputfile)
    assert my_urls is not None and len(my_urls) == 2
    testargs = [
        '', '-i',
        os.path.join(RESOURCES_DIR, 'list-process.txt'), '--blacklist',
        os.path.join(RESOURCES_DIR, 'list-discard.txt'), '--archived'
    ]
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    assert args.blacklist is not None
    # test backoff between domain requests
    inputdict = cli_utils.convert_inputlist(args.blacklist, my_urls, None,
                                            None)
    reftime = datetime.now()
    cli_utils.url_processing_pipeline(args, inputdict)
    delta = (datetime.now() - reftime).total_seconds()
    assert delta > 2
    # test blacklist and empty dict
    args.blacklist = cli_utils.load_blacklist(args.blacklist)
    assert len(args.blacklist) == 2
    inputdict = cli_utils.convert_inputlist(args.blacklist, my_urls, None,
                                            None)
    cli_utils.url_processing_pipeline(args, inputdict)
    # test backup
    testargs = ['', '--backup-dir', '/tmp/']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    cli_utils.archive_html('00Test', args)
    # test date-based exclusion
    testargs = ['', '-out', 'xml', '--with-metadata']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    with open(os.path.join(RESOURCES_DIR, 'httpbin_sample.html'), 'r') as f:
        teststring = f.read()
    assert cli.examine(teststring, args) is None
    # test JSON output
    testargs = ['', '-out', 'json']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    with open(os.path.join(RESOURCES_DIR, 'httpbin_sample.html'), 'r') as f:
        teststring = f.read()
    assert cli.examine(teststring, args) is not None
    # dry-run file processing pipeline
    testargs = ['', '--parallel', '1', '--inputdir', '/dev/null']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    cli_utils.file_processing_pipeline(args)
    # file processing pipeline on resources/
    args.inputdir = RESOURCES_DIR
    cli_utils.file_processing_pipeline(args)
    # sitemaps
    testargs = ['', '--sitemap', 'https://httpbin.org/', '--list']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    f = io.StringIO()
    with redirect_stdout(f):
        cli.process_args(args)
    assert len(f.getvalue()) == 0
    # config file
    testargs = [
        '', '--inputdir', '/dev/null', '--config-file', 'newsettings.cfg'
    ]
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    with open(os.path.join(RESOURCES_DIR, 'httpbin_sample.html'), 'r') as f:
        teststring = f.read()
    args.config_file = os.path.join(RESOURCES_DIR, args.config_file)
    config = use_config(filename=args.config_file)
    assert cli.examine(teststring, args) is None
    # CLI options
    testargs = ['', '--links', '--images']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)