Code example #1
def test_input_filtering():
    '''test internal functions to filter urls'''
    testargs = ['']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    # load dictionary
    args.inputfile = os.path.join(RESOURCES_DIR, 'list-process.txt')
    inputdict = cli.load_input_dict(args)
    assert inputdict['https://httpbin.org'] == deque(['/status/200', '/status/404'])
    args.inputfile = os.path.join(RESOURCES_DIR, 'list-process.txt')
    args.blacklist = {'httpbin.org/status/404'}
    inputdict = cli.load_input_dict(args)
    assert inputdict['https://httpbin.org'] == deque(['/status/200'])
    # deduplication and filtering
    myinput = [
        'https://example.org/1', 'https://example.org/2',
        'https://example.org/2', 'https://example.org/3',
        'https://example.org/4', 'https://example.org/5',
        'https://example.org/6'
    ]
    myblacklist = {'example.org/1', 'example.org/3', 'example.org/5'}
    inputdict = add_to_compressed_dict(myinput, myblacklist)
    assert inputdict['https://example.org'] == deque(['/2', '/4', '/6'])
    # URL in blacklist
    args.inputfile = os.path.join(RESOURCES_DIR, 'list-process.txt')
    my_urls = cli_utils.load_input_urls(args)
    my_blacklist = cli_utils.load_blacklist(os.path.join(RESOURCES_DIR, 'list-discard.txt'))
    inputdict = add_to_compressed_dict(my_urls, my_blacklist)
    assert len(inputdict) == 0
    # URL filter
    args.inputfile = os.path.join(RESOURCES_DIR, 'list-process.txt')
    my_urls = cli_utils.load_input_urls(args)
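    # the asserts below imply OR semantics: a URL is kept if it matches any of the filter strings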
    assert len(add_to_compressed_dict(my_urls, None, ['status'], None)) == 1
    assert len(add_to_compressed_dict(my_urls, None, ['teststring'], None)) == 0
    assert len(add_to_compressed_dict(my_urls, None, ['status', 'teststring'], None)) == 1
    # malformed URLs
    inputdict = add_to_compressed_dict(['123345', 'https://www.example.org/1'], {}, None, None)
    assert len(inputdict) == 1
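These snippets are excerpts from trafilatura's test suite and leave out the module-level setup. A minimal sketch of the imports and constants they assume; the exact module paths (notably for add_to_compressed_dict and use_config) have moved between trafilatura versions, so treat them as assumptions:

import io
import os
import sys
from collections import deque
from contextlib import redirect_stdout
from datetime import datetime
from unittest.mock import patch

from trafilatura import cli
from trafilatura import cli_utils  # internal helpers used throughout these tests
from trafilatura.settings import use_config  # location assumed; check your version
from trafilatura.utils import add_to_compressed_dict  # location assumed; check your version

TEST_DIR = os.path.abspath(os.path.dirname(__file__))
RESOURCES_DIR = os.path.join(TEST_DIR, 'resources')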
Code example #2
File: cli_tests.py, Project: EiffelFly/trafilatura
def test_input_filtering():
    '''test internal functions to filter urls'''
    # deduplication and filtering
    myinput = [
        'https://example.org/1', 'https://example.org/2',
        'https://example.org/2', 'https://example.org/3',
        'https://example.org/4', 'https://example.org/5',
        'https://example.org/6'
    ]
    myblacklist = {
        'https://example.org/1', 'https://example.org/3',
        'https://example.org/5'
    }
    assert cli_utils.url_processing_checks(myblacklist, myinput) == [
        'https://example.org/2', 'https://example.org/4',
        'https://example.org/6'
    ]
    # URL in blacklist
    resources_dir = os.path.join(TEST_DIR, 'resources')
    my_urls = cli_utils.load_input_urls(
        os.path.join(resources_dir, 'list-process.txt'))
    my_blacklist = cli_utils.load_blacklist(
        os.path.join(resources_dir, 'list-discard.txt'))
    print(cli_utils.url_processing_checks(my_blacklist, my_urls))
    assert len(cli_utils.url_processing_checks(my_blacklist, my_urls)) == 0
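The deduplication-and-blacklist step can be reproduced without trafilatura. A small order-preserving sketch of the same idea (filter_urls is an illustrative name, not the library's API):

def filter_urls(urls, blacklist):
    '''Drop duplicate URLs while preserving order, then discard blacklisted ones.'''
    seen = set()
    kept = []
    for url in urls:
        if url in seen or url in blacklist:
            continue
        seen.add(url)
        kept.append(url)
    return kept

assert filter_urls(
    ['https://example.org/1', 'https://example.org/2', 'https://example.org/2'],
    {'https://example.org/1'},
) == ['https://example.org/2']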
Code example #3
def test_cli_pipeline():
    '''test command-line processing pipeline'''
    # test URL listing
    testargs = ['', '--list']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    assert cli_utils.url_processing_pipeline(args, [], 0) is None
    assert cli_utils.url_processing_pipeline(
        args, ['https://www.example.org/'], 0) is None
    # test inputlist + blacklist
    resources_dir = os.path.join(TEST_DIR, 'resources')
    testargs = ['', '-i', os.path.join(resources_dir, 'list-process.txt')]
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    my_urls = cli_utils.load_input_urls(args.inputfile)
    assert my_urls is not None and len(my_urls) == 2
    resources_dir = os.path.join(TEST_DIR, 'resources')
    #testargs = ['', '-i', os.path.join(resources_dir, 'list-process.txt'), '--blacklist', os.path.join(resources_dir, 'list-discard.txt')]
    #with patch.object(sys, 'argv', testargs):
    #    args = cli.parse_args(testargs)
    #print(args.blacklist)
    #assert args.blacklist is not None
    # test backoff between domain requests
    reftime = datetime.now()
    assert cli_utils.url_processing_pipeline(args, my_urls, 2) is None
    delta = (datetime.now() - reftime).total_seconds()
    assert delta > 2
    # test backup
    testargs = ['', '--backup-dir', '/tmp/']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    cli_utils.archive_html('00Test', args)
    cli_utils.url_processing_pipeline(args, my_urls, 2)
    # test date-based exclusion
    testargs = ['', '-out', 'xml', '--with-metadata']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    with open(os.path.join(resources_dir, 'httpbin_sample.html'), 'r') as f:
        teststring = f.read()
    assert cli.examine(teststring, args) is None
    # test timeout
    testargs = ['', '-out', 'xml', '--timeout']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    with open(os.path.join(resources_dir, 'httpbin_sample.html'), 'r') as f:
        teststring = f.read()
    assert cli.examine(teststring, args) is not None
    # test JSON output
    testargs = ['', '-out', 'json']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    with open(os.path.join(resources_dir, 'httpbin_sample.html'), 'r') as f:
        teststring = f.read()
    assert cli.examine(teststring, args) is not None
    # file processing pipeline
    testargs = ['', '--parallel', '1', '--inputdir', '/dev/null']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    cli_utils.file_processing_pipeline(args)
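Every example uses the same idiom: patch sys.argv so argparse-based code sees a controlled command line, then call parse_args with the same list. A self-contained illustration with a throwaway parser (the parser below is hypothetical, not trafilatura's):

import argparse
import sys
from unittest.mock import patch

def parse_args(args):
    parser = argparse.ArgumentParser()
    parser.add_argument('--list', action='store_true')
    # args[0] mimics the program-name slot of sys.argv
    return parser.parse_args(args[1:])

testargs = ['', '--list']
with patch.object(sys, 'argv', testargs):
    args = parse_args(testargs)
assert args.list is True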
Code example #4
File: cli_tests.py, Project: seryum/trafilatura
def test_input_filtering():
    '''test internal functions to filter urls'''
    resources_dir = os.path.join(TEST_DIR, 'resources')
    # load dictionary
    inputdict = cli.load_input_dict(
        os.path.join(resources_dir, 'list-process.txt'), set())
    assert inputdict['https://httpbin.org'] == ['/status/200', '/status/404']
    inputdict = cli.load_input_dict(
        os.path.join(resources_dir, 'list-process.txt'),
        {'httpbin.org/status/404'})
    assert inputdict['https://httpbin.org'] == ['/status/200']
    # deduplication and filtering
    myinput = [
        'https://example.org/1', 'https://example.org/2',
        'https://example.org/2', 'https://example.org/3',
        'https://example.org/4', 'https://example.org/5',
        'https://example.org/6'
    ]
    myblacklist = {'example.org/1', 'example.org/3', 'example.org/5'}
    inputdict = cli_utils.convert_inputlist(myblacklist, myinput, None, None)
    assert inputdict['https://example.org'] == ['/2', '/4', '/6']
    # URL in blacklist
    my_urls = cli_utils.load_input_urls(
        os.path.join(resources_dir, 'list-process.txt'))
    my_blacklist = cli_utils.load_blacklist(
        os.path.join(resources_dir, 'list-discard.txt'))
    inputdict = cli_utils.convert_inputlist(my_blacklist, my_urls, None, None)
    assert len(inputdict) == 0
    # URL filter
    my_urls = cli_utils.load_input_urls(
        os.path.join(resources_dir, 'list-process.txt'))
    assert len(cli.convert_inputlist(None, my_urls, ['status'], None)) == 1
    my_urls = cli_utils.load_input_urls(
        os.path.join(resources_dir, 'list-process.txt'))
    assert len(cli.convert_inputlist(None, my_urls, ['teststring'], None)) == 0
    my_urls = cli_utils.load_input_urls(
        os.path.join(resources_dir, 'list-process.txt'))
    assert len(
        cli.convert_inputlist(None, my_urls, ['status', 'teststring'],
                              None)) == 1
    # malformed URLs
    inputdict = cli_utils.convert_inputlist(
        {}, ['123345', 'https://www.example.org/1'], None, None)
    assert len(inputdict) == 1
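Whatever the version-specific name (load_input_dict, convert_inputlist, add_to_compressed_dict), the loaders return the same shape: a dictionary keyed by scheme and host with the paths grouped under it, a list in older versions and a deque in newer ones. A rough sketch of that grouping, not the library's implementation:

from collections import deque
from urllib.parse import urlsplit

def group_by_host(urls):
    '''Group URL paths under their scheme://host prefix.'''
    grouped = {}
    for url in urls:
        parts = urlsplit(url)
        hostkey = f'{parts.scheme}://{parts.netloc}'
        grouped.setdefault(hostkey, deque()).append(parts.path)
    return grouped

assert group_by_host(['https://example.org/1', 'https://example.org/2']) == {
    'https://example.org': deque(['/1', '/2'])
}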
Code example #5
def test_cli_pipeline():
    '''test command-line processing pipeline'''
    # test URL listing
    testargs = ['', '--list']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    assert cli_utils.url_processing_pipeline(args, [], 0) is None
    assert cli_utils.url_processing_pipeline(
        args, ['https://www.example.org/'], 0) is None
    # test inputlist + blacklist
    resources_dir = os.path.join(TEST_DIR, 'resources')
    testargs = ['', '-i', os.path.join(resources_dir, 'list-process.txt')]
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    my_urls = cli_utils.load_input_urls(args.inputfile)
    assert my_urls is not None and len(my_urls) == 2
    # test backoff between domain requests
    reftime = datetime.now()
    assert cli_utils.url_processing_pipeline(args, my_urls, 2) is None
    delta = (datetime.now() - reftime).total_seconds()
    assert delta > 2
    # URL in blacklist
    testargs = [
        '', '-i',
        os.path.join(resources_dir, 'list-process.txt'), '-b',
        os.path.join(resources_dir, 'list-discard.txt')
    ]
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    print(cli_utils.url_processing_checks(args, my_urls))
    assert len(cli_utils.url_processing_checks(args, my_urls)) == 0
    # test backup
    testargs = ['', '--backup-dir', '/tmp/']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    cli_utils.archive_html('00Test', args)
    cli_utils.url_processing_pipeline(args, my_urls, 2)
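The backoff assertion rests on plain wall-clock arithmetic: take a reference time, run the pipeline with a per-domain sleep, and check the elapsed seconds. The pattern in isolation, with time.sleep standing in for the pipeline call:

import time
from datetime import datetime

reftime = datetime.now()
time.sleep(2)  # stand-in for url_processing_pipeline with a 2-second backoff
delta = (datetime.now() - reftime).total_seconds()
assert delta >= 2  # the tests above use a strict > 2, counting on call overhead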
Code example #6
def test_cli_pipeline():
    '''test command-line processing pipeline'''
    # straight command-line input
    #testargs = ['', '<html><body>Text</body></html>']
    #with patch.object(sys, 'argv', testargs):
    #    args = cli.parse_args(testargs)
    #f = io.StringIO()
    #with redirect_stdout(f):
    #    cli.process_args(args)
    #assert len(f.getvalue()) == 0
    # test URL listing

    # Force encoding to utf-8 for Windows in future processes spawned by multiprocessing.Pool
    os.environ['PYTHONIOENCODING'] = "utf-8"

    testargs = ['', '--list']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    assert cli_utils.url_processing_pipeline(args, dict()) is None
    # test inputlist + blacklist
    testargs = ['', '-i', os.path.join(RESOURCES_DIR, 'list-process.txt')]
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    my_urls = cli_utils.load_input_urls(args)
    assert my_urls is not None and len(my_urls) == 2
    testargs = [
        '', '-i',
        os.path.join(RESOURCES_DIR, 'list-process.txt'), '--blacklist',
        os.path.join(RESOURCES_DIR, 'list-discard.txt'), '--archived'
    ]
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    assert args.blacklist is not None
    # test backoff between domain requests
    inputdict = add_to_compressed_dict(my_urls, args.blacklist, None, None)
    reftime = datetime.now()
    cli_utils.url_processing_pipeline(args, inputdict)
    delta = (datetime.now() - reftime).total_seconds()
    assert delta > 2
    # test blacklist and empty dict
    args.blacklist = cli_utils.load_blacklist(args.blacklist)
    assert len(args.blacklist) == 2
    inputdict = add_to_compressed_dict(my_urls, args.blacklist, None, None)
    cli_utils.url_processing_pipeline(args, inputdict)
    # test backup
    testargs = ['', '--backup-dir', '/tmp/']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    cli_utils.archive_html('00Test', args)
    # test date-based exclusion
    testargs = ['', '-out', 'xml', '--with-metadata']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    with open(os.path.join(RESOURCES_DIR, 'httpbin_sample.html'), 'r') as f:
        teststring = f.read()
    assert cli.examine(teststring, args) is None
    testargs = ['', '-out', 'xml', '--only-with-metadata', '--precision']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    with open(os.path.join(RESOURCES_DIR, 'httpbin_sample.html'), 'r') as f:
        teststring = f.read()
    assert cli.examine(teststring, args) is None
    # test JSON output
    testargs = ['', '-out', 'json', '--recall']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    with open(os.path.join(RESOURCES_DIR, 'httpbin_sample.html'), 'r') as f:
        teststring = f.read()
    assert cli.examine(teststring, args) is not None
    # dry-run file processing pipeline
    testargs = ['', '--parallel', '1', '--inputdir', '/dev/null']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    cli_utils.file_processing_pipeline(args)
    # file processing pipeline on resources/
    args.inputdir = RESOURCES_DIR
    cli_utils.file_processing_pipeline(args)
    # sitemaps
    testargs = ['', '--sitemap', 'https://httpbin.org/', '--list']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    f = io.StringIO()
    with redirect_stdout(f):
        cli.process_args(args)
    assert len(f.getvalue()) == 0
    # config file
    testargs = [
        '', '--inputdir', '/dev/null', '--config-file', 'newsettings.cfg'
    ]
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    with open(os.path.join(RESOURCES_DIR, 'httpbin_sample.html'), 'r') as f:
        teststring = f.read()
    args.config_file = os.path.join(RESOURCES_DIR, args.config_file)
    # config = use_config(filename=args.config_file)
    assert cli.examine(teststring, args) is None
    # CLI options
    testargs = ['', '--links', '--images']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    #with open(os.path.join(RESOURCES_DIR, 'http_sample.html'), 'r') as f:
    #    teststring = f.read()
    #result = cli.examine(teststring, args)
    #assert '[link](testlink.html)' in result # and 'test.jpg' in result

    # Crawling
    testargs = ['', '--crawl', 'https://httpbin.org/html']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    f = io.StringIO()
    with redirect_stdout(f):
        cli_utils.cli_crawler(args)
    assert len(f.getvalue()) == 0
    # links permitted
    testargs = [
        '', '--crawl', 'https://httpbin.org/links/1/1', '--list', '--parallel',
        '1'
    ]
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    f = io.StringIO()
    with redirect_stdout(f):
        cli_utils.cli_crawler(args)
    assert f.getvalue() == 'https://httpbin.org/links/1/0\n'
    # 0 links permitted
    args.crawl = 'https://httpbin.org/links/4/4'
    f = io.StringIO()
    with redirect_stdout(f):
        cli_utils.cli_crawler(args, n=0)
    # print(f.getvalue())
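    # four newline-terminated URLs in the buffer, so split('\n') yields five parts (the last empty)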
    assert len(f.getvalue().split('\n')) == 5

    # Exploration (Sitemap + Crawl)
    testargs = ['', '--explore', 'https://httpbin.org/html']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    f = io.StringIO()
    with redirect_stdout(f):
        cli.process_args(args)
    assert len(f.getvalue()) == 0
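Example #6 verifies CLI output by swapping stdout for an in-memory buffer, which captures anything printed in-process. The idiom in minimal form:

import io
from contextlib import redirect_stdout

f = io.StringIO()
with redirect_stdout(f):
    print('https://httpbin.org/links/1/0')
assert f.getvalue() == 'https://httpbin.org/links/1/0\n'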
Code example #7
def test_cli_pipeline():
    '''test command-line processing pipeline'''
    # straight command-line input
    #testargs = ['', '<html><body>Text</body></html>']
    #with patch.object(sys, 'argv', testargs):
    #    args = cli.parse_args(testargs)
    #f = io.StringIO()
    #with redirect_stdout(f):
    #    cli.process_args(args)
    #assert len(f.getvalue()) == 0
    # test URL listing
    testargs = ['', '--list']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    assert cli_utils.url_processing_pipeline(args, dict()) is None
    # test conversion and storage
    inputdict = cli.convert_inputlist(None, ['ftps://www.example.org/'], None,
                                      None)
    assert inputdict == dict()
    inputdict = cli.convert_inputlist(None, ['https://www.example.org/'], None,
                                      None)
    assert cli_utils.url_processing_pipeline(args, inputdict) is None
    # test inputlist + blacklist
    testargs = ['', '-i', os.path.join(RESOURCES_DIR, 'list-process.txt')]
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    my_urls = cli_utils.load_input_urls(args.inputfile)
    assert my_urls is not None and len(my_urls) == 2
    testargs = [
        '', '-i',
        os.path.join(RESOURCES_DIR, 'list-process.txt'), '--blacklist',
        os.path.join(RESOURCES_DIR, 'list-discard.txt'), '--archived'
    ]
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    assert args.blacklist is not None
    # test backoff between domain requests
    inputdict = cli_utils.convert_inputlist(args.blacklist, my_urls, None,
                                            None)
    reftime = datetime.now()
    cli_utils.url_processing_pipeline(args, inputdict)
    delta = (datetime.now() - reftime).total_seconds()
    assert delta > 2
    # test blacklist and empty dict
    args.blacklist = cli_utils.load_blacklist(args.blacklist)
    assert len(args.blacklist) == 2
    inputdict = cli_utils.convert_inputlist(args.blacklist, my_urls, None,
                                            None)
    cli_utils.url_processing_pipeline(args, inputdict)
    # test backup
    testargs = ['', '--backup-dir', '/tmp/']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    cli_utils.archive_html('00Test', args)
    # test date-based exclusion
    testargs = ['', '-out', 'xml', '--with-metadata']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    with open(os.path.join(RESOURCES_DIR, 'httpbin_sample.html'), 'r') as f:
        teststring = f.read()
    assert cli.examine(teststring, args) is None
    # test JSON output
    testargs = ['', '-out', 'json']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    with open(os.path.join(RESOURCES_DIR, 'httpbin_sample.html'), 'r') as f:
        teststring = f.read()
    assert cli.examine(teststring, args) is not None
    # dry-run file processing pipeline
    testargs = ['', '--parallel', '1', '--inputdir', '/dev/null']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    cli_utils.file_processing_pipeline(args)
    # file processing pipeline on resources/
    args.inputdir = RESOURCES_DIR
    cli_utils.file_processing_pipeline(args)
    # sitemaps
    testargs = ['', '--sitemap', 'https://httpbin.org/', '--list']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    f = io.StringIO()
    with redirect_stdout(f):
        cli.process_args(args)
    assert len(f.getvalue()) == 0
    # config file
    testargs = [
        '', '--inputdir', '/dev/null', '--config-file', 'newsettings.cfg'
    ]
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    with open(os.path.join(RESOURCES_DIR, 'httpbin_sample.html'), 'r') as f:
        teststring = f.read()
    args.config_file = os.path.join(RESOURCES_DIR, args.config_file)
    config = use_config(filename=args.config_file)
    assert cli.examine(teststring, args) is None
    # CLI options
    testargs = ['', '--links', '--images']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
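The config-file step in examples #6 and #7 points the pipeline at a custom settings file via use_config. A hedged usage sketch outside the test harness (the file's keys follow trafilatura's settings.cfg template; check your version's documentation):

from trafilatura import extract
from trafilatura.settings import use_config

# load custom extraction settings from a .cfg file (assumed API, stable in recent versions)
config = use_config(filename='newsettings.cfg')
result = extract('<html><body><p>Short sample text.</p></body></html>', config=config)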