Esempio n. 1
# NOTE(review): this excerpt appears to have been damaged during extraction:
# several brackets are never closed and the string literals that presumably
# held URLs are empty. Restore the missing pieces before running it.
def test_queue():
    'Test creation, modification and download of URL queues.'
    # test conversion and storage
    # a bare 'ftps://' entry is expected to yield an empty dictionary
    inputdict = add_to_compressed_dict(['ftps://'])
    assert inputdict == dict()
    inputdict = add_to_compressed_dict([''])
    # CLI args
    # with '--list' the processing pipeline is expected to return None
    testargs = ['', '--list']
    with patch.object(sys, 'argv', testargs):
        args = parse_args(testargs)
    assert url_processing_pipeline(args, inputdict) is None
    # single/multiprocessing
    testargs = ['', '-v']
    with patch.object(sys, 'argv', testargs):
        args = parse_args(testargs)
    domain_dict = dict()
    # NOTE(review): the closing '])' of this deque literal is missing
    domain_dict[''] = deque([
        '/status/301', '/status/304', '/status/200', '/status/300',
        '/status/400', '/status/505'
    args.archived = True
    # configuration is read from the settings file in the test resources
    args.config_file = os.path.join(RESOURCES_DIR, 'newsettings.cfg')
    config = use_config(filename=args.config_file)
    results = download_queue_processing(domain_dict, args, None, config)
    # six entries are expected in the first element, None as the second
    assert len(results[0]) == 6 and results[1] is None
    # test backoff algorithm
    testdict = dict()
    backoffdict = dict()
    testdict[''] = deque(['/1'])
    # NOTE(review): the expected tuple on the right-hand side is cut off
    assert draw_backoff_url(testdict, backoffdict, 0,
                            set()) == ('', dict(), dict(),
    testdict[''] = deque(['/1'])
    # same check with a backoff timestamp set in 2019
    backoffdict[''] = datetime(2019, 5, 18, 15, 17, 8, 132263)
    # NOTE(review): the expected tuple on the right-hand side is cut off
    assert draw_backoff_url(testdict, backoffdict, 0,
                            set()) == ('', dict(), dict(),
    # code hangs, logical:
    #testdict[''] = deque(['/1'])
    #backoffdict[''] = datetime(2030, 5, 18, 15, 17, 8, 132263)
    #assert cli_utils.draw_backoff_url(testdict, backoffdict, 0, 3) == ('', dict(), dict(), 0)
    # download buffer
    # NOTE(review): all six keys are the same empty string as written, so the
    # literal would collapse to a single entry; the closing '}' is also missing
    domain_dict = {
        '': deque(['/1', '/2', '/3']),
        '': deque(['/1', '/2', '/3']),
        '': deque(['/1', '/2', '/3']),
        '': deque(['/1', '/2', '/3']),
        '': deque(['/1', '/2', '/3']),
        '': deque(['/1', '/2', '/3'])
    # NOTE(review): the remaining arguments of both calls below are cut off
    bufferlist, _, _, _ = load_download_buffer(domain_dict,
    assert len(bufferlist) == 6
    bufferlist, _, _, _ = load_download_buffer(domain_dict,
    assert len(bufferlist) == 6
Esempio n. 2
# NOTE(review): this excerpt appears to have been damaged during extraction:
# some brackets are never closed and the string literals that presumably held
# URLs are empty. Restore the missing pieces before running it.
def test_download():
    '''test page download and command-line interface'''
    testargs = ['', '-v']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    # None, a blank string and a very long string (int(10e7) characters)
    # are all expected to make examine() return None
    assert cli.examine(None, args) is None
    assert cli.examine(' ', args) is None
    assert cli.examine('0' * int(10e7), args) is None
    #url = ''
    #teststring = utils.fetch_url(url)
    #assert teststring is None  # too small
    #assert cli.examine(teststring, args, url) is None
    #url = ''
    #teststring = utils.fetch_url(url)
    #assert teststring is not None
    #assert cli.examine(teststring, args, url) is None
    # fetch a page and check that examine() returns a result for it
    url = ''
    teststring = utils.fetch_url(url)
    assert teststring is not None
    assert cli.examine(teststring, args, url) is not None
    # single/multiprocessing
    domain_dict = dict()
    # NOTE(review): the closing ']' of this list literal is missing
    domain_dict[''] = [
        '/status/301', '/status/304', '/status/200', '/status/300',
        '/status/400', '/status/505'
    args.archived = True
    # configuration is read from the settings file in the test resources
    args.config_file = os.path.join(RESOURCES_DIR, 'newsettings.cfg')
    config = use_config(filename=args.config_file)
    # NOTE(review): the remaining arguments of this call are cut off
    results = cli_utils.download_queue_processing(domain_dict, args, None,
    # five entries are expected in the first element, None as the second
    assert len(results[0]) == 5 and results[1] is None
    # test backoff algorithm
    testdict = dict()
    backoffdict = dict()
    # no backoff entry, counter argument 0
    testdict[''] = ['/1']
    assert cli_utils.draw_backoff_url(testdict, backoffdict,
                                      0, 0) == ('', dict(),
                                                dict(), 0)
    # backoff timestamp set in 2019, counter argument 0
    testdict[''] = ['/1']
    backoffdict[''] = datetime(2019, 5, 18, 15, 17, 8, 132263)
    assert cli_utils.draw_backoff_url(testdict, backoffdict,
                                      0, 0) == ('', dict(),
                                                dict(), 0)
    # backoff timestamp set in 2019, counter argument 3: 3 is expected back
    testdict[''] = ['/1']
    backoffdict[''] = datetime(2019, 5, 18, 15, 17, 8, 132263)
    assert cli_utils.draw_backoff_url(testdict, backoffdict,
                                      0, 3) == ('', dict(),
                                                dict(), 3)
    # backoff timestamp set in 2030, counter argument 3: 0 is expected back
    testdict[''] = ['/1']
    backoffdict[''] = datetime(2030, 5, 18, 15, 17, 8, 132263)
    assert cli_utils.draw_backoff_url(testdict, backoffdict,
                                      0, 3) == ('', dict(),
                                                dict(), 0)
Esempio n. 3
# NOTE(review): this excerpt appears to have been damaged during extraction:
# several calls and literals are cut off, some assignments have no right-hand
# side, and the string literals that presumably held URLs are empty. Restore
# the missing pieces before running it.
def test_cli_pipeline():
    '''test command-line processing pipeline'''
    # straight command-line input
    #testargs = ['', '<html><body>Text</body></html>']
    #with patch.object(sys, 'argv', testargs):
    #    args = cli.parse_args(testargs)
    #f = io.StringIO()
    #with redirect_stdout(f):
    #    cli.process_args(args)
    #assert len(f.getvalue()) == 0
    # test URL listing
    testargs = ['', '--list']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    assert cli_utils.url_processing_pipeline(args, dict()) is None
    # test conversion and storage
    # NOTE(review): the remaining arguments of both convert_inputlist calls
    # below are cut off
    inputdict = cli.convert_inputlist(None, ['ftps://'], None,
    assert inputdict == dict()
    inputdict = cli.convert_inputlist(None, [''], None,
    assert cli_utils.url_processing_pipeline(args, inputdict) is None
    # test inputlist + blacklist
    testargs = ['', '-i', os.path.join(RESOURCES_DIR, 'list-process.txt')]
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    # the input file is expected to provide exactly two URLs
    my_urls = cli_utils.load_input_urls(args.inputfile)
    assert my_urls is not None and len(my_urls) == 2
    # NOTE(review): the closing ']' of this list literal is missing
    testargs = [
        '', '-i',
        os.path.join(RESOURCES_DIR, 'list-process.txt'), '--blacklist',
        os.path.join(RESOURCES_DIR, 'list-discard.txt'), '--archived'
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    assert args.blacklist is not None
    # test backoff between domain requests
    # NOTE(review): the call below is cut off, 'reftime' has no right-hand
    # side, and the left operand of the subtraction in 'delta' is missing
    inputdict = cli_utils.convert_inputlist(args.blacklist, my_urls, None,
    reftime =
    cli_utils.url_processing_pipeline(args, inputdict)
    delta = ( - reftime).total_seconds()
    # processing is expected to take more than two seconds
    assert delta > 2
    # test blacklist and empty dict
    # the blacklist file is expected to contain two entries
    args.blacklist = cli_utils.load_blacklist(args.blacklist)
    assert len(args.blacklist) == 2
    # NOTE(review): the remaining arguments of this call are cut off
    inputdict = cli_utils.convert_inputlist(args.blacklist, my_urls, None,
    cli_utils.url_processing_pipeline(args, inputdict)
    # test backup
    testargs = ['', '--backup-dir', '/tmp/']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    cli_utils.archive_html('00Test', args)
    # test date-based exclusion
    testargs = ['', '-out', 'xml', '--with-metadata']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    # NOTE(review): the right-hand side of 'teststring =' is missing
    with open(os.path.join(RESOURCES_DIR, 'httpbin_sample.html'), 'r') as f:
        teststring =
    # with '--with-metadata' the sample is expected to yield None
    assert cli.examine(teststring, args) is None
    # test JSON output
    testargs = ['', '-out', 'json']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    # NOTE(review): the right-hand side of 'teststring =' is missing
    with open(os.path.join(RESOURCES_DIR, 'httpbin_sample.html'), 'r') as f:
        teststring =
    assert cli.examine(teststring, args) is not None
    # dry-run file processing pipeline
    testargs = ['', '--parallel', '1', '--inputdir', '/dev/null']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    # file processing pipeline on resources/
    args.inputdir = RESOURCES_DIR
    # sitemaps
    testargs = ['', '--sitemap', '', '--list']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    f = io.StringIO()
    # NOTE(review): the body of this 'with' block is missing
    with redirect_stdout(f):
    # nothing is expected to have been written to stdout
    assert len(f.getvalue()) == 0
    # config file
    # NOTE(review): the closing ']' of this list literal is missing
    testargs = [
        '', '--inputdir', '/dev/null', '--config-file', 'newsettings.cfg'
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    # NOTE(review): the right-hand side of 'teststring =' is missing
    with open(os.path.join(RESOURCES_DIR, 'httpbin_sample.html'), 'r') as f:
        teststring =
    # resolve the config file name relative to the resources directory
    args.config_file = os.path.join(RESOURCES_DIR, args.config_file)
    config = use_config(filename=args.config_file)
    assert cli.examine(teststring, args) is None
    # CLI options
    testargs = ['', '--links', '--images']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
Esempio n. 4
from trafilatura.metadata import METADATA_LIST
from trafilatura.settings import DEFAULT_CONFIG, TAG_CATALOG, use_config

from trafilatura import utils, xml

# Emit log records of level DEBUG and above on standard output.
logging.basicConfig(level=logging.DEBUG, stream=sys.stdout)

# Absolute path of the directory containing this module, and of the
# 'resources' folder inside it.
TEST_DIR = os.path.abspath(os.path.dirname(__file__))
RESOURCES_DIR = os.path.join(TEST_DIR, 'resources')


# Configuration object built from the 'newsettings.cfg' file found in the
# resources folder.
UA_CONFIG = use_config(
    filename=os.path.join(RESOURCES_DIR, 'newsettings.cfg'),
)

    'http://exotic_tags': 'exotic_tags.html',

# NOTE(review): this excerpt is incomplete: no 'try:' precedes the 'except'
# clause, the right-hand side of 'htmlstring =' is missing, and the function
# body stops after the last comment. Restore it before use.
def load_mock_page(url, xml_flag=False, langcheck=None, tei_output=False):
    '''load mock page from samples'''
        # open, in text mode, the file named by MOCK_PAGES[url] inside the
        # 'cache' directory under TEST_DIR
        with open(os.path.join(TEST_DIR, 'cache', MOCK_PAGES[url]),
                  'r') as inputf:
            htmlstring =
    # encoding/windows fix for the tests
    except UnicodeDecodeError:
        # read as binary
Esempio n. 5
def test_queue():
    'Test creation, modification and download of URL queues.'
    # test conversion and storage
    inputdict = add_to_compressed_dict(['ftps://', 'http://'])
    assert inputdict == dict()
    inputdict = add_to_compressed_dict([''])
    # CLI args
    testargs = ['', '--list']
    with patch.object(sys, 'argv', testargs):
        args = parse_args(testargs)
    assert url_processing_pipeline(args, inputdict) is None
    # single/multiprocessing
    testargs = ['', '-v']
    with patch.object(sys, 'argv', testargs):
        args = parse_args(testargs)
    domain_dict = {
        '': deque(
    args.archived = True
    args.config_file = os.path.join(RESOURCES_DIR, 'newsettings.cfg')
    config = use_config(filename=args.config_file)
    config['DEFAULT']['SLEEP_TIME'] = '0.2'
    results = download_queue_processing(domain_dict, args, None, config)
    ## fixed: /301 missing, probably for a good reason...
    assert len(results[0]) == 5 and results[1] is None
    # test backoff algorithm
    backoffdict = {}
    testdict = {'': deque(['/1'])}
    assert draw_backoff_url(testdict, backoffdict, 0) == ('', dict(), dict())
    testdict[''] = deque(['/1'])
    backoffdict[''] = datetime(2019, 5, 18, 15, 17, 8, 132263)
    assert draw_backoff_url(testdict, backoffdict, 0) == ('', dict(), dict())
    # concurrent domains
    testdict = {}
    backoffdict = {}
    testdict[''] = deque(['/1'])
    testdict[''] = deque(['/1'])
    # simulate recent request
    backoffdict[''] =
    # must return the other domain
    test = draw_backoff_url(testdict, backoffdict, 5)
    assert test[0], test[1] == ('', {'': deque(['/1'])})
    assert test[2] != {}
    # sleeps and returns the rest
    assert draw_backoff_url(testdict, backoffdict, 1) == ('', {}, {})
    # code hangs, logical:
    #testdict[''] = deque(['/1'])
    #backoffdict[''] = datetime(2030, 5, 18, 15, 17, 8, 132263)
    #assert draw_backoff_url(testdict, backoffdict, 0) == ('', dict(), dict())
    # download buffer
    domain_dict = {'': deque(['/1', '/2', '/3']), '': deque(['/1', '/2', '/3']), '': deque(['/1', '/2', '/3']), '': deque(['/1', '/2', '/3']), '': deque(['/1', '/2', '/3']), '': deque(['/1', '/2', '/3'])}
    bufferlist, _, _, _ = load_download_buffer(domain_dict, dict(), sleep_time=5, threads=1)
    assert len(bufferlist) == 6
    bufferlist, _, _, _ = load_download_buffer(domain_dict, dict(), sleep_time=5, threads=2)
    assert len(bufferlist) == 6