Example 1
def test_input():
    '''test if loaded strings/trees are handled properly'''
    assert utils.load_html(123) is None
    assert utils.load_html('<html><body>XYZ</body></html>') is not None
    #assert utils.load_html(b'0'*int(10e3)) is None
    assert extract(None, 'url', '0000', xml_output=False, tei_output=False, target_language=None) is None
    # legacy
    assert process_record(None, 'url', '0000', xml_output=False, tei_output=False, target_language=None) is None
Example 2
def test_input():
    '''test if loaded strings/trees are handled properly'''
    assert utils.load_html(123) is None
    assert utils.load_html('<html><body>ÄÖÜ</body></html>') is not None
    assert utils.load_html(
        b'<html><body>\x2f\x2e\x9f</body></html>') is not None
    assert utils.load_html(
        '<html><body>\x2f\x2e\x9f</body></html>'.encode('latin-1')) is not None
    #assert utils.load_html(b'0'*int(10e3)) is None
    assert extract(None, 'url', '0000', target_language=None) is None
    # GZip
    with open(os.path.join(RESOURCES_DIR, 'webpage.html.gz'), 'rb') as gzfile:
        myinput = gzfile.read()
    assert 'Long story short,' in extract(myinput)
    # legacy
    assert process_record(None, 'url', '0000', target_language=None) is None
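The GZip branch above hands raw compressed bytes to extract, which only works if the input handler recognizes the gzip magic prefix (the same b'\x1f\x8b' bytes fed to decode_response in the next example). A minimal standard-library sketch of that detection; the helper name is hypothetical, not part of the library:

import gzip

GZIP_MAGIC = b'\x1f\x8b'  # first two bytes of every gzip stream

def maybe_decompress(data):
    # hypothetical helper: decompress gzip payloads, pass other bytes through
    if data[:2] == GZIP_MAGIC:
        try:
            return gzip.decompress(data)
        except OSError:  # corrupt or truncated stream
            return data
    return data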
Example 3
def test_fetch():
    '''test URL fetching'''
    assert fetch_url('1234') == ''
    assert fetch_url('https://httpbin.org/status/404') is None
    assert decode_response(b'\x1f\x8babcdef') is not None
    assert fetch_url('https://expired.badssl.com/', no_ssl=True) is not None
    # no decoding
    response = fetch_url('https://httpbin.org/status/200', decode=False)
    assert response == ''
    # response object
    url = 'https://httpbin.org/encoding/utf8'
    response = _send_request(url, False, DEFAULT_CONFIG)
    myobject = _handle_response(url, response, False, DEFAULT_CONFIG)
    assert myobject.data.startswith(b'<h1>Unicode Demo</h1>')
    # straight handling of response object
    assert load_html(response) is not None
    # nothing to see here
    assert extract(response, url=response.geturl(), config=ZERO_CONFIG) is None
    # default config is none
    assert _parse_config(DEFAULT_CONFIG) == (None, None)
    # default user-agent
    default = _determine_headers(DEFAULT_CONFIG)
    assert default['User-Agent'] == USER_AGENT
    assert 'Cookie' not in default
    # user-agents rotation
    assert _parse_config(UA_CONFIG) == ([
        'Firefox', 'Chrome'
    ], 'yummy_cookie=choco; tasty_cookie=strawberry')
    custom = _determine_headers(UA_CONFIG)
    assert custom['User-Agent'] in ['Chrome', 'Firefox']
    assert custom['Cookie'] == 'yummy_cookie=choco; tasty_cookie=strawberry'
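The _parse_config assertions above imply that UA_CONFIG is a configparser object carrying a comma-separated user-agent list and a cookie string; a plausible sketch, in which the section and key names are assumptions:

from configparser import ConfigParser

UA_CONFIG = ConfigParser()
UA_CONFIG.read_dict({'DEFAULT': {  # key names are assumptions
    'USER_AGENTS': 'Firefox,Chrome',
    'COOKIE': 'yummy_cookie=choco; tasty_cookie=strawberry',
}})

# a parser along these lines would recover the asserted tuple
agents = UA_CONFIG['DEFAULT']['USER_AGENTS'].split(',')
cookie = UA_CONFIG['DEFAULT']['COOKIE']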
Example 4
def test_input():
    '''test if loaded strings/trees are handled properly'''
    assert utils.is_dubious_html('This is a string.') is True
    assert utils.is_dubious_html(b'This is a string.') is True
    with pytest.raises(TypeError) as err:
        assert utils.load_html(123) is None
    assert 'incompatible' in str(err.value)
    assert utils.load_html('<html><body>ÄÖÜ</body></html>') is not None
    assert utils.load_html(b'<html><body>\x2f\x2e\x9f</body></html>') is not None
    assert utils.load_html('<html><body>\x2f\x2e\x9f</body></html>'.encode('latin-1')) is not None
    #assert utils.load_html(b'0'*int(10e3)) is None
    with pytest.raises(TypeError):
        extract(None, 'url', '0000', target_language=None)
    # legacy (split into its own block so the call is actually executed;
    # in the original it sat after the raising statement and never ran)
    with pytest.raises(TypeError):
        process_record(None, 'url', '0000', target_language=None)
    # GZip
    with open(os.path.join(RESOURCES_DIR, 'webpage.html.gz'), 'rb') as gzfile:
        myinput = gzfile.read()
    assert 'Long story short,' in extract(myinput)

    # unicode normalization
    assert utils.normalize_unicode('A\u0308ffin') != 'A\u0308ffin'
    testresult = extract('<html><body><p>A\u0308ffin</p></body></html>', config=ZERO_CONFIG)
    assert testresult != 'A\u0308ffin' and testresult == 'Äffin'
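The normalization assertions above correspond to NFC composition, which is presumably what utils.normalize_unicode applies; the same effect can be reproduced with the standard library alone:

import unicodedata

decomposed = 'A\u0308ffin'  # 'A' followed by a combining diaeresis
composed = unicodedata.normalize('NFC', decomposed)
assert composed == 'Äffin' and composed != decomposed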
Example 5
def baseline(filecontent):
    """Use baseline extraction function targeting JSON metadata and/or text paragraphs"""
    tree = load_html(filecontent)
    postbody = etree.Element('body')
    # scrape from json text
    for elem in tree.xpath('//script[@type="application/ld+json"]'):
        if elem.text and '"articleBody":' in elem.text:
            mymatch = re.search(r'"articleBody":"(.+?)","', elem.text)
            if mymatch:
                temp_text = mymatch.group(1)
                temp_text = temp_text.replace('\\"', '"')
                # temp_text = trim(temp_text)
                len_text = len(temp_text)
                postbody = etree.Element('body')
                elem = etree.Element('p')
                elem.text = temp_text
                postbody.append(elem)
                return postbody, len_text, temp_text
    # scrape from article tag
    elems = tree.xpath('//article')  # |//main
    if len(elems) > 0:
        article_elem = elems[0]
        temp_text = sanitize(article_elem.text_content())
        len_text = len(temp_text)
        if len_text > 0:
            elem = etree.Element('p')
            elem.text = temp_text
            postbody.append(elem)
            return postbody, len_text, temp_text
    # scrape from text paragraphs
    results = set()
    resultlist = list()
    #search_tree = discard_unwanted(tree)
    # search_tree = prune_html(tree)
    for element in tree.iter('blockquote', 'code', 'p', 'pre', 'q', 'quote'):
        entry = element.text_content()
        # keep first occurrence only, preserving document order
        if entry not in results:
            resultlist.append(entry)
            results.add(entry)
    for textpart in resultlist:
        elem = etree.Element('p')
        elem.text = textpart
        postbody.append(elem)
    temp_text = sanitize('\n'.join(postbody.itertext()))
    len_text = len(temp_text)
    return postbody, len_text, temp_text
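A short usage sketch for baseline, assuming load_html, sanitize and lxml's etree are importable from the surrounding module; the sample HTML and printed values are illustrative only:

from lxml import etree

html = '<html><body><article><p>Long story short, it worked.</p></article></body></html>'
postbody, len_text, temp_text = baseline(html)
# postbody is an lxml <body> element wrapping the extracted <p> nodes
print(len_text, temp_text)
print(etree.tostring(postbody, encoding='unicode'))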
Example 6
def test_fetch():
    '''Test URL fetching.'''
    # pycurl tests
    if pycurl is not None:
        assert fetch_url('1234') is None
    # urllib3 tests
    else:
        assert fetch_url('1234') == ''
    assert fetch_url('https://httpbin.org/status/404') is None
    # empty request?
    #assert _send_request('') is None
    # test whether the functions default to no_ssl
    assert _send_request('https://expired.badssl.com/', False, DEFAULT_CONFIG) is not None
    if pycurl is not None:
        assert _send_pycurl_request('https://expired.badssl.com/', False, DEFAULT_CONFIG) is not None
    # no SSL, no decoding
    url = 'https://httpbin.org/status/200'
    response = _send_request(url, True, DEFAULT_CONFIG)
    assert response.data == b''
    if pycurl is not None:
        response1 = _send_pycurl_request(url, True, DEFAULT_CONFIG)
        assert _handle_response(url, response1, False, DEFAULT_CONFIG) == _handle_response(url, response, False, DEFAULT_CONFIG)
        assert _handle_response(url, response1, True, DEFAULT_CONFIG) == _handle_response(url, response, True, DEFAULT_CONFIG)
    # response object
    url = 'https://httpbin.org/encoding/utf8'
    response = _send_request(url, False, DEFAULT_CONFIG)
    myobject = _handle_response(url, response, False, DEFAULT_CONFIG)
    assert myobject.data.startswith(b'<h1>Unicode Demo</h1>')
    # mock a response object to exercise the size limits
    mock = Mock()
    mock.status = 200
    # too large
    mock.data = (b'ABC'*10000000)
    assert _handle_response(url, mock, False, DEFAULT_CONFIG) == ''
    # too small
    mock.data = (b'ABC')
    assert _handle_response(url, mock, False, DEFAULT_CONFIG) == ''
    # straight handling of response object
    assert load_html(response) is not None
    # nothing to see here
    assert extract(response, url=response.url, config=ZERO_CONFIG) is None
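The two Mock cases above probe a size gate inside _handle_response: 30 MB of data is rejected as too large and 3 bytes as too small. A hypothetical sketch of such a check; the constant names and bounds are assumptions, not the library's actual values:

MIN_FILE_SIZE = 10          # assumed lower bound, in bytes
MAX_FILE_SIZE = 20_000_000  # assumed upper bound, in bytes

def acceptable_length(data):
    # hypothetical helper: reject implausibly small or large payloads
    return MIN_FILE_SIZE <= len(data) <= MAX_FILE_SIZE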
Example 7
def test_fetch():
    '''test URL fetching'''
    assert utils.fetch_url('1234') == ''
    assert utils.fetch_url('https://httpbin.org/status/404') is None
    assert utils.decode_response(b'\x1f\x8babcdef') is not None
    assert utils.fetch_url('https://expired.badssl.com/',
                           no_ssl=True) is not None
    # no decoding
    response = utils.fetch_url('https://httpbin.org/status/200', decode=False)
    assert response == ''
    # response object
    url = 'https://httpbin.org/encoding/utf8'
    response = utils._send_request(url, False, DEFAULT_CONFIG)
    myobject = utils._handle_response(url, response, False, DEFAULT_CONFIG)
    assert myobject.data.startswith(b'<h1>Unicode Demo</h1>')
    # straight handling of response object
    assert utils.load_html(response) is not None
    # nothing to see here
    assert extract(response, url=response.geturl(), config=ZERO_CONFIG) is None
    # user-agents rotation
    assert utils._parse_config(UA_CONFIG) == ['Firefox', 'Chrome']
    custom = utils._determine_headers(UA_CONFIG)
    assert custom['User-Agent'] in ('Chrome', 'Firefox')
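Because the assertion accepts either agent, the rotation presumably picks one entry at random per request; a minimal sketch, with a hypothetical helper name:

import random

def pick_user_agent(agents):
    # hypothetical helper: rotate user agents by uniform random choice
    return random.choice(agents)

headers = {'User-Agent': pick_user_agent(['Firefox', 'Chrome'])}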
Example 8
def extract(filecontent,
            url=None,
            record_id='0001',
            no_fallback=False,
            include_comments=False,
            csv_output=False,
            xml_output=False,
            tei_output=False,
            tei_validation=False,
            target_language=None,
            include_tables=True,
            include_formatting=False):
    '''Main process for text extraction'''
    # init
    tree = load_html(filecontent)
    if tree is None:
        return None

    # Metadata here
    if csv_output is True or xml_output is True or tei_output is True:
        docmeta = extract_metadata(tree, default_url=url)
    else:
        docmeta = None

    # backup (or not) for further processing
    if no_fallback is False:
        backup_tree = deepcopy(tree)
    else:
        backup_tree = None

    # clean
    cleaned_tree = manual_cleaning(tree, include_tables)
    # save space and processing time
    cleaned_tree = prune_html(cleaned_tree)
    # use LXML cleaner
    cleaned_tree = HTML_CLEANER.clean_html(cleaned_tree)
    # tree_cache[cleaned_tree] = list(cleaned_tree.iter())

    # convert tags, the rest does not work without conversion
    cleaned_tree = convert_tags(cleaned_tree)
    # remove hi-element to avoid tail bug
    if (xml_output is False
            and tei_output is False) or include_formatting is False:
        etree.strip_tags(cleaned_tree, 'hi')

    # comments first, then remove
    if include_comments is True:
        commentsbody, temp_comments, len_comments, cleaned_tree = extract_comments(
            cleaned_tree)
    else:
        commentsbody, temp_comments, len_comments = None, '', 0

    # extract content
    postbody, temp_text, len_text, sure_thing = extract_content(
        cleaned_tree, include_tables)

    # compare if necessary
    if no_fallback is False:  # and sure_thing is False:
        postbody, temp_text, len_text = compare_extraction(
            backup_tree, url, postbody, temp_text, len_text)
        # try with justext
        if len_text < MIN_EXTRACTED_SIZE:
            LOGGER.error('not enough text %s %s', record_id, url)
            postbody, len_text, temp_text = justext_rescue(
                tree, url, target_language, postbody, len_text, temp_text)
            LOGGER.error('justext length %s', len_text)
        # second backup
        # if len_text < MIN_EXTRACTED_SIZE:
        #     postbody, len_text, temp_text = baseline(filecontent)
    else:
        # rescue: try to use original/dirty tree
        if sure_thing is False and len_text < MIN_EXTRACTED_SIZE:
            postbody, len_text, temp_text = baseline(filecontent)
            #tree = load_html(filecontent)
            #tree = convert_tags(tree)
            #postbody, temp_text, len_text, sure_thing = extract_content(tree)
            LOGGER.debug('non-clean extracted length: %s (extraction)',
                         len_text)

    if len_comments < MIN_EXTRACTED_COMM_SIZE:
        LOGGER.info('not enough comments %s %s', record_id, url)
    if len_text < MIN_OUTPUT_SIZE and len_comments < MIN_OUTPUT_COMM_SIZE:
        LOGGER.info('text and comments not long enough: %s %s', len_text,
                    len_comments)
        return None

    # sanity check on language
    if language_filter(temp_text, temp_comments, target_language, record_id,
                       url) is True:
        return None

    # check duplicates at body level
    if duplicate_test(postbody) is True:
        return None

    # cache elements
    put_in_cache(postbody)
    if commentsbody is not None:
        put_in_cache(commentsbody)

    # XML (TEI) steps
    if xml_output is True or tei_output is True:
        if xml_output is True:
            output = build_xml_output(postbody, commentsbody)
            output = add_xml_meta(output, docmeta)
        elif tei_output is True:
            output = build_tei_output(postbody, commentsbody, docmeta)
        # can be improved
        control_string = etree.tostring(output, encoding='unicode')
        control_string = sanitize(control_string)
        # necessary for cleaning
        control_parser = etree.XMLParser(remove_blank_text=True)
        output_tree = etree.fromstring(control_string, control_parser)
        # validate
        if tei_output is True and tei_validation is True:
            result = validate_tei(output_tree)
            LOGGER.info('TEI validation result: %s %s %s', result, record_id,
                        docmeta.url)
        returnstring = etree.tostring(output_tree,
                                      pretty_print=True,
                                      encoding='unicode').strip()
    # CSV + TXT output
    else:
        if csv_output is True:
            posttext = xmltotxt(postbody)
            commentstext = xmltotxt(commentsbody)
            returnstring = txttocsv(posttext, commentstext, docmeta)
        else:
            output = build_xml_output(postbody, commentsbody)
            returnstring = xmltotxt(output)

    return returnstring
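A minimal usage sketch for the pipeline above; note that a short sample like this may stay below MIN_OUTPUT_SIZE, in which case extract returns None:

html = '<html><body><article><p>Sufficiently long article text goes here.</p></article></body></html>'
# plain-text output (default path)
result = extract(html, url='https://example.org/post')
# XML output with metadata attached to the document
xml_result = extract(html, url='https://example.org/post', xml_output=True)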