def test_input():
    '''test if loaded strings/trees are handled properly'''
    assert utils.load_html(123) is None
    assert utils.load_html('<html><body>XYZ</body></html>') is not None
    #assert utils.load_html(b'0'*int(10e3)) is None
    assert extract(None, 'url', '0000', xml_output=False, tei_output=False, target_language=None) is None
    # legacy
    assert process_record(None, 'url', '0000', xml_output=False, tei_output=False, target_language=None) is None

def test_input():
    '''test if loaded strings/trees are handled properly'''
    assert utils.load_html(123) is None
    assert utils.load_html('<html><body>ÄÖÜ</body></html>') is not None
    assert utils.load_html(b'<html><body>\x2f\x2e\x9f</body></html>') is not None
    assert utils.load_html('<html><body>\x2f\x2e\x9f</body></html>'.encode('latin-1')) is not None
    #assert utils.load_html(b'0'*int(10e3)) is None
    assert extract(None, 'url', '0000', target_language=None) is None
    # GZip
    with open(os.path.join(RESOURCES_DIR, 'webpage.html.gz'), 'rb') as gzfile:
        myinput = gzfile.read()
    assert 'Long story short,' in extract(myinput)
    # legacy
    assert process_record(None, 'url', '0000', target_language=None) is None

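# Side note on the GZip case above: extract() receives the compressed bytes
# directly. A minimal sketch of what the decompression step amounts to, using
# only the standard library (the internal mechanism is an assumption here,
# and demo_gunzip is a hypothetical helper, not part of the original code):
import gzip

def demo_gunzip(path):
    '''Illustrative only: read a gzipped page and recover the HTML bytes.'''
    with open(path, 'rb') as gzfile:
        payload = gzfile.read()
    # gzip streams start with the magic bytes \x1f\x8b
    if payload[:2] == b'\x1f\x8b':
        return gzip.decompress(payload)
    return payload
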
def test_fetch():
    '''test URL fetching'''
    assert fetch_url('1234') == ''
    assert fetch_url('https://httpbin.org/status/404') is None
    assert decode_response(b'\x1f\x8babcdef') is not None
    assert fetch_url('https://expired.badssl.com/', no_ssl=True) is not None
    # no decoding
    response = fetch_url('https://httpbin.org/status/200', decode=False)
    assert response == ''
    # response object
    url = 'https://httpbin.org/encoding/utf8'
    response = _send_request(url, False, DEFAULT_CONFIG)
    myobject = _handle_response(url, response, False, DEFAULT_CONFIG)
    assert myobject.data.startswith(b'<h1>Unicode Demo</h1>')
    # straight handling of response object
    assert load_html(response) is not None
    # nothing to see here
    assert extract(response, url=response.geturl(), config=ZERO_CONFIG) is None
    # default config is None
    assert _parse_config(DEFAULT_CONFIG) == (None, None)
    # default user-agent
    default = _determine_headers(DEFAULT_CONFIG)
    assert default['User-Agent'] == USER_AGENT
    assert 'Cookie' not in default
    # user-agents rotation
    assert _parse_config(UA_CONFIG) == (['Firefox', 'Chrome'], 'yummy_cookie=choco; tasty_cookie=strawberry')
    custom = _determine_headers(UA_CONFIG)
    assert custom['User-Agent'] in ['Chrome', 'Firefox']
    assert custom['Cookie'] == 'yummy_cookie=choco; tasty_cookie=strawberry'

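# Hedged sketch of a configuration that would satisfy the user-agent rotation
# assertions above. The section and key names ('USER_AGENTS', 'COOKIE') and
# the comma-separated format are assumptions for illustration only, not
# confirmed by this file.
from configparser import ConfigParser

def make_ua_config():
    '''Illustrative only: build a config resembling UA_CONFIG.'''
    config = ConfigParser()
    config['DEFAULT'] = {
        'USER_AGENTS': 'Firefox,Chrome',  # assumed comma-separated list
        'COOKIE': 'yummy_cookie=choco; tasty_cookie=strawberry',
    }
    # _parse_config(config) would then return the tuple asserted above
    return config
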
def test_input():
    '''test if loaded strings/trees are handled properly'''
    assert utils.is_dubious_html('This is a string.') is True
    assert utils.is_dubious_html(b'This is a string.') is True
    with pytest.raises(TypeError) as err:
        assert utils.load_html(123) is None
    assert 'incompatible' in str(err.value)
    assert utils.load_html('<html><body>ÄÖÜ</body></html>') is not None
    assert utils.load_html(b'<html><body>\x2f\x2e\x9f</body></html>') is not None
    assert utils.load_html('<html><body>\x2f\x2e\x9f</body></html>'.encode('latin-1')) is not None
    #assert utils.load_html(b'0'*int(10e3)) is None
    with pytest.raises(TypeError) as err:
        assert extract(None, 'url', '0000', target_language=None) is None
        # legacy
        assert process_record(None, 'url', '0000', target_language=None) is None
    # GZip
    with open(os.path.join(RESOURCES_DIR, 'webpage.html.gz'), 'rb') as gzfile:
        myinput = gzfile.read()
    assert 'Long story short,' in extract(myinput)
    # unicode normalization
    assert utils.normalize_unicode('A\u0308ffin') != 'A\u0308ffin'
    testresult = extract('<html><body><p>A\u0308ffin</p></body></html>', config=ZERO_CONFIG)
    assert testresult != 'A\u0308ffin' and testresult == 'Äffin'

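# For reference: the normalization tested above corresponds to Unicode NFC
# composition (assumption: normalize_unicode composes to NFC). The standard
# library reproduces the expected result:
def demo_nfc():
    '''Illustrative only: NFC composes A + combining diaeresis into Ä.'''
    import unicodedata
    # U+0041 A followed by U+0308 combining diaeresis composes to U+00C4 Ä
    assert unicodedata.normalize('NFC', 'A\u0308ffin') == 'Äffin'
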
def baseline(filecontent):
    """Use baseline extraction function targeting JSON metadata and/or text paragraphs"""
    tree = load_html(filecontent)
    postbody = etree.Element('body')
    # scrape from JSON text
    for elem in tree.xpath('//script[@type="application/ld+json"]'):
        if elem.text and '"articleBody":' in elem.text:
            mymatch = re.search(r'"articleBody":"(.+?)","', elem.text)
            if mymatch:
                temp_text = mymatch.group(1)
                temp_text = temp_text.replace('\\"', '"')
                # temp_text = trim(temp_text)
                len_text = len(temp_text)
                postbody = etree.Element('body')
                elem = etree.Element('p')
                elem.text = temp_text
                postbody.append(elem)
                return postbody, len_text, temp_text
    # scrape from article tag
    elems = tree.xpath('//article')  # |//main
    if len(elems) > 0:
        article_elem = elems[0]
        temp_text = sanitize(article_elem.text_content())
        len_text = len(temp_text)
        if len_text > 0:
            elem = etree.Element('p')
            elem.text = temp_text
            postbody.append(elem)
            return postbody, len_text, temp_text
    # scrape from text paragraphs
    results = set()
    resultlist = list()
    #search_tree = discard_unwanted(tree)
    #search_tree = prune_html(tree)
    for element in tree.iter('blockquote', 'code', 'p', 'pre', 'q', 'quote'):
        entry = element.text_content()
        if entry not in results:
            resultlist.append(entry)
            results.add(entry)
    for textpart in resultlist:
        elem = etree.Element('p')
        elem.text = textpart
        postbody.append(elem)
    temp_text = sanitize('\n'.join(postbody.itertext()))
    len_text = len(temp_text)
    return postbody, len_text, temp_text

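# Illustrative only (not part of the original code): exercise baseline() on a
# made-up snippet, which should take the <article> branch above.
def demo_baseline():
    sample = '<html><body><article><p>Hello world.</p></article></body></html>'
    postbody, len_text, temp_text = baseline(sample)
    # expected: temp_text == 'Hello world.' and len_text == len(temp_text),
    # assuming sanitize() only strips surrounding whitespace
    return postbody, len_text, temp_text
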
def test_fetch():
    '''Test URL fetching.'''
    # pycurl tests
    if pycurl is not None:
        assert fetch_url('1234') is None
    # urllib3 tests
    else:
        assert fetch_url('1234') == ''
    assert fetch_url('https://httpbin.org/status/404') is None
    # empty request?
    #assert _send_request('') is None
    # test if the functions default to no_ssl
    assert _send_request('https://expired.badssl.com/', False, DEFAULT_CONFIG) is not None
    if pycurl is not None:
        assert _send_pycurl_request('https://expired.badssl.com/', False, DEFAULT_CONFIG) is not None
    # no SSL, no decoding
    url = 'https://httpbin.org/status/200'
    response = _send_request('https://httpbin.org/status/200', True, DEFAULT_CONFIG)
    assert response.data == b''
    if pycurl is not None:
        response1 = _send_pycurl_request('https://httpbin.org/status/200', True, DEFAULT_CONFIG)
        assert _handle_response(url, response1, False, DEFAULT_CONFIG) == _handle_response(url, response, False, DEFAULT_CONFIG)
        assert _handle_response(url, response1, True, DEFAULT_CONFIG) == _handle_response(url, response, True, DEFAULT_CONFIG)
    # response object
    url = 'https://httpbin.org/encoding/utf8'
    response = _send_request(url, False, DEFAULT_CONFIG)
    myobject = _handle_response(url, response, False, DEFAULT_CONFIG)
    assert myobject.data.startswith(b'<h1>Unicode Demo</h1>')
    # response objects of invalid sizes
    mock = Mock()
    mock.status = 200
    # too large
    mock.data = b'ABC' * 10000000
    assert _handle_response(url, mock, False, DEFAULT_CONFIG) == ''
    # too small
    mock.data = b'ABC'
    assert _handle_response(url, mock, False, DEFAULT_CONFIG) == ''
    # straight handling of response object
    assert load_html(response) is not None
    # nothing to see here
    assert extract(response, url=response.url, config=ZERO_CONFIG) is None

def test_fetch():
    '''test URL fetching'''
    assert utils.fetch_url('1234') == ''
    assert utils.fetch_url('https://httpbin.org/status/404') is None
    assert utils.decode_response(b'\x1f\x8babcdef') is not None
    assert utils.fetch_url('https://expired.badssl.com/', no_ssl=True) is not None
    # no decoding
    response = utils.fetch_url('https://httpbin.org/status/200', decode=False)
    assert response == ''
    # response object
    url = 'https://httpbin.org/encoding/utf8'
    response = utils._send_request(url, False, DEFAULT_CONFIG)
    myobject = utils._handle_response(url, response, False, DEFAULT_CONFIG)
    assert myobject.data.startswith(b'<h1>Unicode Demo</h1>')
    # straight handling of response object
    assert utils.load_html(response) is not None
    # nothing to see here
    assert extract(response, url=response.geturl(), config=ZERO_CONFIG) is None
    # user-agents rotation
    assert utils._parse_config(UA_CONFIG) == ['Firefox', 'Chrome']
    custom = utils._determine_headers(UA_CONFIG)
    assert custom['User-Agent'] in ('Chrome', 'Firefox')

def extract(filecontent, url=None, record_id='0001', no_fallback=False,
            include_comments=False, csv_output=False, xml_output=False,
            tei_output=False, tei_validation=False, target_language=None,
            include_tables=True, include_formatting=False):
    '''Main process for text extraction'''
    # init
    tree = load_html(filecontent)
    if tree is None:
        return None
    # metadata, only needed by the structured output formats
    if csv_output is True or xml_output is True or tei_output is True:
        docmeta = extract_metadata(tree, default_url=url)
    else:
        docmeta = None
    # backup (or not) for further processing
    if no_fallback is False:
        backup_tree = deepcopy(tree)
    else:
        backup_tree = None
    # clean
    cleaned_tree = manual_cleaning(tree, include_tables)
    # save space and processing time
    cleaned_tree = prune_html(cleaned_tree)
    # use LXML cleaner
    cleaned_tree = HTML_CLEANER.clean_html(cleaned_tree)
    #tree_cache[cleaned_tree] = list(cleaned_tree.iter())
    # convert tags, the rest does not work without conversion
    cleaned_tree = convert_tags(cleaned_tree)
    # remove hi-element to avoid tail bug
    if (xml_output is False and tei_output is False) or include_formatting is False:
        etree.strip_tags(cleaned_tree, 'hi')
    # comments first, then remove
    if include_comments is True:
        commentsbody, temp_comments, len_comments, cleaned_tree = extract_comments(cleaned_tree)
    else:
        commentsbody, temp_comments, len_comments = None, '', 0
    # extract content
    postbody, temp_text, len_text, sure_thing = extract_content(cleaned_tree, include_tables)
    # compare if necessary
    if no_fallback is False:  # and sure_thing is False:
        postbody, temp_text, len_text = compare_extraction(backup_tree, url, postbody, temp_text, len_text)
        # try with justext
        if len_text < MIN_EXTRACTED_SIZE:
            LOGGER.error('not enough text %s %s', record_id, url)
            postbody, len_text, temp_text = justext_rescue(tree, url, target_language, postbody, len_text, temp_text)
            LOGGER.error('justext length %s', len_text)
        # second backup
        #if len_text < MIN_EXTRACTED_SIZE:
        #    postbody, len_text, temp_text = baseline(filecontent)
    else:
        # rescue: try to use original/dirty tree
        if sure_thing is False and len_text < MIN_EXTRACTED_SIZE:
            postbody, len_text, temp_text = baseline(filecontent)
            #tree = load_html(filecontent)
            #tree = convert_tags(tree)
            #postbody, temp_text, len_text, sure_thing = extract_content(tree)
            LOGGER.debug('non-clean extracted length: %s (extraction)', len_text)
    if len_comments < MIN_EXTRACTED_COMM_SIZE:
        LOGGER.info('not enough comments %s %s', record_id, url)
    if len_text < MIN_OUTPUT_SIZE and len_comments < MIN_OUTPUT_COMM_SIZE:
        LOGGER.info('text and comments not long enough: %s %s', len_text, len_comments)
        return None
    # sanity check on language
    if language_filter(temp_text, temp_comments, target_language, record_id, url) is True:
        return None
    # check duplicates at body level
    if duplicate_test(postbody) is True:
        return None
    # cache elements
    put_in_cache(postbody)
    if commentsbody is not None:
        put_in_cache(commentsbody)
    # XML (TEI) steps
    if xml_output is True or tei_output is True:
        if xml_output is True:
            output = build_xml_output(postbody, commentsbody)
            output = add_xml_meta(output, docmeta)
        elif tei_output is True:
            output = build_tei_output(postbody, commentsbody, docmeta)
        # can be improved
        control_string = etree.tostring(output, encoding='unicode')
        control_string = sanitize(control_string)
        # necessary for cleaning
        control_parser = etree.XMLParser(remove_blank_text=True)
        output_tree = etree.fromstring(control_string, control_parser)
        # validate
        if tei_output is True and tei_validation is True:
            result = validate_tei(output_tree)
            LOGGER.info('TEI validation result: %s %s %s', result, record_id, docmeta.url)
        returnstring = etree.tostring(output_tree, pretty_print=True, encoding='unicode').strip()
    # CSV + TXT output
    else:
        if csv_output is True:
            posttext = xmltotxt(postbody)
            commentstext = xmltotxt(commentsbody)
            returnstring = txttocsv(posttext, commentstext, docmeta)
        else:
            output = build_xml_output(postbody, commentsbody)
            returnstring = xmltotxt(output)
    return returnstring

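# Illustrative only: how the extract() entry point above might be called with
# its different output flags. The HTML snippet and URL are made up; note that
# extract() returns None when the text falls below the minimum size thresholds.
def demo_extract():
    html = ('<html><body><article><p>Some sufficiently long article text '
            'for the size checks to pass.</p></article></body></html>')
    txt = extract(html, url='https://example.org')                    # plain text
    xml = extract(html, url='https://example.org', xml_output=True)   # XML
    tei = extract(html, url='https://example.org', tei_output=True)   # TEI
    return txt, xml, tei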