import logging

from copy import deepcopy

from lxml import etree, html

import trafilatura.filters

# NB: assumed import paths for the trafilatura-internal helpers used below;
# the remaining extraction helpers called in extract() (load_html,
# extract_content, etc.) are defined in other trafilatura modules.
from trafilatura.filters import duplicate_test, put_in_cache
from trafilatura.lru import LRUCache

LOGGER = logging.getLogger(__name__)


def test_lrucache():
    '''test basic duplicate detection'''
    lru_test = LRUCache(maxsize=2)
    trafilatura.filters.LRU_TEST = lru_test
    my_body = etree.Element('body')
    ### element too short
    #my_element = html.fromstring('<p>AAAA BBBB</p>')
    #my_body.append(my_element)
    #put_in_cache(my_body)
    #assert duplicate_test(my_element) is False
    ### cached element
    my_element = html.fromstring('<p>AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB</p>')
    my_body.append(my_element)
    assert duplicate_test(my_element) is False
    assert duplicate_test(my_element) is False
    assert duplicate_test(my_body) is False
    assert duplicate_test(my_element) is True
    other_body = etree.Element('body')
    other_element = html.fromstring('<p>CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD</p>')
    other_body.append(other_element)
    assert duplicate_test(other_body) is False
    assert duplicate_test(other_element) is False
    assert duplicate_test(other_body) is False
    assert duplicate_test(other_element) is True
    yet_another_body = etree.Element('body')
    yet_another_element = html.fromstring('<p>EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF</p>')
    yet_another_body.append(yet_another_element)
    assert duplicate_test(yet_another_body) is False
    assert duplicate_test(yet_another_body) is False
    assert duplicate_test(yet_another_body) is False
    # only 2 elements fit in the cache, so the original element has been evicted
    # print(LRU_TEST.maxsize, LRU_TEST.full)
    assert duplicate_test(other_element) is True
    assert duplicate_test(yet_another_element) is True
    assert duplicate_test(my_element) is False
    # clear the cache
    lru_test.clear()
    assert duplicate_test(other_element) is False
    # an unknown key returns the default value -1
    assert lru_test.get('tralala') == -1
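
# A minimal demonstration of the deduplication contract exercised by the test
# above, a sketch relying only on the API visible there: the n-th occurrence
# of the same long segment is flagged as a duplicate, and clearing the cache
# resets the counts. The function name and the GGGG/HHHH filler text are
# made up for illustration.
def demo_duplicate_detection():
    cache = LRUCache(maxsize=2)
    trafilatura.filters.LRU_TEST = cache
    # same length as the segments used in the test, long enough to be checked
    segment = html.fromstring('<p>' + 'GGGG HHHH ' * 13 + '</p>')
    results = [duplicate_test(segment) for _ in range(4)]
    # mirrors the assertions above: three passes, then flagged as duplicate
    print(results)  # [False, False, False, True]
    cache.clear()
    print(duplicate_test(segment))  # False again after clearing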
def extract(filecontent, url=None, record_id='0001', no_fallback=False,
            include_comments=False, csv_output=False, xml_output=False,
            tei_output=False, tei_validation=False, target_language=None,
            include_tables=True, include_formatting=False):
    '''Main process for text extraction'''
    # init
    tree = load_html(filecontent)
    if tree is None:
        return None
    # metadata is only needed for the structured output formats
    if csv_output is True or xml_output is True or tei_output is True:
        docmeta = extract_metadata(tree, default_url=url)
    else:
        docmeta = None
    # keep a backup copy for the fallback extractors (or not)
    if no_fallback is False:
        backup_tree = deepcopy(tree)
    else:
        backup_tree = None
    # clean
    cleaned_tree = manual_cleaning(tree, include_tables)
    # prune to save space and processing time
    cleaned_tree = prune_html(cleaned_tree)
    # use the LXML cleaner
    cleaned_tree = HTML_CLEANER.clean_html(cleaned_tree)
    # tree_cache[cleaned_tree] = list(cleaned_tree.iter())
    # convert tags, the rest does not work without conversion
    cleaned_tree = convert_tags(cleaned_tree)
    # remove hi-elements to avoid the tail bug
    if (xml_output is False and tei_output is False) or include_formatting is False:
        etree.strip_tags(cleaned_tree, 'hi')
    # extract comments first, then remove them from the tree
    if include_comments is True:
        commentsbody, temp_comments, len_comments, cleaned_tree = extract_comments(cleaned_tree)
    else:
        commentsbody, temp_comments, len_comments = None, '', 0
    # extract the main content
    postbody, temp_text, len_text, sure_thing = extract_content(cleaned_tree, include_tables)
    # compare with the backup tree if necessary
    if no_fallback is False:  # and sure_thing is False:
        postbody, temp_text, len_text = compare_extraction(backup_tree, url, postbody, temp_text, len_text)
        # try with justext
        if len_text < MIN_EXTRACTED_SIZE:
            LOGGER.error('not enough text %s %s', record_id, url)
            postbody, len_text, temp_text = justext_rescue(tree, url, target_language, postbody, len_text, temp_text)
            LOGGER.error('justext length %s', len_text)
        # second backup
        # if len_text < MIN_EXTRACTED_SIZE:
        #     postbody, len_text, temp_text = baseline(filecontent)
    else:
        # rescue: try to use the original/dirty tree
        if sure_thing is False and len_text < MIN_EXTRACTED_SIZE:
            postbody, len_text, temp_text = baseline(filecontent)
            #tree = load_html(filecontent)
            #tree = convert_tags(tree)
            #postbody, temp_text, len_text, sure_thing = extract_content(tree)
            LOGGER.debug('non-clean extracted length: %s (extraction)', len_text)
    if len_comments < MIN_EXTRACTED_COMM_SIZE:
        LOGGER.info('not enough comments %s %s', record_id, url)
    if len_text < MIN_OUTPUT_SIZE and len_comments < MIN_OUTPUT_COMM_SIZE:
        LOGGER.info('text and comments not long enough: %s %s', len_text, len_comments)
        return None
    # sanity check on language
    if language_filter(temp_text, temp_comments, target_language, record_id, url) is True:
        return None
    # check for duplicates at body level
    if duplicate_test(postbody) is True:
        return None
    # cache the elements for future duplicate checks
    put_in_cache(postbody)
    if commentsbody is not None:
        put_in_cache(commentsbody)
    # XML (TEI) steps
    if xml_output is True or tei_output is True:
        if xml_output is True:
            output = build_xml_output(postbody, commentsbody)
            output = add_xml_meta(output, docmeta)
        elif tei_output is True:
            output = build_tei_output(postbody, commentsbody, docmeta)
        # sanitize and re-parse the serialized tree (can be improved)
        control_string = etree.tostring(output, encoding='unicode')
        control_string = sanitize(control_string)
        # remove_blank_text is necessary for cleaning
        control_parser = etree.XMLParser(remove_blank_text=True)
        output_tree = etree.fromstring(control_string, control_parser)
        # validate
        if tei_output is True and tei_validation is True:
            result = validate_tei(output_tree)
            LOGGER.info('TEI validation result: %s %s %s', result, record_id, docmeta.url)
        returnstring = etree.tostring(output_tree, pretty_print=True, encoding='unicode').strip()
    # CSV + TXT output
    else:
        if csv_output is True:
            posttext = xmltotxt(postbody)
            commentstext = xmltotxt(commentsbody)
            returnstring = txttocsv(posttext, commentstext, docmeta)
        else:
            output = build_xml_output(postbody, commentsbody)
            returnstring = xmltotxt(output)
    return returnstring
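
# Hypothetical usage sketch for extract(); 'page.html' and the URL are
# placeholder assumptions, and the keyword arguments simply mirror the
# signature above.
if __name__ == '__main__':
    with open('page.html', 'r', encoding='utf-8') as inputfile:
        htmlstring = inputfile.read()
    # plain text by default; set xml_output/tei_output/csv_output for
    # structured output including metadata
    result = extract(htmlstring, url='https://www.example.org/page.html',
                     include_comments=True)
    # None signals a failed extraction or a filtered-out document
    if result is not None:
        print(result)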