def test_tei():
    '''test TEI-related functions'''
    # read local fixture files instead of downloading them on every run
    resources_dir = os.path.join(TEST_DIR, 'resources')
    with open(os.path.join(resources_dir, 'httpbin_sample.html')) as infile:
        htmlstring = infile.read()
    # extract as TEI and check that the output validates while the raw input does not
    output = extract(htmlstring, "mocked", no_fallback=True, tei_output=True, tei_validation=False)
    assert output is not None
    assert xml.validate_tei(etree.fromstring(output)) is True
    assert xml.validate_tei(etree.fromstring(htmlstring)) is False
    # repeat the check on a second fixture
    with open(os.path.join(resources_dir, 'http_sample.html')) as infile:
        htmlstring = infile.read()
    output = extract(htmlstring, "mocked", no_fallback=True, tei_output=True, tei_validation=False)
    assert output is not None
    assert xml.validate_tei(etree.fromstring(output)) is True
    # same extraction with a record ID added to the metadata
    output = extract(htmlstring, "mocked", no_fallback=True, tei_output=True, tei_validation=False, record_id='0001')
    assert output is not None
    assert xml.validate_tei(etree.fromstring(output)) is True
def test_tei():
    '''test TEI-related functions'''
    # read local fixture files instead of downloading them on every run
    with open(os.path.join(RESOURCES_DIR, 'httpbin_sample.html')) as inputf:
        htmldoc = inputf.read()
    # extraction output must be identical with and without validation enabled
    unvalidated = extract(htmldoc, "mocked", no_fallback=True, output_format='xmltei', tei_validation=False)
    validated = extract(htmldoc, "mocked", no_fallback=True, output_format='xmltei', tei_validation=True)
    assert unvalidated is not None and unvalidated == validated
    # the TEI output validates, the raw HTML input does not
    assert xml.validate_tei(etree.fromstring(unvalidated)) is True
    assert xml.validate_tei(etree.fromstring(htmldoc)) is False
    # repeat the check on a second fixture
    with open(os.path.join(RESOURCES_DIR, 'http_sample.html')) as inputf:
        htmldoc = inputf.read()
    output = extract(htmldoc, "mocked", no_fallback=True, output_format='xmltei', tei_validation=False)
    assert output is not None  # and '<p>license</p>' in output
    assert xml.validate_tei(etree.fromstring(output)) is True
    # same extraction with a record ID added to the metadata
    output = extract(htmldoc, "mocked", no_fallback=True, output_format='xmltei', tei_validation=False, record_id='0001')
    assert output is not None
    assert xml.validate_tei(etree.fromstring(output)) is True
    # exercise header + metadata serialization directly
    teidoc = etree.Element('TEI', xmlns='http://www.tei-c.org/ns/1.0')
    teiheader = etree.SubElement(teidoc, 'teiHeader')
    metadata = dict.fromkeys(METADATA_LIST)
    metadata['categories'], metadata['tags'] = [], []
    metadata['title'] = 'Title'
    assert xml.write_fullheader(teiheader, metadata) is not None
    metadata['sitename'] = 'Site Name'
    assert xml.write_fullheader(teiheader, metadata) is not None
    # hostname without sitename must still produce a header
    metadata['hostname'], metadata['sitename'] = 'hostname', None
    assert xml.write_fullheader(teiheader, metadata) is not None
def test_tei():
    '''test TEI-related functions'''
    # read local fixture files instead of downloading them on every run
    with open(os.path.join(RESOURCES_DIR, 'httpbin_sample.html')) as inputf:
        htmldoc = inputf.read()
    # extraction output must be identical with and without validation enabled
    unvalidated = extract(htmldoc, "mocked", no_fallback=True, output_format='xmltei', tei_validation=False)
    validated = extract(htmldoc, "mocked", no_fallback=True, output_format='xmltei', tei_validation=True)
    assert unvalidated is not None and unvalidated == validated
    # the TEI output validates, the raw HTML input does not
    assert xml.validate_tei(etree.fromstring(unvalidated)) is True
    assert xml.validate_tei(etree.fromstring(htmldoc)) is False
    # repeat the check on a second fixture, with and without comments
    with open(os.path.join(RESOURCES_DIR, 'http_sample.html')) as inputf:
        htmldoc = inputf.read()
    output = extract(htmldoc, "mocked", no_fallback=True, include_comments=True, output_format='xmltei', tei_validation=False)
    assert output is not None  # and '<p>license</p>' in output
    assert xml.validate_tei(etree.fromstring(output)) is True
    output = extract(htmldoc, "mocked", no_fallback=True, include_comments=False, output_format='xmltei', tei_validation=False)
    assert output is not None  # and '<p>license</p>' in output
    assert xml.validate_tei(etree.fromstring(output)) is True
    # same extraction with a record ID added to the metadata
    output = extract(htmldoc, "mocked", no_fallback=True, output_format='xmltei', tei_validation=False, record_id='0001')
    assert output is not None
    assert xml.validate_tei(etree.fromstring(output)) is True
    # exercise header + metadata serialization with varying field combinations
    teidoc = etree.Element('TEI', xmlns='http://www.tei-c.org/ns/1.0')
    teiheader = etree.SubElement(teidoc, 'teiHeader')
    metadata = Document()
    metadata.categories, metadata.tags = [], []
    metadata.title = 'Title'
    assert xml.write_fullheader(teiheader, metadata) is not None
    metadata.sitename = 'Site Name'
    metadata.date = '2021-01-01'
    assert xml.write_fullheader(teiheader, metadata) is not None
    metadata.date = None
    assert xml.write_fullheader(teiheader, metadata) is not None
    metadata.hostname = 'hostname'
    assert xml.write_fullheader(teiheader, metadata) is not None
    metadata.sitename = None
    metadata.license = 'CC BY-SA'
    metadata.url = 'https://test.org/'
    metadata.categories = ['cat1', 'cat2']
    assert xml.write_fullheader(teiheader, metadata) is not None
    metadata.date = '2021-01-01'
    assert xml.write_fullheader(teiheader, metadata) is not None
    # header generation must survive missing title and sitename
    metadata.title, metadata.sitename = None, None
    assert xml.write_fullheader(teiheader, metadata) is not None
# Versuch, die Keyword-Argumente url und target_language zu aendern; # result9 und result11 liefern None (liegt anscheinend am Aendern der Target-language), # das Eingeben einer url funktioniert aber: url1='https://github.blog/2019-03-29-leader-spotlight-erin-spiceland/' result9 = trafilatura.extract(downloaded, target_language='de') print(result9) result10 = trafilatura.extract(downloaded, url=url1) print(result10) result11 = trafilatura.extract(downloaded, url=url1, target_language='de') print(result11) # Ausprobieren der Validierung bzgl. TEI (siehe https://trafilatura.readthedocs.io/en/latest/validation.html) print("test TEI-Validation:") mytree2=etree.parse("DU_sense.xml") print("Correct file: ",validate_tei(mytree2)) mytree3=etree.parse("DU_sense_illformed.xml") print("Illformed file: ") print(validate_tei(mytree3)) # Ausprobieren der Funktion xmltotxt() (siehe https://trafilatura.readthedocs.io/en/latest/corefunctions.html) #f=open("DU_sense.xml","r") #xml_text=f.read() #print(trafilatura.xml.xmltotxt(xml_text)) # --> fuehrt zu TypeError: Invalid input object: str #f=open("DU_sense.xml","r") #print(trafilatura.xml.xmltotxt(f)) # --> fuehrt zu TypeError: Invalid input object: _io.TextIOWrapper #print(trafilatura.xml.xmltotxt("DU_sense.xml"))
def extract(filecontent, url=None, record_id='0001', no_fallback=False,
            include_comments=False, csv_output=False, xml_output=False,
            tei_output=False, tei_validation=False, target_language=None,
            include_tables=True, include_formatting=False):
    '''Main process for text extraction.

    Parses the input, cleans the tree, extracts the main content (and
    optionally comments), applies length/language/duplicate checks, and
    serializes the result in the requested format.

    Args:
        filecontent: HTML input accepted by load_html (presumably a string or
            parsed document — TODO confirm against load_html).
        url: source URL, used as metadata fallback and in log messages.
        record_id: identifier used in log messages.
        no_fallback: if True, skip the backup tree and the comparison pass.
        csv_output / xml_output / tei_output: output-format switches;
            the default (all False) yields plain text.
        tei_validation: validate the TEI result (only with tei_output).
        target_language: language code passed to the language filter.
        include_tables: keep tables during manual cleaning.
        include_formatting: keep 'hi' (formatting) elements in XML/TEI output.

    Returns:
        The serialized result string, or None when parsing fails, the output
        is too short, the language check rejects it, or it is a duplicate.
    '''
    # init: bail out early if the input cannot be parsed
    tree = load_html(filecontent)
    if tree is None:
        return None
    # metadata is only needed for the structured output formats
    if csv_output is True or xml_output is True or tei_output is True:
        docmeta = extract_metadata(tree, default_url=url)
    else:
        docmeta = None
    # keep an untouched copy for the later comparison pass (or not)
    if no_fallback is False:
        backup_tree = deepcopy(tree)
    else:
        backup_tree = None
    # clean
    cleaned_tree = manual_cleaning(tree, include_tables)
    # save space and processing time
    cleaned_tree = prune_html(cleaned_tree)
    # use LXML cleaner
    cleaned_tree = HTML_CLEANER.clean_html(cleaned_tree)
    # tree_cache[cleaned_tree] = list(cleaned_tree.iter())
    # convert tags, the rest does not work without conversion
    cleaned_tree = convert_tags(cleaned_tree)
    # remove hi-element to avoid tail bug; only kept for XML/TEI output
    # when formatting is explicitly requested
    if (xml_output is False and tei_output is False) or include_formatting is False:
        etree.strip_tags(cleaned_tree, 'hi')
    # comments first, then remove them from the tree
    if include_comments is True:
        commentsbody, temp_comments, len_comments, cleaned_tree = extract_comments(
            cleaned_tree)
    else:
        commentsbody, temp_comments, len_comments = None, '', 0
    # extract content
    postbody, temp_text, len_text, sure_thing = extract_content(
        cleaned_tree, include_tables)
    # compare with the backup tree if fallbacks are enabled
    if no_fallback is False:  # and sure_thing is False:
        postbody, temp_text, len_text = compare_extraction(
            backup_tree, url, postbody, temp_text, len_text)
        # try with justext as a last resort when the result is too short
        if len_text < MIN_EXTRACTED_SIZE:
            LOGGER.error('not enough text %s %s', record_id, url)
            postbody, len_text, temp_text = justext_rescue(
                tree, url, target_language, postbody, len_text, temp_text)
            LOGGER.error('justext length %s', len_text)
            # second backup
            # if len_text < MIN_EXTRACTED_SIZE:
            #     postbody, len_text, temp_text = baseline(filecontent)
    else:
        # rescue: try to use original/dirty tree
        if sure_thing is False and len_text < MIN_EXTRACTED_SIZE:
            postbody, len_text, temp_text = baseline(filecontent)
            #tree = load_html(filecontent)
            #tree = convert_tags(tree)
            #postbody, temp_text, len_text, sure_thing = extract_content(tree)
            LOGGER.debug('non-clean extracted length: %s (extraction)', len_text)
    if len_comments < MIN_EXTRACTED_COMM_SIZE:
        LOGGER.info('not enough comments %s %s', record_id, url)
    # reject documents where both text and comments are below the thresholds
    if len_text < MIN_OUTPUT_SIZE and len_comments < MIN_OUTPUT_COMM_SIZE:
        LOGGER.info('text and comments not long enough: %s %s', len_text, len_comments)
        return None
    # sanity check on language
    if language_filter(temp_text, temp_comments, target_language, record_id, url) is True:
        return None
    # check duplicates at body level
    if duplicate_test(postbody) is True:
        return None
    # cache elements
    put_in_cache(postbody)
    if commentsbody is not None:
        put_in_cache(commentsbody)
    # XML (TEI) steps
    if xml_output is True or tei_output is True:
        if xml_output is True:
            output = build_xml_output(postbody, commentsbody)
            output = add_xml_meta(output, docmeta)
        elif tei_output is True:
            output = build_tei_output(postbody, commentsbody, docmeta)
        # can be improved: round-trip through a string to sanitize the tree
        control_string = etree.tostring(output, encoding='unicode')
        control_string = sanitize(control_string)
        # necessary for cleaning
        control_parser = etree.XMLParser(remove_blank_text=True)
        output_tree = etree.fromstring(control_string, control_parser)
        # validate
        if tei_output is True and tei_validation is True:
            result = validate_tei(output_tree)
            LOGGER.info('TEI validation result: %s %s %s', result, record_id, docmeta.url)
        returnstring = etree.tostring(output_tree, pretty_print=True, encoding='unicode').strip()
    # CSV + TXT output
    else:
        if csv_output is True:
            posttext = xmltotxt(postbody)
            commentstext = xmltotxt(commentsbody)
            returnstring = txttocsv(posttext, commentstext, docmeta)
        else:
            output = build_xml_output(postbody, commentsbody)
            returnstring = xmltotxt(output)
    return returnstring