Example No. 1
def test_tei():
    '''test TEI-related functions'''
    # open local resources to avoid redownloading at each run
    resources_dir = os.path.join(TEST_DIR, 'resources')
    with open(os.path.join(resources_dir, 'httpbin_sample.html')) as f:
        teststring = f.read()
    # parse and validate a simple HTML file
    result = extract(teststring,
                     "mocked",
                     no_fallback=True,
                     tei_output=True,
                     tei_validation=False)
    assert result is not None
    assert xml.validate_tei(etree.fromstring(result)) is True
    assert xml.validate_tei(etree.fromstring(teststring)) is False
    # test with another file
    with open(os.path.join(resources_dir, 'http_sample.html')) as f:
        teststring = f.read()
    # parse and validate a simple HTML file
    result = extract(teststring,
                     "mocked",
                     no_fallback=True,
                     tei_output=True,
                     tei_validation=False)
    assert result is not None
    assert xml.validate_tei(etree.fromstring(result)) is True
    # include ID in metadata
    result = extract(teststring,
                     "mocked",
                     no_fallback=True,
                     tei_output=True,
                     tei_validation=False,
                     record_id='0001')
    assert result is not None
    assert xml.validate_tei(etree.fromstring(result)) is True
Example No. 2
def test_tei():
    '''test TEI-related functions'''
    # open local resources to avoid redownloading at each run
    with open(os.path.join(RESOURCES_DIR, 'httpbin_sample.html')) as f:
        teststring = f.read()
    # parse and validate a simple HTML file
    result1 = extract(teststring,
                      "mocked",
                      no_fallback=True,
                      output_format='xmltei',
                      tei_validation=False)
    result2 = extract(teststring,
                      "mocked",
                      no_fallback=True,
                      output_format='xmltei',
                      tei_validation=True)
    assert result1 is not None and result1 == result2
    assert xml.validate_tei(etree.fromstring(result1)) is True
    assert xml.validate_tei(etree.fromstring(teststring)) is False
    # test with another file
    with open(os.path.join(RESOURCES_DIR, 'http_sample.html')) as f:
        teststring = f.read()
    # parse and validate a simple HTML file
    result = extract(teststring,
                     "mocked",
                     no_fallback=True,
                     output_format='xmltei',
                     tei_validation=False)
    assert result is not None  # and '<p>license</p>' in result
    assert xml.validate_tei(etree.fromstring(result)) is True
    # include ID in metadata
    result = extract(teststring,
                     "mocked",
                     no_fallback=True,
                     output_format='xmltei',
                     tei_validation=False,
                     record_id='0001')
    assert result is not None
    assert xml.validate_tei(etree.fromstring(result)) is True
    # test header + metadata
    tei = etree.Element('TEI', xmlns='http://www.tei-c.org/ns/1.0')
    header = etree.SubElement(tei, 'teiHeader')
    docmeta = dict.fromkeys(METADATA_LIST)
    docmeta['categories'], docmeta['tags'] = [], []
    docmeta['title'] = 'Title'
    assert xml.write_fullheader(header, docmeta) is not None
    docmeta['sitename'] = 'Site Name'
    assert xml.write_fullheader(header, docmeta) is not None
    docmeta['hostname'], docmeta['sitename'] = 'hostname', None
    assert xml.write_fullheader(header, docmeta) is not None
Example No. 3
def test_tei():
    '''test TEI-related functions'''
    # open local resources to avoid redownloading at each run
    with open(os.path.join(RESOURCES_DIR, 'httpbin_sample.html')) as f:
        teststring = f.read()
    # parse and validate a simple HTML file
    result1 = extract(teststring, "mocked", no_fallback=True, output_format='xmltei', tei_validation=False)
    result2 = extract(teststring, "mocked", no_fallback=True, output_format='xmltei', tei_validation=True)
    assert result1 is not None and result1 == result2
    assert xml.validate_tei(etree.fromstring(result1)) is True
    assert xml.validate_tei(etree.fromstring(teststring)) is False
    # test with another file
    with open(os.path.join(RESOURCES_DIR, 'http_sample.html')) as f:
        teststring = f.read()
    # parse and validate a simple HTML file
    result = extract(teststring, "mocked", no_fallback=True, include_comments=True, output_format='xmltei', tei_validation=False)
    assert result is not None # and '<p>license</p>' in result
    assert xml.validate_tei(etree.fromstring(result)) is True
    result = extract(teststring, "mocked", no_fallback=True, include_comments=False, output_format='xmltei', tei_validation=False)
    assert result is not None # and '<p>license</p>' in result
    assert xml.validate_tei(etree.fromstring(result)) is True
    # include ID in metadata
    result = extract(teststring, "mocked", no_fallback=True, output_format='xmltei', tei_validation=False, record_id='0001')
    assert result is not None
    assert xml.validate_tei(etree.fromstring(result)) is True
    # test header + metadata
    tei = etree.Element('TEI', xmlns='http://www.tei-c.org/ns/1.0')
    header = etree.SubElement(tei, 'teiHeader')
    docmeta = Document()
    docmeta.categories, docmeta.tags = [], []
    docmeta.title = 'Title'
    assert xml.write_fullheader(header, docmeta) is not None
    docmeta.sitename = 'Site Name'
    docmeta.date = '2021-01-01'
    assert xml.write_fullheader(header, docmeta) is not None
    docmeta.date = None
    assert xml.write_fullheader(header, docmeta) is not None
    docmeta.hostname = 'hostname'
    assert xml.write_fullheader(header, docmeta) is not None
    docmeta.sitename = None
    docmeta.license = 'CC BY-SA'
    docmeta.url = 'https://test.org/'
    docmeta.categories = ['cat1', 'cat2']
    assert xml.write_fullheader(header, docmeta) is not None
    docmeta.date = '2021-01-01'
    assert xml.write_fullheader(header, docmeta) is not None
    docmeta.title, docmeta.sitename = None, None
    assert xml.write_fullheader(header, docmeta) is not None
Example No. 4
# Attempt to change the keyword arguments url and target_language
# ('downloaded' presumably holds the page fetched earlier in the script).
# result9 and result11 return None (apparently caused by setting the target language,
# presumably because the page is in English and gets filtered out);
# passing a url, however, works:
url1 = 'https://github.blog/2019-03-29-leader-spotlight-erin-spiceland/'
result9 = trafilatura.extract(downloaded, target_language='de')
print(result9)
result10 = trafilatura.extract(downloaded, url=url1)
print(result10)
result11 = trafilatura.extract(downloaded, url=url1, target_language='de')
print(result11)
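
# Hedged sanity check (a sketch, not part of the original script): assuming the optional
# langid package is installed and that 'downloaded' still holds the fetched page,
# the detected language of the extracted text should be 'en', which would explain
# why target_language='de' makes result9 and result11 come back as None.
import langid
plain_text = trafilatura.extract(downloaded)
if plain_text is not None:
    print(langid.classify(plain_text))  # expected: ('en', <score>)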

  
# Trying out TEI validation (see https://trafilatura.readthedocs.io/en/latest/validation.html)
print("TEI validation test:")
mytree2 = etree.parse("DU_sense.xml")
print("Correct file: ", validate_tei(mytree2))
mytree3 = etree.parse("DU_sense_illformed.xml")
print("Ill-formed file: ")
print(validate_tei(mytree3))

# Trying out the xmltotxt() function (see https://trafilatura.readthedocs.io/en/latest/corefunctions.html)
#f = open("DU_sense.xml", "r")
#xml_text = f.read()
#print(trafilatura.xml.xmltotxt(xml_text))
# --> raises TypeError: Invalid input object: str

#f = open("DU_sense.xml", "r")
#print(trafilatura.xml.xmltotxt(f))
# --> raises TypeError: Invalid input object: _io.TextIOWrapper

#print(trafilatura.xml.xmltotxt("DU_sense.xml"))
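
# A minimal working sketch (assumption: xmltotxt() expects a parsed lxml element rather
# than a string, file handle or file name; the exact signature may vary between versions):
tree4 = etree.parse("DU_sense.xml")
print(trafilatura.xml.xmltotxt(tree4.getroot()))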
Example No. 5
def extract(filecontent,
            url=None,
            record_id='0001',
            no_fallback=False,
            include_comments=False,
            csv_output=False,
            xml_output=False,
            tei_output=False,
            tei_validation=False,
            target_language=None,
            include_tables=True,
            include_formatting=False):
    '''Main process for text extraction'''
    # init
    tree = load_html(filecontent)
    if tree is None:
        return None

    # Metadata here
    if csv_output is True or xml_output is True or tei_output is True:
        docmeta = extract_metadata(tree, default_url=url)
    else:
        docmeta = None

    # backup (or not) for further processing
    if no_fallback is False:
        backup_tree = deepcopy(tree)
    else:
        backup_tree = None

    # clean
    cleaned_tree = manual_cleaning(tree, include_tables)
    # save space and processing time
    cleaned_tree = prune_html(cleaned_tree)
    # use LXML cleaner
    cleaned_tree = HTML_CLEANER.clean_html(cleaned_tree)
    # tree_cache[cleaned_tree] = list(cleaned_tree.iter())

    # convert tags, the rest does not work without conversion
    cleaned_tree = convert_tags(cleaned_tree)
    # remove hi-element to avoid tail bug
    if (xml_output is False
            and tei_output is False) or include_formatting is False:
        etree.strip_tags(cleaned_tree, 'hi')

    # comments first, then remove
    if include_comments is True:
        commentsbody, temp_comments, len_comments, cleaned_tree = extract_comments(
            cleaned_tree)
    else:
        commentsbody, temp_comments, len_comments = None, '', 0

    # extract content
    postbody, temp_text, len_text, sure_thing = extract_content(
        cleaned_tree, include_tables)

    # compare if necessary
    if no_fallback is False:  # and sure_thing is False:
        postbody, temp_text, len_text = compare_extraction(
            backup_tree, url, postbody, temp_text, len_text)
        # try with justext
        if len_text < MIN_EXTRACTED_SIZE:
            LOGGER.error('not enough text %s %s', record_id, url)
            postbody, len_text, temp_text = justext_rescue(
                tree, url, target_language, postbody, len_text, temp_text)
            LOGGER.error('justext length %s', len_text)
        # second backup
        # if len_text < MIN_EXTRACTED_SIZE:
        #     postbody, len_text, temp_text = baseline(filecontent)
    else:
        # rescue: try to use original/dirty tree
        if sure_thing is False and len_text < MIN_EXTRACTED_SIZE:
            postbody, len_text, temp_text = baseline(filecontent)
            #tree = load_html(filecontent)
            #tree = convert_tags(tree)
            #postbody, temp_text, len_text, sure_thing = extract_content(tree)
            LOGGER.debug('non-clean extracted length: %s (extraction)',
                         len_text)

    if len_comments < MIN_EXTRACTED_COMM_SIZE:
        LOGGER.info('not enough comments %s %s', record_id, url)
    if len_text < MIN_OUTPUT_SIZE and len_comments < MIN_OUTPUT_COMM_SIZE:
        LOGGER.info('text and comments not long enough: %s %s', len_text,
                    len_comments)
        return None

    # sanity check on language
    if language_filter(temp_text, temp_comments, target_language, record_id,
                       url) is True:
        return None

    # check duplicates at body level
    if duplicate_test(postbody) is True:
        return None

    # cache elements
    put_in_cache(postbody)
    if commentsbody is not None:
        put_in_cache(commentsbody)

    # XML (TEI) steps
    if xml_output is True or tei_output is True:
        if xml_output is True:
            output = build_xml_output(postbody, commentsbody)
            output = add_xml_meta(output, docmeta)
        elif tei_output is True:
            output = build_tei_output(postbody, commentsbody, docmeta)
        # can be improved
        control_string = etree.tostring(output, encoding='unicode')
        control_string = sanitize(control_string)
        # necessary for cleaning
        control_parser = etree.XMLParser(remove_blank_text=True)
        output_tree = etree.fromstring(control_string, control_parser)
        # validate
        if tei_output is True and tei_validation is True:
            result = validate_tei(output_tree)
            LOGGER.info('TEI validation result: %s %s %s', result, record_id,
                        docmeta.url)
        returnstring = etree.tostring(output_tree,
                                      pretty_print=True,
                                      encoding='unicode').strip()
    # CSV + TXT output
    else:
        if csv_output is True:
            posttext = xmltotxt(postbody)
            commentstext = xmltotxt(commentsbody)
            returnstring = txttocsv(posttext, commentstext, docmeta)
        else:
            output = build_xml_output(postbody, commentsbody)
            returnstring = xmltotxt(output)

    return returnstring
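

# Hypothetical usage sketch for the extract() variant above (the file name and URL are
# placeholders, and all helpers imported by this module are assumed to be available):
if __name__ == '__main__':
    with open('sample.html', encoding='utf-8') as f:
        html = f.read()
    # plain-text output (default branch: build_xml_output() followed by xmltotxt())
    print(extract(html, url='https://example.org/post', no_fallback=True))
    # TEI output, validated and logged via LOGGER
    print(extract(html, url='https://example.org/post',
                  tei_output=True, tei_validation=True))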