Python extract_metadata Examples

Programming Language: Python

Namespace/Package Name: trafilatura.metadata

Method/Function: extract_metadata

Examples at hotexamples.com: 25

Python extract_metadata - 25 examples found. These are the top-rated real-world Python examples of trafilatura.metadata.extract_metadata, extracted from open-source projects. You can rate the examples to help us improve their quality.

Example #1

Show file

File: metadata_tests.py Project: zanachka/trafilatura

def test_meta():
    '''Check the metadata read from og:, dc. and itemprop meta elements'''
    # og: properties in the head plus a rel="license" link in the body
    og_doc = '<html><head><meta property="og:title" content="Open Graph Title"/><meta property="og:author" content="Jenny Smith"/><meta property="og:description" content="This is an Open Graph description"/><meta property="og:site_name" content="My first site"/><meta property="og:url" content="https://example.org/test"/></head><body><a rel="license" href="https://creativecommons.org/">Creative Commons</a></body></html>'
    og_result = extract_metadata(og_doc)
    og_expected = {
        'title': 'Open Graph Title',
        'author': 'Jenny Smith',
        'description': 'This is an Open Graph description',
        'sitename': 'My first site',
        'url': 'https://example.org/test',
        'license': 'Creative Commons',
    }
    for field, value in og_expected.items():
        assert og_result[field] == value

    # the same title/author/description given through dc.* meta names
    dc_doc = '<html><head><meta name="dc.title" content="Open Graph Title"/><meta name="dc.creator" content="Jenny Smith"/><meta name="dc.description" content="This is an Open Graph description"/></head><body></body></html>'
    dc_result = extract_metadata(dc_doc)
    dc_expected = {
        'title': 'Open Graph Title',
        'author': 'Jenny Smith',
        'description': 'This is an Open Graph description',
    }
    for field, value in dc_expected.items():
        assert dc_result[field] == value

    # itemprop="headline" is used as the title
    headline_doc = '<html><head><meta itemprop="headline" content="Title"/></head><body></body></html>'
    assert extract_metadata(headline_doc)['title'] == 'Title'

    # catch errors: empty input gives None, an empty <title> gives no site name
    assert extract_metadata('') is None
    untitled = extract_metadata('<html><title></title></html>')
    assert untitled['sitename'] is None

Example #2

Show file

File: metadata_tests.py Project: adbar/trafilatura

def test_catstags():
    '''Check the category and tag lists returned by extract_metadata'''
    # links inside a paragraph with class "entry-categories"
    categories_doc = '<html><body><p class="entry-categories"><a href="https://example.org/category/cat1/">Cat1</a>, <a href="https://example.org/category/cat2/">Cat2</a></p></body></html>'
    assert extract_metadata(categories_doc).categories == ['Cat1', 'Cat2']

    # links inside a paragraph with class "entry-tags"
    tags_doc = '<html><body><p class="entry-tags"><a href="https://example.org/tags/tag1/">Tag1</a>, <a href="https://example.org/tags/tag2/">Tag2</a></p></body></html>'
    assert extract_metadata(tags_doc).tags == ['Tag1', 'Tag2']

    # keywords meta element; the "&amp;quot" entries are absent from the expected value
    keywords_doc = '<html><head><meta name="keywords" content="sodium, salt, paracetamol, blood, pressure, high, heart, &amp;quot, intake, warning, study, &amp;quot, medicine, dissolvable, cardiovascular" /></head></html>'
    expected_tags = ['sodium, salt, paracetamol, blood, pressure, high, heart, intake, warning, study, medicine, dissolvable, cardiovascular']
    assert extract_metadata(keywords_doc).tags == expected_tags

Example #3

Show file

File: metadata_tests.py Project: adbar/trafilatura

def test_sitename():
    '''Check the site name taken from article:publisher metas and from the title'''
    # (document, expected site name) pairs, checked in order
    expectations = (
        ('<html><head><meta name="article:publisher" content="The Newspaper"/></head><body/></html>', 'The Newspaper'),
        ('<html><head><meta property="article:publisher" content="The Newspaper"/></head><body/></html>', 'The Newspaper'),
        ('<html><head><title>sitemaps.org - Home</title></head><body/></html>', 'sitemaps.org'),
    )
    for document, sitename in expectations:
        assert extract_metadata(document).sitename == sitename

Example #4

Show file

File: metadata_tests.py Project: vkuberan/trafilatura

def test_catstags():
    '''Check the category and tag lists returned by extract_metadata'''
    # links inside a paragraph with class "entry-categories"
    categories_doc = '<html><body><p class="entry-categories"><a href="https://example.org/category/cat1/">Cat1</a>, <a href="https://example.org/category/cat2/">Cat2</a></p></body></html>'
    assert extract_metadata(categories_doc)['categories'] == ['Cat1', 'Cat2']

    # links inside a paragraph with class "entry-tags"
    tags_doc = '<html><body><p class="entry-tags"><a href="https://example.org/tags/tag1/">Tag1</a>, <a href="https://example.org/tags/tag2/">Tag2</a></p></body></html>'
    assert extract_metadata(tags_doc)['tags'] == ['Tag1', 'Tag2']

Example #5

Show file

File: metadata_tests.py Project: gaplan/trafilatura

def test_url():
    '''Test URL extraction from og:url and rel="canonical" elements'''
    # Both documents declare the same address, once as an og:url meta
    # property and once as a canonical link.
    candidates = (
        '<html><head><meta property="og:url" content="https://example.org"/></head><body></body></html>',
        '<html><head><link rel="canonical" href="https://example.org"/></head><body></body></html>',
    )
    for document in candidates:
        metadata = extract_metadata(document)
        assert metadata['url'] == 'https://example.org'

Example #6

Show file

def test_url():
    '''Check that each supported URL declaration yields the same address'''
    # og:url, rel="canonical", twitter:url and rel="alternate" hreflang="x-default"
    declarations = (
        '<html><head><meta property="og:url" content="https://example.org"/></head><body></body></html>',
        '<html><head><link rel="canonical" href="https://example.org"/></head><body></body></html>',
        '<html><head><meta name="twitter:url" content="https://example.org"/></head><body></body></html>',
        '<html><head><link rel="alternate" hreflang="x-default" href="https://example.org"/></head><body></body></html>',
    )
    for document in declarations:
        assert extract_metadata(document)['url'] == 'https://example.org'

Example #7

Show file

File: metadata_tests.py Project: vkuberan/trafilatura

def test_dates():
    '''Basic date extraction checks (the docstring of the original notes that most date tests live with the htmldate module)'''
    # date stated in og:published_time, then date embedded in the og:url path
    dated_docs = (
        '<html><head><meta property="og:published_time" content="2017-09-01"/></head><body></body></html>',
        '<html><head><meta property="og:url" content="https://example.org/2017/09/01/content.html"/></head><body></body></html>',
    )
    for document in dated_docs:
        assert extract_metadata(document)['date'] == '2017-09-01'

Example #8

Show file

File: metadata_tests.py Project: phongtnit/trafilatura

def test_meta():
    '''Check the metadata read from og: and dc. meta elements'''
    # og: properties in the head
    og_doc = '<html><head><meta property="og:title" content="Open Graph Title"/><meta property="og:author" content="Jenny Smith"/><meta property="og:description" content="This is an Open Graph description"/><meta property="og:site_name" content="My first site"/><meta property="og:url" content="https://example.org/test"/></head><body></body></html>'
    og_result = extract_metadata(og_doc)
    og_expected = {
        'title': 'Open Graph Title',
        'author': 'Jenny Smith',
        'description': 'This is an Open Graph description',
        'sitename': 'My first site',
        'url': 'https://example.org/test',
    }
    for field, value in og_expected.items():
        assert og_result[field] == value

    # the same title/author/description given through dc.* meta names
    dc_doc = '<html><head><meta name="dc.title" content="Open Graph Title"/><meta name="dc.creator" content="Jenny Smith"/><meta name="dc.description" content="This is an Open Graph description"/></head><body></body></html>'
    dc_result = extract_metadata(dc_doc)
    dc_expected = {
        'title': 'Open Graph Title',
        'author': 'Jenny Smith',
        'description': 'This is an Open Graph description',
    }
    for field, value in dc_expected.items():
        assert dc_result[field] == value

Example #9

Show file

File: metadata_tests.py Project: adbar/trafilatura

def test_dates():
    '''Simple tests for date extraction (most of the tests are carried out externally for htmldate module)'''
    # Date stated in og:published_time, then date embedded in the og:url path.
    # (The og:url case used to be repeated verbatim; one copy is enough.)
    dated_docs = (
        '<html><head><meta property="og:published_time" content="2017-09-01"/></head><body></body></html>',
        '<html><head><meta property="og:url" content="https://example.org/2017/09/01/content.html"/></head><body></body></html>',
    )
    for document in dated_docs:
        metadata = extract_metadata(document)
        assert metadata.date == '2017-09-01'
    # Date only present in the body text: expected with fastmode=False,
    # expected to be missing with fastmode=True.
    mystring = '<html><body><p>Veröffentlicht am 1.9.17</p></body></html>'
    metadata = extract_metadata(mystring, fastmode=False)
    assert metadata.date == '2017-09-01'
    metadata = extract_metadata(mystring, fastmode=True)
    assert metadata.date is None

Example #10

Show file

def test_titles():
    '''Check which element supplies the title for several heading layouts'''
    # (document, expected title) pairs, checked in order
    expectations = (
        ('<html><head><title>Test Title</title></head><body></body></html>', 'Test Title'),
        ('<html><body><h1>First</h1><h1>Second</h1></body></html>', 'First'),
        ('<html><body><h2>First</h2><h1>Second</h1></body></html>', 'Second'),
        ('<html><body><h2>First</h2><h2>Second</h2></body></html>', 'First'),
    )
    for document, title in expectations:
        assert extract_metadata(document).title == title

Example #11

Show file

def test_meta():
    '''Check the metadata read from og: and dc. meta elements'''
    # og: properties in the head
    og_doc = '<html><head><meta property="og:title" content="Open Graph Title"/><meta property="og:author" content="Jenny Smith"/><meta property="og:description" content="This is an Open Graph description"/><meta property="og:site_name" content="My first site"/></head><body></body></html>'
    og_result = extract_metadata(og_doc)
    og_expected = (
        ('title', 'Open Graph Title'),
        ('author', 'Jenny Smith'),
        ('description', 'This is an Open Graph description'),
        ('sitename', 'My first site'),
    )
    for attribute, value in og_expected:
        assert getattr(og_result, attribute) == value

    # the same title/author/description given through dc.* meta names
    dc_doc = '<html><head><meta name="dc.title" content="Open Graph Title"/><meta name="dc.creator" content="Jenny Smith"/><meta name="dc.description" content="This is an Open Graph description"/></head><body></body></html>'
    dc_result = extract_metadata(dc_doc)
    dc_expected = (
        ('title', 'Open Graph Title'),
        ('author', 'Jenny Smith'),
        ('description', 'This is an Open Graph description'),
    )
    for attribute, value in dc_expected:
        assert getattr(dc_result, attribute) == value

Example #12

Show file

File: metadata_tests.py Project: vkuberan/trafilatura

def test_authors():
    '''Test the extraction of author names

    Runs extract_metadata on documents that name the author through a meta
    itemprop, rel="author", class="author", an <author> element and an
    application/ld+json script, then calls extract_json directly on a
    second application/ld+json document.
    '''
    # meta element with itemprop="author"
    metadata = extract_metadata(
        '<html><head><meta itemprop="author" content="Jenny Smith"/></head><body></body></html>'
    )
    assert metadata['author'] == 'Jenny Smith'
    # link with rel="author"
    metadata = extract_metadata(
        '<html><body><a href="" rel="author">Jenny Smith</a></body></html>')
    assert metadata['author'] == 'Jenny Smith'
    # span, a and address elements with class="author"
    metadata = extract_metadata(
        '<html><body><span class="author">Jenny Smith</span></body></html>')
    assert metadata['author'] == 'Jenny Smith'
    metadata = extract_metadata(
        '<html><body><a class="author">Jenny Smith</a></body></html>')
    assert metadata['author'] == 'Jenny Smith'
    metadata = extract_metadata(
        '<html><body><address class="author">Jenny Smith</address></body></html>'
    )
    assert metadata['author'] == 'Jenny Smith'
    # dedicated <author> element
    metadata = extract_metadata(
        '<html><body><author>Jenny Smith</author></body></html>')
    assert metadata['author'] == 'Jenny Smith'
    # application/ld+json script whose "author" entry lists three persons;
    # the expected value joins their names with "; "
    metadata = extract_metadata(
        '''<html><body><script data-rh="true" type="application/ld+json">{"@context":"http://schema.org","@type":"NewsArticle","description":"The president and his campaign competed again on Monday, with his slash-and-burn remarks swamping news coverage even as his advisers used conventional levers to try to lift his campaign.","image":[{"@context":"http://schema.org","@type":"ImageObject","url":"https://static01.nyt.com/images/2020/10/19/us/politics/19campaign/19campaign-videoSixteenByNineJumbo1600.jpg","height":900,"width":1600,"caption":"In Arizona on Monday, President Trump aired grievances against people including former President Barack Obama and Michelle Obama; Joseph R. Biden Jr. and Hunter Biden; Dr. Anthony S. Fauci; and two female NBC News hosts. "},{"@context":"http://schema.org","@type":"ImageObject","url":"https://static01.nyt.com/images/2020/10/19/us/politics/19campaign/merlin_178764738_11d22ae6-9e7e-4d7a-b28a-20bf52b23e86-superJumbo.jpg","height":1365,"width":2048,"caption":"In Arizona on Monday, President Trump aired grievances against people including former President Barack Obama and Michelle Obama; Joseph R. Biden Jr. and Hunter Biden; Dr. Anthony S. Fauci; and two female NBC News hosts. "},{"@context":"http://schema.org","@type":"ImageObject","url":"https://static01.nyt.com/images/2020/10/19/us/politics/19campaign/19campaign-mediumSquareAt3X.jpg","height":1800,"width":1800,"caption":"In Arizona on Monday, President Trump aired grievances against people including former President Barack Obama and Michelle Obama; Joseph R. Biden Jr. and Hunter Biden; Dr. Anthony S. Fauci; and two female NBC News hosts. 
"}],"mainEntityOfPage":"https://www.nytimes.com/2020/10/19/us/politics/trump-ads-biden-election.html","url":"https://www.nytimes.com/2020/10/19/us/politics/trump-ads-biden-election.html","inLanguage":"en","author":[{"@context":"http://schema.org","@type":"Person","url":"https://www.nytimes.com/by/maggie-haberman","name":"Maggie Haberman"},{"@context":"http://schema.org","@type":"Person","url":"https://www.nytimes.com/by/shane-goldmacher","name":"Shane Goldmacher"},{"@context":"http://schema.org","@type":"Person","url":"https://www.nytimes.com/by/michael-crowley","name":"Michael Crowley"}],"dateModified":"2020-10-20T01:22:07.000Z","datePublished":"2020-10-19T22:24:02.000Z","headline":"Trump Team Unveils $55 Million Ad Blitz on a Day of Scattershot Attacks","publisher":{"@id":"https://www.nytimes.com/#publisher"},"copyrightHolder":{"@id":"https://www.nytimes.com/#publisher"},"sourceOrganization":{"@id":"https://www.nytimes.com/#publisher"},"copyrightYear":2020,"isAccessibleForFree":false,"hasPart":{"@type":"WebPageElement","isAccessibleForFree":false,"cssSelector":".meteredContent"},"isPartOf":{"@type":["CreativeWork","Product"],"name":"The New York Times","productID":"nytimes.com:basic"}}</script><script data-rh="true" type="application/ld+json">{"@context":"http://schema.org","@type":"NewsMediaOrganization","name":"The New York Times","logo":{"@context":"http://schema.org","@type":"ImageObject","url":"https://static01.nyt.com/images/misc/NYT_logo_rss_250x40.png","height":40,"width":250},"url":"https://www.nytimes.com/","@id":"https://www.nytimes.com/#publisher","diversityPolicy":"https://www.nytco.com/diversity-and-inclusion-at-the-new-york-times/","ethicsPolicy":"https://www.nytco.com/who-we-are/culture/standards-and-ethics/","masthead":"https://www.nytimes.com/interactive/2019/admin/the-new-york-times-masthead.html","foundingDate":"1851-09-18","sameAs":"https://en.wikipedia.org/wiki/The_New_York_Times"}</script><script data-rh="true" 
type="application/ld+json">{"@context":"http://schema.org","@type":"BreadcrumbList","itemListElement":[{"@context":"http://schema.org","@type":"ListItem","name":"U.S.","position":1,"item":"https://www.nytimes.com/section/us"},{"@context":"http://schema.org","@type":"ListItem","name":"Politics","position":2,"item":"https://www.nytimes.com/section/politics"}]}</script></body></html>'''
    )
    assert metadata[
        'author'] == 'Maggie Haberman; Shane Goldmacher; Michael Crowley'
    # Call extract_json directly: start from a dict with every METADATA_LIST
    # key set to None and pass it in together with the parsed tree.
    # NOTE(review): METADATA_LIST, extract_json and html are defined outside
    # this function - presumably html is lxml.html; confirm against the imports.
    metadata = dict.fromkeys(METADATA_LIST)
    metadata = extract_json(
        html.fromstring(
            '''<html><body><script type="application/ld+json" class="yoast-schema-graph">{"@context":"https://schema.org",{"@type":"WebPage","@id":"https://www.bvoltaire.fr/jean-sevillia-letat-francais-et-letat-algerien-doivent-reconnaitre-les-crimes-commis-des-deux-cotes/#webpage","url":"https://www.bvoltaire.fr/jean-sevillia-letat-francais-et-letat-algerien-doivent-reconnaitre-les-crimes-commis-des-deux-cotes/","name":"Jean S\u00e9villia : \"L'\u00c9tat fran\u00e7ais et l'\u00c9tat alg\u00e9rien doivent reconna\u00eetre les crimes commis des deux c\u00f4t\u00e9s\" - Boulevard Voltaire","datePublished":"2018-09-13T12:21:13+00:00","dateModified":"2018-09-14T12:33:14+00:00","inLanguage":"fr-FR"},{"@type":"Article","@id":"https://www.bvoltaire.fr/jean-sevillia-letat-francais-et-letat-algerien-doivent-reconnaitre-les-crimes-commis-des-deux-cotes/#article","isPartOf":{"@id":"https://www.bvoltaire.fr/jean-sevillia-letat-francais-et-letat-algerien-doivent-reconnaitre-les-crimes-commis-des-deux-cotes/#webpage"},"author":{"@id":"https://www.bvoltaire.fr/#/schema/person/96c0ed8f089950c46afc2044cb23e8da"},"headline":"Jean S\u00e9villia : &#8220;L&#8217;\u00c9tat fran\u00e7ais et l&#8217;\u00c9tat alg\u00e9rien doivent reconna\u00eetre les crimes commis des deux c\u00f4t\u00e9s&#8221;","datePublished":"2018-09-13T12:21:13+00:00","dateModified":"2018-09-14T12:33:14+00:00","mainEntityOfPage":{"@id":"https://www.bvoltaire.fr/jean-sevillia-letat-francais-et-letat-algerien-doivent-reconnaitre-les-crimes-commis-des-deux-cotes/#webpage"},"publisher":{"@id":"https://www.bvoltaire.fr/#organization"},"image":{"@id":"https://www.bvoltaire.fr/jean-sevillia-letat-francais-et-letat-algerien-doivent-reconnaitre-les-crimes-commis-des-deux-cotes/#primaryimage"},"keywords":"Guerre d'Alg\u00e9rie","articleSection":"Audio,Editoriaux,Entretiens,Histoire","inLanguage":"fr-FR"},{"@type":"Person","@id":"https://www.bvoltaire.fr/#/schema/person/96c0ed8f089950c46afc2044cb23e8da","name":"Jean 
S\u00e9villia","image":{"@type":"ImageObject","@id":"https://www.bvoltaire.fr/#personlogo","inLanguage":"fr-FR","url":"https://secure.gravatar.com/avatar/1dd0ad5cb1fc3695880af1725477b22e?s=96&d=mm&r=g","caption":"Jean S\u00e9villia"},"description":"R\u00e9dacteur en chef adjoint au Figaro Magazine, membre du comit\u00e9 scientifique du Figaro Histoire, et auteur de biographies et d\u2019essais historiques.","sameAs":["https://www.bvoltaire.fr/"]}]}</script></body></html>'''
        ), metadata)
    assert metadata['author'] == "Jean Sévillia"

Example #13

Show file

File: metadata_tests.py Project: zanachka/trafilatura

def test_titles():
    '''Test the extraction of titles

    Covers the <title> element, h1/h2 headings, a "post-title" div, an empty
    <title> and the "name" field of an application/ld+json script.
    '''
    # <title> element in the head
    metadata = extract_metadata(
        '<html><head><title>Test Title</title></head><body></body></html>')
    assert metadata['title'] == 'Test Title'
    # two h1 elements: the first one is expected
    metadata = extract_metadata(
        '<html><body><h1>First</h1><h1>Second</h1></body></html>')
    assert metadata['title'] == 'First'
    # whitespace-only h1 followed by a div with class="post-title"
    metadata = extract_metadata(
        '<html><body><h1>   </h1><div class="post-title">Test Title</div></body></html>'
    )
    assert metadata['title'] == 'Test Title'
    # h2 before h1: the h1 text is expected
    metadata = extract_metadata(
        '<html><body><h2>First</h2><h1>Second</h1></body></html>')
    assert metadata['title'] == 'Second'
    # only h2 elements: the first one is expected
    metadata = extract_metadata(
        '<html><body><h2>First</h2><h2>Second</h2></body></html>')
    assert metadata['title'] == 'First'
    # empty <title> element: no title is expected
    metadata = extract_metadata('<html><body><title></title></body></html>')
    assert metadata['title'] is None
    # application/ld+json script: the expected title equals its "name" value,
    # not its "headline" value (raw string, so the \/ sequences stay as written)
    metadata = extract_metadata(
        r'''<html><body><script type="application/ld+json">{"@context":"https:\/\/schema.org","@type":"Article","name":"Semantic satiation","url":"https:\/\/en.wikipedia.org\/wiki\/Semantic_satiation","sameAs":"http:\/\/www.wikidata.org\/entity\/Q226007","mainEntity":"http:\/\/www.wikidata.org\/entity\/Q226007","author":{"@type":"Organization","name":"Contributors to Wikimedia projects"},"publisher":{"@type":"Organization","name":"Wikimedia Foundation, Inc.","logo":{"@type":"ImageObject","url":"https:\/\/www.wikimedia.org\/static\/images\/wmf-hor-googpub.png"}},"datePublished":"2006-07-12T09:27:14Z","dateModified":"2020-08-31T23:55:26Z","headline":"psychological phenomenon in which repetition causes a word to temporarily lose meaning for the listener"}</script>
<script>(RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgBackendResponseTime":112,"wgHostname":"mw2373"});});</script></html>'''
    )
    assert metadata['title'] == 'Semantic satiation'

Example #14

Show file

def test_authors():
    '''Test the extraction of author names

    Runs extract_metadata on documents that name the author through a meta
    itemprop, rel="author", class="author", an <author> element and an
    application/ld+json script.
    '''
    # meta element with itemprop="author"
    metadata = extract_metadata(
        '<html><head><meta itemprop="author" content="Jenny Smith"/></head><body></body></html>'
    )
    assert metadata['author'] == 'Jenny Smith'
    # link with rel="author"
    metadata = extract_metadata(
        '<html><body><a href="" rel="author">Jenny Smith</a></body></html>')
    assert metadata['author'] == 'Jenny Smith'
    # span, a and address elements with class="author"
    metadata = extract_metadata(
        '<html><body><span class="author">Jenny Smith</span></body></html>')
    assert metadata['author'] == 'Jenny Smith'
    metadata = extract_metadata(
        '<html><body><a class="author">Jenny Smith</a></body></html>')
    assert metadata['author'] == 'Jenny Smith'
    metadata = extract_metadata(
        '<html><body><address class="author">Jenny Smith</address></body></html>'
    )
    assert metadata['author'] == 'Jenny Smith'
    # dedicated <author> element
    metadata = extract_metadata(
        '<html><body><author>Jenny Smith</author></body></html>')
    assert metadata['author'] == 'Jenny Smith'
    # application/ld+json script whose "author" entry lists three persons;
    # the expected value joins their names with "; "
    metadata = extract_metadata(
        '''<html><body><script data-rh="true" type="application/ld+json">{"@context":"http://schema.org","@type":"NewsArticle","description":"The president and his campaign competed again on Monday, with his slash-and-burn remarks swamping news coverage even as his advisers used conventional levers to try to lift his campaign.","image":[{"@context":"http://schema.org","@type":"ImageObject","url":"https://static01.nyt.com/images/2020/10/19/us/politics/19campaign/19campaign-videoSixteenByNineJumbo1600.jpg","height":900,"width":1600,"caption":"In Arizona on Monday, President Trump aired grievances against people including former President Barack Obama and Michelle Obama; Joseph R. Biden Jr. and Hunter Biden; Dr. Anthony S. Fauci; and two female NBC News hosts. "},{"@context":"http://schema.org","@type":"ImageObject","url":"https://static01.nyt.com/images/2020/10/19/us/politics/19campaign/merlin_178764738_11d22ae6-9e7e-4d7a-b28a-20bf52b23e86-superJumbo.jpg","height":1365,"width":2048,"caption":"In Arizona on Monday, President Trump aired grievances against people including former President Barack Obama and Michelle Obama; Joseph R. Biden Jr. and Hunter Biden; Dr. Anthony S. Fauci; and two female NBC News hosts. "},{"@context":"http://schema.org","@type":"ImageObject","url":"https://static01.nyt.com/images/2020/10/19/us/politics/19campaign/19campaign-mediumSquareAt3X.jpg","height":1800,"width":1800,"caption":"In Arizona on Monday, President Trump aired grievances against people including former President Barack Obama and Michelle Obama; Joseph R. Biden Jr. and Hunter Biden; Dr. Anthony S. Fauci; and two female NBC News hosts. 
"}],"mainEntityOfPage":"https://www.nytimes.com/2020/10/19/us/politics/trump-ads-biden-election.html","url":"https://www.nytimes.com/2020/10/19/us/politics/trump-ads-biden-election.html","inLanguage":"en","author":[{"@context":"http://schema.org","@type":"Person","url":"https://www.nytimes.com/by/maggie-haberman","name":"Maggie Haberman"},{"@context":"http://schema.org","@type":"Person","url":"https://www.nytimes.com/by/shane-goldmacher","name":"Shane Goldmacher"},{"@context":"http://schema.org","@type":"Person","url":"https://www.nytimes.com/by/michael-crowley","name":"Michael Crowley"}],"dateModified":"2020-10-20T01:22:07.000Z","datePublished":"2020-10-19T22:24:02.000Z","headline":"Trump Team Unveils $55 Million Ad Blitz on a Day of Scattershot Attacks","publisher":{"@id":"https://www.nytimes.com/#publisher"},"copyrightHolder":{"@id":"https://www.nytimes.com/#publisher"},"sourceOrganization":{"@id":"https://www.nytimes.com/#publisher"},"copyrightYear":2020,"isAccessibleForFree":false,"hasPart":{"@type":"WebPageElement","isAccessibleForFree":false,"cssSelector":".meteredContent"},"isPartOf":{"@type":["CreativeWork","Product"],"name":"The New York Times","productID":"nytimes.com:basic"}}</script><script data-rh="true" type="application/ld+json">{"@context":"http://schema.org","@type":"NewsMediaOrganization","name":"The New York Times","logo":{"@context":"http://schema.org","@type":"ImageObject","url":"https://static01.nyt.com/images/misc/NYT_logo_rss_250x40.png","height":40,"width":250},"url":"https://www.nytimes.com/","@id":"https://www.nytimes.com/#publisher","diversityPolicy":"https://www.nytco.com/diversity-and-inclusion-at-the-new-york-times/","ethicsPolicy":"https://www.nytco.com/who-we-are/culture/standards-and-ethics/","masthead":"https://www.nytimes.com/interactive/2019/admin/the-new-york-times-masthead.html","foundingDate":"1851-09-18","sameAs":"https://en.wikipedia.org/wiki/The_New_York_Times"}</script><script data-rh="true" 
type="application/ld+json">{"@context":"http://schema.org","@type":"BreadcrumbList","itemListElement":[{"@context":"http://schema.org","@type":"ListItem","name":"U.S.","position":1,"item":"https://www.nytimes.com/section/us"},{"@context":"http://schema.org","@type":"ListItem","name":"Politics","position":2,"item":"https://www.nytimes.com/section/politics"}]}</script></body></html>'''
    )
    assert metadata[
        'author'] == 'Maggie Haberman; Shane Goldmacher; Michael Crowley'

Example #15

Show file

File: metadata_tests.py Project: adbar/trafilatura

def test_license():
    '''Check license detection from rel="license" links and footer links'''
    # rel="license" link pointing at a creativecommons.org license URL
    cc_rel_doc = '<html><body><p><a href="https://creativecommons.org/licenses/by-sa/4.0/" rel="license">CC BY-SA</a></p></body></html>'
    assert extract_metadata(cc_rel_doc).license == 'CC BY-SA 4.0'

    # rel="license" link with another URL: the link text is expected
    unknown_rel_doc = '<html><body><p><a href="https://licenses.org/unknown" rel="license">Unknown</a></p></body></html>'
    assert extract_metadata(unknown_rel_doc).license == 'Unknown'

    # creativecommons.org link inside a <footer>, without rel="license"
    footer_doc = '<html><body><footer><a href="https://creativecommons.org/licenses/by-sa/4.0/">CC BY-SA</a></footer></body></html>'
    assert extract_metadata(footer_doc).license == 'CC BY-SA 4.0'

    # footer markup labelled netzpolitik.org in the original test
    netzpolitik_doc = '''<html><body>
<div class="footer__navigation">
<p class="footer__licence">
            <strong>Lizenz: </strong>
            Die von uns verfassten Inhalte stehen, soweit nicht anders vermerkt, unter der Lizenz
            <a href="http://creativecommons.org/licenses/by-nc-sa/4.0/">Creative Commons BY-NC-SA 4.0.</a>
        </p>
    </div>
</body></html>'''
    assert extract_metadata(netzpolitik_doc).license == 'CC BY-NC-SA 4.0'

    # a category link in a footer is not a license
    category_doc = '''<html><body><footer class="entry-footer">
	<span class="cat-links">Posted in <a href="https://sallysbakingaddiction.com/category/seasonal/birthday/" rel="category tag">Birthday</a></span>
	</footer></body></html>'''
    assert extract_metadata(category_doc).license is None

    # a footer link whose text is a CC designation is a license
    cc_text_doc = '''<html><body><footer class="entry-footer">
	<span>The license is <a href="https://example.org/1">CC BY-NC</a></span>
	</footer></body></html>'''
    assert extract_metadata(cc_text_doc).license == 'CC BY-NC'

Example #16

Show file

File: metadata_tests.py Project: adbar/trafilatura

def test_meta():
    '''Check the metadata read from meta elements, plus degenerate inputs'''
    # og: properties in the head plus a rel="license" link in the body
    og_doc = '<html><head><meta property="og:title" content="Open Graph Title"/><meta property="og:author" content="Jenny Smith"/><meta property="og:description" content="This is an Open Graph description"/><meta property="og:site_name" content="My first site"/><meta property="og:url" content="https://example.org/test"/></head><body><a rel="license" href="https://creativecommons.org/">Creative Commons</a></body></html>'
    og_result = extract_metadata(og_doc)
    assert og_result.title == 'Open Graph Title'
    assert og_result.author == 'Jenny Smith'
    assert og_result.description == 'This is an Open Graph description'
    assert og_result.sitename == 'My first site'
    assert og_result.url == 'https://example.org/test'
    assert og_result.license == 'Creative Commons'

    # the same title/author/description given through dc.* meta names
    dc_doc = '<html><head><meta name="dc.title" content="Open Graph Title"/><meta name="dc.creator" content="Jenny Smith"/><meta name="dc.description" content="This is an Open Graph description"/></head><body></body></html>'
    dc_result = extract_metadata(dc_doc)
    assert dc_result.title == 'Open Graph Title'
    assert dc_result.author == 'Jenny Smith'
    assert dc_result.description == 'This is an Open Graph description'

    # itemprop="headline" is used as the title
    headline_doc = '<html><head><meta itemprop="headline" content="Title"/></head><body></body></html>'
    assert extract_metadata(headline_doc).title == 'Title'

    # catch errors: empty input gives None, an empty <title> gives no site name
    assert extract_metadata('') is None
    untitled = extract_metadata('<html><title></title></html>')
    assert untitled.sitename is None

    # a 30000-character title comes back 10000 characters long, ending in an ellipsis
    oversized = extract_metadata('<html><head><title>' + 'AAA'*10000 + '</title></head></html>')
    assert oversized.title.endswith('…') and len(oversized.title) == 10000

    # an unrecognised meta attribute and repeated empty titles still give a result
    unknown_meta_doc = '<html><head><meta otherkey="example" content="Unknown text"/></head></html>'
    assert extract_metadata(unknown_meta_doc) is not None
    repeated_titles_doc = '<html><head><title></title><title></title><title></title></head></html>'
    assert extract_metadata(repeated_titles_doc) is not None

Example #17

Show file

def test_authors():
    '''Check that each supported author markup yields the same name'''
    # meta itemprop, rel="author", class="author" on span/a/address, <author> element
    author_docs = (
        '<html><head><meta itemprop="author" content="Jenny Smith"/></head><body></body></html>',
        '<html><body><a href="" rel="author">Jenny Smith</a></body></html>',
        '<html><body><span class="author">Jenny Smith</span></body></html>',
        '<html><body><a class="author">Jenny Smith</a></body></html>',
        '<html><body><address class="author">Jenny Smith</address></body></html>',
        '<html><body><author>Jenny Smith</author></body></html>',
    )
    for document in author_docs:
        assert extract_metadata(document).author == 'Jenny Smith'

Example #18

Show file

File: metadata_tests.py Project: vkuberan/trafilatura

def test_description():
    '''Check that a meta itemprop="description" supplies the description'''
    description_doc = '<html><head><meta itemprop="description" content="Description"/></head><body></body></html>'
    result = extract_metadata(description_doc)
    assert result['description'] == 'Description'

Example #19

Show file

File: metadata_tests.py Project: zanachka/trafilatura

def test_sitename():
    '''Check that the site name is taken from the leading part of the title'''
    titled_doc = '<html><head><title>sitemaps.org - Home</title></head><body/></html>'
    result = extract_metadata(titled_doc)
    assert result['sitename'] == 'sitemaps.org'

Example #20

Show file

File: metadata_tests.py Project: zanachka/trafilatura

def test_authors():
    '''Test the normalization and the extraction of author names.

    Covers normalize_authors() on raw strings, extract_metadata() on meta
    elements, on body markup and on JSON-LD scripts, and a direct call to
    extract_json() on a parsed tree.
    '''
    # normalization: capitalization, trailing digits and a leading "By"
    assert normalize_authors(None, 'abc') == 'Abc'
    assert normalize_authors(None, 'Steve Steve 123') == 'Steve Steve'
    assert normalize_authors(None, 'By Steve Steve') == 'Steve Steve'
    # extraction from meta elements (itemprop, name, property)
    metadata = extract_metadata(
        '<html><head><meta itemprop="author" content="Jenny Smith"/></head><body></body></html>'
    )
    assert metadata['author'] == 'Jenny Smith'
    # several author elements are expected to be joined with "; "
    metadata = extract_metadata(
        '<html><head><meta itemprop="author" content="Jenny Smith"/><meta itemprop="author" content="John Smith"/></head><body></body></html>'
    )
    assert metadata['author'] == 'Jenny Smith; John Smith'
    # the conjunctions "und" and "and" are expected to separate two names
    metadata = extract_metadata(
        '<html><head><meta itemprop="author" content="Jenny Smith und John Smith"/></head><body></body></html>'
    )
    assert metadata['author'] == 'Jenny Smith; John Smith'
    metadata = extract_metadata(
        '<html><head><meta name="author" content="Jenny Smith"/><meta name="author" content="John Smith"/></head><body></body></html>'
    )
    assert metadata['author'] == 'Jenny Smith; John Smith'
    metadata = extract_metadata(
        '<html><head><meta name="author" content="Jenny Smith and John Smith"/></head><body></body></html>'
    )
    assert metadata['author'] == 'Jenny Smith; John Smith'
    metadata = extract_metadata(
        '<html><head><meta name="author" content="Jenny Smith"/></head><body></body></html>'
    )
    assert metadata['author'] == 'Jenny Smith'
    # a trailing emoji is not expected in the result
    metadata = extract_metadata(
        '<html><head><meta name="author" content="Jenny Smith ❤️"/></head><body></body></html>'
    )
    assert metadata['author'] == 'Jenny Smith'
    metadata = extract_metadata(
        '<html><head><meta name="citation_author" content="Jenny Smith and John Smith"/></head><body></body></html>'
    )
    assert metadata['author'] == 'Jenny Smith; John Smith'
    metadata = extract_metadata(
        '<html><head><meta property="author" content="Jenny Smith"/><meta property="author" content="John Smith"/></head><body></body></html>'
    )
    assert metadata['author'] == 'Jenny Smith; John Smith'
    metadata = extract_metadata(
        '<html><head><meta itemprop="author" content="Jenny Smith and John Smith"/></head><body></body></html>'
    )
    assert metadata['author'] == 'Jenny Smith; John Smith'
    # extraction from elements in the document body
    metadata = extract_metadata(
        '<html><body><a href="" rel="author">Jenny Smith</a></body></html>')
    assert metadata['author'] == 'Jenny Smith'
    metadata = extract_metadata(
        '<html><body><span class="author">Jenny Smith</span></body></html>')
    assert metadata['author'] == 'Jenny Smith'
    metadata = extract_metadata(
        '<html><body><span class="wrapper--detail__writer">Jenny Smith</span></body></html>'
    )
    assert metadata['author'] == 'Jenny Smith'
    metadata = extract_metadata(
        '<html><body><span id="author-name">Jenny Smith</span></body></html>')
    assert metadata['author'] == 'Jenny Smith'
    metadata = extract_metadata(
        '<html><body><span class="author">Jenny Smith and John Smith</span></body></html>'
    )
    assert metadata['author'] == 'Jenny Smith; John Smith'
    metadata = extract_metadata(
        '<html><body><a class="author">Jenny Smith</a></body></html>')
    assert metadata['author'] == 'Jenny Smith'
    # the affiliation after "from" is not expected in the result
    metadata = extract_metadata(
        '<html><body><a class="author">Jenny Smith from Trafilatura</a></body></html>'
    )
    assert metadata['author'] == 'Jenny Smith'
    metadata = extract_metadata(
        '<html><body><a class="username">Jenny Smith</a></body></html>')
    assert metadata['author'] == 'Jenny Smith'
    metadata = extract_metadata(
        '<html><body><div class="submitted-by"><a>Jenny Smith</a></div></body></html>'
    )
    assert metadata['author'] == 'Jenny Smith'
    # the date in the sibling time element must not end up in the author
    metadata = extract_metadata(
        '<html><body><div class="byline-content"><div class="byline"><a>Jenny Smith</a></div><time>July 12, 2021 08:05</time></div></body></html>'
    )
    assert metadata['author'] == 'Jenny Smith'
    metadata = extract_metadata(
        '<html><body><h3 itemprop="author">Jenny Smith</h3></body></html>')
    assert metadata['author'] == 'Jenny Smith'
    metadata = extract_metadata(
        '<html><body><div data-component="Byline">Jenny Smith</div></body></html>'
    )
    assert metadata['author'] == 'Jenny Smith'
    metadata = extract_metadata(
        '<html><body><span id="author">Jenny Smith</span></body></html>')
    assert metadata['author'] == 'Jenny Smith'
    # an underscore between the names is expected to become a space
    metadata = extract_metadata(
        '<html><body><span id="author">Jenny_Smith</span></body></html>')
    assert metadata['author'] == 'Jenny Smith'
    # a comma-separated list is expected to be split into single authors
    metadata = extract_metadata(
        '<html><body><span itemprop="author name">Shannon Deery, Mitch Clarke, Susie O’Brien, Laura Placella, Kara Irving, Jordy Atkinson, Suzan Delibasic</span></body></html>'
    )
    assert metadata[
        'author'] == 'Shannon Deery; Mitch Clarke; Susie O’Brien; Laura Placella; Kara Irving; Jordy Atkinson; Suzan Delibasic'
    metadata = extract_metadata(
        '<html><body><address class="author">Jenny Smith</address></body></html>'
    )
    assert metadata['author'] == 'Jenny Smith'
    metadata = extract_metadata(
        '<html><body><author>Jenny Smith</author></body></html>')
    assert metadata['author'] == 'Jenny Smith'
    # social media handle and time next to the name are not expected
    metadata = extract_metadata(
        '<html><body><div class="author"><span class="profile__name"> Jenny Smith </span> <a href="https://twitter.com/jenny_smith" class="profile__social" target="_blank"> @jenny_smith </a> <span class="profile__extra lg:hidden"> 11:57AM </span> </div></body></html>'
    )
    assert metadata['author'] == 'Jenny Smith'
    # leading "By" and trailing "For ..." are not expected in the result
    metadata = extract_metadata(
        '<html><body><p class="author-section byline-plain">By <a class="author" rel="nofollow">Jenny Smith For Daily Mail Australia</a></p></body></html>'
    )
    assert metadata['author'] == 'Jenny Smith'
    metadata = extract_metadata(
        '<html><body><div class="o-Attribution__a-Author"><span class="o-Attribution__a-Author--Label">By:</span><span class="o-Attribution__a-Author--Prefix"><span class="o-Attribution__a-Name"><a href="//web.archive.org/web/20210707074846/https://www.discovery.com/profiles/ian-shive">Ian Shive</a></span></span></div></body></html>'
    )
    assert metadata['author'] == 'Ian Shive'
    metadata = extract_metadata(
        '<html><body><div class="ArticlePage-authors"><div class="ArticlePage-authorName" itemprop="name"><span class="ArticlePage-authorBy">By&nbsp;</span><a aria-label="Ben Coxworth" href="https://newatlas.com/author/ben-coxworth/"><span>Ben Coxworth</span></a></div></div></body></html>'
    )
    assert metadata['author'] == 'Ben Coxworth'

    ## JSON extraction
    # JSON-LD with a list of three Person entries under "author"
    metadata = extract_metadata(
        '''<html><body><script data-rh="true" type="application/ld+json">{"@context":"http://schema.org","@type":"NewsArticle","description":"The president and his campaign competed again on Monday, with his slash-and-burn remarks swamping news coverage even as his advisers used conventional levers to try to lift his campaign.","image":[{"@context":"http://schema.org","@type":"ImageObject","url":"https://static01.nyt.com/images/2020/10/19/us/politics/19campaign/19campaign-videoSixteenByNineJumbo1600.jpg","height":900,"width":1600,"caption":"In Arizona on Monday, President Trump aired grievances against people including former President Barack Obama and Michelle Obama; Joseph R. Biden Jr. and Hunter Biden; Dr. Anthony S. Fauci; and two female NBC News hosts. "},{"@context":"http://schema.org","@type":"ImageObject","url":"https://static01.nyt.com/images/2020/10/19/us/politics/19campaign/merlin_178764738_11d22ae6-9e7e-4d7a-b28a-20bf52b23e86-superJumbo.jpg","height":1365,"width":2048,"caption":"In Arizona on Monday, President Trump aired grievances against people including former President Barack Obama and Michelle Obama; Joseph R. Biden Jr. and Hunter Biden; Dr. Anthony S. Fauci; and two female NBC News hosts. "},{"@context":"http://schema.org","@type":"ImageObject","url":"https://static01.nyt.com/images/2020/10/19/us/politics/19campaign/19campaign-mediumSquareAt3X.jpg","height":1800,"width":1800,"caption":"In Arizona on Monday, President Trump aired grievances against people including former President Barack Obama and Michelle Obama; Joseph R. Biden Jr. and Hunter Biden; Dr. Anthony S. Fauci; and two female NBC News hosts. 
"}],"mainEntityOfPage":"https://www.nytimes.com/2020/10/19/us/politics/trump-ads-biden-election.html","url":"https://www.nytimes.com/2020/10/19/us/politics/trump-ads-biden-election.html","inLanguage":"en","author":[{"@context":"http://schema.org","@type":"Person","url":"https://www.nytimes.com/by/maggie-haberman","name":"Maggie Haberman"},{"@context":"http://schema.org","@type":"Person","url":"https://www.nytimes.com/by/shane-goldmacher","name":"Shane Goldmacher"},{"@context":"http://schema.org","@type":"Person","url":"https://www.nytimes.com/by/michael-crowley","name":"Michael Crowley"}],"dateModified":"2020-10-20T01:22:07.000Z","datePublished":"2020-10-19T22:24:02.000Z","headline":"Trump Team Unveils $55 Million Ad Blitz on a Day of Scattershot Attacks","publisher":{"@id":"https://www.nytimes.com/#publisher"},"copyrightHolder":{"@id":"https://www.nytimes.com/#publisher"},"sourceOrganization":{"@id":"https://www.nytimes.com/#publisher"},"copyrightYear":2020,"isAccessibleForFree":false,"hasPart":{"@type":"WebPageElement","isAccessibleForFree":false,"cssSelector":".meteredContent"},"isPartOf":{"@type":["CreativeWork","Product"],"name":"The New York Times","productID":"nytimes.com:basic"}}</script><script data-rh="true" type="application/ld+json">{"@context":"http://schema.org","@type":"NewsMediaOrganization","name":"The New York Times","logo":{"@context":"http://schema.org","@type":"ImageObject","url":"https://static01.nyt.com/images/misc/NYT_logo_rss_250x40.png","height":40,"width":250},"url":"https://www.nytimes.com/","@id":"https://www.nytimes.com/#publisher","diversityPolicy":"https://www.nytco.com/diversity-and-inclusion-at-the-new-york-times/","ethicsPolicy":"https://www.nytco.com/who-we-are/culture/standards-and-ethics/","masthead":"https://www.nytimes.com/interactive/2019/admin/the-new-york-times-masthead.html","foundingDate":"1851-09-18","sameAs":"https://en.wikipedia.org/wiki/The_New_York_Times"}</script><script data-rh="true" 
type="application/ld+json">{"@context":"http://schema.org","@type":"BreadcrumbList","itemListElement":[{"@context":"http://schema.org","@type":"ListItem","name":"U.S.","position":1,"item":"https://www.nytimes.com/section/us"},{"@context":"http://schema.org","@type":"ListItem","name":"Politics","position":2,"item":"https://www.nytimes.com/section/politics"}]}</script></body></html>'''
    )
    assert metadata[
        'author'] == 'Maggie Haberman; Shane Goldmacher; Michael Crowley'
    # the JSON-LD author is an Organization ("NCA NewsWire"); the name in
    # the itemprop span of the body is the expected result
    metadata = extract_metadata(
        '''<html><body><script data-rh="true" type="application/ld+json">{"@context":"http://schema.org","@type":"NewsArticle","mainEntityOfPage":{"@type":"WebPage","@id":"https://www.perthnow.com.au/news/government-defends-graphic-covid-19-ad-after-backlash-c-3376985"},"dateline":null,"publisher":{"@type":"Organization","name":"PerthNow","url":"https://www.perthnow.com.au","logo":{"@type":"ImageObject","url":"https://www.perthnow.com.au/static/publisher-logos/publisher-logo-60px-high.png","width":575,"height":60}},"keywords":["News","News","Australia","Politics","Federal Politics","News","TAS News"],"articleSection":"News","headline":"Government defends graphic Covid-19 ad after backlash","description":"A graphic COVID-19 ad showing a young woman apparently on the verge of death has prompted a backlash, but the government insists it wasn’t done lightly.","dateCreated":"2021-07-12T00:11:50.000Z","datePublished":"2021-07-12T00:11:50.000Z","dateModified":"2021-07-12T01:25:20.617Z","isAccessibleForFree":"True","articleBody":"The man tasked with co-ordinating Australia&rsquo;s Covid-19 vaccine rollout insists a confronting ad depicting a woman on the verge of death was not run lightly. The 30-second clip, depicting a woman apparently in her 20s or 30s gasping for air on a hospital bed, was filmed last year, but the federal government held off running it as no outbreak was deemed serious enough to warrant it. The government has been forced to defend the ad, reminiscent of the &ldquo;Grim Reaper&rdquo; HIV ads in the 1980s, after it prompted a backlash over claims it was too confronting. A more temperate series of ads, depicting arms on ordinary Australians with the moniker &ldquo;Arm Yourself&rdquo;, began last week, but Covid-19 taskforce commander Lieutenant General John Frewen said the escalating situation in Sydney called for a more explicit response. &ldquo;It is absolutely confronting and we didn&rsquo;t use it lightly. 
There was serious consideration given to whether it was required and we took expert advice,&rdquo; he told Today on Monday. &ldquo;It is confronting but leaves people in no doubt about the seriousness of getting Covid, and it seeks to have people stay home, get tested and get vaccinated as quickly as they can.&rdquo; NSW on Sunday confirmed another 77 cases, 31 of which had been in the community while infectious, and Premier Gladys Berejiklian warned she would be &ldquo;shocked&rdquo; if the number did not exceed 100 on Monday. General Frewen said the &ldquo;concerning situation&rdquo; had prompted the government to shift an additional 300,000 doses to NSW over the coming fortnight. &ldquo;The Delta variant is proving to be very difficult to contain, so we&rsquo;re working very closely with NSW authorities and standing ready to help them in any way we can,&rdquo; he said. Agriculture Minister David Littleproud said the ad was designed to shock Sydneysiders into action as the situation deteriorated. &ldquo;This is about shooting home that this is a serious situation and can get anybody. The fact we&rsquo;re actually debating this I think says to me that the campaign we&rsquo;ve approved is working,&rdquo; he said. The age of the woman in the ad has sparked controversy, with most younger Australians still ineligible to receive their vaccine. But with 11 of the 52 people in hospital across NSW under 35, Labor frontbencher Tanya Plibersek warned the Delta variant was &ldquo;hitting younger people as well&rdquo;. Labor had long demanded a national Covid-19 advertising campaign, which Ms Plibersek said was delayed as a result of the government&rsquo;s sluggish vaccine rollout. &ldquo;Perhaps the reason it&rsquo;s taken so long is if you encourage people to go and get vaccinated, you&rsquo;ve got to have enough of the vaccine available. 
We simply haven&rsquo;t; we&rsquo;ve been absolutely behind the eight ball in getting another vaccine for Australians,&rdquo; she told Sunrise. Labor frontbencher Chris Bowen, whose western Sydney electorate was in the grip of the outbreak, said the issue was &ldquo;not vaccine hesitancy so much, it&rsquo;s vaccine scarcity&rdquo;. He accepted there was a role for &ldquo;pointing out the consequences of not getting vaccinated&rdquo; to those that were hesitant about the jab, but said the new campaign lacked the &ldquo;creative spark&rdquo; of the Grim Reaper ads. &ldquo;That was a very tough message, a very stark message, but in a very creative way. I think the government really needs to rethink this advertising campaign from scratch; it&rsquo;s too late, and it&rsquo;s pretty low impact,&rdquo; he told ABC radio. He also dismissed the &ldquo;Arm Yourself&rdquo; campaign as &ldquo;very low energy&rdquo;. &ldquo;I don&rsquo;t think that&rsquo;s going to have any impact,&rdquo; he said.","image":[{"@type":"ImageObject","url":"https://images.perthnow.com.au/publication/C-3376985/6c07502f73bdccd45d879356219c325574873a6d-16x9-x0y444w1151h647.jpg","width":1151,"height":647},{"@type":"ImageObject","url":"https://images.perthnow.com.au/publication/C-3376985/6c07502f73bdccd45d879356219c325574873a6d-4x3-x0y336w1151h863.jpg","width":1151,"height":863}],"thumbnailUrl":"https://images.perthnow.com.au/publication/C-3376985/6c07502f73bdccd45d879356219c325574873a6d-16x9-x0y444w1151h647.jpg","url":"https://www.perthnow.com.au/news/government-defends-graphic-covid-19-ad-after-backlash-c-3376985","author":{"@type":"Organization","name":"NCA NewsWire"},"name":"Government defends graphic Covid-19 ad after backlash"}</script><span itemprop="author name">Jenny Smith</span></span></body></html>'''
    )
    assert metadata['author'] == 'Jenny Smith'
    # direct call to extract_json() with a parsed tree and a dict whose
    # METADATA_LIST keys are all initialized to None
    # NOTE(review): the payload below is not well-formed JSON (an object
    # follows the @context entry without a key and the closing bracket has
    # no opening counterpart) - presumably on purpose, confirm
    metadata = dict.fromkeys(METADATA_LIST)
    metadata = extract_json(
        html.fromstring(
            '''<html><body><script type="application/ld+json" class="yoast-schema-graph">{"@context":"https://schema.org",{"@type":"WebPage","@id":"https://www.bvoltaire.fr/jean-sevillia-letat-francais-et-letat-algerien-doivent-reconnaitre-les-crimes-commis-des-deux-cotes/#webpage","url":"https://www.bvoltaire.fr/jean-sevillia-letat-francais-et-letat-algerien-doivent-reconnaitre-les-crimes-commis-des-deux-cotes/","name":"Jean S\u00e9villia : \"L'\u00c9tat fran\u00e7ais et l'\u00c9tat alg\u00e9rien doivent reconna\u00eetre les crimes commis des deux c\u00f4t\u00e9s\" - Boulevard Voltaire","datePublished":"2018-09-13T12:21:13+00:00","dateModified":"2018-09-14T12:33:14+00:00","inLanguage":"fr-FR"},{"@type":"Article","@id":"https://www.bvoltaire.fr/jean-sevillia-letat-francais-et-letat-algerien-doivent-reconnaitre-les-crimes-commis-des-deux-cotes/#article","isPartOf":{"@id":"https://www.bvoltaire.fr/jean-sevillia-letat-francais-et-letat-algerien-doivent-reconnaitre-les-crimes-commis-des-deux-cotes/#webpage"},"author":{"@id":"https://www.bvoltaire.fr/#/schema/person/96c0ed8f089950c46afc2044cb23e8da"},"headline":"Jean S\u00e9villia : &#8220;L&#8217;\u00c9tat fran\u00e7ais et l&#8217;\u00c9tat alg\u00e9rien doivent reconna\u00eetre les crimes commis des deux c\u00f4t\u00e9s&#8221;","datePublished":"2018-09-13T12:21:13+00:00","dateModified":"2018-09-14T12:33:14+00:00","mainEntityOfPage":{"@id":"https://www.bvoltaire.fr/jean-sevillia-letat-francais-et-letat-algerien-doivent-reconnaitre-les-crimes-commis-des-deux-cotes/#webpage"},"publisher":{"@id":"https://www.bvoltaire.fr/#organization"},"image":{"@id":"https://www.bvoltaire.fr/jean-sevillia-letat-francais-et-letat-algerien-doivent-reconnaitre-les-crimes-commis-des-deux-cotes/#primaryimage"},"keywords":"Guerre d'Alg\u00e9rie","articleSection":"Audio,Editoriaux,Entretiens,Histoire","inLanguage":"fr-FR"},{"@type":"Person","@id":"https://www.bvoltaire.fr/#/schema/person/96c0ed8f089950c46afc2044cb23e8da","name":"Jean 
S\u00e9villia","image":{"@type":"ImageObject","@id":"https://www.bvoltaire.fr/#personlogo","inLanguage":"fr-FR","url":"https://secure.gravatar.com/avatar/1dd0ad5cb1fc3695880af1725477b22e?s=96&d=mm&r=g","caption":"Jean S\u00e9villia"},"description":"R\u00e9dacteur en chef adjoint au Figaro Magazine, membre du comit\u00e9 scientifique du Figaro Histoire, et auteur de biographies et d\u2019essais historiques.","sameAs":["https://www.bvoltaire.fr/"]}]}</script></body></html>'''
        ), metadata)
    assert metadata['author'] == "Jean Sévillia"

Example #21

Show file

def extract(filecontent,
            url=None,
            record_id='0001',
            no_fallback=False,
            include_comments=False,
            csv_output=False,
            xml_output=False,
            tei_output=False,
            tei_validation=False,
            target_language=None,
            include_tables=True,
            include_formatting=False):
    '''Run the complete extraction pipeline and serialize its result.

    The input is loaded with load_html(), cleaned, converted and searched
    for main text and (if include_comments is set) comments. The result is
    returned as a string: XML or TEI if xml_output or tei_output is set,
    CSV if csv_output is set, plain text otherwise.

    None is returned if load_html() yields no tree, if both text and
    comments stay below the minimum output sizes, or if the language
    filter or the duplicate test reject the document.
    '''
    # nothing can be done without a tree
    tree = load_html(filecontent)
    if tree is None:
        return None

    # flags evaluated once, strict identity checks are kept on purpose
    markup_wanted = xml_output is True or tei_output is True
    use_fallback = no_fallback is False

    # metadata are only needed by the CSV, XML and TEI outputs
    docmeta = None
    if csv_output is True or markup_wanted:
        docmeta = extract_metadata(tree, default_url=url)

    # copy of the tree for the comparison with the fallback
    backup_tree = deepcopy(tree) if use_fallback else None

    # cleaning: manual step, pruning (space and time), then LXML cleaner
    cleaned_tree = HTML_CLEANER.clean_html(
        prune_html(manual_cleaning(tree, include_tables)))

    # the steps below do not work without tag conversion
    cleaned_tree = convert_tags(cleaned_tree)
    # hi-elements are stripped to avoid the tail bug
    if include_formatting is False or (xml_output is False
                                       and tei_output is False):
        etree.strip_tags(cleaned_tree, 'hi')

    # comments are extracted first and removed from the tree
    commentsbody, temp_comments, len_comments = None, '', 0
    if include_comments is True:
        (commentsbody, temp_comments, len_comments,
         cleaned_tree) = extract_comments(cleaned_tree)

    # main content
    postbody, temp_text, len_text, sure_thing = extract_content(
        cleaned_tree, include_tables)

    if use_fallback:
        # compare with the result obtained on the backup tree
        postbody, temp_text, len_text = compare_extraction(
            backup_tree, url, postbody, temp_text, len_text)
        # still too short: give justext a try
        if len_text < MIN_EXTRACTED_SIZE:
            LOGGER.error('not enough text %s %s', record_id, url)
            postbody, len_text, temp_text = justext_rescue(
                tree, url, target_language, postbody, len_text, temp_text)
            LOGGER.error('justext length %s', len_text)
    elif sure_thing is False and len_text < MIN_EXTRACTED_SIZE:
        # rescue on the original content without fallback algorithms
        postbody, len_text, temp_text = baseline(filecontent)
        LOGGER.debug('non-clean extracted length: %s (extraction)',
                     len_text)

    # size checks
    if len_comments < MIN_EXTRACTED_COMM_SIZE:
        LOGGER.info('not enough comments %s %s', record_id, url)
    if len_text < MIN_OUTPUT_SIZE and len_comments < MIN_OUTPUT_COMM_SIZE:
        LOGGER.info('text and comments not long enough: %s %s', len_text,
                    len_comments)
        return None

    # language sanity check
    wrong_language = language_filter(temp_text, temp_comments,
                                     target_language, record_id, url)
    if wrong_language is True:
        return None

    # duplicate detection at body level
    if duplicate_test(postbody) is True:
        return None

    # remember the extracted elements
    put_in_cache(postbody)
    if commentsbody is not None:
        put_in_cache(commentsbody)

    # CSV and TXT output
    if not markup_wanted:
        if csv_output is True:
            return txttocsv(xmltotxt(postbody), xmltotxt(commentsbody),
                            docmeta)
        return xmltotxt(build_xml_output(postbody, commentsbody))

    # XML and TEI output, XML takes precedence if both are requested
    if xml_output is True:
        output = add_xml_meta(build_xml_output(postbody, commentsbody),
                              docmeta)
    else:
        output = build_tei_output(postbody, commentsbody, docmeta)
    # round trip through a sanitized string, necessary for cleaning
    serialized = sanitize(etree.tostring(output, encoding='unicode'))
    output_tree = etree.fromstring(
        serialized, etree.XMLParser(remove_blank_text=True))
    # optional validation, the result is only logged
    if tei_output is True and tei_validation is True:
        result = validate_tei(output_tree)
        LOGGER.info('TEI validation result: %s %s %s', result, record_id,
                    docmeta.url)
    return etree.tostring(output_tree,
                          pretty_print=True,
                          encoding='unicode').strip()

Example #22

Show file

def test_json_extraction():
    ## JSON extraction
    metadata = extract_metadata('''<html><body><script data-rh="true" type="application/ld+json">{"@context":"http://schema.org","@type":"NewsArticle","description":"The president and his campaign competed again on Monday, with his slash-and-burn remarks swamping news coverage even as his advisers used conventional levers to try to lift his campaign.","image":[{"@context":"http://schema.org","@type":"ImageObject","url":"https://static01.nyt.com/images/2020/10/19/us/politics/19campaign/19campaign-videoSixteenByNineJumbo1600.jpg","height":900,"width":1600,"caption":"In Arizona on Monday, President Trump aired grievances against people including former President Barack Obama and Michelle Obama; Joseph R. Biden Jr. and Hunter Biden; Dr. Anthony S. Fauci; and two female NBC News hosts. "},{"@context":"http://schema.org","@type":"ImageObject","url":"https://static01.nyt.com/images/2020/10/19/us/politics/19campaign/merlin_178764738_11d22ae6-9e7e-4d7a-b28a-20bf52b23e86-superJumbo.jpg","height":1365,"width":2048,"caption":"In Arizona on Monday, President Trump aired grievances against people including former President Barack Obama and Michelle Obama; Joseph R. Biden Jr. and Hunter Biden; Dr. Anthony S. Fauci; and two female NBC News hosts. "},{"@context":"http://schema.org","@type":"ImageObject","url":"https://static01.nyt.com/images/2020/10/19/us/politics/19campaign/19campaign-mediumSquareAt3X.jpg","height":1800,"width":1800,"caption":"In Arizona on Monday, President Trump aired grievances against people including former President Barack Obama and Michelle Obama; Joseph R. Biden Jr. and Hunter Biden; Dr. Anthony S. Fauci; and two female NBC News hosts. 
"}],"mainEntityOfPage":"https://www.nytimes.com/2020/10/19/us/politics/trump-ads-biden-election.html","url":"https://www.nytimes.com/2020/10/19/us/politics/trump-ads-biden-election.html","inLanguage":"en","author":[{"@context":"http://schema.org","@type":"Person","url":"https://www.nytimes.com/by/maggie-haberman","name":"Maggie Haberman"},{"@context":"http://schema.org","@type":"Person","url":"https://www.nytimes.com/by/shane-goldmacher","name":"Shane Goldmacher"},{"@context":"http://schema.org","@type":"Person","url":"https://www.nytimes.com/by/michael-crowley","name":"Michael Crowley"}],"dateModified":"2020-10-20T01:22:07.000Z","datePublished":"2020-10-19T22:24:02.000Z","headline":"Trump Team Unveils $55 Million Ad Blitz on a Day of Scattershot Attacks","publisher":{"@id":"https://www.nytimes.com/#publisher"},"copyrightHolder":{"@id":"https://www.nytimes.com/#publisher"},"sourceOrganization":{"@id":"https://www.nytimes.com/#publisher"},"copyrightYear":2020,"isAccessibleForFree":false,"hasPart":{"@type":"WebPageElement","isAccessibleForFree":false,"cssSelector":".meteredContent"},"isPartOf":{"@type":["CreativeWork","Product"],"name":"The New York Times","productID":"nytimes.com:basic"}}</script><script data-rh="true" type="application/ld+json">{"@context":"http://schema.org","@type":"NewsMediaOrganization","name":"The New York Times","logo":{"@context":"http://schema.org","@type":"ImageObject","url":"https://static01.nyt.com/images/misc/NYT_logo_rss_250x40.png","height":40,"width":250},"url":"https://www.nytimes.com/","@id":"https://www.nytimes.com/#publisher","diversityPolicy":"https://www.nytco.com/diversity-and-inclusion-at-the-new-york-times/","ethicsPolicy":"https://www.nytco.com/who-we-are/culture/standards-and-ethics/","masthead":"https://www.nytimes.com/interactive/2019/admin/the-new-york-times-masthead.html","foundingDate":"1851-09-18","sameAs":"https://en.wikipedia.org/wiki/The_New_York_Times"}</script><script data-rh="true" 
type="application/ld+json">{"@context":"http://schema.org","@type":"BreadcrumbList","itemListElement":[{"@context":"http://schema.org","@type":"ListItem","name":"U.S.","position":1,"item":"https://www.nytimes.com/section/us"},{"@context":"http://schema.org","@type":"ListItem","name":"Politics","position":2,"item":"https://www.nytimes.com/section/politics"}]}</script></body></html>''')
    assert metadata.author == 'Maggie Haberman; Shane Goldmacher; Michael Crowley'

    metadata = extract_metadata('''<html><body><script data-rh="true" type="application/ld+json">{"@context":"http://schema.org","@type":"NewsArticle","mainEntityOfPage":{"@type":"WebPage","@id":"https://www.perthnow.com.au/news/government-defends-graphic-covid-19-ad-after-backlash-c-3376985"},"dateline":null,"publisher":{"@type":"Organization","name":"PerthNow","url":"https://www.perthnow.com.au","logo":{"@type":"ImageObject","url":"https://www.perthnow.com.au/static/publisher-logos/publisher-logo-60px-high.png","width":575,"height":60}},"keywords":["News","News","Australia","Politics","Federal Politics","News","TAS News"],"articleSection":"News","headline":"Government defends graphic Covid-19 ad after backlash","description":"A graphic COVID-19 ad showing a young woman apparently on the verge of death has prompted a backlash, but the government insists it wasn’t done lightly.","dateCreated":"2021-07-12T00:11:50.000Z","datePublished":"2021-07-12T00:11:50.000Z","dateModified":"2021-07-12T01:25:20.617Z","isAccessibleForFree":"True","articleBody":"The man tasked with co-ordinating Australia&rsquo;s Covid-19 vaccine rollout insists a confronting ad depicting a woman on the verge of death was not run lightly. The 30-second clip, depicting a woman apparently in her 20s or 30s gasping for air on a hospital bed, was filmed last year, but the federal government held off running it as no outbreak was deemed serious enough to warrant it. The government has been forced to defend the ad, reminiscent of the &ldquo;Grim Reaper&rdquo; HIV ads in the 1980s, after it prompted a backlash over claims it was too confronting. A more temperate series of ads, depicting arms on ordinary Australians with the moniker &ldquo;Arm Yourself&rdquo;, began last week, but Covid-19 taskforce commander Lieutenant General John Frewen said the escalating situation in Sydney called for a more explicit response. &ldquo;It is absolutely confronting and we didn&rsquo;t use it lightly. 
There was serious consideration given to whether it was required and we took expert advice,&rdquo; he told Today on Monday. &ldquo;It is confronting but leaves people in no doubt about the seriousness of getting Covid, and it seeks to have people stay home, get tested and get vaccinated as quickly as they can.&rdquo; NSW on Sunday confirmed another 77 cases, 31 of which had been in the community while infectious, and Premier Gladys Berejiklian warned she would be &ldquo;shocked&rdquo; if the number did not exceed 100 on Monday. General Frewen said the &ldquo;concerning situation&rdquo; had prompted the government to shift an additional 300,000 doses to NSW over the coming fortnight. &ldquo;The Delta variant is proving to be very difficult to contain, so we&rsquo;re working very closely with NSW authorities and standing ready to help them in any way we can,&rdquo; he said. Agriculture Minister David Littleproud said the ad was designed to shock Sydneysiders into action as the situation deteriorated. &ldquo;This is about shooting home that this is a serious situation and can get anybody. The fact we&rsquo;re actually debating this I think says to me that the campaign we&rsquo;ve approved is working,&rdquo; he said. The age of the woman in the ad has sparked controversy, with most younger Australians still ineligible to receive their vaccine. But with 11 of the 52 people in hospital across NSW under 35, Labor frontbencher Tanya Plibersek warned the Delta variant was &ldquo;hitting younger people as well&rdquo;. Labor had long demanded a national Covid-19 advertising campaign, which Ms Plibersek said was delayed as a result of the government&rsquo;s sluggish vaccine rollout. &ldquo;Perhaps the reason it&rsquo;s taken so long is if you encourage people to go and get vaccinated, you&rsquo;ve got to have enough of the vaccine available. 
We simply haven&rsquo;t; we&rsquo;ve been absolutely behind the eight ball in getting another vaccine for Australians,&rdquo; she told Sunrise. Labor frontbencher Chris Bowen, whose western Sydney electorate was in the grip of the outbreak, said the issue was &ldquo;not vaccine hesitancy so much, it&rsquo;s vaccine scarcity&rdquo;. He accepted there was a role for &ldquo;pointing out the consequences of not getting vaccinated&rdquo; to those that were hesitant about the jab, but said the new campaign lacked the &ldquo;creative spark&rdquo; of the Grim Reaper ads. &ldquo;That was a very tough message, a very stark message, but in a very creative way. I think the government really needs to rethink this advertising campaign from scratch; it&rsquo;s too late, and it&rsquo;s pretty low impact,&rdquo; he told ABC radio. He also dismissed the &ldquo;Arm Yourself&rdquo; campaign as &ldquo;very low energy&rdquo;. &ldquo;I don&rsquo;t think that&rsquo;s going to have any impact,&rdquo; he said.","image":[{"@type":"ImageObject","url":"https://images.perthnow.com.au/publication/C-3376985/6c07502f73bdccd45d879356219c325574873a6d-16x9-x0y444w1151h647.jpg","width":1151,"height":647},{"@type":"ImageObject","url":"https://images.perthnow.com.au/publication/C-3376985/6c07502f73bdccd45d879356219c325574873a6d-4x3-x0y336w1151h863.jpg","width":1151,"height":863}],"thumbnailUrl":"https://images.perthnow.com.au/publication/C-3376985/6c07502f73bdccd45d879356219c325574873a6d-16x9-x0y444w1151h647.jpg","url":"https://www.perthnow.com.au/news/government-defends-graphic-covid-19-ad-after-backlash-c-3376985","author":{"@type":"Organization","name":"NCA NewsWire"},"name":"Government defends graphic Covid-19 ad after backlash"}</script><span itemprop="author name">Jenny Smith</span></span></body></html>''')
    assert metadata.author == 'Jenny Smith'

    metadata = Document()
    metadata = extract_meta_json(html.fromstring('''<html><body><script type="application/ld+json" class="yoast-schema-graph">{"@context":"https://schema.org","@graph":[{"@type":"WebPage","@id":"https://www.bvoltaire.fr/jean-sevillia-letat-francais-et-letat-algerien-doivent-reconnaitre-les-crimes-commis-des-deux-cotes/#webpage","url":"https://www.bvoltaire.fr/jean-sevillia-letat-francais-et-letat-algerien-doivent-reconnaitre-les-crimes-commis-des-deux-cotes/","name":"Jean S\u00e9villia : \"L'\u00c9tat fran\u00e7ais et l'\u00c9tat alg\u00e9rien doivent reconna\u00eetre les crimes commis des deux c\u00f4t\u00e9s\" - Boulevard Voltaire","datePublished":"2018-09-13T12:21:13+00:00","dateModified":"2018-09-14T12:33:14+00:00","inLanguage":"fr-FR"},{"@type":"Article","@id":"https://www.bvoltaire.fr/jean-sevillia-letat-francais-et-letat-algerien-doivent-reconnaitre-les-crimes-commis-des-deux-cotes/#article","isPartOf":{"@id":"https://www.bvoltaire.fr/jean-sevillia-letat-francais-et-letat-algerien-doivent-reconnaitre-les-crimes-commis-des-deux-cotes/#webpage"},"author":{"@id":"https://www.bvoltaire.fr/#/schema/person/96c0ed8f089950c46afc2044cb23e8da"},"headline":"Jean S\u00e9villia : &#8220;L&#8217;\u00c9tat fran\u00e7ais et l&#8217;\u00c9tat alg\u00e9rien doivent reconna\u00eetre les crimes commis des deux c\u00f4t\u00e9s&#8221;","datePublished":"2018-09-13T12:21:13+00:00","dateModified":"2018-09-14T12:33:14+00:00","mainEntityOfPage":{"@id":"https://www.bvoltaire.fr/jean-sevillia-letat-francais-et-letat-algerien-doivent-reconnaitre-les-crimes-commis-des-deux-cotes/#webpage"},"publisher":{"@id":"https://www.bvoltaire.fr/#organization"},"image":{"@id":"https://www.bvoltaire.fr/jean-sevillia-letat-francais-et-letat-algerien-doivent-reconnaitre-les-crimes-commis-des-deux-cotes/#primaryimage"},"keywords":"Guerre 
d'Alg\u00e9rie","articleSection":"Audio,Editoriaux,Entretiens,Histoire","inLanguage":"fr-FR"},{"@type":"Person","@id":"https://www.bvoltaire.fr/#/schema/person/96c0ed8f089950c46afc2044cb23e8da","name":"Jean S\u00e9villia","image":{"@type":"ImageObject","@id":"https://www.bvoltaire.fr/#personlogo","inLanguage":"fr-FR","url":"https://secure.gravatar.com/avatar/1dd0ad5cb1fc3695880af1725477b22e?s=96&d=mm&r=g","caption":"Jean S\u00e9villia"},"description":"R\u00e9dacteur en chef adjoint au Figaro Magazine, membre du comit\u00e9 scientifique du Figaro Histoire, et auteur de biographies et d\u2019essais historiques.","sameAs":["https://www.bvoltaire.fr/"]}]}</script></body></html>'''), metadata)
    assert metadata.author == "Jean Sévillia"

    ### Test for potential errors
    metadata = Document()
    metadata = extract_meta_json(html.fromstring('''
<html><body>
<script type="application/ld+json">
{
  "@context":"http://schema.org",
  "@type":"LiveBlogPosting",
  "@id":"http://techcrunch.com/2015/03/08/apple-watch-event-live-blog",
  "about":{
    "@type":"Event",
    "startDate":"2015-03-09T13:00:00-07:00",
    "name":"Apple Spring Forward Event"
  },
  "coverageStartTime":"2015-03-09T11:30:00-07:00",
  "coverageEndTime":"2015-03-09T16:00:00-07:00",
  "headline":"Apple Spring Forward Event Live Blog",
  "description":"Welcome to live coverage of the Apple Spring Forward …",
  "liveBlogUpdate":{
      "@type":"BlogPosting",
      "headline":"Coming this April, HBO NOW will be available exclusively in the U.S. on Apple TV and the App Store.",
      "datePublished":"2015-03-09T13:08:00-07:00",
      "articleBody": "It's $14.99 a month.<br> And for a limited time, …"
    },
}
</script>
</body></html>'''), metadata)
    assert metadata is not None and metadata.title == 'Apple Spring Forward Event Live Blog'

    ### Test for potential errors
    metadata = Document()
    metadata = extract_meta_json(html.fromstring('''
<html><body>
<script type="application/ld+json">
{
  "@context":"http://schema.org",
  "@type":"LiveBlogPosting",
  "@id":"http://techcrunch.com/2015/03/08/apple-watch-event-live-blog",
  "about":{
    "@type":"Event",
    "startDate":"2015-03-09T13:00:00-07:00",
    "name":"Apple Spring Forward Event"
  },
  "coverageStartTime":"2015-03-09T11:30:00-07:00",
  "coverageEndTime":"2015-03-09T16:00:00-07:00",
  "headline":"Apple Spring Forward Event Live Blog",
  "description":"Welcome to live coverage of the Apple Spring Forward …",
    "liveBlogUpdate": [
    {
      "@type":"BlogPosting",
      "headline":"iPhone is growing at nearly twice the rate of the rest of the smartphone market.",
      "datePublished":"2015-03-09T13:13:00-07:00",
      "image":"http://images.apple.com/live/2015-mar-event/images/573cb_xlarge_2x.jpg"
    },
    {
      "@type":"BlogPosting",
      "headline":"See the new flagship Apple Retail Store in West Lake, China.",
      "datePublished":"2015-03-09T13:17:00-07:00",
      "video":{
        "@type":"VideoObject",
        "thumbnail":"http://images.apple.com/live/2015-mar-event/images/908d2e_large_2x.jpg"
    },
  ]
}
</script>
</body></html>'''), metadata)

    assert metadata is not None and metadata.title == 'Apple Spring Forward Event Live Blog'

    ### Test for potential errors - Missing content on live blog
    metadata = Document()
    metadata = extract_meta_json(html.fromstring('''
    <html><body>
    <script type="application/ld+json">
    {
      "@context":"http://schema.org",
      "@type":"LiveBlogPosting",
      "@id":"http://techcrunch.com/2015/03/08/apple-watch-event-live-blog",
      "about":{
        "@type":"Event",
        "startDate":"2015-03-09T13:00:00-07:00",
        "name":"Apple Spring Forward Event"
      },
      "coverageStartTime":"2015-03-09T11:30:00-07:00",
      "coverageEndTime":"2015-03-09T16:00:00-07:00",
      "headline":"Apple Spring Forward Event Live Blog",
      "description":"Welcome to live coverage of the Apple Spring Forward …"
    }
    </script>
    </body></html>'''), metadata)

    assert metadata is not None and metadata.title == 'Apple Spring Forward Event Live Blog'

    metadata = Document()
    metadata = extract_meta_json(html.fromstring('''
<html><body>
    <script type="application/ld+json">
        {
            "@context": "https://schema.org",
            "@type": "socialmediaposting",
            "name": "The Hitchhiker's Guide to the Galaxy",
            "genre": "comedy science fiction",
            "startDate": "1979-10-12",
            "endDate": "1992-10-12",
            "abstract": "Earthman Arthur Dent is saved by his friend, Ford Prefect—an alien researcher for the titular Hitchhiker's Guide to the Galaxy, which provides info on every planet in the galaxy—from the Earth just before it is destroyed by the alien Vogons.",
            "author": {
                "@type": "Person",
                "givenName": "Douglas",
                "familyName": "Adams",
                "additionalName": "Noel",
                "birthDate": "1952-03-11",
                "birthPlace": {
                    "@type": "Place",
                    "address": "Cambridge, Cambridgeshire, England"
                },
                "deathDate": "2001-05-11",
                "deathPlace": {
                    "@type": "Place",
                    "address": "Highgate Cemetery, London, England"
                }
            }
        }
    </script>
</script>
</body></html>'''), metadata)
    assert metadata is not None and metadata.author == 'Douglas Noel Adams'

    metadata = Document()
    metadata = extract_meta_json(html.fromstring('''
<html><body>
    <script type="application/ld+json">
        {
            "@context":"https://schema.org",
            "@graph":[
                {
                    "@type": "Article",
                    "author":{
                        "name":"John Smith"
                    },
                    "keywords": [
                        "SAFC",
                        "Warwick Thornton"
                    ],
                    "articleSection": [
                        null
                    ],
                    "inLanguage": "en-AU"
                }
            ]
        }
    </script>
</script>
</body></html>'''), metadata)
    assert metadata is not None and len(metadata.categories) == 0

    metadata = Document()
    metadata = extract_meta_json(html.fromstring('''
<html><body>
    <script type="application/ld+json">
        {
            "@context":"http://schema.org",
            "@type":"WebPage",
            "url":"https://7news.com.au/sport/golf/mickelson-comments-hurt-new-league-norman-c-6658099",
            "name":"Mickelson comments hurt new league: Norman | 7NEWS",
            "alternateName":"7NEWS",
            "author":{
                "@type":"Person",
                "name":"7NEWS"
            },
            "description":"Greg Norman says a host of the world's top golfers were set to be involved in his new Saudi-funded golf league until Phil Mikelson's controversial comments.",
            "publisher":{
                "@type":"Organization",
                "name":"7NEWS",
                "url":"https://7news.com.au",
                "logo":{
                "@type":"ImageObject",
                "url":"https://7news.com.au/static/social-images/publisher-logo-60px-high.png",
                "width":600,
                "height":60
                }
            },
            "image":{
                "@type":"ImageObject",
                "url":"https://7news.com.au/static/social-images/share-400x400.png",
                "width":400,
                "height":400
            }
        }
    </script>
    <script type="application/ld+json">
        {
            "@context":"http://schema.org",
            "@type":"NewsArticle",
            "mainEntityOfPage":{
                "@type":"WebPage",
                "@id":"https://7news.com.au/sport/golf/mickelson-comments-hurt-new-league-norman-c-6658099"
            },
            "dateline":"Sydney, AU",
            "publisher":{
                "@type":"Organization",
                "name":"7NEWS",
                "url":"https://7news.com.au",
                "logo":{
                "@type":"ImageObject",
                "url":"https://7news.com.au/static/social-images/publisher-logo-60px-high.png",
                "width":600,
                "height":60
            }
            },
            "keywords":[
                "Sport",
                "Golf"
            ],
            "articleSection":"Golf",
            "headline":"Mickelson comments hurt new league: Norman",
            "description":"Greg Norman says a host of the world's top golfers were set to be involved in his new Saudi-funded golf league until Phil Mikelson's controversial comments.",
            "dateCreated":"2022-05-02T23:20:48.000Z",
            "datePublished":"2022-05-02T23:20:48.000Z",
            "dateModified":"2022-05-02T23:20:50.493Z",
            "isAccessibleForFree":true,
            "isPartOf":{
                "@type":[
                    "CreativeWork",
                    "Product"
                ],
                "name":"7NEWS",
                "productID":"7news.com.au:everyday_digital"
            },
            "articleBody":"Greg Norman has declared that Phil Mickelson's controversial comments complicated matters for the lucrative new golf league he is fronting that is backed by Saudi Arabian investment. "There's no question (it) hurt. It hurt a lot of aspects," Norman told ESPN. "It hurt the PGA Tour. It hurt us. It hurt the game of golf. It hurt Phil. So yeah, across all fronts. It wasn't just specifically to us. But it definitely created negative momentum against us." Two-time major championship winner Norman is the CEO of the LIV Golf Invitational Series, formerly known as the Super Golf League. As the venture was trying to get rolling, comments from Mickelson published on February 15, after a November interview with author Alan Shipnuck, caused a firestorm. Mickelson referenced the killing of Washington Post reporter Jamal Khashoggi and called the Saudi Arabians "scary motherf***ers to get involved with." He went on to explain why he still had interest in joining the Saudi-backed league. "(They) have a horrible record on human rights," Mickelson was said, according to Shipnuck. "They execute people over there for being gay. Knowing all of this, why would I even consider it? Because this is a once-in-a-lifetime opportunity to reshape how the PGA Tour operates. "They've been able to get by with manipulative, coercive, strong-arm tactics because we, the players, had no recourse." Norman said on Monday that nearly one third of the top 50 players in the world were committed to playing in the new golf tour. When Mickelson's comments were revealed, many top players instead reaffirmed their commitment to playing on the PGA Tour. "Quite honestly, we were ready to launch (in February)," Norman said to ESPN. "We had enough players in our strength of field, or minimal viable product, ready to come on board. "And when all of that happened, everybody got the jitters, and the PGA Tour threatened people with lifetime bans and stuff like that." 
Originally set to be a 14-event schedule, the LIV Golf Invitational Series has been restructured to an eight tournament season. Of those, five are expected to take place in the United States with a total of $255 million ($A361m) in prize money. Mickelson has since filed a request to play in an event that conflicts with the PGA Tour, signalling his desire to still play in the LIV Golf Invitational Series. A June 9-11 tournament is set for London and will be open to 48 players set to compete on 12 four-man teams. "I've been very pleasantly surprised," Norman said. "What has been talked about in the media and what is reality are two different things. "We know what's happening with a lot of interest expressed. "From an expectation standpoint, we've got a lot of interest from significantly named players."",
            "image":[
            {
                "@type":"ImageObject",
                "url":"https://images.7news.com.au/publication/C-6658099/47ef62deeb56ad4e91bde98237459c3c66dec3e9-16x9-x0y0w1280h720.jpg",
                "width":1280,
                "height":720
            },
            {
                "@type":"ImageObject",
                "url":"https://images.7news.com.au/publication/C-6658099/47ef62deeb56ad4e91bde98237459c3c66dec3e9-4x3-x160y0w960h720.jpg",
                "width":960,
                "height":720
            }
            ],
            "thumbnailUrl":"https://images.7news.com.au/publication/C-6658099/47ef62deeb56ad4e91bde98237459c3c66dec3e9-16x9-x0y0w1280h720.jpg",
            "url":"https://7news.com.au/sport/golf/mickelson-comments-hurt-new-league-norman-c-6658099",
            "author":[
            {
                "@type":"Person",
                "@id":"/profile/digital-staff",
                "jobTitle":"Writer",
                "name":"Digital Staff",
                "publishingPrinciples":null,
                "description":"Journalist from the 7NEWS.com.au team.",
                "sameas":[
                ],
                "image":null
                }
            ],
            "name":"Mickelson comments hurt new league: Norman"
            }
    </script>
</script>
</body></html>'''), metadata)
    assert metadata is not None and metadata.title == "Mickelson comments hurt new league: Norman" and metadata.sitename == "7NEWS" and metadata.author == "Digital Staff" and "Golf" in metadata.categories

    metadata = Document()
    metadata = extract_meta_json(html.fromstring('''
<html><body>
    <script type="application/ld+json">
        {
            "@context":"http://schema.org",
            "@type":"NewsArticle",
            "author":[
                {
                    "@type":"Person",
                    "name":"Bill Birtles"
                }
            ],
            "dateModified":"2022-05-02T17:58:24+00:00",
            "datePublished":"2022-05-02T17:58:24+00:00",
            "description":"Some Australians caught up in Shanghai's extreme five-week lockdown say the Australian government has done little to help its citizens in distress, and they're pleading with Canberra to arrange charter flights to get them home. ",
            "headline":"Australians stuck in Shanghai's COVID lockdown beg consular officials to help them flee",
            "image":{
                "@type":"ImageObject",
                "height":485,
                "url":"https://live-production.wcms.abc-cdn.net.au/e4c2d55eac0a18fae458413c45915787?impolicy=wcms_crop_resize&cropH=608&cropW=1080&xPos=0&yPos=38&width=862&height=485",
                "width":862
            },
            "keywords":"",
            "mainEntityOfPage":"https://www.abc.net.au/news/2022-05-03/australians-in-shanghai-lockdown-voice-frustration/101031126",
            "publisher":{
                "@type":"Organization",
                "name":"ABC News",
                "logo":{
                    "@type":"ImageObject",
                    "height":60,
                    "url":"https://www.abc.net.au/res/abc/logos/amp-news-logo-60x240.png",
                    "width":240
                }
            }
        }
    </script>
</script>
</body></html>'''), metadata)
    assert metadata is not None and metadata.title == "Australians stuck in Shanghai's COVID lockdown beg consular officials to help them flee" and metadata.author == "Bill Birtles" and metadata.sitename == "ABC News"

    metadata = Document()
    metadata = extract_meta_json(html.fromstring('''
<html><body>
    <script type="application/ld+json">
    {
        "@context":"http://schema.org",
        "@type":"NewsArticle",
        "description":"The city moved into the medium risk level, known as yellow, as it sees a troubling increase in cases and the mayor weighs bringing back some restrictions.",
        "image":[
            {
                "@context":"http://schema.org",
                "@type":"ImageObject",
                "url":"https://static01.nyt.com/images/2022/05/02/multimedia/02nyvirus-alert2/02nyvirus-alert2-videoSixteenByNineJumbo1600.jpg",
                "height":900,
                "width":1600,
                "caption":"Herald Square in Manhattan on Monday. New York City entered the yellow coronavirus risk level, meaning cases rose above 200 per 100,000 residents per week."
            },
            {
                "@context":"http://schema.org",
                "@type":"ImageObject",
                "url":"https://static01.nyt.com/images/2022/05/02/multimedia/02nyvirus-alert2/02nyvirus-alert2-superJumbo.jpg",
                "height":1366,
                "width":2048,
                "caption":"Herald Square in Manhattan on Monday. New York City entered the yellow coronavirus risk level, meaning cases rose above 200 per 100,000 residents per week."
            },
            {
                "@context":"http://schema.org",
                "@type":"ImageObject",
                "url":"https://static01.nyt.com/images/2022/05/02/multimedia/02nyvirus-alert2/02nyvirus-alert2-mediumSquareAt3X.jpg",
                "height":1801,
                "width":1800,
                "caption":"Herald Square in Manhattan on Monday. New York City entered the yellow coronavirus risk level, meaning cases rose above 200 per 100,000 residents per week."
            }
        ],
        "mainEntityOfPage":"https://www.nytimes.com/2022/05/02/nyregion/nyc-coronavirus-yellow-risk-level.html",
        "url":"https://www.nytimes.com/2022/05/02/nyregion/nyc-coronavirus-yellow-risk-level.html",
        "inLanguage":"en",
        "author":[
            {
                "@context":"http://schema.org",
                "@type":"Person",
                "url":"https://www.nytimes.com/by/sharon-otterman",
                "name":"Sharon Otterman"
            },
            {
                "@context":"http://schema.org",
                "@type":"Person",
                "url":"https://www.nytimes.com/by/emma-g-fitzsimmons",
                "name":"Emma G. Fitzsimmons"
            }
        ],
        "dateModified":"2022-05-02T20:21:32.149Z",
        "datePublished":"2022-05-02T14:31:28.000Z",
        "headline":"New York City Enters Higher Coronavirus Risk Level as Case Numbers Rise",
        "publisher":{
            "@id":"https://www.nytimes.com/#publisher"
        },
        "copyrightHolder":{
            "@id":"https://www.nytimes.com/#publisher"
        },
        "sourceOrganization":{
            "@id":"https://www.nytimes.com/#publisher"
        },
        "copyrightYear":2022,
        "isAccessibleForFree":false,
        "hasPart":{
            "@type":"WebPageElement",
            "isAccessibleForFree":false,
            "cssSelector":".meteredContent"
        },
        "isPartOf":{
            "@type":[
                "CreativeWork",
                "Product"
            ],
            "name":"The New York Times",
            "productID":"nytimes.com:basic"
        }
    }
    </script>
    <script type="application/ld+json">
        {
            "@context":"http://schema.org",
            "@type":"NewsMediaOrganization",
            "name":"The New York Times",
            "logo":{
                "@context":"http://schema.org",
                "@type":"ImageObject",
                "url":"https://static01.nyt.com/images/misc/NYT_logo_rss_250x40.png",
                "height":40,
                "width":250
            },
            "url":"https://www.nytimes.com/",
            "@id":"https://www.nytimes.com/#publisher",
            "diversityPolicy":"https://www.nytco.com/diversity-and-inclusion-at-the-new-york-times/",
            "ethicsPolicy":"https://www.nytco.com/who-we-are/culture/standards-and-ethics/",
            "masthead":"https://www.nytimes.com/interactive/2020/09/08/admin/the-new-york-times-masthead.html",
            "foundingDate":"1851-09-18",
            "sameAs":"https://en.wikipedia.org/wiki/The_New_York_Times"
        }
    </script>
</script>
</body></html>'''), metadata)
    assert metadata is not None and metadata.title == "New York City Enters Higher Coronavirus Risk Level as Case Numbers Rise" and metadata.author == "Sharon Otterman; Emma G Fitzsimmons" and metadata.sitename == "The New York Times"

    metadata = Document()
    metadata = extract_meta_json(html.fromstring('''
<html><body>
    <script type="application/ld+json">
        [
            {
                "@type":"NewsArticle",
                "@context":"http://schema.org",
                "headline":"Decreto permite que consumidor cancele serviços de empresas via WhatsApp",
                "description":"O governo federal estabeleceu mudanças nas regras do SAC (Serviço de Atendimento ao Consumidor). Uma das principais alterações é a obrigação das empresas em disponibilizarem a possibilidade de cancelamento de algum serviço através do mesmo canal em que oc",
                "author":{
                    "name":"Caio Mello",
                    "@type":"Person",
                    "url":"https://uol.com.br/"
                },
                "publisher":{
                    "@type":"Organization",
                    "name":"UOL",
                    "url":"https://www.uol.com.br",
                    "logo":{
                        "@type":"ImageObject",
                        "url":"https://conteudo.imguol.com.br/uolamp/UOL_logo_156x60.png",
                        "width":156,
                        "height":60
                    }
                },
                "url":"https://economia.uol.com.br/noticias/redacao/2022/05/02/empresas-devem-permitir-cancelamento-pelo-whatsapp-a-partir-de-outubro.htm",
                "mainEntityOfPage":"https://economia.uol.com.br/noticias/redacao/2022/05/02/empresas-devem-permitir-cancelamento-pelo-whatsapp-a-partir-de-outubro.htm",
                "image":[
                    "https://conteudo.imguol.com.br/c/noticias/82/2022/01/28/telegram-e-whatsapp-na-tela-do-celular-smartphone-wechat-aps-de-mensageria-mensagem-1643388540478_v2_1x1.jpg",
                    "https://conteudo.imguol.com.br/c/noticias/82/2022/01/28/telegram-e-whatsapp-na-tela-do-celular-smartphone-wechat-aps-de-mensageria-mensagem-1643388540478_v2_3x4.jpg",
                    "https://conteudo.imguol.com.br/c/noticias/82/2022/01/28/telegram-e-whatsapp-na-tela-do-celular-smartphone-wechat-aps-de-mensageria-mensagem-1643388540478_v2_4x3.jpg"
                ],
                "datePublished":"2022-05-02T15:40:13-03:00",
                "dateModified":"2022-05-02T17:52:35-03:00"
            }
        ]
    </script>
</script>
</body></html>'''), metadata)
    assert metadata is not None and metadata.title == "Decreto permite que consumidor cancele serviços de empresas via WhatsApp" and metadata.author == "Caio Mello" and metadata.sitename == "UOL"

    metadata = Document()
    metadata = extract_meta_json(html.fromstring('''
<html><body>
    <script type="application/ld+json">
        {
            "@context": "http://schema.org",
            "@type": "NewsArticle",
            "mainEntityOfPage": "https://www.thelocal.de/20190402/12-words-and-phrases-to-help-you-show-off-in-hamburg/",
            "headline": "12 words and phrases you need to survive in Hamburg",
            "datePublished": "2019-04-02T10:18:44+02:00",
            "dateModified": "2022-05-02T16:48:55+02:00",
            "description": "Hamburg is a pretty cosmopolitan place, and you won’t have any problem speaking Hochdeutsch around town. But traditionally, people in the city speak Hamburger Platt, and it&#39;s still very much alive.",
            "keywords": ["language","hamburg"],
            "isAccessibleForFree": true,
            "author": {
                "@type": "Person",
                "name": "Alexander Johnstone",
                "email": "*****@*****.**",
                "url": "https://www.thelocal.de"
            },
            "publisher": {
                "@type": "Organization",
                "name": "The Local",
                "logo": {
                    "@type": "ImageObject",
                    "url": "https://www.thelocal.de/wp-content/themes/thelocal/assets/images/the-local-network-logo-300x60.png",
                    "width": 300,
                    "height": 60
                }
            },
            "isPartOf": {
              "@type": ["CreativeWork", "Product"],
              "name" : "Membership of The Local",
              "productID": "thelocal.de:member"
            },
            "image": {
                "@type": "ImageObject",
                "url": "https://apiwp.thelocal.com/wp-content/uploads/2019/04/c99168bc66462f0a1ae0472c449af4b5c28c9652c1ccf7cb961a8d8cf77b147b.jpg",
                "width": 1500,
                "height": 1000
            },
            "articleBody": "<p><i>Hamburger Platt</i> a quirky variation of Low Saxon (<i>Niederdeutsch</i>), a language which is spoken in northern Germany and eastern parts of the Netherlands.</p><p>It is estimated that are around <a href=\"http://www.hamburg.de/stadtleben/4108598/plattdeutsch-geschichte/\" target=\"_blank\" rel=\"noopener noreferrer\">six million people in eight different German states speak Low Saxon, and around 100,000 speak it in Hamburg itself.</a></p><p><strong>SEE ALSO:<a href=\"https://www.thelocal.de/20190321/local-knowledge-an-insiders-guide-to-life-in-hamburg\"> Local knowledge: an insider guide to life in Hamburg</a></strong></p><p>But it’s on the rise, especially among the young. Some schools teach it from first grade, and there’s even a Hip-Hop group “De fofftig Penns” (“Die fünfzig Pfennige” or \"50 cents\" as pfennig was a former currency) that raps in Plattdeutsch.</p><p>So we thought we should get started on some Hamburger Platt too.</p><p>Here’s a little list of words and phrases to get you started, so that next time you go to Hamburg, you can start to fit in like a true local:</p><p><strong>1. Moin (hello)</strong></p><p><i>Moin</i>, also sometimes <i>moin moin </i>covers a lot of different greetings, as it can mean <i>Guten Morgen</i>, <i>Guten Tag</i> and even <i>Guten Abend</i>. How simple!</p><p><img class=\"size-full wp-image-687931\" src=\"https://apiwp.thelocal.com/wp-content/uploads/2019/04/275024014-scaled.jpg\" alt=\"A shop in Hamburg with the sign 'Moin' outside. \" width=\"2560\" height=\"1673\" /></p><div class=\"post-thumbnail-credit\">A shop in Hamburg with the sign 'Moin' outside. Photo: picture alliance/dpa | Axel Heimken</div><p><strong>2. 
Schnacken (chat)</strong></p><p><i>Schnack</i> is also the word for chit-chat, and someone who speaks Plattdeutsch could be described as a <i>Plattschnacker</i>.</p><div class=\"ml-manual-widget-container\" style=\"height: 50px; border: 1px solid #c3c3c3; background-color: #dedede;\">Manual widget for ML (class=\"ml-manual-widget-container\")</div><p><strong>3. Macker (lad)</strong></p><p>This means a lad or a mate or even a boyfriend. Or you could try the slang term <i>Digga</i>, which is more equivalent to <i>Alter</i>, meaning dude or man, and has become pretty cool recently.</p><p><strong>4. Klock (clock)</strong></p><p>You probably could've guessed this one. It shows how close some of the words are to English. There isn't always that much difference between the Low Saxon and the Anglo Saxon (which is the route of much of the English language).</p><p><strong>5. Schmöken (smoke)</strong></p><p>Another one that just sounds like English in a German accent! When you're in Hamburg you'll see people <i>schnacken</i> while they <i>schmöken</i> outside a restaurant.</p><p><strong>6. Büx (trousers)</strong></p><p><span style=\"font-size: 10px;\"><i><img style=\"width: 640px; height: 383px;\" src=\"https://www.thelocal.com/wp-content/uploads/2019/04/1511264772_1481025162_buex.jpg\" alt=\"\" />A man holds up a huge pair of </i>Büx<i> at the Hamburg tailor's Herrenkleidung Policke, which makes suits for all sizes imaginable. Photo: DPA</i></span></p><p>Perhaps not one you'll use everyday, but there's also the related verb <i>utbüxen,</i> which means to slip away or escape.</p><p><strong>7. Mall (mad)</strong></p><p>You may well hear \"Bist du mall?!\" being bounded around, which means \"Are you out of you mind?!\"</p><p><strong>8. Sabbelknoken (mobile phone)</strong></p><p>It's definitely a bit of a mouthful, but it is still used by some in Hamburg, and literally translates as a \"mouth bone/limb\".</p><p><strong>9. Wat is de Klock? 
(What’s the time?)</strong></p><p>You don't need to be Sherlock to deduce this one either, as it sounds like broken English, but it could come in pretty useful on a visit.</p><p><strong>10. En mol Lütt un Lütt (a beer and a schnapps)</strong></p><p>Here's where you might need Sherlock. This is a classic order in a traditional Hamburg pub, but who would have thought that asking for two Lütt could get you both a beer and a shot?</p><p><strong>11. In’n Tüddel koomm (get confused)</strong></p><p>This one almost sounds like what it means, and you almost have to yodel to say it. If you by mistake stumbled into Herbertstraße off the Reeperbahn, you may well <i>in'n Tüddel koomm</i>.</p><p><strong>12. Du bist mein Schietbüdel (you’re my darling)</strong></p><p>And finally one for if you find the right person in Hamburg. It's become really popular in the last few years, and although it used to be an insult, it's now used as a term of endearment. </p>"
          }
    </script>
</script>
</body></html>'''), metadata)
    assert metadata is not None and metadata.title == "12 words and phrases you need to survive in Hamburg" and metadata.author == "Alexander Johnstone" and metadata.sitename == "The Local"

Example #23

Show file

File: metadata_tests.py Project: adbar/trafilatura

def test_description():
    '''Check that descriptions are picked up from meta elements'''
    # description given through an itemprop attribute
    itemprop_page = '<html><head><meta itemprop="description" content="Description"/></head><body></body></html>'
    result = extract_metadata(itemprop_page)
    assert result.description == 'Description'

    # Open Graph description: the "&amp;#13" sequences present in the content
    # attribute are expected to be absent from the extracted text
    opengraph_page = '<html><head><meta property="og:description" content="&amp;#13; A Northern Territory action plan, which includes plans to support development and employment on Aboriginal land, has received an update. &amp;#13..." /></head><body></body></html>'
    expected = 'A Northern Territory action plan, which includes plans to support development and employment on Aboriginal land, has received an update. ...'
    result = extract_metadata(opengraph_page)
    assert result.description == expected

Example #24

Show file

File: metadata_tests.py Project: vkuberan/trafilatura

def test_pages():
    '''Check the metadata extracted from real web pages obtained through load_mock_page()'''
    # every section follows the same pattern: load a page by URL, extract its
    # metadata, then compare selected fields; commented-out assertions are
    # expectations which are currently disabled

    url = 'http://blog.python.org/2016/12/python-360-is-now-available.html'
    meta = extract_metadata(load_mock_page(url))
    assert meta['title'] == 'Python 3.6.0 is now available!'
    assert meta['description'] == 'Python 3.6.0 is now available! Python 3.6.0 is the newest major release of the Python language, and it contains many new features and opti...'
    assert meta['author'] == 'Ned Deily'
    assert meta['url'] == url
    assert meta['sitename'] == 'blog.python.org'

    url = 'https://en.blog.wordpress.com/2019/06/19/want-to-see-a-more-diverse-wordpress-contributor-community-so-do-we/'
    meta = extract_metadata(load_mock_page(url))
    assert meta['title'] == 'Want to See a More Diverse WordPress Contributor Community? So Do We.'
    assert meta['description'] == 'More diverse speakers at WordCamps means a more diverse community contributing to WordPress — and that results in better software for everyone.'
    assert meta['sitename'] == 'The WordPress.com Blog'
    assert meta['url'] == url

    url = 'https://creativecommons.org/about/'
    meta = extract_metadata(load_mock_page(url))
    assert meta['title'] == 'What we do - Creative Commons'
    assert meta['description'] == 'What is Creative Commons? Creative Commons helps you legally share your knowledge and creativity to build a more equitable, accessible, and innovative world. We unlock the full potential of the internet to drive a new era of development, growth and productivity. With a network of staff, board, and affiliates around the world, Creative Commons provides … Read More "What we do"'
    assert meta['sitename'] == 'Creative Commons'
    assert meta['url'] == url
    # date None

    url = 'https://www.creativecommons.at/faircoin-hackathon'
    meta = extract_metadata(load_mock_page(url))
    assert meta['title'] == 'FairCoin hackathon beim Sommercamp'
    # assert meta['url']='/faircoin-hackathon'

    url = 'https://netzpolitik.org/2016/die-cider-connection-abmahnungen-gegen-nutzer-von-creative-commons-bildern/'
    meta = extract_metadata(load_mock_page(url))
    assert meta['title'] == 'Die Cider Connection: Abmahnungen gegen Nutzer von Creative-Commons-Bildern'
    assert meta['author'] == 'Markus Reuter'
    assert meta['description'] == 'Seit Dezember 2015 verschickt eine Cider Connection zahlreiche Abmahnungen wegen fehlerhafter Creative-Commons-Referenzierungen. Wir haben recherchiert und legen jetzt das Netzwerk der Abmahner offen.'
    assert meta['sitename'] == 'netzpolitik.org'
    # cats + tags
    assert meta['url'] == url

    url = 'https://www.befifty.de/home/2017/7/12/unter-uns-montauk'
    meta = extract_metadata(load_mock_page(url))
    assert meta['title'] == 'Das vielleicht schönste Ende der Welt: Montauk'
    assert meta['author'] == 'Beate Finken'
    assert meta['description'] == 'Ein Strand, ist ein Strand, ist ein Strand Ein Strand, ist ein Strand, ist ein Strand. Von wegen! In Italien ist alles wohl organisiert, Handtuch an Handtuch oder Liegestuhl an Liegestuhl. In der Karibik liegt man unter Palmen im Sand und in Marbella dominieren Beton und eine kerzengerade Promenade'
    assert meta['sitename'] == 'BeFifty'
    assert meta['categories'] == ['Travel', 'Amerika']
    assert meta['url'] == url

    url = 'https://www.soundofscience.fr/1927'
    meta = extract_metadata(load_mock_page(url))
    assert meta['title'] == 'Une candidature collective à la présidence du HCERES'
    assert meta['author'] == 'Martin Clavey'
    assert meta['description'].startswith('En réaction à la candidature du conseiller recherche')
    assert meta['sitename'] == 'The Sound Of Science'
    assert meta['categories'] == ['Politique scientifique française']
    # assert meta['tags'] == ['évaluation', 'HCERES']
    assert meta['url'] == url

    url = 'https://laviedesidees.fr/L-evaluation-et-les-listes-de.html'
    meta = extract_metadata(load_mock_page(url))
    assert meta['title'] == 'L’évaluation et les listes de revues'
    assert meta['author'] == 'Florence Audier'
    assert meta['description'].startswith("L'évaluation, et la place")
    assert meta['sitename'] == 'La Vie des idées'
    # assert meta['categories'] == ['Essai', 'Économie']
    assert meta['tags'] == []
    # <meta property="og:type" content="article" />
    # <meta name="DC:type" content="journalArticle">
    # the expected URL differs from the one used for loading (http + www)
    assert meta['url'] == 'http://www.laviedesidees.fr/L-evaluation-et-les-listes-de.html'

    url = 'https://www.theguardian.com/education/2020/jan/20/thousands-of-uk-academics-treated-as-second-class-citizens'
    meta = extract_metadata(load_mock_page(url))
    assert meta['title'] == "Thousands of UK academics 'treated as second-class citizens'"
    assert meta['author'] == 'Richard Adams'
    assert meta['description'].startswith('Report claims higher education institutions')
    assert meta['sitename'] == 'The Guardian'  # originally "the Guardian"
    assert meta['categories'] == ['Education']
    # assert meta['tags'] == [] ## TODO: check tags
    # meta name="keywords"
    # the expected URL differs from the one used for loading (http instead of https)
    assert meta['url'] == 'http://www.theguardian.com/education/2020/jan/20/thousands-of-uk-academics-treated-as-second-class-citizens'

    url = 'https://phys.org/news/2019-10-flint-flake-tool-partially-birch.html'
    meta = extract_metadata(load_mock_page(url))
    assert meta['title'] == 'Flint flake tool partially covered by birch tar adds to evidence of Neanderthal complex thinking'
    assert meta['author'] == 'Bob Yirka'
    assert meta['description'] == 'A team of researchers affiliated with several institutions in The Netherlands has found evidence in small a cutting tool of Neanderthals using birch tar. In their paper published in Proceedings of the National Academy of Sciences, the group describes the tool and what it revealed about Neanderthal technology.'
    # assert meta['sitename'] == 'Phys'
    # assert meta['categories'] == ['Archaeology', 'Fossils']
    assert meta['tags'] == ["Science, Physics News, Science news, Technology News, Physics, Materials, Nanotech, Technology, Science"]
    assert meta['url'] == url

    # meta = extract_metadata(load_mock_page('https://gregoryszorc.com/blog/2020/01/13/mercurial%27s-journey-to-and-reflections-on-python-3/'))
    # assert meta['title'] == "Mercurial's Journey to and Reflections on Python 3"
    # assert meta['author'] == 'Gregory Szorc'
    # assert meta['description'] == 'Description of the experience of making Mercurial work with Python 3'
    # assert meta['sitename'] == 'gregoryszorc'
    # assert meta['categories'] == ['Python', 'Programming']

    url = 'https://www.pluralsight.com/tech-blog/managing-python-environments/'
    meta = extract_metadata(load_mock_page(url))
    assert meta['title'] == 'Managing Python Environments'
    assert meta['author'] == 'John Walk'
    assert meta['description'].startswith("If you're not careful,")
    # assert meta['sitename'] == 'Pluralsight'
    # assert meta['categories'] == ['Python', 'Programming']
    assert meta['url'] == url

    url = 'https://stackoverflow.blog/2020/01/20/what-is-rust-and-why-is-it-so-popular/'
    meta = extract_metadata(load_mock_page(url))
    assert meta['title'] == 'What is Rust and why is it so popular? - Stack Overflow Blog'
    # assert meta['author'] == 'Jake Goulding'
    assert meta['sitename'] == 'Stack Overflow Blog'
    assert meta['categories'] == ['Bulletin']
    assert meta['tags'] == ['programming', 'rust']
    assert meta['url'] == url

    url = 'https://www.dw.com/en/berlin-confronts-germanys-colonial-past-with-new-initiative/a-52060881'
    meta = extract_metadata(load_mock_page(url))
    assert "Berlin confronts Germany's colonial past with new initiative" in meta['title']
    # assert meta['author'] == 'Ben Knight' # "Deutsche Welle (www.dw.com)"
    assert meta['description'] == "The German capital has launched a five-year project to mark its part in European colonialism. Streets which still honor leaders who led the Reich's imperial expansion will be renamed — and some locals aren't happy."
    assert meta['sitename'] == 'DW.COM'  # 'DW - Deutsche Welle'
    # assert meta['categories'] == ['Colonialism', 'History', 'Germany']
    assert meta['url'] == url

    url = 'https://www.theplanetarypress.com/2020/01/management-of-intact-forestlands-by-indigenous-peoples-key-to-protecting-climate/'
    meta = extract_metadata(load_mock_page(url))
    # assert meta['title'] == 'Management of Intact Forestlands by Indigenous Peoples Key to Protecting Climate'
    # assert meta['author'] == 'Julie Mollins'
    assert meta['sitename'] == 'The Planetary Press'
    # assert meta['categories'] == ['Indigenous People', 'Environment']
    assert meta['url'] == url

    url = 'https://wikimediafoundation.org/news/2020/01/15/access-to-wikipedia-restored-in-turkey-after-more-than-two-and-a-half-years/'
    meta = extract_metadata(load_mock_page(url))
    assert meta['title'] == 'Access to Wikipedia restored in Turkey after more than two and a half years'
    assert meta['author'] == 'Wikimedia Foundation'
    # assert meta['description'] == 'Report about the restored accessibility of Wikipedia in Turkey'
    assert meta['sitename'] == 'Wikimedia Foundation'
    # assert meta['categories'] == ['Politics', 'Turkey', 'Wikipedia']
    assert meta['url'] == url

    url = 'https://www.reuters.com/article/us-awards-sag/parasite-scores-upset-at-sag-awards-boosting-oscar-chances-idUSKBN1ZI0EH'
    meta = extract_metadata(load_mock_page(url))
    # only the end of the title is compared; it starts with &#039;Parasite&#039;
    assert meta['title'].endswith('scores historic upset at SAG awards, boosting Oscar chances')
    assert meta['author'] == 'Jill Serjeant'
    assert meta['date'] == '2020-01-20'
    # assert meta['description'] == '“Parasite,” the Korean language social satire about the wealth gap in South Korea, was the first film in a foreign language to win the top prize of best cast ensemble in the 26 year-history of the SAG awards.'
    # assert meta['sitename'] == 'Reuters'
    # assert meta['categories'] == ['Parasite', 'SAG awards', 'Cinema']
    # assert meta['url'] == 'https://www.reuters.com/article/us-awards-sag-idUSKBN1ZI0EH'

    url = 'https://www.nationalgeographic.co.uk/environment-and-conservation/2020/01/ravenous-wild-goats-ruled-island-over-century-now-its-being'
    meta = extract_metadata(load_mock_page(url))
    assert meta['title'] == "Ravenous wild goats ruled this island for over a century. Now, it's being reborn."
    assert meta['author'] == 'Michael Hingston'
    assert meta['description'].startswith('The rocky island of Redonda, once stripped of its flora and fauna')
    assert meta['sitename'] == 'National Geographic'
    # assert meta['categories'] == ['Goats', 'Environment', 'Redonda']
    assert meta['url'] == url

    url = 'https://www.nature.com/articles/d41586-019-02790-3'
    meta = extract_metadata(load_mock_page(url))
    assert meta['title'] == 'Gigantic Chinese telescope opens to astronomers worldwide'
    assert meta['author'] == 'Elizabeth Gibney'
    assert meta['description'] == 'FAST has superior sensitivity to detect cosmic phenomena, including fast radio bursts and pulsars.'
    # assert meta['sitename'] == 'Nature'
    # assert meta['categories'] == ['Astronomy', 'Telescope', 'China']
    assert meta['url'] == url

    url = 'https://www.scmp.com/comment/opinion/article/3046526/taiwanese-president-tsai-ing-wens-political-playbook-should-be'
    meta = extract_metadata(load_mock_page(url))
    # '<h1 data-v-1223d442="" class="inner__main-headline main-headline">Taiwanese President Tsai Ing-wen’s political playbook should be essential reading for Hong Kong leader Carrie Lam</h1>'
    assert meta['title'] == 'Carrie Lam should study Tsai Ing-wen’s playbook'
    # author in JSON-LD
    assert meta['author'] == 'Alice Wu'
    assert meta['url'] == url

    url = 'https://www.faz.net/aktuell/wirtschaft/nutzerbasierte-abrechnung-musik-stars-fordern-neues-streaming-modell-16604622.html'
    meta = extract_metadata(load_mock_page(url))
    assert meta['title'] == 'Nutzerbasierte Abrechnung: Musik-Stars fordern neues Streaming-Modell'
    # author overridden from JSON-LD + double name
    assert 'Benjamin Fischer' in meta['author']
    assert meta['sitename'] == 'Frankfurter Allgemeine Zeitung'
    assert meta['url'] == 'https://www.faz.net/1.6604622'

    url = 'https://boingboing.net/2013/07/19/hating-millennials-the-preju.html'
    meta = extract_metadata(load_mock_page(url))
    assert meta['title'] == "Hating Millennials - the prejudice you're allowed to boast about"
    assert meta['author'] == 'Cory Doctorow'
    assert meta['sitename'] == 'Boing Boing'
    assert meta['url'] == url

    url = 'https://www.gofeminin.de/abnehmen/wie-kann-ich-schnell-abnehmen-s1431651.html'
    meta = extract_metadata(load_mock_page(url))
    assert meta['title'] == 'Wie kann ich schnell abnehmen? Der Schlachtplan zum Wunschgewicht'
    assert meta['author'] == 'Diane Buckstegge'
    assert meta['sitename'] == 'Gofeminin'  # originally "gofeminin"
    assert meta['url'] == url

    url = 'https://github.blog/2019-03-29-leader-spotlight-erin-spiceland/'
    meta = extract_metadata(load_mock_page(url))
    assert meta['title'] == 'Leader spotlight: Erin Spiceland'
    assert meta['author'] == 'Jessica Rudder'
    assert meta['description'].startswith('We’re spending Women’s History')
    assert meta['sitename'] == 'The GitHub Blog'
    assert meta['categories'] == ['Community']
    assert meta['url'] == url

    url = 'https://www.spiegel.de/spiegel/print/d-161500790.html'
    meta = extract_metadata(load_mock_page(url))
    assert meta['title'] == 'Ein Albtraum'
    # assert meta['author'] == 'Clemens Höges'

    url = 'https://www.salon.com/2020/01/10/despite-everything-u-s-emissions-dipped-in-2019_partner/'
    meta = extract_metadata(load_mock_page(url))
    assert meta['title'] == 'Despite everything, U.S. emissions dipped in 2019'
    # in JSON-LD
    assert meta['author'] == 'Nathanael Johnson'
    assert meta['sitename'] == 'Salon.com'
    # in header
    assert 'Science & Health' in meta['categories']
    assert 'Gas Industry' in meta['tags'] and 'coal emissions' in meta['tags']
    assert meta['url'] == url

    url = 'https://www.ndr.de/nachrichten/info/16-Coronavirus-Update-Wir-brauchen-Abkuerzungen-bei-der-Impfstoffzulassung,podcastcoronavirus140.html'
    meta = extract_metadata(load_mock_page(url))
    assert meta['url'] == url
    assert 'Korinna Hennig' in meta['author']
    assert 'Ältere Menschen' in str(meta['tags'])

Example #25

Show file

File: metadata_tests.py Project: adbar/trafilatura

def test_authors():
    '''Test the extraction of author names.

    Covers, in order: direct normalization of raw author strings, the
    ``author_blacklist`` option of ``extract_metadata``, author names found
    in ``<meta>`` elements, and author names found in body elements
    (links, spans, headings, bylines and similar containers).
    '''
    # normalization
    assert normalize_authors(None, 'abc') == 'Abc'
    assert normalize_authors(None, 'Steve Steve 123') == 'Steve Steve'
    assert normalize_authors(None, 'By Steve Steve') == 'Steve Steve'
    assert normalize_json('Test \\nthis') == 'Test this'
    # blacklist: a blacklisted name must not be reported as author
    metadata = extract_metadata('<html><head><meta itemprop="author" content="Jenny Smith"/></head><body></body></html>', author_blacklist={'Jenny Smith'})
    # identity comparison with the None singleton, as in the checks further below
    assert metadata.author is None
    # extraction from <meta> elements
    metadata = extract_metadata('<html><head><meta itemprop="author" content="Jenny Smith"/></head><body></body></html>')
    assert metadata.author == 'Jenny Smith'
    metadata = extract_metadata('<html><head><meta itemprop="author" content="Jenny Smith"/><meta itemprop="author" content="John Smith"/></head><body></body></html>')
    assert metadata.author == 'Jenny Smith; John Smith'
    metadata = extract_metadata('<html><head><meta itemprop="author" content="Jenny Smith und John Smith"/></head><body></body></html>')
    assert metadata.author == 'Jenny Smith; John Smith'
    metadata = extract_metadata('<html><head><meta name="author" content="Jenny Smith"/><meta name="author" content="John Smith"/></head><body></body></html>')
    assert metadata.author == 'Jenny Smith; John Smith'
    metadata = extract_metadata('<html><head><meta name="author" content="Jenny Smith and John Smith"/></head><body></body></html>')
    assert metadata.author == 'Jenny Smith; John Smith'
    metadata = extract_metadata('<html><head><meta name="author" content="Jenny Smith"/></head><body></body></html>')
    assert metadata.author == 'Jenny Smith'
    # HTML character reference in the attribute value
    metadata = extract_metadata('<html><head><meta name="author" content="Hank O&#39;Hop"/></head><body></body></html>')
    assert metadata.author == 'Hank O\'Hop'
    # trailing emoji is expected to be dropped
    metadata = extract_metadata('<html><head><meta name="author" content="Jenny Smith ❤️"/></head><body></body></html>')
    assert metadata.author == 'Jenny Smith'
    metadata = extract_metadata('<html><head><meta name="citation_author" content="Jenny Smith and John Smith"/></head><body></body></html>')
    assert metadata.author == 'Jenny Smith; John Smith'
    metadata = extract_metadata('<html><head><meta property="author" content="Jenny Smith"/><meta property="author" content="John Smith"/></head><body></body></html>')
    assert metadata.author == 'Jenny Smith; John Smith'
    metadata = extract_metadata('<html><head><meta itemprop="author" content="Jenny Smith and John Smith"/></head><body></body></html>')
    assert metadata.author == 'Jenny Smith; John Smith'
    metadata = extract_metadata('<html><head><meta name="article:author" content="Jenny Smith"/></head><body></body></html>')
    assert metadata.author == 'Jenny Smith'
    # extraction from elements in the body
    metadata = extract_metadata('<html><body><a href="" rel="author">Jenny Smith</a></body></html>')
    assert metadata.author == 'Jenny Smith'
    metadata = extract_metadata('<html><body><a href="" rel="author">Jenny "The Author" Smith</a></body></html>')
    assert metadata.author == 'Jenny Smith'
    metadata = extract_metadata('<html><body><span class="author">Jenny Smith</span></body></html>')
    assert metadata.author == 'Jenny Smith'
    metadata = extract_metadata('<html><body><h4 class="author">Jenny Smith</h4></body></html>')
    assert metadata.author == 'Jenny Smith'
    metadata = extract_metadata('<html><body><h4 class="author">Jenny Smith — Trafilatura</h4></body></html>')
    assert metadata.author == 'Jenny Smith'
    metadata = extract_metadata('<html><body><span class="wrapper--detail__writer">Jenny Smith</span></body></html>')
    assert metadata.author == 'Jenny Smith'
    metadata = extract_metadata('<html><body><span id="author-name">Jenny Smith</span></body></html>')
    assert metadata.author == 'Jenny Smith'
    # author-like elements nested in a figure or a sidebar must yield no author
    metadata = extract_metadata('<html><body><figure data-component="Figure"><div class="author">Jenny Smith</div></figure></body></html>')
    assert metadata.author is None
    # NOTE(review): the markup below closes the outer <div> with </figure>;
    # possibly a copy-paste slip in the fixture -- confirm whether intentional
    metadata = extract_metadata('<html><body><div class="sidebar"><div class="author">Jenny Smith</div></figure></body></html>')
    assert metadata.author is None
    metadata = extract_metadata('<html><body><span class="author">Jenny Smith and John Smith</span></body></html>')
    assert metadata.author == 'Jenny Smith; John Smith'
    metadata = extract_metadata('<html><body><a class="author">Jenny Smith</a></body></html>')
    assert metadata.author == 'Jenny Smith'
    metadata = extract_metadata('<html><body><a class="author">Jenny Smith <div class="title">Editor</div></a></body></html>')
    assert metadata.author == 'Jenny Smith'
    metadata = extract_metadata('<html><body><a class="author">Jenny Smith from Trafilatura</a></body></html>')
    assert metadata.author == 'Jenny Smith'
    # the blacklisted <meta> author is skipped and the link text is used instead
    metadata = extract_metadata('<html><body><meta itemprop="author" content="Fake Author"/><a class="author">Jenny Smith from Trafilatura</a></body></html>', author_blacklist={'Fake Author'})
    assert metadata.author == 'Jenny Smith'
    metadata = extract_metadata('<html><body><a class="username">Jenny Smith</a></body></html>')
    assert metadata.author == 'Jenny Smith'
    metadata = extract_metadata('<html><body><div class="submitted-by"><a>Jenny Smith</a></div></body></html>')
    assert metadata.author == 'Jenny Smith'
    metadata = extract_metadata('<html><body><div class="byline-content"><div class="byline"><a>Jenny Smith</a></div><time>July 12, 2021 08:05</time></div></body></html>')
    assert metadata.author == 'Jenny Smith'
    metadata = extract_metadata('<html><body><h3 itemprop="author">Jenny Smith</h3></body></html>')
    assert metadata.author == 'Jenny Smith'
    metadata = extract_metadata('<html><body><div data-component="Byline">Jenny Smith</div></body></html>')
    assert metadata.author == 'Jenny Smith'
    metadata = extract_metadata('<html><body><span id="author">Jenny Smith</span></body></html>')
    assert metadata.author == 'Jenny Smith'
    # underscore between name parts is expected to come out as a space
    metadata = extract_metadata('<html><body><span id="author">Jenny_Smith</span></body></html>')
    assert metadata.author == 'Jenny Smith'
    # comma-separated list of names is expected to be joined with '; '
    metadata = extract_metadata('<html><body><span itemprop="author name">Shannon Deery, Mitch Clarke, Susie O’Brien, Laura Placella, Kara Irving, Jordy Atkinson, Suzan Delibasic</span></body></html>')
    assert metadata.author == 'Shannon Deery; Mitch Clarke; Susie O’Brien; Laura Placella; Kara Irving; Jordy Atkinson; Suzan Delibasic'
    metadata = extract_metadata('<html><body><address class="author">Jenny Smith</address></body></html>')
    assert metadata.author == 'Jenny Smith'
    metadata = extract_metadata('<html><body><author>Jenny Smith</author></body></html>')
    assert metadata.author == 'Jenny Smith'
    # real-world style bylines with social handles, prefixes and nested markup
    metadata = extract_metadata('<html><body><div class="author"><span class="profile__name"> Jenny Smith </span> <a href="https://twitter.com/jenny_smith" class="profile__social" target="_blank"> @jenny_smith </a> <span class="profile__extra lg:hidden"> 11:57AM </span> </div></body></html>')
    assert metadata.author == 'Jenny Smith'
    metadata = extract_metadata('<html><body><p class="author-section byline-plain">By <a class="author" rel="nofollow">Jenny Smith For Daily Mail Australia</a></p></body></html>')
    assert metadata.author == 'Jenny Smith'
    metadata = extract_metadata('<html><body><div class="o-Attribution__a-Author"><span class="o-Attribution__a-Author--Label">By:</span><span class="o-Attribution__a-Author--Prefix"><span class="o-Attribution__a-Name"><a href="//web.archive.org/web/20210707074846/https://www.discovery.com/profiles/ian-shive">Ian Shive</a></span></span></div></body></html>')
    assert metadata.author == 'Ian Shive'
    metadata = extract_metadata('<html><body><div class="ArticlePage-authors"><div class="ArticlePage-authorName" itemprop="name"><span class="ArticlePage-authorBy">By&nbsp;</span><a aria-label="Ben Coxworth" href="https://newatlas.com/author/ben-coxworth/"><span>Ben Coxworth</span></a></div></div></body></html>')
    assert metadata.author == 'Ben Coxworth'