Example #1
def extract_opengraph(tree):
    '''Search meta tags following the OpenGraph guidelines (https://ogp.me/)'''
    title, author, url, description, site_name = (None, ) * 5
    # detect OpenGraph schema
    for elem in tree.xpath('//head/meta[starts-with(@property, "og:")]'):
        # safeguard
        if not elem.get('content'):
            continue
        # site name
        if elem.get('property') == 'og:site_name':
            site_name = elem.get('content')
        # blog title
        elif elem.get('property') == 'og:title':
            title = elem.get('content')
        # orig URL
        elif elem.get('property') == 'og:url':
            if validate_url(elem.get('content'))[0] is True:
                url = elem.get('content')
        # description
        elif elem.get('property') == 'og:description':
            description = elem.get('content')
        # og:author
        elif elem.get('property') in ('og:author', 'og:article:author'):
            author = elem.get('content')
        # og:type
        #elif elem.get('property') == 'og:type':
        #    pagetype = elem.get('content')
        # og:locale
        #elif elem.get('property') == 'og:locale':
        #    pagelocale = elem.get('content')
    return trim(title), trim(author), trim(url), trim(description), trim(site_name)
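A minimal usage sketch (not part of the original listing), assuming extract_opengraph is called from the module that also defines trim() and validate_url(), and that the page is parsed with lxml:
from lxml import html

SAMPLE = ('<html><head>'
          '<meta property="og:title" content="Sample title"/>'
          '<meta property="og:site_name" content="Example site"/>'
          '</head><body></body></html>')

tree = html.fromstring(SAMPLE)
title, author, url, description, site_name = extract_opengraph(tree)
# title == 'Sample title', site_name == 'Example site';
# the remaining fields stay None (assuming trim() passes None through)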
Example #2
def extract_url(tree, default_url=None):
    '''Extract the URL from the canonical link'''
    # https://www.tutorialrepublic.com/html-reference/html-base-tag.php
    # default url as fallback
    url = default_url
    # try canonical link first
    element = tree.find('.//head//link[@rel="canonical"]')
    if (element is not None and 'href' in element.attrib
            and URL_COMP_CHECK.match(element.attrib['href'])):
        url = element.attrib['href']
    # try default language link
    else:
        for element in tree.iterfind('.//head//link[@rel="alternate"]'):
            if element.get('hreflang') == 'x-default':
                # guard against alternate links without an href attribute
                if URL_COMP_CHECK.match(element.get('href', '')):
                    LOGGER.debug(html.tostring(element, pretty_print=False, encoding='unicode').strip())
                    url = element.attrib['href']
    # add domain name if it's missing
    if url is not None and url.startswith('/'):
        for element in tree.iterfind('.//head//meta[@content]'):
            if 'name' in element.attrib:
                attrtype = element.attrib['name']
            elif 'property' in element.attrib:
                attrtype = element.attrib['property']
            else:
                continue
            if attrtype.startswith('og:') or attrtype.startswith('twitter:'):
                domain_match = re.match(r'https?://[^/]+',
                                        element.attrib['content'])
                if domain_match:
                    # prepend URL
                    url = domain_match.group(0) + url
                    break
    # sanity check: don't return invalid URLs
    if url is not None:
        validation_result, parsed_url = validate_url(url)
        if validation_result is False:
            url = None
        else:
            url = normalize_url(parsed_url)
    return url
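A hedged usage sketch: extract_url also relies on URL_COMP_CHECK, validate_url() and normalize_url() from the surrounding module, so the printed values below are only indicative.
from lxml import html

PAGE = ('<html><head>'
        '<link rel="canonical" href="https://www.example.org/posts/1"/>'
        '</head><body></body></html>')

tree = html.fromstring(PAGE)
print(extract_url(tree))                                      # canonical link is picked up
print(extract_url(tree, default_url='https://example.org/'))  # default only kept as a fallback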
Example #3
def examine_meta(tree):
    '''Search meta tags for relevant information'''
    metadata = dict.fromkeys([
        'title', 'author', 'url', 'hostname', 'description', 'sitename',
        'date', 'categories', 'tags'
    ])
    # bootstrap from potential OpenGraph tags
    title, author, url, description, site_name = extract_opengraph(tree)
    # test if all return values have been assigned
    if all((title, author, url, description, site_name)):  # if they are all defined
        metadata['title'], metadata['author'], metadata['url'] = title, author, url
        metadata['description'], metadata['sitename'] = description, site_name
        return metadata
    tags = []
    # skim through meta tags
    for elem in tree.xpath('//head/meta[@content]'):
        # content
        if not elem.get('content'):
            continue
        content_attr = elem.get('content')
        # image info
        # ...
        # property
        if 'property' in elem.attrib:
            # no opengraph a second time
            if elem.get('property').startswith('og:'):
                continue
            if elem.get('property') == 'article:tag':
                tags.append(content_attr)
            elif elem.get('property') in ('author', 'article:author'):
                if author is None:
                    author = content_attr
        # name attribute
        elif 'name' in elem.attrib:  # elem.get('name') is not None:
            # author
            if elem.get('name') in ('author', 'byl', 'dc.creator',
                                    'sailthru.author'):  # twitter:creator
                if author is None:
                    author = content_attr
            # title
            elif elem.get('name') in ('title', 'dc.title', 'sailthru.title',
                                      'twitter:title'):
                if title is None:
                    title = content_attr
            # description
            elif elem.get('name') in ('description', 'dc.description',
                                      'dc:description', 'sailthru.description',
                                      'twitter:description'):
                if description is None:
                    description = content_attr
            # site name
            elif elem.get('name') in (
                    'publisher', 'DC.publisher', 'twitter:site',
                    'application-name'
            ) or 'twitter:app:name' in elem.get('name'):
                if site_name is None:
                    site_name = content_attr
            # url
            elif elem.get('name') == 'twitter:url':
                if url is None and validate_url(content_attr)[0] is True:
                    url = content_attr
            # keywords
            elif elem.get('name') == 'keywords':  # 'page-topic'
                tags.append(content_attr)
        elif 'itemprop' in elem.attrib:
            if elem.get('itemprop') == 'author':
                if author is None:
                    author = content_attr
            elif elem.get('itemprop') == 'description':
                if description is None:
                    description = content_attr
            # to verify:
            #elif elem.get('itemprop') == 'name':
            #    if title is None:
            #        title = elem.get('content')
        # other types
        else:
            if all(key not in elem.attrib for key in ('charset', 'http-equiv', 'property')):
                LOGGER.debug(html.tostring(elem, pretty_print=False, encoding='unicode').strip())
    metadata['title'], metadata['author'], metadata['url'] = title, author, url
    metadata['description'], metadata['sitename'], metadata['tags'] = description, site_name, tags
    return metadata
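For this dict-based variant, a short illustrative call could look as follows (the meta tags are made up for the example; LOGGER and the helper functions are assumed to come from the same module):
from lxml import html

DOC = ('<html><head>'
       '<meta name="author" content="Jane Doe"/>'
       '<meta name="description" content="A short summary."/>'
       '<meta property="article:tag" content="python"/>'
       '</head><body></body></html>')

metadata = examine_meta(html.fromstring(DOC))
print(metadata['author'], metadata['description'], metadata['tags'])
# roughly: Jane Doe A short summary. ['python']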
Example #4
def examine_meta(tree):
    '''Search meta tags for relevant information'''
    metadata = dict.fromkeys(METADATA_LIST)
    # bootstrap from potential OpenGraph tags
    title, author, url, description, site_name = extract_opengraph(tree)
    # test if all return values have been assigned
    if all((title, author, url, description, site_name)):  # if they are all defined
        metadata['title'], metadata['author'], metadata['url'] = title, author, url
        metadata['description'], metadata['sitename'] = description, site_name
        return metadata
    tags, backup_sitename = [], None
    # skim through meta tags
    for elem in tree.iterfind('.//head/meta[@content]'):
        # content
        if not elem.get('content'):
            continue
        content_attr = elem.get('content')
        # image info
        # ...
        # property
        if 'property' in elem.attrib:
            # no opengraph a second time
            if elem.get('property').startswith('og:'):
                continue
            if elem.get('property') == 'article:tag':
                tags.append(content_attr)
            elif elem.get('property') in ('author', 'article:author'):
                author = author or content_attr
        # name attribute
        elif 'name' in elem.attrib:
            name_attr = elem.get('name').lower()
            # author
            if name_attr in METANAME_AUTHOR:
                author = author or content_attr
            # title
            elif name_attr in METANAME_TITLE:
                title = title or content_attr
            # description
            elif name_attr in METANAME_DESCRIPTION:
                description = description or content_attr
            # site name
            elif name_attr in METANAME_PUBLISHER:
                site_name = site_name or content_attr
            elif (name_attr in ('twitter:site', 'application-name')
                  or 'twitter:app:name' in elem.get('name')):
                backup_sitename = content_attr
            # url
            elif name_attr == 'twitter:url':
                if url is None and validate_url(content_attr)[0] is True:
                    url = content_attr
            # keywords
            elif name_attr == 'keywords':  # 'page-topic'
                tags.append(content_attr)
        elif 'itemprop' in elem.attrib:
            if elem.get('itemprop') == 'author':
                author = author or content_attr
            elif elem.get('itemprop') == 'description':
                description = description or content_attr
            elif elem.get('itemprop') == 'headline':
                title = title or content_attr
            # to verify:
            #elif elem.get('itemprop') == 'name':
            #    if title is None:
            #        title = elem.get('content')
        # other types
        else:
            if not any(key in elem.attrib
                       for key in ('charset', 'http-equiv', 'property')):
                LOGGER.debug(
                    'unknown attribute: %s',
                    html.tostring(elem, pretty_print=False,
                                  encoding='unicode').strip())
    # backups
    if site_name is None and backup_sitename is not None:
        site_name = backup_sitename
    # copy
    metadata['title'], metadata['author'], metadata['url'] = title, author, url
    metadata['description'], metadata['sitename'], metadata['tags'] = description, site_name, tags
    return metadata
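A quick sketch of the backup_sitename path introduced in this variant (the METANAME_* constants are not shown in the listing, so the exact matching sets are an assumption):
from lxml import html

DOC = ('<html><head>'
       '<meta name="twitter:site" content="@example"/>'
       '<meta name="keywords" content="parsing, metadata"/>'
       '</head><body></body></html>')

metadata = examine_meta(html.fromstring(DOC))
print(metadata['sitename'], metadata['tags'])
# roughly: @example ['parsing, metadata'] -- twitter:site only fills sitename as a backup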
Example #5
def examine_meta(tree):
    '''Search meta tags for relevant information'''
    metadata = Document()  # alt: Metadata()
    # bootstrap from potential OpenGraph tags
    title, author, url, description, site_name = extract_opengraph(tree)
    # test if all return values have been assigned
    if all((title, author, url, description, site_name)):  # if they are all defined
        metadata.title, metadata.author, metadata.url = title, author, url
        metadata.description, metadata.sitename = description, site_name
        return metadata
    tags, backup_sitename = [], None
    # skim through meta tags
    for elem in tree.iterfind('.//head/meta[@content]'):
        # content
        if not elem.get('content'):
            continue
        content_attr = HTML_STRIP_TAG.sub('', elem.get('content'))
        # image info
        # ...
        # property
        if 'property' in elem.attrib:
            # no opengraph a second time
            if elem.get('property').startswith('og:'):
                continue
            if elem.get('property') == 'article:tag':
                tags.append(normalize_tags(content_attr))
            elif elem.get('property') in PROPERTY_AUTHOR:
                author = normalize_authors(author, content_attr)
            elif elem.get('property') == 'article:publisher':
                site_name = site_name or content_attr
        # name attribute
        elif 'name' in elem.attrib:
            name_attr = elem.get('name').lower()
            # author
            if name_attr in METANAME_AUTHOR:
                author = normalize_authors(author, content_attr)
            # title
            elif name_attr in METANAME_TITLE:
                title = title or content_attr
            # description
            elif name_attr in METANAME_DESCRIPTION:
                description = description or content_attr
            # site name
            elif name_attr in METANAME_PUBLISHER:
                site_name = site_name or content_attr
            elif name_attr in TWITTER_ATTRS or 'twitter:app:name' in elem.get('name'):
                backup_sitename = content_attr
            # url
            elif name_attr == 'twitter:url':
                if url is None and validate_url(content_attr)[0] is True:
                    url = content_attr
            # keywords
            elif name_attr in METANAME_TAG:  # 'page-topic'
                tags.append(normalize_tags(content_attr))
        elif 'itemprop' in elem.attrib:
            if elem.get('itemprop') == 'author':
                author = normalize_authors(author, content_attr)
            elif elem.get('itemprop') == 'description':
                description = description or content_attr
            elif elem.get('itemprop') == 'headline':
                title = title or content_attr
            # to verify:
            # elif elem.get('itemprop') == 'name':
            #    if title is None:
            #        title = elem.get('content')
        # other types
        elif all(key not in elem.attrib for key in EXTRA_META):
            LOGGER.debug('unknown attribute: %s',
                         tostring(elem, pretty_print=False, encoding='unicode').strip())
    # backups
    if site_name is None and backup_sitename is not None:
        site_name = backup_sitename
    # copy
    metadata.title, metadata.author, metadata.url = title, author, url
    metadata.description, metadata.sitename, metadata.tags = description, site_name, tags
    return metadata
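The Document-based variant exposes the same fields through attribute access; a minimal sketch, assuming the Document class and the normalize_* helpers referenced above are importable from the same module:
from lxml import html

DOC = ('<html><head>'
       '<meta property="og:title" content="Post title"/>'
       '<meta name="author" content="Jane Doe"/>'
       '</head><body></body></html>')

doc = examine_meta(html.fromstring(DOC))
print(doc.title, doc.author)  # e.g. Post title Jane Doe (author normalization may alter the form)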