コード例 #1
0
def get_links(htmlpath, exclude=None):
    ''' Get absolute links from an html file.

        Not well tested. See reinhardt.feeds for examples of more reliable parsing.

        Returns a list. Each item is a list of [PATH, URL, SUMMARY], where
        PATH is 'htmlpath', URL is the anchor's href, and SUMMARY is the
        anchor's text with surrounding whitespace stripped.

        Only anchors whose href starts with 'http' are included, so
        relative links and other schemes (mailto:, javascript:, ...) are
        skipped.

        'htmlpath' is path of html file.

        'exclude' is string in href to exclude, without top level domain.
        Example: To exclude links to google, use "exclude='google'".
        If 'exclude' is None or empty, no links are excluded.

        Raises Exception if pyquery is not installed.

        Very ad hoc.
    '''

    # fallible import delayed until needed
    try:
        from pyquery.pyquery import PyQuery
    except ModuleNotFoundError as exc:
        raise Exception('pyquery not installed') from exc

    # the file is only needed while reading and parsing
    with open(htmlpath) as infile:
        html = PyQuery(to_bytes(infile.read()))

    results = []
    for item in html.items('a'):
        href = item.attr('href')
        if not href or not href.startswith('http'):
            continue
        # Skip only when an exclusion was requested and it matches.
        # The earlier test "exclude and (exclude not in href)" dropped
        # every link whenever 'exclude' was left at its default of None.
        if exclude and exclude in href:
            continue
        results.append([htmlpath, href, item.text().strip()])

    return results