def get_links(htmlpath, exclude=None):
    ''' Get links from an html file.

        Not well tested. See reinhardt.feeds for examples of more
        reliable parsing. Very ad hoc.

        'htmlpath' is the path of the html file.

        'exclude' is an optional string. Any link whose href contains
        it is skipped. Give it without the top level domain.
        Example: To exclude links to google, use "exclude='google'".
        If 'exclude' is None or empty, no links are excluded.

        Only anchors whose href starts with 'http' are returned.
        Anchors with no href, or with an href that does not start
        with 'http', are ignored.

        Returns a list. Each item is a list of [PATH, URL, SUMMARY],
        where PATH is 'htmlpath', URL is the href, and SUMMARY is the
        stripped text of the anchor.

        Raises Exception if pyquery is not installed.
    '''

    # fallible import delayed until needed
    try:
        from pyquery.pyquery import PyQuery
    except ModuleNotFoundError as err:
        # keep the original error as the cause for easier debugging
        raise Exception('pyquery not installed') from err

    # the file only needs to stay open while it is read
    with open(htmlpath) as infile:
        html = PyQuery(to_bytes(infile.read()))

    results = []
    for item in html.items('a'):
        href = item.attr('href')

        # skip anchors without an href, and hrefs not starting with 'http'
        if not href or not href.startswith('http'):
            continue

        # Only filter when 'exclude' is set. The earlier test
        # "exclude and (exclude not in href)" dropped every link when
        # 'exclude' was left at its default of None.
        if exclude and exclude in href:
            continue

        results.append([htmlpath, href, item.text().strip()])

    return results