def list_candidates(self, html, encoding='utf8'):
    """Rank the elements of an HTML document likely to hold data records.

    The path of every element owning a non-empty text node is bucketed
    under the key produced by ``simplify_xpath``. For each bucket, the
    leading path steps shared by all of its members (as computed by
    ``common_prefix``) are joined back into a path, and that path
    receives one vote.

    Parameters
    ----------
    html : the HTML markup. A ``unicode`` value is encoded with
        `encoding` before parsing; anything else is parsed as given.
    encoding : encoding used for the conversion above and handed to the
        HTML parser.

    Returns
    -------
    A ``(candidates, doc)`` tuple. ``candidates`` holds, for each voted
    path, the first element matching it, sorted by decreasing vote
    count; ``doc`` is the parsed document tree.
    """
    if isinstance(html, unicode):
        html = html.encode(encoding)

    html_parser = etree.HTMLParser(encoding=encoding)
    tree = etree.parse(StringIO(html), html_parser)

    # Group element paths by their simplified form; only elements that
    # own a non-empty text node are considered.
    paths_by_shape = {}
    for text_node in tree.xpath('//*/text()[normalize-space()]'):
        owner_path = tree.getpath(text_node.getparent())
        bucket = paths_by_shape.setdefault(simplify_xpath(owner_path), [])
        bucket.append(owner_path)

    # One vote per group, cast for the path prefix its members share.
    votes = collections.Counter()
    for member_paths in paths_by_shape.itervalues():
        path_steps = [path.split('/') for path in member_paths]
        shared_path = "/".join(common_prefix(*path_steps))
        votes[shared_path] += 1

    # most_common() orders by count, highest first.
    ranked = [tree.xpath(path)[0] for path, _ in votes.most_common()]
    return ranked, tree
def common_pref_reduce(a, b):
    """Combine an accumulated value with the next item into a common prefix.

    Shaped as a two-argument folding step: `a` is the result so far
    (``None`` before anything has been accepted) and `b` is the next
    item.

    * If the first element of `b` equals ``'-'``, `b` is ignored and `a`
      is returned unchanged.
    * Otherwise, if `a` is ``None``, `b` becomes the result.
    * Otherwise the result is ``utils.common_prefix(a, b)``.

    Raises whatever ``b[0]`` raises when `b` is empty or not indexable.
    """
    is_skipped = b[0] == '-'
    if is_skipped:
        return a
    return b if a is None else utils.common_prefix(a, b)