Exemple #1
0
    def list_candidates(self, html, encoding='utf8'):
        """
        Rank the elements of an HTML document that may contain data records.

        Parameters
        ----------
        html : the HTML document. A unicode object is first encoded with
            `encoding`; anything else is handed to the parser unchanged.
        encoding : encoding used for that conversion and given to the
            HTML parser.

        Returns
        -------
        A tuple ``(candidates, doc)``. ``candidates`` is a list of elements
        ordered by decreasing vote count (most likely candidate first) and
        ``doc`` is the parsed document tree.
        """
        if isinstance(html, unicode):
            html = html.encode(encoding)

        html_parser = etree.HTMLParser(encoding=encoding)
        doc = etree.parse(StringIO(html), html_parser)

        # Collect the path of every element that owns a non-empty text node,
        # grouped under whatever key simplify_xpath() derives from that path.
        paths_by_key = collections.defaultdict(list)
        for text_node in doc.xpath('//*/text()[normalize-space()]'):
            element_path = doc.getpath(text_node.getparent())
            paths_by_key[simplify_xpath(element_path)].append(element_path)

        # Each group casts one vote for the path rebuilt from the segments
        # that common_prefix() returns for the group's paths (presumably
        # their deepest shared ancestor -- confirm against common_prefix).
        votes = collections.Counter()
        for group_paths in paths_by_key.itervalues():
            split_paths = [path.split('/') for path in group_paths]
            ancestor_path = "/".join(common_prefix(*split_paths))
            votes[ancestor_path] += 1

        # most_common() yields (path, count) pairs by decreasing count.
        candidates = [doc.xpath(path)[0] for path, _ in votes.most_common()]
        return candidates, doc
Exemple #2
0
def common_pref_reduce(a, b):
	"""Combine an accumulated value `a` with the next item `b`.

	An item whose first element equals '-' is ignored and `a` is returned
	unchanged. When nothing has been accumulated yet (`a` is None) the item
	itself becomes the result. Otherwise the result is whatever
	utils.common_prefix(a, b) returns.
	"""
	# The first-element check comes first, exactly as callers rely on:
	# a skipped item never replaces a None accumulator.
	skip_item = b[0] == '-'
	if skip_item:
		return a
	return b if a is None else utils.common_prefix(a, b)
Exemple #3
0
def common_pref_reduce(a, b):
    """Merge the next item `b` into the running value `a`.

    Items that start with '-' are skipped (`a` is returned as is). The
    first item that is not skipped seeds the running value when `a` is
    None; every later one is merged through utils.common_prefix(a, b).
    """
    if b[0] != '-':
        if a is not None:
            return utils.common_prefix(a, b)
        return b
    # Skipped item: keep the running value, even when it is still None.
    return a