Ejemplo n.º 1
0
    def _xpath(self, path, html, limit):
        """Recursively search HTML for content matching a parsed XPath.

        path -- list of parsed steps ``(counter, separator, tag, index, attributes)``;
            the first step is consumed with ``pop(0)``, and recursive calls
            receive a copy (``path[:]``) so siblings see the remaining steps
        html -- HTML fragment forming the current search context
        limit -- stop collecting once more than this many results are gathered

        Returns a list of matched content strings.
        Raises common.WebScrapingError for the unsupported '..' parent axis.
        """
        counter, separator, tag, index, attributes = path.pop(0)
        if counter == 0:
            # only the root step of a query counts as a new search
            self.num_searches += 1

        results = []
        if tag == '..':
            # parent axis is not implemented
            raise common.WebScrapingError('.. not yet supported')
        elif tag == 'text()':
            # extract the text of this node, discarding child tags
            text = self._get_content(self._get_html(html))
            results.append(common.remove_tags(text, keep_children=False))
        elif tag.startswith('@'):
            # selecting an attribute value; a missing attribute yields ''
            attr = tag[1:].lower()
            results.append(self._get_attributes(html).get(attr, ''))
        else:
            # have a tag name to match
            if counter > 0:
                # descend into child html when not at the root step
                html = self._get_content(html)

            # search direct children for / and all descendants for //
            search_fn = self._find_children if separator == '' else self._find_descendants
            matches = search_fn(html, tag)

            # support negative indices by converting to the 1-based positive form
            if index is not None and index < 0:
                matches = list(matches)
                index += len(matches) + 1

            for child_i, child in enumerate(matches):
                # XPath indices are 1-based
                if index is None or index == child_i + 1:
                    # check whether the predicate attributes match
                    if not attributes or self._match_attributes(
                            attributes, self._get_attributes(child)):
                        if path:
                            # more steps remain: recurse with a copy of the path
                            results.extend(self._xpath(path[:], child, limit))
                        else:
                            # final step: collect this node's content
                            results.append(self._get_content(child))
                        # NOTE: checked after extending, so results may
                        # slightly exceed limit
                        if len(results) > limit:
                            break

        return results
Ejemplo n.º 2
0
    def _xpath(self, path, html, limit):
        """Recursively search HTML for content at the given parsed XPath.

        path -- list of ``(counter, separator, tag, index, attributes)`` steps;
            the head step is removed with ``pop(0)`` and recursion passes a
            copy (``path[:]``) so each matched child sees the remaining steps
        html -- HTML fragment used as the current context
        limit -- collection stops once more than this many results exist

        Returns a list of matched content strings.
        Raises common.WebScrapingError for the unsupported '..' parent axis.
        """
        counter, separator, tag, index, attributes = path.pop(0)
        if counter == 0:
            # count each query once, at its root step
            self.num_searches += 1

        results = []
        if tag == '..':
            # parent axis not supported
            raise common.WebScrapingError('.. not yet supported')
        elif tag == 'text()':
            # extract child text, dropping any nested tags
            text = self._get_content(self._get_html(html))
            results.append(common.remove_tags(text, keep_children=False))
        elif tag.startswith('@'):
            # attribute selection; missing attributes yield ''
            attr = tag[1:].lower()
            value = self._get_attributes(html).get(attr, '')
            results.append(value)
        else:
            # have tag
            if counter > 0:
                # not at the root: descend into this node's content
                html = self._get_content(html)

            # direct children for /, all descendants for //
            search_fn = self._find_children if separator == '' else self._find_descendants
            matches = search_fn(html, tag)

            # support negative indices (convert to 1-based positive index)
            if index is not None and index < 0:
                matches = list(matches)
                index += len(matches) + 1

            for child_i, child in enumerate(matches):
                # indices in XPath predicates are 1-based
                if index is None or index == child_i + 1:
                    # check the attribute predicates, if any
                    if not attributes or self._match_attributes(attributes, self._get_attributes(child)):
                        if path:
                            # recurse for the remaining steps
                            results.extend(self._xpath(path[:], child, limit))
                        else:
                            # final node
                            results.append(self._get_content(child))
                        # checked after extending, so results may exceed limit
                        if len(results) > limit:
                            break

        return results
Ejemplo n.º 3
0
def get_excerpt(html, try_meta=False, max_chars=10000):
    """Return an excerpt of this HTML, taken from its largest text block.

    try_meta -- when True, first try the <meta name="description"> content
    max_chars -- maximum length of the returned excerpt
    """
    excerpt = ''
    if try_meta:
        # the meta description, when present, is the preferred summary
        excerpt = xpath.get(html, '/html/head/meta[@name="description"]/@content')
    if not excerpt:
        # strip layout and heading tags, then pick the longest line of text
        bad_tags = 'hr', 'br', 'script', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'footer'
        content = common.remove_tags(xpath.get(html, '/html/body', remove=bad_tags))
        if content:
            ranked = [(len(line.strip()), line) for line in content.splitlines()]
            excerpt = max(ranked)[1]
    return common.unescape(excerpt.strip())[:max_chars]
Ejemplo n.º 4
0
def get_excerpt(html, try_meta=False, max_chars=255):
    """Extract a short excerpt from HTML by finding its largest text block.

    try_meta -- if True, attempt the meta description tag first
    max_chars -- cap on the excerpt length
    """
    excerpt = ''
    if try_meta:
        # a meta description, if present, beats guessing from the body
        excerpt = xpath.get(html, '/html/head/meta[@name="description"]/@content')
    if not excerpt:
        # drop these tags before looking for the biggest block of text
        bad_tags = 'hr', 'br', 'script', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
        body = xpath.get(html, '/html/body', remove=bad_tags)
        content = common.remove_tags(body)
        if content:
            # rank body lines by stripped length; take the longest
            ranked = sorted((len(line.strip()), line) for line in content.splitlines())
            excerpt = ranked[-1][1]
    return common.unescape(excerpt.strip())[:max_chars]
Ejemplo n.º 5
0
def search(html, xpath, remove=None):
    """Query an HTML document with a simplified XPath expression.

    remove is a list of tags to strip before searching

    >>> search('<span>1</span><div>abc<a>LINK 1</a><div><a>LINK 2</a>def</div>abc</div>ghi<div><a>LINK 3</a>jkl</div>', '/div/a')
    ['LINK 1', 'LINK 3']
    >>> search('<div>abc<a class="link">LINK 1</a><div><a>LINK 2</a>def</div>abc</div>ghi<div><a class="link">LINK 3</a>jkl</div>', '/div[1]/a[@class="link"]')
    ['LINK 1']
    >>> search('<div>abc<a class="link">LINK 1</a><div><a>LINK 2</a>def</div>abc</div>ghi<div><a class="link">LINK 3</a>jkl</div>', '/div[1]//a')
    ['LINK 1', 'LINK 2']
    >>> search('<div>abc<a class="link">LINK 1</a></div>', '/div/a/@class')
    ['link']

    # test searching unicode
    >>> search(u'<a href="http://www.google.com" class="flink">google</a>', '//a[@class="flink"]')
    [u'google']

    # test scraping a large amount of content
    len(search('<div><span>!</span></div>' * 10000, '//span'))
    10000
    """
    orig_html = html
    html = clean_html(html, remove)
    # start with the entire document as the only context
    contexts = [html]
    # attribute dicts of nodes matched by the previous tag step,
    # consumed when a later step selects an attribute (@name)
    parent_attributes = []
    for tag_i, (separator, tag, index, attributes) in enumerate(xpath_iter(xpath)):
        if tag == '..':
            # parent axis is not implemented
            raise common.WebScrapingError('.. not yet supported')
        elif tag == 'text()':
            # extract each context's text, dropping nested tags
            children = [common.remove_tags(context, keep_children=False)
                        for context in contexts]
        elif tag.startswith('@'):
            # select an attribute value; missing attributes yield ''
            name = tag[1:].lower()
            children = [attrs.get(name, '') for attrs in parent_attributes]
        else:
            # a tag step: collect matching nodes from every context
            children = []
            parent_attributes = []
            find = find_children if separator == '' else find_descendants
            for context in contexts:
                matches = find(context, tag)
                # XXX change to iterator
                abs_index = index
                if abs_index is not None and abs_index < 0:
                    # translate a negative index into its 1-based positive form
                    abs_index += len(matches) + 1
                for child_i, child in enumerate(matches):
                    # XPath predicate indices are 1-based
                    if index is None or abs_index == child_i + 1:
                        child_attributes = get_attributes(child)
                        if match_attributes(attributes, child_attributes):
                            # matched the tag plus any index/attribute predicates
                            children.append(get_content(child))
                            parent_attributes.append(child_attributes)
        if children or tag != 'tbody':
            contexts = children
        # else: firefox inserts tbody into copied xpaths even when the
        # document has none, so a fruitless tbody step is skipped
        if not contexts:
            # nothing matched this step: log which step failed and stop
            attributes_s = ''.join('[@%s="%s"]' % a for a in attributes) if attributes else ''
            common.logger.debug('No matches for <%s%s%s> (tag %d)' % (tag, '[%d]' % index if index else '', attributes_s, tag_i + 1))
            break
    return contexts
Ejemplo n.º 6
0
def search(html, xpath, remove=None):
    """Query HTML document using XPath
    
    remove is a list of tags to ignore

    >>> search('<span>1</span><div>abc<a>LINK 1</a><div><a>LINK 2</a>def</div>abc</div>ghi<div><a>LINK 3</a>jkl</div>', '/div/a')
    ['LINK 1', 'LINK 3']
    >>> search('<div>abc<a class="link">LINK 1</a><div><a>LINK 2</a>def</div>abc</div>ghi<div><a class="link">LINK 3</a>jkl</div>', '/div[1]/a[@class="link"]')
    ['LINK 1']
    >>> search('<div>abc<a class="link">LINK 1</a><div><a>LINK 2</a>def</div>abc</div>ghi<div><a class="link">LINK 3</a>jkl</div>', '/div[1]//a')
    ['LINK 1', 'LINK 2']
    >>> search('<div>abc<a class="link">LINK 1</a></div>', '/div/a/@class')
    ['link']

    # test searching unicode
    >>> search(u'<a href="http://www.google.com" class="flink">google</a>', '//a[@class="flink"]')
    [u'google']

    # test scraping a large amount of content
    len(search('<div><span>!</span></div>' * 10000, '//span'))
    10000
    """
    orig_html = html
    # strip the unwanted tags before searching
    html = clean_html(html, remove)
    contexts = [html]  # initial context is entire webpage
    # attribute dicts of the nodes matched by the previous tag step;
    # consumed when a later step selects an attribute (@name)
    parent_attributes = []
    for tag_i, (separator, tag, index,
                attributes) in enumerate(xpath_iter(xpath)):
        children = []
        if tag == '..':
            # parent
            raise common.WebScrapingError('.. not yet supported')
        elif tag == 'text()':
            # extract child text
            for context in contexts:
                children.append(
                    common.remove_tags(context, keep_children=False))
        elif tag.startswith('@'):
            # selecting attribute
            name = tag[1:].lower()
            # a missing attribute yields an empty string
            for a in parent_attributes:
                children.append(a.get(name, ''))
        else:
            # have tag
            parent_attributes = []
            for context in contexts:
                # search direct children if / and all descendants if //
                matches = (separator == '' and find_children
                           or find_descendants)(context, tag)
                # XXX change to iterator
                # convert a negative index to its 1-based positive equivalent
                abs_index = index
                if abs_index is not None and abs_index < 0:
                    # support negative indices
                    abs_index += len(matches) + 1
                for child_i, child in enumerate(matches):
                    # XPath predicate indices are 1-based
                    if index is None or abs_index == child_i + 1:
                        # matches index if defined
                        child_attributes = get_attributes(child)
                        if match_attributes(attributes, child_attributes):
                            # child matches tag and any defined indices or attributes
                            children.append(get_content(child))
                            parent_attributes.append(child_attributes)
        if not children and tag == 'tbody':
            pass  # skip tbody, which firefox includes in xpath when does not exist
        else:
            contexts = children
        if not contexts:
            # nothing matched this step: log which step failed and stop
            attributes_s = attributes and ''.join('[@%s="%s"]' % a
                                                  for a in attributes) or ''
            common.logger.debug(
                'No matches for <%s%s%s> (tag %d)' %
                (tag, index and '[%d]' % index or '', attributes_s, tag_i + 1))
            break
    return contexts