Exemple #1
0
def _parseSummary(result):
    """
    Extract the plot summary.

    @param result: A C{(data, headers)} pair as produced by the downloader;
        only C{data} (the HTML document) is used.

    @rtype: C{unicode} or C{None}
    @return: Stripped text of the first C{<p class="plotpar">} element, or
        C{None} if no such paragraph exists.
    """
    # PEP 3113: tuple parameter unpacking is Python-2-only syntax; unpack
    # explicitly so the definition also parses under Python 3. Callers pass
    # the pair positionally, so the interface is unchanged.
    data, headers = result
    tree = parseHTML(data)

    for p in tree.findall('//p'):
        if p.get('class') == 'plotpar':
            return p.text.strip()
Exemple #2
0
def _parseSummary(result):
    """
    Extract the plot summary.

    @param result: C{(data, headers)} tuple from the page downloader; the
        headers are ignored.

    @rtype: C{unicode} or C{None}
    @return: The stripped text of the first paragraph whose C{class} is
        C{plotpar}, or C{None} when absent.
    """
    # Unpack the pair explicitly instead of in the parameter list: tuple
    # parameters were removed by PEP 3113 and are a syntax error on Python 3.
    data, _headers = result
    tree = parseHTML(data)

    for p in tree.findall('//p'):
        if p.get('class') == 'plotpar':
            return p.text.strip()
Exemple #3
0
    def test_parseHTML(self):
        """
        L{eridanusstd.util.parseHTML} will use the newer html5lib API if
        available and parse HTML content into an LXML element tree.
        """
        # Older html5lib releases lack the top-level `parse` entry point.
        if not hasattr(html5lib, 'parse'):
            raise SkipTest('html5lib is too old')

        document = self.path.sibling('index.html').open()
        tree = util.parseHTML(document)
        expectedType = type(etree.ElementTree())
        self.assertIdentical(type(tree), expectedType)
Exemple #4
0
def _parsePoster(result):
    """
    Extract the URL for the poster image.

    @param result: C{(data, headers)} pair from the downloader; only the
        HTML C{data} is inspected.

    @rtype: URL or C{None}
    @return: The poster image URL, resolved against C{IMDB_URL}, or C{None}
        when the principal table carries no image (or no principal table is
        found at all).
    """
    # PEP 3113: tuple parameter unpacking is Python-2-only syntax; unpack
    # explicitly so the code also parses under Python 3.
    data, headers = result
    tree = parseHTML(data)

    for table in tree.findall('//table'):
        if table.get('id') == 'principal':
            img = table.find('.//img')
            if img is not None:
                return IMDB_URL.click(img.get('src'))
            return None
Exemple #5
0
def _parsePoster(result):
    """
    Extract the URL for the poster image.

    @param result: The downloader's C{(data, headers)} tuple; headers are
        unused.

    @rtype: URL or C{None}
    @return: The poster image URL relative to C{IMDB_URL}, or C{None} if the
        C{principal} table has no image or is missing entirely.
    """
    # Tuple parameters were removed by PEP 3113; unpack in the body instead
    # so this remains valid syntax on Python 3.
    data, _headers = result
    tree = parseHTML(data)

    for table in tree.findall('//table'):
        if table.get('id') == 'principal':
            img = table.find('.//img')
            if img is not None:
                return IMDB_URL.click(img.get('src'))
            return None
Exemple #6
0
def qdbUS(quoteID):
    """
    Fetch a quote from qdb.us and format it as lines of text.

    @param quoteID: Identifier of the quote to fetch.

    @rtype: C{Deferred} firing with an iterable of C{unicode}
    @return: A header line (quote header and URL) followed by each line of
        the quote text; bad quote IDs are routed through
        C{handleBadQuoteID}.
    """
    url = QDB_US_URL.child(quoteID)

    def extractQuote(tree):
        # The quote lives in the first form's table body: the first cell
        # holds the header, a nested paragraph holds the text.
        quote = tree.find('//form/table/tbody')
        header = unicode(''.join(quote.find('tr/td').itertext())).strip()
        text = unicode(''.join(quote.find('tr/td/p').itertext())).strip()

        yield u'%s -- %s' % (header, url)
        for line in text.splitlines():
            yield line

    # PEP 3113: `lambda (data, headers):` is Python-2-only syntax; index the
    # result tuple instead so this also parses under Python 3.
    return util.PerseverantDownloader(url).go().addCallback(
        lambda result: parseHTML(result[0])).addErrback(
            handleBadQuoteID, quoteID).addCallback(extractQuote)
Exemple #7
0
def qdbUS(quoteID):
    """
    Retrieve a qdb.us quote and yield it line by line.

    @param quoteID: Identifier of the quote to fetch.

    @rtype: C{Deferred} firing with an iterable of C{unicode}
    @return: A C{header -- url} line followed by the individual lines of the
        quote body; lookup failures are handled by C{handleBadQuoteID}.
    """
    url = QDB_US_URL.child(quoteID)

    def extractQuote(tree):
        # Header text comes from the first table cell, the quote body from
        # the paragraph nested inside it.
        quote = tree.find('//form/table/tbody')
        header = unicode(''.join(quote.find('tr/td').itertext())).strip()
        text = unicode(''.join(quote.find('tr/td/p').itertext())).strip()

        yield u'%s -- %s' % (header, url)
        for line in text.splitlines():
            yield line

    # Tuple-unpacking lambda parameters were removed by PEP 3113; subscript
    # the (data, headers) result instead.
    return util.PerseverantDownloader(url).go(
        ).addCallback(lambda result: parseHTML(result[0])
        ).addErrback(handleBadQuoteID, quoteID
        ).addCallback(extractQuote)
Exemple #8
0
def bash(quoteID):
    """
    Fetch a quote from bash.org and format it as lines of text.

    @param quoteID: Identifier of the quote to fetch.

    @rtype: C{Deferred} firing with an iterable of C{unicode}
    @return: A header line (vote widgets stripped) followed by each line of
        the quote text; bad quote IDs are routed through
        C{handleBadQuoteID}.
    """
    url = BASH_URL.add(quoteID)

    def extractQuote(tree):
        # Drop the voting controls ('+', '-', '[X]') interleaved with the
        # header text.
        header = (t for t in tree.find('//p[@class="quote"]').itertext()
                  if t not in ('+', '-', '[X]'))
        header = unicode(''.join(header), 'ascii').strip()
        text = unicode(''.join(
            tree.find('//p[@class="qt"]').itertext())).strip()

        yield u'%s -- %s' % (header, url)
        for line in text.splitlines():
            yield line

    # PEP 3113: `lambda (data, headers):` is Python-2-only syntax; index the
    # result tuple instead so this also parses under Python 3.
    return util.PerseverantDownloader(url).go().addCallback(
        lambda result: parseHTML(result[0])).addErrback(
            handleBadQuoteID, quoteID).addCallback(extractQuote)
Exemple #9
0
def bash(quoteID):
    """
    Retrieve a bash.org quote and yield it line by line.

    @param quoteID: Identifier of the quote to fetch.

    @rtype: C{Deferred} firing with an iterable of C{unicode}
    @return: A C{header -- url} line followed by the individual lines of the
        quote body; lookup failures are handled by C{handleBadQuoteID}.
    """
    url = BASH_URL.add(quoteID)

    def extractQuote(tree):
        # Filter out the vote widgets ('+', '-', '[X]') mixed into the
        # header paragraph.
        header = (t for t in tree.find('//p[@class="quote"]').itertext()
                  if t not in ('+', '-', '[X]'))
        header = unicode(''.join(header), 'ascii').strip()
        text = unicode(''.join(tree.find('//p[@class="qt"]').itertext())).strip()

        yield u'%s -- %s' % (header, url)
        for line in text.splitlines():
            yield line

    # Tuple-unpacking lambda parameters were removed by PEP 3113; subscript
    # the (data, headers) result instead.
    return util.PerseverantDownloader(url).go(
        ).addCallback(lambda result: parseHTML(result[0])
        ).addErrback(handleBadQuoteID, quoteID
        ).addCallback(extractQuote)
Exemple #10
0
def _parseSearchResults(result):
    """
    Parse search result HTML into an iterable of C{(name, url, id)}.

    @param result: C{(data, headers)} pair from the downloader; only the
        HTML C{data} is used.
    """
    # PEP 3113: tuple parameter unpacking is Python-2-only syntax; unpack
    # explicitly so the code also parses under Python 3.
    data, headers = result
    tree = parseHTML(data)

    # XXX: Maybe do something a little less shot-in-the-darkish, like
    # finding the first `ol` after an `h1`.
    for li in tree.find('//ol').findall('li'):
        a = li.find('a')
        url = IMDB_URL.click(a.get('href'))
        name = unicode(a.text)
        # Skip video games, this should be part of the "I want movies,
        # I want TV series" criteria stuff.
        if not name.endswith(u'(VG)'):
            pathList = url.pathList()
            # Avoid shadowing the `id` builtin; a trailing slash leaves the
            # last path segment empty, hence the fallback to the one before.
            titleID = pathList[-1] or pathList[-2]
            yield name, url, titleID
Exemple #11
0
def _parseSearchResults(result):
    """
    Parse search result HTML into an iterable of C{(name, url, id)}.

    @param result: The downloader's C{(data, headers)} tuple; headers are
        unused.
    """
    # Tuple parameters were removed by PEP 3113; unpack in the body instead
    # so this remains valid syntax on Python 3.
    data, _headers = result
    tree = parseHTML(data)

    # XXX: Maybe do something a little less shot-in-the-darkish, like
    # finding the first `ol` after an `h1`.
    for li in tree.find('//ol').findall('li'):
        a = li.find('a')
        url = IMDB_URL.click(a.get('href'))
        name = unicode(a.text)
        # Skip video games, this should be part of the "I want movies,
        # I want TV series" criteria stuff.
        if not name.endswith(u'(VG)'):
            pathList = url.pathList()
            # `titleID` rather than `id`, which would shadow the builtin;
            # a trailing slash makes the last segment empty, so fall back.
            titleID = pathList[-1] or pathList[-2]
            yield name, url, titleID
Exemple #12
0
def _extractTitle(data):
    """
    Extract the document title(s) from HTML data.

    @param data: HTML document data, possibly empty or C{None}.

    @rtype: C{unicode} or C{None}
    @return: All non-empty, whitespace-normalized C{<title>} texts joined
        with C{u';'}, or C{None} when there is no data, no titles, or
        parsing fails.
    """
    def sanitizeTitle(title):
        # Collapse internal runs of whitespace and strip the ends.
        return _whitespace.sub(u' ', title.strip())

    if data:
        try:
            tree = parseHTML(data)
            results = tree.xpath(
                '//xhtml:title',
                namespaces={'xhtml': 'http://www.w3.org/1999/xhtml'})

            results = filter(
                None, (sanitizeTitle(unicode(e.text)) for e in results))
            if results:
                return u';'.join(results)
        # Title extraction is deliberately best-effort, but the original
        # bare `except:` also swallowed SystemExit and KeyboardInterrupt;
        # catch Exception instead and keep logging the failure.
        except Exception:
            log.msg('Extracting title failed:')
            log.err()

    return None
Exemple #13
0
class Calculator(object):
    """
    Primitive screen-scraping interface to Google's calculator.
    """
    # Plain-text stand-ins for HTML markup tags appearing in results.
    _resultFormatting = {'sup': u'^'}

    def _formatResult(self, elem):
        """
        Gracefully downgrade HTML markup in calculator results.

        @param elem: An element containing the calculator result markup.

        @return: The non-empty text fragments of the result, with known
            markup tags replaced by their plain-text equivalents.
        """
        def _format():
            yield elem.text
            for child in elem.iterchildren():
                # Strip any `{namespace}` prefix from the tag name.
                tag = child.tag.split('}')[-1]
                extra = self._resultFormatting.get(tag)
                if extra is not None:
                    yield extra
                yield child.text
                yield child.tail

        return filter(None, _format())

    def _extractResult(self, result, expn):
        """
        Extract the calculator result from a Google search.

        @param result: C{(data, headers)} pair from the downloader; only the
            HTML C{data} is used.
        @param expn: The expression that was evaluated, used for error
            reporting.

        @rtype:  C{unicode}
        @return: The formatted calculator result.

        @raise errors.InvalidExpression: If the page contains no result.
        """
        # PEP 3113: tuple parameter unpacking is Python-2-only syntax;
        # unpack explicitly so the code also parses under Python 3.
        data, headers = result
        tree = parseHTML(data)
        results = tree.xpath(
            '//xhtml:h2[@class="r"]/xhtml:b',
            namespaces={'xhtml': 'http://www.w3.org/1999/xhtml'})
        if results:
            return u''.join(self._formatResult(results[0]))
        raise errors.InvalidExpression(expn)
Exemple #14
0
# Dispatch table mapping a (lower-cased) IMDB info field name to the parser
# used to extract its value; fields with no entry here are skipped.
_infoParsers = {
    u'director': _hyperlinkedText(u'director'),
    u'genre': _genre,
    u'release date': _releaseDate,
    }

def _parseTitleInfo((data, headers), url):
    """
    Parse an IMDB HTML document into structured information.

    The resulting dictionary contains keys that map roughly to the relevant
    IMDB fields of the same name.

    @rtype: Deferred firing with a C{dict}
    """
    tree = parseHTML(data)

    info = {}
    info['title'] = tree.find('//h1').text.strip()

    # Scan all the `<div class="info">` tags for information that we know how
    # to parse.
    infoElems = (e for e in tree.findall('//div') if e.get('class') == 'info')
    for elem in infoElems:
        h5 = elem.find('h5')
        if h5 is None:
            continue

        infoName = h5.text
        if infoName is None:
            continue
Exemple #15
0
    u'director': _hyperlinkedText(u'director'),
    u'genre': _genre,
    u'release date': _releaseDate,
}


def _parseTitleInfo((data, headers), url):
    """
    Parse an IMDB HTML document into structured information.

    The resulting dictionary contains keys that map roughly to the relevant
    IMDB fields of the same name.

    @rtype: Deferred firing with a C{dict}
    """
    tree = parseHTML(data)

    info = {}
    info['title'] = tree.find('//h1').text.strip()

    # Scan all the `<div class="info">` tags for information that we know how
    # to parse.
    infoElems = (e for e in tree.findall('//div') if e.get('class') == 'info')
    for elem in infoElems:
        h5 = elem.find('h5')
        if h5 is None:
            continue

        infoName = h5.text
        if infoName is None:
            continue