예제 #1
0
def html_to_text(s):
    """Reduce an HTML fragment to plain, whitespace-normalised text.

    Inline formatting tags (``b``, ``i``, ``em``, ``span``, ...) are removed
    without leaving a gap, so text split across them stays joined.  Every
    other tag is replaced by a single space.  The result is then passed
    through ``unescape`` to replace entities, trimmed, and has each run of
    whitespace collapsed to one space.
    """
    inline_tag = re.compile(
        r'<\W*(?:b|big|i|small|tt|abbr|acronym|cite|code|dfn|em|kbd|strong|samp|var|a|bdo|q|span|sub|sup)\b[^>]*?>',
        re.I)
    any_tag = re.compile(r'<[^>]*?>')
    whitespace_run = re.compile(r'\s+')

    # Inline tags go first so that the generic pass only sees the rest.
    without_inline = inline_tag.sub('', s)
    without_tags = any_tag.sub(' ', without_inline)

    decoded = unescape(without_tags)

    # Trim the ends, then squeeze interior whitespace.
    return whitespace_run.sub(' ', decoded.strip())
예제 #2
0
def html_to_text(s):
    """Return the visible text of the HTML fragment ``s``.

    Steps, in order: drop inline formatting tags outright, turn every
    remaining tag into a space, replace entities via ``unescape``, strip
    the ends, and collapse each whitespace run to a single space.
    """
    # Inline tags vanish (no gap); any other tag becomes a separator.
    text = re.sub(
        r'<[^>]*?>',
        ' ',
        re.sub(
            r'<\W*(?:b|big|i|small|tt|abbr|acronym|cite|code|dfn|em|kbd|strong|samp|var|a|bdo|q|span|sub|sup)\b[^>]*?>',
            '',
            s,
            flags=re.I))
    # Entities are replaced only after the markup is gone.
    text = unescape(text).strip()
    return re.sub(r'\s+', ' ', text)
예제 #3
0
    def parse_recipe(self, response):
        """Extract a single ``CocktailItem`` from a recipe page response.

        Returns a one-element list holding the item, or an empty list when
        the page has no ``og:title`` meta content or lists at most one
        ingredient.
        """
        selector = HtmlXPathSelector(response)
        absent = object()  # sentinel: "no match", distinct from any value

        title = next(
            iter(selector.select(
                "//meta[@property='og:title']/@content").extract()),
            absent)
        if title is absent:
            return []

        picture = next(
            iter(selector.select(
                "//*[@id='drink_infopicvid']/img/@src").extract()),
            absent)
        # Only the first image counts; resolve it against the page URL.
        picture = None if picture is absent else urljoin(response.url, picture)

        def describe(node):
            # Join the non-empty text fragments of one list item.
            fragments = []
            for child in node.select('* | text()'):
                fragment = html_to_text(child.extract())
                css_classes = (child.xmlNode.prop('class') or '').split()
                if 'ingredient' in css_classes:
                    # keep only what follows the last '--'
                    fragment = fragment.split('--')[-1]
                fragment = fragment.strip()
                if fragment:
                    fragments.append(fragment)
            return ' '.join(fragments)

        ingredients = [
            describe(node)
            for node in selector.select("//ul[@id='ingredients']/li")
        ]

        # Skip recipes such as 'American Whiskey & Canadian Whisky' that
        # amount to pouring one spirit into a glass.
        if len(ingredients) < 2:
            return []

        item = CocktailItem(title=unescape(title),
                            picture=picture,
                            url=response.url,
                            source='Esquire',
                            ingredients=ingredients)
        return [item]
예제 #4
0
    def parse_recipe(self, response):
        """Turn a recipe page response into a list of ``CocktailItem``.

        The list has exactly one item on success.  It is empty when no
        ``og:title`` meta content is found or when the page lists one
        ingredient or fewer.
        """
        hxs = HtmlXPathSelector(response)

        # Title: first og:title content, otherwise give up on the page.
        titles = iter(
            hxs.select("//meta[@property='og:title']/@content").extract())
        try:
            title = next(titles)
        except StopIteration:
            return []

        # Picture: first matching image source, made absolute; else None.
        sources = iter(
            hxs.select("//*[@id='drink_infopicvid']/img/@src").extract())
        try:
            relative = next(sources)
        except StopIteration:
            picture = None
        else:
            picture = urljoin(response.url, relative)

        ingredients = []
        for item_node in hxs.select("//ul[@id='ingredients']/li"):
            pieces = []

            for child in item_node.select('* | text()'):
                piece = html_to_text(child.extract())

                class_tokens = (child.xmlNode.prop('class') or '').split()
                if 'ingredient' in class_tokens:
                    # discard everything up to and including the last '--'
                    piece = piece.split('--')[-1]

                piece = piece.strip()
                if piece:
                    pieces.append(piece)

            ingredients.append(' '.join(pieces))

        # Pages like 'American Whiskey & Canadian Whisky' only describe
        # pouring a single spirit into a glass; do not emit those.
        if not len(ingredients) > 1:
            return []

        recipe = CocktailItem(
            title=unescape(title),
            picture=picture,
            url=response.url,
            source='Esquire',
            ingredients=ingredients
        )
        return [recipe]