def html_to_text(s):
    """Reduce an HTML fragment to plain text with normalised whitespace.

    Inline-level tags (``b``, ``em``, ``span``, ``a``, ...) are removed
    without leaving a gap, so text they split stays joined. Every other
    tag is replaced by a single space. Entities are decoded afterwards,
    then the result is stripped and runs of whitespace are collapsed.
    """
    inline_tag = (
        r'<\W*(?:b|big|i|small|tt|abbr|acronym|cite|code|dfn|em|kbd|strong|samp|var|a|bdo|q|span|sub|sup)\b[^>]*?>'
    )
    any_tag = r'<[^>]*?>'

    # Inline tags first: they vanish entirely (case-insensitive match).
    without_inline = re.sub(inline_tag, '', s, flags=re.I)
    # Whatever tags remain act as word separators.
    without_tags = re.sub(any_tag, ' ', without_inline)

    # Entities are decoded only after tag removal, so an escaped "<"
    # is never mistaken for markup.
    decoded = unescape(without_tags)

    trimmed = decoded.strip()
    return re.sub(r'\s+', ' ', trimmed)
def html_to_text(s):
    """Return the visible text of an HTML snippet as a single clean line.

    Steps, in order: drop inline formatting tags outright, turn all other
    tags into spaces, decode entities, trim both ends, and squeeze each
    whitespace run down to one space.
    """
    # Inline formatting tags are deleted with no replacement so that
    # text such as "wor<b>ld</b>" stays one word.
    text = re.sub(
        r'<\W*(?:b|big|i|small|tt|abbr|acronym|cite|code|dfn|em|kbd|strong|samp|var|a|bdo|q|span|sub|sup)\b[^>]*?>',
        '',
        s,
        flags=re.I,
    )
    # Any tag still present becomes a separating space.
    text = re.sub(r'<[^>]*?>', ' ', text)
    # Decode entities, then trim the ends.
    text = unescape(text).strip()
    # Collapse interior whitespace runs.
    return re.sub(r'\s+', ' ', text)
def parse_recipe(self, response):
    """Build a cocktail item from a recipe page response.

    Returns a one-element list holding a ``CocktailItem``, or an empty
    list when the page has no ``og:title`` meta value or lists at most
    one ingredient.
    """
    selector = HtmlXPathSelector(response)

    # Only the first og:title value matters; without one the page is skipped.
    titles = selector.select(
        "//meta[@property='og:title']/@content").extract()
    title = next(iter(titles), None)
    if title is None:
        return []

    # The picture is optional; its src is resolved against the page URL.
    picture_sources = selector.select(
        "//*[@id='drink_infopicvid']/img/@src").extract()
    first_source = next(iter(picture_sources), None)
    if first_source is None:
        picture = None
    else:
        picture = urljoin(response.url, first_source)

    ingredients = []
    for item in selector.select("//ul[@id='ingredients']/li"):
        fragments = []
        # Walk child elements and bare text nodes of the <li>.
        for piece in item.select('* | text()'):
            fragment = html_to_text(piece.extract())
            css_classes = (piece.xmlNode.prop('class') or '').split()
            if 'ingredient' in css_classes:
                # Keep only what follows the last '--' separator.
                fragment = fragment.split('--')[-1]
            # Cutting at '--' can leave surrounding whitespace behind.
            fragment = fragment.strip()
            if fragment:
                fragments.append(fragment)
        ingredients.append(' '.join(fragments))

    # don't crawl recipes like 'American Whiskey & Canadian Whisky',
    # that only consist of pouring a single spirit into a glass.
    if len(ingredients) < 2:
        return []

    item = CocktailItem(
        title=unescape(title),
        picture=picture,
        url=response.url,
        source='Esquire',
        ingredients=ingredients,
    )
    return [item]
def parse_recipe(self, response):
    """Extract one cocktail recipe from a recipe page.

    The result is ``[CocktailItem(...)]`` on success. An empty list is
    returned when no ``og:title`` meta value is found, or when the
    ingredient list has one entry or fewer.
    """
    page = HtmlXPathSelector(response)

    def first_match(xpath):
        # First extracted value for the XPath, or None if nothing matched.
        return next(iter(page.select(xpath).extract()), None)

    def ingredient_line(li):
        # Join the cleaned text of each child element / text node of <li>.
        words = []
        for child in li.select('* | text()'):
            cleaned = html_to_text(child.extract())
            if 'ingredient' in (child.xmlNode.prop('class') or '').split():
                # Only the part after the last '--' is kept.
                cleaned = cleaned.split('--')[-1]
            cleaned = cleaned.strip()
            if not cleaned:
                continue
            words.append(cleaned)
        return ' '.join(words)

    title = first_match("//meta[@property='og:title']/@content")
    if title is None:
        return []

    picture = first_match("//*[@id='drink_infopicvid']/img/@src")
    if picture is not None:
        # Resolve a possibly relative src against the page URL.
        picture = urljoin(response.url, picture)

    ingredients = [
        ingredient_line(li)
        for li in page.select("//ul[@id='ingredients']/li")
    ]

    # don't crawl recipes like 'American Whiskey & Canadian Whisky',
    # that only consist of pouring a single spirit into a glass.
    if not len(ingredients) > 1:
        return []

    return [
        CocktailItem(
            title=unescape(title),
            picture=picture,
            url=response.url,
            source='Esquire',
            ingredients=ingredients,
        )
    ]