def runTest(self):
    """Run one CSS-selector test case.

    Loads the shared HTML document, translates ``self.selectors[self.index]``
    into XPath via cssselect, evaluates it against ``<body>``, and asserts
    that the result is a duplicate-free, non-string node-set of the expected
    size.

    NOTE(review): ``doc_fn``, ``html``, ``cssselect`` and ``sys`` come from
    the enclosing module (not visible in this chunk).
    """
    # BUG FIX: use a context manager so the file is closed even when
    # read()/parsing raises (the original leaked the handle on error).
    with open(doc_fn, "rb") as f:
        c = f.read()
    doc = html.document_fromstring(c)
    body = doc.xpath("//body")[0]
    selector, count = self.selectors[self.index]
    xpath = cssselect.css_to_xpath(cssselect.parse(selector))
    try:
        results = body.xpath(xpath)
    except Exception:
        e = sys.exc_info()[1]
        # BUG FIX: args must be a tuple; the original assigned a plain
        # string, which makes every character a separate argument when the
        # exception is re-raised and printed.
        e.args = ("%s for xpath %r" % (e, xpath),)
        raise
    # Each matched element must appear exactly once in the node-set.
    found = {}
    for item in results:
        if item in found:
            assert 0, "Element shows up multiple times: %r" % item
        found[item] = None
    if isinstance(results, basestring):
        assert 0, "Got string result (%r), not element, for xpath %r" % (
            results[:20], str(xpath))
    if len(results) != count:
        # if self.shortDescription() == 'div.character, div.dialog':
        #     import pdb; pdb.set_trace()
        assert 0, "Did not get expected results (%s) instead %s for xpath %r" % (
            count, len(results), str(xpath))
def runTest(self):
    """Run one CSS-selector test case (namespace/regex-aware variant).

    Like the plain variant but parses the selector with a ``regex_prefix``
    option and evaluates the XPath with the module-level ``namespaces`` map.

    NOTE(review): ``doc_fn``, ``html``, ``cssselect``, ``sys`` and
    ``namespaces`` come from the enclosing module (not visible here).
    """
    # BUG FIX: use a context manager so the file is closed even when
    # read()/parsing raises (the original leaked the handle on error).
    with open(doc_fn, 'rb') as f:
        c = f.read()
    doc = html.document_fromstring(c)
    body = doc.xpath('//body')[0]
    selector, count = self.selectors[self.index]
    # 're:' prefix enables regular-expression pseudo-selectors.
    options = dict(regex_prefix='re')
    xpath = cssselect.css_to_xpath(cssselect.parse(selector, options), **options)
    try:
        results = body.xpath(xpath, namespaces=namespaces)
    except Exception:
        e = sys.exc_info()[1]
        e.args = ("%s for xpath %r" % (e, xpath), )
        raise
    # Each matched element must appear exactly once in the node-set.
    found = {}
    for item in results:
        if item in found:
            assert 0, ("Element shows up multiple times: %r" % item)
        found[item] = None
    if isinstance(results, basestring):
        assert 0, ("Got string result (%r), not element, for xpath %r"
                   % (results[:20], str(xpath)))
    if len(results) != count:
        #if self.shortDescription() == 'div.character, div.dialog':
        #    import pdb; pdb.set_trace()
        assert 0, (
            "Did not get expected results (%s) instead %s for xpath %r"
            % (count, len(results), str(xpath)))
def css_to_xpath(css, simplify=True):
    """Translate a CSS selector into an XPath expression.

    Example: css_to_xpath('div.pad a').  When *simplify* is true, the
    verbose axes emitted by cssselect are collapsed into the shorter
    ``//`` form.
    """
    expression = cssselect.css_to_xpath(css)
    if not simplify:
        return expression
    for axis, shorthand in (('descendant-or-self::', '//'),
                            ('descendant::', '')):
        expression = expression.replace(axis, shorthand)
    return expression
def __init__(self, css, namespaces=XPNSMAP):
    """Compile *css* into a primary (case-sensitive) evaluator ``sel1``
    and a case-insensitive fallback ``sel2``.

    A compilation failure in either leaves a no-op selector in its place
    so callers always get a callable.
    """
    if isinstance(css, unicode):
        # Workaround for bug in lxml on windows/OS X that causes a massive
        # memory leak with non ASCII selectors
        css = css.encode('ascii', 'ignore').decode('ascii')
    try:
        # NOTE(review): `path` is computed here but sel1 compiles the raw
        # css_to_xpath(css) expression instead — confirm whether sel1 was
        # meant to use `path` like sel2 does below.
        path = self.LOCAL_NAME_RE.sub(r"local-name() = '", css_to_xpath(css))
        self.sel1 = etree.XPath(css_to_xpath(css), namespaces=namespaces)
    except Exception:
        # BUG FIX: was a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit.
        self.sel1 = lambda x: []
    try:
        path = self.LOCAL_NAME_RE.sub(r"local-name() = '",
                                      css_to_xpath_no_case(css))
        self.sel2 = etree.XPath(path, namespaces=namespaces)
    except Exception:
        # BUG FIX: bare `except:` narrowed to Exception, as above.
        self.sel2 = lambda x: []
    # Tracks whether falling back to sel2 has already been logged.
    self.sel2_use_logged = False
    self.css = css
# Scrapy spider module for www.cocktaildb.com.
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from lxml.cssselect import css_to_xpath
from cocktails.items import CocktailItem
from cocktails.utils import html_to_text

# XPath selecting the ingredient lines of a recipe page.
xp_ingredients = css_to_xpath('.recipeMeasure')

class CocktailDbSpider(CrawlSpider):
    """Crawl www.cocktaildb.com, scraping every /recipe_detail page."""
    name = 'cocktaildb'
    allowed_domains = ['www.cocktaildb.com']
    start_urls = ['http://www.cocktaildb.com']
    rules = (
        # Recipe pages are parsed; every other on-site link is just followed.
        Rule(SgmlLinkExtractor(allow=r'/recipe_detail\b'), callback='parse_recipe'),
        Rule(SgmlLinkExtractor(allow=r'.*')),
    )

    def parse_recipe(self, response):
        hxs = HtmlXPathSelector(response)
        # Use the first <h2> as the recipe title; no <h2> means no recipe.
        for title in hxs.select('//h2').extract():
            break
        else:
            return []
        # NOTE(review): chunk truncated here — the code that builds and
        # returns a CocktailItem is not visible in this view.
USER_AGENT = '%s/%s' % (BOT_NAME, BOT_VERSION) #DEPTH_LIMIT = 2 ########NEW FILE######## __FILENAME__ = cocktaildb from scrapy.contrib.spiders import CrawlSpider, Rule from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor from scrapy.selector import HtmlXPathSelector from lxml.cssselect import css_to_xpath from cocktails.items import CocktailItem from cocktails.utils import html_to_text xp_ingredients = css_to_xpath('.recipeMeasure') class CocktailDbSpider(CrawlSpider): name = 'cocktaildb' allowed_domains = ['www.cocktaildb.com'] start_urls = ['http://www.cocktaildb.com'] rules = ( Rule(SgmlLinkExtractor(allow=r'/recipe_detail\b'), callback='parse_recipe'), Rule(SgmlLinkExtractor(allow=r'.*')), ) def parse_recipe(self, response): hxs = HtmlXPathSelector(response) for title in hxs.select('//h2').extract():
# Scrapy spider module for www.cocktaildb.com.
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from lxml.cssselect import css_to_xpath
from cocktails.items import CocktailItem
from cocktails.utils import html_to_text

# XPath selecting the ingredient lines of a recipe page.
xp_ingredients = css_to_xpath('.recipeMeasure')

class CocktailDbSpider(CrawlSpider):
    """Crawl www.cocktaildb.com, scraping every /recipe_detail page."""
    name = 'cocktaildb'
    allowed_domains = ['www.cocktaildb.com']
    start_urls = ['http://www.cocktaildb.com']
    rules = (
        Rule(SgmlLinkExtractor(allow=r'/recipe_detail\b'), callback='parse_recipe'),
        Rule(SgmlLinkExtractor(allow=r'.*')),
    )

    def parse_recipe(self, response):
        hxs = HtmlXPathSelector(response)
        # Use the first <h2> as the recipe title; no <h2> means no recipe.
        for title in hxs.select('//h2').extract():
            break
        else:
            return []
        ingredients = hxs.select(xp_ingredients).extract()
        # NOTE(review): chunk truncated here — item construction not visible.
# Scrapy spider module for www.cocktailtimes.com.
from urlparse import urljoin
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from lxml.cssselect import css_to_xpath
from cocktails.items import CocktailItem
from cocktails.utils import html_to_text

# Recipe title text nodes.
xp_header = css_to_xpath('.header') + '/text()'
# Text nodes of the first .story block that come after the 'Ingredients:'
# label and start with '-' (one per ingredient line).
xp_ingredients = css_to_xpath('.story') + ("[1]//text()["
    "preceding::text()["
    "normalize-space(self::text()) = 'Ingredients:'"
    "]"
    "]["
    "starts-with(normalize-space(self::text()), '-')"
    "]")
# The drink photo is identified by a ' COCKTAIL PHOTO ' HTML comment
# preceding the <img>.
xp_picture = ("//img["
    "preceding::comment()["
    "contains(self::comment(), ' COCKTAIL PHOTO ')"
    "]"
    "]/@src")

class CocktailTimesSpider(CrawlSpider):
    """Crawl www.cocktailtimes.com for drink recipes."""
    name = 'cocktailtimes'
    allowed_domains = ['www.cocktailtimes.com']
    start_urls = ['http://www.cocktailtimes.com']
    # NOTE(review): chunk truncated here, mid-tuple — the Rule entries are
    # not visible in this view.
    rules = (
def selector_to_xpath(selector, prefix='descendant-or-self::'):
    """Convert a jQuery-style selector into an XPath expression.

    The XPath-flavoured ``[@attr]`` attribute test is normalised to the
    CSS form ``[attr]`` before delegating to ``css_to_xpath`` with the
    given axis *prefix*.
    """
    normalized = selector.replace('[@', '[')
    return css_to_xpath(normalized, prefix)
def find_many(self, attribute=None, class_name=None, id=None, name=None,
              tag=None, text=None, text_contains=None, re_test=None,
              xpath=None, css=None):
    """Return the list of elements matching a set of search criteria.

    Criteria are given as keywords:

    *class_name* matches elements with the given CSS class.

    *attribute* may be of three types:

        1. str or unicode: match elements carrying an attribute of that name.
        2. list: match elements carrying every attribute in the list.
        3. dict: match elements carrying every listed attribute whose value
           also equals the one given in the dict.

    Given this document::

        <html><body>
        <a href='spam' rel='bar'/>
        <a href='eggs' src='foo'/>
        <a href='eggs' src='toast'/>
        </body></html>

    then::

        browser.find_many(tag='a', attribute='rel')                 # first <a>
        browser.find_many(tag='a', attribute=['href', 'src'])       # second and third
        browser.find_many(tag='a',
                          attribute={'href': 'eggs', 'src': 'toast'})  # third

    *id* / *name* match those attributes exactly.
    *tag* matches the element tag: 'A', 'H1', etc.
    *text* matches the element text exactly.
    *text_contains* matches when the text contains the given substring.
    *re_test* is an (attribute name, regular expression) pair the attribute
    must match.
    *xpath* exclusive of all others: select by absolute XPath.
    *css* exclusive of all others: select by CSS selector.
    """
    # BUG FIX: the original loop was inverted — it raised "You must specify
    # at least one parameter" when everything WAS specified and passed
    # silently when nothing was.  It also omitted text_contains/re_test,
    # so those criteria alone would have been rejected.
    for i in (attribute, class_name, id, name, tag, text, text_contains,
              re_test, xpath, css):
        if i is not None:
            break
    else:
        raise ValueError("You must specify at least one parameter")

    # CSS search selected: must be exclusive of every other criterion.
    if css is not None:
        for i in (attribute, class_name, id, name, tag, text, xpath,
                  text_contains, re_test):
            if i is not None:
                raise ValueError("You must either specify xpath, css or "
                                 "other parameter but not both")
        return self.get_elements_by_xpath(cssselect.css_to_xpath(css))

    # XPath search selected: also exclusive.
    if xpath is not None:
        for i in (attribute, class_name, id, name, tag, text,
                  text_contains, re_test):
            if i is not None:
                raise ValueError("You must either specify xpath, css or "
                                 "other parameter but not both")
        return self.get_elements_by_xpath(xpath)

    # Build the attribute-related predicates.
    attr_criteria = []
    if attribute is not None:
        # BUG FIX: the original tested `isinstance(attribute, str)` twice
        # (a mangled str-or-unicode check); one test suffices here.
        if isinstance(attribute, str):
            attr_criteria.append("@%s" % attribute)
        elif isinstance(attribute, list):
            for a in attribute:
                attr_criteria.append("@%s" % a)
        elif isinstance(attribute, dict):
            for k, v in attribute.items():
                # json.dumps quotes/escapes the value for the XPath literal.
                attr_criteria.append("@%s=%s" % (k, json.dumps(v)))
        else:
            raise ValueError(
                'attribute es de tipo no valido. Se esperaba: str, unicode, '
                'list o dict. Se encontro: '
                + str(attribute.__class__.__name__))

    attr_dict = {'class': class_name, 'id': id, 'name': name}
    for attr, attr_val in list(attr_dict.items()):
        if attr_val is not None:
            attr_criteria.append("@%s=%s" % (attr, json.dumps(attr_val)))

    # Text predicates.
    if text is not None:
        attr_criteria.append("@%s=%s" % ('text()', json.dumps(text)))
    if text_contains is not None:
        attr_criteria.append("contains(text(), %s)" % json.dumps(text_contains))

    # Assemble the final XPath expression.
    tag_expr = "*" if tag is None else tag
    if not attr_criteria:
        attr_criteria_expr = ""
    else:
        attr_criteria_expr = "[ %s ]" % " and ".join(attr_criteria)

    xpath_expr = ".// %s %s" % (tag_expr, attr_criteria_expr)
    res = self.get_elements_by_xpath(xpath_expr)

    # Post-filter by regular expression when requested.
    if re_test is not None:
        res = self.__filter_by_re_test(res, re_test)
    return res
# Scrapy spider module for cocktail recipes on en.wikipedia.org.
from urlparse import urljoin
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
from lxml.cssselect import css_to_xpath
from cocktails.items import CocktailItem
from cocktails.utils import html_to_text

# hRecipe microformat containers and their ingredient list items.
xp_recipes = css_to_xpath('.hrecipe')
xp_ingredients = css_to_xpath('.ingredient li')

class WikipediaSpider(CrawlSpider):
    """Crawl cocktail/drink categories on Wikipedia for recipe articles."""
    name = 'wikipedia'
    allowed_domains = ['en.wikipedia.org']
    start_urls = ['http://en.wikipedia.org/wiki/List_of_cocktails']
    rules = (
        # Category pages are only followed; plain articles get parsed.
        Rule(SgmlLinkExtractor(allow=(r'/wiki/Category:Cocktails(\b|_)'))),
        Rule(SgmlLinkExtractor(allow=(r'/wiki/Category:.+(\b|_)drinks?(\b|_)'))),
        Rule(SgmlLinkExtractor(allow=(r'/wiki/[^:]+$')), callback='parse_recipes'),
    )

    def parse_recipes(self, response):
        hxs = HtmlXPathSelector(response)
        # Resolve the article's canonical URL against the response URL.
        for url in hxs.select("//link[@rel='canonical']/@href").extract():
            url = urljoin(response.url, url)
            # NOTE(review): chunk truncated here — the rest of the loop body
            # is not visible in this view.
# Scrapy spider module for ohgo.sh cocktail recipes.
from urlparse import urljoin, urlparse
from itertools import groupby
from functools import partial
from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
from lxml.cssselect import css_to_xpath
from cocktails.items import CocktailItem
from cocktails.utils import html_to_text

# First link inside each .cocktail block.
xp_recipe_links = css_to_xpath(".cocktail") + "//a[1]/@href"

class OhGoshSpider(BaseSpider):
    """Scrape the cocktail-recipe index at ohgo.sh."""
    name = "ohgosh"
    start_urls = ["http://ohgo.sh/cocktail-recipes/"]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        links = hxs.select(xp_recipe_links).extract()
        links = [urljoin(response.url, url) for url in links]
        links.sort()
        # Fragment links (#...) are grouped by their containing page so each
        # page is fetched once and parsed for all of its recipes.
        for page_url, recipe_urls in groupby(links, lambda url: url.split("#")[0]):
            yield Request(page_url, partial(self.parse_recipes, recipe_urls=list(recipe_urls)))

    # NOTE(review): chunk truncated here — the body of parse_recipes is not
    # visible in this view.
    def parse_recipes(self, response, recipe_urls):
# Scrapy spider module for monkey47.com gin-cocktail posts.
from urlparse import urljoin
from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
from lxml.cssselect import css_to_xpath
from cocktails.items import CocktailItem
from cocktails.utils import html_to_text, split_at_br

# Post title, first entry paragraph (the ingredients), and the
# previous-page navigation link.
xp_title = css_to_xpath('.entry-title')
xp_ingredients = css_to_xpath('.entry-content p') + '[1]'
xp_previous_link = css_to_xpath('.nav-previous a') + '/@href'

class Monkey47Spider(BaseSpider):
    """Walk the gin-cocktail tag archive on monkey47.com."""
    name = 'monkey47'
    start_urls = ['http://www.monkey47.com/wordpress/tag/gin_cocktail_rezepte/']

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        # Follow each post-title link to its recipe ...
        for url in hxs.select(xp_title + '//a/@href').extract():
            yield Request(urljoin(response.url, url), self.parse_recipe)
        # ... and paginate backwards through the archive.
        for url in hxs.select(xp_previous_link).extract():
            yield Request(urljoin(response.url, url), self.parse)

    def parse_recipe(self, response):
        hxs = HtmlXPathSelector(response)
        # NOTE(review): chunk truncated here — the rest of parse_recipe is
        # not visible in this view.
# Scrapy spider module for cocktail recipes on en.wikipedia.org.
from urlparse import urljoin
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
from lxml.cssselect import css_to_xpath
from cocktails.items import CocktailItem
from cocktails.utils import html_to_text

# hRecipe microformat containers and their ingredient list items.
xp_recipes = css_to_xpath('.hrecipe')
xp_ingredients = css_to_xpath('.ingredient li')

class WikipediaSpider(CrawlSpider):
    """Crawl cocktail/drink categories on Wikipedia for recipe articles."""
    name = 'wikipedia'
    allowed_domains = ['en.wikipedia.org']
    start_urls = ['http://en.wikipedia.org/wiki/List_of_cocktails']
    rules = (
        # Category pages are only followed; plain articles get parsed.
        Rule(SgmlLinkExtractor(allow=(r'/wiki/Category:Cocktails(\b|_)'))),
        Rule(SgmlLinkExtractor(
            allow=(r'/wiki/Category:.+(\b|_)drinks?(\b|_)'))),
        Rule(SgmlLinkExtractor(allow=(r'/wiki/[^:]+$')), callback='parse_recipes'),
    )

    def parse_recipes(self, response):
        hxs = HtmlXPathSelector(response)
        # NOTE(review): chunk truncated here — the rest of parse_recipes is
        # not visible in this view.
# Scrapy spider module for www.drinksmixer.com.
import re
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from lxml.cssselect import css_to_xpath
from cocktails.items import CocktailItem
from cocktails.utils import html_to_text

# Recipe title and individual ingredient rows.
xp_title = css_to_xpath('.recipe_title')
xp_ingredients = css_to_xpath('.ingredient')

class DrinksMixerSpider(CrawlSpider):
    """Crawl www.drinksmixer.com, scraping every /drink*.html page."""
    name = 'drinksmixer'
    allowed_domains = ['www.drinksmixer.com']
    start_urls = ['http://www.drinksmixer.com/']
    rules = (
        Rule(SgmlLinkExtractor(allow=r'/drink[^/]+.html$'), callback='parse_recipe'),
        Rule(SgmlLinkExtractor(allow=r'/cat/')),
    )

    def parse_recipe(self, response):
        hxs = HtmlXPathSelector(response)
        # Take the first .recipe_title match as the title.
        for title in hxs.select(xp_title).extract():
            break
        # NOTE(review): chunk truncated here — the rest of parse_recipe is
        # not visible in this view.
# Scrapy spider module for saveur.com beverage recipes.
from urlparse import urljoin
from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
from lxml.cssselect import css_to_xpath
from cocktails.items import CocktailItem
from cocktails.utils import html_to_text, split_at_br, extract_extra_ingredients

# Search-result title links and the "next page" link of the Solr listing.
xp_recipe_links = css_to_xpath('.SolrResultTitle a') + '/@href'
xp_next_link = css_to_xpath('.SolrPageNext a') + '/@href'

class SaveurSpider(BaseSpider):
    """Page through saveur.com's beverage search results."""
    name = 'saveur'
    start_urls = [
        'http://www.saveur.com/solrSearchResults.jsp?fq=Course:Beverages'
    ]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        # Follow each result to its recipe page ...
        for url in hxs.select(xp_recipe_links).extract():
            yield Request(urljoin(response.url, url), self.parse_recipe)
        # ... and continue with the next results page.
        for url in hxs.select(xp_next_link).extract():
            yield Request(urljoin(response.url, url), self.parse)

    # NOTE(review): chunk truncated here — the body of parse_recipe is not
    # visible in this view.
    def parse_recipe(self, response):
# Scrapy spider module for www.drinksmixer.com.
import re
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from lxml.cssselect import css_to_xpath
from cocktails.items import CocktailItem
from cocktails.utils import html_to_text

# Recipe title and individual ingredient rows.
xp_title = css_to_xpath('.recipe_title')
xp_ingredients = css_to_xpath('.ingredient')

class DrinksMixerSpider(CrawlSpider):
    """Crawl www.drinksmixer.com, scraping every /drink*.html page."""
    name = 'drinksmixer'
    allowed_domains = ['www.drinksmixer.com']
    start_urls = ['http://www.drinksmixer.com/']
    rules = (
        Rule(SgmlLinkExtractor(allow=r'/drink[^/]+.html$'), callback='parse_recipe'),
        Rule(SgmlLinkExtractor(allow=r'/cat/')),
    )

    def parse_recipe(self, response):
        hxs = HtmlXPathSelector(response)
        # Take the first .recipe_title match as the title; none means the
        # page carries no recipe.
        for title in hxs.select(xp_title).extract():
            break
        else:
            return []
        # NOTE(review): chunk truncated here — item construction is not
        # visible in this view.
# Scrapy spider module for esquire.com drink recipes.
from urlparse import urljoin
from scrapy.contrib.spiders import SitemapSpider
from scrapy.selector import HtmlXPathSelector
from lxml.cssselect import css_to_xpath
from cocktails.items import CocktailItem
from cocktails.utils import html_to_text, unescape

# Individual ingredient entries on a recipe page.
xp_ingredient = css_to_xpath('.ingredient')

class EsquireSpider(SitemapSpider):
    """Scrape esquire.com drink recipes discovered via its sitemap."""
    name = 'esquire'
    sitemap_urls = ['http://www.esquire.com/robots.txt']
    sitemap_rules = [('/drinks/.*-recipe$', 'parse_recipe')]

    def parse_recipe(self, response):
        hxs = HtmlXPathSelector(response)
        # Title comes from the OpenGraph meta tag; skip pages without one.
        for title in hxs.select("//meta[@property='og:title']/@content").extract():
            break
        else:
            return []
        # Optional hero picture, made absolute against the page URL.
        for picture in hxs.select("//*[@id='drink_infopicvid']/img/@src").extract():
            picture = urljoin(response.url, picture)
            break
        else:
            picture = None
        # NOTE(review): chunk truncated here — the rest of parse_recipe is
        # not visible in this view.
# Scrapy spider module for ohgo.sh cocktail recipes.
from urlparse import urljoin, urlparse
from itertools import groupby
from functools import partial
from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
from lxml.cssselect import css_to_xpath
from cocktails.items import CocktailItem
from cocktails.utils import html_to_text

# First link inside each .cocktail block.
xp_recipe_links = css_to_xpath('.cocktail') + '//a[1]/@href'

class OhGoshSpider(BaseSpider):
    """Scrape the cocktail-recipe index at ohgo.sh."""
    name = 'ohgosh'
    start_urls = ['http://ohgo.sh/cocktail-recipes/']

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        links = hxs.select(xp_recipe_links).extract()
        links = [urljoin(response.url, url) for url in links]
        links.sort()
        # Fragment links (#...) are grouped by their containing page.
        for page_url, recipe_urls in groupby(links, lambda url: url.split('#')[0]):
            # NOTE(review): chunk truncated here, mid-call — the remaining
            # Request arguments are not visible in this view.
            yield Request(
                page_url,
# Scrapy spider module for www.kindredcocktails.com.
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from lxml.cssselect import css_to_xpath
from cocktails.items import CocktailItem
from cocktails.utils import html_to_text

# Ingredient table rows of a cocktail page.
xp_ingredients = css_to_xpath('.cocktail-ingredients tr')

class KindredCocktails(CrawlSpider):
    """Crawl www.kindredcocktails.com, scraping /cocktail/<name> pages."""
    name = 'kindredcocktails'
    allowed_domains = ['www.kindredcocktails.com']
    start_urls = ['http://www.kindredcocktails.com']
    rules = (
        Rule(SgmlLinkExtractor(allow=r'/cocktail/[^/?]+$'), callback='parse_recipe'),
        Rule(SgmlLinkExtractor(allow=r'.*')),
    )

    def parse_recipe(self, response):
        hxs = HtmlXPathSelector(response)
        # First <h1> is the title; pages without one carry no recipe.
        for title in hxs.select('//h1').extract():
            break
        else:
            return []
        ingredients = hxs.select(xp_ingredients).extract()
        # NOTE(review): chunk truncated here — item construction is not
        # visible in this view.
USER_AGENT = '%s/%s' % (BOT_NAME, BOT_VERSION) #DEPTH_LIMIT = 2 ########NEW FILE######## __FILENAME__ = cocktaildb from scrapy.contrib.spiders import CrawlSpider, Rule from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor from scrapy.selector import HtmlXPathSelector from lxml.cssselect import css_to_xpath from cocktails.items import CocktailItem from cocktails.utils import html_to_text xp_ingredients = css_to_xpath('.recipeMeasure') class CocktailDbSpider(CrawlSpider): name = 'cocktaildb' allowed_domains = ['www.cocktaildb.com'] start_urls = ['http://www.cocktaildb.com'] rules = ( Rule(SgmlLinkExtractor(allow=r'/recipe_detail\b'), callback='parse_recipe'), Rule(SgmlLinkExtractor(allow=r'.*')), ) def parse_recipe(self, response): hxs = HtmlXPathSelector(response)
# Scrapy spider module for www.cocktailtimes.com.
from urlparse import urljoin
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from lxml.cssselect import css_to_xpath
from cocktails.items import CocktailItem
from cocktails.utils import html_to_text

# Recipe title text nodes.
xp_header = css_to_xpath('.header') + '/text()'
# Text nodes of the first .story block that come after the 'Ingredients:'
# label and start with '-' (one per ingredient line).
xp_ingredients = css_to_xpath('.story') + (
    "[1]//text()["
    "preceding::text()["
    "normalize-space(self::text()) = 'Ingredients:'"
    "]"
    "]["
    "starts-with(normalize-space(self::text()), '-')"
    "]")
# The drink photo is identified by a ' COCKTAIL PHOTO ' HTML comment
# preceding the <img>.
xp_picture = ("//img["
    "preceding::comment()["
    "contains(self::comment(), ' COCKTAIL PHOTO ')"
    "]"
    "]/@src")

class CocktailTimesSpider(CrawlSpider):
    """Crawl www.cocktailtimes.com for drink recipes."""
    name = 'cocktailtimes'
    allowed_domains = ['www.cocktailtimes.com']
    start_urls = ['http://www.cocktailtimes.com']
    # NOTE(review): chunk truncated here — the crawl rules and callbacks are
    # not visible in this view.
# Scrapy spider module for saveur.com beverage recipes.
from urlparse import urljoin
from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
from lxml.cssselect import css_to_xpath
from cocktails.items import CocktailItem
from cocktails.utils import html_to_text, split_at_br, extract_extra_ingredients

# Search-result title links and the "next page" link of the Solr listing.
xp_recipe_links = css_to_xpath('.SolrResultTitle a') + '/@href'
xp_next_link = css_to_xpath('.SolrPageNext a') + '/@href'

class SaveurSpider(BaseSpider):
    """Page through saveur.com's beverage search results."""
    name = 'saveur'
    start_urls = ['http://www.saveur.com/solrSearchResults.jsp?fq=Course:Beverages']

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        # Follow each result to its recipe page ...
        for url in hxs.select(xp_recipe_links).extract():
            yield Request(urljoin(response.url, url), self.parse_recipe)
        # ... and continue with the next results page.
        for url in hxs.select(xp_next_link).extract():
            yield Request(urljoin(response.url, url), self.parse)

    def parse_recipe(self, response):
        hxs = HtmlXPathSelector(response)
        # NOTE(review): chunk truncated here — the loop body is not visible.
        for title in hxs.select('//h1').extract():
# Scrapy spider module for www.kindredcocktails.com.
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from lxml.cssselect import css_to_xpath
from cocktails.items import CocktailItem
from cocktails.utils import html_to_text

# Ingredient table rows of a cocktail page.
xp_ingredients = css_to_xpath('.cocktail-ingredients tr')

class KindredCocktails(CrawlSpider):
    """Crawl www.kindredcocktails.com, scraping /cocktail/<name> pages."""
    name = 'kindredcocktails'
    allowed_domains = ['www.kindredcocktails.com']
    start_urls = ['http://www.kindredcocktails.com']
    rules = (
        Rule(SgmlLinkExtractor(allow=r'/cocktail/[^/?]+$'), callback='parse_recipe'),
        Rule(SgmlLinkExtractor(allow=r'.*')),
    )

    def parse_recipe(self, response):
        hxs = HtmlXPathSelector(response)
        # First <h1> is the title; pages without one carry no recipe.
        for title in hxs.select('//h1').extract():
            break
        else:
            return []
        # NOTE(review): chunk truncated here — item construction is not
        # visible in this view.
# Scrapy spider module for monkey47.com gin-cocktail posts.
from urlparse import urljoin
from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
from lxml.cssselect import css_to_xpath
from cocktails.items import CocktailItem
from cocktails.utils import html_to_text, split_at_br

# Post title, first entry paragraph (the ingredients), and the
# previous-page navigation link.
xp_title = css_to_xpath('.entry-title')
xp_ingredients = css_to_xpath('.entry-content p') + '[1]'
xp_previous_link = css_to_xpath('.nav-previous a') + '/@href'

class Monkey47Spider(BaseSpider):
    """Walk the gin-cocktail tag archive on monkey47.com."""
    name = 'monkey47'
    start_urls = [
        'http://www.monkey47.com/wordpress/tag/gin_cocktail_rezepte/'
    ]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        # Follow each post-title link to its recipe ...
        for url in hxs.select(xp_title + '//a/@href').extract():
            yield Request(urljoin(response.url, url), self.parse_recipe)
        # ... and paginate backwards through the archive.
        for url in hxs.select(xp_previous_link).extract():
            yield Request(urljoin(response.url, url), self.parse)
        # NOTE(review): chunk truncated here — parse_recipe (referenced
        # above) is not visible in this view.
def selector_to_xpath(selector):
    """Convert a jQuery-style selector into an XPath expression.

    The XPath-flavoured ``[@attr]`` attribute test is normalised to the
    CSS form ``[attr]`` before delegating to ``css_to_xpath``.
    """
    cleaned = selector.replace('[@', '[')
    return css_to_xpath(cleaned)
# Scrapy spider module for seriouseats.com cocktail recipes.
import json
from functools import partial
from collections import OrderedDict
from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
from lxml.cssselect import css_to_xpath
from cocktails.items import CocktailItem
from cocktails.utils import html_to_text

# Topic-search endpoint returning up to 200 cocktail entries as JSON.
URL = 'http://www.seriouseats.com/topics/search?index=recipe&count=200&term=c|cocktails'

# Individual ingredient entries on a recipe page.
xp_ingredients = css_to_xpath('.ingredient')

class SeriouseatsSpider(BaseSpider):
    """Scrape seriouseats cocktail recipes via the JSON topic search."""
    name = 'seriouseats'
    start_urls = [URL]

    def parse(self, response):
        recipes = json.loads(response.body)['entries']
        for recipe in recipes:
            # Pick the largest thumbnail_<size> the entry offers; sizes are
            # parsed out of the key names and walked in ascending order.
            picture = None
            for size in sorted(int(k[10:]) for k in recipe if k.startswith('thumbnail_')):
                picture = recipe['thumbnail_%d' % size]
            # NOTE(review): chunk truncated here — the `if picture:` body
            # (and its exact indentation) is not visible in this view.
            if picture: