def runTest(self):
    f = open(doc_fn, "rb")
    c = f.read()
    f.close()
    doc = html.document_fromstring(c)
    body = doc.xpath("//body")[0]
    bad = []
    selector, count = self.selectors[self.index]
    xpath = cssselect.css_to_xpath(cssselect.parse(selector))
    try:
        results = body.xpath(xpath)
    except Exception:
        e = sys.exc_info()[1]
        # exception args must be a tuple, not a bare string
        e.args = ("%s for xpath %r" % (e, xpath),)
        raise
    found = {}
    for item in results:
        if item in found:
            assert 0, "Element shows up multiple times: %r" % item
        found[item] = None
    if isinstance(results, basestring):
        assert 0, "Got string result (%r), not element, for xpath %r" % (results[:20], str(xpath))
    if len(results) != count:
        # if self.shortDescription() == 'div.character, div.dialog':
        #     import pdb; pdb.set_trace()
        assert 0, "Did not get expected results (%s) instead %s for xpath %r" % (count, len(results), str(xpath))
Example no. 2
def runTest(self):
    f = open(doc_fn, 'rb')
    c = f.read()
    f.close()
    doc = html.document_fromstring(c)
    body = doc.xpath('//body')[0]
    bad = []
    selector, count = self.selectors[self.index]
    options = dict(regex_prefix='re')
    xpath = cssselect.css_to_xpath(cssselect.parse(selector, options),
                                   **options)
    try:
        results = body.xpath(xpath, namespaces=namespaces)
    except Exception:
        e = sys.exc_info()[1]
        e.args = ("%s for xpath %r" % (e, xpath), )
        raise
    found = {}
    for item in results:
        if item in found:
            assert 0, ("Element shows up multiple times: %r" % item)
        found[item] = None
    if isinstance(results, basestring):
        assert 0, ("Got string result (%r), not element, for xpath %r" %
                   (results[:20], str(xpath)))
    if len(results) != count:
        # if self.shortDescription() == 'div.character, div.dialog':
        #     import pdb; pdb.set_trace()
        assert 0, (
            "Did not get expected results (%s) instead %s for xpath %r" %
            (count, len(results), str(xpath)))
Example no. 3
def css_to_xpath(css, simplify=True):
    """CSS to XPath.
    
    Example: css_to_xpath('div.pad a')."""
    xpath = cssselect.css_to_xpath(css)
    if simplify:
        xpath = xpath.replace('descendant-or-self::', '//').replace('descendant::', '')
    return xpath
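As a hedged illustration of the simplification step above, here is the same replace chain applied to the output of the standalone cssselect package (whose GenericTranslator provides the css_to_xpath translation these snippets rely on):

from cssselect import GenericTranslator

xpath = GenericTranslator().css_to_xpath('div.pad a')
# typically: descendant-or-self::div[...class test...]/descendant::a
print(xpath.replace('descendant-or-self::', '//').replace('descendant::', ''))
# roughly: //div[@class and contains(concat(' ', normalize-space(@class), ' '), ' pad ')]/a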
Example no. 4
def css_to_xpath(css, simplify=True):
    """CSS to XPath.

    Example: css_to_xpath('div.pad a')."""
    xpath = cssselect.css_to_xpath(css)
    if simplify:
        xpath = xpath.replace('descendant-or-self::',
                              '//').replace('descendant::', '')
    return xpath
Example no. 5
def __init__(self, css, namespaces=XPNSMAP):
    if isinstance(css, unicode):
        # Workaround for bug in lxml on windows/OS X that causes a massive
        # memory leak with non ASCII selectors
        css = css.encode('ascii', 'ignore').decode('ascii')
    try:
        path = self.LOCAL_NAME_RE.sub(r"local-name() = '", css_to_xpath(css))
        self.sel1 = etree.XPath(css_to_xpath(css), namespaces=namespaces)
    except:
        self.sel1 = lambda x: []
    try:
        path = self.LOCAL_NAME_RE.sub(r"local-name() = '",
                                      css_to_xpath_no_case(css))
        self.sel2 = etree.XPath(path, namespaces=namespaces)
    except:
        self.sel2 = lambda x: []
    self.sel2_use_logged = False
    self.css = css
Example no. 6
def __init__(self, css, namespaces=XPNSMAP):
    if isinstance(css, unicode):
        # Workaround for bug in lxml on windows/OS X that causes a massive
        # memory leak with non ASCII selectors
        css = css.encode('ascii', 'ignore').decode('ascii')
    try:
        path = self.LOCAL_NAME_RE.sub(r"local-name() = '",
                                      css_to_xpath(css))
        self.sel1 = etree.XPath(css_to_xpath(css), namespaces=namespaces)
    except:
        self.sel1 = lambda x: []
    try:
        path = self.LOCAL_NAME_RE.sub(r"local-name() = '",
                                      css_to_xpath_no_case(css))
        self.sel2 = etree.XPath(path, namespaces=namespaces)
    except:
        self.sel2 = lambda x: []
    self.sel2_use_logged = False
    self.css = css
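The bare try/except blocks above fall back to a selector that matches nothing whenever compilation fails. A minimal sketch of that pattern with lxml:

from lxml import etree

try:
    sel = etree.XPath('//*[unclosed')  # deliberately invalid expression
except etree.XPathSyntaxError:
    sel = lambda tree: []              # match-nothing fallback
print(sel(etree.fromstring('<root/>')))  # []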
Example no. 7
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector

from lxml.cssselect import css_to_xpath

from cocktails.items import CocktailItem
from cocktails.utils import html_to_text

xp_ingredients = css_to_xpath('.recipeMeasure')


class CocktailDbSpider(CrawlSpider):
    name = 'cocktaildb'
    allowed_domains = ['www.cocktaildb.com']
    start_urls = ['http://www.cocktaildb.com']

    rules = (
        Rule(SgmlLinkExtractor(allow=r'/recipe_detail\b'),
             callback='parse_recipe'),
        Rule(SgmlLinkExtractor(allow=r'.*')),
    )

    def parse_recipe(self, response):
        hxs = HtmlXPathSelector(response)

        for title in hxs.select('//h2').extract():
            break
        else:
            return []
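The for/break/else construct in parse_recipe is a "first match or bail out" idiom shared by several of these spiders; a small standalone sketch:

def first_or_none(items):
    for item in items:
        break          # keep the first item
    else:
        return None    # the loop never ran, so there were no matches
    return item

print(first_or_none(['<h2>Title</h2>', '<h2>Other</h2>']))  # '<h2>Title</h2>'
print(first_or_none([]))                                    # None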
Example no. 8
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector

from lxml.cssselect import css_to_xpath

from cocktails.items import CocktailItem
from cocktails.utils import html_to_text

xp_ingredients = css_to_xpath('.recipeMeasure')

class CocktailDbSpider(CrawlSpider):
	name = 'cocktaildb'
	allowed_domains = ['www.cocktaildb.com']
	start_urls = ['http://www.cocktaildb.com']

	rules = (
		Rule(SgmlLinkExtractor(allow=r'/recipe_detail\b'), callback='parse_recipe'),
		Rule(SgmlLinkExtractor(allow=r'.*')),
	)

	def parse_recipe(self, response):
		hxs = HtmlXPathSelector(response)

		for title in hxs.select('//h2').extract():
Example no. 9
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector

from lxml.cssselect import css_to_xpath

from cocktails.items import CocktailItem
from cocktails.utils import html_to_text

xp_ingredients = css_to_xpath('.recipeMeasure')

class CocktailDbSpider(CrawlSpider):
	name = 'cocktaildb'
	allowed_domains = ['www.cocktaildb.com']
	start_urls = ['http://www.cocktaildb.com']

	rules = (
		Rule(SgmlLinkExtractor(allow=r'/recipe_detail\b'), callback='parse_recipe'),
		Rule(SgmlLinkExtractor(allow=r'.*')),
	)

	def parse_recipe(self, response):
		hxs = HtmlXPathSelector(response)

		for title in hxs.select('//h2').extract():
			break
		else:
			return []

		ingredients = hxs.select(xp_ingredients).extract()
Example no. 10
from urlparse import urljoin

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector

from lxml.cssselect import css_to_xpath

from cocktails.items import CocktailItem
from cocktails.utils import html_to_text

xp_header = css_to_xpath('.header') + '/text()'
xp_ingredients = css_to_xpath('.story') + ("[1]//text()["
	"preceding::text()["
		"normalize-space(self::text()) = 'Ingredients:'"
	"]"
"]["
	"starts-with(normalize-space(self::text()), '-')"
"]")
xp_picture = ("//img["
	"preceding::comment()["
		"contains(self::comment(), ' COCKTAIL PHOTO ')"
	"]"
"]/@src")

class CocktailTimesSpider(CrawlSpider):
	name = 'cocktailtimes'
	allowed_domains = ['www.cocktailtimes.com']
	start_urls = ['http://www.cocktailtimes.com']

	rules = (
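A hedged illustration of what the text() predicates in xp_ingredients above select, run against a tiny hand-made snippet (assumes lxml is installed):

from lxml import html

doc = html.fromstring(
    "<div class='story'>Ingredients:<br/>- 2 oz gin<br/>Shake well</div>")
print(doc.xpath(
    "//div[@class='story'][1]//text()"
    "[preceding::text()[normalize-space(self::text()) = 'Ingredients:']]"
    "[starts-with(normalize-space(self::text()), '-')]"))
# expected: ['- 2 oz gin']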
Example no. 11
def selector_to_xpath(selector, prefix='descendant-or-self::'):
    """JQuery selector to xpath.
    """
    selector = selector.replace('[@', '[')
    return css_to_xpath(selector, prefix)
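A hedged usage sketch for selector_to_xpath; the standalone cssselect package is used here as a stand-in for the older lxml.cssselect.css_to_xpath helper that these snippets import:

from cssselect import GenericTranslator

def css_to_xpath(css, prefix='descendant-or-self::'):
    # stand-in for the older lxml.cssselect.css_to_xpath
    return GenericTranslator().css_to_xpath(css, prefix=prefix)

def selector_to_xpath(selector, prefix='descendant-or-self::'):
    selector = selector.replace('[@', '[')  # jQuery '[@attr]' -> CSS '[attr]'
    return css_to_xpath(selector, prefix)

print(selector_to_xpath('a[@href]'))  # descendant-or-self::a[@href]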
Example no. 12
	def find_many(self,
			attribute=None,
			class_name=None,
			id=None,
			name=None,
			tag=None,
			text=None,
			text_contains=None,
			re_test=None,
			xpath=None,
			css=None
		):
		"""
		Retorna una lista de elementos que cumplen con un conjunto de 
		determinados criterios de búsqueda. Los criterios se definen usando
		keywords:
		
		*class_name*
			si tiene un clase dada 
		*attribute*
			attribute puede ser de tres tipos:
				1. str o unicode: Se busca un elemento con un atributo del tipo indicado.
				2. list: Se busca un elemento que contenga todos los atributos especificados en la lista.
				3. dict: Se busca un elemento que contenga todos los atributos especificados y que ademas 
						los valores de los mismos coincidan con los valores indicados por el diccionario.
					
			Si se tiene el siguiente código
			
			.. code-block:: html
			
				<html><body>
					<a href='spam' rel='bar'/>
					<a href='eggs' src='foo'/>
					<a href='eggs' src='toast'/>
				</body></html>

			Suponiendo que browser cargó esa página ::	
				
				browser.find_many(tag = 'a', attribute = 'rel')                                # devuelve el primer a
				browser.find_many(tag = 'a', attribute = [ 'href', 'src' ])                    # devuelve el segundo y el tercer a 
				browser.find_many(tag = 'a', attribute = { 'href' : 'eggs', 'src' : 'toast' }) # devuelve el tercero 
			
		*id*
			si tiene el id en un valor determinado
		*name*
			si tiene el atributo name definido con cierto valor
		*tag*
			si tiene un en la de elemento dado: 'A', 'H1', etc. 
		*text*
			si tiene algún texto en particular
		*text_contains*
			si el texto contiene alguna cadena particular
		*re_test*
			busca que un atributo coincida con una expresión regular
			se deben pasar pares (<nombre atributo>, expresión regular)
		*xpath*
			Excluyente a todos los otros: Obtiene por xpath absoluto
		*css*
			Excluyente a todos los otros: Obtiene por selector CSS
			
		"""
		for i in (attribute, class_name, id, name, tag, text, text_contains,
				re_test, xpath, css):
			if i is not None:
				break
		else:
			raise ValueError("You must specify at least one parameter")

		# search by CSS selector was chosen
		if css is not None:
			for i in (attribute, class_name, id, name, tag, text, xpath,
					text_contains, re_test):
				if i is not None:
					raise ValueError("You must specify either css or "
						"other parameters, but not both")

			return self.get_elements_by_xpath(cssselect.css_to_xpath(css))

		# search by XPath was chosen
		if xpath is not None:
			for i in (attribute, class_name, id, name, tag, text,
					text_contains, re_test):
				if i is not None:
					raise ValueError("You must specify either xpath or "
						"other parameters, but not both")
			return self.get_elements_by_xpath(xpath)
		
		# build the criteria that involve attributes
		attr_criteria = []
		if attribute is not None:
			# 'attribute' may be one of three types:
			# 1) str, unicode: match elements that have an attribute with the given name.
			# 2) list: match elements that have every attribute named in the list.
			# 3) dict: match elements that have every attribute named in the dict,
			#    whose values also match the values given by the dict.

			if isinstance(attribute, (str, unicode)):
				attr_criteria.append("@%s" % attribute)
			elif isinstance(attribute, list):
				for a in attribute:
					attr_criteria.append("@%s" % a)
			elif isinstance(attribute, dict):
				for k, v in attribute.items():
					attr_criteria.append("@%s=%s" % (k, json.dumps(v)))
			else:
				raise ValueError('attribute has an invalid type. Expected str, unicode, list or dict; found: ' + str(attribute.__class__.__name__))
		
		attr_dict = {'class': class_name, 'id': id, 'name': name}
		for attr, attr_val in list(attr_dict.items()):
			if attr_val is not None:
				attr_criteria.append("@%s=%s" % (attr, json.dumps(attr_val)))

		# by text; text() is a node test, not an attribute, so no leading '@'
		if text is not None:
			attr_criteria.append("text()=%s" % json.dumps(text))

		if text_contains is not None:
			attr_criteria.append("contains(text(), %s)" % json.dumps(text_contains))

		# build each part of the final expression
		if tag is None:
			tag_expr = "*"
		else:
			tag_expr = tag

		if len(attr_criteria) == 0:
			attr_criteria_expr = ""
		else:
			attr_criteria_expr = "[ %s ]" % " and ".join(attr_criteria)

		# build the final xpath to evaluate
		xpath_expr = ".// %s %s" % (tag_expr, attr_criteria_expr)
		res = self.get_elements_by_xpath(xpath_expr)

		# filter by regular expression, if one was given
		if re_test is not None:
			res = self.__filter_by_re_test(res, re_test)

		return res
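To make the string assembly above concrete, here is a minimal standalone sketch of the expression find_many would build for the hypothetical call find_many(tag='a', attribute={'href': 'eggs'}, class_name='x'):

import json

attr_criteria = []
for k, v in {'href': 'eggs'}.items():                # the attribute dict branch
    attr_criteria.append("@%s=%s" % (k, json.dumps(v)))
attr_criteria.append("@class=%s" % json.dumps('x'))  # the class_name criterion

xpath_expr = ".// %s %s" % ('a', "[ %s ]" % " and ".join(attr_criteria))
print(xpath_expr)  # .// a [ @href="eggs" and @class="x" ]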
Example no. 13
from urlparse import urljoin

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector

from lxml.cssselect import css_to_xpath

from cocktails.items import CocktailItem
from cocktails.utils import html_to_text

xp_recipes = css_to_xpath('.hrecipe')
xp_ingredients = css_to_xpath('.ingredient li')

class WikipediaSpider(CrawlSpider):
	name = 'wikipedia'
	allowed_domains = ['en.wikipedia.org']
	start_urls = ['http://en.wikipedia.org/wiki/List_of_cocktails']

	rules = (
		Rule(SgmlLinkExtractor(allow=(r'/wiki/Category:Cocktails(\b|_)'))),
		Rule(SgmlLinkExtractor(allow=(r'/wiki/Category:.+(\b|_)drinks?(\b|_)'))),
		Rule(SgmlLinkExtractor(allow=(r'/wiki/[^:]+$')), callback='parse_recipes'),
	)

	def parse_recipes(self, response):
		hxs = HtmlXPathSelector(response)

		for url in hxs.select("//link[@rel='canonical']/@href").extract():
			url = urljoin(response.url, url)
Example no. 14
from urlparse import urljoin, urlparse
from itertools import groupby
from functools import partial

from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector

from lxml.cssselect import css_to_xpath

from cocktails.items import CocktailItem
from cocktails.utils import html_to_text

xp_recipe_links = css_to_xpath(".cocktail") + "//a[1]/@href"


class OhGoshSpider(BaseSpider):
    name = "ohgosh"
    start_urls = ["http://ohgo.sh/cocktail-recipes/"]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)

        links = hxs.select(xp_recipe_links).extract()
        links = [urljoin(response.url, url) for url in links]
        links.sort()

        for page_url, recipe_urls in groupby(links, lambda url: url.split("#")[0]):
            yield Request(page_url, partial(self.parse_recipes, recipe_urls=list(recipe_urls)))

    def parse_recipes(self, response, recipe_urls):
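Note that the sort before groupby above is what makes the grouping work: groupby only merges adjacent items with equal keys. A small sketch with hypothetical URLs:

from itertools import groupby

links = sorted([
    'http://ohgo.sh/recipes/#a',
    'http://ohgo.sh/recipes/#b',
    'http://ohgo.sh/other/#c',
])
for page_url, urls in groupby(links, lambda url: url.split('#')[0]):
    print(page_url, list(urls))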
Example no. 15
from urlparse import urljoin

from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector

from lxml.cssselect import css_to_xpath

from cocktails.items import CocktailItem
from cocktails.utils import html_to_text, split_at_br

xp_title = css_to_xpath('.entry-title')
xp_ingredients = css_to_xpath('.entry-content p') + '[1]'
xp_previous_link = css_to_xpath('.nav-previous a') + '/@href'

class Monkey47Spider(BaseSpider):
	name = 'monkey47'
	start_urls = ['http://www.monkey47.com/wordpress/tag/gin_cocktail_rezepte/']

	def parse(self, response):
		hxs = HtmlXPathSelector(response)

		for url in hxs.select(xp_title + '//a/@href').extract():
			yield Request(urljoin(response.url, url), self.parse_recipe)

		for url in hxs.select(xp_previous_link).extract():
			yield Request(urljoin(response.url, url), self.parse)

	def parse_recipe(self, response):
		hxs = HtmlXPathSelector(response)
Example no. 16
from urlparse import urljoin

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector

from lxml.cssselect import css_to_xpath

from cocktails.items import CocktailItem
from cocktails.utils import html_to_text

xp_recipes = css_to_xpath('.hrecipe')
xp_ingredients = css_to_xpath('.ingredient li')


class WikipediaSpider(CrawlSpider):
    name = 'wikipedia'
    allowed_domains = ['en.wikipedia.org']
    start_urls = ['http://en.wikipedia.org/wiki/List_of_cocktails']

    rules = (
        Rule(SgmlLinkExtractor(allow=(r'/wiki/Category:Cocktails(\b|_)'))),
        Rule(SgmlLinkExtractor(
            allow=(r'/wiki/Category:.+(\b|_)drinks?(\b|_)'))),
        Rule(SgmlLinkExtractor(allow=(r'/wiki/[^:]+$')),
             callback='parse_recipes'),
    )

    def parse_recipes(self, response):
        hxs = HtmlXPathSelector(response)
Example no. 17
import re

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector

from lxml.cssselect import css_to_xpath

from cocktails.items import CocktailItem
from cocktails.utils import html_to_text

xp_title = css_to_xpath('.recipe_title')
xp_ingredients = css_to_xpath('.ingredient')


class DrinksMixerSpider(CrawlSpider):
    name = 'drinksmixer'
    allowed_domains = ['www.drinksmixer.com']
    start_urls = ['http://www.drinksmixer.com/']

    rules = (
        Rule(SgmlLinkExtractor(allow=r'/drink[^/]+.html$'),
             callback='parse_recipe'),
        Rule(SgmlLinkExtractor(allow=r'/cat/')),
    )

    def parse_recipe(self, response):
        hxs = HtmlXPathSelector(response)

        for title in hxs.select(xp_title).extract():
            break
Example no. 18
from urlparse import urljoin

from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector

from lxml.cssselect import css_to_xpath

from cocktails.items import CocktailItem
from cocktails.utils import html_to_text, split_at_br, extract_extra_ingredients

xp_recipe_links = css_to_xpath('.SolrResultTitle a') + '/@href'
xp_next_link = css_to_xpath('.SolrPageNext a') + '/@href'


class SaveurSpider(BaseSpider):
    name = 'saveur'
    start_urls = [
        'http://www.saveur.com/solrSearchResults.jsp?fq=Course:Beverages'
    ]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)

        for url in hxs.select(xp_recipe_links).extract():
            yield Request(urljoin(response.url, url), self.parse_recipe)

        for url in hxs.select(xp_next_link).extract():
            yield Request(urljoin(response.url, url), self.parse)

    def parse_recipe(self, response):
Example no. 19
import re

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector

from lxml.cssselect import css_to_xpath

from cocktails.items import CocktailItem
from cocktails.utils import html_to_text

xp_title = css_to_xpath('.recipe_title')
xp_ingredients = css_to_xpath('.ingredient')

class DrinksMixerSpider(CrawlSpider):
	name = 'drinksmixer'
	allowed_domains = ['www.drinksmixer.com']
	start_urls = ['http://www.drinksmixer.com/']

	rules = (
		Rule(SgmlLinkExtractor(allow=r'/drink[^/]+.html$'), callback='parse_recipe'),
		Rule(SgmlLinkExtractor(allow=r'/cat/')),
	)

	def parse_recipe(self, response):
		hxs = HtmlXPathSelector(response)

		for title in hxs.select(xp_title).extract():
			break
		else:
			return []
Example no. 20
from urlparse import urljoin

from scrapy.contrib.spiders import SitemapSpider
from scrapy.selector import HtmlXPathSelector

from lxml.cssselect import css_to_xpath

from cocktails.items import CocktailItem
from cocktails.utils import html_to_text, unescape

xp_ingredient = css_to_xpath('.ingredient')

class EsquireSpider(SitemapSpider):
	name = 'esquire'
	sitemap_urls = ['http://www.esquire.com/robots.txt']
	sitemap_rules = [('/drinks/.*-recipe$', 'parse_recipe')]

	def parse_recipe(self, response):
		hxs = HtmlXPathSelector(response)

		for title in hxs.select("//meta[@property='og:title']/@content").extract():
			break
		else:
			return []

		for picture in hxs.select("//*[@id='drink_infopicvid']/img/@src").extract():
			picture = urljoin(response.url, picture)
			break
		else:
			picture = None
Example no. 21
from urlparse import urljoin, urlparse
from itertools import groupby
from functools import partial

from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector

from lxml.cssselect import css_to_xpath

from cocktails.items import CocktailItem
from cocktails.utils import html_to_text

xp_recipe_links = css_to_xpath('.cocktail') + '//a[1]/@href'


class OhGoshSpider(BaseSpider):
    name = 'ohgosh'
    start_urls = ['http://ohgo.sh/cocktail-recipes/']

    def parse(self, response):
        hxs = HtmlXPathSelector(response)

        links = hxs.select(xp_recipe_links).extract()
        links = [urljoin(response.url, url) for url in links]
        links.sort()

        for page_url, recipe_urls in groupby(links,
                                             lambda url: url.split('#')[0]):
            yield Request(
                page_url,
Example no. 22
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector

from lxml.cssselect import css_to_xpath

from cocktails.items import CocktailItem
from cocktails.utils import html_to_text

xp_ingredients = css_to_xpath('.cocktail-ingredients tr')

class KindredCocktails(CrawlSpider):
    name = 'kindredcocktails'
    allowed_domains = ['www.kindredcocktails.com']
    start_urls = ['http://www.kindredcocktails.com']

    rules = (
        Rule(SgmlLinkExtractor(allow=r'/cocktail/[^/?]+$'), callback='parse_recipe'),
        Rule(SgmlLinkExtractor(allow=r'.*')),
    )

    def parse_recipe(self, response):
        hxs = HtmlXPathSelector(response)

        for title in hxs.select('//h1').extract():
            break
        else:
            return []

        ingredients = hxs.select(xp_ingredients).extract()
Example no. 23
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector

from lxml.cssselect import css_to_xpath

from cocktails.items import CocktailItem
from cocktails.utils import html_to_text

xp_ingredients = css_to_xpath('.recipeMeasure')


class CocktailDbSpider(CrawlSpider):
    name = 'cocktaildb'
    allowed_domains = ['www.cocktaildb.com']
    start_urls = ['http://www.cocktaildb.com']

    rules = (
        Rule(SgmlLinkExtractor(allow=r'/recipe_detail\b'),
             callback='parse_recipe'),
        Rule(SgmlLinkExtractor(allow=r'.*')),
    )

    def parse_recipe(self, response):
        hxs = HtmlXPathSelector(response)
Example no. 24
from urlparse import urljoin

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector

from lxml.cssselect import css_to_xpath

from cocktails.items import CocktailItem
from cocktails.utils import html_to_text

xp_header = css_to_xpath('.header') + '/text()'
xp_ingredients = css_to_xpath('.story') + (
    "[1]//text()["
    "preceding::text()["
    "normalize-space(self::text()) = 'Ingredients:'"
    "]"
    "]["
    "starts-with(normalize-space(self::text()), '-')"
    "]")
xp_picture = ("//img["
              "preceding::comment()["
              "contains(self::comment(), ' COCKTAIL PHOTO ')"
              "]"
              "]/@src")


class CocktailTimesSpider(CrawlSpider):
    name = 'cocktailtimes'
    allowed_domains = ['www.cocktailtimes.com']
    start_urls = ['http://www.cocktailtimes.com']
Example no. 25
from urlparse import urljoin

from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector

from lxml.cssselect import css_to_xpath

from cocktails.items import CocktailItem
from cocktails.utils import html_to_text, split_at_br, extract_extra_ingredients

xp_recipe_links = css_to_xpath('.SolrResultTitle a') + '/@href'
xp_next_link = css_to_xpath('.SolrPageNext a') + '/@href'

class SaveurSpider(BaseSpider):
	name = 'saveur'
	start_urls = ['http://www.saveur.com/solrSearchResults.jsp?fq=Course:Beverages']

	def parse(self, response):
		hxs = HtmlXPathSelector(response)

		for url in hxs.select(xp_recipe_links).extract():
			yield Request(urljoin(response.url, url), self.parse_recipe)

		for url in hxs.select(xp_next_link).extract():
			yield Request(urljoin(response.url, url), self.parse)

	def parse_recipe(self, response):
		hxs = HtmlXPathSelector(response)

		for title in hxs.select('//h1').extract():
Example no. 26
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector

from lxml.cssselect import css_to_xpath

from cocktails.items import CocktailItem
from cocktails.utils import html_to_text

xp_ingredients = css_to_xpath('.cocktail-ingredients tr')


class KindredCocktails(CrawlSpider):
    name = 'kindredcocktails'
    allowed_domains = ['www.kindredcocktails.com']
    start_urls = ['http://www.kindredcocktails.com']

    rules = (
        Rule(SgmlLinkExtractor(allow=r'/cocktail/[^/?]+$'),
             callback='parse_recipe'),
        Rule(SgmlLinkExtractor(allow=r'.*')),
    )

    def parse_recipe(self, response):
        hxs = HtmlXPathSelector(response)

        for title in hxs.select('//h1').extract():
            break
        else:
            return []
Example no. 27
from urlparse import urljoin

from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector

from lxml.cssselect import css_to_xpath

from cocktails.items import CocktailItem
from cocktails.utils import html_to_text, split_at_br

xp_title = css_to_xpath('.entry-title')
xp_ingredients = css_to_xpath('.entry-content p') + '[1]'
xp_previous_link = css_to_xpath('.nav-previous a') + '/@href'


class Monkey47Spider(BaseSpider):
    name = 'monkey47'
    start_urls = [
        'http://www.monkey47.com/wordpress/tag/gin_cocktail_rezepte/'
    ]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)

        for url in hxs.select(xp_title + '//a/@href').extract():
            yield Request(urljoin(response.url, url), self.parse_recipe)

        for url in hxs.select(xp_previous_link).extract():
            yield Request(urljoin(response.url, url), self.parse)
Example no. 28
def selector_to_xpath(selector):
    """JQuery selector to xpath.
    """
    selector = selector.replace('[@', '[')
    return css_to_xpath(selector)
Example no. 29
import json
from functools import partial
from collections import OrderedDict

from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector

from lxml.cssselect import css_to_xpath

from cocktails.items import CocktailItem
from cocktails.utils import html_to_text

URL = 'http://www.seriouseats.com/topics/search?index=recipe&count=200&term=c|cocktails'

xp_ingredients = css_to_xpath('.ingredient')

class SeriouseatsSpider(BaseSpider):
	name = 'seriouseats'
	start_urls = [URL]

	def parse(self, response):
		recipes = json.loads(response.body)['entries']

		for recipe in recipes:
			picture = None

			for size in sorted(int(k[10:]) for k in recipe if k.startswith('thumbnail_')):
				picture = recipe['thumbnail_%d' % size]

				if picture:
Example no. 30
def selector_to_xpath(selector, prefix='descendant-or-self::'):
    """JQuery selector to xpath.
    """
    selector = selector.replace('[@', '[')
    return css_to_xpath(selector, prefix)