def runTest(self):
    """Run one CSS-selector test case.

    Loads the shared HTML document, translates ``self.selectors[self.index]``
    into XPath via cssselect, evaluates it against ``<body>``, and asserts
    that the result is a duplicate-free, non-string node-set of the expected
    size.

    NOTE(review): ``doc_fn``, ``html``, ``cssselect`` and ``sys`` come from
    the enclosing module (not visible in this chunk).
    """
    # BUG FIX: use a context manager so the file is closed even when
    # read()/parsing raises (the original leaked the handle on error).
    with open(doc_fn, "rb") as f:
        c = f.read()
    doc = html.document_fromstring(c)
    body = doc.xpath("//body")[0]
    selector, count = self.selectors[self.index]
    xpath = cssselect.css_to_xpath(cssselect.parse(selector))
    try:
        results = body.xpath(xpath)
    except Exception:
        e = sys.exc_info()[1]
        # BUG FIX: args must be a tuple; the original assigned a plain
        # string, which makes every character a separate argument when the
        # exception is re-raised and printed.
        e.args = ("%s for xpath %r" % (e, xpath),)
        raise
    # Each matched element must appear exactly once in the node-set.
    found = {}
    for item in results:
        if item in found:
            assert 0, "Element shows up multiple times: %r" % item
        found[item] = None
    if isinstance(results, basestring):
        assert 0, "Got string result (%r), not element, for xpath %r" % (
            results[:20], str(xpath))
    if len(results) != count:
        # if self.shortDescription() == 'div.character, div.dialog':
        #     import pdb; pdb.set_trace()
        assert 0, "Did not get expected results (%s) instead %s for xpath %r" % (
            count, len(results), str(xpath))
def runTest(self):
    """Run one CSS-selector test case (namespace/regex-aware variant).

    Like the plain variant but parses the selector with a ``regex_prefix``
    option and evaluates the XPath with the module-level ``namespaces`` map.

    NOTE(review): ``doc_fn``, ``html``, ``cssselect``, ``sys`` and
    ``namespaces`` come from the enclosing module (not visible here).
    """
    # BUG FIX: use a context manager so the file is closed even when
    # read()/parsing raises (the original leaked the handle on error).
    with open(doc_fn, 'rb') as f:
        c = f.read()
    doc = html.document_fromstring(c)
    body = doc.xpath('//body')[0]
    selector, count = self.selectors[self.index]
    # 're:' prefix enables regular-expression pseudo-selectors.
    options = dict(regex_prefix='re')
    xpath = cssselect.css_to_xpath(cssselect.parse(selector, options), **options)
    try:
        results = body.xpath(xpath, namespaces=namespaces)
    except Exception:
        e = sys.exc_info()[1]
        e.args = ("%s for xpath %r" % (e, xpath), )
        raise
    # Each matched element must appear exactly once in the node-set.
    found = {}
    for item in results:
        if item in found:
            assert 0, ("Element shows up multiple times: %r" % item)
        found[item] = None
    if isinstance(results, basestring):
        assert 0, ("Got string result (%r), not element, for xpath %r"
                   % (results[:20], str(xpath)))
    if len(results) != count:
        #if self.shortDescription() == 'div.character, div.dialog':
        #    import pdb; pdb.set_trace()
        assert 0, (
            "Did not get expected results (%s) instead %s for xpath %r"
            % (count, len(results), str(xpath)))
def css_to_xpath(css, simplify=True):
    """Translate a CSS selector into an XPath expression.

    Example: css_to_xpath('div.pad a').  When *simplify* is true, the
    verbose axes emitted by cssselect are collapsed into the shorter
    ``//`` form.
    """
    expression = cssselect.css_to_xpath(css)
    if not simplify:
        return expression
    for axis, shorthand in (('descendant-or-self::', '//'),
                            ('descendant::', '')):
        expression = expression.replace(axis, shorthand)
    return expression
def __init__(self, css, namespaces=XPNSMAP):
    """Compile *css* into a primary (case-sensitive) evaluator ``sel1``
    and a case-insensitive fallback ``sel2``.

    A compilation failure in either leaves a no-op selector in its place
    so callers always get a callable.
    """
    if isinstance(css, unicode):
        # Workaround for bug in lxml on windows/OS X that causes a massive
        # memory leak with non ASCII selectors
        css = css.encode('ascii', 'ignore').decode('ascii')
    try:
        # NOTE(review): `path` is computed here but sel1 compiles the raw
        # css_to_xpath(css) expression instead — confirm whether sel1 was
        # meant to use `path` like sel2 does below.
        path = self.LOCAL_NAME_RE.sub(r"local-name() = '", css_to_xpath(css))
        self.sel1 = etree.XPath(css_to_xpath(css), namespaces=namespaces)
    except Exception:
        # BUG FIX: was a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit.
        self.sel1 = lambda x: []
    try:
        path = self.LOCAL_NAME_RE.sub(r"local-name() = '",
                                      css_to_xpath_no_case(css))
        self.sel2 = etree.XPath(path, namespaces=namespaces)
    except Exception:
        # BUG FIX: bare `except:` narrowed to Exception, as above.
        self.sel2 = lambda x: []
    # Tracks whether falling back to sel2 has already been logged.
    self.sel2_use_logged = False
    self.css = css
# Scrapy spider module for www.cocktaildb.com.
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from lxml.cssselect import css_to_xpath
from cocktails.items import CocktailItem
from cocktails.utils import html_to_text

# XPath selecting the ingredient lines of a recipe page.
xp_ingredients = css_to_xpath('.recipeMeasure')

class CocktailDbSpider(CrawlSpider):
    """Crawl www.cocktaildb.com, scraping every /recipe_detail page."""
    name = 'cocktaildb'
    allowed_domains = ['www.cocktaildb.com']
    start_urls = ['http://www.cocktaildb.com']
    rules = (
        # Recipe pages are parsed; every other on-site link is just followed.
        Rule(SgmlLinkExtractor(allow=r'/recipe_detail\b'), callback='parse_recipe'),
        Rule(SgmlLinkExtractor(allow=r'.*')),
    )

    def parse_recipe(self, response):
        hxs = HtmlXPathSelector(response)
        # Use the first <h2> as the recipe title; no <h2> means no recipe.
        for title in hxs.select('//h2').extract():
            break
        else:
            return []
        # NOTE(review): chunk truncated here — the code that builds and
        # returns a CocktailItem is not visible in this view.
USER_AGENT = '%s/%s' % (BOT_NAME, BOT_VERSION) #DEPTH_LIMIT = 2 ########NEW FILE######## __FILENAME__ = cocktaildb from scrapy.contrib.spiders import CrawlSpider, Rule from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor from scrapy.selector import HtmlXPathSelector from lxml.cssselect import css_to_xpath from cocktails.items import CocktailItem from cocktails.utils import html_to_text xp_ingredients = css_to_xpath('.recipeMeasure') class CocktailDbSpider(CrawlSpider): name = 'cocktaildb' allowed_domains = ['www.cocktaildb.com'] start_urls = ['http://www.cocktaildb.com'] rules = ( Rule(SgmlLinkExtractor(allow=r'/recipe_detail\b'), callback='parse_recipe'), Rule(SgmlLinkExtractor(allow=r'.*')), ) def parse_recipe(self, response): hxs = HtmlXPathSelector(response) for title in hxs.select('//h2').extract():
# Scrapy spider module for www.cocktaildb.com.
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from lxml.cssselect import css_to_xpath
from cocktails.items import CocktailItem
from cocktails.utils import html_to_text

# XPath selecting the ingredient lines of a recipe page.
xp_ingredients = css_to_xpath('.recipeMeasure')

class CocktailDbSpider(CrawlSpider):
    """Crawl www.cocktaildb.com, scraping every /recipe_detail page."""
    name = 'cocktaildb'
    allowed_domains = ['www.cocktaildb.com']
    start_urls = ['http://www.cocktaildb.com']
    rules = (
        Rule(SgmlLinkExtractor(allow=r'/recipe_detail\b'), callback='parse_recipe'),
        Rule(SgmlLinkExtractor(allow=r'.*')),
    )

    def parse_recipe(self, response):
        hxs = HtmlXPathSelector(response)
        # Use the first <h2> as the recipe title; no <h2> means no recipe.
        for title in hxs.select('//h2').extract():
            break
        else:
            return []
        ingredients = hxs.select(xp_ingredients).extract()
        # NOTE(review): chunk truncated here — item construction not visible.
# Scrapy spider module for www.cocktailtimes.com.
from urlparse import urljoin
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from lxml.cssselect import css_to_xpath
from cocktails.items import CocktailItem
from cocktails.utils import html_to_text

# Recipe title text nodes.
xp_header = css_to_xpath('.header') + '/text()'
# Text nodes of the first .story block that come after the 'Ingredients:'
# label and start with '-' (one per ingredient line).
xp_ingredients = css_to_xpath('.story') + ("[1]//text()["
    "preceding::text()["
    "normalize-space(self::text()) = 'Ingredients:'"
    "]"
    "]["
    "starts-with(normalize-space(self::text()), '-')"
    "]")
# The drink photo is identified by a ' COCKTAIL PHOTO ' HTML comment
# preceding the <img>.
xp_picture = ("//img["
    "preceding::comment()["
    "contains(self::comment(), ' COCKTAIL PHOTO ')"
    "]"
    "]/@src")

class CocktailTimesSpider(CrawlSpider):
    """Crawl www.cocktailtimes.com for drink recipes."""
    name = 'cocktailtimes'
    allowed_domains = ['www.cocktailtimes.com']
    start_urls = ['http://www.cocktailtimes.com']
    # NOTE(review): chunk truncated here, mid-tuple — the Rule entries are
    # not visible in this view.
    rules = (
def selector_to_xpath(selector, prefix='descendant-or-self::'):
    """Convert a jQuery-style selector into an XPath expression.

    The XPath-flavoured ``[@attr]`` attribute test is normalised to the
    CSS form ``[attr]`` before delegating to ``css_to_xpath`` with the
    given axis *prefix*.
    """
    normalized = selector.replace('[@', '[')
    return css_to_xpath(normalized, prefix)
def find_many(self, attribute=None, class_name=None, id=None, name=None,
              tag=None, text=None, text_contains=None, re_test=None,
              xpath=None, css=None):
    """Return the list of elements matching a set of search criteria.

    Criteria are given as keywords:

    *class_name* matches elements with the given CSS class.

    *attribute* may be of three types:

        1. str or unicode: match elements carrying an attribute of that name.
        2. list: match elements carrying every attribute in the list.
        3. dict: match elements carrying every listed attribute whose value
           also equals the one given in the dict.

    Given this document::

        <html><body>
        <a href='spam' rel='bar'/>
        <a href='eggs' src='foo'/>
        <a href='eggs' src='toast'/>
        </body></html>

    then::

        browser.find_many(tag='a', attribute='rel')                 # first <a>
        browser.find_many(tag='a', attribute=['href', 'src'])       # second and third
        browser.find_many(tag='a',
                          attribute={'href': 'eggs', 'src': 'toast'})  # third

    *id* / *name* match those attributes exactly.
    *tag* matches the element tag: 'A', 'H1', etc.
    *text* matches the element text exactly.
    *text_contains* matches when the text contains the given substring.
    *re_test* is an (attribute name, regular expression) pair the attribute
    must match.
    *xpath* exclusive of all others: select by absolute XPath.
    *css* exclusive of all others: select by CSS selector.
    """
    # BUG FIX: the original loop was inverted — it raised "You must specify
    # at least one parameter" when everything WAS specified and passed
    # silently when nothing was.  It also omitted text_contains/re_test,
    # so those criteria alone would have been rejected.
    for i in (attribute, class_name, id, name, tag, text, text_contains,
              re_test, xpath, css):
        if i is not None:
            break
    else:
        raise ValueError("You must specify at least one parameter")

    # CSS search selected: must be exclusive of every other criterion.
    if css is not None:
        for i in (attribute, class_name, id, name, tag, text, xpath,
                  text_contains, re_test):
            if i is not None:
                raise ValueError("You must either specify xpath, css or "
                                 "other parameter but not both")
        return self.get_elements_by_xpath(cssselect.css_to_xpath(css))

    # XPath search selected: also exclusive.
    if xpath is not None:
        for i in (attribute, class_name, id, name, tag, text,
                  text_contains, re_test):
            if i is not None:
                raise ValueError("You must either specify xpath, css or "
                                 "other parameter but not both")
        return self.get_elements_by_xpath(xpath)

    # Build the attribute-related predicates.
    attr_criteria = []
    if attribute is not None:
        # BUG FIX: the original tested `isinstance(attribute, str)` twice
        # (a mangled str-or-unicode check); one test suffices here.
        if isinstance(attribute, str):
            attr_criteria.append("@%s" % attribute)
        elif isinstance(attribute, list):
            for a in attribute:
                attr_criteria.append("@%s" % a)
        elif isinstance(attribute, dict):
            for k, v in attribute.items():
                # json.dumps quotes/escapes the value for the XPath literal.
                attr_criteria.append("@%s=%s" % (k, json.dumps(v)))
        else:
            raise ValueError(
                'attribute es de tipo no valido. Se esperaba: str, unicode, '
                'list o dict. Se encontro: '
                + str(attribute.__class__.__name__))

    attr_dict = {'class': class_name, 'id': id, 'name': name}
    for attr, attr_val in list(attr_dict.items()):
        if attr_val is not None:
            attr_criteria.append("@%s=%s" % (attr, json.dumps(attr_val)))

    # Text predicates.
    if text is not None:
        attr_criteria.append("@%s=%s" % ('text()', json.dumps(text)))
    if text_contains is not None:
        attr_criteria.append("contains(text(), %s)" % json.dumps(text_contains))

    # Assemble the final XPath expression.
    tag_expr = "*" if tag is None else tag
    if not attr_criteria:
        attr_criteria_expr = ""
    else:
        attr_criteria_expr = "[ %s ]" % " and ".join(attr_criteria)

    xpath_expr = ".// %s %s" % (tag_expr, attr_criteria_expr)
    res = self.get_elements_by_xpath(xpath_expr)

    # Post-filter by regular expression when requested.
    if re_test is not None:
        res = self.__filter_by_re_test(res, re_test)
    return res
# Scrapy spider module for cocktail recipes on en.wikipedia.org.
from urlparse import urljoin
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
from lxml.cssselect import css_to_xpath
from cocktails.items import CocktailItem
from cocktails.utils import html_to_text

# hRecipe microformat containers and their ingredient list items.
xp_recipes = css_to_xpath('.hrecipe')
xp_ingredients = css_to_xpath('.ingredient li')

class WikipediaSpider(CrawlSpider):
    """Crawl cocktail/drink categories on Wikipedia for recipe articles."""
    name = 'wikipedia'
    allowed_domains = ['en.wikipedia.org']
    start_urls = ['http://en.wikipedia.org/wiki/List_of_cocktails']
    rules = (
        # Category pages are only followed; plain articles get parsed.
        Rule(SgmlLinkExtractor(allow=(r'/wiki/Category:Cocktails(\b|_)'))),
        Rule(SgmlLinkExtractor(allow=(r'/wiki/Category:.+(\b|_)drinks?(\b|_)'))),
        Rule(SgmlLinkExtractor(allow=(r'/wiki/[^:]+$')), callback='parse_recipes'),
    )

    def parse_recipes(self, response):
        hxs = HtmlXPathSelector(response)
        # Resolve the article's canonical URL against the response URL.
        for url in hxs.select("//link[@rel='canonical']/@href").extract():
            url = urljoin(response.url, url)
            # NOTE(review): chunk truncated here — the rest of the loop body
            # is not visible in this view.
# Scrapy spider module for ohgo.sh cocktail recipes.
from urlparse import urljoin, urlparse
from itertools import groupby
from functools import partial
from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
from lxml.cssselect import css_to_xpath
from cocktails.items import CocktailItem
from cocktails.utils import html_to_text

# First link inside each .cocktail block.
xp_recipe_links = css_to_xpath(".cocktail") + "//a[1]/@href"

class OhGoshSpider(BaseSpider):
    """Scrape the cocktail-recipe index at ohgo.sh."""
    name = "ohgosh"
    start_urls = ["http://ohgo.sh/cocktail-recipes/"]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        links = hxs.select(xp_recipe_links).extract()
        links = [urljoin(response.url, url) for url in links]
        links.sort()
        # Fragment links (#...) are grouped by their containing page so each
        # page is fetched once and parsed for all of its recipes.
        for page_url, recipe_urls in groupby(links, lambda url: url.split("#")[0]):
            yield Request(page_url, partial(self.parse_recipes, recipe_urls=list(recipe_urls)))

    # NOTE(review): chunk truncated here — the body of parse_recipes is not
    # visible in this view.
    def parse_recipes(self, response, recipe_urls):
# Scrapy spider module for monkey47.com gin-cocktail posts.
from urlparse import urljoin
from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
from lxml.cssselect import css_to_xpath
from cocktails.items import CocktailItem
from cocktails.utils import html_to_text, split_at_br

# Post title, first entry paragraph (the ingredients), and the
# previous-page navigation link.
xp_title = css_to_xpath('.entry-title')
xp_ingredients = css_to_xpath('.entry-content p') + '[1]'
xp_previous_link = css_to_xpath('.nav-previous a') + '/@href'

class Monkey47Spider(BaseSpider):
    """Walk the gin-cocktail tag archive on monkey47.com."""
    name = 'monkey47'
    start_urls = ['http://www.monkey47.com/wordpress/tag/gin_cocktail_rezepte/']

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        # Follow each post-title link to its recipe ...
        for url in hxs.select(xp_title + '//a/@href').extract():
            yield Request(urljoin(response.url, url), self.parse_recipe)
        # ... and paginate backwards through the archive.
        for url in hxs.select(xp_previous_link).extract():
            yield Request(urljoin(response.url, url), self.parse)

    def parse_recipe(self, response):
        hxs = HtmlXPathSelector(response)
        # NOTE(review): chunk truncated here — the rest of parse_recipe is
        # not visible in this view.
# Scrapy spider module for cocktail recipes on en.wikipedia.org.
from urlparse import urljoin
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
from lxml.cssselect import css_to_xpath
from cocktails.items import CocktailItem
from cocktails.utils import html_to_text

# hRecipe microformat containers and their ingredient list items.
xp_recipes = css_to_xpath('.hrecipe')
xp_ingredients = css_to_xpath('.ingredient li')

class WikipediaSpider(CrawlSpider):
    """Crawl cocktail/drink categories on Wikipedia for recipe articles."""
    name = 'wikipedia'
    allowed_domains = ['en.wikipedia.org']
    start_urls = ['http://en.wikipedia.org/wiki/List_of_cocktails']
    rules = (
        # Category pages are only followed; plain articles get parsed.
        Rule(SgmlLinkExtractor(allow=(r'/wiki/Category:Cocktails(\b|_)'))),
        Rule(SgmlLinkExtractor(
            allow=(r'/wiki/Category:.+(\b|_)drinks?(\b|_)'))),
        Rule(SgmlLinkExtractor(allow=(r'/wiki/[^:]+$')), callback='parse_recipes'),
    )

    def parse_recipes(self, response):
        hxs = HtmlXPathSelector(response)
        # NOTE(review): chunk truncated here — the rest of parse_recipes is
        # not visible in this view.
# Scrapy spider module for www.drinksmixer.com.
import re
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from lxml.cssselect import css_to_xpath
from cocktails.items import CocktailItem
from cocktails.utils import html_to_text

# Recipe title and individual ingredient rows.
xp_title = css_to_xpath('.recipe_title')
xp_ingredients = css_to_xpath('.ingredient')

class DrinksMixerSpider(CrawlSpider):
    """Crawl www.drinksmixer.com, scraping every /drink*.html page."""
    name = 'drinksmixer'
    allowed_domains = ['www.drinksmixer.com']
    start_urls = ['http://www.drinksmixer.com/']
    rules = (
        Rule(SgmlLinkExtractor(allow=r'/drink[^/]+.html$'), callback='parse_recipe'),
        Rule(SgmlLinkExtractor(allow=r'/cat/')),
    )

    def parse_recipe(self, response):
        hxs = HtmlXPathSelector(response)
        # Take the first .recipe_title match as the title.
        for title in hxs.select(xp_title).extract():
            break
        # NOTE(review): chunk truncated here — the rest of parse_recipe is
        # not visible in this view.
# Scrapy spider module for saveur.com beverage recipes.
from urlparse import urljoin
from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
from lxml.cssselect import css_to_xpath
from cocktails.items import CocktailItem
from cocktails.utils import html_to_text, split_at_br, extract_extra_ingredients

# Search-result title links and the "next page" link of the Solr listing.
xp_recipe_links = css_to_xpath('.SolrResultTitle a') + '/@href'
xp_next_link = css_to_xpath('.SolrPageNext a') + '/@href'

class SaveurSpider(BaseSpider):
    """Page through saveur.com's beverage search results."""
    name = 'saveur'
    start_urls = [
        'http://www.saveur.com/solrSearchResults.jsp?fq=Course:Beverages'
    ]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        # Follow each result to its recipe page ...
        for url in hxs.select(xp_recipe_links).extract():
            yield Request(urljoin(response.url, url), self.parse_recipe)
        # ... and continue with the next results page.
        for url in hxs.select(xp_next_link).extract():
            yield Request(urljoin(response.url, url), self.parse)

    # NOTE(review): chunk truncated here — the body of parse_recipe is not
    # visible in this view.
    def parse_recipe(self, response):
# Scrapy spider module for www.drinksmixer.com.
import re
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from lxml.cssselect import css_to_xpath
from cocktails.items import CocktailItem
from cocktails.utils import html_to_text

# Recipe title and individual ingredient rows.
xp_title = css_to_xpath('.recipe_title')
xp_ingredients = css_to_xpath('.ingredient')

class DrinksMixerSpider(CrawlSpider):
    """Crawl www.drinksmixer.com, scraping every /drink*.html page."""
    name = 'drinksmixer'
    allowed_domains = ['www.drinksmixer.com']
    start_urls = ['http://www.drinksmixer.com/']
    rules = (
        Rule(SgmlLinkExtractor(allow=r'/drink[^/]+.html$'), callback='parse_recipe'),
        Rule(SgmlLinkExtractor(allow=r'/cat/')),
    )

    def parse_recipe(self, response):
        hxs = HtmlXPathSelector(response)
        # Take the first .recipe_title match as the title; none means the
        # page carries no recipe.
        for title in hxs.select(xp_title).extract():
            break
        else:
            return []
        # NOTE(review): chunk truncated here — item construction is not
        # visible in this view.
# Scrapy spider module for esquire.com drink recipes.
from urlparse import urljoin
from scrapy.contrib.spiders import SitemapSpider
from scrapy.selector import HtmlXPathSelector
from lxml.cssselect import css_to_xpath
from cocktails.items import CocktailItem
from cocktails.utils import html_to_text, unescape

# Individual ingredient entries on a recipe page.
xp_ingredient = css_to_xpath('.ingredient')

class EsquireSpider(SitemapSpider):
    """Scrape esquire.com drink recipes discovered via its sitemap."""
    name = 'esquire'
    sitemap_urls = ['http://www.esquire.com/robots.txt']
    sitemap_rules = [('/drinks/.*-recipe$', 'parse_recipe')]

    def parse_recipe(self, response):
        hxs = HtmlXPathSelector(response)
        # Title comes from the OpenGraph meta tag; skip pages without one.
        for title in hxs.select("//meta[@property='og:title']/@content").extract():
            break
        else:
            return []
        # Optional hero picture, made absolute against the page URL.
        for picture in hxs.select("//*[@id='drink_infopicvid']/img/@src").extract():
            picture = urljoin(response.url, picture)
            break
        else:
            picture = None
        # NOTE(review): chunk truncated here — the rest of parse_recipe is
        # not visible in this view.
# Scrapy spider module for ohgo.sh cocktail recipes.
from urlparse import urljoin, urlparse
from itertools import groupby
from functools import partial
from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
from lxml.cssselect import css_to_xpath
from cocktails.items import CocktailItem
from cocktails.utils import html_to_text

# First link inside each .cocktail block.
xp_recipe_links = css_to_xpath('.cocktail') + '//a[1]/@href'

class OhGoshSpider(BaseSpider):
    """Scrape the cocktail-recipe index at ohgo.sh."""
    name = 'ohgosh'
    start_urls = ['http://ohgo.sh/cocktail-recipes/']

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        links = hxs.select(xp_recipe_links).extract()
        links = [urljoin(response.url, url) for url in links]
        links.sort()
        # Fragment links (#...) are grouped by their containing page.
        for page_url, recipe_urls in groupby(links, lambda url: url.split('#')[0]):
            # NOTE(review): chunk truncated here, mid-call — the remaining
            # Request arguments are not visible in this view.
            yield Request(
                page_url,
# Scrapy spider module for www.kindredcocktails.com.
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from lxml.cssselect import css_to_xpath
from cocktails.items import CocktailItem
from cocktails.utils import html_to_text

# Ingredient table rows of a cocktail page.
xp_ingredients = css_to_xpath('.cocktail-ingredients tr')

class KindredCocktails(CrawlSpider):
    """Crawl www.kindredcocktails.com, scraping /cocktail/<name> pages."""
    name = 'kindredcocktails'
    allowed_domains = ['www.kindredcocktails.com']
    start_urls = ['http://www.kindredcocktails.com']
    rules = (
        Rule(SgmlLinkExtractor(allow=r'/cocktail/[^/?]+$'), callback='parse_recipe'),
        Rule(SgmlLinkExtractor(allow=r'.*')),
    )

    def parse_recipe(self, response):
        hxs = HtmlXPathSelector(response)
        # First <h1> is the title; pages without one carry no recipe.
        for title in hxs.select('//h1').extract():
            break
        else:
            return []
        ingredients = hxs.select(xp_ingredients).extract()
        # NOTE(review): chunk truncated here — item construction is not
        # visible in this view.
USER_AGENT = '%s/%s' % (BOT_NAME, BOT_VERSION) #DEPTH_LIMIT = 2 ########NEW FILE######## __FILENAME__ = cocktaildb from scrapy.contrib.spiders import CrawlSpider, Rule from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor from scrapy.selector import HtmlXPathSelector from lxml.cssselect import css_to_xpath from cocktails.items import CocktailItem from cocktails.utils import html_to_text xp_ingredients = css_to_xpath('.recipeMeasure') class CocktailDbSpider(CrawlSpider): name = 'cocktaildb' allowed_domains = ['www.cocktaildb.com'] start_urls = ['http://www.cocktaildb.com'] rules = ( Rule(SgmlLinkExtractor(allow=r'/recipe_detail\b'), callback='parse_recipe'), Rule(SgmlLinkExtractor(allow=r'.*')), ) def parse_recipe(self, response): hxs = HtmlXPathSelector(response)
# Scrapy spider module for www.cocktailtimes.com.
from urlparse import urljoin
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from lxml.cssselect import css_to_xpath
from cocktails.items import CocktailItem
from cocktails.utils import html_to_text

# Recipe title text nodes.
xp_header = css_to_xpath('.header') + '/text()'
# Text nodes of the first .story block that come after the 'Ingredients:'
# label and start with '-' (one per ingredient line).
xp_ingredients = css_to_xpath('.story') + (
    "[1]//text()["
    "preceding::text()["
    "normalize-space(self::text()) = 'Ingredients:'"
    "]"
    "]["
    "starts-with(normalize-space(self::text()), '-')"
    "]")
# The drink photo is identified by a ' COCKTAIL PHOTO ' HTML comment
# preceding the <img>.
xp_picture = ("//img["
    "preceding::comment()["
    "contains(self::comment(), ' COCKTAIL PHOTO ')"
    "]"
    "]/@src")

class CocktailTimesSpider(CrawlSpider):
    """Crawl www.cocktailtimes.com for drink recipes."""
    name = 'cocktailtimes'
    allowed_domains = ['www.cocktailtimes.com']
    start_urls = ['http://www.cocktailtimes.com']
    # NOTE(review): chunk truncated here — the crawl rules and callbacks are
    # not visible in this view.
# Scrapy spider module for saveur.com beverage recipes.
from urlparse import urljoin
from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
from lxml.cssselect import css_to_xpath
from cocktails.items import CocktailItem
from cocktails.utils import html_to_text, split_at_br, extract_extra_ingredients

# Search-result title links and the "next page" link of the Solr listing.
xp_recipe_links = css_to_xpath('.SolrResultTitle a') + '/@href'
xp_next_link = css_to_xpath('.SolrPageNext a') + '/@href'

class SaveurSpider(BaseSpider):
    """Page through saveur.com's beverage search results."""
    name = 'saveur'
    start_urls = ['http://www.saveur.com/solrSearchResults.jsp?fq=Course:Beverages']

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        # Follow each result to its recipe page ...
        for url in hxs.select(xp_recipe_links).extract():
            yield Request(urljoin(response.url, url), self.parse_recipe)
        # ... and continue with the next results page.
        for url in hxs.select(xp_next_link).extract():
            yield Request(urljoin(response.url, url), self.parse)

    def parse_recipe(self, response):
        hxs = HtmlXPathSelector(response)
        # NOTE(review): chunk truncated here — the loop body is not visible.
        for title in hxs.select('//h1').extract():
# Scrapy spider module for www.kindredcocktails.com.
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from lxml.cssselect import css_to_xpath
from cocktails.items import CocktailItem
from cocktails.utils import html_to_text

# Ingredient table rows of a cocktail page.
xp_ingredients = css_to_xpath('.cocktail-ingredients tr')

class KindredCocktails(CrawlSpider):
    """Crawl www.kindredcocktails.com, scraping /cocktail/<name> pages."""
    name = 'kindredcocktails'
    allowed_domains = ['www.kindredcocktails.com']
    start_urls = ['http://www.kindredcocktails.com']
    rules = (
        Rule(SgmlLinkExtractor(allow=r'/cocktail/[^/?]+$'), callback='parse_recipe'),
        Rule(SgmlLinkExtractor(allow=r'.*')),
    )

    def parse_recipe(self, response):
        hxs = HtmlXPathSelector(response)
        # First <h1> is the title; pages without one carry no recipe.
        for title in hxs.select('//h1').extract():
            break
        else:
            return []
        # NOTE(review): chunk truncated here — item construction is not
        # visible in this view.
# Scrapy spider module for monkey47.com gin-cocktail posts.
from urlparse import urljoin
from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
from lxml.cssselect import css_to_xpath
from cocktails.items import CocktailItem
from cocktails.utils import html_to_text, split_at_br

# Post title, first entry paragraph (the ingredients), and the
# previous-page navigation link.
xp_title = css_to_xpath('.entry-title')
xp_ingredients = css_to_xpath('.entry-content p') + '[1]'
xp_previous_link = css_to_xpath('.nav-previous a') + '/@href'

class Monkey47Spider(BaseSpider):
    """Walk the gin-cocktail tag archive on monkey47.com."""
    name = 'monkey47'
    start_urls = [
        'http://www.monkey47.com/wordpress/tag/gin_cocktail_rezepte/'
    ]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        # Follow each post-title link to its recipe ...
        for url in hxs.select(xp_title + '//a/@href').extract():
            yield Request(urljoin(response.url, url), self.parse_recipe)
        # ... and paginate backwards through the archive.
        for url in hxs.select(xp_previous_link).extract():
            yield Request(urljoin(response.url, url), self.parse)
        # NOTE(review): chunk truncated here — parse_recipe (referenced
        # above) is not visible in this view.
def selector_to_xpath(selector):
    """Convert a jQuery-style selector into an XPath expression.

    The XPath-flavoured ``[@attr]`` attribute test is normalised to the
    CSS form ``[attr]`` before delegating to ``css_to_xpath``.
    """
    cleaned = selector.replace('[@', '[')
    return css_to_xpath(cleaned)
# Scrapy spider module for seriouseats.com cocktail recipes.
import json
from functools import partial
from collections import OrderedDict
from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
from lxml.cssselect import css_to_xpath
from cocktails.items import CocktailItem
from cocktails.utils import html_to_text

# Topic-search endpoint returning up to 200 cocktail entries as JSON.
URL = 'http://www.seriouseats.com/topics/search?index=recipe&count=200&term=c|cocktails'

# Individual ingredient entries on a recipe page.
xp_ingredients = css_to_xpath('.ingredient')

class SeriouseatsSpider(BaseSpider):
    """Scrape seriouseats cocktail recipes via the JSON topic search."""
    name = 'seriouseats'
    start_urls = [URL]

    def parse(self, response):
        recipes = json.loads(response.body)['entries']
        for recipe in recipes:
            # Pick the largest thumbnail_<size> the entry offers; sizes are
            # parsed out of the key names and walked in ascending order.
            picture = None
            for size in sorted(int(k[10:]) for k in recipe if k.startswith('thumbnail_')):
                picture = recipe['thumbnail_%d' % size]
            # NOTE(review): chunk truncated here — the `if picture:` body
            # (and its exact indentation) is not visible in this view.
            if picture: