def scrape():
    """Print one jdump-serialised record per chapter link of each country.

    Country links are taken from the ``country-reports-list`` list on the
    page at ``base``; for every country page, the chapter links are taken
    from its ``book-navigation`` menu.  Each record carries the keys
    ``country``, ``url`` and ``chapter`` and is printed on a single line
    (newlines are removed after UTF-8 encoding).

    Note: chapter hrefs are resolved against ``base``, not against the
    country page URL, and the chapter title is whatever follows the first
    space of the link text (a link text without a space raises IndexError).
    """
    country_links = getFrag(base, '//ul[@class="country-reports-list"]/li/a')
    for country_link in country_links:
        country_url = urljoin(base, country_link.get('href'))
        country_name = country_link.xpath('text()')[0]
        chapter_links = getFrag(
            country_url,
            '//div[@class="book-navigation"]/ul[@class="menu"]//a')
        for chapter_link in chapter_links:
            record = {
                'country': country_name,
                'url': urljoin(base, chapter_link.get('href')),
                # Drop the leading token of the link text (everything up to
                # and including the first space).
                'chapter': chapter_link.xpath('text()')[0].split(' ', 1)[1],
            }
            serialised = jdump(record).encode('utf8')
            print(serialised.replace('\n', ''))
# Example no. 2
# 0
# (C) 2012 Stefan Marsiske <*****@*****.**>

import fileinput
import re
import sys

from scraptils.utils import jdump

# Organisation Name | Town/City | County | Tier & Rating | Sub Tier

# An organisation row starts in the first column and holds either three or
# two fields, separated by runs of at least three whitespace characters.
orgre = re.compile(r"^(\S.*?)\s{3,}(\S.*)\s{3,}(\S.*)")
org2re = re.compile(r"^(\S.*?)\s{3,}(\S.*)")
# A score row is indented and holds two fields separated the same way.
score = re.compile(r"^\s{1,}(\S.*)\s{3,}(\S.*)$")

# Stripped fields of the most recently seen organisation row; every score
# row that follows is reported against this organisation.
cache = []
for line in fileinput.input(openhook=fileinput.hook_compressed):
    # Try the three-field layout first, then fall back to two fields
    # (organisation rows without a county).
    m = orgre.match(line) or org2re.match(line)
    if m:
        cache = [x.strip() if x else "" for x in m.groups()]
        continue

    m = score.match(line)
    if not m:
        # Neither an organisation row nor a score row: ignore it.
        continue

    if not cache:
        # A score-shaped line before any organisation row (e.g. an indented
        # heading) used to crash with IndexError on cache[0]; report it on
        # stderr and move on instead.
        sys.stderr.write('[!] score line without organisation: %s\n'
                         % line.rstrip())
        continue

    print(jdump({'Organisation Name': cache[0],
                 'Town/City': cache[1],
                 # Two-field organisation rows carry no county.
                 'County': cache[2] if len(cache) > 2 else '',
                 'Tier & Rating': m.group(1).strip(),
                 'Sub Tier': m.group(2).strip()}).replace('\n', ''))