def scrape():
    """Print one single-line JSON record per chapter link of every country report.

    Walks the country links found on ``base`` and, for each of them, the
    chapter links found in the report's book-navigation menu.  Every record
    has the keys ``country``, ``url`` and ``chapter``.

    NOTE(review): ``getFrag(url, xpath)`` is defined elsewhere; it presumably
    fetches the page and returns the elements matching the XPath -- confirm.

    Raises IndexError when a link has no text node, or when a chapter title
    contains no space.
    """
    country_links_xpath = '//ul[@class="country-reports-list"]/li/a'
    chapter_links_xpath = '//div[@class="book-navigation"]/ul[@class="menu"]//a'

    for country_link in getFrag(base, country_links_xpath):
        report_url = urljoin(base, country_link.get('href'))
        country = country_link.xpath('text()')[0]

        for chapter_link in getFrag(report_url, chapter_links_xpath):
            # Chapter hrefs are resolved against the site base, not the report URL.
            chapter_url = urljoin(base, chapter_link.get('href'))
            heading = chapter_link.xpath('text()')[0]
            record = {
                'country': country,
                'url': chapter_url,
                # Keep only what follows the first space of the link text.
                'chapter': heading.split(' ', 1)[1],
            }
            # Encode to UTF-8 and strip newlines so each record is one line.
            output = jdump(record).encode('utf8').replace('\n', '')
            # Parenthesised single argument: same output as the print statement.
            print(output)
# (C) 2012 Stefan Marsiske <*****@*****.**>
#
# Converts a whitespace-aligned text listing into one JSON record per line.
# Input comes from the files named on the command line (or stdin) via
# fileinput; hook_compressed lets compressed files be read transparently.

import fileinput
import re
from scraptils.utils import jdump

# Column layout of the listing:
# Organisation Name | Town/City | County | Tier & Rating | Sub Tier
#
# Organisation lines start in the first column and carry two or three fields
# separated by runs of at least three whitespace characters.
orgre = re.compile(r"^(\S.*?)\s{3,}(\S.*)\s{3,}(\S.*)")
org2re = re.compile(r"^(\S.*?)\s{3,}(\S.*)")
# Score lines are indented and carry exactly two fields.
score = re.compile(r"^\s{1,}(\S.*)\s{3,}(\S.*)$")

# Fields of the most recently seen organisation line; reused for every score
# line that follows it.
cache = []

for line in fileinput.input(openhook=fileinput.hook_compressed):
    # Try the three-field layout first, then fall back to the two-field one.
    org_match = orgre.match(line) or org2re.match(line)
    if org_match:
        cache = [(field or "").strip() for field in org_match.groups()]
        continue

    score_match = score.match(line)
    if not score_match:
        # Lines matching none of the patterns are silently ignored.
        continue

    tier_rating, sub_tier = score_match.group(1), score_match.group(2)
    record = {
        'Organisation Name': cache[0],
        'Town/City': cache[1],
        # County is missing when the organisation line had only two fields.
        'County': cache[2] if len(cache) > 2 else '',
        'Tier & Rating': tier_rating.strip(),
        'Sub Tier': sub_tier.strip(),
    }
    # Parenthesised single argument: same output as the print statement.
    print(jdump(record).replace('\n', ''))