import shelve
import subprocess

from scrapy.selector import HtmlXPathSelector


def catg():
    # Category ids stored in a shelve db earlier in the script.
    sh = shelve.open("cat.db", writeback=False)['dict']
    urls = ['http://www.amazon.com/s/ref=sr_pg_2?rh=n%3A11091801%2Ck%3A%22&page=1&keywords=%22&ie=UTF8&qid=1375737123']
    # 'fi' is assumed to be defined elsewhere in the script.
    fil = open(fi[len(fi) - 1].split('=')[1], "w")
    # Build one search url per category id by swapping the placeholder in the base url.
    for i in sh:
        print i
        urls.append(urls[0].replace('mi', str(i)))
    #print urls
    for ii in urls:
        #a = requests.get(ii, headers={'User-Agent': 'Mozilla/5.0'}).text
        # The requests call above works on both windows and linux, but can
        # sometimes be unreliable and slow. Shelling out to scrapy fetch is
        # weird, I know, but it seems to be quite reliable.
        a = subprocess.check_output("scrapy fetch '%s'" % ii, shell=True)
        hxs = HtmlXPathSelector(text=a)
        # Read the last page number from the disabled pagination span and
        # fetch the remaining result pages (they are not parsed further here).
        for p in hxs.select('//span').extract():
            if 'class="pagnDisabled"' in p:
                n = int(p.split('<')[2].split('>')[1]) + 1
                if n > 1:
                    for ra in range(2, n):
                        pag = ii.replace('page=1', 'page=%s' % str(ra))
                        aa = subprocess.check_output("scrapy fetch '%s'" % pag, shell=True)
        # Keep only the anchors with the product-link class.
        li = []
        for d in hxs.select('//a').extract():
            if 'ilo2 ilc2' in d:
                li.append(d)
        lin = []
        for dd in li:
            tt = HtmlXPathSelector(text=dd)
            lin.append(tt.select('//a/@href').extract()[0])
        # Visit each product page and follow its "customer reviews" link.
        for it in lin:
            aaa = subprocess.check_output("scrapy fetch '%s'" % it, shell=True)
            ttt = HtmlXPathSelector(text=aaa)
            rev = ttt.select('//a').extract()
            revi = ""
            for ite in rev:
                if "customer reviews" in ite:
                    revv = HtmlXPathSelector(text=ite)
                    revi = revv.select('//a/@href').extract()[0]
                    break
            revie = HtmlXPathSelector(text=subprocess.check_output("scrapy fetch '%s'" % revi, shell=True))
            # Hand every reviewer profile link on the reviews page to scrap_prof(),
            # which is defined elsewhere in the script.
            for renn in revie.select('//a/@href').extract():
                if "profile" in renn:
                    scrap_prof("http://www.amazon.com/" + renn)
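
scrap_prof() is only called above, not defined, so here is a minimal sketch of what such a helper could look like. The body is purely an assumption (only the function name and the profile URL argument come from the code above); it just reuses the same scrapy-fetch-via-subprocess trick and the same old-style selector API:

def scrap_prof(url):
    # Hypothetical sketch: fetch the reviewer profile page the same way the
    # rest of the script does and pull out whatever fields you need.
    html = subprocess.check_output("scrapy fetch '%s'" % url, shell=True)
    hxs = HtmlXPathSelector(text=html)
    # The <title> is just an example field; swap in the xpaths you actually need.
    print url, hxs.select('//title/text()').extract()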