def after_login(self, response):
    """Callback run after the login POST: verify the session, then seed the crawl.

    Login is considered successful when the marker string 'lupkkk' appears in
    the response body.  On success, either schedule a single hard-coded debug
    URL (when the debug flag is set) or fan out one request per configured
    listing URL in self.parse_urls.  On failure, log and stop.
    """
    debug = 0
    # Guard clause: bail out immediately if the login marker is absent.
    if 'lupkkk' not in response.body:
        self.log("failed to login")
        return
    self.log("login sucessfully")
    if settings['debug'] or debug:
        # Debug path: exercise the pipeline with one known thread URL only.
        self.log("enter debug mode. make sure that this link has been stored in db")
        url = 'http://aisex.com/bt/htm_data/5/1206/530628.html'
        post = {
            "url": url,
            "title": "test",
            "tag": self.name,
            "grab_at": datetime.datetime.utcnow(),
            "grab_progress": "0",
            "classify": self.classify,
            "create_time": 'test',
        }
        createNewThread(post)
        yield Request(url, callback=self.parse_thread)
        return
    # Normal path: schedule every configured listing page for parsing.
    for seed_url in self.parse_urls:
        yield Request(seed_url, callback=self.parse_threads)
def parse_threads(self, response):
    """Parse a 'lista2' forum listing page and schedule one request per new thread.

    For every <tr class="lista2"> row: extract the thread link and title from
    the second cell and the creation time from the third, record the thread in
    the DB via createNewThread(), and — only when the thread is new — yield a
    Request for the thread page itself.
    """
    self.log("access url:%s" % response.url)
    hxs = HtmlXPathSelector(response)
    threads = hxs.select('//tr[normalize-space(@class)="lista2"]')
    for thread in threads:
        items = thread.select('./td')
        relative_link = items[1].select('./a/@href').extract()
        if relative_link:
            relative_link = relative_link[0]
        else:
            # BUG FIX: the original logged the error but then fell through and
            # called .strip() on the empty list, raising AttributeError and
            # aborting the whole page.  Skip the malformed row instead
            # (matching the sibling parse_threads implementations).
            self.log("ERR:%s" % thread.extract())
            continue
        absolute_link = urlparse.urljoin(response.url, relative_link.strip())
        title = items[1].select('./a/text()').extract()[0]
        create_time = items[2].select('./text()').extract()[0]
        self.log("+++link:%s time:%s" % (absolute_link, create_time))
        post = {
            "url": absolute_link,
            "title": title,
            "tag": self.name,
            "grab_at": datetime.datetime.utcnow(),
            "grab_progress": "0",
            "classify": self.classify,
            "create_time": create_time,
        }
        # createNewThread() returns truthy only for threads not yet stored,
        # so already-grabbed threads are not re-fetched.
        if createNewThread(post):
            yield Request(absolute_link, callback=self.parse_thread)
    return
def parse_threads(self, response):
    """Parse a Discuz-style listing page ('normalthread_*' tbody rows).

    Each row's <th> cell carries the thread link and title; the creation time
    lives under the 'by' column's <em><span> element.  Every thread is stored
    via createNewThread(), and a Request is yielded only for threads that were
    not already recorded.
    """
    self.log("access url:%s" % response.url)
    hxs = HtmlXPathSelector(response)
    # One <tr> per thread, inside tbody elements whose id starts with "normalthread_".
    rows = hxs.select('//tbody[starts-with(normalize-space(@id),"normalthread_")]/tr')
    for row in rows:
        header_cell = row.select("th")
        href = header_cell.select('a[1]/@href').extract()[0]
        title = header_cell.select('a[1]/text()').extract()[0]
        absolute_link = urlparse.urljoin(response.url, href.strip())
        by_cells = row.select("td[normalize-space(@class)='by']")
        create_time = by_cells[0].select('em/span/text()').extract()[0]
        self.log("+++link:%s time:%s" % (absolute_link, create_time))
        post = {
            "url": absolute_link,
            "title": title,
            "tag": self.name,
            "classify": self.classify,
            "grab_at": datetime.datetime.utcnow(),
            "grab_progress": "0",
            "create_time": create_time,
        }
        # Only follow threads that createNewThread() reports as newly stored.
        if createNewThread(post):
            yield Request(absolute_link, callback=self.parse_thread)
    return
def parse_threads(self, response):
    """Parse a listing page whose sections are delimited by <tr class="cbg"> rows.

    The page holds several blocks of thread rows, each introduced by a "cbg"
    header row; only the rows of the LAST block are harvested.  block_num is
    initialised to the number of "cbg" headers and decremented each time one
    is passed, so rows are processed only once it reaches zero.
    """
    self.log("access url:%s" % response.url)
    hxs = HtmlXPathSelector(response)
    # Count the section-header rows so we can tell when the last block starts.
    block = hxs.select('//tr[normalize-space(@class)="head"]/../tr[normalize-space(@class)="cbg"]')
    block_num = len(block)
    threads = hxs.select('//tr[normalize-space(@class)="head"]/../tr')
    for thread in threads:
        cls = thread.select('./@class').extract()
        # BUG FIX: the original indexed cls[0] unconditionally, raising
        # IndexError for any <tr> without a class attribute and aborting the
        # whole page.  Guard on the list first (same pattern this method
        # already uses for relative_link below).
        if cls and cls[0] == "cbg":
            block_num -= 1
            continue
        if block_num == 0:
            relative_link = thread.select('.//a[1]/@href').extract()
            if relative_link:
                relative_link = relative_link[0]
            else:
                self.log("ERR:%s" % thread.extract())
                continue
            absolute_link = urlparse.urljoin(response.url, relative_link.strip())
            title = thread.select('.//a[1]/text()').extract()[0]
            time_cell = thread.select(".//td[normalize-space(@class)='smalltxt']")[0]
            # The creation time is the second text node of the 'smalltxt' cell.
            create_time = time_cell.select('./text()').extract()[1]
            self.log("+++link:%s time:%s" % (absolute_link, create_time))
            post = {
                "url": absolute_link,
                "title": title,
                "tag": self.name,
                "grab_at": datetime.datetime.utcnow(),
                "grab_progress": "0",
                "classify": self.classify,
                "create_time": create_time,
            }
            # Only follow threads createNewThread() reports as newly stored.
            if createNewThread(post):
                yield Request(absolute_link, callback=self.parse_thread)
    return