Beispiel #1
0
    def after_login(self, response):
        """Scrapy callback invoked with the response to the login request.

        The login is treated as successful when the marker string ``lupkkk``
        occurs in the response body. On success the generator either yields a
        single request for a hard-coded thread (debug mode) or one request per
        entry of ``self.parse_urls``. On failure it logs a message and yields
        nothing.
        """
        # Local override: set to a truthy value to force debug mode even when
        # settings['debug'] is falsy.
        force_debug = 0

        # Guard clause: without the marker there is nothing to crawl.
        if 'lupkkk' not in response.body:
            self.log("failed to login")
            return

        self.log("login sucessfully")

        if settings['debug'] or force_debug:
            self.log("enter debug mode. make sure that this link has been stored in db")
            debug_url = 'http://aisex.com/bt/htm_data/5/1206/530628.html'
            record = {
                "url": debug_url,
                "title": "test",
                "tag": self.name,
                "grab_at": datetime.datetime.utcnow(),
                "grab_progress": "0",
                "classify": self.classify,
                "create_time": 'test',
            }
            # The return value is deliberately ignored here: the debug thread
            # is requested regardless of what createNewThread reports.
            createNewThread(record)
            yield Request(debug_url, callback=self.parse_thread)
            return

        # Normal mode: schedule every configured listing page.
        for listing_url in self.parse_urls:
            yield Request(listing_url, callback=self.parse_threads)
Beispiel #2
0
    def parse_threads(self, response):
        """Parse a listing page and yield a request for each new thread.

        Every ``<tr>`` whose class is ``lista2`` is treated as one thread row.
        The link and title are read from the second cell and the creation
        time from the third. A request for the thread page is yielded only
        when ``createNewThread`` returns a truthy value for the row's record.

        Rows without a link are logged with an ``ERR:`` prefix and skipped.
        """
        self.log("access url:%s" % response.url)
        hxs = HtmlXPathSelector(response)
        threads = hxs.select('//tr[normalize-space(@class)="lista2"]')

        for thread in threads:
            # NOTE(review): indexing below assumes each row has at least
            # three <td> cells - confirm against the target site's markup.
            items = thread.select('./td')
            relative_link = items[1].select('./a/@href').extract()
            if not relative_link:
                # Previously this branch only logged and then fell through,
                # calling .strip() on an empty list (AttributeError) and
                # aborting the whole page. Skip the malformed row instead.
                self.log("ERR:%s" % thread.extract())
                continue
            relative_link = relative_link[0]

            absolute_link = urlparse.urljoin(response.url, relative_link.strip())
            title = items[1].select('./a/text()').extract()[0]
            create_time = items[2].select('./text()').extract()[0]

            self.log("+++link:%s  time:%s" % (absolute_link, create_time))
            post = {
                "url": absolute_link,
                "title": title,
                "tag": self.name,
                "grab_at": datetime.datetime.utcnow(),
                "grab_progress": "0",
                "classify": self.classify,
                "create_time": create_time,
            }
            # Only follow threads that createNewThread accepts (truthy return).
            if createNewThread(post):
                yield Request(absolute_link, callback=self.parse_thread)
Beispiel #3
0
    def parse_threads(self, response):
        """Parse a listing page and yield a request for each new thread.

        Thread rows are the ``<tr>`` children of every ``<tbody>`` whose id
        starts with ``normalthread_``. The link and title come from the first
        ``<a>`` inside the row's ``<th>``; the creation time comes from the
        first ``<td class="by">``. A request for the thread page is yielded
        only when ``createNewThread`` returns a truthy value for the record.

        Rows without a link are logged with an ``ERR:`` prefix and skipped
        instead of aborting the whole page with an IndexError.
        """
        self.log("access url:%s" % response.url)
        hxs = HtmlXPathSelector(response)
        # Named `rows` rather than `all` so the builtin is not shadowed.
        rows = hxs.select('//tbody[starts-with(normalize-space(@id),"normalthread_")]/tr')
        for thread in rows:
            content = thread.select("th")
            hrefs = content.select('a[1]/@href').extract()
            if not hrefs:
                # Same handling as the sibling listing parsers: log and skip.
                self.log("ERR:%s" % thread.extract())
                continue
            relative_link = hrefs[0]
            absolute_link = urlparse.urljoin(response.url, relative_link.strip())
            title = content.select('a[1]/text()').extract()[0]
            # Named `by_cells` rather than `time` to avoid hiding the module name.
            # NOTE(review): assumes a td.by cell with em/span text exists on
            # every thread row - confirm against the target site's markup.
            by_cells = thread.select("td[normalize-space(@class)='by']")
            create_time = by_cells[0].select('em/span/text()').extract()[0]
            self.log("+++link:%s  time:%s" % (absolute_link, create_time))
            post = {
                "url": absolute_link,
                "title": title,
                "tag": self.name,
                "classify": self.classify,
                "grab_at": datetime.datetime.utcnow(),
                "grab_progress": "0",
                "create_time": create_time,
            }
            # Only follow threads that createNewThread accepts (truthy return).
            if createNewThread(post):
                yield Request(absolute_link, callback=self.parse_thread)
Beispiel #4
0
    def parse_threads(self, response):
        """Parse a listing page and yield a request for each new thread.

        All ``<tr>`` siblings of the ``head`` row are walked in document
        order. Rows with class ``cbg`` act as separators: they are counted up
        front, and ordinary rows are processed only after the last separator
        has been passed (counter at zero). For those rows the first ``<a>``
        gives link and title, and the second text node of the first
        ``td.smalltxt`` cell gives the creation time. A request is yielded
        only when ``createNewThread`` returns a truthy value for the record.

        Rows without a link are logged with an ``ERR:`` prefix and skipped.
        """
        self.log("access url:%s" % response.url)
        hxs = HtmlXPathSelector(response)
        block = hxs.select('//tr[normalize-space(@class)="head"]/../tr[normalize-space(@class)="cbg"]')
        block_num = len(block)
        threads = hxs.select('//tr[normalize-space(@class)="head"]/../tr')

        for thread in threads:
            cls = thread.select('./@class').extract()
            # A row may carry no class attribute at all; treat it as an empty
            # class instead of raising IndexError. Stripping keeps this test
            # consistent with the normalize-space() used for the count above,
            # so padded values such as " cbg " still decrement the counter.
            row_class = cls[0].strip() if cls else ""
            if row_class == "cbg":
                block_num -= 1
                continue
            if block_num != 0:
                # Still above the last separator row: not a thread row yet.
                continue

            relative_link = thread.select('.//a[1]/@href').extract()
            if not relative_link:
                self.log("ERR:%s" % thread.extract())
                continue
            relative_link = relative_link[0]

            absolute_link = urlparse.urljoin(response.url, relative_link.strip())
            title = thread.select('.//a[1]/text()').extract()[0]
            # Named `time_cell` rather than `time` to avoid hiding the module name.
            # NOTE(review): assumes the cell has at least two text nodes, the
            # second being the creation time - confirm against site markup.
            time_cell = thread.select(".//td[normalize-space(@class)='smalltxt']")[0]
            create_time = time_cell.select('./text()').extract()[1]

            self.log("+++link:%s  time:%s" % (absolute_link, create_time))
            post = {
                "url": absolute_link,
                "title": title,
                "tag": self.name,
                "grab_at": datetime.datetime.utcnow(),
                "grab_progress": "0",
                "classify": self.classify,
                "create_time": create_time,
            }
            # Only follow threads that createNewThread accepts (truthy return).
            if createNewThread(post):
                yield Request(absolute_link, callback=self.parse_thread)