Ejemplo n.º 1
0
    def parse_thread(self, response):
        """Parse a thread page, store its details and pass the item to parse_bt.

        The first element whose id starts with "post_" is taken as the
        opening post, and the first cell inside it whose id starts with
        "postmessage_" as the post body.  Image sources and link targets are
        collected from that body.

        If the body has no images or no links, the thread's record is removed
        from ``self.threads_db`` and nothing is returned.  Otherwise the
        result of ``parseDetailFromContent`` and the raw body HTML are saved
        on the thread's record, and the item is handed to ``parse_bt``.

        :param response: response object for the thread page; its ``url`` is
            used as the item name and as the lookup key in ``threads_db``.
        :returns: the result of ``parse_bt(item)``, or ``None`` when the page
            is skipped.
        """
        self.log("thread url:%s" % response.url)
        hxs = HtmlXPathSelector(response)

        # Guard both lookups: indexing an empty selection with [0] would
        # raise IndexError on any page that does not have the expected layout.
        posts = hxs.select('//div[starts-with(normalize-space(@id),"post_")]')
        if not posts:
            self.log("no post found in thread:%s" % response.url)
            return None
        bodies = posts[0].select(
            './/td[starts-with(normalize-space(@id),"postmessage_")]')
        if not bodies:
            self.log("no post body found in thread:%s" % response.url)
            return None
        content = bodies[0]

        imgs = content.select('.//img/@src').extract()
        links = content.select('.//a/@href').extract()
        if not imgs or not links:
            # A thread without images or without links is dropped entirely.
            self.threads_db.remove({"url": response.url})
            return None

        item = AutobtItem()
        item['name'] = response.url
        item['image_urls'] = imgs
        item['links'] = links

        raw_html = content.extract()
        details = parseDetailFromContent(raw_html)

        thread = self.threads_db.find_one({"url": response.url})
        if thread is None:
            # find_one returns None when no document matches; there is
            # nothing to update, but the item can still be processed.
            self.log("no stored record for thread:%s" % response.url)
        else:
            thread["content"] = details
            thread['raw_content'] = raw_html
            self.threads_db.save(thread)

        # links is guaranteed non-empty here, so the item always goes on to
        # parse_bt.
        return parse_bt(item)
Ejemplo n.º 2
0
    def parse_thread(self, response):
        """Parse a detail table page, store its details and call parse_bt.

        The rows of the first table whose class is "lista" are scanned.  The
        first text node found in a row's cells is used as the row label:

        * label containing "Torrent": the first link in the second cell is
          collected as a download link;
        * label containing "Poster": the first image directly inside the
          second cell is collected;
        * label containing "Description": the first image anywhere inside the
          second cell is collected and the cell's text is appended to the
          description;
        * label containing "Size": the first text node of the second cell is
          taken as the size.

        If no image or no link was collected, the thread's record is removed
        from ``self.threads_db`` and nothing is returned.  Otherwise the
        size, description and stored title are saved on the thread's record
        and the item is handed to ``parse_bt``.

        :param response: response object for the page; its ``url`` is used as
            the item name and as the lookup key in ``threads_db``.
        :returns: the result of ``parse_bt(item)``, or ``None`` when the page
            is skipped.
        """
        self.log("thread url:%s" % response.url)
        hxs = HtmlXPathSelector(response)

        # Indexing an empty selection with [0] would raise IndexError on a
        # page without the expected table.
        tables = hxs.select('//table[normalize-space(@class)="lista"]')
        if not tables:
            self.log("no detail table found in thread:%s" % response.url)
            return None
        rows = tables[0].select('./tr')

        links = []
        imgs = []
        desc = ""
        size = ''
        for row in rows:
            # Evaluate the label once per row instead of once per branch.
            labels = row.select("./td/text()").extract()
            if not labels:
                # A row whose cells hold no text has no label to match.
                continue
            label = labels[0]

            # extract()[:1] keeps "first match only" while tolerating rows
            # where the expected element is missing.
            if 'Torrent' in label:
                links.extend(row.select("./td[2]/a/@href").extract()[:1])
            elif 'Poster' in label:
                imgs.extend(row.select("./td[2]/img/@src").extract()[:1])
            elif 'Description' in label:
                imgs.extend(row.select("./td[2]//img/@src").extract()[:1])
                fragments = row.select("./td[2]/text()").extract()
                # NOTE(review): the last text fragment is left out, exactly
                # as the previous index loop did - confirm this is intended.
                desc += "".join(fragments[:-1])
            elif 'Size' in label:
                sizes = row.select("./td[2]/text()").extract()
                if sizes:
                    size = sizes[0]

        if not imgs or not links:
            # A page without images or without links is dropped entirely.
            self.threads_db.remove({"url": response.url})
            return None

        item = AutobtItem()
        item['name'] = response.url
        item['image_urls'] = imgs
        item['links'] = links

        # Only the first row's markup is stored as raw content.
        # NOTE(review): confirm whether the whole table was meant instead.
        raw_html = rows.extract()[0]

        thread = self.threads_db.find_one({"url": response.url})
        if thread is None:
            # find_one returns None when no document matches; there is
            # nothing to update, but the item can still be processed.
            self.log("no stored record for thread:%s" % response.url)
        else:
            details = {'size': size, 'desc': desc, 'name': thread['title']}
            thread["content"] = {'level': len(details), "items": details}
            thread['raw_content'] = raw_html
            self.threads_db.save(thread)

        # links is guaranteed non-empty here, so the item always goes on to
        # parse_bt.
        return parse_bt(item)
Ejemplo n.º 3
0
    def parse_thread(self, response):
        """Parse a thread page, extract labelled fields and call parse_bt.

        The first element whose id starts with "post_" is taken as the
        opening post, and the first cell inside it whose id starts with
        "postmessage_" as the post body.  Image sources and link targets are
        collected from that body.

        If the body has no images or no links, the thread's record is removed
        from ``self.threads_db`` and nothing is returned.  Otherwise the raw
        body HTML is searched for the known labels (film name, film format,
        film size); for each label found, the text between the following
        colon and the next ``<br>`` (or tab) is stored under "name",
        "format" or "size".  The result and the raw HTML are saved on the
        thread's record and the item is handed to ``parse_bt``.

        :param response: response object for the thread page; its ``url`` is
            used as the item name and as the lookup key in ``threads_db``.
        :returns: the result of ``parse_bt(item)``, or ``None`` when the page
            is skipped.
        """
        self.log("thread url:%s" % response.url)
        hxs = HtmlXPathSelector(response)

        # Guard both lookups: indexing an empty selection with [0] would
        # raise IndexError on any page that does not have the expected layout.
        posts = hxs.select('//div[starts-with(normalize-space(@id),"post_")]')
        if not posts:
            self.log("no post found in thread:%s" % response.url)
            return None
        bodies = posts[0].select(
            './/td[starts-with(normalize-space(@id),"postmessage_")]')
        if not bodies:
            self.log("no post body found in thread:%s" % response.url)
            return None
        content = bodies[0]

        imgs = content.select('.//img/@src').extract()
        links = content.select('.//a/@href').extract()
        if not imgs or not links:
            # A thread without images or without links is dropped entirely.
            self.threads_db.remove({"url": response.url})
            return None

        item = AutobtItem()
        item['name'] = response.url
        item['image_urls'] = imgs
        item['links'] = links

        raw_html = content.extract()
        details = {"name": "", "size": "", "format": ""}
        fullwidth_colon = u'\uff1a'
        # Label text as it appears in the post -> key in ``details``.
        labels = {
            u'\u5f71\u7247\u540d\u7a31': "name",    # film name (traditional)
            u'\u5f71\u7247\u540d\u79f0': "name",    # film name (simplified)
            u'\u5f71\u7247\u683c\u5f0f': "format",  # film format
            u'\u5f71\u7247\u5927\u5c0f': "size",    # film size
        }

        for label, field in labels.items():
            start = raw_html.find(label)
            if start <= 0:
                continue
            fragment = raw_html[start:]

            # The value ends at the next <br>, or at a tab when there is none.
            end = fragment.find("<br>")
            if end < 0:
                end = fragment.find("\t")
            if end <= 0:
                continue
            fragment = fragment[:end]

            # Prefer the full-width colon, fall back to the ASCII one.
            sep = fragment.find(fullwidth_colon)
            if sep <= 0:
                sep = fragment.find(u":")
            if sep > 0:
                details[field] = fragment[sep + 1:]

        thread = self.threads_db.find_one({"url": response.url})
        if thread is None:
            # find_one returns None when no document matches; there is
            # nothing to update, but the item can still be processed.
            self.log("no stored record for thread:%s" % response.url)
        else:
            thread["content"] = details
            thread['raw_content'] = raw_html
            self.threads_db.save(thread)

        # links is guaranteed non-empty here, so the item always goes on to
        # parse_bt.
        return parse_bt(item)