def parse_thread(self, response):
    """Parse a forum thread page: collect image and link URLs from the
    first post, persist the parsed detail back into ``threads_db``, and
    hand the item off to ``parse_bt`` for torrent processing.

    Threads with no images or no links are considered uninteresting and
    their records are removed from ``threads_db``.

    :param response: Scrapy Response for the thread page.
    :returns: result of ``parse_bt(item)`` (or the bare item on the
        defensive no-links path); ``None`` when the thread is dropped.
    """
    self.log("thread url:%s" % response.url)
    hxs = HtmlXPathSelector(response)
    # First post ("first floor") of the thread: <div id="post_...">.
    first_floor = hxs.select('//div[starts-with(normalize-space(@id),"post_")]')[0]
    # Message body cell of that post: <td id="postmessage_...">.
    content = first_floor.select('.//td[starts-with(normalize-space(@id),"postmessage_")]')[0]

    imgs = content.select('.//img/@src').extract()
    if not imgs:
        # No preview images -> drop the stored thread record.
        self.threads_db.remove({"url": response.url})
        return

    links = content.select('.//a/@href').extract()
    if not links:
        # No download links -> drop the stored thread record.
        self.threads_db.remove({"url": response.url})
        return

    item = AutobtItem()
    item['name'] = response.url
    item['image_urls'] = imgs
    item['links'] = links

    # Renamed from 'all' to avoid shadowing the builtin.
    raw_html = content.extract()
    detail = parseDetailFromContent(raw_html)
    record = self.threads_db.find_one({"url": response.url})
    record["content"] = detail
    record['raw_content'] = raw_html
    self.threads_db.save(record)

    # Defensive: links is known non-empty here; guard kept from original.
    if not item['links']:
        return item
    return parse_bt(item)
def parse_thread(self, response):
    """Parse a torrent detail page laid out as a ``<table class="lista">``
    of labelled rows (Torrent / Poster / Description / Size), persist the
    extracted fields into ``threads_db``, and hand the item to ``parse_bt``.

    Pages with no images or no torrent links are removed from
    ``threads_db``.

    :param response: Scrapy Response for the detail page.
    :returns: result of ``parse_bt(item)`` (or the bare item on the
        defensive no-links path); ``None`` when the page is dropped.
    """
    self.log("thread url:%s" % response.url)
    hxs = HtmlXPathSelector(response)
    table = hxs.select('//table[normalize-space(@class)="lista"]')[0]
    rows = table.select('./tr')

    links = []
    imgs = []
    desc = ""
    size = ''
    for row in rows:
        # First cell carries the row label; second cell carries the value.
        # Hoisted: the original re-ran this same select in every branch.
        label = row.select("./td/text()").extract()[0]
        if label.find('Torrent') > -1:
            links.append(row.select("./td[2]/a/@href").extract()[0])
        elif label.find('Poster') > -1:
            imgs.append(row.select("./td[2]/img/@src").extract()[0])
        elif label.find('Description') > -1:
            imgs.append(row.select("./td[2]//img/@src").extract()[0])
            # Join all but the last text node. This preserves the original
            # range(0, len(cc)-1) loop; presumably the final node is
            # unwanted trailing markup -- TODO confirm against live pages.
            texts = row.select("./td[2]/text()").extract()
            desc += "".join(texts[:-1])
        elif label.find('Size') > -1:
            size = row.select("./td[2]/text()").extract()[0]

    if not imgs:
        # No poster/description images -> drop the stored record.
        self.threads_db.remove({"url": response.url})
        return
    if not links:
        # No torrent links -> drop the stored record.
        self.threads_db.remove({"url": response.url})
        return

    item = AutobtItem()
    item['name'] = response.url
    item['image_urls'] = imgs
    item['links'] = links

    # HTML of the first table row; renamed from 'all' (builtin shadowing).
    raw_html = rows.extract()[0]
    fields = {}
    fields['size'] = size
    fields['desc'] = desc
    record = self.threads_db.find_one({"url": response.url})
    fields['name'] = record['title']
    detail = {'level': len(fields), "items": fields}
    record["content"] = detail
    record['raw_content'] = raw_html
    self.threads_db.save(record)

    # Defensive: links is known non-empty here; guard kept from original.
    if not item['links']:
        return item
    return parse_bt(item)
def parse_thread(self, response):
    """Parse a forum thread page whose first post embeds labelled movie
    metadata (name / format / size) in its HTML text; extract those
    fields, persist them into ``threads_db``, and hand the item to
    ``parse_bt``.

    Threads with no images or no links are removed from ``threads_db``.

    :param response: Scrapy Response for the thread page.
    :returns: result of ``parse_bt(item)`` (or the bare item on the
        defensive no-links path); ``None`` when the thread is dropped.
    """
    self.log("thread url:%s" % response.url)
    hxs = HtmlXPathSelector(response)
    # First post of the thread and its message body cell.
    first_floor = hxs.select('//div[starts-with(normalize-space(@id),"post_")]')[0]
    content = first_floor.select('.//td[starts-with(normalize-space(@id),"postmessage_")]')[0]

    imgs = content.select('.//img/@src').extract()
    if not imgs:
        # No preview images -> drop the stored thread record.
        self.threads_db.remove({"url": response.url})
        return

    links = content.select('.//a/@href').extract()
    if not links:
        # No download links -> drop the stored thread record.
        self.threads_db.remove({"url": response.url})
        return

    item = AutobtItem()
    item['name'] = response.url
    item['image_urls'] = imgs
    item['links'] = links

    # Raw HTML of the post body; renamed from 'all' (builtin shadowing).
    raw_html = content.extract()

    fields = {"name": "", "size": "", "format": ""}
    fullwidth_colon = u'\uff1a'  # full-width colon used on CJK boards
    # Label text (Traditional / Simplified Chinese) -> field name.
    tags = {
        u'\u5f71\u7247\u540d\u7a31': "name",
        u'\u5f71\u7247\u540d\u79f0': "name",
        u'\u5f71\u7247\u683c\u5f0f': "format",
        u'\u5f71\u7247\u5927\u5c0f': "size",
    }
    for key in tags:
        start = raw_html.find(key)
        # NOTE(review): '> 0' also skips a match at offset 0; kept from
        # the original (a label can never start the full post HTML).
        if start > 0:
            segment = raw_html[start:]
            # Cut the segment at the end of the line (<br> or tab).
            end = segment.find("<br>")
            if end < 0:
                end = segment.find("\t")
            if end > 0:
                segment = segment[0:end]
            # Value follows either a full-width or an ASCII colon.
            sep = segment.find(fullwidth_colon)
            if sep > 0:
                fields[tags[key]] = segment[(sep + 1):]
            else:
                sep = segment.find(u":")
                if sep > 0:
                    fields[tags[key]] = segment[(sep + 1):]

    record = self.threads_db.find_one({"url": response.url})
    record["content"] = fields
    record['raw_content'] = raw_html
    self.threads_db.save(record)

    # Defensive: links is known non-empty here; guard kept from original.
    if not item['links']:
        return item
    return parse_bt(item)