def parse(self, response):
    """Parse a Maoyan-style movie listing page into Homework2Item objects.

    Extracts movie names, types and release times, cleans whitespace,
    drops blank type/time entries, and returns up to 10 items.

    :param response: Scrapy response for the listing page.
    :return: list of Homework2Item.
    """
    sel = Selector(response=response)
    movie_name_list = sel.xpath(
        '//dd/div[1]/div[2]/a/div/div[1]/span[1]/text()').extract()
    movie_type_list = sel.xpath(
        '//dd/div[1]/div[2]/a/div/div[2]/text()').extract()
    movie_time_list = sel.xpath(
        '//dd/div[1]/div[2]/a/div/div[4]/text()').extract()

    # Strip spaces and newlines per element. The original round-tripped the
    # list through str()/eval(), which is unsafe and fragile; plain string
    # replacement on each element achieves the same cleanup.
    movie_type_list = [s.replace(' ', '').replace('\n', '')
                       for s in movie_type_list]
    movie_time_list = [s.replace(' ', '').replace('\n', '')
                       for s in movie_time_list]

    # Drop blank entries pairwise (the original removed from both lists while
    # iterating one of them, which skips elements and can raise ValueError).
    cleaned = [(t, tm) for t, tm in zip(movie_type_list, movie_time_list) if t]
    movie_type_list = [t for t, _ in cleaned]
    movie_time_list = [tm for _, tm in cleaned]

    # Cap at 10 items but never index past the shortest list
    # (the original hard-coded range(10) and could IndexError).
    count = min(10, len(movie_name_list),
                len(movie_type_list), len(movie_time_list))
    items = []
    for index in range(count):
        item = Homework2Item(movie_name=movie_name_list[index],
                             movie_type=movie_type_list[index],
                             movie_time=movie_time_list[index])
        items.append(item)
    return items
def scrapyContent(filename):
    """Scrape item names, lazy-loaded image URLs and detail-page URLs from a
    saved HTML file and print the first 60 names and detail URLs.

    :param filename: path to the local HTML file to parse.
    """
    # Use a context manager so the file handle is always closed
    # (the original leaked the handle from a bare open().read()).
    with open(filename, 'r', encoding='utf-8') as fp:
        body = fp.read()

    itemName = Selector(
        text=body).xpath("/html/body/div/div[3]//img//@alt").extract()
    imageUrlList = Selector(
        text=body).xpath('//img/@data-ks-lazyload').extract()
    itemUrlList = Selector(
        text=body).xpath("/html/body/div/div[3]//dt/a//@href").extract()

    # Image alt text is used for names because the detail <a> text may
    # contain many embedded spaces.
    for i in itemName[0:60]:
        print(i.replace('\\"', ""))

    # Keep only hrefs whose chars 2:4 are "//" (protocol-relative links).
    # The original removed elements while iterating the same list, which
    # silently skips the element after each removal.
    itemUrlList = [i for i in itemUrlList if i[2:4] == "//"]
    for i in itemUrlList[0:60]:
        itemUrl = "https:" + i.replace('\\"', "")
        print(itemUrl)
def getAdvisorData(self, response):
    """Extract advisor names and links from the response and pair them.

    Uses the instance's ``nameAndLinks`` XPath; the extract() result
    alternates name, link, name, link, ...

    :param response: HTTP response with a ``.text`` body.
    :return: list of [link, name] pairs.
    """
    nameLink = Selector(text=response.text).xpath(self.nameAndLinks).extract()
    # Drop known non-advisor CSS-class tokens that leak into the result.
    # NOTE(review): list.remove() drops only the FIRST occurrence of each
    # token, matching the original behavior — confirm a single occurrence
    # is expected per page.
    for marker in (u'mw-redirect', 'new'):
        try:
            nameLink.remove(marker)
        except ValueError:
            # Token not present on this page — nothing to remove.
            # (The original used a bare ``except:`` which also hid real bugs.)
            pass
    # Pair even-index entries (names) with odd-index entries (links),
    # emitting [link, name] as the original did.
    return [[link, name] for name, link in zip(nameLink[::2], nameLink[1::2])]
def parse_comments(self, response):
    """Parse a Weibo comments/forwards JSON response into CommentsItem.

    Decodes the JSON payload, extracts each comment's author, cleaned
    comment text, timestamps (per ``flag``) and like counts, then yields
    a deep copy of the populated item.

    :param response: response whose body is JSON with data.html markup.
    :yields: CommentsItem (deep-copied so later mutation is safe).
    """
    item = CommentsItem()
    item['id'] = response.meta['id']
    item['flag'] = response.meta['flag']
    item['author'] = []
    item['author_comment'] = []
    item['time'] = []

    # json.loads() lost its ``encoding`` keyword in Python 3.9 (it was
    # ignored since 3.1); passing it raises TypeError on modern Pythons.
    restojson = json.loads(response.text)
    html = restojson['data']['html']
    # Remove literal backslash-n sequences embedded in the markup.
    html = ''.join(html.split('\\n'))

    author_comments = Selector(
        text=html).xpath('//*[@class="WB_text"]').extract()
    for author_comment in author_comments:
        author = Selector(
            text=author_comment).xpath('//a/text()').extract()[0]
        item['author'].append(author)
        author_comment = dealcontent(author_comment)
        comment = Selector(text=author_comment).xpath('//text()').extract()
        comment.remove(author)
        comment = ''.join(comment)
        # If the text starts with a space, strip spaces from both ends —
        # one strip() pass is enough; the original's while-loop could never
        # iterate more than once.
        if comment.startswith(' '):
            comment = comment.strip(' ')
        item['author_comment'].append(comment)

    if item['flag'] == 'forwarded':
        item['time'] = Selector(text=html).xpath(
            '//*[@class="WB_from S_txt2"]/a/@title').extract()
    if item['flag'] == 'comment':
        item['time'] = Selector(text=html).xpath(
            '//*[@class="WB_from S_txt2"]/text()').extract()

    # '////span' is invalid XPath 1.0 ('//' may not be followed by another
    # '/') and lxml rejects it; '//span' is the intended descendant search.
    item['like_count'] = Selector(text=html).xpath(
        '//span[@node-type="like_status"]/em[2]/text()').extract()
    # Replace the '赞' placeholder (no likes yet) with '0'.
    item['like_count'] = [s.replace('赞', '0') for s in item['like_count']]
    yield copy.deepcopy(item)