def parse_subject(self, response):
    """Parse a movie subject page and schedule its two comment crawls.

    Combines the metadata carried on ``response.request.info`` (read here
    as a mapping with ``title``, ``id``, ``directors``, ``casts``,
    ``cover``, ``url`` and ``rate`` keys) with values scraped from the page
    itself, then yields one request for the short comments and one for the
    long reviews. Both requests carry the same ``item`` and the same merged
    ``info`` dict as attributes.

    Args:
        response: Response for the subject page; its originating request
            must expose an ``info`` attribute.

    Yields:
        A request handled by ``self.parse_short``, followed by a request
        handled by ``self.parse_long``.
    """

    def _to_number(raw, cast, default):
        """Convert *raw* with *cast*; return *default* if it is blank or malformed."""
        try:
            return cast(raw)
        except (TypeError, ValueError):
            return default

    item = DoubanItem()
    api_info = response.request.info
    actors = response.css(".actor a::text").extract()
    page = response.text
    e = pq(page)
    link_report = e("#link-report").text()
    related_pic = [p.attr("href") for p in e(".related-pic-bd a").items()]
    rating_people = e(".rating_people span").text()
    # Parse the page into a dict of additional fields.
    d = self.parse_subject_one(page)
    movie_id = api_info["id"]
    sub_info = {
        "title": api_info["title"],
        "movieId": movie_id,
        "directors": api_info["directors"],
        "casts": api_info["casts"],
        "actors": actors,
        "category": "电影",
        "coverUrl": api_info["cover"],
        "movieUrl": api_info["url"],
        # A blank or malformed rating would otherwise raise ValueError and
        # abort the callback before any comment request is scheduled;
        # fall back to zero instead.
        "ratingValue": _to_number(api_info["rate"], float, 0.0),
        "introduction": link_report,
        "ratingPeople": _to_number(rating_people, int, 0),
        "comments": {
            "shortComment": [],
            "longComment": [],
        },
        "commentStatus": {
            # Set to True once everything has been fetched.
            "short": False,
            "long": False,
            "full": True,
        },
        "etc": {
            "relatedPictures": related_pic,
        },
    }
    # Keys built above take precedence over same-named keys from the page parse.
    sub_info = dict(d, **sub_info)

    # Short comments.
    short_url = "https://movie.douban.com/subject/{}/comments?start=0&limit=20&sort=new_score&status=P".format(
        movie_id)
    request = Request(short_url, callback=self.parse_short)
    request.item = item
    request.info = sub_info
    yield request

    # Long reviews.
    long_url = "https://movie.douban.com/subject/{}/reviews?start=0".format(
        movie_id)
    request2 = Request(long_url, callback=self.parse_long)
    request2.info = sub_info
    request2.item = item
    yield request2

    # NOTE(review): this blocking sleep runs when the consumer resumes the
    # generator after the second request; under Scrapy it presumably stalls
    # the whole reactor -- consider a download-delay setting instead.
    time.sleep(self.SLEEP)
def get_media_requests(self, item, info):
    """Yield one download request for every URL in ``item['大图']``.

    Each request is sent with ``self.default_headers``, whose ``referer``
    entry is overwritten in place with the image URL before the request is
    built. The originating ``item`` and a numeric file name are attached to
    the request as ``item`` and ``name`` attributes.

    Args:
        item: Mapping whose ``'大图'`` entry is an iterable of image URLs.
        info: Not used by this method.

    Yields:
        One request per image URL, in the order the URLs are listed.
    """
    # Files are named by a running number; counting starts at 2, so the
    # first image is "2.jpg".
    for sequence_no, image_url in enumerate(item['大图'], start=2):
        self.default_headers['referer'] = image_url
        download = Request(image_url, headers=self.default_headers)
        # Carry the item and target file name along with the request.
        download.item = item
        download.name = str(sequence_no) + ".jpg"
        yield download
def parseAndroid(self, response):
    """Build an item from an Android app page and request its download page.

    Stores the raw HTML and the page URL on a new ``UptodownItem``, copies
    over every declared item field that ``self.__parseAndroid`` returned a
    value for, and yields a request for ``<page url>/download`` with the
    item attached as ``request.item``.

    Args:
        response: Response for the app description page.

    Yields:
        A single unfiltered request handled by ``self.parseDownload``.
    """
    item = UptodownItem()
    page_html = response.text
    item["android_html"] = page_html
    print("包含描述信息的页面", response.url)
    item["android_url"] = response.url

    # Run the page parser; only keys that are declared item fields are kept.
    parsed = self.__parseAndroid(page_html)
    for name in item.fields:
        if name in parsed:
            item[name] = parsed[name]

    download_request = Request(
        response.url + "/download",
        callback=self.parseDownload,
        dont_filter=True,
    )
    download_request.item = item
    yield download_request
def parse_sina(self, response):
    """Queue the script feeds behind each ranking on the page, then re-queue the page.

    Every ``.loopblk`` block except those at positions 1 and 2 is treated
    as one ranking board. Within a board, each ``.Cons`` element (positions
    3 and 4 excluded) gets its own ``SougoutopItem``; positions 0, 1 and 2
    are labelled as the click, comment and share rankings respectively.
    For positions 0 and 1 the first ``<script>`` is requested, for position
    2 the second one. Each request is handled by ``self._parse_script`` and
    carries the item as ``request.item``.

    Whether or not parsing succeeds, the ``finally`` clause yields a new
    unfiltered request for the same URL and then sleeps for
    ``self.avg_sleep``.

    Args:
        response: Response for the ranking page.

    Yields:
        Script requests for the selected rankings, followed by a request
        that calls this method again on ``response.url``.
    """
    # Label suffix for a ranking, indexed by its position within the board.
    kind_by_position = ("点击量排行", "评论数排行", "分享数排行")
    try:
        doc = pq(response.text)
        boards = doc(".loopblk")
        for board_no, board in enumerate(boards.items()):
            # Positions 1 and 2 are the picture and video rankings; skip them.
            if board_no in (1, 2):
                continue

            # Board title, e.g. the overall news ranking.
            rank_name = board(".lbti h2").text()
            tabs = board(".Cons")
            for index, tab in enumerate(tabs.items()):
                # Positions 3 and 4 are dropped. Later positions are not
                # skipped here, but they match no script condition below.
                if index in (3, 4):
                    continue

                info = {}
                if index < len(kind_by_position):
                    info['news_type'] = "新浪" + rank_name + kind_by_position[index]

                scripts = tab("script")
                item = SougoutopItem()
                item['url'] = response.url
                item["missionCreateTime"] = datetime.now()
                item["info"] = info

                for script_no, script in enumerate(scripts.items()):
                    # Positions 0 and 1 use the first script; position 2
                    # uses the second one.
                    use_first = script_no == 0 and index < 2
                    use_second = script_no == 1 and index == 2
                    if not (use_first or use_second):
                        continue
                    feed = Request(
                        script.attr("src"),
                        callback=self._parse_script,
                        dont_filter=True,
                    )
                    feed.item = item
                    yield feed
    finally:
        # Always schedule another pass over the same page, even after an error.
        print("sina 正在添加新任务至队列头部")
        yield Request(url=response.url, callback=self.parse_sina, dont_filter=True)
        time.sleep(self.avg_sleep)