# Imports assumed by the spider methods in this section. These methods are
# excerpted from their spider classes; the module paths for the project's own
# item and helpers are assumptions:
import datetime
from urllib import parse

from scrapy import Request
from scrapy.loader import ItemLoader

from ..items import NewsItem                      # project item (path assumed)
from ..utils.common import (get_md5, get_score,   # project helpers (path assumed)
                            make_str, tranfer_str,
                            reverse_tranfer_str,
                            get_re_zaker, decode_zaker)


def parse_json(self, response):
    print("json")
    # Match both highlighted and plain news_li entries; each @class value
    # needs its own comparison inside the predicate.
    div_xpath = '//div[@class="news_li xh-highlight" or @class="news_li"]'
    title_xpath = 'h2/a/text()'
    url_xpath = 'h2/a/@href'
    summary_xpath = 'p/text()'
    image_url_xpath = 'div[@class="news_tu"]/a/img/@src'
    news_time_xpath = 'div[@class="pdtt_trbs"]/span[1]/text()'
    re_selectors = response.xpath(div_xpath)
    for re_selector in re_selectors:
        title = re_selector.xpath(title_xpath).extract()
        url = re_selector.xpath(url_xpath).extract()
        summary = re_selector.xpath(summary_xpath).extract()
        img_urls = re_selector.xpath(image_url_xpath).extract()
        category = response.meta.get("category", "")
        from_platform = self.from_platform
        news_time = re_selector.xpath(news_time_xpath).extract()
        # The page carries only relative times, so record today's date as the
        # crawl time.
        crawl_time = datetime.datetime.now().date()
        # Skip incomplete entries, e.g. the "Dear member" promo blocks
        if not (len(title) and len(url) and len(news_time) and len(img_urls)):
            continue
        # Default score to avoid errors when news_time is empty
        if len(news_time) >= 1:
            news_score = get_score(news_time[0])
        else:
            news_score = 1.0
        base_url = 'https://www.thepaper.cn/'
        img_url = make_str('http:', img_urls)
        news_itemloader = ItemLoader(item=NewsItem(), response=response)
        news_itemloader.add_value("title", title)
        news_itemloader.add_value("image_urls", img_url)
        news_itemloader.add_value("url", parse.urljoin(base_url, url[0]))
        news_itemloader.add_value("url_md5", get_md5(url[0]))
        news_itemloader.add_value("category", category)
        news_itemloader.add_value("summary", summary)
        news_itemloader.add_value("from_platform", from_platform)
        news_itemloader.add_value("news_time", news_time)
        news_itemloader.add_value("crawl_time", crawl_time)
        news_itemloader.add_value("news_score", news_score)
        news_item = news_itemloader.load_item()
        yield news_item
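
# The helpers used above (get_md5, get_score, make_str, and the Zaker
# utilities) live in the project's utils module and are not shown in this
# section. The following are minimal sketches inferred from the call sites,
# not the project's confirmed implementations: get_md5 hashes a URL into a
# stable dedup key, and make_str normalizes scheme-relative image URLs.

import hashlib


def get_md5(url):
    # Hex MD5 digest of the URL, used as the item's url_md5 key.
    if isinstance(url, str):
        url = url.encode("utf-8")
    return hashlib.md5(url).hexdigest()


def make_str(scheme, urls):
    # Prefix scheme-relative URLs with the given scheme, e.g.
    # make_str('http:', ['//img.thepaper.cn/a.jpg'])
    #   -> ['http://img.thepaper.cn/a.jpg']
    return [scheme + u if u.startswith('//') else u for u in urls]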
def parse_detail(self, response):
    print("detail")
    cnt = 0
    div_xpath = '//div/div[@class="item-inner"]'
    title_xpath = 'h2//a/text()'
    url_xpath = 'h2//a/@href'
    summary_xpath = 'div[@class="item-lead"]/text()'
    image_url_xpath = 'a[@class="image"]/figure/@data-url'
    news_time_xpath = 'div[@class="item-time"]/text()'
    re_selectors = response.xpath(div_xpath)
    for re_selector in re_selectors:
        title = re_selector.xpath(title_xpath).extract()
        url = re_selector.xpath(url_xpath).extract()
        summary = re_selector.xpath(summary_xpath).extract()
        img_url = re_selector.xpath(image_url_xpath).extract()
        category = response.meta.get("category", "")
        from_platform = self.from_platform
        news_time = re_selector.xpath(news_time_xpath).extract()
        # The page carries only relative times, so record today's date as the
        # crawl time.
        crawl_time = datetime.datetime.now().date()
        # Skip incomplete entries, e.g. the "Dear member" promo blocks
        if not (len(title) and len(url) and len(news_time) and len(img_url)):
            continue
        # Default score to avoid errors when news_time is empty
        if len(news_time) >= 1:
            news_score = get_score(news_time[0])
        else:
            news_score = 1.0
        news_itemloader = ItemLoader(item=NewsItem(), response=response)
        news_itemloader.add_value("title", title)
        news_itemloader.add_value("image_urls", img_url)
        # news_itemloader.add_value("image_path", '/images/full')  # fixed image path used when testing scrapyd
        news_itemloader.add_value("url", parse.urljoin(response.url, url[0]))
        news_itemloader.add_value("url_md5", get_md5(url[0]))
        news_itemloader.add_value("category", category)
        news_itemloader.add_value("summary", summary)
        news_itemloader.add_value("from_platform", from_platform)
        news_itemloader.add_value("news_time", news_time)
        news_itemloader.add_value("crawl_time", crawl_time)
        news_itemloader.add_value("news_score", news_score)
        news_item = news_itemloader.load_item()
        # Test hook: cap the item count while debugging
        cnt = cnt + 1
        if cnt == 3:
            break
        yield news_item
def parse_detail(self, response):
    print("detail")
    # Match both highlighted and plain news_li entries; each @class value
    # needs its own comparison inside the predicate.
    div_xpath = '//div[@class="newsbox"]/div[@class="news_li xh-highlight" or @class="news_li"]'
    title_xpath = 'h2/a/text()'
    url_xpath = 'h2/a/@href'
    summary_xpath = 'p/text()'
    image_url_xpath = 'div[@class="news_tu"]/a/img/@src'
    news_time_xpath = 'div[@class="pdtt_trbs"]/span[1]/text()'
    # Exploratory code for building the JSON pagination parameters (top
    # article ids plus the last timestamp), kept for reference:
    # topCid1_xpath = '//div[@class="pdtt01"]/div[@class="pdtt_lt"]/a[@class="tiptitleImg"]/@data-id'
    # topCid23_xpath = '//div[@class="newsbox"]/div[@class="news_li xh-highlight" or @class="news_li"]/div/a/@data-id'
    # lasttime_xpath = '//div[@class="newsbox"]/div[@class="news_li" and @id="last1"]/@lasttime'
    # topCid1 = response.xpath(topCid1_xpath).extract()[0]
    # topCid2 = response.xpath(topCid23_xpath).extract()[0]
    # topCid3 = response.xpath(topCid23_xpath).extract()[1]
    # last_time = response.xpath(lasttime_xpath).extract()[0]
    # topCid = ','.join([topCid1, topCid2, topCid3])
    # kv = {"category": name, "nodeids": node, "time": last_time, "topCids": topCid}
    category = response.meta.get("category", "")
    re_selectors = response.xpath(div_xpath)
    for re_selector in re_selectors:
        title = re_selector.xpath(title_xpath).extract()
        url = re_selector.xpath(url_xpath).extract()
        temp_summary = re_selector.xpath(summary_xpath).extract()
        summary = temp_summary if len(temp_summary) >= 1 else ''
        img_urls = re_selector.xpath(image_url_xpath).extract()
        from_platform = self.from_platform
        news_time = re_selector.xpath(news_time_xpath).extract()
        # The page carries only relative times, so record today's date as the
        # crawl time.
        crawl_time = datetime.datetime.now().date()
        # Debug: dump the page to verify the crawl
        with open('pengpai_test.html', 'wb') as fp:
            fp.write(response.text.encode('utf-8'))
        # Skip incomplete entries, e.g. the "Dear member" promo blocks
        if not (len(title) and len(url) and len(news_time) and len(img_urls)):
            continue
        # Default score to avoid errors when news_time is empty
        if len(news_time) >= 1:
            news_score = get_score(news_time[0])
        else:
            news_score = 1.0
        base_url = 'https://www.thepaper.cn/'
        img_url = make_str('http:', img_urls)
        news_itemloader = ItemLoader(item=NewsItem(), response=response)
        news_itemloader.add_value("title", title)
        news_itemloader.add_value("image_urls", img_url)
        news_itemloader.add_value("url", parse.urljoin(base_url, url[0]))
        news_itemloader.add_value("url_md5", get_md5(url[0]))
        news_itemloader.add_value("category", category)
        news_itemloader.add_value("summary", summary)
        news_itemloader.add_value("from_platform", from_platform)
        news_itemloader.add_value("news_time", news_time)
        news_itemloader.add_value("crawl_time", crawl_time)
        news_itemloader.add_value("news_score", news_score)
        news_item = news_itemloader.load_item()
        yield news_item
    # Queue the next two JSON pages for this channel; the '精选' ("Featured")
    # channel uses a different endpoint
    node = response.meta.get("nodeids", "")
    if node != "":
        for i in range(2):
            page = i + 2
            if category == '精选':
                json_url = ('https://www.thepaper.cn/load_chosen.jsp'
                            '?nodeids={0}&pageidx={1}').format(node, page)
            else:
                json_url = ('https://www.thepaper.cn/load_index.jsp'
                            '?nodeids={0}&pageidx={1}').format(node, page)
            yield Request(url=json_url, callback=self.parse_json,
                          meta={"category": category}, dont_filter=True)
def parse_detail(self, response):
    print('detail')
    # Disable Selenium rendering for the first pass; the flag is read by a
    # downloader middleware (see the sketch after this method)
    self.open_selenium = False
    cnt = 0
    div_xpath = '//div[@class="focus-mod"]//a[@class="focus-item"]'
    title_xpath = 'div[@class="txt"]/h2/text()'
    url_xpath = '@href'
    image_url_xpath = 'div[@class="pic"]/img/@src'
    news_time_xpath = 'div[@class="txt"]/div[@class="info"]/span[2]/text()'
    category = response.meta.get("category", "")
    re_selectors = response.xpath(div_xpath)
    for re_selector in re_selectors:
        title = re_selector.xpath(title_xpath).extract()
        url = re_selector.xpath(url_xpath).extract()
        summary = ''
        # Extract the image URL, falling back to the site logo
        temp_urls = re_selector.xpath(image_url_xpath).extract()
        if len(temp_urls) >= 1:
            img_urls = 'http:' + temp_urls[0]
        else:
            img_urls = 'https://mat1.gtimg.com/pingjs/ext2020/newom/build/static/images/new_logo.png'
        from_platform = self.from_platform
        news_time = re_selector.xpath(news_time_xpath).extract()
        if len(news_time) == 0:
            # Keep the fallback a single-element list so news_time[0] below
            # receives the whole string, not its first character
            news_time = ['1天前']
        crawl_time = datetime.datetime.now().date()
        # Skip incomplete entries, e.g. the "Dear member" promo blocks
        if not (len(title) and len(url) and len(news_time) and len(img_urls)):
            continue
        # Default score to avoid errors when news_time is empty
        if len(news_time) >= 1:
            news_score = get_score(news_time[0])
        else:
            news_score = 1.0
        news_itemloader = ItemLoader(item=NewsItem(), response=response)
        news_itemloader.add_value("title", title)
        news_itemloader.add_value("image_urls", img_urls)
        news_itemloader.add_value("url", url)
        news_itemloader.add_value("url_md5", get_md5(url[0]))
        news_itemloader.add_value("category", category)
        news_itemloader.add_value("summary", summary)
        news_itemloader.add_value("from_platform", from_platform)
        news_itemloader.add_value("news_time", news_time)
        news_itemloader.add_value("crawl_time", crawl_time)
        news_itemloader.add_value("news_score", news_score)
        news_item = news_itemloader.load_item()
        # Test hook: cap the item count while debugging
        cnt = cnt + 1
        if cnt == 2:
            break
        yield news_item
    # Switch Selenium rendering on and re-fetch the page for the
    # scroll-loaded list handled by parse_more
    self.open_selenium = True
    yield Request(url=response.url, callback=self.parse_more,
                  meta={"category": category}, dont_filter=True)
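
# The open_selenium flag toggled in parse_detail implies a downloader
# middleware that renders JavaScript-heavy pages with Selenium on demand.
# That middleware is not shown in this section; below is a minimal sketch of
# the pattern, assuming the spider holds a webdriver in `self.browser` (both
# names are assumptions, not confirmed by the source):

from scrapy.http import HtmlResponse


class SeleniumMiddleware(object):
    def process_request(self, request, spider):
        # Intercept only when the spider has switched rendering on.
        if getattr(spider, 'open_selenium', False):
            spider.browser.get(request.url)
            # Returning a response here short-circuits the default downloader.
            return HtmlResponse(url=spider.browser.current_url,
                                body=spider.browser.page_source,
                                encoding='utf-8', request=request)
        return None  # fall through to the normal download path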
def parse_more(self, response):
    print('more')
    self.open_selenium = False
    cnt = 0
    # Plain items and picture items use different @class values; each value
    # needs its own comparison inside the predicate.
    div_xpath = '//div[@id="List"]//ul[@class="list"]/li[@class="item cf" or @class="item-pics cf"]'
    title_pics_xpath = 'h3/a/text()'
    title_xpath = 'div[@class="detail"]/h3/a/text()'
    url_xpath = 'div[@class="detail"]/h3/a/@href'
    url_pic_xpath = 'h3/a/@href'
    image_url_xpath = 'div[@class="picture" or @class="fl picture"]//img[1]/@src'
    news_time_xpath = 'div[@class="detail"]//span[@class="time"]/text()'
    re_selectors = response.xpath(div_xpath)
    for re_selector in re_selectors:
        # Plain items and picture items keep their fields in different places
        item_type = re_selector.xpath('@class').extract()[0]
        if item_type == "item cf":
            title = re_selector.xpath(title_xpath).extract()
            url = re_selector.xpath(url_xpath).extract()
        else:
            title = re_selector.xpath(title_pics_xpath).extract()
            url = re_selector.xpath(url_pic_xpath).extract()
        summary = ''
        # Extract the image URL, falling back to the site logo
        temp_urls = re_selector.xpath(image_url_xpath).extract()
        if len(temp_urls) >= 1:
            img_urls = 'http:' + temp_urls[0]
        else:
            img_urls = 'https://mat1.gtimg.com/pingjs/ext2020/newom/build/static/images/new_logo.png'
        category = response.meta.get("category", "")
        from_platform = self.from_platform
        news_time = re_selector.xpath(news_time_xpath).extract()
        if len(news_time) == 0:
            # Keep the fallback a single-element list so news_time[0] below
            # receives the whole string, not its first character
            news_time = ['1天前']
        crawl_time = datetime.datetime.now().date()
        # Skip incomplete entries, e.g. the "Dear member" promo blocks
        if not (len(title) and len(url) and len(news_time) and len(img_urls)):
            continue
        # Default score to avoid errors when news_time is empty
        if len(news_time) >= 1:
            news_score = get_score(news_time[0])
        else:
            news_score = 1.0
        news_itemloader = ItemLoader(item=NewsItem(), response=response)
        news_itemloader.add_value("title", title)
        news_itemloader.add_value("image_urls", img_urls)
        news_itemloader.add_value("url", url)
        news_itemloader.add_value("url_md5", get_md5(url[0]))
        news_itemloader.add_value("category", category)
        news_itemloader.add_value("summary", summary)
        news_itemloader.add_value("from_platform", from_platform)
        news_itemloader.add_value("news_time", news_time)
        news_itemloader.add_value("crawl_time", crawl_time)
        news_itemloader.add_value("news_score", news_score)
        news_item = news_itemloader.load_item()
        # Test hook: cap the item count while debugging
        cnt = cnt + 1
        if cnt == 50:
            break
        yield news_item
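
# get_score converts a timestamp such as '5分钟前' ('5 minutes ago'),
# '2小时前' ('2 hours ago') or '1天前' ('1 day ago') into the freshness score
# stored on the item. Its implementation is not shown in this section; a
# plausible sketch, assuming a simple decay over the article's age:

import re


def get_score(news_time):
    # Parse the relative time into an age in hours, then decay toward 0.
    match = re.match(r'(\d+)\s*(分钟|小时|天)前', news_time)
    if not match:
        return 1.0  # unrecognized format: treat as fresh
    value, unit = int(match.group(1)), match.group(2)
    hours = {'分钟': value / 60.0, '小时': float(value), '天': value * 24.0}[unit]
    return round(1.0 / (1.0 + hours / 24.0), 4)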
def parse_detail(self, response):
    print('detail')
    div_xpath = '//div[@id="section"]/div[@class="figure flex-block"]'
    title_xpath = 'div/h2/a/text()'
    url_xpath = 'div/h2/a/@href'
    image_url_xpath = 'a/@style'
    news_time_xpath = 'div/div/span[2]/text()'
    category = response.meta.get("category", "")
    re_selectors = response.xpath(div_xpath)
    for re_selector in re_selectors:
        title = re_selector.xpath(title_xpath).extract()
        url = 'http:' + re_selector.xpath(url_xpath).extract()[0]
        summary = 'null'
        # Extract the image URL from the inline style, e.g.
        # 'background-image:url(//zkres...);' -> 'http://zkres...'
        temp_urls = re_selector.xpath(image_url_xpath).extract()
        if len(temp_urls) >= 1:
            temp = temp_urls[0].split(":")[1]
            temp = temp.replace("url(", "http:")
            img_urls = temp.replace(");", "")
        else:
            img_urls = 'http://zkres.myzaker.com/static/zaker_web2/img/logo.png?v=20170726'
        from_platform = self.from_platform
        news_time = re_selector.xpath(news_time_xpath).extract()
        # The page carries only relative times, so record today's date as the
        # crawl time.
        crawl_time = datetime.datetime.now().date()
        # Skip incomplete entries, e.g. the "Dear member" promo blocks
        if not (len(title) and len(url) and len(news_time) and len(img_urls)):
            continue
        # Default score to avoid errors when news_time is empty
        if len(news_time) >= 1:
            news_score = get_score(news_time[0])
        else:
            news_score = 1.0
        news_itemloader = ItemLoader(item=NewsItem(), response=response)
        news_itemloader.add_value("title", title)
        news_itemloader.add_value("image_urls", img_urls)
        news_itemloader.add_value("url", url)
        # url is a single string here, so hash the whole value
        news_itemloader.add_value("url_md5", get_md5(url))
        news_itemloader.add_value("category", category)
        news_itemloader.add_value("summary", summary)
        news_itemloader.add_value("from_platform", from_platform)
        news_itemloader.add_value("news_time", news_time)
        news_itemloader.add_value("crawl_time", crawl_time)
        news_itemloader.add_value("news_score", news_score)
        news_item = news_itemloader.load_item()
        yield news_item
    # Fetch the next page's JSON content: the next_page link wraps a
    # percent-encoded inner URL whose parameters are rebuilt below
    next_page_xpath = '//div[@id="content"]/div[@class="main flex-block"]/a[@class="next_page"]/@href'
    next_url = response.xpath(next_page_xpath).extract()[0]
    deal_url = tranfer_str(next_url)
    kv = get_re_zaker(deal_url)
    appid = kv.get("appid")
    date = kv.get("date")
    artilce = kv.get("artcile")  # key spelling follows the helper's output
    stamp = kv.get("stamp")
    tab = kv.get("tab")
    version = kv.get("version")
    myversion = '&_version=' + version
    head = 'http://www.myzaker.com/news/next_new.php?f=myzaker_com&url='
    no_aticle_url = ('http://iphone.myzaker.com/zaker/blog2news.php'
                     '?app_id={0}&since_date={1}&nt={2}&_appid=iphone'
                     '&opage={3}&top_tab_id={4}&_version={5}')
    base_url = ('http://iphone.myzaker.com/zaker/blog2news.php'
                '?app_id={0}&since_date={1}&nt={2}&next_aticle_id={3}'
                '&_appid=iphone&opage={4}&otimestamp={5}&top_tab_id={6}'
                '&_version={7}')
    for page in range(1):
        nt = page + 1
        opage = page + 2
        if artilce is None or stamp is None:
            json_url = (head
                        + reverse_tranfer_str(no_aticle_url.format(
                            appid, date, nt, opage, tab, version))
                        + myversion)
        else:
            json_url = (head
                        + reverse_tranfer_str(base_url.format(
                            appid, date, nt, artilce, opage, stamp, tab, version))
                        + myversion)
        yield Request(url=json_url, callback=self.parse_json,
                      meta={"category": category}, dont_filter=True)
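
# tranfer_str, reverse_tranfer_str and get_re_zaker are project helpers whose
# implementations are not shown. The next_page href carries a percent-encoded
# inner URL, so plausible sketches are a percent-decode, a percent-encode,
# and a query-string parse keyed to the names read by parse_detail (the
# parameter mapping below is inferred from the format strings, not confirmed):

from urllib import parse


def tranfer_str(url):
    # Percent-decode the inner URL embedded in the next_page link.
    return parse.unquote(url)


def reverse_tranfer_str(url):
    # Re-encode the inner URL before splicing it into the wrapper endpoint.
    return parse.quote(url, safe='')


def get_re_zaker(url):
    # Pull the pagination parameters out of the decoded URL's query string.
    params = dict(parse.parse_qsl(parse.urlparse(url).query))
    return {
        "appid": params.get("app_id"),
        "date": params.get("since_date"),
        "artcile": params.get("next_aticle_id"),  # key spelling follows the caller
        "stamp": params.get("otimestamp"),
        "tab": params.get("top_tab_id"),
        "version": params.get("_version"),
    }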
def parse_json(self, response):
    print('json')
    pretty_content = decode_zaker(response.text)
    next_url = 'http:' + pretty_content['data']['next_url']
    category = response.meta.get("category", "")
    for article in pretty_content['data']['article']:
        url = 'http:' + article['href']
        title = article['title']
        news_time = article["marks"][1]
        img_urls = article["img"]
        summary = 'null'
        # Fall back to the site logo when no image is given
        if len(img_urls) < 1:
            img_urls = 'zkres.myzaker.com/static/zaker_web2/img/logo.png?v=20170726'
        from_platform = self.from_platform
        # The feed carries only relative times, so record today's date as the
        # crawl time.
        crawl_time = datetime.datetime.now().date()
        # Skip incomplete entries, e.g. the "Dear member" promo blocks
        if not (len(title) and len(url) and len(news_time) and len(img_urls)):
            continue
        # Default score to avoid errors when news_time is empty
        if len(news_time) >= 1:
            news_score = get_score(news_time)
        else:
            news_score = 1.0
        img_urls = 'http://' + img_urls
        news_itemloader = ItemLoader(item=NewsItem(), response=response)
        news_itemloader.add_value("title", title)
        news_itemloader.add_value("image_urls", img_urls)
        news_itemloader.add_value("url", url)
        news_itemloader.add_value("url_md5", get_md5(url))
        news_itemloader.add_value("category", category)
        news_itemloader.add_value("summary", summary)
        news_itemloader.add_value("from_platform", from_platform)
        news_itemloader.add_value("news_time", news_time)
        news_itemloader.add_value("crawl_time", crawl_time)
        news_itemloader.add_value("news_score", news_score)
        news_item = news_itemloader.load_item()
        yield news_item
    # Follow the next page while Scrapy's DepthMiddleware reports depth <= 3;
    # default to 0 so a missing key does not break int()
    depth = int(response.meta.get("depth", 0))
    if depth <= 3:
        yield Request(url=next_url, callback=self.parse_json,
                      meta={"category": category}, dont_filter=True)
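
# decode_zaker normalizes the endpoint's payload before parse_json walks
# data.next_url and data.article. Its implementation is not shown; assuming
# the endpoint returns JSON with \uXXXX-escaped Chinese text, which
# json.loads already decodes, a minimal sketch is simply:

import json


def decode_zaker(text):
    # Parse the endpoint's JSON; \uXXXX escapes are handled by json.loads.
    return json.loads(text)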