def parse_detail(self, response):
    """Parse an article detail page and yield ``self.items`` filled from it.

    Title, keywords, description, publish time, source and body are read
    from ``response`` via XPath, combined with defaults that are defined
    outside this method, and stored on ``self.items``.

    Args:
        response: page response; must provide ``xpath()``, ``meta`` and
            ``url``. ``meta["titlepic_image"]`` is used as the title
            picture when present, otherwise an empty string.

    Yields:
        ``self.items``. The same object is mutated and yielded on every
        call. Nothing is yielded when the page has no ``<h1>`` text or no
        ``div.la_con`` body.
    """
    # Title
    title = response.xpath('//h1/text()').extract_first(default="")

    # Keywords (the meta name is matched in both capitalisations).
    keyman = response.xpath(
        '''//meta[@name="keywords"]/@content|//meta[@name="Keywords"]/@content'''
    ).extract_first(default="")
    if keyman:
        keyman = keyman_slice(keyman)

    # A page without a title is not turned into an item.
    if not title:
        return
    title = title_slice(title)

    # Summary, taken from the description meta tag.
    summary = response.xpath(
        '//meta[@name="description"]/@content|//meta[@name="Description"]/@content'
    ).extract_first(default="")
    summary = summay_slice(summary)

    titlepic_image = response.meta.get("titlepic_image", "")

    # Publish time. Best effort: any failure (missing node, no match,
    # unparsable or unrepresentable date) is reported and leaves None.
    index_node = response.xpath(
        '''//span[@class="la_t_a"]/text()''').extract_first()
    try:
        time_node = re.search(
            r".*?(\d{4}-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}).*?", index_node,
            re.S).group(1)
        # The pattern captures up to the minute; pad the seconds so the
        # value fits the strptime format.
        news_time = datetime.datetime.strptime(time_node + ":00",
                                               "%Y-%m-%d %H:%M:%S")
        news_time = int(time.mktime(news_time.timetuple()))
    except Exception as e:
        print(e, "time")
        news_time = None
    # Sample byline kept from the original code:
    #   '2016年04月13日 09:42 来源:深圳中原地产网 作者: 中原地产'
    # NOTE(review): that sample uses 年/月/日 separators, which the pattern
    # above does not match - confirm the date format of the live page.

    # Writer: this page type always uses the default.
    writer = writer_defined

    # News source
    source = response.xpath(
        '''string(//span[@class="la_t_b"])''').extract_first(
            default=news_source_defined)
    news_source = news_source_slice(source.strip())

    # Article body
    content = response.xpath('//div[@class="la_con"]').extract_first()
    if content is None:
        # Without this check the first .replace() below raised
        # AttributeError and aborted the callback.
        print("content node not found", "content")
        return
    # NOTE(review): the whitespace-only literals are copied verbatim; if
    # they were meant to be non-breaking or ideographic spaces, confirm the
    # exact characters - a plain ASCII space also strips the spaces inside
    # HTML tags. "&nbsp" can no longer occur once "&" has been removed, so
    # the last entry is a no-op; it is kept to preserve the sequence.
    for junk in ("【环球时报综合报道】", "【环球网体育频道】", " ", "    ", "&", "nbsp",
                 "&nbsp"):
        content = content.replace(junk, "")
    content = contentfilter(content)

    fields = {
        "news_keyman": keyman,
        "title": title,
        "content": content,
        'content_summary': summary,
        'click_num': click_num,
        'news_time': news_time,
        'news_source': news_source,
        'writer': writer,
        # NOTE(review): `self.items["class_id"] = self.class_id` appears
        # disabled at this position in the original and is left disabled -
        # confirm class_id is supplied elsewhere.
        "user_id": user_id,
        "istop": istop,
        "ismember": ismember,
        "userfen": userfen,
        "isgood": isgood,
        "user_name": "admin",
        "group_id": group_id,
        "plnum": plnum,
        "first_title": first_title,
        "is_qf": is_qf,
        "totaldown": totaldown,
        "have_html": have_html,
        "last_dotime": int(time.time()),
        "diggtop": diggtop,
        "stb": stb,
        "ttid": ttid,
        "ispic": ispic,
        "isurl": isurl,
        "fstb": fstb,
        "restb": restb,
        "news_tem_pid": news_tem_pid,
        "dokey": dokey,
        "closepl": closepl,
        "haveaddfen": haveaddfen,
        "infotags": keyman,
        "checked": checked,
        "keyid": keyid,
        "news_path": news_path,
        "titlepic": titlepic_image,
        "ftitle": ftitle,
        # NOTE(review): `self.items['filename'] = filename` likewise appears
        # disabled in the original - confirm.
        'titlefont': titlefont,
        'title_url_z': title_url_z,
        'originalurl': response.url,
    }
    for key, value in fields.items():
        self.items[key] = value

    yield self.items
def parse_detail(self, response):
    """Parse an article detail page and yield ``self.items`` filled from it.

    Title, keywords, description, writer, source and body are read from
    ``response`` via XPath; the publish time comes from
    ``response.meta["time_node"]``. The values are combined with defaults
    that are defined outside this method and stored on ``self.items``.

    Args:
        response: page response; must provide ``xpath()``, ``meta`` and
            ``url``.

    Yields:
        ``self.items``. The same object is mutated and yielded on every
        call. Nothing is yielded when the sliced title is empty or the
        page has no ``div#zw`` body.
    """
    # Title
    title = response.xpath('//div[@id="biaoti"]/text()').extract_first(
        default="")
    title = title_slice(title.strip())

    # Keywords
    keyman = response.xpath(
        '''//meta[@name="keywords"]/@content''').extract_first(default="")
    if keyman:
        keyman = keyman_slice(keyman)

    # A page without a title is not turned into an item.
    if not title:
        return

    # Summary, taken from the description meta tag without the site prefix.
    summary = response.xpath('//meta[@name="description"]/@content'
                             ).extract_first(default="").strip()
    summary = summary.replace("东方网-东方新闻-", "")
    summary = summay_slice(summary)

    # Byline text that the writer and source are extracted from.
    index_node = response.xpath(
        'string(//div[@class="time grey12a fc lh22"]/p[last()])'
    ).extract_first()

    # Publish time, handed over by the requesting callback through meta.
    # Best effort: any failure is reported and leaves None.
    try:
        time_node = response.meta.get("time_node", "").replace("/", "-")
        news_time = datetime.datetime.strptime(time_node.strip(),
                                               "%Y-%m-%d %H:%M:%S")
        news_time = int(time.mktime(news_time.timetuple()))
    except Exception as e:
        print(e, "time")
        news_time = None

    # Sample byline: '来源:新华社 作者:胡浩 林晖 朱基钗 史竞男 选稿:刘晓晶 '
    # Writer: the text between the author label and the editor label.
    try:
        writer = re.search(r".*?作者:(.*?)选稿:.*?", index_node,
                           re.S).group(1).strip()
    except (TypeError, AttributeError) as e:
        # TypeError: no byline text at all; AttributeError: no match.
        print(e, "writer")
        writer = writer_defined
    writer = writer_slice(writer)

    # News source: try the delimiters in order of preference - up to the
    # author label, else up to the editor label, else the rest of the text.
    source = news_source_defined
    for pattern in (r".*?来源:(.*?)作者:.*?", r".*?来源:(.*?)选稿:.*?",
                    r".*?来源:(.*)"):
        try:
            source = re.search(pattern, index_node, re.S).group(1).strip()
        except (TypeError, AttributeError) as e:
            # Keep the error: the name bound by `except` is cleared when
            # the handler exits.
            last_error = e
        else:
            break
    else:
        # No pattern matched; the default assigned above stays in place.
        print(last_error, "source")
    news_source = news_source_slice(source)

    # Article body
    content = response.xpath('//div[@id="zw"]').extract_first()
    if content is None:
        # Without this check the first .replace() below raised
        # AttributeError and aborted the callback.
        print("content node not found", "content")
        return
    # NOTE(review): the whitespace-only literals are copied verbatim; if
    # they were meant to be non-breaking or ideographic spaces, confirm the
    # exact characters - a plain ASCII space also strips the spaces inside
    # HTML tags. "&nbsp" can no longer occur once "&" has been removed, so
    # the last entry is a no-op; it is kept to preserve the sequence.
    for junk in (" ", " ", "    ", "&", "nbsp", "&nbsp"):
        content = content.replace(junk, "")
    content = contentfilter(content)

    fields = {
        "news_keyman": keyman,
        "title": title,
        "content": content,
        'content_summary': summary,
        'click_num': click_num,
        'news_time': news_time,
        'news_source': news_source,
        'writer': writer,
        # NOTE(review): `self.items["class_id"] = self.class_id` appears
        # disabled at this position in the original and is left disabled -
        # confirm class_id is supplied elsewhere.
        "user_id": user_id,
        "istop": istop,
        "ismember": ismember,
        "userfen": userfen,
        "isgood": isgood,
        "user_name": "admin",
        "group_id": group_id,
        "plnum": plnum,
        "first_title": first_title,
        "is_qf": is_qf,
        "totaldown": totaldown,
        "have_html": have_html,
        "last_dotime": int(time.time()),
        "diggtop": diggtop,
        "stb": stb,
        "ttid": ttid,
        "ispic": ispic,
        "isurl": isurl,
        "fstb": fstb,
        "restb": restb,
        "news_tem_pid": news_tem_pid,
        "dokey": dokey,
        "closepl": closepl,
        "haveaddfen": haveaddfen,
        "infotags": keyman,
        "checked": checked,
        "keyid": keyid,
        "news_path": news_path,
        "titlepic": titlepic,
        "ftitle": ftitle,
        # NOTE(review): `self.items['filename'] = filename` likewise appears
        # disabled in the original - confirm.
        'titlefont': titlefont,
        'title_url_z': title_url_z,
        'originalurl': response.url,
    }
    for key, value in fields.items():
        self.items[key] = value

    yield self.items
def parse_detail(self, response):
    """Parse an article detail page and yield ``self.items`` filled from it.

    Title, keywords, description, publish time, writer, source and body
    are read from ``response`` via XPath, combined with defaults that are
    defined outside this method, and stored on ``self.items``.

    Args:
        response: page response; must provide ``xpath()``, ``meta`` and
            ``url``. ``meta["titlepic_image"]`` is used as the title
            picture when present, otherwise an empty string.

    Yields:
        ``self.items``. The same object is mutated and yielded on every
        call. Nothing is yielded when the page has no ``<h1>`` text or no
        ``div#article-content`` body.
    """
    # Title
    title = response.xpath('//h1/text()').extract_first(default="")

    # Keywords
    keyman = response.xpath(
        '''//meta[@name="keywords"]/@content''').extract_first(default="")
    if keyman:
        keyman = keyman_slice(keyman)

    # A page without a title is not turned into an item.
    if not title:
        return
    title = title_slice(title)

    # Summary, taken from the description meta tag.
    summary = response.xpath('//meta[@name="description"]/@content'
                             ).extract_first(default="")
    summary = summay_slice(summary)

    titlepic_image = response.meta.get("titlepic_image", "")

    # Publish time. Best effort: a missing node becomes the text "None",
    # which fails parsing like any other malformed value; every failure is
    # reported and leaves None.
    time_node = response.xpath(
        '''//div[@class="article-aboute"]/span[@id="pubtime_baidu"]/text()'''
    ).extract_first()
    try:
        news_time = datetime.datetime.strptime(
            str(time_node).strip(), "%Y-%m-%d %H:%M:%S")
        news_time = int(time.mktime(news_time.timetuple()))
    except Exception as e:
        print("time", e)
        news_time = None

    # Writer: text of the author span without its label.
    writer = response.xpath(
        '''string(//div[@class="article-aboute"]/span[@id="author_baidu"])'''
    ).extract_first()
    try:
        writer = writer.replace("作者:", "").strip()
    except AttributeError as e:
        # extract_first() returned None.
        print(e, "writer")
        writer = writer_defined
    writer = writer_slice(writer)

    # News source: text of the source span without its square brackets.
    source = response.xpath(
        '''//div[@class="article-aboute"]/span[@id="source_baidu"]/text()'''
    ).extract_first()
    try:
        source = source.replace("[", "").replace("]", "").strip()
    except AttributeError as e:
        # extract_first() returned None: the span has no text node.
        print(e, "source")
        source = news_source_defined
    news_source = news_source_slice(source)

    # Article body
    content = response.xpath(
        '//div[@id="article-content"]').extract_first()
    if content is None:
        # Without this check the first .replace() below raised
        # AttributeError and aborted the callback.
        print("content node not found", "content")
        return
    # NOTE(review): the whitespace-only literals are copied verbatim; if
    # they were meant to be non-breaking or ideographic spaces, confirm the
    # exact characters - a plain ASCII space also strips the spaces inside
    # HTML tags. "&nbsp" can no longer occur once "&" has been removed, so
    # the last entry is a no-op; it is kept to preserve the sequence.
    for junk in (" ", " ", "    ", "&", "nbsp", "&nbsp"):
        content = content.replace(junk, "")
    content = contentfilter(content)

    fields = {
        "news_keyman": keyman,
        "title": title,
        "content": content,
        'content_summary': summary,
        'click_num': click_num,
        'news_time': news_time,
        'news_source': news_source,
        'writer': writer,
        # NOTE(review): `self.items["class_id"] = self.class_id` appears
        # disabled at this position in the original and is left disabled -
        # confirm class_id is supplied elsewhere.
        "user_id": user_id,
        "istop": istop,
        "ismember": ismember,
        "userfen": userfen,
        "isgood": isgood,
        "user_name": "admin",
        "group_id": group_id,
        "plnum": plnum,
        "first_title": first_title,
        "is_qf": is_qf,
        "totaldown": totaldown,
        "have_html": have_html,
        "last_dotime": int(time.time()),
        "diggtop": diggtop,
        "stb": stb,
        "ttid": ttid,
        "ispic": ispic,
        "isurl": isurl,
        "fstb": fstb,
        "restb": restb,
        "news_tem_pid": news_tem_pid,
        "dokey": dokey,
        "closepl": closepl,
        "haveaddfen": haveaddfen,
        "infotags": keyman,
        "checked": checked,
        "keyid": keyid,
        "news_path": news_path,
        "titlepic": titlepic_image,
        "ftitle": ftitle,
        # NOTE(review): `self.items['filename'] = filename` likewise appears
        # disabled in the original - confirm.
        'titlefont': titlefont,
        'title_url_z': title_url_z,
        'originalurl': response.url,
    }
    for key, value in fields.items():
        self.items[key] = value

    yield self.items