Esempio n. 1
0
    def parse_detail(self, response):
        """Parse one article detail page and yield the populated item.

        Reads title, keywords, summary, publish time, source and body HTML
        from ``response``, stores them on ``self.items`` together with a set
        of default field values taken from names defined outside this method
        (``click_num``, ``user_id``, ``istop`` ...), and yields ``self.items``.

        Nothing is yielded when the page has no ``<h1>`` text, or when the
        ``div.la_con`` body node is missing (previously the latter raised an
        ``AttributeError`` on ``None.replace``).

        Args:
            response: The downloaded detail page. ``response.meta`` is
                consulted for an optional ``titlepic_image`` entry.

        Yields:
            ``self.items`` after every field has been assigned.
        """
        # Title
        title = response.xpath('//h1/text()').extract_first(default="")

        # Keywords: the meta name is matched in both capitalisations.
        keyman = response.xpath(
            '''//meta[@name="keywords"]/@content|//meta[@name="Keywords"]/@content'''
        ).extract_first(default="")
        if keyman:
            keyman = keyman_slice(keyman)

        if not title:
            return
        title = title_slice(title)

        # Summary
        summary = response.xpath(
            '//meta[@name="description"]/@content|//meta[@name="Description"]/@content'
        ).extract_first(default="")
        summary = summay_slice(summary)

        titlepic_image = response.meta.get("titlepic_image", "")

        # Publish time (best effort: falls back to None when it cannot be parsed).
        index_node = response.xpath(
            '''//span[@class="la_t_a"]/text()''').extract_first()
        try:
            # Normalise dates such as "2016年04月13日 09:42" to
            # "2016-04-13 09:42" BEFORE matching; the pattern below only
            # accepts the hyphenated form, so normalising afterwards (as the
            # code used to) could never have any effect.
            date_text = (index_node or "").replace("年", "-").replace(
                "月", "-").replace("日", "")
            time_node = re.search(
                r".*?(\d{4}-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}).*?",
                date_text, re.S).group(1)
            # The page only carries minutes; pad seconds for strptime.
            parsed_time = datetime.datetime.strptime(
                time_node + ":00", "%Y-%m-%d %H:%M:%S")
            news_time = int(time.mktime(parsed_time.timetuple()))
        except (AttributeError, ValueError, OverflowError, OSError) as exc:
            # AttributeError: no match; ValueError: not a valid date;
            # OverflowError/OSError: mktime could not convert it.
            print(exc, "time")
            news_time = None

        # Author: this page layout provides none, use the configured default.
        writer = writer_defined

        # News source
        source = response.xpath(
            '''string(//span[@class="la_t_b"])''').extract_first(
                default=news_source_defined)
        news_source = news_source_slice(source.strip())

        # Article body
        content = response.xpath('//div[@class="la_con"]').extract_first()
        if content is None:
            # Without a body node there is nothing worth storing.
            print("content node not found", response.url)
            return
        # Order matters: each removal can join neighbouring characters, so
        # the sequence is kept exactly as before. A trailing removal of
        # "&nbsp" was dropped because no "&" can remain at that point.
        for fragment in ("【环球时报综合报道】", "【环球网体育频道】", " ",
                         "&nbsp&nbsp&nbsp&nbsp", "&", "nbsp"):
            content = content.replace(fragment, "")
        content = contentfilter(content)

        fields = {
            "news_keyman": keyman,
            "title": title,
            "content": content,
            "content_summary": summary,
            "click_num": click_num,
            "news_time": news_time,
            "news_source": news_source,
            "writer": writer,
            "class_id": self.class_id,
            "user_id": user_id,
            "istop": istop,
            "ismember": ismember,
            "userfen": userfen,
            "isgood": isgood,
            "user_name": "admin",
            "group_id": group_id,
            "plnum": plnum,
            "first_title": first_title,
            "is_qf": is_qf,
            "totaldown": totaldown,
            "have_html": have_html,
            "last_dotime": int(time.time()),
            "diggtop": diggtop,
            "stb": stb,
            "ttid": ttid,
            "ispic": ispic,
            "isurl": isurl,
            "fstb": fstb,
            "restb": restb,
            "news_tem_pid": news_tem_pid,
            "dokey": dokey,
            "closepl": closepl,
            "haveaddfen": haveaddfen,
            "infotags": keyman,
            "checked": checked,
            "keyid": keyid,
            "news_path": news_path,
            "titlepic": titlepic_image,
            "ftitle": ftitle,
            "filename": filename,
            "titlefont": titlefont,
            "title_url_z": title_url_z,
            "originalurl": response.url,
        }
        for key, value in fields.items():
            self.items[key] = value

        # NOTE(review): the same ``self.items`` object is yielded for every
        # page; confirm downstream consumers do not keep references to it.
        yield self.items
Esempio n. 2
0
    def parse_detail(self, response):
        """Parse one article detail page and yield the populated item.

        Reads title, keywords, summary, publish time, author, source and
        body HTML from ``response``, stores them on ``self.items`` together
        with a set of default field values taken from names defined outside
        this method (``click_num``, ``user_id``, ``titlepic`` ...), and
        yields ``self.items``.

        Nothing is yielded when the processed title is empty, or when the
        ``div#zw`` body node is missing (previously the latter raised an
        ``AttributeError`` on ``None.replace``).

        Args:
            response: The downloaded detail page. ``response.meta`` is
                consulted for a ``time_node`` entry holding the publish
                time as ``YYYY/MM/DD HH:MM:SS`` or ``YYYY-MM-DD HH:MM:SS``.

        Yields:
            ``self.items`` after every field has been assigned.
        """
        # Title
        title = response.xpath('//div[@id="biaoti"]/text()').extract_first(
            default="")
        title = title_slice(title.strip())

        # Keywords
        keyman = response.xpath(
            '''//meta[@name="keywords"]/@content''').extract_first(default="")
        if keyman:
            keyman = keyman_slice(keyman)

        if not title:
            return

        # Summary, with the site prefix removed.
        summary = response.xpath('//meta[@name="description"]/@content'
                                 ).extract_first(default="").strip()
        summary = summay_slice(summary.replace("东方网-东方新闻-", ""))

        # Byline text, e.g. "来源:新华社 作者:胡浩 林晖 选稿:刘晓晶".
        index_node = response.xpath(
            'string(//div[@class="time grey12a fc lh22"]/p[last()])'
        ).extract_first()

        # Publish time (best effort: falls back to None when it cannot be parsed).
        try:
            time_text = response.meta.get("time_node", "").replace("/", "-")
            parsed_time = datetime.datetime.strptime(
                time_text.strip(), "%Y-%m-%d %H:%M:%S")
            news_time = int(time.mktime(parsed_time.timetuple()))
        except (AttributeError, TypeError, ValueError, OverflowError,
                OSError) as exc:
            # AttributeError/TypeError: meta value is not a text string;
            # ValueError: not a valid date; OverflowError/OSError: mktime
            # could not convert it.
            print(exc, "time")
            news_time = None

        # Author: the text between "作者:" and "选稿:" in the byline.
        try:
            writer = re.search(r".*?作者:(.*?)选稿:.*?", index_node,
                               re.S).group(1).strip()
        except (AttributeError, TypeError) as exc:
            print(exc, "writer")
            writer = writer_defined
        writer = writer_slice(writer)

        # News source: try the most specific byline layout first, then fall
        # back to looser ones; use the configured default when none matches.
        source_patterns = (
            r".*?来源:(.*?)作者:.*?",
            r".*?来源:(.*?)选稿:.*?",
            r".*?来源:(.*)",
        )
        last_error = None
        for pattern in source_patterns:
            try:
                source = re.search(pattern, index_node,
                                   re.S).group(1).strip()
                break
            except (AttributeError, TypeError) as exc:
                last_error = exc
        else:
            print(last_error, "source")
            source = news_source_defined
        news_source = news_source_slice(source)

        # Article body
        content = response.xpath('//div[@id="zw"]').extract_first()
        if content is None:
            # Without a body node there is nothing worth storing.
            print("content node not found", response.url)
            return
        # Order matters: each removal can join neighbouring characters, so
        # the sequence is kept exactly as before. A trailing removal of
        # "&nbsp" was dropped because no "&" can remain at that point.
        for fragment in (" ", "&nbsp", "&nbsp&nbsp&nbsp&nbsp", "&", "nbsp"):
            content = content.replace(fragment, "")
        content = contentfilter(content)

        fields = {
            "news_keyman": keyman,
            "title": title,
            "content": content,
            "content_summary": summary,
            "click_num": click_num,
            "news_time": news_time,
            "news_source": news_source,
            "writer": writer,
            "class_id": self.class_id,
            "user_id": user_id,
            "istop": istop,
            "ismember": ismember,
            "userfen": userfen,
            "isgood": isgood,
            "user_name": "admin",
            "group_id": group_id,
            "plnum": plnum,
            "first_title": first_title,
            "is_qf": is_qf,
            "totaldown": totaldown,
            "have_html": have_html,
            "last_dotime": int(time.time()),
            "diggtop": diggtop,
            "stb": stb,
            "ttid": ttid,
            "ispic": ispic,
            "isurl": isurl,
            "fstb": fstb,
            "restb": restb,
            "news_tem_pid": news_tem_pid,
            "dokey": dokey,
            "closepl": closepl,
            "haveaddfen": haveaddfen,
            "infotags": keyman,
            "checked": checked,
            "keyid": keyid,
            "news_path": news_path,
            "titlepic": titlepic,
            "ftitle": ftitle,
            "filename": filename,
            "titlefont": titlefont,
            "title_url_z": title_url_z,
            "originalurl": response.url,
        }
        for key, value in fields.items():
            self.items[key] = value

        # NOTE(review): the same ``self.items`` object is yielded for every
        # page; confirm downstream consumers do not keep references to it.
        yield self.items
    def parse_detail(self, response):
        """Parse one article detail page and yield the populated item.

        Reads title, keywords, summary, publish time, author, source and
        body HTML from ``response``, stores them on ``self.items`` together
        with a set of default field values taken from names defined outside
        this method (``click_num``, ``user_id``, ``istop`` ...), and yields
        ``self.items``.

        Nothing is yielded when the page has no ``<h1>`` text, or when the
        ``div#article-content`` body node is missing (previously the latter
        raised an ``AttributeError`` on ``None.replace``).

        Args:
            response: The downloaded detail page. ``response.meta`` is
                consulted for an optional ``titlepic_image`` entry.

        Yields:
            ``self.items`` after every field has been assigned.
        """
        # Title
        title = response.xpath('//h1/text()').extract_first(default="")

        # Keywords
        keyman = response.xpath(
            '''//meta[@name="keywords"]/@content''').extract_first(default="")
        if keyman:
            keyman = keyman_slice(keyman)

        if not title:
            return
        title = title_slice(title)

        # Summary
        summary = response.xpath('//meta[@name="description"]/@content'
                                 ).extract_first(default="")
        summary = summay_slice(summary)

        titlepic_image = response.meta.get("titlepic_image", "")

        # Publish time (best effort: falls back to None when it cannot be parsed).
        time_node = response.xpath(
            '''//div[@class="article-aboute"]/span[@id="pubtime_baidu"]/text()'''
        ).extract_first()
        try:
            # str() turns a missing node (None) into "None", which strptime
            # rejects with ValueError and so lands in the fallback below.
            parsed_time = datetime.datetime.strptime(
                str(time_node).strip(), "%Y-%m-%d %H:%M:%S")
            news_time = int(time.mktime(parsed_time.timetuple()))
        except (ValueError, OverflowError, OSError) as exc:
            print("time", exc)
            news_time = None

        # Author, with the "作者:" label removed.
        try:
            writer = response.xpath(
                '''string(//div[@class="article-aboute"]/span[@id="author_baidu"])'''
            ).extract_first()
            writer = writer.replace("作者:", "").strip()
        except AttributeError as exc:
            print(exc, "writer")
            writer = writer_defined
        writer = writer_slice(writer)

        # News source, with the surrounding square brackets removed; the
        # configured default is used when the node is missing.
        try:
            source = response.xpath(
                '''//div[@class="article-aboute"]/span[@id="source_baidu"]/text()'''
            ).extract_first()
            source = source.replace("[", "").replace("]", "").strip()
        except AttributeError as exc:
            print(exc, "source")
            source = news_source_defined
        news_source = news_source_slice(source)

        # Article body
        content = response.xpath(
            '//div[@id="article-content"]').extract_first()
        if content is None:
            # Without a body node there is nothing worth storing.
            print("content node not found", response.url)
            return
        # Order matters: each removal can join neighbouring characters, so
        # the sequence is kept exactly as before. A trailing removal of
        # "&nbsp" was dropped because no "&" can remain at that point.
        for fragment in ("&nbsp", " ", "&nbsp&nbsp&nbsp&nbsp", "&", "nbsp"):
            content = content.replace(fragment, "")
        content = contentfilter(content)

        fields = {
            "news_keyman": keyman,
            "title": title,
            "content": content,
            "content_summary": summary,
            "click_num": click_num,
            "news_time": news_time,
            "news_source": news_source,
            "writer": writer,
            "class_id": self.class_id,
            "user_id": user_id,
            "istop": istop,
            "ismember": ismember,
            "userfen": userfen,
            "isgood": isgood,
            "user_name": "admin",
            "group_id": group_id,
            "plnum": plnum,
            "first_title": first_title,
            "is_qf": is_qf,
            "totaldown": totaldown,
            "have_html": have_html,
            "last_dotime": int(time.time()),
            "diggtop": diggtop,
            "stb": stb,
            "ttid": ttid,
            "ispic": ispic,
            "isurl": isurl,
            "fstb": fstb,
            "restb": restb,
            "news_tem_pid": news_tem_pid,
            "dokey": dokey,
            "closepl": closepl,
            "haveaddfen": haveaddfen,
            "infotags": keyman,
            "checked": checked,
            "keyid": keyid,
            "news_path": news_path,
            "titlepic": titlepic_image,
            "ftitle": ftitle,
            "filename": filename,
            "titlefont": titlefont,
            "title_url_z": title_url_z,
            "originalurl": response.url,
        }
        for key, value in fields.items():
            self.items[key] = value

        # NOTE(review): the same ``self.items`` object is yielded for every
        # page; confirm downstream consumers do not keep references to it.
        yield self.items