Exemple #1
0
 def parse(self, response):
     """Yield one news item per telegraph entry that is not stored yet.

     The response body is parsed with lxml. Entry containers are the
     children of the ``data-jsx='99852006'`` div, minus the first one and
     the last three. An entry is skipped when a ``NewsItemInfo`` row with the
     same generated URL and ``web_id=73`` already exists.
     """
     html = etree.HTML(response.text)
     divs = html.xpath("//div[@data-jsx='99852006']/div")[1:-3]
     # Page-level date text, used to complete entry times that lack it.
     dates = ''.join(html.xpath(".//div[@class='time']/text()")).strip()
     for div in divs:
         # Build a fresh item per entry: reusing a single instance for every
         # yield lets later iterations overwrite items already handed on.
         item = FagaiweiItem()
         item['webname'] = '财联社'
         item['web'] = 'http://cailianpress.com'
         try:
             da = ''.join(
                 div.xpath(".//div/div[@class='cTime']/text()")).strip()
             times = da if dates in da else dates + ' ' + da
         except Exception:
             # Best-effort fallback kept from the original: use "now".
             times = time.strftime('%Y-%m-%d %H:%M:%S',
                                   time.localtime(time.time()))
         content = ''.join(
             div.xpath(".//div[@class='newsRight']/p/text()")).strip()
         # Title is the leading "【...】" headline when it is at least 10
         # characters long, otherwise the first 30 characters of the text.
         title = content[0:30]
         if "【" in content:
             headline = content.split('】')[0] + "】"
             if len(headline) >= 10:
                 title = headline
         item['title'] = title
         item['pub_time'] = times
         item['content'] = content
         # The entries have no link of their own here, so a pseudo URL is
         # derived from the timestamp and used as the duplicate key below.
         item['url'] = 'http://cailianpress.com?' + str(times).replace(
             " ", "").replace(":", "")
         item["keyword"] = keyword.get_keyword(item["content"])
         item['web_id'] = 73
         already_stored = session.query(NewsItemInfo).filter_by(
             url=item['url'], web_id=73).count()
         if not already_stored:
             yield item
 def parse_page(self, response):
     """Build and return the item for one article page (web_id 62).

     The content is the space-joined header span texts (all but the last
     node) followed by the newline-joined paragraph texts with ``\\u3000``
     and ``\\xa0`` removed.
     """
     header_nodes = response.xpath(
         "//div[@class='artical_t']//span//text()")[0:-1]
     header_text = ' '.join(header_nodes.getall())
     paragraphs = response.xpath(
         "//div[@class='artical_c']/p/text()").getall()
     body_text = '\n'.join(paragraphs)
     body_text = body_text.replace('\u3000', '').replace('\xa0', '')

     item = FagaiweiItem()
     item['url'] = response.url
     item['pub_time'] = response.xpath("//span[@class='Ff']/text()").get()
     item['title'] = response.xpath("//h1/text()").get()
     item['content'] = header_text + '\n' + body_text
     item['web'] = 'http://www.cs.com.cn/sylm/jsxw/'
     item['webname'] = '中证网'
     # Crawl timestamp in local time.
     item['add_time'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
     item["keyword"] = keyword.get_keyword(item["content"])
     item['web_id'] = 62
     return item
Exemple #3
0
 def process_detail(self, response):
     """Yield the item for one article page (web_id 34).

     Missing nodes fall back to defaults: empty title, '热点新闻' as the
     source name, and the current ``datetime`` as the publication time.
     """
     title_nodes = response.xpath('//div[@class="article-header"]/h1/text()')
     source_nodes = response.xpath('//*[@id="article-source"]/text()')
     date_nodes = response.xpath('//*[@id="article-data"]/text()')
     paragraphs = response.xpath('//*[@id="main-content"]/p/text() | \
                                        //*[@id="main-content"]/p/strong/text() | \
                                        //*[@id="main-content"]/p/strong/span/text()'
                                 ).extract()

     item = FagaiweiItem()
     item['web_id'] = 34
     item['url'] = response.url
     item['title'] = title_nodes.extract_first(default='')
     item['web'] = 'http://kan.china.com/'
     item['webname'] = source_nodes.extract_first(default='热点新闻')
     item['pub_time'] = date_nodes.extract_first(default=datetime.now())
     item['content'] = '\n'.join(paragraphs)
     item["keyword"] = keyword.get_keyword(item["content"])
     yield item
    def parse(self, response):
        """Yield an item for every listed PDF that is not stored yet.

        Links, titles and dates are read as three parallel lists and zipped.
        Entries whose URL does not end in ``.pdf``, or for which a
        ``NewsItemInfo`` row with the same URL and ``web_id=79`` exists, are
        skipped. The PDF text is obtained through ``pdf.main``.
        """
        titles = response.xpath(
            "//div[contains(@class,'title')]/a/text()").getall()
        urls = response.xpath(
            "//div[contains(@class,'items-col')]/a/@href").getall()
        dates = response.xpath(
            "//div[@class='items']/div[contains(@class,'date')]/text()"
        ).getall()
        # Characters stripped from the title before it is used as a file name.
        unwanted_chars = ('*', '/', '<', '>', '|', ':', '"', '?', '?', '\t')
        for url, title, raw_date in zip(urls, titles, dates):
            if url[-4:] != '.pdf':
                continue
            stored = session.query(NewsItemInfo).filter_by(
                url=url, web_id=79).count()
            if stored:
                continue
            item = FagaiweiItem()
            item['webname'] = '中华交易服务'
            item['web'] = response.url
            title = ''.join(title)
            for char in unwanted_chars:
                title = title.replace(char, '')
            # Listing dates use "/" separators in day-month-year order.
            raw_date = raw_date.replace('/', '-')
            item['pub_time'] = datetime.strptime(raw_date, '%d-%m-%Y')
            item['url'] = url
            item['title'] = title
            content = pdf.main(url=url, fileName=title)
            if content == '':
                item['content'] = '请点击原文链接查看' + response.url
            else:
                item['content'] = ''.join(content)
            item["keyword"] = keyword.get_keyword(item["content"])
            item['web_id'] = 79
            yield item
Exemple #5
0
 def parse(self, response):
     """Yield an item for every listed announcement PDF not stored yet.

     Links, the two title parts and the times are read as parallel lists
     and zipped. The PDF address is rebuilt from the 8-digit date and the
     file id found after ``=`` in each link. Entries whose link does not
     carry that pattern are skipped, as are PDFs for which a
     ``NewsItemInfo`` row with the same URL and ``web_id=67`` exists.
     """
     urls = response.xpath(
         "//ul[@class='gg-list']/li/span[@class='tit']/a/@href").getall()
     titles1 = response.xpath(
         "//ul[@class='gg-list']/li/span[@class='tit']/a/text()").getall()
     titles2 = response.xpath(
         "//ul[@class='gg-list']/li/span[@class='code']/a/text()").getall()
     times = response.xpath(
         "//ul[@class='gg-list']/li/span[@class='time']/text()").getall()
     # Characters removed from the title before it is used as a file name.
     strip_table = str.maketrans('', '', '*/<>|:"?')
     for url, title1, title2, pub_time in zip(urls, titles1, titles2, times):
         title = (title2 + ' ' + title1).translate(strip_table)
         match = re.search(r'=(\d{8})(\w+)', url)
         if match is None:
             # Previously an IndexError here aborted the whole page; a link
             # without the date/id pattern now only skips its own entry.
             continue
         shijian, filename = match.groups()
         # NOTE(review): the year segment is hard-coded to 2018 — confirm
         # whether it should follow the date found in the link.
         url2 = ('http://php.cnstock.com/texts/2018/' + shijian + '/' +
                 filename + '.pdf')  # PDF download address
         if session.query(NewsItemInfo).filter_by(url=url2,
                                                  web_id=67).count():
             continue
         content = pdf.main(url=url2, fileName=title)
         # A fresh item per entry: reusing one instance for every yield
         # lets later iterations overwrite items already handed on.
         item = FagaiweiItem()
         if not content:
             item['content'] = '请点击原文链接查看'
         else:
             item['content'] = ''.join(content)
         item['web_id'] = 67
         item['title'] = title
         item['pub_time'] = pub_time.replace('(', '').replace(')', '')
         item['webname'] = '中国证券网信息披露平台'
         item['web'] = response.url
         item['url'] = url2
         item["keyword"] = keyword.get_keyword(item["content"])
         yield item
Exemple #6
0
    def get_detail(self, response):
        """Build and return the item for one detail page (web_id 1).

        The publication time is parsed from the ``pages-date`` text when it
        is present; otherwise the ``date`` value carried in ``response.meta``
        is used. The source name defaults to '国务院新闻'.
        """
        item = FagaiweiItem()
        item["url"] = response.url
        item["title"] = response.xpath("//h1/text()").get()

        body_nodes = response.xpath('//*[@id="UCAP-CONTENT"]/p/text()|\
                                        //*[@id="UCAP-CONTENT"]/p/span/span/text()|\
                                        //div[@class="pages_content"]/p/text()|\
                                        //div[@class="pages_content"]/p/a/text()|\
                                        //div[@class="pages_content"]/div/p/text()|\
                                        //*[@id="UCAP-CONTENT"]/p/span/text()'
                                    ).extract()
        body = "".join(body_nodes)
        # An empty body gets a placeholder text.
        item["content"] = body if body != "" else "可能是图片或表格 打开原网站查看"

        raw_date = "".join(
            response.xpath('//div[@class="pages-date"]/text()').extract())
        if not raw_date:
            pub_time = response.meta["date"]
        else:
            # Page text carries minutes only; ":00" completes the seconds.
            cleaned = str(raw_date).replace("  ", "").replace("\r", "")
            cleaned = cleaned.replace("\n", "") + ":00"
            pub_time = datetime.datetime.strptime(cleaned,
                                                  '%Y-%m-%d %H:%M:%S')
        item["pub_time"] = pub_time

        source = "".join(
            response.xpath('//div[@class="pages-date"]/span/text()').extract())
        webname = "国务院新闻" if source == "" else source
        item["webname"] = webname.replace("来源:", "")
        item["web"] = response.meta["laiyuan"]
        item["keyword"] = keyword.get_keyword(item["content"])
        item["web_id"] = 1
        return item
Exemple #7
0
    def process_detail(self, response):
        """Yield the item for one article page (web_id 22).

        Title and site come from ``response.meta``. The source name and the
        publication time are extracted with regular expressions from the two
        ``<em>`` nodes of the article header; when a node or a match is
        missing the corresponding field becomes an empty string.
        """
        item = FagaiweiItem()
        item['web_id'] = 22
        item['url'] = response.url
        item['title'] = response.meta.get('title')
        item['web'] = response.meta.get('web')
        # default='' keeps a missing header node from raising TypeError on
        # the concatenation / regex below.
        news_about = response.xpath(
            '//div[@class="Article_61"]/h3[@class="daty"]/div/em[1]/text()'
        ).extract_first(default='') + ' '
        # The trailing space added above lets the lazy group stop on \s even
        # when the source name is the last thing in the text.
        item['webname'] = ''.join(re.findall(r'来源:(.*?)\s', news_about))
        pub_text = response.xpath(
            '//div[@class="Article_61"]/h3[@class="daty"]/div/em[2]/text()'
        ).extract_first(default='')
        item['pub_time'] = ''.join(
            re.findall(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}', pub_text))
        content_xpath = (
            '//div[@class="Article_61"]/div[@class="content"]/div/div/p/text() | '
            '//div[@class="Article_61"]/div[@class="content"]/div/div/p/font/text() | '
            '//div[@class="Article_61"]/div[@class="content"]/div/div/p/strong/text()'
        )
        content = '\n'.join(response.xpath(content_xpath).extract())
        if not content:
            content = '这可能是图片或者文件,打开查看!'
        item['content'] = content
        item["keyword"] = keyword.get_keyword(item["content"])

        yield item
Exemple #8
0
 def process_detail(self, response):
     """Yield the item for one article page (web_id 37).

     Pages without a ``news_txt`` div are ignored (the check exists to
     exclude a few special URLs). Missing publication-time or source nodes
     yield empty strings instead of raising ``AttributeError``.
     """
     if not response.xpath('//div[@class="news_txt"]'):
         return
     item = FagaiweiItem()
     item["web_id"] = 37
     item["url"] = response.url
     item["title"] = response.xpath(
         '//h1[@class="news_title"]/text()').extract_first()
     # default='' so that .strip() is always called on a string.
     item["pub_time"] = response.xpath(
         '//div[@class="news_about"]/p[2]/text()').extract_first(
             default='').strip()
     content_xpath = (
         '//div[@class="news_txt"]/div/text() | '
         '//div[@class="news_txt"]/text() | '
         '//div[@class="news_txt"]/strong/text()'
     )
     item["content"] = '\n'.join(response.xpath(content_xpath).extract())
     item["webname"] = response.xpath(
         '//div[@class="news_about"]/p[1]/text()').extract_first(
             default='').strip()
     item["web"] = response.meta.get('web')
     item["keyword"] = keyword.get_keyword(item["content"])
     yield item
    def get_detail(self, response):
        """Build and return the item for one detail page (web_id 16).

        Date, title and site are taken from ``response.meta``. The source
        name is the first space-separated token of the ``right_md_laiy``
        heading with '一' removed, defaulting to '中国工程院'.
        """
        item = FagaiweiItem()
        item["url"] = response.url
        item["pub_time"] = response.meta["date"]
        item["title"] = response.meta["title"]
        heading = "".join(
            response.xpath('//div[@class="right_md_laiy"]/h4/text()').extract())
        source_name = heading.split(" ")[0].replace("一", "")
        item["webname"] = source_name if source_name != "" else "中国工程院"
        item["web"] = response.meta["laiyuan"]
        item["web_id"] = 16
        text_nodes = response.xpath('\
                                     //*[@id="zoom"]/div/p/text()|\
                                     //*[@id="zoom"]/div/p/span/text()|\
                                     //*[@id="zoom"]/strong/span/p/strong/text()|\
                                     //*[@id="zoom"]/p/text()|\
                                     //*[@id="zoom"]/p/a/text()|\
                                     //*[@id="zoom"]/p/b/span/text()|\
                                     //*[@id="zoom"]/p/strong/text()|\
                                     //*[@id="zoom"]/p/span/text()|\
                                     //*[@id="zoom"]/p/span/span/text()|\
                                     //*[@id="zoom"]/span/p/text()|\
                                     //*[@id="zoom"]/span/p/a/text()|\
                                     //*[@id="zoom"]/span/p/a/font/text()|\
                                     //*[@id="zoom"]/span/span/span/span/span/strong/span/span/strong/span/p/span/strong/span/span/strong/text()|\
                                     //*[@id="zoom"]/span/span/span/span/span/strong/span/span/p/span/text()|\
                                     //*[@id="zoom"]/span/span/span/span/span/strong/span/span/p/text()|\
                                     //*[@id="zoom"]/span/strong/span/span/p/strong/text()').extract()
        body = "".join(text_nodes)

        if body == "":
            # Nothing matched: store a placeholder text instead.
            item["content"] = "可能是图片 请打开详情页查看"
        else:
            body = body.replace("\u3000", "").replace("\xa0", "")
            item["content"] = body.replace("\u2002", "")
        item["keyword"] = keyword.get_keyword(item["content"])

        return item
Exemple #10
0
    def get_detail(self, response):
        """Build and return the item for one detail page (web_id 23).

        Title, date and site come from ``response.meta``. The body is the
        newline-joined text of all nodes matched by the content XPath union.
        The source name is read from the first ``ul/li`` text, then from a
        ``laiyuan  = "...";`` assignment in the page source, and finally
        defaults to '海南省政府网站'.
        """
        item = FagaiweiItem()
        item["url"] = response.url
        item["title"] = response.meta["title"]
        item["pub_time"] = response.meta["date"]
        # Branches are joined with "|" programmatically. The hand-written
        # literal was missing the separator before the last branch, which
        # fused the final two paths into one expression that never matched.
        content_paths = (
            '//*[@id="Zoom"]/span/p/text()',
            '//*[@id="Zoom"]/span/p/font/text/text()',
            '//*[@id="Zoom"]/span/p/font/strong/text/text()',
            '//div[@class="TRS_Editor"]/div/b/text()',
            '//div[@class="TRS_Editor"]/p/text()',
            '//div[@class="TRS_Editor"]/p/font/text()',
            '//div[@class="TRS_Editor"]/p/strong/text()',
            '//div[@class="TRS_Editor"]/p/a/text()',
            '//div[@class="TRS_Editor"]/p/a/font/text()',
            '//div[@class="TRS_Editor"]/div/p/text()',
            '//div[@class="TRS_Editor"]/div/text/text()',
            '//*[@id="neirongText"]/div/p/text()',
            '//*[@id="neirongText"]/div/p/text/text()',
            '//*[@id="neirongText"]/div/p/strong/text/text()',
        )
        contents = "\n".join(
            response.xpath("|".join(content_paths)).extract())
        if contents == "":
            item["content"] = "可能是图片或表格 打开原网站查看"
        else:
            item["content"] = contents.replace("\u3000", " ").replace(
                "\xa0", "  ")
        form_s = "".join(response.xpath('//ul/li[1]/text()').extract())
        if form_s == "":
            # No list item to read: fall back to matching the page source.
            com = re.compile(r'laiyuan  = "(.*?)";')
            form_s = "".join(re.findall(com, response.text))
            if form_s == "":
                form_s = "海南省政府网站"
        item["webname"] = form_s.replace("来源:", "")
        item["web"] = response.meta["laiyuan"]
        item["keyword"] = keyword.get_keyword(item["content"])

        item["web_id"] = 23
        return item
Exemple #11
0
    def parse_page(self, response):
        """Build and return the item for one article page (web_id 28).

        The publication time is the first ``YYYY-MM-DD HH:MM`` stamp found
        in the ``s14`` paragraph text, with ``:00`` appended as seconds.
        """
        item = FagaiweiItem()
        item['webname'] = '投资时报'
        item['web'] = response.meta['url']
        heading = response.xpath("//h2/text()").get()
        # NOTE(review): the replacement is the two characters '/n', not a
        # newline — confirm this is intended.
        item['title'] = heading.replace('\xa0', '/n')
        item['url'] = response.url

        body_nodes = response.xpath("//div[@class='para_ycont']/p/text()|"
                                    "//div[@class='para_ycont']/p/span/text()|"
                                    "//div[@class='para_ycont']/div/text()|"
                                    "//div[@class='para_ycont']/text()").getall()
        body = ''.join(body_nodes)
        item['content'] = body.replace('\r\n', '').replace('\xa0', '')

        stamp_nodes = response.xpath("//p[contains(@class,'s14')]/text()|"
                                     "//p[contains(@class,'s14')][1]/text()").getall()
        stamp_text = ''.join(stamp_nodes)
        stamp = re.search(r'[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}', stamp_text)
        item['pub_time'] = stamp.group() + ':00'
        # Crawl timestamp in local time.
        item['add_time'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        item["keyword"] = keyword.get_keyword(item["content"])

        item['web_id'] = 28

        return item
Exemple #12
0
 def get_detail(self, response):
     """Build and return the item for one detail page (web_id 9).

     Date, title and site come from ``response.meta``. The body is the
     concatenated text of the nodes matched under ``#ziti`` and the
     ``MsoNormalTable`` cells; the source name defaults to '国家体育局'.
     """
     item = FagaiweiItem()
     item["url"] = response.url
     item["pub_time"] = response.meta["date"]
     item["title"] = response.meta["title"]
     item["web"] = response.meta["laiyuan"]
     item["web_id"] = 9
     text_nodes = response.xpath('//*[@id="ziti"]/p/text()|\
                                         //table[@class="MsoNormalTable"]/tbody/tr/td/p/span/span/span/text()|\
                                         //table[@class="MsoNormalTable"]/tbody/tr/td/p/span/span/span/span/text()|\
                                         //*[@id="ziti"]/p/font/text()|\
                                         //*[@id="ziti"]/p/span/text()|\
                                         //*[@id="ziti"]/p/span/font/text()|\
                                         //*[@id="ziti"]/p/span/span/text()|\
                                         //*[@id="ziti"]/p/span/span/span/text()|\
                                         //*[@id="ziti"]/p/span/span/span/span/text()|\
                                         //*[@id="ziti"]/p/span/span/span/span/span/text()|\
                                         //*[@id="ziti"]/p/span/span/span/span/span/span/text()|\
                                         //*[@id="ziti"]/p/b/span/span/span/text()|\
                                         //*[@id="ziti"]/p/span/span/span/font/text()|\
                                         //*[@id="ziti"]/p/text()').extract()
     body = "".join(text_nodes)
     if body == "":
         # Nothing matched: store a placeholder text instead.
         item["content"] = "国家体育局 可能是图片 打开原文查看"
     else:
         item["content"] = body.replace("\u3000", "").replace("\xa0", "")
     source_nodes = response.xpath(
         '//div[@class="wz_info"]/span[2]/text()').extract()
     source = "".join(source_nodes).replace("来源:", "")
     item["webname"] = source if source != "" else "国家体育局"
     item["keyword"] = keyword.get_keyword(item["content"])
     return item
Exemple #13
0
 def parse(self, response):
     """Yield an item for every listed PDF that is not stored yet.

     Each table row whose class contains ``row`` is one announcement. Rows
     whose link does not end in ``.pdf``, or for which a ``NewsItemInfo``
     row with the same URL and ``web_id=80`` exists, are skipped. The PDF
     text is obtained through ``pdf.main``.
     """
     info_list = response.xpath(
         '//body/table[2]/tr[3]/td/table/tr[contains(@class,"row")]')
     # Characters removed from the title before it is used as a file name.
     strip_table = str.maketrans('', '', '*/<>|:"?')
     for info in info_list:
         url = 'http://www.hkexnews.hk' + info.xpath(
             './td[4]/a/@href').extract_first(default='')
         if not url.endswith('.pdf'):
             continue
         if session.query(NewsItemInfo).filter_by(url=url,
                                                  web_id=80).count():
             continue
         item = FagaiweiItem()
         item['url'] = url
         # default='' on both parts: a missing <nobr> cell used to make the
         # concatenation raise TypeError (None + str).
         name_part = info.xpath('./td[3]/nobr/text()').extract_first(
             default='')
         doc_part = info.xpath('./td[4]/div/text()').extract_first(
             default='').strip()
         title = (name_part + ':' + doc_part).translate(strip_table)
         item['title'] = title
         item['web'] = response.url
         item['webname'] = '披漏网'
         # The first cell's text nodes are joined and parsed in
         # day-month-year order.
         released = ' '.join(
             info.xpath('./td[1]/text()').extract()).replace('/', '-')
         item['pub_time'] = datetime.strptime(released, '%d-%m-%Y %H:%M')
         content = pdf.main(url=url, fileName=title)
         if not content:
             item['content'] = '这可能是图片或者文件,打开查看!'
         else:
             item['content'] = ''.join(content)
         item['web_id'] = 80
         item["keyword"] = keyword.get_keyword(item["content"])
         yield item
Exemple #14
0
    def parse_item(self, response):
        """Yield the item for one article page (web_id 40) if it is new.

        Nothing is yielded when the response status is not 200, when the
        page has no ``#p-detail`` element, or when a ``NewsItemInfo`` row
        with the same URL and ``web_id=40`` already exists.
        """
        conten_detail = response.xpath('//*[@id="p-detail"]').extract_first(
            default='')
        if response.status != 200 or not conten_detail:
            return

        item = FagaiweiItem()
        item['webname'] = response.css('#source::text').extract_first(
            '新华国际')
        # Site prefix: everything in the URL before its first run of digits.
        item['web'] = re.split(r'[0-9]+', response.url)[0]
        # default='' so a page without the title div no longer raises
        # AttributeError on .strip().
        item['title'] = response.xpath(
            '//div[@class="h-title"]/text()').extract_first(
                default='').strip()
        item['pub_time'] = response.css('.h-time::text').extract_first()
        content_xpath = (
            '//div[@id="p-detail"]//p//strong/text() | '
            '//div[@id="p-detail"]//p//strong/font/text() | '
            '//div[@id="p-detail"]//p/font/text() | '
            '//div[@id="p-detail"]//p/text() | '
            '//div[@id="p-detail"]//p/font/strong/text() | '
            '//div[@id="p-detail"]//p/p/text()'
        )
        content = '\n'.join(response.xpath(content_xpath).extract())
        # Removing the ideographic space is a no-op on "\n", so the former
        # special case for that value is not needed.
        item['content'] = content.replace('\u3000', '')
        item['url'] = response.url
        # NOTE(review): the time part uses "-" separators here; confirm the
        # consumer of add_time expects this format.
        item['add_time'] = datetime.datetime.now().strftime(
            '%Y-%m-%d %H-%M-%S')
        item['web_id'] = 40
        item["keyword"] = keyword.get_keyword(item["content"])
        already_stored = session.query(NewsItemInfo).filter_by(
            url=item['url'], web_id=40).count()
        if not already_stored:
            yield item
Exemple #15
0
 def process_detail(self, response):
     """Yield the item for one article page (web_id 65).

     The source name and the ``YYYY-MM-DD HH:MM`` publication time are both
     extracted by regular expression from the ``info_news`` text.
     """
     item = FagaiweiItem()
     item['web_id'] = 65
     item['url'] = response.url
     title_nodes = response.xpath('//div[@class="news_content"]/h1/text()')
     item['title'] = title_nodes.extract_first(default='')
     item['web'] = response.meta.get('web')
     info_nodes = response.xpath(
         '//div[@class="news_content"]/div[@class="info_news"]/text()')
     # The trailing space lets the lazy source group terminate on \s.
     news_about = info_nodes.extract_first(default='') + ' '
     source_match = re.search(r'来源:(.*?)\s', news_about)
     item['webname'] = source_match.group(1)
     time_match = re.search(r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2})', news_about)
     item['pub_time'] = time_match.group(1)
     paragraphs = response.xpath(
         '//div[@class="news_content"]/div[@class="content"]/p/text() | \
                                  //div[@class="news_content"]/div[@class="content"]/p/strong/text()'
     ).extract()
     content = '\n'.join(paragraphs)
     content = content.replace('\u2002', '').replace('\xa0', '')
     # An empty body gets a placeholder text.
     item['content'] = content if content else '这可能是图片或者文件,打开查看!'
     item["keyword"] = keyword.get_keyword(item["content"])
     yield item
Exemple #16
0
 def get_detail(self, response):
     """Build and return the item for one detail page (web_id 11).

     Date, title and site come from ``response.meta``; the source name is
     the fixed string '海关总署'.
     """
     item = FagaiweiItem()
     item["url"] = response.url
     item["pub_time"] = response.meta["date"]
     item["title"] = response.meta["title"]
     item["webname"] = "海关总署"
     item["web"] = response.meta["laiyuan"]
     item["web_id"] = 11
     text_nodes = response.xpath('//*[@id="easysiteText"]/p/text()|\
                                         //p[@class="p1"]/text()|\
                                         //p[@class="p1"]/span/text()|\
                                         //*[@id="easysiteText"]/p/strong/text()|\
                                         //*[@id="easysiteText"]/p/strong/text()'
                                 ).extract()
     body = "".join(text_nodes)
     if body == "":
         # Nothing matched: store a placeholder text instead.
         item["content"] = "可能是图片 请打开详情页查看"
     else:
         item["content"] = body.replace("\u3000", "").replace("\xa0", "")
     item["keyword"] = keyword.get_keyword(item["content"])
     return item
Exemple #17
0
 def process_detail(self, response):
     """Yield the item for one article page (web_id 74).

     Two page layouts are handled through XPath unions. The publication
     time has '月' replaced by '-' and '日' removed; when the result has no
     four-digit year, the current year is prefixed.
     """
     # Local imports: the module-level imports are not visible from here.
     import re
     from datetime import date

     item = FagaiweiItem()
     item['web_id'] = 74
     item['url'] = response.url
     item['title'] = ''.join(
         response.xpath(
             '//div[@class="newscontent_right2"]/h1/text() | '
             '//div[@class="newscontent_right"]/h1/text()').extract())
     item['web'] = response.meta.get('web')
     info_bar = ('//div[@class="newscontent_right2"]'
                 '/div[@class="content_info clearfix"]/span[1]')
     item['webname'] = response.xpath(
         info_bar + '/i[@class="zhuoze"]/a/@title | ' +
         info_bar + '/i[@class="zhuoze"]/text()'
     ).extract_first(default='全景网').strip()
     pub_time = ''.join(
         response.xpath(
             info_bar + '/time/text() | '
             '//div[contains(@class,"content_info")]/span[@class="left"]/text()'
         ).extract()).strip().replace('月', '-').replace('日', '')
     # The former test looked for the literal '2018', so a date from any
     # other year was prefixed with a second year ("2018-2019-03-05").
     if not re.search(r'\d{4}', pub_time):
         pub_time = '{}-{}'.format(date.today().year, pub_time)
     item['pub_time'] = pub_time
     article = ('//div[@class="newscontent_right2"]'
                '/div[@class="article_content2"]')
     # Same branches as before; the duplicated ".../p/a/text()" branch is
     # listed once (an XPath union already removes duplicate nodes).
     content_paths = (
         article + '//div/p/text()',
         article + '//div/p/a/text()',
         article + '/div[2]/div/p/text()',
         article + '/div[2]/p/font/text()',
         article + '/div[2]/div/p//strong/text()',
         article + '/div[2]/div/p//font/text()',
         '//div[@class="article_content"]/p/text()',
     )
     content = '\n'.join(response.xpath(' | '.join(content_paths)).extract())
     if not content:
         content = '这可能是图片或者文件,打开查看!'
     item['content'] = content.replace('\u3000', '')
     item["keyword"] = keyword.get_keyword(item["content"])
     yield item
    def get_detail(self, response):
        """Build and return the item for one detail page (web_id 5).

        Title and site come from ``response.meta``. The source name and the
        publication time are read from the first two list items under
        ``#container``, falling back to '最高人民法院新闻' and the ``date``
        value in ``response.meta`` respectively.
        """
        item = FagaiweiItem()
        item["url"] = response.url
        item["title"] = response.meta["title"]
        item["web"] = response.meta["laiyuan"]
        text_nodes = response.xpath('\
                            //*[@id="zoom"]/p/text()|\
                            //*[@id="zoom"]/p/strong/text()|\
                            //*[@id="zoom"]/p/span/text()|\
                            //*[@id="zoom"]/p/span/span/text()|\
                            //*[@id="zoom"]/p/span/strong/span/text()|\
                            //*[@id="zoom"]/strong/text()|\
                            //*[@id="zoom"]/span/text()|\
                            //*[@id="zoom"]/div/span/text()|\
                            //*[@id="zoom"]/text()').extract()
        item["content"] = "".join(text_nodes).replace("\u3000", "")

        source_nodes = response.xpath(
            '//*[@id="container"]/div/div[2]/ul[1]/li[1]/text()').extract()
        source = "".join(source_nodes)
        if source == "":
            source = "最高人民法院新闻"
        item["webname"] = source.replace("来源:", "")

        date_nodes = response.xpath(
            '//*[@id="container"]/div/div[2]/ul[1]/li[2]/text()').extract()
        # Keep what follows the last "间", minus one leading character.
        published = "".join(date_nodes).split("间")[-1][1:]
        item["pub_time"] = published if published != "" else response.meta["date"]
        item["web_id"] = 5
        item["keyword"] = keyword.get_keyword(item["content"])

        return item
Exemple #19
0
    def get_detail(self, response):
        """Build and return the item for one detail page (web_id 18).

        Date, title and site come from ``response.meta``. The source name is
        the ``right_md_laiy`` heading text, with a fixed default when that
        heading is empty.
        """
        item = FagaiweiItem()
        item["url"] = response.url
        item["pub_time"] = response.meta["date"]
        item["title"] = response.meta["title"]
        heading_nodes = response.xpath(
            '//div[@class="right_md_laiy"]/h4/text()').extract()
        heading = "".join(heading_nodes)
        item["webname"] = heading if heading != "" else "国家粮食和物资储备局门户网站 "
        item["web"] = response.meta["laiyuan"]
        item["web_id"] = 18
        text_nodes = response.xpath(
            '//div[@class="detail-pane search-help"]/table/tr/td/p/font/text()|\
                           //ul[@class="lsj_spe_list"]/li/div/text()|\
                            //*[@id="UCAP-CONTENT"]/p/text()|\
                            //*[@id="UCAP-CONTENT"]/p/span/span/text()|\
                            //div[@class="pages_content"]/p/text()|\
                            //div[@class="pages_content"]/p/a/text()|\
                            //div[@class="pages_content"]/div/p/text()|\
                            //*[@id="UCAP-CONTENT"]/p/span/text()|\
                           //ul[@class="lsj_spe_list"]/li/div/a/text()').extract()
        body = "".join(text_nodes)

        if body == "":
            # Nothing matched: store a placeholder text instead.
            item["content"] = "可能是图片 请打开详情页查看"
        else:
            body = body.replace("\u3000", "").replace("\xa0", "")
            item["content"] = body.replace("\u2002", "")
        item["keyword"] = keyword.get_keyword(item["content"])

        return item
Exemple #20
0
 def process_detail(self, response):
     """Yield the item for one post page (web_id 39).

     The content is the string value of the ``post-content`` div followed
     by that of the ``post-comment`` div, each under its own heading and
     with ``\\xa0`` and ``\\r`` removed.
     """

     def block_text(xpath_expr):
         # String value of the matched block, stripped of \xa0 and \r.
         raw = ''.join(response.xpath(xpath_expr).extract())
         return raw.replace('\xa0', '').replace('\r', '')

     item = FagaiweiItem()
     item['web_id'] = 39
     item['url'] = response.url
     title_nodes = response.css('.post-title strong::text')
     item['title'] = title_nodes.extract_first(default='')
     item['web'] = response.meta.get('web')
     item['webname'] = "龙腾网"
     post_params = response.xpath(
         'string(//div[@class="post-param"])').extract_first()
     date_match = re.search(r'(\d{4}-\d{2}-\d{2})', post_params)
     item['pub_time'] = date_match.group(1)
     content = block_text('string(//div[@class="post-content"])')
     comment = block_text('string(//div[@class="post-comment"])')
     item['content'] = "正文翻译:\n" + content + "评论翻译:\n" + comment
     item["keyword"] = keyword.get_keyword(item["content"])
     yield item
Exemple #21
0
 def process_detail(self, response):
     """Yield the item for one article page (web_id 96).

     The raw ``time1`` span text is passed to ``self.process_time`` to
     produce the publication time; the source name is the fixed string
     '观察者网'.
     """
     item = FagaiweiItem()
     item['web_id'] = 96
     item['url'] = response.url
     title_nodes = response.xpath('//div[@class="article-content"]/h1/text()')
     item['title'] = title_nodes.extract_first(default='')
     item['web'] = response.meta.get('web')
     raw_time = response.xpath(
         '//div[@class="article-content"]/div[contains(@class,"user-info-box")]//span[@class="time1"]/text()'
     ).extract_first()
     item['webname'] = '观察者网'
     item['pub_time'] = self.process_time(raw_time)
     paragraphs = response.xpath('//div[contains(@class,"article-txt")]/div/p/text() | \
                            //div[contains(@class,"article-txt")]/div/p/strong/text() | \
                        //div[contains(@class,"article-txt")]/p/text() | \
                            //div[contains(@class,"article-txt")]/p/text() | \
                            //div[contains(@class,"article-txt")]/p/a/text()').extract()
     content = '\n'.join(paragraphs)
     content = content.replace('\r\n', '').replace('\xa0', '')
     # An empty body gets a placeholder text.
     item['content'] = content if content else '这可能是图片或者文件,打开查看!'
     item["keyword"] = keyword.get_keyword(item["content"])
     yield item
Exemple #22
0
    def get_detail(self, response):
        """Build the item for one article detail page (web_id 57).

        The URL comes from the response; publication time, title and source
        site are read from ``response.meta`` (keys ``date``, ``title`` and
        ``laiyuan``). The body text is the concatenation of every text node
        matched by one large XPath union that enumerates concrete element
        nestings under the ``TRS_Editor`` and ``cen_main`` containers,
        ``font[@face="Calibri"]`` elements and the ``ozoom`` / ``zoom`` ids.

        Args:
            response: detail-page response; must carry ``date``, ``title``
                and ``laiyuan`` in ``response.meta`` (a missing key raises
                ``KeyError``).

        Returns:
            The populated ``FagaiweiItem``.
        """
        item = FagaiweiItem()
        item["url"] = response.url
        item["pub_time"] = response.meta["date"]
        item["title"] = response.meta["title"]
        # One XPath union; each alternative is a fixed tag path down to a
        # text() node. The literal is continued with backslashes, so the
        # leading whitespace of every line is part of the expression string.
        contents = "".join(
            response.xpath('\
                                    //div[@class="TRS_Editor"]/font/font/span/p/text()|\
                                    //div[@class="TRS_Editor"]/font/font/span/p/span/text()|\
                                    //div[@class="TRS_Editor"]/font/font/span/p/span/span/text()|\
                                    //div[@class="TRS_Editor"]/span/span/p/font/text()|\
                                    //div[@class="TRS_Editor"]/span/span/p/font/span/text()|\
                                    //div[@class="TRS_Editor"]/span/span/p/font/span/span/text()|\
                                    //div[@class="TRS_Editor"]/div/font/font/span/span/p/span/text()|\
                                    //div[@class="TRS_Editor"]/div/span/span/p/font/span/text()|\
                                    //div[@class="TRS_Editor"]/div/span/span/p/span/font/text()|\
                                    //div[@class="TRS_Editor"]/div/span/p/font/font/font/span/text()|\
                                    //div[@class="TRS_Editor"]/div/span/p/font/font/font/span/span/text()|\
                                    //div[@class="TRS_Editor"]/div/span/span/span/p/span/font/strong/text()|\
                                    //div[@class="TRS_Editor"]/div/span/span/span/p/strong/font/font/font/font/font/font/span/text()|\
                                    //div[@class="TRS_Editor"]/div/span/span/p/font/font/font/span/text()|\
                                    //div[@class="TRS_Editor"]/span/text()|\
                                    //div[@class="TRS_Editor"]/span/font/text()|\
                                    //div[@class="TRS_Editor"]/span/span/text()|\
                                    //div[@class="TRS_Editor"]/span/font/p/span/font/text()|\
                                    //div[@class="TRS_Editor"]/span/font/p/span/font/span/text()|\
                                    //div[@class="TRS_Editor"]/span/h2/span/text()|\
                                    //div[@class="TRS_Editor"]/span/p/text()|\
                                    //div[@class="TRS_Editor"]/span/p/span/text()|\
                                    //div[@class="TRS_Editor"]/span/strong/span/span/p/strong/text()|\
                                    //div[@class="TRS_Editor"]/span/p/span/span/text()|\
                                    //div[@class="TRS_Editor"]/span/p/span/font/text()|\
                                    //div[@class="TRS_Editor"]/div/span/p/span/text()|\
                                    //div[@class="TRS_Editor"]/div/span/p/span/span/text()|\
                                    //div[@class="TRS_Editor"]/div/span/p/span/font/text()|\
                                    //div[@class="TRS_Editor"]/div/span/p/font/font/font/span/text()|\
                                    //div[@class="TRS_Editor"]/b/span/span/p/span/text()|\
                                    //div[@class="TRS_Editor"]/b/span/span/p/b/span/text()|\
                                    //div[@class="TRS_Editor"]/strong/font/p/text()|\
                                    //div[@class="TRS_Editor"]/strong/font/p/a/text()|\
                                    //div[@class="TRS_Editor"]/strong/font/p/a/font/text()|\
                                    //div[@class="TRS_Editor"]/p/text()|\
                                    //div[@class="TRS_Editor"]/p/a/text()|\
                                    //div[@class="TRS_Editor"]/p/b/text()|\
                                    //div[@class="TRS_Editor"]/p/b/span/text()|\
                                    //div[@class="TRS_Editor"]/p/a/span/text()|\
                                    //div[@class="TRS_Editor"]/p/u/span/a/text()|\
                                    //div[@class="TRS_Editor"]/p/span/a/span/text()|\
                                    //div[@class="TRS_Editor"]/p/span/text()|\
                                    //div[@class="TRS_Editor"]/p/span/span/text()|\
                                    //div[@class="TRS_Editor"]/p/strong/text()|\
                                    //div[@class="TRS_Editor"]/p/strong/font/text()|\
                                    //div[@class="TRS_Editor"]/p/a/font/text()|\
                                    //div[@class="TRS_Editor"]/p/font/text()|\
                                    //div[@class="TRS_Editor"]/p/font/a/text()|\
                                    //div[@class="TRS_Editor"]/p/font/span/text()|\
                                    //div[@class="TRS_Editor"]/p/font/span/font/text()|\
                                    //div[@class="TRS_Editor"]/p/font/strong/text()|\
                                    //div[@class="TRS_Editor"]/p/font/font/text()|\
                                    //div[@class="TRS_Editor"]/p/font/font/font/text()|\
                                    //div[@class="TRS_Editor"]/p/font/font/font/span/text()|\
                                    //div[@class="TRS_Editor"]/p/font/font/font/font/font/font/text()|\
                                    //div[@class="TRS_Editor"]/p/font/font/font/font/font/font/font/text()|\
                                    //div[@class="TRS_Editor"]/p/font/font/font/font/font/font/font/span/text()|\
                                    //div[@class="TRS_Editor"]/p/font/font/font/a/text()|\
                                    //div[@class="TRS_Editor"]/p/font/font/a/text()|\
                                    //div[@class="TRS_Editor"]/p/font/font/a/font/text()|\
                                    //div[@class="TRS_Editor"]/p/font/font/font/a/font/text()|\
                                    //div[@class="TRS_Editor"]/p/font/font/font/font/font/text()|\
                                    //div[@class="TRS_Editor"]/p/font/font/span/text()|\
                                    //div[@class="TRS_Editor"]/p/font/font/strong/text()|\
                                    //div[@class="TRS_Editor"]/p/span/font/text()|\
                                    //div[@class="TRS_Editor"]/text()|\
                                    //div[@class="TRS_Editor"]/a/text()|\
                                    //div[@class="TRS_Editor"]/font/text()|\
                                    //div[@class="TRS_Editor"]/div/text()|\
                                    //div[@class="TRS_Editor"]/div/a/text()|\
                                    //div[@class="TRS_Editor"]/div/b/text()|\
                                    //div[@class="TRS_Editor"]/div/span/text()|\
                                    //div[@class="TRS_Editor"]/div/span/a/span/text()|\
                                    //div[@class="TRS_Editor"]/div/span/p/span/span/text()|\
                                    //div[@class="TRS_Editor"]/div/span/p/span/span/a/text()|\
                                    //div[@class="TRS_Editor"]/div/span/span/text()|\
                                    //div[@class="TRS_Editor"]/div/span/span/p/span/text()|\
                                    //div[@class="TRS_Editor"]/div/span/span/p/font/span/text()|\
                                    //div[@class="TRS_Editor"]/div/span/span/p/font/span/span/text()|\
                                    //div[@class="TRS_Editor"]/div/span/span/p/span/span/text()|\
                                    //div[@class="TRS_Editor"]/div/span/span/span/span/p/span/text()|\
                                    //div[@class="TRS_Editor"]/div/font/text()|\
                                    //div[@class="TRS_Editor"]/div/font/span/text()|\
                                    //div[@class="TRS_Editor"]/div/font/span/sup/text()|\
                                    //div[@class="TRS_Editor"]/div/font/sup/span/text()|\
                                    //div[@class="TRS_Editor"]/div/font/p/span/text()|\
                                    //div[@class="TRS_Editor"]/div/font/p/span/span/text()|\
                                    //div[@class="TRS_Editor"]/div/font/span/p/span/text()|\
                                    //div[@class="TRS_Editor"]/div/font/span/p/span/span/text()|\
                                    //div[@class="TRS_Editor"]/div/p/b/span/text()|\
                                    //div[@class="TRS_Editor"]/div/p/b/span/span/text()|\
                                    //div[@class="TRS_Editor"]/div/p/text()|\
                                    //div[@class="TRS_Editor"]/div/p/font/text()|\
                                    //div[@class="TRS_Editor"]/div/p/font/span/text()|\
                                    //div[@class="TRS_Editor"]/div/p/font/span/span/text()|\
                                    //div[@class="TRS_Editor"]/div/p/font/font/span/text()|\
                                    //div[@class="TRS_Editor"]/div/p/font/font/font/span/text()|\
                                    //div[@class="TRS_Editor"]/div/p/font/font/font/span/font/text()|\
                                    //div[@class="TRS_Editor"]/div/p/font/font/font/font/span/text()|\
                                    //div[@class="TRS_Editor"]/div/p/font/font/font/font/span/font/text()|\
                                    //div[@class="TRS_Editor"]/div/p/font/font/span/span/text()|\
                                    //div[@class="TRS_Editor"]/div/p/font/font/span/span/font/text()|\
                                    //div[@class="TRS_Editor"]/div/p/font/font/span/span/font/span/text()|\
                                    //div[@class="TRS_Editor"]/div/p/span/text()|\
                                    //div[@class="TRS_Editor"]/div/p/span/a/strong/text()|\
                                    //div[@class="TRS_Editor"]/div/p/a/span/text()|\
                                    //div[@class="TRS_Editor"]/div/p/a/strong/span/font/text()|\
                                    //div[@class="TRS_Editor"]/div/p/span/a/text()|\
                                    //div[@class="TRS_Editor"]/div/p/span/font/text()|\
                                    //div[@class="TRS_Editor"]/div/p/span/font/span/text()|\
                                    //div[@class="TRS_Editor"]/div/p/span/span/text()|\
                                    //div[@class="TRS_Editor"]/div/p/span/span/span/text()|\
                                    //div[@class="TRS_Editor"]/div/table/tbody/tr/td/p/span/text()|\
                                    //div[@class="TRS_Editor"]/div/table/tbody/tr/td/p/span/a/span/text()|\
                                    //div[@class="TRS_Editor"]/div/text()|\
                                    //div[@class="TRS_Editor"]/div/div/text()|\
                                    //div[@class="TRS_Editor"]/div/div/p/text()|\
                                    //div[@class="TRS_Editor"]/div/div/p/span/text()|\
                                    //div[@class="TRS_Editor"]/div/div/p/span/span/text()|\
                                    //div[@class="TRS_Editor"]/div/div/p/span/font/text()|\
                                    //div[@class="TRS_Editor"]/div/div/p/font/text()|\
                                    //div[@class="TRS_Editor"]/div/div/p/font/font/text()|\
                                    //div[@class="TRS_Editor"]/div/div/p/font/span/text()|\
                                    //div[@class="TRS_Editor"]/div/div/p/font/font/span/text()|\
                                    //div[@class="TRS_Editor"]/div/div/span/text()|\
                                    //div[@class="TRS_Editor"]/div/div/span/span/text()|\
                                    //div[@class="TRS_Editor"]/div/div/span/p/span/text()|\
                                    //div[@class="TRS_Editor"]/div/div/div/p/span/text()|\
                                    //div[@class="TRS_Editor"]/div/div/div/p/font/font/span/text()|\
                                    //div[@class="TRS_Editor"]/div/div/div/div/p/font/font/font/font/text()|\
                                    //div[@class="TRS_Editor"]/div/div/div/div/p/font/font/font/font/span/font/text()|\
                                    //div[@class="TRS_Editor"]/div/div/div/div/p/font/font/font/font/font/font/font/span/text()|\
                                    //div[@class="TRS_Editor"]/div/div/div/div/p/font/font/font/font/font/font/font/font/span/text()|\
                                    //div[@class="TRS_Editor"]/div/div/div/span/p/text()|\
                                    //div[@class="TRS_Editor"]/div/div/div/span/p/span/text()|\
                                    //div[@class="TRS_Editor"]/div/div/div/span/p/font/font/span/text()|\
                                    //div[@class="TRS_Editor"]/div/div/div/div/span/p/text()|\
                                    //div[@class="cen_main"]/div/h1/text()|\
                                    //div[@class="cen_main"]/div/div/p/text()|\
                                    //div[@class="cen_main"]/div/div/p/span/text()|\
                                    //div[@class="cen_main"]/div/div/p/span/span/text()|\
                                    //div[@class="cen_main"]/div/div/div/p/span/text()|\
                                    //div[@class="cen_main"]/div/div/div/p/span/span/text()|\
                                    //font[@face="Calibri"]/text()|\
                                    //font[@face="Calibri"]/span/text()|\
                                    //font[@face="Calibri"]/span/span/text()|\
                                    //*[@id="ozoom"]/p/text()|\
                                    //*[@id="zoom"]/div/p/text()|\
                                    //*[@id="zoom"]/div/p/span/text()|\
                                    //*[@id="zoom"]/strong/span/p/strong/text()|\
                                    //*[@id="zoom"]/p/text()|\
                                    //*[@id="zoom"]/p/a/text()|\
                                    //*[@id="zoom"]/p/strong/text()|\
                                    //*[@id="zoom"]/p/span/text()|\
                                    //*[@id="zoom"]/span/p/text()|\
                                    //*[@id="zoom"]/span/p/a/text()|\
                                    //*[@id="zoom"]/span/p/a/font/text()|\
                                    //*[@id="zoom"]/span/span/span/span/span/strong/span/span/strong/span/p/span/strong/span/span/strong/text()|\
                                    //*[@id="zoom"]/span/span/span/span/span/strong/span/span/p/span/text()|\
                                    //*[@id="zoom"]/span/span/span/span/span/strong/span/span/p/text()|\
                                    //*[@id="zoom"]/span/strong/span/span/p/strong/text()'
                           ).extract())

        # Nothing matched: store a placeholder telling the reader to open
        # the original page (the body may be an image or a table).
        if contents == "":
            contents = "可能是图片或表格 打开原网站查看"
        # Strip ideographic spaces, non-breaking spaces and tabs.
        item["content"] = contents.replace("\u3000", "").replace("\xa0", "") \
            .replace("\t\n", "").replace("\t", "")  # .replace("  ", "")
        # Prefer the keywords printed on the page (dl.xl_guanjc); only when
        # that list is empty derive them from the content.
        item["keyword"] = "".join(
            response.xpath('//dl[@class="xl_guanjc"]/dd/text()').extract())
        if item["keyword"] == "":
            item["keyword"] = keyword.get_keyword(item["content"])
        # form_s = "".join(response.xpath('//div[@class="xilan_nengr"]/h2/text()').extract())
        # print(form_s)
        # The source name is taken from an inline script assignment
        # `var docsource="...";`; all matches are concatenated.
        webname = "".join(
            re.findall(re.compile(r'var docsource="(.*?)";'), response.text))
        if webname == "":
            # No docsource variable on the page: use the fixed default name.
            webname = "中国期货业协会"
        else:
            webname = webname
        item["webname"] = webname
        item["web"] = response.meta["laiyuan"]
        item["web_id"] = 57
        return item
Exemple #23
0
    def get_detail(self, response):
        """Build the item for one article detail page (web_id 17).

        The URL comes from the response; publication time, title and source
        site are read from ``response.meta`` (keys ``date``, ``title`` and
        ``laiyuan``). The source name is scraped from the page with a fixed
        default when absent. The body text is the concatenation of every
        text node matched by one large XPath union that enumerates concrete
        element nestings under the ``TRS_Editor`` container,
        ``font[@face="Calibri"]`` elements and the ``p-detail`` / ``ozoom`` /
        ``zoom`` ids.

        Args:
            response: detail-page response; must carry ``date``, ``title``
                and ``laiyuan`` in ``response.meta`` (a missing key raises
                ``KeyError``).

        Returns:
            The populated ``FagaiweiItem``.
        """
        item = FagaiweiItem()
        item["url"] = response.url
        item["pub_time"] = response.meta["date"]
        item["title"] = response.meta["title"]
        # Source label: text of #source/span or span.aticle-src.
        form_s = "".join(
            response.xpath(
                '//*[@id="source"]/span/text()|//span[@class="aticle-src"]/text()'
            ).extract())
        # Keep only the part before the first space and drop every "一"
        # character from it.
        form_s = form_s.split(" ")[0].replace("一", "")
        if form_s != "":
            item["webname"] = form_s.replace("\r",
                                             "").replace("\n",
                                                         "").replace(" ", "")
        else:
            # No source label found on the page: use the fixed default name.
            item["webname"] = "中国科学院"
        item["web"] = response.meta["laiyuan"]
        # item["keyword"] = ""
        item["web_id"] = 17
        # One XPath union; each alternative is a fixed tag path down to a
        # text() node. The literal is continued with backslashes, so the
        # leading whitespace of every line is part of the expression string.
        contents = "".join(
            response.xpath('\
                                    //div[@class="TRS_Editor"]/font/font/span/p/text()|\
                                    //div[@class="TRS_Editor"]/font/font/span/p/span/text()|\
                                    //div[@class="TRS_Editor"]/font/font/span/p/span/span/text()|\
                                    //div[@class="TRS_Editor"]/span/span/p/font/text()|\
                                    //div[@class="TRS_Editor"]/span/span/p/font/span/text()|\
                                    //div[@class="TRS_Editor"]/span/span/p/font/span/span/text()|\
                                    //div[@class="TRS_Editor"]/div/font/font/span/span/p/span/text()|\
                                    //div[@class="TRS_Editor"]/div/span/span/p/font/span/text()|\
                                    //div[@class="TRS_Editor"]/div/span/span/p/span/font/text()|\
                                    //div[@class="TRS_Editor"]/div/span/p/font/font/font/span/text()|\
                                    //div[@class="TRS_Editor"]/div/span/p/font/font/font/span/span/text()|\
                                    //div[@class="TRS_Editor"]/div/span/span/span/p/span/font/strong/text()|\
                                    //div[@class="TRS_Editor"]/div/span/span/span/p/strong/font/font/font/font/font/font/span/text()|\
                                    //div[@class="TRS_Editor"]/div/span/span/p/font/font/font/span/text()|\
                                    //div[@class="TRS_Editor"]/span/text()|\
                                    //div[@class="TRS_Editor"]/span/span/text()|\
                                    //div[@class="TRS_Editor"]/span/font/p/span/font/text()|\
                                    //div[@class="TRS_Editor"]/span/font/p/span/font/span/text()|\
                                    //div[@class="TRS_Editor"]/span/h2/span/text()|\
                                    //div[@class="TRS_Editor"]/span/p/text()|\
                                    //div[@class="TRS_Editor"]/span/p/span/text()|\
                                    //div[@class="TRS_Editor"]/span/strong/span/span/p/strong/text()|\
                                    //div[@class="TRS_Editor"]/span/p/span/span/text()|\
                                    //div[@class="TRS_Editor"]/span/p/span/font/text()|\
                                    //div[@class="TRS_Editor"]/div/span/p/span/text()|\
                                    //div[@class="TRS_Editor"]/div/span/p/span/span/text()|\
                                    //div[@class="TRS_Editor"]/div/span/p/span/font/text()|\
                                    //div[@class="TRS_Editor"]/div/span/p/font/font/font/span/text()|\
                                    //div[@class="TRS_Editor"]/b/span/span/p/span/text()|\
                                    //div[@class="TRS_Editor"]/b/span/span/p/b/span/text()|\
                                    //div[@class="TRS_Editor"]/strong/font/p/text()|\
                                    //div[@class="TRS_Editor"]/strong/font/p/a/text()|\
                                    //div[@class="TRS_Editor"]/strong/font/p/a/font/text()|\
                                    //div[@class="TRS_Editor"]/p/text()|\
                                    //div[@class="TRS_Editor"]/p/a/text()|\
                                    //div[@class="TRS_Editor"]/p/b/text()|\
                                    //div[@class="TRS_Editor"]/p/b/span/text()|\
                                    //div[@class="TRS_Editor"]/p/a/span/text()|\
                                    //div[@class="TRS_Editor"]/p/span/a/span/text()|\
                                    //div[@class="TRS_Editor"]/p/span/text()|\
                                    //div[@class="TRS_Editor"]/p/span/span/text()|\
                                    //div[@class="TRS_Editor"]/p/strong/text()|\
                                    //div[@class="TRS_Editor"]/p/strong/font/text()|\
                                    //div[@class="TRS_Editor"]/p/a/font/text()|\
                                    //div[@class="TRS_Editor"]/p/font/text()|\
                                    //div[@class="TRS_Editor"]/p/font/a/text()|\
                                    //div[@class="TRS_Editor"]/p/font/span/text()|\
                                    //div[@class="TRS_Editor"]/p/font/span/font/text()|\
                                    //div[@class="TRS_Editor"]/p/font/strong/text()|\
                                    //div[@class="TRS_Editor"]/p/font/font/text()|\
                                    //div[@class="TRS_Editor"]/p/font/font/font/text()|\
                                    //div[@class="TRS_Editor"]/p/font/font/font/font/font/font/text()|\
                                    //div[@class="TRS_Editor"]/p/font/font/font/font/font/font/font/text()|\
                                    //div[@class="TRS_Editor"]/p/font/font/font/font/font/font/font/span/text()|\
                                    //div[@class="TRS_Editor"]/p/font/font/font/a/text()|\
                                    //div[@class="TRS_Editor"]/p/font/font/a/text()|\
                                    //div[@class="TRS_Editor"]/p/font/font/a/font/text()|\
                                    //div[@class="TRS_Editor"]/p/font/font/font/a/font/text()|\
                                    //div[@class="TRS_Editor"]/p/font/font/font/font/font/text()|\
                                    //div[@class="TRS_Editor"]/p/font/font/span/text()|\
                                    //div[@class="TRS_Editor"]/p/font/font/strong/text()|\
                                    //div[@class="TRS_Editor"]/p/span/font/text()|\
                                    //div[@class="TRS_Editor"]/text()|\
                                    //div[@class="TRS_Editor"]/a/text()|\
                                    //div[@class="TRS_Editor"]/font/text()|\
                                    //div[@class="TRS_Editor"]/div/a/text()|\
                                    //div[@class="TRS_Editor"]/div/span/text()|\
                                    //div[@class="TRS_Editor"]/div/span/span/text()|\
                                    //div[@class="TRS_Editor"]/div/span/span/p/span/text()|\
                                    //div[@class="TRS_Editor"]/div/span/span/p/font/span/text()|\
                                    //div[@class="TRS_Editor"]/div/span/span/p/font/span/span/text()|\
                                    //div[@class="TRS_Editor"]/div/span/span/p/span/span/text()|\
                                    //div[@class="TRS_Editor"]/div/span/span/span/span/p/span/text()|\
                                    //div[@class="TRS_Editor"]/div/font/text()|\
                                    //div[@class="TRS_Editor"]/div/font/span/text()|\
                                    //div[@class="TRS_Editor"]/div/font/span/sup/text()|\
                                    //div[@class="TRS_Editor"]/div/font/sup/span/text()|\
                                    //div[@class="TRS_Editor"]/div/font/p/span/text()|\
                                    //div[@class="TRS_Editor"]/div/font/p/span/span/text()|\
                                    //div[@class="TRS_Editor"]/div/font/span/p/span/text()|\
                                    //div[@class="TRS_Editor"]/div/font/span/p/span/span/text()|\
                                    //div[@class="TRS_Editor"]/div/p/b/span/text()|\
                                    //div[@class="TRS_Editor"]/div/p/b/span/span/text()|\
                                    //div[@class="TRS_Editor"]/div/p/text()|\
                                    //div[@class="TRS_Editor"]/div/p/font/span/text()|\
                                    //div[@class="TRS_Editor"]/div/p/font/span/span/text()|\
                                    //div[@class="TRS_Editor"]/div/p/font/font/span/text()|\
                                    //div[@class="TRS_Editor"]/div/p/font/font/font/span/text()|\
                                    //div[@class="TRS_Editor"]/div/p/font/font/span/span/text()|\
                                    //div[@class="TRS_Editor"]/div/p/font/font/span/span/font/text()|\
                                    //div[@class="TRS_Editor"]/div/p/font/font/span/span/font/span/text()|\
                                    //div[@class="TRS_Editor"]/div/p/span/text()|\
                                    //div[@class="TRS_Editor"]/div/p/a/span/text()|\
                                    //div[@class="TRS_Editor"]/div/p/span/a/text()|\
                                    //div[@class="TRS_Editor"]/div/p/span/font/text()|\
                                    //div[@class="TRS_Editor"]/div/p/span/font/span/text()|\
                                    //div[@class="TRS_Editor"]/div/p/span/span/text()|\
                                    //div[@class="TRS_Editor"]/div/p/span/span/span/text()|\
                                    //div[@class="TRS_Editor"]/div/table/tbody/tr/td/p/span/text()|\
                                    //div[@class="TRS_Editor"]/div/table/tbody/tr/td/p/span/a/span/text()|\
                                    //div[@class="TRS_Editor"]/div/text()|\
                                    //div[@class="TRS_Editor"]/div/div/text()|\
                                    //div[@class="TRS_Editor"]/div/div/p/text()|\
                                    //div[@class="TRS_Editor"]/div/div/p/span/text()|\
                                    //div[@class="TRS_Editor"]/div/div/p/span/span/text()|\
                                    //div[@class="TRS_Editor"]/div/div/p/span/font/text()|\
                                    //div[@class="TRS_Editor"]/div/div/p/font/span/text()|\
                                    //div[@class="TRS_Editor"]/div/div/p/font/font/span/text()|\
                                    //div[@class="TRS_Editor"]/div/div/span/text()|\
                                    //div[@class="TRS_Editor"]/div/div/span/span/text()|\
                                    //div[@class="TRS_Editor"]/div/div/span/p/span/text()|\
                                    //div[@class="TRS_Editor"]/div/div/div/p/text()|\
                                    //div[@class="TRS_Editor"]/div/div/div/p/span/text()|\
                                    //div[@class="TRS_Editor"]/div/div/div/p/font/font/span/text()|\
                                    //div[@class="TRS_Editor"]/div/div/div/span/p/text()|\
                                    //div[@class="TRS_Editor"]/div/div/div/span/p/span/text()|\
                                    //div[@class="TRS_Editor"]/div/div/div/span/p/font/font/span/text()|\
                                    //div[@class="TRS_Editor"]/div/div/div/div/p/text()|\
                                    //div[@class="TRS_Editor"]/div/div/div/div/span/p/text()|\
                                    //div[@class="TRS_Editor"]/div/div/div/div/div/text()|\
                                    //div[@class="TRS_Editor"]/div/div/div/div/div/div/text()|\
                                    //div[@class="TRS_Editor"]/div/div/div/div/div/div/p/text()|\
                                    //font[@face="Calibri"]/text()|\
                                    //font[@face="Calibri"]/span/text()|\
                                    //font[@face="Calibri"]/span/span/text()|\
                                    //*[@id="p-detail"]/p/text()|\
                                    //*[@id="p-detail"]/p/font/text()|\
                                    //*[@id="p-detail"]/p/font/strong/text()|\
                                    //*[@id="p-detail"]/p/font/span/text()|\
                                    //*[@id="p-detail"]/div/p/text()|\
                                    //*[@id="ozoom"]/p/text()|\
                                    //*[@id="zoom"]/div/p/text()|\
                                    //*[@id="zoom"]/div/p/span/text()|\
                                    //*[@id="zoom"]/strong/span/p/strong/text()|\
                                    //*[@id="zoom"]/p/text()|\
                                    //*[@id="zoom"]/p/a/text()|\
                                    //*[@id="zoom"]/p/strong/text()|\
                                    //*[@id="zoom"]/p/span/text()|\
                                    //*[@id="zoom"]/span/p/text()|\
                                    //*[@id="zoom"]/span/p/a/text()|\
                                    //*[@id="zoom"]/span/p/a/font/text()|\
                                    //*[@id="zoom"]/span/span/span/span/span/strong/span/span/strong/span/p/span/strong/span/span/strong/text()|\
                                    //*[@id="zoom"]/span/span/span/span/span/strong/span/span/p/span/text()|\
                                    //*[@id="zoom"]/span/span/span/span/span/strong/span/span/p/text()|\
                                    //*[@id="zoom"]/span/strong/span/span/p/strong/text()'
                           ).extract())

        # Nothing matched: store a placeholder telling the reader to open
        # the original page (the body may be an image or a table).
        if contents == "":
            contents = "可能是图片或表格 打开原网站查看"
        # Strip ideographic spaces, non-breaking spaces and zero-width spaces.
        item["content"] = contents.replace("\u3000",
                                           "").replace("\xa0", "").replace(
                                               "\u200b", "")
        item["keyword"] = keyword.get_keyword(item["content"])

        return item
Exemple #24
0
 def get_detail(self, response):
     """Build the item for one article detail page (web_id 46).

     The body text is scraped from the rendered page first. When none of the
     XPath alternatives match, the article HTML is fetched from the site's
     JSON content API (keyed by the last URL path segment) and its
     ``p`` / ``span`` / ``strong`` text is used instead. If that also yields
     nothing, or the API call fails, a placeholder is stored.

     Args:
         response: detail-page response; must carry ``title`` and
             ``laiyuan`` in ``response.meta`` (a missing key raises
             ``KeyError``).

     Returns:
         The populated ``FagaiweiItem``, or ``None`` when the response URL
         equals the source-site URL in ``response.meta["laiyuan"]``.
     """
     title = response.meta["title"]
     source_site = response.meta["laiyuan"]
     # Such responses were always discarded at the end of this method;
     # bail out before doing any extraction or the blocking API request.
     if response.url == source_site:
         return None

     item = FagaiweiItem()
     item["url"] = response.url
     date = "".join(
         response.xpath(
             '//div[@class="article__heading"]/div/div/span/text()').
         extract())
     pub_time = date.split("\n")[0]
     if pub_time:
         item["pub_time"] = pub_time
     else:
         # No timestamp on the page: stamp the item with the crawl time.
         item["pub_time"] = time.strftime('%Y-%m-%d %H:%M:%S',
                                          time.localtime(time.time()))
     item["title"] = title
     item["webname"] = "华尔街见闻"
     item["web"] = source_site
     item["web_id"] = 46
     contents = "".join(
         response.xpath('//div[@class="article__content"]/div/div/text()|\
                                        //div[@class="article__content"]/div/div/p/text()|\
                                        //div[@class="article__content"]/div/div/p/strong/text()|\
                                        //div[@class="article__content"]/div/div/h2/text()|\
                                        //div[@class="article__content"]/div/div/h2/strong/text()|\
                                        //div[@class="article__content"]/div/p/text()|\
                                        //div[@class="article__content"]/div/p/strong/text()|\
                                        //div[@class="article__content"]/div/h2/text()|\
                                        //div[@class="pa-main__content preview"]/p/text()|\
                                        //div[@class="pa-main__content preview"]/p/strong/text()|\
                                        //div[@class="pa-main__content"]/p/text()|\
                                        //div[@class="pa-main__content"]/p/span/text()|\
                                        //div[@class="pa-main__content"]/p/span/span/text()|\
                                        //div[@class="pa-main__content"]/p/span/strong/text()|\
                                        //div[@class="pa-main__content"]/p/span/strong/span/text()|\
                                        //div[@class="article__content"]/div/div/h2/p/text()'
                        ).extract())
     if contents != "":
         item["content"] = contents.replace("\u3000",
                                            "").replace("\xa0", "")
     else:
         article_id = response.url.split('/')[-1]
         url = "https://api-prod.wallstreetcn.com/apiv1/content/articles/{}?extract=0".format(
             article_id)
         try:
             # requests has no default timeout; without one a stalled
             # connection would block this callback indefinitely.
             res = requests.get(url,
                                headers=DEFAULT_REQUEST_HEADERS,
                                timeout=10)
             ress = (res.json().get("data") or {}).get("content")
         except (requests.RequestException, ValueError, AttributeError):
             # Network failure, non-JSON body, or an unexpected JSON shape:
             # treat it as "no content" rather than losing the item.
             ress = None
         content_sss = ""
         if ress:
             tree = etree.HTML(ress)
             # Guard against a parser that returns None for markup it
             # cannot build a tree from.
             if tree is not None:
                 content_sss = "".join(
                     tree.xpath('//p/text()|//span/text()|//strong/text()'))
         if content_sss:
             item["content"] = content_sss
         else:
             item["content"] = "可能是图片 请打开详情页查看"
     item["keyword"] = keyword.get_keyword(item["content"])
     return item
Exemple #25
0
 def get_detail(self, response):
     """Assemble a ``FagaiweiItem`` from an article detail page.

     Title, publication time and source URL come from ``response.meta``
     (keys ``title``, ``date`` and ``laiyuan``).  The body is the
     concatenation of every text node matched by the XPath union below; a
     placeholder sentence is used when nothing matches.  The publisher name
     is the text found between "来源:" and "分享" in the ``td.Gray12``
     cells, with a fixed default when that text is absent.

     Args:
         response: Detail page response carrying the meta keys above.

     Returns:
         The populated ``FagaiweiItem`` (``web_id`` 8).
     """
     item = FagaiweiItem()
     item["url"] = response.url
     meta = response.meta
     item["title"] = meta["title"]
     item["pub_time"] = meta["date"]
     # Every known nesting of the article body, merged into one union.
     text_nodes = response.xpath('\
                                 //div[@class="TRS_Editor"]/font/font/span/p/text()|\
                                 //div[@class="TRS_Editor"]/font/font/span/p/span/text()|\
                                 //div[@class="TRS_Editor"]/font/font/span/p/span/span/text()|\
                                 //div[@class="TRS_Editor"]/span/span/p/font/text()|\
                                 //div[@class="TRS_Editor"]/span/span/p/font/span/text()|\
                                 //div[@class="TRS_Editor"]/span/span/p/font/span/span/text()|\
                                 //div[@class="TRS_Editor"]/div/font/font/span/span/p/span/text()|\
                                 //div[@class="TRS_Editor"]/div/span/span/p/font/span/text()|\
                                 //div[@class="TRS_Editor"]/div/span/span/p/span/font/text()|\
                                 //div[@class="TRS_Editor"]/div/span/p/font/font/font/span/text()|\
                                 //div[@class="TRS_Editor"]/div/span/p/font/font/font/span/span/text()|\
                                 //div[@class="TRS_Editor"]/div/span/span/span/p/span/font/strong/text()|\
                                 //div[@class="TRS_Editor"]/div/span/span/span/p/strong/font/font/font/font/font/font/span/text()|\
                                 //div[@class="TRS_Editor"]/div/span/span/p/font/font/font/span/text()|\
                                 //div[@class="TRS_Editor"]/span/text()|\
                                 //div[@class="TRS_Editor"]/span/font/text()|\
                                 //div[@class="TRS_Editor"]/span/span/text()|\
                                 //div[@class="TRS_Editor"]/span/font/p/span/font/text()|\
                                 //div[@class="TRS_Editor"]/span/font/p/span/font/span/text()|\
                                 //div[@class="TRS_Editor"]/span/h2/span/text()|\
                                 //div[@class="TRS_Editor"]/span/p/text()|\
                                 //div[@class="TRS_Editor"]/span/p/span/text()|\
                                 //div[@class="TRS_Editor"]/span/strong/span/span/p/strong/text()|\
                                 //div[@class="TRS_Editor"]/span/p/span/span/text()|\
                                 //div[@class="TRS_Editor"]/span/p/span/font/text()|\
                                 //div[@class="TRS_Editor"]/div/span/p/span/text()|\
                                 //div[@class="TRS_Editor"]/div/span/p/span/span/text()|\
                                 //div[@class="TRS_Editor"]/div/span/p/span/font/text()|\
                                 //div[@class="TRS_Editor"]/div/span/p/font/font/font/span/text()|\
                                 //div[@class="TRS_Editor"]/b/span/span/p/span/text()|\
                                 //div[@class="TRS_Editor"]/b/span/span/p/b/span/text()|\
                                 //div[@class="TRS_Editor"]/strong/font/p/text()|\
                                 //div[@class="TRS_Editor"]/strong/font/p/a/text()|\
                                 //div[@class="TRS_Editor"]/strong/font/p/a/font/text()|\
                                 //div[@class="TRS_Editor"]/p/text()|\
                                 //div[@class="TRS_Editor"]/p/a/text()|\
                                 //div[@class="TRS_Editor"]/p/b/text()|\
                                 //div[@class="TRS_Editor"]/p/b/span/text()|\
                                 //div[@class="TRS_Editor"]/p/a/span/text()|\
                                 //div[@class="TRS_Editor"]/p/u/span/a/text()|\
                                 //div[@class="TRS_Editor"]/p/span/a/span/text()|\
                                 //div[@class="TRS_Editor"]/p/span/text()|\
                                 //div[@class="TRS_Editor"]/p/span/span/text()|\
                                 //div[@class="TRS_Editor"]/p/strong/text()|\
                                 //div[@class="TRS_Editor"]/p/strong/font/text()|\
                                 //div[@class="TRS_Editor"]/p/a/font/text()|\
                                 //div[@class="TRS_Editor"]/p/font/text()|\
                                 //div[@class="TRS_Editor"]/p/font/a/text()|\
                                 //div[@class="TRS_Editor"]/p/font/span/text()|\
                                 //div[@class="TRS_Editor"]/p/font/span/font/text()|\
                                 //div[@class="TRS_Editor"]/p/font/strong/text()|\
                                 //div[@class="TRS_Editor"]/p/font/font/text()|\
                                 //div[@class="TRS_Editor"]/p/font/font/font/text()|\
                                 //div[@class="TRS_Editor"]/p/font/font/font/span/text()|\
                                 //div[@class="TRS_Editor"]/p/font/font/font/font/font/font/text()|\
                                 //div[@class="TRS_Editor"]/p/font/font/font/font/font/font/font/text()|\
                                 //div[@class="TRS_Editor"]/p/font/font/font/font/font/font/font/span/text()|\
                                 //div[@class="TRS_Editor"]/p/font/font/font/a/text()|\
                                 //div[@class="TRS_Editor"]/p/font/font/a/text()|\
                                 //div[@class="TRS_Editor"]/p/font/font/a/font/text()|\
                                 //div[@class="TRS_Editor"]/p/font/font/font/a/font/text()|\
                                 //div[@class="TRS_Editor"]/p/font/font/font/font/font/text()|\
                                 //div[@class="TRS_Editor"]/p/font/font/span/text()|\
                                 //div[@class="TRS_Editor"]/p/font/font/strong/text()|\
                                 //div[@class="TRS_Editor"]/p/span/font/text()|\
                                 //div[@class="TRS_Editor"]/text()|\
                                 //div[@class="TRS_Editor"]/a/text()|\
                                 //div[@class="TRS_Editor"]/font/text()|\
                                 //div[@class="TRS_Editor"]/div/text()|\
                                 //div[@class="TRS_Editor"]/div/a/text()|\
                                 //div[@class="TRS_Editor"]/div/b/text()|\
                                 //div[@class="TRS_Editor"]/div/span/text()|\
                                 //div[@class="TRS_Editor"]/div/span/p/text()|\
                                 //div[@class="TRS_Editor"]/div/span/a/span/text()|\
                                 //div[@class="TRS_Editor"]/div/span/p/span/span/text()|\
                                 //div[@class="TRS_Editor"]/div/span/p/span/span/a/text()|\
                                 //div[@class="TRS_Editor"]/div/span/span/text()|\
                                 //div[@class="TRS_Editor"]/div/span/span/p/span/text()|\
                                 //div[@class="TRS_Editor"]/div/span/span/p/font/span/text()|\
                                 //div[@class="TRS_Editor"]/div/span/span/p/font/span/span/text()|\
                                 //div[@class="TRS_Editor"]/div/span/span/p/span/span/text()|\
                                 //div[@class="TRS_Editor"]/div/span/span/span/span/p/span/text()|\
                                 //div[@class="TRS_Editor"]/div/font/text()|\
                                 //div[@class="TRS_Editor"]/div/font/span/text()|\
                                 //div[@class="TRS_Editor"]/div/font/span/sup/text()|\
                                 //div[@class="TRS_Editor"]/div/font/sup/span/text()|\
                                 //div[@class="TRS_Editor"]/div/font/p/span/text()|\
                                 //div[@class="TRS_Editor"]/div/font/p/span/span/text()|\
                                 //div[@class="TRS_Editor"]/div/font/span/p/span/text()|\
                                 //div[@class="TRS_Editor"]/div/font/span/p/span/span/text()|\
                                 //div[@class="TRS_Editor"]/div/p/b/span/text()|\
                                 //div[@class="TRS_Editor"]/div/p/b/span/span/text()|\
                                 //div[@class="TRS_Editor"]/div/p/text()|\
                                 //div[@class="TRS_Editor"]/div/p/font/text()|\
                                 //div[@class="TRS_Editor"]/div/p/font/span/text()|\
                                 //div[@class="TRS_Editor"]/div/p/font/span/span/text()|\
                                 //div[@class="TRS_Editor"]/div/p/font/font/span/text()|\
                                 //div[@class="TRS_Editor"]/div/p/font/font/font/span/text()|\
                                 //div[@class="TRS_Editor"]/div/p/font/font/font/span/font/text()|\
                                 //div[@class="TRS_Editor"]/div/p/font/font/font/font/span/text()|\
                                 //div[@class="TRS_Editor"]/div/p/font/font/font/font/span/font/text()|\
                                 //div[@class="TRS_Editor"]/div/p/font/font/span/span/text()|\
                                 //div[@class="TRS_Editor"]/div/p/font/font/span/span/font/text()|\
                                 //div[@class="TRS_Editor"]/div/p/font/font/span/span/font/span/text()|\
                                 //div[@class="TRS_Editor"]/div/p/span/text()|\
                                 //div[@class="TRS_Editor"]/div/p/span/a/strong/text()|\
                                 //div[@class="TRS_Editor"]/div/p/a/span/text()|\
                                 //div[@class="TRS_Editor"]/div/p/a/strong/span/font/text()|\
                                 //div[@class="TRS_Editor"]/div/p/span/a/text()|\
                                 //div[@class="TRS_Editor"]/div/p/span/font/text()|\
                                 //div[@class="TRS_Editor"]/div/p/span/font/span/text()|\
                                 //div[@class="TRS_Editor"]/div/p/span/span/text()|\
                                 //div[@class="TRS_Editor"]/div/p/span/span/span/text()|\
                                 //div[@class="TRS_Editor"]/div/table/tbody/tr/td/p/span/text()|\
                                 //div[@class="TRS_Editor"]/div/table/tbody/tr/td/p/span/a/span/text()|\
                                 //div[@class="TRS_Editor"]/div/text()|\
                                 //div[@class="TRS_Editor"]/div/div/text()|\
                                 //div[@class="TRS_Editor"]/div/div/p/text()|\
                                 //div[@class="TRS_Editor"]/div/div/p/span/text()|\
                                 //div[@class="TRS_Editor"]/div/div/p/span/span/text()|\
                                 //div[@class="TRS_Editor"]/div/div/p/span/font/text()|\
                                 //div[@class="TRS_Editor"]/div/div/p/font/text()|\
                                 //div[@class="TRS_Editor"]/div/div/p/font/font/text()|\
                                 //div[@class="TRS_Editor"]/div/div/p/font/span/text()|\
                                 //div[@class="TRS_Editor"]/div/div/p/font/font/span/text()|\
                                 //div[@class="TRS_Editor"]/div/div/span/text()|\
                                 //div[@class="TRS_Editor"]/div/div/span/span/text()|\
                                 //div[@class="TRS_Editor"]/div/div/span/p/span/text()|\
                                 //div[@class="TRS_Editor"]/div/div/div/p/span/text()|\
                                 //div[@class="TRS_Editor"]/div/div/div/p/font/font/span/text()|\
                                 //div[@class="TRS_Editor"]/div/div/div/div/p/font/font/font/font/text()|\
                                 //div[@class="TRS_Editor"]/div/div/div/div/p/font/font/font/font/span/font/text()|\
                                 //div[@class="TRS_Editor"]/div/div/div/div/p/font/font/font/font/font/font/font/span/text()|\
                                 //div[@class="TRS_Editor"]/div/div/div/div/p/font/font/font/font/font/font/font/font/span/text()|\
                                 //div[@class="TRS_Editor"]/div/div/div/p/text()|\
                                 //div[@class="TRS_Editor"]/div/div/div/span/p/text()|\
                                 //div[@class="TRS_Editor"]/div/div/div/span/p/span/text()|\
                                 //div[@class="TRS_Editor"]/div/div/div/span/p/font/font/span/text()|\
                                 //div[@class="TRS_Editor"]/div/div/div/div/span/p/text()|\
                                 //div[@class="cen_main"]/div/h1/text()|\
                                 //div[@class="cen_main"]/div/div/p/text()|\
                                 //div[@class="cen_main"]/div/div/p/span/text()|\
                                 //div[@class="cen_main"]/div/div/p/span/span/text()|\
                                 //div[@class="cen_main"]/div/div/div/p/span/text()|\
                                 //div[@class="cen_main"]/div/div/div/p/span/span/text()|\
                                 //font[@face="Calibri"]/text()|\
                                 //font[@face="Calibri"]/span/text()|\
                                 //font[@face="Calibri"]/span/span/text()|\
                                 //*[@id="ozoom"]/p/text()|\
                                 //*[@id="zoom"]/div/p/text()|\
                                 //*[@id="zoom"]/div/p/span/text()|\
                                 //*[@id="zoom"]/strong/span/p/strong/text()|\
                                 //*[@id="zoom"]/p/text()|\
                                 //*[@id="zoom"]/p/a/text()|\
                                 //*[@id="zoom"]/p/strong/text()|\
                                 //*[@id="zoom"]/p/span/text()|\
                                 //*[@id="zoom"]/span/p/text()|\
                                 //*[@id="zoom"]/span/p/a/text()|\
                                 //*[@id="zoom"]/span/p/a/font/text()|\
                                 //*[@id="zoom"]/span/span/span/span/span/strong/span/span/strong/span/p/span/strong/span/span/strong/text()|\
                                 //*[@id="zoom"]/span/span/span/span/span/strong/span/span/p/span/text()|\
                                 //*[@id="zoom"]/span/span/span/span/span/strong/span/span/p/text()|\
                                 //*[@id="zoom"]/span/strong/span/span/p/strong/text()')
     body = "".join(text_nodes.extract())
     if not body:
         # Nothing matched: store a hint instead of an empty body.
         body = "可能是图片或表格 打开原网站查看"
     # Remove ideographic and non-breaking spaces.
     for filler in ("\u3000", "\xa0"):
         body = body.replace(filler, "")
     item["content"] = body

     # The publisher name sits in the grey info cells; squeeze out all
     # blanks and line breaks before matching.
     info_text = "".join(
         response.xpath('//td[@class="Gray12"]/text()').extract())
     for blank in ("\n", "\r", "\t", " "):
         info_text = info_text.replace(blank, "")
     publisher = "".join(re.findall(r'来源:(.*?)分享', info_text))
     item["webname"] = publisher or "中华人民共和国自然资源部"
     item["web"] = meta["laiyuan"]
     item["web_id"] = 8
     item["keyword"] = keyword.get_keyword(item["content"])
     return item
Exemple #26
0
 def parse(self, response):
     """Parse an nbd.com.cn listing page.

     On the live page (``http://live.nbd.com.cn/``) each of the first ten
     list entries already carries its full text, so an item is yielded
     directly.  On any other listing page each entry only links to an
     article, so a request handled by ``self.get_detail`` is yielded.
     Entries whose URL is already stored for ``web_id=26`` are skipped, as
     are entries without a link.

     Args:
         response: Listing page response.

     Yields:
         ``FagaiweiItem`` objects for the live page, ``scrapy.Request``
         objects for every other page.
     """
     # Deletes the blanks and line breaks that pad the scraped date text.
     strip_ws = str.maketrans("", "", " \n\t\r")
     stamp_format = '%Y-%m-%d %H:%M:%S'

     if response.url == "http://live.nbd.com.cn/":
         # The live page shows only a clock time; prefix today's date.
         today = time.strftime('%Y-%m-%d')
         for message in response.xpath('//ul[@class="live-list"]/li')[:10]:
             clock = "".join(message.xpath('div[1]/p/span/text()').extract())
             content = "".join(message.xpath('div[2]/a/text()').extract())
             href = "".join(message.xpath('div[2]/a/@href').extract())
             # NOTE(review): pub_time is a datetime when parsing succeeds
             # and a formatted string otherwise; kept as-is because the
             # consumer of the item is not visible here.
             try:
                 dates = datetime.datetime.strptime(today + " " + clock,
                                                    stamp_format)
             except ValueError:
                 dates = time.strftime(stamp_format)
             if session.query(NewsItemInfo).filter_by(url=href,
                                                      web_id=26).count():
                 continue  # already stored
             item = FagaiweiItem()
             item["url"] = href
             item["pub_time"] = dates
             item["title"] = content[:30]
             item["webname"] = "每经网"
             item["web"] = response.url
             item["web_id"] = 26
             item["content"] = content
             item["keyword"] = keyword.get_keyword(item["content"])
             yield item
         return

     message_list = response.xpath('//ul[@class="m-columnnews-list"]/li|\
                                         //ul[@class="mt-ul"]/li|\
                                         //ul[@class="u-news-list"]/li')
     # This XPath starts with "//", so it always searches the whole
     # document and gives the same result for every entry; evaluate it
     # once instead of once per entry.
     # NOTE(review): if a page carries several such elements their texts
     # are concatenated, parsing fails and the crawl time is used instead.
     day = "".join(
         response.xpath('//p[@class="u-channeltime"]/text()').extract()
     ).translate(strip_ws)
     for message in message_list:
         clock = "".join(
             message.xpath(
                 'div/div/p/span[2]/text()|div/p[2]/text()|span/text()'
             ).extract()).translate(strip_ws)
         title = "".join(
             message.xpath(
                 'div/div/a[1]/text()|div/a/text()|a/text()').extract())
         # Try the link locations from the most to the least nested.
         href = ""
         for link_path in ('div/div/a/@href', 'div/a/@href', 'a/@href'):
             href = "".join(message.xpath(link_path).extract())
             if href:
                 break
         if not href:
             # scrapy.Request rejects an empty URL with ValueError, which
             # would end this generator and lose all remaining entries.
             continue
         # Resolve relative links against the page URL; absolute links
         # pass through unchanged.
         url = response.urljoin(href)
         try:
             date = datetime.datetime.strptime(day + " " + clock,
                                               stamp_format)
         except ValueError:
             date = time.strftime(stamp_format)
         if session.query(NewsItemInfo).filter_by(
                 url=url.replace("#", ""), web_id=26).count():
             continue  # already stored
         yield scrapy.Request(url=url, callback=self.get_detail,
                              meta={"date": date, "title": title,
                                    "laiyuan": response.url})
Exemple #27
0
    def get_detail_cma(self, response):
        item = FagaiweiItem()
        item["url"] = response.url
        item["title"] = response.meta["title"]
        contents = "".join(
            response.xpath('\
                                           //div[@class="TRS_Editor"]/font/font/span/p/text()|\
                                           //div[@class="TRS_Editor"]/font/font/span/p/span/text()|\
                                           //div[@class="TRS_Editor"]/font/font/span/p/span/span/text()|\
                                           //div[@class="TRS_Editor"]/span/span/p/font/text()|\
                                           //div[@class="TRS_Editor"]/span/span/p/font/span/text()|\
                                           //div[@class="TRS_Editor"]/span/span/p/font/span/span/text()|\
                                           //div[@class="TRS_Editor"]/div/font/font/span/span/p/span/text()|\
                                           //div[@class="TRS_Editor"]/div/span/span/p/font/span/text()|\
                                           //div[@class="TRS_Editor"]/div/span/span/p/span/font/text()|\
                                           //div[@class="TRS_Editor"]/div/span/p/font/font/font/span/text()|\
                                           //div[@class="TRS_Editor"]/div/span/p/font/font/font/span/span/text()|\
                                           //div[@class="TRS_Editor"]/div/span/span/span/p/span/font/strong/text()|\
                                           //div[@class="TRS_Editor"]/div/span/span/span/p/strong/font/font/font/font/font/font/span/text()|\
                                           //div[@class="TRS_Editor"]/div/span/span/p/font/font/font/span/text()|\
                                           //div[@class="TRS_Editor"]/span/text()|\
                                           //div[@class="TRS_Editor"]/span/font/text()|\
                                           //div[@class="TRS_Editor"]/span/span/text()|\
                                           //div[@class="TRS_Editor"]/span/font/p/span/font/text()|\
                                           //div[@class="TRS_Editor"]/span/font/p/span/font/span/text()|\
                                           //div[@class="TRS_Editor"]/span/h2/span/text()|\
                                           //div[@class="TRS_Editor"]/span/p/text()|\
                                           //div[@class="TRS_Editor"]/span/p/span/text()|\
                                           //div[@class="TRS_Editor"]/span/strong/span/span/p/strong/text()|\
                                           //div[@class="TRS_Editor"]/span/p/span/span/text()|\
                                           //div[@class="TRS_Editor"]/span/p/span/font/text()|\
                                           //div[@class="TRS_Editor"]/div/span/p/span/text()|\
                                           //div[@class="TRS_Editor"]/div/span/p/span/span/text()|\
                                           //div[@class="TRS_Editor"]/div/span/p/span/font/text()|\
                                           //div[@class="TRS_Editor"]/div/span/p/font/font/font/span/text()|\
                                           //div[@class="TRS_Editor"]/b/span/span/p/span/text()|\
                                           //div[@class="TRS_Editor"]/b/span/span/p/b/span/text()|\
                                           //div[@class="TRS_Editor"]/strong/font/p/text()|\
                                           //div[@class="TRS_Editor"]/strong/font/p/a/text()|\
                                           //div[@class="TRS_Editor"]/strong/font/p/a/font/text()|\
                                           //div[@class="TRS_Editor"]/p/text()|\
                                           //div[@class="TRS_Editor"]/p/a/text()|\
                                           //div[@class="TRS_Editor"]/p/b/text()|\
                                           //div[@class="TRS_Editor"]/p/b/span/text()|\
                                           //div[@class="TRS_Editor"]/p/a/span/text()|\
                                           //div[@class="TRS_Editor"]/p/u/span/a/text()|\
                                           //div[@class="TRS_Editor"]/p/span/a/span/text()|\
                                           //div[@class="TRS_Editor"]/p/span/text()|\
                                           //div[@class="TRS_Editor"]/p/span/span/text()|\
                                           //div[@class="TRS_Editor"]/p/strong/text()|\
                                           //div[@class="TRS_Editor"]/p/strong/font/text()|\
                                           //div[@class="TRS_Editor"]/p/a/font/text()|\
                                           //div[@class="TRS_Editor"]/p/font/text()|\
                                           //div[@class="TRS_Editor"]/p/font/a/text()|\
                                           //div[@class="TRS_Editor"]/p/font/span/text()|\
                                           //div[@class="TRS_Editor"]/p/font/span/font/text()|\
                                           //div[@class="TRS_Editor"]/p/font/strong/text()|\
                                           //div[@class="TRS_Editor"]/p/font/font/text()|\
                                           //div[@class="TRS_Editor"]/p/font/font/font/text()|\
                                           //div[@class="TRS_Editor"]/p/font/font/font/span/text()|\
                                           //div[@class="TRS_Editor"]/p/font/font/font/font/font/font/text()|\
                                           //div[@class="TRS_Editor"]/p/font/font/font/font/font/font/font/text()|\
                                           //div[@class="TRS_Editor"]/p/font/font/font/font/font/font/font/span/text()|\
                                           //div[@class="TRS_Editor"]/p/font/font/font/a/text()|\
                                           //div[@class="TRS_Editor"]/p/font/font/a/text()|\
                                           //div[@class="TRS_Editor"]/p/font/font/a/font/text()|\
                                           //div[@class="TRS_Editor"]/p/font/font/font/a/font/text()|\
                                           //div[@class="TRS_Editor"]/p/font/font/font/font/font/text()|\
                                           //div[@class="TRS_Editor"]/p/font/font/span/text()|\
                                           //div[@class="TRS_Editor"]/p/font/font/strong/text()|\
                                           //div[@class="TRS_Editor"]/p/span/font/text()|\
                                           //div[@class="TRS_Editor"]/text()|\
                                           //div[@class="TRS_Editor"]/a/text()|\
                                           //div[@class="TRS_Editor"]/font/text()|\
                                           //div[@class="TRS_Editor"]/div/text()|\
                                           //div[@class="TRS_Editor"]/div/a/text()|\
                                           //div[@class="TRS_Editor"]/div/b/text()|\
                                           //div[@class="TRS_Editor"]/div/span/text()|\
                                           //div[@class="TRS_Editor"]/div/span/a/span/text()|\
                                           //div[@class="TRS_Editor"]/div/span/p/span/span/text()|\
                                           //div[@class="TRS_Editor"]/div/span/p/span/span/a/text()|\
                                           //div[@class="TRS_Editor"]/div/span/span/text()|\
                                           //div[@class="TRS_Editor"]/div/span/span/p/span/text()|\
                                           //div[@class="TRS_Editor"]/div/span/span/p/font/span/text()|\
                                           //div[@class="TRS_Editor"]/div/span/span/p/font/span/span/text()|\
                                           //div[@class="TRS_Editor"]/div/span/span/p/span/span/text()|\
                                           //div[@class="TRS_Editor"]/div/span/span/span/span/p/span/text()|\
                                           //div[@class="TRS_Editor"]/div/font/text()|\
                                           //div[@class="TRS_Editor"]/div/font/span/text()|\
                                           //div[@class="TRS_Editor"]/div/font/span/sup/text()|\
                                           //div[@class="TRS_Editor"]/div/font/sup/span/text()|\
                                           //div[@class="TRS_Editor"]/div/font/p/span/text()|\
                                           //div[@class="TRS_Editor"]/div/font/p/span/span/text()|\
                                           //div[@class="TRS_Editor"]/div/font/span/p/span/text()|\
                                           //div[@class="TRS_Editor"]/div/font/span/p/span/span/text()|\
                                           //div[@class="TRS_Editor"]/div/p/b/span/text()|\
                                           //div[@class="TRS_Editor"]/div/p/b/span/span/text()|\
                                           //div[@class="TRS_Editor"]/div/p/text()|\
                                           //div[@class="TRS_Editor"]/div/p/font/text()|\
                                           //div[@class="TRS_Editor"]/div/p/font/span/text()|\
                                           //div[@class="TRS_Editor"]/div/p/font/span/span/text()|\
                                           //div[@class="TRS_Editor"]/div/p/font/font/span/text()|\
                                           //div[@class="TRS_Editor"]/div/p/font/font/font/span/text()|\
                                           //div[@class="TRS_Editor"]/div/p/font/font/font/span/font/text()|\
                                           //div[@class="TRS_Editor"]/div/p/font/font/font/font/span/text()|\
                                           //div[@class="TRS_Editor"]/div/p/font/font/font/font/span/font/text()|\
                                           //div[@class="TRS_Editor"]/div/p/font/font/span/span/text()|\
                                           //div[@class="TRS_Editor"]/div/p/font/font/span/span/font/text()|\
                                           //div[@class="TRS_Editor"]/div/p/font/font/span/span/font/span/text()|\
                                           //div[@class="TRS_Editor"]/div/p/span/text()|\
                                           //div[@class="TRS_Editor"]/div/p/span/a/strong/text()|\
                                           //div[@class="TRS_Editor"]/div/p/a/span/text()|\
                                           //div[@class="TRS_Editor"]/div/p/a/strong/span/font/text()|\
                                           //div[@class="TRS_Editor"]/div/p/span/a/text()|\
                                           //div[@class="TRS_Editor"]/div/p/span/font/text()|\
                                           //div[@class="TRS_Editor"]/div/p/span/font/span/text()|\
                                           //div[@class="TRS_Editor"]/div/p/span/span/text()|\
                                           //div[@class="TRS_Editor"]/div/p/span/span/span/text()|\
                                           //div[@class="TRS_Editor"]/div/table/tbody/tr/td/p/span/text()|\
                                           //div[@class="TRS_Editor"]/div/table/tbody/tr/td/p/span/a/span/text()|\
                                           //div[@class="TRS_Editor"]/div/text()|\
                                           //div[@class="TRS_Editor"]/div/div/text()|\
                                           //div[@class="TRS_Editor"]/div/div/p/text()|\
                                           //div[@class="TRS_Editor"]/div/div/p/span/text()|\
                                           //div[@class="TRS_Editor"]/div/div/p/span/span/text()|\
                                           //div[@class="TRS_Editor"]/div/div/p/span/font/text()|\
                                           //div[@class="TRS_Editor"]/div/div/p/font/text()|\
                                           //div[@class="TRS_Editor"]/div/div/p/font/font/text()|\
                                           //div[@class="TRS_Editor"]/div/div/p/font/span/text()|\
                                           //div[@class="TRS_Editor"]/div/div/p/font/font/span/text()|\
                                           //div[@class="TRS_Editor"]/div/div/span/text()|\
                                           //div[@class="TRS_Editor"]/div/div/span/span/text()|\
                                           //div[@class="TRS_Editor"]/div/div/span/p/span/text()|\
                                           //div[@class="TRS_Editor"]/div/div/div/p/span/text()|\
                                           //div[@class="TRS_Editor"]/div/div/div/p/font/font/span/text()|\
                                           //div[@class="TRS_Editor"]/div/div/div/div/p/font/font/font/font/text()|\
                                           //div[@class="TRS_Editor"]/div/div/div/div/p/font/font/font/font/span/font/text()|\
                                           //div[@class="TRS_Editor"]/div/div/div/div/p/font/font/font/font/font/font/font/span/text()|\
                                           //div[@class="TRS_Editor"]/div/div/div/div/p/font/font/font/font/font/font/font/font/span/text()|\
                                           //div[@class="TRS_Editor"]/div/div/div/span/p/text()|\
                                           //div[@class="TRS_Editor"]/div/div/div/span/p/span/text()|\
                                           //div[@class="TRS_Editor"]/div/div/div/span/p/font/font/span/text()|\
                                           //div[@class="TRS_Editor"]/div/div/div/div/span/p/text()|\
                                           //div[@class="cen_main"]/div/h1/text()|\
                                           //div[@class="cen_main"]/div/div/p/text()|\
                                           //div[@class="cen_main"]/div/div/p/span/text()|\
                                           //div[@class="cen_main"]/div/div/p/span/span/text()|\
                                           //div[@class="cen_main"]/div/div/div/p/span/text()|\
                                           //div[@class="cen_main"]/div/div/div/p/span/span/text()|\
                                           //font[@face="Calibri"]/text()|\
                                           //font[@face="Calibri"]/span/text()|\
                                           //font[@face="Calibri"]/span/span/text()|\
                                           //*[@id="ozoom"]/p/text()|\
                                           //*[@id="zoom"]/div/p/text()|\
                                           //*[@id="zoom"]/div/p/span/text()|\
                                           //*[@id="zoom"]/strong/span/p/strong/text()|\
                                           //*[@id="zoom"]/p/text()|\
                                           //*[@id="zoom"]/p/a/text()|\
                                           //*[@id="zoom"]/p/strong/text()|\
                                           //*[@id="zoom"]/p/span/text()|\
                                           //*[@id="zoom"]/span/p/text()|\
                                           //*[@id="zoom"]/span/p/a/text()|\
                                           //*[@id="zoom"]/span/p/a/font/text()|\
                                           //*[@id="zoom"]/span/span/span/span/span/strong/span/span/strong/span/p/span/strong/span/span/strong/text()|\
                                           //*[@id="zoom"]/span/span/span/span/span/strong/span/span/p/span/text()|\
                                           //*[@id="zoom"]/span/span/span/span/span/strong/span/span/p/text()|\
                                           //*[@id="zoom"]/span/strong/span/span/p/strong/text()'
                           ).extract())

        if contents == "":
            contents = "可能是图片或表格 打开原网站查看"
        item["content"] = contents.replace("\u3000", "").replace("\xa0", "") \
            .replace("\t\n", "").replace("\t", "")  # .replace("  ", "")
        form_s = "".join(
            response.xpath(
                '//div[@class="news_textspan"]/div/span[1]/text()').extract())
        item["webname"] = form_s.replace("来源:", "")
        item["web"] = response.meta["laiyuan"]
        date = "".join(
            response.xpath(
                '//div[@class="news_textspan"]/div/span[2]/text()').extract())
        date = date[:-5].replace("发布时间:", "").replace("年", "-").replace(
            "月", "-").replace("日", "")
        try:
            date = datetime.datetime.strptime(
                str(date).replace('/', '-'), '%Y-%m-%d')
            # print(date)
        except Exception as e:
            # print(e)
            date = time.strftime('%Y-%m-%d %H:%M:%S',
                                 time.localtime(time.time()))
        item["pub_time"] = date
        # print(item)
        item["web_id"] = 3
        item["keyword"] = keyword.get_keyword(item["content"])
        return item
Exemple #28
0
# Location paths of the text nodes that make up the article body on the page
# layouts this parser handles. They are combined into one XPath union query.
_FAGAIWEI_CONTENT_PATHS = (
    '//div[@class="TRS_Editor"]/font/font/span/p/text()',
    '//div[@class="TRS_Editor"]/font/font/span/p/span/text()',
    '//div[@class="TRS_Editor"]/font/font/span/p/span/span/text()',
    '//div[@class="TRS_Editor"]/span/span/p/font/text()',
    '//div[@class="TRS_Editor"]/span/span/p/font/span/text()',
    '//div[@class="TRS_Editor"]/span/span/p/font/span/span/text()',
    '//div[@class="TRS_Editor"]/div/font/font/span/span/p/span/text()',
    '//div[@class="TRS_Editor"]/div/span/span/p/font/span/text()',
    '//div[@class="TRS_Editor"]/div/span/span/p/span/font/text()',
    '//div[@class="TRS_Editor"]/div/span/p/font/font/font/span/text()',
    '//div[@class="TRS_Editor"]/div/span/p/font/font/font/span/span/text()',
    '//div[@class="TRS_Editor"]/div/span/span/span/p/span/font/strong/text()',
    '//div[@class="TRS_Editor"]/div/span/span/span/p/strong/font/font/font/font/font/font/span/text()',
    '//div[@class="TRS_Editor"]/div/span/span/p/font/font/font/span/text()',
    '//div[@class="TRS_Editor"]/span/text()',
    '//div[@class="TRS_Editor"]/span/span/text()',
    '//div[@class="TRS_Editor"]/span/font/p/span/font/text()',
    '//div[@class="TRS_Editor"]/span/font/p/span/font/span/text()',
    '//div[@class="TRS_Editor"]/span/h2/span/text()',
    '//div[@class="TRS_Editor"]/span/p/text()',
    '//div[@class="TRS_Editor"]/span/p/span/text()',
    '//div[@class="TRS_Editor"]/span/strong/span/span/p/strong/text()',
    '//div[@class="TRS_Editor"]/span/p/span/span/text()',
    '//div[@class="TRS_Editor"]/span/p/span/font/text()',
    '//div[@class="TRS_Editor"]/div/span/p/span/text()',
    '//div[@class="TRS_Editor"]/div/span/p/span/span/text()',
    '//div[@class="TRS_Editor"]/div/span/p/span/font/text()',
    '//div[@class="TRS_Editor"]/div/span/p/font/font/font/span/text()',
    '//div[@class="TRS_Editor"]/b/span/span/p/span/text()',
    '//div[@class="TRS_Editor"]/b/span/span/p/b/span/text()',
    '//div[@class="TRS_Editor"]/strong/font/p/text()',
    '//div[@class="TRS_Editor"]/strong/font/p/a/text()',
    '//div[@class="TRS_Editor"]/strong/font/p/a/font/text()',
    '//div[@class="TRS_Editor"]/p/text()',
    '//div[@class="TRS_Editor"]/p/a/text()',
    '//div[@class="TRS_Editor"]/p/b/text()',
    '//div[@class="TRS_Editor"]/p/b/span/text()',
    '//div[@class="TRS_Editor"]/p/a/span/text()',
    '//div[@class="TRS_Editor"]/p/span/a/span/text()',
    '//div[@class="TRS_Editor"]/p/span/text()',
    '//div[@class="TRS_Editor"]/p/span/span/text()',
    '//div[@class="TRS_Editor"]/p/strong/text()',
    '//div[@class="TRS_Editor"]/p/strong/font/text()',
    '//div[@class="TRS_Editor"]/p/a/font/text()',
    '//div[@class="TRS_Editor"]/p/font/text()',
    '//div[@class="TRS_Editor"]/p/font/a/text()',
    '//div[@class="TRS_Editor"]/p/font/span/text()',
    '//div[@class="TRS_Editor"]/p/font/span/font/text()',
    '//div[@class="TRS_Editor"]/p/font/strong/text()',
    '//div[@class="TRS_Editor"]/p/font/font/text()',
    '//div[@class="TRS_Editor"]/p/font/font/font/text()',
    '//div[@class="TRS_Editor"]/p/font/font/font/font/font/font/text()',
    '//div[@class="TRS_Editor"]/p/font/font/font/font/font/font/font/text()',
    '//div[@class="TRS_Editor"]/p/font/font/font/font/font/font/font/span/text()',
    '//div[@class="TRS_Editor"]/p/font/font/font/a/text()',
    '//div[@class="TRS_Editor"]/p/font/font/a/text()',
    '//div[@class="TRS_Editor"]/p/font/font/a/font/text()',
    '//div[@class="TRS_Editor"]/p/font/font/font/a/font/text()',
    '//div[@class="TRS_Editor"]/p/font/font/font/font/font/text()',
    '//div[@class="TRS_Editor"]/p/font/font/span/text()',
    '//div[@class="TRS_Editor"]/p/font/font/strong/text()',
    '//div[@class="TRS_Editor"]/p/span/font/text()',
    '//div[@class="TRS_Editor"]/text()',
    '//div[@class="TRS_Editor"]/a/text()',
    '//div[@class="TRS_Editor"]/font/text()',
    '//div[@class="TRS_Editor"]/div/a/text()',
    '//div[@class="TRS_Editor"]/div/span/text()',
    '//div[@class="TRS_Editor"]/div/span/span/text()',
    '//div[@class="TRS_Editor"]/div/span/span/p/span/text()',
    '//div[@class="TRS_Editor"]/div/span/span/p/font/span/text()',
    '//div[@class="TRS_Editor"]/div/span/span/p/font/span/span/text()',
    '//div[@class="TRS_Editor"]/div/span/span/p/span/span/text()',
    '//div[@class="TRS_Editor"]/div/span/span/span/span/p/span/text()',
    '//div[@class="TRS_Editor"]/div/font/text()',
    '//div[@class="TRS_Editor"]/div/font/span/text()',
    '//div[@class="TRS_Editor"]/div/font/span/sup/text()',
    '//div[@class="TRS_Editor"]/div/font/sup/span/text()',
    '//div[@class="TRS_Editor"]/div/font/p/span/text()',
    '//div[@class="TRS_Editor"]/div/font/p/span/span/text()',
    '//div[@class="TRS_Editor"]/div/font/span/p/span/text()',
    '//div[@class="TRS_Editor"]/div/font/span/p/span/span/text()',
    '//div[@class="TRS_Editor"]/div/p/b/span/text()',
    '//div[@class="TRS_Editor"]/div/p/b/span/span/text()',
    '//div[@class="TRS_Editor"]/div/p/text()',
    '//div[@class="TRS_Editor"]/div/p/font/span/text()',
    '//div[@class="TRS_Editor"]/div/p/font/span/span/text()',
    '//div[@class="TRS_Editor"]/div/p/font/font/span/text()',
    '//div[@class="TRS_Editor"]/div/p/font/font/font/span/text()',
    '//div[@class="TRS_Editor"]/div/p/font/font/span/span/text()',
    '//div[@class="TRS_Editor"]/div/p/font/font/span/span/font/text()',
    '//div[@class="TRS_Editor"]/div/p/font/font/span/span/font/span/text()',
    '//div[@class="TRS_Editor"]/div/p/span/text()',
    '//div[@class="TRS_Editor"]/div/p/a/span/text()',
    '//div[@class="TRS_Editor"]/div/p/span/a/text()',
    '//div[@class="TRS_Editor"]/div/p/span/font/text()',
    '//div[@class="TRS_Editor"]/div/p/span/font/span/text()',
    '//div[@class="TRS_Editor"]/div/p/span/span/text()',
    '//div[@class="TRS_Editor"]/div/p/span/span/span/text()',
    '//div[@class="TRS_Editor"]/div/table/tbody/tr/td/p/span/text()',
    '//div[@class="TRS_Editor"]/div/table/tbody/tr/td/p/span/a/span/text()',
    '//div[@class="TRS_Editor"]/div/text()',
    '//div[@class="TRS_Editor"]/div/div/text()',
    '//div[@class="TRS_Editor"]/div/div/p/text()',
    '//div[@class="TRS_Editor"]/div/div/p/span/text()',
    '//div[@class="TRS_Editor"]/div/div/p/span/span/text()',
    '//div[@class="TRS_Editor"]/div/div/p/span/font/text()',
    '//div[@class="TRS_Editor"]/div/div/p/font/span/text()',
    '//div[@class="TRS_Editor"]/div/div/p/font/font/span/text()',
    '//div[@class="TRS_Editor"]/div/div/span/text()',
    '//div[@class="TRS_Editor"]/div/div/span/span/text()',
    '//div[@class="TRS_Editor"]/div/div/span/p/span/text()',
    '//div[@class="TRS_Editor"]/div/div/div/p/span/text()',
    '//div[@class="TRS_Editor"]/div/div/div/p/font/font/span/text()',
    '//div[@class="TRS_Editor"]/div/div/div/span/p/text()',
    '//div[@class="TRS_Editor"]/div/div/div/span/p/span/text()',
    '//div[@class="TRS_Editor"]/div/div/div/span/p/font/font/span/text()',
    '//div[@class="TRS_Editor"]/div/div/div/div/span/p/text()',
    '//font[@face="Calibri"]/text()',
    '//font[@face="Calibri"]/span/text()',
    '//font[@face="Calibri"]/span/span/text()',
    '//div[@class="txt1"]/text()',
    '//div[@class="txt1"]/a/text()',
    '//div[@class="txt1"]/a/font/text()',
    '//*[@id="ozoom"]/p/text()',
    '//*[@id="zoom"]/div/p/text()',
    '//*[@id="zoom"]/div/p/span/text()',
    '//*[@id="zoom"]/strong/span/p/strong/text()',
    '//*[@id="zoom"]/p/text()',
    '//*[@id="zoom"]/p/a/text()',
    '//*[@id="zoom"]/p/strong/text()',
    '//*[@id="zoom"]/p/span/text()',
    '//*[@id="zoom"]/span/p/text()',
    '//*[@id="zoom"]/span/p/a/text()',
    '//*[@id="zoom"]/span/p/a/font/text()',
    '//*[@id="zoom"]/span/span/span/span/span/strong/span/span/strong/span/p/span/strong/span/span/strong/text()',
    '//*[@id="zoom"]/span/span/span/span/span/strong/span/span/p/span/text()',
    '//*[@id="zoom"]/span/span/span/span/span/strong/span/span/p/text()',
    '//*[@id="zoom"]/span/strong/span/span/p/strong/text()',
)

# The union must begin directly with a location path: a leading "-" would be
# parsed by XPath as unary minus over the whole union, turning the result into
# a number instead of a list of text nodes.
_FAGAIWEI_CONTENT_XPATH = "|".join(_FAGAIWEI_CONTENT_PATHS)


def parse_fagaiwei(response, item):
    """Build a ``FagaiweiItem`` from an article detail page and yield it.

    Args:
        response: Page response. Must expose ``url``, ``xpath()`` and a
            ``meta`` mapping holding ``"date"`` and ``"title"``;
            ``meta["laiyuan"]`` is read only when the page names no source.
        item: Ignored. A fresh ``FagaiweiItem`` is always created; the
            parameter is kept so existing call sites keep working.

    Yields:
        The populated item (``url``, ``content``, ``pub_time``, ``title``,
        ``webname``, ``web``, ``keyword``, ``web_id``).

    Raises:
        KeyError: If a required ``response.meta`` key is missing.
    """
    # NOTE(review): the caller-supplied item is discarded here, as before.
    item = FagaiweiItem()
    item["url"] = response.url

    # Concatenate every matched text node into the article body.
    contents = "".join(response.xpath(_FAGAIWEI_CONTENT_XPATH).extract())
    if not contents:
        # No text matched any known layout: store the placeholder message.
        contents = "可能是图片或表格 打开原网站查看"
    item["content"] = contents
    item["pub_time"] = response.meta["date"]
    item["title"] = response.meta["title"]

    # Source attribution link, when the page provides one.
    from_s = "".join(
        response.xpath('//*[@id="dSourceText"]/a/text()').extract())
    from_s_url = "".join(
        response.xpath('//*[@id="dSourceText"]/a/@href').extract())
    if from_s:
        webname = from_s
        depart_url = from_s_url
    else:
        # No source link text: use the default name and the URL from meta.
        webname = "发改委"
        depart_url = response.meta["laiyuan"]
    item["webname"] = webname
    item["web"] = depart_url

    item["keyword"] = keyword.get_keyword(item["content"])
    item["web_id"] = 2
    yield item