def parse_news(self, response):
     item = response.meta['item']
     pageindex = response.meta['pageindex']
     soup = BeautifulSoup(response.body, "lxml")
     #eg:2016年07月13日 21:50:42
     news_date = soup.find("span", class_="item time").string if soup.find(
         "span", class_="item time") else None
     struct_date = datetime.datetime.strptime(news_date.encode('utf-8'),
                                              "%Y年%m月%d日 %H:%M:%S")
     news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")  # normalize the date/time
     title = soup.find("h1", class_="article-title").string if soup.find(
         "h1", class_="article-title") else None
     author = soup.find("span", class_="item author").a.string if soup.find(
         "span", class_="item author") else None
     comment_num = soup.find("span",
                             class_="wscn-cm-counter").string if soup.find(
                                 "span", class_="wscn-cm-counter") else None
     content = soup.find("div", class_="article-content").text if soup.find(
         "div", class_="article-content") else None
     news_no = response.url.rsplit("/", 1)[-1]
     item["title"] = title
     item["author"] = author
     item["comment_num"] = comment_num
     item["content"] = content
     item["news_date"] = news_date
     item["crawl_date"] = datetime.datetime.now().strftime(
         "%Y-%m-%d %H:%M:%S")
     item["news_no"] = news_no
     item = judge_news_crawl(item)  # check whether it falls within the crawl time window
     if item:
         yield item
     else:
         self.flag = pageindex
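
Note: every snippet on this page passes its item through judge_news_crawl before yielding, but the helper itself is never shown here. A minimal sketch of what it presumably does is below, assuming it simply compares the item's news_date against a configured cut-off and returns the item only when it is recent enough; the END_TIME constant and the exact comparison are illustrative assumptions, not the project's actual implementation.

import datetime

# Assumed cut-off: only keep news published within the last few days (hypothetical value).
END_TIME = datetime.datetime.now() - datetime.timedelta(days=7)

def judge_news_crawl(item):
    """Return the item if its news_date falls inside the crawl window, else None."""
    news_date = item.get("news_date")
    if not news_date:
        return item  # no date available; let the caller decide what to do
    try:
        struct_date = datetime.datetime.strptime(news_date, "%Y-%m-%d %H:%M:%S")
    except ValueError:
        return item  # unrecognised date format; keep the item rather than dropping it
    return item if struct_date >= END_TIME else None
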
Example #2
 def parse(self, response):
     origin_url = response.url
     #http://money.163.com/special/002526O5/transport_02.html
     search_result = re.search(r"_(\d+)\.", origin_url)
     # extract the page number from the URL
     pageindex = search_result.group(1) if search_result else 1
     soup = BeautifulSoup(response.body, "lxml")
     news_list = soup("div", class_="list_item clearfix")
     for news in news_list:
         news_date = news.find("span", class_="time").text if news.find(
             "span", class_="time") else None
         title = news.find("h2").text if news.find("h2") else None
         news_url = news.find("h2").a.get("href",
                                          None) if news.find("h2") else None
         abstract = news.find("p").contents[0] if news.find("p") else None
         item = NewsItem(title=title,
                         news_url=news_url,
                         abstract=abstract,
                         news_date=news_date)
         item = judge_news_crawl(item)  # check whether it falls within the crawl time window
         if item:
             request = scrapy.Request(news_url,
                                      callback=self.parse_news,
                                      meta={"item": item})
             yield request
         else:
             self.flag = int(pageindex)
     if not self.flag:
         next_url = self.next_url % (int(pageindex) + 1)
         yield scrapy.Request(next_url)
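
The spiders above and below all populate a shared NewsItem and rely on module-level NOW and logger objects imported from the project. A reconstruction of that context, guessed from the field names used across these examples (the real items.py may differ), could look like:

import datetime
import logging

import scrapy

logger = logging.getLogger(__name__)
NOW = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

class NewsItem(scrapy.Item):
    # Fields observed across the snippets on this page.
    title = scrapy.Field()
    author = scrapy.Field()
    abstract = scrapy.Field()
    content = scrapy.Field()
    news_date = scrapy.Field()
    crawl_date = scrapy.Field()
    news_url = scrapy.Field()
    news_no = scrapy.Field()
    referer_web = scrapy.Field()
    comment_num = scrapy.Field()
    read_num = scrapy.Field()
    catalogue = scrapy.Field()
    topic = scrapy.Field()
    tags = scrapy.Field()
    pic = scrapy.Field()
    keywords = scrapy.Field()
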
Example #3
 def parse(self, response):
     origin_url = response.url
     pageindex = origin_url.rsplit("/")[-1]
     soup = BeautifulSoup(response.body,"lxml")
     news_list = soup.find_all("li",class_="mt24 pr")
     for news in news_list:
         news_date = news.find("a",href="javascript:;").text if news.find("a",href="javascript:;") else None
         if news_date:
             news_url = news.find("p",class_="h1").a.get("href",None) if news.find("p",class_="h1") else None
             news_no = news_url.rsplit("/")[-1].split(".")[0]    #http://www.nbd.com.cn/articles/2016-07-25/1025147.html
             title =  news.find("p",class_="h1").text.strip() if news.find("p",class_="h1") else None
             # the abstract is truncated here, so fetch it from the article page instead
             # abstract = news.find("p",class_="news-p").text.strip() if news.find("p",class_="news-p") else None
             referer_web = news.find("div",class_="messge").contents[-2].a.text if news.find("div",class_="messge") else None
             referer_web = referer_web if referer_web != '' else None
             comment_num = soup.find("span",class_="fr").a.text if soup.find("span",class_="fr") else None
             item = NewsItem(news_date=news_date,
                             title=title,
                             # abstract=abstract,
                             referer_web=referer_web,
                             comment_num=comment_num,
                             news_no = news_no,
                             news_url=news_url
                             )
             item = judge_news_crawl(item)
             if item:
                 yield scrapy.Request(item["news_url"],callback=self.parse_news,meta={"item":item})
             else:
                 self.flag = pageindex
         else:
             logger.warning("can't find news_date")
     if not self.flag:
         next_url = self.next_url % (int(pageindex)+1)
         yield scrapy.Request(next_url)
Example #4
 def parse_news(self, response):
     item = response.meta.get("item", NewsItem())
     pageindex = response.meta.get("pageindex", 1)
     topic_url = response.meta.get("topic_url", None)
     origin_url = response.url
     news_no_res = re.search(r"news/(\d+)\.html", origin_url)
     news_no = news_no_res.group(1) if news_no_res else None
     soup = BeautifulSoup(response.body, "lxml")
     ff3 = soup.find("h2", class_="f-ff3 f-fwn")
     referer_web = soup.find("h2",
                             class_="f-ff3 f-fwn").i.text if ff3 else None
     # publication date
     origin_date = soup.find(
         "h2", class_="f-ff3 f-fwn").contents[-1].text if ff3 else None
     struct_date = datetime.datetime.strptime(origin_date, "%Y-%m-%d %H:%M")
     news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")
     content = soup.find("div", class_="m-text").text if soup.find(
         "div", class_="m-text") else None
     author = soup.find("h3", class_="f-ff3 f-fwn").span.text if soup.find(
         "h3", class_="f-ff3 f-fwn") else None
     crawl_date = NOW
     item["referer_web"] = referer_web
     item["crawl_date"] = crawl_date
     item["author"] = author
     item["content"] = content
     item["news_no"] = news_no
     item["news_date"] = news_date
     item = judge_news_crawl(item)
     if item:
         yield item
     else:
         self.flag[topic_url] = pageindex
Example #5
 def parse_topic(self,response):
     origin_url = response.url
     if "_" not in origin_url:
         pageindex = 0
         topic_url = origin_url.rsplit(".",1)[0]
     else:
         temp = origin_url.rsplit("_",1)
         pageindex = temp[-1].split(".",1)[0]
         topic_url = temp[0]
     soup = BeautifulSoup(response.body,"lxml")
     catalogue =  soup.find("a",class_ = "blue CurrChnlCls").get("title").strip()
     news_list = soup.find("div", class_ = "lie_main_m").find_all("li")
     for news in news_list:
         news_date = news.find("span").text.strip() + " 00:00:00"
         title = news.find("a").text.strip()[10:]
         news_url = topic_url.rsplit("/",1)[0] + news.find("a").get("href")[1:]
         news_no = news_url.rsplit("/",1)[-1].split(".")[0]
         item = NewsItem(
                     news_date = news_date,
                     news_url =news_url,
                     title = title,
                     news_no = news_no,
                     catalogue = catalogue,
         )
         item = judge_news_crawl(item)
         if item:
             yield scrapy.Request(item["news_url"],callback=self.parse_news,meta={'item':item})
         else:
             self.flag[topic_url] = pageindex
     if not self.flag[topic_url]:
         next_url = topic_url + "_" + str(int(pageindex) + 1) + ".shtml"
         yield scrapy.Request(next_url,callback=self.parse_topic)
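
Example #5 keys its stop flag by topic URL (self.flag[topic_url]) rather than using a single scalar, so several topic listings can stop paginating independently. A spider skeleton following that convention might initialise the flag dictionary as below; the class name and URLs are placeholders, not part of the original project.

import scrapy

class TopicSpider(scrapy.Spider):
    name = "topic_example"  # placeholder name
    start_urls = ["http://example.com/topic_a.shtml",
                  "http://example.com/topic_b.shtml"]

    def __init__(self, *args, **kwargs):
        super(TopicSpider, self).__init__(*args, **kwargs)
        # One stop flag per topic listing; a falsy value means "keep paginating".
        self.flag = {url.rsplit(".", 1)[0]: 0 for url in self.start_urls}
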
Example #6
 def parse(self,response):
     origin_url = response.url
     if 'index' not in origin_url:
         pageindex = 0
     else:
         pageindex = origin_url.rsplit('index_',1)[-1].replace('.html','')
         pageindex = int(pageindex)
     soup = BeautifulSoup(response.body.decode('utf8'),"lxml")
     news_list = soup.find_all('li',style = 'overflow:hidden;')
     for news in news_list:
         news_date = news.find('span').text if news.find('span') else None
         if news_date :
             news_url = news.find('a').get('href')
             news_no = news_url.rsplit('/',1)[-1].replace('.html','') # http://www.caac.gov.cn/XWZX/MHYW/201607/t20160726_39146.html
             title = news.find('a',href = re.compile('http://www.caac.gov.cn/XWZX/MHYW/')).text.strip() if news.find('a',href = re.compile('http://www.caac.gov.cn/XWZX/MHYW/')) else None
             item = NewsItem(
                 news_date = news_date + ' 00:00:00',
                 title = title,
                 news_url = news_url,
                 news_no = news_no
             )
             item = judge_news_crawl(item)
             if item:
                 yield scrapy.Request(item["news_url"],callback=self.parse_news,meta={"item":item})
             else:
                 self.flag = pageindex
         else:
             logger.warning("mhyw can't find news_date")
     if not self.flag:
         next_url = self.next_url % (str(pageindex + 1 ))
         yield scrapy.Request(next_url)
    def parse_news(self,response):
        item = response.meta.get("item",NewsItem())
        pageindex = response.meta.get("pageindex",None)
        soup = BeautifulSoup(response.body)
        # TODO: the news list may contain topic/special pages with no article body; these are currently discarded
        # parse the news article
        news_txt=soup.find("div",class_="news_txt")
        if news_txt:
            content = news_txt.text
            news_about = soup.find("div",class_="news_about")
            #referer_web,news_date
            if news_about:
                referer_web = news_about.p.string
                news_date = news_about.p.next_sibling.next_sibling.text[0:16]
                struct_date = datetime.datetime.strptime(news_date,"%Y-%m-%d %H:%M")
                news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")
                item["referer_web"]=referer_web
                item["news_date"]=news_date
                item["content"]=content
                item["crawl_date"]=NOW
                # import pdb;pdb.set_trace()
                item = judge_news_crawl(item)
                if item:
                    yield item
                else:
                    self.flag = pageindex

        else:
            logger.info("news page can't find news_txt; it may be a topic page")
    def parse(self, response):
        keyword = response.meta.get("keyword", None)
        soup = BeautifulSoup(response.body,"lxml")
        for t in soup.find_all("div", attrs={"class":"result title"}):
            item = NewsItem()
            url = t.find("a").get("href")       # news URL
            title = t.find("a").text            # news title
            temp_list = t.find("div",attrs={"class":"c-title-author"}).text.split(u"\xa0\xa0")
            website_name = temp_list[0]         # news website name
            news_time = temp_list[1]
            #TODO: the date parsing below is fragile and can fail for unexpected formats
            now = datetime.datetime.now()
            if u"分钟前" in news_time:
                print news_time[:-3]
                struct_date = now - datetime.timedelta(minutes=int(news_time[:-3]))
                news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")
            elif u"小时前" in news_time:
                print news_time[:-3]
                struct_date = now - datetime.timedelta(hours=int(news_time[:-3]))
                news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")
            else:
                news_date = "%s-%s-%s %s:00" % (news_time[:4],news_time[5:7],news_time[8:10],news_time[12:])

            item['news_url'] = url
            item['title'] = title
            item['news_date'] = news_date
            item['referer_web'] = website_name
            item["crawl_date"] = NOW
            item["keywords"] = [keyword]
            item = judge_news_crawl(item)
            if item:
                yield item
 def parse(self, response):
     origin_url = response.url
     pageindex = origin_url.rsplit("=", 1)[-1]
     soup = BeautifulSoup(response.body, "lxml")
     news_temp = soup.find("table",
                           class_="list").find("table",
                                               border="0").find("tbody")
     if not news_temp:
         return
     news_list = news_temp.find_all("tr")[1:]
     for news in news_list:
         temp = news.find("a")
         news_url = temp.get("href")
         title = temp.text.strip()
         temp = news.find_all("span")
         referer_web = temp[0].text.strip()
         news_date = temp[1].text.strip()
         news_no = news_url.rsplit("=", 1)[-1]
         item = NewsItem(news_date=news_date,
                         title=title,
                         referer_web=referer_web,
                         news_url=news_url,
                         news_no=news_no)
         item = judge_news_crawl(item)
         if item:
             yield scrapy.Request(item["news_url"],
                                  callback=self.parse_news,
                                  meta={"item": item})
         else:
             self.flag = pageindex
     if not self.flag:
         next_url = self.next_url % (int(pageindex) + 1)
         yield scrapy.Request(next_url)
Example #10
 def parse_news(self,response):
     item = response.meta.get("item",NewsItem())
     pageindex = response.meta.get("pageindex",1)
     topic_url = response.meta.get("topic_url",None)
     origin_url = response.url
     news_no_res = re.search(r"news/(\d+)\.html",origin_url)
     news_no = news_no_res.group(1) if news_no_res else None
     soup = BeautifulSoup(response.body,"lxml")
     ff3 = soup.find("h2",class_="f-ff3 f-fwn")
     referer_web = soup.find("h2",class_="f-ff3 f-fwn").i.text if ff3 else None
     # publication date
     origin_date = soup.find("h2",class_="f-ff3 f-fwn").contents[-1].text if ff3 else None
     struct_date = datetime.datetime.strptime(origin_date,"%Y-%m-%d %H:%M")
     news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")
     content = soup.find("div",class_="m-text").text.strip() if soup.find("div",class_="m-text") else None
     author = soup.find("h3",class_="f-ff3 f-fwn").span.text if soup.find("h3",class_="f-ff3 f-fwn") else None
     crawl_date = NOW
     item["referer_web"]=referer_web
     item["crawl_date"]=crawl_date
     item["author"]=author
     item["content"]=content
     item["news_no"]=news_no
     item["news_date"]=news_date
     item = judge_news_crawl(item)
     if item:
         yield item
     else:
         self.flag[topic_url]=pageindex
 def parse(self,response):
     origin_url = response.url
     if '_' not in origin_url:
         pageindex = '1'
         origin_url = origin_url.rsplit('.',1)[0] + '_0' + '.html'
     else:
         pageindex = origin_url.rsplit('_',1)[-1].replace('.html','')
     soup = BeautifulSoup(response.body)
     catalogue = soup.find('div' ,class_ = 'nav_cur_index').find('span').text
     news_list = soup.find_all('div',class_ = 'item_top')
     for news in news_list:
         news_date = news.find('span',class_ = 'time').text.strip() if news.find('span',class_ = 'time') else None
         if news_date:
             news_url = news.find('h2').find('a',href = re.compile('http://money.163.com/')).get('href')
             title = news.find('h2').find('a',href = re.compile('http://money.163.com/')).text
             item = NewsItem(
                 news_date = news_date,
                 news_url = news_url,
                 title = title,
                 catalogue = catalogue
             )
             item = judge_news_crawl(item)
             if item:
                 yield scrapy.Request(item["news_url"],callback=self.parse_news,meta={'item':item})
             else:
                 self.flag = pageindex
         else:
             logger.warning("carnoc:%s can't find news_date " % origin_url)
     if not self.flag:
         pageindex = str(int(pageindex) + 1 ) if int(pageindex) > 9 else '0' + str(int(pageindex) + 1)
         next_url = self.next_url % (pageindex)
         yield scrapy.Request(next_url)
 def parse(self,response):
     origin_url = response.url
     pageindex = origin_url.rsplit("=",1)[-1]
     soup = BeautifulSoup(response.body,"lxml")
     news_temp = soup.find("table" ,class_ = "list").find("table" , border="0").find("tbody")
     if not news_temp:
         return
     news_list = news_temp.find_all("tr")[1:]
     for news in news_list:
         temp = news.find("a")
         news_url = temp.get("href")
         title = temp.text.strip()
         temp = news.find_all("span")
         referer_web = temp[0].text.strip()
         news_date = temp[1].text.strip()
         news_no = news_url.rsplit("=",1)[-1]
         item = NewsItem(news_date=news_date,
                         title=title,
                         referer_web=referer_web,
                         news_url=news_url,
                         news_no = news_no
                         )
         item = judge_news_crawl(item)
         if item:
             yield scrapy.Request(item["news_url"], callback=self.parse_news, meta={"item": item})
         else:
             self.flag = pageindex
     if not self.flag:
         next_url = self.next_url % (int(pageindex) + 1)
         yield scrapy.Request(next_url)
Example #13
 def parse(self, response):
     origin_url = response.url
     if origin_url == _36krSpider.start_urls[0]:
         temp = re.search('"highProjects\|focus":([\w\W]+?),"editorChoice\|focus"',response.body).group(1)
         news_list = json.loads(temp)
         temp = re.search('"editorChoice\|focus":([\w\W]+?),"feedHeaders\|column"',response.body).group(1)
         news_list.extend(json.loads(temp))
         for news in news_list:
             title = news["title"]
             news_url = news["url"]
             if "36kr.com" not in news_url or ".html" not in news_url:
                 continue
             news_date = news["created_at"]
             pic = news["cover"]
             news_no = news_url.rsplit("/",1)[-1].split(".")[0]
             item = NewsItem(
                     news_date=news_date,
                     title=title,
                     news_no=news_no,
                     news_url=news_url,
                     pic = pic
                     )
             yield scrapy.Request(item["news_url"], callback=self.parse_news, meta={"item": item})
         next_url = _36krSpider.next_url % ("",str(time.time()).split(".")[0])
         yield scrapy.Request(next_url)
     else:
         news_list = json.loads(response.body)["data"]["items"]
         next_id = news_list[-1]["id"]
         for news in news_list:
             title = news["title"]
             news_no = news["id"]
             news_url = "http://36kr.com/p/%s.html" % (news_no)
             if "36kr.com" not in news_url or ".html" not in news_url:
                 continue
             news_date = news["created_at"]
             pic = news["cover"]
             abstract = news["summary"]
             topic = ",".join([ t for t in re.findall('\["([\w\W]+?)",',news["extraction_tags"])])
             catalogue = news["column"]["name"]
             author = news["user"]["name"]
             item = NewsItem(
                 news_date=news_date,
                 title=title,
                 news_no=news_no,
                 news_url=news_url,
                 pic=pic,
                 abstract = abstract,
                 topic = topic,
                 author = author,
                 catalogue = catalogue
             )
             item = judge_news_crawl(item)
             if item:
                 yield scrapy.Request(item["news_url"], callback=self.parse_news, meta={"item": item})
             else:
                 self.flag = 1
         if not self.flag:
             next_url = self.next_url % (next_id,str(time.time()).split(".")[0])
             yield scrapy.Request(next_url)
Example #14
    def parse_quick(self, response):
        soup = BeautifulSoup(response.body)
        news_list_inner = soup.find("div", class_="list-inner")
        next_timestamp = None

        news_list = news_list_inner.find_all(
            "div",
            class_=re.compile(r"bulletin-item.*")) if news_list_inner else None
        # JSON page variant
        if not news_list:
            news_list = soup.find_all("div",
                                      class_=re.compile(r"bulletin-item.*"))
        for index, news in enumerate(news_list):
            origin_date = news.find("div", class_="news-time").get(
                "data-time", None) if news.find("div",
                                                class_="news-time") else None
            next_timestamp = origin_date if index == len(
                news_list) - 1 else None  # use the last article's timestamp as the next page's timestamp
            struct_date = datetime.datetime.fromtimestamp(int(origin_date))
            news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")
            title = news.find("a", class_="item-title").text if news.find(
                "a", class_="item-title") else None
            news_url = news.find("a", class_="item-title").get(
                "href", None) if news.find("a", class_="item-title") else None
            pic = news.find("img").get("src",
                                       None) if news.find("img") else None
            content = news.find("div", class_="item-desc").text if news.find(
                "div", class_="item-desc") else None
            id_result = re.search(r"/(\d+)\.html", news_url)
            news_no = id_result.group(1) if id_result else None
            item = NewsItem(content=content,
                            news_url=news_url,
                            pic=pic,
                            title=title,
                            news_no=news_no,
                            news_date=news_date,
                            catalogue=u"快报")
            item = judge_news_crawl(item)
            if item:
                request = scrapy.Request(news_url,
                                         meta={"item": item},
                                         callback=self.parse_quick_news)
                yield request
            else:
                self.quick_flag = int(self.quick_page)

        if not self.quick_flag:
            if next_timestamp:
                next_quick_url = self.quick_json_url % next_timestamp
                yield scrapy.Request(next_quick_url, callback=self.parse_quick)
            else:
                logger.warning("can't find next_timestamp,url is %s " %
                               response)
Example #15
    def parse(self, response):
        soup = BeautifulSoup(response.body,"lxml")
        newslist = soup.find(name="div", attrs={"data-lastkey": True})
        lastkey = newslist.get("data-lastkey",None)
        logger.info(lastkey)
        if not lastkey:
            logger.warning("can't find next page")
        else:
            if newslist:
                for i in newslist.children:
                        # the list contains unrelated entries mixed in with the articles
                    if i != u' ':
                        news_url = self.domain+i.a.get('href',None)
                        pic = i.find("img").get('data-src') if i.find("img") else None
                        title = i.find("h3").string if i.find("h3") else None
                        comment_num = i.find(class_="iconfont icon-message").string if i.find(class_="iconfont icon-message") else 0
                        heart = i.find(class_="iconfont icon-heart").string if i.find(class_="iconfont icon-heart") else 0
                        topic = i.find(class_="category").span.string if i.find(class_="category") else 0
                        news_date =None
                        if i.find(name="span", attrs={"data-origindate": True}):
                            news_date= i.find(name="span", attrs={"data-origindate": True}).get("data-origindate",None)
                            if news_date:
                                news_date = news_date[:-6]

                        # no content field; heart & comment counts exist but are not added to the item
                        item = NewsItem(title=title,news_url=news_url,pic=pic,topic=topic,news_date=news_date,comment_num=comment_num)
                        # catalogue the item belongs to
                        item['catalogue'] = "Top 15" if "tags" in response.url else u"商业"
                        # check whether crawling should stop
                        item = judge_news_crawl(item)
                        if item :
                            request = scrapy.Request(news_url,callback=self.parse_article)
                            request.meta["item"] = item
                            yield request
                        else:
                            if "tags" in response.url:
                                self.top_flag = lastkey
                            else:
                                self.com_flag = lastkey
                next_url = None
                # decide whether each category needs its next page crawled
                if "tags" in response.url:
                    if not self.top_flag:
                        next_url = "http://www.qdaily.com/tags/tagmore/29/%s.json" % lastkey
                else:
                    if not self.com_flag:
                        next_url = "http://www.qdaily.com/categories/categorymore/18/%s.json" % lastkey
                # logger.info(next_url)
                if next_url:
                    yield scrapy.Request(next_url,callback=self.parse_next_page)
            else:
                logger.warning("can't find newslist")
Example #16
 def parse_topic(self, response):
     origin_url = response.url
     topic_url = origin_url.split("_", 1)[1].rsplit("_", 1)[0]
     pageindex = int(origin_url.rsplit("_", 1)[1].replace('.html', ''))
     catalogue = re.search('</a> -&gt; ([\w\W]+?) </i></h3>',
                           response.body).group(1).decode("gb2312")
     soup = BeautifulSoup(response.body, "lxml")
     news_list = soup.find_all('li')
     for news in news_list:
         news_date = news.find('i').text.split(' ')[1].replace(
             ']', '') if news.find('i') else None
         if news_date:
             news_url = news.find(
                 'a', href=re.compile(
                     'http://news.carnoc.com/list/*.?')).get('href')
             news_no = news_url.rsplit('/', 1)[1].replace('.html', '')
             title = news.find(
                 'a', href=re.compile(
                     'http://news.carnoc.com/list/*.?')).text.strip()
             abstract = news.find('div').text.strip()
             pic = news.find('div').find(
                 'img', src=re.compile('http://pic.carnoc.com/file/*.?')
             ).get('src') if news.find('div').find(
                 'img',
                 src=re.compile('http://pic.carnoc.com/file/*.?')) else None
             tags = news.find(
                 'div', class_='keywordslist').text.strip() if news.find(
                     'div', class_='keywordslist') else None
             item = NewsItem(
                 news_url=news_url,
                 news_date=news_date + ' 00:00:00',
                 title=title,
                 abstract=abstract,
                 news_no=news_no,
                 catalogue=catalogue,
                 pic=pic,
                 tags=tags,
             )
             item = judge_news_crawl(item)
             if item:
                 yield scrapy.Request(item["news_url"],
                                      callback=self.parse_news,
                                      meta={'item': item})
             else:
                 self.flag[topic_url] = pageindex
         else:
             logger.warning("carnoc:%s can't find news_date " % origin_url)
     if not self.flag[topic_url]:
         next_url = origin_url.rsplit(
             "_", 1)[0] + '_' + str(pageindex + 1) + '.html'
         yield scrapy.Request(next_url, callback=self.parse_topic)
Example #17
    def parse_topic(self,response):
        topic_url = response.url
        # print topic_url
        body = json.loads(response.body)
        news_list = body["data"]
        page = response.meta.get("page","1")
        topic_name = response.meta.get("topic_name",None)
        # http://m.iwshang.com/category/20 has no news
        if not news_list:
            self.flag[topic_url]=page
        for news in news_list:
            news_date_timestamp = news.get("published",None)
            struct_date = datetime.datetime.fromtimestamp(int(news_date_timestamp))
            news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")
            title = news.get("title",None)
            news_no = news.get("contentid",None)
            abstract = news.get("description",None)
            pic = news.get("thumb",None)
            news_url = news.get("url",None)                 # mobile news page URL
            referenceid = news.get("referenceid",None)      # PC-side id; differs from the mobile-side id
            pc_news_url = self.pc_news_url % referenceid    # PC news page URL
            item = NewsItem(
                news_date=news_date,
                title=title,
                news_no=news_no,
                abstract=abstract,
                pic=pic,
                news_url=pc_news_url,
                topic=topic_name
            )
            item = judge_news_crawl(item)
            if item:
                # yield item
                yield scrapy.Request(pc_news_url,callback=self.parse_news,meta={"item":item})
            else:
                self.flag[topic_url] = page
        if not self.flag[topic_url]:
            page = str(int(page)+1)
            post_data = {
                    "inslider":"0",
                    "page":page,
                    "pagesize":"10"
                }
            yield scrapy.FormRequest(
                    url=topic_url,
                    formdata=post_data,
                    callback=self.parse_topic,
                    meta={"page":page}
                )
Example #18
 def parse_topic(self, response):
     origin_url = response.url
     temp = origin_url.rsplit("/", 1)
     topic_url = temp[0]
     if temp[1] == "":
         pageindex = 1
     else:
         pageindex = temp[1].split("_", 1)[-1].split(".", 1)[0]
     soup = BeautifulSoup(response.body, "lxml")
     catalogue = soup.find("div", class_="arttitle").text.strip()
     news_list = soup.find("ul", class_="art_list mt11").find_all("li")
     for news in news_list:
         title_info = news.find("h5", class_="title")
         text_info = news.find("div", class_="text")
         news_date = text_info.find("span", class_="time").text
         news_date = "%s-%s-%s %s:00" % (time.strftime("%Y"),
                                         int(news_date[0:2]),
                                         int(news_date[3:5]), news_date[7:])
         author = text_info.find("span", class_="place").text.strip()
         if author == "":
             author = None
         abstract = text_info.find("p", class_="info").text.strip()
         pic = text_info.find("img").get("src") if text_info.find(
             "img") else None
         title = title_info.find("a").text.strip()
         news_url = title_info.find("a").get("href")
         temp = news_url.split("/")
         news_no = temp[-2] + "_" + temp[-1].split(".")[0]
         item = NewsItem(
             news_url=news_url,
             news_date=news_date,
             title=title,
             abstract=abstract,
             author=author,
             news_no=news_no,
             catalogue=catalogue,
             pic=pic,
         )
         item = judge_news_crawl(item)
         if item:
             yield scrapy.Request(item["news_url"],
                                  callback=self.parse_news,
                                  meta={'item': item})
         else:
             self.flag[topic_url] = pageindex
     if not self.flag[topic_url]:
         next_url = "%s/index_%s.html" % (topic_url, int(pageindex) + 1)
         yield scrapy.Request(next_url, callback=self.parse_topic)
Example #19
 def parse(self, response):
     url = response.url
     pageindex = url.rsplit("/", 1)[-1]
     soup = BeautifulSoup(response.body)
     wrap = soup.find("div", class_="wrap")
     news_list = wrap.find_all("li", class_="pbox clr")
     for news in news_list:
         origin_date = news.find("div", class_="time").text.strip()
         struct_date = datetime.datetime.strptime(origin_date,
                                                  "%Y / %m / %d\n%H:%M")
         news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")
         # if not self.flag or self.flag == pageindex:
         topic = news.find("div",
                           class_="img").a.string.strip() if news.find(
                               "div", class_="img") else None
         pic = news.find("img").get("src",
                                    None) if news.find("img") else None
         title = news.find("div", class_="tit").string if news.find(
             "div", class_="tit") else None
         abstract = news.find("div", class_="des").string if news.find(
             "div", class_="des") else None
         author = news.find("div", class_="aut").text.strip() if news.find(
             "div", class_="aut") else None
         news_url = news.find("div",
                              class_="tit").parent.get("href") if news.find(
                                  "div", class_="tit") else None
         comment_num = news.find("a", class_="cmt").text if news.find(
             "a", class_="cmt") else None
         item = NewsItem(topic=topic,
                         news_url=news_url,
                         pic=pic,
                         title=title,
                         abstract=abstract,
                         author=author,
                         comment_num=comment_num,
                         news_date=news_date)
         item = judge_news_crawl(item)
         if item:
             request = scrapy.Request(news_url,
                                      meta={"item": item},
                                      callback=self.parse_news)
             yield request
         else:
             self.flag = int(pageindex)
     if not self.flag:
         pageindex = int(pageindex) + 1
         next_url = self.next_url % pageindex
         yield scrapy.Request(next_url)
Example #20
    def parse_newest(self, response):
        soup = BeautifulSoup(response.body, "lxml")
        page = response.request.body.split('=')[-1]
        li = soup.find_all('li')
        if li:
            for news in li:
                news_date = news.find(class_="time").string[2:] if news.find(
                    class_="time") else None
                struct_date = datetime.datetime.strptime(
                    news_date, "%Y-%m-%d %H:%M")
                news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")
                title = news.find(class_="title").string if news.find(
                    class_="title") else None
                news_url = self.domain + news.find(class_="title").a.get(
                    "href", None) if news.find(class_="title") else None
                abstract = news.find(class_="info").string if news.find(
                    class_="info") else None
                pic = self.domain + news.find('img').get(
                    'src', None) if news.find('img') else None
                topic = news.find(class_="type").string if news.find(
                    class_="type") else None
                item = NewsItem(catalogue=u"最新内容",
                                title=title,
                                news_url=news_url,
                                abstract=abstract,
                                pic=pic,
                                topic=topic,
                                news_date=news_date)
                item = judge_news_crawl(item)
                if item:
                    request = scrapy.Request(news_url,
                                             callback=self.parse_news,
                                             dont_filter=True)
                    request.meta["item"] = item
                    yield request
                else:
                    self.flag = page
        else:
            logger.info("can't find news list")

        # next page
        if not self.flag:
            new_request = scrapy.FormRequest(
                self.start_url,
                formdata={'page': str(int(page) + 1)},
                callback=self.parse_newest)
            yield new_request
Example #21
    def parse(self, response):
        origin_url = response.url
        result = re.search(r"page=(\d+)", origin_url)
        # import pdb;pdb.set_trace()
        pageindex = result.group(1) if result else None

        soup = BeautifulSoup(response.body)
        news_list = soup.find_all("div", class_="article-item clearfix")
        for news in news_list:

            info = news.find("div", class_="item-push-info")
            author = info.text[:-3] if info else None
            news_date = info.span.get("data-time") if info.span else None  # unix timestamp
            struct_date = datetime.datetime.fromtimestamp(int(news_date))
            news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")

            title = news.find("a", class_="item-title").text if news.find(
                "a", class_="item-title") else None
            news_url = news.find("a", class_="item-title").get(
                "href", None) if news.find("a", class_="item-title") else None
            abstract = news.find("p", class_="item-desc").text if news.find(
                "p", class_="item-desc") else None
            pic = news.find("img").get("src",
                                       None) if news.find("img") else None
            id_result = re.search(r"/(\d+)\.html", news_url)
            news_no = id_result.group(1) if id_result else None

            item = NewsItem(abstract=abstract,
                            news_url=news_url,
                            pic=pic,
                            title=title,
                            author=author,
                            news_no=news_no,
                            news_date=news_date,
                            catalogue=u"中间推荐模板")
            item = judge_news_crawl(item)
            if item:
                request = scrapy.Request(news_url,
                                         meta={"item": item},
                                         callback=self.parse_news)
                yield request
            else:
                self.mid_flag = int(pageindex)
        if not self.mid_flag:
            pageindex = int(pageindex) + 1
            next_url = self.middle_next_url % pageindex
            yield scrapy.Request(next_url)
Example #22
 def parse_news(self,response):
     item = response.meta.get("item",NewsItem())
     pageindex = response.meta.get("pageindex",1)
     soup = BeautifulSoup(response.body, 'lxml')
     origin_date = soup.find("td", class_="time").text.strip()
     struct_date= datetime.datetime.strptime(origin_date,"%Y-%m-%d %H:%M")
     news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")
     content = soup.find("div", class_="lph-article-comView").text.strip() if soup.find("div", class_="lph-article-comView") else None
     item["news_date"]= news_date
     item["crawl_date"]= NOW
     item["content"] = content
     item["catalogue"] = u"最新资讯"
     item = judge_news_crawl(item)
     if item:
         yield item
     else:
         self.flag = int(pageindex)
Example #23
    def parse_next_page(self,response):
        data = json.loads(response.body)
        newslist = data['data']["feeds"]
        last_key = data['data']['last_key'] if data['data']['has_more'] else None
        for news in newslist:
            post = news.get("post",None)
            if post:
                pic = post.get("image",None)
                title = post.get("title",None)
                comment_num = post.get("comment_count",None)
                praise_count = post.get("praise_count",None)    #heart
                topic = post['category']['title']
                id = post.get("id",None)
                datatype = news.get("datatype",None)

                news_date= post.get("publish_time",None)
                if news_date:
                    news_date = news_date[:-6]
                # regular article
                if id and datatype:
                    news_url = self.domain+"%s/%s" % (datatype+"s",id)
                    item = NewsItem(title=title,news_url=news_url,pic=pic,topic=topic,news_date=news_date,comment_num=comment_num)
                    item['catalogue'] = "Top 15" if "tags" in response.url else u"商业"
                    item = judge_news_crawl(item)
                    if item :
                        request = scrapy.Request(news_url,callback=self.parse_article)
                        request.meta["item"] = item
                        yield request
                    else:
                        if "tags" in response.url:
                            self.top_flag = last_key
                        else:
                            self.com_flag = last_key
        # next page
        next_url = None
        if "tags" in response.url:
            if not self.top_flag:
                next_url = "http://www.qdaily.com/tags/tagmore/29/%s.json" % last_key
        else:
            if not self.com_flag:
                next_url = "http://www.qdaily.com/categories/categorymore/18/%s.json" % last_key
        if next_url:
            yield scrapy.Request(next_url,callback=self.parse_next_page)
Example #24
 def parse(self, response):
     origin_url = response.url
     if "page" not in origin_url:
         pageindex = 1
     else:
         pageindex = origin_url.split("&", 1)[0].rsplit("=", 1)[-1]
     soup = BeautifulSoup(response.body, "lxml")
     news_list = soup.find_all("article", class_="item-wrap cf")
     for news in news_list:
         news_date = news.find("span",
                               class_="timeago").text.strip() if news.find(
                                   "span", class_="timeago") else None
         if news_date:
             title = news.find("a", class_="title").text.strip()
             news_url = news.find("a", class_="title").get("href")
             abstract = news.find("div", class_="brief").text.strip()
             author = news.find("span", class_="name").text.strip()
             news_no = news_url.rsplit("/", 1)[-1].split(".")[0]
             item = NewsItem(news_date=news_date + ":00",
                             title=title,
                             abstract=abstract,
                             news_no=news_no,
                             news_url=news_url,
                             author=author)
             item = judge_news_crawl(item)
             if item:
                 yield scrapy.Request(item["news_url"],
                                      callback=self.parse_news,
                                      meta={"item": item})
             else:
                 self.flag = pageindex
         else:
             logger.warning("can't find news_date")
     if not self.flag:
         if pageindex == 1:
             next_url = self.next_url + soup.find("a",
                                                  class_="more").get("href")
         else:
             next_url = self.next_url + "/?page=" + str(
                 int(pageindex) + 1) + "&" + origin_url.split("&", 1)[-1]
         headers = {"X-Requested-With": "XMLHttpRequest"}
         yield scrapy.Request(
             next_url,
             headers=headers,
         )
Example #25
 def parse(self, response):
     origin_url = response.url
     pageindex = origin_url.rsplit("/")[-1]
     soup = BeautifulSoup(response.body, "lxml")
     news_list = soup.find_all("li", class_="mt24 pr")
     for news in news_list:
         news_date = news.find("a", href="javascript:;").text if news.find(
             "a", href="javascript:;") else None
         if news_date:
             news_url = news.find("p", class_="h1").a.get(
                 "href", None) if news.find("p", class_="h1") else None
             news_no = news_url.rsplit("/")[-1].split(".")[
                 0]  #http://www.nbd.com.cn/articles/2016-07-25/1025147.html
             title = news.find("p", class_="h1").text.strip() if news.find(
                 "p", class_="h1") else None
              # the abstract is truncated here, so fetch it from the article page instead
              # abstract = news.find("p",class_="news-p").text.strip() if news.find("p",class_="news-p") else None
             referer_web = news.find(
                 "div", class_="messge").contents[-2].a.text if news.find(
                     "div", class_="messge") else None
             referer_web = referer_web if referer_web != '' else None
             comment_num = soup.find("span",
                                     class_="fr").a.text if soup.find(
                                         "span", class_="fr") else None
             item = NewsItem(
                 news_date=news_date,
                 title=title,
                 # abstract=abstract,
                 referer_web=referer_web,
                 comment_num=comment_num,
                 news_no=news_no,
                 news_url=news_url)
             item = judge_news_crawl(item)
             if item:
                 yield scrapy.Request(item["news_url"],
                                      callback=self.parse_news,
                                      meta={"item": item})
             else:
                 self.flag = pageindex
         else:
             logger.warning("can't find news_date")
     if not self.flag:
         next_url = self.next_url % (int(pageindex) + 1)
         yield scrapy.Request(next_url)
Example #26
 def parse_topic(self,response):
     origin_url = response.url
     temp = origin_url.rsplit("/",1)
     topic_url = temp[0]
     if temp[1] == "":
         pageindex = 1
     else:
         pageindex = temp[1].split("_",1)[-1].split(".",1)[0]
     soup = BeautifulSoup(response.body,"lxml")
     catalogue = soup.find("div",class_ ="arttitle").text.strip()
     news_list = soup.find("ul",class_ = "art_list mt11").find_all("li")
     for news in news_list:
         title_info = news.find("h5",class_= "title")
         text_info = news.find("div",class_ = "text")
         news_date = text_info.find("span",class_ = "time").text
         news_date = "%s-%s-%s %s:00" % (time.strftime("%Y"),int(news_date[0:2]),int(news_date[3:5]),news_date[7:])
         author = text_info.find("span",class_ = "place").text.strip()
         if author == "":
             author = None
         abstract = text_info.find("p",class_ = "info").text.strip()
         pic = text_info.find("img").get("src") if text_info.find("img") else None
         title = title_info.find("a").text.strip()
         news_url = title_info.find("a").get("href")
         temp = news_url.split("/")
         news_no = temp[-2] + "_" + temp[-1].split(".")[0]
         item = NewsItem(
                 news_url =news_url,
                 news_date = news_date,
                 title = title,
                 abstract = abstract,
                 author = author,
                 news_no = news_no,
                 catalogue = catalogue,
                 pic = pic,
         )
         item = judge_news_crawl(item)
         if item:
             yield scrapy.Request(item["news_url"],callback=self.parse_news,meta={'item':item})
         else:
             self.flag[topic_url] = pageindex
     if not self.flag[topic_url]:
         next_url = "%s/index_%s.html" % (topic_url,int(pageindex) + 1)
         yield scrapy.Request(next_url,callback=self.parse_topic)
Example #27
    def parse(self,response):
        origin_url = response.url

        temp = origin_url.rsplit("/",2)
        year_month = temp[-2]
        day = temp[-1][7:9]
        pageindex = datetime.datetime(int(year_month[:4]), int(year_month[4:]), int(day)) - datetime.timedelta(days=1)
        pageindex = pageindex.strftime('%Y%m%d')

        soup = BeautifulSoup(response.body,"lxml")
        temp = soup.find("div",class_="mod newslist") if soup.find("div",class_="mod newslist") else None
        if temp:
            news_list = temp.find_all("li")
            for news in news_list:
                news_url = news.find("a").get("href")
                title = news.find("a").text.strip()
                news_no = news_url.rsplit("/",1)[-1].replace(".htm","")
                temp = news.find("span").text
                news_date = "%s-%s-%s %s:%s" % (time.strftime("%Y"),temp[0:2],temp[3:5],temp[7:],"00")
                item = NewsItem(news_date=news_date,
                                title=title,
                                news_no = news_no,
                                news_url=news_url
                                )
                item = judge_news_crawl(item)
                if item:
                    yield scrapy.Request(item["news_url"],callback=self.parse_news,meta={"item":item})
                else:
                    self.flag = pageindex
        else:
            logger.warning("can't find news_list")
        temp = soup.find_all("a",class_ = "f12") if soup.find("a",class_ = "f12") else None
        today_text_url = None
        if temp:
            for t in temp:
                if u"下一页" in t:
                    today_text_url = t.get("href")
        if today_text_url:
            yield scrapy.Request(today_text_url)
        else:
            if not self.flag:
                next_url = self.next_date_url % (pageindex[0:6],pageindex[6:])
                yield scrapy.Request(next_url)
Example #28
    def parse(self, response):
        origin_url = response.url
        if origin_url == Tech163Spider.start_urls[0]:
            pageindex = 1
        else:
            pageindex = int(origin_url.split("_")[2][0:-1])
        soup = BeautifulSoup(response.body, "lxml")
        news_list = soup.find("ul", class_="newsList").find_all("li")
        for news in news_list:
            temp = news.find("p", class_="sourceDate").text.strip()
            news_date = temp[-19:]
            if news_date:
                referer_web = temp[:-19]
                comment_num = news.find("a",
                                        class_="commentCount  ").text.strip()
                temp = news.find("div", class_="titleBar clearfix").find("a")
                news_url = temp.get("href")
                title = temp.text.strip()
                news_no = news_url.rsplit("/", 1)[-1][:-5]
                item = NewsItem(news_date=news_date,
                                title=title,
                                referer_web=referer_web,
                                comment_num=comment_num,
                                news_no=news_no,
                                news_url=news_url)
                item = judge_news_crawl(item)
                if item:
                    yield scrapy.Request(item["news_url"],
                                         callback=self.parse_news,
                                         meta={"item": item})
                else:
                    self.flag = pageindex
            else:
                logger.warning("can't find news_date")
        if not self.flag:
            pageindex = pageindex + 1
            if pageindex < 10:
                pageindex = '0' + str(pageindex)
            else:
                pageindex = str(pageindex)

            next_url = self.next_url + "_%s/" % (pageindex)
            yield scrapy.Request(next_url)
Example #29
    def parse(self,response):
        origin_url = response.url
        if origin_url == Tech163Spider.start_urls[0]:
            pageindex = 1
        else:
            pageindex = int(origin_url.split("_")[2][0:-1])
        soup = BeautifulSoup(response.body, "lxml")
        news_list = soup.find("ul",class_ = "newsList").find_all("li")
        for news in news_list:
            temp = news.find("p",class_ = "sourceDate").text.strip()
            news_date = temp[-19:]
            if news_date:
                referer_web = temp[:-19]
                comment_num = 0     #news.find("a", class_ = "commentCount  ").text.strip()   # the comment count needs a JS-rendering client to fetch
                temp = news.find("div",class_="titleBar clearfix").find("a")
                news_url = temp.get("href")
                title = temp.text.strip()
                news_no = news_url.rsplit("/",1)[-1][:-5]
                item = NewsItem(
                                news_date=news_date,
                                title=title,
                                referer_web=referer_web,
                                comment_num=comment_num,
                                news_no=news_no,
                                news_url=news_url
                                )
                item = judge_news_crawl(item)
                if item:
                    yield scrapy.Request(item["news_url"], callback=self.parse_news, meta={"item": item})
                else:
                    self.flag = pageindex
            else:
                logger.warning("can't find news_date")
        if not self.flag:
            pageindex = pageindex + 1
            if pageindex < 10:
                pageindex = '0' + str(pageindex)
            else:
                pageindex = str(pageindex)

            next_url = self.next_url + "_%s/" % (pageindex)
            yield scrapy.Request(next_url)
Example #30
 def parse(self, response):
     origin_url = response.url
     pageindex = origin_url.rsplit("&", 1)[0].split("=", 1)[-1]
     dejson = json.loads(response.body)
     news_list = dejson["data"]
     for news in news_list:
         news_date = time.strftime(
             "%Y-%m-%d %H:%M:%S",
             time.localtime(float(news["time_published"])))
         title = news["title"]
         abstract = news["summary"]
         read_num = news["number_of_reads"]
         comment_num = news["number_of_comments"]
         if len(news["hero_image"]["original"]) > 0:
             pic = news["hero_image"]["original"][0]["url"]
         else:
             pic = None
         news_url = news["short_url"]
         news_no = news_url.rsplit("/", 1)[-1].split(".", 1)[0]
         author = ",".join([t["username"] for t in news["authors"]])
         topic = ",".join([t["tag"] for t in news["tags"]])
         item = NewsItem(news_date=news_date,
                         title=title,
                         abstract=abstract,
                         comment_num=comment_num,
                         news_no=news_no,
                         news_url=news_url,
                         read_num=read_num,
                         pic=pic,
                         author=author,
                         topic=topic)
         item = judge_news_crawl(item)
         if item:
             yield scrapy.Request(item["news_url"],
                                  callback=self.parse_news,
                                  meta={"item": item})
         else:
             self.flag = pageindex
     if not self.flag:
         next_url = self.next_url % (int(pageindex) + 30)
         yield scrapy.Request(next_url)
Example #31
    def parse(self, response):
        origin_url = response.url
        result = re.search(r"page=(\d+)",origin_url)
        # import pdb;pdb.set_trace()
        pageindex = result.group(1) if result else None

        soup = BeautifulSoup(response.body)
        news_list = soup.find_all("div",class_="article-item clearfix")
        for news in news_list:

            info = news.find("div",class_="item-push-info")
            author = info.text[:-3] if info else None
            news_date = info.span.get("data-time") if info.span else None   # unix timestamp
            struct_date = datetime.datetime.fromtimestamp(int(news_date))
            news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")

            title =news.find("a",class_="item-title").text if news.find("a",class_="item-title") else None
            news_url =news.find("a",class_="item-title").get("href",None) if news.find("a",class_="item-title") else None
            abstract =news.find("p",class_="item-desc").text if news.find("p",class_="item-desc") else None
            pic = news.find("img").get("src",None) if news.find("img") else None
            id_result = re.search(r"/(\d+)\.html",news_url)
            news_no = id_result.group(1) if id_result else None

            item = NewsItem(abstract=abstract,
                            news_url=news_url,
                            pic=pic,
                            title=title,
                            author=author,
                            news_no=news_no,
                            news_date=news_date,
                            catalogue=u"中间推荐模板")
            item = judge_news_crawl(item)
            if item:
                request = scrapy.Request(news_url,meta={"item":item},callback=self.parse_news)
                yield request
            else:
                self.mid_flag = int(pageindex)
        if not self.mid_flag:
            pageindex = int(pageindex)+1
            next_url = self.middle_next_url % pageindex
            yield scrapy.Request(next_url)
Exemple #32
0
    def parse_quick(self,response):
        soup = BeautifulSoup(response.body, "lxml")
        news_list_inner = soup.find("div",class_="list-inner")
        next_timestamp=None

        news_list = news_list_inner.find_all("div",class_=re.compile(r"bulletin-item.*")) if news_list_inner else None
        # JSON page
        if not news_list:
            news_list = soup.find_all("div",class_=re.compile(r"bulletin-item.*"))
        for index,news in enumerate(news_list):
            origin_date = news.find("div",class_="news-time").get("data-time",None) if news.find("div",class_="news-time") else None
            next_timestamp = origin_date if index == len(news_list)-1 else None  # use the last article's timestamp as the next-page cursor
            struct_date = datetime.datetime.fromtimestamp(int(origin_date))
            news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")
            title = news.find("a",class_="item-title").text if news.find("a",class_="item-title") else None
            news_url = news.find("a",class_="item-title").get("href",None) if news.find("a",class_="item-title") else None
            pic = news.find("img").get("src",None) if news.find("img") else None
            content = news.find("div",class_="item-desc").text if news.find("div",class_="item-desc") else None
            id_result = re.search(r"/(\d+)\.html",news_url)
            news_no = id_result.group(1) if id_result else None
            item = NewsItem(content=content,
                            news_url=news_url,
                            pic=pic,
                            title=title,
                            news_no=news_no,
                            news_date=news_date,
                            catalogue=u"快报")
            item = judge_news_crawl(item)
            if item:
                request = scrapy.Request(news_url,meta={"item":item},callback=self.parse_quick_news)
                yield request
            else:
                self.quick_flag = int(self.quick_page)

        if not self.quick_flag:
            if next_timestamp:
                next_quick_url = self.quick_json_url % next_timestamp
                yield scrapy.Request(next_quick_url,callback=self.parse_quick)
            else:
                logger.warning("can't find next_timestamp,url is %s " % response)
Exemple #33
0
 def parse(self, response):
     pageindex = response.meta.get('pageindex', 1)
     data = json.loads(response.body)
     news_list = data['posts']
     articleCursor = data['articleCursor']
     for news in news_list:
         item = NewsItem()
         news_data = news.get("resource", None)
         if news_data:
             createdAt = news_data.get("createdAt", None)
             struct_date = datetime.datetime.utcfromtimestamp(
                 int(createdAt)) + datetime.timedelta(hours=8)
             news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")  # normalize datetime format
             item["news_date"] = news_date
             item["title"] = news_data.get("title", None)
             item["comment_num"] = news_data.get("commentCount", None)
             item["pic"] = news_data.get("imageUrl", None)
             item["news_no"] = news_data.get("id", None)
             item["title"] = news_data.get("title", None)
             news_url = news_data.get("url", None)
             item["news_url"] = news_url
             item["abstract"] = news_data.get("summary", None)
             item["author"] = news_data.get("user", None).get(
                 "screenName", None) if news_data.get("user",
                                                      None) else None
             item = judge_news_crawl(item)  # check whether the item is within the crawl window
             if item:
                 request = scrapy.Request(news_url,
                                          callback=self.parse_news)
                 request.meta['item'] = item
                 yield request
             else:
                 self.flag = pageindex
         else:
             logger.info("can't find search_result")
     # next page
     # if int(pageindex)<self.crawl_page:
     if not self.flag:
         next_url = self.page_url % str(articleCursor)
         yield scrapy.Request(next_url, callback=self.parse)
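
The NewsItem class populated throughout these examples is not part of the excerpts either. A plausible items.py declaration, inferred purely from the field names the spiders on this page assign (the real project may define more or different fields), would be:

import scrapy

class NewsItem(scrapy.Item):
    # Fields inferred from the spiders shown in these examples.
    title = scrapy.Field()
    news_url = scrapy.Field()
    news_no = scrapy.Field()
    news_date = scrapy.Field()
    crawl_date = scrapy.Field()
    abstract = scrapy.Field()
    content = scrapy.Field()
    author = scrapy.Field()
    comment_num = scrapy.Field()
    read_num = scrapy.Field()
    pic = scrapy.Field()
    topic = scrapy.Field()
    tags = scrapy.Field()
    catalogue = scrapy.Field()
    referer_web = scrapy.Field()
    referer_url = scrapy.Field()
    keywords = scrapy.Field()
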
Exemple #34
0
    def parse_news(self, response):
        #content,news_date,news_no,crawl_date,referer_web
        item = response.meta.get("item", NewsItem())
        pageindex = response.meta.get("pageindex", 1)
        soup = BeautifulSoup(response.body, "lxml")
        # news_date = item.get("news_date",None)
        # the exact publication time has to be scraped from the article page
        news_date = soup.find("span", class_="arial").text if soup.find(
            "span", class_="arial") else None
        #http://info.meadin.com/PictureNews/2938_1.shtml Exception
        if news_date:

            # struct_date = datetime.datetime.strptime(news_date,"%Y-%m-%d %H:%M:%S")
            # delta = self.end_now-struct_date
            # if delta.days == self.end_day:
            #     raise CloseSpider('today scrapy end')
            referer_web = list(soup.find(
                "p", class_="source").strings)[-1] if soup.find(
                    "p", class_="source") else None
            # scrape the article body
            art, content = None, None
            art = soup.find("div", class_="article js-article")
            if art:
                # strip the abstract from the body
                art.find("div", class_="intro").replace_with("")
                content = art.text.strip()
            news_no = response.url.split("/")[-1].split("_")[0]
            item["news_date"] = news_date
            item["content"] = content
            item["referer_web"] = referer_web
            item["crawl_date"] = NOW
            item["news_no"] = news_no
            item = judge_news_crawl(item)
            if item:
                yield item
            else:
                self.flag = pageindex
        else:
            logger.warning("can't find news_date.the url is %s" % response.url)
Exemple #35
0
 def parse(self,response):
     origin_url = response.url
     if "page" not in origin_url:
         pageindex = 1
     else:
         pageindex = origin_url.split("&",1)[0].rsplit("=",1)[-1]
     soup = BeautifulSoup(response.body,"lxml")
     news_list = soup.find_all("article", class_="item-wrap cf")
     for news in news_list:
         news_date = news.find("span" , class_ = "timeago").text.strip() if news.find("span" , class_ = "timeago") else None
         if news_date:
             title = news.find("a" ,class_ = "title").text.strip()
             news_url = news.find("a" ,class_ = "title").get("href")
             abstract = news.find("div",class_ = "brief").text.strip()
             author = news.find("span" , class_ = "name").text.strip()
             news_no = news_url.rsplit("/",1)[-1].split(".")[0]
             item = NewsItem(news_date=news_date + ":00",
                         title=title,
                         abstract=abstract,
                         news_no=news_no,
                         news_url=news_url,
                         author = author
                         )
             item = judge_news_crawl(item)
             if item:
                 yield scrapy.Request(item["news_url"], callback=self.parse_news, meta={"item": item})
             else:
                 self.flag = pageindex
         else:
             logger.warning("can't find news_date")
     if not self.flag:
         if pageindex == 1:
             next_url = self.next_url + soup.find("a" , class_ = "more").get("href")
         else:
             next_url = self.next_url + "/?page=" + str(int(pageindex) + 1) + "&" + origin_url.split("&",1)[-1]
         headers = {
             "X-Requested-With":"XMLHttpRequest"
             }
         yield scrapy.Request(next_url,headers=headers,)
Exemple #36
0
    def parse_news(self, response):
        driver = webdriver.Firefox()
        item = response.meta.get("item", NewsItem())
        page = response.meta.get("page", 1)
        index = response.meta.get("index", 0)
        origin_url = response.url
        no_res = re.search(r"/(\d+)\.html", origin_url)
        news_no = no_res.group(1) if no_res else None
        driver.get(origin_url)
        time.sleep(3)
        code = driver.page_source
        driver.quit()
        soup = BeautifulSoup(code, "lxml")
        # import pdb;pdb.set_trace()
        authors = soup("span", class_="author-name")
        referer_web = None
        for a in authors:
            if "来源".decode("utf-8") in a.text:
                referer_web = a.text[3:]
        news_date = soup.find("span", class_="article-time").text if soup.find(
            "span", class_="article-time") else None
        content = soup.find("div", id="article_content").get_text(
            strip=True) if soup.find("div", id="article_content") else None
        item["content"] = content
        item["news_date"] = news_date
        item["referer_web"] = referer_web
        item["crawl_date"] = NOW
        item["news_no"] = news_no
        item = judge_news_crawl(item, end_day=2)
        if item:
            yield item
        else:
            self.flag = page

        # defer yielding the next-page request until the last article on each page has been handled
        if index == 19 and not self.flag:
            next_page = page + 1
            next_page_url = self.next_page_url % next_page
            yield scrapy.Request(next_page_url, meta={"page": next_page})
Exemple #37
0
 def parse(self, response):
     origin_url = response.url
     if '_' not in origin_url:
         pageindex = '1'
         origin_url = origin_url.rsplit('.', 1)[0] + '_0' + '.html'
     else:
         pageindex = origin_url.rsplit('_', 1)[-1].replace('.html', '')
     soup = BeautifulSoup(response.body, "lxml")
     catalogue = soup.find('div', class_='nav_cur_index').find('span').text
     news_list = soup.find_all('div', class_='item_top')
     for news in news_list:
         news_date = news.find('span',
                               class_='time').text.strip() if news.find(
                                   'span', class_='time') else None
         if news_date:
             news_url = news.find('h2').find(
                 'a', href=re.compile('http://money.163.com/')).get('href')
             title = news.find('h2').find(
                 'a', href=re.compile('http://money.163.com/')).text
             item = NewsItem(news_date=news_date,
                             news_url=news_url,
                             title=title,
                             catalogue=catalogue)
             item = judge_news_crawl(item)
             if item:
                 yield scrapy.Request(item["news_url"],
                                      callback=self.parse_news,
                                      meta={'item': item})
             else:
                 self.flag = pageindex
         else:
             logger.warning("carnoc:%s can't find news_date " % origin_url)
     if not self.flag:
         pageindex = str(
             int(pageindex) +
             1) if int(pageindex) > 9 else '0' + str(int(pageindex) + 1)
         next_url = self.next_url % (pageindex)
         yield scrapy.Request(next_url)
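
These parse methods all share the same stop condition: as soon as judge_news_crawl rejects an item, the current page index is stored in self.flag, and the next-page request is only issued while self.flag is still falsy. A stripped-down sketch of that pattern follows; the spider name, URLs and CSS selector are placeholders, and NewsItem / judge_news_crawl are assumed to come from the project's own modules.

import scrapy

class ExampleNewsSpider(scrapy.Spider):
    # Hypothetical minimal spider illustrating the self.flag stop pattern.
    name = "example_news"
    next_url = "http://example.com/news/page/%s"
    start_urls = [next_url % 1]
    flag = None  # set to a page index once an out-of-window item is seen

    def parse(self, response):
        pageindex = int(response.url.rsplit("/", 1)[-1])
        for news_url in response.css("a.title::attr(href)").extract():
            # judge_news_crawl and NewsItem are the project's own helpers (assumed
            # imports); the detail-page request the real spiders issue is omitted.
            item = judge_news_crawl(NewsItem(news_url=news_url))
            if item:
                yield item
            else:
                self.flag = pageindex  # item too old: remember where to stop
        if not self.flag:
            yield scrapy.Request(self.next_url % (pageindex + 1))
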
Exemple #38
0
    def parse_news(self,response):
        # driver = webdriver.Chrome(self.chromedriver)
        driver = webdriver.Firefox(executable_path=self.chromedriver)
        item = response.meta.get("item",NewsItem())
        page = response.meta.get("page",1)
        index = response.meta.get("index",0)
        origin_url = response.url
        no_res = re.search(r"/(\d+)\.html",origin_url)
        news_no = no_res.group(1) if no_res else None
        driver.get(origin_url)
        time.sleep(3)
        code = driver.page_source
        driver.quit()
        soup = BeautifulSoup(code,"lxml")
        # import pdb;pdb.set_trace()
        authors = soup("span",class_="author-name")
        referer_web = None
        for a in authors:
            if "来源".decode("utf-8") in a.text:
                referer_web = a.text[3:]
        news_date = soup.find("span",class_="article-time").text if soup.find("span",class_="article-time") else None
        content = soup.find("div",id="article_content").get_text(strip=True) if soup.find("div",id="article_content") else None
        item["content"]=content
        item["news_date"]=news_date
        item["referer_web"]=referer_web
        item["crawl_date"]=NOW
        item["news_no"]=news_no
        item =judge_news_crawl(item,end_day=2)
        if item:
            yield item
        else:
            self.flag=page

        # defer yielding the next-page request until the last article on each page has been handled
        if index == 19 and not self.flag:
            next_page = page+1
            next_page_url = self.next_page_url % next_page
            yield scrapy.Request(next_page_url,meta={"page":next_page})
    def parse(self, response):
        keyword = response.meta.get("keyword", None)
        soup = BeautifulSoup(response.body, "lxml")
        for t in soup.find_all("div", attrs={"class": "result title"}):
            item = NewsItem()
            url = t.find("a").get("href")  # news URL
            title = t.find("a").text  # news title
            temp_list = t.find("div", attrs={
                "class": "c-title-author"
            }).text.split(u"\xa0\xa0")
            website_name = temp_list[0]  # name of the source website
            news_time = temp_list[1]
            #TODO: Some error
            now = datetime.datetime.now()
            if u"分钟前" in news_time:
                print news_time[:-3]
                struct_date = now - datetime.timedelta(
                    minutes=int(news_time[:-3]))
                news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")
            elif u"小时前" in news_time:
                print news_time[:-3]
                struct_date = now - datetime.timedelta(
                    hours=int(news_time[:-3]))
                news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")
            else:
                news_date = "%s-%s-%s %s:00" % (news_time[:4], news_time[5:7],
                                                news_time[8:10],
                                                news_time[12:])

            item['news_url'] = url
            item['title'] = title
            item['news_date'] = news_date
            item['referer_web'] = website_name
            item["crawl_date"] = NOW
            item["keywords"] = [keyword]
            item = judge_news_crawl(item)
            if item:
                yield item
Exemple #40
0
 def parse(self, response):
     origin_url = response.url
     #http://money.163.com/special/002526O5/transport_02.html
     search_result = re.search(r"_(\d+)\.",origin_url)
     # extract the page number from the URL
     pageindex = search_result.group(1) if search_result else 1
     soup = BeautifulSoup(response.body,"lxml")
     news_list = soup("div",class_="list_item clearfix")
     for news in news_list:
         news_date = news.find("span",class_="time").text if news.find("span",class_="time")else None
         title = news.find("h2").text if news.find("h2") else None
         news_url = news.find("h2").a.get("href",None) if news.find("h2") else None
         abstract = news.find("p").contents[0] if news.find("p") else None
         item = NewsItem(title=title,news_url=news_url,abstract=abstract,news_date=news_date)
         item = judge_news_crawl(item)  # check whether the item is within the crawl window
         if item:
             request = scrapy.Request(news_url,callback=self.parse_news,meta={"item":item})
             yield request
         else:
             self.flag = int(pageindex)
     if not self.flag:
         next_url = self.next_url % (int(pageindex) + 1)
         yield scrapy.Request(next_url)
    def parse(self,response):
        origin_url = response.url
        pageindex = origin_url.rsplit("/",3)[-3]
        soup = BeautifulSoup(response.body,"lxml")
        news_list = soup.find_all("div",class_=re.compile("zheng_list"))
        for news in news_list:
            news_date = news.find("div" ,class_ = "Function").text.strip() if news.find("div" ,class_ = "Function") else None
            if news_date:
                temp = news.find("a" , class_ = "t_css") if news.find("a" , class_ = "t_css") else None
                if not temp:
                    continue
                news_url = temp.get("href")
                title = temp.get("title")
                news_no = news_url.rsplit("/",1)[-1].split(".")[0]
                abstract = news.find("p").text.strip() if news.find("p") else None
                if len(news_date) == 10:
                    news_date = news_date + " 00:00:00"
                else:
                    news_date=news_date + ":00"
                item = NewsItem(news_date=news_date,
                                title=title,
                                abstract=abstract,
                                news_no=news_no,
                                news_url=news_url
                                )

                item = judge_news_crawl(item)
                if item:
                    yield scrapy.Request(item["news_url"], callback=self.parse_news, meta={"item": item})
                else:
                    self.flag = pageindex
            else:
                logger.warning("can't find news_date")
        nextDate = datetime.datetime(int(pageindex[:4]), int(pageindex[4:6]), int(pageindex[6:8])) - datetime.timedelta(days=1)
        if not self.flag:
            next_url = self.next_url % (nextDate.strftime('%Y%m%d'))
            yield scrapy.Request(next_url)
Exemple #42
0
    def parse_news(self,response):
        #content,news_date,news_no,crawl_date,referer_web
        item = response.meta.get("item",NewsItem())
        pageindex = response.meta.get("pageindex",1)
        soup = BeautifulSoup(response.body, "lxml")
        # news_date = item.get("news_date",None)
        # the exact publication time has to be scraped from the article page
        news_date = soup.find("span",class_="arial").text if soup.find("span",class_="arial") else None
        #http://info.meadin.com/PictureNews/2938_1.shtml Exception
        if news_date:

            # struct_date = datetime.datetime.strptime(news_date,"%Y-%m-%d %H:%M:%S")
            # delta = self.end_now-struct_date
            # if delta.days == self.end_day:
            #     raise CloseSpider('today scrapy end')
            referer_web = list(soup.find("p",class_="source").strings)[-1] if soup.find("p",class_="source") else None
            # scrape the article body
            art,content = None,None
            art = soup.find("div",class_="article js-article")
            if art:
                # strip the abstract from the body
                art.find("div",class_="intro").replace_with("")
                content = art.text.strip()
            news_no = response.url.split("/")[-1].split("_")[0]
            item["news_date"] = news_date
            item["content"] = content
            item["referer_web"] = referer_web
            item["crawl_date"] = NOW
            item["news_no"] = news_no
            item = judge_news_crawl(item)
            if item:
                yield item
            else:
                self.flag = pageindex
        else:
            logger.warning("can't find news_date.the url is %s" % response.url)
 def parse(self,response):
     pageindex = response.meta.get('pageindex', 1)
     data = json.loads(response.body)
     news_list = data['posts']
     articleCursor = data['articleCursor']
     for news in news_list:
         item = NewsItem()
         news_data = news.get("resource",None)
         if news_data:
             createdAt = news_data.get("createdAt", None)
             struct_date = datetime.datetime.utcfromtimestamp(int(createdAt))+ datetime.timedelta(hours=8)
             news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")  # normalize datetime format
             item["news_date"]=news_date
             item["title"]=news_data.get("title", None)
             item["comment_num"]=news_data.get("commentCount", None)
             item["pic"]=news_data.get("imageUrl", None)
             item["news_no"]=news_data.get("id", None)
             item["title"]=news_data.get("title", None)
             news_url = news_data.get("url", None)
             item["news_url"] = news_url
             item["abstract"] = news_data.get("summary", None)
             item["author"] = news_data.get("user", None).get("screenName", None) if news_data.get("user", None) else None
             item = judge_news_crawl(item)  # check whether the item is within the crawl window
             if item:
                 request = scrapy.Request(news_url,callback=self.parse_news)
                 request.meta['item'] = item
                 yield request
             else:
                 self.flag = pageindex
         else:
             logger.info("can't find search_result")
     # next page
     # if int(pageindex)<self.crawl_page:
     if not self.flag:
         next_url = self.page_url % str(articleCursor)
         yield scrapy.Request(next_url,callback=self.parse)
 def parse(self, response):
     url = response.url
     pageindex = url.rsplit("/",1)[-1]
     soup = BeautifulSoup(response.body, "lxml")
     wrap = soup.find("div",class_="wrap")
     news_list = wrap.find_all("li",class_="pbox clr")
     for news in news_list:
         origin_date = news.find("div",class_="time").text.strip()
         struct_date = datetime.datetime.strptime(origin_date,"%Y / %m / %d\n%H:%M")
         news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")
         # if not self.flag or self.flag == pageindex:
         topic = news.find("div",class_="img").a.string.strip() if news.find("div",class_="img") else None
         pic = news.find("img").get("src",None) if news.find("img") else None
         title = news.find("div",class_="tit").string if news.find("div",class_="tit") else None
         abstract = news.find("div",class_="des").string if news.find("div",class_="des") else None
         author = news.find("div",class_="aut").text.strip() if news.find("div",class_="aut") else None
         news_url = news.find("div",class_="tit").parent.get("href") if news.find("div",class_="tit") else None
         comment_num = news.find("a",class_="cmt").text if news.find("a",class_="cmt") else None
         item = NewsItem(topic=topic,
                         news_url=news_url,
                         pic=pic,
                         title=title,
                         abstract=abstract,
                         author=author,
                         comment_num=comment_num,
                         news_date=news_date)
         item = judge_news_crawl(item)
         if item:
             request = scrapy.Request(news_url,meta={"item":item},callback=self.parse_news)
             yield request
         else:
             self.flag = int(pageindex)
     if not self.flag:
         pageindex = int(pageindex)+1
         next_url = self.next_url % pageindex
         yield scrapy.Request(next_url)
Exemple #45
0
 def parse_topic(self,response):
     origin_url = response.url
     topic_url = origin_url.split("_",1)[1].rsplit("_",1)[0]
     pageindex = int(origin_url.rsplit("_",1)[1].replace('.html',''))
     catalogue = re.search('</a> -&gt; ([\w\W]+?) </i></h3>',response.body).group(1).decode("gb2312")
     soup = BeautifulSoup(response.body,"lxml")
     news_list = soup.find_all('li')
     for news in news_list:
         news_date = news.find('i').text.split(' ')[1].replace(']','') if news.find('i') else None
         if news_date:
             news_url = news.find('a',href = re.compile('http://news.carnoc.com/list/*.?')).get('href')
             news_no = news_url.rsplit('/',1)[1].replace('.html','')
             title = news.find('a',href = re.compile('http://news.carnoc.com/list/*.?')).text.strip()
             abstract = news.find('div').text.strip()
             pic = news.find('div').find('img',src = re.compile('http://pic.carnoc.com/file/*.?')).get('src') if news.find('div').find('img',src = re.compile('http://pic.carnoc.com/file/*.?')) else None
             tags = news.find('div',class_ = 'keywordslist').text.strip() if news.find('div',class_ = 'keywordslist') else None
             item = NewsItem(
                 news_url = news_url,
                 news_date = news_date + ' 00:00:00',
                 title = title,
                 abstract = abstract,
                 news_no = news_no,
                 catalogue = catalogue,
                 pic = pic,
                 tags = tags,
             )
             item = judge_news_crawl(item)
             if item:
                 yield scrapy.Request(item["news_url"],callback=self.parse_news,meta={'item':item})
             else:
                 self.flag[topic_url] = pageindex
         else:
             logger.warning("carnoc:%s can't find news_date " % origin_url)
     if not self.flag[topic_url]:
         next_url = origin_url.rsplit("_",1)[0] + '_' + str(pageindex + 1) + '.html'
         yield scrapy.Request(next_url,callback=self.parse_topic)
Exemple #46
0
    def parse_newest(self, response):
        soup = BeautifulSoup(response.body,"lxml")
        page = response.request.body.split('=')[-1]
        li = soup.find_all('li')
        if li:
            for news in li :
                news_date = news.find(class_="time").string[2:] if news.find(class_="time") else None
                struct_date = datetime.datetime.strptime(news_date,"%Y-%m-%d %H:%M")
                news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")
                title = news.find(class_="title").string if news.find(class_="title") else None
                news_url = self.domain+news.find(class_="title").a.get("href",None) if news.find(class_="title") else None
                abstract = news.find(class_="info").string if news.find(class_="info") else None
                pic = self.domain+news.find('img').get('src',None) if news.find('img') else None
                topic = news.find(class_="type").string if news.find(class_="type") else None
                item = NewsItem(catalogue=u"最新内容",
                                title=title,
                                news_url=news_url,
                                abstract=abstract,
                                pic=pic,
                                topic=topic,
                                news_date=news_date)
                item = judge_news_crawl(item)
                if item:
                    request = scrapy.Request(news_url,callback=self.parse_news,dont_filter=True)
                    request.meta["item"] = item
                    yield request
                else:
                    self.flag = page
        else:
            logger.info("can't find news list")


        # next page
        if not self.flag:
            new_request = scrapy.FormRequest(self.start_url,formdata={'page':str(int(page)+1)},callback=self.parse_newest)
            yield new_request
Exemple #47
0
    def parse(self, response):
        soup = BeautifulSoup(response.body, "lxml")
        # get the next-page link
        origin_url = response.url
        next_page_number = 2
        if "page" in origin_url:
            next_page_number = int(origin_url.rsplit('/')[-2])+1
        search = soup.find("section",id="omc-main")
        if search:
            news_list = search.find_all("article")
            if news_list:
                for news in news_list:
                    abstract,author,news_date = None,None,None
                    #find date and author
                    if news.find("p",class_="omc-date-time-one"):
                        date_aut = list(news.find("p",class_="omc-date-time-one").strings)
                        author = date_aut[1]
                        news_date = date_aut[2][5:]
                        struct_date = datetime.datetime.strptime(news_date,"%Y-%m-%d")
                        news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")
                    title = news.h2.text if news.h2 else None
                    news_url = news.h2.a.get("href",None) if news.h2.a else None
                    news_no = news_url.rsplit("/")[-2]
                    topic_group = news.find("h3",class_="omc-blog-one-cat")
                    topics = []
                    if topic_group:
                        for topic in topic_group.find_all("a"):
                            topics.append(topic.string)

                    # the strings may have gaps (whitespace) in between
                    # topic = list(news.find("h3",class_="omc-blog-one-cat").strings) if news.find("h3",class_="omc-blog-one-cat") else None

                    if news.find("p",class_="omc-blog-one-exceprt"):
                        abstract = news.find("p",class_="omc-blog-one-exceprt").text.strip()
                    pic = news.img.get("src",None) if news.img else None
                    # build the news item and hand it off to the content parser
                    item = NewsItem(news_url=news_url,
                                    title=title,
                                    abstract=abstract,
                                    pic=pic,
                                    author=author,
                                    news_date=news_date,
                                    crawl_date=NOW,
                                    news_no=news_no,
                                    topic=topics)
                    item = judge_news_crawl(item,end_day=END_DAY+1)
                    if item:
                        request =  scrapy.Request(news_url,callback=self.parse_news)
                        request.meta["item"]=item
                        if news_url:
                            yield request
                        else:
                            logger.warning("can't find news url")
                    else:
                        self.flag = next_page_number - 1

            else:
                logger.info("can't find news list")

        else:
            logger.info("can't find main container")

        if not self.flag:
            next_url = self.page_url % next_page_number
            yield scrapy.Request(next_url,callback=self.parse)
Exemple #48
0
    def parse(self, response):
        soup = BeautifulSoup(response.body, "lxml")
        # get the next-page link
        origin_url = response.url
        next_page_number = 2
        if "page" in origin_url:
            next_page_number = int(origin_url.rsplit('/')[-2]) + 1
        search = soup.find("section", id="omc-main")
        if search:
            news_list = search.find_all("article")
            if news_list:
                for news in news_list:
                    abstract, author, news_date = None, None, None
                    #find date and author
                    if news.find("p", class_="omc-date-time-one"):
                        date_aut = list(
                            news.find("p", class_="omc-date-time-one").strings)
                        author = date_aut[1]
                        news_date = date_aut[2][5:]
                        struct_date = datetime.datetime.strptime(
                            news_date, "%Y-%m-%d")
                        news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")
                    title = news.h2.text if news.h2 else None
                    news_url = news.h2.a.get("href",
                                             None) if news.h2.a else None
                    news_no = news_url.rsplit("/")[-2]
                    topic_group = news.find("h3", class_="omc-blog-one-cat")
                    topics = []
                    if topic_group:
                        for topic in topic_group.find_all("a"):
                            topics.append(topic.string)

                    # the strings may have gaps (whitespace) in between
                    # topic = list(news.find("h3",class_="omc-blog-one-cat").strings) if news.find("h3",class_="omc-blog-one-cat") else None

                    if news.find("p", class_="omc-blog-one-exceprt"):
                        abstract = news.find(
                            "p", class_="omc-blog-one-exceprt").text.strip()
                    pic = news.img.get("src", None) if news.img else None
                    # build the news item and hand it off to the content parser
                    item = NewsItem(news_url=news_url,
                                    title=title,
                                    abstract=abstract,
                                    pic=pic,
                                    author=author,
                                    news_date=news_date,
                                    crawl_date=NOW,
                                    news_no=news_no,
                                    topic=topics)
                    item = judge_news_crawl(item, end_day=END_DAY + 1)
                    if item:
                        request = scrapy.Request(news_url,
                                                 callback=self.parse_news)
                        request.meta["item"] = item
                        if news_url:
                            yield request
                        else:
                            logger.warning("can't find news url")
                    else:
                        self.flag = next_page_number - 1

            else:
                logger.info("can't find news list")

        else:
            logger.info("can't find main container")

        if not self.flag:
            next_url = self.page_url % next_page_number
            yield scrapy.Request(next_url, callback=self.parse)
Exemple #49
0
    def parse_news(self,response):
        soup = BeautifulSoup(response.body,"lxml")
        origin_url  = response.url
        item = response.meta.get("item",NewsItem())
        news_index = response.meta.get("news_index",1)  # current page of the article body
        pageindex = response.meta.get("pageindex",1)  # list-page index the crawl has reached
        # first page of the article; the body may continue on further pages
        if news_index == 1:
            # if soup.find("span",class_="date") == None:
            #     import pdb;pdb.set_trace()
            news_date = soup.find("span",class_="date").get_text(strip=True) if soup.find("span",class_="date") else None
            #2016.07.28 22:12:52
            struct_date = datetime.datetime.strptime(news_date,"%Y.%m.%d %H:%M:%S")
            news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")
            # comment_num is an empty string (u'') when there are no comments yet
            comment_text = soup.find("span",id="comment_num").text if soup.find("span",id="comment_num") else None
            if comment_text == u'':
                comment_num = 0
            else:
                comment_num = int(comment_text)

            # the body may span several pages. TODO: the body contains JS that needs to be replaced.
            content_txt = soup.find("div",class_="content_txt")
            content = content_txt.get_text(strip=True) if content_txt else None
            referer_web = soup.find("span",id="source_baidu").a.get_text(strip=True) if soup.find("span",id="source_baidu") else None
            referer_url = soup.find("span",id="source_baidu").a.get("href") if soup.find("span",id="source_baidu") else None
            item["content"] = content
            item["news_date"] =news_date
            item["comment_num"] =comment_num
            item["crawl_date"] =NOW
            item["referer_web"] =referer_web
            item["referer_url"] =referer_url
            catalogue = item["catalogue"]
            item = judge_news_crawl(item)
            if item:
                if u"下一页" in content_txt.find("div",class_="page").text:
                #替换成下一页 格式:http://www.techweb.com.cn/world/2016-07-26/2365804_2.shtml


                    news_next_page = str(int(news_index)+1)
                    news_next_url = re.sub(r'\.shtml','_%s.shtml' % news_next_page,origin_url)
                    yield scrapy.Request(news_next_url,callback=self.parse_news,meta={"news_index":news_next_page,"item":item})
                else:
                    yield item
            else:
                if "原创".decode("utf-8") == catalogue:
                    self.yuanchuang_flag=pageindex
                else:
                    self.news_flag=pageindex
        # subsequent pages of the article body
        else:
            content_txt = soup.find("div",class_="content_txt")
            content = content_txt.get_text(strip=True)
            item["content"] += u"\n第%s页\n%s" % (news_index,content)
            if item:
                # a disabled next-page control means there are no further pages
                if not content_txt.find("div",class_="page").find("span",class_="disabled"):
                    # rewrite the URL into the next-page form, e.g. http://www.techweb.com.cn/world/2016-07-26/2365804_2.shtml
                    news_next_url = re.sub(r'_.+?\.shtml','_%s.shtml' % str(int(news_index)+1),origin_url)
                    yield scrapy.Request(news_next_url,callback=self.parse_news,meta={"news_index":str(int(news_index)+1),"item":item})
                else:
                    yield item
Exemple #50
0
 def parse_index(self, response):
     """
     Fetch the article list of a WeChat official account.
     :param response:    the official account's profile page
     :return:
     """
     weixin_id = response.meta.get("weixin_id", None)
     msg = re.search(r"var msgList =([\W\w]+?)seajs.use",response.body).group(1).strip()[:-1]  # extract the account's article list JSON
     msg_dict = json.loads(msg)
     weixin_name = response.meta.get("name", None)
     article_list = []
     for u in msg_dict["list"]:
         news_date = u["comm_msg_info"]["datetime"]  #某天所有发布的文章的时间戳
         news_date = time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(int(news_date)))  #时间转换
         title = u["app_msg_ext_info"]["title"] #某天最后跟新的文章的标题
         news_url = "http://mp.weixin.qq.com" + u["app_msg_ext_info"]["content_url"] #某天最后跟新的文章的url
         pic = u["app_msg_ext_info"]["cover"]    #某天最后跟新的文章的图片
         abstract = u["app_msg_ext_info"]["digest"] #某天最后跟新的文章的摘要
         author = u["app_msg_ext_info"]["author"] # 某天最后跟新的文章的作者
         fileid = u["app_msg_ext_info"]["fileid"] # 某天最后跟新的文章的fileid
         source_url = u["app_msg_ext_info"]["source_url"] # 某天最后跟新的文章的来源信息
         article = {"weixin_id":weixin_id,"weixin_name":weixin_name,"news_date":news_date,"title":title,"news_url":news_url.replace("&amp;","&"),"pic":pic,"abstract":abstract,"author":author,"fileid":fileid,"source_url":source_url}
         article_list.append(article)
         item = WechatItem(
             weixin_id = weixin_id,
             weixin_name = weixin_name,
             news_date = news_date,
             title = title,
             news_url = news_url.replace("&amp;","&"),
             pic = pic,
             abstract = abstract,
             author = author,
             fileid = fileid,
             source_url = source_url
         )
         item = judge_news_crawl(item)
         if item:
             time.sleep(random.randint(2,5))
             print item['news_url']
             yield scrapy.Request(item['news_url'],callback=self.parse_news, meta={"item": item})
         for c in u["app_msg_ext_info"]["multi_app_msg_item_list"]:
             title = c["title"] #某天的文章的标题
             news_url = "http://mp.weixin.qq.com" + c["content_url"] #某天的文章的url
             pic = c["cover"]    #某天的文章的图片
             abstract = c["digest"] #某天的文章的摘要
             author = c["author"] # 某天的文章的作者
             fileid = c["fileid"] # 某天的文章的fileid
             source_url = c["source_url"] # 某天的文章的来源url
             article = {"weixin_id":weixin_id,"weixin_name":weixin_name,"news_date":news_date,"title":title,"news_url":news_url.replace("&amp;","&"),"pic":pic,"abstract":abstract,"author":author,"fileid":fileid,"source_url":source_url}
             article_list.append(article)
             item = WechatItem(
                 weixin_id = weixin_id,
                 weixin_name = weixin_name,
                 news_date = news_date,
                 title = title,
                 news_url = news_url.replace("&amp;","&"),
                 pic = pic,
                 abstract = abstract,
                 author = author,
                 fileid = fileid,
                 source_url = source_url
             )
             item = judge_news_crawl(item)
             if item:
                 print item['news_url']
                 time.sleep(random.randint(2,5))
                 yield scrapy.Request(item['news_url'],callback=self.parse_news, meta={"item": item})
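
Like NewsItem, the WechatItem declaration is not included in the excerpt; based only on the fields parse_index assigns above, it presumably looks roughly like this (the actual definition may differ):

import scrapy

class WechatItem(scrapy.Item):
    # Fields inferred from parse_index above.
    weixin_id = scrapy.Field()
    weixin_name = scrapy.Field()
    news_date = scrapy.Field()
    title = scrapy.Field()
    news_url = scrapy.Field()
    pic = scrapy.Field()
    abstract = scrapy.Field()
    author = scrapy.Field()
    fileid = scrapy.Field()
    source_url = scrapy.Field()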