Example #1
    def parse_news(self, response):
        item = response.meta.get("item", NewsItem())
        soup = BeautifulSoup(response.body, "lxml")
        topic, referer_web, author, news_date = None, None, None, None
        # article_type block: topic, source, author, publish time
        article_type = soup.find("div", class_="article-type")
        if article_type:
            topic = article_type.a.string  # topic
            span_list = article_type("span")
            if span_list:
                referer_web = span_list[0].text  # source site
                author = span_list[1].text  # author
                news_date = span_list[2].text  # publish time
        # content
        content = soup.find("div", class_="article-content").text if soup.find(
            "div", class_="article-content") else None
        # comment count
        comment_num = soup.find(
            "div", class_="jl-comment-title").span.string if soup.find(
                "div", class_="jl-comment-title") else None
        # news ID
        news_no = response.url.split("/")[-1][:-5]

        item['content'] = content
        item['referer_web'] = referer_web
        item['author'] = author
        item['news_date'] = news_date
        item['comment_num'] = comment_num
        item['crawl_date'] = NOW
        item['topic'] = topic
        item['news_no'] = news_no

        yield item
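All of the examples below populate a NewsItem and stamp it with a module-level NOW value; neither definition is part of the snippets. A minimal sketch of what they might look like (the field list and the timestamp format are assumptions inferred from the usage above):

# items.py (hypothetical) -- field names inferred from the parse methods in these examples
import datetime
import scrapy

class NewsItem(scrapy.Item):
    title = scrapy.Field()
    topic = scrapy.Field()
    abstract = scrapy.Field()
    content = scrapy.Field()
    author = scrapy.Field()
    referer_web = scrapy.Field()   # name of the source site
    referer_url = scrapy.Field()   # link to the source article
    news_date = scrapy.Field()     # publish time, "%Y-%m-%d %H:%M:%S"
    crawl_date = scrapy.Field()    # time the item was crawled
    news_url = scrapy.Field()
    news_no = scrapy.Field()
    comment_num = scrapy.Field()
    read_num = scrapy.Field()
    catalogue = scrapy.Field()
    pic = scrapy.Field()
    tags = scrapy.Field()
    keywords = scrapy.Field()

# assumed to be taken once at start-up and shared by the spiders
NOW = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")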
Example #2
    def parse_news(self,response):
        item = response.meta.get("item",NewsItem())
        soup = BeautifulSoup(response.body.decode("gbk"),"lxml")
        referer_web = soup.find("span",bosszone="jgname").text if soup.find("span",bosszone="jgname") else None
        referer_url = soup.find("span",bosszone="jgname").get("href")  if soup.find("span",bosszone="jgname") else None
        abstract = soup.find("p",class_="Introduction").text.strip() if soup.find("p",class_="Introduction") else None
        temp = soup.find("p",align="center") if soup.find("p",align="center") else None
        if temp:
            pic = temp.find("img").get("src") if temp.find("img") else None
        else:
            pic = None
        author = soup.find("span",class_="auth").text if soup.find("span",class_="auth") else None
        crawl_date = NOW
        catalogue = "热点推荐"
        comment_num = soup.find("em",id="top_count").text.strip() if soup.find("em",id="top_count") else None
        temp = soup.find_all("p",style="TEXT-INDENT: 2em") if soup.find("p",style="TEXT-INDENT: 2em") else None
        if temp:
            content = "\n\n".join([ t.text.strip() for t in temp])
        else:
            content = None

        item["referer_web"] = referer_web
        item["referer_url"] = referer_url
        item["abstract"] = abstract
        item["pic"] = pic
        item["author"] = author
        item["crawl_date"] = crawl_date
        item["catalogue"] = catalogue
        item["comment_num"] = comment_num
        item["content"] = content
        item['crawl_date'] = NOW
        item_keywords = judge_key_words(item)  # get the keywords that match this item
        if item_keywords:   # only keep items that matched a keyword
            item["keywords"] = item_keywords
            yield item
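Example #2 also filters items through judge_key_words, which is not included in these snippets. A minimal sketch of such a helper, assuming a module-level KEYWORDS list and that matching runs over the title, abstract, and content fields:

# hypothetical helper -- the real judge_key_words is not shown in these examples
KEYWORDS = [u"placeholder-keyword-1", u"placeholder-keyword-2"]

def judge_key_words(item):
    """Return the keywords that occur in the item's text fields (empty list if none)."""
    text = u" ".join(item.get(field) or u"" for field in ("title", "abstract", "content"))
    return [kw for kw in KEYWORDS if kw in text]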
Example #3
 def parse(self,response):
     origin_url = response.url
     if 'index' not in origin_url:
         pageindex = 0
     else:
         pageindex = origin_url.rsplit('index_',1)[-1].replace('.html','')
         pageindex = int(pageindex)
     soup = BeautifulSoup(response.body.decode('utf8'),"lxml")
     news_list = soup.find_all('li',style = 'overflow:hidden;')
     for news in news_list:
         news_date = news.find('span').text if news.find('span') else None
         if news_date :
             news_url = news.find('a').get('href')
             news_no = news_url.rsplit('/',1)[-1].replace('.html','') # http://www.caac.gov.cn/XWZX/MHYW/201607/t20160726_39146.html
             title = news.find('a',href = re.compile('http://www.caac.gov.cn/XWZX/MHYW/')).text.strip() if news.find('a',href = re.compile('http://www.caac.gov.cn/XWZX/MHYW/')) else None
             item = NewsItem(
                 news_date = news_date + ' 00:00:00',
                 title = title,
                 news_url = news_url,
                 news_no = news_no
             )
             item = judge_news_crawl(item)
             if item:
                 yield scrapy.Request(item["news_url"],callback=self.parse_news,meta={"item":item})
             else:
                 self.flag = pageindex
         else:
             logger.warning("mhyw can't find news_date")
     if not self.flag:
         next_url = self.next_url % (str(pageindex + 1 ))
         yield scrapy.Request(next_url)
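Several of these spiders (Examples #3, #4, #9, #14, #16, #22, #28, #30) run items through judge_news_crawl before following the article link. The helper itself is not shown; the sketch below assumes it simply drops items whose news_date falls outside a configured crawl window and returns the item otherwise:

# hypothetical helper -- the real judge_news_crawl is not included in the snippets
import datetime

END_DAY = 3   # assumed crawl window, in days

def judge_news_crawl(item):
    """Return the item if its news_date is within the crawl window, otherwise None."""
    news_date = item.get("news_date")
    if not news_date:
        return item   # no date yet: keep the item and let parse_news fill it in
    struct_date = datetime.datetime.strptime(news_date, "%Y-%m-%d %H:%M:%S")
    if (datetime.datetime.now() - struct_date).days > END_DAY:
        return None
    return item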
Example #4
 def parse_topic(self,response):
     origin_url = response.url
     if "_" not in origin_url:
         pageindex = 0
         topic_url = origin_url.rsplit(".",1)[0]
     else:
         temp = origin_url.rsplit("_",1)
         pageindex = temp[-1].split(".",1)[0]
         topic_url = temp[0]
     soup = BeautifulSoup(response.body,"lxml")
     catalogue =  soup.find("a",class_ = "blue CurrChnlCls").get("title").strip()
     news_list = soup.find("div", class_ = "lie_main_m").find_all("li")
     for news in news_list:
         news_date = news.find("span").text.strip() + " 00:00:00"
         title = news.find("a").text.strip()[10:]
         news_url = topic_url.rsplit("/",1)[0] + news.find("a").get("href")[1:]
         news_no = news_url.rsplit("/",1)[-1].split(".")[0]
         item = NewsItem(
                     news_date = news_date,
                     news_url =news_url,
                     title = title,
                     news_no = news_no,
                     catalogue = catalogue,
         )
         item = judge_news_crawl(item)
         if item:
             yield scrapy.Request(item["news_url"],callback=self.parse_news,meta={'item':item})
         else:
             self.flag[topic_url] = pageindex
     if not self.flag[topic_url]:
         next_url = topic_url + "_" + str(int(pageindex) + 1) + ".shtml"
         yield scrapy.Request(next_url,callback=self.parse_topic)
Example #5
 def parse(self, response):
     url = response.url
     pageindex = url.rsplit("/",1)[-1]
     soup = BeautifulSoup(response.body, "lxml")
     wrap = soup.find("div",class_="lph-pageList index-pageList")
     news_list = wrap.find_all("li")
     for news in news_list:
         topic = news.find("div",class_="img").a.string.strip() if news.find("div",class_="img") else None
         pic = news.find("img").get("data-original",None) if news.find("img") else None
         title = news.find("h3").text.strip() if news.find("h3") else None
         abstract = news.find("div",class_="des").text.strip() if news.find("div",class_="des") else None
         author = news.find("a",class_="aut").text.strip() if news.find("a",class_="aut") else None
         news_url = news.find("h3").a.get("href") if news.find("h3") else None
         tag_list = news.find("div", class_="tags").find_all("a")
         tags = [i.text for i in tag_list] if tag_list else None
         item = NewsItem(topic=topic,
                         news_url=news_url,
                         pic=pic,
                         title=title,
                         abstract=abstract,
                         author=author,
                         tags=tags,
                         )
         request = scrapy.Request(news_url,meta={"item":item,"pageindex":pageindex},callback=self.parse_news)
         yield request
     if not self.flag:
         pageindex = int(pageindex)+1
         next_url = self.next_url % pageindex
         yield scrapy.Request(next_url)
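Note how every field above repeats the same find twice in the "... if ... else None" guard. A small optional helper (not part of the original spiders) can express that guard once:

def find_text(tag, *args, **kwargs):
    """Return the stripped text of the first matching element, or None if there is no match."""
    found = tag.find(*args, **kwargs)
    return found.text.strip() if found else None

# e.g. title = find_text(news, "h3")
# instead of: news.find("h3").text.strip() if news.find("h3") else None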
Example #6
 def parse_news(self, response):
     item = response.meta.get("item", NewsItem())
     soup = BeautifulSoup(response.body.decode("gbk"))
     referer_web = soup.find('span',
                             id='source_baidu').find('a').text.strip()
     referer_url = soup.find('span',
                             id='source_baidu').find('a').get('href')
     author = soup.find(
         'span', id='author_baidu').find('a').text.strip() if soup.find(
             'span', id='author_baidu').find('a') else None
     crawl_date = NOW
     news_date = soup.find('span', id='pubtime_baidu').text.strip()
     comment_num = soup.find(
         'span', class_='pltit').find('b').text.strip() if soup.find(
             'span', class_='pltit') else 0
     zan = soup.find('span', class_='zan-plus').text.strip() if soup.find(
         'span', class_='zan-plus') else None
     read_num = int(comment_num) + int(zan) if zan is not None else int(comment_num)
     content = soup.find("div", id="newstext").get_text(
         strip=True) if soup.find("div", id="newstext") else None
     item['referer_web'] = referer_web
     item['content'] = content
     item['referer_url'] = referer_url
     item['author'] = author
     item['crawl_date'] = crawl_date
     item['news_date'] = news_date
     item['comment_num'] = int(comment_num)
     item['read_num'] = read_num
     yield item
Example #7
 def parse(self, response):
     origin_url = response.url
     pageindex = re.search(r"list_(\d+?).shtml",origin_url).group(1) if re.search(r"list_(\d+?).shtml",origin_url) else None
     soup = BeautifulSoup(response.body,"lxml")
     news_list = soup.find_all("div",class_="con_one")
     for news in news_list:
         title = news.h2.get_text(strip=True)
         news_url = news.h2.a.get("href",None)
         news_no = re.search(r"/(\d+?).shtml",news_url).group(1) if re.search(r"/(\d+?).shtml",news_url) else None
         abstract = news.p.get_text(strip=True)
         pic = news.find("img").get("src",None) if news.find("img") else None
         tags_list = news.find("span",class_="tag")("a") if news.find("span",class_="tag") else None
         tags =  [i.text for i in tags_list] if tags_list else None
         catalogue = u"原创" if "yuanchuang" in origin_url else u"咨询"
         item = NewsItem(
             news_url=news_url,
             news_no=news_no,
             title=title,
             pic=pic,
             abstract=abstract,
             tags=tags,
             catalogue=catalogue
         )
         yield scrapy.Request(news_url,callback=self.parse_news,meta={"pageindex":pageindex,"item":item})
     news_next_url = self.news_next_url % str(int(pageindex)+1)
     if "yuanchuang" in origin_url:
         if not self.yuanchuang_flag:
             yield scrapy.Request(news_next_url)
     else:
         if not self.news_flag:
             yield scrapy.Request(news_next_url)
Example #8
    def parse_news(self, response):
        item = response.meta.get("item", NewsItem())
        soup = BeautifulSoup(response.body)
        read_num = soup.find("a", title=u"浏览").text.strip() if soup.find(
            "a", title=u"浏览") else None
        comment_num = soup.find(
            "span", class_="comment_count").text.strip() if soup.find(
                "span", class_="comment_count") else None
        author = soup.find("span", class_="author").text.strip() if soup.find(
            "span", class_="author") else None
        news_date = soup.find("span", class_="date").text.strip().replace(
            "/", "-") + ":00" if soup.find("span", class_="date") else None
        pic = soup.find(
            "div",
            class_="article-img").find("img").get("src").strip() if soup.find(
                "div", class_="article-img") and soup.find(
                    "div", class_="article-img").find("img") else None
        temp = soup.find("div", class_="article-content")
        content = "\n".join([t.text.strip() for t in temp.find_all("p")
                             ]) if temp.find("p") else None

        item["read_num"] = read_num
        item["author"] = author
        item['comment_num'] = comment_num
        item["news_date"] = news_date
        item['pic'] = pic
        item["content"] = content
        item['crawl_date'] = NOW

        yield item
Example #9
 def parse_news(self, response):
     item = response.meta.get("item", NewsItem())
     pageindex = response.meta.get("pageindex", 1)
     topic_url = response.meta.get("topic_url", None)
     origin_url = response.url
     news_no_res = re.search(r"news/(\d+)\.html", origin_url)
     news_no = news_no_res.group(1) if news_no_res else None
     soup = BeautifulSoup(response.body, "lxml")
     ff3 = soup.find("h2", class_="f-ff3 f-fwn")
     referer_web = soup.find("h2",
                             class_="f-ff3 f-fwn").i.text if ff3 else None
     # publish date
     origin_date = soup.find(
         "h2", class_="f-ff3 f-fwn").contents[-1].text if ff3 else None
     struct_date = datetime.datetime.strptime(origin_date, "%Y-%m-%d %H:%M") if origin_date else None
     news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S") if struct_date else None
     content = soup.find("div", class_="m-text").text if soup.find(
         "div", class_="m-text") else None
     author = soup.find("h3", class_="f-ff3 f-fwn").span.text if soup.find(
         "h3", class_="f-ff3 f-fwn") else None
     crawl_date = NOW
     item["referer_web"] = referer_web
     item["crawl_date"] = crawl_date
     item["author"] = author
     item["content"] = content
     item["news_no"] = news_no
     item["news_date"] = news_date
     item = judge_news_crawl(item)
     if item:
         yield item
     else:
         self.flag[topic_url] = pageindex
Example #10
    def parse_news(self, response):
        item = response.meta.get("item", NewsItem())
        soup = BeautifulSoup(response.body)
        # topic = soup.find("div","navItem").find_all("a")[2].text if len(soup.find("div","navItem").find_all("a")) >= 3 else None

        temp = soup.find("div", class_="actTitle")
        date_res = re.search(r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})', temp.text)
        news_date = date_res.group(1) if date_res else None
        referer_web = temp.find("a").text if temp.find("a") else None
        referer_url = temp.find("a").get("href") if temp.find("a") else None

        temp = soup.find("div", class_="content")
        content = "\n".join([t.text.strip() for t in temp.find_all("p")])
        pic = Cntour2Spider.start_urls[0][:-1] + temp.find("img").get(
            "src") if temp.find("img") else None

        # item["topic"] = topic
        item["news_date"] = news_date
        item["referer_web"] = referer_web
        item["referer_url"] = referer_url
        item["pic"] = pic
        item["content"] = content
        item['crawl_date'] = NOW

        yield item
Example #11
 def parse_news(self, response):
     item = response.meta.get("item", NewsItem())
     soup = BeautifulSoup(response.body.decode('gbk'))
     pic = soup.find('p', class_='f_center').find('img').get(
         'src') if soup.find('p', class_='f_center') and soup.find(
             'p', class_='f_center').find('img') else None
     referer_web = soup.find('a', id='ne_article_source').text if soup.find(
         'a', id='ne_article_source') else None
     referer_url = soup.find(
         'a', id='ne_article_source').get('href') if soup.find(
             'a', id='ne_article_source') else None
     author = soup.find('span', class_='ep-editor').text if soup.find(
         'span', class_='ep-editor') else None
     if u":" in author:
         author = author.split(u":")[-1]
     crawl_date = NOW
     read_num = soup.find(
         'div',
         class_='post_comment_joincount').find('a').text if soup.find(
             'div', class_='post_comment_joincount') else 0
     comment_num = soup.find(
         'div', class_='post_comment_tiecount').find('a').text if soup.find(
             'div', class_='post_comment_tiecount') else 0
     content = soup.find('div', class_='post_text').get_text(
         strip=True) if soup.find('div', class_='post_text') else None
     item['referer_web'] = referer_web
     item['content'] = content
     item['referer_url'] = referer_url
     item['author'] = author
     item['crawl_date'] = crawl_date
     item['pic'] = pic
     item['comment_num'] = int(comment_num)
     item['read_num'] = int(read_num)
     yield item
Example #12
    def parse(self, response):
        soup = BeautifulSoup(response.body, "lxml")
        page_res = re.search("page=(\d+)", response.url)
        pageindex = page_res.group(1) if page_res else None  # page index being crawled
        search_result = soup.find_all("ul", id="search-result")
        if search_result:
            news_list = search_result[0].find_all("li", class_="news")
            print len(news_list)
            # for each news entry in the list, grab the picture and abstract
            for news in news_list:
                news_url = news.a.get("href", None) if news.a else None
                abstract = None
                if news.find("div", class_="summary hidden-xxs"):
                    abstract = news.find(
                        "div", class_="summary hidden-xxs").string.strip()
                pic = news.find("img").get("data-original",
                                           None) if news.find("img") else None
                item = NewsItem(news_url=news_url, abstract=abstract, pic=pic)
                # pass the item along to the article parser via meta
                if news_url:
                    request = scrapy.Request(news_url,
                                             callback=self.parse_news)
                    request.meta['item'] = item
                    request.meta['pageindex'] = pageindex
                    yield request
                else:
                    logger.info("can't find news_url")
            # next page

            # if int(pageindex)<self.crawl_page:
            if not self.flag:
                next_url = self.page_url % str(int(pageindex) + 1)
                yield scrapy.Request(next_url, callback=self.parse)
        else:
            logger.info("can't find search_result")
Example #13
 def parse_news(self, response):
     item = response.meta.get("item", NewsItem())
     soup = BeautifulSoup(response.body)
     referer_web = soup.find("a", id="ne_article_source").text if soup.find(
         "a", id="ne_article_source") else None
     referer_url = soup.find("a", id="ne_article_source").get(
         "href", None) if soup.find("a", id="ne_article_source") else None
     comment_num = soup.find("a", class_="post_cnum_tie").text if soup.find(
         "a", id="ne_article_source") else None
     content = soup.find("div",
                         class_="post_text").text.strip() if soup.find(
                             "div", class_="post_text") else None
     # format: "本文来源:证券日报-资本证券网  作者:矫 月" (source followed by author)
     author_source = soup.find("span", class_="left").text if soup.find(
         "span", class_="left") else None
     # TODO: author text has an encoding problem
     # import pdb;pdb.set_trace()
     # author = re.search(u"作者(.*)",author_source).group(1)[1:] if author_source else None
     # item["author"]=author
     item["referer_web"] = referer_web
     item["referer_url"] = referer_url
     item["comment_num"] = comment_num
     item["content"] = content
     item["crawl_date"] = NOW
     yield item
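The commented-out author extraction above fails on encoding. A hedged sketch of one way it could work, assuming the response body has already been decoded so that author_source is clean unicode (this is not the original author's fix):

# hypothetical fix for the TODO above -- not the original solution
import re

def extract_author(author_source):
    """Pull the author name out of a '本文来源:...  作者:...' line, or return None."""
    if not author_source:
        return None
    author_res = re.search(u"作者[::](.*)", author_source)
    return author_res.group(1).strip() if author_res else None

# usage inside parse_news, assuming author_source decodes cleanly:
#     item["author"] = extract_author(author_source)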
Example #14
 def parse(self, response):
     origin_url = response.url
     #http://money.163.com/special/002526O5/transport_02.html
     search_result = re.search(r"_(\d)*?\.", origin_url)
     #获取页数
     pageindex = search_result.group(1) if search_result else 1
     soup = BeautifulSoup(response.body, "lxml")
     news_list = soup("div", class_="list_item clearfix")
     for news in news_list:
         news_date = news.find("span", class_="time").text if news.find(
             "span", class_="time") else None
         title = news.find("h2").text if news.find("h2") else None
         news_url = news.find("h2").a.get("href",
                                          None) if news.find("h2") else None
         abstract = news.find("p").contents[0] if news.find("p") else None
         item = NewsItem(title=title,
                         news_url=news_url,
                         abstract=abstract,
                         news_date=news_date)
         item = judge_news_crawl(item)  # check whether the item is within the crawl window
         if item:
             request = scrapy.Request(news_url,
                                      callback=self.parse_news,
                                      meta={"item": item})
             yield request
         else:
             self.flag = int(pageindex)
     if not self.flag:
         next_url = self.next_url % (int(pageindex) + 1)
         yield scrapy.Request(next_url)
Example #15
    def parse_news(self,response):
        item = response.meta.get("item",NewsItem())
        pageindex = response.meta.get("pageindex",None)
        soup = BeautifulSoup(response.body)
        # TODO: the news list can contain topic/special pages with no article body; those are dropped for now
        # crawl the article body
        news_txt=soup.find("div",class_="news_txt")
        if news_txt:
            content = news_txt.text
            news_about = soup.find("div",class_="news_about")
            #referer_web,news_date
            if news_about:
                referer_web = news_about.p.string
                news_date = news_about.p.next_sibling.next_sibling.text[0:16]
                struct_date = datetime.datetime.strptime(news_date,"%Y-%m-%d %H:%M")
                news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")
                item["referer_web"]=referer_web
                item["news_date"]=news_date
                item["content"]=content
                item["crawl_date"]=NOW
                # import pdb;pdb.set_trace()
                item = judge_news_crawl(item)
                if item:
                    yield item
                else:
                    self.flag = pageindex

        else:
            logger.info("news page can't find news_txt; it may be a topic page")
Example #16
 def parse(self, response):
     origin_url = response.url
     pageindex = origin_url.rsplit("=", 1)[-1]
     soup = BeautifulSoup(response.body, "lxml")
     news_temp = soup.find("table",
                           class_="list").find("table",
                                               border="0").find("tbody")
     if not news_temp:
         return
     news_list = news_temp.find_all("tr")[1:]
     for news in news_list:
         temp = news.find("a")
         news_url = temp.get("href")
         title = temp.text.strip()
         temp = news.find_all("span")
         referer_web = temp[0].text.strip()
         news_date = temp[1].text.strip()
         news_no = news_url.rsplit("=", 1)[-1]
         item = NewsItem(news_date=news_date,
                         title=title,
                         referer_web=referer_web,
                         news_url=news_url,
                         news_no=news_no)
         item = judge_news_crawl(item)
         if item:
             yield scrapy.Request(item["news_url"],
                                  callback=self.parse_news,
                                  meta={"item": item})
         else:
             self.flag = pageindex
     if not self.flag:
         next_url = self.next_url % (int(pageindex) + 1)
         yield scrapy.Request(next_url)
Example #17
    def parse(self, response):
        topic_url = response.url
        catalogue = u"商业" if "business" in topic_url else u"消费"
        self.driver.get(topic_url)                      # open the page
        index_page_code = self.driver.page_source       # page source
        code =index_page_code
        pageindex = 1
        interval =10
        while True:
            soup = BeautifulSoup(code,"lxml")
            news_list = soup.find_all("dl",class_="f-cb")
            # page through by slicing: every render still contains the earlier news, 10 items per page
            for news in news_list[interval*(pageindex-1):interval*pageindex]:
                pic = news.find("img").get("src") if news.find("img") else None
                title = news.find("h3").text if news.find("h3") else None
                news_url = news.find("h3").a.get("href") if news.find("h3") else None
                item = NewsItem(pic=pic,title=title,news_url=news_url,catalogue=catalogue)
                yield scrapy.Request(news_url,callback=self.parse_news,meta={"item":item,
                                                                             "pageindex":pageindex,
                                                                            "topic_url":topic_url})
            # stop once this topic is flagged as finished
            if self.flag[topic_url]:
                break
            # trigger the next page: new items are appended to the current page instead of opening a new one
            self.driver.find_element_by_id("clickMore").click()    # find the "more" button and click it
            time.sleep(1)           # wait for the browser to render
            next_page_code =  self.driver.page_source
            code = next_page_code   # refresh the page source
            pageindex += 1

        # Crawling both sections from start_urls at the same time raises errors, probably because
        # the virtual display does not support parallel use, so the pages are chained one by one.
        yield scrapy.Request("http://m.yicai.com/news/consumer/",callback=self.parse)
Example #18
 def parse_news(self, response):
     item = response.meta.get("item", NewsItem())
     soup = BeautifulSoup(response.body)
     content = soup.find(
         "div", class_=re.compile(r"pageCont")).text if soup.find(
             "div", class_=re.compile(r"pageCont")) else None
     item["crawl_date"] = NOW
     item["content"] = content
     yield item
Example #19
 def parse_news(self, response):
     item = response.meta.get("item", NewsItem())
     soup = BeautifulSoup(response.body)
     content = soup.find("div", class_="article-content").text
     tag_list = soup.find_all("a", "tag-link")
     tags = [i.text for i in tag_list] if tag_list else None
     item["content"] = content
     item["tags"] = tags
     item["crawl_date"] = NOW
     yield item
Example #20
    def parse_news(self, response):
        # print response.url,"response"
        PageKey = response.meta.get("topic_id")
        PageNumber = response.meta.get("PageNumber")
        flag_id = str(int(PageKey) - 40037910)
        soup = BeautifulSoup(response.body, "lxml")
        #2016-07-13
        news_date = soup.find("time").text if soup.find("time") else None
        # print self.flag[flag_id],int(PageNumber)
        """
        条件是该类别标记(self.flag[flag_id])是0爬取,说明还没有爬到过期的。
        爬取页面是该页的也继续爬取。因为一个页面的爬取顺序是异步的。
        self.flag[flag_id]=过期页数
        """
        if not self.flag[flag_id] or int(PageNumber) == self.flag[flag_id]:
            # still inside the crawl window

            struct_date = datetime.datetime.strptime(news_date, "%Y-%m-%d")
            # print self.end_now,struct_date,"time"
            delta = self.end_now - struct_date
            # print delta.days,"delta day ~~~~~~~~~~~~~~~~"
            if delta.days > self.end_day:
                self.flag[str(flag_id)] = int(PageNumber)
                # print flag_id,"stop ~~~~~~"
                # raise CloseSpider('today scrapy end')
            else:

                head = soup.find("div", class_="post-head")
                topic, title, abstract = None, None, None
                if head:
                    topic = head.find("span",
                                      class_="category").text if head.find(
                                          "span", class_="category") else None
                    title = head.find("h1", class_="h1").text if head.find(
                        "h1", class_="h1") else None
                    abstract = head.find("span",
                                         class_="kicker").text if head.find(
                                             "span", class_="kicker") else None
                content = soup.find(
                    "div", class_="post-body clearfix").text if soup.find(
                        "div", class_="post-body clearfix") else None
                news_no = response.url.split("/")[-1].split("?")[0]
                # TODO: the comment count is rendered by JS; not handled yet
                item = NewsItem(
                    title=title,
                    topic=topic,
                    abstract=abstract,
                    news_date=news_date,
                    content=content,
                    news_no=news_no,
                    crawl_date=NOW,
                    news_url=response.url,
                )
                yield item
Example #21
 def parse_quick_news(self, response):
     item = response.meta.get("item", NewsItem())
     soup = BeautifulSoup(response.body)
     referer_web = soup.find("span", class_="name").text if soup.find(
         "span", class_="name") else None
     tag_list = soup.find_all("a", "tag-link")
     tags = [i.text for i in tag_list] if tag_list else None
     item["tags"] = tags
     item['referer_web'] = referer_web
     item['crawl_date'] = NOW
     yield item
Example #22
    def parse_quick(self, response):
        soup = BeautifulSoup(response.body)
        news_list_inner = soup.find("div", class_="list-inner")
        next_timestamp = None

        news_list = news_list_inner.find_all(
            "div",
            class_=re.compile(r"bulletin-item.*")) if news_list_inner else None
        # JSON page
        if not news_list:
            news_list = soup.find_all("div",
                                      class_=re.compile(r"bulletin-item.*"))
        for index, news in enumerate(news_list):
            origin_date = news.find("div", class_="news-time").get(
                "data-time", None) if news.find("div",
                                                class_="news-time") else None
            next_timestamp = origin_date if index == len(
                news_list) - 1 else None  # use the last article's timestamp as the cursor for the next page
            struct_date = datetime.datetime.fromtimestamp(int(origin_date))
            news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")
            title = news.find("a", class_="item-title").text if news.find(
                "a", class_="item-title") else None
            news_url = news.find("a", class_="item-title").get(
                "href", None) if news.find("a", class_="item-title") else None
            pic = news.find("img").get("src",
                                       None) if news.find("img") else None
            content = news.find("div", class_="item-desc").text if news.find(
                "div", class_="item-desc") else None
            id_result = re.search(r"/(\d+)\.html", news_url)
            news_no = id_result.group(1) if id_result else None
            item = NewsItem(content=content,
                            news_url=news_url,
                            pic=pic,
                            title=title,
                            news_no=news_no,
                            news_date=news_date,
                            catalogue=u"快报")
            item = judge_news_crawl(item)
            if item:
                request = scrapy.Request(news_url,
                                         meta={"item": item},
                                         callback=self.parse_quick_news)
                yield request
            else:
                self.quick_flag = int(self.quick_page)

        if not self.quick_flag:
            if next_timestamp:
                next_quick_url = self.quick_json_url % next_timestamp
                yield scrapy.Request(next_quick_url, callback=self.parse_quick)
            else:
                logger.warning("can't find next_timestamp,url is %s " %
                               response)
Example #23
    def parse(self, response):
        soup = BeautifulSoup(response.body, "lxml")
        origin_url = response.url
        res = re.search(r'ndex_(.*?)\.shtml', origin_url)
        index = None
        new_index = 1
        if res:
            index = res.group(1)
            new_index = int(index) + 1
        # crawl the listing
        viewlist = soup.find_all("div", "list list-640")
        if viewlist:
            for news in viewlist:
                title = news.select("h3 a")[0].string if news.select(
                    "h3 a") else None
                news_url = news.select("h3 a")[0].get(
                    "href", None) if news.select("h3 a") else None
                abstract = news.select(
                    'p[class="info"]')[0].string if news.select(
                        'p[class="info"]') else None  #info
                pic = news.find('img').get(
                    "src", None) if news.find('img') else None  # image link
                #brand
                tags = []  # tag list
                fl = news.find(class_="clear date")
                if fl and fl.select("a"):
                    topic = fl.select("a")[0].string  # topic
                    for i in fl.select("a")[1:-1]:
                        tags.append(i.string)
                    news_date = fl.find(class_="fr arial").string  #%Y-%m-%d
                else:
                    news_date = None
                    topic = None

                item = NewsItem(title=title,
                                news_url=news_url,
                                abstract=abstract,
                                pic=pic,
                                topic=topic,
                                news_date=news_date,
                                catalogue=u"咨询")
                request = scrapy.Request(news_url, callback=self.parse_news)
                request.meta['item'] = item
                request.meta['pageindex'] = index
                yield request

        else:
            logger.info("can't find news list")
        if not self.flag and new_index:
            new_url = re.sub(r'ndex_(.*?)\.shtml',
                             'ndex_%s.shtml' % str(new_index), origin_url)
            yield scrapy.Request(new_url)
        else:
            logger.info("can't find index")
Example #24
 def parse_news(self, response):
     item = response.meta.get("item", NewsItem())
     temp = re.search("\"content\":\"([\w\W]+?)\"", response.body).group(1)
     comment_num = re.search('"comment":"([\w\W]+?)"',
                             response.body).group(1)
     print comment_num
     soup = BeautifulSoup(temp)
     content = "\n\n".join([t.text.strip() for t in soup.find_all("p")])
     item['content'] = content
     item['comment_num'] = comment_num
     item['crawl_date'] = NOW
     yield item
Example #25
 def parse_news(self, response):
     item = response.meta.get("item", NewsItem())
     soup = BeautifulSoup(response.body, "lxml")
     temp = soup.find("div", class_="main-content") if soup.find(
         "div", class_="main-content") else None
     if temp:
         content = "\n\n".join([t.text.strip() for t in temp.find_all("p")])
     else:
         content = None
     item["content"] = content
     item['crawl_date'] = NOW
     yield item
Example #26
    def parse(self, response):
        soup = BeautifulSoup(response.body,"lxml")
        newslist = soup.find(name="div", attrs={"data-lastkey": True})
        lastkey = newslist.get("data-lastkey",None)
        logger.info(lastkey)
        if not lastkey:
            logger.warning("can't find next page")
        else:
            if newslist:
                for i in newslist.children:
                    # the list contains unrelated filler entries in between
                    if i != u' ':
                        news_url = self.domain+i.a.get('href',None)
                        pic = i.find("img").get('data-src') if i.find("img") else None
                        title = i.find("h3").string if i.find("h3") else None
                        comment_num = i.find(class_="iconfont icon-message").string if i.find(class_="iconfont icon-message") else 0
                        heart = i.find(class_="iconfont icon-heart").string if i.find(class_="iconfont icon-heart") else 0
                        topic = i.find(class_="category").span.string if i.find(class_="category") else 0
                        news_date =None
                        if i.find(name="span", attrs={"data-origindate": True}):
                            news_date= i.find(name="span", attrs={"data-origindate": True}).get("data-origindate",None)
                            if news_date:
                                news_date = news_date[:-6]

                        #no content and have heart&conment but not add
                        item = NewsItem(title=title,news_url=news_url,pic=pic,topic=topic,news_date=news_date,comment_num=comment_num)
                        # catalogue this item belongs to
                        item['catalogue'] = "Top 15" if "tags" in response.url else u"商业"
                        # check whether crawling should stop
                        item = judge_news_crawl(item)
                        if item :
                            request = scrapy.Request(news_url,callback=self.parse_article)
                            request.meta["item"] = item
                            yield request
                        else:
                            if "tags" in response.url:
                                self.top_flag = lastkey
                            else:
                                self.com_flag = lastkey
                next_url = None
                # decide per category whether to fetch the next page
                if "tags" in response.url:
                    if not self.top_flag:
                        next_url = "http://www.qdaily.com/tags/tagmore/29/%s.json" % lastkey
                else:
                    if not self.com_flag:
                        next_url = "http://www.qdaily.com/categories/categorymore/18/%s.json" % lastkey
                # logger.info(next_url)
                if next_url:
                    yield scrapy.Request(next_url,callback=self.parse_next_page)
            else:
                logger.warning("can't find newslit")
Example #27
 def parse_news(self,response):
     item = response.meta.get("item",NewsItem())
     soup = BeautifulSoup(response.body,"lxml")
     temp = soup.find("div",class_ = "main_t").find_all("span")
     news_date = temp[0].text
     referer_web = temp[1].text.split(u":")[1]
     temp = soup.find("div",class_ = "TRS_Editor")
     content  = "\n\n".join([ t.text.strip() for t in temp.find_all("p")])
     item["news_date"] = news_date
     item["referer_web"] = referer_web
     item["content"] = content
     item['crawl_date'] = NOW
     yield item
Example #28
 def parse_topic(self, response):
     origin_url = response.url
     topic_url = origin_url.split("_", 1)[1].rsplit("_", 1)[0]
     pageindex = int(origin_url.rsplit("_", 1)[1].replace('.html', ''))
     catalogue = re.search('</a> -&gt; ([\w\W]+?) </i></h3>',
                           response.body).group(1).decode("gb2312")
     soup = BeautifulSoup(response.body, "lxml")
     news_list = soup.find_all('li')
     for news in news_list:
         news_date = news.find('i').text.split(' ')[1].replace(
             ']', '') if news.find('i') else None
         if news_date:
             news_url = news.find(
                 'a', href=re.compile(
                     'http://news.carnoc.com/list/*.?')).get('href')
             news_no = news_url.rsplit('/', 1)[1].replace('.html', '')
             title = news.find(
                 'a', href=re.compile(
                     'http://news.carnoc.com/list/*.?')).text.strip()
             abstract = news.find('div').text.strip()
             pic = news.find('div').find(
                 'img', src=re.compile('http://pic.carnoc.com/file/*.?')
             ).get('src') if news.find('div').find(
                 'img',
                 src=re.compile('http://pic.carnoc.com/file/*.?')) else None
             tags = news.find(
                 'div', class_='keywordslist').text.strip() if news.find(
                     'div', class_='keywordslist') else None
             item = NewsItem(
                 news_url=news_url,
                 news_date=news_date + ' 00:00:00',
                 title=title,
                 abstract=abstract,
                 news_no=news_no,
                 catalogue=catalogue,
                 pic=pic,
                 tags=tags,
             )
             item = judge_news_crawl(item)
             if item:
                 yield scrapy.Request(item["news_url"],
                                      callback=self.parse_news,
                                      meta={'item': item})
             else:
                 self.flag[topic_url] = pageindex
         else:
             logger.warning("carnoc:%s can't find news_date " % origin_url)
     if not self.flag[topic_url]:
         next_url = origin_url.rsplit(
             "_", 1)[0] + '_' + str(pageindex + 1) + '.html'
         yield scrapy.Request(next_url, callback=self.parse_topic)
Example #29
 def parse_news(self, response):
     item = response.meta.get("item", NewsItem())
     soup = BeautifulSoup(response.body, "lxml")
     referer_web = soup.find("span", id="source_baidu").text if soup.find(
         "span", id="source_baidu") else None
     temp = soup.find("div", id="arttext")
     if item["pic"] == None:
         item["pic"] = temp.find("img").get("src") if temp.find(
             "img") else None
     content = "\n\n".join([t.text.strip() for t in temp.find_all("p")])
     item['referer_web'] = referer_web
     item['content'] = content
     item['crawl_date'] = NOW
     yield item
Example #30
    def parse_topic(self,response):
        topic_url = response.url
        # print topic_url
        body = json.loads(response.body)
        news_list = body["data"]
        page = response.meta.get("page","1")
        topic_name = response.meta.get("topic_name",None)
        # http://m.iwshang.com/category/20 has no news
        if not news_list:
            self.flag[topic_url]=page
        for news in news_list:
            news_date_timestamp = news.get("published",None)
            struct_date = datetime.datetime.fromtimestamp(int(news_date_timestamp))
            news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")
            title = news.get("title",None)
            news_no = news.get("contentid",None)
            abstract = news.get("description",None)
            pic = news.get("thumb",None)
            news_url = news.get("url",None)                 #手机端新闻页面链接
            referenceid = news.get("referenceid",None)      #pc端的id,手机端的id跟pc端的id不一样
            pc_news_url = self.pc_news_url % referenceid    #pc端新闻页面链接
            item = NewsItem(
                news_date=news_date,
                title=title,
                news_no=news_no,
                abstract=abstract,
                pic=pic,
                news_url=pc_news_url,
                topic=topic_name
            )
            item = judge_news_crawl(item)
            if item:
                # yield item
                yield scrapy.Request(pc_news_url,callback=self.parse_news,meta={"item":item})
            else:

                self.flag[topic_url]=page
        if not self.flag[topic_url]:
            page = str(int(page)+1)
            post_data = {
                    "inslider":"0",
                    "page":page,
                    "pagesize":"10"
                }
            yield scrapy.FormRequest(
                    url=topic_url,
                    formdata=post_data,
                    callback=self.parse_topic,
                    meta={"page":page}
                )