Example #1
 def parse(self, response):
     # Entry point: dispatch one request per news section. The original
     # wrapped this in a no-op while/break loop and built several unused
     # ItemLoaders; the commented-out sections can be re-enabled as needed.
     # yield scrapy.Request(url=self.start_urls[0], callback=self.thehindumain)
     # yield scrapy.Request(url=self.start_urls[1], callback=self.thetimesmain)
     # business
     # yield scrapy.Request(url='http://www.thehindu.com/business/', callback=self.thehindumain)
     # yield scrapy.Request(url='https://timesofindia.indiatimes.com/business', callback=self.thetimestopic)
     # international
     yield scrapy.Request(
         url='http://www.thehindu.com/news/international/',
         callback=self.thehindumain)
     yield scrapy.Request(
         url='https://timesofindia.indiatimes.com/world/',
         callback=self.thetimestopic)
     # india
     # yield scrapy.Request(url='http://www.thehindu.com/news/national/', callback=self.thehindumain)
     # yield scrapy.Request(url='https://timesofindia.indiatimes.com/india', callback=self.thetimestopic)
     # sports
     # yield scrapy.Request(url='http://www.thehindu.com/news/sport/', callback=self.thehindumain)
     # yield scrapy.Request(url='https://timesofindia.indiatimes.com/sports', callback=self.thetimestopic)
     # tech
     # yield scrapy.Request(url='http://www.thehindu.com/news/technology/', callback=self.thehindumain)
     # yield scrapy.Request(url='https://www.gadgetsnow.com?utm_source=toiweb&utm_medium=referral&utm_campaign=toiweb_hptopnav', callback=self.thetimestopic)
     # science
     # yield scrapy.Request(url='http://www.thehindu.com/news/sci-tech/', callback=self.thehindumain)
Example #2

    def News(self, sel, xpath, pri):
        """Build one item per headline link found under `xpath`
        and request its article page."""
        Headline = sel.xpath(xpath)

        for h in Headline:
            item = NewsItem()
            tt = h.xpath('.//a/text()').extract()
            hh = h.xpath('.//a/@href').extract()
            if tt and len(tt) == 1:
                item['title'] = tt[0].strip()
                item['href'] = hh[0]
                item['uptime'] = timeNow  # timeNow is defined elsewhere in the original spider
                item['pri'] = pri
                item['site'] = self.siteName

                request = Request(item['href'],
                                  callback=self.parsePeopleContent)
                request.meta['item'] = item
                yield request
            else:
                # one item per (title, href) pair
                for title, href in zip(tt, hh):
                    item = NewsItem()
                    item['title'] = title.strip()
                    item['href'] = href
                    item['uptime'] = timeNow
                    item['pri'] = pri
                    item['site'] = self.siteName

                    request = Request(item['href'],
                                      callback=self.parsePeopleContent)
                    request.meta['item'] = item
                    yield request
Example #3

    def parse(self, response):
        """Collect the headline and the main-news list from the front page."""
        timeNow = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        hxs = Selector(response)
        xpath_headline = "/html/body/div[7]/div/div/div/div[3]/h1/a"
        xpath_mainnews = "/html/body/div[7]/div/div/div/div[3]/ul/li"
        headline = hxs.xpath(xpath_headline)
        mainNews = hxs.xpath(xpath_mainnews)

        # Get() is a small extraction helper defined elsewhere in the original spider
        item = NewsItem()
        item['title'] = Get(headline, "text()")
        item['href'] = Get(headline, "@href")
        item['uptime'] = timeNow
        item['pri'] = 0
        item['site'] = self.site_name

        request = Request(item['href'], callback=self.parseIfengContent)
        request.meta['item'] = item
        yield request

        for mainNewsItem in mainNews:
            item = NewsItem()
            item['title'] = Get(mainNewsItem, "a/text()")
            item['href'] = Get(mainNewsItem, "a/@href")
            item['uptime'] = timeNow
            item['pri'] = 3
            item['site'] = self.site_name

            request = Request(item['href'], callback=self.parseIfengContent)
            request.meta['item'] = item
            yield request
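All of these snippets share a `NewsItem` item class defined in each project's items.py. For orientation, a minimal definition covering just the fields used by Examples #2 and #3 might look like the following sketch (the field list is reconstructed from the code above; the real items.py files differ per project, and later examples use their own variants):

import scrapy


class NewsItem(scrapy.Item):
    # fields used by Examples #2 and #3
    title = scrapy.Field()
    href = scrapy.Field()
    uptime = scrapy.Field()
    pri = scrapy.Field()
    site = scrapy.Field()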
Example #4
 def parse(self, response):
     href = response.url
     title = response.xpath('//header/h1/text()').extract()[0]
     pubtime = response.xpath(
         '//div[@class="reporter"]/time/text()').extract()[0]
     r = re.findall(r'(\d+)年(\d+)月(\d+)日 (\d+):(\d+)', pubtime)
     if not r:
         raise Exception('failed to parse the publication time')
     r = [int(x) for x in r[0]]
     pubtime = datetime.datetime(year=r[0],
                                 month=r[1],
                                 day=r[2],
                                 hour=r[3],
                                 minute=r[4])
     htmlcontent = response.xpath(
         '//div[contains(@class, "page_container")]/article/article'
     ).extract()[0]
     keywords = response.xpath('//div[@class="a_k"]/a/text()').extract()
     source = u'中時電子報'
     item = NewsItem(title=title,
                     pubtime=pubtime,
                     htmlcontent=htmlcontent,
                     href=href,
                     keywords=keywords,
                     source=source)
     yield item
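The regex-then-datetime dance above (and again in Example #6) can usually be collapsed into one strptime call, since in Python 3 the CJK date markers are accepted as literal characters in the format string. A minimal sketch, with a made-up input string:

import datetime

# '年', '月' and '日' are matched literally by strptime
pubtime = datetime.datetime.strptime('2017年8月15日 14:30', '%Y年%m月%d日 %H:%M')
print(pubtime)  # 2017-08-15 14:30:00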
Example #5
    def parse_news(self, response):
        news = NewsItem()

        news['url'] = response.url

        temp = response.xpath("//h1[@id='artibodyTitle']//text()").extract()
        news['title'] = temp[0] if temp else ''

        temp = response.xpath("//span[@id='media_name']//a//text()").extract()
        news['source'] = temp[0] if temp else ''
        if news['source'] == '':
            temp = response.xpath(
                "//span[@data-sudaclick='media_name']//a//text()").extract()
            news['source'] = temp[0] if temp else ''

        temp = response.xpath("//span[@id='pub_date']//text()").extract()
        news['public_time'] = temp[0] if temp else ''
        if news['public_time'] == '':
            temp = response.xpath(
                "//span[@class='time-source']//text()").extract()
            news['public_time'] = temp[0] if temp else ''
        if news['public_time'] != '':
            news['public_time'] = self.get_datetime(news['public_time'])

        temp = response.xpath("//div[@id='artibody']//p//text()").extract()
        news['content'] = '\n'.join(temp) if temp else ''

        cat = response.url.split('//')[1].split('.')[0]
        sub_cat = response.url.split('//')[1].split('/')[1]
        news['category'] = self.get_category(cat, sub_cat)

        return news
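The repeated `temp[0] if temp else ''` pattern above is exactly what Scrapy's `extract_first()` (aliased `get()` in newer releases) provides through its `default` argument. The title and source lookups could be condensed like this sketch, selectors unchanged:

news['title'] = response.xpath(
    "//h1[@id='artibodyTitle']//text()").extract_first(default='')
news['source'] = (
    response.xpath("//span[@id='media_name']//a//text()").extract_first()
    or response.xpath(
        "//span[@data-sudaclick='media_name']//a//text()").extract_first(default=''))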
Example #6
 def parse(self, response):
     href = response.url
     title = response.xpath(
         '//h3[contains(@class, "post-title")]/text()').extract()[0]
     title = title.replace("\n", "")
     pubtime = response.xpath(
         '//abbr[@class="date-header"]/span/text()').extract()[0]
     r = re.compile(r"(\d+)年(\d+)月(\d+)日")
     r = re.findall(r, pubtime)
     if not r:
         raise Exception("failed to parse the publication time")
     r = [int(x) for x in r[0]]
     pubtime = datetime.datetime(year=r[0], month=r[1], day=r[2])
     htmlcontent = response.xpath(
         '//div[contains(@class, "post-body")]').extract()[0]
     keywords = response.xpath(
         '//span[@class="post-labels"]/a[@rel="tag"]/text()').extract()
     source = u"KKday"
     item = NewsItem(title=title,
                     pubtime=pubtime,
                     htmlcontent=htmlcontent,
                     href=href,
                     keywords=keywords,
                     source=source)
     yield item
Example #7
    def parse_item(self, response):
        
        try:
            hxs = HtmlXPathSelector(response)
            item = NewsItem()
            item['link'] = response.url
            item['title'] = hxs.select("//head/title/text()").extract()[0]
            time_raw = hxs.select("//div[@class='about']/span/text()").extract()[0].split()
            item['date'] = time_raw[0]
            (province, city) = get_province_city(item['title'])
            item["province"] = province
            item["city"] = city
            item['content'] = ''
            content_list = hxs.select("//dl[@id='content']/dd//p/text()").extract()
            # if not content_list:
            #     content_list = hxs.select("//div[@id='endtext']//FONT/text()").extract()
            if content_list:
                item['content'] = ''.join(content_list)
            else:
                print('None')
            if item['content'] == '':
                item['content'] = "null"
            yield item
            time.sleep(random())

        except Exception:  # skip pages whose layout does not match
            pass
Example #8
 def saveFile(self, response):
     item = NewsItem()
     tieuDe = response.xpath(
         "//*[@id='ArticleHolder']/div[2]/h1/text()").extract_first()
     ndTomTat = response.xpath(
         "//*[@id='ArticleContent']/div[1]/h2/p").extract()
     nd = response.xpath("//*[@id='ArticleContent']").extract()
     tacgia = response.xpath(
         "//*[@id='ArticleContent']/p[last()]/span/text()").extract_first()
     if tacgia is None:
         tacgia = "admin"
     item['tieuDe'] = tieuDe
     item['ndTomTat'] = ndTomTat  # the original stored the literal string "ndtt", apparently a typo
     self.a.pop()  # drop the last collected element
     item['nd'] = nd
     item['tieuDeKhongDau'] = self.tieudekhongdau(tieuDe)
     item['anh'] = self.anh.pop()  # take and consume the most recent image (remove() would delete the first equal match)
     item['view'] = 1
     item['tacGia'] = tacgia
     item['create_at'] = datetime.now()
     item['update_at'] = datetime.now()
     item['id_lt_id'] = 1
     item['id_tl_id'] = 1
     yield item
Example #9
 def parseDetail(self, response):
     item = NewsItem()
     item['link'] = response.meta['link']
     item['title'] = response.meta['title']
     data = response.xpath("//div[@id='ContentBody']")
     # './/p' stays inside ContentBody; '//p' would match the whole page
     item['detail'] = data.xpath(".//p/text()").extract()
     yield item
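The `.//p` fix above generalizes: an XPath that starts with `//` matches from the document root even when called on a sub-selector, while the `.` prefix keeps it scoped. A self-contained illustration with hypothetical markup:

from scrapy.selector import Selector

sel = Selector(text='<div id="ContentBody"><p>inside</p></div><p>outside</p>')
body = sel.xpath('//div[@id="ContentBody"]')
print(body.xpath('//p/text()').extract())   # ['inside', 'outside'] (whole page)
print(body.xpath('.//p/text()').extract())  # ['inside'] (scoped to the div)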
Example #10
 def parseContent(self, response):
     item = NewsItem()
     item['leadingTitle'] = ''
     item['subTitle'] = ''
     item['source'] = '搜狐新闻'
     item['edition'] = ''
     item['anthor'] = ''  # [sic] field name as defined in this project's NewsItem
     item['url'] = response.url
     title = response.xpath(
         '//div[@class="text-title"]/h1/text()').extract_first()
     item['mainTitle'] = '' if title is None else title.replace('\r', '').replace('\n', '').\
         replace('\xa0', ' ').replace('\u3000', ' ').strip()
     datetext = response.xpath(
         '//div[@class="article-info"]/span/text()').extract_first()
     date = ''
     if datetext is not None:
         datetext = datetext.replace('\r', '').replace('\n', '').replace(
             '\xa0', ' ').replace('\u3000', ' ').strip()
         try:
             date = datetime.datetime.strptime(datetext.strip(),
                                               "%Y-%m-%d %H:%M")
         except ValueError:
             print('failed to parse the publication time')
     item['date'] = date
     content = response.xpath('//*[@id="mp-editor"]').xpath(
         'string()').extract_first()
     item['content'] = '' if content is None else content.replace('\r', '').replace('\n', '').replace('\xa0', ' ').\
         replace('\u3000', ' ').strip()
     yield item
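Example #10 (and Examples #18 and #27 below) repeats the same chain of `.replace()` calls for every field. A small helper would make the cleaning reusable; this is an editorial sketch, not part of the original spiders:

def clean_text(text):
    """Normalize the whitespace variants these news pages mix in."""
    if text is None:
        return ''
    for old, new in (('\r', ''), ('\n', ''), ('\xa0', ' '), ('\u3000', ' ')):
        text = text.replace(old, new)
    return text.strip()

With it, the title assignment above reduces to `item['mainTitle'] = clean_text(title)`.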
Example #11
    def parse_item(self, response):
        try:
            hxs = HtmlXPathSelector(response)
            item = NewsItem()
            item['link'] = response.url
            item['title'] = hxs.select("//head/title/text()").extract()[0]
            time_raw = item['link'].split('/')
            item['date'] = time_raw[-3] + "-" + time_raw[-2]
            (province, city) = get_province_city(item['title'])
            item["province"] = province
            item["city"] = city
            item['content'] = ''
            content_list = hxs.select("//p/text()").extract()
            # if not content_list:
            #     content_list = hxs.select("//div[@class='yjl_fx168_article_zhengwen']/div[@class='TRS_Editor']//p/text()").extract()
            if content_list:
                item['content'] = ''.join(content_list)
            else:
                print('None')
            if item['content'] == '':
                item['content'] = "null"
            yield item
            time.sleep(0.5)

        except Exception:  # skip pages whose layout does not match
            pass
Example #12
 def parse(self, response):
     youbi_cd = date.today().weekday()
     for idx, bangumi_list in enumerate(response.css('td[valign="top"]')):
         # skip programme listings for every weekday except today
         if idx != youbi_cd:
             continue
         item = NewsItem()
         text = ''
         # keep only the summary text of the target programmes
         for bangumi in bangumi_list.css('table.new_day'):
             bangumi_name = bangumi.css(
                 'span.prog_name a.bangumiDetailOpen::text').extract_first(
                 )
             if bangumi_name is None:
                 continue
             target_name = get_target_bangumi_name(bangumi_name.strip(),
                                                   TARGET_BANGUMI_DICT)
             if target_name is None:
                 continue
             print(bangumi_name)
             text = bangumi.css(
                 'span.expo_org a.bangumiDetailOpen::text').extract_first()
             if text is None:
                 continue
             text = remove_words(
                 text, TARGET_BANGUMI_DICT[target_name]['rm_word'],
                 KYOKU_SEP_DICT['asahi'])
             if text != '':
                 item['text'] = text
                 yield item
Example #13
    def parse(self, response):
        href = response.url
        title = response.xpath('//div[contains(@class, "post-heading")]/h1/text()').extract()[0]
        pubtime = response.xpath('//div[contains(@class, "post-heading")]/div[@class="date"]').extract()[0]
        r = re.compile(r"(\d+)-(\d+)-(\d+) (\d+):(\d+)")
        r = re.findall(r, pubtime)
        if not r:
            r = re.compile(r"(\d+)-(\d+)-(\d+)")
            r = re.findall(r, pubtime)

        if not r:
            raise Exception("failed to parse the publication time")
        r = [int(x) for x in r[0]]
        if len(r) == 5:
            pubtime = datetime.datetime(year=r[0], month=r[1], day=r[2], hour=r[3], minute=r[4])
        else:
            pubtime = datetime.datetime(year=r[0], month=r[1], day=r[2])
        htmlcontent = response.xpath('//div[@class="post-content"]').extract()[0]

        soup = BeautifulSoup(htmlcontent, "lxml")
        # turn the image-wrapping <a class="image"> links into plain <img> tags
        for a_tag in soup.find_all("a", class_="image"):
            a_tag.name = "img"
            a_tag["src"] = a_tag["href"]
            a_tag["alt"] = a_tag["title"]
        htmlcontent = unicode(soup.body.contents[0])  # Python 2; use str() on Python 3

        keywords = []
        source = u"星島日報"
        item = NewsItem(title=title, pubtime=pubtime, htmlcontent=htmlcontent, href=href, keywords=keywords, source=source)
        yield item
Example #14

    def read_story(self, response):
        storyItem = NewsItem()
        storyItem['url'] = response.request.url
        storyItem['title'] = response.xpath('//h1/text()').extract()[0]
        story = ""
        try:
            date = response.xpath(
                '//meta[@itemprop="datePublished"]/@content').extract()[0]
            # date = response.xpath('//time[@class="dateline"]/text()')[0].extract()
            storyItem['date'] = date
            for i in response.xpath(
                    '//body//article[@id="story"]//p[@class="story-body-text story-content"]'
            ):
                for j in i.xpath('.//text()').extract():
                    story = ''.join([story, ' ', j.strip()])
            storyItem['text'] = story
        except Exception:  # no date or body found: treat as an empty story
            storyItem['text'] = ''
        if len(storyItem['text']) > 1000:
            yield storyItem

        # follow in-article links, skipping navigation, author and newsletter URLs
        for link in response.xpath(
                '//article[@id="story"]//a/@href').extract():
            if all(s not in link for s in
                   ('#', 'index.html', '/by/', 'news-event', 'newsletter')):
                yield scrapy.Request(url=link, callback=self.read_story)
Example #15

 def parse_item(self, response):
     title = response.meta['title']
     pic_url = response.meta['pic_url']
     pic_more_url = response.meta['pic_more_url']
     publishedDate = response.meta['publishedDate']
     category = response.meta['category']
     describe = ''
     # the endpoint returns 'var XinhuammNews = {...}'; strip the prefix, then parse
     data = response.text.replace('var XinhuammNews =', '')
     data = json.loads(data)
     content = data['content']
     publishedDate = data['releasedate']
     # drop whitespace, then keep only the text between HTML tags
     contentt = content.replace('\t', '').replace('\n', '').replace('\r', '').replace(' ', '')
     content = ''.join(re.findall('>(.*?)<', contentt))
     home_url = response.meta['home_url']
     crawlTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time()))
     app_name = '新华社'  # Xinhua News Agency
     author = ''
     print("app name:", app_name)
     print("main image url:", pic_url)
     print("extra image urls:", pic_more_url)
     print("author:", author)
     print("detail page url:", response.url)
     print("category:", category)
     print("title:", title)
     print("description:", describe)
     print("content:", content)
     print("home url:", home_url)
     print("published at:", publishedDate)
     print("crawled at:", crawlTime)
     url = response.url
     item = NewsItem()
     item['count'] = self.count
     item['app_name'] = app_name
     item['pic_url'] = pic_url
     item['pic_more_url'] = pic_more_url
     item['author'] = author
     item['url'] = url
     item['category'] = category
     item['title'] = title
     item['describe'] = describe
     item['content'] = content
     item['home_url'] = home_url
     item['publishedDate'] = publishedDate
     item['crawlTime'] = crawlTime
     # the API mixes several date formats; try each in turn
     for fmt in ("%Y-%m-%d %H:%M", "%Y-%m-%d", "%Y-%m-%d %H:%M:%S"):
         try:
             publishedDate_stamp = int(time.mktime(time.strptime(publishedDate, fmt)))
             break
         except ValueError:
             continue
     else:
         raise ValueError('unrecognised date format: %r' % publishedDate)
     # keep only articles newer than the spider's cut-off timestamp
     if publishedDate_stamp > self.timeStamp:
         publishedDate = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(float(publishedDate_stamp)))
         self.count += 1
         item['publishedDate'] = publishedDate
         yield item
Example #16
    def page_content(self, response):
        item = NewsItem()

        item["category"] = response.xpath(
            '//*[@id="cSub"]/div/div[2]/div[1]/h3/text()').extract()
        item["title1"] = response.xpath(
            '//*[@id="cSub"]/div/div[1]/div[1]/div/strong/a/text()').extract()
        item["link1"] = response.xpath(
            '//*[@id="cSub"]/div/div[1]/div[1]/div/strong/a/@href').extract()
        item["title2"] = response.xpath(
            '//*[@id="cSub"]/div/div[1]/ul[1]/li[1]/div[1]/strong/a/text()'
        ).extract()
        item["link2"] = response.xpath(
            '//*[@id="cSub"]/div/div[1]/ul[1]/li[1]/div[1]/strong/a/@href'
        ).extract()
        item["title3"] = response.xpath(
            '//*[@id="cSub"]/div/div[1]/ul[1]/li[2]/div[1]/strong/a/text()'
        ).extract()
        item["link3"] = response.xpath(
            '//*[@id="cSub"]/div/div[1]/ul[1]/li[2]/div[1]/strong/a/@href'
        ).extract()
        item["title4"] = response.xpath(
            '//*[@id="cSub"]/div/div[1]/ul[1]/li[3]/div[1]/strong/a/text()'
        ).extract()
        item["link4"] = response.xpath(
            '//*[@id="cSub"]/div/div[1]/ul[1]/li[3]/div[1]/strong/a/@href'
        ).extract()
        item["title5"] = response.xpath(
            '//*[@id="cSub"]/div/div[1]/ul[1]/li[4]/div[1]/strong/a/text()'
        ).extract()
        item["link5"] = response.xpath(
            '//*[@id="cSub"]/div/div[1]/ul[1]/li[4]/div[1]/strong/a/@href'
        ).extract()
        yield item
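The five title/link pairs in Example #16 differ only in the `li` index, so the hand-unrolled selectors can be generated. A sketch of the same extraction with a loop (editorial refactoring; field names and XPaths unchanged):

def page_content(self, response):
    item = NewsItem()
    item["category"] = response.xpath(
        '//*[@id="cSub"]/div/div[2]/div[1]/h3/text()').extract()
    base = '//*[@id="cSub"]/div/div[1]'
    item["title1"] = response.xpath(base + '/div[1]/div/strong/a/text()').extract()
    item["link1"] = response.xpath(base + '/div[1]/div/strong/a/@href').extract()
    for i in range(2, 6):  # titles 2-5 sit in consecutive <li> elements
        a = '%s/ul[1]/li[%d]/div[1]/strong/a' % (base, i - 1)
        item["title%d" % i] = response.xpath(a + '/text()').extract()
        item["link%d" % i] = response.xpath(a + '/@href').extract()
    yield item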
Example #17
    def get_news(self, response):
        item = NewsItem()

        title = response.css('h1[class ="content__headline "]').css(
            'h1[itemprop="headline"]::text').extract()
        if not title:
            # Guardian articles come in two layouts; if the first selector
            # finds nothing, fall back to the meta description
            title = response.css('meta[itemprop="description"]').xpath(
                '@content').extract()

        time = response.css(
            'time[itemprop = "datePublished"]::text').extract_first()

        category = response.css(
            'a[class ="subnav-link subnav-link--current-section"]::text'
        ).extract_first()

        tags = response.css('a[class = "submeta__link"]::text').extract()

        content = response.css('div[itemprop = "articleBody"]').css(
            'p::text').extract()
        if not content:
            # some stories are video or other non-text formats; fall back to the title
            content = title

        item['title'] = title
        item['time'] = time
        item['category'] = category
        item['tags'] = tags
        item['content'] = content

        yield item
Example #18
 def parseContent(self, response):
     item = NewsItem()
     item['leadingTitle'] = ''
     item['mainTitle'] = response.meta['title'].replace('\r', '').replace('\n', '').replace('\xa0', ' ').\
         replace('\u3000', ' ').strip()
     item['subTitle'] = ''
     item['source'] = '人民网'
     date = ''
     datetext = response.xpath(
         '//div[@class="box01"]/div[1]/text()').extract_first()
     if datetext is not None:
         datetext = datetext.replace('\r', '').replace('\n', '').replace(
             '\xa0', ' ').replace('\u3000', ' ').strip()
         index = datetext.find('来源')
         if index > -1:
             datetext = datetext[:index]
             try:
                 date = datetime.datetime.strptime(datetext.strip(),
                                                   "%Y年%m月%d日%H:%M")
             except ValueError:
                 pass
     item['date'] = date
     item['edition'] = ''
     item['anthor'] = ''
     content = ''
     pages = response.xpath('//div[@class="box_con"]/p')
     for p in pages:
         c = p.xpath('./text()').extract_first()
         if c is not None:
             content += c
     item['content'] = '' if content is None else content.replace('\r', '').replace('\n', '').\
         replace('\xa0', ' ').replace('\u3000', ' ').strip()
     item['url'] = response.url
     yield item
Example #19
 def parse(self, response):
     try:
         # parse the fetched body as JSON
         jsonresponse = json.loads(response.body_as_unicode())
         try:
             info = jsonresponse['data']['article_info']
             news_list = Selector(text=info).xpath('//div')
             for news in news_list:
                 cat = news.xpath(
                     'span[@class="t-tit"]/text()').extract()[0]
                 if '图片' in cat:  # the '图片' label marks photo-gallery entries
                     print('skipping a photo-gallery entry')
                     continue
                 time = ''.join(news.xpath('dl/dt/span/text()').extract())
                 title = ''.join(news.xpath('dl/dt/a/text()').extract())
                 url = ''.join(news.xpath('dl/dt/a/@href').extract())
                 summary = ''.join(news.xpath('dl/dd/text()').extract())
                 item = NewsItem()
                 item['time'] = time
                 item['title'] = title
                 item['url'] = url
                 item['summary'] = summary
                 item['cata'] = response.meta['name']
                 yield scrapy.Request(url,
                                      meta={'item': item},
                                      callback=self.parse_content)
         except TypeError:
             pass
     except Exception:  # malformed JSON or unexpected structure
         pass
Example #20
    def parse(self, response):
        news = response.xpath('//div[@class="news-box clearfix"]')

        for sel in news:

            title = sel.xpath('./div/h4/a/text()').extract_first()
            abstract = sel.xpath('./div/div[1]/text()').extract_first()
            article = sel.xpath(
                './div/div[2]/span[3]/a/text()').extract_first()
            if article:
                article = article.strip()
            cover_img_src = sel.xpath('./a/img/@src').extract_first()
            news_url = sel.xpath('./div/h4/a/@href').extract_first()
            news_date = sel.xpath(
                './div/div[2]/span[2]/text()').extract_first()
            view_count = sel.xpath(
                './div/div[2]/span[4]/text()').extract_first()

            item = NewsItem()
            item['title'] = title
            item['abstract'] = abstract
            item['article'] = article
            item['cover_img_src'] = cover_img_src
            item['news_url'] = news_url
            item['news_date'] = news_date
            item['view_count'] = view_count

            yield item

        if self.offset < 3000:
            self.offset += 10
            yield scrapy.Request(url=self.base_url + str(self.offset),
                                 callback=self.parse)
Example #21
 def parse(self, response):
     news = Selector(response).xpath('//tr[@class="athing"]/td/a')
     for news_item in news:
         item = NewsItem()
         item['title'] = news_item.xpath('text()').extract()[0]
         item['url'] = news_item.xpath('@href').extract()[0]
         yield item
Example #22
 def parse(self, response):
     item = NewsItem()
     sel = Selector(response)
     item['name'] = sel.xpath('//a/text()').extract()
     item['image_urls'] = sel.xpath(
         '//a/img/@src[contains(.,".jpg")]').extract()
     yield item
Example #23
    def parse_detail(self, response):
        print("crawling the article detail page...")
        item = NewsItem()
        news = response.meta['news']

        item['title'] = news['title']
        item['stitle'] = news['stitle']
        item['ctime'] = news['ctime']
        item['url'] = news['url']
        item['wap_url'] = news['wapurl']
        item['summary'] = news['summary']
        item['wap_summary'] = news['wapsummary']
        item['intro'] = news['intro']
        item['keywords'] = news['keywords']
        def _tidy(text):
            # collapse the whitespace variants Sina mixes into article bodies
            return (text.replace(u'\u3000\u3000', u'\n')
                        .replace(u'\xa0\xa0', u'\n')
                        .replace(u'\r\n', u'\n')
                        .replace(u'\n\r', u'\n')
                        .replace(u'\'', u'\\\'')
                        .strip())

        item['content'] = _tidy(''.join(
            response.xpath('//*[@id="artibody"]/p/text()').extract()))
        if item['content'] == '':
            item['content'] = _tidy(''.join(
                response.xpath('//*[@id="article"]/p/text()').extract()))
        item['category'] = self.category[self.lid]

        yield item
Example #24

    def parse(self, response):
        for sel in response.xpath("//div[@class='article-item-warp']"):
            item = NewsItem()

            item['title'] = sel.xpath(".//div[@class='article-item__body']/h3/a/text()").get()
            if item['title'] is not None:
                item['title'] = item['title'].strip()

            item['source_url'] = response.urljoin(sel.xpath(".//div[@class='article-item__body']/h3/a/@href").get())
            if item['source_url'] is not None:
                item['source_url'] = item['source_url'].strip()

            item['excerpt'] = sel.xpath(".//div[@class='article-item__content']/text()").get()
            if item['excerpt'] is not None:
                item['excerpt'] = item['excerpt'].strip()

            # NOTE: this reuses the article-link xpath, so cover_url duplicates
            # source_url; a cover image would normally come from an <img>/@src
            item['cover_url'] = response.urljoin(sel.xpath(".//div[@class='article-item__body']/h3/a/@href").get())
            if item['cover_url'] is not None:
                item['cover_url'] = item['cover_url'].strip()

            item['author'] = sel.xpath(".//div[@class='article-item__author']/a[2]/text()").get()
            if item['author'] is not None:
                item['author'] = item['author'].strip()

            # skip posts already recorded in MySQL
            crawlInfo = self.mysqlObj.get_post_info(title=item['title'], author=item['author'])
            if crawlInfo is not None:
                logging.info("author: %s title: %s already exists" % (item['author'], item['title']))
                continue

            yield scrapy.Request(url=item['source_url'], meta={"item": item, "useSel": False}, callback=self.parse_content)
Example #25
 def parse(self, response):
     headlines = Selector(response).xpath(
         '//h3[@class="lx-stream-post__header-title gel-great-primer-bold qa-post-title gs-u-mt0 gs-u-mb-"]')
     for headline in headlines:
         # build a fresh item per headline instead of reusing one mutated instance
         item = NewsItem()
         item['title'] = headline.xpath('a/span/text()').extract()[0]
         item['url'] = 'https://www.bbc.com' + headline.xpath('a/@href').extract()[0]
         yield item
Example #26
 def parse(self, response):
     # news list
     # for sel in response.xpath("//div[@class='left_content left']/a/@href"):
     #     url = 'https://www.yidaiyilu.gov.cn%s' % sel.extract()
     #     yield scrapy.Request(url, callback=self.parse)
     # scroll news: follow every link in the scrolling list
     for sel in response.xpath("//div[@class='bd']/ul/li/a/@href"):
         url = 'https://www.yidaiyilu.gov.cn%s' % sel.extract()
         yield scrapy.Request(url, callback=self.parse)
     item = NewsItem()
     item['time'] = response.xpath(
         "//div[@class='szty']/span[1]/text()").extract()
     #item['time'] = datetime.datetime.strftime(time, "%Y-%m-%d %H:%M:%S")
     item['url'] = response.url
     item['title'] = response.xpath('//title/text()').extract()[0][:-10]  # drop the trailing site-name suffix from <title>
     item['content'] = response.xpath(
         "//p[@style='text-indent:2em;']/text()").extract()
     item['img_url'] = response.xpath(
         "//div[@class='info_content']/p/img/@src").extract()
     item['source'] = response.xpath(
         "//div[@class='szty']/span[2]/text()").extract()
     item['type'] = response.xpath(
         "//ul[@class='local_ul']/li[5]/a/text()").extract()
     item['desc'] = response.xpath(
         "//meta[@name='description']/@content").extract()
     yield item
Example #27
 def parseContent(self, response):
     item = NewsItem()
     item['leadingTitle'] = ''
     item['source'] = '中国长安网'
     item['edition'] = response.meta['channel']
     item['date'] = response.meta['date']
     item['url'] = response.url
     item['site'] = 'a76304d3-3ee0-47f2-8077-a2d08e2b7333'
     title = response.xpath(
         '//div[@class="content-l fl"]/h1/text()').extract_first()
     item['mainTitle'] = '' if title is None else title. \
         replace('\r', '').replace('\n', '').replace('\xa0', ' ').replace('\u3000', ' ').strip()
     source = (response.xpath('//div[@class="source"]/text()').extract_first() or '')\
         .replace('\n', '').replace('\xa0', ' ').replace('\u3000', ' ').strip()
     index = source.find('责任编辑')  # the label that precedes the responsible editor's name
     if index > 0:
         item['anthor'] = source[index + 5:].replace('\n', '').replace(
             '\xa0', ' ').replace('\u3000', ' ').strip()
     # index = source.find('来源')
     # if index > 0:
     #     date = source[:index].replace('\n', '').replace('\xa0', ' ').replace('\u3000', ' ').strip()
     #     try:
     #         item['date'] = datetime.datetime.strptime(date, '%Y-%m-%d %H:%M')
     #     except ValueError:
     #         pass
     content = response.xpath('//div[@class="content-main"]').xpath(
         'string()').extract_first()
     item['content'] = '' if content is None else content.\
         replace('\r', '').replace('\n', '').replace('\xa0', ' ').replace('\u3000', ' ').strip()
     yield item
Example #28
    def parse(self, response):
        if 'news.ycombinator.com' in response.url:
            soup = bs(response.body)
            items = [(x[0].text, x[0].get('href'))
                     for x in filter(None, [
                         x.findChildren()
                         for x in soup.findAll('td', {'class': 'title'})
                     ])]

            for item in items:
                print(item)
                news_item = NewsItem()
                news_item['title'] = item[0]
                news_item['url'] = item[1]
                try:
                    yield Request(item[1], callback=self.parse)
                except ValueError:
                    yield Request('http://news.ycombinator.com/' + item[1],
                                  callback=self.parse)

                yield news_item

        else:
            # archive non-HN pages under a folder named by the URL's SHA-1
            sha1_response = hashlib.sha1(response.url.encode('utf-8')).hexdigest()
            folder = PATH + '/' + sha1_response
            if not os.path.exists(folder):
                os.makedirs(folder)
            with open(folder + '/index.html', 'wb') as file_obj:
                file_obj.write(response.body)
Example #29
 def parse_data_list(self, response):
     if 'public-api' in response.url:  # the news-type API
         data_list = json.loads(response.text)
     elif 'integration-api' in response.url:
         data_list = json.loads(response.text)['data']
     else:
         return
     for data in data_list:
         if ('integration-api' in response.url) and (data['resourceType']
                                                     == 3):
             continue
         if data['type'] == 3:  # photo gallery
             continue
         items = NewsItem()
         try:
             items['title'] = data['title']
             # items['source_url'] = data['originalSource']  # only public-api returns the source url; integration-api does not, so it is collected from the page instead (present in the HTML source but hidden by JS rendering)
             items['url'] = 'http://www.sohu.com/a/' + str(
                 data['id']) + '_' + str(data['authorId'])
             items['date'] = parse_time(str(data['publicTime'])[0:10])
             items['source'] = data['authorName']
         except KeyError:  # e.g. the entry is a collection rather than an article and has no authorId/authorName
             print(data_list.index(data))
             print(response.url)
             print(data)
             return
         items['attribute'] = get_attribute(response.url)
         yield scrapy.Request(items['url'],
                              self.parse_article1,
                              meta={'items': items})
Example #30
 def parse(self, response):
     headlines = Selector(response).xpath(
         '//div[@class="col-12 col-sm-8 col-md-8 col-lg-8 col-xl-8 pd-left-ultimas-noticias"]')
     for headline in headlines:
         # build a fresh item per headline instead of reusing one mutated instance
         item = NewsItem()
         item['title'] = headline.xpath('a/h2/text()').extract()[0]
         item['url'] = headline.xpath('a/@href').extract()[0]
         yield item