def parse(self, response): news_list = response.xpath("//div[@class='list']") for info_item in news_list: news_item = NewsSpiderItem() news_item['title'] = info_item.xpath( ".//h2/a/text()").extract_first() news_item['origin_website'] = 'SEMI大导体产业网' news_item['origin_host'] = self.allowed_domains[0] news_item['origin_url'] = info_item.xpath( ".//h2/a/@href").extract_first() news_item['section'] = 'SEMI大导体产业网 > 汽车电子应用' news_item['abstract'] = info_item.xpath( ".//div[@class='abstract']/text()").extract_first().strip() news_item['created_at'] = int(datetime.datetime.now().timestamp()) published_at = info_item.xpath( ".//div[@class='inputdate']/text()").extract_first() news_item['published_at'] = self.parse_timestamp(published_at) if self.deadline > news_item['published_at']: return yield news_item self.start_page = self.start_page + 1 yield scrapy.Request('http://ecar.semi.org.cn/indexLoading_' + str(self.start_page) + '.html', callback=self.parse)
def parse(self, response):
    news_list = json.loads(response.body_as_unicode())['result']
    if len(news_list) == 0:
        return
    for info_item in news_list:
        # Create a fresh item per entry; the original reused one item across
        # all requests, so concurrent detail_parse callbacks saw each other's
        # overwritten fields.
        news_item = NewsSpiderItem()
        news_item['title'] = info_item['title']
        news_item['origin_website'] = '智东西'
        news_item['created_at'] = int(datetime.datetime.now().timestamp())
        news_item['origin_host'] = self.allowed_domains[0]
        news_item['origin_url'] = info_item['link']
        news_item['section'] = ''
        news_item['abstract'] = info_item['desp']
        yield scrapy.Request(news_item['origin_url'], meta={'item': news_item},
                             callback=self.detail_parse, dont_filter=True)
    self.start_page = self.start_page + 1
    yield scrapy.FormRequest(dont_filter=True,
                             url=self.start_urls[0],
                             formdata={'action': 'category_list',
                                       'page': str(self.start_page)},
                             callback=self.parse)
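# detail_parse is referenced here (and by other spiders below) but not shown in
# this section; each spider presumably has its own. A minimal sketch, assuming
# it only fills the publish time in from the article page before yielding (the
# XPath, date format, and field name are assumptions, not the actual code):
def detail_parse(self, response):
    news_item = response.meta['item']
    published_at = response.xpath("//span[@class='time']/text()").extract_first()
    if published_at:
        news_item['published_at'] = int(
            datetime.datetime.strptime(published_at.strip(), "%Y-%m-%d %H:%M:%S").timestamp())
    yield news_item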
def parse(self, response): news_list = response.xpath("//div[@class='ArticleList']/table/tbody/tr") news_list.pop() # 去除最后一个空行 relative_path = response.url.split('index')[0] for info_item in news_list: news_item = NewsSpiderItem() news_item['title'] = info_item.xpath(".//td[@class='fw_t']/a/text()").extract_first().strip() news_item['origin_website'] = '中国科学院半导体研究所' news_item['origin_host'] = self.allowed_domains[0] news_item['origin_url'] = relative_path + info_item.xpath(".//td[@class='fw_t']/a/@href").extract_first()[2:] news_item['section'] = response.xpath("string(//div[@class='Position'])").extract_first() news_item['abstract'] = '' news_item['created_at'] = int(datetime.datetime.now().timestamp()) published_at = info_item.xpath(".//td[@class='fw_s']/text()").extract_first() if published_at: news_item['published_at'] = int(datetime.datetime.strptime('20' + published_at.strip(), "%Y-%m-%d").timestamp()) if self.deadline > news_item['published_at']: return yield news_item else: continue next_link = response.xpath("//div[@class='t_page ColorLink']/a[contains(text(),'下一页')]/@href").extract_first() if next_link: yield scrapy.Request(relative_path + next_link, callback=self.parse, errback=self.err_callback)
def parse(self, response):
    news_list = json.loads(response.body_as_unicode())['info']['list']
    if len(news_list) == 0:
        return
    for info_item in news_list:
        # instantiate per entry so each yielded item is independent
        news_item = NewsSpiderItem()
        news_item['title'] = info_item['Title']
        news_item['origin_website'] = '人工智能科技'
        news_item['created_at'] = int(datetime.datetime.now().timestamp())
        news_item['published_at'] = int(
            datetime.datetime.strptime(info_item['CreateTime'], "%Y-%m-%d").timestamp())
        if self.deadline > news_item['published_at']:
            return
        news_item['origin_host'] = self.allowed_domains[0]
        news_item['origin_url'] = 'http://www.aistudyblog.com' + info_item['Url']
        news_item['section'] = '人工智能科技' + ' > ' + info_item['TypeName']
        news_item['abstract'] = info_item['Description']
        yield news_item
    self.start_page = self.start_page + 1
    next_link = ('http://www.aistudyblog.com/handler/CMSList.ashx'
                 '?ActionType=InformationAllTypeList&InformationPage=' + str(self.start_page))
    yield scrapy.Request(next_link, callback=self.parse)
def parse(self, response): news_list = response.xpath( "//ul[@class='list_left_ul']/li[not(@class='dashed_line')]") for info_item in news_list: news_item = NewsSpiderItem() news_item['title'] = info_item.xpath(".//a/@title").extract_first() news_item['origin_website'] = '智能电网市场' news_item['origin_host'] = self.allowed_domains[0] news_item['origin_url'] = info_item.xpath( ".//a/@href").extract_first() news_item['section'] = '北极星智能电网在线 > 市场' news_item['abstract'] = '' news_item['created_at'] = int(datetime.datetime.now().timestamp()) published_at = info_item.xpath( ".//span/text()").extract_first().strip() news_item['published_at'] = int( datetime.datetime.strptime(published_at, "%Y-%m-%d").timestamp()) if self.deadline > news_item['published_at']: return yield news_item next_link = response.xpath( "//div[@class='list_page']/div[@class='page']/a[@title='下一页']/@href" ).extract_first() if next_link: yield scrapy.Request('http://www.chinasmartgrid.com.cn/' + next_link, callback=self.parse)
def parse(self, response): news_list = response.xpath("//ul[@class='list_jc']/li") for info_item in news_list: news_item = NewsSpiderItem() news_item['title'] = info_item.xpath( ".//a[1]/@title").extract_first() news_item['origin_website'] = '人工智能实验室' news_item['origin_host'] = self.allowed_domains[0] news_item['origin_url'] = info_item.xpath( ".//a[1]/@href").extract_first() news_item['section'] = '首页 > 热点信息' news_item['abstract'] = info_item.xpath( ".//p[@class='cn']/text()").extract_first() news_item['created_at'] = int(datetime.datetime.now().timestamp()) published_at = info_item.xpath( ".//p[@class='xx']/span[@class='rq']/text()").extract_first( ).strip() news_item['published_at'] = int( datetime.datetime.strptime(published_at, "%Y-%m-%d").timestamp()) print(self.deadline, news_item['published_at']) if self.deadline > news_item['published_at']: return yield news_item next_link = response.xpath( "//div[@class='col-left box mt10']/div[@class='pg']/a[@class='nxt']/@href" ).extract_first() if next_link: yield scrapy.Request(next_link, callback=self.parse)
def parse(self, response): news_list = response.xpath("//div[@class='article-list']") for info_item in news_list: news_item = NewsSpiderItem() published_at = info_item.xpath( ".//div[@class='a-content']/p[@class='one-more clearfix']/span[@class='time']/text()" ).extract_first().strip() news_item['published_at'] = int( datetime.datetime.strptime(published_at, "%Y-%m-%d").timestamp()) if self.deadline > news_item['published_at']: return news_item['title'] = info_item.xpath( ".//div[@class='a-content']/h3[@class='a-title']/a/text()" ).extract_first() news_item['origin_website'] = '电子发烧友网' news_item['origin_host'] = self.allowed_domains[0] news_item['origin_url'] = info_item.xpath( ".//div[@class='a-content']/h3[@class='a-title']/a/@href" ).extract_first() news_item['section'] = '电子发烧友网 > 人工智能' news_item['created_at'] = int(datetime.datetime.now().timestamp()) news_item['abstract'] = info_item.xpath( ".//div[@class='a-content']/p[@class='a-summary']/text()" ).extract_first().strip() yield news_item next_link = response.xpath( "//div[@class='pagn1']/a[@class='page-next']/@href").extract_first( ) if next_link: yield scrapy.Request('http://www.elecfans.com/rengongzhineng/' + next_link, callback=self.parse)
def parse(self, response): news_list = response.xpath("//table[@class='gongzuo']/tr") for info_item in news_list: news_item = NewsSpiderItem() news_item['title'] = info_item.xpath(".//td[1]//div[@class='jishu01']/a//span[@class='hei1']").extract_first().strip() news_item['origin_website'] = 'SEMI大导体产业网' news_item['origin_host'] = self.allowed_domains[0] news_item['origin_url'] = info_item.xpath(".//td[1]//div[@class='jishu01']/a/@href").extract_first().replace('..', self.domain) news_item['section'] = 'SEMI大导体产业网 > 热点新闻' news_item['abstract'] = info_item.xpath(".//td[1]//div[@class='jishu01']/a/text()").extract_first().strip() news_item['created_at'] = int(datetime.datetime.now().timestamp()) yield scrapy.Request(news_item['origin_url'], meta={'item': news_item}, callback=self.detail_parse) next_link = response.xpath("//table[@id='AspNetPager1']//tr/td/a[contains('下一页',text())]/@href").extract_first() if next_link: self.start_page = self.start_page + 1 print('正在爬第几页', self.start_page) yield scrapy.FormRequest( url=self.start_urls[0], formdata={'__EVENTTARGET': 'AspNetPager1', '__EVENTARGUMENT': str(self.start_page), '__VIEWSTATE': self.param_viewstate, '__VIEWSTATEGENERATOR': self.param_viewstategenerator}, callback=self.parse )
def parseNewsHref(self, response):
    category_type = response.meta['category_type']
    newsList = response.xpath(".//li[@class='cfix']")
    for news in newsList:
        item = NewsSpiderItem()
        item['type'] = category_type
        item['title'] = news.xpath('.//h2/a/text()').extract_first()
        item['url'] = news.xpath('.//h2/a/@href').extract_first()
        item['summary'] = news.xpath('.//p/text()').extract_first()
        item['time'] = news.xpath(".//em[@class='fRight']/text()").extract_first()
        item['content'] = []
        yield scrapy.Request(url=item['url'], meta={'item': item}, callback=self.parseNews)
def parse_article(self, response):
    detail = response.xpath('//div[@class="article-wrap"]')
    item = NewsSpiderItem()
    item['title'] = detail.xpath('./h1[@class="t-h1"]/text()')[0].extract()
    item['auth'] = u"作者:" + detail.xpath('./div/span[@class="author-name"]/a/text()')[0].extract()
    item['post_time'] = u"发表时间:" + detail.xpath(
        './div/div[@class="column-link-box"]/span[@class="article-time pull-left"]/text()')[0].extract()
    # known issue: the abstract can be wrong, since self.desc is shared spider
    # state (see the sketch after the listing parse() below)
    item['descr'] = u"简述:" + self.desc + "\n"
    all_pars = detail.xpath('//div[@class="article-content-wrap"]//p/text()').extract()
    content = ''
    for par in all_pars:
        content = content + par + "\n"
    # item is freshly created here, so 'main_news' cannot already be set;
    # the original's append-if-present branch was dead code
    item['main_news'] = content
    yield item
def parse_news(self, response):
    url = response.url
    title = response.xpath('//article//h1/text()').extract_first()
    post_time = response.xpath('//article//p[@class="update-time"]/text()').extract_first()
    content = response.xpath(
        '//*[@id="body-text"]//*[contains(@class, "zn-body__paragraph")]//text()')
    content = ' '.join(content.extract())
    item = NewsSpiderItem()
    item['url'] = url
    item['title'] = title
    item['report_time'] = post_time
    item['content'] = content
    item['crawl_time'] = time.time()
    yield item
def parse(self, response): print "Start............................" self.desc = '' for sel in response.xpath('//div[@class="mod-b mod-art clearfix "]'): item = NewsSpiderItem() item['title'] = sel.xpath( './div/h2/a[@class="transition msubstr-row2"]/text()' )[0].extract() self.desc = sel.xpath( './div[@class="mob-ctt index-article-list-yh"]/div[@class="mob-sub"]/text()' )[0].extract() link = sel.xpath('./div/h2/a/@href')[0].extract() url = response.urljoin(link) yield scrapy.Request(url, callback=self.parse_article)
def parse(self, response): news_list = response.xpath("//div[@id='divArticleList']/div[contains(@class,'Article-box-cont')]/div[@class='Article-content']") for info_item in news_list: news_item = NewsSpiderItem() news_item['title'] = info_item.xpath(".//h3/a/text()").extract_first().strip() news_item['origin_website'] = '全球半导体观察' news_item['origin_host'] = self.allowed_domains[0] news_item['origin_url'] = 'https://www.dramx.com' + info_item.xpath(".//h3/a/@href").extract_first() news_item['section'] = '全球半导体观察 > ' + response.xpath("//a[@class='Article-boxtitle-active']/text()").extract_first() news_item['abstract'] = info_item.xpath(".//p[@class='Article-essay']/text()").extract_first() news_item['created_at'] = int(datetime.datetime.now().timestamp()) yield scrapy.Request(news_item['origin_url'], meta={'item': news_item}, callback=self.detail_parse, dont_filter=True) next_link = response.xpath("(//div[@class='jogger']/a)[last()]/@href").extract_first() if next_link: yield scrapy.Request('https://www.dramx.com' + next_link, callback=self.parse, errback=self.err_callback, dont_filter=True)
def parse_news(self, response): url = response.url title = response.xpath('//h1[@class="story-body__h1"]/text()').extract_first() post_time = response.xpath( '//div[@class="date date--v2"]/text()').extract_first() content = response.xpath( '//div[@class="story-body__inner"]//p//text()' ) content = ' '.join(content.extract()) if title: item = NewsSpiderItem() item['url'] = url item['title'] = title item['report_time'] = post_time item['content'] = content item['crawl_time'] = time.time() yield item
def parse(self, response):
    array_split_url = response.request.url.split('-')
    category = ''
    if len(array_split_url) > 1:
        if '/' in array_split_url[1]:
            category_key = array_split_url[1].split('/')[0]
        else:
            category_key = array_split_url[1]
        category = project_items.get(category_key)
    content_list = response.xpath(".//*[@class='viewpointListWrap contentWrap perspective']/ul/li")
    for content in content_list:
        url = content.xpath(".//a/@href").extract()
        if len(url) > 0:
            url = url[0]
        title = content.xpath(".//a/p[@class='perspectiveTitle']/text()").extract()
        if len(title) > 0:
            title = title[0]
        time = content.xpath(".//p[@class='researchInfo']/span[@class='time']/text()").extract()
        if len(time) > 0:
            time = time[0]
        author = content.xpath(".//*[@class='researchInfo-author']/span[@class='author']/text()").extract()
        if len(author) > 0:
            author = author[0]
        item = NewsSpiderItem(url=url, title=title, time=time, author=author,
                              source=self.__source, category=category,
                              create_time=datetime.datetime.now())
        request = scrapy.Request(url=url, callback=self.parse_body)
        request.meta['item'] = item  # stash the item on the request
        yield request
    next_page = response.xpath(
        ".//*[@id='page']/ul/li[@class='active']/following-sibling::*[1]/a/@href").extract()
    if len(next_page) > 0:
        yield scrapy.Request(url=next_page[0], callback=self.parse)
def parseNews(self, response): data = response.xpath("//div[@id='C-Main-Article-QQ']") item = NewsSpiderItem() timee = data.xpath("//span[@class='article-time']/text()").extract() title = data.xpath("//div[@class='hd']//h1/text()").extract() content = data.xpath("//p/text()").extract() time_pattern = re.compile("[0-9]{4}-[0-9]{2}-[0-9]{2}\s[0-9]{2}:[0-9]{2}") if len(timee) != 0 and len(title) != 0 and len(content) != 0: tm = time_pattern.findall(timee[0])[0] item['time'] = int(time.mktime(time.strptime(tm, '%Y-%m-%d %H:%M'))) item['title'] = title[0] item['url'] = response.url cc = '' if len(content) != 0: for c in content: cc = cc + c + '\n' item['content'] = cc yield item
def parse_raw_html(self, response):
    item = NewsSpiderItem()
    item['news_url'] = response.url
    # decoding response.body via response.encoding was tried and abandoned;
    # the raw bytes are stored as-is
    item['raw_html'] = response.body
    try:
        g = Goose({'stopwords_class': StopWordsChinese})
        extr = g.extract(raw_html=item['raw_html'])
        cleaned_text = extr.cleaned_text
        title = extr.title
        if cleaned_text:
            # extract weighted keywords from title and body, then merge:
            # a term present in both lists gets half its title weight added;
            # title-only terms are appended
            title_pair = jieba.analyse.extract_tags(title, topK=20, withWeight=True)
            cleaned_text_pair = jieba.analyse.extract_tags(cleaned_text, topK=20, withWeight=True)
            title_pair_list = [[k[0], k[1]] for k in title_pair]
            cleaned_text_pair_list = [[k[0], k[1]] for k in cleaned_text_pair]
            for ti_va in title_pair_list:
                flag = True
                for te_va in cleaned_text_pair_list:
                    if ti_va[0] == te_va[0]:
                        te_va[1] += ti_va[1] * 0.5
                        flag = False
                if flag:
                    cleaned_text_pair_list.append(ti_va)
            cleaned_text_pair_list.sort(key=self.takeSecond, reverse=True)
            simhash = MySimHash().get_simhash(cleaned_text_pair_list[:20])
            item['title'] = title
            item['simhash'] = str(simhash)
            item['cleaned_text'] = cleaned_text
            item['tags'] = cleaned_text_pair_list[:20]
            yield item
    except UnicodeDecodeError as e:
        logger.error(e)
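# takeSecond, used as the sort key above, is not shown in this section; it
# presumably just selects the weight from a [term, weight] pair:
def takeSecond(self, pair):
    return pair[1]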
def parseNews(self,response): data = response.xpath("//div[@class='post_content_main']") item = NewsSpiderItem() timee = data.xpath("//div[@class='post_time_source']/text()").extract() title = data.xpath("//h1/text()").extract() content = data.xpath("//div[@class='post_text']/p/text()").extract() time_pattern = re.compile("[0-9]{4}-[0-9]{2}-[0-9]{2}\s[0-9]{2}:[0-9]{2}") if(len(timee)!=0 and len(title)!=0 and len(content)!=0): tm = time_pattern.findall(timee[0])[0] item['time'] = int(time.mktime(time.strptime(tm,'%Y-%m-%d %H:%M'))) item['title'] = title[0] item['url'] = response.url item['origin'] = 'netease' cc='' if(len(content)!=0): for c in content: cc = cc+c+'\n' item['content'] = cc yield item
def parse(self, response): news_list = response.xpath("//table[@class='gongzuo']/tr") for info_item in news_list: news_item = NewsSpiderItem() published_at_text = info_item.xpath( ".//td[2]/text()").extract_first().strip() news_item['title'] = info_item.xpath( ".//td[@class='zuobian']/a/text()").extract_first() news_item['origin_website'] = 'SEMI大导体产业网' news_item['origin_host'] = self.allowed_domains[0] news_item['origin_url'] = self.domain + info_item.xpath( ".//td[@class='zuobian']/a/@href").extract_first() news_item['section'] = '大导体产业网 > IC设计与制造' news_item['abstract'] = '' news_item['published_at'] = int( datetime.datetime.strptime(published_at_text, "%Y-%m-%d").timestamp()) news_item['created_at'] = int(datetime.datetime.now().timestamp()) if self.deadline > news_item['published_at']: return yield news_item next_link = response.xpath( "//table[@id='AspNetPager1']//tr/td/a[contains('下一页',text())]/@href" ).extract_first() if next_link: self.start_page = self.start_page + 1 yield scrapy.FormRequest(url=self.start_urls[0], dont_filter=True, formdata={ '__EVENTTARGET': 'AspNetPager1', '__EVENTARGUMENT': str(self.start_page), '__VIEWSTATE': self.param_viewstate, '__VIEWSTATEGENERATOR': self.param_viewstategenerator }, callback=self.parse)
def parseNews(self, response): articles = response.xpath("//div[@id='pagelet-article']") item = NewsSpiderItem() title = articles.xpath( "//div[@class='article-header']/h1/text()").extract()[0] tm = articles.xpath( "//div[@id='pagelet-article']//span[@class='time']/text()" ).extract()[0] content = articles.xpath( "//div[@class='article-content']//p/text()").extract() if (len(title) != 0 and len(tm) != 0 and len(content) != 0): item['title'] = title item['time'] = int(time.mktime(time.strptime(tm, '%Y-%m-%d %H:%M'))) item['url'] = response.url cc = '' if (len(content) != 0): for c in content: cc = cc + c + '\n' item['content'] = cc yield item
def parse(self, response):
    news_list = json.loads(response.body_as_unicode())['newsList']
    for info_item in news_list:
        news_item = NewsSpiderItem()
        news_item['title'] = info_item['title']
        news_item['origin_website'] = 'OFweek人工智能网'
        news_item['origin_host'] = self.allowed_domains[0]
        news_item['origin_url'] = info_item['htmlpath']
        news_item['section'] = 'OFweek人工智能网 > 自然语言处理'
        news_item['abstract'] = info_item['summery']  # 'summery' is the API's own field name
        news_item['created_at'] = int(datetime.datetime.now().timestamp())
        news_item['published_at'] = int(
            datetime.datetime.strptime(info_item['addtimeStr'], "%Y-%m-%d %H:%M:%S").timestamp())
        if self.deadline > news_item['published_at']:
            return
        yield news_item
    self.start_page = self.start_page + 1
    next_link = 'https://ai.ofweek.com/CAT-201718-nlp-' + str(self.start_page) + '.html'
    yield scrapy.Request(next_link, callback=self.parse)
def parse(self, response): news_list = response.xpath("//td[@id='ArticleBody']/ul/li") for info_item in news_list: news_item = NewsSpiderItem() news_item['title'] = info_item.xpath( ".//p/a/span/text()").extract_first() news_item['origin_website'] = '中国半导体行业协会' news_item['origin_host'] = self.allowed_domains[0] news_item['origin_url'] = info_item.xpath( ".//p/a/@href").extract_first() news_item['section'] = '中国半导体行业协会 > 行业要闻' news_item['abstract'] = '' news_item['created_at'] = int(datetime.datetime.now().timestamp()) published_at_text = info_item.xpath( ".//p/span[2]/text()").extract_first() if published_at_text: print(published_at_text) published_at = re.sub(u"[(\()(\))]", "", published_at_text.strip()) news_item['published_at'] = int( datetime.datetime.strptime( published_at, "%Y-%m-%d %H:%M:%S").timestamp()) if self.deadline > news_item['published_at']: return else: continue # print(news_item) yield news_item next_link = response.xpath( "//div[@class='showpage']/form/a[contains(text(),'下一页')]/@href" ).extract_first() if next_link: yield scrapy.Request('http://www.csia.net.cn/Article/' + next_link, callback=self.parse, errback=self.err_callback)
def parseNews(self, response): logging.info("--------------parsing news--------------") startparseSingleNews = datetime.datetime.now() logging.info("$$$$$$$$$$$$$$$$$$$$startparseSingleNews at : " + str(startparseSingleNews)) data = response.xpath("//div[@id='Cnt-Main-Article-QQ']") item = NewsSpiderItem() timee = data.xpath("//span[@class='article-time']/text()").extract() # 修改抽取内容 条件或 content = response.xpath( "//div[@id='Cnt-Main-Article-QQ']/p[@style='TEXT-INDENT: 2em']/text()" ).extract() cc = '' if len(content) > 0: self.fileName = response.url[-10:-4] + ".txt" scripts = response.xpath("//script/text()").extract() url = response.url title = response.xpath("//title/text()").extract() for c in content: cc = cc + c + '\n' content = cc.strip() logging.info("--------------pre url --------------" + response.url) logging.info("--------------pre content --------------" + content) for scriptCnt in scripts: if (scriptCnt.find('pubtime') > 0): time = self.getTimeStr(scriptCnt) logging.info("--------------time--------------" + time) break logging.info("--------------questions urls--------------" + response.url) title = u''.join(title[0]).encode('utf-8') logging.info("--------------questions title--------------" + title) content = u''.join(content).encode('utf-8') logging.info("--------------content title--------------" + content) if (len(content) > 0): if (url.find("sports.qq.com") >= 0): self.save("tencent/sports/", url, time, title, content) elif (url.find("finance.qq.com") >= 0 or url.find("money.qq.com") >= 0 or url.find("stock.qq.com") >= 0): self.save("tencent/finance/", url, time, title, content) elif (url.find("ent.qq.com") >= 0): self.save("tencent/ent/", url, time, title, content) elif (url.find("tech.qq.com") >= 0): self.save("tencent/tech/", url, time, title, content) elif (url.find("auto.qq.com") >= 0): self.save("tencent/auto/", url, time, title, content) elif (url.find("house.qq.com") >= 0): self.save("tencent/house/", url, time, title, content) elif (url.find("fashion.qq.com") >= 0): self.save("tencent/fashion/", url, time, title, content) elif (url.find("cul.qq.com") >= 0): self.save("tencent/cul/", url, time, title, content) #=============================================================== # if(url.find("finance.qq.com") >= 0 or url.find("money.qq.com") >= 0 or url.find("stock.qq.com") >= 0): # self.save("tencent/finance/", url, time, title, content) #=============================================================== endparseSingleNews = datetime.datetime.now() logging.info("$$$$$$$$$$$$$$$$$$$$ endparseSingleNews at : " + str(endparseSingleNews)) logging.info("$$$$$$$$$$$$$$$$$$$$ single news cost time : " + str(endparseSingleNews - startparseSingleNews))