def parse(self, response):
    print('a new page'.center(60, '='))
    item = ArticleItem()
    item['title'] = response.xpath('//*[@class="wxTitle"]/h2/text()').extract()[0].strip()
    item['abstract'] = response.xpath('//*[@class="wxBaseinfo"]//*[@id="ChDivSummary"]/text()').extract()[0]
    # item['cite'] = response.xpath('//div[@class="map"]/div/span/text()').extract()
    item['date'] = response.xpath('//div[@class="sourinfo"]/p[3]/a/text()').extract()[0].strip().replace(';\r\n', '')
    # Returning the item here would make the rest of the method unreachable;
    # the item is handed to parse_author via meta and yielded downstream instead.
    if response.xpath('//div[@class="wxBaseinfo"]/p[2]/label[@id="catalog_FUND"]'):
        keywords = response.xpath('//div[@class="wxBaseinfo"]/p[3]/a/text()').extract()
        item['keywords'] = [i.strip().replace(';', '') for i in keywords]
        fund = response.xpath('//div[@class="wxBaseinfo"]/p[2]/a/text()').extract()
        item['fund'] = [i.strip().replace(';', '') for i in fund]
    else:
        keywords = response.xpath('//div[@class="wxBaseinfo"]/p[2]/a/text()').extract()
        item['keywords'] = [i.strip().replace(';', '') for i in keywords]
        item['fund'] = []
    authors_info = response.xpath('//div[@class="author"]/span/a/@onclick').extract()
    authors_list = [i.strip()[21:-3].replace('\'', '').split(',') for i in authors_info]
    for author in authors_list:
        author_url = 'http://kns.cnki.net/kcms/detail/knetsearch.aspx?sfield=au&skey={}&code={}'.format(author[0], author[1])
        # item['ins'] = author_url
        yield scrapy.Request(author_url, meta={'item': item}, callback=self.parse_author)
def parse_article(self, response):
    city = response.meta['city']
    item = ArticleItem()
    # item["the_id"] is a counter assigned in the pipelines
    item["website"] = '搜狐焦点 资讯 ' + city
    item["title"] = remove_csv_noise(
        response.xpath('//div[@class="main-content"]/h1/text()').extract())
    item["link"] = response.url
    item["summary"] = remove_csv_noise(response.meta['summary'])
    item["category"] = remove_csv_noise(
        response.xpath('//div[@class="bread-crumbs-area global-clearfix"]/span/a/text()').extract())
    item["date"] = remove_csv_noise(
        response.xpath('//div[@class="info-source"]/span/text()').extract()[0])
    item["author"] = remove_csv_noise(
        response.xpath('//div[@class="info-source"]/span/a/text()').extract()[0])
    # Select the article body HTML via XPath and pass it to html2text
    item["text"] = remove_csv_noise(
        html2text.html2text(
            response.xpath('//div[@class="info-content"]').extract()[0]))
    # item["crwaler_time"] =
    item["other"] = '搜狐焦点 资讯 ' + city
    yield item
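# The remove_csv_noise helper used above (and in several callbacks below) is
# referenced but not defined in this file. A minimal sketch, assuming its only
# job is to join extract() lists and strip the characters that would break a
# CSV export, mirroring the replace() chains used elsewhere in this file;
# this is an assumption, not the project's actual implementation:
def remove_csv_noise(value):
    # Join list results returned by extract() into a single string.
    if isinstance(value, (list, tuple)):
        value = ' '.join(value)
    # Replace CSV-breaking characters (ASCII comma -> full-width comma,
    # drop newlines, tabs and carriage returns).
    return (value.replace(',', ',')
                 .replace('\n', '')
                 .replace('\t', '')
                 .replace('\r', '')
                 .strip())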
def parse_list(self, response):
    items = []
    bookItem = response.meta['bookItem']
    result = JSONDecoder().decode(response.body)
    for jsonitem in result['catalog']:
        if jsonitem['grade'] == 2:
            sourceUuid = result['book']['sourceUuid']
            item = ArticleItem()
            item['author'] = bookItem['author']
            item['source'] = bookItem['source']
            item['issue'] = bookItem['issue']
            item['title'] = jsonitem['title']
            item['url'] = 'http://yuedu.163.com/book_reader/' + sourceUuid + '/' + jsonitem['uuid']
            # Temporarily store the URL of the follow-up AJAX request in the content field
            item['content'] = ('http://yuedu.163.com/getArticleContent.do?sourceUuid=' +
                               sourceUuid + '&articleUuid=' + jsonitem['uuid'])
            items.append(item)
            # yield item
    for item in items:
        yield Request(item['content'], meta={'item': item}, callback=self.parse_details)
def parse_pages(self, response):
    items = []
    for sel in response.xpath(
            '//*[@id="page-163-com"]/div[2]/div[3]/div/div[2]/div[2]/div/div[2]/div'):
        item = ArticleItem()
        sourceUuid = sel.xpath('a/@href').extract()[0].split('/')[2]
        item['author'] = sel.xpath('//*[@class="author-container"]/dl/dd/text()').extract()[0]
        reload(sys)
        sys.setdefaultencoding('utf-8')
        item['source'] = sel.xpath('a/h2/text()').extract()[0].replace('《', '').replace('》', ' ').split()[0]
        item['issue'] = sel.xpath('a/h2/text()').extract()[0].replace('《', '').replace('》', ' ').split()[1]
        item['url'] = ('http://yuedu.163.com/newBookReader.do?operation=info&catalogOnly=true&sourceUuid=' +
                       sourceUuid)
        items.append(item)
        # yield item_url
    for item in items:
        yield Request(item['url'],
                      meta={'sourceUuid': sourceUuid, 'bookItem': item},
                      callback=self.parse_list)
def parseBooks(self, response):
    for sel in response.xpath('//*[@id="main"]/article/div[1]/div[1]'):
        item = ArticleItem()
        item['url'] = sys.getdefaultencoding()
        # item['title'] = sys.getdefaultencoding()
        item['title'] = sel.xpath('h1/text()').extract()[0].strip()
        return item
def parse_pages(self, response):
    item_urls = []
    for sel in response.xpath('//*[@id="top2"]/div/ul/li/div[1]'):
        item_url = ArticleItem()
        item_url['url'] = sel.xpath('a/@href').extract()[0]
        item_urls.append(item_url)
        # yield item_url
    for item_url in item_urls:
        yield Request(item_url['url'], callback=self.parse_list)
def parse_list(self, response):
    items = []
    for sel in response.xpath('//*[@class="jx_Article"]/ul/li/h2'):
        item = ArticleItem()
        item['url'] = 'http://www.dooland.com/magazine/' + sel.xpath('a/@href').extract()[0].strip()
        # item['title'] = sel.xpath('a/@title').extract()[0].strip()
        items.append(item)
    for item in items:
        # yield Request(item['url'], meta={'item': item}, callback=self.parse_details)
        yield Request(item['url'], callback=self.parse_details)
def parse_article(self, response): item = ArticleItem() # item["the_id"] = item["website"] = "辅导圈" item["title"] = remove_csv_noise(response.meta['title']) item["link"] = response.url item["summary"] = remove_csv_noise(response.meta['summary']) item["date"] = remove_csv_noise(response.meta['date']) item["category"] = remove_csv_noise(response.xpath('//div[@class="article-meta"]/span/a/text()').extract()) item["author"] = remove_csv_noise(response.meta['author']) # 根据xpath语法选取正文部分的HTML传递给html2text item["text"] = remove_csv_noise(html2text.html2text(response.xpath('//article[@class="article-content"]').extract()[0])) # item["crwaler_time"] = item["other"] = '教育' yield item
def parse_article(self, response): item = ArticleItem() # item["the_id"] = item["website"] = "鲸媒体" item["title"] = str(response.xpath('//h1[@class="title"]/text()').extract()[0]).replace(',',',').replace('\n','').replace('\t','').replace('\r','') item["link"] = response.url item["summary"] = str(response.meta['summary']).replace(',',',').replace('\n','').replace('\t','').replace('\r','') item["category"] = str(response.xpath('//span[@itemprop="name"]/text()').extract()).replace(',',',').replace('\n','').replace('\t','').replace('\r','') item["date"] = str(response.xpath('//span[@class="postclock"]/text()').extract()[0]).replace(',',',').replace('\n','').replace('\t','').replace('\r','') item["author"] = str(response.xpath('//span[@class="postoriginal"]/text()').extract()).replace(',',',').replace('\n','').replace('\t','').replace('\r','') # 根据xpath语法选取正文部分的HTML传递给html2text item["text"] = str(html2text.html2text(response.xpath('//div[@class="post-content"]').extract()[0])).replace(',',',').replace('\n','').replace('\t','').replace('\r','') # item["crwaler_time"] = item["other"] = '' yield item
def parse_details(self, response):
    item = ArticleItem()
    sel = Selector(response)
    item['url'] = response.url
    item['title'] = sel.xpath('//*[@class="title"]/div/h1/text()').extract()[0].strip()
    item['content'] = sel.xpath('//*[@id="article"]/div').extract()[0]
    item['source'] = sel.xpath('//*[@id="main"]/aside/section[1]/h3/text()').extract()[0].split()[0]
    item['issue'] = sel.xpath('//*[@id="main"]/aside/section[1]/h3/text()').extract()[0].split()[1]
    # TODO: source ID
    item['source_id'] = sel.xpath('//*[@id="main"]/aside/section[1]/h3/text()').extract()[0]
    item['author'] = ''
    return item
def parse_article(self, response): item = ArticleItem() # item["the_id"] = item["website"] = "胡润百富" item["title"] = str( response.xpath('//div[@class="title"]/text()').extract()).replace( ',', ',').replace('\n', '').replace('\t', '').replace('\r', '') item["link"] = response.url item["summary"] = str( response.xpath( '//section[@class][@style]/text()').extract()).replace( ',', ',').replace('\n', '').replace('\t', '').replace('\r', '') item["category"] = str( response.xpath('//ol/li/text()').extract()).replace( ',', ',').replace('\n', '').replace('\t', '').replace('\r', '') item["date"] = str( response.xpath('//div[@class="col-sm-6 navsource-l"]/text()'). extract()).replace(',', ',').replace('\n', '').replace('\t', '').replace('\r', '') item["author"] = str( response.xpath('//div[@class="col-xs-12 text-right"]/text()'). extract()).replace(',', ',').replace('\n', '').replace('\t', '').replace('\r', '') # 根据xpath语法选取正文部分的HTML传递给html2text item["text"] = str( html2text.html2text( response.xpath( '//section[@style="font-size:16px;line-height:24px;"]'). extract()[0])).replace(',', ',').replace('\n', '').replace( '\t', '').replace('\r', '') # item["crwaler_time"] = item["other"] = '' yield item
def parse_detail(self, response):
    item = ArticleItem()
    # Extract the target fields
    # front_img_url = response.meta["front_img_url"]
    front_img_url = response.meta.get('front_img_url', '')  # URL of the article's cover image
    title = response.css('div.entry-header h1::text').extract()[0]
    release_date = response.css('p.entry-meta-hide-on-mobile ::text').extract()[0].replace(' ·', '').strip()
    tag = response.css('p.entry-meta-hide-on-mobile a::text').extract()
    tags = ','.join(tag)
    voteup_num = int(response.css('span.vote-post-up h10::text').extract()[0])
    collection_num = response.css('span.bookmark-btn::text').extract()[0]
    collection_pattern = re.match('.*?(\d+).*', collection_num)
    if collection_pattern:
        collection_num = int(collection_pattern.group(1))
    else:
        collection_num = 0
    comment_num = response.css('a[href="#article-comment"] span::text').extract()[0]
    comment_pattern = re.match('.*?(\d+).*', comment_num)
    if comment_pattern:
        comment_num = int(comment_pattern.group(1))
    else:
        comment_num = 0
    content = response.css('div.entry').extract()[0]
    item['front_img_url'] = front_img_url
    item['title'] = title
    item['url'] = response.url
    item['release_date'] = release_date
    item['tags'] = tags
    item['voteup_num'] = voteup_num
    item['collection_num'] = collection_num
    item['comment_num'] = comment_num
    item['content'] = content
    yield item
def parse_detail(self, response):
    item = ArticleItem()
    item['url_object_id'] = get_md5(response.url)
    item['front_image_url'] = [response.meta.get('front_image_url', '')]
    item['post_url'] = response.url
    item['description'] = response.meta.get('description', '')  # defaults to an empty string
    item['title'] = response.xpath('//div[@class="entry-header"]/h1/text()').extract()[0]
    item['date'] = response.xpath(
        '//p[@class="entry-meta-hide-on-mobile"]/text()').extract()[0].strip().replace('·', '').strip()
    item['category'] = response.xpath(
        '//p[@class="entry-meta-hide-on-mobile"]/a[@rel="category tag"]/text()').extract()[0]
    fav_path = '//span[contains(@class, "vote-post-up")]/h10/text()'
    item['fav_num'] = 0 if not response.xpath(fav_path).re('\d+') else int(
        response.xpath(fav_path).re('\d+')[0])
    collections_path = '//span[@class=" btn-bluet-bigger href-style bookmark-btn register-user-only "]/text()'
    item['collections'] = 0 if not response.xpath(collections_path).re('\d+') else int(
        response.xpath(collections_path).re('\d+')[0])
    comment_path = '//span[@class="btn-bluet-bigger href-style hide-on-480"]/text()'
    item['comment'] = 0 if not response.xpath(comment_path).re('\d+') else int(
        response.xpath(comment_path).re('\d+')[0])
    yield item
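# get_md5 used above is another helper defined elsewhere in the project. A
# plausible sketch, assuming it simply returns the hex MD5 digest of the URL
# so it can serve as a fixed-length row key (an assumption, not the original
# implementation):
import hashlib


def get_md5(url):
    # Hash the URL bytes; encode first in case a unicode string is passed in.
    if not isinstance(url, bytes):
        url = url.encode('utf-8')
    return hashlib.md5(url).hexdigest()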
def readFile(self, response):
    # def parse(self, response):
    dir = "E:\\kanlishi"
    wildcard = ".txt"
    exts = wildcard.split(" ")
    files = os.listdir(dir)
    count = 0
    items = []
    for name in files:
        for ext in exts:
            if name.endswith(ext):
                aid = name.split('_')[2]
                count = count + 1
                item = ArticleItem()
                item['source'] = '看历史'.decode('utf8')
                item['issue'] = name.split('_')[0].decode('GBK')
                item['url'] = 'http://www.dooland.com/magazine/article_' + aid + '.html'
                items.append(item)
                yield Request(item['url'], meta={'item': item}, callback=self.parse_kanlishi_details)
                break
def parse_article(self, response):
    if 'html><head>' in response.body_as_unicode():
        return
    item = ArticleItem()
    # item["the_id"] is a counter assigned in the pipelines
    item["website"] = '观点房产'
    item["title"] = remove_csv_noise(response.meta['title'])
    item["link"] = response.url
    item["summary"] = remove_csv_noise(response.meta['summary'])
    item["date"] = remove_csv_noise(response.meta['date'])
    item["category"] = '资讯'
    item["author"] = remove_csv_noise(
        response.xpath('//div[@class="con_l_info_l"]/a/text()').extract()[-1])
    # Select the article body HTML via XPath and pass it to html2text
    item["text"] = remove_csv_noise(
        html2text.html2text(
            response.xpath('//div[@class="con_l_inner"]').extract()[0]))
    # item["crwaler_time"] =
    item["other"] = '观点房产 资讯'
    yield item
def parse(self, response):
    if len(self.currentUrl) == 0:
        self.currentUrl = str(response.url)
    sel = Selector(response)
    title = sel.xpath('//div[@class="content"]/h1/text()').extract()
    nextArticleUrlList = sel.xpath('//div[@class="content"]/div[@class="pre_art"]/a').extract()
    nextArticleUrl = ''
    if len(nextArticleUrlList) > 1:
        nextArticleUrl = sel.xpath(
            '//div[@class="content"]/div[@class="pre_art"]/a[last()]/@href').extract()[0]
    contents = sel.xpath('//div[@class="content"]/div[@class="content_01"]/p')
    nextPage = sel.xpath('//div[@class="page2"]/a[last()]')
    nextPageStr = nextPage.xpath('./text()').extract()[0].encode('utf-8')
    nextPageUrl = nextPage.xpath('./@href').extract()[0]
    # log.msg("Append done." + nextPageStr + nextPageUrl)
    for content in contents:
        # Filter out a few special cases: first check whether the paragraph is an image
        imgs = content.xpath('./img')
        if imgs:
            # The paragraph contains images
            for img in imgs:
                if str(img.xpath('@src').extract()[0]).startswith('data:image/'):
                    log.msg('discarding inline data image')
                else:
                    imgpath = img.xpath('@src').extract()[0]
                    if imgpath.startswith('http:') or imgpath.startswith('https:'):
                        log.msg('nothing to do')
                    else:
                        imgpath = self.imageQianZhui + imgpath
                    if len(self.currentMainImage) == 0:
                        self.currentMainImage = imgpath
                    self.contentList.append(imgpath)
        else:
            # Not an image: keep bold text and plain text separately
            if content.xpath('./strong'):
                strongStr = content.xpath('./strong/text()').extract()[0]
                self.contentList.append(strongStr)
            else:
                textStr = content.xpath('./text()').extract()[0]
                self.contentList.append(textStr)
    nextStr = '下一页'
    # nextStr.encode('utf-8')  # result was discarded; the comparison below works on the raw strings
    if nextPageStr == nextStr:
        # The current article has a next page
        # log.msg("Append done.----equal")
        # log.msg("Append done.----nextPageUrl:" + nextPageUrl)
        yield Request(nextPageUrl, callback=self.parse)
    else:
        # No more pages: assemble the item
        item = ArticleItem()
        item['title'] = [t.encode('utf-8') for t in title]
        item['title'] = item['title'][0]
        contentStr = ""
        for index in range(len(self.contentList)):
            contentStr += self.contentList[index].encode('utf-8')
            if index != len(self.contentList) - 1:
                contentStr += '$'
        item['content'] = contentStr
        item['url'] = self.currentUrl.encode('utf-8')
        item['mainImage'] = self.currentMainImage.encode('utf-8')
        print(self.contentList)
        self.contentList = []
        self.articleCount += 1
        self.currentUrl = ''
        self.currentMainImage = ''
        yield item
        # Try to crawl the next article
        if nextArticleUrl and self.articleCount < self.articleMaxCount:
            log.msg("Append done.----nextArticleUrl:" + nextArticleUrl)
            yield Request(nextArticleUrl, callback=self.parse)