def parse(self, response):
    print(response.url)
    parent_titles = response.xpath(
        '//div[@id="tab01"]//h3[@class="tit02"]/a/text()').extract()
    parent_urls = response.xpath(
        '//div[@id="tab01"]//h3[@class="tit02"]/a/@href').extract()
    sub_titles = response.xpath(
        '//div[@id="tab01"]//ul[@class="list01"]/li/a/text()').extract()
    sub_urls = response.xpath(
        '//div[@id="tab01"]//ul[@class="list01"]/li/a/@href').extract()
    # Parent categories
    for index in range(len(parent_titles)):
        parent_title = parent_titles[index]
        parent_url = parent_urls[index]
        # Loop over the sub-categories under each parent category
        for index_sub in range(len(sub_titles)):
            sub_title = sub_titles[index_sub]
            sub_url = sub_urls[index_sub]
            if sub_url.startswith(parent_url):
                tiezi_path = "./datas/" + parent_title + "/" + sub_title
                if not os.path.exists(tiezi_path):
                    os.makedirs(tiezi_path)
                item = SinaItem()
                item['parent_title'] = parent_title
                item['parent_url'] = parent_url
                item['sub_title'] = sub_title
                item['sub_url'] = sub_url
                item['tiezi_path'] = tiezi_path
                yield scrapy.Request(sub_url, callback=self.seconde_detail,
                                     meta={'item': item})
def parse(self, response):
    parent_list = response.xpath(
        '//div[@id="tab01"]/div[@class="clearfix"]/h3[@class="tit02"]/a')
    for parent in parent_list:
        parent_url = parent.xpath('./@href').extract()[0]
        parent_title = parent.xpath('./text()').extract()[0]
        sub_list = response.xpath(
            '//div[@id="tab01"]/div[@class="clearfix"]/ul/li/a')
        for sub in sub_list:
            sub_url = sub.xpath('./@href').extract()[0]
            sub_title = sub.xpath('./text()').extract()[0]
            if sub_url.startswith(parent_url):
                # Create a fresh item per request so concurrent requests do
                # not overwrite each other's fields through the shared meta
                item = SinaItem()
                item['parent_url'] = parent_url
                item['parent_title'] = parent_title
                item['sub_url'] = sub_url
                item['sub_title'] = sub_title
                save_path = './data/' + parent_title + '/' + sub_title + '/'
                item['save_path'] = save_path
                yield scrapy.Request(sub_url, callback=self.second,
                                     meta={'item': item})
def second_parse(self, response):
    # Retrieve the meta data carried by this response
    meta_1 = response.meta['meta_1']
    # Extract all child links on the sub-category page
    sonUrls = response.xpath('//a/@href').extract()
    items = []
    for i in range(0, len(sonUrls)):
        # A link belongs to this parent category if it starts with the
        # parent url and ends with .shtml
        if_belong = sonUrls[i].endswith('.shtml') and \
            sonUrls[i].startswith(meta_1['parentUrls'])
        # If it belongs, copy the shared fields into one item for transfer
        if if_belong:
            item = SinaItem()
            item['parentTitle'] = meta_1['parentTitle']
            item['parentUrls'] = meta_1['parentUrls']
            item['subUrls'] = meta_1['subUrls']
            item['subTitle'] = meta_1['subTitle']
            item['subFilename'] = meta_1['subFilename']
            item['sonUrls'] = sonUrls[i]
            items.append(item)
    # Send a Request for every child url under the sub-category; the
    # Response, together with the meta data, is handled by detail_parse
    for item in items:
        yield scrapy.Request(url=item['sonUrls'], meta={'meta_2': item},
                             callback=self.detail_parse)
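# The detail_parse callback referenced above is not included in these
# snippets. Below is a minimal sketch of what it could look like, assuming
# the item fields set in second_parse and two additional SinaItem fields
# ('head' and 'content'); the field names and xpaths are assumptions, not
# the original implementation.
def detail_parse(self, response):
    item = response.meta['meta_2']
    # Article headline and body paragraphs (typical Sina article markup)
    head = response.xpath('//h1/text()').extract_first(default='')
    paragraphs = response.xpath('//div[@class="article"]/p/text()').extract()
    item['head'] = head.strip()
    item['content'] = '\n'.join(p.strip() for p in paragraphs if p.strip())
    yield item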
def parse(self, response):
    resp = response.body.decode('gbk')
    # The body is a JavaScript object rather than plain JSON, so evaluate
    # it with js2py to obtain the news list
    ret = js2py.eval_js(resp)
    for news in ret.list:
        item = SinaItem()
        item['sort'] = news['channel']['title']
        item['title'] = news['title']
        item['time'] = news['time']
        item['url'] = news['url']
        yield item
        yield scrapy.Request(item['url'], meta={'detail_item': item},
                             callback=self.parse_page)
def parse_mid(self, response):
    meta_1 = response.meta['meta1']
    # Grab the article urls and titles from the listing page
    list_urls = response.xpath('//ul/li//a/@href').extract()
    list_title = response.xpath('//ul/li//a/text()').extract()
    items = []
    for url, title in zip(list_urls, list_title):
        item = SinaItem()
        # Parent category
        item['origin_title'] = meta_1['origin_title']
        item['origin_link'] = meta_1['origin_link']
        # Sub-category
        item['mid_title'] = meta_1['mid_title']
        item['mid_link'] = meta_1['mid_link']
        item['mid_filename'] = meta_1['mid_filename']
        # Listing page
        # item['news_link'] = url if url.startswith(item['origin_link']) else item['origin_link'] + url
        item['news_link'] = url
        item['lnews_title'] = title
        items.append(item)
    for item in items:
        yield scrapy.Request(url=item['news_link'], meta={'meta2': item},
                             callback=self.parse_detail)
def parse(self, response):
    # Parent category titles: parent_title
    parent_titles = response.xpath(
        '//div[@id="tab01"]//h3[@class="tit02"]/a/text()').extract()
    # Parent category links: parent_url
    parent_urls = response.xpath(
        '//div[@id="tab01"]//h3[@class="tit02"]/a/@href').extract()
    # Sub-category titles: sub_title
    sub_titles = response.xpath(
        '//div[@id="tab01"]//ul[@class="list01"]/li/a/text()').extract()
    # Sub-category links: sub_url
    sub_urls = response.xpath(
        '//div[@id="tab01"]//ul[@class="list01"]/li/a/@href').extract()
    print(len(parent_titles), len(parent_urls))
    print(len(sub_titles), len(sub_urls))
    # Parent categories
    for index in range(len(parent_titles)):
        parent_title = parent_titles[index]
        parent_url = parent_urls[index]
        # Loop over the sub-categories
        for index_sub in range(len(sub_urls)):
            sub_title = sub_titles[index_sub]
            sub_url = sub_urls[index_sub]
            # e.g. https://news.sina.com.cn/        news
            #      https://news.sina.com.cn/china/  domestic
            if sub_url.startswith(parent_url):
                sub_path = "./datas/" + parent_title + "/" + sub_title
                if not os.path.exists(sub_path):
                    os.makedirs(sub_path)
                # The item is not complete yet; it travels with the next
                # request and the remaining fields are filled in after
                # that request succeeds
                item = SinaItem()
                item["parent_title"] = parent_title
                item["parent_url"] = parent_url
                item["sub_title"] = sub_title
                item["sub_url"] = sub_url
                item["tiezi_path"] = sub_path
                # Request the sub-category page directly
                yield scrapy.Request(sub_url, callback=self.seconde_detail,
                                     meta={"item": item})
def parse(self, response):
    # All parent category titles
    parent_titles = response.xpath(
        '//h3[@class="tit02"]/a/text()').extract()
    # Links for all parent categories
    parent_urls = response.xpath('//h3[@class="tit02"]/a/@href').extract()
    # All sub-category titles
    sub_titles = response.xpath(
        '//ul[@class="list01"]/li/a/text()').extract()
    # Links for all sub-categories
    sub_urls = response.xpath('//ul[@class="list01"]/li/a/@href').extract()
    items = []
    for i in range(len(parent_titles)):
        # e.g. http://news.sina.com.cn/  news
        parent_url = parent_urls[i]
        parent_title = parent_titles[i]
        for j in range(len(sub_urls)):
            # e.g. http://news.sina.com.cn/world/  international
            sub_url = sub_urls[j]
            sub_title = sub_titles[j]
            # A sub-category belongs to a parent category when its url
            # starts with the parent url
            if sub_url.startswith(parent_url):
                # Fill the item
                item = SinaItem()
                # Create the directory if it does not exist
                sub_file_name = "./Data/" + parent_title + "/" + sub_title
                if not os.path.exists(sub_file_name):
                    os.makedirs(sub_file_name)
                item["parent_url"] = parent_url
                item["parent_title"] = parent_title
                item["sub_url"] = sub_url
                item["sub_title"] = sub_title
                item["sub_file_name"] = sub_file_name
                items.append(item)
    # Take the collected items out of the list and request each page
    for item in items:
        sub_url = item["sub_url"]
        # meta={"item": item} passes a reference to the SinaItem object
        yield scrapy.Request(sub_url, callback=self.parse_second,
                             meta={"item": item}, dont_filter=False)
def parse(self, response):
    items = []
    # Urls and titles of all parent categories
    parentUrls = response.xpath(
        '//div[@id="tab01"]/div/h3/a/@href').extract()
    parentTitle = response.xpath(
        '//div[@id="tab01"]/div/h3/a/text()').extract()
    # Urls and titles of all sub-categories
    subUrls = response.xpath(
        '//div[@id="tab01"]/div/ul/li/a/@href').extract()
    subTitle = response.xpath(
        '//div[@id="tab01"]/div/ul/li/a/text()').extract()
    # Crawl every parent category
    for i in range(0, len(parentTitle)):
        # Path and directory name for the parent category
        # parentFilename = "./Data/" + parentTitle[i]
        # Create the directory if it does not exist
        # if not os.path.exists(parentFilename):
        #     os.makedirs(parentFilename)
        # Crawl every sub-category
        for j in range(0, len(subUrls)):
            item = SinaItem()
            # Save the parent category's title and url
            item['parentTitle'] = parentTitle[i]
            item['parentUrls'] = parentUrls[i]
            # Check whether the sub-category url starts with the matching
            # parent url, e.g. sports.sina.com.cn and sports.sina.com.cn/nba
            if_belong = subUrls[j].startswith(item['parentUrls'])
            # If it belongs, place its storage directory under the parent
            if if_belong:
                # subFilename = parentFilename + '/' + subTitle[j]
                # if not os.path.exists(subFilename):
                #     os.makedirs(subFilename)
                # Store the sub-category url and title
                item['subUrls'] = subUrls[j]
                item['subTitle'] = subTitle[j]
                # item['subFilename'] = subFilename
                items.append(item)
    # Request every sub-category url; the Response and the meta data are
    # handed to the second_parse callback
    for item in items:
        yield scrapy.Request(url=item['subUrls'], meta={'meta_1': item},
                             callback=self.second_parse)
def parse(self, response):
    # Convert the returned JSON into a Python object
    js = json.loads(response.body)
    # Take the value under the 'result' key
    result = js['result']
    for article in result['data']:
        item = SinaItem()
        item['article_urls'] = article['url']
        request = scrapy.Request(url=item['article_urls'],
                                 meta={'meta_article': item},
                                 callback=self.article_parse,
                                 dont_filter=True)
        # Flag presumably read by a custom downloader middleware to render
        # this request with ChromeDriver
        request.meta["ChromeDriver"] = True
        yield request
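# The middleware that honours the "ChromeDriver" meta flag is not shown in
# these snippets. Below is a minimal sketch of such a downloader middleware,
# assuming Selenium is used for rendering; the class name, structure and
# wiring are illustrative assumptions, not the original project's code.
from scrapy.http import HtmlResponse
from selenium import webdriver

class ChromeDriverMiddleware(object):
    def __init__(self):
        self.driver = webdriver.Chrome()

    def process_request(self, request, spider):
        # Only render requests that carry the ChromeDriver flag
        if request.meta.get('ChromeDriver'):
            self.driver.get(request.url)
            body = self.driver.page_source
            # Returning a response short-circuits the normal download
            return HtmlResponse(url=request.url, body=body,
                                encoding='utf-8', request=request)
        return None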
def parse(self, response):
    items = []
    # Urls and titles of all parent categories
    parentTitle = response.xpath('//h3[@class="tit02"]/a/text()').extract()
    parentUrls = response.xpath('//h3[@class="tit02"]/a/@href').extract()
    # Urls and titles of all sub-categories
    subTitle = response.xpath(
        '//ul[@class="list01"]/li/a/text()').extract()
    subUrls = response.xpath('//ul[@class="list01"]/li/a/@href').extract()
    # Crawl every parent category
    for i in range(0, len(parentTitle)):
        # Crawl every sub-category
        for j in range(0, len(subTitle)):
            item = SinaItem()
            # The belongs-to check is disabled here, so every sub-category
            # is paired with every parent category:
            # if_belong = subUrls[j].startswith(item['parentUrls'])
            # if if_belong:
            # Directory for the sub-category; create it if it does not exist
            subFilename = "./Data/" + subTitle[j]
            if not os.path.exists(subFilename):
                os.makedirs(subFilename)
            # Save the parent category's title and url
            item['parentTitle'] = parentTitle[i]
            item['parentUrls'] = parentUrls[i]
            # Store the sub-category url, title and filename
            item['subUrls'] = subUrls[j]
            item['subTitle'] = subTitle[j]
            item['subFilename'] = subFilename
            items.append(item)
    # Request every sub-category url; the Response and the meta data are
    # handed to the second_parse callback
    for item in items:
        yield scrapy.Request(url=item['subUrls'], meta={'meta_1': item},
                             callback=self.second_parse)
def parse(self, response):
    items = []
    # Parent category urls and titles
    parentUrls = response.xpath(
        '//div[@id="tab01"]/div/h3/a/@href').extract()
    parentTitles = response.xpath(
        '//div[@id="tab01"]/div/h3/a/text()').extract()
    # Sub-category urls and titles
    subUrls = response.xpath(
        '//div[@id="tab01"]/div/ul/li/a/@href').extract()
    subTitles = response.xpath(
        '//div[@id="tab01"]/div/ul/li/a/text()').extract()
    # Crawl every parent category
    for i in range(0, len(parentTitles)):
        # Directory path for the parent category
        parentFilename = './Data/' + parentTitles[i]
        # Create it if it does not exist
        if not os.path.exists(parentFilename):
            os.makedirs(parentFilename)
        # Crawl the sub-categories
        for j in range(0, len(subUrls)):
            item = SinaItem()
            # Save the parent category's title and url
            print(parentTitles[i], i)
            item['parentTitle'] = parentTitles[i]
            item['parentUrls'] = parentUrls[i]
            # Check whether the sub-category url starts with the parent url
            if_belong = subUrls[j].startswith(item['parentUrls'])
            # If it belongs, place its directory under the parent directory
            if if_belong:
                subFilename = parentFilename + '/' + subTitles[j]
                # Create the directory if missing
                if not os.path.exists(subFilename):
                    os.makedirs(subFilename)
                # Store the sub-category url, title and filename
                item['subUrls'] = subUrls[j]
                item['subTitle'] = subTitles[j]
                item['subFilename'] = subFilename
                items.append(item)
    for item in items:
        yield scrapy.Request(url=item['subUrls'], meta={'meta_1': item},
                             callback=self.second_parse)
def parse(self, response):
    soup = BeautifulSoup(response.body, 'lxml')
    # Get each major block on the page (the last div is dropped)
    block_list = soup.find(id='tab01').find_all('div')[:-1]
    items = []
    for block in block_list:
        # Do not create the item here: each news entry in each
        # sub-category needs its own item object
        # Get the parent category title and create its directory on disk
        origin_title = block.find('h3').get_text()
        origin_link = block.find('a').get('href')
        ori_filename = './data/' + origin_title
        if not os.path.exists(ori_filename):
            os.makedirs(ori_filename)
        # Get the sub-category titles and create their directories under
        # the parent category
        mid_title_list = block.find_all('li')
        for mid_title in mid_title_list:
            mtitle = mid_title.get_text()
            mlink = mid_title.find('a').get('href')
            item = SinaItem()
            # Parent category
            item['origin_title'] = origin_title
            item['origin_link'] = origin_link
            # Sub-category
            item['mid_title'] = mtitle
            item['mid_link'] = mlink
            m_filename = ori_filename + '/' + mtitle
            item['mid_filename'] = m_filename
            if not os.path.exists(m_filename):
                os.makedirs(m_filename)
            items.append(item)
    # Request every sub-category link; the parent and sub-category data
    # travel to the response via meta for the callback to use
    for item in items:
        yield scrapy.Request(url=item['mid_link'], meta={'meta1': item},
                             callback=self.parse_mid)
def parsecontents(self, response):
    title = response.xpath('//title/text()').extract()[0]
    meta = response.xpath('//meta/@content').extract()
    # Keywords, publish time and media name are taken from fixed positions
    # in the page's meta tags
    keywords = meta[2]
    time = meta[10]
    media = meta[13]
    paragraph = response.xpath(
        '//div[@class="article"]/p/text()').extract()
    content = ""
    for p in paragraph:
        content = content + p
    item = SinaItem()
    item['title'] = str(title)
    item['keywords'] = str(keywords)
    item['time'] = str(time)
    item['media'] = str(media)
    item['content'] = str(content)
    item['tag'] = "news"
    yield item
def second_parse(self, response):
    meta_item = response.meta['meta_item']
    url_list = response.xpath('//a/@href').extract()
    items = []
    for i in url_list:
        parent_urls = meta_item['parent_url']
        # Keep only links that start with the parent url and end in .shtml
        if i.startswith(parent_urls) and i.endswith('.shtml'):
            item = SinaItem()
            sun_url = i
            item['parent_title'] = meta_item['parent_title']
            item['parent_url'] = meta_item['parent_url']
            item['son_title'] = meta_item['son_title']
            item['son_url'] = meta_item['son_url']
            item["parent_path"] = meta_item['parent_path']
            item['grandson_url'] = sun_url
            items.append(item)
    # Third level: request every article url
    for a in items:
        sun_url = a['grandson_url']
        yield scrapy.Request(sun_url, callback=self.three_parse,
                             meta={'meta_item1': a})
def parse(self, response):
    parent_title = response.xpath(
        '//div[@id="tab01"]/div/h3/a/text()').extract()
    parent_url = response.xpath(
        '//div[@id="tab01"]/div/h3/a/@href').extract()
    son_title = response.xpath(
        '//div[@id="tab01"]/div/ul/li/a/text()').extract()
    son_url = response.xpath(
        '//div[@id="tab01"]/div/ul/li/a/@href').extract()
    item = []
    for i in range(len(parent_url)):
        # Create a folder named after the category itself
        parent_titles = './Data/' + parent_title[i]
        parent_urls = parent_url[i]
        for j in range(len(son_url)):
            son_titles = son_title[j]
            son_urls = son_url[j]
            if son_urls.startswith(parent_urls):
                items = SinaItem()
                parent_path = parent_titles + "/" + son_titles
                if not os.path.exists(parent_path):
                    os.makedirs(parent_path)
                items["parent_title"] = parent_titles
                items["parent_url"] = parent_urls
                items["son_title"] = son_titles
                items["son_url"] = son_urls
                items["parent_path"] = parent_path
                item.append(items)
    # Second level: request every sub-category url
    for x in item:
        son_urls = x['son_url']
        yield scrapy.Request(son_urls, callback=self.second_parse,
                             meta={"meta_item": x})
def parse_detail(self, response):
    meta2 = response.meta['meta2']
    contents_list = response.xpath('//p/text()').extract()
    contents = ''
    # Use part of the url as the title; fall back to the listing-page title
    # when the url fragment is too short
    title = response.url[7:-6] if len(
        response.url[7:-6]) > 10 else meta2['lnews_title']
    title = title.replace('/', '-')
    for content in contents_list:
        if content.strip():
            contents += content.strip() + '\n'
    item = SinaItem()
    # Parent category
    item['origin_title'] = meta2['origin_title']
    item['origin_link'] = meta2['origin_link']
    # Sub-category
    item['mid_title'] = meta2['mid_title']
    item['mid_link'] = meta2['mid_link']
    item['mid_filename'] = meta2['mid_filename']
    # Listing page
    item['news_link'] = meta2['news_link']
    # Detail page
    item['news_content'] = contents
    item['news_title'] = title
    yield item
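# None of the snippets above show how the yielded items are persisted. Below
# is a minimal item pipeline sketch, assuming the sub-category directories
# created in parse() already exist and that mid_filename, news_title and
# news_content are the fields populated above; the class name and its
# registration in settings.py are illustrative assumptions.
import os

class SaveNewsPipeline(object):
    def process_item(self, item, spider):
        # Write one text file per article inside the sub-category directory
        filename = os.path.join(item['mid_filename'],
                                item['news_title'] + '.txt')
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(item['news_content'])
        return item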