Example #1
    def parse(self, response):
        print(response.url)
        parent_titles = response.xpath(
            '//div[@id="tab01"]//h3[@class="tit02"]/a/text()').extract()
        parent_urls = response.xpath(
            '//div[@id="tab01"]//h3[@class="tit02"]/a/@href').extract()
        sub_titles = response.xpath(
            '//div[@id="tab01"]//ul[@class="list01"]/li/a/text()').extract()
        sub_urls = response.xpath(
            '//div[@id="tab01"]//ul[@class="list01"]/li/a/@href').extract()
        # Loop over the parent titles
        for index in range(len(parent_titles)):
            parent_title = parent_titles[index]
            parent_url = parent_urls[index]
            # print(parent_url,parent_title)

            # Under each parent title, loop over the sub-titles
            for index_sub in range(len(sub_titles)):
                sub_title = sub_titles[index_sub]
                sub_url = sub_urls[index_sub]
                if sub_url.startswith(parent_url):
                    tiezi_path = "./datas/" + parent_title + "/" + sub_title
                    if not os.path.exists(tiezi_path):
                        os.makedirs(tiezi_path)
                    # print(sub_url, sub_title)
                    item = SinaItem()
                    item['parent_title'] = parent_title
                    item['parent_url'] = parent_url
                    item['sub_title'] = sub_title
                    item['sub_url'] = sub_url
                    item['tiezi_path'] = tiezi_path
                    yield scrapy.Request(sub_url,
                                         callback=self.seconde_detail,
                                         meta={'item': item})
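
Most of these callbacks fill a SinaItem defined in the project's items.py, which is not shown on this page. A minimal sketch of such an item class, assuming only the field names used in Example #1 (other examples use different field sets), could look like this:

import scrapy


class SinaItem(scrapy.Item):
    # Hypothetical field declarations inferred from the assignments in Example #1;
    # the real items.py may declare additional fields.
    parent_title = scrapy.Field()
    parent_url = scrapy.Field()
    sub_title = scrapy.Field()
    sub_url = scrapy.Field()
    tiezi_path = scrapy.Field()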
Example #2
    def parse(self, response):
        parent_list = response.xpath(
            '//div[@id="tab01"]/div[@class="clearfix"]/h3[@class="tit02"]/a')
        for parent in parent_list:
            parent_url = parent.xpath('./@href').extract()[0]
            parent_title = parent.xpath('./text()').extract()[0]

            sub_list = response.xpath(
                '//div[@id="tab01"]/div[@class="clearfix"]/ul/li/a')
            for sub in sub_list:
                sub_url = sub.xpath('./@href').extract()[0]
                sub_title = sub.xpath('./text()').extract()[0]

                if sub_url.startswith(parent_url):
                    save_path = './data/' + parent_title + '/' + sub_title + '/'

                    # Build a fresh item per sub-category so every request
                    # carries its own data instead of one shared, mutated item.
                    item = SinaItem()
                    item['parent_url'] = parent_url
                    item['parent_title'] = parent_title
                    item['sub_url'] = sub_url
                    item['sub_title'] = sub_title
                    item['save_path'] = save_path

                    yield scrapy.Request(sub_url,
                                         callback=self.second,
                                         meta={'item': item})
Example #3
    def second_parse(self, response):
        # Extract the meta data carried over from the previous request
        meta_1 = response.meta['meta_1']
        # Extract every child link on the sub-category page
        sonUrls = response.xpath('//a/@href').extract()

        items = []
        for i in range(0, len(sonUrls)):
            # Check whether the link starts with the parent-category URL and ends with .shtml; True if so
            if_belong = sonUrls[i].endswith(
                '.shtml') and sonUrls[i].startswith(meta_1['parentUrls'])

            # If it belongs to this parent category, gather the fields into one item for passing along
            if (if_belong):
                item = SinaItem()
                item['parentTitle'] = meta_1['parentTitle']
                item['parentUrls'] = meta_1['parentUrls']
                item['subUrls'] = meta_1['subUrls']
                item['subTitle'] = meta_1['subTitle']
                item['subFilename'] = meta_1['subFilename']
                item['sonUrls'] = sonUrls[i]
                items.append(item)

        # Send a Request for each child link; the Response, together with its meta data, is handed to the detail_parse callback
        for item in items:
            yield scrapy.Request(url=item['sonUrls'],
                                 meta={'meta_2': item},
                                 callback=self.detail_parse)
Example #4
    def parse(self, response):
        resp = response.body.decode('gbk')
        # str = resp.split('list : ')[-1]
        # list1 = list(str[:str.rfind('}')])
        #
        # print(list1)
        # print(type(list1))

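        # The body is presumably a JavaScript object literal rather than plain JSON,
        # so js2py evaluates it and exposes its fields (e.g. ret.list) to Python.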
        ret = js2py.eval_js(resp)

        for news in ret.list:

            item = SinaItem()
            sort = news['channel']['title']
            title = news['title']
            url = news['url']
            time = news['time']

            item['sort'] = sort
            item['title'] = title
            item['time'] = time
            item['url'] = url

            yield item
            yield scrapy.Request(item['url'],
                                 meta={'detail_item': item},
                                 callback=self.parse_page)
Example #5
    def parse_mid(self, response):
        meta_1 = response.meta['meta1']
        # t1 = response.meta['time']
        # t2 = time.time()
        # print '*'*50
        # print t2-t1
        # Grab the URLs and titles from the list page
        list_urls = response.xpath('//ul/li//a/@href').extract()
        list_title = response.xpath('//ul/li//a/text()').extract()
        # print len(url_list)
        items = []
        for url, title in zip(list_urls, list_title):
            # print meta_1['mid_filename']
            # print '*'*50
            item = SinaItem()
            # Parent category
            item['origin_title'] = meta_1['origin_title']
            item['origin_link'] = meta_1['origin_link']
            # Sub-category
            item['mid_title'] = meta_1['mid_title']
            item['mid_link'] = meta_1['mid_link']
            item['mid_filename'] = meta_1['mid_filename']
            # List page
            # item['news_link'] = url if url.startswith(item['origin_link']) else item['origin_link']+url
            item['news_link'] = url
            item['lnews_title'] = title

            items.append(item)

        for item in items:
            yield scrapy.Request(url=item['news_link'],
                                 meta={'meta2': item},
                                 callback=self.parse_detail)
Example #6
    def parse(self, response):
        # print("reponse.url==",response.url)

        # Parent titles: parent_title
        parent_titles = response.xpath(
            '//div[@id="tab01"]//h3[@class="tit02"]/a/text()').extract()
        # Parent title links: parent_url
        parent_urls = response.xpath(
            '//div[@id="tab01"]//h3[@class="tit02"]/a/@href').extract()
        # Sub-titles: sub_title
        sub_titles = response.xpath(
            '//div[@id="tab01"]//ul[@class="list01"]/li/a/text()').extract()
        # Sub-title links: sub_url
        sub_urls = response.xpath(
            '//div[@id="tab01"]//ul[@class="list01"]/li/a/@href').extract()

        print(len(parent_titles), len(parent_urls))
        print(len(sub_titles), len(sub_urls))

        # Loop over the parent titles
        for index in range(len(parent_titles)):

            parent_title = parent_titles[index]

            parent_url = parent_urls[index]

            # print("parent_title==",parent_title,"parent_url==",parent_url)

            # Loop over the sub-titles
            for index_sub in range(len(sub_urls)):

                sub_title = sub_titles[index_sub]

                sub_url = sub_urls[index_sub]

                # https://news.sina.com.cn/        News
                # https://news.sina.com.cn/china/  Domestic
                if sub_url.startswith(parent_url):
                    # print("sub_title==",sub_title,"sub_url==",sub_url,"parent_url==",parent_url)

                    sub_path = "./datas/" + parent_title + "/" + sub_title

                    if not os.path.exists(sub_path):
                        os.makedirs(sub_path)

                    # The item is not complete yet; carry it along with the next request and fill in the rest once it succeeds

                    item = SinaItem()

                    item["parent_title"] = parent_title
                    item["parent_url"] = parent_url
                    item["sub_title"] = sub_title
                    item["sub_url"] = sub_url
                    item["tiezi_path"] = sub_path

                    # Issue the request for the sub-category page
                    yield scrapy.Request(sub_url,
                                         callback=self.seconde_detail,
                                         meta={"item": item})
Example #7
    def parse(self, response):
        # print("response.url====",response.url)
        # All parent titles
        parent_titles = response.xpath(
            '//h3[@class="tit02"]/a/text()').extract()
        # Links of the parent titles
        parent_urls = response.xpath('//h3[@class="tit02"]/a/@href').extract()

        # All sub-titles
        sub_titles = response.xpath(
            '//ul[@class="list01"]/li/a/text()').extract()
        # Links of the sub-titles
        sub_urls = response.xpath('//ul[@class="list01"]/li/a/@href').extract()

        # print(sub_titles)

        items = []

        for i in range(len(parent_titles)):

            # http://news.sina.com.cn/  News
            parent_url = parent_urls[i]

            parent_title = parent_titles[i]

            for j in range(len(sub_urls)):
                # http://news.sina.com.cn/world/  International
                sub_url = sub_urls[j]
                sub_title = sub_titles[j]

                # If the sub URL shares the parent URL as a prefix, it belongs to that parent category
                if sub_url.startswith(parent_url):
                    # Fill in the item
                    item = SinaItem()
                    # print("parent_url===",parent_url)
                    # print("sub_url===", sub_url)
                    # Create the directory
                    sub_file_name = "./Data/" + parent_title + "/" + sub_title
                    if not os.path.exists(sub_file_name):
                        # Create it if it does not exist
                        os.makedirs(sub_file_name)

                    item["parent_url"] = parent_url
                    item["parent_title"] = parent_title
                    item["sub_url"] = sub_url
                    item["sub_title"] = sub_title
                    item["sub_file_name"] = sub_file_name

                    items.append(item)

        # Pull the items out of the list and issue requests
        for item in items:
            sub_url = item["sub_url"]
            # meta={"item": item} passes a reference to the SinaItem object
            yield scrapy.Request(sub_url,
                                 callback=self.parse_second,
                                 meta={"item": item},
                                 dont_filter=False)
Example #8
    def parse(self, response):
        items = []

        # URLs and titles of all parent categories
        parentUrls = response.xpath(
            '//div[@id="tab01"]/div/h3/a/@href').extract()
        parentTitle = response.xpath(
            '//div[@id="tab01"]/div/h3/a/text()').extract()

        # URLs and titles of all sub-categories
        subUrls = response.xpath(
            '//div[@id="tab01"]/div/ul/li/a/@href').extract()
        subTitle = response.xpath(
            '//div[@id="tab01"]/div/ul/li/a/text()').extract()

        # Crawl all parent categories
        for i in range(0, len(parentTitle)):

            # Path and directory name for the parent category
            #parentFilename = "./Data/" + parentTitle[i]

            # Create the directory if it does not exist
            #if(not os.path.exists(parentFilename)):
            #    os.makedirs(parentFilename)

            # Crawl all sub-categories
            for j in range(0, len(subUrls)):
                item = SinaItem()

                # Save the parent category's title and URL
                item['parentTitle'] = parentTitle[i]
                item['parentUrls'] = parentUrls[i]

                # Check whether the sub-category URL starts with the parent-category URL; True if so (e.g. sports.sina.com.cn and sports.sina.com.cn/nba)
                if_belong = subUrls[j].startswith(item['parentUrls'])

                # If it belongs to this parent category, put its storage directory under the parent directory
                if (if_belong):
                    #subFilename =parentFilename + '/'+ subTitle[j]

                    # Create the directory if it does not exist
                    #if(not os.path.exists(subFilename)):
                    #    os.makedirs(subFilename)

                    # Store the sub-category URL, title, and filename fields
                    item['subUrls'] = subUrls[j]
                    item['subTitle'] = subTitle[j]
                    #item['subFilename'] = subFilename

                    items.append(item)

        # Send a Request for each sub-category URL; the Response plus its meta data is handed to the second_parse callback
        for item in items:
            yield scrapy.Request(url=item['subUrls'],
                                 meta={'meta_1': item},
                                 callback=self.second_parse)
Example #9
 def parse(self, response):
     # Convert the returned JSON data into a Python object
     js = json.loads(response.body)
     # Take the value stored under the 'result' key
     result = js['result']
     for article in result['data']:
         item = SinaItem()
         item['article_urls'] = article['url']
         request = scrapy.Request(url=item['article_urls'], meta={'meta_article': item}, callback=self.article_parse,
                                  dont_filter=True)
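         # "ChromeDriver" is assumed to be a project-specific meta flag read by a custom
         # downloader middleware that renders the page in a real browser; it is not built into Scrapy.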
         request.meta["ChromeDriver"] = True
         yield request
Example #10
    def parse(self, response):
        items = []
        # URLs and titles of all parent categories
        parentTitle = response.xpath('//h3[@class="tit02"]/a/text()').extract()
        parentUrls = response.xpath('//h3[@class="tit02"]/a/@href').extract()

        # URLs and titles of all sub-categories
        subTitle = response.xpath(
            '//ul[@class="list01"]/li/a/text()').extract()
        subUrls = response.xpath('//ul[@class="list01"]/li/a/@href').extract()
        # for i in range(0, len(parentTitle)):
        #     print  'No.%d: ' % (i + 1) + parentTitle[i] + '\t' + parentUrls[i]
        # for i in range(0, len(subTitle)):
        #         print  'No.%d: '% (i+1) + subTitle[i] + '\t' + subUrls[i]
        # Crawl all parent categories
        for i in range(0, len(parentTitle)):
            # for title in parentTitle:
            # Path and directory name for the parent category
            # parentFilename = "./Data/" + parentTitle[i]
            # print 'No.%d: '% (i+1) + parentFilename
            # # Create the directory if it does not exist
            # if (not os.path.exists(parentFilename)):
            #     os.makedirs(parentFilename)

            # Crawl all sub-categories
            # for Title,Urls in subTitle,subUrls:
            for j in range(0, len(subTitle)):
                item = SinaItem()
                # Check whether the sub-category URL starts with the parent-category URL; True if so (e.g. sports.sina.com.cn and sports.sina.com.cn/nba)
                # if_belong = subUrls[j].startswith(item['parentUrls'])

                # If it belongs to this parent category, put its storage directory under the parent directory
                # if (if_belong):
                subFilename = "./Data/" + subTitle[j]
                # print 'No.%d: '% (j+1) + subFilename
                # Create the directory if it does not exist
                if (not os.path.exists(subFilename)):
                    os.makedirs(subFilename)
                # Save the parent category's title and URL
                item['parentTitle'] = parentTitle[i]
                item['parentUrls'] = parentUrls[i]
                # Store the sub-category URL, title, and filename fields
                item['subUrls'] = subUrls[j]
                item['subTitle'] = subTitle[j]
                item['subFilename'] = subFilename

                items.append(item)

        # Send a Request for each sub-category URL; the Response plus its meta data is handed to the second_parse callback
        for item in items:
            yield scrapy.Request(url=item['subUrls'],
                                 meta={'meta_1': item},
                                 callback=self.second_parse)
Example #11
    def parse(self, response):
        items = []
        # Parent category titles and URLs
        parentUrls = response.xpath(
            '//div[@id="tab01"]/div/h3/a/@href').extract()
        parentTitles = response.xpath(
            '//div[@id="tab01"]/div/h3/a/text()').extract()

        # Sub-category titles and URLs
        subUrls = response.xpath(
            '//div[@id="tab01"]/div/ul/li/a/@href').extract()
        subTitles = response.xpath(
            '//div[@id="tab01"]/div/ul/li/a/text()').extract()

        # Crawl all parent categories
        for i in range(0, len(parentTitles)):
            # Directory path for the parent category
            parentFilename = './Data/' + parentTitles[i]

            # Create the directory if it does not exist
            if (not os.path.exists(parentFilename)):
                os.makedirs(parentFilename)

            # Crawl the sub-categories
            for j in range(0, len(subUrls)):
                item = SinaItem()
                # Save the parent category's title and URL
                print(parentTitles[i], i)
                item['parentTitle'] = parentTitles[i]
                item['parentUrls'] = parentUrls[i]

                # Check whether the sub-category URL starts with the parent-category URL; True if so
                if_belong = subUrls[j].startswith(item['parentUrls'])
                # If it belongs to this parent category, put its storage directory under the parent directory
                if if_belong:
                    subFilename = parentFilename + '/' + subTitles[j]

                    # Create the directory if it does not exist
                    if not os.path.exists(subFilename):
                        os.makedirs(subFilename)

                    # Store the sub-category URL, title, and filename fields
                    item['subUrls'] = subUrls[j]
                    item['subTitle'] = subTitles[j]
                    item['subFilename'] = subFilename

                    items.append(item)

        for item in items:
            yield scrapy.Request(url=item['subUrls'],
                                 meta={'meta_1': item},
                                 callback=self.second_parse)
Example #12
    def parse(self, response):

        soup = BeautifulSoup(response.body, 'lxml')
        # Get each top-level block in the section container
        block_list = soup.find(id='tab01').find_all('div')[:-1]

        items = []
        # t1 = time.time()
        for block in block_list:
            # Do not create item = SinaItem() here; each news entry in each sub-category needs its own object

            # Get the parent category's title and create its directory on local disk
            origin_title = block.find('h3').get_text()
            origin_link = block.find('a').get('href')
            ori_filename = './data/' + origin_title
            if not os.path.exists(ori_filename):
                os.makedirs(ori_filename)

            # Get the sub-category titles and create their directories under the parent directory
            mid_title_list = block.find_all('li')
            for mid_title in mid_title_list:
                mtitle = mid_title.get_text()
                # print mtitle
                # print '*'*30
                mlink = mid_title.find('a').get('href')

                item = SinaItem()
                # Parent category
                item['origin_title'] = origin_title
                item['origin_link'] = origin_link
                # Sub-category
                item['mid_title'] = mtitle
                item['mid_link'] = mlink

                m_filename = ori_filename + '/' + mtitle
                item['mid_filename'] = m_filename
                if not os.path.exists(m_filename):
                    os.makedirs(m_filename)

                items.append(item)
                # Send a request for each sub-category link; pass the parent/sub data through meta for the callback
                # yield scrapy.Request(url=mlink, meta={'meta1':item}, callback=self.parse_mid)

        for item in items:
            yield scrapy.Request(url=item['mid_link'],
                                 meta={'meta1': item},
                                 callback=self.parse_mid)
Example #13
 def parsecontents(self, response):
     title = response.xpath('//title/text()').extract()[0]
     meta = response.xpath('//meta/@content').extract()
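     # Indices 2, 10, and 13 assume a fixed ordering of this page's <meta content> values
     # (keywords, publish time, media name); the offsets are page-layout dependent.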
     keywords = meta[2]
     time = meta[10]
     media = meta[13]
     paragraph = response.xpath(
         '//div[@class="article"]/p/text()').extract()
     content = ""
     for p in paragraph:
         content = content + p
     item = SinaItem()
     item['title'] = str(title)
     item['keywords'] = str(keywords)
     item['time'] = str(time)
     item['media'] = str(media)
     item['content'] = str(content)
     item['tag'] = "news"
     yield item
Example #14
 def second_parse(self, response):
     meta_item = response.meta['meta_item']
     # print(type(meta_item))
     url_list = response.xpath('//a/@href').extract()
     items = []
     for i in url_list:
         parent_urls = meta_item['parent_url']
         # print('=' * 50, parent_urls)
         if i.startswith(parent_urls) and i.endswith('.shtml'):
             item = SinaItem()
             sun_url = i
             # print('=' * 50, sun_url)
             item['parent_title'] = meta_item['parent_title']
             item['parent_url'] = meta_item['parent_url']
             item['son_title'] = meta_item['son_title']
             item['son_url'] = meta_item['son_url']
             item["parent_path"] = meta_item['parent_path']
             item['grandson_url'] = sun_url
             items.append(item)
     # Third level: request each article page
     for a in items:
         sun_url = a['grandson_url']
         yield scrapy.Request(sun_url, callback=self.three_parse, meta={'meta_item1': a})
Example #15
    def parse(self, response):
        parent_title = response.xpath('//div[@id="tab01"]/div/h3/a/text()').extract()
        parent_url = response.xpath('//div[@id="tab01"]/div/h3/a/@href').extract()
        son_title = response.xpath('//div[@id="tab01"]/div/ul/li/a/text()').extract()
        son_url = response.xpath('//div[@id="tab01"]/div/ul/li/a/@href').extract()

        item = []

        for i in range(len(parent_url)):
            # Create a folder named after the parent category
            parent_titles = './Data/' + parent_title[i]
            parent_urls = parent_url[i]

            for j in range(len(son_url)):
                son_titles = son_title[j]
                son_urls = son_url[j]

                if son_urls.startswith(parent_urls):
                    items = SinaItem()
                    parent_path = parent_titles + "/" + son_titles

                    if not os.path.exists(parent_path):
                        os.makedirs(parent_path)

                    items["parent_title"] = parent_titles
                    items["parent_url"] = parent_urls
                    items["son_title"] = son_titles
                    items["son_url"] = son_urls
                    items["parent_path"] = parent_path
                    item.append(items)
                    # print(item)

        # Request the second level once all items have been collected
        for x in item:
            son_urls = x['son_url']

            yield scrapy.Request(son_urls, callback=self.second_parse, meta={"meta_item": x})
Example #16
    def parse_detail(self, response):
        import re
        meta2 = response.meta['meta2']

        contents_list = response.xpath('//p/text()').extract()
        contents = ''
        title = response.url[7:-6] if len(
            response.url[7:-6]) > 10 else meta2['lnews_title']
        # pattern = re.compile(r'.+?.cn/(.+/)')
        # if len(response.url[20:-6]) > 5:
        #     title = str(pattern.match(response.url).groups(1))
        # else:
        #     meta2['lnews_title']
        # print title
        title = title.replace('/', '-')
        # print title.encode('utf-8') + '*'*30
        # print len(contents)
        for content in contents_list:
            if content.strip():
                contents += content.strip() + '\n'

        # Yield a single item once the full article text has been assembled
        item = SinaItem()
        # Parent category
        item['origin_title'] = meta2['origin_title']
        item['origin_link'] = meta2['origin_link']
        # Sub-category
        item['mid_title'] = meta2['mid_title']
        item['mid_link'] = meta2['mid_link']
        item['mid_filename'] = meta2['mid_filename']
        # List page
        item['news_link'] = meta2['news_link']
        # Detail page
        item['news_content'] = contents
        item['news_title'] = title

        yield item