コード例 #1
0
    def parse(self, response):
        """Schedule every article on a listing page, then follow the pager.

        Yields one ``Request`` per article link (handled by
        ``self.parse_detail``) and, when the page carries a "next" link, one
        more ``Request`` that feeds the following listing page back into this
        method.

        :param response: listing-page response exposing a ``css()`` selector.
        """
        # Links to all articles shown on this listing page.
        article_link_list = response.css(
            '#archive .floated-thumb a.archive-title::attr(href)').extract()

        # The last listing page has no "next" link. Indexing ``extract()[0]``
        # raised IndexError there, so none of that page's articles were ever
        # scheduled; fall back to None instead.
        next_page_candidates = response.css(
            '.next.page-numbers::attr(href)').extract()
        next_page_link = next_page_candidates[0] if next_page_candidates else None

        meta = {
            # Second-to-last character of the next-page URL; presumably the
            # page number sitting before a trailing slash (TODO confirm).
            # Empty string when there is no next page.
            'page_num': next_page_link[-2:-1] if next_page_link else '',
        }
        for url in article_link_list:
            print('url = ', url)
            yield Request(url=url, callback=self.parse_detail, meta=meta)

        # Pagination.
        # NOTE(review): the previous body kept a local counter ``i`` that was
        # reset to 0 on every call and compared with 4 after one increment, so
        # the intended "only the first few pages" limit never took effect.
        # That dead code is removed; every "next" link is still followed. A
        # real limit would need state that survives across calls.
        if next_page_link:
            yield Request(url=next_page_link, callback=self.parse)
コード例 #2
0
 def parse_detial(self, response):
     """Read the post owner from a post's JSON and request their profile.

     The response body is decoded as JSON and the owner's username is taken
     from ``graphql -> shortcode_media -> owner -> username``. One
     ``Request`` for that user's profile JSON is yielded, with the username
     forwarded in ``meta`` and ``self.parse_person`` as the callback.

     :param response: response whose ``text`` is the post's JSON document.
     """
     payload = json.loads(response.text)
     post_owner = (payload.get('graphql')
                   .get('shortcode_media')
                   .get('owner'))
     username = post_owner.get('username')

     # Build the URL of the owner's profile page.
     profile_url_template = 'https://www.instagram.com/{}/?__a=1'
     yield Request(
         url=profile_url_template.format(username),
         meta={'username': username},
         callback=self.parse_person,
     )
コード例 #3
0
    def parse(self, response):
        """Turn each post row of a listing page into an item, then paginate.

        For every ``div.row.post`` node the title, URL, abstract, author,
        release time, content and thumbnail URLs are collected into a dict
        that is wrapped in ``IbugItem`` and yielded. Afterwards the link found
        under ``div.navigation > div.nav-previous`` is requested, if present,
        with this method as the callback.

        :param response: listing-page response to run XPath queries against.
        """
        page = Selector(response=response)

        # One item per news entry on the page.
        for post in page.xpath('//div[@class="row post"]'):
            text_column = post.xpath('.//div[@class="col-md-7 col-sm-6"]')[0]

            title = text_column.xpath('./h4/a/text()').extract_first()
            print(title)
            link = text_column.xpath('./h4/a/@href').extract_first()

            fields = {
                'i_title': title,
                'i_type': "威胁情报",
                'i_sourcesite': "安全牛",
                'i_url': link,
            }
            # The uuid is derived from the URL, so it is only set when the
            # entry actually has one.
            if link:
                fields['i_uuid'] = MD5.get_md5(link)

            fields['i_abstract'] = text_column.xpath(
                './p/text()').extract_first()

            # Author name and profile link are stored together as JSON text.
            author_link = './/span[@class="author"]/a'
            del author_link  # placeholder removed below; queries stay literal
            author_info = {
                'author': text_column.xpath(
                    './/span[@class="author"]/a/text()').extract_first(),
                'authorurl': text_column.xpath(
                    './/span[@class="author"]/a/@href').extract_first(),
            }
            fields['i_author'] = json.dumps(author_info, ensure_ascii=False)

            # The date text looks like "星期三, 四月 18, 2018".
            raw_date = text_column.xpath(
                './/span[@class="date"]/text()').extract_first()
            fields['i_releasetime'] = self.get_info_releasetime(raw_date)
            fields['i_content'] = self.get_info_content(fields['i_url'])

            fields['i_imagesurls'] = post.xpath(
                './/div[@class="thumb"]/a/img/@src').extract()

            yield IbugItem(fields)

        # Follow the pager to the next listing page, if there is one.
        nextpage = page.xpath(
            '//div[@class="navigation"]/div[@class="nav-previous"]/a/@href'
        ).extract_first()
        if not nextpage:
            return
        print('--------------------------------------访问下一页:' + nextpage)
        yield Request(nextpage, callback=self.parse, dont_filter=True)
コード例 #4
0
 def get_response_object(self, url):
     """Build an ``HtmlResponse`` from a file on the local disk.

     ``FILE_SYSTEM_PREFIX`` is removed from ``url`` to obtain the file path;
     the file's raw bytes become the response body.

     :param url: URL that contains ``FILE_SYSTEM_PREFIX`` followed by a
         local file path.
     :returns: an ``HtmlResponse`` for ``url`` with status 200, the headers
         from ``self.generate_response_headers()``, the file bytes as body
         and UTF-8 encoding.
     :raises OSError: if the file cannot be opened or read.
     """
     path_to_file = url.replace(FILE_SYSTEM_PREFIX, '')
     # The context manager closes the handle even when read() raises; the
     # previous explicit close() was skipped in that case.
     with open(path_to_file, 'rb') as source_file:
         body = source_file.read()
     # Positional order kept as in the original call; presumably
     # (url, status, headers, body, flags, request) - TODO confirm against
     # the HtmlResponse in use.
     return HtmlResponse(url,
                         200,
                         self.generate_response_headers(),
                         body,
                         None,
                         Request(url),
                         encoding='utf-8')
コード例 #5
0
 def parse(self, response):
     """Request the detail JSON of the first post in the discover feed.

     The response body is decoded as JSON and the edge list is read from
     ``data -> user -> edge_web_discover_media -> edges``. Only the first
     edge is used: one ``Request`` for its post is yielded with
     ``self.parse_detial`` as the callback, then the generator ends.

     :param response: response whose ``text`` is the discover-feed JSON.
     """
     payload = json.loads(response.text)
     discover_media = (payload.get('data')
                       .get('user')
                       .get('edge_web_discover_media'))
     post_url_template = 'https://www.instagram.com/p/{}/?__a=1'

     for edge in discover_media.get('edges'):
         shortcode = edge.get('node').get('shortcode')
         # Second-level page, used to find the link to the owner's profile.
         yield Request(url=post_url_template.format(shortcode),
                       callback=self.parse_detial)
         # Stop after the first edge; the remaining edges are not followed.
         return
コード例 #6
0
 def parse_fans(self, response):
     """Request the profile page of every follower listed in the response.

     The response body is decoded as JSON and the follower edges are read
     from ``data -> user -> edge_followed_by -> edges``. For each follower a
     ``Request`` for their profile JSON is yielded with
     ``self.parse_person`` as the callback.

     :param response: response whose ``text`` is the follower-list JSON.
     :raises KeyError: if an expected key is missing from the payload.
     """
     # Use a separate name for the decoded document instead of rebinding
     # ``response``, so the original response object stays reachable.
     payload = json.loads(response.text)
     data = payload.get('data')
     edges = data['user']['edge_followed_by']['edges']

     # One profile request per follower.
     for edge in edges:
         username = edge['node']['username']
         personal_url = 'https://www.instagram.com/{}/?__a=1'.format(
             username)
         # parse_person reads the username from response.meta; forward it
         # here, as parse_detial does, so it is not lost on this path.
         yield Request(url=personal_url,
                       meta={'username': username},
                       callback=self.parse_person)
コード例 #7
0
    def parse_person(self, response):
        """Request the detail JSON of every post on a profile's timeline.

        The username is taken from ``response.meta`` and the response body is
        decoded as JSON. The post edges are read from
        ``graphql -> user -> edge_owner_to_timeline_media -> edges``; one
        ``Request`` per post is yielded, forwarding the username in ``meta``
        with ``self.prase_post`` as the callback.

        :param response: response whose ``text`` is the profile JSON.
        """
        username = response.meta.get('username')
        profile = json.loads(response.text).get('graphql').get('user')

        # NOTE: a follower-list crawl (graphql query keyed by the user's id,
        # handled by parse_fans) used to be sketched here and is disabled.
        # If it is revived, the query URL contains literal JSON braces, so it
        # cannot be filled in with a plain str.format() call.

        # Build the request for each post on the timeline.
        timeline = profile.get('edge_owner_to_timeline_media')
        post_url_template = 'https://www.instagram.com/p/{}/?__a=1'
        for edge in timeline.get('edges'):
            shortcode = edge.get('node').get('shortcode')
            yield Request(
                url=post_url_template.format(shortcode),
                meta={'username': username},
                callback=self.prase_post,
            )