Example #1
    def parse_fans(self, response):
        """
        Crawl the fans list.
        """
        # If this is page 1, schedule all remaining pages in one go
        if response.url.endswith('page=1'):
            # The page footer reads "1/N页" (page 1 of N); capture N
            all_page = re.search(r'/>&nbsp;1/(\d+)页</div>', response.text)
            if all_page:
                all_page = int(all_page.group(1))
                for page_num in range(2, all_page + 1):
                    page_url = response.url.replace('page=1',
                                                    'page={}'.format(page_num))
                    yield Request(page_url, self.parse_fans,
                                  dont_filter=True, meta=response.meta)
        selector = Selector(response)
        # Links labelled "关注他"/"关注她" (follow him/her) or "移除" (remove)
        # point at the fans' profiles
        urls = selector.xpath(
            '//a[text()="关注他" or text()="关注她" or text()="移除"]/@href').extract()
        uids = re.findall(r'uid=(\d+)', ";".join(urls))
        ID = re.findall(r'(\d+)/fans', response.url)[0]
        for uid in uids:
            relationships_item = RelationshipsItem()
            relationships_item['crawl_time'] = int(time.time())
            relationships_item["fan_id"] = uid
            self.start_uids.append(uid)
            relationships_item["followed_id"] = ID
            relationships_item["_id"] = uid + '-' + ID
            yield relationships_item
        # TODO: recursively crawl each fan's own fans list
        for uid in self.start_uids:
            yield Request(url="https://weibo.cn/%s/info" % uid,
                          callback=self.parse_information)
        self.start_uids.clear()
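
All five snippets assume the same surrounding context: re and time imported, Request and Selector from Scrapy, and a RelationshipsItem declaring the fields used above. A minimal sketch of that context, inferred from the snippets (the real project's definitions may differ):

    import re
    import time

    import scrapy
    from scrapy import Request, Selector


    class RelationshipsItem(scrapy.Item):
        # Hypothetical reconstruction; fields inferred from the snippets above
        _id = scrapy.Field()
        fan_id = scrapy.Field()
        followed_id = scrapy.Field()
        crawl_time = scrapy.Field()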
Example #2
    def parse_follow(self, response):
        """
        Crawl the followees list.
        """
        # If this is page 1, schedule all remaining pages in one go
        if response.url.endswith('page=1'):
            # The page footer reads "1/N页" (page 1 of N); capture N
            all_page = re.search(r'/>&nbsp;1/(\d+)页</div>', response.text)
            if all_page:
                all_page = int(all_page.group(1))
                for page_num in range(2, all_page + 1):
                    page_url = response.url.replace('page=1',
                                                    'page={}'.format(page_num))
                    yield Request(page_url,
                                  self.parse_follow,
                                  dont_filter=True,
                                  meta=response.meta)
        selector = Selector(response)
        # Links labelled "关注他"/"关注她" (follow him/her) or
        # "取消关注" (unfollow) point at the followees' profiles
        urls = selector.xpath(
            '//a[text()="关注他" or text()="关注她" or text()="取消关注"]/@href'
        ).extract()
        uids = re.findall(r'uid=(\d+)', ";".join(urls))
        ID = re.findall(r'(\d+)/follow', response.url)[0]
        for uid in uids:
            relationships_item = RelationshipsItem()
            relationships_item['crawl_time'] = int(time.time())
            relationships_item["fan_id"] = ID
            relationships_item["followed_id"] = uid
            relationships_item["_id"] = ID + '-' + uid
            yield relationships_item
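
Both callbacks expect to be entered on a page-1 URL of the form https://weibo.cn/<uid>/fans?page=1 or .../follow?page=1; that is what the endswith('page=1') check and the (\d+)/fans and (\d+)/follow regexes rely on. One plausible way to seed them, with a placeholder uid list (login/cookie handling omitted):

    def start_requests(self):
        for uid in ['1234567890']:  # placeholder seed uid, not from the source
            yield Request('https://weibo.cn/%s/fans?page=1' % uid,
                          callback=self.parse_fans)
            yield Request('https://weibo.cn/%s/follow?page=1' % uid,
                          callback=self.parse_follow)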
Example #3
    def parse_relationship(self, response):
        """
        Functions:
            1. parse followees and fans
            2. pull out each followee's id to request a new user
            3. parse the relationship
        """
        if "/follow" in response.url:
            ID = re.findall(r'(\d+)/follow', response.url)[0]
            flag = True
        else:
            ID = re.findall(r'(\d+)/fans', response.url)[0]
            flag = False
        # Links labelled "关注他"/"关注她" (follow him/her) point at the profiles
        urls = response.xpath(
            '//a[text()="关注他" or text()="关注她"]/@href').extract()
        uids = re.findall(r'uid=(\d+)', ";".join(urls))
        for uid in uids:
            relationshipsItem = RelationshipsItem()
            relationshipsItem["fan_id"] = ID if flag else uid
            relationshipsItem["followed_id"] = uid if flag else ID
            yield relationshipsItem
            yield Request(url="https://weibo.cn/%s/info" % uid,
                          callback=self.parse_information)

        # Follow the pagination link if present; extract() returns a list,
        # so join the host onto the first match rather than onto the list
        next_url = response.xpath(
            "//div[@class='pa']/form/div/a[1]/@href").extract()
        if next_url:
            yield Request(url="https://weibo.cn" + next_url[0],
                          callback=self.parse_relationship,
                          dont_filter=True)
Example #4
    def parse_relationship(self, response):
        """ Open the url and extract the user IDs inside. """
        selector = Selector(response)
        if "/follow" in response.url:
            ID = re.findall(r'(\d+)/follow', response.url)[0]
            flag = True
        else:
            ID = re.findall(r'(\d+)/fans', response.url)[0]
            flag = False
        urls = selector.xpath(
            '//a[text()="关注他" or text()="关注她"]/@href').extract()
        uids = re.findall(r'uid=(\d+)', ";".join(urls))
        for uid in uids:
            relationshipsItem = RelationshipsItem()
            relationshipsItem["fan_id"] = ID if flag else uid
            relationshipsItem["followed_id"] = uid if flag else ID
            yield relationshipsItem
            yield Request(url="https://weibo.cn/%s/info" % uid,
                          callback=self.parse_information)

        # Follow the "下页" (next page) link if present
        next_url = selector.xpath('//a[text()="下页"]/@href').extract()
        if next_url:
            yield Request(url=self.host + next_url[0],
                          callback=self.parse_relationship,
                          dont_filter=True)
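
Unlike Examples #1, #2, and #5, which compute every page number up front from the "1/N页" footer, Examples #3 and #4 paginate one page at a time by following the next-page link. Example #4 also assumes a self.host attribute on the spider, presumably "https://weibo.cn" as hard-coded in Example #3.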
Example #5
    def parse_fans(self, response):
        """
        Crawl the fans list.
        """
        # If this is page 1, schedule all remaining pages in one go
        if response.url.endswith('page=1'):
            all_page = re.search(r'/>&nbsp;1/(\d+)页</div>', response.text)
            if all_page:
                all_page = int(all_page.group(1))
                for page_num in range(2, all_page + 1):
                    page_url = response.url.replace('page=1',
                                                    'page={}'.format(page_num))
                    yield Request(page_url,
                                  self.parse_fans,
                                  dont_filter=True,
                                  meta=response.meta)
        selector = Selector(response)
        urls = selector.xpath(
            '//a[text()="关注他" or text()="关注她" or text()="移除"]/@href'
        ).extract()
        uids = re.findall(r'uid=(\d+)', ";".join(urls))
        ID = re.findall(r'(\d+)/fans', response.url)[0]
        for uid in uids:
            relationships_item = RelationshipsItem()
            relationships_item['crawl_time'] = int(time.time())
            relationships_item["fan_id"] = uid
            relationships_item["followed_id"] = ID
            relationships_item["_id"] = uid + '-' + ID
            try:
                # Parameterised query so the values are escaped safely
                sql = ("INSERT INTO `sbhdb`.`weibo_user_rela`"
                       "(`_id`, `followed_id`, `fan_id`, `crawl_time`) "
                       "VALUES (%s, %s, %s, %s)")
                self.cursor.execute(sql, (relationships_item["_id"],
                                          relationships_item["followed_id"],
                                          relationships_item["fan_id"],
                                          relationships_item['crawl_time']))
                self.db.commit()
            except Exception:
                # duplicate data (same _id already stored)
                pass

            yield relationships_item
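
Example #5 writes each relationship straight to MySQL and therefore assumes self.db and self.cursor exist on the spider. A minimal sketch of that setup, assuming pymysql and placeholder credentials (the original code does not show the connection):

    import pymysql
    import scrapy


    class WeiboFansSpider(scrapy.Spider):  # hypothetical class name
        name = 'weibo_fans'

        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            # Placeholder connection details, not from the original code
            self.db = pymysql.connect(host='localhost', user='root',
                                      password='secret', database='sbhdb',
                                      charset='utf8mb4')
            self.cursor = self.db.cursor()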