def parse_fans(self, response):
    """Crawl one page of a user's fans list.

    On page 1, schedules requests for every remaining page up front, then
    yields a RelationshipsItem (fan -> followed) for each fan found on the
    current page, and finally requests each collected fan's profile page.

    Yields:
        Request and RelationshipsItem objects for the Scrapy engine.
    """
    # If this is the first page, fetch all following pages at once.
    if response.url.endswith('page=1'):
        all_page = re.search(r'/> 1/(\d+)页</div>', response.text)
        if all_page:
            page_count = int(all_page.group(1))
            for page_num in range(2, page_count + 1):
                page_url = response.url.replace(
                    'page=1', 'page={}'.format(page_num))
                yield Request(page_url, self.parse_fans,
                              dont_filter=True, meta=response.meta)
    selector = Selector(response)
    # The "关注他/关注她/移除" anchors carry each fan's uid in their href.
    urls = selector.xpath(
        '//a[text()="关注他" or text()="关注她" or text()="移除"]/@href').extract()
    # Raw strings for regexes: '\d' in a plain string is an invalid
    # escape sequence (SyntaxWarning on Python 3.12+).
    uids = re.findall(r'uid=(\d+)', ";".join(urls))
    followed_id = re.findall(r'(\d+)/fans', response.url)[0]
    for uid in uids:
        relationships_item = RelationshipsItem()
        relationships_item['crawl_time'] = int(time.time())
        relationships_item["fan_id"] = uid
        self.start_uids.append(uid)
        relationships_item["followed_id"] = followed_id
        relationships_item["_id"] = uid + '-' + followed_id
        yield relationships_item
    # Recursively crawl each collected fan's profile page.
    # NOTE(review): self.start_uids is spider-level shared state; Scrapy
    # handles responses one at a time per callback, but interleaved
    # callbacks append/clear the same list — confirm this is intended.
    for uid in self.start_uids:
        yield Request(url="https://weibo.cn/%s/info" % uid,
                      callback=self.parse_information)
    self.start_uids.clear()
def parse_follow(self, response):
    """Crawl one page of a user's followees list.

    On page 1, schedules requests for every remaining page up front, then
    yields a RelationshipsItem (page owner -> followee) for each followee
    found on the current page.

    Yields:
        Request and RelationshipsItem objects for the Scrapy engine.
    """
    # If this is the first page, fetch all following pages at once.
    if response.url.endswith('page=1'):
        all_page = re.search(r'/> 1/(\d+)页</div>', response.text)
        if all_page:
            page_count = int(all_page.group(1))
            for page_num in range(2, page_count + 1):
                page_url = response.url.replace(
                    'page=1', 'page={}'.format(page_num))
                yield Request(page_url, self.parse_follow,
                              dont_filter=True, meta=response.meta)
    selector = Selector(response)
    # The "关注他/关注她/取消关注" anchors carry each followee's uid.
    urls = selector.xpath(
        '//a[text()="关注他" or text()="关注她" or text()="取消关注"]/@href'
    ).extract()
    # Raw strings for regexes: '\d' in a plain string is an invalid
    # escape sequence (SyntaxWarning on Python 3.12+).
    uids = re.findall(r'uid=(\d+)', ";".join(urls))
    fan_id = re.findall(r'(\d+)/follow', response.url)[0]
    for uid in uids:
        relationships_item = RelationshipsItem()
        relationships_item['crawl_time'] = int(time.time())
        # The page owner is the fan; each listed uid is being followed.
        relationships_item["fan_id"] = fan_id
        relationships_item["followed_id"] = uid
        relationships_item["_id"] = fan_id + '-' + uid
        yield relationships_item
def parse_relationship(self, response):
    """Parse one page of a followees or fans list.

    Functions:
        1. Parse followees / fans on the page.
        2. Pull out each discovered uid to request that user's profile.
        3. Yield the relationship items and follow the next-page link.

    Yields:
        RelationshipsItem and Request objects for the Scrapy engine.
    """
    # Relationship direction depends on which list this page belongs to.
    if "/follow" in response.url:
        user_id = re.findall(r'(\d+)/follow', response.url)[0]
        is_follow_list = True
    else:
        user_id = re.findall(r'(\d+)/fans', response.url)[0]
        is_follow_list = False
    urls = response.xpath(
        '//a[text()="关注他" or text()="关注她"]/@href').extract()
    uids = re.findall(r'uid=(\d+)', ";".join(urls))
    for uid in uids:
        relationshipsItem = RelationshipsItem()
        # On a follow list the page owner is the fan; on a fans list the
        # page owner is the one being followed.
        relationshipsItem["fan_id"] = user_id if is_follow_list else uid
        relationshipsItem["followed_id"] = uid if is_follow_list else user_id
        yield relationshipsItem
        yield Request(url="https://weibo.cn/%s/info" % uid,
                      callback=self.parse_information)
    # BUGFIX: the original concatenated a str with the list returned by
    # extract() ("https://weibo.cn" + [...]), which raises TypeError.
    # Extract first, then prefix the host on the first match.
    next_url = response.xpath(
        "//div[@class='pa']/form/div/a[1]/@href").extract()
    if next_url:
        yield Request(url="https://weibo.cn" + next_url[0],
                      callback=self.parse_relationship, dont_filter=True)
def parse_relationship(self, response):
    """Open a followees/fans list page and extract the user IDs on it.

    Yields a RelationshipsItem per discovered uid, requests each uid's
    profile page, and follows the "下页" (next page) link when present.

    Yields:
        RelationshipsItem and Request objects for the Scrapy engine.
    """
    selector = Selector(response)
    # Relationship direction depends on which list this page belongs to.
    if "/follow" in response.url:
        user_id = re.findall(r'(\d+)/follow', response.url)[0]
        is_follow_list = True
    else:
        user_id = re.findall(r'(\d+)/fans', response.url)[0]
        is_follow_list = False
    urls = selector.xpath(
        '//a[text()="关注他" or text()="关注她"]/@href').extract()
    # Raw strings for regexes: '\d' in a plain string is an invalid
    # escape sequence (SyntaxWarning on Python 3.12+).
    uids = re.findall(r'uid=(\d+)', ";".join(urls))
    for uid in uids:
        relationshipsItem = RelationshipsItem()
        # On a follow list the page owner is the fan; on a fans list the
        # page owner is the one being followed.
        relationshipsItem["fan_id"] = user_id if is_follow_list else uid
        relationshipsItem["followed_id"] = uid if is_follow_list else user_id
        yield relationshipsItem
        yield Request(url="https://weibo.cn/%s/info" % uid,
                      callback=self.parse_information)
    # Follow the "next page" link, if one exists.
    next_url = selector.xpath('//a[text()="下页"]/@href').extract()
    if next_url:
        yield Request(url=self.host + next_url[0],
                      callback=self.parse_relationship, dont_filter=True)
def parse_fans(self, response):
    """Crawl one page of a user's fans list and persist each relationship.

    On page 1, schedules requests for every remaining page up front, then
    yields a RelationshipsItem per fan and inserts the row into MySQL,
    silently skipping duplicates.

    Yields:
        Request and RelationshipsItem objects for the Scrapy engine.
    """
    # If this is the first page, fetch all following pages at once.
    if response.url.endswith('page=1'):
        all_page = re.search(r'/> 1/(\d+)页</div>', response.text)
        if all_page:
            page_count = int(all_page.group(1))
            for page_num in range(2, page_count + 1):
                page_url = response.url.replace(
                    'page=1', 'page={}'.format(page_num))
                yield Request(page_url, self.parse_fans,
                              dont_filter=True, meta=response.meta)
    selector = Selector(response)
    # The "关注他/关注她/移除" anchors carry each fan's uid in their href.
    urls = selector.xpath(
        '//a[text()="关注他" or text()="关注她" or text()="移除"]/@href'
    ).extract()
    # Raw strings for regexes: '\d' in a plain string is an invalid
    # escape sequence (SyntaxWarning on Python 3.12+).
    uids = re.findall(r'uid=(\d+)', ";".join(urls))
    followed_id = re.findall(r'(\d+)/fans', response.url)[0]
    for uid in uids:
        relationships_item = RelationshipsItem()
        relationships_item['crawl_time'] = int(time.time())
        relationships_item["fan_id"] = uid
        relationships_item["followed_id"] = followed_id
        relationships_item["_id"] = uid + '-' + followed_id
        try:
            # SECURITY: parameterized query instead of string-built SQL —
            # avoids SQL injection and quoting bugs on scraped values.
            sql = (
                "INSERT INTO `sbhdb`.`weibo_user_rela`"
                "(`_id`, `followed_id`, `fan_id`, `crawl_time`) "
                "VALUES (%s, %s, %s, %s)"
            )
            self.cursor.execute(sql, (
                relationships_item["_id"],
                relationships_item["followed_id"],
                relationships_item["fan_id"],
                relationships_item['crawl_time'],
            ))
            self.db.commit()
        except Exception:
            # Best-effort insert: duplicate primary keys are expected and
            # skipped. NOTE(review): narrowed from a bare `except:`;
            # consider catching the driver's IntegrityError and rolling
            # back the transaction instead.
            pass
        yield relationships_item