import os
import random
import sys
from datetime import date, datetime, timedelta
from time import sleep

import pandas as pd
from tqdm import tqdm

# Downloader, Parser, Printer, Writer, Validator, write_log and get_filepath
# are project-local helpers; import them from this repo's own modules.


def saveProfile(self, info, uid):
    """Save a user's profile to the workspace.

    Args:
        info (dict): User info as returned by the Weibo API.
        uid (str): User id.

    Returns:
        list: Paths of the downloaded avatar and cover images.
    """
    # Build the workspace directory layout for this user.
    user_dir = os.path.join(self.cache_dir, uid)
    workspace_dir = os.path.join(user_dir, 'workspace')

    # Create a folder for the profile and its images.
    profile_dir = os.path.join(workspace_dir, 'profile')
    img_folder = os.path.join(profile_dir, 'img')
    os.makedirs(img_folder, exist_ok=True)

    avatar_url = info['avatar_hd']
    cover_url = info['cover_image_phone']

    # Download the avatar and cover images.
    dl = Downloader()
    img_path = dl.download_files([avatar_url, cover_url], img_folder)

    # Save the profile fields we care about as a one-row CSV.
    fields = [
        'id', 'screen_name', 'description', 'gender', 'verified',
        'verified_type', 'close_blue_v', 'followers_count',
        'follow_count', 'cover_image_phone', 'avatar_hd',
    ]
    profile_dict = {key: info[key] for key in fields}
    profile_df = pd.DataFrame([profile_dict])
    profile_df.to_csv(os.path.join(profile_dir, 'profile.csv'))

    return img_path
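# A minimal usage sketch for saveProfile (hypothetical values). The `info`
# dict mirrors the fields of the Weibo user API response that saveProfile
# reads, and the owning object is assumed to define `self.cache_dir`:
#
#   info = {
#       'id': 123456, 'screen_name': 'example', 'description': '',
#       'gender': 'm', 'verified': False, 'verified_type': -1,
#       'close_blue_v': False, 'followers_count': 0, 'follow_count': 0,
#       'cover_image_phone': 'https://example.com/cover.jpg',
#       'avatar_hd': 'https://example.com/avatar.jpg',
#   }
#   images = obj.saveProfile(info, uid='123456')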
class Spider(object):
    def __init__(self, config):
        """Initialize the Weibo spider from a config dict."""
        self.config = config
        # Convert the cookie from a "k1=v1; k2=v2" string to a dict.
        if isinstance(self.config['cookie'], str):
            self.config['cookie'] = {
                t.strip().split("=")[0]: t.strip().split("=")[1]
                for t in self.config['cookie'].split(";")
            }
        # If user_id_list is a file path, read the ids from that file.
        if isinstance(self.config['user_id_list'], str):
            user_id_list = self.config['user_id_list']
            if not os.path.isabs(user_id_list):
                user_id_list = os.path.split(
                    os.path.realpath(__file__))[0] + os.sep + user_id_list
            self.config['user_id_list'] = user_id_list
            with open(self.config['user_id_list'], 'rb') as f:
                lines = f.read().splitlines()
            lines = [line.decode('utf-8') for line in lines]
            self.config['user_id_list'] = [
                line.split(' ')[0] for line in lines
                if len(line.split(' ')) > 0 and line.split(' ')[0].isdigit()
            ]
        # An integer since_date means "this many days before today".
        if isinstance(self.config['since_date'], int):
            self.config['since_date'] = str(
                date.today() - timedelta(self.config['since_date']))
        self.validator = Validator(self.config)
        self.validator.validate()
        self.printer = Printer()
        self.writer = Writer(self.config)
        self.downloader = Downloader(self.config)
        self.parser = Parser(self.config)

    def get_nickname(self):
        """Get the user's nickname."""
        url = 'https://weibo.cn/%s/info' % (self.user['id'])
        selector = self.parser.deal_html(url, self.config['cookie'])
        nickname = selector.xpath('//title/text()')[0]
        nickname = nickname[:-3]
        # These title prefixes mean we were redirected to the login page.
        if nickname == u'登录 - 新' or nickname == u'新浪':
            write_log(self.config['since_date'])
            sys.exit(u'Cookie is invalid or expired; please obtain a new one '
                     u'as described in the README')
        self.user['nickname'] = nickname

    def get_user_info(self, selector):
        """Get the user's nickname, weibo count, following and follower counts."""
        self.get_nickname()
        user_info = selector.xpath("//div[@class='tip2']/*/text()")
        self.user['weibo_num'] = int(user_info[0][3:-1])
        self.user['following'] = int(user_info[1][3:-1])
        self.user['followers'] = int(user_info[2][3:-1])
        self.printer.print_user_info(self.user)
        self.writer.write_user(self.user)
        print('*' * 100)

    def get_one_page(self, page):
        """Fetch all weibo on the given page; return True once past since_date."""
        url = 'https://weibo.cn/u/%s?page=%d' % (self.user['id'], page)
        selector = self.parser.deal_html(url, self.config['cookie'])
        info = selector.xpath("//div[@class='c']")
        is_exist = info[0].xpath("div/span[@class='ctt']")
        if is_exist:
            for i in range(0, len(info) - 2):
                weibo = self.parser.get_one_weibo(info[i])
                if weibo:
                    if weibo['id'] in self.weibo_id_list:
                        continue
                    publish_time = datetime.strptime(
                        weibo['publish_time'][:10], "%Y-%m-%d")
                    since_date = datetime.strptime(self.config['since_date'],
                                                   "%Y-%m-%d")
                    if publish_time < since_date:
                        # A pinned weibo can be older than since_date; skip it
                        # instead of ending the crawl.
                        if self.parser.is_pinned_weibo(info[i]):
                            continue
                        else:
                            return True
                    self.printer.print_one_weibo(weibo)
                    self.weibo.append(weibo)
                    self.weibo_id_list.append(weibo['id'])
                    self.got_num += 1
                    print('-' * 100)
                    self.writer.write_weibo([weibo])

    def get_weibo_info(self):
        """Crawl the user's weibo."""
        url = 'https://weibo.cn/u/%s' % (self.user['id'])
        selector = self.parser.deal_html(url, self.config['cookie'])
        self.get_user_info(selector)  # nickname, weibo/following/follower counts
        page_num = self.parser.get_page_num(selector)  # total number of pages
        page1 = 0
        random_pages = random.randint(1, 5)
        for page in tqdm(range(1, page_num + 1), desc='Progress'):
            is_end = self.get_one_page(page)  # fetch all weibo on this page
            if is_end:
                break
            # Random waits make the crawler look more human and lower the risk
            # of being rate-limited (limits lift automatically after a while).
            # By default we sleep 6-10 seconds after every 1-5 pages; if you
            # still get limited, increase the sleep time.
            if page - page1 == random_pages and page < page_num:
                sleep(random.randint(6, 10))
                page1 = page
                random_pages = random.randint(1, 5)
        if not self.config['filter']:
            print(u'Fetched %d weibo in total' % self.got_num)
        else:
            print(u'Fetched %d original weibo in total' % self.got_num)
    def initialize_info(self, user_id):
        """Reset per-user crawl state."""
        self.got_num = 0  # number of weibo fetched so far
        self.weibo = []  # all fetched weibo
        self.user = {'id': user_id}  # info about the current user
        self.weibo_id_list = []  # ids of all fetched weibo

    def start(self):
        """Run the spider."""
        for user_id in self.config['user_id_list']:
            self.initialize_info(user_id)
            print('*' * 100)
            self.get_weibo_info()
            print(u'Information crawling finished')
            print('*' * 100)
            if self.config['pic_download'] == 1:
                file_path = get_filepath('img', self.user['nickname'])
                self.downloader.download_files(file_path, 'img', self.weibo)
            if self.config['video_download'] == 1:
                file_path = get_filepath('video', self.user['nickname'])
                self.downloader.download_files(file_path, 'video', self.weibo)
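
# A hedged usage sketch for Spider (illustrative values only). The keys below
# are the ones __init__, get_weibo_info and start actually read; Validator may
# require additional keys depending on this repo's version.
if __name__ == '__main__':
    config = {
        'cookie': 'SUB=xxx; SUBP=yyy',   # raw cookie string; converted to a dict in __init__
        'user_id_list': ['1669879400'],  # a list of ids, or a path to a text file of ids
        'since_date': '2018-01-01',      # 'YYYY-MM-DD', or an int meaning days before today
        'filter': 0,                     # 0: crawl all weibo; truthy: original weibo only
        'pic_download': 0,               # 1 to download images after crawling
        'video_download': 0,             # 1 to download videos after crawling
    }
    Spider(config).start()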