def updateUserInfo(self, user_id, dict):
    """Write a crawled user's profile fields to the ``user`` table.

    The row whose ``user_id`` matches is updated and flagged ``is_catch=1``.
    Any exception is logged through ``loge`` and swallowed; nothing is
    returned.

    Args:
        user_id: Value matched against ``user.user_id``.
        dict: Mapping with the profile fields. Keys read: ``name``,
            ``image``, ``sex``, ``sign``, ``location``, ``major``, ``job``,
            ``education``, ``info`` and ``url``. Missing keys are stored as
            an empty string. A falsy mapping makes the call a no-op.
            (The parameter name shadows the builtin; it is kept so the
            signature stays unchanged for callers.)
    """
    if not dict:
        return
    # Initialised up front so the ``finally`` clause can run safely even
    # when opening the connection or creating the cursor fails.
    cursor = None
    try:
        self.openMySql()
        # NOTE(review): filterString was applied to work around special
        # characters in the hand-built SQL. If all it does is SQL escaping,
        # it is redundant now that the query is parameterised -- confirm.
        info = self.filterString(dict.get('info', ''))
        # Values are passed as query parameters instead of being formatted
        # into the statement, so quotes in scraped text cannot break or
        # inject SQL. Assumes the driver uses the ``%s`` paramstyle, as the
        # common MySQL drivers do -- TODO confirm for self.conn.
        sql = ("update user u set u.name=%s,u.image=%s,u.sex=%s,u.sign=%s,"
               "u.location=%s,u.major=%s,u.job=%s,u.education=%s,"
               "u.info=%s,url_following=%s,u.is_catch=1 "
               "where u.user_id=%s")
        params = (
            dict.get('name', ''),
            dict.get('image', ''),
            dict.get('sex', ''),
            dict.get('sign', ''),
            dict.get('location', ''),
            dict.get('major', ''),
            dict.get('job', ''),
            dict.get('education', ''),
            info,
            dict.get('url', ''),
            user_id,
        )
        cursor = self.conn.cursor()
        cursor.execute(sql, params)
        self.conn.commit()
        log('更新用户信息到数据库成功,user_id=%s' % user_id)
    except Exception as e:
        loge(e)
    finally:
        if cursor is not None:
            cursor.close()
        self.closeMySql()
def saveAchieveInfo(self, user_id, achieveDict):
    """Insert one row of achievement statistics for a user into ``achieve``.

    Any exception is logged through ``loge`` and swallowed; nothing is
    returned.

    Args:
        user_id: Value stored in ``achieve.user_id``.
        achieveDict: Mapping with the achievement fields. Keys read:
            ``record_num``, ``record_by``, ``applaud_num``,
            ``gratitude_num``, ``collect_num``, ``public_edit_num``,
            ``is_excellent_answer`` and ``excellent_topic``. Missing or
            empty counters are stored as 0, missing texts as ''.
    """
    # Initialised up front so the ``finally`` clause can run safely even
    # when opening the connection or creating the cursor fails.
    cursor = None
    try:
        self.openMySql()
        # Counters may arrive as digit strings, and possibly as '' when no
        # digits were found; ``or 0`` maps empty/missing values to 0 before
        # the int conversion.
        params = (
            user_id,
            int(achieveDict.get('record_num') or 0),
            achieveDict.get('record_by', ''),
            int(achieveDict.get('applaud_num') or 0),
            int(achieveDict.get('gratitude_num') or 0),
            int(achieveDict.get('collect_num') or 0),
            int(achieveDict.get('public_edit_num') or 0),
            1 if achieveDict.get('is_excellent_answer', False) else 0,
            achieveDict.get('excellent_topic', ''),
        )
        # Parameterised so scraped text cannot break or inject SQL. Assumes
        # the driver uses the ``%s`` paramstyle, as the common MySQL drivers
        # do -- TODO confirm for self.conn.
        sql = ("insert into achieve(user_id,record_num,record_by,"
               "applaud_num,gratitude_num,collect_num,public_edit_num,"
               "is_excellent_answer,excellent_topic) "
               "values(%s,%s,%s,%s,%s,%s,%s,%s,%s)")
        cursor = self.conn.cursor()
        cursor.execute(sql, params)
        self.conn.commit()
        log('保存用户个人成就到数据库成功,user_id = {0}'.format(user_id))
    except Exception as e:
        loge(e)
    finally:
        if cursor is not None:
            cursor.close()
        self.closeMySql()
def start(self):
    """Run the three crawler threads and block until all of them finish.

    One thread is created for each of ``catchUserInfoThread``,
    ``catchUserFollowingThread`` and ``exitThread``; they are started in
    that order and then joined in that order.
    """
    targets = (
        self.catchUserInfoThread,
        self.catchUserFollowingThread,
        self.exitThread,
    )
    workers = [threading.Thread(target=target) for target in targets]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
    log("全部程序运行完毕")
def getFirstUserToFollowing(self):
    """Return the first user whose followings have not been crawled yet.

    Queries ``user`` for rows with ``is_following=0`` ordered by ``id`` and
    takes the second column of the first row.

    Returns:
        The second column of the first matching row, or ``None`` when no
        row matches or the query fails (failures are logged via ``loge``).
    """
    result = None
    # Initialised up front so the ``finally`` clause can run safely even
    # when opening the connection or creating the cursor fails.
    cursor = None
    try:
        self.openMySql()
        sql = 'select * from user where is_following=0 order by id'
        cursor = self.conn.cursor()
        cursor.execute(sql)
        res = cursor.fetchone()
        # fetchone() yields None when nothing is left to crawl; that is a
        # normal outcome, not an error, so do not index into it.
        if res:
            result = res[1]
    except Exception as e:
        loge(e)
    finally:
        if cursor is not None:
            cursor.close()
        self.closeMySql()
    log('获取第一个没有爬取关注者的用户, user_id={0}'.format(result))
    return result
def getFirstUserToCatch(self):
    """Return the first user whose profile has not been crawled yet.

    Queries ``user`` for rows with ``is_catch=0`` ordered by ``id`` and
    takes the second column of the first row.

    Returns:
        The second column of the first matching row, or ``None`` when no
        row matches or the query fails (failures are logged via ``loge``).
    """
    result = None
    # Initialised up front so the ``finally`` clause can run safely even
    # when opening the connection or creating the cursor fails; otherwise
    # the cleanup itself would raise and closeMySql() would be skipped.
    cursor = None
    try:
        self.openMySql()
        sql = "select * from user where is_catch=0 order by id"
        cursor = self.conn.cursor()
        cursor.execute(sql)
        res = cursor.fetchone()
        if res:
            result = res[1]
    except Exception as e:
        loge(e)
    finally:
        if cursor is not None:
            cursor.close()
        self.closeMySql()
    log('获取第一个没有被抓取的用户,user_id=%s' % result)
    return result
def start(self):
    """Launch every crawler thread and wait for all of them to finish.

    The thread list holds one following-crawler thread, one exit-watcher
    thread and ``self.thread_num`` user-info threads. The user-info threads
    all receive the same ``threading.Lock`` instance as their argument.
    """
    shared_lock = threading.Lock()

    # Build the thread list: following crawler, exit watcher, then workers.
    workers = [
        threading.Thread(target=self.catchUserFollowingThread),
        threading.Thread(target=self.exitThread),
    ]
    workers.extend(
        threading.Thread(target=self.catchUserInfoThread,
                         args=(shared_lock, ))
        for _ in range(0, self.thread_num))

    # Start everything first, then wait for each thread in turn.
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
    log('所有程序运行完毕')
def getUserInfo(self, userId):
    """Load a user's profile page in PhantomJS and parse it into a dict.

    Up to three attempts are made. Each attempt creates a fresh driver,
    which is quit afterwards, and is followed by a sleep of
    ``self.time_duration`` seconds whether it succeeded or not.

    Args:
        userId: Identifier substituted into ``base_url``.

    Returns:
        A dict with ``user_id``, ``url`` and ``code``. ``code`` starts as
        ``self.code_success``, becomes ``self.code_user_not_exist`` when the
        not-found text is on the page, and ``self.code_failure`` when all
        three attempts raised. On success the results of ``parseUserInfo``
        and ``parseAchieve`` are merged in (they may overwrite keys).
    """
    profile_url = base_url.format(userId)
    info = {
        'user_id': userId,
        'url': profile_url,
        'code': self.code_success,
    }

    for attempt in range(1, 4):
        browser = None
        try:
            browser = webdriver.PhantomJS(
                executable_path=spider_const.phantomjs_path,
                desired_capabilities=spider_const.desired_cap)
            browser.implicitly_wait(self.time_wait)
            browser.get(profile_url)

            # The 404 page is recognised by its text.
            if '你似乎来到了没有知识存在的荒原...' in browser.page_source:
                info['code'] = self.code_user_not_exist
            else:
                expand_button = browser.find_element_by_class_name(
                    'ProfileHeader-expandButton')
                expand_button.send_keys(Keys.ENTER)
                # page_source is read again for each parser, after the
                # expand button was triggered.
                parsed_profile = self.parseUserInfo(browser.page_source)
                log('开始抓取用户个人成就,user_id = {0}'.format(userId))
                parsed_achieve = self.parseAchieve(browser.page_source)
                info.update(parsed_profile)
                info.update(parsed_achieve)
            break
        except Exception as e:
            loge(e)
            log('发生异常,尝试第{0}次重试, user_id={1}'.format(attempt, userId))
        finally:
            if browser:
                browser.quit()
            log('进入{0}秒休眠'.format(self.time_duration))
            time.sleep(self.time_duration)
            log('{0}秒休眠结束'.format(self.time_duration))
    else:
        # Reached only when none of the three attempts hit ``break``.
        info['code'] = self.code_failure

    return info
def saveFollowerInfo(self, user_id, follower_list):
    """Store follow relations and register the listed users for crawling.

    For every entry in ``follower_list`` a ``follow(user_id, follower_id)``
    row is inserted, then every entry is inserted into ``user`` as a new
    ``user_id``. Each insert is committed on its own, so one failing row
    does not affect the others.

    Args:
        user_id: Value stored in ``follow.user_id``.
        follower_list: Iterable of ids stored in ``follow.follower_id`` and
            ``user.user_id``. The call is a no-op when either argument is
            falsy.
    """
    if not user_id or not follower_list:
        return
    self.openMySql()
    cursor = None
    try:
        cursor = self.conn.cursor()
        # Ids come from scraped pages, so they are passed as parameters
        # rather than formatted into the SQL text. Assumes the driver uses
        # the ``%s`` paramstyle, as the common MySQL drivers do -- TODO
        # confirm for self.conn.
        follow_sql = "insert into follow(user_id,follower_id) values(%s,%s)"
        for item in follower_list:
            try:
                cursor.execute(follow_sql, (user_id, item))
                self.conn.commit()
            except Exception as e:
                log('保存用户关注信息,插入follow表发生异常,user_id = {0}'.format(user_id))
                loge(e)
        user_sql = "insert into user(user_id) values(%s)"
        for item in follower_list:
            try:
                cursor.execute(user_sql, (item, ))
                self.conn.commit()
            except Exception:
                # Deliberately best-effort and silent, as before.
                # NOTE(review): presumably these are mostly duplicate-key
                # errors for users already in the table -- confirm, and
                # narrow the exception type if so.
                pass
    finally:
        # Always release the cursor and the connection, even if something
        # unexpected escapes the per-row handlers above.
        if cursor is not None:
            cursor.close()
        self.closeMySql()
def exitThread(self):
    """Poll for the exit-control file and set ``self.isExit`` once it exists.

    The path is ``spider_const.control_exit_file``. While the file is
    absent the thread sleeps ``spider_const.control_exit_duration * 60``
    seconds between checks. Both settings are re-read on every pass.
    """
    log('检测是否退出的线程启动')
    while True:
        exit_file = spider_const.control_exit_file
        if not os.path.exists(exit_file):
            pause = spider_const.control_exit_duration * 60
            log('未检测到退出文件,休眠{0}秒'.format(pause))
            time.sleep(pause)
            continue
        self.isExit = True
        log('检测到退出文件,退出程序.exit_file = {0}'.format(exit_file))
        return
def parseAchieve(self, content):
    """Extract the achievement figures from a profile page.

    Each ``div.Profile-sideColumnItem`` card is classified by the text of
    its ``div.IconGraf`` title, and the matching values are collected.

    Args:
        content: Page markup handed to ``pq``; ``None`` gives an empty dict.

    Returns:
        A dict that may contain, depending on which cards are present:
        ``is_excellent_answer`` (True) and ``excellent_topic``;
        ``record_num`` and ``record_by``; ``applaud_num``,
        ``gratitude_num`` and ``collect_num``; ``public_edit_num``.
        Counts are the digit strings taken from the page, except that
        ``gratitude_num`` / ``collect_num`` are the int 0 when their
        pattern is not found.
    """
    achieve = {}
    if content is None:
        return achieve

    # Compiled once instead of on every card. Raw strings keep ``\D``,
    # ``\s`` and ``\d`` from being read as (invalid) string escapes.
    non_digit = re.compile(r'\D')
    thanks_pattern = re.compile(r'获得\s?(\d+)\s?次感谢')
    collect_pattern = re.compile(r'(\d+)\s?次收藏')

    cards = pq(content)('div.Profile-sideColumnItem')
    for item in cards.items():
        title = item('div.IconGraf').text()
        if title == '优秀回答者':
            topic = item('div.Profile-sideColumnItemValue').text()
            achieve['is_excellent_answer'] = True
            achieve['excellent_topic'] = topic
            log('优秀回答者:topic=' + topic)
        elif title.startswith('知乎收录'):
            # The count is whatever digits appear in the title.
            record_num = non_digit.sub('', title)
            record_by = item('div.Profile-sideColumnItemValue').text()
            achieve['record_num'] = record_num
            achieve['record_by'] = record_by
            log('知乎收录{0}个答案, {1}'.format(record_num, record_by))
        elif title.startswith('获得'):
            # The title carries the upvote count; the thanks and collect
            # counts are in the card's value text.
            applaud_num = non_digit.sub('', title)
            item_content = item('div.Profile-sideColumnItemValue').text()
            thanks_match = thanks_pattern.search(item_content)
            gratitude_num = thanks_match.group(1) if thanks_match else 0
            collect_match = collect_pattern.search(item_content)
            collect_num = collect_match.group(1) if collect_match else 0
            achieve['applaud_num'] = applaud_num
            achieve['gratitude_num'] = gratitude_num
            achieve['collect_num'] = collect_num
            log('获得{0}次称赞,{1}次感谢,{2}次收藏'.format(applaud_num, gratitude_num,
                                               collect_num))
        elif title.startswith('参与'):
            public_edit_num = non_digit.sub('', title)
            achieve['public_edit_num'] = public_edit_num
            log('参与{0}次公共编辑'.format(public_edit_num))
    return achieve
def catchUserFollowingThread(self):
    """Thread body: crawl followings page by page until ``isExit`` is set.

    Each pass takes a user and the last finished page from
    ``getFirstUserToFollowing2``, marks the user as being crawled, and
    walks the remaining pages. Fetched followers are saved and the page
    number is recorded. A user is marked as fully crawled only when the
    page loop was not interrupted by ``self.isExit``.
    """
    spider = ZhiHuSpider()
    db = DBUtil()
    states = Status.Following()

    while self.isExit == False:
        # Pick the next user to work on.
        userId, currentPage = db.getFirstUserToFollowing2()
        log('开始抓取用户关注者,user_id={0}, current_page={1}'.format(
            userId, currentPage))
        if userId is None:
            time.sleep(3)
            continue

        db.setUserIsFollowing(userId, states.is_catching)

        # Number of following pages for this user.
        total = self.getUserFollowingPageNum(userId)
        log('当前用户总的关注者的页数,user_id={0}, total_page={1}'.format(
            userId, total))

        # The user follows nobody.
        if total == 0:
            db.setUserIsFollowing(userId, states.user_following_none)
            continue

        for page in range(currentPage + 1, total + 1):
            if self.isExit:
                # Interrupted: leave the user unfinished.
                log('当前用户关注的人没有抓取完毕,中途退出,user_id = {0}'.format(userId))
                break

            followers = self.getUserFollowingPageContent(userId, page)
            if len(followers) > 0:
                db.saveFollowerInfo(userId, followers)
                # Refresh the state and record this page as finished.
                db.setUserIsFollowing(userId, states.is_catching)
                db.setUserFollowingPage(userId, page)
            log('抓取完一页用户的关注者,user_id={0}, page={1}, list.size={2}'.format(
                userId, page, len(followers)))
            time.sleep(self.time_duration * 20)
        else:
            # Every page was processed without an exit request.
            db.setUserIsFollowing(userId, states.catched)
            log('当前用户关注的人全部抓取完毕,user_id= %s' % userId)

    log('获取用户关注者的线程运行结束')
def catchUserInfoThread(self, lock):
    """Thread body: crawl user profiles until ``self.isExit`` is set.

    Each pass claims the next uncrawled user, fetches the profile with
    ``ZhiHuSpider.getUserInfo`` and records the outcome in the database
    according to the returned ``code``.

    Args:
        lock: Lock shared by all user-info threads. It is held while a user
            is picked and marked as being crawled, so that two threads do
            not claim the same user.
    """
    s = ZhiHuSpider()
    db = DBUtil()
    st = Status.Catch()
    while not self.isExit:
        # ``with`` guarantees the lock is released even if a database call
        # raises; a leaked lock would block every other worker forever.
        with lock:
            userId = db.getFirstUserToCatch()
            if userId is not None:
                # Mark as in progress before letting go of the lock.
                db.setUserIsCatch(userId, st.is_catching)
        if userId is None:
            # Nothing to do right now; wait outside the lock.
            time.sleep(5)
            continue

        log('开始爬取用户,pid={0}, user_id={1}'.format(os.getpid(), userId))
        user_info = s.getUserInfo(userId)
        code = user_info['code']
        if code == s.code_user_not_useful:
            # The user is not worth keeping.
            log('用户没有价值,pid={0}, user_id={1}'.format(os.getpid(), userId))
            db.setUserIsCatch(userId, st.user_not_useful)
        elif code == s.code_user_not_exist:
            # The profile page does not exist.
            log('用户不存在,是僵尸粉,pid={0}, user_id={1}'.format(
                os.getpid(), userId))
            db.setUserIsCatch(userId, st.user_not_exist)
        elif code == s.code_failure:
            # Crawling failed.
            log('用户抓取失败,pid={0}, user_id={1}'.format(os.getpid(), userId))
            db.setUserIsCatch(userId, st.failed)
        else:
            # Success: store the profile and the achievements.
            log('用户抓取成功,pid={0}, user_id={1}'.format(os.getpid(), userId))
            db.updateUserInfo(userId, user_info)
            db.saveAchieveInfo(userId, user_info)
    log('获取用户详细信息的线程结束,tid = {0}'.format(self.getThreadId()))
def testProess(self, lock):
    """Log ``self.count`` and the process id every three seconds, forever.

    Args:
        lock: Accepted but not used by this method.
    """
    while True:
        message = 'count={0},pid={1}'.format(self.count, os.getpid())
        log(message)
        time.sleep(3)