def main(username='', userurl='/u/1765475181'):
    """Record the target user globally, log in if needed, then crawl the page.

    ``username`` and ``userurl`` are stored on ``globalValue``.  When
    ``globalValue.isLogIn`` is false, a login is attempted with the account
    file ``./conf/account``; a failed login prints an error and exits.
    After a successful (or pre-existing) login a ``parserSaverThread`` is
    started and ``userurl`` is crawled.

    Per the original author's note, ``userurl`` is the value of the
    <key, value> pair passed in by the master.
    """
    globalValue.mainusername = username
    globalValue.mainuserurl = userurl

    if not globalValue.isLogIn:
        # Account file holding the Weibo credentials: first line is the
        # user name, second line is the password.
        account_file = './conf/account'
        session = weiboLogin.weiboLogin()
        if session.login(account_file) != 1:
            print('Login error!')
            exit()
        globalValue.isLogIn = True
        print('Login success!')

    crawler = getWeiboPage.getWeiboPage()
    # parserSaverThread processes the crawled pages, extracts every
    # @-mentioned user and stores them in userque2salve for the slaves.
    worker = parserSaverThread()
    # Non-daemon: the child thread does not end when the main thread ends.
    worker.setDaemon(False)
    worker.start()
    # Crawl the user's page.
    crawler.get_userpage(userurl)
# Every category id listed in the original script, kept so that callers can
# pass a different selection to main().
ALL_CATEGORYS = (
    '102803_ctg1_4188_-_ctg1_4188', '102803_ctg1_2088_-_ctg1_2088',
    '102803_ctg1_5988_-_ctg1_5988', '102803_ctg1_5088_-_ctg1_5088',
    '102803_ctg1_1288_-_ctg1_1288', '102803_ctg1_4288_-_ctg1_4288',
    '102803_ctg1_4688_-_ctg1_4688', '102803_ctg1_2488_-_ctg1_2488',
    '102803_ctg1_3288_-_ctg1_3288', '102803_ctg1_5288_-_ctg1_5288',
    '102803_ctg1_5188_-_ctg1_5188', '102803_ctg1_1388_-_ctg1_1388',
    '102803_ctg1_4788_-_ctg1_4788', '102803_ctg1_2188_-_ctg1_2188',
    '102803_ctg1_6088_-_ctg1_6088', '102803_ctg1_1199_-_ctg1_1199',
    '102803_ctg1_2288_-_ctg1_2288', '102803_ctg1_4988_-_ctg1_4988',
    '102803_ctg1_1988_-_ctg1_1988', '102803_ctg1_4388_-_ctg1_4388',
    '102803_ctg1_5788_-_ctg1_5788', '102803_ctg1_4888_-_ctg1_4888',
    '102803_ctg1_2588_-_ctg1_2588', '102803_ctg1_3188_-_ctg1_3188',
    '102803_ctg1_1488_-_ctg1_1488', '102803_ctg1_2688_-_ctg1_2688',
    '102803_ctg1_5588_-_ctg1_5588', '102803_ctg1_5888_-_ctg1_5888',
    '102803_ctg1_1688_-_ctg1_1688', '102803_ctg1_4588_-_ctg1_4588',
    '102803_ctg1_5388_-_ctg1_5388', '102803_ctg1_5488_-_ctg1_5488',
    '102803_ctg1_4488_-_ctg1_4488', '102803_ctg1_1588_-_ctg1_1588',
    '102803_ctg1_2388_-_ctg1_2388', '102803_ctg1_5688_-_ctg1_5688',
    '102803_ctg1_6399_-_ctg1_6399', '102803_ctg1_2788_-_ctg1_2788',
)


def _crawl_listing_page(category, page_no):
    """Fetch and store the three segments of one listing page.

    A fresh ``getWeiboPage`` object is created for every segment, exactly
    as the original code did.  Returns False as soon as a fetch returns 0
    (the original stops paging on that value) and True once all three
    segments have been stored.
    """
    # (fetch method, pause in seconds after storing that segment)
    steps = (
        ('get_firstpage', 5),
        ('get_secondpage', 10),
        ('get_thirdpage', 30),
    )
    for method_name, pause in steps:
        fetcher = getWeiboPage.getWeiboPage(category, page_no)
        segment = getattr(fetcher, method_name)()
        if segment == 0:
            return False
        matcher.matcher(segment).insertContents()
        time.sleep(pause)
    return True


def main(categorys=None):
    """Log in and crawl every listing page of the selected categories.

    categorys: optional iterable of category ids.  When omitted, the single
        category the original script ended up using is crawled, so calling
        ``main()`` behaves as before.  Pass ``ALL_CATEGORYS`` (or a subset)
        to crawl something else; the original author marked the selection
        as "needs to be modified" by hand.

    Prints 'Login error!' and exits when the login does not return 1.
    """
    if categorys is None:
        # The original overwrote ``categorys`` three times before use;
        # only this last value ever took effect.
        categorys = ['102803_ctg1_5688_-_ctg1_5688']

    # NOTE(review): credentials are hard-coded in the source; consider
    # loading them from configuration instead.
    username = '******'
    pwd = 'nan18756072542'

    WBLogin = weiboLogin.weiboLogin()
    if WBLogin.login(username, pwd) != 1:
        print('Login error!')
        exit()
    print('Login success!')

    for category in categorys:
        i = 1
        while True:
            # Both literals are unicode: on Python 2, adding a non-ASCII
            # byte string to a unicode string triggers an implicit ASCII
            # decode and raises UnicodeDecodeError.
            print(u'正在获取第' + str(i) + u'页内容、、、')
            if not _crawl_listing_page(category, i):
                break
            i += 1
        # NOTE(review): the original indentation was lost; this 60 s pause
        # is assumed to sit between categories rather than between pages
        # -- confirm against the original file.
        time.sleep(60)
def crawler(self, uname, uid):
    """Crawl the fan pages of one user.

    Looks up the number of fan pages with ``self.get_page_count``, downloads
    the user's profile page to extract the page id with ``self.pidpattern``,
    then hands everything to ``getWeiboPage.get_msg``.

    uname: user name, used for the progress message and passed to get_msg.
    uid: user id, used to build the profile URL.

    Returns 1 unconditionally.
    Raises IndexError when ``self.pidpattern`` finds no match in the page;
    errors from ``urllib2`` propagate unchanged.
    """
    page_num = self.get_page_count(uid)
    print(uname + " has " + str(page_num) + " pages fans")

    url = "http://weibo.com/u/" + str(uid) + "?source=webim"
    response = urllib2.urlopen(urllib2.Request(url))
    try:
        text = response.read()
    finally:
        # Release the connection even when the read fails.
        response.close()

    pid = self.pidpattern.findall(text)[0]
    WBpage = getWeiboPage.getWeiboPage()
    WBpage.get_msg(uname, uid, pid, page_num)
    return 1
def start(self):
    """Walk ``self.uidList`` and set up paging for each listed user.

    Each entry is a comma-separated string whose first field is the uid and
    whose second field is the uid URL.  For every user the first page is
    fetched to estimate the post count; the second and third segments are
    fetched only when the estimate exceeds 15 and 30 posts.  Users with 45
    posts or fewer are skipped after that.  For the rest, the page range
    is derived from ``self.startcrawlpage`` / ``self.wantpages`` and the
    paging fields of the request body are updated page by page.

    Any exception (including BaseException subclasses) writes the current
    uid to './fail_log.txt' via ``self.writefile`` and stops the loop.
    """
    for entry in self.uidList:
        time.sleep(5)
        try:
            fields = entry.split(',')
            uid = fields[0]
            print('Start crawling uid: ')
            print(uid)
            uidURL = fields[1]
            print(uidURL)
            print('')

            gwp = getWeiboPage(uid, uidURL, self.outputFolder + '/' + str(uid))
            gwp.body['page'] = 1

            first_html = gwp.get_firstpage()
            post_estimate = int(self.getTotalPost(first_html))
            if post_estimate <= 15:
                continue
            # More than 15 posts: the second segment is fetched as well
            # (its result is not used).
            gwp.get_secondpage()
            if post_estimate <= 30:
                continue
            third_html = gwp.get_thirdpage()
            if post_estimate <= 45:
                continue

            last_page = int(self.getTotalPage(third_html))
            if not (3 <= self.startcrawlpage <= last_page):
                self.startcrawlpage = 2
            page_limit = self.wantpages + self.startcrawlpage
            if self.wantpages > 0 and last_page >= page_limit:
                last_page = page_limit

            for page in range(self.startcrawlpage, last_page + 1):
                gwp.body['pre_page'] = page - 1
                gwp.body['page'] = page
                print(page)
                # The per-page fetch calls (get_firstpage / get_secondpage /
                # get_thirdpage) were disabled in the original.
        except BaseException:
            self.writefile('./fail_log.txt', uid)
            break
def start(self):
    """Walk ``self.uidList`` and set up paging for each listed user.

    Each entry is a comma-separated string: uid first, uid URL second.
    The first page is always fetched and used to estimate the number of
    posts; the second and third segments are fetched only when the estimate
    exceeds 15 and 30.  Users whose estimate is 45 or less are skipped.
    Otherwise the page range is computed from ``self.startcrawlpage`` and
    ``self.wantpages`` and the request body's paging fields are updated for
    each page in that range.

    On any exception (BaseException is caught) the current uid is written
    to './fail_log.txt' through ``self.writefile`` and the loop ends.
    """
    for record in self.uidList:
        time.sleep(5)
        try:
            parts = record.split(',')
            uid = parts[0]
            print('Start crawling uid: ')
            print(uid)
            uidURL = parts[1]
            print(uidURL)
            print('')

            gwp = getWeiboPage(uid, uidURL, self.outputFolder + '/' + str(uid))
            gwp.body['page'] = 1

            html_first = gwp.get_firstpage()
            estimated_posts = int(self.getTotalPost(html_first))
            if estimated_posts <= 15:
                continue
            # Second segment is fetched for users with more than 15 posts;
            # the returned HTML is not used.
            gwp.get_secondpage()
            if estimated_posts <= 30:
                continue
            html_third = gwp.get_thirdpage()
            if estimated_posts <= 45:
                continue

            total_pages = int(self.getTotalPage(html_third))
            if not (3 <= self.startcrawlpage <= total_pages):
                self.startcrawlpage = 2
            upper_bound = self.wantpages + self.startcrawlpage
            if self.wantpages > 0 and total_pages >= upper_bound:
                total_pages = upper_bound

            for page in range(self.startcrawlpage, total_pages + 1):
                gwp.body['pre_page'] = page - 1
                gwp.body['page'] = page
                print(page)
                # The original left the actual page fetches
                # (get_firstpage / get_secondpage / get_thirdpage) disabled.
        except BaseException:
            self.writefile('./fail_log.txt', uid)
            break
def test():
    """Process every keyword in ``GV.task_list`` until it reports empty.

    For each task the remaining count is printed; falsy keywords are
    skipped.  Otherwise the keyword is set on a shared ``getWeiboPage``
    object and its messages are fetched.  A failing fetch is logged with
    its traceback and processing continues with the next task.
    """
    page = getWeiboPage.getWeiboPage()
    while not GV.task_list.empty():
        keyword = GV.task_list.get()
        utility.iprint("还剩下 %d 个任务" % GV.task_list.qsize())
        if not keyword:
            continue
        page.set_keyword(keyword)
        utility.iprint('handle id:%s' % page.get_keyword())
        try:
            page.get_msg(page.get_keyword())
        except Exception as e:
            # Log the failure and carry on with the next task.
            logging.exception(keyword + "用户信息解析出错: " + str(e))
def test():
    """Drain ``GV.task_list``, fetching messages for each non-empty keyword.

    The number of remaining tasks is printed after every ``get``.  Falsy
    keywords are ignored.  Errors raised while fetching are logged (with
    traceback) and do not stop the loop.
    """
    fetcher = getWeiboPage.getWeiboPage()
    while not GV.task_list.empty():
        keyword = GV.task_list.get()
        utility.iprint("还剩下 %d 个任务" % GV.task_list.qsize())
        if not keyword:
            continue
        fetcher.set_keyword(keyword)
        utility.iprint('handle id:%s' % fetcher.get_keyword())
        try:
            fetcher.get_msg(fetcher.get_keyword())
        except Exception as e:
            # Best effort: record the failure, then move to the next task.
            logging.exception(keyword + "用户信息解析出错: " + str(e))
def test():
    """Drain ``GV.task_list``, fetching messages for each non-empty uid.

    A single ``getWeiboPage`` object, built with ``GV.dict_klg``, is reused
    for all tasks.  The number of remaining tasks is printed after every
    ``get``; falsy uids are ignored.  Errors raised while fetching are
    logged (with traceback) and do not stop the loop.
    """
    fetcher = getWeiboPage.getWeiboPage(GV.dict_klg)
    while not GV.task_list.empty():
        uid = GV.task_list.get()
        utility.iprint("还剩下 %d 个任务" % GV.task_list.qsize())
        if not uid:
            continue
        fetcher.set_uid(uid)
        utility.iprint('handle id:%s' % fetcher.get_uid())
        try:
            fetcher.get_msg(fetcher.get_uid())
        except Exception as e:
            # Best effort: record the failure, then move to the next task.
            logging.exception(uid + "用户信息解析出错: " + str(e))
def run(self):
    """Consume keywords from ``GV.task_list`` until empty or stopped.

    The loop ends when ``GV.task_list.empty()`` is true or
    ``self.thread_stop`` is set.  For each task the remaining count is
    printed; falsy keywords are skipped.  Otherwise the keyword is set on a
    ``getWeiboPage`` object and its messages are fetched.  A failing fetch
    is logged with its traceback and the loop continues.

    NOTE(review): ``empty()`` followed by a blocking ``get()`` can stall if
    another consumer takes the last item in between -- confirm whether
    several threads share ``GV.task_list``.
    """
    WBcontent = getWeiboPage.getWeiboPage()
    while not GV.task_list.empty() and not self.thread_stop:
        keyword = GV.task_list.get()
        utility.iprint("还剩下 %d 个任务" % GV.task_list.qsize())
        if not keyword:
            continue
        WBcontent.set_keyword(keyword)
        utility.iprint('Thread %s handle id:%s'
                       % (self.t_name, WBcontent.get_keyword()))
        try:
            WBcontent.get_msg(WBcontent.get_keyword())
        except Exception as e:
            # The error text is passed as a logging argument rather than
            # concatenated into the format string: a '%' inside the error
            # message would otherwise break formatting and lose the record.
            logging.exception("%s 用户信息解析出错:%s",
                              WBcontent.get_keyword(), e)
def run(self):
    """Consume uids from ``GV.task_list`` until empty or stopped.

    The loop ends when ``GV.task_list.empty()`` is true or
    ``self.thread_stop`` is set.  For each task the remaining count is
    printed; falsy uids are skipped.  Otherwise the uid is set on a
    ``getWeiboPage`` object and its messages are fetched.  A failing fetch
    is logged with its traceback and the loop continues.

    NOTE(review): ``empty()`` followed by a blocking ``get()`` can stall if
    another consumer takes the last item in between -- confirm whether
    several threads share ``GV.task_list``.
    """
    WBcontent = getWeiboPage.getWeiboPage()
    while not GV.task_list.empty() and not self.thread_stop:
        uid = GV.task_list.get()
        utility.iprint("还剩下 %d 个任务" % GV.task_list.qsize())
        if not uid:
            continue
        WBcontent.set_uid(uid)
        utility.iprint('Thread %s handle id:%s'
                       % (self.t_name, WBcontent.get_uid()))
        try:
            WBcontent.get_msg(WBcontent.get_uid())
        except Exception as e:
            # The error text is passed as a logging argument rather than
            # concatenated into the format string: a '%' inside the error
            # message would otherwise break formatting and lose the record.
            logging.exception("%s 用户信息解析出错:%s",
                              WBcontent.get_uid(), e)
def run(self):
    """Consume keywords from ``GV.task_list`` until empty or stopped.

    Stops when the task list reports empty or ``self.thread_stop`` is set.
    After each ``get`` the remaining task count is printed.  Falsy keywords
    are skipped; for the others the keyword is set on a ``getWeiboPage``
    object and its messages are fetched.  Fetch errors are logged with
    their traceback and do not end the loop.

    NOTE(review): ``empty()`` followed by a blocking ``get()`` can stall if
    another consumer takes the last item in between -- confirm whether
    several threads share ``GV.task_list``.
    """
    WBcontent = getWeiboPage.getWeiboPage()
    while not GV.task_list.empty() and not self.thread_stop:
        keyword = GV.task_list.get()
        utility.iprint("还剩下 %d 个任务" % GV.task_list.qsize())
        if not keyword:
            continue
        WBcontent.set_keyword(keyword)
        utility.iprint('Thread %s handle id:%s'
                       % (self.t_name, WBcontent.get_keyword()))
        try:
            WBcontent.get_msg(WBcontent.get_keyword())
        except Exception as e:
            # Pass the error as a logging argument instead of splicing it
            # into the format string; a '%' in the error text would make
            # the formatting step fail and the record would be lost.
            logging.exception("%s 用户信息解析出错:%s",
                              WBcontent.get_keyword(), e)
# NOTE(review): the next three statements are the tail of a function whose
# ``def`` line lies outside this excerpt; they are reproduced unchanged.
fread = file(filename)
for line in fread:
    uid_list.append(line.strip())


def writefile(filename, content):
    """Append ``content`` to ``filename``.

    filename: path of the file, opened in append mode.
    content: string written as-is (no newline is added).
    """
    # ``with`` closes the handle even when write() raises.
    with open(filename, 'a') as fw:
        fw.write(content)


if __name__ == '__main__':
    username = ''
    pwd = ''
    WBLogin = weiboLogin.weiboLogin()
    if WBLogin.login(username, pwd) == 'servertime_error':
        print('login failed. check out your network.')
        sys.exit()

    uid_list = []
    get_uid('C:/Result1.txt', uid_list)

    path = 'C:/weibodata'
    if not os.path.exists(path):
        os.mkdir(path)

    for uid in uid_list:
        try:
            WBpage = getWeiboPage.getWeiboPage()
            WBpage.get_msg(uid)
        except Exception:
            # Record the uid whose crawl failed and keep going.
            writefile('C:/id.txt', str(uid) + '\n')
# NOTE(review): the next three statements are the tail of a function whose
# ``def`` line lies outside this excerpt; they are reproduced unchanged.
fread = file(filename)
for line in fread:
    uid_list.append(line.strip())


def writefile(filename, content):
    """Append ``content`` to ``filename``.

    filename: path of the file, opened in append mode.
    content: string written as-is (no newline is added).
    """
    # ``with`` closes the handle even when write() raises.
    with open(filename, 'a') as fw:
        fw.write(content)


if __name__ == '__main__':
    username = ''
    pwd = ''
    WBLogin = weiboLogin.weiboLogin()
    if WBLogin.login(username, pwd) == 'servertime_error':
        print('login failed. check out your network.')
        sys.exit()

    uid_list = []
    get_uid('C:/Result1.txt', uid_list)

    path = 'C:/weibodata'
    if not os.path.exists(path):
        os.mkdir(path)

    for uid in uid_list:
        try:
            WBpage = getWeiboPage.getWeiboPage()
            WBpage.get_msg(uid)
        except Exception:
            # Record the uid whose crawl failed and keep going.
            writefile('C:/id.txt', str(uid) + '\n')