コード例 #1
0
def main(username='', userurl='/u/1765475181'):
    """Crawl one Weibo user page, logging in first if no session is active.

    The given user name and page URL are recorded on ``globalValue``. When
    ``globalValue.isLogIn`` is falsy, a login is attempted with the account
    file ``./conf/account``; if that login does not return ``1`` the process
    exits. Otherwise a ``parserSaverThread`` is started and the user page is
    fetched.

    Args:
        username: Stored as ``globalValue.mainusername``.
        userurl: Relative URL of the user page to crawl; also stored as
            ``globalValue.mainuserurl``.
    """
    globalValue.mainusername = username
    globalValue.mainuserurl = userurl

    if not globalValue.isLogIn:
        # Account file layout (per the original author): user name on the
        # first line, password on the second.
        account_path = './conf/account'
        login_client = weiboLogin.weiboLogin()
        if login_client.login(account_path) != 1:
            print('Login error!')
            exit()
        globalValue.isLogIn = True
        print('Login success!')

    page_crawler = getWeiboPage.getWeiboPage()

    # Per the original author's note: parserSaverThread processes the crawled
    # pages, extracts every @-mentioned user and stores them in userque2salve
    # to be handed to the slaves.
    saver_thread = parserSaverThread()
    # Non-daemon: the thread is not stopped when the main thread finishes.
    saver_thread.setDaemon(False)
    saver_thread.start()

    page_crawler.get_userpage(userurl)  # fetch the page
コード例 #2
0
ファイル: main.py プロジェクト: nnchen1231/Hello-world
def _crawl_page(category, page_no):
    """Fetch and store the three parts of one listing page.

    Each part is requested through a fresh ``getWeiboPage`` object, passed to
    ``matcher`` for insertion and followed by a pause.

    Returns:
        ``False`` as soon as a part comes back as ``0`` (nothing is inserted
        for that part and no pause follows), ``True`` when all three parts
        were processed.
    """
    # (fetch method name, pause in seconds after that part was inserted)
    parts = (
        ('get_firstpage', 5),
        ('get_secondpage', 10),
        ('get_thirdpage', 30),
    )
    for fetch_name, pause in parts:
        fetch = getattr(getWeiboPage.getWeiboPage(category, page_no), fetch_name)
        page = fetch()
        if page == 0:
            return False
        matcher.matcher(page).insertContents()
        time.sleep(pause)
    return True


def main():
    """Log in to Weibo and crawl the selected category feeds page by page.

    For every selected category, page numbers are crawled from 1 upwards
    until one of the page parts comes back as ``0``; the crawl then pauses
    for 60 seconds before moving on to the next category. If the login does
    not return ``1`` the process exits.
    """
    # Complete catalogue of known category ids, kept for reference when
    # choosing what to crawl; it is not iterated directly.
    all_categories = [
        '102803_ctg1_4188_-_ctg1_4188', '102803_ctg1_2088_-_ctg1_2088', '102803_ctg1_5988_-_ctg1_5988',
        '102803_ctg1_5088_-_ctg1_5088', '102803_ctg1_1288_-_ctg1_1288', '102803_ctg1_4288_-_ctg1_4288',
        '102803_ctg1_4688_-_ctg1_4688', '102803_ctg1_2488_-_ctg1_2488', '102803_ctg1_3288_-_ctg1_3288',
        '102803_ctg1_5288_-_ctg1_5288', '102803_ctg1_5188_-_ctg1_5188', '102803_ctg1_1388_-_ctg1_1388',
        '102803_ctg1_4788_-_ctg1_4788', '102803_ctg1_2188_-_ctg1_2188', '102803_ctg1_6088_-_ctg1_6088',
        '102803_ctg1_1199_-_ctg1_1199', '102803_ctg1_2288_-_ctg1_2288', '102803_ctg1_4988_-_ctg1_4988',
        '102803_ctg1_1988_-_ctg1_1988', '102803_ctg1_4388_-_ctg1_4388', '102803_ctg1_5788_-_ctg1_5788',
        '102803_ctg1_4888_-_ctg1_4888', '102803_ctg1_2588_-_ctg1_2588', '102803_ctg1_3188_-_ctg1_3188',
        '102803_ctg1_1488_-_ctg1_1488', '102803_ctg1_2688_-_ctg1_2688', '102803_ctg1_5588_-_ctg1_5588',
        '102803_ctg1_5888_-_ctg1_5888', '102803_ctg1_1688_-_ctg1_1688', '102803_ctg1_4588_-_ctg1_4588',
        '102803_ctg1_5388_-_ctg1_5388', '102803_ctg1_5488_-_ctg1_5488', '102803_ctg1_4488_-_ctg1_4488',
        '102803_ctg1_1588_-_ctg1_1588', '102803_ctg1_2388_-_ctg1_2388', '102803_ctg1_5688_-_ctg1_5688',
        '102803_ctg1_6399_-_ctg1_6399', '102803_ctg1_2788_-_ctg1_2788',
    ]
    # Categories actually crawled by this run; edit this list to change the
    # selection.
    categorys = ['102803_ctg1_5688_-_ctg1_5688']

    # NOTE(review): credentials are hard-coded in source; they should be
    # moved to configuration or the environment and rotated.
    username = '******'
    pwd = 'nan18756072542'

    WBLogin = weiboLogin.weiboLogin()
    if WBLogin.login(username, pwd) != 1:
        print('Login error!')
        exit()

    print('Login success!')
    for category in categorys:
        i = 1
        while True:
            # Both literals are unicode: mixing u'' with a non-ASCII byte
            # string triggers an implicit ASCII decode and fails on Python 2.
            print(u'正在获取第' + str(i) + u'页内容、、、')
            if not _crawl_page(category, i):
                break
            i += 1
        time.sleep(60)
コード例 #3
0
ファイル: crawler.py プロジェクト: 52Pig/falcon
 def crawler(self, uname, uid):
     """Fetch the profile page of *uid* and hand the user over to get_msg.

     The number of fan pages is looked up through ``self.get_page_count``,
     the profile page is downloaded, and the first match of
     ``self.pidpattern`` in that page is passed on as the page id.

     Args:
         uname: User name; used in the progress message and passed to
             ``get_msg``.
         uid: User id used to build the profile URL.

     Returns:
         Always ``1``.

     Raises:
         IndexError: If ``self.pidpattern`` finds no match in the page.
     """
     from contextlib import closing

     page_num = self.get_page_count(uid)
     print(uname + "  has  " + str(page_num) + "  pages  fans")
     url = "http://weibo.com/u/" + str(uid) + "?source=webim"
     req = urllib2.Request(url)
     # Close the HTTP response deterministically instead of leaving the
     # socket to be reclaimed whenever the object is garbage collected.
     with closing(urllib2.urlopen(req)) as response:
         text = response.read()
     pid = self.pidpattern.findall(text)[0]
     WBpage = getWeiboPage.getWeiboPage()
     WBpage.get_msg(uname, uid, pid, page_num)
     return 1
コード例 #4
0
    def start(self):
        """Walk ``self.uidList`` and set up the paging state for each user.

        Every entry is a ``"uid,url"`` record. After a 5 second pause the
        first part of page 1 is fetched and the estimated post count is read
        from it. The second and third parts are requested only when that
        count exceeds 15 and 30 respectively, and users with 45 or fewer
        estimated posts are skipped. For the remaining users the total page
        count is read from the third part and the page range is walked.

        On any failure the current uid is appended to ``./fail_log.txt`` and
        the walk stops. ``KeyboardInterrupt``/``SystemExit`` are recorded the
        same way and then re-raised instead of being swallowed.
        """
        for userInfo in self.uidList:
            time.sleep(5)
            # Fallback so the failure log never sees an unbound or stale uid
            # when the record itself cannot be parsed.
            uid = userInfo
            try:
                fields = userInfo.split(',')
                uid = fields[0]
                print('Start crawling uid: ')
                print(uid)
                uidURL = fields[1]
                print(uidURL)
                print('')
                gwp = getWeiboPage(uid, uidURL,
                                   self.outputFolder + '/' + str(uid))
                gwp.body['page'] = 1

                html1 = gwp.get_firstpage()
                est_totalPost = int(self.getTotalPost(html1))
                if est_totalPost <= 15:
                    continue

                # The result is not used here; the request is still issued.
                gwp.get_secondpage()
                if est_totalPost <= 30:
                    continue

                html3 = gwp.get_thirdpage()
                if est_totalPost <= 45:
                    continue

                totalPages = int(self.getTotalPage(html3))

                # This rewrites the attribute on self, so the reset carries
                # over to the users processed afterwards.
                if self.startcrawlpage < 3 or self.startcrawlpage > totalPages:
                    self.startcrawlpage = 2

                # Cap the range when a page budget is configured.
                if (self.wantpages > 0
                        and totalPages >= self.wantpages + self.startcrawlpage):
                    totalPages = self.wantpages + self.startcrawlpage

                # NOTE: this loop only updates the paging fields and prints
                # the page number; no per-page fetch is issued here.
                for page in range(self.startcrawlpage, totalPages + 1):
                    gwp.body['pre_page'] = page - 1
                    gwp.body['page'] = page
                    print(page)

            except BaseException as exc:
                self.writefile('./fail_log.txt', uid)
                if not isinstance(exc, Exception):
                    # Interrupts and interpreter exit are recorded, then
                    # allowed to propagate.
                    raise
                break
コード例 #5
0
     def start(self):
         """Walk ``self.uidList`` and set up the paging state for each user.

         Every entry is a ``"uid,url"`` record. After a 5 second pause the
         first part of page 1 is fetched and the estimated post count is read
         from it. The second and third parts are requested only when that
         count exceeds 15 and 30 respectively, and users with 45 or fewer
         estimated posts are skipped. For the remaining users the total page
         count is read from the third part and the page range is walked.

         On any failure the current uid is appended to ``./fail_log.txt`` and
         the walk stops. ``KeyboardInterrupt``/``SystemExit`` are recorded the
         same way and then re-raised instead of being swallowed.
         """
         for userInfo in self.uidList:
             time.sleep(5)
             # Fallback so the failure log never sees an unbound or stale uid
             # when the record itself cannot be parsed.
             uid = userInfo
             try:
                 fields = userInfo.split(',')
                 uid = fields[0]
                 print('Start crawling uid: ')
                 print(uid)
                 uidURL = fields[1]
                 print(uidURL)
                 print('')
                 gwp = getWeiboPage(uid, uidURL,
                                    self.outputFolder + '/' + str(uid))
                 gwp.body['page'] = 1

                 html1 = gwp.get_firstpage()
                 est_totalPost = int(self.getTotalPost(html1))
                 if est_totalPost <= 15:
                     continue

                 # The result is not used here; the request is still issued.
                 gwp.get_secondpage()
                 if est_totalPost <= 30:
                     continue

                 html3 = gwp.get_thirdpage()
                 if est_totalPost <= 45:
                     continue

                 totalPages = int(self.getTotalPage(html3))

                 # This rewrites the attribute on self, so the reset carries
                 # over to the users processed afterwards.
                 if self.startcrawlpage < 3 or self.startcrawlpage > totalPages:
                     self.startcrawlpage = 2

                 # Cap the range when a page budget is configured.
                 if (self.wantpages > 0
                         and totalPages >= self.wantpages + self.startcrawlpage):
                     totalPages = self.wantpages + self.startcrawlpage

                 # NOTE: this loop only updates the paging fields and prints
                 # the page number; no per-page fetch is issued here.
                 for page in range(self.startcrawlpage, totalPages + 1):
                     gwp.body['pre_page'] = page - 1
                     gwp.body['page'] = page
                     print(page)

             except BaseException as exc:
                 self.writefile('./fail_log.txt', uid)
                 if not isinstance(exc, Exception):
                     # Interrupts and interpreter exit are recorded, then
                     # allowed to propagate.
                     raise
                 break
コード例 #6
0
ファイル: main2.py プロジェクト: bollwang/weibo_search
def test():
	"""Drain ``GV.task_list`` and run a keyword fetch for every task.

	A single ``getWeiboPage`` object is reused for all tasks. Falsy tasks
	are skipped. A failing ``get_msg`` call is logged with its traceback and
	the loop carries on with the next task.
	"""
	WBcontent = getWeiboPage.getWeiboPage()
	while not GV.task_list.empty():
		keyword = GV.task_list.get()
		utility.iprint("还剩下 %d 个任务" % GV.task_list.qsize())
		if not keyword:
			continue
		WBcontent.set_keyword(keyword)
		utility.iprint('handle id:%s' % WBcontent.get_keyword())
		try:
			WBcontent.get_msg(WBcontent.get_keyword())
		except Exception as e:
			# Pass the values as logging arguments instead of concatenating:
			# a non-string task or mixed str/unicode text would otherwise
			# raise inside this handler and abort the whole loop.
			logging.exception("%s用户信息解析出错: %s", keyword, e)
コード例 #7
0
ファイル: main2.py プロジェクト: wesavetheworld/weibo_search
def test():
    """Drain ``GV.task_list`` and run a keyword fetch for every task.

    A single ``getWeiboPage`` object is reused for all tasks. Falsy tasks
    are skipped. A failing ``get_msg`` call is logged with its traceback and
    the loop carries on with the next task.
    """
    WBcontent = getWeiboPage.getWeiboPage()
    while not GV.task_list.empty():
        keyword = GV.task_list.get()
        utility.iprint("还剩下 %d 个任务" % GV.task_list.qsize())
        if not keyword:
            continue
        WBcontent.set_keyword(keyword)
        utility.iprint('handle id:%s' % WBcontent.get_keyword())
        try:
            WBcontent.get_msg(WBcontent.get_keyword())
        except Exception as e:
            # Pass the values as logging arguments instead of concatenating:
            # a non-string task or mixed str/unicode text would otherwise
            # raise inside this handler and abort the whole loop.
            logging.exception("%s用户信息解析出错: %s", keyword, e)
コード例 #8
0
def test():
	"""Drain ``GV.task_list`` and fetch the messages of every queued uid.

	A single ``getWeiboPage`` object, built from ``GV.dict_klg``, is reused
	for all tasks. Falsy tasks are skipped. A failing ``get_msg`` call is
	logged with its traceback and the loop carries on with the next task.
	"""
	WBcontent = getWeiboPage.getWeiboPage(GV.dict_klg)
	while not GV.task_list.empty():
		uid = GV.task_list.get()
		utility.iprint("还剩下 %d 个任务" % GV.task_list.qsize())
		if not uid:
			continue
		WBcontent.set_uid(uid)
		utility.iprint('handle id:%s' % WBcontent.get_uid())
		try:
			WBcontent.get_msg(WBcontent.get_uid())
		except Exception as e:
			# Pass the values as logging arguments instead of concatenating:
			# a non-string uid or mixed str/unicode text would otherwise
			# raise inside this handler and abort the whole loop.
			logging.exception("%s用户信息解析出错: %s", uid, e)
コード例 #9
0
ファイル: main2.py プロジェクト: bollwang/weibo_search
	def run(self):
		"""Process keyword tasks from ``GV.task_list`` until it is empty or
		``self.thread_stop`` is set.

		One ``getWeiboPage`` object is reused for every task handled by this
		call. Falsy tasks are skipped. A failing ``get_msg`` call is logged
		with its traceback and the loop carries on with the next task.
		"""
		WBcontent = getWeiboPage.getWeiboPage()
		# NOTE(review): if several workers share task_list and get() blocks
		# (as on a Queue-like object), a worker can stall here when another
		# one takes the last task after the empty() check. Confirm the type
		# of task_list.
		while not GV.task_list.empty() and not self.thread_stop:
			keyword = GV.task_list.get()
			utility.iprint("还剩下 %d 个任务" % GV.task_list.qsize())
			if not keyword:
				continue
			WBcontent.set_keyword(keyword)
			utility.iprint('Thread %s handle id:%s' % (self.t_name, WBcontent.get_keyword()))
			try:
				WBcontent.get_msg(WBcontent.get_keyword())
			except Exception as e:
				# The exception goes in as an argument, not into the format
				# string: a '%' in its text would otherwise break formatting.
				logging.exception("%s 用户信息解析出错:%s", WBcontent.get_keyword(), e)
コード例 #10
0
	def run(self):
		"""Process uid tasks from ``GV.task_list`` until it is empty or
		``self.thread_stop`` is set.

		One ``getWeiboPage`` object is reused for every task handled by this
		call. Falsy tasks are skipped. A failing ``get_msg`` call is logged
		with its traceback and the loop carries on with the next task.
		"""
		WBcontent = getWeiboPage.getWeiboPage()
		# NOTE(review): if several workers share task_list and get() blocks
		# (as on a Queue-like object), a worker can stall here when another
		# one takes the last task after the empty() check. Confirm the type
		# of task_list.
		while not GV.task_list.empty() and not self.thread_stop:
			uid = GV.task_list.get()
			utility.iprint("还剩下 %d 个任务" % GV.task_list.qsize())
			if not uid:
				continue
			WBcontent.set_uid(uid)
			utility.iprint('Thread %s handle id:%s' % (self.t_name, WBcontent.get_uid()))
			try:
				WBcontent.get_msg(WBcontent.get_uid())
			except Exception as e:
				# The exception goes in as an argument, not into the format
				# string: a '%' in its text would otherwise break formatting.
				logging.exception("%s 用户信息解析出错:%s", WBcontent.get_uid(), e)
コード例 #11
0
ファイル: main2.py プロジェクト: wesavetheworld/weibo_search
    def run(self):
        """Process keyword tasks from ``GV.task_list`` until it is empty or
        ``self.thread_stop`` is set.

        One ``getWeiboPage`` object is reused for every task handled by this
        call. Falsy tasks are skipped. A failing ``get_msg`` call is logged
        with its traceback and the loop carries on with the next task.
        """
        WBcontent = getWeiboPage.getWeiboPage()
        # NOTE(review): if several workers share task_list and get() blocks
        # (as on a Queue-like object), a worker can stall here when another
        # one takes the last task after the empty() check. Confirm the type
        # of task_list.
        while not GV.task_list.empty() and not self.thread_stop:
            keyword = GV.task_list.get()
            utility.iprint("还剩下 %d 个任务" % GV.task_list.qsize())
            if not keyword:
                continue
            WBcontent.set_keyword(keyword)
            utility.iprint('Thread %s handle id:%s' %
                           (self.t_name, WBcontent.get_keyword()))
            try:
                WBcontent.get_msg(WBcontent.get_keyword())
            except Exception as e:
                # The exception goes in as an argument, not into the format
                # string: a '%' in its text would otherwise break formatting.
                logging.exception("%s 用户信息解析出错:%s",
                                  WBcontent.get_keyword(), e)
コード例 #12
0
ファイル: main.py プロジェクト: doumengyu/weibocrawler
    fread = file(filename)
    for line in fread:
        uid_list.append(line.strip())

def writefile(filename, content):
    """Append *content* to the file at *filename*.

    The file is opened in append mode, so it is created when missing and
    existing data is kept. The handle is closed even if the write fails.
    """
    with open(filename, 'a') as fw:
        fw.write(content)


if __name__ == '__main__':
    # Script entry point: log in, load the uid list, make sure the output
    # directory exists, then fetch every uid's messages.

    # Account credentials (blank here).
    username = ''
    pwd = ''
    WBLogin = weiboLogin.weiboLogin()
    login_result = WBLogin.login(username, pwd)
    if login_result == 'servertime_error':
        print('login failed. check out your network.')
        sys.exit()

    # get_uid fills uid_list in place from the input file.
    uid_list = []
    get_uid('C:/Result1.txt', uid_list)

    path = 'C:/weibodata'
    if not os.path.exists(path):
        os.mkdir(path)

    for uid in uid_list:
        try:
            WBpage = getWeiboPage.getWeiboPage()
            WBpage.get_msg(uid)
        except Exception:
            # Best effort: note the uid that failed and keep going.
            writefile('C:/id.txt', str(uid) + '\n')

コード例 #13
0
ファイル: main.py プロジェクト: zdong1/weibocrawler
    fread = file(filename)
    for line in fread:
        uid_list.append(line.strip())


def writefile(filename, content):
    """Append *content* to the file at *filename*.

    The file is opened in append mode, so it is created when missing and
    existing data is kept. The handle is closed even if the write fails.
    """
    with open(filename, 'a') as fw:
        fw.write(content)


if __name__ == '__main__':
    # Script entry point: log in, load the uid list, make sure the output
    # directory exists, then fetch every uid's messages.

    # Account credentials (blank here).
    username = ''
    pwd = ''
    WBLogin = weiboLogin.weiboLogin()
    login_result = WBLogin.login(username, pwd)
    if login_result == 'servertime_error':
        print('login failed. check out your network.')
        sys.exit()

    # get_uid fills uid_list in place from the input file.
    uid_list = []
    get_uid('C:/Result1.txt', uid_list)

    path = 'C:/weibodata'
    if not os.path.exists(path):
        os.mkdir(path)

    for uid in uid_list:
        try:
            WBpage = getWeiboPage.getWeiboPage()
            WBpage.get_msg(uid)
        except Exception:
            # Best effort: note the uid that failed and keep going.
            writefile('C:/id.txt', str(uid) + '\n')