def fans_crawler(username, n):
    print ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>fans_crawler"
    fans_info_list = list()
    fans_pair_list = list()
    new_follows = list()  # accumulate follow pairs across all fans, not per iteration
    url = get_homepage(username)
    uid, domain = get_ids(url)
    login.weiboLogin()  # the login module must run here to obtain a session
    fans_page, follow_page = create_url(uid, domain)
    fans_info_list, fans_pair_list = get_info(fans_page, username, n)
    for item in fans_info_list:
        # crawl one level deeper: the fans and follows of each fan
        uid_page = "http://weibo.com/u/" + str(item["id"])
        new_uid, new_domain = get_ids(uid_page)
        new_fans_page, new_follows_page = create_url(new_uid, new_domain)
        new_fans_info, new_fans_pair = get_info(new_fans_page, str(item["name"]), n)
        new_follows_info, new_follows_pair = get_info(new_follows_page, str(item["name"]), n)
        fans_pair_list.extend(new_fans_pair)
        new_follows.extend(new_follows_pair)
    return fans_pair_list, new_follows
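# The helper shapes below are assumptions inferred from how fans_crawler and
# crawler use them, not the actual implementations found elsewhere in this
# module:
#   get_homepage(username)      -> homepage URL of the user
#   get_ids(url)                -> (uid, domain) parsed from that page
#   create_url(uid, domain)     -> (fans_page_url, follow_page_url)
#   get_info(page_url, name, n) -> (info_list, pair_list), where info_list is
#       [{"id": ..., "name": ...}, ...] for the first n users on the page and
#       pair_list holds the (name, other_user) relation edges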
def crawler(username, n):
    fans_info_list = list()
    fans_pair_list = list()
    follows_info_list = list()
    follow_pair_list = list()
    url = get_homepage(username)
    uid, domain = get_ids(url)
    login.weiboLogin()  # the login module must run here to obtain a session
    fans_page, follow_page = create_url(uid, domain)
    fans_info_list, fans_pair_list = get_info(fans_page, username, n)
    follows_info_list, follow_pair_list = get_info(follow_page, username, n)
    follow_pair_list.extend(fans_pair_list)
    for item in fans_info_list:
        # second level: the fans and follows of each fan
        uid_page = "http://weibo.com/u/" + str(item["id"])
        new_uid, new_domain = get_ids(uid_page)
        new_fans_page, new_follows_page = create_url(new_uid, new_domain)
        new_fans_info, new_fans_pair = get_info(new_fans_page, str(item["name"]), n)
        new_follows_info, new_follows_pair = get_info(new_follows_page, str(item["name"]), n)
        follow_pair_list.extend(new_fans_pair)
        follow_pair_list.extend(new_follows_pair)
    for item in follows_info_list:
        # second level: the fans and follows of each followed user
        uid_page = "http://weibo.com/u/" + str(item["id"])
        new_uid, new_domain = get_ids(uid_page)
        new_fans_page, new_follows_page = create_url(new_uid, new_domain)
        new_fans_info, new_fans_pair = get_info(new_fans_page, str(item["name"]), n)
        new_follows_info, new_follows_pair = get_info(new_follows_page, str(item["name"]), n)
        follow_pair_list.extend(new_fans_pair)
        follow_pair_list.extend(new_follows_pair)
    result = {"relation": follow_pair_list}
    return result
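# A minimal usage sketch, assuming each relation pair is a (source, target)
# tuple; "seed_user" and n=20 are placeholder arguments:
if __name__ == "__main__":
    result = crawler("seed_user", 20)
    for src, dst in result["relation"]:
        print "%s\t%s" % (src, dst)  # one relation edge per line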
def __init__(self):
    threading.Thread.__init__(self)
    # login code goes here
    client_id = '1000570550'
    redirect_uri = 'http://www.data-god.com'
    username = '******'
    passwd = 'antonidas25'
    url = "https://api.weibo.com/oauth2/authorize?client_id=" + client_id + "&redirect_uri=" + redirect_uri + "&response_type=code"
    conn = httplib.HTTPSConnection("api.weibo.com")
    postdata = urllib.urlencode({'client_id': client_id, 'redirect_uri': redirect_uri, 'action': 'submit', 'userId': username, 'passwd': passwd})
    conn.request('POST', '/oauth2/authorize', postdata, {'Referer': url, 'Content-Type': 'application/x-www-form-urlencoded'})
    res = conn.getresponse()
    page = res.read()
    code = res.msg['Location'].split("?")[1][5:]  # strip the "code=" prefix from the redirect query string
    try:
        res.getheaders()
    except:
        print 'authorize error!'
    print 'authorize success!'
    login.weiboLogin()
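# Hedged sketch of the OAuth2 step this class stops short of: exchanging the
# authorization code for an access token. client_secret is a placeholder the
# caller must supply; the endpoint follows Weibo's standard OAuth2 flow:
def exchangeCodeForToken(client_id, client_secret, code, redirect_uri):
    postdata = urllib.urlencode({
        'client_id': client_id,
        'client_secret': client_secret,
        'grant_type': 'authorization_code',
        'code': code,
        'redirect_uri': redirect_uri,
    })
    conn = httplib.HTTPSConnection("api.weibo.com")
    conn.request('POST', '/oauth2/access_token', postdata,
                 {'Content-Type': 'application/x-www-form-urlencoded'})
    return conn.getresponse().read()  # JSON body containing access_token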
def mainCrawler(name, initial):
    logPath = conf.logPath
    logging.basicConfig(filename=logPath, format="%(threadName)s:%(asctime)s:%(levelname)s:%(message)s", level=logging.DEBUG)
    cookie = login.weiboLogin()
    if not cookie:
        print "cookie is None"
        return
    startID = readID()
    if startID == None:
        startID = conf.startID
    if conf.specificID != "":
        startID = conf.specificID
    errorBound = 20
    errorCount = errorBound  # abort after errorBound consecutive error pages
    relogin = conf.relogin
    while startID != None:
        relogin -= 1
        if relogin < 0:
            print "System relogin"
            logging.info("System relogin")
            cookie = login.weiboLogin()
            relogin = conf.relogin
            if not cookie:
                print "cookie is none"
                return
        logging.info("ID:\t" + startID)
        if startID == conf.skipID:
            startID = readID()
            continue
        info = getInfo('http://www.weibo.com/u/' + startID, cookie, logPath)
        if info == None:
            logging.info("info: None\t" + startID)
            print "info: None\t" + startID
            logging.info("relogin")
            print "relogin"
            cookie = login.weiboLogin()
            startID = readID()
            continue
        if info == "error":
            errorCount -= 1
            if errorCount == 0:
                logging.error("Too many error pages")
                print "Too many error pages"
                return
            startID = readID()
            continue
        else:
            errorCount = errorBound
        if not personalInfoCrawler(info['id'], info['domain'], cookie, logPath):
            return
        if not fansCrawler(info['id'], info['domain'], cookie, logPath):
            return
        if not followCrawler(info['id'], info['domain'], cookie, logPath):
            return
        if not weiboCrawler(cookie, info, logPath):
            return
        if conf.specificID != "":
            break
        if initial:
            dbhandler.createIndex()
            break
        startID = readID()
    logging.info("Finished! " + str(name))
    print "Finished! " + str(name)
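# The real readID lives elsewhere in this project; this stand-in, which pops
# one uid per call from a plain-text queue file (one id per line), is an
# illustrative assumption only. conf.idQueuePath is a hypothetical setting:
def readID():
    try:
        with open(conf.idQueuePath) as f:
            ids = f.read().split()
    except IOError:
        return None
    if not ids:
        return None
    with open(conf.idQueuePath, "w") as f:
        f.write("\n".join(ids[1:]))  # persist the rest of the queue
    return ids[0]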
def weiboCrawler(cookie, info, logPath):
    logging.basicConfig(filename=logPath, format="%(asctime)s:%(levelname)s:%(message)s", level=logging.DEBUG)
    domain = info["domain"]
    idstr = domain + info["id"]
    # build the URL of the first weibo page
    weiboUrl = "http://weibo.com/" + info['id']
    pageCount = 1
    weiboNum = int(info["weiboNum"])
    totalPage = weiboNum / 46 + 1
    if totalPage > conf.maxWeiboPage:  # by default only crawl the user's most recent 460 weibos
        totalPage = conf.maxWeiboPage
    while weiboNum != 0 and pageCount <= totalPage:
        # each weibo page must be loaded dynamically twice more
        postfix = "/weibo?is_search=0&visible=0&is_tag=0&profile_ftype=1&page=%u#feedtop" % (pageCount)  # suffix of each weibo page
        firstWeiboUrl = weiboUrl + postfix
        print firstWeiboUrl
        secondUrlInfo = getWeiboUrlInfo(cookie, firstWeiboUrl, 0, False, logPath)
        if secondUrlInfo == None:
            postfix = "/mblog?is_search=0&visible=0&is_tag=0&profile_ftype=1&page=%u#feedtop" % (pageCount)
            firstWeiboUrl = weiboUrl + postfix
            print firstWeiboUrl
            secondUrlInfo = getWeiboUrlInfo(cookie, firstWeiboUrl, 0, False, logPath)
        if secondUrlInfo == None:
            postfix = "/feed?is_search=0&visible=0&is_tag=0&profile_ftype=1&page=%u#feedtop" % (pageCount)
            firstWeiboUrl = weiboUrl + postfix
            print firstWeiboUrl
            secondUrlInfo = getWeiboUrlInfo(cookie, firstWeiboUrl, 0, False, logPath)
        if secondUrlInfo == None:
            logging.warning("Failed to get weibos, skip " + info["id"])
            logging.info(firstWeiboUrl)
            print "Failed to get weibos, skip " + info["id"]
            return "skip"  # skip this user
        # weibo content
        try:
            weiboList = parser.parseWeibo(secondUrlInfo["page"], info["id"], 1)
            for item in weiboList:
                zanCrawler(item["weiboID"], item["userID"], item["zanNum"], cookie)
                logging.info("zan written")
                print "zan written"
                repostCrawler(item["weiboID"], item["userID"], item["repostNum"], cookie)
                logging.info("repost written")
                print "repost written"
                commentCrawler(item["weiboID"], item["userID"], item["commentNum"], cookie)
                logging.info("comment written")
                print "comment written"
        except exception.CookieExpiredException, e:
            logging.info(firstWeiboUrl)
            logging.warning("CookieExpiredException: " + e.info)
            logging.info("Sleep:\t" + str(conf.waitTime))
            print firstWeiboUrl
            print "CookieExpiredException: " + e.info
            print "Sleep:\t" + str(conf.waitTime)
            time.sleep(conf.waitTime)
            cookie = login.weiboLogin()  # cookie expired, login again
            if not cookie:
                logging.error("failed to relogin")
                print "failed to relogin"
                return False
            HEADERS = {"cookie": cookie}
            continue
        if weiboNum - (pageCount - 1) * 46 > 16:
            max_id = secondUrlInfo["max_id"]
            end_id = secondUrlInfo["end_id"]
            pre_page = pageCount
            pagebar = 0
            secondWeiboUrl = "http://www.weibo.com/p/aj/mblog/mbloglist?domain=%s&pre_page=%u&page=%u&max_id=%s&end_id=%s&count=15&pagebar=%u&max_msign=&filtered_min_id=&pl_name=Pl_Official_LeftProfileFeed__11&id=%s&script_uri=/p/%s/weibo&feed_type=0&is_search=0&visible=0&is_tag=0&profile_ftype=1" % (domain, pre_page, pageCount, max_id, end_id, pagebar, idstr, idstr)
            print secondWeiboUrl
            thirdUrlInfo = getWeiboUrlInfo(cookie, secondWeiboUrl, 1, False, logPath)
            # weibo content
            if thirdUrlInfo != None:
                weiboList = parser.parseWeibo(thirdUrlInfo["page"], info["id"], 2)
                for item in weiboList:
                    zanCrawler(item["weiboID"], item["userID"], item["zanNum"], cookie)
                    logging.info("zan written")
                    print "zan written"
                    repostCrawler(item["weiboID"], item["userID"], item["repostNum"], cookie)
                    logging.info("repost written")
                    print "repost written"
                    commentCrawler(item["weiboID"], item["userID"], item["commentNum"], cookie)
                    logging.info("comment written")
                    print "comment written"
            if weiboNum - (pageCount - 1) * 46 > 26 and thirdUrlInfo != None:
                max_id = thirdUrlInfo["max_id"]
                end_id = thirdUrlInfo["end_id"]
                pre_page = pageCount
                pagebar = 1
                thirdWeiboUrl = "http://www.weibo.com/p/aj/mblog/mbloglist?domain=%s&pre_page=%u&page=%u&max_id=%s&end_id=%s&count=15&pagebar=%u&max_msign=&filtered_min_id=&pl_name=Pl_Official_LeftProfileFeed__11&id=%s&script_uri=/p/%s/weibo&feed_type=0&is_search=0&visible=0&is_tag=0&profile_ftype=1" % (domain, pre_page, pageCount, max_id, end_id, pagebar, idstr, idstr)
                print thirdWeiboUrl
                HEADERS = {"cookie": cookie}
                try:
                    req = urllib2.Request(thirdWeiboUrl, headers=HEADERS)
                    page = urllib2.urlopen(req).read()
                except Exception, e:
                    logging.warning(e)
                    logging.info("Sleep:\t" + str(conf.waitTime))
                    print e
                    print thirdWeiboUrl
                    print "Sleep:\t" + str(conf.waitTime)
                    time.sleep(conf.waitTime)
                    try:
                        req = urllib2.Request(thirdWeiboUrl, headers=HEADERS)
                        page = urllib2.urlopen(req).read()
                    except Exception, e:
                        logging.warning(e)
                        logging.info(thirdWeiboUrl)
                        print e
                        print thirdWeiboUrl
                        return False
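# The request/sleep/retry pattern above recurs throughout this module. A
# minimal refactor sketch, assuming a single retry after conf.waitTime is the
# intended policy everywhere (the helper name is ours, not the project's):
def fetchWithOneRetry(url, cookie):
    headers = {"cookie": cookie}
    for attempt in (1, 2):
        try:
            req = urllib2.Request(url, headers=headers)
            return urllib2.urlopen(req).read()
        except Exception, e:
            logging.warning(e)
            print e
            print url
            if attempt == 2:
                return None  # let the caller decide whether to abort
            logging.info("Sleep:\t" + str(conf.waitTime))
            print "Sleep:\t" + str(conf.waitTime)
            time.sleep(conf.waitTime)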
def followCrawler(userID, domain, cookie, logPath):
    logging.basicConfig(filename=logPath, format="%(asctime)s:%(levelname)s:%(message)s", level=logging.DEBUG)
    HEADERS = {"cookie": cookie}
    pageCount = 1
    totalPage = 10
    flag = True  # fetch the total page number only once
    count = 1
    timesToWait = 1  # multiplier applied to conf.waitTime before reconnecting
    while pageCount <= totalPage:  # Sina only lets us see 10 pages
        url = "http://www.weibo.com/p/" + domain + userID + "/follow?page=" + str(pageCount)
        try:
            req = urllib2.Request(url, headers=HEADERS)
            page = urllib2.urlopen(req).read()
        except socket.error as e:
            if count < 1:
                return None
            logging.warning(e)
            print e
            if e.errno == errno.ECONNREFUSED:
                # sina.com has detected the crawler; wait long enough to reconnect
                timesToWait = timesToWait * 2
                logging.info("Sleep:\t" + str(timesToWait * conf.waitTime))  # 'timesToWait' times conf.waitTime
                print "Sleep:\t" + str(timesToWait * conf.waitTime)
                time.sleep(timesToWait * conf.waitTime)
            count = count - 1
            continue
        except Exception, e:
            logging.warning(e)
            logging.info("Sleep:\t" + str(conf.waitTime))
            print e
            print url
            time.sleep(conf.waitTime)
            continue
        if flag:
            temp = parser.getNum(page, 2)
            total = 0
            if temp != None:
                total = int(temp)
            if total <= 180:
                totalPage = total / 20 + 1
            flag = False
        try:
            if not parser.parseFollows(page, userID):
                logging.info(url)
                print url
                break
            else:
                logging.info("Page " + str(pageCount) + " Follows written!")
                print "Page " + str(pageCount) + " Follows written!"
                pageCount += 1
        except exception.CookieExpiredException, e:
            logging.info(url)
            logging.warning("CookieExpiredException: " + e.info)
            logging.info("Sleep:\t" + str(conf.waitTime))
            print url
            print "CookieExpiredException: " + e.info
            print "Sleep:\t" + str(conf.waitTime)
            time.sleep(conf.waitTime)
            cookie = login.weiboLogin()  # cookie expired, login again
            if not cookie:
                logging.error("failed to relogin")
                print "failed to relogin"
                return False
            HEADERS = {"cookie": cookie}
            continue
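# followCrawler backs off exponentially on ECONNREFUSED: every refusal doubles
# the multiplier on conf.waitTime. The same policy as a standalone sketch; the
# cap of 16 is our assumption, not a value taken from this project:
def backoffOnRefused(timesToWait):
    timesToWait = min(timesToWait * 2, 16)  # assumed cap so the sleep stays bounded
    logging.info("Sleep:\t" + str(timesToWait * conf.waitTime))
    print "Sleep:\t" + str(timesToWait * conf.waitTime)
    time.sleep(timesToWait * conf.waitTime)
    return timesToWait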
        return None
    count = count - 1
    try:
        info = parser.parseInfo(page)
    except exception.ParseInfoException, e:
        logging.warning("ParseInfoException: " + e.info)
        logging.info("Sleep:\t" + str(conf.waitTime))
        print "ParseInfoException: " + e.info
        print "Sleep:\t" + str(conf.waitTime)
        print url
        time.sleep(conf.waitTime)
        try:
            print "relogin"
            logging.info("relogin")
            cookie = login.weiboLogin()
            HEADERS = {"cookie": cookie}
            req = urllib2.Request(url, headers=HEADERS)
            page = urllib2.urlopen(req).read()
            if parser.errorPage(page) == True:
                logging.warning("Get error page " + url)
                print "Get error page " + url
                return "error"
            newUrl = parser.resetUrl(page)
            if newUrl != None:
                url = newUrl
                req = urllib2.Request(url, headers=HEADERS)
                page = urllib2.urlopen(req).read()
            info = parser.parseInfo(page)
        except Exception, e: