Example #1
def fans_crawler(username,n):
    print ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>fans_crawler"
    fans_info_list = list()
    fans_pair_list = list()
    url = get_homepage(username)
    uid,domain = get_ids(url)
    login.weiboLogin()                                    # the login module must be called here
    fans_page,follow_page = create_url(uid,domain)
    fans_info_list,fans_pair_list = get_info(fans_page,username,n)
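    # Assumption: get_info returns a list of user-info dicts (each carrying at least an
    # "id" and a "name", as used below) together with a list of relation pairs.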
    # print fans_info_list
    # print fans_id_list
    new_follows = list() # accumulates follow pairs across all fans, so it must not be reset inside the loop
    for item in fans_info_list:
        new_fans_info = list()
        new_fans_pair = list()
        new_follows_info = list()
        new_follows_pair = list()
        uid_page = "http://weibo.com/u/"+str(item["id"])
        new_uid,new_domain = get_ids(uid_page)
        new_fans_page,new_follows_page = create_url(new_uid,new_domain)
        new_fans_info,new_fans_pair = get_info(new_fans_page,str(item["name"]),n)
        new_follows_info,new_follows_pair = get_info(new_follows_page,str(item["name"]),n)
        fans_pair_list.extend(new_fans_pair)
        new_follows.extend(new_follows_pair)
    return fans_pair_list,new_follows
Example #2
def crawler(username,n):
    # print ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>fans_crawler"
    result = dict()
    fans_info_list = list()
    fans_pair_list = list()
    new_follows = list()
    follows_info_list = list()
    follow_pair_list = list()
    new_fans = list()
    url = get_homepage(username)
    uid,domain = get_ids(url)
    login.weiboLogin()                                    # the login module must be called here
    fans_page,follow_page = create_url(uid,domain)
    fans_info_list,fans_pair_list = get_info(fans_page,username,n)
    follows_info_list,follow_pair_list = get_info(follow_page,username,n)
    follow_pair_list.extend(fans_pair_list)
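    # follow_pair_list now aggregates every relation pair (fans and follows); the two
    # loops below add the second-degree relations of each fan and each followee to it.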
    for item in fans_info_list:
        new_fans_info = list()
        new_fans_pair = list()
        new_follows_info = list()
        new_follows_pair = list()
        uid_page = "http://weibo.com/u/"+str(item["id"])
        new_uid,new_domain = get_ids(uid_page)
        new_fans_page,new_follows_page = create_url(new_uid,new_domain)
        new_fans_info,new_fans_pair = get_info(new_fans_page,str(item["name"]),n)
        new_follows_info,new_follows_pair = get_info(new_follows_page,str(item["name"]),n)
        # fans_pair_list.extend(new_fans_pair)
        follow_pair_list.extend(new_fans_pair)
        # new_follows.extend(new_follows_pair)
        follow_pair_list.extend(new_follows_pair)
    for item in follows_info_list:
        new_fans_info = list()
        new_fans_pair = list()
        new_follows_info = list()
        new_follows_pair = list()
        uid_page = "http://weibo.com/u/"+str(item["id"])
        new_uid,new_domain = get_ids(uid_page)
        new_fans_page,new_follows_page = create_url(new_uid,new_domain)
        new_fans_info,new_fans_pair = get_info(new_fans_page,str(item["name"]),n)
        new_follows_info,new_follows_pair = get_info(new_follows_page,str(item["name"]),n)
        # follow_pair_list.extend(new_follows_pair)
        # new_fans.extend(new_fans_pair)
        follow_pair_list.extend(new_fans_pair)
        follow_pair_list.extend(new_follows_pair)
    # fans_pair_list.extend(new_fans)
    # follow_pair_list.extend(new_follows)
    # result = {"fans":fans_pair_list,"follows":follow_pair_list}
    result = {"relation":follow_pair_list}
    return result
Example #3
 def __init__(self):
     threading.Thread.__init__(self)
     # add the login code here
     client_id = '1000570550'
     redirect_uri = 'http://www.data-god.com'
     username = '******'
     passwd = 'antonidas25'
     url = "https://api.weibo.com/oauth2/authorize?client_id="+client_id+"&redirect_uri="+redirect_uri+"&response_type=code"
     conn = httplib.HTTPSConnection("api.weibo.com")
     postdata = urllib.urlencode({'client_id':client_id,'redirect_uri':redirect_uri,'action':'submit','userId':username,'passwd':passwd})
     conn.request('POST','/oauth2/authorize',postdata,{'Referer':url, 'Content-Type': 'application/x-www-form-urlencoded'})
     res = conn.getresponse()
     page = res.read()
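     # Assumption: the authorize endpoint answers with a redirect whose Location header
     # carries the authorization code as "?code=..."; the slice below strips "code=".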
     code = res.msg['Location'].split("?")[1][5:]
     try:
         res.getheaders()
     except Exception:
         print 'authorize error!'
     else:
         print 'authorize success!'
     login.weiboLogin()
Example #4
def mainCrawler(name, initial):
    logPath = conf.logPath
    logging.basicConfig(filename=logPath,format="%(threadName)s:%(asctime)s:%(levelname)s:%(message)s",level=logging.DEBUG)

    cookie = login.weiboLogin()
    if not cookie:
        print "cookie is None"
        return
    
    startID = readID()

    if startID == None:
        startID = conf.startID

    if conf.specificID != "":
        startID = conf.specificID

    errorBound = 20
    errorCount = errorBound #stop if an error page is returned 20 times in a row
    relogin = conf.relogin
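    # relogin counts down once per processed ID; when it drops below zero the crawler
    # fetches a fresh cookie (conf.relogin is assumed to hold that interval).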
    while startID != None:
        relogin -= 1
        if relogin < 0:
            print "System relogin"
            logging.info("System relogin")
            cookie = login.weiboLogin()
            relogin = conf.relogin
            if not cookie:
                print "cookie is none"
                return

        logging.info("ID:\t"+startID)
        
        if startID == conf.skipID:
            startID = readID()
            continue

        info = getInfo('http://www.weibo.com/u/' + startID, cookie, logPath)
        if info == None:
            logging.info("info: None\t" + startID)
            print "info: None\t" + startID
            logging.info("relogin")
            print "relogin"
            cookie = login.weiboLogin()
            startID = readID()
            continue

        if info == "error":
            errorCount -= 1
            if errorCount == 0:
                logging.error("Too many error pages")
                print "Too many error pages"
                return
            startID = readID()
            continue
        else:
            errorCount = errorBound

        if not personalInfoCrawler(info['id'], info['domain'], cookie, logPath):
            return
        if not fansCrawler(info['id'], info['domain'], cookie, logPath):
            return
        if not followCrawler(info['id'], info['domain'], cookie, logPath):
            return
        if not weiboCrawler(cookie, info, logPath):
            return

        if conf.specificID != "":
            break
        if initial:
            dbhandler.createIndex()
            break
    
        startID = readID() 

    logging.info("Finished! " + str(name))
    print "Finished! " + str(name)
Example #5
def weiboCrawler(cookie,info,logPath):  
    logging.basicConfig(filename=logPath,format="%(asctime)s:%(levelname)s:%(message)s",level=logging.DEBUG) 
    domain = info["domain"]
    idstr = "" + domain + info["id"]

    #get the URL info from the first page of the user's weibos
    weiboUrl = "http://weibo.com/" + info['id']

    pageCount = 1
    weiboNum = int(info["weiboNum"])
    totalPage = weiboNum / 46 + 1
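    # Assumption: one profile page holds up to 46 weibos (16 in the initial HTML plus two
    # lazily loaded chunks of 15 each), hence the division by 46 here and the 16/26
    # thresholds further down that decide whether the extra chunks need to be fetched.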
    if totalPage > conf.maxWeiboPage: #by default, only crawl the user's most recent 460 weibos
        totalPage = conf.maxWeiboPage
    while weiboNum != 0 and pageCount <= totalPage:
        #a weibo page needs to be dynamically loaded twice
        postfix = "/weibo?is_search=0&visible=0&is_tag=0&profile_ftype=1&page=%u#feedtop"%(pageCount) #suffix of every weibo page
        firstWeiboUrl = weiboUrl + postfix
        print firstWeiboUrl
        secondUrlInfo = getWeiboUrlInfo(cookie, firstWeiboUrl, 0, False,logPath) 
        if secondUrlInfo == None:
            postfix = "/mblog?is_search=0&visible=0&is_tag=0&profile_ftype=1&page=%u#feedtop"%(pageCount)
            firstWeiboUrl = weiboUrl + postfix
            print firstWeiboUrl
            secondUrlInfo = getWeiboUrlInfo(cookie, firstWeiboUrl, 0, False,logPath) 
            if secondUrlInfo == None:
                postfix = "/feed?is_search=0&visible=0&is_tag=0&profile_ftype=1&page=%u#feedtop"%(pageCount)
                firstWeiboUrl = weiboUrl + postfix
                print firstWeiboUrl
                secondUrlInfo = getWeiboUrlInfo(cookie, firstWeiboUrl, 0, False,logPath)
                if secondUrlInfo == None:
                    logging.warning("Failed to get weibos, skip " + info["id"])
                    logging.info(firstWeiboUrl)
                    print "Failed to get weibos, skip " + info["id"]
                    return "skip" #skip the user

        #the weibo content
        try:
            weiboList = parser.parseWeibo(secondUrlInfo["page"],info["id"],1) 
            for item in weiboList:
                zanCrawler(item["weiboID"], item["userID"], item["zanNum"], cookie)
                logging.info("zan written")
                print "zan written"
                repostCrawler(item["weiboID"], item["userID"], item["repostNum"], cookie)
                logging.info("repost written")
                print "repost written"
                commentCrawler(item["weiboID"], item["userID"], item["commentNum"], cookie)
                logging.info("comment written")
                print "comment written"
                
        except exception.CookieExpiredException, e:
            logging.info(firstWeiboUrl)
            logging.warning("CookieExpiredException: " + e.info)
            logging.info("Sleep:\t" + str(conf.waitTime))
            print firstWeiboUrl
            print "CookieExpiredException: " + e.info
            print "Sleep:\t" + str(conf.waitTime)
            time.sleep(conf.waitTime)
            cookie = login.weiboLogin() #cookie expired, log in again
            if not cookie:
                logging.error("failed to relogin")
                print "failed to relogin"
                return False
            HEADERS = {"cookie": cookie}
            continue
    
        if weiboNum - (pageCount-1)*46 > 16:
            max_id = secondUrlInfo["max_id"]
            end_id = secondUrlInfo["end_id"]
            pre_page = pageCount
            pagebar = 0
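            # pagebar=0 asks for the first lazily loaded chunk; max_id/end_id come from the
            # previous response, presumably so Weibo returns the next slice of the feed.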
            secondWeiboUrl = "http://www.weibo.com/p/aj/mblog/mbloglist?domain=%s&pre_page=%u&page=%u&max_id=%s&end_id=%s&count=15&pagebar=%u&max_msign=&filtered_min_id=&pl_name=Pl_Official_LeftProfileFeed__11&id=%s&script_uri=/p/%s/weibo&feed_type=0&is_search=0&visible=0&is_tag=0&profile_ftype=1"%(domain, pre_page, pageCount, max_id, end_id, pagebar, idstr, idstr)    
            print secondWeiboUrl
            thirdUrlInfo = getWeiboUrlInfo(cookie, secondWeiboUrl, 1, False, logPath)

            #the weibo content
            if thirdUrlInfo != None:
                weiboList = parser.parseWeibo(thirdUrlInfo["page"],info["id"],2)
                for item in weiboList:
                    zanCrawler(item["weiboID"], item["userID"], item["zanNum"], cookie)
                    logging.info("zan written")
                    print "zan written"
                    repostCrawler(item["weiboID"], item["userID"], item["repostNum"], cookie)
                    logging.info("repost written")
                    print "repost written"
                    commentCrawler(item["weiboID"], item["userID"], item["commentNum"], cookie)
                    logging.info("comment written")
                    print "comment written"
            
        if weiboNum - (pageCount-1)*46 > 26 and thirdUrlInfo != None:
            max_id = thirdUrlInfo["max_id"]
            end_id = thirdUrlInfo["end_id"]
            pre_page = pageCount
            pagebar = 1
            thirdWeiboUrl = "http://www.weibo.com/p/aj/mblog/mbloglist?domain=%s&pre_page=%u&page=%u&max_id=%s&end_id=%s&count=15&pagebar=%u&max_msign=&filtered_min_id=&pl_name=Pl_Official_LeftProfileFeed__11&id=%s&script_uri=/p/%s/weibo&feed_type=0&is_search=0&visible=0&is_tag=0&profile_ftype=1"%(domain, pre_page, pageCount, max_id, end_id, pagebar, idstr, idstr)
            print thirdWeiboUrl
           
            HEADERS = {"cookie": cookie}
            try:
                req = urllib2.Request(thirdWeiboUrl, headers=HEADERS)
                page  = urllib2.urlopen(req).read()
            except Exception, e:
                logging.warning(e)
                logging.info("Sleep:\t" + str(conf.waitTime))
                print e
                print thirdWeiboUrl
                print "Sleep:\t" + str(conf.waitTime)
                time.sleep(conf.waitTime)
                try:
                    req = urllib2.Request(thirdWeiboUrl, headers=HEADERS)
                    page  = urllib2.urlopen(req).read()
                except Exception, e:
                    logging.warning(e)
                    logging.info(thirdWeiboUrl)
                    print e
                    print thirdWeiboUrl
                    return False

        pageCount += 1 #advance to the next profile page so the loop can terminate
Example #6
def followCrawler(userID, domain, cookie,logPath):
    logging.basicConfig(filename=logPath,format="%(asctime)s:%(levelname)s:%(message)s",level=logging.DEBUG)
    HEADERS = {"cookie": cookie}

    pageCount = 1
    totalPage = 10
    flag = True #only to get the total page number one time
    count = 1
    timesToWait = 1 #the times of conf.waitTime to wait
    while pageCount <= totalPage: #Sina only lets us see 10 pages
        url = "http://www.weibo.com/p/" + domain + userID + "/follow?page=" + str(pageCount)
        try:
            req = urllib2.Request(url, headers=HEADERS)
            page  = urllib2.urlopen(req).read()
        except socket.error as e:
            if count < 1:
                return None
            logging.warning(e)
            print e
            if e.errno == errno.ECONNREFUSED: #sina.com has detected the crawling, so wait long enough before reconnecting
                timesToWait = timesToWait * 2
                logging.info("Sleep:\t" + str(timesToWait * conf.waitTime)) #'timesToWait' times the conf.waitTime
                print "Sleep:\t" + str(timesToWait * conf.waitTime)
                time.sleep(timesToWait * conf.waitTime)
                count = count - 1
                continue
            time.sleep(conf.waitTime)
            continue #no page was fetched on this attempt, so retry instead of falling through
        except Exception, e:
            logging.warning(e)
            logging.info("Sleep:\t" + str(conf.waitTime))
            print e
            print url
            time.sleep(conf.waitTime)
            continue
        
        if flag:
            temp = parser.getNum(page, 2)
            total = 0
            if temp != None:
                total = int(temp)
            if total <= 180:
                totalPage = total / 20 + 1
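                #20 follows are listed per page, and Sina caps the visible list at 10 pages,
                #so anything up to 180 fits in total/20 + 1 pages (assumed page size)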
            flag = False

        try:
            if not parser.parseFollows(page, userID):
                logging.info(url)
                print url
                break
            else:
                #print url
                logging.info("Page " + str(pageCount) + " Follows written!")
                print "Page " + str(pageCount) + " Follows written!"
                pageCount += 1
        except exception.CookieExpiredException, e:
            logging.info(url)
            logging.warning("CookieExpiredException: " + e.info)
            logging.info("Sleep:\t" + str(conf.waitTime))
            print url
            print "CookieExpiredException: " + e.info
            print "Sleep:\t" + str(conf.waitTime)
            time.sleep(conf.waitTime)
            cookie = login.weiboLogin() #cookie expired, log in again
            if not cookie:
                logging.error("failed to relogin")
                print "failed to relogin"
                return False
            HEADERS = {"cookie": cookie}
            continue
Example #7
            return None
        count = count - 1

    try:
        info = parser.parseInfo(page)
    except exception.ParseInfoException, e:
        logging.warning("ParseInfoException: " + e.info)
        logging.info("Sleep:\t" + str(conf.waitTime))
        print "ParseInfoException: " + e.info
        print "Sleep:\t" + str(conf.waitTime)
        print url
        time.sleep(conf.waitTime)
        try:
            print "relogin"
            logging.info("relogin")
            cookie = login.weiboLogin()
            HEADERS = {"cookie": cookie}
            req = urllib2.Request(url, headers=HEADERS)
            page  = urllib2.urlopen(req).read() 
            if parser.errorPage(page) == True:
                logging.warning("Get error page " + url)
                print "Get error page " + url
                return "error"
            newUrl = parser.resetUrl(page)
            if newUrl != None:
                url = newUrl
                req = urllib2.Request(url, headers=HEADERS)
                page  = urllib2.urlopen(req).read() 

            info = parser.parseInfo(page)
        except Exception, e: