def start_testing(self):
    """Log in, then poll one fixed profile page forever (login smoke test).

    After ``self.login_weibo()`` the ``Cookie`` header is rebuilt from
    ``self.cookie_dict`` using the two cookie names configured under the
    ``crawler`` section (``user_token`` and ``user_id``).  The page is then
    requested in an endless loop; each body that was read is passed to
    ``parse.parse_user_info`` and dumped to ``test.html``, and the loop
    sleeps 12 minutes between requests.

    ``URLError`` is logged and the loop continues.  This method never
    returns normally; it is intended for manual testing only.
    """
    self.login_weibo()

    token_name = self.config.get('crawler', 'user_token')
    id_name = self.config.get('crawler', 'user_id')
    cookie_str = (token_name + '=' + self.cookie_dict[token_name]
                  + ';' +
                  id_name + '=' + self.cookie_dict[id_name])
    # Same output as the two-item print statement (items separated by one
    # space), written so the line also parses as a function call.
    print(' '.join(('start visiting!: ', cookie_str)))

    headers = {
        'User-Agent': self.config.get('crawler', 'User-Agent'),
        'Cookie': cookie_str
    }
    to_visit_url = 'http://weibo.cn/1644798402/info'
    req = urllib2.Request(url=to_visit_url, headers=headers)
    sleep_time = 12 * 60  # seconds between two polls

    while 1:  # for test only
        # Reset every round so a failed request can neither hit an unbound
        # name nor re-dump the page fetched by an earlier round.
        html = None
        try:
            response = self.opener.open(req)
            try:
                html = response.read()
                # NOTE(review): get_user_info passes a user id as the second
                # argument of parse.parse_user_info; this 4-argument call may
                # be stale -- confirm against the parse module.
                parse.parse_user_info(html, headers, self.opener, self.logger)
            finally:
                # Close the response even when reading or parsing fails.
                response.close()
        except URLError as e:
            if hasattr(e, 'code'):
                self.logger.error("http url error code: %s" % e.code)
            if hasattr(e, 'reason'):
                self.logger.error("http url error reason: %s" % e.reason)

        if html is not None:
            with open('test.html', 'w') as dump_file:
                dump_file.write(html)

        print('will sleep for %d seconds' % sleep_time)
        time.sleep(sleep_time)
def start_testing(self):
    """Log in, then poll one fixed profile page forever (login smoke test).

    After ``self.login_weibo()`` the ``Cookie`` header is rebuilt from
    ``self.cookie_dict`` using the two cookie names configured under the
    ``crawler`` section (``user_token`` and ``user_id``).  The page is then
    requested in an endless loop; each body that was read is passed to
    ``parse.parse_user_info`` and dumped to ``test.html``, and the loop
    sleeps 12 minutes between requests.

    ``URLError`` is logged and the loop continues.  This method never
    returns normally; it is intended for manual testing only.
    """
    self.login_weibo()

    token_name = self.config.get("crawler", "user_token")
    id_name = self.config.get("crawler", "user_id")
    cookie_str = (
        token_name
        + "="
        + self.cookie_dict[token_name]
        + ";"
        + id_name
        + "="
        + self.cookie_dict[id_name]
    )
    # Same output as the two-item print statement (items separated by one
    # space), written so the line also parses as a function call.
    print(" ".join(("start visiting!: ", cookie_str)))

    headers = {"User-Agent": self.config.get("crawler", "User-Agent"), "Cookie": cookie_str}
    to_visit_url = "http://weibo.cn/1644798402/info"
    req = urllib2.Request(url=to_visit_url, headers=headers)
    sleep_time = 12 * 60  # seconds between two polls

    while 1:  # for test only
        # Reset every round so a failed request can neither hit an unbound
        # name nor re-dump the page fetched by an earlier round.
        html = None
        try:
            response = self.opener.open(req)
            try:
                html = response.read()
                # NOTE(review): get_user_info passes a user id as the second
                # argument of parse.parse_user_info; this 4-argument call may
                # be stale -- confirm against the parse module.
                parse.parse_user_info(html, headers, self.opener, self.logger)
            finally:
                # Close the response even when reading or parsing fails.
                response.close()
        except URLError as e:
            if hasattr(e, "code"):
                self.logger.error("http url error code: %s" % e.code)
            if hasattr(e, "reason"):
                self.logger.error("http url error reason: %s" % e.reason)

        if html is not None:
            with open("test.html", "w") as dump_file:
                dump_file.write(html)

        print("will sleep for %d seconds" % sleep_time)
        time.sleep(sleep_time)
def get_user_info(self, headers, user_url):
    """Fetch one weibo.cn user's pages and store the profile if it is new.

    Args:
        headers: HTTP headers used for both requests; also forwarded to
            ``parse.parse_user_info``.
        user_url: Path of the user's page below ``http://weibo.cn/``.  When
            it contains ``u/`` the first two characters are dropped and the
            remainder is used as the user id; otherwise the value is used
            as the username.

    Returns:
        A tuple ``(is_banned, user_id)``:

        * user already stored: ``(False, id)`` where ``id`` comes from
          ``self.get_userid_by_username``;
        * ``parse.is_pub_page`` is true for either page: ``(True, user_id)``
          with the id known at that point (``0`` if none yet);
        * ``URLError`` while fetching the home page: ``(True, 0)``;
        * otherwise ``(False, user_id)`` with the id taken from
          ``parse.parse_user_home``.  A ``URLError`` on the info page is
          only logged and ends up here as well.
    """
    is_banned = False
    user_id = 0

    # NOTE(review): this is a substring test, not a prefix test, although the
    # slice below assumes the value starts with "u/".
    if 'u/' in user_url:
        # "u/<id>" form: what follows the prefix is the user id.
        user_url = user_url[2:]
        user_id = user_url
    # For the "u/" form the username equals the id; otherwise it is the
    # name given in the url.
    username = user_url

    # Skip users that are already in the database.
    if user_id == 0:
        is_stored = self.is_stored_username(username)
    else:
        is_stored = self.is_stored_user(user_id)
    if is_stored:
        user_id = self.get_userid_by_username(username)
        print('%s has been stored already' % user_id)
        return is_banned, user_id

    # New user.  The home page yields user_home, from which the user id is
    # read (the original comments say it also carries the following,
    # follower and status counts).
    to_visit_url = 'http://weibo.cn/' + str(user_url)
    req = urllib2.Request(url=to_visit_url, headers=headers)
    try:
        response = self.opener.open(req)
        try:
            html = response.read()
            if parse.is_pub_page(html):
                is_banned = True
                return is_banned, user_id
            user_home = parse.parse_user_home(html)
            print(user_home)
            user_id = user_home['user_id']
        finally:
            # Runs on the early return above and on errors too, so the
            # response is never left open.
            response.close()
    except URLError as e:
        user_id = 0
        is_banned = True
        if hasattr(e, 'code'):
            self.logger.error("http url error code: %s" % e.code)
        if hasattr(e, 'reason'):
            self.logger.error("http url error reason: %s" % e.reason)
        return is_banned, user_id

    # The info page yields the profile details.
    to_visit_url = 'http://weibo.cn/' + str(user_id) + "/info"
    req = urllib2.Request(url=to_visit_url, headers=headers)
    try:
        response = self.opener.open(req)
        try:
            html_str = response.read()
            if parse.is_pub_page(html_str):
                is_banned = True
                return is_banned, user_id
            user_info = parse.parse_user_info(str(html_str), user_id, headers,
                                              self.opener, self.logger)
        finally:
            response.close()

        # Store the home-page numbers and the profile details together.
        if user_info['screen_name'] != '':
            self.store_user_into_db(user_home, user_info, username)

        # Random pause after the request before the caller continues.
        time_sleep = random.randint(12, 23)
        print("after requesting the user info, sleep for %s secs" % str(time_sleep))
        time.sleep(time_sleep)
    except URLError as e:
        if hasattr(e, 'code'):
            self.logger.error("http url error code: %s" % e.code)
        if hasattr(e, 'reason'):
            self.logger.error("http url error reason: %s" % e.reason)

    # Every path hands back the same tuple shape; previously this path fell
    # off the end and returned None.
    return is_banned, user_id
def get_user_info(self, headers, user_url):
    """Fetch one weibo.cn user's pages and store the profile if it is new.

    Args:
        headers: HTTP headers used for both requests; also forwarded to
            ``parse.parse_user_info``.
        user_url: Path of the user's page below ``http://weibo.cn/``.  When
            it contains ``u/`` the first two characters are dropped and the
            remainder is used as the user id; otherwise the value is used
            as the username.

    Returns:
        A tuple ``(is_banned, user_id)``:

        * user already stored: ``(False, id)`` where ``id`` comes from
          ``self.get_userid_by_username``;
        * ``parse.is_pub_page`` is true for either page: ``(True, user_id)``
          with the id known at that point (``0`` if none yet);
        * ``URLError`` while fetching the home page: ``(True, 0)``;
        * otherwise ``(False, user_id)`` with the id taken from
          ``parse.parse_user_home``.  A ``URLError`` on the info page is
          only logged and ends up here as well.
    """
    is_banned = False
    user_id = 0

    # NOTE(review): this is a substring test, not a prefix test, although the
    # slice below assumes the value starts with "u/".
    if "u/" in user_url:
        # "u/<id>" form: what follows the prefix is the user id.
        user_url = user_url[2:]
        user_id = user_url
    # For the "u/" form the username equals the id; otherwise it is the
    # name given in the url.
    username = user_url

    # Skip users that are already in the database.
    if user_id == 0:
        is_stored = self.is_stored_username(username)
    else:
        is_stored = self.is_stored_user(user_id)
    if is_stored:
        user_id = self.get_userid_by_username(username)
        print("%s has been stored already" % user_id)
        return is_banned, user_id

    # New user.  The home page yields user_home, from which the user id is
    # read (the original comments say it also carries the following,
    # follower and status counts).
    to_visit_url = "http://weibo.cn/" + str(user_url)
    req = urllib2.Request(url=to_visit_url, headers=headers)
    try:
        response = self.opener.open(req)
        try:
            html = response.read()
            if parse.is_pub_page(html):
                is_banned = True
                return is_banned, user_id
            user_home = parse.parse_user_home(html)
            print(user_home)
            user_id = user_home["user_id"]
        finally:
            # Runs on the early return above and on errors too, so the
            # response is never left open.
            response.close()
    except URLError as e:
        user_id = 0
        is_banned = True
        if hasattr(e, "code"):
            self.logger.error("http url error code: %s" % e.code)
        if hasattr(e, "reason"):
            self.logger.error("http url error reason: %s" % e.reason)
        return is_banned, user_id

    # The info page yields the profile details.
    to_visit_url = "http://weibo.cn/" + str(user_id) + "/info"
    req = urllib2.Request(url=to_visit_url, headers=headers)
    try:
        response = self.opener.open(req)
        try:
            html_str = response.read()
            if parse.is_pub_page(html_str):
                is_banned = True
                return is_banned, user_id
            user_info = parse.parse_user_info(
                str(html_str), user_id, headers, self.opener, self.logger
            )
        finally:
            response.close()

        # Store the home-page numbers and the profile details together.
        if user_info["screen_name"] != "":
            self.store_user_into_db(user_home, user_info, username)

        # Random pause after the request before the caller continues.
        time_sleep = random.randint(12, 23)
        print("after requesting the user info, sleep for %s secs" % str(time_sleep))
        time.sleep(time_sleep)
    except URLError as e:
        if hasattr(e, "code"):
            self.logger.error("http url error code: %s" % e.code)
        if hasattr(e, "reason"):
            self.logger.error("http url error reason: %s" % e.reason)

    # Every path hands back the same tuple shape; previously this path fell
    # off the end and returned None.
    return is_banned, user_id