Exemple #1
0
 def start_testing(self):
     """Log in, then fetch one hard-coded profile page in an endless loop.

     Intended only for testing the login part.  After ``login_weibo()`` a
     Cookie header is assembled from the two cookie names configured under
     ``[crawler] user_token`` and ``[crawler] user_id`` and the page
     ``http://weibo.cn/1644798402/info`` is requested and parsed over and
     over.  The loop has no exit condition.

     On any ``URLError`` the error is logged, the most recent page body that
     was read successfully (if there is one) is written to ``test.html``,
     and the loop sleeps for 12 minutes before the next attempt.
     """
     self.login_weibo()
     token_key = self.config.get('crawler', 'user_token')
     uid_key = self.config.get('crawler', 'user_id')
     cookie_str = ';'.join(
         key + '=' + self.cookie_dict[key] for key in (token_key, uid_key))
     # Two spaces on purpose: matches what the former
     # ``print 'start visiting!: ', cookie_str`` statement emitted.
     print('start visiting!:  %s' % cookie_str)
     headers = {
         'User-Agent': self.config.get('crawler', 'User-Agent'),
         'Cookie': cookie_str
     }
     to_visit_url = 'http://weibo.cn/1644798402/info'
     req = urllib2.Request(url=to_visit_url, headers=headers)
     # Last body read successfully; stays None until the first read works so
     # the error handler never touches an unbound name.
     last_html = None
     while 1:  # for test only
         try:
             response = self.opener.open(req)
             try:
                 last_html = response.read()
                 # NOTE(review): get_user_info in this file calls
                 # parse.parse_user_info with an extra user_id argument
                 # after the html -- confirm which signature is current.
                 parse.parse_user_info(last_html, headers, self.opener,
                                       self.logger)
             finally:
                 # Close even when read() or the parser raises.
                 response.close()
         except URLError as e:
             # Check the two attributes independently so an error without
             # a ``code`` attribute still gets its reason logged.
             if hasattr(e, 'code'):
                 self.logger.error("http url error code: %s", e.code)
             if hasattr(e, 'reason'):
                 self.logger.error("http url error reason: %s", e.reason)
             if last_html is not None:
                 with open('test.html', 'w') as dump:
                     dump.write(last_html)
             # Back off after every failure; otherwise the loop would retry
             # immediately and without pause.
             sleep_time = 12 * 60
             print('will sleep for %d seconds' % sleep_time)
             time.sleep(sleep_time)
Exemple #2
0
 def start_testing(self):
     """Log in, then fetch one hard-coded profile page in an endless loop.

     Intended only for testing the login part.  After ``login_weibo()`` a
     Cookie header is assembled from the two cookie names configured under
     ``[crawler] user_token`` and ``[crawler] user_id`` and the page
     ``http://weibo.cn/1644798402/info`` is requested and parsed over and
     over.  The loop has no exit condition.

     On any ``URLError`` the error is logged, the most recent page body that
     was read successfully (if there is one) is written to ``test.html``,
     and the loop sleeps for 12 minutes before the next attempt.
     """
     self.login_weibo()
     token_key = self.config.get("crawler", "user_token")
     uid_key = self.config.get("crawler", "user_id")
     cookie_str = ";".join(key + "=" + self.cookie_dict[key] for key in (token_key, uid_key))
     # Two spaces on purpose: matches what the former
     # ``print "start visiting!: ", cookie_str`` statement emitted.
     print("start visiting!:  %s" % cookie_str)
     headers = {"User-Agent": self.config.get("crawler", "User-Agent"), "Cookie": cookie_str}
     to_visit_url = "http://weibo.cn/1644798402/info"
     req = urllib2.Request(url=to_visit_url, headers=headers)
     # Last body read successfully; stays None until the first read works so
     # the error handler never touches an unbound name.
     last_html = None
     while 1:  # for test only
         try:
             response = self.opener.open(req)
             try:
                 last_html = response.read()
                 # NOTE(review): get_user_info in this file calls
                 # parse.parse_user_info with an extra user_id argument
                 # after the html -- confirm which signature is current.
                 parse.parse_user_info(last_html, headers, self.opener, self.logger)
             finally:
                 # Close even when read() or the parser raises.
                 response.close()
         except URLError as e:
             # Check the two attributes independently so an error without
             # a ``code`` attribute still gets its reason logged.
             if hasattr(e, "code"):
                 self.logger.error("http url error code: %s", e.code)
             if hasattr(e, "reason"):
                 self.logger.error("http url error reason: %s", e.reason)
             if last_html is not None:
                 with open("test.html", "w") as dump:
                     dump.write(last_html)
             # Back off after every failure; otherwise the loop would retry
             # immediately and without pause.
             sleep_time = 12 * 60
             print("will sleep for %d seconds" % sleep_time)
             time.sleep(sleep_time)
 def get_user_info(self, headers, user_url):
     """Fetch a user's home and info pages and store them, unless already stored.

     Args:
         headers: HTTP headers used for both ``urllib2.Request`` objects.
         user_url: the part of the profile URL after ``http://weibo.cn/``.
             When it contains ``u/`` the first two characters are dropped
             and the remainder is used as the user id; otherwise it is
             treated as a username.

     Returns:
         A tuple ``(is_banned, user_id)`` on every path:

         * already stored: ``(False, <id from get_userid_by_username>)``
         * either page is recognised by ``parse.is_pub_page``:
           ``(True, user_id)``
         * ``URLError`` while fetching the home page: ``(True, 0)``
         * otherwise (info page fetched, or ``URLError`` while fetching
           it): ``(False, user_id)``
     """
     user_id = 0
     is_banned = False
     if 'u/' in user_url:
         # means that u/***, *** is a number namely user_id
         # NOTE(review): the test is a substring match but the slice assumes
         # 'u/' is a prefix -- confirm callers never pass 'u/' mid-string.
         user_url = user_url[2:]
         user_id = user_url
     # if user_url is the user_id, then username will also be user_id
     # else, the username would be the user_url
     username = user_url

     # Skip users that are already in the database.
     if user_id == 0:  # if still not get the user_id
         is_stored = self.is_stored_username(username)
     else:  # already have the user_id
         is_stored = self.is_stored_user(user_id)
     if is_stored:
         user_id = self.get_userid_by_username(username)
         print('%s has been stored already' % user_id)
         return is_banned, user_id

     # New user: the home page yields user_id plus the following, follower
     # and status counts.
     to_visit_url = 'http://weibo.cn/' + str(user_url)
     req = urllib2.Request(url=to_visit_url, headers=headers)
     try:
         response = self.opener.open(req)
         try:
             html = response.read()
         finally:
             # Close on every path, including the early return below.
             response.close()
         if parse.is_pub_page(html):
             return True, user_id
         user_home = parse.parse_user_home(html)
         print(user_home)
         user_id = user_home['user_id']
     except URLError as e:
         # Check the two attributes independently so an error without a
         # ``code`` attribute still gets its reason logged.
         if hasattr(e, 'code'):
             self.logger.error("http url error code: %s", e.code)
         if hasattr(e, 'reason'):
             self.logger.error("http url error reason: %s", e.reason)
         return True, 0

     # to get the user info
     to_visit_url = 'http://weibo.cn/' + str(user_id) + "/info"
     req = urllib2.Request(url=to_visit_url, headers=headers)
     try:
         response = self.opener.open(req)
         try:
             html_str = response.read()
         finally:
             response.close()
         if parse.is_pub_page(html_str):
             return True, user_id
         user_info = parse.parse_user_info(str(html_str), user_id,
                                           headers, self.opener,
                                           self.logger)
         # store the user_home(u know, those numbers) and user_info into database
         if user_info['screen_name'] != '':
             self.store_user_into_db(user_home, user_info, username)
         time_sleep = random.randint(12, 23)
         print("after requesting the user info, sleep for %s secs" % str(
             time_sleep))
         time.sleep(time_sleep)
     except URLError as e:
         if hasattr(e, 'code'):
             self.logger.error("http url error code: %s", e.code)
         if hasattr(e, 'reason'):
             self.logger.error("http url error reason: %s", e.reason)
     # Previously these paths fell off the end and returned None, unlike
     # every other exit; return the same tuple shape here too.
     return is_banned, user_id
 def get_user_info(self, headers, user_url):
     """Fetch a user's home and info pages and store them, unless already stored.

     Args:
         headers: HTTP headers used for both ``urllib2.Request`` objects.
         user_url: the part of the profile URL after ``http://weibo.cn/``.
             When it contains ``u/`` the first two characters are dropped
             and the remainder is used as the user id; otherwise it is
             treated as a username.

     Returns:
         A tuple ``(is_banned, user_id)`` on every path:

         * already stored: ``(False, <id from get_userid_by_username>)``
         * either page is recognised by ``parse.is_pub_page``:
           ``(True, user_id)``
         * ``URLError`` while fetching the home page: ``(True, 0)``
         * otherwise (info page fetched, or ``URLError`` while fetching
           it): ``(False, user_id)``
     """
     user_id = 0
     is_banned = False
     if "u/" in user_url:
         # means that u/***, *** is a number namely user_id
         # NOTE(review): the test is a substring match but the slice assumes
         # "u/" is a prefix -- confirm callers never pass "u/" mid-string.
         user_url = user_url[2:]
         user_id = user_url
     # if user_url is the user_id, then username will also be user_id
     # else, the username would be the user_url
     username = user_url

     # Skip users that are already in the database.
     if user_id == 0:  # if still not get the user_id
         is_stored = self.is_stored_username(username)
     else:  # already have the user_id
         is_stored = self.is_stored_user(user_id)
     if is_stored:
         user_id = self.get_userid_by_username(username)
         print("%s has been stored already" % user_id)
         return is_banned, user_id

     # New user: the home page yields user_id plus the following, follower
     # and status counts.
     to_visit_url = "http://weibo.cn/" + str(user_url)
     req = urllib2.Request(url=to_visit_url, headers=headers)
     try:
         response = self.opener.open(req)
         try:
             html = response.read()
         finally:
             # Close on every path, including the early return below.
             response.close()
         if parse.is_pub_page(html):
             return True, user_id
         user_home = parse.parse_user_home(html)
         print(user_home)
         user_id = user_home["user_id"]
     except URLError as e:
         # Check the two attributes independently so an error without a
         # ``code`` attribute still gets its reason logged.
         if hasattr(e, "code"):
             self.logger.error("http url error code: %s", e.code)
         if hasattr(e, "reason"):
             self.logger.error("http url error reason: %s", e.reason)
         return True, 0

     # to get the user info
     to_visit_url = "http://weibo.cn/" + str(user_id) + "/info"
     req = urllib2.Request(url=to_visit_url, headers=headers)
     try:
         response = self.opener.open(req)
         try:
             html_str = response.read()
         finally:
             response.close()
         if parse.is_pub_page(html_str):
             return True, user_id
         user_info = parse.parse_user_info(str(html_str), user_id, headers, self.opener, self.logger)
         # store the user_home(u know, those numbers) and user_info into database
         if user_info["screen_name"] != "":
             self.store_user_into_db(user_home, user_info, username)
         time_sleep = random.randint(12, 23)
         print("after requesting the user info, sleep for %s secs" % str(time_sleep))
         time.sleep(time_sleep)
     except URLError as e:
         if hasattr(e, "code"):
             self.logger.error("http url error code: %s", e.code)
         if hasattr(e, "reason"):
             self.logger.error("http url error reason: %s", e.reason)
     # Previously these paths fell off the end and returned None, unlike
     # every other exit; return the same tuple shape here too.
     return is_banned, user_id