def store_userinfo_to_db(self, uid_or_nickname, user_info):
    """Persist a crawled user profile (and, if present, its alias) to MongoDB.

    :param uid_or_nickname: identifier the crawl was started with (uid or nickname).
    :param user_info: parsed profile object, or an error string when the
        crawl produced nothing.
    """
    # A plain string signals "nothing crawled" -- log it and bail out.
    if isinstance(user_info, str):
        WeiboSearchLog().get_scheduler_logger().info(self.name + " nothing ! :" + user_info)
        return
    unique_user_info = UserInfo_store(
        uid_or_uname=user_info.uid_or_uname,
        nickname=user_info.nickname,
        is_persion=user_info.is_persion,
        check_or_not=user_info.check_or_not,
        fensi=user_info.fensi,
        sex=user_info.sex,
        location=user_info.location,
        check_info=user_info.check_info,
        weibo_all_nums=user_info.weibo_all_nums,
        guan_zhu_nums=user_info.guan_zhu_nums)
    # Record an alias only when the crawl key differs from the canonical uid.
    bie_ming = None
    if unique_user_info['uid_or_uname'] != uid_or_nickname:
        bie_ming = Bie_Ming_store(
            uid_or_uname=unique_user_info['uid_or_uname'],
            bie_ming=uid_or_nickname)
    sign = 0
    try:
        unique_user_info.save()
    except NotUniqueError:
        sign = 1
        WeiboSearchLog().get_scheduler_logger().info(
            self.name + " insert to database, not unique ! " +
            unique_user_info['uid_or_uname'] + " crawl: " + uid_or_nickname)
    except Exception:
        # Narrowed from a bare except: still best-effort, but no longer
        # swallows SystemExit/KeyboardInterrupt.
        sign = 2
        WeiboSearchLog().get_scheduler_logger().info(
            self.name + " insert to database, something wrong !")
    if sign == 0:
        WeiboSearchLog().get_scheduler_logger().info(
            self.name + " insert to database, success success success success!")
    # BUG FIX: the original called bie_ming.save() even when no alias was
    # created above, raising UnboundLocalError for exact-uid crawls.
    if bie_ming is not None:
        try:
            bie_ming.save()
        except NotUniqueError:
            WeiboSearchLog().get_scheduler_logger().info(
                self.name + " bieming already in database" +
                unique_user_info['uid_or_uname'] + " crawl: " + uid_or_nickname)
            return
        except Exception:
            WeiboSearchLog().get_scheduler_logger().info(
                self.name + " bieming insert to database, something wrong !")
            return
def init_url_queue(self): global UserInfo_store for uid_or_nickname in self.uid_or_uname_list: if len(UserInfo_store.objects(Q(uid_or_uname=str(uid_or_nickname)) | Q(nickname=str(uid_or_nickname)))) != 0 or\ len(Bie_Ming_store.objects(Q(uid_or_uname=str(uid_or_nickname)) | Q(bie_ming=str(uid_or_nickname)))) != 0: continue self.url_queue.put(uid_or_nickname) print "crawl size ::::::::: ", self.url_queue.qsize() pass
def store_userinfo_to_db(self, uid_or_nickname, user_info):
    """Persist a crawled user profile (and, if present, its alias) to MongoDB.

    :param uid_or_nickname: identifier the crawl was started with (uid or nickname).
    :param user_info: parsed profile object, or an error string when the
        crawl produced nothing.
    """
    # A plain string signals "nothing crawled" -- log it and bail out.
    if isinstance(user_info, str):
        WeiboSearchLog().get_scheduler_logger().info(self.name + " nothing ! :" + user_info)
        return
    unique_user_info = UserInfo_store(
        uid_or_uname=user_info.uid_or_uname,
        nickname=user_info.nickname,
        is_persion=user_info.is_persion,
        check_or_not=user_info.check_or_not,
        fensi=user_info.fensi,
        sex=user_info.sex,
        location=user_info.location,
        check_info=user_info.check_info,
        weibo_all_nums=user_info.weibo_all_nums,
        guan_zhu_nums=user_info.guan_zhu_nums)
    # Record an alias only when the crawl key differs from the canonical uid.
    bie_ming = None
    if unique_user_info['uid_or_uname'] != uid_or_nickname:
        bie_ming = Bie_Ming_store(
            uid_or_uname=unique_user_info['uid_or_uname'],
            bie_ming=uid_or_nickname)
    sign = 0
    try:
        unique_user_info.save()
    except NotUniqueError:
        sign = 1
        WeiboSearchLog().get_scheduler_logger().info(
            self.name + " insert to database, not unique ! " +
            unique_user_info['uid_or_uname'] + " crawl: " + uid_or_nickname)
    except Exception:
        # Narrowed from a bare except: still best-effort, but no longer
        # swallows SystemExit/KeyboardInterrupt.
        sign = 2
        WeiboSearchLog().get_scheduler_logger().info(
            self.name + " insert to database, something wrong !")
    if sign == 0:
        WeiboSearchLog().get_scheduler_logger().info(
            self.name + " insert to database, success success success success!")
    # BUG FIX: the original called bie_ming.save() even when no alias was
    # created above, raising UnboundLocalError for exact-uid crawls.
    if bie_ming is not None:
        try:
            bie_ming.save()
        except NotUniqueError:
            WeiboSearchLog().get_scheduler_logger().info(
                self.name + " bieming already in database" +
                unique_user_info['uid_or_uname'] + " crawl: " + uid_or_nickname)
            return
        except Exception:
            WeiboSearchLog().get_scheduler_logger().info(
                self.name + " bieming insert to database, something wrong !")
            return
def crawl(self, uid_or_nickname, is_again=False): # $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ url = '' if len(UserInfo_store.objects(Q(uid_or_uname=str(uid_or_nickname)) | Q(nickname=str(uid_or_nickname)))) != 0 or\ len(Bie_Ming_store.objects(Q(uid_or_uname=str(uid_or_nickname)) | Q(bie_ming=str(uid_or_nickname)))) != 0: WeiboSearchLog().get_scheduler_logger().info("already in the database : " + uid_or_nickname) return "nothing" quote_uid_or_nickname = "" try: quote_uid_or_nickname = quote_plus(str(uid_or_nickname.strip())) except: print traceback.format_exc() print uid_or_nickname # url = "http://weibo.cn/" + uid_or_nickname + "?f=search_0" if quote_uid_or_nickname == uid_or_nickname: url = "http://weibo.cn/" + uid_or_nickname + "?f=search_0" else: url = "http://weibo.cn/n/" + quote_uid_or_nickname # $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ loginer = Loginer() cookie = loginer.get_cookie() proxy = loginer.get_proxy() craw_object = Crawler_with_proxy(url, cookie, proxy) WeiboSearchLog().get_scheduler_logger().info(self.name + " start to crawl ! " + url) user_info = "" try: page = craw_object.get_page() user_info = page_parser_from_search_for_UserInfo(page, url) except: if is_again: return self.crawl(url, is_again=False) else: return user_info return user_info
# -*- coding: utf-8 -*- ''' Created on 2016年5月2日 @author: nlp ''' from store_model import UserInfo_store if __name__ == '__main__': print len( UserInfo_store.objects( uid_or_uname = str( "2080114694" ) ) ) pass
# -*- coding: utf-8 -*- ''' Created on 2016年5月2日 @author: nlp ''' from store_model import UserInfo_store if __name__ == '__main__': print len(UserInfo_store.objects(uid_or_uname=str("2080114694"))) pass