def get_user_info():
    """Get UserInfo for currently logged in user.

    This will insert the new user if it does not already exist in datastore.

    Returns:
        UserInfo record for user if user is logged in, else None.
    """
    user = users.get_current_user()
    if user is None:
        return None
    auth_email = user.email()
    effective_email = auth_email
    if auth_email == '*****@*****.**':
        effective_email = '*****@*****.**'
    if auth_email == '*****@*****.**':
        effective_email = '*****@*****.**'
    if auth_email == effective_email:
        ui = UserInfo.get_or_insert(key_name='user:%s' % auth_email)
    else:
        ui = UserInfo.get_by_key_name('user:%s' % effective_email)
        if not ui:
            logging.error("User %s failed to act as %s; %s doesn't exist",
                          auth_email, effective_email, effective_email)
            return None
        logging.info("User %s acting as %s", auth_email, effective_email)
        ui.non_owner = True
        ui.real_email = auth_email
    return ui
def bassCreateUser(self, username, userId):
    user = UserInfo.UserInfo()
    user.name = username
    user.id = userId
    user = user.__dict__
    # Print the attribute dict
    print(user)
    # Convert the dict to JSON
    userJson = json.dumps(user)
    userDao = UserDao.UserDao()
    userDict = userDao.baasCreateUser(userJson)
    userInfo = UserInfo.UserInfo()
    userInfo.userId = userDict["id"]
    userInfo.publicKey = userDict['basePublicKey']
    userInfo.privateKey = userDict['basePrivateKey']
    userInfo.address = userDict['baseAccountAddress']
    return userInfo
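Several snippets here (bassCreateUser, findUserInfoByAddress, getBossAccount, findUserInfo, settleTest) treat UserInfo.UserInfo as a plain attribute container. A minimal sketch of what that class could look like; only the attribute names are taken from the calling code, everything else is an assumption:

# UserInfo.py -- hypothetical sketch of the plain data holder the DAO
# snippets assume; attribute names mirror the callers above and below.
class UserInfo(object):
    def __init__(self):
        self.id = None
        self.userId = None
        self.name = None
        self.address = None
        self.publicKey = None
        self.privateKey = None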
def lookup_and_authenticate_user(handler, claimed_email, claimed_password):
    if not claimed_email:
        return None
    claimed_user = UserInfo.get_by_key_name("user:%s" % claimed_email)
    if not claimed_user:
        return None
    if claimed_email == "*****@*****.**" and \
            handler.request.headers["Host"] == "localhost:8080":
        # No auth for testing.
        return claimed_user
    if claimed_user.upload_password and \
            claimed_user.upload_password == claimed_password:
        return claimed_user
    return None
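A minimal sketch of how a request handler might call lookup_and_authenticate_user; the handler class and the user_email/password parameter names are assumptions (the parameter names mirror the upload-URL handler later in this section):

class ExampleHandler(webapp2.RequestHandler):  # hypothetical handler
    def get(self):
        user = lookup_and_authenticate_user(
            self, self.request.get('user_email'), self.request.get('password'))
        if user is None:
            self.error(403)
            return
        self.response.out.write('authenticated as %s' % user.key().name())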
def get_user_info():
    """Get UserInfo for currently logged in user.

    This will insert the new user if it does not already exist in datastore.

    Returns:
        UserInfo record for user if user is logged in, else None.
    """
    user = users.get_current_user()
    if user is None:
        return None
    else:
        return UserInfo.get_or_insert(key_name='user:%s' % user.email())
def get(self):
    effective_user = None
    claimed_email = self.request.get('user_email')
    if claimed_email:
        claimed_user = UserInfo.get_by_key_name('user:%s' % claimed_email)
        if claimed_user and \
                claimed_user.upload_password and \
                claimed_user.upload_password == self.request.get('password'):
            effective_user = claimed_user
    if effective_user:
        self.response.headers['Content-Type'] = 'text/plain'
        upload_url = blobstore.create_upload_url('/admin/store')
        self.response.out.write(upload_url)
    else:
        self.error(403)
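A sketch of how an upload client could drive this handler. The endpoint path, the email, the password, and the use of the requests library are assumptions; only the user_email/password query parameters come from the handler above:

import requests  # assumed client-side dependency

# Hypothetical endpoint path; placeholder credentials.
resp = requests.get('http://localhost:8080/get_upload_url',
                    params={'user_email': 'me@example.com',
                            'password': 'secret'})
resp.raise_for_status()
upload_url = resp.text

# POST the file to the one-shot blobstore upload URL.
with open('scan.pdf', 'rb') as f:
    requests.post(upload_url, files={'file': f})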
def store_media():
    """Store media object info in datastore.

    Also updates the user-info record to keep count of media objects.

    This function is run as a transaction.
    """
    user_info = UserInfo.get_by_key_name("user:%s" % user_email)
    if user_info is None:
        error_messages.append("User record has been deleted. "
                              "Try uploading again")
        return
    media = MediaObject(
        parent=user_info,
        owner=user_info,
        blob=blob_info.key(),
        creation=blob_info.creation,
        content_type=blob_info.content_type,
        filename=blob_info.filename,
        size=int(blob_info.size),
        lacks_document=True,
    )
    user_info.media_objects += 1
    db.put(user_info)
    db.put(media)
    if bool(is_doc) and is_doc != "0":
        tag_list = []
        if tags is not None:
            tag_list = [x for x in re.split(r"\s*,\s*", tags) if x]
        doc = Document(
            parent=user_info,
            owner=user_info,
            pages=[media.key()],
            title=title,
            description=description,
            no_tags=(len(tag_list) == 0),
            tags=tag_list,
        )
        db.put(doc)
        media.document = doc.key()
        media.lacks_document = False
        db.put(media)
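The docstring says this closure runs as a transaction, so the enclosing upload handler presumably invokes it via db.run_in_transaction so the media insert and the counter update commit atomically. A minimal sketch of that call site under those assumptions:

from google.appengine.ext import db
import logging

# Hypothetical call site inside the upload handler; user_email, blob_info,
# is_doc, tags, title, description and error_messages are closed over above.
db.run_in_transaction(store_media)
if error_messages:
    logging.warning('upload failed: %s', '; '.join(error_messages))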
def get(self):
    self.response.headers['Cache-Control'] = "private"
    self.response.headers['Content-Type'] = "text/plain; charset=utf-8"
    user = UserInfo.get_by_key_name('user:[email protected]')
    docs = Document.all().filter('owner', user)
    docs = docs.fetch(10000)
    self.response.out.write("# got %d docs\n" % len(docs))
    for doc in docs:
        self.response.out.write("%s tags[%s] date[%s] title[%s] \n" %
                                (doc.display_url, doc.tag_comma_separated,
                                 doc.date_yyyy_mm_dd,
                                 doc.title_or_empty_string))
        for page in doc.pages:
            self.response.out.write("  has_page: %d\n" % (page.id_or_name()))
    meds = MediaObject.all().filter('owner', user)
    meds = meds.fetch(10000)
    self.response.out.write("# got %d mediaobjects\n" % len(meds))
    for mo in meds:
        self.response.out.write("%s creation[%s] size[%d]\n" %
                                (mo.url_path, str(mo.creation), mo.size))
def findUserInfoByAddress(self, address):
    sql = """
        select id, name, address, publicKey, privateKey
        from user a
        left join user_info b on a.id = b.userId
        where address = %s
    """
    param = [address]
    try:
        baseDao = BaseDao.BaseDao()
        result = baseDao.execteGetOneSql(sql, param)
        user = UserInfo.UserInfo()
        user.id = result[0]
        user.name = result[1]
        user.address = result[2]
        user.publicKey = result[3]
        user.privateKey = result[4]
        return user
    except Exception as e:
        print(e)
def user_info():
    """Save the user's profile information to the database.

    User sign-up page; the user must log in successfully before
    getting to this page.
    """
    if not is_loggedin():
        return redirect(url_for('login'))
    form = UserInfoForm()
    if request.method == 'POST':
        userInfo = UserInfo(form.nickName.data, form.email.data,
                            form.phone.data, form.city.data, form.state.data,
                            form.zipcode.data, form.education.data,
                            form.sports.data, form.arts.data,
                            form.travel.data, form.music.data,
                            form.reading.data, form.gardening.data,
                            form.nature.data, form.snowboard.data,
                            form.food.data)
        # Add the submitted data to the user table in the database
        db.session.add(userInfo)
        db.session.commit()
        return redirect(url_for('home'))
    elif request.method == 'GET':
        return render_template('user_info.html', form=form)
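This route assumes a UserInfoForm with one field per constructor argument. A hypothetical Flask-WTF sketch; the field names come from the route, but the field types and labels are assumptions:

from flask_wtf import FlaskForm  # assumed Flask-WTF dependency
from wtforms import StringField, BooleanField

class UserInfoForm(FlaskForm):
    """Hypothetical form matching the fields read in user_info()."""
    nickName = StringField('Nickname')
    email = StringField('Email')
    phone = StringField('Phone')
    city = StringField('City')
    state = StringField('State')
    zipcode = StringField('Zip code')
    education = StringField('Education')
    # Interest flags; the route only reads .data, so booleans are a guess.
    sports = BooleanField('Sports')
    arts = BooleanField('Arts')
    travel = BooleanField('Travel')
    music = BooleanField('Music')
    reading = BooleanField('Reading')
    gardening = BooleanField('Gardening')
    nature = BooleanField('Nature')
    snowboard = BooleanField('Snowboard')
    food = BooleanField('Food')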
def getBossAccount(self, customerName):
    sql = """
        select id, name, address, publicKey, privateKey
        from user a
        left join user_info b on a.id = b.userId
        where customerName = %s and b.isBoss = 0
    """
    param = [customerName]
    try:
        baseDao = BaseDao.BaseDao()
        result = baseDao.execteGetOneSql(sql, param)
        user = UserInfo.UserInfo()
        user.id = result[0]
        user.name = result[1]
        user.address = result[2]
        user.publicKey = result[3]
        user.privateKey = result[4]
        return user
    except Exception as e:
        print(e)
async def join(users: UserI):
    # Unpack (field, value) pairs from the request model by position.
    userlist = list(users)
    user_id = userlist[1][1]
    user_pw = userlist[2][1]
    user_name = userlist[3][1]
    user_age = userlist[4][1]
    user = UserInfo()
    user.user_id = user_id
    user.user_pw = user_pw
    user.user_name = user_name
    user.user_age = user_age
    session.add(user)
    session.commit()
    return "Sign-up complete!"
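The endpoint assumes an SQLAlchemy-style UserInfo model and a module-level session. A hypothetical sketch of that model; the column names come from the endpoint, while the table name and column types are assumptions:

from sqlalchemy import Column, Integer, String
from sqlalchemy.orm import declarative_base

Base = declarative_base()

class UserInfo(Base):
    """Hypothetical ORM model matching the attributes set in join()."""
    __tablename__ = 'user_info'          # assumed table name
    id = Column(Integer, primary_key=True)  # assumed surrogate key
    user_id = Column(String(64))
    user_pw = Column(String(128))
    user_name = Column(String(64))
    user_age = Column(Integer)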
def findUserInfo(self, customerName):
    sql = """
        select id, name, address, publicKey, privateKey
        from user a
        left join user_info b on a.id = b.userId
        where customerName = %s and b.isBoss = 1
    """
    array = []
    param = [customerName]
    try:
        baseDao = BaseDao.BaseDao()  # was missing; mirrors the other DAO helpers
        result = baseDao.executeGetAllSql(sql, param)
        for row in result:
            user = UserInfo.UserInfo()
            user.id = row[0]
            user.name = row[1]
            user.address = row[2]
            user.publicKey = row[3]
            user.privateKey = row[4]
            array.append(user)
        return array
    except Exception as e:
        print(e)
def settleTest(address, amount):
    userController = NewUserController.NewUserController()
    user_info_list = []  # renamed from `list` to avoid shadowing the builtin
    settleObject = NewSettleObject.NewSettleObject()
    settleObject.amount = amount
    settleObject.ownerAccount = address
    userInfo = userController.findUserInfoByAddress(settleObject.ownerAccount)
    user_info_list.append(userInfo)
    settleObject.userPrivateKey = userInfo.privateKey
    userAssetArray = userController.findAssetId(userInfo)
    # Join the user's asset addresses into a comma-separated string.
    settleObject.srcAsset = ",".join(a.assetAddress for a in userAssetArray)
    userController.settle(settleObject)
def get(self):
    self.response.headers['Content-Type'] = 'text/html'
    user = users.get_current_user()
    current_folder = self.request.get('current_folder')
    folder = None
    if user:
        user_key = ndb.Key('UserInfo', user.email())
        user_info = user_key.get()
        if user_info is None:
            user_info = UserInfo(id=user.email())
            user_info.email = user.email()
            folder_info = FolderInfo(id=user.email() + "/")
            folder_info.name = "/"
            user_info.folder = folder_info.key
            folder_info.put()
            user_info.put()
            time.sleep(1)
        if len(current_folder) == 0:
            folder = user_info.folder.get()
        else:
            folder_key = ndb.Key('FolderInfo', user.email() + current_folder)
            folder = folder_key.get()
        url = users.create_logout_url(self.request.uri)
        url_string = 'logout'
    else:
        url = users.create_login_url(self.request.uri)
        url_string = 'login'
    template_values = {
        'url': url,
        'url_string': url_string,
        'user': user,
        'upload_url': blobstore.create_upload_url('/upload'),
        'message': self.request.get('message'),
        'current_folder': folder,
    }
    template = JINJA_ENVIRONMENT.get_template('main.html')
    self.response.write(template.render(template_values))
    MainPage.obj = self
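This handler assumes ndb models keyed by email (UserInfo) and by email-plus-path (FolderInfo). A hypothetical sketch of those models; only the property names come from the handler, the types are assumptions:

from google.appengine.ext import ndb

class FolderInfo(ndb.Model):
    """Hypothetical folder record; keyed by '<email><path>'."""
    name = ndb.StringProperty()

class UserInfo(ndb.Model):
    """Hypothetical user record; keyed by the user's email."""
    email = ndb.StringProperty()
    folder = ndb.KeyProperty(kind=FolderInfo)  # root folder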
def get(self):
    #=======================================================================
    # ** Create dict, then compare to session. If different, rewrite session
    # and UserInfo. If not, return session! Saves time if button was hit
    # without updating!
    #=======================================================================
    fs_id = self.request.get('fs_id')
    access_token = self.request.get('access_token')
    if self.request.get('reset'):
        # Reset to defaults... (possibly store these in UserInfo)
        self.store_user(access_token, reset=True)
    logging.info('*** fs_id = {} ***'.format(fs_id))
    homes = self.request.get_all('homes')
    charities = self.request.get_all('charities')
    latlon = self.request.get('latlon')
    home_prefs = []
    char_prefs = []
    robot_pref = self.request.get('robot_pref')
    user = UserInfo().all().filter('fs_id =', fs_id).get()
    #logging.info('*** user = {} ***'.format(user))
    prefs = json.loads(user.prefs)
    prefs['latlon'] = latlon  # for debugging; shouldn't need to change this
    if homes:
        for home in homes:
            h_tup = home.split('|||')
            home_prefs.append(h_tup)
        prefs['homes'] = home_prefs
    if charities:
        for charity in charities:
            c_tup = charity.split('|||')
            char_prefs.append(c_tup)
        prefs['charities'] = char_prefs
    if robot_pref:
        friends = self.request.get('friends')
        if robot_pref == 'robot':
            # Here we add a function to friend the robot if not already friends!
            if friends == 'no':
                # friend_the_robot returns True or False depending on
                # whether the friending worked!
                friend = utils.friend_the_robot(access_token, fs_id)
                if friend:
                    prefs['friends_with_ond'] = 'yes'
                    prefs['robot_posts'] = True
                else:
                    prefs['robot_posts'] = False
    udic = self.session.get('user')
    udic['prefs'] = prefs
    # Store 3 ways!
    self.session['user'] = udic
    memcache.set('user_' + fs_id, udic)
    pref_dump = json.dumps(prefs)
    user.prefs = pref_dump
    user.put()
    self.write(pref_dump)
def store_user(self, access_token, reset=False):
    # we might need something that
    #=======================================================================
    # Function for storing user on first oauth. Oauth will check to see if
    # user is in db, and if not, store them. This should happen ONLY once
    # so as not to overwrite set prefs! We'll use Memcache AND Session to
    # store! Update will update access token in all params! (??)
    #=======================================================================
    curr_client = utils.makeFoursquareClient(access_token)
    current_user = curr_client.users()
    if current_user:
        # Get user info; store fs_id, token and prefs!
        profile = current_user['user']
        fs_id = profile["id"]
        existing_user_info = UserInfo.get_by_fs_id(fs_id)
        if existing_user_info and not reset:
            # User in DB; we'll just update his/her access token!
            logging.info('*** There was an existing user with fs_id = {} ***'.format(fs_id))
            user = existing_user_info
            user.token = access_token
            prefs = json.loads(user.prefs)
            if not prefs.get('name'):
                prefs['name'] = profile['firstName']
                prefs['gender'] = profile['gender']
            if not prefs.get('latlon'):
                prefs['latlon'] = utils.get_latlon(current_user)
            user.prefs = json.dumps(prefs)
        elif existing_user_info and reset:
            # User in db, but we want to reset to default prefs
            user = existing_user_info
            prefs = utils.make_default_prefs(curr_client, current_user)
            user.transactions = json.dumps([])
            user.prefs = json.dumps(prefs)
            user.token = access_token
        else:
            logging.info('*** Creating a new user for fs_id = {} ***'.format(fs_id))
            user = UserInfo(fs_id=fs_id, token=access_token)
            # Store default prefs in user that can be reset later!
            prefs = utils.make_default_prefs(curr_client, current_user)
            user.transactions = json.dumps([])
            user.prefs = json.dumps(prefs)
        user.put()  # make new user or update token of existing user!
        logging.info('****Added or updated User {} to DataBase!****'.format(user.fs_id))
        # Now store in Memcache and Session for later retrieval!
        udic = dict(fs_id=user.fs_id, access_token=user.token,
                    gender=profile['gender'], prefs=prefs)
        self.session["user"] = udic
        memcache.set('user_' + user.fs_id, udic)
    return self.session.get("user")
def get(self):
    # To be rendered from UserInfo.
    # Possibly add param of rank to tuples? To sort by most used?
    sets = ''
    # homenow = ''
    # setprefs = ''
    check_session = ''
    check_store_user = ''
    reset_user = ''
    logout_user = ''
    trivtest = ' '
    transtest = ''
    # This tests the set-info page!
    if sets:
        user = self.session.get('user')
        content = {'its_a_bar': True}
        prefs = user['prefs']
        content.update(prefs)
        self.render('set-info.html', **content)
    # Test trivia game page!
    elif trivtest:
        content = DEF_CONTENT
        # This happens when user checks in to home!
        transaction = utils.create_transaction(content)
        self.update_transaction(transaction, activate=True)
        self.store_curr_transaction(transaction, db=True)
        ## *** Will need to do this on before home-now post! ***
        content['trivia_url'] = transaction['trivia_url']
        content['trans_id'] = transaction['trans_id']
        self.render('home-now.html', **content)
    # Test transaction functions on real database!
    elif transtest:
        content = DEF_CONTENT
        user = self.fetchUserInfo(content['fs_id'])
        transaction = json.loads(user.transaction)
    # Logs out user so they can go through process again!
    elif logout_user:
        self.logout()
        self.write('You are logged out!<br><br><br>')
        self.write('Why not go <a href="/">HERE</a> now?')
    # Tests if store_user worked!
    elif check_store_user:
        access_token = '0T0ETAYAS3ZET51DJW01U1LFBSVMLF0BCJ3ONWINO3YVEWRX'
        udic = self.store_user(access_token)
        self.write('udic = <br><br>')
        self.write(udic)
        self.write('<br><br><br>')
        self.write('session_user = <br><br>')
        self.write(self.session.get('user'))
        self.write('<br><br><br>')
        self.write('memcached_user = <br><br>')
        self.write(memcache.get('user_' + udic['fs_id']))
    # Resets a user's prefs
    elif reset_user:
        access_token = '0T0ETAYAS3ZET51DJW01U1LFBSVMLF0BCJ3ONWINO3YVEWRX'
        udic = self.store_user(access_token, reset=True)
        self.write('user reset!<br>')
        self.write('udic = <br><br>')
        self.write(udic)
        self.write('<br><br><br>')
        self.write('session_user = <br><br>')
        self.write(self.session.get('user'))
        self.write('<br><br><br>')
        self.write('memcached_user = <br><br>')
        self.write(memcache.get('user_' + udic['fs_id']))
    # Compares session to user!
    elif check_session:
        # See what's in my session cookie!
        user = self.current_user
        user_info = UserInfo.all().filter('fs_id = ', '4091108').get()
        self.write('user = <br><br>')
        self.write(user)
        self.write('<br><br><br>')
        self.write('user_info.prefs = <br><br>')
        self.write(user_info.prefs)
    # elif homenow:
    #     content = {"human_time": "4:55:50 PM",
    #                "human_wager": "$420",
    #                "charity_id": "23-90876",
    #                "pronoun": "he",
    #                "then": "1366836950018",
    #                "home_id": "4d60a5e4865a224bdd32ae85",
    #                "charity": "The Creation Museum",
    #                "its_a_bar": True,
    #                "made_it": "y",
    #                "home": "Waterphone Of Dreams (S&T's)",
    #                "name": "Scott",
    #                "now": "1366661436381",
    #                "wager": "420"}
    #
    #     right_now = time.time()
    #
    #     self.render('home-now.html', **content)
    # elif setprefs:
    #     user = UserInfo()
    #     user.fs_id = '4091108'
    #     user.prefs = json.dumps({'homes': [], 'charities': []})
    #     user.token = "0T0ETAYAS3ZET51DJW01U1LFBSVMLF0BCJ3ONWINO3YVEWRX"
    #     user.put()
    #
    #     self.write(user.fs_id)
    else:
        # self.logout()  ## uncomment to debug! (ie set new prefs!)
        user = self.current_user
        #===================================================================
        # Can put in an authed param to speed up the render!
        #===================================================================
        if not user:
            # udic = me! (for testing) Will update handling to create
            # say['authed = false']
            udic = dict(name='Scott',
                        fs_id='4091108',
                        access_token="0T0ETAYAS3ZET51DJW01U1LFBSVMLF0BCJ3ONWINO3YVEWRX",
                        gender='male',
                        prefs=PREFS)
            self.session["user"] = udic
            logging.info("*** Set a User! ***")
            user = udic
        client_id = CONFIG['client_id']
        params = {'client_id': client_id}
        params['auth_url'] = utils.generateFoursquareAuthUri(client_id)
        params['site_name'] = CONFIG['site_name']
        params['description'] = CONFIG['site_description']
        params['fs_id'] = user['fs_id']
        params.update(user['prefs'])
        params['bad_charities'] = BAD_CHARITIES
        # This will be added and set to false if not self.current_user!
        params['authed'] = 'true'
        self.render('index.html', **params)
class Weibo(object):
    def __init__(self, user_id_list, config=None):
        """Initialize the Weibo crawler."""
        if not config:
            config = weibo_config
        config['user_id_list'] = user_id_list
        self.validate_config(config)
        # 0 or 1, default 0: crawl all of a user's weibo; 1: only original weibo
        self.filter = config['filter']
        since_date = config['since_date']
        if isinstance(since_date, int):
            since_date = date.today() - timedelta(since_date)
        since_date = str(since_date)
        # Start date: crawl weibo published from this date until now, yyyy-mm-dd
        self.since_date = since_date
        # Page to start crawling from; useful to resume after being rate-limited
        self.start_page = config.get('start_page', 1)
        # Output formats, a list that may contain csv, json, mongo and mysql
        self.write_mode = config['write_mode']
        # 0 or 1; 1 downloads images of original weibo
        self.original_pic_download = config['original_pic_download']
        # 0 or 1; 1 downloads images of retweeted weibo
        self.retweet_pic_download = config['retweet_pic_download']
        # 0 or 1; 1 downloads videos of original weibo
        self.original_video_download = config['original_video_download']
        # 0 or 1; 1 downloads videos of retweeted weibo
        self.retweet_video_download = config['retweet_video_download']
        # 0 or 1: whether result files live in a folder named after the
        # user's screen name or the user's id
        self.result_dir_name = config.get('result_dir_name', 0)
        cookie = config.get('cookie')  # Weibo cookie, optional
        user_agent = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/86.0.4240.111 Safari/537.36')
        self.headers = {'User-Agent': user_agent, 'Cookie': cookie}
        self.mysql_config = config.get('mysql_config')  # MySQL config, optional
        query_list = config.get('query_list') or []
        if isinstance(query_list, str):
            query_list = query_list.split(',')
        self.query_list = query_list
        if not isinstance(user_id_list, list):
            if not os.path.isabs(user_id_list):
                user_id_list = os.path.split(
                    os.path.realpath(__file__))[0] + os.sep + user_id_list
            self.user_config_file_path = user_id_list  # path of the user config file
            user_config_list = self.get_user_config_list(user_id_list)
        else:
            self.user_config_file_path = ''
            user_config_list = [{
                'user_id': user_id,
                'since_date': self.since_date,
                'query_list': query_list
            } for user_id in user_id_list]
        # user_config list for the weibo users to crawl
        self.user_config_list = user_config_list
        self.user_config = {}  # current user config: user id and since_date
        self.start_date = ''  # date when the user's first weibo was fetched
        self.query = ''
        self.user: UserInfo = UserInfo()  # info of the target weibo user
        self.got_count = 0  # number of weibo fetched
        self.weibo = []  # all fetched weibo
        self.weibo_id_list = []  # ids of all fetched weibo
        self.proxies = requests.get(
            "http://api.hailiangip.com:8422/api/getIp?type=1&num=1&pid=&unbindTime=600&cid=&orderId=O21042810412537647150&time=1619577728&sign=d79086f5b8ba9dbe1a17e5b710b77032&noDuplicate=1&dataType=0&lineSeparator=0&singleIp="
        ).json()['data'][0]
        logger.info({
            "http": f"http://{self.proxies['ip']}:{self.proxies['port']}",
            "https": f"https://{self.proxies['ip']}:{self.proxies['port']}",
        })

    def validate_config(self, config):
        """Validate the config."""
        # Validate filter, original_pic_download, retweet_pic_download,
        # original_video_download and retweet_video_download
        argument_list = [
            'filter', 'original_pic_download', 'retweet_pic_download',
            'original_video_download', 'retweet_video_download'
        ]
        for argument in argument_list:
            if config[argument] != 0 and config[argument] != 1:
                logger.warning(u'%s should be 0 or 1, please re-enter', argument)
                sys.exit()
        # Validate since_date
        since_date = config['since_date']
        if (not self.is_date(str(since_date))) and (not isinstance(
                since_date, int)):
            logger.warning(
                u'since_date should be in yyyy-mm-dd form or an integer, please re-enter')
            sys.exit()
        # Validate query_list
        query_list = config.get('query_list') or []
        if (not isinstance(query_list, list)) and (not isinstance(
                query_list, str)):
            logger.warning(u'query_list should be a list or a string, please re-enter')
            sys.exit()
        # Validate write_mode
        write_mode = ['csv', 'json', 'mongo', 'mysql']
        if not isinstance(config['write_mode'], list):
            sys.exit(u'write_mode should be a list')
        for mode in config['write_mode']:
            if mode not in write_mode:
                logger.warning(
                    u'%s is an invalid mode; pick one or more of csv, json, '
                    u'mongo and mysql as write_mode', mode)
                sys.exit()
        # Validate user_id_list
        user_id_list = config['user_id_list']
        if (not isinstance(user_id_list, list)) and (not user_id_list.endswith('.txt')):
            logger.warning(u'user_id_list should be a list or the path of a txt file')
            sys.exit()
        if not isinstance(user_id_list, list):
            if not os.path.isabs(user_id_list):
                user_id_list = os.path.split(
                    os.path.realpath(__file__))[0] + os.sep + user_id_list
            if not os.path.isfile(user_id_list):
                logger.warning(u'File %s does not exist', user_id_list)
                sys.exit()

    def is_date(self, since_date):
        """Check whether the date format is valid."""
        try:
            datetime.strptime(since_date, '%Y-%m-%d')
            return True
        except ValueError:
            return False

    def get_json(self, params):
        """Fetch JSON data from the web API."""
        url = 'https://m.weibo.cn/api/container/getIndex?'
        r = requests.get(url,
                         params=params,
                         headers=self.headers,
                         proxies={
                             "http": f"http://{self.proxies['ip']}:{self.proxies['port']}",
                             "https": f"https://{self.proxies['ip']}:{self.proxies['port']}",
                         },
                         verify=False)
        return r.json()

    def get_weibo_json(self, page):
        """Fetch the weibo JSON data of one page."""
        params = {
            'container_ext': 'profile_uid:' + str(self.user_config['user_id']),
            'containerid': '100103type=401&q=' + self.query,
            'page_type': 'searchall'
        } if self.query else {
            'containerid': '107603' + str(self.user_config['user_id'])
        }
        params['page'] = page
        js = self.get_json(params)
        return js

    def user_to_database(self):
        """Write user info to the database."""
        hbase.check_create_table("user", {"info": {}})
        hbase.update("user", self.user.user_id, {"info": self.user.dict()})

    def get_user_info(self):
        """Fetch user info."""
        params = {'containerid': '100505' + str(self.user_config['user_id'])}
        js = self.get_json(params)
        if js['ok']:
            info = js['data']['userInfo']
            user_info = OrderedDict()
            user_info['user_id'] = self.user_config['user_id']
            user_info['screen_name'] = info.get('screen_name', '')
            user_info['gender'] = "女" if js['data']['userInfo']['gender'] == "f" else "男"
            params = {
                'containerid': '230283' + str(self.user_config['user_id']) + '_-_INFO'
            }
            # item_name values exactly as they appear in the API response
            zh_list = [
                u'生日', u'所在地', u'小学', u'初中', u'高中', u'大学', u'公司',
                u'注册时间', u'阳光信用'
            ]
            en_list = [
                'birthday', 'location', 'education', 'education', 'education',
                'education', 'company', 'registration_time', 'sunshine'
            ]
            for i in en_list:
                user_info[i] = ''
            js = self.get_json(params)
            if js['ok']:
                cards = js['data']['cards']
                if isinstance(cards, list) and len(cards) > 1:
                    card_list = cards[0]['card_group'] + cards[1]['card_group']
                    for card in card_list:
                        if card.get('item_name') in zh_list:
                            user_info[en_list[zh_list.index(
                                card.get('item_name'))]] = card.get(
                                    'item_content', '')
            user_info['statuses_count'] = info.get('statuses_count', 0)
            user_info['followers_count'] = info.get('followers_count', 0)
            user_info['follow_count'] = info.get('follow_count', 0)
            user_info['description'] = info.get('description', '')
            user_info['profile_url'] = info.get('profile_url', '')
            user_info['profile_image_url'] = info.get('profile_image_url', '')
            user_info['avatar_hd'] = info.get('avatar_hd', '')
            user_info['urank'] = info.get('urank', 0)
            user_info['mbrank'] = info.get('mbrank', 0)
            user_info['verified'] = info.get('verified', False)
            user_info['verified_type'] = info.get('verified_type', -1)
            user_info['verified_reason'] = info.get('verified_reason', '')
            user = self.standardize_info(user_info)
            self.user = UserInfo(**dict(user))
            self.user_to_database()
            return user

    def get_long_weibo(self, id):
        """Fetch a long weibo in full."""
        for i in range(5):
            url = 'https://m.weibo.cn/detail/%s' % id
            html = requests.get(url, headers=self.headers, verify=False).text
            html = html[html.find('"status":'):]
            html = html[:html.rfind('"hotScheme"')]
            html = html[:html.rfind(',')]
            html = '{' + html + '}'
            js = json.loads(html, strict=False)
            weibo_info = js.get('status')
            if weibo_info:
                weibo = self.parse_weibo(weibo_info)
                return weibo
            sleep(random.randint(6, 10))

    def get_pics(self, weibo_info):
        """Get the original image urls of a weibo."""
        if weibo_info.get('pics'):
            pic_info = weibo_info['pics']
            pic_list = [pic['large']['url'] for pic in pic_info]
            pics = ','.join(pic_list)
        else:
            pics = ''
        return pics

    def get_live_photo(self, weibo_info):
        """Get the video urls of live photos."""
        live_photo_list = []
        live_photo = weibo_info.get('pic_video')
        if live_photo:
            prefix = 'https://video.weibo.com/media/play?livephoto=//us.sinaimg.cn/'
            for i in live_photo.split(','):
                if len(i.split(':')) == 2:
                    url = prefix + i.split(':')[1] + '.mov'
                    live_photo_list.append(url)
        return live_photo_list

    def get_video_url(self, weibo_info):
        """Get the video url of a weibo."""
        video_url = ''
        video_url_list = []
        if weibo_info.get('page_info'):
            if ((weibo_info['page_info'].get('urls')
                 or weibo_info['page_info'].get('media_info'))
                    and weibo_info['page_info'].get('type') == 'video'):
                media_info = weibo_info['page_info']['urls']
                if not media_info:
                    media_info = weibo_info['page_info']['media_info']
                video_url = media_info.get('mp4_720p_mp4')
                if not video_url:
                    video_url = media_info.get('mp4_hd_url')
                if not video_url:
                    video_url = media_info.get('hevc_mp4_hd')
                if not video_url:
                    video_url = media_info.get('mp4_sd_url')
                if not video_url:
                    video_url = media_info.get('mp4_ld_mp4')
                if not video_url:
                    video_url = media_info.get('stream_url_hd')
                if not video_url:
                    video_url = media_info.get('stream_url')
        if video_url:
            video_url_list.append(video_url)
        live_photo_list = self.get_live_photo(weibo_info)
        if live_photo_list:
            video_url_list += live_photo_list
        return ';'.join(video_url_list)

    def download_one_file(self, url, file_name, type, weibo_id):
        """Download a single file (image/video)."""
        hbase.update("weibo", str(weibo_id),
                     {"img" if type == "img" else "video": {file_name: url}})

    def handle_download(self, file_type, file_dir, urls, w):
        """Handle the download of one weibo's files."""
        file_prefix = w['created_at'][:11].replace('-', '') + '_' + str(w['id'])
        if file_type == 'img':
            if ',' in urls:
                url_list = urls.split(',')
                for i, url in enumerate(url_list):
                    index = url.rfind('.')
                    if len(url) - index >= 5:
                        file_suffix = '.jpg'
                    else:
                        file_suffix = url[index:]
                    file_name = file_prefix + '_' + str(i + 1) + file_suffix
                    self.download_one_file(url, file_name, file_type, w['id'])
            else:
                index = urls.rfind('.')
                if len(urls) - index > 5:
                    file_suffix = '.jpg'
                else:
                    file_suffix = urls[index:]
                file_name = file_prefix + file_suffix
                self.download_one_file(urls, file_name, file_type, w['id'])
        else:
            file_suffix = '.mp4'
            if ';' in urls:
                url_list = urls.split(';')
                if url_list[0].endswith('.mov'):
                    file_suffix = '.mov'
                for i, url in enumerate(url_list):
                    file_name = file_prefix + '_' + str(i + 1) + file_suffix
                    self.download_one_file(url, file_name, file_type, w['id'])
            else:
                if urls.endswith('.mov'):
                    file_suffix = '.mov'
                file_name = file_prefix + file_suffix
                self.download_one_file(urls, file_name, file_type, w['id'])

    def download_files(self, file_type, weibo_type, wrote_count):
        """Download files (images/videos)."""
        try:
            describe = ''
            if file_type == 'img':
                describe = u'images'
                key = 'pics'
            else:
                describe = u'videos'
                key = 'video_url'
            if weibo_type == 'original':
                describe = u'original weibo ' + describe
            else:
                describe = u'retweeted weibo ' + describe
            logger.info(u'About to download %s', describe)
            file_dir = self.get_filepath(file_type)
            file_dir = file_dir + os.sep + describe
            if not os.path.isdir(file_dir):
                os.makedirs(file_dir)
            for w in self.weibo[wrote_count:]:
                if weibo_type == 'retweet':
                    if w.get('retweet'):
                        w = w['retweet']
                    else:
                        continue
                if w.get(key):
                    self.handle_download(file_type, file_dir, w.get(key), w)
            logger.info(u'%s download finished; saved to:', describe)
            logger.info(file_dir)
        except Exception as e:
            logger.exception(e)

    def get_location(self, selector):
        """Get the location a weibo was posted from."""
        location_icon = 'timeline_card_small_location_default.png'
        span_list = selector.xpath('//span')
        location = ''
        for i, span in enumerate(span_list):
            if span.xpath('img/@src'):
                if location_icon in span.xpath('img/@src')[0]:
                    location = span_list[i + 1].xpath('string(.)')
                    break
        return location

    def get_article_url(self, selector):
        """Get the url of a toutiao article embedded in a weibo."""
        article_url = ''
        text = selector.xpath('string(.)')
        if text.startswith(u'发布了头条文章'):
            url = selector.xpath('//a/@data-url')
            if url and url[0].startswith('http://t.cn'):
                article_url = url[0]
        return article_url

    def get_topics(self, selector):
        """Get the weibo topics a post participates in."""
        span_list = selector.xpath("//span[@class='surl-text']")
        topics = ''
        topic_list = []
        for span in span_list:
            text = span.xpath('string(.)')
            if len(text) > 2 and text[0] == '#' and text[-1] == '#':
                topic_list.append(text[1:-1])
        if topic_list:
            topics = ','.join(topic_list)
        return topics

    def get_at_users(self, selector):
        """Get @-mentioned users."""
        a_list = selector.xpath('//a')
        at_users = ''
        at_list = []
        for a in a_list:
            if '@' + a.xpath('@href')[0][3:] == a.xpath('string(.)'):
                at_list.append(a.xpath('string(.)')[1:])
        if at_list:
            at_users = ','.join(at_list)
        return at_users

    def string_to_int(self, string):
        """Convert a count string (possibly ending in 万) to an integer."""
        if isinstance(string, int):
            return string
        elif string.endswith(u'万+'):
            string = int(string[:-2] + '0000')
        elif string.endswith(u'万'):
            string = int(string[:-1] + '0000')
        return int(string)

    def standardize_date(self, created_at):
        """Standardize the publish time of a weibo."""
        if u'刚刚' in created_at:  # "just now"
            created_at = datetime.now().strftime('%Y-%m-%d')
        elif u'分钟' in created_at:  # "minutes ago"
            minute = created_at[:created_at.find(u'分钟')]
            minute = timedelta(minutes=int(minute))
            created_at = (datetime.now() - minute).strftime('%Y-%m-%d')
        elif u'小时' in created_at:  # "hours ago"
            hour = created_at[:created_at.find(u'小时')]
            hour = timedelta(hours=int(hour))
            created_at = (datetime.now() - hour).strftime('%Y-%m-%d')
        elif u'昨天' in created_at:  # "yesterday"
            day = timedelta(days=1)
            created_at = (datetime.now() - day).strftime('%Y-%m-%d')
        else:
            created_at = created_at.replace('+0800 ', '')
            temp = datetime.strptime(created_at, '%c')
            created_at = datetime.strftime(temp, '%Y-%m-%d')
        return created_at

    def standardize_info(self, weibo):
        """Standardize info and strip garbled characters."""
        # for k, v in weibo.items():
        #     if 'bool' not in str(type(v)) and 'int' not in str(
        #             type(v)) and 'list' not in str(
        #                 type(v)) and 'long' not in str(type(v)):
        #         weibo[k] = v.replace(u'\u200b', '').encode(
        #             sys.stdout.encoding, 'ignore').decode(sys.stdout.encoding)
        return weibo

    def parse_weibo(self, weibo_info):
        weibo = OrderedDict()
        if weibo_info['user']:
            weibo['user_id'] = weibo_info['user']['id']
            weibo['screen_name'] = weibo_info['user']['screen_name']
        else:
            weibo['user_id'] = ''
            weibo['screen_name'] = ''
        weibo['id'] = int(weibo_info['id'])
        weibo['bid'] = weibo_info['bid']
        text_body = weibo_info['text']
        selector = etree.HTML(text_body)
        weibo['text'] = selector.xpath('string(.)')
        weibo['article_url'] = self.get_article_url(selector)
        weibo['pics'] = self.get_pics(weibo_info)
        weibo['video_url'] = self.get_video_url(weibo_info)
        weibo['location'] = self.get_location(selector)
        weibo['created_at'] = weibo_info['created_at']
        weibo['source'] = weibo_info['source']
        weibo['attitudes_count'] = self.string_to_int(
            weibo_info.get('attitudes_count', 0))
        weibo['comments_count'] = self.string_to_int(
            weibo_info.get('comments_count', 0))
        weibo['reposts_count'] = self.string_to_int(
            weibo_info.get('reposts_count', 0))
        weibo['topics'] = self.get_topics(selector)
        weibo['at_users'] = self.get_at_users(selector)
        return self.standardize_info(weibo)

    def print_user_info(self):
        """Print user info."""
        logger.info('+' * 100)
        self.user.print()
        logger.info('+' * 100)

    def print_one_weibo(self, weibo):
        """Print one weibo."""
        try:
            logger.info(u'weibo id: {}', weibo['id'])
            logger.info(u'text: {}', weibo['text'])
            logger.info(u'original image urls: {}', weibo['pics'])
            logger.info(u'location: {}', weibo['location'])
            logger.info(u'created at: {}', weibo['created_at'])
            logger.info(u'source: {}', weibo['source'])
            logger.info(u'attitudes: {}', weibo['attitudes_count'])
            logger.info(u'comments: {}', weibo['comments_count'])
            logger.info(u'reposts: {}', weibo['reposts_count'])
            logger.info(u'topics: {}', weibo['topics'])
            logger.info(u'@users: {}', weibo['at_users'])
            logger.info(u'url: https://m.weibo.cn/detail/{}', weibo['id'])
        except OSError:
            pass

    def print_weibo(self, weibo):
        """Print a weibo; for retweets, print both the retweeted and original parts."""
        if weibo.get('retweet'):
            logger.info('*' * 100)
            logger.info(u'Retweeted part:')
            self.print_one_weibo(weibo['retweet'])
            logger.info('*' * 100)
            logger.info(u'Original part:')
        self.print_one_weibo(weibo)
        logger.info('-' * 120)

    def get_one_weibo(self, info):
        """Fetch the full info of one weibo."""
        try:
            weibo_info = info['mblog']
            weibo_id = weibo_info['id']
            retweeted_status = weibo_info.get('retweeted_status')
            is_long = True if weibo_info.get(
                'pic_num') > 9 else weibo_info.get('isLongText')
            if retweeted_status and retweeted_status.get('id'):  # retweet
                retweet_id = retweeted_status.get('id')
                is_long_retweet = retweeted_status.get('isLongText')
                if is_long:
                    weibo = self.get_long_weibo(weibo_id)
                    if not weibo:
                        weibo = self.parse_weibo(weibo_info)
                else:
                    weibo = self.parse_weibo(weibo_info)
                if is_long_retweet:
                    retweet = self.get_long_weibo(retweet_id)
                    if not retweet:
                        retweet = self.parse_weibo(retweeted_status)
                else:
                    retweet = self.parse_weibo(retweeted_status)
                retweet['created_at'] = self.standardize_date(
                    retweeted_status['created_at'])
                weibo['retweet'] = retweet
            else:  # original
                if is_long:
                    weibo = self.get_long_weibo(weibo_id)
                    if not weibo:
                        weibo = self.parse_weibo(weibo_info)
                else:
                    weibo = self.parse_weibo(weibo_info)
            weibo['created_at'] = self.standardize_date(
                weibo_info['created_at'])
            return weibo
        except Exception as e:
            logger.exception(e)

    def is_pinned_weibo(self, info):
        """Check whether a weibo is pinned (title text '置顶')."""
        weibo_info = info['mblog']
        title = weibo_info.get('title')
        if title and title.get('text') == u'置顶':
            return True
        else:
            return False

    def get_one_page(self, page):
        """Fetch all weibo on one page."""
        try:
            js = self.get_weibo_json(page)
            if js['ok']:
                weibos = js['data']['cards']
                if self.query:
                    weibos = weibos[0]['card_group']
                for w in weibos:
                    if w['card_type'] == 9:
                        wb = self.get_one_weibo(w)
                        if wb:
                            if wb['id'] in self.weibo_id_list:
                                continue
                            created_at = datetime.strptime(
                                wb['created_at'], '%Y-%m-%d')
                            since_date = datetime.strptime(
                                self.user_config['since_date'], '%Y-%m-%d')
                            if created_at < since_date:
                                if self.is_pinned_weibo(w):
                                    continue
                                else:
                                    logger.info(
                                        u'{}Fetched page {} of {} ({})\'s weibo {}{}'.format(
                                            '-' * 30, page,
                                            self.user.screen_name,
                                            self.user.user_id,
                                            'containing "' + self.query + '" '
                                            if self.query else '', '-' * 30))
                                    return True
                            if (not self.filter) or ('retweet' not in wb.keys()):
                                hbase.check_create_table(
                                    "weibo", {"info": {}, "img": {}, "video": {}})
                                hbase.update(
                                    "weibo", str(wb['id']),
                                    {"info": WeiBoInfo(**wb).dict(),
                                     "img": {}, "video": {}})
                                self.weibo.append(wb)
                                self.weibo_id_list.append(wb['id'])
                                self.got_count += 1
                                self.print_weibo(wb)
                            else:
                                logger.info(u'Filtering out a retweeted weibo')
            else:
                return True
            logger.info(u'{}Fetched page {} of {} ({})\'s weibo{}'.format(
                '-' * 30, page, self.user.screen_name, self.user.user_id,
                '-' * 30))
        except Exception as e:
            logger.exception(e)

    def get_page_count(self):
        """Get the number of weibo pages."""
        try:
            page_count = int(math.ceil(int(self.user.statuses_count) / 10.0))
            return page_count
        except KeyError:
            logger.exception(
                u'Something went wrong. Likely causes:\n'
                u'1. the user_id is incorrect;\n'
                u'2. this user\'s weibo may require a cookie to crawl.\n'
                u'Solutions:\n'
                u'See\n'
                u'https://github.com/dataabc/weibo-crawler#如何获取user_id\n'
                u'for how to get the correct user_id;\n'
                u'or see the "set cookie" part of\n'
                u'https://github.com/dataabc/weibo-crawler#3程序设置\n'
                u'for how to set the cookie')

    def get_write_info(self, wrote_count):
        """Collect the weibo info to be written out."""
        write_info = []
        for w in self.weibo[wrote_count:]:
            wb = OrderedDict()
            for k, v in w.items():
                if k not in ['user_id', 'screen_name', 'retweet']:
                    if 'unicode' in str(type(v)):
                        v = v.encode('utf-8')
                    wb[k] = v
            if not self.filter:
                if w.get('retweet'):
                    wb['is_original'] = False
                    for k2, v2 in w['retweet'].items():
                        if 'unicode' in str(type(v2)):
                            v2 = v2.encode('utf-8')
                        wb['retweet_' + k2] = v2
                else:
                    wb['is_original'] = True
            write_info.append(wb)
        return write_info

    def get_filepath(self, type):
        """Get the path of a result file."""
        try:
            dir_name = self.user.screen_name
            if self.result_dir_name:
                dir_name = self.user_config['user_id']
            file_dir = os.path.split(os.path.realpath(
                __file__))[0] + os.sep + 'weibo' + os.sep + dir_name
            if type == 'img' or type == 'video':
                file_dir = file_dir + os.sep + type
            if not os.path.isdir(file_dir):
                os.makedirs(file_dir)
            if type == 'img' or type == 'video':
                return file_dir
            file_path = file_dir + os.sep + self.user_config['user_id'] + '.' + type
            return file_path
        except Exception as e:
            logger.exception(e)

    def get_result_headers(self):
        """Get the column headers for the result file."""
        result_headers = [
            'id', 'bid', '正文', '头条文章url', '原始图片url', '视频url', '位置',
            '日期', '工具', '点赞数', '评论数', '转发数', '话题', '@用户'
        ]
        if not self.filter:
            result_headers2 = ['是否原创', '源用户id', '源用户昵称']
            result_headers3 = ['源微博' + r for r in result_headers]
            result_headers = result_headers + result_headers2 + result_headers3
        return result_headers

    def write_csv(self, wrote_count):
        """Write crawled info to a csv file."""
        write_info = self.get_write_info(wrote_count)
        result_headers = self.get_result_headers()
        result_data = [w.values() for w in write_info]
        file_path = self.get_filepath('csv')
        self.csv_helper(result_headers, result_data, file_path)

    def csv_helper(self, headers, result_data, file_path):
        """Write the given info to a csv file."""
        if not os.path.isfile(file_path):
            is_first_write = 1
        else:
            is_first_write = 0
        if sys.version < '3':  # python2.x
            with open(file_path, 'ab') as f:
                f.write(codecs.BOM_UTF8)
                writer = csv.writer(f)
                if is_first_write:
                    writer.writerows([headers])
                writer.writerows(result_data)
        else:  # python3.x
            with open(file_path, 'a', encoding='utf-8-sig', newline='') as f:
                writer = csv.writer(f)
                if is_first_write:
                    writer.writerows([headers])
                writer.writerows(result_data)
        if headers[0] == 'id':
            logger.info(u'%d weibo written to csv; saved to:', self.got_count)
        else:
            logger.info(u'%s info written to csv; saved to:', self.user.screen_name)
        logger.info(file_path)

    def update_json_data(self, data, weibo_info):
        """Update the data for the json result file: entries already in the
        json are refreshed to the latest values, new entries are appended."""
        data['user'] = self.user
        if data.get('weibo'):
            # Whether all weibo to write are new, i.e. absent from the json
            is_new = 1
            for old in data['weibo']:
                if weibo_info[-1]['id'] == old['id']:
                    is_new = 0
                    break
            if is_new == 0:
                for new in weibo_info:
                    flag = 1
                    for i, old in enumerate(data['weibo']):
                        if new['id'] == old['id']:
                            data['weibo'][i] = new
                            flag = 0
                            break
                    if flag:
                        data['weibo'].append(new)
            else:
                data['weibo'] += weibo_info
        else:
            data['weibo'] = weibo_info
        return data

    def write_json(self, wrote_count):
        """Write crawled info to a json file."""
        data = {}
        path = self.get_filepath('json')
        if os.path.isfile(path):
            with codecs.open(path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        weibo_info = self.weibo[wrote_count:]
        data = self.update_json_data(data, weibo_info)
        with codecs.open(path, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False)
        logger.info(u'%d weibo written to json; saved to:', self.got_count)
        logger.info(path)

    def info_to_mongodb(self, collection, info_list):
        """Write crawled info to MongoDB."""
        try:
            import pymongo
        except ImportError:
            logger.warning(u'pymongo may not be installed; '
                           u'run pip install pymongo first, then rerun')
            sys.exit()
        try:
            from pymongo import MongoClient
            client = MongoClient()
            db = client['weibo']
            collection = db[collection]
            if len(self.write_mode) > 1:
                new_info_list = copy.deepcopy(info_list)
            else:
                new_info_list = info_list
            for info in new_info_list:
                if not collection.find_one({'id': info['id']}):
                    collection.insert_one(info)
                else:
                    collection.update_one({'id': info['id']}, {'$set': info})
        except pymongo.errors.ServerSelectionTimeoutError:
            logger.warning(u'MongoDB may not be installed or running; '
                           u'install and start it first, then rerun')
            sys.exit()

    def weibo_to_mongodb(self, wrote_count):
        """Write crawled weibo to MongoDB."""
        self.info_to_mongodb('weibo', self.weibo[wrote_count:])
        logger.info(u'%d weibo written to MongoDB', self.got_count)

    def mysql_create(self, connection, sql):
        """Create a MySQL database or table."""
        try:
            with connection.cursor() as cursor:
                cursor.execute(sql)
        finally:
            connection.close()

    def mysql_create_database(self, mysql_config, sql):
        """Create the MySQL database."""
        try:
            import pymysql
        except ImportError:
            logger.warning(u'pymysql may not be installed; '
                           u'run pip install pymysql first, then rerun')
            sys.exit()
        try:
            if self.mysql_config:
                mysql_config = self.mysql_config
            connection = pymysql.connect(**mysql_config)
            self.mysql_create(connection, sql)
        except pymysql.OperationalError:
            logger.warning(u'MySQL may not be installed or configured '
                           u'correctly; set it up first, then rerun')
            sys.exit()

    def mysql_create_table(self, mysql_config, sql):
        """Create a MySQL table."""
        import pymysql
        if self.mysql_config:
            mysql_config = self.mysql_config
        mysql_config['db'] = 'weibo'
        connection = pymysql.connect(**mysql_config)
        self.mysql_create(connection, sql)

    def mysql_insert(self, mysql_config, table, data_list):
        """Insert or update rows in a MySQL table."""
        import pymysql
        if len(data_list) > 0:
            keys = ', '.join(data_list[0].keys())
            values = ', '.join(['%s'] * len(data_list[0]))
            if self.mysql_config:
                mysql_config = self.mysql_config
            mysql_config['db'] = 'weibo'
            connection = pymysql.connect(**mysql_config)
            cursor = connection.cursor()
            sql = """INSERT INTO {table}({keys}) VALUES ({values}) ON DUPLICATE KEY UPDATE""".format(
                table=table, keys=keys, values=values)
            update = ','.join([
                ' {key} = values({key})'.format(key=key)
                for key in data_list[0]
            ])
            sql += update
            try:
                cursor.executemany(
                    sql, [tuple(data.values()) for data in data_list])
                connection.commit()
            except Exception as e:
                connection.rollback()
                logger.exception(e)
            finally:
                connection.close()

    def weibo_to_mysql(self, wrote_count):
        """Write crawled weibo to MySQL."""
        mysql_config = {
            'host': 'localhost',
            'port': 3306,
            'user': '******',
            'password': '******',
            'charset': 'utf8mb4'
        }
        # Create the 'weibo' table
        create_table = """
                CREATE TABLE IF NOT EXISTS weibo (
                id varchar(20) NOT NULL,
                bid varchar(12) NOT NULL,
                user_id varchar(20),
                screen_name varchar(30),
                text varchar(2000),
                article_url varchar(100),
                topics varchar(200),
                at_users varchar(1000),
                pics varchar(3000),
                video_url varchar(1000),
                location varchar(100),
                created_at DATETIME,
                source varchar(30),
                attitudes_count INT,
                comments_count INT,
                reposts_count INT,
                retweet_id varchar(20),
                PRIMARY KEY (id)
                ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"""
        self.mysql_create_table(mysql_config, create_table)
        weibo_list = []
        retweet_list = []
        if len(self.write_mode) > 1:
            info_list = copy.deepcopy(self.weibo[wrote_count:])
        else:
            info_list = self.weibo[wrote_count:]
        for w in info_list:
            if 'retweet' in w:
                w['retweet']['retweet_id'] = ''
                retweet_list.append(w['retweet'])
                w['retweet_id'] = w['retweet']['id']
                del w['retweet']
            else:
                w['retweet_id'] = ''
            weibo_list.append(w)
        # Insert or update weibo rows in the 'weibo' table
        self.mysql_insert(mysql_config, 'weibo', retweet_list)
        self.mysql_insert(mysql_config, 'weibo', weibo_list)
        logger.info(u'%d weibo written to MySQL', self.got_count)

    def update_user_config_file(self, user_config_file_path):
        """Update the user config file."""
        with open(user_config_file_path, 'rb') as f:
            try:
                lines = f.read().splitlines()
                lines = [line.decode('utf-8-sig') for line in lines]
            except UnicodeDecodeError:
                logger.error(u'%s should be utf-8 encoded; '
                             u'convert it to utf-8 first, then rerun',
                             user_config_file_path)
                sys.exit()
        for i, line in enumerate(lines):
            info = line.split(' ')
            if len(info) > 0 and info[0].isdigit():
                if self.user_config['user_id'] == info[0]:
                    if len(info) == 1:
                        info.append(self.user.screen_name)
                        info.append(self.start_date)
                    if len(info) == 2:
                        info.append(self.start_date)
                    if len(info) > 2:
                        info[2] = self.start_date
                    lines[i] = ' '.join(info)
                    break
        with codecs.open(user_config_file_path, 'w', encoding='utf-8') as f:
            f.write('\n'.join(lines))

    def write_data(self, wrote_count):
        """Write crawled info to files or databases."""
        if self.got_count > wrote_count:
            if 'csv' in self.write_mode:
                self.write_csv(wrote_count)
            if 'json' in self.write_mode:
                self.write_json(wrote_count)
            if 'mysql' in self.write_mode:
                self.weibo_to_mysql(wrote_count)
            if 'mongo' in self.write_mode:
                self.weibo_to_mongodb(wrote_count)
            if self.original_pic_download:
                self.download_files('img', 'original', wrote_count)
            if self.original_video_download:
                self.download_files('video', 'original', wrote_count)
            if not self.filter:
                if self.retweet_pic_download:
                    self.download_files('img', 'retweet', wrote_count)
                if self.retweet_video_download:
                    self.download_files('video', 'retweet', wrote_count)

    def get_pages(self):
        """Fetch all weibo of the current user."""
        try:
            self.get_user_info()
            self.print_user_info()
            since_date = datetime.strptime(self.user_config['since_date'],
                                           '%Y-%m-%d')
            today = datetime.strptime(str(date.today()), '%Y-%m-%d')
            if since_date <= today:
                page_count = self.get_page_count()
                wrote_count = 0
                page1 = 0
                random_pages = random.randint(1, 5)
                self.start_date = datetime.now().strftime('%Y-%m-%d')
                for page in range(self.start_page, page_count + 1):
                    is_end = self.get_one_page(page)
                    if is_end:
                        break
                    if page % 20 == 0:  # write to file every 20 pages
                        self.write_data(wrote_count)
                        wrote_count = self.got_count
                    # Random waits help avoid rate limiting. Crawling too
                    # fast gets the crawler limited (the limit lifts after a
                    # while); random waits mimic human behavior and lower
                    # that risk. By default we sleep 6-10 seconds every 1-5
                    # pages; if still limited, increase the sleep time.
                    if (page - page1) % random_pages == 0 and page < page_count:
                        sleep(random.randint(6, 10))
                        page1 = page
                        random_pages = random.randint(1, 5)
                # Write the remaining (fewer than 20 pages of) weibo
                self.write_data(wrote_count)
                logger.info(u'Crawl finished; %d weibo fetched in total',
                            self.got_count)
        except Exception as e:
            logger.exception(e)

    def get_user_config_list(self, file_path):
        """Read weibo user id info from a file."""
        with open(file_path, 'rb') as f:
            try:
                lines = f.read().splitlines()
                lines = [line.decode('utf-8-sig') for line in lines]
            except UnicodeDecodeError:
                logger.error(u'%s should be utf-8 encoded; '
                             u'convert it to utf-8 first, then rerun',
                             file_path)
                sys.exit()
        user_config_list = []
        for line in lines:
            info = line.split(' ')
            if len(info) > 0 and info[0].isdigit():
                user_config = {}
                user_config['user_id'] = info[0]
                if len(info) > 2:
                    if self.is_date(info[2]):
                        user_config['since_date'] = info[2]
                    elif info[2].isdigit():
                        since_date = date.today() - timedelta(int(info[2]))
                        user_config['since_date'] = str(since_date)
                else:
                    user_config['since_date'] = self.since_date
                if len(info) > 3:
                    user_config['query_list'] = info[3].split(',')
                else:
                    user_config['query_list'] = self.query_list
                if user_config not in user_config_list:
                    user_config_list.append(user_config)
        return user_config_list

    def initialize_info(self, user_config):
        """Reset per-user crawler state."""
        self.weibo = []
        self.user = UserInfo()
        self.user_config = user_config
        self.got_count = 0
        self.weibo_id_list = []

    def start(self):
        """Run the crawler."""
        for user_config in self.user_config_list:
            if len(user_config['query_list']):
                for query in user_config['query_list']:
                    self.query = query
                    self.initialize_info(user_config)
                    self.get_pages()
            else:
                self.initialize_info(user_config)
                self.get_pages()
            logger.info(u'Crawling complete')
            logger.info('*' * 100)
            if self.user_config_file_path and self.user.user_id:
                self.update_user_config_file(self.user_config_file_path)
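A minimal usage sketch of the class above. The user id is a placeholder, and the fallback to the module-level weibo_config is inferred from the __init__ default:

# Hypothetical driver; '1234567890' is a placeholder user id.
if __name__ == '__main__':
    wb = Weibo(['1234567890'])  # config defaults to weibo_config
    wb.start()                  # crawl each configured user and write results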
def get_user_info():
    u = get_current_user()
    if u is None:
        return None
    else:
        return UserInfo.get_or_insert(key_name='user:%s' % u.email())
def fetchUserInfo(self, user_id):
    ## ***** Add attempt to get from session or memcache *****
    request = UserInfo.all().filter("fs_id = ", str(user_id))
    user = request.get()
    return user if user else None
def fetchAccessToken(self, user_id):
    ## Add attempt to get from session or memcache
    request = UserInfo.all()
    request.filter("fs_id = ", str(user_id))
    user_token = request.get()
    return user_token.token if user_token else None
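Both fetch helpers carry a TODO about checking memcache first. A sketch of what that could look like on the same handler class, reusing the 'user_' + fs_id key that store_user writes; the method name is hypothetical, and note that store_user caches a dict, so a real version would need to agree on the cached shape:

from google.appengine.api import memcache

def fetchUserInfo_cached(self, user_id):
    """Hypothetical memcache-first variant of fetchUserInfo."""
    cached = memcache.get('user_' + str(user_id))  # same key store_user sets
    if cached is not None:
        return cached
    user = UserInfo.all().filter("fs_id = ", str(user_id)).get()
    if user is not None:
        memcache.set('user_' + str(user_id), user)
    return user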