def update_facebook_users_count(self, crawler):  # refresh the like_num / fan_count fields of Facebook user accounts
    print('[crawl Facebook user infos]')
    db = self.create_mongo_conn(db='FaceBook', collection='facebook')
    weipa_count = 1
    while True:
        try:
            facebook_users_queue = RedisQueue(
                name='facebook_users',
                redis_config=self.app_config['redis_config'])
            print('[=== ids left to crawl: %s ===]' % facebook_users_queue.qsize())
            if facebook_users_queue.empty():
                if weipa_count >= 3:
                    print('<----- user crawl finished ----->')
                    break
                else:
                    weipa_count += 1
                    print('[==Retry:%s==]' % (weipa_count - 1))
                    time.sleep(5)
                    continue
            id = facebook_users_queue.get()
            tweet = db.find_one({"id": id})
            print(tweet)
            if not tweet:
                continue
            if tweet['link'].endswith('/'):
                url = tweet['link'] + 'community/'
            else:
                url = tweet['link'] + '/' + 'community/'
            content = crawler.crawler_user_likes(url)
            for item in content:
                # print(item)
                if "isLoginStatus" in item:
                    update_doc = db.find_one_and_update(
                        {"id": id},
                        {'$set': {
                            'likes_num': item['like_count'],
                            'fan_count': item['fan_count'],
                            "update_time": datetime.now(),
                            "isLoginStatus": item["isLoginStatus"]
                        }},
                        return_document=ReturnDocument.AFTER)
                    print('updated document %s successfully' % update_doc['id'])
                else:
                    update_doc = db.find_one_and_update(
                        {"id": id},
                        {'$set': {
                            'likes_num': item['like_count'],
                            'fan_count': item['fan_count'],
                            "update_time": datetime.now(),
                        }},
                        return_document=ReturnDocument.AFTER)
                    print('updated document %s successfully' % update_doc['id'])
            weipa_count = 1
        except Exception as e:
            raise e
def check_queue_isEmpty():
    try:
        queue = RedisQueue(name='twitter_replay',
                           redis_config=app_config['redis_config'])
        if queue.qsize() > 0:
            twitter_every_day_update_count_job()
        else:
            print('[no pending tasks!!]')
            return
    except Exception as e:
        print(e)
        return
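# A minimal scheduling sketch, not part of the original source: it assumes the third-party
# `schedule` package is installed and that check_queue_isEmpty() above (which itself decides
# whether twitter_every_day_update_count_job() needs to run) is importable from this module.
import time

import schedule


def run_daily_check():
    # poll the twitter_replay queue once a day; adjust the time to taste
    schedule.every().day.at("02:00").do(check_queue_isEmpty)
    while True:
        schedule.run_pending()
        time.sleep(60)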
def completion_twitter_text(conn):
    # with open('twitter_user_ids.json', 'r') as f:
    #     ids = json.load(f)
    current = 0
    # for id in ids['ids']:
    with open(os.path.abspath('config.json'), 'r') as f:
        app_config = json.load(f)
    twitter_crawler_queue = RedisQueue(name='twitter_test_ids',
                                       redis_config=app_config['redis_config2'])
    while True:
        try:
            # print(id)
            if twitter_crawler_queue.empty():
                print('all users have been processed')
                break
            id = twitter_crawler_queue.get()
            print('[=== fetched id %s, %s items left to crawl ===]' % (id, twitter_crawler_queue.qsize()))
            es_url = 'http://narnia.idatage.com/stq/api/v1/rowlet/findEsTextByUserIdOrKeywords?startDate=2016-12-30&endDate=2018-02-12&category=tw&ids=%s' % (id,)
            # if i in [1,2,3,4,5,6,7,8] else 'http://narnia.idatage.com/stq/api/v1/rowlet/findEsTextByUserIdOrKeywords?startDate=2017-%s-01&endDate=2017-%s-01&category=%s&ids=%s' % (i, i + 1, category, id)
            es_body = requests.get(es_url)
            # print('response body:')
            # print(es_body.text)
            es_body_tw = json.loads(es_body.text)['tw']
            print(len(es_body_tw))

            def handele(x):
                # print(x)
                x['_source']['index_name'] = x['_index']
                x['_source']['type_name'] = x['_type']
                x['_source']['id'] = x['_id']
                x['_source']['url'] = 'https://twitter.com/%s/status/%s' % (
                    x['_source']['user']['screen_name'], x['_source']['id_str'])
                return x['_source']

            es_body_tw_urls = list(map(handele, filter(lambda x: not x['_source']['truncated'], es_body_tw)))
            # print(es_body_tw_urls)
            if len(es_body_tw_urls) > 200:
                pool = mp.Pool()
                res = pool.map(asynchronous_request,
                               (es_body_tw_urls[i:i + 200] for i in range(0, len(es_body_tw_urls), 200)))
                # current += 1
                print('updated user %s' % id)
            elif 0 < len(es_body_tw_urls) < 200:
                asynchronous_request(ops=es_body_tw_urls)
                # current += 1
                print('updated user %s' % id)
                # print('item number: %s' % current)
            else:
                current += 1
                print('user %s needs no update' % id)
                print('item number: %s' % current)
            conn.send(id)
        except Exception as e:
            current = 0
            # print(e)
            raise e
def update_twitter_users_count(self, crawler):  # refresh the list/moment counters of Twitter users
    # with open(os.path.abspath('twitter_user_ids.json'), 'r') as f:
    #     user_ids = json.load(f)
    # count = 1
    print('[crawl Twitter user infos]')
    db = self.create_mongo_conn(db='Twitter', collection='twitter')
    weipa_count = 1
    while True:
        try:
            twitter_users_queue = RedisQueue(
                name='twitter_users',
                redis_config=self.app_config['redis_config'])
            print('[=== ids left to crawl: %s ===]' % twitter_users_queue.qsize())
            if twitter_users_queue.empty():
                if weipa_count >= 3:
                    print('<----- user crawl finished ----->')
                    break
                else:
                    weipa_count += 1
                    print('[==Retry:%s==]' % (weipa_count - 1))
                    time.sleep(5)
                    continue
            id = twitter_users_queue.get()
            tweet = db.find_one({"id_str": id})
            tweet_count, flowing_count, followers_count, favorites_count, list_count, moment_count = \
                crawler.crawler_list_count(tweet["screen_name"], user_id=id)
            print(tweet_count, flowing_count, followers_count, favorites_count, list_count, moment_count)
            after_doc = db.find_one_and_update(
                {"id_str": tweet['id_str']},
                {"$set": {
                    "list_num": list_count,
                    "moment_num": moment_count,
                    "followers_count": followers_count,
                    "friends_count": flowing_count,
                    "statuses_count": tweet_count,
                    "favourites_count": favorites_count,
                    "update_time": datetime.now(),
                }},
                return_document=ReturnDocument.AFTER)
            print('updated document %s successfully' % after_doc['_id'])
            # count += 1
            weipa_count = 1
        except Exception as e:
            raise e
def crawler_init(name='twitter'):
    print('<----- initializing ----->')
    if name == 'twitter':
        twitter_crawler_queue = RedisQueue(name='twitter')
        if twitter_crawler_queue.qsize() == 0:
            with open(os.path.abspath('twitter_user_ids.json'), 'r') as f:
                user_ids = json.load(f)
            for id in user_ids['ids']:
                twitter_crawler_queue.put(id)
    else:
        facebook_crawler_queue = RedisQueue(name='facebook')
        if facebook_crawler_queue.qsize() == 0:
            with open(os.path.abspath('facebook_user_ids.json'), 'r') as f:
                user_ids = json.load(f)
            for id in user_ids['ids']:
                facebook_crawler_queue.put(id)
    print('<----- initialization finished ----->')
def clear_queue(site='twitter'):
    config = read_config()
    if site == 'twitter':
        twitter_crawler_queue = RedisQueue(name='twitter',
                                           redis_config=config['redis_config'])
        twitter_crawler_queue.clear()
    else:
        facebook_crawler_queue = RedisQueue(
            name='facebook', redis_config=config['redis_config'])
        facebook_crawler_queue.clear()
def crawler_twitter_init():
    config = read_config()
    print('<----- initializing ----->')
    twitter_crawler_queue = RedisQueue(name='twitter_users',
                                       redis_config=config['redis_config'])
    if twitter_crawler_queue.qsize() > 0:
        print('<----- %s tasks still pending ----->' % twitter_crawler_queue.qsize())
    if twitter_crawler_queue.empty():
        with open(os.path.abspath('twitter_user_ids.json'), 'r') as f:
            user_ids = json.load(f)
        for id in user_ids['ids']:
            twitter_crawler_queue.put(id)
        print('<----- %s tasks to complete ----->' % twitter_crawler_queue.qsize())
    print('<----- twitter initialization finished ----->')
import sys, os, json, time

sys.path.append('.')
# print(sys.path)
from src.redis_helper import RedisQueue

if __name__ == '__main__':
    twitter_crawler_queue = RedisQueue(name='twittter')
    with open(os.path.abspath('twitter_user_ids.json'), 'r') as f:
        user_ids = json.load(f)
    # print(user_ids)
    # print(twitter_crawler_queue.get_key())
    for id in user_ids['ids']:
        print(id)
        twitter_crawler_queue.put(id)
        time.sleep(1)
    # print(twitter_crawler_queue.qsize())
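# The scripts and classes here all rely on RedisQueue from src.redis_helper, which is not shown.
# As a rough sketch only (an assumption, not the project's actual implementation), a FIFO queue
# exposing the methods used in this repo (put, lput, get, qsize, empty, clear, get_key) can be
# built on a Redis list like this:
import json

import redis


class RedisQueueSketch(object):
    def __init__(self, name, namespace='queue', redis_config=None):
        self.__db = redis.StrictRedis(**(redis_config or {}))
        self.key = '%s:%s' % (namespace, name)

    def get_key(self):
        return self.key

    def qsize(self):
        return self.__db.llen(self.key)

    def empty(self):
        return self.qsize() == 0

    def put(self, item):
        # append to the tail; items are JSON-encoded so dicts and lists survive the round trip
        self.__db.rpush(self.key, json.dumps(item))

    def lput(self, item):
        # push back to the head, the pattern the crawlers use to retry a failed id first
        self.__db.lpush(self.key, json.dumps(item))

    def get(self, block=True, timeout=None):
        if block:
            item = self.__db.blpop(self.key, timeout=timeout or 0)
            item = item[1] if item else None
        else:
            item = self.__db.lpop(self.key)
        return json.loads(item) if item is not None else None

    def clear(self):
        self.__db.delete(self.key)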
class TWitter(Base, twython.Twython):
    """Twitter crawler: wraps twython.Twython / tweepy plus the project's Base helpers."""

    def __init__(self, args={}):
        # Base.__init__()
        super(TWitter, self).__init__()
        # the commented call above and super() are the two ways of invoking a base-class constructor
        self.__consumer_key = 'c58jPuNxqLex5QttLkoVF621T'
        self.__consumer_secret = "qU2EfulVxZ9a9mSPVm0bww4HXDyC8qk4a2gQrq7bgy4dKOqfup"
        self.__access_token = "930249938012798978-BJCWSdIgciyVZ0IUKLXVXLlc1A3D2my"
        self.__access_secret = "HjDrf1nvRDZIT5NSXioGVeOeZoev26Ibi08hCBQMhMof4"
        super(Base, self).__init__(self.__consumer_key, self.__consumer_secret,
                                   self.__access_token, self.__access_secret)
        auth = tweepy.OAuthHandler(self.__consumer_key, self.__consumer_secret)
        auth.set_access_token(self.__access_token, self.__access_secret)
        self.__flag = 'twitter'
        self.api = tweepy.API(auth)
        self.args = args
        # self.crawler_list_queue = RedisQueue(name='twitter_list', redis_config=redis_config)
        self.crawler_tweets_err_queue = RedisQueue(
            name='twitter_error', redis_config=self.app_config['redis_config'])
        self.crawler_replay_queue = RedisQueue(
            name='twitter_replay', redis_config=self.app_config['redis_config'])
        self.crawler_tweets_queue = RedisQueue(
            name='twitter', redis_config=self.app_config['redis_config'])
        self.twitter_users_queue = RedisQueue(
            name='twitter_users', redis_config=self.app_config['redis_config'])

    def fetch_user_tweets(self, user_id=None, deadline=None, current_max_id=None, bucket="timelines"):
        if not user_id:
            raise Exception("user_timeline: user_id cannot be None")
        prev_max_id = -1
        if not current_max_id:
            current_max_id = 0
        last_lowest_id = current_max_id  # used to work around users who have fewer than 200 tweets; one loop is enough
        cnt = 0
        retry_cnt = 5
        timeline = []
        # while current_max_id != prev_max_id and retry_cnt > 1:
        try:
            if current_max_id > 0:
                tweets = self.get_user_timeline(user_id=user_id,
                                                max_id=current_max_id - 1,
                                                count=20)
            else:
                tweets = self.get_user_timeline(user_id=user_id, count=20)
            prev_max_id = current_max_id  # if no new tweets are found, prev_max_id will equal current_max_id
            # crawler_replay_list = []
            for tweet in tweets:
                # print(tweet)
                if deadline:
                    date = datetime.datetime.strptime(
                        tweet['created_at'], '%a %b %d %H:%M:%S %z %Y')
                    print(date)
                    deadline_panduan = datetime.datetime.strptime(
                        '%s +0000' % deadline, '%Y-%m-%d %z')
                    # print(deadline_panduan)
                    if (date - deadline_panduan).days <= 0:
                        break
                # list = self.crawler_list_count(tweet['user']['screen_name'])
                # tweet['list_num'] = list
                tweet['site'] = 'twitter'
                tweet['latest'] = 'true'
                tweet['update_status'] = False
                tweet['update_time'] = datetime.datetime.today()
                # print('saving to mongo')
                object_id = self.save(tweet)
                # crawler_replay_list.append("https://twitter.com/%s/status/%s" % (tweet['user']['screen_name'], tweet['id_str']))
                print('save %s ==> successfully' % object_id)
                time_line = re.search(
                    r'\w{3}\sOct\s\d{2}\s\d{2}:\d{2}:\d{2}\s\+\d{4}\s2017',
                    tweet['created_at'])
                if current_max_id == 0 or current_max_id > int(tweet['id']):
                    current_max_id = int(tweet['id'])
            # if len(crawler_replay_list) > 0:
            #     print(crawler_replay_list)
            #     self.crawler_replay_queue.put(crawler_replay_list)
            #     print("pushed %s items" % len(crawler_replay_list))
            time.sleep(1)
            # no new tweets found
            if prev_max_id == current_max_id:
                print('finished crawling tweets for user %s' % user_id)
                # break  (the enclosing while loop above is commented out)
        except Exception as e:
            # print('<re-queue %s to the tweets queue>' % user_id)
            # self.crawler_tweets_err_queue.lput({"user_id": user_id, "current_max_id": current_max_id})
            # posts = self.get_mongod_client()
            # deleteObj = posts.delete_many({'id_str': user_id})
            # print('<deleted all tweets of user %s, count: %s>' % (user_id, deleteObj.deleted_count))
            # break
            print(e)
            raise e

    def crawler_list_count(self, user_sreen_name=None, user_id=None):
        try:
            reponse = self.asynchronous_request("https://twitter.com/%s" % user_sreen_name)
            _ = pq(reponse[0]['content'])
            tweet_count = _(
                'ul.ProfileNav-list>li.ProfileNav-item--tweets span.ProfileNav-value'
            ).attr('data-count')
            flowing_count = _(
                'ul.ProfileNav-list>li.ProfileNav-item--following span.ProfileNav-value'
            ).attr('data-count')
            followers_count = _(
                'ul.ProfileNav-list>li.ProfileNav-item--followers span.ProfileNav-value'
            ).attr('data-count')
            favorites_count = _(
                'ul.ProfileNav-list>li.ProfileNav-item--favorites span.ProfileNav-value'
            ).attr('data-count')
            list_count = _(
                'ul.ProfileNav-list>li.ProfileNav-item--lists span.ProfileNav-value'
            ).text()
            moment_count = _(
                'ul.ProfileNav-list>li.ProfileNav-item--moments span.ProfileNav-value'
            ).text()
            # print((tweet_count, flowing_count, followers_count, favorites_count, list_count, moment_count))
            list_count = list_count if list_count else 0
            moment_count = moment_count if moment_count else 0
            flowing_count = flowing_count if flowing_count else 0
            tweet_count = tweet_count if tweet_count else 0
            favorites_count = favorites_count if favorites_count else 0
            followers_count = followers_count if followers_count else 0
            # print(list_count)
            if (tweet_count, followers_count, flowing_count, favorites_count,
                    list_count, moment_count) == (0, 0, 0, 0, 0, 0):
                if _('.errorpage-body-content>h1').text():
                    print('error page, cannot crawl')
                    return (0, 0, 0, 0, 0, 0)
                print('re-queueing the user')
                self.twitter_users_queue.lput(user_id)
            return (tweet_count, flowing_count, followers_count,
                    favorites_count, list_count, moment_count)
        except aiohttp.ClientError as e:
            print('re-queueing the user')
            self.twitter_users_queue.lput(user_id)
            return (None, None, None, None, None, None)
            # raise e
            # print(e)
            # return None, None

    def crawler_replay_num(self, urls):
        try:
            response = self.asynchronous_request(urls)
            result_list = []
            if response:
                for item in response:
                    # print(item)
                    try:
                        _ = pq(item['content'])
                        replay = _(
                            'div.js-tweet-details-fixer.tweet-details-fixer+div.stream-item-footer div.ProfileTweet-actionCountList.u-hiddenVisually span.ProfileTweet-action--reply.u-hiddenVisually>span'
                        ).attr('data-tweet-stat-count')
                        retweet = _(
                            'div.js-tweet-details-fixer.tweet-details-fixer+div.stream-item-footer div.ProfileTweet-actionCountList.u-hiddenVisually span.ProfileTweet-action--retweet.u-hiddenVisually>span'
                        ).attr('data-tweet-stat-count')
                        like = _(
                            'div.js-tweet-details-fixer.tweet-details-fixer+div.stream-item-footer div.ProfileTweet-actionCountList.u-hiddenVisually span.ProfileTweet-action--favorite.u-hiddenVisually>span'
                        ).attr('data-tweet-stat-count')
                        content = _(
                            'p.TweetTextSize.TweetTextSize--jumbo.js-tweet-text.tweet-text'
                        ).text().replace(
                            r'%s' % _('a.twitter-timeline-link.u-hidden').text(), '')
                        result_list.append({
                            "url": item['url'],
                            "reply_count": replay if replay else 0,
                            "retweet_count": retweet if retweet else 0,
                            "favorite_count": like if like else 0,
                            'content': content
                        })
                    except Exception as e:
                        print(e)
                        result_list.append({
                            "url": item['url'],
                            "reply_count": None,
                            "retweet_count": None,
                            "favorite_count": None,
                            'content': None
                        })
            return result_list
        except Exception as e:
            raise e
        # tweet['reply_count'] = reply_count
        # print(tweet['created_at'])

    def search_users(self, keyword=[], typeIndex=1):
        try:
            def handle(y):
                y = y._json
                if int(typeIndex) == 2:
                    y['searchBy'] = 'EnglishName'
                else:
                    y['searchBy'] = 'ChineseName'
                y['bySheet'] = self.args.sheet
                y['keywords'] = keyword[int(typeIndex)]
                # if len(keyword) > 1:
                #     y['chinaName'] = keyword[1]
                #     y['englishName'] = keyword[2]
                return y

            userList = self.api.search_users(keyword[int(typeIndex)])
            users = list(map(handle, userList))
            if users:
                for somebody in users:
                    print(somebody)
                    id = super().save(somebody)
                    if id:
                        print('save %s ==> ok' % id)
            else:
                print('no data provided')
                # super().saveAsExcel(users, self.__flag, keyword)
        except Exception as e:
            logging.exception(e)

    def get_user_info(self, screen_name=None):
        user_info = self.show_user(screen_name=screen_name)
        id = self.save_user(doc=user_info, dbName='Twitter', collectionName='twitter')
        print('[=== %s saved successfully ===]' % id)
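# A minimal usage sketch, assuming the Base class supplies app_config['redis_config'] plus the
# save/save_user/asynchronous_request helpers, and that the screen name and user id below are
# placeholders, not values from this project:
if __name__ == '__main__':
    crawler = TWitter(args={})
    # counters scraped from the public profile page (tweets, following, followers, favourites, lists, moments)
    print(crawler.crawler_list_count(user_sreen_name='TwitterDev', user_id='2244994945'))
    # pull the newest tweets of the same user, stopping at the deadline date
    crawler.fetch_user_tweets(user_id='2244994945', deadline='2017-01-01')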
import sys, os, json

sys.path.append(".")
from src.redis_helper import RedisQueue

if __name__ == '__main__':
    twitter_crawler_queue = RedisQueue(name='twittter')
    while True:
        print(twitter_crawler_queue.qsize())
        print(twitter_crawler_queue.get())
def crawler_tweets_replay_count(self, crawler, history=False):  # refresh the replay_count field in the tweets collection
    print('<===== starting tweet_replay crawl =====>')
    db = self.create_mongo_conn()
    crawler_tweet_replay_queue = RedisQueue(
        name='twitter_replay',
        redis_config=self.app_config['redis_config'])
    weipa_count = 1
    err_count = 1
    if not history:
        es = Espusher()
    while True:
        try:
            print('[=== items left to crawl: %s ===]' % crawler_tweet_replay_queue.qsize())
            # if crawler_tweet_replay_queue.empty():
            #     if weipa_count >= 3:
            #         print('<----- twitter_replay crawl finished ----->')
            #         break
            #     else:
            #         weipa_count += 1
            #         print('[==Retry:%s==]' % (weipa_count - 1))
            #         time.sleep(10)
            #         continue
            # print(weipa_count)
            urls = crawler_tweet_replay_queue.get()
            print("fetched %s urls" % len(urls))
            content = crawler.crawler_replay_num(urls)
            for item in content:
                print(item)
                # print(item['url'].split('/')[-1])
                if history:
                    update_doc = db.update_many(
                        {"id_str": item['url'].split('/')[-1], 'site': 'twitter'},
                        {'$set': {
                            'replay_count': item['reply_count'],
                            'retweet_count': item['retweet_count'],
                            'favorite_count': item['favorite_count'],
                            "update_status": True
                        }})
                    print('updated %s documents' % update_doc.modified_count)
                else:
                    # print('push item to es')
                    # print(item)
                    data = db.find_one_and_delete({
                        "id_str": item['url'].split('/')[-1],
                        'site': 'twitter'
                    })
                    data['replay_count'] = item['reply_count']
                    data['favorite_count'] = item['favorite_count']
                    data['retweet_count'] = item['retweet_count']
                    es.twitter_pusher(data)
            weipa_count = 1
            err_count = 1
        except Exception as e:
            print(e)
            continue
class FaceBook(Base):
    """Facebook page crawler built on the project's Base helpers (async HTTP, Mongo, Redis queues)."""

    def __init__(self, args={}):
        # Base.__init__()
        super(FaceBook, self).__init__()
        # the commented call above and super() are the two ways of invoking a base-class constructor
        self.__username = "******"
        self.__password = "******"
        self.__access_token = "EAACEdEose0cBAAkdhoyXkFejburMPqbr7b773AxZCs7b1BORK7V2gUxVlmKkYydZCZBuyy4UcZA0QxThf7ii0tbDnsiCSzwFJ9DZAeGTcUCsGHQPTk7hPamWAZA2mN6IBjNXDsDQwwzrwet4h1piWTP5fuBnKjZCGm8ZCyXjCEWS7apZCoo1ZAuO5OBfoc9IDCgjSDfvc3pWKWGEPcICelHO456OUnZAxeDpLUZD"
        self.__flag = 'facebook'
        self.args = args
        self.crawler_tweets_err_queue = RedisQueue(name='facebook_error',
                                                   redis_config=self.app_config['redis_config'])
        self.crawler_reactions_queue = RedisQueue(name='facebook_reactions',
                                                  redis_config=self.app_config['redis_config'])
        self.crawler_tweets_queue = RedisQueue(name='facebook',
                                               redis_config=self.app_config['redis_config'])
        self.facebook_users_queue = RedisQueue(name='facebook_users',
                                               redis_config=self.app_config['redis_config'])

    def __reactions_handler(self, responseText=[]):
        # print(responseText)
        if not responseText or (len(responseText) <= 0):
            return None
        result_list = []
        for item in responseText:
            try:
                bs = bs4.BeautifulSoup(item['content'], 'html.parser')
                if not bs:
                    continue
                html = bs.select('script')
                share = re.search(r'sharecount:\d{1,}', str(html)).group() if re.search(r'sharecount:\d{1,}', str(html)) else "sharecount:0"
                likes = re.search(r'likecount:\d{1,}', str(html)).group() if re.search(r'likecount:\d{1,}', str(html)) else "likecount:0"
                comment = re.search(r'commentcount:\d{1,}', str(html)).group() if re.search(r'commentcount:\d{1,}', str(html)) else "commentcount:0"
                # print(str1)
                # comment = re.search(r'count:\d{1,}', str1).group()
                # print(share, likes, comment)
                share_count = re.search(r'\d{1,}', share).group() if re.search(r'\d{1,}', share) else 0
                likes_count = re.search(r'\d{1,}', likes).group() if re.search(r'\d{1,}', likes) else 0
                comment_count = re.search(r'\d{1,}', comment).group() if re.search(r'\d{1,}', comment) else 0
                result_list.append({
                    "url": item["url"],
                    "reactions": {
                        "share_count": share_count,
                        "likes_count": likes_count,
                        "comment_count": comment_count
                    }
                })
            except Exception as e:
                # raise e
                result_list.append({
                    "url": item['url'],
                    "reactions": []
                })
        return result_list

    def make_next_page_url(self, url, page_id, next_time, back_end=False):
        default_next_page_ma = '09223372036854775788'
        if back_end == 1:
            return "https://www.facebook.com/pages_reaction_units/more/?page_id={0}&cursor=%7B%22timeline_cursor%22%3A%22timeline_unit%3A1%3A0000000000{1}%3A04611686018427387904%3A{2}%3A04611686018427387904%22%2C%22timeline_section_cursor%22%3A%7B%22profile_id%22%3A{3}%2C%22start%22%3A0%2C%22end%22%3A1517471999%2C%22query_type%22%3A36%2C%22filter%22%3A1%7D%2C%22has_next_page%22%3Atrue%7D&surface=www_pages_posts&unit_count=9&dpr=2&__user=0&__a=1&__req=j&__be=-1&__pc=EXP1:home_page_pkg&__rev=3574843".format(page_id, next_time, int(default_next_page_ma) - 9, page_id)
        elif back_end == 2:
            return "https://www.facebook.com/pages_reaction_units/more/?page_id={0}&cursor=%7B%22timeline_cursor%22%3A%22timeline_unit%3A1%3A0000000000{1}%3A04611686018427387904%3A{2}%3A04611686018427387904%22%2C%22timeline_section_cursor%22%3A%7B%22profile_id%22%3A{3}%2C%22start%22%3A1483257600%2C%22end%22%3A1514793599%2C%22query_type%22%3A8%2C%22filter%22%3A1%2C%22filter_after_timestamp%22%3A1487694945%7D%2C%22has_next_page%22%3Atrue%7D&surface=www_pages_posts&unit_count=8&dpr=2&__user=0&__a=1&__dyn=5V8WXBzamaUmgDxKS5o9FE9XGiWGey8jrWo466ES2N6xucxu13wFG2LzEjyR88xK5WAAzoOuVWxeUPwExnBg4bzojDx6aCyVeFFUkgmxGUO2S1iyECQ3e4oqyU9ooxqqVEgyk3GEtgWrwJxqawLh42ui2G262iu4rGUpCx65aBy9EixO12y9E9oKfzUy5uazrDwFxCibUK8Lz-icK8Cx6789E-8HgoUhwKl4ykby8cUSmh2osBK&__req=22&__be=-1&__pc=EXP1%3Ahome_page_pkg&__rev=3576820".format(page_id, next_time, int(default_next_page_ma) - 9, page_id)
        elif back_end == 0:
            return "https://www.facebook.com/pages_reaction_units/more/?page_id={0}&cursor=%7B%22timeline_cursor%22%3A%22timeline_unit%3A1%3A0000000000{1}%3A04611686018427387904%3A{2}%3A04611686018427387904%22%2C%22timeline_section_cursor%22%3A%7B%7D%2C%22has_next_page%22%3Atrue%7D&surface=www_pages_posts&unit_count=9&dpr=2&__user=0&__a=1&__req=j&__be=-1&__pc=EXP1:home_page_pkg&__rev=3574843".format(page_id, next_time, int(default_next_page_ma) - 9)

    def crawler_reactions_nums(self, url):
        try:
            content = self.asynchronous_request(url)
            return self.__reactions_handler(content)
        except Exception as e:
            raise e

    def crawler_user_likes(self, url, user_id=None):
        try:
            content = self.asynchronous_request(url)
            return_list = []
            for item in content:
                # print(item['content'])
                user_community = pq(item['content'])('._3xom').text()
                print(user_community)
                if user_community == '0':
                    return_list.append({
                        "url": item['url'],
                        "like_count": user_community,
                        "fan_count": user_community
                    })
                elif user_community == '':
                    return_list.append({
                        "url": item['url'],
                        "isLoginStatus": True,
                        "like_count": '0',
                        "fan_count": '0'
                    })
                else:
                    if len(user_community) > 1:
                        if re.search(r'\s万', user_community):
                            likes_count, fan_count = tuple(user_community.replace(' 万', '0000').split(' '))
                        else:
                            likes_count, fan_count = tuple(user_community.split(' '))
                        return_list.append({
                            "url": item['url'],
                            "isLoginStatus": True,
                            "like_count": likes_count,
                            "fan_count": fan_count
                        })
                    else:
                        # likes_count, fan_count = tuple(user_community.split(' '))
                        return_list.append({
                            "url": item['url'],
                            "isLoginStatus": True,
                            "like_count": user_community,
                            "fan_count": 0
                        })
            return return_list
        except aiohttp.ClientError as e:
            print('re-queueing the user')
            self.facebook_users_queue.lput(user_id)
            return_list = []
            # if likes_count:
            #     people_likes_num = re.search(r'\d+,\d+,\d+', likes_count) if re.search(r'\d+,\d+,\d+', likes_count) else 0
            # else:
            #     people_likes_num = 0
            # print(people_likes_num)
            # print(likes_count)
            return return_list

    def timestamp_to_strtime(self, timestamp):
        local_str_time = datetime.utcfromtimestamp(timestamp).strftime('%Y-%m-%dT%H:%M:%S.000Z')
        return local_str_time

    def fetch_user_tweets(self, id=None, deadline='2017-01-01', urls=[]):
        flag = True
        back = 0
        while True:
            try:
                content = self.asynchronous_request(urls)
                if re.search(r'(/posts)', urls):
                    origin_html = content[0]['content']
                else:
                    origin = json.loads(content[0]['content'].decode()[9:])['domops']
                    origin_html = list(filter(lambda x: type(x) == dict, origin[0]))
                    origin_html = origin_html[0]['__html']

                def scrape(i, e):
                    return {
                        "name": pq(e)('div.userContentWrapper div._6a._5u5j._6b>h5 a').text(),
                        "create_at": pq(e)('div.userContentWrapper div._6a._5u5j._6b>h5+div>span:nth-child(3) a>abbr').attr('data-utime'),
                        "last_untime": pq(e)('div.userContentWrapper div._6a._5u5j._6b>h5+div>span:nth-child(3) a>abbr').attr('data-utime'),
                        "permalink_url": pq(e)('div.userContentWrapper div._6a._5u5j._6b>h5+div>span:nth-child(3) a').attr('href'),
                        "message": pq(e)('div.userContent p').text() + pq(e)('div.mtm div.mbs>a').text()
                    }

                _ = pq(origin_html)
                tweets = list(_('div._4-u2._4-u8').map(scrape))
                if len(tweets) == 0:
                    print('no data: tweets is empty')
                    break
                # print(tweets)
                tweet3 = []
                printFlag = True
                for x in filter(lambda x: x['create_at'], tweets):
                    # x['create_at'] = re.sub(r'[年月日\(\)金木水火土]', ' ', x['create_at'])
                    # if printFlag:
                    #     print(x['create_at'])
                    #     printFlag = False
                    # thisTime = x['create_at']
                    # thisTime = thisTime.replace(',', '')
                    # thisTime = thisTime.replace('at', '')
                    # if 'am' in thisTime:
                    #     thisTime = thisTime.replace('am', ' AM')
                    # if 'pm' in thisTime:
                    #     thisTime = thisTime.replace('pm', ' PM')
                    # if 'Surday' in thisTime:
                    #     thisTime = thisTime.replace('Surday', 'Saturday')
                    # x['create_at'] = datetime.strptime(thisTime, '%A %B %d %Y %H:%M %p').strftime('%Y-%m-%d %H:%M')
                    # x['create_at'] = datetime.strptime(thisTime, '%Y-%m-%d %H:%M').strftime('%Y-%m-%d %H:%M')  # latest change
                    # x['create_at'] = datetime.strptime(x['create_at'], '%Y-%m-%d %H:%M').strftime('%Y-%m-%d %H:%M')  # when running locally
                    x['create_at'] = self.timestamp_to_strtime(int(x['create_at']))
                    # print(x['create_at'])
                    tweet3.append(x)

                def dedupe(items, key=None):
                    seen = set()
                    for item in items:
                        val = item if key is None else key(item)
                        if val not in seen:
                            yield item
                            seen.add(val)

                tweet3 = list(dedupe(tweet3, key=lambda d: (d['name'], d['create_at'], d['last_untime'], d['permalink_url'], d['message'])))
                if len(tweet3) <= 1:
                    back = back + 1
                    urls = self.make_next_page_url(urls, id, tweet3[-1]['last_untime'], back_end=back)
                # crawler_reactions_list = []
                for item in tweet3:
                    # print(item)
                    item['site'] = 'facebook'
                    item['latest'] = 'true'
                    item['update_status'] = False
                    item['update_time'] = datetime.today()
                    item['user_id'] = id
                    item['permalink_url'] = 'https://facebook.com%s' % item['permalink_url']
                    if deadline and tweet3.index(item) != 0:
                        date = datetime.strptime(item['create_at'], '%Y-%m-%dT%H:%M:%S.000Z')
                        print(date)
                        deadline_panduan = datetime.strptime('%s' % deadline, '%Y-%m-%d')
                        print((date - deadline_panduan).days)
                        if (date - deadline_panduan).days <= 0:
                            flag = False
                            break
                    item['create_at'] = datetime.strptime(item['create_at'], '%Y-%m-%dT%H:%M:%S.000Z')
                    object_id = self.save(item)
                    # crawler_reactions_list.append({'url': item['permalink_url'], 'id': str(object_id)})
                    print('save %s ==> successfully' % object_id)
                # self.crawler_reactions_queue.put(crawler_reactions_list)
                print('fetched %s documents' % len(tweet3))
                if not flag:
                    print("finished crawling this user's posts")
                    back = 0
                    break
            except Exception as e:
                # print('<re-queue %s to the posts queue>' % id)
                # self.crawler_tweets_err_queue.lput({'id': id, 'url': urls})
                # posts = self.get_mongod_client()
                # deleteObj = posts.delete_many({'user_id': id})
                # print('<deleted all posts of user %s, count: %s>' % (id, deleteObj.deleted_count))
                # break
                raise e

    def searchUserInfo(self, keyword=[], typeIndex=1):
        print(keyword[typeIndex])
        self.__graph = facebook.GraphAPI(access_token=self.__access_token, version='2.10')
        kwInfo = self.__graph.search(type='page', q=keyword[int(typeIndex)])
        kInfos = kwInfo['data']
        if len(kInfos):
            for item in kInfos:
                res = self.__graph.get_object(
                    item['id'],
                    fields="name,id,current_location,birthday,category,fan_count,emails,hometown,link,location,website,likes.limit(3),new_like_count,about,description,verification_status")
                # friends = self.__graph.get_connections(id=item['id'], connection_name='friends')
                print(res['id'])
                res['keywords'] = keyword[int(typeIndex)]
                # if int(typeIndex) == 2:
                #     res['searchBy'] = 'EnglishName'
                # else:
                #     res['searchBy'] = 'ChineseName'
                res['bySheet'] = self.args.sheet
                # print(super().save(res))
                id = super().save(res)
                if id:
                    print('save %s ==> ok' % id)
        else:
            print('no data')
            # super().saveAsExcel([], self.__flag, kw)

    def login(self):
        try:
            driver = webdriver.Firefox(executable_path="/Users/suohailong/phantomjs/geckodriver")
            driver.get('https://www.facebook.com')
            driver.find_element_by_id('email').send_keys(self.__username)
            driver.find_element_by_id('pass').send_keys(self.__password)
            driver.find_element_by_id('login_form').submit()
            element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.NAME, "q")))
            return driver
        except Exception as e:
            return False

    def getToken(self):
        facebookApiUrl = "https://developers.facebook.com/tools/explorer/145634995501895/?method=GET&path=168597536563870&version=v2.11"
        driver = self.login()
        if driver:
            driver.get(facebookApiUrl)
            element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, '//*[@id="facebook"]/body/div[2]/div[2]/div/div/div/div[2]/div/div[2]/a')))
            actions = action_chains.ActionChains(driver)
            actions.click(element).perform()
            # menu = driver.find_element_by_xpath('//div[@class="uiContextualLayer uiContextualLayerBelowLeft"]/div/div/ul')
            getUserTokenItem = driver.find_element_by_xpath('//div[@class="uiContextualLayer uiContextualLayerBelowLeft"]/div/div/ul/li[1]/a')
            getUserTokenItem.click()
            tokenButton = driver.find_element_by_xpath('//*[@id="facebook"]/body/div[8]/div[2]/div/div/div/div/div[3]/div/div/div[2]/div/div/button[1]')
            tokenButton.click()
            tokenIput = driver.find_element_by_xpath('//*[@id="facebook"]/body/div[2]/div[2]/div/div/div/div[2]/div/div[1]/label/input')
            self.__access_token = tokenIput.get_attribute('value')
            print(self.__access_token)
            driver.quit()
            return True
        else:
            return False

    def getPagePosts(self):
        pass

    def search_users(self, keyword='', typeIndex=1):
        try:
            print('current keyword: %s' % keyword)
            self.searchUserInfo(keyword, typeIndex)
        except Exception as e:
            if e.code == 190:
                print('access token has expired =====> re-fetching token!')
                while self.getToken():
                    self.searchUserInfo(keyword, typeIndex)
                    break
            # logging.exception(e)

    def get_user_info(self, url):
        content = self.asynchronous_request(url)
        origin_html = content[0]['content']
        # print(content)
        _ = pq(origin_html)
        # print(_('#content_container div.clearfix').text())
        id = re.search(r'PagesProfileAboutInfoPagelet_\d+', origin_html.decode())
        id = id.group()
        name = re.sub(
            r"[\u4E00-\u9FA5]|[\u3040-\u30FF\u31F0-\u31FF]|[\u1100-\u11FF\u3130-\u318F\uAC00-\uD7AF]|[-,.?:;\'\"!`]|(-{2})|(\.{3})|(\(\))|(\[\])|({})",
            '', _('#pageTitle').text())
        # the About page is rendered in Chinese, so the fields below are parsed from its Chinese labels
        birthday = _('#content_container div.clearfix').text()
        website = _('#content_container div.clearfix').text()
        origin_str = _('#content_container div.clearfix').text()
        # print(origin_str)
        if re.search(r'(\d+)年(\d+)月(\d+)日', birthday):
            birthday = re.search(r'(\d+)年(\d+)月(\d+)日', birthday).group()
            birthday = re.sub(r'(\d+)年(\d+)月(\d+)日', r'\1-\2-\3', birthday)
            birthday = re.sub(
                r"[\u4E00-\u9FA5]|[\u3040-\u30FF\u31F0-\u31FF]|[\u1100-\u11FF\u3130-\u318F\uAC00-\uD7AF]|\s|[.?:;\'\"!`]|(-{2})|(\.{3})|(\(\))|(\[\])|({})",
                '', birthday)
        else:
            birthday = ''
        # matches the escaped Chinese "Facebook has confirmed this is an authentic Page" banner
        verified = re.search(
            r'Facebook \\u5df2\\u786e\\u8ba4\\u8fd9\\u662f\\u516c\\u4f17\\u4eba\\u7269\\u3001\\u5a92\\u4f53\\u516c\\u53f8\\u6216\\u54c1\\u724c\\u7684\\u771f\\u5b9e\\u4e3b\\u9875',
            origin_html.decode())
        if verified:
            verified = True
        else:
            verified = False
        item = self.crawler_user_likes(url.replace('/about', '') + '/community/')
        if re.search(r'((http|https)://)[\w1-9]+.[\w1-9]+.*', website):
            website = re.search(r'((http|https)://)[\w1-9]+.[\w1-9]+.[\w]+', website).group()
        else:
            website = ''
        user_id = self.save_user(doc={
            "id": re.search(r'\d+', id).group(),
            "birthday": birthday,
            "link": url.replace('/about', ''),
            'website': website,
            # 'about': re.search(r'简介 (\w+\s)+.', origin_str).group().replace('简介', '') if re.search(r'简介 (\w+\s)+.', origin_str) else '',
            'about': _('div.text_exposed_root').text(),
            'hometown': re.search(r'来自 (\S+\s)+简介', origin_str).group().replace('来自', '').replace('简介', '') if re.search(r'来自 (\S+\s)+简介', origin_str) else '',
            'name': name.replace("Facebook", "").replace('|', ''),
            'gender': re.search(r'性别 \S', origin_str).group().replace('性别', '') if re.search(r'性别 \S', origin_str) else '',
            'PoliticalViews': re.search(r'政治观点 \S+\s', origin_str).group().replace('政治观点', '') if re.search(r'政治观点 \S+\s', origin_str) else '',
            'ReligiousBeliefs': re.search(r'宗教信仰 \S+\s', origin_str).group().replace('宗教信仰', '') if re.search(r'宗教信仰 \S+\s', origin_str) else '',
            'category': re.search(r'categories \S+\s', origin_str).group().replace('categories', '') if re.search(r'categories \S+\s', origin_str) else '',
            'fan_count': item[0].get('fan_count', 0),
            'likes_num': item[0].get('like_count', 0),
            'verified': verified
        }, dbName='FaceBook', collectionName='facebook')
        print("[=== saved %s successfully ===]" % user_id)
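# A minimal usage sketch (the page URL and page id below are placeholders; Base is assumed to
# provide app_config, asynchronous_request, save and save_user, as with the TWitter class above):
if __name__ == '__main__':
    fb = FaceBook(args={})
    # page-level likes / fans scraped from the public community tab
    print(fb.crawler_user_likes('https://www.facebook.com/facebook/community/'))
    # walk the page's posts back to the deadline date
    fb.fetch_user_tweets(id='20531316728', deadline='2017-01-01',
                         urls='https://www.facebook.com/facebook/posts/')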
def crawler_reactions(self, crawler, history=False):  # refresh the reaction counts in the Facebook posts collection
    print('<===== starting facebook_reactions crawl =====>')
    db = self.create_mongo_conn()
    crawler_reactions_queue = RedisQueue(
        name='facebook_reactions',
        redis_config=self.app_config['redis_config'])
    weipa_count = 1
    err_count = 1
    if not history:
        es = Espusher()
    while True:
        try:
            # print(db.count({"site": "facebook", "update_status": False}))
            # tweets = list(db.find({"site": "facebook", "update_status": False}).limit(20))
            # if len(tweets) == 0:
            #     print('crawl complete')
            #     break
            print('[=== items left to crawl: %s ===]' % crawler_reactions_queue.qsize())
            # if crawler_reactions_queue.empty():
            #     if weipa_count >= 3:
            #         print('<----- facebook_reactions crawl finished ----->')
            #         break
            #     else:
            #         weipa_count += 1
            #         print('[==Retry:%s==]' % (weipa_count - 1))
            #         time.sleep(10)
            #         continue
            urls = crawler_reactions_queue.get()
            # map(lambda x: {"url": 'https://facebook.com%s' % x['permalink_url'], 'id': x['_id']}, tweets)
            content = crawler.crawler_reactions_nums(urls)
            # print(content)
            if not content:
                continue
            for item in content:
                # print(item)
                print(item['reactions'])
                # print(item['url'])
                # print(url)
                if not item['reactions']:
                    print(item)
                else:
                    if history:
                        print(objectid.ObjectId(item['url']['id']))
                        update_doc = db.find_one_and_update(
                            {"_id": objectid.ObjectId(item['url']['id'])},
                            {'$set': {
                                'comment_num': item['reactions']['comment_count'],
                                'likes_num': item['reactions']['likes_count'],
                                'share_count': item['reactions']["share_count"],
                                "update_status": True
                            }},
                            return_document=ReturnDocument.AFTER)
                        if update_doc is not None:
                            print('updated %s' % update_doc['_id'])
                    else:
                        data = db.find_one_and_delete(
                            {'_id': objectid.ObjectId(item['url']['id'])})
                        data['comment_num'] = item['reactions']['comment_count']
                        data['likes_num'] = item['reactions']['likes_count']
                        data['share_count'] = item['reactions']["share_count"]
                        es.facebook_pusher(data)
            weipa_count = 1
            err_count = 1
        except Exception as e:
            print(e)
            continue
def crawler_tweets(self, crawler, site='facebook', deadtime='2017-1-1'):
    print('<----- starting post crawl ----->')
    weipa_count = 1
    if site == 'twitter':
        twitter_crawler_queue = RedisQueue(
            name='twitter', redis_config=self.app_config['redis_config'])
        twitter_crawler_error_queue = RedisQueue(
            name='twitter_error', redis_config=self.app_config['redis_config'])
        while True:
            if twitter_crawler_error_queue.qsize() > 0:
                err_item = twitter_crawler_error_queue.get()
                print('fetched errored item: %s' % err_item)
                current_max_id = err_item['current_max_id']
                id = err_item['user_id']
                crawler.fetch_user_tweets(user_id=id,
                                          current_max_id=current_max_id,
                                          deadline=deadtime)
            else:
                print('\n')
                print('[===Info:%s ids left to crawl: %s===]' %
                      (datetime.now(), twitter_crawler_queue.qsize()))
                if twitter_crawler_queue.empty():
                    if weipa_count >= 3:
                        print('<----- post crawl finished ----->')
                        break
                    else:
                        weipa_count += 1
                        print('[==Retry:%s==]' % (weipa_count - 1))
                        time.sleep(5)
                        continue
                id = twitter_crawler_queue.get()
                crawler.fetch_user_tweets(user_id=id, deadline=deadtime)
                weipa_count = 1
    else:
        facebook_crawler_queue = RedisQueue(
            name='facebook', redis_config=self.app_config['redis_config'])
        facebook_crawler_error_queue = RedisQueue(
            name='facebook_error', redis_config=self.app_config['redis_config'])
        db = self.create_mongo_conn(db='FaceBook', collection='facebook')
        while True:
            if facebook_crawler_error_queue.qsize() > 0:
                err_item = facebook_crawler_error_queue.get()
                print('fetched errored item: %s' % err_item)
                id = err_item['id']
                url = err_item['url']
                if url:
                    crawler.fetch_user_tweets(id=id, urls=url, deadline=deadtime)
            else:
                print('\n')
                print('[===Info:%s ids left to crawl: %s===]' %
                      (datetime.now(), facebook_crawler_queue.qsize()))
                if facebook_crawler_queue.empty():
                    if weipa_count >= 3:
                        print('<----- post crawl finished ----->')
                        break
                    else:
                        weipa_count += 1
                        print('[==Retry:%s==]' % (weipa_count - 1))
                        time.sleep(5)
                        continue
                id = facebook_crawler_queue.get()
                print(id)
                doc = db.find_one({"id": str(id)})
                # print(doc)
                crawler.fetch_user_tweets(id=id, urls=doc['link'] + 'posts/',
                                          deadline=deadtime)
                # print('finished crawling everything')
                weipa_count = 1
def crawler_init(name='twitter'):
    print('<----- initializing ----->')
    config = read_config()
    if name == 'twitter':
        twitter_crawler_queue = RedisQueue(name='twitter',
                                           redis_config=config['redis_config'])
        if twitter_crawler_queue.qsize() > 0:
            print('<----- %s tasks still pending ----->' % twitter_crawler_queue.qsize())
        if twitter_crawler_queue.empty():
            with open(os.path.abspath('twitter_user_ids.json'), 'r') as f:
                user_ids = json.load(f)
            for id in user_ids['ids']:
                twitter_crawler_queue.put(id)
                # print(id)
            print('<----- %s tasks to complete ----->' % twitter_crawler_queue.qsize())
        print('<----- twitter initialization finished ----->')
    else:
        facebook_crawler_queue = RedisQueue(
            name='facebook', redis_config=config['redis_config'])
        if facebook_crawler_queue.qsize() > 0:
            print('<----- %s tasks still pending ----->' % facebook_crawler_queue.qsize())
        if facebook_crawler_queue.empty():
            with open(os.path.abspath('facebook_user_ids.json'), 'r') as f:
                user_ids = json.load(f)
            for id in user_ids['ids']:
                facebook_crawler_queue.put(id)
            print('<----- %s tasks to complete ----->' % facebook_crawler_queue.qsize())
        print('<----- facebook initialization finished ----->')
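# A minimal driver sketch: clear_queue and crawler_init above are real, but `task` is only a
# placeholder for whatever object in this project owns crawler_tweets / update_twitter_users_count,
# so the last call is left commented as an assumption.
if __name__ == '__main__':
    clear_queue(site='twitter')    # start from an empty 'twitter' queue
    crawler_init(name='twitter')   # seed it from twitter_user_ids.json
    # task.crawler_tweets(TWitter(), site='twitter', deadtime='2017-01-01')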