import json
import re
import threading
import time
from time import mktime, strftime, strptime

import requests

# CrawlDatabaseManager, FeedsMongoManager, start_uid, CRAWL_DELAY and
# MAX_PAGE are referenced below but defined elsewhere in the project.


class UsersCrawler:
    url_format = 'https://m.weibo.cn/api/container/getIndex?containerid=231051_-_followers_-_%s&page=%d'
    querystring = {"version": "v4"}
    payload = "------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"version\"\r\n\r\nv4\r\n------WebKitFormBoundary7MA4YWxkTrZu0gW--"
    headers = {
        'content-type': "multipart/form-data; boundary=----WebKitFormBoundary7MA4YWxkTrZu0gW",
        'host': "m.weibo.cn",
        'connection': "keep-alive",
        'cache-control': "no-cache",
        'upgrade-insecure-requests': "1",
        'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
        'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        'accept-encoding': "gzip, deflate, sdch, br",
        'accept-language': "zh-CN,en-US;q=0.8,en;q=0.6",
        'cookie': "SCF=AlTf48qNezF12LbNvCHGGee_Nymdun-Sp9kGATl9gjhJAPPkj2QBT2-Y2MECfIjqy1QjvcBbdVr9HWi6hgbgnTQ.; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WhEEfKT6-E_qQ8I2HTu2.Vu5JpX5o2p5NHD95Qp1hq4She41K-pWs4DqcjGC2Hkg.y8Kntt; SUB=_2A250CvjnDeRhGeBP7FoW9SvEwjiIHXVX9JivrDV6PUJbkdANLUvGkW1966OJJxi88Ah66us23Spcr23Dpw..; SUHB=0cSXjt5Dqq_ieZ; _T_WM=e0f6480701da87741a5b948440f9d665; SSOLoginState=1495508844; ALF=1498100844; H5_INDEX=0_all; H5_INDEX_TITLE=%E4%BD%A0%E5%B7%B2%E7%BB%8F%E8%A2%AB%E7%A7%BB%E9%99%A4%E7%BE%A4%E8%81%8A; M_WEIBOCN_PARAMS=featurecode%3D20000320%26oid%3D4110491498745329%26luicode%3D10000011%26lfid%3D231051_-_followers_-_5979396421",
        'postman-token': "0b85ea3b-073b-a799-4593-61095e4ed01a"
    }
    db_manager = None
    threads = []
    run = False

    def __init__(self):
        self.db_manager = CrawlDatabaseManager(10)

    def get_users(self, uid, page):
        # fetch one page of the followers list for the given uid
        url = self.url_format % (uid, page)
        response = requests.request("GET", url, data=self.payload,
                                    headers=self.headers,
                                    params=self.querystring)
        return response.text

    def get_uid(self):
        return self.db_manager.dequeue_user()

    def start(self):
        self.run = True
        # was target=self.crawl_feeds, but this class only defines crawl_users
        t = threading.Thread(target=self.crawl_users, name=None)
        self.threads.append(t)
        # set daemon so the main thread can exit when it receives ctrl-c
        t.daemon = True
        t.start()

    def crawl_users(self):
        kickstart = True
        self.run = True
        while self.run:
            if kickstart:
                # first pass starts from the configured seed account
                kickstart = False
                uid = start_uid
            else:
                uid = self.get_uid()
            user_str = self.get_users(uid, 1)
            users = json.loads(user_str)
            for user in users['cards'][1]['card_group']:
                name = user['user']['screen_name']
                user_id = user['user']['id']
                followers_count = user['user']['followers_count']
                follow_count = user['user']['follow_count']
                description = user['user']['description']
                self.db_manager.enqueue_user(user_id, name=name,
                                             follow_count=follow_count,
                                             followers_count=followers_count,
                                             description=description)
            time.sleep(CRAWL_DELAY)
            print(users)
            break  # debug leftover: stops after a single iteration
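# Neither CrawlDatabaseManager nor FeedsMongoManager is shown in this
# section. The stand-in below is a minimal in-memory sketch of the queue
# interface the crawlers appear to rely on; the names and signatures are
# inferred from the call sites, not taken from the project. Note the
# revisions below disagree on the dequeue_user() return shape: one
# FeedsCrawler indexes it as uid[1] (a row tuple), the other as
# uid['user_id'] (a mapping). This sketch returns a mapping.
class InMemoryCrawlDatabaseManager:
    """Hypothetical stand-in for the project's CrawlDatabaseManager."""

    def __init__(self, max_queue_size=10):
        self.max_queue_size = max_queue_size
        self.queue = []    # users waiting to be crawled
        self.seen = set()  # user ids already enqueued, to avoid re-crawling

    def enqueue_user(self, user_id, name=None, follow_count=0,
                     followers_count=0, description=None):
        if user_id in self.seen or len(self.queue) >= self.max_queue_size:
            return
        self.seen.add(user_id)
        self.queue.append({'user_id': user_id, 'name': name,
                           'follow_count': follow_count,
                           'followers_count': followers_count,
                           'description': description})

    def dequeue_user(self):
        # None signals exhaustion, matching the checks in both crawlers
        return self.queue.pop(0) if self.queue else None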
# Later revision of UsersCrawler: drops the multipart payload, filters
# accounts by their follower/followee ratio, and walks every follower page.
# Keeping both revisions in one module means this definition shadows the
# one above.
class UsersCrawler:
    url_format = 'https://m.weibo.cn/api/container/getIndex?containerid=231051_-_followers_-_%s&page=%d'
    headers = {
        'host': "m.weibo.cn",
        'connection': "keep-alive",
        'cache-control': "no-cache",
        'upgrade-insecure-requests': "1",
        'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
        'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        'accept-encoding': "gzip, deflate, sdch, br",
        'accept-language': "zh-CN,en-US;q=0.8,en;q=0.6",
        'cookie': "_T_WM=0c8015962736d2199e208ca8adb7d301; SCF=AgS4nly86VG3e5zq2Usv2iN_bQIUj-rGsjZvQJLMLobUOXiNKaU1wEUxFjfWYNVchEv1_4Mfh08RgGthig-4Qcg.; SUB=_2A253p1ZcDeRhGeNG71YS9S7Jwj-IHXVVaHoUrDV6PUJbkdANLXbgkW1NS0f7LHe4BMxiV6LodCnt1vITwWSST2DI; SUHB=0xnCvYrYJqVQyb; SSOLoginState=1520641555; M_WEIBOCN_PARAMS=uicode%3D20000174"
    }
    db_manager = None
    threads = []
    run = False
    kickstart = True
    user_count = 0
    max_threads = 10

    def __init__(self):
        self.db_manager = CrawlDatabaseManager()
        self.crawl_feeds = FeedsCrawler()

    def get_users(self, uid, page):
        url = self.url_format % (uid, page)
        response = requests.request("GET", url, headers=self.headers)
        response.encoding = response.apparent_encoding
        return response.text

    def get_uid(self):
        return self.db_manager.dequeue_user()

    def start(self):
        self.run = True
        # pass the bound method itself; the original called it here, which
        # ran the whole crawl in the current thread and handed Thread its
        # return value instead of a callable
        t = threading.Thread(target=self.crawl_feeds.crawl_feeds, name=None)
        self.threads.append(t)
        # set daemon so the main thread can exit when it receives ctrl-c
        t.daemon = True
        t.start()

    def crawl_users(self):
        self.outrange = False
        self.run = True
        self.page = 1
        if self.kickstart:
            self.kickstart = False
            uid = start_uid
        else:
            uid = self.get_uid()[1]
        while self.run:
            print(self.page)
            count = 0
            user_str = self.get_users(uid, self.page)
            users = json.loads(user_str)
            # on page 1 the first card is a header; the follower list sits
            # in the second card, on later pages in the first
            index = 1 if self.page == 1 else 0
            try:
                page_count = len(users['data']['cards'][index]['card_group'])
            except (KeyError, IndexError):
                print("page %d has no follower information, stopping" % self.page)
                break
            for i in range(page_count):
                user = users['data']['cards'][index]['card_group'][i]
                try:
                    name = user['user']['screen_name']
                    if name.startswith("#"):
                        continue
                    user_id = user['user']['id']
                    followers_count = user['user']['followers_count']
                    follow_count = user['user']['follow_count']
                    description = user['user']['description']
                    print(name)
                except (KeyError, TypeError):
                    continue
                # only enqueue accounts followed far more than they follow
                if followers_count > 15 * follow_count:
                    self.db_manager.enqueue_user(
                        user_id=user_id,
                        name=name,
                        follow_count=follow_count,
                        followers_count=followers_count,
                        description=description)
                    self.user_count += 1
                    count += 1
                else:
                    continue
            self.page += 1
            time.sleep(CRAWL_DELAY)
            print(self.page)
            print("%d users were enqueued this time, %d users have been "
                  "enqueued since the start" % (count, self.user_count))
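# A minimal driver sketch for the revised UsersCrawler. start_uid,
# CRAWL_DELAY and MAX_PAGE are referenced throughout but never defined in
# this section, so this helper (hypothetical, not from the original
# project) seeds them explicitly. It is meant to be called at the end of
# the module, once the FeedsCrawler classes below are defined.
def run_crawlers(seed_uid, crawl_delay=2, max_page=50):
    # seed the module-level names the crawlers read; the delay and page
    # limit are assumed values, not the project's configuration
    global start_uid, CRAWL_DELAY, MAX_PAGE
    start_uid = seed_uid
    CRAWL_DELAY = crawl_delay
    MAX_PAGE = max_page
    crawler = UsersCrawler()
    crawler.start()        # feed crawling runs in a daemon thread
    crawler.crawl_users()  # follower discovery runs in the main thread

# e.g. run_crawlers('1618051664')  # the seed account id here is made up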
class FeedsCrawler:
    url_format = "https://m.weibo.cn/api/container/getIndex?uid=%s&type=uid&value=%s&containerid=107603%s&page=%d"
    headers = {
        'content-type': "multipart/form-data; boundary=----WebKitFormBoundary7MA4YWxkTrZu0gW",
        'host': "m.weibo.cn",
        'connection': "keep-alive",
        'cache-control': "no-cache",
        'upgrade-insecure-requests': "1",
        'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
        'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        'accept-encoding': "gzip, deflate, sdch, br",
        'accept-language': "zh-CN,en-US;q=0.8,en;q=0.6",
        'cookie': "_T_WM=0c8015962736d2199e208ca8adb7d301; SCF=AgS4nly86VG3e5zq2Usv2iN_bQIUj-rGsjZvQJLMLobUOXiNKaU1wEUxFjfWYNVchEv1_4Mfh08RgGthig-4Qcg.; SUB=_2A253p1ZcDeRhGeNG71YS9S7Jwj-IHXVVaHoUrDV6PUJbkdANLXbgkW1NS0f7LHe4BMxiV6LodCnt1vITwWSST2DI; SUHB=0xnCvYrYJqVQyb; SSOLoginState=1520641555; M_WEIBOCN_PARAMS=uicode%3D20000174",
        'postman-token': "0b85ea3b-073b-a799-4593-61095e4ed01a"
    }
    db_manager = CrawlDatabaseManager()
    feeds_db_manager = FeedsMongoManager()
    threads = []
    run = False

    def __init__(self):
        pass

    # created_at formats seen in the API:
    # 5s前  5分钟前  5小时前  05:05  05-05  2017-05-05
    def get_time(self, created_time):
        print(created_time)  # debug
        if u's' in created_time:
            # "Ns前" (N seconds ago): keep the current minute, use N as seconds
            time_sec = re.findall(u'(\d*)s', created_time)[0]
            ts = mktime(strptime(strftime("%Y-%m-%d-%H-%M-") + time_sec,
                                 "%Y-%m-%d-%H-%M-%S"))
            return ts
        elif u'分钟' in created_time:
            # "N分钟前" (N minutes ago), approximated as minute (60 - N)
            time_min = str(60 - int(re.findall(u'(\d*)分', created_time)[0]))
            ts = mktime(strptime(strftime("%Y-%m-%d-%H-%S-") + time_min,
                                 "%Y-%m-%d-%H-%S-%M"))
            return ts
        elif u'小时' in created_time:
            # "N小时前" (N hours ago), approximated as hour (24 - N)
            time_hour = str(24 - int(re.findall(u'(\d*)小', created_time)[0]))
            ts = mktime(strptime(strftime("%Y-%m-%d-%M-%S-") + time_hour,
                                 "%Y-%m-%d-%M-%S-%H"))
            return ts
        elif u':' in created_time:
            # "HH:MM" means earlier today
            ts = mktime(strptime(strftime("%Y-%m-%d-%S-") + created_time,
                                 "%Y-%m-%d-%S-%H:%M"))
            return ts
        elif len(created_time) == 5:
            # "MM-DD" means this year
            ts = mktime(strptime(strftime("%Y-%H-%M-%S-") + created_time,
                                 "%Y-%H-%M-%S-%m-%d"))
            return ts
        elif len(created_time) == 10:
            # "YYYY-MM-DD"
            ts = mktime(strptime(strftime("%H-%M-%S-") + created_time,
                                 "%H-%M-%S-%Y-%m-%d"))
            return ts
        else:
            return None

    def get_feeds(self, uid, page):
        url = self.url_format % (uid, uid, uid, page)
        req = requests.request("GET", url, headers=self.headers)
        req.encoding = req.apparent_encoding
        return req.text

    def get_uid(self):
        uid = self.db_manager.dequeue_user()
        if uid is None:
            return None
        return uid[1]

    def start(self):
        # disabled in this revision: crawl_feeds is driven directly by
        # UsersCrawler instead of running in its own thread
        # self.run = True
        # t = threading.Thread(target=self.crawl_feeds, name=None)
        # self.threads.append(t)
        # # set daemon so main thread can exit when receives ctrl-c
        # t.setDaemon(True)
        # t.start()
        pass

    def crawl_feeds(self):
        self.run = True
        while self.run:
            uid = self.get_uid()
            print("start new user crawling")
            if uid is None:
                self.run = False
                break
            for page in range(1, MAX_PAGE):
                feeds_str = self.get_feeds(uid, page)
                feeds = json.loads(feeds_str)
                for feed in feeds['data']['cards']:
                    if feed['card_type'] != 9:  # 9 marks an ordinary weibo card
                        continue
                    try:
                        if 'mblog' in feed:
                            self.feeds_db_manager.insert_feed(
                                feed, self.get_time(feed['mblog']['created_at']))
                            print('--------\n' + feed['mblog']['user']['screen_name']
                                  + '\n--------\n'
                                  + re.sub(r'<.*?>', '', feed['mblog']['text']))
                        else:
                            continue
                    except Exception as err:
                        print(err)
                        continue
                    # other fields available on a card:
                    # item_id = feed['itemid']
                    # scheme = feed['scheme']
                    # uid = feed['mblog']['user']['id']
                    # name = feed['mblog']['user']['screen_name']
                    # profile_image_url = feed['mblog']['user']['profile_image_url']
                    # created_at = feed['mblog']['created_at']
                    # text = feed['mblog']['text']
                    # feed_id = feed['mblog']['id']
                    # reposts_count = feed['mblog']['reposts_count']
                    # comments_count = feed['mblog']['comments_count']
                    # attitudes_count = feed['mblog']['attitudes_count']
                    # page_info = feed['mblog']['page_info']
                    # pics = feed['mblog']['pics']
                time.sleep(CRAWL_DELAY)
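# Quick sanity check of get_time against the formats documented in the
# comment above it (the sample values are made up). Note that the 分钟 and
# 小时 branches only rewrite a single field of the current local time, so
# their results are rough approximations; the later revision of
# FeedsCrawler below replaces them with plain offsets from time.time().
def check_get_time():
    parser = FeedsCrawler()  # __init__ is a no-op, so this is cheap
    for sample in [u'5s前', u'5分钟前', u'5小时前',
                   u'05:05', u'05-05', u'2017-05-05']:
        print(sample, parser.get_time(sample))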
# Later revision of FeedsCrawler: simpler relative-time handling (plain
# offsets from time.time()) and a threaded start(). This definition shadows
# the one above when both are kept in one module.
class FeedsCrawler:
    url_format = "https://m.weibo.cn/api/container/getIndex?uid=%s&type=uid&value=%s&containerid=107603%s&page=%d"
    querystring = {"version": "v4"}
    payload = "------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"version\"\r\n\r\nv4\r\n------WebKitFormBoundary7MA4YWxkTrZu0gW--"
    headers = {
        'content-type': "multipart/form-data; boundary=----WebKitFormBoundary7MA4YWxkTrZu0gW",
        'host': "m.weibo.cn",
        'connection': "keep-alive",
        'cache-control': "no-cache",
        'upgrade-insecure-requests': "1",
        'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
        'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        'accept-encoding': "gzip, deflate, sdch, br",
        'accept-language': "zh-CN,en-US;q=0.8,en;q=0.6",
        'cookie': "SCF=AlTf48qNezF12LbNvCHGGee_Nymdun-Sp9kGATl9gjhJAPPkj2QBT2-Y2MECfIjqy1QjvcBbdVr9HWi6hgbgnTQ.; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WhEEfKT6-E_qQ8I2HTu2.Vu5JpX5o2p5NHD95Qp1hq4She41K-pWs4DqcjGC2Hkg.y8Kntt; SUB=_2A250CvjnDeRhGeBP7FoW9SvEwjiIHXVX9JivrDV6PUJbkdANLUvGkW1966OJJxi88Ah66us23Spcr23Dpw..; SUHB=0cSXjt5Dqq_ieZ; _T_WM=e0f6480701da87741a5b948440f9d665; SSOLoginState=1495508844; ALF=1498100844; H5_INDEX=0_all; H5_INDEX_TITLE=%E4%BD%A0%E5%B7%B2%E7%BB%8F%E8%A2%AB%E7%A7%BB%E9%99%A4%E7%BE%A4%E8%81%8A; M_WEIBOCN_PARAMS=featurecode%3D20000320%26oid%3D4110491498745329%26luicode%3D10000011%26lfid%3D231051_-_followers_-_5979396421",
        'postman-token': "0b85ea3b-073b-a799-4593-61095e4ed01a"
    }
    db_manager = CrawlDatabaseManager(10)
    feeds_db_manager = FeedsMongoManager()
    threads = []
    run = False

    def __init__(self):
        pass

    def get_time(self, created_time):
        # created_time = created_time.replace(' ', '')
        if u'秒前' in created_time:  # "N秒前": seconds ago, close enough to now
            return time.time()
        mins = re.findall(u'(.*)分钟', created_time)  # "N分钟前": minutes ago
        if len(mins) > 0:
            return time.time() - int(mins[0]) * 60
        hours = re.findall(u'(.*)小时', created_time)  # "N小时前": hours ago
        if len(hours) > 0:
            return time.time() - int(hours[0]) * 3600
        today_time = re.findall(u'今天.?(\d\d:\d\d)', created_time)  # "今天 HH:MM"
        if len(today_time) > 0:
            ct = time.strftime(u'%Y/%m/%d ') + today_time[0]
            return time.mktime(time.strptime(ct, u'%Y/%m/%d %H:%M'))
        md = re.findall(u'\d{2}\-\d{2}', created_time)  # "MM-DD"
        if len(md) > 0:
            return time.mktime(time.strptime(md[0], u'%m-%d'))
        yesterday = re.findall(u'昨天.*(\d{2}:\d{2})', created_time)  # "昨天 HH:MM"
        if len(yesterday) > 0:
            return (time.time() - 86400 + int(yesterday[0][0:2]) * 3600
                    + int(yesterday[0][3:]) * 60)
        str_time = re.findall(u'.*\d\d:\d\d', created_time)
        if len(str_time) > 0:
            # re.findall returns a list; the original passed the list itself
            # to strptime, which raises TypeError
            matched = str_time[0]
            if u'月' in matched:  # "M月D日 HH:MM"
                return time.mktime(time.strptime(
                    time.strftime(u'%Y-') + matched, u'%Y-%m月%d日 %H:%M'))
            else:
                try:  # "YYYY-MM-DD HH:MM"
                    return time.mktime(time.strptime(matched, u'%Y-%m-%d %H:%M'))
                except Exception:  # "MM-DD HH:MM" without a year
                    return time.mktime(time.strptime(
                        time.strftime(u'%Y-') + matched, u'%Y-%m-%d %H:%M'))

    def fetch_feeds(self, uid, page):
        url = self.url_format % (uid, uid, uid, page)
        return requests.request("GET", url, data=self.payload,
                                headers=self.headers, params=self.querystring)

    def next_uid(self):
        uid = self.db_manager.dequeue_user()
        if uid is None:
            return None
        return uid['user_id']

    def start(self):
        self.run = True
        t = threading.Thread(target=self.crawl_feeds, name=None)
        self.threads.append(t)
        # set daemon so the main thread can exit when it receives ctrl-c
        t.daemon = True
        t.start()

    def crawl_feeds(self):
        self.run = True
        while self.run:
            uid = self.next_uid()
            if uid is None:
                print("No more user available")
                self.run = False
                break
            print('uid ', uid)
            page = 1
            while page < MAX_PAGE:
                feeds_str = self.fetch_feeds(uid, page)
                page += 1
                feeds = json.loads(feeds_str.text)
                for feed in feeds['data']['cards']:
                    print('running')
                    if feed['card_type'] != 9:
                        continue
                    if 'mblog' in feed:
                        self.feeds_db_manager.insert_feed(
                            feed, self.get_time(feed['mblog']['created_at']))
                        print('--------\n' + feed['mblog']['user']['screen_name']
                              + '\n--------\n' + feed['mblog']['text'])
                    # (the same card fields are available as listed in the
                    # first revision of crawl_feeds above)
                time.sleep(CRAWL_DELAY)
                print(feeds)  # debug
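# FeedsMongoManager is referenced but never shown in this section. The
# sketch below illustrates what its insert_feed call might look like with
# pymongo, assuming a local MongoDB instance; the client settings and the
# database/collection names are made up, not the project's actual ones.
import pymongo


class FeedsMongoManagerSketch:
    """Hypothetical stand-in for the project's FeedsMongoManager."""

    def __init__(self):
        client = pymongo.MongoClient('localhost', 27017)
        self.feeds = client['weibo']['feeds']  # hypothetical db/collection

    def insert_feed(self, feed, created_timestamp):
        doc = dict(feed)  # keep the raw card payload
        doc['created_timestamp'] = created_timestamp  # normalized by get_time
        self.feeds.insert_one(doc)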
    # constructor variant that seeds the crawler with an explicit root uid
    # instead of reading the module-level start_uid
    def __init__(self, uid):
        self.db_manager = CrawlDatabaseManager(10)
        self.root_uid = uid