def __init__(self, search_key, user_name=USER_NAME, passwd=PASSWD):
    """Log in to sina weibo through a PhantomJS driver and keep the
    stripped search keyword for later crawling.

    Exits the process when the login fails, since nothing can be
    crawled without an authenticated session.
    """
    self.driver = webdriver.PhantomJS()
    # WeiboLogin wraps the authorization flow on top of the driver.
    self.wl = WeiboLogin(user_name, passwd, self.driver)
    if self.wl.login():
        logging.info('login successfully')
    else:
        # was: logging.info('login faied') -- fixed the typo and log
        # the failure at ERROR level before aborting
        logging.error('login failed')
        sys.exit(1)
    self.sk = search_key.strip()
def __init__(self, name="*****@*****.**", password="******", uid="1709818975", *args, **kwargs):
    """Initialize the spider: reuse a cached weibo cookie from
    weibocookie.json when present, otherwise log in and persist a
    fresh one.

    NOTE(review): the default credentials are placeholders; pass real
    ones as spider arguments rather than hard-coding them here.
    """
    super(UserInfoCrawl, self).__init__(*args, **kwargs)
    self.uid = uid
    self.start_urls = ["http://weibo.com"]
    self.allowed_domains = ["weibo.com", "weibo.cn"]
    self.url_base = "http://weibo.cn"
    self.first_flag_info = True  # do not crawl our own weibo
    self.first_flag_home = True  # our own profile is handled differently from other accounts
    if os.path.exists("weibocookie.json"):
        # Reuse the cached login cookie to avoid logging in again.
        with open("weibocookie.json", "r") as f:
            self.cookie = json.load(f)
    else:
        self.weibo = WeiboLogin()
        self.session = self.weibo.login(name, password)
        cookiejar = requests.utils.dict_from_cookiejar(self.session.cookies)
        # Keep only the sina weibo auth cookies; a missing key means the
        # login response changed, and the KeyError should surface here.
        auth_keys = ('ALF', 'sso_info', 'SUB', 'SUBP',
                     'SUE', 'SUHB', 'SUP', 'SUS')
        self.cookie = {k: cookiejar[k] for k in auth_keys}
        with open("weibocookie.json", "w") as f:
            json.dump(self.cookie, f)
def get_weibo_token(self, appkey, appsecret, url, username, password):
    """Run the weibo OAuth2 flow and store an authorized API client
    on ``self.weibo_client``.
    """
    logging.info("preparing weibo OAuth2:")
    logging.info("appkey: %s username: %s" % (appkey, username))
    client = APIClient(app_key=appkey, app_secret=appsecret, redirect_uri=url)
    self.weibo_client = client
    # Simulated login yields the authorization code for the token exchange.
    code = WeiboLogin(username, password, appkey, url).get_code()
    logging.info("code: %s" % code)
    token = client.request_access_token(code)
    client.set_access_token(token.access_token, token.expires_in)
    logging.info("token: %s" % token.access_token)
def __init__(self, name="*****@*****.**", password="******", uid="09424248189", *args, **kwargs):
    """Initialize the spider: reuse a cached weibo cookie from
    weibocookie.json when present, otherwise log in and persist a
    fresh one.

    NOTE(review): the default credentials are placeholders; pass real
    ones as spider arguments rather than hard-coding them here.
    """
    super(UserInfoCrawl, self).__init__(*args, **kwargs)
    self.uid = uid
    self.start_urls = ["http://weibo.com"]
    self.allowed_domains = ["weibo.com", "weibo.cn"]
    self.url_base = "http://weibo.cn"
    self.first_flag_info = True  # do not crawl our own weibo
    self.first_flag_home = True  # our own profile is handled differently from other accounts
    if os.path.exists("weibocookie.json"):
        # Reuse the cached login cookie to avoid logging in again.
        with open("weibocookie.json", "r") as f:
            self.cookie = json.load(f)
    else:
        self.weibo = WeiboLogin()
        self.session = self.weibo.login(name, password)
        cookiejar = requests.utils.dict_from_cookiejar(self.session.cookies)
        # Keep only the sina weibo auth cookies; a missing key means the
        # login response changed, and the KeyError should surface here.
        auth_keys = ('ALF', 'sso_info', 'SUB', 'SUBP',
                     'SUE', 'SUHB', 'SUP', 'SUS')
        self.cookie = {k: cookiejar[k] for k in auth_keys}
        with open("weibocookie.json", "w") as f:
            json.dump(self.cookie, f)
class UserInfoCrawl(Spider):
    """Scrapy spider that crawls a weibo.cn user's homepage, profile
    info, newest weibo and follow list, starting from ``uid`` and
    fanning out through the follow graph.
    """

    name = "weibo_user_info"
    # allowed_domains = ["weibo.cn"]

    def __init__(self, name="*****@*****.**", password="******", uid="09424248189", *args, **kwargs):
        # NOTE(review): default credentials are placeholders; pass real ones in.
        super(UserInfoCrawl, self).__init__(*args, **kwargs)
        self.uid = uid
        self.start_urls = ["http://weibo.com"]
        self.allowed_domains = ["weibo.com", "weibo.cn"]
        self.url_base = "http://weibo.cn"
        self.first_flag_info = True  # do not crawl our own weibo
        self.first_flag_home = True  # our own profile is handled differently from other accounts
        if os.path.exists("weibocookie.json"):
            # Reuse the cached login cookie to avoid logging in again.
            with open("weibocookie.json", "r") as f:
                self.cookie = json.load(f)
        else:
            self.weibo = WeiboLogin()
            self.session = self.weibo.login(name, password)
            cookiejar = requests.utils.dict_from_cookiejar(self.session.cookies)
            # Set sina weibo cookie
            self.cookie = {'ALF': cookiejar['ALF'],
                           'sso_info': cookiejar['sso_info'],
                           'SUB': cookiejar['SUB'],
                           'SUBP': cookiejar['SUBP'],
                           'SUE': cookiejar['SUE'],
                           'SUHB': cookiejar['SUHB'],
                           'SUP': cookiejar['SUP'],
                           'SUS': cookiejar['SUS']}
            with open("weibocookie.json", "w") as f:
                json.dump(self.cookie, f)

    def start_requests(self):
        """Kick off crawling at the target user's weibo.cn homepage."""
        home_url = "http://weibo.cn/u/%s" % self.uid
        yield Request(url=home_url, cookies=self.cookie,
                      callback=self._parse_homepage, errback=self.parse_error)

    def _parse_homepage(self, response):
        """Parse a homepage: yield the newest weibo and the social
        counters, then request the profile page and the follow list."""
        html = response.body
        soup = BeautifulSoup(html, "lxml")
        # fans count (plus the uid parsed from the same element)
        fans_count, uid = self.get_fans_count(soup)
        # number of weibo posts
        weibo_count = self.get_weibo_count(soup)
        # follow count plus the url of the follow list
        follow_count, follow_url = self.get_follows(soup)
        # weibo content; only the first one is crawled
        weibo_item = self.parse_weibo_context(soup, uid)
        if weibo_item is not None:
            yield weibo_item
        weibo_social = WeiboSocialConnection()
        weibo_social["user_id"] = uid
        weibo_social["weibo"] = weibo_count
        weibo_social["fans"] = fans_count
        weibo_social["follow"] = follow_count
        if weibo_count > 10:
            yield weibo_social
        # profile page (the "资料" link)
        detail_url_ele = soup.find("a", text=u"资料")
        if detail_url_ele:
            detail_url = self.url_base + detail_url_ele["href"]
            yield Request(url=detail_url, cookies=self.cookie,
                          callback=self.parse_info, errback=self.parse_error,
                          priority=1)
        if follow_url:
            yield Request(url=follow_url, cookies=self.cookie,
                          callback=self.parse_follow, errback=self.parse_error)

    def parse_error(self, response):
        """Errback: log the url of the failed request."""
        logger.error("post:%s" % response.url)

    def parse_info(self, response):
        """Parse the profile page into a WeiboUserInfoItem (nickname,
        sex, region, birthday, abstract)."""
        html = response.body
        soup = BeautifulSoup(html, "lxml")
        info_tip_ele = soup.find("div", text=u"基本信息")
        uid = self.get_uid_from_response(response)
        info = {}
        if info_tip_ele:
            info_ele = info_tip_ele.next_sibling
            if self.first_flag_info:
                # The first profile seen is our own account: skip it.
                self.first_flag_info = False
                # info_eles = info_ele.find_all("a")
                # for ele in info_eles:
                #     if ele.text in [u"昵称", u"性别", u"地区", u"生日", u"简介"]:
                #         info[ele.text.encode("utf-8")] = ele.next_sibling.encode("utf-8")
                #         print ele.text, ele.next_sibling
            else:
                info_eles = info_ele.strings
                user_info = WeiboUserInfoItem()
                user_info["user_id"] = uid
                for ele in info_eles:
                    # Each profile line looks like "<label>:<value>".
                    el = ele.split(":")
                    if len(el) == 2 and el[0] in [
                            u"昵称", u"性别", u"地区", u"生日", u"简介"
                    ]:
                        info[el[0]] = el[1]
                        info_item = el[1].encode("utf-8")
                        if el[0] == u"昵称":  # nickname
                            user_info["user_name"] = info_item
                        elif el[0] == u"性别":  # sex
                            user_info["sex"] = info_item
                        elif el[0] == u"地区":  # region: "province city" or city only
                            region = info_item.split(" ")
                            if len(region) == 1:
                                user_info["province"] = ""
                                user_info["city"] = region[0]
                            else:
                                user_info["province"] = region[0]
                                user_info["city"] = region[1]
                        elif el[0] == u"生日":  # birthday; placeholder year 2050 when only month-day given
                            if len(info_item.split("-")) < 3:
                                user_info["birthday"] = "2050-" + info_item
                            else:
                                user_info["birthday"] = info_item
                            # discard anything that is not strictly YYYY-MM-DD
                            p = re.compile(r"^\d{4}-\d{2}-\d{2}$")
                            if not p.findall(user_info["birthday"]):
                                user_info["birthday"] = None
                        elif el[0] == u"简介":  # abstract; strip spaces, newlines, nbsp and some emoji byte sequences
                            user_info["abstract"] = info_item.encode("utf-8", "ignore").replace(" ", "").\
                                replace("\n", "").replace("\xc2\xa0", "").replace("\xF0\x9F\x91\x8A", "").\
                                replace("\xF0\x9F\x91\xBC", "").replace("\xF0\x9F\x8C\xB8\xF0\x9F", "")
                yield user_info

    def parse_follow(self, response):
        """Queue the homepage of every user linked from the follow list."""
        html = response.body
        soup = BeautifulSoup(html, "lxml")
        table_eles = soup.find_all("table")
        for ele in table_eles:
            follower_url = ele.find("a")["href"]
            yield Request(url=follower_url, cookies=self.cookie,
                          callback=self._parse_homepage, errback=self.parse_error)

    def get_uid_from_response(self, response):
        """Extract the numeric uid from a response (or a raw url
        string); returns 0 when no digit run is found."""
        if isinstance(response, str):
            url = response
        else:
            url = response.url
        pattern = re.compile(r'/(\d+)/?')
        res = re.findall(pattern, url)
        id = 0
        if res:
            id = int(res[0])
        # print "id:", id
        return id

    def parse_weibo_context(self, soup, uid):
        """Parse the newest weibo on a homepage into a WeiboItem (text,
        issue time, like/relay/comment counts, device).  Returns None
        for the first homepage crawled (our own account)."""
        weibo_info = WeiboItem()
        if self.first_flag_home:
            self.first_flag_home = False
            return None
        else:
            contexts = soup.find_all("div", class_="c")
            for item in contexts:
                try:
                    context = item.find("span", class_="ctt")
                    if not context:
                        continue
                    # strip spaces, newlines, nbsp and some emoji byte sequences
                    weibo_text = context.text.encode("utf-8", "ignore").replace(" ", "").\
                        replace("\n", "").replace("\xc2\xa0", "").replace("\xF0\x9F\x91\x8A", "").\
                        replace("\xF0\x9F\x91\xBC", "").replace("\xF0\x9F\x8C\xB8\xF0\x9F", "")
                    parent_ele = context.parent.parent
                    # counters are rendered as 赞[n] / 转发[n] / 评论[n]
                    like_ele = parent_ele.find(text=re.compile(u"^赞\[\d*\]$"))
                    relay_ele = parent_ele.find(
                        text=re.compile(u"^转发\[\d*\]$"))
                    comment_ele = parent_ele.find(
                        text=re.compile(u"^评论\[\d*\]$"))
                    issue_time_ele = parent_ele.find("span", class_="ct")
                    issue_time = issue_time_ele.text
                    issue_time = issue_time.encode("utf-8")
                    # the span reads "<time>来自<device>"
                    issue = issue_time.split("来自")
                    issue_datetime = ""
                    if len(issue) > 0:
                        if "分钟" in issue[0]:
                            # "N分钟前": subtract N minutes from now
                            min = filter(str.isdigit, issue[0])
                            t = datetime.datetime.now() - datetime.timedelta(
                                minutes=int(min))
                            issue_datetime = t.strftime("%Y-%m-%d %H:%M:%S")
                        elif "今天" in issue[0]:
                            # "今天 HH:MM": prepend today's date
                            time = issue[0].replace("今天 ", "").replace(
                                "\xc2\xa0", "")
                            issue_datetime = datetime.datetime.now().strftime(
                                "%Y-%m-%d ") + time
                        else:
                            # "MM月DD日 ..." -> "MM-DD ..."; prepend the year when absent
                            issue_datetime = issue[0].replace(
                                "月", "-").replace("日", "").replace("\xc2\xa0", "")
                            if issue[0].count("-") < 2:
                                issue_datetime = datetime.datetime.now(
                                ).strftime("%Y-") + issue_datetime
                    issue_device = issue[1] if len(issue) > 1 else None
                    weibo_info["context"] = weibo_text
                    weibo_info["user_id"] = uid
                    weibo_info["issue_time"] = issue_datetime.strip()
                    weibo_info["get_time"] = datetime.datetime.now().strftime(
                        "%Y-%m-%d %H:%M:%S")
                    # counts are kept as the digit characters filtered from the label
                    weibo_info["like_count"] = filter(str.isdigit,
                                                      like_ele.encode("utf-8"))
                    weibo_info["relay_count"] = filter(
                        str.isdigit, relay_ele.encode("utf-8"))
                    weibo_info["comment_count"] = filter(
                        str.isdigit, comment_ele.encode("utf-8"))
                    weibo_info["device"] = issue_device
                    # print issue_datetime, issue_device, weibo_text
                    # print like_ele.encode("utf-8"), relay_ele.encode("utf-8"), comment_ele.encode("utf-8")
                    return weibo_info  # only the first weibo is crawled
                except Exception, e:
                    logger.error(e)
#coding:utf-8 import urllib2 import post_encode from weibo_login import WeiboLogin import get_weibo if __name__ == '__main__': Login = WeiboLogin('17089368196', 'tttt5555') if Login.login() == True: print "登录成功" #可以根据page来循环以便达到爬取多页的目的 html = urllib2.urlopen("http://s.weibo.com/weibo/%25E5%2591%25A8%25E6%2589%25AC%25E9%259D%2592&page=3").read() #调用解析html内容的函数 get_weibo.write_all_info(html)
import requests import post_encode from weibo_login import WeiboLogin import get_weibo import re from lxml import html from lxml import etree import string import random import time import os import sys reload(sys) sys.setdefaultencoding('utf-8') #if __name__ == '__main__': Login = WeiboLogin('*****@*****.**', 'dan5493') if Login.login() == True: print "登录成功" urls_1, names_1 = get_weibo.get_url() length = len(urls_1) for i in range(length): urls = [] names_2 = re.sub('\\\\n', '', names_1[i]) names_3 = re.sub(' ', '', names_2) new_path = get_weibo.path(names_3.decode('unicode_escape')) url_1 = re.sub('\\\\', '', urls_1[i]) url_2 = 'http://s.weibo.com' + url_1 url_3 = re.sub('Refer=top', 'page=1.html', url_2) #每一话题的第一页 print url_3 sleeptime_rand = random.randint(3, 10) time.sleep(sleeptime_rand)
class UserInfoCrawl(Spider):
    """Scrapy spider that crawls a weibo.cn user's homepage, profile
    info, newest weibo and follow list, starting from ``uid`` and
    fanning out through the follow graph.
    """

    name = "weibo_user_info"
    # allowed_domains = ["weibo.cn"]

    def __init__(self, name="*****@*****.**", password="******", uid="1709818975", *args, **kwargs):
        # NOTE(review): default credentials are placeholders; pass real ones in.
        super(UserInfoCrawl, self).__init__(*args, **kwargs)
        self.uid = uid
        self.start_urls = ["http://weibo.com"]
        self.allowed_domains = ["weibo.com", "weibo.cn"]
        self.url_base = "http://weibo.cn"
        self.first_flag_info = True  # do not crawl our own weibo
        self.first_flag_home = True  # our own profile is handled differently from other accounts
        if os.path.exists("weibocookie.json"):
            # Reuse the cached login cookie to avoid logging in again.
            with open("weibocookie.json", "r") as f:
                self.cookie = json.load(f)
        else:
            self.weibo = WeiboLogin()
            self.session = self.weibo.login(name, password)
            cookiejar = requests.utils.dict_from_cookiejar(self.session.cookies)
            # Set sina weibo cookie
            self.cookie = {'ALF': cookiejar['ALF'],
                           'sso_info': cookiejar['sso_info'],
                           'SUB': cookiejar['SUB'],
                           'SUBP': cookiejar['SUBP'],
                           'SUE': cookiejar['SUE'],
                           'SUHB': cookiejar['SUHB'],
                           'SUP': cookiejar['SUP'],
                           'SUS': cookiejar['SUS']}
            with open("weibocookie.json", "w") as f:
                json.dump(self.cookie, f)

    def start_requests(self):
        """Kick off crawling at the target user's weibo.cn homepage."""
        home_url = "http://weibo.cn/u/%s" % self.uid
        yield Request(url=home_url, cookies=self.cookie,
                      callback=self._parse_homepage, errback=self.parse_error)

    def _parse_homepage(self, response):
        """Parse a homepage: yield the newest weibo and the social
        counters, then request the profile page and the follow list."""
        html = response.body
        soup = BeautifulSoup(html, "lxml")
        # fans count (plus the uid parsed from the same element)
        fans_count, uid = self.get_fans_count(soup)
        # number of weibo posts
        weibo_count = self.get_weibo_count(soup)
        # follow count plus the url of the follow list
        follow_count, follow_url = self.get_follows(soup)
        # weibo content; only the first one is crawled
        weibo_item = self.parse_weibo_context(soup, uid)
        if weibo_item is not None:
            yield weibo_item
        weibo_social = WeiboSocialConnection()
        weibo_social["user_id"] = uid
        weibo_social["weibo"] = weibo_count
        weibo_social["fans"] = fans_count
        weibo_social["follow"] = follow_count
        if weibo_count > 10:
            yield weibo_social
        # profile page (the "资料" link)
        detail_url_ele = soup.find("a", text=u"资料")
        if detail_url_ele:
            detail_url = self.url_base + detail_url_ele["href"]
            yield Request(url=detail_url, cookies=self.cookie,
                          callback=self.parse_info, errback=self.parse_error,
                          priority=1)
        if follow_url:
            yield Request(url=follow_url, cookies=self.cookie,
                          callback=self.parse_follow, errback=self.parse_error)

    def parse_error(self, response):
        """Errback: log the url of the failed request."""
        logger.error("post:%s" % response.url)

    def parse_info(self, response):
        """Parse the profile page into a WeiboUserInfoItem (nickname,
        sex, region, birthday, abstract)."""
        html = response.body
        soup = BeautifulSoup(html, "lxml")
        info_tip_ele = soup.find("div", text=u"基本信息")
        uid = self.get_uid_from_response(response)
        info = {}
        if info_tip_ele:
            info_ele = info_tip_ele.next_sibling
            if self.first_flag_info:
                # The first profile seen is our own account: skip it.
                self.first_flag_info = False
                # info_eles = info_ele.find_all("a")
                # for ele in info_eles:
                #     if ele.text in [u"昵称", u"性别", u"地区", u"生日", u"简介"]:
                #         info[ele.text.encode("utf-8")] = ele.next_sibling.encode("utf-8")
                #         print ele.text, ele.next_sibling
            else:
                info_eles = info_ele.strings
                user_info = WeiboUserInfoItem()
                user_info["user_id"] = uid
                for ele in info_eles:
                    # Each profile line looks like "<label>:<value>".
                    el = ele.split(":")
                    if len(el) == 2 and el[0] in [u"昵称", u"性别", u"地区", u"生日", u"简介"]:
                        info[el[0]] = el[1]
                        info_item = el[1].encode("utf-8")
                        if el[0] == u"昵称":  # nickname
                            user_info["user_name"] = info_item
                        elif el[0] == u"性别":  # sex
                            user_info["sex"] = info_item
                        elif el[0] == u"地区":  # region: "province city" or city only
                            region = info_item.split(" ")
                            if len(region) == 1:
                                user_info["province"] = ""
                                user_info["city"] = region[0]
                            else:
                                user_info["province"] = region[0]
                                user_info["city"] = region[1]
                        elif el[0] == u"生日":  # birthday; placeholder year 2050 when only month-day given
                            if len(info_item.split("-")) < 3:
                                user_info["birthday"] = "2050-" + info_item
                            else:
                                user_info["birthday"] = info_item
                            # discard anything that is not strictly YYYY-MM-DD
                            p = re.compile(r"^\d{4}-\d{2}-\d{2}$")
                            if not p.findall(user_info["birthday"]):
                                user_info["birthday"] = None
                        elif el[0] == u"简介":  # abstract; strip spaces, newlines, nbsp and some emoji byte sequences
                            user_info["abstract"] = info_item.encode("utf-8", "ignore").replace(" ", "").\
                                replace("\n", "").replace("\xc2\xa0", "").replace("\xF0\x9F\x91\x8A", "").\
                                replace("\xF0\x9F\x91\xBC", "").replace("\xF0\x9F\x8C\xB8\xF0\x9F", "")
                yield user_info

    def parse_follow(self, response):
        """Queue the homepage of every user linked from the follow list."""
        html = response.body
        soup = BeautifulSoup(html, "lxml")
        table_eles = soup.find_all("table")
        for ele in table_eles:
            follower_url = ele.find("a")["href"]
            yield Request(url=follower_url, cookies=self.cookie,
                          callback=self._parse_homepage, errback=self.parse_error)

    def get_uid_from_response(self, response):
        """Extract the numeric uid from a response (or a raw url
        string); returns 0 when no digit run is found."""
        if isinstance(response, str):
            url = response
        else:
            url = response.url
        pattern = re.compile(r'/(\d+)/?')
        res = re.findall(pattern, url)
        id = 0
        if res:
            id = int(res[0])
        # print "id:", id
        return id

    def parse_weibo_context(self, soup, uid):
        """Parse the newest weibo on a homepage into a WeiboItem (text,
        issue time, like/relay/comment counts, device).  Returns None
        for the first homepage crawled (our own account)."""
        weibo_info = WeiboItem()
        if self.first_flag_home:
            self.first_flag_home = False
            return None
        else:
            contexts = soup.find_all("div", class_="c")
            for item in contexts:
                try:
                    context = item.find("span", class_="ctt")
                    if not context:
                        continue
                    # strip spaces, newlines, nbsp and some emoji byte sequences
                    weibo_text = context.text.encode("utf-8", "ignore").replace(" ", "").\
                        replace("\n", "").replace("\xc2\xa0", "").replace("\xF0\x9F\x91\x8A", "").\
                        replace("\xF0\x9F\x91\xBC", "").replace("\xF0\x9F\x8C\xB8\xF0\x9F", "")
                    parent_ele = context.parent.parent
                    # counters are rendered as 赞[n] / 转发[n] / 评论[n]
                    like_ele = parent_ele.find(text=re.compile(u"^赞\[\d*\]$"))
                    relay_ele = parent_ele.find(text=re.compile(u"^转发\[\d*\]$"))
                    comment_ele = parent_ele.find(text=re.compile(u"^评论\[\d*\]$"))
                    issue_time_ele = parent_ele.find("span", class_="ct")
                    issue_time = issue_time_ele.text
                    issue_time = issue_time.encode("utf-8")
                    # the span reads "<time>来自<device>"
                    issue = issue_time.split("来自")
                    issue_datetime = ""
                    if len(issue) > 0:
                        if "分钟" in issue[0]:
                            # "N分钟前": subtract N minutes from now
                            min = filter(str.isdigit, issue[0])
                            t = datetime.datetime.now() - datetime.timedelta(minutes=int(min))
                            issue_datetime = t.strftime("%Y-%m-%d %H:%M:%S")
                        elif "今天" in issue[0]:
                            # "今天 HH:MM": prepend today's date
                            time = issue[0].replace("今天 ", "").replace("\xc2\xa0", "")
                            issue_datetime = datetime.datetime.now().strftime("%Y-%m-%d ") + time
                        else:
                            # "MM月DD日 ..." -> "MM-DD ..."; prepend the year when absent
                            issue_datetime = issue[0].replace("月", "-").replace("日", "").replace("\xc2\xa0", "")
                            if issue[0].count("-") < 2:
                                issue_datetime = datetime.datetime.now().strftime("%Y-") + issue_datetime
                    issue_device = issue[1] if len(issue) > 1 else None
                    weibo_info["context"] = weibo_text
                    weibo_info["user_id"] = uid
                    weibo_info["issue_time"] = issue_datetime.strip()
                    weibo_info["get_time"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                    # counts are kept as the digit characters filtered from the label
                    weibo_info["like_count"] = filter(str.isdigit, like_ele.encode("utf-8"))
                    weibo_info["relay_count"] = filter(str.isdigit, relay_ele.encode("utf-8"))
                    weibo_info["comment_count"] = filter(str.isdigit, comment_ele.encode("utf-8"))
                    weibo_info["device"] = issue_device
                    # print issue_datetime, issue_device, weibo_text
                    # print like_ele.encode("utf-8"), relay_ele.encode("utf-8"), comment_ele.encode("utf-8")
                    return weibo_info  # only the first weibo is crawled
                except Exception, e:
                    logger.error(e)
from weibo import APIClient, APIError
from weibo_login import WeiboLogin, WeiboLoginError

# OAuth2 application settings.
# NOTE(review): app secret and account password are hard-coded; move
# them to configuration or environment variables.
APP_KEY = "3226611318"
APP_SECRET = "4f94b19d1d30c6bce2505e69d22cd62e"
CALLBACK_URL = "https://api.weibo.com/oauth2/default.html"

print("start login...")
client = APIClient(app_key=APP_KEY,
                   app_secret=APP_SECRET,
                   redirect_uri=CALLBACK_URL)

# Obtain the authorization code via simulated login; abort on failure.
code = ''
try:
    login = WeiboLogin("*****@*****.**", "s2013h1cfr", APP_KEY, CALLBACK_URL)
    code = login.get_code()
except WeiboLoginError as e:
    print("Login Fail [%s]: %s" % (e.error_code, e.error))
    exit(1)
print("code: %s" % code)

# Exchange the code for an access token and attach it to the client.
token_info = client.request_access_token(code)
access_token = token_info.access_token
expires_in = token_info.expires_in
print("token: %s" % access_token)
print("expires in %s" % expires_in)
client.set_access_token(access_token, expires_in)
class WeiboCrawler():
    '''
    Crawl weibo search results by keyword through a PhantomJS browser.

    Lifecycle: __init__ logs in (exits the process on failure), crawl()
    fills self.results, save() dumps them to disk and crawl_comments()
    enriches them with comment lists.
    '''

    def __init__(self, search_key, user_name=USER_NAME, passwd=PASSWD):
        # login to sinaweibo
        self.driver = webdriver.PhantomJS()
        self.wl = WeiboLogin(user_name, passwd, self.driver)  # the interface for authorization
        if self.wl.login():
            logging.info('login successfully')
        else:
            logging.info('login faied')  # NOTE(review): typo, should read "login failed" at ERROR level
            sys.exit(1)
        self.sk = search_key.strip()
        return

    def __del__(self):
        # make sure the PhantomJS process is shut down with the object
        self.driver.quit()
        return

    def crawl(self, page_count=1, comments=False):
        '''
        crawl the weibo using the keywords
        page_count: how many pages would be crawled
        comments: also crawl each result's comments when True
        '''
        self.results = []
        # get the mids from each result page; pages are visited in
        # random order to look less bot-like
        pages = list(range(1, page_count+1))
        random.shuffle(pages)
        # for/else/break is the classic break-out-of-nested-loops idiom;
        # it is inert while the inner `break` below stays commented out,
        # so both sort orders are iterated.
        for t in ('hot', 'time'):
            for i in pages:
                # NOTE(review): `t` is never passed to get_search_url,
                # so w_type is always the default 'hot' -- confirm intent.
                url_to_crawl = self.get_search_url(i)
                logging.info('crawling page {}:{}'.format(i, url_to_crawl))
                self.driver.get(url_to_crawl)
                # wait the page loading the content
                try:
                    element = WebDriverWait(self.driver, 5).until(
                        lambda x: x.find_elements_by_class_name('feed_list')
                    )
                except TimeoutException:
                    logging.info('there is no weibo content in {}'.format(url_to_crawl))
                    logging.info('you are considered as a robot')
                    logging.info(self.driver.current_url)
                    self.driver.get_screenshot_as_file('./screenshot/error.png')
                    # let user input the verification code
                    verify_user(self.driver, 'search')
                    # break
                weibo_list = self.get_weibo_list(self.driver.page_source)
                # mid is used to crawl the original weibo content, using batch mode
                self.results.extend(weibo_list)
                # sleep some time to prevent hitting too much
                # time.sleep(1)
            else:
                continue
            break
        # for r in results:
        #     logging.info_dict(r)
        logging.info('total result {}'.format(len(self.results)))
        if comments:
            logging.info('crawling the comments')
            self.crawl_comments()
        return

    def get_search_url(self, page=1, w_type='hot'):
        '''
        compose a search url based on page_num and weibo type
        '''
        # logging.info('generating the url')
        url = ''
        url += 'http://'
        url += search_domain
        url += '/wb'
        url += urllib.parse.quote('/'+self.sk)
        url += '&'
        url += urllib.parse.urlencode([
            ('page', page),
            ('xsort', w_type)
        ])
        return url

    def get_weibo_list(self, content):
        '''
        parse the weibo content in the current result page
        content: the source page of the keywords result
        return: a list of weibo object
        '''
        weibo_list = []
        soup = BeautifulSoup(content, 'html5lib')
        # only <dl class="feed_list"> nodes carrying a mid are real posts
        for t in soup.find_all('dl', class_='feed_list'):
            if t.has_attr('mid'):
                weibo = self.parse_weibo(t)
                if weibo:
                    weibo_list.append(weibo)
        logging.info('There are {} weibo on this page'.format(len(weibo_list)))
        return weibo_list

    def parse_weibo(self, t):
        '''
        parse weibo object from html
        t: the tag object that has weibo content
        Return weibo object (dict), or None when any expected field is missing
        '''
        weibo = {}
        try:
            weibo['keywords'] = self.sk.split(' ')  # keywords is a list of words
            weibo['mid'] = t['mid']
            # the user name
            weibo['screen_name'] = t.find(name='dt', class_='face').find('a').get('title')
            weibo['user_profile'] = t.find(name='dt', class_='face').find('a').get('href')
            # the content of weibo
            weibo['text'] = t.find(name='dd', class_='content').find('em').get_text().strip()
            # the source url of the weibo
            weibo['source_url'] = t.find(name='a', class_='date').get('href').strip()
            logging.info(weibo['source_url'])
            # logging.info(weibo['text'])
            # meta data: the 'date' attribute starts with an epoch timestamp
            epoch_length = len(str(int(time.time())))
            time_str = t.find('dd', class_='content').find('p', class_='info W_linkb W_textb').find(name='a', class_='date').get('date')[0:epoch_length]
            time_now = time.localtime(int(time_str))
            weibo['created_at'] = datetime.datetime(*time_now[0:6])
            weibo['source'] = t.find('dd', class_='content').find('p', class_='info W_linkb W_textb').find('a', rel='nofollow').string.strip()
            pop_str = t.find('dd', class_='content').find('p', class_='info W_linkb W_textb').find('span').get_text().strip().replace('\n', '')
            pop_type = {
                # key: source representation, value: attr
                '赞': 'like_count',
                '转发': 'repost_count',
                '评论': 'comment_count'
            }
            # counters appear as e.g. "赞(12)"; missing ones default to 0
            for key in list(pop_type.keys()):
                pattern = re.compile(r'.*(%s\((\d+)\)).*' % key)
                match = pattern.match(pop_str)
                if match:
                    # logging.info match.group(1)
                    # logging.info match.group(2)
                    weibo[pop_type[key]] = int(match.group(2))
                else:
                    # logging.info key, 'not found.'
                    weibo[pop_type[key]] = 0
        except Exception as e:
            logging.info(e)
            return None
        # logging.info_dict(weibo)
        return weibo

    def save(self, dist_dir='result'):
        '''
        save the search results to file, one json file per weibo
        (named "<keywords>_<mid>.txt")
        '''
        if dist_dir not in os.listdir(os.curdir):
            os.mkdir(dist_dir)
        for w in self.results:
            file_name = ''.join([
                '_'.join([k for k in w['keywords']]),
                w['mid']
            ])
            file_name += '.txt'
            f = codecs.open(os.path.join(dist_dir, file_name), 'w', 'utf-8')
            json.dump(w, f, ensure_ascii = False, default=json_util.default, indent = 2)
            # logging.info(w['text'])
            logging.info('writed to file {}'.format(file_name))
        return

    def crawl_comments(self):
        '''
        crawl the comments after getting all the results
        and update the results list --> self
        '''
        client = self.wl.authorize_app()
        if client:
            for w in self.results:
                # logging.info(w['mid'])
                w['comments'] = []
                crawler = WeiboCommentsCrawler(client, weibo_mid = w['mid'])
                r = crawler.crawl()
                # filter out the unrelated fields
                for c in r:
                    c.pop('status')
                w['comments'].extend(r)
        else:
            logging.error('认证失败,不能获取评论列表')
        return
# -*- coding:utf-8 -*- import sys from weibo_login import WeiboLogin from spider.SearchSpider import SearchSpider from spider.RCSpider import RCSpider import requests import time import csv from logconfig import LogConfig logger = LogConfig.get_logger() reload(sys) sys.setdefaultencoding("utf-8") # 登录,保存Session s_login = requests.session() w = WeiboLogin() s_login = w.login_un() logger.info("login has finished") time.sleep(1.5) Search_urls = [ # 搜索关键词语:杨洋 傅园慧 可以运行 "http://m.weibo.cn/page/pageJson?containerid=&containerid=100103type%3D1%26q%3D%E6%9D%A8%E6%B4%8B+%E5%82%85%E5%9B%AD%E6%85%A7&type=all&queryVal=%E6%9D%A8%E6%B4%8B+%E5%82%85%E5%9B%AD%E6%85%A7&luicode=20000174&title=%E6%9D%A8%E6%B4%8B+%E5%82%85%E5%9B%AD%E6%85%A7&v_p=11&ext=&fid=100103type%3D1%26q%3D%E6%9D%A8%E6%B4%8B+%E5%82%85%E5%9B%AD%E6%85%A7&uicode=10000011&next_cursor=&page=", # 搜索关键词语:傅园慧 可以运行 "http://m.weibo.cn/page/pageJson?containerid=&containerid=100103type%3D1%26q%3D%E5%82%85%E5%9B%AD%E6%85%A7&type=all&queryVal=%E5%82%85%E5%9B%AD%E6%85%A7&luicode=10000011&lfid=100103type%3D1%26q%3D%E5%82%85%E5%9B%AD%E6%85%A7&title=%E5%82%85%E5%9B%AD%E6%85%A7&v_p=11&ext=&fid=100103type%3D1%26q%3D%E5%82%85%E5%9B%AD%E6%85%A7&uicode=10000011&next_cursor=&page=", # 搜索关键词语:洪荒少女 可以运行 "http://m.weibo.cn/page/pageJson?containerid=&containerid=100103type%3D1%26q%3D%E6%B4%AA%E8%8D%92%E5%B0%91%E5%A5%B3&type=all&queryVal=%E6%B4%AA%E8%8D%92%E5%B0%91%E5%A5%B3&title=%E6%B4%AA%E8%8D%92%E5%B0%91%E5%A5%B3&v_p=11&ext=&fid=100103type%3D1%26q%3D%E6%B4%AA%E8%8D%92%E5%B0%91%E5%A5%B3&uicode=10000011&next_cursor=&page=", # 搜索关键词语:洪荒少女傅园慧 可以运行 
"http://m.weibo.cn/page/pageJson?containerid=&containerid=100103type%3D1%26q%3D%E6%B4%AA%E8%8D%92%E5%B0%91%E5%A5%B3%E5%82%85%E5%9B%AD%E6%85%A7&type=all&queryVal=%E6%B4%AA%E8%8D%92%E5%B0%91%E5%A5%B3%E5%82%85%E5%9B%AD%E6%85%A7&luicode=10000011&lfid=100103type%3D1%26q%3D%E6%B4%AA%E8%8D%92%E5%B0%91%E5%A5%B3&title=%E6%B4%AA%E8%8D%92%E5%B0%91%E5%A5%B3%E5%82%85%E5%9B%AD%E6%85%A7&v_p=11&ext=&fid=100103type%3D1%26q%3D%E6%B4%AA%E8%8D%92%E5%B0%91%E5%A5%B3%E5%82%85%E5%9B%AD%E6%85%A7&uicode=10000011&next_cursor=&page=", # 搜索关键词语:傅园慧表情包 可以运行 "http://m.weibo.cn/page/pageJson?containerid=&containerid=100103type%3D1%26q%3D%E5%82%85%E5%9B%AD%E6%85%A7%E8%A1%A8%E6%83%85%E5%8C%85&type=all&queryVal=%E5%82%85%E5%9B%AD%E6%85%A7%E8%A1%A8%E6%83%85%E5%8C%85&luicode=10000011&lfid=100103type%3D1%26q%3D%E6%B4%AA%E8%8D%92%E5%B0%91%E5%A5%B3%E5%82%85%E5%9B%AD%E6%85%A7&title=%E5%82%85%E5%9B%AD%E6%85%A7%E8%A1%A8%E6%83%85%E5%8C%85&v_p=11&ext=&fid=100103type%3D1%26q%3D%E5%82%85%E5%9B%AD%E6%85%A7%E8%A1%A8%E6%83%85%E5%8C%85&uicode=10000011&next_cursor=&page=",
#coding:utf-8 import urllib2 import post_encode import time import os from weibo_login import WeiboLogin import get_weibo if __name__ == '__main__': #Login = WeiboLogin('17089368196', 'tttt5555') Login = WeiboLogin('用户名', '密码') if Login.login() == True: print "登录成功" rnd = long((time.time()) * 1000) #可以根据page来循环以便达到爬取多页的目的 init_url = "http://weibo.com/aj/v6/mblog/info/big?ajwvr=6&id=3917356680052180&max_id=3917762218921722" url = init_url + "&page=1&_rnd=" + str(rnd) html = urllib2.urlopen(url) #调用解析html内容的函数 forward_html = urllib2.urlopen( "http://weibo.com/1336593085/D7hwE0dzC?type=repost#_rnd1449550776472" ).read() #print forward_html origin_uid = get_weibo.get_origin_weibo(forward_html) (uid, origin_uid2_no, uid2_no, time_no, time_no2, total_forward, total_page, current_page) = get_weibo.get_forward(html, forward_html) #循环抓取多页 print total_page uid = [] origin_uid2 = [] uid2 = [] forward_time = []
class WeiboCrawler():
    '''
    Crawl weibo search results by keyword through a PhantomJS browser.

    Lifecycle: __init__ logs in (exits the process on failure), crawl()
    fills self.results, save() dumps them to disk and crawl_comments()
    enriches them with comment lists.
    '''

    def __init__(self, search_key, user_name=USER_NAME, passwd=PASSWD):
        # login to sinaweibo
        self.driver = webdriver.PhantomJS()
        self.wl = WeiboLogin(user_name, passwd, self.driver)  # the interface for authorization
        if self.wl.login():
            logging.info('login successfully')
        else:
            logging.info('login faied')  # NOTE(review): typo, should read "login failed" at ERROR level
            sys.exit(1)
        self.sk = search_key.strip()
        return

    def __del__(self):
        # make sure the PhantomJS process is shut down with the object
        self.driver.quit()
        return

    def crawl(self, page_count=1, comments=False):
        '''
        crawl the weibo using the keywords
        page_count: how many pages would be crawled
        comments: also crawl each result's comments when True
        '''
        self.results = []
        # get the mids from each result page; pages are visited in
        # random order to look less bot-like
        pages = list(range(1, page_count + 1))
        random.shuffle(pages)
        # for/else/break is the classic break-out-of-nested-loops idiom;
        # it is inert while the inner `break` below stays commented out,
        # so both sort orders are iterated.
        for t in ('hot', 'time'):
            for i in pages:
                # NOTE(review): `t` is never passed to get_search_url,
                # so w_type is always the default 'hot' -- confirm intent.
                url_to_crawl = self.get_search_url(i)
                logging.info('crawling page {}:{}'.format(i, url_to_crawl))
                self.driver.get(url_to_crawl)
                # wait the page loading the content
                try:
                    element = WebDriverWait(self.driver, 5).until(
                        lambda x: x.find_elements_by_class_name('feed_list'))
                except TimeoutException:
                    logging.info(
                        'there is no weibo content in {}'.format(url_to_crawl))
                    logging.info('you are considered as a robot')
                    logging.info(self.driver.current_url)
                    self.driver.get_screenshot_as_file(
                        './screenshot/error.png')
                    # let user input the verification code
                    verify_user(self.driver, 'search')
                    # break
                weibo_list = self.get_weibo_list(
                    self.driver.page_source
                )  # mid is used to crawl the original weibo content, using batch mode
                self.results.extend(weibo_list)
                # sleep some time to prevent hitting too much
                # time.sleep(1)
            else:
                continue
            break
        # for r in results:
        #     logging.info_dict(r)
        logging.info('total result {}'.format(len(self.results)))
        if comments:
            logging.info('crawling the comments')
            self.crawl_comments()
        return

    def get_search_url(self, page=1, w_type='hot'):
        '''
        compose a search url based on page_num and weibo type
        '''
        # logging.info('generating the url')
        url = ''
        url += 'http://'
        url += search_domain
        url += '/wb'
        url += urllib.parse.quote('/' + self.sk)
        url += '&'
        url += urllib.parse.urlencode([('page', page), ('xsort', w_type)])
        return url

    def get_weibo_list(self, content):
        '''
        parse the weibo content in the current result page
        content: the source page of the keywords result
        return: a list of weibo object
        '''
        weibo_list = []
        soup = BeautifulSoup(content, 'html5lib')
        # only <dl class="feed_list"> nodes carrying a mid are real posts
        for t in soup.find_all('dl', class_='feed_list'):
            if t.has_attr('mid'):
                weibo = self.parse_weibo(t)
                if weibo:
                    weibo_list.append(weibo)
        logging.info('There are {} weibo on this page'.format(len(weibo_list)))
        return weibo_list

    def parse_weibo(self, t):
        '''
        parse weibo object from html
        t: the tag object that has weibo content
        Return weibo object (dict), or None when any expected field is missing
        '''
        weibo = {}
        try:
            weibo['keywords'] = self.sk.split(
                ' ')  #keywords is a list of words
            weibo['mid'] = t['mid']
            # the user name
            weibo['screen_name'] = t.find(name='dt',
                                          class_='face').find('a').get('title')
            weibo['user_profile'] = t.find(name='dt',
                                           class_='face').find('a').get('href')
            # the content of weibo
            weibo['text'] = t.find(
                name='dd', class_='content').find('em').get_text().strip()
            # the source url of the weibo
            weibo['source_url'] = t.find(name='a',
                                         class_='date').get('href').strip()
            logging.info(weibo['source_url'])
            # logging.info(weibo['text'])
            # meta data: the 'date' attribute starts with an epoch timestamp
            epoch_length = len(str(int(time.time())))
            time_str = t.find('dd', class_='content').find(
                'p', class_='info W_linkb W_textb').find(
                    name='a', class_='date').get('date')[0:epoch_length]
            time_now = time.localtime(int(time_str))
            weibo['created_at'] = datetime.datetime(*time_now[0:6])
            weibo['source'] = t.find('dd', class_='content').find(
                'p', class_='info W_linkb W_textb').find(
                    'a', rel='nofollow').string.strip()
            pop_str = t.find('dd', class_='content').find(
                'p', class_='info W_linkb W_textb').find(
                    'span').get_text().strip().replace('\n', '')
            pop_type = {
                # key: source representation, value: attr
                '赞': 'like_count',
                '转发': 'repost_count',
                '评论': 'comment_count'
            }
            # counters appear as e.g. "赞(12)"; missing ones default to 0
            for key in list(pop_type.keys()):
                pattern = re.compile(r'.*(%s\((\d+)\)).*' % key)
                match = pattern.match(pop_str)
                if match:
                    # logging.info match.group(1)
                    # logging.info match.group(2)
                    weibo[pop_type[key]] = int(match.group(2))
                else:
                    # logging.info key, 'not found.'
                    weibo[pop_type[key]] = 0
        except Exception as e:
            logging.info(e)
            return None
        # logging.info_dict(weibo)
        return weibo

    def save(self, dist_dir='result'):
        '''
        save the search results to file, one json file per weibo
        (named "<keywords>_<mid>.txt")
        '''
        if dist_dir not in os.listdir(os.curdir):
            os.mkdir(dist_dir)
        for w in self.results:
            file_name = ''.join(
                ['_'.join([k for k in w['keywords']]), w['mid']])
            file_name += '.txt'
            f = codecs.open(os.path.join(dist_dir, file_name), 'w', 'utf-8')
            json.dump(w,
                      f,
                      ensure_ascii=False,
                      default=json_util.default,
                      indent=2)
            # logging.info(w['text'])
            logging.info('writed to file {}'.format(file_name))
        return

    def crawl_comments(self):
        '''
        crawl the comments after getting all the results
        and update the results list --> self
        '''
        client = self.wl.authorize_app()
        if client:
            for w in self.results:
                # logging.info(w['mid'])
                w['comments'] = []
                crawler = WeiboCommentsCrawler(client, weibo_mid=w['mid'])
                r = crawler.crawl()
                # filter out the unrelated fields
                for c in r:
                    c.pop('status')
                w['comments'].extend(r)
        else:
            logging.error('认证失败,不能获取评论列表')
        return
def check_uid(uid): info = wb.get("http://weibo.com/{}".format(uid)).content if "page_error" not in info: matched = user_pat.search(info.decode("utf-8")) if matched: return matched.group(1) else: return True app = Flask(__name__) app.config.from_pyfile("config.py") wb = WeiboLogin(app.config["USERNAME"], app.config["PASSWORD"], app.config["COOKIE_FILE"]) wb.load_cookies() @app.route("/weibo", methods=["POST", "GET"]) def weibo(): wb.test_log_status() args = request.form or request.args user = args.get("user", "") uid = args.get("uid", "") action = args.get("action", "start") door = args.get("door", "") if request.method == "GET": base64_image = "" if not wb.logged: