def main():
    # url = 'http://www.lagou.com/zhaopin/houduankaifa'
    url = 'http://www.lagou.com/jobs/positionAjax.json'
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.115 Safari/537.36',
        'Host': 'www.lagou.com',
        'Origin': 'http://www.lagou.com',
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'X-Requested-With': 'XMLHttpRequest'
    }
    params = {
        'needAdditionalResult': 'False'
    }
    data = {
        'first': 'False',
        'pn': 320,          # page number
        'kd': '后端开发'     # search keyword
    }
    # params = json.dumps(params)
    try:
        r = requests.post(url, headers=headers, params=params, data=data)
        print r.url
        for i in r.json()['content']['positionResult']['result']:
            print i['positionId']
    except (TypeError, ValueError):
        Logging.error(u'page out of range')
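# A minimal paging sketch, assuming the positionAjax.json endpoint behaves as in
# main() above: `pn` is the page number, `kd` the search keyword, and the shared
# module-level `requests` session is reused. `fetch_page` and `fetch_all` are
# hypothetical helpers, not part of the original code; the real endpoint may also
# require the headers used in main().
def fetch_page(keyword, page):
    url = 'http://www.lagou.com/jobs/positionAjax.json'
    data = {'first': 'False', 'pn': page, 'kd': keyword}
    r = requests.post(url, data=data)
    try:
        return r.json()['content']['positionResult']['result']
    except (KeyError, ValueError):
        return []


def fetch_all(keyword, max_pages=5):
    # collect position entries page by page until an empty page is returned
    positions = []
    for page in range(1, max_pages + 1):
        result = fetch_page(keyword, page)
        if not result:
            break
        positions.extend(result)
    return positions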
    def parse(self):
        DOM = BeautifulSoup(self.html, "html.parser")
        el = DOM.find("div", class_="zm-profile-header")
        elem = el.find("div", class_="title-section")

        # Name, Bio (one-line self introduction)
        name = elem.find("a", class_="name").get_text()
        name = re.sub("^\n+|\n+$", "", name)
        bio = elem.find("span", class_="bio").get_text()
        bio = re.sub("^\n+|\n+$", "", bio)

        # SNS info (Weibo | QQ | ...)
        sns = {"weibo": ""}
        wb_el = el.find("div", class_="top").find("div", class_="weibo-wrap")
        try:
            sns["weibo"] = wb_el.find("a", class_="zm-profile-header-user-weibo")["href"]
        except:
            pass

        # avatar
        avatar = el.find("div", class_="body").find("img", class_="avatar")["src"]

        # description
        descp = (
            el.find("div", class_="body")
              .find("span", class_="description")
              .find("span", class_="content")
              .get_text()
        )
        descp = re.sub("^\n+|\n+$", "", descp)

        # hash id
        try:
            self.hash_id = DOM.find("div", class_="zm-profile-header-op-btns").find("button")["data-id"]
        except:
            self.hash_id = ""

        f_el = DOM.find("div", class_="zm-profile-side-following").find_all("strong")
        if len(f_el) < 2:
            followees_num = 0
            followers_num = 0
            followees = []
            followers = []
        else:
            # people this user follows (followees)
            followees_num = int(f_el[0].string.replace("\n", ""))
            followees = self._fetch_followees(followees_num)
            # people following this user (followers)
            followers_num = int(f_el[1].string.replace("\n", ""))
            followers = self._fetch_followers(followers_num)
        print followers

        # followed columns
        # followed topics
        el = DOM.find("div", class_="zm-profile-section-list")

        # achievements (upvotes, thanks, favorites, shares)
        reputation = {"agree": 0, "thanks": 0, "favors": 0, "share": 0}
        elems = el.find("div", class_="zm-profile-details-reputation").find_all("strong")
        if len(elems) == 4:
            reputation["agree"] = int(elems[0].string)
            reputation["thanks"] = int(elems[1].string)
            reputation["favors"] = int(elems[2].string)
            reputation["share"] = int(elems[3].string)
        else:
            Logging.error(u"Failed to parse the user's achievement info")
            Logging.debug(elems)

        # secondary info, to be completed ...
    def get_title(self):
        soup = self.soup
        try:
            title = soup.find_all('h1')[1]['title']
        except (AttributeError, IndexError) as ex:
            Logging.error(u'Failed to parse the job title')
            Logging.error(ex)
            title = None
        return title
    def pull(self):
        url = "http://www.zhihu.com/people/%s/about" % (self.token)
        r = requests.get(url)
        if r.status_code != 200:
            raise IOError("network error.")
        self.html = r.content
        # extract the _xsrf token needed for subsequent POST requests
        try:
            self.xsrf = re.compile(r"input\stype.*?name=.\_xsrf.\svalue=.(\w+).", re.DOTALL).findall(self.html)[0]
        except Exception as e:
            Logging.error(u"Failed to extract the _xsrf value")
            Logging.debug(e)
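    # Usage sketch (assumption: a People object is constructed with the profile
    # token that pull() interpolates into the URL, e.g. "leng-zhe"; the actual
    # constructor is not shown in this snippet).
    #
    #   p = People("leng-zhe")
    #   p.pull()     # fetch http://www.zhihu.com/people/leng-zhe/about and grab _xsrf
    #   p.parse()    # parse name, bio, followees/followers, reputation, ...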
    def get_company(self):
        soup = self.soup
        try:
            company = re.sub(
                r'\s+', '', soup.find('h2', class_='fl').get_text())
            # There should be a better way to get the company name here,
            # one that excludes the "未认证" (unverified) label.
        except AttributeError as ex:
            Logging.error(u'Failed to parse the company name')
            Logging.error(ex)
            company = None
        return company
    def get_description(self):
        soup = self.soup
        try:
            description = soup.find(
                'dd', class_='job_request').find_all('span')
            salary = description[0].string
            location = description[1].string
            experience = description[2].string
            education = description[3].string
            jtype = description[4].string
        except (IndexError, AttributeError) as ex:
            Logging.error(u'Failed to parse the job details')
            Logging.error(ex)
            raise ValueError('{} can\'t be resolved'.format(self.url))
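    # A minimal sketch of how the fields collected in get_description() could be
    # handed back to a caller. The original snippet only assigns local variables;
    # this variant and its dict layout are an assumption, not the project's API.
    def get_description_dict(self):
        spans = self.soup.find('dd', class_='job_request').find_all('span')
        keys = ('salary', 'location', 'experience', 'education', 'jtype')
        return {k: spans[i].string for i, k in enumerate(keys)}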
def login_accounts():
    # read accounts, log each one in and save its cookies;
    # returns the accounts that logged in successfully
    accounts = get_accounts()
    if len(accounts) == 0:
        Logging.error(u'Failed to read accounts, please check the spreadsheet file.')
    print accounts
    login_accounts = list()
    count = 0
    for account in accounts[:35]:
        cookies = 'cookies' + str(count)
        login = Login(account[0], account[1])
        login.initialize(cookies)
        if not login.login():
            import os
            os.remove('captcha.gif')
            Logging.error(u'Account {} failed to log in, please check the username and password'.format(account[0]))
            continue
        login_accounts.append((account, cookies))
        count += 1
    # print 'login accounts are: {}'.format(str(login_accounts))
    # the caller picks a logged-in account at random and sends messages
    return login_accounts
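# A minimal sketch of consuming login_accounts() as the comment above suggests:
# pick one logged-in (account, cookies) pair at random before sending messages.
# `send_message` is a hypothetical placeholder, not part of the original code.
import random


def pick_account(logged_in):
    if not logged_in:
        raise ValueError('no account logged in')
    return random.choice(logged_in)

# account, cookies = pick_account(login_accounts())
# send_message(account, cookies, ...)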
    def _fetch_followees(self, total):
        # fetch the people this user follows,
        # e.g. http://www.zhihu.com/people/leng-zhe/followees
        url = "http://www.zhihu.com/node/ProfileFolloweesListV2"
        """
        HTTP POST:
            method: next
            params: {"offset":20,"order_by":"created","hash_id":"06f3b1c891d0d504eea8af883150b497"}
            _xsrf: f11a7023d52d5a0ec95914ecff30885f

        <div class="zm-profile-card zm-profile-section-item zg-clear no-hovercard">
            <div class="zg-right">
                <button data-follow="m:button" data-id="dfadd95bc7af994cc8933c444cc9327e" class="zg-btn zg-btn-follow zm-rich-follow-btn small nth-0">关注</button>
            </div>
            <a title="黄云忠" data-tip="p$t$huangdoc" class="zm-item-link-avatar" href="/people/huangdoc">
                <img src="https://pic2.zhimg.com/b7dde5a21_m.jpg" class="zm-item-img-avatar">
            </a>
            <div class="zm-list-content-medium">
                <h2 class="zm-list-content-title">
                    <a data-tip="p$t$huangdoc" href="http://www.zhihu.com/people/huangdoc" class="zg-link" title="黄云忠">黄云忠</a>
                </h2>
                <div class="zg-big-gray">风险投资人</div>
                <div class="details zg-gray">
                    <a target="_blank" href="/people/huangdoc/followers" class="zg-link-gray-normal">4846 关注者</a> /
                    <a target="_blank" href="/people/huangdoc/asks" class="zg-link-gray-normal">17 提问</a> /
                    <a target="_blank" href="/people/huangdoc/answers" class="zg-link-gray-normal">23 回答</a> /
                    <a target="_blank" href="/people/huangdoc" class="zg-link-gray-normal">8 赞同</a>
                </div>
            </div>
        </div>
        """
        offset = 0
        followees = []
        while offset < total:
            params = {"offset": offset, "order_by": "created", "hash_id": self.hash_id}
            data = {"method": "next", "params": json.dumps(params), "_xsrf": self.xsrf}
            Logging.info(u"Fetching the people this user follows: %s" % json.dumps(data))
            r = requests.post(url, data=data)
            if r.status_code != 200:
                raise IOError("network error.")
            try:
                res = json.loads(r.content)
                if res["r"] == 0 and isinstance(res["msg"], list):
                    result = res["msg"]
                else:
                    result = []
            except Exception as e:
                Logging.error(u"Failed to parse the response data")
                Logging.debug(e)
                result = []
            for p in result:
                # each entry is an HTML card; pull the user token out of its /people/<token> link
                r = re.compile(r"/people/([^\s\"'/]+)[\"']", re.DOTALL).findall(p)
                if len(r) > 0:
                    followees.append(r[0])
                else:
                    Logging.warn(u"Failed to extract the user token")
                    Logging.warn(p)
            offset += len(result)
        return followees
    def parse(self):
        DOM = BeautifulSoup(self.html, "html.parser")

        # question title
        title = DOM.find("h2", class_="zm-item-title").get_text()
        title = re.sub("^\n+|\n+$", "", title)

        # question body
        el = DOM.find("div", id="zh-question-detail")
        id = int(el["data-resourceid"])  # question resource id, distinct from the token
        self.id = id
        content = el.find("div", class_="zm-editable-content").get_text()
        content = re.sub("^\n+|\n+$", "", content)

        # question followers
        followers = self._fetch_followers()

        # number of answers
        try:
            el = DOM.find("h3", id="zh-question-answer-num")
            answers_num = int(el["data-num"])
        except:
            answers_num = 0

        # list of answer tokens
        answers = self._fetch_answers(answers_num)

        # question status: followers list, related questions, view count,
        # number of followers of related topics
        sections = DOM.find_all("div", class_="zm-side-section")
        if len(sections) == 3:
            elems = sections[-1].find_all("div", class_="zg-gray-normal")
            # last update time, e.g. 2015-10-02 | 23:35
            utime_string = elems[0].find("span", class_="time").string
            elems = elems[1].find_all("strong")
            # view count
            visit_times = int(elems[0].string)
            # number of followers of related topics
            RT_for_CN = int(elems[1].string)
        else:
            utime_string = ""
            visit_times = 0
            RT_for_CN = 0

        # topics this question belongs to
        topics = []
        elems = DOM.find_all("a", class_="zm-item-tag")
        if elems is None:
            elems = []
        for el in elems:
            try:
                topics.append(
                    {
                        "id": el["data-topicid"],
                        "token": el["data-token"],
                        "name": el.contents[0].string.replace("\n", ""),
                    }
                )
            except:
                Logging.error(u"Failed to parse topic")
                Logging.debug(el)

        # fetch the comments on this question
        comments = self._fetch_comments()

        print u"title: %s" % title
        print u"content: %s" % content
        print u"topics: "
        _print = ["%s(%s)" % (topic["name"], topic["token"]) for topic in topics]
        print "\t%s" % ", ".join(_print)
        print u"followers: "
        _print = ["%s(%s)" % (follower["name"], follower["token"]) for follower in followers]
        print "\t%s" % ", ".join(_print)
        print u"answer list (%d):" % (len(answers))
        print u"\t ", answers
        print u"question comments:"
        for comment in comments:
            print u"\t %s\t%s\t%s" % (comment["utime"], comment["people"]["name"], comment["content"])
        print u"question status:"
        print u"\tview count: %d" % visit_times
        print u"\tfollowers of related topics: %d" % RT_for_CN
        print u"\tlast modified: %s" % utime_string
    def _fetch_comments(self):
        # fetch the comments on this question
        url = "http://www.zhihu.com/node/QuestionCommentBoxV2"
        # note: the question id used here is NOT the question token.
        params = {"params": json.dumps({"question_id": self.id})}
        r = requests.get(url, params=params)
        if r.status_code != 200:
            return []
        """
        http response:

        <div class="zm-comment-box" data-count="2">
            <i class="icon icon-spike zm-comment-bubble"></i>
            <a class="zg-anchor-hidden" name="comment-0"></a>
            <div class="zm-comment-list">
                <div class="zm-item-comment" data-id="90669446">
                    <a class="zg-anchor-hidden" name="comment-90669446"></a>
                    <a title="薯薯薯薯条" data-tip="p$t$xia-mu-de-cha-wan" class="zm-item-link-avatar" href="/people/xia-mu-de-cha-wan">
                        <img src="https://pic3.zhimg.com/98a00c51721216c0c61b74be7338c20a_s.jpg" class="zm-item-img-avatar">
                    </a>
                    <div class="zm-comment-content-wrap">
                        <div class="zm-comment-hd">
                            <a data-tip="p$t$xia-mu-de-cha-wan" href="http://www.zhihu.com/people/xia-mu-de-cha-wan" class="zg-link" title="薯薯薯薯条">薯薯薯薯条</a>
                        </div>
                        <div class="zm-comment-content">
                            ( •̀∀•́ )坐等看故事
                        </div>
                        <div class="zm-comment-ft">
                            <span class="date">2015-08-20</span>
                            <a href="#" class="reply zm-comment-op-link" name="reply_comment"><i class="zg-icon zg-icon-comment-reply"></i>回复</a>
                            <a href="#" class="like zm-comment-op-link " name="like_comment"><i class="zg-icon zg-icon-comment-like"></i>赞</a>
                            <span class="like-num nil" data-tip="s$r$0 人觉得这个很赞"><em>0</em> <span>赞</span></span>
                            <a href="#" name="report" class="report zm-comment-op-link needsfocus"><i class="zg-icon z-icon-no-help"></i>举报</a>
                        </div>
                    </div>
                </div>
                <!-- comment list end -->
            </div>
        </div>
        """
        soup = BeautifulSoup(r.content, "html.parser")
        elems = soup.find_all("div", class_="zm-item-comment")
        comments = []
        for elem in elems:
            # comment id and author
            el = elem.find("a", class_="zm-item-link-avatar")
            id = int(elem["data-id"])
            people = {
                "token": el["href"].split("/")[-1],
                "avatar": el.find("img")["src"],
                "name": elem.find("div", class_="zm-comment-hd").find("a")["title"],
            }
            utime = elem.find("span", class_="date").string
            content = elem.find("div", class_="zm-comment-content").get_text()
            if content is None:
                Logging.error(u"Failed to parse the question comment")
                Logging.debug(elem)
            else:
                content = re.sub("^\n+|\n+$", "", content)
                comments.append({"id": id, "people": people, "content": content, "utime": utime})
        return comments
import cookielib

import requests

# module
from auth import islogin
from auth import Logging

"""
    Note:
        1. Authentication is handled by `auth.py`.
        2. The identity info is saved in the `cookies` file in the current directory.
        3. The `requests` object below can be used directly; the identity info is loaded automatically.
"""
requests = requests.Session()
requests.cookies = cookielib.LWPCookieJar("cookies")
try:
    requests.cookies.load(ignore_discard=True)
except:
    Logging.error(u"You are not logged in to Zhihu yet ...")
    Logging.info(u"Run `python auth.py` to log in.")
    raise Exception("Unauthorized (403)")


class People:
    """
    people:
        name
        domain
        avatar
        profile
            location
            name
            sex
            job
import cookielib

import requests
import termcolor
import html2text
from bs4 import BeautifulSoup

# module
from auth import islogin
from auth import Logging

requests = requests.Session()
requests.cookies = cookielib.LWPCookieJar('cookies')
try:
    requests.cookies.load(ignore_discard=True)
except:
    Logging.error("not logged in yet")
    Logging.info("run auth.py to log in")
    raise Exception("not authorized (403)")

if not islogin():
    Logging.error("Identity info has expired; run auth.py again to regenerate it.")
    raise Exception("not authorized (403)")


class User:
    user_url = None
    # session = None
    soup = None
""" Note: 1. 身份验证由 `auth.py` 完成。 2. 身份信息保存在当前目录的 `cookies` 文件中。 3. `requests` 对象可以直接使用,身份信息已经自动加载。 By Luozijun (https://github.com/LuoZijun), 09/09 2015 """ requests = requests.Session() requests.cookies = cookielib.LWPCookieJar('cookies') try: requests.cookies.load(ignore_discard=True) except: Logging.error(u"你还没有登录知乎哦 ...") Logging.info(u"执行 `python auth.py` 即可以完成登录。") raise Exception("无权限(403)") if islogin() != True: Logging.error(u"你的身份信息已经失效,请重新生成身份信息( `python auth.py` )。") raise Exception("无权限(403)") reload(sys) sys.setdefaultencoding('utf8') class Question: url = None soup = None
        yield None

    def get_published_year(self):
        return None


if __name__ == '__main__':
    parse = argparse.ArgumentParser(description='Crawl book information')
    parse.add_argument('url', help='url to crawl')
    parse.add_argument('path', help='path to save book info')
    parse.add_argument('-p', '--page',
                       help='specific page you want to crawl',
                       type=int, default=1)
    args = parse.parse_args()

    colorama.init()
    requests = requests.Session()
    requests.cookies = cookielib.LWPCookieJar('cookies')
    try:
        requests.cookies.load(ignore_discard=True)
    except IOError:
        Logging.error('run auth.py to log in')
        raise Exception('have not been authenticated')
    if not is_login():
        Logging.error('cookies have expired, please run auth.py again')
        raise Exception('have not been authenticated')

    book = Book(args.url, args.page)
    book.save_to_excel(args.path)
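# Example invocation (assumption: this __main__ block lives in a script named
# book.py; the actual file name is not shown in this snippet):
#
#   python auth.py                      # log in once to create the `cookies` file
#   python book.py <book_url> out.xls -p 3
#
# <book_url> is the page to crawl, out.xls the Excel file written by
# save_to_excel(), and -p/--page the specific page to crawl (default 1).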