Code example #1
File: lagou.py Project: Allianzcortex/lagou_crawler
def main():
    # url='http://www.lagou.com/zhaopin/houduankaifa'
    url = 'http://www.lagou.com/jobs/positionAjax.json'
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.115 Safari/537.36',
        'Host': 'www.lagou.com',
        'Origin': 'http://www.lagou.com',
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'X-Requested-With': 'XMLHttpRequest'
    }
    params = {
        'needAdditionalResult': 'False'
    }
    data = {
        'first': 'False',
        'pn': 320,
        'kd': '后端开发'
    }
    # params=json.dumps(params)
    try:
        r = requests.post(url, headers=headers, params=params, data=data)
        print r.url
        for i in r.json()['content']['positionResult']['result']:
            print i['positionId']
    except (TypeError, ValueError):
        Logging.error(u'page out of range')
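The snippet above requests a single page (pn=320) and prints the position IDs it finds. As a rough illustration only, here is a minimal sketch of paging through the same endpoint until it stops returning results; the endpoint, headers, and JSON layout (content -> positionResult -> result) are taken from the snippet above, while the function name and page range are hypothetical.

import requests

def crawl_position_ids(keyword, max_pages=30):
    # Sketch: walk page numbers until the endpoint returns an empty result list.
    url = 'http://www.lagou.com/jobs/positionAjax.json'
    headers = {
        'User-Agent': 'Mozilla/5.0',
        'X-Requested-With': 'XMLHttpRequest',
        'Host': 'www.lagou.com',
    }
    position_ids = []
    for pn in range(1, max_pages + 1):
        data = {'first': 'False', 'pn': pn, 'kd': keyword}
        r = requests.post(url, headers=headers, data=data)
        try:
            result = r.json()['content']['positionResult']['result']
        except (KeyError, ValueError):
            break  # page out of range or unexpected payload
        if not result:
            break
        position_ids.extend(item['positionId'] for item in result)
    return position_ids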
Code example #2
File: core.py Project: LuoZijun/zhihu-python
    def parse(self):
        DOM = BeautifulSoup(self.html, "html.parser")
        el = DOM.find("div", class_="zm-profile-header")
        elem = el.find("div", class_="title-section")
        # Name, Bio ("introduce yourself in one sentence")
        name = elem.find("a", class_="name").get_text()
        name = re.sub("^\n+|\n+$", "", name)
        bio = elem.find("span", class_="bio").get_text()
        bio = re.sub("^\n+|\n+$", "", bio)
        # SNS Info ( Weibo | QQ | ... )
        sns = {"weibo": ""}
        wb_el = el.find("div", class_="top").find("div", class_="weibo-wrap")
        try:
            sns["weibo"] = wb_el.find("a", class_="zm-profile-header-user-weibo")["href"]
        except:
            pass
        # avatar
        avatar = el.find("div", class_="body").find("img", class_="avatar")["src"]
        # descp
        descp = (
            el.find("div", class_="body").find("span", class_="description").find("span", class_="content").get_text()
        )
        descp = re.sub("^\n+|\n+$", "", descp)

        # Hash ID
        try:
            self.hash_id = DOM.find("div", class_="zm-profile-header-op-btns").find("button")["data-id"]
        except:
            self.hash_id = ""

        f_el = DOM.find("div", class_="zm-profile-side-following").find_all("strong")
        if len(f_el) < 2:
            followees_num = 0
            followers_num = 0
            followees = []
            followers = []
        else:
            # people this user follows (followees)
            followees_num = int(f_el[0].string.replace("\n", ""))
            followees = self._fetch_followees(followees_num)
            # people who follow this user (followers)
            followers_num = int(f_el[1].string.replace("\n", ""))
            followers = self._fetch_followers(followers_num)

        print followers
        # followed columns

        # followed topics

        el = DOM.find("div", class_="zm-profile-section-list")
        # achievements (upvotes, thanks)
        reputation = {"agree": 0, "thanks": 0, "favors": 0, "share": 0}
        elems = el.find("div", class_="zm-profile-details-reputation").find_all("strong")
        if len(elems) == 4:
            reputation["agree"] = int(elems[0].string)
            reputation["thanks"] = int(elems[1].string)
            reputation["favors"] = int(elems[2].string)
            reputation["share"] = int(elems[3].string)
        else:
            Logging.error(u"用户个人成就信息解析失败")
            Logging.debug(elems)
        "次要信息, 待完善 ..."
Code example #3
File: lagou.py Project: Allianzcortex/lagou_crawler
 def get_title(self):
     soup = self.soup
     try:
         title = soup.find_all('h1')[1]['title']
     except (AttributeError, IndexError, KeyError) as ex:
         Logging.error(u'failed to parse the job title')
         Logging.error(ex)
         title = None
     return title
Code example #4
File: core.py Project: LuoZijun/zhihu-python
 def pull(self):
     url = "http://www.zhihu.com/people/%s/about" % (self.token)
     r = requests.get(url)
     if r.status_code != 200:
         raise IOError("network error.")
     self.html = r.content
     try:
         self.xsrf = re.compile(r"input\stype.*?name=.\_xsrf.\svalue=.(\w+).", re.DOTALL).findall(self.html)[0]
     except Exception as e:
         Logging.error(u"XSRF值提取失败")
         Logging.debug(e)
Code example #5
File: lagou.py Project: Allianzcortex/lagou_crawler
 def get_company(self):
     soup = self.soup
     try:
         company = re.sub(
             r'\s+', '', soup.find('h2', class_='fl').get_text())
          # there should be a better way to get the company name that excludes the "unverified" text
     except AttributeError as ex:
          Logging.error(u'failed to parse the company name')
         Logging.error(ex)
         company = None
     return company
Code example #6
File: lagou.py Project: Allianzcortex/lagou_crawler
 def get_description(self):
     soup = self.soup
     try:
         description = soup.find(
             'dd', class_='job_request').find_all('span')
         salary = description[0].string
         location = description[1].string
         experience = description[2].string
         education = description[3].string
         jtype = description[4].string
      except (IndexError, AttributeError) as ex:
          Logging.error(u'failed to parse the job details')
          Logging.error(ex)
          raise ValueError('{} can\'t be resolved'.format(self.url))
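Code examples #3, #5, and #6 all read from self.soup, which is not shown in these excerpts. A minimal sketch of the setup they assume, purely for illustration; the class name, constructor, and User-Agent value are not taken from the original project.

import requests
from bs4 import BeautifulSoup

class JobPage(object):
    # Hypothetical wrapper: fetch a Lagou job page and build the soup that the getters read.
    def __init__(self, url):
        self.url = url
        r = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
        self.soup = BeautifulSoup(r.content, 'html.parser')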
Code example #7
def login_accounts():
    # read accounts, log in and save cookies; returns the list of logged-in accounts
    accounts = get_accounts()
    if len(accounts) == 0:
        Logging.error(u'failed to read accounts, please check the spreadsheet file.')
    print accounts
    login_accounts = list()
    count = 0
    for account in accounts[:35]:
        cookies = 'cookies' + str(count)
        login = Login(account[0], account[1])
        login.initialize(cookies)
        if not login.login():
            import os
            os.remove('captcha.gif')
            Logging.error(u'account {} failed to log in, please check the username and password'.format(account[0]))
            continue
        login_accounts.append((account, cookies))
        count += 1
    # print 'login accounts are: {}'.format(str(login_accounts))
    # pick logged-in accounts at random and send messages
    return login_accounts
Code example #8
File: core.py Project: LuoZijun/zhihu-python
    def _fetch_followees(self, total):
        # fetch the people this user follows (followees)
        # http://www.zhihu.com/people/leng-zhe/followees
        url = "http://www.zhihu.com/node/ProfileFolloweesListV2"
        """
            HTTP POST:
                method:next
                params:{"offset":20,"order_by":"created","hash_id":"06f3b1c891d0d504eea8af883150b497"}
                _xsrf:f11a7023d52d5a0ec95914ecff30885f

            <div class="zm-profile-card zm-profile-section-item zg-clear no-hovercard"> 
                <div class="zg-right"> 
                    <button 
                        data-follow="m:button" 
                        data-id="dfadd95bc7af994cc8933c444cc9327e" 
                        class="zg-btn zg-btn-follow zm-rich-follow-btn small nth-0">
                            关注
                    </button> 
                </div>
                <a title="黄云忠" data-tip="p$t$huangdoc" class="zm-item-link-avatar" href="/people/huangdoc">
                    <img src="https://pic2.zhimg.com/b7dde5a21_m.jpg" class="zm-item-img-avatar">
                </a>
                <div class="zm-list-content-medium">
                    <h2 class="zm-list-content-title">
                        <a data-tip="p$t$huangdoc" href="http://www.zhihu.com/people/huangdoc" class="zg-link" title="黄云忠">黄云忠</a>
                    </h2>
                    <div class="zg-big-gray">风险投资人</div>
                    <div class="details zg-gray"> 
                        <a target="_blank" href="/people/huangdoc/followers" class="zg-link-gray-normal">4846 关注者</a> / 
                        <a target="_blank" href="/people/huangdoc/asks" class="zg-link-gray-normal">17 提问</a> / 
                        <a target="_blank" href="/people/huangdoc/answers" class="zg-link-gray-normal">23 回答</a> / 
                        <a target="_blank" href="/people/huangdoc" class="zg-link-gray-normal">8 赞同</a> 
                    </div> 
                </div> 
            </div>
        """
        offset = 0
        followees = []
        while offset < total:
            params = {"offset": offset, "order_by": "created", "hash_id": self.hash_id}
            data = {"method": "next", "params": json.dumps(params), "_xsrf": self.xsrf}

            Logging.info(u"获取该用户关注者: %s " % json.dumps(data))

            r = requests.post(url, data=data)
            if r.status_code != 200:
                raise IOError("network error.")
            try:
                res = json.loads(r.content)
                if res["r"] == 0 and type(res["msg"]) == type([]):
                    result = res["msg"]
                else:
                    result = []
            except Exception as e:
                Logging.error(u"数据格式解析失败")
                Logging.debug(e)
                result = []
            for p in result:
                r = re.compile(r"\/people/(\S+)\"|\'", re.DOTALL).findall(p)
                if len(r) > 0:
                    followees.append(r[0])
                else:
                    Logging.warn(u"提取用户token失败")
                    Logging.warn(p)
            offset += len(result)
        return followees
Code example #9
File: core.py Project: LuoZijun/zhihu-python
    def parse(self):
        DOM = BeautifulSoup(self.html, "html.parser")

        # question title
        title = DOM.find("h2", class_="zm-item-title").get_text()
        title = re.sub("^\n+|\n+$", "", title)

        # question body
        el = DOM.find("div", id="zh-question-detail")

        id = int(el["data-resourceid"])  # question resource id, distinct from the token
        self.id = id
        content = el.find("div", class_="zm-editable-content").get_text()
        content = re.sub("^\n+|\n+$", "", content)

        # question followers
        followers = self._fetch_followers()

        # number of answers
        try:
            el = DOM.find("h3", id="zh-question-answer-num")
            answers_num = int(el["data-num"])
        except:
            answers_num = 0
        # list of answer tokens
        answers = self._fetch_answers(answers_num)

        # question status: follower list, related questions, view count, follower count of related topics
        sections = DOM.find_all("div", class_="zm-side-section")
        if len(sections) == 3:
            elems = sections[-1].find_all("div", class_="zg-gray-normal")
            # last updated time, e.g. 2015-10-02 | 23:35
            utime_string = elems[0].find("span", class_="time").string
            elems = elems[1].find_all("strong")
            # view count
            visit_times = int(elems[0].string)
            # number of followers of related topics
            RT_for_CN = int(elems[1].string)

        else:
            utime_string = ""
            visit_times = 0
            RT_for_CN = 0
        # topics this question belongs to
        topics = []
        elems = DOM.find_all("a", class_="zm-item-tag")
        if elems is None:
            elems = []
        for el in elems:
            try:
                topics.append(
                    {
                        "id": el["data-topicid"],
                        "token": el["data-token"],
                        "name": el.contents[0].string.replace("\n", ""),
                    }
                )
            except:
                Logging.error(u"话题解析失败")
                Logging.debug(el)
        # fetch the comments on this question
        comments = self._fetch_comments()

        print u"title: %s" % title
        print u"content: %s" % content
        print u"topics: "
        _print = []
        map(lambda topic: _print.append("%s(%s), " % (topic["name"], topic["token"])), topics)
        print "\t%s" % ", ".join(_print)

        print u"followers: "
        _print = []
        map(lambda topic: _print.append("%s(%s), " % (topic["name"], topic["token"])), followers)
        print "\t%s" % ", ".join(_print)

        print u"答案列表(%d):" % (len(answers))
        print u"\t ", answers

        print u"问题评论:"
        for comment in comments:
            print u"\t %s\t%s\t%s" % (comment["utime"], comment["people"]["name"], comment["content"])

        print u"问题状态:"
        print u"\t浏览次数: %d" % visit_times
        print u"\t相关话题关注者人数: %d" % RT_for_CN
        print u"\t最后修改时间: %s" % utime_string
Code example #10
File: core.py Project: LuoZijun/zhihu-python
    def _fetch_comments(self):
        # fetch the comments on this question
        url = "http://www.zhihu.com/node/QuestionCommentBoxV2"
        # note: the question id here is not the question token.
        params = {"params": json.dumps({"question_id": self.id})}
        r = requests.get(url, params=params)
        if r.status_code != 200:
            return []
        """
            http response:
            <div class="zm-comment-box" data-count="2">
                <i class="icon icon-spike zm-comment-bubble"></i>
                <a class="zg-anchor-hidden" name="comment-0"></a>
                <div class="zm-comment-list">
                    <div class="zm-item-comment" data-id="90669446">
                        <a class="zg-anchor-hidden" name="comment-90669446"></a>
                        <a title="薯薯薯薯条"
                            data-tip="p$t$xia-mu-de-cha-wan"
                            class="zm-item-link-avatar"
                            href="/people/xia-mu-de-cha-wan">
                                <img src="https://pic3.zhimg.com/98a00c51721216c0c61b74be7338c20a_s.jpg" class="zm-item-img-avatar">
                        </a>
                        <div class="zm-comment-content-wrap">
                            <div class="zm-comment-hd">
                                <a data-tip="p$t$xia-mu-de-cha-wan" href="http://www.zhihu.com/people/xia-mu-de-cha-wan" class="zg-link" title="薯薯薯薯条">薯薯薯薯条</a>

                            </div>
                            <div class="zm-comment-content">
                            ( •̀∀•́ )坐等看故事
                            </div>
                            <div class="zm-comment-ft">
                                <span class="date">2015-08-20</span>
                                <a href="#" class="reply zm-comment-op-link" name="reply_comment">
                                <i class="zg-icon zg-icon-comment-reply"></i>回复</a>
                                <a href="#" class="like zm-comment-op-link " name="like_comment">
                                <i class="zg-icon zg-icon-comment-like"></i>赞</a>
                                <span class="like-num  nil" data-tip="s$r$0 人觉得这个很赞">
                                <em>0</em> <span>赞</span></span>


                                <a href="#" name="report" class="report zm-comment-op-link needsfocus">
                                <i class="zg-icon z-icon-no-help"></i>举报</a>
                            </div>
                        </div>
                    </div>
                <!-- comment list end -->
                </div>
            </div>

        """
        soup = BeautifulSoup(r.content, "html.parser")
        elems = soup.find_all("div", class_="zm-item-comment")

        comments = []
        for elem in elems:
            # comment id
            el = elem.find("a", class_="zm-item-link-avatar")
            id = int(elem["data-id"])

            people = {
                "token": el["href"].split("/")[-1],
                "avatar": el.find("img")["src"],
                "name": elem.find("div", class_="zm-comment-hd").find("a")["title"],
            }
            utime = elem.find("span", class_="date").string
            content = elem.find("div", class_="zm-comment-content").get_text()
            if content is None:
                Logging.error(u"failed to parse a question comment")
                Logging.debug(elem)
            else:
                content = re.sub("^\n+|\n+$", "", content)
                comments.append({"id": id, "people": people, "content": content, "utime": utime})

        return comments
Code example #11
File: core.py Project: LuoZijun/zhihu-python
import cookielib

import requests

# module
from auth import islogin
from auth import Logging

"""
    Note:
        1. 身份验证由 `auth.py` 完成。
        2. 身份信息保存在当前目录的 `cookies` 文件中。
        3. `requests` 对象可以直接使用,身份信息已经自动加载。
"""
requests = requests.Session()
requests.cookies = cookielib.LWPCookieJar("cookies")
try:
    requests.cookies.load(ignore_discard=True)
except:
    Logging.error(u"你还没有登录知乎哦 ...")
    Logging.info(u"执行 `python auth.py` 即可以完成登录。")
    raise Exception("无权限(403)")


class People:
    """
        people:
            name
            domain
            avatar
            profile
                location
                    name
                sex
                job
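The module-level block above, like the similar blocks in the later examples, only loads previously saved cookies. For completeness, a minimal sketch of the saving side that auth.py is described as providing; the actual login request is omitted, and this is an assumption about auth.py rather than its real code.

import cookielib
import requests

session = requests.Session()
session.cookies = cookielib.LWPCookieJar("cookies")
# ... perform the Zhihu login with `session` here (see auth.py) ...
# Persist the identity so the cookie-loading blocks above can pick it up on the next run.
session.cookies.save(ignore_discard=True)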
Code example #12
        return None


if __name__ == '__main__':
    parse = argparse.ArgumentParser(description='Crawl book information')
    parse.add_argument('url', help='url to crawl')
    parse.add_argument('path', help='path to save book info')
    parse.add_argument('-p',
                       '--page',
                       help='specific page you want to crawl',
                       type=int,
                       default=1)
    args = parse.parse_args()

    colorama.init()

    requests = requests.Session()
    requests.cookies = cookielib.LWPCookieJar('cookies')
    try:
        requests.cookies.load(ignore_discard=True)
    except IOError:
        Logging.error('run auth.py to log in')
        raise Exception('have not been authenticated')

    if not is_login():
        Logging.error('cookies have expired, please run auth.py again')
        raise Exception('have not been authenticated')

    book = Book(args.url, args.page)
    book.save_to_excel(args.path)
Code example #13
import cookielib
import requests, termcolor, html2text

from bs4 import BeautifulSoup


# module
from auth import islogin
from auth import Logging
#from __builtin__ import None

requests = requests.Session()
requests.cookies = cookielib.LWPCookieJar('cookies')
try:
    requests.cookies.load(ignore_discard=True)
except:
    Logging.error("not login yet")
    Logging.info("run auth.py to login")
    raise Exception("not authorized(403)")


if islogin() != True:
    Logging.error("Identity expired. Relogin to generate identity(auth.py)")
    raise Exception("not authorized(403)")



class User:
    user_url = None
    # session = None
    soup = None
Code example #14
File: zhihu.py Project: zlxwl/zhihu-python
"""
    Note:
        1. 身份验证由 `auth.py` 完成。
        2. 身份信息保存在当前目录的 `cookies` 文件中。
        3. `requests` 对象可以直接使用,身份信息已经自动加载。

    By Luozijun (https://github.com/LuoZijun), 09/09 2015

"""
requests = requests.Session()
requests.cookies = cookielib.LWPCookieJar('cookies')
try:
    requests.cookies.load(ignore_discard=True)
except:
    Logging.error(u"你还没有登录知乎哦 ...")
    Logging.info(u"执行 `python auth.py` 即可以完成登录。")
    raise Exception("无权限(403)")


if islogin() != True:
    Logging.error(u"你的身份信息已经失效,请重新生成身份信息( `python auth.py` )。")
    raise Exception("无权限(403)")


reload(sys)
sys.setdefaultencoding('utf8')

class Question:
    url = None
    soup = None
Code example #15
File: zhihu.py Project: uxlsl/zhihu-python
"""
    Note:
        1. 身份验证由 `auth.py` 完成。
        2. 身份信息保存在当前目录的 `cookies` 文件中。
        3. `requests` 对象可以直接使用,身份信息已经自动加载。

    By Luozijun (https://github.com/LuoZijun), 09/09 2015

"""
requests = requests.Session()
requests.cookies = cookielib.LWPCookieJar('cookies')
try:
    requests.cookies.load(ignore_discard=True)
except:
    Logging.error(u"你还没有登录知乎哦 ...")
    Logging.info(u"执行 `python auth.py` 即可以完成登录。")
    raise Exception("无权限(403)")


if islogin() != True:
    Logging.error(u"你的身份信息已经失效,请重新生成身份信息( `python auth.py` )。")
    raise Exception("无权限(403)")


reload(sys)
sys.setdefaultencoding('utf8')

class Question:
    url = None
    soup = None
Code example #16
File: crawler.py Project: polyval/goodreads-crawler
                yield None

    def get_published_year(self):
        return None


if __name__ == '__main__':
    parse = argparse.ArgumentParser(description='Crawl book information')
    parse.add_argument('url', help='url to crawl')
    parse.add_argument('path', help='path to save book info')
    parse.add_argument('-p', '--page', help='specific page you want to crawl',
                       type=int, default=1)
    args = parse.parse_args()

    colorama.init()

    requests = requests.Session()
    requests.cookies = cookielib.LWPCookieJar('cookies')
    try:
        requests.cookies.load(ignore_discard=True)
    except IOError:
        Logging.error('run auth.py to log in')
        raise Exception('have not been authenticated')

    if not is_login():
        Logging.error('cookies have expired, please run auth.py again')
        raise Exception('have not been authenticated')

    book = Book(args.url, args.page)
    book.save_to_excel(args.path)
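The entry point above takes positional url and path arguments plus an optional -p/--page (default 1). A hedged example invocation; the book URL and output path are placeholders, not taken from the project.

# Hypothetical invocation:
#   python crawler.py <book-url> <output-path> --page 2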