def zhihu_question_parser(html, url):
    name = txt_wrap_by(
        '<title>',
        ' - 知乎</title>',
        html
    )
    name = unescape(name)
    if '<h3>邀请别人回答问题</h3>' in html:
        answer_count = txt_wrap_by('<span id="xhrw">', ' 个答案</span>', html)
    else:
        answer_count = txt_wrap_by('<h3 style="margin: 0 0 5px;">', ' 个答案</', html)

    tag = map(unescape, txt_wrap_by_all('<a class="xjl" href="javascript:;">', '</a>', html))
    #print tag[0]
    answer_count = int(answer_count or 0)

    if answer_count:
        txt = filter(bool, txt_wrap_by_all('<div class="xmrw">','</div>', html))
        if not txt:
            print url
            print name
            #raw_input()
        else:
            print txt[0]
    else:
        if "个答案" in html and ("0 个答案" not in html) and "还没有答案" not in html:
            print url
            print html 
            #raw_input()
        txt = []

    RESULT.append((answer_count, url, name, tag, [htm2txt(i) for i in txt]))

    print how_long.again(), how_long.remain, how_long.done
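
All of these examples lean on two string-slicing helpers, txt_wrap_by and txt_wrap_by_all, imported from elsewhere in the project. A minimal sketch of what they appear to do, inferred from the call sites in this listing (the real implementations may differ):

def txt_wrap_by(begin, end, html):
    # Sketch only, not the project's actual code: return the text between
    # the first occurrence of `begin` and the following `end`, or '' if
    # either marker is missing.
    start = html.find(begin)
    if start < 0:
        return ''
    start += len(begin)
    stop = html.find(end, start)
    if stop < 0:
        return ''
    return html[start:stop]

def txt_wrap_by_all(begin, end, html):
    # Sketch only: return every substring wrapped by `begin` ... `end`,
    # in document order.
    result = []
    pos = 0
    while True:
        start = html.find(begin, pos)
        if start < 0:
            break
        start += len(begin)
        stop = html.find(end, start)
        if stop < 0:
            break
        result.append(html[start:stop])
        pos = stop + len(end)
    return result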
Example #2
def page_parse(htm_file):

    html = open(htm_file).read()
    title = txt_wrap_by('<title>','- 知乎',html)
    tags = txt_wrap_by_all('xgm" href="javascript:;">', '</', html)
    reply_raw_list = txt_wrap_by_all('<div class="xmo">','class="xnq xml xnh">',html)
    replies = [ htm2txt(x)[0] for x in reply_raw_list ]

    js = '["current_question",' + txt_wrap_by("(['current_question', ", ');', html)
    a = loads(js)

    answer_list=[]

    question_info={}
    question_info['answer'] = answer_list
    question_info['tags'] = [ x[0] for x in a[1][3] ]
    question_info['title'] = title
    question_info['body'] = htm2txt(txt_wrap_by('<div class="xvrw">','<a href="javascript',html))[0]
    replies_line = zip(a[1][12],replies)

    for x in replies_line:
        try:
            new_ans={}
            new_ans['name'] = x[0][2][0]
            new_ans['answer'] = x[1]
            new_ans['id'] = x[0][2][1]
            new_ans['signature'] = x[0][3]
            new_ans['votes'] = x[0][4]
            answer_list.append(new_ans)
        except:
            continue
    out_file.write(dumps(question_info)+'\n')
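
page_parse (and several of the examples below) also calls htm2txt, which evidently returns a (plain_text, picture_url_list) pair; later examples unpack it as content, pic_list = htm2txt(...). A rough sketch under that assumption; the project's real htm2txt presumably handles entities, line breaks and more:

import re

_IMG_SRC = re.compile(r'<img[^>]+src="([^"]+)"', re.I)
_TAG = re.compile(r'<[^>]+>')

def htm2txt(html):
    # Sketch only: collect <img> URLs, strip the remaining markup and
    # return the (text, pic_list) pair the call sites expect.
    pics = _IMG_SRC.findall(html)
    text = _TAG.sub('', html).strip()
    return text, pics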
Example #3
def zhihu_question_parser(html, url):
    name = txt_wrap_by('<title>', ' - 知乎</title>', html)
    name = unescape(name)
    if '<h3>邀请别人回答问题</h3>' in html:
        answer_count = txt_wrap_by('<span id="xhrw">', ' 个答案</span>', html)
    else:
        answer_count = txt_wrap_by('<h3 style="margin: 0 0 5px;">', ' 个答案</',
                                   html)

    tag = map(
        unescape,
        txt_wrap_by_all('<a class="xjl" href="javascript:;">', '</a>', html))
    #print tag[0]
    answer_count = int(answer_count or 0)

    if answer_count:
        txt = filter(bool, txt_wrap_by_all('<div class="xmrw">', '</div>',
                                           html))
        if not txt:
            print url
            print name
            #raw_input()
        else:
            print txt[0]
    else:
        if "个答案" in html and ("0 个答案" not in html) and "还没有答案" not in html:
            print url
            print html
            #raw_input()
        txt = []

    RESULT.append((answer_count, url, name, tag, [htm2txt(i) for i in txt]))

    print how_long.again(), how_long.remain, how_long.done
Example #4
def page_parse(htm_file):

    html = open(htm_file).read()
    title = txt_wrap_by('<title>', '- 知乎', html)
    tags = txt_wrap_by_all('xgm" href="javascript:;">', '</', html)
    reply_raw_list = txt_wrap_by_all('<div class="xmo">',
                                     'class="xnq xml xnh">', html)
    replies = [htm2txt(x)[0] for x in reply_raw_list]

    js = '["current_question",' + txt_wrap_by("(['current_question', ", ');',
                                              html)
    a = loads(js)

    answer_list = []

    question_info = {}
    question_info['answer'] = answer_list
    question_info['tags'] = [x[0] for x in a[1][3]]
    question_info['title'] = title
    question_info['body'] = htm2txt(
        txt_wrap_by('<div class="xvrw">', '<a href="javascript', html))[0]
    replies_line = zip(a[1][12], replies)

    for x in replies_line:
        try:
            new_ans = {}
            new_ans['name'] = x[0][2][0]
            new_ans['answer'] = x[1]
            new_ans['id'] = x[0][2][1]
            new_ans['signature'] = x[0][3]
            new_ans['votes'] = x[0][4]
            answer_list.append(new_ans)
        except:
            continue
    out_file.write(dumps(question_info) + '\n')
Example #5
def main():
    cookies = ((
        '*****@*****.**',
        '_xsrf=7ed86e897bae4b9e8cf3e660efed7baf; q_c0=MTk2OTAzfGdmWDM5Q2pZNVpaUW9UTzA=|1326267926|eedfe70f85add0db0ecda1e73200cac9b085ecc6; __utma=155987696.1247389772.1322703824.1326190947.1326266591.29; __utmb=155987696.34.10.1326266591; __utmc=155987696; __utmz=155987696.1325768571.27.6.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmv=155987696.Logged%20In'
    ), )

    headers = {
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Charset':
        'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
        'Accept-Language':
        'en,en-US;q=0.8,zh-CN;q=0.6,zh;q=0.4',
        'Cache-Control':
        'max-age=0',
        'Connection':
        'keep-alive',
        'Host':
        'www.zhihu.com',
        'Referer':
        'http://www.zhihu.com/',
        'User-Agent':
        'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11',
    }
    count = 0
    headers['cookie'] = cookies[0][1]
    explore_page = fetch('http://www.zhihu.com/explore', headers=headers)

    entry_list = txt_wrap_by_all('<div class="xxn">', '</div', explore_page)
    rating_raw = txt_wrap_by("['explore_list',", ');', explore_page)
    data = loads(rating_raw)
    author_list = [[i[3][1][0].encode('utf-8'), i[3][2].encode('utf-8')]
                   for i in data]
    rating_list = [i[3][3] for i in data]

    label_list = txt_wrap_by_all('"padding:3px 0 0" class="xm">', '</div',
                                 explore_page)
    result_label = [txt_wrap_by_all('">', '</a', i) for i in label_list]

    url_list = txt_wrap_by_all('<h2', '</h2>', explore_page)
    id_list = [txt_wrap_by('question/', '/answer', i) for i in url_list]
    title_list = [
        txt_wrap_by('">', '<', txt_wrap_by('href="', '/a>', i))
        for i in url_list
    ]

    url_list = txt_wrap_by_all('<h2', '</h2>', explore_page)
    id_list = [txt_wrap_by('question/', '/answer', i) for i in url_list]
    url_list = ['http://www.zhihu.com/question/%s' % id for id in id_list]

    entry_list = zip(title_list, rating_list, result_label, author_list,
                     url_list, entry_list)

    for entry in entry_list:
        content, pic_list = htm2txt(entry[5])
        Spider.insert(entry[0], entry[2], content, entry[3][0], entry[1],
                      entry[4], [], pic_list)
Example #6
def read_next(start, offset):
    data = {
        'offset':offset,
        'start':start
    }
    result = []
    data = urlencode(data)
    headers = {
                'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.7) Gecko/20091221 Firefox/3.5.7',
               'Accept': ' text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
               'Accept-Language':'zh-cn,zh;q=0.5',
               'Accept-Charset':'gb18030,utf-8;q=0.7,*;q=0.7',
               'Content-type':'application/x-www-form-urlencoded'
            }


    headers['Cookie'] = """__utma=155987696.1466564421.1323058814.1323081063.1323082137.3; __utmz=155987696.1323082137.3.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic|utmctr=site%3Azhihu.com; __utmv=155987696.Logged%20In; _xsrf=5f0d189d485b43cca16068abe2d981ec; __utmc=155987696; __utmb=155987696.70.10.1323082137; checkcode=d3Nsag==|1323081329|606c2864ea806947dae5b5a8d7ab17c2ad22894e; q_c0=MTY2MzIxfFZHUkQxQ2xweUp6Y1czMDk=|1323083404|aabdf01be80a6e1b1c2f6817b03ef2de8a62eb2f"""

    request = urllib2.Request(
        url='http://www.zhihu.com/log/questions',
        data=data,
        headers=headers
    )
    urlopener = urllib2.build_opener()
    r = urlopener.open(request)

    j = r.read()
    j = loads(j)
    html = j['msg'][1]

    name_list = txt_wrap_by_all('''</h2>

<div>

<a''', '<', html)

    id_list = txt_wrap_by_all('logitem-' , '">', html)
    begin = '<a href="/question/'
    end = '</a'
    for id, name, i in zip(id_list, name_list, txt_wrap_by_all(begin, end, html)):
        i = i.split('">', 1)
        i.append(id)
        name = unescape(name).strip()[14:].split('">', 1)
        if len(name) < 2:
            name = '?', '?'
        i.extend(name)
        result.append(i)

    return 20+offset, result
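
A possible driver loop for read_next, following the same start/offset paging convention the zhihu parsers in this listing use; the initial start value, the page count and the stop condition here are assumptions, not taken from the original caller:

def dump_question_log(pages=5):
    # Assumed driver: page through /log/questions, carrying forward the
    # offset returned by read_next and the id of the last row seen.
    start, offset, rows = '', 0, []
    for _ in range(pages):
        offset, page = read_next(start, offset)
        if not page:
            break
        rows.extend(page)
        start = page[-1][2]  # read_next appends the logitem id at index 2
    return rows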
Example #7
def zhihu_topic_parser(html, url):
    global FETCH_COUNT

    #txt = txt_wrap_by( 'DZMT.push(["current_topic",', ')', html )
    #print loads(txt)[:2][0][0]
    question_id_list = map(int, filter(str.isdigit, txt_wrap_by_all('href="/question/', '">', html)))
    QUESTION_ID_SET.update(question_id_list)
    #QUESTION_ID_SET
    feed_id_list = txt_wrap_by_all('id="feed-', '">', html)
    print feed_id_list
#    for i in feed_id_list:
#        yield zhihu_question_parser, "http://www.zhihu.com/question/%s"%i
    if len(feed_id_list) >= 20:
        last_one = feed_id_list[-1]
        yield zhihu_topic_feed, {'url':url, 'data':urlencode(dict(start=last_one, offset=20))}, 20
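
zhihu_topic_parser, zhihu_topic_feed and the wm_parser examples below do not fetch pages themselves; they are generators that yield (next_parser, url_or_request, *extra) jobs. A minimal single-threaded driver for that convention might look like this (a sketch only; fetch is the project's downloader and is assumed to accept either a plain URL or a {'url': ..., 'data': ...} POST request):

from collections import deque

def crawl(fetch, seed_parser, seed_url):
    # Breadth-first driver for the `yield parser, url, *extra` convention.
    queue = deque([(seed_parser, seed_url)])
    while queue:
        job = queue.popleft()
        parser, url, extra = job[0], job[1], job[2:]
        html = fetch(url)
        jobs = parser(html, url, *extra)
        if jobs:  # parsers without a yield statement simply return None
            for next_job in jobs:
                queue.append(next_job)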
Example #8
def find_next(page_file):
    global question_set_fetched, question_set
    with open(page_file) as page:
        link = set(
            txt_wrap_by_all('<a class="xu" name="rlq" id="rlq-', '"',
                            page.read()))
        question_set |= link
Example #9
def page_parse(html):
    title = txt_wrap_by('<title>', '- 知乎', html)
    tags = txt_wrap_by_all('data-tip="t$b$', '"', html)
    for i in tags:
        print i,
    print title
    print ''
Example #10
    def __call__(self, html, url):
        html = txt_wrap_by('<ul class="list-m">', '</ul>', html)
        items = txt_wrap_by_all('<li class="item">', '</div>', html)
        if not items:
            items = txt_wrap_by_all('<h3><a', '</h3', html)

        links = []
        for item in items:
            link = txt_wrap_by('href="', '"', item)

            id = txt_wrap_by('http://www.douban.com/event/', '/', link)
            id = int(id)
            event = ImportDoubanEvent.get(id)
            if not event:
                yield self.parse_event_page, link , id
                ImportDoubanEvent(id=id,event_id=0).save()
Example #11
def user_id_by_txt(htm):
    r = [
        str(uid).rstrip('/') for uid in set(
            txt_wrap_by_all('href="http://www.douban.com/people/', '"', htm))
    ]
    r = [i for i in r if i.isalnum()]
    return r
Example #12
 def func_url(self, title):
     t = [
         i.split('">', 1)
         for i in txt_wrap_by_all('<a href="', '</a>', title)
     ]
     url, topic_name = t[1]
     return parse_topic_htm, url
Example #13
def wm_parser(html, url):
    if "&p=" not in url:
        REAL_USER.add(url.rsplit("=",1)[-1])
        page_id = txt_wrap_by_all(' pageid="','"',html)
        if page_id:
            page_id = int(page_id[-1])
            for i in xrange(1, page_id + 1):
                yield wm_parser, url + "&p=%s" % i

    for user_name in txt_wrap_by_all(' href="/user/','"', html):
        if "/" not in user_name:
            if (user_name in EXIST_USER) or (user_name in REAL_USER):
                continue
            EXIST_USER.add(user_name)
            yield wm_parser , "http://www.wumii.com/user/list/followings?u=%s"%user_name
            yield wm_parser , "http://www.wumii.com/user/list/fans?u=%s"%user_name 
Example #14
def page_parse(html):
    title = txt_wrap_by('<title>', '- 知乎', html)
    tags = txt_wrap_by_all('data-tip="t$b$', '"', html)
    for i in tags:
        print i,
    print title
    print ''
Example #15
 def parse_index(self,page,url):
     link_wrap_list = txt_wrap_by_all('已翻译','<span',page)
     link_list = []
     for link_wrap in link_wrap_list:
         url = txt_wrap_by('href="','"',link_wrap)
         if url and not url_is_fetched(url):
             yield self.parse_page,'http://dongxi.net/%s'%url
Example #16
    def __call__(self, html, url):
        html = txt_wrap_by('<ul class="list-m">', '</ul>', html)
        items = txt_wrap_by_all('<li class="item">', '</div>', html)
        if not items:
            items = txt_wrap_by_all('<h3><a', '</h3', html)

        links = []
        for item in items:
            link = txt_wrap_by('href="', '"', item)

            id = txt_wrap_by('http://www.douban.com/event/', '/', link)
            id = int(id)
            event = ImportDoubanEvent.get(id)
            if not event:
                yield self.parse_event_page, link, id
                ImportDoubanEvent(id=id, event_id=0).save()
Example #17
    def parse_page(self,filepath):
        with open(filepath) as f:
            page = f.read()

            title = txt_wrap_by('<title>译言网 | ', '</ti', page)
            tags_wrapper = txt_wrap_by('wumiiTags = "', '"', page)
            tags = tags_wrapper.split(',')
            author = txt_wrap_by('<h2 id="user_info"', '/a', page)
            author = txt_wrap_by('">', '<', author)
            rating = txt_wrap_by('已有<span class="number">', '</span', page)
            content_wrapper = txt_wrap_by('id="conBox">',
                                          '<div class="article_content">', page)
            url = txt_wrap_by('wumiiPermaLink = "', '"', page)
            if content_wrapper:
                content, pic_list = htm2txt(content_wrapper)
            else:
                return

            content = str(content)

            reply_wrapper_list = txt_wrap_by_all('class="comment_content">',
                                                 '</ul', page)
            reply_list = []
            for reply_wrapper in reply_wrapper_list:
                reply_list.append(txt_wrap_by('<p>', '</p', reply_wrapper))

            Spider.insert(title, tags, content, author, rating, url,
                          reply_list, pic_list)
Example #18
def wm_parser(html, url):
    if "&p=" not in url:
        REAL_USER.add(url.rsplit("=", 1)[-1])
        page_id = txt_wrap_by_all(' pageid="', '"', html)
        if page_id:
            page_id = int(page_id[-1])
            for i in xrange(1, page_id + 1):
                yield wm_parser, url + "&p=%s" % i

    for user_name in txt_wrap_by_all(' href="/user/', '"', html):
        if "/" not in user_name:
            if (user_name in EXIST_USER) or (user_name in REAL_USER):
                continue
            EXIST_USER.add(user_name)
            yield wm_parser, "http://www.wumii.com/user/list/followings?u=%s" % user_name
            yield wm_parser, "http://www.wumii.com/user/list/fans?u=%s" % user_name
Example #19
def read_next(start, offset):
    data = {'offset': offset, 'start': start}
    result = []
    data = urlencode(data)
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.7) Gecko/20091221 Firefox/3.5.7',
        'Accept':
        ' text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
        'Accept-Language': 'zh-cn,zh;q=0.5',
        'Accept-Charset': 'gb18030,utf-8;q=0.7,*;q=0.7',
        'Content-type': 'application/x-www-form-urlencoded'
    }

    headers[
        'Cookie'] = """__utma=155987696.1466564421.1323058814.1323081063.1323082137.3; __utmz=155987696.1323082137.3.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic|utmctr=site%3Azhihu.com; __utmv=155987696.Logged%20In; _xsrf=5f0d189d485b43cca16068abe2d981ec; __utmc=155987696; __utmb=155987696.70.10.1323082137; checkcode=d3Nsag==|1323081329|606c2864ea806947dae5b5a8d7ab17c2ad22894e; q_c0=MTY2MzIxfFZHUkQxQ2xweUp6Y1czMDk=|1323083404|aabdf01be80a6e1b1c2f6817b03ef2de8a62eb2f"""

    request = urllib2.Request(url='http://www.zhihu.com/log/questions',
                              data=data,
                              headers=headers)
    urlopener = urllib2.build_opener()
    r = urlopener.open(request)

    j = r.read()
    j = loads(j)
    html = j['msg'][1]

    name_list = txt_wrap_by_all('''</h2>

<div>

<a''', '<', html)

    id_list = txt_wrap_by_all('logitem-', '">', html)
    begin = '<a href="/question/'
    end = '</a'
    for id, name, i in zip(id_list, name_list,
                           txt_wrap_by_all(begin, end, html)):
        i = i.split('">', 1)
        i.append(id)
        name = unescape(name).strip()[14:].split('">', 1)
        if len(name) < 2:
            name = '?', '?'
        i.extend(name)
        result.append(i)

    return 20 + offset, result
Example #20
def zhihu_topic_feed(html, url, offset):
    o = loads(html)
    #pprint(o)
    id_list = txt_wrap_by_all('id=\\"feed-', '\\"', html)
    question_id_list = txt_wrap_by_all('href=\\"/question/', '\\"', html)
    QUESTION_ID_SET.update(map(int,question_id_list))

    print ">>>", len(QUESTION_ID_SET),'question', how_long.done, how_long.remain, how_long.estimate()

#    for i in id_list:
#        yield zhihu_question_parser, "http://www.zhihu.com/question/%s"%i
#    print id_list
    if len(id_list)>3:
        offset += o['msg'][0]
        yield zhihu_topic_feed, {'url':url['url'], 'data':urlencode(dict(start=id_list[-1], offset=offset))}, offset
    else:
        print "done", how_long.again(), how_long.done, how_long.remain
Example #21
def user_id_by_txt(htm):
    r = [
        str(uid).rstrip('/')
        for uid in
        set(txt_wrap_by_all('href="http://www.douban.com/people/', '"', htm))
    ]
    r = [i for i in r if i.isalnum()]
    return r
Example #22
def main():
    cookies = (
        (
            "*****@*****.**",
            "_xsrf=7ed86e897bae4b9e8cf3e660efed7baf; q_c0=MTk2OTAzfGdmWDM5Q2pZNVpaUW9UTzA=|1326267926|eedfe70f85add0db0ecda1e73200cac9b085ecc6; __utma=155987696.1247389772.1322703824.1326190947.1326266591.29; __utmb=155987696.34.10.1326266591; __utmc=155987696; __utmz=155987696.1325768571.27.6.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmv=155987696.Logged%20In",
        ),
    )

    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
        "Accept-Language": "en,en-US;q=0.8,zh-CN;q=0.6,zh;q=0.4",
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive",
        "Host": "www.zhihu.com",
        "Referer:http": "//www.zhihu.com/",
        "User-Agent": "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11",
    }
    count = 0
    headers["cookie"] = cookies[0][1]
    explore_page = fetch("http://www.zhihu.com/explore", headers=headers)

    entry_list = txt_wrap_by_all('<div class="xxn">', "</div", explore_page)
    rating_raw = txt_wrap_by("['explore_list',", ");", explore_page)
    data = loads(rating_raw)
    author_list = [[i[3][1][0].encode("utf-8"), i[3][2].encode("utf-8")] for i in data]
    rating_list = [i[3][3] for i in data]

    label_list = txt_wrap_by_all('"padding:3px 0 0" class="xm">', "</div", explore_page)
    result_label = [txt_wrap_by_all('">', "</a", i) for i in label_list]

    url_list = txt_wrap_by_all("<h2", "</h2>", explore_page)
    id_list = [txt_wrap_by("question/", "/answer", i) for i in url_list]
    title_list = [txt_wrap_by('">', "<", txt_wrap_by('href="', "/a>", i)) for i in url_list]

    url_list = txt_wrap_by_all("<h2", "</h2>", explore_page)
    id_list = [txt_wrap_by("question/", "/answer", i) for i in url_list]
    url_list = ["http://www.zhihu.com/question/%s" % id for id in id_list]

    entry_list = zip(title_list, rating_list, result_label, author_list, url_list, entry_list)

    for entry in entry_list:
        content, pic_list = htm2txt(entry[5])
        Spider.insert(entry[0], entry[2], content, entry[3][0], entry[1], entry[4], [], pic_list)
Example #23
    def parse_page(self, page, url):
        print "Dongxi...%s" % url
        title = txt_wrap_by('<div class="content_title clearfix">', '</h1>',
                            page).strip().split('>')[-1].strip()
        author = txt_wrap_by('<a class="link_text_blue" href="', '</a>',
                             page).strip().split('>')[-1].strip()

        tags = map(lambda x: x.split('>')[-1],
                   txt_wrap_by_all("<a  class='link_text_blue'", '</a>', page))
        rating_num = txt_wrap_by('onclick="favorate(', ')', page)

        content = txt_wrap_by('id="full_text">', '</div', page)

        yield (self.parse_rat,
               'http://dongxi.net/content/widget/page_id/%s' % rating_num,
               title, author, tags, url, content)
Example #24
def parse_content(txt):
    #id = txt_wrap_by('<a href="/question/', '/log" class="xrv">', txt)
    #t = unescape(txt_wrap_by('<title>', ' - 知乎</title>', txt))
    tlist = txt_wrap_by_all('<div class="xmrw">', '</div>', txt)

    r = [htm2txt(i) for i in tlist if i.strip()]

    #for pos, i in enumerate(r[:3]):
    #    print pos, len(i), i
    #    print "\n"
    return r
Example #25
def zhihu_topic_parser(html, url):
    global FETCH_COUNT

    #txt = txt_wrap_by( 'DZMT.push(["current_topic",', ')', html )
    #print loads(txt)[:2][0][0]
    question_id_list = map(
        int,
        filter(str.isdigit, txt_wrap_by_all('href="/question/', '">', html)))
    QUESTION_ID_SET.update(question_id_list)
    #QUESTION_ID_SET
    feed_id_list = txt_wrap_by_all('id="feed-', '">', html)
    print feed_id_list
    #    for i in feed_id_list:
    #        yield zhihu_question_parser, "http://www.zhihu.com/question/%s"%i
    if len(feed_id_list) >= 20:
        last_one = feed_id_list[-1]
        yield zhihu_topic_feed, {
            'url': url,
            'data': urlencode(dict(start=last_one, offset=20))
        }, 20
Example #26
 def parse_index(self,page, url):
     print "!"
     link_wrapper_list = txt_wrap_by_all('<h5 clas', '</h5', page)
     link_list = []
     for link_wrapper in link_wrapper_list:
         url = txt_wrap_by('href="', '"', link_wrapper)
         filename = self.name_builder(url)
         if not url_is_fetched(url):
             yield self.save_page, url
         else:
             self.parse_page(filename)
Example #27
def parse_content(txt):
    #id = txt_wrap_by('<a href="/question/', '/log" class="xrv">', txt)
    #t = unescape(txt_wrap_by('<title>', ' - 知乎</title>', txt))
    tlist = txt_wrap_by_all('<div class="xmrw">', '</div>', txt)
    
    r = [htm2txt(i) for i in tlist if i.strip()]

    #for pos, i in enumerate(r[:3]):
    #    print pos, len(i), i
    #    print "\n"
    return r
Example #28
    def func_url(self, title):
        t = [i.split('">', 1) for i in txt_wrap_by_all('<a href="', '</a>', title)]
        url , note_title = t[1]

        if url.startswith('http://www.douban.com/note/'):
            func = parse_note_people_htm
        elif url.startswith('http://site.douban.com/widget/notes/'):
            func = parse_note_site_htm
        else:
            func = 0
        return func, url
Example #29
def zhihu_topic_feed(html, url, offset):
    o = loads(html)
    #pprint(o)
    id_list = txt_wrap_by_all('id=\\"feed-', '\\"', html)
    question_id_list = txt_wrap_by_all('href=\\"/question/', '\\"', html)
    QUESTION_ID_SET.update(map(int, question_id_list))

    print ">>>", len(
        QUESTION_ID_SET
    ), 'question', how_long.done, how_long.remain, how_long.estimate()

    #    for i in id_list:
    #        yield zhihu_question_parser, "http://www.zhihu.com/question/%s"%i
    #    print id_list
    if len(id_list) > 3:
        offset += o['msg'][0]
        yield zhihu_topic_feed, {
            'url': url['url'],
            'data': urlencode(dict(start=id_list[-1], offset=offset))
        }, offset
    else:
        print "done", how_long.again(), how_long.done, how_long.remain
Example #30
    def func_url(self, title):
        t = [
            i.split('">', 1)
            for i in txt_wrap_by_all('<a href="', '</a>', title)
        ]
        url, note_title = t[1]

        if url.startswith('http://www.douban.com/note/'):
            func = parse_note_people_htm
        elif url.startswith('http://site.douban.com/widget/notes/'):
            func = parse_note_site_htm
        else:
            func = 0
        return func, url
Example #31
def wm_parser(html, url):
    user = txt_wrap_by('&u=', '&', url)
    #print user
    time = txt_wrap_by('<li id="maxActionTimeInMs"  m="', '"', html)
    if time and 'm=' + time not in url and int(time) > 0:
        yield wm_parser, url[:url.rfind('=') + 1] + str(time)

    user_id = wm_user_id(user)
    for i in txt_wrap_by_all(' itemid="', '<p class="operating">', html):
        if 'class="content"' in i:
            id = i[:i.find('"')]

            wm = SpiderWm.get(wmid=id)
            if wm is None:
                yield wm_txt_parser, 'http://www.wumii.com/reader/article?id=%s' % id, user_id
            else:
                wm_fav(user_id, wm.id)
Example #32
def wm_parser(html, url):
    user = txt_wrap_by('&u=', '&', url)
    #print user
    time = txt_wrap_by('<li id="maxActionTimeInMs"  m="', '"', html)
    if time and 'm='+time not in url and int(time) > 0:
        yield wm_parser, url[:url.rfind('=')+1]+str(time)

    user_id = wm_user_id(user)
    for i in txt_wrap_by_all(' itemid="', '<p class="operating">', html):
        if 'class="content"' in i:
            id = i[:i.find('"')]

            wm = SpiderWm.get(wmid=id)
            if wm is None:
                yield wm_txt_parser, 'http://www.wumii.com/reader/article?id=%s'%id, user_id
            else:
                wm_fav(user_id, wm.id)
Example #33
    def htm(self, data):
        result = [ ]
        html = txt_wrap_by('<div class="topic-content">', '</div>', data)
        if html:
            result.append(html)
        user_id = self.user_id(data)
        topic_reply = txt_wrap_by('<ul class="topic-reply">', '</ul>', data)
        topic_reply = txt_wrap_by_all(' <div class="reply-doc">', ' class="lnk-reply">回应</a>', topic_reply)

        for i in topic_reply:
            owner_id = txt_wrap_by('<div class="bg-img-green">', '</h4>', i)
            owner_id = txt_wrap_by('<a href="http://www.douban.com/people/', '/">', owner_id)
            if owner_id != user_id:
                break
            result.append(txt_wrap_by('</div>', '<div class="operation_div"', i))

        return '\n'.join(result)
Example #34
    def htm(self, data):
        result = []
        html = txt_wrap_by('<div class="topic-content">', '</div>', data)
        if html:
            result.append(html)
        user_id = self.user_id(data)
        topic_reply = txt_wrap_by('<ul class="topic-reply">', '</ul>', data)
        topic_reply = txt_wrap_by_all(' <div class="reply-doc">',
                                      ' class="lnk-reply">回应</a>', topic_reply)

        for i in topic_reply:
            owner_id = txt_wrap_by('<div class="bg-img-green">', '</h4>', i)
            owner_id = txt_wrap_by('<a href="http://www.douban.com/people/',
                                   '/">', owner_id)
            if owner_id != user_id:
                break
            result.append(
                txt_wrap_by('</div>', '<div class="operation_div"', i))

        return '\n'.join(result)
Example #35
def find_next(page_file):
    global question_set_fetched,question_set
    with open(page_file) as page:
        link = set(txt_wrap_by_all('<a class="xu" name="rlq" id="rlq-','"',  page.read()))
        question_set|=link
Example #36
 def func_url(self, title):
     t = [i.split('">', 1) for i in txt_wrap_by_all('<a href="', '</a>', title)]
     url , topic_name = t[1]
     return parse_topic_htm, url
Example #37
# Assumed imports for this fragment; `buffer`, `passed` (sets of profile ids)
# and `has_cn` are defined earlier in the original script and are not shown.
import datetime
import sys
import traceback
from urllib2 import urlopen

touch = 'http://42qu.com/google_plus?q='

while buffer:
    uid = buffer.pop()
    passed.add(uid)
    url = 'https://plus.google.com/%s/posts?hl=en' % uid
    print url
    try:
        html = urlopen(url, timeout=60).read()
    except:
        traceback.print_exc()
        continue
    if not has_cn(html):
        continue

    for i in txt_wrap_by_all('href="/', '"', html):
        if i.isdigit():
            i = int(i)
            if i in passed:
                continue
            if i in buffer:
                continue
            buffer.add(i)
            print i, datetime.datetime.now()
            sys.stdout.flush()
            try:
                urlopen(touch + str(i), timeout=30)
            except:
                traceback.print_exc()
                continue