def zhihu_question_parser(html, url):
    name = txt_wrap_by(
        '<title>',
        ' - 知乎</title>',
        html
    )
    name = unescape(name)
    if  '<h3>邀请别人回答问题</h3>' in html:
        answer_count = txt_wrap_by('<span id="xhrw">', ' 个答案</span>', html)
    else:
        answer_count = txt_wrap_by('<h3 style="margin: 0 0 5px;">', ' 个答案</', html)

    tag = map(unescape, txt_wrap_by_all('<a class="xjl" href="javascript:;">', '</a>', html))
    #print tag[0]
    answer_count =  int(answer_count or 0)

    if answer_count:
        txt = filter(bool, txt_wrap_by_all('<div class="xmrw">','</div>', html))
        if not txt:
            print url
            print name
            #raw_input()
        else:
            print txt[0]
    else:
        if "个答案" in html and ("0 个答案" not in html) and "还没有答案" not in html:
            print url
            print html 
            #raw_input()
        txt = []

    RESULT.append((answer_count, url, name, tag, [htm2txt(i) for i in txt]))

    print how_long.again(), how_long.remain, how_long.done
Esempio n. 2
0
def link_title_uid_txt(i):
    """Pull ``(link, title, rss_uid, txt)`` out of one feed entry mapping.

    ``i`` is a dict-like entry.  ``link`` is the ``href`` of the first
    ``'alternate'`` item (``''`` when absent), ``title`` is the unescaped
    ``'title'`` (``'无题'``, i.e. "untitled", when absent) and ``rss_uid``
    is ``i['id']`` or ``1``.

    Returns ``None`` when the entry has neither a summary nor content,
    when the snippet's ``'content'`` value is empty, or when nothing is
    left after the markup is converted to plain text.
    """
    link = i['alternate'][0]['href'] if 'alternate' in i else ''
    title = unescape(i['title']) if 'title' in i else '无题'
    rss_uid = i.get('id') or 1

    # Prefer the summary; fall back to the full content.
    snippet = i.get('summary') or i.get('content')
    if not snippet:
        return

    htm = snippet['content']
    if not htm:
        return

    # Clean the markup, rewrite <pre> sections, tidy the fragment, then
    # turn explicit line breaks into newlines before flattening to text.
    htm = txt_map('<pre', '</pre>', txttidy(htm), pre_br)
    htm = tidy_fragment(htm, {'indent': 0})[0].replace('<br />', '\n')

    txt = htm2txt(htm)
    if not txt:
        return

    return link, title, rss_uid, txt
Esempio n. 3
0
def zhihu_question_parser(html, url):
    name = txt_wrap_by('<title>', ' - 知乎</title>', html)
    name = unescape(name)
    if '<h3>邀请别人回答问题</h3>' in html:
        answer_count = txt_wrap_by('<span id="xhrw">', ' 个答案</span>', html)
    else:
        answer_count = txt_wrap_by('<h3 style="margin: 0 0 5px;">', ' 个答案</',
                                   html)

    tag = map(
        unescape,
        txt_wrap_by_all('<a class="xjl" href="javascript:;">', '</a>', html))
    #print tag[0]
    answer_count = int(answer_count or 0)

    if answer_count:
        txt = filter(bool, txt_wrap_by_all('<div class="xmrw">', '</div>',
                                           html))
        if not txt:
            print url
            print name
            #raw_input()
        else:
            print txt[0]
    else:
        if "个答案" in html and ("0 个答案" not in html) and "还没有答案" not in html:
            print url
            print html
            #raw_input()
        txt = []

    RESULT.append((answer_count, url, name, tag, [htm2txt(i) for i in txt]))

    print how_long.again(), how_long.remain, how_long.done
Esempio n. 4
0
def link_title_uid_txt(i):
    """Return ``(link, title, rss_uid, txt)`` for a feed entry, or ``None``.

    ``None`` is returned when the entry carries no usable snippet
    (``'summary'`` or ``'content'``), when that snippet's ``'content'`` is
    empty, or when the converted plain text is empty.
    """
    link = ''
    if 'alternate' in i:
        link = i['alternate'][0]['href']

    # '无题' ("untitled") is the placeholder for entries without a title.
    title = '无题'
    if 'title' in i:
        title = unescape(i['title'])

    rss_uid = i.get('id') or 1

    snippet = i.get('summary') or i.get('content')
    htm = snippet['content'] if snippet else None
    if not htm:
        return

    # Each step feeds the next: cleanup, <pre> rewriting, tidy, <br /> to
    # newline, and finally markup to plain text.
    cleaned = txttidy(htm)
    with_pre = txt_map('<pre', '</pre>', cleaned, pre_br)
    tidied = tidy_fragment(with_pre, {'indent': 0})[0]
    txt = htm2txt(tidied.replace('<br />', '\n'))

    if txt:
        return link, title, rss_uid, txt
    return
Esempio n. 5
0
def feed_import_new(zsite_id, rid, title, txt, url, rank):
    """Store a new ``FeedImport`` record unless its text is a duplicate.

    ``title`` is unescaped and ``txt`` is passed through ``format_txt``;
    both then go through ``utf8_ftoj``.  If the duplicate checker already
    knows the text, nothing is stored and ``None`` is returned.  Otherwise
    the record is saved, registered with the duplicate checker and, when
    the feed maps to a user with a ``user_id``, linked to that user.

    Returns the saved ``FeedImport`` instance, or ``None`` for duplicates.
    """
    title = utf8_ftoj(unescape(title))
    txt = utf8_ftoj(format_txt(txt))

    if import_feed_duplicator.txt_is_duplicate(txt):
        return

    feed_user = user_by_feed_id_zsite_id(zsite_id, rid)
    # 0 means "no matching feed user".
    po_meta_user_id = feed_user.id if feed_user else 0

    new_feed = FeedImport(
        title=title,
        txt=txt,
        zsite_id=zsite_id,
        rid=rid,
        url=url,
        tag_id_list='',
        state=FEED_IMPORT_STATE_WITHOUT_TAG,
        rank=rank,
        po_meta_user_id=po_meta_user_id,
    )
    new_feed.save()

    feed_id = new_feed.id
    import_feed_duplicator.set_record(txt, feed_id)

    linked_user_id = feed_user.user_id if feed_user else None
    if linked_user_id:
        feed_import_user_new(linked_user_id, feed_id)

    return new_feed
Esempio n. 6
0
def zhihu_question_parser(html, url):
    name = txt_wrap_by(
        '<title>',
        ' - 知乎</title>',
        html
    )
    name = unescape(name)
    print name
    print how_long.again(), how_long.remain, how_long.done
Esempio n. 7
0
def title_normal_sign(title):
    """Unescape ``title`` and normalise wide punctuation to ASCII.

    The CJK brackets 【 】 and the fullwidth forms ［ ］ （ ） ： are replaced
    by ``[ ] [ ] ( ) :``, then surrounding whitespace is stripped.

    Most of the original replacement pairs mapped an ASCII character to
    itself and therefore did nothing; the sources are restored to the
    fullwidth characters, which is what the 【/】 pair and the function
    name indicate was intended.

    Returns the normalised title.
    """
    title = unescape(title)
    # Plain chained str.replace keeps this working on UTF-8 byte strings
    # as well as on text strings.
    for wide, narrow in (
        ('【', '['),
        ('】', ']'),
        ('［', '['),
        ('］', ']'),
        ('（', '('),
        ('）', ')'),
        ('：', ':'),
    ):
        title = title.replace(wide, narrow)

    return title.strip()
def feed_import_new(zsite_id, rid, title, txt, url,  rank):
    """Create, save and return a ``FeedImport`` for a not-yet-seen text.

    The title is unescaped, the text is run through ``format_txt`` and both
    are converted with ``utf8_ftoj``.  When ``import_feed_duplicator``
    reports the text as a duplicate the function returns ``None`` without
    saving.  After saving, the text is recorded under the new id and, if
    the feed resolves to a user that has a ``user_id``,
    ``feed_import_user_new`` is called for that user.
    """
    title = utf8_ftoj(unescape(title))
    txt = utf8_ftoj(format_txt(txt))

    if import_feed_duplicator.txt_is_duplicate(txt):
        return

    feed_user = user_by_feed_id_zsite_id(zsite_id, rid)
    if not feed_user:
        # No feed user found: store 0 as the owner id.
        owner_id = 0
    else:
        owner_id = feed_user.id

    fields = dict(
        title=title,
        txt=txt,
        zsite_id=zsite_id,
        rid=rid,
        url=url,
        tag_id_list='',
        state=FEED_IMPORT_STATE_WITHOUT_TAG,
        rank=rank,
        po_meta_user_id=owner_id,
    )
    new_feed = FeedImport(**fields)
    new_feed.save()

    saved_id = new_feed.id
    import_feed_duplicator.set_record(txt, saved_id)

    if feed_user:
        user_id = feed_user.user_id
        if user_id:
            feed_import_user_new(user_id, saved_id)

    return new_feed
Esempio n. 9
0
 def name(self, data):
     """Return the unescaped ``<title>`` text of ``data`` minus its tail.

     The stripped title is converted with ``str()`` and its last six
     elements are cut off before unescaping.
     """
     raw_title = txt_wrap_by('<title>', '</title>', data).strip()
     # Titles look like "xxx小组"; the slice presumably removes the six
     # UTF-8 bytes of the "小组" suffix (TODO confirm the byte-string type).
     trimmed = str(raw_title)[:-6]
     return unescape(trimmed)
Esempio n. 10
0
 def name(self, data):
     """Extract the page title from ``data`` and drop its six-unit suffix.

     Takes the text between ``<title>`` and ``</title>``, strips it,
     converts it with ``str()``, removes the last six elements and returns
     the unescaped remainder.
     """
     # Titles look like "xxx小组"; assuming UTF-8 byte strings, the last six
     # bytes are that two-character suffix (verify against callers).
     suffix_len = 6
     title_text = str(txt_wrap_by("<title>", "</title>", data).strip())
     return unescape(title_text[:-suffix_len])