def zhihu_question_parser(html, url):
    """Parse one Zhihu question page and append the result to ``RESULT``.

    All fields are cut out of ``html`` with ``txt_wrap_by`` /
    ``txt_wrap_by_all`` (presumably "text between a start and an end
    marker" helpers -- confirm against their definitions).

    The record appended to the module-level ``RESULT`` list is the tuple
    ``(answer_count, url, name, tag, answers)`` where

    * ``answer_count`` is the integer answer counter (0 when none is found),
    * ``name`` is the unescaped page title without the " - Zhihu" suffix,
    * ``tag`` is the list of unescaped tag labels,
    * ``answers`` is the list of non-empty answer bodies run through
      ``htm2txt``; it is empty when ``answer_count`` is 0.

    Args:
        html: Markup of the question page.
        url: Address of the page; stored in the record and echoed in the
            diagnostic output.

    Returns:
        None. The function works through side effects: it appends to
        ``RESULT`` and prints diagnostics / progress to stdout.
    """
    name = unescape(txt_wrap_by('<title>', ' - 知乎</title>', html))

    # The answer counter lives in different markup depending on whether the
    # "invite others to answer" heading is present on the page.
    if '<h3>邀请别人回答问题</h3>' in html:
        answer_count = txt_wrap_by('<span id="xhrw">', ' 个答案</span>', html)
    else:
        answer_count = txt_wrap_by(
            '<h3 style="margin: 0 0 5px;">', ' 个答案</', html
        )

    # List comprehensions instead of map()/filter(): on Python 3 those return
    # lazy iterators, which are always truthy and cannot be indexed.
    tag = [
        unescape(label)
        for label in txt_wrap_by_all(
            '<a class="xjl" href="javascript:;">', '</a>', html
        )
    ]

    # A missing counter is treated as "no answers".
    answer_count = int(answer_count or 0)

    txt = []
    if answer_count:
        txt = [
            body
            for body in txt_wrap_by_all('<div class="xmrw">', '</div>', html)
            if body
        ]
        if txt:
            print(txt[0])
        else:
            # The counter claims answers exist but none were extracted:
            # report the page so it can be inspected.
            print(url)
            print(name)
    elif (
        "个答案" in html
        and "0 个答案" not in html
        and "还没有答案" not in html
    ):
        # The page mentions an answer counter that is neither "0 answers"
        # nor "no answers yet", yet nothing was parsed: dump it for review.
        print(url)
        print(html)

    RESULT.append((answer_count, url, name, tag, [htm2txt(body) for body in txt]))

    # One pre-formatted string so the line looks the same under the Python 2
    # print statement and the Python 3 print function.
    print("%s %s %s" % (how_long.again(), how_long.remain, how_long.done))
def link_title_uid_txt(i):
    """Reduce one feed entry mapping to ``(link, title, rss_uid, txt)``.

    Args:
        i: Mapping describing a feed entry. The keys read here are
            ``'alternate'`` (a sequence whose first item has an ``'href'``),
            ``'title'``, ``'id'`` and ``'summary'`` / ``'content'`` (a mapping
            with a ``'content'`` key holding markup).

    Returns:
        A 4-tuple ``(link, title, rss_uid, txt)``, or ``None`` when the entry
        has no summary/content, the markup is empty, or the markup converts
        to empty text. ``link`` defaults to ``''``, ``title`` to the
        "untitled" placeholder and ``rss_uid`` to ``1``.
    """
    link = i['alternate'][0]['href'] if 'alternate' in i else ''
    title = unescape(i['title']) if 'title' in i else '无题'
    rss_uid = i.get('id') or 1

    # Prefer the summary; fall back to the full content.
    snippet = i.get('summary') or i.get('content')
    if not snippet:
        return None

    htm = snippet['content']
    if not htm:
        return None

    # Clean the markup step by step before turning it into plain text.
    htm = txttidy(htm)
    htm = txt_map('<pre', '</pre>', htm, pre_br)
    htm = tidy_fragment(htm, {'indent': 0})[0]
    txt = htm2txt(htm.replace('<br />', '\n'))
    if not txt:
        return None

    return link, title, rss_uid, txt
def zhihu_question_parser(html, url):
    """Extract a Zhihu question page into a record stored in ``RESULT``.

    The page title, answer counter, tag labels and answer bodies are cut out
    of ``html`` with ``txt_wrap_by`` / ``txt_wrap_by_all`` (presumably
    helpers returning the text between two markers -- confirm against their
    definitions). The tuple
    ``(answer_count, url, name, tag, [htm2txt(body) for body in txt])``
    is then appended to the module-level ``RESULT`` list.

    Args:
        html: Markup of the question page.
        url: Address of the page; kept in the record and printed when the
            page looks inconsistent.

    Returns:
        None. Output is the appended record plus diagnostic and progress
        lines written to stdout.
    """
    name = txt_wrap_by('<title>', ' - 知乎</title>', html)
    name = unescape(name)

    # Two page layouts: with and without the "invite others to answer" block.
    has_invite_block = '<h3>邀请别人回答问题</h3>' in html
    if has_invite_block:
        answer_count = txt_wrap_by('<span id="xhrw">', ' 个答案</span>', html)
    else:
        answer_count = txt_wrap_by(
            '<h3 style="margin: 0 0 5px;">', ' 个答案</', html
        )

    # Built as real lists so truthiness and indexing below also work on
    # Python 3, where map()/filter() return iterators.
    raw_tags = txt_wrap_by_all('<a class="xjl" href="javascript:;">', '</a>', html)
    tag = [unescape(item) for item in raw_tags]

    # No counter found -> zero answers.
    answer_count = int(answer_count or 0)

    if answer_count:
        raw_answers = txt_wrap_by_all('<div class="xmrw">', '</div>', html)
        txt = [item for item in raw_answers if item]
        if not txt:
            # Counter says there are answers, extraction found none.
            print(url)
            print(name)
        else:
            print(txt[0])
    else:
        txt = []
        mentions_counter = "个答案" in html
        looks_empty = "0 个答案" in html or "还没有答案" in html
        if mentions_counter and not looks_empty:
            # A counter is on the page but was not parsed: dump for review.
            print(url)
            print(html)

    RESULT.append((answer_count, url, name, tag, [htm2txt(item) for item in txt]))

    # Formatted as one string so the Python 2 print statement and the
    # Python 3 print function emit the same line.
    print("%s %s %s" % (how_long.again(), how_long.remain, how_long.done))
def feed_import_new(zsite_id, rid, title, txt, url, rank):
    """Store a new imported feed entry unless its text is a duplicate.

    The title is unescaped and the text is passed through ``format_txt``;
    both then go through ``utf8_ftoj`` (presumably a character-set
    normalisation -- confirm against its definition) before anything else
    happens.

    Args:
        zsite_id: Site id stored on the record and used to look up the feed user.
        rid: Feed id stored on the record and used to look up the feed user.
        title: Raw entry title.
        txt: Raw entry text.
        url: Entry URL stored on the record.
        rank: Rank stored on the record.

    Returns:
        The saved ``FeedImport`` instance, or ``None`` when
        ``import_feed_duplicator`` reports the text as a duplicate.
    """
    title = unescape(title)
    title = utf8_ftoj(title)
    txt = format_txt(txt)
    txt = utf8_ftoj(txt)

    if import_feed_duplicator.txt_is_duplicate(txt):
        return None

    feed_user = user_by_feed_id_zsite_id(zsite_id, rid)
    po_meta_user_id = feed_user.id if feed_user else 0

    new_feed = FeedImport(
        title=title,
        txt=txt,
        zsite_id=zsite_id,
        rid=rid,
        url=url,
        tag_id_list='',
        state=FEED_IMPORT_STATE_WITHOUT_TAG,
        rank=rank,
        po_meta_user_id=po_meta_user_id,
    )
    new_feed.save()

    # Register the text only after saving, when the record id is read.
    feed_id = new_feed.id
    import_feed_duplicator.set_record(txt, feed_id)

    # Link the entry to a user when the feed is mapped to one.
    owner_id = feed_user.user_id if feed_user else None
    if owner_id:
        feed_import_user_new(owner_id, feed_id)

    return new_feed
def zhihu_question_parser(html, url):
    """Print the title of a Zhihu question page followed by a progress line.

    The title is the text ``txt_wrap_by`` finds between ``<title>`` and the
    " - Zhihu" title suffix (presumably a "text between two markers" helper
    -- confirm against its definition), passed through ``unescape``.

    Args:
        html: Markup of the question page.
        url: Address of the page. Unused here; kept so the function has the
            same signature as the other page parsers.

    Returns:
        None. The function only writes to stdout.
    """
    name = unescape(txt_wrap_by('<title>', ' - 知乎</title>', html))
    print(name)

    # One pre-formatted string so the line is identical under the Python 2
    # print statement and the Python 3 print function.
    print("%s %s %s" % (how_long.again(), how_long.remain, how_long.done))
def title_normal_sign(title):
    """Unescape ``title`` and fold CJK / fullwidth punctuation to ASCII.

    Replacements applied, in order:

    * lenticular brackets U+3010 / U+3011 -> ``[`` / ``]``
    * fullwidth square brackets U+FF3B / U+FF3D -> ``[`` / ``]``
    * fullwidth parentheses U+FF08 / U+FF09 -> ``(`` / ``)``
    * fullwidth colon U+FF1A -> ``:``

    The previous version replaced the ASCII characters ``[ ] ( ) :`` with
    themselves, which did nothing; the fullwidth forms are what the
    normalisation was evidently meant to catch.

    Args:
        title: Raw title text, possibly containing HTML entities.

    Returns:
        The normalised title with leading and trailing whitespace removed.
    """
    # Plain str.replace with literal characters keeps this working for both
    # Python 2 byte strings and Python 3 text strings.
    sign_pairs = (
        ('【', '['),
        ('】', ']'),
        ('［', '['),
        ('］', ']'),
        ('（', '('),
        ('）', ')'),
        ('：', ':'),
    )

    title = unescape(title)
    for wide_sign, ascii_sign in sign_pairs:
        title = title.replace(wide_sign, ascii_sign)
    return title.strip()
def feed_import_new(zsite_id, rid, title, txt, url, rank):
    """Create and save a ``FeedImport`` row for a feed entry, skipping duplicates.

    ``title`` is unescaped and ``txt`` is run through ``format_txt``; both are
    then passed to ``utf8_ftoj`` (presumably a text normalisation step --
    confirm against its definition). If ``import_feed_duplicator`` already
    knows the resulting text, nothing is stored.

    Args:
        zsite_id: Site id saved on the row; also used to find the feed user.
        rid: Feed id saved on the row; also used to find the feed user.
        title: Raw entry title.
        txt: Raw entry text.
        url: Entry URL saved on the row.
        rank: Rank saved on the row.

    Returns:
        The saved ``FeedImport`` object, or ``None`` for duplicate text.
    """
    clean_title = utf8_ftoj(unescape(title))
    clean_txt = utf8_ftoj(format_txt(txt))

    if import_feed_duplicator.txt_is_duplicate(clean_txt):
        return None

    feed_user = user_by_feed_id_zsite_id(zsite_id, rid)
    has_feed_user = bool(feed_user)

    new_feed = FeedImport(
        title=clean_title,
        txt=clean_txt,
        zsite_id=zsite_id,
        rid=rid,
        url=url,
        tag_id_list='',
        state=FEED_IMPORT_STATE_WITHOUT_TAG,
        rank=rank,
        # 0 marks "no feed user found for this feed".
        po_meta_user_id=feed_user.id if has_feed_user else 0,
    )
    new_feed.save()

    record_id = new_feed.id
    import_feed_duplicator.set_record(clean_txt, record_id)

    # Attach the new row to a user when the feed user carries a user id.
    if has_feed_user:
        linked_user_id = feed_user.user_id
        if linked_user_id:
            feed_import_user_new(linked_user_id, record_id)

    return new_feed
def name(self, data):
    """Return the unescaped page title of ``data`` without its 6-item tail.

    The text between ``<title>`` and ``</title>`` is stripped of surrounding
    whitespace, converted with ``str()``, cut by its last six elements and
    finally passed through ``unescape``.
    """
    # Titles look like "xxx" followed by a fixed "group" suffix; presumably
    # that suffix is two CJK characters, i.e. 6 bytes in a UTF-8 byte
    # string -- confirm before running this on text (unicode) strings.
    page_title = txt_wrap_by('<title>', '</title>', data).strip()
    without_suffix = str(page_title)[:-6]
    return unescape(without_suffix)
def name(self, data):
    """Extract the page title from ``data`` and drop its fixed-length suffix.

    Steps: take the text between ``<title>`` and ``</title>``, strip
    surrounding whitespace, convert with ``str()``, remove the last six
    elements, then ``unescape`` the remainder.
    """
    raw_title = txt_wrap_by("<title>", "</title>", data)
    title_text = str(raw_title.strip())
    # The title has the form "xxx" plus a "group" suffix; presumably the
    # suffix is two CJK characters occupying 6 bytes of a UTF-8 byte string
    # -- confirm before using this with text (unicode) strings.
    title_text = title_text[:-6]
    return unescape(title_text)