def page_parse(htm_file):
    # Parse a saved Zhihu question page and append it to out_file as one JSON line.
    # out_file, loads and dumps are assumed to come from the surrounding module.
    html = open(htm_file).read()
    title = txt_wrap_by('<title>', '- 知乎', html)
    tags = txt_wrap_by_all('xgm" href="javascript:;">', '</', html)  # unused; tags come from the embedded JSON below
    reply_raw_list = txt_wrap_by_all('<div class="xmo">', 'class="xnq xml xnh">', html)
    replies = [htm2txt(x)[0] for x in reply_raw_list]

    # The page embeds answer metadata in a JS call; rebuild it into a JSON array.
    js = '["current_question",' + txt_wrap_by("(['current_question', ", ');', html)
    a = loads(js)

    answer_list = []
    question_info = {}
    question_info['answer'] = answer_list
    question_info['tags'] = [x[0] for x in a[1][3]]
    question_info['title'] = title
    question_info['body'] = htm2txt(
        txt_wrap_by('<div class="xvrw">', '<a href="javascript', html))[0]

    replies_line = zip(a[1][12], replies)
    for x in replies_line:
        try:
            new_ans = {}
            new_ans['name'] = x[0][2][0]
            new_ans['answer'] = x[1]
            new_ans['id'] = x[0][2][1]
            new_ans['signature'] = x[0][3]
            new_ans['votes'] = x[0][4]
            answer_list.append(new_ans)
        except (IndexError, TypeError):
            # Skip metadata entries that do not match the expected layout.
            continue

    out_file.write(dumps(question_info) + '\n')
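# Every parser in this collection leans on txt_wrap_by / txt_wrap_by_all to slice
# text out of raw HTML between two markers. Their implementation is not part of
# these snippets; the functions below are only a minimal sketch of the assumed
# behaviour (first-match and all-matches substring extraction), written for
# illustration and deliberately named with a _sketch suffix so they do not shadow
# the real helpers.
def txt_wrap_by_sketch(begin, end, html):
    # Return the text between the first `begin` marker and the following `end`
    # marker, or '' when either marker is missing.
    start = html.find(begin)
    if start < 0:
        return ''
    start += len(begin)
    stop = html.find(end, start)
    if stop < 0:
        return ''
    return html[start:stop]


def txt_wrap_by_all_sketch(begin, end, html):
    # Return every non-overlapping slice wrapped by the begin/end marker pair.
    result = []
    pos = 0
    while True:
        start = html.find(begin, pos)
        if start < 0:
            break
        start += len(begin)
        stop = html.find(end, start)
        if stop < 0:
            break
        result.append(html[start:stop])
        pos = stop + len(end)
    return result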
def zhihu_question_parser(html, url):
    name = txt_wrap_by('<title>', ' - 知乎</title>', html)
    name = unescape(name)
    # The answer-count markup differs depending on whether the page shows the
    # "invite others to answer" block.
    if '<h3>邀请别人回答问题</h3>' in html:
        answer_count = txt_wrap_by('<span id="xhrw">', ' 个答案</span>', html)
    else:
        answer_count = txt_wrap_by('<h3 style="margin: 0 0 5px;">', ' 个答案</', html)
    tag = map(unescape, txt_wrap_by_all('<a class="xjl" href="javascript:;">', '</a>', html))
    #print tag[0]
    answer_count = int(answer_count or 0)
    if answer_count:
        txt = filter(bool, txt_wrap_by_all('<div class="xmrw">', '</div>', html))
        if not txt:
            print url
            print name
            #raw_input()
        else:
            print txt[0]
    else:
        # Pages that claim answers but match none of the known markers are logged.
        if "个答案" in html and ("0 个答案" not in html) and "还没有答案" not in html:
            print url
            print html
            #raw_input()
        txt = []
    RESULT.append((answer_count, url, name, tag, [htm2txt(i) for i in txt]))
    print how_long.again(), how_long.remain, how_long.done
def parse_page(self, filepath):
    with open(filepath) as f:
        page = f.read()
    title = txt_wrap_by('<title>译言网 | ', '</ti', page)
    tags_wrapper = txt_wrap_by('wumiiTags = "', '"', page)
    tags = tags_wrapper.split(',')
    author = txt_wrap_by('<h2 id="user_info"', '/a', page)
    author = txt_wrap_by('">', '<', author)
    rating = txt_wrap_by('已有<span class="number">', '</span', page)
    content_wrapper = txt_wrap_by('id="conBox">', '<div class="article_content">', page)
    url = txt_wrap_by('wumiiPermaLink = "', '"', page)
    if content_wrapper:
        content, pic_list = htm2txt(content_wrapper)
    else:
        return
    content = str(content)
    reply_wrapper_list = txt_wrap_by_all('class="comment_content">', '</ul', page)
    reply_list = []
    for reply_wrapper in reply_wrapper_list:
        reply_list.append(txt_wrap_by('<p>', '</p', reply_wrapper))
    Spider.insert(title, tags, content, author, rating, url, reply_list, pic_list)
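# htm2txt (imported from zkit.htm2txt further down) is called in two ways across
# these snippets: htm2txt(html)[0] for the plain text, and
# "content, pic_list = htm2txt(html)" for text plus the extracted image URLs.
# The stub below only illustrates that assumed (text, pic_list) return shape; it
# is not the real converter, which does far more careful HTML handling.
import re

def htm2txt_stub(html):
    # Collect <img> sources first, then crudely strip the remaining tags.
    pic_list = re.findall(r'<img[^>]+src=[\'"]([^\'"]+)[\'"]', html)
    txt = re.sub(r'<[^>]+>', '', html)
    return txt, pic_list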
def link_title_uid_txt(i):
    if 'alternate' in i:
        link = i['alternate'][0]['href']
    else:
        link = ''
    if 'title' in i:
        title = unescape(i['title'])
    else:
        title = '无题'
    rss_uid = i.get('id') or 1
    snippet = i.get('summary') or i.get('content') or None
    if not snippet:
        return
    htm = snippet['content']
    if not htm:
        return
    htm = txttidy(htm)
    htm = txt_map('<pre', '</pre>', htm, pre_br)
    htm = tidy_fragment(htm, {'indent': 0})[0]
    htm = htm.replace('<br />', '\n')
    txt = htm2txt(htm)
    if not txt:
        return
    return link, title, rss_uid, txt
def save_event(self, phone, address, begin_time, end_time, title, intro,
               douban_event_id, typ):
    begin_time = time_by_string(begin_time)
    end_time = time_by_string(end_time)
    # Skip events that have already started.
    if begin_time < datetime.now():
        return None

    if typ in EVENT_DICT:
        event_cid = EVENT_DICT[typ]
    else:
        # Fall back to the "其他" (other) category.
        event_cid = EVENT_DICT[u'其他']

    city = address[0]
    place = address[1]
    if len(address) == 2:
        address = address[1]
    else:
        address = address[2]
    city_pid = location_finder(city)
    pid = location_finder(place)
    if pid not in PLACE_L1L2[city_pid]:
        pid = city_pid

    begin = datetime_to_minutes(begin_time)
    end = datetime_to_minutes(end_time)
    id = 0
    limit_up = 42
    limit_down = 0
    transport = ''
    price = 0
    event = event_new(
        self.user_id, event_cid, city_pid, pid, address, transport,
        begin, end, 0, limit_up, limit_down, phone, 0, id
    )
    id = event.id
    po = po_new(CID_EVENT, self.user_id, '', STATE_SECRET, id=id, zsite_id=self.zsite_id)
    if po:
        po.name_ = title
        po.txt_set(htm2txt(intro)[0])
        po.save()
    event_init2to_review(id)
    import_douban_event = ImportDoubanEvent.get_or_create(id=int(douban_event_id))
    import_douban_event.event_id = id
    import_douban_event.save()
    return event
def feed_import_by_douban_feed():
    from model.douban import douban_feed_to_review_iter, DoubanUser
    for i in douban_feed_to_review_iter():
        #print i.id
        # Replace Douban-specific jargon before importing the feed text.
        txt = i.htm.replace('豆友', '网友').replace('豆油', '私信').replace('豆邮', '私信')
        #print i.id, i.title
        txt = htm2txt(txt)
        feed_import_new(ZSITE_DOUBAN_ID, i.id, i.title, txt, i.link, i.like + i.rec)
def main():
    # NOTE: the cookie below is a stale session string captured for the original
    # crawl; replace it with a valid login cookie before running.
    cookies = (
        ('*****@*****.**',
         '_xsrf=7ed86e897bae4b9e8cf3e660efed7baf; q_c0=MTk2OTAzfGdmWDM5Q2pZNVpaUW9UTzA=|1326267926|eedfe70f85add0db0ecda1e73200cac9b085ecc6; __utma=155987696.1247389772.1322703824.1326190947.1326266591.29; __utmb=155987696.34.10.1326266591; __utmc=155987696; __utmz=155987696.1325768571.27.6.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmv=155987696.Logged%20In'),
    )
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
        'Accept-Language': 'en,en-US;q=0.8,zh-CN;q=0.6,zh;q=0.4',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'www.zhihu.com',
        'Referer': 'http://www.zhihu.com/',
        'User-Agent': 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11',
    }
    headers['cookie'] = cookies[0][1]
    explore_page = fetch('http://www.zhihu.com/explore', headers=headers)

    entry_list = txt_wrap_by_all('<div class="xxn">', '</div', explore_page)
    # The explore page embeds its listing metadata in a JS call; parse it as JSON.
    reting_raw = txt_wrap_by("['explore_list',", ');', explore_page)
    data = loads(reting_raw)
    author_list = [[i[3][1][0].encode('utf-8'), i[3][2].encode('utf-8')] for i in data]
    rating_list = [i[3][3] for i in data]
    label_list = txt_wrap_by_all('"padding:3px 0 0" class="xm">', '</div', explore_page)
    result_label = [txt_wrap_by_all('">', '</a', i) for i in label_list]

    url_list = txt_wrap_by_all('<h2', '</h2>', explore_page)
    id_list = [txt_wrap_by('question/', '/answer', i) for i in url_list]
    title_list = [
        txt_wrap_by('">', '<', txt_wrap_by('href="', '/a>', i))
        for i in url_list
    ]
    url_list = ['http://www.zhihu.com/question/%s' % id for id in id_list]

    entry_list = zip(title_list, rating_list, result_label, author_list, url_list, entry_list)
    for entry in entry_list:
        content, pic_list = htm2txt(entry[5])
        Spider.insert(entry[0], entry[2], content, entry[3][0], entry[1], entry[4], [], pic_list)
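# fetch() is not defined in these snippets. A minimal sketch of what such a
# helper might look like is below, assuming a plain urllib2 GET that forwards the
# caller-supplied headers (cookie included); the actual project may wrap a
# different HTTP client, so treat this only as an illustration.
import urllib2

def fetch_sketch(url, headers=None):
    # Issue a GET request with the given headers and return the response body.
    req = urllib2.Request(url, headers=headers or {})
    return urllib2.urlopen(req).read()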
def parse_content(txt):
    #id = txt_wrap_by('<a href="/question/', '/log" class="xrv">', txt)
    #t = unescape(txt_wrap_by('<title>', ' - 知乎</title>', txt))
    tlist = txt_wrap_by_all('<div class="xmrw">', '</div>', txt)
    r = [htm2txt(i) for i in tlist if i.strip()]
    #for pos, i in enumerate(r[:3]):
    #    print pos, len(i), i
    #    print "\n"
    return r
def parse_rat(self, page, url, title, author, tags, po_url, content):
    rating = 0
    try:
        dic = loads(page)
        rating = dic['fav_count']
    except (ValueError, KeyError):
        # Keep rating at 0 when the page is not valid JSON or lacks fav_count.
        pass
    content, pic_list = htm2txt(content)
    content = str(content)
    pic_list = ['http://dongxi.net' + i for i in pic_list]
    out = dumps([title, tags, content, author, rating, po_url, None])
    #Spider.insert(title, tags, content, author, rating, url, None, pic_list)
    print out
    #print >>out_f, out
    raw_input()
def main():
    author_dict = {}
    with open("ucdchina_st.data") as f:
        for line in f:
            data = loads(line)
            author = htm2txt(data[2].replace(" ", ''))[0]
            blog = data[3]
            title = data[0]
            if author in author_dict:
                author_dict[author][0] += 1
                author_dict[author][2] += " %s" % title
            else:
                author_t = [None] * 3
                author_dict[author] = author_t
                author_t[0] = 1
                author_t[1] = blog
                author_t[2] = title
    author_dict = sorted(author_dict.iteritems(), key=lambda x: x[1][0], reverse=True)
    for k, v in author_dict:
        print v[0], k, v[1], v[2]
def parse_page(self, filepath):
    with open(filepath) as f:
        page = f.read()
    title = txt_wrap_by('<title>', '- UCD大社区', page)
    author = txt_wrap_by('style=" float:left; color:#999;">', '</span', page)
    author = txt_wrap_by('作者:', '|', author)
    content_wrapper = txt_wrap_by('<div id="pageContentWrap" style="font-size:13px; ">', '</div', page)
    url = txt_wrap_by('阅读和发布评论:<a href="', '"', page)
    blog_url = txt_wrap_by('>推荐您进入文章源地址阅读和发布评论:<a href="', '"', page)
    if content_wrapper:
        content, pic_list = htm2txt(content_wrapper.decode('utf-8', 'ignore'))
    else:
        return
    content = str(content)
    tags = TAGGER.get_tag(content + title)
    #out = dumps([title, url, tags])
    #print out
    out = dumps([title, content, author, tags])
    #out = dumps([title, content, author, blog_url])
    print out
    # Fragment: body of a loop over imported feed entries; the enclosing
    # "for line in ..." and the branch this first continue belongs to are not
    # part of this snippet.
        continue
    if u'author' in line:
        author = line['author']
    else:
        continue
    if u'content' in line:
        content = line['content']
    elif u'summary' in line:
        content = line['summary']
    else:
        continue
    link = line['alternate'][0]['href']
    content = content['content']
    content = str(htm2txt(content))
    # Posts often embed their original URL after a "源地址:" marker; prefer it.
    source = content.find("源地址:")
    if source >= 0:
        slink = content[source:].split("\n", 1)[0].strip()
        slink = slink[slink.find("http"):]
        content = content[:source]
        link = slink
    if len(content) < 2000:
        continue
    user = PoMetaUser.get_or_create(name=author, cid=ZSITE_UCD_CHINA_ID)
    if not user.id:
        user.url = 0
        user.save()
        user.url = user.id
    # Fragment: tail of a helper that rewrites image lines, skipping URLs that
    # match a tuple of blacklisted path patterns (the start of that tuple and the
    # function signature are not part of this snippet).
        '/wp-content/plugins/',
    ):
        if i in url:
            return ''
    if netloc == UPYUN_DOMAIN:
        return line
    result = upyun_fetch_pic(url)
    if result:
        # Rewrite the line to point at the mirrored picture.
        result = '图:%s\n' % result
    else:
        result = line
    return result


if __name__ == '__main__':
    a = '''
图:[[http:///sdfsdf]]
<a href="http://tp2.sinaimg.cn/1483383365/50/5610781374/0"><img src='http://tp2.sinaimg.cn/1483383365/50/5610781374/0'/></a>
如果 **某一天** , 你身上多了一个“恢复出厂设置”按钮,一按身体和记忆一切归为出生时。 你会去按它吗?
'''
    from zkit.htm2txt import htm2txt
    print txt_img_fetch(htm2txt(a))