Example #1
def page_parse(htm_file):
    # Parse a saved Zhihu question page and append the question plus all of
    # its answers to the module-level out_file as one JSON line.
    # txt_wrap_by / txt_wrap_by_all / htm2txt and loads / dumps come from the
    # surrounding module.
    html = open(htm_file).read()
    title = txt_wrap_by('<title>', '- 知乎', html)
    tags = txt_wrap_by_all('xgm" href="javascript:;">', '</', html)
    reply_raw_list = txt_wrap_by_all('<div class="xmo">',
                                     'class="xnq xml xnh">', html)
    replies = [htm2txt(x)[0] for x in reply_raw_list]

    # The answer metadata lives in an inline JS call; rebuild it as JSON.
    js = '["current_question",' + txt_wrap_by("(['current_question', ", ');',
                                              html)
    a = loads(js)

    answer_list = []

    question_info = {}
    question_info['answer'] = answer_list
    question_info['tags'] = [x[0] for x in a[1][3]]
    question_info['title'] = title
    question_info['body'] = htm2txt(
        txt_wrap_by('<div class="xvrw">', '<a href="javascript', html))[0]
    # Pair each answer's metadata record with its extracted text.
    replies_line = zip(a[1][12], replies)

    for x in replies_line:
        try:
            new_ans = {}
            new_ans['name'] = x[0][2][0]
            new_ans['answer'] = x[1]
            new_ans['id'] = x[0][2][1]
            new_ans['signature'] = x[0][3]
            new_ans['votes'] = x[0][4]
            answer_list.append(new_ans)
        except (IndexError, KeyError, TypeError):
            # Skip records whose metadata doesn't match the expected shape.
            continue
    out_file.write(dumps(question_info) + '\n')
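
All of the examples on this page lean on two small helpers, txt_wrap_by and txt_wrap_by_all, whose implementation is not shown. Judging from how they are called, they slice out the text between a start marker and an end marker. The sketch below captures that assumed behaviour; it is not the project's actual code, and is included only to make the examples easier to follow.

# Minimal sketch of the assumed behaviour of the wrapper helpers (not the
# project's implementation).

def txt_wrap_by(begin, end, html):
    # Return the text between the first `begin` and the following `end`,
    # or '' if either marker is missing.
    start = html.find(begin)
    if start < 0:
        return ''
    start += len(begin)
    stop = html.find(end, start)
    if stop < 0:
        return ''
    return html[start:stop]

def txt_wrap_by_all(begin, end, html):
    # Return every non-overlapping chunk wrapped by `begin` .. `end`.
    result = []
    pos = 0
    while True:
        start = html.find(begin, pos)
        if start < 0:
            break
        start += len(begin)
        stop = html.find(end, start)
        if stop < 0:
            break
        result.append(html[start:stop])
        pos = stop + len(end)
    return result
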
Example #4
    def parse_page(self, filepath):
        # Parse a saved yeeyan (译言网) article page: pull out the title, tags,
        # author, rating, body and comments, then hand everything to Spider.insert.
        with open(filepath) as f:
            page = f.read()

            title = txt_wrap_by('<title>译言网 | ', '</ti', page)
            tags_wrapper = txt_wrap_by('wumiiTags = "', '"', page)
            tags = tags_wrapper.split(',')
            author = txt_wrap_by('<h2 id="user_info"', '/a', page)
            author = txt_wrap_by('">', '<', author)
            rating = txt_wrap_by('已有<span class="number">', '</span', page)
            content_wrapper = txt_wrap_by('id="conBox">', '<div class="article_content">', page)
            url = txt_wrap_by('wumiiPermaLink = "', '"', page)
            if content_wrapper:
                # htm2txt returns the plain text plus the list of image URLs found.
                content, pic_list = htm2txt(content_wrapper)
            else:
                return

            content = str(content)

            reply_wrapper_list = txt_wrap_by_all('class="comment_content">', '</ul', page)
            reply_list = []
            for reply_wrapper in reply_wrapper_list:
                reply_list.append(txt_wrap_by('<p>', '</p', reply_wrapper))

            Spider.insert(title, tags, content, author, rating, url, reply_list, pic_list)
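
Note the two calling styles for htm2txt: Example #1 takes htm2txt(x)[0], while the snippet above unpacks content, pic_list = htm2txt(content_wrapper). Both suggest it returns a (plain text, picture URL list) pair. A tiny usage sketch under that assumption follows; the sample HTML is made up.

from zkit.htm2txt import htm2txt

sample = '<p>hello <img src="http://example.com/a.png"/> world</p>'  # made-up sample
text, pic_list = htm2txt(sample)   # assumed: (plain text, list of image URLs)
print(text)
print(pic_list)
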
Example #5
def zhihu_question_parser(html, url):
    # Parse a fetched Zhihu question page: extract the title, tags, answer
    # count and answer texts, and append them to the global RESULT list.
    name = txt_wrap_by('<title>', ' - 知乎</title>', html)
    name = unescape(name)
    # The answer count ("个答案") sits in a different wrapper depending on
    # whether the "invite others to answer" block is present.
    if '<h3>邀请别人回答问题</h3>' in html:
        answer_count = txt_wrap_by('<span id="xhrw">', ' 个答案</span>', html)
    else:
        answer_count = txt_wrap_by('<h3 style="margin: 0 0 5px;">', ' 个答案</',
                                   html)

    tag = map(
        unescape,
        txt_wrap_by_all('<a class="xjl" href="javascript:;">', '</a>', html))
    #print tag[0]
    answer_count = int(answer_count or 0)

    if answer_count:
        txt = filter(bool, txt_wrap_by_all('<div class="xmrw">', '</div>',
                                           html))
        if not txt:
            # The page claims to have answers but none could be extracted.
            print url
            print name
            #raw_input()
        else:
            print txt[0]
    else:
        if "个答案" in html and ("0 个答案" not in html) and "还没有答案" not in html:
            # Sanity check: the page mentions answers even though none were counted.
            print url
            print html
            #raw_input()
        txt = []

    RESULT.append((answer_count, url, name, tag, [htm2txt(i) for i in txt]))

    # Progress report from the module-level how_long timer.
    print how_long.again(), how_long.remain, how_long.done
Example #7
def save_event(self, phone, address, begin_time, end_time, title, intro,
               douban_event_id, typ):
    # Convert a Douban event into a local event: map its category and
    # location, create the event and its post, then queue it for review.

    begin_time = time_by_string(begin_time)
    end_time = time_by_string(end_time)

    # Skip events that have already started.
    if begin_time < datetime.now():
        return None

    if typ in EVENT_DICT:
        event_cid = EVENT_DICT[typ]
    else:
        event_cid = EVENT_DICT[u'其他']  # fall back to the "other" category

    # address is (city, place[, street]); keep the most specific part as the address.
    city = address[0]
    place = address[1]
    if len(address) == 2:
        address = address[1]
    else:
        address = address[2]

    city_pid = location_finder(city)
    pid = location_finder(place)

    # If the place is not a known sub-location of the city, fall back to the city.
    if pid not in PLACE_L1L2[city_pid]:
        pid = city_pid

    begin = datetime_to_minutes(begin_time)
    end = datetime_to_minutes(end_time)

    id = 0
    limit_up = 42
    limit_down = 0
    transport = ''
    price = 0

    event = event_new(self.user_id, event_cid, city_pid, pid, address,
                      transport, begin, end, 0, limit_up, limit_down, phone, 0,
                      id)

    id = event.id

    po = po_new(CID_EVENT,
                self.user_id,
                '',
                STATE_SECRET,
                id=id,
                zsite_id=self.zsite_id)
    if po:
        po.name_ = title
        # htm2txt returns (text, pic_list); store only the text.
        po.txt_set(htm2txt(intro)[0])
        po.save()

        event_init2to_review(id)
        import_douban_event = ImportDoubanEvent.get_or_create(
            id=int(douban_event_id))
        import_douban_event.event_id = id
        import_douban_event.save()

        return event
Example #8
def link_title_uid_txt(i):
    # Normalise one feed entry dict into (link, title, rss_uid, txt),
    # or return None if it has no usable body.
    if 'alternate' in i:
        link = i['alternate'][0]['href']
    else:
        link = ''
    if 'title' in i:
        title = i['title']
        title = unescape(title)
    else:
        title = '无题'  # "untitled"
    rss_uid = i.get('id') or 1
    snippet = i.get('summary') or i.get('content') or None

    if not snippet:
        return

    htm = snippet['content']
    if not htm:
        return

    # Tidy the HTML fragment before converting it to plain text.
    htm = txttidy(htm)
    htm = txt_map('<pre', '</pre>', htm, pre_br)
    htm = tidy_fragment(htm, {'indent': 0})[0]
    htm = htm.replace('<br />', '\n')
    txt = htm2txt(htm)

    if not txt:
        return

    return link, title, rss_uid, txt
Example #9
def feed_import_by_douban_feed():
    from model.douban import douban_feed_to_review_iter, DoubanUser
    for i in douban_feed_to_review_iter():
        #print i.id
        # Replace Douban-specific jargon ("豆友", "豆油", "豆邮") with generic
        # wording before converting the HTML to text.
        txt = i.htm.replace('豆友', '网友').replace('豆油', '私信').replace('豆邮', '私信')
        #print i.id, i.title
        txt = htm2txt(txt)
        feed_import_new(ZSITE_DOUBAN_ID, i.id, i.title, txt, i.link,
                        i.like + i.rec)
Example #10
def main():
    # Fetch the Zhihu "explore" page with a logged-in session cookie and feed
    # every entry (title, labels, body, author, rating, url) into Spider.insert.
    cookies = ((
        '*****@*****.**',
        '_xsrf=7ed86e897bae4b9e8cf3e660efed7baf; q_c0=MTk2OTAzfGdmWDM5Q2pZNVpaUW9UTzA=|1326267926|eedfe70f85add0db0ecda1e73200cac9b085ecc6; __utma=155987696.1247389772.1322703824.1326190947.1326266591.29; __utmb=155987696.34.10.1326266591; __utmc=155987696; __utmz=155987696.1325768571.27.6.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmv=155987696.Logged%20In'
    ), )

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
        'Accept-Language': 'en,en-US;q=0.8,zh-CN;q=0.6,zh;q=0.4',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'www.zhihu.com',
        'Referer': 'http://www.zhihu.com/',
        'User-Agent': 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11',
    }
    headers['cookie'] = cookies[0][1]
    explore_page = fetch('http://www.zhihu.com/explore', headers=headers)

    # Answer bodies plus the inline JS array that carries author / rating data.
    entry_list = txt_wrap_by_all('<div class="xxn">', '</div', explore_page)
    rating_raw = txt_wrap_by("['explore_list',", ');', explore_page)
    data = loads(rating_raw)
    author_list = [[i[3][1][0].encode('utf-8'), i[3][2].encode('utf-8')]
                   for i in data]
    rating_list = [i[3][3] for i in data]

    label_list = txt_wrap_by_all('"padding:3px 0 0" class="xm">', '</div',
                                 explore_page)
    result_label = [txt_wrap_by_all('">', '</a', i) for i in label_list]

    # Question titles and canonical urls come from the <h2> headers.
    url_list = txt_wrap_by_all('<h2', '</h2>', explore_page)
    title_list = [
        txt_wrap_by('">', '<', txt_wrap_by('href="', '/a>', i))
        for i in url_list
    ]
    id_list = [txt_wrap_by('question/', '/answer', i) for i in url_list]
    url_list = ['http://www.zhihu.com/question/%s' % id for id in id_list]

    entry_list = zip(title_list, rating_list, result_label, author_list,
                     url_list, entry_list)

    for entry in entry_list:
        # entry = (title, rating, labels, author, url, raw html body)
        content, pic_list = htm2txt(entry[5])
        Spider.insert(entry[0], entry[2], content, entry[3][0], entry[1],
                      entry[4], [], pic_list)
Example #12
def parse_content(txt):
    # Pull every answer body (<div class="xmrw">) out of a Zhihu question page
    # and convert each one to plain text.
    #id = txt_wrap_by('<a href="/question/', '/log" class="xrv">', txt)
    #t = unescape(txt_wrap_by('<title>', ' - 知乎</title>', txt))
    tlist = txt_wrap_by_all('<div class="xmrw">', '</div>', txt)

    r = [htm2txt(i) for i in tlist if i.strip()]

    #for pos, i in enumerate(r[:3]):
    #    print pos, len(i), i
    #    print "\n"
    return r
Example #14
    def parse_rat(self, page, url, title, author, tags, po_url, content):
        # Parse the JSON "rating" response for a dongxi.net post, convert the
        # article body to text and dump the combined record.
        rating = 0
        try:
            dic = loads(page)
            rating = dic['fav_count']
        except (ValueError, KeyError):
            # Not valid JSON, or no favourite count: keep rating = 0.
            pass

        content, pic_list = htm2txt(content)
        content = str(content)
        # Image paths are site-relative; turn them into absolute URLs.
        pic_list = ['http://dongxi.net' + i for i in pic_list]

        out = dumps([title, tags, content, author, rating, po_url, None])
        #Spider.insert(title, tags, content, author, rating, url, None, pic_list)
        print out
        #print >>out_f, out
        raw_input()  # pause between records while debugging
Example #17
    def parse_page(self, filepath):
        # Parse a saved UCD大社区 article page, auto-tag the content and dump
        # the record as JSON.
        with open(filepath) as f:
            page = f.read()

            title = txt_wrap_by('<title>', '- UCD大社区', page)
            author = txt_wrap_by('style=" float:left; color:#999;">', '</span', page)
            author = txt_wrap_by('作者:', '|', author)
            content_wrapper = txt_wrap_by('<div id="pageContentWrap" style="font-size:13px; ">', '</div', page)
            url = txt_wrap_by('阅读和发布评论:<a href="', '"', page)
            blog_url = txt_wrap_by('>推荐您进入文章源地址阅读和发布评论:<a href="', '"', page)

            if content_wrapper:
                content, pic_list = htm2txt(content_wrapper.decode('utf-8', 'ignore'))
            else:
                return

            content = str(content)
            tags = TAGGER.get_tag(content + title)
            #out = dumps([title, url, tags])
            out = dumps([title, content, author, tags])
            #out = dumps([title, content, author, blog_url])
            print out
Example #18
def main():
    # Count how many posts each author has in ucdchina_st.data and print the
    # authors sorted by post count.
    author_dict = {}
    with open("ucdchina_st.data") as f:
        for line in f:
            data = loads(line)
            author = htm2txt(data[2].replace("&nbsp;", ''))[0]
            blog = data[3]
            title = data[0]

            if author in author_dict:
                author_dict[author][0] += 1
                author_dict[author][2] += " %s" % title
            else:
                # [post count, blog url, concatenated titles]
                author_t = [None] * 3
                author_dict[author] = author_t
                author_t[0] = 1
                author_t[1] = blog
                author_t[2] = title

    author_dict = sorted(author_dict.iteritems(),
                         key=lambda x: x[1][0],
                         reverse=True)
    for k, v in author_dict:
        print v[0], k, v[1], v[2]
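
For comparison, the same report can be written more directly with collections.defaultdict. The sketch below is a hypothetical Python 3 rewrite, not part of the original code base: it assumes ucdchina_st.data keeps the [title, ..., author_html, blog_url] layout the loop above relies on, and it skips the project-specific htm2txt() cleanup of the author field.

# Hypothetical Python 3 sketch of the same author-count report (assumptions noted above).
import json
from collections import defaultdict

def count_authors(path="ucdchina_st.data"):
    posts = defaultdict(list)          # author -> list of titles
    blogs = {}                         # author -> blog url
    with open(path, encoding="utf-8") as f:
        for line in f:
            data = json.loads(line)
            author = data[2].replace("&nbsp;", "")
            posts[author].append(data[0])
            blogs[author] = data[3]
    # Print authors ordered by how many posts they have.
    for author, titles in sorted(posts.items(),
                                 key=lambda kv: len(kv[1]), reverse=True):
        print(len(titles), author, blogs[author], " ".join(titles))

if __name__ == "__main__":
    count_authors()
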
Example #19
        # Fragment: body of a loop over reader feed entries; each `line` is an
        # entry dict with 'author', 'content'/'summary' and 'alternate' keys,
        # as in Example #8.
            continue
        if u'author' in line:
            author = line['author']
        else:
            continue

        if u'content' in line:
            content = line['content']
        elif u'summary' in line:
            content = line['summary']
        else:
            continue
        link = line['alternate'][0]['href']
        content = content['content']

        content = str(htm2txt(content))
        # If the text carries a "源地址:" ("source URL:") line, use that URL as
        # the link and strip it from the body.
        source = content.find("源地址:")
        if source >= 0:
            slink = content[source:].split("\n", 1)[0].strip()
            slink = slink[slink.find("http"):]
            content = content[:source]
            link = slink

        # Skip short posts.
        if len(content) < 2000:
            continue

        user = PoMetaUser.get_or_create(name=author, cid=ZSITE_UCD_CHINA_ID)
        if not user.id:
            user.url = 0
            user.save()
            user.url = user.id
Example #20
        # Fragment: tail of txt_img_fetch. The truncated tuple above lists URL
        # path fragments whose images should be dropped.
        '/wp-content/plugins/',
    ):
        if i in url:
            return ''

    # Already hosted on our UPYUN domain: keep the line unchanged.
    if netloc == UPYUN_DOMAIN:
        return line

    # Otherwise mirror the picture to UPYUN and rewrite the line as "图:<url>".
    result = upyun_fetch_pic(url)
    if result:
        result = '图:%s\n' % result
    else:
        result = line

    return result

if __name__ == '__main__':
    a = '''
    图:[[http:///sdfsdf]]
    <a href="http://tp2.sinaimg.cn/1483383365/50/5610781374/0"><img src='http://tp2.sinaimg.cn/1483383365/50/5610781374/0'/></a>
    如果
    **某一天**
    ,
    你身上多了一个“恢复出厂设置”按钮,一按身体和记忆一切归为出生时。 你会去按它吗?
    '''
    from zkit.htm2txt import htm2txt
    print txt_img_fetch(htm2txt(a))

