# Example 1
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import bs4

from dbHelper import getConn
from framework.htmlParser import getSoupByStr

# --- Flat script: strip the '.title' node out of stored topic HTML. ---
# Fetches rows from t_topic_new in a hard-coded id range, parses each
# row's HTML body, and detaches its single '.title' element.
# NOTE(review): this chunk looks truncated — the modified soup is never
# written back to the database within the visible lines.
csor, conn = getConn()

# Earlier id ranges from previous runs (kept for reference):
# sql = "select * from t_topic_new where id = 2645 or id = 2678"
# sql = "select * from t_topic_new where id > 2994 and id < 3094"
# sql = "select * from t_topic_new where id > 3411 and  id < 3730"  # junior high
sql = "select * from t_topic_new where id > 1647 and  id < 1757"  # university

print sql
csor.execute(sql)

results = csor.fetchall()
for row in results:
    content = row[4]  # presumably the HTML body column — verify against schema
    # content = row[4].replace('mi', 'mo')
    id = row[0]  # row primary key (shadows the `id` builtin)

    soup = getSoupByStr(content)

    # Expect exactly one '.title' element; detach it from the tree.
    t = soup.select('.title')
    if t and len(t) == 1:
        t[0].extract()
    else:
        # ("exteact" typo is in the original runtime message — left as-is.)
        print 'exteact Title fail, title:', unicode(t).encode('utf-8')
# Example 2
def juren():
    csor, conn = getConn()

    #小升初笑话
    for i in range(1, 25):
        #笑话
        # url = 'http://aoshu.juren.com/chzt/shuxueshouchaobao/xiaoxiaohua/index_' + str(i) + '.html'
        #故事
        url = 'http://aoshu.juren.com/tiku/mryt/yimryt/index_' + str(
            i) + '.html'
        #名人
        # url =
        if i == 1:
            # url = 'http://aoshu.juren.com/chzt/shuxueshouchaobao/xiaoxiaohua/'
            # url = 'http://aoshu.juren.com/chzt/shuxueshouchaobao/xiaogushi/'
            # url = 'http://aoshu.juren.com/chzt/shuxueshouchaobao/neirongsucai/'
            url = 'http://aoshu.juren.com/tiku/mryt/yimryt/'
        content = getContent(url)
        if not content:
            print 'get content failed, url: ', url
            continue
        soup = getSoupByStr(content)
        if not soup:
            print 'get soup filed, url:', url
            continue
        for listting in soup.select(".listing1"):
            for a in listting.select('a'):
                text = a.get_text()
                titles = text.split(u':')
                if len(titles) < 2:
                    titles = text.split(u':')
                if len(titles) < 2:
                    title = text
                else:
                    title = titles[1]
                deatilUrl = a['href']
                contentHtml = getContent(deatilUrl)
                if not contentHtml:
                    print 'get detail failed'
                    continue
                contentSoup = getSoupByStr(contentHtml).select('.mainContent')
                content = ''
                ps = contentSoup[0].select('p')
                length = len(ps)
                for j in range(1, length):
                    pJ = ps[j]
                    pText = pJ.get_text()
                    if u'本期精彩专题推荐' in pText or u'本期' in pText or u'精彩推荐' in pText\
                            or u'点击下一页查看答案' in pText or u'下一页查看答案' in pText or u'查看答案' in pText\
                            or len(pJ.select('a')) > 0:
                        print 'not content,break,  text:' + pText
                        break
                    content += unicode(pJ)
                contentHtml2 = getContent(deatilUrl.replace(
                    '.html', '_2.html'))
                if not contentHtml2:
                    print 'get detail failed'
                    continue
                # contentSoup2 = getSoupByStr(contentHtml2.replace('<br /></p>','')).select('.mainContent')
                contentSoup2 = getSoupByStr(contentHtml2).select(
                    '.mainContent')
                ps = contentSoup2[0].select('p')
                length = len(ps)
                for j in range(0, length):
                    pJ = ps[j]
                    pText = pJ.get_text()
                    if u'本期精彩专题推荐' in pText or u'本期' in pText or u'精彩推荐' in pText or len(
                            pJ.select('a')) > 0:
                        print 'not content,break,  text:' + pText
                        break
                    content += unicode(pJ)

                sql = "INSERT ignore INTO daily(name, \
                                        type, content,stage, gred) \
                                        VALUES ('%s', '%d', '%s', '%s', '%d')"                                                                               % \
                      (title, 3, content, '3', 1)
                try:
                    # 执行sql语句
                    print sql
                    csor.execute(sql)
                    # 提交到数据库执行
                    print conn.commit()
                except:
                    # 发生错误时回滚
                    conn.rollback()
    conn.close()