Ejemplo n.º 1
0
def insertArticleSampleText(articleSampleText):
    """Insert one article summary row into autohome.autohome_sample_text.

    articleSampleText: 5-tuple (article_url, article_cover_url,
    article_title, article_viewer_num, article_short_text); the two
    timestamp columns are filled in by MySQL.
    """
    try:
        conToMysql()

        cur.execute('CREATE DATABASE IF NOT EXISTS autohome')
        conn.select_db('autohome')
        # article_url is the primary key, so re-inserting the same article
        # raises a duplicate-key MySQLdb.Error (logged below).
        cur.execute(
            'CREATE TABLE IF NOT EXISTS autohome_sample_text('
            'article_url VARCHAR(100) PRIMARY KEY COMMENT"文章的url",'
            'article_cover_url VARCHAR(200) COMMENT"文章封面url",'
            'article_title VARCHAR(100) COMMENT"文章标题",'
            'article_viewer_num VARCHAR(20) COMMENT"查看文章人数",'
            'article_short_text VARCHAR(500) COMMENT"文章简略信息",'
            'insert_time DATETIME COMMENT"插入时间",'
            'change_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT"更新时间"'
            ')COMMENT"文章简述"')

        cur.execute(
            'INSERT INTO autohome_sample_text VALUES('
            '%s,%s,%s,%s,%s,CURRENT_TIMESTAMP,CURRENT_TIMESTAMP)',
            articleSampleText)

        closeMysql()
    except MySQLdb.Error as e:
        writeLog.writeErrorLog(
            'fail optMysqldb insertArticleSampleText Mysql Error %d: %s' %
            (e.args[0], e.args[1]), 'autohome')  # master log
        writeLog.writeErrorLog(
            'fail optMysqldb insertArticleSampleText Mysql Error %d: %s' %
            (e.args[0], e.args[1]), 'autohome_error')  # error log
        # Bug fix: the connection used to leak whenever the insert failed;
        # close it (without committing the failed work) before returning.
        try:
            conn.close()
        except Exception:
            pass
Ejemplo n.º 2
0
def insertPageUrl(pageUrl):
    """Insert one crawled list-page URL into autohome.autohome_page_url.

    pageUrl: the absolute URL string; timestamps are filled in by MySQL.
    """
    try:
        conToMysql()

        #--- create the autohome database/table if missing ---#
        cur.execute('CREATE DATABASE IF NOT EXISTS autohome')
        conn.select_db('autohome')
        cur.execute(
            'CREATE TABLE IF NOT EXISTS autohome_page_url('
            'page_url VARCHAR(100) PRIMARY KEY COMMENT"汽车之家页面url",'
            'insert_time DATETIME COMMENT"插入时间",'
            'change_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT"更新时间"'
            ')COMMENT"页面url"')
        # Bug fix: parameters must be passed as a sequence; a bare string
        # only worked by accident of old MySQLdb versions and breaks if the
        # URL itself contains a '%'.
        cur.execute(
            'INSERT INTO autohome_page_url VALUES(%s,CURRENT_TIMESTAMP,CURRENT_TIMESTAMP)',
            (pageUrl,))

        closeMysql()
    except MySQLdb.Error as e:
        writeLog.writeErrorLog(
            'fail optMysqldb insertPageUrl Mysql Error %d: %s' %
            (e.args[0], e.args[1]), 'autohome')  # master log
        writeLog.writeErrorLog(
            'fail optMysqldb insertPageUrl Mysql Error %d: %s' %
            (e.args[0], e.args[1]), 'autohome_error')  # error log
        # Bug fix: close the leaked connection on the error path (no commit).
        try:
            conn.close()
        except Exception:
            pass
Ejemplo n.º 3
0
def getSoupFromUrl(url):
    """Fetch *url* and return it parsed as a BeautifulSoup (html5lib) tree.

    If the article is paginated (a div.page element exists), the URL is
    rewritten to its '-all' form so the whole article comes back in one
    request.  Returns None after logging when the fetch/parse fails.
    """
    headers = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Host': 'www.autohome.com.cn',
        'Pragma': 'no-cache',
        'User-Agent': changeUA.getUA()
    }
    try:
        # Bug fix: this dict holds HTTP request headers, not query-string
        # parameters.  It was previously passed as params=, which appended
        # them to the URL and never sent the rotating User-Agent.
        m_str_content = requests.get(url, headers=headers, timeout=30).content
        m_soup = BeautifulSoup(m_str_content, "html5lib")
        # Detect pagination and refetch the single-page ('-all') variant.
        if (m_soup.find(name='div', attrs={'class': 'page'})):
            if (re.search('-', url)):  # '-' present: already a numbered page
                m_str_url = re.sub(r'-.*\.', '-all.', url)
            else:  # no '-': plain first page
                m_str_url = re.sub(r'\.html', '-all.html', url)
            m_str_content = requests.get(m_str_url, headers=headers,
                                         timeout=30).content
            m_soup = BeautifulSoup(m_str_content, "html5lib")
        return m_soup
    except Exception as e:
        writeLog.writeErrorLog(
            'fail detailText getSoupFromUrl %s : %s' % (Exception, e),
            'autohome')
        writeLog.writeErrorLog(
            'fail detailText getSoupFromUrl %s : %s' % (Exception, e),
            'autohome_error')
Ejemplo n.º 4
0
def dropDatabase(databaseName):
    """Drop the MySQL database *databaseName* if it exists.

    Bug fix: the parameter was previously ignored and 'autohome' was
    always dropped regardless of what the caller asked for.
    """
    try:
        conToMysql()
        # Identifiers cannot be bound with %s placeholders, so validate the
        # name before interpolating it to keep injection out of DROP DATABASE.
        if not databaseName.replace('_', '').isalnum():
            raise MySQLdb.Error(0, 'invalid database name: %s' % databaseName)
        cur.execute('DROP DATABASE IF EXISTS %s' % databaseName)
        closeMysql()
    except MySQLdb.Error as e:
        writeLog.writeErrorLog(
            'fail optMysqldb dropDatabase Mysql Error %d: %s' %
            (e.args[0], e.args[1]), 'autohome')  # master log
        writeLog.writeErrorLog(
            'fail optMysqldb dropDatabase Mysql Error %d: %s' %
            (e.args[0], e.args[1]), 'autohome_error')  # error log
Ejemplo n.º 5
0
def closeMysql():
    """Commit pending work on the module-wide connection, then release it."""
    try:
        conn.commit()  # flush the transaction first
        cur.close()    # then drop the cursor
        conn.close()   # and finally the connection itself
    except MySQLdb.Error as e:
        msg = 'fail optMysqldb closeMysql Mysql Error %d: %s' % (e.args[0],
                                                                 e.args[1])
        writeLog.writeErrorLog(msg, 'autohome')        # master log
        writeLog.writeErrorLog(msg, 'autohome_error')  # error log
Ejemplo n.º 6
0
def getArticleUrl(database, table, field):
    """Return the first column of every row of *field* from *table* as a list.

    Thin wrapper over optMysqldb.getValues; logs and returns None on error.
    """
    try:
        m_tup_rows = optMysqldb.getValues(database, table, field)
        # Each row is a 1-tuple; unwrap to a flat list of values.
        return [m_row[0] for m_row in m_tup_rows]
    except Exception as e:
        msg = 'fail detailText getArticleUrl %s : %s' % (Exception, e)
        writeLog.writeErrorLog(msg, 'autohome')
        writeLog.writeErrorLog(msg, 'autohome_error')
Ejemplo n.º 7
0
def getValues(database, table, field):
    """Return every value of *field* from *database*.*table* as a tuple of rows.

    NOTE: table/field are interpolated into the SQL text (identifiers cannot
    be bound as parameters), so callers must only pass trusted names.
    """
    try:
        conToMysql()

        conn.select_db(database)
        # Build the SELECT statement (identifiers, hence %-interpolation).
        m_str_selectSql = 'SELECT %s FROM %s' % (field, table)
        cur.execute(m_str_selectSql)
        # Bug fix: the old code fed execute()'s affected-row count into
        # fetchmany(), which only worked by coincidence; fetchall() is the
        # documented way to retrieve the entire result set.
        m_tup_allRows = cur.fetchall()

        closeMysql()
        return m_tup_allRows
    except MySQLdb.Error as e:
        writeLog.writeErrorLog('fail optMysqldb getValues Mysql Error %d: %s' %
                               (e.args[0], e.args[1]), 'autohome')  # master log
        writeLog.writeErrorLog('fail optMysqldb getValues Mysql Error %d: %s' %
                               (e.args[0], e.args[1]),
                               'autohome_error')  # error log
Ejemplo n.º 8
0
def conToMysql():
    """Open the module-wide MySQL connection and cursor.

    Sets the globals ``conn`` and ``cur`` used by every other helper in
    this module; on failure the error is logged and the globals are left
    untouched.
    """
    global conn, cur
    try:
        # Connect first (no default schema selected here; callers do
        # select_db themselves) ...
        conn = MySQLdb.connect(host='localhost',
                               port=3306,
                               user='******',
                               passwd='mysql',
                               charset='utf8',
                               use_unicode=True)
        # ... then grab the shared cursor.
        cur = conn.cursor()
    except MySQLdb.Error as e:
        msg = 'fail optMysqldb conToMysql Mysql Error %d: %s' % (e.args[0],
                                                                 e.args[1])
        writeLog.writeErrorLog(msg, 'autohome')        # master log
        writeLog.writeErrorLog(msg, 'autohome_error')  # error log
Ejemplo n.º 9
0
def insertCommentDetail(commentDetail):
    """Insert one scraped reply into autohome.autohome_comment_detail.

    commentDetail: 16-tuple in the column order of the table below (url,
    title, sk, profile, avatar, name, device, time/floor, content, likes,
    plus the six quoted-original fields); timestamps come from MySQL.
    """
    try:
        conToMysql()

        cur.execute('CREATE DATABASE IF NOT EXISTS autohome')
        conn.select_db('autohome')
        # reply_id is the primary key: re-inserting the same reply raises a
        # duplicate-key MySQLdb.Error (logged below).
        cur.execute(
            'CREATE TABLE IF NOT EXISTS autohome_comment_detail('
            'article_url VARCHAR(100) COMMENT"评论文章的url",'
            'article_title VARCHAR(55) COMMENT"评论文章的标题",'
            'reply_id VARCHAR(20) PRIMARY KEY COMMENT"该评论在汽车之家的sk",'
            'repler_homePage VARCHAR(45) COMMENT"评论者的个人主页",'
            'repler_logo VARCHAR(150) COMMENT"评论者的头像",'
            'repler_name VARCHAR(35) COMMENT"评论者的名字",'
            'repler_device VARCHAR(30) COMMENT"评论者所用的设备",'
            'reply_timeAndFloor VARCHAR(55) COMMENT"评论时间和楼层",'
            'reply_content TEXT COMMENT"评论发布内容",'
            'reply_like VARCHAR(20) COMMENT"该评论获得的赞",'
            'preRepler_logo VARCHAR(150) COMMENT"原评论者的头像",'
            'preRepler_homePage VARCHAR(45) COMMENT"原评论者的个人主页",'
            'preRepler_name VARCHAR(35) COMMENT"原评论者的名字",'
            'preReply_date VARCHAR(25) COMMENT"原评论的日期",'
            'preReply_floor VARCHAR(15) COMMENT"原评论的楼层",'
            'preReply_content TEXT COMMENT"原评论的内容",'
            'insert_time DATETIME COMMENT"插入时间",'
            'change_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT"更新时间"'
            ')COMMENT"评论页面表"')

        cur.execute(
            'INSERT INTO autohome_comment_detail VALUES('
            '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,CURRENT_TIMESTAMP,CURRENT_TIMESTAMP)',
            commentDetail)

        closeMysql()
    except MySQLdb.Error as e:
        writeLog.writeErrorLog(
            'fail optMysqldb insertCommentDetail Mysql Error %d: %s' %
            (e.args[0], e.args[1]), 'autohome')  # master log
        writeLog.writeErrorLog(
            'fail optMysqldb insertCommentDetail Mysql Error %d: %s' %
            (e.args[0], e.args[1]), 'autohome_error')  # error log
        # Bug fix: close the leaked connection on the error path (no commit).
        try:
            conn.close()
        except Exception:
            pass
Ejemplo n.º 10
0
def insertArticleDetailText(articleDetailText):
    """Insert one full article record into autohome.autohome_detail_text.

    articleDetailText: 11-tuple in the column order of the table below
    (url, classify, car classify, title, pub time/srcsys/type/author,
    body, comment-page url, picture-url string); timestamps come from MySQL.
    """
    try:
        conToMysql()

        cur.execute('CREATE DATABASE IF NOT EXISTS autohome')
        conn.select_db('autohome')
        # article_url is the primary key: re-inserting the same article
        # raises a duplicate-key MySQLdb.Error (logged below).
        cur.execute(
            'CREATE TABLE IF NOT EXISTS autohome_detail_text('
            'article_url VARCHAR(100) PRIMARY KEY COMMENT"文章url",'
            'article_classify VARCHAR(30) COMMENT"文章分类",'
            'article_carClassify VARCHAR(40) COMMENT"文章描述的汽车分类",'
            'article_title VARCHAR(55) COMMENT"文章标题",'
            'article_pubTime VARCHAR(25) COMMENT"发布时间",'
            'article_pubSrcsys VARCHAR(20) COMMENT"发布系统",'
            'article_pubType VARCHAR(15) COMMENT"发布类型",'
            'article_pubAuthor VARCHAR(10) COMMENT"文章作者",'
            'article_detailText MEDIUMTEXT COMMENT"正文",'
            'article_commentPage VARCHAR(100) COMMENT"评论页面url",'
            'article_picUrlList MEDIUMTEXT COMMENT"文章图片列表",'
            'insert_time DATETIME COMMENT"插入时间",'
            'change_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT"更新时间"'
            ')COMMENT"文章详情表"')

        cur.execute(
            'INSERT INTO autohome_detail_text VALUES('
            '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,CURRENT_TIMESTAMP,CURRENT_TIMESTAMP)',
            articleDetailText)

        closeMysql()
    except MySQLdb.Error as e:
        writeLog.writeErrorLog(
            'fail optMysqldb insertArticleDetailText Mysql Error %d: %s' %
            (e.args[0], e.args[1]), 'autohome')  # master log
        writeLog.writeErrorLog(
            'fail optMysqldb insertArticleDetailText Mysql Error %d: %s' %
            (e.args[0], e.args[1]), 'autohome_error')  # error log
        # Bug fix: close the leaked connection on the error path (no commit).
        try:
            conn.close()
        except Exception:
            pass
Ejemplo n.º 11
0
def getCommentPage(firstPageUrl, soup):
    try:
        #存放评论页面
        m_queue_commentPage = Queue.Queue(maxsize=-1)
        m_url_nextPage = firstPageUrl

        while True:
            m_queue_commentPage.put(m_url_nextPage)  #进队
            #获得下一页连接
            m_url_nextPage = soup.find(name='div',
                                       attrs={
                                           'class': 'page page-small'
                                       }).find(name='a',
                                               attrs={
                                                   'class': 'current'
                                               }).next_sibling['href']
            soup = getSoupFromUrl(m_url_nextPage)  #解析url
            #判断是否到达评论最后一页
            if (soup.find(name='div', attrs={
                    'class': 'page page-small'
            }).find(name='a', attrs={
                    'class': 'current'
            }).next_sibling['href'] == 'javascript:void(0);'):
                m_queue_commentPage.put(m_url_nextPage)  #最后一页进队
                break
        print 'succeed to get comment page ' + firstPageUrl
        return m_queue_commentPage

    except Exception, e:
        print 'fail to get comment page ' + firstPageUrl + ' [%s,%s]' % (
            Exception, e)
        writeLog.writeErrorLog('fail commentDetail getCommentPage %s : %s' %
                               (Exception, e), 'autohome')  #写入总日志文件
        writeLog.writeErrorLog('fail commentDetail getCommentPage %s : %s' %
                               (Exception, e), 'autohome_error')  #写入错误日志文件
        return 0
Ejemplo n.º 12
0
def getUrlAndShort():
    """Crawl every autohome list page, storing each page URL and every
    article summary into MySQL.

    The loop is terminated by exception: when the pager runs out (or any
    step fails) the error is logged and the while-loop flag is flipped.
    """
    #--- state ---#
    s_n_isloop = 0  # 0 = keep looping, 1 = stop
    m_str_tempPage = '/all/2/#liststart'  # site-relative start page

    # Start from a clean slate: drop any previous crawl's database.
    optMysqldb.dropDatabase('autohome')
    while (s_n_isloop == 0):
        try:
            # Absolute URL of the current list page.
            m_str_tempPageUrl = "http://www.autohome.com.cn%s" % m_str_tempPage
            # Record it in autohome_page_url.
            optMysqldb.insertPageUrl(m_str_tempPageUrl)
            writeLog.writerNormalLog(
                'succeed get page url %s' % m_str_tempPageUrl, 'autohome')

            # Bug fix: this dict holds HTTP request headers; it was being
            # passed as params= (query string), so the rotating User-Agent
            # was never actually sent.  Pass it as headers= instead.
            headers = {
                'Accept': '*/*',
                'Accept-Encoding': 'gzip, deflate, sdch',
                'Accept-Language': 'zh-CN,zh;q=0.8',
                'Cache-Control': 'no-cache',
                'Connection': 'keep-alive',
                'Host': 'www.autohome.com.cn',
                'Pragma': 'no-cache',
                'User-Agent': changeUA.getUA()
            }
            m_str_tempContent = requests.get(m_str_tempPageUrl,
                                             headers=headers,
                                             timeout=30).content
            m_str_tempSoup = BeautifulSoup(m_str_tempContent, "html5lib")
            # Next page = sibling of the pager's "current" anchor; raises on
            # the last page, which is what ends the crawl.
            m_str_tempPage = m_str_tempSoup.find(
                name='div', attrs={
                    'id': 'channelPage'
                }).find(name='a', attrs='current').next_sibling['href']

            # Article summaries: one <li> per article.
            m_str_articleHtml = m_str_tempSoup.find(
                id="auto-channel-lazyload-article").find_all(name='li')
            for m_tag_article in m_str_articleHtml:
                m_str_part_1 = m_tag_article.a["href"]  # article_url
                m_str_part_2 = m_tag_article.div.img[
                    "src"].strip()  # article_cover_url
                m_str_part_3 = m_tag_article.h3.string  # article_title
                m_str_part_4 = m_tag_article.em.contents[
                    1].strip()  # article_viewer
                m_str_part_5 = m_tag_article.p.string  # short text
                m_str_articleSampleText = (m_str_part_1, m_str_part_2,
                                           m_str_part_3, m_str_part_4,
                                           m_str_part_5)

                # Store the summary in autohome_sample_text.
                optMysqldb.insertArticleSampleText(m_str_articleSampleText)
                writeLog.writerNormalLog(
                    'succeed get article%s,%s' % (m_str_part_1, m_str_part_3),
                    'autohome')
        except Exception as e:
            s_n_isloop = 1
            writeLog.writeErrorLog(
                'fail allUrlAndShort getUrlAndShort %s : %s' % (Exception, e),
                'autohome')  # master log
            writeLog.writeErrorLog(
                'fail allUrlAndShort getUrlAndShort %s : %s' % (Exception, e),
                'autohome_error')  # error log
Ejemplo n.º 13
0
import optMysqldb

# NOTE(review): sys, allUrlAndShort, detailText, commentDetail and writeLog
# are used below but not imported in this chunk — presumably imported above;
# confirm against the full file.

#=== force UTF-8 as the process-wide default encoding (Python 2 only) ===#
reload(sys)
sys.setdefaultencoding("utf-8")

try:
    #=== collect every autohome page url and article summary ===#
    allUrlAndShort.getUrlAndShort()

    #=== fetch the detail of every article ===#
    m_list_articleUrl = detailText.getArticleUrl('autohome', 'autohome_sample_text', 'article_url')
    for str_eachArticleUrl in m_list_articleUrl:
        m_str_soup = detailText.getSoupFromUrl(str_eachArticleUrl)
        m_str_detailText = detailText.getArticleDetail(str_eachArticleUrl, m_str_soup)
        optMysqldb.insertArticleDetailText(m_str_detailText)

    #=== fetch every comment thread ===#
    # Bug fix: this section was dedented out of the enclosing try, leaving
    # the final except without a try suite (a syntax error); it is now
    # indented back inside the try block.
    m_list_articleCommentPage = commentDetail.getArticleUrl('autohome', 'autohome_detail_text', 'article_commentPage')
    for m_str_articleCommentPage in m_list_articleCommentPage:
        try:
            m_str_soup = commentDetail.getSoupFromUrl(m_str_articleCommentPage)
            m_queue_commentPage = commentDetail.getCommentPage(m_str_articleCommentPage, m_str_soup)
            commentDetail.getCommentDetail(m_queue_commentPage)
        except Exception as e:
            writeLog.writeErrorLog('fail main %s : %s' % (Exception, e), 'autohome')        # master log
            writeLog.writeErrorLog('fail main %s : %s' % (Exception, e), 'autohome_error')  # error log
            continue
except Exception as e:
    writeLog.writeErrorLog('fail main %s : %s' % (Exception, e), 'autohome')        # master log
    writeLog.writeErrorLog('fail main %s : %s' % (Exception, e), 'autohome_error')  # error log
Ejemplo n.º 14
0
def getCommentDetail(queue):
    """Drain *queue* of comment-page URLs, scrape every reply on each page
    and insert it into MySQL via optMysqldb.insertCommentDetail.

    Each per-field lookup is wrapped in its own bare try/except so that a
    missing element degrades to the string 'null' instead of aborting the
    whole page.
    """
    while (queue.empty() != True):
        try:
            m_url_eachCommentPage = queue.get()
            # parse this comment page into a BeautifulSoup tree
            m_soup_eachCommentPage = getSoupFromUrl(m_url_eachCommentPage)
            # URL of the article the comments belong to
            m_url_srcArticle = m_soup_eachCommentPage.find(
                name='h1', attrs={
                    'class': 'tit_rev'
                }).a['href']  #.h1.get_text(strip=True)
            # title of that article
            m_str_srcArticleTitle = m_soup_eachCommentPage.find(name='h1',
                                                                attrs={
                                                                    'class':
                                                                    'tit_rev'
                                                                }).a.string
            # every <dt> tag in the reply list (one per reply header)
            m_list_eachPageDt = m_soup_eachCommentPage.find(
                name='dl', attrs={
                    'id': 'reply-list'
                }).find_all(name='dt')
            # every <dd> tag in the reply list (one per reply body)
            m_list_eachPageDd = m_soup_eachCommentPage.find(
                name='dl', attrs={
                    'id': 'reply-list'
                }).find_all(name='dd')
            for number in range(0, len(m_list_eachPageDt)):

                #--- dt section: reply header ---#
                # replier's "sk" identifier (anchor name attribute)
                try:
                    m_str_skReply = m_list_eachPageDt[number].a['name']
                except:
                    m_str_skReply = 'null'
                # replier's profile-page URL
                try:
                    m_url_replerPage = m_list_eachPageDt[number].find(
                        name='a', attrs={'class': 'user-fl'})['href']
                except:
                    m_url_replerPage = 'null'
                # replier's avatar URL
                try:
                    m_url_replerLogo = m_list_eachPageDt[number].img['src']
                except:
                    m_url_replerLogo = 'null'
                # replier's nickname
                try:
                    m_str_replerName = m_list_eachPageDt[number].find(
                        name='a', attrs={
                            'class': 'user-fl'
                        }).text
                except:
                    m_str_replerName = 'null'
                # device the reply was posted from
                try:
                    m_str_replerDevice = m_list_eachPageDt[number].find(
                        name='a', attrs={
                            'class': 'revgrey'
                        }).text
                except:
                    m_str_replerDevice = 'null'
                # relative post time and floor number
                try:
                    m_str_replerTimeAndFloor = m_list_eachPageDt[number].find(
                        name='span', attrs={
                            'class': 'fn-right'
                        }).text
                except:
                    m_str_replerTimeAndFloor = 'null'

                #--- dd section: reply body ---#
                # the reply text itself
                try:
                    m_str_replyContent = m_list_eachPageDd[number].p.text
                except:
                    m_str_replyContent = 'null'
                # number of likes this reply received
                try:
                    m_n_likeNum = m_list_eachPageDd[number].find(
                        name='div', attrs={
                            'class': 'text-right'
                        }).find(name='a', attrs={
                            'target': '_self'
                        }).text
                except:
                    m_n_likeNum = 'null'
                #-- quoted original reply (when this reply answers an earlier one) --#
                # the quoted-reply block; when missing it becomes the string
                # 'null' and every lookup below falls into its own 'null'
                try:
                    m_str_preReplyPart = m_list_eachPageDd[number].find(
                        name='div', attrs={'class': 'reply'})
                except:
                    m_str_preReplyPart = 'null'
                # original poster's avatar URL
                try:
                    m_url_preReplerLogo = m_str_preReplyPart.img['src']
                except:
                    m_url_preReplerLogo = 'null'
                # original poster's profile page
                try:
                    m_url_preReplerPage = m_str_preReplyPart.find(
                        name='a', attrs={'class': 'grey666'})['href']
                except:
                    m_url_preReplerPage = 'null'
                # original poster's nickname
                try:
                    m_str_preReplerName = m_str_preReplyPart.find(name='a',
                                                                  attrs={
                                                                      'class':
                                                                      'grey666'
                                                                  }).text
                except:
                    m_str_preReplerName = 'null'
                # date of the original reply (text following the '于' span)
                try:
                    m_str_preReplyDate = m_str_preReplyPart.find(
                        name='span', attrs={
                            'class': 'grey'
                        }, text='于').next_sibling
                except:
                    m_str_preReplyDate = 'null'
                # floor number of the original reply
                try:
                    m_n_preReplyFloor = m_str_preReplyPart.find(name='span',
                                                                attrs={
                                                                    'class':
                                                                    'grey666'
                                                                }).text
                except:
                    m_n_preReplyFloor = 'null'
                # content of the original reply
                try:
                    m_str_preReplyContent = m_str_preReplyPart.find(
                        name='p', attrs={
                            'class': 'reply-name'
                        }).next_sibling.text
                except:
                    m_str_preReplyContent = 'null'

                # field order: article url, article title, replier sk,
                # replier page, replier avatar, replier name, device,
                # time-and-floor, reply content, like count, original avatar,
                # original page, original name, original date, original
                # floor, original content
                m_str_CommentConnent = (m_url_srcArticle,
                                        m_str_srcArticleTitle, m_str_skReply,
                                        m_url_replerPage, m_url_replerLogo,
                                        m_str_replerName, m_str_replerDevice,
                                        m_str_replerTimeAndFloor,
                                        m_str_replyContent, m_n_likeNum,
                                        m_url_preReplerLogo,
                                        m_url_preReplerPage,
                                        m_str_preReplerName,
                                        m_str_preReplyDate, m_n_preReplyFloor,
                                        m_str_preReplyContent)
                optMysqldb.insertCommentDetail(m_str_CommentConnent)
                writeLog.writerNormalLog(
                    'succeed get comment skReplay %s' % m_str_skReply,
                    'autohome')
            print 'succeed to get comment detail ' + m_url_eachCommentPage
        except Exception, e:
            print 'fail to get comment page ' + m_url_srcArticle + ' [%s,%s]' % (
                Exception, e)
            writeLog.writeErrorLog(
                'fail commentDetail getCommentDetail %s : %s' % (Exception, e),
                'autohome')  # master log
            writeLog.writeErrorLog(
                'fail commentDetail getCommentDetail %s : %s' % (Exception, e),
                'autohome_error')  # error log
Ejemplo n.º 15
0
def getArticleDetail(srcUrl, soup):
    """Extract one article's full record from its parsed page *soup*.

    Returns an 11-tuple matching the autohome_detail_text column order:
    (url, classify, car classify, title, pub time, pub source, pub type,
    author, body text, comment-page url, picture-url string).  Each field
    falls back to the string 'null' when its element is missing.
    """
    try:
        # NOTE(review): also published as a module-level global — sibling
        # code may read it after the call; confirm before removing.
        global m_str_returnContent
        #-- metadata --#
        # article category: breadcrumb text minus its last 12 characters
        try:
            m_str_articleClassify = soup.find(name='div',
                                              attrs={
                                                  'class': 'breadnav fn-left'
                                              }).get_text(strip=True)[0:-12]
        except:
            m_str_articleClassify = 'null'
        # car series the article covers
        try:
            m_str_articleCarClassify = soup.find(name='div',
                                                 attrs={
                                                     'class':
                                                     'subnav-title-name'
                                                 }).get_text(strip=True)
        except:
            m_str_articleCarClassify = 'null'
        # article title
        try:
            m_str_articleTitle = soup.find(name='div',
                                           attrs={
                                               'class': 'area article'
                                           }).h1.get_text(strip=True)
        except:
            m_str_articleTitle = 'null'

        #--- publication details (positional slices of the info strip) ---#
        # publication time
        try:
            m_str_articlePubInfo = [
                text for text in soup.find(name='div',
                                           attrs={
                                               'class': 'article-info'
                                           }).stripped_strings
            ]
            m_str_articlePubTime = m_str_articlePubInfo[0]
        except:
            m_str_articlePubTime = 'null'
        # source system
        try:
            m_str_articlePubSrcsys = m_str_articlePubInfo[2]
        except:
            m_str_articlePubSrcsys = 'null'
        # publication type
        try:
            m_str_articlePubType = m_str_articlePubInfo[3]
        except:
            m_str_articlePubType = 'null'
        # author
        try:
            m_str_articlePubAuthor = m_str_articlePubInfo[5]
        except:
            m_str_articlePubAuthor = 'null'

        #--- body text ---#
        # rough version — still includes boilerplate that should be trimmed
        try:
            m_str_articleDetail = soup.find(name='div',
                                            attrs={
                                                'class': 'article-content'
                                            }).get_text(strip=True)
        except:
            m_str_articleDetail = 'null'

        #--- body images ---#
        # single string joined with '|P|', starting with 'picStart' and
        # ending with 'picEnd'
        try:
            target = soup.find(name='div', attrs={
                'class': 'area article'
            }).find_all(name='img')
            m_list_pictureUrl = 'picStart'
            for i in range(0, len(target)):
                m_list_pictureUrl = m_list_pictureUrl + '|P|' + target[i]['src']
            m_list_pictureUrl = m_list_pictureUrl + '|P|' + 'picEnd'
        except:
            m_list_pictureUrl = 'null'
        try:

            #-- comment-page URL --#
            m_str_articleCommentPage = soup.find(
                name='a',
                attrs={'id':
                       'reply-all-btn1'})['href']  #.href#.get_text(strip=True)
        except:
            m_str_articleCommentPage = 'null'

        #-- return order: classify, car classify, title, time, source, type,
        #   author, body, comment page, picture string --#
        m_str_returnContent = (srcUrl, m_str_articleClassify,
                               m_str_articleCarClassify, m_str_articleTitle,
                               m_str_articlePubTime, m_str_articlePubSrcsys,
                               m_str_articlePubType, m_str_articlePubAuthor,
                               m_str_articleDetail, m_str_articleCommentPage,
                               m_list_pictureUrl)
        writeLog.writerNormalLog('succeed get article %s detail' % srcUrl,
                                 'autohome')
        return m_str_returnContent
    except Exception, e:
        writeLog.writeErrorLog(
            'fail detailText getArticleDetail %s : %s' % (Exception, e),
            'autohome')
        writeLog.writeErrorLog(
            'fail detailText getArticleDetail %s : %s' % (Exception, e),
            'autohome_error')